source: trunk/FACT++/src/root2csv.cc@ 19794

Last change on this file since 19794 was 19794, checked in by tbretz, 5 years ago
Improvements and fixes.
File size: 31.9 KB
Line 
1#include <random>
2
3#include <boost/regex.hpp>
4#include <boost/filesystem.hpp>
5#include <boost/algorithm/string/join.hpp>
6
7#include "tools.h"
8#include "Time.h"
9#include "Configuration.h"
10
11#include <TROOT.h>
12#include <TSystem.h>
13#include <TChain.h>
14#include <TLeaf.h>
15#include <TError.h>
16#include <TTreeFormula.h>
17#include <TTreeFormulaManager.h>
18
19using namespace std;
20namespace fs = boost::filesystem;
21
22// ------------------------------------------------------------------------
23
// A (regular expression, replacement) pair as used by --auto-alias.
// 'first' holds the expression, 'second' the replacement text. It is a
// distinct type so that a dedicated stream extraction operator can be
// provided for the command line parser.
struct Map : std::pair<std::string, std::string>
{
    Map() = default;
};
28
29std::istream &operator>>(std::istream &in, Map &m)
30{
31 const istreambuf_iterator<char> eos;
32 string txt(istreambuf_iterator<char>(in), eos);
33
34 const boost::regex expr("((.*)[^\\\\])/(.*)");
35 boost::smatch match;
36 if (!boost::regex_match(txt, match, expr))
37 throw runtime_error("Could not evaluate map argument: "+txt);
38
39 m.first = match[1].str();
40 m.second = match[3].str();
41
42 return in;
43}
44
45void SetupConfiguration(Configuration &conf)
46{
47 po::options_description control("Root to SQL");
48 control.add_options()
49 ("file", vars<string>()->required(),"The root files to read from")
50 ("out,o", var<string>()->required(), "Output file name")
51 ("force,f", po_switch(), "Force overwrite if output file already exists.")
52 ("append,a", po_switch(), "Append to an existing file (not check for the format is done!)")
53 ("tree,t", var<string>("Events"), "Name of the root tree to convert")
54 ("ignore", vars<string>(), "Ignore the given leaf, if the given regular expression matches")
55 ("alias.*", var<string>(), "Define an alias")
56 ("auto-alias", vars<Map>(), "Regular expression to define aliases from the branch names automatically")
57 ("header", var<uint16_t>(uint16_t(0)),"Type of header line (0: preceeding #, 1: without preceeding #, 2: none)")
58 ("add.*", var<string>(), "Define an additional column")
59 ("selector", var<string>("1"), "Define a selector for the columns (colums where this evaluates to a value <=0 are discarded)")
60 ("skip", po_switch(), "Discards all default leaves and writes only the columns defined by --add.*")
61 ("first", var<int64_t>(int64_t(0)), "First event to start with (default: 0), mainly for test purpose")
62 ("max", var<int64_t>(int64_t(0)), "Maximum number of events to process (0: all), mainly for test purpose")
63 //("const.*", var<string>(), "Insert a constant number into the given column (--const.mycolumn=5). A special case is `/.../.../`")
64 ("dry-run", po_switch(), "Do not create or manipulate any output file")
65 ;
66
67 po::options_description split("Splitting options");
68 split.add_options()
69 ("split-sequence,S", vars<uint16_t>(), "Split data sequentially into several trees/files (e.g. 1, 1, 2)")
70 ("split-quantile,Q", vars<double>(), "Split data randomly into several trees/files (e.g. 0.5, 1)")
71 ("seed", var<uint64_t>(mt19937_64::default_seed), "Seed value in case of random split")
72 ;
73
74 po::options_description debug("Debug options");
75 debug.add_options()
76 ("print-ls", po_switch(), "Calls TFile::ls()")
77 ("print-branches", po_switch(), "Print the branches found in the tree")
78 ("print-leaves", po_switch(), "Print the leaves found in the tree (this is what is processed)")
79 ("verbose,v", var<uint16_t>(1), "Verbosity (0: quiet, 1: default, 2: more, 3, ...)")
80 ;
81
82 po::positional_options_description p;
83 p.add("file", -1); // All positional options
84
85 conf.AddOptions(control);
86 conf.AddOptions(split);
87 conf.AddOptions(debug);
88 conf.SetArgumentPositions(p);
89}
90
// Print the general help text of root2csv to stdout. The text describes only
// options which are registered in SetupConfiguration().
void PrintUsage()
{
    std::cout <<
        "root2csv - Reads data from a root tree and writes a csv file\n"
        "\n"
        "For convenience, this documentation uses the extended version of the options, "
        "refer to the output below to get the abbreviations.\n"
        "\n"
        "This is a general purpose tool to write the contents of the leaves of a root "
        "tree into a text file with one row per event and the columns separated by a "
        "single space. Only leaves with a basic data type (Float_t, Double_t, Short_t, "
        "UShort_t, Int_t, UInt_t, Long64_t, ULong64_t) can be written.\n"
        "\n"
        "Each root tree has branches and leaves (the basic data types). These leaves can "
        "be read independently of the classes which were used to write the root file. "
        "The default tree to read from is 'Events' but the name can be overwritten "
        "using --tree. The name of the output file is defined with --out. An existing "
        "file is only touched if --force is given. If --append is given in addition, "
        "the rows are appended to the existing file instead of overwriting it. In this "
        "case no header is written and the format of the existing file is not checked.\n"
        "\n"
        "To get a list of the contents (keys and trees) of a root file, you can use --print-ls. "
        "The name of each column is obtained from the leaves' names. The leaf names can "
        "be checked using --print-leaves. "
        "A --print-branches exists for convenience to print only the high-level branches. "
        "Sometimes these names might be quite inconvenient like MTime.fTime.fMilliSec or "
        "just MHillas.fWidth. To allow to simplify column names, regular expressions "
        "(using boost's regex) can be defined to change the names. Note that these regular "
        "expressions are applied one by one on each leaf's name. A valid expression could "
        "be:\n"
        "   --auto-alias=MHillas\\.f/\n"
        "which would remove all occurrences of 'MHillas.f'. The text in front of the last "
        "'/' is the regular expression, the text behind it is the replacement. This option "
        "can be used more than once. They are applied in sequence. A single match does not "
        "stop the sequence. The resulting name is also defined as an alias in the tree, so "
        "that it can be used in formulas.\n"
        "\n"
        "An alias can also be defined manually with --alias.name=expression.\n"
        "\n"
        "Sometimes it might also be convenient to skip a leaf. This can be done with "
        "the --ignore resource. If the given regular expression yields a match, the "
        "leaf will be ignored. Note that the regular expression works on the raw-name "
        "of the leaf not the readily mapped column names. Example:\n"
        "   --ignore=ThetaSq\\..*\n"
        "will skip all leaves which start with 'ThetaSq.'. This option can be used "
        "more than once.\n"
        "\n"
        "Additional columns can be calculated from the contents of the tree with "
        "--add.name=formula, for example:\n"
        "   --add.LogSize=log10(MHillas.fSize)\n"
        "The option can be given more than once. To write only these additional columns "
        "and discard all default leaves, use --skip.\n"
        "\n"
        "Rows can be selected with --selector=formula. All rows for which the formula "
        "evaluates to a value <=0 are discarded. If the tree contains a leaf called "
        "DataType.fVal, all rows with DataType.fVal!=1 are skipped as well.\n"
        "\n"
        "The range of processed events can be restricted with --first (the first event "
        "to process) and --max (the maximum number of events to process).\n"
        "\n"
        "Header:\n"
        "\n"
        "As a default, the first line of a new file contains the column names preceded "
        "by '# '. With --header=1 the preceding '# ' is omitted, with --header=2 no header "
        "is written at all. Leaves which are arrays are written as several columns "
        "called name[0], name[1], ...\n"
        "\n"
        "Splitting:\n"
        "\n"
        "The rows can be distributed to several output files. The files are named like "
        "the output file with '-0', '-1', ... appended. With --split-sequence the rows "
        "are distributed in a repeating sequence, e.g. for\n"
        "   --split-sequence=1 --split-sequence=1 --split-sequence=2\n"
        "the first row is written to file 0, the second to file 1, the third and fourth "
        "to file 2, and then the sequence starts again. With --split-quantile the rows "
        "are distributed randomly, e.g. for\n"
        "   --split-quantile=0.5 --split-quantile=0.75\n"
        "about 50% of the rows are written to file 0, 25% to file 1 and 25% to file 2. "
        "The quantiles must be in the range [0;1) and in increasing order. The seed of "
        "the random number generator can be set with --seed. Only one of the two "
        "splitting options can be used at a time.\n"
        "\n"
        "Using a higher verbosity level (-v), an overview of the written columns or all "
        "processed leaves is printed depending on the verbosity level. The output looks "
        "like the following\n"
        "   Leaf name {root data type} (column name)\n"
        "for example\n"
        "   MTime.fTime.fMilliSec {Long64_t} (MilliSec)\n"
        "which means that the leaf MTime.fTime.fMilliSec is detected to be a Long64_t "
        "which is written into a column called MilliSec. Leaves with non basic data types "
        "are ignored automatically and are marked as (-n/a-). Leaves without a fixed "
        "length are marked as (-skipped-). User ignored columns are marked as (-ignored-).\n"
        "\n"
        "To check the setup without creating or changing any output file, use --dry-run.\n"
        "\n"
        "In case of success, 0 is returned, a value>0 otherwise.\n"
        "\n"
        "Usage: root2csv [options] -o output.csv rootfile.root [rootfile2.root ...]\n"
        "\n"
        ;
    std::cout << std::endl;
}
221
// Identifier for the data type stored in a Container.
enum BasicType_t
{
    kNone   = 0, // no data attached
    kConst  = 1, // constant text instead of a data buffer
    kFloat  = 2, // Float_t
    kDouble = 3, // Double_t
    kInt16  = 4, // Short_t
    kUInt16 = 5, // UShort_t
    kInt32  = 6, // Int_t
    kUInt32 = 7, // UInt_t
    kInt64  = 8, // Long64_t
    kUInt64 = 9, // ULong64_t
};
235
236static const map<string, pair<BasicType_t, string>> ConvRoot =
237{
238 { "Float_t", { kFloat, "FLOAT" } },
239 { "Double_t", { kDouble, "DOUBLE" } },
240 { "ULong64_t", { kUInt64, "BIGINT UNSIGNED" } },
241 { "Long64_t", { kInt64, "BIGINT" } },
242 { "UInt_t", { kUInt32, "INT UNSIGNED" } },
243 { "Int_t", { kInt32, "INT" } },
244 { "UShort_t", { kUInt16, "SMALLINT UNSIGNED" } },
245 { "Short_t", { kInt16, "SMALLINT" } },
246};
247
248struct Container
249{
250 static map<void*, size_t> counter;
251
252 string branch; // branch name
253 string column; // column name
254 BasicType_t type;
255 size_t num;
256 void *ptr;
257
258 Container(const string &b, const string &c, const BasicType_t &t, const size_t n=1) : branch(b), column(c), type(t), num(n), ptr(0)
259 {
260 switch (t)
261 {
262 case kFloat: ptr = new Float_t[n]; break;
263 case kDouble: ptr = new Double_t[n]; break;
264 case kInt16: ptr = new Short_t[n]; break;
265 case kUInt16: ptr = new UShort_t[n]; break;
266 case kInt32: ptr = new Int_t[n]; break;
267 case kUInt32: ptr = new UInt_t[n]; break;
268 case kInt64: ptr = new Long64_t[n]; break;
269 case kUInt64: ptr = new ULong64_t[n]; break;
270 case kConst:
271 case kNone:
272 break;
273 }
274 counter[ptr]++;
275 }
276 Container(const string &c, const string &value) : branch(value), column(c), type(kConst), num(1), ptr(0)
277 {
278 }
279
280 Container(const Container &c) : branch(c.branch), column(c.column), type(c.type), num(c.num), ptr(c.ptr)
281 {
282 counter[ptr]++;
283 }
284
285 ~Container()
286 {
287 counter[ptr]--;
288 if (counter[ptr]==0)
289 ::operator delete[](ptr); // It seems root is deleting it already
290 }
291
292 string fmt(const size_t &index) const
293 {
294 ostringstream str;
295
296 switch (type)
297 {
298 case kFloat: str << setprecision(8) << reinterpret_cast<Float_t*>(ptr)[index]; break;
299 case kDouble: str << setprecision(16) << reinterpret_cast<Double_t*>(ptr)[index]; break;
300 case kInt16: str << reinterpret_cast<Short_t*>(ptr)[index]; break;
301 case kUInt16: str << reinterpret_cast<UShort_t*>(ptr)[index]; break;
302 case kInt32: str << reinterpret_cast<Int_t*>(ptr)[index]; break;
303 case kUInt32: str << reinterpret_cast<UInt_t*>(ptr)[index]; break;
304 case kInt64: str << reinterpret_cast<Long64_t*>(ptr)[index]; break;
305 case kUInt64: str << reinterpret_cast<ULong64_t*>(ptr)[index]; break;
306 case kConst: str << branch; break;
307 case kNone:
308 break;
309 }
310
311 //if (str.str()=="nan" || str.str()=="-nan" || str.str()=="inf" || str.str()=="-inf")
312 // return "NULL";
313
314 return str.str();
315 }
316};
317
318map<void*, size_t> Container::counter;
319
320void ErrorHandlerAll(Int_t level, Bool_t abort, const char *location, const char *msg)
321{
322 if (string(msg).substr(0,24)=="no dictionary for class ")
323 return;
324 if (string(msg).substr(0,15)=="unknown branch ")
325 return;
326
327 DefaultErrorHandler(level, abort, location, msg);
328}
329
330// --------------------------- Write Header --------------------------------
331void WriteHeader(ostream &out, const vector<Container> &vec, const vector<TTreeFormula*> &form, bool skip, uint16_t header)
332{
333 if (header>1)
334 return;
335 if (header==0)
336 out << "# ";
337
338 vector<string> join;
339
340 if (!skip)
341 {
342 for (auto v=vec.cbegin(); v!=vec.cend(); v++)
343 {
344 const size_t N = v->num;
345 for (size_t i=0; i<N; i++)
346 {
347 string name = v->column;
348 if (N!=1)
349 name += "["+to_string(i)+"]";
350 join.emplace_back(name);
351 }
352 }
353 }
354
355 for (auto v=form.cbegin(); v!=form.cend(); v++)
356 join.emplace_back((*v)->GetName());
357
358 out << boost::join(join, " ") << "\n";
359}
360
361int CheckFile(TString &path, bool force, int verbose)
362{
363 gSystem->ExpandPathName(path);
364
365 FileStat_t stat;
366 const Int_t exist = !gSystem->GetPathInfo(path, stat);
367 const Bool_t _write = !gSystem->AccessPathName(path, kWritePermission) && R_ISREG(stat.fMode);
368
369 if (exist)
370 {
371 if (!_write)
372 {
373 cerr << "File '" << path << "' is not writable." << endl;
374 return 2;
375 }
376
377 if (!force)
378 {
379 cerr << "File '" << path << "' already exists." << endl;
380 return 3;
381 }
382 else
383 {
384 if (verbose>0)
385 cerr << "File '" << path << "' will be overwritten." << endl;
386 }
387 }
388 return exist ? 0 : -1;
389}
390
391void GetLeaves(vector<string> &list, const TTreeFormula &f)
392{
393 int i=0;
394 while (1)
395 {
396 const auto l = f.GetLeaf(i++);
397 if (!l)
398 return;
399 list.emplace_back(l->GetName());
400 }
401}
402
403int main(int argc, const char* argv[])
404{
405 Time start;
406
407 gROOT->SetBatch();
408 SetErrorHandler(ErrorHandlerAll);
409
410 Configuration conf(argv[0]);
411 conf.SetPrintUsage(PrintUsage);
412 SetupConfiguration(conf);
413
414 if (!conf.DoParse(argc, argv))
415 return 127;
416
417 // ----------------------------- Evaluate options --------------------------
418 const vector<string> files = conf.Vec<string>("file");
419 const string out = conf.Get<string>("out");
420 const string tree = conf.Get<string>("tree");
421
422 const bool force = conf.Get<bool>("force");
423 const bool append = conf.Get<bool>("append");
424 const bool dryrun = conf.Get<bool>("dry-run");
425 const bool skip = conf.Get<bool>("skip");
426
427 const uint16_t verbose = conf.Get<uint16_t>("verbose");
428 const int64_t first = conf.Get<int64_t>("first");
429 const int64_t max = conf.Get<int64_t>("max");
430 const uint16_t header = conf.Get<uint16_t>("header");
431
432 const bool print_ls = conf.Get<bool>("print-ls");
433 const bool print_branches = conf.Get<bool>("print-branches");
434 const bool print_leaves = conf.Get<bool>("print-leaves");
435
436 const vector<string> _ignore = conf.Vec<string>("ignore");
437 const vector<Map> autoalias = conf.Vec<Map>("auto-alias");
438
439 // ----------------------------- Setup splitting ---------------------------
440
441 vector<uint16_t> split_seq = conf.Vec<uint16_t>("split-sequence");
442 vector<double> split_quant = conf.Vec<double>("split-quantile");
443
444 if (!split_seq.empty() && !split_quant.empty())
445 throw runtime_error("Only splitting by --split-sequence or --split-quantile is allowed.");
446
447 const size_t num_split = split_seq.size()+split_quant.size()==0 ? 0 :
448 ::max(split_seq.size(), split_quant.size()+1);
449
450 map<size_t, size_t> split_lut;
451 for (size_t i=0; i<split_seq.size(); i++)
452 {
453 const size_t sz = split_lut.size();
454 for (size_t j=0; j<split_seq[i]; j++)
455 split_lut.emplace(j+sz, i);
456 }
457
458 for (size_t i=0; i<split_quant.size(); i++)
459 if (split_quant[i]<0 || split_quant[i]>=1)
460 throw runtime_error("Splitting quantiles must be in the range [0;1)");
461
462 for (size_t i=1; i<split_quant.size(); i++)
463 {
464 if (split_quant[i]<=split_quant[i-1])
465 throw runtime_error("Splitting quantiles must be in increasing order.");
466 }
467
468 // -------------------------------------------------------------------------
469
470 const uniform_real_distribution<double> distribution(0,1);
471 mt19937_64 generator;
472 generator.seed(conf.Get<uint64_t>("seed"));
473 auto rndm = bind(distribution, generator);
474
475 // -------------------------------------------------------------------------
476
477 if (verbose>0)
478 {
479 cout << "\n-------------------------- Evaluating input ------------------------\n";
480 cout << "Start Time: " << Time::sql << Time(Time::local) << endl;
481 }
482
483 if (verbose>0)
484 cout << "Processing Tree: " << tree << endl;
485
486 TChain c(tree.c_str());
487
488 uint64_t cnt = 0;
489 for (const auto &file : files)
490 {
491 const auto add = c.Add(file.c_str(), 0);
492 if (verbose>0)
493 cout << file << ": " << add << " file(s) added." << endl;
494 cnt += add;
495 }
496
497 if (cnt==0)
498 {
499 cerr << "No files found." << endl;
500 return 1;
501 }
502
503 if (verbose>0)
504 cout << cnt << " file(s) found." << endl;
505
506 if (print_ls)
507 {
508 cout << '\n';
509 c.ls();
510 cout << '\n';
511 }
512
513 c.SetMakeClass(1);
514
515 TObjArray *branches = c.GetListOfBranches();
516 TObjArray *leaves = c.GetListOfLeaves();
517
518 if (print_branches)
519 {
520 cout << '\n';
521 branches->Print();
522 }
523
524 const auto entries = c.GetEntriesFast();
525
526 if (verbose>0)
527 cout << branches->GetEntries() << " branches found." << endl;
528
529 if (print_leaves)
530 {
531 cout << '\n';
532 leaves->Print();
533 }
534 if (verbose>0)
535 {
536 cout << leaves->GetEntries() << " leaves found." << endl;
537 cout << entries << " events found." << endl;
538 }
539
540 // ----------------------------------------------------------------------
541
542 if (verbose>0)
543 cout << "\n-------------------------- Evaluating output -----------------------" << endl;
544
545 vector<Container> vec;
546
547/*
548 const auto fixed = conf.GetWildcardOptions("const.*");
549
550 string where;
551 vector<string> vindex;
552 for (auto it=fixed.cbegin(); it!=fixed.cend(); it++)
553 {
554 const string name = it->substr(6);
555 string val = conf.Get<string>(*it);
556
557 boost::smatch match;
558 if (boost::regex_match(val, match, boost::regex("\\/(.+)(?<!\\\\)\\/(.*)(?<!\\\\)\\/")))
559 {
560 const string reg = match[1];
561 const string fmt = match[2];
562
563 val = boost::regex_replace(file, boost::regex(reg), fmt.empty()?"$0":fmt,
564 boost::regex_constants::format_default|boost::regex_constants::format_no_copy);
565
566 if (verbose>0)
567 {
568 cout << "Regular expression detected for constant column `" << *it << "`\n";
569 cout << "Filename converted with /" << reg << "/ to /" << fmt << "/\n";
570 cout << "Filename: " << file << '\n';
571 cout << "Result: " << val << endl;
572 }
573 }
574
575 if (verbose>2)
576 cout << "\n" << val << " [-const-]";
577 if (verbose>1)
578 cout << " (" << name << ")";
579
580 string sqltype = "INT UNSIGNED";
581
582 for (auto m=sqltypes.cbegin(); m!=sqltypes.cend(); m++)
583 if (m->first==name)
584 sqltype = m->second;
585
586 if (!vec.empty())
587 query += ",\n";
588 query += " `"+name+"` "+sqltype+" NOT NULL COMMENT '--user--'";
589
590 vec.emplace_back(name, val);
591 where += " AND `"+name+"`="+val;
592 vindex.emplace_back(name);
593 }
594 */
595
596 // ------------------------- Setup all branches in tree -------------------
597
598 TIter Next(leaves);
599 TObject *o = 0;
600 while ((o=Next()))
601 {
602 TLeaf *L = c.GetLeaf(o->GetName());
603
604 string name = o->GetName();
605
606 for (auto m=autoalias.cbegin(); m!=autoalias.cend(); m++)
607 name = boost::regex_replace(name, boost::regex(m->first), m->second);
608
609 if (name!=o->GetName())
610 {
611 if (verbose>0)
612 cout << "Auto-alias: " << name << " = " << o->GetName() << endl;
613 if (!c.SetAlias(name.c_str(), o->GetName()))
614 cout << "WARNING - Alias could not be established!" << endl;
615 }
616
617 if (skip)
618 continue;
619
620 if (verbose>2)
621 cout << '\n' << L->GetTitle() << " {" << L->GetTypeName() << "}";
622
623 if (L->GetLenStatic()!=L->GetLen())
624 {
625 if (verbose>2)
626 cout << " (-skipped-)";
627 continue;
628 }
629
630 bool found = false;
631 for (auto b=_ignore.cbegin(); b!=_ignore.cend(); b++)
632 {
633 if (boost::regex_match(o->GetName(), boost::regex(*b)))
634 {
635 found = true;
636 if (verbose>2)
637 cout << " (-ignored-)";
638 break;
639 }
640 }
641
642 if (found)
643 continue;
644
645 const string tn = L->GetTypeName();
646
647 auto it = ConvRoot.find(tn);
648 if (it==ConvRoot.end())
649 {
650 if (verbose>2)
651 cout << " (-n/a-)";
652 continue;
653 }
654
655 if (verbose==2)
656 cout << '\n' << L->GetTitle() << " {" << L->GetTypeName() << "}";
657
658 if (verbose>1)
659 cout << " (" << name << ")";
660
661 vec.emplace_back(o->GetTitle(), name, it->second.first, L->GetLenStatic());
662 c.SetBranchAddress(o->GetTitle(), vec.back().ptr);
663 }
664
665 if (verbose>0)
666 {
667 if (skip)
668 cout << "Default columns skipped: ";
669 cout << vec.size() << " default leaf/leaves setup for reading." << endl;
670 }
671
672
673 // ------------------- Configure manual aliases ----------------------------
674
675 const auto valiases = conf.GetWildcardOptions("alias.*");
676 if (verbose>0 && valiases.size()>0)
677 cout << '\n';
678 for (auto it=valiases.cbegin(); it!=valiases.cend(); it++)
679 {
680 const string name = it->substr(6);
681 const string val = conf.Get<string>(*it);
682
683 if (verbose>0)
684 cout << "Alias: " << name << " = " << val << endl;
685
686 if (!c.SetAlias(name.c_str(), val.c_str()))
687 {
688 cerr << "Alias could not be established!" << endl;
689 return 2;
690 }
691 }
692
693 // -------------------------- Configure Selector --------------------------
694
695 vector<string> leaflist;
696 c.SetBranchStatus("*", 1);
697
698 TTreeFormulaManager *manager = new TTreeFormulaManager;
699
700 if (verbose>0)
701 cout << "\nSelector: " << conf.Get<string>("selector") << endl;
702
703 TTreeFormula selector("Selector", conf.Get<string>("selector").c_str(), &c);
704 if (selector.GetNdim()==0)
705 {
706 cerr << "Compilation of Selector failed!" << endl;
707 return 3;
708 }
709 selector.SetQuickLoad(kTRUE);
710 manager->Add(&selector);
711 GetLeaves(leaflist, selector);
712
713 // -------------------- Configure additional columns ----------------------
714
715 vector<TTreeFormula*> formulas;
716
717 const auto vform = conf.GetWildcardOptions("add.*");
718 if (verbose>0 && vform.size()>0)
719 cout << '\n';
720 for (auto it=vform.cbegin(); it!=vform.cend(); it++)
721 {
722 const string name = it->substr(4);
723 const string val = conf.Get<string>(*it);
724
725 if (verbose>0)
726 cout << "Adding column: " << name << " = " << val << endl;
727
728 TTreeFormula *form = new TTreeFormula(name.c_str(), val.c_str(), &c);
729 if (form->GetNdim()==0)
730 {
731 cerr << "Compilation of Column failed!" << endl;
732 return 4;
733 }
734 form->SetQuickLoad(kTRUE);
735 formulas.emplace_back(form);
736 manager->Add(form);
737 GetLeaves(leaflist, *form);
738 }
739 manager->Sync();
740
741 if (verbose>0)
742 cout << '\n' << formulas.size() << " additional columns setup for writing." << endl;
743
744 // --------------------- Setup all branches in formulas -------------------
745
746 for (auto l=leaflist.cbegin(); l!=leaflist.cend(); l++)
747 {
748 // Branch address already set
749 if (c.GetBranch(l->c_str())->GetAddress())
750 continue;
751
752 TLeaf *L = c.GetLeaf(l->c_str());
753
754 if (verbose>2)
755 cout << '\n' << L->GetTitle() << " {" << L->GetTypeName() << "}";
756
757 if (L->GetLenStatic()!=L->GetLen())
758 {
759 if (verbose>2)
760 cout << " (-skipped-)";
761 continue;
762 }
763
764 const string tn = L->GetTypeName();
765
766 auto it = ConvRoot.find(tn);
767 if (it==ConvRoot.end())
768 {
769 if (verbose>2)
770 cout << " (-n/a-)";
771 continue;
772 }
773
774 if (verbose==2)
775 cout << '\n' << L->GetTitle() << " {" << L->GetTypeName() << "}";
776
777 if (verbose>1)
778 cout << " (" << *l << ")";
779
780 vec.emplace_back(l->c_str(), l->c_str(), it->second.first, L->GetLenStatic());
781 c.SetBranchAddress(l->c_str(), vec.back().ptr);
782 }
783 if (verbose>1)
784 cout << '\n';
785
786 // ------------------------- Enable branch reading ------------------------
787
788 UInt_t datatype = 0;
789 const bool has_datatype = c.SetBranchAddress("DataType.fVal", &datatype) >= 0;
790
791 // Seting up branch status (must be after all SetBranchAddress)
792 c.SetBranchStatus("*", 0);
793 for (auto v=vec.cbegin(); v!=vec.cend(); v++)
794 if (v->type!=kConst)
795 c.SetBranchStatus(v->branch.c_str(), 1);
796
797 if (has_datatype)
798 {
799 c.SetBranchStatus("DataType.fVal", 1);
800 if (verbose>0)
801 cout << "Rows with DataType.fVal!=1 will be skipped." << endl;
802 }
803
804 // -------------------------------------------------------------------------
805
806 if (num_split)
807 {
808 cout << "\nSplitting configured " << (split_seq.empty()?"randomly":"in sequence") << " into " << num_split << " files." << endl;
809 if (!split_quant.empty())
810 cout << "Seed value configured as " << conf.Get<uint64_t>("seed") << "." << endl;
811 }
812
813 if (dryrun)
814 {
815 cout << "\nDry run: file output skipped!" << endl;
816 return 0;
817 }
818
819 if (verbose>0)
820 cout << "\n-------------------------- Converting file -------------------------" << endl;
821
822 vector<ofstream> outfiles;
823
824 if (num_split==0)
825 {
826 TString path(out.c_str());
827 const int rc = CheckFile(path, force, verbose);
828 if (rc>0)
829 return rc;
830
831 outfiles.emplace_back(path.Data(), append ? ios::app : ios::trunc);
832 if (rc==-1 || (force && rc==0 && !append))
833 WriteHeader(outfiles.back(), vec, formulas, skip, header);
834 }
835 else
836 {
837 for (size_t i=0; i<num_split; i++)
838 {
839 TString path(out.c_str());
840 path += "-";
841 path += i;
842
843 const int rc = CheckFile(path, force, verbose);
844 if (rc>0)
845 return rc;
846 outfiles.emplace_back(path.Data(), append ? ios::app : ios::trunc);
847 if (rc==-1 || (force && rc==0 && !append))
848 WriteHeader(outfiles.back(), vec, formulas, skip, header);
849 }
850 }
851
852 // ---------------------------- Write Body --------------------------------
853 size_t count = 0;
854 vector<size_t> ncount(num_split?num_split:1);
855
856 auto itree = c.GetTreeNumber();
857
858 const size_t num = max>0 && (max-first)<entries ? (max-first) : entries;
859 for (size_t j=first; j<num; j++)
860 {
861 c.GetEntry(j);
862 if (has_datatype && datatype!=1)
863 continue;
864
865 if (itree != c.GetTreeNumber())
866 {
867 manager->UpdateFormulaLeaves();
868 itree = c.GetTreeNumber();
869 }
870
871 if (selector.GetNdim() && selector.EvalInstance(0)<=0)
872 continue;
873
874 size_t index = 0;
875 if (!split_lut.empty())
876 index = split_lut[count % split_lut.size()];
877 if (!split_quant.empty())
878 {
879 const float r = rndm();
880 for (; r>=split_quant[index]; index++)
881 if (index==split_quant.size())
882 break;
883 }
884
885 vector<string> join;
886
887 if (!skip)
888 {
889 for (auto v=vec.cbegin(); v!=vec.cend(); v++)
890 {
891 const size_t N = v->num;
892 for (size_t i=0; i<N; i++)
893 join.emplace_back(v->fmt(i));
894 }
895 }
896
897 for (auto v=formulas.cbegin(); v!=formulas.cend(); v++)
898 join.emplace_back(to_string((*v)->EvalInstance(0)));
899
900 outfiles[index] << boost::join(join, " ") << "\n";
901
902 count ++;
903 ncount[index] ++;
904 }
905
906 if (verbose>0)
907 {
908 cout << "\nTotal: N=" << count << " out of " << num << " row(s) written [N=" << first << ".." << num-1 << "]." << endl;
909 for (int i=0; i<num_split; i++)
910 cout << "File " << i << ": nrows=" << ncount[i] << '\n';
911 cout << '\n';
912 }
913
914 if (verbose>0)
915 {
916 cout << "Total execution time: " << Time().UnixTime()-start.UnixTime() << "s.\n";
917 cout << "Success!\n" << endl;
918 }
919 return 0;
920}
Note: See TracBrowser for help on using the repository browser.