// Source: trunk/FACT++/src/root2csv.cc @ revision 19795
//
// Last change on this file (revision 19795), checked in by tbretz:
//   Updated help text and added possibility to apply ignore also to the
//   aliased column names.
// File size: 30.2 KB
1#include <random>
2
3#include <boost/regex.hpp>
4#include <boost/filesystem.hpp>
5#include <boost/algorithm/string/join.hpp>
6
7#include "tools.h"
8#include "Time.h"
9#include "Configuration.h"
10
11#include <TROOT.h>
12#include <TSystem.h>
13#include <TChain.h>
14#include <TLeaf.h>
15#include <TError.h>
16#include <TTreeFormula.h>
17#include <TTreeFormulaManager.h>
18
19using namespace std;
20namespace fs = boost::filesystem;
21
22// ------------------------------------------------------------------------
23
// Value type of the --auto-alias option. 'first' is used as a regular
// expression and 'second' as its replacement when leaf names are rewritten.
// Instances are filled from text by the operator>> overload for Map.
struct Map : std::pair<std::string, std::string>
{
    Map() : std::pair<std::string, std::string>() { }
};
28
29std::istream &operator>>(std::istream &in, Map &m)
30{
31 const istreambuf_iterator<char> eos;
32 string txt(istreambuf_iterator<char>(in), eos);
33
34 const boost::regex expr("((.*)[^\\\\])/(.*)");
35 boost::smatch match;
36 if (!boost::regex_match(txt, match, expr))
37 throw runtime_error("Could not evaluate map argument: "+txt);
38
39 m.first = match[1].str();
40 m.second = match[3].str();
41
42 return in;
43}
44
45void SetupConfiguration(Configuration &conf)
46{
47 po::options_description control("Root to SQL");
48 control.add_options()
49 ("file", vars<string>()->required(),"The root files to read from")
50 ("out,o", var<string>()->required(), "Output file name")
51 ("force,f", po_switch(), "Force overwrite if output file already exists.")
52 ("append,a", po_switch(), "Append to an existing file (not check for the format is done!)")
53 ("tree,t", var<string>("Events"), "Name of the root tree to convert")
54 ("ignore", vars<string>(), "Ignore the given leaf, if the given regular expression matches")
55 ("alias.*", var<string>(), "Define an alias")
56 ("auto-alias", vars<Map>(), "Regular expression to define aliases from the branch names automatically")
57 ("header", var<uint16_t>(uint16_t(0)),"Type of header line (0: preceeding #, 1: without preceeding #, 2: none)")
58 ("add.*", var<string>(), "Define an additional column")
59 ("selector,s", var<string>("1"), "Define a selector for the columns (colums where this evaluates to a value <=0 are discarded)")
60 ("skip", po_switch(), "Discards all default leaves and writes only the columns defined by --add.*")
61 ("first", var<int64_t>(int64_t(0)), "First event to start with (default: 0), mainly for test purpose")
62 ("max", var<int64_t>(int64_t(0)), "Maximum number of events to process (0: all), mainly for test purpose")
63 //("const.*", var<string>(), "Insert a constant number into the given column (--const.mycolumn=5). A special case is `/.../.../`")
64 ("dry-run", po_switch(), "Do not create or manipulate any output file")
65 ;
66
67 po::options_description split("Splitting options");
68 split.add_options()
69 ("split-sequence,S", vars<uint16_t>(), "Split data sequentially into several trees/files (e.g. 1, 1, 2)")
70 ("split-quantile,Q", vars<double>(), "Split data randomly into several trees/files (e.g. 0.5, 1)")
71 ("seed", var<uint64_t>(mt19937_64::default_seed), "Seed value in case of random split")
72 ;
73
74 po::options_description debug("Debug options");
75 debug.add_options()
76 ("print-ls", po_switch(), "Calls TFile::ls()")
77 ("print-branches", po_switch(), "Print the branches found in the tree")
78 ("print-leaves", po_switch(), "Print the leaves found in the tree (this is what is processed)")
79 ("verbose,v", var<uint16_t>(1), "Verbosity (0: quiet, 1: default, 2: more, 3, ...)")
80 ;
81
82 po::positional_options_description p;
83 p.add("file", -1); // All positional options
84
85 conf.AddOptions(control);
86 conf.AddOptions(split);
87 conf.AddOptions(debug);
88 conf.SetArgumentPositions(p);
89}
90
// Prints the long help text of root2csv to stdout.
//
// The text only describes options which are registered in
// SetupConfiguration and the behaviour implemented in main.
void PrintUsage()
{
    std::cout <<
        "root2csv - Reads data from a root tree and writes a csv file\n"
        "\n"
        "For convenience, this documentation uses the extended version of the options, "
        "refer to the output below to get the abbreviations.\n"
        "\n"
        "Similar functionality is also provided by root2sql. In addition to root2sql, "
        "this tool is more flexible in the selection of columns and adds the possibility "
        "to use formulas (implemented through TTreeFormula) to calculate values for "
        "additional columns. Note that root can even write complex data like a TH1F "
        "into a file. Here, only numeric columns are supported.\n"
        "\n"
        "Input files are given as positional arguments or with --file. "
        "As files are read by adding them through TChain::Add, wildcards are "
        "supported in file names. Note that on the command lines, file names "
        "with wildcards have to be escaped in quotation marks if the wildcards "
        "should be evaluated by the program and not by the shell. The output base "
        "name of the output file(s) is given with --out.\n"
        "\n"
        "The format of the first line on the file is defined with the --header option:\n"
        "   0: '# Col1 Col2 Col3 ...'\n"
        "   1: 'Col1 Col2 Col3 ...'\n"
        "   2: first data row\n"
        "\n"
        "As default, existing files are not overwritten. To force overwriting use "
        "--force. To append data to existing files use --append. Note that no "
        "check is done if this created valid and reasonable files.\n"
        "\n"
        "Each root tree has branches and leaves (the basic data types). These leaves can "
        "be read independently of the classes which were used to write the root file. "
        "The default tree to read from is 'Events' but the name can be overwritten "
        "using --tree.\n"
        "\n"
        "To get a list of the contents (keys and trees) of a root file, you can use --print-ls. "
        "The name of each column to which data is filled from a leaf is obtained from "
        "the leaves' names. The leaf names can be checked using --print-leaves. "
        "A --print-branches exists for convenience to print only the high-level branches.\n"
        "\n"
        "Assuming a leaf with name MHillas.fWidth and a leaf with MHillas.fLength, "
        "a new column can be added with name Area by\n"
        "   --add.Area='TMath::TwoPi()*MHillas.fWidth*MHillas.fLength'\n"
        "\n"
        "To simplify expressions, root allows to define aliases, for example\n"
        "   --alias.Width='MHillas.fWidth'\n"
        "   --alias.Length='MHillas.fLength'\n"
        "\n"
        "This can then be used to simplify the above expression as\n"
        "   --add.Area='TMath::TwoPi()*Width*Length'\n"
        "\n"
        "Sometimes leaf names might be quite inconvenient like MTime.fTime.fMilliSec or "
        "just MHillas.fWidth. To allow to simplify column names, regular expressions "
        "(using boost's regex) can be defined to change the names. Note that these regular "
        "expressions are applied one by one on each leaf's name. A valid expression could "
        "be:\n"
        "   --auto-alias=MHillas\\.f/\n"
        "which would remove all occurrences of 'MHillas.f'. This option can be used more than "
        "once. They are applied in sequence. A single match does not stop the sequence. "
        "In addition to replacing the column names accordingly, an alias is created "
        "automatically allowing to access the columns in a formula with the new name.\n"
        "\n"
        "Sometimes it might also be convenient to skip a leaf, i.e. not writing the "
        "corresponding column in the output file. This can be done with "
        "the --ignore resource. If the given regular expression yields a match, the "
        "leaf will be ignored. An automatic alias would still be created and the "
        "leaf could still be used in a formula. Example\n"
        "   --ignore=ThetaSq\\..*\n"
        "will skip all leaves which start with 'ThetaSq.'. This directive can be given "
        "more than once. The so defined ignore list is applied entry-wise, first to the "
        "raw leaf names, then to the aliased names.\n"
        "\n"
        "To select only certain entries from the file, a selector (cut) can be defined "
        "in the same style as the --add directives, for example:\n"
        "   --selector='MHillas.fLength*Width<0'\n"
        "Note that the selector is not evaluated to a boolean expression (==0 or !=0) "
        "but all positive non-zero values are considered 'true' (select the entry) "
        "and all other values (zero or negative) are considered 'false' (discard the entry).\n"
        "\n"
        "For several purposes, it might be convenient to split the output to several "
        "files. This can be achieved using the --split-sequence (-S) "
        "and the --split-quantile (-Q) options. If a split sequence is defined as "
        "-S 1 -S 2 -S 1 the events are split by 1:2:1 in this sequence order. If "
        "quantiles are given as -Q 0.5 -Q 0.6, the first file will contain 50% of "
        "the events, the second one 10% and the third one 40%. The corresponding seed "
        "value can be set with --seed. Filenames are then created by adding an index "
        "after(!) the extension, e.g. file.csv-0, file.csv-1, ...\n"
        "\n"
        "In case of success, 0 is returned, a value>0 otherwise.\n"
        "\n"
        "Usage: root2csv input1.root [input2.root ...] -o output.csv [-t tree] [-f] [-a] [-s selector] [-vN]\n"
        "\n"
        ;
    std::cout << std::endl;
}
187
// Identifier for the storage type of a column (see Container).
// kConst marks a column with a constant text value, the remaining
// enumerators correspond to the numeric leaf types listed in ConvRoot.
enum BasicType_t
{
    kNone   = 0,
    kConst  = 1,
    kFloat  = 2,
    kDouble = 3,
    kInt16  = 4,
    kUInt16 = 5,
    kInt32  = 6,
    kUInt32 = 7,
    kInt64  = 8,
    kUInt64 = 9,
};
201
202static const map<string, pair<BasicType_t, string>> ConvRoot =
203{
204 { "Float_t", { kFloat, "FLOAT" } },
205 { "Double_t", { kDouble, "DOUBLE" } },
206 { "ULong64_t", { kUInt64, "BIGINT UNSIGNED" } },
207 { "Long64_t", { kInt64, "BIGINT" } },
208 { "UInt_t", { kUInt32, "INT UNSIGNED" } },
209 { "Int_t", { kInt32, "INT" } },
210 { "UShort_t", { kUInt16, "SMALLINT UNSIGNED" } },
211 { "Short_t", { kInt16, "SMALLINT" } },
212};
213
214struct Container
215{
216 static map<void*, size_t> counter;
217
218 string branch; // branch name
219 string column; // column name
220 BasicType_t type;
221 size_t num;
222 void *ptr;
223
224 Container(const string &b, const string &c, const BasicType_t &t, const size_t n=1) : branch(b), column(c), type(t), num(n), ptr(0)
225 {
226 switch (t)
227 {
228 case kFloat: ptr = new Float_t[n]; break;
229 case kDouble: ptr = new Double_t[n]; break;
230 case kInt16: ptr = new Short_t[n]; break;
231 case kUInt16: ptr = new UShort_t[n]; break;
232 case kInt32: ptr = new Int_t[n]; break;
233 case kUInt32: ptr = new UInt_t[n]; break;
234 case kInt64: ptr = new Long64_t[n]; break;
235 case kUInt64: ptr = new ULong64_t[n]; break;
236 case kConst:
237 case kNone:
238 break;
239 }
240 counter[ptr]++;
241 }
242 Container(const string &c, const string &value) : branch(value), column(c), type(kConst), num(1), ptr(0)
243 {
244 }
245
246 Container(const Container &c) : branch(c.branch), column(c.column), type(c.type), num(c.num), ptr(c.ptr)
247 {
248 counter[ptr]++;
249 }
250
251 ~Container()
252 {
253 counter[ptr]--;
254 if (counter[ptr]==0)
255 ::operator delete[](ptr); // It seems root is deleting it already
256 }
257
258 string fmt(const size_t &index) const
259 {
260 ostringstream str;
261
262 switch (type)
263 {
264 case kFloat: str << setprecision(8) << reinterpret_cast<Float_t*>(ptr)[index]; break;
265 case kDouble: str << setprecision(16) << reinterpret_cast<Double_t*>(ptr)[index]; break;
266 case kInt16: str << reinterpret_cast<Short_t*>(ptr)[index]; break;
267 case kUInt16: str << reinterpret_cast<UShort_t*>(ptr)[index]; break;
268 case kInt32: str << reinterpret_cast<Int_t*>(ptr)[index]; break;
269 case kUInt32: str << reinterpret_cast<UInt_t*>(ptr)[index]; break;
270 case kInt64: str << reinterpret_cast<Long64_t*>(ptr)[index]; break;
271 case kUInt64: str << reinterpret_cast<ULong64_t*>(ptr)[index]; break;
272 case kConst: str << branch; break;
273 case kNone:
274 break;
275 }
276
277 //if (str.str()=="nan" || str.str()=="-nan" || str.str()=="inf" || str.str()=="-inf")
278 // return "NULL";
279
280 return str.str();
281 }
282};
283
284map<void*, size_t> Container::counter;
285
286void ErrorHandlerAll(Int_t level, Bool_t abort, const char *location, const char *msg)
287{
288 if (string(msg).substr(0,24)=="no dictionary for class ")
289 return;
290 if (string(msg).substr(0,15)=="unknown branch ")
291 return;
292
293 DefaultErrorHandler(level, abort, location, msg);
294}
295
296// --------------------------- Write Header --------------------------------
297void WriteHeader(ostream &out, const vector<Container> &vec, const vector<TTreeFormula*> &form, bool skip, uint16_t header)
298{
299 if (header>1)
300 return;
301 if (header==0)
302 out << "# ";
303
304 vector<string> join;
305
306 if (!skip)
307 {
308 for (auto v=vec.cbegin(); v!=vec.cend(); v++)
309 {
310 const size_t N = v->num;
311 for (size_t i=0; i<N; i++)
312 {
313 string name = v->column;
314 if (N!=1)
315 name += "["+to_string(i)+"]";
316 join.emplace_back(name);
317 }
318 }
319 }
320
321 for (auto v=form.cbegin(); v!=form.cend(); v++)
322 join.emplace_back((*v)->GetName());
323
324 out << boost::join(join, " ") << "\n";
325}
326
327int CheckFile(TString &path, bool force, int verbose)
328{
329 gSystem->ExpandPathName(path);
330
331 FileStat_t stat;
332 const Int_t exist = !gSystem->GetPathInfo(path, stat);
333 const Bool_t _write = !gSystem->AccessPathName(path, kWritePermission) && R_ISREG(stat.fMode);
334
335 if (exist)
336 {
337 if (!_write)
338 {
339 cerr << "File '" << path << "' is not writable." << endl;
340 return 2;
341 }
342
343 if (!force)
344 {
345 cerr << "File '" << path << "' already exists." << endl;
346 return 3;
347 }
348 else
349 {
350 if (verbose>0)
351 cerr << "File '" << path << "' will be overwritten." << endl;
352 }
353 }
354 return exist ? 0 : -1;
355}
356
357void GetLeaves(vector<string> &list, const TTreeFormula &f)
358{
359 int i=0;
360 while (1)
361 {
362 const auto l = f.GetLeaf(i++);
363 if (!l)
364 return;
365 list.emplace_back(l->GetName());
366 }
367}
368
369int main(int argc, const char* argv[])
370{
371 Time start;
372
373 gROOT->SetBatch();
374 SetErrorHandler(ErrorHandlerAll);
375
376 Configuration conf(argv[0]);
377 conf.SetPrintUsage(PrintUsage);
378 SetupConfiguration(conf);
379
380 if (!conf.DoParse(argc, argv))
381 return 127;
382
383 // ----------------------------- Evaluate options --------------------------
384 const vector<string> files = conf.Vec<string>("file");
385 const string out = conf.Get<string>("out");
386 const string tree = conf.Get<string>("tree");
387
388 const bool force = conf.Get<bool>("force");
389 const bool append = conf.Get<bool>("append");
390 const bool dryrun = conf.Get<bool>("dry-run");
391 const bool skip = conf.Get<bool>("skip");
392
393 const uint16_t verbose = conf.Get<uint16_t>("verbose");
394 const int64_t first = conf.Get<int64_t>("first");
395 const int64_t max = conf.Get<int64_t>("max");
396 const uint16_t header = conf.Get<uint16_t>("header");
397
398 const bool print_ls = conf.Get<bool>("print-ls");
399 const bool print_branches = conf.Get<bool>("print-branches");
400 const bool print_leaves = conf.Get<bool>("print-leaves");
401
402 const vector<string> _ignore = conf.Vec<string>("ignore");
403 const vector<Map> autoalias = conf.Vec<Map>("auto-alias");
404
405 // ----------------------------- Setup splitting ---------------------------
406
407 vector<uint16_t> split_seq = conf.Vec<uint16_t>("split-sequence");
408 vector<double> split_quant = conf.Vec<double>("split-quantile");
409
410 if (!split_seq.empty() && !split_quant.empty())
411 throw runtime_error("Only splitting by --split-sequence or --split-quantile is allowed.");
412
413 const size_t num_split = split_seq.size()+split_quant.size()==0 ? 0 :
414 ::max(split_seq.size(), split_quant.size()+1);
415
416 map<size_t, size_t> split_lut;
417 for (size_t i=0; i<split_seq.size(); i++)
418 {
419 const size_t sz = split_lut.size();
420 for (size_t j=0; j<split_seq[i]; j++)
421 split_lut.emplace(j+sz, i);
422 }
423
424 for (size_t i=0; i<split_quant.size(); i++)
425 if (split_quant[i]<0 || split_quant[i]>=1)
426 throw runtime_error("Splitting quantiles must be in the range [0;1)");
427
428 for (size_t i=1; i<split_quant.size(); i++)
429 {
430 if (split_quant[i]<=split_quant[i-1])
431 throw runtime_error("Splitting quantiles must be in increasing order.");
432 }
433
434 // -------------------------------------------------------------------------
435
436 const uniform_real_distribution<double> distribution(0,1);
437 mt19937_64 generator;
438 generator.seed(conf.Get<uint64_t>("seed"));
439 auto rndm = bind(distribution, generator);
440
441 // -------------------------------------------------------------------------
442
443 if (verbose>0)
444 {
445 cout << "\n-------------------------- Evaluating input ------------------------\n";
446 cout << "Start Time: " << Time::sql << Time(Time::local) << endl;
447 }
448
449 if (verbose>0)
450 cout << "Processing Tree: " << tree << endl;
451
452 TChain c(tree.c_str());
453
454 uint64_t cnt = 0;
455 for (const auto &file : files)
456 {
457 const auto add = c.Add(file.c_str(), 0);
458 if (verbose>0)
459 cout << file << ": " << add << " file(s) added." << endl;
460 cnt += add;
461 }
462
463 if (cnt==0)
464 {
465 cerr << "No files found." << endl;
466 return 1;
467 }
468
469 if (verbose>0)
470 cout << cnt << " file(s) found." << endl;
471
472 if (print_ls)
473 {
474 cout << '\n';
475 c.ls();
476 cout << '\n';
477 }
478
479 c.SetMakeClass(1);
480
481 TObjArray *branches = c.GetListOfBranches();
482 TObjArray *leaves = c.GetListOfLeaves();
483
484 if (print_branches)
485 {
486 cout << '\n';
487 branches->Print();
488 }
489
490 const auto entries = c.GetEntriesFast();
491
492 if (verbose>0)
493 cout << branches->GetEntries() << " branches found." << endl;
494
495 if (print_leaves)
496 {
497 cout << '\n';
498 leaves->Print();
499 }
500 if (verbose>0)
501 {
502 cout << leaves->GetEntries() << " leaves found." << endl;
503 cout << entries << " events found." << endl;
504 }
505
506 // ----------------------------------------------------------------------
507
508 if (verbose>0)
509 cout << "\n-------------------------- Evaluating output -----------------------" << endl;
510
511 vector<Container> vec;
512
513/*
514 const auto fixed = conf.GetWildcardOptions("const.*");
515
516 string where;
517 vector<string> vindex;
518 for (auto it=fixed.cbegin(); it!=fixed.cend(); it++)
519 {
520 const string name = it->substr(6);
521 string val = conf.Get<string>(*it);
522
523 boost::smatch match;
524 if (boost::regex_match(val, match, boost::regex("\\/(.+)(?<!\\\\)\\/(.*)(?<!\\\\)\\/")))
525 {
526 const string reg = match[1];
527 const string fmt = match[2];
528
529 val = boost::regex_replace(file, boost::regex(reg), fmt.empty()?"$0":fmt,
530 boost::regex_constants::format_default|boost::regex_constants::format_no_copy);
531
532 if (verbose>0)
533 {
534 cout << "Regular expression detected for constant column `" << *it << "`\n";
535 cout << "Filename converted with /" << reg << "/ to /" << fmt << "/\n";
536 cout << "Filename: " << file << '\n';
537 cout << "Result: " << val << endl;
538 }
539 }
540
541 if (verbose>2)
542 cout << "\n" << val << " [-const-]";
543 if (verbose>1)
544 cout << " (" << name << ")";
545
546 string sqltype = "INT UNSIGNED";
547
548 for (auto m=sqltypes.cbegin(); m!=sqltypes.cend(); m++)
549 if (m->first==name)
550 sqltype = m->second;
551
552 if (!vec.empty())
553 query += ",\n";
554 query += " `"+name+"` "+sqltype+" NOT NULL COMMENT '--user--'";
555
556 vec.emplace_back(name, val);
557 where += " AND `"+name+"`="+val;
558 vindex.emplace_back(name);
559 }
560 */
561
562 // ------------------------- Setup all branches in tree -------------------
563
564 TIter Next(leaves);
565 TObject *o = 0;
566 while ((o=Next()))
567 {
568 TLeaf *L = c.GetLeaf(o->GetName());
569
570 string name = o->GetName();
571
572 for (auto m=autoalias.cbegin(); m!=autoalias.cend(); m++)
573 name = boost::regex_replace(name, boost::regex(m->first), m->second);
574
575 if (name!=o->GetName())
576 {
577 if (verbose>0)
578 cout << "Auto-alias: " << name << " = " << o->GetName() << endl;
579 if (!c.SetAlias(name.c_str(), o->GetName()))
580 cout << "WARNING - Alias could not be established!" << endl;
581 }
582
583 if (skip)
584 continue;
585
586 if (verbose>2)
587 cout << '\n' << L->GetTitle() << " {" << L->GetTypeName() << "}";
588
589 if (L->GetLenStatic()!=L->GetLen())
590 {
591 if (verbose>2)
592 cout << " (-skipped-)";
593 continue;
594 }
595
596 bool found = false;
597 for (auto b=_ignore.cbegin(); b!=_ignore.cend(); b++)
598 {
599 if (boost::regex_match(o->GetName(), boost::regex(*b)))
600 {
601 found = true;
602 if (verbose>2)
603 cout << " (-ignored-)";
604 break;
605 }
606 }
607 for (auto b=_ignore.cbegin(); b!=_ignore.cend(); b++)
608 {
609 if (boost::regex_match(name.c_str(), boost::regex(*b)))
610 {
611 found = true;
612 if (verbose>2)
613 cout << " (-ignored-)";
614 break;
615 }
616 }
617
618 if (found)
619 continue;
620
621 const string tn = L->GetTypeName();
622
623 auto it = ConvRoot.find(tn);
624 if (it==ConvRoot.end())
625 {
626 if (verbose>2)
627 cout << " (-n/a-)";
628 continue;
629 }
630
631 if (verbose==2)
632 cout << '\n' << L->GetTitle() << " {" << L->GetTypeName() << "}";
633
634 if (verbose>1)
635 cout << " (" << name << ")";
636
637 vec.emplace_back(o->GetTitle(), name, it->second.first, L->GetLenStatic());
638 c.SetBranchAddress(o->GetTitle(), vec.back().ptr);
639 }
640
641 if (verbose>0)
642 {
643 if (skip)
644 cout << "Default columns skipped: ";
645 cout << vec.size() << " default leaf/leaves setup for reading." << endl;
646 }
647
648
649 // ------------------- Configure manual aliases ----------------------------
650
651 const auto valiases = conf.GetWildcardOptions("alias.*");
652 if (verbose>0 && valiases.size()>0)
653 cout << '\n';
654 for (auto it=valiases.cbegin(); it!=valiases.cend(); it++)
655 {
656 const string name = it->substr(6);
657 const string val = conf.Get<string>(*it);
658
659 if (verbose>0)
660 cout << "Alias: " << name << " = " << val << endl;
661
662 if (!c.SetAlias(name.c_str(), val.c_str()))
663 {
664 cerr << "Alias could not be established!" << endl;
665 return 2;
666 }
667 }
668
669 // -------------------------- Configure Selector --------------------------
670
671 vector<string> leaflist;
672 c.SetBranchStatus("*", 1);
673
674 TTreeFormulaManager *manager = new TTreeFormulaManager;
675
676 if (verbose>0)
677 cout << "\nSelector: " << conf.Get<string>("selector") << endl;
678
679 TTreeFormula selector("Selector", conf.Get<string>("selector").c_str(), &c);
680 if (selector.GetNdim()==0)
681 {
682 cerr << "Compilation of Selector failed!" << endl;
683 return 3;
684 }
685 selector.SetQuickLoad(kTRUE);
686 manager->Add(&selector);
687 GetLeaves(leaflist, selector);
688
689 // -------------------- Configure additional columns ----------------------
690
691 vector<TTreeFormula*> formulas;
692
693 const auto vform = conf.GetWildcardOptions("add.*");
694 if (verbose>0 && vform.size()>0)
695 cout << '\n';
696 for (auto it=vform.cbegin(); it!=vform.cend(); it++)
697 {
698 const string name = it->substr(4);
699 const string val = conf.Get<string>(*it);
700
701 if (verbose>0)
702 cout << "Adding column: " << name << " = " << val << endl;
703
704 TTreeFormula *form = new TTreeFormula(name.c_str(), val.c_str(), &c);
705 if (form->GetNdim()==0)
706 {
707 cerr << "Compilation of Column failed!" << endl;
708 return 4;
709 }
710 form->SetQuickLoad(kTRUE);
711 formulas.emplace_back(form);
712 manager->Add(form);
713 GetLeaves(leaflist, *form);
714 }
715 manager->Sync();
716
717 if (verbose>0)
718 cout << '\n' << formulas.size() << " additional columns setup for writing." << endl;
719
720 // --------------------- Setup all branches in formulas -------------------
721
722 for (auto l=leaflist.cbegin(); l!=leaflist.cend(); l++)
723 {
724 // Branch address already set
725 if (c.GetBranch(l->c_str())->GetAddress())
726 continue;
727
728 TLeaf *L = c.GetLeaf(l->c_str());
729
730 if (verbose>2)
731 cout << '\n' << L->GetTitle() << " {" << L->GetTypeName() << "}";
732
733 if (L->GetLenStatic()!=L->GetLen())
734 {
735 if (verbose>2)
736 cout << " (-skipped-)";
737 continue;
738 }
739
740 const string tn = L->GetTypeName();
741
742 auto it = ConvRoot.find(tn);
743 if (it==ConvRoot.end())
744 {
745 if (verbose>2)
746 cout << " (-n/a-)";
747 continue;
748 }
749
750 if (verbose==2)
751 cout << '\n' << L->GetTitle() << " {" << L->GetTypeName() << "}";
752
753 if (verbose>1)
754 cout << " (" << *l << ")";
755
756 vec.emplace_back(l->c_str(), l->c_str(), it->second.first, L->GetLenStatic());
757 c.SetBranchAddress(l->c_str(), vec.back().ptr);
758 }
759 if (verbose>1)
760 cout << '\n';
761
762 // ------------------------- Enable branch reading ------------------------
763
764 UInt_t datatype = 0;
765 const bool has_datatype = c.SetBranchAddress("DataType.fVal", &datatype) >= 0;
766
767 // Seting up branch status (must be after all SetBranchAddress)
768 c.SetBranchStatus("*", 0);
769 for (auto v=vec.cbegin(); v!=vec.cend(); v++)
770 if (v->type!=kConst)
771 c.SetBranchStatus(v->branch.c_str(), 1);
772
773 if (has_datatype)
774 {
775 c.SetBranchStatus("DataType.fVal", 1);
776 if (verbose>0)
777 cout << "Rows with DataType.fVal!=1 will be skipped." << endl;
778 }
779
780 // -------------------------------------------------------------------------
781
782 if (num_split)
783 {
784 cout << "\nSplitting configured " << (split_seq.empty()?"randomly":"in sequence") << " into " << num_split << " files." << endl;
785 if (!split_quant.empty())
786 cout << "Seed value configured as " << conf.Get<uint64_t>("seed") << "." << endl;
787 }
788
789 if (dryrun)
790 {
791 cout << "\nDry run: file output skipped!" << endl;
792 return 0;
793 }
794
795 if (verbose>0)
796 cout << "\n-------------------------- Converting file -------------------------" << endl;
797
798 vector<ofstream> outfiles;
799
800 if (num_split==0)
801 {
802 TString path(out.c_str());
803 const int rc = CheckFile(path, force, verbose);
804 if (rc>0)
805 return rc;
806
807 outfiles.emplace_back(path.Data(), append ? ios::app : ios::trunc);
808 if (rc==-1 || (force && rc==0 && !append))
809 WriteHeader(outfiles.back(), vec, formulas, skip, header);
810 }
811 else
812 {
813 for (size_t i=0; i<num_split; i++)
814 {
815 TString path(out.c_str());
816 path += "-";
817 path += i;
818
819 const int rc = CheckFile(path, force, verbose);
820 if (rc>0)
821 return rc;
822 outfiles.emplace_back(path.Data(), append ? ios::app : ios::trunc);
823 if (rc==-1 || (force && rc==0 && !append))
824 WriteHeader(outfiles.back(), vec, formulas, skip, header);
825 }
826 }
827
828 // ---------------------------- Write Body --------------------------------
829 size_t count = 0;
830 vector<size_t> ncount(num_split?num_split:1);
831
832 auto itree = c.GetTreeNumber();
833
834 const size_t num = max>0 && (max-first)<entries ? (max-first) : entries;
835 for (size_t j=first; j<num; j++)
836 {
837 c.GetEntry(j);
838 if (has_datatype && datatype!=1)
839 continue;
840
841 if (itree != c.GetTreeNumber())
842 {
843 manager->UpdateFormulaLeaves();
844 itree = c.GetTreeNumber();
845 }
846
847 if (selector.GetNdim() && selector.EvalInstance(0)<=0)
848 continue;
849
850 size_t index = 0;
851 if (!split_lut.empty())
852 index = split_lut[count % split_lut.size()];
853 if (!split_quant.empty())
854 {
855 const float r = rndm();
856 for (; r>=split_quant[index]; index++)
857 if (index==split_quant.size())
858 break;
859 }
860
861 vector<string> join;
862
863 if (!skip)
864 {
865 for (auto v=vec.cbegin(); v!=vec.cend(); v++)
866 {
867 const size_t N = v->num;
868 for (size_t i=0; i<N; i++)
869 join.emplace_back(v->fmt(i));
870 }
871 }
872
873 for (auto v=formulas.cbegin(); v!=formulas.cend(); v++)
874 join.emplace_back(to_string((*v)->EvalInstance(0)));
875
876 outfiles[index] << boost::join(join, " ") << "\n";
877
878 count ++;
879 ncount[index] ++;
880 }
881
882 if (verbose>0)
883 {
884 cout << "\nTotal: N=" << count << " out of " << num << " row(s) written [N=" << first << ".." << num-1 << "]." << endl;
885 for (int i=0; i<num_split; i++)
886 cout << "File " << i << ": nrows=" << ncount[i] << '\n';
887 cout << '\n';
888 }
889
890 if (verbose>0)
891 {
892 cout << "Total execution time: " << Time().UnixTime()-start.UnixTime() << "s.\n";
893 cout << "Success!\n" << endl;
894 }
895 return 0;
896}
// Note: This listing was taken from the TracBrowser repository browser.