source: trunk/FACT++/src/csv2root.cc@ 19793

Last change on this file since 19793 was 19793, checked in by tbretz, 5 years ago
Now also the opposite direction is possible.
File size: 17.8 KB
Line 
1#include <random>
2
3#include <boost/regex.hpp>
4#include <boost/filesystem.hpp>
5#include <boost/algorithm/string/join.hpp>
6
7#include "tools.h"
8#include "Time.h"
9#include "Configuration.h"
10
11#include <TROOT.h>
12#include <TSystem.h>
13#include <TFile.h>
14#include <TTree.h>
15#include <TError.h>
16#include <TObjArray.h>
17
18using namespace std;
19namespace fs = boost::filesystem;
20
21// ------------------------------------------------------------------------
22
23void SetupConfiguration(Configuration &conf)
24{
25 po::options_description control("Root to SQL");
26 control.add_options()
27 ("file", var<string>()->required(), "The csv input file")
28 ("out,o", var<string>(""), "Output root file name")
29 ("force,f", po_switch(), "Force overwrite if output file already exists.")
30 ("update,u", po_switch(), "Update an existing file")
31 ("tree,t", var<string>("Events"), "Name of the root tree to convert")
32 ("compression,c", var<uint16_t>(1), "zlib compression level for the root file")
33 ("no-header", po_switch(), "Use if the first line contains no header")
34 ("dry-run", po_switch(), "Do not create or manipulate any output file")
35 ;
36
37 po::options_description split("Splitting options");
38 split.add_options()
39 ("split-sequence,S", vars<uint16_t>(), "Split data sequentially into several trees/files (e.g. 1, 1, 2)")
40 ("split-quantile,Q", vars<double>(), "Split data randomly into several trees/files (e.g. 0.5, 1)")
41 ("seed", var<uint64_t>(mt19937_64::default_seed), "Seed value in case of random split")
42 ;
43
44 po::options_description debug("Debug options");
45 debug.add_options()
46 ("verbose,v", var<uint16_t>(1), "Verbosity (0: quiet, 1: default, 2: more, 3, ...)")
47 ;
48
49 po::positional_options_description p;
50 p.add("file", 1); // All positional options
51 p.add("out", 1); // All positional options
52
53 conf.AddOptions(control);
54 conf.AddOptions(split);
55 conf.AddOptions(debug);
56 conf.SetArgumentPositions(p);
57}
58
//
// Print the general description of the program to stdout.
//
// The text describes what csv2root itself does (input format, naming of
// the columns, handling of the output file and the two splitting modes).
// The list of options with their abbreviations is not part of this text.
//
void PrintUsage()
{
    std::cout <<
        "csv2root - Reads data from a csv file and writes it into a root tree\n"
        "\n"
        "For convenience, this documentation uses the extended version of the options, "
        "refer to the output below to get the abbreviations.\n"
        "\n"
        "The input is a text file with one row per line in which the columns are "
        "separated by blanks. All values are converted to single precision floating "
        "point numbers and each column is written to its own branch of the tree. "
        "A value which can not be converted or a row with a number of columns "
        "different from the first line stops the program with an error.\n"
        "\n"
        "As a default, the first line is expected to contain the names of the columns. "
        "If the first entry of this line starts with a '#', this entry is skipped. "
        "Colons are removed from the column names. If the file does not contain "
        "a header line, use --no-header. In this case, the first line is read as data "
        "and the columns are named col0, col1, ...\n"
        "\n"
        "The default name of the tree to write is 'Events'. It can be changed "
        "using --tree.\n"
        "\n"
        "If no name for the output file is given, it is derived from the name of the "
        "input file by replacing its extension with .root. If the output file exists "
        "already, the program stops with an error unless either --force (the file is "
        "recreated) or --update (the file is opened for update) is specified. "
        "The zlib compression level of the root file can be set with --compression.\n"
        "\n"
        "The rows can be distributed to several trees. In this case, the trees are "
        "named tree[0], tree[1], ... where 'tree' is the name given with --tree. "
        "Two rules are available of which only one can be used at a time.\n"
        "\n"
        "With --split-sequence, the number of consecutive rows is given which are "
        "filled into each tree before the next tree is used. After the last tree, "
        "the sequence starts again with the first one. The sequence 1, 1, 2, "
        "for example, fills the first row into tree[0], the second one into tree[1], "
        "the third and the fourth one into tree[2], the fifth one again into tree[0] "
        "and so on.\n"
        "\n"
        "With --split-quantile, a list of quantiles in the range [0;1) is given in "
        "increasing order. For each row, a random number uniformly distributed in "
        "[0;1) is drawn and the row is filled into the first tree for which the "
        "quantile is larger than this number or into the last tree if there is none. "
        "Therefore, N quantiles yield N+1 trees. The quantiles 0.3, 0.8, for example, "
        "fill about 30% of the rows into tree[0], 50% into tree[1] and 20% into "
        "tree[2]. To change the random sequence, a different --seed can be given.\n"
        "\n"
        "Using a higher verbosity level (-v), more information about the processed "
        "columns is printed.\n"
        "\n"
        "In case of success, 0 is returned, a value>0 otherwise.\n"
        "\n"
        "Usage: csv2root [options] input.csv [output.root]\n"
        "\n"
        ;
    std::cout << std::endl;
}
189
190/*
191void ErrorHandlerAll(Int_t level, Bool_t abort, const char *location, const char *msg)
192{
193 if (string(msg).substr(0,24)=="no dictionary for class ")
194 return;
195 if (string(msg).substr(0,15)=="unknown branch ")
196 return;
197
198 DefaultErrorHandler(level, abort, location, msg);
199}*/
200
201
202void AddTree(vector<TTree*> &ttree, TFile &file, const string &tree, bool update, int verbose)
203{
204 TTree *T = 0;
205 if (update)
206 {
207 file.GetObject(tree.c_str(), T);
208 if (T)
209 {
210 ttree.emplace_back(T);
211 if (verbose>0)
212 cout << "Updating tree: " << tree << endl;
213 }
214 }
215 if (!T)
216 ttree.emplace_back(new TTree(tree.c_str(), "csv2root"));
217}
218
219int main(int argc, const char* argv[])
220{
221 Time start;
222
223 gROOT->SetBatch();
224 //SetErrorHandler(ErrorHandlerAll);
225
226 Configuration conf(argv[0]);
227 conf.SetPrintUsage(PrintUsage);
228 SetupConfiguration(conf);
229
230 if (!conf.DoParse(argc, argv))
231 return 127;
232
233 // ----------------------------- Evaluate options --------------------------
234 const string file = conf.Get<string>("file");
235 const string tree = conf.Get<string>("tree");
236
237 const bool force = conf.Get<bool>("force");
238 const bool update = conf.Get<bool>("update");
239 const bool dryrun = conf.Get<bool>("dry-run");
240 const bool noheader = conf.Get<bool>("no-header");
241
242 const uint16_t verbose = conf.Get<uint16_t>("verbose");
243// const int64_t first = conf.Get<int64_t>("first");
244// const int64_t max = conf.Get<int64_t>("max");
245
246 const uint16_t compression = conf.Get<uint16_t>("compression");
247
248 string out = conf.Get<string>("out");
249 if (out.empty())
250 {
251 out = file;
252 const auto p = out.find_last_of('.');
253 if (p!=string::npos)
254 out = string(out.substr(0, p))+".root";
255 }
256
257 // ----------------------------- Setup splitting ---------------------------
258
259 vector<uint16_t> split_seq = conf.Vec<uint16_t>("split-sequence");
260 vector<double> split_quant = conf.Vec<double>("split-quantile");
261
262 if (!split_seq.empty() && !split_quant.empty())
263 throw runtime_error("Only splitting by --split-sequence or --split-quantile is allowed.");
264
265 const size_t num_split = split_seq.size()+split_quant.size()==0 ? 0 :
266 ::max(split_seq.size(), split_quant.size()+1);
267
268 map<size_t, size_t> split_lut;
269 for (size_t i=0; i<split_seq.size(); i++)
270 {
271 const size_t sz = split_lut.size();
272 for (size_t j=0; j<split_seq[i]; j++)
273 split_lut.emplace(j+sz, i);
274 }
275
276 for (size_t i=0; i<split_quant.size(); i++)
277 if (split_quant[i]<0 || split_quant[i]>=1)
278 throw runtime_error("Splitting quantiles must be in the range [0;1)");
279
280 for (size_t i=1; i<split_quant.size(); i++)
281 {
282 if (split_quant[i]<=split_quant[i-1])
283 throw runtime_error("Splitting quantiles must be in increasing order.");
284 }
285
286 // -------------------------------------------------------------------------
287
288 const uniform_real_distribution<double> distribution(0,1);
289 mt19937_64 generator;
290 generator.seed(conf.Get<uint64_t>("seed"));
291 auto rndm = bind(distribution, generator);
292
293 // -------------------------------------------------------------------------
294
295 if (verbose>0)
296 {
297 cout << "\n-------------------------- Evaluating input ------------------------\n";
298 cout << "Start Time: " << Time::sql << Time(Time::local) << endl;
299 }
300
301
302 // -------------------------------------------------------------------------
303
304 cout << "Reading from '" << file << "'.\n";
305
306 ifstream fin(file.c_str());
307 if (!fin.good())
308 {
309 cerr << file << ": " << strerror(errno) << endl;
310 return 1;
311 }
312
313 TString buf;
314 buf.ReadLine(fin);
315 if (!fin)
316 {
317 cerr << file << ": " << strerror(errno) << endl;
318 return 2;
319 }
320
321 TObjArray *title = buf.Tokenize(" ");
322 if (title->GetEntries()==0)
323 {
324 cerr << "First line empty." << endl;
325 return 3;
326 }
327
328 if (title->At(0)->GetName()[0]=='#')
329 title->RemoveAt(0);
330
331 const auto numcol = title->GetEntries();
332
333 if (verbose>0)
334 cout << "Found " << numcol << " columns." << endl;
335
336 if (noheader)
337 fin.seekg(0);
338
339 // -------------------------------------------------------------------------
340
341 TString path(out.c_str());
342 gSystem->ExpandPathName(path);
343
344 if (!dryrun)
345 {
346 FileStat_t stat;
347 const Int_t exist = !gSystem->GetPathInfo(path, stat);
348 const Bool_t _write = !gSystem->AccessPathName(path, kWritePermission) && R_ISREG(stat.fMode);
349
350 if ((update && !exist) || (update && exist && !_write) || (force && exist && !_write))
351 {
352 cerr << "File '" << path << "' is not writable." << endl;
353 return 4;
354 }
355
356 if (!update && !force && exist)
357 {
358 cerr << "File '" << path << "' already exists." << endl;
359 return 5;
360 }
361 }
362
363 TFile tfile(path, update?"UPDATE":(force?"RECREATE":"CREATE"), file.c_str(), compression);
364 if (tfile.IsZombie())
365 {
366 cerr << "Opening file '" << path << "' failed." << endl;
367 return 6;
368 }
369
370 if (verbose>0)
371 {
372 cout << "Opened root file '" << path << "'.\n";
373 cout << "Writing to tree: " << tree << ".\n";
374 }
375
376 // -------------------- Configure branches of TTree ------------------------
377 vector<TTree*> ttree;
378
379 if (num_split==0)
380 AddTree(ttree, tfile, tree, update, verbose);
381 else
382 {
383 for (size_t i=0; i<num_split; i++)
384 AddTree(ttree, tfile, tree+"["+to_string(i)+"]", update, verbose);
385 }
386
387
388 vector<float> vec(numcol);
389 for (int i=0; i<numcol; i++)
390 {
391 string col = noheader ? Tools::Form("col%d", i) : title->At(i)->GetName();
392
393 boost::regex rexpr(":");
394 col = boost::regex_replace(col, rexpr, "");
395
396 if (verbose>1)
397 cout << "Column: " << col << '\n';
398
399 for (auto it=ttree.begin(); it!=ttree.end(); it++)
400 it[0]->Branch(col.c_str(), vec.data()+i);
401 }
402
403 delete title;
404
405 // -------------------------------------------------------------------------
406
407 size_t line = 0;
408
409 while (1)
410 {
411 buf.ReadLine(fin);
412 if (!fin)
413 break;
414
415 TObjArray *arr = buf.Tokenize(" ");
416 if (arr->GetEntries()!=numcol)
417 {
418 cerr << "Column count mismatch in line " << line+1 << "!" << endl;
419 return 6;
420 }
421
422 for (int i=0; i<numcol; i++)
423 {
424 try
425 {
426 vec[i] = stof(arr->At(i)->GetName());
427 }
428 catch (const exception &e)
429 {
430 cerr << "Conversion of '" << arr->At(i)->GetName() << "' failed!" << endl;
431 return 7;
432 }
433 }
434
435 delete arr;
436
437
438 size_t index = 0;
439 if (!split_lut.empty())
440 index = split_lut[line % split_lut.size()];
441 if (!split_quant.empty())
442 {
443 const float r = rndm();
444 for (; r>=split_quant[index]; index++)
445 if (index==split_quant.size())
446 break;
447 }
448
449 ttree[index]->Fill();
450 line++;
451 }
452
453 if (verbose>0)
454 {
455 cout << line << " data rows read from file." << endl;
456 for (size_t i=0; i<ttree.size(); i++)
457 cout << ttree[i]->GetEntries() << " rows filled into tree #" << i << "." << endl;
458 }
459
460 for (auto it=ttree.begin(); it!=ttree.end(); it++)
461 (*it)->Write("", TObject::kOverwrite);
462 tfile.Close();
463
464 if (verbose>0)
465 {
466 const auto sec = Time().UnixTime()-start.UnixTime();
467
468 cout << Tools::Scientific(tfile.GetSize()) << "B written to disk.\n";
469 cout << "File closed.\n";
470 cout << "Execution time: " << sec << "s ";
471 cout << "(" << Tools::Fractional(sec/line) << "s/row)\n";
472 cout << "--------------------------------------------------------------" << endl;
473 }
474
475 return 0;
476}
Note: See TracBrowser for help on using the repository browser.