source: trunk/FACT++/src/root2sql.cc@ 19120

Last change on this file since 19120 was 19120, checked in by tbretz, 6 years ago
Print average execution time per row.
File size: 22.9 KB
Line 
1#include <boost/algorithm/string/join.hpp>
2#include <boost/regex.hpp>
3
4#include "Database.h"
5
6#include "Time.h"
7#include "Configuration.h"
8
9#include <TROOT.h>
10#include <TFile.h>
11#include <TTree.h>
12#include <TLeaf.h>
13#include <TError.h>
14
15using namespace std;
16
17// ------------------------------------------------------------------------
18
19struct Map : pair<string, string>
20{
21 Map() { }
22};
23
24std::istream &operator>>(std::istream &in, Map &m)
25{
26 const istreambuf_iterator<char> eos;
27 string txt(istreambuf_iterator<char>(in), eos);
28
29 const boost::regex expr("((.*)[^\\\\])/(.*)");
30 boost::smatch match;
31 if (!boost::regex_match(txt, match, expr))
32 throw runtime_error("Could not evaluate map argument: "+txt);
33
34 m.first = match[1].str();
35 m.second = match[3].str();
36
37 return in;
38}
39
40
41void SetupConfiguration(Configuration &conf)
42{
43 po::options_description control("Root to SQL");
44 control.add_options()
45 ("uri,u", var<string>()
46#if BOOST_VERSION >= 104200
47 ->required()
48#endif
49 , "Database link as in\n\tuser:password@server[:port]/database.")
50 ("file", var<string>("")
51#if BOOST_VERSION >= 104200
52 ->required()
53#endif
54 , "The root file to read from")
55 ("force", po_switch(), "Force processing even if there is no database connection")
56 ("create", po_switch(), "Create the database if not existing")
57 ("drop", po_switch(), "Drop the table (implies create)")
58 ("tree,t", var<string>("Events"), "Name of the root tree to convert")
59 ("table", var<string>(""), "Name of the table to use (default is the tree name)")
60 ("map", vars<Map>(), "A regular expression which is applied to the leaf name befoee it is used as SQL column name)")
61 ("sql-type", vars<Map>(), "Allows to overwrite the calculated SQL type for a given column e.g. 'sql-column-name/UNSIGNED IN'")
62 ("ignore", vars<string>(), "Ignore the given leaf, if the given regular expression matches")
63 ("primary", vars<string>(), "List of columns to be used as primary keys during table creation (in connection with --create)")
64 ("first", var<int64_t>(int64_t(0)), "First event to start with (default: 0), mainly for test purpose")
65 ("max", var<int64_t>(int64_t(0)), "Maximum number of events to process (0: all), mainly for test purpose")
66 ("engine", var<string>(""), "Database engine to be used when a new table is created")
67 ("row-format", var<string>(""), "Defines the ROW_FORMAT keyword for table creation")
68 ("duplicate", vars<string>(""), "Specifies an assignment_list for an 'ON DUPLICATE KEY UPDATE' expression")
69 ("ignore-errors", po_switch(), "Adds the IGNORE keyword to the INSERT query (turns errors into warnings, ignores rows with errors)")
70 ;
71
72 po::options_description debug("Debug options");
73 debug.add_options()
74 ("no-insert", po_switch(), "Does not insert any data into the table")
75 ("dry-run", po_switch(), "Skip any query which changes the databse (might result in consecutive failures)")
76 ("print-branches", po_switch(), "Print the branches found in the tree")
77 ("print-leaves", po_switch(), "Print the leaves found in the tree (this is what is processed)")
78 ("print-insert", po_switch(), "Print the INSERT query (note that it contains all data)")
79 ("print-create", po_switch(), "Print the CREATE query")
80 ("verbose,v", var<uint16_t>(1), "Verbosity (0: quiet, 1: default, 2: more, 3, ...)")
81 ;
82
83 po::positional_options_description p;
84 p.add("file", 1); // The 1st positional options (n=1)
85
86 conf.AddOptions(control);
87 conf.AddOptions(debug);
88 conf.SetArgumentPositions(p);
89}
90
91void PrintUsage()
92{
93 cout <<
94 "root2sql - Fills the data from a root file into a database\n"
95 "\n"
96 "For convenience, this documentation uses the extended version of the options, "
97 "refer to the output below to get the abbreviations.\n"
98 "\n"
99 "This is a general purpose tool to fill the contents of a root file into a database "
100 "as long as this is technically possible and makes sense. Note that root can even "
101 "write complex data like a TH1F into a database, this is not the purpose of this "
102 "program.\n"
103 "\n"
104 "Each root tree has branches and leaves (the basic data types). These leaves can "
105 "be read independently of the classes which were used to write the root file. "
106 "The default tree to read from is 'Events' but the name can be overwritten "
107 "using --tree. The default table name to fill the data into is identical to "
108 "the tree name. It can be overwritten using --table.\n"
109 "\n"
110 "The name of each column to which data is filled from a leave is obtained from "
111 "the leaves' names. The leave names can be checked using --print-leaves. "
112 "A --print-branches exists for convenience to print only the high-level branches. "
113 "Sometimes these names might be quite unconvenient like MTime.fTime.fMilliSec or "
114 "just MHillas.fWidth. To allow to simplify column names, regular expressions "
115 "(using boost's regex) can be defined to change the names. Note that these regular "
116 "expressions are applied one by one on each leaf's name. A valid expression could "
117 "be:\n"
118 " --map=MHillas\\.f/\n"
119 "which would remove all occurances of 'MHillas.f'. This option can be used more than "
120 "once. They are applied in sequence. A single match does not stop the sequence.\n"
121 "\n"
122 "Sometimes it might also be convenient to skip a leaf. This can be done with "
123 "the --ignore resource. If the given regular expresion yields a match, the "
124 "leaf will be ignored. Note that the regular expression works on the raw-name "
125 "of the leaf not the readily mapped SQL column names. Example:\n"
126 " --ignore=ThetaSq\\..*\n"
127 "will skip all leaved which start with 'ThetaSq.'. This option can be used"
128 "more than once.\n"
129 "\n"
130 "The data type of each column is kept as close as possible to the leaves' data "
131 "types. If for some reason this is not wanted, the data type of the SQL column "
132 "can be overwritten with --sql-type sql-column/sql-ytpe, for example:\n"
133 " --sql-type=FileId/UNSIGNED INT\n"
134 "while the first argument of the name of the SQL column to which the data type "
135 "should be applied. The second column is the basic SQL data type. The option can "
136 "be given more than once.\n"
137 "\n"
138 "Database interaction:\n"
139 "\n"
140 "To drop an existing table, --drop can be used.\n"
141 "\n"
142 "To create a table according to theSQL column names and data types, --create "
143 "can be used. The query used can be printed with --print-create even --create "
144 "has not been specified.\n"
145 "\n"
146 "To choose the columns which should become primary keys, use --primary, "
147 "for exmaple:\n"
148 " --primary=col1\n"
149 "To define more than one column as primary key, the option can be given more than "
150 "once. Note that the combination of these columns must be unique.\n"
151 "\n"
152 "All columns are created as NOT NULL as default. To force a database engine "
153 "and/or a storage format, use --engine and --rot-format.\n"
154 "\n"
155 "Usually, the INSERT query would fail if the PRIMARY key exists already. "
156 "This can be avoided using the 'ON DUPLICATE KEY UPDATE' directive. With the "
157 "--duplicate, you can specify what should be updated in case of a duplicate key. "
158 "To keep the row untouched, you can just update the primary key "
159 "with the identical primary key, e.g. --duplicate='MyPrimary=VALUES(MyPrimary)'. "
160 "The --duplicate resource can be specified more than once to add more expressions "
161 "to the assignment_list. For more details, see the MySQL manual.\n"
162 "\n"
163 "For debugging purpose, or to just create or drop a table, the final insert "
164 "query can be skipped using --no-insert. Note that for performance reason, "
165 "all data is collected in memory and a single INSERT query is issued at the "
166 "end.\n"
167 "\n"
168 "Another possibility is to add the IGNORE keyword to the INSERT query by "
169 "--ignore-errors, which essentially ignores all errors and turns them into "
170 "warnings which are printed after the query succeeded.\n"
171 "\n"
172 "Using a higher verbosity level (-v), an overview of the written columns or all "
173 "processed leaves is printed depending on the verbosity level. The output looks "
174 "like the following\n"
175 " Leaf name [root data type] (SQL name)\n"
176 "for example\n"
177 " MTime.fTime.fMilliSec [Long64_t] (MilliSec)\n"
178 "which means that the leaf MTime.fTime.fMilliSec is detected to be a Long64_t "
179 "which is filled into a column called MilliSec. Leaves with non basic data types "
180 "are ignored automatically and are marked as (-n/a-). User ignored columnd "
181 "are marked as (-ignored-).\n"
182 "\n"
183 "If a query failed, the query is printed to stderr together with the error message. "
184 "For the main INSERT query, this is only true if the verbosity level is at least 2 "
185 "or the query has less than 80*25 bytes.\n"
186 "\n"
187 "In case of succes, 0 is returned, a value>0 otherwise.\n"
188 "\n"
189 "Usage: root2sql [options] -uri URI rootfile.root\n"
190 "\n"
191 ;
192 cout << endl;
193}
194
195enum BasicType_t
196{
197 kNone = 0,
198 kFloat,
199 kDouble,
200 kInt16,
201 kUInt16,
202 kInt32,
203 kUInt32,
204 kInt64,
205 kUInt64,
206};
207
208static const map<string, pair<BasicType_t, string>> ConvRoot =
209{
210 { "Float_t", { kFloat, "FLOAT" } },
211 { "Double_t", { kDouble, "DOUBLE" } },
212 { "ULong64_t", { kUInt64, "BIGINT UNSIGNED" } },
213 { "Long64_t", { kInt64, "BIGINT" } },
214 { "UInt_t", { kUInt32, "INT UNSIGNED" } },
215 { "Int_t", { kInt32, "INT" } },
216 { "UShort_t", { kUInt16, "SMALLINT UNSIGNED" } },
217 { "Short_t", { kInt16, "SMALLINT" } },
218};
219
220struct Container
221{
222 string branch; // branch name
223 string column; // column name
224 BasicType_t type;
225 void *ptr;
226
227 Container(const string &b, const string &c, const BasicType_t &t) : branch(b), column(c), type(t), ptr(0)
228 {
229 switch (t)
230 {
231 case kFloat: ptr = new Float_t; break;
232 case kDouble: ptr = new Double_t; break;
233 case kInt16: ptr = new Short_t; break;
234 case kUInt16: ptr = new UShort_t; break;
235 case kInt32: ptr = new Int_t; break;
236 case kUInt32: ptr = new UInt_t; break;
237 case kInt64: ptr = new Long64_t; break;
238 case kUInt64: ptr = new ULong64_t; break;
239 case kNone:
240 break;
241 }
242 }
243 ~Container()
244 {
245 //::operator delete(ptr); // It seems root is deleting it already
246 }
247
248 string fmt() const
249 {
250 ostringstream str;
251
252 switch (type)
253 {
254 case kFloat: str << setprecision(8) << *reinterpret_cast<Float_t*>(ptr); break;
255 case kDouble: str << setprecision(16) << *reinterpret_cast<Double_t*>(ptr); break;
256 case kInt16: str << *reinterpret_cast<Short_t*>(ptr); break;
257 case kUInt16: str << *reinterpret_cast<UShort_t*>(ptr); break;
258 case kInt32: str << *reinterpret_cast<Int_t*>(ptr); break;
259 case kUInt32: str << *reinterpret_cast<UInt_t*>(ptr); break;
260 case kInt64: str << *reinterpret_cast<Long64_t*>(ptr); break;
261 case kUInt64: str << *reinterpret_cast<ULong64_t*>(ptr); break;
262 case kNone:
263 break;
264 }
265
266 //if (str.str()=="nan" || str.str()=="-nan" || str.str()=="inf" || str.str()=="-inf")
267 // return "NULL";
268
269 return str.str();
270 }
271};
272
273void ErrorHandlerAll(Int_t level, Bool_t abort, const char *location, const char *msg)
274{
275 if (string(msg).substr(0,24)=="no dictionary for class ")
276 return;
277
278 DefaultErrorHandler(level, abort, location, msg);
279}
280
281int main(int argc, const char* argv[])
282{
283 Time start;
284
285 gROOT->SetBatch();
286 SetErrorHandler(ErrorHandlerAll);
287
288 Configuration conf(argv[0]);
289 conf.SetPrintUsage(PrintUsage);
290 SetupConfiguration(conf);
291
292 if (!conf.DoParse(argc, argv))
293 return 127;
294
295 // ----------------------------- Evaluate options --------------------------
296 const string uri = conf.Get<string>("uri");
297 const string file = conf.Get<string>("file");
298 const string tree = conf.Get<string>("tree");
299 const string table = conf.Get<string>("table").empty() ? tree : conf.Get<string>("table");
300
301 const uint16_t verbose = conf.Get<uint16_t>("verbose");
302 const int64_t first = conf.Get<int64_t>("first");
303 const int64_t max = conf.Get<int64_t>("max");
304
305 const bool force = conf.Get<bool>("force");
306 const bool drop = conf.Get<bool>("drop");
307 const bool create = conf.Get<bool>("create") || drop;
308 const bool noinsert = conf.Get<bool>("no-insert");
309 const bool dry_run = conf.Get<bool>("dry-run");
310
311 const string engine = conf.Get<string>("engine");
312 const string row_format = conf.Get<string>("row-format");
313
314 const vector<string> duplicate = conf.Vec<string>("duplicate");
315
316 const bool ignore_errors = conf.Get<bool>("ignore-errors");
317
318 const bool print_branches = conf.Get<bool>("print-branches");
319 const bool print_leaves = conf.Get<bool>("print-leaves");
320 const bool print_create = conf.Get<bool>("print-create");
321 const bool print_insert = conf.Get<bool>("print-insert");
322
323 const vector<Map> mymap = conf.Vec<Map>("map");
324 const vector<Map> sqltypes = conf.Vec<Map>("sql-type");
325 const vector<string> _ignore = conf.Vec<string>("ignore");
326 const vector<string> primary = conf.Vec<string>("primary");
327
328 // -------------------------------------------------------------------------
329
330 if (verbose>0)
331 cout << "\n-------------------------- Evaluating file -------------------------" << endl;
332
333 TFile f(file.c_str());
334 if (f.IsZombie())
335 {
336 cerr << "Could not open file " << file << endl;
337 return 1;
338 }
339
340 if (verbose>0)
341 cout << "File: " << file << endl;
342
343 TTree *T = 0;
344 f.GetObject(tree.c_str(), T);
345 if (!T)
346 {
347 cerr << "Could not open tree " << tree << endl;
348 return 2;
349 }
350
351 if (verbose>0)
352 cout << "Tree: " << tree << endl;
353
354 T->SetMakeClass(1);
355
356 TObjArray *branches = T->GetListOfBranches();
357 TObjArray *leaves = T->GetListOfLeaves();
358
359 if (print_branches)
360 {
361 cout << '\n';
362 branches->Print();
363 }
364
365 if (verbose>0)
366 cout << T->GetEntriesFast() << " events found." << endl;
367
368
369 if (verbose>0)
370 cout << branches->GetEntries() << " branches found." << endl;
371
372 if (print_leaves)
373 {
374 cout << '\n';
375 leaves->Print();
376 }
377 if (verbose>0)
378 cout << leaves->GetEntries() << " leaves found." << endl;
379
380 string query =
381 "CREATE TABLE IF NOT EXISTS `"+table+"`\n"
382 "(\n";
383
384 vector<Container> vec;
385
386
387 TIter Next(leaves);
388 TObject *o = 0;
389 while ((o=Next()))
390 {
391 TLeaf *L = T->GetLeaf(o->GetName());
392
393 if (verbose>2)
394 cout << '\n' << o->GetName() << " [" << L->GetTypeName() << "]";
395
396 string name = o->GetName();
397
398
399 bool found = false;
400 for (auto b=_ignore.cbegin(); b!=_ignore.cend(); b++)
401 {
402 if (boost::regex_match(name, boost::regex(*b)))
403 {
404 found = true;
405 if (verbose>2)
406 cout << " (-ignored-)";
407 break;
408 }
409 }
410 if (found)
411 continue;
412
413 const string tn = L->GetTypeName();
414
415 auto it = ConvRoot.find(tn);
416 if (it==ConvRoot.end())
417 {
418 if (verbose>2)
419 cout << " (-n/a-)";
420 continue;
421 }
422
423 if (verbose==2)
424 cout << '\n' << o->GetName() << " [" << L->GetTypeName() << "]";
425
426 for (auto m=mymap.cbegin(); m!=mymap.cend(); m++)
427 name = boost::regex_replace(name, boost::regex(m->first), m->second);
428
429 if (verbose>1)
430 cout << " (" << name << ")";
431
432 string sqltype = it->second.second;
433
434 for (auto m=sqltypes.cbegin(); m!=sqltypes.cend(); m++)
435 if (m->first==name)
436 sqltype = m->second;
437
438 if (!vec.empty())
439 query += ",\n";
440 query += " `"+name+"` "+sqltype+" NOT NULL COMMENT '"+o->GetName()+"'";
441
442 vec.emplace_back(o->GetName(), name, it->second.first);
443 T->SetBranchAddress(o->GetName(), vec.back().ptr);
444 }
445
446 if (verbose>1)
447 cout << "\n\n";
448 if (verbose>0)
449 cout << vec.size() << " leaves setup for reading." << endl;
450
451 UInt_t datatype = 0;
452 const bool has_datatype = T->SetBranchAddress("DataType.fVal", &datatype) >= 0;
453
454 // Setiing up branch status (must be after all SetBranchAddress)
455 T->SetBranchStatus("*", 0);
456 for (auto c=vec.cbegin(); c!=vec.cend(); c++)
457 T->SetBranchStatus(c->branch.c_str(), 1);
458
459 if (has_datatype)
460 {
461 T->SetBranchStatus("DataType.fVal", 1);
462 if (verbose>0)
463 cout << "Rows with DataType.fVal!=1 will be skipped." << endl;
464 }
465
466 // -------------------------------------------------------------------------
467 // Checking for database connection
468
469 Database connection(uri);
470
471 try
472 {
473 if (!force)
474 connection.connected();
475 }
476 catch (const exception &e)
477 {
478 cerr << "SQL connection failed: " << e.what() << '\n' << endl;
479 return 3;
480 }
481
482 // -------------------------------------------------------------------------
483
484 if (verbose>0)
485 cout << "\n--------------------------- Database Table -------------------------" << endl;
486
487 if (!primary.empty())
488 query += ",\n PRIMARY KEY USING BTREE (`"+boost::algorithm::join(primary, "`, `")+"`)\n";
489
490 query +=
491 ")\n"
492 "DEFAULT CHARSET=latin1 COLLATE=latin1_general_ci\n";
493 if (!engine.empty())
494 query += "ENGINE="+engine+"\n";
495 if (!row_format.empty())
496 query += "ROW_FORMAT="+row_format+"\n";
497 query += "COMMENT='created by "+conf.GetName()+"'\n";
498
499 // FIXME: Can we omit the catching to be able to print the
500 // query 'autmatically'?
501 try
502 {
503 if (drop)
504 {
505 // => Simple result
506 if (!dry_run)
507 connection.query("DROP TABLE `"+table+"`").execute();
508 if (verbose>0)
509 cout << "Table `" << table << "` dropped." << endl;
510 }
511 }
512 catch (const exception &e)
513 {
514 cerr << "DROP TABLE `" << table << "`\n\n";
515 cerr << "SQL query failed:\n" << e.what() << '\n' << endl;
516 return 4;
517 }
518
519 try
520 {
521 if (create && !dry_run)
522 connection.query(query).execute();
523 }
524 catch (const exception &e)
525 {
526 cerr << query << "\n\n";
527 cerr << "SQL query failed:\n" << e.what() << '\n' << endl;
528 return 5;
529 }
530
531 if (print_create)
532 cout << query << endl;
533
534 if (create && verbose>0)
535 cout << "Table `" << table << "` created." << endl;
536
537 // -------------------------------------------------------------------------
538
539 if (verbose>0)
540 cout << "\n---------------------------- Reading file --------------------------" << endl;
541
542 //query = update ? "UPDATE" : "INSERT";
543 query = "INSERT ";
544 if (ignore_errors)
545 query += "IGNORE ";
546 query += "`"+table+"`\n"
547 "(\n";
548
549 for (auto c=vec.cbegin(); c!=vec.cend(); c++)
550 {
551 if (c!=vec.cbegin())
552 query += ",\n";
553 query += " `"+c->column+"`";
554 }
555
556 query +=
557 "\n)\n"
558 "VALUES\n";
559
560 size_t count = 0;
561
562 const size_t num = max>0 && (max-first)<T->GetEntriesFast() ? (max-first) : T->GetEntriesFast();
563 for (size_t j=first; j<num; j++)
564 {
565 T->GetEntry(j);
566 if (has_datatype && datatype!=1)
567 continue;
568
569 if (count>0)
570 query += ",\n";
571
572 query += "(\n";
573
574 for (auto c=vec.cbegin(); c!=vec.cend(); c++)
575 {
576 if (c!=vec.cbegin())
577 query += ",\n";
578
579 query += " "+c->fmt();
580
581 if (print_insert)
582 query += " /* "+c->column+" -> "+c->branch+" */";
583 }
584 query += "\n)";
585
586 count ++;
587 }
588
589 if (!duplicate.empty())
590 query += "\nON DUPLICATE KEY UPDATE\n " + boost::join(duplicate, ",\n ");
591
592 if (verbose>0)
593 cout << count << " out of " << num << " row(s) read from file [N=" << first << ".." << num-1 << "]." << endl;
594
595 if (count==0)
596 {
597 if (verbose>0)
598 cout << "Total execution time: " << Time().UnixTime()-start.UnixTime() << "s\n" << endl;
599 return 0;
600 }
601
602 // -------------------------------------------------------------------------
603
604 if (verbose>0)
605 {
606 cout << "\n--------------------------- Inserting data -------------------------" << endl;
607 cout << "Sending INSERT query (" << query.length() << " bytes)" << endl;
608 }
609
610 try
611 {
612 if (!noinsert && !dry_run)
613 {
614 auto q = connection.query(query);
615 q.execute();
616 cout << q.info() << '\n' << endl;
617 }
618 else
619 cout << "Insert query skipped!" << endl;
620
621 if (print_insert)
622 cout << query << endl;
623 }
624 catch (const exception &e)
625 {
626 if (verbose>1 || query.length()<80*25)
627 cerr << query << "\n\n";
628 cerr << "SQL query failed (" << query.length() << " bytes):\n" << e.what() << '\n' << endl;
629 return 6;
630 }
631
632 if (verbose>0)
633 {
634 const auto sec = Time().UnixTime()-start.UnixTime();
635 cout << "Total execution time: " << sec << "s ";
636 cout << "(" << Tools::Fractional(sec/count) << "s/row)\n";
637
638 try
639 {
640 const auto resw =
641 connection.query("SHOW WARNINGS").store();
642
643 for (size_t i=0; i<resw.num_rows(); i++)
644 {
645 const mysqlpp::Row &roww = resw[i];
646
647 cout << roww["Level"] << '[' << roww["Code"] << "]: ";
648 cout << roww["Message"] << '\n';
649 }
650 cout << endl;
651
652 }
653 catch (const exception &e)
654 {
655 cerr << "\nSHOW WARNINGS\n\n";
656 cerr << "SQL query failed:\n" << e.what() << '\n' <<endl;
657 return 7;
658 }
659 }
660
661 return 0;
662}
Note: See TracBrowser for help on using the repository browser.