source: trunk/FACT++/src/root2sql.cc@ 19114

Last change on this file since 19114 was 19114, checked in by tbretz, 6 years ago
Read duplicate as a vector of strings.
File size: 22.7 KB
Line 
1#include <boost/algorithm/string/join.hpp>
2#include <boost/regex.hpp>
3
4#include "Database.h"
5
6#include "Time.h"
7#include "Configuration.h"
8
9#include <TROOT.h>
10#include <TFile.h>
11#include <TTree.h>
12#include <TLeaf.h>
13#include <TError.h>
14
15using namespace std;
16
17// ------------------------------------------------------------------------
18
19struct Map : pair<string, string>
20{
21 Map() { }
22};
23
24std::istream &operator>>(std::istream &in, Map &m)
25{
26 const istreambuf_iterator<char> eos;
27 string txt(istreambuf_iterator<char>(in), eos);
28
29 const boost::regex expr("((.*)[^\\\\])/(.*)");
30 boost::smatch match;
31 if (!boost::regex_match(txt, match, expr))
32 throw runtime_error("Could not evaluate map argument: "+txt);
33
34 m.first = match[1].str();
35 m.second = match[3].str();
36
37 return in;
38}
39
40
41void SetupConfiguration(Configuration &conf)
42{
43 po::options_description control("Root to SQL");
44 control.add_options()
45 ("uri,u", var<string>()
46#if BOOST_VERSION >= 104200
47 ->required()
48#endif
49 , "Database link as in\n\tuser:password@server[:port]/database.")
50 ("file", var<string>("")
51#if BOOST_VERSION >= 104200
52 ->required()
53#endif
54 , "The root file to read from")
55 ("force", po_switch(), "Force processing even if there is no database connection")
56 ("create", po_switch(), "Create the database if not existing")
57 ("drop", po_switch(), "Drop the table (implies create)")
58 ("tree,t", var<string>("Events"), "Name of the root tree to convert")
59 ("table", var<string>(""), "Name of the table to use (default is the tree name)")
60 ("map", vars<Map>(), "A regular expression which is applied to the leaf name befoee it is used as SQL column name)")
61 ("sql-type", vars<Map>(), "Allows to overwrite the calculated SQL type for a given column e.g. 'sql-column-name/UNSIGNED IN'")
62 ("ignore", vars<string>(), "Ignore the given leaf, if the given regular expression matches")
63 ("primary", vars<string>(), "List of columns to be used as primary keys during table creation (in connection with --create)")
64 ("first", var<int64_t>(int64_t(0)), "First event to start with (default: 0), mainly for test purpose")
65 ("max", var<int64_t>(int64_t(0)), "Maximum number of events to process (0: all), mainly for test purpose")
66 ("engine", var<string>(""), "Database engine to be used when a new table is created")
67 ("row-format", var<string>(""), "Defines the ROW_FORMAT keyword for table creation")
68 ("duplicate", vars<string>(""), "Specifies an assignment_list for an 'ON DUPLICATE KEY UPDATE' expression")
69 ("ignore-errors", po_switch(), "Adds the IGNORE keyword to the INSERT query (turns errors into warnings, ignores rows with errors)")
70 ;
71
72 po::options_description debug("Debug options");
73 debug.add_options()
74 ("no-insert", po_switch(), "Does not insert any data into the table")
75 ("dry-run", po_switch(), "Skip any query which changes the databse (might result in consecutive failures)")
76 ("print-branches", po_switch(), "Print the branches found in the tree")
77 ("print-leaves", po_switch(), "Print the leaves found in the tree (this is what is processed)")
78 ("print-insert", po_switch(), "Print the INSERT query (note that it contains all data)")
79 ("print-create", po_switch(), "Print the CREATE query")
80 ("verbose,v", var<uint16_t>(1), "Verbosity (0: quiet, 1: default, 2: more, 3, ...)")
81 ;
82
83 po::positional_options_description p;
84 p.add("file", 1); // The 1st positional options (n=1)
85
86 conf.AddOptions(control);
87 conf.AddOptions(debug);
88 conf.SetArgumentPositions(p);
89}
90
91void PrintUsage()
92{
93 cout <<
94 "root2sql - Fills the data from a root file into a database\n"
95 "\n"
96 "For convenience, this documentation uses the extended version of the options, "
97 "refer to the output below to get the abbreviations.\n"
98 "\n"
99 "This is a general purpose tool to fill the contents of a root file into a database "
100 "as long as this is technically possible and makes sense. Note that root can even "
101 "write complex data like a TH1F into a database, this is not the purpose of this "
102 "program.\n"
103 "\n"
104 "Each root tree has branches and leaves (the basic data types). These leaves can "
105 "be read independently of the classes which were used to write the root file. "
106 "The default tree to read from is 'Events' but the name can be overwritten "
107 "using --tree. The default table name to fill the data into is identical to "
108 "the tree name. It can be overwritten using --table.\n"
109 "\n"
110 "The name of each column to which data is filled from a leave is obtained from "
111 "the leaves' names. The leave names can be checked using --print-leaves. "
112 "A --print-branches exists for convenience to print only the high-level branches. "
113 "Sometimes these names might be quite unconvenient like MTime.fTime.fMilliSec or "
114 "just MHillas.fWidth. To allow to simplify column names, regular expressions "
115 "(using boost's regex) can be defined to change the names. Note that these regular "
116 "expressions are applied one by one on each leaf's name. A valid expression could "
117 "be:\n"
118 " --map=MHillas\\.f/\n"
119 "which would remove all occurances of 'MHillas.f'. This option can be used more than "
120 "once. They are applied in sequence. A single match does not stop the sequence.\n"
121 "\n"
122 "Sometimes it might also be convenient to skip a leaf. This can be done with "
123 "the --ignore resource. If the given regular expresion yields a match, the "
124 "leaf will be ignored. Note that the regular expression works on the raw-name "
125 "of the leaf not the readily mapped SQL column names. Example:\n"
126 " --ignore=ThetaSq\\..*\n"
127 "will skip all leaved which start with 'ThetaSq.'. This option can be used"
128 "more than once.\n"
129 "\n"
130 "The data type of each column is kept as close as possible to the leaves' data "
131 "types. If for some reason this is not wanted, the data type of the SQL column "
132 "can be overwritten with --sql-type sql-column/sql-ytpe, for example:\n"
133 " --sql-type=FileId/UNSIGNED INT\n"
134 "while the first argument of the name of the SQL column to which the data type "
135 "should be applied. The second column is the basic SQL data type. The option can "
136 "be given more than once.\n"
137 "\n"
138 "Database interaction:\n"
139 "\n"
140 "To drop an existing table, --drop can be used.\n"
141 "\n"
142 "To create a table according to theSQL column names and data types, --create "
143 "can be used. The query used can be printed with --print-create even --create "
144 "has not been specified.\n"
145 "\n"
146 "To choose the columns which should become primary keys, use --primary, "
147 "for exmaple:\n"
148 " --primary=col1\n"
149 "To define more than one column as primary key, the option can be given more than "
150 "once. Note that the combination of these columns must be unique.\n"
151 "\n"
152 "All columns are created as NOT NULL as default. To force a database engine "
153 "and/or a storage format, use --engine and --rot-format.\n"
154 "\n"
155 "Usually, the INSERT query would fail if the PRIMARY key exists already. "
156 "This can be avoided using the 'ON DUPLICATE KEY' directive. With the "
157 "--duplicate,you can specify what should be updated in case of a duplicate key. "
158 "To keep the row untouched, you can just update the primary key "
159 "with the identical primary key, e.g. --duplicate='MyPrimary=VALUES(MyPrimary)'. "
160 "For more details, see the MySQL manual.\n"
161 "\n"
162 "For debugging purpose, or to just create or drop a table, the final insert "
163 "query can be skipped using --no-insert. Note that for performance reason, "
164 "all data is collected in memory and a single INSERT query is issued at the "
165 "end.\n"
166 "\n"
167 "Another possibility is to add the IGNORE keyword to the INSERT query by "
168 "--ignore-errors, which essentially ignores all errors and turns them into "
169 "warnings which are printed after the query succeeded.\n"
170 "\n"
171 "Using a higher verbosity level (-v), an overview of the written columns or all "
172 "processed leaves is printed depending on the verbosity level. The output looks "
173 "like the following\n"
174 " Leaf name [root data type] (SQL name)\n"
175 "for example\n"
176 " MTime.fTime.fMilliSec [Long64_t] (MilliSec)\n"
177 "which means that the leaf MTime.fTime.fMilliSec is detected to be a Long64_t "
178 "which is filled into a column called MilliSec. Leaves with non basic data types "
179 "are ignored automatically and are marked as (-n/a-). User ignored columnd "
180 "are marked as (-ignored-).\n"
181 "\n"
182 "If a query failed, the query is printed to stderr together with the error message. "
183 "For the main INSERT query, this is only true if the verbosity level is at least 2 "
184 "or the query has less than 80*25 bytes.\n"
185 "\n"
186 "In case of succes, 0 is returned, a value>0 otherwise.\n"
187 "\n"
188 "Usage: root2sql [options] -uri URI rootfile.root\n"
189 "\n"
190 ;
191 cout << endl;
192}
193
194enum BasicType_t
195{
196 kNone = 0,
197 kFloat,
198 kDouble,
199 kInt16,
200 kUInt16,
201 kInt32,
202 kUInt32,
203 kInt64,
204 kUInt64,
205};
206
207static const map<string, pair<BasicType_t, string>> ConvRoot =
208{
209 { "Float_t", { kFloat, "FLOAT" } },
210 { "Double_t", { kDouble, "DOUBLE" } },
211 { "ULong64_t", { kUInt64, "BIGINT UNSIGNED" } },
212 { "Long64_t", { kInt64, "BIGINT" } },
213 { "UInt_t", { kUInt32, "INT UNSIGNED" } },
214 { "Int_t", { kInt32, "INT" } },
215 { "UShort_t", { kUInt16, "SMALLINT UNSIGNED" } },
216 { "Short_t", { kInt16, "SMALLINT" } },
217};
218
219struct Container
220{
221 string branch; // branch name
222 string column; // column name
223 BasicType_t type;
224 void *ptr;
225
226 Container(const string &b, const string &c, const BasicType_t &t) : branch(b), column(c), type(t), ptr(0)
227 {
228 switch (t)
229 {
230 case kFloat: ptr = new Float_t; break;
231 case kDouble: ptr = new Double_t; break;
232 case kInt16: ptr = new Short_t; break;
233 case kUInt16: ptr = new UShort_t; break;
234 case kInt32: ptr = new Int_t; break;
235 case kUInt32: ptr = new UInt_t; break;
236 case kInt64: ptr = new Long64_t; break;
237 case kUInt64: ptr = new ULong64_t; break;
238 case kNone:
239 break;
240 }
241 }
242 ~Container()
243 {
244 //::operator delete(ptr); // It seems root is deleting it already
245 }
246
247 string fmt() const
248 {
249 ostringstream str;
250
251 switch (type)
252 {
253 case kFloat: str << setprecision(8) << *reinterpret_cast<Float_t*>(ptr); break;
254 case kDouble: str << setprecision(16) << *reinterpret_cast<Double_t*>(ptr); break;
255 case kInt16: str << *reinterpret_cast<Short_t*>(ptr); break;
256 case kUInt16: str << *reinterpret_cast<UShort_t*>(ptr); break;
257 case kInt32: str << *reinterpret_cast<Int_t*>(ptr); break;
258 case kUInt32: str << *reinterpret_cast<UInt_t*>(ptr); break;
259 case kInt64: str << *reinterpret_cast<Long64_t*>(ptr); break;
260 case kUInt64: str << *reinterpret_cast<ULong64_t*>(ptr); break;
261 case kNone:
262 break;
263 }
264
265 //if (str.str()=="nan" || str.str()=="-nan" || str.str()=="inf" || str.str()=="-inf")
266 // return "NULL";
267
268 return str.str();
269 }
270};
271
272void ErrorHandlerAll(Int_t level, Bool_t abort, const char *location, const char *msg)
273{
274 if (string(msg).substr(0,24)=="no dictionary for class ")
275 return;
276
277 DefaultErrorHandler(level, abort, location, msg);
278}
279
280int main(int argc, const char* argv[])
281{
282 Time start;
283
284 gROOT->SetBatch();
285 SetErrorHandler(ErrorHandlerAll);
286
287 Configuration conf(argv[0]);
288 conf.SetPrintUsage(PrintUsage);
289 SetupConfiguration(conf);
290
291 if (!conf.DoParse(argc, argv))
292 return 127;
293
294 // ----------------------------- Evaluate options --------------------------
295 const string uri = conf.Get<string>("uri");
296 const string file = conf.Get<string>("file");
297 const string tree = conf.Get<string>("tree");
298 const string table = conf.Get<string>("table").empty() ? tree : conf.Get<string>("table");
299
300 const uint16_t verbose = conf.Get<uint16_t>("verbose");
301 const int64_t first = conf.Get<int64_t>("first");
302 const int64_t max = conf.Get<int64_t>("max");
303
304 const bool force = conf.Get<bool>("force");
305 const bool drop = conf.Get<bool>("drop");
306 const bool create = conf.Get<bool>("create") || drop;
307 const bool noinsert = conf.Get<bool>("no-insert");
308 const bool dry_run = conf.Get<bool>("dry-run");
309
310 const string engine = conf.Get<string>("engine");
311 const string row_format = conf.Get<string>("row-format");
312
313 const vector<string> duplicate = conf.Vec<string>("duplicate");
314
315 const bool ignore_errors = conf.Get<bool>("ignore-errors");
316
317 const bool print_branches = conf.Get<bool>("print-branches");
318 const bool print_leaves = conf.Get<bool>("print-leaves");
319 const bool print_create = conf.Get<bool>("print-create");
320 const bool print_insert = conf.Get<bool>("print-insert");
321
322 const vector<Map> mymap = conf.Vec<Map>("map");
323 const vector<Map> sqltypes = conf.Vec<Map>("sql-type");
324 const vector<string> _ignore = conf.Vec<string>("ignore");
325 const vector<string> primary = conf.Vec<string>("primary");
326
327 // -------------------------------------------------------------------------
328
329 if (verbose>0)
330 cout << "\n-------------------------- Evaluating file -------------------------" << endl;
331
332 TFile f(file.c_str());
333 if (f.IsZombie())
334 {
335 cerr << "Could not open file " << file << endl;
336 return 1;
337 }
338
339 if (verbose>0)
340 cout << "File: " << file << endl;
341
342 TTree *T = 0;
343 f.GetObject(tree.c_str(), T);
344 if (!T)
345 {
346 cerr << "Could not open tree " << tree << endl;
347 return 2;
348 }
349
350 if (verbose>0)
351 cout << "Tree: " << tree << endl;
352
353 T->SetMakeClass(1);
354
355 TObjArray *branches = T->GetListOfBranches();
356 TObjArray *leaves = T->GetListOfLeaves();
357
358 if (print_branches)
359 {
360 cout << '\n';
361 branches->Print();
362 }
363
364 if (verbose>0)
365 cout << T->GetEntriesFast() << " events found." << endl;
366
367
368 if (verbose>0)
369 cout << branches->GetEntries() << " branches found." << endl;
370
371 if (print_leaves)
372 {
373 cout << '\n';
374 leaves->Print();
375 }
376 if (verbose>0)
377 cout << leaves->GetEntries() << " leaves found." << endl;
378
379 string query =
380 "CREATE TABLE IF NOT EXISTS `"+table+"`\n"
381 "(\n";
382
383 vector<Container> vec;
384
385
386 TIter Next(leaves);
387 TObject *o = 0;
388 while ((o=Next()))
389 {
390 TLeaf *L = T->GetLeaf(o->GetName());
391
392 if (verbose>2)
393 cout << '\n' << o->GetName() << " [" << L->GetTypeName() << "]";
394
395 string name = o->GetName();
396
397
398 bool found = false;
399 for (auto b=_ignore.cbegin(); b!=_ignore.cend(); b++)
400 {
401 if (boost::regex_match(name, boost::regex(*b)))
402 {
403 found = true;
404 if (verbose>2)
405 cout << " (-ignored-)";
406 break;
407 }
408 }
409 if (found)
410 continue;
411
412 const string tn = L->GetTypeName();
413
414 auto it = ConvRoot.find(tn);
415 if (it==ConvRoot.end())
416 {
417 if (verbose>2)
418 cout << " (-n/a-)";
419 continue;
420 }
421
422 if (verbose==2)
423 cout << '\n' << o->GetName() << " [" << L->GetTypeName() << "]";
424
425 for (auto m=mymap.cbegin(); m!=mymap.cend(); m++)
426 name = boost::regex_replace(name, boost::regex(m->first), m->second);
427
428 if (verbose>1)
429 cout << " (" << name << ")";
430
431 string sqltype = it->second.second;
432
433 for (auto m=sqltypes.cbegin(); m!=sqltypes.cend(); m++)
434 if (m->first==name)
435 sqltype = m->second;
436
437 if (!vec.empty())
438 query += ",\n";
439 query += " `"+name+"` "+sqltype+" NOT NULL COMMENT '"+o->GetName()+"'";
440
441 vec.emplace_back(o->GetName(), name, it->second.first);
442 T->SetBranchAddress(o->GetName(), vec.back().ptr);
443 }
444
445 if (verbose>1)
446 cout << "\n\n";
447 if (verbose>0)
448 cout << vec.size() << " leaves setup for reading." << endl;
449
450 UInt_t datatype = 0;
451 const bool has_datatype = T->SetBranchAddress("DataType.fVal", &datatype) >= 0;
452
453 // Setiing up branch status (must be after all SetBranchAddress)
454 T->SetBranchStatus("*", 0);
455 for (auto c=vec.cbegin(); c!=vec.cend(); c++)
456 T->SetBranchStatus(c->branch.c_str(), 1);
457
458 if (has_datatype)
459 {
460 T->SetBranchStatus("DataType.fVal", 1);
461 if (verbose>0)
462 cout << "Rows with DataType.fVal!=1 will be skipped." << endl;
463 }
464
465 // -------------------------------------------------------------------------
466 // Checking for database connection
467
468 Database connection(uri);
469
470 try
471 {
472 if (!force)
473 connection.connected();
474 }
475 catch (const exception &e)
476 {
477 cerr << "SQL connection failed: " << e.what() << '\n' << endl;
478 return 3;
479 }
480
481 // -------------------------------------------------------------------------
482
483 if (verbose>0)
484 cout << "\n--------------------------- Database Table -------------------------" << endl;
485
486 if (!primary.empty())
487 query += ",\n PRIMARY KEY USING BTREE (`"+boost::algorithm::join(primary, "`, `")+"`)\n";
488
489 query +=
490 ")\n"
491 "DEFAULT CHARSET=latin1 COLLATE=latin1_general_ci\n";
492 if (!engine.empty())
493 query += "ENGINE="+engine+"\n";
494 if (!row_format.empty())
495 query += "ROW_FORMAT="+row_format+"\n";
496 query += "COMMENT='created by "+conf.GetName()+"'\n";
497
498 // FIXME: Can we omit the catching to be able to print the
499 // query 'autmatically'?
500 try
501 {
502 if (drop)
503 {
504 // => Simple result
505 if (!dry_run)
506 connection.query("DROP TABLE `"+table+"`").execute();
507 if (verbose>0)
508 cout << "Table `" << table << "` dropped." << endl;
509 }
510 }
511 catch (const exception &e)
512 {
513 cerr << "DROP TABLE `" << table << "`\n\n";
514 cerr << "SQL query failed:\n" << e.what() << '\n' << endl;
515 return 4;
516 }
517
518 try
519 {
520 if (create && !dry_run)
521 connection.query(query).execute();
522 }
523 catch (const exception &e)
524 {
525 cerr << query << "\n\n";
526 cerr << "SQL query failed:\n" << e.what() << '\n' << endl;
527 return 5;
528 }
529
530 if (print_create)
531 cout << query << endl;
532
533 if (create && verbose>0)
534 cout << "Table `" << table << "` created." << endl;
535
536 // -------------------------------------------------------------------------
537
538 if (verbose>0)
539 cout << "\n---------------------------- Reading file --------------------------" << endl;
540
541 //query = update ? "UPDATE" : "INSERT";
542 query = "INSERT ";
543 if (ignore_errors)
544 query += "IGNORE ";
545 query += "`"+table+"`\n"
546 "(\n";
547
548 for (auto c=vec.cbegin(); c!=vec.cend(); c++)
549 {
550 if (c!=vec.cbegin())
551 query += ",\n";
552 query += " `"+c->column+"`";
553 }
554
555 query +=
556 "\n)\n"
557 "VALUES\n";
558
559 size_t count = 0;
560
561 const size_t num = max>0 && (max-first)<T->GetEntriesFast() ? (max-first) : T->GetEntriesFast();
562 for (size_t j=first; j<num; j++)
563 {
564 T->GetEntry(j);
565 if (has_datatype && datatype!=1)
566 continue;
567
568 if (count>0)
569 query += ",\n";
570
571 query += "(\n";
572
573 for (auto c=vec.cbegin(); c!=vec.cend(); c++)
574 {
575 if (c!=vec.cbegin())
576 query += ",\n";
577
578 query += " "+c->fmt();
579
580 if (print_insert)
581 query += " /* "+c->column+" -> "+c->branch+" */";
582 }
583 query += "\n)";
584
585 count ++;
586 }
587
588 if (!duplicate.empty())
589 query += "\nON DUPLICATE KEY UPDATE\n " + boost::join(duplicate, ",\n ");
590
591 if (verbose>0)
592 cout << count << " out of " << num << " row(s) read from file [N=" << first << ".." << num-1 << "]." << endl;
593
594 if (count==0)
595 {
596 if (verbose>0)
597 cout << "Total execution time: " << Time().UnixTime()-start.UnixTime() << "s\n" << endl;
598 return 0;
599 }
600
601 // -------------------------------------------------------------------------
602
603 if (verbose>0)
604 {
605 cout << "\n--------------------------- Inserting data -------------------------" << endl;
606 cout << "Sending INSERT query (" << query.length() << " bytes)" << endl;
607 }
608
609 try
610 {
611 if (!noinsert && !dry_run)
612 {
613 auto q = connection.query(query);
614 q.execute();
615 cout << q.info() << '\n' << endl;
616 }
617 else
618 cout << "Insert query skipped!" << endl;
619
620 if (print_insert)
621 cout << query << endl;
622 }
623 catch (const exception &e)
624 {
625 if (verbose>1 || query.length()<80*25)
626 cerr << query << "\n\n";
627 cerr << "SQL query failed (" << query.length() << " bytes):\n" << e.what() << '\n' << endl;
628 return 6;
629 }
630
631 if (verbose>0)
632 {
633 cout << "Total execution time: " << Time().UnixTime()-start.UnixTime() << "s\n" << endl;
634
635 try
636 {
637 const auto resw =
638 connection.query("SHOW WARNINGS").store();
639
640 for (size_t i=0; i<resw.num_rows(); i++)
641 {
642 const mysqlpp::Row &roww = resw[i];
643
644 cout << roww["Level"] << '[' << roww["Code"] << "]: ";
645 cout << roww["Message"] << '\n';
646 }
647 cout << endl;
648
649 }
650 catch (const exception &e)
651 {
652 cerr << "\nSHOW WARNINGS\n\n";
653 cerr << "SQL query failed:\n" << e.what() << '\n' <<endl;
654 return 7;
655 }
656 }
657
658 return 0;
659}
Note: See TracBrowser for help on using the repository browser.