source: trunk/FACT++/src/root2sql.cc@ 19104

Last change on this file since 19104 was 19104, checked in by tbretz, 6 years ago
Some cosmetics to the output.
File size: 22.3 KB
Line 
1#include <boost/algorithm/string/join.hpp>
2#include <boost/regex.hpp>
3
4#include "Database.h"
5
6#include "Time.h"
7#include "Configuration.h"
8
9#include <TROOT.h>
10#include <TFile.h>
11#include <TTree.h>
12#include <TLeaf.h>
13#include <TError.h>
14
15using namespace std;
16
17// ------------------------------------------------------------------------
18
19struct Map : pair<string, string>
20{
21 Map() { }
22};
23
24std::istream &operator>>(std::istream &in, Map &m)
25{
26 const istreambuf_iterator<char> eos;
27 string txt(istreambuf_iterator<char>(in), eos);
28
29 const boost::regex expr("((.*)[^\\\\])/(.*)");
30 boost::smatch match;
31 if (!boost::regex_match(txt, match, expr))
32 throw runtime_error("Could not evaluate map argument: "+txt);
33
34 m.first = match[1].str();
35 m.second = match[3].str();
36
37 return in;
38}
39
40
41void SetupConfiguration(Configuration &conf)
42{
43 po::options_description control("Root to SQL");
44 control.add_options()
45 ("uri,u", var<string>()
46#if BOOST_VERSION >= 104200
47 ->required()
48#endif
49 , "Database link as in\n\tuser:password@server[:port]/database.")
50 ("file", var<string>(""), "The root file to read from")
51 ("force", po_switch(), "Force processing even if there is no database connection")
52 ("create", po_switch(), "Create the database if not existing")
53 ("drop", po_switch(), "Drop the table (implies create)")
54 ("tree,t", var<string>("Events"), "Name of the root tree to convert")
55 ("table", var<string>(""), "Name of the table to use (default is the tree name)")
56 ("map", vars<Map>(), "A regular expression which is applied to the leaf name befoee it is used as SQL column name)")
57 ("sql-type", vars<Map>(), "Allows to overwrite the calculated SQL type for a given column e.g. 'sql-column-name/UNSIGNED IN'")
58 ("ignore", vars<string>(), "Ignore the given leaf, if the given regular expression matches")
59 ("primary", vars<string>(), "List of columns to be used as primary keys during table creation (in connection with --create)")
60 ("first", var<int64_t>(int64_t(0)), "First event to start with (default: 0), mainly for test purpose")
61 ("max", var<int64_t>(int64_t(0)), "Maximum number of events to process (0: all), mainly for test purpose")
62 ("engine", var<string>("InnoDB"), "Database engine to be used when a new table is created")
63 ("duplicate", var<string>(""), "Specifies an assignment_list for an 'ON DUPLICATE KEY UPDATE' expression")
64 ("ignore-errors", po_switch(), "Adds the IGNORE keyword to the INSERT query (turns errors into warnings, ignores rows with errors)")
65 ;
66
67 po::options_description debug("Debug options");
68 debug.add_options()
69 ("no-insert", po_switch(), "Does not insert any data into the table")
70 ("dry-run", po_switch(), "Skip any query which changes the databse (might result in consecutive failures)")
71 ("print-branches", po_switch(), "Print the branches found in the tree")
72 ("print-leaves", po_switch(), "Print the leaves found in the tree (this is what is processed)")
73 ("print-insert", po_switch(), "Print the INSERT query (note that it contains all data)")
74 ("print-create", po_switch(), "Print the CREATE query")
75 ("verbose,v", var<uint16_t>(1), "Verbosity (0: quiet, 1: default, 2: more, 3, ...)")
76 ;
77
78 po::positional_options_description p;
79 p.add("file", 1); // The 1st positional options (n=1)
80
81 conf.AddOptions(control);
82 conf.AddOptions(debug);
83 conf.SetArgumentPositions(p);
84}
85
86void PrintUsage()
87{
88 cout <<
89 "root2sql - Fills the data from a root file into a database\n"
90 "\n"
91 "For convenience, this documentation uses the extended version of the options, "
92 "refer to the output below to get the abbreviations.\n"
93 "\n"
94 "This is a general purpose tool to fill the contents of a root file into a database "
95 "as long as this is technically possible and makes sense. Note that root can even "
96 "write complex data like a TH1F into a database, this is not the purpose of this "
97 "program.\n"
98 "\n"
99 "Each root tree has branches and leaves (the basic data types). These leaves can "
100 "be read independently of the classes which were used to write the root file. "
101 "The default tree to read from is 'Events' but the name can be overwritten "
102 "using --tree. The default table name to fill the data into is identical to "
103 "the tree name. It can be overwritten using --table.\n"
104 "\n"
105 "The name of each column to which data is filled from a leave is obtained from "
106 "the leaves' names. The leave names can be checked using --print-leaves. "
107 "A --print-branches exists for convenience to print only the high-level branches. "
108 "Sometimes these names might be quite unconvenient like MTime.fTime.fMilliSec or "
109 "just MHillas.fWidth. To allow to simplify column names, regular expressions "
110 "(using boost's regex) can be defined to change the names. Note that these regular "
111 "expressions are applied one by one on each leaf's name. A valid expression could "
112 "be:\n"
113 " --map=MHillas\\.f/\n"
114 "which would remove all occurances of 'MHillas.f'. This option can be used more than "
115 "once. They are applied in sequence. A single match does not stop the sequence.\n"
116 "\n"
117 "Sometimes it might also be convenient to skip a leaf. This can be done with "
118 "the --ignore resource. If the given regular expresion yields a match, the "
119 "leaf will be ignored. Note that the regular expression works on the raw-name "
120 "of the leaf not the readily mapped SQL column names. Example:\n"
121 " --ignore=ThetaSq\\..*\n"
122 "will skip all leaved which start with 'ThetaSq.'. This option can be used"
123 "more than once.\n"
124 "\n"
125 "The data type of each column is kept as close as possible to the leaves' data "
126 "types. If for some reason this is not wanted, the data type of the SQL column "
127 "can be overwritten with --sql-type sql-column/sql-ytpe, for example:\n"
128 " --sql-type=FileId/UNSIGNED INT\n"
129 "while the first argument of the name of the SQL column to which the data type "
130 "should be applied. The second column is the basic SQL data type. The option can "
131 "be given more than once.\n"
132 "\n"
133 "Database interaction:\n"
134 "\n"
135 "To drop an existing table, --drop can be used.\n"
136 "\n"
137 "To create a table according to theSQL column names and data types, --create "
138 "can be used. The query used can be printed with --print-create even --create "
139 "has not been specified.\n"
140 "\n"
141 "To choose the columns which should become primary keys, use --primary, "
142 "for exmaple:\n"
143 " --primary=col1\n"
144 "To define more than one column as primary key, the option can be given more than "
145 "once. Note that the combination of these columns must be unique.\n"
146 "\n"
147 "All columns are created as NOT NULL as default and the table is created as "
148 "InnoDB (default).\n"
149 "\n"
150 "Usually, the INSERT query would fail if the PRIMARY key exists already. "
151 "This can be avoided using the 'ON DUPLICATE KEY' directive. With the "
152 "--duplicate,you can specify what should be updated in case of a duplicate key. "
153 "To keep the row untouched, you can just update the primary key "
154 "with the identical primary key, e.g. --duplicate='MyPrimary=VALUES(MyPrimary)'. "
155 "For more details, see the MySQL manual.\n"
156 "\n"
157 "For debugging purpose, or to just create or drop a table, the final insert "
158 "query can be skipped using --no-insert. Note that for performance reason, "
159 "all data is collected in memory and a single INSERT query is issued at the "
160 "end.\n"
161 "\n"
162 "Another possibility is to add the IGNORE keyword to the INSERT query by "
163 "--ignore-errors, which essentially ignores all errors and turns them into "
164 "warnings which are printed after the query succeeded.\n"
165 "\n"
166 "Using a higher verbosity level (-v), an overview of the written columns or all "
167 "processed leaves is printed depending on the verbosity level. The output looks "
168 "like the following\n"
169 " Leaf name [root data type] (SQL name)\n"
170 "for example\n"
171 " MTime.fTime.fMilliSec [Long64_t] (MilliSec)\n"
172 "which means that the leaf MTime.fTime.fMilliSec is detected to be a Long64_t "
173 "which is filled into a column called MilliSec. Leaves with non basic data types "
174 "are ignored automatically and are marked as (-n/a-). User ignored columnd "
175 "are marked as (-ignored-).\n"
176 "\n"
177 "If a query failed, the query is printed to stderr together with the error message. "
178 "For the main INSERT query, this is only true if the verbosity level is at least 2 "
179 "or the query has less than 80*25 bytes.\n"
180 "\n"
181 "In case of succes, 0 is returned, a value>0 otherwise.\n"
182 "\n"
183 "Usage: root2sql [options] rootfile.root\n"
184 "\n"
185 ;
186 cout << endl;
187}
188
189enum BasicType_t
190{
191 kNone = 0,
192 kFloat,
193 kDouble,
194 kInt16,
195 kUInt16,
196 kInt32,
197 kUInt32,
198 kInt64,
199 kUInt64,
200};
201
202static const map<string, pair<BasicType_t, string>> ConvRoot =
203{
204 { "Float_t", { kFloat, "FLOAT" } },
205 { "Double_t", { kDouble, "DOUBLE" } },
206 { "ULong64_t", { kUInt64, "BIGINT UNSIGNED" } },
207 { "Long64_t", { kInt64, "BIGINT" } },
208 { "UInt_t", { kUInt32, "INT UNSIGNED" } },
209 { "Int_t", { kInt32, "INT" } },
210 { "UShort_t", { kUInt16, "SMALLINT UNSIGNED" } },
211 { "Short_t", { kInt16, "SMALLINT" } },
212};
213
214struct Container
215{
216 string branch; // branch name
217 string column; // column name
218 BasicType_t type;
219 void *ptr;
220
221 Container(const string &b, const string &c, const BasicType_t &t) : branch(b), column(c), type(t), ptr(0)
222 {
223 switch (t)
224 {
225 case kFloat: ptr = new Float_t; break;
226 case kDouble: ptr = new Double_t; break;
227 case kInt16: ptr = new Short_t; break;
228 case kUInt16: ptr = new UShort_t; break;
229 case kInt32: ptr = new Int_t; break;
230 case kUInt32: ptr = new UInt_t; break;
231 case kInt64: ptr = new Long64_t; break;
232 case kUInt64: ptr = new ULong64_t; break;
233 case kNone:
234 break;
235 }
236 }
237 ~Container()
238 {
239 //::operator delete(ptr); // It seems root is deleting it already
240 }
241
242 string fmt() const
243 {
244 ostringstream str;
245
246 switch (type)
247 {
248 case kFloat: str << setprecision(8) << *reinterpret_cast<Float_t*>(ptr); break;
249 case kDouble: str << setprecision(16) << *reinterpret_cast<Double_t*>(ptr); break;
250 case kInt16: str << *reinterpret_cast<Short_t*>(ptr); break;
251 case kUInt16: str << *reinterpret_cast<UShort_t*>(ptr); break;
252 case kInt32: str << *reinterpret_cast<Int_t*>(ptr); break;
253 case kUInt32: str << *reinterpret_cast<UInt_t*>(ptr); break;
254 case kInt64: str << *reinterpret_cast<Long64_t*>(ptr); break;
255 case kUInt64: str << *reinterpret_cast<ULong64_t*>(ptr); break;
256 case kNone:
257 break;
258 }
259
260 //if (str.str()=="nan" || str.str()=="-nan" || str.str()=="inf" || str.str()=="-inf")
261 // return "NULL";
262
263 return str.str();
264 }
265};
266
267void ErrorHandlerAll(Int_t level, Bool_t abort, const char *location, const char *msg)
268{
269 if (string(msg).substr(0,24)=="no dictionary for class ")
270 return;
271
272 DefaultErrorHandler(level, abort, location, msg);
273}
274
275int main(int argc, const char* argv[])
276{
277 Time start;
278
279 gROOT->SetBatch();
280 SetErrorHandler(ErrorHandlerAll);
281
282 Configuration conf(argv[0]);
283 conf.SetPrintUsage(PrintUsage);
284 SetupConfiguration(conf);
285
286 if (!conf.DoParse(argc, argv))
287 return 127;
288
289 // ----------------------------- Evaluate options --------------------------
290 const string uri = conf.Get<string>("uri");
291 const string file = conf.Get<string>("file");
292 const string tree = conf.Get<string>("tree");
293 const string table = conf.Get<string>("table").empty() ? tree : conf.Get<string>("table");
294
295 const uint16_t verbose = conf.Get<uint16_t>("verbose");
296 const int64_t first = conf.Get<int64_t>("first");
297 const int64_t max = conf.Get<int64_t>("max");
298
299 const bool force = conf.Get<bool>("force");
300 const bool drop = conf.Get<bool>("drop");
301 const bool create = conf.Get<bool>("create") || drop;
302 const bool noinsert = conf.Get<bool>("no-insert");
303 const bool dry_run = conf.Get<bool>("dry-run");
304
305 const string engine = conf.Get<string>("engine");
306 const string duplicate = conf.Get<string>("duplicate");
307
308 const bool ignore_errors = conf.Get<bool>("ignore-errors");
309
310 const bool print_branches = conf.Get<bool>("print-branches");
311 const bool print_leaves = conf.Get<bool>("print-leaves");
312 const bool print_create = conf.Get<bool>("print-create");
313 const bool print_insert = conf.Get<bool>("print-insert");
314
315 const vector<Map> mymap = conf.Vec<Map>("map");
316 const vector<Map> sqltypes = conf.Vec<Map>("sql-type");
317 const vector<string> _ignore = conf.Vec<string>("ignore");
318 const vector<string> primary = conf.Vec<string>("primary");
319
320 // -------------------------------------------------------------------------
321
322 if (verbose>0)
323 cout << "\n-------------------------- Evaluating file -------------------------" << endl;
324
325 TFile f(file.c_str());
326 if (f.IsZombie())
327 {
328 cerr << "Could not open file " << file << endl;
329 return 1;
330 }
331
332 if (verbose>0)
333 cout << "File: " << file << endl;
334
335 TTree *T = 0;
336 f.GetObject(tree.c_str(), T);
337 if (!T)
338 {
339 cerr << "Could not open tree " << tree << endl;
340 return 2;
341 }
342
343 if (verbose>0)
344 cout << "Tree: " << tree << endl;
345
346 T->SetMakeClass(1);
347
348 TObjArray *branches = T->GetListOfBranches();
349 TObjArray *leaves = T->GetListOfLeaves();
350
351 if (print_branches)
352 {
353 cout << '\n';
354 branches->Print();
355 }
356
357 if (verbose>0)
358 cout << T->GetEntriesFast() << " events found." << endl;
359
360
361 if (verbose>0)
362 cout << branches->GetEntries() << " branches found." << endl;
363
364 if (print_leaves)
365 {
366 cout << '\n';
367 leaves->Print();
368 }
369 if (verbose>0)
370 cout << leaves->GetEntries() << " leaves found." << endl;
371
372 string query =
373 "CREATE TABLE IF NOT EXISTS `"+table+"`\n"
374 "(\n";
375
376 vector<Container> vec;
377
378
379 TIter Next(leaves);
380 TObject *o = 0;
381 while ((o=Next()))
382 {
383 TLeaf *L = T->GetLeaf(o->GetName());
384
385 if (verbose>2)
386 cout << '\n' << o->GetName() << " [" << L->GetTypeName() << "]";
387
388 string name = o->GetName();
389
390
391 bool found = false;
392 for (auto b=_ignore.cbegin(); b!=_ignore.cend(); b++)
393 {
394 if (boost::regex_match(name, boost::regex(*b)))
395 {
396 found = true;
397 if (verbose>2)
398 cout << " (-ignored-)";
399 break;
400 }
401 }
402 if (found)
403 continue;
404
405 const string tn = L->GetTypeName();
406
407 auto it = ConvRoot.find(tn);
408 if (it==ConvRoot.end())
409 {
410 if (verbose>2)
411 cout << " (-n/a-)";
412 continue;
413 }
414
415 if (verbose==2)
416 cout << '\n' << o->GetName() << " [" << L->GetTypeName() << "]";
417
418 for (auto m=mymap.cbegin(); m!=mymap.cend(); m++)
419 name = boost::regex_replace(name, boost::regex(m->first), m->second);
420
421 if (verbose>1)
422 cout << " (" << name << ")";
423
424 string sqltype = it->second.second;
425
426 for (auto m=sqltypes.cbegin(); m!=sqltypes.cend(); m++)
427 if (m->first==name)
428 sqltype = m->second;
429
430 if (!vec.empty())
431 query += ",\n";
432 query += " `"+name+"` "+sqltype+" NOT NULL COMMENT '"+o->GetName()+"'";
433
434 vec.emplace_back(o->GetName(), name, it->second.first);
435 T->SetBranchAddress(o->GetName(), vec.back().ptr);
436 }
437
438 if (verbose>1)
439 cout << "\n\n";
440 if (verbose>0)
441 cout << vec.size() << " leaves setup for reading." << endl;
442
443 UInt_t datatype = 0;
444 const bool has_datatype = T->SetBranchAddress("DataType.fVal", &datatype) >= 0;
445
446 // Setiing up branch status (must be after all SetBranchAddress)
447 T->SetBranchStatus("*", 0);
448 for (auto c=vec.cbegin(); c!=vec.cend(); c++)
449 T->SetBranchStatus(c->branch.c_str(), 1);
450
451 if (has_datatype)
452 {
453 T->SetBranchStatus("DataType.fVal", 1);
454 if (verbose>0)
455 cout << "Rows with DataType.fVal!=1 will be skipped." << endl;
456 }
457
458 // -------------------------------------------------------------------------
459 // Checking for database connection
460
461 Database connection(uri);
462
463 try
464 {
465 if (!force)
466 connection.connected();
467 }
468 catch (const exception &e)
469 {
470 cerr << "SQL connection failed: " << e.what() << endl;
471 return 3;
472 }
473
474 // -------------------------------------------------------------------------
475
476 if (verbose>0)
477 cout << "\n--------------------------- Database Table -------------------------" << endl;
478
479 if (!primary.empty())
480 query += ",\n PRIMARY KEY USING BTREE (`"+boost::algorithm::join(primary, "`, `")+"`)\n";
481
482 query +=
483 ")\n"
484 "DEFAULT CHARSET=latin1 COLLATE=latin1_general_ci\n"
485 "ENGINE="+engine+"\n"
486 "COMMENT='created by "+conf.GetName()+"'\n";
487
488 // FIXME: Can we omit the catching to be able to print the
489 // query 'autmatically'?
490 try
491 {
492 if (drop)
493 {
494 // => Simple result
495 if (!dry_run)
496 connection.query("DROP TABLE `"+table+"`").execute();
497 if (verbose>0)
498 cout << "Table `" << table << "` dropped." << endl;
499 }
500 }
501 catch (const exception &e)
502 {
503 cerr << "DROP TABLE `" << table << "`\n\n";
504 cerr << "SQL query failed:\n" << e.what() << endl;
505 return 4;
506 }
507
508 try
509 {
510 if (create && !dry_run)
511 connection.query(query).execute();
512 }
513 catch (const exception &e)
514 {
515 cerr << query << "\n\n";
516 cerr << "SQL query failed:\n" << e.what() << endl;
517 return 5;
518 }
519
520 if (print_create)
521 cout << query << endl;
522
523 if (create && verbose>0)
524 cout << "Table `" << table << "` created." << endl;
525
526 // -------------------------------------------------------------------------
527
528 if (verbose>0)
529 cout << "\n---------------------------- Reading file --------------------------" << endl;
530
531 //query = update ? "UPDATE" : "INSERT";
532 query = "INSERT ";
533 if (ignore_errors)
534 query += "IGNORE ";
535 query += "`"+table+"`\n"
536 "(\n";
537
538 for (auto c=vec.cbegin(); c!=vec.cend(); c++)
539 {
540 if (c!=vec.cbegin())
541 query += ",\n";
542 query += " `"+c->column+"`";
543 }
544
545 query +=
546 "\n)\n"
547 "VALUES\n";
548
549 size_t count = 0;
550
551 const size_t num = max>0 && (max-first)<T->GetEntriesFast() ? (max-first) : T->GetEntriesFast();
552 for (size_t j=first; j<num; j++)
553 {
554 T->GetEntry(j);
555 if (has_datatype && datatype!=1)
556 continue;
557
558 if (count>0)
559 query += ",\n";
560
561 query += "(\n";
562
563 for (auto c=vec.cbegin(); c!=vec.cend(); c++)
564 {
565 if (c!=vec.cbegin())
566 query += ",\n";
567
568 query += " "+c->fmt();
569
570 if (print_insert)
571 query += " /* "+c->column+" -> "+c->branch+" */";
572 }
573 query += "\n)";
574
575 count ++;
576 }
577
578 if (!duplicate.empty())
579 query += "\nON DUPLICATE KEY UPDATE\n " + duplicate;
580
581 if (verbose>0)
582 cout << count << " out of " << num << " row(s) read from file [N=" << first << ".." << num-1 << "]." << endl;
583
584 if (count==0)
585 {
586 if (verbose>0)
587 cout << "Total execution time: " << Time().UnixTime()-start.UnixTime() << "s\n" << endl;
588 return 0;
589 }
590
591 // -------------------------------------------------------------------------
592
593 if (verbose>0)
594 {
595 cout << "\n--------------------------- Inserting data -------------------------" << endl;
596 cout << "Sending INSERT query (" << query.length() << " bytes)" << endl;
597 }
598
599 try
600 {
601 if (!noinsert && !dry_run)
602 {
603 auto q = connection.query(query);
604 q.execute();
605 cout << q.info() << '\n' << endl;
606 }
607 else
608 cout << "Insert query skipped!" << endl;
609
610 if (print_insert)
611 cout << query << endl;
612 }
613 catch (const exception &e)
614 {
615 if (verbose>1 || query.length()<80*25)
616 cerr << query << "\n\n";
617 cerr << "SQL query failed (" << query.length() << " bytes):\n" << e.what() << endl;
618 return 6;
619 }
620
621 if (verbose>0)
622 {
623 cout << "Total execution time: " << Time().UnixTime()-start.UnixTime() << "s\n" << endl;
624
625 try
626 {
627 const auto resw =
628 connection.query("SHOW WARNINGS").store();
629
630 for (size_t i=0; i<resw.num_rows(); i++)
631 {
632 const mysqlpp::Row &roww = resw[i];
633
634 cout << roww["Level"] << '[' << roww["Code"] << "]: ";
635 cout << roww["Message"] << '\n';
636 }
637 cout << endl;
638
639 }
640 catch (const exception &e)
641 {
642 cerr << "\nSHOW WARNINGS\n\n";
643 cerr << "SQL query failed:\n" << e.what() << '\n' <<endl;
644 return 7;
645 }
646 }
647
648 return 0;
649}
Note: See TracBrowser for help on using the repository browser.