source: trunk/FACT++/src/root2sql.cc@ 19126

Last change on this file since 19126 was 19126, checked in by tbretz, 6 years ago
Implemented the possibility to print basic connection informations.
File size: 23.7 KB
Line 
1#include <boost/algorithm/string/join.hpp>
2#include <boost/regex.hpp>
3
4#include "Database.h"
5
6#include "Time.h"
7#include "Configuration.h"
8
9#include <TROOT.h>
10#include <TFile.h>
11#include <TTree.h>
12#include <TLeaf.h>
13#include <TError.h>
14
15using namespace std;
16
17// ------------------------------------------------------------------------
18
19struct Map : pair<string, string>
20{
21 Map() { }
22};
23
24std::istream &operator>>(std::istream &in, Map &m)
25{
26 const istreambuf_iterator<char> eos;
27 string txt(istreambuf_iterator<char>(in), eos);
28
29 const boost::regex expr("((.*)[^\\\\])/(.*)");
30 boost::smatch match;
31 if (!boost::regex_match(txt, match, expr))
32 throw runtime_error("Could not evaluate map argument: "+txt);
33
34 m.first = match[1].str();
35 m.second = match[3].str();
36
37 return in;
38}
39
40
41void SetupConfiguration(Configuration &conf)
42{
43 po::options_description control("Root to SQL");
44 control.add_options()
45 ("uri,u", var<string>()
46#if BOOST_VERSION >= 104200
47 ->required()
48#endif
49 , "Database link as in\n\tuser:password@server[:port]/database[/comp].")
50 ("file", var<string>("")
51#if BOOST_VERSION >= 104200
52 ->required()
53#endif
54 , "The root file to read from")
55 ("force", po_switch(), "Force processing even if there is no database connection")
56 ("create", po_switch(), "Create the database if not existing")
57 ("drop", po_switch(), "Drop the table (implies create)")
58 ("tree,t", var<string>("Events"), "Name of the root tree to convert")
59 ("table", var<string>(""), "Name of the table to use (default is the tree name)")
60 ("map", vars<Map>(), "A regular expression which is applied to the leaf name befoee it is used as SQL column name)")
61 ("sql-type", vars<Map>(), "Allows to overwrite the calculated SQL type for a given column e.g. 'sql-column-name/UNSIGNED IN'")
62 ("ignore", vars<string>(), "Ignore the given leaf, if the given regular expression matches")
63 ("primary", vars<string>(), "List of columns to be used as primary keys during table creation (in connection with --create)")
64 ("first", var<int64_t>(int64_t(0)), "First event to start with (default: 0), mainly for test purpose")
65 ("max", var<int64_t>(int64_t(0)), "Maximum number of events to process (0: all), mainly for test purpose")
66 ("engine", var<string>(""), "Database engine to be used when a new table is created")
67 ("row-format", var<string>(""), "Defines the ROW_FORMAT keyword for table creation")
68 ("duplicate", vars<string>(""), "Specifies an assignment_list for an 'ON DUPLICATE KEY UPDATE' expression")
69 ("ignore-errors", po_switch(), "Adds the IGNORE keyword to the INSERT query (turns errors into warnings, ignores rows with errors)")
70 ;
71
72 po::options_description debug("Debug options");
73 debug.add_options()
74 ("no-insert", po_switch(), "Does not insert any data into the table")
75 ("dry-run", po_switch(), "Skip any query which changes the databse (might result in consecutive failures)")
76 ("print-connection", po_switch(), "Print database connection information")
77 ("print-branches", po_switch(), "Print the branches found in the tree")
78 ("print-leaves", po_switch(), "Print the leaves found in the tree (this is what is processed)")
79 ("print-insert", po_switch(), "Print the INSERT query (note that it contains all data)")
80 ("print-create", po_switch(), "Print the CREATE query")
81 ("verbose,v", var<uint16_t>(1), "Verbosity (0: quiet, 1: default, 2: more, 3, ...)")
82 ;
83
84 po::positional_options_description p;
85 p.add("file", 1); // The 1st positional options (n=1)
86
87 conf.AddOptions(control);
88 conf.AddOptions(debug);
89 conf.SetArgumentPositions(p);
90}
91
92void PrintUsage()
93{
94 cout <<
95 "root2sql - Fills the data from a root file into a database\n"
96 "\n"
97 "For convenience, this documentation uses the extended version of the options, "
98 "refer to the output below to get the abbreviations.\n"
99 "\n"
100 "This is a general purpose tool to fill the contents of a root file into a database "
101 "as long as this is technically possible and makes sense. Note that root can even "
102 "write complex data like a TH1F into a database, this is not the purpose of this "
103 "program.\n"
104 "\n"
105 "Each root tree has branches and leaves (the basic data types). These leaves can "
106 "be read independently of the classes which were used to write the root file. "
107 "The default tree to read from is 'Events' but the name can be overwritten "
108 "using --tree. The default table name to fill the data into is identical to "
109 "the tree name. It can be overwritten using --table.\n"
110 "\n"
111 "The name of each column to which data is filled from a leave is obtained from "
112 "the leaves' names. The leave names can be checked using --print-leaves. "
113 "A --print-branches exists for convenience to print only the high-level branches. "
114 "Sometimes these names might be quite unconvenient like MTime.fTime.fMilliSec or "
115 "just MHillas.fWidth. To allow to simplify column names, regular expressions "
116 "(using boost's regex) can be defined to change the names. Note that these regular "
117 "expressions are applied one by one on each leaf's name. A valid expression could "
118 "be:\n"
119 " --map=MHillas\\.f/\n"
120 "which would remove all occurances of 'MHillas.f'. This option can be used more than "
121 "once. They are applied in sequence. A single match does not stop the sequence.\n"
122 "\n"
123 "Sometimes it might also be convenient to skip a leaf. This can be done with "
124 "the --ignore resource. If the given regular expresion yields a match, the "
125 "leaf will be ignored. Note that the regular expression works on the raw-name "
126 "of the leaf not the readily mapped SQL column names. Example:\n"
127 " --ignore=ThetaSq\\..*\n"
128 "will skip all leaved which start with 'ThetaSq.'. This option can be used"
129 "more than once.\n"
130 "\n"
131 "The data type of each column is kept as close as possible to the leaves' data "
132 "types. If for some reason this is not wanted, the data type of the SQL column "
133 "can be overwritten with --sql-type sql-column/sql-ytpe, for example:\n"
134 " --sql-type=FileId/UNSIGNED INT\n"
135 "while the first argument of the name of the SQL column to which the data type "
136 "should be applied. The second column is the basic SQL data type. The option can "
137 "be given more than once.\n"
138 "\n"
139 "Database interaction:\n"
140 "\n"
141 "To drop an existing table, --drop can be used.\n"
142 "\n"
143 "To create a table according to theSQL column names and data types, --create "
144 "can be used. The query used can be printed with --print-create even --create "
145 "has not been specified.\n"
146 "\n"
147 "To choose the columns which should become primary keys, use --primary, "
148 "for exmaple:\n"
149 " --primary=col1\n"
150 "To define more than one column as primary key, the option can be given more than "
151 "once. Note that the combination of these columns must be unique.\n"
152 "\n"
153 "All columns are created as NOT NULL as default. To force a database engine "
154 "and/or a storage format, use --engine and --rot-format.\n"
155 "\n"
156 "Usually, the INSERT query would fail if the PRIMARY key exists already. "
157 "This can be avoided using the 'ON DUPLICATE KEY UPDATE' directive. With the "
158 "--duplicate, you can specify what should be updated in case of a duplicate key. "
159 "To keep the row untouched, you can just update the primary key "
160 "with the identical primary key, e.g. --duplicate='MyPrimary=VALUES(MyPrimary)'. "
161 "The --duplicate resource can be specified more than once to add more expressions "
162 "to the assignment_list. For more details, see the MySQL manual.\n"
163 "\n"
164 "For debugging purpose, or to just create or drop a table, the final insert "
165 "query can be skipped using --no-insert. Note that for performance reason, "
166 "all data is collected in memory and a single INSERT query is issued at the "
167 "end.\n"
168 "\n"
169 "Another possibility is to add the IGNORE keyword to the INSERT query by "
170 "--ignore-errors, which essentially ignores all errors and turns them into "
171 "warnings which are printed after the query succeeded.\n"
172 "\n"
173 "Using a higher verbosity level (-v), an overview of the written columns or all "
174 "processed leaves is printed depending on the verbosity level. The output looks "
175 "like the following\n"
176 " Leaf name [root data type] (SQL name)\n"
177 "for example\n"
178 " MTime.fTime.fMilliSec [Long64_t] (MilliSec)\n"
179 "which means that the leaf MTime.fTime.fMilliSec is detected to be a Long64_t "
180 "which is filled into a column called MilliSec. Leaves with non basic data types "
181 "are ignored automatically and are marked as (-n/a-). User ignored columnd "
182 "are marked as (-ignored-).\n"
183 "\n"
184 "If a query failed, the query is printed to stderr together with the error message. "
185 "For the main INSERT query, this is only true if the verbosity level is at least 2 "
186 "or the query has less than 80*25 bytes.\n"
187 "\n"
188 "In case of succes, 0 is returned, a value>0 otherwise.\n"
189 "\n"
190 "Usage: root2sql [options] -uri URI rootfile.root\n"
191 "\n"
192 ;
193 cout << endl;
194}
195
196enum BasicType_t
197{
198 kNone = 0,
199 kFloat,
200 kDouble,
201 kInt16,
202 kUInt16,
203 kInt32,
204 kUInt32,
205 kInt64,
206 kUInt64,
207};
208
209static const map<string, pair<BasicType_t, string>> ConvRoot =
210{
211 { "Float_t", { kFloat, "FLOAT" } },
212 { "Double_t", { kDouble, "DOUBLE" } },
213 { "ULong64_t", { kUInt64, "BIGINT UNSIGNED" } },
214 { "Long64_t", { kInt64, "BIGINT" } },
215 { "UInt_t", { kUInt32, "INT UNSIGNED" } },
216 { "Int_t", { kInt32, "INT" } },
217 { "UShort_t", { kUInt16, "SMALLINT UNSIGNED" } },
218 { "Short_t", { kInt16, "SMALLINT" } },
219};
220
221struct Container
222{
223 string branch; // branch name
224 string column; // column name
225 BasicType_t type;
226 void *ptr;
227
228 Container(const string &b, const string &c, const BasicType_t &t) : branch(b), column(c), type(t), ptr(0)
229 {
230 switch (t)
231 {
232 case kFloat: ptr = new Float_t; break;
233 case kDouble: ptr = new Double_t; break;
234 case kInt16: ptr = new Short_t; break;
235 case kUInt16: ptr = new UShort_t; break;
236 case kInt32: ptr = new Int_t; break;
237 case kUInt32: ptr = new UInt_t; break;
238 case kInt64: ptr = new Long64_t; break;
239 case kUInt64: ptr = new ULong64_t; break;
240 case kNone:
241 break;
242 }
243 }
244 ~Container()
245 {
246 //::operator delete(ptr); // It seems root is deleting it already
247 }
248
249 string fmt() const
250 {
251 ostringstream str;
252
253 switch (type)
254 {
255 case kFloat: str << setprecision(8) << *reinterpret_cast<Float_t*>(ptr); break;
256 case kDouble: str << setprecision(16) << *reinterpret_cast<Double_t*>(ptr); break;
257 case kInt16: str << *reinterpret_cast<Short_t*>(ptr); break;
258 case kUInt16: str << *reinterpret_cast<UShort_t*>(ptr); break;
259 case kInt32: str << *reinterpret_cast<Int_t*>(ptr); break;
260 case kUInt32: str << *reinterpret_cast<UInt_t*>(ptr); break;
261 case kInt64: str << *reinterpret_cast<Long64_t*>(ptr); break;
262 case kUInt64: str << *reinterpret_cast<ULong64_t*>(ptr); break;
263 case kNone:
264 break;
265 }
266
267 //if (str.str()=="nan" || str.str()=="-nan" || str.str()=="inf" || str.str()=="-inf")
268 // return "NULL";
269
270 return str.str();
271 }
272};
273
274void ErrorHandlerAll(Int_t level, Bool_t abort, const char *location, const char *msg)
275{
276 if (string(msg).substr(0,24)=="no dictionary for class ")
277 return;
278
279 DefaultErrorHandler(level, abort, location, msg);
280}
281
282int main(int argc, const char* argv[])
283{
284 Time start;
285
286 gROOT->SetBatch();
287 SetErrorHandler(ErrorHandlerAll);
288
289 Configuration conf(argv[0]);
290 conf.SetPrintUsage(PrintUsage);
291 SetupConfiguration(conf);
292
293 if (!conf.DoParse(argc, argv))
294 return 127;
295
296 // ----------------------------- Evaluate options --------------------------
297 const string uri = conf.Get<string>("uri");
298 const string file = conf.Get<string>("file");
299 const string tree = conf.Get<string>("tree");
300 const string table = conf.Get<string>("table").empty() ? tree : conf.Get<string>("table");
301
302 const uint16_t verbose = conf.Get<uint16_t>("verbose");
303 const int64_t first = conf.Get<int64_t>("first");
304 const int64_t max = conf.Get<int64_t>("max");
305
306 const bool force = conf.Get<bool>("force");
307 const bool drop = conf.Get<bool>("drop");
308 const bool create = conf.Get<bool>("create") || drop;
309 const bool noinsert = conf.Get<bool>("no-insert");
310 const bool dry_run = conf.Get<bool>("dry-run");
311
312 const string engine = conf.Get<string>("engine");
313 const string row_format = conf.Get<string>("row-format");
314
315 const vector<string> duplicate = conf.Vec<string>("duplicate");
316
317 const bool ignore_errors = conf.Get<bool>("ignore-errors");
318
319 const bool print_connection = conf.Get<bool>("print-connection");
320 const bool print_branches = conf.Get<bool>("print-branches");
321 const bool print_leaves = conf.Get<bool>("print-leaves");
322 const bool print_create = conf.Get<bool>("print-create");
323 const bool print_insert = conf.Get<bool>("print-insert");
324
325 const vector<Map> mymap = conf.Vec<Map>("map");
326 const vector<Map> sqltypes = conf.Vec<Map>("sql-type");
327 const vector<string> _ignore = conf.Vec<string>("ignore");
328 const vector<string> primary = conf.Vec<string>("primary");
329
330 // -------------------------------------------------------------------------
331
332 if (verbose>0)
333 cout << "\n-------------------------- Evaluating file -------------------------" << endl;
334
335 TFile f(file.c_str());
336 if (f.IsZombie())
337 {
338 cerr << "Could not open file " << file << endl;
339 return 1;
340 }
341
342 if (verbose>0)
343 cout << "File: " << file << endl;
344
345 TTree *T = 0;
346 f.GetObject(tree.c_str(), T);
347 if (!T)
348 {
349 cerr << "Could not open tree " << tree << endl;
350 return 2;
351 }
352
353 if (verbose>0)
354 cout << "Tree: " << tree << endl;
355
356 T->SetMakeClass(1);
357
358 TObjArray *branches = T->GetListOfBranches();
359 TObjArray *leaves = T->GetListOfLeaves();
360
361 if (print_branches)
362 {
363 cout << '\n';
364 branches->Print();
365 }
366
367 if (verbose>0)
368 cout << T->GetEntriesFast() << " events found." << endl;
369
370
371 if (verbose>0)
372 cout << branches->GetEntries() << " branches found." << endl;
373
374 if (print_leaves)
375 {
376 cout << '\n';
377 leaves->Print();
378 }
379 if (verbose>0)
380 cout << leaves->GetEntries() << " leaves found." << endl;
381
382 string query =
383 "CREATE TABLE IF NOT EXISTS `"+table+"`\n"
384 "(\n";
385
386 vector<Container> vec;
387
388
389 TIter Next(leaves);
390 TObject *o = 0;
391 while ((o=Next()))
392 {
393 TLeaf *L = T->GetLeaf(o->GetName());
394
395 if (verbose>2)
396 cout << '\n' << o->GetName() << " [" << L->GetTypeName() << "]";
397
398 string name = o->GetName();
399
400
401 bool found = false;
402 for (auto b=_ignore.cbegin(); b!=_ignore.cend(); b++)
403 {
404 if (boost::regex_match(name, boost::regex(*b)))
405 {
406 found = true;
407 if (verbose>2)
408 cout << " (-ignored-)";
409 break;
410 }
411 }
412 if (found)
413 continue;
414
415 const string tn = L->GetTypeName();
416
417 auto it = ConvRoot.find(tn);
418 if (it==ConvRoot.end())
419 {
420 if (verbose>2)
421 cout << " (-n/a-)";
422 continue;
423 }
424
425 if (verbose==2)
426 cout << '\n' << o->GetName() << " [" << L->GetTypeName() << "]";
427
428 for (auto m=mymap.cbegin(); m!=mymap.cend(); m++)
429 name = boost::regex_replace(name, boost::regex(m->first), m->second);
430
431 if (verbose>1)
432 cout << " (" << name << ")";
433
434 string sqltype = it->second.second;
435
436 for (auto m=sqltypes.cbegin(); m!=sqltypes.cend(); m++)
437 if (m->first==name)
438 sqltype = m->second;
439
440 if (!vec.empty())
441 query += ",\n";
442 query += " `"+name+"` "+sqltype+" NOT NULL COMMENT '"+o->GetName()+"'";
443
444 vec.emplace_back(o->GetName(), name, it->second.first);
445 T->SetBranchAddress(o->GetName(), vec.back().ptr);
446 }
447
448 if (verbose>1)
449 cout << "\n\n";
450 if (verbose>0)
451 cout << vec.size() << " leaves setup for reading." << endl;
452
453 UInt_t datatype = 0;
454 const bool has_datatype = T->SetBranchAddress("DataType.fVal", &datatype) >= 0;
455
456 // Setiing up branch status (must be after all SetBranchAddress)
457 T->SetBranchStatus("*", 0);
458 for (auto c=vec.cbegin(); c!=vec.cend(); c++)
459 T->SetBranchStatus(c->branch.c_str(), 1);
460
461 if (has_datatype)
462 {
463 T->SetBranchStatus("DataType.fVal", 1);
464 if (verbose>0)
465 cout << "Rows with DataType.fVal!=1 will be skipped." << endl;
466 }
467
468 // -------------------------------------------------------------------------
469 // Checking for database connection
470
471 Database connection(uri);
472
473 try
474 {
475 if (!force)
476 connection.connected();
477 }
478 catch (const exception &e)
479 {
480 cerr << "SQL connection failed: " << e.what() << '\n' << endl;
481 return 3;
482 }
483
484 if (print_connection)
485 {
486 try
487 {
488 const auto &res1 = connection.query("SHOW STATUS LIKE 'Compression'").store();
489 cout << "Compression of databse connection is " << string(res1[0][1]) << endl;
490
491 const auto &res2 = connection.query("SHOW STATUS LIKE 'Ssl_cipher'").store();
492 cout << "Connection to databases is " << (string(res2[0][1]).empty()?"UNENCRYPTED":"ENCRYPTED ("+string(res2[0][1])+")") << endl;
493 }
494 catch (const exception &e)
495 {
496 cerr << "\nSHOW STATUS LIKE COMPRESSION\n\n";
497 cerr << "SQL query failed:\n" << e.what() << endl;
498 return 6;
499 }
500 }
501
502 // -------------------------------------------------------------------------
503
504 if (verbose>0)
505 cout << "\n--------------------------- Database Table -------------------------" << endl;
506
507 if (!primary.empty())
508 query += ",\n PRIMARY KEY USING BTREE (`"+boost::algorithm::join(primary, "`, `")+"`)\n";
509
510 query +=
511 ")\n"
512 "DEFAULT CHARSET=latin1 COLLATE=latin1_general_ci\n";
513 if (!engine.empty())
514 query += "ENGINE="+engine+"\n";
515 if (!row_format.empty())
516 query += "ROW_FORMAT="+row_format+"\n";
517 query += "COMMENT='created by "+conf.GetName()+"'\n";
518
519 // FIXME: Can we omit the catching to be able to print the
520 // query 'autmatically'?
521 try
522 {
523 if (drop)
524 {
525 // => Simple result
526 if (!dry_run)
527 connection.query("DROP TABLE `"+table+"`").execute();
528 if (verbose>0)
529 cout << "Table `" << table << "` dropped." << endl;
530 }
531 }
532 catch (const exception &e)
533 {
534 cerr << "DROP TABLE `" << table << "`\n\n";
535 cerr << "SQL query failed:\n" << e.what() << '\n' << endl;
536 return 4;
537 }
538
539 try
540 {
541 if (create && !dry_run)
542 connection.query(query).execute();
543 }
544 catch (const exception &e)
545 {
546 cerr << query << "\n\n";
547 cerr << "SQL query failed:\n" << e.what() << '\n' << endl;
548 return 5;
549 }
550
551 if (print_create)
552 cout << query << endl;
553
554 if (create && verbose>0)
555 cout << "Table `" << table << "` created." << endl;
556
557 // -------------------------------------------------------------------------
558
559 if (verbose>0)
560 cout << "\n---------------------------- Reading file --------------------------" << endl;
561
562 //query = update ? "UPDATE" : "INSERT";
563 query = "INSERT ";
564 if (ignore_errors)
565 query += "IGNORE ";
566 query += "`"+table+"`\n"
567 "(\n";
568
569 for (auto c=vec.cbegin(); c!=vec.cend(); c++)
570 {
571 if (c!=vec.cbegin())
572 query += ",\n";
573 query += " `"+c->column+"`";
574 }
575
576 query +=
577 "\n)\n"
578 "VALUES\n";
579
580 size_t count = 0;
581
582 const size_t num = max>0 && (max-first)<T->GetEntriesFast() ? (max-first) : T->GetEntriesFast();
583 for (size_t j=first; j<num; j++)
584 {
585 T->GetEntry(j);
586 if (has_datatype && datatype!=1)
587 continue;
588
589 if (count>0)
590 query += ",\n";
591
592 query += "(\n";
593
594 for (auto c=vec.cbegin(); c!=vec.cend(); c++)
595 {
596 if (c!=vec.cbegin())
597 query += ",\n";
598
599 query += " "+c->fmt();
600
601 if (print_insert)
602 query += " /* "+c->column+" -> "+c->branch+" */";
603 }
604 query += "\n)";
605
606 count ++;
607 }
608
609 if (!duplicate.empty())
610 query += "\nON DUPLICATE KEY UPDATE\n " + boost::join(duplicate, ",\n ");
611
612 if (verbose>0)
613 cout << count << " out of " << num << " row(s) read from file [N=" << first << ".." << num-1 << "]." << endl;
614
615 if (count==0)
616 {
617 if (verbose>0)
618 cout << "Total execution time: " << Time().UnixTime()-start.UnixTime() << "s\n" << endl;
619 return 0;
620 }
621
622 // -------------------------------------------------------------------------
623
624 if (verbose>0)
625 {
626 cout << "\n--------------------------- Inserting data -------------------------" << endl;
627 cout << "Sending INSERT query (" << query.length() << " bytes)" << endl;
628 }
629
630 try
631 {
632 if (!noinsert && !dry_run)
633 {
634 auto q = connection.query(query);
635 q.execute();
636 cout << q.info() << '\n' << endl;
637 }
638 else
639 cout << "Insert query skipped!" << endl;
640
641 if (print_insert)
642 cout << query << endl;
643 }
644 catch (const exception &e)
645 {
646 if (verbose>1 || query.length()<80*25)
647 cerr << query << "\n\n";
648 cerr << "SQL query failed (" << query.length() << " bytes):\n" << e.what() << '\n' << endl;
649 return 6;
650 }
651
652 if (verbose>0)
653 {
654 const auto sec = Time().UnixTime()-start.UnixTime();
655 cout << "Total execution time: " << sec << "s ";
656 cout << "(" << Tools::Fractional(sec/count) << "s/row)\n";
657
658 try
659 {
660 const auto resw =
661 connection.query("SHOW WARNINGS").store();
662
663 for (size_t i=0; i<resw.num_rows(); i++)
664 {
665 const mysqlpp::Row &roww = resw[i];
666
667 cout << roww["Level"] << '[' << roww["Code"] << "]: ";
668 cout << roww["Message"] << '\n';
669 }
670 cout << endl;
671
672 }
673 catch (const exception &e)
674 {
675 cerr << "\nSHOW WARNINGS\n\n";
676 cerr << "SQL query failed:\n" << e.what() << '\n' <<endl;
677 return 7;
678 }
679 }
680
681 return 0;
682}
Note: See TracBrowser for help on using the repository browser.