source: trunk/FACT++/src/root2sql.cc@ 19110

Last change on this file since 19110 was 19110, checked in by tbretz, 6 years ago
Implemented row-format and now using the database default engine if not specified.
File size: 22.7 KB
Line 
1#include <boost/algorithm/string/join.hpp>
2#include <boost/regex.hpp>
3
4#include "Database.h"
5
6#include "Time.h"
7#include "Configuration.h"
8
9#include <TROOT.h>
10#include <TFile.h>
11#include <TTree.h>
12#include <TLeaf.h>
13#include <TError.h>
14
15using namespace std;
16
17// ------------------------------------------------------------------------
18
19struct Map : pair<string, string>
20{
21 Map() { }
22};
23
24std::istream &operator>>(std::istream &in, Map &m)
25{
26 const istreambuf_iterator<char> eos;
27 string txt(istreambuf_iterator<char>(in), eos);
28
29 const boost::regex expr("((.*)[^\\\\])/(.*)");
30 boost::smatch match;
31 if (!boost::regex_match(txt, match, expr))
32 throw runtime_error("Could not evaluate map argument: "+txt);
33
34 m.first = match[1].str();
35 m.second = match[3].str();
36
37 return in;
38}
39
40
41void SetupConfiguration(Configuration &conf)
42{
43 po::options_description control("Root to SQL");
44 control.add_options()
45 ("uri,u", var<string>()
46#if BOOST_VERSION >= 104200
47 ->required()
48#endif
49 , "Database link as in\n\tuser:password@server[:port]/database.")
50 ("file", var<string>("")
51#if BOOST_VERSION >= 104200
52 ->required()
53#endif
54 , "The root file to read from")
55 ("force", po_switch(), "Force processing even if there is no database connection")
56 ("create", po_switch(), "Create the database if not existing")
57 ("drop", po_switch(), "Drop the table (implies create)")
58 ("tree,t", var<string>("Events"), "Name of the root tree to convert")
59 ("table", var<string>(""), "Name of the table to use (default is the tree name)")
60 ("map", vars<Map>(), "A regular expression which is applied to the leaf name befoee it is used as SQL column name)")
61 ("sql-type", vars<Map>(), "Allows to overwrite the calculated SQL type for a given column e.g. 'sql-column-name/UNSIGNED IN'")
62 ("ignore", vars<string>(), "Ignore the given leaf, if the given regular expression matches")
63 ("primary", vars<string>(), "List of columns to be used as primary keys during table creation (in connection with --create)")
64 ("first", var<int64_t>(int64_t(0)), "First event to start with (default: 0), mainly for test purpose")
65 ("max", var<int64_t>(int64_t(0)), "Maximum number of events to process (0: all), mainly for test purpose")
66 ("engine", var<string>(""), "Database engine to be used when a new table is created")
67 ("row-format", var<string>(""), "Defines the ROW_FORMAT keyword for table creation")
68 ("duplicate", var<string>(""), "Specifies an assignment_list for an 'ON DUPLICATE KEY UPDATE' expression")
69 ("ignore-errors", po_switch(), "Adds the IGNORE keyword to the INSERT query (turns errors into warnings, ignores rows with errors)")
70 ;
71
72 po::options_description debug("Debug options");
73 debug.add_options()
74 ("no-insert", po_switch(), "Does not insert any data into the table")
75 ("dry-run", po_switch(), "Skip any query which changes the databse (might result in consecutive failures)")
76 ("print-branches", po_switch(), "Print the branches found in the tree")
77 ("print-leaves", po_switch(), "Print the leaves found in the tree (this is what is processed)")
78 ("print-insert", po_switch(), "Print the INSERT query (note that it contains all data)")
79 ("print-create", po_switch(), "Print the CREATE query")
80 ("verbose,v", var<uint16_t>(1), "Verbosity (0: quiet, 1: default, 2: more, 3, ...)")
81 ;
82
83 po::positional_options_description p;
84 p.add("file", 1); // The 1st positional options (n=1)
85
86 conf.AddOptions(control);
87 conf.AddOptions(debug);
88 conf.SetArgumentPositions(p);
89}
90
91void PrintUsage()
92{
93 cout <<
94 "root2sql - Fills the data from a root file into a database\n"
95 "\n"
96 "For convenience, this documentation uses the extended version of the options, "
97 "refer to the output below to get the abbreviations.\n"
98 "\n"
99 "This is a general purpose tool to fill the contents of a root file into a database "
100 "as long as this is technically possible and makes sense. Note that root can even "
101 "write complex data like a TH1F into a database, this is not the purpose of this "
102 "program.\n"
103 "\n"
104 "Each root tree has branches and leaves (the basic data types). These leaves can "
105 "be read independently of the classes which were used to write the root file. "
106 "The default tree to read from is 'Events' but the name can be overwritten "
107 "using --tree. The default table name to fill the data into is identical to "
108 "the tree name. It can be overwritten using --table.\n"
109 "\n"
110 "The name of each column to which data is filled from a leave is obtained from "
111 "the leaves' names. The leave names can be checked using --print-leaves. "
112 "A --print-branches exists for convenience to print only the high-level branches. "
113 "Sometimes these names might be quite unconvenient like MTime.fTime.fMilliSec or "
114 "just MHillas.fWidth. To allow to simplify column names, regular expressions "
115 "(using boost's regex) can be defined to change the names. Note that these regular "
116 "expressions are applied one by one on each leaf's name. A valid expression could "
117 "be:\n"
118 " --map=MHillas\\.f/\n"
119 "which would remove all occurances of 'MHillas.f'. This option can be used more than "
120 "once. They are applied in sequence. A single match does not stop the sequence.\n"
121 "\n"
122 "Sometimes it might also be convenient to skip a leaf. This can be done with "
123 "the --ignore resource. If the given regular expresion yields a match, the "
124 "leaf will be ignored. Note that the regular expression works on the raw-name "
125 "of the leaf not the readily mapped SQL column names. Example:\n"
126 " --ignore=ThetaSq\\..*\n"
127 "will skip all leaved which start with 'ThetaSq.'. This option can be used"
128 "more than once.\n"
129 "\n"
130 "The data type of each column is kept as close as possible to the leaves' data "
131 "types. If for some reason this is not wanted, the data type of the SQL column "
132 "can be overwritten with --sql-type sql-column/sql-ytpe, for example:\n"
133 " --sql-type=FileId/UNSIGNED INT\n"
134 "while the first argument of the name of the SQL column to which the data type "
135 "should be applied. The second column is the basic SQL data type. The option can "
136 "be given more than once.\n"
137 "\n"
138 "Database interaction:\n"
139 "\n"
140 "To drop an existing table, --drop can be used.\n"
141 "\n"
142 "To create a table according to theSQL column names and data types, --create "
143 "can be used. The query used can be printed with --print-create even --create "
144 "has not been specified.\n"
145 "\n"
146 "To choose the columns which should become primary keys, use --primary, "
147 "for exmaple:\n"
148 " --primary=col1\n"
149 "To define more than one column as primary key, the option can be given more than "
150 "once. Note that the combination of these columns must be unique.\n"
151 "\n"
152 "All columns are created as NOT NULL as default. To force a database engine "
153 "and/or a storage format, use --engine and --rot-format.\n"
154 "\n"
155 "Usually, the INSERT query would fail if the PRIMARY key exists already. "
156 "This can be avoided using the 'ON DUPLICATE KEY' directive. With the "
157 "--duplicate,you can specify what should be updated in case of a duplicate key. "
158 "To keep the row untouched, you can just update the primary key "
159 "with the identical primary key, e.g. --duplicate='MyPrimary=VALUES(MyPrimary)'. "
160 "For more details, see the MySQL manual.\n"
161 "\n"
162 "For debugging purpose, or to just create or drop a table, the final insert "
163 "query can be skipped using --no-insert. Note that for performance reason, "
164 "all data is collected in memory and a single INSERT query is issued at the "
165 "end.\n"
166 "\n"
167 "Another possibility is to add the IGNORE keyword to the INSERT query by "
168 "--ignore-errors, which essentially ignores all errors and turns them into "
169 "warnings which are printed after the query succeeded.\n"
170 "\n"
171 "Using a higher verbosity level (-v), an overview of the written columns or all "
172 "processed leaves is printed depending on the verbosity level. The output looks "
173 "like the following\n"
174 " Leaf name [root data type] (SQL name)\n"
175 "for example\n"
176 " MTime.fTime.fMilliSec [Long64_t] (MilliSec)\n"
177 "which means that the leaf MTime.fTime.fMilliSec is detected to be a Long64_t "
178 "which is filled into a column called MilliSec. Leaves with non basic data types "
179 "are ignored automatically and are marked as (-n/a-). User ignored columnd "
180 "are marked as (-ignored-).\n"
181 "\n"
182 "If a query failed, the query is printed to stderr together with the error message. "
183 "For the main INSERT query, this is only true if the verbosity level is at least 2 "
184 "or the query has less than 80*25 bytes.\n"
185 "\n"
186 "In case of succes, 0 is returned, a value>0 otherwise.\n"
187 "\n"
188 "Usage: root2sql [options] -uri URI rootfile.root\n"
189 "\n"
190 ;
191 cout << endl;
192}
193
194enum BasicType_t
195{
196 kNone = 0,
197 kFloat,
198 kDouble,
199 kInt16,
200 kUInt16,
201 kInt32,
202 kUInt32,
203 kInt64,
204 kUInt64,
205};
206
207static const map<string, pair<BasicType_t, string>> ConvRoot =
208{
209 { "Float_t", { kFloat, "FLOAT" } },
210 { "Double_t", { kDouble, "DOUBLE" } },
211 { "ULong64_t", { kUInt64, "BIGINT UNSIGNED" } },
212 { "Long64_t", { kInt64, "BIGINT" } },
213 { "UInt_t", { kUInt32, "INT UNSIGNED" } },
214 { "Int_t", { kInt32, "INT" } },
215 { "UShort_t", { kUInt16, "SMALLINT UNSIGNED" } },
216 { "Short_t", { kInt16, "SMALLINT" } },
217};
218
219struct Container
220{
221 string branch; // branch name
222 string column; // column name
223 BasicType_t type;
224 void *ptr;
225
226 Container(const string &b, const string &c, const BasicType_t &t) : branch(b), column(c), type(t), ptr(0)
227 {
228 switch (t)
229 {
230 case kFloat: ptr = new Float_t; break;
231 case kDouble: ptr = new Double_t; break;
232 case kInt16: ptr = new Short_t; break;
233 case kUInt16: ptr = new UShort_t; break;
234 case kInt32: ptr = new Int_t; break;
235 case kUInt32: ptr = new UInt_t; break;
236 case kInt64: ptr = new Long64_t; break;
237 case kUInt64: ptr = new ULong64_t; break;
238 case kNone:
239 break;
240 }
241 }
242 ~Container()
243 {
244 //::operator delete(ptr); // It seems root is deleting it already
245 }
246
247 string fmt() const
248 {
249 ostringstream str;
250
251 switch (type)
252 {
253 case kFloat: str << setprecision(8) << *reinterpret_cast<Float_t*>(ptr); break;
254 case kDouble: str << setprecision(16) << *reinterpret_cast<Double_t*>(ptr); break;
255 case kInt16: str << *reinterpret_cast<Short_t*>(ptr); break;
256 case kUInt16: str << *reinterpret_cast<UShort_t*>(ptr); break;
257 case kInt32: str << *reinterpret_cast<Int_t*>(ptr); break;
258 case kUInt32: str << *reinterpret_cast<UInt_t*>(ptr); break;
259 case kInt64: str << *reinterpret_cast<Long64_t*>(ptr); break;
260 case kUInt64: str << *reinterpret_cast<ULong64_t*>(ptr); break;
261 case kNone:
262 break;
263 }
264
265 //if (str.str()=="nan" || str.str()=="-nan" || str.str()=="inf" || str.str()=="-inf")
266 // return "NULL";
267
268 return str.str();
269 }
270};
271
272void ErrorHandlerAll(Int_t level, Bool_t abort, const char *location, const char *msg)
273{
274 if (string(msg).substr(0,24)=="no dictionary for class ")
275 return;
276
277 DefaultErrorHandler(level, abort, location, msg);
278}
279
280int main(int argc, const char* argv[])
281{
282 Time start;
283
284 gROOT->SetBatch();
285 SetErrorHandler(ErrorHandlerAll);
286
287 Configuration conf(argv[0]);
288 conf.SetPrintUsage(PrintUsage);
289 SetupConfiguration(conf);
290
291 if (!conf.DoParse(argc, argv))
292 return 127;
293
294 // ----------------------------- Evaluate options --------------------------
295 const string uri = conf.Get<string>("uri");
296 const string file = conf.Get<string>("file");
297 const string tree = conf.Get<string>("tree");
298 const string table = conf.Get<string>("table").empty() ? tree : conf.Get<string>("table");
299
300 const uint16_t verbose = conf.Get<uint16_t>("verbose");
301 const int64_t first = conf.Get<int64_t>("first");
302 const int64_t max = conf.Get<int64_t>("max");
303
304 const bool force = conf.Get<bool>("force");
305 const bool drop = conf.Get<bool>("drop");
306 const bool create = conf.Get<bool>("create") || drop;
307 const bool noinsert = conf.Get<bool>("no-insert");
308 const bool dry_run = conf.Get<bool>("dry-run");
309
310 const string engine = conf.Get<string>("engine");
311 const string row_format = conf.Get<string>("row-format");
312 const string duplicate = conf.Get<string>("duplicate");
313
314 const bool ignore_errors = conf.Get<bool>("ignore-errors");
315
316 const bool print_branches = conf.Get<bool>("print-branches");
317 const bool print_leaves = conf.Get<bool>("print-leaves");
318 const bool print_create = conf.Get<bool>("print-create");
319 const bool print_insert = conf.Get<bool>("print-insert");
320
321 const vector<Map> mymap = conf.Vec<Map>("map");
322 const vector<Map> sqltypes = conf.Vec<Map>("sql-type");
323 const vector<string> _ignore = conf.Vec<string>("ignore");
324 const vector<string> primary = conf.Vec<string>("primary");
325
326 // -------------------------------------------------------------------------
327
328 if (verbose>0)
329 cout << "\n-------------------------- Evaluating file -------------------------" << endl;
330
331 TFile f(file.c_str());
332 if (f.IsZombie())
333 {
334 cerr << "Could not open file " << file << endl;
335 return 1;
336 }
337
338 if (verbose>0)
339 cout << "File: " << file << endl;
340
341 TTree *T = 0;
342 f.GetObject(tree.c_str(), T);
343 if (!T)
344 {
345 cerr << "Could not open tree " << tree << endl;
346 return 2;
347 }
348
349 if (verbose>0)
350 cout << "Tree: " << tree << endl;
351
352 T->SetMakeClass(1);
353
354 TObjArray *branches = T->GetListOfBranches();
355 TObjArray *leaves = T->GetListOfLeaves();
356
357 if (print_branches)
358 {
359 cout << '\n';
360 branches->Print();
361 }
362
363 if (verbose>0)
364 cout << T->GetEntriesFast() << " events found." << endl;
365
366
367 if (verbose>0)
368 cout << branches->GetEntries() << " branches found." << endl;
369
370 if (print_leaves)
371 {
372 cout << '\n';
373 leaves->Print();
374 }
375 if (verbose>0)
376 cout << leaves->GetEntries() << " leaves found." << endl;
377
378 string query =
379 "CREATE TABLE IF NOT EXISTS `"+table+"`\n"
380 "(\n";
381
382 vector<Container> vec;
383
384
385 TIter Next(leaves);
386 TObject *o = 0;
387 while ((o=Next()))
388 {
389 TLeaf *L = T->GetLeaf(o->GetName());
390
391 if (verbose>2)
392 cout << '\n' << o->GetName() << " [" << L->GetTypeName() << "]";
393
394 string name = o->GetName();
395
396
397 bool found = false;
398 for (auto b=_ignore.cbegin(); b!=_ignore.cend(); b++)
399 {
400 if (boost::regex_match(name, boost::regex(*b)))
401 {
402 found = true;
403 if (verbose>2)
404 cout << " (-ignored-)";
405 break;
406 }
407 }
408 if (found)
409 continue;
410
411 const string tn = L->GetTypeName();
412
413 auto it = ConvRoot.find(tn);
414 if (it==ConvRoot.end())
415 {
416 if (verbose>2)
417 cout << " (-n/a-)";
418 continue;
419 }
420
421 if (verbose==2)
422 cout << '\n' << o->GetName() << " [" << L->GetTypeName() << "]";
423
424 for (auto m=mymap.cbegin(); m!=mymap.cend(); m++)
425 name = boost::regex_replace(name, boost::regex(m->first), m->second);
426
427 if (verbose>1)
428 cout << " (" << name << ")";
429
430 string sqltype = it->second.second;
431
432 for (auto m=sqltypes.cbegin(); m!=sqltypes.cend(); m++)
433 if (m->first==name)
434 sqltype = m->second;
435
436 if (!vec.empty())
437 query += ",\n";
438 query += " `"+name+"` "+sqltype+" NOT NULL COMMENT '"+o->GetName()+"'";
439
440 vec.emplace_back(o->GetName(), name, it->second.first);
441 T->SetBranchAddress(o->GetName(), vec.back().ptr);
442 }
443
444 if (verbose>1)
445 cout << "\n\n";
446 if (verbose>0)
447 cout << vec.size() << " leaves setup for reading." << endl;
448
449 UInt_t datatype = 0;
450 const bool has_datatype = T->SetBranchAddress("DataType.fVal", &datatype) >= 0;
451
452 // Setiing up branch status (must be after all SetBranchAddress)
453 T->SetBranchStatus("*", 0);
454 for (auto c=vec.cbegin(); c!=vec.cend(); c++)
455 T->SetBranchStatus(c->branch.c_str(), 1);
456
457 if (has_datatype)
458 {
459 T->SetBranchStatus("DataType.fVal", 1);
460 if (verbose>0)
461 cout << "Rows with DataType.fVal!=1 will be skipped." << endl;
462 }
463
464 // -------------------------------------------------------------------------
465 // Checking for database connection
466
467 Database connection(uri);
468
469 try
470 {
471 if (!force)
472 connection.connected();
473 }
474 catch (const exception &e)
475 {
476 cerr << "SQL connection failed: " << e.what() << '\n' << endl;
477 return 3;
478 }
479
480 // -------------------------------------------------------------------------
481
482 if (verbose>0)
483 cout << "\n--------------------------- Database Table -------------------------" << endl;
484
485 if (!primary.empty())
486 query += ",\n PRIMARY KEY USING BTREE (`"+boost::algorithm::join(primary, "`, `")+"`)\n";
487
488 query +=
489 ")\n"
490 "DEFAULT CHARSET=latin1 COLLATE=latin1_general_ci\n";
491 if (!engine.empty())
492 query += "ENGINE="+engine+"\n";
493 if (!row_format.empty())
494 query += "ROW_FORMAT="+row_format+"\n";
495 query += "COMMENT='created by "+conf.GetName()+"'\n";
496
497 // FIXME: Can we omit the catching to be able to print the
498 // query 'autmatically'?
499 try
500 {
501 if (drop)
502 {
503 // => Simple result
504 if (!dry_run)
505 connection.query("DROP TABLE `"+table+"`").execute();
506 if (verbose>0)
507 cout << "Table `" << table << "` dropped." << endl;
508 }
509 }
510 catch (const exception &e)
511 {
512 cerr << "DROP TABLE `" << table << "`\n\n";
513 cerr << "SQL query failed:\n" << e.what() << '\n' << endl;
514 return 4;
515 }
516
517 try
518 {
519 if (create && !dry_run)
520 connection.query(query).execute();
521 }
522 catch (const exception &e)
523 {
524 cerr << query << "\n\n";
525 cerr << "SQL query failed:\n" << e.what() << '\n' << endl;
526 return 5;
527 }
528
529 if (print_create)
530 cout << query << endl;
531
532 if (create && verbose>0)
533 cout << "Table `" << table << "` created." << endl;
534
535 // -------------------------------------------------------------------------
536
537 if (verbose>0)
538 cout << "\n---------------------------- Reading file --------------------------" << endl;
539
540 //query = update ? "UPDATE" : "INSERT";
541 query = "INSERT ";
542 if (ignore_errors)
543 query += "IGNORE ";
544 query += "`"+table+"`\n"
545 "(\n";
546
547 for (auto c=vec.cbegin(); c!=vec.cend(); c++)
548 {
549 if (c!=vec.cbegin())
550 query += ",\n";
551 query += " `"+c->column+"`";
552 }
553
554 query +=
555 "\n)\n"
556 "VALUES\n";
557
558 size_t count = 0;
559
560 const size_t num = max>0 && (max-first)<T->GetEntriesFast() ? (max-first) : T->GetEntriesFast();
561 for (size_t j=first; j<num; j++)
562 {
563 T->GetEntry(j);
564 if (has_datatype && datatype!=1)
565 continue;
566
567 if (count>0)
568 query += ",\n";
569
570 query += "(\n";
571
572 for (auto c=vec.cbegin(); c!=vec.cend(); c++)
573 {
574 if (c!=vec.cbegin())
575 query += ",\n";
576
577 query += " "+c->fmt();
578
579 if (print_insert)
580 query += " /* "+c->column+" -> "+c->branch+" */";
581 }
582 query += "\n)";
583
584 count ++;
585 }
586
587 if (!duplicate.empty())
588 query += "\nON DUPLICATE KEY UPDATE\n " + duplicate;
589
590 if (verbose>0)
591 cout << count << " out of " << num << " row(s) read from file [N=" << first << ".." << num-1 << "]." << endl;
592
593 if (count==0)
594 {
595 if (verbose>0)
596 cout << "Total execution time: " << Time().UnixTime()-start.UnixTime() << "s\n" << endl;
597 return 0;
598 }
599
600 // -------------------------------------------------------------------------
601
602 if (verbose>0)
603 {
604 cout << "\n--------------------------- Inserting data -------------------------" << endl;
605 cout << "Sending INSERT query (" << query.length() << " bytes)" << endl;
606 }
607
608 try
609 {
610 if (!noinsert && !dry_run)
611 {
612 auto q = connection.query(query);
613 q.execute();
614 cout << q.info() << '\n' << endl;
615 }
616 else
617 cout << "Insert query skipped!" << endl;
618
619 if (print_insert)
620 cout << query << endl;
621 }
622 catch (const exception &e)
623 {
624 if (verbose>1 || query.length()<80*25)
625 cerr << query << "\n\n";
626 cerr << "SQL query failed (" << query.length() << " bytes):\n" << e.what() << '\n' << endl;
627 return 6;
628 }
629
630 if (verbose>0)
631 {
632 cout << "Total execution time: " << Time().UnixTime()-start.UnixTime() << "s\n" << endl;
633
634 try
635 {
636 const auto resw =
637 connection.query("SHOW WARNINGS").store();
638
639 for (size_t i=0; i<resw.num_rows(); i++)
640 {
641 const mysqlpp::Row &roww = resw[i];
642
643 cout << roww["Level"] << '[' << roww["Code"] << "]: ";
644 cout << roww["Message"] << '\n';
645 }
646 cout << endl;
647
648 }
649 catch (const exception &e)
650 {
651 cerr << "\nSHOW WARNINGS\n\n";
652 cerr << "SQL query failed:\n" << e.what() << '\n' <<endl;
653 return 7;
654 }
655 }
656
657 return 0;
658}
Note: See TracBrowser for help on using the repository browser.