source: trunk/FACT++/src/root2sql.cc@ 19108

Last change on this file since 19108 was 19108, checked in by tbretz, 6 years ago
The filename is also required.
File size: 22.4 KB
Line 
1#include <boost/algorithm/string/join.hpp>
2#include <boost/regex.hpp>
3
4#include "Database.h"
5
6#include "Time.h"
7#include "Configuration.h"
8
9#include <TROOT.h>
10#include <TFile.h>
11#include <TTree.h>
12#include <TLeaf.h>
13#include <TError.h>
14
15using namespace std;
16
17// ------------------------------------------------------------------------
18
19struct Map : pair<string, string>
20{
21 Map() { }
22};
23
24std::istream &operator>>(std::istream &in, Map &m)
25{
26 const istreambuf_iterator<char> eos;
27 string txt(istreambuf_iterator<char>(in), eos);
28
29 const boost::regex expr("((.*)[^\\\\])/(.*)");
30 boost::smatch match;
31 if (!boost::regex_match(txt, match, expr))
32 throw runtime_error("Could not evaluate map argument: "+txt);
33
34 m.first = match[1].str();
35 m.second = match[3].str();
36
37 return in;
38}
39
40
41void SetupConfiguration(Configuration &conf)
42{
43 po::options_description control("Root to SQL");
44 control.add_options()
45 ("uri,u", var<string>()
46#if BOOST_VERSION >= 104200
47 ->required()
48#endif
49 , "Database link as in\n\tuser:password@server[:port]/database.")
50 ("file", var<string>("")
51#if BOOST_VERSION >= 104200
52 ->required()
53#endif
54 , "The root file to read from")
55 ("force", po_switch(), "Force processing even if there is no database connection")
56 ("create", po_switch(), "Create the database if not existing")
57 ("drop", po_switch(), "Drop the table (implies create)")
58 ("tree,t", var<string>("Events"), "Name of the root tree to convert")
59 ("table", var<string>(""), "Name of the table to use (default is the tree name)")
60 ("map", vars<Map>(), "A regular expression which is applied to the leaf name befoee it is used as SQL column name)")
61 ("sql-type", vars<Map>(), "Allows to overwrite the calculated SQL type for a given column e.g. 'sql-column-name/UNSIGNED IN'")
62 ("ignore", vars<string>(), "Ignore the given leaf, if the given regular expression matches")
63 ("primary", vars<string>(), "List of columns to be used as primary keys during table creation (in connection with --create)")
64 ("first", var<int64_t>(int64_t(0)), "First event to start with (default: 0), mainly for test purpose")
65 ("max", var<int64_t>(int64_t(0)), "Maximum number of events to process (0: all), mainly for test purpose")
66 ("engine", var<string>("InnoDB"), "Database engine to be used when a new table is created")
67 ("duplicate", var<string>(""), "Specifies an assignment_list for an 'ON DUPLICATE KEY UPDATE' expression")
68 ("ignore-errors", po_switch(), "Adds the IGNORE keyword to the INSERT query (turns errors into warnings, ignores rows with errors)")
69 ;
70
71 po::options_description debug("Debug options");
72 debug.add_options()
73 ("no-insert", po_switch(), "Does not insert any data into the table")
74 ("dry-run", po_switch(), "Skip any query which changes the databse (might result in consecutive failures)")
75 ("print-branches", po_switch(), "Print the branches found in the tree")
76 ("print-leaves", po_switch(), "Print the leaves found in the tree (this is what is processed)")
77 ("print-insert", po_switch(), "Print the INSERT query (note that it contains all data)")
78 ("print-create", po_switch(), "Print the CREATE query")
79 ("verbose,v", var<uint16_t>(1), "Verbosity (0: quiet, 1: default, 2: more, 3, ...)")
80 ;
81
82 po::positional_options_description p;
83 p.add("file", 1); // The 1st positional options (n=1)
84
85 conf.AddOptions(control);
86 conf.AddOptions(debug);
87 conf.SetArgumentPositions(p);
88}
89
90void PrintUsage()
91{
92 cout <<
93 "root2sql - Fills the data from a root file into a database\n"
94 "\n"
95 "For convenience, this documentation uses the extended version of the options, "
96 "refer to the output below to get the abbreviations.\n"
97 "\n"
98 "This is a general purpose tool to fill the contents of a root file into a database "
99 "as long as this is technically possible and makes sense. Note that root can even "
100 "write complex data like a TH1F into a database, this is not the purpose of this "
101 "program.\n"
102 "\n"
103 "Each root tree has branches and leaves (the basic data types). These leaves can "
104 "be read independently of the classes which were used to write the root file. "
105 "The default tree to read from is 'Events' but the name can be overwritten "
106 "using --tree. The default table name to fill the data into is identical to "
107 "the tree name. It can be overwritten using --table.\n"
108 "\n"
109 "The name of each column to which data is filled from a leave is obtained from "
110 "the leaves' names. The leave names can be checked using --print-leaves. "
111 "A --print-branches exists for convenience to print only the high-level branches. "
112 "Sometimes these names might be quite unconvenient like MTime.fTime.fMilliSec or "
113 "just MHillas.fWidth. To allow to simplify column names, regular expressions "
114 "(using boost's regex) can be defined to change the names. Note that these regular "
115 "expressions are applied one by one on each leaf's name. A valid expression could "
116 "be:\n"
117 " --map=MHillas\\.f/\n"
118 "which would remove all occurances of 'MHillas.f'. This option can be used more than "
119 "once. They are applied in sequence. A single match does not stop the sequence.\n"
120 "\n"
121 "Sometimes it might also be convenient to skip a leaf. This can be done with "
122 "the --ignore resource. If the given regular expresion yields a match, the "
123 "leaf will be ignored. Note that the regular expression works on the raw-name "
124 "of the leaf not the readily mapped SQL column names. Example:\n"
125 " --ignore=ThetaSq\\..*\n"
126 "will skip all leaved which start with 'ThetaSq.'. This option can be used"
127 "more than once.\n"
128 "\n"
129 "The data type of each column is kept as close as possible to the leaves' data "
130 "types. If for some reason this is not wanted, the data type of the SQL column "
131 "can be overwritten with --sql-type sql-column/sql-ytpe, for example:\n"
132 " --sql-type=FileId/UNSIGNED INT\n"
133 "while the first argument of the name of the SQL column to which the data type "
134 "should be applied. The second column is the basic SQL data type. The option can "
135 "be given more than once.\n"
136 "\n"
137 "Database interaction:\n"
138 "\n"
139 "To drop an existing table, --drop can be used.\n"
140 "\n"
141 "To create a table according to theSQL column names and data types, --create "
142 "can be used. The query used can be printed with --print-create even --create "
143 "has not been specified.\n"
144 "\n"
145 "To choose the columns which should become primary keys, use --primary, "
146 "for exmaple:\n"
147 " --primary=col1\n"
148 "To define more than one column as primary key, the option can be given more than "
149 "once. Note that the combination of these columns must be unique.\n"
150 "\n"
151 "All columns are created as NOT NULL as default and the table is created as "
152 "InnoDB (default).\n"
153 "\n"
154 "Usually, the INSERT query would fail if the PRIMARY key exists already. "
155 "This can be avoided using the 'ON DUPLICATE KEY' directive. With the "
156 "--duplicate,you can specify what should be updated in case of a duplicate key. "
157 "To keep the row untouched, you can just update the primary key "
158 "with the identical primary key, e.g. --duplicate='MyPrimary=VALUES(MyPrimary)'. "
159 "For more details, see the MySQL manual.\n"
160 "\n"
161 "For debugging purpose, or to just create or drop a table, the final insert "
162 "query can be skipped using --no-insert. Note that for performance reason, "
163 "all data is collected in memory and a single INSERT query is issued at the "
164 "end.\n"
165 "\n"
166 "Another possibility is to add the IGNORE keyword to the INSERT query by "
167 "--ignore-errors, which essentially ignores all errors and turns them into "
168 "warnings which are printed after the query succeeded.\n"
169 "\n"
170 "Using a higher verbosity level (-v), an overview of the written columns or all "
171 "processed leaves is printed depending on the verbosity level. The output looks "
172 "like the following\n"
173 " Leaf name [root data type] (SQL name)\n"
174 "for example\n"
175 " MTime.fTime.fMilliSec [Long64_t] (MilliSec)\n"
176 "which means that the leaf MTime.fTime.fMilliSec is detected to be a Long64_t "
177 "which is filled into a column called MilliSec. Leaves with non basic data types "
178 "are ignored automatically and are marked as (-n/a-). User ignored columnd "
179 "are marked as (-ignored-).\n"
180 "\n"
181 "If a query failed, the query is printed to stderr together with the error message. "
182 "For the main INSERT query, this is only true if the verbosity level is at least 2 "
183 "or the query has less than 80*25 bytes.\n"
184 "\n"
185 "In case of succes, 0 is returned, a value>0 otherwise.\n"
186 "\n"
187 "Usage: root2sql [options] -uri URI rootfile.root\n"
188 "\n"
189 ;
190 cout << endl;
191}
192
193enum BasicType_t
194{
195 kNone = 0,
196 kFloat,
197 kDouble,
198 kInt16,
199 kUInt16,
200 kInt32,
201 kUInt32,
202 kInt64,
203 kUInt64,
204};
205
206static const map<string, pair<BasicType_t, string>> ConvRoot =
207{
208 { "Float_t", { kFloat, "FLOAT" } },
209 { "Double_t", { kDouble, "DOUBLE" } },
210 { "ULong64_t", { kUInt64, "BIGINT UNSIGNED" } },
211 { "Long64_t", { kInt64, "BIGINT" } },
212 { "UInt_t", { kUInt32, "INT UNSIGNED" } },
213 { "Int_t", { kInt32, "INT" } },
214 { "UShort_t", { kUInt16, "SMALLINT UNSIGNED" } },
215 { "Short_t", { kInt16, "SMALLINT" } },
216};
217
218struct Container
219{
220 string branch; // branch name
221 string column; // column name
222 BasicType_t type;
223 void *ptr;
224
225 Container(const string &b, const string &c, const BasicType_t &t) : branch(b), column(c), type(t), ptr(0)
226 {
227 switch (t)
228 {
229 case kFloat: ptr = new Float_t; break;
230 case kDouble: ptr = new Double_t; break;
231 case kInt16: ptr = new Short_t; break;
232 case kUInt16: ptr = new UShort_t; break;
233 case kInt32: ptr = new Int_t; break;
234 case kUInt32: ptr = new UInt_t; break;
235 case kInt64: ptr = new Long64_t; break;
236 case kUInt64: ptr = new ULong64_t; break;
237 case kNone:
238 break;
239 }
240 }
241 ~Container()
242 {
243 //::operator delete(ptr); // It seems root is deleting it already
244 }
245
246 string fmt() const
247 {
248 ostringstream str;
249
250 switch (type)
251 {
252 case kFloat: str << setprecision(8) << *reinterpret_cast<Float_t*>(ptr); break;
253 case kDouble: str << setprecision(16) << *reinterpret_cast<Double_t*>(ptr); break;
254 case kInt16: str << *reinterpret_cast<Short_t*>(ptr); break;
255 case kUInt16: str << *reinterpret_cast<UShort_t*>(ptr); break;
256 case kInt32: str << *reinterpret_cast<Int_t*>(ptr); break;
257 case kUInt32: str << *reinterpret_cast<UInt_t*>(ptr); break;
258 case kInt64: str << *reinterpret_cast<Long64_t*>(ptr); break;
259 case kUInt64: str << *reinterpret_cast<ULong64_t*>(ptr); break;
260 case kNone:
261 break;
262 }
263
264 //if (str.str()=="nan" || str.str()=="-nan" || str.str()=="inf" || str.str()=="-inf")
265 // return "NULL";
266
267 return str.str();
268 }
269};
270
271void ErrorHandlerAll(Int_t level, Bool_t abort, const char *location, const char *msg)
272{
273 if (string(msg).substr(0,24)=="no dictionary for class ")
274 return;
275
276 DefaultErrorHandler(level, abort, location, msg);
277}
278
279int main(int argc, const char* argv[])
280{
281 Time start;
282
283 gROOT->SetBatch();
284 SetErrorHandler(ErrorHandlerAll);
285
286 Configuration conf(argv[0]);
287 conf.SetPrintUsage(PrintUsage);
288 SetupConfiguration(conf);
289
290 if (!conf.DoParse(argc, argv))
291 return 127;
292
293 // ----------------------------- Evaluate options --------------------------
294 const string uri = conf.Get<string>("uri");
295 const string file = conf.Get<string>("file");
296 const string tree = conf.Get<string>("tree");
297 const string table = conf.Get<string>("table").empty() ? tree : conf.Get<string>("table");
298
299 const uint16_t verbose = conf.Get<uint16_t>("verbose");
300 const int64_t first = conf.Get<int64_t>("first");
301 const int64_t max = conf.Get<int64_t>("max");
302
303 const bool force = conf.Get<bool>("force");
304 const bool drop = conf.Get<bool>("drop");
305 const bool create = conf.Get<bool>("create") || drop;
306 const bool noinsert = conf.Get<bool>("no-insert");
307 const bool dry_run = conf.Get<bool>("dry-run");
308
309 const string engine = conf.Get<string>("engine");
310 const string duplicate = conf.Get<string>("duplicate");
311
312 const bool ignore_errors = conf.Get<bool>("ignore-errors");
313
314 const bool print_branches = conf.Get<bool>("print-branches");
315 const bool print_leaves = conf.Get<bool>("print-leaves");
316 const bool print_create = conf.Get<bool>("print-create");
317 const bool print_insert = conf.Get<bool>("print-insert");
318
319 const vector<Map> mymap = conf.Vec<Map>("map");
320 const vector<Map> sqltypes = conf.Vec<Map>("sql-type");
321 const vector<string> _ignore = conf.Vec<string>("ignore");
322 const vector<string> primary = conf.Vec<string>("primary");
323
324 // -------------------------------------------------------------------------
325
326 if (verbose>0)
327 cout << "\n-------------------------- Evaluating file -------------------------" << endl;
328
329 TFile f(file.c_str());
330 if (f.IsZombie())
331 {
332 cerr << "Could not open file " << file << endl;
333 return 1;
334 }
335
336 if (verbose>0)
337 cout << "File: " << file << endl;
338
339 TTree *T = 0;
340 f.GetObject(tree.c_str(), T);
341 if (!T)
342 {
343 cerr << "Could not open tree " << tree << endl;
344 return 2;
345 }
346
347 if (verbose>0)
348 cout << "Tree: " << tree << endl;
349
350 T->SetMakeClass(1);
351
352 TObjArray *branches = T->GetListOfBranches();
353 TObjArray *leaves = T->GetListOfLeaves();
354
355 if (print_branches)
356 {
357 cout << '\n';
358 branches->Print();
359 }
360
361 if (verbose>0)
362 cout << T->GetEntriesFast() << " events found." << endl;
363
364
365 if (verbose>0)
366 cout << branches->GetEntries() << " branches found." << endl;
367
368 if (print_leaves)
369 {
370 cout << '\n';
371 leaves->Print();
372 }
373 if (verbose>0)
374 cout << leaves->GetEntries() << " leaves found." << endl;
375
376 string query =
377 "CREATE TABLE IF NOT EXISTS `"+table+"`\n"
378 "(\n";
379
380 vector<Container> vec;
381
382
383 TIter Next(leaves);
384 TObject *o = 0;
385 while ((o=Next()))
386 {
387 TLeaf *L = T->GetLeaf(o->GetName());
388
389 if (verbose>2)
390 cout << '\n' << o->GetName() << " [" << L->GetTypeName() << "]";
391
392 string name = o->GetName();
393
394
395 bool found = false;
396 for (auto b=_ignore.cbegin(); b!=_ignore.cend(); b++)
397 {
398 if (boost::regex_match(name, boost::regex(*b)))
399 {
400 found = true;
401 if (verbose>2)
402 cout << " (-ignored-)";
403 break;
404 }
405 }
406 if (found)
407 continue;
408
409 const string tn = L->GetTypeName();
410
411 auto it = ConvRoot.find(tn);
412 if (it==ConvRoot.end())
413 {
414 if (verbose>2)
415 cout << " (-n/a-)";
416 continue;
417 }
418
419 if (verbose==2)
420 cout << '\n' << o->GetName() << " [" << L->GetTypeName() << "]";
421
422 for (auto m=mymap.cbegin(); m!=mymap.cend(); m++)
423 name = boost::regex_replace(name, boost::regex(m->first), m->second);
424
425 if (verbose>1)
426 cout << " (" << name << ")";
427
428 string sqltype = it->second.second;
429
430 for (auto m=sqltypes.cbegin(); m!=sqltypes.cend(); m++)
431 if (m->first==name)
432 sqltype = m->second;
433
434 if (!vec.empty())
435 query += ",\n";
436 query += " `"+name+"` "+sqltype+" NOT NULL COMMENT '"+o->GetName()+"'";
437
438 vec.emplace_back(o->GetName(), name, it->second.first);
439 T->SetBranchAddress(o->GetName(), vec.back().ptr);
440 }
441
442 if (verbose>1)
443 cout << "\n\n";
444 if (verbose>0)
445 cout << vec.size() << " leaves setup for reading." << endl;
446
447 UInt_t datatype = 0;
448 const bool has_datatype = T->SetBranchAddress("DataType.fVal", &datatype) >= 0;
449
450 // Setiing up branch status (must be after all SetBranchAddress)
451 T->SetBranchStatus("*", 0);
452 for (auto c=vec.cbegin(); c!=vec.cend(); c++)
453 T->SetBranchStatus(c->branch.c_str(), 1);
454
455 if (has_datatype)
456 {
457 T->SetBranchStatus("DataType.fVal", 1);
458 if (verbose>0)
459 cout << "Rows with DataType.fVal!=1 will be skipped." << endl;
460 }
461
462 // -------------------------------------------------------------------------
463 // Checking for database connection
464
465 Database connection(uri);
466
467 try
468 {
469 if (!force)
470 connection.connected();
471 }
472 catch (const exception &e)
473 {
474 cerr << "SQL connection failed: " << e.what() << '\n' << endl;
475 return 3;
476 }
477
478 // -------------------------------------------------------------------------
479
480 if (verbose>0)
481 cout << "\n--------------------------- Database Table -------------------------" << endl;
482
483 if (!primary.empty())
484 query += ",\n PRIMARY KEY USING BTREE (`"+boost::algorithm::join(primary, "`, `")+"`)\n";
485
486 query +=
487 ")\n"
488 "DEFAULT CHARSET=latin1 COLLATE=latin1_general_ci\n"
489 "ENGINE="+engine+"\n"
490 "COMMENT='created by "+conf.GetName()+"'\n";
491
492 // FIXME: Can we omit the catching to be able to print the
493 // query 'autmatically'?
494 try
495 {
496 if (drop)
497 {
498 // => Simple result
499 if (!dry_run)
500 connection.query("DROP TABLE `"+table+"`").execute();
501 if (verbose>0)
502 cout << "Table `" << table << "` dropped." << endl;
503 }
504 }
505 catch (const exception &e)
506 {
507 cerr << "DROP TABLE `" << table << "`\n\n";
508 cerr << "SQL query failed:\n" << e.what() << '\n' << endl;
509 return 4;
510 }
511
512 try
513 {
514 if (create && !dry_run)
515 connection.query(query).execute();
516 }
517 catch (const exception &e)
518 {
519 cerr << query << "\n\n";
520 cerr << "SQL query failed:\n" << e.what() << '\n' << endl;
521 return 5;
522 }
523
524 if (print_create)
525 cout << query << endl;
526
527 if (create && verbose>0)
528 cout << "Table `" << table << "` created." << endl;
529
530 // -------------------------------------------------------------------------
531
532 if (verbose>0)
533 cout << "\n---------------------------- Reading file --------------------------" << endl;
534
535 //query = update ? "UPDATE" : "INSERT";
536 query = "INSERT ";
537 if (ignore_errors)
538 query += "IGNORE ";
539 query += "`"+table+"`\n"
540 "(\n";
541
542 for (auto c=vec.cbegin(); c!=vec.cend(); c++)
543 {
544 if (c!=vec.cbegin())
545 query += ",\n";
546 query += " `"+c->column+"`";
547 }
548
549 query +=
550 "\n)\n"
551 "VALUES\n";
552
553 size_t count = 0;
554
555 const size_t num = max>0 && (max-first)<T->GetEntriesFast() ? (max-first) : T->GetEntriesFast();
556 for (size_t j=first; j<num; j++)
557 {
558 T->GetEntry(j);
559 if (has_datatype && datatype!=1)
560 continue;
561
562 if (count>0)
563 query += ",\n";
564
565 query += "(\n";
566
567 for (auto c=vec.cbegin(); c!=vec.cend(); c++)
568 {
569 if (c!=vec.cbegin())
570 query += ",\n";
571
572 query += " "+c->fmt();
573
574 if (print_insert)
575 query += " /* "+c->column+" -> "+c->branch+" */";
576 }
577 query += "\n)";
578
579 count ++;
580 }
581
582 if (!duplicate.empty())
583 query += "\nON DUPLICATE KEY UPDATE\n " + duplicate;
584
585 if (verbose>0)
586 cout << count << " out of " << num << " row(s) read from file [N=" << first << ".." << num-1 << "]." << endl;
587
588 if (count==0)
589 {
590 if (verbose>0)
591 cout << "Total execution time: " << Time().UnixTime()-start.UnixTime() << "s\n" << endl;
592 return 0;
593 }
594
595 // -------------------------------------------------------------------------
596
597 if (verbose>0)
598 {
599 cout << "\n--------------------------- Inserting data -------------------------" << endl;
600 cout << "Sending INSERT query (" << query.length() << " bytes)" << endl;
601 }
602
603 try
604 {
605 if (!noinsert && !dry_run)
606 {
607 auto q = connection.query(query);
608 q.execute();
609 cout << q.info() << '\n' << endl;
610 }
611 else
612 cout << "Insert query skipped!" << endl;
613
614 if (print_insert)
615 cout << query << endl;
616 }
617 catch (const exception &e)
618 {
619 if (verbose>1 || query.length()<80*25)
620 cerr << query << "\n\n";
621 cerr << "SQL query failed (" << query.length() << " bytes):\n" << e.what() << '\n' << endl;
622 return 6;
623 }
624
625 if (verbose>0)
626 {
627 cout << "Total execution time: " << Time().UnixTime()-start.UnixTime() << "s\n" << endl;
628
629 try
630 {
631 const auto resw =
632 connection.query("SHOW WARNINGS").store();
633
634 for (size_t i=0; i<resw.num_rows(); i++)
635 {
636 const mysqlpp::Row &roww = resw[i];
637
638 cout << roww["Level"] << '[' << roww["Code"] << "]: ";
639 cout << roww["Message"] << '\n';
640 }
641 cout << endl;
642
643 }
644 catch (const exception &e)
645 {
646 cerr << "\nSHOW WARNINGS\n\n";
647 cerr << "SQL query failed:\n" << e.what() << '\n' <<endl;
648 return 7;
649 }
650 }
651
652 return 0;
653}
Note: See TracBrowser for help on using the repository browser.