source: trunk/FACT++/src/root2sql.cc@ 19102

Last change on this file since 19102 was 19102, checked in by tbretz, 6 years ago
Minor improvemt of the usage text
File size: 21.8 KB
Line 
1#include <boost/algorithm/string/join.hpp>
2#include <boost/regex.hpp>
3
4#include "Database.h"
5
6#include "Time.h"
7#include "Configuration.h"
8
9#include <TROOT.h>
10#include <TFile.h>
11#include <TTree.h>
12#include <TLeaf.h>
13#include <TError.h>
14
15using namespace std;
16
17// ------------------------------------------------------------------------
18
19struct Map : pair<string, string>
20{
21 Map() { }
22};
23
24std::istream &operator>>(std::istream &in, Map &m)
25{
26 const istreambuf_iterator<char> eos;
27 string txt(istreambuf_iterator<char>(in), eos);
28
29 const boost::regex expr("((.*)[^\\\\])/(.*)");
30 boost::smatch match;
31 if (!boost::regex_match(txt, match, expr))
32 throw runtime_error("Could not evaluate map argument: "+txt);
33
34 m.first = match[1].str();
35 m.second = match[3].str();
36
37 return in;
38}
39
40
41void SetupConfiguration(Configuration &conf)
42{
43 po::options_description control("Root to SQL");
44 control.add_options()
45 ("uri,u", var<string>()
46#if BOOST_VERSION >= 104200
47 ->required()
48#endif
49 , "Database link as in\n\tuser:password@server[:port]/database.")
50 ("file", var<string>(""), "The root file to read from")
51 ("force", po_switch(), "Force processing even if there is no database connection")
52 ("create", po_switch(), "Create the database if not existing")
53 ("drop", po_switch(), "Drop the table (implies create)")
54 ("tree,t", var<string>("Events"), "Name of the root tree to convert")
55 ("table", var<string>(""), "Name of the table to use (default is the tree name)")
56 ("map", vars<Map>(), "A regular expression which is applied to the leaf name befoee it is used as SQL column name)")
57 ("sql-type", vars<Map>(), "Allows to overwrite the calculated SQL type for a given column e.g. 'sql-column-name/UNSIGNED IN'")
58 ("ignore", vars<string>(), "Ignore the given leaf, if the given regular expression matches")
59 ("primary", vars<string>(), "List of columns to be used as primary keys during table creation (in connection with --create)")
60 ("first", var<int64_t>(int64_t(0)), "First event to start with (default: 0), mainly for test purpose")
61 ("max", var<int64_t>(int64_t(0)), "Maximum number of events to process (0: all), mainly for test purpose")
62 ("engine", var<string>("InnoDB"), "Database engine to be used when a new table is created")
63 ("duplicate", var<string>(""), "Specifies an assignment_list for an 'ON DUPLICATE KEY UPDATE' expression")
64 ;
65
66 po::options_description debug("Debug options");
67 debug.add_options()
68 ("no-insert", po_switch(), "Does not insert any data into the table")
69 ("dry-run", po_switch(), "Skip any query which changes the databse (might result in consecutive failures)")
70 ("print-branches", po_switch(), "Print the branches found in the tree")
71 ("print-leaves", po_switch(), "Print the leaves found in the tree (this is what is processed)")
72 ("print-insert", po_switch(), "Print the INSERT query (note that it contains all data)")
73 ("print-create", po_switch(), "Print the CREATE query")
74 ("verbose,v", var<uint16_t>(1), "Verbosity (0: quiet, 1: default, 2: more, 3, ...)")
75 ;
76
77 po::positional_options_description p;
78 p.add("file", 1); // The 1st positional options (n=1)
79
80 conf.AddOptions(control);
81 conf.AddOptions(debug);
82 conf.SetArgumentPositions(p);
83}
84
85void PrintUsage()
86{
87 cout <<
88 "root2sql - Fills the data from a root file into a database\n"
89 "\n"
90 "For convenience, this documentation uses the extended version of the options, "
91 "refer to the output below to get the abbreviations.\n"
92 "\n"
93 "This is a general purpose tool to fill the contents of a root file into a database "
94 "as long as this is technically possible and makes sense. Note that root can even "
95 "write complex data like a TH1F into a database, this is not the purpose of this "
96 "program.\n"
97 "\n"
98 "Each root tree has branches and leaves (the basic data types). These leaves can "
99 "be read independently of the classes which were used to write the root file. "
100 "The default tree to read from is 'Events' but the name can be overwritten "
101 "using --tree. The default table name to fill the data into is identical to "
102 "the tree name. It can be overwritten using --table.\n"
103 "\n"
104 "The name of each column to which data is filled from a leave is obtained from "
105 "the leaves' names. The leave names can be checked using --print-leaves. "
106 "A --print-branches exists for convenience to print only the high-level branches. "
107 "Sometimes these names might be quite unconvenient like MTime.fTime.fMilliSec or "
108 "just MHillas.fWidth. To allow to simplify column names, regular expressions "
109 "(using boost's regex) can be defined to change the names. Note that these regular "
110 "expressions are applied one by one on each leaf's name. A valid expression could "
111 "be:\n"
112 " --map=MHillas\\.f/\n"
113 "which would remove all occurances of 'MHillas.f'. This option can be used more than "
114 "once. They are applied in sequence. A single match does not stop the sequence.\n"
115 "\n"
116 "Sometimes it might also be convenient to skip a leaf. This can be done with "
117 "the --ignore resource. If the given regular expresion yields a match, the "
118 "leaf will be ignored. Note that the regular expression works on the raw-name "
119 "of the leaf not the readily mapped SQL column names. Example:\n"
120 " --ignore=ThetaSq\\..*\n"
121 "will skip all leaved which start with 'ThetaSq.'. This option can be used"
122 "more than once.\n"
123 "\n"
124 "The data type of each column is kept as close as possible to the leaves' data "
125 "types. If for some reason this is not wanted, the data type of the SQL column "
126 "can be overwritten with --sql-type sql-column/sql-ytpe, for example:\n"
127 " --sql-type=FileId/UNSIGNED INT\n"
128 "while the first argument of the name of the SQL column to which the data type "
129 "should be applied. The second column is the basic SQL data type. The option can "
130 "be given more than once.\n"
131 "\n"
132 "Database interaction:\n"
133 "\n"
134 "To drop an existing table, --drop can be used.\n"
135 "\n"
136 "To create a table according to theSQL column names and data types, --create "
137 "can be used. The query used can be printed with --print-create even --create "
138 "has not been specified.\n"
139 "\n"
140 "To choose the columns which should become primary keys, use --primary, "
141 "for exmaple:\n"
142 " --primary=col1\n"
143 "To define more than one column as primary key, the option can be given more than "
144 "once. Note that the combination of these columns must be unique.\n"
145 "\n"
146 "All columns are created as NOT NULL as default and the table is created as "
147 "InnoDB (default).\n"
148 "\n"
149 "Usually, the INSERT query would fail if the PRIMARY key exists already. "
150 "This can be avoided using the 'ON DUPLICATE KEY' directive. With the "
151 "--duplicate,you can specify what should be updated in case of a duplicate key. "
152 "To keep the row untouched, you can just update the primary key "
153 "with the identical primary key, e.g. --duplicate='MyPrimary=VALUES(MyPrimary)'. "
154 "For more details, see the MySQL manual.\n"
155 "\n"
156 "For debugging purpose, or to just create or drop a table, the final insert "
157 "query can be skipped using --no-insert. Note that for performance reason, "
158 "all data is collected in memory and a single INSERT query is issued at the "
159 "end.\n"
160 "\n"
161 "Using a higher verbosity level (-v), an overview of the written columns or all "
162 "processed leaves is printed depending on the verbosity level. The output looks "
163 "like the following\n"
164 " Leaf name [root data type] (SQL name)\n"
165 "for example\n"
166 " MTime.fTime.fMilliSec [Long64_t] (MilliSec)\n"
167 "which means that the leaf MTime.fTime.fMilliSec is detected to be a Long64_t "
168 "which is filled into a column called MilliSec. Leaves with non basic data types "
169 "are ignored automatically and are marked as (-n/a-). User ignored columnd "
170 "are marked as (-ignored-).\n"
171 "\n"
172 "If a query failed, the query is printed to stderr together with the error message. "
173 "For the main INSERT query, this is only true if the verbosity level is at least 2 "
174 "or the query has less than 80*25 bytes.\n"
175 "\n"
176 "In case of succes, 0 is returned, a value>0 otherwise.\n"
177 "\n"
178 "Usage: root2sql [options] rootfile.root\n"
179 "\n"
180 ;
181 cout << endl;
182}
183
184enum BasicType_t
185{
186 kNone = 0,
187 kFloat,
188 kDouble,
189 kInt16,
190 kUInt16,
191 kInt32,
192 kUInt32,
193 kInt64,
194 kUInt64,
195};
196
197static const map<string, pair<BasicType_t, string>> ConvRoot =
198{
199 { "Float_t", { kFloat, "FLOAT" } },
200 { "Double_t", { kDouble, "DOUBLE" } },
201 { "ULong64_t", { kUInt64, "BIGINT UNSIGNED" } },
202 { "Long64_t", { kInt64, "BIGINT" } },
203 { "UInt_t", { kUInt32, "INT UNSIGNED" } },
204 { "Int_t", { kInt32, "INT" } },
205 { "UShort_t", { kUInt16, "SMALLINT UNSIGNED" } },
206 { "Short_t", { kInt16, "SMALLINT" } },
207};
208
209struct Container
210{
211 string branch; // branch name
212 string column; // column name
213 BasicType_t type;
214 void *ptr;
215
216 Container(const string &b, const string &c, const BasicType_t &t) : branch(b), column(c), type(t), ptr(0)
217 {
218 switch (t)
219 {
220 case kFloat: ptr = new Float_t; break;
221 case kDouble: ptr = new Double_t; break;
222 case kInt16: ptr = new Short_t; break;
223 case kUInt16: ptr = new UShort_t; break;
224 case kInt32: ptr = new Int_t; break;
225 case kUInt32: ptr = new UInt_t; break;
226 case kInt64: ptr = new Long64_t; break;
227 case kUInt64: ptr = new ULong64_t; break;
228 case kNone:
229 break;
230 }
231 }
232 ~Container()
233 {
234 //::operator delete(ptr); // It seems root is deleting it already
235 }
236
237 string fmt() const
238 {
239 ostringstream str;
240
241 switch (type)
242 {
243 case kFloat: str << setprecision(8) << *reinterpret_cast<Float_t*>(ptr); break;
244 case kDouble: str << setprecision(16) << *reinterpret_cast<Double_t*>(ptr); break;
245 case kInt16: str << *reinterpret_cast<Short_t*>(ptr); break;
246 case kUInt16: str << *reinterpret_cast<UShort_t*>(ptr); break;
247 case kInt32: str << *reinterpret_cast<Int_t*>(ptr); break;
248 case kUInt32: str << *reinterpret_cast<UInt_t*>(ptr); break;
249 case kInt64: str << *reinterpret_cast<Long64_t*>(ptr); break;
250 case kUInt64: str << *reinterpret_cast<ULong64_t*>(ptr); break;
251 case kNone:
252 break;
253 }
254
255 //if (str.str()=="nan" || str.str()=="-nan" || str.str()=="inf" || str.str()=="-inf")
256 // return "NULL";
257
258 return str.str();
259 }
260};
261
262void ErrorHandlerAll(Int_t level, Bool_t abort, const char *location, const char *msg)
263{
264 if (string(msg).substr(0,24)=="no dictionary for class ")
265 return;
266
267 DefaultErrorHandler(level, abort, location, msg);
268}
269
270int main(int argc, const char* argv[])
271{
272 Time start;
273
274 gROOT->SetBatch();
275 SetErrorHandler(ErrorHandlerAll);
276
277 Configuration conf(argv[0]);
278 conf.SetPrintUsage(PrintUsage);
279 SetupConfiguration(conf);
280
281 if (!conf.DoParse(argc, argv))
282 return 127;
283
284 // ----------------------------- Evaluate options --------------------------
285 const string uri = conf.Get<string>("uri");
286 const string file = conf.Get<string>("file");
287 const string tree = conf.Get<string>("tree");
288 const string table = conf.Get<string>("table").empty() ? tree : conf.Get<string>("table");
289
290 const uint16_t verbose = conf.Get<uint16_t>("verbose");
291 const int64_t first = conf.Get<int64_t>("first");
292 const int64_t max = conf.Get<int64_t>("max");
293
294 const bool force = conf.Get<bool>("force");
295 const bool drop = conf.Get<bool>("drop");
296 const bool create = conf.Get<bool>("create") || drop;
297 const bool noinsert = conf.Get<bool>("no-insert");
298 const bool dry_run = conf.Get<bool>("dry-run");
299
300 const string engine = conf.Get<string>("engine");
301 const string duplicate = conf.Get<string>("duplicate");
302
303 const bool print_branches = conf.Get<bool>("print-branches");
304 const bool print_leaves = conf.Get<bool>("print-leaves");
305 const bool print_create = conf.Get<bool>("print-create");
306 const bool print_insert = conf.Get<bool>("print-insert");
307
308 const vector<Map> mymap = conf.Vec<Map>("map");
309 const vector<Map> sqltypes = conf.Vec<Map>("sql-type");
310 const vector<string> _ignore = conf.Vec<string>("ignore");
311 const vector<string> primary = conf.Vec<string>("primary");
312
313 // -------------------------------------------------------------------------
314
315 if (verbose>0)
316 cout << "\n-------------------------- Evaluating file -------------------------" << endl;
317
318 TFile f(file.c_str());
319 if (f.IsZombie())
320 {
321 cerr << "Could not open file " << file << endl;
322 return 1;
323 }
324
325 if (verbose>0)
326 cout << "File: " << file << endl;
327
328 TTree *T = 0;
329 f.GetObject(tree.c_str(), T);
330 if (!T)
331 {
332 cerr << "Could not open tree " << tree << endl;
333 return 2;
334 }
335
336 if (verbose>0)
337 cout << "Tree: " << tree << endl;
338
339 T->SetMakeClass(1);
340
341 TObjArray *branches = T->GetListOfBranches();
342 TObjArray *leaves = T->GetListOfLeaves();
343
344 if (print_branches)
345 {
346 cout << '\n';
347 branches->Print();
348 }
349
350 if (verbose>0)
351 cout << T->GetEntriesFast() << " events found." << endl;
352
353
354 if (verbose>0)
355 cout << branches->GetEntries() << " branches found." << endl;
356
357 if (print_leaves)
358 {
359 cout << '\n';
360 leaves->Print();
361 }
362 if (verbose>0)
363 cout << leaves->GetEntries() << " leaves found." << endl;
364
365 string query =
366 "CREATE TABLE IF NOT EXISTS `"+table+"`\n"
367 "(\n";
368
369 vector<Container> vec;
370
371
372 TIter Next(leaves);
373 TObject *o = 0;
374 while ((o=Next()))
375 {
376 TLeaf *L = T->GetLeaf(o->GetName());
377
378 if (verbose>2)
379 cout << '\n' << o->GetName() << " [" << L->GetTypeName() << "]";
380
381 string name = o->GetName();
382
383
384 bool found = false;
385 for (auto b=_ignore.cbegin(); b!=_ignore.cend(); b++)
386 {
387 if (boost::regex_match(name, boost::regex(*b)))
388 {
389 found = true;
390 if (verbose>2)
391 cout << " (-ignored-)";
392 break;
393 }
394 }
395 if (found)
396 continue;
397
398 const string tn = L->GetTypeName();
399
400 auto it = ConvRoot.find(tn);
401 if (it==ConvRoot.end())
402 {
403 if (verbose>2)
404 cout << " (-n/a-)";
405 continue;
406 }
407
408 if (verbose==2)
409 cout << '\n' << o->GetName() << " [" << L->GetTypeName() << "]";
410
411 for (auto m=mymap.cbegin(); m!=mymap.cend(); m++)
412 name = boost::regex_replace(name, boost::regex(m->first), m->second);
413
414 if (verbose>1)
415 cout << " (" << name << ")";
416
417 string sqltype = it->second.second;
418
419 for (auto m=sqltypes.cbegin(); m!=sqltypes.cend(); m++)
420 if (m->first==name)
421 sqltype = m->second;
422
423 if (!vec.empty())
424 query += ",\n";
425 query += " `"+name+"` "+sqltype+" NOT NULL COMMENT '"+o->GetName()+"'";
426
427 vec.emplace_back(o->GetName(), name, it->second.first);
428 T->SetBranchAddress(o->GetName(), vec.back().ptr);
429 }
430
431 if (verbose>1)
432 cout << "\n\n";
433 if (verbose>0)
434 cout << vec.size() << " leaves setup for reading." << endl;
435
436 UInt_t datatype = 0;
437 const bool has_datatype = T->SetBranchAddress("DataType.fVal", &datatype) >= 0;
438
439 // Setiing up branch status (must be after all SetBranchAddress)
440 T->SetBranchStatus("*", 0);
441 for (auto c=vec.cbegin(); c!=vec.cend(); c++)
442 T->SetBranchStatus(c->branch.c_str(), 1);
443
444 if (has_datatype)
445 {
446 T->SetBranchStatus("DataType.fVal", 1);
447 if (verbose>0)
448 cout << "Rows with DataType.fVal!=1 will be skipped." << endl;
449 }
450
451 // -------------------------------------------------------------------------
452 // Checking for database connection
453
454 try
455 {
456 if (!force)
457 Database(uri).connected();
458 }
459 catch (const exception &e)
460 {
461 cerr << "SQL connection failed: " << e.what() << endl;
462 return 3;
463 }
464
465 // -------------------------------------------------------------------------
466
467 if (verbose>0)
468 cout << "\n--------------------------- Database Table -------------------------" << endl;
469
470 if (!primary.empty())
471 query += ",\n PRIMARY KEY USING BTREE (`"+boost::algorithm::join(primary, "`, `")+"`)\n";
472
473 query +=
474 ")\n"
475 "DEFAULT CHARSET=latin1 COLLATE=latin1_general_ci\n"
476 "ENGINE="+engine+"\n"
477 "COMMENT='created by "+conf.GetName()+"'\n";
478
479 // FIXME: Can we omit the catching to be able to print the
480 // query 'autmatically'?
481 try
482 {
483 if (drop)
484 {
485 // => Simple result
486 if (!dry_run)
487 Database(uri).query("DROP TABLE `"+table+"`").execute();
488 if (verbose>0)
489 cout << "Table `" << table << "` dropped." << endl;
490 }
491 }
492 catch (const exception &e)
493 {
494 cerr << "DROP TABLE `" << table << "`\n\n";
495 cerr << "SQL query failed:\n" << e.what() << endl;
496 return 4;
497 }
498
499 try
500 {
501 if (create && !dry_run)
502 Database(uri).query(query).execute();
503 }
504 catch (const exception &e)
505 {
506 cerr << query << "\n\n";
507 cerr << "SQL query failed:\n" << e.what() << endl;
508 return 5;
509 }
510
511 if (print_create)
512 cout << query << endl;
513
514 if (create && verbose>0)
515 cout << "Table `" << table << "` created." << endl;
516
517 // -------------------------------------------------------------------------
518
519 if (verbose>0)
520 cout << "\n---------------------------- Reading file --------------------------" << endl;
521
522 //query = update ? "UPDATE" : "INSERT";
523 query = "INSERT `"+table+"`\n"
524 "(\n";
525
526 for (auto c=vec.cbegin(); c!=vec.cend(); c++)
527 {
528 if (c!=vec.cbegin())
529 query += ",\n";
530 query += " `"+c->column+"`";
531 }
532
533 query +=
534 "\n)\n"
535 "VALUES\n";
536
537 size_t count = 0;
538
539 const size_t num = max>0 && (max-first)<T->GetEntriesFast() ? (max-first) : T->GetEntriesFast();
540 for (size_t j=first; j<num; j++)
541 {
542 T->GetEntry(j);
543 if (has_datatype && datatype!=1)
544 continue;
545
546 if (count>0)
547 query += ",\n";
548
549 query += "(\n";
550
551 for (auto c=vec.cbegin(); c!=vec.cend(); c++)
552 {
553 if (c!=vec.cbegin())
554 query += ",\n";
555
556 query += " "+c->fmt();
557
558 if (print_insert)
559 query += " /* "+c->column+" -> "+c->branch+" */";
560 }
561 query += "\n)";
562
563 count ++;
564 }
565
566 if (!duplicate.empty())
567 query += "\nON DUPLICATE KEY UPDATE\n " + duplicate;
568
569 if (verbose>0)
570 cout << count << " out of " << num << " row(s) read from file [N=" << first << ".." << num-1 << "]." << endl;
571
572 if (count==0)
573 {
574 if (verbose>0)
575 cout << "Total execution time: " << Time().UnixTime()-start.UnixTime() << "s\n" << endl;
576 return 0;
577 }
578
579 // -------------------------------------------------------------------------
580
581 if (verbose>0)
582 {
583 cout << "\n--------------------------- Inserting data -------------------------" << endl;
584 cout << "Sending INSERT query (" << query.length() << " bytes)" << endl;
585 }
586
587 try
588 {
589 if (!noinsert && !dry_run)
590 // => Simple result
591 Database(uri).query(query).execute();
592 else
593 cout << "Insert query skipped!" << endl;
594
595 if (print_insert)
596 cout << query << endl;
597 }
598 catch (const exception &e)
599 {
600 if (verbose>1 || query.length()<80*25)
601 cerr << query << "\n\n";
602 cerr << "SQL query failed (" << query.length() << " bytes):\n" << e.what() << endl;
603 return 6;
604 }
605
606 if (verbose>0)
607 {
608 cout << count << " row(s) inserted.\n\n";
609 cout << "Total execution time: " << Time().UnixTime()-start.UnixTime() << "s\n" << endl;
610/*
611 try
612 {
613 const auto resw =
614 connection.query("SHOW WARNINGS").store();
615
616 if (resw.num_rows()>0)
617 cout << "\nWARNINGS:\n\n";
618
619 for (size_t i=0; i<resw.num_rows(); i++)
620 {
621 const mysqlpp::Row &roww = resw[i];
622
623 cout << roww["Level"] << '[' << roww["Code"] << "]: ";
624 cout << roww["Message"] << '\n' << endl;
625 }
626
627 }
628 catch (const exception &e)
629 {
630 cerr << "\nSHOW WARNINGS\n\n";
631 cerr << "SQL query failed:\n" << e.what() << endl;
632 return 6;
633 }*/
634 }
635
636
637 return 0;
638}
Note: See TracBrowser for help on using the repository browser.