source: trunk/FACT++/src/root2sql.cc@ 19101

Last change on this file since 19101 was 19101, checked in by tbretz, 6 years ago
Allow an 'ON DUPLICATE KEY UPDATE' clause
File size: 21.6 KB
Line 
1#include <boost/algorithm/string/join.hpp>
2#include <boost/regex.hpp>
3
4#include "Database.h"
5
6#include "Time.h"
7#include "Configuration.h"
8
9#include <TROOT.h>
10#include <TFile.h>
11#include <TTree.h>
12#include <TLeaf.h>
13#include <TError.h>
14
15using namespace std;
16
17// ------------------------------------------------------------------------
18
19struct Map : pair<string, string>
20{
21 Map() { }
22};
23
24std::istream &operator>>(std::istream &in, Map &m)
25{
26 const istreambuf_iterator<char> eos;
27 string txt(istreambuf_iterator<char>(in), eos);
28
29 const boost::regex expr("((.*)[^\\\\])/(.*)");
30 boost::smatch match;
31 if (!boost::regex_match(txt, match, expr))
32 throw runtime_error("Could not evaluate map argument: "+txt);
33
34 m.first = match[1].str();
35 m.second = match[3].str();
36
37 return in;
38}
39
40
41void SetupConfiguration(Configuration &conf)
42{
43 po::options_description control("Root to SQL");
44 control.add_options()
45 ("uri,u", var<string>()
46#if BOOST_VERSION >= 104200
47 ->required()
48#endif
49 , "Database link as in\n\tuser:password@server[:port]/database.")
50 ("file", var<string>(""), "The root file to read from")
51 ("force", po_switch(), "Force processing even if there is no database connection")
52 ("create", po_switch(), "Create the database if not existing")
53 ("drop", po_switch(), "Drop the table (implies create)")
54 ("tree,t", var<string>("Events"), "Name of the root tree to convert")
55 ("table", var<string>(""), "Name of the table to use (default is the tree name)")
56 ("map", vars<Map>(), "A regular expression which is applied to the leaf name befoee it is used as SQL column name)")
57 ("sql-type", vars<Map>(), "Allows to overwrite the calculated SQL type for a given column e.g. 'sql-column-name/UNSIGNED IN'")
58 ("ignore", vars<string>(), "Ignore the given leaf, if the given regular expression matches")
59 ("primary", vars<string>(), "List of columns to be used as primary keys during table creation (in connection with --create)")
60 ("first", var<int64_t>(int64_t(0)), "First event to start with (default: 0), mainly for test purpose")
61 ("max", var<int64_t>(int64_t(0)), "Maximum number of events to process (0: all), mainly for test purpose")
62 ("engine", var<string>("InnoDB"), "Database engine to be used when a new table is created")
63 ("duplicate", var<string>(""), "Specifies an assignment_list for an 'ON DUPLICATE KEY UPDATE' expression")
64 ;
65
66 po::options_description debug("Debug options");
67 debug.add_options()
68 ("no-insert", po_switch(), "Does not insert any data into the table")
69 ("dry-run", po_switch(), "Skip any query which changes the databse (might result in consecutive failures)")
70 ("print-branches", po_switch(), "Print the branches found in the tree")
71 ("print-leaves", po_switch(), "Print the leaves found in the tree (this is what is processed)")
72 ("print-insert", po_switch(), "Print the INSERT query (note that it contains all data)")
73 ("print-create", po_switch(), "Print the CREATE query")
74 ("verbose,v", var<uint16_t>(1), "Verbosity (0: quiet, 1: default, 2: more, 3, ...)")
75 ;
76
77 po::positional_options_description p;
78 p.add("file", 1); // The 1st positional options (n=1)
79
80 conf.AddOptions(control);
81 conf.AddOptions(debug);
82 conf.SetArgumentPositions(p);
83}
84
85void PrintUsage()
86{
87 cout <<
88 "root2sql - Fills the data from a root file into a database\n"
89 "\n"
90 "For convenience, this documentation uses the extended version of the options, "
91 "refer to the output below to get the abbreviations.\n"
92 "\n"
93 "This is a general purpose tool to fill the contents of a root file into a database "
94 "as long as this is technically possible and makes sense. Note that root can even "
95 "write complex data like a TH1F into a database, this is not the purpose of this "
96 "program.\n"
97 "\n"
98 "Each root tree has branches and leaves (the basic data types). These leaves can "
99 "be read independently of the classes which were used to write the root file. "
100 "The default tree to read from is 'Events' but the name can be overwritten "
101 "using --tree. The default table name to fill the data into is identical to "
102 "the tree name. It can be overwritten using --table.\n"
103 "\n"
104 "The name of each column to which data is filled from a leave is obtained from "
105 "the leaves' names. The leave names can be checked using --print-leaves. "
106 "A --print-branches exists for convenience to print only the high-level branches. "
107 "Sometimes these names might be quite unconvenient like MTime.fTime.fMilliSec or "
108 "just MHillas.fWidth. To allow to simplify column names, regular expressions "
109 "(using boost's regex) can be defined to change the names. Note that these regular "
110 "expressions are applied one by one on each leaf's name. A valid expression could "
111 "be:\n"
112 " --map=MHillas\\.f/\n"
113 "which would remove all occurances of 'MHillas.f'. This option can be used more than "
114 "once. They are applied in sequence. A single match does not stop the sequence.\n"
115 "\n"
116 "Sometimes it might also be convenient to skip a leaf. This can be done with "
117 "the --ignore resource. If the given regular expresion yields a match, the "
118 "leaf will be ignored. Note that the regular expression works on the raw-name "
119 "of the leaf not the readily mapped SQL column names. Example:\n"
120 " --ignore=ThetaSq\\..*\n"
121 "will skip all leaved which start with 'ThetaSq.'. This option can be used"
122 "more than once.\n"
123 "\n"
124 "The data type of each column is kept as close as possible to the leaves' data "
125 "types. If for some reason this is not wanted, the data type of the SQL column "
126 "can be overwritten with --sql-type sql-column/sql-ytpe, for example:\n"
127 " --sql-type=FileId/UNSIGNED INT\n"
128 "while the first argument of the name of the SQL column to which the data type "
129 "should be applied. The second column is the basic SQL data type. The option can "
130 "be given more than once.\n"
131 "\n"
132 "Database interaction:\n"
133 "\n"
134 "To drop an existing table, --drop can be used.\n"
135 "\n"
136 "To create a table according to theSQL column names and data types, --create "
137 "can be used. The query used can be printed with --print-create even --create "
138 "has not been specified.\n"
139 "\n"
140 "To choose the columns which should become primary keys, use --primary, "
141 "for exmaple:\n"
142 " --primary=col1\n"
143 "To define more than one column as primary key, the option can be given more than "
144 "once. Note that the combination of these columns must be unique.\n"
145 "\n"
146 "All columns are created as NOT NULL as default and the table is created as "
147 "InnoDB (default).\n"
148 "\n"
149 "Usually, the INSERT query would fail if the PRIMARY key exists already. "
150 "This can be avoided using the 'ON DUPLICATE KEY' directive. With the "
151 "--duplicate,you can specify what should be updated in case of a duplicate key. "
152 "For detaily, see the MySQL manual.\n"
153 "\n"
154 "For debugging purpose, or to just create or drop a table, the final insert "
155 "query can be skipped using --no-insert. Note that for performance reason, "
156 "all data is collected in memory and a single INSERT query is issued at the "
157 "end.\n"
158 "\n"
159 "Using a higher verbosity level (-v), an overview of the written columns or all "
160 "processed leaves is printed depending on the verbosity level. The output looks "
161 "like the following\n"
162 " Leaf name [root data type] (SQL name)\n"
163 "for example\n"
164 " MTime.fTime.fMilliSec [Long64_t] (MilliSec)\n"
165 "which means that the leaf MTime.fTime.fMilliSec is detected to be a Long64_t "
166 "which is filled into a column called MilliSec. Leaves with non basic data types "
167 "are ignored automatically and are marked as (-n/a-). User ignored columnd "
168 "are marked as (-ignored-).\n"
169 "\n"
170 "If a query failed, the query is printed to stderr together with the error message. "
171 "For the main INSERT query, this is only true if the verbosity level is at least 2 "
172 "or the query has less than 80*25 bytes.\n"
173 "\n"
174 "In case of succes, 0 is returned, a value>0 otherwise.\n"
175 "\n"
176 "Usage: root2sql [options] rootfile.root\n"
177 "\n"
178 ;
179 cout << endl;
180}
181
182enum BasicType_t
183{
184 kNone = 0,
185 kFloat,
186 kDouble,
187 kInt16,
188 kUInt16,
189 kInt32,
190 kUInt32,
191 kInt64,
192 kUInt64,
193};
194
195static const map<string, pair<BasicType_t, string>> ConvRoot =
196{
197 { "Float_t", { kFloat, "FLOAT" } },
198 { "Double_t", { kDouble, "DOUBLE" } },
199 { "ULong64_t", { kUInt64, "BIGINT UNSIGNED" } },
200 { "Long64_t", { kInt64, "BIGINT" } },
201 { "UInt_t", { kUInt32, "INT UNSIGNED" } },
202 { "Int_t", { kInt32, "INT" } },
203 { "UShort_t", { kUInt16, "SMALLINT UNSIGNED" } },
204 { "Short_t", { kInt16, "SMALLINT" } },
205};
206
207struct Container
208{
209 string branch; // branch name
210 string column; // column name
211 BasicType_t type;
212 void *ptr;
213
214 Container(const string &b, const string &c, const BasicType_t &t) : branch(b), column(c), type(t), ptr(0)
215 {
216 switch (t)
217 {
218 case kFloat: ptr = new Float_t; break;
219 case kDouble: ptr = new Double_t; break;
220 case kInt16: ptr = new Short_t; break;
221 case kUInt16: ptr = new UShort_t; break;
222 case kInt32: ptr = new Int_t; break;
223 case kUInt32: ptr = new UInt_t; break;
224 case kInt64: ptr = new Long64_t; break;
225 case kUInt64: ptr = new ULong64_t; break;
226 case kNone:
227 break;
228 }
229 }
230 ~Container()
231 {
232 //::operator delete(ptr); // It seems root is deleting it already
233 }
234
235 string fmt() const
236 {
237 ostringstream str;
238
239 switch (type)
240 {
241 case kFloat: str << setprecision(8) << *reinterpret_cast<Float_t*>(ptr); break;
242 case kDouble: str << setprecision(16) << *reinterpret_cast<Double_t*>(ptr); break;
243 case kInt16: str << *reinterpret_cast<Short_t*>(ptr); break;
244 case kUInt16: str << *reinterpret_cast<UShort_t*>(ptr); break;
245 case kInt32: str << *reinterpret_cast<Int_t*>(ptr); break;
246 case kUInt32: str << *reinterpret_cast<UInt_t*>(ptr); break;
247 case kInt64: str << *reinterpret_cast<Long64_t*>(ptr); break;
248 case kUInt64: str << *reinterpret_cast<ULong64_t*>(ptr); break;
249 case kNone:
250 break;
251 }
252
253 //if (str.str()=="nan" || str.str()=="-nan" || str.str()=="inf" || str.str()=="-inf")
254 // return "NULL";
255
256 return str.str();
257 }
258};
259
260void ErrorHandlerAll(Int_t level, Bool_t abort, const char *location, const char *msg)
261{
262 if (string(msg).substr(0,24)=="no dictionary for class ")
263 return;
264
265 DefaultErrorHandler(level, abort, location, msg);
266}
267
268int main(int argc, const char* argv[])
269{
270 Time start;
271
272 gROOT->SetBatch();
273 SetErrorHandler(ErrorHandlerAll);
274
275 Configuration conf(argv[0]);
276 conf.SetPrintUsage(PrintUsage);
277 SetupConfiguration(conf);
278
279 if (!conf.DoParse(argc, argv))
280 return 127;
281
282 // ----------------------------- Evaluate options --------------------------
283 const string uri = conf.Get<string>("uri");
284 const string file = conf.Get<string>("file");
285 const string tree = conf.Get<string>("tree");
286 const string table = conf.Get<string>("table").empty() ? tree : conf.Get<string>("table");
287
288 const uint16_t verbose = conf.Get<uint16_t>("verbose");
289 const int64_t first = conf.Get<int64_t>("first");
290 const int64_t max = conf.Get<int64_t>("max");
291
292 const bool force = conf.Get<bool>("force");
293 const bool drop = conf.Get<bool>("drop");
294 const bool create = conf.Get<bool>("create") || drop;
295 const bool noinsert = conf.Get<bool>("no-insert");
296 const bool dry_run = conf.Get<bool>("dry-run");
297
298 const string engine = conf.Get<string>("engine");
299 const string duplicate = conf.Get<string>("duplicate");
300
301 const bool print_branches = conf.Get<bool>("print-branches");
302 const bool print_leaves = conf.Get<bool>("print-leaves");
303 const bool print_create = conf.Get<bool>("print-create");
304 const bool print_insert = conf.Get<bool>("print-insert");
305
306 const vector<Map> mymap = conf.Vec<Map>("map");
307 const vector<Map> sqltypes = conf.Vec<Map>("sql-type");
308 const vector<string> _ignore = conf.Vec<string>("ignore");
309 const vector<string> primary = conf.Vec<string>("primary");
310
311 // -------------------------------------------------------------------------
312
313 if (verbose>0)
314 cout << "\n-------------------------- Evaluating file -------------------------" << endl;
315
316 TFile f(file.c_str());
317 if (f.IsZombie())
318 {
319 cerr << "Could not open file " << file << endl;
320 return 1;
321 }
322
323 if (verbose>0)
324 cout << "File: " << file << endl;
325
326 TTree *T = 0;
327 f.GetObject(tree.c_str(), T);
328 if (!T)
329 {
330 cerr << "Could not open tree " << tree << endl;
331 return 2;
332 }
333
334 if (verbose>0)
335 cout << "Tree: " << tree << endl;
336
337 T->SetMakeClass(1);
338
339 TObjArray *branches = T->GetListOfBranches();
340 TObjArray *leaves = T->GetListOfLeaves();
341
342 if (print_branches)
343 {
344 cout << '\n';
345 branches->Print();
346 }
347
348 if (verbose>0)
349 cout << T->GetEntriesFast() << " events found." << endl;
350
351
352 if (verbose>0)
353 cout << branches->GetEntries() << " branches found." << endl;
354
355 if (print_leaves)
356 {
357 cout << '\n';
358 leaves->Print();
359 }
360 if (verbose>0)
361 cout << leaves->GetEntries() << " leaves found." << endl;
362
363 string query =
364 "CREATE TABLE IF NOT EXISTS `"+table+"`\n"
365 "(\n";
366
367 vector<Container> vec;
368
369
370 TIter Next(leaves);
371 TObject *o = 0;
372 while ((o=Next()))
373 {
374 TLeaf *L = T->GetLeaf(o->GetName());
375
376 if (verbose>2)
377 cout << '\n' << o->GetName() << " [" << L->GetTypeName() << "]";
378
379 string name = o->GetName();
380
381
382 bool found = false;
383 for (auto b=_ignore.cbegin(); b!=_ignore.cend(); b++)
384 {
385 if (boost::regex_match(name, boost::regex(*b)))
386 {
387 found = true;
388 if (verbose>2)
389 cout << " (-ignored-)";
390 break;
391 }
392 }
393 if (found)
394 continue;
395
396 const string tn = L->GetTypeName();
397
398 auto it = ConvRoot.find(tn);
399 if (it==ConvRoot.end())
400 {
401 if (verbose>2)
402 cout << " (-n/a-)";
403 continue;
404 }
405
406 if (verbose==2)
407 cout << '\n' << o->GetName() << " [" << L->GetTypeName() << "]";
408
409 for (auto m=mymap.cbegin(); m!=mymap.cend(); m++)
410 name = boost::regex_replace(name, boost::regex(m->first), m->second);
411
412 if (verbose>1)
413 cout << " (" << name << ")";
414
415 string sqltype = it->second.second;
416
417 for (auto m=sqltypes.cbegin(); m!=sqltypes.cend(); m++)
418 if (m->first==name)
419 sqltype = m->second;
420
421 if (!vec.empty())
422 query += ",\n";
423 query += " `"+name+"` "+sqltype+" NOT NULL COMMENT '"+o->GetName()+"'";
424
425 vec.emplace_back(o->GetName(), name, it->second.first);
426 T->SetBranchAddress(o->GetName(), vec.back().ptr);
427 }
428
429 if (verbose>1)
430 cout << "\n\n";
431 if (verbose>0)
432 cout << vec.size() << " leaves setup for reading." << endl;
433
434 UInt_t datatype = 0;
435 const bool has_datatype = T->SetBranchAddress("DataType.fVal", &datatype) >= 0;
436
437 // Setiing up branch status (must be after all SetBranchAddress)
438 T->SetBranchStatus("*", 0);
439 for (auto c=vec.cbegin(); c!=vec.cend(); c++)
440 T->SetBranchStatus(c->branch.c_str(), 1);
441
442 if (has_datatype)
443 {
444 T->SetBranchStatus("DataType.fVal", 1);
445 if (verbose>0)
446 cout << "Rows with DataType.fVal!=1 will be skipped." << endl;
447 }
448
449 // -------------------------------------------------------------------------
450 // Checking for database connection
451
452 try
453 {
454 if (!force)
455 Database(uri).connected();
456 }
457 catch (const exception &e)
458 {
459 cerr << "SQL connection failed: " << e.what() << endl;
460 return 3;
461 }
462
463 // -------------------------------------------------------------------------
464
465 if (verbose>0)
466 cout << "\n--------------------------- Database Table -------------------------" << endl;
467
468 if (!primary.empty())
469 query += ",\n PRIMARY KEY USING BTREE (`"+boost::algorithm::join(primary, "`, `")+"`)\n";
470
471 query +=
472 ")\n"
473 "DEFAULT CHARSET=latin1 COLLATE=latin1_general_ci\n"
474 "ENGINE="+engine+"\n"
475 "COMMENT='created by "+conf.GetName()+"'\n";
476
477 // FIXME: Can we omit the catching to be able to print the
478 // query 'autmatically'?
479 try
480 {
481 if (drop)
482 {
483 // => Simple result
484 if (!dry_run)
485 Database(uri).query("DROP TABLE `"+table+"`").execute();
486 if (verbose>0)
487 cout << "Table `" << table << "` dropped." << endl;
488 }
489 }
490 catch (const exception &e)
491 {
492 cerr << "DROP TABLE `" << table << "`\n\n";
493 cerr << "SQL query failed:\n" << e.what() << endl;
494 return 4;
495 }
496
497 try
498 {
499 if (create && !dry_run)
500 Database(uri).query(query).execute();
501 }
502 catch (const exception &e)
503 {
504 cerr << query << "\n\n";
505 cerr << "SQL query failed:\n" << e.what() << endl;
506 return 5;
507 }
508
509 if (print_create)
510 cout << query << endl;
511
512 if (create && verbose>0)
513 cout << "Table `" << table << "` created." << endl;
514
515 // -------------------------------------------------------------------------
516
517 if (verbose>0)
518 cout << "\n---------------------------- Reading file --------------------------" << endl;
519
520 //query = update ? "UPDATE" : "INSERT";
521 query = "INSERT `"+table+"`\n"
522 "(\n";
523
524 for (auto c=vec.cbegin(); c!=vec.cend(); c++)
525 {
526 if (c!=vec.cbegin())
527 query += ",\n";
528 query += " `"+c->column+"`";
529 }
530
531 query +=
532 "\n)\n"
533 "VALUES\n";
534
535 size_t count = 0;
536
537 const size_t num = max>0 && (max-first)<T->GetEntriesFast() ? (max-first) : T->GetEntriesFast();
538 for (size_t j=first; j<num; j++)
539 {
540 T->GetEntry(j);
541 if (has_datatype && datatype!=1)
542 continue;
543
544 if (count>0)
545 query += ",\n";
546
547 query += "(\n";
548
549 for (auto c=vec.cbegin(); c!=vec.cend(); c++)
550 {
551 if (c!=vec.cbegin())
552 query += ",\n";
553
554 query += " "+c->fmt();
555
556 if (print_insert)
557 query += " /* "+c->column+" -> "+c->branch+" */";
558 }
559 query += "\n)";
560
561 count ++;
562 }
563
564 if (!duplicate.empty())
565 query += "\nON DUPLICATE KEY UPDATE\n " + duplicate;
566
567 if (verbose>0)
568 cout << count << " out of " << num << " row(s) read from file [N=" << first << ".." << num-1 << "]." << endl;
569
570 if (count==0)
571 {
572 if (verbose>0)
573 cout << "Total execution time: " << Time().UnixTime()-start.UnixTime() << "s\n" << endl;
574 return 0;
575 }
576
577 // -------------------------------------------------------------------------
578
579 if (verbose>0)
580 {
581 cout << "\n--------------------------- Inserting data -------------------------" << endl;
582 cout << "Sending INSERT query (" << query.length() << " bytes)" << endl;
583 }
584
585 try
586 {
587 if (!noinsert && !dry_run)
588 // => Simple result
589 Database(uri).query(query).execute();
590 else
591 cout << "Insert query skipped!" << endl;
592
593 if (print_insert)
594 cout << query << endl;
595 }
596 catch (const exception &e)
597 {
598 if (verbose>1 || query.length()<80*25)
599 cerr << query << "\n\n";
600 cerr << "SQL query failed (" << query.length() << " bytes):\n" << e.what() << endl;
601 return 6;
602 }
603
604 if (verbose>0)
605 {
606 cout << count << " row(s) inserted.\n\n";
607 cout << "Total execution time: " << Time().UnixTime()-start.UnixTime() << "s\n" << endl;
608/*
609 try
610 {
611 const auto resw =
612 connection.query("SHOW WARNINGS").store();
613
614 if (resw.num_rows()>0)
615 cout << "\nWARNINGS:\n\n";
616
617 for (size_t i=0; i<resw.num_rows(); i++)
618 {
619 const mysqlpp::Row &roww = resw[i];
620
621 cout << roww["Level"] << '[' << roww["Code"] << "]: ";
622 cout << roww["Message"] << '\n' << endl;
623 }
624
625 }
626 catch (const exception &e)
627 {
628 cerr << "\nSHOW WARNINGS\n\n";
629 cerr << "SQL query failed:\n" << e.what() << endl;
630 return 6;
631 }*/
632 }
633
634
635 return 0;
636}
Note: See TracBrowser for help on using the repository browser.