source: trunk/FACT++/src/root2sql.cc@ 19084

Last change on this file since 19084 was 19084, checked in by tbretz, 6 years ago
Do not print overly long insert queries if not allowed by the user explicitly.
File size: 20.4 KB
Line 
1#include <boost/algorithm/string/join.hpp>
2#include <boost/regex.hpp>
3
4#include "Database.h"
5
6#include "Time.h"
7#include "Configuration.h"
8
9#include <TROOT.h>
10#include <TFile.h>
11#include <TTree.h>
12#include <TLeaf.h>
13#include <TError.h>
14
15using namespace std;
16
17// ------------------------------------------------------------------------
18
19struct Map : pair<string, string>
20{
21 Map() { }
22};
23
24std::istream &operator>>(std::istream &in, Map &m)
25{
26 const istreambuf_iterator<char> eos;
27 string txt(istreambuf_iterator<char>(in), eos);
28
29 const boost::regex expr("((.*)[^\\\\])/(.*)");
30 boost::smatch match;
31 if (!boost::regex_match(txt, match, expr))
32 throw runtime_error("Could not evaluate map argument: "+txt);
33
34 m.first = match[1].str();
35 m.second = match[3].str();
36
37 return in;
38}
39
40
41void SetupConfiguration(Configuration &conf)
42{
43 po::options_description control("Root to SQL");
44 control.add_options()
45 ("uri,u", var<string>()
46#if BOOST_VERSION >= 104200
47 ->required()
48#endif
49 , "Database link as in\n\tuser:password@server[:port]/database.")
50 ("file", var<string>(""), "The root file to read from")
51 ("force", po_switch(), "Force processing even if there is no database connection")
52 ("create", po_switch(), "Create the database if not existing")
53 ("drop", po_switch(), "Drop the table (implies create)")
54 ("tree,t", var<string>("Events"), "Name of the root tree to convert")
55 ("table", var<string>(""), "Name of the table to use (default is the tree name)")
56 ("map", vars<Map>(), "A regular expression which is applied to the leaf name befoee it is used as SQL column name)")
57 ("sql-type", vars<Map>(), "Allows to overwrite the calculated SQL type for a given column e.g. 'sql-column-name/UNSIGNED IN'")
58 ("ignore", vars<string>(), "Ignore the given leaf, if the given regular expression matches")
59 ("primary", vars<string>(), "List of columns to be used as primary keys during table creation (in connection with --create)")
60 ("first", var<int64_t>(int64_t(0)), "First event to start with (default: 0), mainly for test purpose")
61 ("max", var<int64_t>(int64_t(0)), "Maximum number of events to process (0: all), mainly for test purpose")
62 ("engine", var<string>("InnoDB"), "Database engine to be used when a new table is created")
63 ;
64
65 po::options_description debug("Debug options");
66 debug.add_options()
67 ("no-insert", po_switch(), "Does not insert any data into the table")
68 ("dry-run", po_switch(), "Skip any query which changes the databse (might result in consecutive failures)")
69 ("print-branches", po_switch(), "Print the branches found in the tree")
70 ("print-leaves", po_switch(), "Print the leaves found in the tree (this is what is processed)")
71 ("print-insert", po_switch(), "Print the INSERT query (note that it contains all data)")
72 ("print-create", po_switch(), "Print the CREATE query")
73 ("verbose,v", var<uint16_t>(1), "Verbosity (0: quiet, 1: default, 2: more, 3, ...)")
74 ;
75
76 po::positional_options_description p;
77 p.add("file", 1); // The 1st positional options (n=1)
78
79 conf.AddOptions(control);
80 conf.AddOptions(debug);
81 conf.SetArgumentPositions(p);
82}
83
84void PrintUsage()
85{
86 cout <<
87 "root2sql - Fills the data from a root file into a database\n"
88 "\n"
89 "For convenience, this documentation uses the extended version of the options, "
90 "refer to the output below to get the abbreviations.\n"
91 "\n"
92 "This is a general purpose tool to fill the contents of a root file into a database "
93 "as long as this is technically possible and makes sense. Note that root can even "
94 "write complex data like a TH1F into a database, this is not the purpose of this "
95 "program.\n"
96 "\n"
97 "Each root tree has branches and leaves (the basic data types). These leaves can "
98 "be read independently of the classes which were used to write the root file. "
99 "The default tree to read from is 'Events' but the name can be overwritten "
100 "using --tree. The default table name to fill the data into is identical to "
101 "the tree name. It can be overwritten using --table.\n"
102 "\n"
103 "The name of each column to which data is filled from a leave is obtained from "
104 "the leaves' names. The leave names can be checked using --print-leaves. "
105 "A --print-branches exists for convenience to print only the high-level branches. "
106 "Sometimes these names might be quite unconvenient like MTime.fTime.fMilliSec or "
107 "just MHillas.fWidth. To allow to simplify column names, regular expressions "
108 "(using boost's regex) can be defined to change the names. Note that these regular "
109 "expressions are applied one by one on each leaf's name. A valid expression could "
110 "be:\n"
111 " --map=MHillas\\.f/\n"
112 "which would remove all occurances of 'MHillas.f'. This option can be used more than "
113 "once. They are applied in sequence. A single match does not stop the sequence.\n"
114 "\n"
115 "Sometimes it might also be convenient to skip a leaf. This can be done with "
116 "the --ignore resource. If the given regular expresion yield a match, the "
117 "leaf will be ignored. Note that the regular expression work on the raw-name "
118 "of the leaf not the readily mapped SQL column names. Example:\n"
119 " --ignore=ThetaSq\\..*\n"
120 "will skip all leaved which start with 'ThetaSq.'. This option can be used"
121 "more than once.\n"
122 "\n"
123 "The data type of each column is kept as close as possible to the leaves' data "
124 "types. If for some reason this is not wanted, the data type of the SQL column "
125 "can be overwritten with --sql-type sql-column/sql-ytpe, for example:\n"
126 " --sql-type=FileId/UNSIGNED INT\n"
127 "while the first argument of the name of the SQL column to which the data type "
128 "should be applied. The second column is the basic SQL data type. The option can "
129 "be given more than once.\n"
130 "\n"
131 "Database interaction:\n"
132 "\n"
133 "To drop an existing table, --drop can be used.\n"
134 "\n"
135 "To create a table according to theSQL column names and data types, --create "
136 "can be used. The query used can be printed with --print-create even --create "
137 "has not been specified.\n"
138 "\n"
139 "To choose the columns which should become primary keys, use --primary, "
140 "for exmaple:\n"
141 " --primary=col1\n"
142 "To define more than one column as primary key, the option can be given more than "
143 "once. Note that the combination of these columns must be unique.\n"
144 "\n"
145 "All columns are created as NOT NULL as default and the table is created as "
146 "MyISAM (default).\n"
147 "\n"
148 "For debugging purpose, or to just create or drop a table, the final insert "
149 "query can be skipped using --no-insert. Note that for performance reason, "
150 "all data is collected in memory and a single INSERT query is issued at the "
151 "end.\n"
152 "\n"
153 "Using a higher verbosity level (-v), an overview of the written columns or all "
154 "processed leaves is printed depending on the verbosity level. The output looks "
155 "like the following\n"
156 " Leaf name [root data type] (SQL name)\n"
157 "for example\n"
158 " MTime.fTime.fMilliSec [Long64_t] (MilliSec)\n"
159 "which means that the leaf MTime.fTime.fMilliSec is detected to be a Long64_t "
160 "which is filled into a column called MilliSec. Leaves with non basic data types "
161 "are ignored automatically and are marked as (-n/a-). User ignored columnd "
162 "are marked as (-ignored-).\n"
163 "\n"
164 "If a query failed, the query is printed to stderr together with the error message. "
165 "For the main INSERT query, this is only true if the verbosity level is at least 2 "
166 "or the query has less than 80*25 bytes.\n"
167 "\n"
168 "In case of succes, 0 is returned, a value>0 otherwise.\n"
169 "\n"
170 "Usage: root2sql [options] rootfile.root\n"
171 "\n"
172 ;
173 cout << endl;
174}
175
176enum BasicType_t
177{
178 kNone = 0,
179 kFloat,
180 kDouble,
181 kInt16,
182 kUInt16,
183 kInt32,
184 kUInt32,
185 kInt64,
186 kUInt64,
187};
188
189static const map<string, pair<BasicType_t, string>> ConvRoot =
190{
191 { "Float_t", { kFloat, "FLOAT" } },
192 { "Double_t", { kDouble, "DOUBLE" } },
193 { "ULong64_t", { kUInt64, "BIGINT UNSIGNED" } },
194 { "Long64_t", { kInt64, "BIGINT" } },
195 { "UInt_t", { kUInt32, "INT UNSIGNED" } },
196 { "Int_t", { kInt32, "INT" } },
197 { "UShort_t", { kUInt16, "SMALLINT UNSIGNED" } },
198 { "Short_t", { kInt16, "SMALLINT" } },
199};
200
201struct Container
202{
203 string branch; // branch name
204 string column; // column name
205 BasicType_t type;
206 void *ptr;
207
208 Container(const string &b, const string &c, const BasicType_t &t) : branch(b), column(c), type(t), ptr(0)
209 {
210 switch (t)
211 {
212 case kFloat: ptr = new Float_t; break;
213 case kDouble: ptr = new Double_t; break;
214 case kInt16: ptr = new Short_t; break;
215 case kUInt16: ptr = new UShort_t; break;
216 case kInt32: ptr = new Int_t; break;
217 case kUInt32: ptr = new UInt_t; break;
218 case kInt64: ptr = new Long64_t; break;
219 case kUInt64: ptr = new ULong64_t; break;
220 case kNone:
221 break;
222 }
223 }
224 ~Container()
225 {
226 //::operator delete(ptr); // It seems root is deleting it already
227 }
228
229 string fmt() const
230 {
231 ostringstream str;
232
233 switch (type)
234 {
235 case kFloat: str << setprecision(8) << *reinterpret_cast<Float_t*>(ptr); break;
236 case kDouble: str << setprecision(16) << *reinterpret_cast<Double_t*>(ptr); break;
237 case kInt16: str << *reinterpret_cast<Short_t*>(ptr); break;
238 case kUInt16: str << *reinterpret_cast<UShort_t*>(ptr); break;
239 case kInt32: str << *reinterpret_cast<Int_t*>(ptr); break;
240 case kUInt32: str << *reinterpret_cast<UInt_t*>(ptr); break;
241 case kInt64: str << *reinterpret_cast<Long64_t*>(ptr); break;
242 case kUInt64: str << *reinterpret_cast<ULong64_t*>(ptr); break;
243 case kNone:
244 break;
245 }
246
247 if (str.str()=="nan" || str.str()=="-nan" || str.str()=="inf" || str.str()=="-inf")
248 return "NULL";
249
250 return str.str();
251 }
252};
253
254void ErrorHandlerAll(Int_t level, Bool_t abort, const char *location, const char *msg)
255{
256 if (string(msg).substr(0,24)=="no dictionary for class ")
257 return;
258
259 DefaultErrorHandler(level, abort, location, msg);
260}
261
262int main(int argc, const char* argv[])
263{
264 Time start;
265
266 gROOT->SetBatch();
267 SetErrorHandler(ErrorHandlerAll);
268
269 Configuration conf(argv[0]);
270 conf.SetPrintUsage(PrintUsage);
271 SetupConfiguration(conf);
272
273 if (!conf.DoParse(argc, argv))
274 return 127;
275
276 // ----------------------------- Evaluate options --------------------------
277 const string uri = conf.Get<string>("uri");
278 const string file = conf.Get<string>("file");
279 const string tree = conf.Get<string>("tree");
280 const string table = conf.Get<string>("table").empty() ? tree : conf.Get<string>("table");
281
282 const uint16_t verbose = conf.Get<uint16_t>("verbose");
283 const int64_t first = conf.Get<int64_t>("first");
284 const int64_t max = conf.Get<int64_t>("max");
285
286 const bool force = conf.Get<bool>("force");
287 const bool drop = conf.Get<bool>("drop");
288 const bool create = conf.Get<bool>("create") || drop;
289 const bool noinsert = conf.Get<bool>("no-insert");
290 const bool dry_run = conf.Get<bool>("dry-run");
291
292 const string engine = conf.Get<string>("engine");
293
294 const bool print_branches = conf.Get<bool>("print-branches");
295 const bool print_leaves = conf.Get<bool>("print-leaves");
296 const bool print_create = conf.Get<bool>("print-create");
297 const bool print_insert = conf.Get<bool>("print-insert");
298
299 const vector<Map> mymap = conf.Vec<Map>("map");
300 const vector<Map> sqltypes = conf.Vec<Map>("sql-type");
301 const vector<string> _ignore = conf.Vec<string>("ignore");
302 const vector<string> primary = conf.Vec<string>("primary");
303
304 // -------------------------------------------------------------------------
305
306 if (verbose>0)
307 cout << "\n-------------------------- Evaluating file -------------------------" << endl;
308
309 TFile f(file.c_str());
310 if (f.IsZombie())
311 {
312 cerr << "Could not open file " << file << endl;
313 return 1;
314 }
315
316 if (verbose>0)
317 cout << "File: " << file << endl;
318
319 TTree *T = 0;
320 f.GetObject(tree.c_str(), T);
321 if (!T)
322 {
323 cerr << "Could not open tree " << tree << endl;
324 return 2;
325 }
326
327 if (verbose>0)
328 cout << "Tree: " << tree << endl;
329
330 T->SetMakeClass(1);
331
332 TObjArray *branches = T->GetListOfBranches();
333 TObjArray *leaves = T->GetListOfLeaves();
334
335 if (print_branches)
336 {
337 cout << '\n';
338 branches->Print();
339 }
340
341 if (verbose>0)
342 cout << T->GetEntriesFast() << " events found." << endl;
343
344
345 if (verbose>0)
346 cout << branches->GetEntries() << " branches found." << endl;
347
348 if (print_leaves)
349 {
350 cout << '\n';
351 leaves->Print();
352 }
353 if (verbose>0)
354 cout << leaves->GetEntries() << " leaves found." << endl;
355
356 string query =
357 "CREATE TABLE IF NOT EXISTS `"+table+"`\n"
358 "(\n";
359
360 vector<Container> vec;
361
362
363 TIter Next(leaves);
364 TObject *o = 0;
365 while ((o=Next()))
366 {
367 TLeaf *L = T->GetLeaf(o->GetName());
368
369 if (verbose>2)
370 cout << '\n' << o->GetName() << " [" << L->GetTypeName() << "]";
371
372 string name = o->GetName();
373
374
375 bool found = false;
376 for (auto b=_ignore.cbegin(); b!=_ignore.cend(); b++)
377 {
378 if (boost::regex_match(name, boost::regex(*b)))
379 {
380 found = true;
381 if (verbose>2)
382 cout << " (-ignored-)";
383 break;
384 }
385 }
386 if (found)
387 continue;
388
389 const string tn = L->GetTypeName();
390
391 auto it = ConvRoot.find(tn);
392 if (it==ConvRoot.end())
393 {
394 if (verbose>2)
395 cout << " (-n/a-)";
396 continue;
397 }
398
399 if (verbose==2)
400 cout << '\n' << o->GetName() << " [" << L->GetTypeName() << "]";
401
402 for (auto m=mymap.cbegin(); m!=mymap.cend(); m++)
403 name = boost::regex_replace(name, boost::regex(m->first), m->second);
404
405 if (verbose>1)
406 cout << " (" << name << ")";
407
408 string sqltype = it->second.second;
409
410 for (auto m=sqltypes.cbegin(); m!=sqltypes.cend(); m++)
411 if (m->first==name)
412 sqltype = m->second;
413
414 if (!vec.empty())
415 query += ",\n";
416 query += " `"+name+"` "+sqltype+" NOT NULL COMMENT '"+o->GetName()+"'";
417
418 vec.emplace_back(o->GetName(), name, it->second.first);
419 T->SetBranchAddress(o->GetName(), vec.back().ptr);
420 }
421
422 if (verbose>1)
423 cout << "\n\n";
424 if (verbose>0)
425 cout << vec.size() << " leaves setup for reading." << endl;
426
427 UInt_t datatype = 0;
428 const bool has_datatype = T->SetBranchAddress("DataType.fVal", &datatype) >= 0;
429
430 // Setiing up branch status (must be after all SetBranchAddress)
431 T->SetBranchStatus("*", 0);
432 for (auto c=vec.cbegin(); c!=vec.cend(); c++)
433 T->SetBranchStatus(c->branch.c_str(), 1);
434
435 if (has_datatype)
436 {
437 T->SetBranchStatus("DataType.fVal", 1);
438 if (verbose>0)
439 cout << "Rows with DataType.fVal!=1 will be skipped." << endl;
440 }
441
442 // -------------------------------------------------------------------------
443 // Checking for database connection
444
445 try
446 {
447 if (!force)
448 Database(uri).connected();
449 }
450 catch (const exception &e)
451 {
452 cerr << "SQL connection failed: " << e.what() << endl;
453 return 3;
454 }
455
456 // -------------------------------------------------------------------------
457
458 if (verbose>0)
459 cout << "\n--------------------------- Database Table -------------------------" << endl;
460
461 if (!primary.empty())
462 query += ",\n PRIMARY KEY USING BTREE (`"+boost::algorithm::join(primary, "`, `")+"`)\n";
463
464 query +=
465 ")\n"
466 "DEFAULT CHARSET=latin1 COLLATE=latin1_general_ci\n"
467 "ENGINE="+engine+"\n"
468 "COMMENT='created by "+conf.GetName()+"'\n";
469
470 // FIXME: Can we omit the catching to be able to print the
471 // query 'autmatically'?
472 try
473 {
474 if (drop)
475 {
476 // => Simple result
477 if (!dry_run)
478 Database(uri).query("DROP TABLE `"+table+"`").execute();
479 if (verbose>0)
480 cout << "Table `" << table << "` dropped." << endl;
481 }
482 }
483 catch (const exception &e)
484 {
485 cerr << "DROP TABLE `" << table << "`\n\n";
486 cerr << "SQL query failed:\n" << e.what() << endl;
487 return 4;
488 }
489
490 try
491 {
492 if (create && !dry_run)
493 Database(uri).query(query).execute();
494 }
495 catch (const exception &e)
496 {
497 cerr << query << "\n\n";
498 cerr << "SQL query failed:\n" << e.what() << endl;
499 return 5;
500 }
501
502 if (print_create)
503 cout << query << endl;
504
505 if (create && verbose>0)
506 cout << "Table `" << table << "` created." << endl;
507
508 // -------------------------------------------------------------------------
509
510 if (verbose>0)
511 cout << "\n---------------------------- Reading file --------------------------" << endl;
512
513 //query = update ? "UPDATE" : "INSERT";
514 query = "INSERT `"+table+"`\n"
515 "(\n";
516
517 for (auto c=vec.cbegin(); c!=vec.cend(); c++)
518 {
519 if (c!=vec.cbegin())
520 query += ",\n";
521 query += " `"+c->column+"`";
522 }
523
524 query +=
525 "\n)\n"
526 "VALUES\n";
527
528 size_t count = 0;
529
530 const size_t num = max>0 && (max-first)<T->GetEntriesFast() ? (max-first) : T->GetEntriesFast();
531 for (size_t j=first; j<num; j++)
532 {
533 T->GetEntry(j);
534 if (has_datatype && datatype!=1)
535 continue;
536
537 if (count>0)
538 query += ",\n";
539
540 query += "(\n";
541
542 for (auto c=vec.cbegin(); c!=vec.cend(); c++)
543 {
544 if (c!=vec.cbegin())
545 query += ",\n";
546
547 query += " "+c->fmt();
548
549 if (print_insert)
550 query += " /* "+c->column+" -> "+c->branch+" */";
551 }
552 query += "\n)"; // ON DUPLICATE KEY UPDATE\n";
553
554 count ++;
555 }
556
557 if (verbose>0)
558 cout << count << " out of " << num << " row(s) read from file [N=" << first << ".." << num-1 << "]." << endl;
559
560 if (count==0)
561 {
562 if (verbose>0)
563 cout << "Total execution time: " << Time().UnixTime()-start.UnixTime() << "s\n" << endl;
564 return 0;
565 }
566
567 // -------------------------------------------------------------------------
568
569 if (verbose>0)
570 {
571 cout << "\n--------------------------- Inserting data -------------------------" << endl;
572 cout << "Sending INSERT query (" << query.length() << " bytes)" << endl;
573 }
574
575 try
576 {
577 if (!noinsert && !dry_run)
578 // => Simple result
579 Database(uri).query(query).execute();
580 else
581 cout << "Insert query skipped!" << endl;
582
583 if (print_insert)
584 cout << query << endl;
585 }
586 catch (const exception &e)
587 {
588 if (verbose>1 || query.length()<80*25)
589 cerr << query << "\n\n";
590 cerr << "SQL query failed (" << query.length() << " bytes):\n" << e.what() << endl;
591 return 6;
592 }
593
594 if (verbose>0)
595 {
596 cout << count << " row(s) inserted.\n\n";
597 cout << "Total execution time: " << Time().UnixTime()-start.UnixTime() << "s\n" << endl;
598 }
599
600 return 0;
601}
Note: See TracBrowser for help on using the repository browser.