source: trunk/FACT++/src/root2sql.cc@ 19082

Last change on this file since 19082 was 19082, checked in by tbretz, 6 years ago
Error in the same line, so it is easier to grep for it.
File size: 20.1 KB
Line 
1#include <boost/algorithm/string/join.hpp>
2#include <boost/regex.hpp>
3
4#include "Database.h"
5
6#include "Time.h"
7#include "Configuration.h"
8
9#include <TROOT.h>
10#include <TFile.h>
11#include <TTree.h>
12#include <TLeaf.h>
13#include <TError.h>
14
15using namespace std;
16
17// ------------------------------------------------------------------------
18
19struct Map : pair<string, string>
20{
21 Map() { }
22};
23
24std::istream &operator>>(std::istream &in, Map &m)
25{
26 const istreambuf_iterator<char> eos;
27 string txt(istreambuf_iterator<char>(in), eos);
28
29 const boost::regex expr("((.*)[^\\\\])/(.*)");
30 boost::smatch match;
31 if (!boost::regex_match(txt, match, expr))
32 throw runtime_error("Could not evaluate map argument: "+txt);
33
34 m.first = match[1].str();
35 m.second = match[3].str();
36
37 return in;
38}
39
40
41void SetupConfiguration(Configuration &conf)
42{
43 po::options_description control("Root to SQL");
44 control.add_options()
45 ("uri,u", var<string>()
46#if BOOST_VERSION >= 104200
47 ->required()
48#endif
49 , "Database link as in\n\tuser:password@server[:port]/database.")
50 ("file", var<string>(""), "The root file to read from")
51 ("force", po_switch(), "Force processing even if there is no database connection")
52 ("create", po_switch(), "Create the database if not existing")
53 ("drop", po_switch(), "Drop the table (implies create)")
54 ("tree,t", var<string>("Events"), "Name of the root tree to convert")
55 ("table", var<string>(""), "Name of the table to use (default is the tree name)")
56 ("map", vars<Map>(), "A regular expression which is applied to the leaf name befoee it is used as SQL column name)")
57 ("sql-type", vars<Map>(), "Allows to overwrite the calculated SQL type for a given column e.g. 'sql-column-name/UNSIGNED IN'")
58 ("ignore", vars<string>(), "Ignore the given leaf, if the given regular expression matches")
59 ("primary", vars<string>(), "List of columns to be used as primary keys during table creation (in connection with --create)")
60 ("first", var<int64_t>(int64_t(0)), "First event to start with (default: 0), mainly for test purpose")
61 ("max", var<int64_t>(int64_t(0)), "Maximum number of events to process (0: all), mainly for test purpose")
62 ("engine", var<string>("InnoDB"), "Database engine to be used when a new table is created")
63 ;
64
65 po::options_description debug("Debug options");
66 debug.add_options()
67 ("no-insert", po_switch(), "Does not insert any data into the table")
68 ("dry-run", po_switch(), "Skip any query which changes the databse (might result in consecutive failures)")
69 ("print-branches", po_switch(), "Print the branches found in the tree")
70 ("print-leaves", po_switch(), "Print the leaves found in the tree (this is what is processed)")
71 ("print-insert", po_switch(), "Print the INSERT query (note that it contains all data)")
72 ("print-create", po_switch(), "Print the CREATE query")
73 ("verbose,v", var<uint16_t>(1), "Verbosity (0: quiet, 1: default, 2: more, 3, ...)")
74 ;
75
76 po::positional_options_description p;
77 p.add("file", 1); // The 1st positional options (n=1)
78
79 conf.AddOptions(control);
80 conf.AddOptions(debug);
81 conf.SetArgumentPositions(p);
82}
83
84void PrintUsage()
85{
86 cout <<
87 "root2sql - Fills the data from a root file into a database\n"
88 "\n"
89 "For convenience, this documentation uses the extended version of the options, "
90 "refer to the output below to get the abbreviations.\n"
91 "\n"
92 "This is a general purpose tool to fill the contents of a root file into a database "
93 "as long as this is technically possible and makes sense. Note that root can even "
94 "write complex data like a TH1F into a database, this is not the purpose of this "
95 "program.\n"
96 "\n"
97 "Each root tree has branches and leaves (the basic data types). These leaves can "
98 "be read independently of the classes which were used to write the root file. "
99 "The default tree to read from is 'Events' but the name can be overwritten "
100 "using --tree. The default table name to fill the data into is identical to "
101 "the tree name. It can be overwritten using --table.\n"
102 "\n"
103 "The name of each column to which data is filled from a leave is obtained from "
104 "the leaves' names. The leave names can be checked using --print-leaves. "
105 "A --print-branches exists for convenience to print only the high-level branches. "
106 "Sometimes these names might be quite unconvenient like MTime.fTime.fMilliSec or "
107 "just MHillas.fWidth. To allow to simplify column names, regular expressions "
108 "(using boost's regex) can be defined to change the names. Note that these regular "
109 "expressions are applied one by one on each leaf's name. A valid expression could "
110 "be:\n"
111 " --map=MHillas\\.f/\n"
112 "which would remove all occurances of 'MHillas.f'. This option can be used more than "
113 "once. They are applied in sequence. A single match does not stop the sequence.\n"
114 "\n"
115 "Sometimes it might also be convenient to skip a leaf. This can be done with "
116 "the --ignore resource. If the given regular expresion yield a match, the "
117 "leaf will be ignored. Note that the regular expression work on the raw-name "
118 "of the leaf not the readily mapped SQL column names. Example:\n"
119 " --ignore=ThetaSq\\..*\n"
120 "will skip all leaved which start with 'ThetaSq.'. This option can be used"
121 "more than once.\n"
122 "\n"
123 "The data type of each column is kept as close as possible to the leaves' data "
124 "types. If for some reason this is not wanted, the data type of the SQL column "
125 "can be overwritten with --sql-type sql-column/sql-ytpe, for example:\n"
126 " --sql-type=FileId/UNSIGNED INT\n"
127 "while the first argument of the name of the SQL column to which the data type "
128 "should be applied. The second column is the basic SQL data type. The option can "
129 "be given more than once.\n"
130 "\n"
131 "Database interaction:\n"
132 "\n"
133 "To drop an existing table, --drop can be used.\n"
134 "\n"
135 "To create a table according to theSQL column names and data types, --create "
136 "can be used. The query used can be printed with --print-create even --create "
137 "has not been specified.\n"
138 "\n"
139 "To choose the columns which should become primary keys, use --primary, "
140 "for exmaple:\n"
141 " --primary=col1\n"
142 "To define more than one column as primary key, the option can be given more than "
143 "once. Note that the combination of these columns must be unique.\n"
144 "\n"
145 "All columns are created as NOT NULL as default and the table is created as "
146 "MyISAM (default).\n"
147 "\n"
148 "For debugging purpose, or to just create or drop a table, the final insert "
149 "query can be skipped using --no-insert. Note that for performance reason, "
150 "all data is collected in memory and a single INSERT query is issued at the "
151 "end.\n"
152 "\n"
153 "Using a higher verbosity level (-v), an overview of the written columns or all "
154 "processed leaves is printed depending on the verbosity level. The output looks "
155 "like the following\n"
156 " Leaf name [root data type] (SQL name)\n"
157 "for example\n"
158 " MTime.fTime.fMilliSec [Long64_t] (MilliSec)\n"
159 "which means that the leaf MTime.fTime.fMilliSec is detected to be a Long64_t "
160 "which is filled into a column called MilliSec. Leaves with non basic data types "
161 "are ignored automatically and are marked as (-n/a-). User ignored columnd "
162 "are marked as (-ignored-).\n"
163 "\n"
164 "In case of succes, 0 is returned, a value>0 otherwise.\n"
165 "\n"
166 "Usage: root2sql [options] rootfile.root\n"
167 "\n"
168 ;
169 cout << endl;
170}
171
172enum BasicType_t
173{
174 kNone = 0,
175 kFloat,
176 kDouble,
177 kInt16,
178 kUInt16,
179 kInt32,
180 kUInt32,
181 kInt64,
182 kUInt64,
183};
184
185static const map<string, pair<BasicType_t, string>> ConvRoot =
186{
187 { "Float_t", { kFloat, "FLOAT" } },
188 { "Double_t", { kDouble, "DOUBLE" } },
189 { "ULong64_t", { kUInt64, "BIGINT UNSIGNED" } },
190 { "Long64_t", { kInt64, "BIGINT" } },
191 { "UInt_t", { kUInt32, "INT UNSIGNED" } },
192 { "Int_t", { kInt32, "INT" } },
193 { "UShort_t", { kUInt16, "SMALLINT UNSIGNED" } },
194 { "Short_t", { kInt16, "SMALLINT" } },
195};
196
197struct Container
198{
199 string branch; // branch name
200 string column; // column name
201 BasicType_t type;
202 void *ptr;
203
204 Container(const string &b, const string &c, const BasicType_t &t) : branch(b), column(c), type(t), ptr(0)
205 {
206 switch (t)
207 {
208 case kFloat: ptr = new Float_t; break;
209 case kDouble: ptr = new Double_t; break;
210 case kInt16: ptr = new Short_t; break;
211 case kUInt16: ptr = new UShort_t; break;
212 case kInt32: ptr = new Int_t; break;
213 case kUInt32: ptr = new UInt_t; break;
214 case kInt64: ptr = new Long64_t; break;
215 case kUInt64: ptr = new ULong64_t; break;
216 case kNone:
217 break;
218 }
219 }
220 ~Container()
221 {
222 //::operator delete(ptr); // It seems root is deleting it already
223 }
224
225 string fmt() const
226 {
227 ostringstream str;
228
229 switch (type)
230 {
231 case kFloat: str << setprecision(8) << *reinterpret_cast<Float_t*>(ptr); break;
232 case kDouble: str << setprecision(16) << *reinterpret_cast<Double_t*>(ptr); break;
233 case kInt16: str << *reinterpret_cast<Short_t*>(ptr); break;
234 case kUInt16: str << *reinterpret_cast<UShort_t*>(ptr); break;
235 case kInt32: str << *reinterpret_cast<Int_t*>(ptr); break;
236 case kUInt32: str << *reinterpret_cast<UInt_t*>(ptr); break;
237 case kInt64: str << *reinterpret_cast<Long64_t*>(ptr); break;
238 case kUInt64: str << *reinterpret_cast<ULong64_t*>(ptr); break;
239 case kNone:
240 break;
241 }
242
243 if (str.str()=="nan" || str.str()=="-nan" || str.str()=="inf" || str.str()=="-inf")
244 return "NULL";
245
246 return str.str();
247 }
248};
249
250void ErrorHandlerAll(Int_t level, Bool_t abort, const char *location, const char *msg)
251{
252 if (string(msg).substr(0,24)=="no dictionary for class ")
253 return;
254
255 DefaultErrorHandler(level, abort, location, msg);
256}
257
258int main(int argc, const char* argv[])
259{
260 Time start;
261
262 gROOT->SetBatch();
263 SetErrorHandler(ErrorHandlerAll);
264
265 Configuration conf(argv[0]);
266 conf.SetPrintUsage(PrintUsage);
267 SetupConfiguration(conf);
268
269 if (!conf.DoParse(argc, argv))
270 return 127;
271
272 // ----------------------------- Evaluate options --------------------------
273 const string uri = conf.Get<string>("uri");
274 const string file = conf.Get<string>("file");
275 const string tree = conf.Get<string>("tree");
276 const string table = conf.Get<string>("table").empty() ? tree : conf.Get<string>("table");
277
278 const uint16_t verbose = conf.Get<uint16_t>("verbose");
279 const int64_t first = conf.Get<int64_t>("first");
280 const int64_t max = conf.Get<int64_t>("max");
281
282 const bool force = conf.Get<bool>("force");
283 const bool drop = conf.Get<bool>("drop");
284 const bool create = conf.Get<bool>("create") || drop;
285 const bool noinsert = conf.Get<bool>("no-insert");
286 const bool dry_run = conf.Get<bool>("dry-run");
287
288 const string engine = conf.Get<string>("engine");
289
290 const bool print_branches = conf.Get<bool>("print-branches");
291 const bool print_leaves = conf.Get<bool>("print-leaves");
292 const bool print_create = conf.Get<bool>("print-create");
293 const bool print_insert = conf.Get<bool>("print-insert");
294
295 const vector<Map> mymap = conf.Vec<Map>("map");
296 const vector<Map> sqltypes = conf.Vec<Map>("sql-type");
297 const vector<string> _ignore = conf.Vec<string>("ignore");
298 const vector<string> primary = conf.Vec<string>("primary");
299
300 // -------------------------------------------------------------------------
301
302 if (verbose>0)
303 cout << "\n-------------------------- Evaluating file -------------------------" << endl;
304
305 TFile f(file.c_str());
306 if (f.IsZombie())
307 {
308 cerr << "Could not open file " << file << endl;
309 return 1;
310 }
311
312 if (verbose>0)
313 cout << "File: " << file << endl;
314
315 TTree *T = 0;
316 f.GetObject(tree.c_str(), T);
317 if (!T)
318 {
319 cerr << "Could not open tree " << tree << endl;
320 return 2;
321 }
322
323 if (verbose>0)
324 cout << "Tree: " << tree << endl;
325
326 T->SetMakeClass(1);
327
328 TObjArray *branches = T->GetListOfBranches();
329 TObjArray *leaves = T->GetListOfLeaves();
330
331 if (print_branches)
332 {
333 cout << '\n';
334 branches->Print();
335 }
336
337 if (verbose>0)
338 cout << T->GetEntriesFast() << " events found." << endl;
339
340
341 if (verbose>0)
342 cout << branches->GetEntries() << " branches found." << endl;
343
344 if (print_leaves)
345 {
346 cout << '\n';
347 leaves->Print();
348 }
349 if (verbose>0)
350 cout << leaves->GetEntries() << " leaves found." << endl;
351
352 string query =
353 "CREATE TABLE IF NOT EXISTS `"+table+"`\n"
354 "(\n";
355
356 vector<Container> vec;
357
358
359 TIter Next(leaves);
360 TObject *o = 0;
361 while ((o=Next()))
362 {
363 TLeaf *L = T->GetLeaf(o->GetName());
364
365 if (verbose>2)
366 cout << '\n' << o->GetName() << " [" << L->GetTypeName() << "]";
367
368 string name = o->GetName();
369
370
371 bool found = false;
372 for (auto b=_ignore.cbegin(); b!=_ignore.cend(); b++)
373 {
374 if (boost::regex_match(name, boost::regex(*b)))
375 {
376 found = true;
377 if (verbose>2)
378 cout << " (-ignored-)";
379 break;
380 }
381 }
382 if (found)
383 continue;
384
385 const string tn = L->GetTypeName();
386
387 auto it = ConvRoot.find(tn);
388 if (it==ConvRoot.end())
389 {
390 if (verbose>2)
391 cout << " (-n/a-)";
392 continue;
393 }
394
395 if (verbose==2)
396 cout << '\n' << o->GetName() << " [" << L->GetTypeName() << "]";
397
398 for (auto m=mymap.cbegin(); m!=mymap.cend(); m++)
399 name = boost::regex_replace(name, boost::regex(m->first), m->second);
400
401 if (verbose>1)
402 cout << " (" << name << ")";
403
404 string sqltype = it->second.second;
405
406 for (auto m=sqltypes.cbegin(); m!=sqltypes.cend(); m++)
407 if (m->first==name)
408 sqltype = m->second;
409
410 if (!vec.empty())
411 query += ",\n";
412 query += " `"+name+"` "+sqltype+" NOT NULL COMMENT '"+o->GetName()+"'";
413
414 vec.emplace_back(o->GetName(), name, it->second.first);
415 T->SetBranchAddress(o->GetName(), vec.back().ptr);
416 }
417
418 if (verbose>1)
419 cout << "\n\n";
420 if (verbose>0)
421 cout << vec.size() << " leaves setup for reading." << endl;
422
423 UInt_t datatype = 0;
424 const bool has_datatype = T->SetBranchAddress("DataType.fVal", &datatype) >= 0;
425
426 // Setiing up branch status (must be after all SetBranchAddress)
427 T->SetBranchStatus("*", 0);
428 for (auto c=vec.cbegin(); c!=vec.cend(); c++)
429 T->SetBranchStatus(c->branch.c_str(), 1);
430
431 if (has_datatype)
432 {
433 T->SetBranchStatus("DataType.fVal", 1);
434 if (verbose>0)
435 cout << "Rows with DataType.fVal!=1 will be skipped." << endl;
436 }
437
438 // -------------------------------------------------------------------------
439 // Checking for database connection
440
441 try
442 {
443 if (!force)
444 Database(uri).connected();
445 }
446 catch (const exception &e)
447 {
448 cerr << "SQL connection failed: " << e.what() << endl;
449 return 3;
450 }
451
452 // -------------------------------------------------------------------------
453
454 if (verbose>0)
455 cout << "\n--------------------------- Database Table -------------------------" << endl;
456
457 if (!primary.empty())
458 query += ",\n PRIMARY KEY USING BTREE (`"+boost::algorithm::join(primary, "`, `")+"`)\n";
459
460 query +=
461 ")\n"
462 "DEFAULT CHARSET=latin1 COLLATE=latin1_general_ci\n"
463 "ENGINE="+engine+"\n"
464 "COMMENT='created by "+conf.GetName()+"'\n";
465
466 // FIXME: Can we omit the catching to be able to print the
467 // query 'autmatically'?
468 try
469 {
470 if (drop)
471 {
472 // => Simple result
473 if (!dry_run)
474 Database(uri).query("DROP TABLE `"+table+"`").execute();
475 if (verbose>0)
476 cout << "Table `" << table << "` dropped." << endl;
477 }
478 }
479 catch (const exception &e)
480 {
481 cerr << "DROP TABLE `" << table << "`\n\n";
482 cerr << "SQL query failed: " << e.what() << endl;
483 return 4;
484 }
485
486 try
487 {
488 if (create && !dry_run)
489 Database(uri).query(query).execute();
490 }
491 catch (const exception &e)
492 {
493 cerr << query << "\n\n";
494 cerr << "SQL query failed: " << e.what() << endl;
495 return 5;
496 }
497
498 if (print_create)
499 cout << query << endl;
500
501 if (create && verbose>0)
502 cout << "Table `" << table << "` created." << endl;
503
504 // -------------------------------------------------------------------------
505
506 if (verbose>0)
507 cout << "\n---------------------------- Reading file --------------------------" << endl;
508
509 //query = update ? "UPDATE" : "INSERT";
510 query = "INSERT `"+table+"`\n"
511 "(\n";
512
513 for (auto c=vec.cbegin(); c!=vec.cend(); c++)
514 {
515 if (c!=vec.cbegin())
516 query += ",\n";
517 query += " `"+c->column+"`";
518 }
519
520 query +=
521 "\n)\n"
522 "VALUES\n";
523
524 size_t count = 0;
525
526 const size_t num = max>0 && (max-first)<T->GetEntriesFast() ? (max-first) : T->GetEntriesFast();
527 for (size_t j=first; j<num; j++)
528 {
529 T->GetEntry(j);
530 if (has_datatype && datatype!=1)
531 continue;
532
533 if (count>0)
534 query += ",\n";
535
536 query += "(\n";
537
538 for (auto c=vec.cbegin(); c!=vec.cend(); c++)
539 {
540 if (c!=vec.cbegin())
541 query += ",\n";
542
543 query += " "+c->fmt();
544
545 if (print_insert)
546 query += " /* "+c->column+" -> "+c->branch+" */";
547 }
548 query += "\n)"; // ON DUPLICATE KEY UPDATE\n";
549
550 count ++;
551 }
552
553 if (verbose>0)
554 cout << count << " out of " << num << " row(s) read from file [N=" << first << ".." << num-1 << "]." << endl;
555
556 if (count==0)
557 {
558 if (verbose>0)
559 cout << "Total execution time: " << Time().UnixTime()-start.UnixTime() << "s\n" << endl;
560 return 0;
561 }
562
563 // -------------------------------------------------------------------------
564
565 if (verbose>0)
566 {
567 cout << "\n--------------------------- Inserting data -------------------------" << endl;
568 cout << "Sending INSERT query (" << query.length() << " bytes)" << endl;
569 }
570
571 try
572 {
573 if (!noinsert && !dry_run)
574 // => Simple result
575 Database(uri).query(query).execute();
576 else
577 cout << "Insert query skipped!" << endl;
578
579 if (print_insert)
580 cout << query << endl;
581 }
582 catch (const exception &e)
583 {
584 cerr << query << "\n\n";
585 cerr << "SQL query failed (" << query.length() << " bytes): " << e.what() << endl;
586 return 6;
587 }
588
589 if (verbose>0)
590 {
591 cout << count << " row(s) inserted.\n\n";
592 cout << "Total execution time: " << Time().UnixTime()-start.UnixTime() << "s\n" << endl;
593 }
594
595 return 0;
596}
Note: See TracBrowser for help on using the repository browser.