source: trunk/FACT++/src/root2sql.cc@ 19065

Last change on this file since 19065 was 19063, checked in by tbretz, 8 years ago
Made it compile with an older boost version. Added force option and a check for the database connection. Do not add the comments to the query if not printed to the screen. Adapted the return values.
File size: 19.2 KB
Line 
1#include <boost/algorithm/string/join.hpp>
2#include <boost/regex.hpp>
3
4#include "Database.h"
5
6#include "Time.h"
7#include "Configuration.h"
8
9#include <TROOT.h>
10#include <TFile.h>
11#include <TTree.h>
12#include <TLeaf.h>
13#include <TError.h>
14
15using namespace std;
16
17// ------------------------------------------------------------------------
18
19struct Map : pair<string, string>
20{
21 Map() { }
22};
23
24std::istream &operator>>(std::istream &in, Map &m)
25{
26 const istreambuf_iterator<char> eos;
27 string txt(istreambuf_iterator<char>(in), eos);
28
29 const boost::regex expr("((.*)[^\\\\])/(.*)");
30 boost::smatch match;
31 if (!boost::regex_match(txt, match, expr))
32 throw runtime_error("Could not evaluate map argument: "+txt);
33
34 m.first = match[1].str();
35 m.second = match[3].str();
36
37 return in;
38}
39
40
41void SetupConfiguration(Configuration &conf)
42{
43 po::options_description control("Root to SQL");
44 control.add_options()
45 ("uri,u", var<string>()
46#if BOOST_VERSION >= 104200
47 ->required()
48#endif
49 , "Database link as in\n\tuser:password@server[:port]/database.")
50 ("file", var<string>(""), "The root file to read from")
51 ("force", po_switch(), "Force processing even if there is no database connection")
52 ("create", po_switch(), "Create the database if not existing")
53 ("drop", po_switch(), "Drop the table (implies create)")
54 ("tree,t", var<string>("Events"), "Name of the root tree to convert")
55 ("table", var<string>(""), "Name of the table to use (default is the tree name)")
56 ("map", vars<Map>(), "A regular expression which is applied to the leaf name befoee it is used as SQL column name)")
57 ("sql-type", vars<Map>(), "Allows to overwrite the calculated SQL type for a given column e.g. 'sql-column-name/UNSIGNED IN'")
58 ("ignore", vars<string>(), "Ignore the given leaf, if the given regular expression matches")
59 ("primary", vars<string>(), "List of columns to be used as primary keys during table creation (in connection with --create)")
60 ("first", var<int64_t>(int64_t(0)), "First event to start with (default: 0), mainly for test purpose")
61 ("max", var<int64_t>(int64_t(0)), "Maximum number of events to process (0: all), mainly for test purpose")
62 ("no-insert", po_switch(), "Does not insert any data into the table")
63 ("engine", var<string>("MyISAM"), "Database engine to be used when a new table is created")
64 ("print-branches", po_switch(), "Print the branches found in the tree")
65 ("print-leaves", po_switch(), "Print the leaves found in the tree (this is what is processed)")
66 ("print-insert", po_switch(), "Print the INSERT query (note that it contains all data)")
67 ("print-create", po_switch(), "Print the CREATE query")
68 ("verbose,v", var<uint16_t>(1), "Verbosity (0: quiet, 1: default, 2: more, 3, ...)")
69 ;
70
71 po::positional_options_description p;
72 p.add("file", 1); // The 1st positional options (n=1)
73
74 conf.AddOptions(control);
75 conf.SetArgumentPositions(p);
76}
77
78void PrintUsage()
79{
80 cout <<
81 "root2sql - Fills the data from a root file into a database\n"
82 "\n"
83 "For convenience, this documentation uses the extended version of the options, "
84 "refer to the output below to get the abbreviations.\n"
85 "\n"
86 "This is a general purpose tool to fill the contents of a root file into a database "
87 "as long as this is technically possible and makes sense. Note that root can even "
88 "write complex data like a TH1F into a database, this is not the purpose of this "
89 "program.\n"
90 "\n"
91 "Each root tree has branches and leaves (the basic data types). These leaves can "
92 "be read independently of the classes which were used to write the root file. "
93 "The default tree to read from is 'Events' but the name can be overwritten "
94 "using --tree. The default table name to fill the data into is identical to "
95 "the tree name. It can be overwritten using --table.\n"
96 "\n"
97 "The name of each column to which data is filled from a leave is obtained from "
98 "the leaves' names. The leave names can be checked using --print-leaves. "
99 "A --print-branches exists for convenience to print only the high-level branches. "
100 "Sometimes these names might be quite unconvenient like MTime.fTime.fMilliSec or "
101 "just MHillas.fWidth. To allow to simplify column names, regular expressions "
102 "(using boost's regex) can be defined to change the names. Note that these regular "
103 "expressions are applied one by one on each leaf's name. A valid expression could "
104 "be:\n"
105 " --map=MHillas\\.f/\n"
106 "which would remove all occurances of 'MHillas.f'. This option can be used more than "
107 "once. They are applied in sequence. A single match does not stop the sequence.\n"
108 "\n"
109 "Sometimes it might also be convenient to skip a leaf. This can be done with "
110 "the --ignore resource. If the given regular expresion yield a match, the "
111 "leaf will be ignored. Note that the regular expression work on the raw-name "
112 "of the leaf not the readily mapped SQL column names. Example:\n"
113 " --ignore=ThetaSq\\..*\n"
114 "will skip all leaved which start with 'ThetaSq.'. This option can be used"
115 "more than once.\n"
116 "\n"
117 "The data type of each column is kept as close as possible to the leaves' data "
118 "types. If for some reason this is not wanted, the data type of the SQL column "
119 "can be overwritten with --sql-type sql-column/sql-ytpe, for example:\n"
120 " --sql-type=FileId/UNSIGNED INT\n"
121 "while the first argument of the name of the SQL column to which the data type "
122 "should be applied. The second column is the basic SQL data type. The option can "
123 "be given more than once.\n"
124 "\n"
125 "Database interaction:\n"
126 "\n"
127 "To drop an existing table, --drop can be used.\n"
128 "\n"
129 "To create a table according to theSQL column names and data types, --create "
130 "can be used. The query used can be printed with --print-create even --create "
131 "has not been specified.\n"
132 "\n"
133 "To choose the columns which should become primary keys, use --primary, "
134 "for exmaple:\n"
135 " --primary=col1\n"
136 "To define more than one column as primary key, the option can be given more than "
137 "once. Note that the combination of these columns must be unique.\n"
138 "\n"
139 "All columns are created as NOT NULL as default and the table is created as "
140 "MyISAM (default).\n"
141 "\n"
142 "For debugging purpose, or to just create or drop a table, the final insert "
143 "query can be skipped using --no-insert. Note that for performance reason, "
144 "all data is collected in memory and a single INSERT query is issued at the "
145 "end.\n"
146 "\n"
147 "Using a higher verbosity level (-v), an overview of the written columns or all "
148 "processed leaves is printed depending on the verbosity level. The output looks "
149 "like the following\n"
150 " Leaf name [root data type] (SQL name)\n"
151 "for example\n"
152 " MTime.fTime.fMilliSec [Long64_t] (MilliSec)\n"
153 "which means that the leaf MTime.fTime.fMilliSec is detected to be a Long64_t "
154 "which is filled into a column called MilliSec. Leaves with non basic data types "
155 "are ignored automatically and are marked as (-n/a-). User ignored columnd "
156 "are marked as (-ignored-).\n"
157 "\n"
158 "In case of succes, 0 is returned, a value>0 otherwise.\n"
159 "\n"
160 "Usage: root2sql [options] rootfile.root\n"
161 "\n"
162 ;
163 cout << endl;
164}
165
166enum BasicType_t
167{
168 kNone = 0,
169 kFloat,
170 kDouble,
171 kInt16,
172 kUInt16,
173 kInt32,
174 kUInt32,
175 kInt64,
176 kUInt64,
177};
178
179static const map<string, pair<BasicType_t, string>> ConvRoot =
180{
181 { "Float_t", { kFloat, "FLOAT" } },
182 { "Double_t", { kDouble, "DOUBLE" } },
183 { "ULong64_t", { kUInt64, "BIGINT UNSIGNED" } },
184 { "Long64_t", { kInt64, "BIGINT" } },
185 { "UInt_t", { kUInt32, "INT UNSIGNED" } },
186 { "Int_t", { kInt32, "INT" } },
187 { "UShort_t", { kUInt16, "SMALLINT UNSIGNED" } },
188 { "Short_t", { kInt16, "SMALLINT" } },
189};
190
191struct Container
192{
193 string branch; // branch name
194 string column; // column name
195 BasicType_t type;
196 void *ptr;
197
198 Container(const string &b, const string &c, const BasicType_t &t) : branch(b), column(c), type(t), ptr(0)
199 {
200 switch (t)
201 {
202 case kFloat: ptr = new Float_t; break;
203 case kDouble: ptr = new Double_t; break;
204 case kInt16: ptr = new Short_t; break;
205 case kUInt16: ptr = new UShort_t; break;
206 case kInt32: ptr = new Int_t; break;
207 case kUInt32: ptr = new UInt_t; break;
208 case kInt64: ptr = new Long64_t; break;
209 case kUInt64: ptr = new ULong64_t; break;
210 case kNone:
211 break;
212 }
213 }
214 ~Container()
215 {
216 //::operator delete(ptr); // It seems root is deleting it already
217 }
218
219 string fmt() const
220 {
221 ostringstream str;
222
223 switch (type)
224 {
225 case kFloat: str << setprecision(8) << *reinterpret_cast<Float_t*>(ptr); break;
226 case kDouble: str << setprecision(16) << *reinterpret_cast<Double_t*>(ptr); break;
227 case kInt16: str << *reinterpret_cast<Short_t*>(ptr); break;
228 case kUInt16: str << *reinterpret_cast<UShort_t*>(ptr); break;
229 case kInt32: str << *reinterpret_cast<Int_t*>(ptr); break;
230 case kUInt32: str << *reinterpret_cast<UInt_t*>(ptr); break;
231 case kInt64: str << *reinterpret_cast<Long64_t*>(ptr); break;
232 case kUInt64: str << *reinterpret_cast<ULong64_t*>(ptr); break;
233 case kNone:
234 break;
235 }
236
237 return str.str();
238 }
239};
240
241void ErrorHandlerAll(Int_t level, Bool_t abort, const char *location, const char *msg)
242{
243 if (string(msg).substr(0,24)=="no dictionary for class ")
244 return;
245
246 DefaultErrorHandler(level, abort, location, msg);
247}
248
249int main(int argc, const char* argv[])
250{
251 Time start;
252
253 gROOT->SetBatch();
254 SetErrorHandler(ErrorHandlerAll);
255
256 Configuration conf(argv[0]);
257 conf.SetPrintUsage(PrintUsage);
258 SetupConfiguration(conf);
259
260 if (!conf.DoParse(argc, argv))
261 return 127;
262
263 // ----------------------------- Evaluate options --------------------------
264 const string uri = conf.Get<string>("uri");
265 const string file = conf.Get<string>("file");
266 const string tree = conf.Get<string>("tree");
267 const string table = conf.Get<string>("table").empty() ? tree : conf.Get<string>("table");
268
269 const uint16_t verbose = conf.Get<uint16_t>("verbose");
270 const int64_t first = conf.Get<int64_t>("first");
271 const int64_t max = conf.Get<int64_t>("max");
272
273 const bool force = conf.Get<bool>("force");
274 const bool drop = conf.Get<bool>("drop");
275 const bool create = conf.Get<bool>("create") || drop;
276 const bool noinsert = conf.Get<bool>("no-insert");
277
278 const string engine = conf.Get<string>("engine");
279
280 const bool print_branches = conf.Get<bool>("print-branches");
281 const bool print_leaves = conf.Get<bool>("print-leaves");
282 const bool print_create = conf.Get<bool>("print-create");
283 const bool print_insert = conf.Get<bool>("print-insert");
284
285 const vector<Map> mymap = conf.Vec<Map>("map");
286 const vector<Map> sqltypes = conf.Vec<Map>("sql-type");
287 const vector<string> _ignore = conf.Vec<string>("ignore");
288 const vector<string> primary = conf.Vec<string>("primary");
289
290 // -------------------------------------------------------------------------
291
292 if (verbose>0)
293 cout << "\n-------------------------- Evaluating file -------------------------" << endl;
294
295 TFile f(file.c_str());
296 if (f.IsZombie())
297 {
298 cerr << "Could not open file " << file << endl;
299 return 1;
300 }
301
302 if (verbose>0)
303 cout << "File: " << file << endl;
304
305 TTree *T = 0;
306 f.GetObject(tree.c_str(), T);
307 if (!T)
308 {
309 cerr << "Could not open tree " << tree << endl;
310 return 2;
311 }
312
313 if (verbose>0)
314 cout << "Tree: " << tree << endl;
315
316 T->SetMakeClass(1);
317
318 TObjArray *branches = T->GetListOfBranches();
319 TObjArray *leaves = T->GetListOfLeaves();
320
321 if (print_branches)
322 {
323 cout << '\n';
324 branches->Print();
325 }
326
327 if (verbose>0)
328 cout << T->GetEntriesFast() << " events found." << endl;
329
330
331 if (verbose>0)
332 cout << branches->GetEntries() << " branches found." << endl;
333
334 if (print_leaves)
335 {
336 cout << '\n';
337 leaves->Print();
338 }
339 if (verbose>0)
340 cout << leaves->GetEntries() << " leaves found." << endl;
341
342 string query =
343 "CREATE TABLE IF NOT EXISTS `"+table+"`\n"
344 "(\n"
345 " `Night` SMALLINT UNSIGNED NOT NULL";
346
347 vector<Container> vec;
348
349
350 TIter Next(leaves);
351 TObject *o = 0;
352 while ((o=Next()))
353 {
354 TLeaf *L = T->GetLeaf(o->GetName());
355
356 if (verbose>2)
357 cout << '\n' << o->GetName() << " [" << L->GetTypeName() << "]";
358
359 string name = o->GetName();
360
361
362 bool found = false;
363 for (auto b=_ignore.cbegin(); b!=_ignore.cend(); b++)
364 {
365 if (boost::regex_match(name, boost::regex(*b)))
366 {
367 found = true;
368 if (verbose>2)
369 cout << " (-ignored-)";
370 break;
371 }
372 }
373 if (found)
374 continue;
375
376 const string tn = L->GetTypeName();
377
378 auto it = ConvRoot.find(tn);
379 if (it==ConvRoot.end())
380 {
381 if (verbose>2)
382 cout << " (-n/a-)";
383 continue;
384 }
385
386 if (verbose==2)
387 cout << '\n' << o->GetName() << " [" << L->GetTypeName() << "]";
388
389 for (auto m=mymap.cbegin(); m!=mymap.cend(); m++)
390 name = boost::regex_replace(name, boost::regex(m->first), m->second);
391
392 if (verbose>1)
393 cout << " (" << name << ")";
394
395 string sqltype = it->second.second;
396
397 for (auto m=sqltypes.cbegin(); m!=sqltypes.cend(); m++)
398 if (m->first==name)
399 sqltype = m->second;
400
401 query += ",\n `"+name+"` "+sqltype+" NOT NULL COMMENT '"+o->GetName()+"'";
402
403 vec.emplace_back(o->GetName(), name, it->second.first);
404 T->SetBranchAddress(o->GetName(), vec.back().ptr);
405 }
406
407 if (verbose>1)
408 cout << "\n\n";
409 if (verbose>0)
410 cout << vec.size() << " leaves setup for reading." << endl;
411
412 UInt_t datatype = 0;
413 const bool has_datatype = T->SetBranchAddress("DataType.fVal", &datatype) >= 0;
414
415 // Setiing up branch status (must be after all SetBranchAddress)
416 T->SetBranchStatus("*", 0);
417 for (auto c=vec.cbegin(); c!=vec.cend(); c++)
418 T->SetBranchStatus(c->branch.c_str(), 1);
419
420 if (has_datatype)
421 {
422 T->SetBranchStatus("DataType.fVal", 1);
423 if (verbose>0)
424 cout << "Rows with DataType.fVal!=1 will be skipped." << endl;
425 }
426
427 // -------------------------------------------------------------------------
428
429 try
430 {
431 if (!force)
432 Database(uri).connected();
433 }
434 catch (const exception &e)
435 {
436 cerr << "SQL connection failed: " << e.what() << endl;
437 return 3;
438 }
439
440 // -------------------------------------------------------------------------
441
442 if (verbose>0)
443 cout << "\n--------------------------- Database Table -------------------------" << endl;
444
445 if (!primary.empty())
446 query += ",\n PRIMARY KEY USING BTREE (`"+boost::algorithm::join(primary, "`, `")+"`)\n";
447
448 query +=
449 ")\n"
450 "DEFAULT CHARSET=latin1 COLLATE=latin1_general_ci\n"
451 "ENGINE="+engine+"\n"
452 "COMMENT='created by "+conf.GetName()+"'\n";
453
454 // FIXME: Can we omit the catching to be able to print the
455 // query 'autmatically'?
456 try
457 {
458 if (drop)
459 {
460 // => Simple result
461 Database(uri).query("DROP TABLE `"+table+"`").execute();
462 if (verbose>0)
463 cout << "Table `" << table << "` dropped." << endl;
464 }
465 }
466 catch (const exception &e)
467 {
468 cerr << "DROP TABLE `" << table << "`\n\n";
469 cerr << "SQL query failed:\n" << e.what() << endl;
470 return 4;
471 }
472
473 try
474 {
475 if (create)
476 Database(uri).query(query).execute();
477 }
478 catch (const exception &e)
479 {
480 cerr << query << "\n\n";
481 cerr << "SQL query failed:\n" << e.what() << endl;
482 return 5;
483 }
484
485 if (print_create)
486 cout << query << endl;
487
488 if (create && verbose>0)
489 cout << "Table `" << table << "` created." << endl;
490
491 // -------------------------------------------------------------------------
492
493 if (verbose>0)
494 cout << "\n---------------------------- Reading file --------------------------" << endl;
495
496 //query = update ? "UPDATE" : "INSERT";
497 query = "INSERT `"+table+"`\n"
498 "(\n"
499 " `Night`";
500
501 for (auto c=vec.cbegin(); c!=vec.cend(); c++)
502 query += ",\n `"+c->column+"`";
503
504 query +=
505 "\n)\n"
506 "VALUES\n";
507
508 size_t count = 0;
509
510 const size_t num = max>0 && (max-first)<T->GetEntriesFast() ? (max-first) : T->GetEntriesFast();
511 for (size_t j=first; j<num; j++)
512 {
513 T->GetEntry(j);
514 if (has_datatype && datatype!=1)
515 continue;
516
517 query += "(\n 0";
518 for (auto c=vec.cbegin(); c!=vec.cend(); c++)
519 {
520 query += ",\n "+c->fmt();
521 if (print_insert)
522 query += " /* "+c->column+" -> "+c->branch+" */";
523 }
524 query += "\n)"; // ON DUPLICATE KEY UPDATE\n";
525
526 if (j!=num-1)
527 query += ",\n";
528
529 count ++;
530 }
531
532 if (verbose>0)
533 cout << count << " out of " << num << " rows read from file [N=" << first << ".." << num-1 << "]." << endl;
534
535 // -------------------------------------------------------------------------
536
537 if (verbose>0)
538 cout << "\n--------------------------- Inserting data -------------------------" << endl;
539
540 try
541 {
542 if (!noinsert)
543 // => Simple result
544 Database(uri).query(query).execute();
545 else
546 cout << "Insert query skipped!" << endl;
547 if (print_insert)
548 cout << query << endl;
549 }
550 catch (const exception &e)
551 {
552 cerr << query << "\n\n";
553 cerr << "SQL query failed:\n" << e.what() << endl;
554 return 6;
555 }
556
557 if (verbose>0)
558 {
559 cout << count << " rows inserted.\n\n";
560 cout << "Total execution time: " << Time().UnixTime()-start.UnixTime() << "s\n" << endl;
561 }
562
563 return 0;
564}
Note: See TracBrowser for help on using the repository browser.