source: trunk/FACT++/src/root2sql.cc@ 18960

Last change on this file since 18960 was 18928, checked in by tbretz, 7 years ago
First version of a tool to fill a SQL table with data from a root file.
File size: 18.2 KB
Line 
1#include <boost/algorithm/string/join.hpp>
2#include <boost/regex.hpp>
3
4#include "Database.h"
5
6#include "Time.h"
7#include "Configuration.h"
8
9#include <TROOT.h>
10#include <TFile.h>
11#include <TTree.h>
12#include <TLeaf.h>
13#include <TError.h>
14
15using namespace std;
16
17// ------------------------------------------------------------------------
18
19struct Map : pair<string, string>
20{
21 Map() { }
22};
23
24std::istream &operator>>(std::istream &in, Map &m)
25{
26 string txt(istreambuf_iterator<char>(in), {});
27
28 const boost::regex expr("((.*)[^\\\\])/(.*)");
29 boost::smatch match;
30 if (!boost::regex_match(txt, match, expr))
31 throw runtime_error("Could not evaluate map argument: "+txt);
32
33 m.first = match[1].str();
34 m.second = match[3].str();
35
36 return in;
37}
38
39
40void SetupConfiguration(Configuration &conf)
41{
42 po::options_description control("Root to SQL");
43 control.add_options()
44 ("uri,u", var<string>()->required(), "Database link as in\n\tuser:password@server[:port]/database.")
45 ("file", var<string>(""), "The root file to read from")
46 ("create", po_switch(), "Create the database if not existing")
47 ("drop", po_switch(), "Drop the table (implies create)")
48 ("tree,t", var<string>("Events"), "Name of the root tree to convert")
49 ("table", var<string>(""), "Name of the table to use (default is the tree name)")
50 ("map", vars<Map>(), "A regular expression which is applied to the leaf name befoee it is used as SQL column name)")
51 ("sql-type", vars<Map>(), "Allows to overwrite the calculated SQL type for a given column e.g. 'sql-column-name/UNSIGNED IN'")
52 ("ignore", vars<string>(), "Ignore the given leaf, if the given regular expression matches")
53 ("primary", vars<string>(), "List of columns to be used as primary keys during table creation (in connection with --create)")
54 ("first", var<int64_t>(int64_t(0)), "First event to start with (default: 0), mainly for test purpose")
55 ("max", var<int64_t>(int64_t(0)), "Maximum number of events to process (0: all), mainly for test purpose")
56 ("no-insert", po_switch(), "Does not insert any data into the table")
57 ("print-branches", po_switch(), "Print the branches found in the tree")
58 ("print-leaves", po_switch(), "Print the leaves found in the tree (this is what is processed)")
59 ("print-insert", po_switch(), "Print the INSERT query (note that it contains all data)")
60 ("print-create", po_switch(), "Print the CREATE query")
61 ("verbose,v", var<uint16_t>(1), "Verbosity (0: quiet, 1: default, 2: more, 3, ...)")
62 ;
63
64 po::positional_options_description p;
65 p.add("file", 1); // The 1st positional options (n=1)
66
67 conf.AddOptions(control);
68 conf.SetArgumentPositions(p);
69}
70
71void PrintUsage()
72{
73 cout <<
74 "root2sql - Fills the data from a root file into a database\n"
75 "\n"
76 "For convenience, this documentation uses the extended version of the options, "
77 "refer to the output below to get the abbreviations.\n"
78 "\n"
79 "This is a general purpose tool to fill the contents of a root file into a database "
80 "as long as this is technically possible and makes sense. Note that root can even "
81 "write complex data like a TH1F into a database, this is not the purpose of this "
82 "program.\n"
83 "\n"
84 "Each root tree has branches and leaves (the basic data types). These leaves can "
85 "be read independently of the classes which were used to write the root file. "
86 "The default tree to read from is 'Events' but the name can be overwritten "
87 "using --tree. The default table name to fill the data into is identical to "
88 "the tree name. It can be overwritten using --table.\n"
89 "\n"
90 "The name of each column to which data is filled from a leave is obtained from "
91 "the leaves' names. The leave names can be checked using --print-leaves. "
92 "A --print-branches exists for convenience to print only the high-level branches. "
93 "Sometimes these names might be quite unconvenient like MTime.fTime.fMilliSec or "
94 "just MHillas.fWidth. To allow to simplify column names, regular expressions "
95 "(using boost's regex) can be defined to change the names. Note that these regular "
96 "expressions are applied one by one on each leaf's name. A valid expression could "
97 "be:\n"
98 " --map=MHillas\\.f/\n"
99 "which would remove all occurances of 'MHillas.f'. This option can be used more than "
100 "once. They are applied in sequence. A single match does not stop the sequence.\n"
101 "\n"
102 "Sometimes it might also be convenient to skip a leaf. Tis can be done with "
103 "the --ignore resource. If the given regular expresion yield a match, the "
104 "leaf will be ignored. Note that the regular expression work on the raw-name "
105 "of the leaf not the readily mapped SQL column names. Example:\n"
106 " --ignore=ThetaSq\\..*\n"
107 "will skip all leaved which start with 'ThetaSq.'. This option can be used"
108 "more than once.\n"
109 "\n"
110 "The data type of each column is kept as close as possible to the leaves' data "
111 "types. If for some reason this is not wanted, the data type of the SQL column "
112 "can be overwritten with --sql-type sql-column/sql-ytpe, for example:\n"
113 " --sql-type=FileId/UNSIGNED INT\n"
114 "while the first argument of the name of the SQL column to which the data type "
115 "should be applied. The second column is the basic SQL data type. The option can "
116 "be given more than once.\n"
117 "\n"
118 "Database interaction:\n"
119 "\n"
120 "To drop an existing table, --drop can be used.\n"
121 "\n"
122 "To create a table according to theSQL column names and data types, --create "
123 "can be used. The query used can be printed with --print-create even --create "
124 "has not been specified.\n"
125 "\n"
126 "To choose the columns which should become primary keys, use --primary, "
127 "for exmaple:\n"
128 " --primary=col1\n"
129 "To define more than one column as primary key, the option can be given more than "
130 "once. Note that the combination of these columns must be unique.\n"
131 "\n"
132 "All columns are created as NOT NULL as default and the table is created as "
133 "MyISAM.\n"
134 "\n"
135 "For debugging purpose, or to just create or drop a table, the final insert "
136 "query can be skipped using --no-insert. Note that for performance reason, "
137 "all data is collected in memory and a single INSERT query is issued at the "
138 "end.\n"
139 "\n"
140 "Using a higher verbosity level, an overview of the written columns or all "
141 "processed leaves is printed depending on the verbosity level. The output looks "
142 "like the following\n"
143 " Leaf name [root data type] (SQL name)\n"
144 "for example\n"
145 " MTime.fTime.fMilliSec [Long64_t] (MilliSec)\n"
146 "which means that the leaf MTime.fTime.fMilliSec is detected to be a Long64_t "
147 "which is filled into a column called MilliSec. Leaves with non basic data types "
148 "are ignored automatically and are marked as (-n/a-). User ignored columnd "
149 "are marked as (-ignored-).\n"
150 "\n"
151 "In case of succes, 0 is returned, a value>0 otherwise.\n"
152 "\n"
153 "Usage: root2sql [options] rootfile.root\n"
154 "\n"
155 ;
156 cout << endl;
157}
158
159enum BasicType_t
160{
161 kNone = 0,
162 kFloat,
163 kDouble,
164 kInt16,
165 kUInt16,
166 kInt32,
167 kUInt32,
168 kInt64,
169 kUInt64,
170};
171
172static const map<string, pair<BasicType_t, string>> ConvRoot =
173{
174 { "Float_t", { kFloat, "FLOAT" } },
175 { "Double_t", { kDouble, "DOUBLE" } },
176 { "ULong64_t", { kUInt64, "BIGINT UNSIGNED" } },
177 { "Long64_t", { kInt64, "BIGINT" } },
178 { "UInt_t", { kUInt32, "INT UNSIGNED" } },
179 { "Int_t", { kInt32, "INT" } },
180 { "UShort_t", { kUInt16, "SMALLINT UNSIGNED" } },
181 { "Short_t", { kInt16, "SMALLINT" } },
182};
183
184struct Container
185{
186 string branch; // branch name
187 string column; // column name
188 BasicType_t type;
189 void *ptr;
190
191 Container(const string &b, const string &c, const BasicType_t &t) : branch(b), column(c), type(t), ptr(0)
192 {
193 switch (t)
194 {
195 case kFloat: ptr = new Float_t; break;
196 case kDouble: ptr = new Double_t; break;
197 case kInt16: ptr = new Short_t; break;
198 case kUInt16: ptr = new UShort_t; break;
199 case kInt32: ptr = new Int_t; break;
200 case kUInt32: ptr = new UInt_t; break;
201 case kInt64: ptr = new Long64_t; break;
202 case kUInt64: ptr = new ULong64_t; break;
203 case kNone:
204 break;
205 }
206 }
207 ~Container()
208 {
209 //::operator delete(ptr); // It seems root is deleting it already
210 }
211
212 string fmt() const
213 {
214 ostringstream str;
215
216 switch (type)
217 {
218 case kFloat: str << setprecision(8) << *reinterpret_cast<Float_t*>(ptr); break;
219 case kDouble: str << setprecision(16) << *reinterpret_cast<Double_t*>(ptr); break;
220 case kInt16: str << *reinterpret_cast<Short_t*>(ptr); break;
221 case kUInt16: str << *reinterpret_cast<UShort_t*>(ptr); break;
222 case kInt32: str << *reinterpret_cast<Int_t*>(ptr); break;
223 case kUInt32: str << *reinterpret_cast<UInt_t*>(ptr); break;
224 case kInt64: str << *reinterpret_cast<Long64_t*>(ptr); break;
225 case kUInt64: str << *reinterpret_cast<ULong64_t*>(ptr); break;
226 case kNone:
227 break;
228 }
229
230 return str.str();
231 }
232};
233
234void ErrorHandlerAll(Int_t level, Bool_t abort, const char *location, const char *msg)
235{
236 if (string(msg).substr(0,24)=="no dictionary for class ")
237 return;
238
239 DefaultErrorHandler(level, abort, location, msg);
240}
241
242int main(int argc, const char* argv[])
243{
244 Time start;
245
246 gROOT->SetBatch();
247 SetErrorHandler(ErrorHandlerAll);
248
249 Configuration conf(argv[0]);
250 conf.SetPrintUsage(PrintUsage);
251 SetupConfiguration(conf);
252
253 if (!conf.DoParse(argc, argv))
254 return 127;
255
256 // ----------------------------- Evaluate options --------------------------
257 const string uri = conf.Get<string>("uri");
258 const string file = conf.Get<string>("file");
259 const string tree = conf.Get<string>("tree");
260 const string table = conf.Get<string>("table").empty() ? tree : conf.Get<string>("table");
261
262 const uint16_t verbose = conf.Get<uint16_t>("verbose");
263 const int64_t first = conf.Get<int64_t>("first");
264 const int64_t max = conf.Get<int64_t>("max");
265
266 const bool drop = conf.Get<bool>("drop");
267 const bool create = conf.Get<bool>("create") || drop;
268 const bool noinsert = conf.Get<bool>("no-insert");
269
270 const bool print_branches = conf.Get<bool>("print-branches");
271 const bool print_leaves = conf.Get<bool>("print-leaves");
272 const bool print_create = conf.Get<bool>("print-create");
273 const bool print_insert = conf.Get<bool>("print-insert");
274
275 const vector<Map> mymap = conf.Vec<Map>("map");
276 const vector<Map> sqltypes = conf.Vec<Map>("sql-type");
277 const vector<string> ignore = conf.Vec<string>("ignore");
278 const vector<string> primary = conf.Vec<string>("primary");
279
280 // -------------------------------------------------------------------------
281
282 if (verbose>0)
283 cout << "\n-------------------------- Evaluating file -------------------------" << endl;
284
285 TFile f(file.c_str());
286 if (f.IsZombie())
287 {
288 cerr << "Could not open file " << file << endl;
289 return 1;
290 }
291
292 if (verbose>0)
293 cout << "File: " << file << endl;
294
295 TTree *T = 0;
296 f.GetObject(tree.c_str(), T);
297 if (!T)
298 {
299 cerr << "Could not open tree " << tree << endl;
300 return 2;
301 }
302
303 if (verbose>0)
304 cout << "Tree: " << tree << endl;
305
306 T->SetMakeClass(1);
307
308 TObjArray *branches = T->GetListOfBranches();
309 TObjArray *leaves = T->GetListOfLeaves();
310
311 if (print_branches)
312 {
313 cout << '\n';
314 branches->Print();
315 }
316
317 if (verbose>0)
318 cout << T->GetEntriesFast() << " events found." << endl;
319
320
321 if (verbose>0)
322 cout << branches->GetEntries() << " branches found." << endl;
323
324 if (print_leaves)
325 {
326 cout << '\n';
327 leaves->Print();
328 }
329 if (verbose>0)
330 cout << leaves->GetEntries() << " leaves found." << endl;
331
332 string query =
333 "CREATE TABLE IF NOT EXISTS `"+table+"`\n"
334 "(\n"
335 " `Night` SMALLINT UNSIGNED NOT NULL";
336
337 vector<Container> vec;
338
339
340 TIter Next(leaves);
341 TObject *o = 0;
342 while ((o=Next()))
343 {
344 TLeaf *L = T->GetLeaf(o->GetName());
345
346 if (verbose>2)
347 cout << '\n' << o->GetName() << " [" << L->GetTypeName() << "]";
348
349 string name = o->GetName();
350
351
352 bool found = false;
353 for (const auto &b: ignore)
354 {
355 if (boost::regex_match(name, boost::regex(b)))
356 {
357 found = true;
358 if (verbose>2)
359 cout << " (-ignored-)";
360 break;
361 }
362 }
363 if (found)
364 continue;
365
366 const string tn = L->GetTypeName();
367
368 auto it = ConvRoot.find(tn);
369 if (it==ConvRoot.end())
370 {
371 if (verbose>2)
372 cout << " (-n/a-)";
373 continue;
374 }
375
376 if (verbose==2)
377 cout << '\n' << o->GetName() << " [" << L->GetTypeName() << "]";
378
379 for (const auto &m: mymap)
380 name = boost::regex_replace(name, boost::regex(m.first), m.second);
381
382 if (verbose>1)
383 cout << " (" << name << ")";
384
385 string sqltype = it->second.second;
386
387 for (const auto &m: sqltypes)
388 if (m.first==name)
389 sqltype = m.second;
390
391 query += ",\n `"+name+"` "+sqltype+" NOT NULL COMMENT '"+o->GetName()+"'";
392
393 vec.emplace_back(o->GetName(), name, it->second.first);
394 T->SetBranchAddress(o->GetName(), vec.back().ptr);
395 }
396
397 if (verbose>1)
398 cout << "\n\n";
399 if (verbose>0)
400 cout << vec.size() << " leaves setup for reading." << endl;
401
402 UInt_t datatype = 0;
403 const bool has_datatype = T->SetBranchAddress("DataType.fVal", &datatype) >= 0;
404
405 // Setiing up branch status (must be after all SetBranchAddress)
406 T->SetBranchStatus("*", 0);
407 for (const auto &c: vec)
408 T->SetBranchStatus(c.branch.c_str(), 1);
409
410 if (has_datatype)
411 {
412 T->SetBranchStatus("DataType.fVal", 1);
413 if (verbose>0)
414 cout << "Rows with DataType.fVal!=1 will be skipped." << endl;
415 }
416
417 // -------------------------------------------------------------------------
418
419 if (verbose>0)
420 cout << "\n--------------------------- Database Table -------------------------" << endl;
421
422 if (!primary.empty())
423 query += ",\n PRIMARY KEY USING BTREE (`"+boost::algorithm::join(primary, "`, `")+"`)\n";
424
425 query +=
426 ")\n"
427 "DEFAULT CHARSET=latin1 COLLATE=latin1_general_ci\n"
428 "ENGINE=MyISAM\n"
429 "COMMENT='created by "+conf.GetName()+"'\n";
430
431 // FIXME: Can we omit the catching to be able to print the
432 // query 'autmatically'?
433 try
434 {
435 if (drop)
436 {
437 // => Simple result
438 Database(uri).query("DROP TABLE `"+table+"`").execute();
439 if (verbose>0)
440 cout << "Table `" << table << "` dropped." << endl;
441 }
442 }
443 catch (const exception &e)
444 {
445 cerr << "DROP TABLE `" << table << "`\n\n";
446 cerr << "SQL query failed:\n" << e.what() << endl;
447 return 3;
448 }
449
450 try
451 {
452 if (create)
453 Database(uri).query(query).execute();
454 }
455 catch (const exception &e)
456 {
457 cerr << query << "\n\n";
458 cerr << "SQL query failed:\n" << e.what() << endl;
459 return 4;
460 }
461
462 if (print_create)
463 cout << query << endl;
464
465 if (create && verbose>0)
466 cout << "Table `" << table << "` created." << endl;
467
468 // -------------------------------------------------------------------------
469
470 if (verbose>0)
471 cout << "\n---------------------------- Reading file --------------------------" << endl;
472
473 //query = update ? "UPDATE" : "INSERT";
474 query = "INSERT `"+table+"`\n"
475 "(\n"
476 " `Night`";
477
478 for (const auto &c: vec)
479 query += ",\n `"+c.column+"`";
480
481 query +=
482 "\n)\n"
483 "VALUES\n";
484
485 size_t count = 0;
486
487 const size_t num = max>0 && (max-first)<T->GetEntriesFast() ? (max-first) : T->GetEntriesFast();
488 for (size_t j=first; j<num; j++)
489 {
490 T->GetEntry(j);
491 if (has_datatype && datatype!=1)
492 continue;
493
494 query += "(\n 0";
495 for (const auto &c: vec)
496 query += ",\n "+c.fmt()+" /* "+c.column+" -> "+c.branch+" */";
497 query += "\n)"; // ON DUPLICATE KEY UPDATE\n";
498
499 if (j!=num-1)
500 query += ",\n";
501
502 count ++;
503 }
504
505 if (verbose>0)
506 cout << count << " out of " << num << " rows read from file [N=" << first << ".." << num-1 << "]." << endl;
507
508 // -------------------------------------------------------------------------
509
510 if (verbose>0)
511 cout << "\n--------------------------- Inserting data -------------------------" << endl;
512
513 try
514 {
515 if (!noinsert)
516 // => Simple result
517 Database(uri).query(query).execute();
518 else
519 cout << "Insert query skipped!" << endl;
520 if (print_insert)
521 cout << query << endl;
522 }
523 catch (const exception &e)
524 {
525 cerr << query << "\n\n";
526 cerr << "SQL query failed:\n" << e.what() << endl;
527 return 5;
528 }
529
530 if (verbose>0)
531 {
532 cout << count << " rows inserted.\n\n";
533 cout << "Total execution time: " << Time().UnixTime()-start.UnixTime() << "s\n" << endl;
534 }
535
536 return 0;
537}
Note: See TracBrowser for help on using the repository browser.