source: trunk/FACT++/src/fits2sql.cc@ 19100

Last change on this file since 19100 was 19099, checked in by tbretz, 6 years ago
Added fits2sql -- maybe it can be unified with root2sql?
File size: 20.2 KB
Line 
1#include <boost/algorithm/string/join.hpp>
2#include <boost/regex.hpp>
3
4#include "Database.h"
5
6#include "Time.h"
7#include "Configuration.h"
8
9#include "zfits.h"
10
11/*
12#include <TROOT.h>
13#include <TFile.h>
14#include <TTree.h>
15#include <TLeaf.h>
16#include <TError.h>
17*/
18
19using namespace std;
20
21// ------------------------------------------------------------------------
22
23struct Map : pair<string, string>
24{
25 Map() { }
26};
27
28std::istream &operator>>(std::istream &in, Map &m)
29{
30 const istreambuf_iterator<char> eos;
31 string txt(istreambuf_iterator<char>(in), eos);
32
33 const boost::regex expr("((.*)[^\\\\])/(.*)");
34 boost::smatch match;
35 if (!boost::regex_match(txt, match, expr))
36 throw runtime_error("Could not evaluate map argument: "+txt);
37
38 m.first = match[1].str();
39 m.second = match[3].str();
40
41 return in;
42}
43
44
45void SetupConfiguration(Configuration &conf)
46{
47 po::options_description control("Root to SQL");
48 control.add_options()
49 ("uri,u", var<string>()
50#if BOOST_VERSION >= 104200
51 ->required()
52#endif
53 , "Database link as in\n\tuser:password@server[:port]/database.")
54 ("file", var<string>(""), "The root file to read from")
55 ("force", po_switch(), "Force processing even if there is no database connection")
56 ("create", po_switch(), "Create the database if not existing")
57 ("drop", po_switch(), "Drop the table (implies create)")
58 ("extension,e", var<string>(""), "Name of the fits extension (table) to convert")
59 ("table", var<string>(""), "Name of the table to use (default is the tree name)")
60 ("map", vars<Map>(), "A regular expression which is applied to the leaf name befoee it is used as SQL column name)")
61 ("sql-type", vars<Map>(), "Allows to overwrite the calculated SQL type for a given column e.g. 'sql-column-name/UNSIGNED IN'")
62 ("ignore", vars<string>(), "Ignore the given leaf, if the given regular expression matches")
63 ("primary", vars<string>(), "List of columns to be used as primary keys during table creation (in connection with --create)")
64 ("first", var<int64_t>(int64_t(0)), "First event to start with (default: 0), mainly for test purpose")
65 ("max", var<int64_t>(int64_t(0)), "Maximum number of events to process (0: all), mainly for test purpose")
66 ("engine", var<string>("InnoDB"), "Database engine to be used when a new table is created")
67 ;
68
69 po::options_description debug("Debug options");
70 debug.add_options()
71 ("no-insert", po_switch(), "Does not insert any data into the table")
72 ("dry-run", po_switch(), "Skip any query which changes the databse (might result in consecutive failures)")
73 ("print-extensions", po_switch(), "Print extensions (tables) from fits file")
74 ("print-columns", po_switch(), "Print columns in fits table")
75 ("print-insert", po_switch(), "Print the INSERT query (note that it contains all data)")
76 ("print-create", po_switch(), "Print the CREATE query")
77 ("verbose,v", var<uint16_t>(1), "Verbosity (0: quiet, 1: default, 2: more, 3, ...)")
78 ;
79
80 po::positional_options_description p;
81 p.add("file", 1); // The 1st positional options (n=1)
82
83 conf.AddOptions(control);
84 conf.AddOptions(debug);
85 conf.SetArgumentPositions(p);
86}
87
88void PrintUsage()
89{
90 cout <<
91 "fits2sql - Fills the data from a fits file into a database\n"
92 "\n"
93 "For convenience, this documentation uses the extended version of the options, "
94 "refer to the output below to get the abbreviations.\n"
95 "\n"
96 "This is a general purpose tool to fill the contents of a root file into a database "
97 "as long as this is technically possible and makes sense. Note that root can even "
98 "write complex data like a TH1F into a database, this is not the purpose of this "
99 "program.\n"
100 "\n"
101 "Each root tree has branches and leaves (the basic data types). These leaves can "
102 "be read independently of the classes which were used to write the root file. "
103 "The default tree to read from is 'Events' but the name can be overwritten "
104 "using --tree. The default table name to fill the data into is identical to "
105 "the tree name. It can be overwritten using --table.\n"
106 "\n"
107 "The name of each column to which data is filled from a leave is obtained from "
108 "the leaves' names. The leave names can be checked using --print-leaves. "
109 "A --print-branches exists for convenience to print only the high-level branches. "
110 "Sometimes these names might be quite unconvenient like MTime.fTime.fMilliSec or "
111 "just MHillas.fWidth. To allow to simplify column names, regular expressions "
112 "(using boost's regex) can be defined to change the names. Note that these regular "
113 "expressions are applied one by one on each leaf's name. A valid expression could "
114 "be:\n"
115 " --map=MHillas\\.f/\n"
116 "which would remove all occurances of 'MHillas.f'. This option can be used more than "
117 "once. They are applied in sequence. A single match does not stop the sequence.\n"
118 "\n"
119 "Sometimes it might also be convenient to skip a leaf. This can be done with "
120 "the --ignore resource. If the given regular expresion yield a match, the "
121 "leaf will be ignored. Note that the regular expression work on the raw-name "
122 "of the leaf not the readily mapped SQL column names. Example:\n"
123 " --ignore=ThetaSq\\..*\n"
124 "will skip all leaved which start with 'ThetaSq.'. This option can be used"
125 "more than once.\n"
126 "\n"
127 "The data type of each column is kept as close as possible to the leaves' data "
128 "types. If for some reason this is not wanted, the data type of the SQL column "
129 "can be overwritten with --sql-type sql-column/sql-ytpe, for example:\n"
130 " --sql-type=FileId/UNSIGNED INT\n"
131 "while the first argument of the name of the SQL column to which the data type "
132 "should be applied. The second column is the basic SQL data type. The option can "
133 "be given more than once.\n"
134 "\n"
135 "Database interaction:\n"
136 "\n"
137 "To drop an existing table, --drop can be used.\n"
138 "\n"
139 "To create a table according to theSQL column names and data types, --create "
140 "can be used. The query used can be printed with --print-create even --create "
141 "has not been specified.\n"
142 "\n"
143 "To choose the columns which should become primary keys, use --primary, "
144 "for exmaple:\n"
145 " --primary=col1\n"
146 "To define more than one column as primary key, the option can be given more than "
147 "once. Note that the combination of these columns must be unique.\n"
148 "\n"
149 "All columns are created as NOT NULL as default and the table is created as "
150 "MyISAM (default).\n"
151 "\n"
152 "For debugging purpose, or to just create or drop a table, the final insert "
153 "query can be skipped using --no-insert. Note that for performance reason, "
154 "all data is collected in memory and a single INSERT query is issued at the "
155 "end.\n"
156 "\n"
157 "Using a higher verbosity level (-v), an overview of the written columns or all "
158 "processed leaves is printed depending on the verbosity level. The output looks "
159 "like the following\n"
160 " Leaf name [root data type] (SQL name)\n"
161 "for example\n"
162 " MTime.fTime.fMilliSec [Long64_t] (MilliSec)\n"
163 "which means that the leaf MTime.fTime.fMilliSec is detected to be a Long64_t "
164 "which is filled into a column called MilliSec. Leaves with non basic data types "
165 "are ignored automatically and are marked as (-n/a-). User ignored columnd "
166 "are marked as (-ignored-).\n"
167 "\n"
168 "If a query failed, the query is printed to stderr together with the error message. "
169 "For the main INSERT query, this is only true if the verbosity level is at least 2 "
170 "or the query has less than 80*25 bytes.\n"
171 "\n"
172 "In case of succes, 0 is returned, a value>0 otherwise.\n"
173 "\n"
174 "Usage: fits2sql [options] rootfile.root\n"
175 "\n"
176 ;
177 cout << endl;
178}
179
180enum BasicType_t
181{
182 kNone = 0,
183 kVarchar,
184 kBool,
185 kFloat,
186 kDouble,
187 kInt8,
188 kInt16,
189 kInt32,
190 kInt64,
191// kMJD,
192};
193
194static const map<char, pair<BasicType_t, string>> ConvFits =
195{
196 { 'A', { kVarchar, "VARCHAR" } },
197 { 'L', { kBool, "BOOLEAN" } },
198 { 'B', { kInt8, "TINYINT" } },
199 { 'I', { kInt16, "SMALLINT" } },
200 { 'J', { kInt32, "INT" } },
201 { 'K', { kInt64, "BIGINT" } },
202 { 'E', { kFloat, "FLOAT" } },
203 { 'D', { kDouble, "DOUBLE" } },
204};
205
206struct Container
207{
208 string branch; // fits column name
209 string column; // sql column name
210 BasicType_t type;
211 size_t num;
212// double offset;
213 void *ptr;
214
215 Container(const string &b, const string &c, const BasicType_t &t, const size_t &n/*, const double &offset=0*/) : branch(b), column(c), type(t), num(n), ptr(0)
216 {
217 }
218 ~Container()
219 {
220 }
221
222 string fmt(const size_t &index) const
223 {
224 ostringstream str;
225
226 switch (type)
227 {
228 //case kVarchar: str << string(reinterpret_cast<char*>(ptr), num); break;
229 case kVarchar: str << string(reinterpret_cast<char*>(ptr), num).c_str(); break;
230 case kFloat: str << setprecision(8) << reinterpret_cast<float*>(ptr)[index]; break;
231// case kMJD: str << setprecision(16) << reinterpret_cast<double*>(ptr)[0]+offset; break;
232 case kDouble: str << setprecision(16) << reinterpret_cast<double*>(ptr)[index]; break;
233 case kBool:
234 case kInt8: str << int32_t(reinterpret_cast<int8_t*>(ptr)[index]); break;
235 case kInt16: str << reinterpret_cast<int16_t*>(ptr)[index]; break;
236 case kInt32: str << reinterpret_cast<int32_t*>(ptr)[index]; break;
237 case kInt64: str << reinterpret_cast<int64_t*>(ptr)[index]; break;
238 case kNone:
239 break;
240 }
241
242 return str.str();
243 }
244};
245
246int main(int argc, const char* argv[])
247{
248 Time start;
249
250 Configuration conf(argv[0]);
251 conf.SetPrintUsage(PrintUsage);
252 SetupConfiguration(conf);
253
254 if (!conf.DoParse(argc, argv))
255 return 127;
256
257 // ----------------------------- Evaluate options --------------------------
258 const string uri = conf.Get<string>("uri");
259 const string file = conf.Get<string>("file");
260 const string extension = conf.Get<string>("extension");
261 string table = conf.Get<string>("table");
262
263 const uint16_t verbose = conf.Get<uint16_t>("verbose");
264 const int64_t first = conf.Get<int64_t>("first");
265 const int64_t max = conf.Get<int64_t>("max");
266
267 const bool force = conf.Get<bool>("force");
268 const bool drop = conf.Get<bool>("drop");
269 const bool create = conf.Get<bool>("create") || drop;
270 const bool noinsert = conf.Get<bool>("no-insert");
271 const bool dry_run = conf.Get<bool>("dry-run");
272
273 const string engine = conf.Get<string>("engine");
274
275 const bool print_extensions = conf.Get<bool>("print-extensions");
276 const bool print_columns = conf.Get<bool>("print-columns");
277 const bool print_create = conf.Get<bool>("print-create");
278 const bool print_insert = conf.Get<bool>("print-insert");
279
280 const vector<Map> mymap = conf.Vec<Map>("map");
281 const vector<Map> sqltypes = conf.Vec<Map>("sql-type");
282 const vector<string> _ignore = conf.Vec<string>("ignore");
283 const vector<string> primary = conf.Vec<string>("primary");
284
285 // -------------------------------------------------------------------------
286
287 if (verbose>0)
288 cout << "\n-------------------------- Evaluating file -------------------------" << endl;
289
290 zfits f(file.c_str(), extension.c_str());
291 if (!f)
292 {
293 cerr << "Could not open file " << file << ": " << strerror(errno) << endl;
294 return 1;
295 }
296
297 if (verbose>0)
298 cout << "File: " << file << endl;
299
300 if (!extension.empty() && extension!=f.Get<string>("EXTNAME"))
301 {
302 cerr << "Extension " << extension << " not found in file." << endl;
303 return 2;
304 }
305
306 if (print_extensions)
307 {
308 cout << "\nTables:\n - " << boost::join(f.GetTables(), "\n - ") << '\n' << endl;
309 return 2;
310 }
311
312 if (verbose>0)
313 cout << "FITS extension [table]: " << f.Get<string>("EXTNAME") << endl;
314
315 if (table.empty())
316 table = f.Get<string>("EXTNAME");
317
318 if (verbose>0)
319 cout << "SQL table: " << table << endl;
320
321// const double mjdref = f.Get("MJDREF", double(0));
322
323 if (print_columns)
324 {
325 cout << '\n';
326 f.PrintColumns();
327 cout << '\n';
328 }
329
330 const auto cols = f.GetColumns();
331
332 if (verbose>0)
333 {
334 cout << f.GetNumRows() << " events found." << endl;
335 cout << cols.size() << " columns found." << endl;
336 }
337
338 string query =
339 "CREATE TABLE IF NOT EXISTS `"+table+"`\n"
340 "(\n";
341
342 vector<Container> vec;
343
344 for (const auto &ic : cols)
345 {
346 const auto &col = ic.second;
347
348 if (verbose>2)
349 cout << '\n' << col.type << " " << ic.first << "[" << col.num << "]";
350
351 string name = ic.first;
352
353 bool found = false;
354 for (auto b=_ignore.cbegin(); b!=_ignore.cend(); b++)
355 {
356 if (boost::regex_match(name, boost::regex(*b)))
357 {
358 found = true;
359 if (verbose>2)
360 cout << " (-ignored-)";
361 break;
362 }
363 }
364 if (found)
365 continue;
366
367 const char tn = col.type;
368
369 auto it = ConvFits.find(tn);
370 if (it==ConvFits.end())
371 {
372 if (verbose>2)
373 cout << " (-n/a-)";
374 continue;
375 }
376
377 if (verbose==2)
378 cout << '\n' << name << " [" << tn << "]";
379
380 for (auto m=mymap.cbegin(); m!=mymap.cend(); m++)
381 name = boost::regex_replace(name, boost::regex(m->first), m->second);
382
383 if (verbose>1)
384 cout << " (" << name << ")";
385
386 string sqltype = it->second.second;
387
388 for (auto m=sqltypes.cbegin(); m!=sqltypes.cend(); m++)
389 if (m->first==name)
390 sqltype = m->second;
391
392 if (!vec.empty())
393 query += ",\n";
394
395 const size_t N = col.type=='A' ? 1 : col.num;
396 for (int i=0; i<N; i++)
397 {
398 query += " `"+name;
399 if (N>1)
400 query += "["+to_string(i)+"]";
401 query += "` "+sqltype;
402 if (col.type=='A')
403 query += '('+to_string(col.num)+')';
404 query += " NOT NULL COMMENT '"+ic.first;
405 if (!col.unit.empty())
406 query += "["+col.unit+"]";
407 query += ": "+col.comment+"'";
408 }
409
410 const BasicType_t bt =
411 /*ic.first=="Time" && col.num==1 && col.unit=="MJD" && it->second.first==kDouble ?
412 kMJD :*/ it->second.first;
413
414 vec.emplace_back(ic.first, name, bt, col.num/*, mjdref*/);
415 vec.back().ptr = f.SetPtrAddress(ic.first);
416 }
417
418 if (verbose>1)
419 cout << "\n\n";
420 if (verbose>0)
421 cout << vec.size() << " columns setup for reading." << endl;
422
423 // -------------------------------------------------------------------------
424 // Checking for database connection
425
426 try
427 {
428 if (!force)
429 Database(uri).connected();
430 }
431 catch (const exception &e)
432 {
433 cerr << "SQL connection failed: " << e.what() << endl;
434 return 3;
435 }
436
437 // -------------------------------------------------------------------------
438
439 if (verbose>0)
440 cout << "\n--------------------------- Database Table -------------------------" << endl;
441
442 if (!primary.empty())
443 query += ",\n PRIMARY KEY USING BTREE (`"+boost::algorithm::join(primary, "`, `")+"`)\n";
444
445 query +=
446 ")\n"
447 "DEFAULT CHARSET=latin1 COLLATE=latin1_general_ci\n"
448 "ENGINE="+engine+"\n"
449 "COMMENT='created by "+conf.GetName()+"'\n";
450
451
452 // FIXME: Can we omit the catching to be able to print the
453 // query 'autmatically'?
454 try
455 {
456 if (drop)
457 {
458 // => Simple result
459 if (!dry_run)
460 Database(uri).query("DROP TABLE `"+table+"`").execute();
461 if (verbose>0)
462 cout << "Table `" << table << "` dropped." << endl;
463 }
464 }
465 catch (const exception &e)
466 {
467 cerr << "DROP TABLE `" << table << "`\n\n";
468 cerr << "SQL query failed:\n" << e.what() << endl;
469 return 4;
470 }
471
472 try
473 {
474 if (create && !dry_run)
475 Database(uri).query(query).execute();
476 }
477 catch (const exception &e)
478 {
479 cerr << query << "\n\n";
480 cerr << "SQL query failed:\n" << e.what() << endl;
481 return 5;
482 }
483
484 if (print_create)
485 cout << query << endl;
486
487 if (create && verbose>0)
488 cout << "Table `" << table << "` created." << endl;
489
490 // -------------------------------------------------------------------------
491
492 if (verbose>0)
493 cout << "\n---------------------------- Reading file --------------------------" << endl;
494
495 //query = update ? "UPDATE" : "INSERT";
496 query = "INSERT `"+table+"`\n"
497 "(\n";
498
499 for (auto c=vec.cbegin(); c!=vec.cend(); c++)
500 {
501 const size_t N = c->type==kVarchar ? 1 : c->num;
502 for (int i=0; i<N; i++)
503 {
504 if (c!=vec.cbegin())
505 query += ",\n";
506
507 if (N==1)
508 query += " `"+c->column+"`";
509 else
510 query += " `"+c->column+"["+to_string(i)+"]`";
511 }
512 }
513
514 query +=
515 "\n)\n"
516 "VALUES\n";
517
518 size_t count = 0;
519
520 const size_t num = max>0 && (max-first)<f.GetNumRows() ? (max-first) : f.GetNumRows();
521 for (size_t j=first; j<num; j++)
522 {
523 f.GetRow(j);
524
525 if (count>0)
526 query += ",\n";
527
528 query += "(\n";
529
530 for (auto c=vec.cbegin(); c!=vec.cend(); c++)
531 {
532 const size_t N = c->type==kVarchar ? 1 : c->num;
533 for (int i=0; i<N; i++)
534 {
535 if (c!=vec.cbegin())
536 query += ",\n";
537
538 if (c->type==kVarchar)
539 query += " '"+c->fmt(i)+"'";
540 else
541 query += " "+c->fmt(i);
542
543 if (print_insert && i==0)
544 query += " /* "+c->column+" -> "+c->branch+" */";
545 }
546 }
547 query += "\n)"; // ON DUPLICATE KEY UPDATE\n";
548
549 count ++;
550 }
551
552 if (verbose>0)
553 cout << count << " out of " << num << " row(s) read from file [N=" << first << ".." << num-1 << "]." << endl;
554
555 if (count==0)
556 {
557 if (verbose>0)
558 cout << "Total execution time: " << Time().UnixTime()-start.UnixTime() << "s\n" << endl;
559 return 0;
560 }
561
562 // -------------------------------------------------------------------------
563
564 if (verbose>0)
565 {
566 cout << "\n--------------------------- Inserting data -------------------------" << endl;
567 cout << "Sending INSERT query (" << query.length() << " bytes)" << endl;
568 }
569
570 try
571 {
572 if (!noinsert && !dry_run)
573 // => Simple result
574 Database(uri).query(query).execute();
575 else
576 cout << "Insert query skipped!" << endl;
577
578 if (print_insert)
579 cout << query << endl;
580 }
581 catch (const exception &e)
582 {
583 if (verbose>1 || query.length()<80*25)
584 cerr << query << "\n\n";
585 cerr << "SQL query failed (" << query.length() << " bytes):\n" << e.what() << endl;
586 return 6;
587 }
588
589 if (verbose>0)
590 {
591 cout << count << " row(s) inserted.\n\n";
592 cout << "Total execution time: " << Time().UnixTime()-start.UnixTime() << "s\n" << endl;
593 }
594
595 return 0;
596}
Note: See TracBrowser for help on using the repository browser.