source: trunk/FACT++/src/fits2sql.cc@ 19105

Last change on this file since 19105 was 19105, checked in by tbretz, 6 years ago
Some more cosmetics to the output.
File size: 21.8 KB
Line 
1#include <boost/algorithm/string/join.hpp>
2#include <boost/regex.hpp>
3
4#include "Database.h"
5
6#include "Time.h"
7#include "Configuration.h"
8
9#include "zfits.h"
10
11/*
12#include <TROOT.h>
13#include <TFile.h>
14#include <TTree.h>
15#include <TLeaf.h>
16#include <TError.h>
17*/
18
19using namespace std;
20
21// ------------------------------------------------------------------------
22
23struct Map : pair<string, string>
24{
25 Map() { }
26};
27
28std::istream &operator>>(std::istream &in, Map &m)
29{
30 const istreambuf_iterator<char> eos;
31 string txt(istreambuf_iterator<char>(in), eos);
32
33 const boost::regex expr("((.*)[^\\\\])/(.*)");
34 boost::smatch match;
35 if (!boost::regex_match(txt, match, expr))
36 throw runtime_error("Could not evaluate map argument: "+txt);
37
38 m.first = match[1].str();
39 m.second = match[3].str();
40
41 return in;
42}
43
44
45void SetupConfiguration(Configuration &conf)
46{
47 po::options_description control("Root to SQL");
48 control.add_options()
49 ("uri,u", var<string>()
50#if BOOST_VERSION >= 104200
51 ->required()
52#endif
53 , "Database link as in\n\tuser:password@server[:port]/database.")
54 ("file", var<string>(""), "The root file to read from")
55 ("force", po_switch(), "Force processing even if there is no database connection")
56 ("create", po_switch(), "Create the database if not existing")
57 ("drop", po_switch(), "Drop the table (implies create)")
58 ("extension,e", var<string>(""), "Name of the fits extension (table) to convert")
59 ("table", var<string>(""), "Name of the table to use (default is the tree name)")
60 ("map", vars<Map>(), "A regular expression which is applied to the leaf name befoee it is used as SQL column name)")
61 ("sql-type", vars<Map>(), "Allows to overwrite the calculated SQL type for a given column e.g. 'sql-column-name/UNSIGNED IN'")
62 ("ignore", vars<string>(), "Ignore the given leaf, if the given regular expression matches")
63 ("primary", vars<string>(), "List of columns to be used as primary keys during table creation (in connection with --create)")
64 ("first", var<int64_t>(int64_t(0)), "First event to start with (default: 0), mainly for test purpose")
65 ("max", var<int64_t>(int64_t(0)), "Maximum number of events to process (0: all), mainly for test purpose")
66 ("engine", var<string>("InnoDB"), "Database engine to be used when a new table is created")
67 ("duplicate", var<string>(""), "Specifies an assignment_list for an 'ON DUPLICATE KEY UPDATE' expression")
68 ("ignore-errors", po_switch(), "Adds the IGNORE keyword to the INSERT query (turns errors into warnings, ignores rows with errors)")
69 ;
70
71 po::options_description debug("Debug options");
72 debug.add_options()
73 ("no-insert", po_switch(), "Does not insert any data into the table")
74 ("dry-run", po_switch(), "Skip any query which changes the databse (might result in consecutive failures)")
75 ("print-extensions", po_switch(), "Print extensions (tables) from fits file")
76 ("print-columns", po_switch(), "Print columns in fits table")
77 ("print-insert", po_switch(), "Print the INSERT query (note that it contains all data)")
78 ("print-create", po_switch(), "Print the CREATE query")
79 ("verbose,v", var<uint16_t>(1), "Verbosity (0: quiet, 1: default, 2: more, 3, ...)")
80 ;
81
82 po::positional_options_description p;
83 p.add("file", 1); // The 1st positional options (n=1)
84
85 conf.AddOptions(control);
86 conf.AddOptions(debug);
87 conf.SetArgumentPositions(p);
88}
89
90void PrintUsage()
91{
92 cout <<
93 "fits2sql - Fills the data from a fits file into a database\n"
94 "\n"
95 "For convenience, this documentation uses the extended version of the options, "
96 "refer to the output below to get the abbreviations.\n"
97 "\n"
98 "This is a general purpose tool to fill the contents of a fite file into a database.\n"
99 "\n"
100 "Each fits file contians several table, so called extenbsions. Each tables has "
101 "a number of columns compilsed from basic data types. The default extension to "
102 "read from is the first one on the file but the name can be overwritten using "
103 "--extension. The default table name to fill the data into is identical to "
104 "the extension name. It can be overwritten using --table.\n"
105 "\n"
106 "The name of each column to which data is filled is obtained from "
107 "the fits column names. The column names can be checked using --print-columns. "
108 "Sometimes these names might not be convenient. To allow to simplify or replace "
109 "column names, regular expressions (using boost's regex) can be defined to change "
110 "the names. Note that these regular expressions are applied one by one on each "
111 "columns's name. A valid expression could "
112 "be:\n"
113 " --map=MHillas\\.f/\n"
114 "which would remove all occurances of 'MHillas.f'. This option can be used more than "
115 "once. They are applied in sequence. A single match does not stop the sequence.\n"
116 "\n"
117 "Sometimes it might also be convenient to skip a column. This can be done with "
118 "the --ignore resource. If the given regular expresion yields a match, the "
119 "column will be ignored. Note that the regular expression works on the raw-name "
120 "of the column not the readily mapped SQL column names. Example:\n"
121 " --ignore=ThetaSq\\..*\n"
122 "will skip all leaved which start with 'ThetaSq.'. This option can be used"
123 "more than once.\n"
124 "\n"
125 "The data type of each column is kept as close as possible to the columns' data "
126 "types. If for some reason this is not wanted, the data type of the SQL column "
127 "can be overwritten with --sql-type sql-column/sql-ytpe, for example:\n"
128 " --sql-type=FileId/UNSIGNED INT\n"
129 "while the first argument of the name of the SQL column to which the data type "
130 "should be applied. The second column is the basic SQL data type. The option can "
131 "be given more than once.\n"
132 "\n"
133 "Database interaction:\n"
134 "\n"
135 "To drop an existing table, --drop can be used.\n"
136 "\n"
137 "To create a table according to theSQL column names and data types, --create "
138 "can be used. The query used can be printed with --print-create even --create "
139 "has not been specified.\n"
140 "\n"
141 "To choose the columns which should become primary keys, use --primary, "
142 "for exmaple:\n"
143 " --primary=col1\n"
144 "To define more than one column as primary key, the option can be given more than "
145 "once. Note that the combination of these columns must be unique.\n"
146 "\n"
147 "All columns are created as NOT NULL as default and the table is created as "
148 "InnoDB (default).\n"
149 "\n"
150 "Usually, the INSERT query would fail if the PRIMARY key exists already. "
151 "This can be avoided using the 'ON DUPLICATE KEY' directive. With the "
152 "--duplicate,you can specify what should be updated in case of a duplicate key. "
153 "To keep the row untouched, you can just update the primary key "
154 "with the identical primary key, e.g. --duplicate='MyPrimary=VALUES(MyPrimary)'. "
155 "For more details, see the MySQL manual.\n"
156 "\n"
157 "Another possibility is to add the IGNORE keyword to the INSERT query by "
158 "--ignore-errors, which essentially ignores all errors and turns them into "
159 "warnings which are printed after the query succeeded.\n"
160 "\n"
161 "For debugging purpose, or to just create or drop a table, the final insert "
162 "query can be skipped using --no-insert. Note that for performance reason, "
163 "all data is collected in memory and a single INSERT query is issued at the "
164 "end.\n"
165 "\n"
166 "Using a higher verbosity level (-v), an overview of the written columns or all "
167 "processed leaves is printed depending on the verbosity level. The output looks "
168 "like the following\n"
169 " Leaf name [root data type] (SQL name)\n"
170 "for example\n"
171 " MTime.fTime.fMilliSec [Long64_t] (MilliSec)\n"
172 "which means that the leaf MTime.fTime.fMilliSec is detected to be a Long64_t "
173 "which is filled into a column called MilliSec. Leaves with non basic data types "
174 "are ignored automatically and are marked as (-n/a-). User ignored columnd "
175 "are marked as (-ignored-).\n"
176 "\n"
177 "If a query failed, the query is printed to stderr together with the error message. "
178 "For the main INSERT query, this is only true if the verbosity level is at least 2 "
179 "or the query has less than 80*25 bytes.\n"
180 "\n"
181 "In case of succes, 0 is returned, a value>0 otherwise.\n"
182 "\n"
183 "Usage: fits2sql [options] rootfile.root\n"
184 "\n"
185 ;
186 cout << endl;
187}
188
189enum BasicType_t
190{
191 kNone = 0,
192 kVarchar,
193 kBool,
194 kFloat,
195 kDouble,
196 kInt8,
197 kInt16,
198 kInt32,
199 kInt64,
200// kMJD,
201};
202
203static const map<char, pair<BasicType_t, string>> ConvFits =
204{
205 { 'A', { kVarchar, "VARCHAR" } },
206 { 'L', { kBool, "BOOLEAN" } },
207 { 'B', { kInt8, "TINYINT" } },
208 { 'I', { kInt16, "SMALLINT" } },
209 { 'J', { kInt32, "INT" } },
210 { 'K', { kInt64, "BIGINT" } },
211 { 'E', { kFloat, "FLOAT" } },
212 { 'D', { kDouble, "DOUBLE" } },
213};
214
215struct Container
216{
217 string branch; // fits column name
218 string column; // sql column name
219 BasicType_t type;
220 size_t num;
221// double offset;
222 void *ptr;
223
224 Container(const string &b, const string &c, const BasicType_t &t, const size_t &n/*, const double &offset=0*/) : branch(b), column(c), type(t), num(n), ptr(0)
225 {
226 }
227 ~Container()
228 {
229 }
230
231 string fmt(const size_t &index) const
232 {
233 ostringstream str;
234
235 switch (type)
236 {
237 //case kVarchar: str << string(reinterpret_cast<char*>(ptr), num); break;
238 case kVarchar: str << string(reinterpret_cast<char*>(ptr), num).c_str(); break;
239 case kFloat: str << setprecision(8) << reinterpret_cast<float*>(ptr)[index]; break;
240// case kMJD: str << setprecision(16) << reinterpret_cast<double*>(ptr)[0]+offset; break;
241 case kDouble: str << setprecision(16) << reinterpret_cast<double*>(ptr)[index]; break;
242 case kBool:
243 case kInt8: str << int32_t(reinterpret_cast<int8_t*>(ptr)[index]); break;
244 case kInt16: str << reinterpret_cast<int16_t*>(ptr)[index]; break;
245 case kInt32: str << reinterpret_cast<int32_t*>(ptr)[index]; break;
246 case kInt64: str << reinterpret_cast<int64_t*>(ptr)[index]; break;
247 case kNone:
248 break;
249 }
250
251 return str.str();
252 }
253};
254
255int main(int argc, const char* argv[])
256{
257 Time start;
258
259 Configuration conf(argv[0]);
260 conf.SetPrintUsage(PrintUsage);
261 SetupConfiguration(conf);
262
263 if (!conf.DoParse(argc, argv))
264 return 127;
265
266 // ----------------------------- Evaluate options --------------------------
267 const string uri = conf.Get<string>("uri");
268 const string file = conf.Get<string>("file");
269 const string extension = conf.Get<string>("extension");
270 string table = conf.Get<string>("table");
271
272 const uint16_t verbose = conf.Get<uint16_t>("verbose");
273 const int64_t first = conf.Get<int64_t>("first");
274 const int64_t max = conf.Get<int64_t>("max");
275
276 const bool force = conf.Get<bool>("force");
277 const bool drop = conf.Get<bool>("drop");
278 const bool create = conf.Get<bool>("create") || drop;
279 const bool noinsert = conf.Get<bool>("no-insert");
280 const bool dry_run = conf.Get<bool>("dry-run");
281
282 const string engine = conf.Get<string>("engine");
283 const string duplicate = conf.Get<string>("duplicate");
284
285 const bool ignore_errors = conf.Get<bool>("ignore-errors");
286
287 const bool print_extensions = conf.Get<bool>("print-extensions");
288 const bool print_columns = conf.Get<bool>("print-columns");
289 const bool print_create = conf.Get<bool>("print-create");
290 const bool print_insert = conf.Get<bool>("print-insert");
291
292 const vector<Map> mymap = conf.Vec<Map>("map");
293 const vector<Map> sqltypes = conf.Vec<Map>("sql-type");
294 const vector<string> _ignore = conf.Vec<string>("ignore");
295 const vector<string> primary = conf.Vec<string>("primary");
296
297 // -------------------------------------------------------------------------
298
299 if (verbose>0)
300 cout << "\n-------------------------- Evaluating file -------------------------" << endl;
301
302 zfits f(file.c_str(), extension.c_str());
303 if (!f)
304 {
305 cerr << "Could not open file " << file << ": " << strerror(errno) << endl;
306 return 1;
307 }
308
309 if (verbose>0)
310 cout << "File: " << file << endl;
311
312 if (!extension.empty() && extension!=f.Get<string>("EXTNAME"))
313 {
314 cerr << "Extension " << extension << " not found in file." << endl;
315 return 2;
316 }
317
318 if (print_extensions)
319 {
320 cout << "\nTables:\n - " << boost::join(f.GetTables(), "\n - ") << '\n' << endl;
321 return 3;
322 }
323
324 if (verbose>0)
325 cout << "FITS extension [table]: " << f.Get<string>("EXTNAME") << endl;
326
327 if (table.empty())
328 table = f.Get<string>("EXTNAME");
329
330 if (verbose>0)
331 cout << "SQL table: " << table << endl;
332
333// const double mjdref = f.Get("MJDREF", double(0));
334
335 if (print_columns)
336 {
337 cout << '\n';
338 f.PrintColumns();
339 cout << '\n';
340 }
341
342 const auto cols = f.GetColumns();
343
344 if (verbose>0)
345 {
346 cout << f.GetNumRows() << " events found." << endl;
347 cout << cols.size() << " columns found." << endl;
348 }
349
350 string query =
351 "CREATE TABLE IF NOT EXISTS `"+table+"`\n"
352 "(\n";
353
354 vector<Container> vec;
355
356 for (const auto &ic : cols)
357 {
358 const auto &col = ic.second;
359
360 if (verbose>2)
361 cout << '\n' << col.type << " " << ic.first << "[" << col.num << "]";
362
363 string name = ic.first;
364
365 bool found = false;
366 for (auto b=_ignore.cbegin(); b!=_ignore.cend(); b++)
367 {
368 if (boost::regex_match(name, boost::regex(*b)))
369 {
370 found = true;
371 if (verbose>2)
372 cout << " (-ignored-)";
373 break;
374 }
375 }
376 if (found)
377 continue;
378
379 const char tn = col.type;
380
381 auto it = ConvFits.find(tn);
382 if (it==ConvFits.end())
383 {
384 if (verbose>2)
385 cout << " (-n/a-)";
386 continue;
387 }
388
389 if (verbose==2)
390 cout << '\n' << name << " [" << tn << "]";
391
392 for (auto m=mymap.cbegin(); m!=mymap.cend(); m++)
393 name = boost::regex_replace(name, boost::regex(m->first), m->second);
394
395 if (verbose>1)
396 cout << " (" << name << ")";
397
398 string sqltype = it->second.second;
399
400 for (auto m=sqltypes.cbegin(); m!=sqltypes.cend(); m++)
401 if (m->first==name)
402 sqltype = m->second;
403
404 if (!vec.empty())
405 query += ",\n";
406
407 const size_t N = col.type=='A' ? 1 : col.num;
408 for (int i=0; i<N; i++)
409 {
410 query += " `"+name;
411 if (N>1)
412 query += "["+to_string(i)+"]";
413 query += "` "+sqltype;
414 if (col.type=='A')
415 query += '('+to_string(col.num)+')';
416 query += " NOT NULL COMMENT '"+ic.first;
417 if (!col.unit.empty())
418 query += "["+col.unit+"]";
419 query += ": "+col.comment+"'";
420 }
421
422 const BasicType_t bt =
423 /*ic.first=="Time" && col.num==1 && col.unit=="MJD" && it->second.first==kDouble ?
424 kMJD :*/ it->second.first;
425
426 vec.emplace_back(ic.first, name, bt, col.num/*, mjdref*/);
427 vec.back().ptr = f.SetPtrAddress(ic.first);
428 }
429
430 if (verbose>1)
431 cout << "\n\n";
432 if (verbose>0)
433 cout << vec.size() << " columns setup for reading." << endl;
434
435 // -------------------------------------------------------------------------
436 // Checking for database connection
437
438 Database connection(uri);
439
440 try
441 {
442 if (!force)
443 connection.connected();
444 }
445 catch (const exception &e)
446 {
447 cerr << "SQL connection failed: " << e.what() << '\n' << endl;
448 return 4;
449 }
450
451 // -------------------------------------------------------------------------
452
453 if (verbose>0)
454 cout << "\n--------------------------- Database Table -------------------------" << endl;
455
456 if (!primary.empty())
457 query += ",\n PRIMARY KEY USING BTREE (`"+boost::algorithm::join(primary, "`, `")+"`)\n";
458
459 query +=
460 ")\n"
461 "DEFAULT CHARSET=latin1 COLLATE=latin1_general_ci\n"
462 "ENGINE="+engine+"\n"
463 "COMMENT='created by "+conf.GetName()+"'\n";
464
465
466 // FIXME: Can we omit the catching to be able to print the
467 // query 'autmatically'?
468 try
469 {
470 if (drop)
471 {
472 // => Simple result
473 if (!dry_run)
474 connection.query("DROP TABLE `"+table+"`").execute();
475 if (verbose>0)
476 cout << "Table `" << table << "` dropped." << endl;
477 }
478 }
479 catch (const exception &e)
480 {
481 cerr << "DROP TABLE `" << table << "`\n\n";
482 cerr << "SQL query failed:\n" << e.what() << '\n' << endl;
483 return 5;
484 }
485
486 try
487 {
488 if (create && !dry_run)
489 connection.query(query).execute();
490 }
491 catch (const exception &e)
492 {
493 cerr << query << "\n\n";
494 cerr << "SQL query failed:\n" << e.what() << '\n' << endl;
495 return 6;
496 }
497
498 if (print_create)
499 cout << query << endl;
500
501 if (create && verbose>0)
502 cout << "Table `" << table << "` created." << endl;
503
504 // -------------------------------------------------------------------------
505
506 if (verbose>0)
507 cout << "\n---------------------------- Reading file --------------------------" << endl;
508
509 //query = update ? "UPDATE" : "INSERT";
510 query = "INSERT ";
511 if (ignore_errors)
512 query += "IGNORE ";
513 query += "`"+table+"`\n"
514 "(\n";
515
516 for (auto c=vec.cbegin(); c!=vec.cend(); c++)
517 {
518 const size_t N = c->type==kVarchar ? 1 : c->num;
519 for (int i=0; i<N; i++)
520 {
521 if (c!=vec.cbegin())
522 query += ",\n";
523
524 if (N==1)
525 query += " `"+c->column+"`";
526 else
527 query += " `"+c->column+"["+to_string(i)+"]`";
528 }
529 }
530
531 query +=
532 "\n)\n"
533 "VALUES\n";
534
535 size_t count = 0;
536
537 const size_t num = max>0 && (max-first)<f.GetNumRows() ? (max-first) : f.GetNumRows();
538 for (size_t j=first; j<num; j++)
539 {
540 f.GetRow(j);
541
542 if (count>0)
543 query += ",\n";
544
545 query += "(\n";
546
547 for (auto c=vec.cbegin(); c!=vec.cend(); c++)
548 {
549 const size_t N = c->type==kVarchar ? 1 : c->num;
550 for (int i=0; i<N; i++)
551 {
552 if (c!=vec.cbegin())
553 query += ",\n";
554
555 if (c->type==kVarchar)
556 query += " '"+c->fmt(i)+"'";
557 else
558 query += " "+c->fmt(i);
559
560 if (print_insert && i==0)
561 query += " /* "+c->column+" -> "+c->branch+" */";
562 }
563 }
564 query += "\n)";
565
566 count ++;
567 }
568
569 if (!duplicate.empty())
570 query += "\nON DUPLICATE KEY UPDATE\n " + duplicate;
571
572 if (verbose>0)
573 cout << count << " out of " << num << " row(s) read from file [N=" << first << ".." << num-1 << "]." << endl;
574
575 if (count==0)
576 {
577 if (verbose>0)
578 cout << "Total execution time: " << Time().UnixTime()-start.UnixTime() << "s\n" << endl;
579 return 0;
580 }
581
582 // -------------------------------------------------------------------------
583
584 if (verbose>0)
585 {
586 cout << "\n--------------------------- Inserting data -------------------------" << endl;
587 cout << "Sending INSERT query (" << query.length() << " bytes)" << endl;
588 }
589
590 try
591 {
592 if (!noinsert && !dry_run)
593 {
594 auto q = connection.query(query);
595 q.execute();
596 cout << q.info() << '\n' << endl;
597 }
598 else
599 cout << "Insert query skipped!" << endl;
600
601 if (print_insert)
602 cout << query << endl;
603 }
604 catch (const exception &e)
605 {
606 if (verbose>1 || query.length()<80*25)
607 cerr << query << "\n\n";
608 cerr << "SQL query failed (" << query.length() << " bytes):\n" << e.what() << '\n' << endl;
609 return 7;
610 }
611
612 if (verbose>0)
613 {
614 cout << "Total execution time: " << Time().UnixTime()-start.UnixTime() << "s\n" << endl;
615
616 try
617 {
618 const auto resw =
619 connection.query("SHOW WARNINGS").store();
620
621 for (size_t i=0; i<resw.num_rows(); i++)
622 {
623 const mysqlpp::Row &roww = resw[i];
624
625 cout << roww["Level"] << '[' << roww["Code"] << "]: ";
626 cout << roww["Message"] << '\n';
627 }
628 cout << endl;
629
630 }
631 catch (const exception &e)
632 {
633 cerr << "\nSHOW WARNINGS\n\n";
634 cerr << "SQL query failed:\n" << e.what() << '\n' <<endl;
635 return 8;
636 }
637 }
638
639 return 0;
640}
Note: See TracBrowser for help on using the repository browser.