source: trunk/FACT++/src/fits2sql.cc@ 19101

Last change on this file since 19101 was 19101, checked in by tbretz, 7 years ago
Allow an 'ON DUPLICATE KEY UPDATE' clause
File size: 21.1 KB
Line 
1#include <boost/algorithm/string/join.hpp>
2#include <boost/regex.hpp>
3
4#include "Database.h"
5
6#include "Time.h"
7#include "Configuration.h"
8
9#include "zfits.h"
10
11/*
12#include <TROOT.h>
13#include <TFile.h>
14#include <TTree.h>
15#include <TLeaf.h>
16#include <TError.h>
17*/
18
19using namespace std;
20
21// ------------------------------------------------------------------------
22
23struct Map : pair<string, string>
24{
25 Map() { }
26};
27
28std::istream &operator>>(std::istream &in, Map &m)
29{
30 const istreambuf_iterator<char> eos;
31 string txt(istreambuf_iterator<char>(in), eos);
32
33 const boost::regex expr("((.*)[^\\\\])/(.*)");
34 boost::smatch match;
35 if (!boost::regex_match(txt, match, expr))
36 throw runtime_error("Could not evaluate map argument: "+txt);
37
38 m.first = match[1].str();
39 m.second = match[3].str();
40
41 return in;
42}
43
44
45void SetupConfiguration(Configuration &conf)
46{
47 po::options_description control("Root to SQL");
48 control.add_options()
49 ("uri,u", var<string>()
50#if BOOST_VERSION >= 104200
51 ->required()
52#endif
53 , "Database link as in\n\tuser:password@server[:port]/database.")
54 ("file", var<string>(""), "The root file to read from")
55 ("force", po_switch(), "Force processing even if there is no database connection")
56 ("create", po_switch(), "Create the database if not existing")
57 ("drop", po_switch(), "Drop the table (implies create)")
58 ("extension,e", var<string>(""), "Name of the fits extension (table) to convert")
59 ("table", var<string>(""), "Name of the table to use (default is the tree name)")
60 ("map", vars<Map>(), "A regular expression which is applied to the leaf name befoee it is used as SQL column name)")
61 ("sql-type", vars<Map>(), "Allows to overwrite the calculated SQL type for a given column e.g. 'sql-column-name/UNSIGNED IN'")
62 ("ignore", vars<string>(), "Ignore the given leaf, if the given regular expression matches")
63 ("primary", vars<string>(), "List of columns to be used as primary keys during table creation (in connection with --create)")
64 ("first", var<int64_t>(int64_t(0)), "First event to start with (default: 0), mainly for test purpose")
65 ("max", var<int64_t>(int64_t(0)), "Maximum number of events to process (0: all), mainly for test purpose")
66 ("engine", var<string>("InnoDB"), "Database engine to be used when a new table is created")
67 ("duplicate", var<string>(""), "Specifies an assignment_list for an 'ON DUPLICATE KEY UPDATE' expression")
68 ;
69
70 po::options_description debug("Debug options");
71 debug.add_options()
72 ("no-insert", po_switch(), "Does not insert any data into the table")
73 ("dry-run", po_switch(), "Skip any query which changes the databse (might result in consecutive failures)")
74 ("print-extensions", po_switch(), "Print extensions (tables) from fits file")
75 ("print-columns", po_switch(), "Print columns in fits table")
76 ("print-insert", po_switch(), "Print the INSERT query (note that it contains all data)")
77 ("print-create", po_switch(), "Print the CREATE query")
78 ("verbose,v", var<uint16_t>(1), "Verbosity (0: quiet, 1: default, 2: more, 3, ...)")
79 ;
80
81 po::positional_options_description p;
82 p.add("file", 1); // The 1st positional options (n=1)
83
84 conf.AddOptions(control);
85 conf.AddOptions(debug);
86 conf.SetArgumentPositions(p);
87}
88
89void PrintUsage()
90{
91 cout <<
92 "fits2sql - Fills the data from a fits file into a database\n"
93 "\n"
94 "For convenience, this documentation uses the extended version of the options, "
95 "refer to the output below to get the abbreviations.\n"
96 "\n"
97 "This is a general purpose tool to fill the contents of a fite file into a database.\n"
98 "\n"
99 "Each fits file contians several table, so called extenbsions. Each tables has "
100 "a number of columns compilsed from basic data types. The default extension to "
101 "read from is the first one on the file but the name can be overwritten using "
102 "--extension. The default table name to fill the data into is identical to "
103 "the extension name. It can be overwritten using --table.\n"
104 "\n"
105 "The name of each column to which data is filled is obtained from "
106 "the fits column names. The column names can be checked using --print-columns. "
107 "Sometimes these names might not be convenient. To allow to simplify or replace "
108 "column names, regular expressions (using boost's regex) can be defined to change "
109 "the names. Note that these regular expressions are applied one by one on each "
110 "columns's name. A valid expression could "
111 "be:\n"
112 " --map=MHillas\\.f/\n"
113 "which would remove all occurances of 'MHillas.f'. This option can be used more than "
114 "once. They are applied in sequence. A single match does not stop the sequence.\n"
115 "\n"
116 "Sometimes it might also be convenient to skip a column. This can be done with "
117 "the --ignore resource. If the given regular expresion yields a match, the "
118 "column will be ignored. Note that the regular expression works on the raw-name "
119 "of the column not the readily mapped SQL column names. Example:\n"
120 " --ignore=ThetaSq\\..*\n"
121 "will skip all leaved which start with 'ThetaSq.'. This option can be used"
122 "more than once.\n"
123 "\n"
124 "The data type of each column is kept as close as possible to the columns' data "
125 "types. If for some reason this is not wanted, the data type of the SQL column "
126 "can be overwritten with --sql-type sql-column/sql-ytpe, for example:\n"
127 " --sql-type=FileId/UNSIGNED INT\n"
128 "while the first argument of the name of the SQL column to which the data type "
129 "should be applied. The second column is the basic SQL data type. The option can "
130 "be given more than once.\n"
131 "\n"
132 "Database interaction:\n"
133 "\n"
134 "To drop an existing table, --drop can be used.\n"
135 "\n"
136 "To create a table according to theSQL column names and data types, --create "
137 "can be used. The query used can be printed with --print-create even --create "
138 "has not been specified.\n"
139 "\n"
140 "To choose the columns which should become primary keys, use --primary, "
141 "for exmaple:\n"
142 " --primary=col1\n"
143 "To define more than one column as primary key, the option can be given more than "
144 "once. Note that the combination of these columns must be unique.\n"
145 "\n"
146 "All columns are created as NOT NULL as default and the table is created as "
147 "InnoDB (default).\n"
148 "\n"
149 "Usually, the INSERT query would fail if the PRIMARY key exists already. "
150 "This can be avoided using the 'ON DUPLICATE KEY' directive. With the "
151 "--duplicate,you can specify what should be updated in case of a duplicate key. "
152 "For detaily, see the MySQL manual.\n"
153 "\n"
154 "For debugging purpose, or to just create or drop a table, the final insert "
155 "query can be skipped using --no-insert. Note that for performance reason, "
156 "all data is collected in memory and a single INSERT query is issued at the "
157 "end.\n"
158 "\n"
159 "Using a higher verbosity level (-v), an overview of the written columns or all "
160 "processed leaves is printed depending on the verbosity level. The output looks "
161 "like the following\n"
162 " Leaf name [root data type] (SQL name)\n"
163 "for example\n"
164 " MTime.fTime.fMilliSec [Long64_t] (MilliSec)\n"
165 "which means that the leaf MTime.fTime.fMilliSec is detected to be a Long64_t "
166 "which is filled into a column called MilliSec. Leaves with non basic data types "
167 "are ignored automatically and are marked as (-n/a-). User ignored columnd "
168 "are marked as (-ignored-).\n"
169 "\n"
170 "If a query failed, the query is printed to stderr together with the error message. "
171 "For the main INSERT query, this is only true if the verbosity level is at least 2 "
172 "or the query has less than 80*25 bytes.\n"
173 "\n"
174 "In case of succes, 0 is returned, a value>0 otherwise.\n"
175 "\n"
176 "Usage: fits2sql [options] rootfile.root\n"
177 "\n"
178 ;
179 cout << endl;
180}
181
182enum BasicType_t
183{
184 kNone = 0,
185 kVarchar,
186 kBool,
187 kFloat,
188 kDouble,
189 kInt8,
190 kInt16,
191 kInt32,
192 kInt64,
193// kMJD,
194};
195
196static const map<char, pair<BasicType_t, string>> ConvFits =
197{
198 { 'A', { kVarchar, "VARCHAR" } },
199 { 'L', { kBool, "BOOLEAN" } },
200 { 'B', { kInt8, "TINYINT" } },
201 { 'I', { kInt16, "SMALLINT" } },
202 { 'J', { kInt32, "INT" } },
203 { 'K', { kInt64, "BIGINT" } },
204 { 'E', { kFloat, "FLOAT" } },
205 { 'D', { kDouble, "DOUBLE" } },
206};
207
208struct Container
209{
210 string branch; // fits column name
211 string column; // sql column name
212 BasicType_t type;
213 size_t num;
214// double offset;
215 void *ptr;
216
217 Container(const string &b, const string &c, const BasicType_t &t, const size_t &n/*, const double &offset=0*/) : branch(b), column(c), type(t), num(n), ptr(0)
218 {
219 }
220 ~Container()
221 {
222 }
223
224 string fmt(const size_t &index) const
225 {
226 ostringstream str;
227
228 switch (type)
229 {
230 //case kVarchar: str << string(reinterpret_cast<char*>(ptr), num); break;
231 case kVarchar: str << string(reinterpret_cast<char*>(ptr), num).c_str(); break;
232 case kFloat: str << setprecision(8) << reinterpret_cast<float*>(ptr)[index]; break;
233// case kMJD: str << setprecision(16) << reinterpret_cast<double*>(ptr)[0]+offset; break;
234 case kDouble: str << setprecision(16) << reinterpret_cast<double*>(ptr)[index]; break;
235 case kBool:
236 case kInt8: str << int32_t(reinterpret_cast<int8_t*>(ptr)[index]); break;
237 case kInt16: str << reinterpret_cast<int16_t*>(ptr)[index]; break;
238 case kInt32: str << reinterpret_cast<int32_t*>(ptr)[index]; break;
239 case kInt64: str << reinterpret_cast<int64_t*>(ptr)[index]; break;
240 case kNone:
241 break;
242 }
243
244 return str.str();
245 }
246};
247
248int main(int argc, const char* argv[])
249{
250 Time start;
251
252 Configuration conf(argv[0]);
253 conf.SetPrintUsage(PrintUsage);
254 SetupConfiguration(conf);
255
256 if (!conf.DoParse(argc, argv))
257 return 127;
258
259 // ----------------------------- Evaluate options --------------------------
260 const string uri = conf.Get<string>("uri");
261 const string file = conf.Get<string>("file");
262 const string extension = conf.Get<string>("extension");
263 string table = conf.Get<string>("table");
264
265 const uint16_t verbose = conf.Get<uint16_t>("verbose");
266 const int64_t first = conf.Get<int64_t>("first");
267 const int64_t max = conf.Get<int64_t>("max");
268
269 const bool force = conf.Get<bool>("force");
270 const bool drop = conf.Get<bool>("drop");
271 const bool create = conf.Get<bool>("create") || drop;
272 const bool noinsert = conf.Get<bool>("no-insert");
273 const bool dry_run = conf.Get<bool>("dry-run");
274
275 const string engine = conf.Get<string>("engine");
276 const string duplicate = conf.Get<string>("duplicate");
277
278 const bool print_extensions = conf.Get<bool>("print-extensions");
279 const bool print_columns = conf.Get<bool>("print-columns");
280 const bool print_create = conf.Get<bool>("print-create");
281 const bool print_insert = conf.Get<bool>("print-insert");
282
283 const vector<Map> mymap = conf.Vec<Map>("map");
284 const vector<Map> sqltypes = conf.Vec<Map>("sql-type");
285 const vector<string> _ignore = conf.Vec<string>("ignore");
286 const vector<string> primary = conf.Vec<string>("primary");
287
288 // -------------------------------------------------------------------------
289
290 if (verbose>0)
291 cout << "\n-------------------------- Evaluating file -------------------------" << endl;
292
293 zfits f(file.c_str(), extension.c_str());
294 if (!f)
295 {
296 cerr << "Could not open file " << file << ": " << strerror(errno) << endl;
297 return 1;
298 }
299
300 if (verbose>0)
301 cout << "File: " << file << endl;
302
303 if (!extension.empty() && extension!=f.Get<string>("EXTNAME"))
304 {
305 cerr << "Extension " << extension << " not found in file." << endl;
306 return 2;
307 }
308
309 if (print_extensions)
310 {
311 cout << "\nTables:\n - " << boost::join(f.GetTables(), "\n - ") << '\n' << endl;
312 return 2;
313 }
314
315 if (verbose>0)
316 cout << "FITS extension [table]: " << f.Get<string>("EXTNAME") << endl;
317
318 if (table.empty())
319 table = f.Get<string>("EXTNAME");
320
321 if (verbose>0)
322 cout << "SQL table: " << table << endl;
323
324// const double mjdref = f.Get("MJDREF", double(0));
325
326 if (print_columns)
327 {
328 cout << '\n';
329 f.PrintColumns();
330 cout << '\n';
331 }
332
333 const auto cols = f.GetColumns();
334
335 if (verbose>0)
336 {
337 cout << f.GetNumRows() << " events found." << endl;
338 cout << cols.size() << " columns found." << endl;
339 }
340
341 string query =
342 "CREATE TABLE IF NOT EXISTS `"+table+"`\n"
343 "(\n";
344
345 vector<Container> vec;
346
347 for (const auto &ic : cols)
348 {
349 const auto &col = ic.second;
350
351 if (verbose>2)
352 cout << '\n' << col.type << " " << ic.first << "[" << col.num << "]";
353
354 string name = ic.first;
355
356 bool found = false;
357 for (auto b=_ignore.cbegin(); b!=_ignore.cend(); b++)
358 {
359 if (boost::regex_match(name, boost::regex(*b)))
360 {
361 found = true;
362 if (verbose>2)
363 cout << " (-ignored-)";
364 break;
365 }
366 }
367 if (found)
368 continue;
369
370 const char tn = col.type;
371
372 auto it = ConvFits.find(tn);
373 if (it==ConvFits.end())
374 {
375 if (verbose>2)
376 cout << " (-n/a-)";
377 continue;
378 }
379
380 if (verbose==2)
381 cout << '\n' << name << " [" << tn << "]";
382
383 for (auto m=mymap.cbegin(); m!=mymap.cend(); m++)
384 name = boost::regex_replace(name, boost::regex(m->first), m->second);
385
386 if (verbose>1)
387 cout << " (" << name << ")";
388
389 string sqltype = it->second.second;
390
391 for (auto m=sqltypes.cbegin(); m!=sqltypes.cend(); m++)
392 if (m->first==name)
393 sqltype = m->second;
394
395 if (!vec.empty())
396 query += ",\n";
397
398 const size_t N = col.type=='A' ? 1 : col.num;
399 for (int i=0; i<N; i++)
400 {
401 query += " `"+name;
402 if (N>1)
403 query += "["+to_string(i)+"]";
404 query += "` "+sqltype;
405 if (col.type=='A')
406 query += '('+to_string(col.num)+')';
407 query += " NOT NULL COMMENT '"+ic.first;
408 if (!col.unit.empty())
409 query += "["+col.unit+"]";
410 query += ": "+col.comment+"'";
411 }
412
413 const BasicType_t bt =
414 /*ic.first=="Time" && col.num==1 && col.unit=="MJD" && it->second.first==kDouble ?
415 kMJD :*/ it->second.first;
416
417 vec.emplace_back(ic.first, name, bt, col.num/*, mjdref*/);
418 vec.back().ptr = f.SetPtrAddress(ic.first);
419 }
420
421 if (verbose>1)
422 cout << "\n\n";
423 if (verbose>0)
424 cout << vec.size() << " columns setup for reading." << endl;
425
426 // -------------------------------------------------------------------------
427 // Checking for database connection
428
429 try
430 {
431 if (!force)
432 Database(uri).connected();
433 }
434 catch (const exception &e)
435 {
436 cerr << "SQL connection failed: " << e.what() << endl;
437 return 3;
438 }
439
440 // -------------------------------------------------------------------------
441
442 if (verbose>0)
443 cout << "\n--------------------------- Database Table -------------------------" << endl;
444
445 if (!primary.empty())
446 query += ",\n PRIMARY KEY USING BTREE (`"+boost::algorithm::join(primary, "`, `")+"`)\n";
447
448 query +=
449 ")\n"
450 "DEFAULT CHARSET=latin1 COLLATE=latin1_general_ci\n"
451 "ENGINE="+engine+"\n"
452 "COMMENT='created by "+conf.GetName()+"'\n";
453
454
455 // FIXME: Can we omit the catching to be able to print the
456 // query 'autmatically'?
457 try
458 {
459 if (drop)
460 {
461 // => Simple result
462 if (!dry_run)
463 Database(uri).query("DROP TABLE `"+table+"`").execute();
464 if (verbose>0)
465 cout << "Table `" << table << "` dropped." << endl;
466 }
467 }
468 catch (const exception &e)
469 {
470 cerr << "DROP TABLE `" << table << "`\n\n";
471 cerr << "SQL query failed:\n" << e.what() << endl;
472 return 4;
473 }
474
475 try
476 {
477 if (create && !dry_run)
478 Database(uri).query(query).execute();
479 }
480 catch (const exception &e)
481 {
482 cerr << query << "\n\n";
483 cerr << "SQL query failed:\n" << e.what() << endl;
484 return 5;
485 }
486
487 if (print_create)
488 cout << query << endl;
489
490 if (create && verbose>0)
491 cout << "Table `" << table << "` created." << endl;
492
493 // -------------------------------------------------------------------------
494
495 if (verbose>0)
496 cout << "\n---------------------------- Reading file --------------------------" << endl;
497
498 //query = update ? "UPDATE" : "INSERT";
499 query = "INSERT `"+table+"`\n"
500 "(\n";
501
502 for (auto c=vec.cbegin(); c!=vec.cend(); c++)
503 {
504 const size_t N = c->type==kVarchar ? 1 : c->num;
505 for (int i=0; i<N; i++)
506 {
507 if (c!=vec.cbegin())
508 query += ",\n";
509
510 if (N==1)
511 query += " `"+c->column+"`";
512 else
513 query += " `"+c->column+"["+to_string(i)+"]`";
514 }
515 }
516
517 query +=
518 "\n)\n"
519 "VALUES\n";
520
521 size_t count = 0;
522
523 const size_t num = max>0 && (max-first)<f.GetNumRows() ? (max-first) : f.GetNumRows();
524 for (size_t j=first; j<num; j++)
525 {
526 f.GetRow(j);
527
528 if (count>0)
529 query += ",\n";
530
531 query += "(\n";
532
533 for (auto c=vec.cbegin(); c!=vec.cend(); c++)
534 {
535 const size_t N = c->type==kVarchar ? 1 : c->num;
536 for (int i=0; i<N; i++)
537 {
538 if (c!=vec.cbegin())
539 query += ",\n";
540
541 if (c->type==kVarchar)
542 query += " '"+c->fmt(i)+"'";
543 else
544 query += " "+c->fmt(i);
545
546 if (print_insert && i==0)
547 query += " /* "+c->column+" -> "+c->branch+" */";
548 }
549 }
550 query += "\n)";
551
552 count ++;
553 }
554
555 if (!duplicate.empty())
556 query += "\nON DUPLICATE KEY UPDATE\n " + duplicate;
557
558 if (verbose>0)
559 cout << count << " out of " << num << " row(s) read from file [N=" << first << ".." << num-1 << "]." << endl;
560
561 if (count==0)
562 {
563 if (verbose>0)
564 cout << "Total execution time: " << Time().UnixTime()-start.UnixTime() << "s\n" << endl;
565 return 0;
566 }
567
568 // -------------------------------------------------------------------------
569
570 if (verbose>0)
571 {
572 cout << "\n--------------------------- Inserting data -------------------------" << endl;
573 cout << "Sending INSERT query (" << query.length() << " bytes)" << endl;
574 }
575
576 try
577 {
578 if (!noinsert && !dry_run)
579 // => Simple result
580 Database(uri).query(query).execute();
581 else
582 cout << "Insert query skipped!" << endl;
583
584 if (print_insert)
585 cout << query << endl;
586 }
587 catch (const exception &e)
588 {
589 if (verbose>1 || query.length()<80*25)
590 cerr << query << "\n\n";
591 cerr << "SQL query failed (" << query.length() << " bytes):\n" << e.what() << endl;
592 return 6;
593 }
594
595 if (verbose>0)
596 {
597 cout << count << " row(s) inserted.\n\n";
598 cout << "Total execution time: " << Time().UnixTime()-start.UnixTime() << "s\n" << endl;
599/*
600 try
601 {
602 const auto resw =
603 connection.query("SHOW WARNINGS").store();
604
605 if (resw.num_rows()>0)
606 cout << "\nWARNINGS:\n\n";
607
608 for (size_t i=0; i<resw.num_rows(); i++)
609 {
610 const mysqlpp::Row &roww = resw[i];
611
612 cout << roww["Level"] << '[' << roww["Code"] << "]: ";
613 cout << roww["Message"] << '\n' << endl;
614 }
615
616 }
617 catch (const exception &e)
618 {
619 cerr << "\nSHOW WARNINGS\n\n";
620 cerr << "SQL query failed:\n" << e.what() << endl;
621 return 6;
622 }*/
623 }
624
625 return 0;
626}
Note: See TracBrowser for help on using the repository browser.