source: trunk/FACT++/src/fits2sql.cc@ 19125

Last change on this file since 19125 was 19125, checked in by tbretz, 6 years ago
Updated the explanation for the databse resource to reflect the possibility to force or prohibit compression.
File size: 23.6 KB
Line 
1#include <boost/algorithm/string/join.hpp>
2#include <boost/regex.hpp>
3
4#include "Database.h"
5
6#include "Time.h"
7#include "Configuration.h"
8
9#include "zfits.h"
10
11using namespace std;
12
13// ------------------------------------------------------------------------
14
15struct Map : pair<string, string>
16{
17 Map() { }
18};
19
20std::istream &operator>>(std::istream &in, Map &m)
21{
22 const istreambuf_iterator<char> eos;
23 string txt(istreambuf_iterator<char>(in), eos);
24
25 const boost::regex expr("((.*)[^\\\\])/(.*)");
26 boost::smatch match;
27 if (!boost::regex_match(txt, match, expr))
28 throw runtime_error("Could not evaluate map argument: "+txt);
29
30 m.first = match[1].str();
31 m.second = match[3].str();
32
33 return in;
34}
35
36
37void SetupConfiguration(Configuration &conf)
38{
39 po::options_description control("Root to SQL");
40 control.add_options()
41 ("uri,u", var<string>()
42#if BOOST_VERSION >= 104200
43 ->required()
44#endif
45 , "Database link as in\n\tuser:password@server[:port]/database[/comp].")
46 ("file", var<string>("")
47#if BOOST_VERSION >= 104200
48 ->required()
49#endif
50 , "The root file to read from")
51 ("force", po_switch(), "Force processing even if there is no database connection")
52 ("create", po_switch(), "Create the database if not existing")
53 ("drop", po_switch(), "Drop the table (implies create)")
54 ("extension,e", var<string>(""), "Name of the fits extension (table) to convert")
55 ("table", var<string>(""), "Name of the table to use (default is the tree name)")
56 ("map", vars<Map>(), "A regular expression which is applied to the leaf name befoee it is used as SQL column name)")
57 ("sql-type", vars<Map>(), "Allows to overwrite the calculated SQL type for a given column e.g. 'sql-column-name/UNSIGNED IN'")
58 ("ignore", vars<string>(), "Ignore the given leaf, if the given regular expression matches")
59 ("unsigned", vars<string>(), "In fits files per default columns are signed. This interpretss the column as unsigned value. Use the FITS column name.")
60 ("primary", vars<string>(), "List of columns to be used as primary keys during table creation (in connection with --create)")
61 ("first", var<int64_t>(int64_t(0)), "First event to start with (default: 0), mainly for test purpose")
62 ("max", var<int64_t>(int64_t(0)), "Maximum number of events to process (0: all), mainly for test purpose")
63 ("engine", var<string>(""), "Database engine to be used when a new table is created")
64 ("row-format", var<string>(""), "Defines the ROW_FORMAT keyword for table creation")
65 ("duplicate", vars<string>(""), "Specifies an assignment_list for an 'ON DUPLICATE KEY UPDATE' expression")
66 ("ignore-errors", po_switch(), "Adds the IGNORE keyword to the INSERT query (turns errors into warnings, ignores rows with errors)")
67 ;
68
69 po::options_description debug("Debug options");
70 debug.add_options()
71 ("no-insert", po_switch(), "Does not insert any data into the table")
72 ("dry-run", po_switch(), "Skip any query which changes the databse (might result in consecutive failures)")
73 ("print-extensions", po_switch(), "Print extensions (tables) from fits file")
74 ("print-columns", po_switch(), "Print columns in fits table")
75 ("print-insert", po_switch(), "Print the INSERT query (note that it contains all data)")
76 ("print-create", po_switch(), "Print the CREATE query")
77 ("verbose,v", var<uint16_t>(1), "Verbosity (0: quiet, 1: default, 2: more, 3, ...)")
78 ;
79
80 po::positional_options_description p;
81 p.add("file", 1); // The 1st positional options (n=1)
82
83 conf.AddOptions(control);
84 conf.AddOptions(debug);
85 conf.SetArgumentPositions(p);
86}
87
88void PrintUsage()
89{
90 cout <<
91 "fits2sql - Fills the data from a fits file into a database\n"
92 "\n"
93 "For convenience, this documentation uses the extended version of the options, "
94 "refer to the output below to get the abbreviations.\n"
95 "\n"
96 "This is a general purpose tool to fill the contents of a fite file into a database.\n"
97 "\n"
98 "Each fits file contians several table, so called extenbsions. Each tables has "
99 "a number of columns compilsed from basic data types. The default extension to "
100 "read from is the first one on the file but the name can be overwritten using "
101 "--extension. The default table name to fill the data into is identical to "
102 "the extension name. It can be overwritten using --table.\n"
103 "\n"
104 "The name of each column to which data is filled is obtained from "
105 "the fits column names. The column names can be checked using --print-columns. "
106 "Sometimes these names might not be convenient. To allow to simplify or replace "
107 "column names, regular expressions (using boost's regex) can be defined to change "
108 "the names. Note that these regular expressions are applied one by one on each "
109 "columns's name. A valid expression could "
110 "be:\n"
111 " --map=MHillas\\.f/\n"
112 "which would remove all occurances of 'MHillas.f'. This option can be used more than "
113 "once. They are applied in sequence. A single match does not stop the sequence.\n"
114 "\n"
115 "Sometimes it might also be convenient to skip a column. This can be done with "
116 "the --ignore resource. If the given regular expresion yields a match, the "
117 "column will be ignored. Note that the regular expression works on the raw-name "
118 "of the column not the readily mapped SQL column names. Example:\n"
119 " --ignore=ThetaSq\\..*\n"
120 "will skip all leaved which start with 'ThetaSq.'. This option can be used"
121 "more than once.\n"
122 "\n"
123 "The data type of each column is kept as close as possible to the columns' data "
124 "types. If for some reason this is not wanted, the data type of the SQL column "
125 "can be overwritten with --sql-type sql-column/sql-ytpe, for example:\n"
126 " --sql-type=FileId/UNSIGNED INT\n"
127 "while the first argument of the name of the SQL column to which the data type "
128 "should be applied. The second column is the basic SQL data type. The option can "
129 "be given more than once.\n"
130 "\n"
131 "Database interaction:\n"
132 "\n"
133 "To drop an existing table, --drop can be used.\n"
134 "\n"
135 "To create a table according to theSQL column names and data types, --create "
136 "can be used. The query used can be printed with --print-create even --create "
137 "has not been specified.\n"
138 "\n"
139 "To choose the columns which should become primary keys, use --primary, "
140 "for exmaple:\n"
141 " --primary=col1\n"
142 "To define more than one column as primary key, the option can be given more than "
143 "once. Note that the combination of these columns must be unique.\n"
144 "\n"
145 "All columns are created as NOT NULL as default. To force a database engine "
146 "and/or a storage format, use --engine and --rot-format.\n"
147 "\n"
148 "Usually, the INSERT query would fail if the PRIMARY key exists already. "
149 "This can be avoided using the 'ON DUPLICATE KEY UPDATE' directive. With the "
150 "--duplicate, you can specify what should be updated in case of a duplicate key. "
151 "To keep the row untouched, you can just update the primary key "
152 "with the identical primary key, e.g. --duplicate='MyPrimary=VALUES(MyPrimary)'. "
153 "The --duplicate resource can be specified more than once to add more expressions "
154 "to the assignment_list. For more details, see the MySQL manual.\n"
155 "\n"
156 "Another possibility is to add the IGNORE keyword to the INSERT query by "
157 "--ignore-errors, which essentially ignores all errors and turns them into "
158 "warnings which are printed after the query succeeded.\n"
159 "\n"
160 "For debugging purpose, or to just create or drop a table, the final insert "
161 "query can be skipped using --no-insert. Note that for performance reason, "
162 "all data is collected in memory and a single INSERT query is issued at the "
163 "end.\n"
164 "\n"
165 "Using a higher verbosity level (-v), an overview of the written columns or all "
166 "processed leaves is printed depending on the verbosity level. The output looks "
167 "like the following\n"
168 " Leaf name [root data type] (SQL name)\n"
169 "for example\n"
170 " MTime.fTime.fMilliSec [Long64_t] (MilliSec)\n"
171 "which means that the leaf MTime.fTime.fMilliSec is detected to be a Long64_t "
172 "which is filled into a column called MilliSec. Leaves with non basic data types "
173 "are ignored automatically and are marked as (-n/a-). User ignored columnd "
174 "are marked as (-ignored-).\n"
175 "\n"
176 "If a query failed, the query is printed to stderr together with the error message. "
177 "For the main INSERT query, this is only true if the verbosity level is at least 2 "
178 "or the query has less than 80*25 bytes.\n"
179 "\n"
180 "In case of succes, 0 is returned, a value>0 otherwise.\n"
181 "\n"
182 "Usage: fits2sql [options] --uri URI fitsfile.fits[.gz]\n"
183 "\n"
184 ;
185 cout << endl;
186}
187
188enum BasicType_t
189{
190 kNone = 0,
191 kVarchar,
192 kBool,
193 kFloat,
194 kDouble,
195 kInt8,
196 kUInt8,
197 kInt16,
198 kUInt16,
199 kInt32,
200 kUInt32,
201 kInt64,
202 kUInt64,
203// kMJD,
204};
205
206static const map<char, pair<BasicType_t, string>> ConvFits =
207{
208 { 'A', { kVarchar, "VARCHAR" } },
209 { 'a', { kVarchar, "VARCHAR" } },
210 { 'L', { kBool, "BOOLEAN" } },
211 { 'l', { kBool, "BOOLEAN" } },
212 { 'B', { kInt8, "TINYINT" } },
213 { 'b', { kUInt8, "TINYINT UNSIGNED" } },
214 { 'I', { kInt16, "SMALLINT" } },
215 { 'i', { kUInt16, "SMALLINT UNSIGNED" } },
216 { 'J', { kInt32, "INT" } },
217 { 'j', { kUInt32, "INT UNSIGNED" } },
218 { 'K', { kInt64, "BIGINT" } },
219 { 'k', { kUInt64, "BIGINT UNSIGNED" } },
220 { 'E', { kFloat, "FLOAT" } },
221 { 'e', { kFloat, "FLOAT" } },
222 { 'D', { kDouble, "DOUBLE" } },
223 { 'd', { kDouble, "DOUBLE" } },
224};
225
226struct Container
227{
228 string branch; // fits column name
229 string column; // sql column name
230 BasicType_t type;
231 size_t num;
232// double offset;
233 void *ptr;
234
235 Container(const string &b, const string &c, const BasicType_t &t, const size_t &n/*, const double &offset=0*/) : branch(b), column(c), type(t), num(n), ptr(0)
236 {
237 }
238 ~Container()
239 {
240 }
241
242 string fmt(const size_t &index) const
243 {
244 ostringstream str;
245
246 switch (type)
247 {
248 //case kVarchar: str << string(reinterpret_cast<char*>(ptr), num); break;
249 case kVarchar: str << string(reinterpret_cast<char*>(ptr), num).c_str(); break;
250 case kFloat: str << setprecision(8) << reinterpret_cast<float*>(ptr)[index]; break;
251// case kMJD: str << setprecision(16) << reinterpret_cast<double*>(ptr)[0]+offset; break;
252 case kDouble: str << setprecision(16) << reinterpret_cast<double*>(ptr)[index]; break;
253 case kBool:
254 case kInt8: str << int32_t(reinterpret_cast<int8_t*>(ptr)[index]); break;
255 case kUInt8: str << uint32_t(reinterpret_cast<uint8_t*>(ptr)[index]); break;
256 case kInt16: str << reinterpret_cast<int16_t*>(ptr)[index]; break;
257 case kUInt16: str << reinterpret_cast<uint16_t*>(ptr)[index]; break;
258 case kInt32: str << reinterpret_cast<int32_t*>(ptr)[index]; break;
259 case kUInt32: str << reinterpret_cast<uint32_t*>(ptr)[index]; break;
260 case kInt64: str << reinterpret_cast<int64_t*>(ptr)[index]; break;
261 case kUInt64: str << reinterpret_cast<uint64_t*>(ptr)[index]; break;
262 case kNone:
263 break;
264 }
265
266 return str.str();
267 }
268};
269
270int main(int argc, const char* argv[])
271{
272 Time start;
273
274 Configuration conf(argv[0]);
275 conf.SetPrintUsage(PrintUsage);
276 SetupConfiguration(conf);
277
278 if (!conf.DoParse(argc, argv))
279 return 127;
280
281 // ----------------------------- Evaluate options --------------------------
282 const string uri = conf.Get<string>("uri");
283 const string file = conf.Get<string>("file");
284 const string extension = conf.Get<string>("extension");
285 string table = conf.Get<string>("table");
286
287 const uint16_t verbose = conf.Get<uint16_t>("verbose");
288 const int64_t first = conf.Get<int64_t>("first");
289 const int64_t max = conf.Get<int64_t>("max");
290
291 const bool force = conf.Get<bool>("force");
292 const bool drop = conf.Get<bool>("drop");
293 const bool create = conf.Get<bool>("create") || drop;
294 const bool noinsert = conf.Get<bool>("no-insert");
295 const bool dry_run = conf.Get<bool>("dry-run");
296
297 const string engine = conf.Get<string>("engine");
298 const string row_format = conf.Get<string>("row-format");
299
300 const bool ignore_errors = conf.Get<bool>("ignore-errors");
301
302 const bool print_extensions = conf.Get<bool>("print-extensions");
303 const bool print_columns = conf.Get<bool>("print-columns");
304 const bool print_create = conf.Get<bool>("print-create");
305 const bool print_insert = conf.Get<bool>("print-insert");
306
307 const vector<Map> mymap = conf.Vec<Map>("map");
308 const vector<Map> sqltypes = conf.Vec<Map>("sql-type");
309 const vector<string> _ignore = conf.Vec<string>("ignore");
310 const vector<string> primary = conf.Vec<string>("primary");
311
312 const vactor<string> duplicate = conf.Vec<string>("duplicate");
313 const vector<string> notsigned = conf.Vec<string>("unsigned");
314
315 // -------------------------------------------------------------------------
316
317 if (verbose>0)
318 cout << "\n-------------------------- Evaluating file -------------------------" << endl;
319
320 zfits f(file.c_str(), extension.c_str());
321 if (!f)
322 {
323 cerr << "Could not open file " << file << ": " << strerror(errno) << endl;
324 return 1;
325 }
326
327 if (verbose>0)
328 cout << "File: " << file << endl;
329
330 if (!extension.empty() && extension!=f.Get<string>("EXTNAME"))
331 {
332 cerr << "Extension " << extension << " not found in file." << endl;
333 return 2;
334 }
335
336 if (print_extensions)
337 {
338 cout << "\nTables:\n - " << boost::join(f.GetTables(), "\n - ") << '\n' << endl;
339 return 3;
340 }
341
342 if (verbose>0)
343 cout << "FITS extension [table]: " << f.Get<string>("EXTNAME") << endl;
344
345 if (table.empty())
346 table = f.Get<string>("EXTNAME");
347
348 if (verbose>0)
349 cout << "SQL table: " << table << endl;
350
351// const double mjdref = f.Get("MJDREF", double(0));
352
353 if (print_columns)
354 {
355 cout << '\n';
356 f.PrintColumns();
357 cout << '\n';
358 }
359
360 const auto cols = f.GetColumns();
361
362 if (verbose>0)
363 {
364 cout << f.GetNumRows() << " events found." << endl;
365 cout << cols.size() << " columns found." << endl;
366 }
367
368 string query =
369 "CREATE TABLE IF NOT EXISTS `"+table+"`\n"
370 "(\n";
371
372 vector<Container> vec;
373
374 for (const auto &ic : cols)
375 {
376 const auto &col = ic.second;
377
378 if (verbose>2)
379 cout << '\n' << col.type << " " << ic.first << "[" << col.num << "]";
380
381 string name = ic.first;
382
383 bool found = false;
384 for (auto b=_ignore.cbegin(); b!=_ignore.cend(); b++)
385 {
386 if (boost::regex_match(name, boost::regex(*b)))
387 {
388 found = true;
389 if (verbose>2)
390 cout << " (-ignored-)";
391 break;
392 }
393 }
394 if (found)
395 continue;
396
397 const char tn = find(notsigned.cbegin(), notsigned.cend(), ic.first)!=notsigned.cend() ?
398 tolower(col.type) : toupper(col.type);
399
400 auto it = ConvFits.find(tn);
401 if (it==ConvFits.end())
402 {
403 if (verbose>2)
404 cout << " (-n/a-)";
405 continue;
406 }
407
408 if (verbose==2)
409 cout << '\n' << name << " [" << tn << "]";
410
411 for (auto m=mymap.cbegin(); m!=mymap.cend(); m++)
412 name = boost::regex_replace(name, boost::regex(m->first), m->second);
413
414 if (verbose>1)
415 cout << " (" << name << ")";
416
417 string sqltype = it->second.second;
418
419 for (auto m=sqltypes.cbegin(); m!=sqltypes.cend(); m++)
420 if (m->first==name)
421 sqltype = m->second;
422
423 if (!vec.empty())
424 query += ",\n";
425
426 const size_t N = col.type=='A' ? 1 : col.num;
427 for (int i=0; i<N; i++)
428 {
429 query += " `"+name;
430 if (N>1)
431 query += "["+to_string(i)+"]";
432 query += "` "+sqltype;
433 if (col.type=='A')
434 query += '('+to_string(col.num)+')';
435 query += " NOT NULL COMMENT '"+ic.first;
436 if (!col.unit.empty())
437 query += "["+col.unit+"]";
438 if (!col.comment.empty())
439 query += ": "+col.comment;
440 query += +"'";
441 if (N>1 && i!=N-1)
442 query += ",\n";
443 }
444
445 const BasicType_t bt =
446 /*ic.first=="Time" && col.num==1 && col.unit=="MJD" && it->second.first==kDouble ?
447 kMJD :*/ it->second.first;
448
449 vec.emplace_back(ic.first, name, bt, col.num/*, mjdref*/);
450 vec.back().ptr = f.SetPtrAddress(ic.first);
451 }
452
453 if (verbose>1)
454 cout << "\n\n";
455 if (verbose>0)
456 cout << vec.size() << " columns setup for reading." << endl;
457
458 // -------------------------------------------------------------------------
459 // Checking for database connection
460
461 Database connection(uri);
462
463 try
464 {
465 if (!force)
466 connection.connected();
467 }
468 catch (const exception &e)
469 {
470 cerr << "SQL connection failed: " << e.what() << '\n' << endl;
471 return 4;
472 }
473
474 // -------------------------------------------------------------------------
475
476 if (verbose>0)
477 cout << "\n--------------------------- Database Table -------------------------" << endl;
478
479 if (!primary.empty())
480 query += ",\n PRIMARY KEY USING BTREE (`"+boost::algorithm::join(primary, "`, `")+"`)\n";
481
482 query +=
483 ")\n"
484 "DEFAULT CHARSET=latin1 COLLATE=latin1_general_ci\n";
485 if (!engine.empty())
486 query += "ENGINE="+engine+"\n";
487 if (!row_format.empty())
488 query += "ROW_FORMAT="+row_format+"\n";
489 query += "COMMENT='created by "+conf.GetName()+"'\n";
490
491
492 // FIXME: Can we omit the catching to be able to print the
493 // query 'autmatically'?
494 try
495 {
496 if (drop)
497 {
498 // => Simple result
499 if (!dry_run)
500 connection.query("DROP TABLE `"+table+"`").execute();
501 if (verbose>0)
502 cout << "Table `" << table << "` dropped." << endl;
503 }
504 }
505 catch (const exception &e)
506 {
507 cerr << "DROP TABLE `" << table << "`\n\n";
508 cerr << "SQL query failed:\n" << e.what() << '\n' << endl;
509 return 5;
510 }
511
512 try
513 {
514 if (create && !dry_run)
515 connection.query(query).execute();
516 }
517 catch (const exception &e)
518 {
519 cerr << query << "\n\n";
520 cerr << "SQL query failed:\n" << e.what() << '\n' << endl;
521 return 6;
522 }
523
524 if (print_create)
525 cout << query << endl;
526
527 if (create && verbose>0)
528 cout << "Table `" << table << "` created." << endl;
529
530 // -------------------------------------------------------------------------
531
532 if (verbose>0)
533 cout << "\n---------------------------- Reading file --------------------------" << endl;
534
535 //query = update ? "UPDATE" : "INSERT";
536 query = "INSERT ";
537 if (ignore_errors)
538 query += "IGNORE ";
539 query += "`"+table+"`\n"
540 "(\n";
541
542 for (auto c=vec.cbegin(); c!=vec.cend(); c++)
543 {
544 const size_t N = c->type==kVarchar ? 1 : c->num;
545 for (int i=0; i<N; i++)
546 {
547 if (c!=vec.cbegin())
548 query += ",\n";
549
550 if (N==1)
551 query += " `"+c->column+"`";
552 else
553 query += " `"+c->column+"["+to_string(i)+"]`";
554
555 if (N>1 && i!=N-1)
556 query += ",\n";
557 }
558 }
559
560 query +=
561 "\n)\n"
562 "VALUES\n";
563
564 size_t count = 0;
565
566 const size_t num = max>0 && (max-first)<f.GetNumRows() ? (max-first) : f.GetNumRows();
567 for (size_t j=first; j<num; j++)
568 {
569 f.GetRow(j);
570
571 if (count>0)
572 query += ",\n";
573
574 query += "(\n";
575
576 for (auto c=vec.cbegin(); c!=vec.cend(); c++)
577 {
578 const size_t N = c->type==kVarchar ? 1 : c->num;
579 for (int i=0; i<N; i++)
580 {
581 if (c!=vec.cbegin())
582 query += ",\n";
583
584 if (c->type==kVarchar)
585 query += " '"+c->fmt(i)+"'";
586 else
587 query += " "+c->fmt(i);
588
589 if (print_insert && i==0)
590 query += " /* "+c->column+" -> "+c->branch+" */";
591
592 if (N>1 && i!=N-1)
593 query += ",\n";
594 }
595 }
596 query += "\n)";
597
598 count ++;
599 }
600
601 if (!duplicate.empty())
602 query += "\nON DUPLICATE KEY UPDATE\n " + boost::join(duplicate, ",\n ");
603
604 if (verbose>0)
605 cout << count << " out of " << num << " row(s) read from file [N=" << first << ".." << num-1 << "]." << endl;
606
607 if (count==0)
608 {
609 if (verbose>0)
610 cout << "Total execution time: " << Time().UnixTime()-start.UnixTime() << "s\n" << endl;
611 return 0;
612 }
613
614 // -------------------------------------------------------------------------
615
616 if (verbose>0)
617 {
618 cout << "\n--------------------------- Inserting data -------------------------" << endl;
619 cout << "Sending INSERT query (" << query.length() << " bytes)" << endl;
620 }
621
622 try
623 {
624 if (!noinsert && !dry_run)
625 {
626 auto q = connection.query(query);
627 q.execute();
628 cout << q.info() << '\n' << endl;
629 }
630 else
631 cout << "Insert query skipped!" << endl;
632
633 if (print_insert)
634 cout << query << endl;
635 }
636 catch (const exception &e)
637 {
638 if (verbose>1 || query.length()<80*25)
639 cerr << query << "\n\n";
640 cerr << "SQL query failed (" << query.length() << " bytes):\n" << e.what() << '\n' << endl;
641 return 7;
642 }
643
644 if (verbose>0)
645 {
646 const auto sec = Time().UnixTime()-start.UnixTime();
647 cout << "Total execution time: " << sec << "s ";
648 cout << "(" << Tools::Fractional(sec/count) << "s/row)\n";
649
650 try
651 {
652 const auto resw =
653 connection.query("SHOW WARNINGS").store();
654
655 for (size_t i=0; i<resw.num_rows(); i++)
656 {
657 const mysqlpp::Row &roww = resw[i];
658
659 cout << roww["Level"] << '[' << roww["Code"] << "]: ";
660 cout << roww["Message"] << '\n';
661 }
662 cout << endl;
663
664 }
665 catch (const exception &e)
666 {
667 cerr << "\nSHOW WARNINGS\n\n";
668 cerr << "SQL query failed:\n" << e.what() << '\n' <<endl;
669 return 8;
670 }
671 }
672
673 return 0;
674}
Note: See TracBrowser for help on using the repository browser.