source: trunk/FACT++/src/fits2sql.cc@ 19109

Last change on this file since 19109 was 19109, checked in by tbretz, 6 years ago
Implemented unisgned columns, row-format and now using the database default engine if not specified.
File size: 23.4 KB
Line 
1#include <boost/algorithm/string/join.hpp>
2#include <boost/regex.hpp>
3
4#include "Database.h"
5
6#include "Time.h"
7#include "Configuration.h"
8
9#include "zfits.h"
10
11using namespace std;
12
13// ------------------------------------------------------------------------
14
15struct Map : pair<string, string>
16{
17 Map() { }
18};
19
20std::istream &operator>>(std::istream &in, Map &m)
21{
22 const istreambuf_iterator<char> eos;
23 string txt(istreambuf_iterator<char>(in), eos);
24
25 const boost::regex expr("((.*)[^\\\\])/(.*)");
26 boost::smatch match;
27 if (!boost::regex_match(txt, match, expr))
28 throw runtime_error("Could not evaluate map argument: "+txt);
29
30 m.first = match[1].str();
31 m.second = match[3].str();
32
33 return in;
34}
35
36
37void SetupConfiguration(Configuration &conf)
38{
39 po::options_description control("Root to SQL");
40 control.add_options()
41 ("uri,u", var<string>()
42#if BOOST_VERSION >= 104200
43 ->required()
44#endif
45 , "Database link as in\n\tuser:password@server[:port]/database.")
46 ("file", var<string>("")
47#if BOOST_VERSION >= 104200
48 ->required()
49#endif
50 , "The root file to read from")
51 ("force", po_switch(), "Force processing even if there is no database connection")
52 ("create", po_switch(), "Create the database if not existing")
53 ("drop", po_switch(), "Drop the table (implies create)")
54 ("extension,e", var<string>(""), "Name of the fits extension (table) to convert")
55 ("table", var<string>(""), "Name of the table to use (default is the tree name)")
56 ("map", vars<Map>(), "A regular expression which is applied to the leaf name befoee it is used as SQL column name)")
57 ("sql-type", vars<Map>(), "Allows to overwrite the calculated SQL type for a given column e.g. 'sql-column-name/UNSIGNED IN'")
58 ("ignore", vars<string>(), "Ignore the given leaf, if the given regular expression matches")
59 ("unsigned", vars<string>(), "In fits files per default columns are signed. This interpretss the column as unsigned value. Use the FITS column name.")
60 ("primary", vars<string>(), "List of columns to be used as primary keys during table creation (in connection with --create)")
61 ("first", var<int64_t>(int64_t(0)), "First event to start with (default: 0), mainly for test purpose")
62 ("max", var<int64_t>(int64_t(0)), "Maximum number of events to process (0: all), mainly for test purpose")
63 ("engine", var<string>(""), "Database engine to be used when a new table is created")
64 ("row-format", var<string>(""), "Defines the ROW_FORMAT keyword for table creation")
65 ("duplicate", var<string>(""), "Specifies an assignment_list for an 'ON DUPLICATE KEY UPDATE' expression")
66 ("ignore-errors", po_switch(), "Adds the IGNORE keyword to the INSERT query (turns errors into warnings, ignores rows with errors)")
67 ;
68
69 po::options_description debug("Debug options");
70 debug.add_options()
71 ("no-insert", po_switch(), "Does not insert any data into the table")
72 ("dry-run", po_switch(), "Skip any query which changes the databse (might result in consecutive failures)")
73 ("print-extensions", po_switch(), "Print extensions (tables) from fits file")
74 ("print-columns", po_switch(), "Print columns in fits table")
75 ("print-insert", po_switch(), "Print the INSERT query (note that it contains all data)")
76 ("print-create", po_switch(), "Print the CREATE query")
77 ("verbose,v", var<uint16_t>(1), "Verbosity (0: quiet, 1: default, 2: more, 3, ...)")
78 ;
79
80 po::positional_options_description p;
81 p.add("file", 1); // The 1st positional options (n=1)
82
83 conf.AddOptions(control);
84 conf.AddOptions(debug);
85 conf.SetArgumentPositions(p);
86}
87
88void PrintUsage()
89{
90 cout <<
91 "fits2sql - Fills the data from a fits file into a database\n"
92 "\n"
93 "For convenience, this documentation uses the extended version of the options, "
94 "refer to the output below to get the abbreviations.\n"
95 "\n"
96 "This is a general purpose tool to fill the contents of a fite file into a database.\n"
97 "\n"
98 "Each fits file contians several table, so called extenbsions. Each tables has "
99 "a number of columns compilsed from basic data types. The default extension to "
100 "read from is the first one on the file but the name can be overwritten using "
101 "--extension. The default table name to fill the data into is identical to "
102 "the extension name. It can be overwritten using --table.\n"
103 "\n"
104 "The name of each column to which data is filled is obtained from "
105 "the fits column names. The column names can be checked using --print-columns. "
106 "Sometimes these names might not be convenient. To allow to simplify or replace "
107 "column names, regular expressions (using boost's regex) can be defined to change "
108 "the names. Note that these regular expressions are applied one by one on each "
109 "columns's name. A valid expression could "
110 "be:\n"
111 " --map=MHillas\\.f/\n"
112 "which would remove all occurances of 'MHillas.f'. This option can be used more than "
113 "once. They are applied in sequence. A single match does not stop the sequence.\n"
114 "\n"
115 "Sometimes it might also be convenient to skip a column. This can be done with "
116 "the --ignore resource. If the given regular expresion yields a match, the "
117 "column will be ignored. Note that the regular expression works on the raw-name "
118 "of the column not the readily mapped SQL column names. Example:\n"
119 " --ignore=ThetaSq\\..*\n"
120 "will skip all leaved which start with 'ThetaSq.'. This option can be used"
121 "more than once.\n"
122 "\n"
123 "The data type of each column is kept as close as possible to the columns' data "
124 "types. If for some reason this is not wanted, the data type of the SQL column "
125 "can be overwritten with --sql-type sql-column/sql-ytpe, for example:\n"
126 " --sql-type=FileId/UNSIGNED INT\n"
127 "while the first argument of the name of the SQL column to which the data type "
128 "should be applied. The second column is the basic SQL data type. The option can "
129 "be given more than once.\n"
130 "\n"
131 "Database interaction:\n"
132 "\n"
133 "To drop an existing table, --drop can be used.\n"
134 "\n"
135 "To create a table according to theSQL column names and data types, --create "
136 "can be used. The query used can be printed with --print-create even --create "
137 "has not been specified.\n"
138 "\n"
139 "To choose the columns which should become primary keys, use --primary, "
140 "for exmaple:\n"
141 " --primary=col1\n"
142 "To define more than one column as primary key, the option can be given more than "
143 "once. Note that the combination of these columns must be unique.\n"
144 "\n"
145 "All columns are created as NOT NULL as default. To force a database engine "
146 "and/or a storage format, use --engine and --rot-format.\n"
147 "\n"
148 "Usually, the INSERT query would fail if the PRIMARY key exists already. "
149 "This can be avoided using the 'ON DUPLICATE KEY' directive. With the "
150 "--duplicate,you can specify what should be updated in case of a duplicate key. "
151 "To keep the row untouched, you can just update the primary key "
152 "with the identical primary key, e.g. --duplicate='MyPrimary=VALUES(MyPrimary)'. "
153 "For more details, see the MySQL manual.\n"
154 "\n"
155 "Another possibility is to add the IGNORE keyword to the INSERT query by "
156 "--ignore-errors, which essentially ignores all errors and turns them into "
157 "warnings which are printed after the query succeeded.\n"
158 "\n"
159 "For debugging purpose, or to just create or drop a table, the final insert "
160 "query can be skipped using --no-insert. Note that for performance reason, "
161 "all data is collected in memory and a single INSERT query is issued at the "
162 "end.\n"
163 "\n"
164 "Using a higher verbosity level (-v), an overview of the written columns or all "
165 "processed leaves is printed depending on the verbosity level. The output looks "
166 "like the following\n"
167 " Leaf name [root data type] (SQL name)\n"
168 "for example\n"
169 " MTime.fTime.fMilliSec [Long64_t] (MilliSec)\n"
170 "which means that the leaf MTime.fTime.fMilliSec is detected to be a Long64_t "
171 "which is filled into a column called MilliSec. Leaves with non basic data types "
172 "are ignored automatically and are marked as (-n/a-). User ignored columnd "
173 "are marked as (-ignored-).\n"
174 "\n"
175 "If a query failed, the query is printed to stderr together with the error message. "
176 "For the main INSERT query, this is only true if the verbosity level is at least 2 "
177 "or the query has less than 80*25 bytes.\n"
178 "\n"
179 "In case of succes, 0 is returned, a value>0 otherwise.\n"
180 "\n"
181 "Usage: fits2sql [options] --uri URI fitsfile.fits[.gz]\n"
182 "\n"
183 ;
184 cout << endl;
185}
186
187enum BasicType_t
188{
189 kNone = 0,
190 kVarchar,
191 kBool,
192 kFloat,
193 kDouble,
194 kInt8,
195 kUInt8,
196 kInt16,
197 kUInt16,
198 kInt32,
199 kUInt32,
200 kInt64,
201 kUInt64,
202// kMJD,
203};
204
205static const map<char, pair<BasicType_t, string>> ConvFits =
206{
207 { 'A', { kVarchar, "VARCHAR" } },
208 { 'a', { kVarchar, "VARCHAR" } },
209 { 'L', { kBool, "BOOLEAN" } },
210 { 'l', { kBool, "BOOLEAN" } },
211 { 'B', { kInt8, "TINYINT" } },
212 { 'b', { kUInt8, "TINYINT UNSIGNED" } },
213 { 'I', { kInt16, "SMALLINT" } },
214 { 'i', { kUInt16, "SMALLINT UNSIGNED" } },
215 { 'J', { kInt32, "INT" } },
216 { 'j', { kUInt32, "INT UNSIGNED" } },
217 { 'K', { kInt64, "BIGINT" } },
218 { 'k', { kUInt64, "BIGINT UNSIGNED" } },
219 { 'E', { kFloat, "FLOAT" } },
220 { 'e', { kFloat, "FLOAT" } },
221 { 'D', { kDouble, "DOUBLE" } },
222 { 'd', { kDouble, "DOUBLE" } },
223};
224
225struct Container
226{
227 string branch; // fits column name
228 string column; // sql column name
229 BasicType_t type;
230 size_t num;
231// double offset;
232 void *ptr;
233
234 Container(const string &b, const string &c, const BasicType_t &t, const size_t &n/*, const double &offset=0*/) : branch(b), column(c), type(t), num(n), ptr(0)
235 {
236 }
237 ~Container()
238 {
239 }
240
241 string fmt(const size_t &index) const
242 {
243 ostringstream str;
244
245 switch (type)
246 {
247 //case kVarchar: str << string(reinterpret_cast<char*>(ptr), num); break;
248 case kVarchar: str << string(reinterpret_cast<char*>(ptr), num).c_str(); break;
249 case kFloat: str << setprecision(8) << reinterpret_cast<float*>(ptr)[index]; break;
250// case kMJD: str << setprecision(16) << reinterpret_cast<double*>(ptr)[0]+offset; break;
251 case kDouble: str << setprecision(16) << reinterpret_cast<double*>(ptr)[index]; break;
252 case kBool:
253 case kInt8: str << int32_t(reinterpret_cast<int8_t*>(ptr)[index]); break;
254 case kUInt8: str << uint32_t(reinterpret_cast<uint8_t*>(ptr)[index]); break;
255 case kInt16: str << reinterpret_cast<int16_t*>(ptr)[index]; break;
256 case kUInt16: str << reinterpret_cast<uint16_t*>(ptr)[index]; break;
257 case kInt32: str << reinterpret_cast<int32_t*>(ptr)[index]; break;
258 case kUInt32: str << reinterpret_cast<uint32_t*>(ptr)[index]; break;
259 case kInt64: str << reinterpret_cast<int64_t*>(ptr)[index]; break;
260 case kUInt64: str << reinterpret_cast<uint64_t*>(ptr)[index]; break;
261 case kNone:
262 break;
263 }
264
265 return str.str();
266 }
267};
268
269int main(int argc, const char* argv[])
270{
271 Time start;
272
273 Configuration conf(argv[0]);
274 conf.SetPrintUsage(PrintUsage);
275 SetupConfiguration(conf);
276
277 if (!conf.DoParse(argc, argv))
278 return 127;
279
280 // ----------------------------- Evaluate options --------------------------
281 const string uri = conf.Get<string>("uri");
282 const string file = conf.Get<string>("file");
283 const string extension = conf.Get<string>("extension");
284 string table = conf.Get<string>("table");
285
286 const uint16_t verbose = conf.Get<uint16_t>("verbose");
287 const int64_t first = conf.Get<int64_t>("first");
288 const int64_t max = conf.Get<int64_t>("max");
289
290 const bool force = conf.Get<bool>("force");
291 const bool drop = conf.Get<bool>("drop");
292 const bool create = conf.Get<bool>("create") || drop;
293 const bool noinsert = conf.Get<bool>("no-insert");
294 const bool dry_run = conf.Get<bool>("dry-run");
295
296 const string engine = conf.Get<string>("engine");
297 const string row_format = conf.Get<string>("row-format");
298 const string duplicate = conf.Get<string>("duplicate");
299
300 const bool ignore_errors = conf.Get<bool>("ignore-errors");
301
302 const bool print_extensions = conf.Get<bool>("print-extensions");
303 const bool print_columns = conf.Get<bool>("print-columns");
304 const bool print_create = conf.Get<bool>("print-create");
305 const bool print_insert = conf.Get<bool>("print-insert");
306
307 const vector<Map> mymap = conf.Vec<Map>("map");
308 const vector<Map> sqltypes = conf.Vec<Map>("sql-type");
309 const vector<string> _ignore = conf.Vec<string>("ignore");
310 const vector<string> primary = conf.Vec<string>("primary");
311
312 const vector<string> notsigned = conf.Vec<string>("unsigned");
313
314 // -------------------------------------------------------------------------
315
316 if (verbose>0)
317 cout << "\n-------------------------- Evaluating file -------------------------" << endl;
318
319 zfits f(file.c_str(), extension.c_str());
320 if (!f)
321 {
322 cerr << "Could not open file " << file << ": " << strerror(errno) << endl;
323 return 1;
324 }
325
326 if (verbose>0)
327 cout << "File: " << file << endl;
328
329 if (!extension.empty() && extension!=f.Get<string>("EXTNAME"))
330 {
331 cerr << "Extension " << extension << " not found in file." << endl;
332 return 2;
333 }
334
335 if (print_extensions)
336 {
337 cout << "\nTables:\n - " << boost::join(f.GetTables(), "\n - ") << '\n' << endl;
338 return 3;
339 }
340
341 if (verbose>0)
342 cout << "FITS extension [table]: " << f.Get<string>("EXTNAME") << endl;
343
344 if (table.empty())
345 table = f.Get<string>("EXTNAME");
346
347 if (verbose>0)
348 cout << "SQL table: " << table << endl;
349
350// const double mjdref = f.Get("MJDREF", double(0));
351
352 if (print_columns)
353 {
354 cout << '\n';
355 f.PrintColumns();
356 cout << '\n';
357 }
358
359 const auto cols = f.GetColumns();
360
361 if (verbose>0)
362 {
363 cout << f.GetNumRows() << " events found." << endl;
364 cout << cols.size() << " columns found." << endl;
365 }
366
367 string query =
368 "CREATE TABLE IF NOT EXISTS `"+table+"`\n"
369 "(\n";
370
371 vector<Container> vec;
372
373 for (const auto &ic : cols)
374 {
375 const auto &col = ic.second;
376
377 if (verbose>2)
378 cout << '\n' << col.type << " " << ic.first << "[" << col.num << "]";
379
380 string name = ic.first;
381
382 bool found = false;
383 for (auto b=_ignore.cbegin(); b!=_ignore.cend(); b++)
384 {
385 if (boost::regex_match(name, boost::regex(*b)))
386 {
387 found = true;
388 if (verbose>2)
389 cout << " (-ignored-)";
390 break;
391 }
392 }
393 if (found)
394 continue;
395
396 const char tn = find(notsigned.cbegin(), notsigned.cend(), ic.first)!=notsigned.cend() ?
397 tolower(col.type) : toupper(col.type);
398
399 auto it = ConvFits.find(tn);
400 if (it==ConvFits.end())
401 {
402 if (verbose>2)
403 cout << " (-n/a-)";
404 continue;
405 }
406
407 if (verbose==2)
408 cout << '\n' << name << " [" << tn << "]";
409
410 for (auto m=mymap.cbegin(); m!=mymap.cend(); m++)
411 name = boost::regex_replace(name, boost::regex(m->first), m->second);
412
413 if (verbose>1)
414 cout << " (" << name << ")";
415
416 string sqltype = it->second.second;
417
418 for (auto m=sqltypes.cbegin(); m!=sqltypes.cend(); m++)
419 if (m->first==name)
420 sqltype = m->second;
421
422 if (!vec.empty())
423 query += ",\n";
424
425 const size_t N = col.type=='A' ? 1 : col.num;
426 for (int i=0; i<N; i++)
427 {
428 query += " `"+name;
429 if (N>1)
430 query += "["+to_string(i)+"]";
431 query += "` "+sqltype;
432 if (col.type=='A')
433 query += '('+to_string(col.num)+')';
434 query += " NOT NULL COMMENT '"+ic.first;
435 if (!col.unit.empty())
436 query += "["+col.unit+"]";
437 if (!col.comment.empty())
438 query += ": "+col.comment;
439 query += +"'";
440 if (N>1 && i!=N-1)
441 query += ",\n";
442 }
443
444 const BasicType_t bt =
445 /*ic.first=="Time" && col.num==1 && col.unit=="MJD" && it->second.first==kDouble ?
446 kMJD :*/ it->second.first;
447
448 vec.emplace_back(ic.first, name, bt, col.num/*, mjdref*/);
449 vec.back().ptr = f.SetPtrAddress(ic.first);
450 }
451
452 if (verbose>1)
453 cout << "\n\n";
454 if (verbose>0)
455 cout << vec.size() << " columns setup for reading." << endl;
456
457 // -------------------------------------------------------------------------
458 // Checking for database connection
459
460 Database connection(uri);
461
462 try
463 {
464 if (!force)
465 connection.connected();
466 }
467 catch (const exception &e)
468 {
469 cerr << "SQL connection failed: " << e.what() << '\n' << endl;
470 return 4;
471 }
472
473 // -------------------------------------------------------------------------
474
475 if (verbose>0)
476 cout << "\n--------------------------- Database Table -------------------------" << endl;
477
478 if (!primary.empty())
479 query += ",\n PRIMARY KEY USING BTREE (`"+boost::algorithm::join(primary, "`, `")+"`)\n";
480
481 query +=
482 ")\n"
483 "DEFAULT CHARSET=latin1 COLLATE=latin1_general_ci\n";
484 if (!engine.empty())
485 query += "ENGINE="+engine+"\n";
486 if (!row_format.empty())
487 query += "ROW_FORMAT="+row_format+"\n";
488 query += "COMMENT='created by "+conf.GetName()+"'\n";
489
490
491 // FIXME: Can we omit the catching to be able to print the
492 // query 'autmatically'?
493 try
494 {
495 if (drop)
496 {
497 // => Simple result
498 if (!dry_run)
499 connection.query("DROP TABLE `"+table+"`").execute();
500 if (verbose>0)
501 cout << "Table `" << table << "` dropped." << endl;
502 }
503 }
504 catch (const exception &e)
505 {
506 cerr << "DROP TABLE `" << table << "`\n\n";
507 cerr << "SQL query failed:\n" << e.what() << '\n' << endl;
508 return 5;
509 }
510
511 try
512 {
513 if (create && !dry_run)
514 connection.query(query).execute();
515 }
516 catch (const exception &e)
517 {
518 cerr << query << "\n\n";
519 cerr << "SQL query failed:\n" << e.what() << '\n' << endl;
520 return 6;
521 }
522
523 if (print_create)
524 cout << query << endl;
525
526 if (create && verbose>0)
527 cout << "Table `" << table << "` created." << endl;
528
529 // -------------------------------------------------------------------------
530
531 if (verbose>0)
532 cout << "\n---------------------------- Reading file --------------------------" << endl;
533
534 //query = update ? "UPDATE" : "INSERT";
535 query = "INSERT ";
536 if (ignore_errors)
537 query += "IGNORE ";
538 query += "`"+table+"`\n"
539 "(\n";
540
541 for (auto c=vec.cbegin(); c!=vec.cend(); c++)
542 {
543 const size_t N = c->type==kVarchar ? 1 : c->num;
544 for (int i=0; i<N; i++)
545 {
546 if (c!=vec.cbegin())
547 query += ",\n";
548
549 if (N==1)
550 query += " `"+c->column+"`";
551 else
552 query += " `"+c->column+"["+to_string(i)+"]`";
553
554 if (N>1 && i!=N-1)
555 query += ",\n";
556 }
557 }
558
559 query +=
560 "\n)\n"
561 "VALUES\n";
562
563 size_t count = 0;
564
565 const size_t num = max>0 && (max-first)<f.GetNumRows() ? (max-first) : f.GetNumRows();
566 for (size_t j=first; j<num; j++)
567 {
568 f.GetRow(j);
569
570 if (count>0)
571 query += ",\n";
572
573 query += "(\n";
574
575 for (auto c=vec.cbegin(); c!=vec.cend(); c++)
576 {
577 const size_t N = c->type==kVarchar ? 1 : c->num;
578 for (int i=0; i<N; i++)
579 {
580 if (c!=vec.cbegin())
581 query += ",\n";
582
583 if (c->type==kVarchar)
584 query += " '"+c->fmt(i)+"'";
585 else
586 query += " "+c->fmt(i);
587
588 if (print_insert && i==0)
589 query += " /* "+c->column+" -> "+c->branch+" */";
590
591 if (N>1 && i!=N-1)
592 query += ",\n";
593 }
594 }
595 query += "\n)";
596
597 count ++;
598 }
599
600 if (!duplicate.empty())
601 query += "\nON DUPLICATE KEY UPDATE\n " + duplicate;
602
603 if (verbose>0)
604 cout << count << " out of " << num << " row(s) read from file [N=" << first << ".." << num-1 << "]." << endl;
605
606 if (count==0)
607 {
608 if (verbose>0)
609 cout << "Total execution time: " << Time().UnixTime()-start.UnixTime() << "s\n" << endl;
610 return 0;
611 }
612
613 // -------------------------------------------------------------------------
614
615 if (verbose>0)
616 {
617 cout << "\n--------------------------- Inserting data -------------------------" << endl;
618 cout << "Sending INSERT query (" << query.length() << " bytes)" << endl;
619 }
620
621 try
622 {
623 if (!noinsert && !dry_run)
624 {
625 auto q = connection.query(query);
626 q.execute();
627 cout << q.info() << '\n' << endl;
628 }
629 else
630 cout << "Insert query skipped!" << endl;
631
632 if (print_insert)
633 cout << query << endl;
634 }
635 catch (const exception &e)
636 {
637 if (verbose>1 || query.length()<80*25)
638 cerr << query << "\n\n";
639 cerr << "SQL query failed (" << query.length() << " bytes):\n" << e.what() << '\n' << endl;
640 return 7;
641 }
642
643 if (verbose>0)
644 {
645 cout << "Total execution time: " << Time().UnixTime()-start.UnixTime() << "s\n" << endl;
646
647 try
648 {
649 const auto resw =
650 connection.query("SHOW WARNINGS").store();
651
652 for (size_t i=0; i<resw.num_rows(); i++)
653 {
654 const mysqlpp::Row &roww = resw[i];
655
656 cout << roww["Level"] << '[' << roww["Code"] << "]: ";
657 cout << roww["Message"] << '\n';
658 }
659 cout << endl;
660
661 }
662 catch (const exception &e)
663 {
664 cerr << "\nSHOW WARNINGS\n\n";
665 cerr << "SQL query failed:\n" << e.what() << '\n' <<endl;
666 return 8;
667 }
668 }
669
670 return 0;
671}
Note: See TracBrowser for help on using the repository browser.