Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: trunk/FACT++/src/csv2root.cc@ 19793

Visit:

Last change on this file since 19793 was 19793, checked in by tbretz, 5 years ago
Now also the opposite direction is possible.
File size: 17.8 KB

Line
1	#include <random>
2
3	#include <boost/regex.hpp>
4	#include <boost/filesystem.hpp>
5	#include <boost/algorithm/string/join.hpp>
6
7	#include "tools.h"
8	#include "Time.h"
9	#include "Configuration.h"
10
11	#include <TROOT.h>
12	#include <TSystem.h>
13	#include <TFile.h>
14	#include <TTree.h>
15	#include <TError.h>
16	#include <TObjArray.h>
17
18	using namespace std;
19	namespace fs = boost::filesystem;
20
21	// ------------------------------------------------------------------------
22
23	void SetupConfiguration(Configuration &conf)
24	{
25	po::options_description control("Root to SQL");
26	control.add_options()
27	("file", var<string>()->required(), "The csv input file")
28	("out,o", var<string>(""), "Output root file name")
29	("force,f", po_switch(), "Force overwrite if output file already exists.")
30	("update,u", po_switch(), "Update an existing file")
31	("tree,t", var<string>("Events"), "Name of the root tree to convert")
32	("compression,c", var<uint16_t>(1), "zlib compression level for the root file")
33	("no-header", po_switch(), "Use if the first line contains no header")
34	("dry-run", po_switch(), "Do not create or manipulate any output file")
35	;
36
37	po::options_description split("Splitting options");
38	split.add_options()
39	("split-sequence,S", vars<uint16_t>(), "Split data sequentially into several trees/files (e.g. 1, 1, 2)")
40	("split-quantile,Q", vars<double>(), "Split data randomly into several trees/files (e.g. 0.5, 1)")
41	("seed", var<uint64_t>(mt19937_64::default_seed), "Seed value in case of random split")
42	;
43
44	po::options_description debug("Debug options");
45	debug.add_options()
46	("verbose,v", var<uint16_t>(1), "Verbosity (0: quiet, 1: default, 2: more, 3, ...)")
47	;
48
49	po::positional_options_description p;
50	p.add("file", 1); // All positional options
51	p.add("out", 1); // All positional options
52
53	conf.AddOptions(control);
54	conf.AddOptions(split);
55	conf.AddOptions(debug);
56	conf.SetArgumentPositions(p);
57	}
58
// Prints the long help text of csv2root to stdout.
//
// The text describes the options registered in SetupConfiguration() and the
// way main() processes the input file. The output ends with an empty line
// and a flush of the stream.
void PrintUsage()
{
    std::cout <<
        "csv2root - Reads data from a csv file and writes it into a root tree\n"
        "\n"
        "For convenience, this documentation uses the extended version of the options, "
        "refer to the output below to get the abbreviations.\n"
        "\n"
        "The input file is read line by line. The columns of each line must be separated "
        "by spaces. Each column is converted to a floating point number and filled into "
        "its own branch of the tree. If the number of columns in a line differs from the "
        "number of columns in the first line or if a value cannot be converted, the "
        "execution is stopped.\n"
        "\n"
        "The first line of the file is expected to contain the column names which are "
        "used as branch names. If the first entry of this line starts with a '#', it is "
        "skipped. Colons are removed from the column names. If the file does not contain "
        "such a header line, use --no-header. In this case, the branches are named col0, "
        "col1, and so on.\n"
        "\n"
        "The default tree to write to is 'Events' but the name can be overwritten "
        "using --tree.\n"
        "\n"
        "The name of the output file is given by --out or as second positional argument. "
        "If it is omitted, it is derived from the name of the input file by replacing its "
        "extension with .root. An existing output file is not touched unless either "
        "--force (the file is overwritten) or --update (the file is opened for update and "
        "an already existing tree with the requested name is re-used) is specified. "
        "The compression level of the root file is set with --compression.\n"
        "\n"
        "Splitting:\n"
        "\n"
        "The rows can be distributed to several trees. In this case, the index of the "
        "tree in brackets is appended to the tree name, for example Events[0], Events[1].\n"
        "\n"
        "With --split-sequence, the rows are distributed sequentially. Each given number "
        "defines how many consecutive rows are filled into the corresponding tree before "
        "the next tree is filled. After the last tree, the sequence starts again with the "
        "first tree. For the sequence 1, 1, 2 the first row is filled into the first tree, "
        "the second row into the second tree and the third and fourth row into the third "
        "tree.\n"
        "\n"
        "With --split-quantile, the rows are distributed randomly. For each row a random "
        "number in the range [0;1) is drawn and compared with the given quantiles. The "
        "quantiles must be in the range [0;1) and in increasing order. N quantiles "
        "yield N+1 trees. For the quantiles 0.5, 0.8 about 50% of the rows are filled into "
        "the first tree, about 30% into the second tree and about 20% into the third "
        "tree. The seed of the random number generator can be set with --seed.\n"
        "\n"
        "Only one of --split-sequence and --split-quantile can be used at a time.\n"
        "\n"
        "Using a higher verbosity level (-v), the names of the created columns are "
        "printed.\n"
        "\n"
        "In case of success, 0 is returned, a value>0 otherwise.\n"
        "\n"
        "Usage: csv2root input.csv [output.root] [options]\n"
        "\n"
        ;
    std::cout << std::endl;
}
189
190	/*
191	void ErrorHandlerAll(Int_t level, Bool_t abort, const char *location, const char *msg)
192	{
193	if (string(msg).substr(0,24)=="no dictionary for class ")
194	return;
195	if (string(msg).substr(0,15)=="unknown branch ")
196	return;
197
198	DefaultErrorHandler(level, abort, location, msg);
199	}*/
200
201
202	void AddTree(vector<TTree*> &ttree, TFile &file, const string &tree, bool update, int verbose)
203	{
204	TTree *T = 0;
205	if (update)
206	{
207	file.GetObject(tree.c_str(), T);
208	if (T)
209	{
210	ttree.emplace_back(T);
211	if (verbose>0)
212	cout << "Updating tree: " << tree << endl;
213	}
214	}
215	if (!T)
216	ttree.emplace_back(new TTree(tree.c_str(), "csv2root"));
217	}
218
219	int main(int argc, const char* argv[])
220	{
221	Time start;
222
223	gROOT->SetBatch();
224	//SetErrorHandler(ErrorHandlerAll);
225
226	Configuration conf(argv[0]);
227	conf.SetPrintUsage(PrintUsage);
228	SetupConfiguration(conf);
229
230	if (!conf.DoParse(argc, argv))
231	return 127;
232
233	// ----------------------------- Evaluate options --------------------------
234	const string file = conf.Get<string>("file");
235	const string tree = conf.Get<string>("tree");
236
237	const bool force = conf.Get<bool>("force");
238	const bool update = conf.Get<bool>("update");
239	const bool dryrun = conf.Get<bool>("dry-run");
240	const bool noheader = conf.Get<bool>("no-header");
241
242	const uint16_t verbose = conf.Get<uint16_t>("verbose");
243	// const int64_t first = conf.Get<int64_t>("first");
244	// const int64_t max = conf.Get<int64_t>("max");
245
246	const uint16_t compression = conf.Get<uint16_t>("compression");
247
248	string out = conf.Get<string>("out");
249	if (out.empty())
250	{
251	out = file;
252	const auto p = out.find_last_of('.');
253	if (p!=string::npos)
254	out = string(out.substr(0, p))+".root";
255	}
256
257	// ----------------------------- Setup splitting ---------------------------
258
259	vector<uint16_t> split_seq = conf.Vec<uint16_t>("split-sequence");
260	vector<double> split_quant = conf.Vec<double>("split-quantile");
261
262	if (!split_seq.empty() && !split_quant.empty())
263	throw runtime_error("Only splitting by --split-sequence or --split-quantile is allowed.");
264
265	const size_t num_split = split_seq.size()+split_quant.size()==0 ? 0 :
266	::max(split_seq.size(), split_quant.size()+1);
267
268	map<size_t, size_t> split_lut;
269	for (size_t i=0; i<split_seq.size(); i++)
270	{
271	const size_t sz = split_lut.size();
272	for (size_t j=0; j<split_seq[i]; j++)
273	split_lut.emplace(j+sz, i);
274	}
275
276	for (size_t i=0; i<split_quant.size(); i++)
277	if (split_quant[i]<0 \|\| split_quant[i]>=1)
278	throw runtime_error("Splitting quantiles must be in the range [0;1)");
279
280	for (size_t i=1; i<split_quant.size(); i++)
281	{
282	if (split_quant[i]<=split_quant[i-1])
283	throw runtime_error("Splitting quantiles must be in increasing order.");
284	}
285
286	// -------------------------------------------------------------------------
287
288	const uniform_real_distribution<double> distribution(0,1);
289	mt19937_64 generator;
290	generator.seed(conf.Get<uint64_t>("seed"));
291	auto rndm = bind(distribution, generator);
292
293	// -------------------------------------------------------------------------
294
295	if (verbose>0)
296	{
297	cout << "\n-------------------------- Evaluating input ------------------------\n";
298	cout << "Start Time: " << Time::sql << Time(Time::local) << endl;
299	}
300
301
302	// -------------------------------------------------------------------------
303
304	cout << "Reading from '" << file << "'.\n";
305
306	ifstream fin(file.c_str());
307	if (!fin.good())
308	{
309	cerr << file << ": " << strerror(errno) << endl;
310	return 1;
311	}
312
313	TString buf;
314	buf.ReadLine(fin);
315	if (!fin)
316	{
317	cerr << file << ": " << strerror(errno) << endl;
318	return 2;
319	}
320
321	TObjArray *title = buf.Tokenize(" ");
322	if (title->GetEntries()==0)
323	{
324	cerr << "First line empty." << endl;
325	return 3;
326	}
327
328	if (title->At(0)->GetName()[0]=='#')
329	title->RemoveAt(0);
330
331	const auto numcol = title->GetEntries();
332
333	if (verbose>0)
334	cout << "Found " << numcol << " columns." << endl;
335
336	if (noheader)
337	fin.seekg(0);
338
339	// -------------------------------------------------------------------------
340
341	TString path(out.c_str());
342	gSystem->ExpandPathName(path);
343
344	if (!dryrun)
345	{
346	FileStat_t stat;
347	const Int_t exist = !gSystem->GetPathInfo(path, stat);
348	const Bool_t _write = !gSystem->AccessPathName(path, kWritePermission) && R_ISREG(stat.fMode);
349
350	if ((update && !exist) \|\| (update && exist && !_write) \|\| (force && exist && !_write))
351	{
352	cerr << "File '" << path << "' is not writable." << endl;
353	return 4;
354	}
355
356	if (!update && !force && exist)
357	{
358	cerr << "File '" << path << "' already exists." << endl;
359	return 5;
360	}
361	}
362
363	TFile tfile(path, update?"UPDATE":(force?"RECREATE":"CREATE"), file.c_str(), compression);
364	if (tfile.IsZombie())
365	{
366	cerr << "Opening file '" << path << "' failed." << endl;
367	return 6;
368	}
369
370	if (verbose>0)
371	{
372	cout << "Opened root file '" << path << "'.\n";
373	cout << "Writing to tree: " << tree << ".\n";
374	}
375
376	// -------------------- Configure branches of TTree ------------------------
377	vector<TTree*> ttree;
378
379	if (num_split==0)
380	AddTree(ttree, tfile, tree, update, verbose);
381	else
382	{
383	for (size_t i=0; i<num_split; i++)
384	AddTree(ttree, tfile, tree+"["+to_string(i)+"]", update, verbose);
385	}
386
387
388	vector<float> vec(numcol);
389	for (int i=0; i<numcol; i++)
390	{
391	string col = noheader ? Tools::Form("col%d", i) : title->At(i)->GetName();
392
393	boost::regex rexpr(":");
394	col = boost::regex_replace(col, rexpr, "");
395
396	if (verbose>1)
397	cout << "Column: " << col << '\n';
398
399	for (auto it=ttree.begin(); it!=ttree.end(); it++)
400	it[0]->Branch(col.c_str(), vec.data()+i);
401	}
402
403	delete title;
404
405	// -------------------------------------------------------------------------
406
407	size_t line = 0;
408
409	while (1)
410	{
411	buf.ReadLine(fin);
412	if (!fin)
413	break;
414
415	TObjArray *arr = buf.Tokenize(" ");
416	if (arr->GetEntries()!=numcol)
417	{
418	cerr << "Column count mismatch in line " << line+1 << "!" << endl;
419	return 6;
420	}
421
422	for (int i=0; i<numcol; i++)
423	{
424	try
425	{
426	vec[i] = stof(arr->At(i)->GetName());
427	}
428	catch (const exception &e)
429	{
430	cerr << "Conversion of '" << arr->At(i)->GetName() << "' failed!" << endl;
431	return 7;
432	}
433	}
434
435	delete arr;
436
437
438	size_t index = 0;
439	if (!split_lut.empty())
440	index = split_lut[line % split_lut.size()];
441	if (!split_quant.empty())
442	{
443	const float r = rndm();
444	for (; r>=split_quant[index]; index++)
445	if (index==split_quant.size())
446	break;
447	}
448
449	ttree[index]->Fill();
450	line++;
451	}
452
453	if (verbose>0)
454	{
455	cout << line << " data rows read from file." << endl;
456	for (size_t i=0; i<ttree.size(); i++)
457	cout << ttree[i]->GetEntries() << " rows filled into tree #" << i << "." << endl;
458	}
459
460	for (auto it=ttree.begin(); it!=ttree.end(); it++)
461	(*it)->Write("", TObject::kOverwrite);
462	tfile.Close();
463
464	if (verbose>0)
465	{
466	const auto sec = Time().UnixTime()-start.UnixTime();
467
468	cout << Tools::Scientific(tfile.GetSize()) << "B written to disk.\n";
469	cout << "File closed.\n";
470	cout << "Execution time: " << sec << "s ";
471	cout << "(" << Tools::Fractional(sec/line) << "s/row)\n";
472	cout << "--------------------------------------------------------------" << endl;
473	}
474
475	return 0;
476	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: