source: trunk/FACT++/src/csv2root.cc@ 19796

Last change on this file since 19796 was 19796, checked in by tbretz, 5 years ago
Updated help text and improved the handling of the update option (cannot be combined with the split rules if the tree already exists); made tree the third positional argument.
File size: 13.8 KB
Line 
1#include <random>
2
3#include <boost/regex.hpp>
4#include <boost/filesystem.hpp>
5#include <boost/algorithm/string/join.hpp>
6
7#include "tools.h"
8#include "Time.h"
9#include "Configuration.h"
10
11#include <TROOT.h>
12#include <TSystem.h>
13#include <TFile.h>
14#include <TTree.h>
15#include <TError.h>
16#include <TObjArray.h>
17
18using namespace std;
19namespace fs = boost::filesystem;
20
21// ------------------------------------------------------------------------
22
23void SetupConfiguration(Configuration &conf)
24{
25 po::options_description control("Root to SQL");
26 control.add_options()
27 ("file", var<string>()->required(), "The csv input file")
28 ("out,o", var<string>(""), "Output root file name")
29 ("force,f", po_switch(), "Force overwrite if output file already exists.")
30 ("update,u", po_switch(), "Update an existing file")
31 ("tree,t", var<string>("Events"), "Name of the root tree to convert")
32 ("compression,c", var<uint16_t>(1), "zlib compression level for the root file")
33 ("no-header,n", po_switch(), "Use if the first line contains no header")
34 ("dry-run", po_switch(), "Do not create or manipulate any output file")
35 ;
36
37 po::options_description split("Splitting options");
38 split.add_options()
39 ("split-sequence,S", vars<uint16_t>(), "Split data sequentially into several trees/files (e.g. 1, 1, 2)")
40 ("split-quantile,Q", vars<double>(), "Split data randomly into several trees/files (e.g. 0.5, 1)")
41 ("seed", var<uint64_t>(mt19937_64::default_seed), "Seed value in case of random split")
42 ;
43
44 po::options_description debug("Debug options");
45 debug.add_options()
46 ("verbose,v", var<uint16_t>(1), "Verbosity (0: quiet, 1: default, 2: more, 3, ...)")
47 ;
48
49 po::positional_options_description p;
50 p.add("file", 1); // All positional options
51 p.add("out", 1); // All positional options
52 p.add("tree", 1); // All positional options
53
54 conf.AddOptions(control);
55 conf.AddOptions(split);
56 conf.AddOptions(debug);
57 conf.SetArgumentPositions(p);
58}
59
// Prints the long description and the usage line of csv2root to stdout.
// It is registered with the Configuration object in main() via
// SetPrintUsage().
void PrintUsage()
{
    std::cout <<
        "csv2root - Converts a data table from a csv file to a root tree\n"
        "\n"
        "For convenience, this documentation uses the extended version of the options, "
        "refer to the output below to get the abbreviations.\n"
        "\n"
        "As a default, the first row in the file is considered to contain the column "
        "names separated by a whitespace. Column names must not contain whitespaces "
        "themselves and special characters (':') are replaced by an underscore. "
        "If the first line contains the first data row, the --no-header directive "
        "can be used to instruct the program to consider the first line as the first "
        "data row and use it only for column count. The branch names in the tree "
        "are then 'colN' where N is the column index starting from 0.\n"
        "\n"
        "Each consecutive row in the file is supposed to contain an identical number "
        "of floating point values. Leading and trailing whitespaces are ignored. "
        "Empty lines or lines starting with a '#' are discarded.\n"
        "\n"
        "Input and output file are given either as first and second positional argument "
        "or with the --file and --out command line option. If no output file name is "
        "provided then the input file is used instead and the extension replaced by .root. "
        "The target tree name of the root file is given with the --tree command line "
        "option or the third positional argument. The default tree name is 'Events'.\n"
        "\n"
        "As a default, existing files are not overwritten. If overwriting is intended, "
        "it can be turned on with --force. To update an existing root file, the "
        "--update option can be used. If a tree with the same name already exists, "
        "the tree is updated. The compression level for a new root file can be set "
        "with --compression.\n"
        "\n"
        "For several purposes, it might be convenient to split the output to several "
        "different root-trees. This can be done using the --split-sequence (-S) "
        "and the --split-quantile (-Q) options. If a split sequence is defined as "
        "-S 1 -S 2 -S 1 the events are split by 1:2:1 in this sequence order. If "
        "quantiles are given as -Q 0.5 -Q 0.6, the first tree will contain 50% of "
        "the events, the second one 10% and the third one 40%. The corresponding "
        "seed value can be set with --seed.\n"
        "\n"
        "In case of success, 0 is returned, a value>0 otherwise.\n"
        "\n"
        "Usage: csv2root input.csv [output.root] [-t tree] [-u] [-f] [-n] [-vN] [-cN]\n"
        "\n"
        ;
    std::cout << std::endl;
}
107
108/*
109void ErrorHandlerAll(Int_t level, Bool_t abort, const char *location, const char *msg)
110{
111 if (string(msg).substr(0,24)=="no dictionary for class ")
112 return;
113 if (string(msg).substr(0,15)=="unknown branch ")
114 return;
115
116 DefaultErrorHandler(level, abort, location, msg);
117}*/
118
119
120bool AddTree(vector<TTree*> &ttree, TFile &file, const string &tree, bool update, int verbose)
121{
122 bool found = false;
123
124 TTree *T = 0;
125 if (update)
126 {
127 file.GetObject(tree.c_str(), T);
128 if (T)
129 {
130 ttree.emplace_back(T);
131 found = true;
132 if (verbose>0)
133 cout << "Updating tree: " << tree << endl;
134 }
135 }
136 if (!T)
137 ttree.emplace_back(new TTree(tree.c_str(), "csv2root"));
138
139 return found;
140}
141
142int main(int argc, const char* argv[])
143{
144 Time start;
145
146 gROOT->SetBatch();
147 //SetErrorHandler(ErrorHandlerAll);
148
149 Configuration conf(argv[0]);
150 conf.SetPrintUsage(PrintUsage);
151 SetupConfiguration(conf);
152
153 if (!conf.DoParse(argc, argv))
154 return 127;
155
156 // ----------------------------- Evaluate options --------------------------
157 const string file = conf.Get<string>("file");
158 const string tree = conf.Get<string>("tree");
159
160 const bool force = conf.Get<bool>("force");
161 const bool update = conf.Get<bool>("update");
162// const bool dryrun = conf.Get<bool>("dry-run");
163 const bool noheader = conf.Get<bool>("no-header");
164
165 const uint16_t verbose = conf.Get<uint16_t>("verbose");
166// const int64_t first = conf.Get<int64_t>("first");
167// const int64_t max = conf.Get<int64_t>("max");
168
169 const uint16_t compression = conf.Get<uint16_t>("compression");
170
171 string out = conf.Get<string>("out");
172 if (out.empty())
173 {
174 out = file;
175 const auto p = out.find_last_of('.');
176 if (p!=string::npos)
177 out = string(out.substr(0, p))+".root";
178 }
179
180 // ----------------------------- Setup splitting ---------------------------
181
182 vector<uint16_t> split_seq = conf.Vec<uint16_t>("split-sequence");
183 vector<double> split_quant = conf.Vec<double>("split-quantile");
184
185 if (!split_seq.empty() && !split_quant.empty())
186 throw runtime_error("Only splitting by --split-sequence or --split-quantile is allowed.");
187
188 const size_t num_split = split_seq.size()+split_quant.size()==0 ? 0 :
189 ::max(split_seq.size(), split_quant.size()+1);
190
191 map<size_t, size_t> split_lut;
192 for (size_t i=0; i<split_seq.size(); i++)
193 {
194 const size_t sz = split_lut.size();
195 for (size_t j=0; j<split_seq[i]; j++)
196 split_lut.emplace(j+sz, i);
197 }
198
199 for (size_t i=0; i<split_quant.size(); i++)
200 if (split_quant[i]<0 || split_quant[i]>=1)
201 throw runtime_error("Splitting quantiles must be in the range [0;1)");
202
203 for (size_t i=1; i<split_quant.size(); i++)
204 {
205 if (split_quant[i]<=split_quant[i-1])
206 throw runtime_error("Splitting quantiles must be in increasing order.");
207 }
208
209 // -------------------------------------------------------------------------
210
211 const uniform_real_distribution<double> distribution(0,1);
212 mt19937_64 generator;
213 generator.seed(conf.Get<uint64_t>("seed"));
214 auto rndm = bind(distribution, generator);
215
216 // -------------------------------------------------------------------------
217
218 if (verbose>0)
219 {
220 cout << "\n-------------------------- Evaluating input ------------------------\n";
221 cout << "Start Time: " << Time::sql << Time(Time::local) << endl;
222 }
223
224
225 // -------------------------------------------------------------------------
226
227 cout << "Reading from '" << file << "'.\n";
228
229 ifstream fin(file.c_str());
230 if (!fin.good())
231 {
232 cerr << file << ": " << strerror(errno) << endl;
233 return 1;
234 }
235
236 TString buf;
237 buf.ReadLine(fin);
238 if (!fin)
239 {
240 cerr << file << ": " << strerror(errno) << endl;
241 return 2;
242 }
243
244 buf = buf.Strip(TString::kBoth);
245 TObjArray *title = buf.Tokenize(" ");
246 if (title->GetEntries()==0)
247 {
248 cerr << "First line empty." << endl;
249 return 3;
250 }
251
252 if (title->At(0)->GetName()[0]=='#')
253 title->RemoveAt(0);
254
255 const auto numcol = title->GetEntries();
256
257 if (verbose>0)
258 cout << "Found " << numcol << " columns." << endl;
259
260 if (noheader)
261 {
262 fin.seekg(0);
263 if (verbose>0)
264 cout << "No header line interpreted." << endl;
265 }
266
267 // -------------------------------------------------------------------------
268
269 TString path(out.c_str());
270 gSystem->ExpandPathName(path);
271
272// if (!dryrun)
273 {
274 FileStat_t stat;
275 const Int_t exist = !gSystem->GetPathInfo(path, stat);
276 const Bool_t _write = !gSystem->AccessPathName(path, kWritePermission) && R_ISREG(stat.fMode);
277
278 if ((update && !exist) || (update && exist && !_write) || (force && exist && !_write))
279 {
280 cerr << "File '" << path << "' is not writable." << endl;
281 return 4;
282 }
283
284 if (!update && !force && exist)
285 {
286 cerr << "File '" << path << "' already exists." << endl;
287 return 5;
288 }
289 }
290
291 TFile tfile(path, update?"UPDATE":(force?"RECREATE":"CREATE"), file.c_str(), compression);
292 if (tfile.IsZombie())
293 {
294 cerr << "Opening file '" << path << "' failed." << endl;
295 return 6;
296 }
297
298 if (verbose>0)
299 {
300 cout << "Opened root file '" << path << "'.\n";
301 cout << "Writing to tree: " << tree << ".\n";
302 }
303
304 // -------------------- Configure branches of TTree ------------------------
305 vector<TTree*> ttree;
306
307 size_t entries = 0;
308 if (num_split==0)
309 {
310 if (AddTree(ttree, tfile, tree, update, verbose))
311 {
312 entries = ttree[0]->GetEntries();
313 if (verbose>0)
314 cout << "Tree has " << entries << " entries." << endl;
315 }
316 }
317 else
318 {
319 bool found = false;
320 for (size_t i=0; i<num_split; i++)
321 found |= AddTree(ttree, tfile, tree+"["+to_string(i)+"]", update, verbose);
322
323 if (found && update)
324 {
325 cerr << "Trees can not be updated in split mode, only files!" << endl;
326 return 7;
327 }
328 }
329
330 vector<float> vec(numcol);
331 for (int i=0; i<numcol; i++)
332 {
333 string col = noheader ? Tools::Form("col%d", i) : title->At(i)->GetName();
334
335 boost::regex rexpr(":");
336 col = boost::regex_replace(col, rexpr, "");
337
338 if (verbose>1)
339 cout << "Column: " << col << '\n';
340
341 for (auto it=ttree.begin(); it!=ttree.end(); it++)
342 it[0]->Branch(col.c_str(), vec.data()+i);
343 }
344
345 delete title;
346
347 // -------------------------------------------------------------------------
348
349 size_t line = 0;
350 size_t valid = 0;
351
352 while (1)
353 {
354 buf.ReadLine(fin);
355 if (!fin)
356 break;
357
358 line++;
359
360 buf = buf.Strip(TString::kBoth);
361 if (buf.IsNull() || buf[0]=='#')
362 continue;
363
364 valid++;
365
366 TObjArray *arr = buf.Tokenize(" ");
367 if (arr->GetEntries()!=numcol)
368 {
369 cerr << "Column count mismatch in line " << line+1 << "!" << endl;
370 return 7;
371 }
372
373 for (int i=0; i<numcol; i++)
374 {
375 try
376 {
377 vec[i] = stof(arr->At(i)->GetName());
378 }
379 catch (const exception &e)
380 {
381 cerr << "Conversion of '" << arr->At(i)->GetName() << "' failed!" << endl;
382 return 8;
383 }
384 }
385
386 delete arr;
387
388
389 size_t index = 0;
390 if (!split_lut.empty())
391 index = split_lut[line % split_lut.size()];
392 if (!split_quant.empty())
393 {
394 const float r = rndm();
395 for (; r>=split_quant[index]; index++)
396 if (index==split_quant.size())
397 break;
398 }
399
400 ttree[index]->Fill();
401 }
402
403 if (verbose>0)
404 {
405 cout << valid << " data rows found in " << line << " lines (excl. title)." << endl;
406 if (!update || !entries)
407 {
408 for (size_t i=0; i<ttree.size(); i++)
409 cout << ttree[i]->GetEntries() << " rows filled into tree #" << i << "." << endl;
410 }
411 }
412
413 if (entries && entries!=line)
414 cerr << "\nWARNING - Number of updated entries does not match number of entries in tree!\n" << endl;
415
416 for (auto it=ttree.begin(); it!=ttree.end(); it++)
417 (*it)->Write("", TObject::kOverwrite);
418 tfile.Close();
419
420 if (verbose>0)
421 {
422 const auto sec = Time().UnixTime()-start.UnixTime();
423
424 cout << Tools::Scientific(tfile.GetSize()) << "B written to disk.\n";
425 cout << "File closed.\n";
426 cout << "Execution time: " << sec << "s ";
427 cout << "(" << Tools::Fractional(sec/line) << "s/row)\n";
428 cout << "--------------------------------------------------------------" << endl;
429 }
430
431 return 0;
432}
Note: See TracBrowser for help on using the repository browser.