source: trunk/FACT++/src/csv2root.cc@ 20016

Last change on this file since 20016 was 19881, checked in by tbretz, 5 years ago
Fixed compiler warning...
File size: 14.2 KB
Line 
1#include <boost/regex.hpp>
2#include <boost/filesystem.hpp>
3#include <boost/algorithm/string/join.hpp>
4
5#include "tools.h"
6#include "Time.h"
7#include "Splitting.h"
8
9#include <TROOT.h>
10#include <TSystem.h>
11#include <TFile.h>
12#include <TTree.h>
13#include <TError.h>
14#include <TObjArray.h>
15
16using namespace std;
17namespace fs = boost::filesystem;
18
19// ------------------------------------------------------------------------
20
21
22// ------------------------------------------------------------------------
23
24void SetupConfiguration(Configuration &conf)
25{
26 po::options_description control("Root to SQL");
27 control.add_options()
28 ("file", var<string>("-"), "The csv input file. The default ('-') is reading from stdin.")
29 ("out,o", var<string>(""), "Output root file name")
30 ("force,f", po_switch(), "Force overwrite if output file already exists.")
31 ("update,u", po_switch(), "Update an existing file")
32 ("tree,t", var<string>("Events"), "Name of the root tree to convert")
33 ("compression,c", var<uint16_t>(1), "zlib compression level for the root file")
34 ("no-header,n", po_switch(), "Use if the first line contains no header")
35 ("rename.*", var<string>(), "Can be used to rename a column")
36 ("delimiter", var<string>(";:, \t"), "List of possible delimiters")
37 ("null", po_switch(), "Enable detection of NULL and replace it with 0")
38 ("empty", po_switch(), "Enable detection of empty fields (two immediately consecutive delimiters) and replace them with 0")
39 ("dry-run", po_switch(), "Do not create or manipulate any output file")
40 ("verbose,v", var<uint16_t>(1), "Verbosity (0: quiet, 1: default, 2: more, 3, ...)")
41 ;
42
43 po::positional_options_description p;
44 p.add("file", 1); // All positional options
45 p.add("out", 1); // All positional options
46 p.add("tree", 1); // All positional options
47
48 conf.AddOptions(control);
49 conf.AddOptions(Tools::Splitting::options());
50 conf.SetArgumentPositions(p);
51}
52
53void PrintUsage()
54{
55 cout <<
56 "csv2root - Converts a data table from a csv file to a root tree\n"
57 "\n"
58 "For convenience, this documentation uses the extended version of the options, "
59 "refer to the output below to get the abbreviations.\n"
60 "\n"
61 "As a default, the first row in the file is considered to contain the column "
62 "names separated by a whitespace. Column names must not contain whitespaces "
63 "themselves and special characters (':') are replaces by an underscore. "
64 "If the first line contains the first data row, the --no-header directive "
65 "can be used to instruct the program to consider the first line as the first "
66 "data row and use it only for column count. The branch names in the tree "
67 "are then 'colN' where N is the column index starting from 0.\n"
68 "\n"
69 "Each consecutive row in the file is supposed to contain an identical number "
70 "of floating point values. Leading and trailing whitespaces are ignored. "
71 "Empty lines or lines starting with a '#' are discarded.\n"
72 "\n"
73 "Input and output file are given either as first and second positional argument "
74 "or with the --file and --out command line option. If no output file name is "
75 "provided then the input file is used instead and the extension replaced by .root. "
76 "The target tree name of the root file is given with the --tree command line "
77 "option or the third positional argument. The default tree name is 'Events'.\n"
78 "\n"
79 "As a default, existing files are not overwritten. If overwriting is intended, "
80 "it can be turned on with --force. To update an existing root file, the "
81 "--update option can be used. If a tree with the same name already exists, "
82 "the tree is updated. The compression level for a new root file can be set "
83 "with --compression.\n"
84 "\n"
85 "Columns can be renamed with --rename.new=old\n"
86 "\n"
87 << Tools::Splitting::usage() <<
88 "\n"
89 "In case of success, 0 is returned, a value>0 otherwise.\n"
90 "\n"
91 "Usage: csv2root input.csv [output.root] [-t tree] [-u] [-f] [-n] [-vN] [-cN]\n"
92 "\n"
93 ;
94 cout << endl;
95}
96
97/*
98void ErrorHandlerAll(Int_t level, Bool_t abort, const char *location, const char *msg)
99{
100 if (string(msg).substr(0,24)=="no dictionary for class ")
101 return;
102 if (string(msg).substr(0,15)=="unknown branch ")
103 return;
104
105 DefaultErrorHandler(level, abort, location, msg);
106}*/
107
108
109bool AddTree(vector<TTree*> &ttree, TFile &file, const string &tree, bool update, int verbose)
110{
111 bool found = false;
112
113 TTree *T = 0;
114 if (update)
115 {
116 file.GetObject(tree.c_str(), T);
117 if (T)
118 {
119 ttree.emplace_back(T);
120 found = true;
121 if (verbose>0)
122 cout << "Updating tree: " << tree << endl;
123 }
124 }
125 if (!T)
126 ttree.emplace_back(new TTree(tree.c_str(), "csv2root"));
127
128 return found;
129}
130
131int main(int argc, const char* argv[])
132{
133 Time start;
134
135 gROOT->SetBatch();
136 //SetErrorHandler(ErrorHandlerAll);
137
138 Configuration conf(argv[0]);
139 conf.SetPrintUsage(PrintUsage);
140 SetupConfiguration(conf);
141
142 if (!conf.DoParse(argc, argv))
143 return 127;
144
145 // ----------------------------- Evaluate options --------------------------
146 const string file = conf.Get<string>("file");
147 const string tree = conf.Get<string>("tree");
148
149 const bool force = conf.Get<bool>("force");
150 const bool update = conf.Get<bool>("update");
151 const bool detectnull = conf.Get<bool>("null");
152 const bool detectempty = conf.Get<bool>("empty");
153// const bool dryrun = conf.Get<bool>("dry-run");
154 const bool noheader = conf.Get<bool>("no-header");
155 const string delimiter = conf.Get<string>("delimiter");
156
157 const uint16_t verbose = conf.Get<uint16_t>("verbose");
158// const int64_t first = conf.Get<int64_t>("first");
159// const int64_t max = conf.Get<int64_t>("max");
160
161 const uint16_t compression = conf.Get<uint16_t>("compression");
162
163 string out = conf.Get<string>("out");
164 if (out.empty())
165 {
166 out = file.empty() || file=="-" ? "csv2root" : file;
167 const auto p = out.find_last_of('.');
168 if (p!=string::npos)
169 out.erase(p);
170 out += ".root";
171 }
172
173 // -------------------------------------------------------------------------
174
175 /*const*/ Tools::Splitting split(conf);
176
177 if (verbose>0)
178 {
179 cout << "\n-------------------------- Evaluating input ------------------------\n";
180 cout << "Start Time: " << Time::sql << Time(Time::local) << endl;
181 }
182
183
184 // -------------------------------------------------------------------------
185
186 cout << "Reading from '" << file << "'.\n";
187
188 ifstream ifs(file.c_str());
189
190 istream &fin = file=="-" ? cin : ifs;
191 if (!fin.good())
192 {
193 cerr << file << ": " << strerror(errno) << endl;
194 return 1;
195 }
196
197 TString buf;
198 buf.ReadLine(fin);
199 if (!fin)
200 {
201 cerr << file << ": " << strerror(errno) << endl;
202 return 2;
203 }
204
205 buf = buf.Strip(TString::kBoth);
206 TObjArray *title = buf.Tokenize(delimiter);
207 if (title->GetEntries()==0)
208 {
209 cerr << "First line empty." << endl;
210 return 3;
211 }
212
213 if (title->At(0)->GetName()[0]=='#')
214 {
215 title->RemoveAt(0);
216 title->Compress();
217 }
218
219 const auto numcol = title->GetEntries();
220
221 if (verbose>0)
222 cout << "Found " << numcol << " columns." << endl;
223
224 if (noheader)
225 {
226 fin.seekg(0);
227 if (verbose>0)
228 cout << "No header line interpreted." << endl;
229 }
230
231 // -------------------------------------------------------------------------
232
233 TString path(out.c_str());
234 gSystem->ExpandPathName(path);
235
236// if (!dryrun)
237 {
238 FileStat_t stat;
239 const Int_t exist = !gSystem->GetPathInfo(path, stat);
240 const Bool_t _write = !gSystem->AccessPathName(path, kWritePermission) && R_ISREG(stat.fMode);
241
242 if ((update && !exist) || (update && exist && !_write) || (force && exist && !_write))
243 {
244 cerr << "File '" << path << "' is not writable." << endl;
245 return 4;
246 }
247
248 if (!update && !force && exist)
249 {
250 cerr << "File '" << path << "' already exists." << endl;
251 return 5;
252 }
253 }
254
255 TFile tfile(path, update?"UPDATE":(force?"RECREATE":"CREATE"), file.c_str(), compression);
256 if (tfile.IsZombie())
257 {
258 cerr << "Opening file '" << path << "' failed." << endl;
259 return 6;
260 }
261
262 if (verbose>0)
263 {
264 cout << "Opened root file '" << path << "'.\n";
265 cout << "Writing to tree: " << tree << ".\n";
266 split.print();
267 }
268
269 // -------------------- Configure branches of TTree ------------------------
270 vector<TTree*> ttree;
271
272 size_t entries = 0;
273 if (split.empty())
274 {
275 if (AddTree(ttree, tfile, tree, update, verbose))
276 {
277 entries = ttree[0]->GetEntries();
278 if (verbose>0)
279 cout << "Tree has " << entries << " entries." << endl;
280 }
281 }
282 else
283 {
284 bool found = false;
285 for (size_t i=0; i<split.size(); i++)
286 found |= AddTree(ttree, tfile, tree+"["+to_string(i)+"]", update, verbose);
287
288 if (found && update)
289 {
290 cerr << "Trees can not be updated in split mode, only files!" << endl;
291 return 7;
292 }
293 }
294 const auto rename = conf.GetWildcardOptions("rename.*");
295
296 vector<float> vec(numcol);
297 for (int i=0; i<numcol; i++)
298 {
299 string col = noheader ? Tools::Form("col%d", i) : title->At(i)->GetName();
300
301 if (verbose>1)
302 cout << "Column: " << col;
303
304 boost::regex rexpr(":");
305 col = boost::regex_replace(col, rexpr, "");
306
307 if (col[0]=='.')
308 col.erase(0, 1);
309
310 if (verbose>1)
311 cout << " -> " << col;
312
313 for (auto it=rename.cbegin(); it!=rename.cend(); it++)
314 {
315 if (col!=it->substr(7))
316 continue;
317
318 col = conf.Get<string>(*it);
319 if (verbose>1)
320 cout << " -> " << col;
321 break;
322 }
323 if (verbose>1)
324 cout << endl;
325
326 for (auto it=ttree.begin(); it!=ttree.end(); it++)
327 it[0]->Branch(col.c_str(), vec.data()+i);
328 }
329
330 delete title;
331
332 // -------------------------------------------------------------------------
333
334 size_t line = 0;
335 size_t valid = 0;
336
337 while (1)
338 {
339 buf.ReadLine(fin);
340 if (!fin)
341 break;
342
343 line++;
344
345 buf = buf.Strip(TString::kBoth);
346 if (buf.IsNull() || buf[0]=='#')
347 continue;
348
349 if (detectempty)
350 {
351 string delim = delimiter;
352
353
354 for (size_t i=0; i<delimiter.size(); i++)
355 if (delimiter[i]!=' ')
356 {
357 boost::regex rexpr1("(["+delimiter+"])(["+delimiter+"])");
358 buf = boost::regex_replace(string(buf.Data()), rexpr1, "\\10\\2");
359
360 boost::regex rexpr2("^(["+delimiter+"])");
361 buf = boost::regex_replace(string(buf.Data()), rexpr2, "0\\1");
362
363 boost::regex rexpr3("(["+delimiter+"])$");
364 buf = boost::regex_replace(string(buf.Data()), rexpr3, "\\10");
365 }
366 }
367
368 TObjArray *arr = buf.Tokenize(delimiter);
369 if (arr->GetEntries()!=numcol)
370 {
371 cerr << buf << endl;
372 cerr << "Column count [" << arr->GetEntries() << "] mismatch in line " << line+1 << "!" << endl;
373 return 7;
374 }
375
376 for (int i=0; i<numcol; i++)
377 {
378 try
379 {
380 if (detectnull && arr->At(i)->GetName()==string("NULL"))
381 {
382 vec[i] = 0;
383 continue;
384 }
385
386 vec[i] = stof(arr->At(i)->GetName());
387 }
388 catch (const exception &e)
389 {
390 cerr << buf << endl;
391 cerr << "Conversion of field " << i << " '" << arr->At(i)->GetName() << "' in line " << line+1 << " failed!" << endl;
392 return 8;
393 }
394 }
395
396 delete arr;
397
398 const size_t index = split.index(valid++);
399
400 // Fill only branches for which an adress was set
401 // If we fill the tree, we get empty entries at the
402 // end of the already written branches
403 TIter NextBranch(ttree[index]->GetListOfBranches());
404 TBranch *b=0;
405 while ((b=static_cast<TBranch*>(NextBranch())))
406 if (b->GetAddress())
407 b->Fill();
408 }
409
410 for (auto it=ttree.begin(); it!=ttree.end(); it++)
411 {
412 TIter NextBranch((*it)->GetListOfBranches());
413 TBranch *b=0;
414 while ((b=static_cast<TBranch*>(NextBranch())))
415 if (b->GetAddress() && b->GetEntries()>0)
416 {
417 (*it)->SetEntries(b->GetEntries());
418 break;
419 }
420
421 (*it)->Write("", TObject::kOverwrite);
422 }
423
424 if (verbose>0)
425 {
426 cout << valid << " data rows found in " << line << " lines (excl. title)." << endl;
427 for (size_t i=0; i<ttree.size(); i++)
428 cout << ttree[i]->GetEntries() << " rows filled into tree #" << i << "." << endl;
429 }
430
431 if (entries && entries!=line)
432 cerr << "\nWARNING - Number of updated entries does not match number of entries in tree!\n" << endl;
433
434 tfile.Close();
435
436 if (verbose>0)
437 {
438 const auto sec = Time().UnixTime()-start.UnixTime();
439
440 cout << Tools::Scientific(tfile.GetSize()) << "B written to disk.\n";
441 cout << "File closed.\n";
442 cout << "Execution time: " << sec << "s ";
443 cout << "(" << Tools::Fractional(sec/line) << "s/row)\n";
444 cout << "--------------------------------------------------------------" << endl;
445 }
446
447 return 0;
448}
Note: See TracBrowser for help on using the repository browser.