source: trunk/FACT++/src/csv2root.cc@ 19808

Last change on this file since 19808 was 19802, checked in by tbretz, 5 years ago
Moved the common Splitting algorithm to a common class 'Splitting' (added) and added the possibility to rename columns (csv2root)
File size: 12.5 KB
Line 
1#include <boost/regex.hpp>
2#include <boost/filesystem.hpp>
3#include <boost/algorithm/string/join.hpp>
4
5#include "tools.h"
6#include "Time.h"
7#include "Splitting.h"
8
9#include <TROOT.h>
10#include <TSystem.h>
11#include <TFile.h>
12#include <TTree.h>
13#include <TError.h>
14#include <TObjArray.h>
15
16using namespace std;
17namespace fs = boost::filesystem;
18
19// ------------------------------------------------------------------------
20
21
22// ------------------------------------------------------------------------
23
24void SetupConfiguration(Configuration &conf)
25{
26 po::options_description control("Root to SQL");
27 control.add_options()
28 ("file", var<string>()->required(), "The csv input file")
29 ("out,o", var<string>(""), "Output root file name")
30 ("force,f", po_switch(), "Force overwrite if output file already exists.")
31 ("update,u", po_switch(), "Update an existing file")
32 ("tree,t", var<string>("Events"), "Name of the root tree to convert")
33 ("compression,c", var<uint16_t>(1), "zlib compression level for the root file")
34 ("no-header,n", po_switch(), "Use if the first line contains no header")
35 ("rename.*", var<string>(), "Can be used to rename a column")
36 ("dry-run", po_switch(), "Do not create or manipulate any output file")
37 ("verbose,v", var<uint16_t>(1), "Verbosity (0: quiet, 1: default, 2: more, 3, ...)")
38 ;
39
40 po::positional_options_description p;
41 p.add("file", 1); // All positional options
42 p.add("out", 1); // All positional options
43 p.add("tree", 1); // All positional options
44
45 conf.AddOptions(control);
46 conf.AddOptions(Tools::Splitting::options());
47 conf.SetArgumentPositions(p);
48}
49
50void PrintUsage()
51{
52 cout <<
53 "csv2root - Converts a data table from a csv file to a root tree\n"
54 "\n"
55 "For convenience, this documentation uses the extended version of the options, "
56 "refer to the output below to get the abbreviations.\n"
57 "\n"
58 "As a default, the first row in the file is considered to contain the column "
59 "names separated by a whitespace. Column names must not contain whitespaces "
60 "themselves and special characters (':') are replaces by an underscore. "
61 "If the first line contains the first data row, the --no-header directive "
62 "can be used to instruct the program to consider the first line as the first "
63 "data row and use it only for column count. The branch names in the tree "
64 "are then 'colN' where N is the column index starting from 0.\n"
65 "\n"
66 "Each consecutive row in the file is supposed to contain an identical number "
67 "of floating point values. Leading and trailing whitespaces are ignored. "
68 "Empty lines or lines starting with a '#' are discarded.\n"
69 "\n"
70 "Input and output file are given either as first and second positional argument "
71 "or with the --file and --out command line option. If no output file name is "
72 "provided then the input file is used instead and the extension replaced by .root. "
73 "The target tree name of the root file is given with the --tree command line "
74 "option or the third positional argument. The default tree name is 'Events'.\n"
75 "\n"
76 "As a default, existing files are not overwritten. If overwriting is intended, "
77 "it can be turned on with --force. To update an existing root file, the "
78 "--update option can be used. If a tree with the same name already exists, "
79 "the tree is updated. The compression level for a new root file can be set "
80 "with --compression.\n"
81 "\n"
82 "Columns can be renamed with --rename.new=old\n"
83 "\n"
84 << Tools::Splitting::usage() <<
85 "\n"
86 "In case of success, 0 is returned, a value>0 otherwise.\n"
87 "\n"
88 "Usage: csv2root input.csv [output.root] [-t tree] [-u] [-f] [-n] [-vN] [-cN]\n"
89 "\n"
90 ;
91 cout << endl;
92}
93
94/*
95void ErrorHandlerAll(Int_t level, Bool_t abort, const char *location, const char *msg)
96{
97 if (string(msg).substr(0,24)=="no dictionary for class ")
98 return;
99 if (string(msg).substr(0,15)=="unknown branch ")
100 return;
101
102 DefaultErrorHandler(level, abort, location, msg);
103}*/
104
105
106bool AddTree(vector<TTree*> &ttree, TFile &file, const string &tree, bool update, int verbose)
107{
108 bool found = false;
109
110 TTree *T = 0;
111 if (update)
112 {
113 file.GetObject(tree.c_str(), T);
114 if (T)
115 {
116 ttree.emplace_back(T);
117 found = true;
118 if (verbose>0)
119 cout << "Updating tree: " << tree << endl;
120 }
121 }
122 if (!T)
123 ttree.emplace_back(new TTree(tree.c_str(), "csv2root"));
124
125 return found;
126}
127
128int main(int argc, const char* argv[])
129{
130 Time start;
131
132 gROOT->SetBatch();
133 //SetErrorHandler(ErrorHandlerAll);
134
135 Configuration conf(argv[0]);
136 conf.SetPrintUsage(PrintUsage);
137 SetupConfiguration(conf);
138
139 if (!conf.DoParse(argc, argv))
140 return 127;
141
142 // ----------------------------- Evaluate options --------------------------
143 const string file = conf.Get<string>("file");
144 const string tree = conf.Get<string>("tree");
145
146 const bool force = conf.Get<bool>("force");
147 const bool update = conf.Get<bool>("update");
148// const bool dryrun = conf.Get<bool>("dry-run");
149 const bool noheader = conf.Get<bool>("no-header");
150
151 const uint16_t verbose = conf.Get<uint16_t>("verbose");
152// const int64_t first = conf.Get<int64_t>("first");
153// const int64_t max = conf.Get<int64_t>("max");
154
155 const uint16_t compression = conf.Get<uint16_t>("compression");
156
157 string out = conf.Get<string>("out");
158 if (out.empty())
159 {
160 out = file;
161 const auto p = out.find_last_of('.');
162 if (p!=string::npos)
163 out = string(out.substr(0, p))+".root";
164 }
165
166 // -------------------------------------------------------------------------
167
168 /*const*/ Tools::Splitting split(conf);
169
170 if (verbose>0)
171 {
172 cout << "\n-------------------------- Evaluating input ------------------------\n";
173 cout << "Start Time: " << Time::sql << Time(Time::local) << endl;
174 }
175
176
177 // -------------------------------------------------------------------------
178
179 cout << "Reading from '" << file << "'.\n";
180
181 ifstream fin(file.c_str());
182 if (!fin.good())
183 {
184 cerr << file << ": " << strerror(errno) << endl;
185 return 1;
186 }
187
188 TString buf;
189 buf.ReadLine(fin);
190 if (!fin)
191 {
192 cerr << file << ": " << strerror(errno) << endl;
193 return 2;
194 }
195
196 buf = buf.Strip(TString::kBoth);
197 TObjArray *title = buf.Tokenize(" ");
198 if (title->GetEntries()==0)
199 {
200 cerr << "First line empty." << endl;
201 return 3;
202 }
203
204 if (title->At(0)->GetName()[0]=='#')
205 title->RemoveAt(0);
206
207 const auto numcol = title->GetEntries();
208
209 if (verbose>0)
210 cout << "Found " << numcol << " columns." << endl;
211
212 if (noheader)
213 {
214 fin.seekg(0);
215 if (verbose>0)
216 cout << "No header line interpreted." << endl;
217 }
218
219 // -------------------------------------------------------------------------
220
221 TString path(out.c_str());
222 gSystem->ExpandPathName(path);
223
224// if (!dryrun)
225 {
226 FileStat_t stat;
227 const Int_t exist = !gSystem->GetPathInfo(path, stat);
228 const Bool_t _write = !gSystem->AccessPathName(path, kWritePermission) && R_ISREG(stat.fMode);
229
230 if ((update && !exist) || (update && exist && !_write) || (force && exist && !_write))
231 {
232 cerr << "File '" << path << "' is not writable." << endl;
233 return 4;
234 }
235
236 if (!update && !force && exist)
237 {
238 cerr << "File '" << path << "' already exists." << endl;
239 return 5;
240 }
241 }
242
243 TFile tfile(path, update?"UPDATE":(force?"RECREATE":"CREATE"), file.c_str(), compression);
244 if (tfile.IsZombie())
245 {
246 cerr << "Opening file '" << path << "' failed." << endl;
247 return 6;
248 }
249
250 if (verbose>0)
251 {
252 cout << "Opened root file '" << path << "'.\n";
253 cout << "Writing to tree: " << tree << ".\n";
254 split.print();
255 }
256
257 // -------------------- Configure branches of TTree ------------------------
258 vector<TTree*> ttree;
259
260 size_t entries = 0;
261 if (split.empty())
262 {
263 if (AddTree(ttree, tfile, tree, update, verbose))
264 {
265 entries = ttree[0]->GetEntries();
266 if (verbose>0)
267 cout << "Tree has " << entries << " entries." << endl;
268 }
269 }
270 else
271 {
272 bool found = false;
273 for (size_t i=0; i<split.size(); i++)
274 found |= AddTree(ttree, tfile, tree+"["+to_string(i)+"]", update, verbose);
275
276 if (found && update)
277 {
278 cerr << "Trees can not be updated in split mode, only files!" << endl;
279 return 7;
280 }
281 }
282
283 const auto rename = conf.GetWildcardOptions("rename.*");
284
285 vector<float> vec(numcol);
286 for (int i=0; i<numcol; i++)
287 {
288 string col = noheader ? Tools::Form("col%d", i) : title->At(i)->GetName();
289
290 if (verbose>1)
291 cout << "Column: " << col;
292
293 boost::regex rexpr(":");
294 col = boost::regex_replace(col, rexpr, "");
295
296 if (verbose>1)
297 cout << " -> " << col;
298
299 for (auto it=rename.cbegin(); it!=rename.cend(); it++)
300 {
301 if (col!=it->substr(7))
302 continue;
303
304 col = conf.Get<string>(*it);
305 if (verbose>1)
306 cout << " -> " << col;
307 break;
308 }
309 if (verbose>1)
310 cout << endl;
311
312 for (auto it=ttree.begin(); it!=ttree.end(); it++)
313 it[0]->Branch(col.c_str(), vec.data()+i);
314 }
315
316 delete title;
317
318 // -------------------------------------------------------------------------
319
320 size_t line = 0;
321 size_t valid = 0;
322
323 while (1)
324 {
325 buf.ReadLine(fin);
326 if (!fin)
327 break;
328
329 line++;
330
331 buf = buf.Strip(TString::kBoth);
332 if (buf.IsNull() || buf[0]=='#')
333 continue;
334
335 TObjArray *arr = buf.Tokenize(" ");
336 if (arr->GetEntries()!=numcol)
337 {
338 cerr << "Column count mismatch in line " << line+1 << "!" << endl;
339 return 7;
340 }
341
342 for (int i=0; i<numcol; i++)
343 {
344 try
345 {
346 vec[i] = stof(arr->At(i)->GetName());
347 }
348 catch (const exception &e)
349 {
350 cerr << "Conversion of '" << arr->At(i)->GetName() << "' failed!" << endl;
351 return 8;
352 }
353 }
354
355 delete arr;
356
357 const size_t index = split.index(valid++);
358
359 // Fill only branches for which an adress was set
360 // If we fill the tree, we get empty entries at the
361 // end of the already written branches
362 TIter NextBranch(ttree[index]->GetListOfBranches());
363 TBranch *b=0;
364 while ((b=static_cast<TBranch*>(NextBranch())))
365 if (b->GetAddress())
366 b->Fill();
367 }
368
369 for (auto it=ttree.begin(); it!=ttree.end(); it++)
370 {
371 if (update)
372 {
373 TIter NextBranch((*it)->GetListOfBranches());
374 TBranch *b=0;
375 while ((b=static_cast<TBranch*>(NextBranch())))
376 if (b->GetAddress() && b->GetEntries()>0)
377 {
378 (*it)->SetEntries(b->GetEntries());
379 break;
380 }
381 }
382
383 (*it)->Write("", TObject::kOverwrite);
384 }
385
386 if (verbose>0)
387 {
388 cout << valid << " data rows found in " << line << " lines (excl. title)." << endl;
389 for (size_t i=0; i<ttree.size(); i++)
390 cout << ttree[i]->GetEntries() << " rows filled into tree #" << i << "." << endl;
391 }
392
393 if (entries && entries!=line)
394 cerr << "\nWARNING - Number of updated entries does not match number of entries in tree!\n" << endl;
395
396 tfile.Close();
397
398 if (verbose>0)
399 {
400 const auto sec = Time().UnixTime()-start.UnixTime();
401
402 cout << Tools::Scientific(tfile.GetSize()) << "B written to disk.\n";
403 cout << "File closed.\n";
404 cout << "Execution time: " << sec << "s ";
405 cout << "(" << Tools::Fractional(sec/line) << "s/row)\n";
406 cout << "--------------------------------------------------------------" << endl;
407 }
408
409 return 0;
410}
Note: See TracBrowser for help on using the repository browser.