// ========================================================================
// csv2root - command line tool that converts a data table from a csv file
// into a root tree (see the text printed by PrintUsage()).
//
// NOTE(review): this copy of the file has lost its line structure, and text
// that stood between angle brackets appears to be missing: the #include
// targets, template arguments (e.g. in "vector &ttree", "var()",
// "conf.Get(...)") and the interior of several for-loops in main(). The
// notes below describe only what is still visible. Restore the file from
// version control before changing any code.
//
// SetupConfiguration(conf)
//   Registers three option groups on conf (control options captioned
//   "Root to SQL", "Splitting options", "Debug options") and declares
//   "file", "out" and "tree" as positional arguments, in that order.
//   NOTE(review): the caption "Root to SQL" does not match the csv-to-root
//   purpose stated in PrintUsage() - presumably copied from a sibling tool.
//
// PrintUsage()
//   Writes the program description and the usage line to cout.
//
// AddTree(ttree, file, tree, update, verbose)
//   Appends exactly one TTree pointer to ttree. If update is set, the object
//   named `tree` is requested from `file` via GetObject(); when that yields
//   a non-null pointer it is the one appended, "Updating tree: ..." is
//   printed for verbose>0 and true is returned. In every other case a
//   new TTree(tree, "csv2root") is appended and false is returned.
//
// main(argc, argv)
//   Visible flow: parse the options; derive the output name (if --out is
//   empty, the input name with everything from the last '.' replaced by
//   ".root"); reject a combination of --split-sequence and --split-quantile;
//   seed a mt19937_64 generator from --seed; read and tokenize (on " ") the
//   first input line to obtain the column count; seek back to the start if
//   --no-header is set; check existence/writability of the output path and
//   open it as UPDATE, RECREATE or CREATE; create the tree(s) and one branch
//   per column; then for each further line strip it, skip it if empty or
//   starting with '#', tokenize it, compare the token count with the column
//   count, and Fill() the selected tree. Finally all trees are written with
//   TObject::kOverwrite, the file is closed and statistics are printed for
//   verbose>0.
//
//   Visible return codes:
//     0   success
//     1   input file stream not good after opening
//     2   reading the first line failed
//     3   first line contains no tokens
//     4   output file not writable (update/force on a path that is missing
//         or fails the write-permission / regular-file check)
//     5   output file exists and neither update nor force is set
//     6   TFile is a zombie after opening
//     7   token count of a data line differs from the column count
//     8   conversion of a token failed (reported from the catch block)
//     127 conf.DoParse() returned false
//
// NOTE(review) - issues visible in the code below, to be confirmed and fixed
// once the undamaged source is available:
//   * Quantile split: "for (; r>=split_quant[index]; index++)" evaluates
//     split_quant[index] in the loop condition before the body compares
//     index with split_quant.size(). After the increment to
//     index==split_quant.size() the condition reads one element past the
//     end of the vector.
//   * Header line: a first token starting with '#' is dropped with
//     title->RemoveAt(0) and numcol is taken from GetEntries() afterwards.
//     Presumably RemoveAt() leaves slot 0 empty; verify whether the array
//     has to be compressed before entries are accessed by index.
//   * Column names: ':' is replaced by "" in regex_replace(), whereas the
//     usage text states that it is replaced by an underscore.
//   * "dry-run" is registered as an option, but the line reading it and the
//     "if (!dryrun)" guard are commented out; no visible code evaluates it.
//   * Output name: if the input name contains no '.', `out` stays equal to
//     the input file name.
//   * runtime_error is thrown while validating the split options; no
//     enclosing try/catch is visible in main().
//   * tfile.GetSize() is called after tfile.Close(), and sec/line is
//     computed even if no line was read (line==0); confirm both are
//     intended.
// ========================================================================
#include #include #include #include #include "tools.h" #include "Time.h" #include "Configuration.h" #include #include #include #include #include #include using namespace std; namespace fs = boost::filesystem; // ------------------------------------------------------------------------ void SetupConfiguration(Configuration &conf) { po::options_description control("Root to SQL"); control.add_options() ("file", var()->required(), "The csv input file") ("out,o", var(""), "Output root file name") ("force,f", po_switch(), "Force overwrite if output file already exists.") ("update,u", po_switch(), "Update an existing file") ("tree,t", var("Events"), "Name of the root tree to convert") ("compression,c", var(1), "zlib compression level for the root file") ("no-header,n", po_switch(), "Use if the first line contains no header") ("dry-run", po_switch(), "Do not create or manipulate any output file") ; po::options_description split("Splitting options"); split.add_options() ("split-sequence,S", vars(), "Split data sequentially into several trees/files (e.g. 1, 1, 2)") ("split-quantile,Q", vars(), "Split data randomly into several trees/files (e.g. 
0.5, 1)") ("seed", var(mt19937_64::default_seed), "Seed value in case of random split") ; po::options_description debug("Debug options"); debug.add_options() ("verbose,v", var(1), "Verbosity (0: quiet, 1: default, 2: more, 3, ...)") ; po::positional_options_description p; p.add("file", 1); // All positional options p.add("out", 1); // All positional options p.add("tree", 1); // All positional options conf.AddOptions(control); conf.AddOptions(split); conf.AddOptions(debug); conf.SetArgumentPositions(p); } void PrintUsage() { cout << "csv2root - Converts a data table from a csv file to a root tree\n" "\n" "For convenience, this documentation uses the extended version of the options, " "refer to the output below to get the abbreviations.\n" "\n" "As a default, the first row in the file is considered to contain the column " "names separated by a whitespace. Column names must not contain whitespaces " "themselves and special characters (':') are replaces by an underscore. " "If the first line contains the first data row, the --no-header directive " "can be used to instruct the program to consider the first line as the first " "data row and use it only for column count. The branch names in the tree " "are then 'colN' where N is the column index starting from 0.\n" "\n" "Each consecutive row in the file is supposed to contain an identical number " "of floating point values. Leading and trailing whitespaces are ignored. " "Empty lines or lines starting with a '#' are discarded.\n" "\n" "Input and output file are given either as first and second positional argument " "or with the --file and --out command line option. If no output file name is " "provided then the input file is used instead and the extension replaced by .root. " "The target tree name of the root file is given with the --tree command line " "option or the third positional argument. The default tree name is 'Events'.\n" "\n" "As a default, existing files are not overwritten. 
If overwriting is intended, " "it can be turned on with --force. To update an existing root file, the " "--update option can be used. If a tree with the same name already exists, " "the tree is updated. The compression level for a new root file can be set " "with --compression.\n" "\n" "For several purposes, it might be convenient to split the output to several " "different root-treess. This can be done using the --split-sequence (-S) " "and the --split-quantile (-Q) options. If a split sequence is defined as " "-S 1 -S 2 -S 1 the events are split by 1:2:1 in this sequence order. If " "quantiles are given as -Q 0.5 -Q 0.6, the first tree will contain 50% of " "the second one 10% and the third one 40%. The corresponding seed value can " "be set with --seed.\n" "\n" "In case of success, 0 is returned, a value>0 otherwise.\n" "\n" "Usage: csv2root input.csv [output.root] [-t tree] [-u] [-f] [-n] [-vN] [-cN]\n" "\n" ; cout << endl; } /* void ErrorHandlerAll(Int_t level, Bool_t abort, const char *location, const char *msg) { if (string(msg).substr(0,24)=="no dictionary for class ") return; if (string(msg).substr(0,15)=="unknown branch ") return; DefaultErrorHandler(level, abort, location, msg); }*/ bool AddTree(vector &ttree, TFile &file, const string &tree, bool update, int verbose) { bool found = false; TTree *T = 0; if (update) { file.GetObject(tree.c_str(), T); if (T) { ttree.emplace_back(T); found = true; if (verbose>0) cout << "Updating tree: " << tree << endl; } } if (!T) ttree.emplace_back(new TTree(tree.c_str(), "csv2root")); return found; } int main(int argc, const char* argv[]) { Time start; gROOT->SetBatch(); //SetErrorHandler(ErrorHandlerAll); Configuration conf(argv[0]); conf.SetPrintUsage(PrintUsage); SetupConfiguration(conf); if (!conf.DoParse(argc, argv)) return 127; // ----------------------------- Evaluate options -------------------------- const string file = conf.Get("file"); const string tree = conf.Get("tree"); const bool force = 
conf.Get("force"); const bool update = conf.Get("update"); // const bool dryrun = conf.Get("dry-run"); const bool noheader = conf.Get("no-header"); const uint16_t verbose = conf.Get("verbose"); // const int64_t first = conf.Get("first"); // const int64_t max = conf.Get("max"); const uint16_t compression = conf.Get("compression"); string out = conf.Get("out"); if (out.empty()) { out = file; const auto p = out.find_last_of('.'); if (p!=string::npos) out = string(out.substr(0, p))+".root"; } // ----------------------------- Setup splitting --------------------------- vector split_seq = conf.Vec("split-sequence"); vector split_quant = conf.Vec("split-quantile"); if (!split_seq.empty() && !split_quant.empty()) throw runtime_error("Only splitting by --split-sequence or --split-quantile is allowed."); const size_t num_split = split_seq.size()+split_quant.size()==0 ? 0 : ::max(split_seq.size(), split_quant.size()+1); map split_lut; for (size_t i=0; i=1) throw runtime_error("Splitting quantiles must be in the range [0;1)"); for (size_t i=1; i distribution(0,1); mt19937_64 generator; generator.seed(conf.Get("seed")); auto rndm = bind(distribution, generator); // ------------------------------------------------------------------------- if (verbose>0) { cout << "\n-------------------------- Evaluating input ------------------------\n"; cout << "Start Time: " << Time::sql << Time(Time::local) << endl; } // ------------------------------------------------------------------------- cout << "Reading from '" << file << "'.\n"; ifstream fin(file.c_str()); if (!fin.good()) { cerr << file << ": " << strerror(errno) << endl; return 1; } TString buf; buf.ReadLine(fin); if (!fin) { cerr << file << ": " << strerror(errno) << endl; return 2; } buf = buf.Strip(TString::kBoth); TObjArray *title = buf.Tokenize(" "); if (title->GetEntries()==0) { cerr << "First line empty." 
<< endl; return 3; } if (title->At(0)->GetName()[0]=='#') title->RemoveAt(0); const auto numcol = title->GetEntries(); if (verbose>0) cout << "Found " << numcol << " columns." << endl; if (noheader) { fin.seekg(0); if (verbose>0) cout << "No header line interpreted." << endl; } // ------------------------------------------------------------------------- TString path(out.c_str()); gSystem->ExpandPathName(path); // if (!dryrun) { FileStat_t stat; const Int_t exist = !gSystem->GetPathInfo(path, stat); const Bool_t _write = !gSystem->AccessPathName(path, kWritePermission) && R_ISREG(stat.fMode); if ((update && !exist) || (update && exist && !_write) || (force && exist && !_write)) { cerr << "File '" << path << "' is not writable." << endl; return 4; } if (!update && !force && exist) { cerr << "File '" << path << "' already exists." << endl; return 5; } } TFile tfile(path, update?"UPDATE":(force?"RECREATE":"CREATE"), file.c_str(), compression); if (tfile.IsZombie()) { cerr << "Opening file '" << path << "' failed." << endl; return 6; } if (verbose>0) { cout << "Opened root file '" << path << "'.\n"; cout << "Writing to tree: " << tree << ".\n"; } // -------------------- Configure branches of TTree ------------------------ vector ttree; size_t entries = 0; if (num_split==0) { if (AddTree(ttree, tfile, tree, update, verbose)) { entries = ttree[0]->GetEntries(); if (verbose>0) cout << "Tree has " << entries << " entries." 
<< endl; } } else { bool found = false; for (size_t i=0; i vec(numcol); for (int i=0; iAt(i)->GetName(); boost::regex rexpr(":"); col = boost::regex_replace(col, rexpr, ""); if (verbose>1) cout << "Column: " << col << '\n'; for (auto it=ttree.begin(); it!=ttree.end(); it++) it[0]->Branch(col.c_str(), vec.data()+i); } delete title; // ------------------------------------------------------------------------- size_t line = 0; size_t valid = 0; while (1) { buf.ReadLine(fin); if (!fin) break; line++; buf = buf.Strip(TString::kBoth); if (buf.IsNull() || buf[0]=='#') continue; valid++; TObjArray *arr = buf.Tokenize(" "); if (arr->GetEntries()!=numcol) { cerr << "Column count mismatch in line " << line+1 << "!" << endl; return 7; } for (int i=0; iAt(i)->GetName()); } catch (const exception &e) { cerr << "Conversion of '" << arr->At(i)->GetName() << "' failed!" << endl; return 8; } } delete arr; size_t index = 0; if (!split_lut.empty()) index = split_lut[line % split_lut.size()]; if (!split_quant.empty()) { const float r = rndm(); for (; r>=split_quant[index]; index++) if (index==split_quant.size()) break; } ttree[index]->Fill(); } if (verbose>0) { cout << valid << " data rows found in " << line << " lines (excl. title)." << endl; if (!update || !entries) { for (size_t i=0; iGetEntries() << " rows filled into tree #" << i << "." << endl; } } if (entries && entries!=line) cerr << "\nWARNING - Number of updated entries does not match number of entries in tree!\n" << endl; for (auto it=ttree.begin(); it!=ttree.end(); it++) (*it)->Write("", TObject::kOverwrite); tfile.Close(); if (verbose>0) { const auto sec = Time().UnixTime()-start.UnixTime(); cout << Tools::Scientific(tfile.GetSize()) << "B written to disk.\n"; cout << "File closed.\n"; cout << "Execution time: " << sec << "s "; cout << "(" << Tools::Fractional(sec/line) << "s/row)\n"; cout << "--------------------------------------------------------------" << endl; } return 0; }