// NOTE(review): in this copy of the file everything that stood between '<' and '>'
// appears to be missing: the header names of the angle-bracket includes below, template
// argument lists (pair, map, vector, reinterpret_cast, var/vars, Configuration::Get/Vec)
// and a few longer spans. The code is kept exactly as found; restore the lost text from
// the upstream file before building.
#include
#include
#include
#include

#include "tools.h"
#include "Time.h"
#include "Configuration.h"

#include
#include
#include
#include
#include
#include
#include

using namespace std;

namespace fs = boost::filesystem;

// ------------------------------------------------------------------------

// Pair that is filled by the stream extraction operator below. The operator assigns
// std::string values to first and second.
// NOTE(review): template arguments of 'pair' are missing here (presumably two strings -
// TODO confirm).
struct Map : pair
{
    Map() { }
};

// Reads everything that is left in 'in' and splits it with the regular expression
// ((.*)[^\\])/(.*): sub-match 1 (text that ends in a character other than a backslash)
// becomes m.first, the text after the following '/' (sub-match 3) becomes m.second.
// As the first group is greedy, the split should happen at the last '/' that is not
// preceded by a backslash.
// Throws runtime_error if the text does not match the expression. Returns 'in'.
std::istream &operator>>(std::istream &in, Map &m)
{
    // NOTE(review): template argument of istreambuf_iterator missing (twice)
    const istreambuf_iterator eos;
    string txt(istreambuf_iterator(in), eos);

    const boost::regex expr("((.*)[^\\\\])/(.*)");
    boost::smatch match;
    if (!boost::regex_match(txt, match, expr))
        throw runtime_error("Could not evaluate map argument: "+txt);

    m.first = match[1].str();
    m.second = match[3].str();

    return in;
}

// Registers the command line options of the program with 'conf' in three groups
// ("Root to SQL", "Splitting options", "Debug options") and declares "file" as the
// positional option (added with a count of -1).
// po, var, vars and po_switch are not defined in the visible part of this file;
// presumably they are provided by Configuration.h.
// NOTE(review): the template arguments of all var()/vars() calls are missing, so the
// value type of each option cannot be read from this copy.
void SetupConfiguration(Configuration &conf)
{
    po::options_description control("Root to SQL");
    control.add_options()
        ("file", vars()->required(),"The root files to read from")
        ("out,o", var()->required(), "Output file name")
        ("force,f", po_switch(), "Force overwrite if output file already exists.")
        ("append,a", po_switch(), "Append to an existing file (not check for the format is done!)")
        ("tree,t", var("Events"), "Name of the root tree to convert")
        ("ignore", vars(), "Ignore the given leaf, if the given regular expression matches")
        ("alias.*", var(), "Define an alias")
        ("auto-alias", vars(), "Regular expression to define aliases from the branch names automatically")
        ("header", var(uint16_t(0)),"Type of header line (0: preceeding #, 1: without preceeding #, 2: none)")
        ("add.*", var(), "Define an additional column")
        ("selector,s", var("1"), "Define a selector for the columns (colums where this evaluates to a value <=0 are discarded)")
        ("skip", po_switch(), "Discards all default leaves and writes only the columns defined by --add.*")
        ("first", var(int64_t(0)), "First event to start with (default: 0), mainly for test purpose")
        ("max", var(int64_t(0)), "Maximum number of events to process (0: all), mainly for test purpose")
        //("const.*", var(), "Insert a constant number into the given column (--const.mycolumn=5). A special case is `/.../.../`")
        ("dry-run", po_switch(), "Do not create or manipulate any output file")
        ;

    po::options_description split("Splitting options");
    split.add_options()
        ("split-sequence,S", vars(), "Split data sequentially into several trees/files (e.g. 1, 1, 2)")
        ("split-quantile,Q", vars(), "Split data randomly into several trees/files (e.g. 0.5, 1)")
        ("seed", var(mt19937_64::default_seed), "Seed value in case of random split")
        ;

    po::options_description debug("Debug options");
    debug.add_options()
        ("print-ls", po_switch(), "Calls TFile::ls()")
        ("print-branches", po_switch(), "Print the branches found in the tree")
        ("print-leaves", po_switch(), "Print the leaves found in the tree (this is what is processed)")
        ("verbose,v", var(1), "Verbosity (0: quiet, 1: default, 2: more, 3, ...)")
        ;

    po::positional_options_description p;
    p.add("file", -1); // All positional options

    conf.AddOptions(control);
    conf.AddOptions(split);
    conf.AddOptions(debug);
    conf.SetArgumentPositions(p);
}

// Writes the help text of the program to cout. The function is handed to the
// configuration object in main() via SetPrintUsage.
// NOTE(review): the text refers to an option --table and the usage line lists the
// switches -u, -n and -cN. None of them is registered in SetupConfiguration() above;
// unless the Configuration class adds them, these parts of the text look like leftovers
// from a sibling tool and should be checked. The text also contains a number of
// spelling mistakes. The strings are left untouched here because they are runtime output.
void PrintUsage()
{
    cout <<
        "root2csv - Reads data from a root tree and writes a csv file\n"
        "\n"
        "For convenience, this documentation uses the extended version of the options, "
        "refer to the output below to get the abbreviations.\n"
        "\n"
        "Similar functionaliy is also provided by root2sql. In addition to root2sql, "
        "this tool is more flexible in the slection of columns and adds the possibility "
        "to use formulas (implemented through TTreeFormula) to calculate values for "
        "additional columns. Note that root can even write complex data like a TH1F "
        "into a file. Here, only numeric columns are supported.\n"
        "\n"
        "Input files are given as positional arguments or with --file. "
        "As files are read by adding them through TChain::Add, wildcards are "
        "supported in file names. Note that on the command lines, file names "
        "with wildcards have to be escaped in quotation marks if the wildcards "
        "should be evaluated by the program and not by the shell. The output base "
        "name of the output file(s) is given with --out.\n"
        "\n"
        "The format of the first line on the file is defined with the --header option:\n"
        " 0: '# Col1 Col2 Col3 ...'\n"
        " 1: 'Col1 Col2 Col3 ...'\n"
        " 2: first data row\n"
        "\n"
        "As default, existing files are not overwritten. To force overwriting use "
        "--force. To append data to existing files use --append. Note that no "
        "check is done if this created valid and reasonable files.\n"
        "\n"
        "Each root tree has branches and leaves (the basic data types). These leaves can "
        "be read independently of the classes which were used to write the root file. "
        "The default tree to read from is 'Events' but the name can be overwritten "
        "using --tree. The default table name to fill the data into is identical to "
        "the tree name. It can be overwritten using --table.\n"
        "\n"
        "To get a list of the contents (keys and trees) of a root file, you can use --print-ls. "
        "The name of each column to which data is filled from a leave is obtained from "
        "the leaves' names. The leave names can be checked using --print-leaves. "
        "A --print-branches exists for convenience to print only the high-level branches.\n"
        "\n"
        "Assuming a leaf with name MHillas.fWidth and a leaf with MHillas.fLength, "
        "a new column can be added with name Area by\n"
        " --add.Area='TMath::TwoPi()*MHillas.fWidth*MHillas.fLength'\n"
        "\n"
        "To simplify expression, root allows to define aliases, for example\n"
        " --alias.Width='MHillas.fWidth'\n"
        " --alias.Length='MHillas.fLength'\n"
        "\n"
        "This can then be used to simplyfy the above expression as\n"
        " --add.Area='TMath::TwoPi()*Width*Length'\n"
        "\n"
        "Sometimes leaf names might be quite unconvenient like MTime.fTime.fMilliSec or "
        "just MHillas.fWidth. To allow to simplify column names, regular expressions "
        "(using boost's regex) can be defined to change the names. Note that these regular "
        "expressions are applied one by one on each leaf's name. A valid expression could "
        "be:\n"
        " --auto-alias=MHillas\\.f/\n"
        "which would remove all occurances of 'MHillas.f'. This option can be used more than "
        "once. They are applied in sequence. A single match does not stop the sequence. "
        "In addition to replacing the column names accordingly, a alias is created "
        "automatically allowing to access the columns in a formula with the new name.\n"
        "\n"
        "Sometimes it might also be convenient to skip a leaf, i.e. not writing the "
        "coresponding column in the output file. This can be done with "
        "the --ignore resource. If the given regular expresion yields a match, the "
        "leaf will be ignored. An automatic alias would still be created and the "
        "leaf could still be used in a formula. Example\n"
        " --ignore=ThetaSq\\..*\n"
        "will skip all leaved which start with 'ThetaSq.'. This directive can be given "
        "more than once. The so defined ignore list is applied entry-wise, first to the "
        "raw leaf names, then to the aliased names.\n"
        "\n"
        "To select only certain extries from the file, a selector (cut) can be defined "
        "in the same style as the --add directives, for exmple:\n"
        " --selector='MHillas.fLength*Width<0'\n"
        "Note that the selctor is not evaluated to a boolean expression (==0 or !=0) "
        "but all positive none zero values are considered 'true' (select the entry) "
        "and all negative values are considered 'fales' (discard the entry).\n"
        "\n"
        "For several purposes, it might be convenient to split the output to several "
        "files. This can be achieved using the --split-sequence (-S) "
        "and the --split-quantile (-Q) options. If a split sequence is defined as "
        "-S 1 -S 2 -S 1 the events are split by 1:2:1 in this sequence order. If "
        "quantiles are given as -Q 0.5 -Q 0.6, the first tree will contain 50% of "
        "the second one 10% and the third one 40%. The corresponding seed value can "
        "be set with --seed. Filenames are then created by adding an index after(!) "
        "the extension, e.g. file.csv-0, file.csv-1, ...\n"
        "\n"
        "In case of success, 0 is returned, a value>0 otherwise.\n"
        "\n"
        "Usage: root2csv input1.root [input2.root ...] -o output.csv [-t tree] [-u] [-f] [-n] [-vN] [-cN]\n"
        "\n"
        ;
    cout << endl;
}

// Element type of the buffer owned by a Container.
// kConst marks a column that carries a fixed text instead of a buffer (see the
// two-string constructor of Container); for kNone and kConst no buffer is allocated.
enum BasicType_t
{
    kNone = 0,
    kConst,
    kFloat,
    kDouble,
    kInt16,
    kUInt16,
    kInt32,
    kUInt32,
    kInt64,
    kUInt64,
};

// Lookup table from the type name of a leaf (as compared against TLeaf::GetTypeName()
// further down) to the matching BasicType_t and a second string. In the code visible
// here only the BasicType_t part (it->second.first) is read; the strings look like SQL
// column types - presumably inherited from the SQL variant of this tool.
// NOTE(review): the template arguments of 'map' are missing in this copy.
static const map> ConvRoot =
{
    { "Float_t", { kFloat, "FLOAT" } },
    { "Double_t", { kDouble, "DOUBLE" } },
    { "ULong64_t", { kUInt64, "BIGINT UNSIGNED" } },
    { "Long64_t", { kInt64, "BIGINT" } },
    { "UInt_t", { kUInt32, "INT UNSIGNED" } },
    { "Int_t", { kInt32, "INT" } },
    { "UShort_t", { kUInt16, "SMALLINT UNSIGNED" } },
    { "Short_t", { kInt16, "SMALLINT" } },
};

// Describes one output column together with the buffer its values are read into.
// Copies of a Container share the same buffer; the static 'counter' keeps, per buffer
// address, the number of Container objects referring to it, and the buffer is released
// when that number drops to zero.
// NOTE(review): only a copy constructor is defined, no copy assignment; an assignment
// would copy 'ptr' without touching the counter.
struct Container
{
    // Number of Container objects per buffer address
    // NOTE(review): template arguments of 'map' missing (key is the void* buffer)
    static map counter;

    string branch; // branch name
    string column; // column name
    BasicType_t type;
    size_t num;
    void *ptr;

    // Creates a column for branch 'b' with output name 'c' and allocates an array of
    // 'n' elements of the type selected by 't'. For kConst and kNone nothing is
    // allocated and ptr stays 0.
    Container(const string &b, const string &c, const BasicType_t &t, const size_t n=1)
        : branch(b), column(c), type(t), num(n), ptr(0)
    {
        switch (t)
        {
        case kFloat: ptr = new Float_t[n]; break;
        case kDouble: ptr = new Double_t[n]; break;
        case kInt16: ptr = new Short_t[n]; break;
        case kUInt16: ptr = new UShort_t[n]; break;
        case kInt32: ptr = new Int_t[n]; break;
        case kUInt32: ptr = new UInt_t[n]; break;
        case kInt64: ptr = new Long64_t[n]; break;
        case kUInt64: ptr = new ULong64_t[n]; break;
        case kConst:
        case kNone:
            break;
        }
        counter[ptr]++;
    }

    // Creates a constant column named 'c'. The text 'value' is stored in 'branch' and
    // is what fmt() returns for this column.
    // NOTE(review): unlike the other constructors this one does not increment
    // counter[ptr] (ptr is 0) although the destructor always decrements it, so the
    // entry for the null pointer gets out of balance. Looks harmless as long as only
    // a null pointer is ever released through that entry - confirm.
    Container(const string &c, const string &value)
        : branch(value), column(c), type(kConst), num(1), ptr(0)
    {
    }

    // Copy: shares the buffer of 'c' and registers one more user of it
    Container(const Container &c)
        : branch(c.branch), column(c.column), type(c.type), num(c.num), ptr(c.ptr)
    {
        counter[ptr]++;
    }

    // Unregisters this user; the last user releases the buffer.
    // NOTE(review): the arrays are created with new T[n] but released through
    // ::operator delete[] on the void pointer - verify that this pairing is intended.
    ~Container()
    {
        counter[ptr]--;
        if (counter[ptr]==0)
            ::operator delete[](ptr); // It seems root is deleting it already
    }

    // Returns element 'index' of the buffer as text. kFloat is written with a
    // precision of 8, kDouble with 16, the integer types with the stream defaults.
    // For kConst the stored text (branch) is returned, for kNone an empty string.
    // 'index' is not checked against 'num'.
    // NOTE(review): the target types of all reinterpret_cast expressions are missing
    // in this copy (presumably pointers to the types allocated in the constructor).
    string fmt(const size_t &index) const
    {
        ostringstream str;

        switch (type)
        {
        case kFloat: str << setprecision(8) << reinterpret_cast(ptr)[index]; break;
        case kDouble: str << setprecision(16) << reinterpret_cast(ptr)[index]; break;
        case kInt16: str << reinterpret_cast(ptr)[index]; break;
        case kUInt16: str << reinterpret_cast(ptr)[index]; break;
        case kInt32: str << reinterpret_cast(ptr)[index]; break;
        case kUInt32: str << reinterpret_cast(ptr)[index]; break;
        case kInt64: str << reinterpret_cast(ptr)[index]; break;
        case kUInt64: str << reinterpret_cast(ptr)[index]; break;
        case kConst: str << branch; break;
        case kNone:
            break;
        }

        //if (str.str()=="nan" || str.str()=="-nan" || str.str()=="inf" || str.str()=="-inf")
        //    return "NULL";

        return str.str();
    }
};

// Definition of the static member (template arguments missing, see above)
map Container::counter;

// Error handler installed in main() via SetErrorHandler. Messages that start with
// "no dictionary for class " or "unknown branch " are dropped, everything else is
// passed on to DefaultErrorHandler.
// 'msg' is converted to a string without a null check.
void ErrorHandlerAll(Int_t level, Bool_t abort, const char *location, const char *msg)
{
    if (string(msg).substr(0,24)=="no dictionary for class ")
        return;
    if (string(msg).substr(0,15)=="unknown branch ")
        return;

    DefaultErrorHandler(level, abort, location, msg);
}

// --------------------------- Write Header --------------------------------

// Writes the line with the column names to 'out'.
//   header>1  : nothing is written
//   header==0 : the line is prefixed with "# "
// Unless 'skip' is set, the names come from the entries of 'vec' first (for entries
// with num!=1 an index "[i]" is appended), followed by GetName() of every entry of
// 'form'. The names are joined with a single blank and terminated by a newline.
// NOTE(review): the element types of both vector parameters and of 'join' are missing,
// and the head of the inner loop is garbled ("i=0; icolumn;" - the loop condition, the
// opening brace and the declaration of 'name' seem to have been fused). Kept as found.
void WriteHeader(ostream &out, const vector &vec, const vector &form, bool skip, uint16_t header)
{
    if (header>1)
        return;
    if (header==0)
        out << "# ";

    vector join;

    if (!skip)
    {
        for (auto v=vec.cbegin(); v!=vec.cend(); v++)
        {
            const size_t N = v->num;
            for (size_t i=0; icolumn;
                if (N!=1)
                    name += "["+to_string(i)+"]";
                join.emplace_back(name);
            }
        }
    }

    for (auto v=form.cbegin(); v!=form.cend(); v++)
        join.emplace_back((*v)->GetName());

    out << boost::join(join, " ") << "\n";
}

// Expands 'path' in place and checks whether it may be used as output file.
// Returns
//   -1 : the path does not exist (exist is set when GetPathInfo returns 0)
//    0 : the file exists, is a writable regular file and 'force' is set
//        (a notice is printed to cerr if verbose>0)
//    2 : the file exists but is not writable or not a regular file
//    3 : the file exists and 'force' is not set
int CheckFile(TString &path, bool force, int verbose)
{
    gSystem->ExpandPathName(path);

    FileStat_t stat;
    const Int_t exist = !gSystem->GetPathInfo(path, stat);
    // Only evaluated below if the path exists
    const Bool_t _write = !gSystem->AccessPathName(path, kWritePermission) && R_ISREG(stat.fMode);

    if (exist)
    {
        if (!_write)
        {
            cerr << "File '" << path << "' is not writable." << endl;
            return 2;
        }

        if (!force)
        {
            cerr << "File '" << path << "' already exists." << endl;
            return 3;
        }
        else
        {
            if (verbose>0)
                cerr << "File '" << path << "' will be overwritten." << endl;
        }
    }

    return exist ? 0 : -1;
}

// Appends the names of the leaves f.GetLeaf(0), f.GetLeaf(1), ... to 'list' and stops
// at the first index for which GetLeaf returns a null pointer.
// NOTE(review): element type of 'list' missing in this copy.
void GetLeaves(vector &list, const TTreeFormula &f)
{
    int i=0;

    while (1)
    {
        const auto l = f.GetLeaf(i++);
        if (!l)
            return;
        list.emplace_back(l->GetName());
    }
}

// Program entry point. Only the beginning of the function is laid out here; the
// remainder follows unchanged.
// NOTE(review): the template arguments of conf.Get/conf.Vec and of the vector
// declarations are missing in this copy.
int main(int argc, const char* argv[])
{
    Time start;

    gROOT->SetBatch();
    SetErrorHandler(ErrorHandlerAll);

    Configuration conf(argv[0]);
    conf.SetPrintUsage(PrintUsage);
    SetupConfiguration(conf);

    if (!conf.DoParse(argc, argv))
        return 127;

    // ----------------------------- Evaluate options --------------------------
    const vector files = conf.Vec("file");
    const string out = conf.Get("out");
    const string tree = conf.Get("tree");
    const bool force = conf.Get("force");
    const bool append = conf.Get("append");
    const bool dryrun = conf.Get("dry-run");
    const bool skip = conf.Get("skip");
    const uint16_t verbose = conf.Get("verbose");
    const int64_t first = conf.Get("first");
    const int64_t max = conf.Get("max");
    const uint16_t header = conf.Get("header");

    const bool print_ls = conf.Get("print-ls");
    const bool print_branches = conf.Get("print-branches");
    const bool print_leaves = conf.Get("print-leaves");

    const vector _ignore = conf.Vec("ignore");
    const vector autoalias = conf.Vec("auto-alias");

    // ----------------------------- Setup splitting ---------------------------

    vector split_seq = conf.Vec("split-sequence");
    vector split_quant = conf.Vec("split-quantile");

    // The two ways of splitting are mutually exclusive
    if (!split_seq.empty() && !split_quant.empty())
        throw runtime_error("Only splitting by --split-sequence or --split-quantile is allowed.");

    const size_t num_split = split_seq.size()+split_quant.size()==0 ?
0 : ::max(split_seq.size(), split_quant.size()+1); map split_lut; for (size_t i=0; i=1) throw runtime_error("Splitting quantiles must be in the range [0;1)"); for (size_t i=1; i distribution(0,1); mt19937_64 generator; generator.seed(conf.Get("seed")); auto rndm = bind(distribution, generator); // ------------------------------------------------------------------------- if (verbose>0) { cout << "\n-------------------------- Evaluating input ------------------------\n"; cout << "Start Time: " << Time::sql << Time(Time::local) << endl; } if (verbose>0) cout << "Processing Tree: " << tree << endl; TChain c(tree.c_str()); uint64_t cnt = 0; for (const auto &file : files) { const auto add = c.Add(file.c_str(), 0); if (verbose>0) cout << file << ": " << add << " file(s) added." << endl; cnt += add; } if (cnt==0) { cerr << "No files found." << endl; return 1; } if (verbose>0) cout << cnt << " file(s) found." << endl; if (print_ls) { cout << '\n'; c.ls(); cout << '\n'; } c.SetMakeClass(1); TObjArray *branches = c.GetListOfBranches(); TObjArray *leaves = c.GetListOfLeaves(); if (print_branches) { cout << '\n'; branches->Print(); } const auto entries = c.GetEntriesFast(); if (verbose>0) cout << branches->GetEntries() << " branches found." << endl; if (print_leaves) { cout << '\n'; leaves->Print(); } if (verbose>0) { cout << leaves->GetEntries() << " leaves found." << endl; cout << entries << " events found." 
<< endl; } // ---------------------------------------------------------------------- if (verbose>0) cout << "\n-------------------------- Evaluating output -----------------------" << endl; vector vec; /* const auto fixed = conf.GetWildcardOptions("const.*"); string where; vector vindex; for (auto it=fixed.cbegin(); it!=fixed.cend(); it++) { const string name = it->substr(6); string val = conf.Get(*it); boost::smatch match; if (boost::regex_match(val, match, boost::regex("\\/(.+)(?0) { cout << "Regular expression detected for constant column `" << *it << "`\n"; cout << "Filename converted with /" << reg << "/ to /" << fmt << "/\n"; cout << "Filename: " << file << '\n'; cout << "Result: " << val << endl; } } if (verbose>2) cout << "\n" << val << " [-const-]"; if (verbose>1) cout << " (" << name << ")"; string sqltype = "INT UNSIGNED"; for (auto m=sqltypes.cbegin(); m!=sqltypes.cend(); m++) if (m->first==name) sqltype = m->second; if (!vec.empty()) query += ",\n"; query += " `"+name+"` "+sqltype+" NOT NULL COMMENT '--user--'"; vec.emplace_back(name, val); where += " AND `"+name+"`="+val; vindex.emplace_back(name); } */ // ------------------------- Setup all branches in tree ------------------- TIter Next(leaves); TObject *o = 0; while ((o=Next())) { TLeaf *L = c.GetLeaf(o->GetName()); string name = o->GetName(); for (auto m=autoalias.cbegin(); m!=autoalias.cend(); m++) name = boost::regex_replace(name, boost::regex(m->first), m->second); if (name!=o->GetName()) { if (verbose>0) cout << "Auto-alias: " << name << " = " << o->GetName() << endl; if (!c.SetAlias(name.c_str(), o->GetName())) cout << "WARNING - Alias could not be established!" 
<< endl; } if (skip) continue; if (verbose>2) cout << '\n' << L->GetTitle() << " {" << L->GetTypeName() << "}"; if (L->GetLenStatic()!=L->GetLen()) { if (verbose>2) cout << " (-skipped-)"; continue; } bool found = false; for (auto b=_ignore.cbegin(); b!=_ignore.cend(); b++) { if (boost::regex_match(o->GetName(), boost::regex(*b))) { found = true; if (verbose>2) cout << " (-ignored-)"; break; } } for (auto b=_ignore.cbegin(); b!=_ignore.cend(); b++) { if (boost::regex_match(name.c_str(), boost::regex(*b))) { found = true; if (verbose>2) cout << " (-ignored-)"; break; } } if (found) continue; const string tn = L->GetTypeName(); auto it = ConvRoot.find(tn); if (it==ConvRoot.end()) { if (verbose>2) cout << " (-n/a-)"; continue; } if (verbose==2) cout << '\n' << L->GetTitle() << " {" << L->GetTypeName() << "}"; if (verbose>1) cout << " (" << name << ")"; vec.emplace_back(o->GetTitle(), name, it->second.first, L->GetLenStatic()); c.SetBranchAddress(o->GetTitle(), vec.back().ptr); } if (verbose>0) { if (skip) cout << "Default columns skipped: "; cout << vec.size() << " default leaf/leaves setup for reading." << endl; } // ------------------- Configure manual aliases ---------------------------- const auto valiases = conf.GetWildcardOptions("alias.*"); if (verbose>0 && valiases.size()>0) cout << '\n'; for (auto it=valiases.cbegin(); it!=valiases.cend(); it++) { const string name = it->substr(6); const string val = conf.Get(*it); if (verbose>0) cout << "Alias: " << name << " = " << val << endl; if (!c.SetAlias(name.c_str(), val.c_str())) { cerr << "Alias could not be established!" 
<< endl; return 2; } } // -------------------------- Configure Selector -------------------------- vector leaflist; c.SetBranchStatus("*", 1); TTreeFormulaManager *manager = new TTreeFormulaManager; if (verbose>0) cout << "\nSelector: " << conf.Get("selector") << endl; TTreeFormula selector("Selector", conf.Get("selector").c_str(), &c); if (selector.GetNdim()==0) { cerr << "Compilation of Selector failed!" << endl; return 3; } selector.SetQuickLoad(kTRUE); manager->Add(&selector); GetLeaves(leaflist, selector); // -------------------- Configure additional columns ---------------------- vector formulas; const auto vform = conf.GetWildcardOptions("add.*"); if (verbose>0 && vform.size()>0) cout << '\n'; for (auto it=vform.cbegin(); it!=vform.cend(); it++) { const string name = it->substr(4); const string val = conf.Get(*it); if (verbose>0) cout << "Adding column: " << name << " = " << val << endl; TTreeFormula *form = new TTreeFormula(name.c_str(), val.c_str(), &c); if (form->GetNdim()==0) { cerr << "Compilation of Column failed!" << endl; return 4; } form->SetQuickLoad(kTRUE); formulas.emplace_back(form); manager->Add(form); GetLeaves(leaflist, *form); } manager->Sync(); if (verbose>0) cout << '\n' << formulas.size() << " additional columns setup for writing." 
<< endl; // --------------------- Setup all branches in formulas ------------------- for (auto l=leaflist.cbegin(); l!=leaflist.cend(); l++) { // Branch address already set if (c.GetBranch(l->c_str())->GetAddress()) continue; TLeaf *L = c.GetLeaf(l->c_str()); if (verbose>2) cout << '\n' << L->GetTitle() << " {" << L->GetTypeName() << "}"; if (L->GetLenStatic()!=L->GetLen()) { if (verbose>2) cout << " (-skipped-)"; continue; } const string tn = L->GetTypeName(); auto it = ConvRoot.find(tn); if (it==ConvRoot.end()) { if (verbose>2) cout << " (-n/a-)"; continue; } if (verbose==2) cout << '\n' << L->GetTitle() << " {" << L->GetTypeName() << "}"; if (verbose>1) cout << " (" << *l << ")"; vec.emplace_back(l->c_str(), l->c_str(), it->second.first, L->GetLenStatic()); c.SetBranchAddress(l->c_str(), vec.back().ptr); } if (verbose>1) cout << '\n'; // ------------------------- Enable branch reading ------------------------ UInt_t datatype = 0; const bool has_datatype = c.SetBranchAddress("DataType.fVal", &datatype) >= 0; // Seting up branch status (must be after all SetBranchAddress) c.SetBranchStatus("*", 0); for (auto v=vec.cbegin(); v!=vec.cend(); v++) if (v->type!=kConst) c.SetBranchStatus(v->branch.c_str(), 1); if (has_datatype) { c.SetBranchStatus("DataType.fVal", 1); if (verbose>0) cout << "Rows with DataType.fVal!=1 will be skipped." << endl; } // ------------------------------------------------------------------------- if (num_split) { cout << "\nSplitting configured " << (split_seq.empty()?"randomly":"in sequence") << " into " << num_split << " files." << endl; if (!split_quant.empty()) cout << "Seed value configured as " << conf.Get("seed") << "." << endl; } if (dryrun) { cout << "\nDry run: file output skipped!" 
<< endl; return 0; } if (verbose>0) cout << "\n-------------------------- Converting file -------------------------" << endl; vector outfiles; if (num_split==0) { TString path(out.c_str()); const int rc = CheckFile(path, force, verbose); if (rc>0) return rc; outfiles.emplace_back(path.Data(), append ? ios::app : ios::trunc); if (rc==-1 || (force && rc==0 && !append)) WriteHeader(outfiles.back(), vec, formulas, skip, header); } else { for (size_t i=0; i0) return rc; outfiles.emplace_back(path.Data(), append ? ios::app : ios::trunc); if (rc==-1 || (force && rc==0 && !append)) WriteHeader(outfiles.back(), vec, formulas, skip, header); } } // ---------------------------- Write Body -------------------------------- size_t count = 0; vector ncount(num_split?num_split:1); auto itree = c.GetTreeNumber(); const size_t num = max>0 && (max-first)UpdateFormulaLeaves(); itree = c.GetTreeNumber(); } if (selector.GetNdim() && selector.EvalInstance(0)<=0) continue; size_t index = 0; if (!split_lut.empty()) index = split_lut[count % split_lut.size()]; if (!split_quant.empty()) { const float r = rndm(); for (; r>=split_quant[index]; index++) if (index==split_quant.size()) break; } vector join; if (!skip) { for (auto v=vec.cbegin(); v!=vec.cend(); v++) { const size_t N = v->num; for (size_t i=0; ifmt(i)); } } for (auto v=formulas.cbegin(); v!=formulas.cend(); v++) join.emplace_back(to_string((*v)->EvalInstance(0))); outfiles[index] << boost::join(join, " ") << "\n"; count ++; ncount[index] ++; } if (verbose>0) { cout << "\nTotal: N=" << count << " out of " << num << " row(s) written [N=" << first << ".." << num-1 << "]." << endl; for (int i=0; i