Changeset 19802 for trunk/FACT++
- Timestamp:
- 10/27/19 11:11:01 (5 years ago)
- Location:
- trunk/FACT++/src
- Files:
-
- 1 added
- 3 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/FACT++/src/csv2root.cc
r19798 r19802 1 #include <random>2 3 1 #include <boost/regex.hpp> 4 2 #include <boost/filesystem.hpp> … … 7 5 #include "tools.h" 8 6 #include "Time.h" 9 #include " Configuration.h"7 #include "Splitting.h" 10 8 11 9 #include <TROOT.h> … … 18 16 using namespace std; 19 17 namespace fs = boost::filesystem; 18 19 // ------------------------------------------------------------------------ 20 20 21 21 22 // ------------------------------------------------------------------------ … … 32 33 ("compression,c", var<uint16_t>(1), "zlib compression level for the root file") 33 34 ("no-header,n", po_switch(), "Use if the first line contains no header") 35 ("rename.*", var<string>(), "Can be used to rename a column") 34 36 ("dry-run", po_switch(), "Do not create or manipulate any output file") 35 ;36 37 po::options_description split("Splitting options");38 split.add_options()39 ("split-sequence,S", vars<uint16_t>(), "Split data sequentially into several trees/files (e.g. 1, 1, 2)")40 ("split-quantile,Q", vars<double>(), "Split data randomly into several trees/files (e.g. 0.5, 1)")41 ("seed", var<uint64_t>(mt19937_64::default_seed), "Seed value in case of random split")42 ;43 44 po::options_description debug("Debug options");45 debug.add_options()46 37 ("verbose,v", var<uint16_t>(1), "Verbosity (0: quiet, 1: default, 2: more, 3, ...)") 47 38 ; … … 53 44 54 45 conf.AddOptions(control); 55 conf.AddOptions(split); 56 conf.AddOptions(debug); 46 conf.AddOptions(Tools::Splitting::options()); 57 47 conf.SetArgumentPositions(p); 58 48 } … … 90 80 "with --compression.\n" 91 81 "\n" 92 "For several purposes, it might be convenient to split the output to several " 93 "different root-treess. This can be done using the --split-sequence (-S) " 94 "and the --split-quantile (-Q) options. If a split sequence is defined as " 95 "-S 1 -S 2 -S 1 the events are split by 1:2:1 in this sequence order. If " 96 "quantiles are given as -Q 0.5 -Q 0.6, the first tree will contain 50% of " 97 "the second one 10% and the third one 40%. The corresponding seed value can " 98 "be set with --seed.\n" 82 "Columns can be renamed with --rename.new=old\n" 83 "\n" 84 << Tools::Splitting::usage() << 99 85 "\n" 100 86 "In case of success, 0 is returned, a value>0 otherwise.\n" … … 178 164 } 179 165 180 // ----------------------------- Setup splitting ---------------------------181 182 vector<uint16_t> split_seq = conf.Vec<uint16_t>("split-sequence");183 vector<double> split_quant = conf.Vec<double>("split-quantile");184 185 if (!split_seq.empty() && !split_quant.empty())186 throw runtime_error("Only splitting by --split-sequence or --split-quantile is allowed.");187 188 const size_t num_split = split_seq.size()+split_quant.size()==0 ? 0 :189 ::max(split_seq.size(), split_quant.size()+1);190 191 map<size_t, size_t> split_lut;192 for (size_t i=0; i<split_seq.size(); i++)193 {194 const size_t sz = split_lut.size();195 for (size_t j=0; j<split_seq[i]; j++)196 split_lut.emplace(j+sz, i);197 }198 199 for (size_t i=0; i<split_quant.size(); i++)200 if (split_quant[i]<0 || split_quant[i]>=1)201 throw runtime_error("Splitting quantiles must be in the range [0;1)");202 203 for (size_t i=1; i<split_quant.size(); i++)204 {205 if (split_quant[i]<=split_quant[i-1])206 throw runtime_error("Splitting quantiles must be in increasing order.");207 }208 209 166 // ------------------------------------------------------------------------- 210 167 211 const uniform_real_distribution<double> distribution(0,1); 212 mt19937_64 generator; 213 generator.seed(conf.Get<uint64_t>("seed")); 214 auto rndm = bind(distribution, generator); 215 216 // ------------------------------------------------------------------------- 168 /*const*/ Tools::Splitting split(conf); 217 169 218 170 if (verbose>0) … … 300 252 cout << "Opened root file '" << path << "'.\n"; 301 253 cout << "Writing to tree: " << tree << ".\n"; 254 split.print(); 302 255 } 303 256 … … 306 259 307 260 size_t entries = 0; 308 if ( num_split==0)261 if (split.empty()) 309 262 { 310 263 if (AddTree(ttree, tfile, tree, update, verbose)) … … 318 271 { 319 272 bool found = false; 320 for (size_t i=0; i< num_split; i++)273 for (size_t i=0; i<split.size(); i++) 321 274 found |= AddTree(ttree, tfile, tree+"["+to_string(i)+"]", update, verbose); 322 275 … … 328 281 } 329 282 283 const auto rename = conf.GetWildcardOptions("rename.*"); 284 330 285 vector<float> vec(numcol); 331 286 for (int i=0; i<numcol; i++) … … 333 288 string col = noheader ? Tools::Form("col%d", i) : title->At(i)->GetName(); 334 289 290 if (verbose>1) 291 cout << "Column: " << col; 292 335 293 boost::regex rexpr(":"); 336 294 col = boost::regex_replace(col, rexpr, ""); 337 295 338 296 if (verbose>1) 339 cout << "Column: " << col << '\n'; 297 cout << " -> " << col; 298 299 for (auto it=rename.cbegin(); it!=rename.cend(); it++) 300 { 301 if (col!=it->substr(7)) 302 continue; 303 304 col = conf.Get<string>(*it); 305 if (verbose>1) 306 cout << " -> " << col; 307 break; 308 } 309 if (verbose>1) 310 cout << endl; 340 311 341 312 for (auto it=ttree.begin(); it!=ttree.end(); it++) … … 361 332 if (buf.IsNull() || buf[0]=='#') 362 333 continue; 363 364 valid++;365 334 366 335 TObjArray *arr = buf.Tokenize(" "); … … 386 355 delete arr; 387 356 388 389 size_t index = 0; 390 if (!split_lut.empty()) 391 index = split_lut[line % split_lut.size()]; 392 if (!split_quant.empty()) 393 { 394 const float r = rndm(); 395 for (; r>=split_quant[index]; index++) 396 if (index==split_quant.size()) 397 break; 398 } 357 const size_t index = split.index(valid++); 399 358 400 359 // Fill only branches for which an adress was set -
trunk/FACT++/src/root2csv.cc
r19795 r19802 1 #include <random>2 3 1 #include <boost/regex.hpp> 4 2 #include <boost/filesystem.hpp> … … 7 5 #include "tools.h" 8 6 #include "Time.h" 9 #include " Configuration.h"7 #include "Splitting.h" 10 8 11 9 #include <TROOT.h> … … 65 63 ; 66 64 67 po::options_description split("Splitting options");68 split.add_options()69 ("split-sequence,S", vars<uint16_t>(), "Split data sequentially into several trees/files (e.g. 1, 1, 2)")70 ("split-quantile,Q", vars<double>(), "Split data randomly into several trees/files (e.g. 0.5, 1)")71 ("seed", var<uint64_t>(mt19937_64::default_seed), "Seed value in case of random split")72 ;73 74 65 po::options_description debug("Debug options"); 75 66 debug.add_options() … … 84 75 85 76 conf.AddOptions(control); 86 conf.AddOptions( split);77 conf.AddOptions(Tools::Splitting::options()); 87 78 conf.AddOptions(debug); 88 79 conf.SetArgumentPositions(p); … … 169 160 "and all negative values are considered 'fales' (discard the entry).\n" 170 161 "\n" 171 "For several purposes, it might be convenient to split the output to several " 172 "files. This can be achieved using the --split-sequence (-S) " 173 "and the --split-quantile (-Q) options. If a split sequence is defined as " 174 "-S 1 -S 2 -S 1 the events are split by 1:2:1 in this sequence order. If " 175 "quantiles are given as -Q 0.5 -Q 0.6, the first tree will contain 50% of " 176 "the second one 10% and the third one 40%. The corresponding seed value can " 177 "be set with --seed. Filenames are then created by adding an index after(!) " 178 "the extension, e.g. file.csv-0, file.csv-1, ...\n" 162 << Tools::Splitting::usage() << 179 163 "\n" 180 164 "In case of success, 0 is returned, a value>0 otherwise.\n" … … 403 387 const vector<Map> autoalias = conf.Vec<Map>("auto-alias"); 404 388 405 // ----------------------------- Setup splitting ---------------------------406 407 vector<uint16_t> split_seq = conf.Vec<uint16_t>("split-sequence");408 vector<double> split_quant = conf.Vec<double>("split-quantile");409 410 if (!split_seq.empty() && !split_quant.empty())411 throw runtime_error("Only splitting by --split-sequence or --split-quantile is allowed.");412 413 const size_t num_split = split_seq.size()+split_quant.size()==0 ? 0 :414 ::max(split_seq.size(), split_quant.size()+1);415 416 map<size_t, size_t> split_lut;417 for (size_t i=0; i<split_seq.size(); i++)418 {419 const size_t sz = split_lut.size();420 for (size_t j=0; j<split_seq[i]; j++)421 split_lut.emplace(j+sz, i);422 }423 424 for (size_t i=0; i<split_quant.size(); i++)425 if (split_quant[i]<0 || split_quant[i]>=1)426 throw runtime_error("Splitting quantiles must be in the range [0;1)");427 428 for (size_t i=1; i<split_quant.size(); i++)429 {430 if (split_quant[i]<=split_quant[i-1])431 throw runtime_error("Splitting quantiles must be in increasing order.");432 }433 434 389 // ------------------------------------------------------------------------- 435 390 436 const uniform_real_distribution<double> distribution(0,1); 437 mt19937_64 generator; 438 generator.seed(conf.Get<uint64_t>("seed")); 439 auto rndm = bind(distribution, generator); 440 441 // ------------------------------------------------------------------------- 391 /*const*/ Tools::Splitting split(conf); 442 392 443 393 if (verbose>0) … … 780 730 // ------------------------------------------------------------------------- 781 731 782 if (num_split) 783 { 784 cout << "\nSplitting configured " << (split_seq.empty()?"randomly":"in sequence") << " into " << num_split << " files." << endl; 785 if (!split_quant.empty()) 786 cout << "Seed value configured as " << conf.Get<uint64_t>("seed") << "." << endl; 732 if (verbose>0) 733 { 734 cout << '\n'; 735 split.print(); 787 736 } 788 737 … … 798 747 vector<ofstream> outfiles; 799 748 800 if ( num_split==0)749 if (split,empty()) 801 750 { 802 751 TString path(out.c_str()); … … 811 760 else 812 761 { 813 for (size_t i=0; i< num_split; i++)762 for (size_t i=0; i<split.size(); i++) 814 763 { 815 764 TString path(out.c_str()); … … 828 777 // ---------------------------- Write Body -------------------------------- 829 778 size_t count = 0; 830 vector<size_t> ncount( num_split?num_split:1);779 vector<size_t> ncount(split.empty()?1:split.size()); 831 780 832 781 auto itree = c.GetTreeNumber(); … … 848 797 continue; 849 798 850 size_t index = 0; 851 if (!split_lut.empty()) 852 index = split_lut[count % split_lut.size()]; 853 if (!split_quant.empty()) 854 { 855 const float r = rndm(); 856 for (; r>=split_quant[index]; index++) 857 if (index==split_quant.size()) 858 break; 859 } 799 const size_t index = split.index(count++); 800 ncount[index]++; 860 801 861 802 vector<string> join; … … 875 816 876 817 outfiles[index] << boost::join(join, " ") << "\n"; 877 878 count ++;879 ncount[index] ++;880 818 } 881 819 -
trunk/FACT++/src/rootifysql.cc
r19482 r19802 1 1 #include "Database.h" 2 3 #include <random>4 2 5 3 #include <boost/regex.hpp> … … 10 8 #include "tools.h" 11 9 #include "Time.h" 12 #include " Configuration.h"10 #include "Splitting.h" 13 11 14 12 #include <TROOT.h> … … 62 60 ; 63 61 64 po::options_description split("Splitting options");65 split.add_options()66 ("split-sequence,S", vars<uint16_t>(), "Split data sequentially into several trees/files (e.g. 1, 1, 2)")67 ("split-quantile,Q", vars<double>(), "Split data randomly into several trees/files (e.g. 0.5, 1)")68 ("seed", var<uint64_t>(mt19937_64::default_seed), "Seed value in case of random split")69 ;70 71 62 po::positional_options_description p; 72 63 p.add("file", 1); // The 1st positional options (n=1) … … 76 67 conf.AddOptions(ascii); 77 68 conf.AddOptions(root); 78 conf.AddOptions( split);69 conf.AddOptions(Tools::Splitting::options()); 79 70 conf.SetArgumentPositions(p); 80 71 } … … 125 116 "/*comment*/ or introduced with # (shell script style) or -- (SQL style).\n" 126 117 "\n" 127 "For several purposes, it might be convenient to split the output to several " 128 "different root-trees or ascii files. This can be done using the --split-sequence (-S) " 129 "and the --split-quantile (-Q) options. If a split sequence is defined as " 130 "-S 1 -S 2 -S 1 the events are split by 1:2:1 in this sequence order. If " 131 "quantiled are given as -Q 0.5 -Q 0.6, the first tree will contain 50% of " 132 "the second one 10% and the third one 40%. The corresponding seed value can " 133 "be set with --seed.\n" 118 << Tools::Splitting::usage() << 134 119 "\n" 135 120 "In case of success, 0 is returned, a value>0 otherwise.\n" … … 522 507 // ----------------------- Setup splitting --------------------------------- 523 508 524 vector<uint16_t> split_seq = conf.Vec<uint16_t>("split-sequence");525 vector<double> split_quant = conf.Vec<double>("split-quantile");526 527 if (!split_seq.empty() && !split_quant.empty())528 throw runtime_error("Only splitting by --split-sequence or --split-quantile is allowed.");529 530 const size_t num_split = split_seq.size()+split_quant.size()==0 ? 0 :531 ::max(split_seq.size(), split_quant.size()+1);532 533 map<size_t, size_t> split_lut;534 for (size_t i=0; i<split_seq.size(); i++)535 {536 const size_t sz = split_lut.size();537 for (size_t j=0; j<split_seq[i]; j++)538 split_lut.emplace(j+sz, i);539 }540 541 for (size_t i=0; i<split_quant.size(); i++)542 if (split_quant[i]<0 || split_quant[i]>=1)543 throw runtime_error("Splitting quantiles must be in the range [0;1)");544 545 for (size_t i=1; i<split_quant.size(); i++)546 {547 if (split_quant[i]<=split_quant[i-1])548 throw runtime_error("Splitting quantiles must be in increasing order.");549 }550 551 // -------------------------------------------------------------------------552 553 509 const auto vars = conf.GetWildcardOptions("var.*"); 554 510 … … 558 514 559 515 // ------------------------------------------------------------------------- 516 517 /*const*/ Tools::Splitting split(conf); 560 518 561 519 if (verbose>0) … … 826 784 cout << "Opening file '" << path << "' [compression=" << compression << "]...\n"; 827 785 cout << "Writing data to tree '" << tree << "'" << (nofill?" (--skipped--)":"") << endl; 828 if (num_split) 829 { 830 cout << "Splitting configured " << (split_seq.empty()?"randomly":"in sequence") << " into " << num_split << " branches." << endl; 831 if (!split_quant.empty()) 832 cout << "Seed value configured as " << conf.Get<uint64_t>("seed") << "." << endl; 833 } 786 split.print(); 834 787 } 835 788 … … 875 828 vector<TTree*> ttree; 876 829 877 if ( num_split==0)830 if (split.empty()) 878 831 ttree.emplace_back(new TTree(tree.c_str(), query.c_str())); 879 832 else 880 for (size_t i=0; i< num_split; i++)833 for (size_t i=0; i<split.size(); i++) 881 834 ttree.emplace_back(new TTree((tree+"["+to_string(i)+"]").c_str(), query.c_str())); 882 835 … … 955 908 { 956 909 vector<string> names; 957 if ( num_split==0)910 if (split.empty()) 958 911 names.emplace_back(write); 959 912 else 960 for (size_t i=0; i< num_split; i++)913 for (size_t i=0; i<split.size(); i++) 961 914 names.emplace_back(write+"-"+to_string(i)); 962 915 … … 1017 970 // ---------------------- Fill TTree with DB data -------------------------- 1018 971 1019 const uniform_real_distribution<double> distribution(0,1);1020 mt19937_64 generator;1021 generator.seed(conf.Get<uint64_t>("seed"));1022 auto rndm = bind(distribution, generator);1023 1024 972 size_t count = 0; 1025 973 size_t skip = 0; 1026 974 do 1027 975 { 1028 size_t index = 0; 1029 if (!split_lut.empty()) 1030 index = split_lut[count % split_lut.size()]; 1031 if (!split_quant.empty()) 1032 { 1033 const float r = rndm(); 1034 for (; r>=split_quant[index]; index++) 1035 if (index==split_quant.size()) 1036 break; 1037 } 1038 1039 count++; 976 size_t index = split.index(count++); 1040 977 1041 978 ostringstream rtxt;
Note:
See TracChangeset
for help on using the changeset viewer.