Changeset 19802 for trunk/FACT++/src/root2csv.cc
- Timestamp: 10/27/19 11:11:01 (5 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/FACT++/src/root2csv.cc
r19795 r19802 1 #include <random>2 3 1 #include <boost/regex.hpp> 4 2 #include <boost/filesystem.hpp> … … 7 5 #include "tools.h" 8 6 #include "Time.h" 9 #include " Configuration.h"7 #include "Splitting.h" 10 8 11 9 #include <TROOT.h> … … 65 63 ; 66 64 67 po::options_description split("Splitting options");68 split.add_options()69 ("split-sequence,S", vars<uint16_t>(), "Split data sequentially into several trees/files (e.g. 1, 1, 2)")70 ("split-quantile,Q", vars<double>(), "Split data randomly into several trees/files (e.g. 0.5, 1)")71 ("seed", var<uint64_t>(mt19937_64::default_seed), "Seed value in case of random split")72 ;73 74 65 po::options_description debug("Debug options"); 75 66 debug.add_options() … … 84 75 85 76 conf.AddOptions(control); 86 conf.AddOptions( split);77 conf.AddOptions(Tools::Splitting::options()); 87 78 conf.AddOptions(debug); 88 79 conf.SetArgumentPositions(p); … … 169 160 "and all negative values are considered 'fales' (discard the entry).\n" 170 161 "\n" 171 "For several purposes, it might be convenient to split the output to several " 172 "files. This can be achieved using the --split-sequence (-S) " 173 "and the --split-quantile (-Q) options. If a split sequence is defined as " 174 "-S 1 -S 2 -S 1 the events are split by 1:2:1 in this sequence order. If " 175 "quantiles are given as -Q 0.5 -Q 0.6, the first tree will contain 50% of " 176 "the second one 10% and the third one 40%. The corresponding seed value can " 177 "be set with --seed. Filenames are then created by adding an index after(!) " 178 "the extension, e.g. 
file.csv-0, file.csv-1, ...\n" 162 << Tools::Splitting::usage() << 179 163 "\n" 180 164 "In case of success, 0 is returned, a value>0 otherwise.\n" … … 403 387 const vector<Map> autoalias = conf.Vec<Map>("auto-alias"); 404 388 405 // ----------------------------- Setup splitting ---------------------------406 407 vector<uint16_t> split_seq = conf.Vec<uint16_t>("split-sequence");408 vector<double> split_quant = conf.Vec<double>("split-quantile");409 410 if (!split_seq.empty() && !split_quant.empty())411 throw runtime_error("Only splitting by --split-sequence or --split-quantile is allowed.");412 413 const size_t num_split = split_seq.size()+split_quant.size()==0 ? 0 :414 ::max(split_seq.size(), split_quant.size()+1);415 416 map<size_t, size_t> split_lut;417 for (size_t i=0; i<split_seq.size(); i++)418 {419 const size_t sz = split_lut.size();420 for (size_t j=0; j<split_seq[i]; j++)421 split_lut.emplace(j+sz, i);422 }423 424 for (size_t i=0; i<split_quant.size(); i++)425 if (split_quant[i]<0 || split_quant[i]>=1)426 throw runtime_error("Splitting quantiles must be in the range [0;1)");427 428 for (size_t i=1; i<split_quant.size(); i++)429 {430 if (split_quant[i]<=split_quant[i-1])431 throw runtime_error("Splitting quantiles must be in increasing order.");432 }433 434 389 // ------------------------------------------------------------------------- 435 390 436 const uniform_real_distribution<double> distribution(0,1); 437 mt19937_64 generator; 438 generator.seed(conf.Get<uint64_t>("seed")); 439 auto rndm = bind(distribution, generator); 440 441 // ------------------------------------------------------------------------- 391 /*const*/ Tools::Splitting split(conf); 442 392 443 393 if (verbose>0) … … 780 730 // ------------------------------------------------------------------------- 781 731 782 if (num_split) 783 { 784 cout << "\nSplitting configured " << (split_seq.empty()?"randomly":"in sequence") << " into " << num_split << " files." 
<< endl; 785 if (!split_quant.empty()) 786 cout << "Seed value configured as " << conf.Get<uint64_t>("seed") << "." << endl; 732 if (verbose>0) 733 { 734 cout << '\n'; 735 split.print(); 787 736 } 788 737 … … 798 747 vector<ofstream> outfiles; 799 748 800 if ( num_split==0)749 if (split,empty()) 801 750 { 802 751 TString path(out.c_str()); … … 811 760 else 812 761 { 813 for (size_t i=0; i< num_split; i++)762 for (size_t i=0; i<split.size(); i++) 814 763 { 815 764 TString path(out.c_str()); … … 828 777 // ---------------------------- Write Body -------------------------------- 829 778 size_t count = 0; 830 vector<size_t> ncount( num_split?num_split:1);779 vector<size_t> ncount(split.empty()?1:split.size()); 831 780 832 781 auto itree = c.GetTreeNumber(); … … 848 797 continue; 849 798 850 size_t index = 0; 851 if (!split_lut.empty()) 852 index = split_lut[count % split_lut.size()]; 853 if (!split_quant.empty()) 854 { 855 const float r = rndm(); 856 for (; r>=split_quant[index]; index++) 857 if (index==split_quant.size()) 858 break; 859 } 799 const size_t index = split.index(count++); 800 ncount[index]++; 860 801 861 802 vector<string> join; … … 875 816 876 817 outfiles[index] << boost::join(join, " ") << "\n"; 877 878 count ++;879 ncount[index] ++;880 818 } 881 819
Note: See TracChangeset for help on using the changeset viewer.