source: trunk/FACT++/src/Splitting.h@ 19802

Last change on this file since 19802 was 19802, checked in by tbretz, 5 years ago
Moved the common Splitting algorithm to a common class 'Splitting' (added) and added the possibility to rename columns (csv2root)
File size: 4.0 KB
Line 
1#ifndef FACT_Splitting
2#define FACT_Splitting
3
4#include <map>
5#include <vector>
6#include <random>
7#include <stdexcept>
8#include <algorithm>
9
10#include "Configuration.h"
11
12namespace Tools
13{
14 class Splitting
15 {
16 std::uniform_real_distribution<double> distribution;
17 std::mt19937_64 generator;
18 uint64_t seed;
19
20 std::vector<uint16_t> seq;
21 std::vector<double> quant;
22
23 size_t num;
24
25 std::map<size_t, size_t> lut;
26
27 public:
28 static const po::options_description &options()
29 {
30 static po::options_description split("Splitting options");
31 if (split.find_nothrow("seed", false))
32 return split;
33
34 split.add_options()
35 ("split-sequence,S", vars<uint16_t>(), "Split data sequentially into several trees/files (e.g. 1, 1, 2)")
36 ("split-quantile,Q", vars<double>(), "Split data randomly into several trees/files (e.g. 0.5, 1)")
37 ("seed", var<uint64_t>(std::mt19937_64::default_seed), "Seed value in case of random split")
38 ;
39
40 return split;
41 }
42
43 static const char *usage()
44 {
45 return
46 "For several purposes, it might be convenient to split the output to several "
47 "different root-treess. This can be done using the --split-sequence (-S) "
48 "and the --split-quantile (-Q) options. If a split sequence is defined as "
49 "-S 1 -S 2 -S 1 the events are split by 1:2:1 in this sequence order. If "
50 "quantiles are given as -Q 0.5 -Q 0.6, the first tree will contain 50% of "
51 "the second one 10% and the third one 40%. The corresponding seed value can "
52 "be set with --seed.\n";
53 }
54
55 Splitting(Configuration &conf) : distribution(0, 1)
56 {
57 seq = conf.Vec<uint16_t>("split-sequence");
58 quant = conf.Vec<double>("split-quantile");
59
60 if (!seq.empty() && !quant.empty())
61 throw std::runtime_error("Only splitting by --split-sequence or --split-quantile is allowed.");
62
63 num = seq.size()+quant.size()==0 ? 0 : std::max(seq.size(), quant.size()+1);
64
65 for (size_t i=0; i<seq.size(); i++)
66 {
67 const size_t sz = lut.size();
68 for (size_t j=0; j<seq[i]; j++)
69 lut.emplace(j+sz, i);
70 }
71
72 for (size_t i=0; i<quant.size(); i++)
73 if (quant[i]<0 || quant[i]>=1)
74 throw std::runtime_error("Splitting quantiles must be in the range [0;1)");
75
76 for (size_t i=1; i<quant.size(); i++)
77 {
78 if (quant[i]<=quant[i-1])
79 throw std::runtime_error("Splitting quantiles must be in increasing order.");
80 }
81
82 seed = conf.Get<uint64_t>("seed");
83 generator.seed(seed);
84
85 //auto rndm = std::bind(distribution, generator);
86 //(bind(&StateMachineFTM::ResetConfig, this))
87 }
88
89 size_t index(const size_t &count) /*const*/
90 {
91 size_t index = 0;
92 if (!lut.empty())
93 index = lut.find(count % lut.size())->second;
94
95 if (quant.empty())
96 return index;
97
98 const double rndm = distribution(generator);
99 for (; rndm>=quant[index]; index++)
100 if (index==quant.size())
101 return index;
102
103 return index;
104 }
105
106 void print()
107 {
108 if (!num)
109 return;
110
111 std::cout << "Splitting configured " << (seq.empty()?"randomly":"in sequence") << " into " << num << " branches.";
112 if (!quant.empty())
113 std::cout << "\nSeed value configured as " << seed << ".";
114 std::cout << std::endl;
115 }
116
117 const size_t &size() const
118 {
119 return num;
120 }
121
122 const bool empty() const
123 {
124 return num==0;
125 }
126 };
127};
128#endif
Note: See TracBrowser for help on using the repository browser.