1 | #ifndef FACT_Splitting
|
---|
2 | #define FACT_Splitting
|
---|
3 |
|
---|
4 | #include <map>
|
---|
5 | #include <vector>
|
---|
6 | #include <random>
|
---|
7 | #include <stdexcept>
|
---|
8 | #include <algorithm>
|
---|
9 |
|
---|
10 | #include "Configuration.h"
|
---|
11 |
|
---|
12 | namespace Tools
|
---|
13 | {
|
---|
14 | class Splitting
|
---|
15 | {
|
---|
16 | std::uniform_real_distribution<double> distribution;
|
---|
17 | std::mt19937_64 generator;
|
---|
18 | uint64_t seed;
|
---|
19 |
|
---|
20 | std::vector<uint16_t> seq;
|
---|
21 | std::vector<double> quant;
|
---|
22 |
|
---|
23 | size_t num;
|
---|
24 |
|
---|
25 | std::map<size_t, size_t> lut;
|
---|
26 |
|
---|
27 | public:
|
---|
28 | static const po::options_description &options()
|
---|
29 | {
|
---|
30 | static po::options_description split("Splitting options");
|
---|
31 | if (split.find_nothrow("seed", false))
|
---|
32 | return split;
|
---|
33 |
|
---|
34 | split.add_options()
|
---|
35 | ("split-sequence,S", vars<uint16_t>(), "Split data sequentially into several trees/files (e.g. 1, 1, 2)")
|
---|
36 | ("split-quantile,Q", vars<double>(), "Split data randomly into several trees/files (e.g. 0.5, 1)")
|
---|
37 | ("seed", var<uint64_t>(std::mt19937_64::default_seed), "Seed value in case of random split")
|
---|
38 | ;
|
---|
39 |
|
---|
40 | return split;
|
---|
41 | }
|
---|
42 |
|
---|
43 | static const char *usage()
|
---|
44 | {
|
---|
45 | return
|
---|
46 | "For several purposes, it might be convenient to split the output to several "
|
---|
47 | "different root-treess. This can be done using the --split-sequence (-S) "
|
---|
48 | "and the --split-quantile (-Q) options. If a split sequence is defined as "
|
---|
49 | "-S 1 -S 2 -S 1 the events are split by 1:2:1 in this sequence order. If "
|
---|
50 | "quantiles are given as -Q 0.5 -Q 0.6, the first tree will contain 50% of "
|
---|
51 | "the second one 10% and the third one 40%. The corresponding seed value can "
|
---|
52 | "be set with --seed.\n";
|
---|
53 | }
|
---|
54 |
|
---|
55 | Splitting(Configuration &conf) : distribution(0, 1)
|
---|
56 | {
|
---|
57 | seq = conf.Vec<uint16_t>("split-sequence");
|
---|
58 | quant = conf.Vec<double>("split-quantile");
|
---|
59 |
|
---|
60 | if (!seq.empty() && !quant.empty())
|
---|
61 | throw std::runtime_error("Only splitting by --split-sequence or --split-quantile is allowed.");
|
---|
62 |
|
---|
63 | num = seq.size()+quant.size()==0 ? 0 : std::max(seq.size(), quant.size()+1);
|
---|
64 |
|
---|
65 | for (size_t i=0; i<seq.size(); i++)
|
---|
66 | {
|
---|
67 | const size_t sz = lut.size();
|
---|
68 | for (size_t j=0; j<seq[i]; j++)
|
---|
69 | lut.emplace(j+sz, i);
|
---|
70 | }
|
---|
71 |
|
---|
72 | for (size_t i=0; i<quant.size(); i++)
|
---|
73 | if (quant[i]<0 || quant[i]>=1)
|
---|
74 | throw std::runtime_error("Splitting quantiles must be in the range [0;1)");
|
---|
75 |
|
---|
76 | for (size_t i=1; i<quant.size(); i++)
|
---|
77 | {
|
---|
78 | if (quant[i]<=quant[i-1])
|
---|
79 | throw std::runtime_error("Splitting quantiles must be in increasing order.");
|
---|
80 | }
|
---|
81 |
|
---|
82 | seed = conf.Get<uint64_t>("seed");
|
---|
83 | generator.seed(seed);
|
---|
84 |
|
---|
85 | //auto rndm = std::bind(distribution, generator);
|
---|
86 | //(bind(&StateMachineFTM::ResetConfig, this))
|
---|
87 | }
|
---|
88 |
|
---|
89 | size_t index(const size_t &count) /*const*/
|
---|
90 | {
|
---|
91 | size_t index = 0;
|
---|
92 | if (!lut.empty())
|
---|
93 | index = lut.find(count % lut.size())->second;
|
---|
94 |
|
---|
95 | if (quant.empty())
|
---|
96 | return index;
|
---|
97 |
|
---|
98 | const double rndm = distribution(generator);
|
---|
99 | for (; rndm>=quant[index]; index++)
|
---|
100 | if (index==quant.size())
|
---|
101 | return index;
|
---|
102 |
|
---|
103 | return index;
|
---|
104 | }
|
---|
105 |
|
---|
106 | void print()
|
---|
107 | {
|
---|
108 | if (!num)
|
---|
109 | return;
|
---|
110 |
|
---|
111 | std::cout << "Splitting configured " << (seq.empty()?"randomly":"in sequence") << " into " << num << " branches.";
|
---|
112 | if (!quant.empty())
|
---|
113 | std::cout << "\nSeed value configured as " << seed << ".";
|
---|
114 | std::cout << std::endl;
|
---|
115 | }
|
---|
116 |
|
---|
117 | const size_t &size() const
|
---|
118 | {
|
---|
119 | return num;
|
---|
120 | }
|
---|
121 |
|
---|
122 | const bool empty() const
|
---|
123 | {
|
---|
124 | return num==0;
|
---|
125 | }
|
---|
126 | };
|
---|
127 | };
|
---|
128 | #endif
|
---|