| 1 | #ifndef FACT_Splitting
|
|---|
| 2 | #define FACT_Splitting
|
|---|
| 3 |
|
|---|
| 4 | #include <map>
|
|---|
| 5 | #include <vector>
|
|---|
| 6 | #include <random>
|
|---|
| 7 | #include <stdexcept>
|
|---|
| 8 | #include <algorithm>
|
|---|
| 9 |
|
|---|
| 10 | #include "Configuration.h"
|
|---|
| 11 |
|
|---|
| 12 | namespace Tools
|
|---|
| 13 | {
|
|---|
| 14 | class Splitting
|
|---|
| 15 | {
|
|---|
| 16 | std::uniform_real_distribution<double> distribution;
|
|---|
| 17 | std::mt19937_64 generator;
|
|---|
| 18 | uint64_t seed;
|
|---|
| 19 |
|
|---|
| 20 | std::vector<uint16_t> seq;
|
|---|
| 21 | std::vector<double> quant;
|
|---|
| 22 |
|
|---|
| 23 | size_t num;
|
|---|
| 24 |
|
|---|
| 25 | std::map<size_t, size_t> lut;
|
|---|
| 26 |
|
|---|
| 27 | public:
|
|---|
| 28 | static const po::options_description &options()
|
|---|
| 29 | {
|
|---|
| 30 | static po::options_description split("Splitting options");
|
|---|
| 31 | if (split.find_nothrow("seed", false))
|
|---|
| 32 | return split;
|
|---|
| 33 |
|
|---|
| 34 | split.add_options()
|
|---|
| 35 | ("split-sequence,S", vars<uint16_t>(), "Split data sequentially into several trees/files (e.g. 1, 1, 2)")
|
|---|
| 36 | ("split-quantile,Q", vars<double>(), "Split data randomly into several trees/files (e.g. 0.5, 1)")
|
|---|
| 37 | ("seed", var<uint64_t>(std::mt19937_64::default_seed), "Seed value in case of random split")
|
|---|
| 38 | ;
|
|---|
| 39 |
|
|---|
| 40 | return split;
|
|---|
| 41 | }
|
|---|
| 42 |
|
|---|
| 43 | static const char *usage()
|
|---|
| 44 | {
|
|---|
| 45 | return
|
|---|
| 46 | "For several purposes, it might be convenient to split the output to several "
|
|---|
| 47 | "different root-treess. This can be done using the --split-sequence (-S) "
|
|---|
| 48 | "and the --split-quantile (-Q) options. If a split sequence is defined as "
|
|---|
| 49 | "-S 1 -S 2 -S 1 the events are split by 1:2:1 in this sequence order. If "
|
|---|
| 50 | "quantiles are given as -Q 0.5 -Q 0.6, the first tree will contain 50% of "
|
|---|
| 51 | "the second one 10% and the third one 40%. The corresponding seed value can "
|
|---|
| 52 | "be set with --seed.\n";
|
|---|
| 53 | }
|
|---|
| 54 |
|
|---|
| 55 | Splitting(Configuration &conf) : distribution(0, 1)
|
|---|
| 56 | {
|
|---|
| 57 | seq = conf.Vec<uint16_t>("split-sequence");
|
|---|
| 58 | quant = conf.Vec<double>("split-quantile");
|
|---|
| 59 |
|
|---|
| 60 | if (!seq.empty() && !quant.empty())
|
|---|
| 61 | throw std::runtime_error("Only splitting by --split-sequence or --split-quantile is allowed.");
|
|---|
| 62 |
|
|---|
| 63 | num = seq.size()+quant.size()==0 ? 0 : std::max(seq.size(), quant.size()+1);
|
|---|
| 64 |
|
|---|
| 65 | for (size_t i=0; i<seq.size(); i++)
|
|---|
| 66 | {
|
|---|
| 67 | const size_t sz = lut.size();
|
|---|
| 68 | for (size_t j=0; j<seq[i]; j++)
|
|---|
| 69 | lut.emplace(j+sz, i);
|
|---|
| 70 | }
|
|---|
| 71 |
|
|---|
| 72 | for (size_t i=0; i<quant.size(); i++)
|
|---|
| 73 | if (quant[i]<0 || quant[i]>=1)
|
|---|
| 74 | throw std::runtime_error("Splitting quantiles must be in the range [0;1)");
|
|---|
| 75 |
|
|---|
| 76 | for (size_t i=1; i<quant.size(); i++)
|
|---|
| 77 | {
|
|---|
| 78 | if (quant[i]<=quant[i-1])
|
|---|
| 79 | throw std::runtime_error("Splitting quantiles must be in increasing order.");
|
|---|
| 80 | }
|
|---|
| 81 |
|
|---|
| 82 | seed = conf.Get<uint64_t>("seed");
|
|---|
| 83 | generator.seed(seed);
|
|---|
| 84 |
|
|---|
| 85 | //auto rndm = std::bind(distribution, generator);
|
|---|
| 86 | //(bind(&StateMachineFTM::ResetConfig, this))
|
|---|
| 87 | }
|
|---|
| 88 |
|
|---|
| 89 | size_t index(const size_t &count) /*const*/
|
|---|
| 90 | {
|
|---|
| 91 | size_t index = 0;
|
|---|
| 92 | if (!lut.empty())
|
|---|
| 93 | index = lut.find(count % lut.size())->second;
|
|---|
| 94 |
|
|---|
| 95 | if (quant.empty())
|
|---|
| 96 | return index;
|
|---|
| 97 |
|
|---|
| 98 | const double rndm = distribution(generator);
|
|---|
| 99 | for (; rndm>=quant[index]; index++)
|
|---|
| 100 | if (index==quant.size())
|
|---|
| 101 | return index;
|
|---|
| 102 |
|
|---|
| 103 | return index;
|
|---|
| 104 | }
|
|---|
| 105 |
|
|---|
| 106 | void print()
|
|---|
| 107 | {
|
|---|
| 108 | if (!num)
|
|---|
| 109 | return;
|
|---|
| 110 |
|
|---|
| 111 | std::cout << "Splitting configured " << (seq.empty()?"randomly":"in sequence") << " into " << num << " branches.";
|
|---|
| 112 | if (!quant.empty())
|
|---|
| 113 | std::cout << "\nSeed value configured as " << seed << ".";
|
|---|
| 114 | std::cout << std::endl;
|
|---|
| 115 | }
|
|---|
| 116 |
|
|---|
| 117 | const size_t &size() const
|
|---|
| 118 | {
|
|---|
| 119 | return num;
|
|---|
| 120 | }
|
|---|
| 121 |
|
|---|
| 122 | const bool empty() const
|
|---|
| 123 | {
|
|---|
| 124 | return num==0;
|
|---|
| 125 | }
|
|---|
| 126 | };
|
|---|
| 127 | };
|
|---|
| 128 | #endif
|
|---|