Ignore:
Timestamp:
10/27/19 11:11:01 (5 years ago)
Author:
tbretz
Message:
Moved the shared splitting algorithm into a newly added common class 'Splitting', and added the possibility to rename columns (csv2root).
File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/FACT++/src/rootifysql.cc

    r19482 r19802  
    11#include "Database.h"
    2 
    3 #include <random>
    42
    53#include <boost/regex.hpp>
     
    108#include "tools.h"
    119#include "Time.h"
    12 #include "Configuration.h"
     10#include "Splitting.h"
    1311
    1412#include <TROOT.h>
     
    6260        ;
    6361
    64     po::options_description split("Splitting options");
    65     split.add_options()
    66         ("split-sequence,S", vars<uint16_t>(),            "Split data sequentially into several trees/files (e.g. 1, 1, 2)")
    67         ("split-quantile,Q", vars<double>(),              "Split data randomly into several trees/files (e.g. 0.5, 1)")
    68         ("seed", var<uint64_t>(mt19937_64::default_seed), "Seed value in case of random split")
    69         ;
    70 
    7162    po::positional_options_description p;
    7263    p.add("file", 1); // The 1st positional options (n=1)
     
    7667    conf.AddOptions(ascii);
    7768    conf.AddOptions(root);
    78     conf.AddOptions(split);
     69    conf.AddOptions(Tools::Splitting::options());
    7970    conf.SetArgumentPositions(p);
    8071}
     
    125116        "/*comment*/ or introduced with # (shell script style) or -- (SQL style).\n"
    126117        "\n"
    127         "For several purposes, it might be convenient to split the output to several "
    128         "different root-trees or ascii files. This can be done using the --split-sequence (-S) "
    129         "and the --split-quantile (-Q) options. If a split sequence is defined as "
    130         "-S 1 -S 2 -S 1 the events are split by 1:2:1 in this sequence order. If "
    131         "quantiled are given as -Q 0.5 -Q 0.6, the first tree will contain 50% of "
    132         "the second one 10% and the third one 40%. The corresponding seed value can "
    133         "be set with --seed.\n"
     118        << Tools::Splitting::usage() <<
    134119        "\n"
    135120        "In case of success, 0 is returned, a value>0 otherwise.\n"
     
    522507    // ----------------------- Setup splitting ---------------------------------
    523508
    524     vector<uint16_t> split_seq   = conf.Vec<uint16_t>("split-sequence");
    525     vector<double>   split_quant = conf.Vec<double>("split-quantile");
    526 
    527     if (!split_seq.empty() && !split_quant.empty())
    528         throw runtime_error("Only splitting by --split-sequence or --split-quantile is allowed.");
    529 
    530     const size_t num_split = split_seq.size()+split_quant.size()==0 ? 0 :
    531         ::max(split_seq.size(), split_quant.size()+1);
    532 
    533     map<size_t, size_t> split_lut;
    534     for (size_t i=0; i<split_seq.size(); i++)
    535     {
    536         const size_t sz = split_lut.size();
    537         for (size_t j=0; j<split_seq[i]; j++)
    538             split_lut.emplace(j+sz, i);
    539     }
    540 
    541     for (size_t i=0; i<split_quant.size(); i++)
    542         if (split_quant[i]<0 || split_quant[i]>=1)
    543             throw runtime_error("Splitting quantiles must be in the range [0;1)");
    544 
    545     for (size_t i=1; i<split_quant.size(); i++)
    546     {
    547         if (split_quant[i]<=split_quant[i-1])
    548             throw runtime_error("Splitting quantiles must be in increasing order.");
    549     }
    550 
    551     // -------------------------------------------------------------------------
    552 
    553509    const auto vars = conf.GetWildcardOptions("var.*");
    554510
     
    558514
    559515    // -------------------------------------------------------------------------
     516
     517    /*const*/ Tools::Splitting split(conf);
    560518
    561519    if (verbose>0)
     
    826784        cout << "Opening file '" << path << "' [compression=" << compression << "]...\n";
    827785        cout << "Writing data to tree '" << tree << "'" << (nofill?" (--skipped--)":"") << endl;
    828         if (num_split)
    829         {
    830             cout << "Splitting configured " << (split_seq.empty()?"randomly":"in sequence") << " into " << num_split << " branches." << endl;
    831             if (!split_quant.empty())
    832                 cout << "Seed value configured as " << conf.Get<uint64_t>("seed") << "." << endl;
    833         }
     786        split.print();
    834787    }
    835788
     
    875828    vector<TTree*> ttree;
    876829
    877     if (num_split==0)
     830    if (split.empty())
    878831        ttree.emplace_back(new TTree(tree.c_str(), query.c_str()));
    879832    else
    880         for (size_t i=0; i<num_split; i++)
     833        for (size_t i=0; i<split.size(); i++)
    881834            ttree.emplace_back(new TTree((tree+"["+to_string(i)+"]").c_str(), query.c_str()));
    882835
     
    955908    {
    956909        vector<string> names;
    957         if (num_split==0)
     910        if (split.empty())
    958911            names.emplace_back(write);
    959912        else
    960             for (size_t i=0; i<num_split; i++)
     913            for (size_t i=0; i<split.size(); i++)
    961914                names.emplace_back(write+"-"+to_string(i));
    962915
     
    1017970    // ---------------------- Fill TTree with DB data --------------------------
    1018971
    1019     const uniform_real_distribution<double> distribution(0,1);
    1020     mt19937_64 generator;
    1021     generator.seed(conf.Get<uint64_t>("seed"));
    1022     auto rndm = bind(distribution, generator);
    1023 
    1024972    size_t count = 0;
    1025973    size_t skip  = 0;
    1026974    do
    1027975    {
    1028         size_t index = 0;
    1029         if (!split_lut.empty())
    1030             index = split_lut[count % split_lut.size()];
    1031         if (!split_quant.empty())
    1032         {
    1033             const float r = rndm();
    1034             for (; r>=split_quant[index]; index++)
    1035                 if (index==split_quant.size())
    1036                     break;
    1037         }
    1038 
    1039         count++;
     976        size_t index = split.index(count++);
    1040977
    1041978        ostringstream rtxt;
Note: See TracChangeset for help on using the changeset viewer.