Ignore:
Timestamp:
10/27/19 11:11:01 (5 years ago)
Author:
tbretz
Message:
Moved the common Splitting algorithm to a common class 'Splitting' (added) and added the possibility to rename columns (csv2root)
File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/FACT++/src/root2csv.cc

    r19795 r19802  
    1 #include <random>
    2 
    31#include <boost/regex.hpp>
    42#include <boost/filesystem.hpp>
     
    75#include "tools.h"
    86#include "Time.h"
    9 #include "Configuration.h"
     7#include "Splitting.h"
    108
    119#include <TROOT.h>
     
    6563        ;
    6664
    67     po::options_description split("Splitting options");
    68     split.add_options()
    69         ("split-sequence,S", vars<uint16_t>(),            "Split data sequentially into several trees/files (e.g. 1, 1, 2)")
    70         ("split-quantile,Q", vars<double>(),              "Split data randomly into several trees/files (e.g. 0.5, 1)")
    71         ("seed", var<uint64_t>(mt19937_64::default_seed), "Seed value in case of random split")
    72         ;
    73 
    7465    po::options_description debug("Debug options");
    7566    debug.add_options()
     
    8475
    8576    conf.AddOptions(control);
    86     conf.AddOptions(split);
     77    conf.AddOptions(Tools::Splitting::options());
    8778    conf.AddOptions(debug);
    8879    conf.SetArgumentPositions(p);
     
    169160        "and all negative values are considered 'false' (discard the entry).\n"
    170161        "\n"
    171         "For several purposes, it might be convenient to split the output to several "
    172         "files. This can be achieved using the --split-sequence (-S) "
    173         "and the --split-quantile (-Q) options. If a split sequence is defined as "
    174         "-S 1 -S 2 -S 1 the events are split by 1:2:1 in this sequence order. If "
    175         "quantiles are given as -Q 0.5 -Q 0.6, the first tree will contain 50% of "
    176         "the second one 10% and the third one 40%. The corresponding seed value can "
    177         "be set with --seed. Filenames are then created by adding an index after(!) "
    178         "the extension, e.g. file.csv-0, file.csv-1, ...\n"
     162        << Tools::Splitting::usage() <<
    179163        "\n"
    180164        "In case of success, 0 is returned, a value>0 otherwise.\n"
     
    403387    const vector<Map> autoalias  = conf.Vec<Map>("auto-alias");
    404388
    405     // ----------------------------- Setup splitting ---------------------------
    406 
    407     vector<uint16_t> split_seq   = conf.Vec<uint16_t>("split-sequence");
    408     vector<double>   split_quant = conf.Vec<double>("split-quantile");
    409 
    410     if (!split_seq.empty() && !split_quant.empty())
    411         throw runtime_error("Only splitting by --split-sequence or --split-quantile is allowed.");
    412 
    413     const size_t num_split = split_seq.size()+split_quant.size()==0 ? 0 :
    414         ::max(split_seq.size(), split_quant.size()+1);
    415 
    416     map<size_t, size_t> split_lut;
    417     for (size_t i=0; i<split_seq.size(); i++)
    418     {
    419         const size_t sz = split_lut.size();
    420         for (size_t j=0; j<split_seq[i]; j++)
    421             split_lut.emplace(j+sz, i);
    422     }
    423 
    424     for (size_t i=0; i<split_quant.size(); i++)
    425         if (split_quant[i]<0 || split_quant[i]>=1)
    426             throw runtime_error("Splitting quantiles must be in the range [0;1)");
    427 
    428     for (size_t i=1; i<split_quant.size(); i++)
    429     {
    430         if (split_quant[i]<=split_quant[i-1])
    431             throw runtime_error("Splitting quantiles must be in increasing order.");
    432     }
    433 
    434389    // -------------------------------------------------------------------------
    435390
    436     const uniform_real_distribution<double> distribution(0,1);
    437     mt19937_64 generator;
    438     generator.seed(conf.Get<uint64_t>("seed"));
    439     auto rndm = bind(distribution, generator);
    440 
    441     // -------------------------------------------------------------------------
     391    /*const*/ Tools::Splitting split(conf);
    442392
    443393    if (verbose>0)
     
    780730    // -------------------------------------------------------------------------
    781731
    782     if (num_split)
    783     {
    784         cout << "\nSplitting configured " << (split_seq.empty()?"randomly":"in sequence") << " into " << num_split << " files." << endl;
    785         if (!split_quant.empty())
    786             cout << "Seed value configured as " << conf.Get<uint64_t>("seed") << "." << endl;
     732    if (verbose>0)
     733    {
     734        cout << '\n';
     735        split.print();
    787736    }
    788737
     
    798747    vector<ofstream> outfiles;
    799748
    800     if (num_split==0)
     749    if (split.empty())
    801750    {
    802751        TString path(out.c_str());
     
    811760    else
    812761    {
    813         for (size_t i=0; i<num_split; i++)
     762        for (size_t i=0; i<split.size(); i++)
    814763        {
    815764            TString path(out.c_str());
     
    828777    // ---------------------------- Write Body --------------------------------
    829778    size_t count = 0;
    830     vector<size_t> ncount(num_split?num_split:1);
     779    vector<size_t> ncount(split.empty()?1:split.size());
    831780
    832781    auto itree = c.GetTreeNumber();
     
    848797            continue;
    849798
    850         size_t index = 0;
    851         if (!split_lut.empty())
    852             index = split_lut[count % split_lut.size()];
    853         if (!split_quant.empty())
    854         {
    855             const float r = rndm();
    856             for (; r>=split_quant[index]; index++)
    857                 if (index==split_quant.size())
    858                     break;
    859         }
     799        const size_t index = split.index(count++);
     800        ncount[index]++;
    860801
    861802        vector<string> join;
     
    875816
    876817        outfiles[index] << boost::join(join, " ") << "\n";
    877 
    878         count ++;
    879         ncount[index] ++;
    880818    }
    881819
Note: See TracChangeset for help on using the changeset viewer.