Ignore:
Timestamp:
10/27/19 11:11:01 (5 years ago)
Author:
tbretz
Message:
Moved the shared splitting algorithm into a new common class 'Splitting' and added the possibility to rename columns (csv2root)
File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/FACT++/src/csv2root.cc

    r19798 r19802  
    1 #include <random>
    2 
    31#include <boost/regex.hpp>
    42#include <boost/filesystem.hpp>
     
    75#include "tools.h"
    86#include "Time.h"
    9 #include "Configuration.h"
     7#include "Splitting.h"
    108
    119#include <TROOT.h>
     
    1816using namespace std;
    1917namespace fs = boost::filesystem;
     18
     19// ------------------------------------------------------------------------
     20
    2021
    2122// ------------------------------------------------------------------------
     
    3233        ("compression,c",  var<uint16_t>(1),          "zlib compression level for the root file")
    3334        ("no-header,n",    po_switch(),               "Use if the first line contains no header")
     35        ("rename.*",       var<string>(),             "Can be used to rename a column")
    3436        ("dry-run",        po_switch(),               "Do not create or manipulate any output file")
    35         ;
    36 
    37     po::options_description split("Splitting options");
    38     split.add_options()
    39         ("split-sequence,S", vars<uint16_t>(),            "Split data sequentially into several trees/files (e.g. 1, 1, 2)")
    40         ("split-quantile,Q", vars<double>(),              "Split data randomly into several trees/files (e.g. 0.5, 1)")
    41         ("seed", var<uint64_t>(mt19937_64::default_seed), "Seed value in case of random split")
    42         ;
    43 
    44     po::options_description debug("Debug options");
    45     debug.add_options()
    4637        ("verbose,v",      var<uint16_t>(1),          "Verbosity (0: quiet, 1: default, 2: more, 3, ...)")
    4738        ;
     
    5344
    5445    conf.AddOptions(control);
    55     conf.AddOptions(split);
    56     conf.AddOptions(debug);
     46    conf.AddOptions(Tools::Splitting::options());
    5747    conf.SetArgumentPositions(p);
    5848}
     
    9080        "with --compression.\n"
    9181        "\n"
    92         "For several purposes, it might be convenient to split the output to several "
    93         "different root-treess. This can be done using the --split-sequence (-S) "
    94         "and the --split-quantile (-Q) options. If a split sequence is defined as "
    95         "-S 1 -S 2 -S 1 the events are split by 1:2:1 in this sequence order. If "
    96         "quantiles are given as -Q 0.5 -Q 0.6, the first tree will contain 50% of "
    97         "the second one 10% and the third one 40%. The corresponding seed value can "
    98         "be set with --seed.\n"
     82        "Columns can be renamed with --rename.new=old\n"
     83        "\n"
     84        << Tools::Splitting::usage() <<
    9985        "\n"
    10086        "In case of success, 0 is returned, a value>0 otherwise.\n"
     
    178164    }
    179165
    180     // ----------------------------- Setup splitting ---------------------------
    181 
    182     vector<uint16_t> split_seq   = conf.Vec<uint16_t>("split-sequence");
    183     vector<double>   split_quant = conf.Vec<double>("split-quantile");
    184 
    185     if (!split_seq.empty() && !split_quant.empty())
    186         throw runtime_error("Only splitting by --split-sequence or --split-quantile is allowed.");
    187 
    188     const size_t num_split = split_seq.size()+split_quant.size()==0 ? 0 :
    189         ::max(split_seq.size(), split_quant.size()+1);
    190 
    191     map<size_t, size_t> split_lut;
    192     for (size_t i=0; i<split_seq.size(); i++)
    193     {
    194         const size_t sz = split_lut.size();
    195         for (size_t j=0; j<split_seq[i]; j++)
    196             split_lut.emplace(j+sz, i);
    197     }
    198 
    199     for (size_t i=0; i<split_quant.size(); i++)
    200         if (split_quant[i]<0 || split_quant[i]>=1)
    201             throw runtime_error("Splitting quantiles must be in the range [0;1)");
    202 
    203     for (size_t i=1; i<split_quant.size(); i++)
    204     {
    205         if (split_quant[i]<=split_quant[i-1])
    206             throw runtime_error("Splitting quantiles must be in increasing order.");
    207     }
    208 
    209166    // -------------------------------------------------------------------------
    210167
    211     const uniform_real_distribution<double> distribution(0,1);
    212     mt19937_64 generator;
    213     generator.seed(conf.Get<uint64_t>("seed"));
    214     auto rndm = bind(distribution, generator);
    215 
    216     // -------------------------------------------------------------------------
     168    /*const*/ Tools::Splitting split(conf);
    217169
    218170    if (verbose>0)
     
    300252        cout << "Opened root file '" << path << "'.\n";
    301253        cout << "Writing to tree: " << tree << ".\n";
     254        split.print();
    302255    }
    303256
     
    306259
    307260    size_t entries = 0;
    308     if (num_split==0)
     261    if (split.empty())
    309262    {
    310263        if (AddTree(ttree, tfile, tree, update, verbose))
     
    318271    {
    319272        bool found = false;
    320         for (size_t i=0; i<num_split; i++)
     273        for (size_t i=0; i<split.size(); i++)
    321274            found |= AddTree(ttree, tfile, tree+"["+to_string(i)+"]", update, verbose);
    322275
     
    328281    }
    329282
     283    const auto rename = conf.GetWildcardOptions("rename.*");
     284
    330285    vector<float> vec(numcol);
    331286    for (int i=0; i<numcol; i++)
     
    333288        string col = noheader ? Tools::Form("col%d", i) : title->At(i)->GetName();
    334289
     290        if (verbose>1)
     291            cout << "Column: " << col;
     292
    335293        boost::regex rexpr(":");
    336294        col = boost::regex_replace(col, rexpr, "");
    337295
    338296        if (verbose>1)
    339             cout << "Column: " << col << '\n';
     297            cout << " -> " << col;
     298
     299        for (auto it=rename.cbegin(); it!=rename.cend(); it++)
     300        {
     301            if (col!=it->substr(7))
     302                continue;
     303
     304            col = conf.Get<string>(*it);
     305            if (verbose>1)
     306                cout << " -> " << col;
     307            break;
     308        }
     309        if (verbose>1)
     310            cout << endl;
    340311
    341312        for (auto it=ttree.begin(); it!=ttree.end(); it++)
     
    361332        if (buf.IsNull() || buf[0]=='#')
    362333            continue;
    363 
    364         valid++;
    365334
    366335        TObjArray *arr = buf.Tokenize(" ");
     
    386355        delete arr;
    387356
    388 
    389         size_t index = 0;
    390         if (!split_lut.empty())
    391             index = split_lut[line % split_lut.size()];
    392         if (!split_quant.empty())
    393         {
    394             const float r = rndm();
    395             for (; r>=split_quant[index]; index++)
    396                 if (index==split_quant.size())
    397                     break;
    398         }
     357        const size_t index = split.index(valid++);
    399358
    400359        // Fill only branches for which an address was set
Note: See TracChangeset for help on using the changeset viewer.