Changeset 19802 for trunk/FACT++


Ignore:
Timestamp:
10/27/19 11:11:01 (5 years ago)
Author:
tbretz
Message:
Moved the common Splitting algorithm to a common class 'Splitting' (added) and added the possibility to rename columns (csv2root)
Location:
trunk/FACT++/src
Files:
1 added
3 edited

Legend:

Unmodified
Added
Removed
  • trunk/FACT++/src/csv2root.cc

    r19798 r19802  
    1 #include <random>
    2 
    31#include <boost/regex.hpp>
    42#include <boost/filesystem.hpp>
     
    75#include "tools.h"
    86#include "Time.h"
    9 #include "Configuration.h"
     7#include "Splitting.h"
    108
    119#include <TROOT.h>
     
    1816using namespace std;
    1917namespace fs = boost::filesystem;
     18
     19// ------------------------------------------------------------------------
     20
    2021
    2122// ------------------------------------------------------------------------
     
    3233        ("compression,c",  var<uint16_t>(1),          "zlib compression level for the root file")
    3334        ("no-header,n",    po_switch(),               "Use if the first line contains no header")
     35        ("rename.*",       var<string>(),             "Can be used to rename a column")
    3436        ("dry-run",        po_switch(),               "Do not create or manipulate any output file")
    35         ;
    36 
    37     po::options_description split("Splitting options");
    38     split.add_options()
    39         ("split-sequence,S", vars<uint16_t>(),            "Split data sequentially into several trees/files (e.g. 1, 1, 2)")
    40         ("split-quantile,Q", vars<double>(),              "Split data randomly into several trees/files (e.g. 0.5, 1)")
    41         ("seed", var<uint64_t>(mt19937_64::default_seed), "Seed value in case of random split")
    42         ;
    43 
    44     po::options_description debug("Debug options");
    45     debug.add_options()
    4637        ("verbose,v",      var<uint16_t>(1),          "Verbosity (0: quiet, 1: default, 2: more, 3, ...)")
    4738        ;
     
    5344
    5445    conf.AddOptions(control);
    55     conf.AddOptions(split);
    56     conf.AddOptions(debug);
     46    conf.AddOptions(Tools::Splitting::options());
    5747    conf.SetArgumentPositions(p);
    5848}
     
    9080        "with --compression.\n"
    9181        "\n"
    92         "For several purposes, it might be convenient to split the output to several "
    93         "different root-treess. This can be done using the --split-sequence (-S) "
    94         "and the --split-quantile (-Q) options. If a split sequence is defined as "
    95         "-S 1 -S 2 -S 1 the events are split by 1:2:1 in this sequence order. If "
    96         "quantiles are given as -Q 0.5 -Q 0.6, the first tree will contain 50% of "
    97         "the second one 10% and the third one 40%. The corresponding seed value can "
    98         "be set with --seed.\n"
     82        "Columns can be renamed with --rename.old=new\n"
     83        "\n"
     84        << Tools::Splitting::usage() <<
    9985        "\n"
    10086        "In case of success, 0 is returned, a value>0 otherwise.\n"
     
    178164    }
    179165
    180     // ----------------------------- Setup splitting ---------------------------
    181 
    182     vector<uint16_t> split_seq   = conf.Vec<uint16_t>("split-sequence");
    183     vector<double>   split_quant = conf.Vec<double>("split-quantile");
    184 
    185     if (!split_seq.empty() && !split_quant.empty())
    186         throw runtime_error("Only splitting by --split-sequence or --split-quantile is allowed.");
    187 
    188     const size_t num_split = split_seq.size()+split_quant.size()==0 ? 0 :
    189         ::max(split_seq.size(), split_quant.size()+1);
    190 
    191     map<size_t, size_t> split_lut;
    192     for (size_t i=0; i<split_seq.size(); i++)
    193     {
    194         const size_t sz = split_lut.size();
    195         for (size_t j=0; j<split_seq[i]; j++)
    196             split_lut.emplace(j+sz, i);
    197     }
    198 
    199     for (size_t i=0; i<split_quant.size(); i++)
    200         if (split_quant[i]<0 || split_quant[i]>=1)
    201             throw runtime_error("Splitting quantiles must be in the range [0;1)");
    202 
    203     for (size_t i=1; i<split_quant.size(); i++)
    204     {
    205         if (split_quant[i]<=split_quant[i-1])
    206             throw runtime_error("Splitting quantiles must be in increasing order.");
    207     }
    208 
    209166    // -------------------------------------------------------------------------
    210167
    211     const uniform_real_distribution<double> distribution(0,1);
    212     mt19937_64 generator;
    213     generator.seed(conf.Get<uint64_t>("seed"));
    214     auto rndm = bind(distribution, generator);
    215 
    216     // -------------------------------------------------------------------------
     168    /*const*/ Tools::Splitting split(conf);
    217169
    218170    if (verbose>0)
     
    300252        cout << "Opened root file '" << path << "'.\n";
    301253        cout << "Writing to tree: " << tree << ".\n";
     254        split.print();
    302255    }
    303256
     
    306259
    307260    size_t entries = 0;
    308     if (num_split==0)
     261    if (split.empty())
    309262    {
    310263        if (AddTree(ttree, tfile, tree, update, verbose))
     
    318271    {
    319272        bool found = false;
    320         for (size_t i=0; i<num_split; i++)
     273        for (size_t i=0; i<split.size(); i++)
    321274            found |= AddTree(ttree, tfile, tree+"["+to_string(i)+"]", update, verbose);
    322275
     
    328281    }
    329282
     283    const auto rename = conf.GetWildcardOptions("rename.*");
     284
    330285    vector<float> vec(numcol);
    331286    for (int i=0; i<numcol; i++)
     
    333288        string col = noheader ? Tools::Form("col%d", i) : title->At(i)->GetName();
    334289
     290        if (verbose>1)
     291            cout << "Column: " << col;
     292
    335293        boost::regex rexpr(":");
    336294        col = boost::regex_replace(col, rexpr, "");
    337295
    338296        if (verbose>1)
    339             cout << "Column: " << col << '\n';
     297            cout << " -> " << col;
     298
     299        for (auto it=rename.cbegin(); it!=rename.cend(); it++)
     300        {
     301            if (col!=it->substr(7))
     302                continue;
     303
     304            col = conf.Get<string>(*it);
     305            if (verbose>1)
     306                cout << " -> " << col;
     307            break;
     308        }
     309        if (verbose>1)
     310            cout << endl;
    340311
    341312        for (auto it=ttree.begin(); it!=ttree.end(); it++)
     
    361332        if (buf.IsNull() || buf[0]=='#')
    362333            continue;
    363 
    364         valid++;
    365334
    366335        TObjArray *arr = buf.Tokenize(" ");
     
    386355        delete arr;
    387356
    388 
    389         size_t index = 0;
    390         if (!split_lut.empty())
    391             index = split_lut[line % split_lut.size()];
    392         if (!split_quant.empty())
    393         {
    394             const float r = rndm();
    395             for (; r>=split_quant[index]; index++)
    396                 if (index==split_quant.size())
    397                     break;
    398         }
     357        const size_t index = split.index(valid++);
    399358
    400359        // Fill only branches for which an adress was set
  • trunk/FACT++/src/root2csv.cc

    r19795 r19802  
    1 #include <random>
    2 
    31#include <boost/regex.hpp>
    42#include <boost/filesystem.hpp>
     
    75#include "tools.h"
    86#include "Time.h"
    9 #include "Configuration.h"
     7#include "Splitting.h"
    108
    119#include <TROOT.h>
     
    6563        ;
    6664
    67     po::options_description split("Splitting options");
    68     split.add_options()
    69         ("split-sequence,S", vars<uint16_t>(),            "Split data sequentially into several trees/files (e.g. 1, 1, 2)")
    70         ("split-quantile,Q", vars<double>(),              "Split data randomly into several trees/files (e.g. 0.5, 1)")
    71         ("seed", var<uint64_t>(mt19937_64::default_seed), "Seed value in case of random split")
    72         ;
    73 
    7465    po::options_description debug("Debug options");
    7566    debug.add_options()
     
    8475
    8576    conf.AddOptions(control);
    86     conf.AddOptions(split);
     77    conf.AddOptions(Tools::Splitting::options());
    8778    conf.AddOptions(debug);
    8879    conf.SetArgumentPositions(p);
     
    169160        "and all negative values are considered 'false' (discard the entry).\n"
    170161        "\n"
    171         "For several purposes, it might be convenient to split the output to several "
    172         "files. This can be achieved using the --split-sequence (-S) "
    173         "and the --split-quantile (-Q) options. If a split sequence is defined as "
    174         "-S 1 -S 2 -S 1 the events are split by 1:2:1 in this sequence order. If "
    175         "quantiles are given as -Q 0.5 -Q 0.6, the first tree will contain 50% of "
    176         "the second one 10% and the third one 40%. The corresponding seed value can "
    177         "be set with --seed. Filenames are then created by adding an index after(!) "
    178         "the extension, e.g. file.csv-0, file.csv-1, ...\n"
     162        << Tools::Splitting::usage() <<
    179163        "\n"
    180164        "In case of success, 0 is returned, a value>0 otherwise.\n"
     
    403387    const vector<Map> autoalias  = conf.Vec<Map>("auto-alias");
    404388
    405     // ----------------------------- Setup splitting ---------------------------
    406 
    407     vector<uint16_t> split_seq   = conf.Vec<uint16_t>("split-sequence");
    408     vector<double>   split_quant = conf.Vec<double>("split-quantile");
    409 
    410     if (!split_seq.empty() && !split_quant.empty())
    411         throw runtime_error("Only splitting by --split-sequence or --split-quantile is allowed.");
    412 
    413     const size_t num_split = split_seq.size()+split_quant.size()==0 ? 0 :
    414         ::max(split_seq.size(), split_quant.size()+1);
    415 
    416     map<size_t, size_t> split_lut;
    417     for (size_t i=0; i<split_seq.size(); i++)
    418     {
    419         const size_t sz = split_lut.size();
    420         for (size_t j=0; j<split_seq[i]; j++)
    421             split_lut.emplace(j+sz, i);
    422     }
    423 
    424     for (size_t i=0; i<split_quant.size(); i++)
    425         if (split_quant[i]<0 || split_quant[i]>=1)
    426             throw runtime_error("Splitting quantiles must be in the range [0;1)");
    427 
    428     for (size_t i=1; i<split_quant.size(); i++)
    429     {
    430         if (split_quant[i]<=split_quant[i-1])
    431             throw runtime_error("Splitting quantiles must be in increasing order.");
    432     }
    433 
    434389    // -------------------------------------------------------------------------
    435390
    436     const uniform_real_distribution<double> distribution(0,1);
    437     mt19937_64 generator;
    438     generator.seed(conf.Get<uint64_t>("seed"));
    439     auto rndm = bind(distribution, generator);
    440 
    441     // -------------------------------------------------------------------------
     391    /*const*/ Tools::Splitting split(conf);
    442392
    443393    if (verbose>0)
     
    780730    // -------------------------------------------------------------------------
    781731
    782     if (num_split)
    783     {
    784         cout << "\nSplitting configured " << (split_seq.empty()?"randomly":"in sequence") << " into " << num_split << " files." << endl;
    785         if (!split_quant.empty())
    786             cout << "Seed value configured as " << conf.Get<uint64_t>("seed") << "." << endl;
     732    if (verbose>0)
     733    {
     734        cout << '\n';
     735        split.print();
    787736    }
    788737
     
    798747    vector<ofstream> outfiles;
    799748
    800     if (num_split==0)
     749    if (split.empty())
    801750    {
    802751        TString path(out.c_str());
     
    811760    else
    812761    {
    813         for (size_t i=0; i<num_split; i++)
     762        for (size_t i=0; i<split.size(); i++)
    814763        {
    815764            TString path(out.c_str());
     
    828777    // ---------------------------- Write Body --------------------------------
    829778    size_t count = 0;
    830     vector<size_t> ncount(num_split?num_split:1);
     779    vector<size_t> ncount(split.empty()?1:split.size());
    831780
    832781    auto itree = c.GetTreeNumber();
     
    848797            continue;
    849798
    850         size_t index = 0;
    851         if (!split_lut.empty())
    852             index = split_lut[count % split_lut.size()];
    853         if (!split_quant.empty())
    854         {
    855             const float r = rndm();
    856             for (; r>=split_quant[index]; index++)
    857                 if (index==split_quant.size())
    858                     break;
    859         }
     799        const size_t index = split.index(count++);
     800        ncount[index]++;
    860801
    861802        vector<string> join;
     
    875816
    876817        outfiles[index] << boost::join(join, " ") << "\n";
    877 
    878         count ++;
    879         ncount[index] ++;
    880818    }
    881819
  • trunk/FACT++/src/rootifysql.cc

    r19482 r19802  
    11#include "Database.h"
    2 
    3 #include <random>
    42
    53#include <boost/regex.hpp>
     
    108#include "tools.h"
    119#include "Time.h"
    12 #include "Configuration.h"
     10#include "Splitting.h"
    1311
    1412#include <TROOT.h>
     
    6260        ;
    6361
    64     po::options_description split("Splitting options");
    65     split.add_options()
    66         ("split-sequence,S", vars<uint16_t>(),            "Split data sequentially into several trees/files (e.g. 1, 1, 2)")
    67         ("split-quantile,Q", vars<double>(),              "Split data randomly into several trees/files (e.g. 0.5, 1)")
    68         ("seed", var<uint64_t>(mt19937_64::default_seed), "Seed value in case of random split")
    69         ;
    70 
    7162    po::positional_options_description p;
    7263    p.add("file", 1); // The 1st positional options (n=1)
     
    7667    conf.AddOptions(ascii);
    7768    conf.AddOptions(root);
    78     conf.AddOptions(split);
     69    conf.AddOptions(Tools::Splitting::options());
    7970    conf.SetArgumentPositions(p);
    8071}
     
    125116        "/*comment*/ or introduced with # (shell script style) or -- (SQL style).\n"
    126117        "\n"
    127         "For several purposes, it might be convenient to split the output to several "
    128         "different root-trees or ascii files. This can be done using the --split-sequence (-S) "
    129         "and the --split-quantile (-Q) options. If a split sequence is defined as "
    130         "-S 1 -S 2 -S 1 the events are split by 1:2:1 in this sequence order. If "
    131         "quantiled are given as -Q 0.5 -Q 0.6, the first tree will contain 50% of "
    132         "the second one 10% and the third one 40%. The corresponding seed value can "
    133         "be set with --seed.\n"
     118        << Tools::Splitting::usage() <<
    134119        "\n"
    135120        "In case of success, 0 is returned, a value>0 otherwise.\n"
     
    522507    // ----------------------- Setup splitting ---------------------------------
    523508
    524     vector<uint16_t> split_seq   = conf.Vec<uint16_t>("split-sequence");
    525     vector<double>   split_quant = conf.Vec<double>("split-quantile");
    526 
    527     if (!split_seq.empty() && !split_quant.empty())
    528         throw runtime_error("Only splitting by --split-sequence or --split-quantile is allowed.");
    529 
    530     const size_t num_split = split_seq.size()+split_quant.size()==0 ? 0 :
    531         ::max(split_seq.size(), split_quant.size()+1);
    532 
    533     map<size_t, size_t> split_lut;
    534     for (size_t i=0; i<split_seq.size(); i++)
    535     {
    536         const size_t sz = split_lut.size();
    537         for (size_t j=0; j<split_seq[i]; j++)
    538             split_lut.emplace(j+sz, i);
    539     }
    540 
    541     for (size_t i=0; i<split_quant.size(); i++)
    542         if (split_quant[i]<0 || split_quant[i]>=1)
    543             throw runtime_error("Splitting quantiles must be in the range [0;1)");
    544 
    545     for (size_t i=1; i<split_quant.size(); i++)
    546     {
    547         if (split_quant[i]<=split_quant[i-1])
    548             throw runtime_error("Splitting quantiles must be in increasing order.");
    549     }
    550 
    551     // -------------------------------------------------------------------------
    552 
    553509    const auto vars = conf.GetWildcardOptions("var.*");
    554510
     
    558514
    559515    // -------------------------------------------------------------------------
     516
     517    /*const*/ Tools::Splitting split(conf);
    560518
    561519    if (verbose>0)
     
    826784        cout << "Opening file '" << path << "' [compression=" << compression << "]...\n";
    827785        cout << "Writing data to tree '" << tree << "'" << (nofill?" (--skipped--)":"") << endl;
    828         if (num_split)
    829         {
    830             cout << "Splitting configured " << (split_seq.empty()?"randomly":"in sequence") << " into " << num_split << " branches." << endl;
    831             if (!split_quant.empty())
    832                 cout << "Seed value configured as " << conf.Get<uint64_t>("seed") << "." << endl;
    833         }
     786        split.print();
    834787    }
    835788
     
    875828    vector<TTree*> ttree;
    876829
    877     if (num_split==0)
     830    if (split.empty())
    878831        ttree.emplace_back(new TTree(tree.c_str(), query.c_str()));
    879832    else
    880         for (size_t i=0; i<num_split; i++)
     833        for (size_t i=0; i<split.size(); i++)
    881834            ttree.emplace_back(new TTree((tree+"["+to_string(i)+"]").c_str(), query.c_str()));
    882835
     
    955908    {
    956909        vector<string> names;
    957         if (num_split==0)
     910        if (split.empty())
    958911            names.emplace_back(write);
    959912        else
    960             for (size_t i=0; i<num_split; i++)
     913            for (size_t i=0; i<split.size(); i++)
    961914                names.emplace_back(write+"-"+to_string(i));
    962915
     
    1017970    // ---------------------- Fill TTree with DB data --------------------------
    1018971
    1019     const uniform_real_distribution<double> distribution(0,1);
    1020     mt19937_64 generator;
    1021     generator.seed(conf.Get<uint64_t>("seed"));
    1022     auto rndm = bind(distribution, generator);
    1023 
    1024972    size_t count = 0;
    1025973    size_t skip  = 0;
    1026974    do
    1027975    {
    1028         size_t index = 0;
    1029         if (!split_lut.empty())
    1030             index = split_lut[count % split_lut.size()];
    1031         if (!split_quant.empty())
    1032         {
    1033             const float r = rndm();
    1034             for (; r>=split_quant[index]; index++)
    1035                 if (index==split_quant.size())
    1036                     break;
    1037         }
    1038 
    1039         count++;
     976        size_t index = split.index(count++);
    1040977
    1041978        ostringstream rtxt;
Note: See TracChangeset for help on using the changeset viewer.