Changeset 19280


Ignore:
Timestamp:
Oct 23, 2018, 6:48:24 PM (8 months ago)
Author:
tbretz
Message:
Implemented automatic splitting into root-trees or ascii-files.
File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/FACT++/src/rootifysql.cc

    r19235 r19280  
    6060        ;
    6161
     62    po::options_description split("Splitting options");
     63    split.add_options()
     64        ("split-sequence,S", vars<uint16_t>(),            "Split data sequentially into several trees/files (e.g. 1, 1, 2)")
     65        ("split-quantile,Q", vars<double>(),              "Split data randomly into several trees/files (e.g. 0.5, 1)")
     66        ("seed", var<uint64_t>(mt19937_64::default_seed), "Seed value in case of random split")
     67        ;
     68
    6269    po::positional_options_description p;
    6370    p.add("file", 1); // The 1st positional options (n=1)
     
    6774    conf.AddOptions(ascii);
    6875    conf.AddOptions(root);
     76    conf.AddOptions(split);
    6977    conf.SetArgumentPositions(p);
    7078}
     
    114122        "Comments in the query-file can be placed according to the SQL standard inline "
    115123        "/*comment*/ or introduced with # (shell script style) or -- (SQL style).\n"
     124        "\n"
     125        "For several purposes, it might be convenient to split the output to several "
     126        "different root-trees or ascii files. This can be done using the --split-sequence (-S) "
     127        "and the --split-quantile (-Q) options. If a split sequence is defined as "
     128        "-S 1 -S 2 -S 1 the events are split by 1:2:1 in this sequence order. If "
     129        "quantiled are given as -Q 0.5 -Q 0.6, the first tree will contain 50% of "
     130        "the second one 10% and the third one 40%. The corresponding seed value can "
     131        "be set with --seed.\n"
    116132        "\n"
    117133        "In case of success, 0 is returned, a value>0 otherwise.\n"
     
    502518    //const vector<Map> mymap    = conf.Vec<Map>("map");
    503519
     520    vector<uint16_t> split_seq   = conf.Vec<uint16_t>("split-sequence");
     521    vector<double>   split_quant = conf.Vec<double>("split-quantile");
     522
     523    if (!split_seq.empty() && !split_quant.empty())
     524        throw runtime_error("Only splitting by --split-sequence or --split-quantile is allowed.");
     525
     526    const size_t num_split = ::max(split_seq.size(), split_quant.size()+1);
     527
     528    map<size_t, size_t> split_lut;
     529    for (size_t i=0; i<split_seq.size(); i++)
     530    {
     531        const size_t sz = split_lut.size();
     532        for (size_t j=0; j<split_seq[i]; j++)
     533            split_lut.emplace(j+sz, i);
     534    }
     535
     536    for (size_t i=0; i<split_quant.size(); i++)
     537        if (split_quant[i]<0 || split_quant[i]>=1)
     538            throw runtime_error("Splitting quantiles must be in the range [0;1)");
     539
     540    for (size_t i=1; i<split_quant.size(); i++)
     541    {
     542        if (split_quant[i]<=split_quant[i-1])
     543            throw runtime_error("Splitting quantiles must be in increasing order.");
     544    }
     545
    504546    // -------------------------------------------------------------------------
    505547
     
    582624    // waiting long for the query result
    583625    //
     626
    584627    // I am using root here instead of boost to be
    585628    // consistent with the access pattern by TFile
     
    778821        cout << "Opening file '" << path << "' [compression=" << compression << "]...\n";
    779822        cout << "Writing data to tree '" << tree << "'" << (nofill?" (--skipped--)":"") << endl;
     823        if (num_split)
     824            cout << "Splitting configured with " << num_split << " branches." << endl;
    780825    }
    781826
     
    808853    UInt_t cols = 0;
    809854
     855    // IMPLEMENT FILE SPLITTING!
     856    // OpenFile(tree, query)
     857    // SetupColumns
     858    // WriteRow
     859    // CloseFile
     860
     861    // Ratio[3]: 50%, 20%, 30%
     862    // File[x3]: root, cout, fout
     863
    810864
    811865    // -------------------- Configure branches of TTree ------------------------
    812     TTree *ttree = new TTree(tree.c_str(), query.c_str());
     866    vector<TTree*> ttree;
     867
     868    if (num_split==0)
     869        ttree.emplace_back(new TTree(tree.c_str(), query.c_str()));
     870    else
     871        for (int i=0; i<num_split; i++)
     872            ttree.emplace_back(new TTree((tree+"["+to_string(i)+"]").c_str(), query.c_str()));
    813873
    814874    size_t skipat  = 0;
     
    862922            //     name = boost::regex_replace(l[i], boost::regex(m.first), m.second);
    863923
    864             ttree->Branch(l[i].c_str(), buf.data()+i);
     924            for (auto it=ttree.begin(); it!=ttree.end(); it++)
     925                it[0]->Branch(l[i].c_str(), buf.data()+i);
    865926            cols++;
    866927        }
     
    879940    }
    880941
    881     // -------------------------------------------------------------------------
     942    // ------------------------- Open the ascii files --------------------------
     943
     944    vector<ofstream> fout;
     945    if (!write.empty())
     946    {
     947        vector<string> names;
     948        if (num_split==0)
     949            names.emplace_back(write);
     950        else
     951            for (int i=0; i<num_split; i++)
     952                names.emplace_back(write+"-"+to_string(i));
     953
     954        for (auto it=names.cbegin(); it!=names.cend(); it++)
     955        {
     956            fout.emplace_back(*it);
     957            if (!*fout.rbegin())
     958                cout << "WARNING: Writing to '" << write << "' failed: " << strerror(errno) << endl;
     959        }
     960    }
     961
     962    // ----------------------- Prepare the ascii comment -----------------------
    882963
    883964    string contents;
     
    905986            (copy_comments && comment && !header))
    906987            contents += '#' + ibuf + '\n';
    907 
    908     }
    909 
    910     ofstream fout(write);
    911     if (!write.empty() && !fout)
    912         cout << "WARNING: Writing to '" << write << "' failed: " << strerror(errno) << endl;
     988    }
     989
     990    // ----------------------- Write the ascii headers -------------------------
     991
     992    ostringstream htxt;
     993    if (display || !fout.empty())
     994        htxt << row.field_list(delimiter.c_str());
    913995
    914996    if (display)
     
    916998        cout << endl;
    917999        cout << contents << endl;
    918         cout << "# " << row.field_list(delimiter.c_str()) << endl;
    919     }
    920 
    921     if (!write.empty())
    922     {
    923         fout << contents;
    924         fout << "# " << row.field_list(delimiter.c_str()) << endl;
     1000        cout << "# " << htxt.str() << endl;
     1001    }
     1002    for (auto ff=fout.begin(); ff!=fout.end(); ff++)
     1003    {
     1004        *ff << contents;
     1005        *ff << "# " << htxt.str() << endl;
    9251006    }
    9261007
    9271008    // ---------------------- Fill TTree with DB data --------------------------
     1009
     1010    const uniform_real_distribution<double> distribution(0,1);
     1011    mt19937_64 generator;
     1012    if (conf.Has("seed"))
     1013        generator.seed(conf.Get<uint64_t>("seed"));
     1014    auto rndm = bind(distribution, generator);
     1015
    9281016    size_t count = 0;
    9291017    size_t skip  = 0;
    9301018    do
    9311019    {
     1020        size_t index = 0;
     1021        if (!split_lut.empty())
     1022            index = split_lut[count % split_lut.size()];
     1023        if (!split_quant.empty())
     1024        {
     1025            const float r = rndm();
     1026            for (; r>=split_quant[index]; index++)
     1027                if (index==split_quant.size())
     1028                    break;
     1029        }
     1030
    9321031        count++;
    9331032
     1033        ostringstream rtxt;
     1034        if (display || !fout.empty())
     1035            rtxt << row.value_list(delimiter.c_str(), mysqlpp::do_nothing);
     1036
    9341037        if (display)
    935             cout << row.value_list(delimiter.c_str(), mysqlpp::do_nothing) << '\n';
    936         if (!write.empty())
    937             fout << row.value_list(delimiter.c_str(), mysqlpp::do_nothing) << '\n';
     1038            cout << rtxt.str() << '\n';
     1039        if (!fout.empty())
     1040            fout[index] << rtxt.str() << '\n';
    9381041
    9391042        size_t idx=0;
     
    9721075
    9731076        if (idx==row.size() && !nofill)
    974             ttree->Fill();
     1077            ttree[index]->Fill();
    9751078
    9761079        row = res.fetch_row();
     
    9901093            cout << skip << " rows skipped due to NULL field." << endl;
    9911094
    992         cout << ttree->GetEntries() << " rows filled into tree." << endl;
    993     }
    994 
    995     ttree->Write();
     1095        for (int i=0; i<ttree.size(); i++)
     1096            cout << ttree[i]->GetEntries() << " rows filled into tree #" << i << "." << endl;
     1097    }
     1098
     1099    for (auto it=ttree.begin(); it!=ttree.end(); it++)
     1100        (*it)->Write();
    9961101    tfile.Close();
    9971102
Note: See TracChangeset for help on using the changeset viewer.