Index: trunk/FACT++/src/root2sql.cc
===================================================================
--- trunk/FACT++/src/root2sql.cc	(revision 18928)
+++ trunk/FACT++/src/root2sql.cc	(revision 18928)
@@ -0,0 +1,537 @@
+#include <boost/algorithm/string/join.hpp>
+#include <boost/regex.hpp>
+
+#include "Database.h"
+
+#include "Time.h"
+#include "Configuration.h"
+
+#include <TROOT.h>
+#include <TFile.h>
+#include <TTree.h>
+#include <TLeaf.h>
+#include <TError.h>
+
+using namespace std;
+
+// ------------------------------------------------------------------------
+
+struct Map : pair<string, string>
+{
+    Map() { }
+};
+
+std::istream &operator>>(std::istream &in, Map &m)
+{
+    string txt(istreambuf_iterator<char>(in), {});
+
+    const boost::regex expr("((.*)[^\\\\])/(.*)");
+    boost::smatch match;
+    if (!boost::regex_match(txt, match, expr))
+        throw runtime_error("Could not evaluate map argument: "+txt);
+
+    m.first  = match[1].str();
+    m.second = match[3].str();
+
+    return in;
+}
+
+
+void SetupConfiguration(Configuration &conf)
+{
+    po::options_description control("Root to SQL");
+    control.add_options()
+        ("uri,u",          var<string>()->required(), "Database link as in\n\tuser:password@server[:port]/database.")
+        ("file",           var<string>(""),           "The root file to read from")
+        ("create",         po_switch(),               "Create the database if not existing")
+        ("drop",           po_switch(),               "Drop the table (implies create)")
+        ("tree,t",         var<string>("Events"),     "Name of the root tree to convert")
+        ("table",          var<string>(""),           "Name of the table to use (default is the tree name)")
+        ("map",            vars<Map>(),               "A regular expression which is applied to the leaf name befoee it is used as SQL column name)")
+        ("sql-type",       vars<Map>(),               "Allows to overwrite the calculated SQL type for a given column e.g. 'sql-column-name/UNSIGNED IN'")
+        ("ignore",         vars<string>(),            "Ignore the given leaf, if the given regular expression matches")
+        ("primary",        vars<string>(),            "List of columns to be used as primary keys during table creation (in connection with --create)")
+        ("first",          var<int64_t>(int64_t(0)),  "First event to start with (default: 0), mainly for test purpose")
+        ("max",            var<int64_t>(int64_t(0)),  "Maximum number of events to process (0: all), mainly for test purpose")
+        ("no-insert",      po_switch(),               "Does not insert any data into the table")
+        ("print-branches", po_switch(),               "Print the branches found in the tree")
+        ("print-leaves",   po_switch(),               "Print the leaves found in the tree (this is what is processed)")
+        ("print-insert",   po_switch(),               "Print the INSERT query (note that it contains all data)")
+        ("print-create",   po_switch(),               "Print the CREATE query")
+        ("verbose,v",      var<uint16_t>(1),          "Verbosity (0: quiet, 1: default, 2: more, 3, ...)")
+        ;
+
+    po::positional_options_description p;
+    p.add("file", 1); // The 1st positional options (n=1)
+
+    conf.AddOptions(control);
+    conf.SetArgumentPositions(p);
+}
+
+void PrintUsage()
+{
+    cout <<
+        "root2sql - Fills the data from a root file into a database\n"
+        "\n"
+        "For convenience, this documentation uses the extended version of the options, "
+        "refer to the output below to get the abbreviations.\n"
+        "\n"
+        "This is a general purpose tool to fill the contents of a root file into a database "
+        "as long as this is technically possible and makes sense. Note that root can even "
+        "write complex data like a TH1F into a database, this is not the purpose of this "
+        "program.\n"
+        "\n"
+        "Each root tree has branches and leaves (the basic data types). These leaves can "
+        "be read independently of the classes which were used to write the root file. "
+        "The default tree to read from is 'Events' but the name can be overwritten "
+        "using --tree. The default table name to fill the data into is identical to "
+        "the tree name. It can be overwritten using --table.\n"
+        "\n"
+        "The name of each column to which data is filled from a leave is obtained from "
+        "the leaves' names. The leave names can be checked using --print-leaves. "
+        "A --print-branches exists for convenience to print only the high-level branches. "
+        "Sometimes these names might be quite unconvenient like MTime.fTime.fMilliSec or "
+        "just MHillas.fWidth. To allow to simplify column names, regular expressions "
+        "(using boost's regex) can be defined to change the names. Note that these regular "
+        "expressions are applied one by one on each leaf's name. A valid expression could "
+        "be:\n"
+        "   --map=MHillas\\.f/\n"
+        "which would remove all occurances of 'MHillas.f'. This option can be used more than "
+        "once. They are applied in sequence. A single match does not stop the sequence.\n"
+        "\n"
+        "Sometimes it might also be convenient to skip a leaf. Tis can be done with "
+        "the --ignore resource. If the given regular expresion yield a match, the "
+        "leaf will be ignored. Note that the regular expression work on the raw-name "
+        "of the leaf not the readily mapped SQL column names. Example:\n"
+        "   --ignore=ThetaSq\\..*\n"
+        "will skip all leaved which start with 'ThetaSq.'. This option can be used"
+        "more than once.\n"
+        "\n"
+        "The data type of each column is kept as close as possible to the leaves' data "
+        "types. If for some reason this is not wanted, the data type of the SQL column "
+        "can be overwritten with --sql-type sql-column/sql-ytpe, for example:\n"
+        "   --sql-type=FileId/UNSIGNED INT\n"
+        "while the first argument of the name of the SQL column to which the data type "
+        "should be applied. The second column is the basic SQL data type. The option can "
+        "be given more than once.\n"
+        "\n"
+        "Database interaction:\n"
+        "\n"
+        "To drop an existing table, --drop can be used.\n"
+        "\n"
+        "To create a table according to theSQL  column names and data types, --create "
+        "can be used. The query used can be printed with --print-create even --create "
+        "has not been specified.\n"
+        "\n"
+        "To choose the columns which should become primary keys, use --primary, "
+        "for exmaple:\n"
+        "   --primary=col1\n"
+        "To define more than one column as primary key, the option can be given more than "
+        "once. Note that the combination of these columns must be unique.\n"
+        "\n"
+        "All columns are created as NOT NULL as default and the table is created as "
+        "MyISAM.\n"
+        "\n"
+        "For debugging purpose, or to just create or drop a table, the final insert "
+        "query can be skipped using --no-insert. Note that for performance reason, "
+        "all data is collected in memory and a single INSERT query is issued at the "
+        "end.\n"
+        "\n"
+        "Using a higher verbosity level, an overview of the written columns or all "
+        "processed leaves is printed depending on the verbosity level. The output looks "
+        "like the following\n"
+        "   Leaf name [root data type] (SQL name)\n"
+        "for example\n"
+        "   MTime.fTime.fMilliSec [Long64_t] (MilliSec)\n"
+        "which means that the leaf MTime.fTime.fMilliSec is detected to be a Long64_t "
+        "which is filled into a column called MilliSec. Leaves with non basic data types "
+        "are ignored automatically and are marked as (-n/a-). User ignored columnd "
+        "are marked as (-ignored-).\n"
+        "\n"
+        "In case of succes, 0 is returned, a value>0 otherwise.\n"
+        "\n"
+        "Usage: root2sql [options] rootfile.root\n"
+        "\n"
+        ;
+    cout << endl;
+}
+
+enum BasicType_t
+{
+    kNone = 0,
+    kFloat,
+    kDouble,
+    kInt16,
+    kUInt16,
+    kInt32,
+    kUInt32,
+    kInt64,
+    kUInt64,
+};
+
+static const map<string, pair<BasicType_t, string>> ConvRoot =
+{
+    { "Float_t",   { kFloat,  "FLOAT"             } },
+    { "Double_t",  { kDouble, "DOUBLE"            } },
+    { "ULong64_t", { kUInt64, "BIGINT UNSIGNED"   } },
+    { "Long64_t",  { kInt64,  "BIGINT"            } },
+    { "UInt_t",    { kUInt32, "INT UNSIGNED"      } },
+    { "Int_t",     { kInt32,  "INT"               } },
+    { "UShort_t",  { kUInt16, "SMALLINT UNSIGNED" } },
+    { "Short_t",   { kInt16,  "SMALLINT"          } },
+};
+
+struct Container
+{
+    string branch; // branch name
+    string column; // column name
+    BasicType_t type;
+    void *ptr;
+
+    Container(const string &b, const string &c, const BasicType_t &t) : branch(b), column(c), type(t), ptr(0)
+    {
+        switch (t)
+        {
+        case kFloat:  ptr = new Float_t;   break;
+        case kDouble: ptr = new Double_t;  break;
+        case kInt16:  ptr = new Short_t;   break;
+        case kUInt16: ptr = new UShort_t;  break;
+        case kInt32:  ptr = new Int_t;     break;
+        case kUInt32: ptr = new UInt_t;    break;
+        case kInt64:  ptr = new Long64_t;  break;
+        case kUInt64: ptr = new ULong64_t; break;
+        case kNone:
+            break;
+        }
+    }
+    ~Container()
+    {
+        //::operator delete(ptr); // It seems root is deleting it already
+    }
+
+    string fmt() const
+    {
+        ostringstream str;
+
+        switch (type)
+        {
+        case kFloat:   str << setprecision(8) << *reinterpret_cast<Float_t*>(ptr); break;
+        case kDouble:  str << setprecision(16) << *reinterpret_cast<Double_t*>(ptr); break;
+        case kInt16:   str << *reinterpret_cast<Short_t*>(ptr); break;
+        case kUInt16:  str << *reinterpret_cast<UShort_t*>(ptr); break;
+        case kInt32:   str << *reinterpret_cast<Int_t*>(ptr); break;
+        case kUInt32:  str << *reinterpret_cast<UInt_t*>(ptr); break;
+        case kInt64:   str << *reinterpret_cast<Long64_t*>(ptr); break;
+        case kUInt64:  str << *reinterpret_cast<ULong64_t*>(ptr); break;
+        case kNone:
+            break;
+        }
+
+        return str.str();
+    }
+};
+
+void ErrorHandlerAll(Int_t level, Bool_t abort, const char *location, const char *msg)
+{
+    if (string(msg).substr(0,24)=="no dictionary for class ")
+        return;
+
+    DefaultErrorHandler(level, abort, location, msg);
+}
+
+int main(int argc, const char* argv[])
+{
+    Time start;
+
+    gROOT->SetBatch();
+    SetErrorHandler(ErrorHandlerAll);
+
+    Configuration conf(argv[0]);
+    conf.SetPrintUsage(PrintUsage);
+    SetupConfiguration(conf);
+
+    if (!conf.DoParse(argc, argv))
+        return 127;
+
+    // ----------------------------- Evaluate options --------------------------
+    const string uri             = conf.Get<string>("uri");
+    const string file            = conf.Get<string>("file");
+    const string tree            = conf.Get<string>("tree");
+    const string table           = conf.Get<string>("table").empty() ? tree : conf.Get<string>("table");
+
+    const uint16_t verbose       = conf.Get<uint16_t>("verbose");
+    const int64_t  first         = conf.Get<int64_t>("first");
+    const int64_t  max           = conf.Get<int64_t>("max");
+
+    const bool drop              = conf.Get<bool>("drop");
+    const bool create            = conf.Get<bool>("create") || drop;
+    const bool noinsert          = conf.Get<bool>("no-insert");
+
+    const bool print_branches    = conf.Get<bool>("print-branches");
+    const bool print_leaves      = conf.Get<bool>("print-leaves");
+    const bool print_create      = conf.Get<bool>("print-create");
+    const bool print_insert      = conf.Get<bool>("print-insert");
+
+    const vector<Map> mymap      = conf.Vec<Map>("map");
+    const vector<Map> sqltypes   = conf.Vec<Map>("sql-type");
+    const vector<string> ignore  = conf.Vec<string>("ignore");
+    const vector<string> primary = conf.Vec<string>("primary");
+
+    // -------------------------------------------------------------------------
+
+    if (verbose>0)
+        cout << "\n-------------------------- Evaluating file -------------------------" << endl;
+
+    TFile f(file.c_str());
+    if (f.IsZombie())
+    {
+        cerr << "Could not open file " << file << endl;
+        return 1;
+    }
+
+    if (verbose>0)
+        cout << "File: " << file << endl;
+
+    TTree *T = 0;
+    f.GetObject(tree.c_str(), T);
+    if (!T)
+    {
+        cerr << "Could not open tree " << tree << endl;
+        return 2;
+    }
+
+    if (verbose>0)
+        cout << "Tree: " << tree << endl;
+
+    T->SetMakeClass(1);
+
+    TObjArray *branches = T->GetListOfBranches();
+    TObjArray *leaves   = T->GetListOfLeaves();
+
+    if (print_branches)
+    {
+        cout << '\n';
+        branches->Print();
+    }
+
+    if (verbose>0)
+        cout << T->GetEntriesFast() << " events found." << endl;
+
+
+    if (verbose>0)
+        cout << branches->GetEntries() << " branches found." << endl;
+
+    if (print_leaves)
+    {
+        cout << '\n';
+        leaves->Print();
+    }
+    if (verbose>0)
+        cout << leaves->GetEntries() << " leaves found." << endl;
+
+    string query =
+        "CREATE TABLE IF NOT EXISTS `"+table+"`\n"
+        "(\n"
+        "   `Night` SMALLINT UNSIGNED NOT NULL";
+
+    vector<Container> vec;
+
+
+    TIter Next(leaves);
+    TObject *o = 0;
+    while ((o=Next()))
+    {
+        TLeaf *L = T->GetLeaf(o->GetName());
+
+        if (verbose>2)
+            cout << '\n' << o->GetName() << " [" << L->GetTypeName() << "]";
+
+        string name = o->GetName();
+
+
+        bool found = false;
+        for (const auto &b: ignore)
+        {
+            if (boost::regex_match(name, boost::regex(b)))
+            {
+                found = true;
+                if (verbose>2)
+                    cout << " (-ignored-)";
+                break;
+            }
+        }
+        if (found)
+            continue;
+
+        const string tn = L->GetTypeName();
+
+        auto it = ConvRoot.find(tn);
+        if (it==ConvRoot.end())
+        {
+            if (verbose>2)
+                cout << " (-n/a-)";
+            continue;
+        }
+
+        if (verbose==2)
+            cout << '\n' << o->GetName() << " [" << L->GetTypeName() << "]";
+
+        for (const auto &m: mymap)
+            name = boost::regex_replace(name, boost::regex(m.first), m.second);
+
+        if (verbose>1)
+            cout << " (" << name << ")";
+
+        string sqltype = it->second.second;
+
+        for (const auto &m: sqltypes)
+            if (m.first==name)
+                sqltype = m.second;
+
+        query += ",\n   `"+name+"` "+sqltype+" NOT NULL COMMENT '"+o->GetName()+"'";
+
+        vec.emplace_back(o->GetName(), name, it->second.first);
+        T->SetBranchAddress(o->GetName(), vec.back().ptr);
+    }
+
+    if (verbose>1)
+        cout << "\n\n";
+    if (verbose>0)
+        cout << vec.size() << " leaves setup for reading." << endl;
+
+    UInt_t datatype = 0;
+    const bool has_datatype = T->SetBranchAddress("DataType.fVal", &datatype) >= 0;
+
+    // Setiing up branch status (must be after all SetBranchAddress)
+    T->SetBranchStatus("*", 0);
+    for (const auto &c: vec)
+        T->SetBranchStatus(c.branch.c_str(), 1);
+
+    if (has_datatype)
+    {
+        T->SetBranchStatus("DataType.fVal", 1);
+        if (verbose>0)
+            cout << "Rows with DataType.fVal!=1 will be skipped." << endl;
+    }
+
+    // -------------------------------------------------------------------------
+
+    if (verbose>0)
+        cout << "\n--------------------------- Database Table -------------------------" << endl;
+
+    if (!primary.empty())
+        query += ",\n   PRIMARY KEY USING BTREE (`"+boost::algorithm::join(primary, "`, `")+"`)\n";
+
+    query +=
+        ")\n"
+        "DEFAULT CHARSET=latin1 COLLATE=latin1_general_ci\n"
+        "ENGINE=MyISAM\n"
+        "COMMENT='created by "+conf.GetName()+"'\n";
+
+    // FIXME: Can we omit the catching to be able to print the
+    //        query 'autmatically'?
+    try
+    {
+        if (drop)
+        {
+            // => Simple result
+            Database(uri).query("DROP TABLE `"+table+"`").execute();
+            if (verbose>0)
+                cout << "Table `" << table << "` dropped." << endl;
+        }
+    }
+    catch (const exception &e)
+    {
+        cerr << "DROP TABLE `" << table << "`\n\n";
+        cerr << "SQL query failed:\n" << e.what() << endl;
+        return 3;
+    }
+
+    try
+    {
+        if (create)
+            Database(uri).query(query).execute();
+    }
+    catch (const exception &e)
+    {
+        cerr << query << "\n\n";
+        cerr << "SQL query failed:\n" << e.what() << endl;
+        return 4;
+    }
+
+    if (print_create)
+        cout << query << endl;
+
+    if (create && verbose>0)
+        cout << "Table `" << table << "` created." << endl;
+
+    // -------------------------------------------------------------------------
+
+    if (verbose>0)
+        cout << "\n---------------------------- Reading file --------------------------" << endl;
+
+    //query = update ? "UPDATE" : "INSERT";
+    query = "INSERT `"+table+"`\n"
+        "(\n"
+        "   `Night`";
+
+    for (const auto &c: vec)
+        query += ",\n   `"+c.column+"`";
+
+    query +=
+        "\n)\n"
+        "VALUES\n";
+
+    size_t count = 0;
+
+    const size_t num = max>0 && (max-first)<T->GetEntriesFast() ? (max-first) : T->GetEntriesFast();
+    for (size_t j=first; j<num; j++)
+    {
+        T->GetEntry(j);
+        if (has_datatype && datatype!=1)
+            continue;
+
+        query += "(\n   0";
+        for (const auto &c: vec)
+            query += ",\n   "+c.fmt()+" /* "+c.column+" -> "+c.branch+" */";
+        query += "\n)";  // ON DUPLICATE KEY UPDATE\n";
+
+        if (j!=num-1)
+            query += ",\n";
+
+        count ++;
+    }
+
+    if (verbose>0)
+        cout << count << " out of " << num << " rows read from file [N=" << first << ".." << num-1 << "]." << endl;
+
+    // -------------------------------------------------------------------------
+
+    if (verbose>0)
+        cout << "\n--------------------------- Inserting data -------------------------" << endl;
+
+    try
+    {
+        if (!noinsert)
+            // => Simple result
+            Database(uri).query(query).execute();
+        else
+            cout << "Insert query skipped!" << endl;
+        if (print_insert)
+            cout << query << endl;
+    }
+    catch (const exception &e)
+    {
+        cerr << query << "\n\n";
+        cerr << "SQL query failed:\n" << e.what() << endl;
+        return 5;
+    }
+
+    if (verbose>0)
+    {
+        cout << count << " rows inserted.\n\n";
+        cout << "Total execution time: " << Time().UnixTime()-start.UnixTime() << "s\n" << endl;
+    }
+
+    return 0;
+}
