Index: trunk/FACT++/src/csv2root.cc
===================================================================
--- trunk/FACT++/src/csv2root.cc	(revision 19795)
+++ trunk/FACT++/src/csv2root.cc	(revision 19796)
@@ -31,5 +31,5 @@
         ("tree,t",         var<string>("Events"),     "Name of the root tree to convert")
         ("compression,c",  var<uint16_t>(1),          "zlib compression level for the root file")
-        ("no-header",      po_switch(),               "Use if the first line contains no header")
+        ("no-header,n",    po_switch(),               "Use if the first line contains no header")
         ("dry-run",        po_switch(),               "Do not create or manipulate any output file")
         ;
@@ -50,4 +50,5 @@
     p.add("file", 1); // All positional options
     p.add("out",  1); // All positional options
+    p.add("tree", 1); // All positional options
 
     conf.AddOptions(control);
@@ -60,127 +61,44 @@
 {
     cout <<
-        "csv2root - Reads data from a root tree and writes a csv file\n"
+        "csv2root - Converts a data table from a csv file to a root tree\n"
         "\n"
         "For convenience, this documentation uses the extended version of the options, "
         "refer to the output below to get the abbreviations.\n"
         "\n"
-        "This is a general purpose tool to fill the contents of a root file into a database "
-        "as long as this is technically possible and makes sense. Note that root can even "
-        "write complex data like a TH1F into a database, this is not the purpose of this "
-        "program.\n"
-        "\n"
-        "Each root tree has branches and leaves (the basic data types). These leaves can "
-        "be read independently of the classes which were used to write the root file. "
-        "The default tree to read from is 'Events' but the name can be overwritten "
-        "using --tree. The default table name to fill the data into is identical to "
-        "the tree name. It can be overwritten using --table.\n"
-        "\n"
-        "To get a list of the contents (keys and trees) of a root file, you can use --print-ls. "
-        "The name of each column to which data is filled from a leave is obtained from "
-        "the leaves' names. The leave names can be checked using --print-leaves. "
-        "A --print-branches exists for convenience to print only the high-level branches. "
-        "Sometimes these names might be quite unconvenient like MTime.fTime.fMilliSec or "
-        "just MHillas.fWidth. To allow to simplify column names, regular expressions "
-        "(using boost's regex) can be defined to change the names. Note that these regular "
-        "expressions are applied one by one on each leaf's name. A valid expression could "
-        "be:\n"
-        "   --map=MHillas\\.f/\n"
-        "which would remove all occurances of 'MHillas.f'. This option can be used more than "
-        "once. They are applied in sequence. A single match does not stop the sequence.\n"
-        "\n"
-        "Sometimes it might also be convenient to skip a leaf. This can be done with "
-        "the --ignore resource. If the given regular expresion yields a match, the "
-        "leaf will be ignored. Note that the regular expression works on the raw-name "
-        "of the leaf not the readily mapped SQL column names. Example:\n"
-        "   --ignore=ThetaSq\\..*\n"
-        "will skip all leaved which start with 'ThetaSq.'. This option can be used"
-        "more than once.\n"
-        "\n"
-        "The data type of each column is kept as close as possible to the leaves' data "
-        "types. If for some reason this is not wanted, the data type of the SQL column "
-        "can be overwritten with --sql-type sql-column/sql-ytpe, for example:\n"
-        "   --sql-type=FileId/UNSIGNED INT\n"
-        "while the first argument of the name of the SQL column to which the data type "
-        "should be applied. The second column is the basic SQL data type. The option can "
-        "be given more than once.\n"
-        "\n"
-        "Database interaction:\n"
-        "\n"
-        "To drop an existing table, --drop can be used.\n"
-        "\n"
-        "To create a table according to theSQL  column names and data types, --create "
-        "can be used. The query used can be printed with --print-create even --create "
-        "has not been specified.\n"
-        "\n"
-        "To choose the columns which should become primary keys, use --primary, "
-        "for example:\n"
-        "   --primary=col1\n"
-        "To define more than one column as primary key, the option can be given more than "
-        "once. Note that the combination of these columns must be unique.\n"
-        "\n"
-        "All columns are created as NOT NULL as default. To force a database engine "
-        "and/or a storage format, use --engine and --row-format.\n"
-        "\n"
-        "Usually, the INSERT query would fail if the PRIMARY key exists already. "
-        "This can be avoided using the 'ON DUPLICATE KEY UPDATE' directive. With the "
-        "--duplicate, you can specify what should be updated in case of a duplicate key. "
-        "To keep the row untouched, you can just update the primary key "
-        "with the identical primary key, e.g. --duplicate='MyPrimary=VALUES(MyPrimary)'. "
-        "The --duplicate resource can be specified more than once to add more expressions "
-        "to the assignment_list. For more details, see the MySQL manual.\n"
-        "\n"
-        "For debugging purpose, or to just create or drop a table, the final insert "
-        "query can be skipped using --no-insert. Note that for performance reason, "
-        "all data is collected in memory and a single INSERT query is issued at the "
-        "end.\n"
-        "\n"
-        "Another possibility is to add the IGNORE keyword to the INSERT query by "
-        "--ignore-errors, which essentially ignores all errors and turns them into "
-        "warnings which are printed after the query succeeded.\n"
-        "\n"
-        "Using a higher verbosity level (-v), an overview of the written columns or all "
-        "processed leaves is printed depending on the verbosity level. The output looks "
-        "like the following\n"
-        "   Leaf name [root data type] (SQL name)\n"
-        "for example\n"
-        "   MTime.fTime.fMilliSec [Long64_t] (MilliSec)\n"
-        "which means that the leaf MTime.fTime.fMilliSec is detected to be a Long64_t "
-        "which is filled into a column called MilliSec. Leaves with non basic data types "
-        "are ignored automatically and are marked as (-n/a-). User ignored columns "
-        "are marked as (-ignored-).\n"
-        "\n"
-        "A constant value for the given file can be inserted by using the --const directive. "
-        "For example --const.mycolumn=42 would insert 42 into a column called mycolumn. "
-        "The column is created as INT UNSIGNED as default which can be altered by "
-        "--sql-type. A special case is a value of the form `/regex/format/`. Here, the given "
-        "regular expression is applied to the filename and it is newly formated with "
-        "the new format string. Uses the standard formatting rules to replace matches "
-        "(those used by ECMAScript's replace method).\n"
-        "\n"
-        "Usually the previously defined constant values are helpful to create an index "
-        "which relates unambiguously the inserted data to the file. It might be useful "
-        "to delete all data which belongs to this particular file before new data is "
-        "entered. This can be achieved with the `--delete` directive. It deletes all "
-        "data from the table before inserting new data which fulfills the condition "
-        "defined by the `--const` directives.\n"
-        "\n"
-        "The constant values can also be used for a conditional execution (--conditional). "
-        "If any row with the given constant values are found, the execution is stopped "
-        "(note that this happend after the table drop/create but before the delete/insert.\n"
-        "\n"
-        "To ensure efficient access for a conditonal execution, it makes sense to have "
-        "an index created for those columns. This can be done during table creation "
-        "with the --index option.\n"
-        "\n"
-        "To create the index as a UNIQUE INDEX, you can use the --unique option which "
-        "implies --index.\n"
-        "\n"
-        "If a query failed, the query is printed to stderr together with the error message. "
-        "For the main INSERT query, this is only true if the verbosity level is at least 2 "
-        "or the query has less than 80*25 bytes.\n"
+        "As a default, the first row in the file is considered to contain the column "
+        "names separated by a whitespace. Column names must not contain whitespaces "
+        "themselves and special characters (':') are replaced by an underscore. "
+        "If the first line contains the first data row, the --no-header directive "
+        "can be used to instruct the program to consider the first line as the first "
+        "data row and use it only for column count. The branch names in the tree "
+        "are then 'colN' where N is the column index starting from 0.\n"
+        "\n"
+        "Each consecutive row in the file is supposed to contain an identical number "
+        "of floating point values. Leading and trailing whitespaces are ignored. "
+        "Empty lines or lines starting with a '#' are discarded.\n"
+        "\n"
+        "Input and output file are given either as first and second positional argument "
+        "or with the --file and --out command line option. If no output file name is "
+        "provided then the input file is used instead and the extension replaced by .root. "
+        "The target tree name of the root file is given with the --tree command line "
+        "option or the third positional argument. The default tree name is 'Events'.\n"
+        "\n"
+        "As a default, existing files are not overwritten. If overwriting is intended, "
+        "it can be turned on with --force. To update an existing root file, the "
+        "--update option can be used. If a tree with the same name already exists, "
+        "the tree is updated. The compression level for a new root file can be set "
+        "with --compression.\n"
+        "\n"
+        "For several purposes, it might be convenient to split the output to several "
+        "different root-trees. This can be done using the --split-sequence (-S) "
+        "and the --split-quantile (-Q) options. If a split sequence is defined as "
+        "-S 1 -S 2 -S 1 the events are split by 1:2:1 in this sequence order. If "
+        "quantiles are given as -Q 0.5 -Q 0.6, the first tree will contain 50%, "
+        "the second one 10% and the third one 40% of the events. The corresponding seed value can "
+        "be set with --seed.\n"
         "\n"
         "In case of success, 0 is returned, a value>0 otherwise.\n"
         "\n"
-        "Usage: root2sql [options] -uri URI rootfile.root\n"
+        "Usage: csv2root input.csv [output.root] [-t tree] [-u] [-f] [-n] [-vN] [-cN]\n"
         "\n"
         ;
@@ -200,6 +118,8 @@
 
 
-void AddTree(vector<TTree*> &ttree, TFile &file, const string &tree, bool update, int verbose)
+bool AddTree(vector<TTree*> &ttree, TFile &file, const string &tree, bool update, int verbose)
 {
+    bool found = false;
+
     TTree *T = 0;
     if (update)
@@ -209,4 +129,5 @@
         {
             ttree.emplace_back(T);
+            found = true;
             if (verbose>0)
                 cout << "Updating tree: " << tree << endl;
@@ -215,4 +136,6 @@
     if (!T)
         ttree.emplace_back(new TTree(tree.c_str(), "csv2root"));
+
+    return found;
 }
 
@@ -237,5 +160,5 @@
     const bool force             = conf.Get<bool>("force");
     const bool update            = conf.Get<bool>("update");
-    const bool dryrun            = conf.Get<bool>("dry-run");
+//    const bool dryrun            = conf.Get<bool>("dry-run");
     const bool noheader          = conf.Get<bool>("no-header");
 
@@ -319,4 +242,5 @@
     }
 
+    buf = buf.Strip(TString::kBoth);
     TObjArray *title = buf.Tokenize(" ");
     if (title->GetEntries()==0)
@@ -335,5 +259,9 @@
 
     if (noheader)
+    {
         fin.seekg(0);
+        if (verbose>0)
+            cout << "No header line interpreted." << endl;
+    }
 
     // -------------------------------------------------------------------------
@@ -342,5 +270,5 @@
     gSystem->ExpandPathName(path);
 
-    if (!dryrun)
+//    if (!dryrun)
     {
         FileStat_t stat;
@@ -377,12 +305,26 @@
     vector<TTree*> ttree;
 
+    size_t entries = 0;
     if (num_split==0)
-        AddTree(ttree, tfile, tree, update, verbose);
+    {
+        if (AddTree(ttree, tfile, tree, update, verbose))
+        {
+            entries = ttree[0]->GetEntries();
+            if (verbose>0)
+                cout << "Tree has " << entries << " entries." << endl;
+        }
+    }
     else
     {
+        bool found = false;
         for (size_t i=0; i<num_split; i++)
-            AddTree(ttree, tfile, tree+"["+to_string(i)+"]", update, verbose);
-    }
-
+            found |= AddTree(ttree, tfile, tree+"["+to_string(i)+"]", update, verbose);
+
+        if (found && update)
+        {
+            cerr << "Trees can not be updated in split mode, only files!" << endl;
+            return 7;
+        }
+    }
 
     vector<float> vec(numcol);
@@ -406,4 +348,5 @@
 
     size_t line = 0;
+    size_t valid = 0;
 
     while (1)
@@ -413,9 +356,17 @@
             break;
 
+        line++;
+
+        buf = buf.Strip(TString::kBoth);
+        if (buf.IsNull() || buf[0]=='#')
+            continue;
+
+        valid++;
+
         TObjArray *arr = buf.Tokenize(" ");
         if (arr->GetEntries()!=numcol)
         {
             cerr << "Column count mismatch in line " << line+1 << "!" << endl;
-            return 6;
+            return 7;
         }
 
@@ -429,5 +380,5 @@
             {
                 cerr << "Conversion of '" << arr->At(i)->GetName() << "' failed!" << endl;
-                return 7;
+                return 8;
             }
         }
@@ -448,13 +399,18 @@
 
         ttree[index]->Fill();
-        line++;
     }
 
     if (verbose>0)
     {
-        cout << line << " data rows read from file." << endl;
-        for (size_t i=0; i<ttree.size(); i++)
-            cout << ttree[i]->GetEntries() << " rows filled into tree #" << i << "." << endl;
-    }
+        cout << valid << " data rows found in " << line << " lines (excl. title)." << endl;
+        if (!update || !entries)
+        {
+            for (size_t i=0; i<ttree.size(); i++)
+                cout << ttree[i]->GetEntries() << " rows filled into tree #" << i << "." << endl;
+        }
+    }
+
+    if (entries && entries!=line)
+        cerr << "\nWARNING - Number of updated entries does not match number of entries in tree!\n" << endl;
 
     for (auto it=ttree.begin(); it!=ttree.end(); it++)