== Random Forest == I am using '''Ranger''' a fast implementation of the random forest machine learning algorithm. The documentation and the paper can be found here: https://doi.org/10.18637/jss.v077.i01. Quoting from their documentation ```ranger is a fast implementation of random forests (Breiman 2001) or recursive partitioning, particularly suited for high dimensional data. Classification, regression, and survival forests are supported. Classification and regression forests are implemented as in the original Random Forest (Breiman 2001), survival forests as in Random Survival Forests (Ishwaran et al. 2008). Includes implementations of extremely randomized trees (Geurts et al. 2006) and quantile regression forests (Meinshausen 2006).``` The source code can be grabbed from here: https://github.com/imbs-hl/ranger (where also some additional information are available) or cloned from !GitHub via {{{ git clone https://github.com/imbs-hl/ranger.git }}} To compile Range then do {{{ cd ranger/cpp_version mkdir build cd build cmake .. make }}} Here is the full output (on an Ubuntu 18.04) {{{#!Spoiler {{{ user@machine:~> git clone https://github.com/imbs-hl/ranger.git Cloning into 'ranger'... remote: Counting objects: 8000, done. remote: Compressing objects: 100% (32/32), done. remote: Total 8000 (delta 6), reused 5 (delta 3), pack-reused 7965 Receiving objects: 100% (8000/8000), 19.88 MiB | 2.30 MiB/s, done. Resolving deltas: 100% (6172/6172), done. user@machine:~> cd ranger user@machine:~/ranger> cd cpp_version/ user@machine:~/ranger/cpp_version> mkdir build user@machine:~/ranger/cpp_version> cd build user@machine:~/ranger/cpp_version/build> cmake .. -- The C compiler identification is GNU 7.3.0 -- The CXX compiler identification is GNU 7.3.0 -- Check for working C compiler: /usr/bin/cc -- Check for working C compiler: /usr/bin/cc -- works -- Detecting C compiler ABI info -- Detecting C compiler ABI info - done -- Detecting C compile features -- Detecting C compile features - done -- Check for working CXX compiler: /usr/bin/c++ -- Check for working CXX compiler: /usr/bin/c++ -- works -- Detecting CXX compiler ABI info -- Detecting CXX compiler ABI info - done -- Detecting CXX compile features -- Detecting CXX compile features - done CMake Warning (dev) at CMakeLists.txt:2 (cmake_minimum_required): Compatibility with CMake < 2.4 is not supported by CMake >= 3.0. This warning is for project developers. Use -Wno-dev to suppress it. -- Performing Test COMPILER_SUPPORTS_CXX11 -- Performing Test COMPILER_SUPPORTS_CXX11 - Success Compiler with C++11 support found. -- Configuring done -- Generating done -- Build files have been written to: /home/fact/ranger/cpp_version/build user@machine:~/ranger/cpp_version/build> make -j8 Scanning dependencies of target ranger [ 23%] Building CXX object CMakeFiles/ranger.dir/src/Forest/ForestProbability.o [ 23%] Building CXX object CMakeFiles/ranger.dir/src/Forest/ForestClassification.o [ 29%] Building CXX object CMakeFiles/ranger.dir/src/Forest/ForestSurvival.o [ 23%] Building CXX object CMakeFiles/ranger.dir/src/Forest/Forest.o [ 23%] Building CXX object CMakeFiles/ranger.dir/src/Forest/ForestRegression.o [ 35%] Building CXX object CMakeFiles/ranger.dir/src/Tree/Tree.o [ 41%] Building CXX object CMakeFiles/ranger.dir/src/Tree/TreeClassification.o [ 47%] Building CXX object CMakeFiles/ranger.dir/src/Tree/TreeProbability.o [ 52%] Building CXX object CMakeFiles/ranger.dir/src/Tree/TreeRegression.o [ 58%] Building CXX object CMakeFiles/ranger.dir/src/Tree/TreeSurvival.o [ 64%] Building CXX object CMakeFiles/ranger.dir/src/main.o [ 70%] Building CXX object CMakeFiles/ranger.dir/src/utility/ArgumentHandler.o [ 76%] Building CXX object CMakeFiles/ranger.dir/src/utility/Data.o [ 82%] Building CXX object CMakeFiles/ranger.dir/src/utility/DataChar.o [ 88%] Building CXX object CMakeFiles/ranger.dir/src/utility/DataFloat.o [ 94%] Building CXX object CMakeFiles/ranger.dir/src/utility/utility.o [100%] Linking CXX executable ranger [100%] Built target ranger user@machine:~/ranger/cpp_version/build> }}} }}} == Writing Input Files == Now we have the random forest, now we need the data. I assume that you have created a root-file with the simulation data you want to use already which is named '''simulation.root'''. {{{#!Spoiler {{{#!cpp #include #include #include #include #include using namespace std; void writesim() { // Create chain for the tree Result // This is just easier than using TFile/TTree TChain c("Result"); // Add the input file to the c.AddFile("simulation.root"); // Define variables for all leaves to be accessed // By definition rootifysql writes only doubles double X, Y, MeanX, MeanY, Width, Length, CosDelta, SinDelta, M3Long, SlopeLong, Leakage1, SlopeSpreadWeighted, Size, ConcCore, ConcCOG, NumIslands, NumUsedPixels, Zd, Energy; // Connect the variables to the cordesponding leaves c.SetBranchAddress("X", &X); c.SetBranchAddress("Y", &Y); c.SetBranchAddress("MeanX", &MeanX); c.SetBranchAddress("MeanY", &MeanY); c.SetBranchAddress("Width", &Width); c.SetBranchAddress("Length", &Length); c.SetBranchAddress("CosDelta", &CosDelta); c.SetBranchAddress("SinDelta", &SinDelta); c.SetBranchAddress("M3Long", &M3Long); c.SetBranchAddress("SlopeLong", &SlopeLong); c.SetBranchAddress("Leakage1", &Leakage1); c.SetBranchAddress("NumIslands", &NumIslands); c.SetBranchAddress("NumUsedPixels", &NumUsedPixels); c.SetBranchAddress("Size", &Size); c.SetBranchAddress("Zd", &Zd); c.SetBranchAddress("Energy", &Energy); // Set some constants (they could be included in the database // in the future) double mm2deg = +0.0117193246260285378; // -------------------- Source dependent parameter calculation ------------------- ofstream fout0("sim-train.csv"); // %1 ofstream fout1("sim-test.csv"); // %0 ofstream fout2("sim-test-cuts.csv"); fout0 << "Energy Size Zd Dist Disp Slope M3L Leakage Width Length" << endl; fout1 << "Energy Size Zd Dist Disp Slope M3L Leakage Width Length" << endl; fout2 << "Energy Size Zd Dist Disp Slope M3L Leakage Width Length" << endl; // Loop over all wobble positions in the camera for (int i=0; i5.5 && Leakage1<0.1; if (!cutq) continue; bool cut0 = area < log10(Size)*898-1535; if (!cut0) continue; fout2 << log10(Energy) << " "; fout2 << log10(Size) << " "; fout2 << Zd << " "; fout2 << dist << " "; fout2 << disp << " "; fout2 << slope << " "; fout2 << m3l << " "; fout2 << Leakage1 << " "; fout2 << Width << " "; fout2 << Length << endl; } } } }}} }}} == Training == Now we train the random forest with the train data using ranger. I am using mainly the default setup. An improved setup might still be possible. {{{ user@machine> nice -n 10 ranger --file sim-train.csv --depvarname Energy --memmode 1 --treetype 3 --verbose --impmeasure 1 --outprefix sim-train --write Starting Ranger. Loading input file: sim-train.csv. Growing trees .. Computing prediction error .. Tree type: Regression Dependent variable name: Energy Dependent variable ID: 0 Number of trees: 500 Sample size: 55417 Number of independent variables: 9 Mtry: 3 Target node size: 5 Variable importance mode: 1 Memory mode: 1 Seed: 0 Number of threads: 8 Overall OOB prediction error: 0.0178514 Saved variable importance to file sim-train.importance. Saved prediction error to file sim-train.confusion. Finished Ranger. }}} It will write a file called '''sim-train.forest''' with the forest and give you a list of the importance of the parameters {{{ user@machine> cat ranger_out.importance Size: 2764.52 Zd: 1198.02 Dist: 695.544 Disp: 163.821 Slope: 246.754 M3L: 277.035 Leakage: 118.393 Width: 420.597 Length: 565.094 }}} == Testing == Now we can apply the forest to out test data. Once to all data and once to the data after background suppression: {{{ nice -n 10 ranger --verbose --predict sim-train.forest --file sim-test.csv nice -n 10 ranger --verbose --predict sim-train.forest --file sim-test-cuts.csv }}} Here is an example output {{{ Starting Ranger. Loading input file: sim-test-cuts.csv. Loading forest from file sim-train.forest. Predicting .. [...] Saved predictions to file ranger_out.prediction. Finished Ranger. }}} == Plotting Result == The predicted values have been written to '''ranger_out.prediction'''. We can read them now together with the test-data and do some control plots. Here is an example ROOT macro which plots the result for the test-data after background suppresion cuts. {{{#!Spoiler {{{#!cpp #include #include #include #include #include void readsim() { TH2F h2svse ("H_SvsE", "", 26*3, 2.2, 4.8, 3*30, 1.3, 4.3); TH2F h2est ("H_EstVsMC", "", 26*3, 2.2, 4.8, 26*3, 2.2, 4.8); TH2F h2bias ("H_BiasLog", "", 26, 2.2, 4.8, 100, -1, 1); TH2F h2lin ("H_BiasLin", "", 26, 2.2, 4.8, 100, -1, 2); TH2F h2size ("H_ResSize", "", 26*3, 2.2, 4.8, 100, -1, 1); TH2F h2dist ("H_ResDist", "", 50, 0, 2.5, 100, -1, 1); TH2F h2slope("H_ResSlope", "", 50, -10, 10, 100, -1, 1); TH2F h2zd ("H_ResZd", "", 90, 0, 90, 100, -1, 1); ifstream fin0("sim-test-cuts.csv"); ifstream fin1("sim-test-results-cuts.csv"); TString str; str.ReadLine(fin0); str.ReadLine(fin1); while (1) { double Emc, Size, Zd, Dist, Disp, Slope, M3L, Leakage, Width, Length, Eest; fin0 >> Emc >> Size >> Zd >> Dist >> Disp >> Slope >> M3L >> Leakage >> Width >> Length; fin1 >> Eest; if (!fin0 || !fin1) break; h2svse.Fill(Emc, Size); h2est.Fill( Emc, Eest); h2bias.Fill(Eest, Eest-Emc); h2lin.Fill( Eest, (pow(10, Eest)-pow(10, Emc))/pow(10, Emc)); h2size.Fill( Size, Eest-Emc); h2dist.Fill( Dist, Eest-Emc); h2slope.Fill(Slope, Eest-Emc); h2zd.Fill( Zd, Eest-Emc); } TF1 fx("f", "x", -100, 100); TF1 f0("f", "0", -100, 100); TCanvas *canv = new TCanvas("Canvas", "Energy estimation", 800, 900); canv->Divide(2,4); canv->cd(1); gPad->SetTopMargin(0.005); gPad->SetGridy(); h2size.SetStats(kFALSE); h2size.SetXTitle("lg(E_{mc}/GeV)"); h2size.SetYTitle("lg(E_{est}/GeV)-lg(E_{mc}/GeV)"); h2size.DrawCopy("colz"); f0.DrawCopy("same"); canv->cd(3); gPad->SetTopMargin(0.005); gPad->SetGridy(); h2dist.SetStats(kFALSE); h2dist.SetXTitle("Dist / degree"); h2dist.SetYTitle("lg(E_{est}/GeV)-lg(E_{mc}/GeV)"); h2dist.DrawCopy("colz"); f0.DrawCopy("same"); canv->cd(5); gPad->SetTopMargin(0.005); gPad->SetGridy(); h2slope.SetStats(kFALSE); h2slope.SetXTitle("Slope / ns/degree"); h2slope.SetYTitle("lg(E_{est}/GeV)-lg(E_{mc}/GeV)"); h2slope.DrawCopy("colz"); f0.DrawCopy("same"); canv->cd(7); gPad->SetTopMargin(0.005); gPad->SetGridy(); h2zd.SetStats(kFALSE); h2zd.SetXTitle("Zenith Distance / degree"); h2zd.SetYTitle("lg(E_{est}/GeV)-lg(E_{mc}/GeV)"); h2zd.DrawCopy("colz"); f0.DrawCopy("same"); canv->cd(2); gPad->SetTopMargin(0.005); gPad->SetGridy(); h2svse.SetStats(kFALSE); h2svse.SetXTitle("lg(Size)"); h2svse.SetYTitle("lg(E_{mc}/GeV)"); h2svse.DrawCopy("colz"); fx.DrawCopy("same"); canv->cd(4); gPad->SetTopMargin(0.005); gPad->SetGridy(); h2est.SetStats(kFALSE); h2est.SetXTitle("lg(E_{mc})"); h2est.SetYTitle("lg(E_{est}/GeV)"); h2est.DrawCopy("colz"); fx.DrawCopy("same"); canv->cd(6); gPad->SetTopMargin(0.005); gPad->SetGridy(); h2bias.SetStats(kFALSE); h2bias.SetXTitle("lg(E_{est}/GeV)"); h2bias.SetYTitle("lg(E_{est}/GeV)-lg(E_{mc}/GeV)"); h2bias.DrawCopy("colz"); f0.DrawCopy("same"); canv->cd(8); gPad->SetTopMargin(0.005); gPad->SetGridy(); TGraph grlin; TGraph grlog; for (int x=1; xGetRMS()>0) grlog.SetPoint(grlog.GetN(), h2bias.GetXaxis()->GetBinCenter(x), pow(10, p->GetRMS())-1); delete p; p = h2lin.ProjectionY("_py", x, x); if (p->GetRMS()>0) grlin.SetPoint(grlin.GetN(), h2lin.GetXaxis()->GetBinCenter(x), p->GetRMS()); delete p; } grlog.SetMarkerColor(kBlue); grlin.SetMinimum(0); grlin.SetMaximum(0.38); grlin.GetHistogram()->SetXTitle("log(E_{est}/GeV)"); grlin.GetHistogram()->SetYTitle("RMS"); grlin.DrawClone("A*"); grlog.DrawClone("*"); } }}} }}} == Result == And here are the plots which were produced: [[Image(Result.png)]]