== Random Forest == I am using '''Ranger''' a fast implementation of the random forest machine learning algorithm. The source code can be grabbed from here: https://github.com/imbs-hl/ranger or cloned from !GitHub via {{{ git clone https://github.com/imbs-hl/ranger.git }}} To compile Range then do {{{ cd ranger/cpp_version mkdir build cd build cmake .. make }}} == Writing Input Files == {{{#!Spoiler {{{#!cpp #include #include #include #include #include using namespace std; void writesim() { // Create chain for the tree Result // This is just easier than using TFile/TTree TChain c("Result"); // Add the input file to the c.AddFile("simulation.root"); // Define variables for all leaves to be accessed // By definition rootifysql writes only doubles double X, Y, MeanX, MeanY, Width, Length, CosDelta, SinDelta, M3Long, SlopeLong, Leakage1, SlopeSpreadWeighted, Size, ConcCore, ConcCOG, NumIslands, NumUsedPixels, Zd, Energy; // Connect the variables to the cordesponding leaves //c.SetBranchAddress("FileId", &FileId); //c.SetBranchAddress("EvtNumber", &EvtNumber); c.SetBranchAddress("X", &X); c.SetBranchAddress("Y", &Y); c.SetBranchAddress("MeanX", &MeanX); c.SetBranchAddress("MeanY", &MeanY); c.SetBranchAddress("Width", &Width); c.SetBranchAddress("Length", &Length); c.SetBranchAddress("CosDelta", &CosDelta); c.SetBranchAddress("SinDelta", &SinDelta); c.SetBranchAddress("M3Long", &M3Long); c.SetBranchAddress("SlopeLong", &SlopeLong); c.SetBranchAddress("Leakage1", &Leakage1); c.SetBranchAddress("NumIslands", &NumIslands); c.SetBranchAddress("NumUsedPixels", &NumUsedPixels); c.SetBranchAddress("Size", &Size); c.SetBranchAddress("Zd", &Zd); c.SetBranchAddress("Energy", &Energy); // Set some constants (they could be included in the database // in the future) double mm2deg = +0.0117193246260285378; //double abberation = 1.02; // -------------------- Source dependent parameter calculation ------------------- ofstream fout0("sim-train.csv"); // %1 ofstream fout1("sim-test.csv"); // %0 ofstream fout2("sim-test-cuts.csv"); fout0 << "Energy Size Zd Dist Disp Slope M3L Leakage Width Length" << endl; fout1 << "Energy Size Zd Dist Disp Slope M3L Leakage Width Length" << endl; fout2 << "Energy Size Zd Dist Disp Slope M3L Leakage Width Length" << endl; // Loop over all wobble positions in the camera for (int i=0; i5.5 && Leakage1<0.1; if (!cutq) continue; bool cut0 = area < log10(Size)*898-1535; if (!cut0) continue; fout2 << log10(Energy) << " "; fout2 << log10(Size) << " "; fout2 << Zd << " "; fout2 << dist << " "; fout2 << disp << " "; fout2 << slope << " "; fout2 << m3l << " "; fout2 << Leakage1 << " "; fout2 << Width << " "; fout2 << Length << endl; } } } }}} }}} == Training == {{{ fact@ihp-pc45:~/Analysis> nice -n 10 ~/ranger-master/cpp_version/build/ranger --file sim-train.csv --depvarname Energy --memmode 1 --treetype 3 --verbose --impmeasure 1 --outprefix sim-train Starting Ranger. Loading input file: sim-train.csv. Growing trees .. Computing prediction error .. Tree type: Regression Dependent variable name: Energy Dependent variable ID: 0 Number of trees: 500 Sample size: 55417 Number of independent variables: 9 Mtry: 3 Target node size: 5 Variable importance mode: 1 Memory mode: 1 Seed: 0 Number of threads: 8 Overall OOB prediction error: 0.0178514 Saved variable importance to file sim-train.importance. Saved prediction error to file sim-train.confusion. Finished Ranger. }}} It will write a file called '''sim-train.forest''' and {{{ user@machine> cat ranger_out.importance Size: 2764.52 Zd: 1198.02 Dist: 695.544 Disp: 163.821 Slope: 246.754 M3L: 277.035 Leakage: 118.393 Width: 420.597 Length: 565.094 }}} == Testing == {{{ nice -n 10 ~/ranger-master/cpp_version/build/ranger --file sim-test.csv --depvarname Energy --memmode 1 --treetype 3 --verbose --impmeasure 1 --predict sim-train.forest nice -n 10 ~/ranger-master/cpp_version/build/ranger --file sim-test-cuts.csv --depvarname Energy --memmode 1 --treetype 3 --verbose --impmeasure 1 --predict sim-train.forest }}} Here is an example output {{{ Starting Ranger. Loading input file: sim-test-cuts.csv. Loading forest from file sim-train.forest. Predicting .. Tree type: Regression Dependent variable name: Energy Dependent variable ID: 0 Number of trees: 500 Sample size: 5135 Number of independent variables: 9 Mtry: 3 Target node size: 5 Variable importance mode: 1 Memory mode: 1 Seed: 0 Number of threads: 8 Saved predictions to file ranger_out.prediction. Finished Ranger. }}} == Plotting Result == {{{#!Spoiler {{{#!cpp #include #include #include #include #include void readsim() { TH2F h2svse ("H_SvsE", "", 26*3, 2.2, 4.8, 3*30, 1.3, 4.3); TH2F h2est ("H_EstVsMC", "", 26*3, 2.2, 4.8, 26*3, 2.2, 4.8); TH2F h2bias ("H_BiasLog", "", 26*3, 2.2, 4.8, 100, -1, 1); TH2F h2lin ("H_BiasLin", "", 26*3, 2.2, 4.8, 100, -1, 2); TH2F h2size ("H_ResSize", "", 26*3, 2.2, 4.8, 100, -1, 1); TH2F h2dist ("H_ResDist", "", 50, 0, 2.5, 100, -1, 1); TH2F h2slope("H_ResSlope", "", 50, -10, 10, 100, -1, 1); TH2F h2zd ("H_ResZd", "", 90, 0, 90, 100, -1, 1); ifstream fin0("sim-test-cuts.csv"); ifstream fin1("sim-test-results-cuts.csv"); TString str; str.ReadLine(fin0); str.ReadLine(fin1); while (1) { if (!fin0 || !fin1) break; double Emc, Size, Zd, Dist, Disp, Slope, M3L, Leakage, Width, Length, Eest; fin0 >> Emc >> Size >> Zd >> Dist >> Disp >> Slope >> M3L >> Leakage >> Width >> Length; fin1 >> Eest; h2svse.Fill(Emc, Size); h2est.Fill( Emc, Eest); h2bias.Fill(Eest, Eest-Emc); h2lin.Fill( Eest, (pow(10, Eest)-pow(10, Emc))/pow(10, Emc)); h2size.Fill( Size, Eest-Emc); h2dist.Fill( Dist, Eest-Emc); h2slope.Fill(Slope, Eest-Emc); h2zd.Fill( Zd, Eest-Emc); } TF1 fx("f", "x", -100, 100); TF1 f0("f", "0", -100, 100); TCanvas *canv = new TCanvas("Canvas", "Energy estimation", 800, 900); canv->Divide(2,4); canv->cd(1); gPad->SetTopMargin(0.005); gPad->SetGridy(); h2size.SetStats(kFALSE); h2size.SetXTitle("lg(E_{mc}/GeV)"); h2size.SetYTitle("lg(E_{est}/GeV)-lg(E_{mc}/GeV)"); h2size.DrawCopy("colz"); f0.DrawCopy("same"); canv->cd(3); gPad->SetTopMargin(0.005); gPad->SetGridy(); h2dist.SetStats(kFALSE); h2dist.SetXTitle("Dist / degree"); h2dist.SetYTitle("lg(E_{est}/GeV)-lg(E_{mc}/GeV)"); h2dist.DrawCopy("colz"); f0.DrawCopy("same"); canv->cd(5); gPad->SetTopMargin(0.005); gPad->SetGridy(); h2slope.SetStats(kFALSE); h2slope.SetXTitle("Slope / ns/degree"); h2slope.SetYTitle("lg(E_{est}/GeV)-lg(E_{mc}/GeV)"); h2slope.DrawCopy("colz"); f0.DrawCopy("same"); canv->cd(7); gPad->SetTopMargin(0.005); gPad->SetGridy(); h2zd.SetStats(kFALSE); h2zd.SetXTitle("Zenith Distance / degree"); h2zd.SetYTitle("lg(E_{est}/GeV)-lg(E_{mc}/GeV)"); h2zd.DrawCopy("colz"); f0.DrawCopy("same"); canv->cd(2); gPad->SetTopMargin(0.005); gPad->SetGridy(); h2svse.SetStats(kFALSE); h2svse.SetXTitle("lg(Size)"); h2svse.SetYTitle("lg(E_{mc}/GeV)"); h2svse.DrawCopy("colz"); fx.DrawCopy("same"); canv->cd(4); gPad->SetTopMargin(0.005); gPad->SetGridy(); h2est.SetStats(kFALSE); h2est.SetXTitle("lg(E_{mc})"); h2est.SetYTitle("lg(E_{est}/GeV)"); h2est.DrawCopy("colz"); fx.DrawCopy("same"); canv->cd(6); gPad->SetTopMargin(0.005); gPad->SetGridy(); h2bias.SetStats(kFALSE); h2bias.SetXTitle("lg(E_{est}/GeV)"); h2bias.SetYTitle("lg(E_{est}/GeV)-lg(E_{mc}/GeV)"); h2bias.DrawCopy("colz"); f0.DrawCopy("same"); canv->cd(8); gPad->SetTopMargin(0.005); gPad->SetGridy(); TGraph grlin; TGraph grlog; for (int x=1; xGetRMS()>0) grlog.SetPoint(grlog.GetN(), h2bias.GetXaxis()->GetBinCenter(x), pow(10, p->GetRMS())-1); delete p; p = h2lin.ProjectionY("_py", x, x); if (p->GetRMS()>0) grlin.SetPoint(grlin.GetN(), h2lin.GetXaxis()->GetBinCenter(x), p->GetRMS()); delete p; } grlog.SetMarkerColor(kBlue); grlin.SetMinimum(0); grlin.GetHistogram()->SetXTitle("log(E_{est}/GeV)"); grlin.GetHistogram()->SetYTitle("RMS"); grlin.DrawClone("A*"); grlog.DrawClone("*"); } }}} }}} == Result == I trained without any cuts applied. This is how the result looks like if the random forest prediction is applied to the test sample with cuts applied. [[Image(Result.png)]]