wiki:DatabaseBasedAnalysis/RandomForest

Version 2 (modified by tbretz, 6 years ago) ( diff )

--

Writing Input Files

#include <iostream>
#include <iomanip>
#include <fstream>

#include <TMath.h>
#include <TChain.h>

using namespace std;

// Write one event as a space-separated row to 'out'. The order of the
// arguments matches the header line written by writesim():
//   Energy Size Zd Dist Disp Slope M3L Leakage Width Length
static void writeRow(std::ostream &out,
                     double logEnergy, double logSize, double zd,
                     double dist, double disp, double slope, double m3l,
                     double leakage, double width, double length)
{
    // '\n' instead of endl: identical file content, but the stream is
    // not flushed after every single event
    out << logEnergy << ' ' << logSize << ' ' << zd << ' '
        << dist << ' ' << disp << ' ' << slope << ' ' << m3l << ' '
        << leakage << ' ' << width << ' ' << length << '\n';
}

// Read the tree "Result" from simulation.root, calculate the source
// dependent image parameters for each event and write them as
// space-separated tables:
//   sim-train.csv      events with an even entry index
//   sim-test.csv       events with an odd entry index
//   sim-test-cuts.csv  the events of sim-test.csv which survive the
//                      cuts applied at the end of the loop
// Events for which the output would contain nan/inf (division by zero
// or log10 of a non-positive value) are skipped and counted.
void writesim()
{
    // Create chain for the tree Result
    // This is just easier than using TFile/TTree
    TChain c("Result");

    // Add the input file to the chain
    c.AddFile("simulation.root");

    // Define variables for all leaves to be accessed
    // By definition rootifysql writes only doubles
    // (initialized, so that a branch which could not be connected
    // does not leave its variable with an undefined value)
    double X=0, Y=0, MeanX=0, MeanY=0, Width=0, Length=0,
        CosDelta=0, SinDelta=0, M3Long=0, SlopeLong=0, Leakage1=0,
        Size=0, NumIslands=0, NumUsedPixels=0, Zd=0, Energy=0;

    // Connect the variables to the corresponding leaves
    //c.SetBranchAddress("FileId", &FileId);
    //c.SetBranchAddress("EvtNumber", &EvtNumber);
    c.SetBranchAddress("X", &X);
    c.SetBranchAddress("Y", &Y);
    c.SetBranchAddress("MeanX", &MeanX);
    c.SetBranchAddress("MeanY", &MeanY);
    c.SetBranchAddress("Width", &Width);
    c.SetBranchAddress("Length", &Length);
    c.SetBranchAddress("CosDelta", &CosDelta);
    c.SetBranchAddress("SinDelta", &SinDelta);
    c.SetBranchAddress("M3Long", &M3Long);
    c.SetBranchAddress("SlopeLong", &SlopeLong);
    c.SetBranchAddress("Leakage1", &Leakage1);
    c.SetBranchAddress("NumIslands", &NumIslands);
    c.SetBranchAddress("NumUsedPixels", &NumUsedPixels);
    c.SetBranchAddress("Size", &Size);
    c.SetBranchAddress("Zd", &Zd);
    c.SetBranchAddress("Energy", &Energy);

    // Set some constants (they could be included in the database
    // in the future)
    const double mm2deg = +0.0117193246260285378;

    // Scale factor applied to the source position (X/Y).
    // NOTE(review): presumably this is the aberration correction; the
    // comments in the original macro mention 1.02, the code uses 1.022
    const double aberration = 1.022;

    // Rotation of the source position in the camera. The angle is
    // fixed, so sine and cosine are calculated only once.
    const int angle = 0;

    const double cr = std::cos(angle*TMath::DegToRad());
    const double sr = std::sin(angle*TMath::DegToRad());

    // Get the number of events once instead of in every loop iteration
    const long long numEntries = c.GetEntries();
    if (numEntries<=0)
    {
        std::cerr << "writesim: no entries found in tree 'Result' of simulation.root" << std::endl;
        return;
    }

    // ------------------------------- Output files ---------------------------------

    std::ofstream fout0("sim-train.csv"); // %1
    std::ofstream fout1("sim-test.csv");  // %0
    std::ofstream fout2("sim-test-cuts.csv");

    if (!fout0 || !fout1 || !fout2)
    {
        std::cerr << "writesim: could not open all output files for writing" << std::endl;
        return;
    }

    const char *header = "Energy Size Zd Dist Disp Slope M3L Leakage Width Length";

    fout0 << header << '\n';
    fout1 << header << '\n';
    fout2 << header << '\n';

    long long numSkipped = 0;

    // Loop over all events in the chain
    for (long long i=0; i<numEntries; i++)
    {
        // read the i-th event from the file
        // (0: entry does not exist, <0: I/O error)
        if (c.GetEntry(i)<=0)
        {
            numSkipped++;
            continue;
        }

        // -------------------- Source dependent parameter calculation -------------------

        const double px = cr*X-sr*Y;
        const double py = cr*Y+sr*X;

        const double dx = MeanX - px*aberration;
        const double dy = MeanY - py*aberration;

        const double norm = std::sqrt(dx*dx + dy*dy);

        // norm and Length are used as divisors, Size and Energy as
        // arguments of log10. Written as !(x>0) so that nan is rejected
        // as well. Such events would only put nan/inf into the tables.
        if (!(norm>0) || !(Length>0) || !(Size>0) || !(Energy>0))
        {
            numSkipped++;
            continue;
        }

        const double dist = norm*mm2deg;

        // Clamp to [-1;1] to protect against rounding effects
        const double lx = std::min(std::max((CosDelta*dy - SinDelta*dx)/norm, -1.), 1.);
        const double ly = std::min(std::max((CosDelta*dx + SinDelta*dy)/norm, -1.), 1.);

        const double sgn = TMath::Sign(1., ly);

        // ------------------------------- Application ----------------------------------

        const double m3l   = M3Long*sgn*mm2deg;
        const double slope = SlopeLong*sgn/mm2deg;

        // --------------------------------- Analysis -----------------------------------

        //double xi = 1.34723 + 0.15214 *slope + 0.970704*(1-1/(1+8.89826*Leakage1));
        const double xi = 1.340 + 0.0755*slope + 1.67972*(1-1/(1+4.86232*Leakage1));

        const double sign1 = m3l+0.07;
        const double sign2 = (dist-0.5)*7.2-slope;

        const double disp = (sign1<0 || sign2<0 ? -xi : xi)*(1-Width/Length);

        const double logEnergy = std::log10(Energy);
        const double logSize   = std::log10(Size);

        // ---------------------------------- Output ------------------------------------

        // Even entries go to the training sample ...
        if (i%2==0)
        {
            writeRow(fout0, logEnergy, logSize, Zd, dist, disp, slope, m3l,
                     Leakage1, Width, Length);
            continue;
        }

        // ... odd entries to the test sample
        writeRow(fout1, logEnergy, logSize, Zd, dist, disp, slope, m3l,
                 Leakage1, Width, Length);

        // The remaining part selects the events for sim-test-cuts.csv

        // sqrt(1-lx*lx) is cos(alpha) for alpha=asin(lx)
        const double thetasq = disp*disp + dist*dist - 2*disp*dist*std::sqrt(1-lx*lx);

        // NOTE(review): this rejects the events with thetasq below 0.024
        // and keeps the ones above. Compared to the other two cuts this
        // looks inverted -- please confirm that it is intended. The
        // behaviour of the original macro is kept unchanged here.
        if (thetasq<0.024)
            continue;

        const bool cutq = NumIslands<3.5 && NumUsedPixels>5.5 && Leakage1<0.1;
        if (!cutq)
            continue;

        // The abberation correction does increase also Width and Length by 1.02
        const double area = TMath::Pi()*Width*Length;

        const bool cut0 = area < logSize*898-1535;
        if (!cut0)
            continue;

        writeRow(fout2, logEnergy, logSize, Zd, dist, disp, slope, m3l,
                 Leakage1, Width, Length);
    }

    if (numSkipped>0)
        std::cerr << "writesim: skipped " << numSkipped << " unreadable or degenerate event(s)" << std::endl;

    if (!fout0 || !fout1 || !fout2)
        std::cerr << "writesim: an error occurred while writing the output files" << std::endl;
}

Training

fact@ihp-pc45:~/Analysis> nice -n 10 ~/ranger-master/cpp_version/build/ranger --file sim-train.csv --depvarname Energy --memmode 1 --treetype 3 --verbose --impmeasure 1 --outprefix sim-train
Starting Ranger.
Loading input file: sim-train.csv.
Growing trees ..
Computing prediction error ..

Tree type:                         Regression
Dependent variable name:           Energy
Dependent variable ID:             0
Number of trees:                   500
Sample size:                       55417
Number of independent variables:   9
Mtry:                              3
Target node size:                  5
Variable importance mode:          1
Memory mode:                       1
Seed:                              0
Number of threads:                 8

Overall OOB prediction error:      0.0178514

Saved variable importance to file sim-train.importance.
Saved prediction error to file sim-train.confusion.
Finished Ranger.

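It will write a file called sim-train.forest, which contains the trained forest (note that ranger saves the forest only if the option --write is given), and a file with the importance of each variable: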

user@machine> cat sim-train.importance
Size: 2764.52
Zd: 1198.02
Dist: 695.544
Disp: 163.821
Slope: 246.754
M3L: 277.035
Leakage: 118.393
Width: 420.597
Length: 565.094

Testing

nice -n 10 ~/ranger-master/cpp_version/build/ranger --file sim-test.csv      --depvarname Energy --memmode 1 --treetype 3 --verbose --impmeasure 1 --predict sim-train.forest 
nice -n 10 ~/ranger-master/cpp_version/build/ranger --file sim-test-cuts.csv --depvarname Energy --memmode 1 --treetype 3 --verbose --impmeasure 1 --predict sim-train.forest 

Here is an example output

Starting Ranger.
Loading input file: sim-test-cuts.csv.
Loading forest from file sim-train.forest.
Predicting ..

Tree type:                         Regression
Dependent variable name:           Energy
Dependent variable ID:             0
Number of trees:                   500
Sample size:                       5135
Number of independent variables:   9
Mtry:                              3
Target node size:                  5
Variable importance mode:          1
Memory mode:                       1
Seed:                              0
Number of threads:                 8

Saved predictions to file ranger_out.prediction.
Finished Ranger.

Attachments (1)

Download all attachments as: .zip

Note: See TracWiki for help on using the wiki.