/* ======================================================================== *\ ! ! * ! * This file is part of MARS, the MAGIC Analysis and Reconstruction ! * Software. It is distributed to you in the hope that it can be a useful ! * and timesaving tool in analysing Data of imaging Cerenkov telescopes. ! * It is distributed WITHOUT ANY WARRANTY. ! * ! * Permission to use, copy, modify and distribute this software and its ! * documentation for any purpose is hereby granted without fee, ! * provided that the above copyright notice appear in all copies and ! * that both that copyright notice and this permission notice appear ! * in supporting documentation. It is provided "as is" without express ! * or implied warranty. ! * ! ! ! Author(s): Thomas Hengstebeck 3/2003 ! ! Copyright: MAGIC Software Development, 2000-2005 ! ! \* ======================================================================== */ ///////////////////////////////////////////////////////////////////////////// // // MRanTree // // ParameterContainer for Tree structure // ///////////////////////////////////////////////////////////////////////////// #include "MRanTree.h" #include #include #include #include #include "MArrayI.h" #include "MArrayF.h" #include "MLog.h" #include "MLogManip.h" ClassImp(MRanTree); using namespace std; // -------------------------------------------------------------------------- // Default constructor. // MRanTree::MRanTree(const char *name, const char *title):fClassify(kTRUE),fNdSize(0), fNumTry(3) { fName = name ? name : "MRanTree"; fTitle = title ? title : "Storage container for structure of a single tree"; } // -------------------------------------------------------------------------- // Copy constructor // MRanTree::MRanTree(const MRanTree &tree) { fName = tree.fName; fTitle = tree.fTitle; fClassify = tree.fClassify; fNdSize = tree.fNdSize; fNumTry = tree.fNumTry; fNumNodes = tree.fNumNodes; fNumEndNodes = tree.fNumEndNodes; fBestVar = tree.fBestVar; fTreeMap1 = tree.fTreeMap1; fTreeMap2 = tree.fTreeMap2; fBestSplit = tree.fBestSplit; fGiniDec = tree.fGiniDec; } void MRanTree::SetNdSize(Int_t n) { // threshold nodesize of terminal nodes, i.e. the training data is splitted // until there is only pure date in the subsets(=terminal nodes) or the // subset size is LE n fNdSize=TMath::Max(1,n);//at least 1 event per node } void MRanTree::SetNumTry(Int_t n) { // number of trials in random split selection: // choose at least 1 variable to split in fNumTry=TMath::Max(1,n); } void MRanTree::GrowTree(TMatrix *mat, const MArrayF &hadtrue, const MArrayI &idclass, MArrayI &datasort, const MArrayI &datarang, const MArrayF &tclasspop, const Float_t &mean, const Float_t &square, const MArrayI &jinbag, const MArrayF &winbag, const int nclass) { // arrays have to be initialized with generous size, so number of total nodes (nrnodes) // is estimated for worst case const Int_t numdim =mat->GetNcols(); const Int_t numdata=winbag.GetSize(); const Int_t nrnodes=2*numdata+1; // number of events in bootstrap sample Int_t ninbag=0; for (Int_t n=0;nInteger(mdim); const Int_t mn = mvar*numdata; // Gini index = rrn/rrd+rln/rld Double_t rrn=pno; Double_t rrd=pdo; Double_t rln=0; Double_t rld=0; MArrayF wl(nclass); // left node //nclass MArrayF wr(tclasspop); // right node//nclass Double_t critvar=-1.0e20; for(Int_t nsp=ndstart;nsp<=ndend-1;nsp++) { const Int_t &nc = datasort[mn+nsp]; const Int_t &k = idclass[nc]; const Float_t &u = winbag[nc]; // do classification, Gini index as split rule rln+=u*( 2*wl[k]+u); rrn+=u*(-2*wr[k]+u); rld+=u; rrd-=u; wl[k]+=u; wr[k]-=u; if (datarang[mn+nc]>=datarang[mn+datasort[mn+nsp+1]]) continue; if (TMath::Min(rrd,rld)<=1.0e-5) continue; const Double_t crit=(rln/rld)+(rrn/rrd); if (crit<=critvar) continue; nbestvar=nsp; critvar=crit; } if (critvar<=critmax) continue; msplit=mvar; nbest=nbestvar; critmax=critvar; } decsplit=critmax-crit0; return critmax<-1.0e10 ? 1 : 0; } int MRanTree::FindBestSplitSigma(const MArrayI &datasort,const MArrayI &datarang, const MArrayF &hadtrue, const MArrayI &idclass, Int_t ndstart,Int_t ndend, const MArrayF &tclasspop, const Float_t &mean, const Float_t &square, Int_t &msplit, Float_t &decsplit,Int_t &nbest, const MArrayF &winbag, const int nclass) { const Int_t nrnodes = fBestSplit.GetSize(); const Int_t numdata = (nrnodes-1)/2; const Int_t mdim = fGiniDec.GetSize(); // For the best split, msplit is the index of the variable (e.g Hillas par., zenith angle ,...) // split on. decsplit is the decreae in impurity measured by Gini-index. // nsplit is the case number of value of msplit split on, // and nsplitnext is the case number of the next larger value of msplit. Int_t nbestvar=0; // compute initial values of numerator and denominator of split-index, // resolution //Double_t pno=-(tclasspop[0]*square-mean*mean)*tclasspop[0]; //Double_t pdo= (tclasspop[0]-1.)*mean*mean; // n*resolution //Double_t pno=-(tclasspop[0]*square-mean*mean)*tclasspop[0]; //Double_t pdo= mean*mean; // variance //Double_t pno=-(square-mean*mean/tclasspop[0]); //Double_t pdo= (tclasspop[0]-1.); // n*variance Double_t pno= (square-mean*mean/tclasspop[0]); Double_t pdo= 1.; // 1./(n*variance) //Double_t pno= 1.; //Double_t pdo= (square-mean*mean/tclasspop[0]); const Double_t crit0=pno/pdo; // start main loop through variables to find best split, Double_t critmin=1.0e40; // random split selection, number of trials = fNumTry for (Int_t mt=0; mtInteger(mdim); const Int_t mn = mvar*numdata; Double_t rrn=0, rrd=0, rln=0, rld=0; Double_t esumr =mean; Double_t e2sumr=square; Double_t esuml =0; Double_t e2suml=0; float wl=0.;// left node float wr=tclasspop[0]; // right node Double_t critvar=critmin; for(Int_t nsp=ndstart;nsp<=ndend-1;nsp++) { const Int_t &nc=datasort[mn+nsp]; const Float_t &f=hadtrue[nc];; const Float_t &u=winbag[nc]; e2sumr-=u*f*f; esumr -=u*f; wr -=u; //------------------------------------------- // resolution //rrn=(wr*e2sumr-esumr*esumr)*wr; //rrd=(wr-1.)*esumr*esumr; // resolution times n //rrn=(wr*e2sumr-esumr*esumr)*wr; //rrd=esumr*esumr; // sigma //rrn=(e2sumr-esumr*esumr/wr); //rrd=(wr-1.); // sigma times n rrn=(e2sumr-esumr*esumr/wr); rrd=1.; // 1./(n*variance) //rrn=1.; //rrd=(e2sumr-esumr*esumr/wr); //------------------------------------------- e2suml+=u*f*f; esuml +=u*f; wl +=u; //------------------------------------------- // resolution //rln=(wl*e2suml-esuml*esuml)*wl; //rld=(wl-1.)*esuml*esuml; // resolution times n //rln=(wl*e2suml-esuml*esuml)*wl; //rld=esuml*esuml; // sigma //rln=(e2suml-esuml*esuml/wl); //rld=(wl-1.); // sigma times n rln=(e2suml-esuml*esuml/wl); rld=1.; // 1./(n*variance) //rln=1.; //rld=(e2suml-esuml*esuml/wl); //------------------------------------------- if (datarang[mn+nc]>=datarang[mn+datasort[mn+nsp+1]]) continue; if (TMath::Min(rrd,rld)<=1.0e-5) continue; const Double_t crit=(rln/rld)+(rrn/rrd); if (crit>=critvar) continue; nbestvar=nsp; critvar=crit; } if (critvar>=critmin) continue; msplit=mvar; nbest=nbestvar; critmin=critvar; } decsplit=crit0-critmin; //return critmin>1.0e20 ? 1 : 0; return decsplit<0 ? 1 : 0; } void MRanTree::MoveData(MArrayI &datasort,Int_t ndstart, Int_t ndend, MArrayI &idmove,MArrayI &ncase,Int_t msplit, Int_t nbest,Int_t &ndendl) { // This is the heart of the BuildTree construction. Based on the best split // the data in the part of datasort corresponding to the current node is moved to the // left if it belongs to the left child and right if it belongs to the right child-node. const Int_t numdata = ncase.GetSize(); const Int_t mdim = fGiniDec.GetSize(); MArrayI tdatasort(numdata); // compute idmove = indicator of case nos. going left for (Int_t nsp=ndstart;nsp<=ndend;nsp++) { const Int_t &nc=datasort[msplit*numdata+nsp]; idmove[nc]= nsp<=nbest?1:0; } ndendl=nbest; // shift case. nos. right and left for numerical variables. for(Int_t msh=0;mshncur) break; if (nodestatus[kbuild]!=2) continue; // initialize for next call to FindBestSplit const Int_t ndstart=nodestart[kbuild]; const Int_t ndend=ndstart+nodepop[kbuild]-1; for (Int_t j=0;j*FindBestSplit)(datasort,datarang,hadtrue,idclass,ndstart, ndend, lclasspop,mean[kbuild],square[kbuild],msplit,decsplit, nbest,winbag,nclass)) { nodestatus[kbuild]=-1; continue; } fBestVar[kbuild]=msplit; fGiniDec[msplit]+=decsplit; bestsplit[kbuild]=datasort[msplit*numdata+nbest]; bestsplitnext[kbuild]=datasort[msplit*numdata+nbest+1]; Int_t ndendl; MoveData(datasort,ndstart,ndend,idmove,ncase, msplit,nbest,ndendl); // leftnode no.= ncur+1, rightnode no. = ncur+2. nodepop[ncur+1]=ndendl-ndstart+1; nodepop[ncur+2]=ndend-ndendl; nodestart[ncur+1]=ndstart; nodestart[ncur+2]=ndendl+1; // find class populations in both nodes for (Int_t n=ndstart;n<=ndendl;n++) { const Int_t &nc=ncase[n]; const int j=idclass[nc]; mean[ncur+1]+=hadtrue[nc]*winbag[nc]; square[ncur+1]+=hadtrue[nc]*hadtrue[nc]*winbag[nc]; classpop[j*nrnodes+ncur+1]+=winbag[nc]; } for (Int_t n=ndendl+1;n<=ndend;n++) { const Int_t &nc=ncase[n]; const int j=idclass[nc]; mean[ncur+2] +=hadtrue[nc]*winbag[nc]; square[ncur+2]+=hadtrue[nc]*hadtrue[nc]*winbag[nc]; classpop[j*nrnodes+ncur+2]+=winbag[nc]; } // check on nodestatus nodestatus[ncur+1]=2; nodestatus[ncur+2]=2; if (nodepop[ncur+1]<=fNdSize) nodestatus[ncur+1]=-1; if (nodepop[ncur+2]<=fNdSize) nodestatus[ncur+2]=-1; Double_t popt1=0; Double_t popt2=0; for (Int_t j=0;j=nrnodes) break; } // determine number of nodes fNumNodes=nrnodes; for (Int_t k=nrnodes-1;k>=0;k--) { if (nodestatus[k]==0) fNumNodes-=1; if (nodestatus[k]==2) nodestatus[k]=-1; } fNumEndNodes=0; for (Int_t kn=0;knpp) { // class + status of node kn coded into fBestVar[kn] fBestVar[kn]=j-nclass; pp=classpop[j*nrnodes+kn]; } } float sum=0; for(int i=0;i