/* ======================================================================== *\
!
! *
! * This file is part of MARS, the MAGIC Analysis and Reconstruction
! * Software. It is distributed to you in the hope that it can be a useful
! * and timesaving tool in analysing Data of imaging Cerenkov telescopes.
! * It is distributed WITHOUT ANY WARRANTY.
! *
! * Permission to use, copy, modify and distribute this software and its
! * documentation for any purpose is hereby granted without fee,
! * provided that the above copyright notice appear in all copies and
! * that both that copyright notice and this permission notice appear
! * in supporting documentation. It is provided "as is" without express
! * or implied warranty.
! *
!
!
!   Author(s): Thomas Hengstebeck 3/2003
!
!   Copyright: MAGIC Software Development, 2000-2005
!
!
\* ======================================================================== */

/////////////////////////////////////////////////////////////////////////////
//
//  MRanTree
//
//  ParameterContainer for Tree structure
//
/////////////////////////////////////////////////////////////////////////////
#include "MRanTree.h"

#include <iostream>

#include <TVector.h>
#include <TMatrix.h>
#include <TRandom.h>

#include "MArrayI.h"
#include "MArrayF.h"

#include "MMath.h"

#include "MLog.h"
#include "MLogManip.h"

ClassImp(MRanTree);

using namespace std;

// --------------------------------------------------------------------------
// Default constructor.
//
MRanTree::MRanTree(const char *name, const char *title):fClassify(kTRUE),fNdSize(0), fNumTry(3)
{
    fName  = name  ? name  : "MRanTree";
    fTitle = title ? title : "Storage container for structure of a single tree";
}

// --------------------------------------------------------------------------
// Copy constructor
//
MRanTree::MRanTree(const MRanTree &tree)
{
    fName  = tree.fName;
    fTitle = tree.fTitle;

    fClassify = tree.fClassify;
    fNdSize   = tree.fNdSize;
    fNumTry   = tree.fNumTry;

    fNumNodes    = tree.fNumNodes;
    fNumEndNodes = tree.fNumEndNodes;

    fBestVar   = tree.fBestVar;
    fTreeMap1  = tree.fTreeMap1;
    fTreeMap2  = tree.fTreeMap2;
    fBestSplit = tree.fBestSplit;

    fGiniDec   = tree.fGiniDec;
}
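// --------------------------------------------------------------------------
// Illustration only -- not part of MRanTree and not called anywhere. A
// minimal sketch of how the flat arrays copied above could be walked for a
// single event, assuming the conventions used by GrowTree/BuildTree further
// down: fBestVar[node]>=0 holds the index of the variable to cut on at an
// internal node, a negative fBestVar codes the class of a terminal node as
// j-nclass, fBestSplit[node] holds the cut value, and fTreeMap1/fTreeMap2
// hold the indices of the left/right child node. The function name and the
// plain-pointer interface are made up for this sketch.
//
static Int_t WalkTree(const Float_t *event, const Int_t *bestvar,
                      const Float_t *bestsplit, const Int_t *treemap1,
                      const Int_t *treemap2, Int_t nclass)
{
    Int_t node=0;                           // start at the root node
    while (bestvar[node]>=0)                // descend until a terminal node is reached
        node = event[bestvar[node]]<=bestsplit[node]
             ? treemap1[node]               // value below (or at) the cut: left child
             : treemap2[node];              // value above the cut: right child

    return bestvar[node]+nclass;            // decode the class of the end node
}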
void MRanTree::SetNdSize(Int_t n)
{
    // threshold nodesize of terminal nodes, i.e. the training data is split
    // until there is only pure data in the subsets (=terminal nodes) or the
    // subset size is <= n

    fNdSize=TMath::Max(1,n);   // at least 1 event per node
}

void MRanTree::SetNumTry(Int_t n)
{
    // number of trials in random split selection:
    // choose at least 1 variable to split in

    fNumTry=TMath::Max(1,n);
}

void MRanTree::GrowTree(TMatrix *mat, const MArrayF &hadtrue, const MArrayI &idclass,
                        MArrayI &datasort, const MArrayI &datarang, const MArrayF &tclasspop,
                        const Float_t &mean, const Float_t &square, const MArrayI &jinbag,
                        const MArrayF &winbag, const int nclass)
{
    // arrays have to be initialized with generous size, so number of total nodes (nrnodes)
    // is estimated for worst case

    const Int_t numdim =mat->GetNcols();
    const Int_t numdata=winbag.GetSize();
    const Int_t nrnodes=2*numdata+1;

    // number of events in bootstrap sample
    Int_t ninbag=0;
    for (Int_t n=0;n<numdata;n++)
        if (jinbag[n]==1) ninbag++;

    // initialize the tree arrays to their worst-case size
    MArrayI bestsplit(nrnodes);
    MArrayI bestsplitnext(nrnodes);

    fBestVar.Set(nrnodes);
    fTreeMap1.Set(nrnodes);
    fTreeMap2.Set(nrnodes);
    fBestSplit.Set(nrnodes);
    fGiniDec.Set(numdim);

    fBestVar.Reset();
    fTreeMap1.Reset();
    fTreeMap2.Reset();
    fBestSplit.Reset();
    fGiniDec.Reset();

    // choose the split rule: Gini index for classification,
    // variance of the target value for regression
    if (fClassify)
        FindBestSplit=&MRanTree::FindBestSplitGini;
    else
        FindBestSplit=&MRanTree::FindBestSplitSigma;

    BuildTree(datasort,datarang,hadtrue,idclass,bestsplit,bestsplitnext,
              tclasspop,mean,square,winbag,ninbag,nclass);

    // convert the event numbers stored for each split into the cut value,
    // i.e. the mean of the two neighbouring values between which the cut lies
    for (Int_t k=0;k<nrnodes;k++)
    {
        if (fBestVar[k]<0)   // terminal node, no cut
            continue;

        const Int_t &bsp =bestsplit[k];
        const Int_t &bspn=bestsplitnext[k];

        fBestSplit[k]=((*mat)(bsp,fBestVar[k])+(*mat)(bspn,fBestVar[k]))/2;
    }
}

int MRanTree::FindBestSplitGini(const MArrayI &datasort,const MArrayI &datarang,
                                const MArrayF &hadtrue, const MArrayI &idclass,
                                Int_t ndstart,Int_t ndend, const MArrayF &tclasspop,
                                const Float_t &mean, const Float_t &square,
                                Int_t &msplit, Float_t &decsplit,Int_t &nbest,
                                const MArrayF &winbag, const int nclass)
{
    const Int_t nrnodes = fBestSplit.GetSize();
    const Int_t numdata = (nrnodes-1)/2;
    const Int_t mdim    = fGiniDec.GetSize();

    // For the best split, msplit is the index of the variable (e.g. Hillas par.,
    // zenith angle, ...) split on. decsplit is the decrease in impurity measured
    // by the Gini index. nsplit is the case number of the value of msplit split on,
    // and nsplitnext is the case number of the next larger value of msplit.

    Int_t nbestvar=0;

    // compute initial values of numerator (pno) and denominator (pdo) of the
    // split index from the class populations of the node to be split
    Double_t pno=0;
    Double_t pdo=0;
    for (Int_t j=0;j<nclass;j++)
    {
        pno+=tclasspop[j]*tclasspop[j];
        pdo+=tclasspop[j];
    }

    const Double_t crit0=pno/pdo;

    // start main loop through variables to find best split
    Double_t critmax=-FLT_MAX;

    // random split selection, number of trials = fNumTry
    for (Int_t mt=0; mt<fNumTry; mt++)
    {
        const Int_t mvar= gRandom->Integer(mdim);
        const Int_t mn  = mvar*numdata;

        // Gini index = rrn/rrd+rln/rld
        Double_t rrn=pno;
        Double_t rrd=pdo;
        Double_t rln=0;
        Double_t rld=0;

        MArrayF wl(nclass);      // left  node
        MArrayF wr(tclasspop);   // right node

        Double_t critvar=-FLT_MAX;
        for (Int_t nsp=ndstart;nsp<=ndend-1;nsp++)
        {
            const Int_t   &nc = datasort[mn+nsp];
            const Int_t   &k  = idclass[nc];
            const Float_t &u  = winbag[nc];

            // do classification, Gini index as split rule
            rln  +=u*(2*wl[k]+u);   // += u*(wl[k]{i-1} + wl[k]{i-1}+u{i})
            rld  +=u;               // sum of weights left from cut total
            wl[k]+=u;               // sum of weights left from cut for class k

            rrn  -=u*(2*wr[k]-u);   // -= u*(wr[k]{i-1} + wr[k]{i-1}-u{i})
                                    //    rr0=0; rr0+=u*2*tclasspop[k]; rrn = pno - rr0 + rln
            rrd  -=u;               // sum of weights right from cut total
            wr[k]-=u;               // sum of weights right from cut for class k

            // REPLACE BY?
            //   rr0    = 0
            //   rr0   += u*2*tclasspop[k]
            //   rrn    = pno - rr0 + rln
            //   rrd    = pdo - rld
            //   wr[k]  = tclasspop[k] - wl[k]
            //   crit   = (rln*(pdo - rld + 1) + pno - rr0) / rld*(pdo - rld)
            /*
             if (k==background)
                 continue;
             crit = TMath::Max(MMath::SignificanceLiMa(rld, rld-wl[k]),
                               MMath::SignificanceLiMa(rrd, rrd-wr[k]))
             */

            // This condition is in fact a == (> cannot happen at all)
            // This is because we cannot set the cut between two identical values
            //if (datarang[mn+datasort[mn+nsp]]>=datarang[mn+datasort[mn+nsp+1]])
            if (datarang[mn+nc]>=datarang[mn+datasort[mn+nsp+1]])
                continue;

            // If crit starts to become pretty large do WHAT???
            //if (TMath::Min(rrd,rld)<=1.0e-5)   // FIXME: CHECKIT FOR WEIGHTS!
            //    continue;

            const Double_t crit=(rln/rld)+(rrn/rrd);
            if (!TMath::Finite(crit))
                continue;

            // Search for the highest value of crit
            if (crit<=critvar)
                continue;

            // store the highest crit value and the corresponding event to cut at
            nbestvar=nsp;
            critvar=crit;
        }

        if (critvar<=critmax)
            continue;

        msplit =mvar;       // Variable in which to split
        nbest  =nbestvar;   // event at which the best split was found
        critmax=critvar;
    }

    // crit0    = MMath::SignificanceLiMa(pdo, pdo-tclasspop[0])   // mean increase of sensitivity
    // decsplit = sqrt(critmax/crit0)
    decsplit=critmax-crit0;

    return critmax<-1.0e10 ? 1 : 0;
}
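// --------------------------------------------------------------------------
// Illustration only -- a hypothetical helper, not used by MRanTree. The
// incremental sums above telescope to a closed form: after the events up to
// the cut have been processed, rln = sum_k wl[k]^2 and rrn = sum_k wr[k]^2,
// so the split criterion is
//
//    crit = sum_k wl[k]^2 / sum_k wl[k]  +  sum_k wr[k]^2 / sum_k wr[k]
//
// and decsplit = critmax-crit0 is the decrease in (weighted) Gini impurity
// caused by the best cut. The function below restates the criterion in this
// direct, non-incremental form; it yields the same value as (rln/rld)+(rrn/rrd)
// and has no protection against an empty child node (the loop above skips
// non-finite values instead).
//
static Double_t GiniSplitCrit(const MArrayF &wl, const MArrayF &wr, Int_t nclass)
{
    Double_t ln=0, ld=0;   // left  child: sum of squared class weights, sum of weights
    Double_t rn=0, rd=0;   // right child: sum of squared class weights, sum of weights

    for (Int_t k=0; k<nclass; k++)
    {
        ln+=wl[k]*wl[k];
        ld+=wl[k];

        rn+=wr[k]*wr[k];
        rd+=wr[k];
    }

    return ln/ld + rn/rd;   // identical to (rln/rld)+(rrn/rrd) in the loop above
}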
int MRanTree::FindBestSplitSigma(const MArrayI &datasort,const MArrayI &datarang,
                                 const MArrayF &hadtrue, const MArrayI &idclass,
                                 Int_t ndstart,Int_t ndend, const MArrayF &tclasspop,
                                 const Float_t &mean, const Float_t &square,
                                 Int_t &msplit, Float_t &decsplit,Int_t &nbest,
                                 const MArrayF &winbag, const int nclass)
{
    const Int_t nrnodes = fBestSplit.GetSize();
    const Int_t numdata = (nrnodes-1)/2;
    const Int_t mdim    = fGiniDec.GetSize();

    // For the best split, msplit is the index of the variable (e.g. Hillas par.,
    // zenith angle, ...) split on. decsplit is the decrease in impurity.
    // nsplit is the case number of the value of msplit split on, and nsplitnext
    // is the case number of the next larger value of msplit.

    Int_t nbestvar=0;

    // compute initial values of numerator and denominator of the split index

    // resolution
    //Double_t pno=-(tclasspop[0]*square-mean*mean)*tclasspop[0];
    //Double_t pdo= (tclasspop[0]-1.)*mean*mean;

    // n*resolution
    //Double_t pno=-(tclasspop[0]*square-mean*mean)*tclasspop[0];
    //Double_t pdo= mean*mean;

    // variance
    //Double_t pno=-(square-mean*mean/tclasspop[0]);
    //Double_t pdo= (tclasspop[0]-1.);

    // n*variance
    Double_t pno= (square-mean*mean/tclasspop[0]);
    Double_t pdo= 1.;

    // 1./(n*variance)
    //Double_t pno= 1.;
    //Double_t pdo= (square-mean*mean/tclasspop[0]);

    const Double_t crit0=pno/pdo;

    // start main loop through variables to find best split
    Double_t critmin=FLT_MAX;

    // random split selection, number of trials = fNumTry
    for (Int_t mt=0; mt<fNumTry; mt++)
    {
        const Int_t mvar= gRandom->Integer(mdim);
        const Int_t mn  = mvar*numdata;

        Double_t esumr =mean;
        Double_t e2sumr=square;
        Double_t esuml =0;
        Double_t e2suml=0;

        float wl=0.;             // left  node
        float wr=tclasspop[0];   // right node

        Double_t critvar=critmin;
        for (Int_t nsp=ndstart;nsp<=ndend-1;nsp++)
        {
            const Int_t   &nc=datasort[mn+nsp];
            const Float_t &f =hadtrue[nc];
            const Float_t &u =winbag[nc];

            e2suml+=u*f*f;
            esuml +=u*f;
            wl    +=u;

            //-------------------------------------------
            // resolution
            //const Double_t rln=(wl*e2suml-esuml*esuml)*wl;
            //const Double_t rld=(wl-1.)*esuml*esuml;

            // resolution times n
            //const Double_t rln=(wl*e2suml-esuml*esuml)*wl;
            //const Double_t rld=esuml*esuml;

            // sigma
            //const Double_t rln=(e2suml-esuml*esuml/wl);
            //const Double_t rld=(wl-1.);

            // sigma times n
            Double_t rln=(e2suml-esuml*esuml/wl);
            Double_t rld=1.;

            // 1./(n*variance)
            //const Double_t rln=1.;
            //const Double_t rld=(e2suml-esuml*esuml/wl);
            //-------------------------------------------

            // REPLACE BY???
            e2sumr-=u*f*f;   // e2sumr = square - e2suml
            esumr -=u*f;     // esumr  = mean   - esuml
            wr    -=u;       // wr     = tclasspop[0] - wl

            //-------------------------------------------
            // resolution
            //const Double_t rrn=(wr*e2sumr-esumr*esumr)*wr;
            //const Double_t rrd=(wr-1.)*esumr*esumr;

            // resolution times n
            //const Double_t rrn=(wr*e2sumr-esumr*esumr)*wr;
            //const Double_t rrd=esumr*esumr;

            // sigma
            //const Double_t rrn=(e2sumr-esumr*esumr/wr);
            //const Double_t rrd=(wr-1.);

            // sigma times n
            const Double_t rrn=(e2sumr-esumr*esumr/wr);
            const Double_t rrd=1.;

            // 1./(n*variance)
            //const Double_t rrn=1.;
            //const Double_t rrd=(e2sumr-esumr*esumr/wr);
            //-------------------------------------------

            if (datarang[mn+nc]>=datarang[mn+datasort[mn+nsp+1]])
                continue;

            //if (TMath::Min(rrd,rld)<=1.0e-5)
            //    continue;

            const Double_t crit=(rln/rld)+(rrn/rrd);
            if (!TMath::Finite(crit))
                continue;

            // Search for the lowest value of crit
            if (crit>=critvar)
                continue;

            nbestvar=nsp;
            critvar=crit;
        }

        if (critvar>=critmin)
            continue;

        msplit =mvar;
        nbest  =nbestvar;
        critmin=critvar;
    }

    decsplit=crit0-critmin;

    //return critmin>1.0e20 ? 1 : 0;
    return decsplit<0 ? 1 : 0;
}
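// --------------------------------------------------------------------------
// Illustration only -- a hypothetical helper, not used by MRanTree. In the
// regression branch above, esum/e2sum are the weighted sums sum(u*f) and
// sum(u*f*f) of a child node and w is its total weight, so
//
//    e2sum - esum*esum/w  =  sum_i u_i*(f_i - fmean)^2    with  fmean = esum/w
//
// i.e. the weighted sum of squared deviations of the target value f from its
// weighted mean. FindBestSplitSigma therefore picks the cut which minimizes
// the summed spread of the two child nodes. (Note that the parameters named
// mean and square are used as these weighted sums of the parent node, as
// accumulated in BuildTree below, not as averages.) The helper restates the
// identity in its direct form.
//
static Double_t WeightedSumSq(const MArrayF &f, const MArrayF &u, Int_t n)
{
    Double_t esum=0, e2sum=0, w=0;
    for (Int_t i=0; i<n; i++)
    {
        esum +=u[i]*f[i];        // weighted sum of the target values
        e2sum+=u[i]*f[i]*f[i];   // weighted sum of their squares
        w    +=u[i];             // total weight
    }

    return e2sum - esum*esum/w;  // = sum u*(f-fmean)^2, the quantity used as rln/rrn
}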
void MRanTree::MoveData(MArrayI &datasort,Int_t ndstart, Int_t ndend,
                        MArrayI &idmove,MArrayI &ncase,Int_t msplit,
                        Int_t nbest,Int_t &ndendl)
{
    // This is the heart of the BuildTree construction. Based on the best split
    // the data in the part of datasort corresponding to the current node is moved
    // to the left if it belongs to the left child and to the right if it belongs
    // to the right child-node.

    const Int_t numdata = ncase.GetSize();
    const Int_t mdim    = fGiniDec.GetSize();

    MArrayI tdatasort(numdata);

    // compute idmove = indicator of case nos. going left
    for (Int_t nsp=ndstart;nsp<=ndend;nsp++)
    {
        const Int_t &nc=datasort[msplit*numdata+nsp];
        idmove[nc]= nsp<=nbest?1:0;
    }
    ndendl=nbest;

    // shift case. nos. right and left for numerical variables.
    for (Int_t msh=0;msh<mdim;msh++)
    {
        Int_t k=ndstart-1;

        // first copy the cases going to the left child (keeping their order)...
        for (Int_t n=ndstart;n<=ndend;n++)
        {
            const Int_t &ih=datasort[msh*numdata+n];
            if (idmove[ih]==1)
                tdatasort[++k]=datasort[msh*numdata+n];
        }

        // ...then the cases going to the right child
        for (Int_t n=ndstart;n<=ndend;n++)
        {
            const Int_t &ih=datasort[msh*numdata+n];
            if (idmove[ih]==0)
                tdatasort[++k]=datasort[msh*numdata+n];
        }

        // copy the reordered cases back into datasort
        for (Int_t n=ndstart;n<=ndend;n++)
            datasort[msh*numdata+n]=tdatasort[n];
    }

    // compute case nos. for right and left nodes
    for (Int_t n=ndstart;n<=ndend;n++)
        ncase[n]=datasort[msplit*numdata+n];
}

void MRanTree::BuildTree(MArrayI &datasort,const MArrayI &datarang, const MArrayF &hadtrue,
                         const MArrayI &idclass, MArrayI &bestsplit, MArrayI &bestsplitnext,
                         const MArrayF &tclasspop, const Float_t &tmean, const Float_t &tsquare,
                         const MArrayF &winbag, Int_t ninbag, const int nclass)
{
    // BuildTree consists of repeated calls to FindBestSplit and MoveData.
    // FindBestSplit finds the best split of the current node, MoveData moves the
    // data in the split node right and left so that the data corresponding to
    // each child node is contiguous.
    //
    // BuildTree bookkeeping:
    // ncur is the total number of nodes to date. nodestatus[k]=2 if the kth node
    // exists but has not yet been split, =1 if it has been split and =-1 if it is
    // terminal. A node is terminal if its size is below the threshold fNdSize, if
    // it is pure (one class only) or if all the data-values are equal. If the
    // current node k is split, its children are numbered ncur+1 (left) and ncur+2
    // (right), ncur increases by 2 and the next node to be split is k+1. When no
    // more nodes can be split, BuildTree returns.

    const Int_t mdim    = fGiniDec.GetSize();
    const Int_t nrnodes = fBestSplit.GetSize();
    const Int_t numdata = (nrnodes-1)/2;

    // bookkeeping arrays of the growing tree
    MArrayI nodestatus(nrnodes);
    MArrayI nodestart(nrnodes);
    MArrayI nodepop(nrnodes);

    MArrayF classpop(nrnodes*nclass);
    MArrayF lclasspop(tclasspop);

    MArrayF mean(nrnodes);
    MArrayF square(nrnodes);

    MArrayI ncase(numdata);
    MArrayI idmove(numdata);

    // initialize the root node
    for (Int_t j=0;j<nclass;j++)
        classpop[j*nrnodes+0]=tclasspop[j];

    mean[0]  =tmean;
    square[0]=tsquare;

    nodepop[0]   =ninbag;
    nodestart[0] =0;
    nodestatus[0]=2;

    // start main loop
    Int_t ncur=0;
    for (Int_t kbuild=0;kbuild<nrnodes;kbuild++)
    {
        if (kbuild>ncur)
            break;

        if (nodestatus[kbuild]!=2)
            continue;

        // initialize for next call to FindBestSplit
        const Int_t ndstart=nodestart[kbuild];
        const Int_t ndend=ndstart+nodepop[kbuild]-1;

        for (Int_t j=0;j<nclass;j++)
            lclasspop[j]=classpop[j*nrnodes+kbuild];

        Int_t   msplit;
        Int_t   nbest;
        Float_t decsplit=0;

        if ((this->*FindBestSplit)(datasort,datarang,hadtrue,idclass,ndstart,
                                   ndend, lclasspop,mean[kbuild],square[kbuild],msplit,decsplit,
                                   nbest,winbag,nclass))
        {
            nodestatus[kbuild]=-1;
            continue;
        }

        fBestVar[kbuild]=msplit;
        fGiniDec[msplit]+=decsplit;

        bestsplit[kbuild]    =datasort[msplit*numdata+nbest];
        bestsplitnext[kbuild]=datasort[msplit*numdata+nbest+1];

        Int_t ndendl;
        MoveData(datasort,ndstart,ndend,idmove,ncase,
                 msplit,nbest,ndendl);

        // leftnode no.= ncur+1, rightnode no. = ncur+2.
        nodepop[ncur+1]=ndendl-ndstart+1;
        nodepop[ncur+2]=ndend-ndendl;
        nodestart[ncur+1]=ndstart;
        nodestart[ncur+2]=ndendl+1;

        // find class populations in both nodes
        for (Int_t n=ndstart;n<=ndendl;n++)
        {
            const Int_t &nc=ncase[n];
            const int j=idclass[nc];

            // statistics left from cut
            mean[ncur+1]  +=hadtrue[nc]*winbag[nc];
            square[ncur+1]+=hadtrue[nc]*hadtrue[nc]*winbag[nc];

            // sum of weights left from cut
            classpop[j*nrnodes+ncur+1]+=winbag[nc];
        }

        for (Int_t n=ndendl+1;n<=ndend;n++)
        {
            const Int_t &nc=ncase[n];
            const int j=idclass[nc];

            // statistics right from cut
            mean[ncur+2]  +=hadtrue[nc]*winbag[nc];
            square[ncur+2]+=hadtrue[nc]*hadtrue[nc]*winbag[nc];

            // sum of weights right from cut
            classpop[j*nrnodes+ncur+2]+=winbag[nc];
        }

        // check on nodestatus: a child is terminal if it contains
        // no more than fNdSize events...
        nodestatus[ncur+1]=2;
        nodestatus[ncur+2]=2;

        if (nodepop[ncur+1]<=fNdSize) nodestatus[ncur+1]=-1;
        if (nodepop[ncur+2]<=fNdSize) nodestatus[ncur+2]=-1;

        // ...or if all its weight is in a single class
        Double_t popt1=0;
        Double_t popt2=0;
        for (Int_t j=0;j<nclass;j++)
        {
            popt1+=classpop[j*nrnodes+ncur+1];
            popt2+=classpop[j*nrnodes+ncur+2];
        }

        if (fClassify)
            for (Int_t j=0;j<nclass;j++)
            {
                if (classpop[j*nrnodes+ncur+1]==popt1) nodestatus[ncur+1]=-1;
                if (classpop[j*nrnodes+ncur+2]==popt2) nodestatus[ncur+2]=-1;
            }

        // store the links to the two children and mark the node as split
        fTreeMap1[kbuild]=ncur+1;
        fTreeMap2[kbuild]=ncur+2;
        nodestatus[kbuild]=1;

        ncur+=2;
        if (ncur>=nrnodes)
            break;
    }

    // determine number of nodes
    fNumNodes=nrnodes;
    for (Int_t k=nrnodes-1;k>=0;k--)
    {
        if (nodestatus[k]==0) fNumNodes-=1;
        if (nodestatus[k]==2) nodestatus[k]=-1;
    }

    fNumEndNodes=0;
    for (Int_t kn=0;kn<fNumNodes;kn++)
    {
        if (nodestatus[kn]!=-1)
            continue;

        fNumEndNodes++;

        // determine the majority class of the end node
        Double_t pp=0;
        for (Int_t j=0;j<nclass;j++)
        {
            if (classpop[j*nrnodes+kn]>pp)
            {
                // class + status of node kn coded into fBestVar[kn]
                fBestVar[kn]=j-nclass;
                pp=classpop[j*nrnodes+kn];
            }
        }
    }

    float sum=0;
    for (int i=0;i<mdim;i++)
        sum+=fGiniDec[i];