source: trunk/MagicSoft/Mars/mranforest/MRanTree.cc@ 6724

Last change on this file since 6724 was 4647, checked in by tbretz, 20 years ago
*** empty log message ***
File size: 15.1 KB
Line 
1/* ======================================================================== *\
2!
3! *
4! * This file is part of MARS, the MAGIC Analysis and Reconstruction
5! * Software. It is distributed to you in the hope that it can be a useful
6! * and timesaving tool in analysing Data of imaging Cerenkov telescopes.
7! * It is distributed WITHOUT ANY WARRANTY.
8! *
9! * Permission to use, copy, modify and distribute this software and its
10! * documentation for any purpose is hereby granted without fee,
11! * provided that the above copyright notice appear in all copies and
12! * that both that copyright notice and this permission notice appear
13! * in supporting documentation. It is provided "as is" without express
14! * or implied warranty.
15! *
16!
17!
18! Author(s): Thomas Hengstebeck 3/2003 <mailto:hengsteb@alwa02.physik.uni-siegen.de>
19!
20! Copyright: MAGIC Software Development, 2000-2003
21!
22!
23\* ======================================================================== */
24
25/////////////////////////////////////////////////////////////////////////////
26//
27// MRanTree
28//
29// ParameterContainer for Tree structure
30//
31/////////////////////////////////////////////////////////////////////////////
32#include "MRanTree.h"
33
34#include <iostream>
35
36#include <TVector.h>
37#include <TMatrix.h>
38#include <TRandom.h>
39
40#include "MDataArray.h"
41
42#include "MLog.h"
43#include "MLogManip.h"
44
45ClassImp(MRanTree);
46
47using namespace std;
48
49// --------------------------------------------------------------------------
50//
51// Default constructor.
52//
53MRanTree::MRanTree(const char *name, const char *title):fNdSize(0), fNumTry(3), fData(NULL)
54{
55
56 fName = name ? name : "MRanTree";
57 fTitle = title ? title : "Storage container for structure of a single tree";
58}
59
60void MRanTree::SetNdSize(Int_t n)
61{
62 // threshold nodesize of terminal nodes, i.e. the training data is splitted
63 // until there is only pure date in the subsets(=terminal nodes) or the
64 // subset size is LE n
65
66 fNdSize=TMath::Max(1,n);//at least 1 event per node
67}
68
69void MRanTree::SetNumTry(Int_t n)
70{
71 // number of trials in random split selection:
72 // choose at least 1 variable to split in
73
74 fNumTry=TMath::Max(1,n);
75}
76
77void MRanTree::GrowTree(const TMatrix &mhad, const TMatrix &mgam,
78 const TArrayI &hadtrue, TArrayI &datasort,
79 const TArrayI &datarang, TArrayF &tclasspop, TArrayI &jinbag,
80 const TArrayF &winbag)
81{
82 // arrays have to be initialized with generous size, so number of total nodes (nrnodes)
83 // is estimated for worst case
84 const Int_t numdim =mhad.GetNcols();
85 const Int_t numdata=winbag.GetSize();
86 const Int_t nrnodes=2*numdata+1;
87
88 // number of events in bootstrap sample
89 Int_t ninbag=0;
90 for (Int_t n=0;n<numdata;n++)
91 if(jinbag[n]==1) ninbag++;
92
93 TArrayI bestsplit(nrnodes);
94 TArrayI bestsplitnext(nrnodes);
95
96 fBestVar.Set(nrnodes);
97 fTreeMap1.Set(nrnodes);
98 fTreeMap2.Set(nrnodes);
99 fBestSplit.Set(nrnodes);
100
101 fTreeMap1.Reset();
102 fTreeMap2.Reset();
103 fBestSplit.Reset();
104
105 fGiniDec.Set(numdim);
106 fGiniDec.Reset();
107
108 // tree growing
109 BuildTree(datasort,datarang,hadtrue,bestsplit,
110 bestsplitnext,tclasspop,winbag,ninbag);
111
112 // post processing, determine cut (or split) values fBestSplit
113 Int_t nhad=mhad.GetNrows();
114
115 for(Int_t k=0; k<nrnodes; k++)
116 {
117 if (GetNodeStatus(k)==-1)
118 continue;
119
120 const Int_t &bsp =bestsplit[k];
121 const Int_t &bspn=bestsplitnext[k];
122 const Int_t &msp =fBestVar[k];
123
124 fBestSplit[k] = bsp<nhad ? mhad(bsp, msp):mgam(bsp-nhad, msp);
125 fBestSplit[k] += bspn<nhad ? mhad(bspn,msp):mgam(bspn-nhad,msp);
126 fBestSplit[k] /= 2;
127 }
128
129 // resizing arrays to save memory
130 fBestVar.Set(fNumNodes);
131 fTreeMap1.Set(fNumNodes);
132 fTreeMap2.Set(fNumNodes);
133 fBestSplit.Set(fNumNodes);
134}
135
136Int_t MRanTree::FindBestSplit(const TArrayI &datasort,const TArrayI &datarang,
137 const TArrayI &hadtrue,Int_t ndstart,Int_t ndend,TArrayF &tclasspop,
138 Int_t &msplit,Float_t &decsplit,Int_t &nbest,
139 const TArrayF &winbag)
140{
141 const Int_t nrnodes = fBestSplit.GetSize();
142 const Int_t numdata = (nrnodes-1)/2;
143 const Int_t mdim = fGiniDec.GetSize();
144
145 // weighted class populations after split
146 TArrayF wc(2);
147 TArrayF wr(2); // right node
148
149 // For the best split, msplit is the index of the variable (e.g Hillas par., zenith angle ,...)
150 // split on. decsplit is the decreae in impurity measured by Gini-index.
151 // nsplit is the case number of value of msplit split on,
152 // and nsplitnext is the case number of the next larger value of msplit.
153
154 Int_t nbestvar=0;
155
156 // compute initial values of numerator and denominator of Gini-index,
157 // Gini index= pno/dno
158 Double_t pno=0;
159 Double_t pdo=0;
160 for (Int_t j=0; j<2; j++)
161 {
162 pno+=tclasspop[j]*tclasspop[j];
163 pdo+=tclasspop[j];
164 }
165
166 const Double_t crit0=pno/pdo;
167 Int_t jstat=0;
168
169 // start main loop through variables to find best split,
170 // (Gini-index as criterium crit)
171
172 Double_t critmax=-FLT_MAX;
173
174 // random split selection, number of trials = fNumTry
175 for (Int_t mt=0; mt<fNumTry; mt++)
176 {
177 const Int_t mvar=Int_t(gRandom->Rndm()*mdim);
178 const Int_t mn = mvar*numdata;
179
180 // Gini index = rrn/rrd+rln/rld
181 Double_t rrn=pno;
182 Double_t rrd=pdo;
183 Double_t rln=0;
184 Double_t rld=0;
185
186 TArrayF wl(2); // left node
187 wr = tclasspop;
188
189 Double_t critvar=-1.0e20;
190
191 for(Int_t nsp=ndstart;nsp<=ndend-1;nsp++)
192 {
193 const Int_t &nc=datasort[mn+nsp];
194 const Int_t &k=hadtrue[nc];
195
196 const Float_t &u=winbag[nc];
197
198 rln+=u*(2*wl[k]+u);
199 rrn+=u*(-2*wr[k]+u);
200 rld+=u;
201 rrd-=u;
202
203 wl[k]+=u;
204 wr[k]-=u;
205
206 if (datarang[mn+nc]>=datarang[mn+datasort[mn+nsp+1]])
207 continue;
208 if (TMath::Min(rrd,rld)<=1.0e-5)
209 continue;
210
211 const Double_t crit=(rln/rld)+(rrn/rrd);
212 if (crit<=critvar)
213 continue;
214
215 nbestvar=nsp;
216 critvar=crit;
217 }
218
219 if (critvar<=critmax)
220 continue;
221
222 msplit=mvar;
223 nbest=nbestvar;
224 critmax=critvar;
225 }
226
227 decsplit=critmax-crit0;
228
229 return critmax<-1.0e10 ? 1 : jstat;
230}
231
232void MRanTree::MoveData(TArrayI &datasort,Int_t ndstart,
233 Int_t ndend,TArrayI &idmove,TArrayI &ncase,Int_t msplit,
234 Int_t nbest,Int_t &ndendl)
235{
236 // This is the heart of the BuildTree construction. Based on the best split
237 // the data in the part of datasort corresponding to the current node is moved to the
238 // left if it belongs to the left child and right if it belongs to the right child-node.
239 const Int_t numdata = ncase.GetSize();
240 const Int_t mdim = fGiniDec.GetSize();
241
242 TArrayI tdatasort(numdata);
243
244 // compute idmove = indicator of case nos. going left
245
246 for (Int_t nsp=ndstart;nsp<=ndend;nsp++)
247 {
248 const Int_t &nc=datasort[msplit*numdata+nsp];
249 idmove[nc]= nsp<=nbest?1:0;
250 }
251 ndendl=nbest;
252
253 // shift case. nos. right and left for numerical variables.
254
255 for(Int_t msh=0;msh<mdim;msh++)
256 {
257 Int_t k=ndstart-1;
258 for (Int_t n=ndstart;n<=ndend;n++)
259 {
260 const Int_t &ih=datasort[msh*numdata+n];
261 if (idmove[ih]==1)
262 tdatasort[++k]=datasort[msh*numdata+n];
263 }
264
265 for (Int_t n=ndstart;n<=ndend;n++)
266 {
267 const Int_t &ih=datasort[msh*numdata+n];
268 if (idmove[ih]==0)
269 tdatasort[++k]=datasort[msh*numdata+n];
270 }
271
272 for(Int_t m=ndstart;m<=ndend;m++)
273 datasort[msh*numdata+m]=tdatasort[m];
274 }
275
276 // compute case nos. for right and left nodes.
277
278 for(Int_t n=ndstart;n<=ndend;n++)
279 ncase[n]=datasort[msplit*numdata+n];
280}
281
282void MRanTree::BuildTree(TArrayI &datasort,const TArrayI &datarang,
283 const TArrayI &hadtrue, TArrayI &bestsplit,
284 TArrayI &bestsplitnext, TArrayF &tclasspop,
285 const TArrayF &winbag, Int_t ninbag)
286{
287 // Buildtree consists of repeated calls to two void functions, FindBestSplit and MoveData.
288 // Findbestsplit does just that--it finds the best split of the current node.
289 // MoveData moves the data in the split node right and left so that the data
290 // corresponding to each child node is contiguous.
291 //
292 // buildtree bookkeeping:
293 // ncur is the total number of nodes to date. nodestatus(k)=1 if the kth node has been split.
294 // nodestatus(k)=2 if the node exists but has not yet been split, and =-1 if the node is
295 // terminal. A node is terminal if its size is below a threshold value, or if it is all
296 // one class, or if all the data-values are equal. If the current node k is split, then its
297 // children are numbered ncur+1 (left), and ncur+2(right), ncur increases to ncur+2 and
298 // the next node to be split is numbered k+1. When no more nodes can be split, buildtree
299 // returns.
300 const Int_t mdim = fGiniDec.GetSize();
301 const Int_t nrnodes = fBestSplit.GetSize();
302 const Int_t numdata = (nrnodes-1)/2;
303
304 TArrayI nodepop(nrnodes);
305 TArrayI nodestart(nrnodes);
306 TArrayI parent(nrnodes);
307
308 TArrayI ncase(numdata);
309 TArrayI idmove(numdata);
310 TArrayI iv(mdim);
311
312 TArrayF classpop(nrnodes*2);
313 TArrayI nodestatus(nrnodes);
314
315 for (Int_t j=0;j<2;j++)
316 classpop[j*nrnodes+0]=tclasspop[j];
317
318 Int_t ncur=0;
319 nodepop[0]=ninbag;
320 nodestatus[0]=2;
321
322 // start main loop
323 for (Int_t kbuild=0; kbuild<nrnodes; kbuild++)
324 {
325 if (kbuild>ncur) break;
326 if (nodestatus[kbuild]!=2) continue;
327
328 // initialize for next call to FindBestSplit
329
330 const Int_t ndstart=nodestart[kbuild];
331 const Int_t ndend=ndstart+nodepop[kbuild]-1;
332 for (Int_t j=0;j<2;j++)
333 tclasspop[j]=classpop[j*nrnodes+kbuild];
334
335 Int_t msplit, nbest;
336 Float_t decsplit=0;
337 const Int_t jstat=FindBestSplit(datasort,datarang,hadtrue,
338 ndstart,ndend,tclasspop,msplit,
339 decsplit,nbest,winbag);
340
341 if (jstat==1)
342 {
343 nodestatus[kbuild]=-1;
344 continue;
345 }
346
347 fBestVar[kbuild]=msplit;
348 fGiniDec[msplit]+=decsplit;
349
350 bestsplit[kbuild]=datasort[msplit*numdata+nbest];
351 bestsplitnext[kbuild]=datasort[msplit*numdata+nbest+1];
352
353 Int_t ndendl;
354 MoveData(datasort,ndstart,ndend,idmove,ncase,
355 msplit,nbest,ndendl);
356
357 // leftnode no.= ncur+1, rightnode no. = ncur+2.
358
359 nodepop[ncur+1]=ndendl-ndstart+1;
360 nodepop[ncur+2]=ndend-ndendl;
361 nodestart[ncur+1]=ndstart;
362 nodestart[ncur+2]=ndendl+1;
363
364 // find class populations in both nodes
365
366 for (Int_t n=ndstart;n<=ndendl;n++)
367 {
368 const Int_t &nc=ncase[n];
369 const Int_t &j=hadtrue[nc];
370 classpop[j*nrnodes+ncur+1]+=winbag[nc];
371 }
372
373 for (Int_t n=ndendl+1;n<=ndend;n++)
374 {
375 const Int_t &nc=ncase[n];
376 const Int_t &j=hadtrue[nc];
377 classpop[j*nrnodes+ncur+2]+=winbag[nc];
378 }
379
380 // check on nodestatus
381
382 nodestatus[ncur+1]=2;
383 nodestatus[ncur+2]=2;
384 if (nodepop[ncur+1]<=fNdSize) nodestatus[ncur+1]=-1;
385 if (nodepop[ncur+2]<=fNdSize) nodestatus[ncur+2]=-1;
386
387 Double_t popt1=0;
388 Double_t popt2=0;
389 for (Int_t j=0;j<2;j++)
390 {
391 popt1+=classpop[j*nrnodes+ncur+1];
392 popt2+=classpop[j*nrnodes+ncur+2];
393 }
394
395 for (Int_t j=0;j<2;j++)
396 {
397 if (classpop[j*nrnodes+ncur+1]==popt1) nodestatus[ncur+1]=-1;
398 if (classpop[j*nrnodes+ncur+2]==popt2) nodestatus[ncur+2]=-1;
399 }
400
401 fTreeMap1[kbuild]=ncur+1;
402 fTreeMap2[kbuild]=ncur+2;
403 parent[ncur+1]=kbuild;
404 parent[ncur+2]=kbuild;
405 nodestatus[kbuild]=1;
406 ncur+=2;
407 if (ncur>=nrnodes) break;
408 }
409
410 // determine number of nodes
411 fNumNodes=nrnodes;
412 for (Int_t k=nrnodes-1;k>=0;k--)
413 {
414 if (nodestatus[k]==0) fNumNodes-=1;
415 if (nodestatus[k]==2) nodestatus[k]=-1;
416 }
417
418 fNumEndNodes=0;
419 for (Int_t kn=0;kn<fNumNodes;kn++)
420 if(nodestatus[kn]==-1)
421 {
422 fNumEndNodes++;
423 Double_t pp=0;
424 for (Int_t j=0;j<2;j++)
425 {
426 if(classpop[j*nrnodes+kn]>pp)
427 {
428 // class + status of node kn coded into fBestVar[kn]
429 fBestVar[kn]=j-2;
430 pp=classpop[j*nrnodes+kn];
431 }
432 }
433 fBestSplit[kn] =classpop[1*nrnodes+kn];
434 fBestSplit[kn]/=(classpop[0*nrnodes+kn]+classpop[1*nrnodes+kn]);
435 }
436}
437
438void MRanTree::SetRules(MDataArray *rules)
439{
440 fData=rules;
441}
442
443Double_t MRanTree::TreeHad(const TVector &event)
444{
445 Int_t kt=0;
446 // to optimize on storage space node status and node class
447 // are coded into fBestVar:
448 // status of node kt = TMath::Sign(1,fBestVar[kt])
449 // class of node kt = fBestVar[kt]+2 (class defined by larger
450 // node population, actually not used)
451 // hadronness assigned to node kt = fBestSplit[kt]
452
453 for (Int_t k=0;k<fNumNodes;k++)
454 {
455 if (fBestVar[kt]<0)
456 break;
457
458 const Int_t m=fBestVar[kt];
459 kt = event(m)<=fBestSplit[kt] ? fTreeMap1[kt] : fTreeMap2[kt];
460 }
461
462 return fBestSplit[kt];
463}
464
465Double_t MRanTree::TreeHad(const TMatrixRow &event)
466{
467 Int_t kt=0;
468 // to optimize on storage space node status and node class
469 // are coded into fBestVar:
470 // status of node kt = TMath::Sign(1,fBestVar[kt])
471 // class of node kt = fBestVar[kt]+2 (class defined by larger
472 // node population, actually not used)
473 // hadronness assigned to node kt = fBestSplit[kt]
474
475 for (Int_t k=0;k<fNumNodes;k++)
476 {
477 if (fBestVar[kt]<0)
478 break;
479
480 const Int_t m=fBestVar[kt];
481 kt = event(m)<=fBestSplit[kt] ? fTreeMap1[kt] : fTreeMap2[kt];
482 }
483
484 return fBestSplit[kt];
485}
486
487Double_t MRanTree::TreeHad(const TMatrix &m, Int_t ievt)
488{
489#if ROOT_VERSION_CODE < ROOT_VERSION(4,00,8)
490 return TreeHad(TMatrixRow(m, ievt));
491#else
492 return TreeHad(TMatrixFRow_const(m, ievt));
493#endif
494}
495
496Double_t MRanTree::TreeHad()
497{
498 TVector event;
499 *fData >> event;
500
501 return TreeHad(event);
502}
503
504Bool_t MRanTree::AsciiWrite(ostream &out) const
505{
506 TString str;
507 Int_t k;
508
509 out.width(5);out<<fNumNodes<<endl;
510
511 for (k=0;k<fNumNodes;k++)
512 {
513 str=Form("%f",GetBestSplit(k));
514
515 out.width(5); out << k;
516 out.width(5); out << GetNodeStatus(k);
517 out.width(5); out << GetTreeMap1(k);
518 out.width(5); out << GetTreeMap2(k);
519 out.width(5); out << GetBestVar(k);
520 out.width(15); out << str<<endl;
521 out.width(5); out << GetNodeClass(k);
522 }
523 out<<endl;
524
525 return k==fNumNodes;
526}
Note: See TracBrowser for help on using the repository browser.