source: trunk/MagicSoft/Mars/mranforest/MRanTree.cc@ 2124

Last change on this file since 2124 was 2114, checked in by tbretz, 22 years ago
*** empty log message ***
File size: 15.3 KB
Line 
1/* ======================================================================== *\
2!
3! *
4! * This file is part of MARS, the MAGIC Analysis and Reconstruction
5! * Software. It is distributed to you in the hope that it can be a useful
6! * and timesaving tool in analysing Data of imaging Cerenkov telescopes.
7! * It is distributed WITHOUT ANY WARRANTY.
8! *
9! * Permission to use, copy, modify and distribute this software and its
10! * documentation for any purpose is hereby granted without fee,
11! * provided that the above copyright notice appear in all copies and
12! * that both that copyright notice and this permission notice appear
13! * in supporting documentation. It is provided "as is" without express
14! * or implied warranty.
15! *
16!
17!
18! Author(s): Thomas Hengstebeck 3/2003 <mailto:hengsteb@alwa02.physik.uni-siegen.de>
19!
20! Copyright: MAGIC Software Development, 2000-2003
21!
22!
23\* ======================================================================== */
24
25/////////////////////////////////////////////////////////////////////////////
26// //
27// MRanTree //
28// //
29// ParameterContainer for Tree structure //
30// //
31// //
32/////////////////////////////////////////////////////////////////////////////
33#include "MRanTree.h"
34
35#include <iostream.h>
36
37#include <TVector.h>
38#include <TMatrix.h>
39#include <TRandom.h>
40
41#include "MDataArray.h"
42
43#include "MLog.h"
44#include "MLogManip.h"
45
46ClassImp(MRanTree);
47
48// --------------------------------------------------------------------------
49//
50// Default constructor.
51//
52MRanTree::MRanTree(const char *name, const char *title):fNdSize(0), fNumTry(3), fData(NULL)
53{
54
55 fName = name ? name : "MRanTree";
56 fTitle = title ? title : "Storage container for structure of a single tree";
57}
58
59void MRanTree::SetNdSize(Int_t n)
60{
61 // threshold nodesize of terminal nodes, i.e. the training data is splitted
62 // until there is only pure date in the subsets(=terminal nodes) or the
63 // subset size is LE n
64
65 fNdSize=TMath::Max(1,n);//at least 1 event per node
66}
67
68void MRanTree::SetNumTry(Int_t n)
69{
70 // number of trials in random split selection:
71 // choose at least 1 variable to split in
72
73 fNumTry=TMath::Max(1,n);
74}
75
76void MRanTree::GrowTree(TMatrix &mhad,TMatrix &mgam,Int_t numdata, Int_t numdim,TArrayI &hadtrue,
77 TArrayI &datasort,TArrayI &datarang,TArrayF &tclasspop,TArrayI &jinbag,
78 TArrayF &winbag,TArrayF &weight)
79{
80 // arrays have to be initialized with generous size, so number of total nodes (nrnodes)
81 // is estimated for worst case
82 Int_t nrnodes=2*numdata+1;
83
84 // number of events in bootstrap sample
85 Int_t ninbag=0;
86 for (Int_t n=0;n<numdata;n++)
87 if(jinbag[n]==1) ninbag++;
88
89 // weighted class populations after split
90 TArrayF wl(2); // left node
91 TArrayF wc(2);
92 TArrayF wr(2); // right node
93 TArrayI nc(2);
94
95 TArrayI bestsplit(nrnodes);
96 TArrayI bestsplitnext(nrnodes);
97 TArrayI nodepop(nrnodes);
98 TArrayI parent(nrnodes);
99 TArrayI nodex(numdata);
100 TArrayI nodestart(nrnodes);
101
102 TArrayI ncase(numdata);
103 TArrayI iv(numdim);
104 TArrayI idmove(numdata);
105
106 idmove.Reset();
107
108 fBestVar.Set(nrnodes);
109 fTreeMap1.Set(nrnodes);
110 fTreeMap2.Set(nrnodes);
111 fBestSplit.Set(nrnodes);
112
113 fTreeMap1.Reset();
114 fTreeMap2.Reset();
115 fBestSplit.Reset();
116
117 fGiniDec.Set(numdim);
118 fGiniDec.Reset();
119
120 // tree growing
121 BuildTree(datasort,datarang,hadtrue,numdim,numdata,bestsplit,
122 bestsplitnext,nodepop,nodestart,tclasspop,nrnodes,
123 idmove,ncase,parent,jinbag,iv,winbag,wr,wc,wl,ninbag);
124
125 // post processing, determine cut (or split) values fBestSplit
126 Int_t nhad=mhad.GetNrows();
127
128 for(Int_t k=0;k<nrnodes;k++)
129 {
130 Int_t bsp=bestsplit[k];
131 Int_t bspn=bestsplitnext[k];
132 Int_t msp=fBestVar[k];
133
134 if (GetNodeStatus(k)!=-1)
135 {
136 fBestSplit[k] = bsp<nhad ? mhad(bsp,msp):mgam(bsp-nhad,msp);
137 fBestSplit[k] += bspn<nhad ? mhad(bspn,msp):mgam(bspn-nhad,msp);
138 fBestSplit[k] /=2.;
139 }
140 }
141
142 // resizing arrays to save memory
143 fBestVar.Set(fNumNodes);
144 fTreeMap1.Set(fNumNodes);
145 fTreeMap2.Set(fNumNodes);
146 fBestSplit.Set(fNumNodes);
147}
148
149Int_t MRanTree::FindBestSplit(TArrayI &datasort,TArrayI &datarang,TArrayI &hadtrue,Int_t mdim,
150 Int_t numdata,Int_t ndstart,Int_t ndend,TArrayF &tclasspop,
151 Int_t &msplit,Float_t &decsplit,Int_t &nbest,TArrayI &ncase,
152 TArrayI &jinbag,TArrayI &iv,TArrayF &winbag,TArrayF &wr,
153 TArrayF &wc,TArrayF &wl,Int_t kbuild)
154{
155 // For the best split, msplit is the index of the variable (e.g Hillas par., zenith angle ,...)
156 // split on. decsplit is the decreae in impurity measured by Gini-index.
157 // nsplit is the case number of value of msplit split on,
158 // and nsplitnext is the case number of the next larger value of msplit.
159
160 Int_t mvar,nc,nbestvar=0,jstat,k;
161 Float_t crit,crit0,critmax,critvar=0;
162 Float_t rrn, rrd, rln, rld, u;
163
164 // compute initial values of numerator and denominator of Gini-index,
165 // Gini index= pno/dno
166 Float_t pno=0;
167 Float_t pdo=0;
168
169 for (Int_t j=0;j<2;j++)
170 {
171 pno+=tclasspop[j]*tclasspop[j];
172 pdo+=tclasspop[j];
173 }
174 crit0=pno/pdo;
175 jstat=0;
176
177 // start main loop through variables to find best split,
178 // (Gini-index as criterium crit)
179
180 critmax=-1.0e20; // FIXME: Replace by a constant from limits.h
181
182 // random split selection, number of trials = fNumTry
183 if(!gRandom)
184 {
185 *fLog << err << dbginf << "gRandom not initialized... aborting." << endl;
186 return kFALSE;
187 }
188 for(Int_t mt=0;mt<fNumTry;mt++)
189 {
190 mvar=Int_t(mdim*gRandom->Rndm());
191
192 // Gini index = rrn/rrd+rln/rld
193 rrn=pno;
194 rrd=pdo;
195 rln=0;
196 rld=0;
197 wl.Reset();
198
199 for (Int_t j=0;j<2;j++)
200 {
201 wr[j]=tclasspop[j];
202 }
203
204 critvar=-1.0e20;
205
206 for(Int_t nsp=ndstart;nsp<=ndend-1;nsp++)
207 {
208 nc=datasort[mvar*numdata+nsp];
209
210 u=winbag[nc];
211 k=hadtrue[nc];
212
213 rln=rln+u*(2*wl[k]+u);
214 rrn=rrn+u*(-2*wr[k]+u);
215 rld=rld+u;
216 rrd=rrd-u;
217
218 wl[k]=wl[k]+u;
219 wr[k]=wr[k]-u;
220
221 if (datarang[mvar*numdata+nc]<datarang[mvar*numdata+datasort[mvar*numdata+nsp+1]])
222 {
223 if(TMath::Min(rrd,rld)>1.0e-5)
224 {
225 crit=(rln/rld)+(rrn/rrd);
226 if (crit>critvar)
227 {
228 nbestvar=nsp;
229 critvar=crit;
230 }
231 }
232 }
233 }
234
235 if (critvar>critmax) {
236 msplit=mvar;
237 nbest=nbestvar;
238 critmax=critvar;
239 }
240 }
241
242 decsplit=critmax-crit0;
243 if (critmax<-1.0e10) jstat=1;
244
245 return jstat;
246}
247
248void MRanTree::MoveData(TArrayI &datasort,Int_t mdim,Int_t numdata,Int_t ndstart,
249 Int_t ndend,TArrayI &idmove,TArrayI &ncase,Int_t msplit,
250 Int_t nbest,Int_t &ndendl)
251{
252 // This is the heart of the BuildTree construction. Based on the best split
253 // the data in the part of datasort corresponding to the current node is moved to the
254 // left if it belongs to the left child and right if it belongs to the right child-node.
255
256 Int_t nc,k,ih;
257 TArrayI tdatasort(numdata);
258
259 // compute idmove = indicator of case nos. going left
260
261 for (Int_t nsp=ndstart;nsp<=nbest;nsp++)
262 {
263 nc=datasort[msplit*numdata+nsp];
264 idmove[nc]=1;
265 }
266 for (Int_t nsp=nbest+1;nsp<=ndend;nsp++)
267 {
268 nc=datasort[msplit*numdata+nsp];
269 idmove[nc]=0;
270 }
271 ndendl=nbest;
272
273 // shift case. nos. right and left for numerical variables.
274
275 for(Int_t msh=0;msh<mdim;msh++)
276 {
277 k=ndstart-1;
278 for (Int_t n=ndstart;n<=ndend;n++)
279 {
280 ih=datasort[msh*numdata+n];
281 if (idmove[ih]==1) {
282 k++;
283 tdatasort[k]=datasort[msh*numdata+n];
284 }
285 }
286
287 for (Int_t n=ndstart;n<=ndend;n++)
288 {
289 ih=datasort[msh*numdata+n];
290 if (idmove[ih]==0){
291 k++;
292 tdatasort[k]=datasort[msh*numdata+n];
293 }
294 }
295 for(Int_t k=ndstart;k<=ndend;k++)
296 datasort[msh*numdata+k]=tdatasort[k];
297 }
298
299 // compute case nos. for right and left nodes.
300
301 for(Int_t n=ndstart;n<=ndend;n++)
302 ncase[n]=datasort[msplit*numdata+n];
303}
304
305void MRanTree::BuildTree(TArrayI &datasort,TArrayI &datarang,TArrayI &hadtrue,Int_t mdim,
306 Int_t numdata,TArrayI &bestsplit,TArrayI &bestsplitnext,
307 TArrayI &nodepop,TArrayI &nodestart,TArrayF &tclasspop,
308 Int_t nrnodes,TArrayI &idmove,TArrayI &ncase,TArrayI &parent,
309 TArrayI &jinbag,TArrayI &iv,TArrayF &winbag,TArrayF &wr,TArrayF &wc,
310 TArrayF &wl,Int_t ninbag)
311{
312 // Buildtree consists of repeated calls to two void functions, FindBestSplit and MoveData.
313 // Findbestsplit does just that--it finds the best split of the current node.
314 // MoveData moves the data in the split node right and left so that the data
315 // corresponding to each child node is contiguous.
316 //
317 // buildtree bookkeeping:
318 // ncur is the total number of nodes to date. nodestatus(k)=1 if the kth node has been split.
319 // nodestatus(k)=2 if the node exists but has not yet been split, and =-1 if the node is
320 // terminal. A node is terminal if its size is below a threshold value, or if it is all
321 // one class, or if all the data-values are equal. If the current node k is split, then its
322 // children are numbered ncur+1 (left), and ncur+2(right), ncur increases to ncur+2 and
323 // the next node to be split is numbered k+1. When no more nodes can be split, buildtree
324 // returns.
325
326 Int_t msplit,nbest,ndendl,nc,jstat,ndend,ndstart;
327 Float_t decsplit=0;
328 Float_t popt1,popt2,pp;
329 TArrayF classpop;
330 TArrayI nodestatus;
331
332 nodestatus.Set(nrnodes);
333 classpop.Set(2*nrnodes);
334
335 nodestatus.Reset();
336 nodestart.Reset();
337 nodepop.Reset();
338 classpop.Reset();
339
340
341 for (Int_t j=0;j<2;j++)
342 classpop[j*nrnodes+0]=tclasspop[j];
343
344 Int_t ncur=0;
345 nodestart[0]=0;
346 nodepop[0]=ninbag;
347 nodestatus[0]=2;
348
349 // start main loop
350 for (Int_t kbuild=0;kbuild<nrnodes;kbuild++)
351 {
352 if (kbuild>ncur) break;
353 if (nodestatus[kbuild]!=2) continue;
354
355 // initialize for next call to FindBestSplit
356
357 ndstart=nodestart[kbuild];
358 ndend=ndstart+nodepop[kbuild]-1;
359 for (Int_t j=0;j<2;j++)
360 tclasspop[j]=classpop[j*nrnodes+kbuild];
361
362 jstat=FindBestSplit(datasort,datarang,hadtrue,mdim,numdata,
363 ndstart,ndend,tclasspop,msplit,decsplit,
364 nbest,ncase,jinbag,iv,winbag,wr,wc,wl,
365 kbuild);
366
367 if(jstat==1) {
368 nodestatus[kbuild]=-1;
369 continue;
370 }else{
371 fBestVar[kbuild]=msplit;
372 fGiniDec[msplit]+=decsplit;
373
374 bestsplit[kbuild]=datasort[msplit*numdata+nbest];
375 bestsplitnext[kbuild]=datasort[msplit*numdata+nbest+1];
376 }
377
378 MoveData(datasort,mdim,numdata,ndstart,ndend,idmove,ncase,
379 msplit,nbest,ndendl);
380
381 // leftnode no.= ncur+1, rightnode no. = ncur+2.
382
383 nodepop[ncur+1]=ndendl-ndstart+1;
384 nodepop[ncur+2]=ndend-ndendl;
385 nodestart[ncur+1]=ndstart;
386 nodestart[ncur+2]=ndendl+1;
387
388 // find class populations in both nodes
389
390 for (Int_t n=ndstart;n<=ndendl;n++)
391 {
392 nc=ncase[n];
393 Int_t j=hadtrue[nc];
394 classpop[j*nrnodes+ncur+1]+=winbag[nc];
395 }
396
397 for (Int_t n=ndendl+1;n<=ndend;n++)
398 {
399 nc=ncase[n];
400 Int_t j=hadtrue[nc];
401 classpop[j*nrnodes+ncur+2]+=winbag[nc];
402 }
403
404 // check on nodestatus
405
406 nodestatus[ncur+1]=2;
407 nodestatus[ncur+2]=2;
408 if (nodepop[ncur+1]<=fNdSize) nodestatus[ncur+1]=-1;
409 if (nodepop[ncur+2]<=fNdSize) nodestatus[ncur+2]=-1;
410 popt1=0;
411 popt2=0;
412 for (Int_t j=0;j<2;j++)
413 {
414 popt1+=classpop[j*nrnodes+ncur+1];
415 popt2+=classpop[j*nrnodes+ncur+2];
416 }
417
418 for (Int_t j=0;j<2;j++)
419 {
420 if (classpop[j*nrnodes+ncur+1]==popt1) nodestatus[ncur+1]=-1;
421 if (classpop[j*nrnodes+ncur+2]==popt2) nodestatus[ncur+2]=-1;
422 }
423
424 fTreeMap1[kbuild]=ncur+1;
425 fTreeMap2[kbuild]=ncur+2;
426 parent[ncur+1]=kbuild;
427 parent[ncur+2]=kbuild;
428 nodestatus[kbuild]=1;
429 ncur+=2;
430 if (ncur>=nrnodes) break;
431 }
432
433 // determine number of nodes
434 fNumNodes=nrnodes;
435 for (Int_t k=nrnodes-1;k>=0;k--)
436 {
437 if (nodestatus[k]==0) fNumNodes-=1;
438 if (nodestatus[k]==2) nodestatus[k]=-1;
439 }
440
441 fNumEndNodes=0;
442 for (Int_t kn=0;kn<fNumNodes;kn++)
443 if(nodestatus[kn]==-1)
444 {
445 fNumEndNodes++;
446 pp=0;
447 for (Int_t j=0;j<2;j++)
448 {
449 if(classpop[j*nrnodes+kn]>pp)
450 {
451 // class + status of node kn coded into fBestVar[kn]
452 fBestVar[kn]=j-2;
453 pp=classpop[j*nrnodes+kn];
454 }
455 }
456 fBestSplit[kn] =classpop[1*nrnodes+kn];
457 fBestSplit[kn]/=(classpop[0*nrnodes+kn]+classpop[1*nrnodes+kn]);
458 }
459}
460
461void MRanTree::SetRules(MDataArray *rules)
462{
463 fData=rules;
464}
465
466Double_t MRanTree::TreeHad(const TVector &event)
467{
468 Int_t kt=0;
469 // to optimize on storage space node status and node class
470 // are coded into fBestVar:
471 // status of node kt = TMath::Sign(1,fBestVar[kt])
472 // class of node kt = fBestVar[kt]+2 (class defined by larger
473 // node population, actually not used)
474 // hadronness assigned to node kt = fBestSplit[kt]
475
476 for (Int_t k=0;k<fNumNodes;k++)
477 {
478 if (fBestVar[kt]<0)
479 break;
480
481 const Int_t m=fBestVar[kt];
482
483 kt = event(m)<=fBestSplit[kt] ? fTreeMap1[kt] : fTreeMap2[kt];
484 }
485
486 return fBestSplit[kt];
487}
488
489Double_t MRanTree::TreeHad()
490{
491 TVector event;
492 *fData >> event;
493
494 return TreeHad(event);
495}
496
497Bool_t MRanTree::AsciiWrite(ostream &out) const
498{
499 TString str;
500 Int_t k;
501
502 out.width(5);out<<fNumNodes<<endl;
503
504 for (k=0;k<fNumNodes;k++)
505 {
506 str=Form("%f",GetBestSplit(k));
507
508 out.width(5); out << k;
509 out.width(5); out << GetNodeStatus(k);
510 out.width(5); out << GetTreeMap1(k);
511 out.width(5); out << GetTreeMap2(k);
512 out.width(5); out << GetBestVar(k);
513 out.width(15); out << str<<endl;
514 out.width(5); out << GetNodeClass(k);
515 }
516 out<<endl;
517
518 return k==fNumNodes;
519}
Note: See TracBrowser for help on using the repository browser.