source: trunk/MagicSoft/Mars/mranforest/MRanTree.cc@ 2273

Last change on this file since 2273 was 2173, checked in by tbretz, 22 years ago
*** empty log message ***
File size: 15.3 KB
Line 
1/* ======================================================================== *\
2!
3! *
4! * This file is part of MARS, the MAGIC Analysis and Reconstruction
5! * Software. It is distributed to you in the hope that it can be a useful
6! * and timesaving tool in analysing Data of imaging Cerenkov telescopes.
7! * It is distributed WITHOUT ANY WARRANTY.
8! *
9! * Permission to use, copy, modify and distribute this software and its
10! * documentation for any purpose is hereby granted without fee,
11! * provided that the above copyright notice appear in all copies and
12! * that both that copyright notice and this permission notice appear
13! * in supporting documentation. It is provided "as is" without express
14! * or implied warranty.
15! *
16!
17!
18! Author(s): Thomas Hengstebeck 3/2003 <mailto:hengsteb@alwa02.physik.uni-siegen.de>
19!
20! Copyright: MAGIC Software Development, 2000-2003
21!
22!
23\* ======================================================================== */
24
25/////////////////////////////////////////////////////////////////////////////
26// //
27// MRanTree //
28// //
29// ParameterContainer for Tree structure //
30// //
31// //
32/////////////////////////////////////////////////////////////////////////////
33#include "MRanTree.h"
34
35#include <iostream>
36
37#include <TVector.h>
38#include <TMatrix.h>
39#include <TRandom.h>
40
41#include "MDataArray.h"
42
43#include "MLog.h"
44#include "MLogManip.h"
45
46ClassImp(MRanTree);
47
48using namespace std;
49
50// --------------------------------------------------------------------------
51//
52// Default constructor.
53//
54MRanTree::MRanTree(const char *name, const char *title):fNdSize(0), fNumTry(3), fData(NULL)
55{
56
57 fName = name ? name : "MRanTree";
58 fTitle = title ? title : "Storage container for structure of a single tree";
59}
60
61void MRanTree::SetNdSize(Int_t n)
62{
63 // threshold nodesize of terminal nodes, i.e. the training data is splitted
64 // until there is only pure date in the subsets(=terminal nodes) or the
65 // subset size is LE n
66
67 fNdSize=TMath::Max(1,n);//at least 1 event per node
68}
69
70void MRanTree::SetNumTry(Int_t n)
71{
72 // number of trials in random split selection:
73 // choose at least 1 variable to split in
74
75 fNumTry=TMath::Max(1,n);
76}
77
78void MRanTree::GrowTree(TMatrix &mhad,TMatrix &mgam,Int_t numdata, Int_t numdim,TArrayI &hadtrue,
79 TArrayI &datasort,TArrayI &datarang,TArrayF &tclasspop,TArrayI &jinbag,
80 TArrayF &winbag,TArrayF &weight)
81{
82 // arrays have to be initialized with generous size, so number of total nodes (nrnodes)
83 // is estimated for worst case
84 Int_t nrnodes=2*numdata+1;
85
86 // number of events in bootstrap sample
87 Int_t ninbag=0;
88 for (Int_t n=0;n<numdata;n++)
89 if(jinbag[n]==1) ninbag++;
90
91 // weighted class populations after split
92 TArrayF wl(2); // left node
93 TArrayF wc(2);
94 TArrayF wr(2); // right node
95 TArrayI nc(2);
96
97 TArrayI bestsplit(nrnodes);
98 TArrayI bestsplitnext(nrnodes);
99 TArrayI nodepop(nrnodes);
100 TArrayI parent(nrnodes);
101 TArrayI nodex(numdata);
102 TArrayI nodestart(nrnodes);
103
104 TArrayI ncase(numdata);
105 TArrayI iv(numdim);
106 TArrayI idmove(numdata);
107
108 idmove.Reset();
109
110 fBestVar.Set(nrnodes);
111 fTreeMap1.Set(nrnodes);
112 fTreeMap2.Set(nrnodes);
113 fBestSplit.Set(nrnodes);
114
115 fTreeMap1.Reset();
116 fTreeMap2.Reset();
117 fBestSplit.Reset();
118
119 fGiniDec.Set(numdim);
120 fGiniDec.Reset();
121
122 // tree growing
123 BuildTree(datasort,datarang,hadtrue,numdim,numdata,bestsplit,
124 bestsplitnext,nodepop,nodestart,tclasspop,nrnodes,
125 idmove,ncase,parent,jinbag,iv,winbag,wr,wc,wl,ninbag);
126
127 // post processing, determine cut (or split) values fBestSplit
128 Int_t nhad=mhad.GetNrows();
129
130 for(Int_t k=0;k<nrnodes;k++)
131 {
132 Int_t bsp=bestsplit[k];
133 Int_t bspn=bestsplitnext[k];
134 Int_t msp=fBestVar[k];
135
136 if (GetNodeStatus(k)!=-1)
137 {
138 fBestSplit[k] = bsp<nhad ? mhad(bsp,msp):mgam(bsp-nhad,msp);
139 fBestSplit[k] += bspn<nhad ? mhad(bspn,msp):mgam(bspn-nhad,msp);
140 fBestSplit[k] /=2.;
141 }
142 }
143
144 // resizing arrays to save memory
145 fBestVar.Set(fNumNodes);
146 fTreeMap1.Set(fNumNodes);
147 fTreeMap2.Set(fNumNodes);
148 fBestSplit.Set(fNumNodes);
149}
150
151Int_t MRanTree::FindBestSplit(TArrayI &datasort,TArrayI &datarang,TArrayI &hadtrue,Int_t mdim,
152 Int_t numdata,Int_t ndstart,Int_t ndend,TArrayF &tclasspop,
153 Int_t &msplit,Float_t &decsplit,Int_t &nbest,TArrayI &ncase,
154 TArrayI &jinbag,TArrayI &iv,TArrayF &winbag,TArrayF &wr,
155 TArrayF &wc,TArrayF &wl,Int_t kbuild)
156{
157 // For the best split, msplit is the index of the variable (e.g Hillas par., zenith angle ,...)
158 // split on. decsplit is the decreae in impurity measured by Gini-index.
159 // nsplit is the case number of value of msplit split on,
160 // and nsplitnext is the case number of the next larger value of msplit.
161
162 Int_t mvar,nc,nbestvar=0,jstat,k;
163 Float_t crit,crit0,critmax,critvar=0;
164 Float_t rrn, rrd, rln, rld, u;
165
166 // compute initial values of numerator and denominator of Gini-index,
167 // Gini index= pno/dno
168 Float_t pno=0;
169 Float_t pdo=0;
170
171 for (Int_t j=0;j<2;j++)
172 {
173 pno+=tclasspop[j]*tclasspop[j];
174 pdo+=tclasspop[j];
175 }
176 crit0=pno/pdo;
177 jstat=0;
178
179 // start main loop through variables to find best split,
180 // (Gini-index as criterium crit)
181
182 critmax=-1.0e20; // FIXME: Replace by a constant from limits.h
183
184 // random split selection, number of trials = fNumTry
185 if(!gRandom)
186 {
187 *fLog << err << dbginf << "gRandom not initialized... aborting." << endl;
188 return kFALSE;
189 }
190 for(Int_t mt=0;mt<fNumTry;mt++)
191 {
192 mvar=Int_t(mdim*gRandom->Rndm());
193
194 // Gini index = rrn/rrd+rln/rld
195 rrn=pno;
196 rrd=pdo;
197 rln=0;
198 rld=0;
199 wl.Reset();
200
201 for (Int_t j=0;j<2;j++)
202 {
203 wr[j]=tclasspop[j];
204 }
205
206 critvar=-1.0e20;
207
208 for(Int_t nsp=ndstart;nsp<=ndend-1;nsp++)
209 {
210 nc=datasort[mvar*numdata+nsp];
211
212 u=winbag[nc];
213 k=hadtrue[nc];
214
215 rln=rln+u*(2*wl[k]+u);
216 rrn=rrn+u*(-2*wr[k]+u);
217 rld=rld+u;
218 rrd=rrd-u;
219
220 wl[k]=wl[k]+u;
221 wr[k]=wr[k]-u;
222
223 if (datarang[mvar*numdata+nc]<datarang[mvar*numdata+datasort[mvar*numdata+nsp+1]])
224 {
225 if(TMath::Min(rrd,rld)>1.0e-5)
226 {
227 crit=(rln/rld)+(rrn/rrd);
228 if (crit>critvar)
229 {
230 nbestvar=nsp;
231 critvar=crit;
232 }
233 }
234 }
235 }
236
237 if (critvar>critmax) {
238 msplit=mvar;
239 nbest=nbestvar;
240 critmax=critvar;
241 }
242 }
243
244 decsplit=critmax-crit0;
245 if (critmax<-1.0e10) jstat=1;
246
247 return jstat;
248}
249
250void MRanTree::MoveData(TArrayI &datasort,Int_t mdim,Int_t numdata,Int_t ndstart,
251 Int_t ndend,TArrayI &idmove,TArrayI &ncase,Int_t msplit,
252 Int_t nbest,Int_t &ndendl)
253{
254 // This is the heart of the BuildTree construction. Based on the best split
255 // the data in the part of datasort corresponding to the current node is moved to the
256 // left if it belongs to the left child and right if it belongs to the right child-node.
257
258 Int_t nc,k,ih;
259 TArrayI tdatasort(numdata);
260
261 // compute idmove = indicator of case nos. going left
262
263 for (Int_t nsp=ndstart;nsp<=nbest;nsp++)
264 {
265 nc=datasort[msplit*numdata+nsp];
266 idmove[nc]=1;
267 }
268 for (Int_t nsp=nbest+1;nsp<=ndend;nsp++)
269 {
270 nc=datasort[msplit*numdata+nsp];
271 idmove[nc]=0;
272 }
273 ndendl=nbest;
274
275 // shift case. nos. right and left for numerical variables.
276
277 for(Int_t msh=0;msh<mdim;msh++)
278 {
279 k=ndstart-1;
280 for (Int_t n=ndstart;n<=ndend;n++)
281 {
282 ih=datasort[msh*numdata+n];
283 if (idmove[ih]==1) {
284 k++;
285 tdatasort[k]=datasort[msh*numdata+n];
286 }
287 }
288
289 for (Int_t n=ndstart;n<=ndend;n++)
290 {
291 ih=datasort[msh*numdata+n];
292 if (idmove[ih]==0){
293 k++;
294 tdatasort[k]=datasort[msh*numdata+n];
295 }
296 }
297 for(Int_t k=ndstart;k<=ndend;k++)
298 datasort[msh*numdata+k]=tdatasort[k];
299 }
300
301 // compute case nos. for right and left nodes.
302
303 for(Int_t n=ndstart;n<=ndend;n++)
304 ncase[n]=datasort[msplit*numdata+n];
305}
306
307void MRanTree::BuildTree(TArrayI &datasort,TArrayI &datarang,TArrayI &hadtrue,Int_t mdim,
308 Int_t numdata,TArrayI &bestsplit,TArrayI &bestsplitnext,
309 TArrayI &nodepop,TArrayI &nodestart,TArrayF &tclasspop,
310 Int_t nrnodes,TArrayI &idmove,TArrayI &ncase,TArrayI &parent,
311 TArrayI &jinbag,TArrayI &iv,TArrayF &winbag,TArrayF &wr,TArrayF &wc,
312 TArrayF &wl,Int_t ninbag)
313{
314 // Buildtree consists of repeated calls to two void functions, FindBestSplit and MoveData.
315 // Findbestsplit does just that--it finds the best split of the current node.
316 // MoveData moves the data in the split node right and left so that the data
317 // corresponding to each child node is contiguous.
318 //
319 // buildtree bookkeeping:
320 // ncur is the total number of nodes to date. nodestatus(k)=1 if the kth node has been split.
321 // nodestatus(k)=2 if the node exists but has not yet been split, and =-1 if the node is
322 // terminal. A node is terminal if its size is below a threshold value, or if it is all
323 // one class, or if all the data-values are equal. If the current node k is split, then its
324 // children are numbered ncur+1 (left), and ncur+2(right), ncur increases to ncur+2 and
325 // the next node to be split is numbered k+1. When no more nodes can be split, buildtree
326 // returns.
327
328 Int_t msplit,nbest,ndendl,nc,jstat,ndend,ndstart;
329 Float_t decsplit=0;
330 Float_t popt1,popt2,pp;
331 TArrayF classpop;
332 TArrayI nodestatus;
333
334 nodestatus.Set(nrnodes);
335 classpop.Set(2*nrnodes);
336
337 nodestatus.Reset();
338 nodestart.Reset();
339 nodepop.Reset();
340 classpop.Reset();
341
342
343 for (Int_t j=0;j<2;j++)
344 classpop[j*nrnodes+0]=tclasspop[j];
345
346 Int_t ncur=0;
347 nodestart[0]=0;
348 nodepop[0]=ninbag;
349 nodestatus[0]=2;
350
351 // start main loop
352 for (Int_t kbuild=0;kbuild<nrnodes;kbuild++)
353 {
354 if (kbuild>ncur) break;
355 if (nodestatus[kbuild]!=2) continue;
356
357 // initialize for next call to FindBestSplit
358
359 ndstart=nodestart[kbuild];
360 ndend=ndstart+nodepop[kbuild]-1;
361 for (Int_t j=0;j<2;j++)
362 tclasspop[j]=classpop[j*nrnodes+kbuild];
363
364 jstat=FindBestSplit(datasort,datarang,hadtrue,mdim,numdata,
365 ndstart,ndend,tclasspop,msplit,decsplit,
366 nbest,ncase,jinbag,iv,winbag,wr,wc,wl,
367 kbuild);
368
369 if(jstat==1) {
370 nodestatus[kbuild]=-1;
371 continue;
372 }else{
373 fBestVar[kbuild]=msplit;
374 fGiniDec[msplit]+=decsplit;
375
376 bestsplit[kbuild]=datasort[msplit*numdata+nbest];
377 bestsplitnext[kbuild]=datasort[msplit*numdata+nbest+1];
378 }
379
380 MoveData(datasort,mdim,numdata,ndstart,ndend,idmove,ncase,
381 msplit,nbest,ndendl);
382
383 // leftnode no.= ncur+1, rightnode no. = ncur+2.
384
385 nodepop[ncur+1]=ndendl-ndstart+1;
386 nodepop[ncur+2]=ndend-ndendl;
387 nodestart[ncur+1]=ndstart;
388 nodestart[ncur+2]=ndendl+1;
389
390 // find class populations in both nodes
391
392 for (Int_t n=ndstart;n<=ndendl;n++)
393 {
394 nc=ncase[n];
395 Int_t j=hadtrue[nc];
396 classpop[j*nrnodes+ncur+1]+=winbag[nc];
397 }
398
399 for (Int_t n=ndendl+1;n<=ndend;n++)
400 {
401 nc=ncase[n];
402 Int_t j=hadtrue[nc];
403 classpop[j*nrnodes+ncur+2]+=winbag[nc];
404 }
405
406 // check on nodestatus
407
408 nodestatus[ncur+1]=2;
409 nodestatus[ncur+2]=2;
410 if (nodepop[ncur+1]<=fNdSize) nodestatus[ncur+1]=-1;
411 if (nodepop[ncur+2]<=fNdSize) nodestatus[ncur+2]=-1;
412 popt1=0;
413 popt2=0;
414 for (Int_t j=0;j<2;j++)
415 {
416 popt1+=classpop[j*nrnodes+ncur+1];
417 popt2+=classpop[j*nrnodes+ncur+2];
418 }
419
420 for (Int_t j=0;j<2;j++)
421 {
422 if (classpop[j*nrnodes+ncur+1]==popt1) nodestatus[ncur+1]=-1;
423 if (classpop[j*nrnodes+ncur+2]==popt2) nodestatus[ncur+2]=-1;
424 }
425
426 fTreeMap1[kbuild]=ncur+1;
427 fTreeMap2[kbuild]=ncur+2;
428 parent[ncur+1]=kbuild;
429 parent[ncur+2]=kbuild;
430 nodestatus[kbuild]=1;
431 ncur+=2;
432 if (ncur>=nrnodes) break;
433 }
434
435 // determine number of nodes
436 fNumNodes=nrnodes;
437 for (Int_t k=nrnodes-1;k>=0;k--)
438 {
439 if (nodestatus[k]==0) fNumNodes-=1;
440 if (nodestatus[k]==2) nodestatus[k]=-1;
441 }
442
443 fNumEndNodes=0;
444 for (Int_t kn=0;kn<fNumNodes;kn++)
445 if(nodestatus[kn]==-1)
446 {
447 fNumEndNodes++;
448 pp=0;
449 for (Int_t j=0;j<2;j++)
450 {
451 if(classpop[j*nrnodes+kn]>pp)
452 {
453 // class + status of node kn coded into fBestVar[kn]
454 fBestVar[kn]=j-2;
455 pp=classpop[j*nrnodes+kn];
456 }
457 }
458 fBestSplit[kn] =classpop[1*nrnodes+kn];
459 fBestSplit[kn]/=(classpop[0*nrnodes+kn]+classpop[1*nrnodes+kn]);
460 }
461}
462
463void MRanTree::SetRules(MDataArray *rules)
464{
465 fData=rules;
466}
467
468Double_t MRanTree::TreeHad(const TVector &event)
469{
470 Int_t kt=0;
471 // to optimize on storage space node status and node class
472 // are coded into fBestVar:
473 // status of node kt = TMath::Sign(1,fBestVar[kt])
474 // class of node kt = fBestVar[kt]+2 (class defined by larger
475 // node population, actually not used)
476 // hadronness assigned to node kt = fBestSplit[kt]
477
478 for (Int_t k=0;k<fNumNodes;k++)
479 {
480 if (fBestVar[kt]<0)
481 break;
482
483 const Int_t m=fBestVar[kt];
484
485 kt = event(m)<=fBestSplit[kt] ? fTreeMap1[kt] : fTreeMap2[kt];
486 }
487
488 return fBestSplit[kt];
489}
490
491Double_t MRanTree::TreeHad()
492{
493 TVector event;
494 *fData >> event;
495
496 return TreeHad(event);
497}
498
499Bool_t MRanTree::AsciiWrite(ostream &out) const
500{
501 TString str;
502 Int_t k;
503
504 out.width(5);out<<fNumNodes<<endl;
505
506 for (k=0;k<fNumNodes;k++)
507 {
508 str=Form("%f",GetBestSplit(k));
509
510 out.width(5); out << k;
511 out.width(5); out << GetNodeStatus(k);
512 out.width(5); out << GetTreeMap1(k);
513 out.width(5); out << GetTreeMap2(k);
514 out.width(5); out << GetBestVar(k);
515 out.width(15); out << str<<endl;
516 out.width(5); out << GetNodeClass(k);
517 }
518 out<<endl;
519
520 return k==fNumNodes;
521}
Note: See TracBrowser for help on using the repository browser.