source: trunk/MagicSoft/Mars/manalysis/MRanForest.cc@ 1976

Last change on this file since 1976 was 1880, checked in by tbretz, 22 years ago
*** empty log message ***
File size: 11.7 KB
Line 
1/* ======================================================================== *\
2!
3! *
4! * This file is part of MARS, the MAGIC Analysis and Reconstruction
5! * Software. It is distributed to you in the hope that it can be a useful
6! * and timesaving tool in analysing Data of imaging Cerenkov telescopes.
7! * It is distributed WITHOUT ANY WARRANTY.
8! *
9! * Permission to use, copy, modify and distribute this software and its
10! * documentation for any purpose is hereby granted without fee,
11! * provided that the above copyright notice appear in all copies and
12! * that both that copyright notice and this permission notice appear
13! * in supporting documentation. It is provided "as is" without express
14! * or implied warranty.
15! *
16!
17!
18! Author(s): Thomas Hengstebeck 3/2003 <mailto:hengsteb@alwa02.physik.uni-siegen.de>
19!
20! Copyright: MAGIC Software Development, 2000-2003
21!
22!
23\* ======================================================================== */
24
25/////////////////////////////////////////////////////////////////////////////
26// //
27// MRanForest //
28// //
29// ParameterContainer for Forest structure //
30// //
31// A random forest can be grown by calling GrowForest. //
32// In advance SetupGrow must be called in order to initialize arrays and //
33// do some preprocessing. //
34// GrowForest() provides the training data for a single tree (bootstrap //
35// aggregate procedure) //
36// //
37// Essentially two random elements serve to provide a "random" forest, //
38// namely bootstrap aggregating (which is done in GrowForest()) and random //
39// split selection (which is subject to MRanTree::GrowTree()) //
40// //
41/////////////////////////////////////////////////////////////////////////////
42#include "MRanForest.h"
43
44#include <TMatrix.h>
45#include <TRandom3.h>
46
47#include "MHMatrix.h"
48#include "MRanTree.h"
49
50#include "MLog.h"
51#include "MLogManip.h"
52
53ClassImp(MRanForest);
54
55// --------------------------------------------------------------------------
56//
57// Default constructor.
58//
59MRanForest::MRanForest(const char *name, const char *title) : fNumTrees(100), fRanTree(NULL),fUsePriors(kFALSE)
60{
61 fName = name ? name : "MRanForest";
62 fTitle = title ? title : "Storage container for Random Forest";
63
64 fForest=new TObjArray();
65 fForest->SetOwner(kTRUE);
66}
67
68// --------------------------------------------------------------------------
69//
70// Destructor.
71//
72MRanForest::~MRanForest()
73{
74 delete fForest;
75}
76
77void MRanForest::SetNumTrees(Int_t n)
78{
79 //at least 1 tree
80 fNumTrees=TMath::Max(n,1);
81 fTreeHad.Set(fNumTrees);
82 fTreeHad.Reset();
83}
84
85void MRanForest::SetPriors(Float_t prior_had, Float_t prior_gam)
86{
87 Float_t sum;
88
89 sum=prior_gam+prior_had;
90
91 prior_gam/=sum;
92 prior_had/=sum;
93
94 fPrior[0]=prior_had;
95 fPrior[1]=prior_gam;
96
97 fUsePriors=kTRUE;
98}
99
100Double_t MRanForest::CalcHadroness(TVector &event)
101{
102 Double_t hadroness=0;
103 Int_t ntree=0;
104 MRanTree *tree;
105
106 TIter forest(fForest);
107 forest.Reset();
108
109 while ((tree=(MRanTree*)forest.Next()))
110 {
111 fTreeHad[ntree]=tree->TreeHad(event);
112 hadroness+=fTreeHad[ntree];
113 ntree++;
114 }
115 return hadroness/Double_t(ntree);
116}
117
118Bool_t MRanForest::AddTree(MRanTree *rantree=NULL)
119{
120 if (rantree)
121 fRanTree=rantree;
122 if (!fRanTree)
123 return kFALSE;
124
125 fForest->Add((MRanTree*)fRanTree->Clone());
126
127 return kTRUE;
128}
129
130Bool_t MRanForest::SetupGrow(MHMatrix *mhad,MHMatrix *mgam)
131{
132 // pointer to training data
133 fHadrons=mhad;
134 fGammas=mgam;
135
136 // determine data entries and dimension of Hillas-parameter space
137 fNumHad=fHadrons->GetM().GetNrows();
138 fNumGam=fGammas->GetM().GetNrows();
139 fNumDim=fHadrons->GetM().GetNcols();
140
141 if (fNumDim!=fHadrons->GetM().GetNcols()) return kFALSE;
142
143 fNumData=fNumHad+fNumGam;
144
145 // allocating and initializing arrays
146 fHadTrue.Set(fNumData);
147 fHadTrue.Reset();
148 fHadEst.Set(fNumData);
149
150 fPrior.Set(2);
151 fClassPop.Set(2);
152 fWeight.Set(fNumData);
153 fNTimesOutBag.Set(fNumData);
154 fNTimesOutBag.Reset();
155
156 fDataSort.Set(fNumDim*fNumData);
157 fDataRang.Set(fNumDim*fNumData);
158
159 // calculating class populations (= no. of gammas and hadrons)
160 fClassPop.Reset();
161 for(Int_t n=0;n<fNumData;n++)
162 fClassPop[fHadTrue[n]]++;
163
164 // setting weights taking into account priors
165 if (!fUsePriors)
166 {
167 fWeight.Reset(1.);
168 }else{
169 for(Int_t j=0;j<2;j++)
170 fPrior[j] *= (fClassPop[j]>=1) ?
171 Float_t(fNumData)/Float_t(fClassPop[j]):0;
172
173 for(Int_t n=0;n<fNumData;n++)
174 fWeight[n]=fPrior[fHadTrue[n]];
175 }
176
177 // create fDataSort
178 CreateDataSort();
179
180 if(!fRanTree)
181 {
182 *fLog << err << dbginf << "MRanForest, fRanTree not initialized... aborting." << endl;
183 return kFALSE;
184 }
185 fRanTree->SetRules(fGammas->GetColumns());
186 fTreeNo=0;
187
188 return kTRUE;
189}
190
191Bool_t MRanForest::GrowForest()
192{
193 Int_t ninbag=0;
194 TArrayI datsortinbag;
195 TArrayF classpopw;
196 TArrayI jinbag;
197 TArrayF winbag;
198
199 jinbag.Set(fNumData);
200 winbag.Set(fNumData);
201 classpopw.Set(2);
202
203 TMatrix hadrons=fHadrons->GetM();
204 TMatrix gammas=fGammas->GetM();
205
206 fTreeNo++;
207
208 // initialize running output
209 if(fTreeNo==1)
210 {
211 cout<<endl<<endl<<"1st col.: no. of tree"<<endl;
212 cout<<"2nd col.: error in % (calulated using oob-data -> overestim. of error)"<<endl;
213 }
214
215 jinbag.Reset();
216 classpopw.Reset();
217 winbag.Reset();
218
219 // bootstrap aggregating (bagging) -> sampling with replacement:
220 //
221 // The integer k is randomly (uniformly) chosen from the set
222 // {0,1,...,fNumData-1}, which is the set of the index numbers of
223 // all events in the training sample
224
225 for (Int_t n=0;n<fNumData;n++)
226 {
227 if(!gRandom)
228 {
229 *fLog << err << dbginf << "gRandom not initialized... aborting." << endl;
230 return kFALSE;
231 }
232
233 Int_t k=Int_t(fNumData*gRandom->Rndm());
234
235 classpopw[fHadTrue[k]]+=fWeight[k];
236 winbag[k]+=fWeight[k];
237 jinbag[k]=1;
238 }
239
240 // modifying sorted-data array for in-bag data:
241 //
242 // In bagging procedure ca. 2/3 of all elements in the original
243 // training sample are used to build the in-bag data
244 datsortinbag=fDataSort;
245
246 ModifyDataSort(datsortinbag,ninbag,jinbag);
247
248 // growing single tree
249 fRanTree->GrowTree(hadrons,gammas,fNumData,fNumDim,fHadTrue,datsortinbag,
250 fDataRang,classpopw,jinbag,winbag,fWeight);
251
252 // error-estimates from out-of-bag data (oob data):
253 //
254 // For a single tree the events not(!) contained in the bootstrap sample of
255 // this tree can be used to obtain estimates for the classification error of
256 // this tree.
257 // If you take a certain event, it is contained in the oob-data of 1/3 of
258 // the trees (see comment to ModifyData). This means that the classification error
259 // determined from oob-data is underestimated, but can still be taken as upper limit.
260
261 TVector event(fNumDim);
262 for(Int_t ievt=0;ievt<fNumHad;ievt++)
263 {
264 if(jinbag[ievt]>0)continue;
265 event=TMatrixRow(hadrons,ievt);
266 fHadEst[ievt]+=fRanTree->TreeHad(event);
267 fNTimesOutBag[ievt]++;
268 }
269 for(Int_t ievt=0;ievt<fNumGam;ievt++)
270 {
271 if(jinbag[fNumHad+ievt]>0)continue;
272 event=TMatrixRow(gammas,ievt);
273 fHadEst[fNumHad+ievt]+=fRanTree->TreeHad(event);
274 fNTimesOutBag[fNumHad+ievt]++;
275 }
276
277 Int_t n=0;
278 fErr=0;
279 for(Int_t ievt=0;ievt<fNumData;ievt++)
280 if(fNTimesOutBag[ievt]!=0)
281 {
282 fErr+=TMath::Power(fHadEst[ievt]/fNTimesOutBag[ievt]-fHadTrue[ievt],2.);
283 n++;
284 }
285
286 fErr/=Float_t(n);
287 fErr=TMath::Sqrt(fErr);
288
289 // give running output
290 cout << setw(5) << fTreeNo << setw(15) << Form("%.2f",100.*fErr) << endl;
291
292 // adding tree to forest
293 AddTree();
294
295 return(fTreeNo<fNumTrees);
296}
297
298void MRanForest::CreateDataSort()
299{
300 // The values of concatenated data arrays fHadrons and fGammas (denoted in the following by fData,
301 // which does actually not exist) are sorted from lowest to highest.
302 //
303 //
304 // fHadrons(0,0) ... fHadrons(0,nhad-1) fGammas(0,0) ... fGammas(0,ngam-1)
305 // . . . .
306 // fData(m,n) = . . . .
307 // . . . .
308 // fHadrons(m-1,0)...fHadrons(m-1,nhad-1) fGammas(m-1,0)...fGammas(m-1,ngam-1)
309 //
310 //
311 // Then fDataSort(m,n) is the event number in which fData(m,n) occurs.
312 // fDataRang(m,n) is the rang of fData(m,n), i.e. if rang = r, fData(m,n) is the r-th highest
313 // value of all fData(m,.). There may be more then 1 event with rang r (due to bagging).
314
315 TArrayF v(fNumData);
316 TArrayI isort(fNumData);
317
318 TMatrix hadrons=fHadrons->GetM();
319 TMatrix gammas=fGammas->GetM();
320
321 for (Int_t j=0;j<fNumHad;j++)
322 fHadTrue[j]=1;
323
324 for (Int_t j=0;j<fNumGam;j++)
325 fHadTrue[j+fNumHad]=0;
326
327 for (Int_t mvar=0;mvar<fNumDim;mvar++)
328 {
329 for(Int_t n=0;n<fNumHad;n++)
330 {
331 v[n]=hadrons(n,mvar);
332 isort[n]=n;
333 }
334
335 for(Int_t n=0;n<fNumGam;n++)
336 {
337 v[n+fNumHad]=gammas(n,mvar);
338 isort[n+fNumHad]=n;
339 }
340
341 TMath::Sort(fNumData,v.GetArray(),isort.GetArray(),kFALSE);
342
343 // this sorts the v[n] in ascending order. isort[n] is the event number
344 // of that v[n], which is the n-th from the lowest (assume the original
345 // event numbers are 0,1,...).
346
347 for(Int_t n=0;n<fNumData-1;n++)
348 {
349 Int_t n1=isort[n];
350 Int_t n2=isort[n+1];
351 fDataSort[mvar*fNumData+n]=n1;
352 if(n==0) fDataRang[mvar*fNumData+n1]=0;
353 if(v[n]<v[n+1])
354 {
355 fDataRang[mvar*fNumData+n2]=fDataRang[mvar*fNumData+n1]+1;
356 }else{
357 fDataRang[mvar*fNumData+n2]=fDataRang[mvar*fNumData+n1];
358 }
359 }
360 fDataSort[(mvar+1)*fNumData-1]=isort[fNumData-1];
361 }
362
363 return;
364}
365
366void MRanForest::ModifyDataSort(TArrayI &datsortinbag,Int_t ninbag,TArrayI &jinbag)
367{
368 ninbag=0;
369 for (Int_t n=0;n<fNumData;n++)
370 if(jinbag[n]==1) ninbag++;
371
372 for(Int_t m=0;m<fNumDim;m++)
373 {
374 Int_t k=0;
375 Int_t nt=0;
376 for(Int_t n=0;n<fNumData;n++)
377 {
378 if(jinbag[datsortinbag[m*fNumData+k]]==1)
379 {
380 datsortinbag[m*fNumData+nt]=datsortinbag[m*fNumData+k];
381 k++;
382 }else{
383 for(Int_t j=1;j<fNumData-k;j++)
384 {
385 if(jinbag[datsortinbag[m*fNumData+k+j]]==1)
386 {
387 datsortinbag[m*fNumData+nt]=datsortinbag[m*fNumData+k+j];
388 k+=j+1;
389 break;
390 }
391 }
392 }
393 nt++;
394 if(nt>=ninbag) break;
395 }
396 }
397 return;
398}
399
400Bool_t MRanForest::AsciiWrite(ostream &out) const
401{
402 Int_t n=0;
403 MRanTree *tree;
404 TIter forest(fForest);
405
406 while ((tree=(MRanTree*)forest.Next()))
407 {
408 tree->AsciiWrite(out);
409 n++;
410 }
411
412 return n==fNumTrees;
413}
Note: See TracBrowser for help on using the repository browser.