source: trunk/MagicSoft/Mars/manalysis/MRanForest.cc@ 1859

Last change on this file since 1859 was 1859, checked in by hengsteb, 22 years ago
*** empty log message ***
File size: 11.7 KB
Line 
1/* ======================================================================== *\
2!
3! *
4! * This file is part of MARS, the MAGIC Analysis and Reconstruction
5! * Software. It is distributed to you in the hope that it can be a useful
6! * and timesaving tool in analysing Data of imaging Cerenkov telescopes.
7! * It is distributed WITHOUT ANY WARRANTY.
8! *
9! * Permission to use, copy, modify and distribute this software and its
10! * documentation for any purpose is hereby granted without fee,
11! * provided that the above copyright notice appear in all copies and
12! * that both that copyright notice and this permission notice appear
13! * in supporting documentation. It is provided "as is" without express
14! * or implied warranty.
15! *
16!
17!
18! Author(s): Thomas Hengstebeck 3/2003 <mailto:hengsteb@alwa02.physik.uni-siegen.de>
19!
20! Copyright: MAGIC Software Development, 2000-2003
21!
22!
23\* ======================================================================== */
24
25/////////////////////////////////////////////////////////////////////////////
26// //
27// MRanForest //
28// //
29// ParameterContainer for Forest structure //
30// //
31// A random forest can be grown by calling GrowForest. //
32// In advance SetupGrow must be called in order to initialize arrays and //
33// do some preprocessing. //
34// GrowForest() provides the training data for a single tree (bootstrap //
35// aggregate procedure) //
36// //
37// Essentially two random elements serve to provide a "random" forest, //
38// namely bootstrap aggregating (which is done in GrowForest()) and random //
39// split selection (which is subject to MRanTree::GrowTree()) //
40/////////////////////////////////////////////////////////////////////////////
41#include "MRanForest.h"
42#include <iostream.h>
43#include <fstream.h>
44
45#include <TFile.h> // gFile
46
47#include "MLog.h"
48#include "MLogManip.h"
49
50
51ClassImp(MRanForest);
52// --------------------------------------------------------------------------
53//
54// Default constructor.
55//
56MRanForest::MRanForest(const char *name, const char *title) : fNumTrees(100), fRanTree(NULL),fUsePriors(kFALSE)
57{
58 fName = name ? name : "MRanForest";
59 fTitle = title ? title : "Storage container for Random Forest";
60}
61
62void MRanForest::SetNumTrees(Int_t n)
63{
64 //at least 1 tree
65 fNumTrees=TMath::Max(n,1);
66 fTreeHad.Set(fNumTrees);
67 fTreeHad.Reset();
68
69 return;
70}
71
72void MRanForest::SetPriors(Float_t prior_had, Float_t prior_gam)
73{
74 Float_t sum;
75
76 sum=prior_gam+prior_had;
77
78 prior_gam/=sum;
79 prior_had/=sum;
80
81 fPrior[0]=prior_had;
82 fPrior[1]=prior_gam;
83
84 fUsePriors=kTRUE;
85
86 return;
87}
88
89Double_t MRanForest::CalcHadroness(TVector &event)
90{
91 Double_t hadroness=0;
92 Double_t ntree=0;
93 MRanTree *tree;
94 TIter forest(fForest);
95 while ((tree=(MRanTree*)forest.Next()))
96 {
97 fTreeHad[ntree]=tree->TreeHad(event);
98 hadroness+=fTreeHad[ntree];
99 ntree++;
100 }
101 return hadroness/ntree;
102}
103
104void MRanForest::SetupForest()
105{
106 fForest=new TObjArray();
107 fForest->SetOwner(kTRUE);
108
109 return;
110}
111
112void MRanForest::SetTree(MRanTree *rantree)
113{
114 // initialize current tree for tree-growing loop in GrowForest
115 // ndsize + numtry are set in MRanForestGrow!!
116 fRanTree=rantree;
117
118 return;
119}
120
121Bool_t MRanForest::AddTree()
122{
123 if (!fRanTree)
124 return kFALSE;
125 MRanTree *newtree=(MRanTree*)fRanTree->Clone();
126 fForest->Add(newtree);
127
128 return kTRUE;
129}
130
131Bool_t MRanForest::SetupGrow(MHMatrix *mhad,MHMatrix *mgam)
132{
133 // pointer to training data
134 fHadrons=mhad;
135 fGammas=mgam;
136
137 // determine data entries and dimension of Hillas-parameter space
138 fNumHad=fHadrons->GetM().GetNrows();
139 fNumGam=fGammas->GetM().GetNrows();
140 fNumDim=fHadrons->GetM().GetNcols();
141
142 if (fNumDim!=fHadrons->GetM().GetNcols()) return kFALSE;
143
144 fNumData=fNumHad+fNumGam;
145
146 // allocating and initializing arrays
147 fHadTrue.Set(fNumData);
148 fHadTrue.Reset();
149 fHadEst.Set(fNumData);
150
151 fPrior.Set(2);
152 fClassPop.Set(2);
153 fWeight.Set(fNumData);
154 fNTimesOutBag.Set(fNumData);
155 fNTimesOutBag.Reset();
156
157 fGiniDec.Set(fNumDim);
158
159 fDataSort.Set(fNumDim*fNumData);
160 fDataRang.Set(fNumDim*fNumData);
161
162 // calculating class populations (= no. of gammas and hadrons)
163 fClassPop.Reset();
164 for(Int_t n=0;n<fNumData;n++)
165 fClassPop[fHadTrue[n]]++;
166
167 // setting weights taking into account priors
168 if (!fUsePriors)
169 {
170 fWeight.Reset(1.);
171 }else{
172 for(Int_t j=0;j<2;j++)
173 fPrior[j] *= (fClassPop[j]>=1) ?
174 Float_t(fNumData)/Float_t(fClassPop[j]):0;
175
176 for(Int_t n=0;n<fNumData;n++)
177 fWeight[n]=fPrior[fHadTrue[n]];
178 }
179
180 // create fDataSort
181 CreateDataSort();
182
183 SetupForest();
184
185 if(!fRanTree)return kFALSE;
186 fRanTree->SetRules(fGammas->GetColumns());
187 fTreeNo=0;
188
189 return kTRUE;
190}
191
192Bool_t MRanForest::GrowForest()
193{
194 Int_t ninbag=0;
195 TArrayI datsortinbag;
196 TArrayF classpopw;
197 TArrayI jinbag;
198 TArrayF winbag;
199
200 jinbag.Set(fNumData);
201 winbag.Set(fNumData);
202 classpopw.Set(2);
203
204 TMatrix hadrons=fHadrons->GetM();
205 TMatrix gammas=fGammas->GetM();
206
207 fTreeNo++;
208
209 // initialize running output
210 if(fTreeNo==1)
211 {
212 cout<<endl<<endl<<"1st col.: no. of tree"<<endl;
213 cout<<"2nd col.: error in % (calulated using oob-data -> overestim. of error)"<<endl;
214 }
215
216 jinbag.Reset();
217 classpopw.Reset();
218 winbag.Reset();
219
220 // bootstrap aggregating (bagging) -> sampling with replacement:
221 //
222 // The integer k is randomly (uniformly) chosen from the set
223 // {0,1,...,fNumData-1}, which is the set of the index numbers of
224 // all events in the training sample
225
226 for (Int_t n=0;n<fNumData;n++)
227 {
228 Int_t k=Int_t(fNumData*fRand.Rndm());
229
230 classpopw[fHadTrue[k]]+=fWeight[k];
231 winbag[k]+=fWeight[k];
232 jinbag[k]=1;
233 }
234
235 // modifying sorted-data array for in-bag data:
236 //
237 // In bagging procedure ca. 2/3 of all elements in the original
238 // training sample are used to build the in-bag data
239 datsortinbag=fDataSort;
240
241 ModifyDataSort(datsortinbag,ninbag,jinbag);
242
243 // growing single tree
244 fRanTree->GrowTree(hadrons,gammas,fNumData,fNumDim,fHadTrue,datsortinbag,
245 fDataRang,fGiniDec,classpopw,jinbag,winbag,fWeight,fRand);
246
247 // error-estimates from out-of-bag data (oob data):
248 //
249 // For a single tree the events not(!) contained in the bootstrap sample of
250 // this tree can be used to obtain estimates for the classification error of
251 // this tree.
252 // If you take a certain event, it is contained in the oob-data of 1/3 of
253 // the trees (see comment to ModifyData). This means that the classification error
254 // determined from oob-data is underestimated, but can still be taken as upper limit.
255
256 TVector event(fNumDim);
257 for(Int_t ievt=0;ievt<fNumHad;ievt++)
258 {
259 if(jinbag[ievt]>0)continue;
260 event=TMatrixRow(hadrons,ievt);
261 fHadEst[ievt]+=fRanTree->TreeHad(event);
262 fNTimesOutBag[ievt]++;
263 }
264 for(Int_t ievt=0;ievt<fNumGam;ievt++)
265 {
266 if(jinbag[fNumHad+ievt]>0)continue;
267 event=TMatrixRow(gammas,ievt);
268 fHadEst[fNumHad+ievt]+=fRanTree->TreeHad(event);
269 fNTimesOutBag[fNumHad+ievt]++;
270 }
271
272 Int_t n=0;
273 fErr=0;
274 for(Int_t ievt=0;ievt<fNumData;ievt++)
275 {
276 if(fNTimesOutBag[ievt]!=0)
277 {
278 fErr+=TMath::Power(fHadEst[ievt]/fNTimesOutBag[ievt]-fHadTrue[ievt],2.);
279 n++;
280 }
281 }
282 fErr/=Float_t(n);
283 fErr=TMath::Sqrt(fErr);
284
285 // give running output
286 TString str=Form("%.2f",100.*fErr);
287 cout.width(5); cout<<fTreeNo;
288 cout.width(15); cout<<str<<endl;
289
290 // adding tree to forest
291 AddTree();
292
293 return(fTreeNo<fNumTrees);
294}
295
296void MRanForest::CreateDataSort()
297{
298 // The values of concatenated data arrays fHadrons and fGammas (denoted in the following by fData,
299 // which does actually not exist) are sorted from lowest to highest.
300 //
301 //
302 // fHadrons(0,0) ... fHadrons(0,nhad-1) fGammas(0,0) ... fGammas(0,ngam-1)
303 // . . . .
304 // fData(m,n) = . . . .
305 // . . . .
306 // fHadrons(m-1,0)...fHadrons(m-1,nhad-1) fGammas(m-1,0)...fGammas(m-1,ngam-1)
307 //
308 //
309 // Then fDataSort(m,n) is the event number in which fData(m,n) occurs.
310 // fDataRang(m,n) is the rang of fData(m,n), i.e. if rang = r, fData(m,n) is the r-th highest
311 // value of all fData(m,.). There may be more then 1 event with rang r (due to bagging).
312
313
314 TArrayF v(fNumData);
315 TArrayI isort(fNumData);
316
317 TMatrix hadrons=fHadrons->GetM();
318 TMatrix gammas=fGammas->GetM();
319
320 for (Int_t j=0;j<fNumHad;j++)
321 {
322 fHadTrue[j]=1;
323 }
324
325 for (Int_t j=0;j<fNumGam;j++)
326 {
327 fHadTrue[j+fNumHad]=0;
328 }
329
330
331 for (Int_t mvar=0;mvar<fNumDim;mvar++)
332 {
333 for(Int_t n=0;n<fNumHad;n++)
334 {
335 v[n]=hadrons(n,mvar);
336 isort[n]=n;
337 }
338
339 for(Int_t n=0;n<fNumGam;n++)
340 {
341 v[n+fNumHad]=gammas(n,mvar);
342 isort[n+fNumHad]=n;
343 }
344
345 TMath::Sort(fNumData,v.GetArray(),isort.GetArray(),kFALSE);
346
347 // this sorts the v[n] in ascending order. isort[n] is the event number
348 // of that v[n], which is the n-th from the lowest (assume the original
349 // event numbers are 0,1,...).
350
351 for(Int_t n=0;n<fNumData-1;n++)
352 {
353 Int_t n1=isort[n];
354 Int_t n2=isort[n+1];
355 fDataSort[mvar*fNumData+n]=n1;
356 if(n==0) fDataRang[mvar*fNumData+n1]=0;
357 if(v[n]<v[n+1])
358 {
359 fDataRang[mvar*fNumData+n2]=fDataRang[mvar*fNumData+n1]+1;
360 }else{
361 fDataRang[mvar*fNumData+n2]=fDataRang[mvar*fNumData+n1];
362 }
363 }
364 fDataSort[(mvar+1)*fNumData-1]=isort[fNumData-1];
365 }
366
367 return;
368}
369
370void MRanForest::ModifyDataSort(TArrayI &datsortinbag,Int_t ninbag,TArrayI &jinbag)
371{
372 ninbag=0;
373 for (Int_t n=0;n<fNumData;n++)
374 if(jinbag[n]==1) ninbag++;
375
376 for(Int_t m=0;m<fNumDim;m++)
377 {
378 Int_t k=0;
379 Int_t nt=0;
380 for(Int_t n=0;n<fNumData;n++)
381 {
382 if(jinbag[datsortinbag[m*fNumData+k]]==1)
383 {
384 datsortinbag[m*fNumData+nt]=datsortinbag[m*fNumData+k];
385 k++;
386 }else{
387 for(Int_t j=1;j<fNumData-k;j++)
388 {
389 if(jinbag[datsortinbag[m*fNumData+k+j]]==1)
390 {
391 datsortinbag[m*fNumData+nt]=datsortinbag[m*fNumData+k+j];
392 k+=j+1;
393 break;
394 }
395 }
396 }
397 nt++;
398 if(nt>=ninbag) break;
399 }
400 }
401 return;
402}
403
404Bool_t MRanForest::AsciiWrite(ostream &out) const
405{
406 Int_t n=0;
407 MRanTree *tree;
408 TIter forest(fForest);
409
410 while ((tree=(MRanTree*)forest.Next()))
411 {
412 tree->AsciiWrite(out);
413 n++;
414 }
415
416 return n==fNumTrees;
417}
Note: See TracBrowser for help on using the repository browser.