tmva_remid_train_classifiers.C
Go to the documentation of this file.
1 //std / stl stuff
2 #include <iostream>
3 #include <string>
4 #include <sstream>
5 #include <vector>
6 #include <map>
7 
8 //ROOT
9 #include "TTree.h"
10 #include "TFile.h"
11 #include "TCut.h"
12 
13 //TMVA
14 #include "TMVA/Factory.h"
15 #include "TMVA/Types.h"
16 
17 #define INFO std::cerr << "INFO : "
18 #define ERROR std::cerr << "ERROR : "
19 
20 #ifdef __CINT__
// Interpreted-mode (CINT) fallback: this macro needs compiled-only features,
// so when run uninterpreted we just report the requirement and do nothing.
void tmva_remid_train_classifiers(std::string trainingFileName, std::string outputFileName, std::string classifierList="KNN", bool debug=true){
  std::cerr << "ERROR : " << "You must run in compiled mode" << std::endl;
}
24 
25 #else
26 
27 struct Classifier{
28 public:
29  Classifier(TMVA::Types::EMVA type=TMVA::Types::kVariable, std::string options=""):
30  fType(type),
32  {}
33 
34  TMVA::Types::EMVA fType;
36 };
37 
38 std::map<std::string, Classifier> makeClassifierMapKNN(){
39  std::map<std::string, Classifier> classifiers;
40 
41  classifiers["KNN80Default"] = Classifier(TMVA::Types::kKNN, "nkNN=80:ScaleFrac=0.8:SigmaFact=1.0:Kernel=Gaus:UseKernel=F:UseWeight=T:!Trim");
42  // classifiers["KNN80Gaus"] = Classifier(TMVA::Types::kKNN, "nkNN=80:ScaleFrac=0.8:SigmaFact=1.0:Kernel=Gaus:UseKernel=T:UseWeight=T:!Trim");
43  return classifiers;
44 }
45 
46 std::map<std::string, Classifier> makeClassifierMapSVM(){
47  std::map<std::string, Classifier> classifiers;
48  classifiers["SVMA"] = Classifier(TMVA::Types::kSVM, "Gamma=1:C=0.1");
49  classifiers["SVMB"] = Classifier( TMVA::Types::kSVM, "Gamma=0.25:Tol=0.001:VarTransform=Norm" );
50  return classifiers;
51 }
52 
53 std::map<std::string, Classifier> makeClassifierMapMLP(){
54  std::map<std::string, Classifier> classifiers;
55  // TMVA ANN: MLP (recommended ANN) -- all ANNs in TMVA are Multilayer Perceptrons
56  classifiers["MLP"] = Classifier(TMVA::Types::kMLP, "H:!V:NeuronType=tanh:VarTransform=N:NCycles=600:HiddenLayers=N+5:TestRate=5:!UseRegulator" );
57  classifiers["MLPBFGS"] = Classifier(TMVA::Types::kMLP, "H:!V:NeuronType=tanh:VarTransform=N:NCycles=600:HiddenLayers=N+5:TestRate=5:TrainingMethod=BFGS:!UseRegulator" );
58  classifiers["MLPBNN"] = Classifier(TMVA::Types::kMLP, "H:!V:NeuronType=tanh:VarTransform=N:NCycles=600:HiddenLayers=N+5:TestRate=5:TrainingMethod=BFGS:UseRegulator" ); // BFGS training with bayesian regulators
59 
60  return classifiers;
61 }
62 
63 std::map<std::string, Classifier> makeClassifierMapBDT(){
64  std::map<std::string, Classifier> classifiers;
65  // Gradient Boost
66  classifiers["BDTG"] = Classifier( TMVA::Types::kBDT,
67  "!H:!V:NTrees=1000:MinNodeSize=2.5%:BoostType=Grad:Shrinkage=0.10:UseBaggedBoost:BaggedSampleFraction=0.5:nCuts=20:MaxDepth=2" );
68  // Adaptive Boost
69  classifiers["BDTA"] = Classifier( TMVA::Types::kBDT,
70  "!H:!V:NTrees=850:MinNodeSize=2.5%:MaxDepth=3:BoostType=AdaBoost:AdaBoostBeta=0.5:UseBaggedBoost:BaggedSampleFraction=0.5:SeparationType=GiniIndex:nCuts=20" );
71  // Bagging
72  // classifiers["BDTB"] = Classifier( TMVA::Types::kBDT,
73  // "!H:!V:NTrees=400:BoostType=Bagging:SeparationType=GiniIndex:nCuts=20" );
74  // Decorrelation + Adaptive Boost
75  classifiers["BDTD"] = Classifier( TMVA::Types::kBDT,
76  "!H:!V:NTrees=400:MinNodeSize=5%:MaxDepth=3:BoostType=AdaBoost:SeparationType=GiniIndex:nCuts=20:VarTransform=Decorrelate" );
77  // Allow Using Fisher discriminant in node splitting for (strong) linearly correlated variables
78  // classifiers["BDTF"] = Classifier( TMVA::Types::kBDT,
79  // "!H:!V:NTrees=50:MinNodeSize=2.5%:UseFisherCuts:MaxDepth=3:BoostType=AdaBoost:AdaBoostBeta=0.5:SeparationType=GiniIndex:nCuts=20" );
80 
81  return classifiers;
82 }
83 
84 
85 std::map<std::string, Classifier> makeClassifierMapAll(){
86  std::map<std::string, Classifier> classifiers;
87  std::vector<std::map<std::string, Classifier>> classifiersVec;
88 
89  classifiersVec.push_back(makeClassifierMapBDT());
90  classifiersVec.push_back(makeClassifierMapMLP());
91  classifiersVec.push_back(makeClassifierMapSVM());
92  classifiersVec.push_back(makeClassifierMapKNN());
93  for(auto classifierMap: classifiersVec) classifiers.insert(classifierMap.begin(), classifierMap.end());
94 
95  return classifiers;
96 }
97 
// Split `s` on `delim` and return the pieces in order.
//
// Matches std::getline semantics for single-char delimiters (the only kind
// used in this file): interior empty tokens are kept ("a::b" -> {"a","","b"}),
// a trailing delimiter produces no empty token ("a:" -> {"a"}), and an empty
// input yields an empty vector. Unlike the original — which silently used
// only delim[0] — the FULL delimiter string is now honoured; an empty delim
// returns the whole string as one token (the original's effective behaviour).
std::vector<std::string> tokenise(const std::string &s, const std::string &delim=":"){
  std::vector<std::string> items;
  if(s.empty()) return items;
  if(delim.empty()){ items.push_back(s); return items; }

  std::string::size_type start = 0;
  while(start <= s.size()){
    const std::string::size_type pos = s.find(delim, start);
    if(pos == std::string::npos){
      // Last piece: keep it only if non-empty (getline drops the trailing
      // empty read after a final delimiter).
      if(start < s.size()) items.push_back(s.substr(start));
      break;
    }
    items.push_back(s.substr(start, pos - start));
    start = pos + delim.size();
  }
  return items;
}
106 
107 TMVA::Factory* prepareFactory(std::string name, TFile* fpTraining, TFile* fpOutput, bool debug=true){
108 
109  INFO << "preparing factory, debug: " << debug << std::endl;
110  TTree *signalTree = (TTree*)fpTraining->FindObjectAny("SigTree");
111  TTree *backgroundTree = (TTree*)fpTraining->FindObjectAny("BackTree");
112 
113  //Create the TMVA factory
114 
115  std::string factoryOptions;
116  factoryOptions += "!V";
117  factoryOptions += ":!Silent";
118  factoryOptions += ":Color";
119  factoryOptions += ":DrawProgressBar";
120  //These transformations don't do anything other than produce plots. They are not used in the MVAs
121  factoryOptions += ":Transformations=I;D;P;G;D";
122  factoryOptions += ":AnalysisType=Classification";
123 
124  for(auto option : tokenise(factoryOptions)) INFO << "Factory options: " << option << std::endl;
125  INFO << std::endl;
126 
127  TMVA::Factory *factory = new TMVA::Factory(name, fpOutput, factoryOptions);
128 
129  factory->AddVariable("trackLength","track length","cm",'F');
130  factory->AddVariable("dedxSep","dedxSep","",'F');
131  factory->AddVariable("scatSep","scatSep","",'F');
132  factory->AddVariable("measFrac","measFrac","",'F');
133  factory->AddSignalTree(signalTree,1.0);
134  factory->AddBackgroundTree(backgroundTree,1.0);
135 
136  //Find out how many events we have after the cuts are applied
137  int numSignalEvents = signalTree->Draw("","1");
138  int numBackgroundEvents = backgroundTree->Draw("","1");
139 
140  INFO << "signalTree has: " << signalTree->GetEntries() << " entries" << std::endl;
141  INFO << "backgroundTree has: " << backgroundTree->GetEntries() << " entries" << std::endl;
142  INFO << std::endl;
143  INFO << "signalTree has: " << numSignalEvents << " entries that pass cuts" << std::endl;
144  INFO << "backgroundTree has: " << numBackgroundEvents << " entries that pass cuts" << std::endl;
145  INFO << std::endl;
146  INFO << "Reseting TEventLists" << std::endl;
147  INFO << std::endl;
148 
149  //Reset the TEventLists in the TTrees (i.e. remove the cuts)
150 
151  signalTree->SetEventList(0);
152  backgroundTree->SetEventList(0);
153 
154  int numEvents = numBackgroundEvents;
155  if(numSignalEvents < numBackgroundEvents) numEvents = numSignalEvents;
156  if(debug) numEvents = 1e4;
157 
158  int numTrainEvents = int(numEvents*0.9);
159  int numTestEvents = int(numEvents*0.1);
160 
161  std::string trainTestOptions;
162  trainTestOptions += "nTrain_Signal=" + std::to_string(numTrainEvents);
163  trainTestOptions += ":nTrain_Background=" + std::to_string(numTrainEvents);
164  trainTestOptions += ":nTest_Signal=" + std::to_string(numTestEvents);
165  trainTestOptions += ":nTest_Background=" + std::to_string(numTestEvents);
166  //Split the data set randomly
167  trainTestOptions += ":SplitMode=Random";
168  //SplitSeed=0 random number generator starts with random seed
169  trainTestOptions += ":SplitSeed=100";
170 
171  for(auto option : tokenise(trainTestOptions)) INFO << "Train/Test options: " << option << std::endl;
172  INFO << std::endl;
173 
174  factory->PrepareTrainingAndTestTree("","",trainTestOptions);
175 
176  return factory;
177 
178 }
179 
180 
181 void tmva_remid_train_classifiers(std::string trainingFileName, std::string outputFileName, std::string classifierList="KNN", bool debug=false){
182 
183  INFO << "trainingFileName: " << trainingFileName << std::endl;
184  INFO << "outputFileName: " << outputFileName << std::endl;
185  INFO << "debug: " << debug << std::endl;
186  INFO << std::endl;
187 
188  TFile* fpTraining = TFile::Open(trainingFileName.c_str(), "READ");
189  TFile* fpOutput = TFile::Open(outputFileName.c_str(), "RECREATE");
190 
191  if(!fpTraining || fpTraining->IsZombie()){ ERROR << "Failed to open training file" << std::endl; return;}
192  if(!fpOutput || fpOutput->IsZombie()){ ERROR << "Failed to open output file" << std::endl; return;}
193 
194  TMVA::Factory *factory = prepareFactory("tmva_muonid", fpTraining, fpOutput, debug);
195 
196 
197  std::map<std::string, Classifier> classifiers;
198  std::vector<std::string> classifierListVec = tokenise(classifierList);
199  for(auto classifierType : classifierListVec){
200  INFO << "classifierType: " << classifierType << std::endl;
201  std::map<std::string, Classifier> theseClassifiers;
202  if(classifierType == "BDT") theseClassifiers = makeClassifierMapBDT();
203  if(classifierType == "MLP") theseClassifiers = makeClassifierMapMLP();
204  if(classifierType == "SVM") theseClassifiers = makeClassifierMapSVM();
205  if(classifierType == "KNN") theseClassifiers = makeClassifierMapKNN();
206  classifiers.insert(theseClassifiers.begin(), theseClassifiers.end());
207  }
208 
209  for(auto thisClassifierPair : classifiers){
210  std::string thisName = thisClassifierPair.first;
211  INFO << "classifier name: " << thisName << std::endl;
212  for(auto option : tokenise(thisClassifierPair.second.fOptions)) INFO << thisName << " options: " << option << std::endl;
213  INFO << std::endl;
214  factory->BookMethod(thisClassifierPair.second.fType, thisClassifierPair.first, thisClassifierPair.second.fOptions);
215 
216  }//classifiers
217 
218 
219  factory->TrainAllMethods();
220  factory->TestAllMethods();
221  factory->EvaluateAllMethods();
222 
223  fpOutput->Close();
224 
225  delete factory;
226 
227 
228 }
229 
230 #endif
231 
232 
std::map< std::string, Classifier > makeClassifierMapSVM()
const XML_Char * name
Definition: expat.h:151
std::map< std::string, Classifier > makeClassifierMapKNN()
Float_t ss
Definition: plot.C:24
std::vector< std::string > tokenise(const std::string &s, const std::string &delim=":")
TMVA::Factory * prepareFactory(std::string name, TFile *fpTraining, TFile *fpOutput, bool debug=true)
::xsd::cxx::tree::type type
Definition: Database.h:110
const XML_Char * s
Definition: expat.h:262
Classifier(TMVA::Types::EMVA type=TMVA::Types::kVariable, std::string options="")
TMVA::Types::EMVA fType
std::map< std::string, Classifier > makeClassifierMapAll()
void tmva_remid_train_classifiers(std::string trainingFileName, std::string outputFileName, std::string classifierList="KNN", bool debug=false)
::xsd::cxx::tree::string< char, simple_type > string
Definition: Database.h:154
std::string to_string(ModuleType mt)
Definition: ModuleType.h:32
std::map< std::string, Classifier > makeClassifierMapBDT()
std::map< std::string, Classifier > makeClassifierMapMLP()