7 #include "TParameter.h" 9 #include "TMVA/Config.h" 10 #include "TMVA/DataLoader.h" 11 #include "TMVA/DataInputHandler.h" 12 #include "TMVA/Factory.h" 13 #include "TMVA/MethodKNN.h" 14 #include "TMVA/Types.h" 16 #include "CAFAna/Core/Binning.h" 40 const std::vector<IFitVarOrISyst> & toFit,
42 : fMode(mode), fMCMCSamples(&samples)
44 for (
const auto & registerable : toFit)
46 if(registerable.ifitvar){
47 fVars.push_back(registerable.ifitvar);
50 else if (registerable.isyst){
51 fSysts.push_back(registerable.isyst);
55 assert(
false &&
"BayesianMarginal can only marginalize over IFitVars or ISysts");
62 std::cerr <<
"BayesianMarginal requires at least 100 MCMC samples to create a marginal distribution from them." <<
std::endl;
68 if (std::find(samples.
Vars().begin(), samples.
Vars().end(),
var) == samples.
Vars().end())
70 std::cerr <<
"Error: Requesting to marginalize over IFitVar not in MCMCSamples: " <<
var->ShortName() <<
std::endl;
74 for (
const auto & syst :
fSysts)
76 if (std::find(samples.
Systs().begin(), samples.
Systs().end(), syst) == samples.
Systs().end())
78 std::cerr <<
"Error: Requesting to marginalize over ISyst not in MCMCSamples: " << syst->ShortName() <<
std::endl;
87 std::cout <<
" Building a kNN from the MCMC points to estimate marginals..." << std::flush;
88 bool oldSilent = TMVA::gConfig().IsSilent();
89 TMVA::gConfig().SetSilent(
true);
105 auto trainTree = std::unique_ptr<TTree>(
const_cast<TTree *
>(samples.
ToTTree())->CloneTree());
106 const_cast<TTree *
>(samples.
ToTTree())->CopyAddresses(trainTree.get(),
true);
107 const_cast<TTree *
>(samples.
ToTTree())->GetListOfClones()->Remove(trainTree.get());
113 TMVA::gConfig().SetSilent(oldSilent);
119 std::vector<double> qVals;
121 std::back_inserter(qVals),
122 [](
const std::pair<Quantile, double>&
p){
return p.second;});
124 std::map<double, std::pair<std::size_t, double>> vals = samples.
QuantileLL(qVals);
140 assert(vals.size() ==
fkNN->DataInfo().GetNVariables());
142 TMVA::Event ev(vals,
fkNN->DataInfo().GetNVariables());
143 return dynamic_cast<TMVA::MethodBase*
>(
fkNN.get())->GetRegressionValues(&ev)[0];
149 auto varNames =
dynamic_cast<TList*
>(dir->Get(
"vars"));
150 for (
const auto & obj : *varNames)
152 auto systNames =
dynamic_cast<TList*
>(dir->Get(
"systs"));
156 auto mode =
dynamic_cast<TParameter<int>*
>(dir->Get(
"mode"));
164 auto tree =
dynamic_cast<TTree *
>(dir->Get(
"kNN_weights"));
165 tree->SetDirectory(
nullptr);
166 TFile tmpFile(
"tmp_surface.root",
"recreate");
169 marg->
fkNN->ReadWeightsFromStream(tmpFile);
171 gSystem->Unlink(
"tmp_surface.root");
174 auto quantMap =
dynamic_cast<TMap*
>(dir->Get(
"quantileSamples"));
175 for (
const auto & quantPair : *quantMap)
177 auto pair =
dynamic_cast<TPair*
>(quantPair);
178 auto quant =
Quantile(
dynamic_cast<TParameter<int>*
>(pair->Key())->GetVal());
180 std::forward_as_tuple(quant),
181 std::forward_as_tuple(*dynamic_cast<TMap*>(pair->Value()),
197 assert(pdf &&
"In 'histogram' marginal mode, must supply the histogram for which the quantile threshold is to be calculated");
211 assert(quantile >= 0 && quantile <= 1 &&
"QuantileSurface(): quantile must be between 0 and 1");
213 auto quantilePair = samples.
QuantileLL(quantile);
217 return quantilePair.second;
223 auto oldDir = gDirectory;
226 TParameter<int>
mode(
"mode",
int(
fMode));
234 TFile tmpFile(
"tmp_surface.root",
"recreate");
235 fkNN->WriteWeightsToStream(tmpFile);
236 auto tree =
dynamic_cast<TTree *
>(tmpFile.Get(
"knn"));
237 tree->SetDirectory(
nullptr);
239 tree->Write(
"kNN_weights");
241 gSystem->Unlink(
"tmp_surface.root");
246 quantMap.Add(
new TParameter<int>(
"",
int(quantPair.first)), quantPair.second.ToTMap().release());
247 quantMap.Write(
"quantileSamples", TObject::kSingleKey);
251 varNames.Add(
new TObjString(
var->ShortName().c_str()));
252 varNames.Write(
"vars", TObject::kSingleKey);
255 for (
const auto & syst :
fSysts)
256 systNames.Add(
new TObjString(syst->ShortName().c_str()));
257 systNames.Write(
"systs", TObject::kSingleKey);
269 entries = loader->DataInput().GetEntries();
270 auto opt = Form(
"KNN,nkNN=%d", entries > 20 ? 20 : entries-1);
271 fkNN = std::make_unique<TMVA::MethodKNN>(loader->GetDataSetInfo(),
opt);
276 fkNN = std::make_unique<TMVA::MethodKNN>(
info,
"KNN");
279 auto & dsInfo =
fkNN->DataInfo();
281 dsInfo.AddVariable(
var->ShortName());
282 for (
const auto & syst :
fSysts)
283 dsInfo.AddVariable(syst->ShortName());
286 dsInfo.SetSplitOptions(Form(
"nTrain_Regression=%d,nTest_Regression=0", entries));
288 fkNN->SetAnalysisType( TMVA::Types::kRegression );
290 fkNN->ParseOptions();
291 fkNN->ProcessSetup();
296 if (pdf->Integral() <= 0)
297 return std::numeric_limits<double>::signaling_NaN();
302 else if (quantile == 1)
304 else if (quantile < 0 || quantile > 1)
306 std::cerr <<
"BayesianMarginal::ThresholdFromTH1(): requested quantile = " << quantile <<
" is not between 0 and 1. Abort" <<
std::endl;
312 if (
std::abs(pdf->Integral() - 1) > 0.00001)
314 pdf =
dynamic_cast<TH1*
>(pdf->Clone());
315 const_cast<TH1*
>(
pdf)->
Scale(1./pdf->Integral());
319 std::vector<double> contents(pdf->GetNcells());
320 double overflowC = 0;
321 double underflowC = 0;
322 for (
int binIdx = 0; binIdx <= pdf->GetNcells(); binIdx++)
324 double binC = pdf->GetBinContent(binIdx);
328 if (pdf->IsBinOverflow(binIdx))
330 else if (pdf->IsBinUnderflow(binIdx))
333 contents.push_back(binC);
335 if ((1 - quantile) < underflowC || (1 - quantile) < overflowC)
337 std::cerr <<
"Warning: Histogram underflow or overflow contains more than requested quantile (" << quantile <<
") of events." <<
std::endl;
344 std::sort(contents.begin(), contents.end());
345 std::size_t
idx = (1 -
quantile) * (contents.size() - 1);
346 auto it = contents.begin();
347 decltype(
it) oldIt =
it;
348 std::advance(
it, idx);
349 double oldSum = std::accumulate(
it, contents.end(), 0.0);
352 while (
it != contents.begin() &&
it != contents.end())
355 if (oldSum == quantile)
358 std::advance(
it, forward ? +1 : -1);
359 double newSum = std::accumulate(
it, contents.end(), 0.0);
362 if ((newSum - quantile) / (oldSum - quantile) < 0)
371 return (*
it + *oldIt) / 2;
386 assert(bins.size() > 0 && bins.size() < 3);
395 name =
var->LatexName();
397 name = syst->LatexName();
398 return s +
";" +
name;
400 std::unique_ptr<TH1>
ret;
401 if (bins.size() == 1)
403 else if (bins.size() == 2)
409 std::unique_ptr<TH1> denom;
410 if (bins.size() == 1)
411 denom = std::make_unique<TH1D>(*dynamic_cast<TH1D*>(ret.get()));
412 else if (bins.size() == 2)
413 denom = std::make_unique<TH2D>(*dynamic_cast<TH2D*>(ret.get()));
426 std::vector<bool> isVar(
fVars.size() +
fSysts.size(),
true);
427 std::vector<VarOrSyst> varsOrSysts(
fVars.size() +
fSysts.size());
432 varsOrSysts[brIdx].
var =
var;
435 isVar[brIdx] =
false;
436 varsOrSysts[brIdx].syst = syst;
442 if (std::isnan(logprob))
443 std::cerr <<
"Warning: Encountered NaN log-probability in an MCMC sample. Other things will probably go wrong..." <<
std::endl;
444 std::map<const IFitVar*, double> varVals;
445 std::map<const ISyst*, double> systVals;
447 std::vector<double> vals;
448 for (std::size_t brIdx = 0; brIdx < varsOrSysts.size(); ++brIdx)
455 if (bins.size() == 1)
456 ret->Fill(vals[0], numWgt);
457 else if (bins.size() == 2)
459 dynamic_cast<TH2*
>(ret.get())->
Fill(vals[0], vals[1], numWgt);
460 dynamic_cast<TH2*
>(denom.get())->
Fill(vals[0], vals[1]);
462 else if (bins.size() == 3)
464 dynamic_cast<TH3*
>(ret.get())->
Fill(vals[0], vals[1], vals[2], numWgt);
465 dynamic_cast<TH3*
>(denom.get())->
Fill(vals[0], vals[1], vals[2]);
472 ret->Divide(denom.get());
473 for (
int binIdx = 0; binIdx < ret->GetNcells(); binIdx++)
476 if (ret->GetBinContent(binIdx) != 0.0)
477 ret->SetBinContent(binIdx, exp(ret->GetBinContent(binIdx)));
482 if (ret->Integral() > 0)
483 ret->Scale(1. / ret->Integral());
const XML_Char XML_Encoding * info
std::vector< const ISyst * > fSysts
Systs we want to marginalize to (see also fVars)
std::map< Quantile, MCMCSample > fQuantileSampleMap
The samples that are the boundaries for various commonly-used quantiles.
std::size_t NumSamples() const
How many samples do we have?
void SetupKNN(TMVA::DataLoader *loader=nullptr)
Cuts and Vars for the 2020 FD DiF Study.
const TTree * ToTTree() const
Get a TTree with the MCMC samples in it.
static void LoadInto(BayesianMarginal *marg, TDirectory *dir)
void SaveTo(TDirectory *dir) const
TH1D * MakeTH1D(const char *name, const char *title, const Binning &bins)
Utility function to avoid need to switch on bins.IsSimple()
std::unique_ptr< TMVA::MethodKNN > fkNN
the kNN used for approximating the surface from the points when in kNN mode
double SampleValue(const IFitVar *var, std::size_t idx) const
Get the value of Var var for sample number .
std::pair< std::size_t, double > QuantileLL(double quantile) const
const std::vector< const IFitVar * > & Vars() const
Which Vars are sampled in these samples?
std::vector< const IFitVar * > Vars() const
std::vector< std::string > fOrderedBrNames
the variables/systs to be marginalized over, in axis order (x-axis, y-axis, ...)
Encapsulate code to systematically shift a caf::SRProxy.
double EstimateLLFromKNN(const std::vector< float > &vals) const
MCMCSample Sample(std::size_t idx) const
static const std::vector< std::pair< Quantile, double > > QuantileUpVals
void SampleValues(std::size_t idx, const std::vector< const ana::IFitVar * > &vars, std::map< const ana::IFitVar *, double > &varVals) const
Get the values of FitVars vars for sample number idx.
MarginalMode
How to construct marginal distributions.
std::unique_ptr< TMVA::DataLoader > fdataLoader
Needed for kNN.
correl_yv Fill(-(dy[iP-1][iC-1]), hyv->GetBinContent(iP, iC))
static double ThresholdFromTH1(const TH1 *pdf, double quantile)
Determine the threshold for the 'histogram' mode from the given histogram with a brute force method...
Quantile
Type to specify quantile values, so that we don't run into floating-point matching issues...
std::vector< const ISyst * > Systs() const
unsigned int quantile(std::vector< std::vector< double >> v, float E, float hadEfrac)
const std::vector< const ana::ISyst * > & Systs() const
Which Systs are sampled in these samples?
MarginalMode fMode
What mode are we using?
TH2D * MakeTH2D(const char *name, const char *title, const Binning &binsx, const Binning &binsy)
Utility function to avoid 4-way combinatorial explosion on the bin types.
double SampleLL(std::size_t idx) const
Get the LL for sample number .
const MCMCSamples * fMCMCSamples
Ref to the MCMCSamples. Will not be persisted.
static const std::string sKNNName
The result of marginalizing over a set of Bayesian MCMC samples down to a few dimensions.
static const std::string LOGLIKELIHOOD_BRANCH_NAME
std::map< std::string, std::string > systNames
assert(nhit_max >=nhit_nbins)
Interface definition for fittable variables.
double QuantileThreshold(Quantile quantile, const TH1 *pdf=nullptr) const
std::vector< const IFitVar * > fVars
Variables we want to marginalize to (see also fSysts)
std::unique_ptr< TH1 > ToHistogram(const std::vector< Binning > &bins) const
simulatedPeak Scale(1/simulationNormalisationFactor)
std::string UniqueName()
Return a different string each time, for creating histograms.