-
Notifications
You must be signed in to change notification settings - Fork 36
/
Copy pathfit_MVA_dists.py
122 lines (94 loc) · 3.55 KB
/
fit_MVA_dists.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import sys, os, argparse
import json
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import joblib
from collections import OrderedDict
import uproot
import pickle
from decay_mode_xs import modes as bkg_modes
from scipy import interpolate
#Local code
from userConfig import loc, train_vars, train_vars_vtx, Ediff_cut
import plotting
import utils as ut
from matplotlib import rc
# Plot styling: serif font, with all text rendered through LaTeX
font_settings = {'family': 'serif', 'serif': ['Roman']}
rc('font', **font_settings)
rc('text', usetex=True)

# Fetch optimal cuts for NZ = 5e12 to show on plot
with open(f'{loc.JSON}/optimal_yields_5.json') as f:
    yields = json.load(f)

path = loc.ANALYSIS
file_prefix = "p8_ee_Zbb_ecm91_EvtGen"

# Background decays: one entry for every key b of bkg_modes and every d in
# bkg_modes[b], keyed "<b>_<d>" and holding the sample file stem
# "<file_prefix>_<b>2<d>"
modes = OrderedDict(
    (f"{b}_{d}", {"name": f"{file_prefix}_{b}2{d}"})
    for b in bkg_modes
    for d in bkg_modes[b]
)

# Cuts: minimum score required from each of the two MVAs
bdt = {"MVA1": 0.95, "MVA2": 0.95}
# Selection string passed to DataFrame.query.
# NOTE(review): Ediff_cut is interpolated directly after the column name, so it
# presumably carries its own comparison operator and value - confirm in userConfig
cut = f"EVT_ThrustDiff_E {Ediff_cut} and EVT_MVA1 > {bdt['MVA1']} and EVT_MVA2 > {bdt['MVA2']}"
# Load dataframes for each mode
tree = {}
df = {}
for m in modes:
    # Load samples with MVA > 0.95 cuts applied
    tree[m] = uproot.open(f"{path}/{modes[m]['name']}_sel2.root")["events"]
    df[m] = tree[m].arrays(library="pd", how="zip", filter_name=["EVT_*"])
    # Apply Ediff cut, BDT1 cut, and BDT2 cut. Take an explicit copy so the
    # columns added below are written to an independent frame and not to a
    # slice of the unfiltered one (avoids pandas' SettingWithCopyWarning).
    df[m] = df[m].query(cut).copy()
    print(f"Decay mode {m} stats: {len(df[m])}")
    # Log transform the BDT scores: -log(1 - MVA) stretches out the region
    # close to 1
    df[m]["log_EVT_MVA1"] = -np.log(1. - df[m]["EVT_MVA1"])
    df[m]["log_EVT_MVA2"] = -np.log(1. - df[m]["EVT_MVA2"])

# Make a summed sample of all modes to compare to. Built with one pd.concat
# call: DataFrame.append no longer exists in pandas >= 2.0, and appending in
# the loop re-copied the accumulated frame on every iteration. pd.concat
# raises on an empty list, so fall back to an empty frame when there are no
# modes.
df_all = pd.concat([df[m] for m in modes]) if df else pd.DataFrame()
# Fit the -log(1 - MVA) distributions above 0.95, to use in efficiency evaluation above a certain cut.
# For each MVA: histogram the summed sample, fit a weighted smoothing cubic
# spline to the normalised bin contents, plot it, and pickle the spline.
bins = 100
n_curve_points = 1000
for x in bdt:
    fig, ax = plt.subplots(figsize=(8, 8))

    # Fitted range in -log(1 - MVA): from the baseline cut up to a
    # hard-coded, per-MVA upper edge
    xmax = 10.7 if x == "MVA1" else 7.
    xmin = -np.log(1. - bdt[x])

    counts, bin_edges = np.histogram(df_all[f"log_EVT_{x}"], bins, range=(xmin, xmax))
    bin_centres = (bin_edges[:-1] + bin_edges[1:]) / 2.
    # Poisson error per bin
    err = np.sqrt(counts)
    # Normalise to unit area (in bin contents); errors scale by the same factor
    total = np.sum(counts)
    err = err / total
    counts = counts / total
    plt.errorbar(bin_centres, counts, yerr=err, fmt='o', color='k', markersize=2)

    # Weight each bin by 1/error. An empty bin has zero error and would get
    # an infinite weight, corrupting the fit, so only populated bins are used.
    # When every bin is populated this is the same fit as using all bins.
    populated = err > 0
    if np.count_nonzero(populated) < 4:
        # A cubic spline needs more data points than its degree (3)
        raise ValueError(
            f"Too few populated bins to fit a cubic spline to log_EVT_{x}"
        )
    weights = 1. / err[populated]
    # Cubic spline of the MVA distribution
    spline = interpolate.splrep(bin_centres[populated], counts[populated], w=weights)

    xvals = np.linspace(xmin, xmax, n_curve_points)
    spline_vals = interpolate.splev(xvals, spline)
    plt.plot(xvals, spline_vals, color="red")
    plt.xlabel(f"-log(1 - {x})", fontsize=30)
    plt.xlim(xmin, xmax)
    ax.tick_params(axis='both', which='major', labelsize=25)

    # Reference line at a fixed cut of 0.99
    cut_99_val = 0.99
    cut_99 = -np.log(1. - cut_99_val)
    plt.axvline(x=cut_99, color="dodgerblue", linestyle="--", label=f"BDT $>{cut_99_val}$")

    # Line at the optimal cut for this MVA, as read from the yields JSON
    cut_best_val = yields[x]
    cut_best = -np.log(1. - cut_best_val)
    plt.axvline(x=cut_best, color="orange", linestyle="--", label="Optimal BDT $>%.5f$" % cut_best_val)

    # Shade the spline above the optimal cut. The point count scales with the
    # width of the shaded interval (xmax - cut_best), so the shaded curve is
    # sampled at the same density as the full curve; at least 2 points are
    # kept so the call stays valid for any cut position.
    n_above = max(2, int(n_curve_points * (xmax - cut_best) / (xmax - xmin)))
    x_vals_above_cut = np.linspace(cut_best, xmax, n_above)
    spline_vals_above_cut = interpolate.splev(x_vals_above_cut, spline)
    plt.fill_between(x_vals_above_cut, spline_vals_above_cut, color="crimson", alpha=0.2)

    plt.yscale('log')
    # Leave headroom above the data on the log axis
    ymin, ymax = plt.ylim()
    plt.ylim(ymin, 2 * ymax)
    plt.legend(fontsize=25, loc="lower left")
    plt.tight_layout()
    fig.savefig(f"{loc.PLOTS}/{x}_spline.pdf")
    # Release the figure so figures do not accumulate across iterations
    plt.close(fig)

    # Save the spline to pickle for use in background efficiency determination
    with open(f'{loc.PKL}/{x}_spline.pkl', 'wb') as f:
        pickle.dump(spline, f)