Merge pull request #50 from ucsd-hep-ex/bdt
Add a BDT for the boosted HHH baseline
billy000400 authored Jul 18, 2024
2 parents db329cd + 258cf03 commit cea4c5f
Showing 5 changed files with 237 additions and 66 deletions.
2 changes: 1 addition & 1 deletion notebooks/bdt/fj_bdt.ipynb → notebooks/bdt/hhh_bdt.ipynb
@@ -432,7 +432,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.18"
+   "version": "3.9.10"
   }
  },
  "nbformat": 4,
163 changes: 140 additions & 23 deletions notebooks/bdt/hhh_qcd_bdt.ipynb

Large diffs are not rendered by default.
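Since the notebook diff is not rendered, here is a rough, illustrative sketch only — not the notebook's actual contents — of what a boosted-HHH-vs-QCD BDT baseline along these lines typically does: train a gradient-boosted classifier on per-event jet features and check its ROC AUC. Every feature, toy dataset, and hyperparameter below is an assumption made for illustration.

# Illustrative sketch of a boosted-HHH vs. QCD BDT baseline; the features,
# toy dataset, and hyperparameters are placeholders, not the notebook's code.
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

rng = np.random.default_rng(0)

# Stand-ins for per-event fat-jet features (pT, soft-drop mass, tagger scores, ...).
X_sig = rng.normal(loc=0.5, size=(2000, 6))   # HHH-like events
X_qcd = rng.normal(loc=0.0, size=(8000, 6))   # QCD-like events
X = np.vstack([X_sig, X_qcd])
y = np.concatenate([np.ones(len(X_sig)), np.zeros(len(X_qcd))])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Fit the BDT and score the held-out events.
bdt = GradientBoostingClassifier(n_estimators=200, max_depth=3, learning_rate=0.1)
bdt.fit(X_train, y_train)

scores = bdt.predict_proba(X_test)[:, 1]
print("ROC AUC:", roc_auc_score(y_test, scores))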

120 changes: 87 additions & 33 deletions src/analysis/plot.py
@@ -26,47 +26,54 @@ def calc_pur_eff(target_path, pred_path, bins):
     for event in LUT_resolved_wOR_pred:
         event_no_OR = []
         for predH in event:
-            if predH[2]==0:
+            if predH[2] == 0:
                 event_no_OR.append(predH)
         LUT_resolved_pred_no_OR.append(event_no_OR)

     LUT_resolved_target_no_OR = []
     for event in LUT_resolved_wOR_target:
         event_no_OR = []
         for targetH in event:
-            if targetH[2]==0:
+            if targetH[2] == 0:
                 event_no_OR.append(targetH)
         LUT_resolved_target_no_OR.append(event_no_OR)

     # calculate efficiencies and purities for b+r, b, and r
     results = {}
-    results['pur_m'], results['purerr_m'] = calc_eff(LUT_boosted_pred, LUT_resolved_wOR_pred, bins)
-    results['eff_m'], results['efferr_m'] = calc_pur(LUT_boosted_target, LUT_resolved_wOR_target, bins)
+    results["pur_m"], results["purerr_m"] = calc_eff(LUT_boosted_pred, LUT_resolved_wOR_pred, bins)
+    results["eff_m"], results["efferr_m"] = calc_pur(LUT_boosted_target, LUT_resolved_wOR_target, bins)

-    results['pur_b'], results['purerr_b'] = calc_eff(LUT_boosted_pred, None, bins)
-    results['eff_b'], results['efferr_b'] = calc_pur(LUT_boosted_target, None, bins)
+    results["pur_b"], results["purerr_b"] = calc_eff(LUT_boosted_pred, None, bins)
+    results["eff_b"], results["efferr_b"] = calc_pur(LUT_boosted_target, None, bins)

-    results['pur_r'], results['purerr_r'] = calc_eff(None, LUT_resolved_pred, bins)
-    results['eff_r'], results['efferr_r'] = calc_pur(None, LUT_resolved_target, bins)
+    results["pur_r"], results["purerr_r"] = calc_eff(None, LUT_resolved_pred, bins)
+    results["eff_r"], results["efferr_r"] = calc_pur(None, LUT_resolved_target, bins)

-    results['pur_r_or'], results['purerr_r_or'] = calc_eff(None, LUT_resolved_pred_no_OR, bins)
-    results['eff_r_or'], results['efferr_r_or'] = calc_pur(None, LUT_resolved_target_no_OR, bins)
+    results["pur_r_or"], results["purerr_r_or"] = calc_eff(None, LUT_resolved_pred_no_OR, bins)
+    results["eff_r_or"], results["efferr_r_or"] = calc_pur(None, LUT_resolved_target_no_OR, bins)

-    print("Number of Boosted Prediction:", np.array([pred for event in LUT_boosted_pred for pred in event]).shape[0] )
-    print("Number of Resolved Prediction before OR:", np.array([pred for event in LUT_resolved_pred for pred in event]).shape[0] )
-    print("Number of Resolved Prediction after OR:", np.array([pred for event in LUT_resolved_pred_no_OR for pred in event]).shape[0] )
+    print("Number of Boosted Prediction:", np.array([pred for event in LUT_boosted_pred for pred in event]).shape[0])
+    print(
+        "Number of Resolved Prediction before OR:",
+        np.array([pred for event in LUT_resolved_pred for pred in event]).shape[0],
+    )
+    print(
+        "Number of Resolved Prediction after OR:",
+        np.array([pred for event in LUT_resolved_pred_no_OR for pred in event]).shape[0],
+    )

     return results


 # I started to use "efficiency" for describing how many gen Higgs were reconstructed
 # and "purity" for describing how many reco Higgs are actually gen Higgs
 def plot_pur_eff_w_dict(plot_dict, target_path, save_path=None, proj_name=None, bins=None):
     if bins == None:
         bins = np.arange(0, 1050, 50)

-    plot_bins = np.append(bins, 2*bins[-1]-bins[-2])
-    bin_centers = [(plot_bins[i]+plot_bins[i+1])/2 for i in range(plot_bins.size-1)]
-    xerr=(plot_bins[1]-plot_bins[0])/2*np.ones(plot_bins.shape[0]-1)
+    plot_bins = np.append(bins, 2 * bins[-1] - bins[-2])
+    bin_centers = [(plot_bins[i] + plot_bins[i + 1]) / 2 for i in range(plot_bins.size - 1)]
+    xerr = (plot_bins[1] - plot_bins[0]) / 2 * np.ones(plot_bins.shape[0] - 1)

     # m: merged (b+r w OR)
     # b: boosted
@@ -77,28 +84,75 @@ def plot_pur_eff_w_dict(plot_dict, target_path, save_path=None, proj_name=None,
     fig_r_or, ax_r_or = plt.subplots(1, 2, figsize=(12, 5))

     # preset figure labels, titles, limits, etc.
-    ax_m[0].set(xlabel=r"Merged Reco H pT (GeV)", ylabel=r"Reconstruction Purity", title=f"Reconstruction Purity vs. Merged Reco H pT")
-    ax_m[1].set(xlabel=r"Merged Gen H pT (GeV)", ylabel=r"Reconstruction Efficiency", title=f"Reconstruction Efficiency vs. Merged Gen H pT")
-    ax_b[0].set(xlabel=r"Reco Boosted H pT (GeV)", ylabel=r"Reconstruction Purity", title=f"Reconstruction Purity vs. Reco Boosted H pT")
-    ax_b[1].set(xlabel=r"Gen Boosted H pT (GeV)", ylabel=r"Reconstruction Efficiency", title=f"Reconstruction Efficiency vs. Gen Boosted H pT")
-    ax_r[0].set(xlabel=r"Reco Resolved H pT (GeV)", ylabel=r"Reconstruction Purity", title=f"Reconstruction Purity vs. Reco Resolved H pT")
-    ax_r[1].set(xlabel=r"Gen Resolved H pT (GeV)", ylabel=r"Reconstruction Efficiency", title=f"Reconstruction Efficiency vs. Gen Resolved H pT")
-    ax_r_or[0].set(xlabel=r"Reco Resolved H pT (GeV)", ylabel=r"Reconstruction Purity", title=f"Resolved Purity After OR vs. Reco Resolved H pT")
-    ax_r_or[1].set(xlabel=r"Gen Resolved H pT (GeV)", ylabel=r"Reconstruction Efficiency", title=f"Resolved Efficiency After OR vs. Gen Resolved H pT")
+    ax_m[0].set(
+        xlabel=r"Merged Reco H pT (GeV)",
+        ylabel=r"Reconstruction Purity",
+        title=f"Reconstruction Purity vs. Merged Reco H pT",
+    )
+    ax_m[1].set(
+        xlabel=r"Merged Gen H pT (GeV)",
+        ylabel=r"Reconstruction Efficiency",
+        title=f"Reconstruction Efficiency vs. Merged Gen H pT",
+    )
+    ax_b[0].set(
+        xlabel=r"Reco Boosted H pT (GeV)",
+        ylabel=r"Reconstruction Purity",
+        title=f"Reconstruction Purity vs. Reco Boosted H pT",
+    )
+    ax_b[1].set(
+        xlabel=r"Gen Boosted H pT (GeV)",
+        ylabel=r"Reconstruction Efficiency",
+        title=f"Reconstruction Efficiency vs. Gen Boosted H pT",
+    )
+    ax_r[0].set(
+        xlabel=r"Reco Resolved H pT (GeV)",
+        ylabel=r"Reconstruction Purity",
+        title=f"Reconstruction Purity vs. Reco Resolved H pT",
+    )
+    ax_r[1].set(
+        xlabel=r"Gen Resolved H pT (GeV)",
+        ylabel=r"Reconstruction Efficiency",
+        title=f"Reconstruction Efficiency vs. Gen Resolved H pT",
+    )
+    ax_r_or[0].set(
+        xlabel=r"Reco Resolved H pT (GeV)",
+        ylabel=r"Reconstruction Purity",
+        title=f"Resolved Purity After OR vs. Reco Resolved H pT",
+    )
+    ax_r_or[1].set(
+        xlabel=r"Gen Resolved H pT (GeV)",
+        ylabel=r"Reconstruction Efficiency",
+        title=f"Resolved Efficiency After OR vs. Gen Resolved H pT",
+    )

     # plot purities and efficiencies
     for tag, pred_path in plot_dict.items():
         print("Processing", tag)
         results = calc_pur_eff(target_path, pred_path, bins)
-        ax_m[0].errorbar(x=bin_centers, y=results['pur_m'], xerr=xerr, yerr=results['purerr_m'], fmt='o', capsize=5, label=tag)
-        ax_m[1].errorbar(x=bin_centers, y=results['eff_m'], xerr=xerr, yerr=results['efferr_m'], fmt='o', capsize=5, label=tag)
-        ax_b[0].errorbar(x=bin_centers, y=results['pur_b'], xerr=xerr, yerr=results['purerr_b'], fmt='o', capsize=5, label=tag)
-        ax_b[1].errorbar(x=bin_centers, y=results['eff_b'], xerr=xerr, yerr=results['efferr_b'], fmt='o', capsize=5, label=tag)
-        ax_r[0].errorbar(x=bin_centers, y=results['pur_r'], xerr=xerr, yerr=results['purerr_r'], fmt='o', capsize=5, label=tag)
-        ax_r[1].errorbar(x=bin_centers, y=results['eff_r'], xerr=xerr, yerr=results['efferr_r'], fmt='o', capsize=5, label=tag)
-        ax_r_or[0].errorbar(x=bin_centers, y=results['pur_r_or'], xerr=xerr, yerr=results['purerr_r_or'], fmt='o', capsize=5, label=tag)
-        ax_r_or[1].errorbar(x=bin_centers, y=results['eff_r_or'], xerr=xerr, yerr=results['efferr_r_or'], fmt='o', capsize=5, label=tag)
+        ax_m[0].errorbar(
+            x=bin_centers, y=results["pur_m"], xerr=xerr, yerr=results["purerr_m"], fmt="o", capsize=5, label=tag
+        )
+        ax_m[1].errorbar(
+            x=bin_centers, y=results["eff_m"], xerr=xerr, yerr=results["efferr_m"], fmt="o", capsize=5, label=tag
+        )
+        ax_b[0].errorbar(
+            x=bin_centers, y=results["pur_b"], xerr=xerr, yerr=results["purerr_b"], fmt="o", capsize=5, label=tag
+        )
+        ax_b[1].errorbar(
+            x=bin_centers, y=results["eff_b"], xerr=xerr, yerr=results["efferr_b"], fmt="o", capsize=5, label=tag
+        )
+        ax_r[0].errorbar(
+            x=bin_centers, y=results["pur_r"], xerr=xerr, yerr=results["purerr_r"], fmt="o", capsize=5, label=tag
+        )
+        ax_r[1].errorbar(
+            x=bin_centers, y=results["eff_r"], xerr=xerr, yerr=results["efferr_r"], fmt="o", capsize=5, label=tag
+        )
+        ax_r_or[0].errorbar(
+            x=bin_centers, y=results["pur_r_or"], xerr=xerr, yerr=results["purerr_r_or"], fmt="o", capsize=5, label=tag
+        )
+        ax_r_or[1].errorbar(
+            x=bin_centers, y=results["eff_r_or"], xerr=xerr, yerr=results["efferr_r_or"], fmt="o", capsize=5, label=tag
+        )

     # adjust limits and legends
     ax_m[0].legend()
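For context, plot_pur_eff_w_dict is driven by a dictionary mapping a legend label to a prediction file, as the loop above shows. A hypothetical call might look like the following; the import path, file names, and labels are placeholders rather than files shipped with this commit.

# Hypothetical usage of plot_pur_eff_w_dict; the module path, file names, and
# labels are placeholders, not assets from this repository.
import numpy as np

from analysis.plot import plot_pur_eff_w_dict  # assumed import path under src/

plot_dict = {
    "SPANet": "predictions/spanet_hhh.h5",          # placeholder prediction file
    "Baseline": "predictions/mix_baseline_hhh.h5",  # placeholder prediction file
}

plot_pur_eff_w_dict(
    plot_dict,
    target_path="targets/hhh_test.h5",  # placeholder target file
    save_path="plots/pur_eff",
    proj_name="boosted_hhh_baseline",
    bins=np.arange(0, 1050, 50),  # same binning as the function's default
)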
8 changes: 4 additions & 4 deletions src/analysis/utils.py
@@ -80,8 +80,8 @@ def calc_eff(LUT_boosted_pred, LUT_resolved_pred, bins):
     predHs_inds = np.digitize(predHs[:, 1], bins)

     correctTruth_per_bin = []
-    for bin_i in range(1, len(bins)+1):
-        correctTruth_per_bin.append(predHs[:,0][predHs_inds==bin_i])
+    for bin_i in range(1, len(bins) + 1):
+        correctTruth_per_bin.append(predHs[:, 0][predHs_inds == bin_i])
     correctTruth_per_bin = ak.Array(correctTruth_per_bin)

     means = ak.mean(correctTruth_per_bin, axis=-1)
@@ -120,8 +120,8 @@ def calc_pur(LUT_boosted_target, LUT_resolved_target, bins):
     targetHs_inds = np.digitize(targetHs[:, 1], bins)

     correctTruth_per_bin = []
-    for bin_i in range(1, len(bins)+1):
-        correctTruth_per_bin.append(targetHs[:,0][targetHs_inds==bin_i])
+    for bin_i in range(1, len(bins) + 1):
+        correctTruth_per_bin.append(targetHs[:, 0][targetHs_inds == bin_i])
     correctTruth_per_bin = ak.Array(correctTruth_per_bin)

     means = ak.mean(correctTruth_per_bin, axis=-1)
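The pattern these two functions share — digitize each Higgs candidate by pT, gather its correct-match flag per bin, and take the per-bin mean — can be reproduced in isolation. The (flag, pT) pairs below are random toy values invented for illustration, not analysis data.

# Self-contained illustration of the binning pattern used by calc_eff/calc_pur:
# column 0 is the correct-match flag, column 1 is the Higgs pT; the per-bin mean
# of the flag gives the efficiency (or purity) in that pT bin.
import awkward as ak
import numpy as np

bins = np.arange(0, 1050, 50)
rng = np.random.default_rng(0)

pt = rng.uniform(0, 1000, size=500)                       # toy Higgs pT values
is_correct = (rng.uniform(size=500) < 0.8).astype(float)  # toy correct-match flags
Hs = np.stack([is_correct, pt], axis=1)                   # same layout as predHs/targetHs

inds = np.digitize(Hs[:, 1], bins)

correct_per_bin = []
for bin_i in range(1, len(bins) + 1):
    correct_per_bin.append(Hs[:, 0][inds == bin_i])
correct_per_bin = ak.Array(correct_per_bin)

means = ak.mean(correct_per_bin, axis=-1)  # per-bin efficiency/purity
counts = ak.num(correct_per_bin)           # entries per bin, useful for an error estimate
print(means)
print(counts)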
10 changes: 5 additions & 5 deletions src/models/mix_baseline.py
@@ -108,12 +108,12 @@ def main(test_file, pred_file, n_higgs):
     # Remove Overlap jets

     # find ak4jets that matched to selected ak8jets (dR check)
-    matched_fj_idx = match_fjet_to_jet(fjs_selected, js, ak.ArrayBuilder()).snapshot()
+    # matched_fj_idx = match_fjet_to_jet(fjs_selected, js, ak.ArrayBuilder()).snapshot()

-    # remove overlapped ak4jets and padded jets
-    unoverlapped = matched_fj_idx == -1
+    # remove padded jets
     not_padded = js["mask"]
-    j_cond = unoverlapped & not_padded
+    j_cond = not_padded

     js_selected = js[j_cond]

     # Reconstruct resolved higgs
@@ -128,7 +128,7 @@
     # all combinations of input jets
     # for different numbers of resolved higgs and jets
     JET_ASSIGNMENTS = {}
-    for nH in range(0, n_higgs+1):
+    for nH in range(0, n_higgs + 1):
         JET_ASSIGNMENTS[nH] = {}
         for nj in range(0, nH * 2):
             JET_ASSIGNMENTS[nH][nj] = []
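The net effect of the first hunk is that AK4 jets overlapping a selected AK8 jet are no longer dropped before resolved-Higgs reconstruction; only the padding mask is applied. A toy illustration with made-up arrays (not data from this analysis) shows how the selection changes:

# Toy illustration of the jet-selection change in mix_baseline.py.
import awkward as ak

matched_fj_idx = ak.Array([[0, -1, -1, -1], [-1, 1, -1, -1]])            # -1 = not matched to any selected AK8 jet
mask = ak.Array([[True, True, True, False], [True, True, False, False]])  # False = padded jet

unoverlapped = matched_fj_idx == -1
j_cond_old = unoverlapped & mask  # pre-commit: overlap removal + padding mask
j_cond_new = mask                 # post-commit: padding mask only

print(ak.sum(j_cond_old, axis=1))  # [2, 1] jets kept per event
print(ak.sum(j_cond_new, axis=1))  # [3, 2] jets kept per event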
