Prepare for higher dimensional observables #969

Merged (10 commits), Feb 7, 2025
55 changes: 30 additions & 25 deletions machine_learning_hep/analysis/analyzer_jets.py
@@ -74,8 +74,7 @@ def __init__(self, datap, case, typean, period):

self.observables = {
'qa': ['zg', 'rg', 'nsd', 'zpar', 'dr', 'lntheta', 'lnkt', 'lntheta-lnkt'],
'all': [var for var, spec in self.cfg('observables', {}).items()
if '-' not in var and 'arraycols' not in spec],
'all': [*self.cfg('observables', {})],
}

self.bins_candpt = np.asarray(self.cfg('sel_an_binmin', []) + self.cfg('sel_an_binmax', [])[-1:], 'd')
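As a side note, a minimal sketch (with a purely illustrative config) of what the simplified 'all' list now yields compared to the old filtered comprehension:

```python
# Illustrative observables config; keys and fields are made up for this example.
observables_cfg = {
    "zg":           {"bins_gen_fix": [10, 0.0, 0.5]},
    "lntheta-lnkt": {"arraycols": [3, 4]},
}

# Old behaviour: multi-variable names ('-') and array-valued observables
# ('arraycols') were filtered out.
all_old = [var for var, spec in observables_cfg.items()
           if '-' not in var and 'arraycols' not in spec]  # -> ['zg']

# New behaviour: every configured observable is kept, so multi-dimensional
# observables now flow through the analysis.
all_new = [*observables_cfg]                               # -> ['zg', 'lntheta-lnkt']
```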
@@ -742,7 +741,7 @@ def _analyze(self, method = 'sidesub'):
self.logger.info("Signal extraction (method %s): obs. %s, %s, ipt %d",
method, var, mcordata, ipt)
if not self.cfg('hfjet', True):
h = project_hist(h_in, axes_proj[1:], {})
h = project_hist(h_in, list(range(1, get_dim(h_in))), {})
elif method == 'sidesub':
h = self._subtract_sideband(h_in, var, mcordata, ipt)
elif method == 'sigextr':
@@ -751,8 +750,10 @@
self.logger.critical('invalid method %s', method)
self._save_hist(h, f'h_ptjet{label}_{method}_noeff_{mcordata}_pt{ipt}.png')
if mcordata == 'mc':
h_proj = project_hist(h_in, axes_proj[1:], {})
h_proj_lim = project_hist(h_in, axes_proj[1:], {0: (1, get_nbins(h_in, 0))})
self.logger.info('projecting %s onto axes: %s', h_in, axes_proj[1:])
h_proj = project_hist(h_in, list(range(1, get_dim(h_in))), {})
h_proj_lim = project_hist(h_in, list(range(1, get_dim(h_in))),
{0: (1, get_nbins(h_in, 0))})
self._save_hist(h_proj, f'h_ptjet{label}_proj_noeff_{mcordata}_pt{ipt}.png')
if h and h_proj:
self.logger.debug('signal loss %s-%i: %g, fraction in under-/overflow: %g',
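The recurring change in these hunks is to stop hard-coding the projection axes and instead project onto every axis above axis 0, whatever the dimensionality. A rough sketch of the pattern, assuming the repository's helpers behave as they are used in this diff:

```python
# Sketch only; project_hist, get_dim and get_nbins are the repository's
# histogram helpers, used here exactly as in the hunks above.
def project_all_but_first(h_in):
    """Project onto every axis except axis 0, for any dimensionality."""
    axes = list(range(1, get_dim(h_in)))   # e.g. [1, 2] for a 3D histogram
    return project_hist(h_in, axes, {})

def project_all_but_first_inrange(h_in):
    """Same projection, but with axis 0 restricted to its in-range bins
    (no under-/overflow), as used for the signal-loss comparison."""
    axes = list(range(1, get_dim(h_in)))
    return project_hist(h_in, axes, {0: (1, get_nbins(h_in, 0))})
```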
@@ -793,7 +794,7 @@ def _analyze(self, method = 'sidesub'):
self._clip_neg(fh_sum_fdsub)
self._save_hist(fh_sum_fdsub, f'h_ptjet{label}_{method}_{mcordata}.png')

if get_dim(fh_sum) > 1:
if get_dim(fh_sum) == 2:
axes = list(range(get_dim(fh_sum)))
axis_ptjet = get_axis(fh_sum, 0)
for iptjet in range(get_nbins(fh_sum, 0)):
@@ -823,8 +824,7 @@ def _analyze(self, method = 'sidesub'):
continue
axis_ptjet = get_axis(fh_sum_fdsub, 0)
for j in range(get_nbins(fh_sum_fdsub, 0)):
# TODO: generalize to higher dimensions
hproj = project_hist(fh_sum_fdsub, [1], {0: [j+1, j+1]})
hproj = project_hist(fh_sum_fdsub, list(range(1, get_dim(fh_sum_fdsub))), {0: [j+1, j+1]})
range_ptjet = get_bin_limits(axis_ptjet, j + 1)
self._save_hist(
hproj, f'uf/h_{var}_{method}_{mcordata}_{string_range_ptjet(range_ptjet)}.png')
@@ -837,7 +837,7 @@ def _analyze(self, method = 'sidesub'):
range_ptjet = get_bin_limits(axis_ptjet, j + 1)
c = TCanvas()
for i, h in enumerate(fh_unfolded):
hproj = project_hist(h, [1], {0: (j+1, j+1)})
hproj = project_hist(h, list(range(1, get_dim(h))), {0: (j+1, j+1)})
empty = hproj.Integral() < 1.e-7
if empty and i == 0:
self.logger.error("Projection %s %s %s is empty.", var, mcordata,
@@ -851,7 +851,7 @@ def _analyze(self, method = 'sidesub'):
self._save_hist(
hproj,
f'uf/h_{var}_{method}_unfolded_{mcordata}_' +
f'{string_range_ptjet(range_ptjet)}_sel.png')
f'{string_range_ptjet(range_ptjet)}_sel.png', "colz")
# Save also the self-normalised version.
if not empty:
hproj_sel = hproj.Clone(f"{hproj.GetName()}_selfnorm")
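The "self-normalised version" mentioned here is cut off by the hunk boundary; purely as an illustration (not the PR's actual code, which lies outside the shown context), a typical ROOT self-normalisation of such a projection looks like:

```python
# Illustration only - the real continuation is outside the shown diff context.
hproj_sel = hproj.Clone(f"{hproj.GetName()}_selfnorm")
integral = hproj_sel.Integral()
if integral > 0.:
    hproj_sel.Scale(1. / integral)   # normalise to unit integral
```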
@@ -939,6 +939,7 @@ def estimate_feeddown(self):
df = pd.read_parquet(self.cfg('fd_parquet'))
col_mapping = {'dr': 'delta_r_jet', 'zpar': 'z'} # TODO: check mapping

# TODO: generalize to higher dimensions
for var in self.observables['all']:
bins_ptjet = np.asarray(self.cfg('bins_ptjet'), 'd')
# TODO: generalize or derive from histogram?
@@ -960,6 +961,7 @@
if f'{colname}' not in df:
if var is not None:
self.logger.error('No feeddown information for %s (%s), cannot estimate feeddown', var, colname)
print(df.info(), flush=True)
continue

# TODO: derive histogram
@@ -990,6 +992,10 @@ def estimate_feeddown(self):
rfile.Get(f'h_effkine_fd_det_nocuts_{var}'),
rfile.Get(f'h_effkine_fd_det_cut_{var}'))
h_response = rfile.Get(f'h_response_fd_{var}')
if not h_response:
self.logger.error("Could not find response matrix for fd estimation of %s", var)
rfile.ls()
continue
h_response_norm = norm_response(h_response, 3)
h3_fd_gen.Multiply(h_effkine_gen)
self._save_hist(project_hist(h3_fd_gen, [0, 2], {}), f'fd/h_ptjet-{var}_fdnew_gen_genkine.png')
@@ -1086,27 +1092,26 @@ def _build_effkine(self, h_nocuts, h_cuts):


def _build_response_matrix(self, h_response, h_eff = None, frac_flat = 0.):
dim = (get_dim(h_response) - 1) // 2
self.logger.info("Building %i-dim response matrix from %s", dim, h_response)
rm = ROOT.RooUnfoldResponse(
project_hist(h_response, [0, 1], {}), project_hist(h_response, [2, 3], {}))
h_gen = project_hist(h_response, [2, 3], {})
for hbin in itertools.product(
enumerate(list(get_axis(h_response, 0).GetXbins())[:-1], 1),
enumerate(list(get_axis(h_response, 1).GetXbins())[:-1], 1),
enumerate(list(get_axis(h_response, 2).GetXbins())[:-1], 1),
enumerate(list(get_axis(h_response, 3).GetXbins())[:-1], 1),
enumerate(list(get_axis(h_response, 4).GetXbins())[:-1], 1)):
project_hist(h_response, list(range(dim)), {}), project_hist(h_response, list(range(dim, 2 * dim)), {}))
h_gen = project_hist(h_response, list(range(dim, 2 * dim)), {})

x = (enumerate(list(get_axis(h_response, iaxis).GetXbins())[:-1], 1) for iaxis in range(2*dim+1))
for hbin in itertools.product(*x):
n = h_response.GetBinContent(
np.asarray([hbin[0][0], hbin[1][0], hbin[2][0], hbin[3][0], hbin[4][0]], 'i'))
eff = h_eff.GetBinContent(hbin[4][0]) if h_eff else 1.
np.asarray([hbin[i][0] for i in range(2*dim+1)], 'i'))
eff = h_eff.GetBinContent(hbin[2*dim][0]) if h_eff else 1.
if np.isclose(eff, 0.):
self.logger.error('efficiency 0 for %s', hbin[4])
continue
if (cnt_gen := h_gen.GetBinContent(hbin[2][0], hbin[3][0])) > 0.:
if (cnt_gen := h_gen.GetBinContent(*(hbin[i][0] for i in range(dim, 2*dim)))) > 0.:
fac = 1.
if frac_flat > 0.:
fac += frac_flat * (1. / cnt_gen - 1.)
for _ in range(int(n)):
rm.Fill(hbin[0][1], hbin[1][1], hbin[2][1], hbin[3][1], 1./eff * fac)
rm.Fill(*(hbin[iaxis][1] for iaxis in range(2*dim)), 1./eff * fac)
# rm.Mresponse().Print()
return rm
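To make the index arithmetic above easier to follow, here is a hedged walk-through of the assumed axis layout (consistent with how the indices are used in the code, e.g. the efficiency lookup on the last axis):

```python
# Illustrative axis bookkeeping for a 5-axis response histogram, e.g.
# (ptjet_det, shape_det, ptjet_gen, shape_gen, pt_cand); the concrete axis
# meanings are an assumption, only the index arithmetic is taken from the code.
ndim_total = 5
dim = (ndim_total - 1) // 2            # -> 2 observable axes per level
det_axes = list(range(dim))            # -> [0, 1]
gen_axes = list(range(dim, 2 * dim))   # -> [2, 3]
eff_axis = 2 * dim                     # -> 4, bin index passed to h_eff.GetBinContent

# Flat-prior mixing: a generator-level bin with content cnt_gen is filled with
# weight (1/eff) * fac, where fac = 1 + frac_flat * (1/cnt_gen - 1), i.e.
# fac = 1 for frac_flat = 0 (pure MC prior) and fac = 1/cnt_gen for
# frac_flat = 1 (every generator bin weighted equally).
frac_flat, cnt_gen = 0.3, 50.0
fac = 1. + frac_flat * (1. / cnt_gen - 1.)
```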

@@ -1127,7 +1132,7 @@ def _subtract_feeddown(self, hist, var, mcordata):

#region unfolding
def _unfold(self, hist, var, mcordata):
self.logger.debug('Unfolding for %s', var)
self.logger.info('Unfolding for %s', var)
suffix = '_frac' if mcordata == 'mc' else ''
with TFile(self.n_fileeff) as rfile:
h_response = rfile.Get(f'h_response_pr_{var}{suffix}')
@@ -1158,7 +1163,7 @@ def _unfold(self, hist, var, mcordata):
self._save_hist(h_effkine_gen, f'uf/h_effkine-ptjet-{var}_pr_gen_{mcordata}.png', 'text')

# TODO: move, has nothing to do with unfolding
if mcordata == 'mc':
if mcordata == 'mc' and get_dim(hist) <= 2:
h_mctruth_pr = rfile.Get(f'h_ptjet-pthf-{var}_pr_gen')
if h_mctruth_pr:
h_mctruth_pr = project_hist(h_mctruth_pr, [0, 2], {})
@@ -1181,7 +1186,7 @@ def _unfold(self, hist, var, mcordata):
self._save_hist(fh_unfolding_output, f'uf/h_ptjet-{var}_{mcordata}_unfoldeffcorr{n}.png', 'texte')
h_unfolding_output.append(fh_unfolding_output)

if mcordata == 'mc':
if mcordata == 'mc' and get_dim(hist) <= 2:
if h_mctruth_pr:
h_mcunfolded = fh_unfolding_output.Clone()
h_mcunfolded.Divide(h_mctruth_pr)
@@ -509,7 +509,7 @@ D0Jet_pp:
multi:
data:
nprocessesparallel: 80
maxfiles: [-1] #list of periods
maxfiles: [1] #list of periods
chunksizeunp: [100] #list of periods
chunksizeskim: [100] #list of periods
fracmerge: [.1] #list of periods
@@ -532,7 +532,7 @@ D0Jet_pp:
mcreweights: [../Analyses] #list of periods
mc:
nprocessesparallel: 80
maxfiles: [-1] #list of periods
maxfiles: [1] #list of periods
chunksizeunp: [100] #list of periods
chunksizeskim: [1000] #list of periods
fracmerge: [1.] #list of periods
@@ -721,17 +721,39 @@ D0Jet_pp:
bins_det_fix: [10, 0., 1.]
label: "#Delta#it{r}"
lntheta:
bins_gen_fix: [10, 0., 5.]
bins_det_fix: [10, 0., 5.]
bins_gen_fix: [8, 1., 5.]
bins_det_fix: [8, 1., 5.]
label: "#minusln(#it{#theta})"
arraycols: [3]
lnkt:
bins_gen_fix: [10, -8., 2.]
bins_det_fix: [10, -8., 2.]
bins_gen_fix: [8, -4., 4.]
bins_det_fix: [8, -4., 4.]
label: "ln(#it{k}_{T}/(GeV/#it{c}))"
arraycols: [3]
lntheta-lnkt:
arraycols: [3, 4]
# new variables
fEnergyMother:
bins_gen_fix: [1, 0., 100.]
bins_det_fix: [1, 0., 100.]
arraycols: [3]
# lntheta-lnkt-fEnergyMother:
# arraycols: [3, 4, 5]
fJetNConstituents:
bins_gen_fix: [5, 0., 20.]
bins_det_fix: [5, 0., 20.]
zpar-fJetNConstituents: {}
nsub21:
# TODO: check for 1-track jets
bins_gen_fix: [11, -1., 1.]
bins_det_fix: [11, -1., 1.]
eecweight:
# TODO: adjust binning
bins_gen_fix: [10, 0., 1.]
bins_det_fix: [10, 0., 1.]
arraycols: [3]
fPairTheta-eecweight:
arraycols: [3, 4]

data_selections:
mcsig:
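For orientation, a hypothetical reading of the new entries above: every key, including combined ones like zpar-fJetNConstituents, now ends up in the analyzer's observables['all'] list, and the bins_gen_fix/bins_det_fix triples appear to follow a [nbins, low, high] convention (an assumption based on the values used here):

```python
# Hypothetical helper, not part of the repository: expand a [nbins, low, high]
# triple into explicit bin edges.
import numpy as np

def fixed_binning(spec):
    nbins, low, high = spec
    return np.linspace(low, high, int(nbins) + 1)

fixed_binning([8, 1., 5.])   # lntheta -> array([1. , 1.5, 2. , ..., 5. ])
fixed_binning([5, 0., 20.])  # fJetNConstituents -> array([0., 4., 8., 12., 16., 20.])
```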
6 changes: 5 additions & 1 deletion machine_learning_hep/processer.py
@@ -21,8 +21,10 @@
import re
import sys
import tempfile
import traceback
from copy import deepcopy
from functools import reduce
from typing import TypeVar
from pandas.api.types import is_numeric_dtype

import numpy as np
@@ -288,7 +290,8 @@ def __init__(self, case, datap, run_param, mcordata, p_maxfiles, # pylint: disab
# Flag if they should be used
self.do_custom_analysis_cuts = datap["analysis"][self.typean].get("use_cuts", False)

def cfg(self, param, default = None):
T = TypeVar("T")
def cfg(self, param: str, default: T = None) -> T:
return reduce(lambda d, key: d.get(key, default) if isinstance(d, dict) else default,
param.split("."), self.datap['analysis'][self.typean])
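
A quick usage sketch of the newly annotated helper: cfg walks a dotted path through the analysis section of the database and falls back to the default whenever a node is missing or not a dict. The keys below are illustrative, not guaranteed to exist in every database file.

```python
# Illustrative only; 'proc' stands for a Processer instance.
nbins_ptjet = proc.cfg("bins_ptjet", [])                  # list from the config, or []
binning_zg = proc.cfg("observables.zg.bins_gen_fix", [])  # nested lookup via dots
use_cuts = proc.cfg("use_cuts", False)                    # plain top-level key
```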

Expand Down Expand Up @@ -492,6 +495,7 @@ def applymodel(self, file_index):
@staticmethod
def callback(ex):
get_logger().exception('Error callback: %s', ex)
traceback.print_stack()
raise ex

def parallelizer(self, function, argument_list, maxperchunk):
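For context, a hedged sketch of how an error callback like this is typically passed to a multiprocessing pool; the worker function and names below are illustrative, not taken from the repository:

```python
import multiprocessing as mp
import traceback

def work(x):
    if x == 2:
        raise ValueError(f"bad input: {x}")
    return x * x

def on_error(ex):
    # Mirrors the idea of the change above: besides reporting the worker's
    # exception, print the local stack in the parent process to aid debugging
    # (the repository's callback also re-raises the exception).
    traceback.print_stack()
    print(f"worker failed: {ex!r}")

if __name__ == "__main__":
    with mp.Pool(processes=2) as pool:
        for i in range(5):
            pool.apply_async(work, (i,), error_callback=on_error)
        pool.close()
        pool.join()
```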