diff --git a/machine_learning_hep/analysis/analyzer_jets.py b/machine_learning_hep/analysis/analyzer_jets.py index 266b2c319a..9918750510 100644 --- a/machine_learning_hep/analysis/analyzer_jets.py +++ b/machine_learning_hep/analysis/analyzer_jets.py @@ -74,8 +74,7 @@ def __init__(self, datap, case, typean, period): self.observables = { 'qa': ['zg', 'rg', 'nsd', 'zpar', 'dr', 'lntheta', 'lnkt', 'lntheta-lnkt'], - 'all': [var for var, spec in self.cfg('observables', {}).items() - if '-' not in var and 'arraycols' not in spec], + 'all': [*self.cfg('observables', {})], } self.bins_candpt = np.asarray(self.cfg('sel_an_binmin', []) + self.cfg('sel_an_binmax', [])[-1:], 'd') @@ -742,7 +741,7 @@ def _analyze(self, method = 'sidesub'): self.logger.info("Signal extraction (method %s): obs. %s, %s, ipt %d", method, var, mcordata, ipt) if not self.cfg('hfjet', True): - h = project_hist(h_in, axes_proj[1:], {}) + h = project_hist(h_in, list(range(1, get_dim(h_in))), {}) elif method == 'sidesub': h = self._subtract_sideband(h_in, var, mcordata, ipt) elif method == 'sigextr': @@ -751,8 +750,10 @@ def _analyze(self, method = 'sidesub'): self.logger.critical('invalid method %s', method) self._save_hist(h, f'h_ptjet{label}_{method}_noeff_{mcordata}_pt{ipt}.png') if mcordata == 'mc': - h_proj = project_hist(h_in, axes_proj[1:], {}) - h_proj_lim = project_hist(h_in, axes_proj[1:], {0: (1, get_nbins(h_in, 0))}) + self.logger.info('projecting %s onto axes: %s', h_in, axes_proj[1:]) + h_proj = project_hist(h_in, list(range(1, get_dim(h_in))), {}) + h_proj_lim = project_hist(h_in, list(range(1, get_dim(h_in))), + {0: (1, get_nbins(h_in, 0))}) self._save_hist(h_proj, f'h_ptjet{label}_proj_noeff_{mcordata}_pt{ipt}.png') if h and h_proj: self.logger.debug('signal loss %s-%i: %g, fraction in under-/overflow: %g', @@ -793,7 +794,7 @@ def _analyze(self, method = 'sidesub'): self._clip_neg(fh_sum_fdsub) self._save_hist(fh_sum_fdsub, f'h_ptjet{label}_{method}_{mcordata}.png') - if get_dim(fh_sum) > 1: + if get_dim(fh_sum) == 2: axes = list(range(get_dim(fh_sum))) axis_ptjet = get_axis(fh_sum, 0) for iptjet in range(get_nbins(fh_sum, 0)): @@ -823,8 +824,7 @@ def _analyze(self, method = 'sidesub'): continue axis_ptjet = get_axis(fh_sum_fdsub, 0) for j in range(get_nbins(fh_sum_fdsub, 0)): - # TODO: generalize to higher dimensions - hproj = project_hist(fh_sum_fdsub, [1], {0: [j+1, j+1]}) + hproj = project_hist(fh_sum_fdsub, list(range(1, get_dim(fh_sum_fdsub))), {0: [j+1, j+1]}) range_ptjet = get_bin_limits(axis_ptjet, j + 1) self._save_hist( hproj, f'uf/h_{var}_{method}_{mcordata}_{string_range_ptjet(range_ptjet)}.png') @@ -837,7 +837,7 @@ def _analyze(self, method = 'sidesub'): range_ptjet = get_bin_limits(axis_ptjet, j + 1) c = TCanvas() for i, h in enumerate(fh_unfolded): - hproj = project_hist(h, [1], {0: (j+1, j+1)}) + hproj = project_hist(h, list(range(1, get_dim(h))), {0: (j+1, j+1)}) empty = hproj.Integral() < 1.e-7 if empty and i == 0: self.logger.error("Projection %s %s %s is empty.", var, mcordata, @@ -851,7 +851,7 @@ def _analyze(self, method = 'sidesub'): self._save_hist( hproj, f'uf/h_{var}_{method}_unfolded_{mcordata}_' + - f'{string_range_ptjet(range_ptjet)}_sel.png') + f'{string_range_ptjet(range_ptjet)}_sel.png', "colz") # Save also the self-normalised version. if not empty: hproj_sel = hproj.Clone(f"{hproj.GetName()}_selfnorm") @@ -939,6 +939,7 @@ def estimate_feeddown(self): df = pd.read_parquet(self.cfg('fd_parquet')) col_mapping = {'dr': 'delta_r_jet', 'zpar': 'z'} # TODO: check mapping + # TODO: generalize to higher dimensions for var in self.observables['all']: bins_ptjet = np.asarray(self.cfg('bins_ptjet'), 'd') # TODO: generalize or derive from histogram? @@ -960,6 +961,7 @@ def estimate_feeddown(self): if f'{colname}' not in df: if var is not None: self.logger.error('No feeddown information for %s (%s), cannot estimate feeddown', var, colname) + print(df.info(), flush=True) continue # TODO: derive histogram @@ -990,6 +992,10 @@ def estimate_feeddown(self): rfile.Get(f'h_effkine_fd_det_nocuts_{var}'), rfile.Get(f'h_effkine_fd_det_cut_{var}')) h_response = rfile.Get(f'h_response_fd_{var}') + if not h_response: + self.logger.error("Could not find response matrix for fd estimation of %s", var) + rfile.ls() + continue h_response_norm = norm_response(h_response, 3) h3_fd_gen.Multiply(h_effkine_gen) self._save_hist(project_hist(h3_fd_gen, [0, 2], {}), f'fd/h_ptjet-{var}_fdnew_gen_genkine.png') @@ -1086,27 +1092,26 @@ def _build_effkine(self, h_nocuts, h_cuts): def _build_response_matrix(self, h_response, h_eff = None, frac_flat = 0.): + dim = (get_dim(h_response) - 1) // 2 + self.logger.info("Building %i-dim response matrix from %s", dim, h_response) rm = ROOT.RooUnfoldResponse( - project_hist(h_response, [0, 1], {}), project_hist(h_response, [2, 3], {})) - h_gen = project_hist(h_response, [2, 3], {}) - for hbin in itertools.product( - enumerate(list(get_axis(h_response, 0).GetXbins())[:-1], 1), - enumerate(list(get_axis(h_response, 1).GetXbins())[:-1], 1), - enumerate(list(get_axis(h_response, 2).GetXbins())[:-1], 1), - enumerate(list(get_axis(h_response, 3).GetXbins())[:-1], 1), - enumerate(list(get_axis(h_response, 4).GetXbins())[:-1], 1)): + project_hist(h_response, list(range(dim)), {}), project_hist(h_response, list(range(dim, 2 * dim)), {})) + h_gen = project_hist(h_response, list(range(dim, 2 * dim)), {}) + + x = (enumerate(list(get_axis(h_response, iaxis).GetXbins())[:-1], 1) for iaxis in range(2*dim+1)) + for hbin in itertools.product(*x): n = h_response.GetBinContent( - np.asarray([hbin[0][0], hbin[1][0], hbin[2][0], hbin[3][0], hbin[4][0]], 'i')) - eff = h_eff.GetBinContent(hbin[4][0]) if h_eff else 1. + np.asarray([hbin[i][0] for i in range(2*dim+1)], 'i')) + eff = h_eff.GetBinContent(hbin[2*dim][0]) if h_eff else 1. if np.isclose(eff, 0.): self.logger.error('efficiency 0 for %s', hbin[4]) continue - if (cnt_gen := h_gen.GetBinContent(hbin[2][0], hbin[3][0])) > 0.: + if (cnt_gen := h_gen.GetBinContent(*(hbin[i][0] for i in range(dim, 2*dim)))) > 0.: fac = 1. if frac_flat > 0.: fac += frac_flat * (1. / cnt_gen - 1.) for _ in range(int(n)): - rm.Fill(hbin[0][1], hbin[1][1], hbin[2][1], hbin[3][1], 1./eff * fac) + rm.Fill(*(hbin[iaxis][1] for iaxis in range(2*dim)), 1./eff * fac) # rm.Mresponse().Print() return rm @@ -1127,7 +1132,7 @@ def _subtract_feeddown(self, hist, var, mcordata): #region unfolding def _unfold(self, hist, var, mcordata): - self.logger.debug('Unfolding for %s', var) + self.logger.info('Unfolding for %s', var) suffix = '_frac' if mcordata == 'mc' else '' with TFile(self.n_fileeff) as rfile: h_response = rfile.Get(f'h_response_pr_{var}{suffix}') @@ -1158,7 +1163,7 @@ def _unfold(self, hist, var, mcordata): self._save_hist(h_effkine_gen, f'uf/h_effkine-ptjet-{var}_pr_gen_{mcordata}.png', 'text') # TODO: move, has nothing to do with unfolding - if mcordata == 'mc': + if mcordata == 'mc' and get_dim(hist) <= 2: h_mctruth_pr = rfile.Get(f'h_ptjet-pthf-{var}_pr_gen') if h_mctruth_pr: h_mctruth_pr = project_hist(h_mctruth_pr, [0, 2], {}) @@ -1181,7 +1186,7 @@ def _unfold(self, hist, var, mcordata): self._save_hist(fh_unfolding_output, f'uf/h_ptjet-{var}_{mcordata}_unfoldeffcorr{n}.png', 'texte') h_unfolding_output.append(fh_unfolding_output) - if mcordata == 'mc': + if mcordata == 'mc' and get_dim(hist) <= 2: if h_mctruth_pr: h_mcunfolded = fh_unfolding_output.Clone() h_mcunfolded.Divide(h_mctruth_pr) diff --git a/machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp.yml b/machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp.yml index b1dd5cf8c8..3c34c6418d 100644 --- a/machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp.yml +++ b/machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp.yml @@ -509,7 +509,7 @@ D0Jet_pp: multi: data: nprocessesparallel: 80 - maxfiles: [-1] #list of periods + maxfiles: [1] #list of periods chunksizeunp: [100] #list of periods chunksizeskim: [100] #list of periods fracmerge: [.1] #list of periods @@ -532,7 +532,7 @@ D0Jet_pp: mcreweights: [../Analyses] #list of periods mc: nprocessesparallel: 80 - maxfiles: [-1] #list of periods + maxfiles: [1] #list of periods chunksizeunp: [100] #list of periods chunksizeskim: [1000] #list of periods fracmerge: [1.] #list of periods @@ -721,17 +721,39 @@ D0Jet_pp: bins_det_fix: [10, 0., 1.] label: "#Delta#it{r}" lntheta: - bins_gen_fix: [10, 0., 5.] - bins_det_fix: [10, 0., 5.] + bins_gen_fix: [8, 1., 5.] + bins_det_fix: [8, 1., 5.] label: "#minusln(#it{#theta})" arraycols: [3] lnkt: - bins_gen_fix: [10, -8., 2.] - bins_det_fix: [10, -8., 2.] + bins_gen_fix: [8, -4., 4.] + bins_det_fix: [8, -4., 4.] label: "ln(#it{k}_{T}/(GeV/#it{c}))" arraycols: [3] lntheta-lnkt: arraycols: [3, 4] + # new variables + fEnergyMother: + bins_gen_fix: [1, 0., 100.] + bins_det_fix: [1, 0., 100.] + arraycols: [3] + # lntheta-lnkt-fEnergyMother: + # arraycols: [3, 4, 5] + fJetNConstituents: + bins_gen_fix: [5, 0., 20.] + bins_det_fix: [5, 0., 20.] + zpar-fJetNConstituents: {} + nsub21: + # TODO: check for 1-track jets + bins_gen_fix: [11, -1., 1.] + bins_det_fix: [11, -1., 1.] + eecweight: + # TODO: adjust binning + bins_gen_fix: [10, 0., 1.] + bins_det_fix: [10, 0., 1.] + arraycols: [3] + fPairTheta-eecweight: + arraycols: [3, 4] data_selections: mcsig: diff --git a/machine_learning_hep/processer.py b/machine_learning_hep/processer.py index 0b8f9a8e17..c5a014d75d 100644 --- a/machine_learning_hep/processer.py +++ b/machine_learning_hep/processer.py @@ -21,8 +21,10 @@ import re import sys import tempfile +import traceback from copy import deepcopy from functools import reduce +from typing import TypeVar from pandas.api.types import is_numeric_dtype import numpy as np @@ -288,7 +290,8 @@ def __init__(self, case, datap, run_param, mcordata, p_maxfiles, # pylint: disab # Flag if they should be used self.do_custom_analysis_cuts = datap["analysis"][self.typean].get("use_cuts", False) - def cfg(self, param, default = None): + T = TypeVar("T") + def cfg(self, param: str, default: T = None) -> T: return reduce(lambda d, key: d.get(key, default) if isinstance(d, dict) else default, param.split("."), self.datap['analysis'][self.typean]) @@ -492,6 +495,7 @@ def applymodel(self, file_index): @staticmethod def callback(ex): get_logger().exception('Error callback: %s', ex) + traceback.print_stack() raise ex def parallelizer(self, function, argument_list, maxperchunk): diff --git a/machine_learning_hep/processer_jet.py b/machine_learning_hep/processer_jet.py index 2e32a520f6..e57fd461cf 100644 --- a/machine_learning_hep/processer_jet.py +++ b/machine_learning_hep/processer_jet.py @@ -21,7 +21,7 @@ from machine_learning_hep.processer import Processer from machine_learning_hep.utilities import dfquery, read_df -from machine_learning_hep.utils.hist import bin_array, create_hist, fill_hist, get_axis, get_range +from machine_learning_hep.utils.hist import bin_array, create_hist, fill_hist, get_axis, get_range, project_hist # pylint: disable=too-many-instance-attributes, too-many-statements @@ -61,7 +61,7 @@ def __init__(self, case, datap, run_param, mcordata, p_maxfiles, # pylint: disab self.binarray_pthf = np.asarray(self.cfg('sel_an_binmin', []) + self.cfg('sel_an_binmax', [])[-1:], 'd') self.binarrays_obs = {'gen': {}, 'det': {}} self.binarrays_ptjet = {'gen': {}, 'det': {}} - for obs in self.cfg('observables'): + for obs in self.cfg('observables', {}): var = obs.split('-') for v in var: if v in self.binarrays_obs: @@ -131,6 +131,7 @@ def _calculate_variables(self, df, verify=False): # pylint: disable=invalid-name df['lntheta'] = None return df df['nsub21'] = df.fNSub2 / df.fNSub1 + # TODO: catch nsub1 == 0 self.logger.debug('zg') df['zg_array'] = np.array(.5 - abs(df.fPtSubLeading / (df.fPtLeading + df.fPtSubLeading) - .5)) zcut = self.cfg('zcut', .1) @@ -145,6 +146,10 @@ def _calculate_variables(self, df, verify=False): # pylint: disable=invalid-name df['lntheta'] = df['fTheta'].apply(lambda x: -np.log(x)) # df['lntheta'] = np.array(-np.log(df.fTheta)) + self.logger.info('EEC') + df['eecweight'] = df[['fPairPt', 'fJetPt']].apply( + (lambda ar: ar.fPairPt / ar.fJetPt**2), axis=1) + if self.cfg('hfjet', True): df['dr'] = np.sqrt((df.fJetEta - df.fEta)**2 + ((df.fJetPhi - df.fPhi + math.pi) % math.tau - math.pi)**2) df['jetPx'] = df.fJetPt * np.cos(df.fJetPhi) @@ -245,7 +250,7 @@ def process_histomass_single(self, index): self._calculate_variables(df) for obs, spec in self.cfg('observables', {}).items(): - self.logger.debug('preparing histograms for %s', obs) + self.logger.info('preparing histograms for %s', obs) var = obs.split('-') if not all(v in df for v in var): self.logger.error('dataframe does not contain %s', var) @@ -265,7 +270,7 @@ def process_histomass_single(self, index): # - priors (reweight response matrix) # region efficiency - # pylint: disable=too-many-branches,too-many-statements + # pylint: disable=too-many-branches,too-many-statements,too-many-locals def process_efficiency_single(self, index): self.logger.info('Processing (efficiency) %s', self.l_evtorig[index]) @@ -273,50 +278,60 @@ def process_efficiency_single(self, index): levels_eff = ['gen', 'det', 'genmatch', 'detmatch', 'detmatch_gencuts'] levels_effkine = ['gen', 'det'] cuts = ['nocuts', 'cut'] - observables = self.cfg('observables', []) + observables = self.cfg('observables', {}) observables.update({'fPt': {'label': 'p_{T}^{HF} (GeV/#it{c})'}}) h_eff = {(cat, level): create_hist(f'h_ptjet-pthf_{cat}_{level}', ';p_{T}^{jet} (GeV/#it{c});p_{T}^{HF} (GeV/#it{c})', self.binarrays_ptjet['det']['fPt'], self.binarray_pthf) for cat in cats for level in levels_eff} - # TODO: extend to multi-dimensional observables - h_response = { - (cat, var): create_hist( - f'h_response_{cat}_{var}', - f";p_{{T}}^{{jet}} (GeV/#it{{c}});{var};p_{{T}}^{{jet}} (GeV/#it{{c}});{var};p_{{T}} (GeV/#it{{c}})", - self.binarrays_ptjet['det'][var], self.binarrays_obs['det'][var], - self.binarrays_ptjet['gen'][var], self.binarrays_obs['gen'][var], - self.binarray_pthf) - for (cat, var) in itertools.product(cats, observables) - if not '-' in var} - h_response_fd = {var: - create_hist( - f'h_response_fd_{var}', - f";p_{{T}}^{{jet}} (GeV/#it{{c}});{var};p_{{T}}^{{jet}} (GeV/#it{{c}});{var};p_{{T}} (GeV/#it{{c}})", - self.binarrays_ptjet['det'][var], self.binarrays_obs['det']['fPt'], self.binarrays_obs['det'][var], - self.binarrays_ptjet['gen'][var], self.binarrays_obs['gen']['fPt'], self.binarrays_obs['gen'][var]) - for var in self.cfg('observables', []) if not '-' in var} - # TODO: derive bins from response histogram - h_effkine = {(cat, level, cut, var): - create_hist(f'h_effkine_{cat}_{level}_{cut}_{var}', - f";p_{{T}}^{{jet}} (GeV/#it{{c}});{var_spec['label']}", - self.binarrays_ptjet[level][var], self.binarrays_obs[level][var]) - for (var, var_spec), level, cat, cut - in itertools.product(observables.items(), levels_effkine, cats, cuts) - if not '-' in var} - h_effkine_fd = {(level, cut, var): create_hist(f'h_effkine_fd_{level}_{cut}_{var}', - f";p_{{T}}^{{jet}} (GeV/#it{{c}});{var_spec['label']}", - self.binarrays_ptjet[level][var], self.binarrays_obs[level]['fPt'], self.binarrays_obs[level][var]) - for (var, var_spec), level, cut - in itertools.product(self.cfg('observables', {}).items(), levels_effkine, cuts) - if not '-' in var} - h_mctruth = { - (cat, var): create_hist( - f'h_ptjet-pthf-{var}_{cat}_gen', - f";p_{{T}}^{{jet}} (GeV/#it{{c}});p_{{T}}^{{HF}} (GeV/#it{{c}});{var}", - self.binarrays_ptjet['gen'][var], self.binarray_pthf, self.binarrays_obs['gen'][var]) - for (cat, var) in itertools.product(cats, observables) - if not '-' in var} + h_response = {} + h_effkine = {} + h_response_fd = {} + h_effkine_fd = {} + h_mctruth = {} + for cat in cats: + for obs in self.cfg('observables', {}): + self.logger.info('preparing response matrix for %s', obs) + var = obs.split('-') + dim = len(var) + 1 + h_response[(cat, obs)] = h = create_hist( + f'h_response_{cat}_{obs}', f"response matrix {obs}", + self.binarrays_ptjet['det'][var[0]], *[self.binarrays_obs['det'][v] for v in var], + self.binarrays_ptjet['gen'][var[0]], *[self.binarrays_obs['gen'][v] for v in var], + self.binarray_pthf) + get_axis(h, 0).SetTitle("p_{T}^{jet} (GeV/#it{c})") + get_axis(h, dim).SetTitle("p_{T}^{jet} (GeV/#it{c})") + get_axis(h, 2*dim).SetTitle("p_{T}^{HF} (GeV/#it{c})") + for i, v in enumerate(var, 1): + get_axis(h, i).SetTitle(self.cfg(f'observables.{v}.label', v)) + get_axis(h, i+dim).SetTitle(self.cfg(f'observables.{v}.label', v)) + for cut in cuts: + h_effkine[(cat, 'det', cut, obs)] = he = project_hist(h, list(range(dim)), {}).Clone() + he.SetName(f'h_effkine_{cat}_det_{cut}_{obs}') + h_effkine[(cat, 'gen', cut, obs)] = he = project_hist(h, list(range(dim, 2*dim)), {}).Clone() + he.SetName(f'h_effkine_{cat}_gen_{cut}_{obs}') + h_mctruth[(cat, obs)] = create_hist( + f'h_ptjet-pthf-{obs}_{cat}_gen', + f";p_{{T}}^{{jet}} (GeV/#it{{c}});p_{{T}}^{{HF}} (GeV/#it{{c}});{obs}", + self.binarrays_ptjet['gen'][var[0]], + self.binarray_pthf, + *[self.binarrays_obs['gen'][v] for v in var]) + h_response_fd[obs] = create_hist( + f'h_response_fd_{obs}', + f";response matrix fd {obs}", + self.binarrays_ptjet['det'][var[0]], + self.binarrays_obs['det']['fPt'], + *[self.binarrays_obs['det'][v] for v in var], + self.binarrays_ptjet['gen'][var[0]], + self.binarrays_obs['gen']['fPt'], + *[self.binarrays_obs['gen'][v] for v in var]) + for level, cut in itertools.product(levels_effkine, cuts): + h_effkine_fd[(level, cut, obs)] = create_hist( + f'h_effkine_fd_{level}_{cut}_{obs}', + f"effkine {obs}", + self.binarrays_ptjet[level][var[0]], + self.binarrays_obs[level]['fPt'], + *[self.binarrays_obs[level][v] for v in var]) # create partial versions for closure testing h_effkine_frac = copy.deepcopy(h_effkine) @@ -326,9 +341,9 @@ def process_efficiency_single(self, index): with TFile.Open(self.l_histoeff[index], "recreate") as rfile: # TODO: avoid hard-coding values here (check if restriction is needed at all) - cols = ['ismcprompt', 'ismcsignal', 'ismcfd', + cols = None if not self.cfg('hfjet', True) else ['ismcprompt', 'ismcsignal', 'ismcfd', 'fPt', 'fEta', 'fPhi', 'fJetPt', 'fJetEta', 'fJetPhi', 'fPtLeading', 'fPtSubLeading', 'fTheta', - 'fNSub2DR', 'fNSub1', 'fNSub2'] if self.cfg('hfjet', True) else None + 'fNSub2DR', 'fNSub1', 'fNSub2', 'fJetNConstituents', 'fEnergyMother', 'fPairTheta', 'fPairPt'] # read generator level dfgen_orig = pd.concat(read_df(self.mptfiles_gensk[bin][index], columns=cols) @@ -383,10 +398,15 @@ def process_efficiency_single(self, index): else: self.logger.error('No matching, could not fill matched detector-level histograms') - for var, cat in itertools.product(observables, cats): - # TODO: add support for more complex observables - if '-' in var or self.cfg(f'observables.{var}.arraycols'): - continue + for obs, cat in itertools.product(observables, cats): + if cat in dfmatch and dfmatch[cat] is not None: + self._prepare_response(dfmatch[cat], h_effkine, h_response, cat, obs) + f = self.cfg('frac_mcana', .2) + _, df_mccorr = self.split_df(dfmatch[cat], f if f < 1. else 0.) + self._prepare_response(df_mccorr, h_effkine_frac, h_response_frac, cat, obs) + self._prepare_response_fd(dfmatch[cat], h_effkine_fd, h_response_fd, obs) + + # TODO: move outside of loop? if self.cfg('closure.use_matched'): self.logger.info('using matched for truth') df_mcana, _ = self.split_df(dfmatch[cat], self.cfg('frac_mcana', .2)) @@ -395,14 +415,13 @@ def process_efficiency_single(self, index): if f := self.cfg('closure.exclude_feeddown_gen'): self.logger.debug('excluding feeddown gen') dfquery(df_mcana, f, inplace=True) - fill_hist(h_mctruth[(cat, var)], df_mcana[['fJetPt_gen', 'fPt_gen', f'{var}_gen']]) - if cat in dfmatch and dfmatch[cat] is not None: - self._prepare_response(dfmatch[cat], h_effkine, h_response, cat, var) - self._prepare_response_fd(dfmatch[cat], h_effkine_fd, h_response_fd, var) - f = self.cfg('frac_mcana', .2) - _, df_mccorr = self.split_df(dfmatch[cat], f if f < 1. else 0.) - self._prepare_response(df_mccorr, h_effkine_frac, h_response_frac, cat, var) + arraycols = [i - 3 for i in self.cfg(f'observables.{obs}.arraycols', [])] + var = obs.split('-') + self.logger.debug("Observable %s has arraycols %s -> %s", + obs, arraycols, [var[icol] for icol in arraycols]) + df_mcana = self._explode_arraycols(df_mcana, [var[icol] for icol in arraycols]) + fill_hist(h_mctruth[(cat, obs)], df_mcana[['fJetPt_gen', 'fPt_gen', *(f'{v}_gen' for v in var)]]) for name, obj in itertools.chain(h_eff.items(), h_effkine.items(), h_response.items(), h_effkine_fd.items(), h_response_fd.items(), @@ -412,59 +431,83 @@ def process_efficiency_single(self, index): except Exception as ex: # pylint: disable=broad-exception-caught self.logger.error('Writing of <%s> (%s) failed: %s', name, str(obj), str(ex)) - def _prepare_response(self, dfi, h_effkine, h_response, cat, var): - axis_ptjet_det = get_axis(h_response[(cat, var)], 0) - axis_var_det = get_axis(h_response[(cat, var)], 1) - axis_ptjet_gen = get_axis(h_response[(cat, var)], 2) - axis_var_gen = get_axis(h_response[(cat, var)], 3) + def _explode_arraycols(self, df: pd.DataFrame, arraycols: "list[str]") -> pd.DataFrame: + if len(arraycols) > 0: + self.logger.debug("Exploding columns %s", arraycols) + # only consider rows with corresponding det- and gen-level entries + df['length'] = [len(x) for x in df[arraycols[0]]] + df['length_gen'] = [len(x) for x in df[arraycols[0] + '_gen']] + df = df.loc[df.length == df.length_gen] + df = df.explode(arraycols + [col + '_gen' for col in arraycols]) + df.dropna(inplace=True) + return df + + def _prepare_response(self, dfi, h_effkine, h_response, cat, obs): + var = obs.split('-') + dim = len(var) + 1 + axes_det = [get_axis(h_response[(cat, obs)], i) for i in range(dim)] + axes_gen = [get_axis(h_response[(cat, obs)], i) for i in range(dim, 2 * dim)] + arraycols = [i - 3 for i in self.cfg(f'observables.{obs}', {}).get('arraycols', [])] df = dfi - # TODO: the first cut should be taken care of by under-/overflow bins, check their usage in analyzer - df = df.loc[(df.fJetPt >= axis_ptjet_det.GetXmin()) & (df.fJetPt < axis_ptjet_det.GetXmax()) & - (df[var] >= axis_var_det.GetXmin()) & (df[var] < axis_var_det.GetXmax())] - fill_hist(h_effkine[(cat, 'det', 'nocuts', var)], df[['fJetPt', var]]) - df = df.loc[(df.fJetPt_gen >= axis_ptjet_gen.GetXmin()) & (df.fJetPt_gen < axis_ptjet_gen.GetXmax()) & - (df[f'{var}_gen'] >= axis_var_gen.GetXmin()) & (df[f'{var}_gen'] < axis_var_gen.GetXmax())] - fill_hist(h_effkine[(cat, 'det', 'cut', var)], df[['fJetPt', var]]) + df = self._explode_arraycols(df, [var[icol] for icol in arraycols]) - fill_hist(h_response[(cat, var)], df[['fJetPt', f'{var}', 'fJetPt_gen', f'{var}_gen', 'fPt']]) + df = df.loc[(df.fJetPt >= axes_det[0].GetXmin()) & (df.fJetPt < axes_det[0].GetXmax())] + for i, v in enumerate(var, 1): + df = df.loc[(df[v] >= axes_det[i].GetXmin()) & (df[v] < axes_det[i].GetXmax())] + fill_hist(h_effkine[(cat, 'det', 'nocuts', obs)], df[['fJetPt', *var]]) + df = df.loc[(df.fJetPt >= axes_gen[0].GetXmin()) & (df.fJetPt < axes_gen[0].GetXmax())] + for i, v in enumerate(var, 1): + df = df.loc[(df[f'{v}_gen'] >= axes_gen[i].GetXmin()) & (df[f'{v}_gen'] < axes_gen[i].GetXmax())] + fill_hist(h_effkine[(cat, 'det', 'cut', obs)], df[['fJetPt', *var]]) + + # print(df[['fJetPt', *var, 'fJetPt_gen', *(f'{v}_gen' for v in var), 'fPt']].info(), flush=True) + fill_hist(h_response[(cat, obs)], df[['fJetPt', *var, 'fJetPt_gen', *(f'{v}_gen' for v in var), 'fPt']]) df = dfi - df = df.loc[(df.fJetPt_gen >= axis_ptjet_gen.GetXmin()) & (df.fJetPt_gen < axis_ptjet_gen.GetXmax()) & - (df[f'{var}_gen'] >= axis_var_gen.GetXmin()) & (df[f'{var}_gen'] < axis_var_gen.GetXmax())] - fill_hist(h_effkine[(cat, 'gen', 'nocuts', var)], df[['fJetPt_gen', f'{var}_gen']]) - df = df.loc[(df.fJetPt >= axis_ptjet_det.GetXmin()) & (df.fJetPt < axis_ptjet_det.GetXmax()) & - (df[f'{var}'] >= axis_var_det.GetXmin()) & (df[f'{var}'] < axis_var_det.GetXmax())] - fill_hist(h_effkine[(cat, 'gen', 'cut', var)], df[['fJetPt_gen', f'{var}_gen']]) - - - def _prepare_response_fd(self, dfi, h_effkine, h_response, var): - axis_ptjet_det = get_axis(h_response[var], 0) - axis_pthf_det = get_axis(h_response[var], 1) - axis_var_det = get_axis(h_response[var], 2) - axis_ptjet_gen = get_axis(h_response[var], 3) - axis_pthf_gen = get_axis(h_response[var], 4) - axis_var_gen = get_axis(h_response[var], 5) + df = self._explode_arraycols(df, [var[icol] for icol in arraycols]) + df = df.loc[(df.fJetPt >= axes_gen[0].GetXmin()) & (df.fJetPt < axes_gen[0].GetXmax())] + for i, v in enumerate(var, 1): + df = df.loc[(df[f'{v}_gen'] >= axes_gen[i].GetXmin()) & (df[f'{v}_gen'] < axes_gen[i].GetXmax())] + fill_hist(h_effkine[(cat, 'gen', 'nocuts', obs)], df[['fJetPt_gen', *(f'{v}_gen' for v in var)]]) + df = df.loc[(df.fJetPt >= axes_det[0].GetXmin()) & (df.fJetPt < axes_det[0].GetXmax())] + for i, v in enumerate(var, 1): + df = df.loc[(df[v] >= axes_det[i].GetXmin()) & (df[v] < axes_det[i].GetXmax())] + fill_hist(h_effkine[(cat, 'gen', 'cut', obs)], df[['fJetPt_gen', *(f'{v}_gen' for v in var)]]) + + + def _prepare_response_fd(self, dfi, h_effkine, h_response, obs): + var = obs.split('-') + dim = len(var) + 2 + axes_det = [get_axis(h_response[obs], i) for i in range(dim)] + axes_gen = [get_axis(h_response[obs], i) for i in range(dim, 2 * dim)] + arraycols = [i - 3 for i in self.cfg(f'observables.{obs}', {}).get('arraycols', [])] df = dfi + df = self._explode_arraycols(df, [var[icol] for icol in arraycols]) # TODO: the first cut should be taken care of by under-/overflow bins, check their usage in analyzer - df = df.loc[(df.fJetPt >= axis_ptjet_det.GetXmin()) & (df.fJetPt < axis_ptjet_det.GetXmax()) & - (df.fPt >= axis_pthf_det.GetXmin()) & (df.fPt < axis_pthf_det.GetXmax()) & - (df[var] >= axis_var_det.GetXmin()) & (df[var] < axis_var_det.GetXmax())] - fill_hist(h_effkine[('det', 'nocuts', var)], df[['fJetPt', 'fPt', var]]) - df = df.loc[(df.fJetPt_gen >= axis_ptjet_gen.GetXmin()) & (df.fJetPt_gen < axis_ptjet_gen.GetXmax()) & - (df.fPt_gen >= axis_pthf_gen.GetXmin()) & (df.fPt_gen < axis_pthf_gen.GetXmax()) & - (df[f'{var}_gen'] >= axis_var_gen.GetXmin()) & (df[f'{var}_gen'] < axis_var_gen.GetXmax())] - fill_hist(h_effkine[('det', 'cut', var)], df[['fJetPt', 'fPt', var]]) - - fill_hist(h_response[var], df[['fJetPt', 'fPt', f'{var}', 'fJetPt_gen', 'fPt_gen', f'{var}_gen']]) + df = df.loc[(df.fJetPt >= axes_det[0].GetXmin()) & (df.fJetPt < axes_det[0].GetXmax()) & + (df.fPt >= axes_det[1].GetXmin()) & (df.fPt < axes_det[1].GetXmax())] + for i, v in enumerate(var, 2): + df = df.loc[(df[v] >= axes_det[i].GetXmin()) & (df[v] < axes_det[i].GetXmax())] + fill_hist(h_effkine[('det', 'nocuts', obs)], df[['fJetPt', 'fPt', *var]]) + df = df.loc[(df.fJetPt_gen >= axes_gen[0].GetXmin()) & (df.fJetPt_gen < axes_gen[0].GetXmax()) & + (df.fPt_gen >= axes_gen[1].GetXmin()) & (df.fPt_gen < axes_gen[1].GetXmax())] + for i, v in enumerate(var, 2): + df = df.loc[(df[f'{v}_gen'] >= axes_gen[i].GetXmin()) & (df[f'{v}_gen'] < axes_gen[i].GetXmax())] + fill_hist(h_effkine[('det', 'cut', obs)], df[['fJetPt', 'fPt', *var]]) + + fill_hist(h_response[obs], df[['fJetPt', 'fPt', *var, 'fJetPt_gen', 'fPt_gen', *(f'{v}_gen' for v in var)]]) df = dfi - df = df.loc[(df.fJetPt_gen >= axis_ptjet_gen.GetXmin()) & (df.fJetPt_gen < axis_ptjet_gen.GetXmax()) & - (df.fPt_gen >= axis_pthf_gen.GetXmin()) & (df.fPt_gen < axis_pthf_gen.GetXmax()) & - (df[f'{var}_gen'] >= axis_var_gen.GetXmin()) & (df[f'{var}_gen'] < axis_var_gen.GetXmax())] - fill_hist(h_effkine[('gen', 'nocuts', var)], df[['fJetPt_gen', 'fPt', f'{var}_gen']]) - df = df.loc[(df.fJetPt >= axis_ptjet_det.GetXmin()) & (df.fJetPt < axis_ptjet_det.GetXmax()) & - (df.fPt >= axis_pthf_det.GetXmin()) & (df.fPt < axis_pthf_det.GetXmax()) & - (df[f'{var}'] >= axis_var_det.GetXmin()) & (df[f'{var}'] < axis_var_det.GetXmax())] - fill_hist(h_effkine[('gen', 'cut', var)], df[['fJetPt_gen', 'fPt', f'{var}_gen']]) + df = self._explode_arraycols(df, [var[icol] for icol in arraycols]) + df = df.loc[(df.fJetPt_gen >= axes_gen[0].GetXmin()) & (df.fJetPt_gen < axes_gen[0].GetXmax()) & + (df.fPt_gen >= axes_gen[1].GetXmin()) & (df.fPt_gen < axes_gen[1].GetXmax())] + for i, v in enumerate(var, 2): + df = df.loc[(df[f'{v}_gen'] >= axes_gen[i].GetXmin()) & (df[f'{v}_gen'] < axes_gen[i].GetXmax())] + fill_hist(h_effkine[('gen', 'nocuts', obs)], df[['fJetPt_gen', 'fPt', *(f'{v}_gen' for v in var)]]) + df = df.loc[(df.fJetPt >= axes_det[0].GetXmin()) & (df.fJetPt < axes_det[0].GetXmax()) & + (df.fPt >= axes_det[1].GetXmin()) & (df.fPt < axes_det[1].GetXmax())] + for i, v in enumerate(var, 2): + df = df.loc[(df[v] >= axes_det[i].GetXmin()) & (df[v] < axes_det[i].GetXmax())] + fill_hist(h_effkine[('gen', 'cut', obs)], df[['fJetPt_gen', 'fPt', *(f'{v}_gen' for v in var)]]) diff --git a/machine_learning_hep/utils/hist.py b/machine_learning_hep/utils/hist.py index 1bbe330320..99f09ab53a 100644 --- a/machine_learning_hep/utils/hist.py +++ b/machine_learning_hep/utils/hist.py @@ -123,7 +123,7 @@ def create_hist(name, title, *bin_specs): var_bins = [hasattr(spec, '__len__') for spec in bin_specs] assert all(var_bins) or not any(var_bins), f'either all bins must be variable or fixed width: {bin_specs=}' dim = len(bin_specs) if all(var_bins) else len(bin_specs) / 3 - assert dim in range(1, 10), 'only dimensions from 1 to 10 are supported' + assert dim in range(1, 12), 'only dimensions from 1 to 10 are supported' if all(var_bins): nbins = list(map(lambda a: len(a) - 1, bin_specs)) @@ -154,7 +154,7 @@ def fill_hist(hist, dfi: pd.DataFrame, weights = None, arraycols = None, write = """ dim_hist = hist.GetDimension() if isinstance(hist, ROOT.TH1) else hist.GetNdimensions() dim_df = dfi.shape[1] if dfi.ndim > 1 else dfi.ndim - assert dim_df in range(1, 10), f'{dim_df} not supported' + assert dim_df in range(1, 12), f'{dim_df} not supported' assert dim_df == dim_hist, 'dimensions of df and histogram do not match' if len(dfi) == 0: return @@ -183,7 +183,7 @@ def fill_hist(hist, dfi: pd.DataFrame, weights = None, arraycols = None, write = elif dim_hist > 3: assert weights is None, 'weights not supported' if not arraycols: - dfi.apply(lambda row: hist.Fill(*row), axis=1) + dfi.apply(lambda row: hist.Fill(np.array(row, 'd'), 1.), axis=1) else: m = [-1] * dim_hist idx = 0 @@ -252,8 +252,15 @@ def sum_hists(hists, name = None): def ensure_sumw2(hist): - if hist.GetSumw2N() < 1: - hist.Sumw2() + if isinstance(hist, ROOT.TH1): + if hist.GetSumw2N() < 1: + hist.Sumw2() + elif isinstance(hist, ROOT.THn): + if hist.GetSumw2() < 0.: + hist.Sumw2() + else: + raise NotImplementedError + def get_bin_val(hist, hbin):