From 569a6b7b4c0ee41f89d015323c3f5b2baeb68cc6 Mon Sep 17 00:00:00 2001 From: Jochen Klein Date: Tue, 21 Jan 2025 15:00:42 +0100 Subject: [PATCH 01/10] Establish lntheta-lnkt plots for D0 --- .../analysis/analyzer_jets.py | 35 +++-- .../database_ml_parameters_D0Jet_pp.yml | 8 +- machine_learning_hep/submission/analyzer.yml | 144 +++++++++--------- machine_learning_hep/utils/hist.py | 11 +- 4 files changed, 105 insertions(+), 93 deletions(-) diff --git a/machine_learning_hep/analysis/analyzer_jets.py b/machine_learning_hep/analysis/analyzer_jets.py index 266b2c319a..dd5236f595 100644 --- a/machine_learning_hep/analysis/analyzer_jets.py +++ b/machine_learning_hep/analysis/analyzer_jets.py @@ -74,8 +74,8 @@ def __init__(self, datap, case, typean, period): self.observables = { 'qa': ['zg', 'rg', 'nsd', 'zpar', 'dr', 'lntheta', 'lnkt', 'lntheta-lnkt'], - 'all': [var for var, spec in self.cfg('observables', {}).items() - if '-' not in var and 'arraycols' not in spec], + 'all': [var for var, spec in self.cfg('observables', {}).items()] + # if '-' not in var and 'arraycols' not in spec], } self.bins_candpt = np.asarray(self.cfg('sel_an_binmin', []) + self.cfg('sel_an_binmax', [])[-1:], 'd') @@ -716,6 +716,7 @@ def _subtract_sideband(self, hist, var, mcordata, ipt): fh_subtracted.Scale(1. / frac_sig) self._save_hist(fh_subtracted, f'sideband/h_ptjet{label}_subtracted_' f'{string_range_pthf(range_pthf)}_{mcordata}.png') + print('subtraction done', flush=True) return fh_subtracted @@ -750,19 +751,19 @@ def _analyze(self, method = 'sidesub'): else: self.logger.critical('invalid method %s', method) self._save_hist(h, f'h_ptjet{label}_{method}_noeff_{mcordata}_pt{ipt}.png') - if mcordata == 'mc': - h_proj = project_hist(h_in, axes_proj[1:], {}) - h_proj_lim = project_hist(h_in, axes_proj[1:], {0: (1, get_nbins(h_in, 0))}) - self._save_hist(h_proj, f'h_ptjet{label}_proj_noeff_{mcordata}_pt{ipt}.png') - if h and h_proj: - self.logger.debug('signal loss %s-%i: %g, fraction in under-/overflow: %g', - mcordata, ipt, - 1. - h.Integral()/h_proj.Integral(), - 1. - h_proj_lim.Integral()/h_proj.Integral()) - if self.cfg('closure.pure_signal'): - self.logger.debug('assuming pure signal, using projection') - h = h_proj - # Efficiency correction + # if mcordata == 'mc': + # h_proj = project_hist(h_in, axes_proj[1:], {}) + # h_proj_lim = project_hist(h_in, axes_proj[1:], {0: (1, get_nbins(h_in, 0))}) + # self._save_hist(h_proj, f'h_ptjet{label}_proj_noeff_{mcordata}_pt{ipt}.png') + # if h and h_proj: + # self.logger.debug('signal loss %s-%i: %g, fraction in under-/overflow: %g', + # mcordata, ipt, + # 1. - h.Integral()/h_proj.Integral(), + # 1. - h_proj_lim.Integral()/h_proj.Integral()) + # if self.cfg('closure.pure_signal'): + # self.logger.debug('assuming pure signal, using projection') + # h = h_proj + # # Efficiency correction if mcordata == 'data' or not self.cfg('closure.use_matched'): self.logger.info("Efficiency correction: obs. %s, %s, ipt %d", var, mcordata, ipt) @@ -785,6 +786,10 @@ def _analyze(self, method = 'sidesub'): f'_{string_range_ptjet(range_ptjet)}.png') self._save_canvas(c, filename) + # TODO: remove restriction on higher dimensions + if var and '-' in var: + continue + fh_sum_fdsub = fh_sum.Clone() # Feed-down subtraction self.logger.info("Feed-down subtraction: obs. %s, %s", var, mcordata) diff --git a/machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp.yml b/machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp.yml index b1dd5cf8c8..b235705ab0 100644 --- a/machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp.yml +++ b/machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp.yml @@ -721,13 +721,13 @@ D0Jet_pp: bins_det_fix: [10, 0., 1.] label: "#Delta#it{r}" lntheta: - bins_gen_fix: [10, 0., 5.] - bins_det_fix: [10, 0., 5.] + bins_gen_fix: [4, 1., 5.] + bins_det_fix: [4, 1., 5.] label: "#minusln(#it{#theta})" arraycols: [3] lnkt: - bins_gen_fix: [10, -8., 2.] - bins_det_fix: [10, -8., 2.] + bins_gen_fix: [10, -4., 6.] + bins_det_fix: [10, -4., 6.] label: "ln(#it{k}_{T}/(GeV/#it{c}))" arraycols: [3] lntheta-lnkt: diff --git a/machine_learning_hep/submission/analyzer.yml b/machine_learning_hep/submission/analyzer.yml index e08341b099..5942788fb9 100644 --- a/machine_learning_hep/submission/analyzer.yml +++ b/machine_learning_hep/submission/analyzer.yml @@ -1,87 +1,87 @@ --- case: XXXX # used to find the database file unless specified explicitly as do_entire_analysis -d database_analysis download: - alice: - activate: false + alice: + activate: false conversion: # pkl - mc: - activate: false - data: - activate: false + mc: + activate: false + data: + activate: false skimming: # pkl_skimmed (pklsk), pkl_evtcounter_all - mc: - activate: false - data: - activate: false + mc: + activate: false + data: + activate: false merging: # pkl_skimmed_merge_for_ml (pklskml) - mc: - activate: false - data: - activate: false + mc: + activate: false + data: + activate: false mergingperiods: # pkl_skimmed_merge_for_ml_all - mc: - activate: false - data: - activate: false + mc: + activate: false + data: + activate: false ml_study: # mlout, mlplot - activate: false - dotraining: false - dotesting: false - doplotdistr: false - doroc: false - doroctraintest: false - doimportance: false - doimportanceshap: false - docorrelation: false - dolearningcurve: false - doapplytodatamc: false - doscancuts: false - doefficiency: false - dosignifopt: false - doboundary: false - docrossvalidation: false - dogridsearch: false - dobayesianopt: false + activate: false + dotraining: false + dotesting: false + doplotdistr: false + doroc: false + doroctraintest: false + doimportance: false + doimportanceshap: false + docorrelation: false + dolearningcurve: false + doapplytodatamc: false + doscancuts: false + doefficiency: false + dosignifopt: false + doboundary: false + docrossvalidation: false + dogridsearch: false + dobayesianopt: false mlapplication: - data: - doapply: false # pkl_skimmed_dec (pklskdec) - domergeapply: false # pkl_skimmed_decmerged (pklskdecmerged) - docontinueafterstop: false # set to true to resume interrupted processing (existing corrupted output will be overwritten) - mc: - doapply: false # pkl_skimmed_dec (pklskdec) - domergeapply: false # pkl_skimmed_decmerged (pklskdecmerged) - docontinueafterstop: false # set to true to resume interrupted processing (existing corrupted output will be overwritten) + data: + doapply: false # pkl_skimmed_dec (pklskdec) + domergeapply: false # pkl_skimmed_decmerged (pklskdecmerged) + docontinueafterstop: false # set to true to resume interrupted processing (existing corrupted output will be overwritten) + mc: + doapply: false # pkl_skimmed_dec (pklskdec) + domergeapply: false # pkl_skimmed_decmerged (pklskdecmerged) + docontinueafterstop: false # set to true to resume interrupted processing (existing corrupted output will be overwritten) analysis: - type: "YYYY" # used unless specified explicitly as do_entire_analysis -a type_ana - # Do each period separately including merged (true) - # Do only merged (false) - doperperiod: false - data: - histomass: false # processer: process_histomass - mc: - histomass: false # processer: process_histomass - efficiency: false # processer: process_efficiency - steps: # analyzer methods to run (uncomment to activate) - ##### Inclusive hadrons - # fit: - # efficiency: - # makenormyields: - ##### Jets - init: - calculate_efficiencies: - qa: - fit: - estimate_feeddown: - analyze_with_sidesub: - # analyze_with_sigextr: + type: "YYYY" # used unless specified explicitly as do_entire_analysis -a type_ana + # Do each period separately including merged (true) + # Do only merged (false) + doperperiod: false + data: + histomass: false # processer: process_histomass + mc: + histomass: false # processer: process_histomass + efficiency: false # processer: process_efficiency + steps: # analyzer methods to run (uncomment to activate) + ##### Inclusive hadrons + # fit: + # efficiency: + # makenormyields: + ##### Jets + init: + calculate_efficiencies: + qa: + fit: + # estimate_feeddown: + analyze_with_sidesub: + # analyze_with_sigextr: systematics: - cutvar: - activate: false - do_only_analysis: false # This can be done anytime when mass and efficiency histograms have been produced already for a number of trials - resume: false # already done mass and efficiency histograms will not be done again, continue with left trials - mcptshape: - activate: false + cutvar: + activate: false + do_only_analysis: false # This can be done anytime when mass and efficiency histograms have been produced already for a number of trials + resume: false # already done mass and efficiency histograms will not be done again, continue with left trials + mcptshape: + activate: false diff --git a/machine_learning_hep/utils/hist.py b/machine_learning_hep/utils/hist.py index 1bbe330320..8c16895c2f 100644 --- a/machine_learning_hep/utils/hist.py +++ b/machine_learning_hep/utils/hist.py @@ -252,8 +252,15 @@ def sum_hists(hists, name = None): def ensure_sumw2(hist): - if hist.GetSumw2N() < 1: - hist.Sumw2() + if isinstance(hist, ROOT.TH1): + if hist.GetSumw2N() < 1: + hist.Sumw2() + elif isinstance(hist, ROOT.THn): + if hist.GetSumw2() < 0.: + hist.Sumw2() + else: + raise NotImplementedError + def get_bin_val(hist, hbin): From bd3b7c2289bb69c4fe0b40c6cd18e6b54021620a Mon Sep 17 00:00:00 2001 From: Jochen Klein Date: Tue, 21 Jan 2025 16:48:36 +0100 Subject: [PATCH 02/10] Generalize for higher dimensions --- .../analysis/analyzer_jets.py | 54 +++---- machine_learning_hep/submission/analyzer.yml | 144 +++++++++--------- 2 files changed, 100 insertions(+), 98 deletions(-) diff --git a/machine_learning_hep/analysis/analyzer_jets.py b/machine_learning_hep/analysis/analyzer_jets.py index dd5236f595..8cd6188c36 100644 --- a/machine_learning_hep/analysis/analyzer_jets.py +++ b/machine_learning_hep/analysis/analyzer_jets.py @@ -74,8 +74,7 @@ def __init__(self, datap, case, typean, period): self.observables = { 'qa': ['zg', 'rg', 'nsd', 'zpar', 'dr', 'lntheta', 'lnkt', 'lntheta-lnkt'], - 'all': [var for var, spec in self.cfg('observables', {}).items()] - # if '-' not in var and 'arraycols' not in spec], + 'all': [*self.cfg('observables', {})], } self.bins_candpt = np.asarray(self.cfg('sel_an_binmin', []) + self.cfg('sel_an_binmax', [])[-1:], 'd') @@ -716,7 +715,6 @@ def _subtract_sideband(self, hist, var, mcordata, ipt): fh_subtracted.Scale(1. / frac_sig) self._save_hist(fh_subtracted, f'sideband/h_ptjet{label}_subtracted_' f'{string_range_pthf(range_pthf)}_{mcordata}.png') - print('subtraction done', flush=True) return fh_subtracted @@ -743,7 +741,7 @@ def _analyze(self, method = 'sidesub'): self.logger.info("Signal extraction (method %s): obs. %s, %s, ipt %d", method, var, mcordata, ipt) if not self.cfg('hfjet', True): - h = project_hist(h_in, axes_proj[1:], {}) + h = project_hist(h_in, list(range(1, get_dim(h_in))), {}) elif method == 'sidesub': h = self._subtract_sideband(h_in, var, mcordata, ipt) elif method == 'sigextr': @@ -751,19 +749,21 @@ def _analyze(self, method = 'sidesub'): else: self.logger.critical('invalid method %s', method) self._save_hist(h, f'h_ptjet{label}_{method}_noeff_{mcordata}_pt{ipt}.png') - # if mcordata == 'mc': - # h_proj = project_hist(h_in, axes_proj[1:], {}) - # h_proj_lim = project_hist(h_in, axes_proj[1:], {0: (1, get_nbins(h_in, 0))}) - # self._save_hist(h_proj, f'h_ptjet{label}_proj_noeff_{mcordata}_pt{ipt}.png') - # if h and h_proj: - # self.logger.debug('signal loss %s-%i: %g, fraction in under-/overflow: %g', - # mcordata, ipt, - # 1. - h.Integral()/h_proj.Integral(), - # 1. - h_proj_lim.Integral()/h_proj.Integral()) - # if self.cfg('closure.pure_signal'): - # self.logger.debug('assuming pure signal, using projection') - # h = h_proj - # # Efficiency correction + if mcordata == 'mc': + self.logger.info('projecting %s onto axes: %s', h_in, axes_proj[1:]) + h_proj = project_hist(h_in, list(range(1, get_dim(h_in))), {}) + h_proj_lim = project_hist(h_in, list(range(1, get_dim(h_in))), + {0: (1, get_nbins(h_in, 0))}) + self._save_hist(h_proj, f'h_ptjet{label}_proj_noeff_{mcordata}_pt{ipt}.png') + if h and h_proj: + self.logger.debug('signal loss %s-%i: %g, fraction in under-/overflow: %g', + mcordata, ipt, + 1. - h.Integral()/h_proj.Integral(), + 1. - h_proj_lim.Integral()/h_proj.Integral()) + if self.cfg('closure.pure_signal'): + self.logger.debug('assuming pure signal, using projection') + h = h_proj + # Efficiency correction if mcordata == 'data' or not self.cfg('closure.use_matched'): self.logger.info("Efficiency correction: obs. %s, %s, ipt %d", var, mcordata, ipt) @@ -786,10 +786,6 @@ def _analyze(self, method = 'sidesub'): f'_{string_range_ptjet(range_ptjet)}.png') self._save_canvas(c, filename) - # TODO: remove restriction on higher dimensions - if var and '-' in var: - continue - fh_sum_fdsub = fh_sum.Clone() # Feed-down subtraction self.logger.info("Feed-down subtraction: obs. %s, %s", var, mcordata) @@ -798,7 +794,7 @@ def _analyze(self, method = 'sidesub'): self._clip_neg(fh_sum_fdsub) self._save_hist(fh_sum_fdsub, f'h_ptjet{label}_{method}_{mcordata}.png') - if get_dim(fh_sum) > 1: + if get_dim(fh_sum) == 2: axes = list(range(get_dim(fh_sum))) axis_ptjet = get_axis(fh_sum, 0) for iptjet in range(get_nbins(fh_sum, 0)): @@ -824,17 +820,21 @@ def _analyze(self, method = 'sidesub'): f'_{string_range_ptjet(range_ptjet)}.png') self._save_canvas(c, filename) + # TODO: remove restriction on higher dimensions if not var: continue axis_ptjet = get_axis(fh_sum_fdsub, 0) for j in range(get_nbins(fh_sum_fdsub, 0)): - # TODO: generalize to higher dimensions - hproj = project_hist(fh_sum_fdsub, [1], {0: [j+1, j+1]}) + hproj = project_hist(fh_sum_fdsub, list(range(1, get_dim(fh_sum_fdsub))), {0: [j+1, j+1]}) range_ptjet = get_bin_limits(axis_ptjet, j + 1) self._save_hist( hproj, f'uf/h_{var}_{method}_{mcordata}_{string_range_ptjet(range_ptjet)}.png') # Unfolding - self.logger.info("Unfolding: obs. %s, %s", var, mcordata) + if get_dim(fh_sum_fdsub) > 2: + self.logger.info("No unfolding for 2d distributions: obs. %s, %s", var, mcordata) + continue + else: + self.logger.info("Unfolding: obs. %s, %s", var, mcordata) fh_unfolded = self._unfold(fh_sum_fdsub, var, mcordata) for i, h in enumerate(fh_unfolded): self._save_hist(h, f'h_ptjet-{var}_{method}_unfolded_{mcordata}_{i}.png') @@ -842,7 +842,7 @@ def _analyze(self, method = 'sidesub'): range_ptjet = get_bin_limits(axis_ptjet, j + 1) c = TCanvas() for i, h in enumerate(fh_unfolded): - hproj = project_hist(h, [1], {0: (j+1, j+1)}) + hproj = project_hist(h, list(range(1, get_dim(h))), {0: (j+1, j+1)}) empty = hproj.Integral() < 1.e-7 if empty and i == 0: self.logger.error("Projection %s %s %s is empty.", var, mcordata, @@ -1133,6 +1133,8 @@ def _subtract_feeddown(self, hist, var, mcordata): #region unfolding def _unfold(self, hist, var, mcordata): self.logger.debug('Unfolding for %s', var) + if get_dim(hist) > 2: + raise NotImplementedError suffix = '_frac' if mcordata == 'mc' else '' with TFile(self.n_fileeff) as rfile: h_response = rfile.Get(f'h_response_pr_{var}{suffix}') diff --git a/machine_learning_hep/submission/analyzer.yml b/machine_learning_hep/submission/analyzer.yml index 5942788fb9..e08341b099 100644 --- a/machine_learning_hep/submission/analyzer.yml +++ b/machine_learning_hep/submission/analyzer.yml @@ -1,87 +1,87 @@ --- case: XXXX # used to find the database file unless specified explicitly as do_entire_analysis -d database_analysis download: - alice: - activate: false + alice: + activate: false conversion: # pkl - mc: - activate: false - data: - activate: false + mc: + activate: false + data: + activate: false skimming: # pkl_skimmed (pklsk), pkl_evtcounter_all - mc: - activate: false - data: - activate: false + mc: + activate: false + data: + activate: false merging: # pkl_skimmed_merge_for_ml (pklskml) - mc: - activate: false - data: - activate: false + mc: + activate: false + data: + activate: false mergingperiods: # pkl_skimmed_merge_for_ml_all - mc: - activate: false - data: - activate: false + mc: + activate: false + data: + activate: false ml_study: # mlout, mlplot - activate: false - dotraining: false - dotesting: false - doplotdistr: false - doroc: false - doroctraintest: false - doimportance: false - doimportanceshap: false - docorrelation: false - dolearningcurve: false - doapplytodatamc: false - doscancuts: false - doefficiency: false - dosignifopt: false - doboundary: false - docrossvalidation: false - dogridsearch: false - dobayesianopt: false + activate: false + dotraining: false + dotesting: false + doplotdistr: false + doroc: false + doroctraintest: false + doimportance: false + doimportanceshap: false + docorrelation: false + dolearningcurve: false + doapplytodatamc: false + doscancuts: false + doefficiency: false + dosignifopt: false + doboundary: false + docrossvalidation: false + dogridsearch: false + dobayesianopt: false mlapplication: - data: - doapply: false # pkl_skimmed_dec (pklskdec) - domergeapply: false # pkl_skimmed_decmerged (pklskdecmerged) - docontinueafterstop: false # set to true to resume interrupted processing (existing corrupted output will be overwritten) - mc: - doapply: false # pkl_skimmed_dec (pklskdec) - domergeapply: false # pkl_skimmed_decmerged (pklskdecmerged) - docontinueafterstop: false # set to true to resume interrupted processing (existing corrupted output will be overwritten) + data: + doapply: false # pkl_skimmed_dec (pklskdec) + domergeapply: false # pkl_skimmed_decmerged (pklskdecmerged) + docontinueafterstop: false # set to true to resume interrupted processing (existing corrupted output will be overwritten) + mc: + doapply: false # pkl_skimmed_dec (pklskdec) + domergeapply: false # pkl_skimmed_decmerged (pklskdecmerged) + docontinueafterstop: false # set to true to resume interrupted processing (existing corrupted output will be overwritten) analysis: - type: "YYYY" # used unless specified explicitly as do_entire_analysis -a type_ana - # Do each period separately including merged (true) - # Do only merged (false) - doperperiod: false - data: - histomass: false # processer: process_histomass - mc: - histomass: false # processer: process_histomass - efficiency: false # processer: process_efficiency - steps: # analyzer methods to run (uncomment to activate) - ##### Inclusive hadrons - # fit: - # efficiency: - # makenormyields: - ##### Jets - init: - calculate_efficiencies: - qa: - fit: - # estimate_feeddown: - analyze_with_sidesub: - # analyze_with_sigextr: + type: "YYYY" # used unless specified explicitly as do_entire_analysis -a type_ana + # Do each period separately including merged (true) + # Do only merged (false) + doperperiod: false + data: + histomass: false # processer: process_histomass + mc: + histomass: false # processer: process_histomass + efficiency: false # processer: process_efficiency + steps: # analyzer methods to run (uncomment to activate) + ##### Inclusive hadrons + # fit: + # efficiency: + # makenormyields: + ##### Jets + init: + calculate_efficiencies: + qa: + fit: + estimate_feeddown: + analyze_with_sidesub: + # analyze_with_sigextr: systematics: - cutvar: - activate: false - do_only_analysis: false # This can be done anytime when mass and efficiency histograms have been produced already for a number of trials - resume: false # already done mass and efficiency histograms will not be done again, continue with left trials - mcptshape: - activate: false + cutvar: + activate: false + do_only_analysis: false # This can be done anytime when mass and efficiency histograms have been produced already for a number of trials + resume: false # already done mass and efficiency histograms will not be done again, continue with left trials + mcptshape: + activate: false From ec83f02336652d9973be293d933eed973d795949 Mon Sep 17 00:00:00 2001 From: Jochen Klein Date: Wed, 22 Jan 2025 13:47:51 +0100 Subject: [PATCH 03/10] Establish response matrices for higher dimensions --- machine_learning_hep/processer_jet.py | 85 +++++++++++++++------------ 1 file changed, 47 insertions(+), 38 deletions(-) diff --git a/machine_learning_hep/processer_jet.py b/machine_learning_hep/processer_jet.py index 2e32a520f6..3c427adb98 100644 --- a/machine_learning_hep/processer_jet.py +++ b/machine_learning_hep/processer_jet.py @@ -21,7 +21,7 @@ from machine_learning_hep.processer import Processer from machine_learning_hep.utilities import dfquery, read_df -from machine_learning_hep.utils.hist import bin_array, create_hist, fill_hist, get_axis, get_range +from machine_learning_hep.utils.hist import bin_array, create_hist, fill_hist, get_axis, get_range, project_hist # pylint: disable=too-many-instance-attributes, too-many-statements @@ -279,16 +279,30 @@ def process_efficiency_single(self, index): ';p_{T}^{jet} (GeV/#it{c});p_{T}^{HF} (GeV/#it{c})', self.binarrays_ptjet['det']['fPt'], self.binarray_pthf) for cat in cats for level in levels_eff} - # TODO: extend to multi-dimensional observables - h_response = { - (cat, var): create_hist( - f'h_response_{cat}_{var}', - f";p_{{T}}^{{jet}} (GeV/#it{{c}});{var};p_{{T}}^{{jet}} (GeV/#it{{c}});{var};p_{{T}} (GeV/#it{{c}})", - self.binarrays_ptjet['det'][var], self.binarrays_obs['det'][var], - self.binarrays_ptjet['gen'][var], self.binarrays_obs['gen'][var], - self.binarray_pthf) - for (cat, var) in itertools.product(cats, observables) - if not '-' in var} + h_response = {} + h_effkine = {} + for cat in cats: + for obs, spec in self.cfg('observables', {}).items(): + self.logger.info('preparing response matrix for %s', obs) + var = obs.split('-') + dim = len(var) + 1 + h_response[(cat, obs)] = h = create_hist( + f'h_response_{cat}_{obs}', f"response matrix {obs}", + self.binarrays_ptjet['det'][var[0]], *[self.binarrays_obs['det'][v] for v in var], + self.binarrays_ptjet['gen'][var[0]], *[self.binarrays_obs['gen'][v] for v in var], + self.binarray_pthf) + get_axis(h, 0).SetTitle("p_{T}^{jet} (GeV/#it{c})") + get_axis(h, dim).SetTitle("p_{T}^{jet} (GeV/#it{c})") + get_axis(h, 2*dim).SetTitle("p_{T}^{HF} (GeV/#it{c})") + for i, v in enumerate(var, 1): + get_axis(h, i).SetTitle(self.cfg(f'observables.{v}.label', v)) + get_axis(h, i+dim).SetTitle(self.cfg(f'observables.{v}.label', v)) + for cut in cuts: + h_effkine[(cat, 'det', cut, obs)] = he = project_hist(h, list(range(dim)), {}).Clone() + he.SetName(f'h_effkine_{cat}_det_{cut}_{obs}') + h_effkine[(cat, 'gen', cut, obs)] = he = project_hist(h, list(range(dim, 2*dim)), {}).Clone() + he.SetName(f'h_effkine_{cat}_gen_{cut}_{obs}') + h_response_fd = {var: create_hist( f'h_response_fd_{var}', @@ -296,14 +310,6 @@ def process_efficiency_single(self, index): self.binarrays_ptjet['det'][var], self.binarrays_obs['det']['fPt'], self.binarrays_obs['det'][var], self.binarrays_ptjet['gen'][var], self.binarrays_obs['gen']['fPt'], self.binarrays_obs['gen'][var]) for var in self.cfg('observables', []) if not '-' in var} - # TODO: derive bins from response histogram - h_effkine = {(cat, level, cut, var): - create_hist(f'h_effkine_{cat}_{level}_{cut}_{var}', - f";p_{{T}}^{{jet}} (GeV/#it{{c}});{var_spec['label']}", - self.binarrays_ptjet[level][var], self.binarrays_obs[level][var]) - for (var, var_spec), level, cat, cut - in itertools.product(observables.items(), levels_effkine, cats, cuts) - if not '-' in var} h_effkine_fd = {(level, cut, var): create_hist(f'h_effkine_fd_{level}_{cut}_{var}', f";p_{{T}}^{{jet}} (GeV/#it{{c}});{var_spec['label']}", self.binarrays_ptjet[level][var], self.binarrays_obs[level]['fPt'], self.binarrays_obs[level][var]) @@ -412,30 +418,33 @@ def process_efficiency_single(self, index): except Exception as ex: # pylint: disable=broad-exception-caught self.logger.error('Writing of <%s> (%s) failed: %s', name, str(obj), str(ex)) - def _prepare_response(self, dfi, h_effkine, h_response, cat, var): - axis_ptjet_det = get_axis(h_response[(cat, var)], 0) - axis_var_det = get_axis(h_response[(cat, var)], 1) - axis_ptjet_gen = get_axis(h_response[(cat, var)], 2) - axis_var_gen = get_axis(h_response[(cat, var)], 3) + def _prepare_response(self, dfi, h_effkine, h_response, cat, obs): + var = obs.split('-') + dim = len(var) + 1 + axes_det = [get_axis(h_response[(cat, obs)], i) for i in range(dim)] + axes_gen = [get_axis(h_response[(cat, obs)], i) for i in range(dim, 2 * dim)] df = dfi - # TODO: the first cut should be taken care of by under-/overflow bins, check their usage in analyzer - df = df.loc[(df.fJetPt >= axis_ptjet_det.GetXmin()) & (df.fJetPt < axis_ptjet_det.GetXmax()) & - (df[var] >= axis_var_det.GetXmin()) & (df[var] < axis_var_det.GetXmax())] - fill_hist(h_effkine[(cat, 'det', 'nocuts', var)], df[['fJetPt', var]]) - df = df.loc[(df.fJetPt_gen >= axis_ptjet_gen.GetXmin()) & (df.fJetPt_gen < axis_ptjet_gen.GetXmax()) & - (df[f'{var}_gen'] >= axis_var_gen.GetXmin()) & (df[f'{var}_gen'] < axis_var_gen.GetXmax())] - fill_hist(h_effkine[(cat, 'det', 'cut', var)], df[['fJetPt', var]]) + df = df.loc[(df.fJetPt >= axes_det[0].GetXmin()) & (df.fJetPt < axes_det[0].GetXmax())] + for i, v in enumerate(var, 1): + df = df.loc[(df[v] >= axes_det[i].GetXmin()) & (df[v] < axes_det[i].GetXmax())] + fill_hist(h_effkine[(cat, 'det', 'nocuts', obs)], df[['fJetPt', *var]]) + df = df.loc[(df.fJetPt >= axes_gen[0].GetXmin()) & (df.fJetPt < axes_gen[0].GetXmax())] + for i, v in enumerate(var, 1): + df = df.loc[(df[f'{v}_gen'] >= axes_gen[i].GetXmin()) & (df[f'{v}_gen'] < axes_gen[i].GetXmax())] + fill_hist(h_effkine[(cat, 'det', 'cut', obs)], df[['fJetPt', *var]]) - fill_hist(h_response[(cat, var)], df[['fJetPt', f'{var}', 'fJetPt_gen', f'{var}_gen', 'fPt']]) + fill_hist(h_response[(cat, obs)], df[['fJetPt', *var, 'fJetPt_gen', *(f'{v}_gen' for v in var), 'fPt']]) df = dfi - df = df.loc[(df.fJetPt_gen >= axis_ptjet_gen.GetXmin()) & (df.fJetPt_gen < axis_ptjet_gen.GetXmax()) & - (df[f'{var}_gen'] >= axis_var_gen.GetXmin()) & (df[f'{var}_gen'] < axis_var_gen.GetXmax())] - fill_hist(h_effkine[(cat, 'gen', 'nocuts', var)], df[['fJetPt_gen', f'{var}_gen']]) - df = df.loc[(df.fJetPt >= axis_ptjet_det.GetXmin()) & (df.fJetPt < axis_ptjet_det.GetXmax()) & - (df[f'{var}'] >= axis_var_det.GetXmin()) & (df[f'{var}'] < axis_var_det.GetXmax())] - fill_hist(h_effkine[(cat, 'gen', 'cut', var)], df[['fJetPt_gen', f'{var}_gen']]) + df = df.loc[(df.fJetPt >= axes_gen[0].GetXmin()) & (df.fJetPt < axes_gen[0].GetXmax())] + for i, v in enumerate(var, 1): + df = df.loc[(df[f'{v}_gen'] >= axes_gen[i].GetXmin()) & (df[f'{v}_gen'] < axes_gen[i].GetXmax())] + fill_hist(h_effkine[(cat, 'gen', 'nocuts', obs)], df[['fJetPt_gen', *(f'{v}_gen' for v in var)]]) + df = df.loc[(df.fJetPt >= axes_det[0].GetXmin()) & (df.fJetPt < axes_det[0].GetXmax())] + for i, v in enumerate(var, 1): + df = df.loc[(df[v] >= axes_det[i].GetXmin()) & (df[v] < axes_det[i].GetXmax())] + fill_hist(h_effkine[(cat, 'gen', 'cut', obs)], df[['fJetPt_gen', *(f'{v}_gen' for v in var)]]) def _prepare_response_fd(self, dfi, h_effkine, h_response, var): From 004799b4619d7e4bb9d808f67b04160500ca13f8 Mon Sep 17 00:00:00 2001 From: Jochen Klein Date: Wed, 22 Jan 2025 15:34:41 +0100 Subject: [PATCH 04/10] Generalize unfolding to higher dimensions --- .../analysis/analyzer_jets.py | 36 ++++++++----------- 1 file changed, 14 insertions(+), 22 deletions(-) diff --git a/machine_learning_hep/analysis/analyzer_jets.py b/machine_learning_hep/analysis/analyzer_jets.py index 8cd6188c36..4496f5082d 100644 --- a/machine_learning_hep/analysis/analyzer_jets.py +++ b/machine_learning_hep/analysis/analyzer_jets.py @@ -820,7 +820,6 @@ def _analyze(self, method = 'sidesub'): f'_{string_range_ptjet(range_ptjet)}.png') self._save_canvas(c, filename) - # TODO: remove restriction on higher dimensions if not var: continue axis_ptjet = get_axis(fh_sum_fdsub, 0) @@ -830,11 +829,7 @@ def _analyze(self, method = 'sidesub'): self._save_hist( hproj, f'uf/h_{var}_{method}_{mcordata}_{string_range_ptjet(range_ptjet)}.png') # Unfolding - if get_dim(fh_sum_fdsub) > 2: - self.logger.info("No unfolding for 2d distributions: obs. %s, %s", var, mcordata) - continue - else: - self.logger.info("Unfolding: obs. %s, %s", var, mcordata) + self.logger.info("Unfolding: obs. %s, %s", var, mcordata) fh_unfolded = self._unfold(fh_sum_fdsub, var, mcordata) for i, h in enumerate(fh_unfolded): self._save_hist(h, f'h_ptjet-{var}_{method}_unfolded_{mcordata}_{i}.png') @@ -1091,18 +1086,17 @@ def _build_effkine(self, h_nocuts, h_cuts): def _build_response_matrix(self, h_response, h_eff = None, frac_flat = 0.): + dim = (get_dim(h_response) - 1) // 2 + self.logger.info("Building %i-dim response matrix from %s", dim, h_response) rm = ROOT.RooUnfoldResponse( - project_hist(h_response, [0, 1], {}), project_hist(h_response, [2, 3], {})) - h_gen = project_hist(h_response, [2, 3], {}) - for hbin in itertools.product( - enumerate(list(get_axis(h_response, 0).GetXbins())[:-1], 1), - enumerate(list(get_axis(h_response, 1).GetXbins())[:-1], 1), - enumerate(list(get_axis(h_response, 2).GetXbins())[:-1], 1), - enumerate(list(get_axis(h_response, 3).GetXbins())[:-1], 1), - enumerate(list(get_axis(h_response, 4).GetXbins())[:-1], 1)): + project_hist(h_response, list(range(dim)), {}), project_hist(h_response, list(range(dim, 2 * dim)), {})) + h_gen = project_hist(h_response, list(range(dim, 2 * dim)), {}) + + x = (enumerate(list(get_axis(h_response, iaxis).GetXbins())[:-1], 1) for iaxis in range(2*dim+1)) + for hbin in itertools.product(*x): n = h_response.GetBinContent( - np.asarray([hbin[0][0], hbin[1][0], hbin[2][0], hbin[3][0], hbin[4][0]], 'i')) - eff = h_eff.GetBinContent(hbin[4][0]) if h_eff else 1. + np.asarray([hbin[i][0] for i in range(2*dim+1)], 'i')) + eff = h_eff.GetBinContent(hbin[2*dim][0]) if h_eff else 1. if np.isclose(eff, 0.): self.logger.error('efficiency 0 for %s', hbin[4]) continue @@ -1111,7 +1105,7 @@ def _build_response_matrix(self, h_response, h_eff = None, frac_flat = 0.): if frac_flat > 0.: fac += frac_flat * (1. / cnt_gen - 1.) for _ in range(int(n)): - rm.Fill(hbin[0][1], hbin[1][1], hbin[2][1], hbin[3][1], 1./eff * fac) + rm.Fill(*(hbin[iaxis][1] for iaxis in range(2*dim)), 1./eff * fac) # rm.Mresponse().Print() return rm @@ -1132,9 +1126,7 @@ def _subtract_feeddown(self, hist, var, mcordata): #region unfolding def _unfold(self, hist, var, mcordata): - self.logger.debug('Unfolding for %s', var) - if get_dim(hist) > 2: - raise NotImplementedError + self.logger.info('Unfolding for %s', var) suffix = '_frac' if mcordata == 'mc' else '' with TFile(self.n_fileeff) as rfile: h_response = rfile.Get(f'h_response_pr_{var}{suffix}') @@ -1165,7 +1157,7 @@ def _unfold(self, hist, var, mcordata): self._save_hist(h_effkine_gen, f'uf/h_effkine-ptjet-{var}_pr_gen_{mcordata}.png', 'text') # TODO: move, has nothing to do with unfolding - if mcordata == 'mc': + if mcordata == 'mc' and get_dim(hist) <= 2: h_mctruth_pr = rfile.Get(f'h_ptjet-pthf-{var}_pr_gen') if h_mctruth_pr: h_mctruth_pr = project_hist(h_mctruth_pr, [0, 2], {}) @@ -1188,7 +1180,7 @@ def _unfold(self, hist, var, mcordata): self._save_hist(fh_unfolding_output, f'uf/h_ptjet-{var}_{mcordata}_unfoldeffcorr{n}.png', 'texte') h_unfolding_output.append(fh_unfolding_output) - if mcordata == 'mc': + if mcordata == 'mc' and get_dim(hist) <= 2: if h_mctruth_pr: h_mcunfolded = fh_unfolding_output.Clone() h_mcunfolded.Divide(h_mctruth_pr) From 8d997aa72c3c93b8fbef9122f6cfec0adebe0cc1 Mon Sep 17 00:00:00 2001 From: Jochen Klein Date: Fri, 24 Jan 2025 15:16:56 +0100 Subject: [PATCH 05/10] Add support for array columns --- machine_learning_hep/processer_jet.py | 33 +++++++++++++++++++++------ machine_learning_hep/utils/hist.py | 2 +- 2 files changed, 27 insertions(+), 8 deletions(-) diff --git a/machine_learning_hep/processer_jet.py b/machine_learning_hep/processer_jet.py index 3c427adb98..31244de8d4 100644 --- a/machine_learning_hep/processer_jet.py +++ b/machine_learning_hep/processer_jet.py @@ -390,6 +390,14 @@ def process_efficiency_single(self, index): self.logger.error('No matching, could not fill matched detector-level histograms') for var, cat in itertools.product(observables, cats): + if cat in dfmatch and dfmatch[cat] is not None: + self._prepare_response(dfmatch[cat], h_effkine, h_response, cat, var) + f = self.cfg('frac_mcana', .2) + _, df_mccorr = self.split_df(dfmatch[cat], f if f < 1. else 0.) + self._prepare_response(df_mccorr, h_effkine_frac, h_response_frac, cat, var) + if not '-' in var and not self.cfg(f'observables.{var}.arraycols'): + self._prepare_response_fd(dfmatch[cat], h_effkine_fd, h_response_fd, var) + # TODO: add support for more complex observables if '-' in var or self.cfg(f'observables.{var}.arraycols'): continue @@ -403,13 +411,6 @@ def process_efficiency_single(self, index): dfquery(df_mcana, f, inplace=True) fill_hist(h_mctruth[(cat, var)], df_mcana[['fJetPt_gen', 'fPt_gen', f'{var}_gen']]) - if cat in dfmatch and dfmatch[cat] is not None: - self._prepare_response(dfmatch[cat], h_effkine, h_response, cat, var) - self._prepare_response_fd(dfmatch[cat], h_effkine_fd, h_response_fd, var) - f = self.cfg('frac_mcana', .2) - _, df_mccorr = self.split_df(dfmatch[cat], f if f < 1. else 0.) - self._prepare_response(df_mccorr, h_effkine_frac, h_response_frac, cat, var) - for name, obj in itertools.chain(h_eff.items(), h_effkine.items(), h_response.items(), h_effkine_fd.items(), h_response_fd.items(), h_effkine_frac.items(), h_response_frac.items(), h_mctruth.items()): @@ -423,8 +424,18 @@ def _prepare_response(self, dfi, h_effkine, h_response, cat, obs): dim = len(var) + 1 axes_det = [get_axis(h_response[(cat, obs)], i) for i in range(dim)] axes_gen = [get_axis(h_response[(cat, obs)], i) for i in range(dim, 2 * dim)] + arraycols = [i - 3 for i in self.cfg(f'observables.{obs}').get('arraycols', [])] df = dfi + if arraycols: + self.logger.info("Exploding columns %s -> %s", arraycols, [var[icol] for icol in arraycols]) + # only consider rows with corresponding det- and gen-level entries + df['length'] = [len(x) for x in df[var[0]]] + df['length_gen'] = [len(x) for x in df[var[0] + '_gen']] + df = df.loc[df.length == df.length_gen] + df = df.explode([var[icol] for icol in arraycols] + [var[icol] + '_gen' for icol in arraycols]) + df.dropna(inplace=True) + df = df.loc[(df.fJetPt >= axes_det[0].GetXmin()) & (df.fJetPt < axes_det[0].GetXmax())] for i, v in enumerate(var, 1): df = df.loc[(df[v] >= axes_det[i].GetXmin()) & (df[v] < axes_det[i].GetXmax())] @@ -434,9 +445,17 @@ def _prepare_response(self, dfi, h_effkine, h_response, cat, obs): df = df.loc[(df[f'{v}_gen'] >= axes_gen[i].GetXmin()) & (df[f'{v}_gen'] < axes_gen[i].GetXmax())] fill_hist(h_effkine[(cat, 'det', 'cut', obs)], df[['fJetPt', *var]]) + # print(df[['fJetPt', *var, 'fJetPt_gen', *(f'{v}_gen' for v in var), 'fPt']].info(), flush=True) fill_hist(h_response[(cat, obs)], df[['fJetPt', *var, 'fJetPt_gen', *(f'{v}_gen' for v in var), 'fPt']]) df = dfi + if arraycols: + self.logger.info("Exploding columns %s -> %s", arraycols, [var[icol] for icol in arraycols]) + df['length'] = [len(x) for x in df[var[0]]] + df['length_gen'] = [len(x) for x in df[var[0] + '_gen']] + df = df.loc[df.length == df.length_gen] + df = df.explode([var[icol] for icol in arraycols] + [var[icol] + '_gen' for icol in arraycols]) + df.dropna(inplace=True) df = df.loc[(df.fJetPt >= axes_gen[0].GetXmin()) & (df.fJetPt < axes_gen[0].GetXmax())] for i, v in enumerate(var, 1): df = df.loc[(df[f'{v}_gen'] >= axes_gen[i].GetXmin()) & (df[f'{v}_gen'] < axes_gen[i].GetXmax())] diff --git a/machine_learning_hep/utils/hist.py b/machine_learning_hep/utils/hist.py index 8c16895c2f..b5b4549e68 100644 --- a/machine_learning_hep/utils/hist.py +++ b/machine_learning_hep/utils/hist.py @@ -183,7 +183,7 @@ def fill_hist(hist, dfi: pd.DataFrame, weights = None, arraycols = None, write = elif dim_hist > 3: assert weights is None, 'weights not supported' if not arraycols: - dfi.apply(lambda row: hist.Fill(*row), axis=1) + dfi.apply(lambda row: hist.Fill(np.array(row, 'd'), 1.), axis=1) else: m = [-1] * dim_hist idx = 0 From 1b6aedd8734d7392ba0a51c9d559ab8454108aef Mon Sep 17 00:00:00 2001 From: Jochen Klein Date: Fri, 24 Jan 2025 17:01:08 +0100 Subject: [PATCH 06/10] Generalise building of response matrix --- machine_learning_hep/analysis/analyzer_jets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/machine_learning_hep/analysis/analyzer_jets.py b/machine_learning_hep/analysis/analyzer_jets.py index 4496f5082d..9ebc501c17 100644 --- a/machine_learning_hep/analysis/analyzer_jets.py +++ b/machine_learning_hep/analysis/analyzer_jets.py @@ -1100,7 +1100,7 @@ def _build_response_matrix(self, h_response, h_eff = None, frac_flat = 0.): if np.isclose(eff, 0.): self.logger.error('efficiency 0 for %s', hbin[4]) continue - if (cnt_gen := h_gen.GetBinContent(hbin[2][0], hbin[3][0])) > 0.: + if (cnt_gen := h_gen.GetBinContent(*(hbin[i][0] for i in range(dim, 2*dim)))) > 0.: fac = 1. if frac_flat > 0.: fac += frac_flat * (1. / cnt_gen - 1.) From 8d905a23c0a8e3b0e52885a1be43595e861adf6e Mon Sep 17 00:00:00 2001 From: Jochen Klein Date: Fri, 24 Jan 2025 18:00:34 +0100 Subject: [PATCH 07/10] Fix pylint --- machine_learning_hep/processer.py | 4 +++- machine_learning_hep/processer_jet.py | 8 ++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/machine_learning_hep/processer.py b/machine_learning_hep/processer.py index 0b8f9a8e17..b5bfe7fc04 100644 --- a/machine_learning_hep/processer.py +++ b/machine_learning_hep/processer.py @@ -23,6 +23,7 @@ import tempfile from copy import deepcopy from functools import reduce +from typing import TypeVar from pandas.api.types import is_numeric_dtype import numpy as np @@ -288,7 +289,8 @@ def __init__(self, case, datap, run_param, mcordata, p_maxfiles, # pylint: disab # Flag if they should be used self.do_custom_analysis_cuts = datap["analysis"][self.typean].get("use_cuts", False) - def cfg(self, param, default = None): + T = TypeVar("T") + def cfg(self, param: str, default: T = None) -> T: return reduce(lambda d, key: d.get(key, default) if isinstance(d, dict) else default, param.split("."), self.datap['analysis'][self.typean]) diff --git a/machine_learning_hep/processer_jet.py b/machine_learning_hep/processer_jet.py index 31244de8d4..ac350a0246 100644 --- a/machine_learning_hep/processer_jet.py +++ b/machine_learning_hep/processer_jet.py @@ -61,7 +61,7 @@ def __init__(self, case, datap, run_param, mcordata, p_maxfiles, # pylint: disab self.binarray_pthf = np.asarray(self.cfg('sel_an_binmin', []) + self.cfg('sel_an_binmax', [])[-1:], 'd') self.binarrays_obs = {'gen': {}, 'det': {}} self.binarrays_ptjet = {'gen': {}, 'det': {}} - for obs in self.cfg('observables'): + for obs in self.cfg('observables', {}): var = obs.split('-') for v in var: if v in self.binarrays_obs: @@ -265,7 +265,7 @@ def process_histomass_single(self, index): # - priors (reweight response matrix) # region efficiency - # pylint: disable=too-many-branches,too-many-statements + # pylint: disable=too-many-branches,too-many-statements,too-many-locals def process_efficiency_single(self, index): self.logger.info('Processing (efficiency) %s', self.l_evtorig[index]) @@ -273,7 +273,7 @@ def process_efficiency_single(self, index): levels_eff = ['gen', 'det', 'genmatch', 'detmatch', 'detmatch_gencuts'] levels_effkine = ['gen', 'det'] cuts = ['nocuts', 'cut'] - observables = self.cfg('observables', []) + observables = self.cfg('observables', {}) observables.update({'fPt': {'label': 'p_{T}^{HF} (GeV/#it{c})'}}) h_eff = {(cat, level): create_hist(f'h_ptjet-pthf_{cat}_{level}', ';p_{T}^{jet} (GeV/#it{c});p_{T}^{HF} (GeV/#it{c})', @@ -282,7 +282,7 @@ def process_efficiency_single(self, index): h_response = {} h_effkine = {} for cat in cats: - for obs, spec in self.cfg('observables', {}).items(): + for obs in self.cfg('observables', {}): self.logger.info('preparing response matrix for %s', obs) var = obs.split('-') dim = len(var) + 1 From 908ea383485676daa7768dcb10b4ebf7e8490884 Mon Sep 17 00:00:00 2001 From: Jochen Klein Date: Mon, 27 Jan 2025 13:55:41 +0100 Subject: [PATCH 08/10] Generalize feeddown and mctruth to higher dimensions --- machine_learning_hep/processer_jet.py | 151 ++++++++++++++------------ 1 file changed, 81 insertions(+), 70 deletions(-) diff --git a/machine_learning_hep/processer_jet.py b/machine_learning_hep/processer_jet.py index ac350a0246..77e9479a3b 100644 --- a/machine_learning_hep/processer_jet.py +++ b/machine_learning_hep/processer_jet.py @@ -281,6 +281,9 @@ def process_efficiency_single(self, index): for cat in cats for level in levels_eff} h_response = {} h_effkine = {} + h_response_fd = {} + h_effkine_fd = {} + h_mctruth = {} for cat in cats: for obs in self.cfg('observables', {}): self.logger.info('preparing response matrix for %s', obs) @@ -302,27 +305,28 @@ def process_efficiency_single(self, index): he.SetName(f'h_effkine_{cat}_det_{cut}_{obs}') h_effkine[(cat, 'gen', cut, obs)] = he = project_hist(h, list(range(dim, 2*dim)), {}).Clone() he.SetName(f'h_effkine_{cat}_gen_{cut}_{obs}') - - h_response_fd = {var: - create_hist( - f'h_response_fd_{var}', - f";p_{{T}}^{{jet}} (GeV/#it{{c}});{var};p_{{T}}^{{jet}} (GeV/#it{{c}});{var};p_{{T}} (GeV/#it{{c}})", - self.binarrays_ptjet['det'][var], self.binarrays_obs['det']['fPt'], self.binarrays_obs['det'][var], - self.binarrays_ptjet['gen'][var], self.binarrays_obs['gen']['fPt'], self.binarrays_obs['gen'][var]) - for var in self.cfg('observables', []) if not '-' in var} - h_effkine_fd = {(level, cut, var): create_hist(f'h_effkine_fd_{level}_{cut}_{var}', - f";p_{{T}}^{{jet}} (GeV/#it{{c}});{var_spec['label']}", - self.binarrays_ptjet[level][var], self.binarrays_obs[level]['fPt'], self.binarrays_obs[level][var]) - for (var, var_spec), level, cut - in itertools.product(self.cfg('observables', {}).items(), levels_effkine, cuts) - if not '-' in var} - h_mctruth = { - (cat, var): create_hist( - f'h_ptjet-pthf-{var}_{cat}_gen', - f";p_{{T}}^{{jet}} (GeV/#it{{c}});p_{{T}}^{{HF}} (GeV/#it{{c}});{var}", - self.binarrays_ptjet['gen'][var], self.binarray_pthf, self.binarrays_obs['gen'][var]) - for (cat, var) in itertools.product(cats, observables) - if not '-' in var} + h_mctruth[(cat, obs)] = create_hist( + f'h_ptjet-pthf-{obs}_{cat}_gen', + f";p_{{T}}^{{jet}} (GeV/#it{{c}});p_{{T}}^{{HF}} (GeV/#it{{c}});{obs}", + self.binarrays_ptjet['gen'][var[0]], + self.binarray_pthf, + *[self.binarrays_obs['gen'][v] for v in var]) + h_response_fd[obs] = create_hist( + f'h_response_fd_{var}', + f";response matrix fd {obs}", + self.binarrays_ptjet['det'][var[0]], + self.binarrays_obs['det']['fPt'], + *[self.binarrays_obs['det'][v] for v in var], + self.binarrays_ptjet['gen'][var[0]], + self.binarrays_obs['gen']['fPt'], + *[self.binarrays_obs['gen'][v] for v in var]) + for level, cut in itertools.product(levels_effkine, cuts): + h_effkine_fd[(level, cut, obs)] = create_hist( + f'h_effkine_fd_{level}_{cut}_{obs}', + f"effkine {obs}", + self.binarrays_ptjet[level][var[0]], + self.binarrays_obs[level]['fPt'], + *[self.binarrays_obs[level][v] for v in var]) # create partial versions for closure testing h_effkine_frac = copy.deepcopy(h_effkine) @@ -389,18 +393,15 @@ def process_efficiency_single(self, index): else: self.logger.error('No matching, could not fill matched detector-level histograms') - for var, cat in itertools.product(observables, cats): + for obs, cat in itertools.product(observables, cats): if cat in dfmatch and dfmatch[cat] is not None: - self._prepare_response(dfmatch[cat], h_effkine, h_response, cat, var) + self._prepare_response(dfmatch[cat], h_effkine, h_response, cat, obs) f = self.cfg('frac_mcana', .2) _, df_mccorr = self.split_df(dfmatch[cat], f if f < 1. else 0.) - self._prepare_response(df_mccorr, h_effkine_frac, h_response_frac, cat, var) - if not '-' in var and not self.cfg(f'observables.{var}.arraycols'): - self._prepare_response_fd(dfmatch[cat], h_effkine_fd, h_response_fd, var) + self._prepare_response(df_mccorr, h_effkine_frac, h_response_frac, cat, obs) + self._prepare_response_fd(dfmatch[cat], h_effkine_fd, h_response_fd, obs) - # TODO: add support for more complex observables - if '-' in var or self.cfg(f'observables.{var}.arraycols'): - continue + # TODO: move outside of loop? if self.cfg('closure.use_matched'): self.logger.info('using matched for truth') df_mcana, _ = self.split_df(dfmatch[cat], self.cfg('frac_mcana', .2)) @@ -409,7 +410,12 @@ def process_efficiency_single(self, index): if f := self.cfg('closure.exclude_feeddown_gen'): self.logger.debug('excluding feeddown gen') dfquery(df_mcana, f, inplace=True) - fill_hist(h_mctruth[(cat, var)], df_mcana[['fJetPt_gen', 'fPt_gen', f'{var}_gen']]) + + arraycols = [i - 3 for i in self.cfg(f'observables.{obs}.arraycols', [])] + var = obs.split('-') + self.logger.info("Observable %s has arraycols %s -> %s", obs, arraycols, [var[icol] for icol in arraycols]) + df_mcana = self._explode_arraycols(df_mcana, [var[icol] for icol in arraycols]) + fill_hist(h_mctruth[(cat, obs)], df_mcana[['fJetPt_gen', 'fPt_gen', *(f'{v}_gen' for v in var)]]) for name, obj in itertools.chain(h_eff.items(), h_effkine.items(), h_response.items(), h_effkine_fd.items(), h_response_fd.items(), @@ -419,22 +425,26 @@ def process_efficiency_single(self, index): except Exception as ex: # pylint: disable=broad-exception-caught self.logger.error('Writing of <%s> (%s) failed: %s', name, str(obj), str(ex)) + def _explode_arraycols(self, df: pd.DataFrame, arraycols: "list[str]") -> pd.DataFrame: + if len(arraycols) > 0: + self.logger.info("Exploding columns %s", arraycols) + # only consider rows with corresponding det- and gen-level entries + df['length'] = [len(x) for x in df[arraycols[0]]] + df['length_gen'] = [len(x) for x in df[arraycols[0] + '_gen']] + df = df.loc[df.length == df.length_gen] + df = df.explode(arraycols + [col + '_gen' for col in arraycols]) + df.dropna(inplace=True) + return df + def _prepare_response(self, dfi, h_effkine, h_response, cat, obs): var = obs.split('-') dim = len(var) + 1 axes_det = [get_axis(h_response[(cat, obs)], i) for i in range(dim)] axes_gen = [get_axis(h_response[(cat, obs)], i) for i in range(dim, 2 * dim)] - arraycols = [i - 3 for i in self.cfg(f'observables.{obs}').get('arraycols', [])] + arraycols = [i - 3 for i in self.cfg(f'observables.{obs}', {}).get('arraycols', [])] df = dfi - if arraycols: - self.logger.info("Exploding columns %s -> %s", arraycols, [var[icol] for icol in arraycols]) - # only consider rows with corresponding det- and gen-level entries - df['length'] = [len(x) for x in df[var[0]]] - df['length_gen'] = [len(x) for x in df[var[0] + '_gen']] - df = df.loc[df.length == df.length_gen] - df = df.explode([var[icol] for icol in arraycols] + [var[icol] + '_gen' for icol in arraycols]) - df.dropna(inplace=True) + df = self._explode_arraycols(df, [var[icol] for icol in arraycols]) df = df.loc[(df.fJetPt >= axes_det[0].GetXmin()) & (df.fJetPt < axes_det[0].GetXmax())] for i, v in enumerate(var, 1): @@ -449,13 +459,7 @@ def _prepare_response(self, dfi, h_effkine, h_response, cat, obs): fill_hist(h_response[(cat, obs)], df[['fJetPt', *var, 'fJetPt_gen', *(f'{v}_gen' for v in var), 'fPt']]) df = dfi - if arraycols: - self.logger.info("Exploding columns %s -> %s", arraycols, [var[icol] for icol in arraycols]) - df['length'] = [len(x) for x in df[var[0]]] - df['length_gen'] = [len(x) for x in df[var[0] + '_gen']] - df = df.loc[df.length == df.length_gen] - df = df.explode([var[icol] for icol in arraycols] + [var[icol] + '_gen' for icol in arraycols]) - df.dropna(inplace=True) + df = self._explode_arraycols(df, [var[icol] for icol in arraycols]) df = df.loc[(df.fJetPt >= axes_gen[0].GetXmin()) & (df.fJetPt < axes_gen[0].GetXmax())] for i, v in enumerate(var, 1): df = df.loc[(df[f'{v}_gen'] >= axes_gen[i].GetXmin()) & (df[f'{v}_gen'] < axes_gen[i].GetXmax())] @@ -466,33 +470,40 @@ def _prepare_response(self, dfi, h_effkine, h_response, cat, obs): fill_hist(h_effkine[(cat, 'gen', 'cut', obs)], df[['fJetPt_gen', *(f'{v}_gen' for v in var)]]) - def _prepare_response_fd(self, dfi, h_effkine, h_response, var): - axis_ptjet_det = get_axis(h_response[var], 0) - axis_pthf_det = get_axis(h_response[var], 1) - axis_var_det = get_axis(h_response[var], 2) - axis_ptjet_gen = get_axis(h_response[var], 3) - axis_pthf_gen = get_axis(h_response[var], 4) - axis_var_gen = get_axis(h_response[var], 5) + def _prepare_response_fd(self, dfi, h_effkine, h_response, obs): + var = obs.split('-') + dim = len(var) + 2 + axes_det = [get_axis(h_response[obs], i) for i in range(dim)] + axes_gen = [get_axis(h_response[obs], i) for i in range(dim, 2 * dim)] + arraycols = [i - 3 for i in self.cfg(f'observables.{obs}', {}).get('arraycols', [])] df = dfi + df = self._explode_arraycols(df, [var[icol] for icol in arraycols]) # TODO: the first cut should be taken care of by under-/overflow bins, check their usage in analyzer - df = df.loc[(df.fJetPt >= axis_ptjet_det.GetXmin()) & (df.fJetPt < axis_ptjet_det.GetXmax()) & - (df.fPt >= axis_pthf_det.GetXmin()) & (df.fPt < axis_pthf_det.GetXmax()) & - (df[var] >= axis_var_det.GetXmin()) & (df[var] < axis_var_det.GetXmax())] - fill_hist(h_effkine[('det', 'nocuts', var)], df[['fJetPt', 'fPt', var]]) - df = df.loc[(df.fJetPt_gen >= axis_ptjet_gen.GetXmin()) & (df.fJetPt_gen < axis_ptjet_gen.GetXmax()) & - (df.fPt_gen >= axis_pthf_gen.GetXmin()) & (df.fPt_gen < axis_pthf_gen.GetXmax()) & - (df[f'{var}_gen'] >= axis_var_gen.GetXmin()) & (df[f'{var}_gen'] < axis_var_gen.GetXmax())] - fill_hist(h_effkine[('det', 'cut', var)], df[['fJetPt', 'fPt', var]]) + df = df.loc[(df.fJetPt >= axes_det[0].GetXmin()) & (df.fJetPt < axes_det[0].GetXmax()) & + (df.fPt >= axes_det[1].GetXmin()) & (df.fPt < axes_det[1].GetXmax())] + self.logger.info('cutting for %s -> %s', obs, var) + for i, v in enumerate(var, 2): + self.logger.info('%i: %s', i, v) + df = df.loc[(df[v] >= axes_det[i].GetXmin()) & (df[v] < axes_det[i].GetXmax())] + fill_hist(h_effkine[('det', 'nocuts', obs)], df[['fJetPt', 'fPt', *var]]) + df = df.loc[(df.fJetPt_gen >= axes_gen[0].GetXmin()) & (df.fJetPt_gen < axes_gen[0].GetXmax()) & + (df.fPt_gen >= axes_gen[1].GetXmin()) & (df.fPt_gen < axes_gen[1].GetXmax())] + for i, v in enumerate(var, 2): + df = df.loc[(df[f'{v}_gen'] >= axes_gen[i].GetXmin()) & (df[f'{v}_gen'] < axes_gen[i].GetXmax())] + fill_hist(h_effkine[('det', 'cut', obs)], df[['fJetPt', 'fPt', *var]]) - fill_hist(h_response[var], df[['fJetPt', 'fPt', f'{var}', 'fJetPt_gen', 'fPt_gen', f'{var}_gen']]) + fill_hist(h_response[obs], df[['fJetPt', 'fPt', *var, 'fJetPt_gen', 'fPt_gen', *(f'{v}_gen' for v in var)]]) df = dfi - df = df.loc[(df.fJetPt_gen >= axis_ptjet_gen.GetXmin()) & (df.fJetPt_gen < axis_ptjet_gen.GetXmax()) & - (df.fPt_gen >= axis_pthf_gen.GetXmin()) & (df.fPt_gen < axis_pthf_gen.GetXmax()) & - (df[f'{var}_gen'] >= axis_var_gen.GetXmin()) & (df[f'{var}_gen'] < axis_var_gen.GetXmax())] - fill_hist(h_effkine[('gen', 'nocuts', var)], df[['fJetPt_gen', 'fPt', f'{var}_gen']]) - df = df.loc[(df.fJetPt >= axis_ptjet_det.GetXmin()) & (df.fJetPt < axis_ptjet_det.GetXmax()) & - (df.fPt >= axis_pthf_det.GetXmin()) & (df.fPt < axis_pthf_det.GetXmax()) & - (df[f'{var}'] >= axis_var_det.GetXmin()) & (df[f'{var}'] < axis_var_det.GetXmax())] - fill_hist(h_effkine[('gen', 'cut', var)], df[['fJetPt_gen', 'fPt', f'{var}_gen']]) + df = self._explode_arraycols(df, [var[icol] for icol in arraycols]) + df = df.loc[(df.fJetPt_gen >= axes_gen[0].GetXmin()) & (df.fJetPt_gen < axes_gen[0].GetXmax()) & + (df.fPt_gen >= axes_gen[1].GetXmin()) & (df.fPt_gen < axes_gen[1].GetXmax())] + for i, v in enumerate(var, 2): + df = df.loc[(df[f'{v}_gen'] >= axes_gen[i].GetXmin()) & (df[f'{v}_gen'] < axes_gen[i].GetXmax())] + fill_hist(h_effkine[('gen', 'nocuts', obs)], df[['fJetPt_gen', 'fPt', *(f'{v}_gen' for v in var)]]) + df = df.loc[(df.fJetPt >= axes_det[0].GetXmin()) & (df.fJetPt < axes_det[0].GetXmax()) & + (df.fPt >= axes_det[1].GetXmin()) & (df.fPt < axes_det[1].GetXmax())] + for i, v in enumerate(var, 2): + df = df.loc[(df[v] >= axes_det[i].GetXmin()) & (df[v] < axes_det[i].GetXmax())] + fill_hist(h_effkine[('gen', 'cut', obs)], df[['fJetPt_gen', 'fPt', *(f'{v}_gen' for v in var)]]) From 7bbbcf4aa3057cf7179a4dd0a538a27edadb68e0 Mon Sep 17 00:00:00 2001 From: Jochen Klein Date: Fri, 31 Jan 2025 13:52:41 +0100 Subject: [PATCH 09/10] Add additional observables --- .../analysis/analyzer_jets.py | 8 ++++- .../database_ml_parameters_D0Jet_pp.yml | 34 +++++++++++++++---- machine_learning_hep/processer.py | 2 ++ machine_learning_hep/processer_jet.py | 17 ++++++---- machine_learning_hep/utils/hist.py | 4 +-- 5 files changed, 49 insertions(+), 16 deletions(-) diff --git a/machine_learning_hep/analysis/analyzer_jets.py b/machine_learning_hep/analysis/analyzer_jets.py index 9ebc501c17..9918750510 100644 --- a/machine_learning_hep/analysis/analyzer_jets.py +++ b/machine_learning_hep/analysis/analyzer_jets.py @@ -851,7 +851,7 @@ def _analyze(self, method = 'sidesub'): self._save_hist( hproj, f'uf/h_{var}_{method}_unfolded_{mcordata}_' + - f'{string_range_ptjet(range_ptjet)}_sel.png') + f'{string_range_ptjet(range_ptjet)}_sel.png', "colz") # Save also the self-normalised version. if not empty: hproj_sel = hproj.Clone(f"{hproj.GetName()}_selfnorm") @@ -939,6 +939,7 @@ def estimate_feeddown(self): df = pd.read_parquet(self.cfg('fd_parquet')) col_mapping = {'dr': 'delta_r_jet', 'zpar': 'z'} # TODO: check mapping + # TODO: generalize to higher dimensions for var in self.observables['all']: bins_ptjet = np.asarray(self.cfg('bins_ptjet'), 'd') # TODO: generalize or derive from histogram? @@ -960,6 +961,7 @@ def estimate_feeddown(self): if f'{colname}' not in df: if var is not None: self.logger.error('No feeddown information for %s (%s), cannot estimate feeddown', var, colname) + print(df.info(), flush=True) continue # TODO: derive histogram @@ -990,6 +992,10 @@ def estimate_feeddown(self): rfile.Get(f'h_effkine_fd_det_nocuts_{var}'), rfile.Get(f'h_effkine_fd_det_cut_{var}')) h_response = rfile.Get(f'h_response_fd_{var}') + if not h_response: + self.logger.error("Could not find response matrix for fd estimation of %s", var) + rfile.ls() + continue h_response_norm = norm_response(h_response, 3) h3_fd_gen.Multiply(h_effkine_gen) self._save_hist(project_hist(h3_fd_gen, [0, 2], {}), f'fd/h_ptjet-{var}_fdnew_gen_genkine.png') diff --git a/machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp.yml b/machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp.yml index b235705ab0..3c34c6418d 100644 --- a/machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp.yml +++ b/machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp.yml @@ -509,7 +509,7 @@ D0Jet_pp: multi: data: nprocessesparallel: 80 - maxfiles: [-1] #list of periods + maxfiles: [1] #list of periods chunksizeunp: [100] #list of periods chunksizeskim: [100] #list of periods fracmerge: [.1] #list of periods @@ -532,7 +532,7 @@ D0Jet_pp: mcreweights: [../Analyses] #list of periods mc: nprocessesparallel: 80 - maxfiles: [-1] #list of periods + maxfiles: [1] #list of periods chunksizeunp: [100] #list of periods chunksizeskim: [1000] #list of periods fracmerge: [1.] #list of periods @@ -721,17 +721,39 @@ D0Jet_pp: bins_det_fix: [10, 0., 1.] label: "#Delta#it{r}" lntheta: - bins_gen_fix: [4, 1., 5.] - bins_det_fix: [4, 1., 5.] + bins_gen_fix: [8, 1., 5.] + bins_det_fix: [8, 1., 5.] label: "#minusln(#it{#theta})" arraycols: [3] lnkt: - bins_gen_fix: [10, -4., 6.] - bins_det_fix: [10, -4., 6.] + bins_gen_fix: [8, -4., 4.] + bins_det_fix: [8, -4., 4.] label: "ln(#it{k}_{T}/(GeV/#it{c}))" arraycols: [3] lntheta-lnkt: arraycols: [3, 4] + # new variables + fEnergyMother: + bins_gen_fix: [1, 0., 100.] + bins_det_fix: [1, 0., 100.] + arraycols: [3] + # lntheta-lnkt-fEnergyMother: + # arraycols: [3, 4, 5] + fJetNConstituents: + bins_gen_fix: [5, 0., 20.] + bins_det_fix: [5, 0., 20.] + zpar-fJetNConstituents: {} + nsub21: + # TODO: check for 1-track jets + bins_gen_fix: [11, -1., 1.] + bins_det_fix: [11, -1., 1.] + eecweight: + # TODO: adjust binning + bins_gen_fix: [10, 0., 1.] + bins_det_fix: [10, 0., 1.] + arraycols: [3] + fPairTheta-eecweight: + arraycols: [3, 4] data_selections: mcsig: diff --git a/machine_learning_hep/processer.py b/machine_learning_hep/processer.py index b5bfe7fc04..c5a014d75d 100644 --- a/machine_learning_hep/processer.py +++ b/machine_learning_hep/processer.py @@ -21,6 +21,7 @@ import re import sys import tempfile +import traceback from copy import deepcopy from functools import reduce from typing import TypeVar @@ -494,6 +495,7 @@ def applymodel(self, file_index): @staticmethod def callback(ex): get_logger().exception('Error callback: %s', ex) + traceback.print_stack() raise ex def parallelizer(self, function, argument_list, maxperchunk): diff --git a/machine_learning_hep/processer_jet.py b/machine_learning_hep/processer_jet.py index 77e9479a3b..087822d5c0 100644 --- a/machine_learning_hep/processer_jet.py +++ b/machine_learning_hep/processer_jet.py @@ -131,6 +131,7 @@ def _calculate_variables(self, df, verify=False): # pylint: disable=invalid-name df['lntheta'] = None return df df['nsub21'] = df.fNSub2 / df.fNSub1 + # TODO: catch nsub1 == 0 self.logger.debug('zg') df['zg_array'] = np.array(.5 - abs(df.fPtSubLeading / (df.fPtLeading + df.fPtSubLeading) - .5)) zcut = self.cfg('zcut', .1) @@ -145,6 +146,10 @@ def _calculate_variables(self, df, verify=False): # pylint: disable=invalid-name df['lntheta'] = df['fTheta'].apply(lambda x: -np.log(x)) # df['lntheta'] = np.array(-np.log(df.fTheta)) + self.logger.info('EEC') + df['eecweight'] = df[['fPairPt', 'fJetPt']].apply( + (lambda ar: ar.fPairPt / ar.fJetPt**2), axis=1) + if self.cfg('hfjet', True): df['dr'] = np.sqrt((df.fJetEta - df.fEta)**2 + ((df.fJetPhi - df.fPhi + math.pi) % math.tau - math.pi)**2) df['jetPx'] = df.fJetPt * np.cos(df.fJetPhi) @@ -245,7 +250,7 @@ def process_histomass_single(self, index): self._calculate_variables(df) for obs, spec in self.cfg('observables', {}).items(): - self.logger.debug('preparing histograms for %s', obs) + self.logger.info('preparing histograms for %s', obs) var = obs.split('-') if not all(v in df for v in var): self.logger.error('dataframe does not contain %s', var) @@ -312,7 +317,7 @@ def process_efficiency_single(self, index): self.binarray_pthf, *[self.binarrays_obs['gen'][v] for v in var]) h_response_fd[obs] = create_hist( - f'h_response_fd_{var}', + f'h_response_fd_{obs}', f";response matrix fd {obs}", self.binarrays_ptjet['det'][var[0]], self.binarrays_obs['det']['fPt'], @@ -338,7 +343,7 @@ def process_efficiency_single(self, index): # TODO: avoid hard-coding values here (check if restriction is needed at all) cols = ['ismcprompt', 'ismcsignal', 'ismcfd', 'fPt', 'fEta', 'fPhi', 'fJetPt', 'fJetEta', 'fJetPhi', 'fPtLeading', 'fPtSubLeading', 'fTheta', - 'fNSub2DR', 'fNSub1', 'fNSub2'] if self.cfg('hfjet', True) else None + 'fNSub2DR', 'fNSub1', 'fNSub2', 'fJetNConstituents', 'fEnergyMother', 'fPairTheta', 'fPairPt'] if self.cfg('hfjet', True) else None # read generator level dfgen_orig = pd.concat(read_df(self.mptfiles_gensk[bin][index], columns=cols) @@ -413,7 +418,7 @@ def process_efficiency_single(self, index): arraycols = [i - 3 for i in self.cfg(f'observables.{obs}.arraycols', [])] var = obs.split('-') - self.logger.info("Observable %s has arraycols %s -> %s", obs, arraycols, [var[icol] for icol in arraycols]) + self.logger.debug("Observable %s has arraycols %s -> %s", obs, arraycols, [var[icol] for icol in arraycols]) df_mcana = self._explode_arraycols(df_mcana, [var[icol] for icol in arraycols]) fill_hist(h_mctruth[(cat, obs)], df_mcana[['fJetPt_gen', 'fPt_gen', *(f'{v}_gen' for v in var)]]) @@ -427,7 +432,7 @@ def process_efficiency_single(self, index): def _explode_arraycols(self, df: pd.DataFrame, arraycols: "list[str]") -> pd.DataFrame: if len(arraycols) > 0: - self.logger.info("Exploding columns %s", arraycols) + self.logger.debug("Exploding columns %s", arraycols) # only consider rows with corresponding det- and gen-level entries df['length'] = [len(x) for x in df[arraycols[0]]] df['length_gen'] = [len(x) for x in df[arraycols[0] + '_gen']] @@ -482,9 +487,7 @@ def _prepare_response_fd(self, dfi, h_effkine, h_response, obs): # TODO: the first cut should be taken care of by under-/overflow bins, check their usage in analyzer df = df.loc[(df.fJetPt >= axes_det[0].GetXmin()) & (df.fJetPt < axes_det[0].GetXmax()) & (df.fPt >= axes_det[1].GetXmin()) & (df.fPt < axes_det[1].GetXmax())] - self.logger.info('cutting for %s -> %s', obs, var) for i, v in enumerate(var, 2): - self.logger.info('%i: %s', i, v) df = df.loc[(df[v] >= axes_det[i].GetXmin()) & (df[v] < axes_det[i].GetXmax())] fill_hist(h_effkine[('det', 'nocuts', obs)], df[['fJetPt', 'fPt', *var]]) df = df.loc[(df.fJetPt_gen >= axes_gen[0].GetXmin()) & (df.fJetPt_gen < axes_gen[0].GetXmax()) & diff --git a/machine_learning_hep/utils/hist.py b/machine_learning_hep/utils/hist.py index b5b4549e68..99f09ab53a 100644 --- a/machine_learning_hep/utils/hist.py +++ b/machine_learning_hep/utils/hist.py @@ -123,7 +123,7 @@ def create_hist(name, title, *bin_specs): var_bins = [hasattr(spec, '__len__') for spec in bin_specs] assert all(var_bins) or not any(var_bins), f'either all bins must be variable or fixed width: {bin_specs=}' dim = len(bin_specs) if all(var_bins) else len(bin_specs) / 3 - assert dim in range(1, 10), 'only dimensions from 1 to 10 are supported' + assert dim in range(1, 12), 'only dimensions from 1 to 10 are supported' if all(var_bins): nbins = list(map(lambda a: len(a) - 1, bin_specs)) @@ -154,7 +154,7 @@ def fill_hist(hist, dfi: pd.DataFrame, weights = None, arraycols = None, write = """ dim_hist = hist.GetDimension() if isinstance(hist, ROOT.TH1) else hist.GetNdimensions() dim_df = dfi.shape[1] if dfi.ndim > 1 else dfi.ndim - assert dim_df in range(1, 10), f'{dim_df} not supported' + assert dim_df in range(1, 12), f'{dim_df} not supported' assert dim_df == dim_hist, 'dimensions of df and histogram do not match' if len(dfi) == 0: return From 734d4976a8ce3ce4985d1e29f7b73bed1ca5a37f Mon Sep 17 00:00:00 2001 From: Jochen Klein Date: Tue, 4 Feb 2025 10:56:55 +0100 Subject: [PATCH 10/10] Fix pylint --- machine_learning_hep/processer_jet.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/machine_learning_hep/processer_jet.py b/machine_learning_hep/processer_jet.py index 087822d5c0..e57fd461cf 100644 --- a/machine_learning_hep/processer_jet.py +++ b/machine_learning_hep/processer_jet.py @@ -341,9 +341,9 @@ def process_efficiency_single(self, index): with TFile.Open(self.l_histoeff[index], "recreate") as rfile: # TODO: avoid hard-coding values here (check if restriction is needed at all) - cols = ['ismcprompt', 'ismcsignal', 'ismcfd', + cols = None if not self.cfg('hfjet', True) else ['ismcprompt', 'ismcsignal', 'ismcfd', 'fPt', 'fEta', 'fPhi', 'fJetPt', 'fJetEta', 'fJetPhi', 'fPtLeading', 'fPtSubLeading', 'fTheta', - 'fNSub2DR', 'fNSub1', 'fNSub2', 'fJetNConstituents', 'fEnergyMother', 'fPairTheta', 'fPairPt'] if self.cfg('hfjet', True) else None + 'fNSub2DR', 'fNSub1', 'fNSub2', 'fJetNConstituents', 'fEnergyMother', 'fPairTheta', 'fPairPt'] # read generator level dfgen_orig = pd.concat(read_df(self.mptfiles_gensk[bin][index], columns=cols) @@ -418,7 +418,8 @@ def process_efficiency_single(self, index): arraycols = [i - 3 for i in self.cfg(f'observables.{obs}.arraycols', [])] var = obs.split('-') - self.logger.debug("Observable %s has arraycols %s -> %s", obs, arraycols, [var[icol] for icol in arraycols]) + self.logger.debug("Observable %s has arraycols %s -> %s", + obs, arraycols, [var[icol] for icol in arraycols]) df_mcana = self._explode_arraycols(df_mcana, [var[icol] for icol in arraycols]) fill_hist(h_mctruth[(cat, obs)], df_mcana[['fJetPt_gen', 'fPt_gen', *(f'{v}_gen' for v in var)]])