From 9ba22ea76a0274ccbb0b8e186a1bc21ab98d660e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?V=C3=ADt=20Ku=C4=8Dera?=
Date: Wed, 12 Feb 2025 16:48:24 +0100
Subject: [PATCH] Format Python

---
 machine_learning_hep/__main__.py | 1 +
 machine_learning_hep/analysis/analyzer.py | 19 +-
 .../analysis/analyzer_jets.py | 1132 +++++++++--------
 .../analysis/analyzer_manager.py | 21 +-
 .../analysis/analyzerdhadrons.py | 405 +++---
 .../analysis/analyzerdhadrons_mult.py | 649 +++++-----
 .../analysis/do_systematics.py | 85 +-
 machine_learning_hep/analysis/systematics.py | 281 ++--
 machine_learning_hep/analysis/utils.py | 12 +-
 machine_learning_hep/bitwise.py | 15 +-
 machine_learning_hep/computetrigger.py | 164 +--
 machine_learning_hep/config.py | 26 +-
 machine_learning_hep/correlations.py | 181 +--
 .../derive_weights/derive_weights.py | 146 ++-
 machine_learning_hep/do_variations.py | 2 +-
 .../examples/plot_hfmassfitter.py | 145 +--
 .../examples/plot_hfptspectrum.py | 486 ++++---
 .../examples/plot_hfptspectrum_years.py | 124 +-
 machine_learning_hep/fitting/fitters.py | 974 +++++++-------
 machine_learning_hep/fitting/helpers.py | 656 +++++-----
 machine_learning_hep/fitting/roofitter.py | 123 +-
 machine_learning_hep/fitting/simple_fit.py | 167 +--
 machine_learning_hep/fitting/utils.py | 25 +-
 machine_learning_hep/globalfitter.py | 302 +++--
 machine_learning_hep/hf_analysis_utils.py | 91 +-
 machine_learning_hep/hf_pt_spectrum.py | 109 +-
 machine_learning_hep/logger.py | 51 +-
 machine_learning_hep/ml_get_data.py | 22 +-
 machine_learning_hep/mlperformance.py | 202 +--
 machine_learning_hep/models.py | 133 +-
 machine_learning_hep/multiprocesser.py | 108 +-
 .../optimisation/bayesian_opt.py | 244 ++--
 .../optimisation/grid_search.py | 50 +-
 machine_learning_hep/optimisation/metrics.py | 3 +-
 machine_learning_hep/optimiser.py | 612 +++++----
 machine_learning_hep/optimization.py | 54 +-
 machine_learning_hep/pca.py | 22 +-
 .../plotting/plot_jetsubstructure.py | 519 +++++---
 .../plotting/plot_jetsubstructure_lite.py | 629 ++++++---
 .../plotting/plot_jetsubstructure_run3.py | 9 +-
 machine_learning_hep/processer.py | 462 ++++---
 machine_learning_hep/processer_jet.py | 640 ++++++----
 machine_learning_hep/processerdhadrons.py | 178 ++-
 .../processerdhadrons_mult.py | 279 ++--
 machine_learning_hep/ratio.py | 51 +-
 machine_learning_hep/root.py | 37 +-
 machine_learning_hep/selectionutils.py | 120 +-
 .../simulations/ddbar_fonll.py | 188 ++-
 machine_learning_hep/simulations/sigmann.py | 123 +-
 machine_learning_hep/steer_analysis.py | 157 +--
 machine_learning_hep/templates_keras.py | 89 +-
 machine_learning_hep/templates_scikit.py | 16 +-
 machine_learning_hep/templates_xgboost.py | 47 +-
 machine_learning_hep/utilities.py | 2 +
 machine_learning_hep/utilities_plot.py | 732 ++++++-----
 machine_learning_hep/utils/hist.py | 109 +-
 .../validation/find_duplicates_events.py | 57 +-
 machine_learning_hep/validation/validation.py | 9 +-
 .../validation/validation_candidates.py | 16 +-
 .../validation/validation_multiplicity.py | 26 +-
 machine_learning_hep/vary_bdt.py | 18 +-
 .../workflow/workflow_base.py | 22 +-
 run_hfjets.py | 45 +-
 63 files changed, 6976 insertions(+), 5446 deletions(-)

diff --git a/machine_learning_hep/__main__.py b/machine_learning_hep/__main__.py
index 66c083f4ff..cf303726b0 100755
--- a/machine_learning_hep/__main__.py
+++ b/machine_learning_hep/__main__.py
@@ -13,6 +13,7 @@
 #############################################################################
 import sys
+
 from machine_learning_hep.steer_analysis import main
 sys.exit(main())

diff --git a/machine_learning_hep/analysis/analyzer.py b/machine_learning_hep/analysis/analyzer.py
index 4808ad5e37..2eb975984a 100644
--- a/machine_learning_hep/analysis/analyzer.py
+++ b/machine_learning_hep/analysis/analyzer.py
@@ -12,13 +12,15 @@
 ## along with this program. if not, see . ##
 #############################################################################
-from os.path import exists, join
-from os import makedirs
 import os
+from os import makedirs
+from os.path import exists, join
+
+from machine_learning_hep.io import dump_yaml_from_dict

 # HF specific imports
 from machine_learning_hep.workflow.workflow_base import WorkflowBase
-from machine_learning_hep.io import dump_yaml_from_dict
+

 class Analyzer(WorkflowBase):
     def __init__(self, datap, case, typean, period):
@@ -28,15 +30,16 @@ def __init__(self, datap, case, typean, period):
         for mcordata in ("mc", "data"):
             dp = datap["analysis"][typean][mcordata]
             prefix_dir_res = dp.get("prefix_dir_res", "")
-            results_dir = prefix_dir_res + os.path.expandvars(dp["results"][period]) \
-                if period is not None \
-                else prefix_dir_res + os.path.expandvars(dp["resultsallp"])
+            results_dir = (
+                prefix_dir_res + os.path.expandvars(dp["results"][period])
+                if period is not None
+                else prefix_dir_res + os.path.expandvars(dp["resultsallp"])
+            )
             if not exists(results_dir):
                 # create output directories in case they do not exist
                 makedirs(results_dir)
             if mcordata == "data":
-                dump_yaml_from_dict({case: datap},
-                                    join(results_dir, f"database_{case}_{typean}.yml"))
+                dump_yaml_from_dict({case: datap}, join(results_dir, f"database_{case}_{typean}.yml"))


 class AnalyzerAfterBurner(WorkflowBase):

diff --git a/machine_learning_hep/analysis/analyzer_jets.py b/machine_learning_hep/analysis/analyzer_jets.py
index 5c4e159450..60d2f2b09e 100644
--- a/machine_learning_hep/analysis/analyzer_jets.py
+++ b/machine_learning_hep/analysis/analyzer_jets.py
@@ -20,13 +20,29 @@
 from ROOT import TF1, TCanvas, TFile, gStyle

 from machine_learning_hep.analysis.analyzer import Analyzer
-from machine_learning_hep.fitting.roofitter import RooFitter, calc_signif
-from machine_learning_hep.fitting.roofitter import create_text_info, add_text_info_fit, add_text_info_perf
+from machine_learning_hep.fitting.roofitter import (
+    RooFitter,
+    add_text_info_fit,
+    add_text_info_perf,
+    calc_signif,
+    create_text_info,
+)
 from machine_learning_hep.utilities import folding, make_message_notfound
-from machine_learning_hep.utils.hist import (bin_array, create_hist, norm_response, fold_hist,
-                                             fill_hist_fast, get_axis, get_dim, get_bin_limits,
-                                             get_nbins, project_hist,
-                                             scale_bin, sum_hists, ensure_sumw2)
+from machine_learning_hep.utils.hist import (
+    bin_array,
+    create_hist,
+    ensure_sumw2,
+    fill_hist_fast,
+    fold_hist,
+    get_axis,
+    get_bin_limits,
+    get_dim,
+    get_nbins,
+    norm_response,
+    project_hist,
+    scale_bin,
+    sum_hists,
+)

 # pylint: disable=too-many-instance-attributes,too-many-lines,too-many-nested-blocks
@@ -46,21 +62,23 @@ def __init__(self, datap, case, typean, period):
         super().__init__(datap, case, typean, period)

         # output directories
-        self.d_resultsallpmc = (self.cfg(f"mc.results.{period}")
-                                if period is not None else self.cfg("mc.resultsallp"))
-        self.d_resultsallpdata = (self.cfg(f"data.results.{period}")
-                                  if period is not None else self.cfg("data.resultsallp"))
+        self.d_resultsallpmc = self.cfg(f"mc.results.{period}") if period is not None else self.cfg("mc.resultsallp")
+        self.d_resultsallpdata = (
+            self.cfg(f"data.results.{period}") if
period is not None else self.cfg("data.resultsallp") + ) # input directories (processor output) self.d_resultsallpmc_proc = self.d_resultsallpmc self.d_resultsallpdata_proc = self.d_resultsallpdata # use a different processor output if "data_proc" in datap["analysis"][typean]: - self.d_resultsallpdata_proc = self.cfg(f"data_proc.results.{period}") \ - if period is not None else self.cfg("data_proc.resultsallp") + self.d_resultsallpdata_proc = ( + self.cfg(f"data_proc.results.{period}") if period is not None else self.cfg("data_proc.resultsallp") + ) if "mc_proc" in datap["analysis"][typean]: - self.d_resultsallpmc_proc = self.cfg(f"mc_proc.results.{period}") \ - if period is not None else self.cfg("mc_proc.resultsallp") + self.d_resultsallpmc_proc = ( + self.cfg(f"mc_proc.results.{period}") if period is not None else self.cfg("mc_proc.resultsallp") + ) # input files n_filemass_name = datap["files_names"]["histofilename"] @@ -72,40 +90,40 @@ def __init__(self, datap, case, typean, period): self.n_fileresp = os.path.join(self.d_resultsallpmc_proc, self.n_fileresp) file_result_name = datap["files_names"]["resultfilename"] self.n_fileresult = os.path.join(self.d_resultsallpdata, file_result_name) - self.p_pdfnames = datap["analysis"][self.typean]['pdf_names'] - self.p_param_names = datap["analysis"][self.typean]['param_names'] + self.p_pdfnames = datap["analysis"][self.typean]["pdf_names"] + self.p_param_names = datap["analysis"][self.typean]["param_names"] self.observables = { - 'qa': ['zg', 'rg', 'nsd', 'zpar', 'dr', 'lntheta', 'lnkt', 'lntheta-lnkt'], - 'all': [*self.cfg('observables', {})], + "qa": ["zg", "rg", "nsd", "zpar", "dr", "lntheta", "lnkt", "lntheta-lnkt"], + "all": [*self.cfg("observables", {})], } - self.bins_candpt = np.asarray(self.cfg('sel_an_binmin', []) + self.cfg('sel_an_binmax', [])[-1:], 'd') + self.bins_candpt = np.asarray(self.cfg("sel_an_binmin", []) + self.cfg("sel_an_binmax", [])[-1:], "d") self.nbins = len(self.bins_candpt) - 1 - self.fit_levels = self.cfg('fit_levels', ['mc', 'data']) + self.fit_levels = self.cfg("fit_levels", ["mc", "data"]) self.fit_sigma = {} self.fit_mean = {} self.fit_func_bkg = {} self.fit_range = {} - self.hcandeff = {'pr': None, 'np': None} + self.hcandeff = {"pr": None, "np": None} self.hcandeff_gen = {} self.hcandeff_det = {} self.h_eff_ptjet_pthf = {} - self.h_effnew_ptjet_pthf = {'pr': None, 'np': None} - self.h_effnew_pthf = {'pr': None, 'np': None} - self.hfeeddown_det = {'mc': {}, 'data': {}} - self.h_reflcorr = create_hist('h_reflcorr', ';p_{T}^{HF} (GeV/#it{c})', self.bins_candpt) + self.h_effnew_ptjet_pthf = {"pr": None, "np": None} + self.h_effnew_pthf = {"pr": None, "np": None} + self.hfeeddown_det = {"mc": {}, "data": {}} + self.h_reflcorr = create_hist("h_reflcorr", ";p_{T}^{HF} (GeV/#it{c})", self.bins_candpt) self.n_events = {} self.n_colls_read = {} self.n_colls_tvx = {} self.n_bcs_tvx = {} - self.path_fig = Path(f'{os.path.expandvars(self.d_resultsallpdata)}/fig') - for folder in ['qa', 'fit', 'roofit', 'sideband', 'signalextr', 'sidesub', 'sigextr', 'fd', 'uf', 'eff']: + self.path_fig = Path(f"{os.path.expandvars(self.d_resultsallpdata)}/fig") + for folder in ["qa", "fit", "roofit", "sideband", "signalextr", "sidesub", "sigextr", "fd", "uf", "eff"]: (self.path_fig / folder).mkdir(parents=True, exist_ok=True) - self.file_out_histo = TFile(self.n_fileresult, 'recreate') + self.file_out_histo = TFile(self.n_fileresult, "recreate") self.fitter = RooFitter() self.roo_ws = {} @@ -113,180 +131,179 @@ def __init__(self, datap, 
case, typean, period): self.roows = {} self.roows_ptjet = {} - #region helpers + # region helpers def _save_canvas(self, canvas, filename): - canvas.SaveAs(f'{self.path_fig}/{filename}') + canvas.SaveAs(f"{self.path_fig}/{filename}") - - def _save_hist(self, hist, filename, option = '', logy = False): + def _save_hist(self, hist, filename, option="", logy=False): if not hist: - self.logger.error('No histogram for <%s>', filename) + self.logger.error("No histogram for <%s>", filename) # TODO: remove file if it exists? return c = TCanvas() if isinstance(hist, ROOT.TH1) and get_dim(hist) == 2 and len(option) == 0: - option += 'texte' + option += "texte" hist.Draw(option) c.SetLogy(logy) self._save_canvas(c, filename) - rfilename = filename.split('/')[-1] - rfilename = rfilename.removesuffix('.png') + rfilename = filename.split("/")[-1] + rfilename = rfilename.removesuffix(".png") self.file_out_histo.WriteObject(hist, rfilename) - def _clip_neg(self, hist): for ibin in range(hist.GetNcells()): if hist.GetBinContent(ibin) < 0: - hist.SetBinContent(ibin, 0.) - hist.SetBinError(ibin, 0.) + hist.SetBinContent(ibin, 0.0) + hist.SetBinError(ibin, 0.0) - #region fundamentals + # region fundamentals def init(self): - for mcordata in ['mc', 'data']: + for mcordata in ["mc", "data"]: rfilename = self.n_filemass_mc if mcordata == "mc" else self.n_filemass with TFile(rfilename) as rfile: histonorm = rfile.Get("histonorm") if not histonorm: - self.logger.critical('histonorm not found') + self.logger.critical("histonorm not found") self.n_events[mcordata] = histonorm.GetBinContent(1) self.n_colls_read[mcordata] = histonorm.GetBinContent(2) self.n_colls_tvx[mcordata] = histonorm.GetBinContent(3) self.n_bcs_tvx[mcordata] = histonorm.GetBinContent(4) - self.logger.debug('Number of selected events for %s: %d', mcordata, self.n_events[mcordata]) - self.logger.info('Number of sampled collisions for %s: %g', mcordata, self.n_colls_read[mcordata]) - self.logger.info('Number of TVX collisions for %s: %g', mcordata, self.n_colls_tvx[mcordata]) - self.logger.info('Number of TVX BCs for %s: %g', mcordata, self.n_bcs_tvx[mcordata]) + self.logger.debug("Number of selected events for %s: %d", mcordata, self.n_events[mcordata]) + self.logger.info("Number of sampled collisions for %s: %g", mcordata, self.n_colls_read[mcordata]) + self.logger.info("Number of TVX collisions for %s: %g", mcordata, self.n_colls_tvx[mcordata]) + self.logger.info("Number of TVX BCs for %s: %g", mcordata, self.n_bcs_tvx[mcordata]) - def qa(self): # pylint: disable=invalid-name + def qa(self): # pylint: disable=invalid-name self.logger.info("Producing basic QA histograms") - for mcordata in ['mc', 'data']: + for mcordata in ["mc", "data"]: rfilename = self.n_filemass_mc if mcordata == "mc" else self.n_filemass with TFile(rfilename) as rfile: - h = rfile.Get('h_mass-ptjet-pthf') - self._save_hist(project_hist(h, [0], {}), f'qa/h_mass_{mcordata}.png') - self._save_hist(project_hist(h, [1], {}), f'qa/h_ptjet_{mcordata}.png') - self._save_hist(project_hist(h, [2], {}), f'qa/h_ptcand_{mcordata}.png') + h = rfile.Get("h_mass-ptjet-pthf") + self._save_hist(project_hist(h, [0], {}), f"qa/h_mass_{mcordata}.png") + self._save_hist(project_hist(h, [1], {}), f"qa/h_ptjet_{mcordata}.png") + self._save_hist(project_hist(h, [2], {}), f"qa/h_ptcand_{mcordata}.png") - if h := rfile.Get('h_ncand'): - self._save_hist(h, f'qa/h_ncand_{mcordata}.png', logy = True) + if h := rfile.Get("h_ncand"): + self._save_hist(h, f"qa/h_ncand_{mcordata}.png", logy=True) - for var in 
self.observables['qa']: - if h := rfile.Get(f'h_mass-ptjet-pthf-{var}'): + for var in self.observables["qa"]: + if h := rfile.Get(f"h_mass-ptjet-pthf-{var}"): axes = list(range(get_dim(h))) hproj = project_hist(h, axes[3:], {}) - self._save_hist(hproj, f'qa/h_{var}_{mcordata}.png') + self._save_hist(hproj, f"qa/h_{var}_{mcordata}.png") with TFile(self.n_fileeff) as rfile: - for var in self.observables['all']: - if '-' in var: + for var in self.observables["all"]: + if "-" in var: continue - for cat in ('pr', 'np'): - h_response = rfile.Get(f'h_response_{cat}_{var}') + for cat in ("pr", "np"): + h_response = rfile.Get(f"h_response_{cat}_{var}") h_response_ptjet = project_hist(h_response, [0, 2], {}) h_response_shape = project_hist(h_response, [1, 3], {}) - self._save_hist(h_response_ptjet, f'qa/h_ptjet-{var}_responsematrix-ptjet_{cat}.png', 'colz') - self._save_hist(h_response_shape, f'qa/h_ptjet-{var}_responsematrix-shape_{cat}.png', 'colz') - + self._save_hist(h_response_ptjet, f"qa/h_ptjet-{var}_responsematrix-ptjet_{cat}.png", "colz") + self._save_hist(h_response_shape, f"qa/h_ptjet-{var}_responsematrix-shape_{cat}.png", "colz") - #region efficiency + # region efficiency # pylint: disable=too-many-statements def calculate_efficiencies(self): self.logger.info("Calculating efficiencies") - cats = {'pr', 'np'} + cats = {"pr", "np"} with TFile(self.n_fileeff) as rfile: - h_gen = {cat: rfile.Get(f'h_ptjet-pthf_{cat}_gen') for cat in cats} - h_det = {cat: rfile.Get(f'h_ptjet-pthf_{cat}_det') for cat in cats} - h_genmatch = {cat: rfile.Get(f'h_ptjet-pthf_{cat}_genmatch') for cat in cats} - h_detmatch = {cat: rfile.Get(f'h_ptjet-pthf_{cat}_detmatch') for cat in cats} - h_detmatch_gencuts = {cat: rfile.Get(f'h_ptjet-pthf_{cat}_detmatch_gencuts') for cat in cats} + h_gen = {cat: rfile.Get(f"h_ptjet-pthf_{cat}_gen") for cat in cats} + h_det = {cat: rfile.Get(f"h_ptjet-pthf_{cat}_det") for cat in cats} + h_genmatch = {cat: rfile.Get(f"h_ptjet-pthf_{cat}_genmatch") for cat in cats} + h_detmatch = {cat: rfile.Get(f"h_ptjet-pthf_{cat}_detmatch") for cat in cats} + h_detmatch_gencuts = {cat: rfile.Get(f"h_ptjet-pthf_{cat}_detmatch_gencuts") for cat in cats} # Run 2 efficiencies (only use ptjet bins used for analysis) - bins_ptjet_ana = self.cfg('bins_ptjet', []) - bins_ptjet = (get_axis(h_gen['pr'], 0).FindBin(min(bins_ptjet_ana)), - get_axis(h_gen['pr'], 0).FindBin(max(bins_ptjet_ana) - .001)) - self.logger.info('derived ptjet bins: %i - %i', bins_ptjet[0], bins_ptjet[1]) + bins_ptjet_ana = self.cfg("bins_ptjet", []) + bins_ptjet = ( + get_axis(h_gen["pr"], 0).FindBin(min(bins_ptjet_ana)), + get_axis(h_gen["pr"], 0).FindBin(max(bins_ptjet_ana) - 0.001), + ) + self.logger.info("derived ptjet bins: %i - %i", bins_ptjet[0], bins_ptjet[1]) h_gen_proj = {cat: project_hist(h_gen[cat], [1], {0: bins_ptjet}) for cat in cats} h_det_proj = {cat: project_hist(h_detmatch_gencuts[cat], [1], {0: bins_ptjet}) for cat in cats} for cat in cats: - self._save_hist(h_gen_proj[cat], f'eff/h_pthf_{cat}_gen.png') - self._save_hist(h_det_proj[cat], f'eff/h_pthf_{cat}_det.png') + self._save_hist(h_gen_proj[cat], f"eff/h_pthf_{cat}_gen.png") + self._save_hist(h_det_proj[cat], f"eff/h_pthf_{cat}_det.png") ensure_sumw2(h_det_proj[cat]) - self.hcandeff[cat] = h_det_proj[cat].Clone(f'h_eff_{cat}') + self.hcandeff[cat] = h_det_proj[cat].Clone(f"h_eff_{cat}") self.hcandeff[cat].Divide(h_gen_proj[cat]) - self._save_hist(self.hcandeff[cat], f'eff/h_eff_{cat}.png') + self._save_hist(self.hcandeff[cat], f"eff/h_eff_{cat}.png") # extract 
efficiencies in bins of jet pt ensure_sumw2(h_det[cat]) self.h_eff_ptjet_pthf[cat] = h_detmatch_gencuts[cat].Clone() self.h_eff_ptjet_pthf[cat].Divide(h_gen[cat]) - self._save_hist(self.h_eff_ptjet_pthf[cat], f'eff/h_ptjet-pthf_eff_{cat}.png') + self._save_hist(self.h_eff_ptjet_pthf[cat], f"eff/h_ptjet-pthf_eff_{cat}.png") c = TCanvas() c.cd() for i, iptjet in enumerate(range(*bins_ptjet)): h = project_hist(self.h_eff_ptjet_pthf[cat], [1], {0: (iptjet, iptjet)}) - h.DrawCopy('' if i == 0 else 'same') + h.DrawCopy("" if i == 0 else "same") h.SetLineColor(i) - self._save_canvas(c, f'eff/h_ptjet-pthf_eff_{cat}_ptjet.png') + self._save_canvas(c, f"eff/h_ptjet-pthf_eff_{cat}_ptjet.png") # Run 3 efficiencies for icat, cat in enumerate(cats): # gen-level efficiency for feeddown estimation h_eff_gen = h_genmatch[cat].Clone() h_eff_gen.Divide(h_gen[cat]) - self._save_hist(h_eff_gen, f'eff/h_effgen_{cat}.png') + self._save_hist(h_eff_gen, f"eff/h_effgen_{cat}.png") self.hcandeff_gen[cat] = h_eff_gen # matching loss h_eff_match = h_detmatch[cat].Clone() h_eff_match.Divide(h_det[cat]) - self._save_hist(h_eff_match, f'eff/h_effmatch_{cat}.png') + self._save_hist(h_eff_match, f"eff/h_effmatch_{cat}.png") - if not (h_response := rfile.Get(f'h_response_{cat}_fPt')): - self.logger.critical(make_message_notfound(f'h_response_{cat}_fPt', self.n_fileeff)) + if not (h_response := rfile.Get(f"h_response_{cat}_fPt")): + self.logger.critical(make_message_notfound(f"h_response_{cat}_fPt", self.n_fileeff)) h_response_ptjet = project_hist(h_response, [0, 2], {}) h_response_pthf = project_hist(h_response, [1, 3], {}) - self._save_hist(h_response_ptjet, f'eff/h_ptjet-pthf_responsematrix-ptjet_{cat}.png', 'colz') - self._save_hist(h_response_pthf, f'eff/h_ptjet-pthf_responsematrix-pthf_{cat}.png', 'colz') - rm = self._build_response_matrix(h_response, self.hcandeff['pr']) + self._save_hist(h_response_ptjet, f"eff/h_ptjet-pthf_responsematrix-ptjet_{cat}.png", "colz") + self._save_hist(h_response_pthf, f"eff/h_ptjet-pthf_responsematrix-pthf_{cat}.png", "colz") + rm = self._build_response_matrix(h_response, self.hcandeff["pr"]) h_effkine_gen = self._build_effkine( - rfile.Get(f'h_effkine_{cat}_gen_nocuts_fPt'), - rfile.Get(f'h_effkine_{cat}_gen_cut_fPt')) - self._save_hist(h_effkine_gen, f'eff/h_effkine-ptjet-pthf_{cat}_gen.png', 'text') + rfile.Get(f"h_effkine_{cat}_gen_nocuts_fPt"), rfile.Get(f"h_effkine_{cat}_gen_cut_fPt") + ) + self._save_hist(h_effkine_gen, f"eff/h_effkine-ptjet-pthf_{cat}_gen.png", "text") h_effkine_det = self._build_effkine( - rfile.Get(f'h_effkine_{cat}_det_nocuts_fPt'), - rfile.Get(f'h_effkine_{cat}_det_cut_fPt')) - self._save_hist(h_effkine_det, f'eff/h_effkine-ptjet-pthf_{cat}_det.png', 'text') + rfile.Get(f"h_effkine_{cat}_det_nocuts_fPt"), rfile.Get(f"h_effkine_{cat}_det_cut_fPt") + ) + self._save_hist(h_effkine_det, f"eff/h_effkine-ptjet-pthf_{cat}_det.png", "text") h_in = h_gen[cat].Clone() - self._save_hist(project_hist(h_in, [1], {}), f'eff/h_pthf_{cat}_gen.png') + self._save_hist(project_hist(h_in, [1], {}), f"eff/h_pthf_{cat}_gen.png") h_in.Multiply(h_effkine_gen) - h_out = h_in.Clone() # should derive this from the response matrix instead + h_out = h_in.Clone() # should derive this from the response matrix instead h_out = folding(h_in, rm, h_out) h_out.Divide(h_effkine_det) - self._save_hist(project_hist(h_out, [1], {}), f'eff/h_pthf_{cat}_gen_folded.png') + self._save_hist(project_hist(h_out, [1], {}), f"eff/h_pthf_{cat}_gen_folded.png") - eff = h_det[cat].Clone(f'h_effnew_{cat}') + 
eff = h_det[cat].Clone(f"h_effnew_{cat}") ensure_sumw2(eff) eff.Divide(h_out) - if eff_corr := self.cfg('efficiency.reweight'): + if eff_corr := self.cfg("efficiency.reweight"): for iptjet in range(get_nbins(eff, 0)): for ipt in range(get_nbins(eff, 1)): - scale_bin(eff, eff_corr[ipt][icat], iptjet+1, ipt+1) + scale_bin(eff, eff_corr[ipt][icat], iptjet + 1, ipt + 1) - self._save_hist(eff, f'eff/h_ptjet-pthf_effnew_{cat}.png') + self._save_hist(eff, f"eff/h_ptjet-pthf_effnew_{cat}.png") self.h_effnew_ptjet_pthf[cat] = eff eff_avg = project_hist(h_det[cat], [1], {0: bins_ptjet}) ensure_sumw2(eff_avg) eff_avg.Divide(project_hist(h_out, [1], {0: bins_ptjet})) - if eff_corr := self.cfg('efficiency.reweight'): + if eff_corr := self.cfg("efficiency.reweight"): for ipt in range(get_nbins(eff_avg, 0)): - scale_bin(eff_avg, eff_corr[ipt][icat], ipt+1) + scale_bin(eff_avg, eff_corr[ipt][icat], ipt + 1) - self._save_hist(eff_avg, f'eff/h_pthf_effnew_{cat}.png') + self._save_hist(eff_avg, f"eff/h_pthf_effnew_{cat}.png") self.h_effnew_pthf[cat] = eff_avg c = TCanvas() @@ -300,65 +317,63 @@ def calculate_efficiencies(self): amax = hc_eff.GetMaximum() axis_ptjet = get_axis(eff, 0) for iptjet in reversed(range(1, get_nbins(eff, 0) - 1)): - h = project_hist(eff, [1], {0: (iptjet+1, iptjet+1)}) - h.SetName(h.GetName() + f'_ptjet{iptjet}') - h.Draw('same') + h = project_hist(eff, [1], {0: (iptjet + 1, iptjet + 1)}) + h.SetName(h.GetName() + f"_ptjet{iptjet}") + h.Draw("same") h.SetLineColor(iptjet) range_ptjet = get_bin_limits(axis_ptjet, iptjet + 1) - self._save_hist(h, f'h_ptjet-pthf_effnew_{cat}_{string_range_ptjet(range_ptjet)}.png') + self._save_hist(h, f"h_ptjet-pthf_effnew_{cat}_{string_range_ptjet(range_ptjet)}.png") amax = max(amax, h.GetMaximum()) - hc_eff.GetYaxis().SetRangeUser(0., 1.1 * amax) - self._save_canvas(c, f'eff/h_ptjet-pthf_effnew_{cat}_ptjet.png') - + hc_eff.GetYaxis().SetRangeUser(0.0, 1.1 * amax) + self._save_canvas(c, f"eff/h_ptjet-pthf_effnew_{cat}_ptjet.png") def _correct_efficiency(self, hist, ipt): if not hist: - self.logger.error('no histogram to correct for efficiency') + self.logger.error("no histogram to correct for efficiency") return - if self.cfg('efficiency.correction_method') == 'run3': - eff = self.h_effnew_pthf['pr'].GetBinContent(ipt + 1) - eff_old = self.hcandeff['pr'].GetBinContent(ipt + 1) - self.logger.info('Using Run 3 efficiency %g instead of %g', eff, eff_old) - hist.Scale(1. 
/ eff) - elif self.cfg('efficiency.correction_method') == 'run2_2d': - self.logger.info('using Run 2 efficiencies per jet pt bin') - if not self.h_eff_ptjet_pthf['pr']: - self.logger.error('no efficiency available for %s', hist.GetName()) + if self.cfg("efficiency.correction_method") == "run3": + eff = self.h_effnew_pthf["pr"].GetBinContent(ipt + 1) + eff_old = self.hcandeff["pr"].GetBinContent(ipt + 1) + self.logger.info("Using Run 3 efficiency %g instead of %g", eff, eff_old) + hist.Scale(1.0 / eff) + elif self.cfg("efficiency.correction_method") == "run2_2d": + self.logger.info("using Run 2 efficiencies per jet pt bin") + if not self.h_eff_ptjet_pthf["pr"]: + self.logger.error("no efficiency available for %s", hist.GetName()) return for iptjet in range(get_nbins(hist, 0)): - eff = self.h_eff_ptjet_pthf['pr'].GetBinContent(iptjet+1, ipt+1) + eff = self.h_eff_ptjet_pthf["pr"].GetBinContent(iptjet + 1, ipt + 1) if np.isclose(eff, 0): - self.logger.error('Efficiency 0 for %s ipt %d iptjet %d, no correction possible', - hist.GetName(), ipt, iptjet) + self.logger.error( + "Efficiency 0 for %s ipt %d iptjet %d, no correction possible", hist.GetName(), ipt, iptjet + ) continue for ivar in range(get_nbins(hist, 1)): - scale_bin(hist, 1./eff, iptjet+1, ivar+1) + scale_bin(hist, 1.0 / eff, iptjet + 1, ivar + 1) else: - self.logger.info('Correcting with Run 2 efficiencies') - if not self.hcandeff['pr']: - self.logger.error('no efficiency available for %s', hist.GetName()) + self.logger.info("Correcting with Run 2 efficiencies") + if not self.hcandeff["pr"]: + self.logger.error("no efficiency available for %s", hist.GetName()) return - eff = self.hcandeff['pr'].GetBinContent(ipt + 1) + eff = self.hcandeff["pr"].GetBinContent(ipt + 1) if np.isclose(eff, 0): if hist.GetEntries() > 0: # TODO: how should we handle this? - self.logger.error('Efficiency 0 for %s ipt %d, no correction possible', - hist.GetName(), ipt) + self.logger.error("Efficiency 0 for %s ipt %d, no correction possible", hist.GetName(), ipt) return - self.logger.debug('scaling hist %s (ipt %i) with 1. / %g', hist.GetName(), ipt, eff) - hist.Scale(1. / eff) - + self.logger.debug("scaling hist %s (ipt %i) with 1. / %g", hist.GetName(), ipt, eff) + hist.Scale(1.0 / eff) - #region fitting - def _roofit_mass(self, level, hist, ipt, pdfnames, param_names, fitcfg, roows = None, filename = None): + # region fitting + def _roofit_mass(self, level, hist, ipt, pdfnames, param_names, fitcfg, roows=None, filename=None): if fitcfg is None: return None, None res, ws, frame, residual_frame = self.fitter.fit_mass_new(hist, pdfnames, fitcfg, level, roows, True) - frame.SetTitle(f'inv. mass for p_{{T}} {self.bins_candpt[ipt]} - {self.bins_candpt[ipt+1]} GeV/c') + frame.SetTitle(f"inv. 
mass for p_{{T}} {self.bins_candpt[ipt]} - {self.bins_candpt[ipt + 1]} GeV/c") c = TCanvas() textInfoRight = create_text_info(0.62, 0.68, 1.0, 0.89) @@ -368,9 +383,9 @@ def _roofit_mass(self, level, hist, ipt, pdfnames, param_names, fitcfg, roows = if level == "data": mean_sgn = ws.var(self.p_param_names["gauss_mean"]) sigma_sgn = ws.var(self.p_param_names["gauss_sigma"]) - (sig, sig_err, bkg, bkg_err, - signif, signif_err, s_over_b, s_over_b_err - ) = calc_signif(ws, res, pdfnames, param_names, mean_sgn, sigma_sgn) + (sig, sig_err, bkg, bkg_err, signif, signif_err, s_over_b, s_over_b_err) = calc_signif( + ws, res, pdfnames, param_names, mean_sgn, sigma_sgn + ) add_text_info_perf(textInfoLeft, sig, sig_err, bkg, bkg_err, s_over_b, s_over_b_err, signif, signif_err) @@ -378,37 +393,38 @@ def _roofit_mass(self, level, hist, ipt, pdfnames, param_names, fitcfg, roows = textInfoRight.Draw() textInfoLeft.Draw() if res.status() != 0: - self.logger.warning('Invalid fit result for %s', hist.GetName()) - filename = filename.replace('.png', '_invalid.png') + self.logger.warning("Invalid fit result for %s", hist.GetName()) + filename = filename.replace(".png", "_invalid.png") self._save_canvas(c, filename) if level == "data": - residual_frame.SetTitle(f'inv. mass for p_{{T}} {self.bins_candpt[ipt]} - {self.bins_candpt[ipt+1]} GeV/c') + residual_frame.SetTitle( + f"inv. mass for p_{{T}} {self.bins_candpt[ipt]} - {self.bins_candpt[ipt + 1]} GeV/c" + ) cres = TCanvas() residual_frame.Draw() - filename = filename.replace('.png', '_residual.png') + filename = filename.replace(".png", "_residual.png") self._save_canvas(cres, filename) return res, ws - - def _fit_mass(self, hist, filename = None): + def _fit_mass(self, hist, filename=None): if hist.GetEntries() == 0: - raise UserWarning('Cannot fit histogram with no entries') - fit_range = self.cfg('mass_fit.range') - func_sig = TF1('funcSig', self.cfg('mass_fit.func_sig'), *fit_range) - func_bkg = TF1('funcBkg', self.cfg('mass_fit.func_bkg'), *fit_range) + raise UserWarning("Cannot fit histogram with no entries") + fit_range = self.cfg("mass_fit.range") + func_sig = TF1("funcSig", self.cfg("mass_fit.func_sig"), *fit_range) + func_bkg = TF1("funcBkg", self.cfg("mass_fit.func_bkg"), *fit_range) par_offset = func_sig.GetNpar() - func_tot = TF1('funcTot', f"{self.cfg('mass_fit.func_sig')} + {self.cfg('mass_fit.func_bkg')}({par_offset})") - func_tot.SetParameter(0, hist.GetMaximum()/3.) # TODO: better seeding? - for par, value in self.cfg('mass_fit.par_start', {}).items(): - self.logger.debug('Setting par %i to %g', par, value) + func_tot = TF1("funcTot", f"{self.cfg('mass_fit.func_sig')} + {self.cfg('mass_fit.func_bkg')}({par_offset})") + func_tot.SetParameter(0, hist.GetMaximum() / 3.0) # TODO: better seeding? 
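# --- Editor's annotation (not part of the patch): a minimal standalone sketch of
# the composite-fit pattern used in _fit_mass above. In ROOT's formula syntax a
# second component is appended with a parameter offset, e.g. "gaus(0) + pol1(3)",
# so the background parameters start where the signal parameters end; par 0 is
# then the signal amplitude, which the patch seeds with hist.GetMaximum() / 3.
# All histogram contents, ranges, and seed values below are invented for
# illustration; only the ROOT calls mirror the code in the hunk.
import ROOT

gen = ROOT.TF1("gen", "gaus(0) + pol1(3)", 1.7, 2.1)
gen.SetParameters(1.0, 1.87, 0.02, 3.0, -1.0)  # toy peak on a falling line
h = ROOT.TH1F("h_toy", "toy inv. mass;m (GeV/#it{c}^{2});counts", 100, 1.7, 2.1)
h.FillRandom("gen", 20000)

func_sig = ROOT.TF1("funcSig", "gaus", 1.7, 2.1)  # 3 parameters: 0..2
par_offset = func_sig.GetNpar()  # = 3, first background parameter index
func_tot = ROOT.TF1("funcTot", f"gaus(0) + pol1({par_offset})", 1.7, 2.1)
func_tot.SetParameter(0, h.GetMaximum() / 3.0)  # amplitude seed, as in the patch
func_tot.SetParameter(1, 1.87)  # seed the mean near the expected peak
func_tot.SetParLimits(2, 0.005, 0.05)  # constrain the width to a sane window
fit_res = h.Fit(func_tot, "SQL", "", 1.7, 2.1)  # "S" returns a TFitResultPtr
if fit_res and fit_res.Get() and fit_res.IsValid():
    # propagate fitted values back into the signal component, as _fit_mass does
    func_sig.SetParameters(fit_res.Parameter(0), fit_res.Parameter(1), fit_res.Parameter(2))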
+ for par, value in self.cfg("mass_fit.par_start", {}).items(): + self.logger.debug("Setting par %i to %g", par, value) func_tot.SetParameter(par, value) - for par, value in self.cfg('mass_fit.par_constrain', {}).items(): - self.logger.debug('Constraining par %i to (%g, %g)', par, value[0], value[1]) + for par, value in self.cfg("mass_fit.par_constrain", {}).items(): + self.logger.debug("Constraining par %i to (%g, %g)", par, value[0], value[1]) func_tot.SetParLimits(par, value[0], value[1]) - for par, value in self.cfg('mass_fit.par_fix', {}).items(): - self.logger.debug('Fixing par %i to %g', par, value) + for par, value in self.cfg("mass_fit.par_fix", {}).items(): + self.logger.debug("Fixing par %i to %g", par, value) func_tot.FixParameter(par, value) fit_res = hist.Fit(func_tot, "SQL", "", fit_range[0], fit_range[1]) if fit_res and fit_res.Get() and fit_res.IsValid(): @@ -425,23 +441,22 @@ def _fit_mass(self, hist, filename = None): c = TCanvas() hist.Draw() func_sig.SetLineColor(ROOT.kBlue) - func_sig.Draw('lsame') + func_sig.Draw("lsame") func_bkg.SetLineColor(ROOT.kCyan) - func_bkg.Draw('lsame') + func_bkg.Draw("lsame") self._save_canvas(c, filename) else: - self.logger.warning('Invalid fit result for %s', hist.GetName()) + self.logger.warning("Invalid fit result for %s", hist.GetName()) # func_tot.Print('v') - filename = filename.replace('.png', '_invalid.png') + filename = filename.replace(".png", "_invalid.png") self._save_hist(hist, filename) # TODO: how to deal with this return (fit_res, func_sig, func_bkg) - # pylint: disable=too-many-branches,too-many-statements def fit(self): - if not self.cfg('hfjet', True): + if not self.cfg("hfjet", True): self.logger.info("Not fitting mass distributions for inclusive jets") return self.logger.info("Fitting inclusive mass distributions") @@ -463,77 +478,84 @@ def fit(self): self.logger.debug("Opening histogram %s.", name_histo) if not (h := rfile.Get(name_histo)): self.logger.critical("Histogram %s not found.", name_histo) - for iptjet, ipt in itertools.product(itertools.chain((None,), range(get_nbins(h, 1))), - range(get_nbins(h, 2))): - self.logger.debug('fitting %s: %s, %i', level, iptjet, ipt) + for iptjet, ipt in itertools.product( + itertools.chain((None,), range(get_nbins(h, 1))), range(get_nbins(h, 2)) + ): + self.logger.debug("fitting %s: %s, %i", level, iptjet, ipt) axis_ptjet = get_axis(h, 1) - cuts_proj = {2: (ipt+1, ipt+1)} + cuts_proj = {2: (ipt + 1, ipt + 1)} if iptjet is not None: - cuts_proj.update({1: (iptjet+1, iptjet+1)}) - jetptlabel = f'_{string_range_ptjet(get_bin_limits(axis_ptjet, iptjet + 1))}' + cuts_proj.update({1: (iptjet + 1, iptjet + 1)}) + jetptlabel = f"_{string_range_ptjet(get_bin_limits(axis_ptjet, iptjet + 1))}" else: - jetptlabel = '' + jetptlabel = "" h_invmass = project_hist(h, [0], cuts_proj) # Rebin if (n_rebin := self.cfg("n_rebin", 1)) != 1: h_invmass.Rebin(n_rebin) - range_pthf = (self.bins_candpt[ipt], self.bins_candpt[ipt+1]) - if self.cfg('mass_fit') and iptjet is None: - if h_invmass.GetEntries() < 100: # TODO: reconsider criterion - self.logger.error('Not enough entries to fit %s iptjet %s ipt %d', - level, iptjet, ipt) + range_pthf = (self.bins_candpt[ipt], self.bins_candpt[ipt + 1]) + if self.cfg("mass_fit") and iptjet is None: + if h_invmass.GetEntries() < 100: # TODO: reconsider criterion + self.logger.error("Not enough entries to fit %s iptjet %s ipt %d", level, iptjet, ipt) continue fit_res, _, func_bkg = self._fit_mass( - h_invmass, - 
f'fit/h_mass_fitted_{string_range_pthf(range_pthf)}_{level}.png') + h_invmass, f"fit/h_mass_fitted_{string_range_pthf(range_pthf)}_{level}.png" + ) if fit_res and fit_res.Get() and fit_res.IsValid(): self.fit_mean[level][ipt] = fit_res.Parameter(1) self.fit_sigma[level][ipt] = fit_res.Parameter(2) self.fit_func_bkg[level][ipt] = func_bkg else: - self.logger.error('Fit failed for %s bin %d', level, ipt) - if self.cfg('mass_roofit'): - for entry in self.cfg('mass_roofit', []): - if lvl := entry.get('level'): + self.logger.error("Fit failed for %s bin %d", level, ipt) + if self.cfg("mass_roofit"): + for entry in self.cfg("mass_roofit", []): + if lvl := entry.get("level"): if lvl != level: continue - if ptspec := entry.get('ptrange'): + if ptspec := entry.get("ptrange"): if ptspec[0] > range_pthf[0] or ptspec[1] < range_pthf[1]: continue fitcfg = entry break self.logger.debug("Using fit config for %i: %s", ipt, fitcfg) - if iptjet is not None and not fitcfg.get('per_ptjet'): + if iptjet is not None and not fitcfg.get("per_ptjet"): continue # TODO: link datasel to fit stage - if datasel := fitcfg.get('datasel'): - hist_name = f'h_mass-ptjet-pthf_{datasel}' + if datasel := fitcfg.get("datasel"): + hist_name = f"h_mass-ptjet-pthf_{datasel}" if not (hsel := rfile.Get(hist_name)): self.logger.critical("Failed to get histogram %s", hist_name) h_invmass = project_hist(hsel, [0], cuts_proj) - if h_invmass.GetEntries() < 100: # TODO: reconsider criterion - self.logger.error('Not enough entries to fit %s iptjet %s ipt %d', - level, iptjet, ipt) + if h_invmass.GetEntries() < 100: # TODO: reconsider criterion + self.logger.error("Not enough entries to fit %s iptjet %s ipt %d", level, iptjet, ipt) continue roows = self.roows.get(ipt) if iptjet is None else self.roows_ptjet.get((iptjet, ipt)) if roows is None and level != self.fit_levels[0]: - self.logger.critical('missing previous fit result, cannot fit %s iptjet %s ipt %d', - level, iptjet, ipt) - for par in fitcfg.get('fix_params', []): + self.logger.critical( + "missing previous fit result, cannot fit %s iptjet %s ipt %d", level, iptjet, ipt + ) + for par in fitcfg.get("fix_params", []): if var := roows.var(par): var.setConstant(True) - for par in fitcfg.get('free_params', []): + for par in fitcfg.get("free_params", []): if var := roows.var(par): var.setConstant(False) if iptjet is not None: - for par in fitcfg.get('fix_params_ptjet', []): + for par in fitcfg.get("fix_params_ptjet", []): if var := roows.var(par): var.setConstant(True) roo_res, roo_ws = self._roofit_mass( - level, h_invmass, ipt, self.p_pdfnames, self.p_param_names, fitcfg, roows, - f'roofit/h_mass_fitted{jetptlabel}_{string_range_pthf(range_pthf)}_{level}.png') + level, + h_invmass, + ipt, + self.p_pdfnames, + self.p_param_names, + fitcfg, + roows, + f"roofit/h_mass_fitted{jetptlabel}_{string_range_pthf(range_pthf)}_{level}.png", + ) if roo_res.status() != 0: - self.logger.error('RooFit failed for %s iptjet %s ipt %d', level, iptjet, ipt) + self.logger.error("RooFit failed for %s iptjet %s ipt %d", level, iptjet, ipt) # if level == 'mc': # roo_ws.Print() # TODO: save snapshot per level @@ -550,30 +572,32 @@ def fit(self): self.roows_ptjet[(jptjet, ipt)] = roo_ws.Clone() self.roo_ws_ptjet[level][jptjet][ipt] = roo_ws.Clone() # TODO: take parameter names from DB - if level in ('data', 'mc'): - varname_mean = fitcfg.get('var_mean', self.p_param_names["gauss_mean"]) - varname_sigma = fitcfg.get('var_sigma', self.p_param_names["gauss_sigma"]) + if level in ("data", "mc"): + varname_mean = 
fitcfg.get("var_mean", self.p_param_names["gauss_mean"]) + varname_sigma = fitcfg.get("var_sigma", self.p_param_names["gauss_sigma"]) self.fit_mean[level][ipt] = roo_ws.var(varname_mean).getValV() self.fit_sigma[level][ipt] = roo_ws.var(varname_sigma).getValV() - varname_m = fitcfg.get('var', 'm') + varname_m = fitcfg.get("var", "m") if roo_ws.pdf("bkg"): self.fit_func_bkg[level][ipt] = roo_ws.pdf("bkg").asTF(roo_ws.var(varname_m)) - self.fit_range[level][ipt] = (roo_ws.var(varname_m).getMin('fit'), - roo_ws.var(varname_m).getMax('fit')) - self.logger.debug('fit range for %s-%i: %s', level, ipt, self.fit_range[level][ipt]) + self.fit_range[level][ipt] = ( + roo_ws.var(varname_m).getMin("fit"), + roo_ws.var(varname_m).getMax("fit"), + ) + self.logger.debug("fit range for %s-%i: %s", level, ipt, self.fit_range[level][ipt]) - #region sidebands + # region sidebands # pylint: disable=too-many-branches,too-many-statements,too-many-locals def _subtract_sideband(self, hist, var, mcordata, ipt): """ Subtract sideband distributions, assuming mass on first axis """ if not hist: - self.logger.error('no histogram for %s bin %d', var, ipt) + self.logger.error("no histogram for %s bin %d", var, ipt) return None - label = f'-{var}' if var else '' - range_pthf = (self.bins_candpt[ipt], self.bins_candpt[ipt+1]) - self._save_hist(hist, f'sideband/h_mass-ptjet{label}_{string_range_pthf(range_pthf)}_{mcordata}.png') + label = f"-{var}" if var else "" + range_pthf = (self.bins_candpt[ipt], self.bins_candpt[ipt + 1]) + self._save_hist(hist, f"sideband/h_mass-ptjet{label}_{string_range_pthf(range_pthf)}_{mcordata}.png") mean = self.fit_mean[mcordata][ipt] # self.logger.info('means %g, %g', mean, self.roows[ipt].var('mean').getVal()) @@ -581,38 +605,46 @@ def _subtract_sideband(self, hist, var, mcordata, ipt): # self.logger.info('sigmas %g, %g', sigma, self.roows[ipt].var('sigma_g1').getVal()) fit_range = self.fit_range[mcordata][ipt] if mean is None or sigma is None or fit_range is None: - self.logger.error('no fit parameters for %s bin %s-%d', var or 'none', mcordata, ipt) + self.logger.error("no fit parameters for %s bin %s-%d", var or "none", mcordata, ipt) return None - for entry in self.cfg('sidesub', []): - if level := entry.get('level'): + for entry in self.cfg("sidesub", []): + if level := entry.get("level"): if level != mcordata: continue - if ptrange_sel := entry.get('ptrange'): - if ptrange_sel[0] > self.bins_candpt[ipt] or ptrange_sel[1] < self.bins_candpt[ipt+1]: + if ptrange_sel := entry.get("ptrange"): + if ptrange_sel[0] > self.bins_candpt[ipt] or ptrange_sel[1] < self.bins_candpt[ipt + 1]: continue - regcfg = entry['regions'] + regcfg = entry["regions"] break regions = { - 'signal': (mean + regcfg['signal'][0] * sigma, mean + regcfg['signal'][1] * sigma), - 'sideband_left': (mean + regcfg['left'][0] * sigma, mean + regcfg['left'][1] * sigma), - 'sideband_right': (mean + regcfg['right'][0] * sigma, mean + regcfg['right'][1] * sigma) + "signal": (mean + regcfg["signal"][0] * sigma, mean + regcfg["signal"][1] * sigma), + "sideband_left": (mean + regcfg["left"][0] * sigma, mean + regcfg["left"][1] * sigma), + "sideband_right": (mean + regcfg["right"][0] * sigma, mean + regcfg["right"][1] * sigma), } - if regions['sideband_left'][1] < fit_range[0] or regions['sideband_right'][0] > fit_range[1]: - self.logger.critical('sidebands %s for %s-%i not in fit range %s, fix regions in DB!', - regions, mcordata, ipt, fit_range) + if regions["sideband_left"][1] < fit_range[0] or regions["sideband_right"][0] > 
fit_range[1]: + self.logger.critical( + "sidebands %s for %s-%i not in fit range %s, fix regions in DB!", regions, mcordata, ipt, fit_range + ) for reg, lim in regions.items(): if lim[0] < fit_range[0] or lim[1] > fit_range[1]: regions[reg] = (max(lim[0], fit_range[0]), min(lim[1], fit_range[1])) - self.logger.warning('region %s for %s bin %d (%s) extends beyond fit range: %s, clipping to %s', - reg, mcordata, ipt, range_pthf, lim, regions[reg]) + self.logger.warning( + "region %s for %s bin %d (%s) extends beyond fit range: %s, clipping to %s", + reg, + mcordata, + ipt, + range_pthf, + lim, + regions[reg], + ) if regions[reg][1] < regions[reg][0]: - self.logger.error('region limits inverted, reducing to zero width') + self.logger.error("region limits inverted, reducing to zero width") regions[reg] = (regions[reg][0], regions[reg][0]) axis = get_axis(hist, 0) bins = {key: (axis.FindBin(region[0]), axis.FindBin(region[1]) - 1) for key, region in regions.items()} limits = {key: (axis.GetBinLowEdge(bins[key][0]), axis.GetBinUpEdge(bins[key][1])) for key in bins} - self.logger.debug('Using for %s-%i: %s, %s', mcordata, ipt, regions, limits) + self.logger.debug("Using for %s-%i: %s, %s", mcordata, ipt, regions, limits) fh = {} area = {} @@ -622,31 +654,39 @@ def _subtract_sideband(self, hist, var, mcordata, ipt): axes = list(range(get_dim(hist)))[1:] fh[region] = project_hist(hist, axes, {0: bins[region]}) self.logger.info("Projecting %s to %s in %s: %g entries", hist, axes, bins[region], fh[region].GetEntries()) - self._save_hist(fh[region], - f'sideband/h_ptjet{label}_{region}_{string_range_pthf(range_pthf)}_{mcordata}.png') + self._save_hist( + fh[region], f"sideband/h_ptjet{label}_{region}_{string_range_pthf(range_pthf)}_{mcordata}.png" + ) - fh_subtracted = fh['signal'].Clone(f'h_ptjet{label}_subtracted_{ipt}_{mcordata}') + fh_subtracted = fh["signal"].Clone(f"h_ptjet{label}_subtracted_{ipt}_{mcordata}") ensure_sumw2(fh_subtracted) fh_sideband = sum_hists( - [fh['sideband_left'], fh['sideband_right']], f'h_ptjet{label}_sideband_{ipt}_{mcordata}') + [fh["sideband_left"], fh["sideband_right"]], f"h_ptjet{label}_sideband_{ipt}_{mcordata}" + ) ensure_sumw2(fh_sideband) subtract_sidebands = False - if mcordata == 'data' and self.cfg('sidesub_per_ptjet'): - self.logger.info('Subtracting sidebands in pt jet bins') + if mcordata == "data" and self.cfg("sidesub_per_ptjet"): + self.logger.info("Subtracting sidebands in pt jet bins") for iptjet in range(get_nbins(fh_subtracted, 0)): if rws := self.roo_ws_ptjet[mcordata][iptjet][ipt]: f = rws.pdf("bkg").asTF(self.roo_ws[mcordata][ipt].var("m")) else: - self.logger.error('Could not retrieve roows for %s-%i-%i', mcordata, iptjet, ipt) + self.logger.error("Could not retrieve roows for %s-%i-%i", mcordata, iptjet, ipt) continue area = {region: f.Integral(*limits[region]) for region in regions} - self.logger.info('areas for %s-%s: %g, %g, %g', - mcordata, ipt, area['signal'], area['sideband_left'], area['sideband_right']) - if (area['sideband_left'] + area['sideband_right']) > 0.: + self.logger.info( + "areas for %s-%s: %g, %g, %g", + mcordata, + ipt, + area["signal"], + area["sideband_left"], + area["sideband_right"], + ) + if (area["sideband_left"] + area["sideband_right"]) > 0.0: subtract_sidebands = True - areaNormFactor = area['signal'] / (area['sideband_left'] + area['sideband_right']) + areaNormFactor = area["signal"] / (area["sideband_left"] + area["sideband_right"]) # TODO: extend to higher dimensions for ibin in range(get_nbins(fh_subtracted, 
1)): scale_bin(fh_sideband, areaNormFactor, iptjet + 1, ibin + 1) @@ -655,182 +695,217 @@ def _subtract_sideband(self, hist, var, mcordata, ipt): f = self.roo_ws[mcordata][ipt].pdf("bkg").asTF(self.roo_ws[mcordata][ipt].var("m")) area[region] = f.Integral(*limits[region]) - self.logger.info('areas for %s-%s: %g, %g, %g', - mcordata, ipt, area['signal'], area['sideband_left'], area['sideband_right']) + self.logger.info( + "areas for %s-%s: %g, %g, %g", + mcordata, + ipt, + area["signal"], + area["sideband_left"], + area["sideband_right"], + ) - if (area['sideband_left'] + area['sideband_right']) > 0.: + if (area["sideband_left"] + area["sideband_right"]) > 0.0: subtract_sidebands = True - areaNormFactor = area['signal'] / (area['sideband_left'] + area['sideband_right']) + areaNormFactor = area["signal"] / (area["sideband_left"] + area["sideband_right"]) fh_sideband.Scale(areaNormFactor) - self._save_hist(fh_sideband, - f'sideband/h_ptjet{label}_sideband_{string_range_pthf(range_pthf)}_{mcordata}.png') + self._save_hist(fh_sideband, f"sideband/h_ptjet{label}_sideband_{string_range_pthf(range_pthf)}_{mcordata}.png") if subtract_sidebands: - fh_subtracted.Add(fh_sideband, -1.) + fh_subtracted.Add(fh_sideband, -1.0) self._clip_neg(fh_subtracted) - self._save_hist(fh_subtracted, f'sideband/h_ptjet{label}_subtracted_notscaled_' - f'{string_range_pthf(range_pthf)}_{mcordata}.png') + self._save_hist( + fh_subtracted, + f"sideband/h_ptjet{label}_subtracted_notscaled_{string_range_pthf(range_pthf)}_{mcordata}.png", + ) # plot subtraction before applying multiplicative corrections if get_dim(hist) == 2: c = TCanvas() - fh['signal'].SetLineColor(ROOT.kRed) - fh['signal'].Draw() + fh["signal"].SetLineColor(ROOT.kRed) + fh["signal"].Draw() fh_sideband.SetLineColor(ROOT.kCyan) fh_sideband.Draw("same") fh_subtracted.Draw("same") fh_subtracted.GetYaxis().SetRangeUser( - 0., max(fh_subtracted.GetMaximum(), fh['signal'].GetMaximum(), fh_sideband.GetMaximum())) - self._save_canvas(c, f'sideband/h_ptjet{label}_overview_{string_range_pthf(range_pthf)}_{mcordata}.png') + 0.0, max(fh_subtracted.GetMaximum(), fh["signal"].GetMaximum(), fh_sideband.GetMaximum()) + ) + self._save_canvas(c, f"sideband/h_ptjet{label}_overview_{string_range_pthf(range_pthf)}_{mcordata}.png") else: axis_ptjet = get_axis(hist, 1) - hists = [fh['signal'], fh_sideband, fh_subtracted] - cmap = [ROOT.kBlue, ROOT.kRed, ROOT.kGreen+3] + hists = [fh["signal"], fh_sideband, fh_subtracted] + cmap = [ROOT.kBlue, ROOT.kRed, ROOT.kGreen + 3] for iptjet in range(get_nbins(hist, 1)): c = TCanvas() hcs = [] - for i, h in enumerate(map(lambda h, ibin=iptjet+1: project_hist(h, [1], {0: (ibin, ibin)}), hists)): - hcs.append(h.DrawCopy('same' if i > 0 else '')) + for i, h in enumerate(map(lambda h, ibin=iptjet + 1: project_hist(h, [1], {0: (ibin, ibin)}), hists)): + hcs.append(h.DrawCopy("same" if i > 0 else "")) hcs[-1].SetLineColor(cmap[i]) - hcs[0].GetYaxis().SetRangeUser(0., 1.1 * max(map(lambda h: h.GetMaximum(), hcs))) + hcs[0].GetYaxis().SetRangeUser(0.0, 1.1 * max(map(lambda h: h.GetMaximum(), hcs))) range_ptjet = get_bin_limits(axis_ptjet, iptjet + 1) - filename = (f'sideband/h_{label[1:]}_overview_ptjet-pthf_{string_range_ptjet(range_ptjet)}' + - f'_{string_range_pthf(range_pthf)}_{mcordata}.png') + filename = ( + f"sideband/h_{label[1:]}_overview_ptjet-pthf_{string_range_ptjet(range_ptjet)}" + + f"_{string_range_pthf(range_pthf)}_{mcordata}.png" + ) self._save_canvas(c, filename) # TODO: calculate per ptjet bin roows = self.roows[ipt] - 
roows.var('mean').setVal(self.fit_mean[mcordata][ipt]) - roows.var('sigma_g1').setVal(self.fit_sigma[mcordata][ipt]) - var_m.setRange('signal', *limits['signal']) - var_m.setRange('sidel', *limits['sideband_left']) - var_m.setRange('sider', *limits['sideband_right']) + roows.var("mean").setVal(self.fit_mean[mcordata][ipt]) + roows.var("sigma_g1").setVal(self.fit_sigma[mcordata][ipt]) + var_m.setRange("signal", *limits["signal"]) + var_m.setRange("sidel", *limits["sideband_left"]) + var_m.setRange("sider", *limits["sideband_right"]) # correct for reflections - if self.cfg('corr_refl') and (mcordata == 'data' or not self.cfg('closure.filter_reflections')): - pdf_sig = self.roows[ipt].pdf('sig') - pdf_refl = self.roows[ipt].pdf('refl') - pdf_bkg = self.roows[ipt].pdf('bkg') - frac_sig = roows.var('frac').getVal() if mcordata == 'data' else 1. - frac_bkg = 1. - frac_sig - fac_sig = frac_sig * (1. - roows.var('frac_refl').getVal()) - fac_refl = frac_sig * roows.var('frac_refl').getVal() + if self.cfg("corr_refl") and (mcordata == "data" or not self.cfg("closure.filter_reflections")): + pdf_sig = self.roows[ipt].pdf("sig") + pdf_refl = self.roows[ipt].pdf("refl") + pdf_bkg = self.roows[ipt].pdf("bkg") + frac_sig = roows.var("frac").getVal() if mcordata == "data" else 1.0 + frac_bkg = 1.0 - frac_sig + fac_sig = frac_sig * (1.0 - roows.var("frac_refl").getVal()) + fac_refl = frac_sig * roows.var("frac_refl").getVal() fac_bkg = frac_bkg - area_sig_sig = pdf_sig.createIntegral(var_m, ROOT.RooFit.NormSet(var_m), - ROOT.RooFit.Range('signal')).getVal() * fac_sig - area_refl_sig = pdf_refl.createIntegral(var_m, ROOT.RooFit.NormSet(var_m), - ROOT.RooFit.Range('signal')).getVal() * fac_refl - area_refl_sidel = pdf_refl.createIntegral(var_m, ROOT.RooFit.NormSet(var_m), - ROOT.RooFit.Range('sidel')).getVal() * fac_refl - area_refl_sider = pdf_refl.createIntegral(var_m, ROOT.RooFit.NormSet(var_m), - ROOT.RooFit.Range('sider')).getVal() * fac_refl + area_sig_sig = ( + pdf_sig.createIntegral(var_m, ROOT.RooFit.NormSet(var_m), ROOT.RooFit.Range("signal")).getVal() + * fac_sig + ) + area_refl_sig = ( + pdf_refl.createIntegral(var_m, ROOT.RooFit.NormSet(var_m), ROOT.RooFit.Range("signal")).getVal() + * fac_refl + ) + area_refl_sidel = ( + pdf_refl.createIntegral(var_m, ROOT.RooFit.NormSet(var_m), ROOT.RooFit.Range("sidel")).getVal() + * fac_refl + ) + area_refl_sider = ( + pdf_refl.createIntegral(var_m, ROOT.RooFit.NormSet(var_m), ROOT.RooFit.Range("sider")).getVal() + * fac_refl + ) area_refl_side = area_refl_sidel + area_refl_sider - area_bkg_sig = pdf_bkg.createIntegral(var_m, ROOT.RooFit.NormSet(var_m), - ROOT.RooFit.Range('signal')).getVal() * fac_bkg - area_bkg_sidel = pdf_bkg.createIntegral(var_m, ROOT.RooFit.NormSet(var_m), - ROOT.RooFit.Range('sidel')).getVal() * fac_bkg - area_bkg_sider = pdf_bkg.createIntegral(var_m, ROOT.RooFit.NormSet(var_m), - ROOT.RooFit.Range('sider')).getVal() * fac_bkg + area_bkg_sig = ( + pdf_bkg.createIntegral(var_m, ROOT.RooFit.NormSet(var_m), ROOT.RooFit.Range("signal")).getVal() + * fac_bkg + ) + area_bkg_sidel = ( + pdf_bkg.createIntegral(var_m, ROOT.RooFit.NormSet(var_m), ROOT.RooFit.Range("sidel")).getVal() * fac_bkg + ) + area_bkg_sider = ( + pdf_bkg.createIntegral(var_m, ROOT.RooFit.NormSet(var_m), ROOT.RooFit.Range("sider")).getVal() * fac_bkg + ) area_bkg_side = area_bkg_sidel + area_bkg_sider - scale_bkg = area_bkg_sig / area_bkg_side if mcordata == 'data' else 1. 
+ scale_bkg = area_bkg_sig / area_bkg_side if mcordata == "data" else 1.0 corr = area_sig_sig / (area_sig_sig + area_refl_sig - area_refl_side * scale_bkg) - self.logger.info('Correcting %s-%i for reflections with factor %g', mcordata, ipt, corr) - self.logger.info('areas: %g, %g, %g, %g; bkgscale: %g', - area_sig_sig, area_refl_sig, area_refl_sidel, area_refl_sider, scale_bkg) + self.logger.info("Correcting %s-%i for reflections with factor %g", mcordata, ipt, corr) + self.logger.info( + "areas: %g, %g, %g, %g; bkgscale: %g", + area_sig_sig, + area_refl_sig, + area_refl_sidel, + area_refl_sider, + scale_bkg, + ) self.h_reflcorr.SetBinContent(ipt + 1, corr) fh_subtracted.Scale(corr) - pdf_sig = self.roows[ipt].pdf('sig') - frac_sig = pdf_sig.createIntegral(var_m, ROOT.RooFit.NormSet(var_m), ROOT.RooFit.Range('signal')).getVal() - if pdf_peak := self.roows[ipt].pdf('peak'): - frac_peak = pdf_peak.createIntegral(var_m, ROOT.RooFit.NormSet(var_m), ROOT.RooFit.Range('signal')).getVal() - self.logger.info('correcting %s-%i for fractional signal area: %g (Gaussian: %g)', - mcordata, ipt, frac_sig, frac_peak) + pdf_sig = self.roows[ipt].pdf("sig") + frac_sig = pdf_sig.createIntegral(var_m, ROOT.RooFit.NormSet(var_m), ROOT.RooFit.Range("signal")).getVal() + if pdf_peak := self.roows[ipt].pdf("peak"): + frac_peak = pdf_peak.createIntegral(var_m, ROOT.RooFit.NormSet(var_m), ROOT.RooFit.Range("signal")).getVal() + self.logger.info( + "correcting %s-%i for fractional signal area: %g (Gaussian: %g)", mcordata, ipt, frac_sig, frac_peak + ) - fh_subtracted.Scale(1. / frac_sig) - self._save_hist(fh_subtracted, f'sideband/h_ptjet{label}_subtracted_' - f'{string_range_pthf(range_pthf)}_{mcordata}.png') + fh_subtracted.Scale(1.0 / frac_sig) + self._save_hist( + fh_subtracted, f"sideband/h_ptjet{label}_subtracted_{string_range_pthf(range_pthf)}_{mcordata}.png" + ) return fh_subtracted - # region analysis - def _analyze(self, method = 'sidesub'): + def _analyze(self, method="sidesub"): self.logger.info("Running analysis") - for mcordata in ['mc', 'data']: + for mcordata in ["mc", "data"]: rfilename = self.n_filemass_mc if mcordata == "mc" else self.n_filemass with TFile(rfilename) as rfile: - for var in [None] + self.observables['all']: - self.logger.info('Running analysis for obs. %s, %s using %s', var, mcordata, method) - label = f'-{var}' if var else '' - self.logger.debug('looking for %s', f'h_mass-ptjet-pthf{label}') - if fh := rfile.Get(f'h_mass-ptjet-pthf{label}'): # TODO: add sanity check + for var in [None] + self.observables["all"]: + self.logger.info("Running analysis for obs. %s, %s using %s", var, mcordata, method) + label = f"-{var}" if var else "" + self.logger.debug("looking for %s", f"h_mass-ptjet-pthf{label}") + if fh := rfile.Get(f"h_mass-ptjet-pthf{label}"): # TODO: add sanity check axes_proj = list(range(get_dim(fh))) axes_proj.remove(2) fh_sub = [] self.h_reflcorr.Reset() for ipt in range(self.nbins): - h_in = project_hist(fh, axes_proj, {2: (ipt+1, ipt+1)}) + h_in = project_hist(fh, axes_proj, {2: (ipt + 1, ipt + 1)}) ensure_sumw2(h_in) # Signal extraction - self.logger.info("Signal extraction (method %s): obs. %s, %s, ipt %d", - method, var, mcordata, ipt) - if not self.cfg('hfjet', True): + self.logger.info( + "Signal extraction (method %s): obs. 
%s, %s, ipt %d", method, var, mcordata, ipt + ) + if not self.cfg("hfjet", True): h = project_hist(h_in, list(range(1, get_dim(h_in))), {}) - elif method == 'sidesub': + elif method == "sidesub": h = self._subtract_sideband(h_in, var, mcordata, ipt) - elif method == 'sigextr': + elif method == "sigextr": h = self._extract_signal(h_in, var, mcordata, ipt) else: - self.logger.critical('invalid method %s', method) - self._save_hist(h, f'h_ptjet{label}_{method}_noeff_{mcordata}_pt{ipt}.png') - if mcordata == 'mc': - self.logger.info('projecting %s onto axes: %s', h_in, axes_proj[1:]) + self.logger.critical("invalid method %s", method) + self._save_hist(h, f"h_ptjet{label}_{method}_noeff_{mcordata}_pt{ipt}.png") + if mcordata == "mc": + self.logger.info("projecting %s onto axes: %s", h_in, axes_proj[1:]) h_proj = project_hist(h_in, list(range(1, get_dim(h_in))), {}) - h_proj_lim = project_hist(h_in, list(range(1, get_dim(h_in))), - {0: (1, get_nbins(h_in, 0))}) - self._save_hist(h_proj, f'h_ptjet{label}_proj_noeff_{mcordata}_pt{ipt}.png') + h_proj_lim = project_hist( + h_in, list(range(1, get_dim(h_in))), {0: (1, get_nbins(h_in, 0))} + ) + self._save_hist(h_proj, f"h_ptjet{label}_proj_noeff_{mcordata}_pt{ipt}.png") if h and h_proj: - self.logger.debug('signal loss %s-%i: %g, fraction in under-/overflow: %g', - mcordata, ipt, - 1. - h.Integral()/h_proj.Integral(), - 1. - h_proj_lim.Integral()/h_proj.Integral()) - if self.cfg('closure.pure_signal'): - self.logger.debug('assuming pure signal, using projection') + self.logger.debug( + "signal loss %s-%i: %g, fraction in under-/overflow: %g", + mcordata, + ipt, + 1.0 - h.Integral() / h_proj.Integral(), + 1.0 - h_proj_lim.Integral() / h_proj.Integral(), + ) + if self.cfg("closure.pure_signal"): + self.logger.debug("assuming pure signal, using projection") h = h_proj # Efficiency correction - if mcordata == 'data' or not self.cfg('closure.use_matched'): - self.logger.info("Efficiency correction: obs. %s, %s, ipt %d", - var, mcordata, ipt) - self.logger.info('correcting efficiency') + if mcordata == "data" or not self.cfg("closure.use_matched"): + self.logger.info("Efficiency correction: obs. %s, %s, ipt %d", var, mcordata, ipt) + self.logger.info("correcting efficiency") self._correct_efficiency(h, ipt) fh_sub.append(h) fh_sum = sum_hists(fh_sub) - self._save_hist(self.h_reflcorr, f'h_reflcorr-pthf{label}_reflcorr_{mcordata}.png') - self._save_hist(fh_sum, f'h_ptjet{label}_{method}_effscaled_{mcordata}.png') + self._save_hist(self.h_reflcorr, f"h_reflcorr-pthf{label}_reflcorr_{mcordata}.png") + self._save_hist(fh_sum, f"h_ptjet{label}_{method}_effscaled_{mcordata}.png") if get_dim(fh_sum) > 1: axes = list(range(get_dim(fh_sum))) axis_ptjet = get_axis(fh_sum, 0) for iptjet in range(get_nbins(fh_sum, 0)): c = TCanvas() - h_sig = project_hist(fh_sum, axes[1:], {0: (iptjet+1, iptjet+1)}) + h_sig = project_hist(fh_sum, axes[1:], {0: (iptjet + 1, iptjet + 1)}) h_sig.Draw() range_ptjet = get_bin_limits(axis_ptjet, iptjet + 1) - filename = (f'{method}/h_{label[1:]}_{method}_effscaled' + - f'_{string_range_ptjet(range_ptjet)}.png') + filename = ( + f"{method}/h_{label[1:]}_{method}_effscaled" + + f"_{string_range_ptjet(range_ptjet)}.png" + ) self._save_canvas(c, filename) fh_sum_fdsub = fh_sum.Clone() # Feed-down subtraction self.logger.info("Feed-down subtraction: obs. 
%s, %s", var, mcordata) - if mcordata == 'data' or not self.cfg('closure.exclude_feeddown_det'): + if mcordata == "data" or not self.cfg("closure.exclude_feeddown_det"): self._subtract_feeddown(fh_sum_fdsub, var, mcordata) self._clip_neg(fh_sum_fdsub) - self._save_hist(fh_sum_fdsub, f'h_ptjet{label}_{method}_{mcordata}.png') + self._save_hist(fh_sum_fdsub, f"h_ptjet{label}_{method}_{mcordata}.png") if get_dim(fh_sum) == 2: axes = list(range(get_dim(fh_sum))) @@ -838,220 +913,233 @@ def _analyze(self, method = 'sidesub'): for iptjet in range(get_nbins(fh_sum, 0)): c = TCanvas() c.cd() - h_sig = project_hist(fh_sum, axes[1:], {0: (iptjet+1,)*2}).Clone('hsig') + h_sig = project_hist(fh_sum, axes[1:], {0: (iptjet + 1,) * 2}).Clone("hsig") h_sig.Draw("same") h_sig.SetLineColor(ROOT.kRed) ymax = h_sig.GetMaximum() if var in self.hfeeddown_det[mcordata]: h_fd = self.hfeeddown_det[mcordata][var] - h_fd = project_hist(h_fd, axes[1:], {0: (iptjet+1,)*2}) - h_fd.DrawCopy('same') + h_fd = project_hist(h_fd, axes[1:], {0: (iptjet + 1,) * 2}) + h_fd.DrawCopy("same") h_fd.SetLineColor(ROOT.kCyan) ymax = max(ymax, h_fd.GetMaximum()) - h_fdsub = project_hist(fh_sum_fdsub, axes[1:], {0: (iptjet+1,)*2}).Clone('hfdsub') - h_fdsub.Draw('same') + h_fdsub = project_hist(fh_sum_fdsub, axes[1:], {0: (iptjet + 1,) * 2}).Clone("hfdsub") + h_fdsub.Draw("same") h_fdsub.SetLineColor(ROOT.kMagenta) ymax = max(ymax, h_fdsub.GetMaximum()) - h_sig.GetYaxis().SetRangeUser(0., 1.1 * ymax) + h_sig.GetYaxis().SetRangeUser(0.0, 1.1 * ymax) range_ptjet = get_bin_limits(axis_ptjet, iptjet + 1) - filename = (f'{method}/h_{label[1:]}_{method}_fdsub' + - f'_{string_range_ptjet(range_ptjet)}.png') + filename = ( + f"{method}/h_{label[1:]}_{method}_fdsub" + f"_{string_range_ptjet(range_ptjet)}.png" + ) self._save_canvas(c, filename) if not var: continue axis_ptjet = get_axis(fh_sum_fdsub, 0) for j in range(get_nbins(fh_sum_fdsub, 0)): - hproj = project_hist(fh_sum_fdsub, list(range(1, get_dim(fh_sum_fdsub))), {0: [j+1, j+1]}) + hproj = project_hist( + fh_sum_fdsub, list(range(1, get_dim(fh_sum_fdsub))), {0: [j + 1, j + 1]} + ) range_ptjet = get_bin_limits(axis_ptjet, j + 1) self._save_hist( - hproj, f'uf/h_{var}_{method}_{mcordata}_{string_range_ptjet(range_ptjet)}.png') + hproj, f"uf/h_{var}_{method}_{mcordata}_{string_range_ptjet(range_ptjet)}.png" + ) # Unfolding self.logger.info("Unfolding: obs. %s, %s", var, mcordata) fh_unfolded = self._unfold(fh_sum_fdsub, var, mcordata) for i, h in enumerate(fh_unfolded): - self._save_hist(h, f'h_ptjet-{var}_{method}_unfolded_{mcordata}_{i}.png') + self._save_hist(h, f"h_ptjet-{var}_{method}_unfolded_{mcordata}_{i}.png") for j in range(get_nbins(h, 0)): range_ptjet = get_bin_limits(axis_ptjet, j + 1) c = TCanvas() for i, h in enumerate(fh_unfolded): - hproj = project_hist(h, list(range(1, get_dim(h))), {0: (j+1, j+1)}) - empty = hproj.Integral() < 1.e-7 + hproj = project_hist(h, list(range(1, get_dim(h))), {0: (j + 1, j + 1)}) + empty = hproj.Integral() < 1.0e-7 if empty and i == 0: - self.logger.error("Projection %s %s %s is empty.", var, mcordata, - string_range_ptjet(range_ptjet)) + self.logger.error( + "Projection %s %s %s is empty.", var, mcordata, string_range_ptjet(range_ptjet) + ) self._save_hist( hproj, - f'uf/h_{var}_{method}_unfolded_{mcordata}_' + - f'{string_range_ptjet(range_ptjet)}_{i}.png') + f"uf/h_{var}_{method}_unfolded_{mcordata}_" + + f"{string_range_ptjet(range_ptjet)}_{i}.png", + ) # Save the default unfolding iteration separately. 
if i == self.cfg("unfolding_iterations_sel") - 1: self._save_hist( hproj, - f'uf/h_{var}_{method}_unfolded_{mcordata}_' + - f'{string_range_ptjet(range_ptjet)}_sel.png', "colz") + f"uf/h_{var}_{method}_unfolded_{mcordata}_" + + f"{string_range_ptjet(range_ptjet)}_sel.png", + "colz", + ) # Save also the self-normalised version. if not empty: hproj_sel = hproj.Clone(f"{hproj.GetName()}_selfnorm") - hproj_sel.Scale(1. / hproj_sel.Integral(), "width") - self.logger.debug("Final histogram: %s, jet pT %g to %g", - var, range_ptjet[0], range_ptjet[1]) + hproj_sel.Scale(1.0 / hproj_sel.Integral(), "width") + self.logger.debug( + "Final histogram: %s, jet pT %g to %g", var, range_ptjet[0], range_ptjet[1] + ) # self.logger.debug(print_histogram(hproj_sel)) self._save_hist( hproj_sel, - f'uf/h_{var}_{method}_unfolded_{mcordata}_' + - f'{string_range_ptjet(range_ptjet)}_sel_selfnorm.png') + f"uf/h_{var}_{method}_unfolded_{mcordata}_" + + f"{string_range_ptjet(range_ptjet)}_sel_selfnorm.png", + ) c.cd() - hcopy = hproj.DrawCopy('same' if i > 0 else '') - hcopy.SetLineColor(i+1) - self._save_canvas(c, - f'uf/h_{var}_{method}_convergence_{mcordata}_' + - f'{string_range_ptjet(range_ptjet)}.png') + hcopy = hproj.DrawCopy("same" if i > 0 else "") + hcopy.SetLineColor(i + 1) + self._save_canvas( + c, + f"uf/h_{var}_{method}_convergence_{mcordata}_" + + f"{string_range_ptjet(range_ptjet)}.png", + ) self.logger.info("Analysis complete: obs. %s, %s", var, mcordata) - def analyze_with_sidesub(self): - self._analyze('sidesub') - + self._analyze("sidesub") def analyze_with_sigextr(self): - self._analyze('sigextr') + self._analyze("sigextr") - - #region signal extraction + # region signal extraction def _extract_signal(self, hist, var, mcordata, ipt): """ Extract signal through inv. mass fit (first axis) in bins of other axes """ if not hist: - self.logger.warning('no histogram for %s bin %d', var, ipt) + self.logger.warning("no histogram for %s bin %d", var, ipt) return None - range_pthf = (self.bins_candpt[ipt], self.bins_candpt[ipt+1]) - self._save_hist(hist, f'signalextr/h_mass-{var}_{string_range_pthf(range_pthf)}_{mcordata}.png') + range_pthf = (self.bins_candpt[ipt], self.bins_candpt[ipt + 1]) + self._save_hist(hist, f"signalextr/h_mass-{var}_{string_range_pthf(range_pthf)}_{mcordata}.png") if self.fit_mean[mcordata][ipt] is None or self.fit_sigma[mcordata][ipt] is None: - self.logger.warning('no fit parameters for %s bin %s-%d', var, mcordata, ipt) - return None # TODO: should we continue nonetheless? + self.logger.warning("no fit parameters for %s bin %s-%d", var, mcordata, ipt) + return None # TODO: should we continue nonetheless? 
axes = list(range(get_dim(hist))) - hres = project_hist(hist, axes[1:], {}) # TODO: check if we can project without content + hres = project_hist(hist, axes[1:], {}) # TODO: check if we can project without content hres.Reset() # TODO: take from DB, add scaling, or extend - range_int = (self.fit_mean[mcordata][ipt] - 3 * self.fit_sigma[mcordata][ipt], - self.fit_mean[mcordata][ipt] + 3 * self.fit_sigma[mcordata][ipt]) + range_int = ( + self.fit_mean[mcordata][ipt] - 3 * self.fit_sigma[mcordata][ipt], + self.fit_mean[mcordata][ipt] + 3 * self.fit_sigma[mcordata][ipt], + ) nbins = [list(range(1, get_axis(hres, i).GetNbins() + 1)) for i in range(get_dim(hres))] for binid in itertools.product(*nbins): - label = f'{binid[0]}' + label = f"{binid[0]}" for i in range(1, len(binid)): - label += f'_{binid[i]}' + label += f"_{binid[i]}" limits = {i + 1: (j, j) for i, j in enumerate(binid)} hmass = project_hist(hist, [0], limits) if hmass.GetEntries() > 100: # TODO: change to RooFit fit_res, func_sig, _ = self._fit_mass( - hmass, f'signalextr/h_mass-{var}_fitted_{string_range_pthf(range_pthf)}_{label}_{mcordata}.png') + hmass, f"signalextr/h_mass-{var}_fitted_{string_range_pthf(range_pthf)}_{label}_{mcordata}.png" + ) if fit_res and fit_res.Get() and fit_res.IsValid(): # TODO: consider adding scaling factor hres.SetBinContent(*binid, func_sig.Integral(*range_int) / hmass.GetBinWidth(1)) else: self.logger.error("Could not extract signal for %s %s %i", var, mcordata, ipt) self._save_hist( - hres, - f'signalextr/h_{var}_signalextracted_{string_range_pthf(range_pthf)}_{label}_{mcordata}.png') + hres, f"signalextr/h_{var}_signalextracted_{string_range_pthf(range_pthf)}_{label}_{mcordata}.png" + ) # hres.Sumw2() # TODO: check if we should do this here return hres - - #region feeddown + # region feeddown # pylint: disable=too-many-statements def estimate_feeddown(self): - self.logger.info('Estimating feeddown') + self.logger.info("Estimating feeddown") - with TFile(self.cfg('fd_root')) as rfile: - powheg_xsection = rfile.Get('fHistXsection') + with TFile(self.cfg("fd_root")) as rfile: + powheg_xsection = rfile.Get("fHistXsection") powheg_xsection_scale_factor = powheg_xsection.GetBinContent(1) / powheg_xsection.GetEntries() - self.logger.info('POWHEG luminosity (mb^{-1}): %g', 1. / powheg_xsection_scale_factor) + self.logger.info("POWHEG luminosity (mb^{-1}): %g", 1.0 / powheg_xsection_scale_factor) - df = pd.read_parquet(self.cfg('fd_parquet')) - col_mapping = {'dr': 'delta_r_jet', 'zpar': 'z'} # TODO: check mapping + df = pd.read_parquet(self.cfg("fd_parquet")) + col_mapping = {"dr": "delta_r_jet", "zpar": "z"} # TODO: check mapping # TODO: generalize to higher dimensions - for var in self.observables['all']: - bins_ptjet = np.asarray(self.cfg('bins_ptjet'), 'd') + for var in self.observables["all"]: + bins_ptjet = np.asarray(self.cfg("bins_ptjet"), "d") # TODO: generalize or derive from histogram? 
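+ # The lookup below resolves the observable binning by falling back through the config keys
+ # "bins_gen_var" -> "bins_gen_fix" -> "bins_var" -> "bins_fix" (explicit variable-width edges
+ # vs. fixed-width bins built by bin_array), with a 10-bin [0, 1] default as a last resort.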
bins_obs = {} - if binning := self.cfg(f'observables.{var}.bins_gen_var'): - bins_tmp = np.asarray(binning, 'd') - elif binning := self.cfg(f'observables.{var}.bins_gen_fix'): + if binning := self.cfg(f"observables.{var}.bins_gen_var"): + bins_tmp = np.asarray(binning, "d") + elif binning := self.cfg(f"observables.{var}.bins_gen_fix"): bins_tmp = bin_array(*binning) - elif binning := self.cfg(f'observables.{var}.bins_var'): - bins_tmp = np.asarray(binning, 'd') - elif binning := self.cfg(f'observables.{var}.bins_fix'): + elif binning := self.cfg(f"observables.{var}.bins_var"): + bins_tmp = np.asarray(binning, "d") + elif binning := self.cfg(f"observables.{var}.bins_fix"): bins_tmp = bin_array(*binning) else: - self.logger.error('no binning specified for %s, using defaults', var) - bins_tmp = bin_array(10, 0., 1.) + self.logger.error("no binning specified for %s, using defaults", var) + bins_tmp = bin_array(10, 0.0, 1.0) bins_obs[var] = bins_tmp - colname = col_mapping.get(var, f'{var}_jet') - if f'{colname}' not in df: + colname = col_mapping.get(var, f"{var}_jet") + if f"{colname}" not in df: if var is not None: - self.logger.error('No feeddown information for %s (%s), cannot estimate feeddown', var, colname) + self.logger.error("No feeddown information for %s (%s), cannot estimate feeddown", var, colname) print(df.info(), flush=True) continue # TODO: derive histogram - h3_fd_gen_orig = create_hist('h3_feeddown_gen', - f';p_{{T}}^{{jet}} (GeV/#it{{c}});p_{{T}}^{{HF}} (GeV/#it{{c}});{var}', - bins_ptjet, self.bins_candpt, bins_obs[var]) - fill_hist_fast(h3_fd_gen_orig, df[['pt_jet', 'pt_cand', f'{colname}']]) - self._save_hist(project_hist(h3_fd_gen_orig, [0, 2], {}), f'fd/h_ptjet-{var}_feeddown_gen_noeffscaling.png') + h3_fd_gen_orig = create_hist( + "h3_feeddown_gen", + f";p_{{T}}^{{jet}} (GeV/#it{{c}});p_{{T}}^{{HF}} (GeV/#it{{c}});{var}", + bins_ptjet, + self.bins_candpt, + bins_obs[var], + ) + fill_hist_fast(h3_fd_gen_orig, df[["pt_jet", "pt_cand", f"{colname}"]]) + self._save_hist(project_hist(h3_fd_gen_orig, [0, 2], {}), f"fd/h_ptjet-{var}_feeddown_gen_noeffscaling.png") # new method h3_fd_gen = h3_fd_gen_orig.Clone() ensure_sumw2(h3_fd_gen) - self._save_hist(project_hist(h3_fd_gen, [0, 2], {}), f'fd/h_ptjet-{var}_fdnew_gen.png') + self._save_hist(project_hist(h3_fd_gen, [0, 2], {}), f"fd/h_ptjet-{var}_fdnew_gen.png") # apply np efficiency for ipt in range(get_nbins(h3_fd_gen, 1)): - eff_np = self.hcandeff_gen['np'].GetBinContent(ipt+1) - for iptjet, ishape in itertools.product( - range(get_nbins(h3_fd_gen, 0)), range(get_nbins(h3_fd_gen, 2))): - scale_bin(h3_fd_gen, eff_np, iptjet+1, ipt+1, ishape+1) - self._save_hist(project_hist(h3_fd_gen, [0, 2], {}), f'fd/h_ptjet-{var}_fdnew_gen_geneff.png') + eff_np = self.hcandeff_gen["np"].GetBinContent(ipt + 1) + for iptjet, ishape in itertools.product(range(get_nbins(h3_fd_gen, 0)), range(get_nbins(h3_fd_gen, 2))): + scale_bin(h3_fd_gen, eff_np, iptjet + 1, ipt + 1, ishape + 1) + self._save_hist(project_hist(h3_fd_gen, [0, 2], {}), f"fd/h_ptjet-{var}_fdnew_gen_geneff.png") # 3d folding incl. 
kinematic efficiencies with TFile(self.n_fileeff) as rfile: h_effkine_gen = self._build_effkine( - rfile.Get(f'h_effkine_fd_gen_nocuts_{var}'), - rfile.Get(f'h_effkine_fd_gen_cut_{var}')) + rfile.Get(f"h_effkine_fd_gen_nocuts_{var}"), rfile.Get(f"h_effkine_fd_gen_cut_{var}") + ) h_effkine_det = self._build_effkine( - rfile.Get(f'h_effkine_fd_det_nocuts_{var}'), - rfile.Get(f'h_effkine_fd_det_cut_{var}')) - h_response = rfile.Get(f'h_response_fd_{var}') + rfile.Get(f"h_effkine_fd_det_nocuts_{var}"), rfile.Get(f"h_effkine_fd_det_cut_{var}") + ) + h_response = rfile.Get(f"h_response_fd_{var}") if not h_response: self.logger.error("Could not find response matrix for fd estimation of %s", var) rfile.ls() continue h_response_norm = norm_response(h_response, 3) h3_fd_gen.Multiply(h_effkine_gen) - self._save_hist(project_hist(h3_fd_gen, [0, 2], {}), f'fd/h_ptjet-{var}_fdnew_gen_genkine.png') + self._save_hist(project_hist(h3_fd_gen, [0, 2], {}), f"fd/h_ptjet-{var}_fdnew_gen_genkine.png") h3_fd_det = fold_hist(h3_fd_gen, h_response_norm) - self._save_hist(project_hist(h3_fd_det, [0, 2], {}), f'fd/h_ptjet-{var}_fdnew_det.png') + self._save_hist(project_hist(h3_fd_det, [0, 2], {}), f"fd/h_ptjet-{var}_fdnew_det.png") h3_fd_det.Divide(h_effkine_det) - self._save_hist(project_hist(h3_fd_det, [0, 2], {}), f'fd/h_ptjet-{var}_fdnew_det_detkine.png') + self._save_hist(project_hist(h3_fd_det, [0, 2], {}), f"fd/h_ptjet-{var}_fdnew_det_detkine.png") # undo prompt efficiency for ipt in range(get_nbins(h3_fd_det, 1)): - eff_pr = self.h_effnew_pthf['pr'].GetBinContent(ipt+1) - if np.isclose(eff_pr, 0.): - self.logger.error('Efficiency zero for %s in pt bin %d, continuing', var, ipt) - continue # TODO: how should we handle this? - for iptjet, ishape in itertools.product( - range(get_nbins(h3_fd_det, 0)), range(get_nbins(h3_fd_det, 2))): - scale_bin(h3_fd_det, 1./eff_pr, iptjet+1, ipt+1, ishape+1) - self._save_hist(project_hist(h3_fd_det, [0, 2], {}), f'fd/h_ptjet-{var}_fdnew_det_deteff.png') + eff_pr = self.h_effnew_pthf["pr"].GetBinContent(ipt + 1) + if np.isclose(eff_pr, 0.0): + self.logger.error("Efficiency zero for %s in pt bin %d, continuing", var, ipt) + continue # TODO: how should we handle this? + for iptjet, ishape in itertools.product(range(get_nbins(h3_fd_det, 0)), range(get_nbins(h3_fd_det, 2))): + scale_bin(h3_fd_det, 1.0 / eff_pr, iptjet + 1, ipt + 1, ishape + 1) + self._save_hist(project_hist(h3_fd_det, [0, 2], {}), f"fd/h_ptjet-{var}_fdnew_det_deteff.png") # project to 2d (ptjet-shape) h_fd_det = project_hist(h3_fd_det, [0, 2], {}) @@ -1060,67 +1148,74 @@ def estimate_feeddown(self): h3_fd_gen = h3_fd_gen_orig.Clone() ensure_sumw2(h3_fd_gen) for ipt in range(get_nbins(h3_fd_gen, 1)): - eff_pr = self.hcandeff['pr'].GetBinContent(ipt+1) - eff_np = self.hcandeff['np'].GetBinContent(ipt+1) - if np.isclose(eff_pr, 0.): - self.logger.error('Efficiency zero for %s in pt bin %d, continuing', var, ipt) - continue # TODO: how should we handle this? - for iptjet, ishape in itertools.product( - range(get_nbins(h3_fd_gen, 0)), range(get_nbins(h3_fd_gen, 2))): - scale_bin(h3_fd_gen, eff_np/eff_pr, iptjet+1, ipt+1, ishape+1) + eff_pr = self.hcandeff["pr"].GetBinContent(ipt + 1) + eff_np = self.hcandeff["np"].GetBinContent(ipt + 1) + if np.isclose(eff_pr, 0.0): + self.logger.error("Efficiency zero for %s in pt bin %d, continuing", var, ipt) + continue # TODO: how should we handle this? 
+ for iptjet, ishape in itertools.product(range(get_nbins(h3_fd_gen, 0)), range(get_nbins(h3_fd_gen, 2))): + scale_bin(h3_fd_gen, eff_np / eff_pr, iptjet + 1, ipt + 1, ishape + 1) h_fd_gen = project_hist(h3_fd_gen, [0, 2], {}) - self._save_hist(h_fd_gen, f'fd/h_ptjet-{var}_feeddown_gen_effscaled.png') + self._save_hist(h_fd_gen, f"fd/h_ptjet-{var}_feeddown_gen_effscaled.png") with TFile(self.n_fileeff) as rfile: h_effkine_gen = self._build_effkine( - rfile.Get(f'h_effkine_np_gen_nocuts_{var}'), - rfile.Get(f'h_effkine_np_gen_cut_{var}')) - self._save_hist(h_effkine_gen, f'fd/h_effkine-ptjet-{var}_np_gen.png', 'text') + rfile.Get(f"h_effkine_np_gen_nocuts_{var}"), rfile.Get(f"h_effkine_np_gen_cut_{var}") + ) + self._save_hist(h_effkine_gen, f"fd/h_effkine-ptjet-{var}_np_gen.png", "text") # ROOT complains about different bin limits because fN is 0 for the histogram from file, ROOT bug? ensure_sumw2(h_fd_gen) h_fd_gen.Multiply(h_effkine_gen) - self._save_hist(h_fd_gen, f'fd/h_ptjet-{var}_feeddown_gen_kineeffscaled.png') + self._save_hist(h_fd_gen, f"fd/h_ptjet-{var}_feeddown_gen_kineeffscaled.png") - h_response = rfile.Get(f'h_response_np_{var}') - response_matrix_np = self._build_response_matrix(h_response, self.hcandeff['pr']) - self._save_hist(response_matrix_np.Hresponse(), f'fd/h_ptjet-{var}_responsematrix_np_lin.png', 'colz') + h_response = rfile.Get(f"h_response_np_{var}") + response_matrix_np = self._build_response_matrix(h_response, self.hcandeff["pr"]) + self._save_hist(response_matrix_np.Hresponse(), f"fd/h_ptjet-{var}_responsematrix_np_lin.png", "colz") hfeeddown_det = response_matrix_np.Hmeasured().Clone() hfeeddown_det.Reset() ensure_sumw2(hfeeddown_det) hfeeddown_det = folding(h_fd_gen, response_matrix_np, hfeeddown_det) - self._save_hist(hfeeddown_det, f'fd/h_ptjet-{var}_feeddown_det.png') + self._save_hist(hfeeddown_det, f"fd/h_ptjet-{var}_feeddown_det.png") h_effkine_det = self._build_effkine( - rfile.Get(f'h_effkine_np_det_nocuts_{var}'), - rfile.Get(f'h_effkine_np_det_cut_{var}')) - self._save_hist(h_effkine_det, f'fd/h_effkine-ptjet-{var}_np_det.png','text') + rfile.Get(f"h_effkine_np_det_nocuts_{var}"), rfile.Get(f"h_effkine_np_det_cut_{var}") + ) + self._save_hist(h_effkine_det, f"fd/h_effkine-ptjet-{var}_np_det.png", "text") hfeeddown_det.Divide(h_effkine_det) - self._save_hist(hfeeddown_det, f'fd/h_ptjet-{var}_feeddown_det_kineeffscaled.png') + self._save_hist(hfeeddown_det, f"fd/h_ptjet-{var}_feeddown_det_kineeffscaled.png") - if self.cfg('fd_folding_method') == '3d': - self.logger.info('using 3d folding for feeddown estimation for %s', var) + if self.cfg("fd_folding_method") == "3d": + self.logger.info("using 3d folding for feeddown estimation for %s", var) hfeeddown_det = h_fd_det # TODO: check scaling - hfeeddown_det.Scale(powheg_xsection_scale_factor * self.cfg('branching_ratio')) + hfeeddown_det.Scale(powheg_xsection_scale_factor * self.cfg("branching_ratio")) hfeeddown_det_mc = hfeeddown_det.Clone() - hfeeddown_det_mc.SetName(hfeeddown_det_mc.GetName() + '_mc') - luminosity_data = (self.n_colls_read['data'] / self.n_colls_tvx['data'] * - self.n_bcs_tvx['data'] / self.cfg('xsection_inel')) + hfeeddown_det_mc.SetName(hfeeddown_det_mc.GetName() + "_mc") + luminosity_data = ( + self.n_colls_read["data"] + / self.n_colls_tvx["data"] + * self.n_bcs_tvx["data"] + / self.cfg("xsection_inel") + ) self.logger.info("Scaling feed-down with data luminosity (mb^{-1}): %g", luminosity_data) hfeeddown_det.Scale(luminosity_data) - luminosity_mc = (self.n_colls_read['mc'] / 
self.n_colls_tvx['mc'] * - self.n_bcs_tvx['mc'] / self.cfg('xsection_inel') * self.cfg('lumi_scale_mc')) + luminosity_mc = ( + self.n_colls_read["mc"] + / self.n_colls_tvx["mc"] + * self.n_bcs_tvx["mc"] + / self.cfg("xsection_inel") + * self.cfg("lumi_scale_mc") + ) self.logger.info("Scaling feed-down with MC luminosity (mb^{-1}): %g", luminosity_mc) hfeeddown_det_mc.Scale(luminosity_mc) - self._save_hist(hfeeddown_det, f'fd/h_ptjet-{var}_feeddown_det_final_data.png') - self._save_hist(hfeeddown_det_mc, f'fd/h_ptjet-{var}_feeddown_det_final_mc.png') - self.hfeeddown_det['data'][var] = hfeeddown_det - self.hfeeddown_det['mc'][var] = hfeeddown_det_mc - + self._save_hist(hfeeddown_det, f"fd/h_ptjet-{var}_feeddown_det_final_data.png") + self._save_hist(hfeeddown_det_mc, f"fd/h_ptjet-{var}_feeddown_det_final_mc.png") + self.hfeeddown_det["data"][var] = hfeeddown_det + self.hfeeddown_det["mc"][var] = hfeeddown_det_mc def _build_effkine(self, h_nocuts, h_cuts): h_cuts = h_cuts.Clone() @@ -1128,36 +1223,34 @@ def _build_effkine(self, h_nocuts, h_cuts): h_cuts.Divide(h_nocuts) return h_cuts - - def _build_response_matrix(self, h_response, h_eff = None, frac_flat = 0.): + def _build_response_matrix(self, h_response, h_eff=None, frac_flat=0.0): dim = (get_dim(h_response) - 1) // 2 self.logger.info("Building %i-dim response matrix from %s", dim, h_response) rm = ROOT.RooUnfoldResponse( - project_hist(h_response, list(range(dim)), {}), project_hist(h_response, list(range(dim, 2 * dim)), {})) + project_hist(h_response, list(range(dim)), {}), project_hist(h_response, list(range(dim, 2 * dim)), {}) + ) h_gen = project_hist(h_response, list(range(dim, 2 * dim)), {}) - x = (enumerate(list(get_axis(h_response, iaxis).GetXbins())[:-1], 1) for iaxis in range(2*dim+1)) + x = (enumerate(list(get_axis(h_response, iaxis).GetXbins())[:-1], 1) for iaxis in range(2 * dim + 1)) for hbin in itertools.product(*x): - n = h_response.GetBinContent( - np.asarray([hbin[i][0] for i in range(2*dim+1)], 'i')) - eff = h_eff.GetBinContent(hbin[2*dim][0]) if h_eff else 1. - if np.isclose(eff, 0.): - self.logger.error('efficiency 0 for %s', hbin[4]) + n = h_response.GetBinContent(np.asarray([hbin[i][0] for i in range(2 * dim + 1)], "i")) + eff = h_eff.GetBinContent(hbin[2 * dim][0]) if h_eff else 1.0 + if np.isclose(eff, 0.0): + self.logger.error("efficiency 0 for %s", hbin[4]) continue - if (cnt_gen := h_gen.GetBinContent(*(hbin[i][0] for i in range(dim, 2*dim)))) > 0.: - fac = 1. - if frac_flat > 0.: - fac += frac_flat * (1. / cnt_gen - 1.) 
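+ # The factor applied below blends the MC prior with a flat prior:
+ # fac = 1 + frac_flat * (1 / cnt_gen - 1) = (1 - frac_flat) + frac_flat / cnt_gen,
+ # so, summed over its cnt_gen entries (and on top of the 1/eff weight), each gen bin keeps
+ # a fraction (1 - frac_flat) of its MC content plus a constant flat share frac_flat.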
+ if (cnt_gen := h_gen.GetBinContent(*(hbin[i][0] for i in range(dim, 2 * dim)))) > 0.0: + fac = 1.0 + if frac_flat > 0.0: + fac += frac_flat * (1.0 / cnt_gen - 1.0) for _ in range(int(n)): - rm.Fill(*(hbin[iaxis][1] for iaxis in range(2*dim)), 1./eff * fac) + rm.Fill(*(hbin[iaxis][1] for iaxis in range(2 * dim)), 1.0 / eff * fac) # rm.Mresponse().Print() return rm - def _subtract_feeddown(self, hist, var, mcordata): if var not in self.hfeeddown_det[mcordata]: if var is not None: - self.logger.error('No feeddown information available for %s, cannot subtract', var) + self.logger.error("No feeddown information available for %s, cannot subtract", var) return if h_fd := self.hfeeddown_det[mcordata][var]: if get_dim(hist) == 1: @@ -1165,78 +1258,83 @@ def _subtract_feeddown(self, hist, var, mcordata): assert get_dim(h_fd) == get_dim(hist) hist.Add(h_fd, -1) else: - self.logger.error('No feeddown estimation available for %s (%s)', var, mcordata) + self.logger.error("No feeddown estimation available for %s (%s)", var, mcordata) - - #region unfolding + # region unfolding def _unfold(self, hist, var, mcordata): - self.logger.info('Unfolding for %s', var) - suffix = '_frac' if mcordata == 'mc' else '' + self.logger.info("Unfolding for %s", var) + suffix = "_frac" if mcordata == "mc" else "" with TFile(self.n_fileeff) as rfile: - h_response = rfile.Get(f'h_response_pr_{var}{suffix}') + h_response = rfile.Get(f"h_response_pr_{var}{suffix}") if not h_response: - self.logger.error('Response matrix for %s not available, cannot unfold', var + suffix) + self.logger.error("Response matrix for %s not available, cannot unfold", var + suffix) return [] response_matrix_pr = self._build_response_matrix( - h_response, self.hcandeff['pr'] if mcordata == 'data' else None, - self.cfg('unfolding_prior_flatness', 0.)) - self._save_hist(response_matrix_pr.Hresponse(), - f'uf/h_ptjet-{var}-responsematrix_pr_lin_{mcordata}.png', 'colz') + h_response, + self.hcandeff["pr"] if mcordata == "data" else None, + self.cfg("unfolding_prior_flatness", 0.0), + ) + self._save_hist( + response_matrix_pr.Hresponse(), f"uf/h_ptjet-{var}-responsematrix_pr_lin_{mcordata}.png", "colz" + ) h_effkine_det = self._build_effkine( - rfile.Get(f'h_effkine_pr_det_nocuts_{var}{suffix}'), - rfile.Get(f'h_effkine_pr_det_cut_{var}{suffix}')) - self._save_hist(h_effkine_det, f'uf/h_effkine-ptjet-{var}_pr_det_{mcordata}.png', 'text') + rfile.Get(f"h_effkine_pr_det_nocuts_{var}{suffix}"), rfile.Get(f"h_effkine_pr_det_cut_{var}{suffix}") + ) + self._save_hist(h_effkine_det, f"uf/h_effkine-ptjet-{var}_pr_det_{mcordata}.png", "text") - fh_unfolding_input = hist.Clone('fh_unfolding_input') + fh_unfolding_input = hist.Clone("fh_unfolding_input") if get_dim(fh_unfolding_input) != get_dim(h_effkine_det): - self.logger.error('histograms with different dimensions, cannot unfold') + self.logger.error("histograms with different dimensions, cannot unfold") return [] ensure_sumw2(fh_unfolding_input) fh_unfolding_input.Multiply(h_effkine_det) h_effkine_gen = self._build_effkine( - rfile.Get(f'h_effkine_pr_gen_nocuts_{var}{suffix}'), - rfile.Get(f'h_effkine_pr_gen_cut_{var}{suffix}')) - self._save_hist(h_effkine_gen, f'uf/h_effkine-ptjet-{var}_pr_gen_{mcordata}.png', 'text') + rfile.Get(f"h_effkine_pr_gen_nocuts_{var}{suffix}"), rfile.Get(f"h_effkine_pr_gen_cut_{var}{suffix}") + ) + self._save_hist(h_effkine_gen, f"uf/h_effkine-ptjet-{var}_pr_gen_{mcordata}.png", "text") # TODO: move, has nothing to do with unfolding - if mcordata == 'mc' and get_dim(hist) <= 2: - 
h_mctruth_pr = rfile.Get(f'h_ptjet-pthf-{var}_pr_gen') + if mcordata == "mc" and get_dim(hist) <= 2: + h_mctruth_pr = rfile.Get(f"h_ptjet-pthf-{var}_pr_gen") if h_mctruth_pr: h_mctruth_pr = project_hist(h_mctruth_pr, [0, 2], {}) - self._save_hist(h_mctruth_pr, f'h_ptjet-{var}_pr_mctruth.png', 'texte') + self._save_hist(h_mctruth_pr, f"h_ptjet-{var}_pr_mctruth.png", "texte") h_mctruth_all = h_mctruth_pr.Clone() - h_mctruth_np = rfile.Get(f'h_ptjet-pthf-{var}_np_gen') + h_mctruth_np = rfile.Get(f"h_ptjet-pthf-{var}_np_gen") if h_mctruth_np: h_mctruth_np = project_hist(h_mctruth_np, [0, 2], {}) - self._save_hist(h_mctruth_np, f'h_ptjet-{var}_np_mctruth.png', 'texte') + self._save_hist(h_mctruth_np, f"h_ptjet-{var}_np_mctruth.png", "texte") h_mctruth_all.Add(h_mctruth_np) - self._save_hist(h_mctruth_all, f'h_ptjet-{var}_all_mctruth.png', 'texte') + self._save_hist(h_mctruth_all, f"h_ptjet-{var}_all_mctruth.png", "texte") h_unfolding_output = [] - for n in range(self.cfg('unfolding_iterations', 8)): + for n in range(self.cfg("unfolding_iterations", 8)): unfolding_object = ROOT.RooUnfoldBayes(response_matrix_pr, fh_unfolding_input, n + 1) fh_unfolding_output = unfolding_object.Hreco(2) - self._save_hist(fh_unfolding_output, f'uf/h_ptjet-{var}_{mcordata}_unfold{n}.png', 'texte') + self._save_hist(fh_unfolding_output, f"uf/h_ptjet-{var}_{mcordata}_unfold{n}.png", "texte") ensure_sumw2(fh_unfolding_output) fh_unfolding_output.Divide(h_effkine_gen) - self._save_hist(fh_unfolding_output, f'uf/h_ptjet-{var}_{mcordata}_unfoldeffcorr{n}.png', 'texte') + self._save_hist(fh_unfolding_output, f"uf/h_ptjet-{var}_{mcordata}_unfoldeffcorr{n}.png", "texte") h_unfolding_output.append(fh_unfolding_output) - if mcordata == 'mc' and get_dim(hist) <= 2: + if mcordata == "mc" and get_dim(hist) <= 2: if h_mctruth_pr: h_mcunfolded = fh_unfolding_output.Clone() h_mcunfolded.Divide(h_mctruth_pr) - self._save_hist(h_mcunfolded, f'uf/h_ptjet-{var}_{mcordata}_closure{n}.png', 'texte') + self._save_hist(h_mcunfolded, f"uf/h_ptjet-{var}_{mcordata}_closure{n}.png", "texte") axis_ptjet = get_axis(h_mcunfolded, 0) for iptjet in range(get_nbins(h_mcunfolded, 0)): - h = project_hist(h_mcunfolded, [1], {0: (iptjet+1,iptjet+1)}) + h = project_hist(h_mcunfolded, [1], {0: (iptjet + 1, iptjet + 1)}) range_ptjet = get_bin_limits(axis_ptjet, iptjet + 1) - self._save_hist(h, f'uf/h_{var}_{mcordata}_closure{n}' + - f'_{string_range_ptjet(range_ptjet)}.png', 'texte') + self._save_hist( + h, + f"uf/h_{var}_{mcordata}_closure{n}" + f"_{string_range_ptjet(range_ptjet)}.png", + "texte", + ) else: - self.logger.error('Could not find histogram %s', f'h_mctruth_pr_{var}') + self.logger.error("Could not find histogram %s", f"h_mctruth_pr_{var}") rfile.ls() h_refolding_input = fh_unfolding_output.Clone() @@ -1245,10 +1343,10 @@ def _unfold(self, hist, var, mcordata): h_refolding_output.Reset() h_refolding_output = folding(h_refolding_input, response_matrix_pr, h_refolding_output) h_refolding_output.Divide(h_effkine_det) - self._save_hist(h_refolding_output, f'uf/h_ptjet-{var}_{mcordata}_refold{n}.png', 'texte') + self._save_hist(h_refolding_output, f"uf/h_ptjet-{var}_{mcordata}_refold{n}.png", "texte") h_refolding_output.Divide(fh_unfolding_input) - self._save_hist(h_refolding_output, f'uf/h_ptjet-{var}_{mcordata}_refoldratio{n}.png', 'texte') + self._save_hist(h_refolding_output, f"uf/h_ptjet-{var}_{mcordata}_refoldratio{n}.png", "texte") # TODO: save as 1d projections return h_unfolding_output diff --git 
a/machine_learning_hep/analysis/analyzer_manager.py b/machine_learning_hep/analysis/analyzer_manager.py index 9fc5472656..81c0599d29 100644 --- a/machine_learning_hep/analysis/analyzer_manager.py +++ b/machine_learning_hep/analysis/analyzer_manager.py @@ -12,6 +12,7 @@ from machine_learning_hep.logger import get_logger + # pylint: disable=too-many-instance-attributes class AnalyzerManager: """ @@ -19,7 +20,6 @@ class AnalyzerManager: """ def __init__(self, ana_class, database, case, typean, doperiodbyperiod, *args): - self.ana_class = ana_class self.database = database self.case = case @@ -36,7 +36,6 @@ def __init__(self, ana_class, database, case, typean, doperiodbyperiod, *args): self.is_initialized = False - def get_analyzers(self, none_for_unused_period=True): self.initialize() if not none_for_unused_period: @@ -50,7 +49,6 @@ def get_analyzers(self, none_for_unused_period=True): analyzers[-1] = self.analyzers[-1] return analyzers - def initialize(self): """ Collect all required analyzer objects in a list and initialise the after_burner if present """ @@ -65,10 +63,8 @@ def initialize(self): for ip, period in enumerate(useperiod): if self.doperiodbyperiod and period: - self.analyzers.append(self.ana_class(self.database, self.case, self.typean, ip, - *self.add_args)) - self.analyzers.append(self.ana_class(self.database, self.case, self.typean, None, - *self.add_args)) + self.analyzers.append(self.ana_class(self.database, self.case, self.typean, ip, *self.add_args)) + self.analyzers.append(self.ana_class(self.database, self.case, self.typean, None, *self.add_args)) if self.doperiodbyperiod: # get after-burner, if any @@ -79,7 +75,6 @@ def initialize(self): self.is_initialized = True - def analyze(self, ana_steps): """ Given a list of analyzers and analysis steps, do each step for each analyzer :param ana_steps: analysis steps to be done """ if not ana_steps: - self.logger.info("No analysis steps to be done for Analyzer class %s. 
Return...", self.ana_class.__name__) return self.initialize() - self.logger.info("Run all registered analyzers of type %s for following analysis steps: %s", - self.ana_class.__name__, ana_steps) + self.logger.info( + "Run all registered analyzers of type %s for following analysis steps: %s", + self.ana_class.__name__, + ana_steps, + ) # Collect potentially failed systematic steps failed_steps = [] diff --git a/machine_learning_hep/analysis/analyzerdhadrons.py b/machine_learning_hep/analysis/analyzerdhadrons.py index b0631c24bf..4ebe18f24b 100644 --- a/machine_learning_hep/analysis/analyzerdhadrons.py +++ b/machine_learning_hep/analysis/analyzerdhadrons.py @@ -15,26 +15,54 @@ """ main script for doing final stage analysis """ + # pylint: disable=too-many-lines import os -from pathlib import Path from array import array +from pathlib import Path + import numpy as np + # pylint: disable=unused-wildcard-import, wildcard-import # pylint: disable=import-error, no-name-in-module, unused-import, consider-using-f-string -from ROOT import TFile, TH1F, TH2F, TCanvas, TPad, TF1, TH1 -from ROOT import gStyle, TLegend, TLine, TText, TPaveText, TArrow -from ROOT import gROOT, TDirectory, TPaveLabel -from ROOT import gInterpreter, gPad -from ROOT import kBlue, kCyan -from machine_learning_hep.fitting.roofitter import RooFitter, calc_signif -from machine_learning_hep.fitting.roofitter import create_text_info, add_text_info_fit, add_text_info_perf +from ROOT import ( + TF1, + TH1, + TH1F, + TH2F, + TArrow, + TCanvas, + TDirectory, + TFile, + TLegend, + TLine, + TPad, + TPaveLabel, + TPaveText, + TText, + gInterpreter, + gPad, + gROOT, + gStyle, + kBlue, + kCyan, +) + +from machine_learning_hep.analysis.analyzer import Analyzer + # HF specific imports from machine_learning_hep.fitting.helpers import MLFitter -from machine_learning_hep.logger import get_logger -from machine_learning_hep.analysis.analyzer import Analyzer +from machine_learning_hep.fitting.roofitter import ( + RooFitter, + add_text_info_fit, + add_text_info_perf, + calc_signif, + create_text_info, +) from machine_learning_hep.hf_pt_spectrum import hf_pt_spectrum -from machine_learning_hep.utils.hist import (get_dim, project_hist) +from machine_learning_hep.logger import get_logger +from machine_learning_hep.utils.hist import get_dim, project_hist + # pylint: disable=too-few-public-methods, too-many-instance-attributes, too-many-statements, fixme # pylint: disable=consider-using-enumerate fixme @@ -57,110 +85,102 @@ def __init__(self, datap, case, typean, period): dp = datap["analysis"][self.typean] self.d_prefix_mc = dp["mc"].get("prefix_dir_res") self.d_prefix_data = dp["data"].get("prefix_dir_res") - self.d_resultsallpmc = self.d_prefix_mc + dp["mc"]["results"][period] \ - if period is not None \ + self.d_resultsallpmc = ( + self.d_prefix_mc + dp["mc"]["results"][period] + if period is not None else self.d_prefix_mc + dp["mc"]["resultsallp"] - self.d_resultsallpdata = + dp["data"]["results"][period] \ - if period is not None \ - else self.d_prefix_data + dp["data"]["resultsallp"] + ) + self.d_resultsallpdata = ( + +dp["data"]["results"][period] if period is not None else self.d_prefix_data + dp["data"]["resultsallp"] + ) n_filemass_name = datap["files_names"]["histofilename"] self.n_filemass = os.path.join(self.d_resultsallpdata, n_filemass_name) - self.n_filemass_mc = os.path.join( - self.d_resultsallpmc, n_filemass_name) + self.n_filemass_mc = os.path.join(self.d_resultsallpmc, n_filemass_name) self.mltype = datap["ml"]["mltype"] # Output 
directories and filenames self.yields_filename = "yields" - self.fits_dirname = os.path.join( - self.d_resultsallpdata, f"fits_{case}_{typean}") + self.fits_dirname = os.path.join(self.d_resultsallpdata, f"fits_{case}_{typean}") self.yields_syst_filename = "yields_syst" self.efficiency_filename = "efficiencies" self.sideband_subtracted_filename = "sideband_subtracted" self.n_fileff = datap["files_names"]["efffilename"] self.n_fileff = os.path.join(self.d_resultsallpmc, self.n_fileff) - self.p_bin_width = datap["analysis"][self.typean]['bin_width'] - self.p_rebin = datap["analysis"][self.typean]['n_rebin'] - self.p_pdfnames = datap["analysis"][self.typean]['pdf_names'] - self.p_param_names = datap["analysis"][self.typean]['param_names'] + self.p_bin_width = datap["analysis"][self.typean]["bin_width"] + self.p_rebin = datap["analysis"][self.typean]["n_rebin"] + self.p_pdfnames = datap["analysis"][self.typean]["pdf_names"] + self.p_param_names = datap["analysis"][self.typean]["param_names"] self.p_latexnhadron = datap["analysis"][self.typean]["latexnamehadron"] - self.p_dobkgfromsideband = datap["analysis"][self.typean].get( - "dobkgfromsideband", None) + self.p_dobkgfromsideband = datap["analysis"][self.typean].get("dobkgfromsideband", None) if self.p_dobkgfromsideband is None: self.p_dobkgfromsideband = False # More specific fit options - self.include_reflection = datap["analysis"][self.typean].get( - "include_reflection", False) + self.include_reflection = datap["analysis"][self.typean].get("include_reflection", False) self.p_sigmamb = datap["analysis"]["sigmamb"] self.p_br = datap["ml"]["opt"]["BR"] - self.bins_candpt = np.asarray(self.cfg('sel_an_binmin', []) + self.cfg('sel_an_binmax', [])[-1:], 'd') + self.bins_candpt = np.asarray(self.cfg("sel_an_binmin", []) + self.cfg("sel_an_binmax", [])[-1:], "d") self.nbins = len(self.bins_candpt) - 1 - self.fit_levels = self.cfg('fit_levels', ['mc', 'data']) + self.fit_levels = self.cfg("fit_levels", ["mc", "data"]) self.fit_sigma = {} self.fit_mean = {} self.fit_func_bkg = {} self.fit_range = {} - self.path_fig = Path(f'fig/{self.case}/{self.typean}') - for folder in ['qa', 'fit', 'roofit', 'sideband', 'signalextr', 'fd', 'uf']: + self.path_fig = Path(f"fig/{self.case}/{self.typean}") + for folder in ["qa", "fit", "roofit", "sideband", "signalextr", "fd", "uf"]: (self.path_fig / folder).mkdir(parents=True, exist_ok=True) - self.rfigfile = TFile(str(self.path_fig / 'output.root'), 'recreate') + self.rfigfile = TFile(str(self.path_fig / "output.root"), "recreate") self.fitter = RooFitter() self.roo_ws = {} self.roows = {} # Systematics - self.mt_syst_dict = datap["analysis"][self.typean].get( - "systematics", None) - self.d_mt_results_path = os.path.join( - self.d_resultsallpdata, "multi_trial") + self.mt_syst_dict = datap["analysis"][self.typean].get("systematics", None) + self.d_mt_results_path = os.path.join(self.d_resultsallpdata, "multi_trial") self.p_anahpt = datap["analysis"]["anahptspectrum"] self.p_fd_method = datap["analysis"]["fd_method"] self.p_cctype = datap["analysis"]["cctype"] self.p_inputfonllpred = datap["analysis"]["inputfonllpred"] self.p_triggereff = datap["analysis"][self.typean].get("triggereff", [1]) - self.p_triggereffunc = datap["analysis"][self.typean].get( - "triggereffunc", [0]) + self.p_triggereffunc = datap["analysis"][self.typean].get("triggereffunc", [0]) self.root_objects = [] # Fitting - self.p_performval = datap["analysis"].get( - "event_cand_validation", None) - + self.p_performval = 
datap["analysis"].get("event_cand_validation", None) - #region helpers + # region helpers def _save_canvas(self, canvas, filename): # folder = self.d_resultsallpmc if mcordata == 'mc' else self.d_resultsallpdata - canvas.SaveAs(f'fig/{self.case}/{self.typean}/{filename}') + canvas.SaveAs(f"fig/{self.case}/{self.typean}/{filename}") - - def _save_hist(self, hist, filename, option = ''): + def _save_hist(self, hist, filename, option=""): if not hist: - self.logger.error('no histogram for <%s>', filename) + self.logger.error("no histogram for <%s>", filename) # TODO: remove file if it exists? return c = TCanvas() - if isinstance(hist, TH1) and get_dim(hist) == 2 and 'texte' not in option: - option += 'texte' + if isinstance(hist, TH1) and get_dim(hist) == 2 and "texte" not in option: + option += "texte" hist.Draw(option) self._save_canvas(c, filename) - rfilename = filename.split('/')[-1] - rfilename = rfilename.removesuffix('.png') + rfilename = filename.split("/")[-1] + rfilename = rfilename.removesuffix(".png") self.rfigfile.WriteObject(hist, rfilename) - #region fitting - def _roofit_mass(self, level, hist, ipt, pdfnames, param_names, fitcfg, roows = None, filename = None): + # region fitting + def _roofit_mass(self, level, hist, ipt, pdfnames, param_names, fitcfg, roows=None, filename=None): if fitcfg is None: return None, None res, ws, frame, residual_frame = self.fitter.fit_mass_new(hist, pdfnames, fitcfg, level, roows, True) - frame.SetTitle(f'inv. mass for p_{{T}} {self.bins_candpt[ipt]} - {self.bins_candpt[ipt+1]} GeV/c') + frame.SetTitle(f"inv. mass for p_{{T}} {self.bins_candpt[ipt]} - {self.bins_candpt[ipt + 1]} GeV/c") c = TCanvas() textInfoRight = create_text_info(0.62, 0.68, 1.0, 0.89) @@ -170,9 +190,9 @@ def _roofit_mass(self, level, hist, ipt, pdfnames, param_names, fitcfg, roows = if level == "data": mean_sgn = ws.var(self.p_param_names["gauss_mean"]) sigma_sgn = ws.var(self.p_param_names["gauss_sigma"]) - (sig, sig_err, bkg, bkg_err, - signif, signif_err, s_over_b, s_over_b_err - ) = calc_signif(ws, res, pdfnames, param_names, mean_sgn, sigma_sgn) + (sig, sig_err, bkg, bkg_err, signif, signif_err, s_over_b, s_over_b_err) = calc_signif( + ws, res, pdfnames, param_names, mean_sgn, sigma_sgn + ) add_text_info_perf(textInfoLeft, sig, sig_err, bkg, bkg_err, s_over_b, s_over_b_err, signif, signif_err) @@ -183,38 +203,39 @@ def _roofit_mass(self, level, hist, ipt, pdfnames, param_names, fitcfg, roows = if res.status() == 0: self._save_canvas(c, filename) else: - self.logger.warning('Invalid fit result for %s', hist.GetName()) + self.logger.warning("Invalid fit result for %s", hist.GetName()) # func_tot.Print('v') - filename = filename.replace('.png', '_invalid.png') + filename = filename.replace(".png", "_invalid.png") self._save_canvas(c, filename) if level == "data": - residual_frame.SetTitle(f'inv. mass for p_{{T}} {self.bins_candpt[ipt]} - {self.bins_candpt[ipt+1]} GeV/c') + residual_frame.SetTitle( + f"inv. 
mass for p_{{T}} {self.bins_candpt[ipt]} - {self.bins_candpt[ipt + 1]} GeV/c" + ) cres = TCanvas() residual_frame.Draw() - filename = filename.replace('.png', '_residual.png') + filename = filename.replace(".png", "_residual.png") self._save_canvas(cres, filename) return res, ws - - def _fit_mass(self, hist, filename = None): + def _fit_mass(self, hist, filename=None): if hist.GetEntries() == 0: - raise UserWarning('Cannot fit histogram with no entries') - fit_range = self.cfg('mass_fit.range') - func_sig = TF1('funcSig', self.cfg('mass_fit.func_sig'), *fit_range) - func_bkg = TF1('funcBkg', self.cfg('mass_fit.func_bkg'), *fit_range) + raise UserWarning("Cannot fit histogram with no entries") + fit_range = self.cfg("mass_fit.range") + func_sig = TF1("funcSig", self.cfg("mass_fit.func_sig"), *fit_range) + func_bkg = TF1("funcBkg", self.cfg("mass_fit.func_bkg"), *fit_range) par_offset = func_sig.GetNpar() - func_tot = TF1('funcTot', f"{self.cfg('mass_fit.func_sig')} + {self.cfg('mass_fit.func_bkg')}({par_offset})") - func_tot.SetParameter(0, hist.GetMaximum()/3.) # TODO: better seeding? - for par, value in self.cfg('mass_fit.par_start', {}).items(): - self.logger.debug('Setting par %i to %g', par, value) + func_tot = TF1("funcTot", f"{self.cfg('mass_fit.func_sig')} + {self.cfg('mass_fit.func_bkg')}({par_offset})") + func_tot.SetParameter(0, hist.GetMaximum() / 3.0) # TODO: better seeding? + for par, value in self.cfg("mass_fit.par_start", {}).items(): + self.logger.debug("Setting par %i to %g", par, value) func_tot.SetParameter(par, value) - for par, value in self.cfg('mass_fit.par_constrain', {}).items(): - self.logger.debug('Constraining par %i to (%g, %g)', par, value[0], value[1]) + for par, value in self.cfg("mass_fit.par_constrain", {}).items(): + self.logger.debug("Constraining par %i to (%g, %g)", par, value[0], value[1]) func_tot.SetParLimits(par, value[0], value[1]) - for par, value in self.cfg('mass_fit.par_fix', {}).items(): - self.logger.debug('Fixing par %i to %g', par, value) + for par, value in self.cfg("mass_fit.par_fix", {}).items(): + self.logger.debug("Fixing par %i to %g", par, value) func_tot.FixParameter(par, value) fit_res = hist.Fit(func_tot, "SQL", "", fit_range[0], fit_range[1]) if fit_res and fit_res.Get() and fit_res.IsValid(): @@ -231,20 +252,19 @@ def _fit_mass(self, hist, filename = None): c = TCanvas() hist.Draw() func_sig.SetLineColor(kBlue) - func_sig.Draw('lsame') + func_sig.Draw("lsame") func_bkg.SetLineColor(kCyan) - func_bkg.Draw('lsame') + func_bkg.Draw("lsame") self._save_canvas(c, filename) else: - self.logger.warning('Invalid fit result for %s', hist.GetName()) + self.logger.warning("Invalid fit result for %s", hist.GetName()) # func_tot.Print('v') - filename = filename.replace('.png', '_invalid.png') + filename = filename.replace(".png", "_invalid.png") self._save_hist(hist, filename) # TODO: how to deal with this return (fit_res, func_sig, func_bkg) - # pylint: disable=too-many-branches,too-many-statements def fit(self): self.logger.info("Fitting inclusive mass distributions") @@ -258,108 +278,120 @@ def fit(self): rfilename = self.n_filemass_mc if "mc" in level else self.n_filemass fitcfg = None - fileout_name = self.make_file_path(self.d_resultsallpdata, self.yields_filename, "root", - None, [self.case, self.typean]) + fileout_name = self.make_file_path( + self.d_resultsallpdata, self.yields_filename, "root", None, [self.case, self.typean] + ) fileout = TFile(fileout_name, "RECREATE") - yieldshistos = TH1F("hyields0", "", \ - 
len(self.lpt_finbinmin), array("d", self.bins_candpt)) - meanhistos = TH1F("hmean0", "", \ - len(self.lpt_finbinmin), array("d", self.bins_candpt)) - sigmahistos = TH1F("hsigmas0", "", \ - len(self.lpt_finbinmin), array("d", self.bins_candpt)) - signifhistos = TH1F("hsignifs0", "", \ - len(self.lpt_finbinmin), array("d", self.bins_candpt)) - soverbhistos = TH1F("hSoverB0", "", \ - len(self.lpt_finbinmin), array("d", self.bins_candpt)) + yieldshistos = TH1F("hyields0", "", len(self.lpt_finbinmin), array("d", self.bins_candpt)) + meanhistos = TH1F("hmean0", "", len(self.lpt_finbinmin), array("d", self.bins_candpt)) + sigmahistos = TH1F("hsigmas0", "", len(self.lpt_finbinmin), array("d", self.bins_candpt)) + signifhistos = TH1F("hsignifs0", "", len(self.lpt_finbinmin), array("d", self.bins_candpt)) + soverbhistos = TH1F("hSoverB0", "", len(self.lpt_finbinmin), array("d", self.bins_candpt)) with TFile(rfilename) as rfile: for ipt in range(len(self.lpt_finbinmin)): - self.logger.debug('fitting %s - %i', level, ipt) + self.logger.debug("fitting %s - %i", level, ipt) roows = self.roows.get(ipt) if self.mltype == "MultiClassification": - suffix = "%s%d_%d_%.2f%.2f%.2f" % \ - (self.v_var_binning, self.lpt_finbinmin[ipt], - self.lpt_finbinmax[ipt], self.lpt_probcutfin[ipt][0], - self.lpt_probcutfin[ipt][1], self.lpt_probcutfin[ipt][2]) + suffix = "%s%d_%d_%.2f%.2f%.2f" % ( + self.v_var_binning, + self.lpt_finbinmin[ipt], + self.lpt_finbinmax[ipt], + self.lpt_probcutfin[ipt][0], + self.lpt_probcutfin[ipt][1], + self.lpt_probcutfin[ipt][2], + ) else: - suffix = "%s%d_%d_%.2f" % \ - (self.v_var_binning, self.lpt_finbinmin[ipt], - self.lpt_finbinmax[ipt], self.lpt_probcutfin[ipt]) - h_invmass = rfile.Get('hmass' + suffix) + suffix = "%s%d_%d_%.2f" % ( + self.v_var_binning, + self.lpt_finbinmin[ipt], + self.lpt_finbinmax[ipt], + self.lpt_probcutfin[ipt], + ) + h_invmass = rfile.Get("hmass" + suffix) # Rebin h_invmass.Rebin(self.p_rebin[ipt]) - if h_invmass.GetEntries() < 100: # TODO: reconsider criterion - self.logger.error('Not enough entries to fit for %s bin %d', level, ipt) + if h_invmass.GetEntries() < 100: # TODO: reconsider criterion + self.logger.error("Not enough entries to fit for %s bin %d", level, ipt) continue - ptrange = (self.bins_candpt[ipt], self.bins_candpt[ipt+1]) + ptrange = (self.bins_candpt[ipt], self.bins_candpt[ipt + 1]) - if self.cfg('mass_fit'): + if self.cfg("mass_fit"): fit_res, _, func_bkg = self._fit_mass( - h_invmass, - f'fit/h_mass_fitted_pthf-{ptrange[0]}-{ptrange[1]}_{level}.png') + h_invmass, f"fit/h_mass_fitted_pthf-{ptrange[0]}-{ptrange[1]}_{level}.png" + ) if fit_res and fit_res.Get() and fit_res.IsValid(): self.fit_mean[level][ipt] = fit_res.Parameter(1) self.fit_sigma[level][ipt] = fit_res.Parameter(2) self.fit_func_bkg[level][ipt] = func_bkg else: - self.logger.error('Fit failed for %s bin %d', level, ipt) + self.logger.error("Fit failed for %s bin %d", level, ipt) - if self.cfg('mass_roofit'): - for entry in self.cfg('mass_roofit', []): - if lvl := entry.get('level'): + if self.cfg("mass_roofit"): + for entry in self.cfg("mass_roofit", []): + if lvl := entry.get("level"): if lvl != level: continue - if ptspec := entry.get('ptrange'): + if ptspec := entry.get("ptrange"): if ptspec[0] > ptrange[0] or ptspec[1] < ptrange[1]: continue fitcfg = entry break self.logger.debug("Using fit config for %i: %s", ipt, fitcfg) - if datasel := fitcfg.get('datasel'): - h = rfile.Get(f'h_mass-pthf_{datasel}') - h_invmass = project_hist(h, [0], {1: (ipt+1, ipt+1)}) # TODO: 
under-/overflow for jets + if datasel := fitcfg.get("datasel"): + h = rfile.Get(f"h_mass-pthf_{datasel}") + h_invmass = project_hist(h, [0], {1: (ipt + 1, ipt + 1)}) # TODO: under-/overflow for jets - for fixpar in fitcfg.get('fix_params', []): + for fixpar in fitcfg.get("fix_params", []): if roows.var(fixpar): roows.var(fixpar).setConstant(True) if h_invmass.GetEntries() == 0: continue roo_res, roo_ws = self._roofit_mass( - level, h_invmass, ipt, self.p_pdfnames, self.p_param_names, fitcfg, roows, - f'roofit/h_mass_fitted_pthf-{ptrange[0]}-{ptrange[1]}_{level}.png') + level, + h_invmass, + ipt, + self.p_pdfnames, + self.p_param_names, + fitcfg, + roows, + f"roofit/h_mass_fitted_pthf-{ptrange[0]}-{ptrange[1]}_{level}.png", + ) self.roo_ws[level][ipt] = roo_ws self.roows[ipt] = roo_ws if roo_res.status() == 0: - if level in ('data', 'mc_sig'): + if level in ("data", "mc_sig"): self.fit_mean[level][ipt] = roo_ws.var(self.p_param_names["gauss_mean"]).getValV() self.fit_sigma[level][ipt] = roo_ws.var(self.p_param_names["gauss_sigma"]).getValV() - var_m = fitcfg.get('var', 'm') + var_m = fitcfg.get("var", "m") pdf_bkg = roo_ws.pdf(self.p_pdfnames["pdf_bkg"]) if pdf_bkg: self.fit_func_bkg[level][ipt] = pdf_bkg.asTF(roo_ws.var(var_m)) - self.fit_range[level][ipt] = (roo_ws.var(var_m).getMin('fit'), \ - roo_ws.var(var_m).getMax('fit')) + self.fit_range[level][ipt] = ( + roo_ws.var(var_m).getMin("fit"), + roo_ws.var(var_m).getMax("fit"), + ) else: - self.logger.error('RooFit failed for %s bin %d', level, ipt) + self.logger.error("RooFit failed for %s bin %d", level, ipt) if level == "data": mean_sgn = roo_ws.var(self.p_param_names["gauss_mean"]) sigma_sgn = roo_ws.var(self.p_param_names["gauss_sigma"]) - (sig, sig_err, _, _, - signif, signif_err, s_over_b, s_over_b_err - ) = calc_signif(roo_ws, roo_res, self.p_pdfnames, self.p_param_names, mean_sgn, sigma_sgn) - - yieldshistos.SetBinContent(ipt+1, sig) - yieldshistos.SetBinError(ipt+1, sig_err) - meanhistos.SetBinContent(ipt+1, mean_sgn.getVal()) - meanhistos.SetBinError(ipt+1, mean_sgn.getError()) - sigmahistos.SetBinContent(ipt+1, sigma_sgn.getVal()) - sigmahistos.SetBinError(ipt+1, sigma_sgn.getError()) - signifhistos.SetBinContent(ipt+1, signif) - signifhistos.SetBinError(ipt+1, signif_err) - soverbhistos.SetBinContent(ipt+1, s_over_b) - soverbhistos.SetBinError(ipt+1, s_over_b_err) + (sig, sig_err, _, _, signif, signif_err, s_over_b, s_over_b_err) = calc_signif( + roo_ws, roo_res, self.p_pdfnames, self.p_param_names, mean_sgn, sigma_sgn + ) + + yieldshistos.SetBinContent(ipt + 1, sig) + yieldshistos.SetBinError(ipt + 1, sig_err) + meanhistos.SetBinContent(ipt + 1, mean_sgn.getVal()) + meanhistos.SetBinError(ipt + 1, mean_sgn.getError()) + sigmahistos.SetBinContent(ipt + 1, sigma_sgn.getVal()) + sigmahistos.SetBinError(ipt + 1, sigma_sgn.getError()) + signifhistos.SetBinContent(ipt + 1, signif) + signifhistos.SetBinError(ipt + 1, signif_err) + soverbhistos.SetBinContent(ipt + 1, s_over_b) + soverbhistos.SetBinError(ipt + 1, s_over_b_err) fileout.cd() yieldshistos.Write() meanhistos.Write() @@ -373,11 +405,9 @@ def yield_syst(self): tmp_is_root_batch = gROOT.IsBatch() gROOT.SetBatch(True) if not self.fitter: - self.fitter = MLFitter(self.case, self.datap, self.typean, - self.n_filemass, self.n_filemass_mc) + self.fitter = MLFitter(self.case, self.datap, self.typean, self.n_filemass, self.n_filemass_mc) if not self.fitter.load_fits(self.fits_dirname): - self.logger.error( - "Cannot load fits from dir %s", self.fits_dirname) + 
self.logger.error("Cannot load fits from dir %s", self.fits_dirname) return # Additional directory needed where the intermediate results of the multi trial are @@ -395,13 +425,12 @@ def efficiency(self): print(self.n_fileff) lfileeff = TFile.Open(self.n_fileff) lfileeff.ls() - fileouteff = TFile.Open("%s/efficiencies%s%s.root" % (self.d_resultsallpmc, - self.case, self.typean), "recreate") - cEff = TCanvas('cEff', 'The Fit Canvas') + fileouteff = TFile.Open("%s/efficiencies%s%s.root" % (self.d_resultsallpmc, self.case, self.typean), "recreate") + cEff = TCanvas("cEff", "The Fit Canvas") cEff.SetCanvasSize(1900, 1500) cEff.SetWindowSize(500, 500) - legeff = TLegend(.5, .65, .7, .85) + legeff = TLegend(0.5, 0.65, 0.7, 0.85) legeff.SetBorderSize(0) legeff.SetFillColor(0) legeff.SetFillStyle(0) @@ -416,19 +445,17 @@ def efficiency(self): h_sel_pr.SetName("eff") h_sel_pr.Write() h_sel_pr.GetXaxis().SetTitle("#it{p}_{T} (GeV/#it{c})") - h_sel_pr.GetYaxis().SetTitle("Acc x efficiency (prompt) %s %s (1/GeV)" - % (self.p_latexnhadron, self.typean)) + h_sel_pr.GetYaxis().SetTitle("Acc x efficiency (prompt) %s %s (1/GeV)" % (self.p_latexnhadron, self.typean)) h_sel_pr.SetMinimum(0.001) h_sel_pr.SetMaximum(1.0) gPad.SetLogy() - cEff.SaveAs("%s/Eff%s%s.eps" % (self.d_resultsallpmc, - self.case, self.typean)) + cEff.SaveAs("%s/Eff%s%s.eps" % (self.d_resultsallpmc, self.case, self.typean)) - cEffFD = TCanvas('cEffFD', 'The Fit Canvas') + cEffFD = TCanvas("cEffFD", "The Fit Canvas") cEffFD.SetCanvasSize(1900, 1500) cEffFD.SetWindowSize(500, 500) - legeffFD = TLegend(.5, .65, .7, .85) + legeffFD = TLegend(0.5, 0.65, 0.7, 0.85) legeffFD.SetBorderSize(0) legeffFD.SetFillColor(0) legeffFD.SetFillStyle(0) @@ -443,18 +470,15 @@ def efficiency(self): h_sel_fd.SetName("eff_fd") h_sel_fd.Write() h_sel_fd.GetXaxis().SetTitle("#it{p}_{T} (GeV/#it{c})") - h_sel_fd.GetYaxis().SetTitle("Acc x efficiency feed-down %s %s (1/GeV)" - % (self.p_latexnhadron, self.typean)) + h_sel_fd.GetYaxis().SetTitle("Acc x efficiency feed-down %s %s (1/GeV)" % (self.p_latexnhadron, self.typean)) h_sel_fd.SetMinimum(0.001) - h_sel_fd.SetMaximum(1.) 
+ h_sel_fd.SetMaximum(1.0) gPad.SetLogy() legeffFD.Draw() - cEffFD.SaveAs("%s/EffFD%s%s.eps" % (self.d_resultsallpmc, - self.case, self.typean)) - + cEffFD.SaveAs("%s/EffFD%s%s.eps" % (self.d_resultsallpmc, self.case, self.typean)) @staticmethod - def calculate_norm(logger, hevents, hselevents): #TO BE FIXED WITH EV SEL + def calculate_norm(logger, hevents, hselevents): # TO BE FIXED WITH EV SEL if not hevents: # pylint: disable=undefined-variable logger.error("Missing hevents") @@ -471,19 +495,17 @@ def makenormyields(self): # pylint: disable=import-outside-toplevel, too-many-b gROOT.SetBatch(True) self.loadstyle() - yield_filename = self.make_file_path(self.d_resultsallpdata, self.yields_filename, "root", - None, [self.case, self.typean]) + yield_filename = self.make_file_path( + self.d_resultsallpdata, self.yields_filename, "root", None, [self.case, self.typean] + ) if not os.path.exists(yield_filename): - self.logger.fatal( - "Yield file %s could not be found", yield_filename) + self.logger.fatal("Yield file %s could not be found", yield_filename) fileouteff = f"{self.d_resultsallpmc}/efficiencies{self.case}{self.typean}.root" if not os.path.exists(fileouteff): - self.logger.fatal( - "Efficiency file %s could not be found", fileouteff) + self.logger.fatal("Efficiency file %s could not be found", fileouteff) - fileoutcross = "%s/finalcross%s%s.root" % \ - (self.d_resultsallpdata, self.case, self.typean) + fileoutcross = "%s/finalcross%s%s.root" % (self.d_resultsallpdata, self.case, self.typean) namehistoeffprompt = "eff" namehistoefffeed = "eff_fd" @@ -500,35 +522,40 @@ def makenormyields(self): # pylint: disable=import-outside-toplevel, too-many-b self.logger.warning("Number of events after event selection %d", selnorm) if self.p_dobkgfromsideband: - fileoutbkg = TFile.Open("%s/Background_fromsidebands_%s_%s.root" % \ - (self.d_resultsallpdata, self.case, self.typean)) + fileoutbkg = TFile.Open( + "%s/Background_fromsidebands_%s_%s.root" % (self.d_resultsallpdata, self.case, self.typean) + ) hbkg = fileoutbkg.Get("hbkg_fromsidebands") - hbkg.Scale(1./selnorm) - fileoutbkgscaled = TFile.Open("%s/NormBackground_fromsidebands_%s_%s.root" % \ - (self.d_resultsallpdata, self.case, - self.typean), "RECREATE") + hbkg.Scale(1.0 / selnorm) + fileoutbkgscaled = TFile.Open( + "%s/NormBackground_fromsidebands_%s_%s.root" % (self.d_resultsallpdata, self.case, self.typean), + "RECREATE", + ) fileoutbkgscaled.cd() hbkg.Write() fileoutbkgscaled.Close() output_prompt = [] - hf_pt_spectrum(self.p_anahpt, - self.p_br, - self.p_inputfonllpred, - self.p_fd_method, - None, - fileouteff, - namehistoeffprompt, - namehistoefffeed, - yield_filename, - nameyield, - selnorm, - self.p_sigmamb, - output_prompt, - fileoutcross) - - fileoutcrosstot = TFile.Open("%s/finalcross%s%stot.root" % - (self.d_resultsallpdata, self.case, self.typean), "recreate") + hf_pt_spectrum( + self.p_anahpt, + self.p_br, + self.p_inputfonllpred, + self.p_fd_method, + None, + fileouteff, + namehistoeffprompt, + namehistoefffeed, + yield_filename, + nameyield, + selnorm, + self.p_sigmamb, + output_prompt, + fileoutcross, + ) + + fileoutcrosstot = TFile.Open( + "%s/finalcross%s%stot.root" % (self.d_resultsallpdata, self.case, self.typean), "recreate" + ) f_fileoutcross = TFile.Open(fileoutcross) if f_fileoutcross: diff --git a/machine_learning_hep/analysis/analyzerdhadrons_mult.py b/machine_learning_hep/analysis/analyzerdhadrons_mult.py index 90b2320098..078a039a7d 100644 --- a/machine_learning_hep/analysis/analyzerdhadrons_mult.py +++ 
b/machine_learning_hep/analysis/analyzerdhadrons_mult.py @@ -15,35 +15,67 @@ """ main script for doing final stage analysis """ + # pylint: disable=too-many-lines # pylint: disable=unused-wildcard-import, wildcard-import import os from array import array from pathlib import Path + import numpy as np + # pylint: disable=import-error, no-name-in-module, unused-import, consider-using-f-string -from ROOT import TFile, TH1, TH1F, TH2F, TCanvas, TPad, TF1, TH1D -from ROOT import gStyle, TLegend, TLine, TText, TPaveText, TArrow -from ROOT import gROOT, TDirectory, TPaveLabel -from ROOT import TStyle, kBlue, kCyan -from ROOT import gInterpreter, gPad +from ROOT import ( + TF1, + TH1, + TH1D, + TH1F, + TH2F, + TArrow, + TCanvas, + TDirectory, + TFile, + TLegend, + TLine, + TPad, + TPaveLabel, + TPaveText, + TStyle, + TText, + gInterpreter, + gPad, + gROOT, + gStyle, + kBlue, + kCyan, +) + +from machine_learning_hep.analysis.analyzer import Analyzer + # HF specific imports -from machine_learning_hep.fitting.roofitter import RooFitter, calc_signif -from machine_learning_hep.fitting.roofitter import create_text_info, add_text_info_fit, add_text_info_perf +from machine_learning_hep.fitting.roofitter import ( + RooFitter, + add_text_info_fit, + add_text_info_perf, + calc_signif, + create_text_info, +) +from machine_learning_hep.hf_pt_spectrum import hf_pt_spectrum from machine_learning_hep.logger import get_logger from machine_learning_hep.root import save_root_object -from machine_learning_hep.analysis.analyzer import Analyzer -from machine_learning_hep.hf_pt_spectrum import hf_pt_spectrum -from machine_learning_hep.utils.hist import (get_dim, project_hist) +from machine_learning_hep.utils.hist import get_dim, project_hist + + # pylint: disable=too-few-public-methods, too-many-instance-attributes, too-many-statements, fixme # pylint: disable=consider-using-enumerate, fixme -class AnalyzerDhadrons_mult(Analyzer): # pylint: disable=invalid-name +class AnalyzerDhadrons_mult(Analyzer): # pylint: disable=invalid-name species = "analyzer" + def __init__(self, datap, case, typean, period): super().__init__(datap, case, typean, period) self.logger = get_logger() self.logger.warning("TEST") - #namefiles pkl + # namefiles pkl self.v_var_binning = datap["var_binning"] self.lpt_finbinmin = datap["analysis"][self.typean]["sel_an_binmin"] self.lpt_finbinmax = datap["analysis"][self.typean]["sel_an_binmax"] @@ -61,17 +93,19 @@ def __init__(self, datap, case, typean, period): dp = datap["analysis"][typean] self.d_prefix_mc = dp["mc"].get("prefix_dir_res") self.d_prefix_data = dp["data"].get("prefix_dir_res") - self.d_resultsallpmc = (self.d_prefix_mc + - (dp["mc"]["results"][period] if period is not None else dp["mc"]["resultsallp"])) - self.d_resultsallpdata = (self.d_prefix_data + - (dp["data"]["results"][period] if period is not None else dp["data"]["resultsallp"])) + self.d_resultsallpmc = self.d_prefix_mc + ( + dp["mc"]["results"][period] if period is not None else dp["mc"]["resultsallp"] + ) + self.d_resultsallpdata = self.d_prefix_data + ( + dp["data"]["results"][period] if period is not None else dp["data"]["resultsallp"] + ) n_filemass_name = datap["files_names"]["histofilename"] self.n_filemass = os.path.join(self.d_resultsallpdata, n_filemass_name) self.n_filemass_mc = os.path.join(self.d_resultsallpmc, n_filemass_name) self.mltype = datap["ml"]["mltype"] self.n_filecross = datap["files_names"]["crossfilename"] - self.p_mass_fit_lim = datap["analysis"][self.typean]['mass_fit_lim'] + self.p_mass_fit_lim = 
datap["analysis"][self.typean]["mass_fit_lim"] # Output directories and filenames self.yields_filename = "yields" @@ -81,11 +115,11 @@ def __init__(self, datap, case, typean, period): self.n_fileff = datap["files_names"]["efffilename"] self.n_fileff = os.path.join(self.d_resultsallpmc, self.n_fileff) - self.p_bin_width = datap["analysis"][self.typean]['bin_width'] + self.p_bin_width = datap["analysis"][self.typean]["bin_width"] - self.p_rebin = datap["analysis"][self.typean]['n_rebin'] - self.p_pdfnames = datap["analysis"][self.typean]['pdf_names'] - self.p_param_names = datap["analysis"][self.typean]['param_names'] + self.p_rebin = datap["analysis"][self.typean]["n_rebin"] + self.p_pdfnames = datap["analysis"][self.typean]["pdf_names"] + self.p_param_names = datap["analysis"][self.typean]["param_names"] self.p_latexnhadron = datap["analysis"][self.typean]["latexnamehadron"] self.p_latexbin2var = datap["analysis"][self.typean]["latexbin2var"] @@ -102,19 +136,19 @@ def __init__(self, datap, case, typean, period): self.p_br = datap["ml"]["opt"]["BR"] # Roofit - self.bins_candpt = np.asarray(self.cfg('sel_an_binmin', []) + self.cfg('sel_an_binmax', [])[-1:], 'd') + self.bins_candpt = np.asarray(self.cfg("sel_an_binmin", []) + self.cfg("sel_an_binmax", [])[-1:], "d") self.nbins = len(self.bins_candpt) - 1 - self.fit_levels = self.cfg('fit_levels', ['mc', 'data']) + self.fit_levels = self.cfg("fit_levels", ["mc", "data"]) self.fit_sigma = {} self.fit_mean = {} self.fit_func_bkg = {} self.fit_range = {} - self.path_fig = Path(f'fig/{self.case}/{self.typean}') - for folder in ['qa', 'fit', 'roofit', 'sideband', 'signalextr', 'fd', 'uf']: + self.path_fig = Path(f"fig/{self.case}/{self.typean}") + for folder in ["qa", "fit", "roofit", "sideband", "signalextr", "fd", "uf"]: (self.path_fig / folder).mkdir(parents=True, exist_ok=True) - self.rfigfile = TFile(str(self.path_fig / 'output.root'), 'recreate') + self.rfigfile = TFile(str(self.path_fig / "output.root"), "recreate") self.fitter = RooFitter() self.roo_ws = {} @@ -127,58 +161,57 @@ def __init__(self, datap, case, typean, period): self.p_inputfonllpred = datap["analysis"]["inputfonllpred"] self.root_objects = [] - self.get_crossmb_from_path = datap["analysis"][self.typean].get("get_crossmb_from_path", \ - None) + self.get_crossmb_from_path = datap["analysis"][self.typean].get("get_crossmb_from_path", None) self.path_for_crossmb = datap["analysis"][self.typean].get("path_for_crossmb", None) # Take efficiencies from another analysis. self.path_file_eff = datap["analysis"][self.typean].get("path_eff", None) self.mult_bin_eff = datap["analysis"][self.typean].get("mult_bin_eff", None) - if (self.path_file_eff and not self.mult_bin_eff) or \ - (not self.path_file_eff and self.mult_bin_eff): + if (self.path_file_eff and not self.mult_bin_eff) or (not self.path_file_eff and self.mult_bin_eff): # That is incoherent - self.logger.fatal("Either both or none of the lists \"path_eff\" and \"mult_bin_eff\"" \ - "must be specified") + self.logger.fatal('Either both or none of the lists "path_eff" and "mult_bin_eff"must be specified') if not self.path_file_eff: self.path_file_eff = [None] * self.p_nbin2 self.mult_bin_eff = [None] * self.p_nbin2 if len(self.path_file_eff) != self.p_nbin2 or len(self.mult_bin_eff) != self.p_nbin2: - self.logger.fatal("Efficiencies are requested to be taken from another analysis. 
" \ - "Make sure lists \"path_eff\" and \"mult_bin_eff\" have the same " \ - "length as the number of those bins (%i).", self.p_nbin2) + self.logger.fatal( + "Efficiencies are requested to be taken from another analysis. " + 'Make sure lists "path_eff" and "mult_bin_eff" have the same ' + "length as the number of those bins (%i).", + self.p_nbin2, + ) self.p_performval = datap["analysis"].get("event_cand_validation", None) # pylint: disable=import-outside-toplevel - #region helpers + # region helpers def _save_canvas(self, canvas, filename): # folder = self.d_resultsallpmc if mcordata == 'mc' else self.d_resultsallpdata - canvas.SaveAs(f'fig/{self.case}/{self.typean}/{filename}') - + canvas.SaveAs(f"fig/{self.case}/{self.typean}/{filename}") - def _save_hist(self, hist, filename, option = ''): + def _save_hist(self, hist, filename, option=""): if not hist: - self.logger.error('no histogram for <%s>', filename) + self.logger.error("no histogram for <%s>", filename) # TODO: remove file if it exists? return c = TCanvas() - if isinstance(hist, TH1) and get_dim(hist) == 2 and 'texte' not in option: - option += 'texte' + if isinstance(hist, TH1) and get_dim(hist) == 2 and "texte" not in option: + option += "texte" hist.Draw(option) self._save_canvas(c, filename) - rfilename = filename.split('/')[-1] - rfilename = rfilename.removesuffix('.png') + rfilename = filename.split("/")[-1] + rfilename = rfilename.removesuffix(".png") self.rfigfile.WriteObject(hist, rfilename) - #region fitting - def _roofit_mass(self, level, hist, ipt, pdfnames, param_names, fitcfg, roows = None, filename = None): + # region fitting + def _roofit_mass(self, level, hist, ipt, pdfnames, param_names, fitcfg, roows=None, filename=None): if fitcfg is None: return None, None res, ws, frame, residual_frame = self.fitter.fit_mass_new(hist, pdfnames, fitcfg, level, roows, True) - frame.SetTitle(f'inv. mass for p_{{T}} {self.bins_candpt[ipt]} - {self.bins_candpt[ipt+1]} GeV/c') + frame.SetTitle(f"inv. mass for p_{{T}} {self.bins_candpt[ipt]} - {self.bins_candpt[ipt + 1]} GeV/c") c = TCanvas() textInfoRight = create_text_info(0.62, 0.68, 1.0, 0.89) @@ -188,9 +221,9 @@ def _roofit_mass(self, level, hist, ipt, pdfnames, param_names, fitcfg, roows = if level == "data": mean_sgn = ws.var(self.p_param_names["gauss_mean"]) sigma_sgn = ws.var(self.p_param_names["gauss_sigma"]) - (sig, sig_err, bkg, bkg_err, - signif, signif_err, s_over_b, s_over_b_err - ) = calc_signif(ws, res, pdfnames, param_names, mean_sgn, sigma_sgn) + (sig, sig_err, bkg, bkg_err, signif, signif_err, s_over_b, s_over_b_err) = calc_signif( + ws, res, pdfnames, param_names, mean_sgn, sigma_sgn + ) add_text_info_perf(textInfoLeft, sig, sig_err, bkg, bkg_err, s_over_b, s_over_b_err, signif, signif_err) @@ -201,38 +234,39 @@ def _roofit_mass(self, level, hist, ipt, pdfnames, param_names, fitcfg, roows = if res.status() == 0: self._save_canvas(c, filename) else: - self.logger.warning('Invalid fit result for %s', hist.GetName()) + self.logger.warning("Invalid fit result for %s", hist.GetName()) # func_tot.Print('v') - filename = filename.replace('.png', '_invalid.png') + filename = filename.replace(".png", "_invalid.png") self._save_canvas(c, filename) if level == "data": - residual_frame.SetTitle(f'inv. mass for p_{{T}} {self.bins_candpt[ipt]} - {self.bins_candpt[ipt+1]} GeV/c') + residual_frame.SetTitle( + f"inv. 
mass for p_{{T}} {self.bins_candpt[ipt]} - {self.bins_candpt[ipt + 1]} GeV/c" + ) cres = TCanvas() residual_frame.Draw() - filename = filename.replace('.png', '_residual.png') + filename = filename.replace(".png", "_residual.png") self._save_canvas(cres, filename) return res, ws - - def _fit_mass(self, hist, filename = None): + def _fit_mass(self, hist, filename=None): if hist.GetEntries() == 0: - raise UserWarning('Cannot fit histogram with no entries') - fit_range = self.cfg('mass_fit.range') - func_sig = TF1('funcSig', self.cfg('mass_fit.func_sig'), *fit_range) - func_bkg = TF1('funcBkg', self.cfg('mass_fit.func_bkg'), *fit_range) + raise UserWarning("Cannot fit histogram with no entries") + fit_range = self.cfg("mass_fit.range") + func_sig = TF1("funcSig", self.cfg("mass_fit.func_sig"), *fit_range) + func_bkg = TF1("funcBkg", self.cfg("mass_fit.func_bkg"), *fit_range) par_offset = func_sig.GetNpar() - func_tot = TF1('funcTot', f"{self.cfg('mass_fit.func_sig')} + {self.cfg('mass_fit.func_bkg')}({par_offset})") - func_tot.SetParameter(0, hist.GetMaximum()/3.) # TODO: better seeding? - for par, value in self.cfg('mass_fit.par_start', {}).items(): - self.logger.debug('Setting par %i to %g', par, value) + func_tot = TF1("funcTot", f"{self.cfg('mass_fit.func_sig')} + {self.cfg('mass_fit.func_bkg')}({par_offset})") + func_tot.SetParameter(0, hist.GetMaximum() / 3.0) # TODO: better seeding? + for par, value in self.cfg("mass_fit.par_start", {}).items(): + self.logger.debug("Setting par %i to %g", par, value) func_tot.SetParameter(par, value) - for par, value in self.cfg('mass_fit.par_constrain', {}).items(): - self.logger.debug('Constraining par %i to (%g, %g)', par, value[0], value[1]) + for par, value in self.cfg("mass_fit.par_constrain", {}).items(): + self.logger.debug("Constraining par %i to (%g, %g)", par, value[0], value[1]) func_tot.SetParLimits(par, value[0], value[1]) - for par, value in self.cfg('mass_fit.par_fix', {}).items(): - self.logger.debug('Fixing par %i to %g', par, value) + for par, value in self.cfg("mass_fit.par_fix", {}).items(): + self.logger.debug("Fixing par %i to %g", par, value) func_tot.FixParameter(par, value) fit_res = hist.Fit(func_tot, "SQL", "", fit_range[0], fit_range[1]) if fit_res and fit_res.Get() and fit_res.IsValid(): @@ -249,20 +283,19 @@ def _fit_mass(self, hist, filename = None): c = TCanvas() hist.Draw() func_sig.SetLineColor(kBlue) - func_sig.Draw('lsame') + func_sig.Draw("lsame") func_bkg.SetLineColor(kCyan) - func_bkg.Draw('lsame') + func_bkg.Draw("lsame") self._save_canvas(c, filename) else: - self.logger.warning('Invalid fit result for %s', hist.GetName()) + self.logger.warning("Invalid fit result for %s", hist.GetName()) # func_tot.Print('v') - filename = filename.replace('.png', '_invalid.png') + filename = filename.replace(".png", "_invalid.png") self._save_hist(hist, filename) # TODO: how to deal with this return (fit_res, func_sig, func_bkg) - # pylint: disable=too-many-branches,too-many-statements,too-many-nested-blocks def fit(self): self.logger.info("Fitting inclusive mass distributions") @@ -275,125 +308,145 @@ def fit(self): self.roo_ws[level] = [None] * self.nbins rfilename = self.n_filemass_mc if "mc" in level else self.n_filemass fitcfg = None - fileout_name = self.make_file_path(self.d_resultsallpdata, self.yields_filename, "root", - None, [self.case, self.typean]) + fileout_name = self.make_file_path( + self.d_resultsallpdata, self.yields_filename, "root", None, [self.case, self.typean] + ) fileout = TFile(fileout_name, 
"RECREATE") with TFile(rfilename) as rfile: for ibin2 in range(len(self.lvar2_binmin)): - - yieldshistos = TH1F("hyields%d" % (ibin2), "", \ - len(self.lpt_finbinmin), array("d", self.bins_candpt)) - meanhistos = TH1F("hmean%d" % (ibin2), "", \ - len(self.lpt_finbinmin), array("d", self.bins_candpt)) - sigmahistos = TH1F("hsigmas%d" % (ibin2), "", \ - len(self.lpt_finbinmin), array("d", self.bins_candpt)) - signifhistos = TH1F("hsignifs%d" % (ibin2), "", \ - len(self.lpt_finbinmin), array("d", self.bins_candpt)) - soverbhistos = TH1F("hSoverB%d" % (ibin2), "", \ - len(self.lpt_finbinmin), array("d", self.bins_candpt)) + yieldshistos = TH1F( + "hyields%d" % (ibin2), "", len(self.lpt_finbinmin), array("d", self.bins_candpt) + ) + meanhistos = TH1F("hmean%d" % (ibin2), "", len(self.lpt_finbinmin), array("d", self.bins_candpt)) + sigmahistos = TH1F("hsigmas%d" % (ibin2), "", len(self.lpt_finbinmin), array("d", self.bins_candpt)) + signifhistos = TH1F( + "hsignifs%d" % (ibin2), "", len(self.lpt_finbinmin), array("d", self.bins_candpt) + ) + soverbhistos = TH1F( + "hSoverB%d" % (ibin2), "", len(self.lpt_finbinmin), array("d", self.bins_candpt) + ) for ipt in range(len(self.lpt_finbinmin)): - self.logger.debug('fitting %s - %i - %i', level, ipt, ibin2) + self.logger.debug("fitting %s - %i - %i", level, ipt, ibin2) roows = self.roows.get(ipt) if self.mltype == "MultiClassification": - suffix = "%s%d_%d_%.2f%.2f%s_%.2f_%.2f" % \ - (self.v_var_binning, self.lpt_finbinmin[ipt], - self.lpt_finbinmax[ipt], self.lpt_probcutfin[ipt][0], - self.lpt_probcutfin[ipt][1], self.v_var2_binning, - self.lvar2_binmin[ibin2], self.lvar2_binmax[ibin2]) + suffix = "%s%d_%d_%.2f%.2f%s_%.2f_%.2f" % ( + self.v_var_binning, + self.lpt_finbinmin[ipt], + self.lpt_finbinmax[ipt], + self.lpt_probcutfin[ipt][0], + self.lpt_probcutfin[ipt][1], + self.v_var2_binning, + self.lvar2_binmin[ibin2], + self.lvar2_binmax[ibin2], + ) else: - suffix = "%s%d_%d_%.2f%s_%.2f_%.2f" % \ - (self.v_var_binning, self.lpt_finbinmin[ipt], - self.lpt_finbinmax[ipt], self.lpt_probcutfin[ipt], - self.v_var2_binning, - self.lvar2_binmin[ibin2], self.lvar2_binmax[ibin2]) - h_invmass = rfile.Get('hmass' + suffix) + suffix = "%s%d_%d_%.2f%s_%.2f_%.2f" % ( + self.v_var_binning, + self.lpt_finbinmin[ipt], + self.lpt_finbinmax[ipt], + self.lpt_probcutfin[ipt], + self.v_var2_binning, + self.lvar2_binmin[ibin2], + self.lvar2_binmax[ibin2], + ) + h_invmass = rfile.Get("hmass" + suffix) # Rebin h_invmass.Rebin(self.p_rebin[ipt]) - if h_invmass.GetEntries() < 100: # TODO: reconsider criterion - self.logger.error('Not enough entries to fit for %s, pt bin %d, mult bin %d', \ - level, ipt, ibin2) + if h_invmass.GetEntries() < 100: # TODO: reconsider criterion + self.logger.error( + "Not enough entries to fit for %s, pt bin %d, mult bin %d", level, ipt, ibin2 + ) continue - ptrange = (self.bins_candpt[ipt], self.bins_candpt[ipt+1]) + ptrange = (self.bins_candpt[ipt], self.bins_candpt[ipt + 1]) multrange = (self.lvar2_binmin[ibin2], self.lvar2_binmax[ibin2]) - if self.cfg('mass_fit'): + if self.cfg("mass_fit"): fit_res, _, func_bkg = self._fit_mass( h_invmass, - f'fit/h_mass_fitted_pthf-{ptrange[0]}-{ptrange[1]}' - f'_{self.v_var2_binning}-{multrange[0]}-{multrange[1]}_{level}.png') + f"fit/h_mass_fitted_pthf-{ptrange[0]}-{ptrange[1]}" + f"_{self.v_var2_binning}-{multrange[0]}-{multrange[1]}_{level}.png", + ) if fit_res and fit_res.Get() and fit_res.IsValid(): self.fit_mean[level][ipt] = fit_res.Parameter(1) self.fit_sigma[level][ipt] = fit_res.Parameter(2) 
self.fit_func_bkg[level][ipt] = func_bkg else: - self.logger.error('Fit failed for %s bin %d', level, ipt) + self.logger.error("Fit failed for %s bin %d", level, ipt) - if self.cfg('mass_roofit'): - for entry in self.cfg('mass_roofit', []): - if lvl := entry.get('level'): + if self.cfg("mass_roofit"): + for entry in self.cfg("mass_roofit", []): + if lvl := entry.get("level"): if lvl != level: continue - if ptspec := entry.get('ptrange'): + if ptspec := entry.get("ptrange"): if ptspec[0] > ptrange[0] or ptspec[1] < ptrange[1]: continue fitcfg = entry break self.logger.debug("Using fit config for %i: %s", ipt, fitcfg) - if datasel := fitcfg.get('datasel'): - h = rfile.Get(f'h_mass-pthf_{datasel}') - h_invmass = project_hist(h, [0], {1: (ipt+1, ipt+1)}) + if datasel := fitcfg.get("datasel"): + h = rfile.Get(f"h_mass-pthf_{datasel}") + h_invmass = project_hist(h, [0], {1: (ipt + 1, ipt + 1)}) - for fixpar in fitcfg.get('fix_params', []): + for fixpar in fitcfg.get("fix_params", []): if roows.var(fixpar): roows.var(fixpar).setConstant(True) if h_invmass.GetEntries() == 0: continue - directory_path = Path(f'{self.path_fig}/roofit/mult_{multrange[0]}-{multrange[1]}') + directory_path = Path(f"{self.path_fig}/roofit/mult_{multrange[0]}-{multrange[1]}") # Create the directory if it doesn't exist directory_path.mkdir(parents=True, exist_ok=True) roo_res, roo_ws = self._roofit_mass( - level, h_invmass, ipt, self.p_pdfnames, self.p_param_names, fitcfg, roows, - f'roofit/mult_{multrange[0]}-{multrange[1]}/' - f'h_mass_fitted_pthf-{ptrange[0]}-{ptrange[1]}'\ - f'_{self.v_var2_binning}-{multrange[0]}-{multrange[1]}_{level}.png') + level, + h_invmass, + ipt, + self.p_pdfnames, + self.p_param_names, + fitcfg, + roows, + f"roofit/mult_{multrange[0]}-{multrange[1]}/" + f"h_mass_fitted_pthf-{ptrange[0]}-{ptrange[1]}" + f"_{self.v_var2_binning}-{multrange[0]}-{multrange[1]}_{level}.png", + ) # if level == 'mc': # roo_ws.Print() self.roo_ws[level][ipt] = roo_ws self.roows[ipt] = roo_ws if roo_res.status() == 0: - if level in ('data', 'mc_sig'): + if level in ("data", "mc_sig"): self.fit_mean[level][ipt] = roo_ws.var(self.p_param_names["gauss_mean"]).getValV() self.fit_sigma[level][ipt] = roo_ws.var(self.p_param_names["gauss_sigma"]).getValV() - var_m = fitcfg.get('var', 'm') + var_m = fitcfg.get("var", "m") pdf_bkg = roo_ws.pdf(self.p_pdfnames["pdf_bkg"]) if pdf_bkg: self.fit_func_bkg[level][ipt] = pdf_bkg.asTF(roo_ws.var(var_m)) - self.fit_range[level][ipt] = (roo_ws.var(var_m).getMin('fit'), \ - roo_ws.var(var_m).getMax('fit')) + self.fit_range[level][ipt] = ( + roo_ws.var(var_m).getMin("fit"), + roo_ws.var(var_m).getMax("fit"), + ) else: - self.logger.error('RooFit failed for %s bin %d', level, ipt) + self.logger.error("RooFit failed for %s bin %d", level, ipt) if level == "data": mean_sgn = roo_ws.var(self.p_param_names["gauss_mean"]) sigma_sgn = roo_ws.var(self.p_param_names["gauss_sigma"]) - (sig, sig_err, _, _, - signif, signif_err, s_over_b, s_over_b_err - ) = calc_signif(roo_ws, roo_res, self.p_pdfnames, \ - self.p_param_names, mean_sgn, sigma_sgn) - - yieldshistos.SetBinContent(ipt+1, sig) - yieldshistos.SetBinError(ipt+1, sig_err) - meanhistos.SetBinContent(ipt+1, mean_sgn.getVal()) - meanhistos.SetBinError(ipt+1, mean_sgn.getError()) - sigmahistos.SetBinContent(ipt+1, sigma_sgn.getVal()) - sigmahistos.SetBinError(ipt+1, sigma_sgn.getError()) - signifhistos.SetBinContent(ipt+1, signif) - signifhistos.SetBinError(ipt+1, signif_err) - soverbhistos.SetBinContent(ipt+1, s_over_b) - 
soverbhistos.SetBinError(ipt+1, s_over_b_err) + (sig, sig_err, _, _, signif, signif_err, s_over_b, s_over_b_err) = calc_signif( + roo_ws, roo_res, self.p_pdfnames, self.p_param_names, mean_sgn, sigma_sgn + ) + + yieldshistos.SetBinContent(ipt + 1, sig) + yieldshistos.SetBinError(ipt + 1, sig_err) + meanhistos.SetBinContent(ipt + 1, mean_sgn.getVal()) + meanhistos.SetBinError(ipt + 1, mean_sgn.getError()) + sigmahistos.SetBinContent(ipt + 1, sigma_sgn.getVal()) + sigmahistos.SetBinError(ipt + 1, sigma_sgn.getError()) + signifhistos.SetBinContent(ipt + 1, signif) + signifhistos.SetBinError(ipt + 1, signif_err) + soverbhistos.SetBinContent(ipt + 1, s_over_b) + soverbhistos.SetBinError(ipt + 1, s_over_b_err) fileout.cd() yieldshistos.Write() meanhistos.Write() @@ -402,27 +455,22 @@ def fit(self): soverbhistos.Write() fileout.Close() - - def get_efficiency(self, ibin1, ibin2): - fileouteff = TFile.Open("%s/efficiencies%s%s.root" % (self.d_resultsallpmc, \ - self.case, self.typean), "read") + fileouteff = TFile.Open("%s/efficiencies%s%s.root" % (self.d_resultsallpmc, self.case, self.typean), "read") h = fileouteff.Get(f"eff_mult{ibin2}") return h.GetBinContent(ibin1 + 1), h.GetBinError(ibin1 + 1) - def efficiency(self): self.loadstyle() lfileeff = TFile.Open(self.n_fileff) - fileouteff = TFile.Open("%s/efficiencies%s%s.root" % (self.d_resultsallpmc, \ - self.case, self.typean), "recreate") - cEff = TCanvas('cEff', 'The Fit Canvas') + fileouteff = TFile.Open("%s/efficiencies%s%s.root" % (self.d_resultsallpmc, self.case, self.typean), "recreate") + cEff = TCanvas("cEff", "The Fit Canvas") cEff.SetCanvasSize(1900, 1500) cEff.SetWindowSize(500, 500) cEff.SetLogy() - legeff = TLegend(.5, .20, .7, .45) + legeff = TLegend(0.5, 0.20, 0.7, 0.45) legeff.SetBorderSize(0) legeff.SetFillColor(0) legeff.SetFillStyle(0) @@ -430,10 +478,10 @@ def efficiency(self): legeff.SetTextSize(0.035) if self.signal_loss: - cSl = TCanvas('cSl', 'The Fit Canvas') + cSl = TCanvas("cSl", "The Fit Canvas") cSl.SetCanvasSize(1900, 1500) cSl.SetWindowSize(500, 500) - legsl = TLegend(.5, .20, .7, .45) + legsl = TLegend(0.5, 0.20, 0.7, 0.45) legsl.SetBorderSize(0) legsl.SetFillColor(0) legsl.SetFillStyle(0) @@ -441,18 +489,19 @@ def efficiency(self): legsl.SetTextSize(0.035) for imult in range(self.p_nbin2): - stringbin2 = "_%s_%.2f_%.2f" % (self.v_var2_binning, \ - self.lvar2_binmin[imult], \ - self.lvar2_binmax[imult]) - legeffstring = "%.1f #leq %s < %.1f" % \ - (self.lvar2_binmin[imult], self.p_latexbin2var, self.lvar2_binmax[imult]) + stringbin2 = "_%s_%.2f_%.2f" % (self.v_var2_binning, self.lvar2_binmin[imult], self.lvar2_binmax[imult]) + legeffstring = "%.1f #leq %s < %.1f" % ( + self.lvar2_binmin[imult], + self.p_latexbin2var, + self.lvar2_binmax[imult], + ) if self.signal_loss: h_gen_pr_sl = lfileeff.Get("h_signal_loss_gen_pr" + stringbin2) h_sel_pr_sl = lfileeff.Get("h_signal_loss_rec_pr" + stringbin2) h_sel_pr_sl.Divide(h_sel_pr_sl, h_gen_pr_sl, 1.0, 1.0, "B") - h_sel_pr_sl.SetLineColor(imult+1) - h_sel_pr_sl.SetMarkerColor(imult+1) + h_sel_pr_sl.SetLineColor(imult + 1) + h_sel_pr_sl.SetMarkerColor(imult + 1) h_sel_pr_sl.SetMarkerStyle(21) cSl.cd() h_sel_pr_sl.Draw("same") @@ -462,8 +511,7 @@ def efficiency(self): legsl.AddEntry(h_sel_pr_sl, legeffstring, "LEP") h_sel_pr_sl.GetXaxis().SetTitle("#it{p}_{T} (GeV/#it{c})") - h_sel_pr_sl.GetYaxis().SetTitle("Signal loss (prompt) %s" \ - % (self.p_latexnhadron)) + h_sel_pr_sl.GetYaxis().SetTitle("Signal loss (prompt) %s" % (self.p_latexnhadron)) h_sel_pr_sl.SetMinimum(0.7) 
h_sel_pr_sl.SetMaximum(1.0) @@ -474,8 +522,8 @@ def efficiency(self): if self.signal_loss: h_sel_pr.Multiply(h_sel_pr_sl) - h_sel_pr.SetLineColor(imult+1) - h_sel_pr.SetMarkerColor(imult+1) + h_sel_pr.SetLineColor(imult + 1) + h_sel_pr.SetMarkerColor(imult + 1) h_sel_pr.SetMarkerStyle(21) cEff.cd() h_sel_pr.Draw("same") @@ -484,21 +532,19 @@ def efficiency(self): h_sel_pr.Write() legeff.AddEntry(h_sel_pr, legeffstring, "LEP") h_sel_pr.GetXaxis().SetTitle("#it{p}_{T} (GeV/#it{c})") - h_sel_pr.GetYaxis().SetTitle("Acc x efficiency (prompt) %s" \ - % (self.p_latexnhadron)) + h_sel_pr.GetYaxis().SetTitle("Acc x efficiency (prompt) %s" % (self.p_latexnhadron)) h_sel_pr.SetMinimum(0.0004) h_sel_pr.SetMaximum(0.4) if self.signal_loss: cSl.cd() legsl.Draw() - cSl.SaveAs("%s/SignalLoss%s%s.eps" % (self.d_resultsallpmc, - self.case, self.typean)) + cSl.SaveAs("%s/SignalLoss%s%s.eps" % (self.d_resultsallpmc, self.case, self.typean)) - cSlFD = TCanvas('cSlFD', 'The Fit Canvas') + cSlFD = TCanvas("cSlFD", "The Fit Canvas") cSlFD.SetCanvasSize(1900, 1500) cSlFD.SetWindowSize(500, 500) - legslFD = TLegend(.5, .20, .7, .45) + legslFD = TLegend(0.5, 0.20, 0.7, 0.45) legslFD.SetBorderSize(0) legslFD.SetFillColor(0) legslFD.SetFillStyle(0) @@ -507,14 +553,13 @@ def efficiency(self): cEff.cd() legeff.Draw() - cEff.SaveAs("%s/Eff%s%s.eps" % (self.d_resultsallpmc, - self.case, self.typean)) + cEff.SaveAs("%s/Eff%s%s.eps" % (self.d_resultsallpmc, self.case, self.typean)) - cEffFD = TCanvas('cEffFD', 'The Fit Canvas') + cEffFD = TCanvas("cEffFD", "The Fit Canvas") cEffFD.SetCanvasSize(1900, 1500) cEffFD.SetWindowSize(500, 500) cEffFD.SetLogy() - legeffFD = TLegend(.5, .20, .7, .45) + legeffFD = TLegend(0.5, 0.20, 0.7, 0.45) legeffFD.SetBorderSize(0) legeffFD.SetFillColor(0) legeffFD.SetFillStyle(0) @@ -522,18 +567,19 @@ def efficiency(self): legeffFD.SetTextSize(0.035) for imult in range(self.p_nbin2): - stringbin2 = "_%s_%.2f_%.2f" % (self.v_var2_binning, \ - self.lvar2_binmin[imult], \ - self.lvar2_binmax[imult]) - legeffFDstring = "%.1f #leq %s < %.1f" % \ - (self.lvar2_binmin[imult], self.p_latexbin2var, self.lvar2_binmax[imult]) + stringbin2 = "_%s_%.2f_%.2f" % (self.v_var2_binning, self.lvar2_binmin[imult], self.lvar2_binmax[imult]) + legeffFDstring = "%.1f #leq %s < %.1f" % ( + self.lvar2_binmin[imult], + self.p_latexbin2var, + self.lvar2_binmax[imult], + ) if self.signal_loss: h_gen_fd_sl = lfileeff.Get("h_signal_loss_gen_fd" + stringbin2) h_sel_fd_sl = lfileeff.Get("h_signal_loss_rec_fd" + stringbin2) h_sel_fd_sl.Divide(h_sel_fd_sl, h_gen_fd_sl, 1.0, 1.0, "B") - h_sel_fd_sl.SetLineColor(imult+1) - h_sel_fd_sl.SetMarkerColor(imult+1) + h_sel_fd_sl.SetLineColor(imult + 1) + h_sel_fd_sl.SetMarkerColor(imult + 1) h_sel_fd_sl.SetMarkerStyle(21) cSlFD.cd() h_sel_fd_sl.Draw("same") @@ -543,8 +589,7 @@ def efficiency(self): legslFD.AddEntry(h_sel_fd_sl, legeffstring, "LEP") h_sel_fd_sl.GetXaxis().SetTitle("#it{p}_{T} (GeV/#it{c})") - h_sel_fd_sl.GetYaxis().SetTitle("Signal loss (feeddown) %s" \ - % (self.p_latexnhadron)) + h_sel_fd_sl.GetYaxis().SetTitle("Signal loss (feeddown) %s" % (self.p_latexnhadron)) h_sel_fd_sl.SetMinimum(0.7) h_sel_fd_sl.SetMaximum(1.0) @@ -555,8 +600,8 @@ def efficiency(self): if self.signal_loss: h_sel_fd.Multiply(h_sel_fd_sl) - h_sel_fd.SetLineColor(imult+1) - h_sel_fd.SetMarkerColor(imult+1) + h_sel_fd.SetLineColor(imult + 1) + h_sel_fd.SetMarkerColor(imult + 1) h_sel_fd.SetMarkerStyle(21) cEffFD.cd() h_sel_fd.Draw("same") @@ -565,40 +610,37 @@ def efficiency(self): h_sel_fd.Write() 
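# For reference: the efficiency and signal-loss histograms above are all built with
# TH1.Divide(num, den, 1.0, 1.0, "B"); the "B" option treats the numerator as a
# binomial subset of the denominator and assigns binomial errors instead of plain
# error propagation. A minimal sketch with toy counts (all names and numbers below
# are illustrative, not from the analysis):
from math import sqrt

from ROOT import TH1F

h_gen_toy = TH1F("h_gen_toy", "generated candidates", 3, 0.0, 3.0)
h_rec_toy = TH1F("h_rec_toy", "selected candidates", 3, 0.0, 3.0)
for ibin, (n_gen, n_rec) in enumerate([(1000.0, 400.0), (800.0, 480.0), (500.0, 350.0)], start=1):
    h_gen_toy.SetBinContent(ibin, n_gen)
    h_rec_toy.SetBinContent(ibin, n_rec)
for h in (h_gen_toy, h_rec_toy):
    h.Sumw2()  # treat contents as unweighted counts (errors = sqrt(N))

h_eff_toy = h_rec_toy.Clone("h_eff_toy")
h_eff_toy.Divide(h_rec_toy, h_gen_toy, 1.0, 1.0, "B")  # same in-place pattern as above
for ibin in range(1, h_eff_toy.GetNbinsX() + 1):
    eff = h_eff_toy.GetBinContent(ibin)
    n_gen = h_gen_toy.GetBinContent(ibin)
    # for unweighted counts the "B" option reproduces sqrt(eff * (1 - eff) / n_gen)
    print(f"bin {ibin}: eff = {eff:.3f} +- {h_eff_toy.GetBinError(ibin):.3f}",
          f"(binomial: {sqrt(eff * (1.0 - eff) / n_gen):.3f})")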
legeffFD.AddEntry(h_sel_fd, legeffFDstring, "LEP") h_sel_fd.GetXaxis().SetTitle("#it{p}_{T} (GeV/#it{c})") - h_sel_fd.GetYaxis().SetTitle("Acc x efficiency feed-down %s" \ - % (self.p_latexnhadron)) + h_sel_fd.GetYaxis().SetTitle("Acc x efficiency feed-down %s" % (self.p_latexnhadron)) h_sel_fd.SetMinimum(0.0004) h_sel_fd.SetMaximum(0.4) cEffFD.cd() legeffFD.Draw() - cEffFD.SaveAs("%s/EffFD%s%s.eps" % (self.d_resultsallpmc, - self.case, self.typean)) + cEffFD.SaveAs("%s/EffFD%s%s.eps" % (self.d_resultsallpmc, self.case, self.typean)) if self.signal_loss: cSlFD.cd() legslFD.Draw() - cSlFD.SaveAs("%s/SignalLossFD%s%s.eps" % (self.d_resultsallpmc, - self.case, self.typean)) - + cSlFD.SaveAs("%s/SignalLossFD%s%s.eps" % (self.d_resultsallpmc, self.case, self.typean)) def plotter(self): gROOT.SetBatch(True) self.loadstyle() - fileouteff = TFile.Open("%s/efficiencies%s%s.root" % \ - (self.d_resultsallpmc, self.case, self.typean)) - yield_filename = self.make_file_path(self.d_resultsallpdata, self.yields_filename, "root", - None, [self.case, self.typean]) + fileouteff = TFile.Open("%s/efficiencies%s%s.root" % (self.d_resultsallpmc, self.case, self.typean)) + yield_filename = self.make_file_path( + self.d_resultsallpdata, self.yields_filename, "root", None, [self.case, self.typean] + ) fileoutyield = TFile.Open(yield_filename, "READ") - fileoutcross = TFile.Open("%s/finalcross%s%s.root" % \ - (self.d_resultsallpdata, self.case, self.typean), "recreate") + fileoutcross = TFile.Open( + "%s/finalcross%s%s.root" % (self.d_resultsallpdata, self.case, self.typean), "recreate" + ) - cCrossvsvar1 = TCanvas('cCrossvsvar1', 'The Fit Canvas') + cCrossvsvar1 = TCanvas("cCrossvsvar1", "The Fit Canvas") cCrossvsvar1.SetCanvasSize(1900, 1500) cCrossvsvar1.SetWindowSize(500, 500) cCrossvsvar1.SetLogy() - legvsvar1 = TLegend(.5, .65, .7, .85) + legvsvar1 = TLegend(0.5, 0.65, 0.7, 0.85) legvsvar1.SetBorderSize(0) legvsvar1.SetFillColor(0) legvsvar1.SetFillStyle(0) @@ -620,64 +662,71 @@ def plotter(self): heff = fileouteff.Get("eff_mult%d" % (bineff)) hcross = fileoutyield.Get("hyields%d" % (imult)) hcross.Divide(heff) - hcross.SetLineColor(imult+1) + hcross.SetLineColor(imult + 1) norm = 2 * self.p_br * self.p_nevents / (self.p_sigmamb * 1e12) - hcross.Scale(1./norm) + hcross.Scale(1.0 / norm) fileoutcross.cd() hcross.GetXaxis().SetTitle("#it{p}_{T} %s (GeV/#it{c})" % self.p_latexnhadron) - hcross.GetYaxis().SetTitle("d#sigma/d#it{p}_{T} (%s) %s" % - (self.p_latexnhadron, self.typean)) + hcross.GetYaxis().SetTitle("d#sigma/d#it{p}_{T} (%s) %s" % (self.p_latexnhadron, self.typean)) hcross.SetName("hcross%d" % imult) hcross.GetYaxis().SetRangeUser(1e1, 1e10) - legvsvar1endstring = "%.1f < %s < %.1f" % \ - (self.lvar2_binmin[imult], self.p_latexbin2var, self.lvar2_binmax[imult]) + legvsvar1endstring = "%.1f < %s < %.1f" % ( + self.lvar2_binmin[imult], + self.p_latexbin2var, + self.lvar2_binmax[imult], + ) legvsvar1.AddEntry(hcross, legvsvar1endstring, "LEP") hcross.Draw("same") hcross.Write() - listvalpt = [hcross.GetBinContent(ipt+1) for ipt in range(self.p_nptbins)] + listvalpt = [hcross.GetBinContent(ipt + 1) for ipt in range(self.p_nptbins)] listvalues.append(listvalpt) - listvalerrpt = [hcross.GetBinError(ipt+1) for ipt in range(self.p_nptbins)] + listvalerrpt = [hcross.GetBinError(ipt + 1) for ipt in range(self.p_nptbins)] listvalueserr.append(listvalerrpt) legvsvar1.Draw() - cCrossvsvar1.SaveAs("%s/Cross%s%sVs%s.eps" % (self.d_resultsallpdata, - self.case, self.typean, self.v_var_binning)) + cCrossvsvar1.SaveAs( 
+ "%s/Cross%s%sVs%s.eps" % (self.d_resultsallpdata, self.case, self.typean, self.v_var_binning) + ) - cCrossvsvar2 = TCanvas('cCrossvsvar2', 'The Fit Canvas') + cCrossvsvar2 = TCanvas("cCrossvsvar2", "The Fit Canvas") cCrossvsvar2.SetCanvasSize(1900, 1500) cCrossvsvar2.SetWindowSize(500, 500) cCrossvsvar2.SetLogy() - legvsvar2 = TLegend(.5, .65, .7, .85) + legvsvar2 = TLegend(0.5, 0.65, 0.7, 0.85) legvsvar2.SetBorderSize(0) legvsvar2.SetFillColor(0) legvsvar2.SetFillStyle(0) legvsvar2.SetTextFont(42) legvsvar2.SetTextSize(0.035) - hcrossvsvar2 = [TH1F("hcrossvsvar2" + "pt%d" % ipt, "", \ - self.p_nbin2, array("d", self.var2ranges)) \ - for ipt in range(self.p_nptbins)] + hcrossvsvar2 = [ + TH1F("hcrossvsvar2" + "pt%d" % ipt, "", self.p_nbin2, array("d", self.var2ranges)) + for ipt in range(self.p_nptbins) + ] for ipt in range(self.p_nptbins): print("pt", ipt) for imult in range(self.p_nbin2): - hcrossvsvar2[ipt].SetLineColor(ipt+1) + hcrossvsvar2[ipt].SetLineColor(ipt + 1) hcrossvsvar2[ipt].GetXaxis().SetTitle("%s" % self.p_latexbin2var) hcrossvsvar2[ipt].GetYaxis().SetTitle(self.p_latexnhadron) - hcrossvsvar2[ipt].SetBinContent(imult+1, listvalues[imult][ipt]) - hcrossvsvar2[ipt].SetBinError(imult+1, listvalueserr[imult][ipt]) + hcrossvsvar2[ipt].SetBinContent(imult + 1, listvalues[imult][ipt]) + hcrossvsvar2[ipt].SetBinError(imult + 1, listvalueserr[imult][ipt]) hcrossvsvar2[ipt].GetYaxis().SetRangeUser(1e4, 1e10) - legvsvar2endstring = "%.1f < %s < %.1f GeV/#it{c}" % \ - (self.lpt_finbinmin[ipt], "#it{p}_{T}", self.lpt_finbinmax[ipt]) + legvsvar2endstring = "%.1f < %s < %.1f GeV/#it{c}" % ( + self.lpt_finbinmin[ipt], + "#it{p}_{T}", + self.lpt_finbinmax[ipt], + ) hcrossvsvar2[ipt].Draw("same") legvsvar2.AddEntry(hcrossvsvar2[ipt], legvsvar2endstring, "LEP") legvsvar2.Draw() - cCrossvsvar2.SaveAs("%s/Cross%s%sVs%s.eps" % (self.d_resultsallpdata, - self.case, self.typean, self.v_var2_binning)) - + cCrossvsvar2.SaveAs( + "%s/Cross%s%sVs%s.eps" % (self.d_resultsallpdata, self.case, self.typean, self.v_var2_binning) + ) @staticmethod - def calculate_norm(logger, hevents, hselevents): #TO BE FIXED WITH EV SEL + def calculate_norm(logger, hevents, hselevents): # TO BE FIXED WITH EV SEL if not hevents: # pylint: disable=undefined-variable logger.error("Missing hevents") @@ -690,18 +739,22 @@ def calculate_norm(logger, hevents, hselevents): #TO BE FIXED WITH EV SEL return n_events, n_selevents - def makenormyields(self): # pylint: disable=import-outside-toplevel, too-many-branches + def makenormyields(self): # pylint: disable=import-outside-toplevel, too-many-branches gROOT.SetBatch(True) self.loadstyle() - yield_filename = self.make_file_path(self.d_resultsallpdata, self.yields_filename, "root", - None, [self.case, self.typean]) + yield_filename = self.make_file_path( + self.d_resultsallpdata, self.yields_filename, "root", None, [self.case, self.typean] + ) for imult in range(self.p_nbin2): # Choose where efficiencies to take from. Either this mult. bin, another mult. bin # in this analysis or another mult. bin from another analysis specified explicitly # by the user. 
- fileouteff = f"{self.d_resultsallpmc}/efficiencies{self.case}{self.typean}.root" \ - if not self.path_file_eff[imult] else self.path_file_eff[imult] + fileouteff = ( + f"{self.d_resultsallpmc}/efficiencies{self.case}{self.typean}.root" + if not self.path_file_eff[imult] + else self.path_file_eff[imult] + ) if not os.path.exists(fileouteff): self.logger.fatal("Efficiency file %s could not be found", fileouteff) bineff = -1 @@ -721,78 +774,81 @@ def makenormyields(self): # pylint: disable=import-outside-toplevel, too-many-br namehistoeffprompt = f"eff_mult{bineff}" namehistoefffeed = f"eff_fd_mult{bineff}" nameyield = "hyields%d" % imult - fileoutcrossmult = "%s/finalcross%s%smult%d.root" % \ - (self.d_resultsallpdata, self.case, self.typean, imult) + fileoutcrossmult = "%s/finalcross%s%smult%d.root" % (self.d_resultsallpdata, self.case, self.typean, imult) - #Bin1 is all events. Bin2 is all sel events. Mult bins start from Bin3. + # Bin1 is all events. Bin2 is all sel events. Mult bins start from Bin3. norm = histonorm.GetBinContent(imult + 3) # pylint: disable=logging-not-lazy self.logger.warning("Number of events %d for mult bin %d" % (norm, imult)) if self.p_fprompt_from_mb: if imult == 0: - fileoutcrossmb = "%s/finalcross%s%smult0.root" % \ - (self.d_resultsallpdata, self.case, self.typean) + fileoutcrossmb = "%s/finalcross%s%smult0.root" % (self.d_resultsallpdata, self.case, self.typean) output_prompt = [] if self.p_nevents is not None: norm = self.p_nevents self.logger.warning("Corrected Number of events %d for mult bin %d" % (norm, imult)) - hf_pt_spectrum(self.p_anahpt, - self.p_br, - self.p_inputfonllpred, - self.p_fd_method, - None, - fileouteff, - namehistoeffprompt, - namehistoefffeed, - yield_filename, - nameyield, - norm, - self.p_sigmamb, - output_prompt, - fileoutcrossmb) + hf_pt_spectrum( + self.p_anahpt, + self.p_br, + self.p_inputfonllpred, + self.p_fd_method, + None, + fileouteff, + namehistoeffprompt, + namehistoefffeed, + yield_filename, + nameyield, + norm, + self.p_sigmamb, + output_prompt, + fileoutcrossmb, + ) else: - #filecrossmb = TFile.Open("%s/finalcross%s%smult0.root" % \ - # (self.d_resultsallpdata, self.case, self.typean), "recreate") - self.logger.info("Calculating spectra using fPrompt from MB. "\ - "Assuming MB is bin 0") + # filecrossmb = TFile.Open("%s/finalcross%s%smult0.root" % \ + # (self.d_resultsallpdata, self.case, self.typean), "recreate") + self.logger.info("Calculating spectra using fPrompt from MB. 
Assuming MB is bin 0") self.p_fd_method = "ext" - hf_pt_spectrum(self.p_anahpt, - self.p_br, - self.p_inputfonllpred, - self.p_fd_method, - output_prompt, - fileouteff, - namehistoeffprompt, - namehistoefffeed, - yield_filename, - nameyield, - norm, - self.p_sigmamb, - output_prompt, - fileoutcrossmult) + hf_pt_spectrum( + self.p_anahpt, + self.p_br, + self.p_inputfonllpred, + self.p_fd_method, + output_prompt, + fileouteff, + namehistoeffprompt, + namehistoefffeed, + yield_filename, + nameyield, + norm, + self.p_sigmamb, + output_prompt, + fileoutcrossmult, + ) else: - hf_pt_spectrum(self.p_anahpt, - self.p_br, - self.p_inputfonllpred, - self.p_fd_method, - None, - fileouteff, - namehistoeffprompt, - namehistoefffeed, - yield_filename, - nameyield, - norm, - self.p_sigmamb, - output_prompt, - fileoutcrossmult) - - fileoutcrosstot = TFile.Open("%s/finalcross%s%smulttot.root" % \ - (self.d_resultsallpdata, self.case, self.typean), "recreate") + hf_pt_spectrum( + self.p_anahpt, + self.p_br, + self.p_inputfonllpred, + self.p_fd_method, + None, + fileouteff, + namehistoeffprompt, + namehistoefffeed, + yield_filename, + nameyield, + norm, + self.p_sigmamb, + output_prompt, + fileoutcrossmult, + ) + + fileoutcrosstot = TFile.Open( + "%s/finalcross%s%smulttot.root" % (self.d_resultsallpdata, self.case, self.typean), "recreate" + ) for imult in range(self.p_nbin2): - fileoutcrossmult = "%s/finalcross%s%smult%d.root" % \ - (self.d_resultsallpdata, self.case, self.typean, imult) + fileoutcrossmult = "%s/finalcross%s%smult%d.root" % (self.d_resultsallpdata, self.case, self.typean, imult) f_fileoutcrossmult = TFile.Open(fileoutcrossmult) if not f_fileoutcrossmult: continue @@ -804,36 +860,37 @@ def makenormyields(self): # pylint: disable=import-outside-toplevel, too-many-br def plotternormyields(self): gROOT.SetBatch(True) - cCrossvsvar1 = TCanvas('cCrossvsvar1', 'The Fit Canvas') + cCrossvsvar1 = TCanvas("cCrossvsvar1", "The Fit Canvas") cCrossvsvar1.SetCanvasSize(1900, 1500) cCrossvsvar1.SetWindowSize(500, 500) cCrossvsvar1.SetLogy() cCrossvsvar1.cd() - legvsvar1 = TLegend(.5, .65, .7, .85) + legvsvar1 = TLegend(0.5, 0.65, 0.7, 0.85) legvsvar1.SetBorderSize(0) legvsvar1.SetFillColor(0) legvsvar1.SetFillStyle(0) legvsvar1.SetTextFont(42) legvsvar1.SetTextSize(0.035) - fileoutcrosstot = TFile.Open("%s/finalcross%s%smulttot.root" % \ - (self.d_resultsallpdata, self.case, self.typean)) + fileoutcrosstot = TFile.Open("%s/finalcross%s%smulttot.root" % (self.d_resultsallpdata, self.case, self.typean)) for imult in range(self.p_nbin2): hcross = fileoutcrosstot.Get("histoSigmaCorr%d" % imult) - hcross.Scale(1./(self.p_sigmamb * 1e12)) - hcross.SetLineColor(imult+1) - hcross.SetMarkerColor(imult+1) + hcross.Scale(1.0 / (self.p_sigmamb * 1e12)) + hcross.SetLineColor(imult + 1) + hcross.SetMarkerColor(imult + 1) hcross.GetXaxis().SetTitle("#it{p}_{T} %s (GeV/#it{c})" % self.p_latexnhadron) hcross.GetYaxis().SetTitleOffset(1.3) - hcross.GetYaxis().SetTitle("Corrected yield/events (%s) %s" % - (self.p_latexnhadron, self.typean)) + hcross.GetYaxis().SetTitle("Corrected yield/events (%s) %s" % (self.p_latexnhadron, self.typean)) hcross.GetYaxis().SetRangeUser(1e-10, 1) - legvsvar1endstring = "%.1f #leq %s < %.1f" % \ - (self.lvar2_binmin[imult], self.p_latexbin2var, self.lvar2_binmax[imult]) + legvsvar1endstring = "%.1f #leq %s < %.1f" % ( + self.lvar2_binmin[imult], + self.p_latexbin2var, + self.lvar2_binmax[imult], + ) legvsvar1.AddEntry(hcross, legvsvar1endstring, "LEP") hcross.Draw("same") legvsvar1.Draw() - 
cCrossvsvar1.SaveAs("%s/CorrectedYieldsNorm%s%sVs%s.eps" % (self.d_resultsallpdata, - self.case, self.typean, - self.v_var_binning)) + cCrossvsvar1.SaveAs( + "%s/CorrectedYieldsNorm%s%sVs%s.eps" % (self.d_resultsallpdata, self.case, self.typean, self.v_var_binning) + ) fileoutcrosstot.Close() diff --git a/machine_learning_hep/analysis/do_systematics.py b/machine_learning_hep/analysis/do_systematics.py index 07e7bce5c1..d06f09c51e 100644 --- a/machine_learning_hep/analysis/do_systematics.py +++ b/machine_learning_hep/analysis/do_systematics.py @@ -28,8 +28,16 @@ import numpy as np import yaml -from ROOT import TLegend # , TLine -from ROOT import TH1F, TCanvas, TFile, TGraphAsymmErrors, TLatex, gROOT, gStyle +from ROOT import ( + TH1F, + TCanvas, + TFile, + TGraphAsymmErrors, + TLatex, + TLegend, # , TLine + gROOT, + gStyle, +) from machine_learning_hep.analysis.analyzer_jets import string_range_ptjet from machine_learning_hep.do_variations import ( @@ -41,7 +49,6 @@ # HF specific imports from machine_learning_hep.utilities import ( - make_plot, combine_graphs, draw_latex, get_colour, @@ -50,6 +57,7 @@ get_y_window_gr, get_y_window_his, make_message_notfound, + make_plot, print_histogram, reset_graph_outside_range, reset_hist_outside_range, @@ -546,8 +554,11 @@ def do_jet_systematics(self, var: str): ) input_histograms_sys[iptjet][sys_cat][sys_var].Draw("same") nsys = nsys + 1 - latex_text = "%g #leq %s < %g GeV/#it{c}" % (self.edges_ptjet_gen_min[iptjet], - self.latex_ptjet, self.edges_ptjet_gen_max[iptjet]) + latex_text = "%g #leq %s < %g GeV/#it{c}" % ( + self.edges_ptjet_gen_min[iptjet], + self.latex_ptjet, + self.edges_ptjet_gen_max[iptjet], + ) latex = TLatex( 0.15, 0.82, @@ -571,9 +582,16 @@ def do_jet_systematics(self, var: str): n_bins = input_histograms_default[iptjet].GetNbinsX() # Make the histograms for the distribution of var/default values per bin of observable. - list_his_cat_vars = [TH1F(f"his_cat_vars_{var}_{suffix}_{suffix2}_{ibin + 1}", - f"{self.systematic_catlabels[sys_cat]} distribution, bin {ibin + 1};" - "var/def;counts", 6, 0., 2.) 
for ibin in range(n_bins)] + list_his_cat_vars = [ + TH1F( + f"his_cat_vars_{var}_{suffix}_{suffix2}_{ibin + 1}", + f"{self.systematic_catlabels[sys_cat]} distribution, bin {ibin + 1};var/def;counts", + 6, + 0.0, + 2.0, + ) + for ibin in range(n_bins) + ] for sys_var in range(self.systematic_variations[sys_cat]): default_his = input_histograms_default[iptjet].Clone("default_his") @@ -618,16 +636,21 @@ def do_jet_systematics(self, var: str): # print([[h.GetBinContent(i + 1) for i in range(h.GetNbinsX())] for h in list_his_cat_vars]) axis_x = var_his.GetXaxis() - can_dist, _ = make_plot(f"sys_var_{var}_{suffix}_{suffix2}_ratio_dist", - list_obj=list_his_cat_vars, labels_obj=[f"{axis_x.GetBinLowEdge(ibin + 1)}-" - f"{axis_x.GetBinUpEdge(ibin + 1)}" - for ibin in range(n_bins)], - opt_leg_g=self.opt_leg_g, opt_plot_g=self.opt_plot_g, opt_plot_h="p l", - offsets_xy=self.offsets_axes, - leg_pos=[0.7, 0.7, 0.8, 0.85], - margins_y=[0.05, 0.05], margins_c=self.margins_can, - title=f"{latex_obs} {latex_text} {self.systematic_catlabels[sys_cat]};" - "var/default;counts") + can_dist, _ = make_plot( + f"sys_var_{var}_{suffix}_{suffix2}_ratio_dist", + list_obj=list_his_cat_vars, + labels_obj=[ + f"{axis_x.GetBinLowEdge(ibin + 1)}-{axis_x.GetBinUpEdge(ibin + 1)}" for ibin in range(n_bins) + ], + opt_leg_g=self.opt_leg_g, + opt_plot_g=self.opt_plot_g, + opt_plot_h="p l", + offsets_xy=self.offsets_axes, + leg_pos=[0.7, 0.7, 0.8, 0.85], + margins_y=[0.05, 0.05], + margins_c=self.margins_can, + title=f"{latex_obs} {latex_text} {self.systematic_catlabels[sys_cat]};var/default;counts", + ) self.save_canvas(can_dist, f"sys_var_{var}_{suffix}_{suffix2}_ratio_dist") # Plot efficiency variations @@ -764,17 +787,17 @@ def do_jet_systematics(self, var: str): # list of absolute downward uncertainties for all categories in a given (pt_jet, shape) bin sys_down_z = [] # combined absolute upward uncertainty in a given (pt_jet, shape) bin - error_full_up = 0. + error_full_up = 0.0 # combined absolute downward uncertainty in a given (pt_jet, shape) bin - error_full_down = 0. + error_full_down = 0.0 for sys_cat in range(self.n_sys_cat): # absolute upward uncertainty for a given category in a given (pt_jet, shape) bin - error_var_up = 0. + error_var_up = 0.0 # absolute downward uncertainty for a given category in a given (pt_jet, shape) bin - error_var_down = 0. - count_sys_up = 0. - count_sys_down = 0. - error = 0. + error_var_down = 0.0 + count_sys_up = 0.0 + count_sys_down = 0.0 + error = 0.0 for sys_var in range(self.systematic_variations[sys_cat]): out_sys = False # FIXME exception for the untagged bin pylint: disable=fixme @@ -782,13 +805,13 @@ def do_jet_systematics(self, var: str): # bin_first = 2 if "untagged" in self.systematic_varlabels[sys_cat][sys_var] else 1 # FIXME exception for the untagged bin pylint: disable=fixme if input_histograms_sys[iptjet][sys_cat][sys_var].Integral() == 0: - error = 0. + error = 0.0 out_sys = True else: error = input_histograms_sys[iptjet][sys_cat][sys_var].GetBinContent( ibinshape + bin_first ) - input_histograms_default[iptjet].GetBinContent(ibinshape + 1) - if error >= 0.: + if error >= 0.0: if self.systematic_rms[sys_cat] is True: error_var_up += error * error if not out_sys: @@ -894,7 +917,7 @@ def do_jet_systematics(self, var: str): else: rel_unc_up.append(0.0) rel_unc_down.append(0.0) - print(f"total rel. syst. unc. (%): min. {(100. * unc_rel_min):.2g}, max. {(100. * unc_rel_max):.2g}") + print(f"total rel. syst. unc. (%): min. {(100.0 * unc_rel_min):.2g}, max. 
{(100.0 * unc_rel_max):.2g}") shapebins_centres_array = array("d", shapebins_centres) shapebins_contents_array = array("d", shapebins_contents) shapebins_widths_up_array = array("d", shapebins_widths_up) @@ -1155,8 +1178,8 @@ def do_jet_systematics(self, var: str): tgsys_cat[iptjet][sys_cat].GetErrorYlow(ibinshape), ) print( - f"rel. syst. unc. {self.systematic_catlabels[sys_cat]} (%): min. {(100. * unc_rel_min):.2g}, " - f"max. {(100. * unc_rel_max):.2g}" + f"rel. syst. unc. {self.systematic_catlabels[sys_cat]} (%): min. {(100.0 * unc_rel_min):.2g}, " + f"max. {(100.0 * unc_rel_max):.2g}" ) h_default_stat_err[iptjet].Draw("same") h_default_stat_err[iptjet].Draw("axissame") @@ -1261,7 +1284,7 @@ def do_jet_systematics(self, var: str): tgsys_gr[iptjet][sys_gr].GetErrorYhigh(ibinshape), tgsys_gr[iptjet][sys_gr].GetErrorYlow(ibinshape), ) - print(f"rel. syst. unc. {gr} (%): min. {(100. * unc_rel_min):.2g}, max. {(100. * unc_rel_max):.2g}") + print(f"rel. syst. unc. {gr} (%): min. {(100.0 * unc_rel_min):.2g}, max. {(100.0 * unc_rel_max):.2g}") h_default_stat_err[iptjet].Draw("same") h_default_stat_err[iptjet].Draw("axissame") # Draw LaTeX diff --git a/machine_learning_hep/analysis/systematics.py b/machine_learning_hep/analysis/systematics.py index 5d80bca1a8..f61ab7cfa0 100644 --- a/machine_learning_hep/analysis/systematics.py +++ b/machine_learning_hep/analysis/systematics.py @@ -18,34 +18,30 @@ At the moment includes: Cut variation and MC pT shape The raw yield systematic is done within analyzer.py """ + # pylint: disable=no-name-in-module # pylint: disable=import-error import sys -from time import sleep -from os.path import join, exists -from os import makedirs +from copy import copy, deepcopy from operator import itemgetter -from copy import deepcopy, copy +from os import makedirs +from os.path import exists, join from random import shuffle +from time import sleep -from ROOT import TFile, TCanvas, TLegend -from ROOT import kRed, kGreen, kBlack, kBlue, kOrange, kViolet, kAzure, kYellow -from ROOT import TGraphErrors +from ROOT import TCanvas, TFile, TGraphErrors, TLegend, kAzure, kBlack, kBlue, kGreen, kOrange, kRed, kViolet, kYellow -from machine_learning_hep.utilities_plot import load_root_style from machine_learning_hep.fitting.helpers import MLFitter -from machine_learning_hep.multiprocesser import MultiProcesser -from machine_learning_hep.io import parse_yaml, dump_yaml_from_dict +from machine_learning_hep.io import dump_yaml_from_dict, parse_yaml from machine_learning_hep.logger import get_logger +from machine_learning_hep.multiprocesser import MultiProcesser +from machine_learning_hep.utilities_plot import load_root_style -class SystematicsMLWP: # pylint: disable=too-few-public-methods, too-many-instance-attributes +class SystematicsMLWP: # pylint: disable=too-few-public-methods, too-many-instance-attributes species = "systematicsmlwp" - def __init__(self, datap, case, typean, - analyzers, multiprocesser_mc, multiprocesser_data, - multi_class_opt=None): - + def __init__(self, datap, case, typean, analyzers, multiprocesser_mc, multiprocesser_data, multi_class_opt=None): self.logger = get_logger() self.datap = datap self.case = case @@ -57,7 +53,7 @@ def __init__(self, datap, case, typean, self.multiprocesser_mc = multiprocesser_mc self.multiprocesser_data = multiprocesser_data - #Variables for the systematic variations + # Variables for the systematic variations self.p_cutvar_minrange = datap["systematics"]["probvariation"]["cutvarminrange"] self.p_cutvar_maxrange = 
datap["systematics"]["probvariation"]["cutvarmaxrange"] self.p_ncutvar = datap["systematics"]["probvariation"]["ncutvar"] @@ -65,8 +61,8 @@ def __init__(self, datap, case, typean, self.p_fixedmean = datap["systematics"]["probvariation"]["fixedmean"] self.p_fixedsigma = datap["systematics"]["probvariation"]["fixedsigma"] # Require a minimum significance or a maximum chi2 for individual fits - self.min_signif_fit = datap["systematics"]["probvariation"].get("min_signif_fit", -1.) - self.max_red_chi2_fit = datap["systematics"]["probvariation"].get("max_red_chi2_fit", -1.) + self.min_signif_fit = datap["systematics"]["probvariation"].get("min_signif_fit", -1.0) + self.max_red_chi2_fit = datap["systematics"]["probvariation"].get("max_red_chi2_fit", -1.0) self.syst_out_dir = "ML_WP_syst" self.processers_mc_syst = None @@ -89,7 +85,7 @@ def __init__(self, datap, case, typean, self.nominal_means = [] self.nominal_sigmas = [] - #For multiclassification. Combined variations not yet implemented + # For multiclassification. Combined variations not yet implemented self.mcopt = multi_class_opt if self.mcopt is not None: if self.mcopt > len(self.p_cutvar_minrange[0]) - 1: @@ -98,9 +94,7 @@ def __init__(self, datap, case, typean, self.p_cutvar_maxrange = list(map(itemgetter(self.mcopt), self.p_cutvar_maxrange)) self.syst_out_dir = f"ML_WP_syst_MultiClass{self.mcopt}" - def __read_nominal_fit_values(self): - if self.nominal_means: return @@ -110,21 +104,20 @@ def __read_nominal_fit_values(self): fitter = self.nominal_analyzer_merged.fitter if fitter is None: - - fitter = MLFitter(self.nominal_analyzer_merged.case, - self.nominal_analyzer_merged.datap, - self.nominal_analyzer_merged.typean, - self.nominal_analyzer_merged.n_filemass, - self.nominal_analyzer_merged.n_filemass_mc) + fitter = MLFitter( + self.nominal_analyzer_merged.case, + self.nominal_analyzer_merged.datap, + self.nominal_analyzer_merged.typean, + self.nominal_analyzer_merged.n_filemass, + self.nominal_analyzer_merged.n_filemass_mc, + ) fitter.load_fits(self.nominal_analyzer_merged.fits_dirname) ana_n_first_binning = self.nominal_analyzer_merged.p_nptbins ana_n_second_binning = self.nominal_analyzer_merged.p_nbin2 - self.nominal_means = [[None] * ana_n_first_binning \ - for _ in range(ana_n_second_binning)] - self.nominal_sigmas = [[None] * ana_n_first_binning \ - for _ in range(ana_n_second_binning)] + self.nominal_means = [[None] * ana_n_first_binning for _ in range(ana_n_second_binning)] + self.nominal_sigmas = [[None] * ana_n_first_binning for _ in range(ana_n_second_binning)] for ibin1 in range(ana_n_first_binning): for ibin2 in range(ana_n_second_binning): @@ -132,8 +125,7 @@ def __read_nominal_fit_values(self): self.nominal_means[ibin2][ibin1] = fit.kernel.GetMean() self.nominal_sigmas[ibin2][ibin1] = fit.kernel.GetSigma() - - def __define_cutvariation_limits(self): #pylint: disable=too-many-statements + def __define_cutvariation_limits(self): # pylint: disable=too-many-statements """obtain ML WP limits (lower/upper) keeping required efficiency variation This runs a MultiProcesser and an Analyzer both derived from the nominal @@ -153,10 +145,8 @@ def __define_cutvariation_limits(self): #pylint: disable=too-many-statements # use multiprocesser here, prepare database datap = deepcopy(self.datap) - results_dirs_periods = [join(d, "tmp_ml_wp_limits") \ - for d in datap["analysis"][self.typean]["mc"]["results"]] - results_dir_all = join(datap["analysis"][self.typean]["mc"]["resultsallp"], - "tmp_ml_wp_limits") + results_dirs_periods = 
[join(d, "tmp_ml_wp_limits") for d in datap["analysis"][self.typean]["mc"]["results"]] + results_dir_all = join(datap["analysis"][self.typean]["mc"]["resultsallp"], "tmp_ml_wp_limits") datap["analysis"][self.typean]["mc"]["results"] = results_dirs_periods datap["analysis"][self.typean]["mc"]["resultsallp"] = results_dir_all @@ -169,9 +159,9 @@ def __define_cutvariation_limits(self): #pylint: disable=too-many-statements makedirs(results_dir_all) # MultiProcesser to cover all at once - multi_processer_effs = MultiProcesser(self.case, self.nominal_processer_mc.__class__, datap, - self.typean, self.multiprocesser_mc.run_param, - "mc") + multi_processer_effs = MultiProcesser( + self.case, self.nominal_processer_mc.__class__, datap, self.typean, self.multiprocesser_mc.run_param, "mc" + ) # construct analyzer for all periods merged and use it for finding ML WP boundaries analyzer_effs = self.nominal_analyzer_merged.__class__(datap, self.case, self.typean, None) @@ -203,38 +193,35 @@ def __define_cutvariation_limits(self): #pylint: disable=too-many-statements multiclasslabels = self.nominal_processer_mc.multiclass_labels def found_all_boundaries(boundaries): - """helper to check whether all boundaries have been fixed - """ + """helper to check whether all boundaries have been fixed""" if None in boundaries: return False return True - def compute_new_boundaries(wps, boundaries): - """helper to compute boundaries if not yet fixed - """ + """helper to compute boundaries if not yet fixed""" if found_all_boundaries(boundaries): return - wps_strings = ["y_test_prob%s>%s" % (modelname, wps[ipt]) \ - for ipt in range(n_pt_bins)] + wps_strings = ["y_test_prob%s>%s" % (modelname, wps[ipt]) for ipt in range(n_pt_bins)] if self.mcopt is not None: - probvar0 = 'y_test_prob' + modelname + multiclasslabels[0] - probvar1 = 'y_test_prob' + modelname + multiclasslabels[1] + probvar0 = "y_test_prob" + modelname + multiclasslabels[0] + probvar1 = "y_test_prob" + modelname + multiclasslabels[1] if self.mcopt == 0: - wps_strings = ["%s<=%s and %s>=%s" % (probvar0, wps[ipt], probvar1, \ - self.cent_cv_cut_orig[ipt][1]) for ipt in range(n_pt_bins)] - wps_multi = [[wps[ipt], self.cent_cv_cut_orig[ipt][1]] \ - for ipt in range(n_pt_bins)] + wps_strings = [ + "%s<=%s and %s>=%s" % (probvar0, wps[ipt], probvar1, self.cent_cv_cut_orig[ipt][1]) + for ipt in range(n_pt_bins) + ] + wps_multi = [[wps[ipt], self.cent_cv_cut_orig[ipt][1]] for ipt in range(n_pt_bins)] elif self.mcopt == 1: - wps_strings = ["%s<=%s and %s>=%s" % (probvar0, self.cent_cv_cut_orig[ipt][0], \ - probvar1, wps[ipt]) for ipt in range(n_pt_bins)] - wps_multi = [[self.cent_cv_cut_orig[ipt][0], wps[ipt]] \ - for ipt in range(n_pt_bins)] + wps_strings = [ + "%s<=%s and %s>=%s" % (probvar0, self.cent_cv_cut_orig[ipt][0], probvar1, wps[ipt]) + for ipt in range(n_pt_bins) + ] + wps_multi = [[self.cent_cv_cut_orig[ipt][0], wps[ipt]] for ipt in range(n_pt_bins)] else: print(f"Unknown mcopt value {self.mcopt}") sys.exit(1) - # update processers and analyzer ML WPs for proc in multi_processer_effs.process_listsample: proc.l_selml = wps_strings @@ -249,24 +236,21 @@ def compute_new_boundaries(wps, boundaries): # Read and compare efficiencies to nominal ones. 
Add if not yet found for ibin1 in range(ana_n_first_binning): eff_new, _ = analyzer_effs.get_efficiency(ibin1, 0) - if abs(eff_new - nominal_effs[ibin1]) / nominal_effs[ibin1] < self.p_maxperccutvar \ - and boundaries[ibin1] is None: + if ( + abs(eff_new - nominal_effs[ibin1]) / nominal_effs[ibin1] < self.p_maxperccutvar + and boundaries[ibin1] is None + ): boundaries[ibin1] = wps[bin_matching[ibin1]] - # Define stepping up and down from nominal WPs for ipt in range(n_pt_bins): + stepsmin.append((self.cent_cv_cut[ipt] - self.p_cutvar_minrange[ipt]) / ncutvar_temp) - stepsmin.append( \ - (self.cent_cv_cut[ipt] - self.p_cutvar_minrange[ipt]) / ncutvar_temp) - - stepsmax.append( \ - (self.p_cutvar_maxrange[ipt] - self.cent_cv_cut[ipt]) / ncutvar_temp) + stepsmax.append((self.p_cutvar_maxrange[ipt] - self.cent_cv_cut[ipt]) / ncutvar_temp) # Attempt to find WP variations up and down for icv in range(ncutvar_temp): - if found_all_boundaries(self.min_cv_cut) \ - and found_all_boundaries(self.max_cv_cut): + if found_all_boundaries(self.min_cv_cut) and found_all_boundaries(self.max_cv_cut): break wps = [self.p_cutvar_minrange[ipt] + icv * stepsmin[ipt] for ipt in range(n_pt_bins)] @@ -279,14 +263,11 @@ def compute_new_boundaries(wps, boundaries): print("--Central probability cut: ", self.cent_cv_cut) print("--Cut variation boundaries maximum: ", self.max_cv_cut) - - def __make_working_points(self): self.ml_wps = [[] for _ in range(self.n_trials)] n_pt_bins = self.nominal_processer_mc.p_nptfinbins for ipt in range(n_pt_bins): - stepsmin = (self.cent_cv_cut[ipt] - self.min_cv_cut[ipt]) / self.p_ncutvar stepsmax = (self.max_cv_cut[ipt] - self.cent_cv_cut[ipt]) / self.p_ncutvar @@ -296,39 +277,36 @@ def __make_working_points(self): if self.mcopt == 0: self.ml_wps[icv].append([lower_cut, self.cent_cv_cut_orig[ipt][1]]) - self.ml_wps[self.p_ncutvar + icv].append([upper_cut, \ - self.cent_cv_cut_orig[ipt][1]]) + self.ml_wps[self.p_ncutvar + icv].append([upper_cut, self.cent_cv_cut_orig[ipt][1]]) elif self.mcopt == 1: self.ml_wps[icv].append([self.cent_cv_cut_orig[ipt][0], lower_cut]) - self.ml_wps[self.p_ncutvar + icv].append([self.cent_cv_cut_orig[ipt][0], \ - upper_cut]) + self.ml_wps[self.p_ncutvar + icv].append([self.cent_cv_cut_orig[ipt][0], upper_cut]) else: self.ml_wps[icv].append(lower_cut) self.ml_wps[self.p_ncutvar + icv].append(upper_cut) def __prepare_trial(self, i_trial): - - datap = deepcopy(self.datap) - datap["analysis"][self.typean]["mc"]["results"] = \ - [join(d, self.syst_out_dir, f"trial_{i_trial}") \ - for d in datap["analysis"][self.typean]["mc"]["results"]] - datap["analysis"][self.typean]["mc"]["resultsallp"] = \ - join(datap["analysis"][self.typean]["mc"]["resultsallp"], \ - self.syst_out_dir, f"trial_{i_trial}") - - datap["analysis"][self.typean]["data"]["results"] = \ - [join(d, self.syst_out_dir, f"trial_{i_trial}") \ - for d in datap["analysis"][self.typean]["data"]["results"]] - datap["analysis"][self.typean]["data"]["resultsallp"] = \ - join(datap["analysis"][self.typean]["data"]["resultsallp"], \ - self.syst_out_dir, f"trial_{i_trial}") - - for new_dir in \ - datap["analysis"][self.typean]["mc"]["results"] + \ - [datap["analysis"][self.typean]["mc"]["resultsallp"]] + \ - datap["analysis"][self.typean]["data"]["results"] + \ - [datap["analysis"][self.typean]["data"]["resultsallp"]]: + datap["analysis"][self.typean]["mc"]["results"] = [ + join(d, self.syst_out_dir, f"trial_{i_trial}") for d in datap["analysis"][self.typean]["mc"]["results"] + ] + 
datap["analysis"][self.typean]["mc"]["resultsallp"] = join( + datap["analysis"][self.typean]["mc"]["resultsallp"], self.syst_out_dir, f"trial_{i_trial}" + ) + + datap["analysis"][self.typean]["data"]["results"] = [ + join(d, self.syst_out_dir, f"trial_{i_trial}") for d in datap["analysis"][self.typean]["data"]["results"] + ] + datap["analysis"][self.typean]["data"]["resultsallp"] = join( + datap["analysis"][self.typean]["data"]["resultsallp"], self.syst_out_dir, f"trial_{i_trial}" + ) + + for new_dir in ( + datap["analysis"][self.typean]["mc"]["results"] + + [datap["analysis"][self.typean]["mc"]["resultsallp"]] + + datap["analysis"][self.typean]["data"]["results"] + + [datap["analysis"][self.typean]["data"]["resultsallp"]] + ): if not exists(new_dir): makedirs(new_dir) @@ -340,27 +318,19 @@ def __prepare_trial(self, i_trial): datap["analysis"][self.typean]["FixedMean"] = True datap["analysis"][self.typean]["masspeak"] = self.nominal_means datap["analysis"][self.typean]["sigmaarray"] = self.nominal_sigmas[0] - datap["analysis"][self.typean]["SetFixGaussianSigma"] = \ - [True] * len(self.nominal_sigmas[0]) - datap["analysis"][self.typean]["SetInitialGaussianSigma"] = \ - [True] * len(self.nominal_sigmas[0]) - datap["analysis"][self.typean]["SetInitialGaussianMean"] = \ - [True] * len(self.nominal_sigmas[0]) + datap["analysis"][self.typean]["SetFixGaussianSigma"] = [True] * len(self.nominal_sigmas[0]) + datap["analysis"][self.typean]["SetInitialGaussianSigma"] = [True] * len(self.nominal_sigmas[0]) + datap["analysis"][self.typean]["SetInitialGaussianMean"] = [True] * len(self.nominal_sigmas[0]) # Processers - self.processers_mc_syst[i_trial] = MultiProcesser(self.case, - self.nominal_processer_mc.__class__, - datap, self.typean, - self.multiprocesser_mc.run_param, "mc") - self.processers_data_syst[i_trial] = MultiProcesser(self.case, - self.nominal_processer_mc.__class__, - datap, self.typean, - self.multiprocesser_mc.run_param, - "data") - - self.analyzers_syst[i_trial] = self.nominal_analyzer_merged.__class__(datap, self.case, - self.typean, None) + self.processers_mc_syst[i_trial] = MultiProcesser( + self.case, self.nominal_processer_mc.__class__, datap, self.typean, self.multiprocesser_mc.run_param, "mc" + ) + self.processers_data_syst[i_trial] = MultiProcesser( + self.case, self.nominal_processer_mc.__class__, datap, self.typean, self.multiprocesser_mc.run_param, "data" + ) + self.analyzers_syst[i_trial] = self.nominal_analyzer_merged.__class__(datap, self.case, self.typean, None) def __ml_cutvar_mass(self, i_trial): """ @@ -373,7 +343,6 @@ def __ml_cutvar_mass(self, i_trial): self.processers_mc_syst[i_trial].multi_histomass() self.processers_data_syst[i_trial].multi_histomass() - def __ml_cutvar_eff(self, i_trial): """ Cut Variation: Create ROOT file with efficiencies @@ -384,7 +353,6 @@ def __ml_cutvar_eff(self, i_trial): self.processers_mc_syst[i_trial].multi_efficiency() - def __ml_cutvar_ana(self, i_trial): """ Cut Variation: Fit invariant mass histograms with AliHFInvMassFitter @@ -400,8 +368,7 @@ def __ml_cutvar_ana(self, i_trial): @staticmethod def __style_histograms(histos, style_numbers=None): - colours = [kRed, kGreen+2, kBlue, kOrange+2, kViolet-1, kAzure+1, kOrange-7, - kViolet+2, kYellow-3] + colours = [kRed, kGreen + 2, kBlue, kOrange + 2, kViolet - 1, kAzure + 1, kOrange - 7, kViolet + 2, kYellow - 3] linestyles = [1, 7, 19] markers_closed = [43, 47, 20, 22, 23] markers_open = [42, 46, 24, 26, 32] @@ -416,7 +383,6 @@ def __style_histograms(histos, style_numbers=None): 
h.SetMarkerStyle(markers[i % len(markers)]) h.SetMarkerColor(colours[i % len(colours)]) - @staticmethod def __get_histogram(filepath, name): file_in = TFile.Open(filepath, "READ") @@ -424,7 +390,6 @@ def __get_histogram(filepath, name): histo.SetDirectory(0) return histo - @staticmethod def __adjust_min_max(histos): h_min = min([h.GetMinimum() for h in histos]) @@ -439,9 +404,7 @@ def __adjust_min_max(histos): h.GetYaxis().SetRangeUser(h_min, h_max) h.GetYaxis().SetMaxDigits(3) - def __make_single_plot(self, name, ibin2, successful): - # Nominal histogram successful_tmp = copy(successful) successful_tmp.sort() @@ -485,8 +448,9 @@ def __make_single_plot(self, name, ibin2, successful): legend.AddEntry(h, l) h.GetXaxis().SetTitle("#it{p}_{T} [GeV/#it{c}]") h.GetYaxis().SetTitle("WP variation / nominal") - self.__adjust_min_max(histos, ) - + self.__adjust_min_max( + histos, + ) canvas = TCanvas("c", "", 800, 800) canvas.cd() @@ -495,11 +459,13 @@ def __make_single_plot(self, name, ibin2, successful): h.Draw("same") legend.Draw("same") - save_path = join(self.nominal_analyzer_merged.d_resultsallpdata, self.syst_out_dir, - f"ml_wp_syst_{name}_ibin2_{ibin2}.eps") + save_path = join( + self.nominal_analyzer_merged.d_resultsallpdata, self.syst_out_dir, f"ml_wp_syst_{name}_ibin2_{ibin2}.eps" + ) canvas.SaveAs(save_path) - save_path = join(self.nominal_analyzer_merged.d_resultsallpdata, self.syst_out_dir, - f"ml_wp_syst_{name}_ibin2_{ibin2}.root") + save_path = join( + self.nominal_analyzer_merged.d_resultsallpdata, self.syst_out_dir, f"ml_wp_syst_{name}_ibin2_{ibin2}.root" + ) file_out = TFile.Open(save_path, "RECREATE") file_out.cd() for i, h in enumerate(histos): @@ -509,7 +475,6 @@ def __make_single_plot(self, name, ibin2, successful): canvas.Close() def __make_summary_plot(self, name, ibin2, successful): - # Nominal histogram successful_tmp = copy(successful) successful_tmp.sort() @@ -530,12 +495,11 @@ def __make_summary_plot(self, name, ibin2, successful): gr = [TGraphErrors(0) for _ in range(nptbins)] for ipt in range(nptbins): gr[ipt].SetTitle("pT bin %d" % ipt) - gr[ipt].SetPoint(0, self.cent_cv_cut[ipt], nominal_histo.GetBinContent(ipt+1)) - gr[ipt].SetPointError(0, 0.0001, nominal_histo.GetBinError(ipt+1)) + gr[ipt].SetPoint(0, self.cent_cv_cut[ipt], nominal_histo.GetBinContent(ipt + 1)) + gr[ipt].SetPointError(0, 0.0001, nominal_histo.GetBinError(ipt + 1)) for iml, succ in enumerate(successful_tmp): - gr[ipt].SetPoint(iml + 1, ml_trials[succ][ipt], - histos[succ].GetBinContent(ipt+1)) - gr[ipt].SetPointError(iml + 1, 0.0001, histos[succ].GetBinError(ipt+1)) + gr[ipt].SetPoint(iml + 1, ml_trials[succ][ipt], histos[succ].GetBinContent(ipt + 1)) + gr[ipt].SetPointError(iml + 1, 0.0001, histos[succ].GetBinError(ipt + 1)) canvas = TCanvas("cvsml%d" % ibin2, "", 1200, 800) if len(gr) <= 6: @@ -545,14 +509,20 @@ def __make_summary_plot(self, name, ibin2, successful): else: canvas.Divide(5, 4) for i, graph in enumerate(gr): - canvas.cd(i+1) + canvas.cd(i + 1) graph.Draw("a*") - save_path = join(self.nominal_analyzer_merged.d_resultsallpdata, self.syst_out_dir, - f"ml_wp_syst_{name}_vs_MLcut_ibin2_{ibin2}.eps") + save_path = join( + self.nominal_analyzer_merged.d_resultsallpdata, + self.syst_out_dir, + f"ml_wp_syst_{name}_vs_MLcut_ibin2_{ibin2}.eps", + ) canvas.SaveAs(save_path) - save_path = join(self.nominal_analyzer_merged.d_resultsallpdata, self.syst_out_dir, - f"ml_wp_syst_{name}_vs_MLcut_ibin2_{ibin2}.root") + save_path = join( + self.nominal_analyzer_merged.d_resultsallpdata, + 
self.syst_out_dir, + f"ml_wp_syst_{name}_vs_MLcut_ibin2_{ibin2}.root", + ) file_out = TFile.Open(save_path, "RECREATE") file_out.cd() for i, graph in enumerate(gr): @@ -562,8 +532,7 @@ def __make_summary_plot(self, name, ibin2, successful): canvas.Close() def __plot(self, successful): - """summary plots - """ + """summary plots""" load_root_style() @@ -574,18 +543,17 @@ def __plot(self, successful): self.__make_summary_plot("histoSigmaCorr", ibin2, successful) def __write_working_points(self): - write_yaml = {"central": self.cent_cv_cut, - "lower_limits": self.min_cv_cut, - "upper_limits": self.max_cv_cut, - "working_points": self.ml_wps} - save_path = join(self.nominal_analyzer_merged.d_resultsallpdata, self.syst_out_dir, - "working_points.yaml") + write_yaml = { + "central": self.cent_cv_cut, + "lower_limits": self.min_cv_cut, + "upper_limits": self.max_cv_cut, + "working_points": self.ml_wps, + } + save_path = join(self.nominal_analyzer_merged.d_resultsallpdata, self.syst_out_dir, "working_points.yaml") dump_yaml_from_dict(write_yaml, save_path) - def __load_working_points(self): - save_path = join(self.nominal_analyzer_merged.d_resultsallpdata, self.syst_out_dir, - "working_points.yaml") + save_path = join(self.nominal_analyzer_merged.d_resultsallpdata, self.syst_out_dir, "working_points.yaml") if not exists(save_path): print(f"Cannot load working points. File {save_path} doesn't exist") sys.exit(1) @@ -596,25 +564,20 @@ def __load_working_points(self): self.max_cv_cut = read_yaml["upper_limits"] self.ml_wps = read_yaml["working_points"] - def __add_trial_to_save(self, i_trial): if self.successful_write is None: self.successful_write = [] self.successful_write.append(i_trial) - def __write_successful_trials(self): if not self.successful_write: return write_yaml = {"successful_trials": self.successful_write} - save_path = join(self.nominal_analyzer_merged.d_resultsallpdata, self.syst_out_dir, - "successful_trials.yaml") + save_path = join(self.nominal_analyzer_merged.d_resultsallpdata, self.syst_out_dir, "successful_trials.yaml") dump_yaml_from_dict(write_yaml, save_path) - def __read_successful_trials(self): - save_path = join(self.nominal_analyzer_merged.d_resultsallpdata, self.syst_out_dir, - "successful_trials.yaml") + save_path = join(self.nominal_analyzer_merged.d_resultsallpdata, self.syst_out_dir, "successful_trials.yaml") if not exists(save_path): print(f"Cannot load working points. File {save_path} doesn't (yet) exist.") print("Do full syst in 10s...") @@ -622,10 +585,8 @@ def __read_successful_trials(self): return [] return parse_yaml(save_path)["successful_trials"] - def ml_systematics(self, do_only_analysis=False, resume=False): - """central method to call for ML WP systematics - """ + """central method to call for ML WP systematics""" # Make sure the summary directory exists aleady save_path = join(self.nominal_analyzer_merged.d_resultsallpdata, self.syst_out_dir) diff --git a/machine_learning_hep/analysis/utils.py b/machine_learning_hep/analysis/utils.py index 6676c96d7d..08a7498e95 100644 --- a/machine_learning_hep/analysis/utils.py +++ b/machine_learning_hep/analysis/utils.py @@ -12,14 +12,14 @@ ## along with this program. if not, see . 
## ############################################################################# -from os.path import join import tempfile +from os.path import join -from machine_learning_hep.utilities import mergerootfiles from machine_learning_hep.logger import get_logger +from machine_learning_hep.utilities import mergerootfiles -def multi_preparenorm(database, typean, doperiodbyperiod): +def multi_preparenorm(database, typean, doperiodbyperiod): logger = get_logger() lper_normfilesorig = [] @@ -31,16 +31,14 @@ def multi_preparenorm(database, typean, doperiodbyperiod): lper_normfilesorig.append(join(lper_val, "correctionsweights.root")) lper_normfiles.append(join(res_path, "correctionsweights.root")) - f_normmerged = join(database["analysis"][typean]["data"]["resultsallp"], - "correctionsweights.root") + f_normmerged = join(database["analysis"][typean]["data"]["resultsallp"], "correctionsweights.root") listempty = [] useperiod = database["analysis"][typean]["useperiod"] with tempfile.TemporaryDirectory() as tmp_merged_dir: for indexp in range(len(resultsdata)): - logger.info("Origin path: %s, target path: %s", lper_normfilesorig[indexp], - lper_normfiles[indexp]) + logger.info("Origin path: %s, target path: %s", lper_normfilesorig[indexp], lper_normfiles[indexp]) mergerootfiles([lper_normfilesorig[indexp]], lper_normfiles[indexp], tmp_merged_dir) if doperiodbyperiod and useperiod[indexp]: listempty.append(lper_normfiles[indexp]) diff --git a/machine_learning_hep/bitwise.py b/machine_learning_hep/bitwise.py index 78d6d935f0..06e7840ff4 100644 --- a/machine_learning_hep/bitwise.py +++ b/machine_learning_hep/bitwise.py @@ -15,24 +15,27 @@ """ Methods to: perform bitwise operations on dataframes """ -from functools import reduce + import operator +from functools import reduce + import numpy as np from .logger import get_logger -def tag_bit_df(dfin, namebitmap, activatedbit, absval = False): + +def tag_bit_df(dfin, namebitmap, activatedbit, absval=False): try: - ar = dfin[namebitmap].to_numpy(dtype='int') + ar = dfin[namebitmap].to_numpy(dtype="int") if absval: ar = abs(ar) mask_on = reduce(operator.or_, ((1 << bit) for bit in activatedbit[0]), 0) mask_off = reduce(operator.or_, ((1 << bit) for bit in activatedbit[1]), 0) - return np.logical_and(np.bitwise_and(ar, mask_on) == mask_on, - np.bitwise_and(ar, mask_off) == 0) + return np.logical_and(np.bitwise_and(ar, mask_on) == mask_on, np.bitwise_and(ar, mask_off) == 0) except Exception: - get_logger().exception('%s, %s', dfin, namebitmap) + get_logger().exception("%s, %s", dfin, namebitmap) raise + def filter_bit_df(dfin, namebitmap, activatedbit): return dfin[tag_bit_df(dfin, namebitmap, activatedbit)] diff --git a/machine_learning_hep/computetrigger.py b/machine_learning_hep/computetrigger.py index 5c45d8c8fc..bdbc84bba1 100644 --- a/machine_learning_hep/computetrigger.py +++ b/machine_learning_hep/computetrigger.py @@ -13,28 +13,29 @@ ############################################################################# import argparse -from ROOT import TFile, TCanvas, TF1, gPad, TLine, TLegend # pylint: disable=import-error, no-name-in-module -from machine_learning_hep.utilities_plot import (load_root_style, - rebin_histogram, - buildbinning, - buildhisto) +from ROOT import TF1, TCanvas, TFile, TLegend, TLine, gPad # pylint: disable=import-error, no-name-in-module -def main(input_trg="/data/DerivedResults/D0kAnywithJets/vAN-20200304_ROOT6-1/pp_2018_data/" - "376_20200304-2028/resultsSPDvspt_ntrkl_trigger/masshisto.root", # pylint: disable=too-many-statements - 
input_mb="/data/DerivedResults/D0kAnywithJets/vAN-20200304_ROOT6-1/pp_2018_data/" \ - "376_20200304-2028/resultsMBvspt_ntrkl_trigger/masshisto.root", - output_path="../Analyses/ALICE_D2H_vs_mult_pp13/reweighting/data_2018/", - min_draw_range=0, max_draw_range=150, - min_fit_range=40., max_fit_range=100., - rebin_histo=True, show_func_ratio=True): +from machine_learning_hep.utilities_plot import buildbinning, buildhisto, load_root_style, rebin_histogram - draw_range = [min_draw_range, - max_draw_range] - fit_range = [min_fit_range, - max_fit_range] - re_binning = buildbinning(100, -.5, 99.5) +def main( + input_trg="/data/DerivedResults/D0kAnywithJets/vAN-20200304_ROOT6-1/pp_2018_data/" + "376_20200304-2028/resultsSPDvspt_ntrkl_trigger/masshisto.root", # pylint: disable=too-many-statements + input_mb="/data/DerivedResults/D0kAnywithJets/vAN-20200304_ROOT6-1/pp_2018_data/" + "376_20200304-2028/resultsMBvspt_ntrkl_trigger/masshisto.root", + output_path="../Analyses/ALICE_D2H_vs_mult_pp13/reweighting/data_2018/", + min_draw_range=0, + max_draw_range=150, + min_fit_range=40.0, + max_fit_range=100.0, + rebin_histo=True, + show_func_ratio=True, +): + draw_range = [min_draw_range, max_draw_range] + fit_range = [min_fit_range, max_fit_range] + + re_binning = buildbinning(100, -0.5, 99.5) re_binning += buildbinning(25, 100.5, 199.5) load_root_style() # Loading the default style @@ -45,31 +46,27 @@ def main(input_trg="/data/DerivedResults/D0kAnywithJets/vAN-20200304_ROOT6-1/pp_ hden = filedatamb.Get("hn_tracklets_corr") hnum = filedatatrg.Get("hn_tracklets_corr") if rebin_histo: - hden_rebin = buildhisto(hden.GetName() + "_den_rebin", - hden.GetTitle(), re_binning) + hden_rebin = buildhisto(hden.GetName() + "_den_rebin", hden.GetTitle(), re_binning) hden = rebin_histogram(hden, hden_rebin) - hnum_rebin = buildhisto(hnum.GetName() + "_num_rebin", - hnum.GetTitle(), re_binning) + hnum_rebin = buildhisto(hnum.GetName() + "_num_rebin", hnum.GetTitle(), re_binning) hnum = rebin_histogram(hnum, hnum_rebin) hratio = hnum.Clone("hratio") hdend = filedatamb.Get("hn_tracklets_corr_withd") hnumd = filedatatrg.Get("hn_tracklets_corr_withd") if rebin_histo: - hdend_rebin = buildhisto(hdend.GetName() + "_dend_rebin", - hdend.GetTitle(), re_binning) + hdend_rebin = buildhisto(hdend.GetName() + "_dend_rebin", hdend.GetTitle(), re_binning) hdend = rebin_histogram(hdend, hdend_rebin) - hnumd_rebin = buildhisto(hnumd.GetName() + "_numd_rebin", - hnumd.GetTitle(), re_binning) + hnumd_rebin = buildhisto(hnumd.GetName() + "_numd_rebin", hnumd.GetTitle(), re_binning) hnumd = rebin_histogram(hnumd, hnumd_rebin) hratiod = hnumd.Clone("hratiod") hratio.Divide(hden) hratiod.Divide(hdend) # Prepare the canvas - ctrigger = TCanvas('ctrigger', 'The Fit Canvas') + ctrigger = TCanvas("ctrigger", "The Fit Canvas") ctrigger.SetCanvasSize(2500, 2000) ctrigger.Divide(3, 2) - leg = TLegend(.5, .65, .7, .85) + leg = TLegend(0.5, 0.65, 0.7, 0.85) leg.SetBorderSize(0) leg.SetFillColor(0) leg.SetFillStyle(0) @@ -94,9 +91,9 @@ def main(input_trg="/data/DerivedResults/D0kAnywithJets/vAN-20200304_ROOT6-1/pp_ hratio.GetXaxis().SetRangeUser(*draw_range) hratio.Draw("pe") func = TF1("func", "([0]/(1+TMath::Exp(-[1]*(x-[2]))))", *draw_range) - func.SetParameters(300, .1, 570) - func.SetParLimits(1, 0., 10.) - func.SetParLimits(2, 0., 1000.) 
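# --- editor's note ----------------------------------------------------------
# The TF1 defined above models the trigger turn-on as a logistic (sigmoid)
# curve f(x) = p0 / (1 + exp(-p1 * (x - p2))), where p0 is the plateau, p1 the
# steepness and p2 the midpoint of the turn-on; (300, 0.1, 570) are only the
# seed values handed to SetParameters before the fit. The same model in plain
# NumPy, for illustration:
import numpy as np


def trigger_turnon(x, p0=300.0, p1=0.1, p2=570.0):
    # same functional form as the ROOT expression "[0]/(1+TMath::Exp(-[1]*(x-[2])))"
    return p0 / (1.0 + np.exp(-p1 * (x - p2)))


x = np.linspace(0.0, 150.0, 6)
# dividing by the maximum mirrors how funcnorm is normalised further down
print(trigger_turnon(x) / trigger_turnon(x).max())
# --- end editor's note --------------------------------------------------------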
+ func.SetParameters(300, 0.1, 570) + func.SetParLimits(1, 0.0, 10.0) + func.SetParLimits(2, 0.0, 1000.0) func.SetRange(*fit_range) func.SetLineWidth(1) hratio.Fit(func, "L", "", *fit_range) @@ -107,14 +104,12 @@ def main(input_trg="/data/DerivedResults/D0kAnywithJets/vAN-20200304_ROOT6-1/pp_ hfunratio = hratio.DrawCopy() hfunratio.GetListOfFunctions().Clear() yaxis = hfunratio.GetYaxis() - yaxis.SetTitle(yaxis.GetTitle() - + " ratio to fit function") - for i in range(1, hfunratio.GetNbinsX()+1): + yaxis.SetTitle(yaxis.GetTitle() + " ratio to fit function") + for i in range(1, hfunratio.GetNbinsX() + 1): x = hfunratio.GetXaxis().GetBinCenter(i) - y = [hfunratio.GetBinContent(i), - hfunratio.GetBinError(i)] - ratio = y[0]/func.Eval(x) - ratio_error = y[1]/func.Eval(x) + y = [hfunratio.GetBinContent(i), hfunratio.GetBinError(i)] + ratio = y[0] / func.Eval(x) + ratio_error = y[1] / func.Eval(x) hfunratio.SetBinContent(i, ratio) hfunratio.SetBinError(i, ratio_error) # Draw source with D @@ -133,9 +128,9 @@ def main(input_trg="/data/DerivedResults/D0kAnywithJets/vAN-20200304_ROOT6-1/pp_ hratiod.GetXaxis().SetRangeUser(*draw_range) hratiod.Draw("pe") funcd = TF1("func", "([0]/(1+TMath::Exp(-[1]*(x-[2]))))", *draw_range) - funcd.SetParameters(300, .1, 570) - funcd.SetParLimits(1, 0., 10.) - funcd.SetParLimits(2, 0., 1000.) + funcd.SetParameters(300, 0.1, 570) + funcd.SetParLimits(1, 0.0, 10.0) + funcd.SetParLimits(2, 0.0, 1000.0) funcd.SetRange(*fit_range) funcd.SetLineWidth(1) hratiod.Fit(funcd, "L", "", *fit_range) @@ -146,21 +141,18 @@ def main(input_trg="/data/DerivedResults/D0kAnywithJets/vAN-20200304_ROOT6-1/pp_ # Draw both fitting functions ctrigger.cd(6) # pylint: disable=unused-variable - hframe = gPad.DrawFrame(min_draw_range, 0, - max_draw_range, 1, - ";n_tracklets_corr;Efficiency") + hframe = gPad.DrawFrame(min_draw_range, 0, max_draw_range, 1, ";n_tracklets_corr;Efficiency") funcnorm = func.Clone("funcSPDvspt_ntrkl_norm") - funcnorm.FixParameter(0, funcnorm.GetParameter(0)/funcnorm.GetMaximum()) + funcnorm.FixParameter(0, funcnorm.GetParameter(0) / funcnorm.GetMaximum()) funcnormd = funcd.Clone("funcdSPDvspt_ntrkl_norm") - funcnormd.FixParameter(0, funcnormd.GetParameter(0)/funcnormd.GetMaximum()) + funcnormd.FixParameter(0, funcnormd.GetParameter(0) / funcnormd.GetMaximum()) funcnorm.Draw("same") funcnormd.Draw("same") line = TLine(60, 0, 60, 1) line.SetLineStyle(2) line.Draw("same") ctrigger.SaveAs(output_path + "/SPDtrigger.pdf") - foutput = TFile.Open(output_path + "/triggerSPDvspt_ntrkl.root", - "recreate") + foutput = TFile.Open(output_path + "/triggerSPDvspt_ntrkl.root", "recreate") foutput.cd() hratio.SetName("hratioSPDvspt_ntrkl") hratio.Write() @@ -180,54 +172,36 @@ def main(input_trg="/data/DerivedResults/D0kAnywithJets/vAN-20200304_ROOT6-1/pp_ if __name__ == "__main__": # Configuration variables PARSER = argparse.ArgumentParser(description="Compute the trigger") - PARSER.add_argument("--input-trg", - dest="input_trg", - help="input file for triggered data") - PARSER.add_argument("--input-mb", - dest="input_mb", - help="input file for MB data") - PARSER.add_argument("--output-path", - dest="output_path", - help="output path for pdf and root files", - default="/tmp/") - PARSER.add_argument("--min-draw-range", - dest="min_draw_range", - help="Minimum histogram plotting range", - default=0., - type=float) - PARSER.add_argument("--max-draw-range", - dest="max_draw_range", - help="Maximum histogram plotting range", - default=150., - type=float) - 
PARSER.add_argument("--min-fit-range", - dest="min_fit_range", - help="Minimum fit range", - default=40., - type=float) - PARSER.add_argument("--max-fit-range", - dest="max_fit_range", - help="Maximum fit range", - default=100., - type=float) - PARSER.add_argument("--rebin-histo", - help="Rebin the histogram", - dest="rebin_histo", - action="store_true") - PARSER.add_argument("--func-ratio", - help="Shows the ratio between the function and the fitted histogram", - dest="func_ratio", - action="store_true") + PARSER.add_argument("--input-trg", dest="input_trg", help="input file for triggered data") + PARSER.add_argument("--input-mb", dest="input_mb", help="input file for MB data") + PARSER.add_argument("--output-path", dest="output_path", help="output path for pdf and root files", default="/tmp/") + PARSER.add_argument( + "--min-draw-range", dest="min_draw_range", help="Minimum histogram plotting range", default=0.0, type=float + ) + PARSER.add_argument( + "--max-draw-range", dest="max_draw_range", help="Maximum histogram plotting range", default=150.0, type=float + ) + PARSER.add_argument("--min-fit-range", dest="min_fit_range", help="Minimum fit range", default=40.0, type=float) + PARSER.add_argument("--max-fit-range", dest="max_fit_range", help="Maximum fit range", default=100.0, type=float) + PARSER.add_argument("--rebin-histo", help="Rebin the histogram", dest="rebin_histo", action="store_true") + PARSER.add_argument( + "--func-ratio", + help="Shows the ratio between the function and the fitted histogram", + dest="func_ratio", + action="store_true", + ) PARSER.print_help() ARGS = PARSER.parse_args() print(ARGS) - main(input_trg=ARGS.input_trg, - input_mb=ARGS.input_mb, - output_path=ARGS.output_path, - min_draw_range=ARGS.min_draw_range, - max_draw_range=ARGS.max_draw_range, - min_fit_range=ARGS.min_fit_range, - max_fit_range=ARGS.max_fit_range, - rebin_histo=ARGS.rebin_histo, - show_func_ratio=ARGS.func_ratio) + main( + input_trg=ARGS.input_trg, + input_mb=ARGS.input_mb, + output_path=ARGS.output_path, + min_draw_range=ARGS.min_draw_range, + max_draw_range=ARGS.max_draw_range, + min_fit_range=ARGS.min_fit_range, + max_fit_range=ARGS.max_fit_range, + rebin_histo=ARGS.rebin_histo, + show_func_ratio=ARGS.func_ratio, + ) diff --git a/machine_learning_hep/config.py b/machine_learning_hep/config.py index 42f29f4bf8..e801273eed 100644 --- a/machine_learning_hep/config.py +++ b/machine_learning_hep/config.py @@ -17,13 +17,14 @@ """ from itertools import product -from machine_learning_hep.logger import get_logger + from machine_learning_hep.do_variations import modify_dictionary +from machine_learning_hep.logger import get_logger # disable pylint unused-argument because this is done already in view of updating the # database depending on info in there -def update_config(database: dict, run_config: dict, database_overwrite=None): # pylint: disable=unused-argument +def update_config(database: dict, run_config: dict, database_overwrite=None): # pylint: disable=unused-argument """Update database before usage 1. 
overwrite with potential additional user configuration @@ -59,16 +60,17 @@ def update_config(database: dict, run_config: dict, database_overwrite=None): # data_mc = ("data", "mc") pkl_keys = ("pkl_skimmed_dec", "pkl_skimmed_decmerged") for keys in product(data_mc, pkl_keys): - database["mlapplication"][keys[0]][keys[1]][:] = \ - [f"{path}_std" for path in database["mlapplication"][keys[0]][keys[1]]] + database["mlapplication"][keys[0]][keys[1]][:] = [ + f"{path}_std" for path in database["mlapplication"][keys[0]][keys[1]] + ] # ...set the ML working point all to 0 # except for MultiClassification, where bkg cut of 1 is the loosest one for k in data_mc: - database["mlapplication"]["probcutpresel"][k] = \ - [[1 if i == 0 and database["ml"]["mltype"] == "MultiClassification" else 0 \ - for i in range(len(pcut))] \ - for pcut in database["mlapplication"]["probcutpresel"][k]] - database["mlapplication"]["probcutoptimal"] = \ - [[1 if i == 0 and database["ml"]["mltype"] == "MultiClassification" else 0 \ - for i in range(len(pcut))] \ - for pcut in database["mlapplication"]["probcutoptimal"]] + database["mlapplication"]["probcutpresel"][k] = [ + [1 if i == 0 and database["ml"]["mltype"] == "MultiClassification" else 0 for i in range(len(pcut))] + for pcut in database["mlapplication"]["probcutpresel"][k] + ] + database["mlapplication"]["probcutoptimal"] = [ + [1 if i == 0 and database["ml"]["mltype"] == "MultiClassification" else 0 for i in range(len(pcut))] + for pcut in database["mlapplication"]["probcutoptimal"] + ] diff --git a/machine_learning_hep/correlations.py b/machine_learning_hep/correlations.py index 3a3e9f0c2a..e5961aed34 100644 --- a/machine_learning_hep/correlations.py +++ b/machine_learning_hep/correlations.py @@ -15,33 +15,33 @@ """ Methods for correlation and variable plots """ + import pickle from collections import deque -import numpy as np + import matplotlib as mpl import matplotlib.pyplot as plt -from matplotlib.gridspec import GridSpec +import numpy as np import seaborn as sns +from matplotlib.gridspec import GridSpec from machine_learning_hep.logger import get_logger -#mpl.use('Agg') +# mpl.use('Agg') -HIST_COLORS = ['g', 'b', 'r'] +HIST_COLORS = ["g", "b", "r"] -def vardistplot(dfs_input_, mylistvariables_, output_, - binmin, binmax, plot_options_): + +def vardistplot(dfs_input_, mylistvariables_, output_, binmin, binmax, plot_options_): mpl.rcParams.update({"text.usetex": True}) plot_type_name = "prob_cut_scan" - plot_options = plot_options_.get(plot_type_name, {}) \ - if isinstance(plot_options_, dict) else {} + plot_options = plot_options_.get(plot_type_name, {}) if isinstance(plot_options_, dict) else {} figure = plt.figure(figsize=(20, 15)) - figure.suptitle(f"Separation plots for ${binmin} < p_\\mathrm{{T}}/(\\mathrm{{GeV}}/c) < " \ - f"{binmax}$", fontsize=30) + figure.suptitle(f"Separation plots for ${binmin} < p_\\mathrm{{T}}/(\\mathrm{{GeV}}/c) < {binmax}$", fontsize=30) for ind, var in enumerate(mylistvariables_, start=1): - ax = plt.subplot(3, int(len(mylistvariables_)/3+1), ind) - plt.yscale('log') + ax = plt.subplot(3, int(len(mylistvariables_) / 3 + 1), ind) + plt.yscale("log") kwargs = {"alpha": 0.3, "density": True, "bins": 100} po = plot_options.get(var, {}) if "xlim" in po: @@ -59,21 +59,29 @@ def vardistplot(dfs_input_, mylistvariables_, output_, plt.ylabel(po.get("ylabel", "entries"), fontsize=11) ax.legend() plotname = f"{output_}/variablesDistribution_nVar{len(mylistvariables_)}_{binmin}{binmax}.png" - figure.savefig(plotname, bbox_inches='tight') + 
figure.savefig(plotname, bbox_inches="tight") mpl.rcParams.update({"text.usetex": False}) plt.close(figure) -def vardistplot_probscan(dataframe_, mylistvariables_, modelname_, thresharray_, # pylint: disable=too-many-statements - output_, suffix_, opt=1, plot_options_=None): +def vardistplot_probscan( + dataframe_, + mylistvariables_, + modelname_, + thresharray_, # pylint: disable=too-many-statements + output_, + suffix_, + opt=1, + plot_options_=None, +): plot_type_name = "prob_cut_scan" plot_options = {} if isinstance(plot_options_, dict): plot_options = plot_options_.get(plot_type_name, {}) - color = ['C0', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9'] + color = ["C0", "C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9"] figure = plt.figure(figsize=(60, 25)) - gs = GridSpec(3, int(len(mylistvariables_)/3+1)) + gs = GridSpec(3, int(len(mylistvariables_) / 3 + 1)) axes = [figure.add_subplot(gs[i]) for i in range(len(mylistvariables_))] # Sort the thresharray_ @@ -93,7 +101,6 @@ def vardistplot_probscan(dataframe_, mylistvariables_, modelname_, thresharray_, df_skimmed = df_skimmed.query(selml) for i, var in enumerate(mylistvariables_): - # Extract minimum and maximum for x-axis, this is only done once # for each variable if thresh_index == 0: @@ -112,8 +119,8 @@ def vardistplot_probscan(dataframe_, mylistvariables_, modelname_, thresharray_, xrange_max.append(values0.max()) n = len(df_skimmed[var]) - lbl = f'prob > {threshold} n = {n}' - clr = color[thresh_index%len(color)] + lbl = f"prob > {threshold} n = {n}" + clr = color[thresh_index % len(color)] values = df_skimmed[var] his, bina = np.histogram(values, range=(xrange_min[i], xrange_max[i]), bins=100) if thresh_index == 0: @@ -122,19 +129,26 @@ def vardistplot_probscan(dataframe_, mylistvariables_, modelname_, thresharray_, center = (bina[:-1] + bina[1:]) / 2 if opt == 0: - axes[i].set_yscale('log') + axes[i].set_yscale("log") elif opt == 1: his = np.divide(his, ref_hists[i]) axes[i].set_ylim(0.001, 1.1) - axes[i].bar(center, his, align='center', width=width, facecolor=clr, label=lbl) + axes[i].bar(center, his, align="center", width=width, facecolor=clr, label=lbl) axes[i].legend(fontsize=10) plotname = f"{output_}/variables_distribution_{suffix_}_ratio{opt}.png" - figure.savefig(plotname, bbox_inches='tight') + figure.savefig(plotname, bbox_inches="tight") plt.close(figure) -def efficiency_cutscan(dataframe_, mylistvariables_, modelname_, threshold, # pylint: disable=too-many-statements - output_, suffix_, plot_options_=None): +def efficiency_cutscan( + dataframe_, + mylistvariables_, + modelname_, + threshold, # pylint: disable=too-many-statements + output_, + suffix_, + plot_options_=None, +): plot_type_name = "eff_cut_scan" plot_options = {} if isinstance(plot_options_, dict): @@ -143,7 +157,7 @@ def efficiency_cutscan(dataframe_, mylistvariables_, modelname_, threshold, # py dataframe_ = dataframe_.query(selml) figure = plt.figure(figsize=(60, 25)) - gs = GridSpec(3, int(len(mylistvariables_)/3+1)) + gs = GridSpec(3, int(len(mylistvariables_) / 3 + 1)) axes = [figure.add_subplot(gs[i]) for i in range(len(mylistvariables_))] # Available cut options @@ -156,15 +170,14 @@ def efficiency_cutscan(dataframe_, mylistvariables_, modelname_, threshold, # py axes[i].set_xlabel(var, fontsize=30) axes[i].set_ylabel("entries (normalised)", fontsize=30) axes[i].tick_params(labelsize=20) - axes[i].set_yscale('log') + axes[i].set_yscale("log") axes[i].set_ylim(0.1, 1.5) values = dataframe_[var].values - if "abs" in vardir: + if "abs" 
in vardir: cen = var_tuple[2] if len(var_tuple) > 2 else None if cen is None: - get_logger().error("Absolute cut chosen for %s. " \ - "However, no central value provided", var) + get_logger().error("Absolute cut chosen for %s. However, no central value provided", var) continue values = np.array([abs(v - cen) for v in values]) @@ -177,45 +190,49 @@ def efficiency_cutscan(dataframe_, mylistvariables_, modelname_, threshold, # py minv = values.min() maxv = values.max() _, bina = np.histogram(values, range=(minv, maxv), bins=nbinscan) - widthbin = (maxv - minv)/(float)(nbinscan) + widthbin = (maxv - minv) / (float)(nbinscan) width = np.diff(bina) center = (bina[:-1] + bina[1:]) / 2 den = len(values) ratios = deque() if vardir not in cut_options: - get_logger().error("Please choose cut option from %s. " \ - "Your current setting for variable %s is %s", str(cut_options), vardir, var) + get_logger().error( + "Please choose cut option from %s. Your current setting for variable %s is %s", + str(cut_options), + vardir, + var, + ) continue if "lt" in vardir: for ibin in range(nbinscan): - values = values[values > minv+widthbin*ibin] + values = values[values > minv + widthbin * ibin] num = len(values) - eff = float(num)/float(den) + eff = float(num) / float(den) ratios.append(eff) else: for ibin in range(nbinscan, 0, -1): - values = values[values < minv+widthbin*ibin] + values = values[values < minv + widthbin * ibin] num = len(values) - eff = float(num)/float(den) + eff = float(num) / float(den) ratios.appendleft(eff) - lbl = f'prob > {threshold}' - axes[i].bar(center, ratios, align='center', width=width, label=lbl) + lbl = f"prob > {threshold}" + axes[i].bar(center, ratios, align="center", width=width, label=lbl) axes[i].legend(fontsize=30) plotname = f"{output_}/variables_effscan_prob{threshold}_{suffix_}.png" - figure.savefig(plotname, bbox_inches='tight') + figure.savefig(plotname, bbox_inches="tight") plt.close(figure) -def picklesize_cutscan(dataframe_, mylistvariables_, output_, suffix_, plot_options_=None): # pylint: disable=too-many-statements +def picklesize_cutscan(dataframe_, mylistvariables_, output_, suffix_, plot_options_=None): # pylint: disable=too-many-statements plot_type_name = "picklesize_cut_scan" plot_options = {} if isinstance(plot_options_, dict): plot_options = plot_options_.get(plot_type_name, {}) figure = plt.figure(figsize=(60, 25)) - gs = GridSpec(3, int(len(mylistvariables_)/3+1)) + gs = GridSpec(3, int(len(mylistvariables_) / 3 + 1)) axes = [figure.add_subplot(gs[i]) for i in range(len(mylistvariables_))] df_reference_pkl_size = len(pickle.dumps(dataframe_, protocol=4)) @@ -229,10 +246,10 @@ def picklesize_cutscan(dataframe_, mylistvariables_, output_, suffix_, plot_opti axes[i].set_xlabel(var, fontsize=30) axes[i].set_ylabel("rel. 
pickle size after cut", fontsize=30) axes[i].tick_params(labelsize=20) - axes[i].set_yscale('log') + axes[i].set_yscale("log") axes[i].set_ylim(0.005, 1.5) values = dataframe_[var].values - if "abs" in vardir: + if "abs" in vardir: values = np.array([abs(v - cen) for v in values]) nbinscan = 100 if var in plot_options and "xlim" in plot_options[var]: @@ -242,7 +259,7 @@ def picklesize_cutscan(dataframe_, mylistvariables_, output_, suffix_, plot_opti minv = values.min() maxv = values.max() _, bina = np.histogram(values, range=(minv, maxv), bins=nbinscan) - widthbin = (maxv - minv)/(float)(nbinscan) + widthbin = (maxv - minv) / (float)(nbinscan) width = np.diff(bina) center = (bina[:-1] + bina[1:]) / 2 ratios_df_pkl_size = deque() @@ -250,65 +267,59 @@ def picklesize_cutscan(dataframe_, mylistvariables_, output_, suffix_, plot_opti df_skimmed = dataframe_ if "lt" in vardir: for ibin in range(nbinscan): - df_skimmed = df_skimmed.iloc[values > minv+widthbin*ibin] - values = values[values > minv+widthbin*ibin] + df_skimmed = df_skimmed.iloc[values > minv + widthbin * ibin] + values = values[values > minv + widthbin * ibin] num = len(pickle.dumps(df_skimmed, protocol=4)) - eff = float(num)/float(df_reference_pkl_size) + eff = float(num) / float(df_reference_pkl_size) ratios_df_pkl_size.append(eff) num = df_skimmed.shape[0] * df_skimmed.shape[1] - eff = float(num)/float(df_reference_size) + eff = float(num) / float(df_reference_size) ratios_df_size.append(eff) elif "st" in vardir: for ibin in range(nbinscan, 0, -1): - df_skimmed = df_skimmed.iloc[values < minv+widthbin*ibin] - values = values[values < minv+widthbin*ibin] + df_skimmed = df_skimmed.iloc[values < minv + widthbin * ibin] + values = values[values < minv + widthbin * ibin] num = len(pickle.dumps(df_skimmed, protocol=4)) - eff = float(num)/float(df_reference_pkl_size) + eff = float(num) / float(df_reference_pkl_size) ratios_df_pkl_size.appendleft(eff) num = df_skimmed.shape[0] * df_skimmed.shape[1] - eff = float(num)/float(df_reference_size) + eff = float(num) / float(df_reference_size) ratios_df_size.appendleft(eff) - axes[i].bar(center, ratios_df_pkl_size, align='center', width=width, label="rel. pkl size", - alpha=0.5) - axes[i].bar(center, ratios_df_size, align='center', width=width, label="rel. df length", - alpha=0.5) + axes[i].bar(center, ratios_df_pkl_size, align="center", width=width, label="rel. pkl size", alpha=0.5) + axes[i].bar(center, ratios_df_size, align="center", width=width, label="rel. df length", alpha=0.5) axes[i].legend(fontsize=30) plotname = f"{output_}/variables_cutscan_picklesize_{suffix_}.png" - figure.savefig(plotname, bbox_inches='tight') + figure.savefig(plotname, bbox_inches="tight") plt.close(figure) -def scatterplot(dfs_input_, mylistvariablesx_, - mylistvariablesy_, output_, binmin, binmax): - figurecorr = plt.figure(figsize=(30, 20)) # pylint: disable=unused-variable +def scatterplot(dfs_input_, mylistvariablesx_, mylistvariablesy_, output_, binmin, binmax): + figurecorr = plt.figure(figsize=(30, 20)) # pylint: disable=unused-variable for ind, (var_x, var_y) in enumerate(zip(mylistvariablesx_, mylistvariablesy_), start=1): - axcorr = plt.subplot(3, int(len(mylistvariablesx_)/3+1), ind) + axcorr = plt.subplot(3, int(len(mylistvariablesx_) / 3 + 1), ind) plt.xlabel(var_x, fontsize=11) plt.ylabel(var_y, fontsize=11) - title_str = 'Pearson coef. ' + title_str = "Pearson coef. 
" for label, color in zip(dfs_input_, HIST_COLORS): - plt.scatter(dfs_input_[label][var_x], dfs_input_[label][var_y], - alpha=0.4, c=color, label=label) + plt.scatter(dfs_input_[label][var_x], dfs_input_[label][var_y], alpha=0.4, c=color, label=label) pearson = dfs_input_[label].corr(numeric_only=True)[var_x][var_y].round(2) - title_str += f'{label}: {pearson}, ' + title_str += f"{label}: {pearson}, " plt.title(title_str[:-2]) axcorr.legend() plotname = f"{output_}/variablesScatterPlot{binmin}{binmax}.png" - figurecorr.savefig(plotname, bbox_inches='tight') + figurecorr.savefig(plotname, bbox_inches="tight") plt.close(figurecorr) -def correlationmatrix(dataframe, mylistvariables, label, output, binmin, binmax, - plot_options_=None): +def correlationmatrix(dataframe, mylistvariables, label, output, binmin, binmax, plot_options_=None): corr = dataframe[mylistvariables].corr() # Generate a mask for the upper triangle mask = np.triu(np.ones_like(corr, dtype=bool)) _, ax = plt.subplots(figsize=(10, 8)) - #sns.heatmap(corr, mask=np.zeros_like(corr, dtype=np.bool), + # sns.heatmap(corr, mask=np.zeros_like(corr, dtype=np.bool), mpl.rcParams.update({"text.usetex": True}) plot_type_name = "prob_cut_scan" - plot_options = plot_options_.get(plot_type_name, {}) \ - if isinstance(plot_options_, dict) else {} + plot_options = plot_options_.get(plot_type_name, {}) if isinstance(plot_options_, dict) else {} labels = [] for myvar in mylistvariables: if myvar in plot_options and "xlabel" in plot_options[myvar]: @@ -319,11 +330,25 @@ def correlationmatrix(dataframe, mylistvariables, label, output, binmin, binmax, if not labels: labels = "auto" - sns.heatmap(corr, mask=mask, - cmap=sns.diverging_palette(220, 10, as_cmap=True), vmin=-1, vmax=1, - square=True, ax=ax, xticklabels=labels, yticklabels=labels) - ax.text(0.7, 0.9, f"${binmin} < p_\\mathrm{{T}}/(\\mathrm{{GeV}}/c) < {binmax}$\n{label}", - verticalalignment='center', transform=ax.transAxes, fontsize=13) - plt.savefig(output, bbox_inches='tight') + sns.heatmap( + corr, + mask=mask, + cmap=sns.diverging_palette(220, 10, as_cmap=True), + vmin=-1, + vmax=1, + square=True, + ax=ax, + xticklabels=labels, + yticklabels=labels, + ) + ax.text( + 0.7, + 0.9, + f"${binmin} < p_\\mathrm{{T}}/(\\mathrm{{GeV}}/c) < {binmax}$\n{label}", + verticalalignment="center", + transform=ax.transAxes, + fontsize=13, + ) + plt.savefig(output, bbox_inches="tight") mpl.rcParams.update({"text.usetex": False}) plt.close() diff --git a/machine_learning_hep/derive_weights/derive_weights.py b/machine_learning_hep/derive_weights/derive_weights.py index 71a652187d..7fbc795b53 100644 --- a/machine_learning_hep/derive_weights/derive_weights.py +++ b/machine_learning_hep/derive_weights/derive_weights.py @@ -13,24 +13,21 @@ ############################################################################# -import sys -from glob import glob -import multiprocessing as mp import argparse +import multiprocessing as mp import pickle +import sys +from glob import glob import pandas as pd import yaml -from lz4 import frame # pylint: disable=unused-import - -from root_numpy import fill_hist # pylint: disable=import-error - -from ROOT import TFile, TH1F, TH2F # pylint: disable=import-error, no-name-in-module +from lz4 import frame # pylint: disable=unused-import +from ROOT import TH1F, TH2F, TFile # pylint: disable=import-error, no-name-in-module +from root_numpy import fill_hist # pylint: disable=import-error -from machine_learning_hep.utilities import openfile -from machine_learning_hep.io import 
parse_yaml from machine_learning_hep.do_variations import modify_dictionary - +from machine_learning_hep.io import parse_yaml +from machine_learning_hep.utilities import openfile # Needed here for multiprocessing INV_MASS = [None] @@ -40,29 +37,28 @@ def only_one_evt(df_in, dupl_cols): return df_in.drop_duplicates(dupl_cols) + def read_database(path, overwrite_path=None): data_param = None - with open(path, 'r') as param_config: + with open(path, "r") as param_config: data_param = yaml.load(param_config, Loader=yaml.FullLoader) case = list(data_param.keys())[0] data_param = data_param[case] if overwrite_path: overwrite_db = None - with open(overwrite_path, 'r') as param_config: + with open(overwrite_path, "r") as param_config: overwrite_db = yaml.load(param_config, Loader=yaml.FullLoader) modify_dictionary(data_param, overwrite_db) return case, data_param -def summary_histograms_and_write(file_out, histos, histo_names, - histo_xtitles, histo_ytitles): +def summary_histograms_and_write(file_out, histos, histo_names, histo_xtitles, histo_ytitles): histos_added = histos[0] for h_list in histos[1:]: for h_added, h in zip(histos_added, h_list): h_added.Add(h) - for h_add, name, xtitle, ytitle \ - in zip(histos_added, histo_names, histo_xtitles, histo_ytitles): + for h_add, name, xtitle, ytitle in zip(histos_added, histo_names, histo_xtitles, histo_ytitles): h_add.SetName(name) h_add.SetTitle(name) h_add.GetXaxis().SetTitle(xtitle) @@ -71,10 +67,20 @@ def summary_histograms_and_write(file_out, histos, histo_names, file_out.WriteTObject(h_add) -def derive(periods, in_top_dirs, gen_file_name, required_columns, use_mass_window, # pylint: disable=too-many-arguments, too-many-branches - distribution_column, distribution_x_range, file_name_mlwp_map, file_out_name, - queries_periods=None, query_all=None, queries_slices=None): - +def derive( + periods, + in_top_dirs, + gen_file_name, + required_columns, + use_mass_window, # pylint: disable=too-many-arguments, too-many-branches + distribution_column, + distribution_x_range, + file_name_mlwp_map, + file_out_name, + queries_periods=None, + query_all=None, + queries_slices=None, +): """ make n_tracklets distributions for all events @@ -96,7 +102,7 @@ def derive(periods, in_top_dirs, gen_file_name, required_columns, use_mass_windo merge_on = [required_columns[:3]] - for period, dir_applied, query_period in zip(periods, in_top_dirs, queries_periods): # pylint: disable=too-many-nested-blocks + for period, dir_applied, query_period in zip(periods, in_top_dirs, queries_periods): # pylint: disable=too-many-nested-blocks query_tmp = None if query_all: query_tmp = query_all @@ -114,9 +120,10 @@ def derive(periods, in_top_dirs, gen_file_name, required_columns, use_mass_windo files_all = glob(f"{dir_applied}/**/{gen_file_name}", recursive=True) if not file_name_mlwp_map: - args = [((f_reco,), histo_params, required_columns, \ - query_tmp, only_one_evt, merge_on[0], queries_slices, None) \ - for f_reco in files_all] + args = [ + ((f_reco,), histo_params, required_columns, query_tmp, only_one_evt, merge_on[0], queries_slices, None) + for f_reco in files_all + ] else: print(file_name_mlwp_map) @@ -135,23 +142,28 @@ def derive(periods, in_top_dirs, gen_file_name, required_columns, use_mass_windo if not found: print(f"ERROR: {file_name}") sys.exit(0) - args.append(((file_name,), histo_params, required_columns, \ - query_tmp_file, only_one_evt, merge_on[0], queries_slices, None)) - + args.append( + ( + (file_name,), + histo_params, + required_columns, + query_tmp_file, + 
only_one_evt, + merge_on[0], + queries_slices, + None, + ) + ) histos = multi_proc(fill_from_pickles, args, None, 100, 30) histo_names_period = [f"{name}_{period}" for name in histo_names] - summary_histograms_and_write(file_out, histos, histo_names_period, - histo_xtitles, histo_ytitles) + summary_histograms_and_write(file_out, histos, histo_names_period, histo_xtitles, histo_ytitles) file_out.Close() - - -def make_distributions(args, inv_mass, inv_mass_window): # pylint: disable=too-many-statements - +def make_distributions(args, inv_mass, inv_mass_window): # pylint: disable=too-many-statements config = parse_yaml(args.config) database_path = config["database"] @@ -205,7 +217,7 @@ def make_distributions(args, inv_mass, inv_mass_window): # pylint: disable=too-m query_all = trigger_sel in_file_name_gen = database["files_names"]["namefile_reco"] - in_file_name_gen = in_file_name_gen[:in_file_name_gen.find(".")] + in_file_name_gen = in_file_name_gen[: in_file_name_gen.find(".")] if is_ml: pkl_extension = "" @@ -216,10 +228,8 @@ def make_distributions(args, inv_mass, inv_mass_window): # pylint: disable=too-m ml_sel_pt = database["mlapplication"]["probcutoptimal"] pt_bins_low = database["sel_skim_binmin"] pt_bins_up = database["sel_skim_binmax"] - in_file_names = [f"{in_file_name_gen}{ptl}_{ptu}" \ - for ptl, ptu in zip(pt_bins_low, pt_bins_up)] - file_names_cut_map = {ifn: f"{ml_sel_column} > {cut}" \ - for ifn, cut in zip(in_file_names, ml_sel_pt)} + in_file_names = [f"{in_file_name_gen}{ptl}_{ptu}" for ptl, ptu in zip(pt_bins_low, pt_bins_up)] + file_names_cut_map = {ifn: f"{ml_sel_column} > {cut}" for ifn, cut in zip(in_file_names, ml_sel_pt)} else: pkl_extension = "_std" @@ -228,12 +238,23 @@ def make_distributions(args, inv_mass, inv_mass_window): # pylint: disable=too-m # Now make the directory path right in_top_dirs = [f"{itd}{pkl_extension}" for itd in in_top_dirs] - derive(periods, in_top_dirs, in_file_name_gen, column_names, use_mass_window, - distribution, distribution_x_range, file_names_cut_map, out_file, period_cuts, - query_all, slice_cuts) - - -def make_weights(args, *ignore): # pylint: disable=unused-argument + derive( + periods, + in_top_dirs, + in_file_name_gen, + column_names, + use_mass_window, + distribution, + distribution_x_range, + file_names_cut_map, + out_file, + period_cuts, + query_all, + slice_cuts, + ) + + +def make_weights(args, *ignore): # pylint: disable=unused-argument file_data = TFile.Open(args.data, "READ") file_mc = TFile.Open(args.mc, "READ") @@ -256,10 +277,10 @@ def get_mc_histo(histos, period): # norm all for h in mc_histos: if h.GetEntries(): - h.Scale(1. / h.Integral()) + h.Scale(1.0 / h.Integral()) for h in data_histos: if h.GetEntries(): - h.Scale(1. / h.Integral()) + h.Scale(1.0 / h.Integral()) for dh in data_histos: name = dh.GetName() @@ -268,7 +289,7 @@ def get_mc_histo(histos, period): period = name[per_pos:] mc_histo = get_mc_histo(mc_histos, period) - dh.Divide(dh, mc_histo, 1., 1.) 
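# --- editor's note ----------------------------------------------------------
# make_weights above normalises the data and MC distributions to unit integral
# (h.Scale(1.0 / h.Integral())) and then divides data by MC, so every bin ends
# up holding a weight that reweights the MC shape to the one observed in data.
# The same arithmetic in plain NumPy, with toy bin contents (illustrative only):
import numpy as np

data_counts = np.array([120.0, 300.0, 240.0, 90.0])
mc_counts = np.array([200.0, 260.0, 180.0, 60.0])

data_shape = data_counts / data_counts.sum()  # unit-integral data shape
mc_shape = mc_counts / mc_counts.sum()  # unit-integral MC shape
# bin-by-bin ratio; empty MC bins are mapped to weight 0 to avoid division by zero
weights = np.divide(data_shape, mc_shape, out=np.zeros_like(data_shape), where=mc_shape > 0)
print(weights)  # multiply MC entries by these per-bin factors
# --- end editor's note --------------------------------------------------------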
+ dh.Divide(dh, mc_histo, 1.0, 1.0) out_file.cd() dh.Write(f"{dh.GetName()}_weights") @@ -281,40 +302,40 @@ def get_mc_histo(histos, period): # FUNCTIONS # ############# + def _callback(err): print(err) -def multi_proc(function, argument_list, kw_argument_list, maxperchunk, max_n_procs=10): - chunks_args = [argument_list[x:x+maxperchunk] \ - for x in range(0, len(argument_list), maxperchunk)] +def multi_proc(function, argument_list, kw_argument_list, maxperchunk, max_n_procs=10): + chunks_args = [argument_list[x : x + maxperchunk] for x in range(0, len(argument_list), maxperchunk)] if not kw_argument_list: kw_argument_list = [{} for _ in argument_list] - chunks_kwargs = [kw_argument_list[x:x+maxperchunk] \ - for x in range(0, len(kw_argument_list), maxperchunk)] + chunks_kwargs = [kw_argument_list[x : x + maxperchunk] for x in range(0, len(kw_argument_list), maxperchunk)] res_all = [] for chunk_args, chunk_kwargs in zip(chunks_args, chunks_kwargs): print("Processing new chunck size=", maxperchunk) pool = mp.Pool(max_n_procs) - res = [pool.apply_async(function, args=args, kwds=kwds, error_callback=_callback) \ - for args, kwds in zip(chunk_args, chunk_kwargs)] + res = [ + pool.apply_async(function, args=args, kwds=kwds, error_callback=_callback) + for args, kwds in zip(chunk_args, chunk_kwargs) + ] pool.close() pool.join() res_all.extend(res) - res_list = None try: res_list = [r.get() for r in res_all] - except Exception as e: # pylint: disable=broad-except + except Exception as e: # pylint: disable=broad-except print("EXCEPTION") print(e) return res_list -def fill_from_pickles(file_paths, histo_params, cols=None, query=None, skim_func=None, - skim_func_args=None, queries=None, merge_on=None): - +def fill_from_pickles( + file_paths, histo_params, cols=None, query=None, skim_func=None, skim_func_args=None, queries=None, merge_on=None +): print(f"Process files {file_paths}") dfs = [pickle.load(openfile(f, "rb")) for f in file_paths] @@ -339,7 +360,6 @@ def fill_from_pickles(file_paths, histo_params, cols=None, query=None, skim_func # Skim the dataframe according to user function df = skim_func(df, skim_func_args) - histos = [] if not queries: queries = [None] * len(histo_params) @@ -374,10 +394,6 @@ def fill_from_pickles(file_paths, histo_params, cols=None, query=None, skim_func return histos - - - - def main(): parser = argparse.ArgumentParser() diff --git a/machine_learning_hep/do_variations.py b/machine_learning_hep/do_variations.py index 365281f8ee..ff59f9a29e 100644 --- a/machine_learning_hep/do_variations.py +++ b/machine_learning_hep/do_variations.py @@ -440,7 +440,7 @@ def main(yaml_in: str, yaml_diff: str, analysis: str, config: str, clean: bool, with open(logfile, "w", encoding="utf-8") as ana_out: subprocess.Popen( # pylint: disable=consider-using-with shlex.split( - "mlhep " "-a %s -r %s -d %s -b --delete-force" % (analysis, config_final, yaml_out) + "mlhep -a %s -r %s -d %s -b --delete-force" % (analysis, config_final, yaml_out) ), stdout=ana_out, stderr=ana_out, diff --git a/machine_learning_hep/examples/plot_hfmassfitter.py b/machine_learning_hep/examples/plot_hfmassfitter.py index cce254ea91..ffffc366c4 100644 --- a/machine_learning_hep/examples/plot_hfmassfitter.py +++ b/machine_learning_hep/examples/plot_hfmassfitter.py @@ -15,28 +15,46 @@ """ main script for doing final stage analysis """ + import os + # pylint: disable=unused-wildcard-import, wildcard-import from array import * + # pylint: disable=import-error, no-name-in-module, unused-import import yaml -from ROOT import 
TFile, TH1F, TCanvas -from ROOT import gStyle, TLegend, TLatex -from ROOT import Double -from ROOT import gROOT, kRed, kGreen, kBlack, kBlue, kOrange, kViolet, kAzure -from ROOT import TStyle, gPad +from ROOT import ( + TH1F, + Double, + TCanvas, + TFile, + TLatex, + TLegend, + TStyle, + gPad, + gROOT, + gStyle, + kAzure, + kBlack, + kBlue, + kGreen, + kOrange, + kRed, + kViolet, +) + from machine_learning_hep.utilities import make_file_path from machine_learning_hep.utilities_plot import load_root_style + # pylint: disable=import-error, no-name-in-module, unused-import # pylint: disable=too-many-statements # pylint: disable=too-many-branches # pylint: disable=too-many-locals def plot_hfmassfitter(case, arraytype): - load_root_style() - with open("../data/database_ml_parameters_%s.yml" % case, 'r') as param_config: + with open("../data/database_ml_parameters_%s.yml" % case, "r") as param_config: data_param = yaml.load(param_config, Loader=yaml.FullLoader) folder_plots = data_param[case]["analysis"]["dir_general_plots"] @@ -73,37 +91,29 @@ def plot_hfmassfitter(case, arraytype): d_resultsdataHM = data_param[case]["analysis"][arraytype[1]]["data"]["resultsallp"] yields_filename = "yields" - - signfhistos = [TH1F("hsignf%d" % (imult), "", \ - p_nptbins, array("d", ptranges)) \ - for imult in range(p_nbin2)] - meanhistos = [TH1F("hmean%d" % (imult), "", \ - p_nptbins, array("d", ptranges)) \ - for imult in range(p_nbin2)] - sigmahistos = [TH1F("hsigma%d" % (imult), "", \ - p_nptbins, array("d", ptranges)) \ - for imult in range(p_nbin2)] - sighistos = [TH1F("hsig%d" % (imult), "", \ - p_nptbins, array("d", ptranges)) \ - for imult in range(p_nbin2)] - backhistos = [TH1F("hback%d" % (imult), "", \ - p_nptbins, array("d", ptranges)) \ - for imult in range(p_nbin2)] + signfhistos = [TH1F("hsignf%d" % (imult), "", p_nptbins, array("d", ptranges)) for imult in range(p_nbin2)] + meanhistos = [TH1F("hmean%d" % (imult), "", p_nptbins, array("d", ptranges)) for imult in range(p_nbin2)] + sigmahistos = [TH1F("hsigma%d" % (imult), "", p_nptbins, array("d", ptranges)) for imult in range(p_nbin2)] + sighistos = [TH1F("hsig%d" % (imult), "", p_nptbins, array("d", ptranges)) for imult in range(p_nbin2)] + backhistos = [TH1F("hback%d" % (imult), "", p_nptbins, array("d", ptranges)) for imult in range(p_nbin2)] for imult, iplot in enumerate(plotbinMB): if not iplot: continue - func_filename = make_file_path(d_resultsdataMB, yields_filename, "root", - None, [case, arraytype[0]]) + func_filename = make_file_path(d_resultsdataMB, yields_filename, "root", None, [case, arraytype[0]]) func_file = TFile.Open(func_filename, "READ") for ipt in range(p_nptbins): bin_id = bin_matchingMB[ipt] - suffix = "%s%d_%d_%.2f%s_%.2f_%.2f" % \ - (v_var_binning, lpt_finbinminMB[ipt], - lpt_finbinmaxMB[ipt], lpt_probcutfin[bin_id], - v_var2_binningMB, lvar2_binminMB[imult], - lvar2_binmaxMB[imult]) + suffix = "%s%d_%d_%.2f%s_%.2f_%.2f" % ( + v_var_binning, + lpt_finbinminMB[ipt], + lpt_finbinmaxMB[ipt], + lpt_probcutfin[bin_id], + v_var2_binningMB, + lvar2_binminMB[imult], + lvar2_binmaxMB[imult], + ) load_dir = func_file.GetDirectory(suffix) mass_fitter = load_dir.Get("fitter") sign = 0 @@ -136,17 +146,20 @@ def plot_hfmassfitter(case, arraytype): for imult, iplot in enumerate(plotbinHM): if not iplot: continue - func_filename = make_file_path(d_resultsdataHM, yields_filename, "root", - None, [case, arraytype[1]]) + func_filename = make_file_path(d_resultsdataHM, yields_filename, "root", None, [case, arraytype[1]]) func_file = 
TFile.Open(func_filename, "READ") for ipt in range(p_nptbins): bin_id = bin_matchingHM[ipt] - suffix = "%s%d_%d_%.2f%s_%.2f_%.2f" % \ - (v_var_binning, lpt_finbinminHM[ipt], - lpt_finbinmaxHM[ipt], lpt_probcutfin[bin_id], - v_var2_binningHM, lvar2_binminHM[imult], - lvar2_binmaxHM[imult]) + suffix = "%s%d_%d_%.2f%s_%.2f_%.2f" % ( + v_var_binning, + lpt_finbinminHM[ipt], + lpt_finbinmaxHM[ipt], + lpt_probcutfin[bin_id], + v_var2_binningHM, + lvar2_binminHM[imult], + lvar2_binmaxHM[imult], + ) load_dir = func_file.GetDirectory(suffix) mass_fitter = load_dir.Get("fitter") sign = 0 @@ -176,8 +189,8 @@ def plot_hfmassfitter(case, arraytype): backhistos[imult].SetBinContent(ipt + 1, rootback) backhistos[imult].SetBinError(ipt + 1, rooteback) - #Significance fit plot - csign = TCanvas('cSign', 'The Fit Canvas') + # Significance fit plot + csign = TCanvas("cSign", "The Fit Canvas") csign.SetCanvasSize(1500, 1500) csign.SetWindowSize(500, 500) maxplot = 25 @@ -187,14 +200,14 @@ def plot_hfmassfitter(case, arraytype): maxplot = 40 csign.cd(1).DrawFrame(0, 0, 30, maxplot, ";#it{p}_{T} (GeV/#it{c});Significance %s" % name) - leg = TLegend(.25, .65, .65, .85) + leg = TLegend(0.25, 0.65, 0.65, 0.85) leg.SetBorderSize(0) leg.SetFillColor(0) leg.SetFillStyle(0) leg.SetTextFont(42) leg.SetTextSize(0.035) - colors = [kBlack, kRed, kGreen+2, kBlue, kViolet-1, kOrange+2, kAzure+1, kOrange-7] + colors = [kBlack, kRed, kGreen + 2, kBlue, kViolet - 1, kOrange + 2, kAzure + 1, kOrange - 7] for imult, iplot in enumerate(plotbinMB): if not iplot: continue @@ -202,8 +215,7 @@ def plot_hfmassfitter(case, arraytype): signfhistos[imult].SetMarkerColor(colors[imult % len(colors)]) signfhistos[imult].SetMarkerStyle(21) signfhistos[imult].Draw("same") - legyieldstring = "%.1f #leq %s < %.1f (MB)" % \ - (lvar2_binminMB[imult], latexbin2var, lvar2_binmaxMB[imult]) + legyieldstring = "%.1f #leq %s < %.1f (MB)" % (lvar2_binminMB[imult], latexbin2var, lvar2_binmaxMB[imult]) leg.AddEntry(signfhistos[imult], legyieldstring, "LEP") for imult, iplot in enumerate(plotbinHM): @@ -213,16 +225,13 @@ def plot_hfmassfitter(case, arraytype): signfhistos[imult].SetMarkerColor(colors[imult % len(colors)]) signfhistos[imult].SetMarkerStyle(21) signfhistos[imult].Draw("same") - legyieldstring = "%.1f #leq %s < %.1f (HM)" % \ - (lvar2_binminHM[imult], latexbin2var, lvar2_binmaxHM[imult]) + legyieldstring = "%.1f #leq %s < %.1f (HM)" % (lvar2_binminHM[imult], latexbin2var, lvar2_binmaxHM[imult]) leg.AddEntry(signfhistos[imult], legyieldstring, "LEP") leg.Draw() - csign.SaveAs("%s/MassFit_Signf_%s_%scombined%s.eps" % \ - (folder_plots, case, arraytype[0], arraytype[1])) - + csign.SaveAs("%s/MassFit_Signf_%s_%scombined%s.eps" % (folder_plots, case, arraytype[0], arraytype[1])) - #Mean fit plot - cmean = TCanvas('cMean', 'The Fit Canvas') + # Mean fit plot + cmean = TCanvas("cMean", "The Fit Canvas") cmean.SetCanvasSize(1500, 1500) cmean.SetWindowSize(500, 500) minplot = 2.27 @@ -251,12 +260,10 @@ def plot_hfmassfitter(case, arraytype): meanhistos[imult].SetMarkerStyle(21) meanhistos[imult].Draw("same") leg.Draw() - cmean.SaveAs("%s/MassFit_Mean_%s_%scombined%s.eps" % \ - (folder_plots, case, arraytype[0], arraytype[1])) - + cmean.SaveAs("%s/MassFit_Mean_%s_%scombined%s.eps" % (folder_plots, case, arraytype[0], arraytype[1])) - #Sigma fit plot (to add MC!) - csigm = TCanvas('cSigma', 'The Fit Canvas') + # Sigma fit plot (to add MC!) 
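# --- editor's note ----------------------------------------------------------
# The summary canvases in this file (and the __style_histograms helper earlier
# in this patch) pick colours and markers with modular indexing, so an
# arbitrary number of overlaid histograms cycles through a fixed palette. A
# compact sketch of that pattern; the histograms here are placeholders, not
# analysis output:
from ROOT import TH1F, kBlue, kGreen, kRed  # pylint: disable=no-name-in-module

colours = [kRed, kGreen + 2, kBlue]
markers = [20, 21, 22, 23]
histos = [TH1F(f"h{i}", "", 10, 0.0, 30.0) for i in range(6)]
for i, h in enumerate(histos):
    h.SetMarkerColor(colours[i % len(colours)])  # wraps around after 3 entries
    h.SetMarkerStyle(markers[i % len(markers)])  # wraps around after 4 entries
# --- end editor's note --------------------------------------------------------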
+ csigm = TCanvas("cSigma", "The Fit Canvas") csigm.SetCanvasSize(1500, 1500) csigm.SetWindowSize(500, 500) maxplot = 0.03 @@ -280,17 +287,15 @@ def plot_hfmassfitter(case, arraytype): sigmahistos[imult].SetMarkerStyle(21) sigmahistos[imult].Draw("same") leg.Draw() - csigm.SaveAs("%s/MassFit_Sigma_%s_%scombined%s.eps" % \ - (folder_plots, case, arraytype[0], arraytype[1])) + csigm.SaveAs("%s/MassFit_Sigma_%s_%scombined%s.eps" % (folder_plots, case, arraytype[0], arraytype[1])) - - #Signal fit plot - csig = TCanvas('cSig', 'The Fit Canvas') + # Signal fit plot + csig = TCanvas("cSig", "The Fit Canvas") csig.SetCanvasSize(1500, 1500) csig.SetWindowSize(500, 500) csig.cd(1) - #First draw HM for scale + # First draw HM for scale for imult, iplot in enumerate(plotbinHM): if not iplot: continue @@ -309,17 +314,15 @@ def plot_hfmassfitter(case, arraytype): sighistos[imult].SetMarkerStyle(21) sighistos[imult].Draw("same") leg.Draw() - csig.SaveAs("%s/MassFit_Signal_%s_%scombined%s.eps" % \ - (folder_plots, case, arraytype[0], arraytype[1])) - + csig.SaveAs("%s/MassFit_Signal_%s_%scombined%s.eps" % (folder_plots, case, arraytype[0], arraytype[1])) - #Background fit plot - cback = TCanvas('cBack', 'The Fit Canvas') + # Background fit plot + cback = TCanvas("cBack", "The Fit Canvas") cback.SetCanvasSize(1500, 1500) cback.SetWindowSize(500, 500) cback.cd(1) - #First draw HM for scale + # First draw HM for scale for imult, iplot in enumerate(plotbinHM): if not iplot: continue @@ -338,13 +341,13 @@ def plot_hfmassfitter(case, arraytype): backhistos[imult].SetMarkerStyle(21) backhistos[imult].Draw("same") leg.Draw() - cback.SaveAs("%s/MassFit_Background_%s_%scombined%s.eps" % \ - (folder_plots, case, arraytype[0], arraytype[1])) + cback.SaveAs("%s/MassFit_Background_%s_%scombined%s.eps" % (folder_plots, case, arraytype[0], arraytype[1])) + ##################################### gROOT.SetBatch(True) -#EXAMPLE HOW TO USE plot_hfmassfitter +# EXAMPLE HOW TO USE plot_hfmassfitter # ---> Combines and plots the output of AliHFInvMassFitter in nice way -#plot_hfmassfitter("Dspp", ["MBvspt_ntrkl", "SPDvspt"]) +# plot_hfmassfitter("Dspp", ["MBvspt_ntrkl", "SPDvspt"]) diff --git a/machine_learning_hep/examples/plot_hfptspectrum.py b/machine_learning_hep/examples/plot_hfptspectrum.py index f1930ec5f9..34946f6582 100644 --- a/machine_learning_hep/examples/plot_hfptspectrum.py +++ b/machine_learning_hep/examples/plot_hfptspectrum.py @@ -15,27 +15,56 @@ """ main script for doing final stage analysis """ + import os -from math import sqrt -from shutil import copyfile + # pylint: disable=unused-wildcard-import, wildcard-import from array import * +from math import sqrt +from shutil import copyfile + # pylint: disable=import-error, no-name-in-module, unused-import import yaml -from ROOT import TFile, TH1F, TCanvas -from ROOT import gStyle, TLegend, TLatex -from ROOT import gROOT, kRed, kGreen, kBlack, kBlue, kOrange, kViolet, kAzure -from ROOT import TStyle, gPad -from machine_learning_hep.utilities_plot import plot_histograms, load_root_style +from ROOT import ( + TH1F, + TCanvas, + TFile, + TLatex, + TLegend, + TStyle, + gPad, + gROOT, + gStyle, + kAzure, + kBlack, + kBlue, + kGreen, + kOrange, + kRed, + kViolet, +) + +from machine_learning_hep.utilities_plot import load_root_style, plot_histograms FILES_NOT_FOUND = [] + + # One single particle ratio # pylint: disable=too-many-branches, too-many-arguments -def plot_hfptspectrum_ml_over_std(case_ml, ana_type_ml, period_number, filepath_std, case_std, - scale_std=None, 
map_std_bins=None, mult_bin=None,
-                                    ml_histo_names=None, std_histo_names=None, suffix=""):
-
-    with open("../data/database_ml_parameters_%s.yml" % case_ml, 'r') as param_config:
+def plot_hfptspectrum_ml_over_std(
+    case_ml,
+    ana_type_ml,
+    period_number,
+    filepath_std,
+    case_std,
+    scale_std=None,
+    map_std_bins=None,
+    mult_bin=None,
+    ml_histo_names=None,
+    std_histo_names=None,
+    suffix="",
+):
+    with open("../data/database_ml_parameters_%s.yml" % case_ml, "r") as param_config:
         data_param = yaml.load(param_config, Loader=yaml.FullLoader)
     if period_number < 0:
         filepath_ml = data_param[case_ml]["analysis"][ana_type_ml]["data"]["resultsallp"]
@@ -56,16 +85,28 @@ def plot_hfptspectrum_ml_over_std(case_ml, ana_type_ml, period_number, filepath_
     file_std = TFile.Open(filepath_std, "READ")
 
     # Collect histo names to quickly loop later
-    histo_names = ["hDirectMCpt", "hFeedDownMCpt", "hDirectMCptMax", "hDirectMCptMin",
-                   "hFeedDownMCptMax", "hFeedDownMCptMin", "hDirectEffpt", "hFeedDownEffpt",
-                   "hRECpt", "histoYieldCorr", "histoYieldCorrMax", "histoYieldCorrMin",
-                   "histoSigmaCorr", "histoSigmaCorrMax", "histoSigmaCorrMin"]
+    histo_names = [
+        "hDirectMCpt",
+        "hFeedDownMCpt",
+        "hDirectMCptMax",
+        "hDirectMCptMin",
+        "hFeedDownMCptMax",
+        "hFeedDownMCptMin",
+        "hDirectEffpt",
+        "hFeedDownEffpt",
+        "hRECpt",
+        "histoYieldCorr",
+        "histoYieldCorrMax",
+        "histoYieldCorrMin",
+        "histoSigmaCorr",
+        "histoSigmaCorrMax",
+        "histoSigmaCorrMin",
+    ]
     if ml_histo_names is None:
         ml_histo_names = histo_names
     if std_histo_names is None:
         std_histo_names = histo_names
 
-
     for hn_ml, hn_std in zip(ml_histo_names, std_histo_names):
         histo_ml = file_ml.Get(hn_ml)
         histo_std_tmp = file_std.Get(hn_std)
@@ -87,13 +128,14 @@ def plot_hfptspectrum_ml_over_std(case_ml, ana_type_ml, period_number, filepath_
 
         for ml_bin, std_bins in map_std_bins:
             for b in std_bins:
-                contents[ml_bin-1] += histo_std_tmp.GetBinWidth(b) * \
-                    histo_std_tmp.GetBinContent(b) / histo_ml.GetBinWidth(ml_bin)
-                errors[ml_bin-1] += histo_std_tmp.GetBinError(b) * histo_std_tmp.GetBinError(b)
+                contents[ml_bin - 1] += (
+                    histo_std_tmp.GetBinWidth(b) * histo_std_tmp.GetBinContent(b) / histo_ml.GetBinWidth(ml_bin)
+                )
+                errors[ml_bin - 1] += histo_std_tmp.GetBinError(b) * histo_std_tmp.GetBinError(b)
 
             for b in range(histo_std.GetNbinsX()):
-                histo_std.SetBinContent(b+1, contents[b])
-                histo_std.SetBinError(b+1, sqrt(errors[b]))
+                histo_std.SetBinContent(b + 1, contents[b])
+                histo_std.SetBinError(b + 1, sqrt(errors[b]))
 
         else:
             histo_std = histo_std_tmp.Clone("std_cloned")
@@ -111,28 +153,46 @@ def plot_hfptspectrum_ml_over_std(case_ml, ana_type_ml, period_number, filepath_
 
         save_path = f"{folder_plots}/{hn_ml}_ml_std_{case_ml}_over_{case_std}_{suffix}.eps"
 
-        plot_histograms([h_ratio], False, False, None, histo_ml.GetTitle(),
-                        "#it{p}_{T} (GeV/#it{c}", f"{name} / {case_std}", "",
-                        save_path)
+        plot_histograms(
+            [h_ratio],
+            False,
+            False,
+            None,
+            histo_ml.GetTitle(),
+            "#it{p}_{T} (GeV/#it{c})",
+            f"{name} / {case_std}",
+            "",
+            save_path,
+        )
 
 
-# pylint: disable=too-many-locals, too-many-branches, 
too-many-statements, too-many-arguments +def compare_ml_std_ratio( + case_ml_1, + case_ml_2, + ana_type_ml, + period_number, + filepath_std_1, + filepath_std_2, + scale_std_1=None, + scale_std_2=None, + map_std_bins=None, + mult_bin=None, + ml_histo_names=None, + std_histo_names_1=None, + std_histo_names_2=None, + suffix="", +): + with open("../data/database_ml_parameters_%s.yml" % case_ml_1, "r") as param_config: data_param_1 = yaml.load(param_config, Loader=yaml.FullLoader) - with open("../data/database_ml_parameters_%s.yml" % case_ml_2, 'r') as param_config: + with open("../data/database_ml_parameters_%s.yml" % case_ml_2, "r") as param_config: data_param_2 = yaml.load(param_config, Loader=yaml.FullLoader) if period_number < 0: filepath_ml_1 = data_param_1[case_ml_1]["analysis"][ana_type_ml]["data"]["resultsallp"] filepath_ml_2 = data_param_2[case_ml_2]["analysis"][ana_type_ml]["data"]["resultsallp"] else: - filepath_ml_1 = \ - data_param_1[case_ml_1]["analysis"][ana_type_ml]["data"]["results"][period_number] - filepath_ml_2 = \ - data_param_2[case_ml_2]["analysis"][ana_type_ml]["data"]["results"][period_number] + filepath_ml_1 = data_param_1[case_ml_1]["analysis"][ana_type_ml]["data"]["results"][period_number] + filepath_ml_2 = data_param_2[case_ml_2]["analysis"][ana_type_ml]["data"]["results"][period_number] name_1 = data_param_1[case_ml_1]["analysis"][ana_type_ml]["latexnamehadron"] name_2 = data_param_2[case_ml_2]["analysis"][ana_type_ml]["latexnamehadron"] @@ -154,10 +214,23 @@ def compare_ml_std_ratio(case_ml_1, case_ml_2, ana_type_ml, period_number, filep file_std_2 = TFile.Open(filepath_std_2, "READ") # Collect histo names to quickly loop later - histo_names = ["hDirectMCpt", "hFeedDownMCpt", "hDirectMCptMax", "hDirectMCptMin", - "hFeedDownMCptMax", "hFeedDownMCptMin", "hDirectEffpt", "hFeedDownEffpt", - "hRECpt", "histoYieldCorr", "histoYieldCorrMax", "histoYieldCorrMin", - "histoSigmaCorr", "histoSigmaCorrMax", "histoSigmaCorrMin"] + histo_names = [ + "hDirectMCpt", + "hFeedDownMCpt", + "hDirectMCptMax", + "hDirectMCptMin", + "hFeedDownMCptMax", + "hFeedDownMCptMin", + "hDirectEffpt", + "hFeedDownEffpt", + "hRECpt", + "histoYieldCorr", + "histoYieldCorrMax", + "histoYieldCorrMin", + "histoSigmaCorr", + "histoSigmaCorrMax", + "histoSigmaCorrMin", + ] if ml_histo_names is None: ml_histo_names = histo_names @@ -198,26 +271,24 @@ def compare_ml_std_ratio(case_ml_1, case_ml_2, ana_type_ml, period_number, filep for ml_bin, std_bins in map_std_bins: for b in std_bins: - contents[ml_bin-1] += histo_std_tmp_1.GetBinContent(b) / len(std_bins) - errors[ml_bin-1] += \ - histo_std_tmp_1.GetBinError(b) * histo_std_tmp_1.GetBinError(b) + contents[ml_bin - 1] += histo_std_tmp_1.GetBinContent(b) / len(std_bins) + errors[ml_bin - 1] += histo_std_tmp_1.GetBinError(b) * histo_std_tmp_1.GetBinError(b) for b in range(histo_std_1.GetNbinsX()): - histo_std_1.SetBinContent(b+1, contents[b]) - histo_std_1.SetBinError(b+1, sqrt(errors[b])) + histo_std_1.SetBinContent(b + 1, contents[b]) + histo_std_1.SetBinError(b + 1, sqrt(errors[b])) contents = [0] * histo_ml_2.GetNbinsX() errors = [0] * histo_ml_2.GetNbinsX() for ml_bin, std_bins in map_std_bins: for b in std_bins: - contents[ml_bin-1] += histo_std_tmp_2.GetBinContent(b) / len(std_bins) - errors[ml_bin-1] += \ - histo_std_tmp_2.GetBinError(b) * histo_std_tmp_2.GetBinError(b) + contents[ml_bin - 1] += histo_std_tmp_2.GetBinContent(b) / len(std_bins) + errors[ml_bin - 1] += histo_std_tmp_2.GetBinError(b) * histo_std_tmp_2.GetBinError(b) for b in 
range(histo_std_2.GetNbinsX()):
-                histo_std_2.SetBinContent(b+1, contents[b])
-                histo_std_2.SetBinError(b+1, sqrt(errors[b]))
+                histo_std_2.SetBinContent(b + 1, contents[b])
+                histo_std_2.SetBinError(b + 1, sqrt(errors[b]))
 
         else:
             histo_std_1 = histo_std_tmp_1.Clone("std_cloned_1")
@@ -238,24 +309,44 @@ def compare_ml_std_ratio(case_ml_1, case_ml_2, ana_type_ml, period_number, filep
             print("creating folder ", folder_plots)
             os.makedirs(folder_plots)
 
-        save_path = f"{folder_plots}/ratio_{case_ml_1}_{case_ml_2}_{hn_ml}_ml_std_mult_" \
-                    f"{mult_bin}_period_{period_number}{suffix}.eps"
-
-        plot_histograms([histo_ratio_std, histo_ratio_ml], True, True, ["STD", "ML"], "Ratio",
-                        "#it{p}_{T} (GeV/#it{c}", f"{name_1} / {name_2}", "ML / STD",
-                        save_path)
+        save_path = (
+            f"{folder_plots}/ratio_{case_ml_1}_{case_ml_2}_{hn_ml}_ml_std_mult_"
+            f"{mult_bin}_period_{period_number}{suffix}.eps"
+        )
+
+        plot_histograms(
+            [histo_ratio_std, histo_ratio_ml],
+            True,
+            True,
+            ["STD", "ML"],
+            "Ratio",
+            "#it{p}_{T} (GeV/#it{c})",
+            f"{name_1} / {name_2}",
+            "ML / STD",
+            save_path,
+        )
 
         folder_plots = data_param_2[case_ml_2]["analysis"]["dir_general_plots"]
         if not os.path.exists(folder_plots):
             print("creating folder ", folder_plots)
             os.makedirs(folder_plots)
 
-        save_path = f"{folder_plots}/ratio_{case_ml_1}_{case_ml_2}_{hn_ml}_ml_std_mult_" \
-                    f"{mult_bin}_period_{period_number}{suffix}.eps"
+        save_path = (
+            f"{folder_plots}/ratio_{case_ml_1}_{case_ml_2}_{hn_ml}_ml_std_mult_"
+            f"{mult_bin}_period_{period_number}{suffix}.eps"
+        )
 
-        plot_histograms([histo_ratio_std, histo_ratio_ml], True, True, ["STD", "ML"], "Ratio",
-                        "#it{p}_{T} (GeV/#it{c}", f"{name_1} / {name_2}", "ML / STD",
-                        save_path)
+        plot_histograms(
+            [histo_ratio_std, histo_ratio_ml],
+            True,
+            True,
+            ["STD", "ML"],
+            "Ratio",
+            "#it{p}_{T} (GeV/#it{c})",
+            f"{name_1} / {name_2}",
+            "ML / STD",
+            save_path,
+        )
 
 
 # pylint: disable=import-error, no-name-in-module, unused-import
@@ -263,10 +354,9 @@ def compare_ml_std_ratio(case_ml_1, case_ml_2, ana_type_ml, period_number, filep
 # pylint: disable=too-many-statements
 # pylint: disable=too-many-branches
 # pylint: disable=too-many-locals
 def plot_hfptspectrum_comb(case, arraytype):
-
     load_root_style()
 
-    with open("../data/database_ml_parameters_%s.yml" % case, 'r') as param_config:
+    with open("../data/database_ml_parameters_%s.yml" % case, "r") as param_config:
         data_param = yaml.load(param_config, Loader=yaml.FullLoader)
 
     folder_plots = data_param[case]["analysis"]["dir_general_plots"]
@@ -286,47 +376,47 @@ def plot_hfptspectrum_comb(case, arraytype):
     br = data_param[case]["ml"]["opt"]["BR"]
     sigmav0 = data_param[case]["analysis"]["sigmav0"]
 
-    fileres_MB_allperiods = TFile.Open("%s/finalcross%s%smulttot.root" % \
-                                       (folder_MB_allperiods, case, arraytype[0]))
-    fileres_MB = [TFile.Open("%s/finalcross%s%smult%d.root" % (folder_MB_allperiods, \
-                  case, arraytype[0], i)) for i in range(len(plotbinMB))]
+    fileres_MB_allperiods = TFile.Open("%s/finalcross%s%smulttot.root" % (folder_MB_allperiods, case, arraytype[0]))
+    fileres_MB = [
+        TFile.Open("%s/finalcross%s%smult%d.root" % (folder_MB_allperiods, case, arraytype[0], i))
+        for i in range(len(plotbinMB))
+    ]
 
-    fileres_trig_allperiods = TFile.Open("%s/finalcross%s%smulttot.root" % \
-                                         (folder_triggered, case, arraytype[1]))
-    fileres_trig = [TFile.Open("%s/finalcross%s%smult%d.root" % (folder_triggered, \
-                    case, arraytype[1], i)) for i in range(len(plotbinMB))]
+    fileres_trig_allperiods = TFile.Open("%s/finalcross%s%smulttot.root" % (folder_triggered, case, arraytype[1]))
+    fileres_trig = [
+        
TFile.Open("%s/finalcross%s%smult%d.root" % (folder_triggered, case, arraytype[1], i)) + for i in range(len(plotbinMB)) + ] - #Corrected yield plot - ccross = TCanvas('cCross', 'The Fit Canvas') + # Corrected yield plot + ccross = TCanvas("cCross", "The Fit Canvas") ccross.SetCanvasSize(1500, 1500) ccross.SetWindowSize(500, 500) - ccross.cd(1).DrawFrame(0, 1.e-9, 30, 10, ";#it{p}_{T} (GeV/#it{c});Corrected yield %s" % name) - #ccross.SetLogx() + ccross.cd(1).DrawFrame(0, 1.0e-9, 30, 10, ";#it{p}_{T} (GeV/#it{c});Corrected yield %s" % name) + # ccross.SetLogx() - legyield = TLegend(.25, .65, .65, .85) + legyield = TLegend(0.25, 0.65, 0.65, 0.85) legyield.SetBorderSize(0) legyield.SetFillColor(0) legyield.SetFillStyle(0) legyield.SetTextFont(42) legyield.SetTextSize(0.035) - colors = [kBlack, kRed, kGreen+2, kBlue, kViolet-1, kOrange+2, kAzure+1, kOrange-7] + colors = [kBlack, kRed, kGreen + 2, kBlue, kViolet - 1, kOrange + 2, kAzure + 1, kOrange - 7] tryunmerged = True if fileres_MB_allperiods and fileres_trig_allperiods: - for imult, iplot in enumerate(plotbinMB): if not iplot: continue gPad.SetLogy() hyield = fileres_MB_allperiods.Get("histoSigmaCorr%d" % (imult)) - hyield.Scale(1./(br * sigmav0 * 1e12)) + hyield.Scale(1.0 / (br * sigmav0 * 1e12)) hyield.SetLineColor(colors[imult % len(colors)]) hyield.SetMarkerColor(colors[imult % len(colors)]) hyield.SetMarkerStyle(21) hyield.SetMarkerSize(0.8) hyield.Draw("same") - legyieldstring = "%.1f #leq %s < %.1f (MB)" % \ - (binsmin[imult], latexbin2var, binsmax[imult]) + legyieldstring = "%.1f #leq %s < %.1f (MB)" % (binsmin[imult], latexbin2var, binsmax[imult]) legyield.AddEntry(hyield, legyieldstring, "LEP") for imult, iplot in enumerate(plotbinHM): @@ -334,37 +424,43 @@ def plot_hfptspectrum_comb(case, arraytype): continue gPad.SetLogy() hyieldHM = fileres_trig_allperiods.Get("histoSigmaCorr%d" % (imult)) - hyieldHM.Scale(1./(br * sigmav0 * 1e12)) + hyieldHM.Scale(1.0 / (br * sigmav0 * 1e12)) hyieldHM.SetLineColor(colors[imult % len(colors)]) hyieldHM.SetMarkerColor(colors[imult % len(colors)]) hyieldHM.SetMarkerStyle(21) hyieldHM.SetMarkerSize(0.8) hyieldHM.Draw("same") - legyieldstring = "%.1f #leq %s < %.1f (HM)" % \ - (binsmin[imult], latexbin2var, binsmax[imult]) + legyieldstring = "%.1f #leq %s < %.1f (HM)" % (binsmin[imult], latexbin2var, binsmax[imult]) legyield.AddEntry(hyieldHM, legyieldstring, "LEP") legyield.Draw() - ccross.SaveAs("%s/PtSpec_ComparisonCorrYields_%s_%scombined%s.eps" % \ - (folder_plots, case, arraytype[0], arraytype[1])) + ccross.SaveAs( + "%s/PtSpec_ComparisonCorrYields_%s_%scombined%s.eps" % (folder_plots, case, arraytype[0], arraytype[1]) + ) tryunmerged = False else: - print("---Warning: Issue with merged, trying with unmerged files for %s (%s, %s)---" % \ - (case, arraytype[0], arraytype[1])) + print( + "---Warning: Issue with merged, trying with unmerged files for %s (%s, %s)---" + % (case, arraytype[0], arraytype[1]) + ) for imult, iplot in enumerate(plotbinMB): if not iplot: continue if not fileres_MB[imult]: - print("---Warning: Issue with MB file. Eff, FD, CY plot skipped for %s (%s, %s)---" % \ - (case, arraytype[0], arraytype[1])) + print( + "---Warning: Issue with MB file. Eff, FD, CY plot skipped for %s (%s, %s)---" + % (case, arraytype[0], arraytype[1]) + ) return for imult, iplot in enumerate(plotbinHM): if not iplot: continue if not fileres_trig[imult]: - print("---Warning: Issue with HM file. 
Eff, FD, CY plot skipped for %s (%s, %s)---" % \ - (case, arraytype[0], arraytype[1])) + print( + "---Warning: Issue with HM file. Eff, FD, CY plot skipped for %s (%s, %s)---" + % (case, arraytype[0], arraytype[1]) + ) return if tryunmerged is True: @@ -373,14 +469,13 @@ def plot_hfptspectrum_comb(case, arraytype): continue gPad.SetLogy() hyield = fileres_MB[imult].Get("histoSigmaCorr%d" % (imult)) - hyield.Scale(1./(br * sigmav0 * 1e12)) + hyield.Scale(1.0 / (br * sigmav0 * 1e12)) hyield.SetLineColor(colors[imult % len(colors)]) hyield.SetMarkerColor(colors[imult % len(colors)]) hyield.SetMarkerStyle(21) hyield.SetMarkerSize(0.8) hyield.Draw("same") - legyieldstring = "%.1f #leq %s < %.1f (MB)" % \ - (binsmin[imult], latexbin2var, binsmax[imult]) + legyieldstring = "%.1f #leq %s < %.1f (MB)" % (binsmin[imult], latexbin2var, binsmax[imult]) legyield.AddEntry(hyield, legyieldstring, "LEP") for imult, iplot in enumerate(plotbinHM): @@ -388,28 +483,27 @@ def plot_hfptspectrum_comb(case, arraytype): continue gPad.SetLogy() hyieldHM = fileres_trig[imult].Get("histoSigmaCorr%d" % (imult)) - hyieldHM.Scale(1./(br * sigmav0 * 1e12)) + hyieldHM.Scale(1.0 / (br * sigmav0 * 1e12)) hyieldHM.SetLineColor(colors[imult % len(colors)]) hyieldHM.SetMarkerColor(colors[imult % len(colors)]) hyieldHM.SetMarkerStyle(21) hyieldHM.SetMarkerSize(0.8) hyieldHM.Draw("same") - legyieldstring = "%.1f #leq %s < %.1f (HM)" % \ - (binsmin[imult], latexbin2var, binsmax[imult]) + legyieldstring = "%.1f #leq %s < %.1f (HM)" % (binsmin[imult], latexbin2var, binsmax[imult]) legyield.AddEntry(hyieldHM, legyieldstring, "LEP") legyield.Draw() - ccross.SaveAs("%s/PtSpec_ComparisonCorrYields_%s_%scombined%s.eps" % \ - (folder_plots, case, arraytype[0], arraytype[1])) + ccross.SaveAs( + "%s/PtSpec_ComparisonCorrYields_%s_%scombined%s.eps" % (folder_plots, case, arraytype[0], arraytype[1]) + ) - #Efficiency plot - cEff = TCanvas('cEff', '', 800, 400) + # Efficiency plot + cEff = TCanvas("cEff", "", 800, 400) cEff.Divide(2) - cEff.cd(1).DrawFrame(0, 1.e-4, 25, 1., \ - ";#it{p}_{T} (GeV/#it{c});Prompt %s (Acc #times eff)" % name) + cEff.cd(1).DrawFrame(0, 1.0e-4, 25, 1.0, ";#it{p}_{T} (GeV/#it{c});Prompt %s (Acc #times eff)" % name) cEff.cd(1).SetLogy() - legeff = TLegend(.3, .15, .7, .35) + legeff = TLegend(0.3, 0.15, 0.7, 0.35) legeff.SetBorderSize(0) legeff.SetFillColor(0) legeff.SetFillStyle(0) @@ -427,8 +521,7 @@ def plot_hfptspectrum_comb(case, arraytype): hEffpr.SetMarkerStyle(21) hEffpr.SetMarkerSize(0.8) hEffpr.Draw("same") - legeffstring = "%.1f #leq %s < %.1f (MB)" % \ - (binsmin[imult], latexbin2var, binsmax[imult]) + legeffstring = "%.1f #leq %s < %.1f (MB)" % (binsmin[imult], latexbin2var, binsmax[imult]) legeff.AddEntry(hEffpr, legeffstring, "LEP") for imult, iplot in enumerate(plotbinHM): @@ -441,13 +534,11 @@ def plot_hfptspectrum_comb(case, arraytype): hEffprHM.SetMarkerStyle(21) hEffprHM.SetMarkerSize(0.8) hEffprHM.Draw("same") - legeffstring = "%.1f #leq %s < %.1f (HM)" % \ - (binsmin[imult], latexbin2var, binsmax[imult]) + legeffstring = "%.1f #leq %s < %.1f (HM)" % (binsmin[imult], latexbin2var, binsmax[imult]) legeff.AddEntry(hEffprHM, legeffstring, "LEP") legeff.Draw() - cEff.cd(2).DrawFrame(0, 1.e-4, 25, 1., \ - ";#it{p}_{T} (GeV/#it{c});Feed-down %s (Acc #times eff)" % name) + cEff.cd(2).DrawFrame(0, 1.0e-4, 25, 1.0, ";#it{p}_{T} (GeV/#it{c});Feed-down %s (Acc #times eff)" % name) cEff.cd(2).SetLogy() for imult, iplot in enumerate(plotbinMB): @@ -471,14 +562,14 @@ def plot_hfptspectrum_comb(case, arraytype): 
hEfffdHM.SetMarkerStyle(21) hEfffdHM.Draw("same") - cEff.SaveAs("%s/PtSpec_ComparisonEfficiencies_%s_%scombined%s.eps" % \ - (folder_plots, case, arraytype[0], arraytype[1])) + cEff.SaveAs( + "%s/PtSpec_ComparisonEfficiencies_%s_%scombined%s.eps" % (folder_plots, case, arraytype[0], arraytype[1]) + ) - #Efficiency ratio plot - cEffRatio = TCanvas('cEffRatio', '', 800, 400) + # Efficiency ratio plot + cEffRatio = TCanvas("cEffRatio", "", 800, 400) cEffRatio.Divide(2) - cEffRatio.cd(1).DrawFrame(0, 0.5, 25, 1.5, \ - ";#it{p}_{T} (GeV/#it{c});Prompt %s (Acc #times eff) Ratio" % name) + cEffRatio.cd(1).DrawFrame(0, 0.5, 25, 1.5, ";#it{p}_{T} (GeV/#it{c});Prompt %s (Acc #times eff) Ratio" % name) hEffprden = TH1F() if plotbinMB[0] == 1: @@ -518,8 +609,7 @@ def plot_hfptspectrum_comb(case, arraytype): hEffprHM.Draw("same") legeff.Draw() - cEffRatio.cd(2).DrawFrame(0, 0.5, 25, 1.5, \ - ";#it{p}_{T} (GeV/#it{c});Feed-down %s (Acc #times eff) Ratio" % name) + cEffRatio.cd(2).DrawFrame(0, 0.5, 25, 1.5, ";#it{p}_{T} (GeV/#it{c});Feed-down %s (Acc #times eff) Ratio" % name) hEfffdden = TH1F() if plotbinMB[0] == 1: @@ -556,11 +646,12 @@ def plot_hfptspectrum_comb(case, arraytype): hEfffdHM.Divide(hEfffdden) hEfffdHM.Draw("same") - cEffRatio.SaveAs("%s/PtSpec_ComparisonEfficienciesRatio_%s_%scombined%s.eps" % \ - (folder_plots, case, arraytype[0], arraytype[1])) + cEffRatio.SaveAs( + "%s/PtSpec_ComparisonEfficienciesRatio_%s_%scombined%s.eps" % (folder_plots, case, arraytype[0], arraytype[1]) + ) - #fprompt - cfPrompt = TCanvas('cfPrompt', '', 1200, 800) + # fprompt + cfPrompt = TCanvas("cfPrompt", "", 1200, 800) cfPrompt.Divide(3, 2) pt = TLatex() @@ -569,8 +660,7 @@ def plot_hfptspectrum_comb(case, arraytype): for imult, iplot in enumerate(plotbinMB): if not iplot: continue - cfPrompt.cd(imult+1).DrawFrame(0, 0, 25, 1.05, \ - ";#it{p}_{T} (GeV/#it{c});#it{f}_{prompt} %s" % name) + cfPrompt.cd(imult + 1).DrawFrame(0, 0, 25, 1.05, ";#it{p}_{T} (GeV/#it{c});#it{f}_{prompt} %s" % name) grfPrompt = fileres_MB[imult].Get("gFcConservative") grfPrompt.SetTitle(";#it{p}_{T} (GeV/#it{c});#it{f}_{prompt} %s" % name) grfPrompt.SetLineColor(colors[imult % len(colors)]) @@ -578,14 +668,12 @@ def plot_hfptspectrum_comb(case, arraytype): grfPrompt.SetMarkerStyle(21) grfPrompt.SetMarkerSize(0.5) grfPrompt.Draw("ap") - pt.DrawLatexNDC(0.15, 0.15, "%.1f #leq %s < %.1f (MB)" % \ - (binsmin[imult], latexbin2var, binsmax[imult])) + pt.DrawLatexNDC(0.15, 0.15, "%.1f #leq %s < %.1f (MB)" % (binsmin[imult], latexbin2var, binsmax[imult])) for imult, iplot in enumerate(plotbinHM): if not iplot: continue - cfPrompt.cd(imult+1).DrawFrame(0, 0, 25, 1.05, \ - ";#it{p}_{T} (GeV/#it{c});#it{f}_{prompt} %s" % name) + cfPrompt.cd(imult + 1).DrawFrame(0, 0, 25, 1.05, ";#it{p}_{T} (GeV/#it{c});#it{f}_{prompt} %s" % name) grfPromptHM = fileres_trig[imult].Get("gFcConservative") grfPromptHM.SetTitle(";#it{p}_{T} (GeV/#it{c});#it{f}_{prompt} %s" % name) grfPromptHM.SetLineColor(colors[imult % len(colors)]) @@ -593,23 +681,23 @@ def plot_hfptspectrum_comb(case, arraytype): grfPromptHM.SetMarkerStyle(21) grfPromptHM.SetMarkerSize(0.5) grfPromptHM.Draw("ap") - pt.DrawLatexNDC(0.15, 0.15, "%.1f #leq %s < %.1f (HM)" % \ - (binsmin[imult], latexbin2var, binsmax[imult])) + pt.DrawLatexNDC(0.15, 0.15, "%.1f #leq %s < %.1f (HM)" % (binsmin[imult], latexbin2var, binsmax[imult])) + + cfPrompt.SaveAs( + "%s/PtSpec_ComparisonfPrompt_%s_%scombined%s.eps" % (folder_plots, case, arraytype[0], arraytype[1]) + ) - 
cfPrompt.SaveAs("%s/PtSpec_ComparisonfPrompt_%s_%scombined%s.eps" % \ - (folder_plots, case, arraytype[0], arraytype[1])) # pylint: disable=import-error, no-name-in-module, unused-import # pylint: disable=too-many-statements # pylint: disable=too-many-locals def plot_hfptspectrum_ratios_comb(case_num, case_den, arraytype): - load_root_style() - with open("../data/database_ml_parameters_%s.yml" % case_num, 'r') as param_config_num: + with open("../data/database_ml_parameters_%s.yml" % case_num, "r") as param_config_num: data_param_num = yaml.load(param_config_num, Loader=yaml.FullLoader) - with open("../data/database_ml_parameters_%s.yml" % case_den, 'r') as param_config_den: + with open("../data/database_ml_parameters_%s.yml" % case_den, "r") as param_config_den: data_param_den = yaml.load(param_config_den, Loader=yaml.FullLoader) folder_plots_num = data_param_num[case_num]["analysis"]["dir_general_plots"] @@ -621,14 +709,10 @@ def plot_hfptspectrum_ratios_comb(case_num, case_den, arraytype): print("creating folder ", folder_plots_den) os.makedirs(folder_plots_den) - folder_num_allperiods = \ - data_param_num[case_num]["analysis"][arraytype[0]]["data"]["resultsallp"] - folder_den_allperiods = \ - data_param_den[case_den]["analysis"][arraytype[0]]["data"]["resultsallp"] - folder_num_triggered = \ - data_param_num[case_num]["analysis"][arraytype[1]]["data"]["resultsallp"] - folder_den_triggered = \ - data_param_den[case_den]["analysis"][arraytype[1]]["data"]["resultsallp"] + folder_num_allperiods = data_param_num[case_num]["analysis"][arraytype[0]]["data"]["resultsallp"] + folder_den_allperiods = data_param_den[case_den]["analysis"][arraytype[0]]["data"]["resultsallp"] + folder_num_triggered = data_param_num[case_num]["analysis"][arraytype[1]]["data"]["resultsallp"] + folder_den_triggered = data_param_den[case_den]["analysis"][arraytype[1]]["data"]["resultsallp"] binsmin_num = data_param_num[case_num]["analysis"][arraytype[0]]["sel_binmin2"] binsmax_num = data_param_num[case_num]["analysis"][arraytype[0]]["sel_binmax2"] @@ -642,133 +726,147 @@ def plot_hfptspectrum_ratios_comb(case_num, case_den, arraytype): sigmav0_num = data_param_num[case_num]["analysis"]["sigmav0"] sigmav0_den = data_param_den[case_den]["analysis"]["sigmav0"] - file_num_allperiods = TFile.Open("%s/finalcross%s%smulttot.root" % \ - (folder_num_allperiods, case_num, arraytype[0])) - file_den_allperiods = TFile.Open("%s/finalcross%s%smulttot.root" % \ - (folder_den_allperiods, case_den, arraytype[0])) - file_num_triggered = TFile.Open("%s/finalcross%s%smulttot.root" % \ - (folder_num_triggered, case_num, arraytype[1])) - file_den_triggered = TFile.Open("%s/finalcross%s%smulttot.root" % \ - (folder_den_triggered, case_den, arraytype[1])) + file_num_allperiods = TFile.Open("%s/finalcross%s%smulttot.root" % (folder_num_allperiods, case_num, arraytype[0])) + file_den_allperiods = TFile.Open("%s/finalcross%s%smulttot.root" % (folder_den_allperiods, case_den, arraytype[0])) + file_num_triggered = TFile.Open("%s/finalcross%s%smulttot.root" % (folder_num_triggered, case_num, arraytype[1])) + file_den_triggered = TFile.Open("%s/finalcross%s%smulttot.root" % (folder_den_triggered, case_den, arraytype[1])) if not file_num_allperiods or not file_num_triggered: - print("---Warning: Issue with %s merged files. Meson ratio plot skipped (%s, %s)---" % \ - (case_num, arraytype[0], arraytype[1])) + print( + "---Warning: Issue with %s merged files. 
Meson ratio plot skipped (%s, %s)---" + % (case_num, arraytype[0], arraytype[1]) + ) return if not file_den_allperiods or not file_den_triggered: - print("---Warning: Issue with %s merged files. Meson ratio plot skipped (%s, %s)---" % \ - (case_den, arraytype[0], arraytype[1])) + print( + "---Warning: Issue with %s merged files. Meson ratio plot skipped (%s, %s)---" + % (case_den, arraytype[0], arraytype[1]) + ) return - rootfilename = "%s/ComparisonRatios_%s%s_%scombined%s.root" % \ - (folder_plots_num, case_num, case_den, arraytype[0], arraytype[1]) + rootfilename = "%s/ComparisonRatios_%s%s_%scombined%s.root" % ( + folder_plots_num, + case_num, + case_den, + arraytype[0], + arraytype[1], + ) fileoutput = TFile.Open(rootfilename, "recreate") - ccross = TCanvas('cRatioCross', 'The Fit Canvas') + ccross = TCanvas("cRatioCross", "The Fit Canvas") ccross.SetCanvasSize(1500, 1500) ccross.SetWindowSize(500, 500) maxplot = 1.0 if case_num == "Dspp": maxplot = 0.5 - ccross.cd(1).DrawFrame(0.9, 0, 30, maxplot, ";#it{p}_{T} (GeV/#it{c});%s / %s" % \ - (name_num, name_den)) + ccross.cd(1).DrawFrame(0.9, 0, 30, maxplot, ";#it{p}_{T} (GeV/#it{c});%s / %s" % (name_num, name_den)) ccross.cd(1).SetLogx() - legyield = TLegend(.4, .68, .8, .88) + legyield = TLegend(0.4, 0.68, 0.8, 0.88) legyield.SetBorderSize(0) legyield.SetFillColor(0) legyield.SetFillStyle(0) legyield.SetTextFont(42) legyield.SetTextSize(0.025) - colors = [kBlack, kRed, kGreen+2, kBlue, kViolet-1, kOrange+2, kAzure+1, kOrange-7] + colors = [kBlack, kRed, kGreen + 2, kBlue, kViolet - 1, kOrange + 2, kAzure + 1, kOrange - 7] for imult, iplot in enumerate(plotbinMB): if not iplot: continue hratio = file_num_allperiods.Get("histoSigmaCorr%d" % (imult)) - hratio.Scale(1./(br_num * sigmav0_num * 1e12)) + hratio.Scale(1.0 / (br_num * sigmav0_num * 1e12)) hcross_den = file_den_allperiods.Get("histoSigmaCorr%d" % (imult)) - hcross_den.Scale(1./(br_den * sigmav0_den * 1e12)) + hcross_den.Scale(1.0 / (br_den * sigmav0_den * 1e12)) hratio.Divide(hcross_den) hratio.SetLineColor(colors[imult % len(colors)]) hratio.SetMarkerColor(colors[imult % len(colors)]) hratio.SetMarkerStyle(21) hratio.SetTitle(";#it{p}_{T} (GeV/#it{c});%s / %s" % (name_num, name_den)) hratio.Draw("same") - legyieldstring = "%.1f #leq %s < %.1f (MB)" % \ - (binsmin_num[imult], latexbin2var, binsmax_num[imult]) + legyieldstring = "%.1f #leq %s < %.1f (MB)" % (binsmin_num[imult], latexbin2var, binsmax_num[imult]) legyield.AddEntry(hratio, legyieldstring, "LEP") fileoutput.cd() - hratio.Write("hratio_fromMB_%.1f_%s_%.1f" % \ - (binsmin_num[imult], latexbin2var, binsmax_num[imult])) + hratio.Write("hratio_fromMB_%.1f_%s_%.1f" % (binsmin_num[imult], latexbin2var, binsmax_num[imult])) for imult, iplot in enumerate(plotbinHM): if not iplot: continue hratioHM = file_num_triggered.Get("histoSigmaCorr%d" % (imult)) - hratioHM.Scale(1./(br_num * sigmav0_num * 1e12)) + hratioHM.Scale(1.0 / (br_num * sigmav0_num * 1e12)) hcrossHM_den = file_den_triggered.Get("histoSigmaCorr%d" % (imult)) - hcrossHM_den.Scale(1./(br_den * sigmav0_den * 1e12)) + hcrossHM_den.Scale(1.0 / (br_den * sigmav0_den * 1e12)) hratioHM.Divide(hcrossHM_den) hratioHM.SetLineColor(colors[imult % len(colors)]) hratioHM.SetMarkerColor(colors[imult % len(colors)]) hratioHM.SetTitle(";#it{p}_{T} (GeV/#it{c});%s / %s" % (name_num, name_den)) hratioHM.Draw("same") - legyieldstring = "%.1f #leq %s < %.1f (HM)" % \ - (binsmin_num[imult], latexbin2var, binsmax_num[imult]) + legyieldstring = "%.1f #leq %s < %.1f (HM)" % 
(binsmin_num[imult], latexbin2var, binsmax_num[imult]) legyield.AddEntry(hratioHM, legyieldstring, "LEP") fileoutput.cd() - hratioHM.Write("hratio_fromHM_%.1f_%s_%.1f" % \ - (binsmin_num[imult], latexbin2var, binsmax_num[imult])) + hratioHM.Write("hratio_fromHM_%.1f_%s_%.1f" % (binsmin_num[imult], latexbin2var, binsmax_num[imult])) legyield.Draw() - ccross.SaveAs("%s/PtSpec_ComparisonRatios_%s%s_%scombined%s_logx.eps" % \ - (folder_plots_num, case_num, case_den, arraytype[0], arraytype[1])) - ccross.SaveAs("%s/PtSpec_ComparisonRatios_%s%s_%scombined%s_logx.eps" % \ - (folder_plots_den, case_num, case_den, arraytype[0], arraytype[1])) + ccross.SaveAs( + "%s/PtSpec_ComparisonRatios_%s%s_%scombined%s_logx.eps" + % (folder_plots_num, case_num, case_den, arraytype[0], arraytype[1]) + ) + ccross.SaveAs( + "%s/PtSpec_ComparisonRatios_%s%s_%scombined%s_logx.eps" + % (folder_plots_den, case_num, case_den, arraytype[0], arraytype[1]) + ) ccross.cd(1).SetLogx(0) - ccross.SaveAs("%s/PtSpec_ComparisonRatios_%s%s_%scombined%s.eps" % \ - (folder_plots_num, case_num, case_den, arraytype[0], arraytype[1])) - ccross.SaveAs("%s/PtSpec_ComparisonRatios_%s%s_%scombined%s.eps" % \ - (folder_plots_den, case_num, case_den, arraytype[0], arraytype[1])) + ccross.SaveAs( + "%s/PtSpec_ComparisonRatios_%s%s_%scombined%s.eps" + % (folder_plots_num, case_num, case_den, arraytype[0], arraytype[1]) + ) + ccross.SaveAs( + "%s/PtSpec_ComparisonRatios_%s%s_%scombined%s.eps" + % (folder_plots_den, case_num, case_den, arraytype[0], arraytype[1]) + ) fileoutput.cd() ccross.Write() fileoutput.Close() - rootfilenameden = "%s/ComparisonRatios_%s%s_%scombined%s.root" % \ - (folder_plots_den, case_num, case_den, arraytype[0], arraytype[1]) + rootfilenameden = "%s/ComparisonRatios_%s%s_%scombined%s.root" % ( + folder_plots_den, + case_num, + case_den, + arraytype[0], + arraytype[1], + ) copyfile(rootfilename, rootfilenameden) print("---Output stored in:", rootfilename, "and", rootfilenameden, "---") + ##################################### gROOT.SetBatch(True) -#EXAMPLE HOW TO USE plot_hfptspectrum_comb +# EXAMPLE HOW TO USE plot_hfptspectrum_comb # ---> Combines and plots the output of HFPtSpectrum in nice way -#plot_hfptspectrum_comb("Dspp", ["MBvspt_ntrkl", "SPDvspt"]) +# plot_hfptspectrum_comb("Dspp", ["MBvspt_ntrkl", "SPDvspt"]) -#EXAMPLE HOW TO USE plot_hfptspectrum_ratios_comb +# EXAMPLE HOW TO USE plot_hfptspectrum_ratios_comb # ---> Combines and plots particle-ratio both with MLHEP -#plot_hfptspectrum_ratios_comb("Dspp", "D0pp", ["MBvspt_ntrkl", "SPDvspt"]) +# plot_hfptspectrum_ratios_comb("Dspp", "D0pp", ["MBvspt_ntrkl", "SPDvspt"]) -#EXAMPLES HOW TO USE plot_hfptspectrum_ml_over_std +# EXAMPLES HOW TO USE plot_hfptspectrum_ml_over_std # ---> Plots particle-ratio with MLHEP and inputfile from STD analyses -#plot_hfptspectrum_ml_over_std("Dspp", "MBvspt_ntrkl", -1, +# plot_hfptspectrum_ml_over_std("Dspp", "MBvspt_ntrkl", -1, # "data/std_results/HFPtSpectrum_D0_merged_20191010.root", # "D0", 2.27 / 3.89, None, 0, ["histoSigmaCorr"], ["histoSigmaCorr"]) -#plot_hfptspectrum_ml_over_std("Dspp", "MBvspt_ntrkl", 0, +# plot_hfptspectrum_ml_over_std("Dspp", "MBvspt_ntrkl", 0, # "data/std_results/HFPtSpectrum_D0_2016_prel_5tev_20191015.root", # "D0", 2.27 / 3.89, # [(1, [1]), (2, [2, 3]), (3, [4, 5]), (4, [6]), (5, [7]), (6, [8])], # 0, ["histoSigmaCorr"], ["histoSigmaCorr"], # "_prelim_5tev") -#EXAMPLES HOW TO USE compare_ml_std_ratio +# EXAMPLES HOW TO USE compare_ml_std_ratio # ---> Not sure what this does, to be checked 
-#compare_ml_std_ratio("Dspp", "D0pp", "MBvspt_ntrkl", -1, +# compare_ml_std_ratio("Dspp", "D0pp", "MBvspt_ntrkl", -1, # "data/std_results/HFPtSpectrum_Ds_merged_20191010.root", # "data/std_results/HFPtSpectrum_D0_merged_20191010.root", None, None, None, # 0, ["histoSigmaCorr"], ["hCrossSectionStatisticError"], ["histoSigmaCorr"]) diff --git a/machine_learning_hep/examples/plot_hfptspectrum_years.py b/machine_learning_hep/examples/plot_hfptspectrum_years.py index 582d854ecf..d5bd9faa87 100644 --- a/machine_learning_hep/examples/plot_hfptspectrum_years.py +++ b/machine_learning_hep/examples/plot_hfptspectrum_years.py @@ -15,24 +15,26 @@ """ main script for doing final stage analysis """ + import os + # pylint: disable=import-error, no-name-in-module, unused-import import yaml -from ROOT import gROOT, TFile +from ROOT import TFile, gROOT + from machine_learning_hep.utilities_plot import plot_histograms FILES_NOT_FOUND = [] + # pylint: disable=import-error, no-name-in-module, unused-import # pylint: disable=too-many-statements # pylint: disable=too-many-branches, too-many-locals def plot_hfspectrum_years_ratios(case_1, case_2, ana_type, mult_bins=None): - - - with open("../data/database_ml_parameters_%s.yml" % case_1, 'r') as param_config: + with open("../data/database_ml_parameters_%s.yml" % case_1, "r") as param_config: data_param_1 = yaml.load(param_config, Loader=yaml.FullLoader) - with open("../data/database_ml_parameters_%s.yml" % case_2, 'r') as param_config: + with open("../data/database_ml_parameters_%s.yml" % case_2, "r") as param_config: data_param_2 = yaml.load(param_config, Loader=yaml.FullLoader) folder_plots_1 = data_param_1[case_1]["analysis"]["dir_general_plots"] @@ -48,17 +50,22 @@ def plot_hfspectrum_years_ratios(case_1, case_2, ana_type, mult_bins=None): use_period = data_param_1[case_1]["analysis"][ana_type]["useperiod"] latexbin2var = data_param_1[case_1]["analysis"][ana_type]["latexbin2var"] - result_paths_1 = [data_param_1[case_1]["analysis"][ana_type]["data"]["results"][i] \ - for i in range(len(use_period)) if use_period[i]] + result_paths_1 = [ + data_param_1[case_1]["analysis"][ana_type]["data"]["results"][i] + for i in range(len(use_period)) + if use_period[i] + ] result_paths_1.insert(0, data_param_1[case_1]["analysis"][ana_type]["data"]["resultsallp"]) - result_paths_2 = [data_param_2[case_2]["analysis"][ana_type]["data"]["results"][i] \ - for i in range(len(use_period)) if use_period[i]] + result_paths_2 = [ + data_param_2[case_2]["analysis"][ana_type]["data"]["results"][i] + for i in range(len(use_period)) + if use_period[i] + ] result_paths_2.insert(0, data_param_2[case_2]["analysis"][ana_type]["data"]["resultsallp"]) # Assume same in all particle cases - periods = [data_param_1[case_1]["multi"]["data"]["period"][i] \ - for i in range(len(use_period)) if use_period[i]] + periods = [data_param_1[case_1]["multi"]["data"]["period"][i] for i in range(len(use_period)) if use_period[i]] periods.insert(0, "merged") binsmin = data_param_1[case_1]["analysis"][ana_type]["sel_binmin2"] @@ -67,10 +74,10 @@ def plot_hfspectrum_years_ratios(case_1, case_2, ana_type, mult_bins=None): name_1 = data_param_1[case_1]["analysis"][ana_type]["latexnamehadron"] name_2 = data_param_2[case_2]["analysis"][ana_type]["latexnamehadron"] - #br_1 = data_param_1[case_1]["ml"]["opt"]["BR"] - #br_2 = data_param_2[case_2]["ml"]["opt"]["BR"] - #sigmav0_1 = data_param_1[case_1]["analysis"]["sigmav0"] - #sigmav0_2 = data_param_2[case_2]["analysis"]["sigmav0"] + # br_1 = 
data_param_1[case_1]["ml"]["opt"]["BR"] + # br_2 = data_param_2[case_2]["ml"]["opt"]["BR"] + # sigmav0_1 = data_param_1[case_1]["analysis"]["sigmav0"] + # sigmav0_2 = data_param_2[case_2]["analysis"]["sigmav0"] if mult_bins is None: mult_bins = range(len(binsmin)) @@ -101,14 +108,13 @@ def plot_hfspectrum_years_ratios(case_1, case_2, ana_type, mult_bins=None): hyield_1.SetDirectory(0) hyield_2 = file_2.Get("histoSigmaCorr") hyield_2.SetDirectory(0) - #hyield_1.Scale(1./(br_1 * sigmav0_1 * 1e12)) - #hyield_2.Scale(1./(br_2 * sigmav0_2 * 1e12)) + # hyield_1.Scale(1./(br_1 * sigmav0_1 * 1e12)) + # hyield_2.Scale(1./(br_2 * sigmav0_2 * 1e12)) hyield_ratio = hyield_1.Clone(f"{case_1}_{case_2}_ratio_{period}_{imult}") hyield_ratio.Divide(hyield_2) histos.append(hyield_ratio) - l_string = f"{binsmin[imult]:.1f} #leq {latexbin2var} < {binsmax[imult]:.1f} "\ - f"({ana_type}), {period}" + l_string = f"{binsmin[imult]:.1f} #leq {latexbin2var} < {binsmax[imult]:.1f} ({ana_type}), {period}" legend_titles.append(l_string) if not histos: @@ -121,15 +127,24 @@ def plot_hfspectrum_years_ratios(case_1, case_2, ana_type, mult_bins=None): save_path = f"{sub_folder}/{histos[0].GetName()}_combined_{periods_string}_{imult}.eps" y_label = f"{histos[0].GetYaxis().GetTitle()} {name_1} / {name_2}" - plot_histograms(histos, True, True, legend_titles, histos[0].GetTitle(), - "#it{p}_{T} (GeV/#it{c})", y_label, "year / merged", save_path) + plot_histograms( + histos, + True, + True, + legend_titles, + histos[0].GetTitle(), + "#it{p}_{T} (GeV/#it{c})", + y_label, + "year / merged", + save_path, + ) + # pylint: disable=import-error, no-name-in-module, unused-import # pylint: disable=too-many-statements # pylint: disable=too-many-branches, too-many-locals def plot_hfspectrum_years(case, ana_type, mult_bins=None): - - with open("../data/database_ml_parameters_%s.yml" % case, 'r') as param_config: + with open("../data/database_ml_parameters_%s.yml" % case, "r") as param_config: data_param = yaml.load(param_config, Loader=yaml.FullLoader) folder_plots = data_param[case]["analysis"]["dir_general_plots"] @@ -139,10 +154,10 @@ def plot_hfspectrum_years(case, ana_type, mult_bins=None): os.makedirs(folder_plots) use_period = data_param[case]["analysis"][ana_type]["useperiod"] - result_paths = [data_param[case]["analysis"][ana_type]["data"]["results"][i] \ - for i in range(len(use_period)) if use_period[i]] - periods = [data_param[case]["multi"]["data"]["period"][i] \ - for i in range(len(use_period)) if use_period[i]] + result_paths = [ + data_param[case]["analysis"][ana_type]["data"]["results"][i] for i in range(len(use_period)) if use_period[i] + ] + periods = [data_param[case]["multi"]["data"]["period"][i] for i in range(len(use_period)) if use_period[i]] result_paths.insert(0, data_param[case]["analysis"][ana_type]["data"]["resultsallp"]) periods.insert(0, "merged") @@ -151,8 +166,8 @@ def plot_hfspectrum_years(case, ana_type, mult_bins=None): binsmax = data_param[case]["analysis"][ana_type]["sel_binmax2"] name = data_param[case]["analysis"][ana_type]["latexnamehadron"] latexbin2var = data_param[case]["analysis"][ana_type]["latexbin2var"] - #br = data_param[case]["ml"]["opt"]["BR"] - #sigmav0 = data_param[case]["analysis"]["sigmav0"] + # br = data_param[case]["ml"]["opt"]["BR"] + # sigmav0 = data_param[case]["analysis"]["sigmav0"] if mult_bins is None: mult_bins = range(len(binsmin)) @@ -171,21 +186,31 @@ def plot_hfspectrum_years(case, ana_type, mult_bins=None): print("################") print(f"case {case} in analysis 
{ana_type}") - histo_names = ["hDirectMCpt", "hFeedDownMCpt", "hDirectMCptMax", "hDirectMCptMin", - "hFeedDownMCptMax", "hFeedDownMCptMin", "hDirectEffpt", "hFeedDownEffpt", - "hRECpt", "histoYieldCorr", "histoYieldCorrMax", "histoYieldCorrMin", - "histoSigmaCorr", "histoSigmaCorrMax", "histoSigmaCorrMin"] + histo_names = [ + "hDirectMCpt", + "hFeedDownMCpt", + "hDirectMCptMax", + "hDirectMCptMin", + "hFeedDownMCptMax", + "hFeedDownMCptMin", + "hDirectEffpt", + "hFeedDownEffpt", + "hRECpt", + "histoYieldCorr", + "histoYieldCorrMax", + "histoYieldCorrMin", + "histoSigmaCorr", + "histoSigmaCorrMax", + "histoSigmaCorrMin", + ] periods_string = "_".join(periods) for hn in histo_names: - for imult in mult_bins: - histos = [] legend_titles = [] for period, path in zip(periods, files_mult[imult]): - print(f"Mult {imult}, period {period}") print(f"In file {path}") @@ -194,11 +219,12 @@ def plot_hfspectrum_years(case, ana_type, mult_bins=None): h.SetDirectory(0) histos.append(h) comment = "" - if histos[-1].Integral() <= 0. or histos[-1].GetEntries() == 0: + if histos[-1].Integral() <= 0.0 or histos[-1].GetEntries() == 0: print(f"Empty period {period}, {case}, {ana_type}, mult {imult}") comment = "(empty)" - l_string = f"{binsmin[imult]:.1f} #leq {latexbin2var} < {binsmax[imult]:.1f} "\ - f"({ana_type}), {period} {comment}" + l_string = ( + f"{binsmin[imult]:.1f} #leq {latexbin2var} < {binsmax[imult]:.1f} ({ana_type}), {period} {comment}" + ) legend_titles.append(l_string) if not histos: @@ -210,17 +236,27 @@ def plot_hfspectrum_years(case, ana_type, mult_bins=None): save_path = f"{sub_folder}/{hn}_combined_{periods_string}_{imult}.eps" label_y = f"{histos[0].GetYaxis().GetTitle()} {name}" - plot_histograms(histos, True, True, legend_titles, histos[0].GetTitle(), - "#it{p}_{T} (GeV/#it{c})", label_y, "year / merged", save_path) + plot_histograms( + histos, + True, + True, + legend_titles, + histos[0].GetTitle(), + "#it{p}_{T} (GeV/#it{c})", + label_y, + "year / merged", + save_path, + ) + ##################################### gROOT.SetBatch(True) -#EXAMPLE HOW TO USE plot_hfptspectrum_years +# EXAMPLE HOW TO USE plot_hfptspectrum_years # ---> Makes comparison plots+ratios (for whatever histogram) between different years/periods. 
-#plot_hfspectrum_years("Dspp", "MBvspt_ntrkl") -#plot_hfspectrum_years_ratios("Dspp", "D0pp", "MBvspt_ntrkl") +# plot_hfspectrum_years("Dspp", "MBvspt_ntrkl") +# plot_hfspectrum_years_ratios("Dspp", "D0pp", "MBvspt_ntrkl") if FILES_NOT_FOUND: print("FILES NOT FOUND:") diff --git a/machine_learning_hep/fitting/fitters.py b/machine_learning_hep/fitting/fitters.py index e189af0ff0..7536065a78 100644 --- a/machine_learning_hep/fitting/fitters.py +++ b/machine_learning_hep/fitting/fitters.py @@ -19,32 +19,50 @@ # pylint: disable=too-many-lines -from copy import deepcopy from array import array -from math import sqrt +from copy import deepcopy from ctypes import c_double +from math import sqrt # pylint: disable=import-error, no-name-in-module, unused-import, f-string-without-interpolation try: - from ROOT import AliHFInvMassFitter, AliVertexingHFUtils, AliHFInvMassMultiTrialFit + from ROOT import AliHFInvMassFitter, AliHFInvMassMultiTrialFit, AliVertexingHFUtils except ImportError: pass -from ROOT import TFile, TH1F, TH1D, TF1, TPaveText, TLine, TLegend, TLatex -from ROOT import kBlue, kRed, kGreen, kMagenta, kOrange, kPink, kCyan, kYellow, kBlack +from ROOT import ( + TF1, + TH1D, + TH1F, + TFile, + TLatex, + TLegend, + TLine, + TPaveText, + kBlack, + kBlue, + kCyan, + kGreen, + kMagenta, + kOrange, + kPink, + kRed, + kYellow, +) -from machine_learning_hep.logger import get_logger from machine_learning_hep.fitting.utils import construct_rebinning +from machine_learning_hep.logger import get_logger # single or double Gaussian TYPE_GAUSS_1 = "kGaus" TYPE_GAUSS_2 = "k2Gaus" -class FitBase: # pylint: disable=too-many-instance-attributes + +class FitBase: # pylint: disable=too-many-instance-attributes """ Common base class for FitAliHF and FitROOT. """ - def __init__(self, init_pars, **kwargs): # pylint: disable=unused-argument + def __init__(self, init_pars, **kwargs): # pylint: disable=unused-argument self.logger = get_logger() # If nit/fitting attempt was made self.has_attempt = False @@ -54,23 +72,24 @@ def __init__(self, init_pars, **kwargs): # pylint: disable=unused-argument self.user_init_pars = deepcopy(init_pars) self.init_pars = None # Default init parameters (to be modified for deriving classes) - self.default_init_pars = {"mean": None, - "fix_mean": False, - "sigma": None, - "fix_sigma": False, - "rebin": None, - "fit_range_low": None, - "fit_range_up": None, - "likelihood": True, - "n_sigma_sideband": None, - "sig_func_name": None, - "bkg_func_name": None} + self.default_init_pars = { + "mean": None, + "fix_mean": False, + "sigma": None, + "fix_sigma": False, + "rebin": None, + "fit_range_low": None, + "fit_range_up": None, + "likelihood": True, + "n_sigma_sideband": None, + "sig_func_name": None, + "bkg_func_name": None, + } # Fitted parameters (to be modified for deriving classes) self.fit_pars = {} # The fit kernel self.kernel = None - def make_default_init_pars(self): """ Small wrapper for constructing default inititalisation parameters @@ -80,7 +99,6 @@ def make_default_init_pars(self): return deepcopy(self.default_init_pars) - def get_fit_pars(self): """ Small wrapper providing deep copy of fit parameters @@ -95,7 +113,6 @@ def override_init_pars(self, **init_pars): if par in self.user_init_pars: self.user_init_pars[par] = val - def init_fit(self): """ Common initialisation steps @@ -109,21 +126,20 @@ def init_fit(self): self.init_pars = self.make_default_init_pars() # Collect key which haven't changed - #pars_not_changed = [] + # pars_not_changed = [] for k in 
list(self.init_pars.keys()):
             if k in self.user_init_pars:
                 self.init_pars[k] = self.user_init_pars.pop(k)
                 # continue
             # pars_not_changed.append(k)
 
-        #self.logger.debug("Following default parameters are used")
-        #for p in pars_not_changed:
-            #print(p)
+        # self.logger.debug("Following default parameters are used")
+        # for p in pars_not_changed:
+        # print(p)
 
         if self.success:
             return True
 
         return self.init_kernel()
 
-
     def init_kernel(self):
         """
         Initialize the fit kernel. To be overwritten by the deriving class
@@ -132,7 +148,6 @@ def init_kernel(self):
         self.logger.debug("Init kernel")
         return True
 
-
     def fit_kernel(self):
         """
         Fit the fit kernel. To be overwritten by the deriving class
@@ -141,13 +156,11 @@ def fit_kernel(self):
         self.logger.debug("Fit kernel")
         return True
 
-
     def set_fit_pars(self):
         """
         Set final fitted parameters. To be overwritten by the deriving class
         """
 
-
     def fit(self):
         """
         Initialize and fit. This is common and not to be overwritten by a deriving class
@@ -162,7 +175,6 @@ def fit(self):
             self.set_fit_pars()
         self.has_attempt = True
 
-
     def draw(self, root_pad, **draw_args):
         """
         Draw this fit. This is common and not to be overwritten by a deriving class. Arguments
@@ -189,8 +201,7 @@ def draw(self, root_pad, **draw_args):
 
         self.draw_kernel(root_pad, **draw_args)
 
-
-    def draw_kernel(self, root_pad, root_objects=[], **draw_args): # pylint: disable=unused-argument, dangerous-default-value
+    def draw_kernel(self, root_pad, root_objects=[], **draw_args):  # pylint: disable=unused-argument, dangerous-default-value
         """
         Draw method specific to the used kernel. To be overwritten by the deriving class
         Args:
@@ -202,7 +213,6 @@ def draw_kernel(self, root_pad, root_objects=[], **draw_args): # pylint: disable
 
         self.logger.debug("Draw kernel")
 
-
     @staticmethod
     def add_text_helper_(pave, line, color=None):
         """
@@ -234,18 +244,15 @@ def add_pave_helper_(x_min, y_min, x_max, y_max, opt="NDC"):
         pave = TPaveText(x_min, y_min, x_max, y_max, opt)
         pave.SetBorderSize(0)
         pave.SetFillStyle(0)
-        pave.SetMargin(0.)
+ pave.SetMargin(0.0) return pave -class FitROOT(FitBase): # pylint: enable=too-many-instance-attributes - - +class FitROOT(FitBase): # pylint: enable=too-many-instance-attributes def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.root_objects = None - def set_root_objects(self, root_objects): self.root_objects = root_objects self.update_root_objects() @@ -253,12 +260,13 @@ def set_root_objects(self, root_objects): def update_root_objects(self): pass - def __str__(self): - string = f"--------------------------------\n" \ - f"Class: {self.__class__.__name__}\n" \ - f"Kernel: {self.kernel.__class__.__name__}, {self.kernel}\n" \ - f"Init parameters:\n" + string = ( + f"--------------------------------\n" + f"Class: {self.__class__.__name__}\n" + f"Kernel: {self.kernel.__class__.__name__}, {self.kernel}\n" + f"Init parameters:\n" + ) string += str(self.init_pars) string += "\nROOT objects\n" for name, obj in self.root_objects.items(): @@ -271,6 +279,7 @@ class FitAliHF(FitROOT): """ Class with AliHFMassFitter as core fitting utility """ + def __init__(self, *args, histo=None, histo_mc=None, histo_reflections=None, **base_args): super().__init__(*args, **base_args) self.histo = histo @@ -278,34 +287,34 @@ def __init__(self, *args, histo=None, histo_mc=None, histo_reflections=None, **b self.histo_reflections = histo_reflections # AliHF fitter - self.default_init_pars = {"mean": None, - "fix_mean": False, - "sigma": None, - "fix_sigma": False, - "include_sec_peak": False, - "sec_mean": None, - "fix_sec_mean": False, - "sec_sigma": None, - "fix_sec_sigma": False, - "use_sec_peak_rel_sigma": True, - "include_reflections": False, - "fix_reflections_s_over_b": True, - "rebin": None, - "fit_range_low": None, - "fit_range_up": None, - "likelihood": True, - "n_sigma_sideband": None, - "rel_sigma_bound": None, - "sig_func_name": None, - "bkg_func_name": None} + self.default_init_pars = { + "mean": None, + "fix_mean": False, + "sigma": None, + "fix_sigma": False, + "include_sec_peak": False, + "sec_mean": None, + "fix_sec_mean": False, + "sec_sigma": None, + "fix_sec_sigma": False, + "use_sec_peak_rel_sigma": True, + "include_reflections": False, + "fix_reflections_s_over_b": True, + "rebin": None, + "fit_range_low": None, + "fit_range_up": None, + "likelihood": True, + "n_sigma_sideband": None, + "rel_sigma_bound": None, + "sig_func_name": None, + "bkg_func_name": None, + } # Fitted parameters (to be modified for deriving classes) # Only those corresponding to init parameters are here. Specific parameters/values # provided by the kernel have to be extracted from that directly. 
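+        # (raw yield, background and significance are not cached here; they are
+        # read off the AliHFInvMassFitter kernel directly, as done in draw_kernel)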
- self.fit_pars = {"mean": None, - "sigma": None} + self.fit_pars = {"mean": None, "sigma": None} self.update_root_objects() - def update_root_objects(self): if self.root_objects is None: self.root_objects = {} @@ -317,9 +326,7 @@ def update_root_objects(self): self.root_objects["histo_mc"] = self.histo_mc self.root_objects["histo_reflections"] = self.histo_reflections - def init_kernel(self): - self.update_root_objects() rebin = construct_rebinning(self.histo, self.init_pars["rebin"]) @@ -332,12 +339,13 @@ def init_kernel(self): else: self.histo = self.histo.Clone(f"{self.histo.GetName()}_fit_histo") - - self.kernel = AliHFInvMassFitter(self.histo, - self.init_pars["fit_range_low"], - self.init_pars["fit_range_up"], - self.init_pars["bkg_func_name"], - self.init_pars["sig_func_name"]) + self.kernel = AliHFInvMassFitter( + self.histo, + self.init_pars["fit_range_low"], + self.init_pars["fit_range_up"], + self.init_pars["bkg_func_name"], + self.init_pars["sig_func_name"], + ) self.kernel.SetCheckSignalCountsAfterFirstFit(False) if self.init_pars["likelihood"]: self.kernel.SetUseLikelihoodFit() @@ -351,47 +359,50 @@ def init_kernel(self): self.kernel.SetFixGaussianSigma(self.init_pars["sigma"]) if self.init_pars["include_reflections"]: - - self.histo_reflections = AliVertexingHFUtils.AdaptTemplateRangeAndBinning( \ - self.histo_reflections, self.histo, self.init_pars["fit_range_low"], - self.init_pars["fit_range_up"]) - self.histo_mc = AliVertexingHFUtils.AdaptTemplateRangeAndBinning( \ - self.histo_mc, self.histo, self.init_pars["fit_range_low"], - self.init_pars["fit_range_up"]) + self.histo_reflections = AliVertexingHFUtils.AdaptTemplateRangeAndBinning( + self.histo_reflections, self.histo, self.init_pars["fit_range_low"], self.init_pars["fit_range_up"] + ) + self.histo_mc = AliVertexingHFUtils.AdaptTemplateRangeAndBinning( + self.histo_mc, self.histo, self.init_pars["fit_range_low"], self.init_pars["fit_range_up"] + ) self.histo_mc.SetName(f"{self.histo_mc.GetName()}_fit_histo") self.histo_reflections.SetName(f"{self.histo_reflections.GetName()}_fit_histo") if self.init_pars["fix_reflections_s_over_b"]: r_over_s = self.histo_mc.Integral( self.histo_mc.FindBin(self.init_pars["fit_range_low"] * 1.0001), - self.histo_mc.FindBin(self.init_pars["fit_range_up"] * 0.999)) - if r_over_s > 0.: - r_over_s = self.histo_reflections.Integral( - self.histo_reflections.FindBin(self.init_pars["fit_range_low"] * 1.0001), - self.histo_reflections.FindBin(self.init_pars["fit_range_up"] * 0.999)) \ - / r_over_s + self.histo_mc.FindBin(self.init_pars["fit_range_up"] * 0.999), + ) + if r_over_s > 0.0: + r_over_s = ( + self.histo_reflections.Integral( + self.histo_reflections.FindBin(self.init_pars["fit_range_low"] * 1.0001), + self.histo_reflections.FindBin(self.init_pars["fit_range_up"] * 0.999), + ) + / r_over_s + ) self.kernel.SetFixReflOverS(r_over_s) if self.histo_reflections.Integral() > 0: - self.kernel.SetTemplateReflections(self.histo_reflections, "1gaus", - self.init_pars["fit_range_low"], - self.init_pars["fit_range_up"]) + self.kernel.SetTemplateReflections( + self.histo_reflections, "1gaus", self.init_pars["fit_range_low"], self.init_pars["fit_range_up"] + ) if self.init_pars["include_sec_peak"]: - sec_sigma = self.init_pars["sigma"] * self.init_pars["sec_sigma"] \ - if self.init_pars["use_sec_peak_rel_sigma"] \ - else self.init_pars["sec_sigma"] - self.kernel.IncludeSecondGausPeak(self.init_pars["sec_mean"], - self.init_pars["fix_sec_mean"], - sec_sigma, - self.init_pars["fix_sec_sigma"]) + 
sec_sigma = ( + self.init_pars["sigma"] * self.init_pars["sec_sigma"] + if self.init_pars["use_sec_peak_rel_sigma"] + else self.init_pars["sec_sigma"] + ) + self.kernel.IncludeSecondGausPeak( + self.init_pars["sec_mean"], self.init_pars["fix_sec_mean"], sec_sigma, self.init_pars["fix_sec_sigma"] + ) return True - def fit_kernel(self): success = self.kernel.MassFitter(False) if success: - if self.kernel.GetRawYield() < 0.: + if self.kernel.GetRawYield() < 0.0: return False if self.init_pars["rel_sigma_bound"]: fit_sigma = self.kernel.GetSigma() @@ -400,24 +411,21 @@ def fit_kernel(self): return min_sigma < fit_sigma < max_sigma return success - def set_fit_pars(self): self.fit_pars["mean"] = self.kernel.GetMean() self.fit_pars["sigma"] = self.kernel.GetSigma() - - def draw_kernel(self, root_pad, root_objects=[], **draw_args): # pylint: disable=too-many-locals, too-many-statements, dangerous-default-value - + def draw_kernel(self, root_pad, root_objects=[], **draw_args): # pylint: disable=too-many-locals, too-many-statements, dangerous-default-value n_sigma_signal = draw_args.pop("sigma_signal", 3) mean_dim = draw_args.pop("mean_dim", "GeV/#it{c}^{2}") - mean_scale = draw_args.pop("mean_scale", 1.) + mean_scale = draw_args.pop("mean_scale", 1.0) sigma_dim = draw_args.pop("sigma_dim", "MeV/#it{c}^{2}") - sigma_scale = draw_args.pop("sigma_scale", 1000.) + sigma_scale = draw_args.pop("sigma_scale", 1000.0) title = draw_args.pop("title", "") x_axis_label = draw_args.pop("x_axis_label", "#it{M}_{inv} (GeV/#it{c}^{2})") - y_axis_label = draw_args.pop("y_axis_label", - f"Entries/({self.histo.GetBinWidth(1) * 1000:.0f} " \ - "MeV/#it{c}^{2})") + y_axis_label = draw_args.pop( + "y_axis_label", f"Entries/({self.histo.GetBinWidth(1) * 1000:.0f} MeV/#it{{c}}^{{2}})" + ) add_root_objects = draw_args.pop("add_root_objects", None) @@ -451,14 +459,12 @@ def draw_kernel(self, root_pad, root_objects=[], **draw_args): # pylint: disable # Could either be None or a nullptr draw_objects.append(refl_func) draw_options.append("") - sec_peak_func = self.kernel.GetSecondPeakFunc() \ - if self.init_pars["include_sec_peak"] else None + sec_peak_func = self.kernel.GetSecondPeakFunc() if self.init_pars["include_sec_peak"] else None if sec_peak_func: # Could either be None or a nullptr draw_objects.append(sec_peak_func) draw_options.append("") - y_plot_max = self.histo.GetMaximum() y_plot_min = self.histo.GetMinimum() for i in range(1, self.histo.GetNbinsX() + 1): @@ -480,16 +486,17 @@ def draw_kernel(self, root_pad, root_objects=[], **draw_args): # pylint: disable y_max = y_plot_max + y_rel_header_range * y_full_range root_pad.SetLeftMargin(0.12) - frame = root_pad.cd().DrawFrame(self.init_pars["fit_range_low"], y_min, - self.init_pars["fit_range_up"], y_max, - f"{title} ; " \ - f"{x_axis_label} ; " \ - f"{y_axis_label}") + frame = root_pad.cd().DrawFrame( + self.init_pars["fit_range_low"], + y_min, + self.init_pars["fit_range_up"], + y_max, + f"{title} ; {x_axis_label} ; {y_axis_label}", + ) frame.GetYaxis().SetTitleOffset(1.7) frame.GetYaxis().SetMaxDigits(4) - sig = self.kernel.GetRawYield() sig_err = self.kernel.GetRawYieldError() bkg = c_double() @@ -502,33 +509,34 @@ def draw_kernel(self, root_pad, root_objects=[], **draw_args): # pylint: disable bkg_err = bkg_err.value signif = signif.value signif_err = signif_err.value - sig_o_bkg = sig / bkg if bkg > 0. else -1. 
+ sig_o_bkg = sig / bkg if bkg > 0.0 else -1.0 root_objects.append(self.add_pave_helper_(0.15, 0.7, 0.48, 0.89, "NDC")) self.add_text_helper_(root_objects[-1], f"S = {sig:.0f} #pm {sig_err:.0f}") - self.add_text_helper_(root_objects[-1], - f"B({n_sigma_signal}#sigma) = {bkg:.0f} " \ - f"#pm {bkg_err:.0f}") + self.add_text_helper_(root_objects[-1], f"B({n_sigma_signal}#sigma) = {bkg:.0f} #pm {bkg_err:.0f}") self.add_text_helper_(root_objects[-1], f"S/B({n_sigma_signal}#sigma) = {sig_o_bkg:.4f}") - self.add_text_helper_(root_objects[-1], - f"Signif({n_sigma_signal}#sigma) = " \ - f"{signif:.1f} #pm {signif_err:.1f}") + self.add_text_helper_(root_objects[-1], f"Signif({n_sigma_signal}#sigma) = {signif:.1f} #pm {signif_err:.1f}") root_objects[-1].Draw() root_objects.append(self.add_pave_helper_(0.55, 0.75, 0.89, 0.89, "NDC")) - self.add_text_helper_(root_objects[-1], - f"#chi/ndf = {self.kernel.GetReducedChiSquare():.4f}", color_sig) - self.add_text_helper_(root_objects[-1], - f"#mu = {self.kernel.GetMean()*mean_scale:.4f} " \ - f"#pm " \ - f"{self.kernel.GetMeanUncertainty()*mean_scale:.4f} " \ - f"{mean_dim}", color_sig) - self.add_text_helper_(root_objects[-1], - f"#sigma = " \ - f"{self.kernel.GetSigma()*sigma_scale:.4f} " \ - f"#pm " \ - f"{self.kernel.GetSigmaUncertainty()*sigma_scale:.4f} " \ - f"{sigma_dim}", color_sig) + self.add_text_helper_(root_objects[-1], f"#chi/ndf = {self.kernel.GetReducedChiSquare():.4f}", color_sig) + self.add_text_helper_( + root_objects[-1], + f"#mu = {self.kernel.GetMean() * mean_scale:.4f} " + f"#pm " + f"{self.kernel.GetMeanUncertainty() * mean_scale:.4f} " + f"{mean_dim}", + color_sig, + ) + self.add_text_helper_( + root_objects[-1], + f"#sigma = " + f"{self.kernel.GetSigma() * sigma_scale:.4f} " + f"#pm " + f"{self.kernel.GetSigmaUncertainty() * sigma_scale:.4f} " + f"{sigma_dim}", + color_sig, + ) root_objects[-1].Draw() x_min_add = 0.45 @@ -538,22 +546,23 @@ def draw_kernel(self, root_pad, root_objects=[], **draw_args): # pylint: disable sec_peak_func.SetLineColor(color_sec_peak) sec_mean = sec_peak_func.GetParameter(1) sec_sigma = sec_peak_func.GetParameter(2) - root_objects.append(self.add_pave_helper_(x_min_add, y_min_tmp, 0.89, - y_min_tmp + y_delta, "NDC")) - self.add_text_helper_(root_objects[-1], f"#mu_{{sec}} = {sec_mean*mean_scale:.4f} " \ - f"{mean_dim}, #sigma_{{sec}} = " \ - f"{sec_sigma*sigma_scale:.4f} " \ - f"{sigma_dim}", color_sec_peak) + root_objects.append(self.add_pave_helper_(x_min_add, y_min_tmp, 0.89, y_min_tmp + y_delta, "NDC")) + self.add_text_helper_( + root_objects[-1], + f"#mu_{{sec}} = {sec_mean * mean_scale:.4f} " + f"{mean_dim}, #sigma_{{sec}} = " + f"{sec_sigma * sigma_scale:.4f} " + f"{sigma_dim}", + color_sec_peak, + ) root_objects[-1].Draw() y_min_tmp += y_delta if refl_func: refl_func.SetLineColor(color_refl) refl = self.kernel.GetReflOverSig() refl_err = self.kernel.GetReflOverSigUncertainty() - root_objects.append(self.add_pave_helper_(x_min_add, y_min_tmp, 0.89, - y_min_tmp + y_delta, "NDC")) - self.add_text_helper_(root_objects[-1], f"Refl/S = {refl:.4f} #pm {refl_err:.4f}", - color_refl) + root_objects.append(self.add_pave_helper_(x_min_add, y_min_tmp, 0.89, y_min_tmp + y_delta, "NDC")) + self.add_text_helper_(root_objects[-1], f"Refl/S = {refl:.4f} #pm {refl_err:.4f}", color_refl) root_objects[-1].Draw() y_min_tmp += y_delta @@ -566,32 +575,31 @@ def draw_kernel(self, root_pad, root_objects=[], **draw_args): # pylint: disable aro.Draw("same") -class FitROOTGauss(FitROOT): # pylint: 
disable=too-many-instance-attributes +class FitROOTGauss(FitROOT): # pylint: disable=too-many-instance-attributes """ Class with specific ROOT TF1 as core fitting utility """ - def __init__(self, pars, histo=None, type_gauss=TYPE_GAUSS_1, - **base_args): + def __init__(self, pars, histo=None, type_gauss=TYPE_GAUSS_1, **base_args): super().__init__(pars, **base_args) self.histo = histo self.type_gauss = type_gauss - self.default_init_pars = {"rebin": None, - "use_user_fit_range": False, - "fit_range_low": None, - "fit_range_up": None, - "n_rms_fix": None, - "n_rms_start": 3., - "n_rms_stepping": 0.10, - "n_rms_steps": 20, - "likelihood": False} + self.default_init_pars = { + "rebin": None, + "use_user_fit_range": False, + "fit_range_low": None, + "fit_range_up": None, + "n_rms_fix": None, + "n_rms_start": 3.0, + "n_rms_stepping": 0.10, + "n_rms_steps": 20, + "likelihood": False, + } # Fitted parameters (to be modified for deriving classes) # Only those corresponding to init parameters are here. Specific parameters/values # provided by the kernel have to be extracted from that directly. - self.fit_pars = {"mean": None, - "sigma": None, - "second_sigma": None} + self.fit_pars = {"mean": None, "sigma": None, "second_sigma": None} # Fit range finally used for MC fit self.fit_range_low = None @@ -606,9 +614,7 @@ def update_root_objects(self): self.histo = self.root_objects.get("histo", self.histo) self.root_objects["histo"] = self.histo - def init_kernel(self): - self.update_root_objects() rebin = construct_rebinning(self.histo, self.init_pars["rebin"]) @@ -624,16 +630,13 @@ def init_kernel(self): return True - def __fit_kernel(self, mean_init, sigma_init, int_init, fit_range_low, fit_range_up): - - func_string = "[0]/TMath::Sqrt(2.*TMath::Pi())/[2]*TMath::Exp(-(x-[1])*(x-[1])/2./[2]/[2])" if self.type_gauss == TYPE_GAUSS_2: - func_string = "(1.-[3])/TMath::Sqrt(2.*TMath::Pi()) / " \ - "[2]*TMath::Exp(-(x-[1])*(x-[1])/2./[2]/[2])" - func_string = f"[0] * ({func_string} + " \ - "[3]/TMath::Sqrt(2.*TMath::Pi())/[4]*TMath::Exp(-(x-[1])*(x-[1])/2./[4]/[4]))" + func_string = "(1.-[3])/TMath::Sqrt(2.*TMath::Pi()) / [2]*TMath::Exp(-(x-[1])*(x-[1])/2./[2]/[2])" + func_string = ( + f"[0] * ({func_string} + [3]/TMath::Sqrt(2.*TMath::Pi())/[4]*TMath::Exp(-(x-[1])*(x-[1])/2./[4]/[4]))" + ) fit_func = TF1("fit_func", func_string, fit_range_low, fit_range_up) @@ -641,7 +644,6 @@ def __fit_kernel(self, mean_init, sigma_init, int_init, fit_range_low, fit_range fit_func.SetParameter(1, mean_init) fit_func.SetParameter(2, sigma_init) - if self.type_gauss == TYPE_GAUSS_2: fit_func.SetParameter(3, 0.5) # That's a guess... @@ -656,45 +658,59 @@ def __fit_kernel(self, mean_init, sigma_init, int_init, fit_range_low, fit_range mean_fit = fit_func.GetParameter(1) sigma_fit = abs(fit_func.GetParameter(2)) chi2ndf = fit_func.GetNDF() - chi2ndf = fit_func.GetChisquare() / chi2ndf if chi2ndf > 0. else 0. + chi2ndf = fit_func.GetChisquare() / chi2ndf if chi2ndf > 0.0 else 0.0 success = True - if int_fit * sigma_fit < 0. 
\ - or mean_init - sigma_init > mean_fit or mean_fit > mean_init + sigma_init \ - or 1.1 * sigma_init < sigma_fit or chi2ndf <= 0.: + if ( + int_fit * sigma_fit < 0.0 + or mean_init - sigma_init > mean_fit + or mean_fit > mean_init + sigma_init + or 1.1 * sigma_init < sigma_fit + or chi2ndf <= 0.0 + ): success = False return fit_func, success - def fit_kernel(self): guess_mean = self.histo.GetMean() guess_sigma = self.histo.GetRMS() if self.init_pars["use_user_fit_range"] and self.type_gauss == TYPE_GAUSS_1: - guess_int = self.histo.Integral(self.histo.FindBin(self.init_pars["fit_range_low"]), - self.histo.FindBin(self.init_pars["fit_range_up"]), - "width") - self.kernel, success = self.__fit_kernel(guess_mean, guess_sigma, guess_int, - self.init_pars["fit_range_low"], - self.init_pars["fit_range_up"]) + guess_int = self.histo.Integral( + self.histo.FindBin(self.init_pars["fit_range_low"]), + self.histo.FindBin(self.init_pars["fit_range_up"]), + "width", + ) + self.kernel, success = self.__fit_kernel( + guess_mean, guess_sigma, guess_int, self.init_pars["fit_range_low"], self.init_pars["fit_range_up"] + ) self.fit_range_low = self.init_pars["fit_range_low"] self.fit_range_up = self.init_pars["fit_range_up"] return success - for r in [self.init_pars["n_rms_start"] + i * self.init_pars["n_rms_stepping"] \ - for i in range(self.init_pars["n_rms_steps"])]: - guess_fit_range_low = guess_mean - r * guess_sigma \ - if self.type_gauss == TYPE_GAUSS_1 else self.init_pars["fit_range_low"] - guess_fit_range_up = guess_mean + r * guess_sigma \ - if self.type_gauss == TYPE_GAUSS_1 else self.init_pars["fit_range_up"] - guess_sigma_tmp = guess_sigma if guess_sigma else 1. - guess_int = self.histo.Integral(self.histo.FindBin(guess_fit_range_low), - self.histo.FindBin(guess_fit_range_up), - "width") / guess_sigma_tmp / 2.5 - self.kernel, success = self.__fit_kernel(guess_mean, guess_sigma, guess_int, - guess_fit_range_low, guess_fit_range_up) + for r in [ + self.init_pars["n_rms_start"] + i * self.init_pars["n_rms_stepping"] + for i in range(self.init_pars["n_rms_steps"]) + ]: + guess_fit_range_low = ( + guess_mean - r * guess_sigma if self.type_gauss == TYPE_GAUSS_1 else self.init_pars["fit_range_low"] + ) + guess_fit_range_up = ( + guess_mean + r * guess_sigma if self.type_gauss == TYPE_GAUSS_1 else self.init_pars["fit_range_up"] + ) + guess_sigma_tmp = guess_sigma if guess_sigma else 1.0 + guess_int = ( + self.histo.Integral( + self.histo.FindBin(guess_fit_range_low), self.histo.FindBin(guess_fit_range_up), "width" + ) + / guess_sigma_tmp + / 2.5 + ) + self.kernel, success = self.__fit_kernel( + guess_mean, guess_sigma, guess_int, guess_fit_range_low, guess_fit_range_up + ) # Save used fit range self.fit_range_low = guess_fit_range_low self.fit_range_up = guess_fit_range_up @@ -702,8 +718,7 @@ def fit_kernel(self): # Require at least 5 points in fit range # Do this here to have at least a kernel which could be drawn later - if self.histo.FindBin(guess_fit_range_up) - \ - self.histo.FindBin(guess_fit_range_low) < 5: + if self.histo.FindBin(guess_fit_range_up) - self.histo.FindBin(guess_fit_range_low) < 5: continue if success: @@ -711,25 +726,22 @@ def fit_kernel(self): return False - def set_fit_pars(self): self.fit_pars["mean"] = self.kernel.GetParameter(1) self.fit_pars["sigma"] = self.kernel.GetParameter(2) if self.type_gauss == TYPE_GAUSS_2: self.fit_pars["second_sigma"] = self.kernel.GetParameter(4) - - def draw_kernel(self, root_pad, root_objects=[], **draw_args): # pylint: 
disable=too-many-statements, dangerous-default-value - + def draw_kernel(self, root_pad, root_objects=[], **draw_args): # pylint: disable=too-many-statements, dangerous-default-value title = draw_args.pop("title", "") x_axis_label = draw_args.pop("x_axis_label", "#it{M}_{inv} (GeV/#it{c}^{2})") - y_axis_label = draw_args.pop("y_axis_label", - f"Entries/({self.histo.GetBinWidth(1) * 1000:.0f} " \ - "MeV/#it{c}^{2})") + y_axis_label = draw_args.pop( + "y_axis_label", f"Entries/({self.histo.GetBinWidth(1) * 1000:.0f} MeV/#it{{c}}^{{2}})" + ) mean_dim = draw_args.pop("mean_dim", "GeV/#it{c}^{2}") - mean_scale = draw_args.pop("mean_scale", 1.) + mean_scale = draw_args.pop("mean_scale", 1.0) sigma_dim = draw_args.pop("sigma_dim", "MeV/#it{c}^{2}") - sigma_scale = draw_args.pop("sigma_scale", 1000.) + sigma_scale = draw_args.pop("sigma_scale", 1000.0) add_root_objects = draw_args.pop("add_root_objects", None) @@ -755,8 +767,7 @@ def draw_kernel(self, root_pad, root_objects=[], **draw_args): # pylint: disable self.kernel.SetLineColor(color_sig) root_pad.SetLeftMargin(0.12) - frame = root_pad.cd().DrawFrame(x_min, y_min, x_max, y_max, - f"{title} ; {x_axis_label} ; {y_axis_label}") + frame = root_pad.cd().DrawFrame(x_min, y_min, x_max, y_max, f"{title} ; {x_axis_label} ; {y_axis_label}") frame.GetYaxis().SetTitleOffset(1.7) frame.GetYaxis().SetMaxDigits(4) @@ -769,52 +780,41 @@ def draw_kernel(self, root_pad, root_objects=[], **draw_args): # pylint: disable self.histo.GetYaxis().SetTitle(y_axis_label) red_chisqu = self.kernel.GetNDF() - red_chisqu = self.kernel.GetChisquare() / red_chisqu if red_chisqu > 0. else 0. + red_chisqu = self.kernel.GetChisquare() / red_chisqu if red_chisqu > 0.0 else 0.0 mean = self.kernel.GetParameter(1) * mean_scale mean_err = self.kernel.GetParError(1) * mean_scale sigma = self.kernel.GetParameter(2) * sigma_scale sigma_err = self.kernel.GetParError(2) * sigma_scale root_objects.append(self.add_pave_helper_(0.55, 0.7, 0.89, 0.89, "NDC")) - self.add_text_helper_(root_objects[-1], - f"mean_{{histo}} = {self.histo.GetMean() * mean_scale:.4f}", - color_histo) - self.add_text_helper_(root_objects[-1], - f"RMS_{{histo}} = {self.histo.GetRMS() * sigma_scale:.4f}", - color_histo) - self.add_text_helper_(root_objects[-1], - f"fit range [{self.fit_range_low:.3f}, {self.fit_range_up:.3f}]", - color_histo) + self.add_text_helper_( + root_objects[-1], f"mean_{{histo}} = {self.histo.GetMean() * mean_scale:.4f}", color_histo + ) + self.add_text_helper_(root_objects[-1], f"RMS_{{histo}} = {self.histo.GetRMS() * sigma_scale:.4f}", color_histo) + self.add_text_helper_( + root_objects[-1], f"fit range [{self.fit_range_low:.3f}, {self.fit_range_up:.3f}]", color_histo + ) if not self.init_pars["use_user_fit_range"] and self.type_gauss == TYPE_GAUSS_1: - self.add_text_helper_(root_objects[-1], - f"(corr. to {self.n_rms} #times RMS_{{histo}})", - color_histo) + self.add_text_helper_(root_objects[-1], f"(corr. 
to {self.n_rms} #times RMS_{{histo}})", color_histo) else: - self.add_text_helper_(root_objects[-1], - " ", - color_histo) + self.add_text_helper_(root_objects[-1], " ", color_histo) root_objects[-1].Draw() root_objects.append(self.add_pave_helper_(0.2, 0.7, 0.59, 0.89, "NDC")) - self.add_text_helper_(root_objects[-1], - f"#mu = {mean:.4f} #pm {mean_err:.4f} {mean_dim}", color_sig) - self.add_text_helper_(root_objects[-1], - f"#sigma = {sigma:.4f} #pm {sigma_err:.4f} {sigma_dim}", color_sig) + self.add_text_helper_(root_objects[-1], f"#mu = {mean:.4f} #pm {mean_err:.4f} {mean_dim}", color_sig) + self.add_text_helper_(root_objects[-1], f"#sigma = {sigma:.4f} #pm {sigma_err:.4f} {sigma_dim}", color_sig) if self.type_gauss == TYPE_GAUSS_2: # quote second sigma sigma = abs(self.kernel.GetParameter(4) * sigma_scale) sigma_err = self.kernel.GetParError(4) * sigma_scale - self.add_text_helper_(root_objects[-1], - f"#sigma_{{2}} = {sigma:.4f} #pm {sigma_err:.4f} {sigma_dim}", - color_sig) + self.add_text_helper_( + root_objects[-1], f"#sigma_{{2}} = {sigma:.4f} #pm {sigma_err:.4f} {sigma_dim}", color_sig + ) else: - self.add_text_helper_(root_objects[-1], - " ", color_sig) - self.add_text_helper_(root_objects[-1], - f"#chi/ndf = {red_chisqu:.4f}", color_sig) + self.add_text_helper_(root_objects[-1], " ", color_sig) + self.add_text_helper_(root_objects[-1], f"#chi/ndf = {red_chisqu:.4f}", color_sig) root_objects[-1].Draw() - for dob in draw_objects: dob.Draw("same") @@ -823,7 +823,8 @@ def draw_kernel(self, root_pad, root_objects=[], **draw_args): # pylint: disable root_objects.append(aro) aro.Draw("same") -class FitSystAliHF(FitROOT): # pylint: disable=too-many-instance-attributes + +class FitSystAliHF(FitROOT): # pylint: disable=too-many-instance-attributes """ Class with AliHFMassFitter as core fitting utility """ @@ -834,37 +835,39 @@ def __init__(self, *args, histo=None, histo_mc=None, histo_reflections=None, **b self.histo_mc = histo_mc self.histo_reflections = histo_reflections - self.default_init_pars = {"mean": None, - "sigma": None, - "second_sigma": None, - "include_sec_peak": False, - "sec_mean": None, - "fix_sec_mean": False, - "sec_sigma": None, - "fix_sec_sigma": False, - "use_sec_peak_rel_sigma": True, - "include_reflections": False, - "fix_reflections_s_over_b": True, - "mean_ref": None, - "sigma_ref": None, - "yield_ref": None, - "chi2_ref": None, - "signif_ref": None, - "rebin": None, - "fit_range_low": None, - "fit_range_up": None, - "likelihood": True, - "n_sigma_sideband": None, - "fit_range_low_syst": None, - "fit_range_up_syst": None, - "bin_count_sigma_syst": None, - "bkg_func_names_syst": None, - "rebin_syst": None, - "consider_free_sigma_syst": None, - "rel_var_sigma_up_syst": None, - "rel_var_sigma_down_syst": None, - "signif_min_syst": None, - "chi2_max_syst": None} + self.default_init_pars = { + "mean": None, + "sigma": None, + "second_sigma": None, + "include_sec_peak": False, + "sec_mean": None, + "fix_sec_mean": False, + "sec_sigma": None, + "fix_sec_sigma": False, + "use_sec_peak_rel_sigma": True, + "include_reflections": False, + "fix_reflections_s_over_b": True, + "mean_ref": None, + "sigma_ref": None, + "yield_ref": None, + "chi2_ref": None, + "signif_ref": None, + "rebin": None, + "fit_range_low": None, + "fit_range_up": None, + "likelihood": True, + "n_sigma_sideband": None, + "fit_range_low_syst": None, + "fit_range_up_syst": None, + "bin_count_sigma_syst": None, + "bkg_func_names_syst": None, + "rebin_syst": None, + "consider_free_sigma_syst": None, + 
"rel_var_sigma_up_syst": None, + "rel_var_sigma_down_syst": None, + "signif_min_syst": None, + "chi2_max_syst": None, + } # Fitted parameters (to be modified for deriving classes) # Only those corresponding to init parameters are here. Specific parameters/values # provided by the kernel have to be extracted from that directly. @@ -872,7 +875,6 @@ def __init__(self, *args, histo=None, histo_mc=None, histo_reflections=None, **b self.results_path = base_args.get("results_path", None) self.update_root_objects() - def update_root_objects(self): if self.root_objects is None: self.root_objects = {} @@ -884,9 +886,7 @@ def update_root_objects(self): self.root_objects["histo_mc"] = self.histo_mc self.root_objects["histo_reflections"] = self.histo_reflections - def init_kernel(self): - self.update_root_objects() self.histo = self.histo.Clone(f"{self.histo.GetName()}_fit_histo") @@ -910,42 +910,37 @@ def init_kernel(self): self.kernel.SetUsePowerLawTimesExpoBackground(False) # Relative sigma variation wrt nominal - rel_sigma_up = self.init_pars["rel_var_sigma_up_syst"] \ - if self.init_pars["rel_var_sigma_up_syst"] else 0 - rel_sigma_down = self.init_pars["rel_var_sigma_down_syst"] \ - if self.init_pars["rel_var_sigma_down_syst"] else 0 + rel_sigma_up = self.init_pars["rel_var_sigma_up_syst"] if self.init_pars["rel_var_sigma_up_syst"] else 0 + rel_sigma_down = self.init_pars["rel_var_sigma_down_syst"] if self.init_pars["rel_var_sigma_down_syst"] else 0 self.kernel.SetSigmaMCVariation(rel_sigma_up, rel_sigma_down) rebin = construct_rebinning(self.histo, self.init_pars["rebin"]) if rebin: - rebin_steps = [rebin + rel_rb \ - if rebin + rel_rb > 0 \ - else 1 for rel_rb in self.init_pars["rebin_syst"]] + rebin_steps = [rebin + rel_rb if rebin + rel_rb > 0 else 1 for rel_rb in self.init_pars["rebin_syst"]] # To only have unique values and we don't care about the order we can just do rebin_steps = array("i", list(set(rebin_steps))) self.kernel.ConfigureRebinSteps(len(rebin_steps), rebin_steps) if self.init_pars["fit_range_low_syst"]: low_lim_steps = array("d", self.init_pars["fit_range_low_syst"]) - self.kernel.ConfigureLowLimFitSteps(len(self.init_pars["fit_range_low_syst"]), - low_lim_steps) + self.kernel.ConfigureLowLimFitSteps(len(self.init_pars["fit_range_low_syst"]), low_lim_steps) if self.init_pars["fit_range_up_syst"]: up_lim_steps = array("d", self.init_pars["fit_range_up_syst"]) - self.kernel.ConfigureUpLimFitSteps(len(self.init_pars["fit_range_up_syst"]), - up_lim_steps) + self.kernel.ConfigureUpLimFitSteps(len(self.init_pars["fit_range_up_syst"]), up_lim_steps) if self.init_pars["bin_count_sigma_syst"]: - self.kernel.ConfigurenSigmaBinCSteps(len(self.init_pars["bin_count_sigma_syst"]), - array("d", self.init_pars["bin_count_sigma_syst"])) + self.kernel.ConfigurenSigmaBinCSteps( + len(self.init_pars["bin_count_sigma_syst"]), array("d", self.init_pars["bin_count_sigma_syst"]) + ) - if self.init_pars["include_reflections"] and self.histo_reflections.Integral() <= 0.: + if self.init_pars["include_reflections"] and self.histo_reflections.Integral() <= 0.0: self.logger.warning("Reflection requested but template is empty") elif self.init_pars["include_reflections"]: self.histo_reflections = AliVertexingHFUtils.AdaptTemplateRangeAndBinning( - self.histo_reflections, self.histo, - self.init_pars["fit_range_low"], self.init_pars["fit_range_up"]) + self.histo_reflections, self.histo, self.init_pars["fit_range_low"], self.init_pars["fit_range_up"] + ) self.histo_mc = 
AliVertexingHFUtils.AdaptTemplateRangeAndBinning( - self.histo_mc, self.histo, - self.init_pars["fit_range_low"], self.init_pars["fit_range_up"]) + self.histo_mc, self.histo, self.init_pars["fit_range_low"], self.init_pars["fit_range_up"] + ) self.kernel.SetTemplatesForReflections(self.histo_reflections, self.histo_mc) if not self.init_pars["fix_reflections_s_over_b"]: @@ -954,19 +949,18 @@ def init_kernel(self): self.kernel.SetFixRefoS(-1) if self.init_pars["include_sec_peak"]: - #p_widthsecpeak to be fixed - sec_sigma = self.init_pars["sigma"] * self.init_pars["sec_sigma"] \ - if self.init_pars["use_sec_peak_rel_sigma"] \ - else self.init_pars["sec_sigma"] - self.kernel.IncludeSecondGausPeak(self.init_pars["sec_mean"], - self.init_pars["fix_sec_mean"], - sec_sigma, - self.init_pars["fix_sec_sigma"]) + # p_widthsecpeak to be fixed + sec_sigma = ( + self.init_pars["sigma"] * self.init_pars["sec_sigma"] + if self.init_pars["use_sec_peak_rel_sigma"] + else self.init_pars["sec_sigma"] + ) + self.kernel.IncludeSecondGausPeak( + self.init_pars["sec_mean"], self.init_pars["fix_sec_mean"], sec_sigma, self.init_pars["fix_sec_sigma"] + ) return True - def fit_kernel(self): - histo_double = TH1D() self.histo.Copy(histo_double) success = self.kernel.DoMultiTrials(histo_double) @@ -974,15 +968,12 @@ def fit_kernel(self): self.kernel.SaveToRoot(self.results_path) return success - def set_fit_pars(self): pass - #self.fit_pars["mean"] = self.kernel.GetMean() - #self.fit_pars["sigma"] = self.kernel.GetSigma() - - - def draw_kernel(self, root_pad, root_objects=[], **draw_args): #pylint: disable=dangerous-default-value, too-many-branches, too-many-statements, too-many-locals + # self.fit_pars["mean"] = self.kernel.GetMean() + # self.fit_pars["sigma"] = self.kernel.GetSigma() + def draw_kernel(self, root_pad, root_objects=[], **draw_args): # pylint: disable=dangerous-default-value, too-many-branches, too-many-statements, too-many-locals if not self.results_path: self.logger.warning("Don't have a result file so cannot draw. Skip...") return @@ -990,16 +981,20 @@ def draw_kernel(self, root_pad, root_objects=[], **draw_args): #pylint: disable= title = draw_args.pop("title", "") # Which background functions are used? 
- used_bkgs = array("b", ["kExpo" in self.init_pars["bkg_func_names_syst"], - "kLin" in self.init_pars["bkg_func_names_syst"], - "Pol2" in self.init_pars["bkg_func_names_syst"], - "Pol3" in self.init_pars["bkg_func_names_syst"], - "Pol4" in self.init_pars["bkg_func_names_syst"], - "Pol5" in self.init_pars["bkg_func_names_syst"]]) + used_bkgs = array( + "b", + [ + "kExpo" in self.init_pars["bkg_func_names_syst"], + "kLin" in self.init_pars["bkg_func_names_syst"], + "Pol2" in self.init_pars["bkg_func_names_syst"], + "Pol3" in self.init_pars["bkg_func_names_syst"], + "Pol4" in self.init_pars["bkg_func_names_syst"], + "Pol5" in self.init_pars["bkg_func_names_syst"], + ], + ) # Number of bin count variations - n_bins_bincount = len(self.init_pars["bin_count_sigma_syst"]) \ - if self.init_pars["bin_count_sigma_syst"] else 0 + n_bins_bincount = len(self.init_pars["bin_count_sigma_syst"]) if self.init_pars["bin_count_sigma_syst"] else 0 # The following is just crazy @@ -1022,12 +1017,14 @@ def draw_kernel(self, root_pad, root_objects=[], **draw_args): #pylint: disable= min_bc_range = 1 max_bc_range = n_bins_bincount n_bc_ranges = n_bins_bincount - conf_case = ["FixedSigFreeMean", - "FixedSigUpFreeMean", - "FixedSigDwFreeMean", - "FreeSigFreeMean", - "FreeSigFixedMean", - "FixedSigFixedMean"] + conf_case = [ + "FixedSigFreeMean", + "FixedSigUpFreeMean", + "FixedSigDwFreeMean", + "FreeSigFreeMean", + "FreeSigFixedMean", + "FixedSigFixedMean", + ] # Names of background functions used internally bkg_func = ["Expo", "Lin", "Pol2", "Pol3", "Pol4", "Pol5"] @@ -1035,27 +1032,26 @@ def draw_kernel(self, root_pad, root_objects=[], **draw_args): #pylint: disable= tot_cases = n_config_cases * n_back_func_cases # Mask to flag what's en/disabled # 0 => not used; 1 => used for fit; 2 => used also for bin count - mask = [0] * tot_cases #0,0,0,0,0,0, // fixed sigma, free mean (Expo, Lin, Pol2,Pol3,Pol4) - #0,0,0,0,0,0, // fixed sigma upper - #0,0,0,0,0,0, // fixed sigma lower - #0,0,0,0,0,0, // free sigma, free mean - #0,0,0,0,0,0, // free sigma, fixed mean - #0,0,0,0,0,0, // fixed mean, fixed sigma + mask = [0] * tot_cases # 0,0,0,0,0,0, // fixed sigma, free mean (Expo, Lin, Pol2,Pol3,Pol4) + # 0,0,0,0,0,0, // fixed sigma upper + # 0,0,0,0,0,0, // fixed sigma lower + # 0,0,0,0,0,0, // free sigma, free mean + # 0,0,0,0,0,0, // free sigma, fixed mean + # 0,0,0,0,0,0, // fixed mean, fixed sigma # Enable only the background cases we ran the multi trial with plot_case = 2 if max_bc_range >= min_bc_range else 1 for i in range(6): if used_bkgs[i] > 0: mask[i] = plot_case - mask[30+i] = plot_case + mask[30 + i] = plot_case if self.init_pars["consider_free_sigma_syst"]: - mask[18+i] = plot_case - mask[24+i] = plot_case + mask[18 + i] = plot_case + mask[24 + i] = plot_case if self.init_pars["rel_var_sigma_up_syst"]: - - mask[6+i] = plot_case + mask[6 + i] = plot_case if self.init_pars["rel_var_sigma_down_syst"]: - mask[12+i] = plot_case + mask[12 + i] = plot_case # Extract histograms from file histo6 = [None] * tot_cases @@ -1069,7 +1065,6 @@ def draw_kernel(self, root_pad, root_objects=[], **draw_args): #pylint: disable= mask[kjh] = 0 kjh += 1 - # Prepare variables for counting tot_trials = 0 successful_trials = 0 @@ -1082,8 +1077,7 @@ def draw_kernel(self, root_pad, root_objects=[], **draw_args): #pylint: disable= last_bc0 = [0] * tot_cases first_bc1 = [0] * tot_cases last_bc1 = [0] * tot_cases - #tlabels = [None] * (tot_cases+1) - + # tlabels = [None] * (tot_cases+1) for nc in range(tot_cases): if not mask[nc]: @@ 
-1095,23 +1089,23 @@ def draw_kernel(self, root_pad, root_objects=[], **draw_args): #pylint: disable= tot_histos += 1 # This we might include later - #ttt = histo6[nc].GetName() - #ttt = ttt.replace("hRawYieldTrial", "") - #if "FixedMean" in ttt: + # ttt = histo6[nc].GetName() + # ttt = ttt.replace("hRawYieldTrial", "") + # if "FixedMean" in ttt: # ttt = "Fix #mu" - #elif "FixedSp20" in ttt: + # elif "FixedSp20" in ttt: # ttt = "#sigma+" - #elif "fixedSm20" in ttt: + # elif "fixedSm20" in ttt: # ttt = "#sigma-" - #elif "FreeS" in ttt: + # elif "FreeS" in ttt: # ttt = "Free #sigma" - #ttt = ttt.replace("FixedS", "") - #if bkg_treat and bkg_treat in ttt: + # ttt = ttt.replace("FixedS", "") + # if bkg_treat and bkg_treat in ttt: # ttt = ttt.replace(bkg_treat, "") - #tlabels[nc] = TLatex(first[nc] + 0.02 * tot_trials, 10, ttt) - #tlabels[nc].SetTextColor(kMagenta+2) - #tlabels[nc].SetTextColor(kMagenta+2) + # tlabels[nc] = TLatex(first[nc] + 0.02 * tot_trials, 10, ttt) + # tlabels[nc].SetTextColor(kMagenta+2) + # tlabels[nc].SetTextColor(kMagenta+2) # Extract bin count cases if mask[nc] == 2: @@ -1145,104 +1139,112 @@ def draw_kernel(self, root_pad, root_objects=[], **draw_args): #pylint: disable= continue if bkg_func[i_bkg] in hmeanname: - h_raw_yield_all_bkgs[bkg_func[i_bkg]] = \ - TH1F(f"hRawYieldAll_{bkg_func[i_bkg]}", - " ; Trial # ; raw yield", tot_trials, 0., tot_trials) + h_raw_yield_all_bkgs[bkg_func[i_bkg]] = TH1F( + f"hRawYieldAll_{bkg_func[i_bkg]}", " ; Trial # ; raw yield", tot_trials, 0.0, tot_trials + ) h_raw_yield_all_bkgs[bkg_func[i_bkg]].SetLineColor(bkg_colors[i_bkg]) h_raw_yield_all_bkgs[bkg_func[i_bkg]].SetMarkerColor(bkg_colors[i_bkg]) h_raw_yield_all_bkgs[bkg_func[i_bkg]].SetStats(0) - h_mean_all_bkgs[bkg_func[i_bkg]] = \ - TH1F(f"hMeanAll_{bkg_func[i_bkg]}", - " ; Trial # ; Gaussian mean", tot_trials, 0., tot_trials) + h_mean_all_bkgs[bkg_func[i_bkg]] = TH1F( + f"hMeanAll_{bkg_func[i_bkg]}", " ; Trial # ; Gaussian mean", tot_trials, 0.0, tot_trials + ) h_mean_all_bkgs[bkg_func[i_bkg]].SetLineColor(bkg_colors[i_bkg]) h_mean_all_bkgs[bkg_func[i_bkg]].SetMarkerColor(bkg_colors[i_bkg]) h_mean_all_bkgs[bkg_func[i_bkg]].SetMinimum(0.8 * mean_ref) h_mean_all_bkgs[bkg_func[i_bkg]].SetMaximum(1.2 * mean_ref) h_mean_all_bkgs[bkg_func[i_bkg]].SetStats(0) - h_sigma_all_bkgs[bkg_func[i_bkg]] = \ - TH1F(f"hSigmaAll_{bkg_func[i_bkg]}", - " ; Trial # ; Gaussian Sigma", tot_trials, 0., tot_trials) + h_sigma_all_bkgs[bkg_func[i_bkg]] = TH1F( + f"hSigmaAll_{bkg_func[i_bkg]}", " ; Trial # ; Gaussian Sigma", tot_trials, 0.0, tot_trials + ) h_sigma_all_bkgs[bkg_func[i_bkg]].SetLineColor(bkg_colors[i_bkg]) h_sigma_all_bkgs[bkg_func[i_bkg]].SetMarkerColor(bkg_colors[i_bkg]) - h_sigma_all_bkgs[bkg_func[i_bkg]].SetMinimum(0.) 
+            h_sigma_all_bkgs[bkg_func[i_bkg]].SetMinimum(0.0)
             h_sigma_all_bkgs[bkg_func[i_bkg]].SetMaximum(1.1 * sigma_ref)
             h_sigma_all_bkgs[bkg_func[i_bkg]].SetStats(0)
 
-            h_chi2_all_bkgs[bkg_func[i_bkg]] = \
-                    TH1F(f"hChi2All_{bkg_func[i_bkg]}",
-                         " ; Trial # ; #Chi^{2}/ndf", tot_trials, 0., tot_trials)
+            h_chi2_all_bkgs[bkg_func[i_bkg]] = TH1F(
+                f"hChi2All_{bkg_func[i_bkg]}", " ; Trial # ; #Chi^{2}/ndf", tot_trials, 0.0, tot_trials
+            )
             h_chi2_all_bkgs[bkg_func[i_bkg]].SetLineColor(bkg_colors[i_bkg])
             h_chi2_all_bkgs[bkg_func[i_bkg]].SetMarkerColor(bkg_colors[i_bkg])
             h_chi2_all_bkgs[bkg_func[i_bkg]].SetMarkerStyle(7)
             h_chi2_all_bkgs[bkg_func[i_bkg]].SetStats(0)
 
-            h_signif_all_bkgs[bkg_func[i_bkg]] = \
-                    TH1F(f"hSignifAll_{bkg_func[i_bkg]}",
-                         " ; Trial # ; Significance", tot_trials, 0., tot_trials)
+            h_signif_all_bkgs[bkg_func[i_bkg]] = TH1F(
+                f"hSignifAll_{bkg_func[i_bkg]}", " ; Trial # ; Significance", tot_trials, 0.0, tot_trials
+            )
             h_signif_all_bkgs[bkg_func[i_bkg]].SetLineColor(bkg_colors[i_bkg])
             h_signif_all_bkgs[bkg_func[i_bkg]].SetMarkerColor(bkg_colors[i_bkg])
             h_signif_all_bkgs[bkg_func[i_bkg]].SetMarkerStyle(7)
             h_signif_all_bkgs[bkg_func[i_bkg]].SetStats(0)
 
-
         # Create histograms for fit and bin count yield to be plotted in the end
-        h_raw_yield_all_bc0 = TH1F(f"hRawYieldAllBC0", " ; Trial # ; raw yield BC0",
-                                   tot_trials_bc0 * n_bc_ranges, 0.,
-                                   tot_trials_bc0 * n_bc_ranges)
-
-        h_raw_yield_all_bc1 = TH1F(f"hRawYieldAllBC1", " ; Trial # ; raw yield BC1",
-                                   tot_trials_bc1 * n_bc_ranges, 0.,
-                                   tot_trials_bc1 * n_bc_ranges)
-
-
+        h_raw_yield_all_bc0 = TH1F(
+            "hRawYieldAllBC0",
+            " ; Trial # ; raw yield BC0",
+            tot_trials_bc0 * n_bc_ranges,
+            0.0,
+            tot_trials_bc0 * n_bc_ranges,
+        )
+
+        h_raw_yield_all_bc1 = TH1F(
+            "hRawYieldAllBC1",
+            " ; Trial # ; raw yield BC1",
+            tot_trials_bc1 * n_bc_ranges,
+            0.0,
+            tot_trials_bc1 * n_bc_ranges,
+        )
 
         lower_edge_yield_histos = yield_ref - 1.5 * yield_ref
-        lower_edge_yield_histos = max(0., lower_edge_yield_histos)
+        lower_edge_yield_histos = max(0.0, lower_edge_yield_histos)
         upper_edge_yield_histos = yield_ref + 1.5 * yield_ref
 
-        h_raw_yield_dist_all = TH1F("hRawYieldDistAll", " ; raw yield", 200,
-                                    lower_edge_yield_histos, upper_edge_yield_histos)
+        h_raw_yield_dist_all = TH1F(
+            "hRawYieldDistAll", " ; raw yield", 200, lower_edge_yield_histos, upper_edge_yield_histos
+        )
         h_raw_yield_dist_all.SetFillStyle(3003)
         h_raw_yield_dist_all.SetFillColor(kBlue + 1)
-        h_raw_yield_dist_all_bc0 = TH1F("hRawYieldDistAllBC0", " ; raw yield", 200,
-                                        lower_edge_yield_histos, upper_edge_yield_histos)
-        h_raw_yield_dist_all_bc1 = TH1F("hRawYieldDistAllBC1", " ; raw yield", 200,
-                                        lower_edge_yield_histos, upper_edge_yield_histos)
+        h_raw_yield_dist_all_bc0 = TH1F(
+            "hRawYieldDistAllBC0", " ; raw yield", 200, lower_edge_yield_histos, upper_edge_yield_histos
+        )
+        h_raw_yield_dist_all_bc1 = TH1F(
+            "hRawYieldDistAllBC1", " ; raw yield", 200, lower_edge_yield_histos, upper_edge_yield_histos
+        )
         h_raw_yield_dist_all_bc0.SetFillStyle(3004)
         h_raw_yield_dist_all_bc1.SetFillStyle(3004)
 
         # NOTE Not used at the moment
-        #TH1F* hStatErrDistAll=new TH1F("hStatErrDistAll"," ; Stat Unc on Yield",300,0,10000);
-        #TH1F* hRelStatErrDistAll=new TH1F("hRelStatErrDistAll",
+        # TH1F* hStatErrDistAll=new TH1F("hStatErrDistAll"," ; Stat Unc on Yield",300,0,10000);
+        # TH1F* hRelStatErrDistAll=new TH1F("hRelStatErrDistAll",
         #                                 " ; Rel Stat Unc on Yield",100,0.,1.);
         #######################################################################
 
-        min_yield = 999999.
-        max_yield = 0.
-        sumy = [0.] 
* 4 - sumwei = [0.] * 4 - sumerr = [0.] * 4 - counts = 0. + min_yield = 999999.0 + max_yield = 0.0 + sumy = [0.0] * 4 + sumwei = [0.0] * 4 + sumerr = [0.0] * 4 + counts = 0.0 wei = [None] * 4 ################## # Extract yields # ################## # Cache min/max values for plotting later - sigma_max = 0. - sigma_min = 1. - mean_max = -1. - mean_min = 10000. - chi2_max = -1. - chi2_min = 10000. - signif_max = -1. - signif_min = 10000. - yields_fit_max = -1. - yields_fit_min = 10000. - yields_bc_max = -1. - yields_bc_min = 10000. + sigma_max = 0.0 + sigma_min = 1.0 + mean_max = -1.0 + mean_min = 10000.0 + chi2_max = -1.0 + chi2_min = 10000.0 + signif_max = -1.0 + signif_min = 10000.0 + yields_fit_max = -1.0 + yields_fit_min = 10000.0 + yields_bc_max = -1.0 + yields_bc_min = 10000.0 for nc in range(tot_cases): if not mask[nc]: continue @@ -1279,9 +1281,13 @@ def draw_kernel(self, root_pad, root_objects=[], **draw_args): #pylint: disable= signif = hsignift6.GetBinContent(ib) # Fill - if ry < 0.001 or (0.5 * ry) < ery or ery < (0.01 * ry) \ - or chi2 > self.init_pars["chi2_max_syst"] \ - or signif < self.init_pars["signif_min_syst"]: + if ( + ry < 0.001 + or (0.5 * ry) < ery + or ery < (0.01 * ry) + or chi2 > self.init_pars["chi2_max_syst"] + or signif < self.init_pars["signif_min_syst"] + ): continue successful_trials += 1 # Get the right histograms to fill @@ -1297,21 +1303,21 @@ def draw_kernel(self, root_pad, root_objects=[], **draw_args): #pylint: disable= yields_fit_max = max(ry + ery, yields_fit_max, yield_ref) yields_fit_min = min(ry - ery, yields_fit_min, yield_ref) # NOTE Not used at the moment - #hStatErrDistAll->Fill(ery); - #hRelStatErrDistAll->Fill(ery/ry); + # hStatErrDistAll->Fill(ery); + # hRelStatErrDistAll->Fill(ery/ry); min_yield = min(ry, min_yield) max_yield = max(ry, max_yield) - wei[0] = 1. - wei[1] = 1. / (ery * ery) - wei[2] = 1. / (ery * ery / (ry * ry)) - wei[3] = 1. / (ery * ery / ry) + wei[0] = 1.0 + wei[1] = 1.0 / (ery * ery) + wei[2] = 1.0 / (ery * ery / (ry * ry)) + wei[3] = 1.0 / (ery * ery / ry) for kw in range(4): sumy[kw] += wei[kw] * ry sumerr[kw] += wei[kw] * wei[kw] * ery * ery sumwei[kw] += wei[kw] - counts += 1. + counts += 1.0 h_sigma_all_bkgs[bkg_func_name].SetBinContent(first[nc] + ib, sig) h_sigma_all_bkgs[bkg_func_name].SetBinError(first[nc] + ib, esig) # Collect maximum and minimum for plotting later @@ -1336,7 +1342,7 @@ def draw_kernel(self, root_pad, root_objects=[], **draw_args): #pylint: disable= ebc = hbc2dt060.GetBinError(ib, iy) bc_1 = hbc2dt060_bc1.GetBinContent(ib, iy) ebc_1 = hbc2dt060_bc1.GetBinError(ib, iy) - #if(bc>0.001 && ebc<0.5*bc && bc<5.*ry){ + # if(bc>0.001 && ebc<0.5*bc && bc<5.*ry){ if bc < 0.001: continue the_bin = iy + (first_bc0[nc] + ib - 1) * n_bc_ranges @@ -1352,11 +1358,10 @@ def draw_kernel(self, root_pad, root_objects=[], **draw_args): #pylint: disable= yields_bc_max = max(bc_1 + ebc_1, yields_bc_max, yield_ref) yields_bc_min = min(bc_1 - ebc_1, yields_bc_min, yield_ref) - - weiav = [0.] * 4 - eweiav = [0.] 
* 4 + weiav = [0.0] * 4 + eweiav = [0.0] * 4 for kw in range(4): - if sumwei[kw] > 0.: + if sumwei[kw] > 0.0: weiav[kw] = sumy[kw] / sumwei[kw] eweiav[kw] = sqrt(sumerr[kw]) / sumwei[kw] @@ -1371,8 +1376,7 @@ def draw_kernel(self, root_pad, root_objects=[], **draw_args): #pylint: disable= h_raw_yield_dist_all_bc0.SetLineWidth(1) h_raw_yield_dist_all_bc0.SetLineStyle(1) if h_raw_yield_dist_all_bc0.GetEntries() > 0: - h_raw_yield_dist_all_bc0.Scale(\ - h_raw_yield_dist_all.GetEntries() / h_raw_yield_dist_all_bc0.GetEntries()) + h_raw_yield_dist_all_bc0.Scale(h_raw_yield_dist_all.GetEntries() / h_raw_yield_dist_all_bc0.GetEntries()) h_raw_yield_all_bc1.SetStats(0) h_raw_yield_all_bc1.SetMarkerColor(color_bc1) @@ -1382,16 +1386,13 @@ def draw_kernel(self, root_pad, root_objects=[], **draw_args): #pylint: disable= h_raw_yield_dist_all_bc1.SetLineWidth(1) h_raw_yield_dist_all_bc1.SetLineStyle(1) if h_raw_yield_dist_all_bc1.GetEntries() > 0: - h_raw_yield_dist_all_bc1.Scale(\ - h_raw_yield_dist_all.GetEntries() / h_raw_yield_dist_all_bc1.GetEntries()) + h_raw_yield_dist_all_bc1.Scale(h_raw_yield_dist_all.GetEntries() / h_raw_yield_dist_all_bc1.GetEntries()) h_raw_yield_dist_all.SetStats(0) h_raw_yield_dist_all.SetLineWidth(1) - def make_ref_line(x_low, y_low, x_up, y_up): - """Making a reference line - """ + """Making a reference line""" line = TLine(x_low, y_low, x_up, y_up) line.SetLineColor(kRed) line.SetLineWidth(2) @@ -1399,15 +1400,14 @@ def make_ref_line(x_low, y_low, x_up, y_up): return line def fill_pad(pad, ylims, histos, ref_line=None): - """Filling a pad - """ + """Filling a pad""" pad.cd() pad.SetLeftMargin(0.13) pad.SetRightMargin(0.06) lim_delta = ylims[1] - ylims[0] lim_min = ylims[0] - 0.1 * lim_delta lim_max = ylims[1] + 0.1 * lim_delta - for h in histos: + for h in histos: h.GetYaxis().SetTitleOffset(1.7) h.Draw("same") h.GetYaxis().SetRangeUser(lim_min, lim_max) @@ -1416,23 +1416,27 @@ def fill_pad(pad, ylims, histos, ref_line=None): if ref_line: ref_line.Draw("same") - root_pad.Divide(3, 2) # Sigmas - fill_pad(root_pad.cd(1), (sigma_min, sigma_max), h_sigma_all_bkgs.values(), - make_ref_line(0., sigma_ref, tot_trials, sigma_ref)) + fill_pad( + root_pad.cd(1), + (sigma_min, sigma_max), + h_sigma_all_bkgs.values(), + make_ref_line(0.0, sigma_ref, tot_trials, sigma_ref), + ) # Means mean_pad = root_pad.cd(2) - fill_pad(mean_pad, (mean_min, mean_max), h_mean_all_bkgs.values(), - make_ref_line(0., mean_ref, tot_trials, mean_ref)) + fill_pad( + mean_pad, (mean_min, mean_max), h_mean_all_bkgs.values(), make_ref_line(0.0, mean_ref, tot_trials, mean_ref) + ) # Legend bkg_func_legend = TLegend(0.2, 0.2, 0.5, 0.5) bkg_func_legend.SetTextSize(0.04) bkg_func_legend.SetBorderSize(0) bkg_func_legend.SetFillStyle(0) root_objects.append(bkg_func_legend) - for name, histo in h_mean_all_bkgs.items(): + for name, histo in h_mean_all_bkgs.items(): bkg_func_legend.AddEntry(histo, name) bkg_func_legend.Draw("same") @@ -1441,24 +1445,39 @@ def fill_pad(pad, ylims, histos, ref_line=None): chi2_signif_pad.Divide(1, 2) # Chi2 - fill_pad(chi2_signif_pad.cd(1), (chi2_min, chi2_max), h_chi2_all_bkgs.values(), - make_ref_line(0., chi2_ref, tot_trials, chi2_ref)) + fill_pad( + chi2_signif_pad.cd(1), + (chi2_min, chi2_max), + h_chi2_all_bkgs.values(), + make_ref_line(0.0, chi2_ref, tot_trials, chi2_ref), + ) # Significance - fill_pad(chi2_signif_pad.cd(2), (signif_min, signif_max), h_signif_all_bkgs.values(), - make_ref_line(0., signif_ref, tot_trials, signif_ref)) + fill_pad( + chi2_signif_pad.cd(2), + 
(signif_min, signif_max), + h_signif_all_bkgs.values(), + make_ref_line(0.0, signif_ref, tot_trials, signif_ref), + ) # Fit yields and bin counts yield_pad = root_pad.cd(4) yield_pad.Divide(1, 2) # Fit yields - fill_pad(yield_pad.cd(1), (yields_fit_min, yields_fit_max), h_raw_yield_all_bkgs.values(), - make_ref_line(0., yield_ref, tot_trials, yield_ref)) + fill_pad( + yield_pad.cd(1), + (yields_fit_min, yields_fit_max), + h_raw_yield_all_bkgs.values(), + make_ref_line(0.0, yield_ref, tot_trials, yield_ref), + ) # BC yields - fill_pad(yield_pad.cd(2), (yields_bc_min, yields_bc_max), - (h_raw_yield_all_bc0, h_raw_yield_all_bc1), - make_ref_line(0., yield_ref, tot_trials * n_bc_ranges, yield_ref)) + fill_pad( + yield_pad.cd(2), + (yields_bc_min, yields_bc_max), + (h_raw_yield_all_bc0, h_raw_yield_all_bc1), + make_ref_line(0.0, yield_ref, tot_trials * n_bc_ranges, yield_ref), + ) yield_pad = root_pad.cd(5) yield_pad.SetLeftMargin(0.14) @@ -1476,29 +1495,27 @@ def fill_pad(pad, ylims, histos, ref_line=None): h_raw_yield_dist_all_bc1.Draw("sameshist") root_objects.append(h_raw_yield_dist_all_bc1) h_raw_yield_dist_all_bc1.SetDirectory(0) - make_ref_line(yield_ref, 0., yield_ref, h_raw_yield_dist_all.GetMaximum()).Draw("same") + make_ref_line(yield_ref, 0.0, yield_ref, h_raw_yield_dist_all.GetMaximum()).Draw("same") yield_pad.Update() # This might be taken care of later - #st = h_raw_yield_dist_all.GetListOfFunctions().FindObject("stats") - #st.SetY1NDC(0.71) - #st.SetY2NDC(0.9) - #stb0 = h_raw_yield_dist_all_bc0.GetListOfFunctions().FindObject("stats") - #stb0.SetY1NDC(0.51) - #stb0.SetY2NDC(0.7) - #stb0.SetTextColor(h_raw_yield_dist_all_bc0.GetLineColor()) - perc = array("d", [0.15, 0.5, 0.85]) # quantiles for +-1 sigma - lim70 = array("d", [0.] * 3) + # st = h_raw_yield_dist_all.GetListOfFunctions().FindObject("stats") + # st.SetY1NDC(0.71) + # st.SetY2NDC(0.9) + # stb0 = h_raw_yield_dist_all_bc0.GetListOfFunctions().FindObject("stats") + # stb0.SetY1NDC(0.51) + # stb0.SetY2NDC(0.7) + # stb0.SetTextColor(h_raw_yield_dist_all_bc0.GetLineColor()) + perc = array("d", [0.15, 0.5, 0.85]) # quantiles for +-1 sigma + lim70 = array("d", [0.0] * 3) h_raw_yield_dist_all.GetQuantiles(3, lim70, perc) - ####################### # Numbers and summary # ####################### def make_latex(pos_x, pos_y, text, color=None, ndc=True): - """Helper to make TLatex - """ + """Helper to make TLatex""" tlatex = TLatex(pos_x, pos_y, text) tlatex.SetTextSize(0.04) if ndc: @@ -1508,7 +1525,6 @@ def make_latex(pos_x, pos_y, text, color=None, ndc=True): root_objects.append(tlatex) return tlatex - sum_pad = root_pad.cd(6) sum_pad.SetLeftMargin(0.14) sum_pad.SetRightMargin(0.06) @@ -1516,59 +1532,57 @@ def make_latex(pos_x, pos_y, text, color=None, ndc=True): yield_fit_color = h_raw_yield_dist_all.GetLineColor() yield_bc0_color = h_raw_yield_dist_all_bc0.GetLineColor() yield_bc1_color = h_raw_yield_dist_all_bc1.GetLineColor() - rel_succ_trials = successful_trials / tot_trials if tot_trials > 0 else 0. - make_latex(0.15, 0.93, f"succ. trials = {successful_trials} / {tot_trials} " \ - f"({rel_succ_trials * 100.:.2f}%)").Draw("same") + rel_succ_trials = successful_trials / tot_trials if tot_trials > 0 else 0.0 + make_latex( + 0.15, 0.93, f"succ. 
trials = {successful_trials} / {tot_trials} ({rel_succ_trials * 100.0:.2f}%)" + ).Draw("same") make_latex(0.15, 0.87, f"mean = {aver:.3f}", color=yield_fit_color).Draw("same") make_latex(0.15, 0.81, f"median = {lim70[1]:.3f}", color=yield_fit_color).Draw("same") aver_bc0 = h_raw_yield_dist_all_bc0.GetMean() - make_latex(0.15, 0.75, f"mean(BinCount0) = {aver_bc0:.3f}", - color=yield_bc0_color).Draw("same") + make_latex(0.15, 0.75, f"mean(BinCount0) = {aver_bc0:.3f}", color=yield_bc0_color).Draw("same") aver_bc1 = h_raw_yield_dist_all_bc1.GetMean() - make_latex(0.15, 0.69, f"mean(BinCount1) = {aver_bc1:.3f}", - color=yield_bc1_color).Draw("same") + make_latex(0.15, 0.69, f"mean(BinCount1) = {aver_bc1:.3f}", color=yield_bc1_color).Draw("same") val = h_raw_yield_dist_all.GetRMS() val_rel = val / aver * 100 if aver != 0 else 0 - make_latex(0.15, 0.60, f"rms = {val:.3f} ({val_rel:.2f}%)", - color=yield_fit_color).Draw("same") + make_latex(0.15, 0.60, f"rms = {val:.3f} ({val_rel:.2f}%)", color=yield_fit_color).Draw("same") val = h_raw_yield_dist_all_bc0.GetRMS() - val_rel = val / aver_bc0 * 100. if aver_bc0 != 0 else 0 - make_latex(0.15, 0.54, f"rms(BinCount0) = {val:.3f} ({val_rel:.2f}%)", - color=yield_bc0_color).Draw("same") + val_rel = val / aver_bc0 * 100.0 if aver_bc0 != 0 else 0 + make_latex(0.15, 0.54, f"rms(BinCount0) = {val:.3f} ({val_rel:.2f}%)", color=yield_bc0_color).Draw("same") val = h_raw_yield_dist_all_bc1.GetRMS() - val_rel = val / aver_bc1 * 100. if aver_bc1 != 0 else 0 - make_latex(0.15, 0.48, f"rms(BinCount1) = {val:.3f} ({val_rel:.2f}%)", - color=yield_bc1_color).Draw("same") + val_rel = val / aver_bc1 * 100.0 if aver_bc1 != 0 else 0 + make_latex(0.15, 0.48, f"rms(BinCount1) = {val:.3f} ({val_rel:.2f}%)", color=yield_bc1_color).Draw("same") - make_latex(0.15, 0.39, f"min = {min_yield:.2f} ; max = {max_yield:.2f}", - color=yield_fit_color).Draw("same") + make_latex(0.15, 0.39, f"min = {min_yield:.2f} ; max = {max_yield:.2f}", color=yield_fit_color).Draw("same") val = (max_yield - min_yield) / sqrt(12) - val_rel = val / aver * 100. 
if aver != 0 else 0 - make_latex(0.15, 0.33, - f"(max - min) / #sqrt{{12}} = {val:.3f} ({val_rel:.2f}%)", - color=yield_fit_color).Draw("same") + val_rel = val / aver * 100.0 if aver != 0 else 0 + make_latex(0.15, 0.33, f"(max - min) / #sqrt{{12}} = {val:.3f} ({val_rel:.2f}%)", color=yield_fit_color).Draw( + "same" + ) make_latex(0.15, 0.27, f"ref = {yield_ref:.2f}", color=kRed).Draw("same") val_rel = 100 * (yield_ref - aver) / yield_ref if yield_ref != 0 else 0 - make_latex(0.15, 0.21, f"ref - mean(fit) = {yield_ref - aver:.3f} " \ - f"({val_rel:.2f}%)", color=yield_fit_color).Draw("same") + make_latex( + 0.15, 0.21, f"ref - mean(fit) = {yield_ref - aver:.3f} ({val_rel:.2f}%)", color=yield_fit_color + ).Draw("same") val_rel = 100 * (yield_ref - aver_bc0) / yield_ref if yield_ref != 0 else 0 - make_latex(0.15, 0.15, f"ref - mean(BC0) = {yield_ref - aver_bc0:.3f} " \ - f"({val_rel:.2f}%)", color=yield_bc0_color).Draw("same") + make_latex( + 0.15, 0.15, f"ref - mean(BC0) = {yield_ref - aver_bc0:.3f} ({val_rel:.2f}%)", color=yield_bc0_color + ).Draw("same") val_rel = 100 * (yield_ref - aver_bc1) / yield_ref if yield_ref != 0 else 0 - make_latex(0.15, 0.09, f"ref - mean(BC1) = {yield_ref - aver_bc1:.3f} " \ - f"({val_rel:.2f}%)", color=yield_bc1_color).Draw("same") + make_latex( + 0.15, 0.09, f"ref - mean(BC1) = {yield_ref - aver_bc1:.3f} ({val_rel:.2f}%)", color=yield_bc1_color + ).Draw("same") if draw_args: self.logger.warning("There are unknown draw arguments") diff --git a/machine_learning_hep/fitting/helpers.py b/machine_learning_hep/fitting/helpers.py index 23b7d60eef..1d0f93b362 100644 --- a/machine_learning_hep/fitting/helpers.py +++ b/machine_learning_hep/fitting/helpers.py @@ -13,23 +13,24 @@ ############################################################################# -from os.path import join -import os import math -from glob import glob +import os from array import array from ctypes import c_double +from glob import glob +from os.path import join -#pylint: disable=too-many-lines, too-few-public-methods, consider-using-f-string, too-many-statements -from ROOT import TFile, TH1F, TF1, TCanvas, gStyle #pylint: disable=import-error, no-name-in-module +# pylint: disable=too-many-lines, too-few-public-methods, consider-using-f-string, too-many-statements +from ROOT import TF1, TH1F, TCanvas, TFile, gStyle # pylint: disable=import-error, no-name-in-module +from machine_learning_hep.fitting.fitters import FitAliHF, FitROOTGauss, FitSystAliHF +from machine_learning_hep.fitting.utils import load_fit, save_fit from machine_learning_hep.logger import get_logger from machine_learning_hep.utilities import make_file_path from machine_learning_hep.utilities_plot import plot_histograms -from machine_learning_hep.fitting.utils import save_fit, load_fit -from machine_learning_hep.fitting.fitters import FitAliHF, FitROOTGauss, FitSystAliHF -class MLFitParsFactory: # pylint: disable=too-many-instance-attributes + +class MLFitParsFactory: # pylint: disable=too-many-instance-attributes """ Managing MLHEP specific fit parameters and is used to collect and retrieve all information required to initialise a (systematic) fit @@ -38,7 +39,7 @@ class MLFitParsFactory: # pylint: disable=too-many-instance-attributes SIG_FUNC_MAP = {"kGaus": 0, "k2Gaus": 1, "kGausSigmaRatioPar": 2} BKG_FUNC_MAP = {"kExpo": 0, "kLin": 1, "Pol2": 2, "kNoBk": 3, "kPow": 4, "kPowEx": 5} - def __init__(self, database: dict, ana_type: str, file_data_name: str, file_mc_name: str): # pylint: disable=too-many-branches + def __init__(self, 
database: dict, ana_type: str, file_data_name: str, file_mc_name: str):  # pylint: disable=too-many-branches
        """
        Initialize MLFitParsFactory
        Args:
@@ -103,7 +104,6 @@ def __init__(self, database: dict, ana_type: str, file_data_name: str, file_mc_n
         except (TypeError, KeyError):
             self.rebin = [self.rebin for _ in range(self.n_bins2)]
 
-
         # Initial fit parameters
         self.mean = ana_config["masspeak"]
         try:
@@ -146,8 +146,7 @@ def __init__(self, database: dict, ana_type: str, file_data_name: str, file_mc_n
         except TypeError:
             self.fix_sec_mean = [self.fix_sec_mean for _ in range(self.n_bins2)]
         self.sec_sigma = ana_config.get("widthsecpeak", None) if self.include_sec_peak else None
-        self.fix_sec_sigma = ana_config.get("fix_widthsecpeak", None) \
-                if self.include_sec_peak else None
+        self.fix_sec_sigma = ana_config.get("fix_widthsecpeak", None) if self.include_sec_peak else None
 
         # Reflections flag
         self.include_reflections = ana_config.get("include_reflection", False)
@@ -186,7 +185,6 @@ def __init__(self, database: dict, ana_type: str, file_data_name: str, file_mc_n
         except TypeError:
             self.syst_rel_var_sigma_down = [self.syst_rel_var_sigma_down] * self.n_bins1
 
-
     def make_ali_hf_fit_pars(self, ibin1, ibin2):
        """
        Making fit parameters for AliHF mass fitter
@@ -197,18 +195,20 @@ def make_ali_hf_fit_pars(self, ibin1, ibin2):
             dictionary of fit parameters
         """
 
-        fit_pars = {"sig_func_name": MLFitParsFactory.SIG_FUNC_MAP[self.sig_func_name[ibin1]],
-                    "bkg_func_name": MLFitParsFactory.BKG_FUNC_MAP[self.bkg_func_name[ibin1]],
-                    "likelihood": self.likelihood,
-                    "rebin": self.rebin[ibin2][ibin1],
-                    "fit_range_low": self.fit_range_low[ibin1],
-                    "fit_range_up": self.fit_range_up[ibin1],
-                    "n_sigma_sideband": self.n_sigma_sideband,
-                    "rel_sigma_bound": self.rel_sigma_bound,
-                    "mean": self.mean[ibin2][ibin1],
-                    "sigma": self.sigma[ibin1],
-                    "fix_mean": self.fix_mean,
-                    "fix_sigma": self.fix_sigma[ibin1]}
+        fit_pars = {
+            "sig_func_name": MLFitParsFactory.SIG_FUNC_MAP[self.sig_func_name[ibin1]],
+            "bkg_func_name": MLFitParsFactory.BKG_FUNC_MAP[self.bkg_func_name[ibin1]],
+            "likelihood": self.likelihood,
+            "rebin": self.rebin[ibin2][ibin1],
+            "fit_range_low": self.fit_range_low[ibin1],
+            "fit_range_up": self.fit_range_up[ibin1],
+            "n_sigma_sideband": self.n_sigma_sideband,
+            "rel_sigma_bound": self.rel_sigma_bound,
+            "mean": self.mean[ibin2][ibin1],
+            "sigma": self.sigma[ibin1],
+            "fix_mean": self.fix_mean,
+            "fix_sigma": self.fix_sigma[ibin1],
+        }
 
         fit_pars["include_sec_peak"] = self.include_sec_peak[ibin2][ibin1]
         if self.include_sec_peak[ibin2][ibin1]:
@@ -226,7 +226,6 @@ def make_ali_hf_fit_pars(self, ibin1, ibin2):
 
         return fit_pars
 
-
     def make_ali_hf_syst_pars(self, ibin1, ibin2):
         """
         Making fit systematic parameters for AliHF mass fitter
@@ -237,29 +236,31 @@ def make_ali_hf_syst_pars(self, ibin1, ibin2):
             dictionary of systematic fit parameters
         """
 
-        fit_pars = {"mean": None,
-                    "sigma": None,
-                    "rebin": self.rebin[ibin2][ibin1],
-                    "fit_range_low": self.fit_range_low[ibin1],
-                    "fit_range_up": self.fit_range_up[ibin1],
-                    "likelihood": self.likelihood,
-                    "n_sigma_sideband": self.n_sigma_sideband,
-                    "mean_ref": None,
-                    "sigma_ref": None,
-                    "yield_ref": None,
-                    "chi2_ref": None,
-                    "signif_ref": None,
-                    "fit_range_low_syst": self.syst_pars.get("massmin", None),
-                    "fit_range_up_syst": self.syst_pars.get("massmax", None),
-                    "bin_count_sigma_syst": self.syst_pars.get("bincount_sigma", None),
-                    "bkg_func_names_syst": self.syst_pars.get("bkg_funcs", None),
-                    "rebin_syst": self.syst_pars.get("rebin", None),
-                    # Check DB
-                    
"consider_free_sigma_syst": self.syst_consider_free_sigma[ibin1], - "rel_var_sigma_up_syst": self.syst_rel_var_sigma_up[ibin1], - "rel_var_sigma_down_syst": self.syst_rel_var_sigma_down[ibin1], - "signif_min_syst": self.syst_pars.get("min_signif", 3.), - "chi2_max_syst": self.syst_pars.get("max_chisquare_ndf", 2.)} + fit_pars = { + "mean": None, + "sigma": None, + "rebin": self.rebin[ibin2][ibin1], + "fit_range_low": self.fit_range_low[ibin1], + "fit_range_up": self.fit_range_up[ibin1], + "likelihood": self.likelihood, + "n_sigma_sideband": self.n_sigma_sideband, + "mean_ref": None, + "sigma_ref": None, + "yield_ref": None, + "chi2_ref": None, + "signif_ref": None, + "fit_range_low_syst": self.syst_pars.get("massmin", None), + "fit_range_up_syst": self.syst_pars.get("massmax", None), + "bin_count_sigma_syst": self.syst_pars.get("bincount_sigma", None), + "bkg_func_names_syst": self.syst_pars.get("bkg_funcs", None), + "rebin_syst": self.syst_pars.get("rebin", None), + # Check DB + "consider_free_sigma_syst": self.syst_consider_free_sigma[ibin1], + "rel_var_sigma_up_syst": self.syst_rel_var_sigma_up[ibin1], + "rel_var_sigma_down_syst": self.syst_rel_var_sigma_down[ibin1], + "signif_min_syst": self.syst_pars.get("min_signif", 3.0), + "chi2_max_syst": self.syst_pars.get("max_chisquare_ndf", 2.0), + } fit_pars["include_sec_peak"] = self.include_sec_peak[ibin2][ibin1] if self.include_sec_peak[ibin2][ibin1]: @@ -277,7 +278,6 @@ def make_ali_hf_syst_pars(self, ibin1, ibin2): return fit_pars - def make_suffix(self, ibin1, ibin2): """ Build name suffix to find histograms in ROOT file @@ -289,26 +289,42 @@ def make_suffix(self, ibin1, ibin2): """ if self.bin2_name is not None: if self.mltype == "MultiClassification": - return "%s%d_%d_%.2f%.2f%.2f%s_%.2f_%.2f" % \ - (self.bin1_name, self.bins1_edges_low[ibin1], - self.bins1_edges_up[ibin1], self.prob_cut_fin[ibin1][0], - self.prob_cut_fin[ibin1][1], self.prob_cut_fin[ibin1][2], - self.bin2_name, self.bins2_edges_low[ibin2], - self.bins2_edges_up[ibin2]) - return "%s%d_%d_%.2f%s_%.2f_%.2f" % \ - (self.bin1_name, self.bins1_edges_low[ibin1], - self.bins1_edges_up[ibin1], self.prob_cut_fin[ibin1], - self.bin2_name, self.bins2_edges_low[ibin2], - self.bins2_edges_up[ibin2]) + return "%s%d_%d_%.2f%.2f%.2f%s_%.2f_%.2f" % ( + self.bin1_name, + self.bins1_edges_low[ibin1], + self.bins1_edges_up[ibin1], + self.prob_cut_fin[ibin1][0], + self.prob_cut_fin[ibin1][1], + self.prob_cut_fin[ibin1][2], + self.bin2_name, + self.bins2_edges_low[ibin2], + self.bins2_edges_up[ibin2], + ) + return "%s%d_%d_%.2f%s_%.2f_%.2f" % ( + self.bin1_name, + self.bins1_edges_low[ibin1], + self.bins1_edges_up[ibin1], + self.prob_cut_fin[ibin1], + self.bin2_name, + self.bins2_edges_low[ibin2], + self.bins2_edges_up[ibin2], + ) if self.mltype == "MultiClassification": - return "%s%d_%d_%.2f%.2f%.2f" % \ - (self.bin1_name, self.bins1_edges_low[ibin1], - self.bins1_edges_up[ibin1], self.prob_cut_fin[ibin1][0], - self.prob_cut_fin[ibin1][1], self.prob_cut_fin[ibin1][2]) - return "%s%d_%d_%.2f" % \ - (self.bin1_name, self.bins1_edges_low[ibin1], - self.bins1_edges_up[ibin1], self.prob_cut_fin[ibin1]) + return "%s%d_%d_%.2f%.2f%.2f" % ( + self.bin1_name, + self.bins1_edges_low[ibin1], + self.bins1_edges_up[ibin1], + self.prob_cut_fin[ibin1][0], + self.prob_cut_fin[ibin1][1], + self.prob_cut_fin[ibin1][2], + ) + return "%s%d_%d_%.2f" % ( + self.bin1_name, + self.bins1_edges_low[ibin1], + self.bins1_edges_up[ibin1], + self.prob_cut_fin[ibin1], + ) def get_histograms(self, ibin1, ibin2, 
get_data=True, get_mc=False, get_reflections=False):
        """
@@ -361,7 +377,6 @@ def get_histograms(self, ibin1, ibin2, get_data=True, get_mc=False, get_reflecti
 
         return histo_data, histo_mc, histo_reflections
 
-
     def get_fit_pars(self, ibin1, ibin2):
         """
         Collect histograms, fit parameters and the information whether this fit should be
@@ -374,22 +389,21 @@ def get_fit_pars(self, ibin1, ibin2):
         """
 
         fit_pars = self.make_ali_hf_fit_pars(ibin1, ibin2)
-        histo_data, histo_mc, histo_reflections = self.get_histograms(ibin1, ibin2, \
-                                                  get_data=True, get_mc=True, \
-                                                  get_reflections=fit_pars["include_reflections"])
+        histo_data, histo_mc, histo_reflections = self.get_histograms(
+            ibin1, ibin2, get_data=True, get_mc=True, get_reflections=fit_pars["include_reflections"]
+        )
 
         lock_override_init = ["sigma"] if self.use_user_sigma[ibin1] else []
         if self.use_user_mean[ibin1]:
             lock_override_init.append("mean")
 
-        return {"histograms": {"data": histo_data,
-                               "mc": histo_mc,
-                               "reflections": histo_reflections},
-                "init_from": self.init_fits_from[ibin1],
-                "lock_override_init": lock_override_init,
-                "init_pars": fit_pars,
-                "pre_fit_mc": {"type_gauss": self.pre_fit_class_mc[ibin1]}}
-
+        return {
+            "histograms": {"data": histo_data, "mc": histo_mc, "reflections": histo_reflections},
+            "init_from": self.init_fits_from[ibin1],
+            "lock_override_init": lock_override_init,
+            "init_pars": fit_pars,
+            "pre_fit_mc": {"type_gauss": self.pre_fit_class_mc[ibin1]},
+        }
 
     def get_syst_pars(self, ibin1, ibin2):
         """
@@ -407,16 +421,19 @@ def get_syst_pars(self, ibin1, ibin2):
             return None
 
         fit_pars = self.make_ali_hf_syst_pars(ibin1, ibin2)
-        histo_data, histo_mc, histo_reflections = self.get_histograms(ibin1, ibin2, \
-                get_data=True, get_mc=fit_pars["include_reflections"], \
-                get_reflections=fit_pars["include_reflections"])
-
-        return {"histograms": {"data": histo_data,
-                               "mc": histo_mc,
-                               "reflections": histo_reflections},
-                "init_from": self.syst_init_sigma_from[ibin2][ibin1],
-                "init_pars": fit_pars}
-
+        histo_data, histo_mc, histo_reflections = self.get_histograms(
+            ibin1,
+            ibin2,
+            get_data=True,
+            get_mc=fit_pars["include_reflections"],
+            get_reflections=fit_pars["include_reflections"],
+        )
+
+        return {
+            "histograms": {"data": histo_data, "mc": histo_mc, "reflections": histo_reflections},
+            "init_from": self.syst_init_sigma_from[ibin2][ibin1],
+            "init_pars": fit_pars,
+        }
 
     def yield_fit_pars(self):
         """
@@ -426,7 +443,6 @@ def yield_fit_pars(self):
         for ibin1 in range(self.n_bins1):
             yield ibin1, ibin2, self.get_fit_pars(ibin1, ibin2)
 
-
     def yield_syst_pars(self):
         """
         Yield bin numbers and corresponding systematic fit parameters one-by-one
@@ -436,14 +452,12 @@ def yield_syst_pars(self):
             yield ibin1, ibin2, self.get_syst_pars(ibin1, ibin2)
 
 
-class MLFitter: # pylint: disable=too-many-instance-attributes
+class MLFitter:  # pylint: disable=too-many-instance-attributes
    """
    Wrapper around all available fits instantiated and used in an MLHEP analysis run.
    """
-
-    def __init__(self, case: str, database: dict, ana_type: str,
-                 data_out_dir: str, mc_out_dir: str):
+    def __init__(self, case: str, database: dict, ana_type: str, data_out_dir: str, mc_out_dir: str):
        """
        Initialize MLFitter
        Args:
@@ -481,7 +495,6 @@ def __init__(self, case: str, database: dict, ana_type: str,
         self.is_initialized_syst = False
         self.done_syst = False
 
-
    def initialize_fits(self):
        """
        Initialize all fits required in an MLHEP analysis run. 
Using MLFitParsFactory to retrieve @@ -499,16 +512,17 @@ def initialize_fits(self): pre_fits_bins1 = [] for ibin1, ibin2, pars in self.pars_factory.yield_fit_pars(): - self.central_fits[(ibin1, ibin2)] = FitAliHF( \ - pars["init_pars"], \ - histo=pars["histograms"]["data"], \ - histo_mc=pars["histograms"]["mc"], \ - histo_reflections=pars["histograms"]["reflections"]) + self.central_fits[(ibin1, ibin2)] = FitAliHF( + pars["init_pars"], + histo=pars["histograms"]["data"], + histo_mc=pars["histograms"]["mc"], + histo_reflections=pars["histograms"]["reflections"], + ) self.init_central_fits_from[(ibin1, ibin2)] = pars["init_from"] self.lock_override_init[(ibin1, ibin2)] = pars["lock_override_init"] - #Weights only make sense in HM bin, not in mult. integrated where we initialise. - #If weights are used, the initialised width doesn't make sense anymore + # Weights only make sense in HM bin, not in mult. integrated where we initialise. + # If weights are used, the initialised width doesn't make sense anymore apply_weights_temp = self.pars_factory.apply_weights self.pars_factory.apply_weights = False for ibin1, ibin2, pars in self.pars_factory.yield_fit_pars(): @@ -517,18 +531,18 @@ def initialize_fits(self): pre_fits_bins1.append(ibin1) - self.pre_fits_mc[ibin1] = FitROOTGauss(pars["init_pars"], - histo=pars["histograms"]["mc"], - **pars["pre_fit_mc"]) - self.pre_fits_data[ibin1] = FitAliHF( \ - pars["init_pars"], \ - histo=pars["histograms"]["data"], \ - histo_mc=pars["histograms"]["mc"], \ - histo_reflections=pars["histograms"]["reflections"]) + self.pre_fits_mc[ibin1] = FitROOTGauss( + pars["init_pars"], histo=pars["histograms"]["mc"], **pars["pre_fit_mc"] + ) + self.pre_fits_data[ibin1] = FitAliHF( + pars["init_pars"], + histo=pars["histograms"]["data"], + histo_mc=pars["histograms"]["mc"], + histo_reflections=pars["histograms"]["reflections"], + ) self.pars_factory.apply_weights = apply_weights_temp self.is_initialized_fits = True - def perform_pre_fits(self): """ Perform all pre-fits whose fitted parameters might be used to initialize central fits. @@ -551,7 +565,6 @@ def perform_pre_fits(self): fit.fit() self.done_pre_fits = True - def perform_central_fits(self): """ Perform all central fits and initialize from pre-fits if requested. @@ -570,11 +583,17 @@ def perform_central_fits(self): pre_fit = self.pre_fits_mc[ibin1] else: pre_fit = self.pre_fits_data[ibin1] - if not pre_fit.success and self.lock_override_init[(ibin1, ibin2)] \ - and "sigma" not in self.lock_override_init[(ibin1, ibin2)]: - self.logger.warning("Requested pre-fit on %s not successful but requested for " \ - "central fit in bins (%i, %i). Skip...", - self.init_central_fits_from[(ibin1, ibin2)], ibin1, ibin2) + if ( + not pre_fit.success + and self.lock_override_init[(ibin1, ibin2)] + and "sigma" not in self.lock_override_init[(ibin1, ibin2)] + ): + self.logger.warning( + "Requested pre-fit on %s not successful but requested for central fit in bins (%i, %i). 
Skip...", + self.init_central_fits_from[(ibin1, ibin2)], + ibin1, + ibin2, + ) continue override_init_pars = pre_fit.get_fit_pars() if pre_fit and pre_fit.success else {} @@ -588,7 +607,6 @@ def perform_central_fits(self): self.done_central_fits = True - def get_central_fit(self, ibin1, ibin2): """ Retrieve a central fit based on specified bin numbers @@ -602,7 +620,6 @@ def get_central_fit(self, ibin1, ibin2): return self.central_fits.get((ibin1, ibin2), None) - def print_fits(self): """ Print pre-fits and central fits @@ -628,9 +645,7 @@ def print_fits(self): print(fit) self.logger.info("Print all fits done") - def bkg_fromsidebands(self, folder, n_filemass, fitlim, fbkg, masspeak): - filemass = TFile.Open(n_filemass) bins1_ranges = self.pars_factory.bins1_edges_low.copy() bins1_ranges.append(self.pars_factory.bins1_edges_up[-1]) @@ -641,57 +656,64 @@ def bkg_fromsidebands(self, folder, n_filemass, fitlim, fbkg, masspeak): sig_limit = 0 pt_bin = 0 for ibin1 in range(n_bins1): - - if(fbkg[ibin1] != "kLin" and fbkg[ibin1] != "Pol2" and fbkg[ibin1] != "kExpo"): + if fbkg[ibin1] != "kLin" and fbkg[ibin1] != "Pol2" and fbkg[ibin1] != "kExpo": self.logger.warning("Bkg function not defined. Skip...") - i = i+1 + i = i + 1 continue - hmass = filemass.Get("hmass%s%d_%d_%.2f" % (self.bin1_name, \ - self.pars_factory.bins1_edges_low[ibin1], - self.pars_factory.bins1_edges_up[ibin1], - self.pars_factory.prob_cut_fin[ibin1])) + hmass = filemass.Get( + "hmass%s%d_%d_%.2f" + % ( + self.bin1_name, + self.pars_factory.bins1_edges_low[ibin1], + self.pars_factory.bins1_edges_up[ibin1], + self.pars_factory.prob_cut_fin[ibin1], + ) + ) hmass.Rebin(self.rebin[ibin1]) - if self.pre_fits_mc[i-1].fit_pars["sigma"] is None: + if self.pre_fits_mc[i - 1].fit_pars["sigma"] is None: self.logger.warning("Pre-fit failed. No sigma to initialize the fit. 
Skip...") - i = i+1 + i = i + 1 continue - sig_limit = [masspeak - 3*self.pre_fits_mc[i-1].fit_pars["sigma"], - masspeak + 3*self.pre_fits_mc[i-1].fit_pars["sigma"]] + sig_limit = [ + masspeak - 3 * self.pre_fits_mc[i - 1].fit_pars["sigma"], + masspeak + 3 * self.pre_fits_mc[i - 1].fit_pars["sigma"], + ] - #introducing my bkg function defined only outside the peak region + # introducing my bkg function defined only outside the peak region pt_bin = ibin1 + class FitBkg: def __call__(self, x_var, par): - #excluding signal region from the background fitting function - if (x_var[0] > sig_limit[0] and x_var[0] < sig_limit[1]): + # excluding signal region from the background fitting function + if x_var[0] > sig_limit[0] and x_var[0] < sig_limit[1]: return 0 if fbkg[pt_bin] == "kLin": - return par[0]+x_var[0]*par[1] + return par[0] + x_var[0] * par[1] if fbkg[pt_bin] == "Pol2": - return par[0]+x_var[0]*par[1]+x_var[0]*x_var[0]*par[2] + return par[0] + x_var[0] * par[1] + x_var[0] * x_var[0] * par[2] if fbkg[pt_bin] == "kExpo": - return math.exp(par[0]+x_var[0]*par[1]) + return math.exp(par[0] + x_var[0] * par[1]) return 0 if fbkg[ibin1] == "kLin": bkgFunc = FitBkg() fit_func = TF1("fit_func", bkgFunc, fitlim[0], fitlim[1], 2) - hmass.Fit(fit_func, '', '', fitlim[0], fitlim[1]) + hmass.Fit(fit_func, "", "", fitlim[0], fitlim[1]) pars = fit_func.GetParameters() bkg_func = TF1("fbkg", "pol1", fitlim[0], fitlim[1]) elif fbkg[ibin1] == "Pol2": bkgFunc = FitBkg() fit_func = TF1("fit_func", bkgFunc, fitlim[0], fitlim[1], 3) - hmass.Fit("fit_func", '', '', fitlim[0], fitlim[1]) + hmass.Fit("fit_func", "", "", fitlim[0], fitlim[1]) pars = fit_func.GetParameters() bkg_func = TF1("fbkg", "pol2", fitlim[0], fitlim[1]) elif fbkg[ibin1] == "kExpo": bkgFunc = FitBkg() fit_func = TF1("fit_func", bkgFunc, fitlim[0], fitlim[1], 2) - hmass.Fit(fit_func, '', '', fitlim[0], fitlim[1]) + hmass.Fit(fit_func, "", "", fitlim[0], fitlim[1]) pars = fit_func.GetParameters() bkg_func = TF1("fbkg", "expo", fitlim[0], fitlim[1]) @@ -701,15 +723,15 @@ def __call__(self, x_var, par): hbkg_fromsidebands.SetBinContent(i, bkg) hbkg_fromsidebands.SetBinError(i, bkg_err) - i = i+1 + i = i + 1 - fileoutbkg_fromsidebands = TFile.Open("%s/Background_fromsidebands_%s_%s.root" % \ - (folder, self.case, self.ana_type), "RECREATE") + fileoutbkg_fromsidebands = TFile.Open( + "%s/Background_fromsidebands_%s_%s.root" % (folder, self.case, self.ana_type), "RECREATE" + ) fileoutbkg_fromsidebands.cd() hbkg_fromsidebands.Write() fileoutbkg_fromsidebands.Close() - def initialize_syst(self): """ Initialize all systematic fits required in an MLHEP analysis run. Using MLFitParsFactory @@ -730,16 +752,16 @@ if not pars: self.syst_fits[(ibin1, ibin2)] = None continue - self.syst_fits[(ibin1, ibin2)] = FitSystAliHF( \ - pars["init_pars"], \ - histo=pars["histograms"]["data"], \ - histo_mc=pars["histograms"]["mc"], \ - histo_reflections=pars["histograms"]["reflections"]) + self.syst_fits[(ibin1, ibin2)] = FitSystAliHF( + pars["init_pars"], + histo=pars["histograms"]["data"], + histo_mc=pars["histograms"]["mc"], + histo_reflections=pars["histograms"]["reflections"], + ) self.init_syst_fits_from[(ibin1, ibin2)] = pars["init_from"] self.is_initialized_syst = True - def perform_syst(self, results_dir): """ Perform all systematic fits and initialize from central-fits if requested. 
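The multi-trial fitter configured above varies the background function, fit range, rebinning and sigma treatment around the central fit, and the spread of the trial yields is what ends up quoted as the raw-yield extraction systematic. As a rough illustration of that last condensation step (a minimal sketch; yield_central and trial_yields are hypothetical inputs, not names from this repository):

import math


def multi_trial_spread(yield_central, trial_yields):
    """Relative RMS and relative shift of trial yields w.r.t. the central yield."""
    ok = [y for y in trial_yields if y > 0]  # keep successful trials only
    if not ok:
        raise ValueError("no successful trials")
    mean = sum(ok) / len(ok)
    rms = math.sqrt(sum((y - mean) ** 2 for y in ok) / len(ok))
    return rms / yield_central, (mean - yield_central) / yield_central

For example, multi_trial_spread(1000.0, [980.0, 1010.0, 1050.0]) returns roughly (0.029, 0.013), i.e. a 3% RMS with a 1% upward shift.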
@@ -757,13 +779,11 @@ def perform_syst(self, results_dir): for (ibin1, ibin2), fit in self.syst_fits.items(): if not fit: - self.logger.warning("No systematic fit for bins (%i, %i). Skip...", - ibin1, ibin2) + self.logger.warning("No systematic fit for bins (%i, %i). Skip...", ibin1, ibin2) continue if not self.central_fits[(ibin1, ibin2)].success: - self.logger.warning("Central fit not successful for bins (%i, %i). Skip...", - ibin1, ibin2) + self.logger.warning("Central fit not successful for bins (%i, %i). Skip...", ibin1, ibin2) continue # Prepare to overwrite some ini parameters @@ -782,11 +802,13 @@ def perform_syst(self, results_dir): signif_err = c_double() central_fit.kernel.Significance(self.pars_factory.n_sigma_signal, signif, signif_err) central_fit_pars = central_fit.get_fit_pars() - overwrite_init = {"yield_ref": central_fit.kernel.GetRawYield(), - "mean_ref": central_fit_pars["mean"], - "sigma_ref": central_fit_pars["sigma"], - "chi2_ref": central_fit.kernel.GetReducedChiSquare(), - "signif_ref": signif} + overwrite_init = { + "yield_ref": central_fit.kernel.GetRawYield(), + "mean_ref": central_fit_pars["mean"], + "sigma_ref": central_fit_pars["sigma"], + "chi2_ref": central_fit.kernel.GetReducedChiSquare(), + "signif_ref": signif, + } # Get mean and sigma for fit init pre_fit_pars = pre_fit.get_fit_pars() overwrite_init["mean"] = pre_fit_pars["mean"] @@ -795,23 +817,20 @@ def perform_syst(self, results_dir): fit.override_init_pars(**overwrite_init) # Set the path for intermediate results which are produced by the multi trial fitter - fit.results_path = os.path.join(results_dir, - f"multi_trial_bin1_{ibin1}_bin2_{ibin2}.root") + fit.results_path = os.path.join(results_dir, f"multi_trial_bin1_{ibin1}_bin2_{ibin2}.root") fit.fit() self.done_syst = True - def get_bins2(self): bins2 = [] - for (_, ibin2) in self.central_fits: + for _, ibin2 in self.central_fits: if ibin2 in bins2: continue bins2.append(ibin2) return bins2 - - def draw_fits(self, save_dir, root_dir=None): # pylint: disable=too-many-branches, too-many-statements, too-many-locals + def draw_fits(self, save_dir, root_dir=None): # pylint: disable=too-many-branches, too-many-statements, too-many-locals """ Draw all fits one-by-one Args: @@ -835,18 +854,14 @@ def fill_wrapper(histo, ibin, central, err=None): histo.SetBinError(ibin, err) # Summarize in mult histograms in pT bins - yieldshistos = {ibin2: TH1F("hyields%d" % (ibin2), "", \ - n_bins1, array("d", bins1_ranges)) for ibin2 in bins2} - backgroundhistos = {ibin2: TH1F("hbackground%d" % (ibin2), "", \ - n_bins1, array("d", bins1_ranges)) for ibin2 in bins2} - means_histos = {ibin2:TH1F("hmeans%d" % (ibin2), "", \ - n_bins1, array("d", bins1_ranges)) for ibin2 in bins2} - sigmas_histos = {ibin2: TH1F("hsigmas%d" % (ibin2), "", \ - n_bins1, array("d", bins1_ranges)) for ibin2 in bins2} - signifs_histos = {ibin2: TH1F("hsignifs%d" % (ibin2), "", \ - n_bins1, array("d", bins1_ranges)) for ibin2 in bins2} - refls_histos = {ibin2: TH1F("hrefl%d" % (ibin2), "", \ - n_bins1, array("d", bins1_ranges)) for ibin2 in bins2} + yieldshistos = {ibin2: TH1F("hyields%d" % (ibin2), "", n_bins1, array("d", bins1_ranges)) for ibin2 in bins2} + backgroundhistos = { + ibin2: TH1F("hbackground%d" % (ibin2), "", n_bins1, array("d", bins1_ranges)) for ibin2 in bins2 + } + means_histos = {ibin2: TH1F("hmeans%d" % (ibin2), "", n_bins1, array("d", bins1_ranges)) for ibin2 in bins2} + sigmas_histos = {ibin2: TH1F("hsigmas%d" % (ibin2), "", n_bins1, array("d", bins1_ranges)) for ibin2 in 
bins2} + signifs_histos = {ibin2: TH1F("hsignifs%d" % (ibin2), "", n_bins1, array("d", bins1_ranges)) for ibin2 in bins2} + refls_histos = {ibin2: TH1F("hrefl%d" % (ibin2), "", n_bins1, array("d", bins1_ranges)) for ibin2 in bins2} have_summary_pt_bins = [] means_init_mc_histos = TH1F("hmeans_init_mc", "", n_bins1, array("d", bins1_ranges)) sigmas_init_mc_histos = TH1F("hsigmas_init_mc", "", n_bins1, array("d", bins1_ranges)) @@ -867,8 +882,7 @@ def fill_wrapper(histo, ibin, central, err=None): canvas_init_mc = TCanvas("canvas_init_mc", "MC", 1000, canvy) canvas_init_data = TCanvas("canvas_init_data", "Data", 1000, canvy) - canvas_data = {ibin2: TCanvas("canvas_data%d" % (ibin2), "Data", 1000, canvy) \ - for ibin2 in bins2} + canvas_data = {ibin2: TCanvas("canvas_data%d" % (ibin2), "Data", 1000, canvy) for ibin2 in bins2} canvas_init_mc.Divide(nx, ny) canvas_init_data.Divide(nx, ny) @@ -877,18 +891,21 @@ def fill_wrapper(histo, ibin, central, err=None): # Need to cache some object for which the canvas is only written after the loop... for (ibin1, ibin2), fit in self.central_fits.items(): - # Some variables set for drawing if self.pars_factory.mltype == "MultiClassification": - title = f"{self.pars_factory.bins1_edges_low[ibin1]:.1f} < #it{{p}}_{{T}} < " \ - f"{self.pars_factory.bins1_edges_up[ibin1]:.1f}" \ - f" (prob0 <= {self.pars_factory.prob_cut_fin[ibin1][0]:.2f} & " \ - f"prob1 >= {self.pars_factory.prob_cut_fin[ibin1][1]:.2f} & " \ - f"prob2 >= {self.pars_factory.prob_cut_fin[ibin1][2]:.2f})" + title = ( + f"{self.pars_factory.bins1_edges_low[ibin1]:.1f} < #it{{p}}_{{T}} < " + f"{self.pars_factory.bins1_edges_up[ibin1]:.1f}" + f" (prob0 <= {self.pars_factory.prob_cut_fin[ibin1][0]:.2f} & " + f"prob1 >= {self.pars_factory.prob_cut_fin[ibin1][1]:.2f} & " + f"prob2 >= {self.pars_factory.prob_cut_fin[ibin1][2]:.2f})" + ) else: - title = f"{self.pars_factory.bins1_edges_low[ibin1]:.1f} < #it{{p}}_{{T}} < " \ - f"{self.pars_factory.bins1_edges_up[ibin1]:.1f} " \ - f"(prob > {self.pars_factory.prob_cut_fin[ibin1]:.2f})" + title = ( + f"{self.pars_factory.bins1_edges_low[ibin1]:.1f} < #it{{p}}_{{T}} < " + f"{self.pars_factory.bins1_edges_up[ibin1]:.1f} " + f"(prob > {self.pars_factory.prob_cut_fin[ibin1]:.2f})" + ) x_axis_label = "#it{M}_{inv} (GeV/#it{c}^{2})" n_sigma_signal = self.pars_factory.n_sigma_signal @@ -899,30 +916,31 @@ def fill_wrapper(histo, ibin, central, err=None): histo = fit.histo # Central fits - y_axis_label = \ - f"Entries/({histo.GetBinWidth(1) * 1000:.0f} MeV/#it{{c}}^{{2}})" + y_axis_label = f"Entries/({histo.GetBinWidth(1) * 1000:.0f} MeV/#it{{c}}^{{2}})" canvas = TCanvas("fit_canvas", suffix_write, 700, 700) - fit.draw(canvas, sigma_signal=n_sigma_signal, x_axis_label=x_axis_label, - y_axis_label=y_axis_label, title=title) + fit.draw( + canvas, sigma_signal=n_sigma_signal, x_axis_label=x_axis_label, y_axis_label=y_axis_label, title=title + ) if self.pars_factory.apply_weights is False: - canvas.SaveAs(make_file_path(save_dir, "fittedplot", "eps", None, - suffix_write)) + canvas.SaveAs(make_file_path(save_dir, "fittedplot", "eps", None, suffix_write)) else: - canvas.SaveAs(make_file_path(save_dir, "fittedplotweights", "eps", None, - suffix_write)) + canvas.SaveAs(make_file_path(save_dir, "fittedplotweights", "eps", None, suffix_write)) canvas.Close() - fit.draw(canvas_data[ibin2].cd(ibin1+1), sigma_signal=n_sigma_signal, - x_axis_label=x_axis_label, y_axis_label=y_axis_label, title=title) + fit.draw( + canvas_data[ibin2].cd(ibin1 + 1), + sigma_signal=n_sigma_signal, 
+ x_axis_label=x_axis_label, + y_axis_label=y_axis_label, + title=title, + ) if fit.success: - fill_wrapper(yieldshistos[ibin2], ibin1 + 1, - kernel.GetRawYield(), kernel.GetRawYieldError()) - fill_wrapper(means_histos[ibin2], ibin1 + 1, - kernel.GetMean(), kernel.GetMeanUncertainty()) - fill_wrapper(sigmas_histos[ibin2], ibin1 + 1, - kernel.GetSigma(), kernel.GetSigmaUncertainty()) - fill_wrapper(refls_histos[ibin2], ibin1 + 1, - kernel.GetReflOverSig(), kernel.GetReflOverSigUncertainty()) + fill_wrapper(yieldshistos[ibin2], ibin1 + 1, kernel.GetRawYield(), kernel.GetRawYieldError()) + fill_wrapper(means_histos[ibin2], ibin1 + 1, kernel.GetMean(), kernel.GetMeanUncertainty()) + fill_wrapper(sigmas_histos[ibin2], ibin1 + 1, kernel.GetSigma(), kernel.GetSigmaUncertainty()) + fill_wrapper( + refls_histos[ibin2], ibin1 + 1, kernel.GetReflOverSig(), kernel.GetReflOverSigUncertainty() + ) bkg = c_double() bkg_err = c_double() @@ -935,26 +953,31 @@ def fill_wrapper(histo, ibin, central, err=None): fill_wrapper(signifs_histos[ibin2], ibin1 + 1, signif, signif_err) # Residual plot - c_res = TCanvas('cRes', 'The Fit Canvas', 800, 800) + c_res = TCanvas("cRes", "The Fit Canvas", 800, 800) c_res.cd() h_pulls = histo.Clone(f"{histo.GetName()}_pull") h_residual_trend = histo.Clone(f"{histo.GetName()}_residual_trend") h_pulls_trend = histo.Clone(f"{histo.GetName()}_pulls_trend") if self.pars_factory.include_reflections: - _ = kernel.GetOverBackgroundPlusReflResidualsAndPulls( \ - h_pulls, h_residual_trend, h_pulls_trend, \ - self.pars_factory.fit_range_low[ibin1], \ - self.pars_factory.fit_range_up[ibin1]) + _ = kernel.GetOverBackgroundPlusReflResidualsAndPulls( + h_pulls, + h_residual_trend, + h_pulls_trend, + self.pars_factory.fit_range_low[ibin1], + self.pars_factory.fit_range_up[ibin1], + ) else: - _ = kernel.GetOverBackgroundResidualsAndPulls( \ - h_pulls, h_residual_trend, h_pulls_trend, \ - self.pars_factory.fit_range_low[ibin1], \ - self.pars_factory.fit_range_up[ibin1]) + _ = kernel.GetOverBackgroundResidualsAndPulls( + h_pulls, + h_residual_trend, + h_pulls_trend, + self.pars_factory.fit_range_low[ibin1], + self.pars_factory.fit_range_up[ibin1], + ) h_residual_trend.Draw() c_res.SaveAs(make_file_path(save_dir, "residual", "eps", None, suffix_write)) c_res.Close() - # Summary plots to be done only once per pT bin if ibin1 in have_summary_pt_bins: continue @@ -967,18 +990,15 @@ def fill_wrapper(histo, ibin, central, err=None): pre_fit_mc = self.pre_fits_mc[ibin1] kernel = pre_fit_mc.kernel histo = pre_fit_mc.histo - y_axis_label = \ - f"Entries/({histo.GetBinWidth(1) * 1000:.0f} MeV/#it{{c}}^{{2}})" + y_axis_label = f"Entries/({histo.GetBinWidth(1) * 1000:.0f} MeV/#it{{c}}^{{2}})" canvas = TCanvas("fit_canvas_mc_init", suffix_write, 700, 700) - pre_fit_mc.draw(canvas, x_axis_label=x_axis_label, y_axis_label=y_axis_label, - title=title) + pre_fit_mc.draw(canvas, x_axis_label=x_axis_label, y_axis_label=y_axis_label, title=title) - canvas.SaveAs(make_file_path(save_dir, "fittedplot_integrated_mc", "eps", None, - suffix_write)) + canvas.SaveAs(make_file_path(save_dir, "fittedplot_integrated_mc", "eps", None, suffix_write)) canvas.Close() - pre_fit_mc.draw(canvas_init_mc.cd(ibin1+1), x_axis_label=x_axis_label, - y_axis_label=y_axis_label, title=title) - + pre_fit_mc.draw( + canvas_init_mc.cd(ibin1 + 1), x_axis_label=x_axis_label, y_axis_label=y_axis_label, title=title + ) if pre_fit_mc.success: # Only fill these summary plots in case of success @@ -987,24 +1007,25 @@ def fill_wrapper(histo, ibin, 
central, err=None): sigmas_init_mc_histos.SetBinContent(ibin1 + 1, kernel.GetParameter(2)) sigmas_init_mc_histos.SetBinError(ibin1 + 1, kernel.GetParError(2)) - pre_fit_data = self.pre_fits_data[ibin1] kernel = pre_fit_data.kernel histo = pre_fit_data.histo - # Pre-fit data - y_axis_label = \ - f"Entries/({histo.GetBinWidth(1) * 1000:.0f} MeV/#it{{c}}^{{2}})" + y_axis_label = f"Entries/({histo.GetBinWidth(1) * 1000:.0f} MeV/#it{{c}}^{{2}})" canvas = TCanvas("fit_canvas_data_init", suffix_write, 700, 700) - pre_fit_data.draw(canvas, sigma_signal=n_sigma_signal, x_axis_label=x_axis_label, - y_axis_label=y_axis_label, title=title) - canvas.SaveAs(make_file_path(save_dir, "fittedplot_integrated", "eps", None, - suffix_write)) + pre_fit_data.draw( + canvas, sigma_signal=n_sigma_signal, x_axis_label=x_axis_label, y_axis_label=y_axis_label, title=title + ) + canvas.SaveAs(make_file_path(save_dir, "fittedplot_integrated", "eps", None, suffix_write)) canvas.Close() - pre_fit_data.draw(canvas_init_data.cd(ibin1+1), sigma_signal=n_sigma_signal, - x_axis_label=x_axis_label, y_axis_label=y_axis_label, - title=title) + pre_fit_data.draw( + canvas_init_data.cd(ibin1 + 1), + sigma_signal=n_sigma_signal, + x_axis_label=x_axis_label, + y_axis_label=y_axis_label, + title=title, + ) if pre_fit_data.success: # Only fill these summary plots in case of success @@ -1013,15 +1034,13 @@ def fill_wrapper(histo, ibin, central, err=None): sigmas_init_data_histos.SetBinContent(ibin1 + 1, kernel.GetSigma()) sigmas_init_data_histos.SetBinError(ibin1 + 1, kernel.GetSigmaUncertainty()) - canvas_init_mc.SaveAs(make_file_path(save_dir, "canvas_InitMC", "eps")) canvas_init_mc.Close() canvas_init_data.SaveAs(make_file_path(save_dir, "canvas_InitData", "eps")) canvas_init_data.Close() for ibin2 in bins2: suffix2 = f"ibin2_{ibin2}" - canvas_data[ibin2].SaveAs(make_file_path(save_dir, "canvas_FinalData", "eps", None, - suffix2)) + canvas_data[ibin2].SaveAs(make_file_path(save_dir, "canvas_FinalData", "eps", None, suffix2)) if root_dir: root_dir.cd() yieldshistos[ibin2].Write() @@ -1030,59 +1049,102 @@ def fill_wrapper(histo, ibin, central, err=None): sigmas_histos[ibin2].Write() signifs_histos[ibin2].Write() refls_histos[ibin2].Write() - #canvas_data[ibin2].Close() - + # canvas_data[ibin2].Close() latex_hadron_name = self.ana_config["latexnamehadron"] if self.pars_factory.bin2_name is not None: latex_bin2_var = self.ana_config["latexbin2var"] latex_hadron_name = self.ana_config["latexnamehadron"] # Plot some summary histograms - leg_strings = [f"{self.pars_factory.bins2_edges_low[ibin2]} #leq {latex_bin2_var} < " \ - f"{self.pars_factory.bins2_edges_up[ibin2]}" for ibin2 in bins2] + leg_strings = [ + f"{self.pars_factory.bins2_edges_low[ibin2]} #leq {latex_bin2_var} < " + f"{self.pars_factory.bins2_edges_up[ibin2]}" + for ibin2 in bins2 + ] else: leg_strings = [""] save_name = make_file_path(save_dir, "Yields", "eps", None, [self.case, self.ana_type]) # Yields summary plot - plot_histograms([yieldshistos[ibin2] for ibin2 in bins2], True, True, leg_strings, - "uncorrected yields", "#it{p}_{T} (GeV/#it{c})", - f"Uncorrected yields {latex_hadron_name} {self.ana_type}", "mult. / int.", - save_name) + plot_histograms( + [yieldshistos[ibin2] for ibin2 in bins2], + True, + True, + leg_strings, + "uncorrected yields", + "#it{p}_{T} (GeV/#it{c})", + f"Uncorrected yields {latex_hadron_name} {self.ana_type}", + "mult. 
/ int.", + save_name, + ) save_name = make_file_path(save_dir, "Background", "eps", None, [self.case, self.ana_type]) # Background summary plot - plot_histograms([backgroundhistos[ibin2] for ibin2 in bins2], True, True, leg_strings, - "background", "#it{p}_{T} (GeV/#it{c})", - f"Background {latex_hadron_name} {self.ana_type}", "mult. / int.", - save_name) + plot_histograms( + [backgroundhistos[ibin2] for ibin2 in bins2], + True, + True, + leg_strings, + "background", + "#it{p}_{T} (GeV/#it{c})", + f"Background {latex_hadron_name} {self.ana_type}", + "mult. / int.", + save_name, + ) save_name = make_file_path(save_dir, "Means", "eps", None, [self.case, self.ana_type]) # Means summary plot - plot_histograms([means_histos[ibin2] for ibin2 in bins2], False, True, leg_strings, "Means", - "#it{p}_{T} (GeV/#it{c})", - "#mu_{fit} " + f"{latex_hadron_name} {self.ana_type}", "mult. / int.", - save_name) + plot_histograms( + [means_histos[ibin2] for ibin2 in bins2], + False, + True, + leg_strings, + "Means", + "#it{p}_{T} (GeV/#it{c})", + "#mu_{fit} " + f"{latex_hadron_name} {self.ana_type}", + "mult. / int.", + save_name, + ) save_name = make_file_path(save_dir, "Sigmas", "eps", None, [self.case, self.ana_type]) - #Sigmas summary plot - plot_histograms([sigmas_histos[ibin2] for ibin2 in bins2], False, True, leg_strings, - "Sigmas", "#it{p}_{T} (GeV/#it{c})", - "#sigma_{fit} " + f"{latex_hadron_name} {self.ana_type}", "mult. / int.", - save_name) + # Sigmas summary plot + plot_histograms( + [sigmas_histos[ibin2] for ibin2 in bins2], + False, + True, + leg_strings, + "Sigmas", + "#it{p}_{T} (GeV/#it{c})", + "#sigma_{fit} " + f"{latex_hadron_name} {self.ana_type}", + "mult. / int.", + save_name, + ) # Plot the initialized means and sigma for MC and data - save_name = make_file_path(save_dir, "Means_mult_int", "eps", None, - [self.case, self.ana_type]) - plot_histograms([means_init_mc_histos, means_init_data_histos], False, False, - ["MC", "data"], "Means of int. mult.", "#it{p}_{T} (GeV/#it{c})", - "#mu_{fit} " + f"{latex_hadron_name} {self.ana_type}", "", save_name) - - save_name = make_file_path(save_dir, "Sigmas_mult_int", "eps", None, - [self.case, self.ana_type]) - plot_histograms([sigmas_init_mc_histos, sigmas_init_data_histos], False, False, - ["MC", "data"], "Sigmas of int. mult.", "#it{p}_{T} (GeV/#it{c})", - "#sigma_{fit} " + f"{latex_hadron_name} {self.ana_type}", "", save_name) - - - def draw_syst(self, save_dir, results_dir, root_dir=None): # pylint: disable=too-many-branches, too-many-statements, too-many-locals + save_name = make_file_path(save_dir, "Means_mult_int", "eps", None, [self.case, self.ana_type]) + plot_histograms( + [means_init_mc_histos, means_init_data_histos], + False, + False, + ["MC", "data"], + "Means of int. mult.", + "#it{p}_{T} (GeV/#it{c})", + "#mu_{fit} " + f"{latex_hadron_name} {self.ana_type}", + "", + save_name, + ) + + save_name = make_file_path(save_dir, "Sigmas_mult_int", "eps", None, [self.case, self.ana_type]) + plot_histograms( + [sigmas_init_mc_histos, sigmas_init_data_histos], + False, + False, + ["MC", "data"], + "Sigmas of int. 
mult.", + "#it{p}_{T} (GeV/#it{c})", + "#sigma_{fit} " + f"{latex_hadron_name} {self.ana_type}", + "", + save_name, + ) + + def draw_syst(self, save_dir, results_dir, root_dir=None): # pylint: disable=too-many-branches, too-many-statements, too-many-locals """Draw all fits one-by-one Args: @@ -1102,36 +1164,36 @@ def draw_syst(self, save_dir, results_dir, root_dir=None): # pylint: disable=too for (ibin1, ibin2), fit in self.syst_fits.items(): if not fit: - self.logger.warning("No systematic fit for bins (%i, %i). Skip...", - ibin1, ibin2) + self.logger.warning("No systematic fit for bins (%i, %i). Skip...", ibin1, ibin2) continue # Some variables set for drawing if self.pars_factory.mltype == "MultiClassification": - title = f"{self.pars_factory.bins1_edges_low[ibin1]:.1f} < #it{{p}}_{{T}} < " \ - f"{self.pars_factory.bins1_edges_up[ibin1]:.1f}" \ - f"(prob0 <= {self.pars_factory.prob_cut_fin[ibin1][0]:.2f} &" \ - f"prob1 >= {self.pars_factory.prob_cut_fin[ibin1][1]:.2f})" + title = ( + f"{self.pars_factory.bins1_edges_low[ibin1]:.1f} < #it{{p}}_{{T}} < " + f"{self.pars_factory.bins1_edges_up[ibin1]:.1f}" + f"(prob0 <= {self.pars_factory.prob_cut_fin[ibin1][0]:.2f} &" + f"prob1 >= {self.pars_factory.prob_cut_fin[ibin1][1]:.2f})" + ) else: - title = f"{self.pars_factory.bins1_edges_low[ibin1]:.1f} < #it{{p}}_{{T}} < " \ - f"{self.pars_factory.bins1_edges_up[ibin1]:.1f}" \ - f"(prob > {self.pars_factory.prob_cut_fin[ibin1]:.2f})" + title = ( + f"{self.pars_factory.bins1_edges_low[ibin1]:.1f} < #it{{p}}_{{T}} < " + f"{self.pars_factory.bins1_edges_up[ibin1]:.1f}" + f"(prob > {self.pars_factory.prob_cut_fin[ibin1]:.2f})" + ) suffix_write = self.pars_factory.make_suffix(ibin1, ibin2) - fit.results_path = os.path.join(results_dir, - f"multi_trial_bin1_{ibin1}_bin2_{ibin2}.root") + fit.results_path = os.path.join(results_dir, f"multi_trial_bin1_{ibin1}_bin2_{ibin2}.root") # Central fits canvas = TCanvas("fit_canvas", suffix_write, 1400, 800) fit.draw(canvas, title=title) if self.pars_factory.apply_weights is False: - canvas.SaveAs(make_file_path(save_dir, "multi_trial", "eps", None, - suffix_write)) + canvas.SaveAs(make_file_path(save_dir, "multi_trial", "eps", None, suffix_write)) else: - canvas.SaveAs(make_file_path(save_dir, "multi_trial_weights", "eps", None, - suffix_write)) + canvas.SaveAs(make_file_path(save_dir, "multi_trial_weights", "eps", None, suffix_write)) if root_dir: root_dir.cd() @@ -1139,7 +1201,6 @@ def draw_syst(self, save_dir, results_dir, root_dir=None): # pylint: disable=too canvas.Close() - @staticmethod def save_all_(fits, save_dir): """ @@ -1154,7 +1215,6 @@ def save_all_(fits, save_dir): annotations = {"key": key} save_fit(fit, save_dir_fit, annotations) - def save_fits(self, top_save_dir): """ Write all fits there are @@ -1167,7 +1227,6 @@ def save_fits(self, top_save_dir): self.save_all_(self.pre_fits_data, join(top_save_dir, "pre_fits_data")) self.save_all_(self.central_fits, join(top_save_dir, "central_fits")) - @staticmethod def load_all_(fits, save_dir): """ @@ -1192,7 +1251,6 @@ def load_all_(fits, save_dir): fits[key] = fit return True - def load_fits(self, top_save_dir): """ Read back all fits written to disk @@ -1207,9 +1265,11 @@ def load_fits(self, top_save_dir): self.pre_fits_mc = {} self.pre_fits_data = {} self.central_fits = {} - success = self.load_all_(self.pre_fits_mc, join(top_save_dir, "pre_fits_mc")) and \ - self.load_all_(self.pre_fits_data, join(top_save_dir, "pre_fits_data")) and \ - self.load_all_(self.central_fits, join(top_save_dir, 
"central_fits")) + success = ( + self.load_all_(self.pre_fits_mc, join(top_save_dir, "pre_fits_mc")) + and self.load_all_(self.pre_fits_data, join(top_save_dir, "pre_fits_data")) + and self.load_all_(self.central_fits, join(top_save_dir, "central_fits")) + ) # Flags self.is_initialized_fits = True self.done_pre_fits = True diff --git a/machine_learning_hep/fitting/roofitter.py b/machine_learning_hep/fitting/roofitter.py index 67dabc87be..54def36fe9 100644 --- a/machine_learning_hep/fitting/roofitter.py +++ b/machine_learning_hep/fitting/roofitter.py @@ -13,8 +13,10 @@ ############################################################################# from math import sqrt + import ROOT -from ROOT import RooFit, RooArgSet, RooRealVar, RooAddPdf, RooArgList, TPaveText +from ROOT import RooAddPdf, RooArgList, RooArgSet, RooFit, RooRealVar, TPaveText + # pylint: disable=too-few-public-methods, too-many-statements # (temporary until we add more functionality) @@ -25,18 +27,18 @@ def __init__(self): ROOT.RooMsgService.instance().setGlobalKillBelow(ROOT.RooFit.WARNING) ROOT.RooMsgService.instance().setGlobalKillBelow(ROOT.RooFit.ERROR) - def fit_mass_new(self, hist, pdfnames, fit_spec, level, roows = None, plot = False): + def fit_mass_new(self, hist, pdfnames, fit_spec, level, roows=None, plot=False): if hist.GetEntries() == 0: - raise UserWarning('Cannot fit histogram with no entries') + raise UserWarning("Cannot fit histogram with no entries") ws = roows or ROOT.RooWorkspace("ws") - var_m = fit_spec.get('var', 'm') + var_m = fit_spec.get("var", "m") n_signal = RooRealVar("n_signal", "Number of signal events", 100, 0, 100000000) n_background = RooRealVar("n_background", "Number of background events", 100, 0, 100000000) - for comp, spec in fit_spec.get('components', {}).items(): - fn = ws.factory(spec['fn']) - if comp == 'model': + for comp, spec in fit_spec.get("components", {}).items(): + fn = ws.factory(spec["fn"]) + if comp == "model": model = fn m = ws.var(var_m) @@ -47,16 +49,15 @@ def fit_mass_new(self, hist, pdfnames, fit_spec, level, roows = None, plot = Fal background_pdf = ws.pdf(pdfnames["pdf_bkg"]) if not background_pdf: raise ValueError("bkg pdf not found") - model = RooAddPdf("model", - "Total model", - RooArgList(signal_pdf, background_pdf), - RooArgList(n_signal, n_background)) + model = RooAddPdf( + "model", "Total model", RooArgList(signal_pdf, background_pdf), RooArgList(n_signal, n_background) + ) # if range_m := fit_spec.get('range'): # m.setRange(range_m[0], range_m[1]) dh = ROOT.RooDataHist("dh", "dh", [m], Import=hist) - if range_m := fit_spec.get('range'): - m.setRange('fit', *range_m) + if range_m := fit_spec.get("range"): + m.setRange("fit", *range_m) # print(f'using fit range: {range_m}, var range: {m.getRange("fit")}') res = model.fitTo(dh, Range=(range_m[0], range_m[1]), Save=True, PrintLevel=-1, Strategy=1) # model.Print('v') @@ -71,21 +72,24 @@ def fit_mass_new(self, hist, pdfnames, fit_spec, level, roows = None, plot = Fal frame = m.frame() dh.plotOn(frame, ROOT.RooFit.Name("data")) model.plotOn(frame) - model.paramOn(frame, Layout=(.65,1.,.9)) + model.paramOn(frame, Layout=(0.65, 1.0, 0.9)) frame.getAttText().SetTextFont(42) - frame.getAttText().SetTextSize(.001) + frame.getAttText().SetTextSize(0.001) frame.SetAxisRange(range_m[0], range_m[1], "X") - frame.SetAxisRange(0., frame.GetMaximum()+(frame.GetMaximum()*0.3), "Y") + frame.SetAxisRange(0.0, frame.GetMaximum() + (frame.GetMaximum() * 0.3), "Y") try: for pdf in model.pdfList(): pdf_name = pdf.GetName() - 
model.plotOn(frame, ROOT.RooFit.Components(pdf), - ROOT.RooFit.Name((f"pdf_{pdf_name}")), - ROOT.RooFit.LineStyle(ROOT.ELineStyle.kDashed), - ROOT.RooFit.LineColor(ROOT.kViolet), - ROOT.RooFit.LineWidth(1)) - #model.SetName("bkg") + model.plotOn( + frame, + ROOT.RooFit.Components(pdf), + ROOT.RooFit.Name((f"pdf_{pdf_name}")), + ROOT.RooFit.LineStyle(ROOT.ELineStyle.kDashed), + ROOT.RooFit.LineColor(ROOT.kViolet), + ROOT.RooFit.LineWidth(1), + ) + # model.SetName("bkg") model.plotOn(frame, ROOT.RooFit.Name("model")) # pylint: disable=bare-except except: @@ -108,23 +112,23 @@ def fit_mass_new(self, hist, pdfnames, fit_spec, level, roows = None, plot = Fal signal_pdf_ext.plotOn( residual_frame, ROOT.RooFit.LineColor(ROOT.kBlue), - ROOT.RooFit.Normalization(1.0, ROOT.RooAbsReal.RelativeExpected)) + ROOT.RooFit.Normalization(1.0, ROOT.RooAbsReal.RelativeExpected), + ) residual_frame.SetAxisRange(range_m[0], range_m[1], "X") residual_frame.SetYTitle("Residuals") return (res, ws, frame, residual_frame) - - def fit_mass(self, hist, fit_spec, plot = False): + def fit_mass(self, hist, fit_spec, plot=False): if hist.GetEntries() == 0: - raise UserWarning('Cannot fit histogram with no entries') + raise UserWarning("Cannot fit histogram with no entries") ws = ROOT.RooWorkspace("ws") - for comp, spec in fit_spec.get('components', {}).items(): - ws.factory(spec['fn']) - if comp == 'sum': + for comp, spec in fit_spec.get("components", {}).items(): + ws.factory(spec["fn"]) + if comp == "sum": model = ws.pdf(comp) - m = ws.var('m') + m = ws.var("m") # m.setRange('full', 0., 3.) dh = ROOT.RooDataHist("dh", "dh", [m], Import=hist) # model = ws.pdf('sum') @@ -132,18 +136,16 @@ def fit_mass(self, hist, fit_spec, plot = False): res = model.fitTo(dh, Save=True, PrintLevel=-1) frame = m.frame() if plot else None if plot: - dh.plotOn(frame) #, ROOT.RooFit.Range(0., 3.)) + dh.plotOn(frame) # , ROOT.RooFit.Range(0., 3.)) model.plotOn(frame) model.paramOn(frame) - for comp in fit_spec.get('components', {}): - if comp != 'sum': - model.plotOn(frame, ROOT.RooFit.Components(comp), - ROOT.RooFit.LineStyle(ROOT.ELineStyle.kDashed)) + for comp in fit_spec.get("components", {}): + if comp != "sum": + model.plotOn(frame, ROOT.RooFit.Components(comp), ROOT.RooFit.LineStyle(ROOT.ELineStyle.kDashed)) return (res, ws, frame) def calc_signif(roows, res, pdfnames, param_names, mean_sgn, sigma_sgn): - f_sig = roows.pdf(pdfnames["pdf_sig"]) n_signal = res.floatParsFinal().find("n_signal").getVal() sigma_n_signal = res.floatParsFinal().find("n_signal").getError() @@ -158,9 +160,7 @@ def calc_signif(roows, res, pdfnames, param_names, mean_sgn, sigma_sgn): sigma_n_bkg = res.floatParsFinal().find("n_background").getError() massvar = roows.var(param_names["mass"]) - massvar.setRange("signal", - mean_sgn.getVal() - 3 * sigma_sgn.getVal(), - mean_sgn.getVal() + 3 * sigma_sgn.getVal()) + massvar.setRange("signal", mean_sgn.getVal() - 3 * sigma_sgn.getVal(), mean_sgn.getVal() + 3 * sigma_sgn.getVal()) massvar_set = RooArgSet(massvar) norm_set = RooFit.NormSet(massvar_set) @@ -177,28 +177,33 @@ def calc_signif(roows, res, pdfnames, param_names, mean_sgn, sigma_sgn): sigma_signal_integral = signal_integral.getPropagatedError(res) sigma_bkg_integral = bkg_integral.getPropagatedError(res) - sigma_n_signal_signal = sqrt((signal_integral.getVal() * sigma_n_signal) ** 2 + - (n_signal * sigma_signal_integral) ** 2) - sigma_n_bkg_signal = sqrt((bkg_integral.getVal() * sigma_n_bkg) ** 2 + - (n_bkg * sigma_bkg_integral) ** 2) + sigma_n_signal_signal = 
sqrt( + (signal_integral.getVal() * sigma_n_signal) ** 2 + (n_signal * sigma_signal_integral) ** 2 + ) + sigma_n_bkg_signal = sqrt((bkg_integral.getVal() * sigma_n_bkg) ** 2 + (n_bkg * sigma_bkg_integral) ** 2) - dS_dS = (1 / sqrt(n_signal_signal + n_bkg_signal) - - (n_signal_signal / (2 * (n_signal_signal + n_bkg_signal)**(3/2)))) - dS_dB = -n_signal_signal / (2 * (n_signal_signal + n_bkg_signal)**(3/2)) - significance_err = sqrt( - (dS_dS * sigma_n_signal_signal) ** 2 + - (dS_dB * sigma_n_bkg_signal) ** 2) + dS_dS = 1 / sqrt(n_signal_signal + n_bkg_signal) - ( + n_signal_signal / (2 * (n_signal_signal + n_bkg_signal) ** (3 / 2)) + ) + dS_dB = -n_signal_signal / (2 * (n_signal_signal + n_bkg_signal) ** (3 / 2)) + significance_err = sqrt((dS_dS * sigma_n_signal_signal) ** 2 + (dS_dB * sigma_n_bkg_signal) ** 2) - #Signal to bkg ratio + # Signal to bkg ratio s_over_b = n_signal_signal / n_bkg_signal - s_over_b_err = ( - s_over_b * sqrt((sigma_n_signal_signal / n_signal_signal) ** 2 + - (sigma_n_bkg_signal / n_bkg_signal) ** 2 )) - - return (n_signal_signal, sigma_n_signal_signal, - n_bkg_signal, sigma_n_bkg_signal, - significance, significance_err, - s_over_b, s_over_b_err) + s_over_b_err = s_over_b * sqrt( + (sigma_n_signal_signal / n_signal_signal) ** 2 + (sigma_n_bkg_signal / n_bkg_signal) ** 2 + ) + + return ( + n_signal_signal, + sigma_n_signal_signal, + n_bkg_signal, + sigma_n_bkg_signal, + significance, + significance_err, + s_over_b, + s_over_b_err, + ) def create_text_info(x_1, y_1, x_2, y_2): @@ -213,6 +218,7 @@ def create_text_info(x_1, y_1, x_2, y_2): return text_info + def add_text_info_fit(text_info, frame, roows, param_names): chi2 = frame.chiSquare() mean_sgn = roows.var(param_names["gauss_mean"]) @@ -229,7 +235,6 @@ def add_text_info_fit(text_info, frame, roows, param_names): def add_text_info_perf(text_info, sig, sig_err, bkg, bkg_err, s_over_b, s_over_b_err, signif, signif_err): - text_info.AddText(f"S(3#sigma) = {sig:.0f} #pm {sig_err:.0f}") text_info.AddText(f"B(3#sigma) = {bkg:.0f} #pm {bkg_err:.0f}") text_info.AddText(f"S/B(3#sigma) = {s_over_b:.3f} #pm {s_over_b_err:.3f}") diff --git a/machine_learning_hep/fitting/simple_fit.py b/machine_learning_hep/fitting/simple_fit.py index e55ebbe0ae..2c009bd77b 100644 --- a/machine_learning_hep/fitting/simple_fit.py +++ b/machine_learning_hep/fitting/simple_fit.py @@ -16,16 +16,16 @@ Script only used for fitting """ -from os.path import exists, join -from os import makedirs import argparse +from os import makedirs +from os.path import exists, join -from ROOT import TFile, TCanvas # pylint: disable=import-error, no-name-in-module +from ROOT import TCanvas, TFile # pylint: disable=import-error, no-name-in-module -from machine_learning_hep.logger import configure_logger #, get_logger -from machine_learning_hep.io import parse_yaml from machine_learning_hep.fitting.fitters import FitAliHF, FitROOTGauss from machine_learning_hep.fitting.utils import save_fit +from machine_learning_hep.io import parse_yaml +from machine_learning_hep.logger import configure_logger # , get_logger ############################################################################# # # @@ -59,12 +59,13 @@ # # ############################################################################# + def draw(fitter, save_name, **kwargs): """Draw helper function - This can safely be ignored in view of understanding this script - and it doesn't do anything but drawing a fit. It won't change - any number. 
+ This can safely be ignored in view of understanding this script + and it doesn't do anything but drawing a fit. It won't change + any number. """ c = TCanvas("canvas", "", 500, 500) try: @@ -72,7 +73,7 @@ def draw(fitter, save_name, **kwargs): # NOTE The broad-except is only used to make this script running under # any circumstances and ignore any reason for which a fit could not # be drawn. - except Exception as e: # pylint: disable=broad-except + except Exception as e: # pylint: disable=broad-except print(f"Could not draw fit") print(fitter) print(e) @@ -103,7 +104,6 @@ def do_simple_fit(database, type_ana, period_number=-1, output_dir="simple_fit") sig_func_map = {"kGaus": 0, "k2Gaus": 1, "kGausSigmaRatioPar": 2} bkg_func_map = {"kExpo": 0, "kLin": 1, "Pol2": 2, "kNoBk": 3, "kPow": 4, "kPowEx": 5} - # Extract the analysis parameters fit_pars = database["analysis"][type_ana] @@ -179,12 +179,11 @@ def do_simple_fit(database, type_ana, period_number=-1, output_dir="simple_fit") # END reading all fit parameters # ################################## - # Where the histomass.root is read from - input_dir_mc = fit_pars["mc"]["results"][period_number] \ - if period_number > -1 else fit_pars["mc"]["resultsallp"] - input_dir_data = fit_pars["data"]["results"][period_number] \ - if period_number > -1 else fit_pars["data"]["resultsallp"] + input_dir_mc = fit_pars["mc"]["results"][period_number] if period_number > -1 else fit_pars["mc"]["resultsallp"] + input_dir_data = ( + fit_pars["data"]["results"][period_number] if period_number > -1 else fit_pars["data"]["resultsallp"] + ) # Otherwise the output directory might not exist, hence create if not exists(output_dir): @@ -199,28 +198,33 @@ def do_simple_fit(database, type_ana, period_number=-1, output_dir="simple_fit") ############################################## mc_fitters = [] for ipt in range(n_bins1): - # Always have the MC histogram for mult. 
integrated bin_id_match = bin_matching[ipt] - suffix_mc_int = "%s%d_%d_%.2f%s_%.2f_%.2f" % \ - (bin1_name, bins1_edges_low[ipt], - bins1_edges_up[ipt], prob_cut_fin[bin_id_match], - bin2_gen_name, bins2_edges_low[bins2_int_bin], - bins2_edges_up[bins2_int_bin]) + suffix_mc_int = "%s%d_%d_%.2f%s_%.2f_%.2f" % ( + bin1_name, + bins1_edges_low[ipt], + bins1_edges_up[ipt], + prob_cut_fin[bin_id_match], + bin2_gen_name, + bins2_edges_low[bins2_int_bin], + bins2_edges_up[bins2_int_bin], + ) # Get always the one for the multiplicity integrated histo_mc_int = histo_file_mc.Get("hmass_sig" + suffix_mc_int) histo_mc_int.SetDirectory(0) - fit_pars_mc = {"mean": mean, - "sigma": sigma[ipt], - "rebin": rebin[bins2_int_bin][ipt], - "use_user_fit_range": False, - "fit_range_low": fit_range_low[ipt], - "fit_range_up": fit_range_up[ipt], - "n_rms_fix": None, - "n_rms_start": 3, - "n_rms_stop": 8, - "likelihood": False} + fit_pars_mc = { + "mean": mean, + "sigma": sigma[ipt], + "rebin": rebin[bins2_int_bin][ipt], + "use_user_fit_range": False, + "fit_range_low": fit_range_low[ipt], + "fit_range_up": fit_range_up[ipt], + "n_rms_fix": None, + "n_rms_start": 3, + "n_rms_stop": 8, + "likelihood": False, + } fitter_mc = FitROOTGauss(fit_pars_mc, histo=histo_mc_int) mc_fitters.append(fitter_mc) @@ -244,7 +248,6 @@ def do_simple_fit(database, type_ana, period_number=-1, output_dir="simple_fit") data_fitters = [] for imult in range(n_bins2): for ipt in range(n_bins1): - # We only perform fit where the fit on M was successful mc_fit = mc_fitters[ipt] if not mc_fit.success: @@ -253,19 +256,26 @@ def do_simple_fit(database, type_ana, period_number=-1, output_dir="simple_fit") bin_id_match = bin_matching[ipt] - suffix_data = "%s%d_%d_%.2f%s_%.2f_%.2f" % \ - (bin1_name, bins1_edges_low[ipt], - bins1_edges_up[ipt], prob_cut_fin[bin_id_match], - bin2_name, bins2_edges_low[imult], - bins2_edges_up[imult]) + suffix_data = "%s%d_%d_%.2f%s_%.2f_%.2f" % ( + bin1_name, + bins1_edges_low[ipt], + bins1_edges_up[ipt], + prob_cut_fin[bin_id_match], + bin2_name, + bins2_edges_low[imult], + bins2_edges_up[imult], + ) # There might be a different name for the MC histogram due to a potential # difference in the multiplicity binning variable - suffix_mc = "%s%d_%d_%.2f%s_%.2f_%.2f" % \ - (bin1_name, bins1_edges_low[ipt], - bins1_edges_up[ipt], prob_cut_fin[bin_id_match], - bin2_gen_name, bins2_edges_low[imult], - bins2_edges_up[imult]) - + suffix_mc = "%s%d_%d_%.2f%s_%.2f_%.2f" % ( + bin1_name, + bins1_edges_low[ipt], + bins1_edges_up[ipt], + prob_cut_fin[bin_id_match], + bin2_gen_name, + bins2_edges_low[imult], + bins2_edges_up[imult], + ) # Get all histograms which might be required # Are we using weighted or unweighted histograms? 
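The suffix strings assembled above encode the pT-bin edges, the ML probability cut and the multiplicity-bin edges directly into the histogram name, so every (pT, mult) combination maps to a unique key in histomass.root. A minimal sketch of that lookup pattern (the helper and file path are illustrative; only the hmass/hmass_sig prefixes appear in this code, and the weighted variant handled in the next hunk may use a different prefix):

from ROOT import TFile  # pylint: disable=import-error, no-name-in-module


def get_mass_histo(file_path, prefix, suffix):
    """Fetch a histogram named prefix + suffix and detach it from its file."""
    root_file = TFile.Open(file_path)
    histo = root_file.Get(prefix + suffix)
    if not histo:
        raise KeyError(f"{prefix + suffix} not found in {file_path}")
    histo.SetDirectory(0)  # keep the histogram alive after the file is closed
    root_file.Close()
    return histo

A call like get_mass_histo("histomass.root", "hmass", suffix_data) would then return the data histogram fitted below.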
@@ -280,26 +290,28 @@ def do_simple_fit(database, type_ana, period_number=-1, output_dir="simple_fit") ################################## # All fit parameters from the DB # ################################## - fit_pars = {"mean": mean, - "fix_mean": fix_mean, - "sigma": mc_fit.fit_pars["sigma"], - "fix_sigma": fix_sigma[ipt], - "include_sec_peak": include_sec_peak[imult][ipt], - "sec_mean": None, - "fix_sec_mean": False, - "sec_sigma": None, - "fix_sec_sigma": False, - "use_sec_peak_rel_sigma": True, - "include_reflections": include_reflections, - "fix_reflections_s_over_b": True, - "rebin": rebin[imult][ipt], - "fit_range_low": fit_range_low[ipt], - "fit_range_up": fit_range_up[ipt], - "likelihood": likelihood, - "n_sigma_sideband": n_sigma_sideband, - "rel_sigma_bound": rel_sigma_bound, - "sig_func_name": sig_func_map[sig_func_name[ipt]], - "bkg_func_name": bkg_func_map[bkg_func_name[ipt]]} + fit_pars = { + "mean": mean, + "fix_mean": fix_mean, + "sigma": mc_fit.fit_pars["sigma"], + "fix_sigma": fix_sigma[ipt], + "include_sec_peak": include_sec_peak[imult][ipt], + "sec_mean": None, + "fix_sec_mean": False, + "sec_sigma": None, + "fix_sec_sigma": False, + "use_sec_peak_rel_sigma": True, + "include_reflections": include_reflections, + "fix_reflections_s_over_b": True, + "rebin": rebin[imult][ipt], + "fit_range_low": fit_range_low[ipt], + "fit_range_up": fit_range_up[ipt], + "likelihood": likelihood, + "n_sigma_sideband": n_sigma_sideband, + "rel_sigma_bound": rel_sigma_bound, + "sig_func_name": sig_func_map[sig_func_name[ipt]], + "bkg_func_name": bkg_func_map[bkg_func_name[ipt]], + } # Include second peak if required if fit_pars["include_sec_peak"]: @@ -314,8 +326,7 @@ def do_simple_fit(database, type_ana, period_number=-1, output_dir="simple_fit") ################################ # Construct fitter and add to list - fitter = FitAliHF(fit_pars, histo=histo_data, histo_mc=histo_mc, - histo_reflections=histo_refl) + fitter = FitAliHF(fit_pars, histo=histo_data, histo_mc=histo_mc, histo_reflections=histo_refl) data_fitters.append(fitter) # Fit, draw and save @@ -327,8 +338,7 @@ def do_simple_fit(database, type_ana, period_number=-1, output_dir="simple_fit") save_fit(fitter, join(output_dir, f"fit_ipt_{ipt}_imult_{imult}")) if not fitter.success: - print(f"Fit in (ipt, imult) = ({ipt}, {imult}) failed. Try to draw and save " \ - f"anyway.") + print(f"Fit in (ipt, imult) = ({ipt}, {imult}) failed. 
Try to draw and save anyway.") def main(): """ @@ -337,16 +347,19 @@ """ parser = argparse.ArgumentParser() - parser.add_argument("--database-analysis", "-d", dest="database_analysis", - help="analysis database to be used", required=True) - parser.add_argument("--analysis", "-a", dest="type_ana", - help="choose type of analysis", required=True) - parser.add_argument("--period-number", "-p", dest="period_number", type=int, - help="choose data-taking period (0: 2016, 1: 2017, 2: 2018, " \ - "-1: all merged (default))", default=-1) - parser.add_argument("--output", "-o", default="simple_fit", - help="result output directory") - + parser.add_argument( + "--database-analysis", "-d", dest="database_analysis", help="analysis database to be used", required=True + ) + parser.add_argument("--analysis", "-a", dest="type_ana", help="choose type of analysis", required=True) + parser.add_argument( + "--period-number", + "-p", + dest="period_number", + type=int, + help="choose data-taking period (0: 2016, 1: 2017, 2: 2018, -1: all merged (default))", + default=-1, + ) + parser.add_argument("--output", "-o", default="simple_fit", help="result output directory") args = parser.parse_args() diff --git a/machine_learning_hep/fitting/utils.py b/machine_learning_hep/fitting/utils.py index 9b8e32528a..f9c70b7a68 100644 --- a/machine_learning_hep/fitting/utils.py +++ b/machine_learning_hep/fitting/utils.py @@ -20,19 +20,19 @@ 2. user configuration database Providing and storing fitters """ -from os.path import join -from math import ceil + import inspect +from math import ceil +from os.path import join # pylint: disable=import-error, no-name-in-module, unused-import from ROOT import TFile -from machine_learning_hep.io import parse_yaml, dump_yaml_from_dict, checkdir +from machine_learning_hep.io import checkdir, dump_yaml_from_dict, parse_yaml from machine_learning_hep.logger import get_logger def construct_rebinning(histo, rebin): - try: iter(rebin) min_rebin = rebin[0] @@ -52,7 +52,6 @@ def construct_rebinning(histo, rebin): def save_fit(fit, save_dir, annotations=None): - if not fit.has_attempt: get_logger().warning("Fit has not been done and will hence not be saved") return @@ -76,8 +75,7 @@ dump_yaml_from_dict(fit.fit_pars, yaml_path) class_name = fit.__class__.__name__ - meta_info = {"fit_class": class_name, - "success": fit.success} + meta_info = {"fit_class": class_name, "success": fit.success} if annotations: meta_info["annotations"] = annotations @@ -91,12 +89,15 @@ def load_fit(save_dir): yaml_path = join(save_dir, "init_pars.yaml") - #pylint: disable=import-outside-toplevel + # pylint: disable=import-outside-toplevel import machine_learning_hep.fitting.fitters as search_module - #pylint: enable=import-outside-toplevel - fit_classes = {f[0]: getattr(search_module, f[0]) \ - for f in inspect.getmembers(search_module, inspect.isclass) \ - if f[1].__module__ == search_module.__name__} + + # pylint: enable=import-outside-toplevel + fit_classes = { + f[0]: getattr(search_module, f[0]) + for f in inspect.getmembers(search_module, inspect.isclass) + if f[1].__module__ == search_module.__name__ + } fit = None if meta_info["fit_class"] in fit_classes: fit = fit_classes[meta_info["fit_class"]](parse_yaml(yaml_path)) diff --git a/machine_learning_hep/globalfitter.py b/machine_learning_hep/globalfitter.py index e0fe6d2bf5..3a95eab082 100644 --- a/machine_learning_hep/globalfitter.py +++ b/machine_learning_hep/globalfitter.py @@ -16,33 +16,37 @@ Methods to: fit 
inv. mass """ -from math import sqrt, pi, exp +from math import exp, pi, sqrt + # pylint: disable=import-error,no-name-in-module -from ROOT import TF1, gStyle, TCanvas, TPaveText, Double, TVirtualFitter, \ - kGreen, kRed, kBlue, TGraph, gROOT -from machine_learning_hep.logger import get_logger +from ROOT import TF1, Double, TCanvas, TGraph, TPaveText, TVirtualFitter, gROOT, gStyle, kBlue, kGreen, kRed +from machine_learning_hep.logger import get_logger -gROOT.ProcessLine("struct FitValues { Double_t mean; Double_t sigma; Double_t mean_fit; \ +gROOT.ProcessLine( + "struct FitValues { Double_t mean; Double_t sigma; Double_t mean_fit; \ Double_t sigma_fit; Bool_t fix_mean; Bool_t fix_sigma; \ Double_t nsigma_sig; Double_t nsigma_sideband; \ Double_t fit_range_low; Double_t fit_range_up; \ - Bool_t success;};") + Bool_t success;};" +) # pylint: disable=wrong-import-position, ungrouped-imports from ROOT import FitValues + def fixpar(massmin, massmax, masspeak, range_signal): - par_fix1 = Double(massmax-massmin) - par_fix2 = Double(massmax+massmin) - par_fix3 = Double(massmax*massmax*massmax-massmin*massmin*massmin) + par_fix1 = Double(massmax - massmin) + par_fix2 = Double(massmax + massmin) + par_fix3 = Double(massmax * massmax * massmax - massmin * massmin * massmin) par_fix4 = Double(masspeak) par_fix5 = Double(range_signal) return par_fix1, par_fix2, par_fix3, par_fix4, par_fix5 + def gaus_fit_func(xval, par): - return par[0] / sqrt(2. * pi) / par[2] * \ - exp(-(xval[0] - par[1]) * (xval[0] - par[1]) / 2. / par[2] / par[2]) + return par[0] / sqrt(2.0 * pi) / par[2] * exp(-(xval[0] - par[1]) * (xval[0] - par[1]) / 2.0 / par[2] / par[2]) + def signal_func(func_name, sgnfunc, fit_range_low, fit_range_up): if sgnfunc != "kGaus": @@ -51,33 +55,37 @@ func.SetParNames("Int", "Mean", "Sigma") return func + def pol1_func_sidebands(xval, par): if par[6] > 0 and abs(xval[0] - par[4]) < par[5]: TF1.RejectPoint() - return 0. + return 0.0 return par[0] / par[2] + par[1] * (xval[0] - 0.5 * par[3]) + def pol2_func_sidebands(xval, par): if par[8] > 0 and abs(xval[0] - par[6]) < par[7]: TF1.RejectPoint() - return 0. - return par[0] / par[3] + par[1] * (xval[0] - 0.5 * par[4]) + par[2] * \ - (xval[0] * xval[0] - 1/3. * par[5] / par[3]) + return 0.0 + return ( + par[0] / par[3] + par[1] * (xval[0] - 0.5 * par[4]) + par[2] * (xval[0] * xval[0] - 1 / 3.0 * par[5] / par[3]) + ) + -def bkg_fit_func(func_name, func_type, massmin, massmax, integralhisto, masspeak, range_signal, - reject_signal_region=True): +def bkg_fit_func( + func_name, func_type, massmin, massmax, integralhisto, masspeak, range_signal, reject_signal_region=True +): # Immediately exit if function is unknown if func_type not in ["Pol1", "Pol2"]: get_logger().fatal("Unknown background fit function %s", func_type) - par_fix1, par_fix2, par_fix3, par_fix4, par_fix5 = \ - fixpar(massmin, massmax, masspeak, range_signal) + par_fix1, par_fix2, par_fix3, par_fix4, par_fix5 = fixpar(massmin, massmax, masspeak, range_signal) # In the following return asap if func_type == "Pol1": back_fit = TF1(func_name, pol1_func_sidebands, massmin, massmax, 7) back_fit.SetParNames("BkgInt", "Slope", "", "", "", "") - back_fit.SetParameters(integralhisto, -100.) 
+ back_fit.SetParameters(integralhisto, -100.0) back_fit.FixParameter(2, par_fix1) back_fit.FixParameter(3, par_fix2) back_fit.FixParameter(4, par_fix4) @@ -86,10 +94,18 @@ def bkg_fit_func(func_name, func_type, massmin, massmax, integralhisto, masspeak return back_fit back_fit = TF1(func_name, pol2_func_sidebands, massmin, massmax, 9) - back_fit.SetParNames("BkgInt", "Coeff1", "Coeff2", "AlwaysFixedPar1", "AlwaysFixedPar2", - "AlwaysFixedPar3", "HelperParMassPeak", "HelperParSigRange", - "HelperParRejectSigRange") - back_fit.SetParameters(integralhisto, -10., 5.) + back_fit.SetParNames( + "BkgInt", + "Coeff1", + "Coeff2", + "AlwaysFixedPar1", + "AlwaysFixedPar2", + "AlwaysFixedPar3", + "HelperParMassPeak", + "HelperParSigRange", + "HelperParRejectSigRange", + ) + back_fit.SetParameters(integralhisto, -10.0, 5.0) back_fit.FixParameter(3, par_fix1) back_fit.FixParameter(4, par_fix2) back_fit.FixParameter(5, par_fix3) @@ -99,6 +115,7 @@ def bkg_fit_func(func_name, func_type, massmin, massmax, integralhisto, masspeak back_fit.FixParameter(8, 1 if reject_signal_region else -1) return back_fit + def tot_func(bkgfunc, massmax, massmin): # Immediately exit if function is unknown if bkgfunc not in ["Pol1", "Pol2"]: @@ -106,22 +123,29 @@ def tot_func(bkgfunc, massmax, massmin): # in the following return asap if bkgfunc == "Pol1": - return "[0]/(%f)+[1]*(x-0.5*(%f)) \ - +[2]/(sqrt(2.*pi))/[4]*(exp(-(x-[3])*(x-[3])/2./[4]/[4]))" % \ - ((massmax-massmin), (massmax+massmin)) - - return "[0]/(%f)+[1]*(x-0.5*(%f))+[2]*(x*x-1/3.*(%f)/(%f)) \ - +[3]/(sqrt(2.*pi))/[5]*(exp(-(x-[4])*(x-[4])/2./[5]/[5]))" % \ - ((massmax - massmin), (massmax + massmin), + return ( + "[0]/(%f)+[1]*(x-0.5*(%f)) \ + +[2]/(sqrt(2.*pi))/[4]*(exp(-(x-[3])*(x-[3])/2./[4]/[4]))" + % ((massmax - massmin), (massmax + massmin)) + ) + + return ( + "[0]/(%f)+[1]*(x-0.5*(%f))+[2]*(x*x-1/3.*(%f)/(%f)) \ + +[3]/(sqrt(2.*pi))/[5]*(exp(-(x-[4])*(x-[4])/2./[5]/[5]))" + % ( + (massmax - massmin), + (massmax + massmin), (massmax * massmax * massmax - massmin * massmin * massmin), - (massmax-massmin)) + (massmax - massmin), + ) + ) # pylint: disable=too-many-instance-attributes class Fitter: species = "fitter" - def __init__(self): + def __init__(self): self.logger = get_logger() # These are filled after the fit has been done self.yield_sig = None @@ -158,16 +182,28 @@ def __init__(self): # The original histogram to be fitted self.histo_to_fit = None # The histogram after background subtraction after the fit has been performed - #self.histo_sideband_sub = None + # self.histo_sideband_sub = None # Flag whether it has been fitted self.fitted = False self.fit_success = False # pylint: disable=too-many-arguments - def initialize(self, histo, sig_func_name, bkg_func_name, rebin, mean, sigma, fix_mean, - fix_sigma, nsigma_sideband, nsigma_sig, fit_range_low, fit_range_up): - + def initialize( + self, + histo, + sig_func_name, + bkg_func_name, + rebin, + mean, + sigma, + fix_mean, + fix_sigma, + nsigma_sideband, + nsigma_sig, + fit_range_low, + fit_range_up, + ): self.histo_to_fit = histo.Clone(histo.GetName() + "_for_fit") self.histo_to_fit.Rebin(rebin) self.mean = mean @@ -178,46 +214,63 @@ def initialize(self, histo, sig_func_name, bkg_func_name, rebin, mean, sigma, fi self.nsigma_sig = nsigma_sig # Make the fit range safe self.fit_range_low = max(fit_range_low, self.histo_to_fit.GetBinLowEdge(2)) - self.fit_range_up = min(fit_range_up, - self.histo_to_fit.GetBinLowEdge(self.histo_to_fit.GetNbinsX())) + self.fit_range_up = min(fit_range_up, 
self.histo_to_fit.GetBinLowEdge(self.histo_to_fit.GetNbinsX())) - bkg_int_initial = Double(histo.Integral(self.histo_to_fit.FindBin(fit_range_low), - self.histo_to_fit.FindBin(fit_range_up), - "width")) + bkg_int_initial = Double( + histo.Integral(self.histo_to_fit.FindBin(fit_range_low), self.histo_to_fit.FindBin(fit_range_up), "width") + ) self.sig_fit_func = signal_func("sig_fit", sig_func_name, fit_range_low, fit_range_up) - self.bkg_sideband_fit_func = bkg_fit_func("bkg_fit_sidebands", bkg_func_name, fit_range_low, - fit_range_up, bkg_int_initial, mean, - nsigma_sideband * sigma) - self.bkg_fit_func = bkg_fit_func("bkg_fit", bkg_func_name, fit_range_low, fit_range_up, - bkg_int_initial, mean, nsigma_sideband * sigma, False) - self.bkg_tot_fit_func = bkg_fit_func("bkg_fit_from_tot_fit", bkg_func_name, fit_range_low, - fit_range_up, bkg_int_initial, mean, - nsigma_sideband * sigma, False) - self.tot_fit_func = TF1("tot_fit", tot_func(bkg_func_name, fit_range_up, fit_range_low), - fit_range_low, fit_range_up) + self.bkg_sideband_fit_func = bkg_fit_func( + "bkg_fit_sidebands", + bkg_func_name, + fit_range_low, + fit_range_up, + bkg_int_initial, + mean, + nsigma_sideband * sigma, + ) + self.bkg_fit_func = bkg_fit_func( + "bkg_fit", bkg_func_name, fit_range_low, fit_range_up, bkg_int_initial, mean, nsigma_sideband * sigma, False + ) + self.bkg_tot_fit_func = bkg_fit_func( + "bkg_fit_from_tot_fit", + bkg_func_name, + fit_range_low, + fit_range_up, + bkg_int_initial, + mean, + nsigma_sideband * sigma, + False, + ) + self.tot_fit_func = TF1( + "tot_fit", tot_func(bkg_func_name, fit_range_up, fit_range_low), fit_range_low, fit_range_up + ) self.fitted = False self.fit_success = False def do_likelihood(self): self.fit_options = "L,E" - def update_check_signal_fit(self): error_list = [] - if self.yield_sig < 0. < self.sigma_fit or self.sigma_fit < 0. < self.yield_sig: - error_list.append(f"Both integral pre-factor and sigma have to have the same sign. " \ - f"However, pre-factor is {self.yield_sig} and sigma is " \ - f"{self.sigma_fit}.") - if self.mean_fit < 0.: + if self.yield_sig < 0.0 < self.sigma_fit or self.sigma_fit < 0.0 < self.yield_sig: + error_list.append( + f"Both integral pre-factor and sigma have to have the same sign. " + f"However, pre-factor is {self.yield_sig} and sigma is " + f"{self.sigma_fit}." + ) + if self.mean_fit < 0.0: error_list.append(f"Mean is negative: {self.mean_fit}") if abs(self.sigma_fit) > 10 * self.sigma: - error_list.append(f"Fitted sigma is larger than 10 times initial sigma " \ - f"{self.sigma:.4f} vs. {self.sigma_fit:.4f}") + error_list.append( + f"Fitted sigma is larger than 10 times initial sigma {self.sigma:.4f} vs. {self.sigma_fit:.4f}" + ) if abs(self.sigma_fit) < 0.1 * self.sigma: - error_list.append(f"Fitted sigma is smaller than 0.1 times initial sigma " \ - f"{self.sigma:.4f} vs. {self.sigma_fit:.4f}") + error_list.append( + f"Fitted sigma is smaller than 0.1 times initial sigma {self.sigma:.4f} vs. {self.sigma_fit:.4f}" + ) if error_list: return "\n".join(error_list) @@ -241,17 +294,19 @@ def derive_yields(self): maxMass_fit = self.mean_fit + self.nsigma_sig * self.sigma_fit leftBand = self.histo_to_fit.FindBin(self.mean_fit - self.nsigma_sideband * self.sigma_fit) rightBand = self.histo_to_fit.FindBin(self.mean_fit + self.nsigma_sideband * self.sigma_fit) - intB = self.histo_to_fit.Integral(1, leftBand) + \ - self.histo_to_fit.Integral(rightBand, self.histo_to_fit.GetNbinsX()) - sum2 = 0. 
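The checks in update_check_signal_fit above reduce to three conditions on the fitted parameters. A standalone sketch in plain Python with hypothetical numbers (check_signal_fit is not part of the package, it only mirrors the logic):

    def check_signal_fit(yield_sig, mean_fit, sigma_fit, sigma_seed):
        errors = []
        if yield_sig * sigma_fit < 0:
            errors.append("integral pre-factor and sigma differ in sign")
        if mean_fit < 0:
            errors.append("mean is negative")
        if not 0.1 * sigma_seed <= abs(sigma_fit) <= 10 * sigma_seed:
            errors.append("fitted sigma too far from the initial sigma")
        return errors

    print(check_signal_fit(5000.0, 2.286, 0.008, 0.008))  # [] -> fit accepted
    print(check_signal_fit(5000.0, 2.286, 0.100, 0.008))  # sigma check fails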
+ intB = self.histo_to_fit.Integral(1, leftBand) + self.histo_to_fit.Integral( + rightBand, self.histo_to_fit.GetNbinsX() + ) + sum2 = 0.0 for i_left in range(1, leftBand + 1): sum2 += self.histo_to_fit.GetBinError(i_left) * self.histo_to_fit.GetBinError(i_left) for i_right in range(rightBand, (self.histo_to_fit.GetNbinsX()) + 1): sum2 += self.histo_to_fit.GetBinError(i_right) * self.histo_to_fit.GetBinError(i_right) intBerr = sqrt(sum2) - self.yield_bkg = self.bkg_tot_fit_func.Integral(minMass_fit, maxMass_fit) / \ - Double(self.histo_to_fit.GetBinWidth(1)) - #if background <= 0: + self.yield_bkg = self.bkg_tot_fit_func.Integral(minMass_fit, maxMass_fit) / Double( + self.histo_to_fit.GetBinWidth(1) + ) + # if background <= 0: # return -1, -1 self.yield_bkg_err = 0 if intB > 0: @@ -259,13 +314,8 @@ def derive_yields(self): self.yield_bkg_err = intBerr / intB * self.yield_bkg self.logger.info("Background: %s, error background: %s", self.yield_bkg, self.yield_bkg_err) - self.yield_sig = self.sig_fit_func.GetParameter(0) / \ - Double(self.histo_to_fit.GetBinWidth(1)) - self.yield_sig_err = self.sig_fit_func.GetParError(0) / \ - Double(self.histo_to_fit.GetBinWidth(1)) - - - + self.yield_sig = self.sig_fit_func.GetParameter(0) / Double(self.histo_to_fit.GetBinWidth(1)) + self.yield_sig_err = self.sig_fit_func.GetParError(0) / Double(self.histo_to_fit.GetBinWidth(1)) self.logger.info("Raw yield: %f, raw yield error: %f", self.yield_sig, self.yield_sig_err) errSigSq = self.yield_sig_err * self.yield_sig_err @@ -275,32 +325,32 @@ def derive_yields(self): self.errsignificance = 0 if sigPlusBkg > 0 and self.yield_sig > 0: self.significance = self.yield_sig / (sqrt(sigPlusBkg)) - self.errsignificance = self.significance * (sqrt((errSigSq + errBkgSq) / \ - (4. * sigPlusBkg * sigPlusBkg) + \ - (self.yield_bkg / sigPlusBkg) * errSigSq / \ - self.yield_sig / self.yield_sig)) + self.errsignificance = self.significance * ( + sqrt( + (errSigSq + errBkgSq) / (4.0 * sigPlusBkg * sigPlusBkg) + + (self.yield_bkg / sigPlusBkg) * errSigSq / self.yield_sig / self.yield_sig + ) + ) - self.logger.info("Significance: %f, error significance: %f", self.significance, - self.errsignificance) + self.logger.info("Significance: %f, error significance: %f", self.significance, self.errsignificance) def bincount(self, nsigma, use_integral=True): - if not self.fitted: self.logger.error("Cannot compute bincount. Fit required first!") return None, None # Now yield from bin count - bincount = 0. - bincount_err = 0. 
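The block above computes the significance as S / sqrt(S + B), with the uncertainty propagated from the raw-yield and background errors. The same arithmetic on hypothetical numbers, standard library only:

    from math import sqrt

    s, ds = 1500.0, 120.0  # raw signal yield and its error
    b, db = 9000.0, 95.0   # background in the signal window and its error

    signif = s / sqrt(s + b)
    err = signif * sqrt((ds**2 + db**2) / (4 * (s + b) ** 2)
                        + (b / (s + b)) * ds**2 / s**2)
    print(f"{signif:.1f} +- {err:.1f}")  # 14.6 +- 1.1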
+ bincount = 0.0 + bincount_err = 0.0 leftBand = self.histo_to_fit.FindBin(self.mean_fit - nsigma * self.sigma_fit) rightBand = self.histo_to_fit.FindBin(self.mean_fit + nsigma * self.sigma_fit) for b in range(leftBand, rightBand + 1, 1): bkg_count = 0 if use_integral: - bkg_count = self.bkg_fit_func.Integral(self.histo_to_fit.GetBinLowEdge(b), - self.histo_to_fit.GetBinLowEdge(b) + \ - self.histo_to_fit.GetBinWidth(b)) / \ - self.histo_to_fit.GetBinWidth(b) + bkg_count = self.bkg_fit_func.Integral( + self.histo_to_fit.GetBinLowEdge(b), + self.histo_to_fit.GetBinLowEdge(b) + self.histo_to_fit.GetBinWidth(b), + ) / self.histo_to_fit.GetBinWidth(b) else: bkg_count = self.bkg_fit_func.Eval(self.histo_to_fit.GetBinCenter(b)) @@ -338,8 +388,9 @@ def save(self, root_dir): def load(self, root_dir, force=False): if self.fitted and not force: - self.logger.warning("Was fitted before and will be overwritten with what is found " \ - "in ROOT dir%s", root_dir.GetName()) + self.logger.warning( + "Was fitted before and will be overwritten with what is found in ROOT dir%s", root_dir.GetName() + ) self.sig_fit_func = root_dir.Get("sig_fit") self.bkg_sideband_fit_func = root_dir.Get("bkg_fit_sidebands") @@ -366,7 +417,7 @@ def load(self, root_dir, force=False): error = self.update_check_signal_fit() self.fitted = True - self.fit_success = (error == "") + self.fit_success = error == "" # pylint: disable=too-many-arguments, too-many-locals, too-many-branches, # pylint: disable=too-many-statements @@ -402,8 +453,8 @@ def fit(self): maxForSig = self.mean + self.nsigma_sideband * self.sigma binForMinSig = self.histo_to_fit.FindBin(minForSig) binForMaxSig = self.histo_to_fit.FindBin(maxForSig) - sum_tot = 0. - sumback = 0. + sum_tot = 0.0 + sumback = 0.0 for ibin in range(binForMinSig, binForMaxSig + 1): sum_tot += self.histo_to_fit.GetBinContent(ibin) sumback += self.bkg_sideband_fit_func.Eval(self.histo_to_fit.GetBinCenter(ibin)) @@ -429,8 +480,7 @@ def fit(self): self.tot_fit_func.FixParameter(npar_bkg + 1, self.mean) if self.fix_sigma is True: # Sigma would be fixed to what the fit to MC gives - self.tot_fit_func.FixParameter(npar_bkg + 2, - self.tot_fit_func.GetParameter(npar_bkg + 2)) + self.tot_fit_func.FixParameter(npar_bkg + 2, self.tot_fit_func.GetParameter(npar_bkg + 2)) self.histo_to_fit.Fit(self.tot_fit_func, ("R,%s,+,0" % (self.fit_options))) for ipar in range(0, npar_bkg): @@ -448,15 +498,15 @@ def fit(self): self.logger.error("Signal fit probably bad for following reasons:\n%s", error) self.fitted = True - self.fit_success = (error == "") + self.fit_success = error == "" return self.fit_success def draw_fit(self, save_name, flag_plot_message=None, shade_regions=False): - #Draw + # Draw self.histo_to_fit.GetXaxis().SetTitle("Invariant Mass L_{c}^{+}(GeV/c^{2})") self.histo_to_fit.SetStats(0) - c1 = TCanvas('c1', 'The Fit Canvas', 700, 700) + c1 = TCanvas("c1", "The Fit Canvas", 700, 700) c1.cd() gStyle.SetOptStat(0) gStyle.SetCanvasColor(0) @@ -469,7 +519,7 @@ def draw_fit(self, save_name, flag_plot_message=None, shade_regions=False): self.histo_to_fit.GetYaxis().SetRangeUser(histo_min, histo_max) self.histo_to_fit.SetMarkerStyle(20) self.histo_to_fit.SetMarkerSize(1) - #histo.SetMinimum(0.) + # histo.SetMinimum(0.) 
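The bincount method above is the usual fit-independent cross-check: sum the data counts within mean +- nsigma * sigma and subtract the fitted background bin by bin. The same idea in miniature on a toy histogram, numpy only, all numbers hypothetical:

    import numpy as np

    edges = np.linspace(2.14, 2.43, 30)
    centers = 0.5 * (edges[:-1] + edges[1:])
    counts = np.full(centers.size, 300.0)                # flat background
    counts[12:17] += np.array([40, 120, 180, 110, 35])   # toy signal peak

    def bkg(x):  # stand-in for bkg_fit_func.Eval at the bin center
        return 300.0

    sel = slice(12, 17)                                   # signal window bins
    signal = counts[sel].sum() - sum(bkg(x) for x in centers[sel])
    err = np.sqrt(counts[sel].sum())                      # sqrt(N) on the data
    print(f"bin-count yield: {signal:.0f} +- {err:.0f}")  # 485 +- 45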
self.histo_to_fit.Draw("PE") self.bkg_tot_fit_func.Draw("same") self.tot_fit_func.Draw("same") @@ -482,16 +532,18 @@ def draw_fit(self, save_name, flag_plot_message=None, shade_regions=False): bkg_fill = None if shade_regions: sideband_fill_left = self.bkg_tot_fit_func.Clone("bkg_fit_fill_left") - sideband_fill_left.SetRange(self.mean_fit - 9 * self.sigma_fit, - self.mean_fit - self.nsigma_sideband * self.sigma_fit) + sideband_fill_left.SetRange( + self.mean_fit - 9 * self.sigma_fit, self.mean_fit - self.nsigma_sideband * self.sigma_fit + ) sideband_fill_left.SetLineWidth(0) sideband_fill_left.SetFillColor(self.bkg_tot_fit_func.GetLineColor()) sideband_fill_left.SetFillStyle(3001) sideband_fill_left.Draw("same fc") sideband_fill_right = self.bkg_tot_fit_func.Clone("bkg_fit_fill_right") - sideband_fill_right.SetRange(self.mean_fit + self.nsigma_sideband * self.sigma_fit, - self.mean_fit + 9 * self.sigma_fit) + sideband_fill_right.SetRange( + self.mean_fit + self.nsigma_sideband * self.sigma_fit, self.mean_fit + 9 * self.sigma_fit + ) sideband_fill_right.SetLineWidth(0) sideband_fill_right.SetFillColor(self.bkg_tot_fit_func.GetLineColor()) sideband_fill_right.SetFillStyle(3001) @@ -499,8 +551,9 @@ def draw_fit(self, save_name, flag_plot_message=None, shade_regions=False): # Shading bakground in signal region bkg_fill = self.bkg_tot_fit_func.Clone("bkg_fit_under_sig_fill") - bkg_fill.SetRange(self.mean_fit - self.nsigma_sig * self.sigma_fit, - self.mean_fit + self.nsigma_sig * self.sigma_fit) + bkg_fill.SetRange( + self.mean_fit - self.nsigma_sig * self.sigma_fit, self.mean_fit + self.nsigma_sig * self.sigma_fit + ) bkg_fill.SetLineWidth(0) bkg_fill.SetFillColor(kRed + 2) bkg_fill.SetFillStyle(3001) @@ -515,36 +568,37 @@ def draw_fit(self, save_name, flag_plot_message=None, shade_regions=False): range_low = self.mean_fit - self.nsigma_sig * self.sigma_fit range_up = self.mean_fit + self.nsigma_sig * self.sigma_fit for ip in range(n_points): - sig_fill.SetPoint(ip, range_low + ip * dx, - self.tot_fit_func.Eval(range_low + ip * dx)) - sig_fill.SetPoint(n_points + ip, range_up - ip * dx, - self.bkg_tot_fit_func.Eval(range_up - ip * dx)) + sig_fill.SetPoint(ip, range_low + ip * dx, self.tot_fit_func.Eval(range_low + ip * dx)) + sig_fill.SetPoint(n_points + ip, range_up - ip * dx, self.bkg_tot_fit_func.Eval(range_up - ip * dx)) sig_fill.Draw("f") - #write info. + # write info. pinfos = TPaveText(0.12, 0.7, 0.47, 0.89, "NDC") pinfos.SetBorderSize(0) pinfos.SetFillStyle(0) pinfos.SetTextAlign(11) pinfos.SetTextSize(0.03) - pinfom = TPaveText(0.5, 0.7, 1., .89, "NDC") + pinfom = TPaveText(0.5, 0.7, 1.0, 0.89, "NDC") pinfom.SetTextAlign(11) pinfom.SetBorderSize(0) pinfom.SetFillStyle(0) pinfom.SetTextColor(kBlue) pinfom.SetTextSize(0.03) chisquare_ndf = self.tot_fit_func.GetNDF() - chisquare_ndf = self.tot_fit_func.GetChisquare() / chisquare_ndf if chisquare_ndf > 0. \ - else 0. 
+ chisquare_ndf = self.tot_fit_func.GetChisquare() / chisquare_ndf if chisquare_ndf > 0.0 else 0.0 pinfom.AddText("#chi^{2}/NDF = %f" % (chisquare_ndf)) - pinfom.AddText("%s = %.3f #pm %.3f" % (self.sig_fit_func.GetParName(1),\ - self.sig_fit_func.GetParameter(1), self.sig_fit_func.GetParError(1))) - pinfom.AddText("%s = %.3f #pm %.3f" % (self.sig_fit_func.GetParName(2),\ - self.sig_fit_func.GetParameter(2), self.sig_fit_func.GetParError(2))) + pinfom.AddText( + "%s = %.3f #pm %.3f" + % (self.sig_fit_func.GetParName(1), self.sig_fit_func.GetParameter(1), self.sig_fit_func.GetParError(1)) + ) + pinfom.AddText( + "%s = %.3f #pm %.3f" + % (self.sig_fit_func.GetParName(2), self.sig_fit_func.GetParameter(2), self.sig_fit_func.GetParError(2)) + ) pinfom.Draw() flag_info = None if flag_plot_message is not None: - flag_info = TPaveText(0.5, 0.5, 1., 0.68, "NDC") + flag_info = TPaveText(0.5, 0.5, 1.0, 0.68, "NDC") flag_info.SetBorderSize(0) flag_info.SetFillStyle(0) flag_info.SetTextAlign(11) @@ -556,13 +610,15 @@ def draw_fit(self, save_name, flag_plot_message=None, shade_regions=False): sig_text = pinfos.AddText("S = %.0f #pm %.0f " % (self.yield_sig, self.yield_sig_err)) sig_text.SetTextColor(kGreen + 2) - bkg_text = pinfos.AddText("B (%.0f#sigma) = %.0f #pm %.0f" % \ - (self.nsigma_sig, self.yield_bkg, self.yield_bkg_err)) + bkg_text = pinfos.AddText( + "B (%.0f#sigma) = %.0f #pm %.0f" % (self.nsigma_sig, self.yield_bkg, self.yield_bkg_err) + ) bkg_text.SetTextColor(kRed + 2) - sig_over_back = self.yield_sig / self.yield_bkg if self.yield_bkg > 0. else 0. + sig_over_back = self.yield_sig / self.yield_bkg if self.yield_bkg > 0.0 else 0.0 pinfos.AddText("S/B (%.0f#sigma) = %.4f " % (self.nsigma_sig, sig_over_back)) - pinfos.AddText("Signif (%.0f#sigma) = %.1f #pm %.1f " %\ - (self.nsigma_sig, self.significance, self.errsignificance)) + pinfos.AddText( + "Signif (%.0f#sigma) = %.1f #pm %.1f " % (self.nsigma_sig, self.significance, self.errsignificance) + ) pinfos.Draw() c1.Update() diff --git a/machine_learning_hep/hf_analysis_utils.py b/machine_learning_hep/hf_analysis_utils.py index cb6d547f43..4414b841aa 100644 --- a/machine_learning_hep/hf_analysis_utils.py +++ b/machine_learning_hep/hf_analysis_utils.py @@ -16,7 +16,7 @@ file: hf_analysis_utils.py brief: script with miscellanea utils methods for the HF analyses author: Fabrizio Grosa , CERN -Macro committed and manteined in O2Physics: +Macro committed and manteined in O2Physics: https://github.com/AliceO2Group/O2Physics/tree/master/PWGHF/D2H/Macros """ @@ -63,13 +63,8 @@ def compute_crosssection( crosssection = -9999 crosssec_unc = -1 else: - crosssection = ( - rawy - * frac - * sigma_mb - / (2 * delta_pt * delta_y * eff_times_acc * n_events * b_ratio) - ) - if method_frac in ("Nb","ext"): + crosssection = rawy * frac * sigma_mb / (2 * delta_pt * delta_y * eff_times_acc * n_events * b_ratio) + if method_frac in ("Nb", "ext"): crosssec_unc = rawy_unc / (rawy * frac) * crosssection else: crosssec_unc = rawy_unc / rawy * crosssection @@ -132,37 +127,11 @@ def compute_fraction_fc( for i_sigma, (sigma_p, sigma_f) in enumerate(zip(cross_sec_prompt, cross_sec_fd)): for i_raa, (raa_p, raa_f) in enumerate(zip(raa_prompt, raa_fd)): if i_sigma == 0 and i_raa == 0: - frac_prompt_cent = 1.0 / ( - 1 + acc_eff_fd / acc_eff_prompt * sigma_f / sigma_p * raa_f / raa_p - ) - frac_fd_cent = 1.0 / ( - 1 + acc_eff_prompt / acc_eff_fd * sigma_p / sigma_f * raa_p / raa_f - ) + frac_prompt_cent = 1.0 / (1 + acc_eff_fd / acc_eff_prompt * sigma_f / sigma_p * raa_f / 
raa_p) + frac_fd_cent = 1.0 / (1 + acc_eff_prompt / acc_eff_fd * sigma_p / sigma_f * raa_p / raa_f) else: - frac_prompt.append( - 1.0 - / ( - 1 - + acc_eff_fd - / acc_eff_prompt - * sigma_f - / sigma_p - * raa_f - / raa_p - ) - ) - frac_fd.append( - 1.0 - / ( - 1 - + acc_eff_prompt - / acc_eff_fd - * sigma_p - / sigma_f - * raa_p - / raa_f - ) - ) + frac_prompt.append(1.0 / (1 + acc_eff_fd / acc_eff_prompt * sigma_f / sigma_p * raa_f / raa_p)) + frac_fd.append(1.0 / (1 + acc_eff_prompt / acc_eff_fd * sigma_p / sigma_f * raa_p / raa_f)) if frac_prompt and frac_fd: frac_prompt.sort() @@ -226,16 +195,7 @@ def compute_fraction_nb( if i_sigma == 0 and i_raa_ratio == 0: if raa_rat == 1.0 and taa == 1.0: # pp frac_cent = ( - 1 - - sigma - * delta_pt - * delta_y - * acc_eff_other - * b_ratio - * n_events - * 2 - / rawy - / sigma_mb + 1 - sigma * delta_pt * delta_y * acc_eff_other * b_ratio * n_events * 2 / rawy / sigma_mb ) else: # p-Pb or Pb-Pb: iterative evaluation of Raa needed delta_raa = 1.0 @@ -255,30 +215,13 @@ def compute_fraction_nb( frac_cent = 1 - raw_fd / rawy raa_other_old = raa_other raa_other = ( - frac_cent - * rawy - * sigma_mb - / 2 - / acc_eff_same - / delta_pt - / delta_y - / b_ratio - / n_events + frac_cent * rawy * sigma_mb / 2 / acc_eff_same / delta_pt / delta_y / b_ratio / n_events ) delta_raa = abs((raa_other - raa_other_old) / raa_other) else: if raa_rat == 1.0 and taa == 1.0: # pp frac.append( - 1 - - sigma - * delta_pt - * delta_y - * acc_eff_other - * b_ratio - * n_events - * 2 - / rawy - / sigma_mb + 1 - sigma * delta_pt * delta_y * acc_eff_other * b_ratio * n_events * 2 / rawy / sigma_mb ) else: # p-Pb or Pb-Pb: iterative evaluation of Raa needed delta_raa = 1.0 @@ -299,15 +242,7 @@ def compute_fraction_nb( frac_tmp = 1 - raw_fd / rawy raa_other_old = raa_other raa_other = ( - frac_tmp - * rawy - * sigma_mb - / 2 - / acc_eff_same - / delta_pt - / delta_y - / b_ratio - / n_events + frac_tmp * rawy * sigma_mb / 2 / acc_eff_same / delta_pt / delta_y / b_ratio / n_events ) delta_raa = abs((raa_other - raa_other_old) / raa_other) frac.append(frac_tmp) @@ -340,8 +275,6 @@ def get_hist_binlimits(histo): n_limits = histo.GetNbinsX() + 1 low_edge = histo.GetBinLowEdge(1) bin_width = histo.GetBinWidth(1) - bin_limits = np.array( - [low_edge + i_bin * bin_width for i_bin in range(n_limits)], "d" - ) + bin_limits = np.array([low_edge + i_bin * bin_width for i_bin in range(n_limits)], "d") return bin_limits diff --git a/machine_learning_hep/hf_pt_spectrum.py b/machine_learning_hep/hf_pt_spectrum.py index 472a5e8fca..ac1a6e2f4b 100644 --- a/machine_learning_hep/hf_pt_spectrum.py +++ b/machine_learning_hep/hf_pt_spectrum.py @@ -18,13 +18,13 @@ usage: python3 HfPtSpectrum.py CONFIG authors: Fabrizio Grosa , CERN Luigi Dello Stritto , CERN -Macro committed and manteined in O2Physics: +Macro committed and manteined in O2Physics: https://github.com/AliceO2Group/O2Physics/tree/master/PWGHF/D2H/Macros """ import sys -import numpy as np # pylint: disable=import-error +import numpy as np # pylint: disable=import-error from ROOT import ( # pylint: disable=import-error,no-name-in-module TH1, TH1F, @@ -38,30 +38,32 @@ kFullCircle, ) -from machine_learning_hep.hf_analysis_utils import ( # pylint: disable=import-error +from machine_learning_hep.hf_analysis_utils import ( # pylint: disable=import-error compute_crosssection, compute_fraction_fc, compute_fraction_nb, get_hist_binlimits, ) -def hf_pt_spectrum(channel, # pylint: disable=too-many-locals, too-many-arguments, too-many-statements, 
too-many-branches - b_ratio, - inputfonllpred, - frac_method, - prompt_frac, - eff_filename, - effprompt_histoname, - effnonprompt_histoname, - yield_filename, - yield_histoname, - norm, - sigmamb, - output_prompt, - output_file): +def hf_pt_spectrum( + channel, # pylint: disable=too-many-locals, too-many-arguments, too-many-statements, too-many-branches + b_ratio, + inputfonllpred, + frac_method, + prompt_frac, + eff_filename, + effprompt_histoname, + effnonprompt_histoname, + yield_filename, + yield_histoname, + norm, + sigmamb, + output_prompt, + output_file, +): # final plots style settings - style_hist = TStyle('style_hist','Histo graphics style') + style_hist = TStyle("style_hist", "Histo graphics style") style_hist.SetOptStat("n") style_hist.SetMarkerColor(kAzure + 4) style_hist.SetMarkerStyle(kFullCircle) @@ -88,10 +90,7 @@ def hf_pt_spectrum(channel, # pylint: disable=too-many-locals, too-many-argument sys.exit(2) if frac_method not in ["Nb", "fc", "ext"]: - print( - f"\033[91mERROR: method to subtract nonprompt" - f" {frac_method} not supported. Exit\033[0m" - ) + print(f"\033[91mERROR: method to subtract nonprompt {frac_method} not supported. Exit\033[0m") sys.exit(5) fonll_hist_name = { @@ -108,14 +107,10 @@ def hf_pt_spectrum(channel, # pylint: disable=too-many-locals, too-many-argument histos["FONLL"] = {"prompt": {}, "nonprompt": {}} infile_fonll = TFile.Open(inputfonllpred) for pred in ("central", "min", "max"): - histos["FONLL"]["nonprompt"][pred] = infile_fonll.Get( - f"{fonll_hist_name[channel]}fromBpred_{pred}_corr" - ) + histos["FONLL"]["nonprompt"][pred] = infile_fonll.Get(f"{fonll_hist_name[channel]}fromBpred_{pred}_corr") histos["FONLL"]["nonprompt"][pred].SetDirectory(0) if frac_method == "fc": - histos["FONLL"]["prompt"][pred] = infile_fonll.Get( - f"{fonll_hist_name[channel]}pred_{pred}" - ) + histos["FONLL"]["prompt"][pred] = infile_fonll.Get(f"{fonll_hist_name[channel]}pred_{pred}") histos["FONLL"]["prompt"][pred].SetDirectory(0) infile_fonll.Close() @@ -123,10 +118,7 @@ def hf_pt_spectrum(channel, # pylint: disable=too-many-locals, too-many-argument infile_rawy = TFile.Open(yield_filename) histos["rawyields"] = infile_rawy.Get(yield_histoname) if not histos["rawyields"]: - print( - f"\033[91mERROR: raw-yield histo {yield_histoname}" - f" not found in {yield_filename}. Exit\033[0m" - ) + print(f"\033[91mERROR: raw-yield histo {yield_histoname} not found in {yield_filename}. Exit\033[0m") sys.exit(6) histos["rawyields"].SetDirectory(0) @@ -135,17 +127,13 @@ def hf_pt_spectrum(channel, # pylint: disable=too-many-locals, too-many-argument infile_eff = TFile.Open(eff_filename) histos["acceffp"] = infile_eff.Get(effprompt_histoname) if not histos["acceffp"]: - print( - f"\033[91mERROR: prompt (acc x eff) histo {effprompt_histoname}" - f" not found in {eff_filename}. Exit\033[0m" - ) + print(f"\033[91mERROR: prompt (acc x eff) histo {effprompt_histoname} not found in {eff_filename}. Exit\033[0m") sys.exit(8) histos["acceffp"].SetDirectory(0) histos["acceffnp"] = infile_eff.Get(effnonprompt_histoname) if not histos["acceffnp"]: print( - f"\033[91mERROR: nonprompt (acc x eff) histo {effprompt_histoname}" - f"not found in {eff_filename}. Exit\033[0m" + f"\033[91mERROR: nonprompt (acc x eff) histo {effprompt_histoname}not found in {eff_filename}. 
Exit\033[0m" ) sys.exit(9) histos["acceffnp"].SetDirectory(0) @@ -155,10 +143,7 @@ def hf_pt_spectrum(channel, # pylint: disable=too-many-locals, too-many-argument ptlims = {} for histo in ["rawyields", "acceffp", "acceffnp"]: ptlims[histo] = get_hist_binlimits(histos[histo]) - if ( - histo != "rawyields" - and not np.equal(ptlims[histo], ptlims["rawyields"]).all() - ): + if histo != "rawyields" and not np.equal(ptlims[histo], ptlims["rawyields"]).all(): print("\033[91mERROR: histo binning not consistent. Exit\033[0m") sys.exit(10) @@ -182,39 +167,24 @@ def hf_pt_spectrum(channel, # pylint: disable=too-many-locals, too-many-argument len(ptlims["rawyields"]) - 1, ptlims["rawyields"], ) - hnorm = TH1F( - "hnorm", - "hnorm", - 1, - 0, - 1 - ) + hnorm = TH1F("hnorm", "hnorm", 1, 0, 1) - for i_pt, (ptmin, ptmax) in enumerate( - zip(ptlims["rawyields"][:-1], ptlims["rawyields"][1:]) - ): + for i_pt, (ptmin, ptmax) in enumerate(zip(ptlims["rawyields"][:-1], ptlims["rawyields"][1:])): pt_cent = (ptmax + ptmin) / 2 pt_delta = ptmax - ptmin rawy = histos["rawyields"].GetBinContent(i_pt + 1) rawy_unc = histos["rawyields"].GetBinError(i_pt + 1) eff_times_acc_prompt = histos["acceffp"].GetBinContent(i_pt + 1) eff_times_acc_nonprompt = histos["acceffnp"].GetBinContent(i_pt + 1) - ptmin_fonll = ( - histos["FONLL"]["nonprompt"]["central"].GetXaxis().FindBin(ptmin * 1.0001) - ) - ptmax_fonll = ( - histos["FONLL"]["nonprompt"]["central"].GetXaxis().FindBin(ptmax * 0.9999) - ) + ptmin_fonll = histos["FONLL"]["nonprompt"]["central"].GetXaxis().FindBin(ptmin * 1.0001) + ptmax_fonll = histos["FONLL"]["nonprompt"]["central"].GetXaxis().FindBin(ptmax * 0.9999) crosssec_nonprompt_fonll = [ - histos["FONLL"]["nonprompt"][pred].Integral( - ptmin_fonll, ptmax_fonll, "width" - ) - / (ptmax - ptmin) + histos["FONLL"]["nonprompt"][pred].Integral(ptmin_fonll, ptmax_fonll, "width") / (ptmax - ptmin) for pred in histos["FONLL"]["nonprompt"] ] # compute prompt fraction - frac = [0,0,0] + frac = [0, 0, 0] if frac_method == "Nb": frac = compute_fraction_nb( # BR already included in FONLL prediction rawy, @@ -229,10 +199,7 @@ def hf_pt_spectrum(channel, # pylint: disable=too-many-locals, too-many-argument ) elif frac_method == "fc": crosssec_prompt_fonll = [ - histos["FONLL"]["prompt"][pred].Integral( - ptmin_fonll, ptmax_fonll, "width" - ) - / (ptmax - ptmin) + histos["FONLL"]["prompt"][pred].Integral(ptmin_fonll, ptmax_fonll, "width") / (ptmax - ptmin) for pred in histos["FONLL"]["prompt"] ] frac, _ = compute_fraction_fc( @@ -266,12 +233,10 @@ def hf_pt_spectrum(channel, # pylint: disable=too-many-locals, too-many-argument if frac_method != "ext": output_prompt.append(frac[0]) gfraction.SetPoint(i_pt, pt_cent, frac[0]) - gfraction.SetPointError( - i_pt, pt_delta / 2, pt_delta / 2, frac[0] - frac[1], frac[2] - frac[0] - ) + gfraction.SetPointError(i_pt, pt_delta / 2, pt_delta / 2, frac[0] - frac[1], frac[2] - frac[0]) c = TCanvas("c", "c", 600, 800) - c.Divide (1, 2) + c.Divide(1, 2) c.cd(1) gPad.SetLogy(True) hptspectrum.Draw() @@ -292,7 +257,7 @@ def hf_pt_spectrum(channel, # pylint: disable=too-many-locals, too-many-argument for _, value in histos.items(): if isinstance(value, TH1): value.Write() - #else: + # else: # for flav in histos[hist]: # for pred in histos[hist][flav]: # histos[hist][flav][pred].Write() diff --git a/machine_learning_hep/logger.py b/machine_learning_hep/logger.py index 99d25ac9fb..088aabc450 100644 --- a/machine_learning_hep/logger.py +++ b/machine_learning_hep/logger.py @@ -15,6 +15,7 @@ """ Methods 
to: provide and manage central logging utility """ + import logging import sys from copy import copy @@ -24,39 +25,42 @@ class ExitHandler(logging.Handler): """ Add custom logging handler to exit on certain logging level """ + def emit(self, record): logging.shutdown() sys.exit(1) + class MLLoggerFormatter(logging.Formatter): """ A custom formatter that colors the levelname on request """ + # color names to indices color_map = { - 'black': 0, - 'red': 1, - 'green': 2, - 'yellow': 3, - 'blue': 4, - 'magenta': 5, - 'cyan': 6, - 'white': 7, + "black": 0, + "red": 1, + "green": 2, + "yellow": 3, + "blue": 4, + "magenta": 5, + "cyan": 6, + "white": 7, } level_map = { - logging.DEBUG: (None, 'blue', False), - logging.INFO: (None, 'green', False), - logging.WARNING: (None, 'yellow', False), - logging.ERROR: (None, 'red', False), - logging.CRITICAL: ('red', 'white', True), + logging.DEBUG: (None, "blue", False), + logging.INFO: (None, "green", False), + logging.WARNING: (None, "yellow", False), + logging.ERROR: (None, "red", False), + logging.CRITICAL: ("red", "white", True), } - csi = '\x1b[' - reset = '\x1b[0m' + csi = "\x1b[" + reset = "\x1b[0m" # Define default format string - def __init__(self, fmt=None, datefmt=None, style='%', color=False): - fmt = fmt or '%(levelname)s %(asctime)s - %(pathname)s:%(lineno)d:\n ↳ %(message)s' + def __init__(self, fmt=None, datefmt=None, style="%", color=False): + fmt = fmt or "%(levelname)s %(asctime)s - %(pathname)s:%(lineno)d:\n ↳ %(message)s" logging.Formatter.__init__(self, fmt, datefmt, style) self.color = color @@ -78,11 +82,11 @@ def format(self, record): if fg in self.color_map: params.append(str(self.color_map[fg] + 30)) if bold: - params.append('1') + params.append("1") if params: - cached_record.levelname = "".join((self.csi, ';'.join(params), "m", - cached_record.levelname, - self.reset)) + cached_record.levelname = "".join( + (self.csi, ";".join(params), "m", cached_record.levelname, self.reset) + ) return logging.Formatter.format(self, cached_record) @@ -98,8 +102,9 @@ def configure_logger(debug, logfile=None, quiet=False): logger.setLevel(logging.DEBUG if debug else logging.INFO) sh = logging.StreamHandler() - formatter = MLLoggerFormatter(color=lambda : getattr(sh.stream, 'isatty', None), - fmt = '%(levelname)s ➞ %(message)s' if quiet else None) + formatter = MLLoggerFormatter( + color=lambda: getattr(sh.stream, "isatty", None), fmt="%(levelname)s ➞ %(message)s" if quiet else None + ) sh.setFormatter(formatter) logger.addHandler(sh) diff --git a/machine_learning_hep/ml_get_data.py b/machine_learning_hep/ml_get_data.py index 4c26748bef..f67935a928 100644 --- a/machine_learning_hep/ml_get_data.py +++ b/machine_learning_hep/ml_get_data.py @@ -12,25 +12,27 @@ ## along with this program. if not, see . 
## ############################################################################# -import sys -import subprocess -import os import errno +import os +import subprocess +import sys +from argparse import ArgumentParser from shutil import rmtree from tempfile import mkdtemp -from argparse import ArgumentParser DEFAULT_DEST = "~/.machine_learning_hep/data/inputroot" SOURCE = "https://www.dropbox.com/sh/a9zviv7fz0dv7co/AABMNfZWzxUFUd8VszbAwlSRa?dl=1" + def main(): argp = ArgumentParser(description="Download or update input data for MachineLearningHEP") - argp.add_argument("--verbose", dest="verbose", default=False, action="store_true", - help="Be verbose") - argp.add_argument("--clean", dest="clean", default=False, action="store_true", - help="Remove old data before downloading") - argp.add_argument("--dest", dest="dest", default=DEFAULT_DEST, - help=f"Where to download input data (defaults to {DEFAULT_DEST})") + argp.add_argument("--verbose", dest="verbose", default=False, action="store_true", help="Be verbose") + argp.add_argument( + "--clean", dest="clean", default=False, action="store_true", help="Remove old data before downloading" + ) + argp.add_argument( + "--dest", dest="dest", default=DEFAULT_DEST, help=f"Where to download input data (defaults to {DEFAULT_DEST})" + ) args = argp.parse_args() args.dest = os.path.expanduser(args.dest) diff --git a/machine_learning_hep/mlperformance.py b/machine_learning_hep/mlperformance.py index 60c35dd798..b60e180d7a 100644 --- a/machine_learning_hep/mlperformance.py +++ b/machine_learning_hep/mlperformance.py @@ -15,19 +15,20 @@ """ Methods to: model performance evaluation """ + import itertools -import pandas as pd -import numpy as np + import matplotlib.pyplot as plt +import numpy as np +import pandas as pd import seaborn as sn -from sklearn.model_selection import cross_val_score, cross_val_predict, train_test_split -from sklearn.model_selection import StratifiedKFold -from sklearn.metrics import roc_curve, auc, roc_auc_score, confusion_matrix, precision_recall_curve -from sklearn.metrics import mean_squared_error +from sklearn.metrics import auc, confusion_matrix, mean_squared_error, precision_recall_curve, roc_auc_score, roc_curve +from sklearn.model_selection import StratifiedKFold, cross_val_predict, cross_val_score, train_test_split from machine_learning_hep.utilities_plot import prepare_fig -HIST_COLORS = ['r', 'b', 'g'] +HIST_COLORS = ["r", "b", "g"] + def cross_validation_mse(names_, classifiers_, x_train, y_train, nkfolds, ncores, continuous=False): df_scores = pd.DataFrame() @@ -35,8 +36,7 @@ def cross_validation_mse(names_, classifiers_, x_train, y_train, nkfolds, ncores if "Keras" in name: ncores = 1 cv = nkfolds if continuous else StratifiedKFold(n_splits=nkfolds, shuffle=True) - scores = cross_val_score(clf, x_train, y_train, cv=cv, - scoring="neg_mean_squared_error", n_jobs=ncores) + scores = cross_val_score(clf, x_train, y_train, cv=cv, scoring="neg_mean_squared_error", n_jobs=ncores) tree_rmse_scores = np.sqrt(-scores) df_scores[name] = tree_rmse_scores return df_scores @@ -44,9 +44,9 @@ def cross_validation_mse(names_, classifiers_, x_train, y_train, nkfolds, ncores def plot_cross_validation_mse(names_, df_scores_, suffix_, folder): figure, nrows, ncols = prepare_fig(len(names_)) - for ind, name in enumerate(names_, start = 1): + for ind, name in enumerate(names_, start=1): ax = plt.subplot(nrows, ncols, ind) - ax.set_xlim([0, (df_scores_[name].mean()*2)]) + ax.set_xlim([0, (df_scores_[name].mean() * 2)]) 
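cross_validation_mse above wraps sklearn's cross_val_score with the "neg_mean_squared_error" scorer, so the scores come back negative and are sign-flipped before taking the square root. A minimal usage sketch on synthetic data (the model and data are placeholders):

    import numpy as np
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import StratifiedKFold, cross_val_score

    rng = np.random.default_rng(42)
    x = rng.normal(size=(200, 4))
    y = (x[:, 0] + 0.5 * rng.normal(size=200) > 0).astype(int)

    cv = StratifiedKFold(n_splits=5, shuffle=True)
    scores = cross_val_score(RandomForestClassifier(n_estimators=50), x, y,
                             cv=cv, scoring="neg_mean_squared_error")
    print(np.sqrt(-scores))  # one RMSE value per fold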
plt.hist(df_scores_[name].values, color="b") mystring = f"$\\mu={df_scores_[name].mean():8.2f}, \\sigma={df_scores_[name].std():8.2f}$" ax.text(0.1, 4.0, mystring, fontsize=25) @@ -54,38 +54,34 @@ def plot_cross_validation_mse(names_, df_scores_, suffix_, folder): ax.set_xlabel("scores RMSE", fontsize=30) ax.set_ylabel("Entries", fontsize=30) ax.set_ylim(0, 5) - figure.savefig(f"{folder}/scoresRME{suffix_}.png", bbox_inches='tight') + figure.savefig(f"{folder}/scoresRME{suffix_}.png", bbox_inches="tight") plt.close(figure) def plot_distribution_target(names_, testset, myvariablesy, suffix_, folder): figure, nrows, ncols = prepare_fig(len(names_)) - for ind, name in enumerate(names_, start = 1): + for ind, name in enumerate(names_, start=1): ax = plt.subplot(nrows, ncols, ind) - plt.hist(testset[myvariablesy].values, - color="b", bins=100, label="true value") - plt.hist(testset[f"y_test_prediction{name}"].values, - color="r", bins=100, label="predicted value") + plt.hist(testset[myvariablesy].values, color="b", bins=100, label="true value") + plt.hist(testset[f"y_test_prediction{name}"].values, color="r", bins=100, label="predicted value") ax.set_title(name, fontsize=30) ax.set_xlabel(myvariablesy, fontsize=30) ax.set_ylabel("Entries", fontsize=30) plt.legend(loc="center right") - figure.savefig(f"{folder}/distributionregression{suffix_}.png", bbox_inches='tight') + figure.savefig(f"{folder}/distributionregression{suffix_}.png", bbox_inches="tight") plt.close(figure) def plot_scatter_target(names_, testset, myvariablesy, suffix_, folder): figure, nrows, ncols = prepare_fig(len(names_)) - for ind, name in enumerate(names_, start = 1): + for ind, name in enumerate(names_, start=1): ax = plt.subplot(nrows, ncols, ind) - plt.scatter( - testset[myvariablesy].values, - testset[f"y_test_prediction{name}"].values, color="b") + plt.scatter(testset[myvariablesy].values, testset[f"y_test_prediction{name}"].values, color="b") ax.set_title(name, fontsize=30) ax.set_xlabel(f"{myvariablesy} true", fontsize=30) ax.set_ylabel(f"{myvariablesy} predicted", fontsize=30) ax.tick_params(labelsize=20) - figure.savefig(f"{folder}/scatterplotregression{suffix_}.png", bbox_inches='tight') + figure.savefig(f"{folder}/scatterplotregression{suffix_}.png", bbox_inches="tight") plt.close(figure) @@ -93,7 +89,7 @@ def confusion(names_, classifiers_, suffix_, x_train, y_train, cvgen, folder, do figure, nrows, ncols = prepare_fig(len(names_)) if len(names_) > 1: figure.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=0.4, hspace=0.2) - for ind, (name, clf) in enumerate(zip(names_, classifiers_), start = 1): + for ind, (name, clf) in enumerate(zip(names_, classifiers_), start=1): ax = plt.subplot(nrows, ncols, ind) y_train_pred = cross_val_predict(clf, x_train, y_train, cv=cvgen) conf_mx = confusion_matrix(y_train, y_train_pred) @@ -106,36 +102,43 @@ def confusion(names_, classifiers_, suffix_, x_train, y_train, cvgen, folder, do ax_title = f"{name} tot diag = 0" if do_diag0 else name ax.set_title(ax_title) sn.heatmap(df_cm, annot=True, annot_kws={"size": 16}) # font size - ax.set_xlabel('Predicted labels') - ax.set_ylabel('True labels') - ax.xaxis.set_ticklabels(['signal', 'background']) - ax.yaxis.set_ticklabels(['signal', 'background']) + ax.set_xlabel("Predicted labels") + ax.set_ylabel("True labels") + ax.xaxis.set_ticklabels(["signal", "background"]) + ax.yaxis.set_ticklabels(["signal", "background"]) suffix_0 = "_Diag0" if do_diag0 else "" - 
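When do_diag0 is set, the confusion matrix above is row-normalized and its diagonal zeroed, so the off-diagonal mistakes remain visible even when the classifier is mostly correct. The same transformation in isolation, with synthetic labels:

    import numpy as np
    from sklearn.metrics import confusion_matrix

    y_true = np.array([0, 0, 0, 1, 1, 1, 1, 0, 1, 0])
    y_pred = np.array([0, 0, 1, 1, 1, 0, 1, 0, 1, 0])

    conf = confusion_matrix(y_true, y_pred).astype(float)
    conf /= conf.sum(axis=1, keepdims=True)  # normalize each true-label row
    np.fill_diagonal(conf, 0.0)              # keep only the mistakes
    print(conf)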
figure.savefig(f"{folder}/confusion_matrix{suffix_}{suffix_0}.png", bbox_inches='tight') + figure.savefig(f"{folder}/confusion_matrix{suffix_}{suffix_0}.png", bbox_inches="tight") plt.close(figure) -def plot_precision_recall(names_, classifiers_, suffix_, x_train, y_train, y_train_onehot, - nkfolds, folder, class_labels): +def plot_precision_recall( + names_, classifiers_, suffix_, x_train, y_train, y_train_onehot, nkfolds, folder, class_labels +): def do_plot_precision_recall(y_truth, y_score, label, color): precisions, recalls, thresholds = precision_recall_curve(y_truth, y_score) - plt.plot(thresholds, precisions[:-1], color=color, ls="--", - label=f"Precision {label} = TP/(TP+FP)", linewidth=5.0) - plt.plot(thresholds, recalls[:-1], color=color, ls="-", alpha=0.5, - label=f"Recall {label} = TP/(TP+FN)", linewidth=5.0) + plt.plot( + thresholds, precisions[:-1], color=color, ls="--", label=f"Precision {label} = TP/(TP+FP)", linewidth=5.0 + ) + plt.plot( + thresholds, + recalls[:-1], + color=color, + ls="-", + alpha=0.5, + label=f"Recall {label} = TP/(TP+FN)", + linewidth=5.0, + ) figure, nrows, ncols = prepare_fig(len(names_)) - for ind, (name, clf) in enumerate(zip(names_, classifiers_), start = 1): + for ind, (name, clf) in enumerate(zip(names_, classifiers_), start=1): ax = plt.subplot(nrows, ncols, ind) y_score = cross_val_predict(clf, x_train, y_train, cv=nkfolds, method="predict_proba") if len(class_labels) == 2: do_plot_precision_recall(y_train, y_score[:, 1], "signal", HIST_COLORS[0]) else: for cls_hyp, (label_hyp, color) in enumerate(zip(class_labels, HIST_COLORS)): - do_plot_precision_recall(y_train_onehot.iloc[:, cls_hyp], y_score[:, cls_hyp], - label_hyp, color) - do_plot_precision_recall(y_train_onehot.to_numpy().ravel(), y_score.ravel(), - "average", "black") + do_plot_precision_recall(y_train_onehot.iloc[:, cls_hyp], y_score[:, cls_hyp], label_hyp, color) + do_plot_precision_recall(y_train_onehot.to_numpy().ravel(), y_score.ravel(), "average", "black") ax.set_xlabel("Probability", fontsize=30) ax.set_ylabel("Precision or Recall", fontsize=30) @@ -143,20 +146,18 @@ def do_plot_precision_recall(y_truth, y_score, label, color): ax.legend(loc="best", frameon=False, fontsize=25) ax.set_ylim([0, 1]) ax.tick_params(labelsize=20) - figure.savefig(f"{folder}/precision_recall{suffix_}.png", bbox_inches='tight') + figure.savefig(f"{folder}/precision_recall{suffix_}.png", bbox_inches="tight") plt.close(figure) -def plot_roc_ovr(names_, classifiers_, suffix_, x_train, y_train, - nkfolds, folder, class_labels, save=True): +def plot_roc_ovr(names_, classifiers_, suffix_, x_train, y_train, nkfolds, folder, class_labels, save=True): def plot_roc(y_truth, y_score, name, label, color): fpr, tpr, _ = roc_curve(y_truth, y_score) roc_auc = auc(fpr, tpr) - plt.plot(fpr, tpr, f"{color}-", label=f"ROC {name} {label} vs rest, "\ - f"AUC = {roc_auc:.2f}", linewidth=5.0) + plt.plot(fpr, tpr, f"{color}-", label=f"ROC {name} {label} vs rest, AUC = {roc_auc:.2f}", linewidth=5.0) figure, nrows, ncols = prepare_fig(len(names_)) - for ind, (name, clf) in enumerate(zip(names_, classifiers_), start = 1): + for ind, (name, clf) in enumerate(zip(names_, classifiers_), start=1): ax = plt.subplot(nrows, ncols, ind) y_score = cross_val_predict(clf, x_train, y_train, cv=nkfolds, method="predict_proba") for cls_hyp, (label_hyp, color) in enumerate(zip(class_labels, HIST_COLORS)): @@ -170,17 +171,16 @@ def plot_roc(y_truth, y_score, name, label, color): ax.tick_params(labelsize=20) if save: - 
figure.savefig(f"{folder}/ROC_OvR_{suffix_}.png", bbox_inches='tight') + figure.savefig(f"{folder}/ROC_OvR_{suffix_}.png", bbox_inches="tight") plt.close(figure) return figure -def plot_roc_ovo(names_, classifiers_, suffix_, x_train, y_train, - nkfolds, folder, class_labels, save=True): +def plot_roc_ovo(names_, classifiers_, suffix_, x_train, y_train, nkfolds, folder, class_labels, save=True): if len(class_labels) <= 2: raise ValueError("ROC OvO cannot be computed for binary classification") figure, nrows, ncols = prepare_fig(len(names_)) - for ind, (name, clf) in enumerate(zip(names_, classifiers_), start = 1): + for ind, (name, clf) in enumerate(zip(names_, classifiers_), start=1): ax = plt.subplot(nrows, ncols, ind) y_score = cross_val_predict(clf, x_train, y_train, cv=nkfolds, method="predict_proba") label_pairs = itertools.combinations(class_labels, 2) @@ -192,11 +192,16 @@ def plot_roc_ovo(names_, classifiers_, suffix_, x_train, y_train, mask = y_train == ind_lab fpr, tpr, _ = roc_curve(mask[mask_or], y_score[mask_or, ind_lab]) roc_auc = auc(fpr, tpr) - plt.plot(fpr, tpr, f"{color}-", alpha=alpha, label=f"ROC "\ - f"{label_pair[ind]} vs {label_pair[1-ind]} (AUC = {roc_auc:.2f})", - linewidth=5.0) - global_roc_auc = roc_auc_score(y_train, y_score, average="macro", multi_class='ovo') - plt.plot([], [], ' ', label=f'Unweighted average OvO ROC AUC: {global_roc_auc:.2f}') + plt.plot( + fpr, + tpr, + f"{color}-", + alpha=alpha, + label=f"ROC {label_pair[ind]} vs {label_pair[1 - ind]} (AUC = {roc_auc:.2f})", + linewidth=5.0, + ) + global_roc_auc = roc_auc_score(y_train, y_score, average="macro", multi_class="ovo") + plt.plot([], [], " ", label=f"Unweighted average OvO ROC AUC: {global_roc_auc:.2f}") ax.set_xlabel("First class efficiency", fontsize=30) ax.set_ylabel("Second class efficiency", fontsize=30) ax.set_title(f"ROC one vs. 
one {name}", fontsize=30) @@ -205,35 +210,50 @@ def plot_roc_ovo(names_, classifiers_, suffix_, x_train, y_train, ax.set_ylim([-0.05, 1.05]) ax.tick_params(labelsize=20) if save: - figure.savefig(f"{folder}/ROC_OvO_{suffix_}.png", bbox_inches='tight') + figure.savefig(f"{folder}/ROC_OvO_{suffix_}.png", bbox_inches="tight") plt.close(figure) return figure -def roc_train_test(names_, classifiers_, suffix_, x_train, y_train, x_test, y_test, # pylint: disable=too-many-arguments - nkfolds, folder, class_labels, binlims, roc_type): +def roc_train_test( + names_, + classifiers_, + suffix_, + x_train, + y_train, + x_test, + y_test, # pylint: disable=too-many-arguments + nkfolds, + folder, + class_labels, + binlims, + roc_type, +): binmin, binmax = binlims if roc_type not in ("OvR", "OvO"): raise ValueError("ROC type can be only OvR or OvO") roc_fun = plot_roc_ovr if roc_type == "OvR" else plot_roc_ovo - fig_train = roc_fun(names_, classifiers_, suffix_, x_train, y_train, - nkfolds, folder, class_labels, save=False) - fig_test = roc_fun(names_, classifiers_, suffix_, x_test, y_test, - nkfolds, folder, class_labels, save=False) + fig_train = roc_fun(names_, classifiers_, suffix_, x_train, y_train, nkfolds, folder, class_labels, save=False) + fig_test = roc_fun(names_, classifiers_, suffix_, x_test, y_test, nkfolds, folder, class_labels, save=False) figure, nrows, ncols = prepare_fig(len(names_)) - for ind, (ax_train, ax_test) in enumerate(zip(fig_train.get_axes(), fig_test.get_axes()), - start = 1): + for ind, (ax_train, ax_test) in enumerate(zip(fig_train.get_axes(), fig_test.get_axes()), start=1): ax = plt.subplot(nrows, ncols, ind) for roc_train, roc_test in zip(ax_train.lines, ax_test.lines): - for roc_t, set_name, ls in zip((roc_train, roc_test), ("train", "test"), - ("-", "-.")): + for roc_t, set_name, ls in zip((roc_train, roc_test), ("train", "test"), ("-", "-.")): if "average" in roc_t.get_label(): - plt.plot([], [], ' ', label=f"{roc_t.get_label()}, {set_name} set") + plt.plot([], [], " ", label=f"{roc_t.get_label()}, {set_name} set") else: - plt.plot(roc_t.get_xdata(), roc_t.get_ydata(), lw=roc_t.get_lw(), - c=roc_t.get_c(), alpha=roc_t.get_alpha(), marker=roc_t.get_marker(), - linestyle=ls, label=f"{roc_t.get_label()}, {set_name} set") + plt.plot( + roc_t.get_xdata(), + roc_t.get_ydata(), + lw=roc_t.get_lw(), + c=roc_t.get_c(), + alpha=roc_t.get_alpha(), + marker=roc_t.get_marker(), + linestyle=ls, + label=f"{roc_t.get_label()}, {set_name} set", + ) ax.set_xlabel(ax_train.get_xlabel(), fontsize=30) ax.set_ylabel(ax_train.get_ylabel(), fontsize=30) ax.legend(loc="lower right", frameon=False, fontsize=25) @@ -241,11 +261,16 @@ def roc_train_test(names_, classifiers_, suffix_, x_train, y_train, x_test, y_te ax.set_ylim([-0.05, 1.05]) ax.tick_params(labelsize=20) - ax.text(0.7, 0.8, - f" ${binmin} < p_\\mathrm{{T}}/(\\mathrm{{GeV}}/c) < {binmax}$", - verticalalignment="center", transform=ax.transAxes, fontsize=30) + ax.text( + 0.7, + 0.8, + f" ${binmin} < p_\\mathrm{{T}}/(\\mathrm{{GeV}}/c) < {binmax}$", + verticalalignment="center", + transform=ax.transAxes, + fontsize=30, + ) - figure.savefig(f"{folder}/ROCtraintest_{roc_type}_{suffix_}.png", bbox_inches='tight') + figure.savefig(f"{folder}/ROCtraintest_{roc_type}_{suffix_}.png", bbox_inches="tight") plt.close(figure) plt.close(fig_train) plt.close(fig_test) @@ -255,9 +280,9 @@ def plot_learning_curves(names_, classifiers_, suffix_, folder, x_data, y_data, x_train, x_val, y_train, y_val = train_test_split(x_data, y_data, test_size=0.2) 
high = len(x_train) low = 100 - step_ = int((high-low)/npoints) + step_ = int((high - low) / npoints) figure, nrows, ncols = prepare_fig(len(names_)) - for ind, (name, clf) in enumerate(zip(names_, classifiers_), start = 1): + for ind, (name, clf) in enumerate(zip(names_, classifiers_), start=1): ax = plt.subplot(nrows, ncols, ind) train_errors, val_errors = [], [] arrayvalues = np.arange(start=low, stop=high, step=step_) @@ -273,33 +298,38 @@ def plot_learning_curves(names_, classifiers_, suffix_, folder, x_data, y_data, ax.set_ylabel("MSE", fontsize=30) ax.set_title(f"Learning curve {name}", fontsize=30) ax.legend(loc="best", frameon=False, fontsize=25) - ax.set_ylim([0, np.amax(np.sqrt(val_errors))*2]) + ax.set_ylim([0, np.amax(np.sqrt(val_errors)) * 2]) ax.tick_params(labelsize=20) - figure.savefig(f"{folder}/learning_curve{suffix_}.png", bbox_inches='tight') + figure.savefig(f"{folder}/learning_curve{suffix_}.png", bbox_inches="tight") plt.close(figure) -def plot_model_pred(names, classifiers, suffix, x_train, y_train, x_test, y_test, folder, - class_labels, bins=50): +def plot_model_pred(names, classifiers, suffix, x_train, y_train, x_test, y_test, folder, class_labels, bins=50): for name, clf in zip(names, classifiers): predict_probs_train = clf.predict_proba(x_train) predict_probs_test = clf.predict_proba(x_test) for cls_hyp, label_hyp in enumerate(class_labels): figure = plt.figure(figsize=(10, 8)) for cls_true, (label, color) in enumerate(zip(class_labels, HIST_COLORS)): - plt.hist(predict_probs_train[y_train == cls_true, cls_hyp], - color=color, alpha=0.5, range=[0, 1], bins=bins, - histtype='stepfilled', density=True, label=f'{label}, train') + plt.hist( + predict_probs_train[y_train == cls_true, cls_hyp], + color=color, + alpha=0.5, + range=[0, 1], + bins=bins, + histtype="stepfilled", + density=True, + label=f"{label}, train", + ) predicted_probs = predict_probs_test[y_test == cls_true, cls_hyp] hist, bins = np.histogram(predicted_probs, bins=bins, range=[0, 1], density=True) scale = len(predicted_probs) / sum(hist) err = np.sqrt(hist * scale) / scale center = (bins[:-1] + bins[1:]) / 2 - plt.errorbar(center, hist, yerr=err, fmt='o', c=color, label=f'{label}, test') + plt.errorbar(center, hist, yerr=err, fmt="o", c=color, label=f"{label}, test") plt.xlabel(f"ML score for {label_hyp}", fontsize=15) plt.ylabel("Counts (arb. 
units)", fontsize=15) plt.legend(loc="best", frameon=False, fontsize=15) plt.yscale("log") - figure.savefig(f"{folder}/ModelOutDistr_{label_hyp}_{name}_{suffix}.png", - bbox_inches='tight') + figure.savefig(f"{folder}/ModelOutDistr_{label_hyp}_{name}_{suffix}.png", bbox_inches="tight") plt.close(figure) diff --git a/machine_learning_hep/models.py b/machine_learning_hep/models.py index 0c30c0c0ad..4222b569b1 100644 --- a/machine_learning_hep/models.py +++ b/machine_learning_hep/models.py @@ -17,26 +17,27 @@ load and save ML models obtain control plots """ + # pylint: disable=too-many-branches -from os.path import exists import pickle -import pandas as pd -import numpy as np -import matplotlib.pyplot as plt +from os.path import exists + import matplotlib as mpl +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import shap from matplotlib.colors import ListedColormap - from sklearn.feature_extraction import DictVectorizer -import shap - +from machine_learning_hep import templates_keras, templates_scikit, templates_xgboost from machine_learning_hep.logger import get_logger from machine_learning_hep.utilities_plot import prepare_fig -from machine_learning_hep import templates_keras, templates_xgboost, templates_scikit + pd.options.mode.chained_assignment = None -def getclf_scikit(model_config): +def getclf_scikit(model_config): logger = get_logger() logger.debug("Load scikit models") @@ -56,8 +57,7 @@ def getclf_scikit(model_config): c_bayesian = f"{c}_bayesian_opt" bayes_opt = None if hasattr(templates_scikit, c_bayesian): - bayes_opt = getattr(templates_scikit, c_bayesian) \ - (model_config["scikit"][c]["central_params"]) + bayes_opt = getattr(templates_scikit, c_bayesian)(model_config["scikit"][c]["central_params"]) bayesian_opt.append(bayes_opt) classifiers.append(model) names.append(c) @@ -70,7 +70,6 @@ def getclf_scikit(model_config): def getclf_xgboost(model_config): - logger = get_logger() logger.debug("Load xgboost models") @@ -90,8 +89,7 @@ def getclf_xgboost(model_config): c_bayesian = f"{c}_bayesian_opt" bayes_opt = None if hasattr(templates_xgboost, c_bayesian): - bayes_opt = getattr(templates_xgboost, c_bayesian) \ - (model_config["xgboost"][c]["central_params"]) + bayes_opt = getattr(templates_xgboost, c_bayesian)(model_config["xgboost"][c]["central_params"]) bayesian_opt.append(bayes_opt) classifiers.append(model) names.append(c) @@ -104,7 +102,6 @@ def getclf_xgboost(model_config): def getclf_keras(model_config, length_input): - logger = get_logger() logger.debug("Load keras models") @@ -119,25 +116,24 @@ def getclf_keras(model_config, length_input): for c in model_config["keras"]: if model_config["keras"][c]["activate"]: try: - model = getattr(templates_keras, c)(model_config["keras"][c]["central_params"], - length_input) + model = getattr(templates_keras, c)(model_config["keras"][c]["central_params"], length_input) classifiers.append(model) c_bayesian = f"{c}_bayesian_opt" bayes_opt = None if hasattr(templates_keras, c_bayesian): - bayes_opt = getattr(templates_keras, c_bayesian) \ - (model_config["keras"][c]["central_params"], length_input) + bayes_opt = getattr(templates_keras, c_bayesian)( + model_config["keras"][c]["central_params"], length_input + ) bayesian_opt.append(bayes_opt) names.append(c) logger.info("Added keras model %s", c) except AttributeError: logger.critical("Could not load keras model %s", c) - #logger.critical("Some reason") + # logger.critical("Some reason") return classifiers, names, [], bayesian_opt - def fit(names_, 
classifiers_, x_train_, y_train_): trainedmodels_ = [] for _, clf in zip(names_, classifiers_): @@ -151,15 +147,15 @@ def apply(ml_type, names_, trainedmodels_, test_set_, mylistvariables_, labels_= if len(test_set_[mylistvariables_]) == 0: logger.warning("Empty dataframe provided.") - if ml_type == "BinaryClassification": + if ml_type == "BinaryClassification": for name in names_: - test_set_[f"y_test_prediction{name}"]=0 - test_set_[f"y_test_prob{name}"]=0 + test_set_[f"y_test_prediction{name}"] = 0 + test_set_[f"y_test_prob{name}"] = 0 return test_set_ - if ml_type == "MultiClassification": + if ml_type == "MultiClassification": for name in names_: for pred, lab in enumerate(labels_): - safe_lab = lab.replace('-', '_') + safe_lab = lab.replace("-", "_") if pred == 0: # bkg cuts work differently test_set_[f"y_test_prediction{name}{safe_lab}"] = 1.1 @@ -180,9 +176,8 @@ def apply(ml_type, names_, trainedmodels_, test_set_, mylistvariables_, labels_= elif ml_type == "MultiClassification" and labels_ is not None: for pred, lab in enumerate(labels_): # pandas query() used in further analysis cannot accept '-' in column names - safe_lab = lab.replace('-', '_') - test_set_[f"y_test_prob{name}{safe_lab}"] = pd.Series(y_test_prob[:, pred], - index=test_set_.index) + safe_lab = lab.replace("-", "_") + test_set_[f"y_test_prob{name}{safe_lab}"] = pd.Series(y_test_prob[:, pred], index=test_set_.index) else: logger.fatal("Incorrect settings for chosen mltype") return test_set_ @@ -194,41 +189,45 @@ def savemodels(names_, trainedmodels_, folder_, suffix_): architecture_file = f"{folder_}/{name}{suffix_}_architecture.json" weights_file = f"{folder_}/{name}{suffix_}_weights.h5" arch_json = model.model.to_json() - with open(architecture_file, 'w', encoding='utf-8') as json_file: + with open(architecture_file, "w", encoding="utf-8") as json_file: json_file.write(arch_json) model.model.save_weights(weights_file) if "scikit" in name: fileoutmodel = f"{folder_}/{name}{suffix_}.sav" - with open(fileoutmodel, 'wb') as out_file: + with open(fileoutmodel, "wb") as out_file: pickle.dump(model, out_file, protocol=4) if "xgboost" in name: fileoutmodel = f"{folder_}/{name}{suffix_}.sav" - with open(fileoutmodel, 'wb') as out_file: + with open(fileoutmodel, "wb") as out_file: pickle.dump(model, out_file, protocol=4) fileoutmodel = fileoutmodel.replace(".sav", ".model") model.save_model(fileoutmodel) + def readmodels(names_, folder_, suffix_): trainedmodels_ = [] for name in names_: - fileinput = folder_+"/"+name+suffix_+".sav" + fileinput = folder_ + "/" + name + suffix_ + ".sav" if not exists(fileinput): return None - with open(fileinput, 'rb') as input_file: + with open(fileinput, "rb") as input_file: model = pickle.load(input_file) trainedmodels_.append(model) return trainedmodels_ def importanceplotall(mylistvariables_, names_, trainedmodels_, suffix_, folder): - names_models = [(name, model) for name, model in zip(names_, trainedmodels_) \ - if not any(mname in name for mname in ("SVC", "Logistic", "Keras"))] + names_models = [ + (name, model) + for name, model in zip(names_, trainedmodels_) + if not any(mname in name for mname in ("SVC", "Logistic", "Keras")) + ] figure, nrows, ncols = prepare_fig(len(names_models)) for ind, (name, model) in enumerate(names_models, start=1): ax = plt.subplot(nrows, ncols, ind) feature_importances_ = model.feature_importances_ y_pos = np.arange(len(mylistvariables_)) - ax.barh(y_pos, feature_importances_, align='center', color='green') + ax.barh(y_pos, feature_importances_, 
align="center", color="green") ax.set_yticks(y_pos) ax.set_yticklabels(mylistvariables_, fontsize=17) ax.invert_yaxis() # labels read top-to-bottom @@ -236,9 +235,10 @@ def importanceplotall(mylistvariables_, names_, trainedmodels_, suffix_, folder) ax.set_title(f"Importance features {name}", fontsize=17) ax.xaxis.set_tick_params(labelsize=17) plt.xlim(0, 0.7) - figure.savefig(f"{folder}/importance_{suffix_}.png", bbox_inches='tight') + figure.savefig(f"{folder}/importance_{suffix_}.png", bbox_inches="tight") plt.close() + def shap_study(names_, trainedmodels_, suffix_, x_train_, folder, class_labels, plot_options_): """Importance via SHAP @@ -255,8 +255,7 @@ def shap_study(names_, trainedmodels_, suffix_, x_train_, folder, class_labels, """ mpl.rcParams.update({"text.usetex": True}) plot_type_name = "prob_cut_scan" - plot_options = plot_options_.get(plot_type_name, {}) \ - if isinstance(plot_options_, dict) else {} + plot_options = plot_options_.get(plot_type_name, {}) if isinstance(plot_options_, dict) else {} feature_names = [] for fn in x_train_.columns: if fn in plot_options and "xlabel" in plot_options[fn]: @@ -265,41 +264,50 @@ def shap_study(names_, trainedmodels_, suffix_, x_train_, folder, class_labels, feature_names.append(fn.replace("_", ":")) # Rely on name to exclude certain models at the moment - names_models = [(name, model) for name, model in zip(names_, trainedmodels_) \ - if not any(mname in name for mname in ("SVC", "Logistic", "Keras"))] + names_models = [ + (name, model) + for name, model in zip(names_, trainedmodels_) + if not any(mname in name for mname in ("SVC", "Logistic", "Keras")) + ] figure, nrows, ncols = prepare_fig(len(names_models)) for ind, (name, model) in enumerate(names_models, start=1): ax = figure.add_subplot(nrows, ncols, ind) plt.sca(ax) explainer = shap.TreeExplainer(model) shap_values = explainer.shap_values(x_train_) - shap.summary_plot(shap_values, x_train_, show=False, feature_names=feature_names, - class_names=class_labels, class_inds="original") + shap.summary_plot( + shap_values, + x_train_, + show=False, + feature_names=feature_names, + class_names=class_labels, + class_inds="original", + ) if len(class_labels) > 2: for ind, label in enumerate(class_labels): fig_class, _, _ = prepare_fig(1) - shap.summary_plot(shap_values[ind], x_train_, show=False, - feature_names=feature_names, class_names=class_labels) - fig_class.savefig(f"{folder}/importance_shap_{name}_{label}_{suffix_}.png", - bbox_inches='tight') + shap.summary_plot( + shap_values[ind], x_train_, show=False, feature_names=feature_names, class_names=class_labels + ) + fig_class.savefig(f"{folder}/importance_shap_{name}_{label}_{suffix_}.png", bbox_inches="tight") plt.close(fig_class) - figure.savefig(f"{folder}/importance_shap_{suffix_}.png", bbox_inches='tight') + figure.savefig(f"{folder}/importance_shap_{suffix_}.png", bbox_inches="tight") mpl.rcParams.update({"text.usetex": False}) plt.close(figure) def decisionboundaries(names_, trainedmodels_, suffix_, x_train_, y_train_, folder): mylistvariables_ = x_train_.columns.tolist() - dictionary_train = x_train_.to_dict(orient='records') + dictionary_train = x_train_.to_dict(orient="records") vec = DictVectorizer() x_train_array_ = vec.fit_transform(dictionary_train).toarray() - height = .10 + height = 0.10 cm = plt.cm.RdBu - cm_bright = ListedColormap(['#FF0000', '#0000FF']) + cm_bright = ListedColormap(["#FF0000", "#0000FF"]) - x_min, x_max = x_train_array_[:, 0].min() - .5, x_train_array_[:, 0].max() + .5 - y_min, y_max = 
x_train_array_[:, 1].min() - .5, x_train_array_[:, 1].max() + .5 + x_min, x_max = x_train_array_[:, 0].min() - 0.5, x_train_array_[:, 0].max() + 0.5 + y_min, y_max = x_train_array_[:, 1].min() - 0.5, x_train_array_[:, 1].max() + 0.5 xx, yy = np.meshgrid(np.arange(x_min, x_max, height), np.arange(y_min, y_max, height)) figure, nrows, ncols = prepare_fig(len(names_)) @@ -311,17 +319,22 @@ def decisionboundaries(names_, trainedmodels_, suffix_, x_train_, y_train_, fold ax = plt.subplot(nrows, ncols, ind) z_contour = z_contour.reshape(xx.shape) - ax.contourf(xx, yy, z_contour, cmap=cm, alpha=.8) + ax.contourf(xx, yy, z_contour, cmap=cm, alpha=0.8) # Plot also the training points - ax.scatter(x_train_array_[:, 0], x_train_array_[:, 1], - c=y_train_, cmap=cm_bright, edgecolors='k', alpha=0.3) + ax.scatter(x_train_array_[:, 0], x_train_array_[:, 1], c=y_train_, cmap=cm_bright, edgecolors="k", alpha=0.3) ax.set_xlim(xx.min(), xx.max()) ax.set_ylim(yy.min(), yy.max()) score = model.score(x_train_, y_train_) - ax.text(xx.max() - .3, yy.min() + .3, (f"accuracy={score:.2f}").lstrip('0'), - size=15, horizontalalignment='right', verticalalignment='center') + ax.text( + xx.max() - 0.3, + yy.min() + 0.3, + (f"accuracy={score:.2f}").lstrip("0"), + size=15, + horizontalalignment="right", + verticalalignment="center", + ) ax.set_title(name, fontsize=17) ax.set_ylabel(mylistvariables_[1], fontsize=17) ax.set_xlabel(mylistvariables_[0], fontsize=17) - figure.savefig(f"{folder}/decisionboundaries{suffix_}.png", bbox_inches='tight') + figure.savefig(f"{folder}/decisionboundaries{suffix_}.png", bbox_inches="tight") plt.close(figure) diff --git a/machine_learning_hep/multiprocesser.py b/machine_learning_hep/multiprocesser.py index 6cf88f206e..cfe999c2d5 100755 --- a/machine_learning_hep/multiprocesser.py +++ b/machine_learning_hep/multiprocesser.py @@ -15,13 +15,16 @@ """ main script for doing data processing, machine learning and analysis """ + import os import tempfile -from machine_learning_hep.utilities import merge_method, mergerootfiles -from machine_learning_hep.io import parse_yaml, dump_yaml_from_dict + +from machine_learning_hep.io import dump_yaml_from_dict, parse_yaml from machine_learning_hep.logger import get_logger +from machine_learning_hep.utilities import merge_method, mergerootfiles + -class MultiProcesser: # pylint: disable=too-many-instance-attributes, too-many-statements, consider-using-f-string, too-many-branches +class MultiProcesser: # pylint: disable=too-many-instance-attributes, too-many-statements, consider-using-f-string, too-many-branches species = "multiprocesser" logger = get_logger() @@ -45,7 +48,7 @@ def __init__(self, case, proc_class, datap, typean, run_param, mcordata): self.p_nptbins = len(datap["sel_skim_binmax"]) self.p_dofullevtmerge = datap["dofullevtmerge"] - #directories + # directories self.dlper_root = [] self.dlper_pkl = [] self.dlper_pklsk = [] @@ -63,7 +66,7 @@ def __init__(self, case, proc_class, datap, typean, run_param, mcordata): self.d_pklevt_mergedallp = self.d_prefix + os.path.expandvars(dp["pkl_evtcounter_all"]) self.dlper_mcreweights = datap["multi"][self.mcordata]["mcreweights"] - #namefiles pkl + # namefiles pkl self.v_var_binning = datap["var_binning"] self.n_reco = datap["files_names"]["namefile_reco"] self.n_evt = datap["files_names"]["namefile_evt"] @@ -71,31 +74,30 @@ def __init__(self, case, proc_class, datap, typean, run_param, mcordata): self.n_evt_count_ml = datap["files_names"].get("namefile_evt_count", "evtcount.yaml") self.n_gen = 
datap["files_names"]["namefile_gen"] self.n_mcreweights = datap["files_names"]["namefile_mcweights"] - self.lpt_recosk = [self.n_reco.replace(".p", "_%s%d_%d.p" % \ - (self.v_var_binning, self.lpt_anbinmin[i], self.lpt_anbinmax[i])) \ - for i in range(self.p_nptbins)] - self.lpt_gensk = [self.n_gen.replace(".p", "_%s%d_%d.p" % \ - (self.v_var_binning, self.lpt_anbinmin[i], self.lpt_anbinmax[i])) \ - for i in range(self.p_nptbins)] - self.lptper_recoml = [[os.path.join(direc, self.lpt_recosk[ipt]) \ - for direc in self.dlper_pklml] \ - for ipt in range(self.p_nptbins)] - self.lper_evt_count_ml = [os.path.join(direc, self.n_evt_count_ml) \ - for direc in self.dlper_pklml] - self.lptper_genml = [[os.path.join(direc, self.lpt_gensk[ipt]) \ - for direc in self.dlper_pklml] \ - for ipt in range(self.p_nptbins)] - self.lpt_recoml_mergedallp = \ - [os.path.join(self.d_pklml_mergedallp, self.lpt_recosk[ipt]) \ - for ipt in range(self.p_nptbins)] - self.lpt_genml_mergedallp = \ - [os.path.join(self.d_pklml_mergedallp, self.lpt_gensk[ipt]) \ - for ipt in range(self.p_nptbins)] - self.f_evtml_count = \ - os.path.join(self.d_pklml_mergedallp, self.n_evt_count_ml) + self.lpt_recosk = [ + self.n_reco.replace(".p", "_%s%d_%d.p" % (self.v_var_binning, self.lpt_anbinmin[i], self.lpt_anbinmax[i])) + for i in range(self.p_nptbins) + ] + self.lpt_gensk = [ + self.n_gen.replace(".p", "_%s%d_%d.p" % (self.v_var_binning, self.lpt_anbinmin[i], self.lpt_anbinmax[i])) + for i in range(self.p_nptbins) + ] + self.lptper_recoml = [ + [os.path.join(direc, self.lpt_recosk[ipt]) for direc in self.dlper_pklml] for ipt in range(self.p_nptbins) + ] + self.lper_evt_count_ml = [os.path.join(direc, self.n_evt_count_ml) for direc in self.dlper_pklml] + self.lptper_genml = [ + [os.path.join(direc, self.lpt_gensk[ipt]) for direc in self.dlper_pklml] for ipt in range(self.p_nptbins) + ] + self.lpt_recoml_mergedallp = [ + os.path.join(self.d_pklml_mergedallp, self.lpt_recosk[ipt]) for ipt in range(self.p_nptbins) + ] + self.lpt_genml_mergedallp = [ + os.path.join(self.d_pklml_mergedallp, self.lpt_gensk[ipt]) for ipt in range(self.p_nptbins) + ] + self.f_evtml_count = os.path.join(self.d_pklml_mergedallp, self.n_evt_count_ml) self.lper_evt = [os.path.join(direc, self.n_evt) for direc in self.dlper_pkl] - self.lper_evtorig = \ - [os.path.join(direc, self.n_evtorig) for direc in self.dlper_pkl] + self.lper_evtorig = [os.path.join(direc, self.n_evtorig) for direc in self.dlper_pkl] dp = datap["mlapplication"][self.mcordata] self.dlper_reco_modapp = [self.d_prefix_app + p for p in dp["pkl_skimmed_dec"]] @@ -106,34 +108,44 @@ def __init__(self, case, proc_class, datap, typean, run_param, mcordata): self.d_resultsallp = self.d_prefix_res + os.path.expandvars(dp["resultsallp"]) self.f_evt_mergedallp = os.path.join(self.d_pklevt_mergedallp, self.n_evt) - self.f_evtorig_mergedallp = \ - os.path.join(self.d_pklevt_mergedallp, self.n_evtorig) + self.f_evtorig_mergedallp = os.path.join(self.d_pklevt_mergedallp, self.n_evtorig) self.lper_runlistrigger = datap["analysis"][self.typean][self.mcordata]["runselection"] self.lper_mcreweights = None if self.mcordata == "mc": - self.lper_mcreweights = [os.path.join(direc, self.n_mcreweights) - for direc in self.dlper_mcreweights] + self.lper_mcreweights = [os.path.join(direc, self.n_mcreweights) for direc in self.dlper_mcreweights] self.process_listsample = [] for indexp in range(self.prodnumber): - if self.select_period[indexp]>0: - myprocess = proc_class(self.case, self.datap, self.run_param, 
self.mcordata, - self.p_maxfiles[indexp], self.dlper_root[indexp], - self.dlper_pkl[indexp], self.dlper_pklsk[indexp], - self.dlper_pklml[indexp], - self.p_period[indexp], indexp, self.p_chunksizeunp[indexp], - self.p_chunksizeskim[indexp], self.p_nparall, - self.p_fracmerge[indexp], self.p_seedmerge[indexp], - self.dlper_reco_modapp[indexp], - self.dlper_reco_modappmerged[indexp], - self.d_results[indexp], self.typean, - self.lper_runlistrigger[indexp], \ - self.dlper_mcreweights[indexp]) + if self.select_period[indexp] > 0: + myprocess = proc_class( + self.case, + self.datap, + self.run_param, + self.mcordata, + self.p_maxfiles[indexp], + self.dlper_root[indexp], + self.dlper_pkl[indexp], + self.dlper_pklsk[indexp], + self.dlper_pklml[indexp], + self.p_period[indexp], + indexp, + self.p_chunksizeunp[indexp], + self.p_chunksizeskim[indexp], + self.p_nparall, + self.p_fracmerge[indexp], + self.p_seedmerge[indexp], + self.dlper_reco_modapp[indexp], + self.dlper_reco_modappmerged[indexp], + self.d_results[indexp], + self.typean, + self.lper_runlistrigger[indexp], + self.dlper_mcreweights[indexp], + ) self.process_listsample.append(myprocess) else: - self.logger.info('Period [%s] excluded from the analysis', self.p_period[indexp]) + self.logger.info("Period [%s] excluded from the analysis", self.p_period[indexp]) continue self.n_filemass = datap["files_names"]["histofilename"] @@ -204,7 +216,7 @@ def multi_histomass(self): for indexp, _ in enumerate(self.process_listsample): if self.p_useperiod[indexp] == 1: self.process_listsample[indexp].process_histomass() - self.logger.debug('merging all') + self.logger.debug("merging all") with tempfile.TemporaryDirectory() as tmp_merged_dir: mergerootfiles(self.lper_filemass, self.filemass_mergedall, tmp_merged_dir) diff --git a/machine_learning_hep/optimisation/bayesian_opt.py b/machine_learning_hep/optimisation/bayesian_opt.py index fcb14499e2..e809322b2c 100644 --- a/machine_learning_hep/optimisation/bayesian_opt.py +++ b/machine_learning_hep/optimisation/bayesian_opt.py @@ -12,57 +12,53 @@ ## along with this program. if not, see . ## ############################################################################# +import pickle import sys -from os.path import join -from numbers import Number from copy import copy -import pickle -import numpy as np +from numbers import Number +from os.path import join + +import matplotlib import matplotlib.pyplot as plt +import numpy as np +from hyperopt import STATUS_OK, fmin, tpe from matplotlib.lines import Line2D -import matplotlib - -from yaml.representer import RepresenterError - from sklearn.model_selection import cross_validate - -from hyperopt import fmin, tpe, STATUS_OK +from yaml.representer import RepresenterError # from shap.plots.colors import red_blue as shap_cmap_red_blue - -from machine_learning_hep.io import dump_yaml_from_dict, parse_yaml, dict_yamlable +from machine_learning_hep.io import dict_yamlable, dump_yaml_from_dict, parse_yaml # Change to that backend to not have problems with saving fgures # when X11 connection got lost matplotlib.use("agg") -class BayesianOpt: #pylint: disable=too-many-instance-attributes +class BayesianOpt: # pylint: disable=too-many-instance-attributes """Base/utilitiy class for Bayesian model optimisation - This class utilises the hyperopt package to perform Bayesian model optimisation independent - of the concrete ML model. - The central method is "optimise" which soleyly relies on getting a model configured with - the new parameters. 
A method method to obtain a new model can either be implemented by
-    deriving this class and overwrite "yield_model_" or by passing a lambda as the
-    "yield_model" argument when calling "optimise".
-    Additionally, the best model is automatically saved when either "save_model_" is
-    overwritten or a lambda is passed to the "save_model" argument in optimise.
-
-    Optimisation is done "self.n_trials" times and for each trial a Cross Validation is done
-    with "self.nkfolds" folds.
-
-    Scoring functions can be freely defined in contained in the dictionary "self.scoring" and
-    the optimisation is done according to the scoring function with key "self.scoring_opt".
-    Note, that the underlying optimisation procedure is a minimisation. Hence, when a maximum
-    score is the best one, "self.low_is_better" must be set to False.
-
-    All parameters and scores can be written to a YAML file and the field "best_index"
-    specifies the best model wrt the best test score.
+    This class utilises the hyperopt package to perform Bayesian model optimisation independent
+    of the concrete ML model.
+    The central method is "optimise" which solely relies on getting a model configured with
+    the new parameters. A method to obtain a new model can either be implemented by
+    deriving this class and overwriting "yield_model_" or by passing a lambda as the
+    "yield_model" argument when calling "optimise".
+    Additionally, the best model is automatically saved when either "save_model_" is
+    overwritten or a lambda is passed to the "save_model" argument in optimise.
+
+    Optimisation is done "self.n_trials" times and for each trial a Cross Validation is done
+    with "self.nkfolds" folds.
+
+    Scoring functions can be freely defined in the dictionary "self.scoring" and
+    the optimisation is done according to the scoring function with key "self.scoring_opt".
+    Note that the underlying optimisation procedure is a minimisation. Hence, when a maximum
+    score is the best one, "self.low_is_better" must be set to False.
+
+    All parameters and scores can be written to a YAML file and the field "best_index"
+    specifies the best model w.r.t. the best test score.
    """

    def __init__(self, model_config, space):
-
        # Train samples
        self.x_train = None
        self.y_train = None
@@ -119,10 +115,8 @@ def __init__(self, model_config, space):
        self.fit_pool = []
        self.trial_id = 0

-
    def reset(self):
-        """Reset to default
-        """
+        """Reset to default"""

        self.min_score = None
        self.results = []
@@ -134,8 +128,7 @@ def reset(self):
        self.best_scores = None
        self.trial_id = 0
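
    # As an illustration of the hook below: a concrete subclass might draw the
    # freshly configured parameters and return a ready-to-fit estimator together
    # with them, e.g. (a minimal sketch assuming XGBoost; any estimator with a
    # scikit-learn interface would do):
    #
    #     from xgboost import XGBClassifier
    #
    #     class XGBoostBayesianOpt(BayesianOpt):
    #         def yield_model_(self, model_config, space):
    #             params = self.next_params(space)
    #             return XGBClassifier(**params), params

-
    def yield_model_(self, model_config, space): # pylint: disable=unused-argument, useless-return, no-self-use
        """Yield next model

        Next model constructed from space. 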
To be overwritten for concrete implementation @@ -149,7 +142,6 @@ def yield_model_(self, model_config, space): # pylint: disable=unused-argument, print("yield_model_ not implemented...") return None, None - def next_params(self, space_drawn): """Yield next set of parameters @@ -165,7 +157,6 @@ def next_params(self, space_drawn): config[key] = value return config - def trial_(self, space_drawn): """Default single trial @@ -185,18 +176,24 @@ def trial_(self, space_drawn): if self.yield_model_custom: model, params = self.yield_model_custom(self.model_config, space_drawn) else: - model, params = self.yield_model_(self.model_config, space_drawn) # pylint: disable=assignment-from-none + model, params = self.yield_model_(self.model_config, space_drawn) # pylint: disable=assignment-from-none # Collect parameters - #self.params.append(params) + # self.params.append(params) # Do cross validation for this model - res = cross_validate(model, self.x_train, self.y_train, cv=self.nkfolds, - scoring=self.scoring, n_jobs=self.ncores, return_train_score=True) + res = cross_validate( + model, + self.x_train, + self.y_train, + cv=self.nkfolds, + scoring=self.scoring, + n_jobs=self.ncores, + return_train_score=True, + ) return res, model, params - def trial(self, space_drawn): """One trial @@ -214,7 +211,7 @@ def trial(self, space_drawn): # Collect results res_tmp = {} for t in ("train", "test"): - for sc in self.scoring: # pylint: disable=not-an-iterable + for sc in self.scoring: # pylint: disable=not-an-iterable res_tmp[f"{t}_{sc}"] = float(np.mean(res[f"{t}_{sc}"])) res_tmp[f"{t}_{sc}_std"] = float(np.std(res[f"{t}_{sc}"])) self.results.append(res_tmp) @@ -230,12 +227,10 @@ def trial(self, space_drawn): if not self.low_is_better: score = -score - if self.min_score is None or score < self.min_score: - - if self.score_train_test_diff is None or \ - (self.score_train_test_diff > 0. and \ - rel_train_test < self.score_train_test_diff): + if self.score_train_test_diff is None or ( + self.score_train_test_diff > 0.0 and rel_train_test < self.score_train_test_diff + ): self.min_score = score self.best = model self.best_index = len(self.params) - 1 @@ -244,10 +239,8 @@ def trial(self, space_drawn): return {"loss": score, "status": STATUS_OK} - def finalise(self): - """Finalising... 
- """ + """Finalising...""" # Reset number of cores self.ncores = 20 @@ -257,7 +250,6 @@ def finalise(self): print("Fit best model to whole dataset") self.best.fit(self.x_train, self.y_train) - def optimise(self, yield_model=None, save_model=None, space=None, ncores=None): """Do Bayesian optimisation @@ -307,21 +299,20 @@ def optimise(self, yield_model=None, save_model=None, space=None, ncores=None): else: self.finalise() - def make_results(self): - """Helper function to make dictionary of parameters and results - """ + """Helper function to make dictionary of parameters and results""" params_tmp = [dict_yamlable(p) for p in self.params] - return {"cv": self.results, - "params": params_tmp, - "best_index": self.best_index, - "best_params": dict_yamlable(self.best_params), - "best_scores": self.best_scores, - "score_names": list(self.scoring.keys()), - "score_opt_name": self.scoring_opt} - - - def save_model_(self, model, out_dir): # pylint: disable=unused-argument, no-self-use + return { + "cv": self.results, + "params": params_tmp, + "best_index": self.best_index, + "best_params": dict_yamlable(self.best_params), + "best_scores": self.best_scores, + "score_names": list(self.scoring.keys()), + "score_opt_name": self.scoring_opt, + } + + def save_model_(self, model, out_dir): # pylint: disable=unused-argument, no-self-use """Save a model Routine to save a model, to be implemented for concrete model @@ -329,10 +320,8 @@ def save_model_(self, model, out_dir): # pylint: disable=unused-argument, no-sel """ print("save_model_ not implemented") - def save(self, out_dir, best_only=True): - """Save paramaters/results and best model - """ + """Save paramaters/results and best model""" results = self.make_results() try: @@ -342,10 +331,9 @@ def save(self, out_dir, best_only=True): try: pickle.dump(results, open(join(out_dir, "results.pkl"), "wb")) - except Exception: #pylint: disable=broad-except + except Exception: # pylint: disable=broad-except print("Cannot pickle optimisation results") - save_func = self.save_model_ print(f"Save best model from Bayesian opt at {out_dir}") if self.yield_model_custom and self.save_model_custom: @@ -358,9 +346,7 @@ def save(self, out_dir, best_only=True): out_dir_model = join(out_dir, f"model_{i}") save_func(m, out_dir_model) - - def __extract_param_evolution(self): # pylint: disable=too-many-branches - + def __extract_param_evolution(self): # pylint: disable=too-many-branches def __extract_branches(search, branch_list, __branch=None): """helper function to collect all branches in dictionary @@ -382,7 +368,6 @@ def __extract_branches(search, branch_list, __branch=None): elif branch_tmp not in branch_list: branch_list.append(branch_tmp) - # First, actually collect all parameters param_fields = [] for p in self.params: @@ -392,14 +377,12 @@ def __extract_branches(search, branch_list, __branch=None): # more complex values params_tmp = [dict_yamlable(p) for p in self.params] - # Collect parameters as # [{"branch": branch, "iterations": iterations, "values": values, "mapping": mapping}, ...] 
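
        # For illustration (hypothetical numbers): a parameter "max_depth" drawn as
        # 5, 3 and 5 in the first three trials would be collected as
        #     {"branch": "max_depth", "iterations": [0, 1, 2], "values": [5, 3, 5]}
        # while string-valued parameters additionally get the "mapping" from value
        # to integer mentioned above so that they can be placed on a numeric axis.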
params_extracted = [] # Go through all branches for pf in param_fields: - x_axis_vals = [] y_axis_vals = [] @@ -424,9 +407,7 @@ def __extract_branches(search, branch_list, __branch=None): x_axis_vals.append(i) y_axis_vals.append(curr_val) - params_extracted.append({"branch": pf, - "iterations": x_axis_vals, - "values": y_axis_vals}) + params_extracted.append({"branch": pf, "iterations": x_axis_vals, "values": y_axis_vals}) if not x_axis_vals: # Usually, that should not happen and at least one value should have been found @@ -458,7 +439,6 @@ def __extract_branches(search, branch_list, __branch=None): return params_extracted - def __plot_parameter_violins(self, out_dir): """plot violin for each parameter @@ -478,7 +458,6 @@ def __adjacent_values(vals, q1_, q3_): return lower_adjacent_value, upper_adjacent_value for p in self.__extract_param_evolution(): - if not p["iterations"]: # nothing to plot continue @@ -492,22 +471,25 @@ def __adjacent_values(vals, q1_, q3_): # violin plot, based on # https://matplotlib.org/3.1.0/gallery/statistics/customized_violin.html - parts = ax.violinplot([y_axis_vals], showmeans=False, showmedians=False, - showextrema=False) - for pc in parts['bodies']: - pc.set_facecolor('#00DDFF') - pc.set_edgecolor('#0C00BA') + parts = ax.violinplot([y_axis_vals], showmeans=False, showmedians=False, showextrema=False) + for pc in parts["bodies"]: + pc.set_facecolor("#00DDFF") + pc.set_edgecolor("#0C00BA") pc.set_alpha(0.2) quartile1, medians, quartile3 = np.percentile([y_axis_vals], [25, 50, 75], axis=1) - whiskers = np.array([__adjacent_values(vals_array, q1, q3) \ - for vals_array, q1, q3 in zip([y_axis_vals], quartile1, quartile3)]) + whiskers = np.array( + [ + __adjacent_values(vals_array, q1, q3) + for vals_array, q1, q3 in zip([y_axis_vals], quartile1, quartile3) + ] + ) whiskers_min, whiskers_max = whiskers[:, 0], whiskers[:, 1] inds = np.arange(1, len(medians) + 1) - ax.scatter(inds, medians, marker='o', color='white', s=40, zorder=3) - ax.vlines(inds, quartile1, quartile3, color='k', linestyle='-', lw=6) - ax.vlines(inds, whiskers_min, whiskers_max, color='k', linestyle='-', lw=3) + ax.scatter(inds, medians, marker="o", color="white", s=40, zorder=3) + ax.vlines(inds, quartile1, quartile3, color="k", linestyle="-", lw=6) + ax.vlines(inds, whiskers_min, whiskers_max, color="k", linestyle="-", lw=3) ax.set_xlabel(name, fontsize=20) ax.set_ylabel("values", fontsize=20) @@ -521,7 +503,6 @@ def __adjacent_values(vals, q1_, q3_): fig.savefig(out_file) plt.close(fig) - def __plot_parameters_shap_like(self, out_dir): # Compute optimal score average and range test_scores = [r[f"test_{self.scoring_opt}"] for r in self.results] @@ -533,7 +514,7 @@ def __plot_parameters_shap_like(self, out_dir): def __map_value(old_value, old_min, old_max, new_min=0, new_max=1): if old_min == old_max: - return (new_max - new_min) / 2. 
+ return (new_max - new_min) / 2.0 return (((old_value - old_min) * (new_max - new_min)) / (old_max - old_min)) + new_min param_evolution = self.__extract_param_evolution() @@ -549,8 +530,17 @@ def __map_value(old_value, old_min, old_max, new_min=0, new_max=1): mapped_vals = [__map_value(v, val_min, val_max) for v in pe["values"]] - ax.scatter(x_vals, [i] * len(x_vals), s=markersize, alpha=0.5, cmap=shap_cmap_red_blue, - c=mapped_vals, zorder=3, lw=0, rasterized=len(mapped_vals) > 100) + ax.scatter( + x_vals, + [i] * len(x_vals), + s=markersize, + alpha=0.5, + cmap=shap_cmap_red_blue, + c=mapped_vals, + zorder=3, + lw=0, + rasterized=len(mapped_vals) > 100, + ) # draw line for average score ax.axvline(np.mean(test_scores), color="gray") @@ -570,7 +560,6 @@ def __map_value(old_value, old_min, old_max, new_min=0, new_max=1): fig.savefig(out_file) plt.close(fig) - def __plot_parameter_evolutions(self, out_dir): """plot evolution of all parameters @@ -585,7 +574,6 @@ def __plot_parameter_evolutions(self, out_dir): params_evolution = self.__extract_param_evolution() for p in params_evolution: - if not p["iterations"]: # nothing to plot continue @@ -625,8 +613,7 @@ def __plot_parameter_evolutions(self, out_dir): fig.savefig(out_file) plt.close(fig) - - def __plot_summary(self, out_dir, from_yaml=None, from_pickle=None):# pylint: disable=too-many-statements + def __plot_summary(self, out_dir, from_yaml=None, from_pickle=None): # pylint: disable=too-many-statements """Plot results Results are plotted to out_dir/results.png @@ -654,7 +641,6 @@ def __plot_summary(self, out_dir, from_yaml=None, from_pickle=None):# pylint: di scores_tmp = read_yaml["score_names"] score_opt_tmp = read_yaml["score_opt_name"] - # Re-arrange such that always the optimisation score is on top score_names = list(scores_tmp) del score_names[score_names.index(score_opt_tmp)] @@ -662,8 +648,7 @@ def __plot_summary(self, out_dir, from_yaml=None, from_pickle=None):# pylint: di # Prepare figrue and axes figsize = (35, 18 * len(score_names)) - fig, axes = plt.subplots(len(score_names), 1, sharex=True, gridspec_kw={"hspace": 0.05}, - figsize=figsize) + fig, axes = plt.subplots(len(score_names), 1, sharex=True, gridspec_kw={"hspace": 0.05}, figsize=figsize) # If only one score is given, need to make it iterable try: @@ -683,20 +668,27 @@ def __plot_summary(self, out_dir, from_yaml=None, from_pickle=None):# pylint: di markerstyle = markerstyles[i % len(markerstyles)] means[tt] = [r[f"{tt}_{sn}"] for r in results_tmp] stds = [r[f"{tt}_{sn}_std"] for r in results_tmp] - ax.errorbar(range(len(means[tt])), means[tt], yerr=stds, ls="", - marker=markerstyle, markersize=markersize, label=f"{sn} ({tt})") + ax.errorbar( + range(len(means[tt])), + means[tt], + yerr=stds, + ls="", + marker=markerstyle, + markersize=markersize, + label=f"{sn} ({tt})", + ) # Relative deviations between test and train index_high_score = means["test"].index(max(means["test"])) - dev_high_score = \ - abs(means["test"][index_high_score] - means["train"][index_high_score]) \ - / means["test"][index_high_score] + dev_high_score = ( + abs(means["test"][index_high_score] - means["train"][index_high_score]) + / means["test"][index_high_score] + ) index_low_score = means["test"].index(min(means["test"])) - dev_low_score = \ - abs(means["test"][index_low_score] - means["train"][index_low_score]) \ - / means["test"][index_low_score] - dev_min = [abs(test - train) / test \ - for train, test in zip(means["train"], means["test"])] + dev_low_score = ( + 
abs(means["test"][index_low_score] - means["train"][index_low_score]) / means["test"][index_low_score] + ) + dev_min = [abs(test - train) / test for train, test in zip(means["train"], means["test"])] index_min = dev_min.index(min(dev_min)) dev_min = min(dev_min) @@ -714,12 +706,22 @@ def __plot_summary(self, out_dir, from_yaml=None, from_pickle=None):# pylint: di if axi == 0: # Add another legend for highest, lowest score and min. rel. deviation between # test and train score - handles = [Line2D([0], [0], color="red"), - Line2D([0], [0], color="blue"), - Line2D([0], [0], color="green")] + handles = [ + Line2D([0], [0], color="red"), + Line2D([0], [0], color="blue"), + Line2D([0], [0], color="green"), + ] labels = ["highest test score", "lowest test score", "min. rel deviation"] - ax.legend(handles, labels, bbox_to_anchor=(0., 1.02, 1., .102), loc='lower left', - ncol=3, mode="expand", borderaxespad=0., fontsize=20) + ax.legend( + handles, + labels, + bbox_to_anchor=(0.0, 1.02, 1.0, 0.102), + loc="lower left", + ncol=3, + mode="expand", + borderaxespad=0.0, + fontsize=20, + ) # Add back first legend ax.add_artist(leg) @@ -733,8 +735,6 @@ def __plot_summary(self, out_dir, from_yaml=None, from_pickle=None):# pylint: di fig.savefig(out_file) plt.close(fig) - - def plot(self, out_dir, from_yaml=None, from_pickle=None): """Plot results diff --git a/machine_learning_hep/optimisation/grid_search.py b/machine_learning_hep/optimisation/grid_search.py index 118e16be71..93fd4d89bb 100644 --- a/machine_learning_hep/optimisation/grid_search.py +++ b/machine_learning_hep/optimisation/grid_search.py @@ -15,17 +15,20 @@ """ Methods to do grid-search hyper-parameters optimization """ -from os.path import join as osjoin + import itertools import pickle -import pandas as pd +from os.path import join as osjoin + import matplotlib.pyplot as plt +import pandas as pd from sklearn.model_selection import GridSearchCV + +from machine_learning_hep.io import dump_yaml_from_dict, parse_yaml, print_dict from machine_learning_hep.logger import get_logger -from machine_learning_hep.utilities import openfile -from machine_learning_hep.io import print_dict, dump_yaml_from_dict, parse_yaml from machine_learning_hep.models import savemodels from machine_learning_hep.optimisation.metrics import get_scorers +from machine_learning_hep.utilities import openfile def do_gridsearch(names, classifiers, grid_params, x_train, y_train, nkfolds, out_dirs, ncores=-1): @@ -61,9 +64,16 @@ def do_gridsearch(names, classifiers, grid_params, x_train, y_train, nkfolds, ou # performance scoring = get_scorers(gps["scoring"]) - grid_search = GridSearchCV(clf, gps["params"], cv=nkfolds, refit=gps["refit"], - scoring=scoring, n_jobs=ncores, verbose=2, - return_train_score=True) + grid_search = GridSearchCV( + clf, + gps["params"], + cv=nkfolds, + refit=gps["refit"], + scoring=scoring, + n_jobs=ncores, + verbose=2, + return_train_score=True, + ) grid_search.fit(x_train, y_train) cvres = grid_search.cv_results_ @@ -78,13 +88,12 @@ def do_gridsearch(names, classifiers, grid_params, x_train, y_train, nkfolds, ou # pylint: disable=too-many-locals, too-many-statements def perform_plot_gridsearch(names, out_dirs): - ''' + """ Function for grid scores plotting (working with scikit 0.20) - ''' + """ logger = get_logger() for name, out_dir in zip(names, out_dirs): - # Read written results gps = parse_yaml(osjoin(out_dir, "parameters.yaml")) score_obj = pickle.load(openfile(osjoin(out_dir, "results.pkl"), "rb")) @@ -114,8 +123,7 @@ def 
perform_plot_gridsearch(names, out_dirs): y_axis_mins = {sn: 9999 for sn in score_names} y_axis_maxs = {sn: -9999 for sn in score_names} - for indices, case in zip(itertools.product(*values_indices), - itertools.product(*list(gps["params"].values()))): + for indices, case in zip(itertools.product(*values_indices), itertools.product(*list(gps["params"].values()))): df_case = score_obj.copy() for i_case, i_key in zip(case, param_keys): df_case = df_case.loc[df_case[i_key] == df_case[i_key].dtype.type(i_case)] @@ -134,8 +142,7 @@ def perform_plot_gridsearch(names, out_dirs): # To determine fontsizes later figsize = (35, 18 * len(score_names)) - fig, axes = plt.subplots(len(score_names), 1, sharex=True, gridspec_kw={"hspace": 0.05}, - figsize=figsize) + fig, axes = plt.subplots(len(score_names), 1, sharex=True, gridspec_kw={"hspace": 0.05}, figsize=figsize) ax_plot = dict(zip(score_names, axes)) # The axes to put the parameter list @@ -149,8 +156,8 @@ def perform_plot_gridsearch(names, out_dirs): for sn in score_names: ax = ax_plot[sn] - ax_min = y_axis_mins[sn] - (y_axis_maxs[sn] - y_axis_mins[sn]) / 10. - ax_max = y_axis_maxs[sn] + (y_axis_maxs[sn] - y_axis_mins[sn]) / 10. + ax_min = y_axis_mins[sn] - (y_axis_maxs[sn] - y_axis_mins[sn]) / 10.0 + ax_max = y_axis_maxs[sn] + (y_axis_maxs[sn] - y_axis_mins[sn]) / 10.0 ax.set_ylim(ax_min, ax_max) ax.set_ylabel(f"mean {sn}", fontsize=20) ax.get_yaxis().set_tick_params(labelsize=20) @@ -158,8 +165,15 @@ def perform_plot_gridsearch(names, out_dirs): for j, tt in enumerate(("train", "test")): markerstyle = markerstyles[j % len(markerstyles)] - ax.errorbar(range(len(x_labels)), y_values[sn][tt], yerr=y_errors[sn][tt], - ls="", marker=markerstyle, markersize=markersize, label=f"{sn} ({tt})") + ax.errorbar( + range(len(x_labels)), + y_values[sn][tt], + yerr=y_errors[sn][tt], + ls="", + marker=markerstyle, + markersize=markersize, + label=f"{sn} ({tt})", + ) # Add values to points ylim = ax.get_ylim() diff --git a/machine_learning_hep/optimisation/metrics.py b/machine_learning_hep/optimisation/metrics.py index b0bf93e0ba..7c68fcc6b0 100644 --- a/machine_learning_hep/optimisation/metrics.py +++ b/machine_learning_hep/optimisation/metrics.py @@ -15,7 +15,8 @@ """ Metrics for (ML) optimisation """ -from sklearn.metrics import make_scorer, roc_auc_score, accuracy_score + +from sklearn.metrics import accuracy_score, make_scorer, roc_auc_score def get_scorers(score_names): diff --git a/machine_learning_hep/optimiser.py b/machine_learning_hep/optimiser.py index ef0c676d98..26fa643d7b 100644 --- a/machine_learning_hep/optimiser.py +++ b/machine_learning_hep/optimiser.py @@ -15,46 +15,68 @@ """ main script for doing ml optimisation """ + import copy import os +import pickle import time from math import sqrt -import pickle -import pandas as pd -import numpy as np -import matplotlib.pyplot as plt + import matplotlib as mpl -from sklearn.model_selection import train_test_split -from sklearn.utils import shuffle -from sklearn.preprocessing import label_binarize +import matplotlib.pyplot as plt +import numpy as np import onnx # pylint: disable=import-error -from onnxmltools.convert import convert_xgboost # pylint: disable=import-error +import pandas as pd from onnxconverter_common.data_types import FloatTensorType # pylint: disable=import-error -from ROOT import TFile, TCanvas, TH1F, TF1, gROOT # pylint: disable=import-error,no-name-in-module -from machine_learning_hep.utilities import seldf_singlevar, split_df_classes, createstringselection -from 
machine_learning_hep.utilities import dfquery, mask_df, read_df, write_df -from machine_learning_hep.correlations import vardistplot, scatterplot, correlationmatrix -from machine_learning_hep.models import getclf_scikit, getclf_xgboost, getclf_keras -from machine_learning_hep.models import fit, savemodels, readmodels, apply, decisionboundaries +from onnxmltools.convert import convert_xgboost # pylint: disable=import-error +from ROOT import TF1, TH1F, TCanvas, TFile, gROOT # pylint: disable=import-error,no-name-in-module +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import label_binarize +from sklearn.utils import shuffle + # from machine_learning_hep.root import write_tree import machine_learning_hep.mlperformance as mlhep_plot -from machine_learning_hep.optimisation.grid_search import do_gridsearch, perform_plot_gridsearch -from machine_learning_hep.models import importanceplotall, shap_study -from machine_learning_hep.logger import get_logger import machine_learning_hep.optimization as optz -from machine_learning_hep.correlations import vardistplot_probscan, efficiency_cutscan +from machine_learning_hep.correlations import ( + correlationmatrix, + efficiency_cutscan, + scatterplot, + vardistplot, + vardistplot_probscan, +) +from machine_learning_hep.io import dump_yaml_from_dict, parse_yaml +from machine_learning_hep.logger import get_logger +from machine_learning_hep.models import ( + apply, + decisionboundaries, + fit, + getclf_keras, + getclf_scikit, + getclf_xgboost, + importanceplotall, + readmodels, + savemodels, + shap_study, +) +from machine_learning_hep.optimisation.grid_search import do_gridsearch, perform_plot_gridsearch +from machine_learning_hep.utilities import ( + createstringselection, + dfquery, + mask_df, + read_df, + seldf_singlevar, + split_df_classes, + write_df, +) from machine_learning_hep.utilities_files import checkdirs, checkmakedirlist -from machine_learning_hep.io import parse_yaml, dump_yaml_from_dict # pylint: disable=too-many-instance-attributes, too-many-statements, unbalanced-tuple-unpacking, fixme -class Optimiser: # pylint: disable=too-many-public-methods, consider-using-f-string, unused-argument, too-many-arguments - #Class Attribute +class Optimiser: # pylint: disable=too-many-public-methods, consider-using-f-string, unused-argument, too-many-arguments + # Class Attribute species = "optimiser" - def __init__(self, data_param, case, typean, model_config, binmin, - binmax, multbkg, raahp, training_var, index): - + def __init__(self, data_param, case, typean, model_config, binmin, binmax, multbkg, raahp, training_var, index): self.logger = get_logger() dirprefixdata = data_param["multi"]["data"].get("prefix_dir", "") @@ -63,7 +85,7 @@ def __init__(self, data_param, case, typean, model_config, binmin, dirmcml = dirprefixmc + os.path.expandvars(data_param["multi"]["mc"]["pkl_skimmed_merge_for_ml_all"]) dirdataml = dirprefixdata + os.path.expandvars(data_param["multi"]["data"]["pkl_skimmed_merge_for_ml_all"]) self.v_bin = data_param["var_binning"] - #directory + # directory self.dirmlout = dirprefix_ml + os.path.expandvars(data_param["ml"]["mlout"]) self.dirmlplot = dirprefix_ml + os.path.expandvars(data_param["ml"]["mlplot"]) @@ -72,17 +94,15 @@ def __init__(self, data_param, case, typean, model_config, binmin, self.file_steps_done = os.path.join(self.dirmlout, "steps_done.yaml") if os.path.exists(self.file_steps_done): self.steps_done = parse_yaml(self.file_steps_done)["done"] - if self.steps_done is None \ - and 
(os.listdir(self.dirmlout) or os.listdir(self.dirmlplot)): + if self.steps_done is None and (os.listdir(self.dirmlout) or os.listdir(self.dirmlplot)): # Backwards compatible print(f"rm -r {self.dirmlout}") print(f"rm -r {self.dirmlplot}") - self.logger.fatal("Please remove above directories as indicated above first and " \ - "run again") + self.logger.fatal("Please remove above directories as indicated above first and run again") if self.steps_done is None: self.steps_done = [] - #ml file names + # ml file names self.n_reco = data_param["files_names"]["namefile_reco"] self.n_reco = self.n_reco.replace(".p", "_%s%d_%d.p" % (self.v_bin, binmin, binmax)) self.n_evt = data_param["files_names"]["namefile_evt"] @@ -100,11 +120,11 @@ def __init__(self, data_param, case, typean, model_config, binmin, self.f_evt_count_ml = os.path.join(dirdataml, self.n_evt_count_ml) self.f_reco_applieddata = os.path.join(self.dirmlout, self.n_reco_applieddata) self.f_reco_appliedmc = os.path.join(self.dirmlout, self.n_reco_appliedmc) - #variables + # variables self.v_all = data_param["variables"]["var_all"] self.v_train = training_var self.v_selected = data_param["variables"].get("var_selected", None) - #if self.v_selected: + # if self.v_selected: # self.v_selected = self.v_selected[index] self.v_bound = data_param["variables"]["var_boundaries"] self.v_class = data_param["variables"]["var_class"] @@ -117,7 +137,7 @@ def __init__(self, data_param, case, typean, model_config, binmin, self.v_ismcprompt = data_param["bitmap_sel"]["var_ismcprompt"] self.v_ismcfd = data_param["bitmap_sel"]["var_ismcfd"] self.v_ismcbkg = data_param["bitmap_sel"]["var_ismcbkg"] - #parameters + # parameters self.p_case = case self.p_typean = typean # deep copy as this is modified for each Optimiser instance separately @@ -145,7 +165,7 @@ def __init__(self, data_param, case, typean, model_config, binmin, self.p_class_labels = data_param["ml"]["class_labels"] - #dataframes + # dataframes self.df_mc = None self.df_mcgen = None self.df_data = None @@ -162,23 +182,24 @@ def __init__(self, data_param, case, typean, model_config, binmin, self.df_ytest = None self.df_ytrain_onehot = None self.df_ytest_onehot = None - #selections - self.s_selbkg = data_param["ml"]["sel_bkg"] # used only to calculate significance + # selections + self.s_selbkg = data_param["ml"]["sel_bkg"] # used only to calculate significance self.s_selml = data_param["ml"]["sel_ml"] self.p_equalise_sig_bkg = data_param["ml"].get("equalise_sig_bkg", False) - #model param + # model param self.db_model = model_config self.p_class = None self.p_classname = None self.p_trainedmod = None self.s_suffix = None - #significance + # significance self.is_fonll_from_root = data_param["ml"]["opt"]["isFONLLfromROOT"] self.f_fonll = data_param["ml"]["opt"]["filename_fonll"] if self.is_fonll_from_root and "fonll_particle" not in data_param["ml"]["opt"]: - self.logger.fatal("Attempt to read FONLL from ROOT file but field " \ - "\"fonll_particle\" not provided in database") + self.logger.fatal( + 'Attempt to read FONLL from ROOT file but field "fonll_particle" not provided in database' + ) self.p_fonllparticle = data_param["ml"]["opt"].get("fonll_particle", "") self.p_fonllband = data_param["ml"]["opt"]["fonll_pred"] self.p_fragf = data_param["ml"]["opt"]["FF"] @@ -195,10 +216,9 @@ def __init__(self, data_param, case, typean, model_config, binmin, self.p_presel_gen_eff = data_param["ml"]["opt"]["presel_gen_eff"] # Potentially mask certain values (e.g. 
nsigma TOF of -999) self.p_mask_values = data_param["ml"].get("mask_values", None) - self.p_mass_fit_lim = data_param["analysis"][self.p_typean]['mass_fit_lim'] - self.p_bin_width = data_param["analysis"][self.p_typean]['bin_width'] - self.p_num_bins = int(round((self.p_mass_fit_lim[1] - self.p_mass_fit_lim[0]) / \ - self.p_bin_width)) + self.p_mass_fit_lim = data_param["analysis"][self.p_typean]["mass_fit_lim"] + self.p_bin_width = data_param["analysis"][self.p_typean]["bin_width"] + self.p_num_bins = int(round((self.p_mass_fit_lim[1] - self.p_mass_fit_lim[0]) / self.p_bin_width)) self.p_mass = data_param["mass"] self.p_raahp = raahp self.create_suffix() @@ -207,26 +227,25 @@ def __init__(self, data_param, case, typean, model_config, binmin, self.df_evt_data = None self.df_evttotsample_data = None - self.f_reco_applieddata = \ - self.f_reco_applieddata.replace(".p", "%s.p" % self.s_suffix) - self.f_reco_appliedmc = \ - self.f_reco_appliedmc.replace(".p", "%s.p" % self.s_suffix) + self.f_reco_applieddata = self.f_reco_applieddata.replace(".p", "%s.p" % self.s_suffix) + self.f_reco_appliedmc = self.f_reco_appliedmc.replace(".p", "%s.p" % self.s_suffix) self.f_df_ml_test_to_df = f"{self.dirmlout}/testsample_{self.s_suffix}_mldecision.pkl" self.f_mltest_applied = f"{self.dirmlout}/testsample_{self.s_suffix}_mldecision.pkl" self.df_mltest_applied = None - self.logger.info('training variables: %s', training_var) + self.logger.info("training variables: %s", training_var) def create_suffix(self): string_selection = createstringselection(self.v_bin, self.p_binmin, self.p_binmax) self.s_suffix = f"{self.p_case}_{string_selection}" def prepare_data_mc_mcgen(self): - self.logger.info("Prepare data reco as well as MC reco and gen") - if os.path.exists(self.f_reco_applieddata) \ - and os.path.exists(self.f_reco_appliedmc) \ - and self.step_done("preparemlsamples_data_mc_mcgen"): + if ( + os.path.exists(self.f_reco_applieddata) + and os.path.exists(self.f_reco_appliedmc) + and self.step_done("preparemlsamples_data_mc_mcgen") + ): self.df_data = read_df(self.f_reco_applieddata) self.df_mc = read_df(self.f_reco_appliedmc) else: @@ -247,18 +266,13 @@ def prepare_data_mc_mcgen(self): self.df_mcgen = seldf_singlevar(self.df_mcgen, self.v_bin, self.p_binmin, self.p_binmax) self.df_data = seldf_singlevar(self.df_data, self.v_bin, self.p_binmin, self.p_binmax) - - def preparesample(self): # pylint: disable=too-many-branches + def preparesample(self): # pylint: disable=too-many-branches self.logger.info("Prepare Sample") - filename_train = \ - os.path.join(self.dirmlout, f"df_train_{self.p_binmin}_{self.p_binmax}.pkl") - filename_test = \ - os.path.join(self.dirmlout, f"df_test_{self.p_binmin}_{self.p_binmax}.pkl") + filename_train = os.path.join(self.dirmlout, f"df_train_{self.p_binmin}_{self.p_binmax}.pkl") + filename_test = os.path.join(self.dirmlout, f"df_test_{self.p_binmin}_{self.p_binmax}.pkl") - if os.path.exists(filename_train) \ - and os.path.exists(filename_test) \ - and self.step_done("preparemlsamples"): + if os.path.exists(filename_train) and os.path.exists(filename_test) and self.step_done("preparemlsamples"): self.df_mltrain = read_df(filename_train) self.df_mltest = read_df(filename_test) @@ -268,15 +282,16 @@ def preparesample(self): # pylint: disable=too-many-branches self.dfs_input = {} for ind, label in enumerate(self.p_class_labels): self.dfs_input[label] = self.arraydf[self.p_tags[ind]] - self.dfs_input[label] = seldf_singlevar(self.dfs_input[label], - self.v_bin, self.p_binmin, 
self.p_binmax) + self.dfs_input[label] = seldf_singlevar(self.dfs_input[label], self.v_bin, self.p_binmin, self.p_binmax) self.dfs_input[label] = self.dfs_input[label].query(self.s_selml[ind]) bkg_labels = [lab for lab in self.p_class_labels if lab == "bkg"] if len(bkg_labels) != 1: - self.logger.fatal('No background class or more than one background class. ' \ - 'Make sure you have "bkg" exactly once in your class_labels ' \ - 'in your database') + self.logger.fatal( + "No background class or more than one background class. " + 'Make sure you have "bkg" exactly once in your class_labels ' + "in your database" + ) for var_to_zero in ["ismcsignal", "ismcprompt", "ismcfd", "ismcbkg"]: self.dfs_input[bkg_labels[0]][var_to_zero] = 0 @@ -284,26 +299,24 @@ def preparesample(self): # pylint: disable=too-many-branches min_class_count = min((len(self.dfs_input[label]) for label in self.p_class_labels)) for ind, label in enumerate(self.p_class_labels): self.p_nclasses[ind] = min(min_class_count, self.p_nclasses[ind]) - self.logger.info("Max possible number of equalized samples for %s: %d", - label, self.p_nclasses[ind]) + self.logger.info("Max possible number of equalized samples for %s: %d", label, self.p_nclasses[ind]) for ind, (label, nclass) in enumerate(zip(self.p_class_labels, self.p_nclasses)): - self.dfs_input[label] = shuffle(self.dfs_input[label], - random_state=self.rnd_shuffle) + self.dfs_input[label] = shuffle(self.dfs_input[label], random_state=self.rnd_shuffle) if label == "bkg" and self.p_equalise_sig_bkg: - nclass = nclass*self.p_multbkg + nclass = nclass * self.p_multbkg self.dfs_input[label] = self.dfs_input[label][:nclass] self.dfs_input[label][self.v_class] = ind self.df_ml = pd.concat([self.dfs_input[label] for label in self.p_class_labels]) if self.p_mltype == "MultiClassification": - df_y = label_binarize(self.df_ml[self.v_class], - classes=[*range(len(self.p_class_labels))]) + df_y = label_binarize(self.df_ml[self.v_class], classes=[*range(len(self.p_class_labels))]) for ind, label in enumerate(self.p_class_labels): self.df_ml[f"{self.v_class}_{label}"] = df_y[:, ind] - self.df_mltrain, self.df_mltest = train_test_split(self.df_ml, \ - test_size=self.test_frac, random_state=self.rnd_splt) + self.df_mltrain, self.df_mltest = train_test_split( + self.df_ml, test_size=self.test_frac, random_state=self.rnd_splt + ) self.df_mltrain = self.df_mltrain.reset_index(drop=True) self.df_mltest = self.df_mltest.reset_index(drop=True) @@ -314,11 +327,14 @@ def preparesample(self): # pylint: disable=too-many-branches # Now continue with extracting signal and background stats and report self.dfs_train = split_df_classes(self.df_mltrain, self.v_class, self.p_class_labels) self.dfs_test = split_df_classes(self.df_mltest, self.v_class, self.p_class_labels) - self.logger.info("Total number of candidates: train %d and test %d", len(self.df_mltrain), - len(self.df_mltest)) + self.logger.info("Total number of candidates: train %d and test %d", len(self.df_mltrain), len(self.df_mltest)) for label in self.p_class_labels: - self.logger.info("Number of %s candidates: train %d and test %d", - label, len(self.dfs_train[label]), len(self.dfs_test[label])) + self.logger.info( + "Number of %s candidates: train %d and test %d", + label, + len(self.dfs_train[label]), + len(self.dfs_test[label]), + ) for label, nclass in zip(self.p_class_labels, self.p_nclasses): self.logger.info("Aim for number of %s events: %d", label, nclass) @@ -344,8 +360,11 @@ def preparesample(self): # pylint: disable=too-many-branches 
def step_done(self, step): step_name = f"{step}_{self.p_binmin}_{self.p_binmax}" if step_name in self.steps_done: - self.logger.warning("Done ML step %s already. It's skipped now. Remove the step " \ - "from the list in %s", step_name, self.file_steps_done) + self.logger.warning( + "Done ML step %s already. It's skipped now. Remove the step from the list in %s", + step_name, + self.file_steps_done, + ) return True # Add this steps and update the corresponsing file @@ -354,7 +373,6 @@ def step_done(self, step): return False - def do_corr(self): if self.step_done("distributions_correlations"): return @@ -362,40 +380,39 @@ def do_corr(self): self.logger.info("Make feature distributions and correlation plots") def make_plot_name(output, label, n_var, binmin, binmax): - return f'{output}/CorrMatrix_{label}_nVar{n_var}_{binmin:.1f}_{binmax:.1f}.png' + return f"{output}/CorrMatrix_{label}_nVar{n_var}_{binmin:.1f}_{binmax:.1f}.png" - var_set = {"selected_vars": self.v_selected, "features": self.v_train} \ - if self.v_selected else {"all_vars": self.v_all, "features": self.v_train} + var_set = ( + {"selected_vars": self.v_selected, "features": self.v_train} + if self.v_selected + else {"all_vars": self.v_all, "features": self.v_train} + ) for _, variables in var_set.items(): - vardistplot(self.dfs_train, - variables, self.dirmlplot, - self.p_binmin, self.p_binmax, self.p_plot_options) + vardistplot(self.dfs_train, variables, self.dirmlplot, self.p_binmin, self.p_binmax, self.p_plot_options) - scatterplot(self.dfs_train, - self.v_corrx, self.v_corry, - self.dirmlplot, self.p_binmin, self.p_binmax) + scatterplot(self.dfs_train, self.v_corrx, self.v_corry, self.dirmlplot, self.p_binmin, self.p_binmax) for label in self.p_class_labels: for var_label, variables in var_set.items(): - output = make_plot_name(self.dirmlplot, f"{label}_{var_label}", len(variables), - self.p_binmin, self.p_binmax) - correlationmatrix(self.dfs_train[label], variables, label, output, - self.p_binmin, self.p_binmax, self.p_plot_options) + output = make_plot_name( + self.dirmlplot, f"{label}_{var_label}", len(variables), self.p_binmin, self.p_binmax + ) + correlationmatrix( + self.dfs_train[label], variables, label, output, self.p_binmin, self.p_binmax, self.p_plot_options + ) def loadmodels(self): classifiers_scikit, names_scikit, _, _ = getclf_scikit(self.db_model) classifiers_xgboost, names_xgboost, _, _ = getclf_xgboost(self.db_model) - classifiers_keras, names_keras, _, _ = getclf_keras(self.db_model, - len(self.df_xtrain.columns)) - self.p_class = classifiers_scikit+classifiers_xgboost+classifiers_keras - self.p_classname = names_scikit+names_xgboost+names_keras + classifiers_keras, names_keras, _, _ = getclf_keras(self.db_model, len(self.df_xtrain.columns)) + self.p_class = classifiers_scikit + classifiers_xgboost + classifiers_keras + self.p_classname = names_scikit + names_xgboost + names_keras # Try to read trained models clfs = readmodels(self.p_classname, self.dirmlout, self.s_suffix) if clfs: - self.logger.info("Read and use models from disk. Remove them if you don't want to " \ - "use them") + self.logger.info("Read and use models from disk. 
Remove them if you don't want to use them") self.p_trainedmod = clfs self.p_class = clfs return @@ -406,13 +423,12 @@ def do_train(self): self.logger.info("Training") t0 = time.time() - self.p_trainedmod = fit(self.p_classname, self.p_class, - self.df_xtrain.to_numpy(), self.df_ytrain.to_numpy()) + self.p_trainedmod = fit(self.p_classname, self.p_class, self.df_xtrain.to_numpy(), self.df_ytrain.to_numpy()) savemodels(self.p_classname, self.p_trainedmod, self.dirmlout, self.s_suffix) # Converting and saving models in onnx format - initial_type = [('input', FloatTensorType([None, len(self.df_xtrain.columns)]))] - onnx_model = convert_xgboost(self.p_trainedmod[0], initial_types = initial_type) + initial_type = [("input", FloatTensorType([None, len(self.df_xtrain.columns)]))] + onnx_model = convert_xgboost(self.p_trainedmod[0], initial_types=initial_type) onnx_output = os.path.join(self.dirmlout, self.s_suffix) onnx.save_model(onnx_model, onnx_output + ".onnx") @@ -426,8 +442,9 @@ def do_test(self): return self.logger.info("Testing") - self.df_mltest_applied = apply(self.p_mltype, self.p_classname, self.p_trainedmod, - self.df_mltest, self.v_train, self.p_class_labels) + self.df_mltest_applied = apply( + self.p_mltype, self.p_classname, self.p_trainedmod, self.df_mltest, self.v_train, self.p_class_labels + ) write_df(self.df_mltest_applied, self.f_mltest_applied) # df_ml_test_to_root = self.dirmlout+"/testsample_%s_mldecision.root" % (self.s_suffix) # write_tree(df_ml_test_to_root, self.n_treetest, self.df_mltest_applied) @@ -441,29 +458,27 @@ def do_apply(self): self.do_train() self.logger.info("Application") - for df, filename in zip((self.df_data, self.df_mc), - (self.f_reco_applieddata, self.f_reco_appliedmc)): - df_res = apply(self.p_mltype, self.p_classname, self.p_trainedmod, - df, self.v_train, self.p_class_labels) + for df, filename in zip((self.df_data, self.df_mc), (self.f_reco_applieddata, self.f_reco_appliedmc)): + df_res = apply(self.p_mltype, self.p_classname, self.p_trainedmod, df, self.v_train, self.p_class_labels) write_df(df_res, filename) def do_crossval(self): if self.step_done("cross_validation"): return self.logger.info("Do cross validation") - df_scores = mlhep_plot.cross_validation_mse(self.p_classname, self.p_class, - self.df_xtrain, self.df_ytrain, - self.p_nkfolds, self.p_ncorescross) - mlhep_plot.plot_cross_validation_mse(self.p_classname, df_scores, self.s_suffix, - self.dirmlplot) + df_scores = mlhep_plot.cross_validation_mse( + self.p_classname, self.p_class, self.df_xtrain, self.df_ytrain, self.p_nkfolds, self.p_ncorescross + ) + mlhep_plot.plot_cross_validation_mse(self.p_classname, df_scores, self.s_suffix, self.dirmlplot) def do_learningcurve(self): if self.step_done("learningcurve"): return self.logger.info("Make learning curve") npoints = 10 - mlhep_plot.plot_learning_curves(self.p_classname, self.p_class, self.s_suffix, - self.dirmlplot, self.df_xtrain, self.df_ytrain, npoints) + mlhep_plot.plot_learning_curves( + self.p_classname, self.p_class, self.s_suffix, self.dirmlplot, self.df_xtrain, self.df_ytrain, npoints + ) def do_roc(self): if self.step_done("roc_simple"): @@ -472,19 +487,38 @@ def do_roc(self): self.do_train() self.logger.info("Make ROC for train") - mlhep_plot.plot_precision_recall(self.p_classname, self.p_class, self.s_suffix, - self.df_xtrain, self.df_ytrain, self.df_ytrain_onehot, - self.p_nkfolds, self.dirmlplot, - self.p_class_labels) - mlhep_plot.plot_roc_ovr(self.p_classname, self.p_class, self.s_suffix, - self.df_xtrain, 
self.df_ytrain, - self.p_nkfolds, self.dirmlplot, - self.p_class_labels) + mlhep_plot.plot_precision_recall( + self.p_classname, + self.p_class, + self.s_suffix, + self.df_xtrain, + self.df_ytrain, + self.df_ytrain_onehot, + self.p_nkfolds, + self.dirmlplot, + self.p_class_labels, + ) + mlhep_plot.plot_roc_ovr( + self.p_classname, + self.p_class, + self.s_suffix, + self.df_xtrain, + self.df_ytrain, + self.p_nkfolds, + self.dirmlplot, + self.p_class_labels, + ) if self.p_mltype == "MultiClassification": - mlhep_plot.plot_roc_ovo(self.p_classname, self.p_class, self.s_suffix, - self.df_xtrain, self.df_ytrain, - self.p_nkfolds, self.dirmlplot, - self.p_class_labels) + mlhep_plot.plot_roc_ovo( + self.p_classname, + self.p_class, + self.s_suffix, + self.df_xtrain, + self.df_ytrain, + self.p_nkfolds, + self.dirmlplot, + self.p_class_labels, + ) def do_roc_train_test(self): if self.step_done("roc_train_test"): @@ -493,19 +527,35 @@ def do_roc_train_test(self): self.do_train() self.logger.info("Make ROC for train and test") - mlhep_plot.roc_train_test(self.p_classname, self.p_class, self.s_suffix, - self.df_xtrain, self.df_ytrain, - self.df_xtest, self.df_ytest, - self.p_nkfolds, self.dirmlplot, - self.p_class_labels, - (self.p_binmin, self.p_binmax), "OvR") + mlhep_plot.roc_train_test( + self.p_classname, + self.p_class, + self.s_suffix, + self.df_xtrain, + self.df_ytrain, + self.df_xtest, + self.df_ytest, + self.p_nkfolds, + self.dirmlplot, + self.p_class_labels, + (self.p_binmin, self.p_binmax), + "OvR", + ) if self.p_mltype == "MultiClassification": - mlhep_plot.roc_train_test(self.p_classname, self.p_class, self.s_suffix, - self.df_xtrain, self.df_ytrain, - self.df_xtest, self.df_ytest, - self.p_nkfolds, self.dirmlplot, - self.p_class_labels, - (self.p_binmin, self.p_binmax), "OvO") + mlhep_plot.roc_train_test( + self.p_classname, + self.p_class, + self.s_suffix, + self.df_xtrain, + self.df_ytrain, + self.df_xtest, + self.df_ytest, + self.p_nkfolds, + self.dirmlplot, + self.p_class_labels, + (self.p_binmin, self.p_binmax), + "OvO", + ) def do_plot_model_pred(self): if self.step_done("plot_model_pred"): @@ -514,10 +564,17 @@ def do_plot_model_pred(self): self.do_train() self.logger.info("Plot model prediction distribution") - mlhep_plot.plot_model_pred(self.p_classname, self.p_class, self.s_suffix, - self.df_xtrain, self.df_ytrain, - self.df_xtest, self.df_ytest, - self.dirmlplot, self.p_class_labels) + mlhep_plot.plot_model_pred( + self.p_classname, + self.p_class, + self.s_suffix, + self.df_xtrain, + self.df_ytrain, + self.df_xtest, + self.df_ytest, + self.dirmlplot, + self.p_class_labels, + ) def do_importance(self): if self.step_done("importance"): @@ -526,8 +583,7 @@ def do_importance(self): self.do_train() self.logger.info("Do simple importance") - importanceplotall(self.v_train, self.p_classname, self.p_class, - self.s_suffix, self.dirmlplot) + importanceplotall(self.v_train, self.p_classname, self.p_class, self.s_suffix, self.dirmlplot) def do_importance_shap(self): if self.step_done("importance_shap"): @@ -536,8 +592,15 @@ def do_importance_shap(self): self.do_train() self.logger.info("Do SHAP importance") - shap_study(self.p_classname, self.p_class, self.s_suffix, self.df_xtrain, self.dirmlplot, - self.p_class_labels, self.p_plot_options) + shap_study( + self.p_classname, + self.p_class, + self.s_suffix, + self.df_xtrain, + self.dirmlplot, + self.p_class_labels, + self.p_plot_options, + ) def do_bayesian_opt(self): if self.step_done("bayesian_opt"): @@ -545,17 +608,16 @@ def 
do_bayesian_opt(self): self.logger.info("Do Bayesian optimisation for all classifiers") _, names_scikit, _, bayes_opt_scikit = getclf_scikit(self.db_model) _, names_xgboost, _, bayes_opt_xgboost = getclf_xgboost(self.db_model) - _, names_keras, _, bayes_opt_keras = getclf_keras(self.db_model, - len(self.df_xtrain.columns)) + _, names_keras, _, bayes_opt_keras = getclf_keras(self.db_model, len(self.df_xtrain.columns)) clfs_all = bayes_opt_scikit + bayes_opt_xgboost + bayes_opt_keras clfs_names_all = names_scikit + names_xgboost + names_keras - clfs_names_all = [name for name, clf in zip(clfs_names_all, clfs_all) if clf] clfs_all = [clf for clf in clfs_all if clf] - out_dirs = [os.path.join(self.dirmlplot, "bayesian_opt", name, f"{name}{self.s_suffix}") \ - for name in clfs_names_all] + out_dirs = [ + os.path.join(self.dirmlplot, "bayesian_opt", name, f"{name}{self.s_suffix}") for name in clfs_names_all + ] checkmakedirlist(out_dirs) # Now, do it @@ -567,15 +629,13 @@ def do_bayesian_opt(self): opt.save(out_dir) opt.plot(out_dir) - def do_grid(self): if self.step_done("grid"): return self.logger.info("Do grid search") clfs_scikit, names_scikit, grid_params_scikit, _ = getclf_scikit(self.db_model) clfs_xgboost, names_xgboost, grid_params_xgboost, _ = getclf_xgboost(self.db_model) - clfs_keras, names_keras, grid_params_keras, _ = getclf_keras(self.db_model, - len(self.df_xtrain.columns)) + clfs_keras, names_keras, grid_params_keras, _ = getclf_keras(self.db_model, len(self.df_xtrain.columns)) clfs_grid_params_all = grid_params_scikit + grid_params_xgboost + grid_params_keras clfs_all = clfs_scikit + clfs_xgboost + clfs_keras clfs_names_all = names_scikit + names_xgboost + names_keras @@ -584,19 +644,30 @@ def do_grid(self): clfs_names_all = [name for name, gps in zip(clfs_names_all, clfs_grid_params_all) if gps] clfs_grid_params_all = [gps for gps in clfs_grid_params_all if gps] - out_dirs = [os.path.join(self.dirmlplot, "grid_search", name, f"{name}{self.s_suffix}") \ - for name in clfs_names_all] + out_dirs = [ + os.path.join(self.dirmlplot, "grid_search", name, f"{name}{self.s_suffix}") for name in clfs_names_all + ] if len(checkdirs(out_dirs)) > 0: # Only draw results if any can be found - self.logger.warning("Not overwriting anything, just plotting again what was done " \ - "before and returning. Please remove corresponding directories " \ - "if you are certain you want do do grid search again") + self.logger.warning( + "Not overwriting anything, just plotting again what was done " + "before and returning. 
Please remove corresponding directories "
+ "if you are certain you want to do grid search again"
+ )
perform_plot_gridsearch(clfs_names_all, out_dirs)
return
checkmakedirlist(out_dirs)

- do_gridsearch(clfs_names_all, clfs_all, clfs_grid_params_all, self.df_xtrain,
- self.df_ytrain, self.p_nkfolds, out_dirs, self.p_ncorescross)
+ do_gridsearch(
+ clfs_names_all,
+ clfs_all,
+ clfs_grid_params_all,
+ self.df_xtrain,
+ self.df_ytrain,
+ self.p_nkfolds,
+ out_dirs,
+ self.p_ncorescross,
+ )
perform_plot_gridsearch(clfs_names_all, out_dirs)

def do_boundary(self):
@@ -604,13 +675,13 @@ def do_boundary(self):
return
classifiers_scikit_2var, names_2var = getclf_scikit(self.db_model)
classifiers_keras_2var, names_keras_2var = getclf_keras(self.db_model, 2)
- classifiers_2var = classifiers_scikit_2var+classifiers_keras_2var
- names_2var = names_2var+names_keras_2var
+ classifiers_2var = classifiers_scikit_2var + classifiers_keras_2var
+ names_2var = names_2var + names_keras_2var
x_test_boundary = self.df_xtest[self.v_bound]
trainedmodels_2var = fit(names_2var, classifiers_2var, x_test_boundary, self.df_ytest)
decisionboundaries(
- names_2var, trainedmodels_2var, self.s_suffix+"2var", x_test_boundary,
- self.df_ytest, self.dirmlplot)
+ names_2var, trainedmodels_2var, self.s_suffix + "2var", x_test_boundary, self.df_ytest, self.dirmlplot
+ )

def do_efficiency(self):
if self.step_done("efficiency"):
return
@@ -622,19 +693,20 @@ def do_efficiency(self):
fig_eff = optz.prepare_eff_signif_figure("Model efficiency", self.p_mltype)
# FIXME: Different future signal selection?
# NOTE: df with ismcprompt == 1 and ismcsignal == 0 is empty
- df_sig = self.df_mltest_applied[(self.df_mltest_applied["ismcprompt"] == 1) & \
- (self.df_mltest_applied["ismcsignal"] == 1)]
+ df_sig = self.df_mltest_applied[
+ (self.df_mltest_applied["ismcprompt"] == 1) & (self.df_mltest_applied["ismcsignal"] == 1)
+ ]
for name in self.p_classname:
- eff_array, eff_err_array, x_axis = optz.calc_sigeff_steps(self.p_nstepsign, df_sig,
- name, self.p_mltype)
- plt.errorbar(x_axis, eff_array, yerr=eff_err_array, c="b", alpha=0.3,
- label=f"{name}", elinewidth=2.5, linewidth=4.0)
+ eff_array, eff_err_array, x_axis = optz.calc_sigeff_steps(self.p_nstepsign, df_sig, name, self.p_mltype)
+ plt.errorbar(
+ x_axis, eff_array, yerr=eff_err_array, c="b", alpha=0.3, label=f"{name}", elinewidth=2.5, linewidth=4.0
+ )
plt.legend(loc="upper left", fontsize=25)
- plt.savefig(f"{self.dirmlplot}/Efficiency_{self.s_suffix}.png", bbox_inches='tight')
- with open(f"{self.dirmlplot}/Efficiency_{self.s_suffix}.pickle", 'wb') as out:
+ plt.savefig(f"{self.dirmlplot}/Efficiency_{self.s_suffix}.png", bbox_inches="tight")
+ with open(f"{self.dirmlplot}/Efficiency_{self.s_suffix}.pickle", "wb") as out:
pickle.dump(fig_eff, out)

- #pylint: disable=too-many-locals
+ # pylint: disable=too-many-locals
def do_significance(self):
if self.step_done("significance"):
return
@@ -646,51 +718,45 @@ def do_significance(self):
self.logger.info("Doing significance optimization")
gROOT.SetBatch(True)
gROOT.ProcessLine("gErrorIgnoreLevel = kWarning;")
- #first extract the number of data events in the ml sample
+ # first extract the number of data events in the ml sample
# This might need a revisit, for now just extract the numbers from the ML merged
# event count (aka from a YAML since the actual events are not needed)
# Before the ML count was always taken from the ML merged event df while the total
# number was taken from the event counter. 
But the latter is basically not used # anymore for a long time cause "dofullevtmerge" is mostly "false" in the DBs - #and the total number of events + # and the total number of events count_dict = parse_yaml(self.f_evt_count_ml) self.p_nevttot = count_dict["evtorig"] self.p_nevtml = count_dict["evt"] self.logger.debug("Number of data events used for ML: %d", self.p_nevtml) self.logger.debug("Total number of data events: %d", self.p_nevttot) - #calculate acceptance correction. we use in this case all - #the signal from the mc sample, without limiting to the n. signal - #events used for training - denacc = len(self.df_mcgen[(self.df_mcgen["ismcprompt"] == 1) & \ - (self.df_mcgen["ismcsignal"] == 1)]) - numacc = len(self.df_mc[(self.df_mc["ismcprompt"] == 1) & \ - (self.df_mc["ismcsignal"] == 1)]) + # calculate acceptance correction. we use in this case all + # the signal from the mc sample, without limiting to the n. signal + # events used for training + denacc = len(self.df_mcgen[(self.df_mcgen["ismcprompt"] == 1) & (self.df_mcgen["ismcsignal"] == 1)]) + numacc = len(self.df_mc[(self.df_mc["ismcprompt"] == 1) & (self.df_mc["ismcsignal"] == 1)]) acc, acc_err = optz.calc_eff(numacc, denacc) self.logger.debug("Acceptance: %.3e +/- %.3e", acc, acc_err) - #calculation of the expected fonll signals + # calculation of the expected fonll signals delta_pt = self.p_binmax - self.p_binmin if self.is_fonll_from_root: df_fonll = TFile.Open(self.f_fonll) - df_fonll_Lc = df_fonll.Get(self.p_fonllparticle+"_"+self.p_fonllband) + df_fonll_Lc = df_fonll.Get(self.p_fonllparticle + "_" + self.p_fonllband) bin_min = df_fonll_Lc.FindBin(self.p_binmin) bin_max = df_fonll_Lc.FindBin(self.p_binmax) prod_cross = df_fonll_Lc.Integral(bin_min, bin_max) * self.p_fragf * 1e-12 / delta_pt - signal_yield = 2. * prod_cross * delta_pt * acc * self.p_taa \ - / (self.p_sigmamb * self.p_fprompt) - #now we plot the fonll expectation + signal_yield = 2.0 * prod_cross * delta_pt * acc * self.p_taa / (self.p_sigmamb * self.p_fprompt) + # now we plot the fonll expectation cFONLL = TCanvas("cFONLL", "The FONLL expectation") df_fonll_Lc.GetXaxis().SetRangeUser(0, 16) df_fonll_Lc.Draw("") cFONLL.SaveAs(f"{self.dirmlplot}/FONLL_curve_{self.s_suffix}.png") else: df_fonll = pd.read_csv(self.f_fonll) - df_fonll_in_pt = \ - df_fonll.query('(pt >= @self.p_binmin) and (pt < @self.p_binmax)')\ - [self.p_fonllband] + df_fonll_in_pt = df_fonll.query("(pt >= @self.p_binmin) and (pt < @self.p_binmax)")[self.p_fonllband] prod_cross = df_fonll_in_pt.sum() * self.p_fragf * 1e-12 / delta_pt - signal_yield = 2. * prod_cross * delta_pt * acc * self.p_taa \ - / (self.p_sigmamb * self.p_fprompt) - #now we plot the fonll expectation + signal_yield = 2.0 * prod_cross * delta_pt * acc * self.p_taa / (self.p_sigmamb * self.p_fprompt) + # now we plot the fonll expectation fig = plt.figure(figsize=(20, 15)) plt.subplot(111) plt.plot(df_fonll["pt"], df_fonll[self.p_fonllband] * self.p_fragf, linewidth=4.0) @@ -698,7 +764,7 @@ def do_significance(self): plt.ylabel("Cross Section [pb/GeV]", fontsize=20) plt.title("FONLL cross section " + self.p_case, fontsize=20) plt.semilogy() - plt.savefig(f"{self.dirmlplot}/FONLL_curve_{self.s_suffix}.png", bbox_inches='tight') + plt.savefig(f"{self.dirmlplot}/FONLL_curve_{self.s_suffix}.png", bbox_inches="tight") plt.close(fig) self.logger.debug("Expected signal yield: %.3e", signal_yield) @@ -723,62 +789,74 @@ def do_significance(self): if int(fitsucc) != 0: self.logger.warning("Problem in signal peak fit") - sigma = 0. 
+ sigma = 0.0 sigma = gaus_fit.GetParameter(2) self.logger.debug("Mean of the gaussian: %.3e", gaus_fit.GetParameter(1)) self.logger.debug("Sigma of the gaussian: %.3e", sigma) sig_region = [self.p_mass - 3 * sigma, self.p_mass + 3 * sigma] - fig_signif_pevt = optz.prepare_eff_signif_figure(r"Significance per event ($3 \sigma$) a.u.", - self.p_mltype) + fig_signif_pevt = optz.prepare_eff_signif_figure(r"Significance per event ($3 \sigma$) a.u.", self.p_mltype) plt.yticks([]) - fig_signif = optz.prepare_eff_signif_figure(r"Significance ($3 \sigma$) a.u.", - self.p_mltype) + fig_signif = optz.prepare_eff_signif_figure(r"Significance ($3 \sigma$) a.u.", self.p_mltype) plt.yticks([]) - df_sig = self.df_mltest_applied[(self.df_mltest_applied["ismcprompt"] == 1) & \ - (self.df_mltest_applied["ismcsignal"] == 1)] + df_sig = self.df_mltest_applied[ + (self.df_mltest_applied["ismcprompt"] == 1) & (self.df_mltest_applied["ismcsignal"] == 1) + ] for name in self.p_classname: - eff_array, eff_err_array, x_axis = optz.calc_sigeff_steps(self.p_nstepsign, df_sig, - name, self.p_mltype) - bkg_array, bkg_err_array, _ = optz.calc_bkg(df_data_sideband, name, self.p_nstepsign, - self.p_mass_fit_lim, self.p_bkg_func, - self.p_bin_width, sig_region, self.p_savefit, - self.dirmlplot, [self.p_binmin, self.p_binmax], - self.v_invmass, self.p_mltype) + eff_array, eff_err_array, x_axis = optz.calc_sigeff_steps(self.p_nstepsign, df_sig, name, self.p_mltype) + bkg_array, bkg_err_array, _ = optz.calc_bkg( + df_data_sideband, + name, + self.p_nstepsign, + self.p_mass_fit_lim, + self.p_bkg_func, + self.p_bin_width, + sig_region, + self.p_savefit, + self.dirmlplot, + [self.p_binmin, self.p_binmax], + self.v_invmass, + self.p_mltype, + ) sig_array = [eff * signal_yield for eff in eff_array] sig_err_array = [eff_err * signal_yield for eff_err in eff_err_array] bkg_array = [bkg / (self.p_bkgfracopt * self.p_nevtml) for bkg in bkg_array] - bkg_err_array = [bkg_err / (self.p_bkgfracopt * self.p_nevtml) \ - for bkg_err in bkg_err_array] - signif_array, signif_err_array = optz.calc_signif(sig_array, sig_err_array, - bkg_array, bkg_err_array) + bkg_err_array = [bkg_err / (self.p_bkgfracopt * self.p_nevtml) for bkg_err in bkg_err_array] + signif_array, signif_err_array = optz.calc_signif(sig_array, sig_err_array, bkg_array, bkg_err_array) plt.figure(fig_signif_pevt.number) - plt.errorbar(x_axis, signif_array, yerr=signif_err_array, - fmt=".", c="b", label=name, elinewidth=2.5, linewidth=5.0) + plt.errorbar( + x_axis, signif_array, yerr=signif_err_array, fmt=".", c="b", label=name, elinewidth=2.5, linewidth=5.0 + ) signif_array_ml = [sig * sqrt(self.p_nevtml) for sig in signif_array] signif_err_array_ml = [sig_err * sqrt(self.p_nevtml) for sig_err in signif_err_array] plt.figure(fig_signif.number) - plt.errorbar(x_axis, signif_array_ml, yerr=signif_err_array_ml, - c="b", label=name, elinewidth=2.5, linewidth=5.0) - plt.text(0.7, 0.95, - f" ${self.p_binmin} < p_\\mathrm{{T}}/(\\mathrm{{GeV}}/c) < {self.p_binmax}$", - verticalalignment="center", transform=fig_signif.gca().transAxes, fontsize=30) - #signif_array_tot = [sig * sqrt(self.p_nevttot) for sig in signif_array] - #signif_err_array_tot = [sig_err * sqrt(self.p_nevttot) for sig_err in signif_err_array] - #plt.figure(fig_signif.number) - #plt.errorbar(x_axis, signif_array_tot, yerr=signif_err_array_tot, + plt.errorbar( + x_axis, signif_array_ml, yerr=signif_err_array_ml, c="b", label=name, elinewidth=2.5, linewidth=5.0 + ) + plt.text( + 0.7, + 0.95, + f" ${self.p_binmin} < 
p_\\mathrm{{T}}/(\\mathrm{{GeV}}/c) < {self.p_binmax}$", + verticalalignment="center", + transform=fig_signif.gca().transAxes, + fontsize=30, + ) + # signif_array_tot = [sig * sqrt(self.p_nevttot) for sig in signif_array] + # signif_err_array_tot = [sig_err * sqrt(self.p_nevttot) for sig_err in signif_err_array] + # plt.figure(fig_signif.number) + # plt.errorbar(x_axis, signif_array_tot, yerr=signif_err_array_tot, # label=f'{name}_Tot', elinewidth=2.5, linewidth=5.0) plt.figure(fig_signif_pevt.number) plt.legend(loc="lower left", fontsize=25) - plt.savefig(f"{self.dirmlplot}/Significance_PerEvent_{self.s_suffix}.png", bbox_inches='tight') + plt.savefig(f"{self.dirmlplot}/Significance_PerEvent_{self.s_suffix}.png", bbox_inches="tight") plt.figure(fig_signif.number) mpl.rcParams.update({"text.usetex": True}) plt.legend(loc="lower left", fontsize=25) - plt.savefig(f"{self.dirmlplot}/Significance_{self.s_suffix}.png", bbox_inches='tight') + plt.savefig(f"{self.dirmlplot}/Significance_{self.s_suffix}.png", bbox_inches="tight") mpl.rcParams.update({"text.usetex": False}) with open(f"{self.dirmlplot}/Significance_{self.s_suffix}.pickle", "wb") as out: @@ -797,26 +875,58 @@ def do_scancuts(self): prob_array = [0.0, 0.2, 0.6, 0.9] dfdata = read_df(self.f_reco_applieddata) dfmc = read_df(self.f_reco_appliedmc) - vardistplot_probscan(dfmc, self.v_all, "xgboost_classifier", - prob_array, self.dirmlplot, "mc" + self.s_suffix, - 0, self.p_plot_options) - vardistplot_probscan(dfmc, self.v_all, "xgboost_classifier", - prob_array, self.dirmlplot, "mc" + self.s_suffix, - 1, self.p_plot_options) - vardistplot_probscan(dfdata, self.v_all, "xgboost_classifier", - prob_array, self.dirmlplot, "data" + self.s_suffix, - 0, self.p_plot_options) - vardistplot_probscan(dfdata, self.v_all, "xgboost_classifier", - prob_array, self.dirmlplot, "data" + self.s_suffix, - 1, self.p_plot_options) + vardistplot_probscan( + dfmc, + self.v_all, + "xgboost_classifier", + prob_array, + self.dirmlplot, + "mc" + self.s_suffix, + 0, + self.p_plot_options, + ) + vardistplot_probscan( + dfmc, + self.v_all, + "xgboost_classifier", + prob_array, + self.dirmlplot, + "mc" + self.s_suffix, + 1, + self.p_plot_options, + ) + vardistplot_probscan( + dfdata, + self.v_all, + "xgboost_classifier", + prob_array, + self.dirmlplot, + "data" + self.s_suffix, + 0, + self.p_plot_options, + ) + vardistplot_probscan( + dfdata, + self.v_all, + "xgboost_classifier", + prob_array, + self.dirmlplot, + "data" + self.s_suffix, + 1, + self.p_plot_options, + ) if not self.v_cuts: self.logger.warning("No variables for cut efficiency scan. 
Will be skipped") return - efficiency_cutscan(dfmc, self.v_cuts, "xgboost_classifier", 0.0, - self.dirmlplot, "mc" + self.s_suffix, self.p_plot_options) - efficiency_cutscan(dfmc, self.v_cuts, "xgboost_classifier", 0.5, - self.dirmlplot, "mc" + self.s_suffix, self.p_plot_options) - efficiency_cutscan(dfdata, self.v_cuts, "xgboost_classifier", 0.0, - self.dirmlplot, "data" + self.s_suffix, self.p_plot_options) - efficiency_cutscan(dfdata, self.v_cuts, "xgboost_classifier", 0.5, - self.dirmlplot, "data" + self.s_suffix, self.p_plot_options) + efficiency_cutscan( + dfmc, self.v_cuts, "xgboost_classifier", 0.0, self.dirmlplot, "mc" + self.s_suffix, self.p_plot_options + ) + efficiency_cutscan( + dfmc, self.v_cuts, "xgboost_classifier", 0.5, self.dirmlplot, "mc" + self.s_suffix, self.p_plot_options + ) + efficiency_cutscan( + dfdata, self.v_cuts, "xgboost_classifier", 0.0, self.dirmlplot, "data" + self.s_suffix, self.p_plot_options + ) + efficiency_cutscan( + dfdata, self.v_cuts, "xgboost_classifier", 0.5, self.dirmlplot, "data" + self.s_suffix, self.p_plot_options + ) diff --git a/machine_learning_hep/optimization.py b/machine_learning_hep/optimization.py index fc5bae6465..e33c78aa4b 100644 --- a/machine_learning_hep/optimization.py +++ b/machine_learning_hep/optimization.py @@ -15,33 +15,50 @@ """ Methods to: utility methods to conpute efficiency and study expected significance """ -import numpy as np + import matplotlib.pyplot as plt +import numpy as np from matplotlib.ticker import MultipleLocator from ROOT import TH1F, TFile # pylint: disable=import-error,no-name-in-module + from machine_learning_hep.logger import get_logger + def select_by_threshold(df_label, label, thr, name): # Changed from >= to > since we use that atm for the nominal selection # See processer.py self.l_selml if label == "bkg": - return df_label[df_label[f'y_test_prob{name}{label}'].values <= thr] + return df_label[df_label[f"y_test_prob{name}{label}"].values <= thr] if label == "": - return df_label[df_label[f'y_test_prob{name}{label}'].values > thr] - return df_label[df_label[f'y_test_prob{name}{label}'].values >= thr] + return df_label[df_label[f"y_test_prob{name}{label}"].values > thr] + return df_label[df_label[f"y_test_prob{name}{label}"].values >= thr] + def get_x_axis(num_steps, class_label): ns_left = int(num_steps / 10) - 1 ns_right = num_steps - ns_left if class_label == "bkg": ns_left, ns_right = ns_right, ns_left - x_axis_left = np.linspace(0., 0.49, ns_left) + x_axis_left = np.linspace(0.0, 0.49, ns_left) x_axis_right = np.linspace(0.5, 1.0, ns_right) x_axis = np.concatenate((x_axis_left, x_axis_right)) return x_axis -def calc_bkg(df_bkg, name, num_steps, fit_region, bkg_func, bin_width, sig_region, save_fit, #pylint: disable=too-many-arguments - out_dir, pt_lims, invmassvar, mltype): + +def calc_bkg( + df_bkg, + name, + num_steps, + fit_region, + bkg_func, + bin_width, + sig_region, + save_fit, # pylint: disable=too-many-arguments + out_dir, + pt_lims, + invmassvar, + mltype, +): """ Estimate the number of background candidates under the signal peak. This is obtained from real data with a fit of the sidebands of the invariant mass distribution. 
@@ -59,21 +76,21 @@ def calc_bkg(df_bkg, name, num_steps, fit_region, bkg_func, bin_width, sig_regio
logger.debug("Saving bkg fits to file")
pt_min = pt_lims[0]
pt_max = pt_lims[1]
- out_file = TFile(f'{out_dir}/bkg_fits_{name}_pt{pt_min:.1f}_{pt_max:.1f}.root', 'recreate')
+ out_file = TFile(f"{out_dir}/bkg_fits_{name}_pt{pt_min:.1f}_{pt_max:.1f}.root", "recreate")
out_file.cd()

logger.debug("To fit the bkg a %s function is used", bkg_func)
for thr in x_axis:
- bkg = 0.
- bkg_err = 0.
- hmass = TH1F(f'hmass_{thr:.5f}', '', num_bins, fit_region[0], fit_region[1])
+ bkg = 0.0
+ bkg_err = 0.0
+ hmass = TH1F(f"hmass_{thr:.5f}", "", num_bins, fit_region[0], fit_region[1])
df_bkg_sel = select_by_threshold(df_bkg, class_label, thr, name)
sel_mass_array = df_bkg_sel[invmassvar].values

if len(sel_mass_array) > 5:
for mass_value in np.nditer(sel_mass_array):
hmass.Fill(mass_value)
- fit = hmass.Fit(bkg_func, 'Q', '', fit_region[0], fit_region[1])
+ fit = hmass.Fit(bkg_func, "Q", "", fit_region[0], fit_region[1])
if save_fit:
hmass.Write()
if int(fit) == 0:
@@ -92,7 +109,6 @@ def calc_bkg(df_bkg, name, num_steps, fit_region, bkg_func, bin_width, sig_regio

return bkg_array, bkg_err_array, x_axis

-
def calc_signif(sig_array, sig_err_array, bkg_array, bkg_err_array):
"""
Calculate the expected signal significance as a function of the threshold on the
@@ -102,25 +118,28 @@ def calc_signif(sig_array, sig_err_array, bkg_array, bkg_err_array):
signif_err_array = []

for sig, bkg, sig_err, bkg_err in zip(sig_array, bkg_array, sig_err_array, bkg_err_array):
- signif = 0.
- signif_err = 0.
+ signif = 0.0
+ signif_err = 0.0
if sig > 0 and (sig + bkg) > 0:
signif = sig / np.sqrt(sig + bkg)
- signif_err = signif * np.sqrt((sig_err**2 + bkg_err**2) / (4 * (sig + bkg)**2) + \
- (bkg / (sig + bkg)) * sig_err**2 / sig**2)
+ signif_err = signif * np.sqrt(
+ (sig_err**2 + bkg_err**2) / (4 * (sig + bkg) ** 2) + (bkg / (sig + bkg)) * sig_err**2 / sig**2
+ )
signif_array.append(signif)
signif_err_array.append(signif_err)

return signif_array, signif_err_array

+
def calc_eff(num, den):
eff = num / den
eff_err = np.sqrt(eff * (1 - eff) / den)
return eff, eff_err

+
def calc_sigeff_steps(num_steps, df_sig, name, mltype):
logger = get_logger()
class_label = "bkg" if mltype == "MultiClassification" else ""
@@ -141,6 +160,7 @@ def calc_sigeff_steps(num_steps, df_sig, name, mltype):

return eff_array, eff_err_array, x_axis

+
def prepare_eff_signif_figure(y_label, mltype):
class_label = "Bkg" if mltype == "MultiClassification" else "Prompt"
fig = plt.figure(figsize=(20, 15))
diff --git a/machine_learning_hep/pca.py b/machine_learning_hep/pca.py
index 80d29ebd01..579893e448 100644
--- a/machine_learning_hep/pca.py
+++ b/machine_learning_hep/pca.py
@@ -15,12 +15,14 @@
"""
Methods to: apply Principal Component Analysis (PCA) and to standardize features
"""
+
from io import BytesIO
+
+import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
-import matplotlib.pyplot as plt
-from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
+from sklearn.preprocessing import StandardScaler


def get_pcadataframe_pca(dataframe, n_pca):
@@ -29,7 +31,7 @@ def get_pcadataframe_pca(dataframe, n_pca):
principalComponent = pca.fit_transform(data_values)
pca_name_list = []
for i_pca in range(n_pca):
- pca_name_list.append("princ_comp_%d" % (i_pca+1))
+ pca_name_list.append("princ_comp_%d" % (i_pca + 1))
pca_dataframe = pd.DataFrame(data=principalComponent, columns=pca_name_list)
return pca_dataframe, pca

@@ -43,15 
+45,15 @@ def getdataframe_standardised(dataframe): def plotvariance_pca(pca_object, output_): - figure = plt.figure(figsize=(15, 10)) # pylint: disable=unused-variable + figure = plt.figure(figsize=(15, 10)) # pylint: disable=unused-variable plt.plot(np.cumsum(pca_object.explained_variance_ratio_)) plt.plot([0, 10], [0.95, 0.95]) - plt.xlabel('number of components', fontsize=16) - plt.ylabel('cumulative explained variance', fontsize=16) - plt.title('Explained variance', fontsize=16) + plt.xlabel("number of components", fontsize=16) + plt.ylabel("cumulative explained variance", fontsize=16) + plt.title("Explained variance", fontsize=16) plt.ylim([0, 1]) - plotname = output_+'/PCAvariance.png' - plt.savefig(plotname, bbox_inches='tight') + plotname = output_ + "/PCAvariance.png" + plt.savefig(plotname, bbox_inches="tight") img_pca = BytesIO() - plt.savefig(img_pca, format='png') + plt.savefig(img_pca, format="png") img_pca.seek(0) diff --git a/machine_learning_hep/plotting/plot_jetsubstructure.py b/machine_learning_hep/plotting/plot_jetsubstructure.py index 2e87e69dc7..d2f2f3afcd 100644 --- a/machine_learning_hep/plotting/plot_jetsubstructure.py +++ b/machine_learning_hep/plotting/plot_jetsubstructure.py @@ -15,19 +15,35 @@ """ main script for doing final stage analysis """ + # pylint: disable=too-many-lines, line-too-long import argparse from array import array from cmath import nan + import yaml + # pylint: disable=import-error, no-name-in-module -from ROOT import TFile, TLatex, TLine, TGaxis, gROOT, gStyle, TCanvas, TGraphAsymmErrors, TGraphErrors, TGraph -from machine_learning_hep.utilities import make_message_notfound -from machine_learning_hep.utilities import get_colour, get_marker, draw_latex -from machine_learning_hep.utilities import make_plot, get_y_window_his, get_y_window_gr, get_plot_range, divide_graphs, get_x_window_his, get_x_window_gr, scale_graph -from machine_learning_hep.logger import get_logger +from ROOT import TCanvas, TFile, TGaxis, TGraph, TGraphAsymmErrors, TGraphErrors, TLatex, TLine, gROOT, gStyle -def main(): # pylint: disable=too-many-locals, too-many-statements, too-many-branches +from machine_learning_hep.logger import get_logger +from machine_learning_hep.utilities import ( + divide_graphs, + draw_latex, + get_colour, + get_marker, + get_plot_range, + get_x_window_gr, + get_x_window_his, + get_y_window_gr, + get_y_window_his, + make_message_notfound, + make_plot, + scale_graph, +) + + +def main(): # pylint: disable=too-many-locals, too-many-statements, too-many-branches """ Main plotting function """ @@ -36,17 +52,16 @@ def main(): # pylint: disable=too-many-locals, too-many-statements, too-many-bra # pylint: disable=unused-variable parser = argparse.ArgumentParser() - parser.add_argument("--database-analysis", "-d", dest="database_analysis", - help="analysis database to be used", required=True) - parser.add_argument("--analysis", "-a", dest="type_ana", - help="choose type of analysis", required=True) - parser.add_argument("--input", "-i", dest="input_file", - help="results input file", required=True) + parser.add_argument( + "--database-analysis", "-d", dest="database_analysis", help="analysis database to be used", required=True + ) + parser.add_argument("--analysis", "-a", dest="type_ana", help="choose type of analysis", required=True) + parser.add_argument("--input", "-i", dest="input_file", help="results input file", required=True) args = parser.parse_args() typean = args.type_ana - shape = typean[len("jet_"):] + shape = typean[len("jet_") :] 
print(f"Shape: {shape}") i_shape = 0 if shape == "zg" else 1 if shape == "rg" else 2 print(f"Index {i_shape}") @@ -75,39 +90,35 @@ def main(): # pylint: disable=too-many-locals, too-many-statements, too-many-bra var1ranges.append(lpt_finbinmax[-1]) # second variable (jet pt) - v_var2_binning = datap["analysis"][typean]["var_binning2"] # name + v_var2_binning = datap["analysis"][typean]["var_binning2"] # name lvar2_binmin_reco = datap["analysis"][typean].get("sel_binmin2_reco", None) lvar2_binmax_reco = datap["analysis"][typean].get("sel_binmax2_reco", None) - p_nbin2_reco = len(lvar2_binmin_reco) # number of reco bins + p_nbin2_reco = len(lvar2_binmin_reco) # number of reco bins lvar2_binmin_gen = datap["analysis"][typean].get("sel_binmin2_gen", None) lvar2_binmax_gen = datap["analysis"][typean].get("sel_binmax2_gen", None) - p_nbin2_gen = len(lvar2_binmin_gen) # number of gen bins + p_nbin2_gen = len(lvar2_binmin_gen) # number of gen bins var2ranges_reco = lvar2_binmin_reco.copy() var2ranges_reco.append(lvar2_binmax_reco[-1]) - var2binarray_reco = array("d", var2ranges_reco) # array of bin edges to use in histogram constructors + var2binarray_reco = array("d", var2ranges_reco) # array of bin edges to use in histogram constructors var2ranges_gen = lvar2_binmin_gen.copy() var2ranges_gen.append(lvar2_binmax_gen[-1]) - var2binarray_gen = array("d", var2ranges_gen) # array of bin edges to use in histogram constructors + var2binarray_gen = array("d", var2ranges_gen) # array of bin edges to use in histogram constructors # observable (z, shape,...) - v_varshape_binning = datap["analysis"][typean]["var_binningshape"] # name (reco) - v_varshape_binning_gen = datap["analysis"][typean]["var_binningshape_gen"] # name (gen) - lvarshape_binmin_reco = \ - datap["analysis"][typean].get("sel_binminshape_reco", None) - lvarshape_binmax_reco = \ - datap["analysis"][typean].get("sel_binmaxshape_reco", None) - p_nbinshape_reco = len(lvarshape_binmin_reco) # number of reco bins - lvarshape_binmin_gen = \ - datap["analysis"][typean].get("sel_binminshape_gen", None) - lvarshape_binmax_gen = \ - datap["analysis"][typean].get("sel_binmaxshape_gen", None) - p_nbinshape_gen = len(lvarshape_binmin_gen) # number of gen bins + v_varshape_binning = datap["analysis"][typean]["var_binningshape"] # name (reco) + v_varshape_binning_gen = datap["analysis"][typean]["var_binningshape_gen"] # name (gen) + lvarshape_binmin_reco = datap["analysis"][typean].get("sel_binminshape_reco", None) + lvarshape_binmax_reco = datap["analysis"][typean].get("sel_binmaxshape_reco", None) + p_nbinshape_reco = len(lvarshape_binmin_reco) # number of reco bins + lvarshape_binmin_gen = datap["analysis"][typean].get("sel_binminshape_gen", None) + lvarshape_binmax_gen = datap["analysis"][typean].get("sel_binmaxshape_gen", None) + p_nbinshape_gen = len(lvarshape_binmin_gen) # number of gen bins varshaperanges_reco = lvarshape_binmin_reco.copy() varshaperanges_reco.append(lvarshape_binmax_reco[-1]) - varshapebinarray_reco = array("d", varshaperanges_reco) # array of bin edges to use in histogram constructors + varshapebinarray_reco = array("d", varshaperanges_reco) # array of bin edges to use in histogram constructors varshaperanges_gen = lvarshape_binmin_gen.copy() varshaperanges_gen.append(lvarshape_binmax_gen[-1]) - varshapebinarray_gen = array("d", varshaperanges_gen) # array of bin edges to use in histogram constructors + varshapebinarray_gen = array("d", varshaperanges_gen) # array of bin edges to use in histogram constructors file_results = 
TFile.Open(file_in)
if not file_results:
@@ -211,16 +222,16 @@ def main(): # pylint: disable=too-many-locals, too-many-statements, too-many-bra
offsets_axes_double = [0.8, 0.8]
margins_can = [0.1, 0.13, 0.1, 0.03]
margins_can_double = [0.1, 0.1, 0.1, 0.1]
- margins_can_double = [0., 0., 0., 0.]
+ margins_can_double = [0.0, 0.0, 0.0, 0.0]
size_thg = 0.05
offset_thg = 0.85
- gStyle.SetErrorX(0) # do not plot horizontal error bars of histograms
+ gStyle.SetErrorX(0)  # do not plot horizontal error bars of histograms
fontsize = 0.035
opt_leg_g = "FP"
opt_plot_g = "2"
- list_new = [] # list to avoid loosing objects created in loops
+ list_new = []  # list to avoid losing objects created in loops

# labels

@@ -241,8 +252,17 @@ def main(): # pylint: disable=too-many-locals, too-many-statements, too-many-bra
text_pythia_split = "#splitline{PYTHIA 8}{(Monash)}"
text_powheg = "POWHEG"
text_jets = "charged jets, anti-#it{k}_{T}, #it{R} = 0.4"
- text_ptjet = "%g #leq %s < %g GeV/#it{c}, |#it{#eta}_{jet}| #leq 0.5" % (lvar2_binmin_reco[ibin2], p_latexbin2var, lvar2_binmax_reco[ibin2])
- text_pth = "%g #leq #it{p}_{T}^{%s} < %g GeV/#it{c}, |#it{y}_{%s}| #leq 0.8" % (lpt_finbinmin[0], p_latexnhadron, min(lpt_finbinmax[-1], lvar2_binmax_reco[ibin2]), p_latexnhadron)
+ text_ptjet = "%g #leq %s < %g GeV/#it{c}, |#it{#eta}_{jet}| #leq 0.5" % (
+ lvar2_binmin_reco[ibin2],
+ p_latexbin2var,
+ lvar2_binmax_reco[ibin2],
+ )
+ text_pth = "%g #leq #it{p}_{T}^{%s} < %g GeV/#it{c}, |#it{y}_{%s}| #leq 0.8" % (
+ lpt_finbinmin[0],
+ p_latexnhadron,
+ min(lpt_finbinmax[-1], lvar2_binmax_reco[ibin2]),
+ p_latexnhadron,
+ )
text_ptcut = "#it{p}_{T, incl. ch. jet}^{leading track} #geq 5.33 GeV/#it{c}"
text_ptcut_sim = "#it{p}_{T, incl. ch. jet}^{leading h^{#pm}} #geq 5.33 GeV/#it{c} (varied)"
text_sd = "Soft Drop (#it{z}_{cut} = 0.1, #it{#beta} = 0)"
@@ -270,18 +290,31 @@ def main(): # pylint: disable=too-many-locals, too-many-statements, too-many-bra

# make the horizontal error bars smaller
if shape == "nsd":
- for gr in [hf_data_syst, incl_data_syst, hf_powheg_syst, hf_ratio_syst, incl_ratio_syst, incl_pythia_syst, quark_pythia_syst, gluon_pythia_syst]:
+ for gr in [
+ hf_data_syst,
+ incl_data_syst,
+ hf_powheg_syst,
+ hf_ratio_syst,
+ incl_ratio_syst,
+ incl_pythia_syst,
+ quark_pythia_syst,
+ gluon_pythia_syst,
+ ]:
for i in range(gr.GetN()):
gr.SetPointEXlow(i, 0.1)
gr.SetPointEXhigh(i, 0.1)

# Scale PYTHIA to adjust normalisation for the missing entries in the untagged bin of zg and rg
- for his, name in zip((hf_data_stat, incl_data_stat, hf_pythia_stat, incl_pythia_stat, quark_pythia_stat, gluon_pythia_stat),
- ("data HF", "data incl.", "MC HF", "MC incl.", "MC quark", "MC gluon")):
- print(f"Integral of {shape} {name} = {his.Integral(1, his.GetNbinsX(), 'width')}, "
- f"range: {his.GetXaxis().GetXmin()} - {his.GetXaxis().GetXmax()}, "
- f"untagged fraction = {his.Integral(1, 1, 'width')}")
+ for his, name in zip(
+ (hf_data_stat, incl_data_stat, hf_pythia_stat, incl_pythia_stat, quark_pythia_stat, gluon_pythia_stat),
+ ("data HF", "data incl.", "MC HF", "MC incl.", "MC quark", "MC gluon"),
+ ):
+ print(
+ f"Integral of {shape} {name} = {his.Integral(1, his.GetNbinsX(), 'width')}, "
+ f"range: {his.GetXaxis().GetXmin()} - {his.GetXaxis().GetXmax()}, "
+ f"untagged fraction = {his.Integral(1, 1, 'width')}"
+ )

# untagged fractions obtained from the first bin of nsd
frac_untag_hf = 0.18552197557279143
@@ -290,33 +323,50 @@ def main(): # pylint: disable=too-many-locals, too-many-statements, too-many-bra
frac_untag_gluon 
= 0.014728195998301162 if shape in ("zg", "rg"): - for his, gr, frac in zip((hf_pythia_stat, incl_pythia_stat, quark_pythia_stat, gluon_pythia_stat), - (None, incl_pythia_syst, quark_pythia_syst, gluon_pythia_syst), - (frac_untag_hf, frac_untag_incl, frac_untag_quark, frac_untag_gluon)): + for his, gr, frac in zip( + (hf_pythia_stat, incl_pythia_stat, quark_pythia_stat, gluon_pythia_stat), + (None, incl_pythia_syst, quark_pythia_syst, gluon_pythia_syst), + (frac_untag_hf, frac_untag_incl, frac_untag_quark, frac_untag_gluon), + ): f = 1 - frac his.Scale(f) scale_graph(gr, f) # Check that the integral after scaling is consistent with the missing untagged fraction. - for his, name, frac in zip((hf_pythia_stat, incl_pythia_stat, quark_pythia_stat, gluon_pythia_stat), - ("MC HF", "MC incl.", "MC quark", "MC gluon"), - (frac_untag_hf, frac_untag_incl, frac_untag_quark, frac_untag_gluon)): - print(f"Integral of {shape} {name} after scaling + untagged fraction = {his.Integral(1, his.GetNbinsX(), 'width') + frac}") + for his, name, frac in zip( + (hf_pythia_stat, incl_pythia_stat, quark_pythia_stat, gluon_pythia_stat), + ("MC HF", "MC incl.", "MC quark", "MC gluon"), + (frac_untag_hf, frac_untag_incl, frac_untag_quark, frac_untag_gluon), + ): + print( + f"Integral of {shape} {name} after scaling + untagged fraction = {his.Integral(1, his.GetNbinsX(), 'width') + frac}" + ) # data, HF and inclusive hf_data_syst_cl = hf_data_syst.Clone() - leg_pos = [.72, .75, .85, .85] + leg_pos = [0.72, 0.75, 0.85, 0.85] list_obj = [hf_data_syst, incl_data_syst, hf_data_stat, incl_data_stat] labels_obj = ["%s-tagged" % p_latexnhadron, "inclusive", "", ""] colours = [get_colour(i, j) for i, j in zip((c_hf_data, c_incl_data, c_hf_data, c_incl_data), (2, 2, 1, 1))] markers = [m_hf_data, m_incl_data, m_hf_data, m_incl_data] y_margin_up = 0.46 y_margin_down = 0.05 - cshape_data, list_obj_data_new = make_plot("cshape_data_" + suffix, size=size_can, \ - list_obj=list_obj, labels_obj=labels_obj, opt_leg_g=opt_leg_g, opt_plot_g=opt_plot_g, offsets_xy=offsets_axes, \ - colours=colours, markers=markers, leg_pos=leg_pos, margins_y=[y_margin_down, y_margin_up], margins_c=margins_can, \ - title=title_full) + cshape_data, list_obj_data_new = make_plot( + "cshape_data_" + suffix, + size=size_can, + list_obj=list_obj, + labels_obj=labels_obj, + opt_leg_g=opt_leg_g, + opt_plot_g=opt_plot_g, + offsets_xy=offsets_axes, + colours=colours, + markers=markers, + leg_pos=leg_pos, + margins_y=[y_margin_down, y_margin_up], + margins_c=margins_can, + title=title_full, + ) for gr, c in zip((hf_data_syst, incl_data_syst), (c_hf_data, c_incl_data)): gr.SetMarkerColor(get_colour(c)) list_obj_data_new[0].SetTextSize(fontsize) @@ -362,17 +412,30 @@ def main(): # pylint: disable=too-many-locals, too-many-statements, too-many-bra # data and PYTHIA, POWHEG, HF - leg_pos = [.72, .65, .85, .85] + leg_pos = [0.72, 0.65, 0.85, 0.85] list_obj = [hf_data_syst_cl, hf_powheg_syst, hf_data_stat, hf_pythia_stat, hf_powheg_stat] labels_obj = ["data", text_powheg, "", text_pythia_split, ""] - colours = [get_colour(i, j) for i, j in zip((c_hf_data, c_hf_powheg, c_hf_data, c_hf_pythia, c_hf_powheg), (2, 2, 1, 1, 1))] + colours = [ + get_colour(i, j) for i, j in zip((c_hf_data, c_hf_powheg, c_hf_data, c_hf_pythia, c_hf_powheg), (2, 2, 1, 1, 1)) + ] markers = [m_hf_data, m_hf_powheg, m_hf_data, m_hf_pythia, m_hf_powheg] y_margin_up = 0.4 y_margin_down = 0.05 - cshape_data_mc_hf, list_obj_data_mc_hf_new = make_plot("cshape_data_mc_hf_" + suffix, size=size_can, \ - 
list_obj=list_obj, labels_obj=labels_obj, opt_leg_g=opt_leg_g, opt_plot_g=opt_plot_g, offsets_xy=offsets_axes, \ - colours=colours, markers=markers, leg_pos=leg_pos, margins_y=[y_margin_down, y_margin_up], margins_c=margins_can, \ - title=title_full) + cshape_data_mc_hf, list_obj_data_mc_hf_new = make_plot( + "cshape_data_mc_hf_" + suffix, + size=size_can, + list_obj=list_obj, + labels_obj=labels_obj, + opt_leg_g=opt_leg_g, + opt_plot_g=opt_plot_g, + offsets_xy=offsets_axes, + colours=colours, + markers=markers, + leg_pos=leg_pos, + margins_y=[y_margin_down, y_margin_up], + margins_c=margins_can, + title=title_full, + ) for gr, c in zip([hf_data_syst_cl, hf_powheg_syst], [c_hf_data, c_hf_powheg]): gr.SetMarkerColor(get_colour(c)) leg_data_mc_hf = list_obj_data_mc_hf_new[0] @@ -380,13 +443,13 @@ def main(): # pylint: disable=too-many-locals, too-many-statements, too-many-bra leg_data_mc_hf.SetTextSize(fontsize) if shape == "nsd": hf_data_syst_cl.GetXaxis().SetNdivisions(5) - #axis_nsd = hf_data_syst_cl.GetHistogram().GetXaxis() - #x1 = axis_nsd.GetBinLowEdge(1) - #x2 = axis_nsd.GetBinUpEdge(axis_nsd.GetNbins()) - #axis_nsd.Set(5, x1, x2) - #for ibin in range(axis_nsd.GetNbins()): + # axis_nsd = hf_data_syst_cl.GetHistogram().GetXaxis() + # x1 = axis_nsd.GetBinLowEdge(1) + # x2 = axis_nsd.GetBinUpEdge(axis_nsd.GetNbins()) + # axis_nsd.Set(5, x1, x2) + # for ibin in range(axis_nsd.GetNbins()): # axis_nsd.SetBinLabel(ibin + 1, "%d" % ibin) - #axis_nsd.SetNdivisions(5) + # axis_nsd.SetNdivisions(5) cshape_data_mc_hf.Update() if shape == "rg": # plot the theta_g axis @@ -419,17 +482,28 @@ def main(): # pylint: disable=too-many-locals, too-many-statements, too-many-bra # data and PYTHIA, inclusive - #leg_pos = [.68, .65, .85, .85] + # leg_pos = [.68, .65, .85, .85] list_obj = [incl_data_syst, incl_pythia_syst, incl_data_stat, incl_pythia_stat] labels_obj = ["data", text_pythia_split] colours = [get_colour(i, j) for i, j in zip((c_incl_data, c_incl_pythia, c_incl_data, c_incl_pythia), (2, 2, 1, 1))] markers = [m_incl_data, m_incl_pythia, m_incl_data, m_incl_pythia] y_margin_up = 0.4 y_margin_down = 0.05 - cshape_data_mc_incl, list_obj_data_mc_incl_new = make_plot("cshape_data_mc_incl_" + suffix, size=size_can, \ - list_obj=list_obj, labels_obj=labels_obj, opt_leg_g=opt_leg_g, opt_plot_g=opt_plot_g, offsets_xy=offsets_axes, \ - colours=colours, markers=markers, leg_pos=leg_pos, margins_y=[y_margin_down, y_margin_up], margins_c=margins_can, \ - title=title_full) + cshape_data_mc_incl, list_obj_data_mc_incl_new = make_plot( + "cshape_data_mc_incl_" + suffix, + size=size_can, + list_obj=list_obj, + labels_obj=labels_obj, + opt_leg_g=opt_leg_g, + opt_plot_g=opt_plot_g, + offsets_xy=offsets_axes, + colours=colours, + markers=markers, + leg_pos=leg_pos, + margins_y=[y_margin_down, y_margin_up], + margins_c=margins_can, + title=title_full, + ) for gr, c in zip([incl_data_syst, incl_pythia_syst], [c_incl_data, c_incl_pythia]): gr.SetMarkerColor(get_colour(c)) leg_data_mc_incl = list_obj_data_mc_incl_new[0] @@ -474,8 +548,8 @@ def main(): # pylint: disable=too-many-locals, too-many-statements, too-many-bra line_1.SetLineColor(1) line_1.SetLineWidth(3) - #leg_pos = [.72, .7, .85, .85] # with header - leg_pos = [.72, .75, .85, .85] # without header + # leg_pos = [.72, .7, .85, .85] # with header + leg_pos = [0.72, 0.75, 0.85, 0.85] # without header list_obj = [hf_ratio_syst, line_1, incl_ratio_syst, hf_ratio_stat, incl_ratio_stat] labels_obj = ["%s-tagged" % p_latexnhadron, "inclusive"] colours = 
[get_colour(i, j) for i, j in zip((c_hf_data, c_incl_data, c_hf_data, c_incl_data), (2, 2, 1, 1))] @@ -484,16 +558,27 @@ def main(): # pylint: disable=too-many-locals, too-many-statements, too-many-bra y_margin_down = 0.05 if shape == "nsd": y_margin_up = 0.22 - cshape_ratio, list_obj_ratio_new = make_plot("cshape_ratio_" + suffix, size=size_can, \ - list_obj=list_obj, labels_obj=labels_obj, opt_leg_g=opt_leg_g, opt_plot_g=opt_plot_g, offsets_xy=offsets_axes, \ - colours=colours, markers=markers, leg_pos=leg_pos, margins_y=[y_margin_down, y_margin_up], margins_c=margins_can, \ - title=title_full_ratio) + cshape_ratio, list_obj_ratio_new = make_plot( + "cshape_ratio_" + suffix, + size=size_can, + list_obj=list_obj, + labels_obj=labels_obj, + opt_leg_g=opt_leg_g, + opt_plot_g=opt_plot_g, + offsets_xy=offsets_axes, + colours=colours, + markers=markers, + leg_pos=leg_pos, + margins_y=[y_margin_down, y_margin_up], + margins_c=margins_can, + title=title_full_ratio, + ) cshape_ratio.Update() for gr, c in zip((hf_ratio_syst, incl_ratio_syst), (c_hf_data, c_incl_data)): gr.SetMarkerColor(get_colour(c)) leg_ratio = list_obj_ratio_new[0] leg_ratio.SetTextSize(fontsize) - #leg_ratio.SetHeader("data/MC") + # leg_ratio.SetHeader("data/MC") if shape == "nsd": hf_ratio_syst.GetXaxis().SetNdivisions(5) cshape_ratio.Update() @@ -544,22 +629,57 @@ def main(): # pylint: disable=too-many-locals, too-many-statements, too-many-bra x_min = min(x_min_h, x_min_g) x_max = max(x_max_h, x_max_g) # explicit y ranges [zg, rg, nsd] - list_range_x = [[0.1, 0.5], [0., 0.4], [-0.5, 4.5]] # data - - #leg_pos = [.6, .65, .75, .85] - leg_pos = [.72, .55, .85, .85] - list_obj = [incl_pythia_syst, quark_pythia_syst, gluon_pythia_syst, hf_pythia_stat, incl_pythia_stat, quark_pythia_stat, gluon_pythia_stat] + list_range_x = [[0.1, 0.5], [0.0, 0.4], [-0.5, 4.5]] # data + + # leg_pos = [.6, .65, .75, .85] + leg_pos = [0.72, 0.55, 0.85, 0.85] + list_obj = [ + incl_pythia_syst, + quark_pythia_syst, + gluon_pythia_syst, + hf_pythia_stat, + incl_pythia_stat, + quark_pythia_stat, + gluon_pythia_stat, + ] labels_obj = ["inclusive", "quark", "gluon", "%s-tagged" % p_latexnhadron] - colours = [get_colour(i, j) for i, j in zip((c_incl_pythia, c_quark_pythia, c_gluon_pythia, c_hf_pythia, c_incl_pythia, c_quark_pythia, c_gluon_pythia), (2, 2, 2, 1, 1, 1, 1))] - markers = [m_incl_pythia, m_quark_pythia, m_gluon_pythia, m_hf_pythia, m_incl_pythia, m_quark_pythia, m_gluon_pythia] + colours = [ + get_colour(i, j) + for i, j in zip( + (c_incl_pythia, c_quark_pythia, c_gluon_pythia, c_hf_pythia, c_incl_pythia, c_quark_pythia, c_gluon_pythia), + (2, 2, 2, 1, 1, 1, 1), + ) + ] + markers = [ + m_incl_pythia, + m_quark_pythia, + m_gluon_pythia, + m_hf_pythia, + m_incl_pythia, + m_quark_pythia, + m_gluon_pythia, + ] y_margin_up = 0.46 y_margin_down = 0.05 - cshape_mc, list_obj_mc_new = make_plot("cshape_mc_" + suffix, size=size_can, \ - list_obj=list_obj, labels_obj=labels_obj, opt_leg_g=opt_leg_g, opt_plot_g=opt_plot_g, offsets_xy=offsets_axes, \ - colours=colours, markers=markers, leg_pos=leg_pos, range_y=[y_min_plot, y_max_plot], margins_c=margins_can, \ - title=title_full) + cshape_mc, list_obj_mc_new = make_plot( + "cshape_mc_" + suffix, + size=size_can, + list_obj=list_obj, + labels_obj=labels_obj, + opt_leg_g=opt_leg_g, + opt_plot_g=opt_plot_g, + offsets_xy=offsets_axes, + colours=colours, + markers=markers, + leg_pos=leg_pos, + range_y=[y_min_plot, y_max_plot], + margins_c=margins_can, + title=title_full, + ) cshape_mc.Update() - for gr, c in 
zip((incl_pythia_syst, quark_pythia_syst, gluon_pythia_syst), (c_incl_pythia, c_quark_pythia, c_gluon_pythia)): + for gr, c in zip( + (incl_pythia_syst, quark_pythia_syst, gluon_pythia_syst), (c_incl_pythia, c_quark_pythia, c_gluon_pythia) + ): gr.SetMarkerColor(get_colour(c)) leg_mc = list_obj_mc_new[0] leg_mc.SetTextSize(fontsize) @@ -598,18 +718,33 @@ def main(): # pylint: disable=too-many-locals, too-many-statements, too-many-bra # PYTHIA, HF, quark, gluon - #leg_pos = [.6, .65, .75, .85] - leg_pos = [.72, .61, .85, .85] + # leg_pos = [.6, .65, .75, .85] + leg_pos = [0.72, 0.61, 0.85, 0.85] list_obj = [quark_pythia_syst, gluon_pythia_syst, hf_pythia_stat, quark_pythia_stat, gluon_pythia_stat] labels_obj = ["quark", "gluon", "%s-tagged" % p_latexnhadron] - colours = [get_colour(i, j) for i, j in zip((c_quark_pythia, c_gluon_pythia, c_hf_pythia, c_quark_pythia, c_gluon_pythia), (2, 2, 1, 1, 1))] + colours = [ + get_colour(i, j) + for i, j in zip((c_quark_pythia, c_gluon_pythia, c_hf_pythia, c_quark_pythia, c_gluon_pythia), (2, 2, 1, 1, 1)) + ] markers = [m_quark_pythia, m_gluon_pythia, m_hf_pythia, m_quark_pythia, m_gluon_pythia] y_margin_up = 0.46 y_margin_down = 0.05 - cshape_mc, list_obj_mc_new = make_plot("cshape_mc_qgd_" + suffix, size=size_can, \ - list_obj=list_obj, labels_obj=labels_obj, opt_leg_g=opt_leg_g, opt_plot_g=opt_plot_g, offsets_xy=offsets_axes, \ - colours=colours, markers=markers, leg_pos=leg_pos, range_x=list_range_x[i_shape], range_y=[y_min_plot, y_max_plot], margins_c=margins_can, \ - title=title_full) + cshape_mc, list_obj_mc_new = make_plot( + "cshape_mc_qgd_" + suffix, + size=size_can, + list_obj=list_obj, + labels_obj=labels_obj, + opt_leg_g=opt_leg_g, + opt_plot_g=opt_plot_g, + offsets_xy=offsets_axes, + colours=colours, + markers=markers, + leg_pos=leg_pos, + range_x=list_range_x[i_shape], + range_y=[y_min_plot, y_max_plot], + margins_c=margins_can, + title=title_full, + ) cshape_mc.Update() for gr, c in zip((quark_pythia_syst, gluon_pythia_syst), (c_quark_pythia, c_gluon_pythia)): gr.SetMarkerColor(get_colour(c)) @@ -652,18 +787,29 @@ def main(): # pylint: disable=too-many-locals, too-many-statements, too-many-bra # PYTHIA, HF, inclusive - #leg_pos = [.6, .65, .75, .85] - leg_pos = [.72, .67, .85, .85] + # leg_pos = [.6, .65, .75, .85] + leg_pos = [0.72, 0.67, 0.85, 0.85] list_obj = [incl_pythia_syst_cl, incl_pythia_stat, hf_pythia_stat] labels_obj = ["inclusive", "", "%s-tagged" % p_latexnhadron] colours = [get_colour(i, j) for i, j in zip((c_incl_pythia, c_incl_pythia, c_hf_pythia), (2, 1, 1))] markers = [m_incl_pythia, m_incl_pythia, m_hf_pythia] y_margin_up = 0.46 y_margin_down = 0.05 - cshape_mc, list_obj_mc_new = make_plot("cshape_mc_id_" + suffix, size=size_can, \ - list_obj=list_obj, labels_obj=labels_obj, opt_leg_g=opt_leg_g, opt_plot_g=opt_plot_g, offsets_xy=offsets_axes, \ - colours=colours, markers=markers, leg_pos=leg_pos, range_y=[y_min_plot, y_max_plot], margins_c=margins_can, \ - title=title_full) + cshape_mc, list_obj_mc_new = make_plot( + "cshape_mc_id_" + suffix, + size=size_can, + list_obj=list_obj, + labels_obj=labels_obj, + opt_leg_g=opt_leg_g, + opt_plot_g=opt_plot_g, + offsets_xy=offsets_axes, + colours=colours, + markers=markers, + leg_pos=leg_pos, + range_y=[y_min_plot, y_max_plot], + margins_c=margins_can, + title=title_full, + ) # Draw a line through the points. 
if shape == "nsd": for h in (incl_pythia_stat, hf_pythia_stat): @@ -710,20 +856,46 @@ def main(): # pylint: disable=too-many-locals, too-many-statements, too-many-bra # data inclusive vs PYTHIA, quark, gluon - #leg_pos = [.6, .65, .75, .85] - #leg_pos = [.72, .55, .85, .85] - leg_pos = [.6, .7, .85, .85] - list_obj = [incl_data_syst, quark_pythia_syst, gluon_pythia_syst, incl_data_stat, quark_pythia_stat, gluon_pythia_stat] + # leg_pos = [.6, .65, .75, .85] + # leg_pos = [.72, .55, .85, .85] + leg_pos = [0.6, 0.7, 0.85, 0.85] + list_obj = [ + incl_data_syst, + quark_pythia_syst, + gluon_pythia_syst, + incl_data_stat, + quark_pythia_stat, + gluon_pythia_stat, + ] labels_obj = ["inclusive (data)", "quark (PYTHIA 8)", "gluon (PYTHIA 8)"] - colours = [get_colour(i, j) for i, j in zip((c_incl_data, c_quark_pythia, c_gluon_pythia, c_incl_data, c_quark_pythia, c_gluon_pythia), (2, 2, 2, 1, 1, 1))] + colours = [ + get_colour(i, j) + for i, j in zip( + (c_incl_data, c_quark_pythia, c_gluon_pythia, c_incl_data, c_quark_pythia, c_gluon_pythia), + (2, 2, 2, 1, 1, 1), + ) + ] markers = [m_incl_data, m_quark_pythia, m_gluon_pythia, m_incl_data, m_quark_pythia, m_gluon_pythia] y_margin_up = 0.3 y_margin_down = 0.05 - cshape_mc, list_obj_mc_new = make_plot("cshape_mc_data_iqg" + suffix, size=size_can, \ - list_obj=list_obj, labels_obj=labels_obj, opt_leg_g=opt_leg_g, opt_plot_g=opt_plot_g, offsets_xy=offsets_axes, \ - colours=colours, markers=markers, leg_pos=leg_pos, margins_y=[y_margin_down, y_margin_up], margins_c=margins_can, \ - title=title_full) - for gr, c in zip((incl_data_syst, quark_pythia_syst, gluon_pythia_syst), (c_incl_data, c_quark_pythia, c_gluon_pythia)): + cshape_mc, list_obj_mc_new = make_plot( + "cshape_mc_data_iqg" + suffix, + size=size_can, + list_obj=list_obj, + labels_obj=labels_obj, + opt_leg_g=opt_leg_g, + opt_plot_g=opt_plot_g, + offsets_xy=offsets_axes, + colours=colours, + markers=markers, + leg_pos=leg_pos, + margins_y=[y_margin_down, y_margin_up], + margins_c=margins_can, + title=title_full, + ) + for gr, c in zip( + (incl_data_syst, quark_pythia_syst, gluon_pythia_syst), (c_incl_data, c_quark_pythia, c_gluon_pythia) + ): gr.SetMarkerColor(get_colour(c)) leg_mc = list_obj_mc_new[0] leg_mc.SetTextSize(fontsize) @@ -737,8 +909,8 @@ def main(): # pylint: disable=too-many-locals, too-many-statements, too-many-bra print(f"Rel. syst. unc. for {name} {shape}") e_plus_min = float("inf") e_minus_min = float("inf") - e_plus_max = 0. - e_minus_max = 0. 
+ e_plus_max = 0.0 + e_minus_max = 0.0 for i in range(gr.GetN()): y = gr.GetPointY(i) e_plus = 100 * gr.GetErrorYhigh(i) @@ -755,11 +927,11 @@ def main(): # pylint: disable=too-many-locals, too-many-statements, too-many-bra print(f"Absolutes: min: {min(e_plus_min, e_minus_min):.2g} %, max {max(e_plus_max, e_minus_max):.2g} %") # explicit y ranges [zg, rg, nsd] - list_range_y = [[0, 9], [0, 6], [0, 0.7]] # data - list_range_y_rat = [[0, 2], [0, 2], [0, 2]] # mc/data ratios + list_range_y = [[0, 9], [0, 6], [0, 0.7]] # data + list_range_y_rat = [[0, 2], [0, 2], [0, 2]] # mc/data ratios # data - leg_pos = [.7, .75, .82, .85] + leg_pos = [0.7, 0.75, 0.82, 0.85] list_obj = [hf_data_syst, incl_data_syst, hf_data_stat, incl_data_stat] labels_obj = ["%s-tagged" % p_latexnhadron, "inclusive", "", ""] colours = [get_colour(i, j) for i, j in zip((c_hf_data, c_incl_data, c_hf_data, c_incl_data), (2, 2, 1, 1))] @@ -770,28 +942,39 @@ def main(): # pylint: disable=too-many-locals, too-many-statements, too-many-bra cshape_datamc_all.Divide(1, 2) pad1 = cshape_datamc_all.cd(1) pad2 = cshape_datamc_all.cd(2) - pad1.SetPad(0., 0.3, 1, 1) - pad2.SetPad(0., 0., 1, 0.3) - pad1.SetBottomMargin(0.) + pad1.SetPad(0.0, 0.3, 1, 1) + pad2.SetPad(0.0, 0.0, 1, 0.3) + pad1.SetBottomMargin(0.0) pad2.SetBottomMargin(0.25) pad1.SetTopMargin(0.1) - pad2.SetTopMargin(0.) + pad2.SetTopMargin(0.0) pad1.SetLeftMargin(0.12) pad2.SetLeftMargin(0.12) pad1.SetTicks(1, 1) pad2.SetTicks(1, 1) - cshape_datamc_all, list_obj_data_new = make_plot("cshape_datamc_" + suffix, size=size_can_double, \ - can=cshape_datamc_all, pad=1, \ - list_obj=list_obj, labels_obj=labels_obj, opt_leg_g=opt_leg_g, opt_plot_g=opt_plot_g, offsets_xy=[0.8, 1.1], \ - colours=colours, markers=markers, leg_pos=leg_pos, margins_c=margins_can_double, \ + cshape_datamc_all, list_obj_data_new = make_plot( + "cshape_datamc_" + suffix, + size=size_can_double, + can=cshape_datamc_all, + pad=1, + list_obj=list_obj, + labels_obj=labels_obj, + opt_leg_g=opt_leg_g, + opt_plot_g=opt_plot_g, + offsets_xy=[0.8, 1.1], + colours=colours, + markers=markers, + leg_pos=leg_pos, + margins_c=margins_can_double, # margins_y=[y_margin_down, y_margin_up], \ - range_y=list_range_y[i_shape], \ - title=title_full) + range_y=list_range_y[i_shape], + title=title_full, + ) for gr, c in zip((hf_data_syst, incl_data_syst), (c_hf_data, c_incl_data)): gr.SetMarkerColor(get_colour(c)) list_obj_data_new[0].SetTextSize(fontsize) - hf_data_syst.GetYaxis().SetLabelSize(0.1 * 3/7) - #hf_data_syst.GetYaxis().SetTitleSize(0.1) + hf_data_syst.GetYaxis().SetLabelSize(0.1 * 3 / 7) + # hf_data_syst.GetYaxis().SetTitleSize(0.1) if shape == "nsd": hf_data_syst.GetXaxis().SetNdivisions(5) # Draw a line through the points. 
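[Editor's note] The hunk above assembles the two-panel figure by hand: Divide(1, 2), then SetPad() with a 0.7/0.3 vertical split and zeroed inner margins so the spectra panel and the ratio panel share one frame. A minimal sketch of that recurring pattern; the helper name make_split_canvas and the frac parameter are illustrative assumptions, not part of this patch:

    from ROOT import TCanvas  # pylint: disable=import-error,no-name-in-module

    def make_split_canvas(name, frac=0.3):
        """Illustrative only: a tall top pad for spectra and a short bottom pad for ratios."""
        can = TCanvas(name, name)
        can.Divide(1, 2)
        pad1 = can.cd(1)
        pad2 = can.cd(2)
        pad1.SetPad(0.0, frac, 1.0, 1.0)  # top pad covers the upper (1 - frac) of the canvas
        pad2.SetPad(0.0, 0.0, 1.0, frac)  # bottom pad covers the lower frac
        pad1.SetBottomMargin(0.0)  # zeroed inner margins glue the two pads together
        pad2.SetTopMargin(0.0)
        pad2.SetBottomMargin(0.25)  # room for the shared x-axis title and labels
        for pad in (pad1, pad2):
            pad.SetLeftMargin(0.12)
            pad.SetTicks(1, 1)  # tick marks on all four sides
        return can, pad1, pad2

With the inner margins at zero, axis labels drawn in the bottom pad serve both panels, which is why the code below rescales label and title sizes by factors of 3/7 and 7/3 between the pads.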
@@ -832,7 +1015,7 @@ def main(): # pylint: disable=too-many-locals, too-many-statements, too-many-bra cshape_datamc_all.Update() # MC/data - leg_pos = [.15, .8, .85, .95] + leg_pos = [0.15, 0.8, 0.85, 0.95] hf_ratio_powheg_stat = hf_powheg_stat.Clone(f"{hf_powheg_stat.GetName()}_rat") hf_ratio_powheg_stat.Divide(hf_data_stat) hf_ratio_powheg_syst = divide_graphs(hf_powheg_syst, hf_data_syst) @@ -844,9 +1027,9 @@ def main(): # pylint: disable=too-many-locals, too-many-statements, too-many-bra hf_pythia_stat_zero = hf_pythia_stat.Clone(f"{hf_pythia_stat.GetName()}_zero") for i in range(hf_pythia_stat_zero.GetNbinsX()): hf_pythia_stat_zero.SetBinError(i + 1, 0) - gStyle.SetErrorX(0.5) # we have to restore the histogram bin width to propagate it to graph - hf_pythia_syst = TGraphAsymmErrors(hf_pythia_stat_zero) # convert histogram into a graph - gStyle.SetErrorX(0) # set back the intended settings + gStyle.SetErrorX(0.5) # we have to restore the histogram bin width to propagate it to graph + hf_pythia_syst = TGraphAsymmErrors(hf_pythia_stat_zero) # convert histogram into a graph + gStyle.SetErrorX(0) # set back the intended settings hf_ratio_pythia_syst = divide_graphs(hf_pythia_syst, hf_data_syst) # hf_ratio_pythia_syst = divide_graphs(hf_data_syst, hf_pythia_syst) # version data/MC incl_ratio_pythia_stat = incl_pythia_stat.Clone(f"{incl_pythia_stat.GetName()}_rat") @@ -855,43 +1038,77 @@ def main(): # pylint: disable=too-many-locals, too-many-statements, too-many-bra # incl_ratio_pythia_stat = incl_data_stat.Clone(f"{incl_data_stat.GetName()}_rat") # version data/MC # incl_ratio_pythia_stat.Divide(incl_pythia_stat) # version data/MC # incl_ratio_pythia_syst = divide_graphs(incl_data_syst, incl_pythia_syst) # version data/MC - list_obj = [hf_ratio_powheg_syst, hf_ratio_pythia_syst, incl_ratio_pythia_syst, hf_ratio_powheg_stat, hf_ratio_pythia_stat, incl_ratio_pythia_stat, line_1] - labels_obj = [text_powheg, f"{p_latexnhadron}-tagged {text_pythia_short}", f"inclusive {text_pythia_short}", "", "", ""] - colours = [get_colour(i, j) for i, j in zip((c_hf_powheg, c_hf_pythia, c_incl_pythia, c_hf_powheg, c_hf_pythia, c_incl_pythia), (2, 2, 2, 1, 1, 1))] + list_obj = [ + hf_ratio_powheg_syst, + hf_ratio_pythia_syst, + incl_ratio_pythia_syst, + hf_ratio_powheg_stat, + hf_ratio_pythia_stat, + incl_ratio_pythia_stat, + line_1, + ] + labels_obj = [ + text_powheg, + f"{p_latexnhadron}-tagged {text_pythia_short}", + f"inclusive {text_pythia_short}", + "", + "", + "", + ] + colours = [ + get_colour(i, j) + for i, j in zip( + (c_hf_powheg, c_hf_pythia, c_incl_pythia, c_hf_powheg, c_hf_pythia, c_incl_pythia), (2, 2, 2, 1, 1, 1) + ) + ] markers = [m_hf_powheg, m_hf_pythia, m_incl_pythia, m_hf_powheg, m_hf_pythia, m_incl_pythia] y_margin_up = 0.2 y_margin_down = 0.05 - cshape_datamc_all, list_obj_data_mc_hf_new = make_plot("cshape_data_mc_hf_" + suffix, size=size_can_double, \ - can=cshape_datamc_all, pad=2, \ - list_obj=list_obj, labels_obj=labels_obj, opt_leg_g=opt_leg_g, opt_plot_g=opt_plot_g, offsets_xy=[1, 1.3 * 3/7], \ - colours=colours, markers=markers, leg_pos=leg_pos, margins_c=margins_can_double, \ - margins_y=[y_margin_down, y_margin_up], \ + cshape_datamc_all, list_obj_data_mc_hf_new = make_plot( + "cshape_data_mc_hf_" + suffix, + size=size_can_double, + can=cshape_datamc_all, + pad=2, + list_obj=list_obj, + labels_obj=labels_obj, + opt_leg_g=opt_leg_g, + opt_plot_g=opt_plot_g, + offsets_xy=[1, 1.3 * 3 / 7], + colours=colours, + markers=markers, + leg_pos=leg_pos, + margins_c=margins_can_double, + 
margins_y=[y_margin_down, y_margin_up], # range_y=list_range_y_rat[i_shape], \ - title=title_full_ratio_double) + title=title_full_ratio_double, + ) list_obj[0].GetXaxis().SetLabelSize(0.1) list_obj[0].GetXaxis().SetTitleSize(0.1) list_obj[0].GetYaxis().SetLabelSize(0.1) list_obj[0].GetYaxis().SetTitleSize(0.1) - for gr, c in zip([hf_ratio_powheg_syst, hf_ratio_pythia_syst, incl_ratio_pythia_syst], [c_hf_powheg, c_hf_pythia, c_incl_pythia]): + for gr, c in zip( + [hf_ratio_powheg_syst, hf_ratio_pythia_syst, incl_ratio_pythia_syst], [c_hf_powheg, c_hf_pythia, c_incl_pythia] + ): gr.SetMarkerColor(get_colour(c)) leg_data_mc_hf = list_obj_data_mc_hf_new[0] - #leg_data_mc_hf.SetHeader("%s-tagged" % p_latexnhadron) - leg_data_mc_hf.SetTextSize(fontsize * 7/3) + # leg_data_mc_hf.SetHeader("%s-tagged" % p_latexnhadron) + leg_data_mc_hf.SetTextSize(fontsize * 7 / 3) leg_data_mc_hf.SetNColumns(2) if shape == "nsd": list_obj[0].GetXaxis().SetNdivisions(5) cshape_datamc_all.Update() # Draw LaTeX - #y_latex = y_latex_top - #list_latex_data_mc_hf = [] - #for text_latex in [text_alice, text_jets, text_ptjet, text_pth, text_sd]: + # y_latex = y_latex_top + # list_latex_data_mc_hf = [] + # for text_latex in [text_alice, text_jets, text_ptjet, text_pth, text_sd]: # latex = TLatex(x_latex, y_latex, text_latex) # list_latex_data_mc_hf.append(latex) # draw_latex(latex, textsize=fontsize) # y_latex -= y_step - #cshape_datamc_all.Update() + # cshape_datamc_all.Update() pad1.RedrawAxis() pad2.RedrawAxis() cshape_datamc_all.SaveAs("%s/%s_datamc_all_%s.pdf" % (rootpath, shape, suffix)) + main() diff --git a/machine_learning_hep/plotting/plot_jetsubstructure_lite.py b/machine_learning_hep/plotting/plot_jetsubstructure_lite.py index f8a57f7e7b..f213f80e04 100644 --- a/machine_learning_hep/plotting/plot_jetsubstructure_lite.py +++ b/machine_learning_hep/plotting/plot_jetsubstructure_lite.py @@ -15,20 +15,38 @@ """ main script for doing final stage analysis """ + # pylint: disable=too-many-lines, line-too-long import argparse from array import array -from math import sqrt, floor, log10 +from math import floor, log10, sqrt + import yaml -# pylint: disable=import-error, no-name-in-module -from ROOT import TFile, TLatex, TLine, TGaxis, gROOT, gStyle, TCanvas, TGraphAsymmErrors, TGraphErrors, TGraph, TLegend -from machine_learning_hep.utilities import make_message_notfound -from machine_learning_hep.utilities import get_colour, get_marker, draw_latex, get_mean_uncertainty, get_mean_hist, get_mean_graph, format_value_with_unc -from machine_learning_hep.utilities import make_plot, get_y_window_his, get_y_window_gr, get_plot_range, divide_graphs, scale_graph, setup_legend -from machine_learning_hep.logger import get_logger +# pylint: disable=import-error, no-name-in-module +from ROOT import TCanvas, TFile, TGaxis, TGraph, TGraphAsymmErrors, TGraphErrors, TLatex, TLegend, TLine, gROOT, gStyle -def main(): # pylint: disable=too-many-locals, too-many-statements, too-many-branches +from machine_learning_hep.logger import get_logger +from machine_learning_hep.utilities import ( + divide_graphs, + draw_latex, + format_value_with_unc, + get_colour, + get_marker, + get_mean_graph, + get_mean_hist, + get_mean_uncertainty, + get_plot_range, + get_y_window_gr, + get_y_window_his, + make_message_notfound, + make_plot, + scale_graph, + setup_legend, +) + + +def main(): # pylint: disable=too-many-locals, too-many-statements, too-many-branches """ Main plotting function """ @@ -39,17 +57,16 @@ def main(): # pylint: disable=too-many-locals, 
too-many-statements, too-many-bra # pylint: disable=unused-variable parser = argparse.ArgumentParser() - parser.add_argument("--database-analysis", "-d", dest="database_analysis", - help="analysis database to be used", required=True) - parser.add_argument("--analysis", "-a", dest="type_ana", - help="choose type of analysis", required=True) - parser.add_argument("--input", "-i", dest="input_file", - help="results input file", required=True) + parser.add_argument( + "--database-analysis", "-d", dest="database_analysis", help="analysis database to be used", required=True + ) + parser.add_argument("--analysis", "-a", dest="type_ana", help="choose type of analysis", required=True) + parser.add_argument("--input", "-i", dest="input_file", help="results input file", required=True) args = parser.parse_args() typean = args.type_ana - shape = typean[len("jet_"):] + shape = typean[len("jet_") :] print("Shape:", shape) if shape != "zg": do_ivan = False @@ -78,39 +95,35 @@ def main(): # pylint: disable=too-many-locals, too-many-statements, too-many-bra var1ranges.append(lpt_finbinmax[-1]) # second variable (jet pt) - v_var2_binning = datap["analysis"][typean]["var_binning2"] # name + v_var2_binning = datap["analysis"][typean]["var_binning2"] # name lvar2_binmin_reco = datap["analysis"][typean].get("sel_binmin2_reco", None) lvar2_binmax_reco = datap["analysis"][typean].get("sel_binmax2_reco", None) - p_nbin2_reco = len(lvar2_binmin_reco) # number of reco bins + p_nbin2_reco = len(lvar2_binmin_reco) # number of reco bins lvar2_binmin_gen = datap["analysis"][typean].get("sel_binmin2_gen", None) lvar2_binmax_gen = datap["analysis"][typean].get("sel_binmax2_gen", None) - p_nbin2_gen = len(lvar2_binmin_gen) # number of gen bins + p_nbin2_gen = len(lvar2_binmin_gen) # number of gen bins var2ranges_reco = lvar2_binmin_reco.copy() var2ranges_reco.append(lvar2_binmax_reco[-1]) - var2binarray_reco = array("d", var2ranges_reco) # array of bin edges to use in histogram constructors + var2binarray_reco = array("d", var2ranges_reco) # array of bin edges to use in histogram constructors var2ranges_gen = lvar2_binmin_gen.copy() var2ranges_gen.append(lvar2_binmax_gen[-1]) - var2binarray_gen = array("d", var2ranges_gen) # array of bin edges to use in histogram constructors + var2binarray_gen = array("d", var2ranges_gen) # array of bin edges to use in histogram constructors # observable (z, shape,...) 
- v_varshape_binning = datap["analysis"][typean]["var_binningshape"] # name (reco) - v_varshape_binning_gen = datap["analysis"][typean]["var_binningshape_gen"] # name (gen) - lvarshape_binmin_reco = \ - datap["analysis"][typean].get("sel_binminshape_reco", None) - lvarshape_binmax_reco = \ - datap["analysis"][typean].get("sel_binmaxshape_reco", None) - p_nbinshape_reco = len(lvarshape_binmin_reco) # number of reco bins - lvarshape_binmin_gen = \ - datap["analysis"][typean].get("sel_binminshape_gen", None) - lvarshape_binmax_gen = \ - datap["analysis"][typean].get("sel_binmaxshape_gen", None) - p_nbinshape_gen = len(lvarshape_binmin_gen) # number of gen bins + v_varshape_binning = datap["analysis"][typean]["var_binningshape"] # name (reco) + v_varshape_binning_gen = datap["analysis"][typean]["var_binningshape_gen"] # name (gen) + lvarshape_binmin_reco = datap["analysis"][typean].get("sel_binminshape_reco", None) + lvarshape_binmax_reco = datap["analysis"][typean].get("sel_binmaxshape_reco", None) + p_nbinshape_reco = len(lvarshape_binmin_reco) # number of reco bins + lvarshape_binmin_gen = datap["analysis"][typean].get("sel_binminshape_gen", None) + lvarshape_binmax_gen = datap["analysis"][typean].get("sel_binmaxshape_gen", None) + p_nbinshape_gen = len(lvarshape_binmin_gen) # number of gen bins varshaperanges_reco = lvarshape_binmin_reco.copy() varshaperanges_reco.append(lvarshape_binmax_reco[-1]) - varshapebinarray_reco = array("d", varshaperanges_reco) # array of bin edges to use in histogram constructors + varshapebinarray_reco = array("d", varshaperanges_reco) # array of bin edges to use in histogram constructors varshaperanges_gen = lvarshape_binmin_gen.copy() varshaperanges_gen.append(lvarshape_binmax_gen[-1]) - varshapebinarray_gen = array("d", varshaperanges_gen) # array of bin edges to use in histogram constructors + varshapebinarray_gen = array("d", varshaperanges_gen) # array of bin edges to use in histogram constructors file_results = TFile.Open(file_in) if not file_results: @@ -139,9 +152,9 @@ def main(): # pylint: disable=too-many-locals, too-many-statements, too-many-bra hf_pythia_stat_zero = hf_pythia_stat.Clone(f"{hf_pythia_stat.GetName()}_zero") for i in range(hf_pythia_stat_zero.GetNbinsX()): hf_pythia_stat_zero.SetBinError(i + 1, 0) - gStyle.SetErrorX(0.5) # we have to restore the histogram bin width to propagate it to graph - hf_pythia_syst = TGraphAsymmErrors(hf_pythia_stat_zero) # convert histogram into a graph - gStyle.SetErrorX(0) # set back the intended settings + gStyle.SetErrorX(0.5) # we have to restore the histogram bin width to propagate it to graph + hf_pythia_syst = TGraphAsymmErrors(hf_pythia_stat_zero) # convert histogram into a graph + gStyle.SetErrorX(0) # set back the intended settings # HF POWHEG nameobj = "%s_hf_powheg_%d_stat" % (shape, ibin2) @@ -183,9 +196,9 @@ def main(): # pylint: disable=too-many-locals, too-many-statements, too-many-bra incl_pythia_stat_zero = incl_pythia_stat.Clone(f"{incl_pythia_stat.GetName()}_zero") for i in range(incl_pythia_stat_zero.GetNbinsX()): incl_pythia_stat_zero.SetBinError(i + 1, 0) - gStyle.SetErrorX(0.5) # we have to restore the histogram bin width to propagate it to graph - incl_pythia_syst = TGraphAsymmErrors(incl_pythia_stat_zero) # convert histogram into a graph - gStyle.SetErrorX(0) # set back the intended settings + gStyle.SetErrorX(0.5) # we have to restore the histogram bin width to propagate it to graph + incl_pythia_syst = TGraphAsymmErrors(incl_pythia_stat_zero) # convert histogram into a graph + 
gStyle.SetErrorX(0) # set back the intended settings if do_ivan: # inclusive Ivan @@ -210,9 +223,27 @@ def main(): # pylint: disable=too-many-locals, too-many-statements, too-many-bra sigma_z_var_stat = hist_means_stat.GetStdDev() mean_z_var_syst = hist_means_syst.GetMean() sigma_z_var_syst = hist_means_syst.GetStdDev() - make_plot(f"{shape}_means_hf_comb_{ibin2}", list_obj=[hist_means_comb], path=rootpath, suffix="pdf", title=f"HF mean variations comb {ibin2};{v_varshape_latex}") - make_plot(f"{shape}_means_hf_stat_{ibin2}", list_obj=[hist_means_stat], path=rootpath, suffix="pdf", title=f"HF mean variations stat {ibin2};{v_varshape_latex}") - make_plot(f"{shape}_means_hf_syst_{ibin2}", list_obj=[hist_means_syst], path=rootpath, suffix="pdf", title=f"HF mean variations syst {ibin2};{v_varshape_latex}") + make_plot( + f"{shape}_means_hf_comb_{ibin2}", + list_obj=[hist_means_comb], + path=rootpath, + suffix="pdf", + title=f"HF mean variations comb {ibin2};{v_varshape_latex}", + ) + make_plot( + f"{shape}_means_hf_stat_{ibin2}", + list_obj=[hist_means_stat], + path=rootpath, + suffix="pdf", + title=f"HF mean variations stat {ibin2};{v_varshape_latex}", + ) + make_plot( + f"{shape}_means_hf_syst_{ibin2}", + list_obj=[hist_means_syst], + path=rootpath, + suffix="pdf", + title=f"HF mean variations syst {ibin2};{v_varshape_latex}", + ) print(f"Mean HF {shape} = stat {mean_z_stat} syst {mean_z_syst} ROOT stat {hf_data_stat.GetMean()}") print(f"Mean HF {shape} = var comb {mean_z_var_comb} +- {sigma_z_var_comb}") print(f"Mean HF {shape} = var stat {mean_z_var_stat} +- {sigma_z_var_stat}") @@ -228,9 +259,27 @@ def main(): # pylint: disable=too-many-locals, too-many-statements, too-many-bra sigma_z_var_stat = hist_means_stat.GetStdDev() mean_z_var_syst = hist_means_syst.GetMean() sigma_z_var_syst = hist_means_syst.GetStdDev() - make_plot(f"{shape}_means_incl_comb_{ibin2}", list_obj=[hist_means_comb], path=rootpath, suffix="pdf", title=f"inclusive mean variations comb {ibin2};{v_varshape_latex}") - make_plot(f"{shape}_means_incl_stat_{ibin2}", list_obj=[hist_means_stat], path=rootpath, suffix="pdf", title=f"inclusive mean variations stat {ibin2};{v_varshape_latex}") - make_plot(f"{shape}_means_incl_syst_{ibin2}", list_obj=[hist_means_syst], path=rootpath, suffix="pdf", title=f"inclusive mean variations syst {ibin2};{v_varshape_latex}") + make_plot( + f"{shape}_means_incl_comb_{ibin2}", + list_obj=[hist_means_comb], + path=rootpath, + suffix="pdf", + title=f"inclusive mean variations comb {ibin2};{v_varshape_latex}", + ) + make_plot( + f"{shape}_means_incl_stat_{ibin2}", + list_obj=[hist_means_stat], + path=rootpath, + suffix="pdf", + title=f"inclusive mean variations stat {ibin2};{v_varshape_latex}", + ) + make_plot( + f"{shape}_means_incl_syst_{ibin2}", + list_obj=[hist_means_syst], + path=rootpath, + suffix="pdf", + title=f"inclusive mean variations syst {ibin2};{v_varshape_latex}", + ) print(f"Mean inclusive {shape} = stat {mean_z_stat} syst {mean_z_syst} ROOT stat {incl_data_stat.GetMean()}") print(f"Mean inclusive {shape} = var comb {mean_z_var_comb} +- {sigma_z_var_comb}") print(f"Mean inclusive {shape} = var stat {mean_z_var_stat} +- {sigma_z_var_stat}") @@ -244,19 +293,19 @@ def main(): # pylint: disable=too-many-locals, too-many-statements, too-many-bra offsets_axes_double = [0.8, 0.8] margins_can = [0.1, 0.13, 0.1, 0.03] margins_can_double = [0.1, 0.1, 0.1, 0.1] - margins_can_double = [0., 0., 0., 0.] 
+ margins_can_double = [0.0, 0.0, 0.0, 0.0] size_thg = 0.05 offset_thg = 0.85 - gStyle.SetErrorX(0) # do not plot horizontal error bars of histograms + gStyle.SetErrorX(0) # do not plot horizontal error bars of histograms fontsize = 0.06 - fontsize_glob = 0.032 # font size relative to the canvas height - scale_title = 1.3 # scaling factor to increase the size of axis titles + fontsize_glob = 0.032 # font size relative to the canvas height + scale_title = 1.3 # scaling factor to increase the size of axis titles tick_length = 0.02 opt_leg_g = "FP" opt_plot_g = "2" - list_new = [] # list to avoid loosing objects created in loops + list_new = [] # list to avoid losing objects created in loops # labels @@ -281,8 +330,17 @@ def main(): # pylint: disable=too-many-locals, too-many-statements, too-many-bra text_powheg = "POWHEG #plus PYTHIA 6" text_ivan = "SCET MLL" text_jets = "charged jets, anti-#it{k}_{T}, #it{R} = 0.4" - text_ptjet = "%g #leq %s < %g GeV/#it{c}, |#it{#eta}_{jet ch}| #leq 0.5" % (lvar2_binmin_reco[ibin2], p_latexbin2var, lvar2_binmax_reco[ibin2]) - text_pth = "%g #leq #it{p}_{T}^{%s} < %g GeV/#it{c}, |#it{y}_{%s}| #leq 0.8" % (lpt_finbinmin[0], p_latexnhadron, min(lpt_finbinmax[-1], lvar2_binmax_reco[ibin2]), p_latexnhadron) + text_ptjet = "%g #leq %s < %g GeV/#it{c}, |#it{#eta}_{jet ch}| #leq 0.5" % ( + lvar2_binmin_reco[ibin2], + p_latexbin2var, + lvar2_binmax_reco[ibin2], + ) + text_pth = "%g #leq #it{p}_{T}^{%s} < %g GeV/#it{c}, |#it{y}_{%s}| #leq 0.8" % ( + lpt_finbinmin[0], + p_latexnhadron, + min(lpt_finbinmax[-1], lvar2_binmax_reco[ibin2]), + p_latexnhadron, + ) text_ptcut = "#it{p}_{T, incl. ch. jet}^{leading track} #geq 5.33 GeV/#it{c}" text_ptcut_sim = "#it{p}_{T, incl. ch. jet}^{leading h^{#pm}} #geq 5.33 GeV/#it{c} (varied)" text_sd = "Soft Drop (#it{z}_{cut} = 0.1, #it{#beta} = 0)" @@ -323,17 +381,28 @@ def main(): # pylint: disable=too-many-locals, too-many-statements, too-many-bra hf_data_syst_cl = hf_data_syst.Clone() - leg_pos = [.72, .75, .85, .85] + leg_pos = [0.72, 0.75, 0.85, 0.85] list_obj = [hf_data_syst, incl_data_syst, hf_data_stat, incl_data_stat] labels_obj = ["%s-tagged" % p_latexnhadron, "inclusive", "", ""] colours = [get_colour(i, j) for i, j in zip((c_hf_data, c_incl_data, c_hf_data, c_incl_data), (2, 2, 1, 1))] markers = [m_hf_data, m_incl_data, m_hf_data, m_incl_data] y_margin_up = 0.46 y_margin_down = 0.05 - cshape_data, list_obj_data_new = make_plot("cshape_data_" + suffix, size=size_can, \ - list_obj=list_obj, labels_obj=labels_obj, opt_leg_g=opt_leg_g, opt_plot_g=opt_plot_g, offsets_xy=offsets_axes, \ - colours=colours, markers=markers, leg_pos=leg_pos, margins_y=[y_margin_down, y_margin_up], margins_c=margins_can, \ - title=title_full) + cshape_data, list_obj_data_new = make_plot( + "cshape_data_" + suffix, + size=size_can, + list_obj=list_obj, + labels_obj=labels_obj, + opt_leg_g=opt_leg_g, + opt_plot_g=opt_plot_g, + offsets_xy=offsets_axes, + colours=colours, + markers=markers, + leg_pos=leg_pos, + margins_y=[y_margin_down, y_margin_up], + margins_c=margins_can, + title=title_full, + ) for gr, c in zip((hf_data_syst, incl_data_syst), (c_hf_data, c_incl_data)): gr.SetMarkerColor(get_colour(c)) list_obj_data_new[0].SetTextSize(fontsize) @@ -388,25 +457,49 @@ def main(): # pylint: disable=too-many-locals, too-many-statements, too-many-bra err_tot = sqrt(err_stat * err_stat + err_syst * err_syst) hf_data_stat.SetBinContent(i + 1, abs(diff) / err_tot) hf_data_stat.SetBinError(i + 1, 0) - can_compare_data, list_obj_data_new =
make_plot("cshape_data_compare_" + suffix, size=size_can, \ - list_obj=[hf_data_stat], labels_obj=labels_obj, opt_leg_g=opt_leg_g, opt_plot_g=opt_plot_g, offsets_xy=offsets_axes, \ - colours=colours, markers=markers, leg_pos=leg_pos, range_y=[0, 5], margins_c=margins_can, \ - title=title_full) + can_compare_data, list_obj_data_new = make_plot( + "cshape_data_compare_" + suffix, + size=size_can, + list_obj=[hf_data_stat], + labels_obj=labels_obj, + opt_leg_g=opt_leg_g, + opt_plot_g=opt_plot_g, + offsets_xy=offsets_axes, + colours=colours, + markers=markers, + leg_pos=leg_pos, + range_y=[0, 5], + margins_c=margins_can, + title=title_full, + ) can_compare_data.SaveAs("%s/%s_data_compare_%s.pdf" % (rootpath, shape, suffix)) # data and PYTHIA, POWHEG, Ivan, HF - leg_pos = [.72, .65, .85, .85] + leg_pos = [0.72, 0.65, 0.85, 0.85] list_obj = [hf_data_syst_cl, hf_powheg_syst, hf_data_stat, hf_pythia_stat, hf_powheg_stat] labels_obj = ["data", text_powheg, "", text_pythia_split, "", ""] - colours = [get_colour(i, j) for i, j in zip((c_hf_data, c_hf_powheg, c_hf_data, c_hf_pythia, c_hf_powheg), (2, 2, 1, 1, 1))] + colours = [ + get_colour(i, j) for i, j in zip((c_hf_data, c_hf_powheg, c_hf_data, c_hf_pythia, c_hf_powheg), (2, 2, 1, 1, 1)) + ] markers = [m_hf_data, m_hf_powheg, m_hf_data, m_hf_pythia, m_hf_powheg] y_margin_up = 0.4 y_margin_down = 0.05 - cshape_data_mc_hf, list_obj_data_mc_hf_new = make_plot("cshape_data_mc_hf_" + suffix, size=size_can, \ - list_obj=list_obj, labels_obj=labels_obj, opt_leg_g=opt_leg_g, opt_plot_g=opt_plot_g, offsets_xy=offsets_axes, \ - colours=colours, markers=markers, leg_pos=leg_pos, margins_y=[y_margin_down, y_margin_up], margins_c=margins_can, \ - title=title_full) + cshape_data_mc_hf, list_obj_data_mc_hf_new = make_plot( + "cshape_data_mc_hf_" + suffix, + size=size_can, + list_obj=list_obj, + labels_obj=labels_obj, + opt_leg_g=opt_leg_g, + opt_plot_g=opt_plot_g, + offsets_xy=offsets_axes, + colours=colours, + markers=markers, + leg_pos=leg_pos, + margins_y=[y_margin_down, y_margin_up], + margins_c=margins_can, + title=title_full, + ) for gr, c in zip([hf_data_syst_cl, hf_powheg_syst], [c_hf_data, c_hf_powheg]): gr.SetMarkerColor(get_colour(c)) leg_data_mc_hf = list_obj_data_mc_hf_new[0] @@ -414,13 +507,13 @@ def main(): # pylint: disable=too-many-locals, too-many-statements, too-many-bra leg_data_mc_hf.SetTextSize(fontsize) if shape == "nsd": hf_data_syst_cl.GetXaxis().SetNdivisions(5) - #axis_nsd = hf_data_syst_cl.GetHistogram().GetXaxis() - #x1 = axis_nsd.GetBinLowEdge(1) - #x2 = axis_nsd.GetBinUpEdge(axis_nsd.GetNbins()) - #axis_nsd.Set(5, x1, x2) - #for ibin in range(axis_nsd.GetNbins()): + # axis_nsd = hf_data_syst_cl.GetHistogram().GetXaxis() + # x1 = axis_nsd.GetBinLowEdge(1) + # x2 = axis_nsd.GetBinUpEdge(axis_nsd.GetNbins()) + # axis_nsd.Set(5, x1, x2) + # for ibin in range(axis_nsd.GetNbins()): # axis_nsd.SetBinLabel(ibin + 1, "%d" % ibin) - #axis_nsd.SetNdivisions(5) + # axis_nsd.SetNdivisions(5) cshape_data_mc_hf.Update() if shape == "rg": # plot the theta_g axis @@ -453,17 +546,28 @@ def main(): # pylint: disable=too-many-locals, too-many-statements, too-many-bra # data and PYTHIA, inclusive - #leg_pos = [.68, .65, .85, .85] + # leg_pos = [.68, .65, .85, .85] list_obj = [incl_data_syst, incl_pythia_syst, incl_data_stat, incl_pythia_stat] labels_obj = ["data", text_pythia_split] colours = [get_colour(i, j) for i, j in zip((c_incl_data, c_incl_pythia, c_incl_data, c_incl_pythia), (2, 2, 1, 1))] markers = [m_incl_data, m_incl_pythia, m_incl_data, 
m_incl_pythia] y_margin_up = 0.4 y_margin_down = 0.05 - cshape_data_mc_incl, list_obj_data_mc_incl_new = make_plot("cshape_data_mc_incl_" + suffix, size=size_can, \ - list_obj=list_obj, labels_obj=labels_obj, opt_leg_g=opt_leg_g, opt_plot_g=opt_plot_g, offsets_xy=offsets_axes, \ - colours=colours, markers=markers, leg_pos=leg_pos, margins_y=[y_margin_down, y_margin_up], margins_c=margins_can, \ - title=title_full) + cshape_data_mc_incl, list_obj_data_mc_incl_new = make_plot( + "cshape_data_mc_incl_" + suffix, + size=size_can, + list_obj=list_obj, + labels_obj=labels_obj, + opt_leg_g=opt_leg_g, + opt_plot_g=opt_plot_g, + offsets_xy=offsets_axes, + colours=colours, + markers=markers, + leg_pos=leg_pos, + margins_y=[y_margin_down, y_margin_up], + margins_c=margins_can, + title=title_full, + ) for gr, c in zip([incl_data_syst, incl_pythia_syst], [c_incl_data, c_incl_pythia]): gr.SetMarkerColor(get_colour(c)) leg_data_mc_incl = list_obj_data_mc_incl_new[0] @@ -520,18 +624,29 @@ def main(): # pylint: disable=too-many-locals, too-many-statements, too-many-bra y_margin_down = 0.05 y_min_plot, y_max_plot = get_plot_range(y_min, y_max, y_margin_down, y_margin_up) - #leg_pos = [.6, .65, .75, .85] - leg_pos = [.72, .55, .85, .85] + # leg_pos = [.6, .65, .75, .85] + leg_pos = [0.72, 0.55, 0.85, 0.85] list_obj = [hf_pythia_syst, incl_pythia_syst, hf_pythia_stat, incl_pythia_stat] labels_obj = ["%s-tagged" % p_latexnhadron, "inclusive"] colours = [get_colour(i, j) for i, j in zip((c_hf_pythia, c_incl_pythia, c_hf_pythia, c_incl_pythia), (2, 2, 1, 1))] markers = [m_hf_pythia, m_incl_pythia, m_hf_pythia, m_incl_pythia] y_margin_up = 0.46 y_margin_down = 0.05 - cshape_mc, list_obj_mc_new = make_plot("cshape_mc_" + suffix, size=size_can, \ - list_obj=list_obj, labels_obj=labels_obj, opt_leg_g=opt_leg_g, opt_plot_g=opt_plot_g, offsets_xy=offsets_axes, \ - colours=colours, markers=markers, leg_pos=leg_pos, range_y=[y_min_plot, y_max_plot], margins_c=margins_can, \ - title=title_full) + cshape_mc, list_obj_mc_new = make_plot( + "cshape_mc_" + suffix, + size=size_can, + list_obj=list_obj, + labels_obj=labels_obj, + opt_leg_g=opt_leg_g, + opt_plot_g=opt_plot_g, + offsets_xy=offsets_axes, + colours=colours, + markers=markers, + leg_pos=leg_pos, + range_y=[y_min_plot, y_max_plot], + margins_c=margins_can, + title=title_full, + ) cshape_mc.Update() for gr, c in zip((hf_pythia_syst, incl_pythia_syst), (c_hf_pythia, c_incl_pythia)): gr.SetMarkerColor(get_colour(c)) @@ -572,18 +687,29 @@ def main(): # pylint: disable=too-many-locals, too-many-statements, too-many-bra # PYTHIA, HF, inclusive - #leg_pos = [.6, .65, .75, .85] - leg_pos = [.72, .67, .85, .85] + # leg_pos = [.6, .65, .75, .85] + leg_pos = [0.72, 0.67, 0.85, 0.85] list_obj = [incl_pythia_syst_cl, incl_pythia_stat, hf_pythia_stat] labels_obj = ["inclusive", "", "%s-tagged" % p_latexnhadron] colours = [get_colour(i, j) for i, j in zip((c_incl_pythia, c_incl_pythia, c_hf_pythia), (2, 1, 1))] markers = [m_incl_pythia, m_incl_pythia, m_hf_pythia] y_margin_up = 0.46 y_margin_down = 0.05 - cshape_mc, list_obj_mc_new = make_plot("cshape_mc_id_" + suffix, size=size_can, \ - list_obj=list_obj, labels_obj=labels_obj, opt_leg_g=opt_leg_g, opt_plot_g=opt_plot_g, offsets_xy=offsets_axes, \ - colours=colours, markers=markers, leg_pos=leg_pos, range_y=[y_min_plot, y_max_plot], margins_c=margins_can, \ - title=title_full) + cshape_mc, list_obj_mc_new = make_plot( + "cshape_mc_id_" + suffix, + size=size_can, + list_obj=list_obj, + labels_obj=labels_obj, + opt_leg_g=opt_leg_g, 
+ opt_plot_g=opt_plot_g, + offsets_xy=offsets_axes, + colours=colours, + markers=markers, + leg_pos=leg_pos, + range_y=[y_min_plot, y_max_plot], + margins_c=margins_can, + title=title_full, + ) # Draw a line through the points. if shape == "nsd": for h in (incl_pythia_stat, hf_pythia_stat): @@ -631,7 +757,7 @@ def main(): # pylint: disable=too-many-locals, too-many-statements, too-many-bra # data + MC/data, HF and inclusive # print values - n_sig = 2 # number of significant figures of the errors + n_sig = 2 # number of significant figures of the errors for name, his, gr in zip(("HF", "inclusive"), (hf_data_stat, incl_data_stat), (hf_data_syst, incl_data_syst)): print(f"Data points for {name} {shape}") for i in range(gr.GetN()): @@ -649,8 +775,8 @@ def main(): # pylint: disable=too-many-locals, too-many-statements, too-many-bra print(f"Rel. syst. unc. for {name} {shape}") e_plus_min = float("inf") e_minus_min = float("inf") - e_plus_max = 0. - e_minus_max = 0. + e_plus_max = 0.0 + e_minus_max = 0.0 for i in range(gr.GetN()): # skip untagged bin for zg and rg if i == 0 and shape in ("zg", "rg"): @@ -670,18 +796,22 @@ def main(): # pylint: disable=too-many-locals, too-many-statements, too-many-bra print(f"Absolutes: min: {min(e_plus_min, e_minus_min):.2g} %, max {max(e_plus_max, e_minus_max):.2g} %") # explicit y ranges [zg, rg, nsd] - list_range_y = [[0.01, 9], [0.01, 6.1], [0.001, 0.78]] # data - list_range_y_rat = [[0.55, 2.99], [0.7, 1.9], [0.1, 2.2]] # mc/data ratios - list_range_x = [[0.1, 0.5], [0, 0.4], [-0.5, 4.5]] # data and mc/data ratios - list_xy_sd = [[x_latex + 0.45, y_latex_top - 4 * y_step], [x_latex + 0.45, y_latex_top - 7 * y_step], [x_latex + 0.45, y_latex_top - 3 * y_step]] # position of the SD legend + list_range_y = [[0.01, 9], [0.01, 6.1], [0.001, 0.78]] # data + list_range_y_rat = [[0.55, 2.99], [0.7, 1.9], [0.1, 2.2]] # mc/data ratios + list_range_x = [[0.1, 0.5], [0, 0.4], [-0.5, 4.5]] # data and mc/data ratios + list_xy_sd = [ + [x_latex + 0.45, y_latex_top - 4 * y_step], + [x_latex + 0.45, y_latex_top - 7 * y_step], + [x_latex + 0.45, y_latex_top - 3 * y_step], + ] # position of the SD legend i_shape = 0 if shape == "zg" else 1 if shape == "rg" else 2 print(f"Index {i_shape}") # data # leg_pos = [.7, .75, .82, .85] # leg_pos = [.65, .63, .82, .78] - leg_pos = [.7, .63, .87, .78] - leg_pos = [.7, .55, .87, .78] + leg_pos = [0.7, 0.63, 0.87, 0.78] + leg_pos = [0.7, 0.55, 0.87, 0.78] fraction_untagged_hf = hf_data_stat.Integral(1, 1, "width") fraction_untagged_incl = incl_data_stat.Integral(1, 1, "width") # hard-coded to values to unify them across zg, rg, nsd @@ -703,28 +833,32 @@ def main(): # pylint: disable=too-many-locals, too-many-statements, too-many-bra pad1 = cshape_datamc_all.cd(1) pad2 = cshape_datamc_all.cd(2) pad3 = cshape_datamc_all.cd(3) - panel_top = 0.5 # height of the top panel (length of y axis) relative to the canvas height - margin_top = 0.08 # height of the top margin relative to the canvas height - margin_bottom = 0.08 # height of the bottom margin relative to the canvas height - panel_bottom = 0.5 * (1 - panel_top - margin_top - margin_bottom) # height of the bottom panel (length of y axis) relative to the canvas height - margin_top_rel = margin_top / (margin_top + panel_top) # height of the top margin relative to the top pad height - margin_bottom_rel = margin_bottom / (margin_bottom + panel_bottom) # height of the bottom margin relative to the bottom pad height + panel_top = 0.5 # height of the top panel (length of y axis) relative to the 
canvas height + margin_top = 0.08 # height of the top margin relative to the canvas height + margin_bottom = 0.08 # height of the bottom margin relative to the canvas height + panel_bottom = 0.5 * ( + 1 - panel_top - margin_top - margin_bottom + ) # height of the bottom panel (length of y axis) relative to the canvas height + margin_top_rel = margin_top / (margin_top + panel_top) # height of the top margin relative to the top pad height + margin_bottom_rel = margin_bottom / ( + margin_bottom + panel_bottom + ) # height of the bottom margin relative to the bottom pad height margin_left_rel = 0.12 margin_right_rel = 0.05 - y_min_1 = 1 - margin_top - panel_top # minimum y of the top pad (1) - y_min_2 = margin_bottom + panel_bottom # minimum y of the middle pad (2) - h_pad1 = panel_top + margin_top # height of pad 1 - h_pad2 = panel_bottom # height of pad 2 - h_pad3 = panel_bottom + margin_bottom # height of pad 3 - pad1.SetPad(0., y_min_1, 1, 1) - pad2.SetPad(0., y_min_2, 1, y_min_1) - pad3.SetPad(0., 0., 1, y_min_2) - pad1.SetBottomMargin(0.) - pad2.SetBottomMargin(0.) + y_min_1 = 1 - margin_top - panel_top # minimum y of the top pad (1) + y_min_2 = margin_bottom + panel_bottom # minimum y of the middle pad (2) + h_pad1 = panel_top + margin_top # height of pad 1 + h_pad2 = panel_bottom # height of pad 2 + h_pad3 = panel_bottom + margin_bottom # height of pad 3 + pad1.SetPad(0.0, y_min_1, 1, 1) + pad2.SetPad(0.0, y_min_2, 1, y_min_1) + pad3.SetPad(0.0, 0.0, 1, y_min_2) + pad1.SetBottomMargin(0.0) + pad2.SetBottomMargin(0.0) pad3.SetBottomMargin(margin_bottom_rel) pad1.SetTopMargin(margin_top_rel) - pad2.SetTopMargin(0.) - pad3.SetTopMargin(0.) + pad2.SetTopMargin(0.0) + pad3.SetTopMargin(0.0) pad1.SetLeftMargin(margin_left_rel) pad2.SetLeftMargin(margin_left_rel) pad3.SetLeftMargin(margin_left_rel) @@ -734,14 +868,25 @@ def main(): # pylint: disable=too-many-locals, too-many-statements, too-many-bra pad1.SetTicks(1, 1) pad2.SetTicks(1, 1) pad3.SetTicks(1, 1) - cshape_datamc_all, list_obj_data_new = make_plot("cshape_datamc_" + suffix, size=size_can_double, \ - can=cshape_datamc_all, pad=1, \ - list_obj=list_obj, labels_obj=labels_obj, opt_leg_g=opt_leg_g, opt_plot_g=opt_plot_g, offsets_xy=[0.8, 1.1], \ - colours=colours, markers=markers, leg_pos=leg_pos, margins_c=margins_can_double, \ - range_x=list_range_x[i_shape], \ + cshape_datamc_all, list_obj_data_new = make_plot( + "cshape_datamc_" + suffix, + size=size_can_double, + can=cshape_datamc_all, + pad=1, + list_obj=list_obj, + labels_obj=labels_obj, + opt_leg_g=opt_leg_g, + opt_plot_g=opt_plot_g, + offsets_xy=[0.8, 1.1], + colours=colours, + markers=markers, + leg_pos=leg_pos, + margins_c=margins_can_double, + range_x=list_range_x[i_shape], # margins_y=[y_margin_down, y_margin_up], \ - range_y=list_range_y[i_shape], \ - title=title_full) + range_y=list_range_y[i_shape], + title=title_full, + ) for gr, c in zip((hf_data_syst, incl_data_syst), (c_hf_data, c_incl_data)): gr.SetMarkerColor(get_colour(c)) list_obj_data_new[0].SetTextSize(fontsize_glob / h_pad1) @@ -783,14 +928,18 @@ def main(): # pylint: disable=too-many-locals, too-many-statements, too-many-bra y_latex = y_latex_top list_latex_data = [] # for text_latex in [text_alice, text_jets, text_ptjet, text_pth, text_ptcut, text_sd]: - for text_latex in [text_alice, text_jets, text_ptjet, text_pth, text_sd]: # w/o text_ptcut + for text_latex in [text_alice, text_jets, text_ptjet, text_pth, text_sd]: # w/o text_ptcut latex = TLatex(x_latex, y_latex, text_latex) 
list_latex_data.append(latex) draw_latex(latex, textsize=(fontsize_glob / h_pad1)) y_latex -= y_step y_latex = list_xy_sd[i_shape][1] if shape != "nsd": - for text_latex in ["SD-untagged jets", f"{p_latexnhadron}-tagged: {100 * fraction_untagged_hf_text:.2g}%", f"inclusive: {100 * fraction_untagged_incl_text:.2g}%"]: + for text_latex in [ + "SD-untagged jets", + f"{p_latexnhadron}-tagged: {100 * fraction_untagged_hf_text:.2g}%", + f"inclusive: {100 * fraction_untagged_incl_text:.2g}%", + ]: latex = TLatex(list_xy_sd[i_shape][0], y_latex, text_latex) list_latex_data.append(latex) draw_latex(latex, textsize=(fontsize_glob / h_pad1)) @@ -802,7 +951,7 @@ def main(): # pylint: disable=too-many-locals, too-many-statements, too-many-bra line_1.SetLineStyle(9) line_1.SetLineColor(1) line_1.SetLineWidth(3) - leg_pos = [.15, .55, .4, .85] + leg_pos = [0.15, 0.55, 0.4, 0.85] hf_ratio_powheg_stat = hf_powheg_stat.Clone(f"{hf_powheg_stat.GetName()}_rat") hf_ratio_powheg_stat.Divide(hf_data_stat) hf_ratio_powheg_syst = divide_graphs(hf_powheg_syst, hf_data_syst) @@ -820,23 +969,34 @@ def main(): # pylint: disable=too-many-locals, too-many-statements, too-many-bra # incl_ratio_pythia_syst = divide_graphs(incl_data_syst, incl_pythia_syst) # version data/MC if shape != "nsd": for gr in (incl_ratio_pythia_syst, hf_ratio_pythia_syst, hf_ratio_powheg_syst): - gr.SetPointY(0, 1.) + gr.SetPointY(0, 1.0) for his in (incl_ratio_pythia_stat, hf_ratio_pythia_stat, hf_ratio_powheg_stat): - his.SetBinContent(1, 1.) + his.SetBinContent(1, 1.0) list_obj = [hf_ratio_powheg_syst, hf_ratio_pythia_syst, hf_ratio_powheg_stat, hf_ratio_pythia_stat, line_1] labels_obj = [f"{p_latexnhadron}-tagged {text_powheg}", f"{p_latexnhadron}-tagged {text_pythia_short}", "", ""] colours = [get_colour(i, j) for i, j in zip((c_hf_powheg, c_hf_pythia, c_hf_powheg, c_hf_pythia), (2, 2, 1, 1))] markers = [m_hf_powheg, m_hf_pythia, m_hf_powheg, m_hf_pythia] y_margin_up = 0.29 y_margin_down = 0.05 - cshape_datamc_all, list_obj_data_mc_hf_new = make_plot("cshape_data_mc_hf_" + suffix, size=size_can_double, \ - can=cshape_datamc_all, pad=2, \ - list_obj=list_obj, labels_obj=labels_obj, opt_leg_g=opt_leg_g, opt_plot_g=opt_plot_g, offsets_xy=[1, 1.3 * 3/7], \ - colours=colours, markers=markers, leg_pos=leg_pos, margins_c=margins_can_double, \ - range_x=list_range_x[i_shape], \ + cshape_datamc_all, list_obj_data_mc_hf_new = make_plot( + "cshape_data_mc_hf_" + suffix, + size=size_can_double, + can=cshape_datamc_all, + pad=2, + list_obj=list_obj, + labels_obj=labels_obj, + opt_leg_g=opt_leg_g, + opt_plot_g=opt_plot_g, + offsets_xy=[1, 1.3 * 3 / 7], + colours=colours, + markers=markers, + leg_pos=leg_pos, + margins_c=margins_can_double, + range_x=list_range_x[i_shape], # margins_y=[y_margin_down, y_margin_up], \ - range_y=list_range_y_rat[i_shape], \ - title=title_full_ratio_double) + range_y=list_range_y_rat[i_shape], + title=title_full_ratio_double, + ) list_obj[0].GetXaxis().SetLabelSize(0.1) list_obj[0].GetXaxis().SetTitleSize(0.1) list_obj[0].GetYaxis().SetLabelSize(fontsize_glob / h_pad2) @@ -847,7 +1007,7 @@ def main(): # pylint: disable=too-many-locals, too-many-statements, too-many-bra for gr, c in zip([hf_ratio_powheg_syst, hf_ratio_pythia_syst], [c_hf_powheg, c_hf_pythia]): gr.SetMarkerColor(get_colour(c)) leg_data_mc_hf = list_obj_data_mc_hf_new[0] - #leg_data_mc_hf.SetHeader("%s-tagged" % p_latexnhadron) + # leg_data_mc_hf.SetHeader("%s-tagged" % p_latexnhadron) leg_data_mc_hf.SetTextSize(fontsize_glob / h_pad2) # 
leg_data_mc_hf.SetNColumns(2) if shape == "nsd": @@ -855,21 +1015,32 @@ def main(): # pylint: disable=too-many-locals, too-many-statements, too-many-bra cshape_datamc_all.Update() # inclusive MC/data - leg_pos = [.15, .8, .8, .9] + leg_pos = [0.15, 0.8, 0.8, 0.9] list_obj = [incl_ratio_pythia_syst, incl_ratio_pythia_stat, line_1] labels_obj = [f"inclusive {text_pythia_short}", ""] colours = [get_colour(i, j) for i, j in zip((c_incl_pythia, c_incl_pythia), (2, 1))] markers = [m_incl_pythia, m_incl_pythia] y_margin_up = 0.3 y_margin_down = 0.05 - cshape_datamc_all, list_obj_data_mc_hf_new_2 = make_plot("cshape_data_mc_hf_" + suffix, size=size_can_double, \ - can=cshape_datamc_all, pad=3, \ - list_obj=list_obj, labels_obj=labels_obj, opt_leg_g=opt_leg_g, opt_plot_g=opt_plot_g, offsets_xy=[1, 1.3 * 3/7], \ - colours=colours, markers=markers, leg_pos=leg_pos, margins_c=margins_can_double, \ - range_x=list_range_x[i_shape], \ - margins_y=[y_margin_down, y_margin_up], \ + cshape_datamc_all, list_obj_data_mc_hf_new_2 = make_plot( + "cshape_data_mc_hf_" + suffix, + size=size_can_double, + can=cshape_datamc_all, + pad=3, + list_obj=list_obj, + labels_obj=labels_obj, + opt_leg_g=opt_leg_g, + opt_plot_g=opt_plot_g, + offsets_xy=[1, 1.3 * 3 / 7], + colours=colours, + markers=markers, + leg_pos=leg_pos, + margins_c=margins_can_double, + range_x=list_range_x[i_shape], + margins_y=[y_margin_down, y_margin_up], # range_y=list_range_y_rat[i_shape], \ - title=title_full_ratio_double) + title=title_full_ratio_double, + ) list_obj[0].GetXaxis().SetLabelSize(fontsize_glob / h_pad3) list_obj[0].GetXaxis().SetTitleSize(scale_title * fontsize_glob / h_pad3) list_obj[0].GetXaxis().SetTitleOffset(0.8) @@ -880,7 +1051,7 @@ def main(): # pylint: disable=too-many-locals, too-many-statements, too-many-bra for gr, c in zip([incl_ratio_pythia_syst], [c_incl_pythia]): gr.SetMarkerColor(get_colour(c)) leg_data_mc_hf = list_obj_data_mc_hf_new_2[0] - #leg_data_mc_hf.SetHeader("%s-tagged" % p_latexnhadron) + # leg_data_mc_hf.SetHeader("%s-tagged" % p_latexnhadron) leg_data_mc_hf.SetTextSize(fontsize_glob / h_pad3) leg_data_mc_hf.SetNColumns(2) if shape == "nsd": @@ -888,14 +1059,14 @@ def main(): # pylint: disable=too-many-locals, too-many-statements, too-many-bra cshape_datamc_all.Update() # Draw LaTeX - #y_latex = y_latex_top - #list_latex_data_mc_hf = [] - #for text_latex in [text_alice, text_jets, text_ptjet, text_pth, text_sd]: + # y_latex = y_latex_top + # list_latex_data_mc_hf = [] + # for text_latex in [text_alice, text_jets, text_ptjet, text_pth, text_sd]: # latex = TLatex(x_latex, y_latex, text_latex) # list_latex_data_mc_hf.append(latex) # draw_latex(latex, textsize=fontsize) # y_latex -= y_step - #cshape_datamc_all.Update() + # cshape_datamc_all.Update() pad1.RedrawAxis() pad2.RedrawAxis() pad3.RedrawAxis() @@ -907,22 +1078,37 @@ def main(): # pylint: disable=too-many-locals, too-many-statements, too-many-bra # Normalise ignoring the untagged bin if shape in ("zg", "rg"): int = 1 - fraction_untagged_hf - hf_data_stat.Scale(1. / int) - scale_graph(hf_data_syst, 1. / int) + hf_data_stat.Scale(1.0 / int) + scale_graph(hf_data_syst, 1.0 / int) int = 1 - fraction_untagged_incl - incl_data_stat.Scale(1. / int) - scale_graph(incl_data_syst, 1. 
/ int) + incl_data_stat.Scale(1.0 / int) + scale_graph(incl_data_syst, 1.0 / int) # data # leg_pos = [.65, .6, .82, .8] - leg_pos = [.7, .55, .87, .78] + leg_pos = [0.7, 0.55, 0.87, 0.78] hf_ivan_syst_plot = hf_ivan_syst.Clone(f"{hf_ivan_syst.GetName()}_plot") - hf_ivan_syst_plot.RemovePoint(0) # delete the untagged bin point + hf_ivan_syst_plot.RemovePoint(0) # delete the untagged bin point incl_ivan_syst_plot = incl_ivan_syst.Clone(f"{incl_ivan_syst.GetName()}_plot") - incl_ivan_syst_plot.RemovePoint(0) # delete the untagged bin point - list_obj = [hf_data_syst, incl_data_syst, hf_ivan_syst_plot, incl_ivan_syst_plot, hf_data_stat, incl_data_stat, hf_ivan_stat, incl_ivan_stat] + incl_ivan_syst_plot.RemovePoint(0) # delete the untagged bin point + list_obj = [ + hf_data_syst, + incl_data_syst, + hf_ivan_syst_plot, + incl_ivan_syst_plot, + hf_data_stat, + incl_data_stat, + hf_ivan_stat, + incl_ivan_stat, + ] labels_obj = [f"{p_latexnhadron}-tagged", "inclusive", "", "", "", "", "", ""] labels_obj = ["", "", "", "", f"{p_latexnhadron}-tagged", "inclusive", "", ""] - colours = [get_colour(i, j) for i, j in zip((c_hf_data, c_incl_data, c_hf_ivan, c_incl_ivan, c_hf_data, c_incl_data, c_hf_ivan, c_incl_ivan), (2, 2, 2, 2, 1, 1, 1, 1))] + colours = [ + get_colour(i, j) + for i, j in zip( + (c_hf_data, c_incl_data, c_hf_ivan, c_incl_ivan, c_hf_data, c_incl_data, c_hf_ivan, c_incl_ivan), + (2, 2, 2, 2, 1, 1, 1, 1), + ) + ] markers = [m_hf_data, m_incl_data, m_hf_ivan, m_incl_ivan, m_hf_data, m_incl_data, m_hf_ivan, m_incl_ivan] y_margin_up = 0.5 y_margin_down = 0.05 @@ -931,15 +1117,15 @@ def main(): # pylint: disable=too-many-locals, too-many-statements, too-many-bra pad1 = cshape_datamc_ivan.cd(1) pad2 = cshape_datamc_ivan.cd(2) pad3 = cshape_datamc_ivan.cd(3) - pad1.SetPad(0., y_min_1, 1, 1) - pad2.SetPad(0., y_min_2, 1, y_min_1) - pad3.SetPad(0., 0., 1, y_min_2) - pad1.SetBottomMargin(0.) - pad2.SetBottomMargin(0.) + pad1.SetPad(0.0, y_min_1, 1, 1) + pad2.SetPad(0.0, y_min_2, 1, y_min_1) + pad3.SetPad(0.0, 0.0, 1, y_min_2) + pad1.SetBottomMargin(0.0) + pad2.SetBottomMargin(0.0) pad3.SetBottomMargin(margin_bottom_rel) pad1.SetTopMargin(margin_top_rel) - pad2.SetTopMargin(0.) - pad3.SetTopMargin(0.) 
+ pad2.SetTopMargin(0.0) + pad3.SetTopMargin(0.0) pad1.SetLeftMargin(margin_left_rel) pad2.SetLeftMargin(margin_left_rel) pad3.SetLeftMargin(margin_left_rel) @@ -949,22 +1135,36 @@ def main(): # pylint: disable=too-many-locals, too-many-statements, too-many-bra pad1.SetTicks(1, 1) pad2.SetTicks(1, 1) pad3.SetTicks(1, 1) - cshape_datamc_ivan, list_obj_data_new = make_plot("cshape_datamc_" + suffix, size=size_can_double, \ - can=cshape_datamc_ivan, pad=1, \ - list_obj=list_obj, labels_obj=labels_obj, opt_leg_g=opt_leg_g, opt_plot_g=[opt_plot_g, opt_plot_g, "3", "3"], offsets_xy=[0.8, 1.1], \ - colours=colours, markers=markers, leg_pos=leg_pos, margins_c=margins_can_double, \ - range_x=list_range_x[i_shape], \ - margins_y=[y_margin_down, y_margin_up], \ + cshape_datamc_ivan, list_obj_data_new = make_plot( + "cshape_datamc_" + suffix, + size=size_can_double, + can=cshape_datamc_ivan, + pad=1, + list_obj=list_obj, + labels_obj=labels_obj, + opt_leg_g=opt_leg_g, + opt_plot_g=[opt_plot_g, opt_plot_g, "3", "3"], + offsets_xy=[0.8, 1.1], + colours=colours, + markers=markers, + leg_pos=leg_pos, + margins_c=margins_can_double, + range_x=list_range_x[i_shape], + margins_y=[y_margin_down, y_margin_up], # range_y=list_range_y[i_shape], \ - title=";%s;%s" % (title_x, title_y_ivan)) - for gr, c in zip((hf_data_syst, incl_data_syst, hf_ivan_syst_plot, incl_ivan_syst_plot), (c_hf_data, c_incl_data, c_hf_ivan, c_incl_ivan)): + title=";%s;%s" % (title_x, title_y_ivan), + ) + for gr, c in zip( + (hf_data_syst, incl_data_syst, hf_ivan_syst_plot, incl_ivan_syst_plot), + (c_hf_data, c_incl_data, c_hf_ivan, c_incl_ivan), + ): gr.SetMarkerColor(get_colour(c)) leg_data_mc = list_obj_data_new[0] leg_data_mc.SetTextSize(fontsize_glob / h_pad1) leg_data_mc.SetHeader("data") leg_data_mc.AddEntry(hf_data_syst, "syst. unc.", "f") # leg_data_mc_theory = TLegend(.65, .35, .82, .55) - leg_data_mc_theory = TLegend(.7, .3, .87, .5) + leg_data_mc_theory = TLegend(0.7, 0.3, 0.87, 0.5) setup_legend(leg_data_mc_theory, fontsize_glob / h_pad1) leg_data_mc_theory.SetTextSize(fontsize_glob / h_pad1) leg_data_mc_theory.SetHeader(text_ivan) @@ -1009,7 +1209,7 @@ def main(): # pylint: disable=too-many-locals, too-many-statements, too-many-bra y_latex = y_latex_top list_latex_data = [] # for text_latex in [text_alice, text_jets, text_ptjet, text_pth, text_ptcut, text_sd]: - for text_latex in [text_alice, text_jets, text_ptjet, text_pth, text_sd]: # w/o text_ptcut + for text_latex in [text_alice, text_jets, text_ptjet, text_pth, text_sd]: # w/o text_ptcut latex = TLatex(x_latex, y_latex, text_latex) list_latex_data.append(latex) draw_latex(latex, textsize=(fontsize_glob / h_pad1)) @@ -1029,29 +1229,40 @@ def main(): # pylint: disable=too-many-locals, too-many-statements, too-many-bra line_1.SetLineWidth(3) hf_ratio_ivan_stat = hf_ivan_stat.Clone(f"{hf_ivan_stat.GetName()}_rat") hf_ratio_ivan_stat.Divide(hf_data_stat) - hf_ratio_ivan_stat.SetBinContent(1, 1.) + hf_ratio_ivan_stat.SetBinContent(1, 1.0) hf_ratio_ivan_syst = divide_graphs(hf_ivan_syst, hf_data_syst) - hf_ratio_ivan_syst.RemovePoint(0) # delete the untagged bin point + hf_ratio_ivan_syst.RemovePoint(0) # delete the untagged bin point incl_ratio_ivan_stat = incl_ivan_stat.Clone(f"{incl_ivan_stat.GetName()}_rat") incl_ratio_ivan_stat.Divide(incl_data_stat) - incl_ratio_ivan_stat.SetBinContent(1, 1.) 
+ incl_ratio_ivan_stat.SetBinContent(1, 1.0) incl_ratio_ivan_syst = divide_graphs(incl_ivan_syst, incl_data_syst) - incl_ratio_ivan_syst.RemovePoint(0) # delete the untagged bin point - leg_pos = [.15, .7, .4, .95] + incl_ratio_ivan_syst.RemovePoint(0) # delete the untagged bin point + leg_pos = [0.15, 0.7, 0.4, 0.95] list_obj = [hf_ratio_ivan_syst, hf_ratio_ivan_stat, line_1] labels_obj = [f"{p_latexnhadron}-tagged {text_ivan}", "", ""] colours = [get_colour(i, j) for i, j in zip((c_hf_ivan, c_hf_ivan), (2, 1))] markers = [m_hf_ivan, m_hf_ivan] y_margin_up = 0.05 y_margin_down = 0.05 - cshape_datamc_ivan, list_obj_data_mc_hf_new = make_plot("cshape_data_mc_hf_ivan_" + suffix, size=size_can_double, \ - can=cshape_datamc_ivan, pad=2, \ - list_obj=list_obj, labels_obj=labels_obj, opt_leg_g=opt_leg_g, opt_plot_g="3", offsets_xy=[1, 1.3 * 3/7], \ - colours=colours, markers=markers, leg_pos=None, margins_c=margins_can_double, \ - range_x=list_range_x[i_shape], \ + cshape_datamc_ivan, list_obj_data_mc_hf_new = make_plot( + "cshape_data_mc_hf_ivan_" + suffix, + size=size_can_double, + can=cshape_datamc_ivan, + pad=2, + list_obj=list_obj, + labels_obj=labels_obj, + opt_leg_g=opt_leg_g, + opt_plot_g="3", + offsets_xy=[1, 1.3 * 3 / 7], + colours=colours, + markers=markers, + leg_pos=None, + margins_c=margins_can_double, + range_x=list_range_x[i_shape], # margins_y=[y_margin_down, y_margin_up], \ - range_y=[0.2, 4.4], \ - title=title_full_ratio_theory) + range_y=[0.2, 4.4], + title=title_full_ratio_theory, + ) list_obj[0].GetXaxis().SetLabelSize(0.1) list_obj[0].GetXaxis().SetTitleSize(0.1) list_obj[0].GetYaxis().SetLabelSize(fontsize_glob / h_pad2) @@ -1062,7 +1273,7 @@ def main(): # pylint: disable=too-many-locals, too-many-statements, too-many-bra for gr, c in zip([hf_ratio_ivan_syst], [c_hf_ivan]): gr.SetMarkerColor(get_colour(c)) # leg_data_mc_hf = list_obj_data_mc_hf_new[0] - #leg_data_mc_hf.SetHeader("%s-tagged" % p_latexnhadron) + # leg_data_mc_hf.SetHeader("%s-tagged" % p_latexnhadron) # leg_data_mc_hf.SetTextSize(fontsize * 7/3) # leg_data_mc_hf.SetNColumns(2) if shape == "nsd": @@ -1070,21 +1281,32 @@ def main(): # pylint: disable=too-many-locals, too-many-statements, too-many-bra cshape_datamc_ivan.Update() # inclusive theory/data - leg_pos = [.15, .8, .9, .95] + leg_pos = [0.15, 0.8, 0.9, 0.95] list_obj = [incl_ratio_ivan_syst, incl_ratio_ivan_stat, line_1] labels_obj = [f"inclusive {text_ivan}", ""] colours = [get_colour(i, j) for i, j in zip((c_incl_ivan, c_incl_ivan), (2, 1))] markers = [m_incl_ivan, m_incl_ivan] y_margin_up = 0.05 y_margin_down = 0.05 - cshape_datamc_ivan, list_obj_data_mc_hf_new_2 = make_plot("cshape_data_mc_incl_ivan_" + suffix, size=size_can_double, \ - can=cshape_datamc_ivan, pad=3, \ - list_obj=list_obj, labels_obj=labels_obj, opt_leg_g=opt_leg_g, opt_plot_g="3", offsets_xy=[1, 1.3 * 3/7], \ - colours=colours, markers=markers, leg_pos=None, margins_c=margins_can_double, \ - range_x=list_range_x[i_shape], \ - margins_y=[y_margin_down, y_margin_up], \ + cshape_datamc_ivan, list_obj_data_mc_hf_new_2 = make_plot( + "cshape_data_mc_incl_ivan_" + suffix, + size=size_can_double, + can=cshape_datamc_ivan, + pad=3, + list_obj=list_obj, + labels_obj=labels_obj, + opt_leg_g=opt_leg_g, + opt_plot_g="3", + offsets_xy=[1, 1.3 * 3 / 7], + colours=colours, + markers=markers, + leg_pos=None, + margins_c=margins_can_double, + range_x=list_range_x[i_shape], + margins_y=[y_margin_down, y_margin_up], # range_y=list_range_y_rat[i_shape], \ - title=title_full_ratio_theory) + 
title=title_full_ratio_theory, + ) list_obj[0].GetXaxis().SetLabelSize(fontsize_glob / h_pad3) list_obj[0].GetXaxis().SetTitleSize(scale_title * fontsize_glob / h_pad3) list_obj[0].GetXaxis().SetTitleOffset(0.8) @@ -1096,7 +1318,7 @@ def main(): # pylint: disable=too-many-locals, too-many-statements, too-many-bra for gr, c in zip([incl_ratio_ivan_syst], [c_incl_ivan]): gr.SetMarkerColor(get_colour(c)) # leg_data_mc_hf = list_obj_data_mc_hf_new_2[0] - #leg_data_mc_hf.SetHeader("%s-tagged" % p_latexnhadron) + # leg_data_mc_hf.SetHeader("%s-tagged" % p_latexnhadron) # leg_data_mc_hf.SetTextSize(fontsize * 7/3) # leg_data_mc_hf.SetNColumns(2) if shape == "nsd": @@ -1104,17 +1326,18 @@ def main(): # pylint: disable=too-many-locals, too-many-statements, too-many-bra cshape_datamc_ivan.Update() # Draw LaTeX - #y_latex = y_latex_top - #list_latex_data_mc_hf = [] - #for text_latex in [text_alice, text_jets, text_ptjet, text_pth, text_sd]: + # y_latex = y_latex_top + # list_latex_data_mc_hf = [] + # for text_latex in [text_alice, text_jets, text_ptjet, text_pth, text_sd]: # latex = TLatex(x_latex, y_latex, text_latex) # list_latex_data_mc_hf.append(latex) # draw_latex(latex, textsize=fontsize) # y_latex -= y_step - #cshape_datamc_ivan.Update() + # cshape_datamc_ivan.Update() pad1.RedrawAxis() pad2.RedrawAxis() pad3.RedrawAxis() cshape_datamc_ivan.SaveAs("%s/%s_datamc_ivan_%s.pdf" % (rootpath, shape, suffix)) + main() diff --git a/machine_learning_hep/plotting/plot_jetsubstructure_run3.py b/machine_learning_hep/plotting/plot_jetsubstructure_run3.py index 59f3a3fca7..f7ab01bc01 100644 --- a/machine_learning_hep/plotting/plot_jetsubstructure_run3.py +++ b/machine_learning_hep/plotting/plot_jetsubstructure_run3.py @@ -680,8 +680,7 @@ def plot(self): for cat, label in zip(("pr", "np"), ("prompt", "non-prompt")): self.list_obj = self.get_objects( *( - f"h_ptjet-pthf_effnew_{cat}_" - f"{string_range_ptjet(get_bin_limits(axis_ptjet, iptjet + 1))}" + f"h_ptjet-pthf_effnew_{cat}_{string_range_ptjet(get_bin_limits(axis_ptjet, iptjet + 1))}" for iptjet in bins_ptjet ) ) @@ -724,7 +723,7 @@ def plot(self): self.list_obj = self.get_objects( f"h_ptjet-{self.var}_signal_{string_pthf}_{self.mcordata}", f"h_ptjet-{self.var}_sideband_{string_pthf}_{self.mcordata}", - f"h_ptjet-{self.var}_subtracted_notscaled_{string_pthf}" f"_{self.mcordata}", + f"h_ptjet-{self.var}_subtracted_notscaled_{string_pthf}_{self.mcordata}", ) self.list_obj = [project_hist(h, [1], {0: (iptjet + 1, iptjet + 1)}) for h in self.list_obj] self.labels_obj = ["signal region", "scaled sidebands", "after subtraction"] @@ -776,7 +775,7 @@ def plot(self): if plot_unfolding: self.logger.info("Plotting unfolding") self.list_obj = [ - self.get_object(f"h_{self.var}_{self.method}_unfolded_{self.mcordata}_" f"{string_ptjet}_{i}") + self.get_object(f"h_{self.var}_{self.method}_unfolded_{self.mcordata}_{string_ptjet}_{i}") for i in range(self.niter_unfolding) ] self.labels_obj = [f"iteration {i + 1}" for i in range(self.niter_unfolding)] @@ -809,7 +808,7 @@ def plot(self): self.plot_errors_x = False self.range_x = x_range[self.var] h_stat = self.get_object( - f"h_{self.var}_{self.method}_unfolded_{self.mcordata}_" f"{string_ptjet}_sel_selfnorm" + f"h_{self.var}_{self.method}_unfolded_{self.mcordata}_{string_ptjet}_sel_selfnorm" ) self.list_obj = [h_stat] self.plot_order = list(range(len(self.list_obj))) diff --git a/machine_learning_hep/processer.py b/machine_learning_hep/processer.py index a4f2a825ad..7072ac3eef 100644 --- 
a/machine_learning_hep/processer.py +++ b/machine_learning_hep/processer.py @@ -13,6 +13,7 @@ """ main script for doing data processing, machine learning and analysis """ + import glob import multiprocessing as mp import os @@ -25,35 +26,63 @@ from copy import deepcopy from functools import reduce from typing import TypeVar -from pandas.api.types import is_numeric_dtype import numpy as np import pandas as pd import uproot +from pandas.api.types import is_numeric_dtype from .bitwise import tag_bit_df from .io import dump_yaml_from_dict from .logger import get_logger -from .utilities import (count_df_length_pkl, dfquery, mask_df, merge_method, - mergerootfiles, openfile, read_df, seldf_singlevar, - write_df) -from .utilities_files import (appendmainfoldertolist, create_folder_struc, - createlist, list_folders) +from .utilities import ( + count_df_length_pkl, + dfquery, + mask_df, + merge_method, + mergerootfiles, + openfile, + read_df, + seldf_singlevar, + write_df, +) +from .utilities_files import appendmainfoldertolist, create_folder_struc, createlist, list_folders pd.options.mode.chained_assignment = None -class Processer: # pylint: disable=too-many-instance-attributes + +class Processer: # pylint: disable=too-many-instance-attributes # Class Attribute - species = 'processer' + species = "processer" logger = get_logger() # Initializer / Instance Attributes # pylint: disable=too-many-statements, too-many-arguments, consider-using-f-string - def __init__(self, case, datap, run_param, mcordata, p_maxfiles, # pylint: disable=too-many-branches - d_root, d_pkl, d_pklsk, d_pkl_ml, p_period, i_period, - p_chunksizeunp, p_chunksizeskim, p_maxprocess, - p_frac_merge, p_rd_merge, d_pkl_dec, d_pkl_decmerged, - d_results, typean, runlisttrigger, d_mcreweights): + def __init__( + self, + case, + datap, + run_param, + mcordata, + p_maxfiles, # pylint: disable=too-many-branches + d_root, + d_pkl, + d_pklsk, + d_pkl_ml, + p_period, + i_period, + p_chunksizeunp, + p_chunksizeskim, + p_maxprocess, + p_frac_merge, + p_rd_merge, + d_pkl_dec, + d_pkl_decmerged, + d_results, + typean, + runlisttrigger, + d_mcreweights, + ): self.doml = datap["doml"] self.case = case # used in hadrons self.typean = typean @@ -78,8 +107,7 @@ def __init__(self, case, datap, run_param, mcordata, p_maxfiles, # pylint: disab except TypeError: self.p_frac_merge = [p_frac_merge] * self.p_nptbins if len(self.p_frac_merge) != self.p_nptbins: - print(f"Length of merge-fraction list != number of pT bins \n" \ - f"{len(self.p_frac_merge)} != {self.p_nptbins}") + print(f"Length of merge-fraction list != number of pT bins \n{len(self.p_frac_merge)} != {self.p_nptbins}") sys.exit(1) self.p_rd_merge = p_rd_merge @@ -95,18 +123,18 @@ def __init__(self, case, datap, run_param, mcordata, p_maxfiles, # pylint: disab self.p_chunksizeunp = p_chunksizeunp self.p_chunksizeskim = p_chunksizeskim - self.df_read = datap['dfs']['read'] - self.df_merge = datap['dfs'].get('merge', None) - self.df_write = datap['dfs'].get('write', None) + self.df_read = datap["dfs"]["read"] + self.df_merge = datap["dfs"].get("merge", None) + self.df_write = datap["dfs"].get("write", None) - #parameter names + # parameter names self.p_maxprocess = p_maxprocess # self.indexsample = None self.p_dofullevtmerge = datap["dofullevtmerge"] - #namefile root + # namefile root self.n_root = datap["files_names"]["namefile_unmerged_tree"] - #namefiles pkl + # namefiles pkl # def nget(d : dict, k : list, dd = None): # return nget(d.get(k.pop(0), {}), k, dd) if len(k) > 1 else 
d.get(k.pop(0), dd) # nget(datap, ['dfs', 'write', 'jetsubdet', 'file']) @@ -123,14 +151,14 @@ def __init__(self, case, datap, run_param, mcordata, p_maxfiles, # pylint: disab self.n_fileresp = datap["files_names"]["respfilename"] self.n_mcreweights = datap["files_names"]["namefile_mcweights"] - #selections + # selections self.s_reco_skim = datap["sel_reco_skim"] self.s_gen_skim = datap["sel_gen_skim"] - #bitmap + # bitmap # self.b_mcrefl = datap["bitmap_sel"].get("ismcrefl", None) - #variables name + # variables name self.v_train = datap["variables"]["var_training"] self.v_bitvar = datap["bitmap_sel"]["var_name"] # used in hadrons # self.v_bitvar_gen = datap["bitmap_sel"]["var_name_gen"] @@ -148,18 +176,16 @@ def __init__(self, case, datap, run_param, mcordata, p_maxfiles, # pylint: disab self.v_invmass = datap["variables"].get("var_inv_mass", "inv_mass") # self.v_rapy = datap["variables"].get("var_y", "y_cand") - #list of files names + # list of files names if os.path.isdir(self.d_root): - self.l_path = list_folders(self.d_root, self.n_root, self.p_maxfiles, - self.select_jobs) + self.l_path = list_folders(self.d_root, self.n_root, self.p_maxfiles, self.select_jobs) elif glob.glob(f"{self.d_pkl}/**/{self.n_reco}", recursive=True): - self.l_path = list_folders(self.d_pkl, self.n_reco, self.p_maxfiles, - self.select_jobs) + self.l_path = list_folders(self.d_pkl, self.n_reco, self.p_maxfiles, self.select_jobs) else: - self.n_sk = self.n_reco.replace(".p", "_%s%d_%d.p" % \ - (self.v_var_binning, self.lpt_anbinmin[0], self.lpt_anbinmax[0])) - self.l_path = list_folders(self.d_pklsk, self.n_sk, self.p_maxfiles, - self.select_jobs) + self.n_sk = self.n_reco.replace( + ".p", "_%s%d_%d.p" % (self.v_var_binning, self.lpt_anbinmin[0], self.lpt_anbinmax[0]) + ) + self.l_path = list_folders(self.d_pklsk, self.n_sk, self.p_maxfiles, self.select_jobs) self.l_root = createlist(self.d_root, self.l_path, self.n_root) self.l_reco = createlist(self.d_pkl, self.l_path, self.n_reco) @@ -191,8 +217,8 @@ def __init__(self, case, datap, run_param, mcordata, p_maxfiles, # pylint: disab # Potentially mask certain values (e.g. 
nsigma TOF of -999) self.p_mask_values = datap["ml"].get("mask_values", None) - self.bins_skimming = np.array(list(zip(self.lpt_anbinmin, self.lpt_anbinmax)), 'd') - self.bins_analysis = np.array(list(zip(self.lpt_finbinmin, self.lpt_finbinmax)), 'd') + self.bins_skimming = np.array(list(zip(self.lpt_anbinmin, self.lpt_anbinmax)), "d") + self.bins_analysis = np.array(list(zip(self.lpt_finbinmin, self.lpt_finbinmax)), "d") bin_matching = [ [ptrange[0] <= bin[0] and ptrange[1] >= bin[1] for ptrange in self.bins_skimming].index(True) for bin in self.bins_analysis @@ -200,33 +226,39 @@ def __init__(self, case, datap, run_param, mcordata, p_maxfiles, # pylint: disab self.lpt_probcutpre = datap["mlapplication"]["probcutpresel"][self.mcordata] lpt_probcutfin_tmp = datap["mlapplication"]["probcutoptimal"] - self.lpt_probcutfin = [lpt_probcutfin_tmp[bin_matching[ibin]] - for ibin in range(self.p_nptfinbins)] + self.lpt_probcutfin = [lpt_probcutfin_tmp[bin_matching[ibin]] for ibin in range(self.p_nptfinbins)] for ibin, probcutfin in enumerate(self.lpt_probcutfin): probcutpre = self.lpt_probcutpre[bin_matching[ibin]] if self.mltype == "MultiClassification": if probcutfin[0] > probcutpre[0] or probcutfin[1] < probcutpre[1] or probcutfin[2] < probcutpre[2]: - self.logger.fatal("Probability cut final: %s must be tighter than presel %s!\n" \ - "Verify that bkg prob presel > final, and other cuts presel < final", - self.lpt_probcutfin, self.lpt_probcutpre) + self.logger.fatal( + "Probability cut final: %s must be tighter than presel %s!\n" + "Verify that bkg prob presel > final, and other cuts presel < final", + self.lpt_probcutfin, + self.lpt_probcutpre, + ) elif probcutfin < probcutpre: - self.logger.fatal("Probability cut final: %s must be tighter (smaller values) than presel %s!", - self.lpt_probcutfin, self.lpt_probcutpre) + self.logger.fatal( + "Probability cut final: %s must be tighter (smaller values) than presel %s!", + self.lpt_probcutfin, + self.lpt_probcutpre, + ) if self.mltype == "MultiClassification": self.l_selml = [] comps = ["<=", ">=", ">="] for ipt in range(self.p_nptfinbins): - mlsel_multi = [f'y_test_prob{self.p_modelname}{label.replace("-", "_")} ' \ - f'{comp} {probcut}' - for label, comp, probcut in zip(self.class_labels, comps, - self.lpt_probcutfin[ipt])] + mlsel_multi = [ + f"y_test_prob{self.p_modelname}{label.replace('-', '_')} {comp} {probcut}" + for label, comp, probcut in zip(self.class_labels, comps, self.lpt_probcutfin[ipt]) + ] self.l_selml.append(" and ".join(mlsel_multi)) else: - self.l_selml = [f"y_test_prob{self.p_modelname} > {self.lpt_probcutfin[ipt]}" \ - for ipt in range(self.p_nptfinbins)] + self.l_selml = [ + f"y_test_prob{self.p_modelname} > {self.lpt_probcutfin[ipt]}" for ipt in range(self.p_nptfinbins) + ] self.d_pkl_dec = d_pkl_dec self.mptfiles_recosk = [] @@ -238,52 +270,80 @@ def __init__(self, case, datap, run_param, mcordata, p_maxfiles, # pylint: disab self.n_fileeff = os.path.join(self.d_results, self.n_fileeff) self.n_fileresp = os.path.join(self.d_results, self.n_fileresp) - self.lpt_recosk = [self.n_reco.replace(".p", "_%s%d_%d.p" % \ - (self.v_var_binning, self.lpt_anbinmin[i], self.lpt_anbinmax[i])) \ - for i in range(self.p_nptbins)] - self.lpt_gensk = [self.n_gen.replace(".p", "_%s%d_%d.p" % \ - (self.v_var_binning, self.lpt_anbinmin[i], self.lpt_anbinmax[i])) \ - for i in range(self.p_nptbins)] - self.lpt_reco_ml = [os.path.join(self.d_pkl_ml, self.lpt_recosk[ipt]) \ - for ipt in range(self.p_nptbins)] - self.lpt_gen_ml = 
[os.path.join(self.d_pkl_ml, self.lpt_gensk[ipt]) \ - for ipt in range(self.p_nptbins)] + self.lpt_recosk = [ + self.n_reco.replace(".p", "_%s%d_%d.p" % (self.v_var_binning, self.lpt_anbinmin[i], self.lpt_anbinmax[i])) + for i in range(self.p_nptbins) + ] + self.lpt_gensk = [ + self.n_gen.replace(".p", "_%s%d_%d.p" % (self.v_var_binning, self.lpt_anbinmin[i], self.lpt_anbinmax[i])) + for i in range(self.p_nptbins) + ] + self.lpt_reco_ml = [os.path.join(self.d_pkl_ml, self.lpt_recosk[ipt]) for ipt in range(self.p_nptbins)] + self.lpt_gen_ml = [os.path.join(self.d_pkl_ml, self.lpt_gensk[ipt]) for ipt in range(self.p_nptbins)] self.f_evt_count_ml = os.path.join(self.d_pkl_ml, self.n_evt_count_ml) - self.lpt_gensk_sl = [self.n_gen_sl.replace(".p", "_%s%d_%d.p" % - (self.v_var_binning, self.lpt_anbinmin[i], self.lpt_anbinmax[i])) - for i in range(self.p_nptbins)] if self.n_gen_sl else None + self.lpt_gensk_sl = ( + [ + self.n_gen_sl.replace( + ".p", "_%s%d_%d.p" % (self.v_var_binning, self.lpt_anbinmin[i], self.lpt_anbinmax[i]) + ) + for i in range(self.p_nptbins) + ] + if self.n_gen_sl + else None + ) self.lpt_recodec = None if self.doml is True: if self.mltype == "MultiClassification": - self.lpt_recodec = [self.n_reco.replace(".p", "%d_%d_%.2f%.2f%.2f.p" % \ - (self.lpt_anbinmin[i], self.lpt_anbinmax[i], - self.lpt_probcutpre[i][0], self.lpt_probcutpre[i][1], - self.lpt_probcutpre[i][2])) \ - for i in range(self.p_nptbins)] + self.lpt_recodec = [ + self.n_reco.replace( + ".p", + "%d_%d_%.2f%.2f%.2f.p" + % ( + self.lpt_anbinmin[i], + self.lpt_anbinmax[i], + self.lpt_probcutpre[i][0], + self.lpt_probcutpre[i][1], + self.lpt_probcutpre[i][2], + ), + ) + for i in range(self.p_nptbins) + ] else: - self.lpt_recodec = [self.n_reco.replace(".p", "%d_%d_%.2f.p" % \ - (self.lpt_anbinmin[i], self.lpt_anbinmax[i], \ - self.lpt_probcutpre[i])) for i in range(self.p_nptbins)] + self.lpt_recodec = [ + self.n_reco.replace( + ".p", "%d_%d_%.2f.p" % (self.lpt_anbinmin[i], self.lpt_anbinmax[i], self.lpt_probcutpre[i]) + ) + for i in range(self.p_nptbins) + ] else: - self.lpt_recodec = [self.n_reco.replace(".p", "%d_%d_std.p" % \ - (self.lpt_anbinmin[i], self.lpt_anbinmax[i])) \ - for i in range(self.p_nptbins)] - - self.mptfiles_recosk = [createlist(self.d_pklsk, self.l_path, \ - self.lpt_recosk[ipt]) for ipt in range(self.p_nptbins)] - self.mptfiles_recoskmldec = [createlist(self.d_pkl_dec, self.l_path, \ - self.lpt_recodec[ipt]) for ipt in range(self.p_nptbins)] - self.lpt_recodecmerged = [os.path.join(self.d_pkl_decmerged, self.lpt_recodec[ipt]) - for ipt in range(self.p_nptbins)] + self.lpt_recodec = [ + self.n_reco.replace(".p", "%d_%d_std.p" % (self.lpt_anbinmin[i], self.lpt_anbinmax[i])) + for i in range(self.p_nptbins) + ] + + self.mptfiles_recosk = [ + createlist(self.d_pklsk, self.l_path, self.lpt_recosk[ipt]) for ipt in range(self.p_nptbins) + ] + self.mptfiles_recoskmldec = [ + createlist(self.d_pkl_dec, self.l_path, self.lpt_recodec[ipt]) for ipt in range(self.p_nptbins) + ] + self.lpt_recodecmerged = [ + os.path.join(self.d_pkl_decmerged, self.lpt_recodec[ipt]) for ipt in range(self.p_nptbins) + ] if self.mcordata == "mc": - self.mptfiles_gensk = [createlist(self.d_pklsk, self.l_path, \ - self.lpt_gensk[ipt]) for ipt in range(self.p_nptbins)] - self.lpt_gendecmerged = [os.path.join(self.d_pkl_decmerged, self.lpt_gensk[ipt]) - for ipt in range(self.p_nptbins)] - self.mptfiles_gensk_sl = [createlist(self.d_pklsk, self.l_path, - self.lpt_gensk_sl[ipt]) for ipt in range(self.p_nptbins)] if 
self.lpt_gensk_sl else None + self.mptfiles_gensk = [ + createlist(self.d_pklsk, self.l_path, self.lpt_gensk[ipt]) for ipt in range(self.p_nptbins) + ] + self.lpt_gendecmerged = [ + os.path.join(self.d_pkl_decmerged, self.lpt_gensk[ipt]) for ipt in range(self.p_nptbins) + ] + self.mptfiles_gensk_sl = ( + [createlist(self.d_pklsk, self.l_path, self.lpt_gensk_sl[ipt]) for ipt in range(self.p_nptbins)] + if self.lpt_gensk_sl + else None + ) # self.triggerbit = datap["analysis"][self.typean]["triggerbit"] self.runlistrigger = runlisttrigger @@ -297,11 +357,15 @@ def __init__(self, case, datap, run_param, mcordata, p_maxfiles, # pylint: disab self.do_custom_analysis_cuts = datap["analysis"][self.typean].get("use_cuts", False) T = TypeVar("T") + def cfg(self, param: str, default: T = None) -> T: - return reduce(lambda d, key: d.get(key, default) if isinstance(d, dict) else default, - param.split("."), self.datap['analysis'][self.typean]) + return reduce( + lambda d, key: d.get(key, default) if isinstance(d, dict) else default, + param.split("."), + self.datap["analysis"][self.typean], + ) - def unpack(self, file_index, max_no_keys = None): # pylint: disable=too-many-branches, too-many-locals + def unpack(self, file_index, max_no_keys=None): # pylint: disable=too-many-branches, too-many-locals def dfread(rdir, trees, cols, idx_name=None): """Read DF from multiple (joinable) O2 tables""" try: @@ -312,22 +376,21 @@ def dfread(rdir, trees, cols, idx_name=None): df = None for tree, col in zip([rdir[name] for name in trees], cols): try: - data = tree.arrays(expressions=col, library='np') + data = tree.arrays(expressions=col, library="np") dfnew = pd.DataFrame(columns=col, data=data) df = pd.concat([df, dfnew], axis=1) - except Exception as e: # pylint: disable=broad-except + except Exception as e: # pylint: disable=broad-except tree.show(name_width=50) - self.logger.critical('Failed to read data frame from tree %s: %s', - tree.name, str(e)) + self.logger.critical("Failed to read data frame from tree %s: %s", tree.name, str(e)) sys.exit() - df['df'] = int(df_no) + df["df"] = int(df_no) if idx_name: # df.rename_axis(idx_name, inplace=True) df[idx_name] = df.index - df.set_index(['df', idx_name], inplace=True) + df.set_index(["df", idx_name], inplace=True) return df except Exception as e: - self.logger.exception('Failed to read data from trees: %s', str(e)) + self.logger.exception("Failed to read data from trees: %s", str(e)) raise e def dfappend(name: str, dfa): @@ -339,31 +402,33 @@ def dfmerge(dfl, dfr, **kwargs): try: return pd.merge(dfl, dfr, **kwargs) except Exception as e: - self.logger.error('merging failed: %s', str(e)) + self.logger.error("merging failed: %s", str(e)) dfl.info() dfr.info() raise e def dfuse(df_spec): - level = df_spec.get('level', 'all') - return ((level == 'all') or - (level in ('mc', 'gen', 'det') and self.mcordata == 'mc') or - (level in ('data') and self.mcordata == 'data')) - - self.logger.info('unpacking: %s', self.l_root[file_index]) + level = df_spec.get("level", "all") + return ( + (level == "all") + or (level in ("mc", "gen", "det") and self.mcordata == "mc") + or (level in ("data") and self.mcordata == "data") + ) + + self.logger.info("unpacking: %s", self.l_root[file_index]) dfs = {} - self.logger.debug(' -> reading') + self.logger.debug(" -> reading") with uproot.open(self.l_root[file_index]) as rfile: df_processed = set() - keys = rfile.keys(recursive=False, filter_name='DF_*') - self.logger.info('found %d dataframes, reading %s', len(keys), max_no_keys or "all") 
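Aside on the cfg helper reformatted a few hunks above: it resolves dotted configuration paths by folding functools.reduce over the nested analysis dict. A minimal standalone sketch of the same pattern; the cfg_lookup name and the toy conf dict are illustrative only, not part of the package:

    from functools import reduce

    def cfg_lookup(tree: dict, param: str, default=None):
        # fold the dotted path over nested dicts; fall back to `default`
        # as soon as a key is missing or a non-dict value is reached
        return reduce(
            lambda d, key: d.get(key, default) if isinstance(d, dict) else default,
            param.split("."),
            tree,
        )

    conf = {"observables": {"zg": {"bins_fix": [10, 0.0, 0.5]}}}
    assert cfg_lookup(conf, "observables.zg.bins_fix") == [10, 0.0, 0.5]
    assert cfg_lookup(conf, "observables.zg.label", "zg") == "zg"

A missing intermediate key yields `default` for the rest of the fold, which is why callers that iterate over the result, such as self.cfg("observables", {}), pass a container as the default.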
- for (idx, key) in enumerate(keys[:max_no_keys]): - if not (df_key := re.match('^DF_(\\d+);', key)): + keys = rfile.keys(recursive=False, filter_name="DF_*") + self.logger.info("found %d dataframes, reading %s", len(keys), max_no_keys or "all") + for idx, key in enumerate(keys[:max_no_keys]): + if not (df_key := re.match("^DF_(\\d+);", key)): continue if (df_no := int(df_key.group(1))) in df_processed: - self.logger.warning('multiple versions of DF %d', df_no) + self.logger.warning("multiple versions of DF %d", df_no) continue - self.logger.debug('processing DF %d - %d / %d', df_no, idx, len(keys)) + self.logger.debug("processing DF %d - %d / %d", df_no, idx, len(keys)) df_processed.add(df_no) rdir = rfile[key] @@ -371,110 +436,114 @@ def dfuse(df_spec): if dfuse(df_spec): trees = [] cols = [] - for tree, spec in zip(df_spec['trees'].keys(), df_spec['trees'].values()): + for tree, spec in zip(df_spec["trees"].keys(), df_spec["trees"].values()): if isinstance(spec, list): trees.append(tree) cols.append(spec) elif dfuse(spec): trees.append(tree) - cols.append(spec['vars']) - df = dfread(rdir, trees, cols, idx_name=df_spec.get('index', None)) + cols.append(spec["vars"]) + df = dfread(rdir, trees, cols, idx_name=df_spec.get("index", None)) dfappend(df_name, df) for df_name, df_spec in self.df_read.items(): if dfuse(df_spec) and not dfs[df_name].empty: - if 'extra' in df_spec: - self.logger.debug(' %s -> extra', df_name) - for col_name, col_val in df_spec['extra'].items(): - self.logger.debug(' %s -> %s', col_name, col_val) + if "extra" in df_spec: + self.logger.debug(" %s -> extra", df_name) + for col_name, col_val in df_spec["extra"].items(): + self.logger.debug(" %s -> %s", col_name, col_val) dfs[df_name][col_name] = dfs[df_name].eval(col_val) - if 'extract_component' in df_spec: - self.logger.debug(' %s -> extract_component', df_name) - specs = df_spec['extract_component'] + if "extract_component" in df_spec: + self.logger.debug(" %s -> extract_component", df_name) + specs = df_spec["extract_component"] for spec in specs: - var, newvar, component = spec['var'], spec['newvar'], spec['component'] + var, newvar, component = spec["var"], spec["newvar"], spec["component"] dfs[df_name][newvar] = dfs[df_name][var].apply(lambda x, comp=component: x[comp]) - if 'filter' in df_spec: - self.logger.debug(' %s -> filter', df_name) - dfquery(dfs[df_name], df_spec['filter'], inplace=True) - if 'tags' in df_spec: - self.logger.debug(' %s -> tags', df_name) - for tag, value in df_spec['tags'].items(): + if "filter" in df_spec: + self.logger.debug(" %s -> filter", df_name) + dfquery(dfs[df_name], df_spec["filter"], inplace=True) + if "tags" in df_spec: + self.logger.debug(" %s -> tags", df_name) + for tag, value in df_spec["tags"].items(): if dfuse(value): dfs[df_name][tag] = np.array( - tag_bit_df(dfs[df_name], value['var'], value['req'], value.get('abs', False)), - dtype=int) + tag_bit_df(dfs[df_name], value["var"], value["req"], value.get("abs", False)), dtype=int + ) - if 'swap' in df_spec: - self.logger.debug(' %s -> swap', df_name) - spec = df_spec['swap'] + if "swap" in df_spec: + self.logger.debug(" %s -> swap", df_name) + spec = df_spec["swap"] if dfuse(spec): - swapped = dfs[df_name][spec['cand']] == dfs[df_name][spec['var_swap']] + 1 - for var in spec['vars']: + swapped = dfs[df_name][spec["cand"]] == dfs[df_name][spec["var_swap"]] + 1 + for var in spec["vars"]: dfs[df_name][var] = np.logical_and(dfs[df_name][var] == 1, swapped) - self.logger.debug(' %s -> done', df_name) - + self.logger.debug(" 
%s -> done", df_name) if self.df_merge: for m_spec in self.df_merge: - base = m_spec['base'] - ref = m_spec['ref'] - out = m_spec.get('out', base) + base = m_spec["base"] + ref = m_spec["ref"] + out = m_spec.get("out", base) if all([dfuse(self.df_read[base]), dfuse(self.df_read[ref])]): - if (on := m_spec.get('use', None)) is not None: - self.logger.info('merging %s with %s on %s into %s', base, ref, on, out) - if not isinstance(on, list) or 'df' not in on: - on = ['df', on] - dfs[out] = dfmerge(dfs[base], dfs[ref], suffixes=(f'_{base}', None), on=on) - elif (on := m_spec.get('left_on', None)) is not None: - self.logger.info('merging %s with %s on %s into %s', base, ref, on, out) + if (on := m_spec.get("use", None)) is not None: + self.logger.info("merging %s with %s on %s into %s", base, ref, on, out) + if not isinstance(on, list) or "df" not in on: + on = ["df", on] + dfs[out] = dfmerge(dfs[base], dfs[ref], suffixes=(f"_{base}", None), on=on) + elif (on := m_spec.get("left_on", None)) is not None: + self.logger.info("merging %s with %s on %s into %s", base, ref, on, out) if not is_numeric_dtype(dfs[base][on]): - self.logger.info('exploding dataframe %s on variable %s', base, on) + self.logger.info("exploding dataframe %s on variable %s", base, on) dfs[out] = dfmerge( - dfs[base].explode(on), dfs[ref], left_on=['df', on], suffixes=(f'_{base}', None), - right_index=True) + dfs[base].explode(on), + dfs[ref], + left_on=["df", on], + suffixes=(f"_{base}", None), + right_index=True, + ) else: dfs[out] = dfmerge( - dfs[base], dfs[ref], left_on=['df', on], suffixes=(f'_{base}', None), right_index=True) + dfs[base], dfs[ref], left_on=["df", on], suffixes=(f"_{base}", None), right_index=True + ) else: - var = self.df_read[ref]['index'] - self.logger.info('merging %s with %s on %s (default) into %s', base, ref, var, out) + var = self.df_read[ref]["index"] + self.logger.info("merging %s with %s on %s (default) into %s", base, ref, var, out) dfs[out] = dfmerge( - dfs[base], dfs[ref], left_on=['df', var], suffixes=(f'_{base}', None), right_index=True) - if 'extra' in m_spec: - self.logger.debug(' %s -> extra', out) - for col_name, col_val in m_spec['extra'].items(): + dfs[base], dfs[ref], left_on=["df", var], suffixes=(f"_{base}", None), right_index=True + ) + if "extra" in m_spec: + self.logger.debug(" %s -> extra", out) + for col_name, col_val in m_spec["extra"].items(): dfs[out][col_name] = dfs[out].eval(col_val) if self.df_write: for df_name, df_spec in self.df_write.items(): if dfuse(df_spec): - self.logger.info('writing %s to %s', df_name, df_spec['file']) - src = df_spec.get('source', df_name) - dfo = dfquery(dfs[src], df_spec.get('filter', None)) - path = os.path.join(self.d_pkl, self.l_path[file_index], df_spec['file']) + self.logger.info("writing %s to %s", df_name, df_spec["file"]) + src = df_spec.get("source", df_name) + dfo = dfquery(dfs[src], df_spec.get("filter", None)) + path = os.path.join(self.d_pkl, self.l_path[file_index], df_spec["file"]) write_df(dfo, path) def skim(self, file_index): dfreco = read_df(self.l_reco[file_index]) - dfgen = read_df(self.l_gen[file_index]) if self.mcordata == 'mc' else None - dfgen_sl = read_df(self.l_gen_sl[file_index]) if self.n_gen_sl and self.mcordata == 'mc' else None + dfgen = read_df(self.l_gen[file_index]) if self.mcordata == "mc" else None + dfgen_sl = read_df(self.l_gen_sl[file_index]) if self.n_gen_sl and self.mcordata == "mc" else None for ipt in range(self.p_nptbins): - dfrecosk = seldf_singlevar(dfreco, self.v_var_binning, - 
self.lpt_anbinmin[ipt], self.lpt_anbinmax[ipt]) + dfrecosk = seldf_singlevar(dfreco, self.v_var_binning, self.lpt_anbinmin[ipt], self.lpt_anbinmax[ipt]) dfrecosk = dfquery(dfrecosk, self.s_reco_skim[ipt]) write_df(dfrecosk, self.mptfiles_recosk[ipt][file_index]) if dfgen is not None: - dfgensk = seldf_singlevar(dfgen, self.v_var_binning, - self.lpt_anbinmin[ipt], self.lpt_anbinmax[ipt]) + dfgensk = seldf_singlevar(dfgen, self.v_var_binning, self.lpt_anbinmin[ipt], self.lpt_anbinmax[ipt]) dfgensk = dfquery(dfgensk, self.s_gen_skim[ipt]) write_df(dfgensk, self.mptfiles_gensk[ipt][file_index]) if dfgen_sl is not None: - dfgensk_sl = seldf_singlevar(dfgen_sl, self.v_var_binning, - self.lpt_anbinmin[ipt], self.lpt_anbinmax[ipt]) + dfgensk_sl = seldf_singlevar( + dfgen_sl, self.v_var_binning, self.lpt_anbinmin[ipt], self.lpt_anbinmax[ipt] + ) dfgensk_sl = dfquery(dfgensk_sl, self.s_gen_skim[ipt]) write_df(dfgensk_sl, self.mptfiles_gensk_sl[ipt][file_index]) @@ -487,23 +556,24 @@ def applymodel(self, file_index): if self.p_mask_values: mask_df(dfrecosk, self.p_mask_values) if self.doml is True: - from machine_learning_hep.models import \ - apply # pylint: disable=import-error, import-outside-toplevel + from machine_learning_hep.models import apply # pylint: disable=import-error, import-outside-toplevel + if os.path.isfile(self.lpt_model[ipt]) is False: print("Model file not present in bin %d" % ipt) - with openfile(self.lpt_model[ipt], 'rb') as mod_file: + with openfile(self.lpt_model[ipt], "rb") as mod_file: mod = pickle.load(mod_file) if self.mltype == "MultiClassification": - dfrecoskml = apply(self.mltype, [self.p_modelname], [mod], - dfrecosk, self.v_train[ipt], self.class_labels) - probs = [f'y_test_prob{self.p_modelname}{label.replace("-", "_")}' \ - for label in self.class_labels] - dfrecoskml = dfrecoskml[(dfrecoskml[probs[0]] <= self.lpt_probcutpre[ipt][0]) & - (dfrecoskml[probs[1]] >= self.lpt_probcutpre[ipt][1]) & - (dfrecoskml[probs[2]] >= self.lpt_probcutpre[ipt][2])] + dfrecoskml = apply( + self.mltype, [self.p_modelname], [mod], dfrecosk, self.v_train[ipt], self.class_labels + ) + probs = [f"y_test_prob{self.p_modelname}{label.replace('-', '_')}" for label in self.class_labels] + dfrecoskml = dfrecoskml[ + (dfrecoskml[probs[0]] <= self.lpt_probcutpre[ipt][0]) + & (dfrecoskml[probs[1]] >= self.lpt_probcutpre[ipt][1]) + & (dfrecoskml[probs[2]] >= self.lpt_probcutpre[ipt][2]) + ] else: - dfrecoskml = apply("BinaryClassification", [self.p_modelname], [mod], - dfrecosk, self.v_train[ipt]) + dfrecoskml = apply("BinaryClassification", [self.p_modelname], [mod], dfrecosk, self.v_train[ipt]) probvar = f"y_test_prob{self.p_modelname}" dfrecoskml = dfrecoskml.loc[dfrecoskml[probvar] > self.lpt_probcutpre[ipt]] else: @@ -512,19 +582,17 @@ def applymodel(self, file_index): @staticmethod def callback(ex): - get_logger().exception('Error callback: %s', ex) + get_logger().exception("Error callback: %s", ex) traceback.print_stack() raise ex def parallelizer(self, function, argument_list, maxperchunk): # TODO: fix logic and avoid waiting for the slowest job - chunks = [argument_list[x:x+maxperchunk] - for x in range(0, len(argument_list), maxperchunk)] + chunks = [argument_list[x : x + maxperchunk] for x in range(0, len(argument_list), maxperchunk)] for chunk in chunks: self.logger.debug("Processing new chunk of size = %i", maxperchunk) with mp.Pool(self.p_maxprocess) as pool: - _ = [pool.apply_async(function, args=chunk[i], error_callback=self.callback) - for i in range(len(chunk))] + _ = 
[pool.apply_async(function, args=chunk[i], error_callback=self.callback) for i in range(len(chunk))] pool.close() pool.join() # TODO: maybe simpler to use: @@ -535,8 +603,7 @@ def process_unpack_par(self): self.logger.info("Unpacking %s period %s", self.mcordata, self.period) create_folder_struc(self.d_pkl, self.l_path) arguments = [(i,) for i in range(len(self.l_root))] - self.logger.debug('d_pkl: %s, l_path: %s, arguments: %s', - self.d_pkl, str(self.l_path), str(arguments)) + self.logger.debug("d_pkl: %s, l_path: %s, arguments: %s", self.d_pkl, str(self.l_path), str(arguments)) self.parallelizer(self.unpack, arguments, self.p_chunksizeunp) def process_skim_par(self): @@ -562,8 +629,7 @@ def process_mergeforml(self): if not nfiles: print("There are no files to be merged") continue - self.logger.info("Use merge fraction %g for pT bin %d", - self.p_frac_merge[ipt], ipt) + self.logger.info("Use merge fraction %g for pT bin %d", self.p_frac_merge[ipt], ipt) ntomerge = int(nfiles * self.p_frac_merge[ipt]) rd.seed(self.p_rd_merge) filesel = rd.sample(range(0, nfiles), ntomerge) @@ -577,8 +643,7 @@ def process_mergeforml(self): self.logger.info("Count events...") list_sel_evt = [self.l_evt[j] for j in indices_for_evt] list_sel_evtorig = [self.l_evtorig[j] for j in indices_for_evt] - count_dict = {"evt": count_df_length_pkl(*list_sel_evt), - "evtorig": count_df_length_pkl(*list_sel_evtorig)} + count_dict = {"evt": count_df_length_pkl(*list_sel_evt), "evtorig": count_df_length_pkl(*list_sel_evtorig)} dump_yaml_from_dict(count_dict, self.f_evt_count_ml) def process_mergedec(self): @@ -587,10 +652,8 @@ def process_mergedec(self): if self.mcordata == "mc": merge_method(self.mptfiles_gensk[ipt], self.lpt_gendecmerged[ipt]) - def load_cuts(self): - """Load custom analysis cuts from the database. 
- """ + """Load custom analysis cuts from the database.""" raw_cuts = self.datap["analysis"][self.typean].get("cuts", None) if not raw_cuts: print("No custom cuts given, hence not cutting...") @@ -601,14 +664,12 @@ def load_cuts(self): sys.exit(1) self.analysis_cuts = deepcopy(raw_cuts) - def apply_cuts_ptbin(self, df_ipt, ipt): """Cut dataframe with cuts for a given analysis pT bin""" if not self.analysis_cuts[ipt]: return df_ipt return df_ipt.query(self.analysis_cuts[ipt]) - def apply_cuts_all_ptbins(self, df_): """Apply cuts for all analysis pT bins.""" if not self.do_custom_analysis_cuts or not any(self.analysis_cuts): @@ -631,11 +692,9 @@ def apply_cut_for_ipt(df_full, ipt: int): return pd.concat(apply_cut_for_ipt(df_, ipt) for ipt in range(-1, self.p_nptfinbins + 1)) - def process_histomass(self): self.logger.debug("Doing masshisto %s %s", self.mcordata, self.period) - self.logger.debug("Using run selection for mass histo %s %s %s", - self.runlistrigger, "for period", self.period) + self.logger.debug("Using run selection for mass histo %s %s %s", self.runlistrigger, "for period", self.period) if self.doml is True: self.logger.debug("Doing ml analysis") elif self.do_custom_analysis_cuts: @@ -648,14 +707,13 @@ def process_histomass(self): create_folder_struc(self.d_results, self.l_path) arguments = [(i,) for i in range(len(self.l_root))] - self.parallelizer(self.process_histomass_single, arguments, self.p_chunksizeunp) # pylint: disable=no-member + self.parallelizer(self.process_histomass_single, arguments, self.p_chunksizeunp) # pylint: disable=no-member with tempfile.TemporaryDirectory() as tmp_merged_dir: mergerootfiles(self.l_histomass, self.n_filemass, tmp_merged_dir) def process_efficiency(self): print("Doing efficiencies", self.mcordata, self.period) - print("Using run selection for eff histo", \ - self.runlistrigger, "for period", self.period) + print("Using run selection for eff histo", self.runlistrigger, "for period", self.period) if self.doml is True: print("Doing ml analysis") elif self.do_custom_analysis_cuts: @@ -665,6 +723,6 @@ def process_efficiency(self): create_folder_struc(self.d_results, self.l_path) arguments = [(i,) for i in range(len(self.l_root))] - self.parallelizer(self.process_efficiency_single, arguments, self.p_chunksizeunp) # pylint: disable=no-member + self.parallelizer(self.process_efficiency_single, arguments, self.p_chunksizeunp) # pylint: disable=no-member with tempfile.TemporaryDirectory() as tmp_merged_dir: mergerootfiles(self.l_histoeff, self.n_fileeff, tmp_merged_dir) diff --git a/machine_learning_hep/processer_jet.py b/machine_learning_hep/processer_jet.py index e57fd461cf..0645f81fb3 100644 --- a/machine_learning_hep/processer_jet.py +++ b/machine_learning_hep/processer_jet.py @@ -28,66 +28,106 @@ class ProcesserJets(Processer): species = "processer" - def __init__(self, case, datap, run_param, mcordata, p_maxfiles, # pylint: disable=too-many-arguments - d_root, d_pkl, d_pklsk, d_pkl_ml, p_period, i_period, - p_chunksizeunp, p_chunksizeskim, p_maxprocess, - p_frac_merge, p_rd_merge, d_pkl_dec, d_pkl_decmerged, - d_results, typean, runlisttrigger, d_mcreweights): - super().__init__(case, datap, run_param, mcordata, p_maxfiles, - d_root, d_pkl, d_pklsk, d_pkl_ml, p_period, i_period, - p_chunksizeunp, p_chunksizeskim, p_maxprocess, - p_frac_merge, p_rd_merge, d_pkl_dec, d_pkl_decmerged, - d_results, typean, runlisttrigger, d_mcreweights) + def __init__( + self, + case, + datap, + run_param, + mcordata, + p_maxfiles, # pylint: 
disable=too-many-arguments + d_root, + d_pkl, + d_pklsk, + d_pkl_ml, + p_period, + i_period, + p_chunksizeunp, + p_chunksizeskim, + p_maxprocess, + p_frac_merge, + p_rd_merge, + d_pkl_dec, + d_pkl_decmerged, + d_results, + typean, + runlisttrigger, + d_mcreweights, + ): + super().__init__( + case, + datap, + run_param, + mcordata, + p_maxfiles, + d_root, + d_pkl, + d_pklsk, + d_pkl_ml, + p_period, + i_period, + p_chunksizeunp, + p_chunksizeskim, + p_maxprocess, + p_frac_merge, + p_rd_merge, + d_pkl_dec, + d_pkl_decmerged, + d_results, + typean, + runlisttrigger, + d_mcreweights, + ) self.logger.info("initialized processer for HF jets") self.s_evtsel = datap["analysis"][self.typean]["evtsel"] # bins: 2d array [[low, high], ...] - self.bins_skimming = np.array(list(zip(self.lpt_anbinmin, self.lpt_anbinmax)), 'd') # TODO: replace with cfg - self.bins_analysis = np.array(list(zip(self.lpt_finbinmin, self.lpt_finbinmax)), 'd') + self.bins_skimming = np.array(list(zip(self.lpt_anbinmin, self.lpt_anbinmax)), "d") # TODO: replace with cfg + self.bins_analysis = np.array(list(zip(self.lpt_finbinmin, self.lpt_finbinmax)), "d") # skimming bins in overlap with the analysis range self.active_bins_skim = [ - iskim for iskim, ptrange in enumerate(self.bins_skimming) - if ptrange[0] < max(self.bins_analysis[:,1]) and ptrange[1] > min(self.bins_analysis[:,0])] - self.logger.info('Using skimming bins: %s', self.active_bins_skim) + iskim + for iskim, ptrange in enumerate(self.bins_skimming) + if ptrange[0] < max(self.bins_analysis[:, 1]) and ptrange[1] > min(self.bins_analysis[:, 0]) + ] + self.logger.info("Using skimming bins: %s", self.active_bins_skim) # binarray: array of bin edges as double (passable to ROOT) limits_mass = datap["analysis"][self.typean]["mass_fit_lim"] binwidth_mass = datap["analysis"][self.typean]["bin_width"] nbins_mass = int(round((limits_mass[1] - limits_mass[0]) / binwidth_mass)) self.binarray_mass = bin_array(nbins_mass, limits_mass[0], limits_mass[1]) - self.binarray_ptjet = np.asarray(self.cfg('bins_ptjet'), 'd') - self.binarray_pthf = np.asarray(self.cfg('sel_an_binmin', []) + self.cfg('sel_an_binmax', [])[-1:], 'd') - self.binarrays_obs = {'gen': {}, 'det': {}} - self.binarrays_ptjet = {'gen': {}, 'det': {}} - for obs in self.cfg('observables', {}): - var = obs.split('-') + self.binarray_ptjet = np.asarray(self.cfg("bins_ptjet"), "d") + self.binarray_pthf = np.asarray(self.cfg("sel_an_binmin", []) + self.cfg("sel_an_binmax", [])[-1:], "d") + self.binarrays_obs = {"gen": {}, "det": {}} + self.binarrays_ptjet = {"gen": {}, "det": {}} + for obs in self.cfg("observables", {}): + var = obs.split("-") for v in var: if v in self.binarrays_obs: continue - for level in ('gen', 'det'): - if binning := self.cfg(f'observables.{v}.bins_{level}_var'): - self.binarrays_obs[level][v] = np.asarray(binning, 'd') - elif binning := self.cfg(f'observables.{v}.bins_{level}_fix'): + for level in ("gen", "det"): + if binning := self.cfg(f"observables.{v}.bins_{level}_var"): + self.binarrays_obs[level][v] = np.asarray(binning, "d") + elif binning := self.cfg(f"observables.{v}.bins_{level}_fix"): self.binarrays_obs[level][v] = bin_array(*binning) - elif binning := self.cfg(f'observables.{v}.bins_var'): - self.binarrays_obs[level][v] = np.asarray(binning, 'd') - elif binning := self.cfg(f'observables.{v}.bins_fix'): + elif binning := self.cfg(f"observables.{v}.bins_var"): + self.binarrays_obs[level][v] = np.asarray(binning, "d") + elif binning := self.cfg(f"observables.{v}.bins_fix"): 
self.binarrays_obs[level][v] = bin_array(*binning) else: - self.logger.error('no binning specified for %s, using defaults', v) - self.binarrays_obs[level][v] = bin_array(10, 0., 1.) + self.logger.error("no binning specified for %s, using defaults", v) + self.binarrays_obs[level][v] = bin_array(10, 0.0, 1.0) - if binning := self.cfg(f'observables.{v}.bins_ptjet'): - self.binarrays_ptjet[level][v] = np.asarray(binning, 'd') + if binning := self.cfg(f"observables.{v}.bins_ptjet"): + self.binarrays_ptjet[level][v] = np.asarray(binning, "d") else: self.binarrays_ptjet[level][v] = self.binarray_ptjet - self.binarrays_obs['gen']['fPt'] = self.binarray_pthf - self.binarrays_obs['det']['fPt'] = self.binarray_pthf - self.binarrays_ptjet['gen']['fPt'] = np.asarray(self.cfg('bins_ptjet_eff'), 'd') - self.binarrays_ptjet['det']['fPt'] = np.asarray(self.cfg('bins_ptjet_eff'), 'd') - + self.binarrays_obs["gen"]["fPt"] = self.binarray_pthf + self.binarrays_obs["det"]["fPt"] = self.binarray_pthf + self.binarrays_ptjet["gen"]["fPt"] = np.asarray(self.cfg("bins_ptjet_eff"), "d") + self.binarrays_ptjet["det"]["fPt"] = np.asarray(self.cfg("bins_ptjet_eff"), "d") # region observables # pylint: disable=invalid-name @@ -96,91 +136,91 @@ def _verify_variables(self, dfi): Explicit (slow) implementation, use for reference/validation only """ df = dfi.copy(deep=True) - df['rg'] = -.1 - df['nsd'] = -1.0 - df['zg'] = -.1 + df["rg"] = -0.1 + df["nsd"] = -1.0 + df["zg"] = -0.1 for idx, row in df.iterrows(): isSoftDropped = False nsd = 0 - for zg, theta in zip(row['zg_array'], row['fTheta']): - if zg >= self.cfg('zcut', .1): + for zg, theta in zip(row["zg_array"], row["fTheta"]): + if zg >= self.cfg("zcut", 0.1): if not isSoftDropped: - df.loc[idx, 'zg'] = zg - df.loc[idx, 'rg'] = theta + df.loc[idx, "zg"] = zg + df.loc[idx, "rg"] = theta isSoftDropped = True nsd += 1 - df.loc[idx, 'nsd'] = nsd - for var in ['zg', 'nsd', 'rg']: + df.loc[idx, "nsd"] = nsd + for var in ["zg", "nsd", "rg"]: if np.allclose(dfi[var], df[var]): - self.logger.info('%s check ok', var) + self.logger.info("%s check ok", var) else: - self.logger.error('%s check failed', var) + self.logger.error("%s check failed", var) mask = np.isclose(dfi[var], df[var]) print(df[~mask][var], flush=True) print(dfi[~mask][var], flush=True) - - def _calculate_variables(self, df, verify=False): # pylint: disable=invalid-name - self.logger.info('calculating variables') + def _calculate_variables(self, df, verify=False): # pylint: disable=invalid-name + self.logger.info("calculating variables") if len(df) == 0: - df['nsub21'] = None - df['zg'] = None - df['rg'] = None - df['nsd'] = None - df['lnkt'] = None - df['lntheta'] = None + df["nsub21"] = None + df["zg"] = None + df["rg"] = None + df["nsd"] = None + df["lnkt"] = None + df["lntheta"] = None return df - df['nsub21'] = df.fNSub2 / df.fNSub1 + df["nsub21"] = df.fNSub2 / df.fNSub1 # TODO: catch nsub1 == 0 - self.logger.debug('zg') - df['zg_array'] = np.array(.5 - abs(df.fPtSubLeading / (df.fPtLeading + df.fPtSubLeading) - .5)) - zcut = self.cfg('zcut', .1) - df['zg'] = df['zg_array'].apply((lambda ar: next((zg for zg in ar if zg >= zcut), -.1))) - df['rg'] = df[['zg_array', 'fTheta']].apply( - (lambda ar: next((rg for (zg, rg) in zip(ar.zg_array, ar.fTheta) if zg >= zcut), -.1)), axis=1) - df['nsd'] = df['zg_array'].apply((lambda ar: len([zg for zg in ar if zg >= zcut]))) - - self.logger.debug('Lund') - df['lnkt'] = df[['fPtSubLeading', 'fTheta']].apply( - (lambda ar: np.log(ar.fPtSubLeading * np.sin(ar.fTheta))), 
axis=1) - df['lntheta'] = df['fTheta'].apply(lambda x: -np.log(x)) + self.logger.debug("zg") + df["zg_array"] = np.array(0.5 - abs(df.fPtSubLeading / (df.fPtLeading + df.fPtSubLeading) - 0.5)) + zcut = self.cfg("zcut", 0.1) + df["zg"] = df["zg_array"].apply((lambda ar: next((zg for zg in ar if zg >= zcut), -0.1))) + df["rg"] = df[["zg_array", "fTheta"]].apply( + (lambda ar: next((rg for (zg, rg) in zip(ar.zg_array, ar.fTheta) if zg >= zcut), -0.1)), axis=1 + ) + df["nsd"] = df["zg_array"].apply((lambda ar: len([zg for zg in ar if zg >= zcut]))) + + self.logger.debug("Lund") + df["lnkt"] = df[["fPtSubLeading", "fTheta"]].apply( + (lambda ar: np.log(ar.fPtSubLeading * np.sin(ar.fTheta))), axis=1 + ) + df["lntheta"] = df["fTheta"].apply(lambda x: -np.log(x)) # df['lntheta'] = np.array(-np.log(df.fTheta)) - self.logger.info('EEC') - df['eecweight'] = df[['fPairPt', 'fJetPt']].apply( - (lambda ar: ar.fPairPt / ar.fJetPt**2), axis=1) - - if self.cfg('hfjet', True): - df['dr'] = np.sqrt((df.fJetEta - df.fEta)**2 + ((df.fJetPhi - df.fPhi + math.pi) % math.tau - math.pi)**2) - df['jetPx'] = df.fJetPt * np.cos(df.fJetPhi) - df['jetPy'] = df.fJetPt * np.sin(df.fJetPhi) - df['jetPz'] = df.fJetPt * np.sinh(df.fJetEta) - df['hfPx'] = df.fPt * np.cos(df.fPhi) - df['hfPy'] = df.fPt * np.sin(df.fPhi) - df['hfPz'] = df.fPt * np.sinh(df.fEta) - df['zpar_num'] = df.jetPx * df.hfPx + df.jetPy * df.hfPy + df.jetPz * df.hfPz - df['zpar_den'] = df.jetPx * df.jetPx + df.jetPy * df.jetPy + df.jetPz * df.jetPz - df['zpar'] = df.zpar_num / df.zpar_den - df[df['zpar'] >= 1.]['zpar'] = .999 # move 1 to last bin - - self.logger.debug('done') + self.logger.info("EEC") + df["eecweight"] = df[["fPairPt", "fJetPt"]].apply((lambda ar: ar.fPairPt / ar.fJetPt**2), axis=1) + + if self.cfg("hfjet", True): + df["dr"] = np.sqrt( + (df.fJetEta - df.fEta) ** 2 + ((df.fJetPhi - df.fPhi + math.pi) % math.tau - math.pi) ** 2 + ) + df["jetPx"] = df.fJetPt * np.cos(df.fJetPhi) + df["jetPy"] = df.fJetPt * np.sin(df.fJetPhi) + df["jetPz"] = df.fJetPt * np.sinh(df.fJetEta) + df["hfPx"] = df.fPt * np.cos(df.fPhi) + df["hfPy"] = df.fPt * np.sin(df.fPhi) + df["hfPz"] = df.fPt * np.sinh(df.fEta) + df["zpar_num"] = df.jetPx * df.hfPx + df.jetPy * df.hfPy + df.jetPz * df.hfPz + df["zpar_den"] = df.jetPx * df.jetPx + df.jetPy * df.jetPy + df.jetPz * df.jetPz + df["zpar"] = df.zpar_num / df.zpar_den + df[df["zpar"] >= 1.0]["zpar"] = 0.999 # move 1 to last bin + + self.logger.debug("done") if verify: self._verify_variables(df) return df - def split_df(self, dfi, frac): - '''split data frame based on df number''' + """split data frame based on df number""" # dfa = dfi.split(frac=frac, random_state=1234) # return dfa, dfi.drop(dfa.index) mask = (dfi.index.get_level_values(0) % 100) < frac * 100 return dfi[mask], dfi[~mask] - # region histomass # pylint: disable=too-many-branches def process_histomass_single(self, index): - self.logger.info('Processing (histomass) %s', self.l_evtorig[index]) + self.logger.info("Processing (histomass) %s", self.l_evtorig[index]) with TFile.Open(self.l_histomass[index], "recreate") as _: dfevtorig = read_df(self.l_evtorig[index]) @@ -188,82 +228,91 @@ def process_histomass_single(self, index): histonorm.SetBinContent(1, len(dfquery(dfevtorig, self.s_evtsel))) if self.l_collcnt: dfcollcnt = read_df(self.l_collcnt[index]) - ser_collcnt = dfcollcnt[self.cfg(f'counter_read_{self.mcordata}')] - collcnt_read = functools.reduce(lambda x,y: float(x)+float(y), (ar[0] for ar in ser_collcnt)) - self.logger.info('sampled %g 
collisions', collcnt_read) + ser_collcnt = dfcollcnt[self.cfg(f"counter_read_{self.mcordata}")] + collcnt_read = functools.reduce(lambda x, y: float(x) + float(y), (ar[0] for ar in ser_collcnt)) + self.logger.info("sampled %g collisions", collcnt_read) histonorm.SetBinContent(2, collcnt_read) - ser_collcnt = dfcollcnt[self.cfg('counter_tvx')] - collcnt_tvx = functools.reduce(lambda x,y: float(x)+float(y), (ar[0] for ar in ser_collcnt)) + ser_collcnt = dfcollcnt[self.cfg("counter_tvx")] + collcnt_tvx = functools.reduce(lambda x, y: float(x) + float(y), (ar[0] for ar in ser_collcnt)) histonorm.SetBinContent(3, collcnt_tvx) if self.l_bccnt: dfbccnt = read_df(self.l_bccnt[index]) - ser_bccnt = dfbccnt[self.cfg('counter_tvx')] - bccnt_tvx = functools.reduce(lambda x,y: float(x)+float(y), (ar[0] for ar in ser_bccnt)) + ser_bccnt = dfbccnt[self.cfg("counter_tvx")] + bccnt_tvx = functools.reduce(lambda x, y: float(x) + float(y), (ar[0] for ar in ser_bccnt)) histonorm.SetBinContent(4, bccnt_tvx) - get_axis(histonorm, 0).SetBinLabel(1, 'N_{evt}') - get_axis(histonorm, 0).SetBinLabel(2, 'N_{coll}') - get_axis(histonorm, 0).SetBinLabel(3, 'N_{coll}^{TVX}') - get_axis(histonorm, 0).SetBinLabel(4, 'N_{BC}^{TVX}') + get_axis(histonorm, 0).SetBinLabel(1, "N_{evt}") + get_axis(histonorm, 0).SetBinLabel(2, "N_{coll}") + get_axis(histonorm, 0).SetBinLabel(3, "N_{coll}^{TVX}") + get_axis(histonorm, 0).SetBinLabel(4, "N_{BC}^{TVX}") histonorm.Write() df = pd.concat(read_df(self.mptfiles_recosk[bin][index]) for bin in self.active_bins_skim) # remove entries outside of kinematic range (should be taken care of by projections in analyzer) df = df.loc[(df.fJetPt >= min(self.binarray_ptjet)) & (df.fJetPt < max(self.binarray_ptjet))] - df = df.loc[(df.fPt >= min(self.bins_analysis[:,0])) & (df.fPt < max(self.bins_analysis[:,1]))] + df = df.loc[(df.fPt >= min(self.bins_analysis[:, 0])) & (df.fPt < max(self.bins_analysis[:, 1]))] # Custom skimming cuts df = self.apply_cuts_all_ptbins(df) - if col_evtidx := self.cfg('cand_collidx'): - h = create_hist('h_ncand', ';N_{cand}', 20, 0., 20.) 
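One behavioural caveat next to the zpar clamp in the hunk above: df[df["zpar"] >= 1.0]["zpar"] = 0.999 uses chained indexing, and the boolean selection returns a temporary copy, so the assignment does not propagate to df itself. A minimal sketch on toy data (not from the analysis) of the single .loc call that does update the frame in place:

    import pandas as pd

    df = pd.DataFrame({"zpar": [0.4, 1.0, 1.2]})
    df[df["zpar"] >= 1.0]["zpar"] = 0.999      # assigns into a copy; df unchanged
    df.loc[df["zpar"] >= 1.0, "zpar"] = 0.999  # updates df itself
    assert df["zpar"].tolist() == [0.4, 0.999, 0.999]

Under pandas copy-on-write semantics the chained form is always a silent no-op, so the .loc form is the reliable way to move zpar == 1 into the last bin.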
+ if col_evtidx := self.cfg("cand_collidx"): + h = create_hist("h_ncand", ";N_{cand}", 20, 0.0, 20.0) fill_hist(h, df.groupby([col_evtidx]).size(), write=True) h = create_hist( - 'h_mass-ptjet-pthf', - ';M (GeV/#it{c}^{2});p_{T}^{jet} (GeV/#it{c});p_{T}^{HF} (GeV/#it{c})', - self.binarray_mass, self.binarray_ptjet, self.binarray_pthf) - fill_hist(h, df[['fM', 'fJetPt', 'fPt']], write=True) - - for sel_name, sel_spec in self.cfg('data_selections', {}).items(): - if sel_spec['level'] == self.mcordata: - df_sel = dfquery(df, sel_spec['query']) + "h_mass-ptjet-pthf", + ";M (GeV/#it{c}^{2});p_{T}^{jet} (GeV/#it{c});p_{T}^{HF} (GeV/#it{c})", + self.binarray_mass, + self.binarray_ptjet, + self.binarray_pthf, + ) + fill_hist(h, df[["fM", "fJetPt", "fPt"]], write=True) + + for sel_name, sel_spec in self.cfg("data_selections", {}).items(): + if sel_spec["level"] == self.mcordata: + df_sel = dfquery(df, sel_spec["query"]) h = create_hist( - f'h_mass-ptjet-pthf_{sel_name}', - ';M (GeV/#it{c}^{2});p_{T}^{jet} (GeV/#it{c});p_{T}^{HF} (GeV/#it{c})', - self.binarray_mass, self.binarray_ptjet, self.binarray_pthf) - fill_hist(h, df_sel[['fM', 'fJetPt', 'fPt']], write=True) + f"h_mass-ptjet-pthf_{sel_name}", + ";M (GeV/#it{c}^{2});p_{T}^{jet} (GeV/#it{c});p_{T}^{HF} (GeV/#it{c})", + self.binarray_mass, + self.binarray_ptjet, + self.binarray_pthf, + ) + fill_hist(h, df_sel[["fM", "fJetPt", "fPt"]], write=True) - if self.mcordata == 'mc': - df, _ = self.split_df(df, self.cfg('frac_mcana', .2)) + if self.mcordata == "mc": + df, _ = self.split_df(df, self.cfg("frac_mcana", 0.2)) if len(df) == 0: return - self.logger.debug('MC det: %s', df.index.get_level_values(0).unique()) - if f := self.cfg('closure.exclude_feeddown_det'): + self.logger.debug("MC det: %s", df.index.get_level_values(0).unique()) + if f := self.cfg("closure.exclude_feeddown_det"): dfquery(df, f, inplace=True) - if f := self.cfg('closure.filter_reflections'): + if f := self.cfg("closure.filter_reflections"): dfquery(df, f, inplace=True) - if self.cfg('closure.use_matched'): - if idx := self.cfg('efficiency.index_match'): - df['idx_match'] = df[idx].apply(lambda ar: ar[0] if len(ar) > 0 else -1) - dfquery(df, 'idx_match >= 0', inplace=True) + if self.cfg("closure.use_matched"): + if idx := self.cfg("efficiency.index_match"): + df["idx_match"] = df[idx].apply(lambda ar: ar[0] if len(ar) > 0 else -1) + dfquery(df, "idx_match >= 0", inplace=True) self._calculate_variables(df) - for obs, spec in self.cfg('observables', {}).items(): - self.logger.info('preparing histograms for %s', obs) - var = obs.split('-') + for obs, spec in self.cfg("observables", {}).items(): + self.logger.info("preparing histograms for %s", obs) + var = obs.split("-") if not all(v in df for v in var): - self.logger.error('dataframe does not contain %s', var) + self.logger.error("dataframe does not contain %s", var) continue h = create_hist( - f'h_mass-ptjet-pthf-{obs}', - f';M (GeV/#it{{c}}^{{2}});p_{{T}}^{{jet}} (GeV/#it{{c}});p_{{T}}^{{HF}} (GeV/#it{{c}});{obs}', - self.binarray_mass, self.binarray_ptjet, self.binarray_pthf, - *[self.binarrays_obs['det'][v] for v in var]) + f"h_mass-ptjet-pthf-{obs}", + f";M (GeV/#it{{c}}^{{2}});p_{{T}}^{{jet}} (GeV/#it{{c}});p_{{T}}^{{HF}} (GeV/#it{{c}});{obs}", + self.binarray_mass, + self.binarray_ptjet, + self.binarray_pthf, + *[self.binarrays_obs["det"][v] for v in var], + ) for i, v in enumerate(var): - get_axis(h, 3+i).SetTitle(self.cfg(f'observables.{v}.label', v)) + get_axis(h, 3 + i).SetTitle(self.cfg(f"observables.{v}.label", v)) - 
fill_hist(h, df[['fM', 'fJetPt', 'fPt', *var]], arraycols=spec.get('arraycols', None), write=True) + fill_hist(h, df[["fM", "fJetPt", "fPt", *var]], arraycols=spec.get("arraycols", None), write=True) # TODO: # - binning variations (separate ranges for MC and data) @@ -272,182 +321,232 @@ def process_histomass_single(self, index): # region efficiency # pylint: disable=too-many-branches,too-many-statements,too-many-locals def process_efficiency_single(self, index): - self.logger.info('Processing (efficiency) %s', self.l_evtorig[index]) - - cats = ['pr', 'np'] - levels_eff = ['gen', 'det', 'genmatch', 'detmatch', 'detmatch_gencuts'] - levels_effkine = ['gen', 'det'] - cuts = ['nocuts', 'cut'] - observables = self.cfg('observables', {}) - observables.update({'fPt': {'label': 'p_{T}^{HF} (GeV/#it{c})'}}) - h_eff = {(cat, level): create_hist(f'h_ptjet-pthf_{cat}_{level}', - ';p_{T}^{jet} (GeV/#it{c});p_{T}^{HF} (GeV/#it{c})', - self.binarrays_ptjet['det']['fPt'], self.binarray_pthf) - for cat in cats for level in levels_eff} + self.logger.info("Processing (efficiency) %s", self.l_evtorig[index]) + + cats = ["pr", "np"] + levels_eff = ["gen", "det", "genmatch", "detmatch", "detmatch_gencuts"] + levels_effkine = ["gen", "det"] + cuts = ["nocuts", "cut"] + observables = self.cfg("observables", {}) + observables.update({"fPt": {"label": "p_{T}^{HF} (GeV/#it{c})"}}) + h_eff = { + (cat, level): create_hist( + f"h_ptjet-pthf_{cat}_{level}", + ";p_{T}^{jet} (GeV/#it{c});p_{T}^{HF} (GeV/#it{c})", + self.binarrays_ptjet["det"]["fPt"], + self.binarray_pthf, + ) + for cat in cats + for level in levels_eff + } h_response = {} h_effkine = {} h_response_fd = {} h_effkine_fd = {} h_mctruth = {} for cat in cats: - for obs in self.cfg('observables', {}): - self.logger.info('preparing response matrix for %s', obs) - var = obs.split('-') + for obs in self.cfg("observables", {}): + self.logger.info("preparing response matrix for %s", obs) + var = obs.split("-") dim = len(var) + 1 h_response[(cat, obs)] = h = create_hist( - f'h_response_{cat}_{obs}', f"response matrix {obs}", - self.binarrays_ptjet['det'][var[0]], *[self.binarrays_obs['det'][v] for v in var], - self.binarrays_ptjet['gen'][var[0]], *[self.binarrays_obs['gen'][v] for v in var], - self.binarray_pthf) + f"h_response_{cat}_{obs}", + f"response matrix {obs}", + self.binarrays_ptjet["det"][var[0]], + *[self.binarrays_obs["det"][v] for v in var], + self.binarrays_ptjet["gen"][var[0]], + *[self.binarrays_obs["gen"][v] for v in var], + self.binarray_pthf, + ) get_axis(h, 0).SetTitle("p_{T}^{jet} (GeV/#it{c})") get_axis(h, dim).SetTitle("p_{T}^{jet} (GeV/#it{c})") - get_axis(h, 2*dim).SetTitle("p_{T}^{HF} (GeV/#it{c})") + get_axis(h, 2 * dim).SetTitle("p_{T}^{HF} (GeV/#it{c})") for i, v in enumerate(var, 1): - get_axis(h, i).SetTitle(self.cfg(f'observables.{v}.label', v)) - get_axis(h, i+dim).SetTitle(self.cfg(f'observables.{v}.label', v)) + get_axis(h, i).SetTitle(self.cfg(f"observables.{v}.label", v)) + get_axis(h, i + dim).SetTitle(self.cfg(f"observables.{v}.label", v)) for cut in cuts: - h_effkine[(cat, 'det', cut, obs)] = he = project_hist(h, list(range(dim)), {}).Clone() - he.SetName(f'h_effkine_{cat}_det_{cut}_{obs}') - h_effkine[(cat, 'gen', cut, obs)] = he = project_hist(h, list(range(dim, 2*dim)), {}).Clone() - he.SetName(f'h_effkine_{cat}_gen_{cut}_{obs}') + h_effkine[(cat, "det", cut, obs)] = he = project_hist(h, list(range(dim)), {}).Clone() + he.SetName(f"h_effkine_{cat}_det_{cut}_{obs}") + h_effkine[(cat, "gen", cut, obs)] = he = project_hist(h, 
list(range(dim, 2 * dim)), {}).Clone() + he.SetName(f"h_effkine_{cat}_gen_{cut}_{obs}") h_mctruth[(cat, obs)] = create_hist( - f'h_ptjet-pthf-{obs}_{cat}_gen', - f";p_{{T}}^{{jet}} (GeV/#it{{c}});p_{{T}}^{{HF}} (GeV/#it{{c}});{obs}", - self.binarrays_ptjet['gen'][var[0]], - self.binarray_pthf, - *[self.binarrays_obs['gen'][v] for v in var]) + f"h_ptjet-pthf-{obs}_{cat}_gen", + f";p_{{T}}^{{jet}} (GeV/#it{{c}});p_{{T}}^{{HF}} (GeV/#it{{c}});{obs}", + self.binarrays_ptjet["gen"][var[0]], + self.binarray_pthf, + *[self.binarrays_obs["gen"][v] for v in var], + ) h_response_fd[obs] = create_hist( - f'h_response_fd_{obs}', - f";response matrix fd {obs}", - self.binarrays_ptjet['det'][var[0]], - self.binarrays_obs['det']['fPt'], - *[self.binarrays_obs['det'][v] for v in var], - self.binarrays_ptjet['gen'][var[0]], - self.binarrays_obs['gen']['fPt'], - *[self.binarrays_obs['gen'][v] for v in var]) + f"h_response_fd_{obs}", + f";response matrix fd {obs}", + self.binarrays_ptjet["det"][var[0]], + self.binarrays_obs["det"]["fPt"], + *[self.binarrays_obs["det"][v] for v in var], + self.binarrays_ptjet["gen"][var[0]], + self.binarrays_obs["gen"]["fPt"], + *[self.binarrays_obs["gen"][v] for v in var], + ) for level, cut in itertools.product(levels_effkine, cuts): h_effkine_fd[(level, cut, obs)] = create_hist( - f'h_effkine_fd_{level}_{cut}_{obs}', - f"effkine {obs}", - self.binarrays_ptjet[level][var[0]], - self.binarrays_obs[level]['fPt'], - *[self.binarrays_obs[level][v] for v in var]) + f"h_effkine_fd_{level}_{cut}_{obs}", + f"effkine {obs}", + self.binarrays_ptjet[level][var[0]], + self.binarrays_obs[level]["fPt"], + *[self.binarrays_obs[level][v] for v in var], + ) # create partial versions for closure testing h_effkine_frac = copy.deepcopy(h_effkine) h_response_frac = copy.deepcopy(h_response) for hist in itertools.chain(h_effkine_frac.values(), h_response_frac.values()): - hist.SetName(hist.GetName() + '_frac') + hist.SetName(hist.GetName() + "_frac") with TFile.Open(self.l_histoeff[index], "recreate") as rfile: # TODO: avoid hard-coding values here (check if restriction is needed at all) - cols = None if not self.cfg('hfjet', True) else ['ismcprompt', 'ismcsignal', 'ismcfd', - 'fPt', 'fEta', 'fPhi', 'fJetPt', 'fJetEta', 'fJetPhi', 'fPtLeading', 'fPtSubLeading', 'fTheta', - 'fNSub2DR', 'fNSub1', 'fNSub2', 'fJetNConstituents', 'fEnergyMother', 'fPairTheta', 'fPairPt'] + cols = ( + None + if not self.cfg("hfjet", True) + else [ + "ismcprompt", + "ismcsignal", + "ismcfd", + "fPt", + "fEta", + "fPhi", + "fJetPt", + "fJetEta", + "fJetPhi", + "fPtLeading", + "fPtSubLeading", + "fTheta", + "fNSub2DR", + "fNSub1", + "fNSub2", + "fJetNConstituents", + "fEnergyMother", + "fPairTheta", + "fPairPt", + ] + ) # read generator level - dfgen_orig = pd.concat(read_df(self.mptfiles_gensk[bin][index], columns=cols) - for bin in self.active_bins_skim) + dfgen_orig = pd.concat( + read_df(self.mptfiles_gensk[bin][index], columns=cols) for bin in self.active_bins_skim + ) df = self._calculate_variables(dfgen_orig) - df = df.rename(lambda name: name + '_gen', axis=1) - if self.cfg('hfjet', True): - dfgen = {'pr': df.loc[(df.ismcsignal_gen == 1) & (df.ismcprompt_gen == 1)], - 'np': df.loc[(df.ismcsignal_gen == 1) & (df.ismcfd_gen == 1)]} + df = df.rename(lambda name: name + "_gen", axis=1) + if self.cfg("hfjet", True): + dfgen = { + "pr": df.loc[(df.ismcsignal_gen == 1) & (df.ismcprompt_gen == 1)], + "np": df.loc[(df.ismcsignal_gen == 1) & (df.ismcfd_gen == 1)], + } else: - dfgen = {'pr': df, 'np': df} + dfgen = {"pr": 
df, "np": df} # read detector level if cols: - cols.extend(self.cfg('efficiency.extra_cols', [])) - if idx := self.cfg('efficiency.index_match'): + cols.extend(self.cfg("efficiency.extra_cols", [])) + if idx := self.cfg("efficiency.index_match"): cols.append(idx) - df = pd.concat(read_df(self.mptfiles_recosk[bin][index], columns=cols) - for bin in self.active_bins_skim) + df = pd.concat(read_df(self.mptfiles_recosk[bin][index], columns=cols) for bin in self.active_bins_skim) # Custom skimming cuts df = self.apply_cuts_all_ptbins(df) - dfquery(df, self.cfg('efficiency.filter_det'), inplace=True) - if idx := self.cfg('efficiency.index_match'): - df['idx_match'] = df[idx].apply(lambda ar: ar[0] if len(ar) > 0 else -1) + dfquery(df, self.cfg("efficiency.filter_det"), inplace=True) + if idx := self.cfg("efficiency.index_match"): + df["idx_match"] = df[idx].apply(lambda ar: ar[0] if len(ar) > 0 else -1) else: - self.logger.warning('No matching criterion specified, cannot match det and gen') + self.logger.warning("No matching criterion specified, cannot match det and gen") df = self._calculate_variables(df) - if self.cfg('hfjet', True): - dfdet = {'pr': df.loc[(df.ismcsignal == 1) & (df.ismcprompt == 1)], - 'np': df.loc[(df.ismcsignal == 1) & (df.ismcfd == 1)]} + if self.cfg("hfjet", True): + dfdet = { + "pr": df.loc[(df.ismcsignal == 1) & (df.ismcprompt == 1)], + "np": df.loc[(df.ismcsignal == 1) & (df.ismcfd == 1)], + } else: - dfdet = {'pr': df, 'np': df} + dfdet = {"pr": df, "np": df} - dfmatch = {cat: pd.merge(dfdet[cat], dfgen[cat], left_on=['df', 'idx_match'], right_index=True) - for cat in cats if 'idx_match' in dfdet[cat]} + dfmatch = { + cat: pd.merge(dfdet[cat], dfgen[cat], left_on=["df", "idx_match"], right_index=True) + for cat in cats + if "idx_match" in dfdet[cat] + } for cat in cats: - fill_hist(h_eff[(cat, 'gen')], dfgen[cat][['fJetPt_gen', 'fPt_gen']]) - fill_hist(h_eff[(cat, 'det')], dfdet[cat][['fJetPt', 'fPt']]) + fill_hist(h_eff[(cat, "gen")], dfgen[cat][["fJetPt_gen", "fPt_gen"]]) + fill_hist(h_eff[(cat, "det")], dfdet[cat][["fJetPt", "fPt"]]) if cat in dfmatch and dfmatch[cat] is not None: df = dfmatch[cat] - fill_hist(h_eff[(cat, 'genmatch')], df[['fJetPt_gen', 'fPt_gen']]) - fill_hist(h_eff[(cat, 'detmatch')], df[['fJetPt', 'fPt']]) + fill_hist(h_eff[(cat, "genmatch")], df[["fJetPt_gen", "fPt_gen"]]) + fill_hist(h_eff[(cat, "detmatch")], df[["fJetPt", "fPt"]]) # apply gen-level cuts for Run 2 efficiencies - range_ptjet_gen = get_range(h_eff[(cat, 'gen')], 0) - range_pthf_gen = get_range(h_eff[(cat, 'gen')], 1) + range_ptjet_gen = get_range(h_eff[(cat, "gen")], 0) + range_pthf_gen = get_range(h_eff[(cat, "gen")], 1) df = df.loc[(df.fJetPt_gen >= range_ptjet_gen[0]) & (df.fJetPt_gen < range_ptjet_gen[1])] df = df.loc[(df.fPt_gen >= range_pthf_gen[0]) & (df.fPt_gen < range_pthf_gen[1])] - fill_hist(h_eff[(cat, 'detmatch_gencuts')], df[['fJetPt', 'fPt']]) + fill_hist(h_eff[(cat, "detmatch_gencuts")], df[["fJetPt", "fPt"]]) else: - self.logger.error('No matching, could not fill matched detector-level histograms') + self.logger.error("No matching, could not fill matched detector-level histograms") for obs, cat in itertools.product(observables, cats): if cat in dfmatch and dfmatch[cat] is not None: self._prepare_response(dfmatch[cat], h_effkine, h_response, cat, obs) - f = self.cfg('frac_mcana', .2) - _, df_mccorr = self.split_df(dfmatch[cat], f if f < 1. else 0.) 
+ f = self.cfg("frac_mcana", 0.2) + _, df_mccorr = self.split_df(dfmatch[cat], f if f < 1.0 else 0.0) self._prepare_response(df_mccorr, h_effkine_frac, h_response_frac, cat, obs) self._prepare_response_fd(dfmatch[cat], h_effkine_fd, h_response_fd, obs) # TODO: move outside of loop? - if self.cfg('closure.use_matched'): - self.logger.info('using matched for truth') - df_mcana, _ = self.split_df(dfmatch[cat], self.cfg('frac_mcana', .2)) + if self.cfg("closure.use_matched"): + self.logger.info("using matched for truth") + df_mcana, _ = self.split_df(dfmatch[cat], self.cfg("frac_mcana", 0.2)) else: - df_mcana, _ = self.split_df(dfgen[cat], self.cfg('frac_mcana', .2)) - if f := self.cfg('closure.exclude_feeddown_gen'): - self.logger.debug('excluding feeddown gen') + df_mcana, _ = self.split_df(dfgen[cat], self.cfg("frac_mcana", 0.2)) + if f := self.cfg("closure.exclude_feeddown_gen"): + self.logger.debug("excluding feeddown gen") dfquery(df_mcana, f, inplace=True) - arraycols = [i - 3 for i in self.cfg(f'observables.{obs}.arraycols', [])] - var = obs.split('-') - self.logger.debug("Observable %s has arraycols %s -> %s", - obs, arraycols, [var[icol] for icol in arraycols]) + arraycols = [i - 3 for i in self.cfg(f"observables.{obs}.arraycols", [])] + var = obs.split("-") + self.logger.debug( + "Observable %s has arraycols %s -> %s", obs, arraycols, [var[icol] for icol in arraycols] + ) df_mcana = self._explode_arraycols(df_mcana, [var[icol] for icol in arraycols]) - fill_hist(h_mctruth[(cat, obs)], df_mcana[['fJetPt_gen', 'fPt_gen', *(f'{v}_gen' for v in var)]]) - - for name, obj in itertools.chain(h_eff.items(), h_effkine.items(), h_response.items(), - h_effkine_fd.items(), h_response_fd.items(), - h_effkine_frac.items(), h_response_frac.items(), h_mctruth.items()): + fill_hist(h_mctruth[(cat, obs)], df_mcana[["fJetPt_gen", "fPt_gen", *(f"{v}_gen" for v in var)]]) + + for name, obj in itertools.chain( + h_eff.items(), + h_effkine.items(), + h_response.items(), + h_effkine_fd.items(), + h_response_fd.items(), + h_effkine_frac.items(), + h_response_frac.items(), + h_mctruth.items(), + ): try: rfile.WriteObject(obj, obj.GetName()) - except Exception as ex: # pylint: disable=broad-exception-caught - self.logger.error('Writing of <%s> (%s) failed: %s', name, str(obj), str(ex)) + except Exception as ex: # pylint: disable=broad-exception-caught + self.logger.error("Writing of <%s> (%s) failed: %s", name, str(obj), str(ex)) def _explode_arraycols(self, df: pd.DataFrame, arraycols: "list[str]") -> pd.DataFrame: if len(arraycols) > 0: self.logger.debug("Exploding columns %s", arraycols) # only consider rows with corresponding det- and gen-level entries - df['length'] = [len(x) for x in df[arraycols[0]]] - df['length_gen'] = [len(x) for x in df[arraycols[0] + '_gen']] + df["length"] = [len(x) for x in df[arraycols[0]]] + df["length_gen"] = [len(x) for x in df[arraycols[0] + "_gen"]] df = df.loc[df.length == df.length_gen] - df = df.explode(arraycols + [col + '_gen' for col in arraycols]) + df = df.explode(arraycols + [col + "_gen" for col in arraycols]) df.dropna(inplace=True) return df def _prepare_response(self, dfi, h_effkine, h_response, cat, obs): - var = obs.split('-') + var = obs.split("-") dim = len(var) + 1 axes_det = [get_axis(h_response[(cat, obs)], i) for i in range(dim)] axes_gen = [get_axis(h_response[(cat, obs)], i) for i in range(dim, 2 * dim)] - arraycols = [i - 3 for i in self.cfg(f'observables.{obs}', {}).get('arraycols', [])] + arraycols = [i - 3 for i in self.cfg(f"observables.{obs}", 
{}).get("arraycols", [])] df = dfi df = self._explode_arraycols(df, [var[icol] for icol in arraycols]) @@ -455,59 +554,74 @@ def _prepare_response(self, dfi, h_effkine, h_response, cat, obs): df = df.loc[(df.fJetPt >= axes_det[0].GetXmin()) & (df.fJetPt < axes_det[0].GetXmax())] for i, v in enumerate(var, 1): df = df.loc[(df[v] >= axes_det[i].GetXmin()) & (df[v] < axes_det[i].GetXmax())] - fill_hist(h_effkine[(cat, 'det', 'nocuts', obs)], df[['fJetPt', *var]]) + fill_hist(h_effkine[(cat, "det", "nocuts", obs)], df[["fJetPt", *var]]) df = df.loc[(df.fJetPt >= axes_gen[0].GetXmin()) & (df.fJetPt < axes_gen[0].GetXmax())] for i, v in enumerate(var, 1): - df = df.loc[(df[f'{v}_gen'] >= axes_gen[i].GetXmin()) & (df[f'{v}_gen'] < axes_gen[i].GetXmax())] - fill_hist(h_effkine[(cat, 'det', 'cut', obs)], df[['fJetPt', *var]]) + df = df.loc[(df[f"{v}_gen"] >= axes_gen[i].GetXmin()) & (df[f"{v}_gen"] < axes_gen[i].GetXmax())] + fill_hist(h_effkine[(cat, "det", "cut", obs)], df[["fJetPt", *var]]) # print(df[['fJetPt', *var, 'fJetPt_gen', *(f'{v}_gen' for v in var), 'fPt']].info(), flush=True) - fill_hist(h_response[(cat, obs)], df[['fJetPt', *var, 'fJetPt_gen', *(f'{v}_gen' for v in var), 'fPt']]) + fill_hist(h_response[(cat, obs)], df[["fJetPt", *var, "fJetPt_gen", *(f"{v}_gen" for v in var), "fPt"]]) df = dfi df = self._explode_arraycols(df, [var[icol] for icol in arraycols]) df = df.loc[(df.fJetPt >= axes_gen[0].GetXmin()) & (df.fJetPt < axes_gen[0].GetXmax())] for i, v in enumerate(var, 1): - df = df.loc[(df[f'{v}_gen'] >= axes_gen[i].GetXmin()) & (df[f'{v}_gen'] < axes_gen[i].GetXmax())] - fill_hist(h_effkine[(cat, 'gen', 'nocuts', obs)], df[['fJetPt_gen', *(f'{v}_gen' for v in var)]]) + df = df.loc[(df[f"{v}_gen"] >= axes_gen[i].GetXmin()) & (df[f"{v}_gen"] < axes_gen[i].GetXmax())] + fill_hist(h_effkine[(cat, "gen", "nocuts", obs)], df[["fJetPt_gen", *(f"{v}_gen" for v in var)]]) df = df.loc[(df.fJetPt >= axes_det[0].GetXmin()) & (df.fJetPt < axes_det[0].GetXmax())] for i, v in enumerate(var, 1): df = df.loc[(df[v] >= axes_det[i].GetXmin()) & (df[v] < axes_det[i].GetXmax())] - fill_hist(h_effkine[(cat, 'gen', 'cut', obs)], df[['fJetPt_gen', *(f'{v}_gen' for v in var)]]) - + fill_hist(h_effkine[(cat, "gen", "cut", obs)], df[["fJetPt_gen", *(f"{v}_gen" for v in var)]]) def _prepare_response_fd(self, dfi, h_effkine, h_response, obs): - var = obs.split('-') + var = obs.split("-") dim = len(var) + 2 axes_det = [get_axis(h_response[obs], i) for i in range(dim)] axes_gen = [get_axis(h_response[obs], i) for i in range(dim, 2 * dim)] - arraycols = [i - 3 for i in self.cfg(f'observables.{obs}', {}).get('arraycols', [])] + arraycols = [i - 3 for i in self.cfg(f"observables.{obs}", {}).get("arraycols", [])] df = dfi df = self._explode_arraycols(df, [var[icol] for icol in arraycols]) # TODO: the first cut should be taken care of by under-/overflow bins, check their usage in analyzer - df = df.loc[(df.fJetPt >= axes_det[0].GetXmin()) & (df.fJetPt < axes_det[0].GetXmax()) & - (df.fPt >= axes_det[1].GetXmin()) & (df.fPt < axes_det[1].GetXmax())] + df = df.loc[ + (df.fJetPt >= axes_det[0].GetXmin()) + & (df.fJetPt < axes_det[0].GetXmax()) + & (df.fPt >= axes_det[1].GetXmin()) + & (df.fPt < axes_det[1].GetXmax()) + ] for i, v in enumerate(var, 2): df = df.loc[(df[v] >= axes_det[i].GetXmin()) & (df[v] < axes_det[i].GetXmax())] - fill_hist(h_effkine[('det', 'nocuts', obs)], df[['fJetPt', 'fPt', *var]]) - df = df.loc[(df.fJetPt_gen >= axes_gen[0].GetXmin()) & (df.fJetPt_gen < axes_gen[0].GetXmax()) & - (df.fPt_gen 
>= axes_gen[1].GetXmin()) & (df.fPt_gen < axes_gen[1].GetXmax())] + fill_hist(h_effkine[("det", "nocuts", obs)], df[["fJetPt", "fPt", *var]]) + df = df.loc[ + (df.fJetPt_gen >= axes_gen[0].GetXmin()) + & (df.fJetPt_gen < axes_gen[0].GetXmax()) + & (df.fPt_gen >= axes_gen[1].GetXmin()) + & (df.fPt_gen < axes_gen[1].GetXmax()) + ] for i, v in enumerate(var, 2): - df = df.loc[(df[f'{v}_gen'] >= axes_gen[i].GetXmin()) & (df[f'{v}_gen'] < axes_gen[i].GetXmax())] - fill_hist(h_effkine[('det', 'cut', obs)], df[['fJetPt', 'fPt', *var]]) + df = df.loc[(df[f"{v}_gen"] >= axes_gen[i].GetXmin()) & (df[f"{v}_gen"] < axes_gen[i].GetXmax())] + fill_hist(h_effkine[("det", "cut", obs)], df[["fJetPt", "fPt", *var]]) - fill_hist(h_response[obs], df[['fJetPt', 'fPt', *var, 'fJetPt_gen', 'fPt_gen', *(f'{v}_gen' for v in var)]]) + fill_hist(h_response[obs], df[["fJetPt", "fPt", *var, "fJetPt_gen", "fPt_gen", *(f"{v}_gen" for v in var)]]) df = dfi df = self._explode_arraycols(df, [var[icol] for icol in arraycols]) - df = df.loc[(df.fJetPt_gen >= axes_gen[0].GetXmin()) & (df.fJetPt_gen < axes_gen[0].GetXmax()) & - (df.fPt_gen >= axes_gen[1].GetXmin()) & (df.fPt_gen < axes_gen[1].GetXmax())] + df = df.loc[ + (df.fJetPt_gen >= axes_gen[0].GetXmin()) + & (df.fJetPt_gen < axes_gen[0].GetXmax()) + & (df.fPt_gen >= axes_gen[1].GetXmin()) + & (df.fPt_gen < axes_gen[1].GetXmax()) + ] for i, v in enumerate(var, 2): - df = df.loc[(df[f'{v}_gen'] >= axes_gen[i].GetXmin()) & (df[f'{v}_gen'] < axes_gen[i].GetXmax())] - fill_hist(h_effkine[('gen', 'nocuts', obs)], df[['fJetPt_gen', 'fPt', *(f'{v}_gen' for v in var)]]) - df = df.loc[(df.fJetPt >= axes_det[0].GetXmin()) & (df.fJetPt < axes_det[0].GetXmax()) & - (df.fPt >= axes_det[1].GetXmin()) & (df.fPt < axes_det[1].GetXmax())] + df = df.loc[(df[f"{v}_gen"] >= axes_gen[i].GetXmin()) & (df[f"{v}_gen"] < axes_gen[i].GetXmax())] + fill_hist(h_effkine[("gen", "nocuts", obs)], df[["fJetPt_gen", "fPt", *(f"{v}_gen" for v in var)]]) + df = df.loc[ + (df.fJetPt >= axes_det[0].GetXmin()) + & (df.fJetPt < axes_det[0].GetXmax()) + & (df.fPt >= axes_det[1].GetXmin()) + & (df.fPt < axes_det[1].GetXmax()) + ] for i, v in enumerate(var, 2): df = df.loc[(df[v] >= axes_det[i].GetXmin()) & (df[v] < axes_det[i].GetXmax())] - fill_hist(h_effkine[('gen', 'cut', obs)], df[['fJetPt_gen', 'fPt', *(f'{v}_gen' for v in var)]]) + fill_hist(h_effkine[("gen", "cut", obs)], df[["fJetPt_gen", "fPt", *(f"{v}_gen" for v in var)]]) diff --git a/machine_learning_hep/processerdhadrons.py b/machine_learning_hep/processerdhadrons.py index 76e8bf68b7..a46e90ee37 100755 --- a/machine_learning_hep/processerdhadrons.py +++ b/machine_learning_hep/processerdhadrons.py @@ -17,40 +17,81 @@ """ main script for doing data processing, machine learning and analysis """ -import math + import array +import math + import numpy as np import pandas as pd -from ROOT import TFile, TH1F -from machine_learning_hep.utilities import seldf_singlevar, read_df +from ROOT import TH1F, TFile + from machine_learning_hep.processer import Processer, dfquery +from machine_learning_hep.utilities import read_df, seldf_singlevar from machine_learning_hep.utils.hist import bin_array, create_hist, fill_hist -class ProcesserDhadrons(Processer): # pylint: disable=too-many-instance-attributes + +class ProcesserDhadrons(Processer): # pylint: disable=too-many-instance-attributes # Class Attribute - species = 'processer' + species = "processer" # Initializer / Instance Attributes # pylint: disable=too-many-statements, too-many-arguments - def 
__init__(self, case, datap, run_param, mcordata, p_maxfiles, - d_root, d_pkl, d_pklsk, d_pkl_ml, p_period, i_period, - p_chunksizeunp, p_chunksizeskim, p_maxprocess, - p_frac_merge, p_rd_merge, d_pkl_dec, d_pkl_decmerged, - d_results, typean, runlisttrigger, d_mcreweights): - super().__init__(case, datap, run_param, mcordata, p_maxfiles, - d_root, d_pkl, d_pklsk, d_pkl_ml, p_period, i_period, - p_chunksizeunp, p_chunksizeskim, p_maxprocess, - p_frac_merge, p_rd_merge, d_pkl_dec, d_pkl_decmerged, - d_results, typean, runlisttrigger, d_mcreweights) - - self.p_mass_fit_lim = datap["analysis"][self.typean]['mass_fit_lim'] - self.p_bin_width = datap["analysis"][self.typean]['bin_width'] + def __init__( + self, + case, + datap, + run_param, + mcordata, + p_maxfiles, + d_root, + d_pkl, + d_pklsk, + d_pkl_ml, + p_period, + i_period, + p_chunksizeunp, + p_chunksizeskim, + p_maxprocess, + p_frac_merge, + p_rd_merge, + d_pkl_dec, + d_pkl_decmerged, + d_results, + typean, + runlisttrigger, + d_mcreweights, + ): + super().__init__( + case, + datap, + run_param, + mcordata, + p_maxfiles, + d_root, + d_pkl, + d_pklsk, + d_pkl_ml, + p_period, + i_period, + p_chunksizeunp, + p_chunksizeskim, + p_maxprocess, + p_frac_merge, + p_rd_merge, + d_pkl_dec, + d_pkl_decmerged, + d_results, + typean, + runlisttrigger, + d_mcreweights, + ) + + self.p_mass_fit_lim = datap["analysis"][self.typean]["mass_fit_lim"] + self.p_bin_width = datap["analysis"][self.typean]["bin_width"] limits_mass = datap["analysis"][self.typean]["mass_fit_lim"] nbins_mass = int(round((limits_mass[1] - limits_mass[0]) / self.p_bin_width)) - self.p_num_bins = int(round((self.p_mass_fit_lim[1] - self.p_mass_fit_lim[0]) / \ - self.p_bin_width)) - self.s_presel_gen_eff = datap["analysis"][self.typean]['presel_gen_eff'] - + self.p_num_bins = int(round((self.p_mass_fit_lim[1] - self.p_mass_fit_lim[0]) / self.p_bin_width)) + self.s_presel_gen_eff = datap["analysis"][self.typean]["presel_gen_eff"] self.lpt_finbinmin = datap["analysis"][self.typean]["sel_an_binmin"] self.lpt_finbinmax = datap["analysis"][self.typean]["sel_an_binmax"] @@ -59,7 +100,7 @@ def __init__(self, case, datap, run_param, mcordata, p_maxfiles, self.s_evtsel = datap["analysis"][self.typean]["evtsel"] self.v_invmass = datap["variables"].get("var_inv_mass", "fM") self.binarray_mass = bin_array(nbins_mass, limits_mass[0], limits_mass[1]) - self.binarray_pthf = np.asarray(self.cfg('sel_an_binmin', []) + self.cfg('sel_an_binmax', [])[-1:], 'd') + self.binarray_pthf = np.asarray(self.cfg("sel_an_binmin", []) + self.cfg("sel_an_binmax", [])[-1:], "d") # pylint: disable=too-many-branches def process_histomass_single(self, index): @@ -72,7 +113,7 @@ def process_histomass_single(self, index): dfevtevtsel = dfevtorig neventsafterevtsel = len(dfevtevtsel) - #validation plot for event selection + # validation plot for event selection histonorm = TH1F("histonorm", "histonorm", 10, 0, 10) histonorm.SetBinContent(1, neventsorig) histonorm.GetXaxis().SetBinLabel(1, "tot events") @@ -81,8 +122,8 @@ def process_histomass_single(self, index): histonorm.Write() myfile.cd() - hEvents = TH1F('all_events', 'all_events', 1, -0.5, 0.5) - hSelEvents = TH1F('sel_events', 'sel_events', 1, -0.5, 0.5) + hEvents = TH1F("all_events", "all_events", 1, -0.5, 0.5) + hSelEvents = TH1F("sel_events", "sel_events", 1, -0.5, 0.5) hEvents.SetBinContent(1, len(dfevtorig)) hSelEvents.SetBinContent(1, len(dfevtevtsel)) @@ -99,8 +140,7 @@ def process_histomass_single(self, index): if self.doml is True: df = 
df.query(self.l_selml[bin_id]) - df = seldf_singlevar(df, self.v_var_binning, \ - self.lpt_finbinmin[ipt], self.lpt_finbinmax[ipt]) + df = seldf_singlevar(df, self.v_var_binning, self.lpt_finbinmin[ipt], self.lpt_finbinmax[ipt]) if self.do_custom_analysis_cuts: df = self.apply_cuts_ptbin(df, ipt) @@ -108,17 +148,23 @@ def process_histomass_single(self, index): df_ptmerged = pd.concat([df_ptmerged, df], ignore_index=True) if self.mltype == "MultiClassification": - suffix = "%s%d_%d_%.2f%.2f%.2f" % \ - (self.v_var_binning, self.lpt_finbinmin[ipt], - self.lpt_finbinmax[ipt], self.lpt_probcutfin[ipt][0], - self.lpt_probcutfin[ipt][1], self.lpt_probcutfin[ipt][2]) + suffix = "%s%d_%d_%.2f%.2f%.2f" % ( + self.v_var_binning, + self.lpt_finbinmin[ipt], + self.lpt_finbinmax[ipt], + self.lpt_probcutfin[ipt][0], + self.lpt_probcutfin[ipt][1], + self.lpt_probcutfin[ipt][2], + ) else: - suffix = "%s%d_%d_%.2f" % \ - (self.v_var_binning, self.lpt_finbinmin[ipt], - self.lpt_finbinmax[ipt], self.lpt_probcutfin[ipt]) + suffix = "%s%d_%d_%.2f" % ( + self.v_var_binning, + self.lpt_finbinmin[ipt], + self.lpt_finbinmax[ipt], + self.lpt_probcutfin[ipt], + ) - h_invmass = TH1F("hmass" + suffix, "", self.p_num_bins, - self.p_mass_fit_lim[0], self.p_mass_fit_lim[1]) + h_invmass = TH1F("hmass" + suffix, "", self.p_num_bins, self.p_mass_fit_lim[0], self.p_mass_fit_lim[1]) fill_hist(h_invmass, df[self.v_invmass]) myfile.cd() @@ -127,10 +173,12 @@ def process_histomass_single(self, index): if self.mcordata == "mc": df_sig = df[df[self.v_ismcsignal] == 1] df_bkg = df[df[self.v_ismcbkg] == 1] - h_invmass_sig = TH1F("hmass_sig" + suffix, "", self.p_num_bins, - self.p_mass_fit_lim[0], self.p_mass_fit_lim[1]) - h_invmass_bkg = TH1F("hmass_bkg" + suffix, "", self.p_num_bins, - self.p_mass_fit_lim[0], self.p_mass_fit_lim[1]) + h_invmass_sig = TH1F( + "hmass_sig" + suffix, "", self.p_num_bins, self.p_mass_fit_lim[0], self.p_mass_fit_lim[1] + ) + h_invmass_bkg = TH1F( + "hmass_bkg" + suffix, "", self.p_num_bins, self.p_mass_fit_lim[0], self.p_mass_fit_lim[1] + ) fill_hist(h_invmass_sig, df_sig[self.v_invmass]) fill_hist(h_invmass_bkg, df_bkg[self.v_invmass]) @@ -139,35 +187,31 @@ def process_histomass_single(self, index): h_invmass_sig.Write() h_invmass_bkg.Write() - for sel_name, sel_spec in self.cfg('data_selections', {}).items(): - if sel_spec['level'] == self.mcordata: - df_sel = dfquery(df_ptmerged, sel_spec['query']) + for sel_name, sel_spec in self.cfg("data_selections", {}).items(): + if sel_spec["level"] == self.mcordata: + df_sel = dfquery(df_ptmerged, sel_spec["query"]) h = create_hist( - f'h_mass-pthf_{sel_name}', - ';M (GeV/#it{c}^{2});p_{T}^{HF} (GeV/#it{c})', - self.binarray_mass, self.binarray_pthf) - fill_hist(h, df_sel[['fM', 'fPt']], write=True) + f"h_mass-pthf_{sel_name}", + ";M (GeV/#it{c}^{2});p_{T}^{HF} (GeV/#it{c})", + self.binarray_mass, + self.binarray_pthf, + ) + fill_hist(h, df_sel[["fM", "fPt"]], write=True) # pylint: disable=line-too-long def process_efficiency_single(self, index): - #TO UPDATE TO DHADRON_MULT VERSION + # TO UPDATE TO DHADRON_MULT VERSION out_file = TFile.Open(self.l_histoeff[index], "recreate") n_bins = len(self.lpt_finbinmin) analysis_bin_lims_temp = self.lpt_finbinmin.copy() - analysis_bin_lims_temp.append(self.lpt_finbinmax[n_bins-1]) - analysis_bin_lims = array.array('f', analysis_bin_lims_temp) - h_gen_pr = TH1F("h_gen_pr", "Prompt Generated in acceptance |y|<0.5", \ - n_bins, analysis_bin_lims) - h_presel_pr = TH1F("h_presel_pr", "Prompt Reco in acc |#eta|<0.8 and sel", \ - 
n_bins, analysis_bin_lims) - h_sel_pr = TH1F("h_sel_pr", "Prompt Reco and sel in acc |#eta|<0.8 and sel", \ - n_bins, analysis_bin_lims) - h_gen_fd = TH1F("h_gen_fd", "FD Generated in acceptance |y|<0.5", \ - n_bins, analysis_bin_lims) - h_presel_fd = TH1F("h_presel_fd", "FD Reco in acc |#eta|<0.8 and sel", \ - n_bins, analysis_bin_lims) - h_sel_fd = TH1F("h_sel_fd", "FD Reco and sel in acc |#eta|<0.8 and sel", \ - n_bins, analysis_bin_lims) + analysis_bin_lims_temp.append(self.lpt_finbinmax[n_bins - 1]) + analysis_bin_lims = array.array("f", analysis_bin_lims_temp) + h_gen_pr = TH1F("h_gen_pr", "Prompt Generated in acceptance |y|<0.5", n_bins, analysis_bin_lims) + h_presel_pr = TH1F("h_presel_pr", "Prompt Reco in acc |#eta|<0.8 and sel", n_bins, analysis_bin_lims) + h_sel_pr = TH1F("h_sel_pr", "Prompt Reco and sel in acc |#eta|<0.8 and sel", n_bins, analysis_bin_lims) + h_gen_fd = TH1F("h_gen_fd", "FD Generated in acceptance |y|<0.5", n_bins, analysis_bin_lims) + h_presel_fd = TH1F("h_presel_fd", "FD Reco in acc |#eta|<0.8 and sel", n_bins, analysis_bin_lims) + h_sel_fd = TH1F("h_sel_fd", "FD Reco and sel in acc |#eta|<0.8 and sel", n_bins, analysis_bin_lims) bincounter = 0 for ipt in range(self.p_nptfinbins): @@ -177,10 +221,10 @@ def process_efficiency_single(self, index): df_mc_reco = df_mc_reco.query(self.s_evtsel) df_mc_gen = read_df(self.mptfiles_gensk[bin_id][index]) df_mc_gen = df_mc_gen.query(self.s_presel_gen_eff) - df_mc_reco = seldf_singlevar(df_mc_reco, self.v_var_binning, \ - self.lpt_finbinmin[ipt], self.lpt_finbinmax[ipt]) - df_mc_gen = seldf_singlevar(df_mc_gen, self.v_var_binning, \ - self.lpt_finbinmin[ipt], self.lpt_finbinmax[ipt]) + df_mc_reco = seldf_singlevar( + df_mc_reco, self.v_var_binning, self.lpt_finbinmin[ipt], self.lpt_finbinmax[ipt] + ) + df_mc_gen = seldf_singlevar(df_mc_gen, self.v_var_binning, self.lpt_finbinmin[ipt], self.lpt_finbinmax[ipt]) df_gen_sel_pr = df_mc_gen.loc[(df_mc_gen.ismcprompt == 1) & (df_mc_gen.ismcsignal == 1)] df_reco_presel_pr = df_mc_reco.loc[(df_mc_reco.ismcprompt == 1) & (df_mc_reco.ismcsignal == 1)] df_reco_sel_pr = None diff --git a/machine_learning_hep/processerdhadrons_mult.py b/machine_learning_hep/processerdhadrons_mult.py index 6ed9714661..3544cfe3e9 100755 --- a/machine_learning_hep/processerdhadrons_mult.py +++ b/machine_learning_hep/processerdhadrons_mult.py @@ -12,52 +12,98 @@ ## along with this program. if not, see . 
## ############################################################################# -#pylint: disable=import-error, no-name-in-module, consider-using-f-string, too-many-statements, too-many-branches, too-many-arguments, too-many-instance-attributes, too-many-locals +# pylint: disable=import-error, no-name-in-module, consider-using-f-string, too-many-statements, too-many-branches, too-many-arguments, too-many-instance-attributes, too-many-locals """ main script for doing data processing, machine learning and analysis """ -import math + import array +import math import os + import numpy as np import pandas as pd -from ROOT import TFile, TH1F, TH2F -from machine_learning_hep.utilities_files import create_folder_struc -from machine_learning_hep.utilities import seldf_singlevar, seldf_singlevar_inclusive -from machine_learning_hep.utilities import mergerootfiles, read_df -from machine_learning_hep.utilities import get_timestamp_string +from ROOT import TH1F, TH2F, TFile + from machine_learning_hep.processer import Processer +from machine_learning_hep.utilities import ( + get_timestamp_string, + mergerootfiles, + read_df, + seldf_singlevar, + seldf_singlevar_inclusive, +) +from machine_learning_hep.utilities_files import create_folder_struc from machine_learning_hep.utils.hist import bin_array, fill_hist + # pylint: disable=invalid-name class ProcesserDhadrons_mult(Processer): # Class Attribute - species = 'processer' + species = "processer" # Initializer / Instance Attributes - def __init__(self, case, datap, run_param, mcordata, p_maxfiles, - d_root, d_pkl, d_pklsk, d_pkl_ml, p_period, i_period, - p_chunksizeunp, p_chunksizeskim, p_maxprocess, - p_frac_merge, p_rd_merge, d_pkl_dec, d_pkl_decmerged, - d_results, typean, runlisttrigger, d_mcreweights): - super().__init__(case, datap, run_param, mcordata, p_maxfiles, - d_root, d_pkl, d_pklsk, d_pkl_ml, p_period, i_period, - p_chunksizeunp, p_chunksizeskim, p_maxprocess, - p_frac_merge, p_rd_merge, d_pkl_dec, d_pkl_decmerged, - d_results, typean, runlisttrigger, d_mcreweights) + def __init__( + self, + case, + datap, + run_param, + mcordata, + p_maxfiles, + d_root, + d_pkl, + d_pklsk, + d_pkl_ml, + p_period, + i_period, + p_chunksizeunp, + p_chunksizeskim, + p_maxprocess, + p_frac_merge, + p_rd_merge, + d_pkl_dec, + d_pkl_decmerged, + d_results, + typean, + runlisttrigger, + d_mcreweights, + ): + super().__init__( + case, + datap, + run_param, + mcordata, + p_maxfiles, + d_root, + d_pkl, + d_pklsk, + d_pkl_ml, + p_period, + i_period, + p_chunksizeunp, + p_chunksizeskim, + p_maxprocess, + p_frac_merge, + p_rd_merge, + d_pkl_dec, + d_pkl_decmerged, + d_results, + typean, + runlisttrigger, + d_mcreweights, + ) self.v_invmass = datap["variables"].get("var_inv_mass", "fM") - self.p_mass_fit_lim = datap["analysis"][self.typean]['mass_fit_lim'] - self.p_bin_width = datap["analysis"][self.typean]['bin_width'] - self.binarray_pthf = np.asarray(self.cfg('sel_an_binmin', []) + self.cfg('sel_an_binmax', [])[-1:], 'd') + self.p_mass_fit_lim = datap["analysis"][self.typean]["mass_fit_lim"] + self.p_bin_width = datap["analysis"][self.typean]["bin_width"] + self.binarray_pthf = np.asarray(self.cfg("sel_an_binmin", []) + self.cfg("sel_an_binmax", [])[-1:], "d") limits_mass = datap["analysis"][self.typean]["mass_fit_lim"] nbins_mass = int(round((limits_mass[1] - limits_mass[0]) / self.p_bin_width)) self.binarray_mass = bin_array(nbins_mass, limits_mass[0], limits_mass[1]) - self.p_num_bins = int(round((self.p_mass_fit_lim[1] - self.p_mass_fit_lim[0]) / \ - 
self.p_bin_width)) - self.s_presel_gen_eff = datap["analysis"][self.typean]['presel_gen_eff'] + self.p_num_bins = int(round((self.p_mass_fit_lim[1] - self.p_mass_fit_lim[0]) / self.p_bin_width)) + self.s_presel_gen_eff = datap["analysis"][self.typean]["presel_gen_eff"] self.lvar2_binmin = datap["analysis"][self.typean]["sel_binmin2"] self.lvar2_binmax = datap["analysis"][self.typean]["sel_binmax2"] self.v_var2_binning = datap["analysis"][self.typean]["var_binning2"] @@ -108,11 +154,13 @@ def make_weights(col, func, hist, use_func): if use_func: return [func.Eval(x) for x in col] + def reg(value): # warning, the histogram has empty bins at high mult. # (>125 ntrkl) so a check is needed to avoid a 1/0 division # when computing the inverse of the weight - return value if value != 0. else 1. + return value if value != 0.0 else 1.0 + return [reg(hist.GetBinContent(hist.FindBin(iw))) for iw in col] def process_histomass_single(self, index): @@ -125,7 +173,7 @@ def process_histomass_single(self, index): else: dfevtevtsel = dfevtorig - #validation plot for event selection + # validation plot for event selection neventsafterevtsel = len(dfevtevtsel) histonorm = TH1F("histonorm", "histonorm", 10, 0, 10) histonorm.SetBinContent(1, neventsorig) @@ -133,17 +181,18 @@ def process_histomass_single(self, index): histonorm.SetBinContent(2, neventsafterevtsel) histonorm.GetXaxis().SetBinLabel(2, "tot events after evt sel") for ibin2, _ in enumerate(self.lvar2_binmin): - binneddf = seldf_singlevar_inclusive(dfevtevtsel, self.v_var2_binning, \ - self.lvar2_binmin[ibin2], self.lvar2_binmax[ibin2]) + binneddf = seldf_singlevar_inclusive( + dfevtevtsel, self.v_var2_binning, self.lvar2_binmin[ibin2], self.lvar2_binmax[ibin2] + ) histonorm.SetBinContent(3 + ibin2, len(binneddf)) - histonorm.GetXaxis().SetBinLabel(3 + ibin2, \ - "tot events after mult sel %d - %d" % \ - (self.lvar2_binmin[ibin2], self.lvar2_binmax[ibin2])) + histonorm.GetXaxis().SetBinLabel( + 3 + ibin2, "tot events after mult sel %d - %d" % (self.lvar2_binmin[ibin2], self.lvar2_binmax[ibin2]) + ) histonorm.Write() myfile.cd() - hEvents = TH1F('all_events', 'all_events', 1, -0.5, 0.5) - hSelEvents = TH1F('sel_events', 'sel_events', 1, -0.5, 0.5) + hEvents = TH1F("all_events", "all_events", 1, -0.5, 0.5) + hSelEvents = TH1F("sel_events", "sel_events", 1, -0.5, 0.5) hEvents.SetBinContent(1, len(dfevtorig)) hSelEvents.SetBinContent(1, len(dfevtevtsel)) @@ -152,15 +201,14 @@ def process_histomass_single(self, index): df_ptmerged = pd.DataFrame() - for ipt in range(self.p_nptfinbins): # pylint: disable=too-many-nested-blocks + for ipt in range(self.p_nptfinbins): # pylint: disable=too-many-nested-blocks bin_id = self.bin_matching[ipt] df = read_df(self.mptfiles_recoskmldec[bin_id][index]) if self.s_evtsel is not None: df = df.query(self.s_evtsel) if self.doml is True: df = df.query(self.l_selml[ipt]) - df = seldf_singlevar(df, self.v_var_binning, \ - self.lpt_finbinmin[ipt], self.lpt_finbinmax[ipt]) + df = seldf_singlevar(df, self.v_var_binning, self.lpt_finbinmin[ipt], self.lpt_finbinmax[ipt]) if self.do_custom_analysis_cuts: df = self.apply_cuts_ptbin(df, ipt) @@ -168,58 +216,71 @@ def process_histomass_single(self, index): df_ptmerged = pd.concat([df_ptmerged, df], ignore_index=True) for ibin2, _ in enumerate(self.lvar2_binmin): - if self.mltype == "MultiClassification": - suffix = "%s%d_%d_%.2f%.2f%s_%.2f_%.2f" % \ - (self.v_var_binning, self.lpt_finbinmin[ipt], - self.lpt_finbinmax[ipt], self.lpt_probcutfin[ipt][0], - self.lpt_probcutfin[ipt][1], 
self.v_var2_binning,
-                         self.lvar2_binmin[ibin2], self.lvar2_binmax[ibin2])
+                if self.mltype == "MultiClassification":
+                    suffix = "%s%d_%d_%.2f%.2f%s_%.2f_%.2f" % (
+                        self.v_var_binning,
+                        self.lpt_finbinmin[ipt],
+                        self.lpt_finbinmax[ipt],
+                        self.lpt_probcutfin[ipt][0],
+                        self.lpt_probcutfin[ipt][1],
+                        self.v_var2_binning,
+                        self.lvar2_binmin[ibin2],
+                        self.lvar2_binmax[ibin2],
+                    )
                 else:
-                    suffix = "%s%d_%d_%.2f%s_%.2f_%.2f" % \
-                        (self.v_var_binning, self.lpt_finbinmin[ipt],
-                         self.lpt_finbinmax[ipt], self.lpt_probcutfin[ipt],
-                         self.v_var2_binning,
-                         self.lvar2_binmin[ibin2], self.lvar2_binmax[ibin2])
-                h_invmass = TH1F("hmass" + suffix, "", self.p_num_bins,
-                                 self.p_mass_fit_lim[0], self.p_mass_fit_lim[1])
-                df_bin = seldf_singlevar_inclusive(df, self.v_var2_binning, \
-                    self.lvar2_binmin[ibin2], self.lvar2_binmax[ibin2])
+                    suffix = "%s%d_%d_%.2f%s_%.2f_%.2f" % (
+                        self.v_var_binning,
+                        self.lpt_finbinmin[ipt],
+                        self.lpt_finbinmax[ipt],
+                        self.lpt_probcutfin[ipt],
+                        self.v_var2_binning,
+                        self.lvar2_binmin[ibin2],
+                        self.lvar2_binmax[ibin2],
+                    )
+                h_invmass = TH1F("hmass" + suffix, "", self.p_num_bins, self.p_mass_fit_lim[0], self.p_mass_fit_lim[1])
+                df_bin = seldf_singlevar_inclusive(
+                    df, self.v_var2_binning, self.lvar2_binmin[ibin2], self.lvar2_binmax[ibin2]
+                )
                 fill_hist(h_invmass, df_bin[self.v_invmass])
                 myfile.cd()
                 h_invmass.Write()

                 if self.mcordata == "mc":
                     df_bin_sig = df_bin[df_bin[self.v_ismcsignal] == 1]
-                    h_invmass_sig = TH1F("hmass_sig" + suffix, "", self.p_num_bins,
-                                         self.p_mass_fit_lim[0], self.p_mass_fit_lim[1])
+                    h_invmass_sig = TH1F(
+                        "hmass_sig" + suffix, "", self.p_num_bins, self.p_mass_fit_lim[0], self.p_mass_fit_lim[1]
+                    )
                     fill_hist(h_invmass_sig, df_bin_sig[self.v_invmass])
                     myfile.cd()
                     h_invmass_sig.Write()

         if self.event_cand_validation is True:
             label = "h%s" % self.v_var2_binning
-            histomult = TH1F(label, label, self.nbinshisto,
-                             self.minvaluehisto, self.maxvaluehisto)
+            histomult = TH1F(label, label, self.nbinshisto, self.minvaluehisto, self.maxvaluehisto)
             fill_hist(histomult, dfevtevtsel[self.v_var2_binning])
             histomult.Write()
             if self.v_var2_binning_weigths is not None:
                 label = "h%s" % self.v_var2_binning_weigths
-                histomult_weigths = TH1F(label, label, self.nbinshisto,
-                                         self.minvaluehisto, self.maxvaluehisto)
+                histomult_weigths = TH1F(label, label, self.nbinshisto, self.minvaluehisto, self.maxvaluehisto)
                 fill_hist(histomult_weigths, dfevtevtsel[self.v_var2_binning_weigths])
                 label = "h%s_%s" % (self.v_var2_binning_weigths, self.v_var2_binning)
-                histomult_weigths_2d = TH2F(label, label,
-                                            self.nbinshisto, self.minvaluehisto, self.maxvaluehisto,
-                                            self.nbinshisto, self.minvaluehisto, self.maxvaluehisto)
+                histomult_weigths_2d = TH2F(
+                    label,
+                    label,
+                    self.nbinshisto,
+                    self.minvaluehisto,
+                    self.maxvaluehisto,
+                    self.nbinshisto,
+                    self.minvaluehisto,
+                    self.maxvaluehisto,
+                )
                 fill_hist(histomult_weigths_2d, dfevtevtsel[[self.v_var2_binning_weigths, self.v_var2_binning]])
                 histomult_weigths.Write()
                 histomult_weigths_2d.Write()

-
     def get_reweighted_count(self, dfsel, ibin=None):
         """Apply event weights

@@ -241,18 +302,15 @@ def no_weights(df_):
             return val, math.sqrt(val)

         event_weighting_mc = {}
-        if self.event_weighting_mc and ibin is not None \
-                and len(self.event_weighting_mc) - 1 >= ibin:
+        if self.event_weighting_mc and ibin is not None and len(self.event_weighting_mc) - 1 >= ibin:
             # Check is there is a dictionary with desired info
             event_weighting_mc = self.event_weighting_mc[ibin]

         # If there were explicit info in the analysis database, assume that all fields exist
         # If incomplete, there will
be a mix-up between these values and default values
-        filepath = event_weighting_mc.get("filepath", os.path.join(self.d_mcreweights,
-                                                                   self.n_mcreweights))
+        filepath = event_weighting_mc.get("filepath", os.path.join(self.d_mcreweights, self.n_mcreweights))

         if not os.path.exists(filepath):
-            print(f"Could not find filepath {filepath} for MC event weighting." \
-                  "Compute unweighted values...")
+            print(f"Could not find filepath {filepath} for MC event weighting. Compute unweighted values...")
             return no_weights(dfsel)

         weight_file = TFile.Open(filepath, "read")
@@ -260,17 +318,15 @@ def no_weights(df_):
         weights = weight_file.Get(histo_name)

         if not weights:
-            print(f"Could not find histogram {histo_name} for MC event weighting." \
-                  "Compute unweighted values...")
+            print(f"Could not find histogram {histo_name} for MC event weighting. Compute unweighted values...")
             return no_weights(dfsel)

         weight_according_to = event_weighting_mc.get("according_to", self.v_var2_binning)
-        w = [weights.GetBinContent(weights.FindBin(v)) for v in
-             dfsel[weight_according_to]]
+        w = [weights.GetBinContent(weights.FindBin(v)) for v in dfsel[weight_according_to]]
         val = sum(w)
         err = math.sqrt(sum(map(lambda i: i * i, w)))
-        #print('reweighting sum: {:.1f} +- {:.1f} -> {:.1f} +- {:.1f} (zeroes: {})' \
+        # print('reweighting sum: {:.1f} +- {:.1f} -> {:.1f} +- {:.1f} (zeroes: {})' \
         #      .format(len(dfsel), math.sqrt(len(dfsel)), val, err, w.count(0.)))
         return val, err

@@ -279,43 +335,36 @@ def process_efficiency_single(self, index):
         out_file = TFile.Open(self.l_histoeff[index], "recreate")
         h_list = []
         for ibin2, _ in enumerate(self.lvar2_binmin):
-            stringbin2 = "_%s_%.2f_%.2f" % (self.v_var2_binning,
-                                            self.lvar2_binmin[ibin2],
-                                            self.lvar2_binmax[ibin2])
+            stringbin2 = "_%s_%.2f_%.2f" % (self.v_var2_binning, self.lvar2_binmin[ibin2], self.lvar2_binmax[ibin2])
             n_bins = len(self.lpt_finbinmin)
             analysis_bin_lims_temp = self.lpt_finbinmin.copy()
-            analysis_bin_lims_temp.append(self.lpt_finbinmax[n_bins-1])
-            analysis_bin_lims = array.array('f', analysis_bin_lims_temp)
+            analysis_bin_lims_temp.append(self.lpt_finbinmax[n_bins - 1])
+            analysis_bin_lims = array.array("f", analysis_bin_lims_temp)

-            def make_histo(name, title,
-                           name_extra=stringbin2,
-                           bins=n_bins,
-                           binning=analysis_bin_lims):
+            def make_histo(name, title, name_extra=stringbin2, bins=n_bins, binning=analysis_bin_lims):
                 histo = TH1F(name + name_extra, title, bins, binning)
                 h_list.append(histo)
                 return histo

-            h_gen_pr = make_histo("h_gen_pr",
-                                  "Prompt Generated in acceptance |y|<0.5")
-            h_presel_pr = make_histo("h_presel_pr",
-                                     "Prompt Reco in acc |#eta|<0.8 and sel")
-            h_sel_pr = make_histo("h_sel_pr",
-                                  "Prompt Reco and sel in acc |#eta|<0.8 and sel")
-            h_gen_fd = make_histo("h_gen_fd",
-                                  "FD Generated in acceptance |y|<0.5")
-            h_presel_fd = make_histo("h_presel_fd",
-                                     "FD Reco in acc |#eta|<0.8 and sel")
-            h_sel_fd = make_histo("h_sel_fd",
-                                  "FD Reco and sel in acc |#eta|<0.8 and sel")
+            h_gen_pr = make_histo("h_gen_pr", "Prompt Generated in acceptance |y|<0.5")
+            h_presel_pr = make_histo("h_presel_pr", "Prompt Reco in acc |#eta|<0.8 and sel")
+            h_sel_pr = make_histo("h_sel_pr", "Prompt Reco and sel in acc |#eta|<0.8 and sel")
+            h_gen_fd = make_histo("h_gen_fd", "FD Generated in acceptance |y|<0.5")
+            h_presel_fd = make_histo("h_presel_fd", "FD Reco in acc |#eta|<0.8 and sel")
+            h_sel_fd = make_histo("h_sel_fd", "FD Reco and sel in acc |#eta|<0.8 and sel")

             if self.signal_loss:
-                h_signal_loss_gen_pr = make_histo("h_signal_loss_gen_pr",
-                                                  "Gen Prompt signal loss 
in acceptance |y|<0.5") - h_signal_loss_rec_pr = make_histo("h_signal_loss_rec_pr", - "Rec Prompt signal loss in acceptance |y|<0.5") - h_signal_loss_gen_fd = make_histo("h_signal_loss_gen_fd", - "Gen Feeddown signal loss in acceptance |y|<0.5") - h_signal_loss_rec_fd = make_histo("h_signal_loss_rec_fd", - "Rec Feeddown signal loss in acceptance |y|<0.5") + h_signal_loss_gen_pr = make_histo( + "h_signal_loss_gen_pr", "Gen Prompt signal loss in acceptance |y|<0.5" + ) + h_signal_loss_rec_pr = make_histo( + "h_signal_loss_rec_pr", "Rec Prompt signal loss in acceptance |y|<0.5" + ) + h_signal_loss_gen_fd = make_histo( + "h_signal_loss_gen_fd", "Gen Feeddown signal loss in acceptance |y|<0.5" + ) + h_signal_loss_rec_fd = make_histo( + "h_signal_loss_rec_fd", "Rec Feeddown signal loss in acceptance |y|<0.5" + ) bincounter = 0 for ipt in range(self.p_nptfinbins): @@ -327,10 +376,12 @@ def make_histo(name, title, df_mc_gen = df_mc_gen.query(self.s_presel_gen_eff) if self.s_evtsel is not None: df_mc_gen = df_mc_gen.query(self.s_evtsel) - df_mc_reco = seldf_singlevar(df_mc_reco, self.v_var_binning, \ - self.lpt_finbinmin[ipt], self.lpt_finbinmax[ipt]) - df_mc_gen = seldf_singlevar(df_mc_gen, self.v_var_binning, \ - self.lpt_finbinmin[ipt], self.lpt_finbinmax[ipt]) + df_mc_reco = seldf_singlevar( + df_mc_reco, self.v_var_binning, self.lpt_finbinmin[ipt], self.lpt_finbinmax[ipt] + ) + df_mc_gen = seldf_singlevar( + df_mc_gen, self.v_var_binning, self.lpt_finbinmin[ipt], self.lpt_finbinmax[ipt] + ) # Whether or not to calculate the signal loss if self.signal_loss: @@ -340,8 +391,9 @@ def make_histo(name, title, if self.s_evtsel is not None: df_mc_gen_sl = df_mc_gen_sl.query(self.s_evtsel) - df_mc_gen_sl = seldf_singlevar_inclusive(df_mc_gen_sl, self.v_var2_binning_gen, \ - self.lvar2_binmin[ibin2], self.lvar2_binmax[ibin2]) + df_mc_gen_sl = seldf_singlevar_inclusive( + df_mc_gen_sl, self.v_var2_binning_gen, self.lvar2_binmin[ibin2], self.lvar2_binmax[ibin2] + ) df_gen_pr_sl = df_mc_gen_sl.loc[(df_mc_gen_sl.ismcprompt == 1) & (df_mc_gen_sl.ismcsignal == 1)] gen_tot_pr = len(df_gen_pr_sl) @@ -358,10 +410,12 @@ def make_histo(name, title, # Whether or not to cut on the 2nd binning variable if self.mc_cut_on_binning2: - df_mc_reco = seldf_singlevar_inclusive(df_mc_reco, self.v_var2_binning, \ - self.lvar2_binmin[ibin2], self.lvar2_binmax[ibin2]) - df_mc_gen = seldf_singlevar_inclusive(df_mc_gen, self.v_var2_binning, \ - self.lvar2_binmin[ibin2], self.lvar2_binmax[ibin2]) + df_mc_reco = seldf_singlevar_inclusive( + df_mc_reco, self.v_var2_binning, self.lvar2_binmin[ibin2], self.lvar2_binmax[ibin2] + ) + df_mc_gen = seldf_singlevar_inclusive( + df_mc_gen, self.v_var2_binning, self.lvar2_binmin[ibin2], self.lvar2_binmax[ibin2] + ) df_gen_sel_pr = df_mc_gen.loc[(df_mc_gen.ismcprompt == 1) & (df_mc_gen.ismcsignal == 1)] df_reco_presel_pr = df_mc_reco.loc[(df_mc_reco.ismcprompt == 1) & (df_mc_reco.ismcsignal == 1)] df_reco_sel_pr = None @@ -381,8 +435,7 @@ def make_histo(name, title, df_reco_sel_pr = self.apply_cuts_ptbin(df_reco_sel_pr, ipt) df_reco_sel_fd = self.apply_cuts_ptbin(df_reco_sel_fd, ipt) - def set_content(df_to_use, histogram, - i_b=ibin2, b_c=bincounter): + def set_content(df_to_use, histogram, i_b=ibin2, b_c=bincounter): if self.corr_eff_mult[i_b] is True: val, err = self.get_reweighted_count(df_to_use, i_b) else: diff --git a/machine_learning_hep/ratio.py b/machine_learning_hep/ratio.py index d57ad54d17..bb21e30e1a 100644 --- a/machine_learning_hep/ratio.py +++ b/machine_learning_hep/ratio.py 
@@ -15,19 +15,18 @@ """ main script for doing final stage analysis """ + # pylint: disable=unused-wildcard-import, wildcard-import from array import * + # pylint: disable=import-error, no-name-in-module, unused-import import yaml -from ROOT import TFile, TH1F, TCanvas -from ROOT import gStyle, TLegend -from ROOT import gROOT -from ROOT import TStyle +from ROOT import TH1F, TCanvas, TFile, TLegend, TStyle, gROOT, gStyle + # pylint: disable=import-error, no-name-in-module, unused-import # pylint: disable=too-many-statements def ratio(imult): - gROOT.SetStyle("Plain") gStyle.SetOptStat(0) gStyle.SetOptStat(0000) @@ -36,27 +35,29 @@ def ratio(imult): gStyle.SetFrameFillColor(0) gStyle.SetOptTitle(0) - ccross = TCanvas('cCross', 'The Fit Canvas', 100, 600) + ccross = TCanvas("cCross", "The Fit Canvas", 100, 600) fileoutcrossd0pp = TFile.Open("finalcrossD0pp.root") fileoutcrossdspp = TFile.Open("finalcrossDspp.root") fileoutcrossLcpkpipp = TFile.Open("finalcrossLcpKpipp.root") fileoutcrossLcpk0s = TFile.Open("finalcrossLcpK0spp.root") - with open("data/database_ml_parameters_D0pp.yml", 'r') as param_config: + with open("data/database_ml_parameters_D0pp.yml", "r") as param_config: data_param = yaml.load(param_config, Loader=yaml.FullLoader) nbins = len(data_param["D0pp"]["analysis"]["sel_binmax2"]) print("nbins", nbins) - ccross = TCanvas('cCross', 'The Fit Canvas') + ccross = TCanvas("cCross", "The Fit Canvas") ccross.SetCanvasSize(1500, 1500) ccross.SetWindowSize(500, 500) ccross.SetLogx() colorparticle = [[600, 632, 880], [600, 632, 880]] markerstyle = [[21, 21, 21], [22, 22, 22]] - legendtxt = [["Ds < 20 tracklets", "LcK0s < 20 tracklets", "LcpKpi < 20 tracklets"], \ - ["Ds > 20 tracklets", "LcK0s > 20 tracklets", "LcpKpi > 20 tracklets"]] + legendtxt = [ + ["Ds < 20 tracklets", "LcK0s < 20 tracklets", "LcpKpi < 20 tracklets"], + ["Ds > 20 tracklets", "LcK0s > 20 tracklets", "LcpKpi > 20 tracklets"], + ] - leg = TLegend(.5, .65, .7, .85) + leg = TLegend(0.5, 0.65, 0.7, 0.85) leg.SetBorderSize(0) leg.SetFillColor(0) leg.SetFillStyle(0) @@ -70,29 +71,31 @@ def ratio(imult): hcrossDspp.Divide(hcrossD0pp) hcrossLcpK0spp.Divide(hcrossD0pp) hcrossLcpKpipp.Divide(hcrossD0pp) - hcrossDspp.SetMarkerStyle(markerstyle[imult-1][0]) - hcrossLcpK0spp.SetMarkerStyle(markerstyle[imult-1][1]) - hcrossLcpKpipp.SetMarkerStyle(markerstyle[imult-1][2]) - hcrossDspp.SetMarkerColor(colorparticle[imult-1][0]) - hcrossLcpK0spp.SetMarkerColor(colorparticle[imult-1][1]) - hcrossLcpKpipp.SetMarkerColor(colorparticle[imult-1][2]) - hcrossDspp.SetLineColor(colorparticle[imult-1][0]) - hcrossLcpK0spp.SetLineColor(colorparticle[imult-1][1]) - hcrossLcpKpipp.SetLineColor(colorparticle[imult-1][2]) + hcrossDspp.SetMarkerStyle(markerstyle[imult - 1][0]) + hcrossLcpK0spp.SetMarkerStyle(markerstyle[imult - 1][1]) + hcrossLcpKpipp.SetMarkerStyle(markerstyle[imult - 1][2]) + hcrossDspp.SetMarkerColor(colorparticle[imult - 1][0]) + hcrossLcpK0spp.SetMarkerColor(colorparticle[imult - 1][1]) + hcrossLcpKpipp.SetMarkerColor(colorparticle[imult - 1][2]) + hcrossDspp.SetLineColor(colorparticle[imult - 1][0]) + hcrossLcpK0spp.SetLineColor(colorparticle[imult - 1][1]) + hcrossLcpKpipp.SetLineColor(colorparticle[imult - 1][2]) hcrossDspp.SetMarkerSize(2.5) hcrossLcpK0spp.SetMarkerSize(2.5) hcrossLcpKpipp.SetMarkerSize(2.5) hcrossDspp.GetXaxis().SetTitle("p_{T} (GeV)") hcrossDspp.GetYaxis().SetTitle("Particle ratio") - hcrossDspp.GetYaxis().SetRangeUser(0., 1.) 
+    hcrossDspp.GetYaxis().SetRangeUser(0.0, 1.0)
     hcrossDspp.Draw()
     hcrossLcpKpipp.Draw("same")
     hcrossLcpK0spp.Draw("same")
-    leg.AddEntry(hcrossDspp, legendtxt[imult-1][0], "LEP")
-    leg.AddEntry(hcrossLcpKpipp, legendtxt[imult-1][1], "LEP")
-    leg.AddEntry(hcrossLcpK0spp, legendtxt[imult-1][2], "LEP")
+    leg.AddEntry(hcrossDspp, legendtxt[imult - 1][0], "LEP")
+    leg.AddEntry(hcrossLcpKpipp, legendtxt[imult - 1][1], "LEP")
+    leg.AddEntry(hcrossLcpK0spp, legendtxt[imult - 1][2], "LEP")
     leg.Draw()
     ccross.SaveAs("ComparisonRatios%d.eps" % imult)

+
+
 ratio(1)
 ratio(2)
diff --git a/machine_learning_hep/root.py b/machine_learning_hep/root.py
index 1bbe85a82d..a6529424bc 100644
--- a/machine_learning_hep/root.py
+++ b/machine_learning_hep/root.py
@@ -18,20 +18,23 @@

 import array
 import ast
+
 import numpy as np
-from ROOT import TNtuple, TFile  # pylint: disable=import-error,no-name-in-module
+from ROOT import TFile, TNtuple  # pylint: disable=import-error,no-name-in-module
+
 from machine_learning_hep.logger import get_logger

+
 def read_ntuple(ntuple, variables):
     """
-    Return a numpy array with the values from TNtuple.
-    ntuple : input TNtuple
-    variables : list of ntuple variables to read
+    Return a numpy array with the values from TNtuple.
+    ntuple : input TNtuple
+    variables : list of ntuple variables to read
     """
     logger = get_logger()
     code_list = []
     for v in variables:
-        code_list += [compile("i.%s" % v, '', 'eval')]
+        code_list += [compile("i.%s" % v, "", "eval")]
     nentries = ntuple.GetEntries()
     nvars = len(variables)
     myarray = np.zeros((nentries, nvars))
@@ -45,18 +48,20 @@ def read_ntuple_ml(ntuple, variablesfeatures, variablesothers, variabley):
     """
-    Return a numpy array with the values from TNtuple.
-    ntuple : input TNtuple
-    variables : list of ntuple variables to read
+    Return a numpy array with the values from TNtuple.
+    ntuple : input TNtuple
+    variablesfeatures : list of feature variables to read
+    variablesothers : list of other variables to read
+    variabley : name of the target variable to read
     """
     logger = get_logger()
     code_listfeatures = []
     code_listothers = []
     for v in variablesfeatures:
-        code_listfeatures += [compile("i.%s" % v, '', 'eval')]
+        code_listfeatures += [compile("i.%s" % v, "", "eval")]
     for v in variablesothers:
-        code_listothers += [compile("i.%s" % v, '', 'eval')]
-    codevariabley = compile("i.%s" % variabley, '', 'eval')
+        code_listothers += [compile("i.%s" % v, "", "eval")]
+    codevariabley = compile("i.%s" % variabley, "", "eval")
     nentries = ntuple.GetEntries()
     nvars = len(variablesfeatures)
     nvarsothers = len(variablesothers)
@@ -76,17 +81,17 @@ def fill_ntuple(tupname, data, names):
     """
-    Create and fill ROOT NTuple with the data sample.
-    tupname : name of the NTuple
-    data : data sample
-    names : names of the NTuple variables
+    Create and fill ROOT NTuple with the data sample.
+    tupname : name of the NTuple
+    data : data sample
+    names : names of the NTuple variables
     """
     variables = ""
     for n in names:
         variables += "%s:" % n
     variables = variables[:-1]
-    values = len(names)*[0.] 
- avalues = array.array('f', values) + values = len(names) * [0.0] + avalues = array.array("f", values) nt = TNtuple(tupname, "", variables) for d in data: for i in range(len(names)): diff --git a/machine_learning_hep/selectionutils.py b/machine_learning_hep/selectionutils.py index 976ec538d0..f058870728 100644 --- a/machine_learning_hep/selectionutils.py +++ b/machine_learning_hep/selectionutils.py @@ -18,10 +18,12 @@ import numba import numpy as np -from ROOT import TH1F # pylint: disable=import-error, no-name-in-module +from ROOT import TH1F # pylint: disable=import-error, no-name-in-module + from machine_learning_hep.bitwise import filter_bit_df, tag_bit_df -#@numba.njit + +# @numba.njit def selectcandidateml(array_prob, probcut): array_is_sel = [] for prob in array_prob: @@ -31,6 +33,7 @@ def selectcandidateml(array_prob, probcut): array_is_sel.append(False) return array_is_sel + @numba.njit def select_runs(good_runlist, array_run): array_run_sel = np.zeros(len(array_run), np.bool_) @@ -41,8 +44,9 @@ def select_runs(good_runlist, array_run): break return array_run_sel + # (pt > 5 and abs(y) < 0.8) or (pt <= 5 and abs(y) < ...) -#@numba.njit +# @numba.njit def selectfidacc(array_pt, array_y): array_is_sel = [] for icand, pt in enumerate(array_pt): @@ -52,79 +56,102 @@ def selectfidacc(array_pt, array_y): else: array_is_sel.append(False) else: - yfid = -0.2/15 * pt**2 + 1.9/15 * pt + 0.5 + yfid = -0.2 / 15 * pt**2 + 1.9 / 15 * pt + 0.5 if abs(array_y[icand]) < yfid: array_is_sel.append(True) else: array_is_sel.append(False) return array_is_sel -# pylint: disable=too-many-arguments -#@numba.njit -def selectpid_dstokkpi(array_nsigma_tpc_pi_0, array_nsigma_tpc_k_0, \ - array_nsigma_tof_pi_0, array_nsigma_tof_k_0, \ - array_nsigma_tpc_k_1, array_nsigma_tof_k_1, \ - array_nsigma_tpc_pi_2, array_nsigma_tpc_k_2, \ - array_nsigma_tof_pi_2, array_nsigma_tof_k_2, nsigmacut): +# pylint: disable=too-many-arguments +# @numba.njit +def selectpid_dstokkpi( + array_nsigma_tpc_pi_0, + array_nsigma_tpc_k_0, + array_nsigma_tof_pi_0, + array_nsigma_tof_k_0, + array_nsigma_tpc_k_1, + array_nsigma_tof_k_1, + array_nsigma_tpc_pi_2, + array_nsigma_tpc_k_2, + array_nsigma_tof_pi_2, + array_nsigma_tof_k_2, + nsigmacut, +): array_is_pid_sel = [] for icand, _ in enumerate(array_nsigma_tpc_pi_0): - is_track_0_sel = array_nsigma_tpc_pi_0[icand] < nsigmacut \ - or array_nsigma_tof_pi_0[icand] < nsigmacut \ - or array_nsigma_tpc_k_0[icand] < nsigmacut \ - or array_nsigma_tof_k_0[icand] < nsigmacut - #second track must be a kaon - is_track_1_sel = array_nsigma_tpc_k_1[icand] < nsigmacut \ - or array_nsigma_tof_k_1[icand] < nsigmacut - is_track_2_sel = array_nsigma_tpc_pi_2[icand] < nsigmacut \ - or array_nsigma_tof_pi_2[icand] < nsigmacut \ - or array_nsigma_tpc_k_2[icand] < nsigmacut \ - or array_nsigma_tof_k_2[icand] < nsigmacut + is_track_0_sel = ( + array_nsigma_tpc_pi_0[icand] < nsigmacut + or array_nsigma_tof_pi_0[icand] < nsigmacut + or array_nsigma_tpc_k_0[icand] < nsigmacut + or array_nsigma_tof_k_0[icand] < nsigmacut + ) + # second track must be a kaon + is_track_1_sel = array_nsigma_tpc_k_1[icand] < nsigmacut or array_nsigma_tof_k_1[icand] < nsigmacut + is_track_2_sel = ( + array_nsigma_tpc_pi_2[icand] < nsigmacut + or array_nsigma_tof_pi_2[icand] < nsigmacut + or array_nsigma_tpc_k_2[icand] < nsigmacut + or array_nsigma_tof_k_2[icand] < nsigmacut + ) if is_track_0_sel and is_track_1_sel and is_track_2_sel: array_is_pid_sel.append(True) else: array_is_pid_sel.append(False) return array_is_pid_sel 
-#@numba.njit -def selectpid_dzerotokpi(array_nsigma_tpc_pi_0, array_nsigma_tpc_k_0, \ - array_nsigma_tof_pi_0, array_nsigma_tof_k_0, \ - array_nsigma_tpc_pi_1, array_nsigma_tpc_k_1, \ - array_nsigma_tof_pi_1, array_nsigma_tof_k_1, nsigmacut): +# @numba.njit +def selectpid_dzerotokpi( + array_nsigma_tpc_pi_0, + array_nsigma_tpc_k_0, + array_nsigma_tof_pi_0, + array_nsigma_tof_k_0, + array_nsigma_tpc_pi_1, + array_nsigma_tpc_k_1, + array_nsigma_tof_pi_1, + array_nsigma_tof_k_1, + nsigmacut, +): array_is_pid_sel = [] for icand, _ in enumerate(array_nsigma_tpc_pi_0): - is_track_0_sel = array_nsigma_tpc_pi_0[icand] < nsigmacut \ - or array_nsigma_tof_pi_0[icand] < nsigmacut \ - or array_nsigma_tpc_k_0[icand] < nsigmacut \ - or array_nsigma_tof_k_0[icand] < nsigmacut - is_track_1_sel = array_nsigma_tpc_pi_1[icand] < nsigmacut \ - or array_nsigma_tof_pi_1[icand] < nsigmacut \ - or array_nsigma_tpc_k_1[icand] < nsigmacut \ - or array_nsigma_tof_k_1[icand] < nsigmacut + is_track_0_sel = ( + array_nsigma_tpc_pi_0[icand] < nsigmacut + or array_nsigma_tof_pi_0[icand] < nsigmacut + or array_nsigma_tpc_k_0[icand] < nsigmacut + or array_nsigma_tof_k_0[icand] < nsigmacut + ) + is_track_1_sel = ( + array_nsigma_tpc_pi_1[icand] < nsigmacut + or array_nsigma_tof_pi_1[icand] < nsigmacut + or array_nsigma_tpc_k_1[icand] < nsigmacut + or array_nsigma_tof_k_1[icand] < nsigmacut + ) if is_track_0_sel and is_track_1_sel: array_is_pid_sel.append(True) else: array_is_pid_sel.append(False) return array_is_pid_sel -#@numba.njit + +# @numba.njit def selectpid_lctov0bachelor(array_nsigma_tpc, array_nsigma_tof, nsigmacut): - #nsigma for desired species (i.e. p in case of pK0s or pi in case of piL) + # nsigma for desired species (i.e. p in case of pK0s or pi in case of piL) array_is_pid_sel = [] for icand, _ in enumerate(array_nsigma_tpc): - is_track_sel = array_nsigma_tpc[icand] < nsigmacut or \ - array_nsigma_tof[icand] < nsigmacut + is_track_sel = array_nsigma_tpc[icand] < nsigmacut or array_nsigma_tof[icand] < nsigmacut if is_track_sel: array_is_pid_sel.append(True) else: array_is_pid_sel.append(False) return array_is_pid_sel -#@numba.njit + +# @numba.njit def selectcand_lincut(array_cut_var, minvalue, maxvalue, isabs): array_is_sel = [] for icand, _ in enumerate(array_cut_var): @@ -138,17 +165,18 @@ def selectcand_lincut(array_cut_var, minvalue, maxvalue, isabs): array_is_sel.append(False) return array_is_sel + def gethistonormforselevt(df_evt, dfevtevtsel, label): - hSelMult = TH1F('sel_' + label, 'sel_' + label, 1, -0.5, 0.5) - hNoVtxMult = TH1F('novtx_' + label, 'novtx_' + label, 1, -0.5, 0.5) - hVtxOutMult = TH1F('vtxout_' + label, 'vtxout_' + label, 1, -0.5, 0.5) + hSelMult = TH1F("sel_" + label, "sel_" + label, 1, -0.5, 0.5) + hNoVtxMult = TH1F("novtx_" + label, "novtx_" + label, 1, -0.5, 0.5) + hVtxOutMult = TH1F("vtxout_" + label, "vtxout_" + label, 1, -0.5, 0.5) - df_to_keep = filter_bit_df(df_evt, 'fIsEventReject', [[], [0, 5, 6, 10, 11]]) + df_to_keep = filter_bit_df(df_evt, "fIsEventReject", [[], [0, 5, 6, 10, 11]]) # events with reco vtx after previous selection - tag_vtx = tag_bit_df(df_to_keep, 'fIsEventReject', [[], [1, 2, 7, 12]]) + tag_vtx = tag_bit_df(df_to_keep, "fIsEventReject", [[], [1, 2, 7, 12]]) df_no_vtx = df_to_keep[tag_vtx] # events with reco zvtx > 10 cm after previous selection - df_bit_zvtx_gr10 = filter_bit_df(df_to_keep, 'fIsEventReject', [[3], [1, 2, 7, 12]]) + df_bit_zvtx_gr10 = filter_bit_df(df_to_keep, "fIsEventReject", [[3], [1, 2, 7, 12]]) hSelMult.SetBinContent(1, 
len(dfevtevtsel)) hNoVtxMult.SetBinContent(1, len(df_no_vtx)) diff --git a/machine_learning_hep/simulations/ddbar_fonll.py b/machine_learning_hep/simulations/ddbar_fonll.py index 4521034e21..a3ae52998c 100644 --- a/machine_learning_hep/simulations/ddbar_fonll.py +++ b/machine_learning_hep/simulations/ddbar_fonll.py @@ -15,14 +15,17 @@ """ preliminary studies for cross section estimation """ + from array import array + import pandas as pd -from ROOT import TCanvas, TH1F, gROOT, TLatex, gPad # pylint: disable=import-error,no-name-in-module -from machine_learning_hep.utilities import setup_histogram, draw_latex +from ROOT import TH1F, TCanvas, TLatex, gPad, gROOT # pylint: disable=import-error,no-name-in-module + +from machine_learning_hep.utilities import draw_latex, setup_histogram from machine_learning_hep.utilities_plot import load_root_style # pylint: disable=invalid-name -p_fonllband = 'max' +p_fonllband = "max" ptmin = 0 ptmax = 30 delta_pt = ptmax - ptmin @@ -40,7 +43,7 @@ eff_range = [0.01, 0.03, 0.07, 0.1, 0.15, 0.2, 0.25, 0.3] effAA_range = [0.001, 0.01, 0.04, 0.06, 0.1, 0.17, 0.18, 0.18] raa_range = [0.8, 0.7, 0.3, 0.2, 0.2, 0.2, 0.22, 0.3] -bins = array('f', pt_range) +bins = array("f", pt_range) hfonllc = TH1F("hfonllc", "", len(pt_range) - 1, bins) hfonllDtoKpi = TH1F("hfonllDtoKpi", "", len(pt_range) - 1, bins) @@ -59,65 +62,100 @@ for i, ptmin in enumerate(pt_range): if i == len(pt_range) - 1: break - ptmax = pt_range[i+1] - binwidth = pt_range[i+1] - pt_range[i] - df_fonll_in_pt = df_fonll.query('(pt >= @ptmin) and (pt < @ptmax)')[p_fonllband] - crossc = df_fonll_in_pt.sum() * 1e-12 /binwidth + ptmax = pt_range[i + 1] + binwidth = pt_range[i + 1] - pt_range[i] + df_fonll_in_pt = df_fonll.query("(pt >= @ptmin) and (pt < @ptmax)")[p_fonllband] + crossc = df_fonll_in_pt.sum() * 1e-12 / binwidth yieldc = crossc * binwidth / p_sigmamb crossDtoKpi = crossc * p_br * p_fragf yieldDtoKpi = crossc * p_br * p_fragf * binwidth / p_sigmamb yieldDtoKpirsel = crossc * p_br * p_fragf * binwidth * eff_range[i] / p_sigmamb - yieldcAA = crossc * binwidth * p_ncoll/ p_sigmamb + yieldcAA = crossc * binwidth * p_ncoll / p_sigmamb yieldDtoKpiAA = crossc * p_br * p_fragf * binwidth * p_ncoll * raa_range[i] / p_sigmamb - yieldDtoKpirselAA = crossc * p_br * p_fragf * binwidth * p_ncoll * raa_range[i] \ - * effAA_range[i] / p_sigmamb - - yieldDtoKpipairrsel = crossc * p_br * p_fragf * binwidth * eff_range[i]/ p_sigmamb \ - * p_br * p_fragf * eff_range[i] - yieldDtoKpipairrselAA = crossc * p_br * p_fragf * binwidth * p_ncoll \ - * raa_range[i] * effAA_range[i] / p_sigmamb \ - * p_br * p_fragf * raa_range[i] * effAA_range[i] - - - - hfonllc.SetBinContent(i+1, crossc) - hyieldc.SetBinContent(i+1, yieldc) - hfonllDtoKpi.SetBinContent(i+1, crossDtoKpi) - hyieldDtoKpi.SetBinContent(i+1, yieldDtoKpi) - hyieldDtoKpirsel.SetBinContent(i+1, yieldDtoKpirsel) - - hyieldcAA.SetBinContent(i+1, yieldcAA) - hyieldDtoKpiAA.SetBinContent(i+1, yieldDtoKpiAA) - hyieldDtoKpirselAA.SetBinContent(i+1, yieldDtoKpirselAA) - - - hyieldDtoKpipairrsel.SetBinContent(i+1, yieldDtoKpipairrsel) - hyieldDtoKpipairrselAA.SetBinContent(i+1, yieldDtoKpipairrselAA) + yieldDtoKpirselAA = crossc * p_br * p_fragf * binwidth * p_ncoll * raa_range[i] * effAA_range[i] / p_sigmamb + + yieldDtoKpipairrsel = crossc * p_br * p_fragf * binwidth * eff_range[i] / p_sigmamb * p_br * p_fragf * eff_range[i] + yieldDtoKpipairrselAA = ( + crossc + * p_br + * p_fragf + * binwidth + * p_ncoll + * raa_range[i] + * effAA_range[i] + / p_sigmamb + * p_br + * 
p_fragf + * raa_range[i] + * effAA_range[i] + ) + + hfonllc.SetBinContent(i + 1, crossc) + hyieldc.SetBinContent(i + 1, yieldc) + hfonllDtoKpi.SetBinContent(i + 1, crossDtoKpi) + hyieldDtoKpi.SetBinContent(i + 1, yieldDtoKpi) + hyieldDtoKpirsel.SetBinContent(i + 1, yieldDtoKpirsel) + + hyieldcAA.SetBinContent(i + 1, yieldcAA) + hyieldDtoKpiAA.SetBinContent(i + 1, yieldDtoKpiAA) + hyieldDtoKpirselAA.SetBinContent(i + 1, yieldDtoKpirselAA) + + hyieldDtoKpipairrsel.SetBinContent(i + 1, yieldDtoKpipairrsel) + hyieldDtoKpipairrselAA.SetBinContent(i + 1, yieldDtoKpipairrselAA) print("min,max", ptmin, ptmax, crossDtoKpi) load_root_style() -histo_list = [hfonllc, hyieldc, hyieldcAA, hfonllDtoKpi, - hyieldDtoKpi, hyieldDtoKpirsel, hyieldDtoKpiAA, - hyieldDtoKpirselAA, hyieldDtoKpipairrsel, hyieldDtoKpipairrselAA] +histo_list = [ + hfonllc, + hyieldc, + hyieldcAA, + hfonllDtoKpi, + hyieldDtoKpi, + hyieldDtoKpirsel, + hyieldDtoKpiAA, + hyieldDtoKpirselAA, + hyieldDtoKpipairrsel, + hyieldDtoKpipairrselAA, +] min_list = [1e-8, 1e-8, 1e-8, 1e-8, 1e-8, 1e-8, 1e-8, 1e-8, 1e-14, 1e-14] max_list = [1e3, 1e3, 1e3, 1e3, 1e3, 1e3, 1e3, 1e3, 1e-5, 1e-5] -xaxis_list = ["p_{T} (GeV)", "p_{T} (GeV)", "p_{T} (GeV)", \ - "p_{T} (GeV)", "p_{T} (GeV)", "p_{T} (GeV)", - "p_{T} (GeV)", "p_{T} (GeV)", "p_{T} (GeV)", "p_{T} (GeV)"] -yaxis_list = ["d#sigma/dp_{T} (b/GeV)", "Counts", "Counts", \ - "d#sigma/dp_{T} (b/GeV)", "Counts", "Counts", - "Counts", "Counts", "Counts", "Counts"] -text_list = ["c-quark production cross section", - "Average number of c quarks per event pp", - "Average number of c quarks per event PbPb", - "D^{0} #rightarrow K#pi (BR included) in pp", - "Average number of D^{0} per event pp", - "Average number of D^{0} per event pp recosel", - "Average number of D^{0} per event PbPb", - "Average number of D^{0} per event PbPb recosel", - "Average number of D^{0}-D^{0}bar pair per event pp recosel", - "Average number of D^{0}-D^{0}bar pair per event AA recosel"] +xaxis_list = [ + "p_{T} (GeV)", + "p_{T} (GeV)", + "p_{T} (GeV)", + "p_{T} (GeV)", + "p_{T} (GeV)", + "p_{T} (GeV)", + "p_{T} (GeV)", + "p_{T} (GeV)", + "p_{T} (GeV)", + "p_{T} (GeV)", +] +yaxis_list = [ + "d#sigma/dp_{T} (b/GeV)", + "Counts", + "Counts", + "d#sigma/dp_{T} (b/GeV)", + "Counts", + "Counts", + "Counts", + "Counts", + "Counts", + "Counts", +] +text_list = [ + "c-quark production cross section", + "Average number of c quarks per event pp", + "Average number of c quarks per event PbPb", + "D^{0} #rightarrow K#pi (BR included) in pp", + "Average number of D^{0} per event pp", + "Average number of D^{0} per event pp recosel", + "Average number of D^{0} per event PbPb", + "Average number of D^{0} per event PbPb recosel", + "Average number of D^{0}-D^{0}bar pair per event pp recosel", + "Average number of D^{0}-D^{0}bar pair per event AA recosel", +] list_latex = [] c = TCanvas("canvas", "canvas", 3000, 2000) c.Divide(4, 3) @@ -145,25 +183,37 @@ hyieldDtoKpipairrselAA50B = hyieldDtoKpipairrselAA.Clone("hyieldDtoKpipairrselAA50B") hyieldDtoKpipairrselAA2500B = hyieldDtoKpipairrselAA.Clone("hyieldDtoKpipairrselAA2500B") -histo_list_est = [hyieldDtoKpirsel2B, hyieldDtoKpirselAA100M, - hyieldDtoKpipairrsel2B, hyieldDtoKpipairrsel200B, - hyieldDtoKpipairrselAA100M, hyieldDtoKpipairrselAA50B, - hyieldDtoKpipairrselAA2500B] +histo_list_est = [ + hyieldDtoKpirsel2B, + hyieldDtoKpirselAA100M, + hyieldDtoKpipairrsel2B, + hyieldDtoKpipairrsel200B, + hyieldDtoKpipairrselAA100M, + hyieldDtoKpipairrselAA50B, + hyieldDtoKpipairrselAA2500B, +] 
 min_list_est = [1e-8, 1e-8, 1e-8, 1e-8, 1e-8, 1e-8, 1e-8]
 max_list_est = [1e10, 1e10, 1e10, 1e10, 1e10, 1e10, 1e10]
-xaxis_list_est = ["p_{T} (GeV)", "p_{T} (GeV)", "p_{T} (GeV)", \
-                  "p_{T} (GeV)", "p_{T} (GeV)",
-                  "p_{T} (GeV)", "p_{T} (GeV)"]
-yaxis_list_est = ["Counts", "Counts", "Counts", "Counts",
-                  "Counts", "Counts", "Counts"]
-text_list_est = ["D^{0} pp recosel 2B",
-                 "D^{0} AA recosel 100M",
-                 "D^{0}-D^{0}bar pairs pp recosel 2B",
-                 "D^{0}-D^{0}bar pairs pp recosel 200B",
-                 "D^{0}-D^{0}bar pairs AA recosel 100M",
-                 "D^{0}-D^{0}bar pairs AA recosel 50B",
-                 "D^{0}-D^{0}bar pairs AA recosel 2500B"]
-nevents_list_ext = [2e9, 100*1e6, 2e9, 200*2e9, 100*1e6, 50*1e9, 2500*1e9]
+xaxis_list_est = [
+    "p_{T} (GeV)",
+    "p_{T} (GeV)",
+    "p_{T} (GeV)",
+    "p_{T} (GeV)",
+    "p_{T} (GeV)",
+    "p_{T} (GeV)",
+    "p_{T} (GeV)",
+]
+yaxis_list_est = ["Counts", "Counts", "Counts", "Counts", "Counts", "Counts", "Counts"]
+text_list_est = [
+    "D^{0} pp recosel 2B",
+    "D^{0} AA recosel 100M",
+    "D^{0}-D^{0}bar pairs pp recosel 2B",
+    "D^{0}-D^{0}bar pairs pp recosel 200B",
+    "D^{0}-D^{0}bar pairs AA recosel 100M",
+    "D^{0}-D^{0}bar pairs AA recosel 50B",
+    "D^{0}-D^{0}bar pairs AA recosel 2500B",
+]
+nevents_list_ext = [2e9, 100 * 1e6, 2e9, 200 * 2e9, 100 * 1e6, 50 * 1e9, 2500 * 1e9]
 for ihisto, _ in enumerate(histo_list_est):
     histo_list_est[ihisto].Scale(nevents_list_ext[ihisto])
diff --git a/machine_learning_hep/simulations/sigmann.py b/machine_learning_hep/simulations/sigmann.py
index bf65aa591e..d62a2a6a23 100644
--- a/machine_learning_hep/simulations/sigmann.py
+++ b/machine_learning_hep/simulations/sigmann.py
@@ -12,36 +12,61 @@
 ## along with this program. if not, see . ##
 #############################################################################
 from array import array
-from ROOT import TCanvas, TFile, gROOT, TLatex, gPad # pylint: disable=import-error,no-name-in-module
-from ROOT import TGraphErrors, TF1, TLegend # pylint: disable=import-error,no-name-in-module
-import ROOT # pylint: disable=import-error,no-name-in-module
+
+import ROOT  # pylint: disable=import-error,no-name-in-module
+from ROOT import (  # pylint: disable=import-error,no-name-in-module
+    TF1,
+    TCanvas,
+    TFile,
+    TGraphErrors,
+    TLatex,
+    TLegend,
+    gPad,
+    gROOT,
+)

 gROOT.SetBatch(True)

 # pylint: disable=invalid-name
-energy = [0.20, 0.90, 2.76, 5.02, 5.44, 5.50, 7.00, 8.00, 8.16, 8.80, 10.60,
-          13.00, 14.00, 17., 27., 39., 63., 100.]
+energy = [
+    0.20,
+    0.90,
+    2.76,
+    5.02,
+    5.44,
+    5.50,
+    7.00,
+    8.00,
+    8.16,
+    8.80,
+    10.60,
+    13.00,
+    14.00,
+    17.0,
+    27.0,
+    39.0,
+    63.0,
+    100.0,
+]
 npoints = len(energy)
-errorenergy = [0.] * npoints
-sigmann = [41.6, 52.2, 61.8, 67.6, 68.4, 68.5, 70.9, 72.3, 72.5, 73.3, 75.3,
-           77.6, 78.4, 80.6, 86.0, 90.5, 96.5, 102.6]
-errorsigmann = [0.6, 1.0, 0.9, 0.6, 0.5, 0.5, 0.4, 0.5, 0.5, 0.6, 0.7,
-                1.0, 1.1, 1.5, 2.4, 3.3, 4.6, 6.0]
+errorenergy = [0.0] * npoints
+sigmann = [41.6, 52.2, 61.8, 67.6, 68.4, 68.5, 70.9, 72.3, 72.5, 73.3, 75.3, 77.6, 78.4, 80.6, 86.0, 90.5, 96.5, 102.6]
+errorsigmann = [0.6, 1.0, 0.9, 0.6, 0.5, 0.5, 0.4, 0.5, 0.5, 0.6, 0.7, 1.0, 1.1, 1.5, 2.4, 3.3, 4.6, 6.0]

 energyrun5 = [7.0, 6.3, 7.0, 6.46, 5.86, 5.52]
-dndetaperpairrun5 = [10., 10., 10., 10., 10., 10.]
+dndetaperpairrun5 = [10.0, 10.0, 10.0, 10.0, 10.0, 10.0] speciesrun5 = ["O", "Ar", "Ca", "Kr", "Xe", "Pb"] colorrun5 = [2, 4, 3, 6, 8, 19] -npartrun5 = [11.1, 24.3, 24.2, 42, 71.2, 113.7] #KRIPTON VALUE 42, IS APPROX -dndeta_points = [0., 1., 2., 3., 4., -5] +npartrun5 = [11.1, 24.3, 24.2, 42, 71.2, 113.7] # KRIPTON VALUE 42, IS APPROX +dndeta_points = [0.0, 1.0, 2.0, 3.0, 4.0, -5] dndeta_points_min = -4 dndeta_points_max = +4 -energy_ = array('f', energy) -errorenergy_ = array('f', errorenergy) -sigmann_ = array('f', sigmann) -errorsigmann_ = array('f', errorsigmann) +energy_ = array("f", energy) +errorenergy_ = array("f", errorenergy) +sigmann_ = array("f", sigmann) +errorsigmann_ = array("f", errorsigmann) c1 = TCanvas("c1", "A Simple Graph with error bars", 200, 10, 700, 500) @@ -54,7 +79,7 @@ latex.SetNDC() latex.SetTextSize(0.03) latex.Draw() -f1 = TF1("f1", "[0]+[1]*log(x)+[2]*x*x+[3]*x", 0.2, 27.) +f1 = TF1("f1", "[0]+[1]*log(x)+[2]*x*x+[3]*x", 0.2, 27.0) gsigma_nn.Fit("f1", "R") c1.SaveAs("sigmavsenergy.pdf") @@ -73,9 +98,9 @@ errdndeta_list = [] for ip in range(npoint): - etaval = ROOT.Double(0.) - dndeta = ROOT.Double(0.) - errdndeta = ROOT.Double(0.) + etaval = ROOT.Double(0.0) + dndeta = ROOT.Double(0.0) + errdndeta = ROOT.Double(0.0) graphpbpb05.GetPoint(ip, etaval, dndeta) errdndeta = graphpbpb05.GetErrorY(ip) etaval_list.append(etaval) @@ -95,18 +120,22 @@ print(etaval_list_o) c2 = TCanvas("c2", "A Simple Graph with error bars", 200, 10, 700, 500) -erretaval_list_d = array('f', [0.] * len(etaval_list_o)) -etaval_list_d = array('f', etaval_list_o) -dndeta_list_d = array('f', dndeta_list_o) -errdndeta_list_d = array('f', errdndeta_list_o) -graphpbpb05_sym = TGraphErrors(len(etaval_list_o), etaval_list_d, dndeta_list_d, \ - erretaval_list_d, errdndeta_list_d) +erretaval_list_d = array("f", [0.0] * len(etaval_list_o)) +etaval_list_d = array("f", etaval_list_o) +dndeta_list_d = array("f", dndeta_list_o) +errdndeta_list_d = array("f", errdndeta_list_o) +graphpbpb05_sym = TGraphErrors(len(etaval_list_o), etaval_list_d, dndeta_list_d, erretaval_list_d, errdndeta_list_d) graphpbpb05_sym.SetTitle(";#eta;dN^{ch}/d#eta;") graphpbpb05_sym.GetXaxis().SetTitleOffset(1.2) graphpbpb05_sym.Draw("ALP") -fpbpb05 = TF1("f2", "([0]+[1]*x*x+[2]*x*x*x*x + \ +fpbpb05 = TF1( + "f2", + "([0]+[1]*x*x+[2]*x*x*x*x + \ [3]/([4]*sqrt(2*3.14))*exp(-((x-[5])/(2*[4]))^2) + \ - [6]/([7]*sqrt(2*3.14))*exp(-((x-[8])/(2*[7]))^2))", -5, 5.) + [6]/([7]*sqrt(2*3.14))*exp(-((x-[8])/(2*[7]))^2))", + -5, + 5.0, +) fpbpb05.SetParameter(5, -1) fpbpb05.SetParameter(4, 1) fpbpb05.SetParameter(8, 1) @@ -122,7 +151,7 @@ f = TFile.Open("dndeta_run5.root", "recreate") fpbpb05_norm = fpbpb05.Clone("fpbpb05_norm") -scalefactor = 1./fpbpb05_norm.Eval(0.) +scalefactor = 1.0 / fpbpb05_norm.Eval(0.0) fpbpb05_norm.FixParameter(0, fpbpb05_norm.GetParameter(0) * scalefactor) fpbpb05_norm.FixParameter(1, fpbpb05_norm.GetParameter(1) * scalefactor) fpbpb05_norm.FixParameter(2, fpbpb05_norm.GetParameter(2) * scalefactor) @@ -130,45 +159,37 @@ fpbpb05_norm.FixParameter(6, fpbpb05_norm.GetParameter(6) * scalefactor) for index, etap in enumerate(dndeta_points): print("dndeta norm at eta=%f" % etap + ", val =%.2f" % fpbpb05_norm.Eval(etap)) -print("dndeta at -4 0: - logger.info('existing directories must be deleted') + logger.info("existing directories must be deleted") for d in exdirs: - print(f'rm -rf {d}') + print(f"rm -rf {d}") delete = False if args.delete: - ok = input('Do you want to delete these directories now (y/n)? 
') - delete = ok.lower() == 'y' + ok = input("Do you want to delete these directories now (y/n)? ") + delete = ok.lower() == "y" if args.delete_force: delete = True if delete: @@ -258,48 +262,44 @@ def mlhepmod(name): return importlib.import_module(f"..{name}", __name__) import ROOT # pylint: disable=import-outside-toplevel, import-error - ROOT.gROOT.SetBatch(args.batch) # pylint: disable=no-member - ROOT.TDirectory.AddDirectory(False) # pylint: disable=no-member + + ROOT.gROOT.SetBatch(args.batch) # pylint: disable=no-member + ROOT.TDirectory.AddDirectory(False) # pylint: disable=no-member ROOT.TH1.AddDirectory(False) - ROOT.gErrorIgnoreLevel = ROOT.kWarning # pylint: disable=no-member - from machine_learning_hep.multiprocesser import \ - MultiProcesser # pylint: disable=import-outside-toplevel - syst_class = mlhepmod('analysis.systematics').SystematicsMLWP + ROOT.gErrorIgnoreLevel = ROOT.kWarning # pylint: disable=no-member + from machine_learning_hep.multiprocesser import MultiProcesser # pylint: disable=import-outside-toplevel + + syst_class = mlhepmod("analysis.systematics").SystematicsMLWP if proc_type == "Dhadrons": - proc_class = mlhepmod('processerdhadrons').ProcesserDhadrons - ana_class = mlhepmod('analysis.analyzerdhadrons').AnalyzerDhadrons + proc_class = mlhepmod("processerdhadrons").ProcesserDhadrons + ana_class = mlhepmod("analysis.analyzerdhadrons").AnalyzerDhadrons elif proc_type == "Dhadrons_mult": - proc_class = mlhepmod('processerdhadrons_mult').ProcesserDhadrons_mult - ana_class = mlhepmod('analysis.analyzerdhadrons_mult').AnalyzerDhadrons_mult + proc_class = mlhepmod("processerdhadrons_mult").ProcesserDhadrons_mult + ana_class = mlhepmod("analysis.analyzerdhadrons_mult").AnalyzerDhadrons_mult elif proc_type == "Dhadrons_jet": - proc_class = mlhepmod('processerdhadrons_jet').ProcesserDhadrons_jet - ana_class = mlhepmod('analysis.analyzer_jet').AnalyzerJet + proc_class = mlhepmod("processerdhadrons_jet").ProcesserDhadrons_jet + ana_class = mlhepmod("analysis.analyzer_jet").AnalyzerJet elif proc_type == "Jets": proc_class = mlhepmod("processer_jet").ProcesserJets ana_class = mlhepmod("analysis.analyzer_jets").AnalyzerJets else: - proc_class = mlhepmod('processer').Processer - ana_class = mlhepmod('analysis.analyzer').Analyzer + proc_class = mlhepmod("processer").Processer + ana_class = mlhepmod("analysis.analyzer").Analyzer - mymultiprocessmc = MultiProcesser( - case, proc_class, data_param[case], typean, run_param, "mc") - mymultiprocessdata = MultiProcesser( - case, proc_class, data_param[case], typean, run_param, "data") + mymultiprocessmc = MultiProcesser(case, proc_class, data_param[case], typean, run_param, "mc") + mymultiprocessdata = MultiProcesser(case, proc_class, data_param[case], typean, run_param, "data") ana_mgr = AnalyzerManager(ana_class, data_param[case], case, typean, doanaperperiod) analyzers = ana_mgr.get_analyzers() # For ML WP systematics if mltype == "MultiClassification": - syst_ml_pt_cl0 = syst_class(data_param[case], case, typean, analyzers, - mymultiprocessmc, mymultiprocessdata, 0) - syst_ml_pt_cl1 = syst_class(data_param[case], case, typean, analyzers, - mymultiprocessmc, mymultiprocessdata, 1) + syst_ml_pt_cl0 = syst_class(data_param[case], case, typean, analyzers, mymultiprocessmc, mymultiprocessdata, 0) + syst_ml_pt_cl1 = syst_class(data_param[case], case, typean, analyzers, mymultiprocessmc, mymultiprocessdata, 1) else: - syst_ml_pt = syst_class(data_param[case], case, typean, analyzers, - mymultiprocessmc, mymultiprocessdata) + 
syst_ml_pt = syst_class(data_param[case], case, typean, analyzers, mymultiprocessmc, mymultiprocessdata) - #perform the analysis flow + # perform the analysis flow if dodownloadalice: subprocess.call("../cplusutilities/Download.sh") @@ -328,12 +328,21 @@ def mlhepmod(name): mymultiprocessdata.multi_mergeml_allinone() if doml: - from machine_learning_hep.optimiser import \ - Optimiser # pylint: disable=import-outside-toplevel + from machine_learning_hep.optimiser import Optimiser # pylint: disable=import-outside-toplevel + for index, (binmin, binmax) in enumerate(zip(binminarray, binmaxarray)): - myopt = Optimiser(data_param[case], case, typean, - data_model[mltype], binmin, binmax, multbkg[index], - raahp[index], training_vars[index], index) + myopt = Optimiser( + data_param[case], + case, + typean, + data_model[mltype], + binmin, + binmax, + multbkg[index], + raahp[index], + training_vars[index], + index, + ) if docorrelation: myopt.do_corr() if dotraining: @@ -398,7 +407,7 @@ def mlhepmod(name): # Collect all desired analysis steps analyze_steps = [] - for step in data_config["analysis"].get('steps', []) or []: + for step in data_config["analysis"].get("steps", []) or []: if step not in analyze_steps: analyze_steps.append(step) @@ -423,6 +432,7 @@ def mlhepmod(name): logger.info("Done") + def load_config(user_path: str, default_path=None) -> dict: """ Quickly extract either configuration given by user and fall back to package default if no user @@ -440,13 +450,14 @@ def load_config(user_path: str, default_path=None) -> dict: if not os.path.exists(user_path): get_logger().fatal("The file %s does not exist", user_path) sys.exit(-1) - with open(user_path, 'r', encoding='utf-8') as stream: + with open(user_path, "r", encoding="utf-8") as stream: cfg = yaml.safe_load(stream) else: res = importlib.resources.files(default_path[0]).joinpath(default_path[1]).read_bytes() cfg = yaml.safe_load(res) return cfg + def main(args=None): """ This is used as the entry point for ml-analysis. 
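# Hedged usage sketch for load_config above; the default_path pair shown here is
# hypothetical (the real pairs are supplied by the callers in main() and are not
# visible in this hunk):
run_config = load_config("my_run_config.yml")  # user file, parsed with yaml.safe_load
db_models = load_config(None, ("machine_learning_hep.data", "config_model_parameters.yml"))
# A missing user file is fatal (get_logger().fatal + sys.exit(-1)); only an empty
# user_path falls back to the packaged default read via importlib.resources.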
@@ -455,26 +466,19 @@ def main(args=None): parser = argparse.ArgumentParser() parser.add_argument("--debug", action="store_true", help="activate debug log level") - parser.add_argument("--quiet", '-q', action="store_true", help="quiet logging") + parser.add_argument("--quiet", "-q", action="store_true", help="quiet logging") parser.add_argument("--log-file", dest="log_file", help="file to print the log to") - parser.add_argument("--run-config", "-r", dest="run_config", - help="the run configuration to be used") - parser.add_argument("--database-analysis", "-d", dest="database_analysis", - help="analysis database to be used", required=True) - parser.add_argument("--database-overwrite", dest="database_overwrite", - help="overwrite fields in analysis database") - parser.add_argument("--database-ml-models", dest="database_ml_models", - help="ml model database to be used") - parser.add_argument("--database-run-list", dest="database_run_list", - help="run list database to be used") - parser.add_argument("--analysis", "-a", dest="type_ana", - help="choose type of analysis") - parser.add_argument("--clean", "-c", action="store_true", - help="delete per-period results at the end") - parser.add_argument("--delete", action="store_true", - help="delete existing directories") - parser.add_argument("--delete-force", action="store_true", - help="delete existing directories without asking") + parser.add_argument("--run-config", "-r", dest="run_config", help="the run configuration to be used") + parser.add_argument( + "--database-analysis", "-d", dest="database_analysis", help="analysis database to be used", required=True + ) + parser.add_argument("--database-overwrite", dest="database_overwrite", help="overwrite fields in analysis database") + parser.add_argument("--database-ml-models", dest="database_ml_models", help="ml model database to be used") + parser.add_argument("--database-run-list", dest="database_run_list", help="run list database to be used") + parser.add_argument("--analysis", "-a", dest="type_ana", help="choose type of analysis") + parser.add_argument("--clean", "-c", action="store_true", help="delete per-period results at the end") + parser.add_argument("--delete", action="store_true", help="delete existing directories") + parser.add_argument("--delete-force", action="store_true", help="delete existing directories without asking") parser.add_argument("--batch", "-b", action="store_true", help="enable ROOT batch mode") args = parser.parse_args(args) @@ -498,5 +502,6 @@ def main(args=None): # Run the chain do_entire_analysis(run_config, db_analysis, db_analysis_overwrite, db_ml_models, db_run_list, args) -if __name__ == '__main__': + +if __name__ == "__main__": main() diff --git a/machine_learning_hep/templates_keras.py b/machine_learning_hep/templates_keras.py index bec5d92359..ca6a4e4df2 100644 --- a/machine_learning_hep/templates_keras.py +++ b/machine_learning_hep/templates_keras.py @@ -14,12 +14,11 @@ from copy import deepcopy -from keras.layers import Input, Dense -from keras.models import Model -from keras.wrappers.scikit_learn import KerasClassifier - from hyperopt import hp from hyperopt.pyll import scope +from keras.layers import Dense, Input +from keras.models import Model +from keras.wrappers.scikit_learn import KerasClassifier from machine_learning_hep.optimisation.bayesian_opt import BayesianOpt from machine_learning_hep.optimisation.metrics import get_scorers @@ -31,39 +30,44 @@ def keras_classifier_(model_config, input_length): """ # Create layers inputs = 
Input(shape=(input_length,)) - layer = Dense(model_config["layers"][0]["n_nodes"], - activation=model_config["layers"][0]["activation"])(inputs) - predictions = Dense(1, activation='sigmoid')(layer) + layer = Dense(model_config["layers"][0]["n_nodes"], activation=model_config["layers"][0]["activation"])(inputs) + predictions = Dense(1, activation="sigmoid")(layer) # Build model from layers model = Model(inputs=inputs, outputs=predictions) - model.compile(loss=model_config["loss"], optimizer=model_config["optimizer"], - metrics=['accuracy']) + model.compile(loss=model_config["loss"], optimizer=model_config["optimizer"], metrics=["accuracy"]) return model def keras_classifier(model_config, input_length): - return KerasClassifier(build_fn=lambda: \ - keras_classifier_(model_config, input_length), \ - epochs=model_config["epochs"], \ - batch_size=model_config["batch_size"], \ - verbose=1) + return KerasClassifier( + build_fn=lambda: keras_classifier_(model_config, input_length), + epochs=model_config["epochs"], + batch_size=model_config["batch_size"], + verbose=1, + ) def keras_classifier_bayesian_space(): - return {"n_nodes": hp.choice("x_n_nodes", [[scope.int(hp.quniform("x_n_nodes_1", 12, 64, 1)), - scope.int(hp.quniform("x_n_nodes_2", 12, 64, 1))], - [scope.int(hp.quniform("x_n_nodes_1", 12, 64, 1)), - scope.int(hp.quniform("x_n_nodes_2", 12, 64, 1)), - scope.int(hp.quniform("x_n_nodes_3", 12, 64, 1))]]), - "activation_0": hp.choice("x_activation_0", ["relu", "sigmoid"]), - "activation_1": hp.choice("x_activation_1", ["relu", "sigmoid"]), - "epochs": scope.int(hp.quniform("x_epochs", 50, 100, 1)), - "batch_size": scope.int(hp.quniform("x_batch_size", 28, 256, 1))} - - -class KerasClassifierBayesianOpt(BayesianOpt): # pylint: disable=too-many-instance-attributes - - + return { + "n_nodes": hp.choice( + "x_n_nodes", + [ + [scope.int(hp.quniform("x_n_nodes_1", 12, 64, 1)), scope.int(hp.quniform("x_n_nodes_2", 12, 64, 1))], + [ + scope.int(hp.quniform("x_n_nodes_1", 12, 64, 1)), + scope.int(hp.quniform("x_n_nodes_2", 12, 64, 1)), + scope.int(hp.quniform("x_n_nodes_3", 12, 64, 1)), + ], + ], + ), + "activation_0": hp.choice("x_activation_0", ["relu", "sigmoid"]), + "activation_1": hp.choice("x_activation_1", ["relu", "sigmoid"]), + "epochs": scope.int(hp.quniform("x_epochs", 50, 100, 1)), + "batch_size": scope.int(hp.quniform("x_batch_size", 28, 256, 1)), + } + + +class KerasClassifierBayesianOpt(BayesianOpt): # pylint: disable=too-many-instance-attributes def __init__(self, model_config, space, input_length): super().__init__(model_config, space) self.input_length = input_length @@ -72,7 +76,6 @@ def __init__(self, model_config, space, input_length): self.model_config_tmp = None self.space_tmp = None - def get_scikit_model(self): """Just a helper funtion @@ -80,37 +83,31 @@ def get_scikit_model(self): """ inputs = Input(shape=(self.input_length,)) - layer = Dense(self.space_tmp["n_nodes"][0], - activation=self.space_tmp["activation_0"])(inputs) + layer = Dense(self.space_tmp["n_nodes"][0], activation=self.space_tmp["activation_0"])(inputs) for i, n_nodes in enumerate(self.space_tmp["n_nodes"][1:]): - layer = Dense(n_nodes, - activation=self.space_tmp[f"activation_{(i+1)%2}"])(layer) - predictions = Dense(1, activation='sigmoid')(layer) + layer = Dense(n_nodes, activation=self.space_tmp[f"activation_{(i + 1) % 2}"])(layer) + predictions = Dense(1, activation="sigmoid")(layer) # Build model from layers model = Model(inputs=inputs, outputs=predictions) - 
model.compile(loss=self.model_config_tmp["loss"], - optimizer=self.model_config_tmp["optimizer"], - metrics=['accuracy']) + model.compile( + loss=self.model_config_tmp["loss"], optimizer=self.model_config_tmp["optimizer"], metrics=["accuracy"] + ) return model - def yield_model_(self, model_config, space): - self.space_tmp = deepcopy(space) self.model_config_tmp = deepcopy(model_config) - return KerasClassifier(build_fn=self.get_scikit_model, epochs=space["epochs"], - batch_size=space["batch_size"], verbose=1), space - + return KerasClassifier( + build_fn=self.get_scikit_model, epochs=space["epochs"], batch_size=space["batch_size"], verbose=1 + ), space def save_model_(self, model, out_dir): - """Not implemented yet - """ + """Not implemented yet""" def keras_classifier_bayesian_opt(model_config, input_length): - bayesian_opt = KerasClassifierBayesianOpt(model_config, keras_classifier_bayesian_space(), - input_length) + bayesian_opt = KerasClassifierBayesianOpt(model_config, keras_classifier_bayesian_space(), input_length) bayesian_opt.nkfolds = 3 bayesian_opt.scoring = get_scorers(["AUC", "Accuracy"]) bayesian_opt.scoring_opt = "AUC" diff --git a/machine_learning_hep/templates_scikit.py b/machine_learning_hep/templates_scikit.py index 1e7504e330..3ebfb56df8 100644 --- a/machine_learning_hep/templates_scikit.py +++ b/machine_learning_hep/templates_scikit.py @@ -12,18 +12,20 @@ ## along with this program. if not, see . ## ############################################################################# -from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier +from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier +from sklearn.linear_model import Lasso, LinearRegression, Ridge from sklearn.tree import DecisionTreeClassifier -from sklearn.linear_model import LinearRegression, Ridge, Lasso def scikit_random_forest_classifier(model_config): - return RandomForestClassifier(max_depth=model_config["max_depth"], - n_estimators=model_config["n_estimators"], - max_features=model_config["max_features"]) + return RandomForestClassifier( + max_depth=model_config["max_depth"], + n_estimators=model_config["n_estimators"], + max_features=model_config["max_features"], + ) -def scikit_adaboost_classifier(model_config): # pylint: disable=W0613 +def scikit_adaboost_classifier(model_config): # pylint: disable=W0613 return AdaBoostClassifier() @@ -31,7 +33,7 @@ def scikit_decision_tree_classifier(model_config): return DecisionTreeClassifier(max_depth=model_config["max_depth"]) -def scikit_linear_regression(model_config): # pylint: disable=W0613 +def scikit_linear_regression(model_config): # pylint: disable=W0613 return LinearRegression() diff --git a/machine_learning_hep/templates_xgboost.py b/machine_learning_hep/templates_xgboost.py index 79eccef21c..4844e475ed 100644 --- a/machine_learning_hep/templates_xgboost.py +++ b/machine_learning_hep/templates_xgboost.py @@ -12,50 +12,51 @@ ## along with this program. if not, see . 
## ############################################################################# -from os.path import join - import pickle +from os.path import join -from xgboost import XGBClassifier from hyperopt import hp from hyperopt.pyll import scope +from xgboost import XGBClassifier from machine_learning_hep.optimisation.bayesian_opt import BayesianOpt from machine_learning_hep.optimisation.metrics import get_scorers -def xgboost_classifier(model_config): # pylint: disable=W0613 - return XGBClassifier(verbosity=1, - # n_gpus=0, - **model_config) + +def xgboost_classifier(model_config): # pylint: disable=W0613 + return XGBClassifier( + verbosity=1, + # n_gpus=0, + **model_config, + ) def xgboost_classifier_bayesian_space(): - return {"max_depth": scope.int(hp.quniform("x_max_depth", 1, 3, 1)), - "n_estimators": scope.int(hp.quniform("x_n_estimators", 100, 1000, 1)), - "min_child_weight": scope.int(hp.quniform("x_min_child", 1, 10, 1)), - "subsample": hp.uniform("x_subsample", 0.5, 0.9), - "gamma": hp.uniform("x_gamma", 0.0, 0.2), - "colsample_bytree": hp.uniform("x_colsample_bytree", 0.5, 1.), - "colsample_bylevel": hp.uniform("x_colsample_bylevel", 0.5, 1.), - "colsample_bynode": hp.uniform("x_colsample_bynode", 0.5, 1.), - #"max_delta_step": scope.int(hp.quniform("x_max_delta_step", 0, 8, 1)), - "reg_lambda": hp.uniform("x_reg_lambda", 0, 1), - "reg_alpha": hp.uniform("x_reg_alpha", 0, 1), - "learning_rate": hp.uniform("x_learning_rate", 0.01, 0.5)} + return { + "max_depth": scope.int(hp.quniform("x_max_depth", 1, 3, 1)), + "n_estimators": scope.int(hp.quniform("x_n_estimators", 100, 1000, 1)), + "min_child_weight": scope.int(hp.quniform("x_min_child", 1, 10, 1)), + "subsample": hp.uniform("x_subsample", 0.5, 0.9), + "gamma": hp.uniform("x_gamma", 0.0, 0.2), + "colsample_bytree": hp.uniform("x_colsample_bytree", 0.5, 1.0), + "colsample_bylevel": hp.uniform("x_colsample_bylevel", 0.5, 1.0), + "colsample_bynode": hp.uniform("x_colsample_bynode", 0.5, 1.0), + # "max_delta_step": scope.int(hp.quniform("x_max_delta_step", 0, 8, 1)), + "reg_lambda": hp.uniform("x_reg_lambda", 0, 1), + "reg_alpha": hp.uniform("x_reg_alpha", 0, 1), + "learning_rate": hp.uniform("x_learning_rate", 0.01, 0.5), + } class XGBoostClassifierBayesianOpt(BayesianOpt): - - def yield_model_(self, model_config, space): config = self.next_params(space) config["early_stopping_rounds"] = 10 return xgboost_classifier(config), config - def save_model_(self, model, out_dir): out_filename = join(out_dir, "xgboost_classifier.sav") - with open(out_filename, 'wb') as outfile: + with open(out_filename, "wb") as outfile: pickle.dump(model, outfile, protocol=4) out_filename = join(out_dir, "xgboost_classifier.model") model.save_model(out_filename) diff --git a/machine_learning_hep/utilities.py b/machine_learning_hep/utilities.py index 5414321a51..2c47a3ac7a 100644 --- a/machine_learning_hep/utilities.py +++ b/machine_learning_hep/utilities.py @@ -14,6 +14,7 @@ Script containing all helper functions e.g. processing files, creating objects, calculating physical quantities. """ + import bz2 import gzip import lzma @@ -209,6 +210,7 @@ def seldf_singlevar(dataframe, var, minval, maxval): """ return dataframe.loc[(dataframe[var] >= minval) & (dataframe[var] < maxval)] if var is not None else dataframe + def seldf_singlevar_inclusive(dataframe, var, minval, maxval): """ Make projection on variable using [X,Y), e.g. 
pT or multiplicity diff --git a/machine_learning_hep/utilities_plot.py b/machine_learning_hep/utilities_plot.py index 274e1e2e29..163777bc1f 100644 --- a/machine_learning_hep/utilities_plot.py +++ b/machine_learning_hep/utilities_plot.py @@ -18,19 +18,41 @@ Script also contains the "class Errors", used for systematic uncertainties (to replace AliHFSystErr from AliPhysics). """ + # pylint: disable=too-many-lines -from array import array import math -import numpy as np +from array import array + import matplotlib.pyplot as plt +import numpy as np + # from root_numpy import fill_hist # pylint: disable=import-error, no-name-in-module # pylint: disable=import-error, no-name-in-module -from ROOT import TH1F, TH2F, TH2, TFile, TH1, TH3F, TGraphAsymmErrors -from ROOT import TPad, TCanvas, TLegend, kBlack, kGreen, kRed, kBlue, kWhite -from ROOT import gStyle, gROOT, TMatrixD -from machine_learning_hep.io import parse_yaml, dump_yaml_from_dict +from ROOT import ( + TH1, + TH1F, + TH2, + TH2F, + TH3F, + TCanvas, + TFile, + TGraphAsymmErrors, + TLegend, + TMatrixD, + TPad, + gROOT, + gStyle, + kBlack, + kBlue, + kGreen, + kRed, + kWhite, +) + +from machine_learning_hep.io import dump_yaml_from_dict, parse_yaml from machine_learning_hep.logger import get_logger + def prepare_fig(plot_count): """ Prepare figure for ML optimiser plots @@ -44,13 +66,15 @@ def prepare_fig(plot_count): figure.subplots_adjust(hspace=0.5) return figure, nrows, ncols + def buildarray(listnumber): """ Build an array out of a list, useful for histogram binning """ - arraynumber = array('d', listnumber) + arraynumber = array("d", listnumber) return arraynumber + def buildbinning(nbinsx, xlow, xup): """ Build a list for binning out of bin limits and number of bins @@ -58,13 +82,16 @@ def buildbinning(nbinsx, xlow, xup): listnumber = [xlow + (xup - xlow) / nbinsx * i for i in range(nbinsx + 1)] return buildarray(listnumber) + def buildhisto(h_name, h_tit, arrayx, arrayy=None, arrayz=None): """ Create a histogram of size 1D, 2D, 3D, depending on the number of arguments given """ histo = None + def binning(binning_array): return len(binning_array) - 1, binning_array + if arrayz: histo = TH3F(h_name, h_tit, *binning(arrayx), *binning(arrayy), *binning(arrayz)) elif arrayy: @@ -74,7 +101,8 @@ def binning(binning_array): histo.Sumw2() return histo -#def makefill1dhist(df_, h_name, h_tit, arrayx, nvar1): + +# def makefill1dhist(df_, h_name, h_tit, arrayx, nvar1): # """ # Create a TH1F histogram and fill it with one variables from a dataframe. # """ @@ -82,13 +110,15 @@ def binning(binning_array): # fill_hist(histo, df_[nvar1]) # return histo + def build2dhisto(titlehist, arrayx, arrayy): """ Create a TH2 histogram from two axis arrays. """ return buildhisto(titlehist, titlehist, arrayx, arrayy) -#def makefill2dhist(df_, titlehist, arrayx, arrayy, nvar1, nvar2): + +# def makefill2dhist(df_, titlehist, arrayx, arrayy, nvar1, nvar2): # """ # Create a TH2F histogram and fill it with two variables from a dataframe. # """ @@ -98,6 +128,7 @@ def build2dhisto(titlehist, arrayx, arrayy): # fill_hist(histo, arr2) # return histo + def makefill2dweighed(df_, titlehist, arrayx, arrayy, nvar1, nvar2, weight): """ Create a TH2F histogram and fill it with two variables from a dataframe. 
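# Hedged usage sketch for the binning helpers in the hunk above (histogram names
# and axis choices are illustrative):
pt_edges = buildbinning(10, 0.0, 10.0)  # array("d") with 11 uniform edges
eta_edges = buildarray([-0.8, -0.4, 0.0, 0.4, 0.8])  # variable-width edges
h2 = buildhisto("h_pt_eta", "h_pt_eta", pt_edges, eta_edges)  # TH2F, Sumw2() already enabled
h1 = buildhisto("h_pt", "h_pt", pt_edges)  # one binning array -> 1D, three -> 3D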
@@ -107,34 +138,36 @@ def makefill2dweighed(df_, titlehist, arrayx, arrayy, nvar1, nvar2, weight): histo.Fill(getattr(row, nvar1), getattr(row, nvar2), getattr(row, weight)) return histo + def makefill3dhist(df_, titlehist, arrayx, arrayy, arrayz, nvar1, nvar2, nvar3): """ Create a TH3F histogram and fill it with three variables from a dataframe. """ histo = buildhisto(titlehist, titlehist, arrayx, arrayy, arrayz) - #df_rd = df_[[nvar1, nvar2, nvar3]] - #arr3 = df_rd.to_numpy() - #fill_hist(histo, arr3) # this does not work, gives an empty histogram + # df_rd = df_[[nvar1, nvar2, nvar3]] + # arr3 = df_rd.to_numpy() + # fill_hist(histo, arr3) # this does not work, gives an empty histogram for row in df_.itertuples(): histo.Fill(getattr(row, nvar1), getattr(row, nvar2), getattr(row, nvar3)) return histo + def makefill3dweighed(df_, titlehist, arrayx, arrayy, arrayz, nvar1, nvar2, nvar3, weight): """ Create a TH3F histogram and fill it with three variables from a dataframe. """ histo = buildhisto(titlehist, titlehist, arrayx, arrayy, arrayz) - #df_rd = df_[[nvar1, nvar2, nvar3]] - #arr3 = df_rd.to_numpy() - #fill_hist(histo, arr3) # this does not work, gives an empty histogram + # df_rd = df_[[nvar1, nvar2, nvar3]] + # arr3 = df_rd.to_numpy() + # fill_hist(histo, arr3) # this does not work, gives an empty histogram for row in df_.itertuples(): - histo.Fill(getattr(row, nvar1), getattr(row, nvar2), \ - getattr(row, nvar3), getattr(row, weight)) + histo.Fill(getattr(row, nvar1), getattr(row, nvar2), getattr(row, nvar3), getattr(row, weight)) return histo -#def fill2dhist(df_, histo, nvar1, nvar2): + +# def fill2dhist(df_, histo, nvar1, nvar2): # """ # Fill a TH2 histogram with two variables from a dataframe. # """ @@ -143,27 +176,29 @@ def makefill3dweighed(df_, titlehist, arrayx, arrayy, arrayz, nvar1, nvar2, nvar # fill_hist(histo, arr2) # return histo + def fill2dweighed(df_, histo, nvar1, nvar2, weight): """ Fill a TH2 histogram with two variables from a dataframe. """ - #df_rd = df_[[nvar1, nvar2]] - #arr2 = df_rd.values - #fill_hist(histo, arr2) + # df_rd = df_[[nvar1, nvar2]] + # arr2 = df_rd.values + # fill_hist(histo, arr2) if isinstance(histo, TH2): for row in df_.itertuples(): - histo.Fill(getattr(row, nvar1), getattr(row, nvar2), getattr(row, weight)) + histo.Fill(getattr(row, nvar1), getattr(row, nvar2), getattr(row, weight)) else: print("WARNING!Incorrect histogram type (should be TH2F) ") return histo + def fillweighed(df_, histo, nvar1, weight): """ Fill a TH1 weighted histogram. """ - #df_rd = df_[[nvar1, nvar2]] - #arr2 = df_rd.values - #fill_hist(histo, arr2) + # df_rd = df_[[nvar1, nvar2]] + # arr2 = df_rd.values + # fill_hist(histo, arr2) if isinstance(histo, TH1): for row in df_.itertuples(): histo.Fill(getattr(row, nvar1), getattr(row, weight)) @@ -171,6 +206,7 @@ def fillweighed(df_, histo, nvar1, weight): print("WARNING!Incorrect histogram type (should be TH1F) ") return histo + def rebin_histogram(src_histo, new_histo): """ Rebins the content of the histogram src_histo into new_histo. 
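# Hedged usage sketch for rebin_histogram in the hunk below (histograms are
# hypothetical): the target edges must line up with source bin edges, otherwise
# get_logger().fatal aborts with the "low edge is too low"/"up edge is too high"
# messages.
h_fine = buildhisto("h_fine", "h_fine", buildbinning(8, 0.0, 8.0))
h_coarse = buildhisto("h_coarse", "h_coarse", buildarray([0.0, 2.0, 4.0, 8.0]))
rebin_histogram(h_fine, h_coarse)  # merges the source bins contained in each target bin
# NB: in the hunk below x_axis_src is read from new_histo; src_histo.GetXaxis()
# looks like the intended source axis.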
@@ -181,17 +217,17 @@ def rebin_histogram(src_histo, new_histo): x_axis_new = new_histo.GetXaxis() x_axis_src = new_histo.GetXaxis() for i in range(1, x_axis_new.GetNbins() + 1): - x_new = [x_axis_new.GetBinLowEdge(i), - x_axis_new.GetBinUpEdge(i), - x_axis_new.GetBinWidth(i), - x_axis_new.GetBinCenter(i)] + x_new = [ + x_axis_new.GetBinLowEdge(i), + x_axis_new.GetBinUpEdge(i), + x_axis_new.GetBinWidth(i), + x_axis_new.GetBinCenter(i), + ] width_src = [] y_src = [] ye_src = [] for j in range(1, x_axis_src.GetNbins() + 1): - x_src = [x_axis_src.GetBinLowEdge(j), - x_axis_src.GetBinUpEdge(j), - x_axis_src.GetBinWidth(j)] + x_src = [x_axis_src.GetBinLowEdge(j), x_axis_src.GetBinUpEdge(j), x_axis_src.GetBinWidth(j)] if x_src[1] <= x_new[0]: continue if x_src[0] >= x_new[1]: @@ -199,11 +235,23 @@ def rebin_histogram(src_histo, new_histo): if x_src[0] < x_new[0]: get_logger().fatal( "For bin %i, bin %i low edge is too low! [%f, %f] vs [%f, %f]", - i, j, x_new[0], x_new[1], x_src[0], x_src[1]) + i, + j, + x_new[0], + x_new[1], + x_src[0], + x_src[1], + ) if x_src[1] > x_new[1]: get_logger().fatal( "For bin %i, bin %i up edge is too high! [%f, %f] vs [%f, %f]", - i, j, x_new[0], x_new[1], x_src[0], x_src[1]) + i, + j, + x_new[0], + x_new[1], + x_src[0], + x_src[1], + ) y_src.append(src_histo.GetBinContent(j)) ye_src.append(src_histo.GetBinError(j)) width_src.append(x_src[-1]) @@ -223,6 +271,7 @@ def load_root_style_simple(): gStyle.SetCanvasColor(0) gStyle.SetFrameFillColor(0) + def load_root_style(): """ Set more advanced ROOT style for histograms @@ -240,7 +289,8 @@ def load_root_style(): gStyle.SetPadTickX(1) gStyle.SetPadTickY(1) -#def scatterplotroot(dfevt, nvar1, nvar2, nbins1, min1, max1, nbins2, min2, max2): + +# def scatterplotroot(dfevt, nvar1, nvar2, nbins1, min1, max1, nbins2, min2, max2): # """ # Make TH2F scatterplot between two variables from dataframe # """ @@ -250,6 +300,7 @@ def load_root_style(): # fill_hist(hmult1_mult2, arr2) # return hmult1_mult2 + def find_axes_limits(histos, use_log_y=False): """ Finds common axes limits for list of histograms provided @@ -258,14 +309,16 @@ def find_axes_limits(histos, use_log_y=False): # reasonably well if there is at least one histogram. max_y = max((h.GetMaximum() for h in histos if isinstance(h, TH1))) min_y = min((h.GetMinimum() for h in histos if isinstance(h, TH1))) - if not min_y > 0. and use_log_y: - min_y = 10.e-9 + if not min_y > 0.0 and use_log_y: + min_y = 10.0e-9 max_x = max((h.GetXaxis().GetXmax() for h in histos)) min_x = min((h.GetXaxis().GetXmin() for h in histos)) return min_x, max_x, min_y, max_y -def style_histograms(histos, linestyles=None, markerstyles=None, colors=None, linewidths=None, - fillstyles=None, fillcolors=None): + +def style_histograms( + histos, linestyles=None, markerstyles=None, colors=None, linewidths=None, fillstyles=None, fillcolors=None +): """ Loops over given line- and markerstyles as well as colors applying them to the given list of histograms. The list of histograms might be larger than the styles provided. 
In that case @@ -296,6 +349,7 @@ def style_histograms(histos, linestyles=None, markerstyles=None, colors=None, li h.GetXaxis().SetTitleSize(0.02) h.GetYaxis().SetTitleSize(0.02) + def divide_all_by_first(histos): """ Divides all histograms in the list by the first one in the list and returns the @@ -309,6 +363,7 @@ def divide_all_by_first(histos): return histos_ratio + def divide_by_eachother(histos1, histos2, scale=None, rebin2=None): """ Divides all histos1 by histos2 and returns the @@ -316,27 +371,26 @@ def divide_by_eachother(histos1, histos2, scale=None, rebin2=None): """ if len(histos1) != len(histos2): - get_logger().fatal("Number of histograms mismatch, %i vs. %i", \ - len(histos1), len(histos2)) + get_logger().fatal("Number of histograms mismatch, %i vs. %i", len(histos1), len(histos2)) histos_ratio = [] for i, _ in enumerate(histos1): - origname = histos1[i].GetName() if rebin2 is not None: - rebin = array('d', rebin2) - histos1[i] = histos1[i].Rebin(len(rebin2)-1, f"{histos1[i].GetName()}_rebin", rebin) - histos2[i] = histos2[i].Rebin(len(rebin2)-1, f"{histos2[i].GetName()}_rebin", rebin) + rebin = array("d", rebin2) + histos1[i] = histos1[i].Rebin(len(rebin2) - 1, f"{histos1[i].GetName()}_rebin", rebin) + histos2[i] = histos2[i].Rebin(len(rebin2) - 1, f"{histos2[i].GetName()}_rebin", rebin) if scale is not None: - histos1[i].Scale(1./scale[0]) - histos2[i].Scale(1./scale[1]) + histos1[i].Scale(1.0 / scale[0]) + histos2[i].Scale(1.0 / scale[1]) histos_ratio.append(histos1[i].Clone(f"{origname}_ratio")) histos_ratio[-1].Divide(histos2[i]) return histos_ratio + def divide_by_eachother_barlow(histos1, histos2, scale=None, rebin2=None): """ Divides all histos1 by histos2 using Barlow for stat. unc. and returns the @@ -344,37 +398,36 @@ def divide_by_eachother_barlow(histos1, histos2, scale=None, rebin2=None): """ if len(histos1) != len(histos2): - get_logger().fatal("Number of histograms mismatch, %i vs. %i", \ - len(histos1), len(histos2)) + get_logger().fatal("Number of histograms mismatch, %i vs. 
%i", len(histos1), len(histos2)) histos_ratio = [] for i, _ in enumerate(histos1): - origname = histos1[i].GetName() if rebin2 is not None: - rebin = array('d', rebin2) - histos1[i] = histos1[i].Rebin(len(rebin2)-1, f"{histos1[i].GetName()}_rebin", rebin) - histos2[i] = histos2[i].Rebin(len(rebin2)-1, f"{histos2[i].GetName()}_rebin", rebin) + rebin = array("d", rebin2) + histos1[i] = histos1[i].Rebin(len(rebin2) - 1, f"{histos1[i].GetName()}_rebin", rebin) + histos2[i] = histos2[i].Rebin(len(rebin2) - 1, f"{histos2[i].GetName()}_rebin", rebin) if scale is not None: - histos1[i].Scale(1./scale[0]) - histos2[i].Scale(1./scale[1]) + histos1[i].Scale(1.0 / scale[0]) + histos2[i].Scale(1.0 / scale[1]) stat1 = [] stat2 = [] for j in range(histos1[i].GetNbinsX()): - stat1.append(histos1[i].GetBinError(j+1) / histos1[i].GetBinContent(j+1)) - stat2.append(histos2[i].GetBinError(j+1) / histos2[i].GetBinContent(j+1)) + stat1.append(histos1[i].GetBinError(j + 1) / histos1[i].GetBinContent(j + 1)) + stat2.append(histos2[i].GetBinError(j + 1) / histos2[i].GetBinContent(j + 1)) histos_ratio.append(histos1[i].Clone(f"{origname}_ratio")) histos_ratio[-1].Divide(histos2[i]) for j in range(histos_ratio[-1].GetNbinsX()): statunc = math.sqrt(abs(stat1[j] * stat1[j] - stat2[j] * stat2[j])) - histos_ratio[-1].SetBinError(j+1, histos_ratio[-1].GetBinContent(j+1) * statunc) + histos_ratio[-1].SetBinError(j + 1, histos_ratio[-1].GetBinContent(j + 1) * statunc) return histos_ratio + def divide_all_by_first_multovermb(histos): """ Divides all histograms in the list by the first one in the list and returns the @@ -388,16 +441,17 @@ def divide_all_by_first_multovermb(histos): stat = [] for j in range(h.GetNbinsX()): - stat.append(h.GetBinError(j+1) / h.GetBinContent(j+1)) + stat.append(h.GetBinError(j + 1) / h.GetBinContent(j + 1)) err.append(stat) histos_ratio[-1].Divide(histos[0]) for j in range(h.GetNbinsX()): statunc = math.sqrt(abs(err[-1][j] * err[-1][j] - err[0][j] * err[0][j])) - histos_ratio[-1].SetBinError(j+1, histos_ratio[-1].GetBinContent(j+1) * statunc) + histos_ratio[-1].SetBinError(j + 1, histos_ratio[-1].GetBinContent(j + 1) * statunc) return histos_ratio + def put_in_pad(pad, use_log_y, histos, title="", x_label="", y_label="", yrange=None, **kwargs): """ Providing a TPad this plots all given histograms in that pad adjusting the X- and Y-ranges @@ -409,11 +463,10 @@ def put_in_pad(pad, use_log_y, histos, title="", x_label="", y_label="", yrange= min_x, max_x, min_y, max_y = find_axes_limits(histos, use_log_y) pad.SetLogy(use_log_y) pad.cd() - scale_frame_y = (0.01, 100.) 
if use_log_y else (0.7, 1.2) + scale_frame_y = (0.01, 100.0) if use_log_y else (0.7, 1.2) if yrange is None: yrange = [min_y * scale_frame_y[0], max_y * scale_frame_y[1]] - frame = pad.DrawFrame(min_x, yrange[0], max_x, yrange[1], - f"{title};{x_label};{y_label}") + frame = pad.DrawFrame(min_x, yrange[0], max_x, yrange[1], f"{title};{x_label};{y_label}") frame.GetYaxis().SetTitleOffset(1.2) pad.SetTicks() if draw_options is None: @@ -421,9 +474,20 @@ def put_in_pad(pad, use_log_y, histos, title="", x_label="", y_label="", yrange= for h, o in zip(histos, draw_options): h.Draw(f"same {o}") -#pylint: disable=too-many-statements -def plot_histograms(histos, use_log_y=False, ratio_=False, legend_titles=None, title="", x_label="", - y_label_up="", y_label_ratio="", save_path="./plot.eps", **kwargs): + +# pylint: disable=too-many-statements +def plot_histograms( + histos, + use_log_y=False, + ratio_=False, + legend_titles=None, + title="", + x_label="", + y_label_up="", + y_label_ratio="", + save_path="./plot.eps", + **kwargs, +): """ Throws all given histograms into one canvas. If desired, a ratio plot will be added. """ @@ -448,25 +512,24 @@ def plot_histograms(histos, use_log_y=False, ratio_=False, legend_titles=None, t canvas_name = kwargs.get("canvas_name", "Canvas") style_histograms(histos, linestyles, markerstyles, colors, linewidths, fillstyles, fillcolors) - canvas = TCanvas('canvas', canvas_name, 800, 800) - pad_up_start = 0.4 if ratio else 0. + canvas = TCanvas("canvas", canvas_name, 800, 800) + pad_up_start = 0.4 if ratio else 0.0 - pad_up = TPad("pad_up", "", 0., pad_up_start, 1., 1.) + pad_up = TPad("pad_up", "", 0.0, pad_up_start, 1.0, 1.0) if ratio: - pad_up.SetBottomMargin(0.) + pad_up.SetBottomMargin(0.0) pad_up.Draw() x_label_up_tmp = x_label if not ratio else "" - put_in_pad(pad_up, use_log_y, histos, title, x_label_up_tmp, y_label_up, - yrange, draw_options=draw_options) + put_in_pad(pad_up, use_log_y, histos, title, x_label_up_tmp, y_label_up, yrange, draw_options=draw_options) pad_up.cd() legend = None if legend_titles is not None: if justratioplot: - legend = TLegend(.2, .65, .6, .85) + legend = TLegend(0.2, 0.65, 0.6, 0.85) else: - legend = TLegend(.45, .65, .85, .85) + legend = TLegend(0.45, 0.65, 0.85, 0.85) legend.SetBorderSize(0) legend.SetFillColor(0) legend.SetFillStyle(0) @@ -483,8 +546,8 @@ def plot_histograms(histos, use_log_y=False, ratio_=False, legend_titles=None, t if ratio and justratioplot is False: histos_ratio = divide_all_by_first(histos) - pad_ratio = TPad("pad_ratio", "", 0., 0.05, 1., pad_up_start) - pad_ratio.SetTopMargin(0.) + pad_ratio = TPad("pad_ratio", "", 0.0, 0.05, 1.0, pad_up_start) + pad_ratio.SetTopMargin(0.0) pad_ratio.SetBottomMargin(0.3) pad_ratio.Draw() @@ -504,6 +567,7 @@ def plot_histograms(histos, use_log_y=False, ratio_=False, legend_titles=None, t canvas.Close() + def save_histograms(histos, save_path="./plot.root"): """ Save everything into a ROOT file for offline plotting @@ -517,89 +581,100 @@ def save_histograms(histos, save_path="./plot.root"): h.Write() root_file.Close() + # pylint: disable=too-many-branches def calc_systematic_multovermb(errnum_list, errden_list, n_bins, same_mc_used=False, justfd=-99): """ Returns a list of total errors taking into account the defined correlations Propagation uncertainties defined for Ds(mult) / Ds(MB). Check if applicable to your situation """ - tot_list = [[0., 0., 0., 0.] 
for _ in range(n_bins)] - if n_bins != len(list(errnum_list.errors.values())[0]) or \ - n_bins != len(list(errden_list.errors.values())[0]): - get_logger().fatal("Number of bins and number of errors mismatch, %i vs. %i vs. %i", \ - n_bins, len(list(errnum_list.errors.values())[0]), \ - len(list(errden_list.errors.values())[0])) - - listimpl = ["yield", "cut", "pid", "feeddown_mult", "feeddown_mult_spectra", "trigger", \ - "multiplicity_interval", "multiplicity_weights", "track", "ptshape", \ - "feeddown_NB", "sigmav0", "branching_ratio", "statunceff"] + tot_list = [[0.0, 0.0, 0.0, 0.0] for _ in range(n_bins)] + if n_bins != len(list(errnum_list.errors.values())[0]) or n_bins != len(list(errden_list.errors.values())[0]): + get_logger().fatal( + "Number of bins and number of errors mismatch, %i vs. %i vs. %i", + n_bins, + len(list(errnum_list.errors.values())[0]), + len(list(errden_list.errors.values())[0]), + ) + + listimpl = [ + "yield", + "cut", + "pid", + "feeddown_mult", + "feeddown_mult_spectra", + "trigger", + "multiplicity_interval", + "multiplicity_weights", + "track", + "ptshape", + "feeddown_NB", + "sigmav0", + "branching_ratio", + "statunceff", + ] j = 0 for (_, errnum), (_, errden) in zip(errnum_list.errors.items(), errden_list.errors.items()): for i in range(n_bins): - if errnum_list.names[j] not in listimpl: get_logger().fatal("Unknown systematic name: %s", errnum_list.names[j]) if errnum_list.names[j] != errden_list.names[j]: - get_logger().fatal("Names not in same order: %s vs %s", \ - errnum_list.names[j], errden_list.names[j]) + get_logger().fatal("Names not in same order: %s vs %s", errnum_list.names[j], errden_list.names[j]) for nb in range(len(tot_list[i])): if errnum_list.names[j] == "yield" and justfd is not True: - #Partially correlated, take largest - tot_list[i][nb] += max(errnum[i][nb], errden[i][nb]) \ - * max(errnum[i][nb], errden[i][nb]) + # Partially correlated, take largest + tot_list[i][nb] += max(errnum[i][nb], errden[i][nb]) * max(errnum[i][nb], errden[i][nb]) elif errnum_list.names[j] == "cut" and justfd is not True: - #Partially correlated, take largest - tot_list[i][nb] += max(errnum[i][nb], errden[i][nb]) \ - * max(errnum[i][nb], errden[i][nb]) + # Partially correlated, take largest + tot_list[i][nb] += max(errnum[i][nb], errden[i][nb]) * max(errnum[i][nb], errden[i][nb]) elif errnum_list.names[j] == "pid" and justfd is not True: - #Correlated, do nothing + # Correlated, do nothing pass elif errnum_list.names[j] == "feeddown_mult" and justfd is not False: - #Assign directly from multiplicity case, no syst for MB + # Assign directly from multiplicity case, no syst for MB tot_list[i][nb] += errnum[i][nb] * errnum[i][nb] elif errnum_list.names[j] == "feeddown_mult_spectra" and justfd is not False: - #Ratio here, skip spectra syst + # Ratio here, skip spectra syst pass elif errnum_list.names[j] == "trigger" and justfd is not True: - #Assign directly from multiplicity case, no syst for MB + # Assign directly from multiplicity case, no syst for MB tot_list[i][nb] += errnum[i][nb] * errnum[i][nb] elif errnum_list.names[j] == "multiplicity_interval" and justfd is not True: - #FD: estimated using 7TeV strategy directly for ratio + # FD: estimated using 7TeV strategy directly for ratio tot_list[i][nb] += errnum[i][nb] * errnum[i][nb] elif errnum_list.names[j] == "multiplicity_weights" and justfd is not True: - #Uncorrelated + # Uncorrelated tot_list[i][nb] += errnum[i][nb] * errnum[i][nb] + errden[i][nb] * errden[i][nb] elif errnum_list.names[j] == "track" and 
justfd is not True: - #Correlated, do nothing + # Correlated, do nothing pass elif errnum_list.names[j] == "ptshape" and justfd is not True: - #Correlated, assign difference + # Correlated, assign difference diff = abs(errnum[i][nb] - errden[i][nb]) tot_list[i][nb] += diff * diff elif errnum_list.names[j] == "feeddown_NB" and justfd is not False: - #Correlated, do nothing + # Correlated, do nothing pass elif errnum_list.names[j] == "sigmav0" and justfd is not True: - #Correlated and usually not plotted in boxes, do nothing + # Correlated and usually not plotted in boxes, do nothing pass elif errnum_list.names[j] == "branching_ratio" and justfd is not True: - #Correlated, do nothing + # Correlated, do nothing pass elif errnum_list.names[j] == "statunceff" and justfd is not True: - #Uncorrelated (new since June 2020, add it in syst boxes) - #Part of stat is in common when same MC is used, so doing Barlow test there + # Uncorrelated (new since June 2020, add it in syst boxes) + # Part of stat is in common when same MC is used, so doing Barlow test there if same_mc_used is False: - tot_list[i][nb] += errnum[i][nb] * errnum[i][nb] + \ - errden[i][nb] * errden[i][nb] + tot_list[i][nb] += errnum[i][nb] * errnum[i][nb] + errden[i][nb] * errden[i][nb] else: - tot_list[i][nb] += abs(errnum[i][nb] * errnum[i][nb] - \ - errden[i][nb] * errden[i][nb]) + tot_list[i][nb] += abs(errnum[i][nb] * errnum[i][nb] - errden[i][nb] * errden[i][nb]) j = j + 1 tot_list = np.sqrt(tot_list) return tot_list + # pylint: disable=too-many-branches def calc_systematic_mesonratio(errnum_list, errden_list, n_bins, justfd=-99): """ @@ -607,43 +682,56 @@ def calc_systematic_mesonratio(errnum_list, errden_list, n_bins, justfd=-99): Propagation uncertainties defined for Ds(MB or mult) / D0(MB or mult). Check if applicable to your situation """ - tot_list = [[0., 0., 0., 0.] for _ in range(n_bins)] - if n_bins != len(list(errnum_list.errors.values())[0]) or \ - n_bins != len(list(errden_list.errors.values())[0]): - get_logger().fatal("Number of bins and number of errors mismatch, %i vs. %i vs. %i", \ - n_bins, len(list(errnum_list.errors.values())[0]), \ - len(list(errden_list.errors.values())[0])) - - listimpl = ["yield", "cut", "pid", "feeddown_mult", "feeddown_mult_spectra", "trigger", \ - "multiplicity_interval", "multiplicity_weights", "track", "ptshape", \ - "feeddown_NB", "sigmav0", "branching_ratio", "statunceff"] + tot_list = [[0.0, 0.0, 0.0, 0.0] for _ in range(n_bins)] + if n_bins != len(list(errnum_list.errors.values())[0]) or n_bins != len(list(errden_list.errors.values())[0]): + get_logger().fatal( + "Number of bins and number of errors mismatch, %i vs. %i vs. 
%i", + n_bins, + len(list(errnum_list.errors.values())[0]), + len(list(errden_list.errors.values())[0]), + ) + + listimpl = [ + "yield", + "cut", + "pid", + "feeddown_mult", + "feeddown_mult_spectra", + "trigger", + "multiplicity_interval", + "multiplicity_weights", + "track", + "ptshape", + "feeddown_NB", + "sigmav0", + "branching_ratio", + "statunceff", + ] j = 0 for (_, errnum), (_, errden) in zip(errnum_list.errors.items(), errden_list.errors.items()): for i in range(n_bins): - if errnum_list.names[j] not in listimpl: get_logger().fatal("Unknown systematic name: %s", errnum_list.names[j]) if errnum_list.names[j] != errden_list.names[j]: - get_logger().fatal("Names not in same order: %s vs %s", \ - errnum_list.names[j], errden_list.names[j]) + get_logger().fatal("Names not in same order: %s vs %s", errnum_list.names[j], errden_list.names[j]) for nb in range(len(tot_list[i])): if errnum_list.names[j] == "yield" and justfd is not True: - #Uncorrelated + # Uncorrelated tot_list[i][nb] += errnum[i][nb] * errnum[i][nb] + errden[i][nb] * errden[i][nb] elif errnum_list.names[j] == "cut" and justfd is not True: - #Uncorrelated + # Uncorrelated tot_list[i][nb] += errnum[i][nb] * errnum[i][nb] + errden[i][nb] * errden[i][nb] elif errnum_list.names[j] == "pid" and justfd is not True: - #Correlated, assign difference + # Correlated, assign difference diff = abs(errnum[i][nb] - errden[i][nb]) tot_list[i][nb] += diff * diff elif errnum_list.names[j] == "feeddown_mult_spectra" and justfd is not False: - #Fully correlated + # Fully correlated ynum = errnum_list.errors["feeddown_NB"][i][4] yden = errden_list.errors["feeddown_NB"][i][4] - #Relative uncertainties stored, make absolute + # Relative uncertainties stored, make absolute ynuml = ynum - ynum * errnum[i][2] ydenl = yden - yden * errden[i][2] ynumh = ynum + ynum * errnum[i][3] @@ -656,16 +744,16 @@ def calc_systematic_mesonratio(errnum_list, errden_list, n_bins, justfd=-99): if nb == 3: tot_list[i][nb] += (maxsys - rat[1]) * (maxsys - rat[1]) / (rat[1] * rat[1]) elif errnum_list.names[j] == "feeddown_mult" and justfd is not False: - #Spectra here, skip ratio systematic + # Spectra here, skip ratio systematic pass elif errnum_list.names[j] == "trigger" and justfd is not True: - #Correlated, do nothing + # Correlated, do nothing pass elif errnum_list.names[j] == "feeddown_NB" and justfd is not False: - #Fully correlated under assumption central Fc value stays within Nb syst + # Fully correlated under assumption central Fc value stays within Nb syst ynum = errnum[i][4] yden = errden[i][4] - #Absolute uncertainties stored + # Absolute uncertainties stored ynuml = ynum - errnum[i][2] ydenl = yden - errden[i][2] ynumh = ynum + errnum[i][3] @@ -678,60 +766,77 @@ def calc_systematic_mesonratio(errnum_list, errden_list, n_bins, justfd=-99): if nb == 3: tot_list[i][nb] += (maxsys - rat[1]) * (maxsys - rat[1]) / (rat[1] * rat[1]) elif errnum_list.names[j] == "multiplicity_weights" and justfd is not True: - #Correlated, assign difference + # Correlated, assign difference diff = abs(errnum[i][nb] - errden[i][nb]) tot_list[i][nb] += diff * diff elif errnum_list.names[j] == "track" and justfd is not True: - #Correlated, assign difference + # Correlated, assign difference diff = abs(errnum[i][nb] - errden[i][nb]) tot_list[i][nb] += diff * diff elif errnum_list.names[j] == "ptshape" and justfd is not True: - #Uncorrelated + # Uncorrelated tot_list[i][nb] += errnum[i][nb] * errnum[i][nb] + errden[i][nb] * errden[i][nb] elif errnum_list.names[j] == 
"multiplicity_interval" and justfd is not True: - #NB: Assuming ratio: 3prongs over 2prongs here! 2prong part cancels - #We use 1/3 of systematic of numerator + # NB: Assuming ratio: 3prongs over 2prongs here! 2prong part cancels + # We use 1/3 of systematic of numerator tot_list[i][nb] += errnum[i][nb] * errnum[i][nb] / 9 elif errnum_list.names[j] == "sigmav0" and justfd is not True: - #Correlated and usually not plotted in boxes, do nothing + # Correlated and usually not plotted in boxes, do nothing pass elif errnum_list.names[j] == "branching_ratio" and justfd is not True: - #Uncorrelated (new since May 2020, add it in syst boxes) + # Uncorrelated (new since May 2020, add it in syst boxes) tot_list[i][nb] += errnum[i][nb] * errnum[i][nb] + errden[i][nb] * errden[i][nb] elif errnum_list.names[j] == "statunceff" and justfd is not True: - #Uncorrelated (new since June 2020, add it in syst boxes) + # Uncorrelated (new since June 2020, add it in syst boxes) tot_list[i][nb] += errnum[i][nb] * errnum[i][nb] + errden[i][nb] * errden[i][nb] j = j + 1 tot_list = np.sqrt(tot_list) return tot_list -def calc_systematic_mesondoubleratio(errnum_list1, errnum_list2, errden_list1, \ - errden_list2, n_bins, same_mc_used=False, \ - dropbins=None, justfd=-99): + +def calc_systematic_mesondoubleratio( + errnum_list1, errnum_list2, errden_list1, errden_list2, n_bins, same_mc_used=False, dropbins=None, justfd=-99 +): """ Returns a list of total errors taking into account the defined correlations Propagation uncertainties defined for Lc/D0_mult-i / Lc/D0_mult-j. Check if applicable to your situation """ - tot_list = [[0., 0., 0., 0.] for _ in range(n_bins)] - if n_bins != len(list(errnum_list1.errors.values())[0]) or \ - n_bins != len(list(errden_list1.errors.values())[0]): + tot_list = [[0.0, 0.0, 0.0, 0.0] for _ in range(n_bins)] + if n_bins != len(list(errnum_list1.errors.values())[0]) or n_bins != len(list(errden_list1.errors.values())[0]): if dropbins is None: - get_logger().fatal("Number of bins and number of errors mismatch, %i vs. %i vs. %i", \ - n_bins, len(list(errnum_list1.errors.values())[0]), \ - len(list(errden_list1.errors.values())[0])) - - listimpl = ["yield", "cut", "pid", "feeddown_mult", "feeddown_mult_spectra", "trigger", \ - "multiplicity_interval", "multiplicity_weights", "track", "ptshape", \ - "feeddown_NB", "sigmav0", "branching_ratio", "statunceff"] + get_logger().fatal( + "Number of bins and number of errors mismatch, %i vs. %i vs. 
%i", + n_bins, + len(list(errnum_list1.errors.values())[0]), + len(list(errden_list1.errors.values())[0]), + ) + + listimpl = [ + "yield", + "cut", + "pid", + "feeddown_mult", + "feeddown_mult_spectra", + "trigger", + "multiplicity_interval", + "multiplicity_weights", + "track", + "ptshape", + "feeddown_NB", + "sigmav0", + "branching_ratio", + "statunceff", + ] j = 0 - for (_, errnum1), (_, errnum2), (_, errden1), (_, errden2) in zip(errnum_list1.errors.items(), \ - errnum_list2.errors.items(), \ - errden_list1.errors.items(), \ - errden_list2.errors.items()): + for (_, errnum1), (_, errnum2), (_, errden1), (_, errden2) in zip( + errnum_list1.errors.items(), + errnum_list2.errors.items(), + errden_list1.errors.items(), + errden_list2.errors.items(), + ): for i in range(n_bins): - inum = i iden = i if dropbins is not None: @@ -741,80 +846,97 @@ def calc_systematic_mesondoubleratio(errnum_list1, errnum_list2, errden_list1, \ if errnum_list1.names[j] not in listimpl: get_logger().fatal("Unknown systematic name: %s", errnum_list1.names[j]) if errnum_list1.names[j] != errden_list2.names[j]: - get_logger().fatal("Names not in same order: %s vs %s", \ - errnum_list1.names[j], errden_list2.names[j]) + get_logger().fatal("Names not in same order: %s vs %s", errnum_list1.names[j], errden_list2.names[j]) for nb in range(len(tot_list[i])): if errnum_list1.names[j] == "yield" and justfd is not True: - #Uncorrelated - tot_list[i][nb] += errnum1[inum][nb] * errnum1[inum][nb] + \ - errnum2[inum][nb] * errnum2[inum][nb] + \ - errden1[iden][nb] * errden1[iden][nb] + \ - errden2[iden][nb] * errden2[iden][nb] + # Uncorrelated + tot_list[i][nb] += ( + errnum1[inum][nb] * errnum1[inum][nb] + + errnum2[inum][nb] * errnum2[inum][nb] + + errden1[iden][nb] * errden1[iden][nb] + + errden2[iden][nb] * errden2[iden][nb] + ) elif errnum_list1.names[j] == "cut" and justfd is not True: - #Uncorrelated - tot_list[i][nb] += errnum1[inum][nb] * errnum1[inum][nb] + \ - errnum2[inum][nb] * errnum2[inum][nb] + \ - errden1[iden][nb] * errden1[iden][nb] + \ - errden2[iden][nb] * errden2[iden][nb] + # Uncorrelated + tot_list[i][nb] += ( + errnum1[inum][nb] * errnum1[inum][nb] + + errnum2[inum][nb] * errnum2[inum][nb] + + errden1[iden][nb] * errden1[iden][nb] + + errden2[iden][nb] * errden2[iden][nb] + ) elif errnum_list1.names[j] == "pid" and justfd is not True: - #Correlated, do nothing + # Correlated, do nothing pass elif errnum_list1.names[j] == "feeddown_mult_spectra" and justfd is not False: - #Correlated, do nothing + # Correlated, do nothing pass elif errnum_list1.names[j] == "feeddown_mult" and justfd is not False: - #Correlated, do nothing + # Correlated, do nothing pass elif errnum_list1.names[j] == "trigger" and justfd is not True: - #Correlated, do nothing + # Correlated, do nothing pass elif errnum_list1.names[j] == "feeddown_NB" and justfd is not False: - #Correlated, do nothing + # Correlated, do nothing pass elif errnum_list1.names[j] == "multiplicity_weights" and justfd is not True: - #Correlated, do nothing + # Correlated, do nothing pass elif errnum_list1.names[j] == "track" and justfd is not True: - #Correlated, do nothing + # Correlated, do nothing pass elif errnum_list1.names[j] == "ptshape" and justfd is not True: - #Uncorrelated - tot_list[i][nb] += errnum1[inum][nb] * errnum1[inum][nb] + \ - errnum2[inum][nb] * errnum2[inum][nb] + \ - errden1[iden][nb] * errden1[iden][nb] + \ - errden2[iden][nb] * errden2[iden][nb] + # Uncorrelated + tot_list[i][nb] += ( + errnum1[inum][nb] * errnum1[inum][nb] + + 
errnum2[inum][nb] * errnum2[inum][nb] + + errden1[iden][nb] * errden1[iden][nb] + + errden2[iden][nb] * errden2[iden][nb] + ) elif errnum_list1.names[j] == "multiplicity_interval" and justfd is not True: - #NB: Assuming ratio: 3prongs over 2prongs here! 2prong part cancels - #We use 1/3 of systematic of numerator + # NB: Assuming ratio: 3prongs over 2prongs here! 2prong part cancels + # We use 1/3 of systematic of numerator tot_list[i][nb] += errden1[iden][nb] * errden1[iden][nb] / 9 elif errnum_list1.names[j] == "sigmav0" and justfd is not True: - #Correlated and usually not plotted in boxes, do nothing + # Correlated and usually not plotted in boxes, do nothing pass elif errnum_list1.names[j] == "branching_ratio" and justfd is not True: - #Correlated, do nothing + # Correlated, do nothing pass elif errnum_list1.names[j] == "statunceff" and justfd is not True: - #Uncorrelated (new since June 2020, add it in syst boxes) - #Part of stat is in common when same MC is used, so doing Barlow test there + # Uncorrelated (new since June 2020, add it in syst boxes) + # Part of stat is in common when same MC is used, so doing Barlow test there if same_mc_used is False: - tot_list[i][nb] += errnum1[inum][nb] * errnum1[inum][nb] + \ - errnum2[inum][nb] * errnum2[inum][nb] + \ - errden1[iden][nb] * errden1[iden][nb] + \ - errden2[iden][nb] * errden2[iden][nb] + tot_list[i][nb] += ( + errnum1[inum][nb] * errnum1[inum][nb] + + errnum2[inum][nb] * errnum2[inum][nb] + + errden1[iden][nb] * errden1[iden][nb] + + errden2[iden][nb] * errden2[iden][nb] + ) else: - tot_list[i][nb] += abs(errnum1[inum][nb] * errnum1[inum][nb] - \ - errden1[iden][nb] * errden1[iden][nb]) + \ - abs(errnum2[inum][nb] * errnum2[inum][nb] - \ - errden2[iden][nb] * errden2[iden][nb]) + tot_list[i][nb] += abs( + errnum1[inum][nb] * errnum1[inum][nb] - errden1[iden][nb] * errden1[iden][nb] + ) + abs(errnum2[inum][nb] * errnum2[inum][nb] - errden2[iden][nb] * errden2[iden][nb]) j = j + 1 tot_list = np.sqrt(tot_list) return tot_list + # pylint: disable=too-many-locals -def average_pkpi_pk0s(histo_pkpi, histo_pk0s, graph_pkpi, graph_pk0s, err_pkpi, err_pk0s, - matchbins_pkpi, matchbins_pk0s, matchbinsgr_pkpi, matchbinsgr_pk0s): +def average_pkpi_pk0s( + histo_pkpi, + histo_pk0s, + graph_pkpi, + graph_pk0s, + err_pkpi, + err_pk0s, + matchbins_pkpi, + matchbins_pk0s, + matchbinsgr_pkpi, + matchbinsgr_pk0s, +): """ Strategy described in https://alice-notes.web.cern.ch/node/613 @@ -838,8 +960,9 @@ def average_pkpi_pk0s(histo_pkpi, histo_pk0s, graph_pkpi, graph_pk0s, err_pkpi, Input files need to be scaled with BR! 
""" if len(matchbins_pkpi) != len(matchbins_pk0s): - get_logger().fatal("Length matchbins_pkpi != matchbins_pk0s: %d != %d", - len(matchbins_pkpi), len(matchbins_pk0s)) + get_logger().fatal( + "Length matchbins_pkpi != matchbins_pk0s: %d != %d", len(matchbins_pkpi), len(matchbins_pk0s) + ) nbins = len(matchbins_pkpi) arr_errors = [err_pkpi, err_pk0s] @@ -854,7 +977,7 @@ def average_pkpi_pk0s(histo_pkpi, histo_pk0s, graph_pkpi, graph_pk0s, err_pkpi, arr_weights = [[-99 for _ in range(nbins)], [-99 for _ in range(nbins)]] arr_weightsum = [-99 for _ in range(nbins)] - #Fill arrays with corryield and fprompt from pkpi and pk0s + # Fill arrays with corryield and fprompt from pkpi and pk0s stat_unc = [[0 for _ in range(nbins)], [0 for _ in range(nbins)]] rel_stat_unc = [[0 for _ in range(nbins)], [0 for _ in range(nbins)]] corr_yield = [[0 for _ in range(nbins)], [0 for _ in range(nbins)]] @@ -874,19 +997,18 @@ def average_pkpi_pk0s(histo_pkpi, histo_pk0s, graph_pkpi, graph_pk0s, err_pkpi, fprompthigh[j][ipt] = -99 else: stat_unc[j][ipt] = arr_histo[j].GetBinError(binmatch) - rel_stat_unc[j][ipt] = arr_histo[j].GetBinError(binmatch) / \ - arr_histo[j].GetBinContent(binmatch) + rel_stat_unc[j][ipt] = arr_histo[j].GetBinError(binmatch) / arr_histo[j].GetBinContent(binmatch) corr_yield[j][ipt] = arr_histo[j].GetBinContent(binmatch) fprompt[j][ipt] = arr_graph[j].GetY()[binmatchgr] fpromptlow[j][ipt] = arr_graph[j].GetEYlow()[binmatchgr] fprompthigh[j][ipt] = arr_graph[j].GetEYhigh()[binmatchgr] - #Get uncorrelated part of the systematics + # Get uncorrelated part of the systematics syst_uncorr_pkpi = err_pkpi.get_uncorr_for_lc_average() syst_uncorr_pk0s = err_pk0s.get_uncorr_for_lc_average() syst_uncorr = [syst_uncorr_pkpi, syst_uncorr_pk0s] - #Partial correlation of BR + # Partial correlation of BR mbrw = TMatrixD(2, 2) mbrw.Zero() correlationbrpp = [[1, 0.5], [0.5, 1]] @@ -894,28 +1016,29 @@ def average_pkpi_pk0s(histo_pkpi, histo_pk0s, graph_pkpi, graph_pk0s, err_pkpi, for j in range(2): for k in range(2): if j != k: - mbrw[j, k] = correlationbrpp[j][k] * lcsystbr[k]*lcsystbr[j] + mbrw[j, k] = correlationbrpp[j][k] * lcsystbr[k] * lcsystbr[j] - #preperation weights - mtotw = TMatrixD(2*nbins, 2*nbins) + # preperation weights + mtotw = TMatrixD(2 * nbins, 2 * nbins) mtotw.Zero() correlationother = [[1, 0], [0, 1]] for j in range(2): for k in range(2): for ipt in range(nbins): - mtotw[ipt*2+j, ipt*2+k] = mbrw[j][k] + correlationother[j][k] * \ - syst_uncorr[j][ipt][2] * syst_uncorr[k][ipt][2] + \ - correlationother[j][k] * rel_stat_unc[j][ipt] * \ - rel_stat_unc[k][ipt] + mtotw[ipt * 2 + j, ipt * 2 + k] = ( + mbrw[j][k] + + correlationother[j][k] * syst_uncorr[j][ipt][2] * syst_uncorr[k][ipt][2] + + correlationother[j][k] * rel_stat_unc[j][ipt] * rel_stat_unc[k][ipt] + ) mtotw.Invert() lcsystuncorrweights = [[0 for _ in range(nbins)], [0 for _ in range(nbins)]] for ipt in range(nbins): for j in range(2): for k in range(2): - lcsystuncorrweights[j][ipt] += mtotw(ipt*2+j, ipt*2+k) + lcsystuncorrweights[j][ipt] += mtotw(ipt * 2 + j, ipt * 2 + k) - #applying weights + # applying weights for ipt in range(nbins): if matchbins_pkpi[ipt] < 0: average_corryield[ipt] = corr_yield[1][ipt] @@ -930,13 +1053,13 @@ def average_pkpi_pk0s(histo_pkpi, histo_pk0s, graph_pkpi, graph_pk0s, err_pkpi, weightsum = 0 for j in range(2): - weightsyst = 1/np.sqrt(lcsystuncorrweights[j][ipt]) + weightsyst = 1 / np.sqrt(lcsystuncorrweights[j][ipt]) weightstat = stat_unc[j][ipt] / corr_yield[j][ipt] weighttemp = 
np.sqrt(weightstat * weightstat + weightsyst * weightsyst) - weight = 1/(weighttemp * weighttemp) + weight = 1 / (weighttemp * weighttemp) average_corryield[ipt] += weight * corr_yield[j][ipt] - average_statunc[ipt] += (stat_unc[j][ipt]*weight) * (stat_unc[j][ipt]*weight) + average_statunc[ipt] += (stat_unc[j][ipt] * weight) * (stat_unc[j][ipt] * weight) average_fprompt[ipt] += weight * fprompt[j][ipt] arr_weights[j][ipt] = weight @@ -947,10 +1070,10 @@ def average_pkpi_pk0s(histo_pkpi, histo_pk0s, graph_pkpi, graph_pk0s, err_pkpi, average_statunc[ipt] = np.sqrt(average_statunc[ipt]) / weightsum arr_weightsum[ipt] = weightsum - #applying weights to the systematics - average_err, average_fpromptlow, average_fprompthigh = \ - weight_systematic_lc_averaging(arr_errors, fprompt, fpromptlow, fprompthigh, - arr_weights, arr_weightsum) + # applying weights to the systematics + average_err, average_fpromptlow, average_fprompthigh = weight_systematic_lc_averaging( + arr_errors, fprompt, fpromptlow, fprompthigh, arr_weights, arr_weightsum + ) average_fpromptlow = [i * j for i, j in zip(average_fpromptlow, average_fprompt)] average_fprompthigh = [i * j for i, j in zip(average_fprompthigh, average_fprompt)] @@ -965,59 +1088,72 @@ def average_pkpi_pk0s(histo_pkpi, histo_pk0s, graph_pkpi, graph_pk0s, err_pkpi, average_fprompthigh[ipt] = fprompthigh[0][ipt] continue - return average_corryield, average_statunc, average_fprompt, \ - average_fpromptlow, average_fprompthigh, average_err + return average_corryield, average_statunc, average_fprompt, average_fpromptlow, average_fprompthigh, average_err + -def weight_systematic_lc_averaging(arr_errors, fprompt, fpromptlow, fprompthigh, - arr_weights, arr_weightsum): +def weight_systematic_lc_averaging(arr_errors, fprompt, fpromptlow, fprompthigh, arr_weights, arr_weightsum): """ Propagate weights for Lc averaging to systematic percentages """ nbins = len(arr_weightsum) err_new = arr_errors[0] - listimpl = ["yield", "cut", "pid", "feeddown_mult", "feeddown_mult_spectra", "trigger", \ - "multiplicity_interval", "multiplicity_weights", "track", "ptshape", \ - "sigmav0", "branching_ratio", "statunceff"] + listimpl = [ + "yield", + "cut", + "pid", + "feeddown_mult", + "feeddown_mult_spectra", + "trigger", + "multiplicity_interval", + "multiplicity_weights", + "track", + "ptshape", + "sigmav0", + "branching_ratio", + "statunceff", + ] j = 0 - for (_, errpkpi), (_, errpk0s) in zip(arr_errors[0].errors.items(), \ - arr_errors[1].errors.items()): - + for (_, errpkpi), (_, errpk0s) in zip(arr_errors[0].errors.items(), arr_errors[1].errors.items()): for i in range(nbins): - if arr_errors[0].names[j] not in listimpl: get_logger().fatal("Unknown systematic name: %s", arr_errors[0].names[j]) if arr_errors[0].names[j] != arr_errors[1].names[j]: - get_logger().fatal("Names not in same order: %s vs %s", \ - arr_errors[0].names[j], arr_errors[1].names[j]) + get_logger().fatal("Names not in same order: %s vs %s", arr_errors[0].names[j], arr_errors[1].names[j]) syst = arr_errors[0].names[j] for nb in range(4): if syst in ["yield", "cut", "pid", "statunceff"]: - #Uncorrelated - err_new.errors[syst][i][nb] = np.sqrt((errpkpi[i][nb] * arr_weights[0][i]) * \ - (errpkpi[i][nb] * arr_weights[0][i]) + \ - (errpk0s[i][nb] * arr_weights[1][i]) * \ - (errpk0s[i][nb] * arr_weights[1][i])) / \ - arr_weightsum[i] - elif syst in ["feeddown_mult_spectra", "feeddown_mult", "trigger", - "multiplicity_weights", "track", "ptshape", - "multiplicity_interval", "sigmav0"]: - #Correlated - 
err_new.errors[syst][i][nb] = ((errpkpi[i][nb] * arr_weights[0][i]) + \
-                                                   (errpk0s[i][nb] * arr_weights[1][i])) / \
-                                                  arr_weightsum[i]
+                    # Uncorrelated
+                    err_new.errors[syst][i][nb] = (
+                        np.sqrt(
+                            (errpkpi[i][nb] * arr_weights[0][i]) * (errpkpi[i][nb] * arr_weights[0][i])
+                            + (errpk0s[i][nb] * arr_weights[1][i]) * (errpk0s[i][nb] * arr_weights[1][i])
+                        )
+                        / arr_weightsum[i]
+                    )
+                elif syst in [
+                    "feeddown_mult_spectra",
+                    "feeddown_mult",
+                    "trigger",
+                    "multiplicity_weights",
+                    "track",
+                    "ptshape",
+                    "multiplicity_interval",
+                    "sigmav0",
+                ]:
+                    # Correlated
+                    err_new.errors[syst][i][nb] = (
+                        (errpkpi[i][nb] * arr_weights[0][i]) + (errpk0s[i][nb] * arr_weights[1][i])
+                    ) / arr_weightsum[i]
                 elif syst == "branching_ratio":
-                    #Uncorrelated
-                    syst_errbr = (errpkpi[i][nb] * arr_weights[0][i]) * \
-                                 (errpkpi[i][nb] * arr_weights[0][i]) + \
-                                 (errpk0s[i][nb] * arr_weights[1][i]) * \
-                                 (errpk0s[i][nb] * arr_weights[1][i])
-                    syst_errbr += 0.5 * errpkpi[i][nb] * arr_weights[0][i] * \
-                                  errpk0s[i][nb] * arr_weights[1][i]
-                    syst_errbr += 0.5 * errpk0s[i][nb] * arr_weights[1][i] * \
-                                  errpkpi[i][nb] * arr_weights[0][i]
+                    # Partially correlated (cross terms carry the BR correlation of 0.5)
+                    syst_errbr = (errpkpi[i][nb] * arr_weights[0][i]) * (errpkpi[i][nb] * arr_weights[0][i]) + (
+                        errpk0s[i][nb] * arr_weights[1][i]
+                    ) * (errpk0s[i][nb] * arr_weights[1][i])
+                    syst_errbr += 0.5 * errpkpi[i][nb] * arr_weights[0][i] * errpk0s[i][nb] * arr_weights[1][i]
+                    syst_errbr += 0.5 * errpk0s[i][nb] * arr_weights[1][i] * errpkpi[i][nb] * arr_weights[0][i]
                     err_new.errors[syst][i][nb] = np.sqrt(syst_errbr) / arr_weightsum[i]
                 else:
                     print("Error for systematic: ", syst)
@@ -1037,21 +1173,23 @@ def weight_systematic_lc_averaging(arr_errors, fprompt, fpromptlow, fprompthigh,
     fpromptlow[1] = [i / j for i, j in zip(fpromptlow[1], fprompt[1])]
     fprompthigh[1] = [i / j for i, j in zip(fprompthigh[1], fprompt[1])]
     for i in range(nbins):
-        fpromptlownew[i] = ((fpromptlow[0][i] * arr_weights[0][i]) + \
-                            (fpromptlow[1][i] * arr_weights[1][i])) / \
-                           arr_weightsum[i]
-        fprompthighnew[i] = ((fprompthigh[0][i] * arr_weights[0][i]) + \
-                             (fprompthigh[1][i] * arr_weights[1][i])) / \
-                            arr_weightsum[i]
+        fpromptlownew[i] = (
+            (fpromptlow[0][i] * arr_weights[0][i]) + (fpromptlow[1][i] * arr_weights[1][i])
+        ) / arr_weightsum[i]
+        fprompthighnew[i] = (
+            (fprompthigh[0][i] * arr_weights[0][i]) + (fprompthigh[1][i] * arr_weights[1][i])
+        ) / arr_weightsum[i]

     return err_new, fpromptlownew, fprompthighnew

+
 # pylint: disable=too-many-nested-blocks
 class Errors:
     """
     Errors corresponding to one histogram
     Relative errors are assumed
     """
+
     def __init__(self, n_bins):
         # A dictionary of lists, lists will contain 4-tuples
         self.errors = {}
@@ -1070,8 +1208,7 @@ def make_symm_y_errors(*args):
     def make_asymm_y_errors(*args):
         if len(args) % 2 != 0:
             get_logger().fatal("Need an even number ==> ((low, up) * n_central) of errors")
-        return [[0, 0, args[i], args[i+1]] for i in range(0, len(args), 2)]
-
+        return [[0, 0, args[i], args[i + 1]] for i in range(0, len(args), 2)]

     @staticmethod
     def make_root_asymm(histo_central, error_list, **kwargs):
@@ -1081,8 +1218,7 @@ def make_root_asymm(histo_central, error_list, **kwargs):
         """
         n_bins = histo_central.GetNbinsX()
         if n_bins != len(error_list):
-            get_logger().fatal("Number of bins and number of errors mismatch, %i vs. %i",
-                               n_bins, len(error_list))
+            get_logger().fatal("Number of bins and number of errors mismatch, %i vs. 
%i", n_bins, len(error_list)) rel_x = kwargs.get("rel_x", True) rel_y = kwargs.get("rel_y", True) const_x_err = kwargs.get("const_x_err", None) @@ -1097,10 +1233,8 @@ def make_root_asymm(histo_central, error_list, **kwargs): x_up = array("d", [const_x_err] * n_bins) x_low = array("d", [const_x_err] * n_bins) elif rel_x is True: - x_up = array("d", [err[1] * histo_central.GetBinCenter(b + 1) \ - for b, err in enumerate(error_list)]) - x_low = array("d", [err[0] * histo_central.GetBinCenter(b + 1) \ - for b, err in enumerate(error_list)]) + x_up = array("d", [err[1] * histo_central.GetBinCenter(b + 1) for b, err in enumerate(error_list)]) + x_low = array("d", [err[0] * histo_central.GetBinCenter(b + 1) for b, err in enumerate(error_list)]) else: x_up = array("d", [err[1] for err in error_list]) x_low = array("d", [err[0] for err in error_list]) @@ -1110,10 +1244,8 @@ def make_root_asymm(histo_central, error_list, **kwargs): y_up = array("d", [const_y_err] * n_bins) y_low = array("d", [const_y_err] * n_bins) elif rel_y is True: - y_up = array("d", [err[3] * histo_central.GetBinContent(b + 1) \ - for b, err in enumerate(error_list)]) - y_low = array("d", [err[2] * histo_central.GetBinContent(b + 1) \ - for b, err in enumerate(error_list)]) + y_up = array("d", [err[3] * histo_central.GetBinContent(b + 1) for b, err in enumerate(error_list)]) + y_low = array("d", [err[2] * histo_central.GetBinContent(b + 1) for b, err in enumerate(error_list)]) else: y_up = array("d", [err[3] for err in error_list]) y_low = array("d", [err[2] for err in error_list]) @@ -1128,10 +1260,10 @@ def make_root_asymm_dummy(histo_central): n_bins = histo_central.GetNbinsX() bin_centers = array("d", [histo_central.GetBinCenter(b + 1) for b in range(n_bins)]) bin_contents = array("d", [histo_central.GetBinContent(b + 1) for b in range(n_bins)]) - y_up = array("d", [0.] * n_bins) - y_low = array("d", [0.] * n_bins) - x_up = array("d", [0.] * n_bins) - x_low = array("d", [0.] * n_bins) + y_up = array("d", [0.0] * n_bins) + y_low = array("d", [0.0] * n_bins) + x_up = array("d", [0.0] * n_bins) + x_low = array("d", [0.0] * n_bins) return TGraphAsymmErrors(n_bins, bin_centers, bin_contents, x_low, x_up, y_low, y_up) @@ -1183,24 +1315,24 @@ def define_correlations(self): """ Not yet defined """ - self.logger.warning("Function \"define_correlations\' not yet defined") + self.logger.warning("Function \"define_correlations' not yet defined") def divide(self): """ Not yet defined """ - self.logger.warning("Function \"divide\" not yet defined") + self.logger.warning('Function "divide" not yet defined') def get_total(self): """ Returns a list of total errors For now only add in quadrature and take sqrt """ - tot_list = [[0., 0., 0., 0.] for _ in range(self.n_bins)] + tot_list = [[0.0, 0.0, 0.0, 0.0] for _ in range(self.n_bins)] for _, errors in enumerate(self.errors.values()): for i in range(self.n_bins): for nb in range(len(tot_list[i])): - tot_list[i][nb] += (errors[i][nb] * errors[i][nb]) + tot_list[i][nb] += errors[i][nb] * errors[i][nb] tot_list = np.sqrt(tot_list) return tot_list @@ -1224,15 +1356,18 @@ def get_uncorr_for_lc_average(self): Returns a list of total uncorrelated errors For now only add in quadrature and take sqrt """ - tot_list = [[0., 0., 0., 0.] 
for _ in range(self.n_bins)] + tot_list = [[0.0, 0.0, 0.0, 0.0] for _ in range(self.n_bins)] for j, errors in enumerate(self.errors.values()): for i in range(self.n_bins): for nb in range(len(tot_list[i])): - - if self.names[j] == "yield" or self.names[j] == "cut" \ - or self.names[j] == "pid" or self.names[j] == "branching_ratio" \ - or self.names[j] == "statunceff": - tot_list[i][nb] += (errors[i][nb] * errors[i][nb]) + if ( + self.names[j] == "yield" + or self.names[j] == "cut" + or self.names[j] == "pid" + or self.names[j] == "branching_ratio" + or self.names[j] == "statunceff" + ): + tot_list[i][nb] += errors[i][nb] * errors[i][nb] tot_list = np.sqrt(tot_list) return tot_list @@ -1241,23 +1376,20 @@ def get_total_for_spectra_plot(self, justfd=-99): Returns a list of total errors For now only add in quadrature and take sqrt """ - tot_list = [[0., 0., 0., 0.] for _ in range(self.n_bins)] + tot_list = [[0.0, 0.0, 0.0, 0.0] for _ in range(self.n_bins)] for j, errors in enumerate(self.errors.values()): for i in range(self.n_bins): for nb in range(len(tot_list[i])): - #New since May 2020, add BR in syst boxes + # New since May 2020, add BR in syst boxes if self.names[j] != "sigmav0" and self.names[j] != "feeddown_mult": - if justfd == -99: - tot_list[i][nb] += (errors[i][nb] * errors[i][nb]) + tot_list[i][nb] += errors[i][nb] * errors[i][nb] elif justfd is True: - if self.names[j] == "feeddown_NB" \ - or self.names[j] == "feeddown_mult_spectra": - tot_list[i][nb] += (errors[i][nb] * errors[i][nb]) + if self.names[j] == "feeddown_NB" or self.names[j] == "feeddown_mult_spectra": + tot_list[i][nb] += errors[i][nb] * errors[i][nb] elif justfd is False: - if self.names[j] != "feeddown_NB" \ - and self.names[j] != "feeddown_mult_spectra": - tot_list[i][nb] += (errors[i][nb] * errors[i][nb]) + if self.names[j] != "feeddown_NB" and self.names[j] != "feeddown_mult_spectra": + tot_list[i][nb] += errors[i][nb] * errors[i][nb] else: get_logger().fatal("Option for spectra systematic not valid") diff --git a/machine_learning_hep/utils/hist.py b/machine_learning_hep/utils/hist.py index 99f09ab53a..241b6162e8 100644 --- a/machine_learning_hep/utils/hist.py +++ b/machine_learning_hep/utils/hist.py @@ -1,14 +1,14 @@ -from collections import deque import itertools - import math +from collections import deque + import numpy as np import pandas as pd import ROOT def bin_array(nbins, low, high): - return np.linspace(float(low), float(high), nbins + 1, 'd') + return np.linspace(float(low), float(high), nbins + 1, "d") def get_axis(hist, axis: int): @@ -41,7 +41,7 @@ def get_binrange(hist, axis: int): return (axis.GetFirst(), axis.GetLast()) -def get_nbins(hist, axis:int): +def get_nbins(hist, axis: int): return get_axis(hist, axis).GetNbins() @@ -56,8 +56,8 @@ def project_hist(hist, axes: list, limits: dict[int, tuple[int, int]]): if not hist: raise ValueError if len(axes) == 2: - axes = axes[:] # slice to avoid modifying the list passed as parameter - axes.reverse() # compensation for ROOT signature using ydim, xdim for 2d projection + axes = axes[:] # slice to avoid modifying the list passed as parameter + axes.reverse() # compensation for ROOT signature using ydim, xdim for 2d projection reset = False if isinstance(hist, ROOT.THn): assert len(axes) < hist.GetNdimensions() @@ -69,7 +69,7 @@ def project_hist(hist, axes: list, limits: dict[int, tuple[int, int]]): if bins[0] == 1 and bins[1] == get_nbins(hist, iaxis): get_axis(hist, iaxis).SetBit(ROOT.TAxis.kAxisRange) reset |= bins[1] < bins[0] - hproj = 
hist.Projection(*axes, 'e') if len(axes) < 4 else hist.Projection(len(axes), np.asarray(axes, 'i'), 'e') + hproj = hist.Projection(*axes, "e") if len(axes) < 4 else hist.Projection(len(axes), np.asarray(axes, "i"), "e") for iaxis in limits: get_axis(hist, iaxis).SetRange(*ranges[iaxis]) if reset: @@ -89,7 +89,7 @@ def project_hist(hist, axes: list, limits: dict[int, tuple[int, int]]): reset |= bins[1] < bins[0] proj_spec = "" for axis in axes: - proj_spec += ('x' if axis == 0 else 'y' if axis == 1 else 'z') + proj_spec += "x" if axis == 0 else "y" if axis == 1 else "z" hproj = hist.Project3D(proj_spec) for iaxis in limits: get_axis(hist, iaxis).SetRange(*ranges[iaxis]) @@ -120,10 +120,10 @@ def project_hist(hist, axes: list, limits: dict[int, tuple[int, int]]): def create_hist(name, title, *bin_specs): """Create ROOT histogram from standard bin specifications or arrays""" rhist = {1: ROOT.TH1F, 2: ROOT.TH2F, 3: ROOT.TH3F, 4: ROOT.THnF} - var_bins = [hasattr(spec, '__len__') for spec in bin_specs] - assert all(var_bins) or not any(var_bins), f'either all bins must be variable or fixed width: {bin_specs=}' + var_bins = [hasattr(spec, "__len__") for spec in bin_specs] + assert all(var_bins) or not any(var_bins), f"either all bins must be variable or fixed width: {bin_specs=}" dim = len(bin_specs) if all(var_bins) else len(bin_specs) / 3 - assert dim in range(1, 12), 'only dimensions from 1 to 10 are supported' + assert dim in range(1, 12), "only dimensions from 1 to 10 are supported" if all(var_bins): nbins = list(map(lambda a: len(a) - 1, bin_specs)) @@ -135,14 +135,15 @@ def create_hist(name, title, *bin_specs): return rhist[min(dim, 4)](name, title, *bin_specs) if all(var_bins): - nbins = np.asarray(nbins, 'i') + nbins = np.asarray(nbins, "i") return rhist[min(dim, 4)](name, title, dim, nbins, bin_specs) raise NotImplementedError + # TODO: generalize which columns can contain arrays # pylint: disable=too-many-branches -def fill_hist(hist, dfi: pd.DataFrame, weights = None, arraycols = None, write = False): +def fill_hist(hist, dfi: pd.DataFrame, weights=None, arraycols=None, write=False): """ Fill histogram from dataframe @@ -154,8 +155,8 @@ def fill_hist(hist, dfi: pd.DataFrame, weights = None, arraycols = None, write = """ dim_hist = hist.GetDimension() if isinstance(hist, ROOT.TH1) else hist.GetNdimensions() dim_df = dfi.shape[1] if dfi.ndim > 1 else dfi.ndim - assert dim_df in range(1, 12), f'{dim_df} not supported' - assert dim_df == dim_hist, 'dimensions of df and histogram do not match' + assert dim_df in range(1, 12), f"{dim_df} not supported" + assert dim_df == dim_hist, "dimensions of df and histogram do not match" if len(dfi) == 0: return if dim_hist == 1: @@ -167,37 +168,43 @@ def fill_hist(hist, dfi: pd.DataFrame, weights = None, arraycols = None, write = if not arraycols: hist.FillN(len(dfi), np.float64(dfi.iloc[:, 0]), np.float64(dfi.iloc[:, 1]), weights or ROOT.nullptr) else: - assert weights is None, 'weights not supported' + assert weights is None, "weights not supported" dfi.apply(lambda row: [hist.Fill(row.iloc[0], v) for v in row.iloc[1]], axis=1) elif dim_hist == 3: # TODO: why does TH3 not support FillN? 
# hist.FillN(len(dfi), np.float64(dfi.iloc[:, 0]), np.float64(dfi.iloc[:, 1]), np.float64(dfi.iloc[:, 2]),
         #               weights or np.float64(len(dfi)*[1.]))
-        assert weights is None, 'weights not supported'
+        assert weights is None, "weights not supported"
         if not arraycols:
             dfi.apply(lambda row: hist.Fill(row.iloc[0], row.iloc[1], row.iloc[2]), axis=1)
         else:
-            assert arraycols == [1, 2], 'other cases not yet implemented'
-            dfi.apply(lambda row: [hist.Fill(row.iloc[0], v[0], v[1])
-                      for v in zip(row.iloc[i] for i in arraycols)], axis=1)
+            assert arraycols == [1, 2], "other cases not yet implemented"
+            dfi.apply(
+                lambda row: [hist.Fill(row.iloc[0], v[0], v[1]) for v in zip(*(row.iloc[i] for i in arraycols))],
+                axis=1,
+            )
     elif dim_hist > 3:
-        assert weights is None, 'weights not supported'
+        assert weights is None, "weights not supported"
         if not arraycols:
-            dfi.apply(lambda row: hist.Fill(np.array(row, 'd'), 1.), axis=1)
+            dfi.apply(lambda row: hist.Fill(np.array(row, "d"), 1.0), axis=1)
         else:
             m = [-1] * dim_hist
             idx = 0
             for i in arraycols:
                 m[i] = idx
                 idx += 1
+
             def fill_row(row):
                 # for v in zip(*[row.iloc[i] for i in arraycols]):
                 #     hist.Fill(np.asarray([row.iloc[i] if i not in arraycols else v[m[i]]
                 #                           for i in range(dim_hist)], 'd'))
-                gen = (hist.Fill(np.asarray([row.iloc[i] if i not in arraycols else v[m[i]]
-                                 for i in range(dim_hist)], 'd'))
-                       for v in zip(*[row.iloc[i] for i in arraycols]))
+                gen = (
+                    hist.Fill(
+                        np.asarray([row.iloc[i] if i not in arraycols else v[m[i]] for i in range(dim_hist)], "d")
+                    )
+                    for v in zip(*[row.iloc[i] for i in arraycols])
+                )
                 deque(gen, maxlen=0)
+
             dfi.apply(fill_row, axis=1)
     if write:
         hist.Write()
@@ -210,21 +217,23 @@ def fill_hist_fast(hist, dfi, write=False):
     """
     dim_hist = hist.GetDimension() if isinstance(hist, ROOT.TH1) else hist.GetNdimensions()
     dim_df = dfi.shape[1] if dfi.ndim > 1 else dfi.ndim
-    assert dim_df in range(3, 4), f'{dim_df} not supported'
-    assert dim_df == dim_hist, 'dimensions of df and histogram do not match'
-    bin_it = [range(get_nbins(hist, i)+2) for i in range(get_dim(hist))]
+    assert dim_df in range(3, 4), f"{dim_df} not supported"
+    assert dim_df == dim_hist, "dimensions of df and histogram do not match"
+    bin_it = [range(get_nbins(hist, i) + 2) for i in range(get_dim(hist))]
     for binids in itertools.product(*bin_it):
         df = dfi
         for i in range(get_dim(hist)):
             if binids[i] == 0:  # underflow
-                df = df.loc[df.iloc[:, i] < get_axis(hist, i).GetXmin()]
+                df = df.loc[df.iloc[:, i] < get_axis(hist, i).GetXmin()]
             elif binids[i] == (get_nbins(hist, i) + 1):  # overflow
                 df = df.loc[df.iloc[:, i] >= get_axis(hist, i).GetXmax()]
             else:
-                df = df.loc[(df.iloc[:, i] >= get_axis(hist, i).GetBinLowEdge(binids[i])) &
-                            (df.iloc[:, i] < get_axis(hist, i).GetBinUpEdge(binids[i]))]
+                df = df.loc[
+                    (df.iloc[:, i] >= get_axis(hist, i).GetBinLowEdge(binids[i]))
+                    & (df.iloc[:, i] < get_axis(hist, i).GetBinUpEdge(binids[i]))
+                ]
         hist.SetBinContent(*binids, len(df))
     if write:
         hist.Write()
@@ -236,7 +245,7 @@ def scale_bin(hist, factor, *bin_indices):
     hist.SetBinError(*bin_indices, hist.GetBinError(*bin_indices) * factor)


-def sum_hists(hists, name = None):
+def sum_hists(hists, name=None):
     """
     Return histogram with sum of all histograms from iterable
     """
@@ -245,7 +254,7 @@ def sum_hists(hists, name = None):
         if h is None:
             continue
         if hist is None:
-            hist = h.Clone(name or (h.GetName() + '_cloned'))
+            hist = h.Clone(name or (h.GetName() + "_cloned"))
         else:
             hist.Add(h)
     return hist
@@ -256,18 +265,17 @@ def ensure_sumw2(hist):
         if hist.GetSumw2N() < 1:
             hist.Sumw2() 
elif isinstance(hist, ROOT.THn): - if hist.GetSumw2() < 0.: + if hist.GetSumw2() < 0.0: hist.Sumw2() else: raise NotImplementedError - def get_bin_val(hist, hbin): if isinstance(hist, ROOT.TH1): return hist.GetBinContent(*hbin) if isinstance(hist, ROOT.THn): - return hist.GetBinContent(np.array(hbin, 'i')) + return hist.GetBinContent(np.array(hbin, "i")) raise NotImplementedError @@ -275,7 +283,7 @@ def get_bin_err(hist, hbin): if isinstance(hist, ROOT.TH1): return hist.GetBinError(*hbin) if isinstance(hist, ROOT.THn): - return hist.GetBinError(np.array(hbin, 'i')) + return hist.GetBinError(np.array(hbin, "i")) raise NotImplementedError @@ -283,7 +291,7 @@ def set_bin_val(hist, hbin, val): if isinstance(hist, ROOT.TH1): return hist.SetBinContent(*hbin, val) if isinstance(hist, ROOT.THn): - return hist.SetBinContent(np.array(hbin, 'i'), val) + return hist.SetBinContent(np.array(hbin, "i"), val) raise NotImplementedError @@ -291,20 +299,21 @@ def set_bin_err(hist, hbin, val): if isinstance(hist, ROOT.TH1): return hist.SetBinError(*hbin, val) if isinstance(hist, ROOT.THn): - return hist.SetBinError(np.array(hbin, 'i'), val) + return hist.SetBinError(np.array(hbin, "i"), val) raise NotImplementedError def norm_response(response, dim_out): response_norm = response.Clone() - for bin_in in itertools.product(*(range(1, get_nbins(response_norm, iaxis) + 1) - for iaxis in range(dim_out, get_dim(response_norm)))): + for bin_in in itertools.product( + *(range(1, get_nbins(response_norm, iaxis) + 1) for iaxis in range(dim_out, get_dim(response_norm))) + ): for iaxis, val in enumerate(bin_in, dim_out): get_axis(response_norm, iaxis).SetRange(val, val) norm = response_norm.Projection(0).Integral() - if np.isclose(norm, 0.): + if np.isclose(norm, 0.0): continue - for bin_out in itertools.product(*(range(1, get_nbins(response_norm, i)+1) for i in range(dim_out))): + for bin_out in itertools.product(*(range(1, get_nbins(response_norm, i) + 1) for i in range(dim_out))): set_bin_val(response_norm, bin_out + bin_in, get_bin_val(response_norm, bin_out + bin_in) / norm) set_bin_err(response_norm, bin_out + bin_in, get_bin_err(response_norm, bin_out + bin_in) / norm) return response_norm @@ -314,14 +323,14 @@ def fold_hist(hist, response): """Fold hist with response""" assert get_dim(response) > get_dim(hist) dim_out = get_dim(response) - get_dim(hist) - axes_spec = list(np.array(get_axis(response, i).GetXbins(), 'd') for i in range(dim_out)) - hfold = create_hist('test', 'test', *axes_spec) - for bin_out in itertools.product(*(range(1, get_nbins(hfold, i)+1) for i in range(get_dim(hfold)))): - val = 0. - err = 0. 
- for bin_in in itertools.product(*(range(1, get_nbins(hist, i)+1) for i in range(get_dim(hist)))): + axes_spec = list(np.array(get_axis(response, i).GetXbins(), "d") for i in range(dim_out)) + hfold = create_hist("test", "test", *axes_spec) + for bin_out in itertools.product(*(range(1, get_nbins(hfold, i) + 1) for i in range(get_dim(hfold)))): + val = 0.0 + err = 0.0 + for bin_in in itertools.product(*(range(1, get_nbins(hist, i) + 1) for i in range(get_dim(hist)))): val += get_bin_val(hist, bin_in) * get_bin_val(response, bin_out + bin_in) - err += get_bin_err(hist, bin_in)**2 * get_bin_val(response, bin_out + bin_in)**2 + err += get_bin_err(hist, bin_in) ** 2 * get_bin_val(response, bin_out + bin_in) ** 2 set_bin_val(hfold, bin_out, val) set_bin_err(hfold, bin_out, math.sqrt(err)) return hfold diff --git a/machine_learning_hep/validation/find_duplicates_events.py b/machine_learning_hep/validation/find_duplicates_events.py index 0de04a806c..64f47e99e2 100644 --- a/machine_learning_hep/validation/find_duplicates_events.py +++ b/machine_learning_hep/validation/find_duplicates_events.py @@ -13,61 +13,62 @@ ############################################################################# import multiprocessing as mp -from glob import glob import pickle -from lz4 import frame # pylint: disable=unused-import +from glob import glob import yaml +from lz4 import frame # pylint: disable=unused-import -from machine_learning_hep.utilities import openfile -from machine_learning_hep.io import dump_yaml_from_dict from machine_learning_hep.do_variations import modify_dictionary +from machine_learning_hep.io import dump_yaml_from_dict +from machine_learning_hep.utilities import openfile def read_database(path, overwrite_path=None): data_param = None - with open(path, 'r') as param_config: + with open(path, "r") as param_config: data_param = yaml.load(param_config, Loader=yaml.FullLoader) case = list(data_param.keys())[0] data_param = data_param[case] if overwrite_path: overwrite_db = None - with open(overwrite_path, 'r') as param_config: + with open(overwrite_path, "r") as param_config: overwrite_db = yaml.load(param_config, Loader=yaml.FullLoader) modify_dictionary(data_param, overwrite_db) return case, data_param + def _callback(exept_msg): print(exept_msg) + def multi_proc(function, argument_list, kw_argument_list, maxperchunk, max_n_procs=10): - chunks_args = [argument_list[x:x+maxperchunk] \ - for x in range(0, len(argument_list), maxperchunk)] + chunks_args = [argument_list[x : x + maxperchunk] for x in range(0, len(argument_list), maxperchunk)] if not kw_argument_list: kw_argument_list = [{} for _ in argument_list] - chunks_kwargs = [kw_argument_list[x:x+maxperchunk] \ - for x in range(0, len(kw_argument_list), maxperchunk)] + chunks_kwargs = [kw_argument_list[x : x + maxperchunk] for x in range(0, len(kw_argument_list), maxperchunk)] res_list = [] for chunk_args, chunk_kwargs in zip(chunks_args, chunks_kwargs): print("Processing new chunck size=", maxperchunk) pool = mp.Pool(max_n_procs) - res = [pool.apply_async(function, args=args, kwds=kwds, error_callback=_callback) \ - for args, kwds in zip(chunk_args, chunk_kwargs)] + res = [ + pool.apply_async(function, args=args, kwds=kwds, error_callback=_callback) + for args, kwds in zip(chunk_args, chunk_kwargs) + ] pool.close() pool.join() res_list.extend(res) try: res_list = [r.get() for r in res_list] - except Exception as e: # pylint: disable=broad-except + except Exception as e: # pylint: disable=broad-except print("EXCEPTION") print(e) return 
res_list def check_duplicates(file_path, cols): - """Open dataframe and check for duplicates - """ + """Open dataframe and check for duplicates""" df = pickle.load(openfile(file_path, "rb"))[cols] len_orig = len(df) @@ -76,12 +77,13 @@ def check_duplicates(file_path, cols): return len_orig, len_dupl, df_dupl + ########################### # MAIN # ########################### # BASICALLY THESE HAVE TO BE ADJUSTED -DATABASE_PATH = "/home/bvolkel/HF/MachineLearningHEP/machine_learning_hep/data/data_prod_20200304/database_ml_parameters_LcpK0spp_0304.yml" # pylint: disable=line-too-long +DATABASE_PATH = "/home/bvolkel/HF/MachineLearningHEP/machine_learning_hep/data/data_prod_20200304/database_ml_parameters_LcpK0spp_0304.yml" # pylint: disable=line-too-long # Summary YAML will be written to this one # Check "has_duplicates" to find all files with duplictates and the dupl/all ratio @@ -95,7 +97,7 @@ def check_duplicates(file_path, cols): UNIQUE_COLS = ["ev_id", "ev_id_ext", "run_number"] # Run over mc and/or data, like this automatically over data and MC -DATA_MC = ("mc",) # "data") # ("mc",) ("data",) +DATA_MC = ("mc",) # "data") # ("mc",) ("data",) ################################# @@ -107,13 +109,11 @@ def check_duplicates(file_path, cols): FILE_NAME = DATABASE["files_names"]["namefile_evtorig"] - DUPLICATES_SUMMARY = {} for dm in DATA_MC: DUPLICATES_SUMMARY[dm] = {} - for period, dir_applied in zip(DATABASE["multi"][dm]["period"], - DATABASE["multi"][dm]["pkl"]): + for period, dir_applied in zip(DATABASE["multi"][dm]["period"], DATABASE["multi"][dm]["pkl"]): print(f"Process {dm} of period {period}") DUPLICATES_SUMMARY[dm][period] = {} files_all = glob(f"{dir_applied}/**/{FILE_NAME}", recursive=True) @@ -128,28 +128,27 @@ def check_duplicates(file_path, cols): for child in children: files_child = [f for f in files_all if f"/{child}/" in f] args = [] - for f in files_child: + for f in files_child: args.append((f, UNIQUE_COLS)) duplicates = multi_proc(check_duplicates, args, None, 500, 40) - duplicates_ratio = [d[1] / d[0] * 100 if d[0] > 0 else 0. for d in duplicates] + duplicates_ratio = [d[1] / d[0] * 100 if d[0] > 0 else 0.0 for d in duplicates] if EXTRACT_DUPL_INFO: duplicates_cols = [] for d in duplicates: duplicates_cols_this_df = [] for _, row in d[2].iterrows(): - duplicates_cols_this_df.append([float(row[col_name]) \ - for col_name in UNIQUE_COLS]) + duplicates_cols_this_df.append([float(row[col_name]) for col_name in UNIQUE_COLS]) duplicates_cols.append(duplicates_cols_this_df) else: duplicates_cols = [None] * len(duplicates) - has_duplicates = [dr > 0. 
for dr in duplicates_ratio] - DUPLICATES_SUMMARY[dm][period][child] = \ - [{"file": df, "dupl_ratio": dr, "has_duplicates": hd, "duplicates": dc} \ - for df, dr, hd, dc \ - in zip(files_child, duplicates_ratio, has_duplicates, duplicates_cols)] + has_duplicates = [dr > 0.0 for dr in duplicates_ratio] + DUPLICATES_SUMMARY[dm][period][child] = [ + {"file": df, "dupl_ratio": dr, "has_duplicates": hd, "duplicates": dc} + for df, dr, hd, dc in zip(files_child, duplicates_ratio, has_duplicates, duplicates_cols) + ] dump_yaml_from_dict(DUPLICATES_SUMMARY, SUMMARY_FILE) diff --git a/machine_learning_hep/validation/validation.py b/machine_learning_hep/validation/validation.py index dde42fadf3..dc214c34cb 100644 --- a/machine_learning_hep/validation/validation.py +++ b/machine_learning_hep/validation/validation.py @@ -16,8 +16,8 @@ Script base function for validation histograms """ -from machine_learning_hep.utilities_plot import makefill1dhist, makefill2dhist from machine_learning_hep.logger import get_logger +from machine_learning_hep.utilities_plot import makefill1dhist, makefill2dhist class ValidationCollection: @@ -44,6 +44,7 @@ def make_and_fill(self, binx, namex, biny=None, namey=None): """ Makes histogram and fills them based on their axis titles """ + def column_exists(col_name, axis_name): if col_name not in self.source_dataframe: msg = f"Columns {col_name} for {axis_name} axis does not exist in dataframe: " @@ -63,15 +64,13 @@ def column_exists(col_name, axis_name): return h_name = f"hVal_{namex}_vs_{namey}{self.collection_tag}" h_tit = f" ; {namex} ; {namey}" - h = makefill2dhist(self.source_dataframe, h_name, - binx, biny, namex, namey) + h = makefill2dhist(self.source_dataframe, h_name, binx, biny, namex, namey) h.SetTitle(h_tit) else: # Check that column exists h_name = f"hVal_{namex}{self.collection_tag}" h_tit = f" ; {namex} ; Entries" - h = makefill1dhist(self.source_dataframe, - h_name, h_tit, binx, namex) + h = makefill1dhist(self.source_dataframe, h_name, h_tit, binx, namex) if self.verbose: get_logger().info("Filling histogram %s", h.GetName()) self.histograms.append(h) diff --git a/machine_learning_hep/validation/validation_candidates.py b/machine_learning_hep/validation/validation_candidates.py index 5c825302e1..193d7f3214 100644 --- a/machine_learning_hep/validation/validation_candidates.py +++ b/machine_learning_hep/validation/validation_candidates.py @@ -16,7 +16,7 @@ Script containing validation histograms on the candidate granularity """ -from machine_learning_hep.utilities_plot import buildbinning, buildarray +from machine_learning_hep.utilities_plot import buildarray, buildbinning from machine_learning_hep.validation.validation import ValidationCollection @@ -56,14 +56,12 @@ def fill_validation_candidates(df_reco, tag=""): val.make_and_fill(binning_phi, "phi_cand", *yaxis) # Invariant mass - val.make_and_fill(binning_inv_mass, "inv_mass", - binning_v0m_perc, "perc_v0m") - val.make_and_fill(binning_inv_mass, "inv_mass", - binning_ntrklt, "n_tracklets_corr") + val.make_and_fill(binning_inv_mass, "inv_mass", binning_v0m_perc, "perc_v0m") + val.make_and_fill(binning_inv_mass, "inv_mass", binning_ntrklt, "n_tracklets_corr") for i, j in enumerate(binning_pt[0:-1]): # Defining pT interval lower_pt = j - upper_pt = binning_pt[i+1] + upper_pt = binning_pt[i + 1] pt_interval = "_pt_cand_{:.1f}-{:.1f}".format(lower_pt, upper_pt) # Cutting the DF in the pT interval df_ptcut = df_reco[df_reco.pt_cand > lower_pt] @@ -71,9 +69,7 @@ def fill_validation_candidates(df_reco, tag=""): # 
Resetting validation collection to use the pT cut DF val.reset_input(df_ptcut, tag=tag + pt_interval) # Filling histograms with inv mass and multiplicity - val.make_and_fill(binning_inv_mass, "inv_mass", - binning_v0m_perc, "perc_v0m") - val.make_and_fill(binning_inv_mass, "inv_mass", - binning_ntrklt, "n_tracklets_corr") + val.make_and_fill(binning_inv_mass, "inv_mass", binning_v0m_perc, "perc_v0m") + val.make_and_fill(binning_inv_mass, "inv_mass", binning_ntrklt, "n_tracklets_corr") return val diff --git a/machine_learning_hep/validation/validation_multiplicity.py b/machine_learning_hep/validation/validation_multiplicity.py index 4e23c251a8..ef2d05f5ed 100644 --- a/machine_learning_hep/validation/validation_multiplicity.py +++ b/machine_learning_hep/validation/validation_multiplicity.py @@ -17,8 +17,8 @@ """ from machine_learning_hep.bitwise import filter_bit_df -from machine_learning_hep.validation.validation import ValidationCollection from machine_learning_hep.utilities_plot import buildbinning +from machine_learning_hep.validation.validation import ValidationCollection def fill_validation_multiplicity(dfevt, dfevtevtsel, df_reco): @@ -54,12 +54,9 @@ def do_mult_plots(): val.reset_input(dfevtevtsel, "_EvtSel") do_mult_plots() - val.make_and_fill(binning_ntrklt, "n_tracklets", - binning_ntrklt, "n_tracklets_corr") - val.make_and_fill(binning_zvtx, "z_vtx_reco", - binning_ntrklt, "n_tracklets_corr") - val.make_and_fill(binning_zvtx, "z_vtx_reco", - binning_ntrklt, "n_tracklets") + val.make_and_fill(binning_ntrklt, "n_tracklets", binning_ntrklt, "n_tracklets_corr") + val.make_and_fill(binning_zvtx, "z_vtx_reco", binning_ntrklt, "n_tracklets_corr") + val.make_and_fill(binning_zvtx, "z_vtx_reco", binning_ntrklt, "n_tracklets") val.make_and_fill(binning_ntrklt, "n_tracklets_corr") val.make_and_fill(binning_ntrklt, "n_tracklets_corr_shm") @@ -69,12 +66,9 @@ def do_mult_plots(): # val.reset_input(dfevtevtsel.query("is_ev_sel_shm == 1"), "spd") # val.make_and_fill(binning_ntrklt, "n_tracklets_corr") - df_reco["n_tracklets_corr-n_tracklets_corr_sub"] = ( - df_reco["n_tracklets_corr"] - df_reco["n_tracklets_corr_sub"] - ) + df_reco["n_tracklets_corr-n_tracklets_corr_sub"] = df_reco["n_tracklets_corr"] - df_reco["n_tracklets_corr_sub"] - df_reco_list = [[df_reco, ""], - [df_reco[df_reco.is_ev_rej_INT7 == 0], "MB"]] + df_reco_list = [[df_reco, ""], [df_reco[df_reco.is_ev_rej_INT7 == 0], "MB"]] if "is_ev_sel_shm" in df_reco: df_reco_list.append([df_reco.query("is_ev_sel_shm == 1"), "HMSPD"]) for i in df_reco_list: @@ -85,11 +79,7 @@ def do_mult_plots(): binning_ntrklt_diff, "n_tracklets_corr-n_tracklets_corr_sub", ) - val.make_and_fill( - binning_ntrklt, "n_tracklets_corr_sub", binning_ntrklt, "n_tracklets_corr" - ) - val.make_and_fill( - binning_ntrklt, "n_tracklets_corr", binning_ntrklt, "n_tracklets_corr_sub" - ) + val.make_and_fill(binning_ntrklt, "n_tracklets_corr_sub", binning_ntrklt, "n_tracklets_corr") + val.make_and_fill(binning_ntrklt, "n_tracklets_corr", binning_ntrklt, "n_tracklets_corr_sub") return val diff --git a/machine_learning_hep/vary_bdt.py b/machine_learning_hep/vary_bdt.py index 469e797ad8..1c0d9d238b 100644 --- a/machine_learning_hep/vary_bdt.py +++ b/machine_learning_hep/vary_bdt.py @@ -21,18 +21,18 @@ def main(): print_default = False dic_cuts = { - "d0" : { + "d0": { "string": "mlBkgScore < %g", - "cuts_default" : [0.02, 0.02, 0.02, 0.05, 0.06, 0.08, 0.08, 0.10, 0.10, 0.20, 0.25, 0.30], # default - "cuts_min" : [0.008, 0.008, 0.0087, 0.017, 0.024, 0.031, 0.028, 0.042, 
0.038, 0.052, 0.067, 0.060], # tight - "cuts_max" : [0.045, 0.053, 0.054, 0.19, 0.22, 0.33, 0.46, 0.38, 0.50, 0.50, 0.50, 0.50] # loose + "cuts_default": [0.02, 0.02, 0.02, 0.05, 0.06, 0.08, 0.08, 0.10, 0.10, 0.20, 0.25, 0.30], # default + "cuts_min": [0.008, 0.008, 0.0087, 0.017, 0.024, 0.031, 0.028, 0.042, 0.038, 0.052, 0.067, 0.060], # tight + "cuts_max": [0.045, 0.053, 0.054, 0.19, 0.22, 0.33, 0.46, 0.38, 0.50, 0.50, 0.50, 0.50], # loose }, "lc": { - "string" : "mlPromptScore > %g", - "cuts_default" : [0.97, 0.9, 0.9, 0.85, 0.85, 0.8, 0.8, 0.6, 0.6], # default - "cuts_min" : [0.961, 0.83, 0.84, 0.74, 0.74, 0.62, 0.63, 0.15, 0.15], # loose - "cuts_max" : [0.978, 0.94, 0.937, 0.915, 0.91, 0.89, 0.88, 0.85, 0.85] # tight - } + "string": "mlPromptScore > %g", + "cuts_default": [0.97, 0.9, 0.9, 0.85, 0.85, 0.8, 0.8, 0.6, 0.6], # default + "cuts_min": [0.961, 0.83, 0.84, 0.74, 0.74, 0.62, 0.63, 0.15, 0.15], # loose + "cuts_max": [0.978, 0.94, 0.937, 0.915, 0.91, 0.89, 0.88, 0.85, 0.85], # tight + }, } def format_list(str_format: str, values: list): diff --git a/machine_learning_hep/workflow/workflow_base.py b/machine_learning_hep/workflow/workflow_base.py index b5ad9b32db..57929513c7 100644 --- a/machine_learning_hep/workflow/workflow_base.py +++ b/machine_learning_hep/workflow/workflow_base.py @@ -14,28 +14,35 @@ from functools import reduce from os.path import join + # pylint: disable=import-error, no-name-in-module from ROOT import gStyle + # HF specific imports from machine_learning_hep.logger import get_logger + # pylint: disable=too-few-public-methods class WorkflowBase: """ Base class for all workflows related classes including systematics """ + species = "workflow_base" - def __init__(self, datap, case, typean, period=None): + def __init__(self, datap, case, typean, period=None): self.logger = get_logger() self.datap = datap self.case = case self.typean = typean self.period = period - def cfg(self, param, default = None): - return reduce(lambda d, key: d.get(key, default) if isinstance(d, dict) else default, - param.split("."), self.datap['analysis'][self.typean]) + def cfg(self, param, default=None): + return reduce( + lambda d, key: d.get(key, default) if isinstance(d, dict) else default, + param.split("."), + self.datap["analysis"][self.typean], + ) @staticmethod def loadstyle(): @@ -46,7 +53,6 @@ def loadstyle(): gStyle.SetCanvasColor(0) gStyle.SetFrameFillColor(0) - @staticmethod def make_pre_suffix(args): """ @@ -62,7 +68,6 @@ def make_pre_suffix(args): args = [str(a) for a in args] return "_".join(args) - @staticmethod def make_file_path(directory, filename, extension, prefix=None, suffix=None): if prefix is not None: @@ -72,7 +77,6 @@ def make_file_path(directory, filename, extension, prefix=None, suffix=None): extension = extension.replace(".", "") return join(directory, filename + "." + extension) - def step(self, step: str): """ Given a workflow steps as string, find the corresponding method and call it. 
@@ -82,14 +86,12 @@ def step(self, step: str): True if the step was found and executed, False otherwise """ if not hasattr(self, step): - self.logger.error("Could not run workflow step %s for workflow %s", step, - self.__class__.__name__) + self.logger.error("Could not run workflow step %s for workflow %s", step, self.__class__.__name__) return False self.logger.info("Run workflow step %s for workflow %s", step, self.__class__.__name__) getattr(self, step)() return True - def get_after_burner(self): """ Return an after-burner object to be run after per-period workflow steps, OPTIONAL diff --git a/run_hfjets.py b/run_hfjets.py index 10e13a3f1d..7111667611 100755 --- a/run_hfjets.py +++ b/run_hfjets.py @@ -16,35 +16,40 @@ import sys parser = argparse.ArgumentParser() -parser.add_argument('--case', '-c', default='d0jet') -parser.add_argument('--analysis', '-a', default='jet_obs') -parser.add_argument('--steps', '-s', nargs='+', default=['analyzer']) -parser.add_argument('--interactive', '-i', action='store_true') -parser.add_argument('--delete', '-d', action='store_true') +parser.add_argument("--case", "-c", default="d0jet") +parser.add_argument("--analysis", "-a", default="jet_obs") +parser.add_argument("--steps", "-s", nargs="+", default=["analyzer"]) +parser.add_argument("--interactive", "-i", action="store_true") +parser.add_argument("--delete", "-d", action="store_true") # parser.add_argument('--dryrun', '-n', action='store_true') args = parser.parse_args() match args.case: - case 'jet': - DB = 'machine_learning_hep/data/data_run3/database_ml_parameters_Jet_pp.yml' - case 'd0jet': - DB = 'machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp.yml' + case "jet": + DB = "machine_learning_hep/data/data_run3/database_ml_parameters_Jet_pp.yml" + case "d0jet": + DB = "machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp.yml" # DB = 'machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp_fitting_rebin_0.yml' # DB = 'machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp_fitting_rebin_1.yml' # DB = 'machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp_fitting_rebin_2.yml' # DB = 'machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp_fitting_bkgfunc.yml' - case 'd0jetr2': - DB = 'machine_learning_hep/data/data_run3/database_ml_parameters_D0pp_jet_run2cmp.yml' - case 'lcjet': - DB = 'machine_learning_hep/data/data_run3/database_ml_parameters_LcJet_pp.yml' - case 'jpsijet': - DB = 'machine_learning_hep/data/data_run3/database_ml_parameters_JPsiJet_pp.yml' + case "d0jetr2": + DB = "machine_learning_hep/data/data_run3/database_ml_parameters_D0pp_jet_run2cmp.yml" + case "lcjet": + DB = "machine_learning_hep/data/data_run3/database_ml_parameters_LcJet_pp.yml" + case "jpsijet": + DB = "machine_learning_hep/data/data_run3/database_ml_parameters_JPsiJet_pp.yml" case _: - print(f'Unknown case <{args.case}>') + print(f"Unknown case <{args.case}>") sys.exit(-1) for step in args.steps: - subprocess.run(f'mlhep -r machine_learning_hep/submission/{step}.yml ' + - f'-d {DB} {"-b" if not args.interactive else ""} ' + - f'-a {args.analysis} {"--delete" if args.delete else ""}', - shell=True, stdout=sys.stdout, stderr=sys.stderr, check=True) + subprocess.run( + f"mlhep -r machine_learning_hep/submission/{step}.yml " + + f"-d {DB} {'-b' if not args.interactive else ''} " + + f"-a {args.analysis} {'--delete' if args.delete else ''}", + shell=True, + stdout=sys.stdout, + stderr=sys.stderr, + check=True, + )
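
Note on the weighting logic reformatted in `average_pkpi_pk0s`: it is an inverse-variance (BLUE-style) average of the Lc -> pKpi and Lc -> pK0s measurements. Per pT bin it fills a 2x2 block of `mtotw` with the relative statistical and uncorrelated systematic uncertainties on the diagonal and the branching-ratio term, scaled by the 0.5 correlation coefficient from `correlationbrpp`, off the diagonal, inverts the matrix, and turns row sums into per-measurement weights. Below is a minimal self-contained sketch of the same idea, reduced to a single 2x2 block. It is an illustration, not a drop-in replica of the framework code: all input numbers are hypothetical, and this variant also keeps the BR variance on the diagonal.

```python
import numpy as np


def combine_two(meas, stat_abs, syst_rel, br_rel=(0.05, 0.05), rho_br=0.5):
    """Inverse-variance average of two measurements of the same quantity.

    meas: central values; stat_abs: absolute statistical uncertainties;
    syst_rel: relative uncorrelated systematic uncertainties; br_rel: relative
    branching-ratio uncertainties, correlated with coefficient rho_br.
    All arguments here are hypothetical illustration values.
    """
    meas = np.asarray(meas, dtype=float)
    stat_abs = np.asarray(stat_abs, dtype=float)
    rel_stat = stat_abs / meas
    syst_rel = np.asarray(syst_rel, dtype=float)
    br_rel = np.asarray(br_rel, dtype=float)

    # Covariance of relative uncertainties: stat and uncorrelated syst on the
    # diagonal, the partially correlated BR term filling the off-diagonal.
    cov = np.diag(rel_stat**2 + syst_rel**2 + br_rel**2)
    cov[0, 1] = cov[1, 0] = rho_br * br_rel[0] * br_rel[1]

    # BLUE weights: w = C^-1 1 / (1^T C^-1 1); they sum to 1 by construction.
    cinv = np.linalg.inv(cov)
    weights = cinv.sum(axis=1) / cinv.sum()

    value = float(weights @ meas)
    stat_comb = float(np.sqrt(np.sum((weights * stat_abs) ** 2)))
    return value, stat_comb, weights


# Two compatible yields, the first more precise: it should receive the larger weight.
print(combine_two(meas=(100.0, 104.0), stat_abs=(5.0, 8.0), syst_rel=(0.04, 0.06)))
```

The combined statistical uncertainty follows the same quadrature-of-weighted-errors pattern as `average_statunc` in the hunk above.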
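Note on `machine_learning_hep/utils/hist.py`: the reformatted helpers form a small histogram API around ROOT (`bin_array`, `create_hist`, `fill_hist`, `project_hist`). The sketch below shows the intended call pattern, assuming a working ROOT plus pandas installation; the histogram name, dataframe columns, and binnings are invented for illustration.

```python
import numpy as np
import pandas as pd

from machine_learning_hep.utils.hist import bin_array, create_hist, fill_hist, project_hist

# bin_array(nbins, low, high) returns the nbins + 1 bin edges as a numpy array.
mass_bins = bin_array(40, 1.7, 2.1)
pt_bins = bin_array(10, 0.0, 10.0)
z_bins = bin_array(5, 0.0, 1.0)

# Three variable-width axes -> a TH3F under the hood (THnF starts at four axes).
h3 = create_hist("h_mass_pt_z", ";mass;pt;z", mass_bins, pt_bins, z_bins)

# Toy dataframe; the column order must match the axis order of the histogram.
rng = np.random.default_rng(42)
df = pd.DataFrame(
    {
        "inv_mass": rng.normal(1.86, 0.01, 1000),
        "pt_jet": rng.exponential(2.0, 1000),
        "z": rng.uniform(0.0, 1.0, 1000),
    }
)
fill_hist(h3, df)

# 1-D projection onto axis 0 (mass) with axis 2 (z) restricted to bins 2-4,
# following the project_hist(hist, axes, limits) signature from the patch.
h_mass = project_hist(h3, [0], {2: (2, 4)})
print(h_mass.GetEntries(), h_mass.Integral())
```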
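Note on `WorkflowBase.cfg` in `machine_learning_hep/workflow/workflow_base.py`: it resolves a dotted parameter path against the nested analysis dictionary with `functools.reduce`, returning `default` as soon as a key is missing or a non-dict value is reached. A standalone toy version with a hypothetical config dictionary shows the mechanics:

```python
from functools import reduce


def cfg_lookup(tree: dict, param: str, default=None):
    """Dotted-path lookup mirroring WorkflowBase.cfg (toy stand-in for illustration)."""
    return reduce(
        # Descend one key per step; bail out to the default once the path breaks.
        lambda d, key: d.get(key, default) if isinstance(d, dict) else default,
        param.split("."),
        tree,
    )


conf = {"mc": {"results": {"2022": "/tmp/res"}}}  # hypothetical config tree
assert cfg_lookup(conf, "mc.results.2022") == "/tmp/res"
assert cfg_lookup(conf, "mc.missing.key", default="n/a") == "n/a"
```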