From 9502f97d4ad211249d949de36583e5646447d514 Mon Sep 17 00:00:00 2001 From: Jochen Klein Date: Fri, 19 Jan 2024 17:12:24 +0100 Subject: [PATCH] Add new DB format for DF specification --- ...abase_ml_parameters_D0pp_jet_newformat.yml | 670 ++++++++++++++++++ ...abase_ml_parameters_LcToPKPi_newformat.yml | 444 ++++++++++++ machine_learning_hep/processer.py | 258 +++---- machine_learning_hep/utilities.py | 3 + 4 files changed, 1220 insertions(+), 155 deletions(-) create mode 100644 machine_learning_hep/data/data_run3/database_ml_parameters_D0pp_jet_newformat.yml create mode 100644 machine_learning_hep/data/data_run3/database_ml_parameters_LcToPKPi_newformat.yml mode change 100755 => 100644 machine_learning_hep/processer.py diff --git a/machine_learning_hep/data/data_run3/database_ml_parameters_D0pp_jet_newformat.yml b/machine_learning_hep/data/data_run3/database_ml_parameters_D0pp_jet_newformat.yml new file mode 100644 index 0000000000..d5bf0b85d8 --- /dev/null +++ b/machine_learning_hep/data/data_run3/database_ml_parameters_D0pp_jet_newformat.yml @@ -0,0 +1,670 @@ +############################################################################# +## © Copyright CERN 2018. All rights not expressly granted are reserved. ## +## Author: Gian.Michele.Innocenti@cern.ch ## +## This program is free software: you can redistribute it and/or modify it ## +## under the terms of the GNU General Public License as published by the ## +## Free Software Foundation, either version 3 of the License, or (at your ## +## option) any later version. This program is distributed in the hope that ## +## it will be useful, but WITHOUT ANY WARRANTY; without even the implied ## +## warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. ## +## See the GNU General Public License for more details. ## +## You should have received a copy of the GNU General Public License ## +## along with this program. if not, see <https://www.gnu.org/licenses/>. 
## +############################################################################# + +D0pp_jet: + nprongs: 2 + prongformultsub: [1,1] + doml: false + mass: 1.864 + sel_reco_unp: "fPt > 1." + sel_reco_singletrac_unp: null + sel_gen_unp: "fPt > 1. and fPt < 10." + sel_cen_unp: null + sel_good_evt_unp: "fIsEventReject == 0" + sel_reco_skim: ["fPtProng0>0.3 and fPtProng1>0.3 and abs(fImpactParameter0) < 0.5 and abs(fImpactParameter1) < 0.5 and fCpa > 0.6", + "fPtProng0>0.5 and fPtProng1>0.5 and abs(fImpactParameter0) < 0.5 and abs(fImpactParameter1) < 0.5 and fCpa > 0.6", + "fPtProng0>0.5 and fPtProng1>0.5 and abs(fImpactParameter0) < 0.5 and abs(fImpactParameter1) < 0.5 and fCpa > 0.6", + "fPtProng0>0.6 and fPtProng1>0.6 and abs(fImpactParameter0) < 0.5 and abs(fImpactParameter1) < 0.5 and fCpa > 0.6", + "fPtProng0>0.6 and fPtProng1>0.6 and abs(fImpactParameter0) < 0.5 and abs(fImpactParameter1) < 0.5 and fCpa > 0.6", + "fPtProng0>0.6 and fPtProng1>0.6 and abs(fImpactParameter0) < 0.5 and abs(fImpactParameter1) < 0.5 and fCpa > 0.6", + "fPtProng0>0.6 and fPtProng1>0.6 and abs(fImpactParameter0) < 0.5 and abs(fImpactParameter1) < 0.5 and fCpa > 0.6"] # sel_skim_binmin bins + sel_gen_skim: [null,null,null,null,null,null,null] # sel_skim_binmin bins + sel_skim_binmin: [1,2,4,6,8,12,24] # skimming pt bins (sel_skim_binmin bins) + sel_skim_binmax: [2,4,6,8,12,24,48] # skimming pt bins (sel_skim_binmin bins) + var_binning: fPt + dofullevtmerge: false + var_cand: fCandidateSelFlag + # var_swap: fIsCandidateSwapped + bitmap_sel: + var_name: fFlagMcMatchRec + var_name_gen: fFlagMcMatchGen + var_name_origgen: fOriginMcGen + var_name_origrec: fOriginMcRec + var_isstd: isstd + var_ismcsignal: ismcsignal + var_ismcprompt: ismcprompt + var_ismcfd: ismcfd + var_ismcbkg: ismcbkg + var_ismcrefl: ismcrefl + isstd : [[1],[]] + ismcsignal: [[0],[]] + ismcprompt: [[0],[]] + ismcfd: [[1],[]] + ismcbkg: [[],[1]] + ismcrefl: [[1],[1]] # probably missing from tree creator + + dfs: + read: + 
evtorig: + level: all + index: fIndexD0CollBases + trees: + O2hfd0collbase: [fNumContrib, fIsEventReject, fRunNumber] + + reco: + level: all + index: fIndexHfD0Bases_0 + trees: + O2hfd0base: [fIndexHfD0CollBases, fPt, fEta, fPhi, fM] + O2hfd0mc: [fFlagMcMatchRec, fOriginMcRec] + O2hfd0par: [fCpa, fCpaXY, fChi2PCA, + fDecayLength, fDecayLengthXY, fDecayLengthNormalised, fDecayLengthXYNormalised, + fImpactParameter0, fImpactParameter1, + fImpactParameterNormalised0, fImpactParameterNormalised1, + fPtProng0, fPtProng1, + fNSigTpcPi0, fNSigTpcKa0, fNSigTpcPi1, fNSigTpcKa1, + fNSigTofPi0, fNSigTofKa0, fNSigTofPi1, fNSigTofKa1] + O2hfd0pare: [fErrorDecayLength, fErrorDecayLengthXY, + fErrorImpactParameter0, fErrorImpactParameter1] + O2hfd0sel: [fCandidateSelFlag] + extra: + fY: 0. + filter: "fPt > 1." + gen: + level: gen + index: fIndexHfD0PBases_0 + trees: + O2hfd0pbase: [fPt, fEta, fPhi, fFlagMcMatchGen, fOriginMcGen] + + collgen: + level: gen + index: fIndexD0ChargedMCParticleLevelJetCollisionOutputs + trees: + O2d0mcpco: [fPosZ, fCentrality, fEventSel] + jetgen: + level: gen + index: fIndexD0ChargedMCParticleLevelJetOutputs + trees: + O2d0mcpo: [fIndexD0ChargedMCParticleLevelJetCollisionOutputs, fIndexHfD0PBases_0, + fJetPt, fJetPhi, fJetEta, fJetNConstituents] + jetsubgen: + level: gen + index: fIndexD0ChargedMCParticleLevelJetSubstructures + trees: + # O2d0mcpsso: [fIndexD0ChargedMCParticleLevelJetOutputs, fZg, fRg, fNsd] + O2d0mcpsso: [fIndexD0ChargedMCParticleLevelJetOutputs, fEnergyMother, fPtLeading, fPtSubLeading, fTheta] + + colldet: + level: det + index: fIndexD0ChargedMCDetectorLevelJetCollisionOutputs + trees: + O2d0mcdco: [fPosZ, fCentrality, fEventSel] + jetdet: + level: det + index: fIndexD0ChargedMCDetectorLevelJetOutputs + trees: + O2d0mcdo: [fIndexD0ChargedMCDetectorLevelJetCollisionOutputs, fIndexHfD0Bases_0, + fJetPt, fJetPhi, fJetEta, fJetNConstituents] + jetsubdet: + level: det + index: fIndexD0ChargedMCDetectorLevelJetSubstructures + trees: + # 
O2d0mcdsso: [fIndexD0ChargedMCDetectorLevelJetOutputs, fZg, fRg, fNsd] + O2d0mcdsso: [fIndexD0ChargedMCDetectorLevelJetOutputs, fEnergyMother, fPtLeading, fPtSubLeading, fTheta] + + colldata: + level: data + index: fIndexD0ChargedJetCollisionOutputs + trees: + O2d0mcdco: [fPosZ, fCentrality, fEventSel] + jetdata: + level: data + index: fIndexD0JetOutputs + trees: + O2d0mcdo: [fIndexD0ChargedJetCollisionOutputs, fIndexHfD0Bases_0, + fJetPt, fJetPhi, fJetEta, fJetNConstituents] + jetsubdata: + level: data + index: fIndexD0JetSubstructures + trees: + O2d0mcdsso: [fIndexD0ChargedJetOutputs, fZg, fRg, fNsd] + + merge: + - {base: jetgen, ref: collgen} + - {base: jetgen, ref: gen} + - {base: jetsubgen, ref: jetgen} + + - {base: jetdet, ref: colldet} + - {base: jetdet, ref: reco} + - {base: jetsubdet, ref: jetdet} + + - {base: jetdata, ref: colldata} + - {base: jetdata, ref: reco} + - {base: jetsubdata, ref: jetdata} + +# - {base: gen, ref: evtorig} + + write: + jetsubgen: + level: gen + file: AnalysisResultsGen.pkl.lz4 + jetsubdet: + level: det + file: AnalysisResultsReco.pkl.lz4 + jetsubdata: + level: data + file: AnalysisResultsReco.pkl.lz4 + evtorig: + level: all + file: AnalysisResultsEvtOrig.pkl.lz4 + evt: + level: all + source: evtorig + file: AnalysisResultsEvt.pkl.lz4 + filter: "fIsEventReject == 0" + gen: + level: mc + file: AnalysisResultsGen.pkl.lz4 + + variables: + var_training: [[fDecayLength, fDecayLengthXY, fDecayLengthNormalised, fDecayLengthXYNormalised, fCpa, fCpaXY, fImpactParameter0, fImpactParameter1, fErrorImpactParameter0, fErrorImpactParameter1, fNSigTpcPi0, fNSigTpcKa0, fNSigTofPi0, fNSigTofKa0, fNSigTpcPi1, fNSigTpcKa1, fNSigTofPi1, fNSigTofKa1], [fDecayLength, fDecayLengthXY, fDecayLengthNormalised, fDecayLengthXYNormalised, fCpa, fCpaXY, fImpactParameter0, fImpactParameter1, fErrorImpactParameter0, fErrorImpactParameter1, fNSigTpcPi0, fNSigTpcKa0, fNSigTofPi0, fNSigTofKa0, fNSigTpcPi1, fNSigTpcKa1, fNSigTofPi1, fNSigTofKa1], [fDecayLength, 
fDecayLengthXY, fDecayLengthNormalised, fDecayLengthXYNormalised, fCpa, fCpaXY, fImpactParameter0, fImpactParameter1, fErrorImpactParameter0, fErrorImpactParameter1, fNSigTpcPi0, fNSigTpcKa0, fNSigTofPi0, fNSigTofKa0, fNSigTpcPi1, fNSigTpcKa1, fNSigTofPi1, fNSigTofKa1], [fDecayLength, fDecayLengthXY, fDecayLengthNormalised, fDecayLengthXYNormalised, fCpa, fCpaXY, fImpactParameter0, fImpactParameter1, fErrorImpactParameter0, fErrorImpactParameter1, fNSigTpcPi0, fNSigTpcKa0, fNSigTofPi0, fNSigTofKa0, fNSigTpcPi1, fNSigTpcKa1, fNSigTofPi1, fNSigTofKa1], [fDecayLength, fDecayLengthXY, fDecayLengthNormalised, fDecayLengthXYNormalised, fCpa, fCpaXY, fImpactParameter0, fImpactParameter1, fErrorImpactParameter0, fErrorImpactParameter1, fNSigTpcPi0, fNSigTpcKa0, fNSigTofPi0, fNSigTofKa0, fNSigTpcPi1, fNSigTpcKa1, fNSigTofPi1, fNSigTofKa1], [fDecayLength, fDecayLengthXY, fDecayLengthNormalised, fDecayLengthXYNormalised, fCpa, fCpaXY, fImpactParameter0, fImpactParameter1, fErrorImpactParameter0, fErrorImpactParameter1, fNSigTpcPi0, fNSigTpcKa0, fNSigTofPi0, fNSigTofKa0, fNSigTpcPi1, fNSigTpcKa1, fNSigTofPi1, fNSigTofKa1], [fDecayLength, fDecayLengthXY, fDecayLengthNormalised, fDecayLengthXYNormalised, fCpa, fCpaXY, fImpactParameter0, fImpactParameter1, fErrorImpactParameter0, fErrorImpactParameter1, fNSigTpcPi0, fNSigTpcKa0, fNSigTofPi0, fNSigTofKa0, fNSigTpcPi1, fNSigTpcKa1, fNSigTofPi1, fNSigTofKa1]] + #TODO: add new variables for dca, max_norm_d0d0exp + # sel_skim_binmin bins + var_boundaries: [fCosThetaStar, fPtProng] + var_correlation: + - [fCosThetaStar] # TODO: update + - [fPtProng0] + var_signal: signal + var_inv_mass: fM + var_y: fY + var_evt_sel: fIsEventReject + var_cuts: + - [fPtProng0, lt, null] + - [fPtProng1, lt, null] + + plot_options: + prob_cut_scan: + fPtProng0: + xlim: + - 0 + - 1 + fPtProng1: + xlim: + - 0 + - 1 + eff_cut_scan: + fPtProng0: + xlim: + - 0 + - 1 + fPtProng1: + xlim: + - 0 + - 1 + + files_names: + namefile_unmerged_tree: AO2D.root + 
namefile_reco: AnalysisResultsReco.pkl.lz4 + namefile_evt: AnalysisResultsEvt.pkl.lz4 + namefile_evtvalroot: AnalysisResultsROOTEvtVal.root + namefile_evtorig: AnalysisResultsEvtOrig.pkl.lz4 + namefile_gen: AnalysisResultsGen.pkl.lz4 + namefile_reco_applieddata: AnalysisResultsRecoAppliedData.pkl.lz4 + namefile_reco_appliedmc: AnalysisResultsRecoAppliedMC.pkl.lz4 + namefile_mcweights: mcweights.root + histofilename: "masshisto.root" + efffilename: "effhisto.root" + respfilename: "resphisto.root" + crossfilename: "cross_section_tot.root" + + multi: + data: + nprocessesparallel: 20 + maxfiles: [-1] #list of periods + chunksizeunp: [100] #list of periods + chunksizeskim: [100] #list of periods + fracmerge: [0.08] #list of periods + seedmerge: [12] #list of periods + period: [LHC22o] #list of periods + select_period: [1] + prefix_dir: /data2/MLhep/real/train_131050/ + unmerged_tree_dir: [alice/cern.ch/user/a/alihyperloop/jobs/0024/] #list of periods + pkl: [d0jet/pkl] #list of periods + pkl_skimmed: [d0jet/pklsk] #list of periods + pkl_skimmed_merge_for_ml: [d0jet/pklskml] #list of periods + pkl_skimmed_merge_for_ml_all: d0jet/pp_data_mltot + pkl_evtcounter_all: d0jet/pp_data_evttot + mcreweights: [../Analyses] #list of periods + mc: + nprocessesparallel: 20 + maxfiles: [-1] #list of periods + chunksizeunp: [100] #list of periods + chunksizeskim: [1000] #list of periods + fracmerge: [1.0] #list of periods + seedmerge: [12] #list of periods + period: [mctest_nima] #list of periods + select_period: [1] + # /data2/MLhep/sim/test_jets/2024-xx-xx/local/AOD/001/AO2D.root + prefix_dir: /data2/MLhep/sim/test_jets/ + unmerged_tree_dir: ['2024-01-25'] + pkl: [d0jet/pkl] #list of periods + pkl_skimmed: [d0jet/pklsk] #list of periods + pkl_skimmed_merge_for_ml: [d0jet/pklskml] #list of periods + pkl_skimmed_merge_for_ml_all: d0jet/pp_mc_prod_mltot + pkl_evtcounter_all: d0jet/pp_mc_prod_evttot + mcreweights: [../Analyses] #list of periods + + ml: + evtsel: fIsEventReject == 0 + 
triggersel: + data: null + mc: null + + nbkg: 500000 + nsig: 500000 + equalise_sig_bkg: True + sampletagforsignal: 1 + sampletagforbkg: 0 + sel_sigml: ismcprompt == 1 + sel_bkgml: fM<1.8 or fM>1.92 + nkfolds: 5 + rnd_shuffle: 12 + rnd_splt: 12 + test_frac: 0.2 + binmin: [1,2,4,6,8,12,24] # must be equal to sel_skim_binmin (sel_skim_binmin bins) + binmax: [2,4,6,8,12,24,48] # must be equal to sel_skim_binmax (sel_skim_binmin bins) + mltype: BinaryClassification + ncorescrossval: 10 + prefix_dir_ml: /data2/jklein/MLhep + mlplot: mlplot + mlout: mlout + + opt: + isFONLLfromROOT: true + filename_fonll: 'data/fonll/D0DplusDstarPredictions_13TeV_y05_all_300416_BDShapeCorrected.root' # file with FONLL predictions + fonll_particle: 'hD0Kpi' + fonll_pred: 'max' # edge of the FONLL prediction + FF: 0.6086 # fragmentation fraction + sigma_MB: 57.8e-3 # Minimum Bias cross section (pp) 50.87e-3 [b], 1 for Pb-Pb + Taa: 1 # 23260 [b^-1] in 0-10% Pb-Pb, 3917 [b^-1] in 30-50% Pb-Pb, 1 for pp + BR: 3.95e-2 # branching ratio of the decay D0 -> K- pi+ + f_prompt: 0.9 # estimated fraction of prompt candidates + bkg_data_fraction: 0.1 # fraction of real data used in the estimation + num_steps: 111 # number of steps used in efficiency and signif. estimation + bkg_function: pol2 # fit function for bkg (among TH1 predefined fit functions, e.g. expo, pol1, pol2, ...) 
+ save_fit: True # save bkg fits with the various cuts on ML output + raahp: [1,1,1,1,1,1,1] # sel_skim_binmin bins + presel_gen_eff: "abs(fY) < 0.5 and abs(fPosZ) < 10" + + mlapplication: + data: + prefix_dir_res: /data2/jklein/ + pkl_skimmed_dec: [LHC22pp/MLapplication/prod_LHC22o/skpkldecdata] #list of periods + pkl_skimmed_decmerged: [LHC22pp/MLapplication/prod_LHC22o/skpkldecdatamerged] #list of periods + mc: + prefix_dir_res: /data2/jklein/ + pkl_skimmed_dec: [LHC22pp_mc/MLapplication/prod_LHC22b1b/skpkldecmc] #list of periods + pkl_skimmed_decmerged: [LHC22pp_mc/MLapplication/prod_LHC22b1b/skpkldecmcmerged] #list of periods + modelname: xgboost + modelsperptbin: [xgboost_classifierD0pp_FF_dfselection_pt_cand_1.0_2.0.sav, + xgboost_classifierD0pp_FF_dfselection_pt_cand_2.0_4.0.sav, + xgboost_classifierD0pp_FF_dfselection_pt_cand_4.0_6.0.sav, + xgboost_classifierD0pp_FF_dfselection_pt_cand_6.0_8.0.sav, + xgboost_classifierD0pp_FF_dfselection_pt_cand_8.0_12.0.sav, + xgboost_classifierD0pp_FF_dfselection_pt_cand_12.0_24.0.sav, + xgboost_classifierD0pp_FF_dfselection_pt_cand_24.0_48.0.sav] # sel_skim_binmin bins + probcutpresel: + data: [0.75,0.75,0.65,0.65,0.45,0.45,0.45] # sel_skim_binmin bins + mc: [0.75,0.75,0.65,0.65,0.45,0.45,0.45] # sel_skim_binmin bins + probcutoptimal: [0.92,0.90,0.82,0.80,0.60,0.60,0.60] # sel_skim_binmin bins + + analysis: + indexhptspectrum: 0 #kD0Kpi=0, kDplusKpipi=1, kDstarD0pi=2, kDsKKpi=3, kLctopKpi=4, kLcK0Sp=5 + fd_method: 2 #knone=0, kfc=1, kNb=2 + cctype: 1 #kpp7 + sigmav0: 57.8e-3 #NB: multiplied by 1e12 before giving to HFPtSpectrum! 
+ inputfonllpred: data/fonll/D0DplusDstarPredictions_13TeV_y05_all_300416_BDShapeCorrected.root + dir_general_plots: /data2/jklein/data/analysis_plots + + jet_zg: &jet_default + proc_type: Jets + useperiod: [1] #list of periods + usejetptbinned_deff: false + doeff_resp: true #efficiency correction for the response matrix + unmatched_gen: true + latexnamehadron: "D^{0}" + latexnamedecay: "K^{#minus} #pi^{#plus}" + var_binning2: pt_jet + var_binning2_gen: pt_gen_jet + latexbin2var: "#it{p}_{T}^{jet ch}" + sel_binmin2_reco: [7.0,15.0,30.0] # rec jet pt bins (sel_binmin2_reco bins) + sel_binmax2_reco: [15.0,30.0,50.0] # rec jet pt bins (sel_binmin2_reco bins) + sel_binmin2_gen: [7.0,15.0,30.0] # gen jet pt bins (sel_binmin2_gen bins) + sel_binmax2_gen: [15.0,30.0,50.0] # gen jet pt bins (sel_binmin2_gen bins) + var_binningshape: zg_jet + var_binningshape_gen: zg_gen_jet + var_shape_latex: "#it{z}_{g}" + sel_binminshape_reco: [-0.1,0.1,0.2,0.3,0.4] + sel_binmaxshape_reco: [0.1,0.2,0.3,0.4,0.5] + sel_binminshape_gen: [-0.1,0.1,0.2,0.3,0.4] + sel_binmaxshape_gen: [0.1,0.2,0.3,0.4,0.5] + sel_closure_frac: 0.2 + triggerbit: INT7 + sel_an_binmin: [5,6,7,8,10,12,16,24] # hadron pt bins (sel_an_binmin bins) + sel_an_binmax: [6,7,8,10,12,16,24,36] # hadron pt bins (sel_an_binmin bins) + binning_matching: [2,3,3, 4, 4, 5, 5, 6] # mapping to skimming pt bins (sel_an_binmin bins) + #jetsel_gen: "abs(y_cand) < 0.8 and abs(z_vtx_gen) < 10 and abs(eta_jet) < 0.5" + #jetsel_sim: "abs(y_cand) < 0.8 and abs(eta_jet) < 0.5" # jet selection in simulations + #jetsel_reco: "abs(y_cand) < 0.8 and abs(z_vtx_reco) < 10 and abs(eta_jet) < 0.5" + #jetsel_gen_matched_reco: "abs(eta_gen_jet) < 5.0" + jetsel_gen: "abs(y_cand) < 0.5 and abs(z_vtx_gen) < 10 and abs(eta_jet) < 0.5" + jetsel_sim: "abs(y_cand) < 0.5 and abs(eta_jet) < 0.5" # jet selection in simulations + jetsel_reco: "abs(y_cand) < 0.5 and abs(z_vtx_reco) < 10 and abs(eta_jet) < 0.5" + jetsel_gen_matched_reco: "abs(y_cand) < 0.5 and 
abs(z_vtx_gen) < 10 and abs(eta_gen_jet) < 0.5" + evtsel: fIsEventReject==0 + triggersel: + data: "trigger_hasbit_INT7==1" + mc: null + data: &data_out_default + runselection: [null] #FIXME + results: [/data2/jklein/data/test/d0jet/resultsMBjetvspt] #list of periods + resultsallp: /data2/jklein/data/test/d0jet/resultsMBjetvspt_all + mc: &mc_out_default + runselection: [null] #FIXME + results: [/data2/jklein/data/mctest/d0jet/resultsMBjetvspt] #list of periods + resultsallp: /data2/jklein/data/mctest/d0jet/resultsMBjetvspt_all + data_proc: # alternative processor output used as the analyzer input + <<: *data_out_default + mc_proc: # alternative processor output used as the analyzer input + <<: *mc_out_default + + mass_fit_lim: [1.5, 2.2] # histogram range of the invariant mass distribution [GeV/c^2] + bin_width: 0.001 # bin width of the invariant mass histogram + + # simple fitter START + sgnfunc: [0,0,0,0,0,0,0,0,0,0,0,0] # kGaus=0, k2Gaus=1, k2GausSigmaRatioPar=2 (sel_an_binmin bins) + bkgfunc: [0,0,0,0,0,0,0,0,0,0,0,0] # kExpo=0, kLin=1, kPol2=2, kNoBk=3, kPow=4, kPowEx=5 (sel_an_binmin bins) + masspeak: 1.864 + massmin: [1.66,1.66,1.66,1.66,1.66,1.66,1.66,1.66,1.66,1.66,1.66,1.66] # sel_an_binmin bins, fit region of the invariant mass distribution [GeV/c^2] + massmax: [2.06,2.06,2.06,2.06,2.06,2.06,2.06,2.06,2.06,2.06,2.06,2.06] # sel_an_binmin bins, fit region of the invariant mass distribution [GeV/c^2] + rebin: [6,6,6,6,6,6,6,6,6,6,6,6] # sel_an_binmin bins + fix_mean: [false, false, false, false, false, false, false, false, false, false, false, false] # sel_an_binmin bins + masspeaksec: 1.864 + + # If SetArraySigma true: sigma_initial is taken from sigmaarray; false: sigma_initial is taken from MC + # If SetFixGaussianSigma true: sigma fixed to sigma_initial + # SetFixGaussianSigma: [false, false, false, false, false, false, false, false, false, false, false, false] # sel_an_binmin bins + SetFixGaussianSigma: [true, true, true, true, true, true, true, true, 
true, true, true, true] # sel_an_binmin bins + SetArraySigma: [false, false, false, false, false, false, false, false, false, false, false, false] # sel_an_binmin bins + sigmaarray: [0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01] # initial sigma (sel_an_binmin bins) + + fix_sigmasec: [true, true, true, true, true, true, true, true, true, true, true, true] # sel_an_binmin bins + sigmaarraysec: [0.007497,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01] # sel_an_binmin bins + use_reflections: true + # simple fitter END + + signal_sigma: 2.0 + sigma_scale: 0.9545 + sideband_sigma_1_left: 4 + sideband_sigma_2_left: 9 + sideband_sigma_1_right: 4 + sideband_sigma_2_right: 9 + sidebandleftonly: false + + niterunfolding: 15 + niterunfoldingchosen: 4 + + branching_ratio: 3.95e-2 + xsection_inel: 57.8 # (mb) cross-section of minimum-bias events + + doprior: false + domodeldep: false + path_modeldep: /home/nzardosh/PYTHIA_Sim/PYTHIA8_Simulations/Plots/D0_Substructure_Simulations_Output.root + + powheg_path_nonprompt: /data/POWHEG/trees_powheg_fd_central.root + + powheg_path_prompt: /data/POWHEG/trees_powheg_pr_central.root + powheg_prompt_variations_path: /data/POWHEG/trees_powheg_pr_ + powheg_prompt_variations: ["F1_R05","F05_R1","F2_R1","F1_R2","F2_R2","F05_R05","Mhigh","Mlow"] + + pythia8_prompt_variations_path: /data/PYTHIA8/trees_pythia8_pr_ + pythia8_prompt_variations: ["default", "charm_lo"] #["default","colour0soft"] + pythia8_prompt_variations_legend: ["PYTHIA 8 (Monash)", "PYTHIA 8 charm LO"] # ["PYTHIA 8 (Monash)","PYTHIA 8 SoftQCD, mode 0"] + + variations_db: ./data/data_prod_20210223/database_variations_D0pp_jet_zg.yml + + # Additional cuts applied before mass histogram is filled + use_cuts: False + cuts: # (sel_an_binmin bins) + - Null + - Null + - Null + - Null + - Null + - Null + - Null + - Null + + jet_qa: + <<: *jet_default + + jet_rg: + <<: *jet_default + var_binningshape: rg_jet + var_binningshape_gen: rg_gen_jet + var_shape_latex: 
"#it{R}_{g}" + sel_binminshape_reco: [-0.1,0.0,0.1,0.2,0.3] + sel_binmaxshape_reco: [0.0,0.1,0.2,0.3,0.4] + sel_binminshape_gen: [-0.1,0.0,0.1,0.2,0.3] + sel_binmaxshape_gen: [0.0,0.1,0.2,0.3,0.4] + data: &data_out_rg + <<: *data_out_default + results: [/data/DerivedResultsJets/D0kAnywithJets_Vit/vAN-20210223_ROOT6-1/rg/default/default/pp_2016_data/593_20210223-2051/resultsMBjetvspt, + /data/DerivedResultsJets/D0kAnywithJets_Vit/vAN-20210223_ROOT6-1/rg/default/default/pp_2017_data/593_20210223-2051/resultsMBjetvspt, + /data/DerivedResultsJets/D0kAnywithJets_Vit/vAN-20210223_ROOT6-1/rg/default/default/pp_2018_data/593_20210223-2051/resultsMBjetvspt] #list of periods + resultsallp: /data/DerivedResultsJets/D0kAnywithJets_Vit/vAN-20210223_ROOT6-1/rg/default/default/pp_data/resultsMBjetvspt + mc: &mc_out_rg + <<: *mc_out_default + results: [/data/DerivedResultsJets/D0kAnywithJets_Vit/vAN-20210223_ROOT6-1/rg/default/default/pp_2016_mc_prodD2H/594_20210301-1015/resultsMBjetvspt, + /data/DerivedResultsJets/D0kAnywithJets_Vit/vAN-20210223_ROOT6-1/rg/default/default/pp_2017_mc_prodD2H/594_20210301-1015/resultsMBjetvspt, + /data/DerivedResultsJets/D0kAnywithJets_Vit/vAN-20210223_ROOT6-1/rg/default/default/pp_2018_mc_prodD2H/594_20210301-1015/resultsMBjetvspt] #list of periods + resultsallp: /data/DerivedResultsJets/D0kAnywithJets_Vit/vAN-20210223_ROOT6-1/rg/default/default/pp_mc_prodD2H/resultsMBjetvspt + data_proc: # alternative processor output used as the analyzer input + <<: *data_out_rg + mc_proc: # alternative processor output used as the analyzer input + <<: *mc_out_rg + variations_db: ./data/data_prod_20210223/database_variations_D0pp_jet_rg.yml + + jet_nsd: + <<: *jet_default + var_binningshape: nsd_jet + var_binningshape_gen: nsd_gen_jet + var_shape_latex: "#it{n}_{SD}" + sel_binminshape_reco: [-0.5, 0.5, 1.5, 2.5, 3.5] + sel_binmaxshape_reco: [0.5, 1.5, 2.5, 3.5, 4.5] + sel_binminshape_gen: [-0.5, 0.5, 1.5, 2.5, 3.5] + sel_binmaxshape_gen: [0.5, 1.5, 2.5, 3.5, 
4.5] + data: &data_out_nsd + <<: *data_out_default + results: [/data/DerivedResultsJets/D0kAnywithJets_Vit/vAN-20210223_ROOT6-1/nsd/default/default/pp_2016_data/593_20210223-2051/resultsMBjetvspt, + /data/DerivedResultsJets/D0kAnywithJets_Vit/vAN-20210223_ROOT6-1/nsd/default/default/pp_2017_data/593_20210223-2051/resultsMBjetvspt, + /data/DerivedResultsJets/D0kAnywithJets_Vit/vAN-20210223_ROOT6-1/nsd/default/default/pp_2018_data/593_20210223-2051/resultsMBjetvspt] #list of periods + resultsallp: /data/DerivedResultsJets/D0kAnywithJets_Vit/vAN-20210223_ROOT6-1/nsd/default/default/pp_data/resultsMBjetvspt + mc: &mc_out_nsd + <<: *mc_out_default + results: [/data/DerivedResultsJets/D0kAnywithJets_Vit/vAN-20210223_ROOT6-1/nsd/default/default/pp_2016_mc_prodD2H/594_20210301-1015/resultsMBjetvspt, + /data/DerivedResultsJets/D0kAnywithJets_Vit/vAN-20210223_ROOT6-1/nsd/default/default/pp_2017_mc_prodD2H/594_20210301-1015/resultsMBjetvspt, + /data/DerivedResultsJets/D0kAnywithJets_Vit/vAN-20210223_ROOT6-1/nsd/default/default/pp_2018_mc_prodD2H/594_20210301-1015/resultsMBjetvspt] #list of periods + resultsallp: /data/DerivedResultsJets/D0kAnywithJets_Vit/vAN-20210223_ROOT6-1/nsd/default/default/pp_mc_prodD2H/resultsMBjetvspt + data_proc: # alternative processor output used as the analyzer input + <<: *data_out_nsd + mc_proc: # alternative processor output used as the analyzer input + <<: *mc_out_nsd + variations_db: ./data/data_prod_20210223/database_variations_D0pp_jet_nsd.yml + + jet_FF: + <<: *jet_default + + pythia8_prompt_variations: ["default","colour2soft"] + pythia8_prompt_variations_legend: ["PYTHIA 8 (Monash)","PYTHIA 8 SoftQCD, mode 2"] + + sel_binmin2_reco: [5.0,7.0,10.0,15.0] # rec jet pt bins (sel_binmin2_reco bins) + sel_binmax2_reco: [7.0,10.0,15.0,50.0] # rec jet pt bins (sel_binmin2_reco bins) + sel_binmin2_gen: [5.0,7.0,10.0,15.0] # gen jet pt bins (sel_binmin2_gen bins) + sel_binmax2_gen: [7.0,10.0,15.0,50.0] # gen jet pt bins (sel_binmin2_gen bins) + 
var_binningshape: z + var_binningshape_gen: z_gen + var_shape_latex: "#it{z}_{#parallel}^{ch}" + sel_binminshape_reco: [0.2001,0.4001,0.5001,0.6001,0.7001,0.8001,0.9001] + sel_binmaxshape_reco: [0.4001,0.5001,0.6001,0.7001,0.8001,0.9001,1.0001] + sel_binminshape_gen: [0.2001,0.4001,0.5001,0.6001,0.7001,0.8001,0.9001] + sel_binmaxshape_gen: [0.4001,0.5001,0.6001,0.7001,0.8001,0.9001,1.0001] + data: &data_out_ff + <<: *data_out_default + results: [/data/DerivedResultsJets/D0kAnywithJets_Vit/vAN-20210223_ROOT6-1/ff/default/default/pp_2016_data/593_20210223-2051/resultsMBjetvspt, + /data/DerivedResultsJets/D0kAnywithJets_Vit/vAN-20210223_ROOT6-1/ff/default/default/pp_2017_data/593_20210223-2051/resultsMBjetvspt, + /data/DerivedResultsJets/D0kAnywithJets_Vit/vAN-20210223_ROOT6-1/ff/default/default/pp_2018_data/593_20210223-2051/resultsMBjetvspt] #list of periods + resultsallp: /data/DerivedResultsJets/D0kAnywithJets_Vit/vAN-20210223_ROOT6-1/ff/default/default/pp_data/resultsMBjetvspt + mc: &mc_out_ff + <<: *mc_out_default + results: [/data/DerivedResultsJets/D0kAnywithJets_Vit/vAN-20210223_ROOT6-1/ff/default/default/pp_2016_mc_prodD2H/594_20210301-1015/resultsMBjetvspt, + /data/DerivedResultsJets/D0kAnywithJets_Vit/vAN-20210223_ROOT6-1/ff/default/default/pp_2017_mc_prodD2H/594_20210301-1015/resultsMBjetvspt, + /data/DerivedResultsJets/D0kAnywithJets_Vit/vAN-20210223_ROOT6-1/ff/default/default/pp_2018_mc_prodD2H/594_20210301-1015/resultsMBjetvspt] #list of periods + resultsallp: /data/DerivedResultsJets/D0kAnywithJets_Vit/vAN-20210223_ROOT6-1/ff/default/default/pp_mc_prodD2H/resultsMBjetvspt + data_proc: # alternative processor output used as the analyzer input + <<: *data_out_ff + mc_proc: # alternative processor output used as the analyzer input + <<: *mc_out_ff + variations_db: ./data/data_prod_20210223/database_variations_D0pp_jet_FF.yml + + jet_r_shape_2_6: + <<: *jet_default + lc_d0_ratio: true #make ratio of D0 and corresponding Lc analysis + xsec: false + 
use_inclusive_systematics: true + + pythia8_prompt_variations: ["default","colour2soft"] + pythia8_prompt_variations_legend: ["PYTHIA 8 (Monash)","PYTHIA 8 SoftQCD, mode 2"] + + var_binningshape: delta_r_jet + var_binningshape_gen: delta_r_gen_jet + var_shape_latex: "#it{r}" + sel_binminshape_reco: [0., 0.05, 0.1] + sel_binmaxshape_reco: [0.05, 0.1, 0.2] + sel_binminshape_gen: [0., 0.05, 0.1] + sel_binmaxshape_gen: [0.05, 0.1, 0.2] + sel_an_binmin: [2,3,4,5] # hadron pt bins (sel_an_binmin bins) + sel_an_binmax: [3,4,5,6] # hadron pt bins (sel_an_binmin bins) + binning_matching: [1,1,2,2] # mapping to skimming pt bins (sel_an_binmin bins) + sel_binmin2_reco: [3.0, 5.0, 7.0, 15.0] # rec jet pt bins (sel_binmin2_reco bins) + sel_binmax2_reco: [5.0, 7.0, 15.0, 30.0] # rec jet pt bins (sel_binmin2_reco bins) + sel_binmin2_gen: [3.0, 5.0, 7.0, 15.0] # gen jet pt bins (sel_binmin2_gen bins) + sel_binmax2_gen: [5.0, 7.0, 15.0, 30.0] # gen jet pt bins (sel_binmin2_gen bins) + cuts: + - Null + - Null + - Null + - Null + data: &data_out_r_shape_2_6 + <<: *data_out_default + results: [/data/Derived_testResults/Jets/D0kAnywithJets_Vit/2_6/vAN-20210223_ROOT6-1/r_shape/default/default/pp_2016_data/593_20210223-2051/resultsMBjetvspt, + /data/Derived_testResults/Jets/D0kAnywithJets_Vit/2_6/vAN-20210223_ROOT6-1/r_shape/default/default/pp_2017_data/593_20210223-2051/resultsMBjetvspt, + /data/Derived_testResults/Jets/D0kAnywithJets_Vit/2_6/vAN-20210223_ROOT6-1/r_shape/default/default/pp_2018_data/593_20210223-2051/resultsMBjetvspt] #list of periods + resultsallp: /data/Derived_testResults/Jets/D0kAnywithJets_Vit/2_6/vAN-20210223_ROOT6-1/r_shape/default/default/pp_data/resultsMBjetvspt + + resultslc: /data/Derived_testResults/Jets/Lc/2_6/vAN-20210223_ROOT6-1/r_shape/default/default/pp_data/resultsMBjetvspt + + mc: &mc_out_r_shape_2_6 + <<: *mc_out_default + results: 
[/data/Derived_testResults/Jets/D0kAnywithJets_Vit/2_6/vAN-20210223_ROOT6-1/r_shape/default/default/pp_2016_mc_prodD2H/594_20210301-1015/resultsMBjetvspt, + /data/Derived_testResults/Jets/D0kAnywithJets_Vit/2_6/vAN-20210223_ROOT6-1/r_shape/default/default/pp_2017_mc_prodD2H/594_20210301-1015/resultsMBjetvspt, + /data/Derived_testResults/Jets/D0kAnywithJets_Vit/2_6/vAN-20210223_ROOT6-1/r_shape/default/default/pp_2018_mc_prodD2H/594_20210301-1015/resultsMBjetvspt] #list of periods + resultsallp: /data/Derived_testResults/Jets/D0kAnywithJets_Vit/2_6/vAN-20210223_ROOT6-1/r_shape/default/default/pp_mc_prodD2H/resultsMBjetvspt + resultsinclusive: /data/Derived_testResults/Jets/D0kAnywithJets_Vit/vAN-20210223_ROOT6-1/r_shape_fullpt/default/default/pp_mc_prodD2H/resultsMBjetvspt + data_proc: # alternative processor output used as the analyzer input + <<: *data_out_r_shape_2_6 + mc_proc: # alternative processor output used as the analyzer input + <<: *mc_out_r_shape_2_6 + variations_db: ./data/data_prod_20210223/database_variations_D0pp_jet_r_shape_2_6.yml + + jet_r_shape_6_12: + <<: *jet_default + lc_d0_ratio: true + xsec: false + use_inclusive_systematics: true + + pythia8_prompt_variations: ["default","colour2soft"] + pythia8_prompt_variations_legend: ["PYTHIA 8 (Monash)","PYTHIA 8 SoftQCD, mode 2"] + + var_binningshape: delta_r_jet + var_binningshape_gen: delta_r_gen_jet + var_shape_latex: "#it{r}" + sel_binminshape_reco: [0., 0.05, 0.1] + sel_binmaxshape_reco: [0.05, 0.1, 0.2] + sel_binminshape_gen: [0., 0.05, 0.1] + sel_binmaxshape_gen: [0.05, 0.1, 0.2] + + sel_an_binmin: [6,7,8,10] # hadron pt bins (sel_an_binmin bins) + sel_an_binmax: [7,8,10,12] # hadron pt bins (sel_an_binmin bins) + binning_matching: [3,3,4, 4] # mapping to skimming pt bins (sel_an_binmin bins) + sel_binmin2_reco: [5.0, 7.0, 15.0] # rec jet pt bins (sel_binmin2_reco bins) + sel_binmax2_reco: [7.0, 15.0, 30.0] # rec jet pt bins (sel_binmin2_reco bins) + sel_binmin2_gen: [5.0, 7.0, 15.0] # gen jet 
pt bins (sel_binmin2_gen bins) + sel_binmax2_gen: [7.0, 15.0, 30.0] # gen jet pt bins (sel_binmin2_gen bins) + cuts: + - Null + - Null + - Null + - Null + + data: &data_out_r_shape_6_12 + <<: *data_out_default + results: [/data/Derived_testResults/Jets/D0kAnywithJets_Vit/6_12/vAN-20210223_ROOT6-1/r_shape/default/default/pp_2016_data/593_20210223-2051/resultsMBjetvspt, + /data/Derived_testResults/Jets/D0kAnywithJets_Vit/6_12/vAN-20210223_ROOT6-1/r_shape/default/default/pp_2017_data/593_20210223-2051/resultsMBjetvspt, + /data/Derived_testResults/Jets/D0kAnywithJets_Vit/6_12/vAN-20210223_ROOT6-1/r_shape/default/default/pp_2018_data/593_20210223-2051/resultsMBjetvspt] #list of periods + resultsallp: /data/Derived_testResults/Jets/D0kAnywithJets_Vit/6_12/vAN-20210223_ROOT6-1/r_shape/default/default/pp_data/resultsMBjetvspt + + resultslc: /data/Derived_testResults/Jets/Lc/6_12/vAN-20210223_ROOT6-1/r_shape/default/default/pp_data/resultsMBjetvspt + + mc: &mc_out_r_shape_6_12 + <<: *mc_out_default + results: [/data/Derived_testResults/Jets/D0kAnywithJets_Vit/6_12/vAN-20210223_ROOT6-1/r_shape/default/default/pp_2016_mc_prodD2H/594_20210301-1015/resultsMBjetvspt, + /data/Derived_testResults/Jets/D0kAnywithJets_Vit/6_12/vAN-20210223_ROOT6-1/r_shape/default/default/pp_2017_mc_prodD2H/594_20210301-1015/resultsMBjetvspt, + /data/Derived_testResults/Jets/D0kAnywithJets_Vit/6_12/vAN-20210223_ROOT6-1/r_shape/default/default/pp_2018_mc_prodD2H/594_20210301-1015/resultsMBjetvspt] #list of periods + resultsallp: /data/Derived_testResults/Jets/D0kAnywithJets_Vit/6_12/vAN-20210223_ROOT6-1/r_shape/default/default/pp_mc_prodD2H/resultsMBjetvspt + resultsinclusive: /data/Derived_testResults/Jets/D0kAnywithJets_Vit/vAN-20210223_ROOT6-1/r_shape_fullpt/default/default/pp_mc_prodD2H/resultsMBjetvspt + data_proc: # alternative processor output used as the analyzer input + <<: *data_out_r_shape_6_12 + mc_proc: # alternative processor output used as the analyzer input + <<: *mc_out_r_shape_6_12 + 
variations_db: ./data/data_prod_20210223/database_variations_D0pp_jet_r_shape_6_12.yml + + + systematics: + probvariation: + useperiod: [0,0,1] #period from where to define prob cuts + ncutvar: 10 #number of looser and tighter variations + maxperccutvar: 0.25 #max diff in efficiency for loosest/tightest var + cutvarminrange: [0.80, 0.80, 0.6, 0.3, 0.3] #Min starting point for scan + cutvarmaxrange: [0.98, 0.95, 0.95, 0.95, 0.95] #Max starting point for scan + fixedmean: True #Fix mean cutvar histo to central fit + fixedsigma: True #Fix sigma cutvar histo to central fit + mcptshape: + #FONLL / generated LHC19h4c1 + weights: [1.000000] + #From SetPtWeightsFromFONLL13overLHC17c3a12 in AliPhysics + #weights: [1.429770] + weights_min_pt: 0 + weights_max_pt: 40 + weights_bins: 400 diff --git a/machine_learning_hep/data/data_run3/database_ml_parameters_LcToPKPi_newformat.yml b/machine_learning_hep/data/data_run3/database_ml_parameters_LcToPKPi_newformat.yml new file mode 100644 index 0000000000..48648bbba5 --- /dev/null +++ b/machine_learning_hep/data/data_run3/database_ml_parameters_LcToPKPi_newformat.yml @@ -0,0 +1,444 @@ +############################################################################# +## © Copyright CERN 2023. All rights not expressly granted are reserved. ## +## Author: Gian.Michele.Innocenti@cern.ch ## +## This program is free software: you can redistribute it and/or modify it ## +## under the terms of the GNU General Public License as published by the ## +## Free Software Foundation, either version 3 of the License, or (at your ## +## option) any later version. This program is distributed in the hope that ## +## it will be useful, but WITHOUT ANY WARRANTY; without even the implied ## +## warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. ## +## See the GNU General Public License for more details. ## +## You should have received a copy of the GNU General Public License ## +## along with this program. If not, see <https://www.gnu.org/licenses/>. 
## +############################################################################# + +LcpKpi: + nprongs: 3 + prongformultsub: [0,0,0] + doml: true + mass: 2.286 + sel_reco_unp: "fPt>0" + sel_gen_unp: "fPt>0" + sel_cen_unp: null + sel_good_evt_unp: "fIsEventReject == 0" + sel_reco_skim: [null,null,null,null,null,null] + sel_gen_skim: [null,null,null,null,null,null] + sel_skim_binmin: [1,2,4,6,8,12] #list of nbins + sel_skim_binmax: [2,4,6,8,12,24] #list of nbins + apply_yptacccut: false + var_binning: fPt + dofullevtmerge: false + var_cand: fCandidateSelFlag + var_swap: fIsCandidateSwapped + bitmap_sel: + var_name: fFlagMc + var_name_gen: fFlagMc + var_name_origgen: fOriginMcGen + var_name_origrec: fOriginMcRec + var_isstd: isstd + var_ismcsignal: ismcsignal + var_ismcprompt: ismcprompt + var_ismcfd: ismcfd + var_ismcbkg: ismcbkg + var_ismcrefl: ismcref + isstd : [[1],[]] + ismcsignal: [[1],[]] + ismcprompt: [[0],[]] + ismcfd: [[1],[]] + ismcbkg: [[],[1]] + ismcrefl: [[1],[1]] + + dfs: + read: + evtorig: + level: data + trees: + O2hfcandlcfullev: [fIndexCollisions, fIsEventReject, fNumContrib, fMultZeqNTracksPV, fMultZeqFT0A, fMultZeqFT0C, fMultFT0M, fMultZeqFV0A] + evtorigmc: + level: mc + trees: + O2hfcandlcfullev: [fIndexCollisions, fIndexMcCollisions, fIsEventReject, fNumContrib, fMultZeqNTracksPV, fMultZeqFT0A, fMultZeqFT0C, fMultFT0M, fMultZeqFV0A] + reco: + level: all + trees: + O2hfcandlclite: [fPosX, fPosY, fPosZ, fFlagMc, fCandidateSelFlag, fOriginMcRec, fIsCandidateSwapped, fY, fEta, fPt, fCpa, fCpaXY, fM, + fChi2PCA, fDecayLength, fDecayLengthXY, fPtProng0, fPtProng1, fPtProng2, fImpactParameter0, fImpactParameter1, fImpactParameter2, + fNSigTpcPi0, fNSigTpcPr0, fNSigTpcKa1, fNSigTpcPi2, fNSigTpcPr2, + fNSigTofPi0, fNSigTofPr0, fNSigTofKa1, fNSigTofPi2, fNSigTofPr2] + O2hfcollidlclite: [fIndexCollisions] + filter: "fPt > 1." 
+ gen: + level: mc + trees: + O2hfcandlcfullp: [fIndexMcCollisions, fPt, fY, fFlagMc, fOriginMcGen] + + merge: + - {base: reco, ref: evtorig, use: fIndexCollisions} + - {base: reco, ref: evtorigmc, use: fIndexCollisions} + - {base: gen, ref: evtorigmc, use: fIndexMcCollisions} + + write: + evtorig: + level: data + file: AnalysisResultsEvtOrig.pkl.lz4 + evt: + level: data + source: evtorig + file: AnalysisResultsEvt.pkl.lz4 + filter: "fIsEventReject == 0" + evtorigmc: + level: mc + file: AnalysisResultsEvtOrig.pkl.lz4 + evtmc: + level: mc + source: evtorigmc + file: AnalysisResultsEvt.pkl.lz4 + filter: "fIsEventReject == 0" + reco: + level: all + file: AnalysisResultsReco.pkl.lz4 + gen: + level: mc + file: AnalysisResultsGen.pkl.lz4 + + variables: + var_all: [fIndexCollisions, fPosX, fPosY, fPosZ, fFlagMc, fCandidateSelFlag, fOriginMcRec, fIsCandidateSwapped, fY, fEta, fPt, fCpa, fCpaXY, fM, + fChi2PCA, fDecayLength, fDecayLengthXY, fPtProng0, fPtProng1, fPtProng2, fImpactParameter0, fImpactParameter1, fImpactParameter2, + fNSigTpcPi0, fNSigTpcPr0, fNSigTpcKa1, fNSigTpcPi2, fNSigTpcPr2, + fNSigTofPi0, fNSigTofPr0, fNSigTofKa1, fNSigTofPi2, fNSigTofPr2, + fNSigTpcTofPi0, fNSigTpcTofPr0, fNSigTpcTofKa1, fNSigTpcTofPi2, fNSigTpcTofPr2] + var_jet: [fJetPt, fJetEta, fJetPhi] + var_jetsub: [fZg, fRg, fNsd] + var_jet_match: [df, fIndexHfCand2Prong] + var_jetsub_match: [df, fIndexD0ChargedJets] + var_evt: + data: [fIndexCollisions, fIsEventReject, fNumContrib, fMultZeqNTracksPV, fMultZeqFT0A, fMultZeqFT0C, fMultFT0M, fMultZeqFV0A] + mc: [fIndexCollisions, fIndexMcCollisions, fIsEventReject, fNumContrib, fMultZeqNTracksPV, fMultZeqFT0A, fMultZeqFT0C, fMultFT0M, fMultZeqFV0A] + var_gen: [fIndexMcCollisions, fPt, fY, fFlagMc, fOriginMcGen] + var_evt_match: [df, fIndexCollisions] + var_evt_match_mc: [df, fIndexMcCollisions] + var_training: [[fImpactParameter0, fImpactParameter1, fImpactParameter2, fPtProng0, fPtProng1, fPtProng2, fCpa, fChi2PCA, + fNSigTpcPi0, fNSigTpcPr0, 
fNSigTpcKa1, fNSigTpcPi2, fNSigTpcPr2, fNSigTofPi0, fNSigTofPr0, fNSigTofKa1, fNSigTofPi2, fNSigTofPr2], + [fImpactParameter0, fImpactParameter1, fImpactParameter2, fPtProng0, fPtProng1, fPtProng2, fCpa, fChi2PCA, + fNSigTpcPi0, fNSigTpcPr0, fNSigTpcKa1, fNSigTpcPi2, fNSigTpcPr2, fNSigTofPi0, fNSigTofPr0, fNSigTofKa1, fNSigTofPi2, fNSigTofPr2], + [fImpactParameter0, fImpactParameter1, fImpactParameter2, fPtProng0, fPtProng1, fPtProng2, fCpa, fChi2PCA, + fNSigTpcPi0, fNSigTpcPr0, fNSigTpcKa1, fNSigTpcPi2, fNSigTpcPr2, fNSigTofPi0, fNSigTofPr0, fNSigTofKa1, fNSigTofPi2, fNSigTofPr2], + [fImpactParameter0, fImpactParameter1, fImpactParameter2, fPtProng0, fPtProng1, fPtProng2, fCpa, fChi2PCA, + fNSigTpcPi0, fNSigTpcPr0, fNSigTpcKa1, fNSigTpcPi2, fNSigTpcPr2, fNSigTofPi0, fNSigTofPr0, fNSigTofKa1, fNSigTofPi2, fNSigTofPr2], + [fImpactParameter0, fImpactParameter1, fImpactParameter2, fPtProng0, fPtProng1, fPtProng2, fCpa, fChi2PCA, + fNSigTpcPi0, fNSigTpcPr0, fNSigTpcKa1, fNSigTpcPi2, fNSigTpcPr2, fNSigTofPi0, fNSigTofPr0, fNSigTofKa1, fNSigTofPi2, fNSigTofPr2], + [fImpactParameter0, fImpactParameter1, fImpactParameter2, fPtProng0, fPtProng1, fPtProng2, fCpa, fChi2PCA, + fNSigTpcPi0, fNSigTpcPr0, fNSigTpcKa1, fNSigTpcPi2, fNSigTpcPr2, fNSigTofPi0, fNSigTofPr0, fNSigTofKa1, fNSigTofPi2, fNSigTofPr2]] + var_selected: [fPosX, fPosY, fPosZ, fY, fEta, fPt, fCpa, fCpaXY, fM, fChi2PCA, fDecayLength, fDecayLengthXY, + fNSigTpcPi0, fNSigTpcPr0, fNSigTpcKa1, fNSigTpcPi2, fNSigTpcPr2, + fNSigTofPi0, fNSigTofPr0, fNSigTofKa1, fNSigTofPi2, fNSigTofPr2] + var_boundaries: [fDecayLength, fPt] + var_correlation: + - [fDecayLength, fChi2PCA, fCpa] + - [fPt, fPt, fPt] + var_class: class + var_inv_mass: fM + var_y: fY + var_evt_sel: fIsEventReject + var_cuts: + - [fPtProng0, lt, null] + - [fPtProng1, lt, null] + - [fPtProng2, lt, null] + - [fCpa, lt, null] + - [fDecayLength, lt, null] + - [fChi2PCA, lt, null] + + plot_options: + prob_cut_scan: + pt_prong0: + xlim: + - 0 + - 8 + pt_prong1: + 
xlim: + - 0 + - 8 + pt_prong2: + xlim: + - 0 + - 8 + fDecayLength: + xlim: + - 0 + - 0.08 + fChi2PCA: + xlim: + - 0 + - 20. + fNSigTofPr0: + xlim: [-50, 50] + xlabel: "n\\sigma_\\mathrm{TOF}(p)0" + fNSigTofPi0: + xlim: [-50, 50] + xlabel: "n\\sigma_\\mathrm{TOF}(\\pi)0" + fNSigTofKa1: + xlim: [-10, 10] + xlabel: "n\\sigma_\\mathrm{TOF}(K)1" + fNSigTofPr2: + xlim: [-50, 50] + xlabel: "n\\sigma_\\mathrm{TOF}(p)2" + fNSigTofPi2: + xlim: [-50, 50] + xlabel: "n\\sigma_\\mathrm{TOF}(\\pi)2" + fNSigTpcPr0: + xlim: [-50, 50] + xlabel: "n\\sigma_\\mathrm{TPC}(p)0" + fNSigTpcPi0: + xlim: [-50, 50] + xlabel: "n\\sigma_\\mathrm{TPC}(\\pi)0" + fNSigTpcKa1: + xlim: [-10, 10] + xlabel: "n\\sigma_\\mathrm{TPC}(K)1" + fNSigTpcPr2: + xlim: [-50, 50] + xlabel: "n\\sigma_\\mathrm{TPC}(p)2" + fNSigTpcPi2: + xlim: [-50, 50] + xlabel: "n\\sigma_\\mathrm{TPC}(\\pi)2" + eff_cut_scan: + pt_prong0: + xlim: + - 0 + - 8 + pt_prong1: + xlim: + - 0 + - 8 + pt_prong2: + xlim: + - 0 + - 8 + fDecayLength: + xlim: + - 0 + - 0.08 + fChi2PCA: + xlim: + - 0 + - 20. 
+ files_names: + namefile_unmerged_tree: AO2D.root + namefile_reco: AnalysisResultsReco.pkl + namefile_evt: AnalysisResultsEvt.pkl + namefile_evtvalroot: AnalysisResultsROOTEvtVal.root + namefile_evtorig: AnalysisResultsEvtOrig.pkl + namefile_gen: AnalysisResultsGen.pkl + namefile_reco_applieddata: AnalysisResultsRecoAppliedData.pkl + namefile_reco_appliedmc: AnalysisResultsRecoAppliedMC.pkl + namefile_mcweights: mcweights.root + treeoriginreco: 'O2hfcandlclite' + treeorigingen: 'O2hfcandlcfullp' + treeoriginevt: 'O2hfcandlcfullev' + treeoutput: "Lctree" + histofilename: "masshisto.root" + efffilename: "effhisto.root" + respfilename: "resphisto.root" + crossfilename: "cross_section_tot.root" + + multi: + data: + nprocessesparallel: 60 + maxfiles : [-1] #list of periods + chunksizeunp : [100] #list of periods + chunksizeskim: [100] #list of periods + fracmerge : [0.05] #list of periods + seedmerge: [12] #list of periods + period: [LHC22o] #list of periods + select_period: [1] + prefix_dir: /data2/MLhep/ + unmerged_tree_dir: [real/train_158254/alice/cern.ch/user/a/alihyperloop/jobs/0029] #list of periods + pkl: [LHC22pp/period_LHC22o/pkldata] #list of periods + pkl_skimmed: [LHC22pp/period_LHC22o/pklskdata] #list of periods + pkl_skimmed_merge_for_ml: [LHC22pp/period_LHC22o/pklskmldata] #list of periods + pkl_skimmed_merge_for_ml_all: LHC22pp/mltotdata + pkl_evtcounter_all: LHC22pp/evttotdata + #select_jobs: [[hy_189959], [hy_189000]] + mcreweights: [../Analyses] + mc: + nprocessesparallel: 50 + maxfiles : [-1, -1] #list of periods + chunksizeunp : [100, 100] #list of periods + chunksizeskim: [100, 100] #list of periods + fracmerge : [1.0, 1.0] #list of periods + seedmerge: [12, 12] #list of periods + period: [LHC22b1b, LHC22b1a] #list of periods + select_period: [1, 1] + prefix_dir: /data2/MLhep/ + unmerged_tree_dir: [sim/train_159856/alice/cern.ch/user/a/alihyperloop/jobs/0029, + sim/train_159854/alice/cern.ch/user/a/alihyperloop/jobs/0029] #list of periods + pkl: 
[LHC22pp_mc_Jochen/prod_LHC22b1b/pklmc, + LHC22pp_mc_Jochen/prod_LHC22b1a/pklmc] #list of periods + pkl_skimmed: [LHC22pp_mc/prod_LHC22b1b/pklskmc, + LHC22pp_mc/prod_LHC22b1a/pklskmc] #list of periods + pkl_skimmed_merge_for_ml: [LHC22pp_mc/prod_LHC22b1b/pklskmlmc, + LHC22pp_mc/prod_LHC22b1a/pklskmlmc] #list of periods + pkl_skimmed_merge_for_ml_all: LHC22pp_mc/prod_LHC22/mltotmc + pkl_evtcounter_all: LHC22pp_mc/prod_LHC22/evttotmc + mcreweights: [../Analyses, ../Analyses] + ml: + evtsel: null + triggersel: + data: null + mc: null + + nclasses: [50000, 50000] + equalise_sig_bkg: True + sampletags: [0, 1] + sel_bkg: fM < 2.22 or fM > 2.35 # for plotting significance; should agree with bkg selection in sel_ml + # best to have non-prompt (the smallest class) last, so the plots won't complain about the middle class missing + sel_ml: [fM < 2.22 or fM > 2.35, ismcsignal == 1 and ismcprompt == 1] + class_labels: [bkg, prompt] + nkfolds: 5 + rnd_shuffle: 12 + rnd_splt: 12 + rnd_all: 12 # Set to None for pure randomness + test_frac: 0.2 + binmin: [1,2,4,6,8,12] # must be equal to sel_skim_binmin (sel_skim_binmin bins) + binmax: [2,4,6,8,12,24] # must be equal to sel_skim_binmax (sel_skim_binmin bins) + mltype: BinaryClassification + ncorescrossval: 10 + prefix_dir_ml: /data2/MLhep/ + mlplot: mlplot # to be removed + mlout: mlout # to be removed + + opt: + isFONLLfromROOT: true + filename_fonll: 'data/fonll/DmesonLcPredictions_502TeV_y05_FFptDepLHCb_BRpythia8.root' # file with FONLL predictions + fonll_particle: 'hLcpkpi' + fonll_pred: 'max' # edge of the FONLL prediction + FF: 0.1281 # fragmentation fraction + sigma_MB: 57.8e-3 # Minimum Bias cross section (pp) 50.87e-3 [b], 1 for Pb-Pb + Taa: 1 # 23260 [b^-1] in 0-10% Pb-Pb, 3917 [b^-1] in 30-50% Pb-Pb, 1 for pp + BR: 6.23e-2 # branching ratio of the decay Lc -> p K- pi+ + f_prompt: 0.9 # estimated fraction of prompt candidates + bkg_data_fraction: 0.1 # fraction of real data used in the estimation + num_steps: 111 # 
number of steps used in efficiency and signif. estimation + bkg_function: pol2 # fit function for bkg (among TH1 predefined fit functions, e.g. expo, pol1, pol2, ...) + save_fit: True # save bkg fits with the various cuts on ML output + raahp: [1,1,1,1,1,1] # sel_skim_binmin bins + presel_gen_eff: "abs(fY) < 0.8" + #presel_gen_eff: "abs(fY) < 0.8 and abs(fPosZ) < 10" + + mlapplication: + data: + prefix_dir_app: /data2/MLhep/ + pkl_skimmed_dec: [LHC22pp/MLapplication/prod_LHC22o/skpkldecdata] #list of periods + pkl_skimmed_decmerged: [LHC22pp/MLapplication/prod_LHC22o/skpkldecdatamerged] #list of periods + mc: + prefix_dir_app: /data2/MLhep/ + pkl_skimmed_dec: [LHC22pp_mc/MLapplication/prod_LHC22b1b/skpkldecmc, + LHC22pp_mc/MLapplication/prod_LHC22b1a/skpkldecmc,] #list of periods + pkl_skimmed_decmerged: [LHC22pp_mc/MLapplication/prod_LHC22b1b/skpkldecmcmerged, + LHC22pp_mc/MLapplication/prod_LHC22b1a/skpkldecmcmerged] #list of periods + modelname: xgboost + modelsperptbin: [xgboost_classifierLcpKpi_dfselection_fPt_1.0_2.0.sav, + xgboost_classifierLcpKpi_dfselection_fPt_2.0_4.0.sav, + xgboost_classifierLcpKpi_dfselection_fPt_4.0_6.0.sav, + xgboost_classifierLcpKpi_dfselection_fPt_6.0_8.0.sav, + xgboost_classifierLcpKpi_dfselection_fPt_8.0_12.0.sav, + xgboost_classifierLcpKpi_dfselection_fPt_12.0_24.0.sav] + probcutpresel: + data: [0.4, 0.4, 0.4, 0.4, 0.4, 0.4] #list of nbins + mc: [0.4, 0.4, 0.4, 0.4, 0.4, 0.4] #list of nbins + probcutoptimal: [0.5, 0.5, 0.5, 0.5, 0.5, 0.5] #list of nbins + + analysis: + indexhptspectrum: -1 #kD0Kpi=0, kDplusKpipi=1, kDstarD0pi=2, kDsKKpi=3, kLctopKpi=4, kLcK0Sp=5 + fd_method: -1 #knone=0, kfc=1, kNb=2 + cctype: -1 #kpp7 + sigmav0: -1 + inputfonllpred: null + dir_general_plots: analysis_plots + + Run3analysis: + proc_type: Dhadrons + useperiod: [1,1] + plotbin: [1] + usesinglebineff: 0 + sel_binmin2: [1,2,4,6,8,12] #list of nbins + sel_binmax2: [2,4,6,8,12,24] #list of nbins + var_binning2: null + triggerbit: '' + use_cuts: True + 
cuts: + - "fDecayLength > 0.02" + - "fDecayLength > 0.02" + - "fDecayLength > 0.02" + - "fDecayLength > 0.02" + - "fDecayLength > 0.02" + - "fDecayLength > 0.02" + + + sel_an_binmin: [1,2,4,6,8,12] + sel_an_binmax: [2,4,6,8,12,24] + binning_matching: [0,1,2,3,4,5] + presel_gen_eff: "abs(fY) < 0.8" + evtsel: null + triggersel: + data: null + mc: null + weighttrig: false + + data: + runselection: [null] #FIXME + prefix_dir_res: /data2/MLhep/ + results: [LHC22pp/Results/prod_LHC22o/resultsdata] #list of periods + resultsallp: LHC22pp/Results/resultsdatatot + mc: + runselection: [null, null] #FIXME + prefix_dir_res: /data2/MLhep/ + results: [LHC22pp_mc/Results/prod_LHC22b1b/resultsmc, + LHC22pp_mc/Results/prod_LHC22b1a/resultsmc] #list of periods + resultsallp: LHC22pp_mc/Results/prod_LHC22/resultsmctot + + mass_fit_lim: [2.14, 2.436] # region for the fit of the invariant mass distribution [GeV/c^2] + bin_width: 0.001 # bin width of the invariant mass histogram + # To initialize the individual fits in pT bins + # Decide whether to take the sigma from MC or data for individual fits + init_fits_from: [mc,mc,mc,mc,mc,mc] # data or mc + sgnfunc: [kGaus,kGaus,kGaus,kGaus,kGaus,kGaus] + bkgfunc: [Pol2,Pol2,Pol2,Pol2,Pol2,Pol2] + masspeak: 2.286 + massmin: [2.16, 2.16, 2.16, 2.14, 2.14, 2.14] + massmax: [2.416, 2.416, 2.416, 2.436, 2.436, 2.436] + rebin: [6,6,7,8,8,8] + fix_mean: [false,false,false,false,false,false] + fix_sigma: [false,false,false,false,false,false] + # Fix mean and/or sigma + FixedMean: False + SetFixGaussianSigma: [false,false,false,false,false,false] + # Use value set for "masspeak" for initializing total fit, otherwise what is derived from MC fit is used + SetInitialGaussianMean: true + # Use values set for "sigmaarray" for initializing total fit (per pT bin), + # otherwise what is derived from MC fit is used + SetInitialGaussianSigma: [false,false,false,false,false,false] + # Max percentage deviation in sigma (from init) to be considered as a good fit + 
MaxPercSigmaDeviation: 0.5 + # Number of initial signal sigmas around the mean to be excluded for side-band fit + exclude_nsigma_sideband: 4 + # Sigma around mean where signal is integrated after total fit has been done + nsigma_signal: 3 + dolikelihood: true + sigmaarray: [0.01,0.01,0.01,0.01,0.01,0.01] + FixedSigma: false + fitcase: Lc + latexnamehadron: "#Lambda_{c}^{pK#pi}" + latexbin2var: "n_{trkl}" + nevents: null + dodoublecross: false + dobkgfromsideband: false + + systematics: + probvariation: + useperiod: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] #period from where to define prob cuts + ncutvar: 10 #number of looser and tighter variations + maxperccutvar: 0.25 #max diff in efficiency for loosest/tightest var + cutvarminrange: [0.70, 0.50, 0.50, 0.30, 0.30, 0.30] #Min starting point for scan + cutvarmaxrange: [0.95, 0.90, 0.90, 0.80, 0.80, 0.80] #Max starting point for scan + fixedmean: True #Fix mean cutvar histo to central fit + fixedsigma: True #Fix sigma cutvar histo to central fit \ No newline at end of file diff --git a/machine_learning_hep/processer.py b/machine_learning_hep/processer.py old mode 100755 new mode 100644 index 53d3b2c6b4..7ab7942d86 --- a/machine_learning_hep/processer.py +++ b/machine_learning_hep/processer.py @@ -29,7 +29,7 @@ import numpy as np from machine_learning_hep.selectionutils import selectfidacc from machine_learning_hep.bitwise import tag_bit_df #, filter_bit_df -from machine_learning_hep.utilities import selectdfquery, merge_method, mask_df +from machine_learning_hep.utilities import dfquery, selectdfquery, merge_method, mask_df from machine_learning_hep.utilities import list_folders, createlist, appendmainfoldertolist from machine_learning_hep.utilities import create_folder_struc, seldf_singlevar, openfile from machine_learning_hep.utilities import mergerootfiles, count_df_length_pkl @@ -93,26 +93,21 @@ def __init__(self, case, datap, run_param, mcordata, p_maxfiles, # pylint: disab self.p_chunksizeunp = p_chunksizeunp 
self.p_chunksizeskim = p_chunksizeskim + self.df_read = datap['dfs']['read'] + self.df_merge = datap['dfs'].get('merge', None) + self.df_write = datap['dfs'].get('write', None) + #parameter names self.p_maxprocess = p_maxprocess self.indexsample = None self.p_dofullevtmerge = datap["dofullevtmerge"] #namefile root self.n_root = datap["files_names"]["namefile_unmerged_tree"] - #troot trees names - self.n_treereco = datap["files_names"]["treeoriginreco"] - self.n_treegen = datap["files_names"]["treeorigingen"] - self.n_treeevt = datap["files_names"]["treeoriginevt"] - if self.mcordata == 'mc': - self.n_treejetreco = datap["files_names"].get("treejetdet", None) - self.n_treejetsubreco = datap["files_names"].get("treejetsubdet", None) - else: - self.n_treejetreco = datap["files_names"].get("treejetdata", None) - self.n_treejetsubreco = datap["files_names"].get("treejetsubdata", None) - self.n_treejetgen = datap["files_names"].get("treejetgen", None) - self.n_treejetsubgen = datap["files_names"].get("treejetsubgen", None) #namefiles pkl + # def nget(d : dict, k : list, dd = None): + # return nget(d.get(k.pop(0), {}), k, dd) if len(k) > 1 else d.get(k.pop(0), dd) + # nget(datap, ['dfs', 'write', 'jetsubdet', 'file']) self.n_reco = datap["files_names"]["namefile_reco"] self.n_evt = datap["files_names"]["namefile_evt"] self.n_evtorig = datap["files_names"]["namefile_evtorig"] @@ -141,28 +136,9 @@ def __init__(self, case, datap, run_param, mcordata, p_maxfiles, # pylint: disab self.b_mcrefl = datap["bitmap_sel"]["ismcrefl"] #variables name - self.v_all = datap["variables"]["var_all"] self.v_train = datap["variables"]["var_training"] - self.v_evt = datap["variables"]["var_evt"][self.mcordata] - self.v_gen = datap["variables"]["var_gen"] - self.v_evtmatch = datap["variables"]["var_evt_match"] - self.v_evtmatch_mc = datap["variables"]["var_evt_match_mc"] - if self.mcordata == 'mc': - self.v_jetmatch = datap["variables"].get("var_jet_match_det", None) - self.v_jetsubmatch = 
datap["variables"].get("var_jetsub_match_det", None) - self.v_jet = datap["variables"].get("var_jet_det", None) - self.v_jetsub = datap["variables"].get("var_jetsub_det", None) - else: - self.v_jetmatch = datap["variables"].get("var_jet_match_data", None) - self.v_jetsubmatch = datap["variables"].get("var_jetsub_match_data", None) - self.v_jet = datap["variables"].get("var_jet_data", None) - self.v_jetsub = datap["variables"].get("var_jetsub_data", None) - self.v_jet_gen = datap["variables"].get("var_jet_gen", None) - self.v_jetsub_gen = datap["variables"].get("var_jetsub_gen", None) - self.v_jetmatch_mc = datap["variables"].get("var_jet_match_mc", None) - self.v_jetmatch_mc_hf = datap["variables"].get("var_jet_match_mc_hf", None) - self.v_jetsubmatch_mc = datap["variables"].get("var_jetsub_match_mc", None) self.v_bitvar = datap["bitmap_sel"]["var_name"] + self.v_bitvar_gen = datap["bitmap_sel"]["var_name_gen"] self.v_bitvar_origgen = datap["bitmap_sel"]["var_name_origgen"] self.v_bitvar_origrec = datap["bitmap_sel"]["var_name_origrec"] self.v_candtype = datap["var_cand"] @@ -308,25 +284,7 @@ def __init__(self, case, datap, run_param, mcordata, p_maxfiles, # pylint: disab self.do_custom_analysis_cuts = datap["analysis"][self.typean].get("use_cuts", False) def unpack(self, file_index, max_no_keys = None): # pylint: disable=too-many-branches - dfevtorig = None - dfreco = None - dfjetreco = None - dfjetsubreco = None - dfgen = None - dfjetgen = None - dfjetsubgen = None - - def dfmerge(dfl, dfr, **kwargs): - """Merge dfl and dfr""" - try: - return pd.merge(dfl, dfr, **kwargs) - except Exception as e: - self.logger.error('merging failed: %s', str(e)) - dfl.info() - dfr.info() - raise e - - def dfread(trees, cols): + def dfread(rdir, trees, cols, idx_name=None): """Read DF from multiple (joinable) O2 tables""" try: if not isinstance(trees, list): @@ -334,29 +292,42 @@ def dfread(trees, cols): cols = [cols] # if all(type(var) is str for var in vars): vars = [vars] df = 
None - for tree, col in zip(trees, cols): - data = tree.arrays(expressions=col, library='np') - dfnew = pd.DataFrame(columns=col, data=data) - dfnew['df'] = int(df_no) - df = pd.concat([df, dfnew], axis=1) + for tree, col in zip([rdir[name] for name in trees], cols): + try: + data = tree.arrays(expressions=col, library='np') + dfnew = pd.DataFrame(columns=col, data=data) + df = pd.concat([df, dfnew], axis=1) + except Exception as e: # pylint: disable=broad-except + self.logger.critical('Failed to read data frame from tree %s', str(e)) + sys.exit() + df['df'] = int(df_no) + if idx_name: + # df.rename_axis(idx_name, inplace=True) + df[idx_name] = df.index + df.set_index(['df', idx_name], inplace=True) return df except Exception as e: self.logger.exception('Failed to read data from trees: %s', str(e)) raise e def dfappend(name: str, dfa): + """Append DF row-wise""" dfs[name] = pd.concat([dfs.get(name, None), dfa]) - def read_df(tree, df_base, var): + def dfmerge(dfl, dfr, **kwargs): + """Merge dfl and dfr""" try: - df = pd.DataFrame( - columns=var, - data=tree.arrays(expressions=var, library="np")) - df['df'] = int(df_no) - return pd.concat([df_base, df]) - except Exception as e: # pylint: disable=broad-except - self.logger.critical('Failed to read data frame from tree %s', str(e)) - sys.exit() + return pd.merge(dfl, dfr, **kwargs) + except Exception as e: + self.logger.error('merging failed: %s', str(e)) + dfl.info() + dfr.info() + raise e + + def dfuse(df_spec): + return ((df_spec['level'] == 'all') or + (df_spec['level'] in ('mc', 'gen', 'det') and self.mcordata == 'mc') or + (df_spec['level'] in ('data') and self.mcordata == 'data')) self.logger.info('unpacking: %s', self.l_root[file_index]) dfs = {} @@ -373,105 +344,82 @@ def read_df(tree, df_base, var): df_processed.add(df_no) rdir = rfile[key] - tree = rdir[self.n_treereco] # accessing the tree is the slow bit! 
- dfreco = read_df(tree, dfreco, self.v_all) - dfappend('reco', dfread(rdir[self.n_treereco], self.v_all)) - dfevtorig = read_df(rdir[self.n_treeevt], dfevtorig, self.v_evt) - - if self.n_treejetreco: - dfjetreco = read_df(rdir[self.n_treejetreco], - dfjetreco, self.v_jet) - - if self.n_treejetsubreco: - dfjetsubreco = read_df(rdir[self.n_treejetsubreco], - dfjetsubreco, self.v_jetsub) - - if self.mcordata == 'mc': - dfgen = read_df(rdir[self.n_treegen], - dfgen, self.v_gen) - - if self.n_treejetgen: - dfjetgen = read_df(rdir[self.n_treejetgen], - dfjetgen, self.v_jet_gen) - - if self.n_treejetsubgen: - dfjetsubgen = read_df(rdir[self.n_treejetsubgen], - dfjetsubgen, self.v_jetsub_gen) - - dfevtorig = selectdfquery(dfevtorig, self.s_cen_unp) - dfevtorig = dfevtorig.reset_index(drop=True) - pickle.dump(dfevtorig, openfile(self.l_evtorig[file_index], "wb"), protocol=4) - - dfevt = selectdfquery(dfevtorig, self.s_good_evt_unp) - dfevt = dfevt.reset_index(drop=True) - pickle.dump(dfevt, openfile(self.l_evt[file_index], "wb"), protocol=4) - - if dfjetreco is not None: - if dfjetsubreco is not None: - dfjetreco = dfmerge(dfjetreco, dfjetsubreco, how='inner', on=self.v_jetsubmatch) - dfreco = dfmerge(dfjetreco, dfreco, on=self.v_jetmatch) - - dfreco = selectdfquery(dfreco, self.s_reco_unp) - - if 'fIndexCollisions' not in dfevt.columns: - self.logger.warning('Adding fIndexCollisions retroactively') - dfevt.rename_axis('fIndexCollisions', inplace=True) - - dfreco = dfmerge(dfreco, dfevt, on=self.v_evtmatch) - + for df_name, df_spec in self.df_read.items(): + if dfuse(df_spec): + df = dfread(rdir, list(df_spec['trees'].keys()), + list(df_spec['trees'].values()), + idx_name=df_spec.get('index', None)) + dfappend(df_name, df) + + for df_name, df_spec in self.df_read.items(): + if dfuse(df_spec): + if 'extra' in df_spec: + for col_name, col_val in df_spec['extra'].items(): + dfs[df_name][col_name] = dfs[df_name].eval(col_val) + if 'filter' in df_spec: + dfquery(dfs[df_name], 
df_spec['filter'], inplace=True) + + # extra logic should eventually come from DB if self.s_apply_yptacccut is True: - isselacc = selectfidacc(dfreco[self.v_var_binning].values, - dfreco[self.v_rapy].values) - dfreco = dfreco[np.array(isselacc, dtype=bool)] - + isselacc = selectfidacc(dfs['reco'][self.v_var_binning].values, + dfs['reco'][self.v_rapy].values) + dfs['reco'] = dfs['reco'][np.array(isselacc, dtype=bool)] - # needs to be revisited for Run 3 if self.mcordata == "mc": - dfreco[self.v_ismcsignal] = np.array(tag_bit_df(dfreco, self.v_bitvar, - self.b_mcsig, True), dtype=int) - dfreco[self.v_ismcprompt] = np.array(tag_bit_df(dfreco, self.v_bitvar_origrec, - self.b_mcsigprompt), dtype=int) - dfreco[self.v_ismcfd] = np.array(tag_bit_df(dfreco, self.v_bitvar_origrec, - self.b_mcsigfd), dtype=int) + dfs['reco'][self.v_ismcsignal] = np.array(tag_bit_df(dfs['reco'], self.v_bitvar, + self.b_mcsig, True), dtype=int) + dfs['reco'][self.v_ismcprompt] = np.array(tag_bit_df(dfs['reco'], self.v_bitvar_origrec, + self.b_mcsigprompt), dtype=int) + dfs['reco'][self.v_ismcfd] = np.array(tag_bit_df(dfs['reco'], self.v_bitvar_origrec, + self.b_mcsigfd), dtype=int) + dfs['reco'][self.v_ismcbkg] = np.array(tag_bit_df(dfs['reco'], self.v_bitvar, + self.b_mcbkg, True), dtype=int) if self.v_swap: - mydf = dfreco[self.v_candtype] == dfreco[self.v_swap] + 1 - dfreco[self.v_ismcsignal] = np.logical_and(dfreco[self.v_ismcsignal] == 1, mydf) - dfreco[self.v_ismcprompt] = np.logical_and(dfreco[self.v_ismcprompt] == 1, mydf) - dfreco[self.v_ismcfd] = np.logical_and(dfreco[self.v_ismcfd] == 1, mydf) - - dfreco[self.v_ismcbkg] = np.array(tag_bit_df(dfreco, self.v_bitvar, - self.b_mcbkg, True), dtype=int) - - pickle.dump(dfreco, openfile(self.l_reco[file_index], "wb"), protocol=4) - - if self.mcordata == "mc": - dfgen = dfmerge(dfgen, dfevtorig, on=self.v_evtmatch_mc) - - dfgen[self.v_isstd] = np.array(tag_bit_df(dfgen, self.v_bitvar, + mydf = dfs['reco'][self.v_candtype] == 
dfs['reco'][self.v_swap] + 1 + dfs['reco'][self.v_ismcsignal] = \ + np.logical_and(dfs['reco'][self.v_ismcsignal] == 1, mydf) + dfs['reco'][self.v_ismcprompt] = \ + np.logical_and(dfs['reco'][self.v_ismcprompt] == 1, mydf) + dfs['reco'][self.v_ismcfd] = np.logical_and(dfs['reco'][self.v_ismcfd] == 1, mydf) + + dfs['gen'][self.v_isstd] = np.array(tag_bit_df(dfs['gen'], self.v_bitvar_gen, self.b_std), dtype=int) - dfgen[self.v_ismcsignal] = np.array(tag_bit_df(dfgen, self.v_bitvar, + dfs['gen'][self.v_ismcsignal] = np.array(tag_bit_df(dfs['gen'], self.v_bitvar_gen, self.b_mcsig, True), dtype=int) - dfgen[self.v_ismcprompt] = np.array(tag_bit_df(dfgen, self.v_bitvar_origgen, + dfs['gen'][self.v_ismcprompt] = np.array(tag_bit_df(dfs['gen'], self.v_bitvar_origgen, self.b_mcsigprompt), dtype=int) - dfgen[self.v_ismcfd] = np.array(tag_bit_df(dfgen, self.v_bitvar_origgen, + dfs['gen'][self.v_ismcfd] = np.array(tag_bit_df(dfs['gen'], self.v_bitvar_origgen, self.b_mcsigfd), dtype=int) - dfgen[self.v_ismcbkg] = np.array(tag_bit_df(dfgen, self.v_bitvar, + dfs['gen'][self.v_ismcbkg] = np.array(tag_bit_df(dfs['gen'], self.v_bitvar_gen, self.b_mcbkg, True), dtype=int) - dfgen = dfgen.reset_index(drop=True) - - if dfjetgen is not None: - if dfjetsubgen is not None: - dfjetgen = dfmerge(dfjetgen, dfjetsubgen, - how='inner', on=self.v_jetsubmatch_mc) - # Workaround for HF tree creator filling: - # McCollisionId -> CollisionId - # McParticleId -> HfCand2ProngId - dfgen = dfmerge(dfjetgen, dfgen, - left_on=self.v_jetmatch_mc, - right_on=self.v_jetmatch_mc_hf) - - pickle.dump(dfgen, openfile(self.l_gen[file_index], "wb"), protocol=4) + + if self.df_merge: + for m_spec in self.df_merge: + base = m_spec['base'] + ref = m_spec['ref'] + out = m_spec.get('out', base) + if all([dfuse(self.df_read[base]), dfuse(self.df_read[ref])]): + if (on := m_spec.get('use', None)) is not None: + self.logger.info('merging %s with %s on %s into %s', base, ref, on, out) + if not isinstance(on, list) or 'df' 
not in on: + on = ['df', on] + dfs[out] = dfmerge(dfs[base], dfs[ref], on=on) + else: + var = self.df_read[ref]['index'] + self.logger.info('merging %s with %s on %s into %s', base, ref, var, out) + dfs[out] = dfmerge(dfs[base], dfs[ref], + left_on=['df', var], right_index=True) + + if self.df_write: + for df_name, df_spec in self.df_write.items(): + if dfuse(df_spec): + self.logger.info('writing %s to %s', df_name, df_spec['file']) + src = df_spec.get('source', df_name) + dfo = dfquery(dfs[src], df_spec.get('filter', None)) + path = os.path.join(self.d_pkl, self.l_path[file_index], df_spec['file']) + with openfile(path, "wb") as file: + pickle.dump(dfo, file, protocol=4) def skim(self, file_index): try: diff --git a/machine_learning_hep/utilities.py b/machine_learning_hep/utilities.py index 4aa393a0bd..f7c07e5c53 100644 --- a/machine_learning_hep/utilities.py +++ b/machine_learning_hep/utilities.py @@ -108,6 +108,9 @@ def conv_none(value): # Mask the at the column name with mask value df_to_mask.loc[mask_indices, [mc["column"]]] = conv_none(mc["mask_with"]) +def dfquery(df, selection, **kwargs): + return df.query(selection, **kwargs) if selection is not None else df + def selectdfquery(dfr, selection): """ Query on dataframe