From 9477bbc63ce1e22eb5a1d7bc2004022ff2f97dc8 Mon Sep 17 00:00:00 2001 From: Jochen Klein Date: Tue, 23 Jan 2024 11:51:32 +0100 Subject: [PATCH] Add other data levels and filtering --- ...abase_ml_parameters_D0pp_jet_newformat.yml | 57 ++++++++++++-- machine_learning_hep/processer.py | 78 ++++++++++--------- 2 files changed, 92 insertions(+), 43 deletions(-) diff --git a/machine_learning_hep/data/data_run3/database_ml_parameters_D0pp_jet_newformat.yml b/machine_learning_hep/data/data_run3/database_ml_parameters_D0pp_jet_newformat.yml index ba15a14de0..64f006bec7 100644 --- a/machine_learning_hep/data/data_run3/database_ml_parameters_D0pp_jet_newformat.yml +++ b/machine_learning_hep/data/data_run3/database_ml_parameters_D0pp_jet_newformat.yml @@ -37,7 +37,7 @@ D0pp_jet: var_cand: fCandidateSelFlag # var_swap: fIsCandidateSwapped bitmap_sel: - var_name: fFlagMc + var_name: fFlagMcMatchRec var_name_origgen: fOriginMcGen var_name_origrec: fOriginMcRec var_isstd: isstd @@ -61,9 +61,10 @@ D0pp_jet: index: fIndexD0CollBases trees: O2hfd0collbase: [fNumContrib, fIsEventReject, fRunNumber] + reco: level: all - index: fIndexHfD0Bases + index: fIndexHfD0Bases_0 trees: O2hfd0base: [fIndexHfD0CollBases, fPt, fEta, fPhi, fM] O2hfd0mc: [fFlagMcMatchRec, fOriginMcRec] @@ -82,7 +83,8 @@ D0pp_jet: index: fIndexHfD0PBases_0 trees: O2hfd0pbase: [fPt, fEta, fPhi, fFlagMcMatchGen, fOriginMcGen] - coll: + + collgen: level: gen index: fIndexD0ChargedMCParticleLevelJetCollisionOutputs trees: @@ -98,13 +100,58 @@ D0pp_jet: index: fIndexD0ChargedMCParticleLevelJetSubstructures trees: O2d0mcpsso: [fIndexD0ChargedMCParticleLevelJetOutputs, fZg, fRg, fNsd] - # analogously: O2d0o, O2d0mcdo, O2d0sso, O2d0mcdsso + + colldet: + level: det + index: fIndexD0ChargedMCDetectorLevelJetCollisionOutputs + trees: + O2d0mcdco: [fPosZ, fCentrality, fEventSel] + jetdet: + level: det + index: fIndexD0ChargedMCDetectorLevelJetOutputs + trees: + O2d0mcdo: [fIndexD0ChargedMCDetectorLevelJetCollisionOutputs, 
fIndexHfD0Bases_0, + fJetPt, fJetPhi, fJetEta, fJetNConstituents] + jetsubdet: + level: det + index: fIndexD0ChargedMCDetectorLevelJetSubstructures + trees: + O2d0mcdsso: [fIndexD0ChargedMCDetectorLevelJetOutputs, fZg, fRg, fNsd] + + colldata: + level: data + index: fIndexD0ChargedJetCollisionOutputs + trees: + O2d0mcdco: [fPosZ, fCentrality, fEventSel] + jetdata: + level: data + index: fIndexD0ChargedJetOutputs + trees: + O2d0mcdo: [fIndexD0ChargedJetCollisionOutputs, fIndexHfD0Bases_0, + fJetPt, fJetPhi, fJetEta, fJetNConstituents] + jetsubdata: + level: data + index: fIndexD0JetSubstructures + trees: + O2d0mcdsso: [fIndexD0ChargedJetOutputs, fZg, fRg, fNsd] merge: - - {base: jetgen, ref: coll} + - {base: jetgen, ref: collgen} - {base: jetgen, ref: gen} - {base: jetsubgen, ref: jetgen} + - {base: jetdet, ref: colldet} + - {base: jetdet, ref: reco} + - {base: jetsubdet, ref: jetdet} + + - {base: jetdata, ref: colldata} + - {base: jetdata, ref: reco} + - {base: jetsubdata, ref: jetdata} + +# - {base: gen, ref: evtorig} + + write: + variables: var_all: [[fIndexHfD0CollBases, fPt, fEta, fPhi, fM], # O2hfd0base [fFlagMcMatchRec, fOriginMcRec]] # O2hfd0mc diff --git a/machine_learning_hep/processer.py b/machine_learning_hep/processer.py index a490b1132f..7450a127a3 100755 --- a/machine_learning_hep/processer.py +++ b/machine_learning_hep/processer.py @@ -28,7 +28,7 @@ import numpy as np from machine_learning_hep.selectionutils import selectfidacc from machine_learning_hep.bitwise import tag_bit_df #, filter_bit_df -from machine_learning_hep.utilities import selectdfquery, merge_method, mask_df +from machine_learning_hep.utilities import dfquery, selectdfquery, merge_method, mask_df from machine_learning_hep.utilities import list_folders, createlist, appendmainfoldertolist from machine_learning_hep.utilities import create_folder_struc, seldf_singlevar, openfile from machine_learning_hep.utilities import mergerootfiles, count_df_length_pkl @@ -352,6 +352,11 @@ def dfmerge(dfl, 
dfr, **kwargs): dfr.info() raise e + def dfuse(df_spec): + return ((df_spec['level'] == 'all') or + (df_spec['level'] in ('mc', 'gen', 'det') and self.mcordata == 'mc') or + (df_spec['level'] in ('data',) and self.mcordata == 'data')) + self.logger.info('unpacking: %s', self.l_root[file_index]) dfs = {} with uproot.open(self.l_root[file_index]) as rfile: @@ -368,54 +373,51 @@ def dfmerge(dfl, dfr, **kwargs): rdir = rfile[key] for df_name, df_spec in self.df_spec.items(): - # TODO: check level - dfappend(df_name, dfread(rdir, list(df_spec['trees'].keys()), list(df_spec['trees'].values()), idx_name=df_spec.get('index', None))) + if dfuse(df_spec): + dfappend(df_name, dfread(rdir, list(df_spec['trees'].keys()), list(df_spec['trees'].values()), idx_name=df_spec.get('index', None))) self.logger.info('pickling: %s', self.l_root[file_index]) - # dfevtorig = selectdfquery(dfevtorig, self.s_cen_unp) - # dfevtorig = dfevtorig.reset_index(drop=True) # TODO: is this safe for the new format? + + # TODO: specifications for cleaning and pickling should go to DB + dfquery(dfs['evtorig'], self.s_cen_unp, inplace=True) pickle.dump(dfs['evtorig'], openfile(self.l_evtorig[file_index], "wb"), protocol=4) - # dfevt = selectdfquery(dfevtorig, self.s_good_evt_unp) - # dfevt = dfevt.reset_index(drop=True) # TODO: is this safe for the new format? 
- # pickle.dump(dfevt, openfile(self.l_evt[file_index], "wb"), protocol=4) + dfs['evt'] = dfquery(dfs['evtorig'], self.s_good_evt_unp) + pickle.dump(dfs['evt'], openfile(self.l_evt[file_index], "wb"), protocol=4) if self.df_merge: for m_spec in self.df_merge: - self.logger.info('merging %s with %s based on %s', m_spec['base'], m_spec['ref'], self.df_spec[m_spec['ref']]['index']) - dfmerge(dfs[m_spec['base']], dfs[m_spec['ref']], on=self.df_spec[m_spec['ref']]['index']) - - # if dfjetreco is not None: - # if dfjetsubreco is not None: - # dfjetreco = dfmerge(dfjetreco, dfjetsubreco, how='inner', on=self.v_jetsubmatch) - # dfreco = dfmerge(dfjetreco, dfreco, on=self.v_jetmatch) + base = m_spec['base'] + ref = m_spec['ref'] + if all([dfuse(self.df_spec[base]), dfuse(self.df_spec[ref])]): + var = self.df_spec[ref]['index'] + self.logger.info('merging %s with %s based on %s', base, ref, var) + dfmerge(dfs[base], dfs[ref], on=['df', var]) - # dfreco = selectdfquery(dfreco, self.s_reco_unp) - - # dfreco = dfmerge(dfreco, dfevt, on=self.v_evtmatch) + dfquery(dfs['reco'], self.s_reco_unp, inplace=True) # if self.s_apply_yptacccut is True: - # isselacc = selectfidacc(dfreco[self.v_var_binning].values, - # dfreco[self.v_rapy].values) - # dfreco = dfreco[np.array(isselacc, dtype=bool)] + # isselacc = selectfidacc(dfs['reco'][self.v_var_binning].values, + # dfs['reco'][self.v_rapy].values) + # dfs['reco'] = dfs['reco'][np.array(isselacc, dtype=bool)] - # # needs to be revisited for Run 3 - # if self.mcordata == "mc": - # dfreco[self.v_ismcsignal] = np.array(tag_bit_df(dfreco, self.v_bitvar, - # self.b_mcsig), dtype=int) - # dfreco[self.v_ismcprompt] = np.array(tag_bit_df(dfreco, self.v_bitvar_origrec, - # self.b_mcsigprompt), dtype=int) - # dfreco[self.v_ismcfd] = np.array(tag_bit_df(dfreco, self.v_bitvar_origrec, - # self.b_mcsigfd), dtype=int) - - # if self.v_swap: - # mydf = dfreco[self.v_candtype] == dfreco[self.v_swap] + 1 - # dfreco[self.v_ismcsignal] = 
np.logical_and(dfreco[self.v_ismcsignal] == 1, mydf) - # dfreco[self.v_ismcprompt] = np.logical_and(dfreco[self.v_ismcprompt] == 1, mydf) - # dfreco[self.v_ismcfd] = np.logical_and(dfreco[self.v_ismcfd] == 1, mydf) - - # dfreco[self.v_ismcbkg] = np.array(tag_bit_df(dfreco, self.v_bitvar, - # self.b_mcbkg), dtype=int) + # needs to be revisited for Run 3 + if self.mcordata == "mc": + dfs['reco'][self.v_ismcsignal] = np.array(tag_bit_df(dfs['reco'], self.v_bitvar, + self.b_mcsig), dtype=int) + dfs['reco'][self.v_ismcprompt] = np.array(tag_bit_df(dfs['reco'], self.v_bitvar_origrec, + self.b_mcsigprompt), dtype=int) + dfs['reco'][self.v_ismcfd] = np.array(tag_bit_df(dfs['reco'], self.v_bitvar_origrec, + self.b_mcsigfd), dtype=int) + + if self.v_swap: + mydf = dfs['reco'][self.v_candtype] == dfs['reco'][self.v_swap] + 1 + dfs['reco'][self.v_ismcsignal] = np.logical_and(dfs['reco'][self.v_ismcsignal] == 1, mydf) + dfs['reco'][self.v_ismcprompt] = np.logical_and(dfs['reco'][self.v_ismcprompt] == 1, mydf) + dfs['reco'][self.v_ismcfd] = np.logical_and(dfs['reco'][self.v_ismcfd] == 1, mydf) + + dfs['reco'][self.v_ismcbkg] = np.array(tag_bit_df(dfs['reco'], self.v_bitvar, + self.b_mcbkg), dtype=int) pickle.dump(dfs['reco'], openfile(self.l_reco[file_index], "wb"), protocol=4)