Add other data levels and filtering
qgp committed Jan 23, 2024
1 parent 29ba6ff commit 9477bbc
Showing 2 changed files with 92 additions and 43 deletions.
@@ -37,7 +37,7 @@ D0pp_jet:
var_cand: fCandidateSelFlag
# var_swap: fIsCandidateSwapped
bitmap_sel:
var_name: fFlagMc
var_name: fFlagMcMatchRec
var_name_origgen: fOriginMcGen
var_name_origrec: fOriginMcRec
var_isstd: isstd
@@ -61,9 +61,10 @@ D0pp_jet:
index: fIndexD0CollBases
trees:
O2hfd0collbase: [fNumContrib, fIsEventReject, fRunNumber]

reco:
level: all
index: fIndexHfD0Bases
index: fIndexHfD0Bases_0
trees:
O2hfd0base: [fIndexHfD0CollBases, fPt, fEta, fPhi, fM]
O2hfd0mc: [fFlagMcMatchRec, fOriginMcRec]
@@ -82,7 +83,8 @@ D0pp_jet:
index: fIndexHfD0PBases_0
trees:
O2hfd0pbase: [fPt, fEta, fPhi, fFlagMcMatchGen, fOriginMcGen]
coll:

collgen:
level: gen
index: fIndexD0ChargedMCParticleLevelJetCollisionOutputs
trees:
@@ -98,13 +100,58 @@ D0pp_jet:
index: fIndexD0ChargedMCParticleLevelJetSubstructures
trees:
O2d0mcpsso: [fIndexD0ChargedMCParticleLevelJetOutputs, fZg, fRg, fNsd]
# analogously: O2d0o, O2d0mcdo, O2d0sso, O2d0mcdsso

colldet:
level: det
index: fIndexD0ChargedMCDetectorLevelJetCollisionOutputs
trees:
O2d0mcdco: [fPosZ, fCentrality, fEventSel]
jetdet:
level: det
index: fIndexD0ChargedMCDetectorLevelJetOutputs
trees:
O2d0mcdo: [fIndexD0ChargedMCDetectorLevelJetCollisionOutputs, fIndexHfD0Bases_0,
fJetPt, fJetPhi, fJetEta, fJetNConstituents]
jetsubdet:
level: det
index: fIndexD0ChargedMCDetectorLevelJetSubstructures
trees:
O2d0mcdsso: [fIndexD0ChargedMCDetectorLevelJetOutputs, fZg, fRg, fNsd]

colldata:
level: data
index: fIndexD0ChargedJetCollisionOutputs
trees:
O2d0mcdco: [fPosZ, fCentrality, fEventSel]
jetdata:
level: data
index: fIndexD0JetOutputs
trees:
O2d0mcdo: [fIndexD0ChargedJetCollisionOutputs, fIndexHfD0Bases_0,
fJetPt, fJetPhi, fJetEta, fJetNConstituents]
jetsubdata:
level: data
index: fIndexD0JetSubstructures
trees:
O2d0mcdsso: [fIndexD0ChargedJetOutputs, fZg, fRg, fNsd]

merge:
- {base: jetgen, ref: coll}
- {base: jetgen, ref: collgen}
- {base: jetgen, ref: gen}
- {base: jetsubgen, ref: jetgen}

- {base: jetdet, ref: colldet}
- {base: jetdet, ref: reco}
- {base: jetsubdet, ref: jetdet}

- {base: jetdata, ref: colldata}
- {base: jetdata, ref: reco}
- {base: jetsubdata, ref: jetdata}

# - {base: gen, ref: evtorig}

write:

variables:
var_all: [[fIndexHfD0CollBases, fPt, fEta, fPhi, fM], # O2hfd0base
[fFlagMcMatchRec, fOriginMcRec]] # O2hfd0mc
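The merge: entries above pair a base table with a ref table and join them on the ref table's declared index column. A minimal illustration (not part of this commit) of what one such entry amounts to in pandas, using made-up values; the real loop in processer.py below additionally joins on a per-dataframe 'df' key:

import pandas as pd

# Hypothetical stand-ins for two unpacked tables (jetdet and colldet).
jets = pd.DataFrame({'fIndexD0ChargedMCDetectorLevelJetCollisionOutputs': [0, 0, 1],
                     'fJetPt': [12.3, 8.1, 20.5]})
colls = pd.DataFrame({'fIndexD0ChargedMCDetectorLevelJetCollisionOutputs': [0, 1],
                      'fPosZ': [1.2, -3.4]})

# {base: jetdet, ref: colldet}: join the jet table with its collision table
# on the index column declared for the ref table in the YAML above.
merged = jets.merge(colls, on='fIndexD0ChargedMCDetectorLevelJetCollisionOutputs')
print(merged)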
78 changes: 40 additions & 38 deletions machine_learning_hep/processer.py
@@ -28,7 +28,7 @@
import numpy as np
from machine_learning_hep.selectionutils import selectfidacc
from machine_learning_hep.bitwise import tag_bit_df #, filter_bit_df
from machine_learning_hep.utilities import selectdfquery, merge_method, mask_df
from machine_learning_hep.utilities import dfquery, selectdfquery, merge_method, mask_df
from machine_learning_hep.utilities import list_folders, createlist, appendmainfoldertolist
from machine_learning_hep.utilities import create_folder_struc, seldf_singlevar, openfile
from machine_learning_hep.utilities import mergerootfiles, count_df_length_pkl
@@ -352,6 +352,11 @@ def dfmerge(dfl, dfr, **kwargs):
dfr.info()
raise e

def dfuse(df_spec):
return ((df_spec['level'] == 'all') or
(df_spec['level'] in ('mc', 'gen', 'det') and self.mcordata == 'mc') or
(df_spec['level'] in ('data',) and self.mcordata == 'data'))

self.logger.info('unpacking: %s', self.l_root[file_index])
dfs = {}
with uproot.open(self.l_root[file_index]) as rfile:
@@ -368,54 +373,51 @@ def dfmerge(dfl, dfr, **kwargs):
rdir = rfile[key]

for df_name, df_spec in self.df_spec.items():
# TODO: check level
dfappend(df_name, dfread(rdir, list(df_spec['trees'].keys()), list(df_spec['trees'].values()), idx_name=df_spec.get('index', None)))
if dfuse(df_spec):
dfappend(df_name, dfread(rdir, list(df_spec['trees'].keys()), list(df_spec['trees'].values()), idx_name=df_spec.get('index', None)))

self.logger.info('pickling: %s', self.l_root[file_index])
# dfevtorig = selectdfquery(dfevtorig, self.s_cen_unp)
# dfevtorig = dfevtorig.reset_index(drop=True) # TODO: is this safe for the new format?

# TODO: specifications for cleaning and pickling should go to DB
dfquery(dfs['evtorig'], self.s_cen_unp, inplace=True)
pickle.dump(dfs['evtorig'], openfile(self.l_evtorig[file_index], "wb"), protocol=4)

# dfevt = selectdfquery(dfevtorig, self.s_good_evt_unp)
# dfevt = dfevt.reset_index(drop=True) # TODO: is this safe for the new format?
# pickle.dump(dfevt, openfile(self.l_evt[file_index], "wb"), protocol=4)
dfs['evt'] = dfquery(dfs['evtorig'], self.s_good_evt_unp)
pickle.dump(dfs['evt'], openfile(self.l_evt[file_index], "wb"), protocol=4)

if self.df_merge:
for m_spec in self.df_merge:
self.logger.info('merging %s with %s based on %s', m_spec['base'], m_spec['ref'], self.df_spec[m_spec['ref']]['index'])
dfmerge(dfs[m_spec['base']], dfs[m_spec['ref']], on=self.df_spec[m_spec['ref']]['index'])

# if dfjetreco is not None:
# if dfjetsubreco is not None:
# dfjetreco = dfmerge(dfjetreco, dfjetsubreco, how='inner', on=self.v_jetsubmatch)
# dfreco = dfmerge(dfjetreco, dfreco, on=self.v_jetmatch)
base = m_spec['base']
ref = m_spec['ref']
if all([dfuse(self.df_spec[base]), dfuse(self.df_spec[ref])]):
var = self.df_spec[ref]['index']
self.logger.info('merging %s with %s based on %s', base, ref, var)
dfmerge(dfs[base], dfs[ref], on=['df', var])

# dfreco = selectdfquery(dfreco, self.s_reco_unp)

# dfreco = dfmerge(dfreco, dfevt, on=self.v_evtmatch)
dfquery(dfs['reco'], self.s_reco_unp, inplace=True)

# if self.s_apply_yptacccut is True:
# isselacc = selectfidacc(dfreco[self.v_var_binning].values,
# dfreco[self.v_rapy].values)
# dfreco = dfreco[np.array(isselacc, dtype=bool)]
# isselacc = selectfidacc(dfs['reco'][self.v_var_binning].values,
# dfs['reco'][self.v_rapy].values)
# dfs['reco'] = dfs['reco'][np.array(isselacc, dtype=bool)]

# # needs to be revisited for Run 3
# if self.mcordata == "mc":
# dfreco[self.v_ismcsignal] = np.array(tag_bit_df(dfreco, self.v_bitvar,
# self.b_mcsig), dtype=int)
# dfreco[self.v_ismcprompt] = np.array(tag_bit_df(dfreco, self.v_bitvar_origrec,
# self.b_mcsigprompt), dtype=int)
# dfreco[self.v_ismcfd] = np.array(tag_bit_df(dfreco, self.v_bitvar_origrec,
# self.b_mcsigfd), dtype=int)

# if self.v_swap:
# mydf = dfreco[self.v_candtype] == dfreco[self.v_swap] + 1
# dfreco[self.v_ismcsignal] = np.logical_and(dfreco[self.v_ismcsignal] == 1, mydf)
# dfreco[self.v_ismcprompt] = np.logical_and(dfreco[self.v_ismcprompt] == 1, mydf)
# dfreco[self.v_ismcfd] = np.logical_and(dfreco[self.v_ismcfd] == 1, mydf)

# dfreco[self.v_ismcbkg] = np.array(tag_bit_df(dfreco, self.v_bitvar,
# self.b_mcbkg), dtype=int)
# needs to be revisited for Run 3
if self.mcordata == "mc":
dfs['reco'][self.v_ismcsignal] = np.array(tag_bit_df(dfs['reco'], self.v_bitvar,
self.b_mcsig), dtype=int)
dfs['reco'][self.v_ismcprompt] = np.array(tag_bit_df(dfs['reco'], self.v_bitvar_origrec,
self.b_mcsigprompt), dtype=int)
dfs['reco'][self.v_ismcfd] = np.array(tag_bit_df(dfs['reco'], self.v_bitvar_origrec,
self.b_mcsigfd), dtype=int)

if self.v_swap:
mydf = dfs['reco'][self.v_candtype] == dfs['reco'][self.v_swap] + 1
dfs['reco'][self.v_ismcsignal] = np.logical_and(dfs['reco'][self.v_ismcsignal] == 1, mydf)
dfs['reco'][self.v_ismcprompt] = np.logical_and(dfs['reco'][self.v_ismcprompt] == 1, mydf)
dfs['reco'][self.v_ismcfd] = np.logical_and(dfs['reco'][self.v_ismcfd] == 1, mydf)

dfs['reco'][self.v_ismcbkg] = np.array(tag_bit_df(dfs['reco'], self.v_bitvar,
self.b_mcbkg), dtype=int)

pickle.dump(dfs['reco'], openfile(self.l_reco[file_index], "wb"), protocol=4)

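For reference, a standalone sketch (not the code in this commit) of the level-filtering rule that the new dfuse helper applies before tables are read and merged; the function and variable names here are illustrative, not the ones in processer.py:

def use_table(level, mcordata):
    # Read a table group if it applies to all productions, if it is an
    # MC-only level and we process MC, or if it is data-only and we process data.
    if level == 'all':
        return True
    if level in ('mc', 'gen', 'det'):
        return mcordata == 'mc'
    if level == 'data':
        return mcordata == 'data'
    return False

assert use_table('det', 'mc')
assert not use_table('gen', 'data')
assert use_table('all', 'data')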
