Skip to content

Commit

Permalink
Multiclass: Adjust application and invariant mass fitting (#851)
Browse files Browse the repository at this point in the history
* Adjust paths in database and analysis type in yml. All steps for warm-up.

* Rename output folder after preprocess merge

* Rename output folder after test-corr merge

* Rename output folder after all-plots merge

* Rename output directories after multiclass-config merge

* Improve multiclass cuts for mlapplication

* Enable apply and invmass steps

* Better part in processer

* Disable ml training, partially change output folders

* Fix inv mass plotting. Application working

* Modify hadd paths to be inside home directory

* Update hist names in fitting

* Enable fitting only

* Fix output folders

* Remove local adjustments for PR

* Update copyright notice

* Remove redundant line

* Revert "Modify hadd paths to be inside home directory"

This reverts commit 39c85de.

* Properly use temporary directories for merging via tempfile.TemporaryDirectory

* Proper solution for pandas DataFrame.query(): replace '-' with '_' in class-label column names
  • Loading branch information
saganatt authored Jan 29, 2024
1 parent 48b531b commit b91a205
Show file tree
Hide file tree
Showing 8 changed files with 57 additions and 56 deletions.
23 changes: 12 additions & 11 deletions machine_learning_hep/analysis/utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#############################################################################
## © Copyright CERN 2018. All rights not expressly granted are reserved. ##
## © Copyright CERN 2023. All rights not expressly granted are reserved. ##
## Author: [email protected] ##
## This program is free software: you can redistribute it and/or modify it ##
## under the terms of the GNU General Public License as published by the ##
Expand All @@ -13,11 +13,12 @@
#############################################################################

from os.path import join
import tempfile

from machine_learning_hep.utilities import mergerootfiles, get_timestamp_string
from machine_learning_hep.utilities import mergerootfiles
from machine_learning_hep.logger import get_logger

def multi_preparenorm(database, case, typean, doperiodbyperiod):
def multi_preparenorm(database, typean, doperiodbyperiod):

logger = get_logger()

Expand All @@ -34,14 +35,14 @@ def multi_preparenorm(database, case, typean, doperiodbyperiod):
"correctionsweights.root")

listempty = []
tmp_merged = f"/data/tmp/hadd/{case}_{typean}/norm_analyzer/{get_timestamp_string()}/"
useperiod = database["analysis"][typean]["useperiod"]

for indexp in range(len(resultsdata)):
logger.info("Origin path: %s, target path: %s", lper_normfilesorig[indexp],
lper_normfiles[indexp])
mergerootfiles([lper_normfilesorig[indexp]], lper_normfiles[indexp], tmp_merged)
if doperiodbyperiod and useperiod[indexp]:
listempty.append(lper_normfiles[indexp])
with tempfile.TemporaryDirectory() as tmp_merged_dir:
for indexp in range(len(resultsdata)):
logger.info("Origin path: %s, target path: %s", lper_normfilesorig[indexp],
lper_normfiles[indexp])
mergerootfiles([lper_normfilesorig[indexp]], lper_normfiles[indexp], tmp_merged_dir)
if doperiodbyperiod and useperiod[indexp]:
listempty.append(lper_normfiles[indexp])

mergerootfiles(listempty, f_normmerged, tmp_merged)
mergerootfiles(listempty, f_normmerged, tmp_merged_dir)
Original file line number Diff line number Diff line change
Expand Up @@ -293,9 +293,9 @@ LcpKpi:
xgboost_classifierLcpKpi_dfselection_fPt_8.0_12.0.sav,
xgboost_classifierLcpKpi_dfselection_fPt_12.0_24.0.sav]
probcutpresel:
data: [[0.3, 0.0], [0.3, 0.0], [0.3, 0.0], [0.3, 0.0], [0.3, 0.0], [0.3, 0.0]] #list of nbins
mc: [[0.3, 0.0], [0.3, 0.0], [0.3, 0.0], [0.3, 0.0], [0.3, 0.0], [0.3, 0.0]] #list of nbins
probcutoptimal: [[0.5, 0.1], [0.5, 0.1], [0.5, 0.1], [0.5, 0.1], [0.5, 0.1], [0.5, 0.1]] #list of nbins
data: [[0.3, 0.0, 0.0], [0.3, 0.0, 0.0], [0.3, 0.0, 0.0], [0.3, 0.0, 0.0], [0.3, 0.0, 0.0], [0.3, 0.0, 0.0]] #list of nbins
mc: [[0.3, 0.0, 0.0], [0.3, 0.0, 0.0], [0.3, 0.0, 0.0], [0.3, 0.0, 0.0], [0.3, 0.0, 0.0], [0.3, 0.0, 0.0]] #list of nbins
probcutoptimal: [[0.5, 0.1, 0.0], [0.5, 0.1, 0.0], [0.5, 0.1, 0.0], [0.5, 0.1, 0.0], [0.5, 0.1, 0.0], [0.5, 0.1, 0.0]] #list of nbins

analysis:
indexhptspectrum: -1 #kD0Kpi=0, kDplusKpipi=1, kDstarD0pi=2, kDsKKpi=3, kLctopKpi=4, kLcK0Sp=5
Expand Down
11 changes: 6 additions & 5 deletions machine_learning_hep/fitting/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,22 +289,23 @@ def make_suffix(self, ibin1, ibin2):
"""
if self.bin2_name is not None:
if self.mltype == "MultiClassification":
return "%s%d_%d_%.2f%.2f%s_%.2f_%.2f" % \
return "%s%d_%d_%.2f%.2f%.2f%s_%.2f_%.2f" % \
(self.bin1_name, self.bins1_edges_low[ibin1],
self.bins1_edges_up[ibin1], self.prob_cut_fin[ibin1][0],
self.prob_cut_fin[ibin1][1], self.bin2_name,
self.bins2_edges_low[ibin2], self.bins2_edges_up[ibin2])
self.prob_cut_fin[ibin1][1], self.prob_cut_fin[ibin1][2],
self.bin2_name, self.bins2_edges_low[ibin2],
self.bins2_edges_up[ibin2])
return "%s%d_%d_%.2f%s_%.2f_%.2f" % \
(self.bin1_name, self.bins1_edges_low[ibin1],
self.bins1_edges_up[ibin1], self.prob_cut_fin[ibin1],
self.bin2_name, self.bins2_edges_low[ibin2],
self.bins2_edges_up[ibin2])

if self.mltype == "MultiClassification":
return "%s%d_%d_%.2f%.2f" % \
return "%s%d_%d_%.2f%.2f%.2f" % \
(self.bin1_name, self.bins1_edges_low[ibin1],
self.bins1_edges_up[ibin1], self.prob_cut_fin[ibin1][0],
self.prob_cut_fin[ibin1][1])
self.prob_cut_fin[ibin1][1], self.prob_cut_fin[ibin1][2])
return "%s%d_%d_%.2f" % \
(self.bin1_name, self.bins1_edges_low[ibin1],
self.bins1_edges_up[ibin1], self.prob_cut_fin[ibin1])
Expand Down
6 changes: 4 additions & 2 deletions machine_learning_hep/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,8 +158,10 @@ def apply(ml_type, names_, trainedmodels_, test_set_, mylistvariables_, labels_=
test_set_[f"y_test_prob{name}"] = pd.Series(y_test_prob[:, 1], index=test_set_.index)
elif ml_type == "MultiClassification" and labels_ is not None:
for pred, lab in enumerate(labels_):
test_set_[f"y_test_prob{name}{lab}"] = pd.Series(y_test_prob[:, pred],
index=test_set_.index)
# pandas query() used in further analysis cannot accept '-' in column names
safe_lab = lab.replace('-', '_')
test_set_[f"y_test_prob{name}{safe_lab}"] = pd.Series(y_test_prob[:, pred],
index=test_set_.index)
else:
logger.fatal("Incorrect settings for chosen mltype")
return test_set_
Expand Down
17 changes: 8 additions & 9 deletions machine_learning_hep/multiprocesser.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@
main script for doing data processing, machine learning and analysis
"""
import os
from machine_learning_hep.utilities import merge_method, mergerootfiles, get_timestamp_string
import tempfile
from machine_learning_hep.utilities import merge_method, mergerootfiles
from machine_learning_hep.io import parse_yaml, dump_yaml_from_dict
from machine_learning_hep.logger import get_logger

Expand Down Expand Up @@ -203,17 +204,16 @@ def multi_histomass(self):
for indexp, _ in enumerate(self.process_listsample):
if self.p_useperiod[indexp] == 1:
self.process_listsample[indexp].process_histomass()
tmp_merged = f"/data/tmp/hadd/{self.case}_{self.typean}/mass/{get_timestamp_string()}/"
self.logger.debug('merging all')
mergerootfiles(self.lper_filemass, self.filemass_mergedall, tmp_merged)
with tempfile.TemporaryDirectory() as tmp_merged_dir:
mergerootfiles(self.lper_filemass, self.filemass_mergedall, tmp_merged_dir)

def multi_efficiency(self):
for indexp, _ in enumerate(self.process_listsample):
if self.p_useperiod[indexp] == 1:
self.process_listsample[indexp].process_efficiency()
tmp_merged = \
f"/data/tmp/hadd/{self.case}_{self.typean}/efficiency/{get_timestamp_string()}/"
mergerootfiles(self.lper_fileeff, self.fileeff_mergedall, tmp_merged)
with tempfile.TemporaryDirectory() as tmp_merged_dir:
mergerootfiles(self.lper_fileeff, self.fileeff_mergedall, tmp_merged_dir)

def multi_response(self):
resp_exists = False
Expand All @@ -223,9 +223,8 @@ def multi_response(self):
resp_exists = True
self.process_listsample[indexp].process_response()
if resp_exists:
tmp_merged = \
f"/data/tmp/hadd/{self.case}_{self.typean}/response/{get_timestamp_string()}/"
mergerootfiles(self.lper_fileresp, self.fileresp_mergedall, tmp_merged)
with tempfile.TemporaryDirectory() as tmp_merged_dir:
mergerootfiles(self.lper_fileresp, self.fileresp_mergedall, tmp_merged_dir)

def multi_scancuts(self):
for indexp, _ in enumerate(self.process_listsample):
Expand Down
43 changes: 22 additions & 21 deletions machine_learning_hep/processer.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
import glob
import random as rd
import re
import tempfile
import uproot
import pandas as pd
import numpy as np
Expand All @@ -32,7 +33,6 @@
from machine_learning_hep.utilities import list_folders, createlist, appendmainfoldertolist
from machine_learning_hep.utilities import create_folder_struc, seldf_singlevar, openfile
from machine_learning_hep.utilities import mergerootfiles, count_df_length_pkl
from machine_learning_hep.utilities import get_timestamp_string
from machine_learning_hep.io import dump_yaml_from_dict
from machine_learning_hep.logger import get_logger
pd.options.mode.chained_assignment = None
Expand Down Expand Up @@ -236,16 +236,16 @@ def __init__(self, case, datap, run_param, mcordata, p_maxfiles, # pylint: disab

if self.mltype == "MultiClassification":
self.l_selml = []
comps = ["<=", ">=", ">="]
for ipt in range(self.p_nptfinbins):
mlsel_multi0 = "y_test_prob" + self.p_modelname + self.class_labels[0] + \
" <= " + str(self.lpt_probcutfin[ipt][0])
mlsel_multi1 = "y_test_prob" + self.p_modelname + self.class_labels[1] + \
" >= " + str(self.lpt_probcutfin[ipt][1])
mlsel_multi = mlsel_multi0 + " and " + mlsel_multi1
self.l_selml.append(mlsel_multi)
mlsel_multi = [f'y_test_prob{self.p_modelname}{label.replace("-", "_")} ' \
f'{comp} {probcut}'
for label, comp, probcut in zip(self.class_labels, comps,
self.lpt_probcutfin[ipt])]
self.l_selml.append(" and ".join(mlsel_multi))

else:
self.l_selml = [f"y_test_prob {self.p_modelname} > {self.lpt_probcutfin[ipt]}" \
self.l_selml = [f"y_test_prob{self.p_modelname} > {self.lpt_probcutfin[ipt]}" \
for ipt in range(self.p_nptfinbins)]

self.d_pkl_dec = d_pkl_dec
Expand All @@ -271,9 +271,10 @@ def __init__(self, case, datap, run_param, mcordata, p_maxfiles, # pylint: disab
self.lpt_recodec = None
if self.doml is True:
if self.mltype == "MultiClassification":
self.lpt_recodec = [self.n_reco.replace(".pkl", "%d_%d_%.2f%.2f.pkl" % \
(self.lpt_anbinmin[i], self.lpt_anbinmax[i], \
self.lpt_probcutpre[i][0], self.lpt_probcutpre[i][1])) \
self.lpt_recodec = [self.n_reco.replace(".pkl", "%d_%d_%.2f%.2f%.2f.pkl" % \
(self.lpt_anbinmin[i], self.lpt_anbinmax[i],
self.lpt_probcutpre[i][0], self.lpt_probcutpre[i][1],
self.lpt_probcutpre[i][2])) \
for i in range(self.p_nptbins)]
else:
self.lpt_recodec = [self.n_reco.replace(".pkl", "%d_%d_%.2f.pkl" % \
Expand Down Expand Up @@ -516,14 +517,15 @@ def applymodel(self, file_index):
if self.mltype == "MultiClassification":
dfrecoskml = apply(self.mltype, [self.p_modelname], [mod],
dfrecosk, self.v_train[ipt], self.class_labels)
prob0 = f"y_test_prob{self.p_modelname}{self.class_labels[0]}"
prob1 = f"y_test_prob{self.p_modelname}{self.class_labels[1]}"
dfrecoskml = dfrecoskml.loc[(dfrecoskml[prob0] <= self.lpt_probcutpre[ipt][0]) &
(dfrecoskml[prob1] >= self.lpt_probcutpre[ipt][1])]
probs = [f'y_test_prob{self.p_modelname}{label.replace("-", "_")}' \
for label in self.class_labels]
dfrecoskml = dfrecoskml[(dfrecoskml[probs[0]] <= self.lpt_probcutpre[ipt][0]) &
(dfrecoskml[probs[1]] >= self.lpt_probcutpre[ipt][1]) &
(dfrecoskml[probs[2]] >= self.lpt_probcutpre[ipt][2])]
else:
dfrecoskml = apply("BinaryClassification", [self.p_modelname], [mod],
dfrecosk, self.v_train[ipt])
probvar = "y_test_prob" + self.p_modelname
probvar = f"y_test_prob{self.p_modelname}"
dfrecoskml = dfrecoskml.loc[dfrecoskml[probvar] > self.lpt_probcutpre[ipt]]
else:
dfrecoskml = dfrecosk.query("isstd == 1")
Expand Down Expand Up @@ -654,9 +656,8 @@ def process_histomass(self):
create_folder_struc(self.d_results, self.l_path)
arguments = [(i,) for i in range(len(self.l_root))]
self.parallelizer(self.process_histomass_single, arguments, self.p_chunksizeunp) # pylint: disable=no-member
tmp_merged = \
f"/tmp/hadd/{self.case}_{self.typean}/mass_{self.period}/{get_timestamp_string()}/"
mergerootfiles(self.l_histomass, self.n_filemass, tmp_merged)
with tempfile.TemporaryDirectory() as tmp_merged_dir:
mergerootfiles(self.l_histomass, self.n_filemass, tmp_merged_dir)

def process_efficiency(self):
print("Doing efficiencies", self.mcordata, self.period)
Expand All @@ -672,5 +673,5 @@ def process_efficiency(self):
create_folder_struc(self.d_results, self.l_path)
arguments = [(i,) for i in range(len(self.l_root))]
self.parallelizer(self.process_efficiency_single, arguments, self.p_chunksizeunp) # pylint: disable=no-member
tmp_merged = f"/tmp/hadd/{self.case}_{self.typean}/histoeff_{self.period}/{get_timestamp_string()}/" # pylint: disable=line-too-long
mergerootfiles(self.l_histoeff, self.n_fileeff, tmp_merged)
with tempfile.TemporaryDirectory() as tmp_merged_dir:
mergerootfiles(self.l_histoeff, self.n_fileeff, tmp_merged_dir)
4 changes: 2 additions & 2 deletions machine_learning_hep/processerdhadrons.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,10 +121,10 @@ def process_histomass_single(self, index):
df = self.apply_cuts_ptbin(df, ipt)

if self.mltype == "MultiClassification":
suffix = "%s%d_%d_%.2f%.2f" % \
suffix = "%s%d_%d_%.2f%.2f%.2f" % \
(self.v_var_binning, self.lpt_finbinmin[ipt],
self.lpt_finbinmax[ipt], self.lpt_probcutfin[ipt][0],
self.lpt_probcutfin[ipt][1])
self.lpt_probcutfin[ipt][1], self.lpt_probcutfin[ipt][2])
else:
suffix = "%s%d_%d_%.2f" % \
(self.v_var_binning, self.lpt_finbinmin[ipt],
Expand Down
3 changes: 0 additions & 3 deletions machine_learning_hep/utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -327,9 +327,6 @@ def divide_chunks(list_to_split, chunk_size):

tmp_files = []
if len(listfiles) > 500:
if not os.path.exists(tmp_dir):
os.makedirs(tmp_dir)

for i, split_list in enumerate(divide_chunks(listfiles, 500)):
tmp_files.append(os.path.join(tmp_dir, f"hadd_tmp_merged{i}.root"))
outstring = " ".join(split_list)
Expand Down

0 comments on commit b91a205

Please sign in to comment.