Multiclass: Adjust application and invariant mass fitting (#851)

* Adjust paths in database and analysis type in yml. All steps for warm-up.
* Rename output folder after preprocess merge
* Rename output folder after test-corr merge
* Rename output folder after all-plots merge
* Rename output directories after multiclass-config merge
* Improve multiclass cuts for mlapplication
* Enable apply and invmass steps
* Better part in processer
* Disable ml training, partially change output folders
* Fix inv mass plotting. Application working
* Modify hadd paths to be inside home directory
* Update hist names in fitting
* Enable fitting only
* Fix output folders
* Remove local adjustments for PR
* Update copyright notice
* Remove redundant line
* Revert "Modify hadd paths to be inside home directory"
  This reverts commit 39c85de.
* Properly use temp dirs with tempfile
* Proper solution to pd.query()
alisw · Jan 29, 2024 · b91a205 · b91a205
1 parent 48b531b
commit b91a205
Show file tree

Hide file tree

Showing 8 changed files with 57 additions and 56 deletions.
diff --git a/machine_learning_hep/analysis/utils.py b/machine_learning_hep/analysis/utils.py
@@ -1,5 +1,5 @@
 #############################################################################
-##  © Copyright CERN 2018. All rights not expressly granted are reserved.  ##
+##  © Copyright CERN 2023. All rights not expressly granted are reserved.  ##
 ##                 Author: [email protected]                  ##
 ## This program is free software: you can redistribute it and/or modify it ##
 ##  under the terms of the GNU General Public License as published by the  ##
@@ -13,11 +13,12 @@
 #############################################################################
 
 from os.path import join
+import tempfile
 
-from machine_learning_hep.utilities import mergerootfiles, get_timestamp_string
+from machine_learning_hep.utilities import mergerootfiles
 from machine_learning_hep.logger import get_logger
 
-def multi_preparenorm(database, case, typean, doperiodbyperiod):
+def multi_preparenorm(database, typean, doperiodbyperiod):
 
     logger = get_logger()
 
@@ -34,14 +35,14 @@ def multi_preparenorm(database, case, typean, doperiodbyperiod):
                         "correctionsweights.root")
 
     listempty = []
-    tmp_merged = f"/data/tmp/hadd/{case}_{typean}/norm_analyzer/{get_timestamp_string()}/"
     useperiod = database["analysis"][typean]["useperiod"]
 
-    for indexp in range(len(resultsdata)):
-        logger.info("Origin path: %s, target path: %s", lper_normfilesorig[indexp],
-                    lper_normfiles[indexp])
-        mergerootfiles([lper_normfilesorig[indexp]], lper_normfiles[indexp], tmp_merged)
-        if doperiodbyperiod and useperiod[indexp]:
-            listempty.append(lper_normfiles[indexp])
+    with tempfile.TemporaryDirectory() as tmp_merged_dir:
+        for indexp in range(len(resultsdata)):
+            logger.info("Origin path: %s, target path: %s", lper_normfilesorig[indexp],
+                        lper_normfiles[indexp])
+            mergerootfiles([lper_normfilesorig[indexp]], lper_normfiles[indexp], tmp_merged_dir)
+            if doperiodbyperiod and useperiod[indexp]:
+                listempty.append(lper_normfiles[indexp])
 
-    mergerootfiles(listempty, f_normmerged, tmp_merged)
+        mergerootfiles(listempty, f_normmerged, tmp_merged_dir)
diff --git a/machine_learning_hep/data/data_run3/database_ml_parameters_LcToPKPi_multiclass.yml b/machine_learning_hep/data/data_run3/database_ml_parameters_LcToPKPi_multiclass.yml
@@ -293,9 +293,9 @@ LcpKpi:
                      xgboost_classifierLcpKpi_dfselection_fPt_8.0_12.0.sav,
                      xgboost_classifierLcpKpi_dfselection_fPt_12.0_24.0.sav]
     probcutpresel:
-      data: [[0.3, 0.0], [0.3, 0.0], [0.3, 0.0], [0.3, 0.0], [0.3, 0.0], [0.3, 0.0]] #list of nbins
-      mc: [[0.3, 0.0], [0.3, 0.0], [0.3, 0.0], [0.3, 0.0], [0.3, 0.0], [0.3, 0.0]] #list of nbins
-    probcutoptimal: [[0.5, 0.1], [0.5, 0.1], [0.5, 0.1], [0.5, 0.1], [0.5, 0.1], [0.5, 0.1]] #list of nbins
+      data: [[0.3, 0.0, 0.0], [0.3, 0.0, 0.0], [0.3, 0.0, 0.0], [0.3, 0.0, 0.0], [0.3, 0.0, 0.0], [0.3, 0.0, 0.0]] #list of nbins
+      mc: [[0.3, 0.0, 0.0], [0.3, 0.0, 0.0], [0.3, 0.0, 0.0], [0.3, 0.0, 0.0], [0.3, 0.0, 0.0], [0.3, 0.0, 0.0]] #list of nbins
+    probcutoptimal: [[0.5, 0.1, 0.0], [0.5, 0.1, 0.0], [0.5, 0.1, 0.0], [0.5, 0.1, 0.0], [0.5, 0.1, 0.0], [0.5, 0.1, 0.0]] #list of nbins
 
   analysis:
     indexhptspectrum: -1 #kD0Kpi=0, kDplusKpipi=1, kDstarD0pi=2, kDsKKpi=3, kLctopKpi=4, kLcK0Sp=5

diff --git a/machine_learning_hep/fitting/helpers.py b/machine_learning_hep/fitting/helpers.py
@@ -289,22 +289,23 @@ def make_suffix(self, ibin1, ibin2):
         """
         if self.bin2_name is not None:
             if self.mltype == "MultiClassification":
-                return "%s%d_%d_%.2f%.2f%s_%.2f_%.2f" % \
+                return "%s%d_%d_%.2f%.2f%.2f%s_%.2f_%.2f" % \
                        (self.bin1_name, self.bins1_edges_low[ibin1],
                         self.bins1_edges_up[ibin1], self.prob_cut_fin[ibin1][0],
-                        self.prob_cut_fin[ibin1][1], self.bin2_name,
-                        self.bins2_edges_low[ibin2], self.bins2_edges_up[ibin2])
+                        self.prob_cut_fin[ibin1][1], self.prob_cut_fin[ibin1][2],
+                        self.bin2_name, self.bins2_edges_low[ibin2],
+                        self.bins2_edges_up[ibin2])
             return "%s%d_%d_%.2f%s_%.2f_%.2f" % \
                    (self.bin1_name, self.bins1_edges_low[ibin1],
                     self.bins1_edges_up[ibin1], self.prob_cut_fin[ibin1],
                     self.bin2_name, self.bins2_edges_low[ibin2],
                     self.bins2_edges_up[ibin2])
 
         if self.mltype == "MultiClassification":
-            return "%s%d_%d_%.2f%.2f" % \
+            return "%s%d_%d_%.2f%.2f%.2f" % \
                    (self.bin1_name, self.bins1_edges_low[ibin1],
                     self.bins1_edges_up[ibin1], self.prob_cut_fin[ibin1][0],
-                    self.prob_cut_fin[ibin1][1])
+                    self.prob_cut_fin[ibin1][1], self.prob_cut_fin[ibin1][2])
         return "%s%d_%d_%.2f" % \
                (self.bin1_name, self.bins1_edges_low[ibin1],
                 self.bins1_edges_up[ibin1], self.prob_cut_fin[ibin1])

diff --git a/machine_learning_hep/models.py b/machine_learning_hep/models.py
@@ -158,8 +158,10 @@ def apply(ml_type, names_, trainedmodels_, test_set_, mylistvariables_, labels_=
             test_set_[f"y_test_prob{name}"] = pd.Series(y_test_prob[:, 1], index=test_set_.index)
         elif ml_type == "MultiClassification" and labels_ is not None:
             for pred, lab in enumerate(labels_):
-                test_set_[f"y_test_prob{name}{lab}"] = pd.Series(y_test_prob[:, pred],
-                                                                 index=test_set_.index)
+                # pandas query() used in further analysis cannot accept '-' in column names
+                safe_lab = lab.replace('-', '_')
+                test_set_[f"y_test_prob{name}{safe_lab}"] = pd.Series(y_test_prob[:, pred],
+                                                                      index=test_set_.index)
         else:
             logger.fatal("Incorrect settings for chosen mltype")
     return test_set_

diff --git a/machine_learning_hep/multiprocesser.py b/machine_learning_hep/multiprocesser.py
@@ -16,7 +16,8 @@
 main script for doing data processing, machine learning and analysis
 """
 import os
-from machine_learning_hep.utilities import merge_method, mergerootfiles, get_timestamp_string
+import tempfile
+from machine_learning_hep.utilities import merge_method, mergerootfiles
 from machine_learning_hep.io import parse_yaml, dump_yaml_from_dict
 from machine_learning_hep.logger import get_logger
 
@@ -203,17 +204,16 @@ def multi_histomass(self):
         for indexp, _ in enumerate(self.process_listsample):
             if self.p_useperiod[indexp] == 1:
                 self.process_listsample[indexp].process_histomass()
-        tmp_merged = f"/data/tmp/hadd/{self.case}_{self.typean}/mass/{get_timestamp_string()}/"
         self.logger.debug('merging all')
-        mergerootfiles(self.lper_filemass, self.filemass_mergedall, tmp_merged)
+        with tempfile.TemporaryDirectory() as tmp_merged_dir:
+            mergerootfiles(self.lper_filemass, self.filemass_mergedall, tmp_merged_dir)
 
     def multi_efficiency(self):
         for indexp, _ in enumerate(self.process_listsample):
             if self.p_useperiod[indexp] == 1:
                 self.process_listsample[indexp].process_efficiency()
-        tmp_merged = \
-                f"/data/tmp/hadd/{self.case}_{self.typean}/efficiency/{get_timestamp_string()}/"
-        mergerootfiles(self.lper_fileeff, self.fileeff_mergedall, tmp_merged)
+        with tempfile.TemporaryDirectory() as tmp_merged_dir:
+            mergerootfiles(self.lper_fileeff, self.fileeff_mergedall, tmp_merged_dir)
 
     def multi_response(self):
         resp_exists = False
@@ -223,9 +223,8 @@ def multi_response(self):
                     resp_exists = True
                     self.process_listsample[indexp].process_response()
         if resp_exists:
-            tmp_merged = \
-                    f"/data/tmp/hadd/{self.case}_{self.typean}/response/{get_timestamp_string()}/"
-            mergerootfiles(self.lper_fileresp, self.fileresp_mergedall, tmp_merged)
+            with tempfile.TemporaryDirectory() as tmp_merged_dir:
+                mergerootfiles(self.lper_fileresp, self.fileresp_mergedall, tmp_merged_dir)
 
     def multi_scancuts(self):
         for indexp, _ in enumerate(self.process_listsample):

diff --git a/machine_learning_hep/processer.py b/machine_learning_hep/processer.py
@@ -23,6 +23,7 @@
 import glob
 import random as rd
 import re
+import tempfile
 import uproot
 import pandas as pd
 import numpy as np
@@ -32,7 +33,6 @@
 from machine_learning_hep.utilities import list_folders, createlist, appendmainfoldertolist
 from machine_learning_hep.utilities import create_folder_struc, seldf_singlevar, openfile
 from machine_learning_hep.utilities import mergerootfiles, count_df_length_pkl
-from machine_learning_hep.utilities import get_timestamp_string
 from machine_learning_hep.io import dump_yaml_from_dict
 from machine_learning_hep.logger import get_logger
 pd.options.mode.chained_assignment = None
@@ -236,16 +236,16 @@ def __init__(self, case, datap, run_param, mcordata, p_maxfiles, # pylint: disab
 
         if self.mltype == "MultiClassification":
             self.l_selml = []
+            comps = ["<=", ">=", ">="]
             for ipt in range(self.p_nptfinbins):
-                mlsel_multi0 = "y_test_prob" + self.p_modelname + self.class_labels[0] + \
-                               " <= " + str(self.lpt_probcutfin[ipt][0])
-                mlsel_multi1 = "y_test_prob" + self.p_modelname + self.class_labels[1] + \
-                               " >= " + str(self.lpt_probcutfin[ipt][1])
-                mlsel_multi = mlsel_multi0 + " and " + mlsel_multi1
-                self.l_selml.append(mlsel_multi)
+                mlsel_multi = [f'y_test_prob{self.p_modelname}{label.replace("-", "_")} ' \
+                               f'{comp} {probcut}'
+                               for label, comp, probcut in zip(self.class_labels, comps,
+                                                               self.lpt_probcutfin[ipt])]
+                self.l_selml.append(" and ".join(mlsel_multi))
 
         else:
-            self.l_selml = [f"y_test_prob {self.p_modelname} > {self.lpt_probcutfin[ipt]}" \
+            self.l_selml = [f"y_test_prob{self.p_modelname} > {self.lpt_probcutfin[ipt]}" \
                            for ipt in range(self.p_nptfinbins)]
 
         self.d_pkl_dec = d_pkl_dec
@@ -271,9 +271,10 @@ def __init__(self, case, datap, run_param, mcordata, p_maxfiles, # pylint: disab
         self.lpt_recodec = None
         if self.doml is True:
             if self.mltype == "MultiClassification":
-                self.lpt_recodec = [self.n_reco.replace(".pkl", "%d_%d_%.2f%.2f.pkl" % \
-                                   (self.lpt_anbinmin[i], self.lpt_anbinmax[i], \
-                                    self.lpt_probcutpre[i][0], self.lpt_probcutpre[i][1])) \
+                self.lpt_recodec = [self.n_reco.replace(".pkl", "%d_%d_%.2f%.2f%.2f.pkl" % \
+                                   (self.lpt_anbinmin[i], self.lpt_anbinmax[i],
+                                    self.lpt_probcutpre[i][0], self.lpt_probcutpre[i][1],
+                                    self.lpt_probcutpre[i][2])) \
                                     for i in range(self.p_nptbins)]
             else:
                 self.lpt_recodec = [self.n_reco.replace(".pkl", "%d_%d_%.2f.pkl" % \
@@ -516,14 +517,15 @@ def applymodel(self, file_index):
                 if self.mltype == "MultiClassification":
                     dfrecoskml = apply(self.mltype, [self.p_modelname], [mod],
                                        dfrecosk, self.v_train[ipt], self.class_labels)
-                    prob0 = f"y_test_prob{self.p_modelname}{self.class_labels[0]}"
-                    prob1 = f"y_test_prob{self.p_modelname}{self.class_labels[1]}"
-                    dfrecoskml = dfrecoskml.loc[(dfrecoskml[prob0] <= self.lpt_probcutpre[ipt][0]) &
-                                                (dfrecoskml[prob1] >= self.lpt_probcutpre[ipt][1])]
+                    probs = [f'y_test_prob{self.p_modelname}{label.replace("-", "_")}' \
+                             for label in self.class_labels]
+                    dfrecoskml = dfrecoskml[(dfrecoskml[probs[0]] <= self.lpt_probcutpre[ipt][0]) &
+                                            (dfrecoskml[probs[1]] >= self.lpt_probcutpre[ipt][1]) &
+                                            (dfrecoskml[probs[2]] >= self.lpt_probcutpre[ipt][2])]
                 else:
                     dfrecoskml = apply("BinaryClassification", [self.p_modelname], [mod],
                                        dfrecosk, self.v_train[ipt])
-                    probvar = "y_test_prob" + self.p_modelname
+                    probvar = f"y_test_prob{self.p_modelname}"
                     dfrecoskml = dfrecoskml.loc[dfrecoskml[probvar] > self.lpt_probcutpre[ipt]]
             else:
                 dfrecoskml = dfrecosk.query("isstd == 1")
@@ -654,9 +656,8 @@ def process_histomass(self):
         create_folder_struc(self.d_results, self.l_path)
         arguments = [(i,) for i in range(len(self.l_root))]
         self.parallelizer(self.process_histomass_single, arguments, self.p_chunksizeunp) # pylint: disable=no-member
-        tmp_merged = \
-            f"/tmp/hadd/{self.case}_{self.typean}/mass_{self.period}/{get_timestamp_string()}/"
-        mergerootfiles(self.l_histomass, self.n_filemass, tmp_merged)
+        with tempfile.TemporaryDirectory() as tmp_merged_dir:
+            mergerootfiles(self.l_histomass, self.n_filemass, tmp_merged_dir)
 
     def process_efficiency(self):
         print("Doing efficiencies", self.mcordata, self.period)
@@ -672,5 +673,5 @@ def process_efficiency(self):
         create_folder_struc(self.d_results, self.l_path)
         arguments = [(i,) for i in range(len(self.l_root))]
         self.parallelizer(self.process_efficiency_single, arguments, self.p_chunksizeunp) # pylint: disable=no-member
-        tmp_merged = f"/tmp/hadd/{self.case}_{self.typean}/histoeff_{self.period}/{get_timestamp_string()}/" # pylint: disable=line-too-long
-        mergerootfiles(self.l_histoeff, self.n_fileeff, tmp_merged)
+        with tempfile.TemporaryDirectory() as tmp_merged_dir:
+            mergerootfiles(self.l_histoeff, self.n_fileeff, tmp_merged_dir)
diff --git a/machine_learning_hep/processerdhadrons.py b/machine_learning_hep/processerdhadrons.py
@@ -121,10 +121,10 @@ def process_histomass_single(self, index):
                 df = self.apply_cuts_ptbin(df, ipt)
 
             if self.mltype == "MultiClassification":
-                suffix = "%s%d_%d_%.2f%.2f" % \
+                suffix = "%s%d_%d_%.2f%.2f%.2f" % \
                          (self.v_var_binning, self.lpt_finbinmin[ipt],
                           self.lpt_finbinmax[ipt], self.lpt_probcutfin[ipt][0],
-                          self.lpt_probcutfin[ipt][1])
+                          self.lpt_probcutfin[ipt][1], self.lpt_probcutfin[ipt][2])
             else:
                 suffix = "%s%d_%d_%.2f" % \
                          (self.v_var_binning, self.lpt_finbinmin[ipt],

diff --git a/machine_learning_hep/utilities.py b/machine_learning_hep/utilities.py
@@ -327,9 +327,6 @@ def divide_chunks(list_to_split, chunk_size):
 
     tmp_files = []
     if len(listfiles) > 500:
-        if not os.path.exists(tmp_dir):
-            os.makedirs(tmp_dir)
-
         for i, split_list in enumerate(divide_chunks(listfiles, 500)):
             tmp_files.append(os.path.join(tmp_dir, f"hadd_tmp_merged{i}.root"))
             outstring = " ".join(split_list)