From ae5c039b5d0ba46dad983e4d9e3e08756c56eb16 Mon Sep 17 00:00:00 2001
From: Maja Kabus
Date: Wed, 24 Jan 2024 14:39:06 +0100
Subject: [PATCH] Properly use temp dirs with tempfile

---
 machine_learning_hep/analysis/utils.py | 21 +++++++++++----------
 machine_learning_hep/multiprocesser.py | 17 ++++++++---------
 machine_learning_hep/processer.py      | 11 +++++------
 machine_learning_hep/utilities.py      |  3 ---
 4 files changed, 24 insertions(+), 28 deletions(-)

diff --git a/machine_learning_hep/analysis/utils.py b/machine_learning_hep/analysis/utils.py
index 67f0c92cc0..6676c96d7d 100644
--- a/machine_learning_hep/analysis/utils.py
+++ b/machine_learning_hep/analysis/utils.py
@@ -13,11 +13,12 @@
 #############################################################################
 
 from os.path import join
+import tempfile
 
-from machine_learning_hep.utilities import mergerootfiles, get_timestamp_string
+from machine_learning_hep.utilities import mergerootfiles
 from machine_learning_hep.logger import get_logger
 
-def multi_preparenorm(database, case, typean, doperiodbyperiod):
+def multi_preparenorm(database, typean, doperiodbyperiod):
 
     logger = get_logger()
 
@@ -34,14 +35,14 @@ def multi_preparenorm(database, case, typean, doperiodbyperiod):
                             "correctionsweights.root")
 
     listempty = []
-    tmp_merged = f"/data/tmp/hadd/{case}_{typean}/norm_analyzer/{get_timestamp_string()}/"
 
     useperiod = database["analysis"][typean]["useperiod"]
-    for indexp in range(len(resultsdata)):
-        logger.info("Origin path: %s, target path: %s", lper_normfilesorig[indexp],
-                    lper_normfiles[indexp])
-        mergerootfiles([lper_normfilesorig[indexp]], lper_normfiles[indexp], tmp_merged)
-        if doperiodbyperiod and useperiod[indexp]:
-            listempty.append(lper_normfiles[indexp])
-
-    mergerootfiles(listempty, f_normmerged, tmp_merged)
+    with tempfile.TemporaryDirectory() as tmp_merged_dir:
+        for indexp in range(len(resultsdata)):
+            logger.info("Origin path: %s, target path: %s", lper_normfilesorig[indexp],
+                        lper_normfiles[indexp])
+            mergerootfiles([lper_normfilesorig[indexp]], lper_normfiles[indexp], tmp_merged_dir)
+            if doperiodbyperiod and useperiod[indexp]:
+                listempty.append(lper_normfiles[indexp])
+
+        mergerootfiles(listempty, f_normmerged, tmp_merged_dir)
diff --git a/machine_learning_hep/multiprocesser.py b/machine_learning_hep/multiprocesser.py
index 9289e8dc1c..1432515633 100755
--- a/machine_learning_hep/multiprocesser.py
+++ b/machine_learning_hep/multiprocesser.py
@@ -16,7 +16,8 @@
 main script for doing data processing, machine learning and analysis
 """
 import os
+import tempfile
-from machine_learning_hep.utilities import merge_method, mergerootfiles, get_timestamp_string
+from machine_learning_hep.utilities import merge_method, mergerootfiles
 from machine_learning_hep.io import parse_yaml, dump_yaml_from_dict
 from machine_learning_hep.logger import get_logger
 
@@ -203,17 +204,16 @@ def multi_histomass(self):
         for indexp, _ in enumerate(self.process_listsample):
             if self.p_useperiod[indexp] == 1:
                 self.process_listsample[indexp].process_histomass()
-        tmp_merged = f"/data/tmp/hadd/{self.case}_{self.typean}/mass/{get_timestamp_string()}/"
         self.logger.debug('merging all')
-        mergerootfiles(self.lper_filemass, self.filemass_mergedall, tmp_merged)
+        with tempfile.TemporaryDirectory() as tmp_merged_dir:
+            mergerootfiles(self.lper_filemass, self.filemass_mergedall, tmp_merged_dir)
 
     def multi_efficiency(self):
         for indexp, _ in enumerate(self.process_listsample):
             if self.p_useperiod[indexp] == 1:
                 self.process_listsample[indexp].process_efficiency()
-        tmp_merged = \
-            f"/data/tmp/hadd/{self.case}_{self.typean}/efficiency/{get_timestamp_string()}/"
-        mergerootfiles(self.lper_fileeff, self.fileeff_mergedall, tmp_merged)
+        with tempfile.TemporaryDirectory() as tmp_merged_dir:
+            mergerootfiles(self.lper_fileeff, self.fileeff_mergedall, tmp_merged_dir)
 
     def multi_response(self):
         resp_exists = False
@@ -223,9 +223,8 @@ def multi_response(self):
                 resp_exists = True
                 self.process_listsample[indexp].process_response()
         if resp_exists:
-            tmp_merged = \
-                f"/data/tmp/hadd/{self.case}_{self.typean}/response/{get_timestamp_string()}/"
-            mergerootfiles(self.lper_fileresp, self.fileresp_mergedall, tmp_merged)
+            with tempfile.TemporaryDirectory() as tmp_merged_dir:
+                mergerootfiles(self.lper_fileresp, self.fileresp_mergedall, tmp_merged_dir)
 
     def multi_scancuts(self):
         for indexp, _ in enumerate(self.process_listsample):
diff --git a/machine_learning_hep/processer.py b/machine_learning_hep/processer.py
index a073234b39..c3d9be1427 100755
--- a/machine_learning_hep/processer.py
+++ b/machine_learning_hep/processer.py
@@ -23,6 +23,7 @@
 import glob
 import random as rd
 import re
+import tempfile
 import uproot
 import pandas as pd
 import numpy as np
@@ -32,7 +33,6 @@
 from machine_learning_hep.utilities import list_folders, createlist, appendmainfoldertolist
 from machine_learning_hep.utilities import create_folder_struc, seldf_singlevar, openfile
 from machine_learning_hep.utilities import mergerootfiles, count_df_length_pkl
-from machine_learning_hep.utilities import get_timestamp_string
 from machine_learning_hep.io import dump_yaml_from_dict
 from machine_learning_hep.logger import get_logger
 pd.options.mode.chained_assignment = None
@@ -656,9 +656,8 @@ def process_histomass(self):
         create_folder_struc(self.d_results, self.l_path)
         arguments = [(i,) for i in range(len(self.l_root))]
         self.parallelizer(self.process_histomass_single, arguments, self.p_chunksizeunp) # pylint: disable=no-member
-        tmp_merged = \
-            f"/tmp/hadd/{self.case}_{self.typean}/mass_{self.period}/{get_timestamp_string()}/"
-        mergerootfiles(self.l_histomass, self.n_filemass, tmp_merged)
+        with tempfile.TemporaryDirectory() as tmp_merged_dir:
+            mergerootfiles(self.l_histomass, self.n_filemass, tmp_merged_dir)
 
     def process_efficiency(self):
         print("Doing efficiencies", self.mcordata, self.period)
@@ -674,5 +673,5 @@ def process_efficiency(self):
         create_folder_struc(self.d_results, self.l_path)
         arguments = [(i,) for i in range(len(self.l_root))]
         self.parallelizer(self.process_efficiency_single, arguments, self.p_chunksizeunp) # pylint: disable=no-member
-        tmp_merged = f"/tmp/hadd/{self.case}_{self.typean}/histoeff_{self.period}/{get_timestamp_string()}/" # pylint: disable=line-too-long
-        mergerootfiles(self.l_histoeff, self.n_fileeff, tmp_merged)
+        with tempfile.TemporaryDirectory() as tmp_merged_dir:
+            mergerootfiles(self.l_histoeff, self.n_fileeff, tmp_merged_dir)
diff --git a/machine_learning_hep/utilities.py b/machine_learning_hep/utilities.py
index a92aa8db47..4aa393a0bd 100644
--- a/machine_learning_hep/utilities.py
+++ b/machine_learning_hep/utilities.py
@@ -327,9 +327,6 @@ def divide_chunks(list_to_split, chunk_size):
 
     tmp_files = []
     if len(listfiles) > 500:
-        if not os.path.exists(tmp_dir):
-            os.makedirs(tmp_dir)
-
         for i, split_list in enumerate(divide_chunks(listfiles, 500)):
             tmp_files.append(os.path.join(tmp_dir, f"hadd_tmp_merged{i}.root"))
             outstring = " ".join(split_list)
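
Note for reviewers (not part of the patch): every hunk applies the same pattern,
so below is a minimal, self-contained Python sketch of the before/after.
merge_files() is a hypothetical stand-in for mergerootfiles(); only the
tempfile.TemporaryDirectory() usage reflects the actual change.

import os
import shutil
import tempfile

def merge_files(inputs, output, scratch_dir):
    # Hypothetical stand-in for mergerootfiles(): writes an intermediate file
    # into scratch_dir, then moves the result to the final output path.
    intermediate = os.path.join(scratch_dir, "hadd_tmp_merged0.txt")
    with open(intermediate, "w", encoding="utf-8") as out:
        for path in inputs:
            with open(path, encoding="utf-8") as src:
                out.write(src.read())
    shutil.move(intermediate, output)  # unlike os.rename, works across filesystems

# Before: scratch dirs like f"/data/tmp/hadd/{case}_{typean}/.../{timestamp}/"
# were assembled by hand, created with os.makedirs(), and never deleted, so
# every run leaked a directory tree under /tmp or /data/tmp.
#
# After: TemporaryDirectory() creates a unique directory and removes it, with
# all its contents, when the with-block exits, even if an exception is raised.
if __name__ == "__main__":
    inputs = []
    for i in range(2):  # toy input files so the sketch runs end to end
        path = f"part{i}.txt"
        with open(path, "w", encoding="utf-8") as fil:
            fil.write(f"chunk {i}\n")
        inputs.append(path)
    with tempfile.TemporaryDirectory() as tmp_merged_dir:
        merge_files(inputs, "merged.txt", tmp_merged_dir)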