diff --git a/src/hats_import/catalog/resume_plan.py b/src/hats_import/catalog/resume_plan.py index c296513..5cec0e0 100644 --- a/src/hats_import/catalog/resume_plan.py +++ b/src/hats_import/catalog/resume_plan.py @@ -146,10 +146,9 @@ def get_remaining_map_keys(self): list of mapping keys *not* found in files like /resume/path/mapping_key.npz """ prefix = file_io.get_upath(self.tmp_path) / self.HISTOGRAMS_DIR - done_indexes = [ - int(re.match(r"map_(\d+).npz", path.name).group(1)) for path in prefix.glob("*.npz") - ] - remaining_indexes = list(set(range(0, len(self.input_paths))).difference(set(done_indexes))) + map_file_pattern = re.compile(r"map_(\d+).npz") + done_indexes = [int(map_file_pattern.match(path.name).group(1)) for path in prefix.glob("*.npz")] + remaining_indexes = list(set(range(0, len(self.input_paths))) - (set(done_indexes))) return [(f"map_{key}", self.input_paths[key]) for key in remaining_indexes] def read_histogram(self, healpix_order): @@ -214,9 +213,8 @@ def get_remaining_split_keys(self): list of splitting keys *not* found in files like /resume/path/split_key.done """ prefix = file_io.get_upath(self.tmp_path) / self.SPLITTING_STAGE - done_indexes = [ - int(re.match(r"split_(\d+)_done", path.name).group(1)) for path in prefix.glob("*_done") - ] + split_file_pattern = re.compile(r"split_(\d+)_done") + done_indexes = [int(split_file_pattern.match(path.name).group(1)) for path in prefix.glob("*_done")] remaining_indexes = list(set(range(0, len(self.input_paths))) - set(done_indexes)) return [(f"split_{key}", self.input_paths[key]) for key in remaining_indexes] diff --git a/src/hats_import/pipeline_resume_plan.py b/src/hats_import/pipeline_resume_plan.py index 747cd6f..449ab53 100644 --- a/src/hats_import/pipeline_resume_plan.py +++ b/src/hats_import/pipeline_resume_plan.py @@ -119,8 +119,9 @@ def read_markers(self, stage_name: str) -> dict[str, list[str]]: prefix = file_io.append_paths_to_pointer(self.tmp_path, stage_name) result = {} result_files = file_io.find_files_matching_path(prefix, "*_done") + done_file_pattern = re.compile(r"(.*)_done") for file_path in result_files: - match = re.match(r"(.*)_done", str(file_path.name)) + match = done_file_pattern.match(str(file_path.name)) if not match: raise ValueError(f"Unexpected file found: {file_path.name}") key = match.group(1) @@ -136,9 +137,8 @@ def read_done_pixels(self, stage_name): List[HealpixPixel] - all pixel keys found in done directory """ prefix = file_io.append_paths_to_pointer(self.tmp_path, stage_name) - pixel_tuples = [ - re.match(r"(\d+)_(\d+)_done", path.name).group(1, 2) for path in prefix.glob("*_done") - ] + done_file_pattern = re.compile(r"(\d+)_(\d+)_done") + pixel_tuples = [done_file_pattern.match(path.name).group(1, 2) for path in prefix.glob("*_done")] return [HealpixPixel(int(match[0]), int(match[1])) for match in pixel_tuples] def clean_resume_files(self): diff --git a/src/hats_import/soap/resume_plan.py b/src/hats_import/soap/resume_plan.py index 7d96ebb..93a3d71 100644 --- a/src/hats_import/soap/resume_plan.py +++ b/src/hats_import/soap/resume_plan.py @@ -98,8 +98,9 @@ def get_sources_to_count(self, source_pixel_map=None): self.source_pixel_map = source_pixel_map if self.source_pixel_map is None: raise ValueError("source_pixel_map not provided for progress tracking.") + count_file_pattern = re.compile(r"(\d+)_(\d+).csv") counted_pixel_tuples = [ - re.match(r"(\d+)_(\d+).csv", path.name).group(1, 2) for path in self.tmp_path.glob("*.csv") + count_file_pattern.match(path.name).group(1, 2) for path in self.tmp_path.glob("*.csv") ] counted_pixels = [HealpixPixel(int(match[0]), int(match[1])) for match in counted_pixel_tuples]