From 63f126d6e8c2b44c08f7b988ba4053efbe1b92dd Mon Sep 17 00:00:00 2001 From: David Baines Date: Tue, 27 May 2025 12:30:51 +0100 Subject: [PATCH 1/5] Add multi-threading to speed up IO bound operations --- silnlp/common/clean_projects.py | 112 ++++++++++++++++++++++++++------ 1 file changed, 91 insertions(+), 21 deletions(-) diff --git a/silnlp/common/clean_projects.py b/silnlp/common/clean_projects.py index 6031c244..b1335735 100644 --- a/silnlp/common/clean_projects.py +++ b/silnlp/common/clean_projects.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 import argparse +import concurrent.futures import logging import shutil import sys @@ -320,6 +321,19 @@ def execute_cleanup(self): self._log_info("Cleanup execution finished.") +# --- Helper for concurrent project cleaning --- +def process_single_project_for_cleaning( + project_path: Path, current_args: argparse.Namespace +) -> str: + """ + Creates a ProjectCleaner instance, analyzes, and cleans a single project. + Returns the project name upon successful completion. + """ + cleaner = ProjectCleaner(project_path, current_args) + cleaner.analyze_project_contents() + cleaner.execute_cleanup() + return project_path.name # Return name for logging/tracking + # --- Main Function --- def main(): parser = argparse.ArgumentParser( @@ -368,10 +382,9 @@ def main(): file_handler.setLevel(logging.INFO) logger.addHandler(file_handler) - if args.verbose > 0: - print(f"Starting cleanup process for projects in: {args.projects_root}") - if args.dry_run: - print("DRY RUN mode enabled.") + print(f"Starting cleanup process for projects in: {args.projects_root}") + if args.dry_run: + print("DRY RUN mode enabled.") logger.info( f"Starting cleanup process. Projects root: {args.projects_root}. Dry run: {args.dry_run}. Verbose: {args.verbose}." 
) @@ -381,21 +394,53 @@ def main(): print(f"Error: Projects root folder not found: {args.projects_root}") sys.exit(1) - all_folders = [folder for folder in projects_root_path.iterdir() if folder.is_dir()] - found_total_msg = f"Found {len(all_folders)} folders in {args.projects_root}." + # Initial scan for all items to determine directories + initial_items = list(projects_root_path.glob("*")) + all_folders = [] + if args.verbose > 0: + print(f"Scanning {len(initial_items)} items in {args.projects_root} to find directories...") + + for item in tqdm(initial_items, desc=f"Scanning {args.projects_root}", unit="item", disable=args.verbose > 0): + if item.is_dir(): + all_folders.append(item) + + + found_total_msg = f"Found {len(all_folders)} total directories in {args.projects_root}." logger.info(found_total_msg) if args.verbose > 0: print(found_total_msg) project_folders = [] non_project_folders = [] - for folder in tqdm( - all_folders, desc="Scanning folders", unit="folder", disable=args.verbose > 0 - ): - if has_settings_file(folder): - project_folders.append(folder) - else: - non_project_folders.append(folder) + + # Use a ThreadPoolExecutor for concurrent I/O-bound tasks + max_workers = 10 + + with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: + + # Submit tasks for each folder + future_to_folder = {executor.submit(has_settings_file, folder): folder for folder in all_folders} + + # Iterate over completed tasks using tqdm, add mininterval for smoother updates + # if individual has_settings_file calls are very fast. 
+ for future in tqdm(concurrent.futures.as_completed(future_to_folder), + total=len(all_folders), + desc="Identifying project folders", + unit="folder", + disable=args.verbose > 0): + folder = future_to_folder[future] + try: + is_project = future.result() + if is_project: + project_folders.append(folder) + else: + non_project_folders.append(folder) + except Exception as exc: + logger.error(f"Error checking folder {folder}: {exc}") + if args.verbose > 0: + print(f"Error checking folder {folder}: {exc}") + non_project_folders.append(folder) + found_msg = f"Found {len(project_folders)} project folders." logger.info(found_msg) @@ -422,14 +467,39 @@ def main(): print(no_projects_msg) return - for project_path in tqdm(project_folders, desc="Cleaning projects", unit="project"): - cleaner = ProjectCleaner(project_path, args) - cleaner.analyze_project_contents() - cleaner.execute_cleanup() - if args.verbose > 0: - print(f"--- Finished processing project: {project_path.name} ---") - elif args.verbose == 0: - logger.info(f"Finished processing project: {project_path.name}") + # Concurrently process each project folder for cleaning + # Re-use max_workers from the previous section, or define a new one if desired. 
+ with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: + # Store future to project_path to retrieve the original Path object for robust error messages + future_to_project_path_map = { + executor.submit(process_single_project_for_cleaning, project_path, args): project_path + for project_path in project_folders + } + + for future in tqdm( + concurrent.futures.as_completed(future_to_project_path_map), + total=len(project_folders), + desc="Cleaning projects", + unit="project", + disable=args.verbose > 0, # tqdm is disabled if verbose output is on + mininterval=0.01 # More frequent updates, similar to the folder identification step + ): + processed_project_path = future_to_project_path_map[future] + try: + # future.result() will re-raise exceptions from the worker function + # and return the project name. + project_name = future.result() + + # Log completion for this project + if args.verbose > 0: + print(f"--- Finished processing project: {project_name} ---") + # Log to file even if tqdm is active (args.verbose == 0) + logger.info(f"Finished processing project: {project_name}") + + except Exception as exc: + logger.error(f"Error cleaning project {processed_project_path.name}: {exc}") + if args.verbose > 0: # Also print to console if verbose + print(f"Error cleaning project {processed_project_path.name}: {exc}") final_msg = "\nCleanup process completed." 
logger.info(final_msg) From 90dc99b7bc3d9d8193d84ccff11f9f3c571102d4 Mon Sep 17 00:00:00 2001 From: David Baines Date: Tue, 27 May 2025 16:44:11 +0100 Subject: [PATCH 2/5] Updates to clean_projects.py --- silnlp/common/clean_projects.py | 127 ++++++++++++++++++++------------ 1 file changed, 79 insertions(+), 48 deletions(-) diff --git a/silnlp/common/clean_projects.py b/silnlp/common/clean_projects.py index b1335735..12e68d3e 100644 --- a/silnlp/common/clean_projects.py +++ b/silnlp/common/clean_projects.py @@ -110,21 +110,18 @@ def __init__(self, project_path: Path, args): self.files_to_delete = set() self.folders_to_delete = set() self.parsing_errors = [] + self.log_buffer: list[str] = [] # Buffer to store log messages for this project self.log_prefix = f"[{self.project_path.name}] " def _log_info(self, message: str): full_message = f"{self.log_prefix}{message}" - logger.info(full_message) - if self.args.verbose > 0: - print(full_message) + self.log_buffer.append(full_message) def _log_action(self, action: str, item_path: Path): full_message = ( f"{self.log_prefix}{action}: {item_path.relative_to(self.project_path)}" ) - logger.info(full_message) - if self.args.verbose > 0: - print(full_message) + self.log_buffer.append(full_message) def _parse_settings(self): settings_file_path = self.project_path / SETTINGS_FILENAME @@ -132,7 +129,7 @@ def _parse_settings(self): settings_file_path = self.project_path / SETTINGS_FILENAME.lower() if not settings_file_path.exists(): warning_msg = f"Warning: {SETTINGS_FILENAME} not found." 
- if self.args.verbose: + if self.args.verbose > 0: # Condition to buffer this warning self._log_info(warning_msg) self.parsing_errors.append(f"{SETTINGS_FILENAME} not found.") return @@ -141,36 +138,44 @@ def _parse_settings(self): parser = FileParatextProjectSettingsParser(str(self.project_path)) project_settings = parser.parse() self.project_settings = project_settings - - full_suffix = project_settings.file_name_suffix.upper() - self.scripture_file_extension = Path(full_suffix).suffix - if not self.scripture_file_extension: - self.scripture_file_extension = "" + + # Log raw settings related to file naming now that self.project_settings is assigned. self._log_info( - f"Determined scripture file extension: {self.scripture_file_extension}" + f"Settings - FileNamePrePart:'{self.project_settings.file_name_prefix}' " + f"PostPart:'{self.project_settings.file_name_suffix}' " + f"BookNameForm:'{self.project_settings.file_name_form}'" ) - if project_settings.biblical_terms_file_name: - terms_file_path = ( - self.project_path / project_settings.biblical_terms_file_name - ) - if terms_file_path.is_file(): - self.biblical_terms_files.add(terms_file_path) - self._log_info( - f"Found BiblicalTermsListSetting file: {terms_file_path.name}" - ) - else: - warning_msg = f"Warning: BiblicalTermsListSetting file not found at expected path: {terms_file_path}" - if self.args.verbose: - self._log_info(warning_msg) - self.parsing_errors.append( - f"BiblicalTermsListSetting file not found: {terms_file_path.name}" - ) except Exception as e: error_msg = f"Error parsing {SETTINGS_FILENAME}: {e}" - if self.args.verbose: + if self.args.verbose > 0: # Condition to buffer this error message self._log_info(error_msg) self.parsing_errors.append(error_msg) + # Log that specific settings details could not be retrieved + self._log_info( + f"Settings - Could not log file naming details (PrePart, PostPart, BookNameForm) due to parsing error: {e}" + ) + + # The following code correctly uses 
self.project_settings, + # which will be None if parsing failed, and thus these blocks will be skipped. + + if project_settings.biblical_terms_file_name: + terms_file_path = ( + self.project_path / project_settings.biblical_terms_file_name + ) + if terms_file_path.is_file(): + self.biblical_terms_files.add(terms_file_path) + self._log_info( + f"Found BiblicalTermsListSetting file: {terms_file_path.name}" + ) + else: + warning_msg = f"Warning: BiblicalTermsListSetting file not found at expected path: {terms_file_path}" + if self.args.verbose > 0: # Condition to buffer this warning + self._log_info(warning_msg) + self.parsing_errors.append( + f"BiblicalTermsListSetting file not found: {terms_file_path.name}" + ) + def analyze_project_contents(self): self._parse_settings() @@ -324,15 +329,15 @@ def execute_cleanup(self): # --- Helper for concurrent project cleaning --- def process_single_project_for_cleaning( project_path: Path, current_args: argparse.Namespace -) -> str: +) -> tuple[str, list[str], list[str]]: """ Creates a ProjectCleaner instance, analyzes, and cleans a single project. - Returns the project name upon successful completion. + Returns the project name, a list of log messages, and a list of parsing errors. """ cleaner = ProjectCleaner(project_path, current_args) cleaner.analyze_project_contents() cleaner.execute_cleanup() - return project_path.name # Return name for logging/tracking + return project_path.name, cleaner.log_buffer, cleaner.parsing_errors # --- Main Function --- def main(): @@ -404,6 +409,14 @@ def main(): if item.is_dir(): all_folders.append(item) + test = True + + if test: + all_folders = all_folders[:200] + # Use a single ThreadPoolExecutor for concurrent I/O-bound tasks + max_workers = 1 + else: + max_workers = 10 found_total_msg = f"Found {len(all_folders)} total directories in {args.projects_root}." 
logger.info(found_total_msg) @@ -413,8 +426,7 @@ def main(): project_folders = [] non_project_folders = [] - # Use a ThreadPoolExecutor for concurrent I/O-bound tasks - max_workers = 10 + with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: @@ -467,6 +479,8 @@ def main(): print(no_projects_msg) return + processed_project_data: list[tuple[str, list[str], list[str], Path]] = [] + # Concurrently process each project folder for cleaning # Re-use max_workers from the previous section, or define a new one if desired. with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: @@ -486,20 +500,37 @@ def main(): ): processed_project_path = future_to_project_path_map[future] try: - # future.result() will re-raise exceptions from the worker function - # and return the project name. - project_name = future.result() - - # Log completion for this project - if args.verbose > 0: - print(f"--- Finished processing project: {project_name} ---") - # Log to file even if tqdm is active (args.verbose == 0) - logger.info(f"Finished processing project: {project_name}") - + project_name, project_logs, project_errors = future.result() + processed_project_data.append((project_name, project_logs, project_errors, processed_project_path)) except Exception as exc: - logger.error(f"Error cleaning project {processed_project_path.name}: {exc}") - if args.verbose > 0: # Also print to console if verbose - print(f"Error cleaning project {processed_project_path.name}: {exc}") + # Log critical errors during processing immediately, as they might prevent log collection + crit_error_msg = f"Critical error during processing of project {processed_project_path.name}: {exc}" + logger.error(crit_error_msg) + if args.verbose > 0: + print(crit_error_msg) + # Store a placeholder for sorted output + processed_project_data.append((processed_project_path.name, [], [f"Critical error: {exc}"], processed_project_path)) + + # Sort all collected data by project name + 
processed_project_data.sort(key=lambda x: x[0]) + + # Log the collected and sorted data + for project_name, project_logs, project_parsing_errors, _project_path in processed_project_data: + # Log messages collected by the cleaner + for log_msg_from_buffer in project_logs: + logger.info(log_msg_from_buffer) # Already formatted with [ProjectName] prefix by ProjectCleaner + if args.verbose > 0: # Print to console if verbose + print(log_msg_from_buffer) + + # Log parsing errors, ensuring they are associated with the project + if project_parsing_errors: + for err_str in project_parsing_errors: + error_log_message = f"[{project_name}] Config Error: {err_str}" + logger.warning(error_log_message) # Use warning for parsing/config errors + if args.verbose > 0: + print(error_log_message) + + logger.info(f"[{project_name}] Processing completed.") # Log overall completion for this project final_msg = "\nCleanup process completed." logger.info(final_msg) From d3f182d0fd24d83c29b3d3acb991cdc5b11fafd1 Mon Sep 17 00:00:00 2001 From: David Baines Date: Wed, 28 May 2025 14:07:11 +0100 Subject: [PATCH 3/5] Lower case TermRenderings.xml and turn off test --- silnlp/common/clean_projects.py | 118 ++++++++++++-------------------- 1 file changed, 44 insertions(+), 74 deletions(-) diff --git a/silnlp/common/clean_projects.py b/silnlp/common/clean_projects.py index 12e68d3e..1d0e42c2 100644 --- a/silnlp/common/clean_projects.py +++ b/silnlp/common/clean_projects.py @@ -12,11 +12,12 @@ from tqdm import tqdm # --- Global Constants --- -PROJECTS_FOLDER_DEFAULT = "M:/Paratext/projects" +PROJECTS_FOLDER_DEFAULT = "M:/Paratext/projects" logger = logging.getLogger(__name__) SETTINGS_FILENAME = "Settings.xml" # --- Configuration for Deletion/Keep Rules --- +# These are matched with lower cased versions of the filename, they must be listed in lower case here. 
FILES_TO_DELETE_BY_NAME_CI = { "allclustercorrections.txt", @@ -76,8 +77,7 @@ "bookNames.xml", "canons.xml", "lexicon.xml", - "TermRenderings.xml", - + "termrenderings.xml", } EXTENSIONS_TO_KEEP_CI = { @@ -94,9 +94,7 @@ def has_settings_file(project_folder: Path) -> bool: - return (project_folder / SETTINGS_FILENAME).is_file() or ( - project_folder / SETTINGS_FILENAME.lower() - ).is_file() + return (project_folder / SETTINGS_FILENAME).is_file() or (project_folder / SETTINGS_FILENAME.lower()).is_file() class ProjectCleaner: @@ -118,9 +116,7 @@ def _log_info(self, message: str): self.log_buffer.append(full_message) def _log_action(self, action: str, item_path: Path): - full_message = ( - f"{self.log_prefix}{action}: {item_path.relative_to(self.project_path)}" - ) + full_message = f"{self.log_prefix}{action}: {item_path.relative_to(self.project_path)}" self.log_buffer.append(full_message) def _parse_settings(self): @@ -129,7 +125,7 @@ def _parse_settings(self): settings_file_path = self.project_path / SETTINGS_FILENAME.lower() if not settings_file_path.exists(): warning_msg = f"Warning: {SETTINGS_FILENAME} not found." - if self.args.verbose > 0: # Condition to buffer this warning + if self.args.verbose > 0: # Condition to buffer this warning self._log_info(warning_msg) self.parsing_errors.append(f"{SETTINGS_FILENAME} not found.") return @@ -138,7 +134,7 @@ def _parse_settings(self): parser = FileParatextProjectSettingsParser(str(self.project_path)) project_settings = parser.parse() self.project_settings = project_settings - + # Log raw settings related to file naming now that self.project_settings is assigned. 
self._log_info( f"Settings - FileNamePrePart:'{self.project_settings.file_name_prefix}' " @@ -148,34 +144,27 @@ def _parse_settings(self): except Exception as e: error_msg = f"Error parsing {SETTINGS_FILENAME}: {e}" - if self.args.verbose > 0: # Condition to buffer this error message + if self.args.verbose > 0: # Condition to buffer this error message self._log_info(error_msg) self.parsing_errors.append(error_msg) # Log that specific settings details could not be retrieved self._log_info( - f"Settings - Could not log file naming details (PrePart, PostPart, BookNameForm) due to parsing error: {e}" + f"Settings - Couldn't log naming details (PrePart, PostPart, BookNameForm) due to parsing error: {e}" ) # The following code correctly uses self.project_settings, # which will be None if parsing failed, and thus these blocks will be skipped. if project_settings.biblical_terms_file_name: - terms_file_path = ( - self.project_path / project_settings.biblical_terms_file_name - ) + terms_file_path = self.project_path / project_settings.biblical_terms_file_name if terms_file_path.is_file(): self.biblical_terms_files.add(terms_file_path) - self._log_info( - f"Found BiblicalTermsListSetting file: {terms_file_path.name}" - ) + self._log_info(f"Found BiblicalTermsListSetting file: {terms_file_path.name}") else: warning_msg = f"Warning: BiblicalTermsListSetting file not found at expected path: {terms_file_path}" - if self.args.verbose > 0: # Condition to buffer this warning + if self.args.verbose > 0: # Condition to buffer this warning self._log_info(warning_msg) - self.parsing_errors.append( - f"BiblicalTermsListSetting file not found: {terms_file_path.name}" - ) - + self.parsing_errors.append(f"BiblicalTermsListSetting file not found: {terms_file_path.name}") def analyze_project_contents(self): self._parse_settings() @@ -201,32 +190,22 @@ def analyze_project_contents(self): # Scripture files are identified using ParatextProjectSettings.get_book_id() if self.project_settings: - 
for ( - item - ) in ( - self.project_path.iterdir() - ): # Scripture files are typically at the project root + for item in self.project_path.iterdir(): # Scripture files are typically at the project root if item.is_file(): book_id = self.project_settings.get_book_id(item.name) if book_id is not None: self.files_to_keep.add(item) if self.args.verbose > 1: - self._log_info( - f"Kept scripture file (via get_book_id): {item.name}" - ) + self._log_info(f"Kept scripture file (via get_book_id): {item.name}") elif self.args.verbose > 0: - self._log_info( - "Project settings not available; cannot use get_book_id for scripture identification." - ) + self._log_info("Project settings not available; cannot use get_book_id for scripture identification.") for item in all_items_in_project: if item.is_file() and item.suffix.lower() in EXTENSIONS_TO_KEEP_CI: self.files_to_keep.add(item) if self.args.verbose > 1: - self._log_info( - f"Identified {len(self.files_to_keep)} files to keep initially." - ) + self._log_info(f"Identified {len(self.files_to_keep)} files to keep initially.") # --- Pass 2: Identify files to DELETE --- for item_path in all_items_in_project: @@ -241,17 +220,12 @@ def analyze_project_contents(self): if item_name_lower in FILES_TO_DELETE_BY_NAME_CI: delete_file = True reason = "specific name" - elif any( - item_path.match(pattern) for pattern in FILES_TO_DELETE_BY_PATTERN - ): + elif any(item_path.match(pattern) for pattern in FILES_TO_DELETE_BY_PATTERN): delete_file = True reason = "pattern match" - elif any( - sub_str in item_name_lower - for sub_str in FILENAME_SUBSTRINGS_TO_DELETE_CI - ): + elif any(sub_str in item_name_lower for sub_str in FILENAME_SUBSTRINGS_TO_DELETE_CI): delete_file = True - reason = "substring match" + reason = "substring match" elif item_suffix_lower in EXTENSIONS_TO_DELETE_CI: delete_file = True reason = f"extension ({item_suffix_lower})" @@ -274,9 +248,7 @@ def analyze_project_contents(self): if delete_file: 
self.files_to_delete.add(item_path) if self.args.verbose > 1: - self._log_info( - f"Marked for deletion ({reason}): {item_path.relative_to(self.project_path)}" - ) + self._log_info(f"Marked for deletion ({reason}): {item_path.relative_to(self.project_path)}") # --- Pass 3: Identify folders to DELETE --- for item in self.project_path.iterdir(): @@ -339,6 +311,7 @@ def process_single_project_for_cleaning( cleaner.execute_cleanup() return project_path.name, cleaner.log_buffer, cleaner.parsing_errors + # --- Main Function --- def main(): parser = argparse.ArgumentParser( @@ -363,9 +336,7 @@ def main(): default=0, help="Increase output verbosity. -v for project-level info, -vv for file-level decisions.", ) - parser.add_argument( - "--log-file", help="Path to a file to log actions and verbose information." - ) + parser.add_argument("--log-file", help="Path to a file to log actions and verbose information.") args = parser.parse_args() # --- Configure Logging --- @@ -391,7 +362,7 @@ def main(): if args.dry_run: print("DRY RUN mode enabled.") logger.info( - f"Starting cleanup process. Projects root: {args.projects_root}. Dry run: {args.dry_run}. Verbose: {args.verbose}." + f"Starting cleanup process for: {args.projects_root}. Dry run: {args.dry_run}. Verbose: {args.verbose}." ) projects_root_path = Path(args.projects_root) @@ -409,7 +380,7 @@ def main(): if item.is_dir(): all_folders.append(item) - test = True + test = False if test: all_folders = all_folders[:200] @@ -426,20 +397,20 @@ def main(): project_folders = [] non_project_folders = [] - - with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: - + # Submit tasks for each folder future_to_folder = {executor.submit(has_settings_file, folder): folder for folder in all_folders} # Iterate over completed tasks using tqdm, add mininterval for smoother updates # if individual has_settings_file calls are very fast. 
- for future in tqdm(concurrent.futures.as_completed(future_to_folder), - total=len(all_folders), - desc="Identifying project folders", - unit="folder", - disable=args.verbose > 0): + for future in tqdm( + concurrent.futures.as_completed(future_to_folder), + total=len(all_folders), + desc="Identifying project folders", + unit="folder", + disable=args.verbose > 0, + ): folder = future_to_folder[future] try: is_project = future.result() @@ -450,19 +421,16 @@ def main(): except Exception as exc: logger.error(f"Error checking folder {folder}: {exc}") if args.verbose > 0: - print(f"Error checking folder {folder}: {exc}") + print(f"Error checking folder {folder}: {exc}") non_project_folders.append(folder) - found_msg = f"Found {len(project_folders)} project folders." logger.info(found_msg) if args.verbose > 0: print(found_msg) if non_project_folders: - non_project_msg = ( - f"Found {len(non_project_folders)} non-project folders (will be ignored):" - ) + non_project_msg = f"Found {len(non_project_folders)} non-project folders (will be ignored):" logger.info(non_project_msg) if args.verbose > 0: print(non_project_msg) @@ -496,7 +464,7 @@ def main(): desc="Cleaning projects", unit="project", disable=args.verbose > 0, # tqdm is disabled if verbose output is on - mininterval=0.01 # More frequent updates, similar to the folder identification step + mininterval=0.01, # More frequent updates, similar to the folder identification step ): processed_project_path = future_to_project_path_map[future] try: @@ -509,7 +477,9 @@ def main(): if args.verbose > 0: print(crit_error_msg) # Store a placeholder for sorted output - processed_project_data.append((processed_project_path.name, [], [f"Critical error: {exc}"], processed_project_path)) + processed_project_data.append( + (processed_project_path.name, [], [f"Critical error: {exc}"], processed_project_path) + ) # Sort all collected data by project name processed_project_data.sort(key=lambda x: x[0]) @@ -518,19 +488,19 @@ def main(): for 
project_name, project_logs, project_parsing_errors, _project_path in processed_project_data: # Log messages collected by the cleaner for log_msg_from_buffer in project_logs: - logger.info(log_msg_from_buffer) # Already formatted with [ProjectName] prefix by ProjectCleaner - if args.verbose > 0: # Print to console if verbose + logger.info(log_msg_from_buffer) # Already formatted with [ProjectName] prefix by ProjectCleaner + if args.verbose > 0: # Print to console if verbose print(log_msg_from_buffer) # Log parsing errors, ensuring they are associated with the project if project_parsing_errors: for err_str in project_parsing_errors: error_log_message = f"[{project_name}] Config Error: {err_str}" - logger.warning(error_log_message) # Use warning for parsing/config errors + logger.warning(error_log_message) # Use warning for parsing/config errors if args.verbose > 0: print(error_log_message) - - logger.info(f"[{project_name}] Processing completed.") # Log overall completion for this project + + logger.info(f"[{project_name}] Processing completed.") # Log overall completion for this project final_msg = "\nCleanup process completed." logger.info(final_msg) From ef092ffa59c51bdf0de4e186a9a51a7f2c503ea3 Mon Sep 17 00:00:00 2001 From: David Baines Date: Wed, 28 May 2025 14:09:23 +0100 Subject: [PATCH 4/5] Remove test code. --- silnlp/common/clean_projects.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/silnlp/common/clean_projects.py b/silnlp/common/clean_projects.py index 1d0e42c2..f4defdd9 100644 --- a/silnlp/common/clean_projects.py +++ b/silnlp/common/clean_projects.py @@ -381,13 +381,7 @@ def main(): all_folders.append(item) test = False - - if test: - all_folders = all_folders[:200] - # Use a single ThreadPoolExecutor for concurrent I/O-bound tasks - max_workers = 1 - else: - max_workers = 10 + max_workers = 10 found_total_msg = f"Found {len(all_folders)} total directories in {args.projects_root}." 
 logger.info(found_total_msg)

From deba37f431c01b47f380a9fa50e90499e14070b2 Mon Sep 17 00:00:00 2001
From: David Baines
Date: Mon, 2 Jun 2025 10:36:26 +0100
Subject: [PATCH 5/5] Remove the unused test parameter

---
 silnlp/common/clean_projects.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/silnlp/common/clean_projects.py b/silnlp/common/clean_projects.py
index f4defdd9..4cb0e82c 100644
--- a/silnlp/common/clean_projects.py
+++ b/silnlp/common/clean_projects.py
@@ -155,8 +155,8 @@ def _parse_settings(self):
        # The following code correctly uses self.project_settings,
        # which will be None if parsing failed, and thus these blocks will be skipped.

-        if project_settings.biblical_terms_file_name:
-            terms_file_path = self.project_path / project_settings.biblical_terms_file_name
+        if self.project_settings and self.project_settings.biblical_terms_file_name:
+            terms_file_path = self.project_path / self.project_settings.biblical_terms_file_name
             if terms_file_path.is_file():
                 self.biblical_terms_files.add(terms_file_path)
                 self._log_info(f"Found BiblicalTermsListSetting file: {terms_file_path.name}")
@@ -164,7 +164,7 @@ def _parse_settings(self):
                 warning_msg = f"Warning: BiblicalTermsListSetting file not found at expected path: {terms_file_path}"
                 if self.args.verbose > 0:  # Condition to buffer this warning
                     self._log_info(warning_msg)
-                self.parsing_errors.append(f"BiblicalTermsListSetting file not found: {terms_file_path.name}")
+                self.parsing_errors.append(f"BiblicalTermsListSetting file not found: {self.project_settings.biblical_terms_file_name}")

     def analyze_project_contents(self):
         self._parse_settings()
@@ -380,7 +380,6 @@ def main():
             if item.is_dir():
                 all_folders.append(item)

-    test = False
     max_workers = 10

     found_total_msg = f"Found {len(all_folders)} total directories in {args.projects_root}."