From 9e5ddaa1ad968d1b1d664625142e61853715feef Mon Sep 17 00:00:00 2001 From: Fernando Rios Date: Tue, 1 Oct 2024 12:20:39 -0700 Subject: [PATCH] Feat: describe enhancement or feature (Issue #41) (#106) * Add dry run argument * add dry run arg to config object * Add dry run functionality * lint * lint * lint --- README.md | 3 ++- app.py | 3 +++ figshare/Article.py | 47 ++++++++++++++++++++++++++++++++---------- figshare/Collection.py | 18 +++++++++++----- 4 files changed, 54 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index f31e057..5529ce9 100644 --- a/README.md +++ b/README.md @@ -33,7 +33,7 @@ ReBACH is run via the command line as outlined in the 'How to Run' section of th - user - required: Your user email address on AP Trust - token - required: Your user secret token on AP Trust - items_per_page - Maximum number of object to be return per page by the API - - alt_identifier_starts_with - Prefix for alternate identifier in AP Trust + - alt_identifier_starts_with - Prefix for alternate identifier in AP Trust - retries - required: Number of times the script should retry API or file system calls if it is unable to connect. Defaults to 3 - retries_wait - required: Number of seconds the script should wait between call retries if it is unable to connect. Defaults to 10 - preservation_storage_location - required: The file system location where the preservation folders/packages should be created @@ -54,6 +54,7 @@ These parameters are only available on the command line. |`--xfg` | The path to the configuration file to use.| |`--ids` | A comma-separated list of article IDs to process. E.g., 12345,12356| |`--continue-on-error`| If there is an error during the item processing stage for a given item, skip it and continue to the next item.| +|`--dry-run` | Runs all operations, excluding any that involve writing to any storage medium | ## Execution notes - ReBACH will attempt to fetch all items in the institutional instance. 
Items that are not published (curation_status != 'approved') will be ignored. diff --git a/app.py b/app.py index cf8d476..07a8f36 100644 --- a/app.py +++ b/app.py @@ -23,6 +23,8 @@ def get_args(): help='list of article and/or collection IDs to process. E.g., "2323,4353,5454"') parser.add_argument('--continue-on-error', action='store_true', help='If an item encounters an error during the processing stage, continue to the next item.') + parser.add_argument('--dry-run', action='store_true', + help='Fetch, match and verify items only. Do not download, delete, or upload to preservation any files.') args = parser.parse_args() @@ -72,6 +74,7 @@ def main(): config_obj = Config(env_file) config_obj.add_setting(name='continue-on-error', value=args.continue_on_error) + config_obj.add_setting(name='dry-run', value=args.dry_run) figshare_config = config_obj.figshare_config() system_config = config_obj.system_config() diff --git a/figshare/Article.py b/figshare/Article.py index d0e97c2..188a61a 100644 --- a/figshare/Article.py +++ b/figshare/Article.py @@ -660,7 +660,10 @@ def __check_file_hash(self, files, version_data, folder_path): # delete directory if validation failed. if (delete_folder is True): self.logs.write_log_in_file("error", f"Validation failed, deleting {preservation_storage_location + folder_path}.", True) - self.delete_folder(preservation_storage_location + folder_path) + if self.system_config['dry-run'] == 'False': + self.delete_folder(preservation_storage_location + folder_path) + else: + self.logs.write_log_in_file("info", "*Dry Run* Folder not deleted.", True) process_article = True return process_article @@ -1008,8 +1011,14 @@ def process_articles(self, articles): if (version_data["matched"] is True): self.logs.write_log_in_file("info", f"------- Processing article {article} version {version_data['version']}.", True) + # call pre process script function for each matched item. 
- value_pre_process = self.pre_process_script_function() + if self.system_config['dry-run'] == 'False': + value_pre_process = self.pre_process_script_function() + else: + value_pre_process = 0 + self.logs.write_log_in_file("info", "*Dry Run* Skipping pre processing.", True) + if (value_pre_process == 0): self.logs.write_log_in_file("info", "Pre-processing script finished successfully.", True) # check main folder exists in preservation storage. @@ -1026,24 +1035,40 @@ def process_articles(self, articles): else: self.logs.write_log_in_file("info", "Exists and is empty", True) check_files = False - # delete folder if validation fails - self.delete_folder(check_dir) - # call post process script function for each matched item. Code 5 corresponds to step 5 of S4.4 in the spec. - value_post_process = self.processor.post_process_script_function("Article", check_dir, value_pre_process, 5) - if (value_post_process != 0): - self.logs.write_log_in_file("error", f"{version_data['id']} version {version_data['version']} - " - + "Post-processing script error found.", True) + + if self.system_config['dry-run'] == 'False': + # delete folder if validation fails + self.delete_folder(check_dir) + # call post process script function for each matched item. Code 5 corresponds to step 5 of S4.4 in the spec. + value_post_process = self.processor.post_process_script_function("Article", check_dir, value_pre_process, 5) + if (value_post_process != 0): + self.logs.write_log_in_file("error", f"{version_data['id']} version {version_data['version']} - " + + "Post-processing script error found.", True) + else: + self.logs.write_log_in_file("info", "*Dry Run* File download and post-processing with " + + f"{self.system_config['post_process_script_command']} skipped.", True) + break else: - self.logs.write_log_in_file("info", "Does not exist. Folder will be created", True) + value_post_process = 0 + if self.system_config['dry-run'] == 'False': + self.logs.write_log_in_file("info", "Does not exist. 
Folder will be created", True) + else: + self.logs.write_log_in_file("info", "*Dru Run* Does not exist. Folder will not be created", True) # end check main folder exists in preservation storage. # check required files exist in curation UAL_RDM folder self.logs.write_log_in_file("info", "Checking required files exist in associated curation " + f"folder {curation_storage_location}.", True) copy_files = self.__can_copy_files(version_data) - if self.__final_process(check_files, copy_files, check_dir, version_data, folder_name, version_no, value_pre_process): + + if self.system_config['dry-run'] == 'False': + if self.__final_process(check_files, copy_files, check_dir, version_data, folder_name, version_no, value_pre_process): + processed_count += 1 + else: processed_count += 1 + self.logs.write_log_in_file("info", "*Dry Run* File download and post-processing with " + + f"{self.system_config['post_process_script_command']} skipped.", True) else: self.logs.write_log_in_file("error", "Pre-processing script failed. Running post-processing script.", True) # call post process script function for each matched item. 
diff --git a/figshare/Collection.py b/figshare/Collection.py index 0859ceb..186c335 100644 --- a/figshare/Collection.py +++ b/figshare/Collection.py @@ -302,13 +302,21 @@ def process_collections(self, collections): version["license"] = json.loads('{"value": 2,"name": "CC0","url": "https://creativecommons.org/publicdomain/zero/1.0/"}') self.logs.write_log_in_file("info", f"------- Processing collection {collection} version {version['version']}.", True) - self.__save_json_in_metadata(collection, version, folder_name) - collection_preservation_path = self.preservation_storage_location + os.path.basename(os.path.dirname(os.path.dirname(folder_name))) - value_post_process = self.processor.post_process_script_function("Collection", collection_preservation_path) - if (value_post_process != 0): - self.logs.write_log_in_file("error", f"collection {collection} - post-processing script failed.", True) + + if self.system_config['dry-run'] == 'False': + self.__save_json_in_metadata(collection, version, folder_name) + collection_preservation_path = self.preservation_storage_location + \ + os.path.basename(os.path.dirname(os.path.dirname(folder_name))) + value_post_process = self.processor.post_process_script_function("Collection", collection_preservation_path) + if (value_post_process != 0): + self.logs.write_log_in_file("error", f"collection {collection} - post-processing script failed.", True) + else: + processed_count += 1 else: + self.logs.write_log_in_file("info", "*Dry Run* File download and post-processing with " + + f"{self.system_config['post_process_script_command']} skipped.", True) processed_count += 1 + return processed_count, self.already_preserved_counts_dict """