From 9e5ddaa1ad968d1b1d664625142e61853715feef Mon Sep 17 00:00:00 2001
From: Fernando Rios <zoidy@users.noreply.github.com>
Date: Tue, 1 Oct 2024 12:20:39 -0700
Subject: [PATCH] Feat: add --dry-run option that skips storage writes (Issue #41) (#106)

* Add dry run argument

* add dry run arg to config object

* Add dry run functionality

* lint

* lint

* lint
---
 README.md              |  3 ++-
 app.py                 |  3 +++
 figshare/Article.py    | 47 ++++++++++++++++++++++++++++++++----------
 figshare/Collection.py | 18 +++++++++++-----
 4 files changed, 54 insertions(+), 17 deletions(-)

diff --git a/README.md b/README.md
index f31e057..5529ce9 100644
--- a/README.md
+++ b/README.md
@@ -33,7 +33,7 @@ ReBACH is run via the command line as outlined in the 'How to Run' section of th
 		- user - required: Your user email address on AP Trust
 		- token - required: Your user secret token on AP Trust
         - items_per_page - Maximum number of object to be return per page by the API
-        - alt_identifier_starts_with - Prefix for alternate identifier in AP Trust 
+        - alt_identifier_starts_with - Prefix for alternate identifier in AP Trust
 		- retries - required: Number of times the script should retry API or file system calls if it is unable to connect. Defaults to 3
 		- retries_wait - required: Number of seconds the script should wait between call retries if it is unable to connect. Defaults to 10
     - preservation_storage_location - required: The file system location where the preservation folders/packages should be created
@@ -54,6 +54,7 @@ These parameters are only available on the command line.
 |`--xfg`  | The path to the configuration file to use.|
 |`--ids`  | A comma-separated list of article IDs to process. E.g., 12345,12356|
 |`--continue-on-error`| If there is an error during the item processing stage for a given item, skip it and continue to the next item.|
+|`--dry-run` | Runs all operations except those that write to, or delete from, any storage medium. |
 
 ## Execution notes
 - ReBACH will attempt to fetch all items in the institutional instance. Items that are not published (curation_status != 'approved') will be ignored.
diff --git a/app.py b/app.py
index cf8d476..07a8f36 100644
--- a/app.py
+++ b/app.py
@@ -23,6 +23,8 @@ def get_args():
                         help='list of article and/or collection IDs to process. E.g., "2323,4353,5454"')
     parser.add_argument('--continue-on-error', action='store_true',
                         help='If an item encounters an error during the processing stage, continue to the next item.')
+    parser.add_argument('--dry-run', action='store_true',
+                        help='Fetch, match and verify items only. Do not download, delete, or upload to preservation any files.')
     args = parser.parse_args()
 
 
@@ -72,6 +74,7 @@ def main():
     config_obj = Config(env_file)
 
     config_obj.add_setting(name='continue-on-error', value=args.continue_on_error)
+    config_obj.add_setting(name='dry-run', value=args.dry_run)
 
     figshare_config = config_obj.figshare_config()
     system_config = config_obj.system_config()
diff --git a/figshare/Article.py b/figshare/Article.py
index d0e97c2..188a61a 100644
--- a/figshare/Article.py
+++ b/figshare/Article.py
@@ -660,7 +660,10 @@ def __check_file_hash(self, files, version_data, folder_path):
         # delete directory if validation failed.
         if (delete_folder is True):
             self.logs.write_log_in_file("error", f"Validation failed, deleting {preservation_storage_location + folder_path}.", True)
-            self.delete_folder(preservation_storage_location + folder_path)
+            if self.system_config['dry-run'] == 'False':
+                self.delete_folder(preservation_storage_location + folder_path)
+            else:
+                self.logs.write_log_in_file("info", "*Dry Run* Folder not deleted.", True)
             process_article = True
 
         return process_article
@@ -1008,8 +1011,14 @@ def process_articles(self, articles):
 
                     if (version_data["matched"] is True):
                         self.logs.write_log_in_file("info", f"------- Processing article {article} version {version_data['version']}.", True)
+
                         # call pre process script function for each matched item.
-                        value_pre_process = self.pre_process_script_function()
+                        if self.system_config['dry-run'] == 'False':
+                            value_pre_process = self.pre_process_script_function()
+                        else:
+                            value_pre_process = 0
+                            self.logs.write_log_in_file("info", "*Dry Run* Skipping pre processing.", True)
+
                         if (value_pre_process == 0):
                             self.logs.write_log_in_file("info", "Pre-processing script finished successfully.", True)
                             # check main folder exists in preservation storage.
@@ -1026,24 +1035,40 @@ def process_articles(self, articles):
                                 else:
                                     self.logs.write_log_in_file("info", "Exists and is empty", True)
                                     check_files = False
-                                    # delete folder if validation fails
-                                    self.delete_folder(check_dir)
-                                    # call post process script function for each matched item. Code 5 corresponds to step 5 of S4.4 in the spec.
-                                    value_post_process = self.processor.post_process_script_function("Article", check_dir, value_pre_process, 5)
-                                    if (value_post_process != 0):
-                                        self.logs.write_log_in_file("error", f"{version_data['id']} version {version_data['version']} - "
-                                                                    + "Post-processing script error found.", True)
+
+                                    if self.system_config['dry-run'] == 'False':
+                                        # delete folder if validation fails
+                                        self.delete_folder(check_dir)
+                                        # call post process script function for each matched item. Code 5 corresponds to step 5 of S4.4 in the spec.
+                                        value_post_process = self.processor.post_process_script_function("Article", check_dir, value_pre_process, 5)
+                                        if (value_post_process != 0):
+                                            self.logs.write_log_in_file("error", f"{version_data['id']} version {version_data['version']} - "
+                                                                        + "Post-processing script error found.", True)
+                                    else:
+                                        self.logs.write_log_in_file("info", "*Dry Run* File download and post-processing with "
+                                                                    + f"{self.system_config['post_process_script_command']} skipped.", True)
+
                                     break
                             else:
-                                self.logs.write_log_in_file("info", "Does not exist. Folder will be created", True)
+                                value_post_process = 0
+                                if self.system_config['dry-run'] == 'False':
+                                    self.logs.write_log_in_file("info", "Does not exist. Folder will be created", True)
+                                else:
+                                    self.logs.write_log_in_file("info", "*Dru Run* Does not exist. Folder will not be created", True)
 
                             # end check main folder exists in preservation storage.
                             # check required files exist in curation UAL_RDM folder
                             self.logs.write_log_in_file("info", "Checking required files exist in associated curation "
                                                         + f"folder {curation_storage_location}.", True)
                             copy_files = self.__can_copy_files(version_data)
-                            if self.__final_process(check_files, copy_files, check_dir, version_data, folder_name, version_no, value_pre_process):
+
+                            if self.system_config['dry-run'] == 'False':
+                                if self.__final_process(check_files, copy_files, check_dir, version_data, folder_name, version_no, value_pre_process):
+                                    processed_count += 1
+                            else:
                                 processed_count += 1
+                                self.logs.write_log_in_file("info", "*Dry Run* File download and post-processing with "
+                                                            + f"{self.system_config['post_process_script_command']} skipped.", True)
                         else:
                             self.logs.write_log_in_file("error", "Pre-processing script failed. Running post-processing script.", True)
                             # call post process script function for each matched item.
diff --git a/figshare/Collection.py b/figshare/Collection.py
index 0859ceb..186c335 100644
--- a/figshare/Collection.py
+++ b/figshare/Collection.py
@@ -302,13 +302,21 @@ def process_collections(self, collections):
                 version["license"] = json.loads('{"value": 2,"name": "CC0","url": "https://creativecommons.org/publicdomain/zero/1.0/"}')
 
                 self.logs.write_log_in_file("info", f"------- Processing collection {collection} version {version['version']}.", True)
-                self.__save_json_in_metadata(collection, version, folder_name)
-                collection_preservation_path = self.preservation_storage_location + os.path.basename(os.path.dirname(os.path.dirname(folder_name)))
-                value_post_process = self.processor.post_process_script_function("Collection", collection_preservation_path)
-                if (value_post_process != 0):
-                    self.logs.write_log_in_file("error", f"collection {collection} - post-processing script failed.", True)
+
+                if self.system_config['dry-run'] == 'False':
+                    self.__save_json_in_metadata(collection, version, folder_name)
+                    collection_preservation_path = self.preservation_storage_location + \
+                        os.path.basename(os.path.dirname(os.path.dirname(folder_name)))
+                    value_post_process = self.processor.post_process_script_function("Collection", collection_preservation_path)
+                    if (value_post_process != 0):
+                        self.logs.write_log_in_file("error", f"collection {collection} - post-processing script failed.", True)
+                    else:
+                        processed_count += 1
                 else:
+                    self.logs.write_log_in_file("info", "*Dry Run* File download and post-processing with "
+                                                + f"{self.system_config['post_process_script_command']} skipped.", True)
                     processed_count += 1
+
         return processed_count, self.already_preserved_counts_dict
 
     """