Merge pull request #103 from UAL-RE/102-feature-check-if-bag-exist-in-ap-trust-prior-to-bagging

Feat: Check if item version is already preserved before bagging (Issue #102)
zoidy authored Sep 16, 2024
2 parents 2813c16 + 937fd45 commit 1499a14
Showing 7 changed files with 580 additions and 45 deletions.
9 changes: 9 additions & 0 deletions .env.sample.ini
@@ -5,6 +5,15 @@ retries = 3
retries_wait = 10
institution = 1077

[aptrust_api]
url =
user =
token =
items_per_page =
alt_identifier_starts_with =
retries = 3
retries_wait = 10

[system]
preservation_storage_location =
logs_location =
3 changes: 3 additions & 0 deletions Config.py
@@ -9,6 +9,9 @@ def __init__(self, fileName):
def figshare_config(self):
return self.config['figshare_api']

def aptrust_config(self):
return self.config['aptrust_api']

def system_config(self):
return self.config['system']

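For reference, a minimal usage sketch of the new `aptrust_config()` accessor (this assumes `Config` wraps Python's `configparser` and is pointed at a local `.env.ini`; the file name and fallback values are illustrative, not part of this commit):

```python
# Minimal sketch: load the local config and read the new [aptrust_api] section.
# Assumes Config wraps configparser and .env.ini follows .env.sample.ini.
from Config import Config

config = Config(".env.ini")          # illustrative local config file name
aptrust = config.aptrust_config()    # accessor added in this commit

# INI values are strings, so cast numeric settings where needed.
base_url = aptrust["url"]
retries = int(aptrust.get("retries", 3))
retries_wait = int(aptrust.get("retries_wait", 10))
```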
21 changes: 15 additions & 6 deletions README.md
@@ -22,14 +22,23 @@ ReBACH is run via the command line as outlined in the 'How to Run' section of th
## How to run:
- Copy the .env.sample.ini file and give it a name of your choice (e.g. .env.ini).
- Fill out the .env.ini file (IMPORTANT: Make sure not to commit this file to Github)
- url - required: The figshare API url
- token - required: Your auth token to your organization's API
- retries - required: Number of times the script should retry API or file system calls if it is unable to connect. Defaults to 3
- retries_wait - required: Number of seconds the script should wait between call retries if it is unable to connect. Defaults to 10
- institution - required: The Figshare Institution ID for your organization
- figshare_api
- url - required: The figshare API url
- token - required: Your auth token to your organization's API
- retries - required: Number of times the script should retry API or file system calls if it is unable to connect. Defaults to 3
- retries_wait - required: Number of seconds the script should wait between call retries if it is unable to connect. Defaults to 10
- institution - required: The Figshare Institution ID for your organization
- aptrust_api
- url - required: The AP Trust member API url including the version
- user - required: Your user email address on AP Trust
- token - required: Your user secret token on AP Trust
  - items_per_page - Maximum number of objects to be returned per page by the API
  - alt_identifier_starts_with - Prefix for the alternate identifier in AP Trust
- retries - required: Number of times the script should retry API or file system calls if it is unable to connect. Defaults to 3
- retries_wait - required: Number of seconds the script should wait between call retries if it is unable to connect. Defaults to 10
- preservation_storage_location - required: The file system location where the preservation folders/packages should be created
- logs_location - required: The file system location where logs should be created. This value will override the one in `bagger/config/default.toml` when bagger is used for post-processing (see post_process_script_command setting below).
- additional_precentage_required - required: How much extra space the preservation storage location should have in order to handle files as a percent. This percent is applied to the total storage needed for all files. I.e. if the value of this field is 10 and the amount of storage needed for files is 1 GB, the script will make sure that the preservation storage location has at least 1.1 GB free. Defaults to 10
- additional_percentage_required - required: How much extra space the preservation storage location should have in order to handle files as a percent. This percent is applied to the total storage needed for all files. I.e. if the value of this field is 10 and the amount of storage needed for files is 1 GB, the script will make sure that the preservation storage location has at least 1.1 GB free. Defaults to 10
- pre_process_script_command - optional: The terminal command (including arguments) to invoke a script to be run BEFORE the files are copied and logic applied to the preservation storage (note: this action is not currently implemented)
- post_process_script_command - required: Specifies the method of performing post-processing steps. This can take only two values: the string 'Bagger', or the path to an external script. If the value is set to 'Bagger', the post-processing steps will consist of running the internal `bagger` module. If the value is set to a path to an external script, the post-processing steps will be executed by invoking the external script through the function 'post_process_script_function'. The post-processing steps are executed AFTER the files are copied and logic applied to the preservation storage.
- curation_storage_location - required: The file system location where the Curation files reside
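For illustration, a filled-in `[aptrust_api]` block might look like the following (every value here is a placeholder; use the URL, credentials, and prefix for your own AP Trust account):

```ini
[aptrust_api]
; Placeholder values only; substitute your organization's AP Trust details.
url = https://repo.example.org/member-api/v3
user = preservation@example.edu
token = replace-with-your-secret-token
items_per_page = 100
alt_identifier_starts_with = example-prefix
retries = 3
retries_wait = 10
```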
101 changes: 76 additions & 25 deletions app.py
@@ -142,15 +142,17 @@ def main():
get_args()
config, log = main()

log.write_log_in_file('info',
"Fetching articles...",
True)
log.write_log_in_file('info', " ", True)
log.write_log_in_file('info', "------- Fetching articles -------", True)
article_obj = Article(config, log, args.ids)
article_data = article_obj.get_articles()
article_data, already_preserved_counts_dict = article_obj.get_articles()

already_preserved_articles_count = len(already_preserved_counts_dict['already_preserved_article_ids'])
already_preserved_versions_count = already_preserved_counts_dict['already_preserved_versions']
published_articles_count = 0
published_articles_versions_count = 0
published_unpublished_count = 0

for i, (k, v) in enumerate(article_data.items()):
published_unpublished_count += 1
if len(v) > 0:
@@ -159,14 +161,12 @@ def main():

log.write_log_in_file('info', "Fetched: "
+ f"Total articles: {published_unpublished_count}, "
+ f"Published articles: {published_articles_count}, "
+ f"Published article versions: {published_articles_versions_count}",
+ f"Published articles: {published_articles_count + already_preserved_articles_count}, "
+ f"Published article versions: {published_articles_versions_count + already_preserved_versions_count}",
True)
print(" ")

log.write_log_in_file('info',
"Fetching collections...",
True)
log.write_log_in_file('info', "------- Fetching collections -------", True)
collection_obj = Collection(config, log, args.ids)
collection_data = collection_obj.get_collections()

@@ -181,52 +181,103 @@
print(" ")

# Start articles processing after completing fetching data from API
processed_articles_versions_count = article_obj.process_articles(article_data)
processed_articles_versions_count, ap_trust_preserved_article_version_count, wasabi_preserved_versions \
= article_obj.process_articles(article_data)

# Start collections processing after completing fetching data from API and articles processing.
processed_collections_versions_count = collection_obj.process_collections(collection_data)
processed_collections_versions_count, already_preserved_collections_counts = collection_obj.process_collections(collection_data)
already_preserved_collections = len(already_preserved_collections_counts['already_preserved_collection_ids'])
already_preserved_collection_versions = already_preserved_collections_counts['already_preserved_versions']
preserved_collection_versions_in_wasabi = already_preserved_collections_counts['wasabi_preserved_versions']
preserved_collection_versions_in_ap_trust = already_preserved_collections_counts['ap_trust_preserved_versions']

log.write_log_in_file('info', ' ', True)
log.write_log_in_file('info', '------- Summary -------', True)
log.write_log_in_file('info',
f"Total articles: \t\t\t\t\t\t\t\t\t{published_unpublished_count}",
True)

log.write_log_in_file('info', '------- Summary -------')
log.write_log_in_file('info',
"Total articles/published articles: \t\t\t\t\t\t"
+ f'{published_unpublished_count} / {published_articles_count}',
"Total published articles/article versions: \t\t\t\t\t"
+ f'{published_articles_count + already_preserved_articles_count} / '
+ f'{published_articles_versions_count + already_preserved_versions_count}',
True)

log.write_log_in_file('info',
"Total processed articles bags already in preservation storage: \t\t\t"
+ f'{article_obj.processor.duplicate_bag_in_preservation_storage_count}',
"Total count of already preserved (skipped) articles / article versions: \t\t"
+ f'{already_preserved_articles_count} / {already_preserved_versions_count}',
True)

if article_obj.processor.duplicate_bag_in_preservation_storage_count > 0:
log.write_log_in_file('warning',
f'Bagger found {article_obj.processor.duplicate_bag_in_preservation_storage_count} duplicate article(s)',
True)

log.write_log_in_file('info',
"Total articles versions matched/published: \t\t\t\t\t" # todo: exclude already-preserved bags from processing
"Total articles versions matched/published (unskipped): \t\t\t\t"
+ f'{article_obj.no_matched} / {published_articles_versions_count}',
True)
log.write_log_in_file('info',
"Total articles versions processed/matched: \t\t\t\t\t"
+ f'{processed_articles_versions_count} / {article_obj.no_matched}',
True)
log.write_log_in_file('info',
"Total count of already preserved article versions in preservation final remote storage: \t\t"
+ f'{ap_trust_preserved_article_version_count}',
True)
log.write_log_in_file('info',
"Total count of already preserved article versions in preservation staging remote storage: \t"
+ f'{wasabi_preserved_versions}',
True)

log.write_log_in_file('info',
"Total articles versions unmatched (published-matched): \t\t\t\t"
+ f'{article_obj.no_unmatched}',
True)
log.write_log_in_file('info',
"Total processed articles bags successfully preserved \t\t\t\t"
"Total processed articles bags successfully preserved: \t\t\t\t"
+ f'{article_obj.processor.bag_preserved_count}',
True)

log.write_log_in_file('info', "", True)
log.write_log_in_file('info',
"Total collections: \t\t\t\t\t\t\t\t"
+ f'{collections_count}',
True)
log.write_log_in_file('info',
"Total published collections / collection versions: \t\t\t\t"
+ f'{collections_count} / {collections_versions_count}',
True)

log.write_log_in_file('info',
"Total collections/published collections: \t\t\t\t\t\t"
+ f'{collections_count} / {collections_count}',
"Total count of already preserved (skipped) collections / collection versions: \t"
+ f'{already_preserved_collections} / {already_preserved_collection_versions}',
True)

log.write_log_in_file('info',
"Total collections versions processed/published: \t\t\t\t\t"
+ f'{processed_collections_versions_count} / {collections_versions_count}',
+ f'{processed_collections_versions_count} / {collections_versions_count - already_preserved_collection_versions}',
True)

if collection_obj.processor.duplicate_bag_in_preservation_storage_count > 0:
log.write_log_in_file('warning',
f'Bagger found {collection_obj.processor.duplicate_bag_in_preservation_storage_count} duplicate collection(s)',
True)

log.write_log_in_file('info',
"Total count of already preserved collection versions in preservation final remote storage: \t"
+ f'{preserved_collection_versions_in_ap_trust}',
True)

log.write_log_in_file('info',
"Total collections already preserved: \t\t\t\t\t\t"
+ f'{collection_obj.processor.duplicate_bag_in_preservation_storage_count}',
"Total count of already preserved collection versions in preservation staging remote storage: \t"
+ f'{preserved_collection_versions_in_wasabi}',
True)

if processed_articles_versions_count != published_articles_versions_count or processed_collections_versions_count != collections_versions_count:
if processed_articles_versions_count != published_articles_versions_count or \
processed_collections_versions_count != (collections_versions_count - already_preserved_collection_versions):
log.write_log_in_file('warning',
'The number of articles versions or collections versions sucessfully processed is different'
'The number of articles versions or collections versions successfully processed is different'
+ ' than the number fetched. Check the log for details.', True)

log.write_log_in_file('info',
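To make the new control flow concrete, here is a hedged sketch of the kind of pre-bagging check this commit introduces: ask the AP Trust member API whether an object with the article version's alternate identifier already exists, and skip bagging if it does. The endpoint path, query parameter names, request headers, and identifier format below are assumptions for illustration, not ReBACH's actual implementation.

```python
# Hedged sketch only: endpoint, parameter, and header names are assumptions.
import requests

def is_version_already_preserved(aptrust_cfg, article_id, version_no):
    """Return True if AP Trust already appears to hold this article version."""
    # Build the alternate identifier from the configured prefix (format assumed).
    alt_identifier = f"{aptrust_cfg['alt_identifier_starts_with']}{article_id}_v{version_no}"
    response = requests.get(
        f"{aptrust_cfg['url']}/objects",                # assumed endpoint path
        params={
            "alt_identifier": alt_identifier,           # assumed parameter name
            "per_page": aptrust_cfg["items_per_page"],  # assumed parameter name
        },
        headers={
            "X-Pharos-API-User": aptrust_cfg["user"],   # assumed header names
            "X-Pharos-API-Key": aptrust_cfg["token"],
        },
        timeout=30,
    )
    response.raise_for_status()
    return len(response.json().get("results", [])) > 0
```

Counts of skipped items are then accumulated into structures such as `already_preserved_counts_dict` (with keys like `already_preserved_article_ids` and `already_preserved_versions`) and reported in the summary section of app.py shown above.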