
Fix: Preprocessing of articles stops if curation folder does not exist for an article (Issue #105) #107

2 changes: 2 additions & 0 deletions README.md
@@ -58,5 +58,7 @@ These parameters are only available on the command line.
## Execution notes
- ReBACH will attempt to fetch all items in the institutional instance. Items that are not published (curation_status != 'approved') will be ignored.
- Items that are embargoed are also fetched; however, due to limitations in the API, only the latest version can be fetched until the embargo expires or is removed.
- While fetching, ReBACH checks the preservation remote storages for a preserved copy of each item. If a preservation copy of an item is found and confirmed, the item will be ignored in subsequent stages.
- Checking the preservation final remote storage for a preserved copy of an article requires the size of the article's curation storage folder. If an error occurs while calculating that size, the error is recorded and execution stops unless the `--continue-on-error` flag is set (see the sketch after this list).
- When processing collections, ReBACH records which items are part of the collection by appending them to the collection's JSON as returned by the Figshare API.
- If an item encounters errors, it will not be processed, and any partial files in preservation staging storage are deleted.
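
For illustration, here is a minimal sketch (not part of this PR) of the skip-vs-abort behavior described above. The wrapper name `can_process_version` is hypothetical; `calculate_payload_size`, the `continue-on-error` config key, and `write_log_in_file` are the names actually used in this PR, and the sketch assumes the ReBACH repository is importable. In the PR itself this check lives inside `__get_article_metadata_by_version` in `figshare/Article.py`.

```python
# Hypothetical wrapper around the real helper changed in this PR.
import sys

from figshare.Utils import calculate_payload_size  # returns 0 if the curation folder is missing


def can_process_version(config: dict, article_id: int, version: dict,
                        version_data: dict, log) -> bool:
    """Return True if this article version can be processed, False to skip it."""
    if calculate_payload_size(config, version_data) == 0:
        msg = (f"Curation folder for article {article_id} "
               f"version {version['version']} not found.")
        if config['continue-on-error'] == "False":
            # Flag not set: record the error and stop the whole run.
            log.write_log_in_file("error", msg, True)
            log.write_log_in_file("info", "Aborting execution.", True)
            sys.exit()
        # Flag set: record the error and skip only this article version.
        log.write_log_in_file("error", msg + " Article version will be skipped.", True)
        return False
    return True
```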
22 changes: 18 additions & 4 deletions app.py
@@ -149,6 +149,8 @@ def main():

already_preserved_articles_count = len(already_preserved_counts_dict['already_preserved_article_ids'])
already_preserved_versions_count = already_preserved_counts_dict['already_preserved_versions']
articles_with_error_count = len(already_preserved_counts_dict['articles_with_error'])
article_versions_with_error_count = already_preserved_counts_dict['article_versions_with_error']
published_articles_count = 0
published_articles_versions_count = 0
published_unpublished_count = 0
@@ -161,8 +163,9 @@ def main():

log.write_log_in_file('info', "Fetched: "
+ f"Total articles: {published_unpublished_count}, "
+ f"Published articles: {published_articles_count + already_preserved_articles_count}, "
+ f"Published article versions: {published_articles_versions_count + already_preserved_versions_count}",
+ f"Published articles: {published_articles_count + already_preserved_articles_count + articles_with_error_count}, "
+ "Published article versions: "
+ f"{published_articles_versions_count + already_preserved_versions_count + article_versions_with_error_count}",
True)
print(" ")

@@ -199,15 +202,26 @@ def main():

log.write_log_in_file('info',
"Total published articles/article versions: \t\t\t\t\t"
+ f'{published_articles_count + already_preserved_articles_count} / '
+ f'{published_articles_versions_count + already_preserved_versions_count}',
+ f'{published_articles_count + already_preserved_articles_count + articles_with_error_count} / '
+ f'{published_articles_versions_count + already_preserved_versions_count + article_versions_with_error_count}',
True)

log.write_log_in_file('info',
"Total count of already preserved (skipped) articles / article versions: \t\t"
+ f'{already_preserved_articles_count} / {already_preserved_versions_count}',
True)

log.write_log_in_file('info',
"Total count of articles with fetch error / articles: \t\t\t\t"
+ f'{articles_with_error_count} / {published_unpublished_count}',
True)

log.write_log_in_file('info',
"Total count of article versions with fetch error / article versions: \t\t"
+ f'{article_versions_with_error_count} / '
+ f'{published_articles_versions_count + already_preserved_versions_count + article_versions_with_error_count}',
True)

if article_obj.processor.duplicate_bag_in_preservation_storage_count > 0:
log.write_log_in_file('warning',
f'Bagger found {article_obj.processor.duplicate_bag_in_preservation_storage_count} duplicate article(s)',
21 changes: 19 additions & 2 deletions figshare/Article.py
@@ -51,7 +51,8 @@ def __init__(self, config, log, ids):
self.no_matched = 0
self.no_unmatched = 0
self.already_preserved_counts_dict = {'already_preserved_article_ids': set(), 'already_preserved_versions': 0,
'wasabi_preserved_versions': 0, 'ap_trust_preserved_versions': 0}
'wasabi_preserved_versions': 0, 'ap_trust_preserved_versions': 0,
'articles_with_error': set(), 'article_versions_with_error': 0}
self.skipped_article_versions = {}
self.processor = Integration(self.config_obj, self.logs)

@@ -260,6 +261,22 @@ def __get_article_metadata_by_version(self, version, article_id):
if (get_response.status_code == 200):
version_data = get_response.json()
payload_size = calculate_payload_size(self.system_config, version_data)

if payload_size == 0:
if self.system_config['continue-on-error'] == "False":
self.logs.write_log_in_file("error",
f"Curation folder for article {article_id} version {version['version']} not found.",
True)
self.logs.write_log_in_file("info", "Aborting execution.", True)
exit()
self.already_preserved_counts_dict['articles_with_error'].add(article_id)
self.already_preserved_counts_dict['article_versions_with_error'] += 1
self.logs.write_log_in_file("error",
f"Curation folder for article {article_id} version {version['version']} not found."
+ " Article version will be skipped.",
True)
return None

total_file_size = version_data['size']
files = []
error = ""
Expand Down Expand Up @@ -310,7 +327,7 @@ def __get_article_metadata_by_version(self, version, article_id):
self.already_preserved_counts_dict['ap_trust_preserved_versions'] += 1
self.logs.write_log_in_file("info",
f"Article {article_id} version {version['version']} "
+ "already preserved in preservation staging remote storage.",
+ "already preserved in preservation final remote storage.",
True)

if already_preserved:
12 changes: 11 additions & 1 deletion figshare/Utils.py
@@ -248,20 +248,28 @@ def calculate_ual_rdm_size(config, article_id: int, version: str):
article_version_ual_rdm = ""
version_ual_rdm_size = 0
curation_storage = config['curation_storage_location']
if os.access(curation_storage, os.R_OK):
if os.path.exists(curation_storage) and os.access(curation_storage, os.R_OK):
curation_storage_items = os.scandir(curation_storage)
for item in curation_storage_items:
if item.is_dir() and item.name.__contains__(str(article_id)):
article_dir = os.path.join(curation_storage, item.name)
break
if not os.path.exists(article_dir):
return 0
for item in os.scandir(article_dir):
if item.is_dir() and item.name.__contains__(version):
article_version_dir = os.path.join(article_dir, item.name)
break

if not os.path.exists(article_version_dir):
return 0
for item in os.scandir(article_version_dir):
if item.is_dir() and item.name.__contains__('UAL_RDM'):
article_version_ual_rdm = os.path.join(article_version_dir, item.name)
break

if not os.path.exists(article_version_ual_rdm):
return 0
for item in os.scandir(article_version_ual_rdm):
file_size = os.path.getsize(os.path.join(article_version_ual_rdm, item.name))
version_ual_rdm_size += file_size
@@ -317,6 +325,8 @@ def calculate_payload_size(config: dict, version_data: dict) -> int:
if int(version_no) > 9:
version = f"v{str(version_no)}"
version_ual_rdm_size = calculate_ual_rdm_size(config, article_id, version)
if version_ual_rdm_size == 0:
return 0
json_file_size = calculate_json_file_size(version_data)
payload_size = version_ual_rdm_size + json_file_size + article_files_size
