
Fix: Preprocessing of articles stops if curation folder does not exist for an article (Issue #105) #107

2 changes: 2 additions & 0 deletions README.md
@@ -58,5 +58,7 @@ These parameters are only available on the command line.
## Execution notes
- ReBACH will attempt to fetch all items in the institutional instance. Items that are not published (curation_status != 'approved') will be ignored.
- Items that are embargoed are also fetched; however, due to limitations in the API, only the latest version can be fetched until the embargo expires or is removed.
- While fetching, ReBACH checks the preservation remote storages for a preserved copy of each item. If a preservation copy of an item is found and confirmed, the item will be ignored in subsequent stages.
- Checking the preservation final remote storage for a preserved copy of an article requires the size of the article's curation storage folder. If an error occurs while calculating that size, the error is recorded and execution stops unless the `--continue-on-error` flag is set (see the sketch after this list).
- When processing collections, ReBACH records which items are part of the collection by appending them to the collection's JSON as returned by the Figshare API.
- If an item encounters errors, it will not be processed, and any partial files in preservation staging storage are deleted.
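
For illustration, here is a minimal sketch (not part of this PR) of the skip-vs-abort behavior described above. The wrapper name `can_process_version` is hypothetical; `calculate_payload_size`, the `continue-on-error` config key, and `write_log_in_file` are the names actually used in this PR, and the sketch assumes the ReBACH repository is importable. In the PR itself this check lives inside `__get_article_metadata_by_version` in `figshare/Article.py`.

```python
# Hypothetical wrapper around the real helper changed in this PR.
import sys

from figshare.Utils import calculate_payload_size  # returns 0 if the curation folder is missing


def can_process_version(config: dict, article_id: int, version: dict,
                        version_data: dict, log) -> bool:
    """Return True if this article version can be processed, False to skip it."""
    if calculate_payload_size(config, version_data) == 0:
        msg = (f"Curation folder for article {article_id} "
               f"version {version['version']} not found.")
        if config['continue-on-error'] == "False":
            # Flag not set: record the error and stop the whole run.
            log.write_log_in_file("error", msg, True)
            log.write_log_in_file("info", "Aborting execution.", True)
            sys.exit()
        # Flag set: record the error and skip only this article version.
        log.write_log_in_file("error", msg + " Article version will be skipped.", True)
        return False
    return True
```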
22 changes: 18 additions & 4 deletions app.py
@@ -149,6 +149,8 @@ def main():

already_preserved_articles_count = len(already_preserved_counts_dict['already_preserved_article_ids'])
already_preserved_versions_count = already_preserved_counts_dict['already_preserved_versions']
articles_with_error_count = len(already_preserved_counts_dict['articles_with_error'])
article_versions_with_error_count = already_preserved_counts_dict['article_versions_with_error']
published_articles_count = 0
published_articles_versions_count = 0
published_unpublished_count = 0
@@ -161,8 +163,9 @@ def main():

log.write_log_in_file('info', "Fetched: "
+ f"Total articles: {published_unpublished_count}, "
+ f"Published articles: {published_articles_count + already_preserved_articles_count}, "
+ f"Published article versions: {published_articles_versions_count + already_preserved_versions_count}",
+ f"Published articles: {published_articles_count + already_preserved_articles_count + articles_with_error_count}, "
+ "Published article versions: "
+ f"{published_articles_versions_count + already_preserved_versions_count + article_versions_with_error_count}",
True)
print(" ")

@@ -199,15 +202,26 @@ def main():

log.write_log_in_file('info',
"Total published articles/article versions: \t\t\t\t\t"
+ f'{published_articles_count + already_preserved_articles_count} / '
+ f'{published_articles_versions_count + already_preserved_versions_count}',
+ f'{published_articles_count + already_preserved_articles_count + articles_with_error_count} / '
+ f'{published_articles_versions_count + already_preserved_versions_count + article_versions_with_error_count}',
True)

log.write_log_in_file('info',
"Total count of already preserved (skipped) articles / article versions: \t\t"
+ f'{already_preserved_articles_count} / {already_preserved_versions_count}',
True)

log.write_log_in_file('info',
"Total count of articles with fetch error / articles: \t\t\t\t"
+ f'{articles_with_error_count} / {published_unpublished_count}',
True)

log.write_log_in_file('info',
"Total count of article versions with fetch error / article versions: \t\t"
+ f'{article_versions_with_error_count} / '
+ f'{published_articles_versions_count + already_preserved_versions_count + article_versions_with_error_count}',
True)

if article_obj.processor.duplicate_bag_in_preservation_storage_count > 0:
log.write_log_in_file('warning',
f'Bagger found {article_obj.processor.duplicate_bag_in_preservation_storage_count} duplicate article(s)',
21 changes: 19 additions & 2 deletions figshare/Article.py
@@ -51,7 +51,8 @@ def __init__(self, config, log, ids):
self.no_matched = 0
self.no_unmatched = 0
self.already_preserved_counts_dict = {'already_preserved_article_ids': set(), 'already_preserved_versions': 0,
'wasabi_preserved_versions': 0, 'ap_trust_preserved_versions': 0}
'wasabi_preserved_versions': 0, 'ap_trust_preserved_versions': 0,
'articles_with_error': set(), 'article_versions_with_error': 0}
self.skipped_article_versions = {}
self.processor = Integration(self.config_obj, self.logs)

@@ -260,6 +261,22 @@ def __get_article_metadata_by_version(self, version, article_id):
if (get_response.status_code == 200):
version_data = get_response.json()
payload_size = calculate_payload_size(self.system_config, version_data)

if payload_size == 0:
if self.system_config['continue-on-error'] == "False":
self.logs.write_log_in_file("error",
f"Curation folder for article {article_id} version {version['version']} not found.",
True)
self.logs.write_log_in_file("info", "Aborting execution.", True)
exit()
self.already_preserved_counts_dict['articles_with_error'].add(article_id)
self.already_preserved_counts_dict['article_versions_with_error'] += 1
self.logs.write_log_in_file("error",
f"Curation folder for article {article_id} version {version['version']} not found."
+ " Article version will be skipped.",
True)
return None

total_file_size = version_data['size']
files = []
error = ""
Expand Down Expand Up @@ -310,7 +327,7 @@ def __get_article_metadata_by_version(self, version, article_id):
self.already_preserved_counts_dict['ap_trust_preserved_versions'] += 1
self.logs.write_log_in_file("info",
f"Article {article_id} version {version['version']} "
+ "already preserved in preservation staging remote storage.",
+ "already preserved in preservation final remote storage.",
True)

if already_preserved:
12 changes: 11 additions & 1 deletion figshare/Utils.py
@@ -248,20 +248,28 @@ def calculate_ual_rdm_size(config, article_id: int, version: str):
article_version_ual_rdm = ""
version_ual_rdm_size = 0
curation_storage = config['curation_storage_location']
if os.access(curation_storage, os.R_OK):
if os.path.exists(curation_storage) and os.access(curation_storage, os.R_OK):
curation_storage_items = os.scandir(curation_storage)
for item in curation_storage_items:
if item.is_dir() and item.name.__contains__(str(article_id)):
article_dir = os.path.join(curation_storage, item.name)
break
if not os.path.exists(article_dir):
return 0
for item in os.scandir(article_dir):
if item.is_dir() and item.name.__contains__(version):
article_version_dir = os.path.join(article_dir, item.name)
break

if not os.path.exists(article_version_dir):
return 0
for item in os.scandir(article_version_dir):
if item.is_dir() and item.name.__contains__('UAL_RDM'):
article_version_ual_rdm = os.path.join(article_version_dir, item.name)
break

if not os.path.exists(article_version_ual_rdm):
return 0
for item in os.scandir(article_version_ual_rdm):
file_size = os.path.getsize(os.path.join(article_version_ual_rdm, item.name))
version_ual_rdm_size += file_size
@@ -317,6 +325,8 @@ def calculate_payload_size(config: dict, version_data: dict) -> int:
if int(version_no) > 9:
version = f"v{str(version_no)}"
version_ual_rdm_size = calculate_ual_rdm_size(config, article_id, version)
if version_ual_rdm_size == 0:
return 0
json_file_size = calculate_json_file_size(version_data)
payload_size = version_ual_rdm_size + json_file_size + article_files_size
