Merge pull request #103 from UAL-RE/102-feature-check-if-bag-exist-in-ap-trust-prior-to-bagging

Feat: Check if item version is already preserved before bagging (Issue #102)
zoidy authored Sep 16, 2024
2 parents 2813c16 + 937fd45 commit 1499a14
Showing 7 changed files with 580 additions and 45 deletions.
9 changes: 9 additions & 0 deletions .env.sample.ini
@@ -5,6 +5,15 @@ retries = 3
retries_wait = 10
institution = 1077

[aptrust_api]
url =
user =
token =
items_per_page =
alt_identifier_starts_with =
retries = 3
retries_wait = 10

[system]
preservation_storage_location =
logs_location =
3 changes: 3 additions & 0 deletions Config.py
@@ -9,6 +9,9 @@ def __init__(self, fileName):
def figshare_config(self):
return self.config['figshare_api']

def aptrust_config(self):
return self.config['aptrust_api']

def system_config(self):
return self.config['system']

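For reference, a minimal usage sketch of the new `aptrust_config()` accessor (this assumes `Config` wraps Python's `configparser` and is pointed at a local `.env.ini`; the file name and fallback values are illustrative, not part of this commit):

```python
# Minimal sketch: load the local config and read the new [aptrust_api] section.
# Assumes Config wraps configparser and .env.ini follows .env.sample.ini.
from Config import Config

config = Config(".env.ini")          # illustrative local config file name
aptrust = config.aptrust_config()    # accessor added in this commit

# INI values are strings, so cast numeric settings where needed.
base_url = aptrust["url"]
retries = int(aptrust.get("retries", 3))
retries_wait = int(aptrust.get("retries_wait", 10))
```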
21 changes: 15 additions & 6 deletions README.md
@@ -22,14 +22,23 @@ ReBACH is run via the command line as outlined in the 'How to Run' section of th
## How to run:
- Copy the .env.sample.ini file and give it a name of your choice (e.g. .env.ini).
- Fill out the .env.ini file (IMPORTANT: Make sure not to commit this file to Github)
- url - required: The figshare API url
- token - required: Your auth token to your organization's API
- retries - required: Number of times the script should retry API or file system calls if it is unable to connect. Defaults to 3
- retries_wait - required: Number of seconds the script should wait between call retries if it is unable to connect. Defaults to 10
- institution - required: The Figshare Institution ID for your organization
- figshare_api
- url - required: The figshare API url
- token - required: Your auth token to your organization's API
- retries - required: Number of times the script should retry API or file system calls if it is unable to connect. Defaults to 3
- retries_wait - required: Number of seconds the script should wait between call retries if it is unable to connect. Defaults to 10
- institution - required: The Figshare Institution ID for your organization
- aptrust_api
- url - required: The AP Trust member API url including the version
- user - required: Your user email address on AP Trust
- token - required: Your user secret token on AP Trust
  - items_per_page - Maximum number of objects to be returned per page by the API
  - alt_identifier_starts_with - Prefix for the alternate identifier in AP Trust
- retries - required: Number of times the script should retry API or file system calls if it is unable to connect. Defaults to 3
- retries_wait - required: Number of seconds the script should wait between call retries if it is unable to connect. Defaults to 10
- preservation_storage_location - required: The file system location where the preservation folders/packages should be created
- logs_location - required: The file system location where logs should be created. This value will override the one in `bagger/config/default.toml` when bagger is used for post-processing (see post_process_script_command setting below).
- additional_precentage_required - required: How much extra space the preservation storage location should have in order to handle files as a percent. This percent is applied to the total storage needed for all files. I.e. if the value of this field is 10 and the amount of storage needed for files is 1 GB, the script will make sure that the preservation storage location has at least 1.1 GB free. Defaults to 10
- additional_percentage_required - required: How much extra space the preservation storage location should have in order to handle files as a percent. This percent is applied to the total storage needed for all files. I.e. if the value of this field is 10 and the amount of storage needed for files is 1 GB, the script will make sure that the preservation storage location has at least 1.1 GB free. Defaults to 10
- pre_process_script_command - optional: The terminal command (including arguments) to invoke a script to be run BEFORE the files are copied and logic applied to the preservation storage (note: this action is not currently implemented)
- post_process_script_command - required: Specifies the method of performing post-processing steps. This can take only two values: the string 'Bagger', or the path to an external script. If the value is set to 'Bagger', the post-processing steps will consist of running the internal `bagger` module. If the value is set to a path to an external script, the post-processing steps will be executed by invoking the external script through the function 'post_process_script_function'. The post-processing steps are executed AFTER the files are copied and logic applied to the preservation storage.
- curation_storage_location - required: The file system location where the Curation files reside
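For illustration, a filled-in `[aptrust_api]` block might look like the following (every value here is a placeholder; use the URL, credentials, and prefix for your own AP Trust account):

```ini
[aptrust_api]
; Placeholder values only; substitute your organization's AP Trust details.
url = https://repo.example.org/member-api/v3
user = preservation@example.edu
token = replace-with-your-secret-token
items_per_page = 100
alt_identifier_starts_with = example-prefix
retries = 3
retries_wait = 10
```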
101 changes: 76 additions & 25 deletions app.py
@@ -142,15 +142,17 @@ def main():
get_args()
config, log = main()

log.write_log_in_file('info',
"Fetching articles...",
True)
log.write_log_in_file('info', " ", True)
log.write_log_in_file('info', "------- Fetching articles -------", True)
article_obj = Article(config, log, args.ids)
article_data = article_obj.get_articles()
article_data, already_preserved_counts_dict = article_obj.get_articles()

already_preserved_articles_count = len(already_preserved_counts_dict['already_preserved_article_ids'])
already_preserved_versions_count = already_preserved_counts_dict['already_preserved_versions']
published_articles_count = 0
published_articles_versions_count = 0
published_unpublished_count = 0

for i, (k, v) in enumerate(article_data.items()):
published_unpublished_count += 1
if len(v) > 0:
@@ -159,14 +161,12 @@ def main():

log.write_log_in_file('info', "Fetched: "
+ f"Total articles: {published_unpublished_count}, "
+ f"Published articles: {published_articles_count}, "
+ f"Published article versions: {published_articles_versions_count}",
+ f"Published articles: {published_articles_count + already_preserved_articles_count}, "
+ f"Published article versions: {published_articles_versions_count + already_preserved_versions_count}",
True)
print(" ")

log.write_log_in_file('info',
"Fetching collections...",
True)
log.write_log_in_file('info', "------- Fetching collections -------", True)
collection_obj = Collection(config, log, args.ids)
collection_data = collection_obj.get_collections()

@@ -181,52 +181,103 @@
print(" ")

# Start articles processing after completing fetching data from API
processed_articles_versions_count = article_obj.process_articles(article_data)
processed_articles_versions_count, ap_trust_preserved_article_version_count, wasabi_preserved_versions \
= article_obj.process_articles(article_data)

# Start collections processing after completing fetching data from API and articles processing.
processed_collections_versions_count = collection_obj.process_collections(collection_data)
processed_collections_versions_count, already_preserved_collections_counts = collection_obj.process_collections(collection_data)
already_preserved_collections = len(already_preserved_collections_counts['already_preserved_collection_ids'])
already_preserved_collection_versions = already_preserved_collections_counts['already_preserved_versions']
preserved_collection_versions_in_wasabi = already_preserved_collections_counts['wasabi_preserved_versions']
preserved_collection_versions_in_ap_trust = already_preserved_collections_counts['ap_trust_preserved_versions']

log.write_log_in_file('info', ' ', True)
log.write_log_in_file('info', '------- Summary -------', True)
log.write_log_in_file('info',
f"Total articles: \t\t\t\t\t\t\t\t\t{published_unpublished_count}",
True)

log.write_log_in_file('info', '------- Summary -------')
log.write_log_in_file('info',
"Total articles/published articles: \t\t\t\t\t\t"
+ f'{published_unpublished_count} / {published_articles_count}',
"Total published articles/article versions: \t\t\t\t\t"
+ f'{published_articles_count + already_preserved_articles_count} / '
+ f'{published_articles_versions_count + already_preserved_versions_count}',
True)

log.write_log_in_file('info',
"Total processed articles bags already in preservation storage: \t\t\t"
+ f'{article_obj.processor.duplicate_bag_in_preservation_storage_count}',
"Total count of already preserved (skipped) articles / article versions: \t\t"
+ f'{already_preserved_articles_count} / {already_preserved_versions_count}',
True)

if article_obj.processor.duplicate_bag_in_preservation_storage_count > 0:
log.write_log_in_file('warning',
f'Bagger found {article_obj.processor.duplicate_bag_in_preservation_storage_count} duplicate article(s)',
True)

log.write_log_in_file('info',
"Total articles versions matched/published: \t\t\t\t\t" # todo: exclude already-preserved bags from processing
"Total articles versions matched/published (unskipped): \t\t\t\t"
+ f'{article_obj.no_matched} / {published_articles_versions_count}',
True)
log.write_log_in_file('info',
"Total articles versions processed/matched: \t\t\t\t\t"
+ f'{processed_articles_versions_count} / {article_obj.no_matched}',
True)
log.write_log_in_file('info',
"Total count of already preserved article versions in preservation final remote storage: \t\t"
+ f'{ap_trust_preserved_article_version_count}',
True)
log.write_log_in_file('info',
"Total count of already preserved article versions in preservation staging remote storage: \t"
+ f'{wasabi_preserved_versions}',
True)

log.write_log_in_file('info',
"Total articles versions unmatched (published-matched): \t\t\t\t"
+ f'{article_obj.no_unmatched}',
True)
log.write_log_in_file('info',
"Total processed articles bags successfully preserved \t\t\t\t"
"Total processed articles bags successfully preserved: \t\t\t\t"
+ f'{article_obj.processor.bag_preserved_count}',
True)

log.write_log_in_file('info', "", True)
log.write_log_in_file('info',
"Total collections: \t\t\t\t\t\t\t\t"
+ f'{collections_count}',
True)
log.write_log_in_file('info',
"Total published collections / collection versions: \t\t\t\t"
+ f'{collections_count} / {collections_versions_count}',
True)

log.write_log_in_file('info',
"Total collections/published collections: \t\t\t\t\t\t"
+ f'{collections_count} / {collections_count}',
"Total count of already preserved (skipped) collections / collection versions: \t"
+ f'{already_preserved_collections} / {already_preserved_collection_versions}',
True)

log.write_log_in_file('info',
"Total collections versions processed/published: \t\t\t\t\t"
+ f'{processed_collections_versions_count} / {collections_versions_count}',
+ f'{processed_collections_versions_count} / {collections_versions_count - already_preserved_collection_versions}',
True)

if collection_obj.processor.duplicate_bag_in_preservation_storage_count > 0:
log.write_log_in_file('warning',
f'Bagger found {collection_obj.processor.duplicate_bag_in_preservation_storage_count} duplicate collection(s)',
True)

log.write_log_in_file('info',
"Total count of already preserved collection versions in preservation final remote storage: \t"
+ f'{preserved_collection_versions_in_ap_trust}',
True)

log.write_log_in_file('info',
"Total collections already preserved: \t\t\t\t\t\t"
+ f'{collection_obj.processor.duplicate_bag_in_preservation_storage_count}',
"Total count of already preserved collection versions in preservation staging remote storage: \t"
+ f'{preserved_collection_versions_in_wasabi}',
True)

if processed_articles_versions_count != published_articles_versions_count or processed_collections_versions_count != collections_versions_count:
if processed_articles_versions_count != published_articles_versions_count or \
processed_collections_versions_count != (collections_versions_count - already_preserved_collection_versions):
log.write_log_in_file('warning',
'The number of articles versions or collections versions sucessfully processed is different'
'The number of articles versions or collections versions successfully processed is different'
+ ' than the number fetched. Check the log for details.', True)

log.write_log_in_file('info',
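To make the new control flow concrete, here is a hedged sketch of the kind of pre-bagging check this commit introduces: ask the AP Trust member API whether an object with the article version's alternate identifier already exists, and skip bagging if it does. The endpoint path, query parameter names, request headers, and identifier format below are assumptions for illustration, not ReBACH's actual implementation.

```python
# Hedged sketch only: endpoint, parameter, and header names are assumptions.
import requests

def is_version_already_preserved(aptrust_cfg, article_id, version_no):
    """Return True if AP Trust already appears to hold this article version."""
    # Build the alternate identifier from the configured prefix (format assumed).
    alt_identifier = f"{aptrust_cfg['alt_identifier_starts_with']}{article_id}_v{version_no}"
    response = requests.get(
        f"{aptrust_cfg['url']}/objects",                # assumed endpoint path
        params={
            "alt_identifier": alt_identifier,           # assumed parameter name
            "per_page": aptrust_cfg["items_per_page"],  # assumed parameter name
        },
        headers={
            "X-Pharos-API-User": aptrust_cfg["user"],   # assumed header names
            "X-Pharos-API-Key": aptrust_cfg["token"],
        },
        timeout=30,
    )
    response.raise_for_status()
    return len(response.json().get("results", [])) > 0
```

Counts of skipped items are then accumulated into structures such as `already_preserved_counts_dict` (with keys like `already_preserved_article_ids` and `already_preserved_versions`) and reported in the summary section of app.py shown above.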