From 5d905ec7f38f44af8fc569c3a7cb25e55043c6f9 Mon Sep 17 00:00:00 2001 From: dansand Date: Wed, 1 May 2024 14:15:50 +1000 Subject: [PATCH] updates to ro-crate flattening, compaction; requests: add maxRetry argument, and handle some downstream type errors when response returns MaxRetryError --- .github/scripts/parse_issue.py | 3 ++ .github/scripts/request_utils.py | 28 ++++++++++++---- .github/scripts/ro_crate_utils.py | 31 +++++++++++++++++- .github/scripts/write_repo_contents.py | 44 +++++++++++++++----------- 4 files changed, 79 insertions(+), 27 deletions(-) diff --git a/.github/scripts/parse_issue.py b/.github/scripts/parse_issue.py index 5072f6f..f1307a9 100644 --- a/.github/scripts/parse_issue.py +++ b/.github/scripts/parse_issue.py @@ -567,6 +567,9 @@ def parse_issue(issue): else: error_log += "**Computer URI/DOI**\n" + response + log1 + "\n" + #except: + #error_log += "**Computer URI/DOI**\n" + "there was a problem parsing Computer URI/DOI" + log1 + "\n" + data_dict["computer_resource"] = computer_record diff --git a/.github/scripts/request_utils.py b/.github/scripts/request_utils.py index 1c5c35e..09758bf 100644 --- a/.github/scripts/request_utils.py +++ b/.github/scripts/request_utils.py @@ -1,5 +1,7 @@ import requests import os +from requests.adapters import HTTPAdapter +from requests.packages.urllib3.util.retry import Retry # Base URLs configuration BASE_URLS = { @@ -16,6 +18,19 @@ # Initialize a requests session session = requests.Session() + +# Configure retries +max_retries = 3 # Set the maximum number of retries +retry_strategy = Retry( + total=max_retries, + status_forcelist=[429, 500, 502, 503, 504], # Specify which status codes to retry on + allowed_methods=["HEAD", "GET", "OPTIONS"], # Use `allowed_methods` for urllib3 v1.26.0 or later + backoff_factor=1 # Defines the delay between retries +) +adapter = HTTPAdapter(max_retries=retry_strategy) +session.mount("http://", adapter) +session.mount("https://", adapter) + def get_record(record_type, 
record_id): log = "" metadata = {} @@ -23,15 +38,12 @@ def get_record(record_type, record_id): if record_type not in BASE_URLS: raise ValueError(f"Record type `{record_type}` not supported") - # Define content types to try - content_types = ["application/ld+json", "application/json"] - - # Iterate over URLs and content types to fetch the record - #for url in urls: - url = BASE_URLS[record_type] + record_id print(url) + # Define content types to try + content_types = ["application/ld+json", "application/json"] + for content_type in content_types: headers = {"Content-Type": content_type, "Accept": content_type} @@ -54,6 +66,7 @@ def get_record(record_type, record_id): return metadata, log + def search_organization(org_url): log = "" ror_id = "" @@ -103,4 +116,5 @@ def check_uri(uri): return "OK" except Exception as err: - return err.args[0] + #return err.args[0] + return str(err) # 01/05/24: Convert the error to a string to avoid TypeError when we concatenate to log diff --git a/.github/scripts/ro_crate_utils.py b/.github/scripts/ro_crate_utils.py index fb291f4..5158b64 100644 --- a/.github/scripts/ro_crate_utils.py +++ b/.github/scripts/ro_crate_utils.py @@ -669,7 +669,7 @@ def get_default_contexts(context_urls=[ Note: this function was set up to try to work with multiple contexts, however this is not working properly currently, it just returns the default ro-crate context ("https://w3id.org/ro/crate/1.1/context") in json format. - + """ # Define paths for local testing and GitHub workflow @@ -716,3 +716,32 @@ def get_default_contexts(context_urls=[ merged_context.update(context) return context_list, merged_context + +def replace_keys_recursive(obj): + """ + Recursively walks through a nested dictionary and replaces keys 'id' and 'type' + with '@id' and '@type' respectively. + + Args: + obj (dict, list, set): The input object to transform. + + Returns: + dict, list, set: The transformed object with keys replaced. 
+ """ + if isinstance(obj, dict): + new_dict = {} + for key, value in obj.items(): + new_key = key + if key == 'id': + new_key = '@id' + elif key == 'type': + new_key = '@type' + new_dict[new_key] = replace_keys_recursive(value) + return new_dict + elif isinstance(obj, list): + return [replace_keys_recursive(item) for item in obj] + elif isinstance(obj, set): + # Convert set to list, process it, and convert it back to set + return set(replace_keys_recursive(list(obj))) + else: + return obj diff --git a/.github/scripts/write_repo_contents.py b/.github/scripts/write_repo_contents.py index 02ea7df..ec31a3b 100644 --- a/.github/scripts/write_repo_contents.py +++ b/.github/scripts/write_repo_contents.py @@ -3,7 +3,7 @@ from github import Github, Auth from parse_issue import parse_issue from crosswalks import dict_to_metadata, dict_to_yaml, dict_to_report -from ro_crate_utils import get_default_contexts +from ro_crate_utils import replace_keys_recursive from yaml_utils import format_yaml_string from copy_files import copy_files from ruamel.yaml import YAML @@ -40,51 +40,57 @@ data, error_log = parse_issue(issue) # Convert dictionary to metadata json -metadata = dict_to_metadata(data, flat_compact_crate=False, timestamp= timestamp) -rocratedict = json.loads(metadata) +rocratestr_nested = dict_to_metadata(data, flat_compact_crate=False, timestamp= timestamp) +rocratedict = json.loads(rocratestr_nested) default_context_list = copy.deepcopy(rocratedict['@context']) try: - - context_list, context_dict = get_default_contexts(context_urls=["https://w3id.org/ro/crate/1.1/context"], - verbose=True) + #context_list, context_dict = get_default_contexts(context_urls=["https://w3id.org/ro/crate/1.1/context"], + # verbose=True) #we're going to delete the rocratedict context, so we expand in terms of the contexts provided by get_default_contexts - del rocratedict['@context'] + #del rocratedict['@context'] - ctx = context_dict["@context"] + #ctx = context_dict["@context"] # Expand the 
document using the specific contexts # this will get rid of any items that are not defined in the schema - expanded = jsonld.expand(rocratedict, options={"expandContext": ctx}) + #expanded = jsonld.expand(rocratedict, options={"expandContext": ctx}) #flatten the document using the specific contexts - flattened = jsonld.flatten(expanded) + #flattened = jsonld.flatten(expanded) #I have figured out how to compact against multiple contexts, so thise will only compact #against the value of context_list[0], which is "https://w3id.org/ro/crate/1.1/context" - flat_compacted = jsonld.compact(flattened , ctx = ctx, - options={"compactArrays": True, "graph": False}) + #flat_compacted = jsonld.compact(flattened , ctx = ctx, + # options={"compactArrays": True, "graph": False}) - rocratedict.update({'@context':default_context_list}) - flat_compacted.update({'@context':default_context_list}) + #rocratedict.update({'@context':default_context_list}) + #flat_compacted.update({'@context':default_context_list}) #compacted contains the full the context. We don't need these,URLs are sufficient. 
#flat_compacted['@context'] = rocratedict['@context'] + expanded = jsonld.expand(rocratedict) + flattened = jsonld.flatten(expanded) + rocratedict['@graph'] = flattened + #this strips the @ from the @ids, + flatcompact = jsonld.compact(rocratedict, ctx = default_context_list) + #add the @ back to type, id + flatcompact = replace_keys_recursive(flatcompact) except: #use the flattening routine we wrote #this is not necessary fully compacted (although we try to build compact records) - flat_compacted = dict_to_metadata(data, flat_compact_crate=True, timestamp= timestamp) + flatcompact = dict_to_metadata(data, flat_compact_crate=True, timestamp= timestamp) #FOR TESTING - print out dictionary as a comment -issue.create_comment("# M@TE crate \n"+str(metadata)) +#issue.create_comment("# M@TE crate \n"+str(metadata)) # Move files to repo -flat_compacted_str = json.dumps(flat_compacted) -model_repo.create_file("ro-crate-metadata.json","add ro-crate",flat_compacted_str) +rocratestr_flatcompact= json.dumps(flatcompact) +model_repo.create_file("ro-crate-metadata.json","add ro-crate", rocratestr_flatcompact) #we should do this this as part of the copy to website action -model_repo.create_file("website_material/ro-crate-metadata.json","add ro-crate",metadata) +model_repo.create_file("website_material/ro-crate-metadata.json","add ro-crate", rocratestr_nested) ####### #Save the trail of metadata sources to .metadata_trail