Skip to content

Commit

Permalink
updates to ro-crate flattening, compaction; requests: add maxRetry arg…
Browse files Browse the repository at this point in the history
…ument, and handle some downstream type errors when response returns MaxRetryError
  • Loading branch information
dansand committed May 1, 2024
1 parent 16c8d2a commit 5d905ec
Show file tree
Hide file tree
Showing 4 changed files with 79 additions and 27 deletions.
3 changes: 3 additions & 0 deletions .github/scripts/parse_issue.py
Original file line number Diff line number Diff line change
Expand Up @@ -567,6 +567,9 @@ def parse_issue(issue):
else:
error_log += "**Computer URI/DOI**\n" + response + log1 + "\n"

#except:
#error_log += "**Computer URI/DOI**\n" + "there was a problem parsing Computer URI/DOI" + log1 + "\n"


data_dict["computer_resource"] = computer_record

Expand Down
28 changes: 21 additions & 7 deletions .github/scripts/request_utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import requests
import os
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# Base URLs configuration
BASE_URLS = {
Expand All @@ -16,22 +18,32 @@
# Initialize a requests session
session = requests.Session()


# Configure retries
max_retries = 3 # Set the maximum number of retries
retry_strategy = Retry(
total=max_retries,
status_forcelist=[429, 500, 502, 503, 504], # Specify which status codes to retry on
allowed_methods=["HEAD", "GET", "OPTIONS"], # Use `allowed_methods` for urllib3 v1.26.0 or later
backoff_factor=1 # Defines the delay between retries
)
adapter = HTTPAdapter(max_retries=retry_strategy)
session.mount("http://", adapter)
session.mount("https://", adapter)

def get_record(record_type, record_id):
log = ""
metadata = {}

if record_type not in BASE_URLS:
raise ValueError(f"Record type `{record_type}` not supported")

# Define content types to try
content_types = ["application/ld+json", "application/json"]

# Iterate over URLs and content types to fetch the record
#for url in urls:

url = BASE_URLS[record_type] + record_id
print(url)

# Define content types to try
content_types = ["application/ld+json", "application/json"]

for content_type in content_types:
headers = {"Content-Type": content_type, "Accept": content_type}

Expand All @@ -54,6 +66,7 @@ def get_record(record_type, record_id):

return metadata, log


def search_organization(org_url):
log = ""
ror_id = ""
Expand Down Expand Up @@ -103,4 +116,5 @@ def check_uri(uri):
return "OK"

except Exception as err:
return err.args[0]
#return err.args[0]
return str(err) # 01/05/24: Convert the error to a string to avoid TypeError when we concatenate to log
31 changes: 30 additions & 1 deletion .github/scripts/ro_crate_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -669,7 +669,7 @@ def get_default_contexts(context_urls=[
Note:
this function was set up to try to work with multiple contexts, however this is not working properly
currently, it just returns the default ro-crate context ("https://w3id.org/ro/crate/1.1/context") in json format.
"""

# Define paths for local testing and GitHub workflow
Expand Down Expand Up @@ -716,3 +716,32 @@ def get_default_contexts(context_urls=[
merged_context.update(context)

return context_list, merged_context

def replace_keys_recursive(obj):
    """
    Recursively rename the dict keys 'id' and 'type' to their JSON-LD
    forms '@id' and '@type' throughout a nested structure.

    Args:
        obj (dict, list, set, or any): The structure to transform.
            Dicts, lists, and sets are walked recursively; any other
            value is returned unchanged.

    Returns:
        dict, list, set, or any: The transformed structure with every
        'id'/'type' key replaced by its '@'-prefixed equivalent.
    """
    # Mapping of plain keys to their JSON-LD keyword equivalents.
    key_map = {'id': '@id', 'type': '@type'}

    if isinstance(obj, dict):
        return {key_map.get(key, key): replace_keys_recursive(value)
                for key, value in obj.items()}
    if isinstance(obj, list):
        return [replace_keys_recursive(element) for element in obj]
    if isinstance(obj, set):
        # Process the members via an intermediate list, then rebuild the set.
        return set(replace_keys_recursive(list(obj)))
    return obj
44 changes: 25 additions & 19 deletions .github/scripts/write_repo_contents.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from github import Github, Auth
from parse_issue import parse_issue
from crosswalks import dict_to_metadata, dict_to_yaml, dict_to_report
from ro_crate_utils import get_default_contexts
from ro_crate_utils import replace_keys_recursive
from yaml_utils import format_yaml_string
from copy_files import copy_files
from ruamel.yaml import YAML
Expand Down Expand Up @@ -40,51 +40,57 @@
data, error_log = parse_issue(issue)

# Convert dictionary to metadata json
metadata = dict_to_metadata(data, flat_compact_crate=False, timestamp= timestamp)
rocratedict = json.loads(metadata)
rocratestr_nested = dict_to_metadata(data, flat_compact_crate=False, timestamp= timestamp)
rocratedict = json.loads(rocratestr_nested)
default_context_list = copy.deepcopy(rocratedict['@context'])

try:

context_list, context_dict = get_default_contexts(context_urls=["https://w3id.org/ro/crate/1.1/context"],
verbose=True)
#context_list, context_dict = get_default_contexts(context_urls=["https://w3id.org/ro/crate/1.1/context"],
# verbose=True)

#we're going to delete the rocratedict context, so we expand in terms of the contexts provided by get_default_contexts
del rocratedict['@context']
#del rocratedict['@context']

ctx = context_dict["@context"]
#ctx = context_dict["@context"]
# Expand the document using the specific contexts
# this will get rid of any items that are not defined in the schema
expanded = jsonld.expand(rocratedict, options={"expandContext": ctx})
#expanded = jsonld.expand(rocratedict, options={"expandContext": ctx})

#flatten the document using the specific contexts
flattened = jsonld.flatten(expanded)
#flattened = jsonld.flatten(expanded)

#I have figured out how to compact against multiple contexts, so thise will only compact
#against the value of context_list[0], which is "https://w3id.org/ro/crate/1.1/context"
flat_compacted = jsonld.compact(flattened , ctx = ctx,
options={"compactArrays": True, "graph": False})
#flat_compacted = jsonld.compact(flattened , ctx = ctx,
# options={"compactArrays": True, "graph": False})

rocratedict.update({'@context':default_context_list})
flat_compacted.update({'@context':default_context_list})
#rocratedict.update({'@context':default_context_list})
#flat_compacted.update({'@context':default_context_list})
#compacted contains the full the context. We don't need these,URLs are sufficient.
#flat_compacted['@context'] = rocratedict['@context']

expanded = jsonld.expand(rocratedict)
flattened = jsonld.flatten(expanded)
rocratedict['@graph'] = flattened
#this strips the @ from the @ids,
flatcompact = jsonld.compact(rocratedict, ctx = default_context_list)
#add the @ back to type, id
flatcompact = replace_keys_recursive(flatcompact)

except:
#use the flattening routine we wrote
#this is not necessary fully compacted (although we try to build compact records)
flat_compacted = dict_to_metadata(data, flat_compact_crate=True, timestamp= timestamp)
flatcompact = dict_to_metadata(data, flat_compact_crate=True, timestamp= timestamp)


#FOR TESTING - print out dictionary as a comment
issue.create_comment("# M@TE crate \n"+str(metadata))
#issue.create_comment("# M@TE crate \n"+str(metadata))

# Move files to repo
flat_compacted_str = json.dumps(flat_compacted)
model_repo.create_file("ro-crate-metadata.json","add ro-crate",flat_compacted_str)
rocratestr_flatcompact= json.dumps(flatcompact)
model_repo.create_file("ro-crate-metadata.json","add ro-crate", rocratestr_flatcompact)
#we should do this this as part of the copy to website action
model_repo.create_file("website_material/ro-crate-metadata.json","add ro-crate",metadata)
model_repo.create_file("website_material/ro-crate-metadata.json","add ro-crate", rocratestr_nested)

#######
#Save the trail of metadata sources to .metadata_trail
Expand Down

0 comments on commit 5d905ec

Please sign in to comment.