Skip to content

Commit

Permalink
updates to ro-crate flattening, compaction; requests: add maxRetry arg…
Browse files Browse the repository at this point in the history
…ument, and handle some downstream type errors when response returns MaxRetryError
  • Loading branch information
dansand committed May 1, 2024
1 parent 16c8d2a commit 5d905ec
Show file tree
Hide file tree
Showing 4 changed files with 79 additions and 27 deletions.
3 changes: 3 additions & 0 deletions .github/scripts/parse_issue.py
Original file line number Diff line number Diff line change
Expand Up @@ -567,6 +567,9 @@ def parse_issue(issue):
else:
error_log += "**Computer URI/DOI**\n" + response + log1 + "\n"

#except:
#error_log += "**Computer URI/DOI**\n" + "there was a problem parsing Computer URI/DOI" + log1 + "\n"


data_dict["computer_resource"] = computer_record

Expand Down
28 changes: 21 additions & 7 deletions .github/scripts/request_utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import requests
import os
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# Base URLs configuration
BASE_URLS = {
Expand All @@ -16,22 +18,32 @@
# Initialize a requests session
session = requests.Session()


# Configure retries
max_retries = 3 # Set the maximum number of retries
retry_strategy = Retry(
total=max_retries,
status_forcelist=[429, 500, 502, 503, 504], # Specify which status codes to retry on
allowed_methods=["HEAD", "GET", "OPTIONS"], # Use `allowed_methods` for urllib3 v1.26.0 or later
backoff_factor=1 # Defines the delay between retries
)
adapter = HTTPAdapter(max_retries=retry_strategy)
session.mount("http://", adapter)
session.mount("https://", adapter)

def get_record(record_type, record_id):
log = ""
metadata = {}

if record_type not in BASE_URLS:
raise ValueError(f"Record type `{record_type}` not supported")

# Define content types to try
content_types = ["application/ld+json", "application/json"]

# Iterate over URLs and content types to fetch the record
#for url in urls:

url = BASE_URLS[record_type] + record_id
print(url)

# Define content types to try
content_types = ["application/ld+json", "application/json"]

for content_type in content_types:
headers = {"Content-Type": content_type, "Accept": content_type}

Expand All @@ -54,6 +66,7 @@ def get_record(record_type, record_id):

return metadata, log


def search_organization(org_url):
log = ""
ror_id = ""
Expand Down Expand Up @@ -103,4 +116,5 @@ def check_uri(uri):
return "OK"

except Exception as err:
return err.args[0]
#return err.args[0]
return str(err) # 01/05/24: Convert the error to a string to avoid TypeError when we concatenate to log
31 changes: 30 additions & 1 deletion .github/scripts/ro_crate_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -669,7 +669,7 @@ def get_default_contexts(context_urls=[
Note:
this function was set up to try to work with multiple contexts, however this is not working properly
currently, it just returns the default ro-crate context ("https://w3id.org/ro/crate/1.1/context") in json format.
"""

# Define paths for local testing and GitHub workflow
Expand Down Expand Up @@ -716,3 +716,32 @@ def get_default_contexts(context_urls=[
merged_context.update(context)

return context_list, merged_context

def replace_keys_recursive(obj):
    """
    Recursively rename the dict keys 'id' and 'type' to their JSON-LD
    forms '@id' and '@type' throughout a nested structure.

    Args:
        obj (dict, list, set, or any): The structure to transform.
            Dicts, lists, and sets are walked recursively; any other
            value is returned unchanged.

    Returns:
        dict, list, set, or any: The transformed structure with every
        'id'/'type' key replaced by its '@'-prefixed equivalent.
    """
    # Mapping of plain keys to their JSON-LD keyword equivalents.
    key_map = {'id': '@id', 'type': '@type'}

    if isinstance(obj, dict):
        return {key_map.get(key, key): replace_keys_recursive(value)
                for key, value in obj.items()}
    if isinstance(obj, list):
        return [replace_keys_recursive(element) for element in obj]
    if isinstance(obj, set):
        # Process the members via an intermediate list, then rebuild the set.
        return set(replace_keys_recursive(list(obj)))
    return obj
44 changes: 25 additions & 19 deletions .github/scripts/write_repo_contents.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from github import Github, Auth
from parse_issue import parse_issue
from crosswalks import dict_to_metadata, dict_to_yaml, dict_to_report
from ro_crate_utils import get_default_contexts
from ro_crate_utils import replace_keys_recursive
from yaml_utils import format_yaml_string
from copy_files import copy_files
from ruamel.yaml import YAML
Expand Down Expand Up @@ -40,51 +40,57 @@
data, error_log = parse_issue(issue)

# Convert dictionary to metadata json
metadata = dict_to_metadata(data, flat_compact_crate=False, timestamp= timestamp)
rocratedict = json.loads(metadata)
rocratestr_nested = dict_to_metadata(data, flat_compact_crate=False, timestamp= timestamp)
rocratedict = json.loads(rocratestr_nested)
default_context_list = copy.deepcopy(rocratedict['@context'])

try:

context_list, context_dict = get_default_contexts(context_urls=["https://w3id.org/ro/crate/1.1/context"],
verbose=True)
#context_list, context_dict = get_default_contexts(context_urls=["https://w3id.org/ro/crate/1.1/context"],
# verbose=True)

#we're going to delete the rocratedict context, so we expand in terms of the contexts provided by get_default_contexts
del rocratedict['@context']
#del rocratedict['@context']

ctx = context_dict["@context"]
#ctx = context_dict["@context"]
# Expand the document using the specific contexts
# this will get rid of any items that are not defined in the schema
expanded = jsonld.expand(rocratedict, options={"expandContext": ctx})
#expanded = jsonld.expand(rocratedict, options={"expandContext": ctx})

#flatten the document using the specific contexts
flattened = jsonld.flatten(expanded)
#flattened = jsonld.flatten(expanded)

#I have figured out how to compact against multiple contexts, so thise will only compact
#against the value of context_list[0], which is "https://w3id.org/ro/crate/1.1/context"
flat_compacted = jsonld.compact(flattened , ctx = ctx,
options={"compactArrays": True, "graph": False})
#flat_compacted = jsonld.compact(flattened , ctx = ctx,
# options={"compactArrays": True, "graph": False})

rocratedict.update({'@context':default_context_list})
flat_compacted.update({'@context':default_context_list})
#rocratedict.update({'@context':default_context_list})
#flat_compacted.update({'@context':default_context_list})
#compacted contains the full the context. We don't need these,URLs are sufficient.
#flat_compacted['@context'] = rocratedict['@context']

expanded = jsonld.expand(rocratedict)
flattened = jsonld.flatten(expanded)
rocratedict['@graph'] = flattened
#this strips the @ from the @ids,
flatcompact = jsonld.compact(rocratedict, ctx = default_context_list)
#add the @ back to type, id
flatcompact = replace_keys_recursive(flatcompact)

except:
#use the flattening routine we wrote
#this is not necessary fully compacted (although we try to build compact records)
flat_compacted = dict_to_metadata(data, flat_compact_crate=True, timestamp= timestamp)
flatcompact = dict_to_metadata(data, flat_compact_crate=True, timestamp= timestamp)


#FOR TESTING - print out dictionary as a comment
issue.create_comment("# M@TE crate \n"+str(metadata))
#issue.create_comment("# M@TE crate \n"+str(metadata))

# Move files to repo
flat_compacted_str = json.dumps(flat_compacted)
model_repo.create_file("ro-crate-metadata.json","add ro-crate",flat_compacted_str)
rocratestr_flatcompact= json.dumps(flatcompact)
model_repo.create_file("ro-crate-metadata.json","add ro-crate", rocratestr_flatcompact)
#we should do this this as part of the copy to website action
model_repo.create_file("website_material/ro-crate-metadata.json","add ro-crate",metadata)
model_repo.create_file("website_material/ro-crate-metadata.json","add ro-crate", rocratestr_nested)

#######
#Save the trail of metadata sources to .metadata_trail
Expand Down

0 comments on commit 5d905ec

Please sign in to comment.