diff --git a/tests/test_absolutize.py b/tests/test_absolutize.py
index 37dbac9..ad0ac91 100644
--- a/tests/test_absolutize.py
+++ b/tests/test_absolutize.py
@@ -10,8 +10,8 @@ from workflowhub_graph.merge import merge_all_files
 
 
-def get_test_data_file(filename):
-    """Returns the path to a test data file."""
+def get_test_data_file(filename=""):
+    """Returns the path to a test data file given its relative path."""
     tests_dir = os.path.dirname(os.path.abspath(__file__))
     return os.path.join(tests_dir, "test_data", filename)
 
 
@@ -19,9 +19,9 @@ def get_test_data_file(filename):
 
 class TestAbsolutizePaths:  # (unittest.TestCase):
     # NOTE: ids can not be found, like 634, or forbidden, like 678
-    @pytest.mark.parametrize("workflow_id", [41, 552])
+    @pytest.mark.parametrize("workflow_id", [41, 552, 883])
     def test_make_paths_absolute(self, workflow_id):
-        with patch_rdflib_urlopen(get_test_data_file):
+        with patch_rdflib_urlopen(get_test_data_file(), write_cache=False):
             with open(
                 get_test_data_file(f"{workflow_id}_ro-crate-metadata.json"), "r"
             ) as f:
@@ -40,11 +40,13 @@ def test_make_paths_absolute(self, workflow_id):
         assert is_all_absolute(G)
 
     def test_merged(self):
-        # G = merge_all_files(get_test_data_file("[0-9]*ro-crate*.json"))
-        G = merge_all_files("data/[0-9]*ro-crate*.json")
-        assert is_all_absolute(G)
+        G = merge_all_files(
+            get_test_data_file("[0-9]*ro-crate*.json"),
+            cache_base_dir=get_test_data_file(),
+            write_cache=False,
+        )
 
-        print(list(G.triples((None, None, None))))
+        assert is_all_absolute(G)
 
         for s, o in G.subject_objects(rdflib.URIRef("http://schema.org/author")):
             print(s, o)
diff --git a/tests/test_data/883_ro-crate-metadata.json b/tests/test_data/883_ro-crate-metadata.json
new file mode 100644
index 0000000..b8320f9
--- /dev/null
+++ b/tests/test_data/883_ro-crate-metadata.json
@@ -0,0 +1,68 @@
+{
+  "@context": [
+    "https://w3id.org/ro/crate/1.1/context"
+  ],
+  "@graph": [
+    {
+      "@id": "ro-crate-metadata.json",
+      "@type": "CreativeWork",
+      "about": {
+        "@id": "https://training.galaxyproject.org/training-material/topics/introduction/tutorials/galaxy-intro-101-everyone/workflows#main_workflow.ga"
+      },
+      "conformsTo": {
+        "@id": "https://w3id.org/ro/crate/1.1"
+      }
+    },
+    {
+      "@id": "ro-crate-preview.html",
+      "@type": "CreativeWork",
+      "about": {
+        "@id": "./"
+      }
+    },
+    {
+      "@id": "https://training.galaxyproject.org/training-material/topics/introduction/tutorials/galaxy-intro-101-everyone/workflows#main_workflow.ga",
+      "@type": "Dataset",
+      "name": "GTN Training: Galaxy 101 For Everyone",
+      "description": "Galaxy workflow",
+      "version": "16",
+      "license": "https://spdx.org/licenses/CC-BY-4.0",
+      "datePublished": "2024-03-18T12:38:43.978+01:00",
+      "mainEntity": {
+        "@id": "main_workflow.ga"
+      },
+      "hasPart": [
+        {
+          "@id": "main_workflow.ga"
+        }
+      ]
+    },
+    {
+      "@id": "main_workflow.ga",
+      "@type": [
+        "File",
+        "SoftwareSourceCode",
+        "ComputationalWorkflow"
+      ],
+      "author": [
+
+      ],
+      "name": "GTN Training: Galaxy 101 For Everyone",
+      "programmingLanguage": {
+        "@id": "https://w3id.org/workflowhub/workflow-ro-crate#galaxy"
+      }
+    },
+    {
+      "@id": "https://w3id.org/workflowhub/workflow-ro-crate#galaxy",
+      "@type": "ComputerLanguage",
+      "identifier": {
+        "@id": "https://galaxyproject.org/"
+      },
+      "name": "Galaxy",
+      "url": {
+        "@id": "https://galaxyproject.org/"
+      },
+      "version": "23.1"
+    }
+  ]
+}
\ No newline at end of file
diff --git a/tests/test_data/ro-crate-context-1.0.json b/tests/test_data/w3id-org-ro-crate-1-0-context
similarity index 100%
rename from tests/test_data/ro-crate-context-1.0.json
rename to tests/test_data/w3id-org-ro-crate-1-0-context
diff --git a/tests/test_data/ro-crate-context-1.1.json b/tests/test_data/w3id-org-ro-crate-1-1-context
similarity index 100%
rename from tests/test_data/ro-crate-context-1.1.json
rename to tests/test_data/w3id-org-ro-crate-1-1-context
diff --git a/workflowhub_graph/cachedurlopen.py b/workflowhub_graph/cachedurlopen.py
index 9475e6c..9dd8269 100644
--- a/workflowhub_graph/cachedurlopen.py
+++ b/workflowhub_graph/cachedurlopen.py
@@ -1,22 +1,38 @@
+import os
+import re
 from unittest.mock import patch, MagicMock
 from contextlib import contextmanager
 import io
+from urllib.parse import urlparse
+from urllib.request import urlopen
 
 
-cached_urls = {
-    "https://w3id.org/ro/crate/1.0/context": "ro-crate-context-1.0.json",
-    "https://w3id.org/ro/crate/1.1/context": "ro-crate-context-1.1.json",
-}
+def url_to_filename(url):
+    parsed = urlparse(url)
+    if parsed.scheme not in ["http", "https"]:
+        raise ValueError(f"Unsupported scheme {parsed.scheme}")
+
+    return re.sub("[^0-9a-z]+", "-", (parsed.netloc + parsed.path).lower().strip("_"))
 
 
 @contextmanager
-def patch_rdflib_urlopen(file_locator):
+def patch_rdflib_urlopen(
+    cache_base_dir=None,
+    write_cache=True,
+    allowed_urls_pattern=r"https://w3id.org/ro/crate/1\.[01]/context",
+):
+    allowed_urls_re = re.compile(allowed_urls_pattern)
+    if cache_base_dir is None:
+        cache_base_dir = "cached_urlopen"
+    os.makedirs(cache_base_dir, exist_ok=True)
+
     def cached_urlopen(request):
         url = request.get_full_url()
 
-        if url not in cached_urls:
-            # TODO: store and use cache
-            raise ValueError(f"URL {url} not in cache, have: {cached_urls.keys()}")
+        if not allowed_urls_re.match(url):
+            raise ValueError(
+                f"URL {url} not allowed to cache, allowed: {allowed_urls_pattern}"
+            )
 
         class Response(io.StringIO):
             content_type = "text/html"
@@ -28,7 +44,21 @@ def info(self):
 
             def geturl(self):
                 return url
 
-        content = open(file_locator(cached_urls[url]), "rt").read()
+        cached_filename = os.path.join(cache_base_dir, url_to_filename(url))
+
+        if not os.path.exists(cached_filename):
+            if write_cache:
+                response = urlopen(request)
+                content = response.read().decode("utf-8")
+
+                with open(cached_filename, "wt") as f:
+                    f.write(content)
+            else:
+                raise ValueError(
+                    f"Cache file {cached_filename} not found, not allowed to download and update cache"
+                )
+
+        content = open(cached_filename, "rt").read()
 
         return Response(content)
diff --git a/workflowhub_graph/merge.py b/workflowhub_graph/merge.py
index 5008386..dba6286 100644
--- a/workflowhub_graph/merge.py
+++ b/workflowhub_graph/merge.py
@@ -10,7 +10,7 @@ from workflowhub_graph.constants import BASE_URL
 
 
-def merge_all_files(pattern="data/*.json") -> rdflib.Graph:
+def merge_all_files(pattern="data/*.json", **cache_kwargs) -> rdflib.Graph:
     G = rdflib.Graph()
 
     filenames = glob.glob(pattern)
 
@@ -27,10 +27,7 @@
             w_id = int(os.path.basename(fn).split("_")[0])
             json_data = make_paths_absolute(json_data, BASE_URL, w_id)
 
-            # TODO: make this actual caching, and pre-populate in the test
-            with patch_rdflib_urlopen(
-                lambda x: "tests/test_data/ro-crate-context-1.0.json"
-            ):
+            with patch_rdflib_urlopen(**cache_kwargs):
                 G.parse(data=json_data, format="json-ld")
 
     # TODO: set a total version
@@ -49,4 +46,4 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
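
Usage sketch (not part of the patch): how the reworked cache is meant to be driven, based only on the signatures introduced above; the paths and the print call are illustrative.

    from workflowhub_graph.cachedurlopen import url_to_filename
    from workflowhub_graph.merge import merge_all_files

    # Cache files are named after the URL, so the RO-Crate 1.1 context becomes
    # "w3id-org-ro-crate-1-1-context", matching the renamed test fixtures above.
    print(url_to_filename("https://w3id.org/ro/crate/1.1/context"))

    # Offline merge: read context documents from an existing cache directory and
    # raise on a cache miss instead of downloading (write_cache=False).
    G = merge_all_files(
        "tests/test_data/[0-9]*ro-crate*.json",
        cache_base_dir="tests/test_data",
        write_cache=False,
    )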