Commit

actual cache
Volodymyr Savchenko authored and committed May 23, 2024
1 parent d60c53e commit d6821ce
Showing 6 changed files with 120 additions and 23 deletions.
18 changes: 10 additions & 8 deletions tests/test_absolutize.py
@@ -10,18 +10,18 @@
 from workflowhub_graph.merge import merge_all_files


-def get_test_data_file(filename):
-    """Returns the path to a test data file."""
+def get_test_data_file(filename=""):
+    """Returns the path to a test data file, given its relative path."""

     tests_dir = os.path.dirname(os.path.abspath(__file__))
     return os.path.join(tests_dir, "test_data", filename)


 class TestAbsolutizePaths:  # (unittest.TestCase):
     # NOTE: ids can not be found, like 634, or forbidden, like 678
-    @pytest.mark.parametrize("workflow_id", [41, 552])
+    @pytest.mark.parametrize("workflow_id", [41, 552, 883])
     def test_make_paths_absolute(self, workflow_id):
-        with patch_rdflib_urlopen(get_test_data_file):
+        with patch_rdflib_urlopen(get_test_data_file(), write_cache=False):
             with open(
                 get_test_data_file(f"{workflow_id}_ro-crate-metadata.json"), "r"
             ) as f:
@@ -40,11 +40,13 @@ def test_make_paths_absolute(self, workflow_id):
             assert is_all_absolute(G)

     def test_merged(self):
-        # G = merge_all_files(get_test_data_file("[0-9]*ro-crate*.json"))
-        G = merge_all_files("data/[0-9]*ro-crate*.json")
-        assert is_all_absolute(G)
+        G = merge_all_files(
+            get_test_data_file("[0-9]*ro-crate*.json"),
+            cache_base_dir=get_test_data_file(),
+            write_cache=False,
+        )

+        print(list(G.triples((None, None, None))))
+        assert is_all_absolute(G)

         for s, o in G.subject_objects(rdflib.URIRef("http://schema.org/author")):
             print(s, o)
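
For orientation, the reworked tests above resolve the RO-Crate @context from the local test-data cache rather than the network. A minimal standalone sketch of the same pattern, assuming the package is importable, the working directory is the repository root, the cached context file from this commit sits in tests/test_data under its url_to_filename name, and an rdflib version with built-in JSON-LD support:

import rdflib

from workflowhub_graph.cachedurlopen import patch_rdflib_urlopen

# Resolve the @context URL from the on-disk cache only; write_cache=False
# turns any uncached or disallowed URL into an error instead of a network call.
with patch_rdflib_urlopen(cache_base_dir="tests/test_data", write_cache=False):
    g = rdflib.Graph()
    g.parse("tests/test_data/883_ro-crate-metadata.json", format="json-ld")

print(len(g), "triples parsed")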
68 changes: 68 additions & 0 deletions tests/test_data/883_ro-crate-metadata.json
@@ -0,0 +1,68 @@
{
    "@context": [
        "https://w3id.org/ro/crate/1.1/context"
    ],
    "@graph": [
        {
            "@id": "ro-crate-metadata.json",
            "@type": "CreativeWork",
            "about": {
                "@id": "https://training.galaxyproject.org/training-material/topics/introduction/tutorials/galaxy-intro-101-everyone/workflows#main_workflow.ga"
            },
            "conformsTo": {
                "@id": "https://w3id.org/ro/crate/1.1"
            }
        },
        {
            "@id": "ro-crate-preview.html",
            "@type": "CreativeWork",
            "about": {
                "@id": "./"
            }
        },
        {
            "@id": "https://training.galaxyproject.org/training-material/topics/introduction/tutorials/galaxy-intro-101-everyone/workflows#main_workflow.ga",
            "@type": "Dataset",
            "name": "GTN Training: Galaxy 101 For Everyone",
            "description": "Galaxy workflow",
            "version": "16",
            "license": "https://spdx.org/licenses/CC-BY-4.0",
            "datePublished": "2024-03-18T12:38:43.978+01:00",
            "mainEntity": {
                "@id": "main_workflow.ga"
            },
            "hasPart": [
                {
                    "@id": "main_workflow.ga"
                }
            ]
        },
        {
            "@id": "main_workflow.ga",
            "@type": [
                "File",
                "SoftwareSourceCode",
                "ComputationalWorkflow"
            ],
            "author": [],
            "name": "GTN Training: Galaxy 101 For Everyone",
            "programmingLanguage": {
                "@id": "https://w3id.org/workflowhub/workflow-ro-crate#galaxy"
            }
        },
        {
            "@id": "https://w3id.org/workflowhub/workflow-ro-crate#galaxy",
            "@type": "ComputerLanguage",
            "identifier": {
                "@id": "https://galaxyproject.org/"
            },
            "name": "Galaxy",
            "url": {
                "@id": "https://galaxyproject.org/"
            },
            "version": "23.1"
        }
    ]
}
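
As a quick sanity check, the new fixture can be inspected with nothing more than the standard library; a small sketch (run from the repository root, path assumed):

import json

# List every entity in the crate's @graph together with its type.
with open("tests/test_data/883_ro-crate-metadata.json") as f:
    crate = json.load(f)

for entity in crate["@graph"]:
    print(entity["@id"], entity.get("@type"))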
File renamed without changes.
File renamed without changes.
48 changes: 39 additions & 9 deletions workflowhub_graph/cachedurlopen.py
@@ -1,22 +1,38 @@
+import os
+import re
 from unittest.mock import patch, MagicMock
 from contextlib import contextmanager
 import io
+from urllib.parse import urlparse
+from urllib.request import urlopen


-cached_urls = {
-    "https://w3id.org/ro/crate/1.0/context": "ro-crate-context-1.0.json",
-    "https://w3id.org/ro/crate/1.1/context": "ro-crate-context-1.1.json",
-}
+def url_to_filename(url):
+    parsed = urlparse(url)
+    if parsed.scheme not in ["http", "https"]:
+        raise ValueError(f"Unsupported scheme {parsed.scheme}")
+
+    return re.sub("[^0-9a-z]+", "-", (parsed.netloc + parsed.path).lower().strip("_"))


 @contextmanager
-def patch_rdflib_urlopen(file_locator):
+def patch_rdflib_urlopen(
+    cache_base_dir=None,
+    write_cache=True,
+    allowed_urls_pattern=r"https://w3id.org/ro/crate/1\.[01]/context",
+):
+    allowed_urls_re = re.compile(allowed_urls_pattern)
+    if cache_base_dir is None:
+        cache_base_dir = "cached_urlopen"
+    os.makedirs(cache_base_dir, exist_ok=True)
+
     def cached_urlopen(request):
         url = request.get_full_url()

-        if url not in cached_urls:
-            # TODO: store and use cache
-            raise ValueError(f"URL {url} not in cache, have: {cached_urls.keys()}")
+        if not allowed_urls_re.match(url):
+            raise ValueError(
+                f"URL {url} not allowed to cache, allowed: {allowed_urls_pattern}"
+            )

         class Response(io.StringIO):
             content_type = "text/html"
@@ -28,7 +44,21 @@ def info(self):
             def geturl(self):
                 return url

-        content = open(file_locator(cached_urls[url]), "rt").read()
+        cached_filename = os.path.join(cache_base_dir, url_to_filename(url))
+
+        if not os.path.exists(cached_filename):
+            if write_cache:
+                response = urlopen(request)
+                content = response.read().decode("utf-8")
+
+                with open(cached_filename, "wt") as f:
+                    f.write(content)
+            else:
+                raise ValueError(
+                    f"Cache file {cached_filename} not found, not allowed to download and update cache"
+                )
+
+        content = open(cached_filename, "rt").read()

         return Response(content)

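To make the new cache layout concrete: url_to_filename slugifies the host and path of an allowed URL, and that slug becomes the file name under cache_base_dir. A short sketch of the expected behaviour (the printed slug is what the regular expression above should yield):

from workflowhub_graph.cachedurlopen import url_to_filename

# Expected: "w3id-org-ro-crate-1-1-context"
print(url_to_filename("https://w3id.org/ro/crate/1.1/context"))

# Non-HTTP(S) schemes are rejected outright.
try:
    url_to_filename("ftp://example.org/file")
except ValueError as e:
    print(e)
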
9 changes: 3 additions & 6 deletions workflowhub_graph/merge.py
@@ -10,7 +10,7 @@
 from workflowhub_graph.constants import BASE_URL


-def merge_all_files(pattern="data/*.json") -> rdflib.Graph:
+def merge_all_files(pattern="data/*.json", **cache_kwargs) -> rdflib.Graph:
     G = rdflib.Graph()

     filenames = glob.glob(pattern)
@@ -27,10 +27,7 @@ def merge_all_files(pattern="data/*.json") -> rdflib.Graph:
             w_id = int(os.path.basename(fn).split("_")[0])
             json_data = make_paths_absolute(json_data, BASE_URL, w_id)

-            # TODO: make this actual caching, and pre-populate in the test
-            with patch_rdflib_urlopen(
-                lambda x: "tests/test_data/ro-crate-context-1.0.json"
-            ):
+            with patch_rdflib_urlopen(**cache_kwargs):
                 G.parse(data=json_data, format="json-ld")

     # TODO: set a total version
@@ -49,4 +46,4 @@ def main():


 if __name__ == "__main__":
-    main()
+    main()
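
Since merge_all_files now simply forwards **cache_kwargs to patch_rdflib_urlopen, callers choose between populating the cache and running fully offline. A hedged usage sketch (data paths assumed, cache directory name taken from the default above):

from workflowhub_graph.merge import merge_all_files

# First run: allow the permitted context URLs to be downloaded and written into the cache directory.
g = merge_all_files("data/[0-9]*ro-crate*.json", cache_base_dir="cached_urlopen", write_cache=True)

# Later runs: reuse the cache only; uncached URLs raise instead of hitting the network.
g = merge_all_files("data/[0-9]*ro-crate*.json", cache_base_dir="cached_urlopen", write_cache=False)

print(f"merged graph has {len(g)} triples")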
