diff --git a/repo2docker/contentproviders/ckan.py b/repo2docker/contentproviders/ckan.py index 2618fd06..7f0dec77 100644 --- a/repo2docker/contentproviders/ckan.py +++ b/repo2docker/contentproviders/ckan.py @@ -25,7 +25,7 @@ def _fetch_version(self, api_url): Borrowed from the Hydroshare provider. """ package_show_url = f"{api_url}package_show?id={self.dataset_id}" - resp = self.urlopen(package_show_url).json() + resp = self.session.get(package_show_url).json() date = resp["result"]["metadata_modified"] parsed_date = datetime.strptime(date, "%Y-%m-%dT%H:%M:%S.%f") epoch = parsed_date.replace(tzinfo=timezone(timedelta(0))).timestamp() @@ -35,8 +35,6 @@ def _fetch_version(self, api_url): def _request(self, url, **kwargs): return self.session.get(url, **kwargs) - urlopen = _request - def detect(self, source, ref=None, extra_args=None): """Trigger this provider for things that resolve to a CKAN dataset.""" parsed_url = urlparse(source) @@ -58,7 +56,7 @@ def detect(self, source, ref=None, extra_args=None): ).geturl() status_show_url = f"{api_url}status_show" - resp = self.urlopen(status_show_url) + resp = self.session.get(status_show_url) if resp.status_code == 200: # Activity ID may be present either as a query parameter, activity_id @@ -97,7 +95,7 @@ def fetch(self, spec, output_dir, yield_output=False): {"id": dataset_id} ) - resp = self.urlopen( + resp = self.session.get( fetch_url, headers={"accept": "application/json"}, ) diff --git a/repo2docker/contentproviders/doi.py b/repo2docker/contentproviders/doi.py index cf43a157..2dc34b56 100644 --- a/repo2docker/contentproviders/doi.py +++ b/repo2docker/contentproviders/doi.py @@ -27,21 +27,6 @@ def __init__(self): def _request(self, url, **kwargs): return self.session.get(url, **kwargs) - urlopen = _request - - def _urlopen(self, req, headers=None): - """A urlopen() helper""" - # someone passed a string, not a request - if not isinstance(req, request.Request): - req = request.Request(req) - - req.add_header("User-Agent", f"repo2docker {__version__}") - if headers is not None: - for key, value in headers.items(): - req.add_header(key, value) - - return request.urlopen(req) - def doi2url(self, doi): # Transform a DOI to a URL # If not a doi, assume we have a URL and return diff --git a/repo2docker/contentproviders/figshare.py b/repo2docker/contentproviders/figshare.py index 6c7b26c8..e8fec3fe 100644 --- a/repo2docker/contentproviders/figshare.py +++ b/repo2docker/contentproviders/figshare.py @@ -74,7 +74,7 @@ def fetch(self, spec, output_dir, yield_output=False): host = spec["host"] yield f"Fetching Figshare article {article_id} in version {article_version}.\n" - resp = self.urlopen( + resp = self.session.get( f'{host["api"]}{article_id}/versions/{article_version}', headers={"accept": "application/json"}, ) diff --git a/repo2docker/contentproviders/hydroshare.py b/repo2docker/contentproviders/hydroshare.py index 4104d97a..24a4d888 100755 --- a/repo2docker/contentproviders/hydroshare.py +++ b/repo2docker/contentproviders/hydroshare.py @@ -1,11 +1,14 @@ import json import os import shutil +import tempfile import time import zipfile from datetime import datetime, timedelta, timezone +from urllib.parse import urlparse, urlunparse from urllib.request import urlretrieve +from ..utils import is_doi from .base import ContentProviderException from .doi import DoiProvider @@ -13,9 +16,15 @@ class Hydroshare(DoiProvider): """Provide contents of a Hydroshare resource.""" - def _fetch_version(self, host): - """Fetch resource modified date and convert to epoch""" - json_response = self.urlopen(host["version"].format(self.resource_id)).json() + HYDROSHARE_DOMAINS = ["www.hydroshare.org"] + + def get_version(self, resource_id: str) -> str: + """ + Get current version of given resource_id + """ + api_url = f"https://{self.HYDROSHARE_DOMAIN}/hsapi/resource/{resource_id}/scimeta/elements" + + json_response = self.session.get(api_url).json() date = next( item for item in json_response["dates"] if item["type"] == "modified" )["start_date"] @@ -26,7 +35,7 @@ def _fetch_version(self, host): # truncate the timestamp return str(int(epoch)) - def detect(self, doi, ref=None, extra_args=None): + def detect(self, spec, ref=None, extra_args=None): """Trigger this provider for things that resolve to a Hydroshare resource""" hosts = [ { @@ -35,35 +44,36 @@ def detect(self, doi, ref=None, extra_args=None): "http://www.hydroshare.org/resource/", ], "django_irods": "https://www.hydroshare.org/django_irods/download/bags/", - "version": "https://www.hydroshare.org/hsapi/resource/{}/scimeta/elements", + "version": "", } ] - url = self.doi2url(doi) - for host in hosts: - if any([url.startswith(s) for s in host["hostname"]]): - self.resource_id = url.strip("/").rsplit("/", maxsplit=1)[1] - self.version = self._fetch_version(host) - return { - "resource": self.resource_id, - "host": host, - "version": self.version, - } + # Our spec could be a doi that resolves to a hydroshare URL, or a hydroshare URL + if is_doi(spec): + url = self.doi2url(spec) + else: + url = spec - def _urlretrieve(self, bag_url): - return urlretrieve(bag_url) + parsed = urlparse(url) + + print(url) + if parsed.netloc in self.HYDROSHARE_DOMAINS: + return url def fetch(self, spec, output_dir, yield_output=False, timeout=120): """Fetch and unpack a Hydroshare resource""" - resource_id = spec["resource"] - host = spec["host"] + url = spec + parts = urlparse(url) + self.resource_id = parts.path.strip("/").rsplit("/", maxsplit=1)[1] - bag_url = f'{host["django_irods"]}{resource_id}' + bag_url = urlunparse( + parts._replace(path=f"django_irods/download/bags/{self.resource_id}") + ) yield f"Downloading {bag_url}.\n" # bag downloads are prepared on demand and may need some time - conn = self.urlopen(bag_url) + conn = self.session.get(bag_url) total_wait_time = 0 while ( conn.status_code == 200 @@ -77,26 +87,27 @@ def fetch(self, spec, output_dir, yield_output=False, timeout=120): raise ContentProviderException(msg) yield f"Bag is being prepared, requesting again in {wait_time} seconds.\n" time.sleep(wait_time) - conn = self.urlopen(bag_url) + conn = self.session.get(bag_url) if conn.status_code != 200: msg = f"Failed to download bag. status code {conn.status_code}.\n" yield msg raise ContentProviderException(msg) # Bag creation seems to need a small time buffer after it says it's ready. time.sleep(1) - filehandle, _ = self._urlretrieve(bag_url) + filehandle, _ = urlretrieve(bag_url) zip_file_object = zipfile.ZipFile(filehandle, "r") yield "Downloaded, unpacking contents.\n" - zip_file_object.extractall("temp") - # resources store the contents in the data/contents directory, which is all we want to keep - contents_dir = os.path.join("temp", self.resource_id, "data", "contents") - files = os.listdir(contents_dir) - for f in files: - shutil.move(os.path.join(contents_dir, f), output_dir) - yield "Finished, cleaning up.\n" - shutil.rmtree("temp") + + with tempfile.TemporaryDirectory() as d: + zip_file_object.extractall(d) + # resources store the contents in the data/contents directory, which is all we want to keep + contents_dir = os.path.join(d, self.resource_id, "data", "contents") + files = os.listdir(contents_dir) + for f in files: + shutil.move(os.path.join(contents_dir, f), output_dir) + yield "Finished, cleaning up.\n" @property def content_id(self): """The HydroShare resource ID""" - return f"{self.resource_id}.v{self.version}" + return f"{self.resource_id}" diff --git a/repo2docker/contentproviders/zenodo.py b/repo2docker/contentproviders/zenodo.py index 6982c3a7..31ea1741 100644 --- a/repo2docker/contentproviders/zenodo.py +++ b/repo2docker/contentproviders/zenodo.py @@ -73,7 +73,7 @@ def fetch(self, spec, output_dir, yield_output=False): host = spec["host"] yield f"Fetching Zenodo record {record_id}.\n" - resp = self.urlopen( + resp = self.session.get( f'{host["api"]}{record_id}', headers={"accept": "application/json"}, ) @@ -82,7 +82,7 @@ def fetch(self, spec, output_dir, yield_output=False): if host["files"]: yield f"Fetching Zenodo record {record_id} files.\n" files_url = deep_get(record, host["files"]) - resp = self.urlopen( + resp = self.session.get( files_url, headers={"accept": "application/json"}, ) diff --git a/setup.py b/setup.py index 58abff7c..a4e59d8c 100644 --- a/setup.py +++ b/setup.py @@ -26,11 +26,11 @@ def finalize_options(self): def run(self): import json - from urllib.request import urlopen - resp = urlopen(self.url, timeout=5) - resp_body = resp.read() - data = json.loads(resp_body.decode("utf-8")) + import requests + + resp = requests.get(self.url) + data = resp.json() if "installations" not in data: raise ValueError("Malformed installation map.") diff --git a/tests/contentproviders/test_hydroshare.py b/tests/contentproviders/test_hydroshare.py new file mode 100755 index 00000000..e6d8a4c8 --- /dev/null +++ b/tests/contentproviders/test_hydroshare.py @@ -0,0 +1,67 @@ +import hashlib +import os +from tempfile import TemporaryDirectory + +import pytest + +from repo2docker.contentproviders import Hydroshare + + +@pytest.mark.parametrize( + ("spec", "url"), + [ + # Test a hydroshare DOI + ( + "10.4211/hs.b8f6eae9d89241cf8b5904033460af61", + "http://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61", + ), + # Hydroshare DOI in a different form + ( + "https://doi.org/10.4211/hs.b8f6eae9d89241cf8b5904033460af61", + "http://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61", + ), + # Test a non-hydroshare DOI + ("doi:10.7910/DVN/TJCLKP", None), + # Test a hydroshare URL + ( + "http://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61", + "http://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61", + ), + # Test a random URL + ("https://www.eff.org/cyberspace-independence", None), + ], +) +def test_detect(spec, url): + assert Hydroshare().detect(spec) == url + + +@pytest.mark.parametrize( + ("specs", "md5tree"), + [ + ( + ("https://www.hydroshare.org/resource/8f7c2f0341ef4180b0dbe97f59130756/",), + { + "binder/Dockerfile": "872ab0ef22645a42a5560eae640cdc77", + "README.md": "88ac547c3a5f616f6d26e0689d63a113", + "notebooks/sanity-check.ipynb": "7fc4c455bc8cd244479f4d2282051ee6", + }, + ), + ], +) +def test_fetch(specs: list[str], md5tree): + dv = Hydroshare() + + for spec in specs: + with TemporaryDirectory() as d: + output = [] + for l in dv.fetch(dv.detect(spec), d): + output.append(l) + + # Verify md5 sum of the files we expect to find + # We are using md5 instead of something more secure because that is what + # dataverse itself uses + for subpath, expected_sha in md5tree.items(): + with open(os.path.join(d, subpath), "rb") as f: + h = hashlib.md5() + h.update(f.read()) + assert h.hexdigest() == expected_sha diff --git a/tests/unit/contentproviders/test_doi.py b/tests/unit/contentproviders/test_doi.py index 58cae5f5..f2de2b50 100644 --- a/tests/unit/contentproviders/test_doi.py +++ b/tests/unit/contentproviders/test_doi.py @@ -24,7 +24,7 @@ def test_url_headers(requests_mock): doi = DoiProvider() headers = {"test1": "value1", "Test2": "value2"} - result = doi.urlopen("https://mybinder.org", headers=headers) + result = doi.session.get("https://mybinder.org", headers=headers) assert "test1" in result.request.headers assert "Test2" in result.request.headers assert result.request.headers["User-Agent"] == f"repo2docker {__version__}" diff --git a/tests/unit/contentproviders/test_figshare.py b/tests/unit/contentproviders/test_figshare.py index 760c0e48..881c8513 100644 --- a/tests/unit/contentproviders/test_figshare.py +++ b/tests/unit/contentproviders/test_figshare.py @@ -5,7 +5,6 @@ from io import BytesIO from tempfile import NamedTemporaryFile, TemporaryDirectory from unittest.mock import patch -from urllib.request import Request, urlopen from zipfile import ZipFile import pytest diff --git a/tests/unit/contentproviders/test_hydroshare.py b/tests/unit/contentproviders/test_hydroshare.py deleted file mode 100755 index 41e234cd..00000000 --- a/tests/unit/contentproviders/test_hydroshare.py +++ /dev/null @@ -1,188 +0,0 @@ -import os -import re -from contextlib import contextmanager -from tempfile import NamedTemporaryFile, TemporaryDirectory -from unittest.mock import patch -from zipfile import ZipFile - -import pytest - -from repo2docker.contentproviders import Hydroshare -from repo2docker.contentproviders.base import ContentProviderException - -doi_responses = { - "https://doi.org/10.4211/hs.b8f6eae9d89241cf8b5904033460af61": ( - "https://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61" - ), - "https://doi.org/10.21105/joss.01277": ( - "https://joss.theoj.org/papers/10.21105/joss.01277" - ), -} - - -def doi_resolver(req, context): - resp = doi_responses.get(req.url) - # doi responses are redirects - if resp is not None: - context.status_code = 302 - context.headers["Location"] = resp - return resp - - -hydroshare_data = { - "dates": [{"type": "modified", "start_date": "2019-09-25T16:09:17.006152Z"}] -} - - -def test_content_id(): - hydro = Hydroshare() - - hydro.detect("10.4211/hs.b8f6eae9d89241cf8b5904033460af61") - assert hydro.content_id == "b8f6eae9d89241cf8b5904033460af61.v1585005408" - - -def test_detect_hydroshare(): - # valid Hydroshare DOIs trigger this content provider - expected = { - "host": { - "hostname": [ - "https://www.hydroshare.org/resource/", - "http://www.hydroshare.org/resource/", - ], - "django_irods": "https://www.hydroshare.org/django_irods/download/bags/", - "version": "https://www.hydroshare.org/hsapi/resource/{}/scimeta/elements", - }, - "resource": "b8f6eae9d89241cf8b5904033460af61", - "version": "1585005408", - } - - assert ( - Hydroshare().detect( - "https://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61" - ) - == expected - ) - - assert ( - Hydroshare().detect("10.4211/hs.b8f6eae9d89241cf8b5904033460af61") == expected - ) - - assert ( - Hydroshare().detect( - "https://doi.org/10.4211/hs.b8f6eae9d89241cf8b5904033460af61" - ) - == expected - ) - - # Don't trigger the Hydroshare content provider - assert Hydroshare().detect("/some/path/here") is None - assert Hydroshare().detect("https://example.com/path/here") is None - - # don't handle DOIs that aren't from Hydroshare - assert Hydroshare().detect("https://doi.org/10.21105/joss.01277") is None - - -@contextmanager -def hydroshare_archive(prefix="b8f6eae9d89241cf8b5904033460af61/data/contents"): - with NamedTemporaryFile(suffix=".zip") as zfile: - with ZipFile(zfile.name, mode="w") as zip: - zip.writestr(f"{prefix}/some-file.txt", "some content") - zip.writestr(f"{prefix}/some-other-file.txt", "some more content") - - yield zfile - - -class MockResponse: - def __init__(self, content_type, status_code): - self.status_code = status_code - self.headers = dict() - self.headers["content-type"] = content_type - - -def test_fetch_bag(): - # we "fetch" a local ZIP file to simulate a Hydroshare resource - with hydroshare_archive() as hydro_path: - with patch.object( - Hydroshare, - "urlopen", - side_effect=[ - MockResponse("application/html", 200), - MockResponse("application/zip", 200), - ], - ): - with patch.object( - Hydroshare, "_urlretrieve", side_effect=[(hydro_path, None)] - ): - hydro = Hydroshare() - hydro.resource_id = "b8f6eae9d89241cf8b5904033460af61" - spec = { - "host": { - "hostname": [ - "https://www.hydroshare.org/resource/", - "http://www.hydroshare.org/resource/", - ], - "django_irods": "https://www.hydroshare.org/django_irods/download/bags/", - }, - "resource": "123456789", - } - - with TemporaryDirectory() as d: - output = [] - for l in hydro.fetch(spec, d): - output.append(l) - - unpacked_files = set(os.listdir(d)) - expected = {"some-other-file.txt", "some-file.txt"} - assert expected == unpacked_files - - -def test_fetch_bag_failure(): - with hydroshare_archive(): - with patch.object( - Hydroshare, "urlopen", side_effect=[MockResponse("application/html", 500)] - ): - hydro = Hydroshare() - spec = { - "host": { - "hostname": [ - "https://www.hydroshare.org/resource/", - "http://www.hydroshare.org/resource/", - ], - "django_irods": "https://www.hydroshare.org/django_irods/download/bags/", - }, - "resource": "123456789", - } - with TemporaryDirectory() as d: - with pytest.raises( - ContentProviderException, - match=r"Failed to download bag\. status code 500\.", - ): - # loop for yield statements - for l in hydro.fetch(spec, d): - pass - - -def test_fetch_bag_timeout(): - with hydroshare_archive(): - with patch.object( - Hydroshare, "urlopen", side_effect=[MockResponse("application/html", 200)] - ): - hydro = Hydroshare() - spec = { - "host": { - "hostname": [ - "https://www.hydroshare.org/resource/", - "http://www.hydroshare.org/resource/", - ], - "django_irods": "https://www.hydroshare.org/django_irods/download/bags/", - }, - "resource": "123456789", - } - with TemporaryDirectory() as d: - with pytest.raises( - ContentProviderException, - match=r"Bag taking too long to prepare, exiting now, try again later\.", - ): - # loop for yield statements - for l in hydro.fetch(spec, d, timeout=0): - pass diff --git a/tests/unit/contentproviders/test_zenodo.py b/tests/unit/contentproviders/test_zenodo.py index a1e1226f..1152ce2c 100644 --- a/tests/unit/contentproviders/test_zenodo.py +++ b/tests/unit/contentproviders/test_zenodo.py @@ -5,7 +5,6 @@ from io import BytesIO from tempfile import NamedTemporaryFile, TemporaryDirectory from unittest.mock import patch -from urllib.request import Request, urlopen from zipfile import ZipFile import pytest