From aec0e02b9739e3b11437c59d6a18abc57190a141 Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Fri, 20 Dec 2024 15:46:49 -0800 Subject: [PATCH 1/4] Only use requests, not a mix of requests & urlopen We were: 1. In some cases, directly using requests 2. In some cases, directly using the stdlib's urlopen 3. In some cases, had a method named urlopen that simply passed things through to requests This is unnecessarily confusing, and seems to primarily be done for the benefit of mocking the network calls. However, as described in the recently merged https://github.com/jupyterhub/repo2docker/pull/1390, I don't think mocking is appropriate here as it means we don't actually catch problems. This PR mostly focuses on getting unifying to only using requests directly with as little indirection as possible. If any tests were directly using mocks here, they will be replaced with something that is testing things more directly as appropriate --- repo2docker/contentproviders/ckan.py | 8 +++----- repo2docker/contentproviders/doi.py | 15 --------------- repo2docker/contentproviders/figshare.py | 2 +- repo2docker/contentproviders/hydroshare.py | 6 +++--- repo2docker/contentproviders/zenodo.py | 4 ++-- setup.py | 7 +++---- tests/unit/contentproviders/test_doi.py | 2 +- tests/unit/contentproviders/test_figshare.py | 1 - tests/unit/contentproviders/test_zenodo.py | 1 - 9 files changed, 13 insertions(+), 33 deletions(-) diff --git a/repo2docker/contentproviders/ckan.py b/repo2docker/contentproviders/ckan.py index 2618fd06e..7f0dec775 100644 --- a/repo2docker/contentproviders/ckan.py +++ b/repo2docker/contentproviders/ckan.py @@ -25,7 +25,7 @@ def _fetch_version(self, api_url): Borrowed from the Hydroshare provider. """ package_show_url = f"{api_url}package_show?id={self.dataset_id}" - resp = self.urlopen(package_show_url).json() + resp = self.session.get(package_show_url).json() date = resp["result"]["metadata_modified"] parsed_date = datetime.strptime(date, "%Y-%m-%dT%H:%M:%S.%f") epoch = parsed_date.replace(tzinfo=timezone(timedelta(0))).timestamp() @@ -35,8 +35,6 @@ def _fetch_version(self, api_url): def _request(self, url, **kwargs): return self.session.get(url, **kwargs) - urlopen = _request - def detect(self, source, ref=None, extra_args=None): """Trigger this provider for things that resolve to a CKAN dataset.""" parsed_url = urlparse(source) @@ -58,7 +56,7 @@ def detect(self, source, ref=None, extra_args=None): ).geturl() status_show_url = f"{api_url}status_show" - resp = self.urlopen(status_show_url) + resp = self.session.get(status_show_url) if resp.status_code == 200: # Activity ID may be present either as a query parameter, activity_id @@ -97,7 +95,7 @@ def fetch(self, spec, output_dir, yield_output=False): {"id": dataset_id} ) - resp = self.urlopen( + resp = self.session.get( fetch_url, headers={"accept": "application/json"}, ) diff --git a/repo2docker/contentproviders/doi.py b/repo2docker/contentproviders/doi.py index cf43a1578..2dc34b561 100644 --- a/repo2docker/contentproviders/doi.py +++ b/repo2docker/contentproviders/doi.py @@ -27,21 +27,6 @@ def __init__(self): def _request(self, url, **kwargs): return self.session.get(url, **kwargs) - urlopen = _request - - def _urlopen(self, req, headers=None): - """A urlopen() helper""" - # someone passed a string, not a request - if not isinstance(req, request.Request): - req = request.Request(req) - - req.add_header("User-Agent", f"repo2docker {__version__}") - if headers is not None: - for key, value in headers.items(): - req.add_header(key, value) - - return request.urlopen(req) - def doi2url(self, doi): # Transform a DOI to a URL # If not a doi, assume we have a URL and return diff --git a/repo2docker/contentproviders/figshare.py b/repo2docker/contentproviders/figshare.py index 6c7b26c86..e8fec3fe7 100644 --- a/repo2docker/contentproviders/figshare.py +++ b/repo2docker/contentproviders/figshare.py @@ -74,7 +74,7 @@ def fetch(self, spec, output_dir, yield_output=False): host = spec["host"] yield f"Fetching Figshare article {article_id} in version {article_version}.\n" - resp = self.urlopen( + resp = self.session.get( f'{host["api"]}{article_id}/versions/{article_version}', headers={"accept": "application/json"}, ) diff --git a/repo2docker/contentproviders/hydroshare.py b/repo2docker/contentproviders/hydroshare.py index 4104d97a6..90c41914f 100755 --- a/repo2docker/contentproviders/hydroshare.py +++ b/repo2docker/contentproviders/hydroshare.py @@ -15,7 +15,7 @@ class Hydroshare(DoiProvider): def _fetch_version(self, host): """Fetch resource modified date and convert to epoch""" - json_response = self.urlopen(host["version"].format(self.resource_id)).json() + json_response = self.session.get(host["version"].format(self.resource_id)).json() date = next( item for item in json_response["dates"] if item["type"] == "modified" )["start_date"] @@ -63,7 +63,7 @@ def fetch(self, spec, output_dir, yield_output=False, timeout=120): yield f"Downloading {bag_url}.\n" # bag downloads are prepared on demand and may need some time - conn = self.urlopen(bag_url) + conn = self.session.get(bag_url) total_wait_time = 0 while ( conn.status_code == 200 @@ -77,7 +77,7 @@ def fetch(self, spec, output_dir, yield_output=False, timeout=120): raise ContentProviderException(msg) yield f"Bag is being prepared, requesting again in {wait_time} seconds.\n" time.sleep(wait_time) - conn = self.urlopen(bag_url) + conn = self.session.get(bag_url) if conn.status_code != 200: msg = f"Failed to download bag. status code {conn.status_code}.\n" yield msg diff --git a/repo2docker/contentproviders/zenodo.py b/repo2docker/contentproviders/zenodo.py index 6982c3a7c..31ea17415 100644 --- a/repo2docker/contentproviders/zenodo.py +++ b/repo2docker/contentproviders/zenodo.py @@ -73,7 +73,7 @@ def fetch(self, spec, output_dir, yield_output=False): host = spec["host"] yield f"Fetching Zenodo record {record_id}.\n" - resp = self.urlopen( + resp = self.session.get( f'{host["api"]}{record_id}', headers={"accept": "application/json"}, ) @@ -82,7 +82,7 @@ def fetch(self, spec, output_dir, yield_output=False): if host["files"]: yield f"Fetching Zenodo record {record_id} files.\n" files_url = deep_get(record, host["files"]) - resp = self.urlopen( + resp = self.session.get( files_url, headers={"accept": "application/json"}, ) diff --git a/setup.py b/setup.py index 58abff7c0..d37aa17cc 100644 --- a/setup.py +++ b/setup.py @@ -26,11 +26,10 @@ def finalize_options(self): def run(self): import json - from urllib.request import urlopen + import requests - resp = urlopen(self.url, timeout=5) - resp_body = resp.read() - data = json.loads(resp_body.decode("utf-8")) + resp = requests.get(self.url) + data = resp.json() if "installations" not in data: raise ValueError("Malformed installation map.") diff --git a/tests/unit/contentproviders/test_doi.py b/tests/unit/contentproviders/test_doi.py index 58cae5f5b..f2de2b50b 100644 --- a/tests/unit/contentproviders/test_doi.py +++ b/tests/unit/contentproviders/test_doi.py @@ -24,7 +24,7 @@ def test_url_headers(requests_mock): doi = DoiProvider() headers = {"test1": "value1", "Test2": "value2"} - result = doi.urlopen("https://mybinder.org", headers=headers) + result = doi.session.get("https://mybinder.org", headers=headers) assert "test1" in result.request.headers assert "Test2" in result.request.headers assert result.request.headers["User-Agent"] == f"repo2docker {__version__}" diff --git a/tests/unit/contentproviders/test_figshare.py b/tests/unit/contentproviders/test_figshare.py index 760c0e487..881c8513b 100644 --- a/tests/unit/contentproviders/test_figshare.py +++ b/tests/unit/contentproviders/test_figshare.py @@ -5,7 +5,6 @@ from io import BytesIO from tempfile import NamedTemporaryFile, TemporaryDirectory from unittest.mock import patch -from urllib.request import Request, urlopen from zipfile import ZipFile import pytest diff --git a/tests/unit/contentproviders/test_zenodo.py b/tests/unit/contentproviders/test_zenodo.py index a1e1226fb..1152ce2cf 100644 --- a/tests/unit/contentproviders/test_zenodo.py +++ b/tests/unit/contentproviders/test_zenodo.py @@ -5,7 +5,6 @@ from io import BytesIO from tempfile import NamedTemporaryFile, TemporaryDirectory from unittest.mock import patch -from urllib.request import Request, urlopen from zipfile import ZipFile import pytest From cd9911c9f2d5e880ec87fed904b16a6f926a0bd8 Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Fri, 20 Dec 2024 16:43:14 -0800 Subject: [PATCH 2/4] Cleanup hydroshare provider - Cleanup how the provider is detected, as we were simply doing a domain check but with many extra steps - Move the tests to be real integration tests - Test detection, not content_id --- repo2docker/contentproviders/hydroshare.py | 69 ++++--- tests/contentproviders/test_hydroshare.py | 59 ++++++ .../unit/contentproviders/test_hydroshare.py | 188 ------------------ 3 files changed, 100 insertions(+), 216 deletions(-) create mode 100755 tests/contentproviders/test_hydroshare.py delete mode 100755 tests/unit/contentproviders/test_hydroshare.py diff --git a/repo2docker/contentproviders/hydroshare.py b/repo2docker/contentproviders/hydroshare.py index 90c41914f..ce870b59e 100755 --- a/repo2docker/contentproviders/hydroshare.py +++ b/repo2docker/contentproviders/hydroshare.py @@ -2,20 +2,29 @@ import os import shutil import time +import tempfile import zipfile from datetime import datetime, timedelta, timezone from urllib.request import urlretrieve +from urllib.parse import urlparse, urlunparse from .base import ContentProviderException from .doi import DoiProvider +from ..utils import is_doi class Hydroshare(DoiProvider): """Provide contents of a Hydroshare resource.""" - def _fetch_version(self, host): - """Fetch resource modified date and convert to epoch""" - json_response = self.session.get(host["version"].format(self.resource_id)).json() + HYDROSHARE_DOMAINS = ["www.hydroshare.org"] + + def get_version(self, resource_id: str) -> str: + """ + Get current version of given resource_id + """ + api_url = f"https://{self.HYDROSHARE_DOMAIN}/hsapi/resource/{resource_id}/scimeta/elements" + + json_response = self.session.get(api_url).json() date = next( item for item in json_response["dates"] if item["type"] == "modified" )["start_date"] @@ -26,7 +35,7 @@ def _fetch_version(self, host): # truncate the timestamp return str(int(epoch)) - def detect(self, doi, ref=None, extra_args=None): + def detect(self, spec, ref=None, extra_args=None): """Trigger this provider for things that resolve to a Hydroshare resource""" hosts = [ { @@ -35,30 +44,33 @@ def detect(self, doi, ref=None, extra_args=None): "http://www.hydroshare.org/resource/", ], "django_irods": "https://www.hydroshare.org/django_irods/download/bags/", - "version": "https://www.hydroshare.org/hsapi/resource/{}/scimeta/elements", + "version": "", } ] - url = self.doi2url(doi) - - for host in hosts: - if any([url.startswith(s) for s in host["hostname"]]): - self.resource_id = url.strip("/").rsplit("/", maxsplit=1)[1] - self.version = self._fetch_version(host) - return { - "resource": self.resource_id, - "host": host, - "version": self.version, - } + + # Our spec could be a doi that resolves to a hydroshare URL, or a hydroshare URL + if is_doi(spec): + url = self.doi2url(spec) + else: + url = spec + + parsed = urlparse(url) + + print(url) + if parsed.netloc in self.HYDROSHARE_DOMAINS: + return url def _urlretrieve(self, bag_url): return urlretrieve(bag_url) def fetch(self, spec, output_dir, yield_output=False, timeout=120): """Fetch and unpack a Hydroshare resource""" - resource_id = spec["resource"] - host = spec["host"] + url = spec + print(url) + parts = urlparse(url) + self.resource_id = parts.path.strip("/").rsplit("/", maxsplit=1)[1] - bag_url = f'{host["django_irods"]}{resource_id}' + bag_url = urlunparse(parts._replace(path=f"django_irods/download/bags/{self.resource_id}")) yield f"Downloading {bag_url}.\n" @@ -87,16 +99,17 @@ def fetch(self, spec, output_dir, yield_output=False, timeout=120): filehandle, _ = self._urlretrieve(bag_url) zip_file_object = zipfile.ZipFile(filehandle, "r") yield "Downloaded, unpacking contents.\n" - zip_file_object.extractall("temp") - # resources store the contents in the data/contents directory, which is all we want to keep - contents_dir = os.path.join("temp", self.resource_id, "data", "contents") - files = os.listdir(contents_dir) - for f in files: - shutil.move(os.path.join(contents_dir, f), output_dir) - yield "Finished, cleaning up.\n" - shutil.rmtree("temp") + + with tempfile.TemporaryDirectory() as d: + zip_file_object.extractall(d) + # resources store the contents in the data/contents directory, which is all we want to keep + contents_dir = os.path.join(d, self.resource_id, "data", "contents") + files = os.listdir(contents_dir) + for f in files: + shutil.move(os.path.join(contents_dir, f), output_dir) + yield "Finished, cleaning up.\n" @property def content_id(self): """The HydroShare resource ID""" - return f"{self.resource_id}.v{self.version}" + return f"{self.resource_id}" diff --git a/tests/contentproviders/test_hydroshare.py b/tests/contentproviders/test_hydroshare.py new file mode 100755 index 000000000..abd485f33 --- /dev/null +++ b/tests/contentproviders/test_hydroshare.py @@ -0,0 +1,59 @@ +import os +import hashlib +from tempfile import TemporaryDirectory + +import pytest + +from repo2docker.contentproviders import Hydroshare + + +@pytest.mark.parametrize( + ("spec", "url"), + [ + # Test a hydroshare DOI + ("10.4211/hs.b8f6eae9d89241cf8b5904033460af61", "http://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61"), + # Hydroshare DOI in a different form + ("https://doi.org/10.4211/hs.b8f6eae9d89241cf8b5904033460af61", "http://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61"), + # Test a non-hydroshare DOI + ("doi:10.7910/DVN/TJCLKP", None), + # Test a hydroshare URL + ("http://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61", "http://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61"), + # Test a random URL + ("https://www.eff.org/cyberspace-independence", None) + ] +) +def test_detect(spec, url): + assert Hydroshare().detect(spec) == url + + +@pytest.mark.parametrize( + ("specs", "md5tree"), + [ + ( + ("https://www.hydroshare.org/resource/8f7c2f0341ef4180b0dbe97f59130756/", ), + { + "binder/Dockerfile": "872ab0ef22645a42a5560eae640cdc77", + "README.md": "88ac547c3a5f616f6d26e0689d63a113", + "notebooks/sanity-check.ipynb": "7fc4c455bc8cd244479f4d2282051ee6" + }, + ), + ], +) +def test_fetch(specs: list[str], md5tree): + dv = Hydroshare() + + for spec in specs: + with TemporaryDirectory() as d: + output = [] + for l in dv.fetch(dv.detect(spec), d): + output.append(l) + + # Verify md5 sum of the files we expect to find + # We are using md5 instead of something more secure because that is what + # dataverse itself uses + for subpath, expected_sha in md5tree.items(): + with open(os.path.join(d, subpath), "rb") as f: + h = hashlib.md5() + h.update(f.read()) + assert h.hexdigest() == expected_sha + diff --git a/tests/unit/contentproviders/test_hydroshare.py b/tests/unit/contentproviders/test_hydroshare.py deleted file mode 100755 index 41e234cd1..000000000 --- a/tests/unit/contentproviders/test_hydroshare.py +++ /dev/null @@ -1,188 +0,0 @@ -import os -import re -from contextlib import contextmanager -from tempfile import NamedTemporaryFile, TemporaryDirectory -from unittest.mock import patch -from zipfile import ZipFile - -import pytest - -from repo2docker.contentproviders import Hydroshare -from repo2docker.contentproviders.base import ContentProviderException - -doi_responses = { - "https://doi.org/10.4211/hs.b8f6eae9d89241cf8b5904033460af61": ( - "https://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61" - ), - "https://doi.org/10.21105/joss.01277": ( - "https://joss.theoj.org/papers/10.21105/joss.01277" - ), -} - - -def doi_resolver(req, context): - resp = doi_responses.get(req.url) - # doi responses are redirects - if resp is not None: - context.status_code = 302 - context.headers["Location"] = resp - return resp - - -hydroshare_data = { - "dates": [{"type": "modified", "start_date": "2019-09-25T16:09:17.006152Z"}] -} - - -def test_content_id(): - hydro = Hydroshare() - - hydro.detect("10.4211/hs.b8f6eae9d89241cf8b5904033460af61") - assert hydro.content_id == "b8f6eae9d89241cf8b5904033460af61.v1585005408" - - -def test_detect_hydroshare(): - # valid Hydroshare DOIs trigger this content provider - expected = { - "host": { - "hostname": [ - "https://www.hydroshare.org/resource/", - "http://www.hydroshare.org/resource/", - ], - "django_irods": "https://www.hydroshare.org/django_irods/download/bags/", - "version": "https://www.hydroshare.org/hsapi/resource/{}/scimeta/elements", - }, - "resource": "b8f6eae9d89241cf8b5904033460af61", - "version": "1585005408", - } - - assert ( - Hydroshare().detect( - "https://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61" - ) - == expected - ) - - assert ( - Hydroshare().detect("10.4211/hs.b8f6eae9d89241cf8b5904033460af61") == expected - ) - - assert ( - Hydroshare().detect( - "https://doi.org/10.4211/hs.b8f6eae9d89241cf8b5904033460af61" - ) - == expected - ) - - # Don't trigger the Hydroshare content provider - assert Hydroshare().detect("/some/path/here") is None - assert Hydroshare().detect("https://example.com/path/here") is None - - # don't handle DOIs that aren't from Hydroshare - assert Hydroshare().detect("https://doi.org/10.21105/joss.01277") is None - - -@contextmanager -def hydroshare_archive(prefix="b8f6eae9d89241cf8b5904033460af61/data/contents"): - with NamedTemporaryFile(suffix=".zip") as zfile: - with ZipFile(zfile.name, mode="w") as zip: - zip.writestr(f"{prefix}/some-file.txt", "some content") - zip.writestr(f"{prefix}/some-other-file.txt", "some more content") - - yield zfile - - -class MockResponse: - def __init__(self, content_type, status_code): - self.status_code = status_code - self.headers = dict() - self.headers["content-type"] = content_type - - -def test_fetch_bag(): - # we "fetch" a local ZIP file to simulate a Hydroshare resource - with hydroshare_archive() as hydro_path: - with patch.object( - Hydroshare, - "urlopen", - side_effect=[ - MockResponse("application/html", 200), - MockResponse("application/zip", 200), - ], - ): - with patch.object( - Hydroshare, "_urlretrieve", side_effect=[(hydro_path, None)] - ): - hydro = Hydroshare() - hydro.resource_id = "b8f6eae9d89241cf8b5904033460af61" - spec = { - "host": { - "hostname": [ - "https://www.hydroshare.org/resource/", - "http://www.hydroshare.org/resource/", - ], - "django_irods": "https://www.hydroshare.org/django_irods/download/bags/", - }, - "resource": "123456789", - } - - with TemporaryDirectory() as d: - output = [] - for l in hydro.fetch(spec, d): - output.append(l) - - unpacked_files = set(os.listdir(d)) - expected = {"some-other-file.txt", "some-file.txt"} - assert expected == unpacked_files - - -def test_fetch_bag_failure(): - with hydroshare_archive(): - with patch.object( - Hydroshare, "urlopen", side_effect=[MockResponse("application/html", 500)] - ): - hydro = Hydroshare() - spec = { - "host": { - "hostname": [ - "https://www.hydroshare.org/resource/", - "http://www.hydroshare.org/resource/", - ], - "django_irods": "https://www.hydroshare.org/django_irods/download/bags/", - }, - "resource": "123456789", - } - with TemporaryDirectory() as d: - with pytest.raises( - ContentProviderException, - match=r"Failed to download bag\. status code 500\.", - ): - # loop for yield statements - for l in hydro.fetch(spec, d): - pass - - -def test_fetch_bag_timeout(): - with hydroshare_archive(): - with patch.object( - Hydroshare, "urlopen", side_effect=[MockResponse("application/html", 200)] - ): - hydro = Hydroshare() - spec = { - "host": { - "hostname": [ - "https://www.hydroshare.org/resource/", - "http://www.hydroshare.org/resource/", - ], - "django_irods": "https://www.hydroshare.org/django_irods/download/bags/", - }, - "resource": "123456789", - } - with TemporaryDirectory() as d: - with pytest.raises( - ContentProviderException, - match=r"Bag taking too long to prepare, exiting now, try again later\.", - ): - # loop for yield statements - for l in hydro.fetch(spec, d, timeout=0): - pass From 2c577f9ef63fb564961786e6a8c691880578a8a5 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 21 Dec 2024 00:52:20 +0000 Subject: [PATCH 3/4] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- repo2docker/contentproviders/hydroshare.py | 10 +++++---- setup.py | 1 + tests/contentproviders/test_hydroshare.py | 26 ++++++++++++++-------- 3 files changed, 24 insertions(+), 13 deletions(-) diff --git a/repo2docker/contentproviders/hydroshare.py b/repo2docker/contentproviders/hydroshare.py index ce870b59e..ba2323659 100755 --- a/repo2docker/contentproviders/hydroshare.py +++ b/repo2docker/contentproviders/hydroshare.py @@ -1,16 +1,16 @@ import json import os import shutil -import time import tempfile +import time import zipfile from datetime import datetime, timedelta, timezone -from urllib.request import urlretrieve from urllib.parse import urlparse, urlunparse +from urllib.request import urlretrieve +from ..utils import is_doi from .base import ContentProviderException from .doi import DoiProvider -from ..utils import is_doi class Hydroshare(DoiProvider): @@ -70,7 +70,9 @@ def fetch(self, spec, output_dir, yield_output=False, timeout=120): parts = urlparse(url) self.resource_id = parts.path.strip("/").rsplit("/", maxsplit=1)[1] - bag_url = urlunparse(parts._replace(path=f"django_irods/download/bags/{self.resource_id}")) + bag_url = urlunparse( + parts._replace(path=f"django_irods/download/bags/{self.resource_id}") + ) yield f"Downloading {bag_url}.\n" diff --git a/setup.py b/setup.py index d37aa17cc..a4e59d8cf 100644 --- a/setup.py +++ b/setup.py @@ -26,6 +26,7 @@ def finalize_options(self): def run(self): import json + import requests resp = requests.get(self.url) diff --git a/tests/contentproviders/test_hydroshare.py b/tests/contentproviders/test_hydroshare.py index abd485f33..e6d8a4c8f 100755 --- a/tests/contentproviders/test_hydroshare.py +++ b/tests/contentproviders/test_hydroshare.py @@ -1,5 +1,5 @@ -import os import hashlib +import os from tempfile import TemporaryDirectory import pytest @@ -11,16 +11,25 @@ ("spec", "url"), [ # Test a hydroshare DOI - ("10.4211/hs.b8f6eae9d89241cf8b5904033460af61", "http://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61"), + ( + "10.4211/hs.b8f6eae9d89241cf8b5904033460af61", + "http://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61", + ), # Hydroshare DOI in a different form - ("https://doi.org/10.4211/hs.b8f6eae9d89241cf8b5904033460af61", "http://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61"), + ( + "https://doi.org/10.4211/hs.b8f6eae9d89241cf8b5904033460af61", + "http://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61", + ), # Test a non-hydroshare DOI ("doi:10.7910/DVN/TJCLKP", None), # Test a hydroshare URL - ("http://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61", "http://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61"), + ( + "http://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61", + "http://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61", + ), # Test a random URL - ("https://www.eff.org/cyberspace-independence", None) - ] + ("https://www.eff.org/cyberspace-independence", None), + ], ) def test_detect(spec, url): assert Hydroshare().detect(spec) == url @@ -30,11 +39,11 @@ def test_detect(spec, url): ("specs", "md5tree"), [ ( - ("https://www.hydroshare.org/resource/8f7c2f0341ef4180b0dbe97f59130756/", ), + ("https://www.hydroshare.org/resource/8f7c2f0341ef4180b0dbe97f59130756/",), { "binder/Dockerfile": "872ab0ef22645a42a5560eae640cdc77", "README.md": "88ac547c3a5f616f6d26e0689d63a113", - "notebooks/sanity-check.ipynb": "7fc4c455bc8cd244479f4d2282051ee6" + "notebooks/sanity-check.ipynb": "7fc4c455bc8cd244479f4d2282051ee6", }, ), ], @@ -56,4 +65,3 @@ def test_fetch(specs: list[str], md5tree): h = hashlib.md5() h.update(f.read()) assert h.hexdigest() == expected_sha - From e5b8c975d8cd291b5c5459ec0c71af94320354dc Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Fri, 20 Dec 2024 16:56:14 -0800 Subject: [PATCH 4/4] Remove another layer of indirection --- repo2docker/contentproviders/hydroshare.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/repo2docker/contentproviders/hydroshare.py b/repo2docker/contentproviders/hydroshare.py index ba2323659..24a4d888e 100755 --- a/repo2docker/contentproviders/hydroshare.py +++ b/repo2docker/contentproviders/hydroshare.py @@ -60,13 +60,9 @@ def detect(self, spec, ref=None, extra_args=None): if parsed.netloc in self.HYDROSHARE_DOMAINS: return url - def _urlretrieve(self, bag_url): - return urlretrieve(bag_url) - def fetch(self, spec, output_dir, yield_output=False, timeout=120): """Fetch and unpack a Hydroshare resource""" url = spec - print(url) parts = urlparse(url) self.resource_id = parts.path.strip("/").rsplit("/", maxsplit=1)[1] @@ -98,7 +94,7 @@ def fetch(self, spec, output_dir, yield_output=False, timeout=120): raise ContentProviderException(msg) # Bag creation seems to need a small time buffer after it says it's ready. time.sleep(1) - filehandle, _ = self._urlretrieve(bag_url) + filehandle, _ = urlretrieve(bag_url) zip_file_object = zipfile.ZipFile(filehandle, "r") yield "Downloaded, unpacking contents.\n"