From aec0e02b9739e3b11437c59d6a18abc57190a141 Mon Sep 17 00:00:00 2001
From: YuviPanda <yuvipanda@gmail.com>
Date: Fri, 20 Dec 2024 15:46:49 -0800
Subject: [PATCH 1/4] Only use requests, not a mix of requests & urlopen

We were:

1. In some cases, directly using requests
2. In some cases, directly using the stdlib's urlopen
3. In some cases, had a method named urlopen that simply passed
   things through to requests

This is unnecessarily confusing, and seems to primarily be done
for the benefit of mocking the network calls. However, as described
in the recently merged https://github.com/jupyterhub/repo2docker/pull/1390,
I don't think mocking is appropriate here as it means we don't actually
catch problems.

This PR mostly focuses on getting unifying to only using requests
directly with as little indirection as possible. If any tests were
directly using mocks here, they will be replaced with something that
is testing things more directly as appropriate
---
 repo2docker/contentproviders/ckan.py         |  8 +++-----
 repo2docker/contentproviders/doi.py          | 15 ---------------
 repo2docker/contentproviders/figshare.py     |  2 +-
 repo2docker/contentproviders/hydroshare.py   |  6 +++---
 repo2docker/contentproviders/zenodo.py       |  4 ++--
 setup.py                                     |  7 +++----
 tests/unit/contentproviders/test_doi.py      |  2 +-
 tests/unit/contentproviders/test_figshare.py |  1 -
 tests/unit/contentproviders/test_zenodo.py   |  1 -
 9 files changed, 13 insertions(+), 33 deletions(-)

diff --git a/repo2docker/contentproviders/ckan.py b/repo2docker/contentproviders/ckan.py
index 2618fd06e..7f0dec775 100644
--- a/repo2docker/contentproviders/ckan.py
+++ b/repo2docker/contentproviders/ckan.py
@@ -25,7 +25,7 @@ def _fetch_version(self, api_url):
         Borrowed from the Hydroshare provider.
         """
         package_show_url = f"{api_url}package_show?id={self.dataset_id}"
-        resp = self.urlopen(package_show_url).json()
+        resp = self.session.get(package_show_url).json()
         date = resp["result"]["metadata_modified"]
         parsed_date = datetime.strptime(date, "%Y-%m-%dT%H:%M:%S.%f")
         epoch = parsed_date.replace(tzinfo=timezone(timedelta(0))).timestamp()
@@ -35,8 +35,6 @@ def _fetch_version(self, api_url):
     def _request(self, url, **kwargs):
         return self.session.get(url, **kwargs)
 
-    urlopen = _request
-
     def detect(self, source, ref=None, extra_args=None):
         """Trigger this provider for things that resolve to a CKAN dataset."""
         parsed_url = urlparse(source)
@@ -58,7 +56,7 @@ def detect(self, source, ref=None, extra_args=None):
         ).geturl()
 
         status_show_url = f"{api_url}status_show"
-        resp = self.urlopen(status_show_url)
+        resp = self.session.get(status_show_url)
         if resp.status_code == 200:
 
             # Activity ID may be present either as a query parameter, activity_id
@@ -97,7 +95,7 @@ def fetch(self, spec, output_dir, yield_output=False):
                 {"id": dataset_id}
             )
 
-        resp = self.urlopen(
+        resp = self.session.get(
             fetch_url,
             headers={"accept": "application/json"},
         )
diff --git a/repo2docker/contentproviders/doi.py b/repo2docker/contentproviders/doi.py
index cf43a1578..2dc34b561 100644
--- a/repo2docker/contentproviders/doi.py
+++ b/repo2docker/contentproviders/doi.py
@@ -27,21 +27,6 @@ def __init__(self):
     def _request(self, url, **kwargs):
         return self.session.get(url, **kwargs)
 
-    urlopen = _request
-
-    def _urlopen(self, req, headers=None):
-        """A urlopen() helper"""
-        # someone passed a string, not a request
-        if not isinstance(req, request.Request):
-            req = request.Request(req)
-
-        req.add_header("User-Agent", f"repo2docker {__version__}")
-        if headers is not None:
-            for key, value in headers.items():
-                req.add_header(key, value)
-
-        return request.urlopen(req)
-
     def doi2url(self, doi):
         # Transform a DOI to a URL
         # If not a doi, assume we have a URL and return
diff --git a/repo2docker/contentproviders/figshare.py b/repo2docker/contentproviders/figshare.py
index 6c7b26c86..e8fec3fe7 100644
--- a/repo2docker/contentproviders/figshare.py
+++ b/repo2docker/contentproviders/figshare.py
@@ -74,7 +74,7 @@ def fetch(self, spec, output_dir, yield_output=False):
         host = spec["host"]
 
         yield f"Fetching Figshare article {article_id} in version {article_version}.\n"
-        resp = self.urlopen(
+        resp = self.session.get(
             f'{host["api"]}{article_id}/versions/{article_version}',
             headers={"accept": "application/json"},
         )
diff --git a/repo2docker/contentproviders/hydroshare.py b/repo2docker/contentproviders/hydroshare.py
index 4104d97a6..90c41914f 100755
--- a/repo2docker/contentproviders/hydroshare.py
+++ b/repo2docker/contentproviders/hydroshare.py
@@ -15,7 +15,7 @@ class Hydroshare(DoiProvider):
 
     def _fetch_version(self, host):
         """Fetch resource modified date and convert to epoch"""
-        json_response = self.urlopen(host["version"].format(self.resource_id)).json()
+        json_response = self.session.get(host["version"].format(self.resource_id)).json()
         date = next(
             item for item in json_response["dates"] if item["type"] == "modified"
         )["start_date"]
@@ -63,7 +63,7 @@ def fetch(self, spec, output_dir, yield_output=False, timeout=120):
         yield f"Downloading {bag_url}.\n"
 
         # bag downloads are prepared on demand and may need some time
-        conn = self.urlopen(bag_url)
+        conn = self.session.get(bag_url)
         total_wait_time = 0
         while (
             conn.status_code == 200
@@ -77,7 +77,7 @@ def fetch(self, spec, output_dir, yield_output=False, timeout=120):
                 raise ContentProviderException(msg)
             yield f"Bag is being prepared, requesting again in {wait_time} seconds.\n"
             time.sleep(wait_time)
-            conn = self.urlopen(bag_url)
+            conn = self.session.get(bag_url)
         if conn.status_code != 200:
             msg = f"Failed to download bag. status code {conn.status_code}.\n"
             yield msg
diff --git a/repo2docker/contentproviders/zenodo.py b/repo2docker/contentproviders/zenodo.py
index 6982c3a7c..31ea17415 100644
--- a/repo2docker/contentproviders/zenodo.py
+++ b/repo2docker/contentproviders/zenodo.py
@@ -73,7 +73,7 @@ def fetch(self, spec, output_dir, yield_output=False):
         host = spec["host"]
 
         yield f"Fetching Zenodo record {record_id}.\n"
-        resp = self.urlopen(
+        resp = self.session.get(
             f'{host["api"]}{record_id}',
             headers={"accept": "application/json"},
         )
@@ -82,7 +82,7 @@ def fetch(self, spec, output_dir, yield_output=False):
         if host["files"]:
             yield f"Fetching Zenodo record {record_id} files.\n"
             files_url = deep_get(record, host["files"])
-            resp = self.urlopen(
+            resp = self.session.get(
                 files_url,
                 headers={"accept": "application/json"},
             )
diff --git a/setup.py b/setup.py
index 58abff7c0..d37aa17cc 100644
--- a/setup.py
+++ b/setup.py
@@ -26,11 +26,10 @@ def finalize_options(self):
 
     def run(self):
         import json
-        from urllib.request import urlopen
+        import requests
 
-        resp = urlopen(self.url, timeout=5)
-        resp_body = resp.read()
-        data = json.loads(resp_body.decode("utf-8"))
+        resp = requests.get(self.url)
+        data = resp.json()
         if "installations" not in data:
             raise ValueError("Malformed installation map.")
 
diff --git a/tests/unit/contentproviders/test_doi.py b/tests/unit/contentproviders/test_doi.py
index 58cae5f5b..f2de2b50b 100644
--- a/tests/unit/contentproviders/test_doi.py
+++ b/tests/unit/contentproviders/test_doi.py
@@ -24,7 +24,7 @@ def test_url_headers(requests_mock):
     doi = DoiProvider()
 
     headers = {"test1": "value1", "Test2": "value2"}
-    result = doi.urlopen("https://mybinder.org", headers=headers)
+    result = doi.session.get("https://mybinder.org", headers=headers)
     assert "test1" in result.request.headers
     assert "Test2" in result.request.headers
     assert result.request.headers["User-Agent"] == f"repo2docker {__version__}"
diff --git a/tests/unit/contentproviders/test_figshare.py b/tests/unit/contentproviders/test_figshare.py
index 760c0e487..881c8513b 100644
--- a/tests/unit/contentproviders/test_figshare.py
+++ b/tests/unit/contentproviders/test_figshare.py
@@ -5,7 +5,6 @@
 from io import BytesIO
 from tempfile import NamedTemporaryFile, TemporaryDirectory
 from unittest.mock import patch
-from urllib.request import Request, urlopen
 from zipfile import ZipFile
 
 import pytest
diff --git a/tests/unit/contentproviders/test_zenodo.py b/tests/unit/contentproviders/test_zenodo.py
index a1e1226fb..1152ce2cf 100644
--- a/tests/unit/contentproviders/test_zenodo.py
+++ b/tests/unit/contentproviders/test_zenodo.py
@@ -5,7 +5,6 @@
 from io import BytesIO
 from tempfile import NamedTemporaryFile, TemporaryDirectory
 from unittest.mock import patch
-from urllib.request import Request, urlopen
 from zipfile import ZipFile
 
 import pytest

From cd9911c9f2d5e880ec87fed904b16a6f926a0bd8 Mon Sep 17 00:00:00 2001
From: YuviPanda <yuvipanda@gmail.com>
Date: Fri, 20 Dec 2024 16:43:14 -0800
Subject: [PATCH 2/4] Cleanup hydroshare provider

- Cleanup how the provider is detected, as we were simply doing
  a domain check but with many extra steps
- Move the tests to be real integration tests
- Test detection, not content_id
---
 repo2docker/contentproviders/hydroshare.py    |  69 ++++---
 tests/contentproviders/test_hydroshare.py     |  59 ++++++
 .../unit/contentproviders/test_hydroshare.py  | 188 ------------------
 3 files changed, 100 insertions(+), 216 deletions(-)
 create mode 100755 tests/contentproviders/test_hydroshare.py
 delete mode 100755 tests/unit/contentproviders/test_hydroshare.py

diff --git a/repo2docker/contentproviders/hydroshare.py b/repo2docker/contentproviders/hydroshare.py
index 90c41914f..ce870b59e 100755
--- a/repo2docker/contentproviders/hydroshare.py
+++ b/repo2docker/contentproviders/hydroshare.py
@@ -2,20 +2,29 @@
 import os
 import shutil
 import time
+import tempfile
 import zipfile
 from datetime import datetime, timedelta, timezone
 from urllib.request import urlretrieve
+from urllib.parse import urlparse, urlunparse
 
 from .base import ContentProviderException
 from .doi import DoiProvider
+from ..utils import is_doi
 
 
 class Hydroshare(DoiProvider):
     """Provide contents of a Hydroshare resource."""
 
-    def _fetch_version(self, host):
-        """Fetch resource modified date and convert to epoch"""
-        json_response = self.session.get(host["version"].format(self.resource_id)).json()
+    HYDROSHARE_DOMAINS = ["www.hydroshare.org"]
+
+    def get_version(self, resource_id: str) -> str:
+        """
+        Get current version of given resource_id
+        """
+        api_url = f"https://{self.HYDROSHARE_DOMAIN}/hsapi/resource/{resource_id}/scimeta/elements"
+
+        json_response = self.session.get(api_url).json()
         date = next(
             item for item in json_response["dates"] if item["type"] == "modified"
         )["start_date"]
@@ -26,7 +35,7 @@ def _fetch_version(self, host):
         # truncate the timestamp
         return str(int(epoch))
 
-    def detect(self, doi, ref=None, extra_args=None):
+    def detect(self, spec, ref=None, extra_args=None):
         """Trigger this provider for things that resolve to a Hydroshare resource"""
         hosts = [
             {
@@ -35,30 +44,33 @@ def detect(self, doi, ref=None, extra_args=None):
                     "http://www.hydroshare.org/resource/",
                 ],
                 "django_irods": "https://www.hydroshare.org/django_irods/download/bags/",
-                "version": "https://www.hydroshare.org/hsapi/resource/{}/scimeta/elements",
+                "version": "",
             }
         ]
-        url = self.doi2url(doi)
-
-        for host in hosts:
-            if any([url.startswith(s) for s in host["hostname"]]):
-                self.resource_id = url.strip("/").rsplit("/", maxsplit=1)[1]
-                self.version = self._fetch_version(host)
-                return {
-                    "resource": self.resource_id,
-                    "host": host,
-                    "version": self.version,
-                }
+
+        # Our spec could be a doi that resolves to a hydroshare URL, or a hydroshare URL
+        if is_doi(spec):
+            url = self.doi2url(spec)
+        else:
+            url = spec
+
+        parsed = urlparse(url)
+
+        print(url)
+        if parsed.netloc in self.HYDROSHARE_DOMAINS:
+            return url
 
     def _urlretrieve(self, bag_url):
         return urlretrieve(bag_url)
 
     def fetch(self, spec, output_dir, yield_output=False, timeout=120):
         """Fetch and unpack a Hydroshare resource"""
-        resource_id = spec["resource"]
-        host = spec["host"]
+        url = spec
+        print(url)
+        parts = urlparse(url)
+        self.resource_id = parts.path.strip("/").rsplit("/", maxsplit=1)[1]
 
-        bag_url = f'{host["django_irods"]}{resource_id}'
+        bag_url = urlunparse(parts._replace(path=f"django_irods/download/bags/{self.resource_id}"))
 
         yield f"Downloading {bag_url}.\n"
 
@@ -87,16 +99,17 @@ def fetch(self, spec, output_dir, yield_output=False, timeout=120):
         filehandle, _ = self._urlretrieve(bag_url)
         zip_file_object = zipfile.ZipFile(filehandle, "r")
         yield "Downloaded, unpacking contents.\n"
-        zip_file_object.extractall("temp")
-        # resources store the contents in the data/contents directory, which is all we want to keep
-        contents_dir = os.path.join("temp", self.resource_id, "data", "contents")
-        files = os.listdir(contents_dir)
-        for f in files:
-            shutil.move(os.path.join(contents_dir, f), output_dir)
-        yield "Finished, cleaning up.\n"
-        shutil.rmtree("temp")
+
+        with tempfile.TemporaryDirectory() as d:
+            zip_file_object.extractall(d)
+            # resources store the contents in the data/contents directory, which is all we want to keep
+            contents_dir = os.path.join(d, self.resource_id, "data", "contents")
+            files = os.listdir(contents_dir)
+            for f in files:
+                shutil.move(os.path.join(contents_dir, f), output_dir)
+            yield "Finished, cleaning up.\n"
 
     @property
     def content_id(self):
         """The HydroShare resource ID"""
-        return f"{self.resource_id}.v{self.version}"
+        return f"{self.resource_id}"
diff --git a/tests/contentproviders/test_hydroshare.py b/tests/contentproviders/test_hydroshare.py
new file mode 100755
index 000000000..abd485f33
--- /dev/null
+++ b/tests/contentproviders/test_hydroshare.py
@@ -0,0 +1,59 @@
+import os
+import hashlib
+from tempfile import TemporaryDirectory
+
+import pytest
+
+from repo2docker.contentproviders import Hydroshare
+
+
+@pytest.mark.parametrize(
+    ("spec", "url"),
+    [
+        # Test a hydroshare DOI
+        ("10.4211/hs.b8f6eae9d89241cf8b5904033460af61", "http://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61"),
+        # Hydroshare DOI in a different form
+        ("https://doi.org/10.4211/hs.b8f6eae9d89241cf8b5904033460af61", "http://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61"),
+        # Test a non-hydroshare DOI
+        ("doi:10.7910/DVN/TJCLKP", None),
+        # Test a hydroshare URL
+        ("http://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61", "http://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61"),
+        # Test a random URL
+        ("https://www.eff.org/cyberspace-independence", None)
+    ]
+)
+def test_detect(spec, url):
+    assert Hydroshare().detect(spec) == url
+
+
+@pytest.mark.parametrize(
+    ("specs", "md5tree"),
+    [
+        (
+            ("https://www.hydroshare.org/resource/8f7c2f0341ef4180b0dbe97f59130756/", ),
+            {
+                "binder/Dockerfile": "872ab0ef22645a42a5560eae640cdc77",
+                "README.md": "88ac547c3a5f616f6d26e0689d63a113",
+                "notebooks/sanity-check.ipynb": "7fc4c455bc8cd244479f4d2282051ee6"
+            },
+        ),
+    ],
+)
+def test_fetch(specs: list[str], md5tree):
+    dv = Hydroshare()
+
+    for spec in specs:
+        with TemporaryDirectory() as d:
+            output = []
+            for l in dv.fetch(dv.detect(spec), d):
+                output.append(l)
+
+            # Verify md5 sum of the files we expect to find
+            # We are using md5 instead of something more secure because that is what
+            # dataverse itself uses
+            for subpath, expected_sha in md5tree.items():
+                with open(os.path.join(d, subpath), "rb") as f:
+                    h = hashlib.md5()
+                    h.update(f.read())
+                    assert h.hexdigest() == expected_sha
+
diff --git a/tests/unit/contentproviders/test_hydroshare.py b/tests/unit/contentproviders/test_hydroshare.py
deleted file mode 100755
index 41e234cd1..000000000
--- a/tests/unit/contentproviders/test_hydroshare.py
+++ /dev/null
@@ -1,188 +0,0 @@
-import os
-import re
-from contextlib import contextmanager
-from tempfile import NamedTemporaryFile, TemporaryDirectory
-from unittest.mock import patch
-from zipfile import ZipFile
-
-import pytest
-
-from repo2docker.contentproviders import Hydroshare
-from repo2docker.contentproviders.base import ContentProviderException
-
-doi_responses = {
-    "https://doi.org/10.4211/hs.b8f6eae9d89241cf8b5904033460af61": (
-        "https://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61"
-    ),
-    "https://doi.org/10.21105/joss.01277": (
-        "https://joss.theoj.org/papers/10.21105/joss.01277"
-    ),
-}
-
-
-def doi_resolver(req, context):
-    resp = doi_responses.get(req.url)
-    # doi responses are redirects
-    if resp is not None:
-        context.status_code = 302
-        context.headers["Location"] = resp
-    return resp
-
-
-hydroshare_data = {
-    "dates": [{"type": "modified", "start_date": "2019-09-25T16:09:17.006152Z"}]
-}
-
-
-def test_content_id():
-    hydro = Hydroshare()
-
-    hydro.detect("10.4211/hs.b8f6eae9d89241cf8b5904033460af61")
-    assert hydro.content_id == "b8f6eae9d89241cf8b5904033460af61.v1585005408"
-
-
-def test_detect_hydroshare():
-    # valid Hydroshare DOIs trigger this content provider
-    expected = {
-        "host": {
-            "hostname": [
-                "https://www.hydroshare.org/resource/",
-                "http://www.hydroshare.org/resource/",
-            ],
-            "django_irods": "https://www.hydroshare.org/django_irods/download/bags/",
-            "version": "https://www.hydroshare.org/hsapi/resource/{}/scimeta/elements",
-        },
-        "resource": "b8f6eae9d89241cf8b5904033460af61",
-        "version": "1585005408",
-    }
-
-    assert (
-        Hydroshare().detect(
-            "https://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61"
-        )
-        == expected
-    )
-
-    assert (
-        Hydroshare().detect("10.4211/hs.b8f6eae9d89241cf8b5904033460af61") == expected
-    )
-
-    assert (
-        Hydroshare().detect(
-            "https://doi.org/10.4211/hs.b8f6eae9d89241cf8b5904033460af61"
-        )
-        == expected
-    )
-
-    # Don't trigger the Hydroshare content provider
-    assert Hydroshare().detect("/some/path/here") is None
-    assert Hydroshare().detect("https://example.com/path/here") is None
-
-    # don't handle DOIs that aren't from Hydroshare
-    assert Hydroshare().detect("https://doi.org/10.21105/joss.01277") is None
-
-
-@contextmanager
-def hydroshare_archive(prefix="b8f6eae9d89241cf8b5904033460af61/data/contents"):
-    with NamedTemporaryFile(suffix=".zip") as zfile:
-        with ZipFile(zfile.name, mode="w") as zip:
-            zip.writestr(f"{prefix}/some-file.txt", "some content")
-            zip.writestr(f"{prefix}/some-other-file.txt", "some more content")
-
-        yield zfile
-
-
-class MockResponse:
-    def __init__(self, content_type, status_code):
-        self.status_code = status_code
-        self.headers = dict()
-        self.headers["content-type"] = content_type
-
-
-def test_fetch_bag():
-    # we "fetch" a local ZIP file to simulate a Hydroshare resource
-    with hydroshare_archive() as hydro_path:
-        with patch.object(
-            Hydroshare,
-            "urlopen",
-            side_effect=[
-                MockResponse("application/html", 200),
-                MockResponse("application/zip", 200),
-            ],
-        ):
-            with patch.object(
-                Hydroshare, "_urlretrieve", side_effect=[(hydro_path, None)]
-            ):
-                hydro = Hydroshare()
-                hydro.resource_id = "b8f6eae9d89241cf8b5904033460af61"
-                spec = {
-                    "host": {
-                        "hostname": [
-                            "https://www.hydroshare.org/resource/",
-                            "http://www.hydroshare.org/resource/",
-                        ],
-                        "django_irods": "https://www.hydroshare.org/django_irods/download/bags/",
-                    },
-                    "resource": "123456789",
-                }
-
-                with TemporaryDirectory() as d:
-                    output = []
-                    for l in hydro.fetch(spec, d):
-                        output.append(l)
-
-                    unpacked_files = set(os.listdir(d))
-                    expected = {"some-other-file.txt", "some-file.txt"}
-                    assert expected == unpacked_files
-
-
-def test_fetch_bag_failure():
-    with hydroshare_archive():
-        with patch.object(
-            Hydroshare, "urlopen", side_effect=[MockResponse("application/html", 500)]
-        ):
-            hydro = Hydroshare()
-            spec = {
-                "host": {
-                    "hostname": [
-                        "https://www.hydroshare.org/resource/",
-                        "http://www.hydroshare.org/resource/",
-                    ],
-                    "django_irods": "https://www.hydroshare.org/django_irods/download/bags/",
-                },
-                "resource": "123456789",
-            }
-            with TemporaryDirectory() as d:
-                with pytest.raises(
-                    ContentProviderException,
-                    match=r"Failed to download bag\. status code 500\.",
-                ):
-                    # loop for yield statements
-                    for l in hydro.fetch(spec, d):
-                        pass
-
-
-def test_fetch_bag_timeout():
-    with hydroshare_archive():
-        with patch.object(
-            Hydroshare, "urlopen", side_effect=[MockResponse("application/html", 200)]
-        ):
-            hydro = Hydroshare()
-            spec = {
-                "host": {
-                    "hostname": [
-                        "https://www.hydroshare.org/resource/",
-                        "http://www.hydroshare.org/resource/",
-                    ],
-                    "django_irods": "https://www.hydroshare.org/django_irods/download/bags/",
-                },
-                "resource": "123456789",
-            }
-            with TemporaryDirectory() as d:
-                with pytest.raises(
-                    ContentProviderException,
-                    match=r"Bag taking too long to prepare, exiting now, try again later\.",
-                ):
-                    # loop for yield statements
-                    for l in hydro.fetch(spec, d, timeout=0):
-                        pass

From 2c577f9ef63fb564961786e6a8c691880578a8a5 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sat, 21 Dec 2024 00:52:20 +0000
Subject: [PATCH 3/4] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 repo2docker/contentproviders/hydroshare.py | 10 +++++----
 setup.py                                   |  1 +
 tests/contentproviders/test_hydroshare.py  | 26 ++++++++++++++--------
 3 files changed, 24 insertions(+), 13 deletions(-)

diff --git a/repo2docker/contentproviders/hydroshare.py b/repo2docker/contentproviders/hydroshare.py
index ce870b59e..ba2323659 100755
--- a/repo2docker/contentproviders/hydroshare.py
+++ b/repo2docker/contentproviders/hydroshare.py
@@ -1,16 +1,16 @@
 import json
 import os
 import shutil
-import time
 import tempfile
+import time
 import zipfile
 from datetime import datetime, timedelta, timezone
-from urllib.request import urlretrieve
 from urllib.parse import urlparse, urlunparse
+from urllib.request import urlretrieve
 
+from ..utils import is_doi
 from .base import ContentProviderException
 from .doi import DoiProvider
-from ..utils import is_doi
 
 
 class Hydroshare(DoiProvider):
@@ -70,7 +70,9 @@ def fetch(self, spec, output_dir, yield_output=False, timeout=120):
         parts = urlparse(url)
         self.resource_id = parts.path.strip("/").rsplit("/", maxsplit=1)[1]
 
-        bag_url = urlunparse(parts._replace(path=f"django_irods/download/bags/{self.resource_id}"))
+        bag_url = urlunparse(
+            parts._replace(path=f"django_irods/download/bags/{self.resource_id}")
+        )
 
         yield f"Downloading {bag_url}.\n"
 
diff --git a/setup.py b/setup.py
index d37aa17cc..a4e59d8cf 100644
--- a/setup.py
+++ b/setup.py
@@ -26,6 +26,7 @@ def finalize_options(self):
 
     def run(self):
         import json
+
         import requests
 
         resp = requests.get(self.url)
diff --git a/tests/contentproviders/test_hydroshare.py b/tests/contentproviders/test_hydroshare.py
index abd485f33..e6d8a4c8f 100755
--- a/tests/contentproviders/test_hydroshare.py
+++ b/tests/contentproviders/test_hydroshare.py
@@ -1,5 +1,5 @@
-import os
 import hashlib
+import os
 from tempfile import TemporaryDirectory
 
 import pytest
@@ -11,16 +11,25 @@
     ("spec", "url"),
     [
         # Test a hydroshare DOI
-        ("10.4211/hs.b8f6eae9d89241cf8b5904033460af61", "http://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61"),
+        (
+            "10.4211/hs.b8f6eae9d89241cf8b5904033460af61",
+            "http://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61",
+        ),
         # Hydroshare DOI in a different form
-        ("https://doi.org/10.4211/hs.b8f6eae9d89241cf8b5904033460af61", "http://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61"),
+        (
+            "https://doi.org/10.4211/hs.b8f6eae9d89241cf8b5904033460af61",
+            "http://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61",
+        ),
         # Test a non-hydroshare DOI
         ("doi:10.7910/DVN/TJCLKP", None),
         # Test a hydroshare URL
-        ("http://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61", "http://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61"),
+        (
+            "http://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61",
+            "http://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61",
+        ),
         # Test a random URL
-        ("https://www.eff.org/cyberspace-independence", None)
-    ]
+        ("https://www.eff.org/cyberspace-independence", None),
+    ],
 )
 def test_detect(spec, url):
     assert Hydroshare().detect(spec) == url
@@ -30,11 +39,11 @@ def test_detect(spec, url):
     ("specs", "md5tree"),
     [
         (
-            ("https://www.hydroshare.org/resource/8f7c2f0341ef4180b0dbe97f59130756/", ),
+            ("https://www.hydroshare.org/resource/8f7c2f0341ef4180b0dbe97f59130756/",),
             {
                 "binder/Dockerfile": "872ab0ef22645a42a5560eae640cdc77",
                 "README.md": "88ac547c3a5f616f6d26e0689d63a113",
-                "notebooks/sanity-check.ipynb": "7fc4c455bc8cd244479f4d2282051ee6"
+                "notebooks/sanity-check.ipynb": "7fc4c455bc8cd244479f4d2282051ee6",
             },
         ),
     ],
@@ -56,4 +65,3 @@ def test_fetch(specs: list[str], md5tree):
                     h = hashlib.md5()
                     h.update(f.read())
                     assert h.hexdigest() == expected_sha
-

From e5b8c975d8cd291b5c5459ec0c71af94320354dc Mon Sep 17 00:00:00 2001
From: YuviPanda <yuvipanda@gmail.com>
Date: Fri, 20 Dec 2024 16:56:14 -0800
Subject: [PATCH 4/4] Remove another layer of indirection

---
 repo2docker/contentproviders/hydroshare.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/repo2docker/contentproviders/hydroshare.py b/repo2docker/contentproviders/hydroshare.py
index ba2323659..24a4d888e 100755
--- a/repo2docker/contentproviders/hydroshare.py
+++ b/repo2docker/contentproviders/hydroshare.py
@@ -60,13 +60,9 @@ def detect(self, spec, ref=None, extra_args=None):
         if parsed.netloc in self.HYDROSHARE_DOMAINS:
             return url
 
-    def _urlretrieve(self, bag_url):
-        return urlretrieve(bag_url)
-
     def fetch(self, spec, output_dir, yield_output=False, timeout=120):
         """Fetch and unpack a Hydroshare resource"""
         url = spec
-        print(url)
         parts = urlparse(url)
         self.resource_id = parts.path.strip("/").rsplit("/", maxsplit=1)[1]
 
@@ -98,7 +94,7 @@ def fetch(self, spec, output_dir, yield_output=False, timeout=120):
             raise ContentProviderException(msg)
         # Bag creation seems to need a small time buffer after it says it's ready.
         time.sleep(1)
-        filehandle, _ = self._urlretrieve(bag_url)
+        filehandle, _ = urlretrieve(bag_url)
         zip_file_object = zipfile.ZipFile(filehandle, "r")
         yield "Downloaded, unpacking contents.\n"