Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cleanup hydroshare provider and stop using urlopen #1393

Draft
wants to merge 4 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 3 additions & 5 deletions repo2docker/contentproviders/ckan.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def _fetch_version(self, api_url):
Borrowed from the Hydroshare provider.
"""
package_show_url = f"{api_url}package_show?id={self.dataset_id}"
resp = self.urlopen(package_show_url).json()
resp = self.session.get(package_show_url).json()
date = resp["result"]["metadata_modified"]
parsed_date = datetime.strptime(date, "%Y-%m-%dT%H:%M:%S.%f")
epoch = parsed_date.replace(tzinfo=timezone(timedelta(0))).timestamp()
Expand All @@ -35,8 +35,6 @@ def _fetch_version(self, api_url):
def _request(self, url, **kwargs):
    """Perform an HTTP GET through the shared requests session, forwarding kwargs (headers, params, ...)."""
    return self.session.get(url, **kwargs)

urlopen = _request

def detect(self, source, ref=None, extra_args=None):
"""Trigger this provider for things that resolve to a CKAN dataset."""
parsed_url = urlparse(source)
Expand All @@ -58,7 +56,7 @@ def detect(self, source, ref=None, extra_args=None):
).geturl()

status_show_url = f"{api_url}status_show"
resp = self.urlopen(status_show_url)
resp = self.session.get(status_show_url)
if resp.status_code == 200:

# Activity ID may be present either as a query parameter, activity_id
Expand Down Expand Up @@ -97,7 +95,7 @@ def fetch(self, spec, output_dir, yield_output=False):
{"id": dataset_id}
)

resp = self.urlopen(
resp = self.session.get(
fetch_url,
headers={"accept": "application/json"},
)
Expand Down
15 changes: 0 additions & 15 deletions repo2docker/contentproviders/doi.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,21 +27,6 @@ def __init__(self):
def _request(self, url, **kwargs):
    """Perform an HTTP GET through the shared requests session, forwarding kwargs (headers, params, ...)."""
    return self.session.get(url, **kwargs)

urlopen = _request

def _urlopen(self, req, headers=None):
"""A urlopen() helper"""
# someone passed a string, not a request
if not isinstance(req, request.Request):
req = request.Request(req)

req.add_header("User-Agent", f"repo2docker {__version__}")
if headers is not None:
for key, value in headers.items():
req.add_header(key, value)

return request.urlopen(req)

def doi2url(self, doi):
# Transform a DOI to a URL
# If not a doi, assume we have a URL and return
Expand Down
2 changes: 1 addition & 1 deletion repo2docker/contentproviders/figshare.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def fetch(self, spec, output_dir, yield_output=False):
host = spec["host"]

yield f"Fetching Figshare article {article_id} in version {article_version}.\n"
resp = self.urlopen(
resp = self.session.get(
f'{host["api"]}{article_id}/versions/{article_version}',
headers={"accept": "application/json"},
)
Expand Down
75 changes: 43 additions & 32 deletions repo2docker/contentproviders/hydroshare.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,30 @@
import json
import os
import shutil
import tempfile
import time
import zipfile
from datetime import datetime, timedelta, timezone
from urllib.parse import urlparse, urlunparse
from urllib.request import urlretrieve

from ..utils import is_doi
from .base import ContentProviderException
from .doi import DoiProvider


class Hydroshare(DoiProvider):
"""Provide contents of a Hydroshare resource."""

def _fetch_version(self, host):
"""Fetch resource modified date and convert to epoch"""
json_response = self.urlopen(host["version"].format(self.resource_id)).json()
HYDROSHARE_DOMAINS = ["www.hydroshare.org"]

def get_version(self, resource_id: str) -> str:
"""
Get current version of given resource_id
"""
api_url = f"https://{self.HYDROSHARE_DOMAIN}/hsapi/resource/{resource_id}/scimeta/elements"

json_response = self.session.get(api_url).json()
date = next(
item for item in json_response["dates"] if item["type"] == "modified"
)["start_date"]
Expand All @@ -26,7 +35,7 @@ def _fetch_version(self, host):
# truncate the timestamp
return str(int(epoch))

def detect(self, doi, ref=None, extra_args=None):
def detect(self, spec, ref=None, extra_args=None):
"""Trigger this provider for things that resolve to a Hydroshare resource"""
hosts = [
{
Expand All @@ -35,35 +44,36 @@ def detect(self, doi, ref=None, extra_args=None):
"http://www.hydroshare.org/resource/",
],
"django_irods": "https://www.hydroshare.org/django_irods/download/bags/",
"version": "https://www.hydroshare.org/hsapi/resource/{}/scimeta/elements",
"version": "",
}
]
url = self.doi2url(doi)

for host in hosts:
if any([url.startswith(s) for s in host["hostname"]]):
self.resource_id = url.strip("/").rsplit("/", maxsplit=1)[1]
self.version = self._fetch_version(host)
return {
"resource": self.resource_id,
"host": host,
"version": self.version,
}
# Our spec could be a doi that resolves to a hydroshare URL, or a hydroshare URL
if is_doi(spec):
url = self.doi2url(spec)
else:
url = spec

def _urlretrieve(self, bag_url):
return urlretrieve(bag_url)
parsed = urlparse(url)

print(url)
if parsed.netloc in self.HYDROSHARE_DOMAINS:
return url

def fetch(self, spec, output_dir, yield_output=False, timeout=120):
"""Fetch and unpack a Hydroshare resource"""
resource_id = spec["resource"]
host = spec["host"]
url = spec
parts = urlparse(url)
self.resource_id = parts.path.strip("/").rsplit("/", maxsplit=1)[1]

bag_url = f'{host["django_irods"]}{resource_id}'
bag_url = urlunparse(
parts._replace(path=f"django_irods/download/bags/{self.resource_id}")
)

yield f"Downloading {bag_url}.\n"

# bag downloads are prepared on demand and may need some time
conn = self.urlopen(bag_url)
conn = self.session.get(bag_url)
total_wait_time = 0
while (
conn.status_code == 200
Expand All @@ -77,26 +87,27 @@ def fetch(self, spec, output_dir, yield_output=False, timeout=120):
raise ContentProviderException(msg)
yield f"Bag is being prepared, requesting again in {wait_time} seconds.\n"
time.sleep(wait_time)
conn = self.urlopen(bag_url)
conn = self.session.get(bag_url)
if conn.status_code != 200:
msg = f"Failed to download bag. status code {conn.status_code}.\n"
yield msg
raise ContentProviderException(msg)
# Bag creation seems to need a small time buffer after it says it's ready.
time.sleep(1)
filehandle, _ = self._urlretrieve(bag_url)
filehandle, _ = urlretrieve(bag_url)
zip_file_object = zipfile.ZipFile(filehandle, "r")
yield "Downloaded, unpacking contents.\n"
zip_file_object.extractall("temp")
# resources store the contents in the data/contents directory, which is all we want to keep
contents_dir = os.path.join("temp", self.resource_id, "data", "contents")
files = os.listdir(contents_dir)
for f in files:
shutil.move(os.path.join(contents_dir, f), output_dir)
yield "Finished, cleaning up.\n"
shutil.rmtree("temp")

with tempfile.TemporaryDirectory() as d:
zip_file_object.extractall(d)
# resources store the contents in the data/contents directory, which is all we want to keep
contents_dir = os.path.join(d, self.resource_id, "data", "contents")
files = os.listdir(contents_dir)
for f in files:
shutil.move(os.path.join(contents_dir, f), output_dir)
yield "Finished, cleaning up.\n"

@property
def content_id(self):
"""The HydroShare resource ID"""
return f"{self.resource_id}.v{self.version}"
return f"{self.resource_id}"
4 changes: 2 additions & 2 deletions repo2docker/contentproviders/zenodo.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ def fetch(self, spec, output_dir, yield_output=False):
host = spec["host"]

yield f"Fetching Zenodo record {record_id}.\n"
resp = self.urlopen(
resp = self.session.get(
f'{host["api"]}{record_id}',
headers={"accept": "application/json"},
)
Expand All @@ -82,7 +82,7 @@ def fetch(self, spec, output_dir, yield_output=False):
if host["files"]:
yield f"Fetching Zenodo record {record_id} files.\n"
files_url = deep_get(record, host["files"])
resp = self.urlopen(
resp = self.session.get(
files_url,
headers={"accept": "application/json"},
)
Expand Down
8 changes: 4 additions & 4 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,11 @@ def finalize_options(self):

def run(self):
import json
from urllib.request import urlopen

resp = urlopen(self.url, timeout=5)
resp_body = resp.read()
data = json.loads(resp_body.decode("utf-8"))
import requests

resp = requests.get(self.url)
data = resp.json()
if "installations" not in data:
raise ValueError("Malformed installation map.")

Expand Down
67 changes: 67 additions & 0 deletions tests/contentproviders/test_hydroshare.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
import hashlib
import os
from tempfile import TemporaryDirectory

import pytest

from repo2docker.contentproviders import Hydroshare


@pytest.mark.parametrize(
    ("spec", "url"),
    [
        # A bare HydroShare DOI resolves to the resource URL
        (
            "10.4211/hs.b8f6eae9d89241cf8b5904033460af61",
            "http://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61",
        ),
        # The same DOI expressed as a doi.org URL
        (
            "https://doi.org/10.4211/hs.b8f6eae9d89241cf8b5904033460af61",
            "http://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61",
        ),
        # A DOI belonging to another provider is not detected
        ("doi:10.7910/DVN/TJCLKP", None),
        # A direct HydroShare resource URL is detected as-is
        (
            "http://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61",
            "http://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61",
        ),
        # An unrelated URL is not detected
        ("https://www.eff.org/cyberspace-independence", None),
    ],
)
def test_detect(spec, url):
    """detect() maps DOIs/URLs to a HydroShare resource URL, or None for non-HydroShare specs."""
    provider = Hydroshare()
    detected = provider.detect(spec)
    assert detected == url


@pytest.mark.parametrize(
    ("specs", "md5tree"),
    [
        (
            ("https://www.hydroshare.org/resource/8f7c2f0341ef4180b0dbe97f59130756/",),
            {
                "binder/Dockerfile": "872ab0ef22645a42a5560eae640cdc77",
                "README.md": "88ac547c3a5f616f6d26e0689d63a113",
                "notebooks/sanity-check.ipynb": "7fc4c455bc8cd244479f4d2282051ee6",
            },
        ),
    ],
)
def test_fetch(specs: list[str], md5tree):
    """Fetch each HydroShare spec into a temp dir and verify the md5 of expected files.

    Hits the live hydroshare.org service (network test).
    """
    hs = Hydroshare()

    for spec in specs:
        with TemporaryDirectory() as d:
            # fetch() is a generator of progress messages; drain it fully so
            # the download and unpacking actually run.
            output = list(hs.fetch(hs.detect(spec), d))

            # Verify md5 sums of the files we expect to find.
            # md5 (rather than a stronger hash) is used here presumably to
            # match the checksums published upstream — mirrors the other
            # content-provider tests.
            for subpath, expected_md5 in md5tree.items():
                with open(os.path.join(d, subpath), "rb") as f:
                    digest = hashlib.md5()
                    digest.update(f.read())
                    assert digest.hexdigest() == expected_md5
2 changes: 1 addition & 1 deletion tests/unit/contentproviders/test_doi.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def test_url_headers(requests_mock):
doi = DoiProvider()

headers = {"test1": "value1", "Test2": "value2"}
result = doi.urlopen("https://mybinder.org", headers=headers)
result = doi.session.get("https://mybinder.org", headers=headers)
assert "test1" in result.request.headers
assert "Test2" in result.request.headers
assert result.request.headers["User-Agent"] == f"repo2docker {__version__}"
Expand Down
1 change: 0 additions & 1 deletion tests/unit/contentproviders/test_figshare.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
from io import BytesIO
from tempfile import NamedTemporaryFile, TemporaryDirectory
from unittest.mock import patch
from urllib.request import Request, urlopen
from zipfile import ZipFile

import pytest
Expand Down
Loading