Skip to content

Commit

Permalink
Clean up hydroshare provider
Browse files Browse the repository at this point in the history
- Clean up how the provider is detected, as we were simply doing
  a domain check, but with many extra steps
- Move the tests to be real integration tests
- Test detection, not content_id
  • Loading branch information
yuvipanda committed Dec 21, 2024
1 parent aec0e02 commit cd9911c
Show file tree
Hide file tree
Showing 3 changed files with 100 additions and 216 deletions.
69 changes: 41 additions & 28 deletions repo2docker/contentproviders/hydroshare.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,29 @@
import os
import shutil
import time
import tempfile
import zipfile
from datetime import datetime, timedelta, timezone
from urllib.request import urlretrieve
from urllib.parse import urlparse, urlunparse

from .base import ContentProviderException
from .doi import DoiProvider
from ..utils import is_doi


class Hydroshare(DoiProvider):
"""Provide contents of a Hydroshare resource."""

def _fetch_version(self, host):
"""Fetch resource modified date and convert to epoch"""
json_response = self.session.get(host["version"].format(self.resource_id)).json()
HYDROSHARE_DOMAINS = ["www.hydroshare.org"]

def get_version(self, resource_id: str) -> str:
"""
Get current version of given resource_id
"""
api_url = f"https://{self.HYDROSHARE_DOMAIN}/hsapi/resource/{resource_id}/scimeta/elements"

json_response = self.session.get(api_url).json()
date = next(
item for item in json_response["dates"] if item["type"] == "modified"
)["start_date"]
Expand All @@ -26,7 +35,7 @@ def _fetch_version(self, host):
# truncate the timestamp
return str(int(epoch))

def detect(self, doi, ref=None, extra_args=None):
def detect(self, spec, ref=None, extra_args=None):
"""Trigger this provider for things that resolve to a Hydroshare resource"""
hosts = [
{
Expand All @@ -35,30 +44,33 @@ def detect(self, doi, ref=None, extra_args=None):
"http://www.hydroshare.org/resource/",
],
"django_irods": "https://www.hydroshare.org/django_irods/download/bags/",
"version": "https://www.hydroshare.org/hsapi/resource/{}/scimeta/elements",
"version": "",
}
]
url = self.doi2url(doi)

for host in hosts:
if any([url.startswith(s) for s in host["hostname"]]):
self.resource_id = url.strip("/").rsplit("/", maxsplit=1)[1]
self.version = self._fetch_version(host)
return {
"resource": self.resource_id,
"host": host,
"version": self.version,
}

# Our spec could be a doi that resolves to a hydroshare URL, or a hydroshare URL
if is_doi(spec):
url = self.doi2url(spec)
else:
url = spec

parsed = urlparse(url)

print(url)
if parsed.netloc in self.HYDROSHARE_DOMAINS:
return url

def _urlretrieve(self, bag_url):
    """Download ``bag_url`` via ``urllib.request.urlretrieve`` and return its result.

    The return value is passed through untouched; ``fetch`` unpacks it as a
    ``(filehandle, _)`` pair.
    """
    # NOTE(review): presumably kept as a separate method so the download can be
    # overridden or stubbed — confirm against callers.
    retrieved = urlretrieve(bag_url)
    return retrieved

def fetch(self, spec, output_dir, yield_output=False, timeout=120):
"""Fetch and unpack a Hydroshare resource"""
resource_id = spec["resource"]
host = spec["host"]
url = spec
print(url)
parts = urlparse(url)
self.resource_id = parts.path.strip("/").rsplit("/", maxsplit=1)[1]

bag_url = f'{host["django_irods"]}{resource_id}'
bag_url = urlunparse(parts._replace(path=f"django_irods/download/bags/{self.resource_id}"))

yield f"Downloading {bag_url}.\n"

Expand Down Expand Up @@ -87,16 +99,17 @@ def fetch(self, spec, output_dir, yield_output=False, timeout=120):
filehandle, _ = self._urlretrieve(bag_url)
zip_file_object = zipfile.ZipFile(filehandle, "r")
yield "Downloaded, unpacking contents.\n"
zip_file_object.extractall("temp")
# resources store the contents in the data/contents directory, which is all we want to keep
contents_dir = os.path.join("temp", self.resource_id, "data", "contents")
files = os.listdir(contents_dir)
for f in files:
shutil.move(os.path.join(contents_dir, f), output_dir)
yield "Finished, cleaning up.\n"
shutil.rmtree("temp")

with tempfile.TemporaryDirectory() as d:
zip_file_object.extractall(d)
# resources store the contents in the data/contents directory, which is all we want to keep
contents_dir = os.path.join(d, self.resource_id, "data", "contents")
files = os.listdir(contents_dir)
for f in files:
shutil.move(os.path.join(contents_dir, f), output_dir)
yield "Finished, cleaning up.\n"

@property
def content_id(self):
"""The HydroShare resource ID"""
return f"{self.resource_id}.v{self.version}"
return f"{self.resource_id}"
59 changes: 59 additions & 0 deletions tests/contentproviders/test_hydroshare.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import os
import hashlib
from tempfile import TemporaryDirectory

import pytest

from repo2docker.contentproviders import Hydroshare


@pytest.mark.parametrize(
    ("spec", "url"),
    [
        # Test a hydroshare DOI
        (
            "10.4211/hs.b8f6eae9d89241cf8b5904033460af61",
            "http://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61",
        ),
        # Hydroshare DOI in a different form
        (
            "https://doi.org/10.4211/hs.b8f6eae9d89241cf8b5904033460af61",
            "http://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61",
        ),
        # Test a non-hydroshare DOI
        (
            "doi:10.7910/DVN/TJCLKP",
            None,
        ),
        # Test a hydroshare URL
        (
            "http://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61",
            "http://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61",
        ),
        # Test a random URL
        (
            "https://www.eff.org/cyberspace-independence",
            None,
        ),
    ],
)
def test_detect(spec, url):
    """Check what ``Hydroshare.detect`` returns for each kind of spec.

    Hydroshare DOIs and hydroshare URLs are expected to yield the resource
    URL; anything else is expected to yield ``None``.
    """
    # NOTE(review): the DOI cases presumably need network access to resolve —
    # confirm before running this in an offline environment.
    provider = Hydroshare()
    detected = provider.detect(spec)
    assert detected == url


@pytest.mark.parametrize(
    ("specs", "md5tree"),
    [
        (
            ("https://www.hydroshare.org/resource/8f7c2f0341ef4180b0dbe97f59130756/",),
            {
                "binder/Dockerfile": "872ab0ef22645a42a5560eae640cdc77",
                "README.md": "88ac547c3a5f616f6d26e0689d63a113",
                "notebooks/sanity-check.ipynb": "7fc4c455bc8cd244479f4d2282051ee6",
            },
        ),
    ],
)
def test_fetch(specs: tuple[str, ...], md5tree: dict[str, str]) -> None:
    """Fetch a real Hydroshare resource and verify the files it unpacks.

    Args:
        specs: Equivalent specs that must all resolve to the same resource.
        md5tree: Mapping of path (relative to the output directory) to the
            expected md5 hex digest of that file's contents.
    """
    provider = Hydroshare()

    for spec in specs:
        with TemporaryDirectory() as output_dir:
            detected = provider.detect(spec)
            # Fail here with a clear message instead of passing None to fetch().
            assert detected is not None, f"{spec} was not detected as a Hydroshare resource"

            # fetch() is a generator that yields progress messages; it has to be
            # fully consumed for the download and unpacking to actually happen.
            for _ in provider.fetch(detected, output_dir):
                pass

            # md5 serves purely as a content fingerprint for the fixture files;
            # nothing here relies on it being cryptographically secure.
            for subpath, expected_md5 in md5tree.items():
                with open(os.path.join(output_dir, subpath), "rb") as f:
                    actual_md5 = hashlib.md5(f.read()).hexdigest()
                assert actual_md5 == expected_md5, f"md5 mismatch for {subpath}"

188 changes: 0 additions & 188 deletions tests/unit/contentproviders/test_hydroshare.py

This file was deleted.

0 comments on commit cd9911c

Please sign in to comment.