From 22cde2c35ba52aaa2fd559862a60f2740b6f9308 Mon Sep 17 00:00:00 2001 From: stevejpurves Date: Mon, 19 Feb 2024 21:35:54 +0000 Subject: [PATCH 1/7] Added meca provider --- binderhub/app.py | 2 + binderhub/event-schemas/launch.json | 3 +- binderhub/main.py | 1 + binderhub/repoproviders.py | 100 +++++++++++++++++++++++- binderhub/static/js/src/form.js | 33 ++++---- docs/source/reference/repoproviders.rst | 6 ++ 6 files changed, 127 insertions(+), 18 deletions(-) diff --git a/binderhub/app.py b/binderhub/app.py index 5eb97aa9c..59d86d623 100644 --- a/binderhub/app.py +++ b/binderhub/app.py @@ -56,6 +56,7 @@ from .ratelimit import RateLimiter from .registry import DockerRegistry from .repoproviders import ( + MecaRepoProvider, DataverseProvider, FigshareProvider, GistRepoProvider, @@ -586,6 +587,7 @@ def _default_build_namespace(self): "figshare": FigshareProvider, "hydroshare": HydroshareProvider, "dataverse": DataverseProvider, + "meca": MecaRepoProvider, }, config=True, help=""" diff --git a/binderhub/event-schemas/launch.json b/binderhub/event-schemas/launch.json index 16e277cf4..09c8b3f39 100644 --- a/binderhub/event-schemas/launch.json +++ b/binderhub/event-schemas/launch.json @@ -14,7 +14,8 @@ "Zenodo", "Figshare", "Hydroshare", - "Dataverse" + "Dataverse", + "MECA" ], "description": "Provider for the repository being launched" }, diff --git a/binderhub/main.py b/binderhub/main.py index 2a2027598..ba2b4f4b5 100644 --- a/binderhub/main.py +++ b/binderhub/main.py @@ -22,6 +22,7 @@ "figshare": "Figshare", "hydroshare": "Hydroshare", "dataverse": "Dataverse", + "meca": "MECA", } diff --git a/binderhub/repoproviders.py b/binderhub/repoproviders.py index be9dd75f4..57303072d 100644 --- a/binderhub/repoproviders.py +++ b/binderhub/repoproviders.py @@ -15,7 +15,7 @@ import time import urllib.parse from datetime import datetime, timedelta, timezone -from urllib.parse import urlparse +from urllib.parse import urlparse, urlunparse import escapism from prometheus_client import Gauge @@ -263,6 +263,104 @@ def get_build_slug(self): return f"zenodo-{self.record_id}" +class MecaRepoProvider(RepoProvider): + """BinderHub Provider that can handle the contents of a MECA bundle + + Users must provide a spec consisting of a public the URL to the bundle + The URL origin must conform to the origin trait when that is set + """ + + name = Unicode("MECA Bundle") + + display_name = "MECA Bundle" + + labels = { + "text": "MECA Bundle URL (https://journals.curvenote.com/journal/submissions/12345/meca.zip)", + "tag_text": "", + "ref_prop_disabled": True, + "label_prop_disabled": True, + } + + validate_bundle = Bool(config=True, help="Validate the file as MECA Bundle").tag( + default=True + ) + + allowed_origins = List( + config=True, + help="""List of allowed origins for the URL + + If set, the URL must be on one of these origins. + + If not set, the URL can be on any origin. + """, + ) + + @default("allowed_origins") + def _allowed_origins_default(self): + return [] + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + url = unquote(self.spec) + + if not val.url(url): + raise ValueError(f"[MecaRepoProvider] Invalid URL {url}") + + if ( + len(self.allowed_origins) > 0 + and urlparse(self.spec).hostname not in self.allowed_origins + ): + raise ValueError("URL is not on an allowed origin") + + self.url = url + + self.log.info(f"MECA Bundle URL: {self.url}") + self.log.info(f"MECA Bundle raw spec: {self.spec}") + + async def get_resolved_ref(self): + # Check the URL is reachable + client = AsyncHTTPClient() + req = HTTPRequest(self.url, method="HEAD", user_agent="BinderHub") + self.log.info(f"get_resolved_ref() HEAD: {self.url}") + try: + r = await client.fetch(req) + self.log.info(f"URL is reachable: {self.url}") + self.hashed_slug = get_hashed_slug( + self.url, r.headers.get("ETag") or r.headers.get("Content-Length") + ) + except Exception as e: + raise ValueError(f"URL is unreachable ({e})") + + self.log.info(f"hashed_slug: {self.hashed_slug}") + return self.hashed_slug + + async def get_resolved_spec(self): + if not hasattr(self, "hashed_slug"): + await self.get_resolved_ref() + self.log.info(f"get_resolved_spec(): {self.hashed_slug}") + return self.spec + + async def get_resolved_ref_url(self): + self.log.info(f"get_resolved_ref_url(): {self.url}") + return self.url + + def get_repo_url(self): + """This is passed to repo2docker and is the URL that is to be fetched + with a `http[s]+meca` protocol string. We do this by convention to enable + detection of meca urls by the MecaContentProvider. + """ + parsed = urlparse(self.url) + parsed = parsed._replace(scheme=f"{parsed.scheme}+meca") + url = urlunparse(parsed) + self.log.info(f"get_repo_url(): {url}") + return url + + def get_build_slug(self): + """Should return a unique build slug""" + return self.hashed_slug + + class FigshareProvider(RepoProvider): """Provide contents of a Figshare article diff --git a/binderhub/static/js/src/form.js b/binderhub/static/js/src/form.js index cc00d7b45..e18b09fa2 100644 --- a/binderhub/static/js/src/form.js +++ b/binderhub/static/js/src/form.js @@ -1,4 +1,4 @@ -import { getPathType } from "./path"; +import { getPathType } from './path'; /** * Parse current values in form and return them with appropriate URL encoding @@ -11,31 +11,32 @@ import { getPathType } from "./path"; * @returns {} */ export function getBuildFormValues() { - const providerPrefix = $("#provider_prefix").val().trim(); - let repo = $("#repository").val().trim(); - if (providerPrefix !== "git") { - repo = repo.replace(/^(https?:\/\/)?gist.github.com\//, ""); - repo = repo.replace(/^(https?:\/\/)?github.com\//, ""); - repo = repo.replace(/^(https?:\/\/)?gitlab.com\//, ""); + const providerPrefix = $('#provider_prefix').val().trim(); + let repo = $('#repository').val().trim(); + if (providerPrefix !== 'git') { + repo = repo.replace(/^(https?:\/\/)?gist.github.com\//, ''); + repo = repo.replace(/^(https?:\/\/)?github.com\//, ''); + repo = repo.replace(/^(https?:\/\/)?gitlab.com\//, ''); } // trim trailing or leading '/' on repo - repo = repo.replace(/(^\/)|(\/?$)/g, ""); + repo = repo.replace(/(^\/)|(\/?$)/g, ''); // git providers encode the URL of the git repository as the repo // argument. - if (repo.includes("://") || providerPrefix === "gl") { + if (repo.includes('://') || providerPrefix === 'gl') { repo = encodeURIComponent(repo); } - let ref = $("#ref").val().trim() || $("#ref").attr("placeholder"); + let ref = $('#ref').val().trim() || $('#ref').attr('placeholder'); if ( - providerPrefix === "zenodo" || - providerPrefix === "figshare" || - providerPrefix === "dataverse" || - providerPrefix === "hydroshare" + providerPrefix === 'zenodo' || + providerPrefix === 'figshare' || + providerPrefix === 'dataverse' || + providerPrefix === 'hydroshare' || + providerPrefix === 'meca' ) { - ref = ""; + ref = ''; } - const path = $("#filepath").val().trim(); + const path = $('#filepath').val().trim(); return { providerPrefix: providerPrefix, repo: repo, diff --git a/docs/source/reference/repoproviders.rst b/docs/source/reference/repoproviders.rst index d0f5ca37c..5bcdda538 100644 --- a/docs/source/reference/repoproviders.rst +++ b/docs/source/reference/repoproviders.rst @@ -65,6 +65,12 @@ Module: :mod:`binderhub.repoproviders` .. autoconfigurable:: DataverseProvider :members: +:class:`MecaRepoProvider` +--------------------------- + +.. autoconfigurable:: MecaRepoProvider + :members: + :class:`GitRepoProvider` --------------------------- From 7f55c41dd39c5a870ca16f83664ddc3a9ad567ef Mon Sep 17 00:00:00 2001 From: stevejpurves Date: Mon, 19 Feb 2024 21:41:28 +0000 Subject: [PATCH 2/7] add missing slug hashing fn --- binderhub/repoproviders.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/binderhub/repoproviders.py b/binderhub/repoproviders.py index 57303072d..56caeb25b 100644 --- a/binderhub/repoproviders.py +++ b/binderhub/repoproviders.py @@ -16,6 +16,7 @@ import urllib.parse from datetime import datetime, timedelta, timezone from urllib.parse import urlparse, urlunparse +from hashlib import md5 import escapism from prometheus_client import Gauge @@ -263,6 +264,15 @@ def get_build_slug(self): return f"zenodo-{self.record_id}" +def get_hashed_slug(url, changes_with_content): + """Return a unique slug that is invariant to query parameters in the url""" + parsed_url = urlparse(url) + stripped_url = urlunparse( + (parsed_url.scheme, parsed_url.netloc, parsed_url.path, "", "", "") + ) + + return "meca-" + md5(f"{stripped_url}-{changes_with_content}".encode()).hexdigest() + class MecaRepoProvider(RepoProvider): """BinderHub Provider that can handle the contents of a MECA bundle From cb6a6ee0c021b6099ed7b89a5ef3eb8cc4d6983e Mon Sep 17 00:00:00 2001 From: stevejpurves Date: Tue, 20 Feb 2024 21:03:20 +0000 Subject: [PATCH 3/7] revert formatting --- binderhub/static/js/src/form.js | 36 ++++++++++++++++----------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/binderhub/static/js/src/form.js b/binderhub/static/js/src/form.js index e18b09fa2..f63cc1d19 100644 --- a/binderhub/static/js/src/form.js +++ b/binderhub/static/js/src/form.js @@ -1,4 +1,4 @@ -import { getPathType } from './path'; +import { getPathType } from "./path"; /** * Parse current values in form and return them with appropriate URL encoding @@ -11,32 +11,32 @@ import { getPathType } from './path'; * @returns {} */ export function getBuildFormValues() { - const providerPrefix = $('#provider_prefix').val().trim(); - let repo = $('#repository').val().trim(); - if (providerPrefix !== 'git') { - repo = repo.replace(/^(https?:\/\/)?gist.github.com\//, ''); - repo = repo.replace(/^(https?:\/\/)?github.com\//, ''); - repo = repo.replace(/^(https?:\/\/)?gitlab.com\//, ''); + const providerPrefix = $("#provider_prefix").val().trim(); + let repo = $("#repository").val().trim(); + if (providerPrefix !== "git") { + repo = repo.replace(/^(https?:\/\/)?gist.github.com\//, ""); + repo = repo.replace(/^(https?:\/\/)?github.com\//, ""); + repo = repo.replace(/^(https?:\/\/)?gitlab.com\//, ""); } - // trim trailing or leading '/' on repo - repo = repo.replace(/(^\/)|(\/?$)/g, ''); + // trim trailing or leading "/" on repo + repo = repo.replace(/(^\/)|(\/?$)/g, ""); // git providers encode the URL of the git repository as the repo // argument. - if (repo.includes('://') || providerPrefix === 'gl') { + if (repo.includes("://") || providerPrefix === "gl") { repo = encodeURIComponent(repo); } - let ref = $('#ref').val().trim() || $('#ref').attr('placeholder'); + let ref = $("#ref").val().trim() || $("#ref").attr("placeholder"); if ( - providerPrefix === 'zenodo' || - providerPrefix === 'figshare' || - providerPrefix === 'dataverse' || - providerPrefix === 'hydroshare' || - providerPrefix === 'meca' + providerPrefix === "zenodo" || + providerPrefix === "figshare" || + providerPrefix === "dataverse" || + providerPrefix === "hydroshare" || + providerPrefix === "meca" ) { - ref = ''; + ref = ""; } - const path = $('#filepath').val().trim(); + const path = $("#filepath").val().trim(); return { providerPrefix: providerPrefix, repo: repo, From 77b314175b7e94f6cfeffbecaff0e99a681e986e Mon Sep 17 00:00:00 2001 From: Steve Purves Date: Tue, 20 Feb 2024 20:51:45 +0000 Subject: [PATCH 4/7] Update binderhub/repoproviders.py fix typo Co-authored-by: Samuel Gaist --- binderhub/repoproviders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/binderhub/repoproviders.py b/binderhub/repoproviders.py index 56caeb25b..5a66590c6 100644 --- a/binderhub/repoproviders.py +++ b/binderhub/repoproviders.py @@ -276,7 +276,7 @@ def get_hashed_slug(url, changes_with_content): class MecaRepoProvider(RepoProvider): """BinderHub Provider that can handle the contents of a MECA bundle - Users must provide a spec consisting of a public the URL to the bundle + Users must provide a spec consisting of a public URL to the bundle The URL origin must conform to the origin trait when that is set """ From 4e73a8f829005cd5692d12223ed62836485ad1c4 Mon Sep 17 00:00:00 2001 From: stevejpurves Date: Tue, 20 Feb 2024 21:05:29 +0000 Subject: [PATCH 5/7] review changes --- binderhub/repoproviders.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/binderhub/repoproviders.py b/binderhub/repoproviders.py index 5a66590c6..af48e943e 100644 --- a/binderhub/repoproviders.py +++ b/binderhub/repoproviders.py @@ -264,20 +264,11 @@ def get_build_slug(self): return f"zenodo-{self.record_id}" -def get_hashed_slug(url, changes_with_content): - """Return a unique slug that is invariant to query parameters in the url""" - parsed_url = urlparse(url) - stripped_url = urlunparse( - (parsed_url.scheme, parsed_url.netloc, parsed_url.path, "", "", "") - ) - - return "meca-" + md5(f"{stripped_url}-{changes_with_content}".encode()).hexdigest() - class MecaRepoProvider(RepoProvider): """BinderHub Provider that can handle the contents of a MECA bundle Users must provide a spec consisting of a public URL to the bundle - The URL origin must conform to the origin trait when that is set + The URL origin must be included in the list of allowed_origins when that trait is set """ name = Unicode("MECA Bundle") @@ -328,6 +319,14 @@ def __init__(self, *args, **kwargs): self.log.info(f"MECA Bundle URL: {self.url}") self.log.info(f"MECA Bundle raw spec: {self.spec}") + def get_hashed_slug(self, url, changes_with_content): + """Return a unique slug that is invariant to query parameters in the url""" + parsed_url = urlparse(url) + stripped_url = urlunparse( + (parsed_url.scheme, parsed_url.netloc, parsed_url.path, "", "", "") + ) + return "meca-" + md5(f"{stripped_url}-{changes_with_content}".encode()).hexdigest() + async def get_resolved_ref(self): # Check the URL is reachable client = AsyncHTTPClient() @@ -336,11 +335,11 @@ async def get_resolved_ref(self): try: r = await client.fetch(req) self.log.info(f"URL is reachable: {self.url}") - self.hashed_slug = get_hashed_slug( + self.hashed_slug = self.get_hashed_slug( self.url, r.headers.get("ETag") or r.headers.get("Content-Length") ) except Exception as e: - raise ValueError(f"URL is unreachable ({e})") + raise RuntimeError(f"URL is unreachable ({e})") self.log.info(f"hashed_slug: {self.hashed_slug}") return self.hashed_slug From 07967530ed3f68bf7b352e0690423308f15cd918 Mon Sep 17 00:00:00 2001 From: stevejpurves Date: Tue, 20 Feb 2024 21:17:27 +0000 Subject: [PATCH 6/7] missing dep --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index a416e1527..103536a80 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,3 +9,4 @@ pyjwt>=2 python-json-logger tornado>=5.1 traitlets +validators From 1d08767b3a69d4dfa8a65855365d843f5867ffdb Mon Sep 17 00:00:00 2001 From: stevejpurves Date: Tue, 20 Feb 2024 21:17:53 +0000 Subject: [PATCH 7/7] pre-commit fixes --- binderhub/app.py | 2 +- binderhub/repoproviders.py | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/binderhub/app.py b/binderhub/app.py index 59d86d623..dba23c1d4 100644 --- a/binderhub/app.py +++ b/binderhub/app.py @@ -56,7 +56,6 @@ from .ratelimit import RateLimiter from .registry import DockerRegistry from .repoproviders import ( - MecaRepoProvider, DataverseProvider, FigshareProvider, GistRepoProvider, @@ -64,6 +63,7 @@ GitLabRepoProvider, GitRepoProvider, HydroshareProvider, + MecaRepoProvider, RepoProvider, ZenodoProvider, ) diff --git a/binderhub/repoproviders.py b/binderhub/repoproviders.py index af48e943e..d3df12b91 100644 --- a/binderhub/repoproviders.py +++ b/binderhub/repoproviders.py @@ -15,10 +15,11 @@ import time import urllib.parse from datetime import datetime, timedelta, timezone -from urllib.parse import urlparse, urlunparse from hashlib import md5 +from urllib.parse import unquote, urlparse, urlunparse import escapism +import validators as val from prometheus_client import Gauge from tornado.httpclient import AsyncHTTPClient, HTTPError, HTTPRequest from tornado.httputil import url_concat @@ -325,7 +326,9 @@ def get_hashed_slug(self, url, changes_with_content): stripped_url = urlunparse( (parsed_url.scheme, parsed_url.netloc, parsed_url.path, "", "", "") ) - return "meca-" + md5(f"{stripped_url}-{changes_with_content}".encode()).hexdigest() + return ( + "meca-" + md5(f"{stripped_url}-{changes_with_content}".encode()).hexdigest() + ) async def get_resolved_ref(self): # Check the URL is reachable