@dataclasses.dataclass
class ParsedButlerDatasetURI:
    """Components extracted from a Butler dataset URI.

    Instances are returned by `Butler.parse_dataset_uri`.
    """

    # Label of the butler repository named in the URI, with surrounding
    # whitespace removed.
    label: str
    # Dataset ID parsed from the URI.
    dataset_id: uuid.UUID
    # The URI string that was given to the parser.
    uri: str


@dataclasses.dataclass
class SpecificButlerDataset:
    """A butler paired with the result of looking up a dataset in it.

    Instances are returned by `Butler.get_dataset_from_uri`.
    """

    # Butler associated with the dataset URI.
    butler: Butler
    # The dataset, or `None` if the dataset ID is not known to this butler.
    dataset: DatasetRef | None
@classmethod
def parse_dataset_uri(cls, uri: str) -> ParsedButlerDatasetURI:
    """Extract the butler label and dataset ID from a dataset URI.

    Parameters
    ----------
    uri : `str`
        The dataset URI to parse.

    Returns
    -------
    parsed : `ParsedButlerDatasetURI`
        The label associated with the butler repository from which this
        dataset originates and the ID of the dataset.

    Raises
    ------
    ValueError
        Raised if the URI scheme is not ``ivo`` or ``butler``, if an
        ``ivo`` URI does not have exactly one ``repo`` and one ``id``
        query parameter, if the label is empty, or if the dataset ID
        string is not a valid UUID.

    Notes
    -----
    Supports dataset URIs of the forms
    ``ivo://org.rubinobs/usdac/dr1?repo=butler_label&id=UUID`` (see
    DMTN-302) and ``butler://butler_label/UUID``. The ``butler`` URI is
    deprecated and can not include ``/`` in the label string. ``ivo`` URIs
    can include anything supported by the `Butler` constructor, including
    paths to repositories and alias labels.

        ivo://org.rubinobs/dr1?repo=/repo/main&id=UUID

    will return a label of ``/repo/main``.

    This method does not attempt to check that the dataset exists in the
    labeled butler.

    Since the IVOID can be issued by any publisher to represent a Butler
    dataset there is no validation of the path or netloc component of the
    URI. The only requirement is that there are ``id`` and ``repo`` keys
    in the ``ivo`` URI query component.

    Leading and trailing whitespace is removed from both the label and the
    dataset ID string before they are interpreted.
    """
    parsed = urllib.parse.urlparse(uri)
    parsed_scheme = parsed.scheme.lower()
    if parsed_scheme == "ivo":
        # Do not validate the netloc or the path values.
        qs = urllib.parse.parse_qs(parsed.query)
        if "repo" not in qs or "id" not in qs:
            raise ValueError(f"Missing 'repo' and/or 'id' query parameters in IVOID {uri}.")
        if len(qs["repo"]) != 1 or len(qs["id"]) != 1:
            raise ValueError(f"Butler IVOID only supports a single value of repo and id, got {uri}")
        label = qs["repo"][0]
        id_ = qs["id"][0]
    elif parsed_scheme == "butler":
        label = parsed.netloc  # Butler label is case sensitive.
        # Need to strip the leading /.
        id_ = parsed.path[1:]
    else:
        raise ValueError(f"Unrecognized URI scheme: {uri!r}")

    # Strip trailing/leading whitespace from label.
    label = label.strip()
    if not label:
        raise ValueError(f"No butler repository label found in uri {uri!r}")

    # Treat the ID string the same way as the label so that a stray
    # (possibly percent-encoded) space does not invalidate the UUID.
    id_ = id_.strip()
    try:
        dataset_id = uuid.UUID(hex=id_)
    except ValueError as e:
        # uuid.UUID reports a malformed string as ValueError; annotate it
        # with the URI and re-raise the same exception.
        e.add_note(f"Error extracting dataset ID from uri {uri!r} with dataset ID string {id_!r}")
        raise

    return ParsedButlerDatasetURI(label=label, dataset_id=dataset_id, uri=uri)
def bind(self, access_token: str | None) -> LabeledButlerFactoryProtocol:
    """Return a factory callable that needs only a repository label.

    The access token is captured once here, so callers of the returned
    function do not have to supply it again.

    Parameters
    ----------
    access_token : `str` or `None`
        An optional access token to use for authentication with the Butler.

    Returns
    -------
    bound : `LabeledButlerFactoryProtocol`
        A callable that takes a label as input and returns a Butler
        instance.
    """
    # Closure over this factory and the token; each call is forwarded to
    # ``create_butler`` using keyword arguments.
    return lambda label: self.create_butler(label=label, access_token=access_token)
def test_dataset_uris(self):
    """Test that dataset URIs can be parsed and retrieved.

    A dataset is looked up through ``ivo`` and ``butler`` URIs that name
    the repository by path, by configuration file and by index label,
    both with and without a bound butler factory. URIs that must be
    rejected by the parser are checked at the end.
    """
    butler = self.makeButler(writeable=True)
    butler.import_(filename=os.path.join(TESTDIR, "data", "registry", "base.yaml"))
    butler.import_(filename=os.path.join(TESTDIR, "data", "registry", self.datasetsImportFile))

    butler.registry.defaults = RegistryDefaults(collections=["imported_g"])
    ref = butler.find_dataset("flat", detector=2, physical_filter="Cam1-G")
    self.assertIsInstance(ref, DatasetRef)

    # Get the butler root for the URI.
    config_dir = butler._config["root"]

    # Read it via a repo label and a path.
    with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml") as index_file:
        label = "test_repo"
        index_file.write(f"{label}: {config_dir}\n")
        index_file.flush()
        with mock_env({"DAF_BUTLER_REPOSITORY_INDEX": index_file.name}):
            butler_factory = LabeledButlerFactory()
            factory = butler_factory.bind(access_token=None)

            for dataset_uri in (
                f"ivo://org.rubinobs/usdac/test?repo={config_dir}&id={ref.id}",
                f"ivo://org.rubinobs/ukdac/lsst-dr1?repo={config_dir}%2Fbutler.yaml&id={ref.id}",
                f"butler://{label}/{ref.id}",
                f"ivo://org.rubinobs/usdac/lsst-dp1?repo={label}&id={ref.id}",
            ):
                # Use a subtest so that a failure names the URI and the
                # remaining URIs are still exercised.
                with self.subTest(dataset_uri=dataset_uri):
                    result = Butler.get_dataset_from_uri(dataset_uri)
                    self.assertEqual(result.dataset, ref)
                    # The returned butler needs to have the datastore mocked.
                    DatastoreMock.apply(result.butler)
                    dataset_id, _ = result.butler.get(result.dataset)
                    self.assertEqual(dataset_id, ref.id)

                    factory_result = Butler.get_dataset_from_uri(dataset_uri, factory=factory)
                    self.assertEqual(factory_result.dataset, ref)
                    # The returned butler needs to have the datastore mocked.
                    DatastoreMock.apply(factory_result.butler)
                    dataset_id, _ = factory_result.butler.get(factory_result.dataset)
                    self.assertEqual(dataset_id, ref.id)

            # Non existent dataset.
            missing_id = str(ref.id).replace("2", "3")
            # The replacement is a no-op for an ID without a "2"; make sure
            # a different ID is really being requested.
            self.assertNotEqual(missing_id, str(ref.id))
            result = Butler.get_dataset_from_uri(f"butler://{label}/{missing_id}")
            self.assertIsNone(result.dataset)

    # Test some failure modes.
    for dataset_uri in (
        "butler://label/1234",  # Bad UUID.
        "butler://1234",  # No UUID.
        "butler:///1234",  # No label.
        "ivo://rubin/1234",  # No query part and bad UUID and no label.
        "ivo://rubin/datasets/dr1/82d79caa-0823-4300-9874-67b737367ee0",  # No query part.
        "ivo://org.rubinobs/datasets?repo=dr1&id=1234",  # Bad UUID.
        "ivo://org.rubinobs/butler?release=dr1&id=82d79caa-0823-4300-9874-67b737367ee0",  # No repo key.
        "ivo://org.rubinobs/butler?repo=dr1&repo=dr2&id=82d79caa-0823-4300-9874-67b737367ee0",  # 2 vals.
        "ivo://org.rubinobs/something?repo=%20&id=82d79caa-0823-4300-9874-67b737367ee0",  # no repo.
        "https://something.edu/1234",  # Wrong scheme.
    ):
        with self.subTest(dataset_uri=dataset_uri), self.assertRaises(ValueError):
            Butler.parse_dataset_uri(dataset_uri)