Merge pull request #899 from lsst/tickets/DM-41365
DM-41365: Add findDataset/getDataset/getDatasetType to Butler API
timj authored Nov 3, 2023
2 parents e89626a + 17bc74e commit 5baa637
Showing 15 changed files with 583 additions and 39 deletions.
1 change: 1 addition & 0 deletions doc/changes/DM-41365.feature.rst
@@ -0,0 +1 @@
Added new ``Butler`` APIs migrated from registry: ``Butler.get_dataset_type()``, ``Butler.get_dataset()``, and ``Butler.find_dataset()``.
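For orientation, a minimal sketch of the three new APIs used together — the repository path, collection names, and data ID values below are illustrative, not taken from this commit:

from lsst.daf.butler import Butler

butler = Butler("path/to/repo", collections=["HSC/defaults"])  # hypothetical repo and collection

# Look up a registered dataset type by name.
raw_type = butler.get_dataset_type("raw")

# Find a single dataset by dataset type and data ID in the default collections.
ref = butler.find_dataset("raw", instrument="HSC", exposure=903342, detector=10)

# Retrieve a DatasetRef by its UUID; returns None if the ID is unknown.
if ref is not None:
    same_ref = butler.get_dataset(ref.id)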
10 changes: 9 additions & 1 deletion python/lsst/daf/butler/__init__.py
@@ -79,7 +79,15 @@

# Do not import or lift symbols from 'server' or 'server_models'.
# Import the registry subpackage directly for other symbols.
from .registry import CollectionSearch, CollectionType, Registry, RegistryConfig
from .registry import (
    CollectionSearch,
    CollectionType,
    MissingCollectionError,
    MissingDatasetTypeError,
    NoDefaultCollectionError,
    Registry,
    RegistryConfig,
)
from .transfers import RepoExportContext, YamlRepoExportBackend, YamlRepoImportBackend
from .version import *

148 changes: 147 additions & 1 deletion python/lsst/daf/butler/_butler.py
@@ -42,12 +42,13 @@
from ._butler_repo_index import ButlerRepoIndex
from ._config import Config, ConfigSubset
from ._dataset_existence import DatasetExistence
from ._dataset_ref import DatasetIdGenEnum, DatasetRef
from ._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef
from ._dataset_type import DatasetType
from ._deferredDatasetHandle import DeferredDatasetHandle
from ._file_dataset import FileDataset
from ._limited_butler import LimitedButler
from ._storage_class import StorageClass
from ._timespan import Timespan
from .datastore import DatasetRefURIs, Datastore
from .dimensions import DataId, DimensionConfig
from .registry import Registry, RegistryConfig, _RegistryFactory
@@ -772,6 +773,151 @@ def getURI(
"""
raise NotImplementedError()

    @abstractmethod
    def get_dataset_type(self, name: str) -> DatasetType:
        """Get the `DatasetType`.

        Parameters
        ----------
        name : `str`
            Name of the type.

        Returns
        -------
        type : `DatasetType`
            The `DatasetType` associated with the given name.

        Raises
        ------
        lsst.daf.butler.MissingDatasetTypeError
            Raised if the requested dataset type has not been registered.

        Notes
        -----
        This method handles component dataset types automatically, though most
        other operations do not.
        """
        raise NotImplementedError()
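A brief usage sketch, assuming an existing Butler instance ``butler``; the ``bias`` dataset type name is illustrative:

from lsst.daf.butler import MissingDatasetTypeError

try:
    bias_type = butler.get_dataset_type("bias")
    print(bias_type.name, bias_type.dimensions)
except MissingDatasetTypeError:
    print("dataset type 'bias' is not registered in this repository")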

    @abstractmethod
    def get_dataset(
        self,
        id: DatasetId,
        storage_class: str | StorageClass | None = None,
        dimension_records: bool = False,
        datastore_records: bool = False,
    ) -> DatasetRef | None:
        """Retrieve a Dataset entry.

        Parameters
        ----------
        id : `DatasetId`
            The unique identifier for the dataset.
        storage_class : `str` or `StorageClass` or `None`, optional
            A storage class to use when creating the returned entry. If given
            it must be compatible with the default storage class.
        dimension_records : `bool`, optional
            If `True` the ref will be expanded and contain dimension records.
        datastore_records : `bool`, optional
            If `True` the ref will contain associated datastore records.

        Returns
        -------
        ref : `DatasetRef` or `None`
            A ref to the Dataset, or `None` if no matching Dataset
            was found.
        """
        raise NotImplementedError()
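A short sketch of looking up a ref by its UUID, again assuming an existing ``butler``; the UUID value is a placeholder:

from uuid import UUID

dataset_id = UUID("00000000-0000-0000-0000-000000000000")  # placeholder ID
ref = butler.get_dataset(dataset_id, dimension_records=True)
if ref is None:
    print("no dataset with that ID")
else:
    print(ref.datasetType.name, ref.dataId)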

    @abstractmethod
    def find_dataset(
        self,
        dataset_type: DatasetType | str,
        data_id: DataId | None = None,
        *,
        collections: str | Sequence[str] | None = None,
        timespan: Timespan | None = None,
        storage_class: str | StorageClass | None = None,
        dimension_records: bool = False,
        datastore_records: bool = False,
        **kwargs: Any,
    ) -> DatasetRef | None:
        """Find a dataset given its `DatasetType` and data ID.

        This can be used to obtain a `DatasetRef` that permits the dataset to
        be read from a `Datastore`. If the dataset is a component and can not
        be found using the provided dataset type, a dataset ref for the parent
        will be returned instead but with the correct dataset type.

        Parameters
        ----------
        dataset_type : `DatasetType` or `str`
            A `DatasetType` or the name of one. If this is a `DatasetType`
            instance, its storage class will be respected and propagated to
            the output, even if it differs from the dataset type definition
            in the registry, as long as the storage classes are convertible.
        data_id : `dict` or `DataCoordinate`, optional
            A `dict`-like object containing the `Dimension` links that identify
            the dataset within a collection. If it is a `dict` the data ID
            can include dimension record values such as ``day_obs`` and
            ``seq_num`` or ``full_name`` that can be used to derive the
            primary dimension.
        collections : `str` or `list` [`str`], optional
            An ordered list of collections to search for the dataset.
            Defaults to ``self.defaults.collections``.
        timespan : `Timespan`, optional
            A timespan that the validity range of the dataset must overlap.
            If not provided, any `~CollectionType.CALIBRATION` collections
            matched by the ``collections`` argument will not be searched.
        storage_class : `str` or `StorageClass` or `None`, optional
            A storage class to use when creating the returned entry. If given
            it must be compatible with the default storage class.
        dimension_records : `bool`, optional
            If `True` the ref will be expanded and contain dimension records.
        datastore_records : `bool`, optional
            If `True` the ref will contain associated datastore records.
        **kwargs
            Additional keyword arguments passed to
            `DataCoordinate.standardize` to convert ``data_id`` to a true
            `DataCoordinate` or augment an existing one. This can also include
            dimension record metadata that can be used to derive a primary
            dimension value.

        Returns
        -------
        ref : `DatasetRef` or `None`
            A reference to the dataset, or `None` if no matching Dataset
            was found.

        Raises
        ------
        lsst.daf.butler.NoDefaultCollectionError
            Raised if ``collections`` is `None` and
            ``self.collections`` is `None`.
        LookupError
            Raised if one or more data ID keys are missing.
        lsst.daf.butler.MissingDatasetTypeError
            Raised if the dataset type does not exist.
        lsst.daf.butler.MissingCollectionError
            Raised if any of ``collections`` does not exist in the registry.

        Notes
        -----
        This method simply returns `None` and does not raise an exception even
        when the set of collections searched is intrinsically incompatible with
        the dataset type, e.g. if ``datasetType.isCalibration() is False``, but
        only `~CollectionType.CALIBRATION` collections are being searched.
        This may make it harder to debug some lookup failures, but the behavior
        is intentional; we consider it more important that failed searches are
        reported consistently, regardless of the reason, and that adding
        additional collections that do not contain a match to the search path
        never changes the behavior.

        This method handles component dataset types automatically, though most
        other query operations do not.
        """
        raise NotImplementedError()
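A usage sketch, assuming an existing ``butler``; the instrument, exposure, detector, and collection values are illustrative:

ref = butler.find_dataset(
    "raw",
    instrument="HSC",
    exposure=903342,
    detector=10,
    collections="HSC/raw/all",
    dimension_records=True,  # expand the returned ref with dimension records
)
if ref is not None:
    data = butler.get(ref)  # the ref can be read back directly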

    @abstractmethod
    def retrieveArtifacts(
        self,
70 changes: 62 additions & 8 deletions python/lsst/daf/butler/direct_butler.py
@@ -55,7 +55,7 @@
from ._butler_config import ButlerConfig
from ._config import Config
from ._dataset_existence import DatasetExistence
from ._dataset_ref import DatasetIdGenEnum, DatasetRef
from ._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef
from ._dataset_type import DatasetType
from ._deferredDatasetHandle import DeferredDatasetHandle
from ._exceptions import ValidationError
@@ -228,7 +228,7 @@ def __init__(
    def _retrieve_dataset_type(self, name: str) -> DatasetType | None:
        """Return DatasetType defined in registry given dataset type name."""
        try:
            return self._registry.getDatasetType(name)
            return self.get_dataset_type(name)
        except MissingDatasetTypeError:
            return None

@@ -369,11 +369,11 @@ def _standardizeArgs(
        if isinstance(datasetRefOrType, DatasetType):
            externalDatasetType = datasetRefOrType
        else:
            internalDatasetType = self._registry.getDatasetType(datasetRefOrType)
            internalDatasetType = self.get_dataset_type(datasetRefOrType)

        # Check that they are self-consistent
        if externalDatasetType is not None:
            internalDatasetType = self._registry.getDatasetType(externalDatasetType.name)
            internalDatasetType = self.get_dataset_type(externalDatasetType.name)
            if externalDatasetType != internalDatasetType:
                # We can allow differences if they are compatible, depending
                # on whether this is a get or a put. A get requires that
@@ -846,7 +846,7 @@ def _findDatasetRef(
        )
        # Always lookup the DatasetRef, even if one is given, to ensure it is
        # present in the current collection.
        ref = self._registry.findDataset(
        ref = self.find_dataset(
            datasetType,
            dataId,
            collections=collections,
@@ -1318,6 +1318,60 @@ def getURI(
        )
        return primary

    def get_dataset_type(self, name: str) -> DatasetType:
        return self._registry.getDatasetType(name)

    def get_dataset(
        self,
        id: DatasetId,
        storage_class: str | StorageClass | None = None,
        dimension_records: bool = False,
        datastore_records: bool = False,
    ) -> DatasetRef | None:
        ref = self._registry.getDataset(id)
        if ref is not None:
            if dimension_records:
                ref = ref.expanded(self._registry.expandDataId(ref.dataId, graph=ref.datasetType.dimensions))
            if storage_class:
                ref = ref.overrideStorageClass(storage_class)
            if datastore_records:
                ref = self._registry.get_datastore_records(ref)
        return ref

    def find_dataset(
        self,
        dataset_type: DatasetType | str,
        data_id: DataId | None = None,
        *,
        collections: str | Sequence[str] | None = None,
        timespan: Timespan | None = None,
        storage_class: str | StorageClass | None = None,
        dimension_records: bool = False,
        datastore_records: bool = False,
        **kwargs: Any,
    ) -> DatasetRef | None:
        # Handle any parts of the data ID that are not using primary
        # dimension keys.
        if isinstance(dataset_type, str):
            actual_type = self.get_dataset_type(dataset_type)
        else:
            actual_type = dataset_type
        data_id, kwargs = self._rewrite_data_id(data_id, actual_type, **kwargs)

        ref = self._registry.findDataset(
            dataset_type,
            data_id,
            collections=collections,
            timespan=timespan,
            datastore_records=datastore_records,
            **kwargs,
        )
        if ref is not None and dimension_records:
            ref = ref.expanded(self._registry.expandDataId(ref.dataId, graph=ref.datasetType.dimensions))
        if ref is not None and storage_class is not None:
            ref = ref.overrideStorageClass(storage_class)
        return ref

    def retrieveArtifacts(
        self,
        refs: Iterable[DatasetRef],
@@ -1877,7 +1931,7 @@ def transfer_from(
                newly_registered_dataset_types.add(datasetType)
            else:
                # If the dataset type is missing, let it fail immediately.
                target_dataset_type = self._registry.getDatasetType(datasetType.name)
                target_dataset_type = self.get_dataset_type(datasetType.name)
                if target_dataset_type != datasetType:
                    raise ConflictingDefinitionError(
                        "Source butler dataset type differs from definition"
@@ -1994,7 +2048,7 @@ def validateConfiguration(
    ) -> None:
        # Docstring inherited.
        if datasetTypeNames:
            datasetTypes = [self._registry.getDatasetType(name) for name in datasetTypeNames]
            datasetTypes = [self.get_dataset_type(name) for name in datasetTypeNames]
        else:
            datasetTypes = list(self._registry.queryDatasetTypes())

@@ -2064,7 +2118,7 @@ def validateConfiguration(
                pass
            else:
                try:
                    self._registry.getDatasetType(key.name)
                    self.get_dataset_type(key.name)
                except KeyError:
                    if logFailures:
                        _LOG.critical(
8 changes: 5 additions & 3 deletions python/lsst/daf/butler/registry/_registry.py
@@ -27,15 +27,15 @@

from __future__ import annotations

__all__ = ("Registry",)
__all__ = ("Registry", "CollectionArgType")

import contextlib
import logging
import re
from abc import ABC, abstractmethod
from collections.abc import Iterable, Iterator, Mapping, Sequence
from types import EllipsisType
from typing import TYPE_CHECKING, Any
from typing import TYPE_CHECKING, Any, TypeAlias

from .._dataset_association import DatasetAssociation
from .._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef
@@ -64,7 +64,9 @@
_LOG = logging.getLogger(__name__)

# Type alias for `collections` arguments.
CollectionArgType = str | re.Pattern | Iterable[str | re.Pattern] | EllipsisType | CollectionWildcard
CollectionArgType: TypeAlias = (
    str | re.Pattern | Iterable[str | re.Pattern] | EllipsisType | CollectionWildcard
)
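For illustration, each of the following is a valid ``collections`` argument under this alias, assuming an existing ``registry``; the collection names are hypothetical:

import re

registry.queryDatasets("raw", collections="HSC/raw/all")                 # single name
registry.queryDatasets("raw", collections=["HSC/runs/a", "HSC/runs/b"])  # ordered iterable
registry.queryDatasets("raw", collections=re.compile(r"HSC/runs/.*"))    # regex pattern
registry.queryDatasets("raw", collections=...)                           # Ellipsis: search everything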


class Registry(ABC):