Merge pull request #899 from lsst/tickets/DM-41365
DM-41365: Add findDataset/getDataset/getDatasetType to Butler API
timj authored Nov 3, 2023
2 parents e89626a + 17bc74e commit 5baa637
Showing 15 changed files with 583 additions and 39 deletions.
1 change: 1 addition & 0 deletions doc/changes/DM-41365.feature.rst
@@ -0,0 +1 @@
Added new ``Butler`` APIs migrated from registry: ``Butler.get_dataset_type()``, ``Butler.get_dataset()``, and ``Butler.find_dataset()``.
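For orientation, a minimal sketch of the three new APIs used together — the repository path, collection names, and data ID values below are illustrative, not taken from this commit:

from lsst.daf.butler import Butler

butler = Butler("path/to/repo", collections=["HSC/defaults"])  # hypothetical repo and collection

# Look up a registered dataset type by name.
raw_type = butler.get_dataset_type("raw")

# Find a single dataset by dataset type and data ID in the default collections.
ref = butler.find_dataset("raw", instrument="HSC", exposure=903342, detector=10)

# Retrieve a DatasetRef by its UUID; returns None if the ID is unknown.
if ref is not None:
    same_ref = butler.get_dataset(ref.id)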
10 changes: 9 additions & 1 deletion python/lsst/daf/butler/__init__.py
@@ -79,7 +79,15 @@

# Do not import or lift symbols from 'server' or 'server_models'.
# Import the registry subpackage directly for other symbols.
from .registry import CollectionSearch, CollectionType, Registry, RegistryConfig
from .registry import (
    CollectionSearch,
    CollectionType,
    MissingCollectionError,
    MissingDatasetTypeError,
    NoDefaultCollectionError,
    Registry,
    RegistryConfig,
)
from .transfers import RepoExportContext, YamlRepoExportBackend, YamlRepoImportBackend
from .version import *

148 changes: 147 additions & 1 deletion python/lsst/daf/butler/_butler.py
@@ -42,12 +42,13 @@
from ._butler_repo_index import ButlerRepoIndex
from ._config import Config, ConfigSubset
from ._dataset_existence import DatasetExistence
from ._dataset_ref import DatasetIdGenEnum, DatasetRef
from ._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef
from ._dataset_type import DatasetType
from ._deferredDatasetHandle import DeferredDatasetHandle
from ._file_dataset import FileDataset
from ._limited_butler import LimitedButler
from ._storage_class import StorageClass
from ._timespan import Timespan
from .datastore import DatasetRefURIs, Datastore
from .dimensions import DataId, DimensionConfig
from .registry import Registry, RegistryConfig, _RegistryFactory
@@ -772,6 +773,151 @@ def getURI(
"""
raise NotImplementedError()

    @abstractmethod
    def get_dataset_type(self, name: str) -> DatasetType:
        """Get the `DatasetType`.

        Parameters
        ----------
        name : `str`
            Name of the type.

        Returns
        -------
        type : `DatasetType`
            The `DatasetType` associated with the given name.

        Raises
        ------
        lsst.daf.butler.MissingDatasetTypeError
            Raised if the requested dataset type has not been registered.

        Notes
        -----
        This method handles component dataset types automatically, though most
        other operations do not.
        """
        raise NotImplementedError()
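A brief usage sketch, assuming an existing Butler instance ``butler``; the ``bias`` dataset type name is illustrative:

from lsst.daf.butler import MissingDatasetTypeError

try:
    bias_type = butler.get_dataset_type("bias")
    print(bias_type.name, bias_type.dimensions)
except MissingDatasetTypeError:
    print("dataset type 'bias' is not registered in this repository")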

    @abstractmethod
    def get_dataset(
        self,
        id: DatasetId,
        storage_class: str | StorageClass | None = None,
        dimension_records: bool = False,
        datastore_records: bool = False,
    ) -> DatasetRef | None:
        """Retrieve a Dataset entry.

        Parameters
        ----------
        id : `DatasetId`
            The unique identifier for the dataset.
        storage_class : `str` or `StorageClass` or `None`, optional
            A storage class to use when creating the returned entry. If given
            it must be compatible with the default storage class.
        dimension_records : `bool`, optional
            If `True` the ref will be expanded and contain dimension records.
        datastore_records : `bool`, optional
            If `True` the ref will contain associated datastore records.

        Returns
        -------
        ref : `DatasetRef` or `None`
            A ref to the Dataset, or `None` if no matching Dataset
            was found.
        """
        raise NotImplementedError()
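A short sketch of looking up a ref by its UUID, again assuming an existing ``butler``; the UUID value is a placeholder:

from uuid import UUID

dataset_id = UUID("00000000-0000-0000-0000-000000000000")  # placeholder ID
ref = butler.get_dataset(dataset_id, dimension_records=True)
if ref is None:
    print("no dataset with that ID")
else:
    print(ref.datasetType.name, ref.dataId)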

    @abstractmethod
    def find_dataset(
        self,
        dataset_type: DatasetType | str,
        data_id: DataId | None = None,
        *,
        collections: str | Sequence[str] | None = None,
        timespan: Timespan | None = None,
        storage_class: str | StorageClass | None = None,
        dimension_records: bool = False,
        datastore_records: bool = False,
        **kwargs: Any,
    ) -> DatasetRef | None:
        """Find a dataset given its `DatasetType` and data ID.

        This can be used to obtain a `DatasetRef` that permits the dataset to
        be read from a `Datastore`. If the dataset is a component and can not
        be found using the provided dataset type, a dataset ref for the parent
        will be returned instead but with the correct dataset type.

        Parameters
        ----------
        dataset_type : `DatasetType` or `str`
            A `DatasetType` or the name of one. If this is a `DatasetType`
            instance, its storage class will be respected and propagated to
            the output, even if it differs from the dataset type definition
            in the registry, as long as the storage classes are convertible.
        data_id : `dict` or `DataCoordinate`, optional
            A `dict`-like object containing the `Dimension` links that identify
            the dataset within a collection. If it is a `dict` the data ID
            can include dimension record values such as ``day_obs`` and
            ``seq_num`` or ``full_name`` that can be used to derive the
            primary dimension.
        collections : `str` or `list` [`str`], optional
            An ordered list of collections to search for the dataset.
            Defaults to ``self.defaults.collections``.
        timespan : `Timespan`, optional
            A timespan that the validity range of the dataset must overlap.
            If not provided, any `~CollectionType.CALIBRATION` collections
            matched by the ``collections`` argument will not be searched.
        storage_class : `str` or `StorageClass` or `None`, optional
            A storage class to use when creating the returned entry. If given
            it must be compatible with the default storage class.
        dimension_records : `bool`, optional
            If `True` the ref will be expanded and contain dimension records.
        datastore_records : `bool`, optional
            If `True` the ref will contain associated datastore records.
        **kwargs
            Additional keyword arguments passed to
            `DataCoordinate.standardize` to convert ``data_id`` to a true
            `DataCoordinate` or augment an existing one. This can also include
            dimension record metadata that can be used to derive a primary
            dimension value.

        Returns
        -------
        ref : `DatasetRef` or `None`
            A reference to the dataset, or `None` if no matching Dataset
            was found.

        Raises
        ------
        lsst.daf.butler.NoDefaultCollectionError
            Raised if ``collections`` is `None` and
            ``self.collections`` is `None`.
        LookupError
            Raised if one or more data ID keys are missing.
        lsst.daf.butler.MissingDatasetTypeError
            Raised if the dataset type does not exist.
        lsst.daf.butler.MissingCollectionError
            Raised if any of ``collections`` does not exist in the registry.

        Notes
        -----
        This method simply returns `None` and does not raise an exception even
        when the set of collections searched is intrinsically incompatible with
        the dataset type, e.g. if ``datasetType.isCalibration() is False``, but
        only `~CollectionType.CALIBRATION` collections are being searched.
        This may make it harder to debug some lookup failures, but the behavior
        is intentional; we consider it more important that failed searches are
        reported consistently, regardless of the reason, and that adding
        additional collections that do not contain a match to the search path
        never changes the behavior.

        This method handles component dataset types automatically, though most
        other query operations do not.
        """
        raise NotImplementedError()
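A usage sketch, assuming an existing ``butler``; the instrument, exposure, detector, and collection values are illustrative:

ref = butler.find_dataset(
    "raw",
    instrument="HSC",
    exposure=903342,
    detector=10,
    collections="HSC/raw/all",
    dimension_records=True,  # expand the returned ref with dimension records
)
if ref is not None:
    data = butler.get(ref)  # the ref can be read back directly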

    @abstractmethod
    def retrieveArtifacts(
        self,
70 changes: 62 additions & 8 deletions python/lsst/daf/butler/direct_butler.py
@@ -55,7 +55,7 @@
from ._butler_config import ButlerConfig
from ._config import Config
from ._dataset_existence import DatasetExistence
from ._dataset_ref import DatasetIdGenEnum, DatasetRef
from ._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef
from ._dataset_type import DatasetType
from ._deferredDatasetHandle import DeferredDatasetHandle
from ._exceptions import ValidationError
@@ -228,7 +228,7 @@ def __init__(
    def _retrieve_dataset_type(self, name: str) -> DatasetType | None:
        """Return DatasetType defined in registry given dataset type name."""
        try:
            return self._registry.getDatasetType(name)
            return self.get_dataset_type(name)
        except MissingDatasetTypeError:
            return None

@@ -369,11 +369,11 @@ def _standardizeArgs(
        if isinstance(datasetRefOrType, DatasetType):
            externalDatasetType = datasetRefOrType
        else:
            internalDatasetType = self._registry.getDatasetType(datasetRefOrType)
            internalDatasetType = self.get_dataset_type(datasetRefOrType)

        # Check that they are self-consistent
        if externalDatasetType is not None:
            internalDatasetType = self._registry.getDatasetType(externalDatasetType.name)
            internalDatasetType = self.get_dataset_type(externalDatasetType.name)
            if externalDatasetType != internalDatasetType:
                # We can allow differences if they are compatible, depending
                # on whether this is a get or a put. A get requires that
@@ -846,7 +846,7 @@ def _findDatasetRef(
        )
        # Always lookup the DatasetRef, even if one is given, to ensure it is
        # present in the current collection.
        ref = self._registry.findDataset(
        ref = self.find_dataset(
            datasetType,
            dataId,
            collections=collections,
@@ -1318,6 +1318,60 @@ def getURI(
        )
        return primary

    def get_dataset_type(self, name: str) -> DatasetType:
        return self._registry.getDatasetType(name)

    def get_dataset(
        self,
        id: DatasetId,
        storage_class: str | StorageClass | None = None,
        dimension_records: bool = False,
        datastore_records: bool = False,
    ) -> DatasetRef | None:
        ref = self._registry.getDataset(id)
        if ref is not None:
            if dimension_records:
                ref = ref.expanded(self._registry.expandDataId(ref.dataId, graph=ref.datasetType.dimensions))
            if storage_class:
                ref = ref.overrideStorageClass(storage_class)
            if datastore_records:
                ref = self._registry.get_datastore_records(ref)
        return ref

    def find_dataset(
        self,
        dataset_type: DatasetType | str,
        data_id: DataId | None = None,
        *,
        collections: str | Sequence[str] | None = None,
        timespan: Timespan | None = None,
        storage_class: str | StorageClass | None = None,
        dimension_records: bool = False,
        datastore_records: bool = False,
        **kwargs: Any,
    ) -> DatasetRef | None:
        # Handle any parts of the data ID that are not using primary
        # dimension keys.
        if isinstance(dataset_type, str):
            actual_type = self.get_dataset_type(dataset_type)
        else:
            actual_type = dataset_type
        data_id, kwargs = self._rewrite_data_id(data_id, actual_type, **kwargs)

        ref = self._registry.findDataset(
            dataset_type,
            data_id,
            collections=collections,
            timespan=timespan,
            datastore_records=datastore_records,
            **kwargs,
        )
        if ref is not None and dimension_records:
            ref = ref.expanded(self._registry.expandDataId(ref.dataId, graph=ref.datasetType.dimensions))
        if ref is not None and storage_class is not None:
            ref = ref.overrideStorageClass(storage_class)
        return ref

    def retrieveArtifacts(
        self,
        refs: Iterable[DatasetRef],
@@ -1877,7 +1931,7 @@ def transfer_from(
                newly_registered_dataset_types.add(datasetType)
            else:
                # If the dataset type is missing, let it fail immediately.
                target_dataset_type = self._registry.getDatasetType(datasetType.name)
                target_dataset_type = self.get_dataset_type(datasetType.name)
                if target_dataset_type != datasetType:
                    raise ConflictingDefinitionError(
                        "Source butler dataset type differs from definition"
@@ -1994,7 +2048,7 @@ def validateConfiguration(
    ) -> None:
        # Docstring inherited.
        if datasetTypeNames:
            datasetTypes = [self._registry.getDatasetType(name) for name in datasetTypeNames]
            datasetTypes = [self.get_dataset_type(name) for name in datasetTypeNames]
        else:
            datasetTypes = list(self._registry.queryDatasetTypes())

@@ -2064,7 +2118,7 @@ def validateConfiguration(
                pass
            else:
                try:
                    self._registry.getDatasetType(key.name)
                    self.get_dataset_type(key.name)
                except KeyError:
                    if logFailures:
                        _LOG.critical(
8 changes: 5 additions & 3 deletions python/lsst/daf/butler/registry/_registry.py
@@ -27,15 +27,15 @@

from __future__ import annotations

__all__ = ("Registry",)
__all__ = ("Registry", "CollectionArgType")

import contextlib
import logging
import re
from abc import ABC, abstractmethod
from collections.abc import Iterable, Iterator, Mapping, Sequence
from types import EllipsisType
from typing import TYPE_CHECKING, Any
from typing import TYPE_CHECKING, Any, TypeAlias

from .._dataset_association import DatasetAssociation
from .._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef
@@ -64,7 +64,9 @@
_LOG = logging.getLogger(__name__)

# Type alias for `collections` arguments.
CollectionArgType = str | re.Pattern | Iterable[str | re.Pattern] | EllipsisType | CollectionWildcard
CollectionArgType: TypeAlias = (
    str | re.Pattern | Iterable[str | re.Pattern] | EllipsisType | CollectionWildcard
)
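For illustration, each of the following is a valid ``collections`` argument under this alias, assuming an existing ``registry``; the collection names are hypothetical:

import re

registry.queryDatasets("raw", collections="HSC/raw/all")                 # single name
registry.queryDatasets("raw", collections=["HSC/runs/a", "HSC/runs/b"])  # ordered iterable
registry.queryDatasets("raw", collections=re.compile(r"HSC/runs/.*"))    # regex pattern
registry.queryDatasets("raw", collections=...)                           # Ellipsis: search everything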


class Registry(ABC):