DM-41365: Add findDataset/getDataset/getDatasetType to Butler API #899

Merged · 13 commits · Nov 3, 2023
1 change: 1 addition & 0 deletions doc/changes/DM-41365.feature.rst
@@ -0,0 +1 @@
Added new ``Butler`` APIs migrated from registry: ``Butler.get_dataset_type()``, ``Butler.get_dataset()``, and ``Butler.find_dataset()``.
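As a rough usage sketch of the three new APIs (the repository path, collection, dataset type, and data-ID values below are illustrative placeholders, not taken from this PR):

from lsst.daf.butler import Butler

butler = Butler("repo", collections="LATISS/defaults")  # illustrative repo/collection

# Look up a registered dataset type by name.
calexp_type = butler.get_dataset_type("calexp")

# Find a dataset by dataset type and data ID in the default collections.
ref = butler.find_dataset(calexp_type, instrument="LATISS", detector=0, visit=123)

if ref is not None:
    # The same dataset entry can be retrieved again by its UUID.
    same = butler.get_dataset(ref.id)
    assert same is not None and same.id == ref.id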
10 changes: 9 additions & 1 deletion python/lsst/daf/butler/__init__.py
@@ -79,7 +79,15 @@

# Do not import or lift symbols from 'server' or 'server_models'.
# Import the registry subpackage directly for other symbols.
from .registry import CollectionSearch, CollectionType, Registry, RegistryConfig
from .registry import (
CollectionSearch,
CollectionType,
MissingCollectionError,
MissingDatasetTypeError,
NoDefaultCollectionError,
Registry,
RegistryConfig,
)
from .transfers import RepoExportContext, YamlRepoExportBackend, YamlRepoImportBackend
from .version import *

148 changes: 147 additions & 1 deletion python/lsst/daf/butler/_butler.py
@@ -42,12 +42,13 @@
from ._butler_repo_index import ButlerRepoIndex
from ._config import Config, ConfigSubset
from ._dataset_existence import DatasetExistence
from ._dataset_ref import DatasetIdGenEnum, DatasetRef
from ._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef
from ._dataset_type import DatasetType
from ._deferredDatasetHandle import DeferredDatasetHandle
from ._file_dataset import FileDataset
from ._limited_butler import LimitedButler
from ._storage_class import StorageClass
from ._timespan import Timespan
from .datastore import DatasetRefURIs, Datastore
from .dimensions import DataId, DimensionConfig
from .registry import Registry, RegistryConfig, _RegistryFactory
@@ -772,6 +773,151 @@ def getURI(
"""
raise NotImplementedError()

@abstractmethod
def get_dataset_type(self, name: str) -> DatasetType:
"""Get the `DatasetType`.
Parameters
----------
name : `str`
Name of the type.
Returns
-------
type : `DatasetType`
The `DatasetType` associated with the given name.
Raises
------
lsst.daf.butler.MissingDatasetTypeError
Raised if the requested dataset type has not been registered.
Notes
-----
This method handles component dataset types automatically, though most
other operations do not.
"""
raise NotImplementedError()

@abstractmethod
def get_dataset(
self,
id: DatasetId,
storage_class: str | StorageClass | None = None,
dimension_records: bool = False,
datastore_records: bool = False,
) -> DatasetRef | None:
"""Retrieve a Dataset entry.
Parameters
----------
id : `DatasetId`
The unique identifier for the dataset.
storage_class : `str` or `StorageClass` or `None`
A storage class to use when creating the returned entry. If given,
it must be compatible with the default storage class.
dimension_records : `bool`, optional
If `True` the ref will be expanded and contain dimension records.
datastore_records : `bool`, optional
If `True` the ref will contain associated datastore records.
Returns
-------
ref : `DatasetRef` or `None`
A ref to the Dataset, or `None` if no matching Dataset
was found.
"""
raise NotImplementedError()
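For illustration, a minimal sketch of retrieving a ref by UUID with expanded dimension records; the repository path and the UUID are placeholders:

import uuid

from lsst.daf.butler import Butler

butler = Butler("repo")  # illustrative repository path

dataset_id = uuid.UUID("00000000-0000-0000-0000-000000000000")  # placeholder ID
ref = butler.get_dataset(dataset_id, dimension_records=True)
if ref is None:
    print("no dataset with that ID")
else:
    # The expanded ref carries dimension records on its data ID.
    print(ref.datasetType.name, ref.run, ref.dataId)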

@abstractmethod
def find_dataset(
self,
dataset_type: DatasetType | str,
data_id: DataId | None = None,
*,
collections: str | Sequence[str] | None = None,
timespan: Timespan | None = None,
storage_class: str | StorageClass | None = None,
dimension_records: bool = False,
datastore_records: bool = False,
Review thread on the ``datastore_records`` parameter:

Member: If we provide this as an option, we should also provide a way to expand the data ID (i.e. add dimension records) at the same time, and we should think about whether we want just one option to do both or allow the two kinds of expansion to happen separately.

timj (Member Author, Oct 30, 2023): I had completely failed to notice that that flag was in there now. One option is for find_dataset to always expand and add everything. People shouldn't be calling it in bulk.

Contributor: Always expanding could be inefficient; many clients do not care about datastore records at all, and we should not be pessimizing things for them.

timj: Presumably get_dataset should have the datastore_records and dimension_records booleans as well, for consistency?

Member: 👍

timj: I also need to add datastore records to SerializedDatasetRef, because that was never updated and so it is impossible to return datastore records from the server at the moment.

Contributor: Jim asked me not to add them to SerializedDatasetRef, as it may increase graph size on disk (but I don't think we give records to refs that go into the graph, at least not yet).

timj: We are sending SerializedDatasetRef over the wire, so we can only reconstruct things at the far end that were put in at the other end. We need to add them somehow (but obviously they should be optional). For now I've turned the feature off in client/server, so you can't get the datastore records.
**kwargs: Any,
) -> DatasetRef | None:
"""Find a dataset given its `DatasetType` and data ID.
This can be used to obtain a `DatasetRef` that permits the dataset to
be read from a `Datastore`. If the dataset is a component and cannot
be found using the provided dataset type, a dataset ref for the parent
will be returned instead but with the correct dataset type.
Parameters
----------
dataset_type : `DatasetType` or `str`
A `DatasetType` or the name of one. If this is a `DatasetType`
instance, its storage class will be respected and propagated to
the output, even if it differs from the dataset type definition
in the registry, as long as the storage classes are convertible.
data_id : `dict` or `DataCoordinate`, optional
A `dict`-like object containing the `Dimension` links that identify
the dataset within a collection. If it is a `dict`, the data ID
can include dimension record values such as ``day_obs`` and
``seq_num`` or ``full_name`` that can be used to derive the
primary dimension values.
collections : `str` or `list` [`str`], optional
An ordered list of collections to search for the dataset.
Defaults to ``self.defaults.collections``.
timespan : `Timespan`, optional
A timespan that the validity range of the dataset must overlap.
If not provided, any `~CollectionType.CALIBRATION` collections
matched by the ``collections`` argument will not be searched.
storage_class : `str` or `StorageClass` or `None`
A storage class to use when creating the returned entry. If given,
it must be compatible with the default storage class.
dimension_records : `bool`, optional
If `True` the ref will be expanded and contain dimension records.
datastore_records : `bool`, optional
If `True` the ref will contain associated datastore records.
**kwargs
Additional keyword arguments passed to
`DataCoordinate.standardize` to convert ``dataId`` to a true
`DataCoordinate` or augment an existing one. This can also include
dimension record metadata that can be used to derive a primary
dimension value.
Returns
-------
ref : `DatasetRef` or `None`
A reference to the dataset, or `None` if no matching Dataset
was found.
Raises
------
lsst.daf.butler.NoDefaultCollectionError
Raised if ``collections`` is `None` and
``self.collections`` is `None`.
LookupError
Raised if one or more data ID keys are missing.
lsst.daf.butler.MissingDatasetTypeError
Raised if the dataset type does not exist.
lsst.daf.butler.MissingCollectionError
Raised if any of ``collections`` does not exist in the registry.
Notes
-----
This method simply returns `None` and does not raise an exception even
when the set of collections searched is intrinsically incompatible with
the dataset type, e.g. if ``datasetType.isCalibration() is False``, but
only `~CollectionType.CALIBRATION` collections are being searched.
This may make it harder to debug some lookup failures, but the behavior
is intentional; we consider it more important that failed searches are
reported consistently, regardless of the reason, and that adding
additional collections that do not contain a match to the search path
never changes the behavior.
This method handles component dataset types automatically, though most
other query operations do not.
"""
raise NotImplementedError()
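A hedged example of the data-ID flexibility described above, using dimension-record values (``day_obs``/``seq_num``) rather than an exposure ID; the repository, instrument, and collection names are invented:

from lsst.daf.butler import Butler

butler = Butler("repo")  # illustrative

# day_obs/seq_num are dimension-record values; find_dataset derives the
# exposure dimension from them before querying the registry.
ref = butler.find_dataset(
    "raw",
    {"instrument": "LATISS", "detector": 0, "day_obs": 20231101, "seq_num": 42},
    collections="LATISS/raw/all",
    dimension_records=True,
)
if ref is None:
    print("no matching raw exposure found")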

@abstractmethod
def retrieveArtifacts(
self,
70 changes: 62 additions & 8 deletions python/lsst/daf/butler/direct_butler.py
@@ -55,7 +55,7 @@
from ._butler_config import ButlerConfig
from ._config import Config
from ._dataset_existence import DatasetExistence
from ._dataset_ref import DatasetIdGenEnum, DatasetRef
from ._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef
from ._dataset_type import DatasetType
from ._deferredDatasetHandle import DeferredDatasetHandle
from ._exceptions import ValidationError
@@ -228,7 +228,7 @@
def _retrieve_dataset_type(self, name: str) -> DatasetType | None:
"""Return DatasetType defined in registry given dataset type name."""
try:
return self._registry.getDatasetType(name)
return self.get_dataset_type(name)
except MissingDatasetTypeError:
return None

@@ -369,11 +369,11 @@
if isinstance(datasetRefOrType, DatasetType):
externalDatasetType = datasetRefOrType
else:
internalDatasetType = self._registry.getDatasetType(datasetRefOrType)
internalDatasetType = self.get_dataset_type(datasetRefOrType)

# Check that they are self-consistent
if externalDatasetType is not None:
internalDatasetType = self._registry.getDatasetType(externalDatasetType.name)
internalDatasetType = self.get_dataset_type(externalDatasetType.name)
if externalDatasetType != internalDatasetType:
# We can allow differences if they are compatible, depending
# on whether this is a get or a put. A get requires that
@@ -846,7 +846,7 @@
)
# Always lookup the DatasetRef, even if one is given, to ensure it is
# present in the current collection.
ref = self._registry.findDataset(
ref = self.find_dataset(
datasetType,
dataId,
collections=collections,
@@ -1318,6 +1318,60 @@
)
return primary

def get_dataset_type(self, name: str) -> DatasetType:
return self._registry.getDatasetType(name)

def get_dataset(
self,
id: DatasetId,
storage_class: str | StorageClass | None = None,
dimension_records: bool = False,
datastore_records: bool = False,
) -> DatasetRef | None:
ref = self._registry.getDataset(id)
if ref is not None:
if dimension_records:
ref = ref.expanded(self._registry.expandDataId(ref.dataId, graph=ref.datasetType.dimensions))
if storage_class:
ref = ref.overrideStorageClass(storage_class)
if datastore_records:
ref = self._registry.get_datastore_records(ref)

Codecov warning (codecov/patch): added line python/lsst/daf/butler/direct_butler.py#L1338 was not covered by tests.
return ref

def find_dataset(
self,
dataset_type: DatasetType | str,
data_id: DataId | None = None,
*,
collections: str | Sequence[str] | None = None,
timespan: Timespan | None = None,
storage_class: str | StorageClass | None = None,
dimension_records: bool = False,
datastore_records: bool = False,
**kwargs: Any,
) -> DatasetRef | None:
# Handle any parts of the data ID that are not using primary dimension
# keys.
if isinstance(dataset_type, str):
actual_type = self.get_dataset_type(dataset_type)
else:
actual_type = dataset_type
data_id, kwargs = self._rewrite_data_id(data_id, actual_type, **kwargs)

ref = self._registry.findDataset(
dataset_type,
data_id,
collections=collections,
timespan=timespan,
datastore_records=datastore_records,
**kwargs,
)
if ref is not None and dimension_records:
ref = ref.expanded(self._registry.expandDataId(ref.dataId, graph=ref.datasetType.dimensions))
if ref is not None and storage_class is not None:
ref = ref.overrideStorageClass(storage_class)
return ref
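As an illustration of the ``timespan`` pass-through above, a calibration lookup might look like the following sketch (repository, instrument, and collection names are made up):

from astropy.time import Time

from lsst.daf.butler import Butler, Timespan

butler = Butler("repo")  # illustrative

# Only calibrations whose validity range overlaps this window are matched
# when a CALIBRATION collection is searched.
window = Timespan(
    Time("2023-11-01T00:00:00", scale="tai"),
    Time("2023-11-01T01:00:00", scale="tai"),
)
bias_ref = butler.find_dataset(
    "bias",
    instrument="LATISS",
    detector=0,
    collections="LATISS/calib",
    timespan=window,
)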

def retrieveArtifacts(
self,
refs: Iterable[DatasetRef],
@@ -1877,7 +1931,7 @@
newly_registered_dataset_types.add(datasetType)
else:
# If the dataset type is missing, let it fail immediately.
target_dataset_type = self._registry.getDatasetType(datasetType.name)
target_dataset_type = self.get_dataset_type(datasetType.name)
if target_dataset_type != datasetType:
raise ConflictingDefinitionError(
"Source butler dataset type differs from definition"
@@ -1994,7 +2048,7 @@
) -> None:
# Docstring inherited.
if datasetTypeNames:
datasetTypes = [self._registry.getDatasetType(name) for name in datasetTypeNames]
datasetTypes = [self.get_dataset_type(name) for name in datasetTypeNames]
else:
datasetTypes = list(self._registry.queryDatasetTypes())

@@ -2064,7 +2118,7 @@
pass
else:
try:
self._registry.getDatasetType(key.name)
self.get_dataset_type(key.name)
except KeyError:
if logFailures:
_LOG.critical(
8 changes: 5 additions & 3 deletions python/lsst/daf/butler/registry/_registry.py
@@ -27,15 +27,15 @@

from __future__ import annotations

__all__ = ("Registry",)
__all__ = ("Registry", "CollectionArgType")

import contextlib
import logging
import re
from abc import ABC, abstractmethod
from collections.abc import Iterable, Iterator, Mapping, Sequence
from types import EllipsisType
from typing import TYPE_CHECKING, Any
from typing import TYPE_CHECKING, Any, TypeAlias

from .._dataset_association import DatasetAssociation
from .._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef
@@ -64,7 +64,9 @@
_LOG = logging.getLogger(__name__)

# Type alias for `collections` arguments.
CollectionArgType = str | re.Pattern | Iterable[str | re.Pattern] | EllipsisType | CollectionWildcard
CollectionArgType: TypeAlias = (
str | re.Pattern | Iterable[str | re.Pattern] | EllipsisType | CollectionWildcard
)
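For illustration, the kinds of values this alias is meant to cover when passed as a ``collections`` argument (collection names are invented; `CollectionWildcard` instances are also accepted):

import re

single_name = "HSC/runs/RC2"                    # a plain collection name
pattern = re.compile(r"HSC/runs/.*")            # a regular expression
mixed = ["HSC/defaults", re.compile(r"u/.*")]   # an iterable of names and patterns
everything = ...                                # Ellipsis matches all collections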


class Registry(ABC):