diff --git a/doc/changes/DM-41365.feature.rst b/doc/changes/DM-41365.feature.rst
new file mode 100644
index 0000000000..c95608d470
--- /dev/null
+++ b/doc/changes/DM-41365.feature.rst
@@ -0,0 +1 @@
+Added new ``Butler`` APIs migrated from registry: ``Butler.get_dataset_type()``, ``Butler.get_dataset()``, and ``Butler.find_dataset()``.
diff --git a/python/lsst/daf/butler/__init__.py b/python/lsst/daf/butler/__init__.py
index c346ca7b87..8333081ddc 100644
--- a/python/lsst/daf/butler/__init__.py
+++ b/python/lsst/daf/butler/__init__.py
@@ -79,7 +79,15 @@
 
 # Do not import or lift symbols from 'server' or 'server_models'.
 # Import the registry subpackage directly for other symbols.
-from .registry import CollectionSearch, CollectionType, Registry, RegistryConfig
+from .registry import (
+    CollectionSearch,
+    CollectionType,
+    MissingCollectionError,
+    MissingDatasetTypeError,
+    NoDefaultCollectionError,
+    Registry,
+    RegistryConfig,
+)
 from .transfers import RepoExportContext, YamlRepoExportBackend, YamlRepoImportBackend
 from .version import *
 
diff --git a/python/lsst/daf/butler/_butler.py b/python/lsst/daf/butler/_butler.py
index f9f32fbbf3..f83a4ad347 100644
--- a/python/lsst/daf/butler/_butler.py
+++ b/python/lsst/daf/butler/_butler.py
@@ -42,12 +42,13 @@
 from ._butler_repo_index import ButlerRepoIndex
 from ._config import Config, ConfigSubset
 from ._dataset_existence import DatasetExistence
-from ._dataset_ref import DatasetIdGenEnum, DatasetRef
+from ._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef
 from ._dataset_type import DatasetType
 from ._deferredDatasetHandle import DeferredDatasetHandle
 from ._file_dataset import FileDataset
 from ._limited_butler import LimitedButler
 from ._storage_class import StorageClass
+from ._timespan import Timespan
 from .datastore import DatasetRefURIs, Datastore
 from .dimensions import DataId, DimensionConfig
 from .registry import Registry, RegistryConfig, _RegistryFactory
@@ -772,6 +773,151 @@ def getURI(
         """
         raise NotImplementedError()
 
+    @abstractmethod
+    def get_dataset_type(self, name: str) -> DatasetType:
+        """Get the `DatasetType`.
+
+        Parameters
+        ----------
+        name : `str`
+            Name of the type.
+
+        Returns
+        -------
+        type : `DatasetType`
+            The `DatasetType` associated with the given name.
+
+        Raises
+        ------
+        lsst.daf.butler.MissingDatasetTypeError
+            Raised if the requested dataset type has not been registered.
+
+        Notes
+        -----
+        This method handles component dataset types automatically, though most
+        other operations do not.
+        """
+        raise NotImplementedError()
+
+    @abstractmethod
+    def get_dataset(
+        self,
+        id: DatasetId,
+        storage_class: str | StorageClass | None = None,
+        dimension_records: bool = False,
+        datastore_records: bool = False,
+    ) -> DatasetRef | None:
+        """Retrieve a Dataset entry.
+
+        Parameters
+        ----------
+        id : `DatasetId`
+            The unique identifier for the dataset.
+        storage_class : `str` or `StorageClass` or `None`
+            A storage class to use when creating the returned entry. If given
+            it must be compatible with the default storage class.
+        dimension_records : `bool`, optional
+            If `True` the ref will be expanded and contain dimension records.
+        datastore_records : `bool`, optional
+            If `True` the ref will contain associated datastore records.
+
+        Returns
+        -------
+        ref : `DatasetRef` or `None`
+            A ref to the Dataset, or `None` if no matching Dataset
+            was found.
+ """ + raise NotImplementedError() + + @abstractmethod + def find_dataset( + self, + dataset_type: DatasetType | str, + data_id: DataId | None = None, + *, + collections: str | Sequence[str] | None = None, + timespan: Timespan | None = None, + storage_class: str | StorageClass | None = None, + dimension_records: bool = False, + datastore_records: bool = False, + **kwargs: Any, + ) -> DatasetRef | None: + """Find a dataset given its `DatasetType` and data ID. + + This can be used to obtain a `DatasetRef` that permits the dataset to + be read from a `Datastore`. If the dataset is a component and can not + be found using the provided dataset type, a dataset ref for the parent + will be returned instead but with the correct dataset type. + + Parameters + ---------- + dataset_type : `DatasetType` or `str` + A `DatasetType` or the name of one. If this is a `DatasetType` + instance, its storage class will be respected and propagated to + the output, even if it differs from the dataset type definition + in the registry, as long as the storage classes are convertible. + data_id : `dict` or `DataCoordinate`, optional + A `dict`-like object containing the `Dimension` links that identify + the dataset within a collection. If it is a `dict` the dataId + can include dimension record values such as ``day_obs`` and + ``seq_num`` or ``full_name`` that can be used to derive the + primary dimension. + collections : `str` or `list` [`str`], optional + A an ordered list of collections to search for the dataset. + Defaults to ``self.defaults.collections``. + timespan : `Timespan`, optional + A timespan that the validity range of the dataset must overlap. + If not provided, any `~CollectionType.CALIBRATION` collections + matched by the ``collections`` argument will not be searched. + storage_class : `str` or `StorageClass` or `None` + A storage class to use when creating the returned entry. If given + it must be compatible with the default storage class. + dimension_records: `bool`, optional + If `True` the ref will be expanded and contain dimension records. + datastore_records: `bool`, optional. + If `True` the ref will contain associated datastore records. + **kwargs + Additional keyword arguments passed to + `DataCoordinate.standardize` to convert ``dataId`` to a true + `DataCoordinate` or augment an existing one. This can also include + dimension record metadata that can be used to derive a primary + dimension value. + + Returns + ------- + ref : `DatasetRef` + A reference to the dataset, or `None` if no matching Dataset + was found. + + Raises + ------ + lsst.daf.butler.NoDefaultCollectionError + Raised if ``collections`` is `None` and + ``self.collections`` is `None`. + LookupError + Raised if one or more data ID keys are missing. + lsst.daf.butler.MissingDatasetTypeError + Raised if the dataset type does not exist. + lsst.daf.butler.MissingCollectionError + Raised if any of ``collections`` does not exist in the registry. + + Notes + ----- + This method simply returns `None` and does not raise an exception even + when the set of collections searched is intrinsically incompatible with + the dataset type, e.g. if ``datasetType.isCalibration() is False``, but + only `~CollectionType.CALIBRATION` collections are being searched. 
+ This may make it harder to debug some lookup failures, but the behavior + is intentional; we consider it more important that failed searches are + reported consistently, regardless of the reason, and that adding + additional collections that do not contain a match to the search path + never changes the behavior. + + This method handles component dataset types automatically, though most + other query operations do not. + """ + raise NotImplementedError() + @abstractmethod def retrieveArtifacts( self, diff --git a/python/lsst/daf/butler/direct_butler.py b/python/lsst/daf/butler/direct_butler.py index 6e7ad20cb4..6b70ecb1e1 100644 --- a/python/lsst/daf/butler/direct_butler.py +++ b/python/lsst/daf/butler/direct_butler.py @@ -55,7 +55,7 @@ from ._butler_config import ButlerConfig from ._config import Config from ._dataset_existence import DatasetExistence -from ._dataset_ref import DatasetIdGenEnum, DatasetRef +from ._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef from ._dataset_type import DatasetType from ._deferredDatasetHandle import DeferredDatasetHandle from ._exceptions import ValidationError @@ -228,7 +228,7 @@ def __init__( def _retrieve_dataset_type(self, name: str) -> DatasetType | None: """Return DatasetType defined in registry given dataset type name.""" try: - return self._registry.getDatasetType(name) + return self.get_dataset_type(name) except MissingDatasetTypeError: return None @@ -369,11 +369,11 @@ def _standardizeArgs( if isinstance(datasetRefOrType, DatasetType): externalDatasetType = datasetRefOrType else: - internalDatasetType = self._registry.getDatasetType(datasetRefOrType) + internalDatasetType = self.get_dataset_type(datasetRefOrType) # Check that they are self-consistent if externalDatasetType is not None: - internalDatasetType = self._registry.getDatasetType(externalDatasetType.name) + internalDatasetType = self.get_dataset_type(externalDatasetType.name) if externalDatasetType != internalDatasetType: # We can allow differences if they are compatible, depending # on whether this is a get or a put. A get requires that @@ -846,7 +846,7 @@ def _findDatasetRef( ) # Always lookup the DatasetRef, even if one is given, to ensure it is # present in the current collection. - ref = self._registry.findDataset( + ref = self.find_dataset( datasetType, dataId, collections=collections, @@ -1318,6 +1318,60 @@ def getURI( ) return primary + def get_dataset_type(self, name: str) -> DatasetType: + return self._registry.getDatasetType(name) + + def get_dataset( + self, + id: DatasetId, + storage_class: str | StorageClass | None = None, + dimension_records: bool = False, + datastore_records: bool = False, + ) -> DatasetRef | None: + ref = self._registry.getDataset(id) + if ref is not None: + if dimension_records: + ref = ref.expanded(self._registry.expandDataId(ref.dataId, graph=ref.datasetType.dimensions)) + if storage_class: + ref = ref.overrideStorageClass(storage_class) + if datastore_records: + ref = self._registry.get_datastore_records(ref) + return ref + + def find_dataset( + self, + dataset_type: DatasetType | str, + data_id: DataId | None = None, + *, + collections: str | Sequence[str] | None = None, + timespan: Timespan | None = None, + storage_class: str | StorageClass | None = None, + dimension_records: bool = False, + datastore_records: bool = False, + **kwargs: Any, + ) -> DatasetRef | None: + # Handle any parts of the dataID that are not using primary dimension + # keys. 
+ if isinstance(dataset_type, str): + actual_type = self.get_dataset_type(dataset_type) + else: + actual_type = dataset_type + data_id, kwargs = self._rewrite_data_id(data_id, actual_type, **kwargs) + + ref = self._registry.findDataset( + dataset_type, + data_id, + collections=collections, + timespan=timespan, + datastore_records=datastore_records, + **kwargs, + ) + if ref is not None and dimension_records: + ref = ref.expanded(self._registry.expandDataId(ref.dataId, graph=ref.datasetType.dimensions)) + if ref is not None and storage_class is not None: + ref = ref.overrideStorageClass(storage_class) + return ref + def retrieveArtifacts( self, refs: Iterable[DatasetRef], @@ -1877,7 +1931,7 @@ def transfer_from( newly_registered_dataset_types.add(datasetType) else: # If the dataset type is missing, let it fail immediately. - target_dataset_type = self._registry.getDatasetType(datasetType.name) + target_dataset_type = self.get_dataset_type(datasetType.name) if target_dataset_type != datasetType: raise ConflictingDefinitionError( "Source butler dataset type differs from definition" @@ -1994,7 +2048,7 @@ def validateConfiguration( ) -> None: # Docstring inherited. if datasetTypeNames: - datasetTypes = [self._registry.getDatasetType(name) for name in datasetTypeNames] + datasetTypes = [self.get_dataset_type(name) for name in datasetTypeNames] else: datasetTypes = list(self._registry.queryDatasetTypes()) @@ -2064,7 +2118,7 @@ def validateConfiguration( pass else: try: - self._registry.getDatasetType(key.name) + self.get_dataset_type(key.name) except KeyError: if logFailures: _LOG.critical( diff --git a/python/lsst/daf/butler/registry/_registry.py b/python/lsst/daf/butler/registry/_registry.py index 398f2479a4..2f0cb3231d 100644 --- a/python/lsst/daf/butler/registry/_registry.py +++ b/python/lsst/daf/butler/registry/_registry.py @@ -27,7 +27,7 @@ from __future__ import annotations -__all__ = ("Registry",) +__all__ = ("Registry", "CollectionArgType") import contextlib import logging @@ -35,7 +35,7 @@ from abc import ABC, abstractmethod from collections.abc import Iterable, Iterator, Mapping, Sequence from types import EllipsisType -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, TypeAlias from .._dataset_association import DatasetAssociation from .._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef @@ -64,7 +64,9 @@ _LOG = logging.getLogger(__name__) # TYpe alias for `collections` arguments. 
-CollectionArgType = str | re.Pattern | Iterable[str | re.Pattern] | EllipsisType | CollectionWildcard
+CollectionArgType: TypeAlias = (
+    str | re.Pattern | Iterable[str | re.Pattern] | EllipsisType | CollectionWildcard
+)
 
 
 class Registry(ABC):
diff --git a/python/lsst/daf/butler/remote_butler/_remote_butler.py b/python/lsst/daf/butler/remote_butler/_remote_butler.py
index a9a0273618..20cff12322 100644
--- a/python/lsst/daf/butler/remote_butler/_remote_butler.py
+++ b/python/lsst/daf/butler/remote_butler/_remote_butler.py
@@ -40,17 +40,20 @@
 from .._butler_config import ButlerConfig
 from .._config import Config
 from .._dataset_existence import DatasetExistence
-from .._dataset_ref import DatasetIdGenEnum, DatasetRef
-from .._dataset_type import DatasetType
+from .._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef, SerializedDatasetRef
+from .._dataset_type import DatasetType, SerializedDatasetType
 from .._deferredDatasetHandle import DeferredDatasetHandle
 from .._file_dataset import FileDataset
 from .._limited_butler import LimitedButler
 from .._storage_class import StorageClass
+from .._timespan import Timespan
 from ..datastore import DatasetRefURIs
-from ..dimensions import DataId, DimensionConfig, DimensionUniverse
-from ..registry import Registry, RegistryDefaults
+from ..dimensions import DataCoordinate, DataId, DimensionConfig, DimensionUniverse, SerializedDataCoordinate
+from ..registry import MissingDatasetTypeError, NoDefaultCollectionError, Registry, RegistryDefaults
+from ..registry.wildcards import CollectionWildcard
 from ..transfers import RepoExportContext
 from ._config import RemoteButlerConfigModel
+from .server import FindDatasetModel
 
 
 class RemoteButler(Butler):
@@ -100,9 +103,38 @@ def dimensions(self) -> DimensionUniverse:
         self._dimensions = DimensionUniverse(config)
         return self._dimensions
 
-    def getDatasetType(self, name: str) -> DatasetType:
-        # Docstring inherited.
-        raise NotImplementedError()
+    def _simplify_dataId(
+        self, dataId: DataId | None, **kwargs: dict[str, int | str]
+    ) -> SerializedDataCoordinate | None:
+        """Take a generic Data ID and convert it to a serializable form.
+
+        Parameters
+        ----------
+        dataId : `dict`, `None`, `DataCoordinate`
+            The data ID to serialize.
+        **kwargs : `dict`
+            Additional values that should be included if this is not
+            a `DataCoordinate`.
+
+        Returns
+        -------
+        data_id : `SerializedDataCoordinate` or `None`
+            A serializable form.
+        """
+        if dataId is None and not kwargs:
+            return None
+        if isinstance(dataId, DataCoordinate):
+            return dataId.to_simple()
+
+        if dataId is None:
+            data_id = kwargs
+        else:
+            # Change variable because DataId is immutable and mypy complains.
+            data_id = dict(dataId)
+            data_id.update(kwargs)
+
+        # Assume we can treat it as a dict.
+        return SerializedDataCoordinate(dataId=data_id)
 
     def transaction(self) -> AbstractContextManager[None]:
         """Will always raise NotImplementedError.
@@ -179,6 +211,94 @@ def getURI(
         # Docstring inherited.
         raise NotImplementedError()
 
+    def get_dataset_type(self, name: str) -> DatasetType:
+        # In future implementation this should directly access the cache
+        # and only go to the server if the dataset type is not known.
+ path = f"dataset_type/{name}" + response = self._client.get(self._get_url(path)) + if response.status_code != httpx.codes.OK: + content = response.json() + if content["exception"] == "MissingDatasetTypeError": + raise MissingDatasetTypeError(content["detail"]) + response.raise_for_status() + return DatasetType.from_simple(SerializedDatasetType(**response.json()), universe=self.dimensions) + + def get_dataset( + self, + id: DatasetId, + storage_class: str | StorageClass | None = None, + dimension_records: bool = False, + datastore_records: bool = False, + ) -> DatasetRef | None: + path = f"dataset/{id}" + if isinstance(storage_class, StorageClass): + storage_class_name = storage_class.name + elif storage_class: + storage_class_name = storage_class + params: dict[str, str | bool] = { + "dimension_records": dimension_records, + "datastore_records": datastore_records, + } + if datastore_records: + raise ValueError("Datastore records can not yet be returned in client/server butler.") + if storage_class: + params["storage_class"] = storage_class_name + response = self._client.get(self._get_url(path), params=params) + response.raise_for_status() + if response.json() is None: + return None + return DatasetRef.from_simple(SerializedDatasetRef(**response.json()), universe=self.dimensions) + + def find_dataset( + self, + dataset_type: DatasetType | str, + data_id: DataId | None = None, + *, + collections: str | Sequence[str] | None = None, + timespan: Timespan | None = None, + storage_class: str | StorageClass | None = None, + dimension_records: bool = False, + datastore_records: bool = False, + **kwargs: Any, + ) -> DatasetRef | None: + if collections is None: + if not self.collections: + raise NoDefaultCollectionError( + "No collections provided to find_dataset, and no defaults from butler construction." + ) + collections = self.collections + # Temporary hack. Assume strings for collections. In future + # want to construct CollectionWildcard and filter it through collection + # cache to generate list of collection names. 
+ wildcards = CollectionWildcard.from_expression(collections) + + if datastore_records: + raise ValueError("Datastore records can not yet be returned in client/server butler.") + if timespan: + raise ValueError("Timespan can not yet be used in butler client/server.") + + if isinstance(dataset_type, DatasetType): + dataset_type = dataset_type.name + + if isinstance(storage_class, StorageClass): + storage_class = storage_class.name + + query = FindDatasetModel( + data_id=self._simplify_dataId(data_id, **kwargs), + collections=wildcards.strings, + storage_class=storage_class, + dimension_records=dimension_records, + datastore_records=datastore_records, + ) + + path = f"find_dataset/{dataset_type}" + response = self._client.post( + self._get_url(path), json=query.model_dump(mode="json", exclude_unset=True, exclude_defaults=True) + ) + response.raise_for_status() + + return DatasetRef.from_simple(SerializedDatasetRef(**response.json()), universe=self.dimensions) + def retrieveArtifacts( self, refs: Iterable[DatasetRef], diff --git a/python/lsst/daf/butler/remote_butler/server/__init__.py b/python/lsst/daf/butler/remote_butler/server/__init__.py index d63badaf11..93c9018bc4 100644 --- a/python/lsst/daf/butler/remote_butler/server/__init__.py +++ b/python/lsst/daf/butler/remote_butler/server/__init__.py @@ -27,3 +27,4 @@ from ._factory import * from ._server import * +from ._server_models import * diff --git a/python/lsst/daf/butler/remote_butler/server/_server.py b/python/lsst/daf/butler/remote_butler/server/_server.py index 3be9348223..a0c84555bd 100644 --- a/python/lsst/daf/butler/remote_butler/server/_server.py +++ b/python/lsst/daf/butler/remote_butler/server/_server.py @@ -30,14 +30,24 @@ __all__ = ("app", "factory_dependency") import logging +import uuid from functools import cache from typing import Any -from fastapi import Depends, FastAPI +from fastapi import Depends, FastAPI, Request from fastapi.middleware.gzip import GZipMiddleware -from lsst.daf.butler import Butler +from fastapi.responses import JSONResponse +from lsst.daf.butler import ( + Butler, + DataCoordinate, + MissingDatasetTypeError, + SerializedDataCoordinate, + SerializedDatasetRef, + SerializedDatasetType, +) from ._factory import Factory +from ._server_models import FindDatasetModel BUTLER_ROOT = "ci_hsc_gen3/DATA" @@ -47,6 +57,17 @@ app.add_middleware(GZipMiddleware, minimum_size=1000) +@app.exception_handler(MissingDatasetTypeError) +def missing_dataset_type_exception_handler(request: Request, exc: MissingDatasetTypeError) -> JSONResponse: + # Remove the double quotes around the string form. These confuse + # the JSON serialization when single quotes are in the message. + message = str(exc).strip('"') + return JSONResponse( + status_code=404, + content={"detail": message, "exception": "MissingDatasetTypeError"}, + ) + + @cache def _make_global_butler() -> Butler: return Butler.from_config(BUTLER_ROOT) @@ -56,8 +77,111 @@ def factory_dependency() -> Factory: return Factory(butler=_make_global_butler()) +def unpack_dataId(butler: Butler, data_id: SerializedDataCoordinate | None) -> DataCoordinate | None: + """Convert the serialized dataId back to full DataCoordinate. + + Parameters + ---------- + butler : `lsst.daf.butler.Butler` + The butler to use for registry and universe. + data_id : `SerializedDataCoordinate` or `None` + The serialized form. + + Returns + ------- + dataId : `DataCoordinate` or `None` + The DataId usable by registry. 
+ """ + if data_id is None: + return None + return DataCoordinate.from_simple(data_id, registry=butler.registry) + + @app.get("/butler/v1/universe", response_model=dict[str, Any]) def get_dimension_universe(factory: Factory = Depends(factory_dependency)) -> dict[str, Any]: """Allow remote client to get dimensions definition.""" butler = factory.create_butler() return butler.dimensions.dimensionConfig.toDict() + + +@app.get( + "/butler/v1/dataset_type/{dataset_type_name}", + summary="Retrieve this dataset type definition.", + response_model=SerializedDatasetType, + response_model_exclude_unset=True, + response_model_exclude_defaults=True, + response_model_exclude_none=True, +) +def get_dataset_type( + dataset_type_name: str, factory: Factory = Depends(factory_dependency) +) -> SerializedDatasetType: + """Return the dataset type.""" + butler = factory.create_butler() + datasetType = butler.get_dataset_type(dataset_type_name) + return datasetType.to_simple() + + +@app.get( + "/butler/v1/dataset/{id}", + summary="Retrieve this dataset definition.", + response_model=SerializedDatasetRef | None, + response_model_exclude_unset=True, + response_model_exclude_defaults=True, + response_model_exclude_none=True, +) +def get_dataset( + id: uuid.UUID, + storage_class: str | None = None, + dimension_records: bool = False, + datastore_records: bool = False, + factory: Factory = Depends(factory_dependency), +) -> SerializedDatasetRef | None: + """Return a single dataset reference.""" + butler = factory.create_butler() + ref = butler.get_dataset( + id, + storage_class=storage_class, + dimension_records=dimension_records, + datastore_records=datastore_records, + ) + if ref is not None: + return ref.to_simple() + # This could raise a 404 since id is not found. The standard implementation + # get_dataset method returns without error so follow that example here. + return ref + + +# Not yet supported: TimeSpan is not yet a pydantic model. +# collections parameter assumes client-side has resolved regexes. +@app.post( + "/butler/v1/find_dataset/{dataset_type}", + summary="Retrieve this dataset definition from collection, dataset type, and dataId", + response_model=SerializedDatasetRef, + response_model_exclude_unset=True, + response_model_exclude_defaults=True, + response_model_exclude_none=True, +) +def find_dataset( + dataset_type: str, + query: FindDatasetModel, + factory: Factory = Depends(factory_dependency), +) -> SerializedDatasetRef | None: + collection_query = query.collections if query.collections else None + + # Get the simple dict from the SerializedDataCoordinate. We do not know + # if it is a well-defined DataCoordinate or needs some massaging first. + # find_dataset will use dimension record queries if necessary. + data_id = query.data_id.dataId + + butler = factory.create_butler() + ref = butler.find_dataset( + dataset_type, + None, + collections=collection_query, + storage_class=query.storage_class, + timespan=None, + dimension_records=query.dimension_records, + datastore_records=query.datastore_records, + **data_id, + ) + return ref.to_simple() if ref else None diff --git a/python/lsst/daf/butler/remote_butler/server/_server_models.py b/python/lsst/daf/butler/remote_butler/server/_server_models.py index 1c34747e33..627d09abb3 100644 --- a/python/lsst/daf/butler/remote_butler/server/_server_models.py +++ b/python/lsst/daf/butler/remote_butler/server/_server_models.py @@ -26,3 +26,17 @@ # along with this program. If not, see . 
"""Models used for client/server communication.""" + +__all__ = ["FindDatasetModel"] + +from lsst.daf.butler import SerializedDataCoordinate + +from ..._compat import _BaseModelCompat + + +class FindDatasetModel(_BaseModelCompat): + data_id: SerializedDataCoordinate + collections: list[str] + storage_class: str | None + dimension_records: bool = False + datastore_records: bool = False diff --git a/python/lsst/daf/butler/script/ingest_files.py b/python/lsst/daf/butler/script/ingest_files.py index e4e645229b..aa3f2b1aac 100644 --- a/python/lsst/daf/butler/script/ingest_files.py +++ b/python/lsst/daf/butler/script/ingest_files.py @@ -107,7 +107,7 @@ def ingest_files( # Create the butler with the relevant run attached. butler = Butler.from_config(repo, run=run) - datasetType = butler.registry.getDatasetType(dataset_type) + datasetType = butler.get_dataset_type(dataset_type) # Convert the k=v strings into a dataId dict. universe = butler.dimensions diff --git a/tests/data/registry/base.yaml b/tests/data/registry/base.yaml index 799e385cfa..7b2791ece0 100644 --- a/tests/data/registry/base.yaml +++ b/tests/data/registry/base.yaml @@ -13,6 +13,7 @@ data: - name: Cam1 visit_max: 1024 + visit_system: 1 exposure_max: 512 detector_max: 4 class_name: lsst.obs.base.Instrument diff --git a/tests/test_butler.py b/tests/test_butler.py index e13d393b52..f34b144d10 100644 --- a/tests/test_butler.py +++ b/tests/test_butler.py @@ -384,7 +384,7 @@ def runPutGetTest(self, storageClass: StorageClass, datasetTypeName: str) -> Dir with self.assertRaises(FileNotFoundError): butler.get(ref) # Registry shouldn't be able to find it by dataset_id anymore. - self.assertIsNone(butler.registry.getDataset(ref.id)) + self.assertIsNone(butler.get_dataset(ref.id)) # Do explicit registry removal since we know they are # empty @@ -442,7 +442,7 @@ def runPutGetTest(self, storageClass: StorageClass, datasetTypeName: str) -> Dir ) self.assertEqual(count, stop) - compRef = butler.registry.findDataset(compNameS, dataId, collections=butler.collections) + compRef = butler.find_dataset(compNameS, dataId, collections=butler.collections) assert compRef is not None summary = butler.get(compRef) self.assertEqual(summary, metric.summary) @@ -822,7 +822,7 @@ def testPytypePutCoercion(self) -> None: # Check that the put still works if a DatasetType is given with # a definition matching this python type. - registry_type = butler.registry.getDatasetType(datasetTypeName) + registry_type = butler.get_dataset_type(datasetTypeName) this_type = DatasetType(datasetTypeName, registry_type.dimensions, "StructuredDataDictJson") metric2_ref = butler.put(test_dict, this_type, dataId=dataId, visit=425) self.assertEqual(metric2_ref.datasetType, registry_type) @@ -928,7 +928,7 @@ def testIngest(self) -> None: datasets[0].refs = [ cast( DatasetRef, - butler.registry.findDataset(ref.datasetType, dataId=ref.dataId, collections=ref.run), + butler.find_dataset(ref.datasetType, data_id=ref.dataId, collections=ref.run), ) for ref in datasets[0].refs ] @@ -938,7 +938,7 @@ def testIngest(self) -> None: for ref in dataset.refs: # Create a dict from the dataId to drop the records. 
new_data_id = {str(k): v for k, v in ref.dataId.items()} - new_ref = butler.registry.findDataset(ref.datasetType, new_data_id, collections=ref.run) + new_ref = butler.find_dataset(ref.datasetType, new_data_id, collections=ref.run) assert new_ref is not None self.assertFalse(new_ref.dataId.hasRecords()) refs.append(new_ref) @@ -1115,7 +1115,7 @@ def testTransaction(self) -> None: with self.assertRaises(LookupError, msg=f"Check can't get by {datasetTypeName} and {dataId}"): butler.get(datasetTypeName, dataId) # Also check explicitly if Dataset entry is missing - self.assertIsNone(butler.registry.findDataset(datasetType, dataId, collections=butler.collections)) + self.assertIsNone(butler.find_dataset(datasetType, dataId, collections=butler.collections)) # Direct retrieval should not find the file in the Datastore with self.assertRaises(FileNotFoundError, msg=f"Check {ref} can't be retrieved directly"): butler.get(ref) @@ -1708,7 +1708,7 @@ def testPytypeCoercion(self) -> None: metric = butler.get(datasetTypeName, dataId=dataId) self.assertEqual(get_full_type_name(metric), "lsst.daf.butler.tests.MetricsExample") - datasetType_ori = butler.registry.getDatasetType(datasetTypeName) + datasetType_ori = butler.get_dataset_type(datasetTypeName) self.assertEqual(datasetType_ori.storageClass.name, "StructuredDataNoComponents") # Now need to hack the registry dataset type definition. @@ -1725,7 +1725,7 @@ def testPytypeCoercion(self) -> None: # Force reset of dataset type cache butler.registry.refresh() - datasetType_new = butler.registry.getDatasetType(datasetTypeName) + datasetType_new = butler.get_dataset_type(datasetTypeName) self.assertEqual(datasetType_new.name, datasetType_ori.name) self.assertEqual(datasetType_new.storageClass.name, "StructuredDataNoComponentsModel") diff --git a/tests/test_server.py b/tests/test_server.py index 401e0126dd..a7661023fd 100644 --- a/tests/test_server.py +++ b/tests/test_server.py @@ -27,6 +27,7 @@ import os.path import unittest +import uuid try: # Failing to import any of these should disable the tests. @@ -37,7 +38,8 @@ TestClient = None app = None -from lsst.daf.butler import Butler +from lsst.daf.butler import Butler, DataCoordinate, DatasetRef, MissingDatasetTypeError, StorageClassFactory +from lsst.daf.butler.tests import DatastoreMock from lsst.daf.butler.tests.utils import MetricTestRepo, makeTestTempDir, removeTestTempDir TESTDIR = os.path.abspath(os.path.dirname(__file__)) @@ -62,11 +64,16 @@ class ButlerClientServerTestCase(unittest.TestCase): @classmethod def setUpClass(cls): + cls.storageClassFactory = StorageClassFactory() + # First create a butler and populate it. cls.root = makeTestTempDir(TESTDIR) cls.repo = MetricTestRepo(root=cls.root, configFile=os.path.join(TESTDIR, "config/basic/butler.yaml")) # Override the server's Butler initialization to point at our test repo - server_butler = Butler.from_config(cls.root) + server_butler = Butler.from_config(cls.root, writeable=True) + + # Not yet testing butler.get() + DatastoreMock.apply(server_butler) def create_factory_dependency(): return Factory(butler=server_butler) @@ -77,6 +84,10 @@ def create_factory_dependency(): cls.client = TestClient(app) cls.butler = _make_remote_butler(cls.client) + # Populate the test server. 
+ server_butler.import_(filename=os.path.join(TESTDIR, "data", "registry", "base.yaml")) + server_butler.import_(filename=os.path.join(TESTDIR, "data", "registry", "datasets-uuid.yaml")) + @classmethod def tearDownClass(cls): del app.dependency_overrides[factory_dependency] @@ -90,6 +101,68 @@ def test_simple(self): def test_remote_butler(self): universe = self.butler.dimensions self.assertEqual(universe.namespace, "daf_butler") + self.assertFalse(self.butler.isWriteable()) + + def test_get_dataset_type(self): + bias_type = self.butler.get_dataset_type("bias") + self.assertEqual(bias_type.name, "bias") + + with self.assertRaises(MissingDatasetTypeError): + self.butler.get_dataset_type("not_bias") + + def test_find_dataset(self): + storage_class = self.storageClassFactory.getStorageClass("Exposure") + + ref = self.butler.find_dataset("bias", collections="imported_g", detector=1, instrument="Cam1") + self.assertIsInstance(ref, DatasetRef) + self.assertEqual(ref.id, uuid.UUID("e15ab039-bc8b-4135-87c5-90902a7c0b22")) + self.assertFalse(ref.dataId.hasRecords()) + + # Try again with variation of parameters. + ref_new = self.butler.find_dataset( + "bias", + {"detector": 1}, + collections="imported_g", + instrument="Cam1", + dimension_records=True, + ) + self.assertEqual(ref_new, ref) + self.assertTrue(ref_new.dataId.hasRecords()) + + ref_new = self.butler.find_dataset( + ref.datasetType, + DataCoordinate.standardize(detector=1, instrument="Cam1", universe=self.butler.dimensions), + collections="imported_g", + storage_class=storage_class, + ) + self.assertEqual(ref_new, ref) + + ref2 = self.butler.get_dataset(ref.id) + self.assertEqual(ref2, ref) + + # Use detector name to find it. + ref3 = self.butler.find_dataset( + ref.datasetType, + collections="imported_g", + instrument="Cam1", + full_name="Aa", + ) + self.assertEqual(ref2, ref3) + + # Try expanded refs. + self.assertFalse(ref.dataId.hasRecords()) + expanded = self.butler.get_dataset(ref.id, dimension_records=True) + self.assertTrue(expanded.dataId.hasRecords()) + + # The test datasets are all Exposure so storage class conversion + # can not be tested until we fix that. For now at least test the + # code paths. + bias = self.butler.get_dataset(ref.id, storage_class=storage_class) + self.assertEqual(bias.datasetType.storageClass, storage_class) + + # Unknown dataset should not fail. + self.assertIsNone(self.butler.get_dataset(uuid.uuid4())) + self.assertIsNone(self.butler.get_dataset(uuid.uuid4(), storage_class="NumpyArray")) if __name__ == "__main__": diff --git a/tests/test_simpleButler.py b/tests/test_simpleButler.py index 483d7d03ef..e14370fc0a 100644 --- a/tests/test_simpleButler.py +++ b/tests/test_simpleButler.py @@ -277,8 +277,8 @@ def testButlerGet(self): # Find the DatasetRef for a flat coll = "imported_g" - flat2g = butler.registry.findDataset( - "flat", instrument="Cam1", detector=2, physical_filter="Cam1-G", collections=coll + flat2g = butler.find_dataset( + "flat", instrument="Cam1", full_name="Ab", physical_filter="Cam1-G", collections=coll ) # Create a numpy integer to check that works fine @@ -512,7 +512,7 @@ def testRegistryDefaults(self): # input collections. butler.registry.defaults = RegistryDefaults(collections=["imported_g"]) # Use findDataset without collections or instrument. 
- ref = butler.registry.findDataset("flat", detector=2, physical_filter="Cam1-G") + ref = butler.find_dataset("flat", detector=2, physical_filter="Cam1-G") # Do the same with Butler.get; this should ultimately invoke a lot of # the same code, so it's a bit circular, but mostly we're checking that # it works at all. @@ -583,7 +583,7 @@ def testJson(self): # input collections. butler.registry.defaults = RegistryDefaults(collections=["imported_g"]) # Use findDataset without collections or instrument. - ref = butler.registry.findDataset("flat", detector=2, physical_filter="Cam1-G") + ref = butler.find_dataset("flat", detector=2, physical_filter="Cam1-G") # Transform the ref and dataset type to and from JSON # and check that it can be reconstructed properly diff --git a/tests/test_testRepo.py b/tests/test_testRepo.py index 71f40e7e6f..50c42283b7 100644 --- a/tests/test_testRepo.py +++ b/tests/test_testRepo.py @@ -176,8 +176,8 @@ def testAddDatasetType(self): # Testing the DatasetType objects is not practical, because all tests # need a DimensionUniverse. So just check that we have the dataset # types we expect. - self.butler.registry.getDatasetType("DataType1") - self.butler.registry.getDatasetType("DataType2") + self.butler.get_dataset_type("DataType1") + self.butler.get_dataset_type("DataType2") with self.assertRaises(ValueError): addDatasetType(self.butler, "DataType3", {"4thDimension"}, "NumpyArray")
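A rough usage sketch of the three new ``Butler`` methods follows. The repository path is hypothetical, and the collection, dataset type, and data ID values assume the registry test data (``tests/data/registry``) imported in ``tests/test_server.py``; substitute values appropriate to a real repository.

```python
from lsst.daf.butler import Butler

# Hypothetical repository path; substitute a real butler repo.
butler = Butler.from_config("path/to/repo")

# Look up a dataset type definition without reaching for butler.registry.
bias_type = butler.get_dataset_type("bias")

# Resolve a single dataset from a dataset type, collection, and data ID,
# passing dimension values as keyword arguments.
ref = butler.find_dataset(
    "bias",
    collections="imported_g",
    instrument="Cam1",
    detector=1,
    dimension_records=True,  # expand the returned data ID with records
)

if ref is not None:
    # Retrieve the same dataset entry directly by its UUID.
    assert butler.get_dataset(ref.id) == ref
```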