From 6a0d1ea617d75eb9deb44634e49e17a8897e6a85 Mon Sep 17 00:00:00 2001
From: Jim Bosch
Date: Fri, 3 Nov 2023 10:24:08 -0700
Subject: [PATCH] Rewrite much of DatasetRef.from_simple to fix cache problem.

The previous caching did not respect storage class overrides; all
DatasetRefs with the same UUID and component would end up using the first
storage class deserialized.

The fix here is to always call overrideStorageClass when using the cache,
rather than adding the storage class to the cache key.  That's because we
expect the value of the cache to be mostly in avoiding reconstruction of
the data ID, and overrideStorageClass doesn't touch that.

By the same token, the dataset type name has been removed from the cache
key as well, with only refs for parent dataset types cached.  This should
shrink the cache a bit and improve performance in the usual (no-component)
case.
---
 doc/changes/DM-41562.bugfix.md                |   3 +
 python/lsst/daf/butler/_dataset_ref.py        | 104 +++++++++---------
 python/lsst/daf/butler/persistence_context.py |   6 +-
 3 files changed, 59 insertions(+), 54 deletions(-)
 create mode 100644 doc/changes/DM-41562.bugfix.md

diff --git a/doc/changes/DM-41562.bugfix.md b/doc/changes/DM-41562.bugfix.md
new file mode 100644
index 0000000000..ad5c4c4368
--- /dev/null
+++ b/doc/changes/DM-41562.bugfix.md
@@ -0,0 +1,3 @@
+Fix caching in DatasetRef deserialization that caused the serialized storage class to be ignored.
+
+This caused intermittent failures when running pipelines that use multiple storage classes for the same dataset type.
diff --git a/python/lsst/daf/butler/_dataset_ref.py b/python/lsst/daf/butler/_dataset_ref.py
index cb85cb13ef..428c91e035 100644
--- a/python/lsst/daf/butler/_dataset_ref.py
+++ b/python/lsst/daf/butler/_dataset_ref.py
@@ -464,17 +464,25 @@ def from_simple(
             Newly-constructed object.
         """
         cache = PersistenceContextVars.datasetRefs.get()
-        localName = sys.intern(
-            datasetType.name
-            if datasetType is not None
-            else (x.name if (x := simple.datasetType) is not None else "")
-        )
-        key = (simple.id.int, localName)
-        if cache is not None and (cachedRef := cache.get(key, None)) is not None:
-            return cachedRef
+        key = simple.id.int
+        if cache is not None and (ref := cache.get(key, None)) is not None:
+            if datasetType is not None:
+                if (component := datasetType.component()) is not None:
+                    ref = ref.makeComponentRef(component)
+                ref = ref.overrideStorageClass(datasetType.storageClass_name)
+                return ref
+            if simple.datasetType is not None:
+                _, component = DatasetType.splitDatasetTypeName(simple.datasetType.name)
+                if component is not None:
+                    ref = ref.makeComponentRef(component)
+                if simple.datasetType.storageClass is not None:
+                    ref = ref.overrideStorageClass(simple.datasetType.storageClass)
+                return ref
+            # If no dataset type is given at all, ignore the cache, because
+            # we can't reliably return the right storage class.
         # Minimalist component will just specify component and id and
         # require registry to reconstruct
-        if not (simple.datasetType is not None or simple.dataId is not None or simple.run is not None):
+        if simple.datasetType is None and simple.dataId is None and simple.run is None:
             if registry is None:
                 raise ValueError("Registry is required to construct component DatasetRef from integer id")
             if simple.id is None:
@@ -484,52 +492,42 @@
                 raise RuntimeError(f"No matching dataset found in registry for id {simple.id}")
             if simple.component:
                 ref = ref.makeComponentRef(simple.component)
-            if cache is not None:
-                cache[key] = ref
-            return ref
-
-        if universe is None and registry is None:
-            raise ValueError("One of universe or registry must be provided.")
-
-        if universe is None and registry is not None:
-            universe = registry.dimensions
-
-        if universe is None:
-            # this is for mypy
-            raise ValueError("Unable to determine a usable universe")
-
-        if simple.datasetType is None and datasetType is None:
-            # mypy
-            raise ValueError("The DatasetType must be specified to construct a DatasetRef")
-        if datasetType is None:
-            if simple.datasetType is None:
-                raise ValueError("Cannot determine Dataset type of this serialized class")
-            datasetType = DatasetType.from_simple(simple.datasetType, universe=universe, registry=registry)
-
-        if simple.dataId is None:
-            # mypy
-            raise ValueError("The DataId must be specified to construct a DatasetRef")
-        dataId = DataCoordinate.from_simple(simple.dataId, universe=universe)
-
-        # Check that simple ref is resolved.
-        if simple.run is None:
-            dstr = ""
-            if simple.datasetType is None:
-                dstr = f" (datasetType={datasetType.name!r})"
-            raise ValueError(
-                "Run collection name is missing from serialized representation. "
-                f"Encountered with {simple!r}{dstr}."
+        else:
+            if universe is None:
+                if registry is None:
+                    raise ValueError("One of universe or registry must be provided.")
+                universe = registry.dimensions
+            if datasetType is None:
+                if simple.datasetType is None:
+                    raise ValueError("Cannot determine Dataset type of this serialized class")
+                datasetType = DatasetType.from_simple(
+                    simple.datasetType, universe=universe, registry=registry
+                )
+            if simple.dataId is None:
+                # mypy
+                raise ValueError("The DataId must be specified to construct a DatasetRef")
+            dataId = DataCoordinate.from_simple(simple.dataId, universe=universe)
+            # Check that simple ref is resolved.
+            if simple.run is None:
+                dstr = ""
+                if simple.datasetType is None:
+                    dstr = f" (datasetType={datasetType.name!r})"
+                raise ValueError(
+                    "Run collection name is missing from serialized representation. "
+                    f"Encountered with {simple!r}{dstr}."
+                )
+            ref = cls(
+                datasetType,
+                dataId,
+                id=simple.id,
+                run=simple.run,
             )
-
-        newRef = cls(
-            datasetType,
-            dataId,
-            id=simple.id,
-            run=simple.run,
-        )
         if cache is not None:
-            cache[key] = newRef
-        return newRef
+            if ref.datasetType.component() is not None:
+                cache[key] = ref.makeCompositeRef()
+            else:
+                cache[key] = ref
+        return ref
 
     to_json = to_json_pydantic
     from_json: ClassVar = classmethod(from_json_pydantic)

diff --git a/python/lsst/daf/butler/persistence_context.py b/python/lsst/daf/butler/persistence_context.py
index d56a6df2d6..30e0cb80d9 100644
--- a/python/lsst/daf/butler/persistence_context.py
+++ b/python/lsst/daf/butler/persistence_context.py
@@ -117,10 +117,14 @@ class PersistenceContextVars:
     r"""A cache of `DataCoordinate`\ s.
""" - datasetRefs: ContextVar[dict[tuple[int, str], DatasetRef] | None] = ContextVar( + datasetRefs: ContextVar[dict[int, DatasetRef] | None] = ContextVar( "datasetRefs", default=None ) r"""A cache of `DatasetRef`\ s. + + Keys are UUID converted to int, but only refs of parent dataset types are + cached AND THE STORAGE CLASS IS UNSPECIFIED; consumers of this cache must + call overrideStorageClass on the result. """ dimensionRecords: ContextVar[dict[Hashable, DimensionRecord] | None] = ContextVar(