Let Xarray handle decode_times #232

Merged: 9 commits, Aug 27, 2024
12 changes: 7 additions & 5 deletions docs/usage.md
@@ -306,8 +306,8 @@ Dimensions: (time: 2920, lat: 25, lon: 53)
Coordinates:
lat (lat) float32 100B ManifestArray<shape=(25,), dtype=float32, chu...
lon (lon) float32 212B ManifestArray<shape=(53,), dtype=float32, chu...
* time (time) float32 12kB 1.867e+06 1.867e+06 ... 1.885e+06 1.885e+06
Data variables:
* time (time) datetime64[ns] 23kB 2013-01-01 ... 2014-12-31T18:00:00
Data variables:
air (time, lat, lon) float64 31MB ...
Attributes:
Conventions: COARDS
@@ -325,13 +325,15 @@ Loading variables can be useful in a few scenarios:

### CF-encoded time variables

Notice that the `time` variable that was loaded above does not have the expected dtype. To correctly decode time variables according to the CF conventions (like `xr.open_dataset` does by default), you need to include them in an additional keyword argument `cftime_variables`:
To correctly decode time variables according to the CF conventions, pass `time` to `loadable_variables` and make sure the `decode_times` argument of `open_virtual_dataset` is set to `True` (it defaults to `True`, matching the default behavior of `xr.open_dataset`):



```python
vds = open_virtual_dataset(
'air.nc',
loadable_variables=['air', 'time'],
cftime_variables=['time'],
decode_times=True,
indexes={},
)
```
@@ -352,7 +354,7 @@ Attributes:
title: 4x daily NMC reanalysis (1948)
```

Now the loaded time variable has a `datetime64[ns]` dtype. Any variables listed as `cftime_variables` must also be listed as `loadable_variables`.
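As a quick check (an illustrative snippet, not part of this diff), you can confirm that the loaded `time` coordinate was decoded:

```python
import numpy as np

# Assuming `vds` was opened as in the example above, the loaded `time`
# coordinate should now carry a datetime64[ns] dtype after CF decoding.
assert np.issubdtype(vds["time"].dtype, np.datetime64)
print(vds["time"].values[:2])  # e.g. 2013-01-01T00:00:00 and 2013-01-01T06:00:00
```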


## Writing virtual stores to disk

2 changes: 0 additions & 2 deletions virtualizarr/tests/test_integration.py
@@ -92,13 +92,11 @@ def test_kerchunk_roundtrip_concat(self, tmpdir, format, decode_times, time_vars
f"{tmpdir}/air1.nc",
indexes={},
loadable_variables=time_vars,
cftime_variables=time_vars,
)
vds2 = open_virtual_dataset(
f"{tmpdir}/air2.nc",
indexes={},
loadable_variables=time_vars,
cftime_variables=time_vars,
)

if decode_times is False:
28 changes: 21 additions & 7 deletions virtualizarr/tests/test_xarray.py
@@ -253,7 +253,7 @@ def test_no_indexes(self, netcdf4_file):
def test_create_default_indexes(self, netcdf4_file):
with pytest.warns(UserWarning, match="will create in-memory pandas indexes"):
vds = open_virtual_dataset(netcdf4_file, indexes=None)
ds = xr.open_dataset(netcdf4_file, decode_times=False)
ds = xr.open_dataset(netcdf4_file, decode_times=True)

# TODO use xr.testing.assert_identical(vds.indexes, ds.indexes) instead once class supported by assertion comparison, see https://github.com/pydata/xarray/issues/5812
assert index_mappings_equal(vds.xindexes, ds.xindexes)
@@ -398,7 +398,7 @@ def test_loadable_variables(self, netcdf4_file):
else:
assert isinstance(vds[name].data, ManifestArray), name

full_ds = xr.open_dataset(netcdf4_file, decode_times=False)
full_ds = xr.open_dataset(netcdf4_file, decode_times=True)

for name in full_ds.variables:
if name in vars_to_load:
@@ -479,8 +479,22 @@ def test_mixture_of_manifestarrays_and_numpy_arrays(self, netcdf4_file):
assert isinstance(renamed_vds["lat"].data, np.ndarray)


def test_cftime_variables_must_be_in_loadable_variables(tmpdir):
ds = xr.Dataset(data_vars={"time": ["2024-06-21"]})
ds.to_netcdf(f"{tmpdir}/scalar.nc")
with pytest.raises(ValueError, match="'time' not in"):
open_virtual_dataset(f"{tmpdir}/scalar.nc", cftime_variables=["time"])
def test_cftime_index(tmpdir):
"""Ensure a virtual dataset contains the same indexes as an Xarray dataset"""
# Note: Test was created to debug: https://github.com/zarr-developers/VirtualiZarr/issues/168
ds = xr.Dataset(
data_vars={
"tasmax": (["time", "lat", "lon"], np.random.rand(2, 18, 36)),
},
coords={
"time": np.array(["2023-01-01", "2023-01-02"], dtype="datetime64[ns]"),
"lat": np.arange(-90, 90, 10),
"lon": np.arange(-180, 180, 10),
},
)
ds.to_netcdf(f"{tmpdir}/tmp.nc")
vds = open_virtual_dataset(
f"{tmpdir}/tmp.nc", loadable_variables=["time", "lat", "lon"], indexes={}
)
# TODO use xr.testing.assert_identical(vds.indexes, ds.indexes) instead once class supported by assertion comparison, see https://github.com/pydata/xarray/issues/5812
assert index_mappings_equal(vds.xindexes, ds.xindexes)
33 changes: 6 additions & 27 deletions virtualizarr/xarray.py
@@ -17,7 +17,6 @@
import xarray as xr
from xarray import register_dataset_accessor
from xarray.backends import AbstractDataStore, BackendArray
from xarray.coding.times import CFDatetimeCoder
from xarray.core.indexes import Index, PandasIndex
from xarray.core.variable import IndexVariable

@@ -46,7 +45,7 @@ def open_virtual_dataset(
filetype: FileType | None = None,
drop_variables: Iterable[str] | None = None,
loadable_variables: Iterable[str] | None = None,
cftime_variables: Iterable[str] | None = None,
decode_times: bool = True,
indexes: Mapping[str, Index] | None = None,
virtual_array_class=ManifestArray,
reader_options: Optional[dict] = None,
@@ -71,10 +70,8 @@ def open_virtual_dataset(
loadable_variables: list[str], default is None
Variables in the file to open as lazy numpy/dask arrays instead of instances of virtual_array_class.
Default is to open all variables as virtual arrays (i.e. ManifestArray).
cftime_variables : list[str], default is None
Interpret the value of specified vars using cftime, returning a datetime.
These will be automatically re-encoded with cftime. This list must be a subset
of ``loadable_variables``.
decode_times: bool, default is True
Passed through to Xarray's ``open_dataset``. If True, CF-encoded time variables are decoded into datetime objects.
indexes : Mapping[str, Index], default is None
Indexes to use on the returned xarray Dataset.
Default is None, which will read any 1D coordinate data to create in-memory Pandas indexes.
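For context, here is a minimal sketch (not part of this diff; the dataset and values are made up for illustration) of what xarray's CF time decoding does when `decode_times=True`:

```python
import numpy as np
import xarray as xr

# A CF-encoded time coordinate: raw numbers plus "units"/"calendar" attributes.
encoded = xr.Dataset(
    coords={
        "time": (
            "time",
            [0, 6, 12],
            {"units": "hours since 2013-01-01", "calendar": "standard"},
        )
    }
)

# decode_cf performs CF decoding, including the time decoding that
# decode_times=True enables inside open_dataset.
decoded = xr.decode_cf(encoded)
assert np.issubdtype(decoded["time"].dtype, np.datetime64)
print(decoded["time"].values)  # three datetime64[ns] timestamps starting at 2013-01-01
```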
@@ -111,20 +108,6 @@
if common:
raise ValueError(f"Cannot both load and drop variables {common}")

if cftime_variables is None:
cftime_variables = []
elif isinstance(cftime_variables, str):
cftime_variables = [cftime_variables]
else:
cftime_variables = list(cftime_variables)

if diff := (set(cftime_variables) - set(loadable_variables)):
missing_str = ", ".join([f"'{v}'" for v in diff])
raise ValueError(
"All ``cftime_variables`` must be included in ``loadable_variables`` "
f"({missing_str} not in ``loadable_variables``)"
)

if virtual_array_class is not ManifestArray:
raise NotImplementedError()

@@ -140,9 +123,9 @@
elif filetype == FileType.dmrpp:
from virtualizarr.readers.dmrpp import DMRParser

if loadable_variables != [] or cftime_variables != [] or indexes is None:
if loadable_variables != [] or indexes is None:
raise NotImplementedError(
"Specifying `loadable_variables`, `cftime_variables` or auto-creating indexes with `indexes=None` is not supported for dmrpp files."
"Specifying `loadable_variables` or auto-creating indexes with `indexes=None` is not supported for dmrpp files."
)

fpath = _fsspec_openfile_from_filepath(
@@ -185,7 +168,7 @@ open_virtual_dataset(
ds = xr.open_dataset(
cast(XArrayOpenT, fpath),
drop_variables=drop_variables,
decode_times=False,
decode_times=True,
)

if indexes is None:
@@ -208,10 +191,6 @@
if name in loadable_variables
}

for name in cftime_variables:
var = loadable_vars[name]
loadable_vars[name] = CFDatetimeCoder().decode(var, name=name)

# if we only read the indexes we can just close the file right away as nothing is lazy
if loadable_vars == {}:
ds.close()
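For reference, a small sketch (not part of this diff, using a made-up variable) of the manual decoding step the removed lines performed, which passing `decode_times=True` to `xr.open_dataset` now makes unnecessary:

```python
import xarray as xr
from xarray.coding.times import CFDatetimeCoder

# A loaded but still CF-encoded time variable, as the old code path saw it.
var = xr.Variable(
    dims=("time",),
    data=[0, 6, 12],
    attrs={"units": "hours since 2013-01-01", "calendar": "standard"},
)

# The removed loop decoded each listed cftime variable explicitly like this.
decoded = CFDatetimeCoder().decode(var, name="time")
print(decoded.values)  # datetime64[ns] timestamps starting at 2013-01-01
```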