Let Xarray handle decode_times #232

Merged: 9 commits, Aug 27, 2024
12 changes: 7 additions & 5 deletions docs/usage.md
@@ -306,8 +306,8 @@ Dimensions: (time: 2920, lat: 25, lon: 53)
Coordinates:
lat (lat) float32 100B ManifestArray<shape=(25,), dtype=float32, chu...
lon (lon) float32 212B ManifestArray<shape=(53,), dtype=float32, chu...
* time (time) float32 12kB 1.867e+06 1.867e+06 ... 1.885e+06 1.885e+06
Data variables:
* time (time) datetime64[ns] 23kB 2013-01-01 ... 2014-12-31T18:00:00
Data variables:
air (time, lat, lon) float64 31MB ...
Attributes:
Conventions: COARDS
@@ -325,13 +325,15 @@ Loading variables can be useful in a few scenarios:

### CF-encoded time variables

Notice that the `time` variable that was loaded above does not have the expected dtype. To correctly decode time variables according to the CF conventions (like `xr.open_dataset` does by default), you need to include them in an additional keyword argument `cftime_variables`:
To correctly decode time variables according to the CF conventions, pass `time` to `loadable_variables` and make sure the `decode_times` argument of `open_virtual_dataset` is set to `True` (it defaults to `True`, matching the default behavior of `xr.open_dataset`):



```python
vds = open_virtual_dataset(
'air.nc',
loadable_variables=['air', 'time'],
cftime_variables=['time'],
decode_times=True,
indexes={},
)
```
@@ -352,7 +354,7 @@ Attributes:
title: 4x daily NMC reanalysis (1948)
```

Now the loaded time variable has a `datetime64[ns]` dtype. Any variables listed as `cftime_variables` must also be listed as `loadable_variables`.
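As a quick check (an illustrative snippet, not part of this diff), you can confirm that the loaded `time` coordinate was decoded:

```python
import numpy as np

# Assuming `vds` was opened as in the example above, the loaded `time`
# coordinate should now carry a datetime64[ns] dtype after CF decoding.
assert np.issubdtype(vds["time"].dtype, np.datetime64)
print(vds["time"].values[:2])  # e.g. 2013-01-01T00:00:00 and 2013-01-01T06:00:00
```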


## Writing virtual stores to disk

2 changes: 0 additions & 2 deletions virtualizarr/tests/test_integration.py
@@ -92,13 +92,11 @@ def test_kerchunk_roundtrip_concat(self, tmpdir, format, decode_times, time_vars
f"{tmpdir}/air1.nc",
indexes={},
loadable_variables=time_vars,
cftime_variables=time_vars,
)
vds2 = open_virtual_dataset(
f"{tmpdir}/air2.nc",
indexes={},
loadable_variables=time_vars,
cftime_variables=time_vars,
)

if decode_times is False:
28 changes: 21 additions & 7 deletions virtualizarr/tests/test_xarray.py
@@ -253,7 +253,7 @@ def test_no_indexes(self, netcdf4_file):
def test_create_default_indexes(self, netcdf4_file):
with pytest.warns(UserWarning, match="will create in-memory pandas indexes"):
vds = open_virtual_dataset(netcdf4_file, indexes=None)
ds = xr.open_dataset(netcdf4_file, decode_times=False)
ds = xr.open_dataset(netcdf4_file, decode_times=True)

# TODO use xr.testing.assert_identical(vds.indexes, ds.indexes) instead once class supported by assertion comparison, see https://github.com/pydata/xarray/issues/5812
assert index_mappings_equal(vds.xindexes, ds.xindexes)
@@ -398,7 +398,7 @@ def test_loadable_variables(self, netcdf4_file):
else:
assert isinstance(vds[name].data, ManifestArray), name

full_ds = xr.open_dataset(netcdf4_file, decode_times=False)
full_ds = xr.open_dataset(netcdf4_file, decode_times=True)

for name in full_ds.variables:
if name in vars_to_load:
@@ -479,8 +479,22 @@ def test_mixture_of_manifestarrays_and_numpy_arrays(self, netcdf4_file):
assert isinstance(renamed_vds["lat"].data, np.ndarray)


def test_cftime_variables_must_be_in_loadable_variables(tmpdir):
ds = xr.Dataset(data_vars={"time": ["2024-06-21"]})
ds.to_netcdf(f"{tmpdir}/scalar.nc")
with pytest.raises(ValueError, match="'time' not in"):
open_virtual_dataset(f"{tmpdir}/scalar.nc", cftime_variables=["time"])
def test_cftime_index(tmpdir):
"""Ensure a virtual dataset contains the same indexes as an Xarray dataset"""
# Note: Test was created to debug: https://github.com/zarr-developers/VirtualiZarr/issues/168
ds = xr.Dataset(
data_vars={
"tasmax": (["time", "lat", "lon"], np.random.rand(2, 18, 36)),
},
coords={
"time": np.array(["2023-01-01", "2023-01-02"], dtype="datetime64[ns]"),
"lat": np.arange(-90, 90, 10),
"lon": np.arange(-180, 180, 10),
},
)
ds.to_netcdf(f"{tmpdir}/tmp.nc")
vds = open_virtual_dataset(
f"{tmpdir}/tmp.nc", loadable_variables=["time", "lat", "lon"], indexes={}
)
# TODO use xr.testing.assert_identical(vds.indexes, ds.indexes) instead once class supported by assertion comparison, see https://github.com/pydata/xarray/issues/5812
assert index_mappings_equal(vds.xindexes, ds.xindexes)
33 changes: 6 additions & 27 deletions virtualizarr/xarray.py
@@ -17,7 +17,6 @@
import xarray as xr
from xarray import register_dataset_accessor
from xarray.backends import AbstractDataStore, BackendArray
from xarray.coding.times import CFDatetimeCoder
from xarray.core.indexes import Index, PandasIndex
from xarray.core.variable import IndexVariable

@@ -46,7 +45,7 @@ def open_virtual_dataset(
filetype: FileType | None = None,
drop_variables: Iterable[str] | None = None,
loadable_variables: Iterable[str] | None = None,
cftime_variables: Iterable[str] | None = None,
decode_times: bool = True,
indexes: Mapping[str, Index] | None = None,
virtual_array_class=ManifestArray,
reader_options: Optional[dict] = None,
@@ -71,10 +70,8 @@ def open_virtual_dataset(
loadable_variables: list[str], default is None
Variables in the file to open as lazy numpy/dask arrays instead of instances of virtual_array_class.
Default is to open all variables as virtual arrays (i.e. ManifestArray).
cftime_variables : list[str], default is None
Interpret the value of specified vars using cftime, returning a datetime.
These will be automatically re-encoded with cftime. This list must be a subset
of ``loadable_variables``.
decode_times: bool, default is True
Passed through to Xarray's ``open_dataset``. If True, CF-encoded time variables are decoded into datetime objects.
indexes : Mapping[str, Index], default is None
Indexes to use on the returned xarray Dataset.
Default is None, which will read any 1D coordinate data to create in-memory Pandas indexes.
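For context, here is a minimal sketch (not part of this diff; the dataset and values are made up for illustration) of what xarray's CF time decoding does when `decode_times=True`:

```python
import numpy as np
import xarray as xr

# A CF-encoded time coordinate: raw numbers plus "units"/"calendar" attributes.
encoded = xr.Dataset(
    coords={
        "time": (
            "time",
            [0, 6, 12],
            {"units": "hours since 2013-01-01", "calendar": "standard"},
        )
    }
)

# decode_cf performs CF decoding, including the time decoding that
# decode_times=True enables inside open_dataset.
decoded = xr.decode_cf(encoded)
assert np.issubdtype(decoded["time"].dtype, np.datetime64)
print(decoded["time"].values)  # three datetime64[ns] timestamps starting at 2013-01-01
```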
@@ -111,20 +108,6 @@
if common:
raise ValueError(f"Cannot both load and drop variables {common}")

if cftime_variables is None:
cftime_variables = []
elif isinstance(cftime_variables, str):
cftime_variables = [cftime_variables]
else:
cftime_variables = list(cftime_variables)

if diff := (set(cftime_variables) - set(loadable_variables)):
missing_str = ", ".join([f"'{v}'" for v in diff])
raise ValueError(
"All ``cftime_variables`` must be included in ``loadable_variables`` "
f"({missing_str} not in ``loadable_variables``)"
)

if virtual_array_class is not ManifestArray:
raise NotImplementedError()

@@ -140,9 +123,9 @@
elif filetype == FileType.dmrpp:
from virtualizarr.readers.dmrpp import DMRParser

if loadable_variables != [] or cftime_variables != [] or indexes is None:
if loadable_variables != [] or indexes is None:
raise NotImplementedError(
"Specifying `loadable_variables`, `cftime_variables` or auto-creating indexes with `indexes=None` is not supported for dmrpp files."
"Specifying `loadable_variables` or auto-creating indexes with `indexes=None` is not supported for dmrpp files."
)

fpath = _fsspec_openfile_from_filepath(
@@ -185,7 +168,7 @@ open_virtual_dataset(
ds = xr.open_dataset(
cast(XArrayOpenT, fpath),
drop_variables=drop_variables,
decode_times=False,
decode_times=True,
)

if indexes is None:
@@ -208,10 +191,6 @@
if name in loadable_variables
}

for name in cftime_variables:
var = loadable_vars[name]
loadable_vars[name] = CFDatetimeCoder().decode(var, name=name)

# if we only read the indexes we can just close the file right away as nothing is lazy
if loadable_vars == {}:
ds.close()
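For reference, a small sketch (not part of this diff, using a made-up variable) of the manual decoding step the removed lines performed, which passing `decode_times=True` to `xr.open_dataset` now makes unnecessary:

```python
import xarray as xr
from xarray.coding.times import CFDatetimeCoder

# A loaded but still CF-encoded time variable, as the old code path saw it.
var = xr.Variable(
    dims=("time",),
    data=[0, 6, 12],
    attrs={"units": "hours since 2013-01-01", "calendar": "standard"},
)

# The removed loop decoded each listed cftime variable explicitly like this.
decoded = CFDatetimeCoder().decode(var, name="time")
print(decoded.values)  # datetime64[ns] timestamps starting at 2013-01-01
```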