From c07fae2646f9ea06bcd652a21eb839ea23871f76 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Mon, 26 Aug 2024 17:56:07 -0600 Subject: [PATCH 1/8] replaces cftime_variables w/ decode_times --- docs/usage.md | 12 ++++++---- virtualizarr/tests/test_integration.py | 2 -- virtualizarr/tests/test_xarray.py | 28 ++++++++++++++++------ virtualizarr/xarray.py | 33 +++++--------------------- 4 files changed, 34 insertions(+), 41 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index b0935286..4ea2b18f 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -306,8 +306,8 @@ Dimensions: (time: 2920, lat: 25, lon: 53) Coordinates: lat (lat) float32 100B ManifestArray Date: Mon, 26 Aug 2024 17:57:34 -0600 Subject: [PATCH 2/8] adds note in test --- virtualizarr/tests/test_xarray.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virtualizarr/tests/test_xarray.py b/virtualizarr/tests/test_xarray.py index 0cf15fb4..fc55ea10 100644 --- a/virtualizarr/tests/test_xarray.py +++ b/virtualizarr/tests/test_xarray.py @@ -481,7 +481,7 @@ def test_mixture_of_manifestarrays_and_numpy_arrays(self, netcdf4_file): def test_cftime_index(tmpdir): """Ensure a virtual dataset contains the same indexes as an Xarray dataset""" - + # Note: Test was created to debug: https://github.com/zarr-developers/VirtualiZarr/issues/168 ds = xr.Dataset( data_vars={ "tasmax": (["time", "lat", "lon"], np.random.rand(2, 18, 36)), From 7fc7530c6621acec3e9c2b45c987be0adef5d475 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Mon, 26 Aug 2024 18:37:17 -0600 Subject: [PATCH 3/8] keeps cftime_variables arg, but raises a DepreciationWarning --- docs/usage.md | 2 +- virtualizarr/xarray.py | 13 ++++++++++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index 4ea2b18f..fd407c5f 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -325,7 +325,7 @@ Loading variables can be useful in a few scenarios: ### CF-encoded time variables -To correctly decode time variables according to the CF conventions, you need to pass `time` to `loadable_variables` and ensure the `decode_times` argument of `open_virtual_dataset` is set to True ( like the default behavior of `xr.open_dataset`, `decode_times` defaults to `True`). +To correctly decode time variables according to the CF conventions, you need to pass `time` to `loadable_variables` and ensure the `decode_times` argument of `open_virtual_dataset` is set to True (`decode_times` defaults to `True`). diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index 743eb0c6..8690567f 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -46,6 +46,7 @@ def open_virtual_dataset( drop_variables: Iterable[str] | None = None, loadable_variables: Iterable[str] | None = None, decode_times: bool = True, + cftime_variables: Iterable[str] | None = None, indexes: Mapping[str, Index] | None = None, virtual_array_class=ManifestArray, reader_options: Optional[dict] = None, @@ -72,6 +73,7 @@ def open_virtual_dataset( Default is to open all variables as virtual arrays (i.e. ManifestArray). decode_times: bool, default is True Bool that is passed into Xarray's open_dataset. Allows time to be decoded into a datetime object. + cftime_variables: Iterable[str] | None = None indexes : Mapping[str, Index], default is None Indexes to use on the returned xarray Dataset. Default is None, which will read any 1D coordinate data to create in-memory Pandas indexes. @@ -88,6 +90,15 @@ def open_virtual_dataset( vds An xarray Dataset containing instances of virtual_array_cls for each variable, or normal lazily indexed arrays for each variable in loadable_variables. """ + + if cftime_variables is not None: + # It seems like stacklevel=2 is req to surface this warning. + warnings.warn( + "cftime_variables is depreciated and will be ignored. Pass decode_times=True and loadable_variables=['time'] to decode time values to datetime objects.", + DeprecationWarning, + stacklevel=2, + ) + loadable_vars: dict[str, xr.Variable] virtual_vars: dict[str, xr.Variable] vars: dict[str, xr.Variable] @@ -168,7 +179,7 @@ def open_virtual_dataset( ds = xr.open_dataset( cast(XArrayOpenT, fpath), drop_variables=drop_variables, - decode_times=True, + decode_times=decode_times, ) if indexes is None: From 11a8fbed7372c9a9379f3e19840940a9d40a6777 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Mon, 26 Aug 2024 18:59:48 -0600 Subject: [PATCH 4/8] adds some testing for coords, dims and attrs between ds and vds --- virtualizarr/tests/test_xarray.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/virtualizarr/tests/test_xarray.py b/virtualizarr/tests/test_xarray.py index fc55ea10..62404098 100644 --- a/virtualizarr/tests/test_xarray.py +++ b/virtualizarr/tests/test_xarray.py @@ -491,6 +491,7 @@ def test_cftime_index(tmpdir): "lat": np.arange(-90, 90, 10), "lon": np.arange(-180, 180, 10), }, + attrs={"attr1_key": "attr1_val"}, ) ds.to_netcdf(f"{tmpdir}/tmp.nc") vds = open_virtual_dataset( @@ -498,3 +499,6 @@ def test_cftime_index(tmpdir): ) # TODO use xr.testing.assert_identical(vds.indexes, ds.indexes) instead once class supported by assertion comparison, see https://github.com/pydata/xarray/issues/5812 assert index_mappings_equal(vds.xindexes, ds.xindexes) + assert list(ds.coords) == list(vds.coords) + assert vds.dims == ds.dims + assert vds.attrs == ds.attrs From 279dddfb98a81efc1e673d0e6aa139cac6351cef Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Mon, 26 Aug 2024 19:06:12 -0600 Subject: [PATCH 5/8] updated decode_times default --- docs/usage.md | 2 +- virtualizarr/tests/test_xarray.py | 4 ++-- virtualizarr/xarray.py | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index fd407c5f..40071b8a 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -325,7 +325,7 @@ Loading variables can be useful in a few scenarios: ### CF-encoded time variables -To correctly decode time variables according to the CF conventions, you need to pass `time` to `loadable_variables` and ensure the `decode_times` argument of `open_virtual_dataset` is set to True (`decode_times` defaults to `True`). +To correctly decode time variables according to the CF conventions, you need to pass `time` to `loadable_variables` and ensure the `decode_times` argument of `open_virtual_dataset` is set to True (`decode_times` defaults to None). diff --git a/virtualizarr/tests/test_xarray.py b/virtualizarr/tests/test_xarray.py index 62404098..bae58c7d 100644 --- a/virtualizarr/tests/test_xarray.py +++ b/virtualizarr/tests/test_xarray.py @@ -253,7 +253,7 @@ def test_no_indexes(self, netcdf4_file): def test_create_default_indexes(self, netcdf4_file): with pytest.warns(UserWarning, match="will create in-memory pandas indexes"): vds = open_virtual_dataset(netcdf4_file, indexes=None) - ds = xr.open_dataset(netcdf4_file, decode_times=True) + ds = xr.open_dataset(netcdf4_file) # TODO use xr.testing.assert_identical(vds.indexes, ds.indexes) instead once class supported by assertion comparison, see https://github.com/pydata/xarray/issues/5812 assert index_mappings_equal(vds.xindexes, ds.xindexes) @@ -398,7 +398,7 @@ def test_loadable_variables(self, netcdf4_file): else: assert isinstance(vds[name].data, ManifestArray), name - full_ds = xr.open_dataset(netcdf4_file, decode_times=True) + full_ds = xr.open_dataset(netcdf4_file) for name in full_ds.variables: if name in vars_to_load: diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index 8690567f..b85536bf 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -45,7 +45,7 @@ def open_virtual_dataset( filetype: FileType | None = None, drop_variables: Iterable[str] | None = None, loadable_variables: Iterable[str] | None = None, - decode_times: bool = True, + decode_times: bool | None = None, cftime_variables: Iterable[str] | None = None, indexes: Mapping[str, Index] | None = None, virtual_array_class=ManifestArray, @@ -71,7 +71,7 @@ def open_virtual_dataset( loadable_variables: list[str], default is None Variables in the file to open as lazy numpy/dask arrays instead of instances of virtual_array_class. Default is to open all variables as virtual arrays (i.e. ManifestArray). - decode_times: bool, default is True + decode_times: bool | None, default is None Bool that is passed into Xarray's open_dataset. Allows time to be decoded into a datetime object. cftime_variables: Iterable[str] | None = None indexes : Mapping[str, Index], default is None From d0c1c13f3a6207107a20cc0f716fe022762cff89 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Tue, 27 Aug 2024 09:39:04 -0600 Subject: [PATCH 6/8] removed test_cftime_variables_must_be_in_loadable_variables test & updated xr.open_dataset decode behavior --- virtualizarr/tests/test_backend.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/virtualizarr/tests/test_backend.py b/virtualizarr/tests/test_backend.py index f922cf69..c3001587 100644 --- a/virtualizarr/tests/test_backend.py +++ b/virtualizarr/tests/test_backend.py @@ -82,7 +82,7 @@ def test_no_indexes(self, netcdf4_file): def test_create_default_indexes(self, netcdf4_file): with pytest.warns(UserWarning, match="will create in-memory pandas indexes"): vds = open_virtual_dataset(netcdf4_file, indexes=None) - ds = open_dataset(netcdf4_file, decode_times=False) + ds = open_dataset(netcdf4_file, decode_times=True) # TODO use xr.testing.assert_identical(vds.indexes, ds.indexes) instead once class supported by assertion comparison, see https://github.com/pydata/xarray/issues/5812 assert index_mappings_equal(vds.xindexes, ds.xindexes) @@ -236,7 +236,7 @@ def test_loadable_variables(self, netcdf4_file): else: assert isinstance(vds[name].data, ManifestArray), name - full_ds = xr.open_dataset(netcdf4_file, decode_times=False) + full_ds = xr.open_dataset(netcdf4_file, decode_times=True) for name in full_ds.variables: if name in vars_to_load: @@ -271,10 +271,3 @@ def test_open_dataset_with_scalar(self, hdf5_scalar, tmpdir): vds = open_virtual_dataset(hdf5_scalar) assert vds.scalar.dims == () assert vds.scalar.attrs == {"scalar": "true"} - - -def test_cftime_variables_must_be_in_loadable_variables(tmpdir): - ds = xr.Dataset(data_vars={"time": ["2024-06-21"]}) - ds.to_netcdf(f"{tmpdir}/scalar.nc") - with pytest.raises(ValueError, match="'time' not in"): - open_virtual_dataset(f"{tmpdir}/scalar.nc", cftime_variables=["time"]) From 35d2ef2f677506861e672ddd41ef80a414fb2b94 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Tue, 27 Aug 2024 10:21:45 -0600 Subject: [PATCH 7/8] adds decode_times to releases.rst --- docs/releases.rst | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/docs/releases.rst b/docs/releases.rst index 5ae3bff4..39d517be 100644 --- a/docs/releases.rst +++ b/docs/releases.rst @@ -9,6 +9,9 @@ v1.0.1 (unreleased) New Features ~~~~~~~~~~~~ +- Adds `decode_times` to open_virtual_dataset (:pull:`232`) + By `Raphael Hagen `_. + - Add parser for the OPeNDAP DMR++ XML format and integration with open_virtual_dataset (:pull:`113`) By `Ayush Nag `_. @@ -17,17 +20,18 @@ New Features Breaking changes ~~~~~~~~~~~~~~~~ - - Serialize valid ZarrV3 metadata and require full compressor numcodec config (for :pull:`193`) By `Gustavo Hidalgo `_. - VirtualiZarr's `ZArray`, `ChunkEntry`, and `Codec` no longer subclass `pydantic.BaseModel` (:pull:`210`) - `ZArray`'s `__init__` signature has changed to match `zarr.Array`'s (:pull:`xxx`) - Deprecations ~~~~~~~~~~~~ +- Depreciates cftime_variables in open_virtual_dataset in favor of decode_times. (:pull:`232`) + By `Raphael Hagen `_. + Bug fixes ~~~~~~~~~ From c9520aac20507a6551919764c461eb90c3952499 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Tue, 27 Aug 2024 10:34:59 -0600 Subject: [PATCH 8/8] Remove cftime_variables from docstring --- virtualizarr/backend.py | 1 - 1 file changed, 1 deletion(-) diff --git a/virtualizarr/backend.py b/virtualizarr/backend.py index ba5c0160..b155c2ce 100644 --- a/virtualizarr/backend.py +++ b/virtualizarr/backend.py @@ -81,7 +81,6 @@ def open_virtual_dataset( Default is to open all variables as virtual arrays (i.e. ManifestArray). decode_times: bool | None, default is None Bool that is passed into Xarray's open_dataset. Allows time to be decoded into a datetime object. - cftime_variables: Iterable[str] | None = None indexes : Mapping[str, Index], default is None Indexes to use on the returned xarray Dataset. Default is None, which will read any 1D coordinate data to create in-memory Pandas indexes.