From 14d73ee973c3206b2f6061cc7327d7ab70cd350e Mon Sep 17 00:00:00 2001 From: Joseph Hamman Date: Mon, 27 Mar 2023 15:33:38 -0700 Subject: [PATCH 1/3] add reset_encoding to dataset/dataarray/variable --- doc/api-hidden.rst | 1 + doc/api.rst | 2 ++ doc/user-guide/io.rst | 10 ++++++++++ doc/whats-new.rst | 3 +++ xarray/core/dataarray.py | 6 ++++++ xarray/core/dataset.py | 6 ++++++ xarray/core/variable.py | 4 ++++ xarray/tests/test_dataarray.py | 19 +++++++++++++++++++ xarray/tests/test_dataset.py | 15 +++++++++++++++ xarray/tests/test_variable.py | 23 ++++++++++++++++++++++- 10 files changed, 88 insertions(+), 1 deletion(-) diff --git a/doc/api-hidden.rst b/doc/api-hidden.rst index 04013d545c3..73e1e54f9bc 100644 --- a/doc/api-hidden.rst +++ b/doc/api-hidden.rst @@ -234,6 +234,7 @@ Variable.dims Variable.dtype Variable.encoding + Variable.reset_encoding Variable.imag Variable.nbytes Variable.ndim diff --git a/doc/api.rst b/doc/api.rst index 0d56fc73997..6b2f85db81f 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -112,6 +112,7 @@ Dataset contents Dataset.drop_dims Dataset.set_coords Dataset.reset_coords + Dataset.reset_encoding Dataset.convert_calendar Dataset.interp_calendar Dataset.get_index @@ -303,6 +304,7 @@ DataArray contents DataArray.drop_indexes DataArray.drop_duplicates DataArray.reset_coords + DataArray.reset_encoding DataArray.copy DataArray.convert_calendar DataArray.interp_calendar diff --git a/doc/user-guide/io.rst b/doc/user-guide/io.rst index 5610e7829f2..cb0900ac6b0 100644 --- a/doc/user-guide/io.rst +++ b/doc/user-guide/io.rst @@ -279,6 +279,16 @@ You can view this encoding information (among others) in the Note that all operations that manipulate variables other than indexing will remove encoding information. +In some cases it is useful to intentionally reset a dataset's original encoding values. +This can be done with either the :py:meth:`Dataset.reset_encoding` or +:py:meth:`DataArray.reset_encoding` methods. + +.. 
ipython:: + :verbatim: + + In [1]: ds_no_encoding = ds_disk.reset_encoding() + In [2]: ds_no_encoding["y"] + Out[2]: {} .. _combining multiple files: diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 6580695adaf..5d71da77455 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -22,6 +22,9 @@ v2023.04.0 (unreleased) New Features ~~~~~~~~~~~~ +- New methods to reset an object's encoding (:py:meth:`Dataset.reset_encoding`, :py:meth:`DataArray.reset_encoding`). + (:issue:`7686`, :pull:`7689`). + By `Joe Hamman <https://github.com/jhamman>`_. Breaking changes diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 1f04f506397..ba55837ef7e 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -877,6 +877,12 @@ def encoding(self) -> dict[Any, Any]: def encoding(self, value: Mapping[Any, Any]) -> None: self.variable.encoding = dict(value) + def reset_encoding(self: T_DataArray) -> T_DataArray: + """Return a new DataArray without encoding on the array or any attached + coords.""" + ds = self._to_temp_dataset().reset_encoding() + return self._from_temp_dataset(ds) + @property def indexes(self) -> Indexes: """Mapping of pandas.Index objects used for label based indexing. diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 0bd335f3f0a..ac021e8e54f 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -666,6 +666,12 @@ def encoding(self) -> dict[Any, Any]: def encoding(self, value: Mapping[Any, Any]) -> None: self._encoding = dict(value) + def reset_encoding(self: T_Dataset) -> None: + """Return a new Dataset without encoding on the dataset or any of its + variables/coords.""" + variables = {k: v.reset_encoding() for k, v in self.variables.items()} + return self._replace(variables=variables, encoding={}) + @property def dims(self) -> Frozen[Hashable, int]: """Mapping from dimension names to lengths. 
diff --git a/xarray/core/variable.py b/xarray/core/variable.py index bddeb85f5e9..5d1ed4baea6 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -977,6 +977,10 @@ def encoding(self, value): except ValueError: raise ValueError("encoding must be castable to a dictionary") + def reset_encoding(self: T_Variable) -> T_Variable: + """Return a new Variable without encoding.""" + return self._replace(encoding={}) + def copy( self: T_Variable, deep: bool = True, data: ArrayLike | None = None ) -> T_Variable: diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 378d471ba6b..4ffa95e16e6 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -278,6 +278,25 @@ def test_encoding(self) -> None: self.dv.encoding = expected2 assert expected2 is not self.dv.encoding + def test_reset_encoding(self) -> None: + array = self.mda + encoding = {"scale_factor": 10} + array.encoding = encoding + array["x"].encoding = encoding + + assert array.encoding == encoding + assert array["x"].encoding == encoding + + actual = array.reset_encoding() + + # did not modify in place + assert array.encoding == encoding + assert array["x"].encoding == encoding + + # variable and coord encoding is empty + assert actual.encoding == {} + assert actual["x"].encoding == {} + def test_constructor(self) -> None: data = np.random.random((2, 3)) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 2e23d02a261..08e922d99ec 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -2827,6 +2827,21 @@ def test_copy_with_data_errors(self) -> None: with pytest.raises(ValueError, match=r"contain all variables in original"): orig.copy(data={"var1": new_var1}) + def test_reset_encoding(self) -> None: + orig = create_test_data() + vencoding = {"scale_factor": 10} + orig.encoding = {"foo": "bar"} + + for k, v in orig.variables.items(): + orig[k].encoding = vencoding + + actual = orig.reset_encoding() + 
assert actual.encoding == {} + for k, v in actual.variables.items(): + assert v.encoding == {} + + assert_equal(actual, orig) + def test_rename(self) -> None: data = create_test_data() newnames = { diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index f656818c71f..2f571eb9a83 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -455,6 +455,23 @@ def test_encoding_preserved(self): assert_identical(expected.to_base_variable(), actual.to_base_variable()) assert expected.encoding == actual.encoding + def test_reset_encoding(self) -> None: + encoding1 = {"scale_factor": 1} + # encoding set via cls constructor + v1 = self.cls(["a"], [0, 1, 2], encoding=encoding1) + assert v1.encoding == encoding1 + v2 = v1.reset_encoding() + assert v1.encoding == encoding1 + assert v2.encoding == {} + + # second case: a different encoding value, also set via cls constructor + encoding3 = {"scale_factor": 10} + v3 = self.cls(["a"], [0, 1, 2], encoding=encoding3) + assert v3.encoding == encoding3 + v4 = v3.reset_encoding() + assert v3.encoding == encoding3 + assert v4.encoding == {} + def test_concat(self): x = np.arange(5) y = np.arange(5, 10) @@ -2201,9 +2218,13 @@ def test_coarsen_keep_attrs(self, operation="mean"): assert new.attrs == _attrs +def _init_dask_variable(*args, **kwargs): + return Variable(*args, **kwargs).chunk() + + @requires_dask class TestVariableWithDask(VariableSubclassobjects): - cls = staticmethod(_init_dask_variable) def test_chunk(self): unblocked = Variable(["dim_0", "dim_1"], np.ones((3, 4))) From f72907938b158d5cbf0ffd575851fded16c6779b Mon Sep 17 00:00:00 2001 From: Joseph Hamman Date: Mon, 27 Mar 2023 15:41:23 -0700 Subject: [PATCH 2/3] fix bad return type --- xarray/core/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index ac021e8e54f..d3a5ddd470e 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py 
@@ -666,7 +666,7 @@ def encoding(self) -> dict[Any, Any]: def encoding(self, value: Mapping[Any, Any]) -> None: self._encoding = dict(value) - def reset_encoding(self: T_Dataset) -> None: + def reset_encoding(self: T_Dataset) -> T_Dataset: """Return a new Dataset without encoding on the dataset or any of its variables/coords.""" variables = {k: v.reset_encoding() for k, v in self.variables.items()} From 286ecf81ea81a644700de3e9c78172d0c2bdb6db Mon Sep 17 00:00:00 2001 From: Joseph Hamman Date: Tue, 28 Mar 2023 08:24:38 -0700 Subject: [PATCH 3/3] update io docs --- doc/user-guide/io.rst | 31 ++++++------------------------- 1 file changed, 6 insertions(+), 25 deletions(-) diff --git a/doc/user-guide/io.rst b/doc/user-guide/io.rst index cb0900ac6b0..eca229ef8a7 100644 --- a/doc/user-guide/io.rst +++ b/doc/user-guide/io.rst @@ -254,27 +254,10 @@ You can view this encoding information (among others) in the :py:attr:`DataArray.encoding` and :py:attr:`DataArray.encoding` attributes: -.. ipython:: - :verbatim: +.. ipython:: python - In [1]: ds_disk["y"].encoding - Out[1]: - {'zlib': False, - 'shuffle': False, - 'complevel': 0, - 'fletcher32': False, - 'contiguous': True, - 'chunksizes': None, - 'source': 'saved_on_disk.nc', - 'original_shape': (5,), - 'dtype': dtype('int64'), - 'units': 'days since 2000-01-01 00:00:00', - 'calendar': 'proleptic_gregorian'} - - In [9]: ds_disk.encoding - Out[9]: - {'unlimited_dims': set(), - 'source': 'saved_on_disk.nc'} + ds_disk["y"].encoding + ds_disk.encoding Note that all operations that manipulate variables other than indexing will remove encoding information. @@ -283,12 +266,10 @@ In some cases it is useful to intentionally reset a dataset's original encoding This can be done with either the :py:meth:`Dataset.reset_encoding` or :py:meth:`DataArray.reset_encoding` methods. -.. ipython:: - :verbatim: +.. 
ipython:: python - In [1]: ds_no_encoding = ds_disk.reset_encoding() - In [2]: ds_no_encoding["y"] - Out[2]: {} + ds_no_encoding = ds_disk.reset_encoding() + ds_no_encoding.encoding .. _combining multiple files: