From 7afb8c7aa4f86bd22f67006755f4ff4c93c7e086 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Sat, 1 Apr 2023 10:18:08 +0200 Subject: [PATCH 1/8] implement coders, adapt tests --- xarray/coding/variables.py | 207 +++++++++++++++++++++++++++---- xarray/conventions.py | 137 ++------------------ xarray/tests/test_conventions.py | 6 +- 3 files changed, 199 insertions(+), 151 deletions(-) diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index 4107b3aa883..a94302a9cfd 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -78,6 +78,71 @@ def __repr__(self) -> str: ) +class NativeEndiannessArray(indexing.ExplicitlyIndexedNDArrayMixin): + """Decode arrays on the fly from non-native to native endianness + + This is useful for decoding arrays from netCDF3 files (which are all + big endian) into native endianness, so they can be used with Cython + functions, such as those found in bottleneck and pandas. + + >>> x = np.arange(5, dtype=">i2") + + >>> x.dtype + dtype('>i2') + + >>> NativeEndiannessArray(x).dtype + dtype('int16') + + >>> indexer = indexing.BasicIndexer((slice(None),)) + >>> NativeEndiannessArray(x)[indexer].dtype + dtype('int16') + """ + + __slots__ = ("array",) + + def __init__(self, array): + self.array = indexing.as_indexable(array) + + @property + def dtype(self): + return np.dtype(self.array.dtype.kind + str(self.array.dtype.itemsize)) + + def __getitem__(self, key): + return np.asarray(self.array[key], dtype=self.dtype) + + +class BoolTypeArray(indexing.ExplicitlyIndexedNDArrayMixin): + """Decode arrays on the fly from integer to boolean datatype + + This is useful for decoding boolean arrays from integer typed netCDF + variables. + + >>> x = np.array([1, 0, 1, 1, 0], dtype="i1") + + >>> x.dtype + dtype('int8') + + >>> BoolTypeArray(x).dtype + dtype('bool') + + >>> indexer = indexing.BasicIndexer((slice(None),)) + >>> BoolTypeArray(x)[indexer].dtype + dtype('bool') + """ + + __slots__ = ("array",) + + def __init__(self, array): + self.array = indexing.as_indexable(array) + + @property + def dtype(self): + return np.dtype("bool") + + def __getitem__(self, key): + return np.asarray(self.array[key], dtype=self.dtype) + + def lazy_elemwise_func(array, func: Callable, dtype: np.typing.DTypeLike): """Lazily apply an element-wise function to an array. Parameters @@ -159,30 +224,34 @@ def encode(self, variable: Variable, name: T_Name = None): fv = encoding.get("_FillValue") mv = encoding.get("missing_value") - if ( - fv is not None - and mv is not None - and not duck_array_ops.allclose_or_equiv(fv, mv) - ): - raise ValueError( - f"Variable {name!r} has conflicting _FillValue ({fv}) and missing_value ({mv}). Cannot encode data." - ) + if fv is not None or mv is not None: + if ( + fv is not None + and mv is not None + and not duck_array_ops.allclose_or_equiv(fv, mv) + ): + raise ValueError( + f"Variable {name!r} has conflicting _FillValue ({fv}) and missing_value ({mv}). Cannot encode data." + ) - if fv is not None: - # Ensure _FillValue is cast to same dtype as data's - encoding["_FillValue"] = dtype.type(fv) - fill_value = pop_to(encoding, attrs, "_FillValue", name=name) - if not pd.isnull(fill_value): - data = duck_array_ops.fillna(data, fill_value) + if fv is not None: + # Ensure _FillValue is cast to same dtype as data's + encoding["_FillValue"] = dtype.type(fv) + fill_value = pop_to(encoding, attrs, "_FillValue", name=name) + if not pd.isnull(fill_value): + data = duck_array_ops.fillna(data, fill_value) - if mv is not None: - # Ensure missing_value is cast to same dtype as data's - encoding["missing_value"] = dtype.type(mv) - fill_value = pop_to(encoding, attrs, "missing_value", name=name) - if not pd.isnull(fill_value) and fv is None: - data = duck_array_ops.fillna(data, fill_value) + if mv is not None: + # Ensure missing_value is cast to same dtype as data's + encoding["missing_value"] = dtype.type(mv) + fill_value = pop_to(encoding, attrs, "missing_value", name=name) + if not pd.isnull(fill_value) and fv is None: + data = duck_array_ops.fillna(data, fill_value) - return Variable(dims, data, attrs, encoding, fastpath=True) + return Variable(dims, data, attrs, encoding, fastpath=True) + + else: + return variable def decode(self, variable: Variable, name: T_Name = None): dims, data, attrs, encoding = unpack_for_decoding(variable) @@ -349,3 +418,99 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable: return Variable(dims, data, attrs, encoding, fastpath=True) else: return variable + + +class DefaultFillvalueCoder(VariableCoder): + """Encode default _FillValue if needed.""" + + def encode(self, variable: Variable, name: T_Name = None) -> Variable: + dims, data, attrs, encoding = unpack_for_encoding(variable) + # make NaN the fill value for float types + if ( + "_FillValue" not in attrs + and "_FillValue" not in encoding + and np.issubdtype(variable.dtype, np.floating) + ): + attrs["_FillValue"] = variable.dtype.type(np.nan) + return Variable(dims, data, attrs, encoding, fastpath=True) + else: + return variable + + def decode(self, variable: Variable, name: T_Name = None) -> Variable: + raise NotImplementedError() + + +class BooleanCoder(VariableCoder): + """Code boolean values.""" + + def encode(self, variable: Variable, name: T_Name = None) -> Variable: + if ( + (variable.dtype == bool) + and ("dtype" not in variable.encoding) + and ("dtype" not in variable.attrs) + ): + dims, data, attrs, encoding = unpack_for_encoding(variable) + attrs["dtype"] = "bool" + data = duck_array_ops.astype(data, dtype="i1", copy=True) + + return Variable(dims, data, attrs, encoding, fastpath=True) + else: + return variable + + def decode(self, variable: Variable, name: T_Name = None) -> Variable: + if variable.attrs.get("dtype", False) == "bool": + dims, data, attrs, encoding = unpack_for_decoding(variable) + del attrs["dtype"] + data = BoolTypeArray(data) + return Variable(dims, data, attrs, encoding, fastpath=True) + else: + return variable + + +class EndianCoder(VariableCoder): + """Decode Endianness to native.""" + + def encode(self): + raise NotImplementedError() + + def decode(self, variable: Variable, name: T_Name = None) -> Variable: + dims, data, attrs, encoding = unpack_for_decoding(variable) + if not data.dtype.isnative: + data = NativeEndiannessArray(data) + return Variable(dims, data, attrs, encoding, fastpath=True) + else: + return variable + + +class NonStringCoder(VariableCoder): + """Encode NonString variables if dtypes differ.""" + + def encode(self, variable: Variable, name: T_Name = None) -> Variable: + if "dtype" in variable.encoding and variable.encoding["dtype"] not in ( + "S1", + str, + ): + dims, data, attrs, encoding = unpack_for_encoding(variable) + dtype = np.dtype(encoding.pop("dtype")) + if dtype != variable.dtype: + if np.issubdtype(dtype, np.integer): + if ( + np.issubdtype(variable.dtype, np.floating) + and "_FillValue" not in variable.attrs + and "missing_value" not in variable.attrs + ): + warnings.warn( + f"saving variable {name} with floating " + "point data as an integer dtype without " + "any _FillValue to use for NaNs", + SerializationWarning, + stacklevel=10, + ) + data = np.around(data) + data = data.astype(dtype=dtype) + return Variable(dims, data, attrs, encoding, fastpath=True) + else: + return variable + + def decode(self): + raise NotImplementedError() diff --git a/xarray/conventions.py b/xarray/conventions.py index 780172879c6..f7cdc0bc7d3 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -48,123 +48,10 @@ T_DatasetOrAbstractstore = Union[Dataset, AbstractDataStore] -class NativeEndiannessArray(indexing.ExplicitlyIndexedNDArrayMixin): - """Decode arrays on the fly from non-native to native endianness - - This is useful for decoding arrays from netCDF3 files (which are all - big endian) into native endianness, so they can be used with Cython - functions, such as those found in bottleneck and pandas. - - >>> x = np.arange(5, dtype=">i2") - - >>> x.dtype - dtype('>i2') - - >>> NativeEndiannessArray(x).dtype - dtype('int16') - - >>> indexer = indexing.BasicIndexer((slice(None),)) - >>> NativeEndiannessArray(x)[indexer].dtype - dtype('int16') - """ - - __slots__ = ("array",) - - def __init__(self, array): - self.array = indexing.as_indexable(array) - - @property - def dtype(self): - return np.dtype(self.array.dtype.kind + str(self.array.dtype.itemsize)) - - def __getitem__(self, key): - return np.asarray(self.array[key], dtype=self.dtype) - - -class BoolTypeArray(indexing.ExplicitlyIndexedNDArrayMixin): - """Decode arrays on the fly from integer to boolean datatype - - This is useful for decoding boolean arrays from integer typed netCDF - variables. - - >>> x = np.array([1, 0, 1, 1, 0], dtype="i1") - - >>> x.dtype - dtype('int8') - - >>> BoolTypeArray(x).dtype - dtype('bool') - - >>> indexer = indexing.BasicIndexer((slice(None),)) - >>> BoolTypeArray(x)[indexer].dtype - dtype('bool') - """ - - __slots__ = ("array",) - - def __init__(self, array): - self.array = indexing.as_indexable(array) - - @property - def dtype(self): - return np.dtype("bool") - - def __getitem__(self, key): - return np.asarray(self.array[key], dtype=self.dtype) - - def _var_as_tuple(var: Variable) -> T_VarTuple: return var.dims, var.data, var.attrs.copy(), var.encoding.copy() -def maybe_encode_nonstring_dtype(var: Variable, name: T_Name = None) -> Variable: - if "dtype" in var.encoding and var.encoding["dtype"] not in ("S1", str): - dims, data, attrs, encoding = _var_as_tuple(var) - dtype = np.dtype(encoding.pop("dtype")) - if dtype != var.dtype: - if np.issubdtype(dtype, np.integer): - if ( - np.issubdtype(var.dtype, np.floating) - and "_FillValue" not in var.attrs - and "missing_value" not in var.attrs - ): - warnings.warn( - f"saving variable {name} with floating " - "point data as an integer dtype without " - "any _FillValue to use for NaNs", - SerializationWarning, - stacklevel=10, - ) - data = np.around(data) - data = data.astype(dtype=dtype) - var = Variable(dims, data, attrs, encoding, fastpath=True) - return var - - -def maybe_default_fill_value(var: Variable) -> Variable: - # make NaN the fill value for float types: - if ( - "_FillValue" not in var.attrs - and "_FillValue" not in var.encoding - and np.issubdtype(var.dtype, np.floating) - ): - var.attrs["_FillValue"] = var.dtype.type(np.nan) - return var - - -def maybe_encode_bools(var: Variable) -> Variable: - if ( - (var.dtype == bool) - and ("dtype" not in var.encoding) - and ("dtype" not in var.attrs) - ): - dims, data, attrs, encoding = _var_as_tuple(var) - attrs["dtype"] = "bool" - data = duck_array_ops.astype(data, dtype="i1", copy=True) - var = Variable(dims, data, attrs, encoding, fastpath=True) - return var - - def _infer_dtype(array, name: T_Name = None) -> np.dtype: """Given an object array with no missing values, infer its dtype from its first element @@ -292,13 +179,13 @@ def encode_cf_variable( variables.CFScaleOffsetCoder(), variables.CFMaskCoder(), variables.UnsignedIntegerCoder(), + variables.NonStringCoder(), + variables.DefaultFillvalueCoder(), + variables.BooleanCoder(), ]: var = coder.encode(var, name=name) - # TODO(shoyer): convert all of these to use coders, too: - var = maybe_encode_nonstring_dtype(var, name=name) - var = maybe_default_fill_value(var) - var = maybe_encode_bools(var) + # TODO(kmuehlbauer): check if ensure_dtype_not_object can be moved to backends: var = ensure_dtype_not_object(var, name=name) for attr_name in CF_RELATED_DATA: @@ -389,19 +276,15 @@ def decode_cf_variable( if decode_times: var = times.CFDatetimeCoder(use_cftime=use_cftime).decode(var, name=name) - dimensions, data, attributes, encoding = variables.unpack_for_decoding(var) - # TODO(shoyer): convert everything below to use coders + if decode_endianness and not var.dtype.isnative: + var = variables.EndianCoder().decode(var) + original_dtype = var.dtype - if decode_endianness and not data.dtype.isnative: - # do this last, so it's only done if we didn't already unmask/scale - data = NativeEndiannessArray(data) - original_dtype = data.dtype + var = variables.BooleanCoder().decode(var) - encoding.setdefault("dtype", original_dtype) + dimensions, data, attributes, encoding = variables.unpack_for_decoding(var) - if "dtype" in attributes and attributes["dtype"] == "bool": - del attributes["dtype"] - data = BoolTypeArray(data) + encoding.setdefault("dtype", original_dtype) if not is_duck_dask_array(data): data = indexing.LazilyIndexedArray(data) diff --git a/xarray/tests/test_conventions.py b/xarray/tests/test_conventions.py index 9485b506b89..6d219f09e0e 100644 --- a/xarray/tests/test_conventions.py +++ b/xarray/tests/test_conventions.py @@ -32,7 +32,7 @@ class TestBoolTypeArray: def test_booltype_array(self) -> None: x = np.array([1, 0, 1, 1, 0], dtype="i1") - bx = conventions.BoolTypeArray(x) + bx = coding.variables.BoolTypeArray(x) assert bx.dtype == bool assert_array_equal(bx, np.array([True, False, True, True, False], dtype=bool)) @@ -41,7 +41,7 @@ class TestNativeEndiannessArray: def test(self) -> None: x = np.arange(5, dtype=">i8") expected = np.arange(5, dtype="int64") - a = conventions.NativeEndiannessArray(x) + a = coding.variables.NativeEndiannessArray(x) assert a.dtype == expected.dtype assert a.dtype == expected[:].dtype assert_array_equal(a, expected) @@ -247,7 +247,7 @@ def test_decode_coordinates(self) -> None: def test_0d_int32_encoding(self) -> None: original = Variable((), np.int32(0), encoding={"dtype": "int64"}) expected = Variable((), np.int64(0)) - actual = conventions.maybe_encode_nonstring_dtype(original) + actual = coding.variables.NonStringCoder().encode(original) assert_identical(expected, actual) def test_decode_cf_with_multiple_missing_values(self) -> None: From 5c497c3825d5fbeb9495dda16eb2e80588072db0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Sun, 2 Apr 2023 17:41:09 +0200 Subject: [PATCH 2/8] Apply suggestions from code review Co-authored-by: Illviljan <14371165+Illviljan@users.noreply.github.com> --- xarray/coding/variables.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index a94302a9cfd..00199d4a21d 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -100,14 +100,14 @@ class NativeEndiannessArray(indexing.ExplicitlyIndexedNDArrayMixin): __slots__ = ("array",) - def __init__(self, array): + def __init__(self, array) -> None: self.array = indexing.as_indexable(array) @property - def dtype(self): + def dtype(self) -> np.dtype: return np.dtype(self.array.dtype.kind + str(self.array.dtype.itemsize)) - def __getitem__(self, key): + def __getitem__(self, key) -> np.ndarray: return np.asarray(self.array[key], dtype=self.dtype) @@ -132,14 +132,14 @@ class BoolTypeArray(indexing.ExplicitlyIndexedNDArrayMixin): __slots__ = ("array",) - def __init__(self, array): + def __init__(self, array) -> None: self.array = indexing.as_indexable(array) @property - def dtype(self): + def dtype(self) -> np.dtype: return np.dtype("bool") - def __getitem__(self, key): + def __getitem__(self, key) -> np.ndarray: return np.asarray(self.array[key], dtype=self.dtype) From eb4c9a9a5343b0c16d0205a830fe21820f15e202 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Wed, 5 Apr 2023 07:42:54 +0200 Subject: [PATCH 3/8] add whats-new.rst entry --- doc/whats-new.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index eedb8c71624..3d5af86c99d 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -73,6 +73,8 @@ Internal Changes - Remove internal support for reading GRIB files through the ``cfgrib`` backend. ``cfgrib`` now uses the external backend interface, so no existing code should break. By `Deepak Cherian `_. +- Implement CF coding functions in ``VariableCoders``. + By By `Kai Mühlbauer `_ - Added a config.yml file with messages for the welcome bot when a Github user creates their first ever issue or pull request or has their first PR merged. (:issue:`7685`, :pull:`7685`) By `Nishtha P `_. From dbcb5743a1c7c9c230ba91447f1cebbfdb64714c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 5 Apr 2023 06:01:47 +0000 Subject: [PATCH 4/8] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/conventions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/conventions.py b/xarray/conventions.py index f7cdc0bc7d3..1506efc31e8 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -10,7 +10,7 @@ from xarray.coding import strings, times, variables from xarray.coding.variables import SerializationWarning, pop_to -from xarray.core import duck_array_ops, indexing +from xarray.core import indexing from xarray.core.common import ( _contains_datetime_like_objects, contains_cftime_datetimes, From a8e18e6040697da6e280c5b1e2789981d863d16e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Wed, 5 Apr 2023 08:03:30 +0200 Subject: [PATCH 5/8] fix whats-new.rst entry --- doc/whats-new.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 3d5af86c99d..ca10ab27966 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -74,7 +74,7 @@ Internal Changes backend interface, so no existing code should break. By `Deepak Cherian `_. - Implement CF coding functions in ``VariableCoders``. - By By `Kai Mühlbauer `_ + By `Kai Mühlbauer `_ - Added a config.yml file with messages for the welcome bot when a Github user creates their first ever issue or pull request or has their first PR merged. (:issue:`7685`, :pull:`7685`) By `Nishtha P `_. From a7848d3ab86e1894b1962e393da32f37bda70666 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Wed, 5 Apr 2023 08:13:22 +0200 Subject: [PATCH 6/8] add PR link to whats-new.rst entry --- doc/whats-new.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index ca10ab27966..2e97e61abb1 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -73,7 +73,7 @@ Internal Changes - Remove internal support for reading GRIB files through the ``cfgrib`` backend. ``cfgrib`` now uses the external backend interface, so no existing code should break. By `Deepak Cherian `_. -- Implement CF coding functions in ``VariableCoders``. +- Implement CF coding functions in ``VariableCoders`` (:pull:`7719`). By `Kai Mühlbauer `_ - Added a config.yml file with messages for the welcome bot when a Github user creates their first ever issue or pull request or has their first PR merged. (:issue:`7685`, :pull:`7685`) From fa569376e6a73e8a10faee4a444f9e3d8b9a22d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Thu, 6 Apr 2023 07:41:08 +0200 Subject: [PATCH 7/8] return early if no missing values defined --- xarray/coding/variables.py | 46 ++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index 00199d4a21d..8307e7fcb7a 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -224,34 +224,32 @@ def encode(self, variable: Variable, name: T_Name = None): fv = encoding.get("_FillValue") mv = encoding.get("missing_value") - if fv is not None or mv is not None: - if ( - fv is not None - and mv is not None - and not duck_array_ops.allclose_or_equiv(fv, mv) - ): - raise ValueError( - f"Variable {name!r} has conflicting _FillValue ({fv}) and missing_value ({mv}). Cannot encode data." - ) + fv_exists = fv is not None + mv_exists = mv is not None - if fv is not None: - # Ensure _FillValue is cast to same dtype as data's - encoding["_FillValue"] = dtype.type(fv) - fill_value = pop_to(encoding, attrs, "_FillValue", name=name) - if not pd.isnull(fill_value): - data = duck_array_ops.fillna(data, fill_value) + if not fv_exists and not mv_exists: + return variable - if mv is not None: - # Ensure missing_value is cast to same dtype as data's - encoding["missing_value"] = dtype.type(mv) - fill_value = pop_to(encoding, attrs, "missing_value", name=name) - if not pd.isnull(fill_value) and fv is None: - data = duck_array_ops.fillna(data, fill_value) + if fv_exists and mv_exists and not duck_array_ops.allclose_or_equiv(fv, mv): + raise ValueError( + f"Variable {name!r} has conflicting _FillValue ({fv}) and missing_value ({mv}). Cannot encode data." + ) - return Variable(dims, data, attrs, encoding, fastpath=True) + if fv_exists: + # Ensure _FillValue is cast to same dtype as data's + encoding["_FillValue"] = dtype.type(fv) + fill_value = pop_to(encoding, attrs, "_FillValue", name=name) + if not pd.isnull(fill_value): + data = duck_array_ops.fillna(data, fill_value) - else: - return variable + if mv_exists: + # Ensure missing_value is cast to same dtype as data's + encoding["missing_value"] = dtype.type(mv) + fill_value = pop_to(encoding, attrs, "missing_value", name=name) + if not pd.isnull(fill_value) and fv_exists: + data = duck_array_ops.fillna(data, fill_value) + + return Variable(dims, data, attrs, encoding, fastpath=True) def decode(self, variable: Variable, name: T_Name = None): dims, data, attrs, encoding = unpack_for_decoding(variable) From 43eb08304b922c87866b36d1f397f376e5ff78d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Thu, 6 Apr 2023 08:20:29 +0200 Subject: [PATCH 8/8] fix check --- xarray/coding/variables.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index 8307e7fcb7a..6a439418028 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -246,7 +246,7 @@ def encode(self, variable: Variable, name: T_Name = None): # Ensure missing_value is cast to same dtype as data's encoding["missing_value"] = dtype.type(mv) fill_value = pop_to(encoding, attrs, "missing_value", name=name) - if not pd.isnull(fill_value) and fv_exists: + if not pd.isnull(fill_value) and not fv_exists: data = duck_array_ops.fillna(data, fill_value) return Variable(dims, data, attrs, encoding, fastpath=True)