Skip to content

Implement more Variable Coders #7719

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Apr 7, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,8 @@ Internal Changes
- Remove internal support for reading GRIB files through the ``cfgrib`` backend. ``cfgrib`` now uses the external
backend interface, so no existing code should break.
By `Deepak Cherian <https://github.com/dcherian>`_.
- Implement CF coding functions in ``VariableCoders`` (:pull:`7719`).
By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.

- Added a config.yml file with messages for the welcome bot when a GitHub user creates their first ever issue or pull request or has their first PR merged. (:issue:`7685`, :pull:`7685`)
By `Nishtha P <https://github.com/nishthap981>`_.
Expand Down
179 changes: 171 additions & 8 deletions xarray/coding/variables.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,71 @@ def __repr__(self) -> str:
)


class NativeEndiannessArray(indexing.ExplicitlyIndexedNDArrayMixin):
    """Decode arrays on the fly from non-native to native endianness.

    This is useful for decoding arrays from netCDF3 files (which are all
    big endian) into native endianness, so they can be used with Cython
    functions, such as those found in bottleneck and pandas.

    >>> x = np.arange(5, dtype=">i2")

    >>> x.dtype
    dtype('>i2')

    >>> NativeEndiannessArray(x).dtype
    dtype('int16')

    >>> indexer = indexing.BasicIndexer((slice(None),))
    >>> NativeEndiannessArray(x)[indexer].dtype
    dtype('int16')
    """

    __slots__ = ("array",)

    def __init__(self, array) -> None:
        # The input is stored as whatever ``indexing.as_indexable`` returns;
        # ``__getitem__`` below indexes that object directly.
        self.array = indexing.as_indexable(array)

    @property
    def dtype(self) -> np.dtype:
        """Return the wrapped array's dtype with native byte order."""
        # ``newbyteorder("=")`` changes nothing but the byte order. Rebuilding
        # the dtype from ``kind + str(itemsize)`` is lossy: ``itemsize`` is in
        # bytes, so ">U10" would turn into "U40", datetime64/timedelta64 would
        # lose their unit, and structured dtypes would lose their fields.
        return self.array.dtype.newbyteorder("=")

    def __getitem__(self, key) -> np.ndarray:
        # Only the selected values are converted, at access time.
        return np.asarray(self.array[key], dtype=self.dtype)


class BoolTypeArray(indexing.ExplicitlyIndexedNDArrayMixin):
    """Present an integer-typed array as booleans, converting on access.

    Intended for boolean data that was stored in an integer typed netCDF
    variable.

    >>> x = np.array([1, 0, 1, 1, 0], dtype="i1")

    >>> x.dtype
    dtype('int8')

    >>> BoolTypeArray(x).dtype
    dtype('bool')

    >>> indexer = indexing.BasicIndexer((slice(None),))
    >>> BoolTypeArray(x)[indexer].dtype
    dtype('bool')
    """

    __slots__ = ("array",)

    def __init__(self, array) -> None:
        wrapped = indexing.as_indexable(array)
        self.array = wrapped

    @property
    def dtype(self) -> np.dtype:
        """Always the boolean dtype, independent of the wrapped array."""
        return np.dtype(bool)

    def __getitem__(self, key) -> np.ndarray:
        # Index first, then cast just the selected values to bool.
        selected = self.array[key]
        return np.asarray(selected, dtype=self.dtype)


def lazy_elemwise_func(array, func: Callable, dtype: np.typing.DTypeLike):
"""Lazily apply an element-wise function to an array.
Parameters
Expand Down Expand Up @@ -159,27 +224,29 @@ def encode(self, variable: Variable, name: T_Name = None):
fv = encoding.get("_FillValue")
mv = encoding.get("missing_value")

if (
fv is not None
and mv is not None
and not duck_array_ops.allclose_or_equiv(fv, mv)
):
fv_exists = fv is not None
mv_exists = mv is not None

if not fv_exists and not mv_exists:
return variable

if fv_exists and mv_exists and not duck_array_ops.allclose_or_equiv(fv, mv):
raise ValueError(
f"Variable {name!r} has conflicting _FillValue ({fv}) and missing_value ({mv}). Cannot encode data."
)

if fv is not None:
if fv_exists:
# Ensure _FillValue is cast to same dtype as data's
encoding["_FillValue"] = dtype.type(fv)
fill_value = pop_to(encoding, attrs, "_FillValue", name=name)
if not pd.isnull(fill_value):
data = duck_array_ops.fillna(data, fill_value)

if mv is not None:
if mv_exists:
# Ensure missing_value is cast to same dtype as data's
encoding["missing_value"] = dtype.type(mv)
fill_value = pop_to(encoding, attrs, "missing_value", name=name)
if not pd.isnull(fill_value) and fv is None:
if not pd.isnull(fill_value) and not fv_exists:
data = duck_array_ops.fillna(data, fill_value)

return Variable(dims, data, attrs, encoding, fastpath=True)
Expand Down Expand Up @@ -349,3 +416,99 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable:
return Variable(dims, data, attrs, encoding, fastpath=True)
else:
return variable


class DefaultFillvalueCoder(VariableCoder):
    """Add a NaN ``_FillValue`` to floating point variables that have none."""

    def encode(self, variable: Variable, name: T_Name = None) -> Variable:
        """Return ``variable`` with a default NaN ``_FillValue`` attribute.

        The input is returned unchanged when ``_FillValue`` is already present
        in either its attrs or its encoding, or when its dtype is not a
        floating point type.
        """
        dims, data, attrs, encoding = unpack_for_encoding(variable)

        fill_declared = "_FillValue" in attrs or "_FillValue" in encoding
        if fill_declared or not np.issubdtype(variable.dtype, np.floating):
            return variable

        # NaN, expressed in the variable's own floating point type.
        attrs["_FillValue"] = variable.dtype.type(np.nan)
        return Variable(dims, data, attrs, encoding, fastpath=True)

    def decode(self, variable: Variable, name: T_Name = None) -> Variable:
        """Not supported: this coder only encodes."""
        raise NotImplementedError()


class BooleanCoder(VariableCoder):
    """Store booleans as ``i1`` tagged with a ``dtype="bool"`` attribute."""

    def encode(self, variable: Variable, name: T_Name = None) -> Variable:
        """Cast boolean data to ``i1`` and record ``dtype="bool"`` in attrs.

        Variables that are not boolean, or that already carry a ``dtype``
        entry in their encoding or attrs, are returned unchanged.
        """
        if (
            variable.dtype != bool
            or "dtype" in variable.encoding
            or "dtype" in variable.attrs
        ):
            return variable

        dims, data, attrs, encoding = unpack_for_encoding(variable)
        attrs["dtype"] = "bool"
        as_int8 = duck_array_ops.astype(data, dtype="i1", copy=True)
        return Variable(dims, as_int8, attrs, encoding, fastpath=True)

    def decode(self, variable: Variable, name: T_Name = None) -> Variable:
        """Wrap data in ``BoolTypeArray`` when attrs say ``dtype == "bool"``.

        The ``dtype`` attribute is removed from the returned variable. Any
        other variable is returned unchanged.
        """
        if variable.attrs.get("dtype") != "bool":
            return variable

        dims, data, attrs, encoding = unpack_for_decoding(variable)
        del attrs["dtype"]
        return Variable(dims, BoolTypeArray(data), attrs, encoding, fastpath=True)


class EndianCoder(VariableCoder):
    """Decode Endianness to native."""

    def encode(
        self, variable: "Variable | None" = None, name: T_Name = None
    ) -> Variable:
        """Not supported: this coder only decodes.

        The parameters mirror the other coders' ``encode`` methods so that a
        uniform ``coder.encode(var, name=name)`` call reaches the
        ``NotImplementedError`` below instead of failing with a ``TypeError``.
        Both have defaults so the previous zero-argument call keeps working.

        Raises
        ------
        NotImplementedError
            Always.
        """
        raise NotImplementedError()

    def decode(self, variable: Variable, name: T_Name = None) -> Variable:
        """Wrap non-native-endian data in ``NativeEndiannessArray``.

        Returns a new ``Variable`` when ``data.dtype.isnative`` is false;
        otherwise the input variable is returned unchanged.
        """
        dims, data, attrs, encoding = unpack_for_decoding(variable)
        if not data.dtype.isnative:
            data = NativeEndiannessArray(data)
            return Variable(dims, data, attrs, encoding, fastpath=True)
        else:
            return variable


class NonStringCoder(VariableCoder):
    """Encode NonString variables if dtypes differ."""

    def encode(self, variable: Variable, name: T_Name = None) -> Variable:
        """Cast the data to the dtype requested in ``encoding["dtype"]``.

        Nothing is done (the input is returned) when the encoding has no
        ``dtype`` entry or when that entry is ``"S1"`` or ``str``. Otherwise
        ``dtype`` is popped from the returned variable's encoding and, if it
        differs from the variable's dtype, the data is cast to it. Values are
        rounded with ``np.around`` before a cast to an integer dtype.

        Warns
        -----
        SerializationWarning
            When floating point data is cast to an integer dtype and neither
            ``_FillValue`` nor ``missing_value`` is present in the attrs.
        """
        if "dtype" in variable.encoding and variable.encoding["dtype"] not in (
            "S1",
            str,
        ):
            dims, data, attrs, encoding = unpack_for_encoding(variable)
            dtype = np.dtype(encoding.pop("dtype"))
            if dtype != variable.dtype:
                if np.issubdtype(dtype, np.integer):
                    if (
                        np.issubdtype(variable.dtype, np.floating)
                        and "_FillValue" not in variable.attrs
                        and "missing_value" not in variable.attrs
                    ):
                        warnings.warn(
                            f"saving variable {name} with floating "
                            "point data as an integer dtype without "
                            "any _FillValue to use for NaNs",
                            SerializationWarning,
                            stacklevel=10,
                        )
                    # Round rather than truncate when going to integers.
                    data = np.around(data)
                data = data.astype(dtype=dtype)
            return Variable(dims, data, attrs, encoding, fastpath=True)
        else:
            return variable

    def decode(
        self, variable: "Variable | None" = None, name: T_Name = None
    ) -> Variable:
        """Not supported: this coder only encodes.

        The parameters mirror the other coders' ``decode`` methods so that a
        uniform ``coder.decode(var, name=name)`` call reaches the
        ``NotImplementedError`` below instead of failing with a ``TypeError``.
        Both have defaults so the previous zero-argument call keeps working.

        Raises
        ------
        NotImplementedError
            Always.
        """
        raise NotImplementedError()
139 changes: 11 additions & 128 deletions xarray/conventions.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

from xarray.coding import strings, times, variables
from xarray.coding.variables import SerializationWarning, pop_to
from xarray.core import duck_array_ops, indexing
from xarray.core import indexing
from xarray.core.common import (
_contains_datetime_like_objects,
contains_cftime_datetimes,
Expand Down Expand Up @@ -48,123 +48,10 @@
T_DatasetOrAbstractstore = Union[Dataset, AbstractDataStore]


class NativeEndiannessArray(indexing.ExplicitlyIndexedNDArrayMixin):
    """Decode arrays on the fly from non-native to native endianness.

    This is useful for decoding arrays from netCDF3 files (which are all
    big endian) into native endianness, so they can be used with Cython
    functions, such as those found in bottleneck and pandas.

    >>> x = np.arange(5, dtype=">i2")

    >>> x.dtype
    dtype('>i2')

    >>> NativeEndiannessArray(x).dtype
    dtype('int16')

    >>> indexer = indexing.BasicIndexer((slice(None),))
    >>> NativeEndiannessArray(x)[indexer].dtype
    dtype('int16')
    """

    __slots__ = ("array",)

    def __init__(self, array):
        # The input is stored as whatever ``indexing.as_indexable`` returns;
        # ``__getitem__`` below indexes that object directly.
        self.array = indexing.as_indexable(array)

    @property
    def dtype(self):
        """Return the wrapped array's dtype with native byte order."""
        # ``newbyteorder("=")`` changes nothing but the byte order. Rebuilding
        # the dtype from ``kind + str(itemsize)`` is lossy: ``itemsize`` is in
        # bytes, so ">U10" would turn into "U40", datetime64/timedelta64 would
        # lose their unit, and structured dtypes would lose their fields.
        return self.array.dtype.newbyteorder("=")

    def __getitem__(self, key):
        # Only the selected values are converted, at access time.
        return np.asarray(self.array[key], dtype=self.dtype)


class BoolTypeArray(indexing.ExplicitlyIndexedNDArrayMixin):
    """Present an integer-typed array as booleans, converting on access.

    Intended for boolean data that was stored in an integer typed netCDF
    variable.

    >>> x = np.array([1, 0, 1, 1, 0], dtype="i1")

    >>> x.dtype
    dtype('int8')

    >>> BoolTypeArray(x).dtype
    dtype('bool')

    >>> indexer = indexing.BasicIndexer((slice(None),))
    >>> BoolTypeArray(x)[indexer].dtype
    dtype('bool')
    """

    __slots__ = ("array",)

    def __init__(self, array):
        wrapped = indexing.as_indexable(array)
        self.array = wrapped

    @property
    def dtype(self):
        # Fixed result: the wrapped array's own dtype is never consulted.
        return np.dtype(bool)

    def __getitem__(self, key):
        # Index first, then cast just the selected values to bool.
        selected = self.array[key]
        return np.asarray(selected, dtype=self.dtype)


def _var_as_tuple(var: Variable) -> T_VarTuple:
    """Return ``(dims, data, attrs, encoding)`` for ``var``.

    ``attrs`` and ``encoding`` are copies made with ``.copy()``, so the caller
    can modify them without touching the ones on ``var``.
    """
    attrs_copy = var.attrs.copy()
    encoding_copy = var.encoding.copy()
    return var.dims, var.data, attrs_copy, encoding_copy


def maybe_encode_nonstring_dtype(var: Variable, name: T_Name = None) -> Variable:
    """Cast ``var`` to the dtype requested in ``var.encoding["dtype"]``.

    ``var`` is returned as-is when its encoding has no ``dtype`` entry or when
    that entry is ``"S1"`` or ``str``. Otherwise a new ``Variable`` is built
    whose encoding no longer contains ``dtype``; its data is cast only if the
    requested dtype differs from ``var.dtype``, with values rounded by
    ``np.around`` before a cast to an integer dtype.
    """
    if "dtype" not in var.encoding or var.encoding["dtype"] in ("S1", str):
        return var

    dims, data, attrs, encoding = _var_as_tuple(var)
    target = np.dtype(encoding.pop("dtype"))

    if target != var.dtype:
        if np.issubdtype(target, np.integer):
            float_without_fill = (
                np.issubdtype(var.dtype, np.floating)
                and "_FillValue" not in var.attrs
                and "missing_value" not in var.attrs
            )
            if float_without_fill:
                warnings.warn(
                    f"saving variable {name} with floating "
                    "point data as an integer dtype without "
                    "any _FillValue to use for NaNs",
                    SerializationWarning,
                    stacklevel=10,
                )
            data = np.around(data)
        data = data.astype(dtype=target)

    return Variable(dims, data, attrs, encoding, fastpath=True)


def maybe_default_fill_value(var: Variable) -> Variable:
    """Give floating point variables a NaN ``_FillValue`` if they have none.

    ``var.attrs`` is modified in place and the same ``var`` object is
    returned. Nothing changes when ``_FillValue`` is already present in the
    attrs or the encoding, or when the dtype is not a floating point type.
    """
    fill_declared = "_FillValue" in var.attrs or "_FillValue" in var.encoding
    if not fill_declared and np.issubdtype(var.dtype, np.floating):
        # NaN, expressed in the variable's own floating point type.
        var.attrs["_FillValue"] = var.dtype.type(np.nan)
    return var


def maybe_encode_bools(var: Variable) -> Variable:
    """Store boolean data as ``i1`` tagged with a ``dtype="bool"`` attribute.

    ``var`` is returned as-is unless it has boolean dtype and carries no
    ``dtype`` entry in either its encoding or its attrs.
    """
    if var.dtype != bool or "dtype" in var.encoding or "dtype" in var.attrs:
        return var

    dims, data, attrs, encoding = _var_as_tuple(var)
    attrs["dtype"] = "bool"
    as_int8 = duck_array_ops.astype(data, dtype="i1", copy=True)
    return Variable(dims, as_int8, attrs, encoding, fastpath=True)


def _infer_dtype(array, name: T_Name = None) -> np.dtype:
"""Given an object array with no missing values, infer its dtype from its
first element
Expand Down Expand Up @@ -292,13 +179,13 @@ def encode_cf_variable(
variables.CFScaleOffsetCoder(),
variables.CFMaskCoder(),
variables.UnsignedIntegerCoder(),
variables.NonStringCoder(),
variables.DefaultFillvalueCoder(),
variables.BooleanCoder(),
]:
var = coder.encode(var, name=name)

# TODO(shoyer): convert all of these to use coders, too:
var = maybe_encode_nonstring_dtype(var, name=name)
var = maybe_default_fill_value(var)
var = maybe_encode_bools(var)
# TODO(kmuehlbauer): check if ensure_dtype_not_object can be moved to backends:
var = ensure_dtype_not_object(var, name=name)

for attr_name in CF_RELATED_DATA:
Expand Down Expand Up @@ -389,19 +276,15 @@ def decode_cf_variable(
if decode_times:
var = times.CFDatetimeCoder(use_cftime=use_cftime).decode(var, name=name)

dimensions, data, attributes, encoding = variables.unpack_for_decoding(var)
# TODO(shoyer): convert everything below to use coders
if decode_endianness and not var.dtype.isnative:
var = variables.EndianCoder().decode(var)
original_dtype = var.dtype

if decode_endianness and not data.dtype.isnative:
# do this last, so it's only done if we didn't already unmask/scale
data = NativeEndiannessArray(data)
original_dtype = data.dtype
var = variables.BooleanCoder().decode(var)

encoding.setdefault("dtype", original_dtype)
dimensions, data, attributes, encoding = variables.unpack_for_decoding(var)

if "dtype" in attributes and attributes["dtype"] == "bool":
del attributes["dtype"]
data = BoolTypeArray(data)
encoding.setdefault("dtype", original_dtype)

if not is_duck_dask_array(data):
data = indexing.LazilyIndexedArray(data)
Expand Down
Loading