
implement scale_factor/add_offset CF conformance test, add and align tests #7771

Closed · wants to merge 8 commits
Changes from all commits
122 changes: 114 additions & 8 deletions xarray/coding/variables.py
@@ -318,6 +318,105 @@ def _choose_float_dtype(dtype: np.dtype, has_offset: bool) -> type[np.floating[A
return np.float64


def _ensure_scale_offset_conformance(
mapping: MutableMapping[str, Any], strict: bool = False
) -> bool | None:
"""Check conformance of scale_factor and add_offset for cf encoding/decoding.

scale_factor and/or add_offset as well as packed dtype are needed within mapping
"""
conforms = True
# https://cfconventions.org/cf-conventions/cf-conventions.html#packed-data
scale_factor = mapping.get("scale_factor")
if scale_factor is not None and np.ndim(scale_factor) > 0:
if strict:
raise ValueError(f"scale_factor {scale_factor} mismatch, scalar expected.")
else:
scale_factor = np.asarray(scale_factor).item()
mapping["scale_factor"] = scale_factor

add_offset = mapping.get("add_offset")
if add_offset is not None and np.ndim(add_offset) > 0:
if strict:
raise ValueError(f"add_offset {add_offset} mismatch, scalar expected.")
else:
add_offset = np.asarray(add_offset).item()
mapping["add_offset"] = add_offset

dtype = mapping.get("dtype")
ptype = np.dtype(dtype) if dtype is not None else None

# get the type from scale_factor/add_offset
scale_offset_dtype = list(
{np.dtype(type(att)) for att in [scale_factor, add_offset] if att is not None}
)

# raise early, aligns with netcdf4-python
if np.float16 in scale_offset_dtype:
raise ValueError(
f"scale_factor and/or add_offset dtype {scale_offset_dtype} mismatch. "
"float16 is not allowed."
)

ptype_exists = ptype is not None

# no packing information available, do nothing
if not scale_offset_dtype:
return None

# mandatory packing information missing
if scale_offset_dtype and not ptype_exists:
raise ValueError("Packed dtype information is missing!")

if len(scale_offset_dtype) == 1:
# OK, we have at least one of scale_factor or add_offset
# and if both are given, they are of the same dtype
if scale_offset_dtype[0] != ptype:
if scale_offset_dtype[0] not in [np.float32, np.float64]:
msg = (
f"scale_factor and/or add_offset dtype {scale_offset_dtype[0]} "
"mismatch. Must be either float32 or float64 dtype."
)
if strict:
raise ValueError(msg)
else:
warnings.warn(msg, SerializationWarning, stacklevel=3)
conforms = False
if np.issubdtype(ptype, np.integer) and ptype not in [
np.int8,
np.int16,
np.int32,
]:
msg = f"packed dtype {ptype} mismatch. Must be of type byte, short or int."
if strict:
raise ValueError(msg)
else:
warnings.warn(msg, SerializationWarning, stacklevel=3)
conforms = False
if ptype == np.int32 and scale_offset_dtype[0] == np.float32:
warnings.warn(
"Trying to pack float32 into int32. This is not advised per CF Convention "
"because of potential precision loss!",
SerializationWarning,
stacklevel=3,
)
else:
msg = (
f"scale_factor dtype {np.dtype(type(scale_factor))} and add_offset dtype "
f"{np.dtype(type(add_offset))} mismatch! Must be of same dtype."
)
if strict:
raise ValueError(msg)
else:
warnings.warn(msg, SerializationWarning, stacklevel=3)
conforms = False
return conforms
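
As a quick illustration (not part of the diff), the sketch below exercises the checks added above. It assumes this branch is installed so the private helper can be imported from xarray.coding.variables; the dictionaries are made-up encodings.

import numpy as np

from xarray.coding.variables import _ensure_scale_offset_conformance

# conforming: scalar float32 scale_factor/add_offset packed into int16 ("i2")
encoding = {"scale_factor": np.float32(0.1), "add_offset": np.float32(10), "dtype": "i2"}
assert _ensure_scale_offset_conformance(encoding, strict=True)

# float16 attributes are rejected outright (mirroring netcdf4-python),
# independent of the strict flag
try:
    _ensure_scale_offset_conformance({"scale_factor": np.float16(0.1), "dtype": "i2"})
except ValueError as err:
    print(err)

# non-strict mode repairs a non-scalar scale_factor in place instead of raising
lenient = {"scale_factor": np.array([0.1]), "add_offset": 10.0, "dtype": "i2"}
_ensure_scale_offset_conformance(lenient, strict=False)
assert np.isscalar(lenient["scale_factor"])
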


class CFScaleOffsetCoder(VariableCoder):
"""Scale and offset variables according to CF conventions.

@@ -329,6 +428,9 @@ def encode(self, variable: Variable, name: T_Name = None) -> Variable:
dims, data, attrs, encoding = unpack_for_encoding(variable)

if "scale_factor" in encoding or "add_offset" in encoding:
# strict checking, raise error on encoding
# we do not want to write non-conforming data
_ensure_scale_offset_conformance(encoding, strict=True)
dtype = _choose_float_dtype(data.dtype, "add_offset" in encoding)
data = data.astype(dtype=dtype, copy=True)
if "add_offset" in encoding:
@@ -343,17 +445,21 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable:
if "scale_factor" in _attrs or "add_offset" in _attrs:
dims, data, attrs, encoding = unpack_for_decoding(variable)

scale_factor = pop_to(attrs, encoding, "scale_factor", name=name)
add_offset = pop_to(attrs, encoding, "add_offset", name=name)
pop_to(attrs, encoding, "scale_factor", name=name)
pop_to(attrs, encoding, "add_offset", name=name)

# for decoding we need the original dtype
encoding.setdefault("dtype", data.dtype)

# only warn on decoding, but fix erroneous encoding
# we try to decode non-conforming data
_ensure_scale_offset_conformance(encoding, strict=False)

dtype = _choose_float_dtype(data.dtype, "add_offset" in encoding)
if np.ndim(scale_factor) > 0:
scale_factor = np.asarray(scale_factor).item()
if np.ndim(add_offset) > 0:
add_offset = np.asarray(add_offset).item()
transform = partial(
_scale_offset_decoding,
scale_factor=scale_factor,
add_offset=add_offset,
scale_factor=encoding.get("scale_factor"),
add_offset=encoding.get("add_offset"),
dtype=dtype,
)
data = lazy_elemwise_func(data, transform, dtype)
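
For reviewers, an illustrative sketch (not part of the diff) of how the two paths are meant to behave: encoding checks strictly and refuses non-conforming metadata, while decoding only warns and repairs what it can. It assumes this branch plus a netCDF backend (scipy or netCDF4) are installed; the dataset, the file name "packed.nc", and the values are made up.

import warnings

import numpy as np
import xarray as xr

from xarray.coding.variables import CFScaleOffsetCoder

# Strict path: a non-scalar scale_factor in the encoding is rejected on write.
ds = xr.Dataset({"x": ("t", np.arange(5, dtype="float32"))})
ds["x"].encoding.update({"scale_factor": [0.1, 0.2], "dtype": "i2"})
try:
    ds.to_netcdf("packed.nc")
except ValueError as err:
    print("encoding rejected:", err)

# Lenient path: mismatched scale_factor/add_offset dtypes on a stored variable
# trigger a SerializationWarning on decode, but values are still produced.
var = xr.Variable(
    ("t",),
    np.array([0, 10, 20], dtype="i2"),
    attrs={"scale_factor": np.float32(0.1), "add_offset": np.float64(10)},
)
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    decoded = CFScaleOffsetCoder().decode(var, name="x")
print(decoded.values, [str(w.message) for w in caught])
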
91 changes: 59 additions & 32 deletions xarray/tests/test_backends.py
@@ -17,7 +17,7 @@
from contextlib import ExitStack
from io import BytesIO
from pathlib import Path
from typing import TYPE_CHECKING, Any, Final, cast
from typing import TYPE_CHECKING, Any, Callable, Final, cast

import numpy as np
import pandas as pd
@@ -138,96 +138,110 @@ def open_example_mfdataset(names, *args, **kwargs) -> Dataset:
)


def create_masked_and_scaled_data() -> Dataset:
x = np.array([np.nan, np.nan, 10, 10.1, 10.2], dtype=np.float32)
def create_masked_and_scaled_data(dtype: type[np.number] = np.float32) -> Dataset:
x = np.array([np.nan, np.nan, 10, 10.1, 10.2], dtype=dtype)
encoding = {
"_FillValue": -1,
"add_offset": 10,
"scale_factor": np.float32(0.1),
"add_offset": dtype(10),
"scale_factor": dtype(0.1),
"dtype": "i2",
}
return Dataset({"x": ("t", x, {}, encoding)})


def create_encoded_masked_and_scaled_data() -> Dataset:
attributes = {"_FillValue": -1, "add_offset": 10, "scale_factor": np.float32(0.1)}
def create_encoded_masked_and_scaled_data(
dtype: type[np.number] = np.float32,
) -> Dataset:
attributes = {"_FillValue": -1, "add_offset": dtype(10), "scale_factor": dtype(0.1)}
return Dataset(
{"x": ("t", np.array([-1, -1, 0, 1, 2], dtype=np.int16), attributes)}
)


def create_unsigned_masked_scaled_data() -> Dataset:
def create_unsigned_masked_scaled_data(
dtype: type[np.number] = np.float32,
) -> Dataset:
encoding = {
"_FillValue": 255,
"_Unsigned": "true",
"dtype": "i1",
"add_offset": 10,
"scale_factor": np.float32(0.1),
"add_offset": dtype(10),
"scale_factor": dtype(0.1),
}
x = np.array([10.0, 10.1, 22.7, 22.8, np.nan], dtype=np.float32)
x = np.array([10.0, 10.1, 22.7, 22.8, np.nan], dtype=dtype)
return Dataset({"x": ("t", x, {}, encoding)})


def create_encoded_unsigned_masked_scaled_data() -> Dataset:
def create_encoded_unsigned_masked_scaled_data(
dtype: type[np.number] = np.float32,
) -> Dataset:
# These are values as written to the file: the _FillValue will
# be represented in the signed form.
attributes = {
"_FillValue": -1,
"_Unsigned": "true",
"add_offset": 10,
"scale_factor": np.float32(0.1),
"add_offset": dtype(10),
"scale_factor": dtype(0.1),
}
# Create unsigned data corresponding to [0, 1, 127, 128, 255] unsigned
sb = np.asarray([0, 1, 127, -128, -1], dtype="i1")
return Dataset({"x": ("t", sb, attributes)})


def create_bad_unsigned_masked_scaled_data() -> Dataset:
def create_bad_unsigned_masked_scaled_data(
dtype: type[np.number] = np.float32,
) -> Dataset:
encoding = {
"_FillValue": 255,
"_Unsigned": True,
"dtype": "i1",
"add_offset": 10,
"scale_factor": np.float32(0.1),
"add_offset": dtype(0),
"scale_factor": dtype(0.1),
}
x = np.array([10.0, 10.1, 22.7, 22.8, np.nan], dtype=np.float32)
x = np.array([10.0, 10.1, 22.7, 22.8, np.nan], dtype=dtype)
return Dataset({"x": ("t", x, {}, encoding)})


def create_bad_encoded_unsigned_masked_scaled_data() -> Dataset:
def create_bad_encoded_unsigned_masked_scaled_data(
dtype: type[np.number] = np.float32,
) -> Dataset:
# These are values as written to the file: the _FillValue will
# be represented in the signed form.
attributes = {
"_FillValue": -1,
"_Unsigned": True,
"add_offset": 10,
"scale_factor": np.float32(0.1),
"add_offset": dtype(10),
"scale_factor": dtype(0.1),
}
# Create signed data corresponding to [0, 1, 127, 128, 255] unsigned
sb = np.asarray([0, 1, 127, -128, -1], dtype="i1")
return Dataset({"x": ("t", sb, attributes)})


def create_signed_masked_scaled_data() -> Dataset:
def create_signed_masked_scaled_data(
dtype: type[np.number] = np.float32,
) -> Dataset:
encoding = {
"_FillValue": -127,
"_Unsigned": "false",
"dtype": "i1",
"add_offset": 10,
"scale_factor": np.float32(0.1),
"add_offset": dtype(10),
"scale_factor": dtype(0.1),
}
x = np.array([-1.0, 10.1, 22.7, np.nan], dtype=np.float32)
x = np.array([-1.0, 10.1, 22.7, np.nan], dtype=dtype)
return Dataset({"x": ("t", x, {}, encoding)})


def create_encoded_signed_masked_scaled_data() -> Dataset:
def create_encoded_signed_masked_scaled_data(
dtype: type[np.number] = np.float32,
) -> Dataset:
# These are values as written to the file: the _FillValue will
# be represented in the signed form.
attributes = {
"_FillValue": -127,
"_Unsigned": "false",
"add_offset": 10,
"scale_factor": np.float32(0.1),
"add_offset": dtype(10),
"scale_factor": dtype(0.1),
}
# Create signed data corresponding to [0, 1, 127, 128, 255] unsigned
sb = np.asarray([-110, 1, 127, -127], dtype="i1")
@@ -859,6 +873,8 @@ def test_roundtrip_string_with_fill_value_nchar(self) -> None:
with self.roundtrip(original) as actual:
assert_identical(expected, actual)

# Todo: (kmuehlbauer) make this work with np.float64
@pytest.mark.parametrize("dtype", [np.float32])
@pytest.mark.parametrize(
"decoded_fn, encoded_fn",
[
@@ -878,9 +894,20 @@ def test_roundtrip_string_with_fill_value_nchar(self) -> None:
(create_masked_and_scaled_data, create_encoded_masked_and_scaled_data),
],
)
def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn) -> None:
decoded = decoded_fn()
encoded = encoded_fn()
def test_roundtrip_mask_and_scale(
self,
decoded_fn: Callable[[type[np.number]], Dataset],
encoded_fn: Callable[[type[np.number]], Dataset],
dtype: type[np.number],
) -> None:
if dtype == np.float32 and isinstance(
self, (TestZarrDirectoryStore, TestZarrDictStore)
):
pytest.skip(
"zarr attributes (eg. `scale_factor` are unconditionally promoted to `float64`"
)
decoded = decoded_fn(dtype)
encoded = encoded_fn(dtype)

with self.roundtrip(decoded) as actual:
for k in decoded.variables:
@@ -901,7 +928,7 @@ def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn) -> None:

# make sure roundtrip encoding didn't change the
# original dataset.
assert_allclose(encoded, encoded_fn(), decode_bytes=False)
assert_allclose(encoded, encoded_fn(dtype), decode_bytes=False)

with self.roundtrip(encoded) as actual:
for k in decoded.variables: