Skip to content

Commit 64ef0b9

Browse files
committed
determine cf packed data dtype
1 parent 86f0209 commit 64ef0b9

File tree

2 files changed

+57
-37
lines changed

2 files changed

+57
-37
lines changed

xarray/coding/variables.py

+18-9
Original file line numberDiff line numberDiff line change
@@ -232,20 +232,29 @@ def _scale_offset_decoding(data, scale_factor, add_offset, dtype: np.typing.DTyp
232232
return data
233233

234234

235-
def _choose_float_dtype(dtype: np.dtype, has_offset: bool) -> type[np.floating[Any]]:
236-
"""Return a float dtype that can losslessly represent `dtype` values."""
237-
# Keep float32 as-is. Upcast half-precision to single-precision,
235+
def _choose_float_dtype(
236+
dtype: np.dtype, encoding: MutableMapping
237+
) -> type[np.floating[Any]]:
238+
# check scale/offset first to derive the dtype from them
239+
if "scale_factor" in encoding or "add_offset" in encoding:
240+
scale_factor = encoding.get("scale_factor", False)
241+
add_offset = encoding.get("add_offset", False)
242+
# minimal floating point size -> 4 byte
243+
maxsize = 4
244+
if scale_factor and np.issubdtype(type(scale_factor), np.floating):
245+
maxsize = max(maxsize, np.dtype(type(scale_factor)).itemsize)
246+
if add_offset and np.issubdtype(type(add_offset), np.floating):
247+
maxsize = max(maxsize, np.dtype(type(add_offset)).itemsize)
248+
return np.dtype(f"float{maxsize * 8}")
249+
# Keep float32 as-is. Upcast half-precision to single-precision,
238250
# because float16 is "intended for storage but not computation"
239251
if dtype.itemsize <= 4 and np.issubdtype(dtype, np.floating):
240252
return np.float32
241253
# float32 can exactly represent all integers up to 24 bits
242254
if dtype.itemsize <= 2 and np.issubdtype(dtype, np.integer):
243255
# A scale factor is entirely safe (vanishing into the mantissa),
244256
# but a large integer offset could lead to loss of precision.
245-
# Sensitivity analysis can be tricky, so we just use a float64
246-
# if there's any offset at all - better unoptimised than wrong!
247-
if not has_offset:
248-
return np.float32
257+
return np.float32
249258
# For all other types and circumstances, we just use float64.
250259
# (safe because eg. complex numbers are not supported in NetCDF)
251260
return np.float64
@@ -281,7 +290,7 @@ def encode(self, variable: Variable, name: T_Name = None) -> Variable:
281290
dims, data, attrs, encoding = unpack_for_encoding(variable)
282291

283292
if "scale_factor" in encoding or "add_offset" in encoding:
284-
dtype = _choose_float_dtype(data.dtype, "add_offset" in encoding)
293+
dtype = _choose_float_dtype(data.dtype, encoding)
285294
data = data.astype(dtype=dtype, copy=True)
286295
if "add_offset" in encoding:
287296
data -= pop_to(encoding, attrs, "add_offset", name=name)
@@ -297,7 +306,7 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable:
297306

298307
scale_factor = pop_to(attrs, encoding, "scale_factor", name=name)
299308
add_offset = pop_to(attrs, encoding, "add_offset", name=name)
300-
dtype = _choose_float_dtype(data.dtype, "add_offset" in encoding)
309+
dtype = _choose_float_dtype(data.dtype, encoding)
301310
if np.ndim(scale_factor) > 0:
302311
scale_factor = np.asarray(scale_factor).item()
303312
if np.ndim(add_offset) > 0:

xarray/tests/test_backends.py

+39-28
Original file line numberDiff line numberDiff line change
@@ -138,96 +138,96 @@ def open_example_mfdataset(names, *args, **kwargs) -> Dataset:
138138
)
139139

140140

141-
def create_masked_and_scaled_data() -> Dataset:
142-
x = np.array([np.nan, np.nan, 10, 10.1, 10.2], dtype=np.float32)
141+
def create_masked_and_scaled_data(dtype=np.float32) -> Dataset:
142+
x = np.array([np.nan, np.nan, 10, 10.1, 10.2], dtype=dtype)
143143
encoding = {
144144
"_FillValue": -1,
145145
"add_offset": 10,
146-
"scale_factor": np.float32(0.1),
146+
"scale_factor": dtype(0.1),
147147
"dtype": "i2",
148148
}
149149
return Dataset({"x": ("t", x, {}, encoding)})
150150

151151

152-
def create_encoded_masked_and_scaled_data() -> Dataset:
153-
attributes = {"_FillValue": -1, "add_offset": 10, "scale_factor": np.float32(0.1)}
152+
def create_encoded_masked_and_scaled_data(dtype=np.float32) -> Dataset:
153+
attributes = {"_FillValue": -1, "add_offset": 10, "scale_factor": dtype(0.1)}
154154
return Dataset(
155155
{"x": ("t", np.array([-1, -1, 0, 1, 2], dtype=np.int16), attributes)}
156156
)
157157

158158

159-
def create_unsigned_masked_scaled_data() -> Dataset:
159+
def create_unsigned_masked_scaled_data(dtype=np.float32) -> Dataset:
160160
encoding = {
161161
"_FillValue": 255,
162162
"_Unsigned": "true",
163163
"dtype": "i1",
164164
"add_offset": 10,
165-
"scale_factor": np.float32(0.1),
165+
"scale_factor": dtype(0.1),
166166
}
167-
x = np.array([10.0, 10.1, 22.7, 22.8, np.nan], dtype=np.float32)
167+
x = np.array([10.0, 10.1, 22.7, 22.8, np.nan], dtype=dtype)
168168
return Dataset({"x": ("t", x, {}, encoding)})
169169

170170

171-
def create_encoded_unsigned_masked_scaled_data() -> Dataset:
171+
def create_encoded_unsigned_masked_scaled_data(dtype=np.float32) -> Dataset:
172172
# These are values as written to the file: the _FillValue will
173173
# be represented in the signed form.
174174
attributes = {
175175
"_FillValue": -1,
176176
"_Unsigned": "true",
177177
"add_offset": 10,
178-
"scale_factor": np.float32(0.1),
178+
"scale_factor": dtype(0.1),
179179
}
180180
# Create unsigned data corresponding to [0, 1, 127, 128, 255] unsigned
181181
sb = np.asarray([0, 1, 127, -128, -1], dtype="i1")
182182
return Dataset({"x": ("t", sb, attributes)})
183183

184184

185-
def create_bad_unsigned_masked_scaled_data() -> Dataset:
185+
def create_bad_unsigned_masked_scaled_data(dtype=np.float32) -> Dataset:
186186
encoding = {
187187
"_FillValue": 255,
188188
"_Unsigned": True,
189189
"dtype": "i1",
190190
"add_offset": 10,
191-
"scale_factor": np.float32(0.1),
191+
"scale_factor": dtype(0.1),
192192
}
193-
x = np.array([10.0, 10.1, 22.7, 22.8, np.nan], dtype=np.float32)
193+
x = np.array([10.0, 10.1, 22.7, 22.8, np.nan], dtype=dtype)
194194
return Dataset({"x": ("t", x, {}, encoding)})
195195

196196

197-
def create_bad_encoded_unsigned_masked_scaled_data() -> Dataset:
197+
def create_bad_encoded_unsigned_masked_scaled_data(dtype=np.float32) -> Dataset:
198198
# These are values as written to the file: the _FillValue will
199199
# be represented in the signed form.
200200
attributes = {
201201
"_FillValue": -1,
202202
"_Unsigned": True,
203203
"add_offset": 10,
204-
"scale_factor": np.float32(0.1),
204+
"scale_factor": dtype(0.1),
205205
}
206206
# Create signed data corresponding to [0, 1, 127, 128, 255] unsigned
207207
sb = np.asarray([0, 1, 127, -128, -1], dtype="i1")
208208
return Dataset({"x": ("t", sb, attributes)})
209209

210210

211-
def create_signed_masked_scaled_data() -> Dataset:
211+
def create_signed_masked_scaled_data(dtype=np.float32) -> Dataset:
212212
encoding = {
213213
"_FillValue": -127,
214214
"_Unsigned": "false",
215215
"dtype": "i1",
216216
"add_offset": 10,
217-
"scale_factor": np.float32(0.1),
217+
"scale_factor": dtype(0.1),
218218
}
219-
x = np.array([-1.0, 10.1, 22.7, np.nan], dtype=np.float32)
219+
x = np.array([-1.0, 10.1, 22.7, np.nan], dtype=dtype)
220220
return Dataset({"x": ("t", x, {}, encoding)})
221221

222222

223-
def create_encoded_signed_masked_scaled_data() -> Dataset:
223+
def create_encoded_signed_masked_scaled_data(dtype=np.float32) -> Dataset:
224224
# These are values as written to the file: the _FillValue will
225225
# be represented in the signed form.
226226
attributes = {
227227
"_FillValue": -127,
228228
"_Unsigned": "false",
229229
"add_offset": 10,
230-
"scale_factor": np.float32(0.1),
230+
"scale_factor": dtype(0.1),
231231
}
232232
# Create signed data corresponding to [0, 1, 127, 128, 255] unsigned
233233
sb = np.asarray([-110, 1, 127, -127], dtype="i1")
@@ -857,6 +857,7 @@ def test_roundtrip_string_with_fill_value_nchar(self) -> None:
857857
with self.roundtrip(original) as actual:
858858
assert_identical(expected, actual)
859859

860+
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
860861
@pytest.mark.parametrize(
861862
"decoded_fn, encoded_fn",
862863
[
@@ -876,12 +877,19 @@ def test_roundtrip_string_with_fill_value_nchar(self) -> None:
876877
(create_masked_and_scaled_data, create_encoded_masked_and_scaled_data),
877878
],
878879
)
879-
def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn) -> None:
880-
decoded = decoded_fn()
881-
encoded = encoded_fn()
880+
def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn, dtype) -> None:
881+
if dtype == np.float32 and isinstance(
882+
self, (TestZarrDirectoryStore, TestZarrDictStore)
883+
):
884+
pytest.skip(
885+
"zarr attributes (e.g. `scale_factor`) are unconditionally promoted to `float64`"
886+
)
887+
decoded = decoded_fn(dtype)
888+
encoded = encoded_fn(dtype)
882889

883890
with self.roundtrip(decoded) as actual:
884891
for k in decoded.variables:
892+
print(k, decoded.variables[k].dtype)
885893
assert decoded.variables[k].dtype == actual.variables[k].dtype
886894
assert_allclose(decoded, actual, decode_bytes=False)
887895

@@ -899,7 +907,7 @@ def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn) -> None:
899907

900908
# make sure roundtrip encoding didn't change the
901909
# original dataset.
902-
assert_allclose(encoded, encoded_fn(), decode_bytes=False)
910+
assert_allclose(encoded, encoded_fn(dtype), decode_bytes=False)
903911

904912
with self.roundtrip(encoded) as actual:
905913
for k in decoded.variables:
@@ -1533,29 +1541,32 @@ def test_encoding_chunksizes_unlimited(self) -> None:
15331541
with self.roundtrip(ds) as actual:
15341542
assert_equal(ds, actual)
15351543

1536-
def test_mask_and_scale(self) -> None:
1544+
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
1545+
def test_mask_and_scale(self, dtype) -> None:
15371546
with create_tmp_file() as tmp_file:
15381547
with nc4.Dataset(tmp_file, mode="w") as nc:
15391548
nc.createDimension("t", 5)
15401549
nc.createVariable("x", "int16", ("t",), fill_value=-1)
15411550
v = nc.variables["x"]
15421551
v.set_auto_maskandscale(False)
15431552
v.add_offset = 10
1544-
v.scale_factor = 0.1
1553+
v.scale_factor = dtype(0.1)
15451554
v[:] = np.array([-1, -1, 0, 1, 2])
15461555

15471556
# first make sure netCDF4 reads the masked and scaled data
15481557
# correctly
15491558
with nc4.Dataset(tmp_file, mode="r") as nc:
15501559
expected = np.ma.array(
1551-
[-1, -1, 10, 10.1, 10.2], mask=[True, True, False, False, False]
1560+
[-1, -1, 10, 10.1, 10.2],
1561+
mask=[True, True, False, False, False],
1562+
dtype=dtype,
15521563
)
15531564
actual = nc.variables["x"][:]
15541565
assert_array_equal(expected, actual)
15551566

15561567
# now check xarray
15571568
with open_dataset(tmp_file) as ds:
1558-
expected = create_masked_and_scaled_data()
1569+
expected = create_masked_and_scaled_data(dtype)
15591570
assert_identical(expected, ds)
15601571

15611572
def test_0dimensional_variable(self) -> None:

0 commit comments

Comments
 (0)