Skip to content

Commit 64ef0b9

Browse files
committed
determine cf packed data dtype
1 parent 86f0209 commit 64ef0b9

File tree

2 files changed

+57
-37
lines changed

2 files changed

+57
-37
lines changed

xarray/coding/variables.py

+18-9
Original file line numberDiff line numberDiff line change
@@ -232,20 +232,29 @@ def _scale_offset_decoding(data, scale_factor, add_offset, dtype: np.typing.DTyp
232232
return data
233233

234234

235-
def _choose_float_dtype(dtype: np.dtype, has_offset: bool) -> type[np.floating[Any]]:
236-
"""Return a float dtype that can losslessly represent `dtype` values."""
237-
# Keep float32 as-is. Upcast half-precision to single-precision,
235+
def _choose_float_dtype(
236+
dtype: np.dtype, encoding: MutableMapping
237+
) -> type[np.floating[Any]]:
238+
# check scale/offset first to derive the dtype from them
239+
if "scale_factor" in encoding or "add_offset" in encoding:
240+
scale_factor = encoding.get("scale_factor", False)
241+
add_offset = encoding.get("add_offset", False)
242+
# minimal floating point size -> 4 byte
243+
maxsize = 4
244+
if scale_factor and np.issubdtype(type(scale_factor), np.floating):
245+
maxsize = max(maxsize, np.dtype(type(scale_factor)).itemsize)
246+
if add_offset and np.issubdtype(type(add_offset), np.floating):
247+
maxsize = max(maxsize, np.dtype(type(add_offset)).itemsize)
248+
return np.dtype(f"float{maxsize * 8}")
249+
# Keep float32 as-is. Upcast half-precision to single-precision,
238250
# because float16 is "intended for storage but not computation"
239251
if dtype.itemsize <= 4 and np.issubdtype(dtype, np.floating):
240252
return np.float32
241253
# float32 can exactly represent all integers up to 24 bits
242254
if dtype.itemsize <= 2 and np.issubdtype(dtype, np.integer):
243255
# A scale factor is entirely safe (vanishing into the mantissa),
244256
# but a large integer offset could lead to loss of precision.
245-
# Sensitivity analysis can be tricky, so we just use a float64
246-
# if there's any offset at all - better unoptimised than wrong!
247-
if not has_offset:
248-
return np.float32
257+
return np.float32
249258
# For all other types and circumstances, we just use float64.
250259
# (safe because eg. complex numbers are not supported in NetCDF)
251260
return np.float64
@@ -281,7 +290,7 @@ def encode(self, variable: Variable, name: T_Name = None) -> Variable:
281290
dims, data, attrs, encoding = unpack_for_encoding(variable)
282291

283292
if "scale_factor" in encoding or "add_offset" in encoding:
284-
dtype = _choose_float_dtype(data.dtype, "add_offset" in encoding)
293+
dtype = _choose_float_dtype(data.dtype, encoding)
285294
data = data.astype(dtype=dtype, copy=True)
286295
if "add_offset" in encoding:
287296
data -= pop_to(encoding, attrs, "add_offset", name=name)
@@ -297,7 +306,7 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable:
297306

298307
scale_factor = pop_to(attrs, encoding, "scale_factor", name=name)
299308
add_offset = pop_to(attrs, encoding, "add_offset", name=name)
300-
dtype = _choose_float_dtype(data.dtype, "add_offset" in encoding)
309+
dtype = _choose_float_dtype(data.dtype, encoding)
301310
if np.ndim(scale_factor) > 0:
302311
scale_factor = np.asarray(scale_factor).item()
303312
if np.ndim(add_offset) > 0:

xarray/tests/test_backends.py

+39-28
Original file line numberDiff line numberDiff line change
@@ -138,96 +138,96 @@ def open_example_mfdataset(names, *args, **kwargs) -> Dataset:
138138
)
139139

140140

141-
def create_masked_and_scaled_data() -> Dataset:
142-
x = np.array([np.nan, np.nan, 10, 10.1, 10.2], dtype=np.float32)
141+
def create_masked_and_scaled_data(dtype=np.float32) -> Dataset:
142+
x = np.array([np.nan, np.nan, 10, 10.1, 10.2], dtype=dtype)
143143
encoding = {
144144
"_FillValue": -1,
145145
"add_offset": 10,
146-
"scale_factor": np.float32(0.1),
146+
"scale_factor": dtype(0.1),
147147
"dtype": "i2",
148148
}
149149
return Dataset({"x": ("t", x, {}, encoding)})
150150

151151

152-
def create_encoded_masked_and_scaled_data() -> Dataset:
153-
attributes = {"_FillValue": -1, "add_offset": 10, "scale_factor": np.float32(0.1)}
152+
def create_encoded_masked_and_scaled_data(dtype=np.float32) -> Dataset:
153+
attributes = {"_FillValue": -1, "add_offset": 10, "scale_factor": dtype(0.1)}
154154
return Dataset(
155155
{"x": ("t", np.array([-1, -1, 0, 1, 2], dtype=np.int16), attributes)}
156156
)
157157

158158

159-
def create_unsigned_masked_scaled_data() -> Dataset:
159+
def create_unsigned_masked_scaled_data(dtype=np.float32) -> Dataset:
160160
encoding = {
161161
"_FillValue": 255,
162162
"_Unsigned": "true",
163163
"dtype": "i1",
164164
"add_offset": 10,
165-
"scale_factor": np.float32(0.1),
165+
"scale_factor": dtype(0.1),
166166
}
167-
x = np.array([10.0, 10.1, 22.7, 22.8, np.nan], dtype=np.float32)
167+
x = np.array([10.0, 10.1, 22.7, 22.8, np.nan], dtype=dtype)
168168
return Dataset({"x": ("t", x, {}, encoding)})
169169

170170

171-
def create_encoded_unsigned_masked_scaled_data() -> Dataset:
171+
def create_encoded_unsigned_masked_scaled_data(dtype=np.float32) -> Dataset:
172172
# These are values as written to the file: the _FillValue will
173173
# be represented in the signed form.
174174
attributes = {
175175
"_FillValue": -1,
176176
"_Unsigned": "true",
177177
"add_offset": 10,
178-
"scale_factor": np.float32(0.1),
178+
"scale_factor": dtype(0.1),
179179
}
180180
# Create unsigned data corresponding to [0, 1, 127, 128, 255] unsigned
181181
sb = np.asarray([0, 1, 127, -128, -1], dtype="i1")
182182
return Dataset({"x": ("t", sb, attributes)})
183183

184184

185-
def create_bad_unsigned_masked_scaled_data() -> Dataset:
185+
def create_bad_unsigned_masked_scaled_data(dtype=np.float32) -> Dataset:
186186
encoding = {
187187
"_FillValue": 255,
188188
"_Unsigned": True,
189189
"dtype": "i1",
190190
"add_offset": 10,
191-
"scale_factor": np.float32(0.1),
191+
"scale_factor": dtype(0.1),
192192
}
193-
x = np.array([10.0, 10.1, 22.7, 22.8, np.nan], dtype=np.float32)
193+
x = np.array([10.0, 10.1, 22.7, 22.8, np.nan], dtype=dtype)
194194
return Dataset({"x": ("t", x, {}, encoding)})
195195

196196

197-
def create_bad_encoded_unsigned_masked_scaled_data() -> Dataset:
197+
def create_bad_encoded_unsigned_masked_scaled_data(dtype=np.float32) -> Dataset:
198198
# These are values as written to the file: the _FillValue will
199199
# be represented in the signed form.
200200
attributes = {
201201
"_FillValue": -1,
202202
"_Unsigned": True,
203203
"add_offset": 10,
204-
"scale_factor": np.float32(0.1),
204+
"scale_factor": dtype(0.1),
205205
}
206206
# Create signed data corresponding to [0, 1, 127, 128, 255] unsigned
207207
sb = np.asarray([0, 1, 127, -128, -1], dtype="i1")
208208
return Dataset({"x": ("t", sb, attributes)})
209209

210210

211-
def create_signed_masked_scaled_data() -> Dataset:
211+
def create_signed_masked_scaled_data(dtype=np.float32) -> Dataset:
212212
encoding = {
213213
"_FillValue": -127,
214214
"_Unsigned": "false",
215215
"dtype": "i1",
216216
"add_offset": 10,
217-
"scale_factor": np.float32(0.1),
217+
"scale_factor": dtype(0.1),
218218
}
219-
x = np.array([-1.0, 10.1, 22.7, np.nan], dtype=np.float32)
219+
x = np.array([-1.0, 10.1, 22.7, np.nan], dtype=dtype)
220220
return Dataset({"x": ("t", x, {}, encoding)})
221221

222222

223-
def create_encoded_signed_masked_scaled_data() -> Dataset:
223+
def create_encoded_signed_masked_scaled_data(dtype=np.float32) -> Dataset:
224224
# These are values as written to the file: the _FillValue will
225225
# be represented in the signed form.
226226
attributes = {
227227
"_FillValue": -127,
228228
"_Unsigned": "false",
229229
"add_offset": 10,
230-
"scale_factor": np.float32(0.1),
230+
"scale_factor": dtype(0.1),
231231
}
232232
# Create signed data corresponding to [0, 1, 127, 128, 255] unsigned
233233
sb = np.asarray([-110, 1, 127, -127], dtype="i1")
@@ -857,6 +857,7 @@ def test_roundtrip_string_with_fill_value_nchar(self) -> None:
857857
with self.roundtrip(original) as actual:
858858
assert_identical(expected, actual)
859859

860+
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
860861
@pytest.mark.parametrize(
861862
"decoded_fn, encoded_fn",
862863
[
@@ -876,12 +877,19 @@ def test_roundtrip_string_with_fill_value_nchar(self) -> None:
876877
(create_masked_and_scaled_data, create_encoded_masked_and_scaled_data),
877878
],
878879
)
879-
def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn) -> None:
880-
decoded = decoded_fn()
881-
encoded = encoded_fn()
880+
def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn, dtype) -> None:
881+
if dtype == np.float32 and isinstance(
882+
self, (TestZarrDirectoryStore, TestZarrDictStore)
883+
):
884+
pytest.skip(
885+
"zarr attributes (e.g. `scale_factor`) are unconditionally promoted to `float64`"
886+
)
887+
decoded = decoded_fn(dtype)
888+
encoded = encoded_fn(dtype)
882889

883890
with self.roundtrip(decoded) as actual:
884891
for k in decoded.variables:
892+
print(k, decoded.variables[k].dtype)
885893
assert decoded.variables[k].dtype == actual.variables[k].dtype
886894
assert_allclose(decoded, actual, decode_bytes=False)
887895

@@ -899,7 +907,7 @@ def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn) -> None:
899907

900908
# make sure roundtrip encoding didn't change the
901909
# original dataset.
902-
assert_allclose(encoded, encoded_fn(), decode_bytes=False)
910+
assert_allclose(encoded, encoded_fn(dtype), decode_bytes=False)
903911

904912
with self.roundtrip(encoded) as actual:
905913
for k in decoded.variables:
@@ -1533,29 +1541,32 @@ def test_encoding_chunksizes_unlimited(self) -> None:
15331541
with self.roundtrip(ds) as actual:
15341542
assert_equal(ds, actual)
15351543

1536-
def test_mask_and_scale(self) -> None:
1544+
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
1545+
def test_mask_and_scale(self, dtype) -> None:
15371546
with create_tmp_file() as tmp_file:
15381547
with nc4.Dataset(tmp_file, mode="w") as nc:
15391548
nc.createDimension("t", 5)
15401549
nc.createVariable("x", "int16", ("t",), fill_value=-1)
15411550
v = nc.variables["x"]
15421551
v.set_auto_maskandscale(False)
15431552
v.add_offset = 10
1544-
v.scale_factor = 0.1
1553+
v.scale_factor = dtype(0.1)
15451554
v[:] = np.array([-1, -1, 0, 1, 2])
15461555

15471556
# first make sure netCDF4 reads the masked and scaled data
15481557
# correctly
15491558
with nc4.Dataset(tmp_file, mode="r") as nc:
15501559
expected = np.ma.array(
1551-
[-1, -1, 10, 10.1, 10.2], mask=[True, True, False, False, False]
1560+
[-1, -1, 10, 10.1, 10.2],
1561+
mask=[True, True, False, False, False],
1562+
dtype=dtype,
15521563
)
15531564
actual = nc.variables["x"][:]
15541565
assert_array_equal(expected, actual)
15551566

15561567
# now check xarray
15571568
with open_dataset(tmp_file) as ds:
1558-
expected = create_masked_and_scaled_data()
1569+
expected = create_masked_and_scaled_data(dtype)
15591570
assert_identical(expected, ds)
15601571

15611572
def test_0dimensional_variable(self) -> None:

0 commit comments

Comments
 (0)