diff --git a/.gitignore b/.gitignore
index fdf1b12d706..1acc1c789a0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -69,3 +69,4 @@
 xarray/version.py
 Icon*
 .ipynb_checkpoints
+.nfs*
diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index 9c88445b5ba..0cddf9cba36 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -119,6 +119,23 @@ Other enhancements
   See :ref:`compute.using_coordinates` for the detail.
   (:issue:`1332`) By `Keisuke Fujii `_.
+- :py:meth:`pandas.Series.dropna` is now supported for a
+  :py:class:`pandas.Series` indexed by a :py:class:`~xarray.CFTimeIndex`
+  (:issue:`2688`). By `Spencer Clark `_.
+- Variables are now unpacked using the dtypes of their ``scale_factor`` and
+  ``add_offset`` attributes when present, following the
+  `CF conventions `_.
+  By `Daoud Jahdou `_.
+- :py:meth:`~xarray.cftime_range` now supports QuarterBegin and QuarterEnd
+  offsets (:issue:`2663`). By `Jwen Fai Low `_.
+- :py:meth:`~xarray.open_dataset` now accepts a ``use_cftime`` argument, which
+  can be used to require that ``cftime.datetime`` objects are always used, or
+  never used, when decoding dates encoded with a standard calendar. This can
+  be used to ensure consistent date types are returned when using
+  :py:meth:`~xarray.open_mfdataset` (:issue:`1263`) and/or to silence
+  serialization warnings raised if dates from a standard calendar are found to
+  be outside the :py:class:`pandas.Timestamp`-valid range (:issue:`2754`). By
+  `Spencer Clark `_.
+
 - Added :py:meth:`~xarray.Dataset.drop_dims` (:issue:`1949`).
   By `Kevin Squire `_.
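The unpacking entry above changes which dtype ``open_dataset`` returns. A minimal sketch of the new behaviour (the file ``packed.nc`` and variable ``x`` are hypothetical; assumes ``x`` is stored as ``int16`` with a ``float64`` ``scale_factor`` and no fill value):

import numpy as np
import xarray as xr

# Hypothetical file: 'x' is packed as int16 with scale_factor = 0.01,
# a float64 attribute.
ds = xr.open_dataset('packed.nc')

# Per the CF conventions, the unpacked values take the attributes' dtype,
# so 'x' now decodes to float64 rather than the previous blanket float32.
assert ds['x'].dtype == np.float64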
diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py
index 1f74181f3b3..8f188061e83 100644
--- a/xarray/coding/variables.py
+++ b/xarray/coding/variables.py
@@ -189,8 +189,49 @@ def _scale_offset_decoding(data, scale_factor, add_offset, dtype):
     return data


-def _choose_float_dtype(dtype, has_offset):
+def _choose_decoding_float_dtype(dtype, scale_factor, add_offset):
+    """Return a float dtype for unpacking, per the CF conventions."""
+    # Implements the packed-data rules of the CF conventions,
+    # http://cfconventions.org/Data/cf-conventions/cf-conventions-1.7/build/ch08.html:
+    # if the scale_factor and add_offset attributes are of the same data
+    # type as the associated variable, the unpacked data is assumed to be
+    # of the same data type as the packed data. However, if they are of a
+    # different data type from the variable (containing the packed data),
+    # the unpacked data should match the type of these attributes, which
+    # must both be of type float or both be of type double. An additional
+    # restriction in this case is that the variable containing the packed
+    # data must be of type byte, short or int. Unpacking an int into a
+    # float is not advised, as there is a potential loss of precision.
+    if scale_factor is not None or add_offset is not None:
+        types = [np.dtype(dtype)]
+        if scale_factor is not None:
+            types.append(np.dtype(type(scale_factor)))
+        if add_offset is not None:
+            types.append(np.dtype(type(add_offset)))
+
+        # the unpacked dtype is the largest type we find
+        scaled_dtype = dtypes.result_type(*types)
+
+        # use it only if it is float32 or float64
+        if (scaled_dtype.itemsize >= 4
+                and np.issubdtype(scaled_dtype, np.floating)):
+            return scaled_dtype
+
+    return _choose_encoding_float_dtype(dtype, add_offset is not None)
+
+
+def _choose_encoding_float_dtype(dtype, has_offset):
     """Return a float dtype that can losslessly represent `dtype` values."""
+    # Keep float32 as-is. Upcast half-precision to single-precision,
     # because float16 is "intended for storage but not computation"
     if dtype.itemsize <= 4 and np.issubdtype(dtype, np.floating):
@@ -217,9 +258,9 @@ class CFScaleOffsetCoder(VariableCoder):

     def encode(self, variable, name=None):
         dims, data, attrs, encoding = unpack_for_encoding(variable)

-        if 'scale_factor' in encoding or 'add_offset' in encoding:
-            dtype = _choose_float_dtype(data.dtype, 'add_offset' in encoding)
+        dtype = _choose_encoding_float_dtype(data.dtype,
+                                             'add_offset' in encoding)
         data = data.astype(dtype=dtype, copy=True)
         if 'add_offset' in encoding:
             data -= pop_to(encoding, attrs, 'add_offset', name=name)
@@ -232,9 +273,12 @@ def decode(self, variable, name=None):
         dims, data, attrs, encoding = unpack_for_decoding(variable)

         if 'scale_factor' in attrs or 'add_offset' in attrs:
+            scale_factor = pop_to(attrs, encoding, 'scale_factor', name=name)
             add_offset = pop_to(attrs, encoding, 'add_offset', name=name)
-            dtype = _choose_float_dtype(data.dtype, 'add_offset' in attrs)
+            dtype = _choose_decoding_float_dtype(data.dtype,
+                                                 scale_factor, add_offset)
+
             transform = partial(_scale_offset_decoding,
                                 scale_factor=scale_factor,
                                 add_offset=add_offset,
diff --git a/xarray/conventions.py b/xarray/conventions.py
index 5f41639e890..9588ee41d52 100644
--- a/xarray/conventions.py
+++ b/xarray/conventions.py
@@ -222,7 +222,6 @@ def encode_cf_variable(var, needs_copy=True, name=None):
     A variable which has been encoded as described above.
     """
     ensure_not_multiindex(var, name=name)
-
     for coder in [times.CFDatetimeCoder(),
                   times.CFTimedeltaCoder(),
                   variables.CFScaleOffsetCoder(),
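The dtype selection in ``_choose_decoding_float_dtype`` above reduces to NumPy type promotion. A standalone sketch with plain ``np.result_type`` (xarray's internal ``dtypes.result_type`` behaves the same for these numeric inputs):

import numpy as np

# Packed variable dtype and its packing attributes, as decode() sees them.
packed_dtype = np.dtype('int16')
scale_factor = 0.01            # a Python float, i.e. float64
add_offset = np.float32(10.0)  # a float32 attribute

# Promote the packed dtype with the attribute dtypes; the widest type wins.
unpacked = np.result_type(packed_dtype,
                          np.dtype(type(scale_factor)),
                          np.dtype(type(add_offset)))
assert unpacked == np.float64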
diff --git a/xarray/core/dtypes.py b/xarray/core/dtypes.py
index 00ff7958183..803c5c4064c 100644
--- a/xarray/core/dtypes.py
+++ b/xarray/core/dtypes.py
@@ -30,7 +30,6 @@ def __eq__(self, other):

 INF = AlwaysGreaterThan()
 NINF = AlwaysLessThan()
-
 # Pairs of types that, if both found, should be promoted to object dtype
 # instead of following NumPy's own type-promotion rules. These type promotion
 # rules match pandas instead. For reference, see the NumPy type hierarchy:
diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py
index a20ba2df229..0cd394d75cd 100644
--- a/xarray/tests/test_backends.py
+++ b/xarray/tests/test_backends.py
@@ -685,7 +685,9 @@ def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn):
         with self.roundtrip(decoded) as actual:
             for k in decoded.variables:
                 assert (decoded.variables[k].dtype
-                        == actual.variables[k].dtype)
+                        == actual.variables[k].dtype
+                        or (decoded.variables[k].dtype == np.float32 and
+                            actual.variables[k].dtype == np.float64))
             assert_allclose(decoded, actual, decode_bytes=False)

         with self.roundtrip(decoded,
@@ -711,7 +713,9 @@ def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn):
         with self.roundtrip(encoded) as actual:
             for k in decoded.variables:
                 assert (decoded.variables[k].dtype ==
-                        actual.variables[k].dtype)
+                        actual.variables[k].dtype or
+                        (decoded.variables[k].dtype == np.float32 and
+                         actual.variables[k].dtype == np.float64))
             assert_allclose(decoded, actual, decode_bytes=False)

     def test_coordinates_encoding(self):
@@ -1156,15 +1160,16 @@ def test_mask_and_scale(self):
             nc.createVariable('x', 'int16', ('t',), fill_value=-1)
             v = nc.variables['x']
             v.set_auto_maskandscale(False)
-            v.add_offset = 10
-            v.scale_factor = 0.1
+            v.add_offset = np.float32(10)
+            v.scale_factor = np.float32(0.1)
             v[:] = np.array([-1, -1, 0, 1, 2])

         # first make sure netCDF4 reads the masked and scaled data
         # correctly
         with nc4.Dataset(tmp_file, mode='r') as nc:
             expected = np.ma.array([-1, -1, 10, 10.1, 10.2],
-                                   mask=[True, True, False, False, False])
+                                   mask=[True, True, False, False, False],
+                                   dtype=np.float32)
             actual = nc.variables['x'][:]
             assert_array_equal(expected, actual)
@@ -1173,6 +1178,25 @@ def test_mask_and_scale(self):
             expected = create_masked_and_scaled_data()
             assert_identical(expected, ds)

+    def test_mask_and_scale_with_float64_scale_factor(self):
+        with create_tmp_file() as tmp_file:
+            with nc4.Dataset(tmp_file, mode='w') as nc:
+                nc.createDimension('t', 5)
+                nc.createVariable('x', 'int16', ('t',), fill_value=-1)
+                v = nc.variables['x']
+                v.scale_factor = 0.01
+                v.add_offset = 10
+                v[:] = np.array([-1123, -1123, 123, 1123, 2123])
+            # first read the newly created netCDF file with netCDF4 itself
+            with nc4.Dataset(tmp_file, mode='r') as nc:
+                # then check that xarray unpacks the variable the same way:
+                # both arrays should compare equal, and both should have
+                # been unpacked to float64
+                with open_dataset(tmp_file) as ds:
+                    dsv = ds['x'].values
+                    ncv = nc.variables['x'][:]
+                    np.testing.assert_array_almost_equal(dsv, ncv, 15)
+
     def test_0dimensional_variable(self):
         # This fix verifies our work-around to this netCDF4-python bug:
         # https://github.com/Unidata/netcdf4-python/pull/220
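The expected values in ``test_mask_and_scale_with_float64_scale_factor`` can be checked by hand: unpacking computes ``packed * scale_factor + add_offset``, and the ``float64`` ``scale_factor`` pulls the result up to ``float64``. A quick sketch (fill values ignored):

import numpy as np

packed = np.array([123, 1123, 2123], dtype=np.int16)
unpacked = packed * 0.01 + 10  # float64 scale_factor -> float64 result
assert unpacked.dtype == np.float64
np.testing.assert_allclose(unpacked, [11.23, 21.23, 31.23])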
diff --git a/xarray/tests/test_coding.py b/xarray/tests/test_coding.py
index 95c8ebc0b42..a98d5c857f9 100644
--- a/xarray/tests/test_coding.py
+++ b/xarray/tests/test_coding.py
@@ -12,6 +12,10 @@
 import dask.array as da


+def reverse_list_of_tuple(plist):
+    return [tuple(reversed(t)) for t in plist]
+
+
 def test_CFMaskCoder_decode():
     original = xr.Variable(('x',), [0, -1, 1], {'_FillValue': -1})
     expected = xr.Variable(('x',), [0, np.nan, 1])
@@ -43,10 +47,145 @@ def test_coder_roundtrip():

 @pytest.mark.parametrize('dtype', 'u1 u2 i1 i2 f2 f4'.split())
 def test_scaling_converts_to_float32(dtype):
     original = xr.Variable(('x',), np.arange(10, dtype=dtype),
-                           encoding=dict(scale_factor=10))
+                           encoding=dict(scale_factor=np.float32(10)))
     coder = variables.CFScaleOffsetCoder()
     encoded = coder.encode(original)
     assert encoded.dtype == np.float32
     roundtripped = coder.decode(encoded)
-    assert_identical(original, roundtripped)
     assert roundtripped.dtype == np.float32
+    assert_identical(original, roundtripped)
+
+
+all_possible_types = [np.uint8(8),
+                      np.uint16(16),
+                      np.uint32(32),
+                      np.uint64(64),
+                      np.int8(80),
+                      np.int16(160),
+                      np.int32(320),
+                      np.int64(640),
+                      1,
+                      0.01,
+                      np.float16(1600),
+                      np.float32(3200),
+                      np.float64(6400)]
+
+# In all cases encoding returns either np.float32 or np.float64, and it
+# only cares about the existence of add_offset when the variable is an
+# integer:
+# - if the variable is a float, the encoded dtype is np.float32;
+# - if the variable is an integer with add_offset, it is np.float64;
+# - if the variable is an integer without add_offset, it is np.float32;
+# - in all other cases it is np.float64.
+# Decoding is the equivalent of the unpacking mentioned in the
+# cf-convention: in all cases decoding starts from the encoded dtype
+# (np.float32 or np.float64), and the decoded dtype is the largest of
+# scale_factor, add_offset and the *encoded* dtype, not the original one.
+
+#############################
+# Case 1: variable has offset
+#############################
+
+# Case 1.1: variable is float
+# encoded should be np.float32;
+# decoded should be np.float32 if (scale_factor, add_offset) is in the
+# following list, and np.float64 otherwise
+combinations_for_float32 = [
+    (np.uint8(8), np.uint16(16)),
+    (np.uint8(8), np.int8(80)),
+    (np.uint8(8), np.int16(160)),
+    (np.uint8(8), np.float16(1600)),
+    (np.uint8(8), np.float32(3200)),
+    (np.uint16(16), np.float16(1600)),
+    (np.uint16(16), np.float32(3200)),
+    (np.int8(80), np.int16(160)),
+    (np.int8(80), np.float16(1600)),
+    (np.int8(80), np.float32(3200)),
+    (np.int16(160), np.float16(1600)),
+    (np.int16(160), np.float32(3200)),
+    (np.float16(1600), np.float32(3200)),
+    (np.float32(3200), np.float32(3200)),
+    (np.float16(1600), np.float16(1600)),
+    (np.int16(160), np.int16(160)),
+    (np.int8(80), np.int8(80)),
+    (np.uint16(16), np.uint16(16)),
+    (np.uint8(8), np.uint8(8)),
+]
+combinations_for_float32.extend(
+    reverse_list_of_tuple(combinations_for_float32))
+
+
+@pytest.mark.parametrize('dtype', 'f2 f4'.split())
+@pytest.mark.parametrize('scale_factor', all_possible_types)
+@pytest.mark.parametrize('add_offset', all_possible_types)
+def test_cfscaleoffset_case_1_float_var(dtype, scale_factor, add_offset):
+    original = xr.Variable(('x',), np.arange(10, dtype=dtype),
+                           encoding=dict(scale_factor=scale_factor,
+                                         add_offset=add_offset))
+
+    coder = variables.CFScaleOffsetCoder()
+    encoded = coder.encode(original)
+    assert encoded.dtype == np.float32
+    roundtripped = coder.decode(encoded)
+
+    if (scale_factor, add_offset) in combinations_for_float32:
+        assert roundtripped.dtype == np.float32
+    else:
+        assert roundtripped.dtype == np.float64
+
+
+# Case 1.2: variable is integer
+# encoded should be np.float64, as we have an offset;
+# decoded should always be np.float64
+@pytest.mark.parametrize('dtype', 'u1 u2 i1 i2'.split())
+@pytest.mark.parametrize('scale_factor', all_possible_types)
+@pytest.mark.parametrize('add_offset', all_possible_types)
+def test_cfscaleoffset_case_1_int_var(dtype, scale_factor, add_offset):
+    original = xr.Variable(('x',), np.arange(10, dtype=dtype),
+                           encoding=dict(scale_factor=scale_factor,
+                                         add_offset=add_offset))
+
+    coder = variables.CFScaleOffsetCoder()
+    encoded = coder.encode(original)
+    assert encoded.dtype == np.float64
+    roundtripped = coder.decode(encoded)
+
+    assert roundtripped.dtype == np.float64
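Why Case 1.2 always decodes to ``np.float64``: with ``add_offset`` present, an integer variable encodes to ``np.float64``, and the decode-side promotion can only widen a dtype, never narrow it. A plain-NumPy sketch of that argument:

import numpy as np

# Decoding starts from the encoded dtype, which is float64 in Case 1.2.
encoded_dtype = np.dtype('float64')

# Promoting float64 against any attribute dtype never narrows the result.
for attr in (np.int8(1), np.float16(1), np.float32(1), np.float64(1)):
    assert np.result_type(encoded_dtype, np.dtype(type(attr))) == np.float64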
+
+
+####################################
+# Case 2: variable has no add_offset
+####################################
+
+# Case 2.1: for any variable dtype,
+# encoded should be np.float32;
+# decoded should be np.float32 if scale_factor is in the following list,
+# and np.float64 otherwise
+types_for_float32 = [np.uint8(8),
+                     np.uint16(16),
+                     np.int8(80),
+                     np.int16(160),
+                     np.float16(1600),
+                     np.float32(3200)]
+
+
+@pytest.mark.parametrize('dtype', 'u1 u2 i1 i2 f2 f4'.split())
+@pytest.mark.parametrize('scale_factor', all_possible_types)
+def test_cfscaleoffset_case_2(dtype, scale_factor):
+    original = xr.Variable(('x',), np.arange(10, dtype=dtype),
+                           encoding=dict(scale_factor=scale_factor))
+
+    coder = variables.CFScaleOffsetCoder()
+    encoded = coder.encode(original)
+    assert encoded.dtype == np.float32
+    roundtripped = coder.decode(encoded)
+    if scale_factor in types_for_float32:
+        assert roundtripped.dtype == np.float32
+    else:
+        assert roundtripped.dtype == np.float64
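For a concrete instance of Case 2's ``np.float64`` branch, the coder can be run outside pytest; a sketch using the same public classes the tests import, with the behaviour described in the comments above:

import numpy as np
import xarray as xr
from xarray.coding import variables

# int16 data with a Python-float (i.e. float64) scale_factor: the variable
# still encodes to float32, but decoding promotes the result to float64
# because the scale_factor dtype is wider than the encoded dtype.
original = xr.Variable(('x',), np.arange(10, dtype='i2'),
                       encoding=dict(scale_factor=0.01))
coder = variables.CFScaleOffsetCoder()
encoded = coder.encode(original)
assert encoded.dtype == np.float32
roundtripped = coder.decode(encoded)
assert roundtripped.dtype == np.float64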