From fef5fe7477f314992e0996dcbb6951bc1c4fe266 Mon Sep 17 00:00:00 2001 From: Daoud Jahdou Date: Thu, 7 Feb 2019 15:10:38 +0100 Subject: [PATCH 01/13] ENH: ability to force promote float64 When dealing with int variables with scale factor xarray transforms them to float32 by default. This enhancement aims to give the possibility to force that to float64 when needed. The behaviour is unchanged when not specifying the parameter force_promote_float64. --- doc/whats-new.rst | 6 +++++- xarray/backends/api.py | 13 ++++++++++++- xarray/core/dtypes.py | 7 ++++++- xarray/tests/test_backends.py | 18 ++++++++++++++++++ 4 files changed, 41 insertions(+), 3 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 142ef3a2e51..ec8bc782e6e 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -24,7 +24,7 @@ Breaking changes - Remove support for Python 2. This is the first version of xarray that is Python 3 only. (:issue:`1876`). By `Joe Hamman `_. -- The `compat` argument to `Dataset` and the `encoding` argument to +- The `compat` argument to `Dataset` and the `encoding` argument to `DataArray` are deprecated and will be removed in a future release. (:issue:`1188`) By `Maximilian Roos `_. @@ -62,6 +62,10 @@ Enhancements - :py:meth:`pandas.Series.dropna` is now supported for a :py:class:`pandas.Series` indexed by a :py:class:`~xarray.CFTimeIndex` (:issue:`2688`). By `Spencer Clark `_. +- Ability to promote to float64 instead of float32 when dealing int type + variables with scale_factor added parameter force_promote_float64 + to :py:meth:`~xarray.open_dataset` and :py:meth:`~xarray.open_dataarray` + that enables this behaviour. By `Daoud Jahdou `_. Bug fixes ~~~~~~~~~ diff --git a/xarray/backends/api.py b/xarray/backends/api.py index e52f47a0841..04ec1e0ce3c 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -14,6 +14,7 @@ from ..core.utils import close_on_error, is_grib_path, is_remote_uri from .common import ArrayWriter from .locks import _get_scheduler +from ..core import dtypes DATAARRAY_NAME = '__xarray_dataarray_name__' DATAARRAY_VARIABLE = '__xarray_dataarray_variable__' @@ -161,6 +162,7 @@ def open_dataset(filename_or_obj, group=None, decode_cf=True, mask_and_scale=None, decode_times=True, autoclose=None, concat_characters=True, decode_coords=True, engine=None, chunks=None, lock=None, cache=None, drop_variables=None, + force_promote_float64=False, backend_kwargs=None): """Load and decode a dataset from a file or file-like object. @@ -227,6 +229,9 @@ def open_dataset(filename_or_obj, group=None, decode_cf=True, A variable or list of variables to exclude from being parsed from the dataset. This may be useful to drop variables with problems or inconsistent values. + force_promote_float64: bool + If True force int variables to be promoted to float64 instead of + float32 backend_kwargs: dictionary, optional A dictionary of keyword arguments to pass on to the backend.
This may be useful when backend options would improve performance or @@ -262,6 +267,8 @@ def open_dataset(filename_or_obj, group=None, decode_cf=True, if cache is None: cache = chunks is None + dtypes.FORCE_PROMOTE_FLOAT64 = force_promote_float64 + if backend_kwargs is None: backend_kwargs = {} @@ -360,7 +367,7 @@ def open_dataarray(filename_or_obj, group=None, decode_cf=True, mask_and_scale=None, decode_times=True, autoclose=None, concat_characters=True, decode_coords=True, engine=None, chunks=None, lock=None, cache=None, drop_variables=None, - backend_kwargs=None): + force_promote_float64=False, backend_kwargs=None): """Open an DataArray from a netCDF file containing a single data variable. This is designed to read netCDF files with only one data variable. If @@ -424,6 +431,9 @@ def open_dataarray(filename_or_obj, group=None, decode_cf=True, A variable or list of variables to exclude from being parsed from the dataset. This may be useful to drop variables with problems or inconsistent values. + force_promote_float64: bool + If True force int variables to be promoted to float64 instead of + float32 backend_kwargs: dictionary, optional A dictionary of keyword arguments to pass on to the backend. This may be useful when backend options would improve performance or @@ -450,6 +460,7 @@ def open_dataarray(filename_or_obj, group=None, decode_cf=True, decode_coords=decode_coords, engine=engine, chunks=chunks, lock=lock, cache=cache, drop_variables=drop_variables, + force_promote_float64=force_promote_float64, backend_kwargs=backend_kwargs) if len(dataset.data_vars) != 1: diff --git a/xarray/core/dtypes.py b/xarray/core/dtypes.py index 00ff7958183..0e2c9c04dbb 100644 --- a/xarray/core/dtypes.py +++ b/xarray/core/dtypes.py @@ -30,6 +30,11 @@ def __eq__(self, other): INF = AlwaysGreaterThan() NINF = AlwaysLessThan() +# Constant that indicates that we +# must switch to float64 instead of +# float32 even if dtype.itemsize <= 2 +FORCE_PROMOTE_FLOAT64 = False + # Pairs of types that, if both found, should be promoted to object dtype # instead of following NumPy's own type-promotion rules. 
These type promotion @@ -63,7 +68,7 @@ def maybe_promote(dtype): # Check np.timedelta64 before np.integer fill_value = np.timedelta64('NaT') elif np.issubdtype(dtype, np.integer): - if dtype.itemsize <= 2: + if dtype.itemsize <= 2 and not FORCE_PROMOTE_FLOAT64: dtype = np.float32 else: dtype = np.float64 diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index e9d7a9f65d6..67508d326e4 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1156,6 +1156,24 @@ def test_mask_and_scale(self): expected = create_masked_and_scaled_data() assert_identical(expected, ds) + def test_force_promote_float64_variable(self): + with create_tmp_file() as tmp_file: + with nc4.Dataset(tmp_file, mode='w') as nc: + nc.createDimension('t', 5) + nc.createVariable('x', 'int16', ('t',), fill_value=-1) + v = nc.variables['x'] + v.scale_factor = 0.01 + v[:] = np.array([-1123, -1123, 123, 1123, 2123]) + # We read the newly created netcdf file + with nc4.Dataset(tmp_file, mode='r') as nc: + # we open the dataset with forced promotion to 64 bit + with open_dataset(tmp_file, force_promote_float64=True) as ds: + # Both dataset values should be equal + # And both of float64 array type + dsv = ds['x'].values + ncv = nc.variables['x'][:] + np.testing.assert_array_almost_equal(dsv, ncv, 15) + def test_0dimensional_variable(self): # This fix verifies our work-around to this netCDF4-python bug: # https://github.com/Unidata/netcdf4-python/pull/220 From 093a33822e57e38a0fa9e685635ead94f0c31e0b Mon Sep 17 00:00:00 2001 From: Daoud Jahdou Date: Thu, 7 Feb 2019 15:17:56 +0100 Subject: [PATCH 02/13] updated documentation #2304 --- xarray/backends/api.py | 8 ++++---- xarray/core/dtypes.py | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 04ec1e0ce3c..e4381d35db1 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -230,8 +230,8 @@ def open_dataset(filename_or_obj, group=None, decode_cf=True, dataset. This may be useful to drop variables with problems or inconsistent values. force_promote_float64: bool - If True force int variables to be promoted to float64 instead of - float32 + If True force int variables with scale factorto be promoted to + float64 instead of float32 backend_kwargs: dictionary, optional A dictionary of keyword arguments to pass on to the backend. This may be useful when backend options would improve performance or @@ -432,8 +432,8 @@ def open_dataarray(filename_or_obj, group=None, decode_cf=True, dataset. This may be useful to drop variables with problems or inconsistent values. force_promote_float64: bool - If True force int variables to be promoted to float64 instead of - float32 + If True force int variables with scale factor to be promoted to + float64 instead of float32 backend_kwargs: dictionary, optional A dictionary of keyword arguments to pass on to the backend. 
This may be useful when backend options would improve performance or diff --git a/xarray/core/dtypes.py b/xarray/core/dtypes.py index 0e2c9c04dbb..d3dbc8c0df6 100644 --- a/xarray/core/dtypes.py +++ b/xarray/core/dtypes.py @@ -30,8 +30,8 @@ def __eq__(self, other): INF = AlwaysGreaterThan() NINF = AlwaysLessThan() -# Constant that indicates that we -# must switch to float64 instead of +# Constant that indicates if we +# want to promote to float64 instead of # float32 even if dtype.itemsize <= 2 FORCE_PROMOTE_FLOAT64 = False From 47d42078ab81041cf887c9118360a5e7f8e0dc44 Mon Sep 17 00:00:00 2001 From: Daoud Jahdou Date: Thu, 7 Feb 2019 15:29:27 +0100 Subject: [PATCH 03/13] Fixed typos doc #2304 --- xarray/backends/api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index e4381d35db1..f0847be4037 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -230,7 +230,7 @@ def open_dataset(filename_or_obj, group=None, decode_cf=True, dataset. This may be useful to drop variables with problems or inconsistent values. force_promote_float64: bool - If True force int variables with scale factorto be promoted to + If True force int variables with scale factor to be promoted to float64 instead of float32 backend_kwargs: dictionary, optional A dictionary of keyword arguments to pass on to the backend. This From 3804d56f71de35ec82ec6274afd7c0d5625b72c8 Mon Sep 17 00:00:00 2001 From: Daoud Jahdou Date: Thu, 7 Feb 2019 15:46:04 +0100 Subject: [PATCH 04/13] Minor modification to documentation --- doc/whats-new.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index ec8bc782e6e..f1f8e4b56df 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -62,8 +62,9 @@ Enhancements - :py:meth:`pandas.Series.dropna` is now supported for a :py:class:`pandas.Series` indexed by a :py:class:`~xarray.CFTimeIndex` (:issue:`2688`). By `Spencer Clark `_. -- Ability to promote to float64 instead of float32 when dealing int type - variables with scale_factor added parameter force_promote_float64 +- Ability to promote to float64 instead of float32 when dealing with int type + variables with scale_factor. + Added parameter force_promote_float64 (False by default) to :py:meth:`~xarray.open_dataset` and :py:meth:`~xarray.open_dataarray` that enables this behaviour. By `Daoud Jahdou `_. From 014254665015efdd6b1db9fb405cfdf6371f908b Mon Sep 17 00:00:00 2001 From: Daoud Jahdou Date: Thu, 7 Feb 2019 17:02:51 +0100 Subject: [PATCH 05/13] Updated doc --- doc/whats-new.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index f1f8e4b56df..e8dc01fd3a4 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -66,7 +66,8 @@ Enhancements variables with scale_factor. Added parameter force_promote_float64 (False by default) to :py:meth:`~xarray.open_dataset` and :py:meth:`~xarray.open_dataarray` - that enables this behaviour. By `Daoud Jahdou `_. + that enables this behaviour. + By `Daoud Jahdou `_. 
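To make the proposed behaviour concrete, a minimal usage sketch (the file name is illustrative, and it assumes a small-int variable packed with a scale_factor attribute; note that later commits in this series drop the flag again in favour of CF-convention-based dtype selection):

    import numpy as np
    import xarray as xr

    # Default: int16 data with a scale_factor unpacks to float32.
    ds = xr.open_dataset('packed.nc')
    assert ds['x'].dtype == np.float32

    # Proposed flag: the same variable unpacks to float64 instead.
    ds64 = xr.open_dataset('packed.nc', force_promote_float64=True)
    assert ds64['x'].dtype == np.float64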
Bug fixes ~~~~~~~~~ From 5b421c8392f838f7420a69496c0a8e307e1d7fd4 Mon Sep 17 00:00:00 2001 From: Daoud Jahdou Date: Mon, 11 Feb 2019 17:15:34 +0100 Subject: [PATCH 06/13] Propagated the force_promote_float64 parameter down to CFScaleOffsetCoder --- xarray/backends/api.py | 3 ++- xarray/coding/variables.py | 13 ++++++++++--- xarray/conventions.py | 30 +++++++++++++++++++++--------- xarray/core/dtypes.py | 8 +------- 4 files changed, 34 insertions(+), 20 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index f0847be4037..f942f6cb7a9 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -276,7 +276,8 @@ def maybe_decode_store(store, lock=False): ds = conventions.decode_cf( store, mask_and_scale=mask_and_scale, decode_times=decode_times, concat_characters=concat_characters, decode_coords=decode_coords, - drop_variables=drop_variables) + drop_variables=drop_variables, + force_promote_float64=force_promote_float64) _protect_dataset_variables_inplace(ds, cache) diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index 1f74181f3b3..fc483b0eb8e 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -189,8 +189,10 @@ def _scale_offset_decoding(data, scale_factor, add_offset, dtype): return data -def _choose_float_dtype(dtype, has_offset): +def _choose_float_dtype(dtype, has_offset, force_promote_float64=False): """Return a float dtype that can losslessly represent `dtype` values.""" + if force_promote_float64: + return np.float64 # Keep float32 as-is. Upcast half-precision to single-precision, # because float16 is "intended for storage but not computation" if dtype.itemsize <= 4 and np.issubdtype(dtype, np.floating): @@ -228,13 +230,18 @@ def encode(self, variable, name=None): return Variable(dims, data, attrs, encoding) - def decode(self, variable, name=None): + def decode(self, variable, force_promote_float64=False, name=None): dims, data, attrs, encoding = unpack_for_decoding(variable) if 'scale_factor' in attrs or 'add_offset' in attrs: scale_factor = pop_to(attrs, encoding, 'scale_factor', name=name) add_offset = pop_to(attrs, encoding, 'add_offset', name=name) - dtype = _choose_float_dtype(data.dtype, 'add_offset' in attrs) + # Avoiding line too long warning in + # _choose_float_dtype + force_promote = force_promote_float64 + dtype = _choose_float_dtype(data.dtype, 'add_offset' in attrs, + force_promote_float64=force_promote) + transform = partial(_scale_offset_decoding, scale_factor=scale_factor, add_offset=add_offset, diff --git a/xarray/conventions.py b/xarray/conventions.py index c1c95a6b60e..b24bdcbf5f7 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -240,7 +240,7 @@ def encode_cf_variable(var, needs_copy=True, name=None): def decode_cf_variable(name, var, concat_characters=True, mask_and_scale=True, decode_times=True, decode_endianness=True, - stack_char_dim=True): + stack_char_dim=True, force_promote_float64=False): """ Decodes a variable which may hold CF encoded information. @@ -270,7 +270,9 @@ def decode_cf_variable(name, var, concat_characters=True, mask_and_scale=True, Whether to stack characters into bytes along the last dimension of this array. Passed as an argument because we need to look at the full dataset to figure out if this is appropriate. 
- + force_promote_float64: bool + If True force int variables with scale factor to be promoted to + float64 instead of float32 Returns ------- out : Variable @@ -286,10 +288,14 @@ def decode_cf_variable(name, var, concat_characters=True, mask_and_scale=True, if mask_and_scale: for coder in [variables.UnsignedIntegerCoder(), - variables.CFMaskCoder(), - variables.CFScaleOffsetCoder()]: + variables.CFMaskCoder()]: var = coder.decode(var, name=name) + coder = variables.CFScaleOffsetCoder() + var = coder.decode(var, + name=name, + force_promote_float64=force_promote_float64) + if decode_times: for coder in [times.CFTimedeltaCoder(), times.CFDatetimeCoder()]: @@ -346,7 +352,8 @@ def _update_bounds_attributes(variables): def decode_cf_variables(variables, attributes, concat_characters=True, mask_and_scale=True, decode_times=True, - decode_coords=True, drop_variables=None): + decode_coords=True, drop_variables=None, + force_promote_float64=False): """ Decode several CF encoded variables. @@ -387,7 +394,8 @@ def stackable(dim): new_vars[k] = decode_cf_variable( k, v, concat_characters=concat_characters, mask_and_scale=mask_and_scale, decode_times=decode_times, - stack_char_dim=stack_char_dim) + stack_char_dim=stack_char_dim, + force_promote_float64=force_promote_float64) if decode_coords: var_attrs = new_vars[k].attrs if 'coordinates' in var_attrs: @@ -406,7 +414,8 @@ def stackable(dim): def decode_cf(obj, concat_characters=True, mask_and_scale=True, - decode_times=True, decode_coords=True, drop_variables=None): + decode_times=True, decode_coords=True, drop_variables=None, + force_promote_float64=False): """Decode the given Dataset or Datastore according to CF conventions into a new Dataset. @@ -430,7 +439,9 @@ def decode_cf(obj, concat_characters=True, mask_and_scale=True, A variable or list of variables to exclude from being parsed from the dataset. This may be useful to drop variables with problems or inconsistent values. - + force_promote_float64: bool + If True force int variables with scale factor to be promoted to + float64 instead of float32 Returns ------- decoded : Dataset @@ -454,7 +465,8 @@ def decode_cf(obj, concat_characters=True, mask_and_scale=True, vars, attrs, coord_names = decode_cf_variables( vars, attrs, concat_characters, mask_and_scale, decode_times, - decode_coords, drop_variables=drop_variables) + decode_coords, drop_variables=drop_variables, + force_promote_float64=force_promote_float64) ds = Dataset(vars, attrs=attrs) ds = ds.set_coords(coord_names.union(extra_coords).intersection(vars)) ds._file_obj = file_obj diff --git a/xarray/core/dtypes.py b/xarray/core/dtypes.py index d3dbc8c0df6..803c5c4064c 100644 --- a/xarray/core/dtypes.py +++ b/xarray/core/dtypes.py @@ -30,12 +30,6 @@ def __eq__(self, other): INF = AlwaysGreaterThan() NINF = AlwaysLessThan() -# Constant that indicates if we -# want to promote to float64 instead of -# float32 even if dtype.itemsize <= 2 -FORCE_PROMOTE_FLOAT64 = False - - # Pairs of types that, if both found, should be promoted to object dtype # instead of following NumPy's own type-promotion rules. These type promotion # rules match pandas instead. 
For reference, see the NumPy type hierarchy: @@ -68,7 +62,7 @@ def maybe_promote(dtype): # Check np.timedelta64 before np.integer fill_value = np.timedelta64('NaT') elif np.issubdtype(dtype, np.integer): - if dtype.itemsize <= 2 and not FORCE_PROMOTE_FLOAT64: + if dtype.itemsize <= 2: dtype = np.float32 else: dtype = np.float64 From c1a2a03ca71db09d0857290e3d646b5a3b1e9b38 Mon Sep 17 00:00:00 2001 From: Daoud Jahdou Date: Tue, 12 Feb 2019 10:38:46 +0100 Subject: [PATCH 07/13] removing dtypes constant mutation --- xarray/backends/api.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index f942f6cb7a9..af25105239c 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -14,7 +14,6 @@ from ..core.utils import close_on_error, is_grib_path, is_remote_uri from .common import ArrayWriter from .locks import _get_scheduler -from ..core import dtypes DATAARRAY_NAME = '__xarray_dataarray_name__' DATAARRAY_VARIABLE = '__xarray_dataarray_variable__' @@ -267,8 +266,6 @@ def open_dataset(filename_or_obj, group=None, decode_cf=True, if cache is None: cache = chunks is None - dtypes.FORCE_PROMOTE_FLOAT64 = force_promote_float64 - if backend_kwargs is None: backend_kwargs = {} From b05193dce441b49deb3f302cddfc179d42506619 Mon Sep 17 00:00:00 2001 From: Daoud Jahdou Date: Mon, 18 Feb 2019 12:35:48 +0100 Subject: [PATCH 08/13] implementing cf convention regarding scale_factor and add_offset types http://cfconventions.org/Data/cf-conventions/cf-conventions-1.7/build/ch08.html --- doc/whats-new.rst | 7 ++---- xarray/backends/api.py | 13 ++-------- xarray/coding/variables.py | 46 ++++++++++++++++++++++++++--------- xarray/conventions.py | 28 ++++++--------------- xarray/tests/test_backends.py | 10 +++++--- 5 files changed, 52 insertions(+), 52 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index e8dc01fd3a4..293049e52a8 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -62,11 +62,8 @@ Enhancements - :py:meth:`pandas.Series.dropna` is now supported for a :py:class:`pandas.Series` indexed by a :py:class:`~xarray.CFTimeIndex` (:issue:`2688`). By `Spencer Clark `_. -- Ability to promote to float64 instead of float32 when dealing with int type - variables with scale_factor. - Added parameter force_promote_float64 (False by default) - to :py:meth:`~xarray.open_dataset` and :py:meth:`~xarray.open_dataarray` - that enables this behaviour. +- Variables are now unpacked with scale_factor and offset dtypes if present in datasets. + According `to cf convention `_. By `Daoud Jahdou `_. Bug fixes ~~~~~~~~~ diff --git a/xarray/backends/api.py b/xarray/backends/api.py index af25105239c..e52f47a0841 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -161,7 +161,6 @@ def open_dataset(filename_or_obj, group=None, decode_cf=True, mask_and_scale=None, decode_times=True, autoclose=None, concat_characters=True, decode_coords=True, engine=None, chunks=None, lock=None, cache=None, drop_variables=None, - force_promote_float64=False, backend_kwargs=None): """Load and decode a dataset from a file or file-like object. @@ -228,9 +227,6 @@ def open_dataset(filename_or_obj, group=None, decode_cf=True, A variable or list of variables to exclude from being parsed from the dataset. This may be useful to drop variables with problems or inconsistent values.
- force_promote_float64: bool - If True force int variables with scale factor to be promoted to - float64 instead of float32 backend_kwargs: dictionary, optional A dictionary of keyword arguments to pass on to the backend. This may be useful when backend options would improve performance or @@ -273,8 +269,7 @@ def maybe_decode_store(store, lock=False): ds = conventions.decode_cf( store, mask_and_scale=mask_and_scale, decode_times=decode_times, concat_characters=concat_characters, decode_coords=decode_coords, - drop_variables=drop_variables, - force_promote_float64=force_promote_float64) + drop_variables=drop_variables) _protect_dataset_variables_inplace(ds, cache) @@ -365,7 +360,7 @@ def open_dataarray(filename_or_obj, group=None, decode_cf=True, mask_and_scale=None, decode_times=True, autoclose=None, concat_characters=True, decode_coords=True, engine=None, chunks=None, lock=None, cache=None, drop_variables=None, - force_promote_float64=False, backend_kwargs=None): + backend_kwargs=None): """Open an DataArray from a netCDF file containing a single data variable. This is designed to read netCDF files with only one data variable. If @@ -429,9 +424,6 @@ def open_dataarray(filename_or_obj, group=None, decode_cf=True, A variable or list of variables to exclude from being parsed from the dataset. This may be useful to drop variables with problems or inconsistent values. - force_promote_float64: bool - If True force int variables with scale factor to be promoted to - float64 instead of float32 backend_kwargs: dictionary, optional A dictionary of keyword arguments to pass on to the backend. This may be useful when backend options would improve performance or @@ -458,7 +450,6 @@ def open_dataarray(filename_or_obj, group=None, decode_cf=True, decode_coords=decode_coords, engine=engine, chunks=chunks, lock=lock, cache=cache, drop_variables=drop_variables, - force_promote_float64=force_promote_float64, backend_kwargs=backend_kwargs) if len(dataset.data_vars) != 1: diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index fc483b0eb8e..99f33b8154b 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -189,10 +189,36 @@ def _scale_offset_decoding(data, scale_factor, add_offset, dtype): return data -def _choose_float_dtype(dtype, has_offset, force_promote_float64=False): +def _choose_float_dtype(dtype, scale_factor, add_offset): """Return a float dtype that can losslessly represent `dtype` values.""" - if force_promote_float64: - return np.float64 + # Implementing cf-convention + # http://cfconventions.org/Data/cf-conventions/cf-conventions-1.7/build/ch08.html: + # Detail: + # If the scale_factor and add_offset attributes are of the same + # data type as the + # associated variable, the unpacked data is assumed to be of the same + # data type as the packed data. However, if the scale_factor + # and add_offset attributes are of a different data type + # from the variable (containing the packed data) + # then the unpacked data should match + # the type of these attributes, which must both be of type float or both + # be of type double. An additional restriction in this case is that + # the variable containing the packed data must + # be of type byte, short or int. + # It is not advised to unpack an int into a float as there + # is a potential precision loss. 
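# A hedged illustration of that rule for the code below (examples,
# not an exhaustive list): int16 data with scale_factor stored as
# np.float64(0.01) unpacks to float64; with np.float32(0.1) it
# unpacks to float32; a plain Python float scale_factor is not an
# np.generic, so it falls through to the itemsize-based defaults.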
+ + # We first return scale_factor type + # as multiplying takes priority over + # adding values + if dtype is not type(scale_factor) and \ + isinstance(scale_factor, np.generic): + return np.dtype(scale_factor) + + if dtype is not type(add_offset) and \ + isinstance(add_offset, np.generic): + return np.dtype(add_offset) + # Keep float32 as-is. Upcast half-precision to single-precision, # because float16 is "intended for storage but not computation" if dtype.itemsize <= 4 and np.issubdtype(dtype, np.floating): @@ -203,7 +229,7 @@ def _choose_float_dtype(dtype, has_offset, force_promote_float64=False): # but a large integer offset could lead to loss of precision. # Sensitivity analysis can be tricky, so we just use a float64 # if there's any offset at all - better unoptimised than wrong! - if not has_offset: + if not add_offset: return np.float32 # For all other types and circumstances, we just use float64. # (safe because eg. complex numbers are not supported in NetCDF) @@ -221,7 +247,9 @@ def encode(self, variable, name=None): dims, data, attrs, encoding = unpack_for_encoding(variable) if 'scale_factor' in encoding or 'add_offset' in encoding: - dtype = _choose_float_dtype(data.dtype, 'add_offset' in encoding) + scale_factor = pop_to(attrs, encoding, 'scale_factor', name=name) + add_offset = pop_to(attrs, encoding, 'add_offset', name=name) + dtype = _choose_float_dtype(data.dtype, scale_factor, add_offset) data = data.astype(dtype=dtype, copy=True) if 'add_offset' in encoding: data -= pop_to(encoding, attrs, 'add_offset', name=name) @@ -230,17 +258,13 @@ def encode(self, variable, name=None): return Variable(dims, data, attrs, encoding) - def decode(self, variable, force_promote_float64=False, name=None): + def decode(self, variable, name=None): dims, data, attrs, encoding = unpack_for_decoding(variable) if 'scale_factor' in attrs or 'add_offset' in attrs: scale_factor = pop_to(attrs, encoding, 'scale_factor', name=name) add_offset = pop_to(attrs, encoding, 'add_offset', name=name) - # Avoiding line too long warning in - # _choose_float_dtype - force_promote = force_promote_float64 - dtype = _choose_float_dtype(data.dtype, 'add_offset' in attrs, - force_promote_float64=force_promote) + dtype = _choose_float_dtype(data.dtype, scale_factor, add_offset) transform = partial(_scale_offset_decoding, scale_factor=scale_factor, diff --git a/xarray/conventions.py b/xarray/conventions.py index b24bdcbf5f7..4558f5d0d85 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -240,7 +240,7 @@ def encode_cf_variable(var, needs_copy=True, name=None): def decode_cf_variable(name, var, concat_characters=True, mask_and_scale=True, decode_times=True, decode_endianness=True, - stack_char_dim=True, force_promote_float64=False): + stack_char_dim=True): """ Decodes a variable which may hold CF encoded information. @@ -270,9 +270,6 @@ def decode_cf_variable(name, var, concat_characters=True, mask_and_scale=True, Whether to stack characters into bytes along the last dimension of this array. Passed as an argument because we need to look at the full dataset to figure out if this is appropriate. 
- force_promote_float64: bool - If True force int variables with scale factor to be promoted to - float64 instead of float32 Returns ------- out : Variable @@ -288,14 +285,10 @@ def decode_cf_variable(name, var, concat_characters=True, mask_and_scale=True, if mask_and_scale: for coder in [variables.UnsignedIntegerCoder(), - variables.CFMaskCoder()]: + variables.CFMaskCoder(), + variables.CFScaleOffsetCoder()]: var = coder.decode(var, name=name) - coder = variables.CFScaleOffsetCoder() - var = coder.decode(var, - name=name, - force_promote_float64=force_promote_float64) - if decode_times: for coder in [times.CFTimedeltaCoder(), times.CFDatetimeCoder()]: @@ -352,8 +345,7 @@ def _update_bounds_attributes(variables): def decode_cf_variables(variables, attributes, concat_characters=True, mask_and_scale=True, decode_times=True, - decode_coords=True, drop_variables=None, - force_promote_float64=False): + decode_coords=True, drop_variables=None): """ Decode several CF encoded variables. @@ -394,8 +386,7 @@ def stackable(dim): new_vars[k] = decode_cf_variable( k, v, concat_characters=concat_characters, mask_and_scale=mask_and_scale, decode_times=decode_times, - stack_char_dim=stack_char_dim, - force_promote_float64=force_promote_float64) + stack_char_dim=stack_char_dim) if decode_coords: var_attrs = new_vars[k].attrs if 'coordinates' in var_attrs: @@ -414,8 +405,7 @@ def stackable(dim): def decode_cf(obj, concat_characters=True, mask_and_scale=True, - decode_times=True, decode_coords=True, drop_variables=None, - force_promote_float64=False): + decode_times=True, decode_coords=True, drop_variables=None): """Decode the given Dataset or Datastore according to CF conventions into a new Dataset. @@ -439,9 +429,6 @@ def decode_cf(obj, concat_characters=True, mask_and_scale=True, A variable or list of variables to exclude from being parsed from the dataset. This may be useful to drop variables with problems or inconsistent values. 
- force_promote_float64: bool - If True force int variables with scale factor to be promoted to - float64 instead of float32 Returns ------- decoded : Dataset @@ -465,8 +452,7 @@ def decode_cf(obj, concat_characters=True, mask_and_scale=True, vars, attrs, coord_names = decode_cf_variables( vars, attrs, concat_characters, mask_and_scale, decode_times, - decode_coords, drop_variables=drop_variables, - force_promote_float64=force_promote_float64) + decode_coords, drop_variables=drop_variables) ds = Dataset(vars, attrs=attrs) ds = ds.set_coords(coord_names.union(extra_coords).intersection(vars)) ds._file_obj = file_obj diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 67508d326e4..c9409187b72 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1140,14 +1140,15 @@ def test_mask_and_scale(self): v = nc.variables['x'] v.set_auto_maskandscale(False) v.add_offset = 10 - v.scale_factor = 0.1 + v.scale_factor = np.float32(0.1) v[:] = np.array([-1, -1, 0, 1, 2]) # first make sure netCDF4 reads the masked and scaled data # correctly with nc4.Dataset(tmp_file, mode='r') as nc: expected = np.ma.array([-1, -1, 10, 10.1, 10.2], - mask=[True, True, False, False, False]) + mask=[True, True, False, False, False], + dtype=np.float32) actual = nc.variables['x'][:] assert_array_equal(expected, actual) @@ -1156,18 +1157,19 @@ def test_mask_and_scale(self): expected = create_masked_and_scaled_data() assert_identical(expected, ds) - def test_force_promote_float64_variable(self): + def test_mask_and_scale_with_float64_scale_factor(self): with create_tmp_file() as tmp_file: with nc4.Dataset(tmp_file, mode='w') as nc: nc.createDimension('t', 5) nc.createVariable('x', 'int16', ('t',), fill_value=-1) v = nc.variables['x'] v.scale_factor = 0.01 + v.add_offset = 10 v[:] = np.array([-1123, -1123, 123, 1123, 2123]) # We read the newly created netcdf file with nc4.Dataset(tmp_file, mode='r') as nc: # we open the dataset with forced promotion to 64 bit - with open_dataset(tmp_file, force_promote_float64=True) as ds: + with open_dataset(tmp_file) as ds: # Both dataset values should be equal # And both of float64 array type dsv = ds['x'].values From a94f2af2fb83c6d31a042298d0654eff17cfd4c0 Mon Sep 17 00:00:00 2001 From: Daoud Jahdou Date: Thu, 21 Feb 2019 18:23:23 +0100 Subject: [PATCH 09/13] Taking into account comments and adding edge case tests --- .gitignore | 1 + xarray/coding/variables.py | 43 +++++++++++++++++++---------------- xarray/conventions.py | 1 - xarray/tests/test_backends.py | 13 ++++++++--- xarray/tests/test_coding.py | 39 +++++++++++++++++++++++++++++++ 5 files changed, 73 insertions(+), 24 deletions(-) diff --git a/.gitignore b/.gitignore index 2a016bb9228..189c25bef3d 100644 --- a/.gitignore +++ b/.gitignore @@ -68,3 +68,4 @@ xarray/version.py Icon* .ipynb_checkpoints +.nfs* diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index 99f33b8154b..e9a0e510c72 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -189,7 +189,7 @@ def _scale_offset_decoding(data, scale_factor, add_offset, dtype): return data -def _choose_float_dtype(dtype, scale_factor, add_offset): +def _choose_float_dtype(dtype, scale_factor, add_offset, mode=None): """Return a float dtype that can losslessly represent `dtype` values.""" # Implementing cf-convention # http://cfconventions.org/Data/cf-conventions/cf-conventions-1.7/build/ch08.html: @@ -208,16 +208,17 @@ def _choose_float_dtype(dtype, scale_factor, add_offset): # It is not 
advised to unpack an int into a float as there # is a potential precision loss. - # We first return scale_factor type - # as multiplying takes priority over - # adding values - if dtype is not type(scale_factor) and \ - isinstance(scale_factor, np.generic): - return np.dtype(scale_factor) + if mode is 'decoding': + types = [np.dtype(type(scale_factor)), + np.dtype(type(add_offset)), + np.dtype(dtype)] + # scaled_type should be the largest type we find + scaled_dtype = max(types) - if dtype is not type(add_offset) and \ - isinstance(add_offset, np.generic): - return np.dtype(add_offset) + # We return it only if it's a float32 or a float64 + if (scaled_dtype.itemsize >= 4 + and np.issubdtype(scaled_dtype, np.floating)): + return scaled_dtype # Keep float32 as-is. Upcast half-precision to single-precision, # because float16 is "intended for storage but not computation" @@ -245,16 +246,18 @@ class CFScaleOffsetCoder(VariableCoder): def encode(self, variable, name=None): dims, data, attrs, encoding = unpack_for_encoding(variable) - if 'scale_factor' in encoding or 'add_offset' in encoding: - scale_factor = pop_to(attrs, encoding, 'scale_factor', name=name) - add_offset = pop_to(attrs, encoding, 'add_offset', name=name) - dtype = _choose_float_dtype(data.dtype, scale_factor, add_offset) + scale_factor = pop_to(encoding, attrs, + 'scale_factor', name=name) + add_offset = pop_to(encoding, attrs, + 'add_offset', name=name) + dtype = _choose_float_dtype(data.dtype, scale_factor, + add_offset, mode='encoding') data = data.astype(dtype=dtype, copy=True) - if 'add_offset' in encoding: - data -= pop_to(encoding, attrs, 'add_offset', name=name) - if 'scale_factor' in encoding: - data /= pop_to(encoding, attrs, 'scale_factor', name=name) + if add_offset: + data -= add_offset + if scale_factor: + data /= scale_factor return Variable(dims, data, attrs, encoding) @@ -264,8 +267,8 @@ def decode(self, variable, name=None): if 'scale_factor' in attrs or 'add_offset' in attrs: scale_factor = pop_to(attrs, encoding, 'scale_factor', name=name) add_offset = pop_to(attrs, encoding, 'add_offset', name=name) - dtype = _choose_float_dtype(data.dtype, scale_factor, add_offset) - + dtype = _choose_float_dtype(data.dtype, scale_factor, + add_offset, mode='decoding') transform = partial(_scale_offset_decoding, scale_factor=scale_factor, add_offset=add_offset, diff --git a/xarray/conventions.py b/xarray/conventions.py index 4558f5d0d85..84f07c10589 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -222,7 +222,6 @@ def encode_cf_variable(var, needs_copy=True, name=None): A variable which has been encoded as described above. 
""" ensure_not_multiindex(var, name=name) - for coder in [times.CFDatetimeCoder(), times.CFTimedeltaCoder(), variables.CFScaleOffsetCoder(), diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index c9409187b72..5f1b3838e43 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -668,7 +668,9 @@ def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn): with self.roundtrip(decoded) as actual: for k in decoded.variables: assert (decoded.variables[k].dtype - == actual.variables[k].dtype) + == actual.variables[k].dtype or + (decoded.variables[k].dtype == np.float32 and + actual.variables[k].dtype == np.float64)) assert_allclose(decoded, actual, decode_bytes=False) with self.roundtrip(decoded, @@ -694,7 +696,9 @@ def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn): with self.roundtrip(encoded) as actual: for k in decoded.variables: assert (decoded.variables[k].dtype == - actual.variables[k].dtype) + actual.variables[k].dtype or + (decoded.variables[k].dtype == np.float32 and + actual.variables[k].dtype == np.float64)) assert_allclose(decoded, actual, decode_bytes=False) def test_coordinates_encoding(self): @@ -1168,8 +1172,11 @@ def test_mask_and_scale_with_float64_scale_factor(self): v[:] = np.array([-1123, -1123, 123, 1123, 2123]) # We read the newly created netcdf file with nc4.Dataset(tmp_file, mode='r') as nc: - # we open the dataset with forced promotion to 64 bit + # we open the dataset + from xarray.coding import variables + variables.MYCASE = False with open_dataset(tmp_file) as ds: + variables.MYCASE = False # Both dataset values should be equal # And both of float64 array type dsv = ds['x'].values diff --git a/xarray/tests/test_coding.py b/xarray/tests/test_coding.py index 95c8ebc0b42..a35ba6d1681 100644 --- a/xarray/tests/test_coding.py +++ b/xarray/tests/test_coding.py @@ -50,3 +50,42 @@ def test_scaling_converts_to_float32(dtype): roundtripped = coder.decode(encoded) assert_identical(original, roundtripped) assert roundtripped.dtype == np.float32 + + +@pytest.mark.parametrize('dtype', 'u1 u2 i1 i2 f2 f4'.split()) +@pytest.mark.parametrize('scale_factor', [10, 0.01, + np.float16(10), + np.float32(10), + np.float64(10), + np.int8(10), + np.int16(10), np.int32(10), + np.int64(10), np.uint8(10), + np.uint16(10), np.uint32(10), + np.uint64(10), np.uint64(10)]) +@pytest.mark.parametrize('add_offset', [10, 0.01, + np.float16(10), + np.float32(10), + np.float64(10), + np.int8(10), + np.int16(10), np.int32(10), + np.int64(10), np.uint8(10), + np.uint16(10), np.uint32(10), + np.uint64(10), np.uint64(10)]) +def test_scaling_according_to_cf_convention(dtype, scale_factor, add_offset): + original = xr.Variable(('x',), np.arange(10, dtype=dtype), + encoding=dict(scale_factor=scale_factor, + add_offset=add_offset)) + coder = variables.CFScaleOffsetCoder() + encoded = coder.encode(original) + assert encoded.dtype.itemsize >= np.dtype(dtype).itemsize + assert encoded.dtype.itemsize >= 4 and np.issubdtype(encoded, np.floating) + + roundtripped = coder.decode(encoded) + + # We make sure that roundtripped is larger than + # the original + assert roundtripped.dtype.itemsize >= original.dtype.itemsize + assert (roundtripped.dtype is np.dtype(np.float64) + or roundtripped.dtype is np.dtype(np.float32)) + + np.testing.assert_array_almost_equal(roundtripped, original) From 99599ed898cc4babe66b3891d41e92dc17e3da93 Mon Sep 17 00:00:00 2001 From: Daoud Jahdou Date: Thu, 21 Feb 2019 18:31:28 +0100 Subject: [PATCH 10/13] removed unused code 
--- xarray/tests/test_backends.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 60cd00f90eb..1f3eb9af6ad 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1190,10 +1190,7 @@ def test_mask_and_scale_with_float64_scale_factor(self): # We read the newly created netcdf file with nc4.Dataset(tmp_file, mode='r') as nc: # we open the dataset - from xarray.coding import variables - variables.MYCASE = False with open_dataset(tmp_file) as ds: - variables.MYCASE = False # Both dataset values should be equal # And both of float64 array type dsv = ds['x'].values From 638332c871484f49412208468b4903899e59c14d Mon Sep 17 00:00:00 2001 From: Daoud Jahdou Date: Thu, 7 Mar 2019 19:02:45 +0100 Subject: [PATCH 11/13] Adding more tests with expected outputs --- xarray/coding/variables.py | 44 ++++++---- xarray/tests/test_backends.py | 6 +- xarray/tests/test_coding.py | 160 +++++++++++++++++++++++++++------- 3 files changed, 160 insertions(+), 50 deletions(-) diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index e9a0e510c72..2e7e7d5a8e7 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -189,8 +189,8 @@ def _scale_offset_decoding(data, scale_factor, add_offset, dtype): return data -def _choose_float_dtype(dtype, scale_factor, add_offset, mode=None): - """Return a float dtype that can losslessly represent `dtype` values.""" +def _choose_decoding_float_dtype(dtype, scale_factor, add_offset): + """Return a float dtype according to cf-convention""" # Implementing cf-convention # http://cfconventions.org/Data/cf-conventions/cf-conventions-1.7/build/ch08.html: # Detail: @@ -208,18 +208,30 @@ def _choose_float_dtype(dtype, scale_factor, add_offset, mode=None): # It is not advised to unpack an int into a float as there # is a potential precision loss. - if mode is 'decoding': + if scale_factor or add_offset: + types = [np.dtype(type(scale_factor)), np.dtype(type(add_offset)), np.dtype(dtype)] + + if add_offset is None: + types = [np.dtype(type(scale_factor)), + np.dtype(dtype)] + # scaled_type should be the largest type we find - scaled_dtype = max(types) + scaled_dtype = dtypes.result_type(*tuple(types)) # We return it only if it's a float32 or a float64 if (scaled_dtype.itemsize >= 4 and np.issubdtype(scaled_dtype, np.floating)): return scaled_dtype + return _choose_encoding_float_dtype(dtype, add_offset is not None) + + +def _choose_encoding_float_dtype(dtype, has_offset): + """Return a float dtype that can losslessly represent `dtype` values.""" + # Keep float32 as-is. Upcast half-precision to single-precision, # because float16 is "intended for storage but not computation" if dtype.itemsize <= 4 and np.issubdtype(dtype, np.floating): @@ -230,7 +242,7 @@ def _choose_float_dtype(dtype, scale_factor, add_offset, mode=None): # but a large integer offset could lead to loss of precision. # Sensitivity analysis can be tricky, so we just use a float64 # if there's any offset at all - better unoptimised than wrong! - if not add_offset: + if not has_offset: return np.float32 # For all other types and circumstances, we just use float64. # (safe because eg. 
complex numbers are not supported in NetCDF) @@ -247,17 +259,13 @@ class CFScaleOffsetCoder(VariableCoder): def encode(self, variable, name=None): dims, data, attrs, encoding = unpack_for_encoding(variable) if 'scale_factor' in encoding or 'add_offset' in encoding: - scale_factor = pop_to(encoding, attrs, - 'scale_factor', name=name) - add_offset = pop_to(encoding, attrs, - 'add_offset', name=name) - dtype = _choose_float_dtype(data.dtype, scale_factor, - add_offset, mode='encoding') + dtype = _choose_encoding_float_dtype(data.dtype, + 'add_offset' in encoding) data = data.astype(dtype=dtype, copy=True) - if add_offset: - data -= add_offset - if scale_factor: - data /= scale_factor + if 'add_offset' in encoding: + data -= pop_to(encoding, attrs, 'add_offset', name=name) + if 'scale_factor' in encoding: + data /= pop_to(encoding, attrs, 'scale_factor', name=name) return Variable(dims, data, attrs, encoding) @@ -265,10 +273,12 @@ def decode(self, variable, name=None): dims, data, attrs, encoding = unpack_for_decoding(variable) if 'scale_factor' in attrs or 'add_offset' in attrs: + scale_factor = pop_to(attrs, encoding, 'scale_factor', name=name) add_offset = pop_to(attrs, encoding, 'add_offset', name=name) - dtype = _choose_float_dtype(data.dtype, scale_factor, - add_offset, mode='decoding') + dtype = _choose_decoding_float_dtype(data.dtype, + scale_factor, add_offset) + transform = partial(_scale_offset_decoding, scale_factor=scale_factor, add_offset=add_offset, diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 1f3eb9af6ad..f10466ec16d 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -685,8 +685,8 @@ def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn): with self.roundtrip(decoded) as actual: for k in decoded.variables: assert (decoded.variables[k].dtype - == actual.variables[k].dtype or - (decoded.variables[k].dtype == np.float32 and + == actual.variables[k].dtype + or (decoded.variables[k].dtype == np.float32 and actual.variables[k].dtype == np.float64)) assert_allclose(decoded, actual, decode_bytes=False) @@ -1160,7 +1160,7 @@ def test_mask_and_scale(self): nc.createVariable('x', 'int16', ('t',), fill_value=-1) v = nc.variables['x'] v.set_auto_maskandscale(False) - v.add_offset = 10 + v.add_offset = np.float32(10) v.scale_factor = np.float32(0.1) v[:] = np.array([-1, -1, 0, 1, 2]) diff --git a/xarray/tests/test_coding.py b/xarray/tests/test_coding.py index a35ba6d1681..d4c0ec596ee 100644 --- a/xarray/tests/test_coding.py +++ b/xarray/tests/test_coding.py @@ -12,6 +12,10 @@ import dask.array as da +def reverse_list_of_tuple(plist): + return [tuple(reversed(t)) for t in plist] + + def test_CFMaskCoder_decode(): original = xr.Variable(('x',), [0, -1, 1], {'_FillValue': -1}) expected = xr.Variable(('x',), [0, np.nan, 1]) @@ -43,49 +47,145 @@ def test_coder_roundtrip(): @pytest.mark.parametrize('dtype', 'u1 u2 i1 i2 f2 f4'.split()) def test_scaling_converts_to_float32(dtype): original = xr.Variable(('x',), np.arange(10, dtype=dtype), - encoding=dict(scale_factor=10)) + encoding=dict(scale_factor=np.float32(10))) coder = variables.CFScaleOffsetCoder() encoded = coder.encode(original) assert encoded.dtype == np.float32 roundtripped = coder.decode(encoded) - assert_identical(original, roundtripped) assert roundtripped.dtype == np.float32 + assert_identical(original, roundtripped) -@pytest.mark.parametrize('dtype', 'u1 u2 i1 i2 f2 f4'.split()) -@pytest.mark.parametrize('scale_factor', [10, 0.01, - np.float16(10), - 
np.float32(10), - np.float64(10), - np.int8(10), - np.int16(10), np.int32(10), - np.int64(10), np.uint8(10), - np.uint16(10), np.uint32(10), - np.uint64(10), np.uint64(10)]) -@pytest.mark.parametrize('add_offset', [10, 0.01, - np.float16(10), - np.float32(10), - np.float64(10), - np.int8(10), - np.int16(10), np.int32(10), - np.int64(10), np.uint8(10), - np.uint16(10), np.uint32(10), - np.uint64(10), np.uint64(10)]) -def test_scaling_according_to_cf_convention(dtype, scale_factor, add_offset): +all_possible_types = [np.uint8(8), + np.uint16(16), + np.uint32(32), + np.uint64(64), + np.int8(80), + np.int16(160), + np.int32(320), + np.int64(640), + 1, + 0.01, + np.float16(1600), + np.float32(3200), + np.float64(6400)] + +# In all cases encoding returns either np.float32 or np.float64 +# Encoding only cares about existence of add_offset when the +# variable is an integer +# If the variable is a float then the encoded dtype is np.float32 +# If the variable is an integer with add_offset +# then the encoded dtype is np.float64 +# If the variable is an integer with no add_affset +# then the encoded dtype is np.float32 +# In all other cases the encoded dtype is np.float64 +# decoding is the equivalent of unpacking mentioned in the cf-convention +# in all cases decoding takes the encoded dtype which +# is either np.float32 or np.float64 +# then decoded type is the largest between scale_factor, add_offset +# and encoded type and not original + +############################# +# Case 1: variable has offset +############################# + +# Case 1.1: variable is float +# encoded should be np.float32 +# if (scale_factor, add_offset) is in the following list +# decoded should be np.float32 +# if not decoded should be np.float64 +combinations_for_float32 = [ + (np.uint8(8), np.uint16(16)), + (np.uint8(8), np.int8(80)), + (np.uint8(8), np.int16(160)), + (np.uint8(8), np.float16(1600)), + (np.uint8(8), np.float32(3200)), + (np.uint16(16), np.float16(1600)), + (np.uint16(16), np.float32(3200)), + (np.int8(80), np.int16(160)), + (np.int8(80), np.float16(1600)), + (np.int8(80), np.float32(3200)), + (np.int16(160), np.float16(1600)), + (np.int16(160), np.float32(3200)), + (np.float16(1600), np.float32(3200)), + (np.float32(3200), np.float32(3200)), + (np.float16(1600), np.float16(1600)), + (np.int16(160), np.int16(160)), + (np.int8(80), np.int8(80)), + (np.uint16(16), np.uint16(16)), + (np.uint8(8), np.uint8(8)) +] +(combinations_for_float32.extend( + reverse_list_of_tuple(combinations_for_float32))) + + +@pytest.mark.parametrize('dtype', 'f2 f4'.split()) +@pytest.mark.parametrize('scale_factor', all_possible_types) +@pytest.mark.parametrize('add_offset', all_possible_types) +def test_scaling_case_1_float_var(dtype, scale_factor, add_offset): original = xr.Variable(('x',), np.arange(10, dtype=dtype), encoding=dict(scale_factor=scale_factor, add_offset=add_offset)) + coder = variables.CFScaleOffsetCoder() encoded = coder.encode(original) - assert encoded.dtype.itemsize >= np.dtype(dtype).itemsize - assert encoded.dtype.itemsize >= 4 and np.issubdtype(encoded, np.floating) + assert encoded.dtype == np.float32 + roundtripped = coder.decode(encoded) + + if (scale_factor, add_offset) in combinations_for_float32: + assert roundtripped.dtype == np.float32 + else: + assert roundtripped.dtype == np.float64 + + +# Case 1.2: variable is integer +# encoded should be np.float64 as we have offset +# decoded should always be np.float64 +@pytest.mark.parametrize('dtype', 'u1 u2 i1 i2'.split()) 
+@pytest.mark.parametrize('scale_factor', all_possible_types) +@pytest.mark.parametrize('add_offset', all_possible_types) +def test_scaling_cf_convention_case_1_int_var(dtype, scale_factor, add_offset): + original = xr.Variable(('x',), np.arange(10, dtype=dtype), + encoding=dict(scale_factor=scale_factor, + add_offset=add_offset)) + coder = variables.CFScaleOffsetCoder() + encoded = coder.encode(original) + assert encoded.dtype == np.float64 roundtripped = coder.decode(encoded) + assert roundtripped.dtype == np.float64 + + +#################################### +# Case 2: variable has no add_offset +#################################### - # We make sure that roundtripped is larger than - # the original - assert roundtripped.dtype.itemsize >= original.dtype.itemsize - assert (roundtripped.dtype is np.dtype(np.float64) - or roundtripped.dtype is np.dtype(np.float32)) +# Case 2.1: +# for any variable dtype +# encoded should be np.float32 +# if scale_factor in the following list +# decoded should be np.float32 +# if not decoded should be np.float64 +types_for_float32 = [np.uint8(8), + np.uint16(16), + np.int8(80), + np.int16(160), + np.float16(1600), + np.float32(3200)] + + +@pytest.mark.parametrize('dtype', 'u1 u2 i1 i2 f2 f4'.split()) +@pytest.mark.parametrize('scale_factor', all_possible_types) +def test_scaling_case_2(dtype, scale_factor): - np.testing.assert_array_almost_equal(roundtripped, original) + original = xr.Variable(('x',), np.arange(10, dtype=dtype), + encoding=dict(scale_factor=scale_factor)) + + coder = variables.CFScaleOffsetCoder() + encoded = coder.encode(original) + assert encoded.dtype == np.float32 + roundtripped = coder.decode(encoded) + if scale_factor in types_for_float32: + assert roundtripped.dtype == np.float32 + else: + assert roundtripped.dtype == np.float64 From 41da20565fe15e3c516d10f2d62916ce78e124f4 Mon Sep 17 00:00:00 2001 From: Daoud Jahdou Date: Thu, 7 Mar 2019 19:11:36 +0100 Subject: [PATCH 12/13] Renaming test_coding methods --- xarray/tests/test_coding.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/xarray/tests/test_coding.py b/xarray/tests/test_coding.py index d4c0ec596ee..a98d5c857f9 100644 --- a/xarray/tests/test_coding.py +++ b/xarray/tests/test_coding.py @@ -122,7 +122,7 @@ def test_scaling_converts_to_float32(dtype): @pytest.mark.parametrize('dtype', 'f2 f4'.split()) @pytest.mark.parametrize('scale_factor', all_possible_types) @pytest.mark.parametrize('add_offset', all_possible_types) -def test_scaling_case_1_float_var(dtype, scale_factor, add_offset): +def test_cfscaleoffset_case_1_float_var(dtype, scale_factor, add_offset): original = xr.Variable(('x',), np.arange(10, dtype=dtype), encoding=dict(scale_factor=scale_factor, add_offset=add_offset)) @@ -144,7 +144,7 @@ def test_scaling_case_1_float_var(dtype, scale_factor, add_offset): @pytest.mark.parametrize('dtype', 'u1 u2 i1 i2'.split()) @pytest.mark.parametrize('scale_factor', all_possible_types) @pytest.mark.parametrize('add_offset', all_possible_types) -def test_scaling_cf_convention_case_1_int_var(dtype, scale_factor, add_offset): +def test_cfscaleoffset_case_1_int_var(dtype, scale_factor, add_offset): original = xr.Variable(('x',), np.arange(10, dtype=dtype), encoding=dict(scale_factor=scale_factor, add_offset=add_offset)) @@ -176,7 +176,7 @@ def test_scaling_cf_convention_case_1_int_var(dtype, scale_factor, add_offset): @pytest.mark.parametrize('dtype', 'u1 u2 i1 i2 f2 f4'.split()) @pytest.mark.parametrize('scale_factor', all_possible_types) -def 
test_scaling_case_2(dtype, scale_factor): +def test_cfscaleoffset_case_2(dtype, scale_factor): original = xr.Variable(('x',), np.arange(10, dtype=dtype), encoding=dict(scale_factor=scale_factor)) From 29d191addc8d47494ee126d0e3af52dfa657a438 Mon Sep 17 00:00:00 2001 From: Daoud Jahdou Date: Fri, 8 Mar 2019 09:18:45 +0100 Subject: [PATCH 13/13] Replacing array by tuple --- xarray/coding/variables.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index 2e7e7d5a8e7..8f188061e83 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -210,16 +210,16 @@ def _choose_decoding_float_dtype(dtype, scale_factor, add_offset): if scale_factor or add_offset: - types = [np.dtype(type(scale_factor)), + types = (np.dtype(type(scale_factor)), np.dtype(type(add_offset)), - np.dtype(dtype)] + np.dtype(dtype)) if add_offset is None: - types = [np.dtype(type(scale_factor)), - np.dtype(dtype)] + types = (np.dtype(type(scale_factor)), + np.dtype(dtype)) # scaled_type should be the largest type we find - scaled_dtype = dtypes.result_type(*tuple(types)) + scaled_dtype = dtypes.result_type(*types) # We return it only if it's a float32 or a float64 if (scaled_dtype.itemsize >= 4
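For the numeric dtypes involved here, xarray's dtypes.result_type reduces to NumPy's result_type, which is why it replaced the earlier max(types): comparison of np.dtype objects is only a partial order, whereas result_type always yields a well-defined promoted type. A minimal sketch, assuming NumPy's standard promotion rules:

    import numpy as np

    # The decoding dtype is the promotion of the packed dtype with
    # the scale_factor/add_offset dtypes.
    assert np.result_type(np.dtype('int16'), np.float32) == np.float32
    assert np.result_type(np.dtype('int32'), np.float32) == np.float64
    assert np.result_type(np.dtype('int16'), np.float64) == np.float64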