Skip to content

Commit 76adf13

Browse files
authored
Don't set encoding attributes on bounds variables. (#2965)
* Don't set attributes on bounds variables. Fixes #2921 1. Removes certain attributes from bounds variables on encode. 2. open_mfdataset: Sets encoding on variables based on encoding in first file. * remove whitespace stuff. * Make sure variable exists in first file before assigning encoding * Make sure we iterate over coords too. * lint fix. * docs/comment fixes. * mfdataset encoding test. * time_bounds attrs test + allow for slight CF non-compliance. * I need to deal with encoding! * minor fixes. * another minor fix. * review fixes. * lint fixes. * Remove encoding changes and xfail test. * Update whats-new.rst
1 parent b054c31 commit 76adf13

File tree

4 files changed

+146
-12
lines changed

4 files changed

+146
-12
lines changed

doc/whats-new.rst

+3
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,9 @@ Enhancements
7676
Bug fixes
7777
~~~~~~~~~
7878

79+
- Don't set encoding attributes on bounds variables when writing to netCDF.
80+
(:issue:`2921`)
81+
By `Deepak Cherian <https://github.com/dcherian>`_.
7982
- NetCDF4 output: variables with unlimited dimensions must be chunked (not
8083
contiguous) on output. (:issue:`1849`)
8184
By `James McCreight <https://github.com/jmccreight>`_.

xarray/conventions.py

+75-10
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from .coding import strings, times, variables
88
from .coding.variables import SerializationWarning
99
from .core import duck_array_ops, indexing
10+
from .core.common import contains_cftime_datetimes
1011
from .core.pycompat import dask_array_type
1112
from .core.variable import IndexVariable, Variable, as_variable
1213

@@ -355,6 +356,51 @@ def _update_bounds_attributes(variables):
355356
bounds_attrs.setdefault('calendar', attrs['calendar'])
356357

357358

359+
def _update_bounds_encoding(variables):
    """Adds time encoding to time bounds variables.

    Variables handling time bounds ("Cell boundaries" in the CF
    conventions) do not necessarily carry the necessary attributes to be
    decoded. This copies the encoding from the time variable to the
    associated bounds variable so that we write CF-compliant files.

    Parameters
    ----------
    variables : dict-like
        Mapping from variable name to variable. Each value must expose
        ``attrs``, ``encoding`` and ``dtype``. The ``encoding`` dict of
        every referenced bounds variable is updated in place; nothing is
        returned.

    Warns
    -----
    UserWarning
        If a datetime-like variable references an existing bounds
        variable but has no date-like ``units`` entry in its encoding.

    See Also:

    http://cfconventions.org/Data/cf-conventions/cf-conventions-1.7/
         cf-conventions.html#cell-boundaries

    https://github.com/pydata/xarray/issues/2565
    """

    # Iterate over (name, variable) pairs: the mapping key is the only
    # reliable source for the variable's name, since not every variable
    # object carries a ``name`` attribute.
    for name, v in variables.items():
        attrs = v.attrs
        encoding = v.encoding

        # Only variables pointing at a bounds variable that is actually
        # present can trigger either the copy or the warning below.
        bounds_name = attrs.get('bounds')
        if bounds_name is None or bounds_name not in variables:
            continue

        has_date_units = 'units' in encoding and 'since' in encoding['units']

        if has_date_units:
            # setdefault: never override an encoding explicitly set on the
            # bounds variable (CF non-compliant, but kept for roundtrip).
            bounds_encoding = variables[bounds_name].encoding
            bounds_encoding.setdefault('units', encoding['units'])
            if 'calendar' in encoding:
                bounds_encoding.setdefault('calendar',
                                           encoding['calendar'])
        elif (np.issubdtype(v.dtype, np.datetime64) or
              contains_cftime_datetimes(v)):
            warnings.warn("Variable '{0}' has datetime type and a "
                          "bounds variable but {0}.encoding does not have "
                          "units specified. The units encodings for '{0}' "
                          "and '{1}' will be determined independently "
                          "and may not be equal, counter to CF-conventions. "
                          "If this is a concern, specify a units encoding for "
                          "'{0}' before writing to a file."
                          .format(name, bounds_name),
                          UserWarning)
402+
403+
358404
def decode_cf_variables(variables, attributes, concat_characters=True,
359405
mask_and_scale=True, decode_times=True,
360406
decode_coords=True, drop_variables=None,
@@ -492,8 +538,6 @@ def cf_decoder(variables, attributes,
492538
"""
493539
Decode a set of CF encoded variables and attributes.
494540
495-
See Also, decode_cf_variable
496-
497541
Parameters
498542
----------
499543
variables : dict
@@ -515,6 +559,10 @@ def cf_decoder(variables, attributes,
515559
A dictionary mapping from variable name to xarray.Variable objects.
516560
decoded_attributes : dict
517561
A dictionary mapping from attribute name to values.
562+
563+
See also
564+
--------
565+
decode_cf_variable
518566
"""
519567
variables, attributes, _ = decode_cf_variables(
520568
variables, attributes, concat_characters, mask_and_scale, decode_times)
@@ -595,14 +643,12 @@ def encode_dataset_coordinates(dataset):
595643

596644
def cf_encoder(variables, attributes):
597645
"""
598-
A function which takes a dicts of variables and attributes
599-
and encodes them to conform to CF conventions as much
600-
as possible. This includes masking, scaling, character
601-
array handling, and CF-time encoding.
602-
603-
Decode a set of CF encoded variables and attributes.
646+
Encode a set of variables and attributes following CF conventions.
647+
Takes dicts of variables and attributes and encodes them
648+
to conform to CF conventions as much as possible.
649+
This includes masking, scaling, character array handling,
650+
and CF-time encoding.
604651
605-
See Also, decode_cf_variable
606652
607653
Parameters
608654
----------
@@ -618,8 +664,27 @@ def cf_encoder(variables, attributes):
618664
encoded_attributes : dict
619665
A dictionary mapping from attribute name to value
620666
621-
See also: encode_cf_variable
667+
See also
668+
--------
669+
decode_cf_variable, encode_cf_variable
622670
"""
671+
672+
# add encoding for time bounds variables if present.
673+
_update_bounds_encoding(variables)
674+
623675
new_vars = OrderedDict((k, encode_cf_variable(v, name=k))
624676
for k, v in variables.items())
677+
678+
# Remove attrs from bounds variables (issue #2921)
679+
for var in new_vars.values():
680+
bounds = var.attrs['bounds'] if 'bounds' in var.attrs else None
681+
if bounds and bounds in new_vars:
682+
# see http://cfconventions.org/cf-conventions/cf-conventions.html#cell-boundaries # noqa
683+
for attr in ['units', 'standard_name', 'axis', 'positive',
684+
'calendar', 'long_name', 'leap_month', 'leap_year',
685+
'month_lengths']:
686+
if attr in new_vars[bounds].attrs and attr in var.attrs:
687+
if new_vars[bounds].attrs[attr] == var.attrs[attr]:
688+
new_vars[bounds].attrs.pop(attr)
689+
625690
return new_vars, attributes

xarray/tests/test_backends.py

+21
Original file line numberDiff line numberDiff line change
@@ -2486,6 +2486,27 @@ def test_attrs_mfdataset(self):
24862486
'no attribute'):
24872487
actual.test2
24882488

2489+
@pytest.mark.xfail(reason='mfdataset loses encoding currently.')
def test_encoding_mfdataset(self):
    """The combined dataset should carry the time encoding of the first file.

    Two halves of one dataset are written with different ``units``
    encodings on ``t``; after ``open_mfdataset`` the units are expected to
    match the first file (and the original), not the second.
    """
    times = pd.date_range(start='2010-01-01', periods=10, freq='1D')
    original = Dataset({'foo': ('t', np.random.randn(10)),
                        't': ('t', times)})
    original.t.encoding['units'] = 'days since 2010-01-01'

    with create_tmp_file() as tmp1, create_tmp_file() as tmp2:
        # Split along ``t`` and give each half its own reference date.
        ds1 = original.isel(t=slice(5))
        ds2 = original.isel(t=slice(5, 10))
        ds1.t.encoding['units'] = 'days since 2010-01-01'
        ds2.t.encoding['units'] = 'days since 2000-01-01'
        ds1.to_netcdf(tmp1)
        ds2.to_netcdf(tmp2)

        with open_mfdataset([tmp1, tmp2]) as actual:
            actual_units = actual.t.encoding['units']
            assert actual_units == original.t.encoding['units']
            assert actual_units == ds1.t.encoding['units']
            assert actual_units != ds2.t.encoding['units']
2509+
24892510
def test_preprocess_mfdataset(self):
24902511
original = Dataset({'foo': ('x', np.random.randn(10))})
24912512
with create_tmp_file() as tmp:

xarray/tests/test_coding_times.py

+47-2
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,11 @@
55
import pandas as pd
66
import pytest
77

8-
from xarray import DataArray, Variable, coding, decode_cf
8+
from xarray import DataArray, Dataset, Variable, coding, decode_cf
99
from xarray.coding.times import (
1010
_import_cftime, cftime_to_nptime, decode_cf_datetime, encode_cf_datetime)
1111
from xarray.coding.variables import SerializationWarning
12-
from xarray.conventions import _update_bounds_attributes
12+
from xarray.conventions import _update_bounds_attributes, cf_encoder
1313
from xarray.core.common import contains_cftime_datetimes
1414
from xarray.testing import assert_equal
1515

@@ -671,6 +671,51 @@ def test_decode_cf_time_bounds():
671671
_update_bounds_attributes(ds.variables)
672672

673673

674+
@requires_cftime_or_netCDF4
def test_encode_time_bounds():
    """Check how ``cf_encoder`` treats a time variable and its bounds.

    Covers four situations in sequence: bounds without their own encoding,
    bounds with encoding equal to the time variable's, bounds with a
    different (CF non-compliant) encoding, and a time variable with no
    encoding at all, which must emit a ``UserWarning``.
    """
    time_units = 'days since 2000-01-01'

    ds = Dataset(dict(
        time=pd.date_range('2000-01-16', periods=1),
        time_bounds=pd.date_range('2000-01-01', periods=2, freq='MS')))
    ds.time.attrs = {'bounds': 'time_bounds'}
    ds.time.encoding = {'calendar': 'noleap', 'units': time_units}

    expected_bounds = Variable(data=np.array([0, 31]),
                               dims=['time_bounds'])

    # Bounds without encoding of their own.
    encoded, _ = cf_encoder(ds.variables, ds.attrs)
    encoded_bounds = encoded['time_bounds']
    assert_equal(encoded_bounds, expected_bounds)
    assert 'calendar' not in encoded_bounds.attrs
    assert 'units' not in encoded_bounds.attrs

    # if time_bounds attrs are same as time attrs, it doesn't matter
    ds.time_bounds.encoding = {'calendar': 'noleap', 'units': time_units}
    encoded, _ = cf_encoder({k: ds[k] for k in ds.variables}, ds.attrs)
    encoded_bounds = encoded['time_bounds']
    assert_equal(encoded_bounds, expected_bounds)
    assert 'calendar' not in encoded_bounds.attrs
    assert 'units' not in encoded_bounds.attrs

    # for CF-noncompliant case of time_bounds attrs being different from
    # time attrs; preserve them for faithful roundtrip
    ds.time_bounds.encoding = {'calendar': 'noleap',
                               'units': 'days since 1849-01-01'}
    encoded, _ = cf_encoder({k: ds[k] for k in ds.variables}, ds.attrs)
    encoded_bounds = encoded['time_bounds']
    with pytest.raises(AssertionError):
        assert_equal(encoded_bounds, expected_bounds)
    assert 'calendar' not in encoded_bounds.attrs
    assert encoded_bounds.attrs['units'] == ds.time_bounds.encoding['units']

    # A time variable without any encoding must trigger a warning.
    ds.time.encoding = {}
    with pytest.warns(UserWarning):
        cf_encoder(ds.variables, ds.attrs)
717+
718+
674719
@pytest.fixture(params=_ALL_CALENDARS)
def calendar(request):
    """Yield each entry of ``_ALL_CALENDARS`` as a parametrized fixture."""
    selected = request.param
    return selected

0 commit comments

Comments
 (0)