Skip to content
forked from pydata/xarray

Commit 2ee89c3

Browse files
authored
Respect user-specified coordinates attribute. (pydata#3487)
Respect user-specified coordinates attribute. (pydata#3487) * Respect user-specified coordinates attribute. * Add whats-new * Better if statement. * maybe it's better to not raise an error if "coordinates" in attrs. * tweak whats-new. * more thorough test. * Emit one "coordinates" warning per dataset, instead of one per variable. Also add stacklevel * Preserve attrs["coordinates"] when roundtripping with decode_coords=False * Avoid raising warnings. * fix whats-new * [minor] add comments * fix whats-new * Actually test global "coordinates" handling. * filterwarning not necessary. * Add comment * fix whats-new
1 parent bcf0d61 commit 2ee89c3

File tree

5 files changed

+93
-17
lines changed

5 files changed

+93
-17
lines changed

doc/io.rst

+15-1
Original file line numberDiff line numberDiff line change
@@ -437,9 +437,23 @@ like ``'days'`` for ``timedelta64`` data. ``calendar`` should be one of the cale
437437
supported by netCDF4-python: 'standard', 'gregorian', 'proleptic_gregorian' 'noleap',
438438
'365_day', '360_day', 'julian', 'all_leap', '366_day'.
439439

440-
By default, xarray uses the 'proleptic_gregorian' calendar and units of the smallest time
440+
By default, xarray uses the ``'proleptic_gregorian'`` calendar and units of the smallest time
441441
difference between values, with a reference time of the first time value.
442442

443+
444+
.. _io.coordinates:
445+
446+
Coordinates
447+
...........
448+
449+
You can control the ``coordinates`` attribute written to disk by specifying ``DataArray.encoding["coordinates"]``.
450+
If not specified, xarray automatically sets ``DataArray.encoding["coordinates"]`` to a space-delimited list
451+
of names of coordinate variables that share dimensions with the ``DataArray`` being written.
452+
This allows perfect roundtripping of xarray datasets but may not be desirable.
453+
When an xarray ``Dataset`` contains non-dimensional coordinates that do not share dimensions with any of
454+
the variables, these coordinate variable names are saved under a "global" ``"coordinates"`` attribute.
455+
This is not CF-compliant but again facilitates roundtripping of xarray datasets.
456+
443457
Invalid netCDF files
444458
~~~~~~~~~~~~~~~~~~~~
445459

doc/whats-new.rst

+3
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,9 @@ New Features
148148
invoked. (:issue:`3378`, :pull:`3446`, :pull:`3515`)
149149
By `Deepak Cherian <https://github.com/dcherian>`_ and
150150
`Guido Imperiale <https://github.com/crusaderky>`_.
151+
- xarray now respects the ``DataArray.encoding["coordinates"]`` attribute when writing to disk.
152+
See :ref:`io.coordinates` for more. (:issue:`3351`, :pull:`3487`)
153+
By `Deepak Cherian <https://github.com/dcherian>`_.
151154
- Add the documented-but-missing :py:meth:`DatasetGroupBy.quantile`.
152155
(:issue:`3525`, :pull:`3527`). By `Justus Magin <https://github.com/keewis>`_.
153156

xarray/conventions.py

+26-14
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import pandas as pd
66

77
from .coding import strings, times, variables
8-
from .coding.variables import SerializationWarning
8+
from .coding.variables import SerializationWarning, pop_to
99
from .core import duck_array_ops, indexing
1010
from .core.common import contains_cftime_datetimes
1111
from .core.pycompat import dask_array_type
@@ -660,34 +660,46 @@ def _encode_coordinates(variables, attributes, non_dim_coord_names):
660660
and set(target_dims) <= set(v.dims)
661661
):
662662
variable_coordinates[k].add(coord_name)
663-
global_coordinates.discard(coord_name)
664663

665664
variables = {k: v.copy(deep=False) for k, v in variables.items()}
666665

667-
# These coordinates are saved according to CF conventions
668-
for var_name, coord_names in variable_coordinates.items():
669-
attrs = variables[var_name].attrs
670-
if "coordinates" in attrs:
666+
# keep track of variable names written to file under the "coordinates" attributes
667+
written_coords = set()
668+
for name, var in variables.items():
669+
encoding = var.encoding
670+
attrs = var.attrs
671+
if "coordinates" in attrs and "coordinates" in encoding:
671672
raise ValueError(
672-
"cannot serialize coordinates because variable "
673-
"%s already has an attribute 'coordinates'" % var_name
673+
f"'coordinates' found in both attrs and encoding for variable {name!r}."
674674
)
675-
attrs["coordinates"] = " ".join(map(str, coord_names))
675+
676+
# this will copy coordinates from encoding to attrs if "coordinates" in attrs
677+
# after the next line, "coordinates" is never in encoding
678+
# we get support for attrs["coordinates"] for free.
679+
coords_str = pop_to(encoding, attrs, "coordinates")
680+
if not coords_str and variable_coordinates[name]:
681+
attrs["coordinates"] = " ".join(map(str, variable_coordinates[name]))
682+
if "coordinates" in attrs:
683+
written_coords.update(attrs["coordinates"].split())
676684

677685
# These coordinates are not associated with any particular variables, so we
678686
# save them under a global 'coordinates' attribute so xarray can roundtrip
679687
# the dataset faithfully. Because this serialization goes beyond CF
680688
# conventions, only do it if necessary.
681689
# Reference discussion:
682-
# http://mailman.cgd.ucar.edu/pipermail/cf-metadata/2014/057771.html
690+
# http://mailman.cgd.ucar.edu/pipermail/cf-metadata/2014/007571.html
691+
global_coordinates.difference_update(written_coords)
683692
if global_coordinates:
684693
attributes = dict(attributes)
685694
if "coordinates" in attributes:
686-
raise ValueError(
687-
"cannot serialize coordinates because the global "
688-
"attribute 'coordinates' already exists"
695+
warnings.warn(
696+
f"cannot serialize global coordinates {global_coordinates!r} because the global "
697+
f"attribute 'coordinates' already exists. This may prevent faithful roundtripping"
698+
f"of xarray datasets",
699+
SerializationWarning,
689700
)
690-
attributes["coordinates"] = " ".join(map(str, global_coordinates))
701+
else:
702+
attributes["coordinates"] = " ".join(map(str, global_coordinates))
691703

692704
return variables, attributes
693705

xarray/tests/test_backends.py

+35-2
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
from xarray.backends.netCDF4_ import _extract_nc4_variable_encoding
3434
from xarray.backends.pydap_ import PydapDataStore
3535
from xarray.coding.variables import SerializationWarning
36+
from xarray.conventions import encode_dataset_coordinates
3637
from xarray.core import indexing
3738
from xarray.core.options import set_options
3839
from xarray.core.pycompat import dask_array_type
@@ -522,15 +523,35 @@ def test_roundtrip_coordinates(self):
522523
with self.roundtrip(original) as actual:
523524
assert_identical(original, actual)
524525

526+
original["foo"].encoding["coordinates"] = "y"
527+
with self.roundtrip(original, open_kwargs={"decode_coords": False}) as expected:
528+
# check roundtripping when decode_coords=False
529+
with self.roundtrip(
530+
expected, open_kwargs={"decode_coords": False}
531+
) as actual:
532+
assert_identical(expected, actual)
533+
525534
def test_roundtrip_global_coordinates(self):
526-
original = Dataset({"x": [2, 3], "y": ("a", [42]), "z": ("x", [4, 5])})
535+
original = Dataset(
536+
{"foo": ("x", [0, 1])}, {"x": [2, 3], "y": ("a", [42]), "z": ("x", [4, 5])}
537+
)
527538
with self.roundtrip(original) as actual:
528539
assert_identical(original, actual)
529540

541+
# test that global "coordinates" is as expected
542+
_, attrs = encode_dataset_coordinates(original)
543+
assert attrs["coordinates"] == "y"
544+
545+
# test warning when global "coordinates" is already set
546+
original.attrs["coordinates"] = "foo"
547+
with pytest.warns(SerializationWarning):
548+
_, attrs = encode_dataset_coordinates(original)
549+
assert attrs["coordinates"] == "foo"
550+
530551
def test_roundtrip_coordinates_with_space(self):
531552
original = Dataset(coords={"x": 0, "y z": 1})
532553
expected = Dataset({"y z": 1}, {"x": 0})
533-
with pytest.warns(xr.SerializationWarning):
554+
with pytest.warns(SerializationWarning):
534555
with self.roundtrip(original) as actual:
535556
assert_identical(expected, actual)
536557

@@ -810,6 +831,18 @@ def equals_latlon(obj):
810831
assert "coordinates" not in ds["lat"].attrs
811832
assert "coordinates" not in ds["lon"].attrs
812833

834+
original["temp"].encoding["coordinates"] = "lat"
835+
with self.roundtrip(original) as actual:
836+
assert_identical(actual, original)
837+
original["precip"].encoding["coordinates"] = "lat"
838+
with create_tmp_file() as tmp_file:
839+
original.to_netcdf(tmp_file)
840+
with open_dataset(tmp_file, decode_coords=True) as ds:
841+
assert "lon" not in ds["temp"].encoding["coordinates"]
842+
assert "lon" not in ds["precip"].encoding["coordinates"]
843+
assert "coordinates" not in ds["lat"].encoding
844+
assert "coordinates" not in ds["lon"].encoding
845+
813846
def test_roundtrip_endian(self):
814847
ds = Dataset(
815848
{

xarray/tests/test_conventions.py

+14
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,20 @@ def test_multidimensional_coordinates(self):
136136
# Should not have any global coordinates.
137137
assert "coordinates" not in attrs
138138

139+
def test_do_not_overwrite_user_coordinates(self):
140+
orig = Dataset(
141+
coords={"x": [0, 1, 2], "y": ("x", [5, 6, 7]), "z": ("x", [8, 9, 10])},
142+
data_vars={"a": ("x", [1, 2, 3]), "b": ("x", [3, 5, 6])},
143+
)
144+
orig["a"].encoding["coordinates"] = "y"
145+
orig["b"].encoding["coordinates"] = "z"
146+
enc, _ = conventions.encode_dataset_coordinates(orig)
147+
assert enc["a"].attrs["coordinates"] == "y"
148+
assert enc["b"].attrs["coordinates"] == "z"
149+
orig["a"].attrs["coordinates"] = "foo"
150+
with raises_regex(ValueError, "'coordinates' found in both attrs"):
151+
conventions.encode_dataset_coordinates(orig)
152+
139153
@requires_dask
140154
def test_string_object_warning(self):
141155
original = Variable(("x",), np.array(["foo", "bar"], dtype=object)).chunk()

0 commit comments

Comments
 (0)