Skip to content

Commit f0ade3d

Browse files
slevangmax-sixty
andauthored
Automatic region detection and transpose for to_zarr() (#8434)
* implement auto region and transpose * fix validation * support str auto, Dataset doc string, and user-guide example * add whats new entry * Update doc/user-guide/io.rst Co-authored-by: Maximilian Roos <[email protected]> * drop indices and test that they are not written * test that auto append fails * more concise indexes detection * fix typing --------- Co-authored-by: Maximilian Roos <[email protected]>
1 parent 28053ba commit f0ade3d

File tree

6 files changed

+310
-23
lines changed

6 files changed

+310
-23
lines changed

doc/user-guide/io.rst

+8-5
Original file line numberDiff line numberDiff line change
@@ -876,17 +876,20 @@ and then calling ``to_zarr`` with ``compute=False`` to write only metadata
876876
ds.to_zarr(path, compute=False)
877877
878878
Now, a Zarr store with the correct variable shapes and attributes exists that
879-
can be filled out by subsequent calls to ``to_zarr``. The ``region`` provides a
880-
mapping from dimension names to Python ``slice`` objects indicating where the
881-
data should be written (in index space, not coordinate space), e.g.,
879+
can be filled out by subsequent calls to ``to_zarr``. ``region`` can be
880+
specified as ``"auto"``, which opens the existing store and determines the
881+
correct alignment of the new data with the existing coordinates, or as an
882+
explicit mapping from dimension names to Python ``slice`` objects indicating
883+
where the data should be written (in index space, not label space), e.g.,
882884

883885
.. ipython:: python
884886
885887
# For convenience, we'll slice a single dataset, but in the real use-case
886888
# we would create them separately possibly even from separate processes.
887889
ds = xr.Dataset({"foo": ("x", np.arange(30))})
888-
ds.isel(x=slice(0, 10)).to_zarr(path, region={"x": slice(0, 10)})
889-
ds.isel(x=slice(10, 20)).to_zarr(path, region={"x": slice(10, 20)})
890+
# Any of the following region specifications are valid
891+
ds.isel(x=slice(0, 10)).to_zarr(path, region="auto")
892+
ds.isel(x=slice(10, 20)).to_zarr(path, region={"x": "auto"})
890893
ds.isel(x=slice(20, 30)).to_zarr(path, region={"x": slice(20, 30)})
891894
892895
Concurrent writes with ``region`` are safe as long as they modify distinct

doc/whats-new.rst

+4
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,10 @@ New Features
2626
By `Deepak Cherian <https://github.com/dcherian>`_. (:issue:`7764`, :pull:`8373`).
2727
- Add ``DataArray.dt.total_seconds()`` method to match the Pandas API. (:pull:`8435`).
2828
By `Ben Mares <https://github.com/maresb>`_.
29+
- Allow passing ``region="auto"`` in :py:meth:`Dataset.to_zarr` to automatically infer the
30+
region to write in the original store. Also implement automatic transpose when dimension
31+
order does not match the original store. (:issue:`7702`, :issue:`8421`, :pull:`8434`).
32+
By `Sam Levang <https://github.com/slevang>`_.
2933

3034
Breaking changes
3135
~~~~~~~~~~~~~~~~

xarray/backends/api.py

+74-5
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
_normalize_path,
2828
)
2929
from xarray.backends.locks import _get_scheduler
30+
from xarray.backends.zarr import open_zarr
3031
from xarray.core import indexing
3132
from xarray.core.combine import (
3233
_infer_concat_order_from_positions,
@@ -1443,10 +1444,63 @@ def save_mfdataset(
14431444
)
14441445

14451446

1446-
def _validate_region(ds, region):
1447+
def _auto_detect_region(ds_new, ds_orig, dim):
1448+
# Create a mapping array of coordinates to indices on the original array
1449+
coord = ds_orig[dim]
1450+
da_map = DataArray(np.arange(coord.size), coords={dim: coord})
1451+
1452+
try:
1453+
da_idxs = da_map.sel({dim: ds_new[dim]})
1454+
except KeyError as e:
1455+
if "not all values found" in str(e):
1456+
raise KeyError(
1457+
f"Not all values of coordinate '{dim}' in the new array were"
1458+
" found in the original store. Writing to a zarr region slice"
1459+
" requires that no dimensions or metadata are changed by the write."
1460+
)
1461+
else:
1462+
raise e
1463+
1464+
if (da_idxs.diff(dim) != 1).any():
1465+
raise ValueError(
1466+
f"The auto-detected region of coordinate '{dim}' for writing new data"
1467+
" to the original store had non-contiguous indices. Writing to a zarr"
1468+
" region slice requires that the new data constitute a contiguous subset"
1469+
" of the original store."
1470+
)
1471+
1472+
dim_slice = slice(da_idxs.values[0], da_idxs.values[-1] + 1)
1473+
1474+
return dim_slice
1475+
1476+
1477+
def _auto_detect_regions(ds, region, open_kwargs):
1478+
ds_original = open_zarr(**open_kwargs)
1479+
for key, val in region.items():
1480+
if val == "auto":
1481+
region[key] = _auto_detect_region(ds, ds_original, key)
1482+
return region
1483+
1484+
1485+
def _validate_and_autodetect_region(
1486+
ds, region, mode, open_kwargs
1487+
) -> tuple[dict[str, slice], bool]:
1488+
if region == "auto":
1489+
region = {dim: "auto" for dim in ds.dims}
1490+
14471491
if not isinstance(region, dict):
14481492
raise TypeError(f"``region`` must be a dict, got {type(region)}")
14491493

1494+
if any(v == "auto" for v in region.values()):
1495+
region_was_autodetected = True
1496+
if mode != "r+":
1497+
raise ValueError(
1498+
f"``mode`` must be 'r+' when using ``region='auto'``, got {mode}"
1499+
)
1500+
region = _auto_detect_regions(ds, region, open_kwargs)
1501+
else:
1502+
region_was_autodetected = False
1503+
14501504
for k, v in region.items():
14511505
if k not in ds.dims:
14521506
raise ValueError(
@@ -1478,6 +1532,8 @@ def _validate_region(ds, region):
14781532
f".drop_vars({non_matching_vars!r})"
14791533
)
14801534

1535+
return region, region_was_autodetected
1536+
14811537

14821538
def _validate_datatypes_for_zarr_append(zstore, dataset):
14831539
"""If variable exists in the store, confirm dtype of the data to append is compatible with
@@ -1529,7 +1585,7 @@ def to_zarr(
15291585
compute: Literal[True] = True,
15301586
consolidated: bool | None = None,
15311587
append_dim: Hashable | None = None,
1532-
region: Mapping[str, slice] | None = None,
1588+
region: Mapping[str, slice | Literal["auto"]] | Literal["auto"] | None = None,
15331589
safe_chunks: bool = True,
15341590
storage_options: dict[str, str] | None = None,
15351591
zarr_version: int | None = None,
@@ -1553,7 +1609,7 @@ def to_zarr(
15531609
compute: Literal[False],
15541610
consolidated: bool | None = None,
15551611
append_dim: Hashable | None = None,
1556-
region: Mapping[str, slice] | None = None,
1612+
region: Mapping[str, slice | Literal["auto"]] | Literal["auto"] | None = None,
15571613
safe_chunks: bool = True,
15581614
storage_options: dict[str, str] | None = None,
15591615
zarr_version: int | None = None,
@@ -1575,7 +1631,7 @@ def to_zarr(
15751631
compute: bool = True,
15761632
consolidated: bool | None = None,
15771633
append_dim: Hashable | None = None,
1578-
region: Mapping[str, slice] | None = None,
1634+
region: Mapping[str, slice | Literal["auto"]] | Literal["auto"] | None = None,
15791635
safe_chunks: bool = True,
15801636
storage_options: dict[str, str] | None = None,
15811637
zarr_version: int | None = None,
@@ -1640,7 +1696,20 @@ def to_zarr(
16401696
_validate_dataset_names(dataset)
16411697

16421698
if region is not None:
1643-
_validate_region(dataset, region)
1699+
open_kwargs = dict(
1700+
store=store,
1701+
synchronizer=synchronizer,
1702+
group=group,
1703+
consolidated=consolidated,
1704+
storage_options=storage_options,
1705+
zarr_version=zarr_version,
1706+
)
1707+
region, region_was_autodetected = _validate_and_autodetect_region(
1708+
dataset, region, mode, open_kwargs
1709+
)
1710+
# drop indices to avoid potential race condition with auto region
1711+
if region_was_autodetected:
1712+
dataset = dataset.drop_vars(dataset.indexes)
16441713
if append_dim is not None and append_dim in region:
16451714
raise ValueError(
16461715
f"cannot list the same dimension in both ``append_dim`` and "

xarray/backends/zarr.py

+19-9
Original file line numberDiff line numberDiff line change
@@ -320,14 +320,19 @@ def encode_zarr_variable(var, needs_copy=True, name=None):
320320
return var
321321

322322

323-
def _validate_existing_dims(var_name, new_var, existing_var, region, append_dim):
323+
def _validate_and_transpose_existing_dims(
324+
var_name, new_var, existing_var, region, append_dim
325+
):
324326
if new_var.dims != existing_var.dims:
325-
raise ValueError(
326-
f"variable {var_name!r} already exists with different "
327-
f"dimension names {existing_var.dims} != "
328-
f"{new_var.dims}, but changing variable "
329-
f"dimensions is not supported by to_zarr()."
330-
)
327+
if set(existing_var.dims) == set(new_var.dims):
328+
new_var = new_var.transpose(*existing_var.dims)
329+
else:
330+
raise ValueError(
331+
f"variable {var_name!r} already exists with different "
332+
f"dimension names {existing_var.dims} != "
333+
f"{new_var.dims}, but changing variable "
334+
f"dimensions is not supported by to_zarr()."
335+
)
331336

332337
existing_sizes = {}
333338
for dim, size in existing_var.sizes.items():
@@ -344,9 +349,14 @@ def _validate_existing_dims(var_name, new_var, existing_var, region, append_dim)
344349
f"variable {var_name!r} already exists with different "
345350
f"dimension sizes: {existing_sizes} != {new_sizes}. "
346351
f"to_zarr() only supports changing dimension sizes when "
347-
f"explicitly appending, but append_dim={append_dim!r}."
352+
f"explicitly appending, but append_dim={append_dim!r}. "
353+
f"If you are attempting to write to a subset of the "
354+
f"existing store without changing dimension sizes, "
355+
f"consider using the region argument in to_zarr()."
348356
)
349357

358+
return new_var
359+
350360

351361
def _put_attrs(zarr_obj, attrs):
352362
"""Raise a more informative error message for invalid attrs."""
@@ -616,7 +626,7 @@ def store(
616626
for var_name in existing_variable_names:
617627
new_var = variables_encoded[var_name]
618628
existing_var = existing_vars[var_name]
619-
_validate_existing_dims(
629+
new_var = _validate_and_transpose_existing_dims(
620630
var_name,
621631
new_var,
622632
existing_var,

xarray/core/dataset.py

+10-4
Original file line numberDiff line numberDiff line change
@@ -2305,7 +2305,7 @@ def to_zarr(
23052305
compute: Literal[True] = True,
23062306
consolidated: bool | None = None,
23072307
append_dim: Hashable | None = None,
2308-
region: Mapping[str, slice] | None = None,
2308+
region: Mapping[str, slice | Literal["auto"]] | Literal["auto"] | None = None,
23092309
safe_chunks: bool = True,
23102310
storage_options: dict[str, str] | None = None,
23112311
zarr_version: int | None = None,
@@ -2328,7 +2328,7 @@ def to_zarr(
23282328
compute: Literal[False],
23292329
consolidated: bool | None = None,
23302330
append_dim: Hashable | None = None,
2331-
region: Mapping[str, slice] | None = None,
2331+
region: Mapping[str, slice | Literal["auto"]] | Literal["auto"] | None = None,
23322332
safe_chunks: bool = True,
23332333
storage_options: dict[str, str] | None = None,
23342334
zarr_version: int | None = None,
@@ -2349,7 +2349,7 @@ def to_zarr(
23492349
compute: bool = True,
23502350
consolidated: bool | None = None,
23512351
append_dim: Hashable | None = None,
2352-
region: Mapping[str, slice] | None = None,
2352+
region: Mapping[str, slice | Literal["auto"]] | Literal["auto"] | None = None,
23532353
safe_chunks: bool = True,
23542354
storage_options: dict[str, str] | None = None,
23552355
zarr_version: int | None = None,
@@ -2411,14 +2411,20 @@ def to_zarr(
24112411
append_dim : hashable, optional
24122412
If set, the dimension along which the data will be appended. All
24132413
other dimensions on overridden variables must remain the same size.
2414-
region : dict, optional
2414+
region : dict or "auto", optional
24152415
Optional mapping from dimension names to integer slices along
24162416
dataset dimensions to indicate the region of existing zarr array(s)
24172417
in which to write this dataset's data. For example,
24182418
``{'x': slice(0, 1000), 'y': slice(10000, 11000)}`` would indicate
24192419
that values should be written to the region ``0:1000`` along ``x``
24202420
and ``10000:11000`` along ``y``.
24212421
2422+
Can also specify ``"auto"``, in which case the existing store will be
2423+
opened and the region inferred by matching the new data's coordinates.
2424+
``"auto"`` can be used as a single string, which will automatically infer
2425+
the region for all dimensions, or as dictionary values for specific
2426+
dimensions mixed together with explicit slices for other dimensions.
2427+
24222428
Two restrictions apply to the use of ``region``:
24232429
24242430
- If ``region`` is set, _all_ variables in a dataset must have at

0 commit comments

Comments
 (0)