Skip to content

Commit e510a9e

Browse files
authored
Compatibility with Zarr v3b2 (#9795)
* Compatibility with Zarr v3b2
* More guards with mode="w"
* refactoring
* tweak expected requests
* compat
* more compat
* fix
1 parent 6942d68 commit e510a9e

File tree

3 files changed

+138
-118
lines changed

3 files changed

+138
-118
lines changed

xarray/backends/zarr.py

Lines changed: 102 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
from xarray.namedarray.utils import module_available
3838

3939
if TYPE_CHECKING:
40+
from zarr import Array as ZarrArray
4041
from zarr import Group as ZarrGroup
4142

4243
from xarray.backends.common import AbstractDataStore
@@ -443,7 +444,7 @@ def extract_zarr_variable_encoding(
443444
shape = shape if shape else variable.shape
444445
encoding = variable.encoding.copy()
445446

446-
safe_to_drop = {"source", "original_shape"}
447+
safe_to_drop = {"source", "original_shape", "preferred_chunks"}
447448
valid_encodings = {
448449
"codecs",
449450
"chunks",
@@ -871,16 +872,27 @@ def store(
871872
else:
872873
zarr = attempt_import("zarr")
873874

874-
existing_keys = tuple(self.zarr_group.array_keys())
875+
if self._mode == "w":
876+
# always overwrite, so we don't care about existing names,
877+
# and consistency of encoding
878+
new_variable_names = set(variables)
879+
existing_keys = {}
880+
existing_variable_names = {}
881+
else:
882+
existing_keys = tuple(self.zarr_group.array_keys())
883+
existing_variable_names = {
884+
vn for vn in variables if _encode_variable_name(vn) in existing_keys
885+
}
886+
new_variable_names = set(variables) - existing_variable_names
875887

876-
if self._mode == "r+":
877-
new_names = [k for k in variables if k not in existing_keys]
878-
if new_names:
879-
raise ValueError(
880-
f"dataset contains non-pre-existing variables {new_names}, "
881-
"which is not allowed in ``xarray.Dataset.to_zarr()`` with "
882-
"``mode='r+'``. To allow writing new variables, set ``mode='a'``."
883-
)
888+
if self._mode == "r+" and (
889+
new_names := [k for k in variables if k not in existing_keys]
890+
):
891+
raise ValueError(
892+
f"dataset contains non-pre-existing variables {new_names!r}, "
893+
"which is not allowed in ``xarray.Dataset.to_zarr()`` with "
894+
"``mode='r+'``. To allow writing new variables, set ``mode='a'``."
895+
)
884896

885897
if self._append_dim is not None and self._append_dim not in existing_keys:
886898
# For dimensions without coordinate values, we must parse
@@ -895,10 +907,6 @@ def store(
895907
f"dataset dimensions {existing_dims}"
896908
)
897909

898-
existing_variable_names = {
899-
vn for vn in variables if _encode_variable_name(vn) in existing_keys
900-
}
901-
new_variable_names = set(variables) - existing_variable_names
902910
variables_encoded, attributes = self.encode(
903911
{vn: variables[vn] for vn in new_variable_names}, attributes
904912
)
@@ -920,10 +928,9 @@ def store(
920928
# Modified variables must use the same encoding as the store.
921929
vars_with_encoding = {}
922930
for vn in existing_variable_names:
923-
if self._mode in ["a", "a-", "r+"]:
924-
_validate_datatypes_for_zarr_append(
925-
vn, existing_vars[vn], variables[vn]
926-
)
931+
_validate_datatypes_for_zarr_append(
932+
vn, existing_vars[vn], variables[vn]
933+
)
927934
vars_with_encoding[vn] = variables[vn].copy(deep=False)
928935
vars_with_encoding[vn].encoding = existing_vars[vn].encoding
929936
vars_with_encoding, _ = self.encode(vars_with_encoding, {})
@@ -968,6 +975,69 @@ def store(
968975
def sync(self):
969976
pass
970977

978+
def _open_existing_array(self, *, name) -> ZarrArray:
    """Return the array called ``name`` that already exists in the group.

    When ``self._write_empty`` is ``None`` the array is looked up directly
    on ``self.zarr_group``. Otherwise it is re-opened through ``zarr.open``
    so that ``write_empty_chunks`` can be passed along.
    """
    import zarr

    # TODO: if mode="a", consider overriding the existing variable
    # metadata. This would need some case work properly with region
    # and append_dim.
    if self._write_empty is None:
        return self.zarr_group[name]

    # Target zarr_group.chunk_store rather than zarr_group.store (zarr v2);
    # see https://github.com/pydata/xarray/pull/8326#discussion_r1365311316
    # for the longer explanation. In short:
    #   * open_consolidated() enforces a mode of r or r+ (and to_zarr with
    #     a region enforces r+), and yields a Group whose store is a
    #     ConsolidatedMetadataStore and whose chunk_store is a 'normal'
    #     Store subtype. Its exact type depends on whether a local path or
    #     some kind of URL was used; what matters is that it is not a
    #     read-only ConsolidatedMetadataStore.
    #   * Writing chunk data there is safe because to_zarr with the region
    #     parameter changes no metadata:
    #       - r+ forbids adding new variables (also checked and enforced
    #         in xarray.backends.api.py::to_zarr()).
    #       - Existing variables already have their attrs in the
    #         consolidated metadata file.
    #       - Dimensions cannot grow; that needs `append_dim`, which is
    #         mutually exclusive with `region`.
    if _zarr_v3():
        target_store = self.zarr_group.store
    else:
        target_store = self.zarr_group.chunk_store

    # TODO: see if zarr should normalize these strings.
    group_prefix = self.zarr_group.name.rstrip("/")
    array_path = "/".join([group_prefix, name]).lstrip("/")

    return zarr.open(
        store=target_store,
        path=array_path,
        write_empty_chunks=self._write_empty,
    )
1012+
1013+
def _create_new_array(
    self, *, name, shape, dtype, fill_value, encoding, attrs
) -> ZarrArray:
    """Create array ``name`` in ``self.zarr_group`` and attach ``attrs``.

    Parameters
    ----------
    name
        Name passed positionally to ``self.zarr_group.create``.
    shape, dtype, fill_value
        Forwarded to ``self.zarr_group.create`` as keyword arguments.
        ``dtype`` is replaced by the builtin ``str`` when
        ``coding.strings.check_vlen_dtype(dtype)`` is ``str``.
    encoding : dict
        Extra keyword arguments for ``self.zarr_group.create``. NOTE: this
        dict is updated in place with ``"write_empty_chunks"`` when
        ``self._write_empty`` is not ``None``.
    attrs
        Attributes handed to ``_put_attrs`` for the newly created array.

    Returns
    -------
    The value returned by ``_put_attrs`` for the created array.

    Raises
    ------
    ValueError
        If ``encoding["write_empty_chunks"]`` is present and differs from
        ``self._write_empty`` (when the latter is not ``None``).
    """
    if coding.strings.check_vlen_dtype(dtype) is str:
        dtype = str

    if self._write_empty is not None:
        if (
            "write_empty_chunks" in encoding
            and encoding["write_empty_chunks"] != self._write_empty
        ):
            # The two literals are concatenated implicitly, so the first
            # one must end with its own separator to keep the message
            # readable.
            raise ValueError(
                'Differing "write_empty_chunks" values in encoding and parameters. '
                f'Got {encoding["write_empty_chunks"] = } and {self._write_empty = }'
            )
        else:
            encoding["write_empty_chunks"] = self._write_empty

    zarr_array = self.zarr_group.create(
        name,
        shape=shape,
        dtype=dtype,
        fill_value=fill_value,
        **encoding,
    )
    zarr_array = _put_attrs(zarr_array, attrs)
    return zarr_array
1040+
9711041
def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=None):
9721042
"""
9731043
This provides a centralized method to set the variables on the data
@@ -986,8 +1056,6 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No
9861056
dimensions.
9871057
"""
9881058

989-
import zarr
990-
9911059
existing_keys = tuple(self.zarr_group.array_keys())
9921060
is_zarr_v3_format = _zarr_v3() and self.zarr_group.metadata.zarr_format == 3
9931061

@@ -1016,47 +1084,13 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No
10161084
else:
10171085
del v.encoding["_FillValue"]
10181086

1019-
zarr_array = None
10201087
zarr_shape = None
10211088
write_region = self._write_region if self._write_region is not None else {}
10221089
write_region = {dim: write_region.get(dim, slice(None)) for dim in dims}
10231090

1024-
if name in existing_keys:
1091+
if self._mode != "w" and name in existing_keys:
10251092
# existing variable
1026-
# TODO: if mode="a", consider overriding the existing variable
1027-
# metadata. This would need some case work properly with region
1028-
# and append_dim.
1029-
if self._write_empty is not None:
1030-
# Write to zarr_group.chunk_store instead of zarr_group.store
1031-
# See https://github.com/pydata/xarray/pull/8326#discussion_r1365311316 for a longer explanation
1032-
# The open_consolidated() enforces a mode of r or r+
1033-
# (and to_zarr with region provided enforces a read mode of r+),
1034-
# and this function makes sure the resulting Group has a store of type ConsolidatedMetadataStore
1035-
# and a 'normal Store subtype for chunk_store.
1036-
# The exact type depends on if a local path was used, or a URL of some sort,
1037-
# but the point is that it's not a read-only ConsolidatedMetadataStore.
1038-
# It is safe to write chunk data to the chunk_store because no metadata would be changed by
1039-
# to_zarr with the region parameter:
1040-
# - Because the write mode is enforced to be r+, no new variables can be added to the store
1041-
# (this is also checked and enforced in xarray.backends.api.py::to_zarr()).
1042-
# - Existing variables already have their attrs included in the consolidated metadata file.
1043-
# - The size of dimensions can not be expanded, that would require a call using `append_dim`
1044-
# which is mutually exclusive with `region`
1045-
zarr_array = zarr.open(
1046-
store=(
1047-
self.zarr_group.store
1048-
if _zarr_v3()
1049-
else self.zarr_group.chunk_store
1050-
),
1051-
# TODO: see if zarr should normalize these strings.
1052-
path="/".join([self.zarr_group.name.rstrip("/"), name]).lstrip(
1053-
"/"
1054-
),
1055-
write_empty_chunks=self._write_empty,
1056-
)
1057-
else:
1058-
zarr_array = self.zarr_group[name]
1059-
1093+
zarr_array = self._open_existing_array(name=name)
10601094
if self._append_dim is not None and self._append_dim in dims:
10611095
# resize existing variable
10621096
append_axis = dims.index(self._append_dim)
@@ -1089,40 +1123,27 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No
10891123
shape=zarr_shape,
10901124
)
10911125

1092-
if name not in existing_keys:
1126+
if self._mode == "w" or name not in existing_keys:
10931127
# new variable
1094-
encoded_attrs = {}
1128+
encoded_attrs = {k: self.encode_attribute(v) for k, v in attrs.items()}
10951129
# the magic for storing the hidden dimension data
10961130
if is_zarr_v3_format:
10971131
encoding["dimension_names"] = dims
10981132
else:
10991133
encoded_attrs[DIMENSION_KEY] = dims
1100-
for k2, v2 in attrs.items():
1101-
encoded_attrs[k2] = self.encode_attribute(v2)
1102-
1103-
if coding.strings.check_vlen_dtype(dtype) is str:
1104-
dtype = str
1105-
1106-
if self._write_empty is not None:
1107-
if (
1108-
"write_empty_chunks" in encoding
1109-
and encoding["write_empty_chunks"] != self._write_empty
1110-
):
1111-
raise ValueError(
1112-
'Differing "write_empty_chunks" values in encoding and parameters'
1113-
f'Got {encoding["write_empty_chunks"] = } and {self._write_empty = }'
1114-
)
1115-
else:
1116-
encoding["write_empty_chunks"] = self._write_empty
1117-
1118-
zarr_array = self.zarr_group.create(
1119-
name,
1120-
shape=shape,
1134+
1135+
encoding["exists_ok" if _zarr_v3() else "overwrite"] = (
1136+
True if self._mode == "w" else False
1137+
)
1138+
1139+
zarr_array = self._create_new_array(
1140+
name=name,
11211141
dtype=dtype,
1142+
shape=shape,
11221143
fill_value=fill_value,
1123-
**encoding,
1144+
encoding=encoding,
1145+
attrs=encoded_attrs,
11241146
)
1125-
zarr_array = _put_attrs(zarr_array, encoded_attrs)
11261147

11271148
writer.add(v.data, zarr_array, region)
11281149

xarray/tests/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,7 @@ def _importorskip(
141141
has_pint, requires_pint = _importorskip("pint")
142142
has_numexpr, requires_numexpr = _importorskip("numexpr")
143143
has_flox, requires_flox = _importorskip("flox")
144+
has_netcdf, requires_netcdf = _importorskip("netcdf")
144145
has_pandas_ge_2_2, requires_pandas_ge_2_2 = _importorskip("pandas", "2.2")
145146
has_pandas_3, requires_pandas_3 = _importorskip("pandas", "3.0.0.dev0")
146147

0 commit comments

Comments
 (0)