37
37
from xarray .namedarray .utils import module_available
38
38
39
39
if TYPE_CHECKING :
40
+ from zarr import Array as ZarrArray
40
41
from zarr import Group as ZarrGroup
41
42
42
43
from xarray .backends .common import AbstractDataStore
@@ -443,7 +444,7 @@ def extract_zarr_variable_encoding(
443
444
shape = shape if shape else variable .shape
444
445
encoding = variable .encoding .copy ()
445
446
446
- safe_to_drop = {"source" , "original_shape" }
447
+ safe_to_drop = {"source" , "original_shape" , "preferred_chunks" }
447
448
valid_encodings = {
448
449
"codecs" ,
449
450
"chunks" ,
@@ -871,16 +872,27 @@ def store(
871
872
else :
872
873
zarr = attempt_import ("zarr" )
873
874
874
- existing_keys = tuple (self .zarr_group .array_keys ())
875
+ if self ._mode == "w" :
876
+ # always overwrite, so we don't care about existing names,
877
+ # and consistency of encoding
878
+ new_variable_names = set (variables )
879
+ existing_keys = {}
880
+ existing_variable_names = {}
881
+ else :
882
+ existing_keys = tuple (self .zarr_group .array_keys ())
883
+ existing_variable_names = {
884
+ vn for vn in variables if _encode_variable_name (vn ) in existing_keys
885
+ }
886
+ new_variable_names = set (variables ) - existing_variable_names
875
887
876
- if self ._mode == "r+" :
877
- new_names = [k for k in variables if k not in existing_keys ]
878
- if new_names :
879
- raise ValueError (
880
- f"dataset contains non-pre-existing variables { new_names } , "
881
- "which is not allowed in ``xarray.Dataset.to_zarr()`` with "
882
- "``mode='r+'``. To allow writing new variables, set ``mode='a'``."
883
- )
888
+ if self ._mode == "r+" and (
889
+ new_names : = [k for k in variables if k not in existing_keys ]
890
+ ) :
891
+ raise ValueError (
892
+ f"dataset contains non-pre-existing variables { new_names !r } , "
893
+ "which is not allowed in ``xarray.Dataset.to_zarr()`` with "
894
+ "``mode='r+'``. To allow writing new variables, set ``mode='a'``."
895
+ )
884
896
885
897
if self ._append_dim is not None and self ._append_dim not in existing_keys :
886
898
# For dimensions without coordinate values, we must parse
@@ -895,10 +907,6 @@ def store(
895
907
f"dataset dimensions { existing_dims } "
896
908
)
897
909
898
- existing_variable_names = {
899
- vn for vn in variables if _encode_variable_name (vn ) in existing_keys
900
- }
901
- new_variable_names = set (variables ) - existing_variable_names
902
910
variables_encoded , attributes = self .encode (
903
911
{vn : variables [vn ] for vn in new_variable_names }, attributes
904
912
)
@@ -920,10 +928,9 @@ def store(
920
928
# Modified variables must use the same encoding as the store.
921
929
vars_with_encoding = {}
922
930
for vn in existing_variable_names :
923
- if self ._mode in ["a" , "a-" , "r+" ]:
924
- _validate_datatypes_for_zarr_append (
925
- vn , existing_vars [vn ], variables [vn ]
926
- )
931
+ _validate_datatypes_for_zarr_append (
932
+ vn , existing_vars [vn ], variables [vn ]
933
+ )
927
934
vars_with_encoding [vn ] = variables [vn ].copy (deep = False )
928
935
vars_with_encoding [vn ].encoding = existing_vars [vn ].encoding
929
936
vars_with_encoding , _ = self .encode (vars_with_encoding , {})
@@ -968,6 +975,69 @@ def store(
968
975
def sync (self ):
969
976
pass
970
977
978
def _open_existing_array(self, *, name) -> ZarrArray:
    """Return the array called ``name`` that already exists in the group.

    When ``self._write_empty`` is ``None`` the array is looked up directly
    on ``self.zarr_group``. Otherwise it is re-opened through ``zarr.open``
    so that ``write_empty_chunks`` can be forwarded.

    Parameters
    ----------
    name
        Key of the array inside ``self.zarr_group``.

    Returns
    -------
    ZarrArray
        The object returned by ``self.zarr_group[name]`` or ``zarr.open``.
    """
    import zarr

    # TODO: if mode="a", consider overriding the existing variable
    # metadata. This would need some case work properly with region
    # and append_dim.
    if self._write_empty is None:
        return self.zarr_group[name]

    # Write to zarr_group.chunk_store instead of zarr_group.store
    # See https://github.com/pydata/xarray/pull/8326#discussion_r1365311316 for a longer explanation
    # The open_consolidated() enforces a mode of r or r+
    # (and to_zarr with region provided enforces a read mode of r+),
    # and this function makes sure the resulting Group has a store of type ConsolidatedMetadataStore
    # and a 'normal' Store subtype for chunk_store.
    # The exact type depends on if a local path was used, or a URL of some sort,
    # but the point is that it's not a read-only ConsolidatedMetadataStore.
    # It is safe to write chunk data to the chunk_store because no metadata would be changed by
    # to_zarr with the region parameter:
    #  - Because the write mode is enforced to be r+, no new variables can be added to the store
    #    (this is also checked and enforced in xarray.backends.api.py::to_zarr()).
    #  - Existing variables already have their attrs included in the consolidated metadata file.
    #  - The size of dimensions can not be expanded, that would require a call using `append_dim`
    #    which is mutually exclusive with `region`
    target_store = (
        self.zarr_group.store if _zarr_v3() else self.zarr_group.chunk_store
    )
    # TODO: see if zarr should normalize these strings.
    array_path = "/".join([self.zarr_group.name.rstrip("/"), name]).lstrip("/")
    return zarr.open(
        store=target_store,
        path=array_path,
        write_empty_chunks=self._write_empty,
    )
1012
+
1013
def _create_new_array(
    self, *, name, shape, dtype, fill_value, encoding, attrs
) -> ZarrArray:
    """Create the array ``name`` in ``self.zarr_group`` and attach ``attrs``.

    Parameters
    ----------
    name
        Name handed to ``self.zarr_group.create``.
    shape, dtype, fill_value
        Forwarded to ``self.zarr_group.create``. A ``dtype`` for which
        ``coding.strings.check_vlen_dtype`` returns ``str`` is replaced by
        ``str`` first.
    encoding
        Mapping of extra keyword arguments for ``create``. It is copied
        before ``write_empty_chunks`` is filled in, so the caller's mapping
        is left unmodified.
    attrs
        Attributes passed to ``_put_attrs`` together with the new array.

    Returns
    -------
    ZarrArray
        Whatever ``_put_attrs`` returns for the newly created array.

    Raises
    ------
    ValueError
        If ``self._write_empty`` is not ``None`` and ``encoding`` already
        holds a different ``"write_empty_chunks"`` value.
    """
    if coding.strings.check_vlen_dtype(dtype) is str:
        dtype = str

    # Shallow copy: the key added below must not leak into the caller's dict.
    create_kwargs = dict(encoding)

    if self._write_empty is not None:
        if (
            "write_empty_chunks" in create_kwargs
            and create_kwargs["write_empty_chunks"] != self._write_empty
        ):
            # The first literal ends with ". " so the two adjacent strings
            # no longer run together as "...parametersGot ...".
            raise ValueError(
                'Differing "write_empty_chunks" values in encoding and parameters. '
                f'Got {encoding["write_empty_chunks"] = } and {self._write_empty = }'
            )
        create_kwargs["write_empty_chunks"] = self._write_empty

    zarr_array = self.zarr_group.create(
        name,
        shape=shape,
        dtype=dtype,
        fill_value=fill_value,
        **create_kwargs,
    )
    return _put_attrs(zarr_array, attrs)
1040
+
971
1041
def set_variables (self , variables , check_encoding_set , writer , unlimited_dims = None ):
972
1042
"""
973
1043
This provides a centralized method to set the variables on the data
@@ -986,8 +1056,6 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No
986
1056
dimensions.
987
1057
"""
988
1058
989
- import zarr
990
-
991
1059
existing_keys = tuple (self .zarr_group .array_keys ())
992
1060
is_zarr_v3_format = _zarr_v3 () and self .zarr_group .metadata .zarr_format == 3
993
1061
@@ -1016,47 +1084,13 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No
1016
1084
else :
1017
1085
del v .encoding ["_FillValue" ]
1018
1086
1019
- zarr_array = None
1020
1087
zarr_shape = None
1021
1088
write_region = self ._write_region if self ._write_region is not None else {}
1022
1089
write_region = {dim : write_region .get (dim , slice (None )) for dim in dims }
1023
1090
1024
- if name in existing_keys :
1091
+ if self . _mode != "w" and name in existing_keys :
1025
1092
# existing variable
1026
- # TODO: if mode="a", consider overriding the existing variable
1027
- # metadata. This would need some case work properly with region
1028
- # and append_dim.
1029
- if self ._write_empty is not None :
1030
- # Write to zarr_group.chunk_store instead of zarr_group.store
1031
- # See https://github.com/pydata/xarray/pull/8326#discussion_r1365311316 for a longer explanation
1032
- # The open_consolidated() enforces a mode of r or r+
1033
- # (and to_zarr with region provided enforces a read mode of r+),
1034
- # and this function makes sure the resulting Group has a store of type ConsolidatedMetadataStore
1035
- # and a 'normal Store subtype for chunk_store.
1036
- # The exact type depends on if a local path was used, or a URL of some sort,
1037
- # but the point is that it's not a read-only ConsolidatedMetadataStore.
1038
- # It is safe to write chunk data to the chunk_store because no metadata would be changed by
1039
- # to_zarr with the region parameter:
1040
- # - Because the write mode is enforced to be r+, no new variables can be added to the store
1041
- # (this is also checked and enforced in xarray.backends.api.py::to_zarr()).
1042
- # - Existing variables already have their attrs included in the consolidated metadata file.
1043
- # - The size of dimensions can not be expanded, that would require a call using `append_dim`
1044
- # which is mutually exclusive with `region`
1045
- zarr_array = zarr .open (
1046
- store = (
1047
- self .zarr_group .store
1048
- if _zarr_v3 ()
1049
- else self .zarr_group .chunk_store
1050
- ),
1051
- # TODO: see if zarr should normalize these strings.
1052
- path = "/" .join ([self .zarr_group .name .rstrip ("/" ), name ]).lstrip (
1053
- "/"
1054
- ),
1055
- write_empty_chunks = self ._write_empty ,
1056
- )
1057
- else :
1058
- zarr_array = self .zarr_group [name ]
1059
-
1093
+ zarr_array = self ._open_existing_array (name = name )
1060
1094
if self ._append_dim is not None and self ._append_dim in dims :
1061
1095
# resize existing variable
1062
1096
append_axis = dims .index (self ._append_dim )
@@ -1089,40 +1123,27 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No
1089
1123
shape = zarr_shape ,
1090
1124
)
1091
1125
1092
- if name not in existing_keys :
1126
+ if self . _mode == "w" or name not in existing_keys :
1093
1127
# new variable
1094
- encoded_attrs = {}
1128
+ encoded_attrs = {k : self . encode_attribute ( v ) for k , v in attrs . items () }
1095
1129
# the magic for storing the hidden dimension data
1096
1130
if is_zarr_v3_format :
1097
1131
encoding ["dimension_names" ] = dims
1098
1132
else :
1099
1133
encoded_attrs [DIMENSION_KEY ] = dims
1100
- for k2 , v2 in attrs .items ():
1101
- encoded_attrs [k2 ] = self .encode_attribute (v2 )
1102
-
1103
- if coding .strings .check_vlen_dtype (dtype ) is str :
1104
- dtype = str
1105
-
1106
- if self ._write_empty is not None :
1107
- if (
1108
- "write_empty_chunks" in encoding
1109
- and encoding ["write_empty_chunks" ] != self ._write_empty
1110
- ):
1111
- raise ValueError (
1112
- 'Differing "write_empty_chunks" values in encoding and parameters'
1113
- f'Got { encoding ["write_empty_chunks" ] = } and { self ._write_empty = } '
1114
- )
1115
- else :
1116
- encoding ["write_empty_chunks" ] = self ._write_empty
1117
-
1118
- zarr_array = self .zarr_group .create (
1119
- name ,
1120
- shape = shape ,
1134
+
1135
+ encoding ["exists_ok" if _zarr_v3 () else "overwrite" ] = (
1136
+ True if self ._mode == "w" else False
1137
+ )
1138
+
1139
+ zarr_array = self ._create_new_array (
1140
+ name = name ,
1121
1141
dtype = dtype ,
1142
+ shape = shape ,
1122
1143
fill_value = fill_value ,
1123
- ** encoding ,
1144
+ encoding = encoding ,
1145
+ attrs = encoded_attrs ,
1124
1146
)
1125
- zarr_array = _put_attrs (zarr_array , encoded_attrs )
1126
1147
1127
1148
writer .add (v .data , zarr_array , region )
1128
1149
0 commit comments