pydata · jhamman · Jan 24, 2018 · Dec 21, 2017 · Dec 22, 2017 · Dec 24, 2017
diff --git a/xarray/backends/common.py b/xarray/backends/common.py
@@ -6,7 +6,7 @@
 import time
 import traceback
 import contextlib
-from collections import Mapping
+from collections import Mapping, OrderedDict
 import warnings
 
 from ..conventions import cf_encoder
@@ -96,6 +96,9 @@ def __getitem__(self, key):
     def __len__(self):
         return len(self.variables)
 
+    def get_dimensions(self):  # pragma: no cover
+        raise NotImplementedError
+
     def get_attrs(self):  # pragma: no cover
         raise NotImplementedError
 
@@ -195,6 +198,19 @@ def __init__(self, writer=None):
             writer = ArrayWriter()
         self.writer = writer
 
+    def encode(self, variables, attributes):
+        variables = OrderedDict([(k, self.encode_variable(v))
+                                 for k, v in variables.items()])
+        attributes = OrderedDict([(k, self.encode_attribute(v))
+                                  for k, v in attributes.items()])
+        return variables, attributes
+
+    def encode_variable(self, v):
+        return v
+
+    def encode_attribute(self, a):
+        return a
+
     def set_dimension(self, d, l):  # pragma: no cover
         raise NotImplementedError
 
@@ -216,7 +232,10 @@ def store_dataset(self, dataset):
 
     def store(self, variables, attributes, check_encoding_set=frozenset(),
               unlimited_dims=None):
+        variables, attributes = self.encode(variables, attributes)
+
         self.set_attributes(attributes)
+        self.set_dimensions(variables, unlimited_dims=unlimited_dims)
         self.set_variables(variables, check_encoding_set,
                            unlimited_dims=unlimited_dims)
 
@@ -234,23 +253,42 @@ def set_variables(self, variables, check_encoding_set,
 
             self.writer.add(source, target)
 
-    def set_necessary_dimensions(self, variable, unlimited_dims=None):
+    def set_dimensions(self, variables, unlimited_dims=None):
         if unlimited_dims is None:
             unlimited_dims = set()
-        dims = self.get_dimensions()
-        for d, l in zip(variable.dims, variable.shape):
-            if d not in dims:
+
+        existing_dims = self.get_dimensions()
+
+        dims = OrderedDict()
+        for v in unlimited_dims:  # put unlimited_dims first
+            dims[v] = None
+        for v in variables.values():
+            dims.update(zip(v.dims, v.shape))  # pairs keep per-variable order
+
+        for d, l in dims.items():
+            # %s, not %d: l is None for an unlimited dim no variable uses
+            if d in existing_dims and l != existing_dims[d]:
+                raise ValueError("Unable to update size for existing dimension"
+                                 " %r (%s != %s)" % (d, l, existing_dims[d]))
+            elif d not in existing_dims:
                 is_unlimited = d in unlimited_dims
                 self.set_dimension(d, l, is_unlimited)
 
 
 class WritableCFDataStore(AbstractWritableDataStore):
 
-    def store(self, variables, attributes, *args, **kwargs):
+    def encode(self, variables, attributes):
         # All NetCDF files get CF encoded by default, without this attempting
         # to write times, for example, would fail.
-        cf_variables, cf_attrs = cf_encoder(variables, attributes)
-        AbstractWritableDataStore.store(self, cf_variables, cf_attrs,
+        variables, attributes = cf_encoder(variables, attributes)
+        variables = OrderedDict([(k, self.encode_variable(v))
+                                 for k, v in variables.items()])
+        attributes = OrderedDict([(k, self.encode_attribute(v))
+                                  for k, v in attributes.items()])
+        return variables, attributes
+
+    def store(self, variables, attributes, *args, **kwargs):
+        AbstractWritableDataStore.store(self, variables, attributes,
                                         *args, **kwargs)
 
 

diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py
@@ -9,7 +9,7 @@
 from ..core.pycompat import iteritems, bytes_type, unicode_type, OrderedDict
 
 from .common import WritableCFDataStore, DataStorePickleMixin, find_root
-from .netCDF4_ import (_nc4_group, _nc4_values_and_dtype,
+from .netCDF4_ import (_nc4_group, _encode_nc4_variable, _get_datatype,
                        _extract_nc4_variable_encoding, BaseNetCDF4Array)
 
 
@@ -126,14 +126,15 @@ def set_attribute(self, key, value):
         with self.ensure_open(autoclose=False):
             self.ds.setncattr(key, value)
 
+    def encode_variable(self, variable):
+        return _encode_nc4_variable(variable)
+
     def prepare_variable(self, name, variable, check_encoding=False,
                          unlimited_dims=None):
         import h5py
 
         attrs = variable.attrs.copy()
-        variable, dtype = _nc4_values_and_dtype(variable)
-
-        self.set_necessary_dimensions(variable, unlimited_dims=unlimited_dims)
+        dtype = _get_datatype(variable)
 
         fill_value = attrs.pop('_FillValue', None)
         if dtype is str and fill_value is not None:

diff --git a/xarray/backends/memory.py b/xarray/backends/memory.py
@@ -29,6 +29,13 @@ def get_attrs(self):
     def get_variables(self):
         return self._variables
 
+    def get_dimensions(self):
+        dims = OrderedDict()
+        for v in self._variables.values():
+            for d, s in zip(v.dims, v.shape):  # dims pairs with shape
+                dims[d] = s
+        return dims
+
     def prepare_variable(self, k, v, *args, **kwargs):
         new_var = Variable(v.dims, np.empty_like(v), v.attrs)
         # we copy the variable and stuff all encodings in the
@@ -41,6 +48,6 @@ def set_attribute(self, k, v):
         # copy to imitate writing to disk.
         self._attributes[k] = copy.deepcopy(v)
 
-    def set_dimension(self, d, l):
+    def set_dimension(self, d, l, unlimited_dims=None):
         # in this model, dimensions are accounted for in the variables
         pass
diff --git a/xarray/backends/netCDF4_.py b/xarray/backends/netCDF4_.py
@@ -74,19 +74,28 @@ def __getitem__(self, key):
         return data
 
 
-def _nc4_values_and_dtype(var):
+def _encode_nc4_variable(var):
+    if var.dtype.kind == 'S':
+        var = conventions.maybe_encode_as_char_array(var)
+    return var
+
+
+def _get_datatype(var, nc_format='NETCDF4'):
+    if nc_format == 'NETCDF4':
+        datatype = _nc4_dtype(var)
+    else:
+        datatype = var.dtype
+    return datatype
+
+
+def _nc4_dtype(var):
     if var.dtype.kind == 'U':
         dtype = str
-    elif var.dtype.kind == 'S':
-        # use character arrays instead of unicode, because unicode support in
-        # netCDF4 is still rather buggy
-        var = conventions.maybe_encode_as_char_array(var)
-        dtype = var.dtype
-    elif var.dtype.kind in ['i', 'u', 'f', 'c']:
+    elif var.dtype.kind in ['i', 'u', 'f', 'c', 'S']:
         dtype = var.dtype
     else:
         raise ValueError('cannot infer dtype for netCDF4 variable')
-    return var, dtype
+    return dtype
 
 
 def _nc4_group(ds, group, mode):
@@ -324,18 +333,17 @@ def set_variables(self, *args, **kwargs):
         with self.ensure_open(autoclose=False):
             super(NetCDF4DataStore, self).set_variables(*args, **kwargs)
 
-    def prepare_variable(self, name, variable, check_encoding=False,
-                         unlimited_dims=None):
+    def encode_variable(self, variable):
         variable = _force_native_endianness(variable)
-
         if self.format == 'NETCDF4':
-            variable, datatype = _nc4_values_and_dtype(variable)
+            variable = _encode_nc4_variable(variable)
         else:
             variable = encode_nc3_variable(variable)
-            datatype = variable.dtype
-
-        self.set_necessary_dimensions(variable, unlimited_dims=unlimited_dims)
+        return variable
 
+    def prepare_variable(self, name, variable, check_encoding=False,
+                         unlimited_dims=None):
+        datatype = _get_datatype(variable, self.format)
         attrs = variable.attrs.copy()
 
         fill_value = attrs.pop('_FillValue', None)

diff --git a/xarray/backends/scipy_.py b/xarray/backends/scipy_.py
@@ -181,17 +181,16 @@ def set_attribute(self, key, value):
             value = encode_nc3_attr_value(value)
             setattr(self.ds, key, value)
 
+    def encode_variable(self, variable):
+        variable = encode_nc3_variable(variable)
+        return variable
+
     def prepare_variable(self, name, variable, check_encoding=False,
                          unlimited_dims=None):
-        variable = encode_nc3_variable(variable)
         if check_encoding and variable.encoding:
             raise ValueError('unexpected encoding for scipy backend: %r'
                              % list(variable.encoding))
 
-        if unlimited_dims is not None and len(unlimited_dims) > 1:
-            raise ValueError('NETCDF3 only supports one unlimited dimension')
-        self.set_necessary_dimensions(variable, unlimited_dims=unlimited_dims)
-
         data = variable.data
         # nb. this still creates a numpy array in all memory, even though we
         # don't write the data yet; scipy.io.netcdf does not not support

diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py
@@ -43,13 +43,8 @@ def _ensure_valid_fill_value(value, dtype):
     return _encode_zarr_attr_value(valid)
 
 
-def _decode_zarr_attr_value(value):
-    return value
-
-
 def _decode_zarr_attrs(attrs):
-    return OrderedDict([(k, _decode_zarr_attr_value(v))
-                        for k, v in attrs.items()])
+    return OrderedDict(attrs)
 
 
 def _replace_slices_with_arrays(key, shape):
@@ -297,9 +292,6 @@ def __init__(self, zarr_group, writer=None):
                 raise KeyError("Zarr group can't be read by xarray because "
                                "it is missing the `%s` attribute." %
                                _DIMENSION_KEY)
-            else:
-                # initialize hidden dimension attribute
-                self.ds.attrs[_DIMENSION_KEY] = {}
 
         if writer is None:
             # by default, we should not need a lock for writing zarr because
@@ -331,29 +323,37 @@ def get_variables(self):
                                  for k, v in self.ds.arrays())
 
     def get_attrs(self):
-        _, attributes = _get_zarr_dims_and_attrs(self.ds, _DIMENSION_KEY)
+        attributes = HiddenKeyDict(self.ds.attrs.asdict(), [_DIMENSION_KEY])
         return _decode_zarr_attrs(attributes)
 
     def get_dimensions(self):
-        dimensions, _ = _get_zarr_dims_and_attrs(self.ds, _DIMENSION_KEY)
+        try:
+            dimensions = dict(self.ds.attrs[_DIMENSION_KEY])
+        except KeyError:
+            raise KeyError("Zarr object is missing the attribute `%s`, which "
+                           "is required for xarray to determine variable "
+                           "dimensions." % (_DIMENSION_KEY))
         return dimensions
 
-    def set_dimension(self, name, length, is_unlimited=False):
-        if is_unlimited:
+    def set_dimensions(self, variables, unlimited_dims=None):
+        if unlimited_dims is not None:
             raise NotImplementedError(
                 "Zarr backend doesn't know how to handle unlimited dimensions")
-        # consistency check
-        if name in self.ds.attrs[_DIMENSION_KEY]:
-            if self.ds.attrs[_DIMENSION_KEY][name] != length:
-                raise ValueError("Pre-existing array dimensions %r "
-                                 "encoded in Zarr attributes are incompatible "
-                                 "with newly specified dimension `%s`: %g" %
-                                 (self.ds.attrs[_DIMENSION_KEY], name, length))
-        self.ds.attrs[_DIMENSION_KEY][name] = length
-
-    def set_attribute(self, key, value):
-        _, attributes = _get_zarr_dims_and_attrs(self.ds, _DIMENSION_KEY)
-        attributes[key] = _encode_zarr_attr_value(value)
+
+        dims = {}
+        for v in variables.values():
+            dims.update(dict(zip(v.dims, v.shape)))
+
+        self.ds.attrs.update({_DIMENSION_KEY: dims})
+
+    def set_attributes(self, attributes):
+        encoded_attrs = OrderedDict((k, _encode_zarr_attr_value(v))
+                                    for k, v in iteritems(attributes))
+        self.ds.attrs.put(encoded_attrs)
+
+    def encode_variable(self, variable):
+        variable = encode_zarr_variable(variable)
+        return variable
 
     def prepare_variable(self, name, variable, check_encoding=False,
                          unlimited_dims=None):
@@ -363,72 +363,27 @@ def prepare_variable(self, name, variable, check_encoding=False,
         dtype = variable.dtype
         shape = variable.shape
 
-        # TODO: figure out how zarr should deal with unlimited dimensions
-        self.set_necessary_dimensions(variable, unlimited_dims=unlimited_dims)
-
         fill_value = _ensure_valid_fill_value(attrs.pop('_FillValue', None),
                                               dtype)
 
-        # TODO: figure out what encoding is needed for zarr
         encoding = _extract_zarr_variable_encoding(
             variable, raise_on_invalid=check_encoding)
 
-        # arguments for zarr.create:
-        # zarr.creation.create(shape, chunks=None, dtype=None,
-        # compressor='default', fill_value=0, order='C', store=None,
-        # synchronizer=None, overwrite=False, path=None, chunk_store=None,
-        # filters=None, cache_metadata=True, **kwargs)
-        if name in self.ds:
-            zarr_array = self.ds[name]
-        else:
-            zarr_array = self.ds.create(name, shape=shape, dtype=dtype,
-                                        fill_value=fill_value, **encoding)
-        # decided not to explicity enumerate encoding options because we
-        # risk overriding zarr's defaults (e.g. if we specificy
-        # cache_metadata=None instead of True). Alternative is to have lots of
-        # logic in _extract_zarr_variable encoding to duplicate zarr defaults.
-        #                            chunks=encoding.get('chunks'),
-        #                            compressor=encoding.get('compressor'),
-        #                            filters=encodings.get('filters'),
-        #                            cache_metadata=encoding.get('cache_metadata'))
-
+        encoded_attrs = OrderedDict()
         # the magic for storing the hidden dimension data
-        zarr_array.attrs[_DIMENSION_KEY] = dims
-        _, attributes = _get_zarr_dims_and_attrs(zarr_array, _DIMENSION_KEY)
-
+        encoded_attrs[_DIMENSION_KEY] = dims
         for k, v in iteritems(attrs):
-            attributes[k] = _encode_zarr_attr_value(v)
+            encoded_attrs[k] = _encode_zarr_attr_value(v)
+
+        zarr_array = self.ds.create(name, shape=shape, dtype=dtype,
+                                    fill_value=fill_value, **encoding)
+        zarr_array.attrs.put(encoded_attrs)
 
         return zarr_array, variable.data
 
     def store(self, variables, attributes, *args, **kwargs):
-        new_vars = OrderedDict((k, encode_zarr_variable(v, name=k))
-                               for k, v in iteritems(variables))
-        AbstractWritableDataStore.store(self, new_vars, attributes,
+        AbstractWritableDataStore.store(self, variables, attributes,
                                         *args, **kwargs)
-    # sync() and close() methods should not be needed with zarr
-
-
-# from zarr docs
-
-# Zarr arrays can be used as either the source or sink for data in parallel
-# computations. Both multi-threaded and multi-process parallelism are
-# supported. The Python global interpreter lock (GIL) is released for both
-# compression and decompression operations, so Zarr will not block other Python
-# threads from running.
-#
-# A Zarr array can be read concurrently by multiple threads or processes. No
-# synchronization (i.e., locking) is required for concurrent reads.
-#
-# A Zarr array can also be written to concurrently by multiple threads or
-# processes. Some synchronization may be required, depending on the way the
-# data is being written.
-
-# If each worker in a parallel computation is writing to a separate region of
-# the array, and if region boundaries are perfectly aligned with chunk
-# boundaries, then no synchronization is required. However, if region and chunk
-# boundaries are not perfectly aligned, then synchronization is required to
-# avoid two workers attempting to modify the same chunk at the same time.
 
 
 def open_zarr(store, group=None, synchronizer=None, auto_chunk=True,