WIP: Performance improvements for zarr backend #1800
Merged
Changes from all commits
38 commits
a0bea98 move backend append logic to the prepare_variable methods
afdb254 deprecate variables/dimensions/attrs properties on AbstractWritableDa…
cc02150 warnings instead of errors for backend properties
86240cd use attrs.update when setting zarr attributes
9c89ef2 more performance improvements to attributes in zarr backend
2568d21 Merge branch 'master' of github.com:pydata/xarray into fix/zarr_set_a…
d459c66 fix typo
c59ca57 Merge branch 'master' of github.com:pydata/xarray into fix/zarr_set_a…
2dd186a Merge branch 'fix/zarr_set_attrs' of github.com:jhamman/xarray into f…
8f71b31 new set_dimensions method for writable data stores
07b9c21 Merge branch 'fix/zarr_set_attrs' of github.com:jhamman/xarray into f…
67fcd92 more fixes for zarr
b38e1a6 more tests for zarr and remove append logic for zarr
47ba8b6 more tests for zarr and remove append logic for zarr
9152b12 Merge branch 'fix/zarr_set_attrs' of github.com:jhamman/xarray into f…
26b6bcb a few more tweaks to zarr attrs
b7681ae Add encode methods to writable data stores, fixes for Zarr tests
e084e9e fix for InMemoryDataStore
a6aeb36 fix for unlimited dimensions Scipy Datastores
264b13f another patch for scipy
9c03bfc whatsnew
c92020a ordereddict
18434f9 address some of rabernats comments, in particular, this commit remove…
9f89c7c stop skipping zero-dim zarr tests
3590d28 update minimum zarr version for tests
69cacee Merge branch 'master' into fix/zarr_set_attrs
8d744e0 Merge branch 'master' into fix/zarr_set_attrs
a8dabdf Merge branch 'master' of github.com:pydata/xarray into fix/zarr_set_a…
7858db7 Merge branch 'fix/zarr_set_attrs' of github.com:jhamman/xarray into f…
48bf7ef Merge branch 'master' of github.com:pydata/xarray into fix/zarr_set_a…
53260c9 Merge branch 'master' of github.com:pydata/xarray into fix/zarr_set_a…
7ed6bf8 cleanup and docs for zarr performance branch
3872da2 fix two failing tests when using zarr master
e6b7068 Merge branch 'master' of github.com:pydata/xarray into fix/zarr_set_a…
c31decf flake8
189d262 back to zarr 2.2
07b92e2 Merge branch 'master' of github.com:pydata/xarray into fix/zarr_set_a…
96996ef remove extra store method
@@ -6,7 +6,7 @@
import time
import traceback
import contextlib
from collections import Mapping
from collections import Mapping, OrderedDict
import warnings

from ..conventions import cf_encoder
@@ -96,6 +96,9 @@ def __getitem__(self, key):
    def __len__(self):
        return len(self.variables)

    def get_dimensions(self):  # pragma: no cover
        raise NotImplementedError

    def get_attrs(self):  # pragma: no cover
        raise NotImplementedError
@@ -195,6 +198,37 @@ def __init__(self, writer=None):
            writer = ArrayWriter()
        self.writer = writer

    def encode(self, variables, attributes):
        """
        Encode the variables and attributes in this store

        Parameters
        ----------
        variables : dict-like
            Dictionary of key/value (variable name / xr.Variable) pairs
        attributes : dict-like
            Dictionary of key/value (attribute name / attribute) pairs

        Returns
        -------
        variables : dict-like
        attributes : dict-like

        """
        variables = OrderedDict([(k, self.encode_variable(v))
                                 for k, v in variables.items()])
        attributes = OrderedDict([(k, self.encode_attribute(v))
                                  for k, v in attributes.items()])
        return variables, attributes

    def encode_variable(self, v):
        """encode one variable"""
        return v

    def encode_attribute(self, a):
        """encode one attribute"""
        return a

    def set_dimension(self, d, l):  # pragma: no cover
        raise NotImplementedError
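The encode_variable and encode_attribute hooks above are per-item extension points that individual backends can override (a store that persists attributes as JSON, such as zarr, cannot hold numpy values directly, for instance). As a rough illustration only, not code from this PR, an attribute-level encoder of this kind might look like the sketch below; the helper name and the exact coercions are assumptions.

import numpy as np


def encode_json_attr_value(value):
    # hypothetical helper: coerce an attribute value into something a
    # JSON-backed store could serialize
    if isinstance(value, np.ndarray):
        # JSON has no array type, so fall back to a nested list
        return value.tolist()
    if isinstance(value, np.generic):
        # unwrap numpy scalars, e.g. np.int64 -> int, np.float64 -> float
        return value.item()
    return value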
@@ -208,24 +242,74 @@ def sync(self):
        self.writer.sync()

    def store_dataset(self, dataset):
        # in stores variables are all variables AND coordinates
        # in xarray.Dataset variables are variables NOT coordinates,
        # so here we pass the whole dataset in instead of doing
        # dataset.variables
        """
        in stores, variables are all variables AND coordinates
        in xarray.Dataset variables are variables NOT coordinates,
        so here we pass the whole dataset in instead of doing
        dataset.variables
        """
        self.store(dataset, dataset.attrs)

    def store(self, variables, attributes, check_encoding_set=frozenset(),
              unlimited_dims=None):
        """
        Top level method for putting data on this store, this method:
        - encodes variables/attributes
        - sets dimensions
        - sets variables

        Parameters
        ----------
        variables : dict-like
            Dictionary of key/value (variable name / xr.Variable) pairs
        attributes : dict-like
            Dictionary of key/value (attribute name / attribute) pairs
        check_encoding_set : list-like
            List of variables that should be checked for invalid encoding
            values
        unlimited_dims : list-like
            List of dimension names that should be treated as unlimited
            dimensions.
        """

        variables, attributes = self.encode(variables, attributes)

        self.set_attributes(attributes)
        self.set_dimensions(variables, unlimited_dims=unlimited_dims)
        self.set_variables(variables, check_encoding_set,
                           unlimited_dims=unlimited_dims)

    def set_attributes(self, attributes):
        """
        This provides a centralized method to set the dataset attributes on the
        data store.

        Parameters
        ----------
        attributes : dict-like
            Dictionary of key/value (attribute name / attribute) pairs
        """
        for k, v in iteritems(attributes):
            self.set_attribute(k, v)

    def set_variables(self, variables, check_encoding_set,
                      unlimited_dims=None):
        """
        This provides a centralized method to set the variables on the data
        store.

        Parameters
        ----------
        variables : dict-like
            Dictionary of key/value (variable name / xr.Variable) pairs
        check_encoding_set : list-like
            List of variables that should be checked for invalid encoding
            values
        unlimited_dims : list-like
            List of dimension names that should be treated as unlimited
            dimensions.
        """

        for vn, v in iteritems(variables):
            name = _encode_variable_name(vn)
            check = vn in check_encoding_set
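The lines omitted from this hunk, between check = vn in check_encoding_set and the self.writer.add(source, target) call that opens the next hunk, are presumably where each store's prepare_variable() method (referenced in the commit messages above) creates the on-store array and returns the source/target pair for the writer. The following is a rough, self-contained sketch of what such a hook could look like; the signature, names, and the numpy-backed target are assumptions for illustration, not code from this PR.

import numpy as np


class ToyPreparingStore(object):
    """Toy example only: how a prepare_variable-style hook might behave."""

    def __init__(self):
        self.arrays = {}

    def prepare_variable(self, name, variable, check_encoding=False,
                         unlimited_dims=None):
        # allocate an empty target array shaped like the variable; a real
        # backend would instead create a netCDF/zarr array in its store
        target = np.empty(variable.shape, dtype=variable.dtype)
        self.arrays[name] = target
        # the (target, source) pair is what the ArrayWriter later copies
        return target, variable.data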
@@ -234,24 +318,51 @@ def set_variables(self, variables, check_encoding_set,

        self.writer.add(source, target)

    def set_necessary_dimensions(self, variable, unlimited_dims=None):
    def set_dimensions(self, variables, unlimited_dims=None):

[Inline review comment] Can you add a docstring or comment explaining what this method does? This would help new developers (including myself) come onboard with backend development.

        """
        This provides a centralized method to set the dimensions on the data
        store.

        Parameters
        ----------
        variables : dict-like
            Dictionary of key/value (variable name / xr.Variable) pairs
        unlimited_dims : list-like
            List of dimension names that should be treated as unlimited
            dimensions.
        """
        if unlimited_dims is None:
            unlimited_dims = set()
        dims = self.get_dimensions()
        for d, l in zip(variable.dims, variable.shape):
            if d not in dims:
                is_unlimited = d in unlimited_dims
                self.set_dimension(d, l, is_unlimited)

        existing_dims = self.get_dimensions()

        dims = OrderedDict()
        for v in unlimited_dims:  # put unlimited_dims first
            dims[v] = None
        for v in variables.values():
            dims.update(dict(zip(v.dims, v.shape)))

        for dim, length in dims.items():
            if dim in existing_dims and length != existing_dims[dim]:
                raise ValueError(
                    "Unable to update size for existing dimension"
                    "%r (%d != %d)" % (dim, length, existing_dims[dim]))
            elif dim not in existing_dims:
                is_unlimited = dim in unlimited_dims
                self.set_dimension(dim, length, is_unlimited)


class WritableCFDataStore(AbstractWritableDataStore):

    def store(self, variables, attributes, *args, **kwargs):
    def encode(self, variables, attributes):
        # All NetCDF files get CF encoded by default, without this attempting
        # to write times, for example, would fail.
        cf_variables, cf_attrs = cf_encoder(variables, attributes)
        AbstractWritableDataStore.store(self, cf_variables, cf_attrs,
                                        *args, **kwargs)
        variables, attributes = cf_encoder(variables, attributes)
        variables = OrderedDict([(k, self.encode_variable(v))
                                 for k, v in variables.items()])
        attributes = OrderedDict([(k, self.encode_attribute(v))
                                  for k, v in attributes.items()])
        return variables, attributes


class DataStorePickleMixin(object):
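Taken together, these changes centralize the write path: store() now always runs encode(), then set_attributes(), then set_dimensions(), then set_variables(). The following is a minimal sketch of that ordering using a toy dict-backed store rather than any real xarray class; the class and attribute names here are illustrative assumptions, not this PR's code.

from collections import OrderedDict


class ToyStore(object):
    """Toy stand-in used only to illustrate the centralized write order."""

    def __init__(self):
        self.attributes = OrderedDict()
        self.dimensions = OrderedDict()
        self.arrays = OrderedDict()

    def encode(self, variables, attributes):
        # per-store encoding hook; identity here
        return variables, attributes

    def store(self, variables, attributes):
        # 1. encode, 2. attributes, 3. dimensions, 4. variables
        variables, attributes = self.encode(variables, attributes)
        self.attributes.update(attributes)
        for v in variables.values():
            for dim, size in zip(v.dims, v.shape):
                existing = self.dimensions.get(dim)
                if existing is not None and existing != size:
                    raise ValueError("inconsistent size for dimension %r"
                                     % dim)
                self.dimensions[dim] = size
        for name, v in variables.items():
            self.arrays[name] = v.data

A Dataset's .variables mapping and .attrs dict could be passed straight through such a store() call, which is essentially what store_dataset() does in the diff above.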
This is definitely cleaner than what we had before, but I am reluctant to give the idea that this is a new supported method for third-party DataStore classes. Maybe we can call this _encode for now, or add a warning about implementing it?

Eventually, I would like to remove all implementations from the DataStore base classes and leave them as purely abstract. This will make it clearer to new backend implementers what they actually should/can implement.

So instead of implementing an encode() method, data store classes could have a list of default encoders (see xarray.coding) used when reading/writing data. But xarray.coding isn't quite ready for this yet...
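For context, the "list of default encoders" idea might look something like the following; this is purely speculative, and none of these names exist in xarray at the time of this PR.

class HypotheticalStore(object):
    # declarative list of coders, applied in order when writing variables
    encoders = []

    def encode_variable(self, variable):
        for coder in self.encoders:
            variable = coder.encode(variable)
        return variable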
Clearly some refactoring of this will be needed once the overall backend refactoring moves forward. For now, however, this seems like a reasonable compromise.
@shoyer - I'm a bit confused here. As you'll see in the Zarr backend, the encode_variable method is applying a list of encoders. Where in the WritableDataStore were you envisioning the application of the encoders?
@shoyer - What would you say to merging this in its current state and leaving the encoders refactor to a separate PR? I'm happy to make more changes here, but a) I'm not sure how to address your last comment, and b) I've already drifted a fair ways off track with this PR.
I'm fine with that. We may want to change it more in the future, but this is a clear improvement for now.