
Commit 6eac857

Authored by Joe Hamman
move backend append logic to the prepare_variable methods (#1799)
* move backend append logic to the prepare_variable methods
* deprecate variables/dimensions/attrs properties on AbstractWritableDataStore
* warnings instead of errors for backend properties
* whatsnew
1 parent 4a9c1e3 commit 6eac857

File tree: 8 files changed, +54 -48 lines changed


doc/whats-new.rst (+6, -1)

@@ -59,8 +59,13 @@ Bug fixes
   ``dask.threaded.get``. By `Matthew Rocklin <https://github.com/mrocklin>`_.
 - Bug fixes in :py:meth:`DataArray.plot.imshow`: all-NaN arrays and arrays
   with size one in some dimension can now be plotted, which is good for
-  exploring satellite imagery. (:issue:`1780`)
+  exploring satellite imagery (:issue:`1780`).
   By `Zac Hatfield-Dodds <https://github.com/Zac-HD>`_.
+- The ``variables``, ``attrs``, and ``dimensions`` properties have been
+  deprecated as part of a bug fix addressing an issue where backends were
+  unintentionally loading the datastore's data and attributes repeatedly
+  during writes (:issue:`1798`).
+  By `Joe Hamman <https://github.com/jhamman>`_.
 
 
 .. _whats-new.0.10.0:
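For backend authors, the deprecation means the load-everything properties now warn while the explicit methods and ``load()`` stay silent. A minimal sketch of the new behavior, using ``InMemoryDataStore`` (the simplest writable store) and the internal ``dump_to_store`` path purely for illustration:

import warnings

import xarray as xr
from xarray.backends.memory import InMemoryDataStore

# Build a tiny in-memory store to poke at the deprecated properties.
store = InMemoryDataStore()
xr.Dataset({'a': ('x', [1, 2, 3])}).dump_to_store(store)

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always')
    variables = store.variables        # deprecated: now emits FutureWarning
print([w.category.__name__ for w in caught])   # ['FutureWarning']

# Preferred: load variables and attributes together in a single pass.
variables, attrs = store.load()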

xarray/backends/common.py (+17, -22)

@@ -7,7 +7,7 @@
 import traceback
 import contextlib
 from collections import Mapping
-from distutils.version import LooseVersion
+import warnings
 
 from ..conventions import cf_encoder
 from ..core import indexing
@@ -133,24 +133,25 @@ def load(self):
 
     @property
     def variables(self):
-        # Because encoding/decoding might happen which may require both the
-        # attributes and the variables, and because a store may be updated
-        # we need to load both the attributes and variables
-        # anytime either one is requested.
+        warnings.warn('The ``variables`` property has been deprecated and '
+                      'will be removed in xarray v0.11.',
+                      FutureWarning, stacklevel=2)
        variables, _ = self.load()
        return variables
 
     @property
     def attrs(self):
-        # Because encoding/decoding might happen which may require both the
-        # attributes and the variables, and because a store may be updated
-        # we need to load both the attributes and variables
-        # anytime either one is requested.
-        _, attributes = self.load()
-        return attributes
+        warnings.warn('The ``attrs`` property has been deprecated and '
+                      'will be removed in xarray v0.11.',
+                      FutureWarning, stacklevel=2)
+        _, attrs = self.load()
+        return attrs
 
     @property
     def dimensions(self):
+        warnings.warn('The ``dimensions`` property has been deprecated and '
+                      'will be removed in xarray v0.11.',
+                      FutureWarning, stacklevel=2)
         return self.get_dimensions()
 
     def close(self):
@@ -183,11 +184,7 @@ def add(self, source, target):
     def sync(self):
         if self.sources:
             import dask.array as da
-            import dask
-            if LooseVersion(dask.__version__) > LooseVersion('0.8.1'):
-                da.store(self.sources, self.targets, lock=self.lock)
-            else:
-                da.store(self.sources, self.targets)
+            da.store(self.sources, self.targets, lock=self.lock)
             self.sources = []
             self.targets = []
 
@@ -232,19 +229,17 @@ def set_variables(self, variables, check_encoding_set,
         for vn, v in iteritems(variables):
             name = _encode_variable_name(vn)
             check = vn in check_encoding_set
-            if vn not in self.variables:
-                target, source = self.prepare_variable(
-                    name, v, check, unlimited_dims=unlimited_dims)
-            else:
-                target, source = self.ds.variables[name], v.data
+            target, source = self.prepare_variable(
+                name, v, check, unlimited_dims=unlimited_dims)
 
             self.writer.add(source, target)
 
     def set_necessary_dimensions(self, variable, unlimited_dims=None):
         if unlimited_dims is None:
             unlimited_dims = set()
+        dims = self.get_dimensions()
         for d, l in zip(variable.dims, variable.shape):
-            if d not in self.dimensions:
+            if d not in dims:
                 is_unlimited = d in unlimited_dims
                 self.set_dimension(d, l, is_unlimited)
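The net effect of the ``set_variables`` change is that append handling moves out of the base class: each backend's ``prepare_variable`` is now responsible for either creating the on-disk variable or handing back the existing one. A toy, dict-backed sketch of that contract, for illustration only (the names here are hypothetical; the real backends call ``createVariable``, ``ds.create``, etc., as shown in the diffs below):

from collections import OrderedDict

import numpy as np


class ToyStore(object):
    """Hypothetical stand-in for a writable backend, for illustration only."""

    def __init__(self):
        self.arrays = OrderedDict()

    def prepare_variable(self, name, data):
        if name not in self.arrays:
            # first write: create the target array
            self.arrays[name] = np.empty_like(data)
        # append/overwrite: reuse the existing target
        return self.arrays[name], data


store = ToyStore()
target, source = store.prepare_variable('x', np.arange(3))
target[...] = source          # this assignment is what ArrayWriter.add does
print(store.arrays['x'])      # [0 1 2]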

xarray/backends/h5netcdf_.py (+5, -2)

@@ -156,8 +156,11 @@ def prepare_variable(self, name, variable, check_encoding=False,
                     'chunksizes', 'fletcher32']:
             if key in encoding:
                 kwargs[key] = encoding[key]
-        nc4_var = self.ds.createVariable(name, dtype, variable.dims,
-                                         fill_value=fill_value, **kwargs)
+        if name not in self.ds.variables:
+            nc4_var = self.ds.createVariable(name, dtype, variable.dims,
+                                             fill_value=fill_value, **kwargs)
+        else:
+            nc4_var = self.ds.variables[name]
 
         for k, v in iteritems(attrs):
             nc4_var.setncattr(k, v)

xarray/backends/netCDF4_.py (+18, -14)

@@ -352,20 +352,24 @@ def prepare_variable(self, name, variable, check_encoding=False,
         encoding = _extract_nc4_variable_encoding(
             variable, raise_on_invalid=check_encoding,
             unlimited_dims=unlimited_dims)
-        nc4_var = self.ds.createVariable(
-            varname=name,
-            datatype=datatype,
-            dimensions=variable.dims,
-            zlib=encoding.get('zlib', False),
-            complevel=encoding.get('complevel', 4),
-            shuffle=encoding.get('shuffle', True),
-            fletcher32=encoding.get('fletcher32', False),
-            contiguous=encoding.get('contiguous', False),
-            chunksizes=encoding.get('chunksizes'),
-            endian='native',
-            least_significant_digit=encoding.get('least_significant_digit'),
-            fill_value=fill_value)
-        _disable_auto_decode_variable(nc4_var)
+        if name in self.ds.variables:
+            nc4_var = self.ds.variables[name]
+        else:
+            nc4_var = self.ds.createVariable(
+                varname=name,
+                datatype=datatype,
+                dimensions=variable.dims,
+                zlib=encoding.get('zlib', False),
+                complevel=encoding.get('complevel', 4),
+                shuffle=encoding.get('shuffle', True),
+                fletcher32=encoding.get('fletcher32', False),
+                contiguous=encoding.get('contiguous', False),
+                chunksizes=encoding.get('chunksizes'),
+                endian='native',
+                least_significant_digit=encoding.get(
+                    'least_significant_digit'),
+                fill_value=fill_value)
+            _disable_auto_decode_variable(nc4_var)
 
         for k, v in iteritems(attrs):
             # set attributes one-by-one since netCDF4<1.0.10 can't handle
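These backend branches are what make appending to an existing file work from the user side. A minimal round trip with ``to_netcdf(mode='a')``; the file name is arbitrary, and the shared ``x`` coordinate is what exercises the reuse branch in ``prepare_variable``:

import numpy as np
import xarray as xr

ds = xr.Dataset({'temperature': ('x', np.arange(4.0))},
                coords={'x': [10, 20, 30, 40]})
ds.to_netcdf('example.nc', mode='w')

# Append a second variable; the 'x' coordinate already exists in the file,
# so prepare_variable returns the existing netCDF variable instead of
# calling createVariable again.
ds2 = xr.Dataset({'pressure': ('x', np.ones(4))},
                 coords={'x': [10, 20, 30, 40]})
ds2.to_netcdf('example.nc', mode='a')

print(xr.open_dataset('example.nc'))   # has both temperature and pressure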

xarray/backends/netcdf3.py (-1)

@@ -6,7 +6,6 @@
 import numpy as np
 
 from .. import conventions, Variable
-from ..core import duck_array_ops
 from ..core.pycompat import basestring, unicode_type, OrderedDict
 
 
xarray/backends/scipy_.py (+3, -2)

@@ -165,7 +165,7 @@ def get_encoding(self):
 
     def set_dimension(self, name, length, is_unlimited=False):
         with self.ensure_open(autoclose=False):
-            if name in self.dimensions:
+            if name in self.ds.dimensions:
                 raise ValueError('%s does not support modifying dimensions'
                                  % type(self).__name__)
             dim_length = length if not is_unlimited else None
@@ -196,7 +196,8 @@ def prepare_variable(self, name, variable, check_encoding=False,
         # nb. this still creates a numpy array in all memory, even though we
         # don't write the data yet; scipy.io.netcdf does not not support
         # incremental writes.
-        self.ds.createVariable(name, data.dtype, variable.dims)
+        if name not in self.ds.variables:
+            self.ds.createVariable(name, data.dtype, variable.dims)
         scipy_var = self.ds.variables[name]
         for k, v in iteritems(variable.attrs):
             self._validate_attr_key(k)

xarray/backends/zarr.py (+5, -2)

@@ -378,8 +378,11 @@ def prepare_variable(self, name, variable, check_encoding=False,
         #       compressor='default', fill_value=0, order='C', store=None,
         #       synchronizer=None, overwrite=False, path=None, chunk_store=None,
         #       filters=None, cache_metadata=True, **kwargs)
-        zarr_array = self.ds.create(name, shape=shape, dtype=dtype,
-                                    fill_value=fill_value, **encoding)
+        if name in self.ds:
+            zarr_array = self.ds[name]
+        else:
+            zarr_array = self.ds.create(name, shape=shape, dtype=dtype,
+                                        fill_value=fill_value, **encoding)
         # decided not to explicity enumerate encoding options because we
         # risk overriding zarr's defaults (e.g. if we specificy
         # cache_metadata=None instead of True). Alternative is to have lots of
# cache_metadata=None instead of True). Alternative is to have lots of

xarray/core/variable.py (-4)

@@ -5,7 +5,6 @@
 from collections import defaultdict
 import functools
 import itertools
-from distutils.version import LooseVersion
 
 import numpy as np
 import pandas as pd
@@ -1392,9 +1391,6 @@ def quantile(self, q, dim=None, interpolation='linear'):
             raise TypeError("quantile does not work for arrays stored as dask "
                             "arrays. Load the data via .compute() or .load() "
                             "prior to calling this method.")
-        if LooseVersion(np.__version__) < LooseVersion('1.10.0'):
-            raise NotImplementedError(
-                'quantile requres numpy version 1.10.0 or later')
 
         q = np.asarray(q, dtype=np.float64)
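The removed guard is dropped presumably because the supported numpy versions already satisfy it, so ``quantile`` now relies on numpy unconditionally. A quick usage sketch, unchanged by this commit:

import numpy as np
import xarray as xr

da = xr.DataArray(np.arange(12.0).reshape(3, 4), dims=('time', 'x'))
print(da.quantile(0.5, dim='time'))     # median along 'time'
print(da.quantile([0.25, 0.75]))        # quantiles over all dimensions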
