Skip to content

Commit 0a0593d

Browse files
author
Joe Hamman
authored
WIP: Performance improvements for zarr backend (#1800)
* move backend append logic to the prepare_variable methods * deprecate variables/dimensions/attrs properties on AbstractWritableDataStore * warnings instead of errors for backend properties * use attrs.update when setting zarr attributes * more performance improvements to attributes in zarr backend * fix typo * new set_dimensions method for writable data stores * more fixes for zarr * more tests for zarr and remove append logic for zarr * more tests for zarr and remove append logic for zarr * a few more tweaks to zarr attrs * Add encode methods to writable data stores, fixes for Zarr tests * fix for InMemoryDataStore * fix for unlimited dimensions Scipy Datastores * another patch for scipy * whatsnew * ordereddict * address some of rabernat's comments; in particular, this commit removes the _DIMENSION_KEY from the zarr_group.attrs * stop skipping zero-dim zarr tests * update minimum zarr version for tests * cleanup and docs for zarr performance branch * fix two failing tests when using zarr master * flake8 * back to zarr 2.2 * remove extra store method
1 parent 04974b9 commit 0a0593d

File tree

9 files changed

+256
-158
lines changed

9 files changed

+256
-158
lines changed

doc/whats-new.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,9 @@ Enhancements
4444
- Use ``pandas.Grouper`` class in xarray resample methods rather than the
4545
deprecated ``pandas.TimeGrouper`` class (:issue:`1766`).
4646
By `Joe Hamman <https://github.com/jhamman>`_.
47+
- Support for using `Zarr`_ as storage layer for xarray (:issue:`1223`).
48+
By `Ryan Abernathey <https://github.com/rabernat>`_ and
49+
`Joe Hamman <https://github.com/jhamman>`_.
4750
- Support for using `Zarr`_ as storage layer for xarray.
4851
By `Ryan Abernathey <https://github.com/rabernat>`_.
4952
- :func:`xarray.plot.imshow` now handles RGB and RGBA images.

xarray/backends/common.py

Lines changed: 126 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
import time
77
import traceback
88
import contextlib
9-
from collections import Mapping
9+
from collections import Mapping, OrderedDict
1010
import warnings
1111

1212
from ..conventions import cf_encoder
@@ -96,6 +96,9 @@ def __getitem__(self, key):
9696
def __len__(self):
9797
return len(self.variables)
9898

99+
def get_dimensions(self): # pragma: no cover
100+
raise NotImplementedError
101+
99102
def get_attrs(self): # pragma: no cover
100103
raise NotImplementedError
101104

@@ -195,6 +198,37 @@ def __init__(self, writer=None):
195198
writer = ArrayWriter()
196199
self.writer = writer
197200

201+
def encode(self, variables, attributes):
202+
"""
203+
Encode the variables and attributes in this store
204+
205+
Parameters
206+
----------
207+
variables : dict-like
208+
Dictionary of key/value (variable name / xr.Variable) pairs
209+
attributes : dict-like
210+
Dictionary of key/value (attribute name / attribute) pairs
211+
212+
Returns
213+
-------
214+
variables : dict-like
215+
attributes : dict-like
216+
217+
"""
218+
variables = OrderedDict([(k, self.encode_variable(v))
219+
for k, v in variables.items()])
220+
attributes = OrderedDict([(k, self.encode_attribute(v))
221+
for k, v in attributes.items()])
222+
return variables, attributes
223+
224+
def encode_variable(self, v):
225+
"""encode one variable"""
226+
return v
227+
228+
def encode_attribute(self, a):
229+
"""encode one attribute"""
230+
return a
231+
198232
def set_dimension(self, d, l): # pragma: no cover
199233
raise NotImplementedError
200234

@@ -208,24 +242,74 @@ def sync(self):
208242
self.writer.sync()
209243

210244
def store_dataset(self, dataset):
211-
# in stores variables are all variables AND coordinates
212-
# in xarray.Dataset variables are variables NOT coordinates,
213-
# so here we pass the whole dataset in instead of doing
214-
# dataset.variables
245+
"""
246+
in stores, variables are all variables AND coordinates
247+
in xarray.Dataset variables are variables NOT coordinates,
248+
so here we pass the whole dataset in instead of doing
249+
dataset.variables
250+
"""
215251
self.store(dataset, dataset.attrs)
216252

217253
def store(self, variables, attributes, check_encoding_set=frozenset(),
218254
unlimited_dims=None):
255+
"""
256+
Top-level method for putting data on this store. This method:
257+
- encodes variables/attributes
258+
- sets dimensions
259+
- sets variables
260+
261+
Parameters
262+
----------
263+
variables : dict-like
264+
Dictionary of key/value (variable name / xr.Variable) pairs
265+
attributes : dict-like
266+
Dictionary of key/value (attribute name / attribute) pairs
267+
check_encoding_set : list-like
268+
List of variables that should be checked for invalid encoding
269+
values
270+
unlimited_dims : list-like
271+
List of dimension names that should be treated as unlimited
272+
dimensions.
273+
"""
274+
275+
variables, attributes = self.encode(variables, attributes)
276+
219277
self.set_attributes(attributes)
278+
self.set_dimensions(variables, unlimited_dims=unlimited_dims)
220279
self.set_variables(variables, check_encoding_set,
221280
unlimited_dims=unlimited_dims)
222281

223282
def set_attributes(self, attributes):
283+
"""
284+
This provides a centralized method to set the dataset attributes on the
285+
data store.
286+
287+
Parameters
288+
----------
289+
attributes : dict-like
290+
Dictionary of key/value (attribute name / attribute) pairs
291+
"""
224292
for k, v in iteritems(attributes):
225293
self.set_attribute(k, v)
226294

227295
def set_variables(self, variables, check_encoding_set,
228296
unlimited_dims=None):
297+
"""
298+
This provides a centralized method to set the variables on the data
299+
store.
300+
301+
Parameters
302+
----------
303+
variables : dict-like
304+
Dictionary of key/value (variable name / xr.Variable) pairs
305+
check_encoding_set : list-like
306+
List of variables that should be checked for invalid encoding
307+
values
308+
unlimited_dims : list-like
309+
List of dimension names that should be treated as unlimited
310+
dimensions.
311+
"""
312+
229313
for vn, v in iteritems(variables):
230314
name = _encode_variable_name(vn)
231315
check = vn in check_encoding_set
@@ -234,24 +318,51 @@ def set_variables(self, variables, check_encoding_set,
234318

235319
self.writer.add(source, target)
236320

237-
def set_necessary_dimensions(self, variable, unlimited_dims=None):
321+
def set_dimensions(self, variables, unlimited_dims=None):
322+
"""
323+
This provides a centralized method to set the dimensions on the data
324+
store.
325+
326+
Parameters
327+
----------
328+
variables : dict-like
329+
Dictionary of key/value (variable name / xr.Variable) pairs
330+
unlimited_dims : list-like
331+
List of dimension names that should be treated as unlimited
332+
dimensions.
333+
"""
238334
if unlimited_dims is None:
239335
unlimited_dims = set()
240-
dims = self.get_dimensions()
241-
for d, l in zip(variable.dims, variable.shape):
242-
if d not in dims:
243-
is_unlimited = d in unlimited_dims
244-
self.set_dimension(d, l, is_unlimited)
336+
337+
existing_dims = self.get_dimensions()
338+
339+
dims = OrderedDict()
340+
for v in unlimited_dims: # put unlimited_dims first
341+
dims[v] = None
342+
for v in variables.values():
343+
dims.update(dict(zip(v.dims, v.shape)))
344+
345+
for dim, length in dims.items():
346+
if dim in existing_dims and length != existing_dims[dim]:
347+
raise ValueError(
348+
"Unable to update size for existing dimension "
349+
"%r (%d != %d)" % (dim, length, existing_dims[dim]))
350+
elif dim not in existing_dims:
351+
is_unlimited = dim in unlimited_dims
352+
self.set_dimension(dim, length, is_unlimited)
245353

246354

247355
class WritableCFDataStore(AbstractWritableDataStore):
248356

249-
def store(self, variables, attributes, *args, **kwargs):
357+
def encode(self, variables, attributes):
250358
# All NetCDF files get CF encoded by default, without this attempting
251359
# to write times, for example, would fail.
252-
cf_variables, cf_attrs = cf_encoder(variables, attributes)
253-
AbstractWritableDataStore.store(self, cf_variables, cf_attrs,
254-
*args, **kwargs)
360+
variables, attributes = cf_encoder(variables, attributes)
361+
variables = OrderedDict([(k, self.encode_variable(v))
362+
for k, v in variables.items()])
363+
attributes = OrderedDict([(k, self.encode_attribute(v))
364+
for k, v in attributes.items()])
365+
return variables, attributes
255366

256367

257368
class DataStorePickleMixin(object):

xarray/backends/h5netcdf_.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from ..core.pycompat import iteritems, bytes_type, unicode_type, OrderedDict
1010

1111
from .common import WritableCFDataStore, DataStorePickleMixin, find_root
12-
from .netCDF4_ import (_nc4_group, _nc4_values_and_dtype,
12+
from .netCDF4_ import (_nc4_group, _encode_nc4_variable, _get_datatype,
1313
_extract_nc4_variable_encoding, BaseNetCDF4Array)
1414

1515

@@ -127,14 +127,15 @@ def set_attribute(self, key, value):
127127
with self.ensure_open(autoclose=False):
128128
self.ds.setncattr(key, value)
129129

130+
def encode_variable(self, variable):
131+
return _encode_nc4_variable(variable)
132+
130133
def prepare_variable(self, name, variable, check_encoding=False,
131134
unlimited_dims=None):
132135
import h5py
133136

134137
attrs = variable.attrs.copy()
135-
variable, dtype = _nc4_values_and_dtype(variable)
136-
137-
self.set_necessary_dimensions(variable, unlimited_dims=unlimited_dims)
138+
dtype = _get_datatype(variable)
138139

139140
fill_value = attrs.pop('_FillValue', None)
140141
if dtype is str and fill_value is not None:

xarray/backends/memory.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,13 @@ def get_attrs(self):
3030
def get_variables(self):
3131
return self._variables
3232

33+
def get_dimensions(self):
34+
dims = OrderedDict()
35+
for v in self._variables.values():
36+
for d, s in v.dims.items():
37+
dims[d] = s
38+
return dims
39+
3340
def prepare_variable(self, k, v, *args, **kwargs):
3441
new_var = Variable(v.dims, np.empty_like(v), v.attrs)
3542
# we copy the variable and stuff all encodings in the
@@ -42,6 +49,6 @@ def set_attribute(self, k, v):
4249
# copy to imitate writing to disk.
4350
self._attributes[k] = copy.deepcopy(v)
4451

45-
def set_dimension(self, d, l):
52+
def set_dimension(self, d, l, unlimited_dims=None):
4653
# in this model, dimensions are accounted for in the variables
4754
pass

xarray/backends/netCDF4_.py

Lines changed: 23 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -76,19 +76,28 @@ def __getitem__(self, key):
7676
return data
7777

7878

79-
def _nc4_values_and_dtype(var):
79+
def _encode_nc4_variable(var):
80+
if var.dtype.kind == 'S':
81+
var = conventions.maybe_encode_as_char_array(var)
82+
return var
83+
84+
85+
def _get_datatype(var, nc_format='NETCDF4'):
86+
if nc_format == 'NETCDF4':
87+
datatype = _nc4_dtype(var)
88+
else:
89+
datatype = var.dtype
90+
return datatype
91+
92+
93+
def _nc4_dtype(var):
8094
if var.dtype.kind == 'U':
8195
dtype = str
82-
elif var.dtype.kind == 'S':
83-
# use character arrays instead of unicode, because unicode support in
84-
# netCDF4 is still rather buggy
85-
var = conventions.maybe_encode_as_char_array(var)
86-
dtype = var.dtype
87-
elif var.dtype.kind in ['i', 'u', 'f', 'c']:
96+
elif var.dtype.kind in ['i', 'u', 'f', 'c', 'S']:
8897
dtype = var.dtype
8998
else:
9099
raise ValueError('cannot infer dtype for netCDF4 variable')
91-
return var, dtype
100+
return dtype
92101

93102

94103
def _nc4_group(ds, group, mode):
@@ -338,18 +347,17 @@ def set_variables(self, *args, **kwargs):
338347
with self.ensure_open(autoclose=False):
339348
super(NetCDF4DataStore, self).set_variables(*args, **kwargs)
340349

341-
def prepare_variable(self, name, variable, check_encoding=False,
342-
unlimited_dims=None):
350+
def encode_variable(self, variable):
343351
variable = _force_native_endianness(variable)
344-
345352
if self.format == 'NETCDF4':
346-
variable, datatype = _nc4_values_and_dtype(variable)
353+
variable = _encode_nc4_variable(variable)
347354
else:
348355
variable = encode_nc3_variable(variable)
349-
datatype = variable.dtype
350-
351-
self.set_necessary_dimensions(variable, unlimited_dims=unlimited_dims)
356+
return variable
352357

358+
def prepare_variable(self, name, variable, check_encoding=False,
359+
unlimited_dims=None):
360+
datatype = _get_datatype(variable, self.format)
353361
attrs = variable.attrs.copy()
354362

355363
fill_value = attrs.pop('_FillValue', None)

xarray/backends/scipy_.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -181,17 +181,16 @@ def set_attribute(self, key, value):
181181
value = encode_nc3_attr_value(value)
182182
setattr(self.ds, key, value)
183183

184+
def encode_variable(self, variable):
185+
variable = encode_nc3_variable(variable)
186+
return variable
187+
184188
def prepare_variable(self, name, variable, check_encoding=False,
185189
unlimited_dims=None):
186-
variable = encode_nc3_variable(variable)
187190
if check_encoding and variable.encoding:
188191
raise ValueError('unexpected encoding for scipy backend: %r'
189192
% list(variable.encoding))
190193

191-
if unlimited_dims is not None and len(unlimited_dims) > 1:
192-
raise ValueError('NETCDF3 only supports one unlimited dimension')
193-
self.set_necessary_dimensions(variable, unlimited_dims=unlimited_dims)
194-
195194
data = variable.data
196195
# nb. this still creates a numpy array in all memory, even though we
197196
# don't write the data yet; scipy.io.netcdf does not support

0 commit comments

Comments
 (0)