From f31a278af2a0c86daf35a6c5f488a39d84149141 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Fri, 5 Aug 2016 04:02:24 +0200 Subject: [PATCH 01/26] make multi-index levels visible as coordinates --- xarray/core/dataarray.py | 15 +++++++++++++-- xarray/core/formatting.py | 8 ++++++++ xarray/core/variable.py | 38 +++++++++++++++++++++++++++++--------- 3 files changed, 50 insertions(+), 11 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 55b254fa358..2996a91268d 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -417,12 +417,23 @@ def _item_key_to_dict(self, key): key = indexing.expanded_indexer(key, self.ndim) return dict(zip(self.dims, key)) + @property + def _level_coords(self): + level_coords = OrderedDict() + for name, var in self._coords.items(): + if name not in self.dims: + continue + level_coords.update(var.to_coord().get_level_coords()) + return level_coords + def __getitem__(self, key): if isinstance(key, basestring): from .dataset import _get_virtual_variable try: - var = self._coords[key] + var = self._coords.get(key) + if var is None: + var = self._level_coords[key] except KeyError: _, key, var = _get_virtual_variable(self._coords, key) @@ -444,7 +455,7 @@ def __delitem__(self, key): @property def _attr_sources(self): """List of places to look-up items for attribute-style access""" - return [self.coords, self.attrs] + return [self.coords, self._level_coords, self.attrs] def __contains__(self, key): return key in self._coords diff --git a/xarray/core/formatting.py b/xarray/core/formatting.py index e8b45c97e0f..8979d5fc7eb 100644 --- a/xarray/core/formatting.py +++ b/xarray/core/formatting.py @@ -222,6 +222,14 @@ def summarize_coord(name, var, col_width): is_index = name in var.dims show_values = is_index or _not_remote(var) marker = u'*' if is_index else u' ' + if is_index: + level_coords = var.variable.to_coord().get_level_coords() + if level_coords: + return u'\n'.join([ + 
_summarize_var_or_coord(coord_name, coord, col_width, + show_values, marker) + for coord_name, coord in level_coords.items() + ]) return _summarize_var_or_coord(name, var, col_width, show_values, marker) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index d29137fb61b..39f5db968f3 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1071,12 +1071,17 @@ class IndexVariable(Variable): of a NumPy array. Hence, their values are immutable and must always be one- dimensional. - They also have a name property, which is the name of their sole dimension. + They also have a name property, which is the name of their sole dimension + unless another name is given. """ - def __init__(self, name, data, attrs=None, encoding=None, fastpath=False): - super(IndexVariable, self).__init__( - name, data, attrs, encoding, fastpath) + def __init__(self, name, data, dim=None, attrs=None, encoding=None, + fastpath=False): + if dim is None: + dim = name + self._name = name + + super(IndexVariable, self).__init__(dim, data, attrs, encoding, fastpath) if self.ndim != 1: raise ValueError('%s objects must be 1-dimensional' % type(self).__name__) @@ -1092,8 +1097,8 @@ def __getitem__(self, key): if not hasattr(values, 'ndim') or values.ndim == 0: return Variable((), values, self._attrs, self._encoding) else: - return type(self)(self.dims, values, self._attrs, self._encoding, - fastpath=True) + return type(self)(self._name, values, self.dims, self._attrs, + self._encoding, fastpath=True) def __setitem__(self, key, value): raise TypeError('%s values cannot be modified' % type(self).__name__) @@ -1145,8 +1150,8 @@ def copy(self, deep=True): # there is no need to copy the index values here even if deep=True # since pandas.Index objects are immutable data = PandasIndexAdapter(self) if deep else self._data - return type(self)(self.dims, data, self._attrs, self._encoding, - fastpath=True) + return type(self)(self._name, data, self.dims, self._attrs, + self._encoding, 
fastpath=True) def _data_equals(self, other): return self.to_index().equals(other.to_index()) @@ -1173,9 +1178,24 @@ def to_index(self): index = index.set_names(self.name) return index + def get_level_coords(self): + """Return an OrderedDict of independent coordinates for each + index level, or return an empty OrderedDict if the coordinate + has not a MultiIndex. + """ + level_coords = OrderedDict() + index = self.to_index() + if not isinstance(index, pd.MultiIndex): + return level_coords + for level_name in index.names: + level_coords[level_name] = type(self)( + level_name, index.get_level_values(level_name), dim=self.name + ) + return level_coords + @property def name(self): - return self.dims[0] + return self._name @name.setter def name(self, value): From 5e8a67751356d32a3c605d8503a1c0a393c84b39 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Fri, 5 Aug 2016 11:17:27 +0200 Subject: [PATCH 02/26] make levels also visible for Dataset --- xarray/core/dataset.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 6f34107686c..bfecc192fa6 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -424,6 +424,16 @@ def _subset_with_all_valid_coords(self, variables, coord_names, attrs): return self._construct_direct(variables, coord_names, dims, attrs) + @property + def _level_coords(self): + level_coords = OrderedDict() + for name in self._coord_names: + var = self.variables[name] + if name != var.dims[0]: + continue + level_coords.update(var.to_coord().get_level_coords()) + return level_coords + def _copy_listed(self, names): """Create a new Dataset with the listed variables from this dataset and the all relevant coordinates. Skips all validation. 
@@ -450,7 +460,9 @@ def _construct_dataarray(self, name): from .dataarray import DataArray try: - variable = self._variables[name] + variable = self._variables.get(name) + if variable is None: + variable = self._level_coords[name] except KeyError: _, name, variable = _get_virtual_variable(self._variables, name) From 19ec381bf2ba7c2240d00f02c21f89595007a041 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Fri, 5 Aug 2016 11:17:58 +0200 Subject: [PATCH 03/26] fix unnamed levels --- xarray/core/variable.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 39f5db968f3..f9aa32dd492 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1171,7 +1171,7 @@ def to_index(self): if isinstance(index, pd.MultiIndex): # set default names for multi-index unnamed levels so that # we can safely rename dimension / coordinate later - valid_level_names = [name or '{}_level_{}'.format(self.name, i) + valid_level_names = [name or '{}_level_{}'.format(self.dims[0], i) for i, name in enumerate(index.names)] index = index.set_names(valid_level_names) else: @@ -1181,7 +1181,7 @@ def to_index(self): def get_level_coords(self): """Return an OrderedDict of independent coordinates for each index level, or return an empty OrderedDict if the coordinate - has not a MultiIndex. + has no MultiIndex. 
""" level_coords = OrderedDict() index = self.to_index() From 156693895c98e518181b423b90630b9d59c4ca88 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Fri, 5 Aug 2016 12:44:53 +0200 Subject: [PATCH 04/26] allow providing multi-index levels in .sel --- xarray/core/indexing.py | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 2fbcb317c37..861d2c69915 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -214,6 +214,34 @@ def convert_label_indexer(index, label, index_name='', method=None, return indexer, new_index +def _get_valid_indexers(data_obj, indexers): + """Ensure that the keys of the returned mapping of indexers + are valid dimension names (group multi-index level keys / labels into + dims / dict-based labels if needed). + """ + dim_indexers = {} + for name, label in iteritems(indexers): + dim = data_obj[name].dims[0] + if name != dim: + # we consider this case as a multi-index level indexer + if not dim_indexers.get(dim, False): + dim_indexers[dim] = {} + dim_indexers[dim][name] = label + else: + index = data_obj[dim].to_index() + if isinstance(index, pd.MultiIndex): + if not dim_indexers.get(dim, False): + dim_indexers[dim] = {} + if is_dict_like(label): + dim_indexers[dim].update(label) + else: + first_level_name = index.names[0] + dim_indexers[dim].update({first_level_name: label}) + else: + dim_indexers[dim] = label + return dim_indexers + + def remap_label_indexers(data_obj, indexers, method=None, tolerance=None): """Given an xarray data object and label based indexers, return a mapping of equivalent location based indexers. 
Also return a mapping of updated @@ -223,7 +251,7 @@ def remap_label_indexers(data_obj, indexers, method=None, tolerance=None): raise TypeError('``method`` must be a string') pos_indexers, new_indexes = {}, {} - for dim, label in iteritems(indexers): + for dim, label in iteritems(_get_valid_indexers(data_obj, indexers)): index = data_obj[dim].to_index() idxr, new_idx = convert_label_indexer(index, label, dim, method, tolerance) From 9f4e4e304234c5a00c15884dee4e5c5acb2154f1 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Sat, 6 Aug 2016 00:39:17 +0200 Subject: [PATCH 05/26] refactored _get_valid_indexers to get_dim_indexers --- xarray/core/indexing.py | 52 ++++++++++++++++++++++++----------------- 1 file changed, 31 insertions(+), 21 deletions(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 861d2c69915..13f5a62ace1 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -214,31 +214,41 @@ def convert_label_indexer(index, label, index_name='', method=None, return indexer, new_index -def _get_valid_indexers(data_obj, indexers): - """Ensure that the keys of the returned mapping of indexers - are valid dimension names (group multi-index level keys / labels into - dims / dict-based labels if needed). +def get_dim_indexers(data_obj, indexers): + """Given an xarray data object and label based indexers, return a mapping + of indexers with only dimension names as keys. + + It tries to group multiple indexers given on a multi-index dimension + into a single, dictionary indexer for that dimension (Raise a ValueError + if it is not possible). 
""" + level_indexers = {} dim_indexers = {} - for name, label in iteritems(indexers): - dim = data_obj[name].dims[0] - if name != dim: - # we consider this case as a multi-index level indexer - if not dim_indexers.get(dim, False): - dim_indexers[dim] = {} - dim_indexers[dim][name] = label + for key, label in iteritems(indexers): + dim = data_obj[key].dims[0] + if key != dim: + if not level_indexers.get(dim, False): + level_indexers[dim] = {} + level_indexers[dim][key] = label else: - index = data_obj[dim].to_index() - if isinstance(index, pd.MultiIndex): - if not dim_indexers.get(dim, False): - dim_indexers[dim] = {} - if is_dict_like(label): - dim_indexers[dim].update(label) + dim_indexers[key] = label + + for dim, level_labels in iteritems(level_indexers): + dim_idx = dim_indexers.get(dim, False) + if dim_idx: + if is_dict_like(dim_idx): + if len(set(dim_idx.keys() & set(level_labels.keys()))): + raise ValueError("Duplicate multi-index level indexer(s) " + "given for dimension %s" % dim) else: - first_level_name = index.names[0] - dim_indexers[dim].update({first_level_name: label}) + dim_indexers[dim].update(level_labels) else: - dim_indexers[dim] = label + raise ValueError("Cannot combine multi-index level indexers " + "with a non-dict indexer for dimension %s" + % dim) + else: + dim_indexers[dim] = level_labels + return dim_indexers @@ -251,7 +261,7 @@ def remap_label_indexers(data_obj, indexers, method=None, tolerance=None): raise TypeError('``method`` must be a string') pos_indexers, new_indexes = {}, {} - for dim, label in iteritems(_get_valid_indexers(data_obj, indexers)): + for dim, label in iteritems(get_dim_indexers(data_obj, indexers)): index = data_obj[dim].to_index() idxr, new_idx = convert_label_indexer(index, label, dim, method, tolerance) From 2679318d693f22882b981be188c7de7379d885a2 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Sat, 6 Aug 2016 02:38:50 +0200 Subject: [PATCH 06/26] fix broken tests --- xarray/core/coordinates.py | 16 
++++++++++++++++ xarray/core/dataarray.py | 10 +++++----- xarray/core/dataset.py | 5 ++--- xarray/core/groupby.py | 2 +- xarray/core/indexing.py | 7 +++++++ xarray/core/variable.py | 18 +++++++++++------- xarray/test/test_dataarray.py | 2 -- xarray/test/test_dataset.py | 6 ++---- 8 files changed, 44 insertions(+), 22 deletions(-) diff --git a/xarray/core/coordinates.py b/xarray/core/coordinates.py index 5a5d4d519d0..083a459df7a 100644 --- a/xarray/core/coordinates.py +++ b/xarray/core/coordinates.py @@ -215,6 +215,22 @@ def __delitem__(self, key): del self._data._coords[key] +class DataArrayLevelCoordinates(AbstractCoordinates): + """Dictionary like container for DataArray multi-index + level coordinates. + """ + def __init__(self, dataarray): + self._data = dataarray + + @property + def _names(self): + return set(self._data._level_coords) + + @property + def variables(self): + return Frozen(self._data._level_coords) + + class Indexes(Mapping, formatting.ReprMixin): """Ordered Mapping[str, pandas.Index] for xarray objects. """ diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 2996a91268d..dfbe267f6ff 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -14,7 +14,8 @@ from . 
import utils from .alignment import align from .common import AbstractArray, BaseDataObject, squeeze -from .coordinates import DataArrayCoordinates, Indexes +from .coordinates import (DataArrayCoordinates, DataArrayLevelCoordinates, + Indexes) from .dataset import Dataset from .pycompat import iteritems, basestring, OrderedDict, zip from .variable import (as_variable, Variable, as_compatible_data, IndexVariable, @@ -421,9 +422,8 @@ def _item_key_to_dict(self, key): def _level_coords(self): level_coords = OrderedDict() for name, var in self._coords.items(): - if name not in self.dims: - continue - level_coords.update(var.to_coord().get_level_coords()) + if var.ndim == 1: + level_coords.update(var.to_coord().get_level_coords()) return level_coords def __getitem__(self, key): @@ -455,7 +455,7 @@ def __delitem__(self, key): @property def _attr_sources(self): """List of places to look-up items for attribute-style access""" - return [self.coords, self._level_coords, self.attrs] + return [self.coords, DataArrayLevelCoordinates(self), self.attrs] def __contains__(self, key): return key in self._coords diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index bfecc192fa6..bc9b6c01c59 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -429,9 +429,8 @@ def _level_coords(self): level_coords = OrderedDict() for name in self._coord_names: var = self.variables[name] - if name != var.dims[0]: - continue - level_coords.update(var.to_coord().get_level_coords()) + if var.ndim == 1: + level_coords.update(var.to_coord().get_level_coords()) return level_coords def _copy_listed(self, names): diff --git a/xarray/core/groupby.py b/xarray/core/groupby.py index a182cae92c4..7b925af2105 100644 --- a/xarray/core/groupby.py +++ b/xarray/core/groupby.py @@ -283,7 +283,7 @@ def _yield_binary_applied(self, func, other): raise TypeError('GroupBy objects only support binary ops ' 'when the other argument is a Dataset or ' 'DataArray') - except KeyError: + except (KeyError, 
ValueError): if self.group.name not in other.dims: raise ValueError('incompatible dimensions for a grouped ' 'binary operation: the group variable %r ' diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 13f5a62ace1..bbe7152bfbc 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -222,11 +222,18 @@ def get_dim_indexers(data_obj, indexers): into a single, dictionary indexer for that dimension (Raise a ValueError if it is not possible). """ + invalid = [k for k in indexers + if k not in data_obj.dims and k not in data_obj._level_coords] + if invalid: + raise ValueError("dimensions or multi-index levels %r do not exist" + % invalid) + level_indexers = {} dim_indexers = {} for key, label in iteritems(indexers): dim = data_obj[key].dims[0] if key != dim: + # assume here multi-index level indexer if not level_indexers.get(dim, False): level_indexers[dim] = {} level_indexers[dim][key] = label diff --git a/xarray/core/variable.py b/xarray/core/variable.py index f9aa32dd492..d79cfba7d9f 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1075,17 +1075,21 @@ class IndexVariable(Variable): unless another name is given. 
""" - def __init__(self, name, data, dim=None, attrs=None, encoding=None, - fastpath=False): + def __init__(self, name, data, attrs=None, encoding=None, + fastpath=False, dim=None): if dim is None: dim = name - self._name = name super(IndexVariable, self).__init__(dim, data, attrs, encoding, fastpath) if self.ndim != 1: raise ValueError('%s objects must be 1-dimensional' % type(self).__name__) + if isinstance(name, basestring): + self._name = name + else: + self._name = self.dims[0] + def _data_cached(self): if not isinstance(self._data, PandasIndexAdapter): self._data = PandasIndexAdapter(self._data) @@ -1097,8 +1101,8 @@ def __getitem__(self, key): if not hasattr(values, 'ndim') or values.ndim == 0: return Variable((), values, self._attrs, self._encoding) else: - return type(self)(self._name, values, self.dims, self._attrs, - self._encoding, fastpath=True) + return type(self)(self._name, values, self._attrs, + self._encoding, fastpath=True, dim=self.dims) def __setitem__(self, key, value): raise TypeError('%s values cannot be modified' % type(self).__name__) @@ -1150,8 +1154,8 @@ def copy(self, deep=True): # there is no need to copy the index values here even if deep=True # since pandas.Index objects are immutable data = PandasIndexAdapter(self) if deep else self._data - return type(self)(self._name, data, self.dims, self._attrs, - self._encoding, fastpath=True) + return type(self)(self._name, data, self._attrs, + self._encoding, fastpath=True, dim=self.dims) def _data_equals(self, other): return self.to_index().equals(other.to_index()) diff --git a/xarray/test/test_dataarray.py b/xarray/test/test_dataarray.py index 2e789516574..01900dcf0f1 100644 --- a/xarray/test/test_dataarray.py +++ b/xarray/test/test_dataarray.py @@ -579,8 +579,6 @@ def test_sel(lab_indexer, pos_indexer, replaced_idx=False, mdata.sel(x=('a', 1))) self.assertDataArrayIdentical(mdata.loc[{'one': 'a'}, ...], mdata.sel(x={'one': 'a'})) - with self.assertRaises(KeyError): - mdata.loc[{'one': 
'a'}] with self.assertRaises(IndexError): mdata.loc[('a', 1)] diff --git a/xarray/test/test_dataset.py b/xarray/test/test_dataset.py index 61170f85a71..ac2c665c12a 100644 --- a/xarray/test/test_dataset.py +++ b/xarray/test/test_dataset.py @@ -916,8 +916,6 @@ def test_sel(lab_indexer, pos_indexer, replaced_idx=False, mdata.sel(x=('a', 1))) self.assertDatasetIdentical(mdata.loc[{'x': ('a', 1, -1)}], mdata.sel(x=('a', 1, -1))) - with self.assertRaises(KeyError): - mdata.loc[{'one': 'a'}] def test_reindex_like(self): data = create_test_data() @@ -1744,9 +1742,9 @@ def test_groupby_math(self): actual = zeros + grouped self.assertDatasetEqual(expected, actual) - with self.assertRaisesRegexp(ValueError, 'dimensions .* do not exist'): + with self.assertRaisesRegexp(ValueError, 'incompat.* grouped binary'): grouped + ds - with self.assertRaisesRegexp(ValueError, 'dimensions .* do not exist'): + with self.assertRaisesRegexp(ValueError, 'incompat.* grouped binary'): ds + grouped with self.assertRaisesRegexp(TypeError, 'only support binary ops'): grouped + 1 From 723c99a86ebd244623d296d4b5d8e00a58295169 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Thu, 11 Aug 2016 14:38:03 +0200 Subject: [PATCH 07/26] refactored accessibility and repr of index levels --- xarray/core/coordinates.py | 22 +++++++++-- xarray/core/dataarray.py | 22 ++++++++--- xarray/core/dataset.py | 76 +++++++++++++++++++++++++------------- xarray/core/formatting.py | 26 +++++++++---- xarray/core/variable.py | 31 ++++++++++------ 5 files changed, 123 insertions(+), 54 deletions(-) diff --git a/xarray/core/coordinates.py b/xarray/core/coordinates.py index 083a459df7a..f3884dadfe6 100644 --- a/xarray/core/coordinates.py +++ b/xarray/core/coordinates.py @@ -166,6 +166,13 @@ def _update_coords(self, coords): self._data._coord_names.update(updated_coord_names) self._data._dims = dict(dims) + def __setitem__(self, key, value): + if key in self._data._level_coords: + raise ValueError("cannot replace MultiIndex level 
%r, replace %r " + "coordinate instead" + % (key, self._data._level_coords[key])) + return super(DatasetCoordinates, self).__setitem__(key, value) + def __delitem__(self, key): if key in self: del self._data[key] @@ -208,6 +215,13 @@ def _to_dataset(self, shallow_copy=True): def to_dataset(self): return self._to_dataset() + def __setitem__(self, key, value): + if key in self._data._level_coords: + raise ValueError("cannot replace MultiIndex level %r, replace %r " + "coordinate instead" + % (key, self._data._level_coords[key])) + return super(DataArrayCoordinates, self).__setitem__(key, value) + def __delitem__(self, key): if key in self.dims: raise ValueError('cannot delete a coordinate corresponding to a ' @@ -216,8 +230,7 @@ def __delitem__(self, key): class DataArrayLevelCoordinates(AbstractCoordinates): - """Dictionary like container for DataArray multi-index - level coordinates. + """Dictionary like container for DataArray MultiIndex level coordinates. """ def __init__(self, dataarray): self._data = dataarray @@ -228,7 +241,10 @@ def _names(self): @property def variables(self): - return Frozen(self._data._level_coords) + level_coords = OrderedDict( + (k, self._data[v].variable.get_level_coord(k)) + for k, v in self._data._level_coords.items()) + return Frozen(level_coords) class Indexes(Mapping, formatting.ReprMixin): diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index dfbe267f6ff..2fd3460b359 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -420,10 +420,21 @@ def _item_key_to_dict(self, key): @property def _level_coords(self): + """Return a mapping of all MultiIndex levels and their corresponding + coordinate name. Raise a `ValueError`` if two or more levels have + the same name. 
+ """ level_coords = OrderedDict() - for name, var in self._coords.items(): + for cname, var in self._coords.items(): if var.ndim == 1: - level_coords.update(var.to_coord().get_level_coords()) + level_names = var.to_coord().level_names + if level_names is not None: + duplicate_names = set(level_names) & set(level_coords) + if duplicate_names: + raise ValueError("duplicate MultiIndex level names %r" + % duplicate_names) + dim = var.dims[0] + level_coords.update({lname: dim for lname in level_names}) return level_coords def __getitem__(self, key): @@ -431,11 +442,10 @@ def __getitem__(self, key): from .dataset import _get_virtual_variable try: - var = self._coords.get(key) - if var is None: - var = self._level_coords[key] + var = self._coords[key] except KeyError: - _, key, var = _get_virtual_variable(self._coords, key) + _, key, var = _get_virtual_variable( + self._coords, key, self._level_coords) return self._replace_maybe_drop_dims(var, name=key) else: diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index bc9b6c01c59..41124f460ab 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -33,34 +33,48 @@ 'quarter'] -def _get_virtual_variable(variables, key): - """Get a virtual variable (e.g., 'time.year') from a dict of - xarray.Variable objects (if possible) +def _get_virtual_variable(variables, key, level_vars={}): + """Get a virtual variable (e.g., 'time.year' or a MultiIndex level) + from a dict of xarray.Variable objects (if possible) """ if not isinstance(key, basestring): raise KeyError(key) split_key = key.split('.', 1) - if len(split_key) != 2: + if len(split_key) == 2: + ref_name, var_name = split_key + elif len(split_key) == 1: + ref_name, var_name = key, None + else: raise KeyError(key) - ref_name, var_name = split_key - ref_var = variables[ref_name] - if ref_var.ndim == 1: - date = ref_var.to_index() - elif ref_var.ndim == 0: - date = pd.Timestamp(ref_var.values) + if ref_name in level_vars: + dim_var = 
variables[level_vars[ref_name]] + ref_var = dim_var.to_coord().get_level_coord(ref_name) else: - raise KeyError(key) + ref_var = variables[ref_name] - if var_name == 'season': - # TODO: move 'season' into pandas itself - seasons = np.array(['DJF', 'MAM', 'JJA', 'SON']) - month = date.month - data = seasons[(month // 3) % 4] + if var_name is None: + virtual_var = ref_var + var_name = key else: - data = getattr(date, var_name) - return ref_name, var_name, Variable(ref_var.dims, data) + if ref_var.ndim == 1: + date = ref_var.to_index() + elif ref_var.ndim == 0: + date = pd.Timestamp(ref_var.values) + else: + raise KeyError(key) + + if var_name == 'season': + # TODO: move 'season' into pandas itself + seasons = np.array(['DJF', 'MAM', 'JJA', 'SON']) + month = date.month + data = seasons[(month // 3) % 4] + else: + data = getattr(date, var_name) + virtual_var = Variable(ref_var.dims, data) + + return ref_name, var_name, virtual_var def calculate_dimensions(variables): @@ -426,11 +440,22 @@ def _subset_with_all_valid_coords(self, variables, coord_names, attrs): @property def _level_coords(self): + """Return a mapping of all MultiIndex levels and their corresponding + coordinate name. Raise a `ValueError`` if two or more levels have + the same name. 
+ """ level_coords = OrderedDict() - for name in self._coord_names: - var = self.variables[name] + for cname in self._coord_names: + var = self.variables[cname] if var.ndim == 1: - level_coords.update(var.to_coord().get_level_coords()) + level_names = var.to_coord().level_names + if level_names is not None: + duplicate_names = set(level_names) & set(level_coords) + if duplicate_names: + raise ValueError("duplicate MultiIndex level names %r" + % duplicate_names) + dim = var.dims[0] + level_coords.update({lname: dim for lname in level_names}) return level_coords def _copy_listed(self, names): @@ -445,7 +470,7 @@ def _copy_listed(self, names): variables[name] = self._variables[name] except KeyError: ref_name, var_name, var = _get_virtual_variable( - self._variables, name) + self._variables, name, self._level_coords) variables[var_name] = var if ref_name in self._coord_names: coord_names.add(var_name) @@ -459,11 +484,10 @@ def _construct_dataarray(self, name): from .dataarray import DataArray try: - variable = self._variables.get(name) - if variable is None: - variable = self._level_coords[name] + variable = self._variables[name] except KeyError: - _, name, variable = _get_virtual_variable(self._variables, name) + _, name, variable = _get_virtual_variable( + self._variables, name, self._level_coords) coords = OrderedDict() needed_dims = set(variable.dims) diff --git a/xarray/core/formatting.py b/xarray/core/formatting.py index 8979d5fc7eb..2b7f89613fa 100644 --- a/xarray/core/formatting.py +++ b/xarray/core/formatting.py @@ -200,9 +200,23 @@ def _summarize_var_or_coord(name, var, col_width, show_values=True, values_str = format_array_flat(var, max_width - len(front_str)) else: values_str = u'...' 
+ return front_str + values_str +def _summarize_coord_multiindex(coord, col_width, marker): + first_col = pretty_print(u' %s %s ' % (marker, coord.name), col_width) + return u'%s(%s) MultiIndex' % (first_col, unicode_type(coord.dims[0])) + + +def _summarize_coord_levels(coord, col_width, marker): + # TODO: maybe slicing based on calculated number of displayed values + return u'\n'.join( + [_summarize_var_or_coord(lname, coord[:30].get_level_coord(lname), + col_width, marker=marker) + for lname in coord.level_names]) + + def _not_remote(var): """Helper function to identify if array is positively identifiable as coming from a remote source. @@ -223,13 +237,11 @@ def summarize_coord(name, var, col_width): show_values = is_index or _not_remote(var) marker = u'*' if is_index else u' ' if is_index: - level_coords = var.variable.to_coord().get_level_coords() - if level_coords: - return u'\n'.join([ - _summarize_var_or_coord(coord_name, coord, col_width, - show_values, marker) - for coord_name, coord in level_coords.items() - ]) + coord = var.variable.to_coord() + if coord.level_names is not None: + return u'\n'.join( + [_summarize_coord_multiindex(coord, col_width, marker), + _summarize_coord_levels(coord, col_width, u'-')]) return _summarize_var_or_coord(name, var, col_width, show_values, marker) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index d79cfba7d9f..0f4e471c7c1 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1182,20 +1182,27 @@ def to_index(self): index = index.set_names(self.name) return index - def get_level_coords(self): - """Return an OrderedDict of independent coordinates for each - index level, or return an empty OrderedDict if the coordinate - has no MultiIndex. + @property + def level_names(self): + """Return MultiIndex level names or None if Coordinate has no + MultiIndex. 
""" - level_coords = OrderedDict() index = self.to_index() - if not isinstance(index, pd.MultiIndex): - return level_coords - for level_name in index.names: - level_coords[level_name] = type(self)( - level_name, index.get_level_values(level_name), dim=self.name - ) - return level_coords + if isinstance(index, pd.MultiIndex): + return index.names + else: + return None + + @level_names.setter + def level_names(self, value): + raise AttributeError('cannot modify level names of Coordinate in-place') + + def get_level_coord(self, level): + """Return a new Coordinate from a given MultiIndex level.""" + if self.level_names is None: + raise ValueError("Coordinate %s has no MultiIndex" % self.name) + index = self.to_index() + return type(self)(level, index.get_level_values(level), dim=self.name) @property def name(self): From 6afcb4aa258cd2816bcf7ff60ba35f27361bd863 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Thu, 11 Aug 2016 14:39:15 +0200 Subject: [PATCH 08/26] do not allow providing both level and dim indexers in .sel --- xarray/core/indexing.py | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index bbe7152bfbc..f6e9bab7171 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -218,7 +218,7 @@ def get_dim_indexers(data_obj, indexers): """Given an xarray data object and label based indexers, return a mapping of indexers with only dimension names as keys. - It tries to group multiple indexers given on a multi-index dimension + It groups multiple level indexers given on a multi-index dimension into a single, dictionary indexer for that dimension (Raise a ValueError if it is not possible). 
""" @@ -241,20 +241,10 @@ def get_dim_indexers(data_obj, indexers): dim_indexers[key] = label for dim, level_labels in iteritems(level_indexers): - dim_idx = dim_indexers.get(dim, False) - if dim_idx: - if is_dict_like(dim_idx): - if len(set(dim_idx.keys() & set(level_labels.keys()))): - raise ValueError("Duplicate multi-index level indexer(s) " - "given for dimension %s" % dim) - else: - dim_indexers[dim].update(level_labels) - else: - raise ValueError("Cannot combine multi-index level indexers " - "with a non-dict indexer for dimension %s" - % dim) - else: - dim_indexers[dim] = level_labels + if dim_indexers.get(dim, False): + raise ValueError("Cannot combine multi-index level indexers " + "with an indexer for dimension %s" % dim) + dim_indexers[dim] = level_labels return dim_indexers From 76c937efc7838525d31ca414f73f787ceab923b2 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Fri, 12 Aug 2016 01:00:05 +0200 Subject: [PATCH 09/26] cosmetic changes --- xarray/core/coordinates.py | 3 +++ xarray/core/formatting.py | 8 ++++---- xarray/core/indexing.py | 7 +++---- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/xarray/core/coordinates.py b/xarray/core/coordinates.py index f3884dadfe6..b304bc88f87 100644 --- a/xarray/core/coordinates.py +++ b/xarray/core/coordinates.py @@ -231,6 +231,9 @@ def __delitem__(self, key): class DataArrayLevelCoordinates(AbstractCoordinates): """Dictionary like container for DataArray MultiIndex level coordinates. + + Used for attribute style lookup. Not returned directly by any + public methods. 
""" def __init__(self, dataarray): self._data = dataarray diff --git a/xarray/core/formatting.py b/xarray/core/formatting.py index 2b7f89613fa..985daf0fc59 100644 --- a/xarray/core/formatting.py +++ b/xarray/core/formatting.py @@ -209,10 +209,10 @@ def _summarize_coord_multiindex(coord, col_width, marker): return u'%s(%s) MultiIndex' % (first_col, unicode_type(coord.dims[0])) -def _summarize_coord_levels(coord, col_width, marker): - # TODO: maybe slicing based on calculated number of displayed values +def _summarize_coord_levels(coord, col_width, marker=u'-'): + relevant_coord = coord[:30] return u'\n'.join( - [_summarize_var_or_coord(lname, coord[:30].get_level_coord(lname), + [_summarize_var_or_coord(lname, relevant_coord.get_level_coord(lname), col_width, marker=marker) for lname in coord.level_names]) @@ -241,7 +241,7 @@ def summarize_coord(name, var, col_width): if coord.level_names is not None: return u'\n'.join( [_summarize_coord_multiindex(coord, col_width, marker), - _summarize_coord_levels(coord, col_width, u'-')]) + _summarize_coord_levels(coord, col_width)]) return _summarize_var_or_coord(name, var, col_width, show_values, marker) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index f6e9bab7171..65243c3fa04 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -1,4 +1,5 @@ from datetime import timedelta +from collections import defaultdict import numpy as np import pandas as pd @@ -228,14 +229,12 @@ def get_dim_indexers(data_obj, indexers): raise ValueError("dimensions or multi-index levels %r do not exist" % invalid) - level_indexers = {} + level_indexers = defaultdict(dict) dim_indexers = {} for key, label in iteritems(indexers): - dim = data_obj[key].dims[0] + dim, = data_obj[key].dims if key != dim: # assume here multi-index level indexer - if not level_indexers.get(dim, False): - level_indexers[dim] = {} level_indexers[dim][key] = label else: dim_indexers[key] = label From 5009ba832f7fc311679b58b7eb33e08ccee3bab2 Mon 
Sep 17 00:00:00 2001 From: Benoit Bovy Date: Wed, 24 Aug 2016 16:46:38 +0200 Subject: [PATCH 10/26] change signature of Coordinate.__init__ --- xarray/core/variable.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 0f4e471c7c1..8f74f81109b 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1075,10 +1075,8 @@ class IndexVariable(Variable): unless another name is given. """ - def __init__(self, name, data, attrs=None, encoding=None, - fastpath=False, dim=None): - if dim is None: - dim = name + def __init__(self, dims, data, attrs=None, encoding=None, + name=None, fastpath=False): super(IndexVariable, self).__init__(dim, data, attrs, encoding, fastpath) if self.ndim != 1: @@ -1101,8 +1099,8 @@ def __getitem__(self, key): if not hasattr(values, 'ndim') or values.ndim == 0: return Variable((), values, self._attrs, self._encoding) else: - return type(self)(self._name, values, self._attrs, - self._encoding, fastpath=True, dim=self.dims) + return type(self)(self.dims, values, self._attrs, + self._encoding, name=self._name, fastpath=True) def __setitem__(self, key, value): raise TypeError('%s values cannot be modified' % type(self).__name__) @@ -1154,8 +1152,8 @@ def copy(self, deep=True): # there is no need to copy the index values here even if deep=True # since pandas.Index objects are immutable data = PandasIndexAdapter(self) if deep else self._data - return type(self)(self._name, data, self._attrs, - self._encoding, fastpath=True, dim=self.dims) + return type(self)(self.dims, data, self._attrs, + self._encoding, name=self._name, fastpath=True) def _data_equals(self, other): return self.to_index().equals(other.to_index()) @@ -1202,7 +1200,7 @@ def get_level_coord(self, level): if self.level_names is None: raise ValueError("Coordinate %s has no MultiIndex" % self.name) index = self.to_index() - return type(self)(level, index.get_level_values(level), 
dim=self.name) + return type(self)(self.dims, index.get_level_values(level), name=level) @property def name(self): From 4c78ea9e9296b972ce15b9a638f1615af7740f3d Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Tue, 30 Aug 2016 14:55:58 +0200 Subject: [PATCH 11/26] check for uniqueness of multi-index level names --- xarray/core/dataarray.py | 10 ++++++++++ xarray/core/dataset.py | 16 ++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 2fd3460b359..33a623ddd26 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -71,6 +71,7 @@ def _infer_coords_and_dims(shape, coords, dims): new_coords[dim] = default_index_coordinate(dim, size) sizes = dict(zip(dims, shape)) + level_names = {} for k, v in new_coords.items(): if any(d not in dims for d in v.dims): raise ValueError('coordinate %s has dimensions %s, but these ' @@ -83,6 +84,15 @@ def _infer_coords_and_dims(shape, coords, dims): 'length %s on the data but length %s on ' 'coordinate %r' % (d, sizes[d], s, k)) + if v.ndim == 1: + idx_level_names = v.to_coord().level_names or [] + for n in idx_level_names: + if n in level_names: + raise ValueError('found duplicate MultiIndex level ' + 'name %r for coordinates %r and %r' + % (n, k, level_names[n])) + level_names[n] = k + return new_coords, dims diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 41124f460ab..72558a69f3c 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -216,6 +216,7 @@ def __init__(self, data_vars=None, coords=None, attrs=None, coords = {} if data_vars is not None or coords is not None: self._set_init_vars_and_dims(data_vars, coords, compat) + self._check_multiindex_level_names() if attrs is not None: self.attrs = attrs self._initialized = True @@ -242,6 +243,21 @@ def _set_init_vars_and_dims(self, data_vars, coords, compat): self._coord_names = coord_names self._dims = dims + def _check_multiindex_level_names(self): + """Check for 
uniqueness of MultiIndex level names + """ + level_names = {} + for c in self._coord_names: + v = self._variables[c] + if v.ndim == 1 and v._in_memory: + idx_level_names = v.to_coord().level_names or [] + for n in idx_level_names: + if n in level_names: + raise ValueError('found duplicate MultiIndex level ' + 'name %r for coordinates %r and %r' + % (n, c, level_names[n])) + level_names[n] = c + @classmethod def load_store(cls, store, decoder=None): """Create a new dataset from the contents of a backends.*DataStore From d28e82926634c60653dd5413da78f87ac3ea6629 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Tue, 30 Aug 2016 15:22:43 +0200 Subject: [PATCH 12/26] no need to check for uniqueness of level names in _level_coords --- xarray/core/dataarray.py | 7 +------ xarray/core/dataset.py | 7 +------ 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 33a623ddd26..8c84a8a2399 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -431,18 +431,13 @@ def _item_key_to_dict(self, key): @property def _level_coords(self): """Return a mapping of all MultiIndex levels and their corresponding - coordinate name. Raise a `ValueError`` if two or more levels have - the same name. + coordinate name. 
""" level_coords = OrderedDict() for cname, var in self._coords.items(): if var.ndim == 1: level_names = var.to_coord().level_names if level_names is not None: - duplicate_names = set(level_names) & set(level_coords) - if duplicate_names: - raise ValueError("duplicate MultiIndex level names %r" - % duplicate_names) dim = var.dims[0] level_coords.update({lname: dim for lname in level_names}) return level_coords diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 72558a69f3c..62f5f5233bd 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -457,8 +457,7 @@ def _subset_with_all_valid_coords(self, variables, coord_names, attrs): @property def _level_coords(self): """Return a mapping of all MultiIndex levels and their corresponding - coordinate name. Raise a `ValueError`` if two or more levels have - the same name. + coordinate name. """ level_coords = OrderedDict() for cname in self._coord_names: @@ -466,10 +465,6 @@ def _level_coords(self): if var.ndim == 1: level_names = var.to_coord().level_names if level_names is not None: - duplicate_names = set(level_names) & set(level_coords) - if duplicate_names: - raise ValueError("duplicate MultiIndex level names %r" - % duplicate_names) dim = var.dims[0] level_coords.update({lname: dim for lname in level_names}) return level_coords From 810b4f9f3c91ea5b4258a7522139d709ff51788e Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Wed, 31 Aug 2016 17:30:03 +0200 Subject: [PATCH 13/26] rewritten checking uniqueness of multi-index level names --- xarray/core/dataarray.py | 12 +++--------- xarray/core/dataset.py | 16 ---------------- xarray/core/merge.py | 8 ++++++-- xarray/core/variable.py | 26 ++++++++++++++++++++++++++ 4 files changed, 35 insertions(+), 27 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 8c84a8a2399..d7a5742021e 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -19,7 +19,8 @@ from .dataset import Dataset from .pycompat import 
iteritems, basestring, OrderedDict, zip from .variable import (as_variable, Variable, as_compatible_data, IndexVariable, - default_index_coordinate) + default_index_coordinate, + assert_unique_multiindex_level_names) from .formatting import format_item @@ -84,14 +85,7 @@ def _infer_coords_and_dims(shape, coords, dims): 'length %s on the data but length %s on ' 'coordinate %r' % (d, sizes[d], s, k)) - if v.ndim == 1: - idx_level_names = v.to_coord().level_names or [] - for n in idx_level_names: - if n in level_names: - raise ValueError('found duplicate MultiIndex level ' - 'name %r for coordinates %r and %r' - % (n, k, level_names[n])) - level_names[n] = k + assert_unique_multiindex_level_names(new_coords) return new_coords, dims diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 62f5f5233bd..c13f8fdaadb 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -216,7 +216,6 @@ def __init__(self, data_vars=None, coords=None, attrs=None, coords = {} if data_vars is not None or coords is not None: self._set_init_vars_and_dims(data_vars, coords, compat) - self._check_multiindex_level_names() if attrs is not None: self.attrs = attrs self._initialized = True @@ -243,21 +242,6 @@ def _set_init_vars_and_dims(self, data_vars, coords, compat): self._coord_names = coord_names self._dims = dims - def _check_multiindex_level_names(self): - """Check for uniqueness of MultiIndex level names - """ - level_names = {} - for c in self._coord_names: - v = self._variables[c] - if v.ndim == 1 and v._in_memory: - idx_level_names = v.to_coord().level_names or [] - for n in idx_level_names: - if n in level_names: - raise ValueError('found duplicate MultiIndex level ' - 'name %r for coordinates %r and %r' - % (n, c, level_names[n])) - level_names[n] = c - @classmethod def load_store(cls, store, decoder=None): """Create a new dataset from the contents of a backends.*DataStore diff --git a/xarray/core/merge.py b/xarray/core/merge.py index 28eb8ecea84..8cab70f9fff 
100644 --- a/xarray/core/merge.py +++ b/xarray/core/merge.py @@ -2,7 +2,8 @@ from .alignment import align from .utils import Frozen, is_dict_like -from .variable import as_variable, default_index_coordinate +from .variable import (as_variable, default_index_coordinate, + assert_unique_multiindex_level_names) from .pycompat import (basestring, OrderedDict) @@ -110,7 +111,7 @@ def merge_variables( If provided, variables are always taken from this dict in preference to the input variable dictionaries, without checking for conflicts. compat : {'identical', 'equals', 'broadcast_equals', 'minimal'}, optional - Type of equality check to use wben checking for conflicts. + Type of equality check to use when checking for conflicts. Returns ------- @@ -278,6 +279,7 @@ def merge_coords_without_align(objs, priority_vars=None): """ expanded = expand_variable_dicts(objs) variables = merge_variables(expanded, priority_vars) + assert_unique_multiindex_level_names(variables) return variables @@ -370,6 +372,7 @@ def merge_coords(objs, compat='minimal', join='outer', priority_arg=None, expanded = expand_variable_dicts(aligned) priority_vars = _get_priority_vars(aligned, priority_arg, compat=compat) variables = merge_variables(expanded, priority_vars, compat=compat) + assert_unique_multiindex_level_names(variables) return variables @@ -431,6 +434,7 @@ def merge_core(objs, compat='broadcast_equals', join='outer', priority_arg=None, priority_vars = _get_priority_vars(aligned, priority_arg, compat=compat) variables = merge_variables(expanded, priority_vars, compat=compat) + assert_unique_multiindex_level_names(variables) dims = calculate_dimensions(variables) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 8f74f81109b..e043520084a 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1,4 +1,5 @@ from datetime import timedelta +from collections import defaultdict import functools import itertools import warnings @@ -1305,3 +1306,28 @@ def 
concat(variables, dim='concat_dim', positions=None, shortcut=False): return IndexVariable.concat(variables, dim, positions, shortcut) else: return Variable.concat(variables, dim, positions, shortcut) + + +def assert_unique_multiindex_level_names(variables): + """Check for uniqueness of MultiIndex level names in all given + variables. + + Not public API. Used for checking consistency of DataArray and Dataset + objects. + """ + level_names = defaultdict(list) + for var_name, var in variables.items(): + if isinstance(var._data, PandasIndexAdapter): + idx_level_names = var.to_coord().level_names + if idx_level_names is not None: + for n in idx_level_names: + level_names[n].append(var_name) + + duplicate_level_names = {k: v for k, v in level_names.items() + if len(v) > 1} + if duplicate_level_names: + duplicate_str = '\n'.join(['level %r found in %s' + % (k, ' and '.join(v)) + for k, v in duplicate_level_names.items()]) + raise ValueError('conflicting MultiIndex level names:\n%s' + % duplicate_str) From 7738059757abff35ddcf5ea06137aff247040abc Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Thu, 1 Sep 2016 15:14:37 +0200 Subject: [PATCH 14/26] fix adding coords/vars with the same name than a multi-index level --- xarray/core/coordinates.py | 12 ++++++++++++ xarray/core/dataarray.py | 1 - xarray/core/dataset.py | 6 ++++++ 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/xarray/core/coordinates.py b/xarray/core/coordinates.py index b304bc88f87..f9d737d7854 100644 --- a/xarray/core/coordinates.py +++ b/xarray/core/coordinates.py @@ -151,6 +151,12 @@ def to_dataset(self): def _update_coords(self, coords): from .dataset import calculate_dimensions + for key in coords: + if key in self._data._level_coords: + raise ValueError("%r is already a MultiIndex level of " + "coordinate %r" + % (key, self._data._level_coords[key])) + variables = self._data._variables.copy() variables.update(coords) @@ -196,6 +202,12 @@ def _names(self): def _update_coords(self, coords): 
from .dataset import calculate_dimensions + for key in coords: + if key in self._data._level_coords: + raise ValueError("%r is already a MultiIndex level of " + "coordinate %r" + % (key, self._data._level_coords[key])) + dims = calculate_dimensions(coords) if set(dims) != set(self.dims): raise ValueError('cannot add coordinates with new dimensions to ' diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index d7a5742021e..0479eb1faba 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -72,7 +72,6 @@ def _infer_coords_and_dims(shape, coords, dims): new_coords[dim] = default_index_coordinate(dim, size) sizes = dict(zip(dims, shape)) - level_names = {} for k, v in new_coords.items(): if any(d not in dims for d in v.dims): raise ValueError('coordinate %s has dimensions %s, but these ' diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index c13f8fdaadb..1f488c424ab 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -551,6 +551,12 @@ def __setitem__(self, key, value): if utils.is_dict_like(key): raise NotImplementedError('cannot yet use a dictionary as a key ' 'to set Dataset values') + + if key in self._level_coords: + raise ValueError("%r is already a MultiIndex level of " + "coordinate %r" + % (key, self._level_coords[key])) + self.update({key: value}) def __delitem__(self, key): From 62b46f274e23b419c0315d8ba147eb3133cd5d6c Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Thu, 1 Sep 2016 22:53:49 +0200 Subject: [PATCH 15/26] check for level/var name conflicts in one place --- xarray/core/coordinates.py | 12 ------------ xarray/core/variable.py | 13 ++++++++----- 2 files changed, 8 insertions(+), 17 deletions(-) diff --git a/xarray/core/coordinates.py b/xarray/core/coordinates.py index f9d737d7854..b304bc88f87 100644 --- a/xarray/core/coordinates.py +++ b/xarray/core/coordinates.py @@ -151,12 +151,6 @@ def to_dataset(self): def _update_coords(self, coords): from .dataset import calculate_dimensions - for key 
in coords: - if key in self._data._level_coords: - raise ValueError("%r is already a MultiIndex level of " - "coordinate %r" - % (key, self._data._level_coords[key])) - variables = self._data._variables.copy() variables.update(coords) @@ -202,12 +196,6 @@ def _names(self): def _update_coords(self, coords): from .dataset import calculate_dimensions - for key in coords: - if key in self._data._level_coords: - raise ValueError("%r is already a MultiIndex level of " - "coordinate %r" - % (key, self._data._level_coords[key])) - dims = calculate_dimensions(coords) if set(dims) != set(self.dims): raise ValueError('cannot add coordinates with new dimensions to ' diff --git a/xarray/core/variable.py b/xarray/core/variable.py index e043520084a..0ab6f084a10 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1321,13 +1321,16 @@ def assert_unique_multiindex_level_names(variables): idx_level_names = var.to_coord().level_names if idx_level_names is not None: for n in idx_level_names: - level_names[n].append(var_name) + level_names[n].append('%r (%s)' % (n, var_name)) + + for n in level_names: + if n in variables: + level_names[n].append('(%s)' % n) duplicate_level_names = {k: v for k, v in level_names.items() if len(v) > 1} if duplicate_level_names: - duplicate_str = '\n'.join(['level %r found in %s' - % (k, ' and '.join(v)) - for k, v in duplicate_level_names.items()]) - raise ValueError('conflicting MultiIndex level names:\n%s' + duplicate_str = '\n'.join([', '.join(v) + for v in duplicate_level_names.values()]) + raise ValueError('conflicting MultiIndex level name(s):\n%s' % duplicate_str) From 936ec55774e5fd9b8c3c4fc285755f24aa5345a5 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Fri, 2 Sep 2016 10:06:42 +0200 Subject: [PATCH 16/26] cosmetic changes --- xarray/core/variable.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 0ab6f084a10..a25dc007ac5 100644 --- 
a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1323,14 +1323,12 @@ def assert_unique_multiindex_level_names(variables): for n in idx_level_names: level_names[n].append('%r (%s)' % (n, var_name)) - for n in level_names: - if n in variables: - level_names[n].append('(%s)' % n) - - duplicate_level_names = {k: v for k, v in level_names.items() - if len(v) > 1} - if duplicate_level_names: - duplicate_str = '\n'.join([', '.join(v) - for v in duplicate_level_names.values()]) + for k, v in level_names.items(): + if k in variables: + v.append('(%s)' % n) + + duplicate_names = [v for v in level_names.values() if len(v) > 1] + if duplicate_names: + conflict_str = '\n'.join([', '.join(v) for v in duplicate_names]) raise ValueError('conflicting MultiIndex level name(s):\n%s' - % duplicate_str) + % conflict_str) From 1d6a96ff1effb6307eafb6af0fb8c07352615500 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Fri, 2 Sep 2016 10:56:39 +0200 Subject: [PATCH 17/26] fix Coordinate -> IndexVariable --- xarray/core/coordinates.py | 4 ++-- xarray/core/dataarray.py | 2 +- xarray/core/dataset.py | 4 ++-- xarray/core/formatting.py | 5 +++-- xarray/core/indexing.py | 4 ++-- xarray/core/variable.py | 16 +++++++++------- 6 files changed, 19 insertions(+), 16 deletions(-) diff --git a/xarray/core/coordinates.py b/xarray/core/coordinates.py index b304bc88f87..1dda1e9ffc0 100644 --- a/xarray/core/coordinates.py +++ b/xarray/core/coordinates.py @@ -245,8 +245,8 @@ def _names(self): @property def variables(self): level_coords = OrderedDict( - (k, self._data[v].variable.get_level_coord(k)) - for k, v in self._data._level_coords.items()) + (k, self._data[v].variable.get_level_variable(k)) + for k, v in self._data._level_coords.items()) return Frozen(level_coords) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 0479eb1faba..e8758c7b62e 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -429,7 +429,7 @@ def _level_coords(self): level_coords = 
OrderedDict() for cname, var in self._coords.items(): if var.ndim == 1: - level_names = var.to_coord().level_names + level_names = var.to_index_variable().level_names if level_names is not None: dim = var.dims[0] level_coords.update({lname: dim for lname in level_names}) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 1f488c424ab..71867f0537b 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -50,7 +50,7 @@ def _get_virtual_variable(variables, key, level_vars={}): if ref_name in level_vars: dim_var = variables[level_vars[ref_name]] - ref_var = dim_var.to_coord().get_level_coord(ref_name) + ref_var = dim_var.to_index_variable().get_level_variable(ref_name) else: ref_var = variables[ref_name] @@ -447,7 +447,7 @@ def _level_coords(self): for cname in self._coord_names: var = self.variables[cname] if var.ndim == 1: - level_names = var.to_coord().level_names + level_names = var.to_index_variable().level_names if level_names is not None: dim = var.dims[0] level_coords.update({lname: dim for lname in level_names}) diff --git a/xarray/core/formatting.py b/xarray/core/formatting.py index 985daf0fc59..dfe07ea31bf 100644 --- a/xarray/core/formatting.py +++ b/xarray/core/formatting.py @@ -212,7 +212,8 @@ def _summarize_coord_multiindex(coord, col_width, marker): def _summarize_coord_levels(coord, col_width, marker=u'-'): relevant_coord = coord[:30] return u'\n'.join( - [_summarize_var_or_coord(lname, relevant_coord.get_level_coord(lname), + [_summarize_var_or_coord(lname, + relevant_coord.get_level_variable(lname), col_width, marker=marker) for lname in coord.level_names]) @@ -237,7 +238,7 @@ def summarize_coord(name, var, col_width): show_values = is_index or _not_remote(var) marker = u'*' if is_index else u' ' if is_index: - coord = var.variable.to_coord() + coord = var.variable.to_index_variable() if coord.level_names is not None: return u'\n'.join( [_summarize_coord_multiindex(coord, col_width, marker), diff --git 
a/xarray/core/indexing.py b/xarray/core/indexing.py index 65243c3fa04..3cafda83bc4 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -216,8 +216,8 @@ def convert_label_indexer(index, label, index_name='', method=None, def get_dim_indexers(data_obj, indexers): - """Given an xarray data object and label based indexers, return a mapping - of indexers with only dimension names as keys. + """Given a xarray data object and label based indexers, return a mapping + of label indexers with only dimension names as keys. It groups multiple level indexers given on a multi-index dimension into a single, dictionary indexer for that dimension (Raise a ValueError diff --git a/xarray/core/variable.py b/xarray/core/variable.py index a25dc007ac5..7f62b0e0886 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1079,7 +1079,8 @@ class IndexVariable(Variable): def __init__(self, dims, data, attrs=None, encoding=None, name=None, fastpath=False): - super(IndexVariable, self).__init__(dim, data, attrs, encoding, fastpath) + super(IndexVariable, self).__init__(dims, data, attrs, encoding, + fastpath) if self.ndim != 1: raise ValueError('%s objects must be 1-dimensional' % type(self).__name__) @@ -1183,7 +1184,7 @@ def to_index(self): @property def level_names(self): - """Return MultiIndex level names or None if Coordinate has no + """Return MultiIndex level names or None if this IndexVariable has no MultiIndex. 
""" index = self.to_index() @@ -1194,12 +1195,13 @@ def level_names(self): @level_names.setter def level_names(self, value): - raise AttributeError('cannot modify level names of Coordinate in-place') + raise AttributeError('cannot modify level names of ' + 'IndexVariable in-place') - def get_level_coord(self, level): - """Return a new Coordinate from a given MultiIndex level.""" + def get_level_variable(self, level): + """Return a new IndexVariable from a given MultiIndex level.""" if self.level_names is None: - raise ValueError("Coordinate %s has no MultiIndex" % self.name) + raise ValueError("IndexVariable %r has no MultiIndex" % self.name) index = self.to_index() return type(self)(self.dims, index.get_level_values(level), name=level) @@ -1318,7 +1320,7 @@ def assert_unique_multiindex_level_names(variables): level_names = defaultdict(list) for var_name, var in variables.items(): if isinstance(var._data, PandasIndexAdapter): - idx_level_names = var.to_coord().level_names + idx_level_names = var.to_index_variable().level_names if idx_level_names is not None: for n in idx_level_names: level_names[n].append('%r (%s)' % (n, var_name)) From ec67bbd50211a7cd2e7558a9d138821d7a7a4e46 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Fri, 2 Sep 2016 13:47:35 +0200 Subject: [PATCH 18/26] fix col width when formatting multi-index levels --- xarray/core/formatting.py | 35 +++++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/xarray/core/formatting.py b/xarray/core/formatting.py index dfe07ea31bf..1f496b4d193 100644 --- a/xarray/core/formatting.py +++ b/xarray/core/formatting.py @@ -13,6 +13,7 @@ from .options import OPTIONS from .pycompat import PY2, iteritems, unicode_type, bytes_type, dask_array_type +from .indexing import PandasIndexAdapter def pretty_print(x, numchars): @@ -254,9 +255,24 @@ def summarize_attr(key, value, col_width=None): EMPTY_REPR = u' *empty*' -def _calculate_col_width(mapping): - max_name_length = 
(max(len(unicode_type(k)) for k in mapping) - if mapping else 0) +def _get_col_items(mapping): + """Get all column items to format, including both keys of `mapping` + and MultiIndex levels if any. + """ + col_items = [] + for k, v in mapping.items(): + col_items.append(k) + var = getattr(v, 'variable', v) + if isinstance(var._data, PandasIndexAdapter): + level_names = var.to_index_variable().level_names + if level_names is not None: + col_items += list(level_names) + return col_items + + +def _calculate_col_width(col_items): + max_name_length = (max(len(unicode_type(s)) for s in col_items) + if col_items else 0) col_width = max(max_name_length, 7) + 6 return col_width @@ -272,10 +288,6 @@ def _mapping_repr(mapping, title, summarizer, col_width=None): return u'\n'.join(summary) -coords_repr = functools.partial(_mapping_repr, title=u'Coordinates', - summarizer=summarize_coord) - - vars_repr = functools.partial(_mapping_repr, title=u'Data variables', summarizer=summarize_var) @@ -284,6 +296,13 @@ def _mapping_repr(mapping, title, summarizer, col_width=None): summarizer=summarize_attr) +def coords_repr(coords, col_width=None): + if col_width is None: + col_width = _calculate_col_width(_get_col_items(coords)) + return _mapping_repr(coords, title=u'Coordinates', + summarizer=summarize_coord, col_width=col_width) + + def indexes_repr(indexes): summary = [] for k, v in indexes.items(): @@ -323,7 +342,7 @@ def array_repr(arr): def dataset_repr(ds): summary = [u'' % type(ds).__name__] - col_width = _calculate_col_width(ds) + col_width = _calculate_col_width(_get_col_items(ds)) dims_start = pretty_print(u'Dimensions:', col_width) all_dim_strings = [u'%s: %s' % (k, v) for k, v in iteritems(ds.dims)] From f80d7a8fcaae961ca70a08892dd7cd1b74202ace Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Fri, 2 Sep 2016 16:40:33 +0200 Subject: [PATCH 19/26] add tests for IndexVariable new methods and indexing --- xarray/core/indexing.py | 2 +- xarray/test/test_dataarray.py | 5 ++++- 
xarray/test/test_dataset.py | 3 +++ xarray/test/test_indexing.py | 16 +++++++++++++++- xarray/test/test_variable.py | 25 +++++++++++++++++++++++++ 5 files changed, 48 insertions(+), 3 deletions(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 3cafda83bc4..bfdc6d305ad 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -241,7 +241,7 @@ def get_dim_indexers(data_obj, indexers): for dim, level_labels in iteritems(level_indexers): if dim_indexers.get(dim, False): - raise ValueError("Cannot combine multi-index level indexers " + raise ValueError("cannot combine multi-index level indexers " "with an indexer for dimension %s" % dim) dim_indexers[dim] = level_labels diff --git a/xarray/test/test_dataarray.py b/xarray/test/test_dataarray.py index 01900dcf0f1..e495abb8599 100644 --- a/xarray/test/test_dataarray.py +++ b/xarray/test/test_dataarray.py @@ -543,7 +543,7 @@ def test_loc_single_boolean(self): self.assertEqual(data.loc[True], 0) self.assertEqual(data.loc[False], 1) - def test_multiindex(self): + def test_selection_multiindex(self): mindex = pd.MultiIndex.from_product([['a', 'b'], [1, 2], [-1, -2]], names=('one', 'two', 'three')) mdata = DataArray(range(8), [('x', mindex)]) @@ -582,6 +582,9 @@ def test_sel(lab_indexer, pos_indexer, replaced_idx=False, with self.assertRaises(IndexError): mdata.loc[('a', 1)] + self.assertDataArrayIdentical(mdata.sel(x={'one': 'a', 'two': 1}), + mdata.sel(one='a', two=1)) + def test_time_components(self): dates = pd.date_range('2000-01-01', periods=10) da = DataArray(np.arange(1, 11), [('time', dates)]) diff --git a/xarray/test/test_dataset.py b/xarray/test/test_dataset.py index ac2c665c12a..d57c48c9626 100644 --- a/xarray/test/test_dataset.py +++ b/xarray/test/test_dataset.py @@ -917,6 +917,9 @@ def test_sel(lab_indexer, pos_indexer, replaced_idx=False, self.assertDatasetIdentical(mdata.loc[{'x': ('a', 1, -1)}], mdata.sel(x=('a', 1, -1))) + self.assertDatasetIdentical(mdata.sel(x={'one': 'a', 
'two': 1}), + mdata.sel(one='a', two=1)) + def test_reindex_like(self): data = create_test_data() data['letters'] = ('dim3', 10 * ['a']) diff --git a/xarray/test/test_indexing.py b/xarray/test/test_indexing.py index 1dca99ec99a..7ed3f5bc372 100644 --- a/xarray/test/test_indexing.py +++ b/xarray/test/test_indexing.py @@ -107,8 +107,22 @@ def test_convert_unsorted_datetime_index_raises(self): # slice is always a view. indexing.convert_label_indexer(index, slice('2001', '2002')) + def test_get_dim_indexers(self): + mindex = pd.MultiIndex.from_product([['a', 'b'], [1, 2]], + names=('one', 'two')) + mdata = DataArray(range(4), [('x', mindex)]) + + dim_indexers = indexing.get_dim_indexers(mdata, {'one': 'a', 'two': 1}) + self.assertEqual(dim_indexers, {'x': {'one': 'a', 'two': 1}}) + + with self.assertRaisesRegexp(ValueError, 'cannot combine'): + _ = indexing.get_dim_indexers(mdata, {'x': 'a', 'two': 1}) + + with self.assertRaisesRegexp(ValueError, 'do not exist'): + _ = indexing.get_dim_indexers(mdata, {'y': 'a'}) + _ = indexing.get_dim_indexers(data, {'four': 1}) + def test_remap_label_indexers(self): - # TODO: fill in more tests! 
def test_indexer(data, x, expected_pos, expected_idx=None): pos, idx = indexing.remap_label_indexers(data, {'x': x}) self.assertArrayEqual(pos.get('x'), expected_pos) diff --git a/xarray/test/test_variable.py b/xarray/test/test_variable.py index 10e360f5322..dc6ce78b1ee 100644 --- a/xarray/test/test_variable.py +++ b/xarray/test/test_variable.py @@ -1035,9 +1035,34 @@ def test_name(self): coord = IndexVariable('x', [10.0]) self.assertEqual(coord.name, 'x') + coord = IndexVariable('x', [10.0], name='y') + self.assertEqual(coord.name, 'y') + with self.assertRaises(AttributeError): coord.name = 'y' + def test_level_names(self): + midx = pd.MultiIndex.from_product([['a', 'b'], [1, 2]], + names=['level_1', 'level_2']) + x = IndexVariable('x', midx) + self.assertEqual(x.level_names, midx.names) + + with self.assertRaisesRegexp(AttributeError, 'cannot modify'): + x.level_names = ['one', 'two'] + + self.assertIsNone(IndexVariable('y', [10.0]).level_names) + + def test_get_level_variable(self): + midx = pd.MultiIndex.from_product([['a', 'b'], [1, 2]], + names=['level_1', 'level_2']) + x = IndexVariable('x', midx) + level_1 = IndexVariable('x', midx.get_level_values('level_1'), + name='level_1') + self.assertVariableIdentical(x.get_level_variable('level_1'), level_1) + + with self.assertRaisesRegexp(ValueError, 'has no MultiIndex'): + IndexVariable('y', [10.0]).get_level_variable('level') + def test_concat_periods(self): periods = pd.period_range('2000-01-01', periods=10) coords = [IndexVariable('t', periods[:5]), IndexVariable('t', periods[5:])] From 861c78b6575183d603793d66f98da85631a132d0 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Fri, 2 Sep 2016 18:42:41 +0200 Subject: [PATCH 20/26] fix bug in assert_unique_multiindex_level_names --- xarray/core/variable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 7f62b0e0886..098373eb4f8 100644 --- a/xarray/core/variable.py +++ 
b/xarray/core/variable.py @@ -1327,7 +1327,7 @@ def assert_unique_multiindex_level_names(variables): for k, v in level_names.items(): if k in variables: - v.append('(%s)' % n) + v.append('(%s)' % k) duplicate_names = [v for v in level_names.values() if len(v) > 1] if duplicate_names: From 37a0796b0b14df2db71b0602276d071a159d0b28 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Fri, 2 Sep 2016 18:49:32 +0200 Subject: [PATCH 21/26] add tests for Dataset --- xarray/test/test_dataset.py | 79 ++++++++++++++++++++++++++++++++++++- 1 file changed, 78 insertions(+), 1 deletion(-) diff --git a/xarray/test/test_dataset.py b/xarray/test/test_dataset.py index d57c48c9626..dcc7048baa3 100644 --- a/xarray/test/test_dataset.py +++ b/xarray/test/test_dataset.py @@ -45,6 +45,12 @@ def create_test_data(seed=None): return obj +def create_test_multiindex(): + mindex = pd.MultiIndex.from_product([['a', 'b'], [1, 2]], + names=('level_1', 'level_2')) + return Dataset(data_vars={'foo': ('x', range(4))}, coords={'x': mindex}) + + class InaccessibleVariableDataStore(backends.InMemoryDataStore): def get_variables(self): def lazy_inaccessible(x): @@ -110,6 +116,39 @@ def test_repr(self): data = Dataset(attrs={'foo': 'bar' * 1000}) self.assertTrue(len(repr(data)) < 1000) + def test_repr_multiindex(self): + data = create_test_multiindex() + expected = dedent("""\ + + Dimensions: (x: 4) + Coordinates: + * x (x) MultiIndex + - level_1 (x) object 'a' 'a' 'b' 'b' + - level_2 (x) int64 1 2 1 2 + Data variables: + foo (x) int64 0 1 2 3""") + actual = '\n'.join(x.rstrip() for x in repr(data).split('\n')) + print(actual) + self.assertEqual(expected, actual) + + # verify that long level names are not truncated + mindex = pd.MultiIndex.from_product( + [['a', 'b'], [1, 2]], + names=('a_quite_long_level_name', 'level_2')) + data = data.assign_coords(x=mindex) + expected = dedent("""\ + + Dimensions: (x: 4) + Coordinates: + * x (x) MultiIndex + - a_quite_long_level_name (x) object 'a' 'a' 'b' 'b' + - 
level_2 (x) int64 1 2 1 2 + Data variables: + foo (x) int64 0 1 2 3""") + actual = '\n'.join(x.rstrip() for x in repr(data).split('\n')) + print(actual) + self.assertEqual(expected, actual) + def test_repr_period_index(self): data = create_test_data(seed=456) data.coords['time'] = pd.period_range('2000-01-01', periods=20, freq='B') @@ -288,6 +327,12 @@ def test_constructor_with_coords(self): self.assertFalse(ds.data_vars) self.assertItemsEqual(ds.coords.keys(), ['x', 'a']) + mindex = pd.MultiIndex.from_product([['a', 'b'], [1, 2]], + names=('level_1', 'level_2')) + with self.assertRaisesRegexp(ValueError, 'conflicting MultiIndex'): + Dataset({}, {'x': mindex, 'y': mindex}) + Dataset({}, {'x': mindex, 'level_1': range(4)}) + def test_properties(self): ds = create_test_data() self.assertEqual(ds.dims, @@ -466,6 +511,11 @@ def test_coords_setitem_with_new_dimension(self): expected = Dataset(coords={'foo': ('x', [1, 2, 3])}) self.assertDatasetIdentical(expected, actual) + def test_coords_setitem_multiindex(self): + data = create_test_multiindex() + with self.assertRaisesRegexp(ValueError, 'cannot replace MultiIndex'): + data.coords['level_1'] = range(4) + def test_coords_set(self): one_coord = Dataset({'x': ('x', [0]), 'yy': ('x', [1]), @@ -876,7 +926,7 @@ def test_loc(self): with self.assertRaises(TypeError): data.loc[dict(dim3='a')] = 0 - def test_multiindex(self): + def test_selection_multiindex(self): mindex = pd.MultiIndex.from_product([['a', 'b'], [1, 2], [-1, -2]], names=('one', 'two', 'three')) mdata = Dataset(data_vars={'var': ('x', range(8))}, @@ -1474,6 +1524,22 @@ def test_virtual_variable_same_name(self): expected = DataArray(times.time, [('time', times)], name='time') self.assertDataArrayIdentical(actual, expected) + def test_virtual_variable_multiindex(self): + # access multi-index levels as virtual variables + data = create_test_multiindex() + expected = DataArray(['a', 'a', 'b', 'b'], name='level_1', + coords=[data['x'].to_index()], dims='x') + 
self.assertDataArrayIdentical(expected, data['level_1']) + + # combine multi-index level and datetime + dr_index = pd.date_range('1/1/2011', periods=4, freq='H') + mindex = pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b'], dr_index], + names=('level_str', 'level_date')) + data = Dataset({}, {'x': mindex}) + expected = DataArray(mindex.get_level_values('level_date').hour, + name='hour', coords=[mindex], dims='x') + self.assertDataArrayIdentical(expected, data['level_date.hour']) + def test_time_season(self): ds = Dataset({'t': pd.date_range('2000-01-01', periods=12, freq='M')}) expected = ['DJF'] * 2 + ['MAM'] * 3 + ['JJA'] * 3 + ['SON'] * 3 + ['DJF'] @@ -1590,6 +1656,12 @@ def test_assign(self): expected = expected.set_coords('c') self.assertDatasetIdentical(actual, expected) + def test_assign_multiindex_level(self): + data = create_test_multiindex() + with self.assertRaisesRegexp(ValueError, 'conflicting MultiIndex'): + data.assign(level_1=range(4)) + data.assign_coords(level_1=range(4)) + def test_setitem_original_non_unique_index(self): # regression test for GH943 original = Dataset({'data': ('x', np.arange(5))}, @@ -1619,6 +1691,11 @@ def test_setitem_both_non_unique_index(self): actual['second'] = array self.assertDatasetIdentical(expected, actual) + def test_setitem_multiindex_level(self): + data = create_test_multiindex() + with self.assertRaisesRegexp(ValueError, 'already a MultiIndex level'): + data['level_1'] = range(4) + def test_delitem(self): data = create_test_data() all_items = set(data) From fdbf4aa40c9645d32c1839e9a780b1d06b15dce1 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Fri, 2 Sep 2016 20:44:53 +0200 Subject: [PATCH 22/26] fix appveyor tests --- xarray/test/test_dataset.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/xarray/test/test_dataset.py b/xarray/test/test_dataset.py index dcc7048baa3..df12074370a 100644 --- a/xarray/test/test_dataset.py +++ b/xarray/test/test_dataset.py @@ -48,7 +48,7 @@ def 
create_test_data(seed=None): def create_test_multiindex(): mindex = pd.MultiIndex.from_product([['a', 'b'], [1, 2]], names=('level_1', 'level_2')) - return Dataset(data_vars={'foo': ('x', range(4))}, coords={'x': mindex}) + return Dataset({}, {'x': mindex}) class InaccessibleVariableDataStore(backends.InMemoryDataStore): @@ -126,7 +126,7 @@ def test_repr_multiindex(self): - level_1 (x) object 'a' 'a' 'b' 'b' - level_2 (x) int64 1 2 1 2 Data variables: - foo (x) int64 0 1 2 3""") + *empty*""") actual = '\n'.join(x.rstrip() for x in repr(data).split('\n')) print(actual) self.assertEqual(expected, actual) @@ -135,7 +135,7 @@ def test_repr_multiindex(self): mindex = pd.MultiIndex.from_product( [['a', 'b'], [1, 2]], names=('a_quite_long_level_name', 'level_2')) - data = data.assign_coords(x=mindex) + data = Dataset({}, {'x': mindex}) expected = dedent("""\ Dimensions: (x: 4) @@ -144,7 +144,7 @@ def test_repr_multiindex(self): - a_quite_long_level_name (x) object 'a' 'a' 'b' 'b' - level_2 (x) int64 1 2 1 2 Data variables: - foo (x) int64 0 1 2 3""") + *empty*""") actual = '\n'.join(x.rstrip() for x in repr(data).split('\n')) print(actual) self.assertEqual(expected, actual) From d2370223a426fa5a4b623ed1635241b3e73d413f Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Fri, 2 Sep 2016 21:45:02 +0200 Subject: [PATCH 23/26] add tests for DataArray --- xarray/test/test_dataarray.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/xarray/test/test_dataarray.py b/xarray/test/test_dataarray.py index e495abb8599..a28e82add70 100644 --- a/xarray/test/test_dataarray.py +++ b/xarray/test/test_dataarray.py @@ -24,6 +24,10 @@ def setUp(self): self.ds = Dataset({'foo': self.v}) self.dv = self.ds['foo'] + self.mindex = pd.MultiIndex.from_product([['a', 'b'], [1, 2]], + names=('level_1', 'level_2')) + self.mda = DataArray([0, 1, 2, 3], coords={'x': self.mindex}, dims='x') + def test_repr(self): v = Variable(['time', 'x'], [[1, 2, 3], [4, 5, 6]], {'foo': 
'bar'}) data_array = DataArray(v, {'other': np.int64(0)}, name='my_variable') @@ -39,6 +43,16 @@ def test_repr(self): foo: bar""") self.assertEqual(expected, repr(data_array)) + def test_repr_multiindex(self): + expected = dedent("""\ + + array([0, 1, 2, 3]) + Coordinates: + * x (x) MultiIndex + - level_1 (x) object 'a' 'a' 'b' 'b' + - level_2 (x) int64 1 2 1 2""") + self.assertEqual(expected, repr(self.mda)) + def test_properties(self): self.assertVariableEqual(self.dv.variable, self.v) self.assertArrayEqual(self.dv.values, self.v.values) @@ -236,6 +250,11 @@ def test_constructor_invalid(self): with self.assertRaisesRegexp(ValueError, 'conflicting sizes for dim'): DataArray([1, 2], coords={'x': [0, 1], 'y': ('x', [1])}, dims='x') + with self.assertRaisesRegexp(ValueError, 'conflicting MultiIndex'): + DataArray(np.random.rand(4, 4), + [('x', self.mindex), ('y', self.mindex)]) + DataArray(np.random.rand(4, 4), + [('x', mindex), ('level_1', range(4))]) def test_constructor_from_self_described(self): data = [[-0.1, 21], [0, 2]] @@ -405,6 +424,11 @@ def test_getitem_coords(self): dims='x') self.assertDataArrayIdentical(expected, actual) + def test_attr_sources_multiindex(self): + # make sure attr-style access for multi-index levels + # returns DataArray objects + self.assertIsInstance(self.mda.level_1, DataArray) + def test_pickle(self): data = DataArray(np.random.random((3, 3)), dims=('id', 'time')) roundtripped = pickle.loads(pickle.dumps(data)) @@ -627,6 +651,10 @@ def test_coords(self): with self.assertRaisesRegexp(ValueError, 'cannot delete'): del da.coords['x'] + with self.assertRaisesRegexp(ValueError, 'cannot replace MultiIndex'): + self.mda['level_1'] = np.arange(4) + self.mda.coords['level_1'] = np.arange(4) + def test_coord_coords(self): orig = DataArray([10, 20], {'x': [1, 2], 'x2': ('x', ['a', 'b']), 'z': 4}, @@ -707,6 +735,9 @@ def test_assign_coords(self): expected.coords['d'] = ('x', [1.5, 1.5, 3.5, 3.5]) self.assertDataArrayIdentical(actual, expected) 
+ with self.assertRaisesRegexp(ValueError, 'conflicting MultiIndex'): + self.mda.assign_coords(level_1=range(4)) + def test_coords_alignment(self): lhs = DataArray([1, 2, 3], [('x', [0, 1, 2])]) rhs = DataArray([2, 3, 4], [('x', [1, 2, 3])]) From 949fb46cdfc790a363a79eb10191fb9fb41d6e05 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Sat, 3 Sep 2016 00:37:10 +0200 Subject: [PATCH 24/26] add docs --- doc/data-structures.rst | 39 +++++++++++++++++++++++++++++++++++---- doc/indexing.rst | 20 +++++++++++++++++--- doc/whats-new.rst | 7 +++++++ 3 files changed, 59 insertions(+), 7 deletions(-) diff --git a/doc/data-structures.rst b/doc/data-structures.rst index 97b716653f3..74617a43340 100644 --- a/doc/data-structures.rst +++ b/doc/data-structures.rst @@ -115,10 +115,6 @@ If you create a ``DataArray`` by supplying a pandas df xr.DataArray(df) -Xarray supports labeling coordinate values with a :py:class:`pandas.MultiIndex`. -While it handles multi-indexes with unnamed levels, it is recommended that you -explicitly set the names of the levels. - DataArray properties ~~~~~~~~~~~~~~~~~~~~ @@ -532,6 +528,41 @@ dimension and whose the values are ``Index`` objects: ds.indexes +MultiIndex coordinates +~~~~~~~~~~~~~~~~~~~~~~ + +Xarray supports labeling coordinate values with a :py:class:`pandas.MultiIndex`: + +.. ipython:: python + + midx = pd.MultiIndex.from_arrays([['R', 'R', 'V', 'V'], [.1, .2, .7, .9]], + names=('band', 'wn')) + mda = xr.DataArray(np.random.rand(4), coords={'spec': midx}, dims='spec') + mda + +For convenience multi-index levels are directly accessible as "virtual" or +"derived" coordinates (marked by ``-`` when printing a dataset or data array): + +.. ipython:: python + + mda['band'] + mda.wn + +Indexing with multi-index levels is also possible using the ``sel`` method +(see :ref:`multi-level indexing`). 
+ +Unlike other coordinates, "virtual" level coordinates are not stored in +the ``coords`` attribute of ``DataArray`` and ``Dataset`` objects +(although they are shown when printing the ``coords`` attribute). +Consequently, most of the coordinate-related methods don't apply to them. +The ``coords`` attribute also can't be used to replace one particular level. + +Because in a ``DataArray`` or ``Dataset`` object each multi-index level is +accessible as a "virtual" coordinate, its name must not conflict with the names +of the other levels, coordinates and data variables of the same object. +Even though Xarray sets default names for multi-indexes with unnamed levels, +it is recommended that you explicitly set the names of the levels. + .. [1] Latitude and longitude are 2D arrays because the dataset uses `projected coordinates`__. ``reference_time`` refers to the reference time at which the forecast was made, rather than ``time`` which is the valid time diff --git a/doc/indexing.rst b/doc/indexing.rst index d21adda2c8e..8e8783ef6f8 100644 --- a/doc/indexing.rst +++ b/doc/indexing.rst @@ -325,11 +325,25 @@ Additionally, xarray supports dictionaries: .. ipython:: python mda.sel(x={'one': 'a', 'two': 0}) - mda.loc[{'one': 'a'}, ...] + +For convenience, ``sel`` also accepts multi-index levels directly +as keyword arguments: + +.. ipython:: python + + mda.sel(one='a', two=0) + +Note that when using ``sel``, it is not possible to mix a dimension +indexer with level indexers for that dimension +(e.g., ``mda.sel(x={'one': 'a'}, two=0)`` will raise a ``ValueError``). Like pandas, xarray handles partial selection on multi-index (level drop). -As shown in the last example above, it also renames the dimension / coordinate -when the multi-index is reduced to a single index. +As shown below, it also renames the dimension / coordinate when the +multi-index is reduced to a single index. + +.. ipython:: python + + mda.loc[{'one': 'a'}, ...]
Unlike pandas, xarray does not guess whether you provide index levels or dimensions when using ``loc`` in some ambiguous cases. For example, for diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 0f2d0535dc0..9795926240c 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -40,6 +40,13 @@ Deprecations Enhancements ~~~~~~~~~~~~ +- Multi-index levels are now accessible as "virtual" coordinate variables, + e.g., ``ds['time']`` can pull out the ``'time'`` level of a multi-index + (see :ref:`coordinates`). ``sel`` also accepts providing multi-index levels + as keyword arguments, e.g., ``ds.sel(time='2000-01')`` + (see :ref:`multi-level indexing`). + By `Benoit Bovy `_. + Bug fixes ~~~~~~~~~ From bdaad9b895a53c3844536a666d8590a8bfc64f29 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Sat, 3 Sep 2016 12:05:28 +0200 Subject: [PATCH 25/26] review changes --- xarray/core/coordinates.py | 14 -------------- xarray/core/dataarray.py | 2 +- xarray/core/dataset.py | 7 +------ xarray/core/formatting.py | 5 +++-- xarray/core/variable.py | 5 ----- xarray/test/test_dataarray.py | 2 +- xarray/test/test_dataset.py | 4 ++-- xarray/test/test_variable.py | 3 --- 8 files changed, 8 insertions(+), 34 deletions(-) diff --git a/xarray/core/coordinates.py b/xarray/core/coordinates.py index 1dda1e9ffc0..30240c16b27 100644 --- a/xarray/core/coordinates.py +++ b/xarray/core/coordinates.py @@ -166,13 +166,6 @@ def _update_coords(self, coords): self._data._coord_names.update(updated_coord_names) self._data._dims = dict(dims) - def __setitem__(self, key, value): - if key in self._data._level_coords: - raise ValueError("cannot replace MultiIndex level %r, replace %r " - "coordinate instead" - % (key, self._data._level_coords[key])) - return super(DatasetCoordinates, self).__setitem__(key, value) - def __delitem__(self, key): if key in self: del self._data[key] @@ -215,13 +208,6 @@ def _to_dataset(self, shallow_copy=True): def to_dataset(self): return self._to_dataset() - def 
__setitem__(self, key, value): - if key in self._data._level_coords: - raise ValueError("cannot replace MultiIndex level %r, replace %r " - "coordinate instead" - % (key, self._data._level_coords[key])) - return super(DataArrayCoordinates, self).__setitem__(key, value) - def __delitem__(self, key): if key in self.dims: raise ValueError('cannot delete a coordinate corresponding to a ' diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index e8758c7b62e..7fb9b928823 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -431,7 +431,7 @@ def _level_coords(self): if var.ndim == 1: level_names = var.to_index_variable().level_names if level_names is not None: - dim = var.dims[0] + dim, = var.dims level_coords.update({lname: dim for lname in level_names}) return level_coords diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 71867f0537b..c8f93e9263a 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -449,7 +449,7 @@ def _level_coords(self): if var.ndim == 1: level_names = var.to_index_variable().level_names if level_names is not None: - dim = var.dims[0] + dim, = var.dims level_coords.update({lname: dim for lname in level_names}) return level_coords @@ -552,11 +552,6 @@ def __setitem__(self, key, value): raise NotImplementedError('cannot yet use a dictionary as a key ' 'to set Dataset values') - if key in self._level_coords: - raise ValueError("%r is already a MultiIndex level of " - "coordinate %r" - % (key, self._level_coords[key])) - self.update({key: value}) def __delitem__(self, key): diff --git a/xarray/core/formatting.py b/xarray/core/formatting.py index 1f496b4d193..e6a33989935 100644 --- a/xarray/core/formatting.py +++ b/xarray/core/formatting.py @@ -13,7 +13,6 @@ from .options import OPTIONS from .pycompat import PY2, iteritems, unicode_type, bytes_type, dask_array_type -from .indexing import PandasIndexAdapter def pretty_print(x, numchars): @@ -259,11 +258,13 @@ def _get_col_items(mapping): """Get 
all column items to format, including both keys of `mapping` and MultiIndex levels if any. """ + from .variable import IndexVariable + col_items = [] for k, v in mapping.items(): col_items.append(k) var = getattr(v, 'variable', v) - if isinstance(var._data, PandasIndexAdapter): + if isinstance(var, IndexVariable): level_names = var.to_index_variable().level_names if level_names is not None: col_items += list(level_names) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 098373eb4f8..43741d5a211 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1193,11 +1193,6 @@ def level_names(self): else: return None - @level_names.setter - def level_names(self, value): - raise AttributeError('cannot modify level names of ' - 'IndexVariable in-place') - def get_level_variable(self, level): """Return a new IndexVariable from a given MultiIndex level.""" if self.level_names is None: diff --git a/xarray/test/test_dataarray.py b/xarray/test/test_dataarray.py index a28e82add70..f613259f128 100644 --- a/xarray/test/test_dataarray.py +++ b/xarray/test/test_dataarray.py @@ -651,7 +651,7 @@ def test_coords(self): with self.assertRaisesRegexp(ValueError, 'cannot delete'): del da.coords['x'] - with self.assertRaisesRegexp(ValueError, 'cannot replace MultiIndex'): + with self.assertRaisesRegexp(ValueError, 'conflicting MultiIndex'): self.mda['level_1'] = np.arange(4) self.mda.coords['level_1'] = np.arange(4) diff --git a/xarray/test/test_dataset.py b/xarray/test/test_dataset.py index df12074370a..a1da10b4ca5 100644 --- a/xarray/test/test_dataset.py +++ b/xarray/test/test_dataset.py @@ -513,7 +513,7 @@ def test_coords_setitem_with_new_dimension(self): def test_coords_setitem_multiindex(self): data = create_test_multiindex() - with self.assertRaisesRegexp(ValueError, 'cannot replace MultiIndex'): + with self.assertRaisesRegexp(ValueError, 'conflicting MultiIndex'): data.coords['level_1'] = range(4) def test_coords_set(self): @@ -1693,7 +1693,7 @@ def 
test_setitem_both_non_unique_index(self): def test_setitem_multiindex_level(self): data = create_test_multiindex() - with self.assertRaisesRegexp(ValueError, 'already a MultiIndex level'): + with self.assertRaisesRegexp(ValueError, 'conflicting MultiIndex'): data['level_1'] = range(4) def test_delitem(self): diff --git a/xarray/test/test_variable.py b/xarray/test/test_variable.py index dc6ce78b1ee..735e66a6aeb 100644 --- a/xarray/test/test_variable.py +++ b/xarray/test/test_variable.py @@ -1047,9 +1047,6 @@ def test_level_names(self): x = IndexVariable('x', midx) self.assertEqual(x.level_names, midx.names) - with self.assertRaisesRegexp(AttributeError, 'cannot modify'): - x.level_names = ['one', 'two'] - self.assertIsNone(IndexVariable('y', [10.0]).level_names) def test_get_level_variable(self): From a447767e8d611d945dc864910a427ef7e3f4db11 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Tue, 13 Sep 2016 14:46:00 +0200 Subject: [PATCH 26/26] remove name argument of IndexVariable --- xarray/core/variable.py | 17 +++++------------ xarray/test/test_variable.py | 6 +----- 2 files changed, 6 insertions(+), 17 deletions(-) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 43741d5a211..755cd0b9aee 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1076,20 +1076,13 @@ class IndexVariable(Variable): unless another name is given. 
""" - def __init__(self, dims, data, attrs=None, encoding=None, - name=None, fastpath=False): - + def __init__(self, dims, data, attrs=None, encoding=None, fastpath=False): super(IndexVariable, self).__init__(dims, data, attrs, encoding, fastpath) if self.ndim != 1: raise ValueError('%s objects must be 1-dimensional' % type(self).__name__) - if isinstance(name, basestring): - self._name = name - else: - self._name = self.dims[0] - def _data_cached(self): if not isinstance(self._data, PandasIndexAdapter): self._data = PandasIndexAdapter(self._data) @@ -1102,7 +1095,7 @@ def __getitem__(self, key): return Variable((), values, self._attrs, self._encoding) else: return type(self)(self.dims, values, self._attrs, - self._encoding, name=self._name, fastpath=True) + self._encoding, fastpath=True) def __setitem__(self, key, value): raise TypeError('%s values cannot be modified' % type(self).__name__) @@ -1155,7 +1148,7 @@ def copy(self, deep=True): # since pandas.Index objects are immutable data = PandasIndexAdapter(self) if deep else self._data return type(self)(self.dims, data, self._attrs, - self._encoding, name=self._name, fastpath=True) + self._encoding, fastpath=True) def _data_equals(self, other): return self.to_index().equals(other.to_index()) @@ -1198,11 +1191,11 @@ def get_level_variable(self, level): if self.level_names is None: raise ValueError("IndexVariable %r has no MultiIndex" % self.name) index = self.to_index() - return type(self)(self.dims, index.get_level_values(level), name=level) + return type(self)(self.dims, index.get_level_values(level)) @property def name(self): - return self._name + return self.dims[0] @name.setter def name(self, value): diff --git a/xarray/test/test_variable.py b/xarray/test/test_variable.py index 735e66a6aeb..d6a61975659 100644 --- a/xarray/test/test_variable.py +++ b/xarray/test/test_variable.py @@ -1035,9 +1035,6 @@ def test_name(self): coord = IndexVariable('x', [10.0]) self.assertEqual(coord.name, 'x') - coord = 
IndexVariable('x', [10.0], name='y') - self.assertEqual(coord.name, 'y') - with self.assertRaises(AttributeError): coord.name = 'y' @@ -1053,8 +1050,7 @@ def test_get_level_variable(self): midx = pd.MultiIndex.from_product([['a', 'b'], [1, 2]], names=['level_1', 'level_2']) x = IndexVariable('x', midx) - level_1 = IndexVariable('x', midx.get_level_values('level_1'), - name='level_1') + level_1 = IndexVariable('x', midx.get_level_values('level_1')) self.assertVariableIdentical(x.get_level_variable('level_1'), level_1) with self.assertRaisesRegexp(ValueError, 'has no MultiIndex'):