From 33ee76f74228b66dd23a67d45d7b5be93d4340d3 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Thu, 24 Mar 2016 14:13:30 +0100 Subject: [PATCH 01/24] Use dict-like for indexing on dims with multiindex --- xarray/core/dataarray.py | 24 +++++++++++++++++++---- xarray/core/dataset.py | 41 +++++++++++++++++++++++++++++++++------- xarray/core/indexing.py | 34 ++++++++++++++++++++++++++------- 3 files changed, 81 insertions(+), 18 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index fc8a73335cb..b7bbbd7d219 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -88,7 +88,9 @@ def __init__(self, data_array): def _remap_key(self, key): def lookup_positions(dim, labels): index = self.data_array.indexes[dim] - return indexing.convert_label_indexer(index, labels) + indexer, new_idx = indexing.convert_label_indexer(index, labels) + # don't replace indexes in this case. + return indexer if utils.is_dict_like(key): return dict((dim, lookup_positions(dim, labels)) @@ -244,6 +246,14 @@ def _replace_maybe_drop_dims(self, variable, name=__default): if set(v.dims) <= allowed_dims) return self._replace(variable, coords, name) + def _replace_indexes(self, indexes): + obj = self + for dim, idx in iteritems(indexes): + obj = obj.rename({dim: idx.name}) + new_coord = Coordinate(idx.name, idx) + obj = obj._replace(coords={idx.name: new_coord}) + return obj + __this_array = _ThisArray() def _to_temp_dataset(self): @@ -590,7 +600,7 @@ def isel(self, **indexers): ds = self._to_temp_dataset().isel(**indexers) return self._from_temp_dataset(ds) - def sel(self, method=None, tolerance=None, **indexers): + def sel(self, method=None, tolerance=None, drop_levels=True, **indexers): """Return a new DataArray whose dataset is given by selecting index labels along the specified dimension(s). 
@@ -599,8 +609,14 @@ def sel(self, method=None, tolerance=None, **indexers): Dataset.sel DataArray.isel """ - return self.isel(**indexing.remap_label_indexers( - self, indexers, method=method, tolerance=tolerance)) + pos_indexers, new_indexes = indexing.remap_label_indexers( + self, indexers, method=method, tolerance=tolerance + ) + obj = self.isel(**pos_indexers) + if drop_levels: + return obj._replace_indexes(new_indexes) + else: + return obj def isel_points(self, dim='points', **indexers): """Return a new DataArray whose dataset is given by pointwise integer diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 660a349e06d..85184110333 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -419,6 +419,20 @@ def _replace_vars_and_dims(self, variables, coord_names=None, obj = self._construct_direct(variables, coord_names, dims, attrs) return obj + def _replace_indexes(self, indexes): + obj = self + for dim, idx in iteritems(indexes): + obj = obj.rename({dim: idx.name}) + new_coord = Coordinate(idx.name, idx) + variables = OrderedDict() + for k, v in iteritems(obj._variables): + if k == idx.name: + variables[k] = new_coord + else: + variables[k] = v + obj = obj._replace_vars_and_dims(variables) + return obj + def copy(self, deep=False): """Returns a copy of this dataset. @@ -921,7 +935,7 @@ def isel(self, **indexers): variables[name] = var.isel(**var_indexers) return self._replace_vars_and_dims(variables) - def sel(self, method=None, tolerance=None, **indexers): + def sel(self, method=None, tolerance=None, drop_levels=True, **indexers): """Returns a new dataset with each array indexed by tick labels along the specified dimension(s). @@ -952,9 +966,15 @@ def sel(self, method=None, tolerance=None, **indexers): matches. The values of the index at the matching locations most satisfy the equation ``abs(index[indexer] - target) <= tolerance``. Requires pandas>=0.17. 
+ drop_levels : bool + If True (default), rename dimension and replace coordinate + for multi-index reduced into a single index (only if a dict-like + object is provided as indexer). **indexers : {dim: indexer, ...} Keyword arguments with names matching dimensions and values given - by scalars, slices or arrays of tick labels. + by scalars, slices or arrays of tick labels. For dimensions with + multi-index, the indexer may also be a dict-like object with keys + matching index level names. Returns ------- @@ -972,8 +992,14 @@ def sel(self, method=None, tolerance=None, **indexers): Dataset.isel_points DataArray.sel """ - return self.isel(**indexing.remap_label_indexers( - self, indexers, method=method, tolerance=tolerance)) + pos_indexers, new_indexes = indexing.remap_label_indexers( + self, indexers, method=method, tolerance=tolerance + ) + obj = self.isel(**pos_indexers) + if drop_levels: + return obj._replace_indexes(new_indexes) + else: + return obj def isel_points(self, dim='points', **indexers): """Returns a new dataset with each array indexed pointwise along the @@ -1114,9 +1140,10 @@ def sel_points(self, dim='points', method=None, tolerance=None, Dataset.isel_points DataArray.sel_points """ - pos_indexers = indexing.remap_label_indexers( - self, indexers, method=method, tolerance=tolerance) - return self.isel_points(dim=dim, **pos_indexers) + obj, pos_indexers = indexing.remap_label_indexers( + self, indexers, method=method, tolerance=tolerance + ) + return obj.isel_points(dim=dim, **pos_indexers) def reindex_like(self, other, method=None, tolerance=None, copy=True): """Conform this object onto the indexes of another object, filling diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 6ca00e67cab..5c575cd0bb0 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -4,7 +4,7 @@ from . 
import utils from .pycompat import iteritems, range, dask_array_type, suppress -from .utils import is_full_slice +from .utils import is_full_slice, is_dict_like def expanded_indexer(key, ndim): @@ -139,7 +139,8 @@ def convert_label_indexer(index, label, index_name='', method=None, tolerance=None): """Given a pandas.Index and labels (e.g., from __getitem__) for one dimension, return an indexer suitable for indexing an ndarray along that - dimension + dimension. If label is a dict-like object and a pandas.MultiIndex is given, + also return a new pandas.Index (otherwise return None). """ # backwards compatibility for pandas<0.16 (method) or pandas<0.17 # (tolerance) @@ -152,6 +153,8 @@ def convert_label_indexer(index, label, index_name='', method=None, 'the tolerance argument requires pandas v0.17 or newer') kwargs['tolerance'] = tolerance + new_index = None + if isinstance(label, slice): if method is not None or tolerance is not None: raise NotImplementedError( @@ -166,6 +169,14 @@ def convert_label_indexer(index, label, index_name='', method=None, raise KeyError('cannot represent labeled-based slice indexer for ' 'dimension %r with a slice over integer positions; ' 'the index is unsorted or non-unique') + + elif is_dict_like(label): + if not isinstance(index, pd.MultiIndex): + raise ValueError('cannot use a dict-like object for selection on a' + 'dimension that does not have a MultiIndex') + indexer, new_index = index.get_loc_level(list(label.values()), + level=list(label.keys())) + else: label = _asarray_tuplesafe(label) if label.ndim == 0: @@ -177,18 +188,27 @@ def convert_label_indexer(index, label, index_name='', method=None, if np.any(indexer < 0): raise KeyError('not all values found in index %r' % index_name) - return indexer + return indexer, new_index def remap_label_indexers(data_obj, indexers, method=None, tolerance=None): """Given an xarray data object and label based indexers, return a mapping - of equivalent location based indexers. 
+ of equivalent location based indexers. Also return a mapping of pandas' + single index objects returned from multi-index objects. """ if method is not None and not isinstance(method, str): raise TypeError('``method`` must be a string') - return dict((dim, convert_label_indexer(data_obj[dim].to_index(), label, - dim, method, tolerance)) - for dim, label in iteritems(indexers)) + + pos_indexers = dict() + new_indexes = dict() + for dim, label in iteritems(indexers): + idxr, new_idx = convert_label_indexer(data_obj[dim].to_index(), label, + dim, method, tolerance) + pos_indexers[dim] = idxr + if new_idx is not None and not isinstance(new_idx, pd.MultiIndex): + new_indexes[dim] = new_idx + + return pos_indexers, new_indexes def slice_slice(old_slice, applied_slice, size): From 6695e854b9d5d0379249218300969c2cad71bdc1 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Thu, 24 Mar 2016 14:58:00 +0100 Subject: [PATCH 02/24] renamed drop_levels to drop_level --- xarray/core/dataarray.py | 4 ++-- xarray/core/dataset.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index b7bbbd7d219..9349efc3f70 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -600,7 +600,7 @@ def isel(self, **indexers): ds = self._to_temp_dataset().isel(**indexers) return self._from_temp_dataset(ds) - def sel(self, method=None, tolerance=None, drop_levels=True, **indexers): + def sel(self, method=None, tolerance=None, drop_level=True, **indexers): """Return a new DataArray whose dataset is given by selecting index labels along the specified dimension(s). 
@@ -613,7 +613,7 @@ def sel(self, method=None, tolerance=None, drop_levels=True, **indexers): self, indexers, method=method, tolerance=tolerance ) obj = self.isel(**pos_indexers) - if drop_levels: + if drop_level: return obj._replace_indexes(new_indexes) else: return obj diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 85184110333..3945f78f502 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -935,7 +935,7 @@ def isel(self, **indexers): variables[name] = var.isel(**var_indexers) return self._replace_vars_and_dims(variables) - def sel(self, method=None, tolerance=None, drop_levels=True, **indexers): + def sel(self, method=None, tolerance=None, drop_level=True, **indexers): """Returns a new dataset with each array indexed by tick labels along the specified dimension(s). @@ -966,7 +966,7 @@ def sel(self, method=None, tolerance=None, drop_levels=True, **indexers): matches. The values of the index at the matching locations most satisfy the equation ``abs(index[indexer] - target) <= tolerance``. Requires pandas>=0.17. - drop_levels : bool + drop_level : bool If True (default), rename dimension and replace coordinate for multi-index reduced into a single index (only if a dict-like object is provided as indexer). 
@@ -996,7 +996,7 @@ def sel(self, method=None, tolerance=None, drop_levels=True, **indexers): self, indexers, method=method, tolerance=tolerance ) obj = self.isel(**pos_indexers) - if drop_levels: + if drop_level: return obj._replace_indexes(new_indexes) else: return obj From 647967a0de28d4bc33600a71d349ae2967de347f Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Fri, 25 Mar 2016 00:25:27 +0100 Subject: [PATCH 03/24] fixed existing test failures --- xarray/core/dataset.py | 4 ++-- xarray/test/test_indexing.py | 8 +++++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 3945f78f502..0a3295ea444 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -1140,10 +1140,10 @@ def sel_points(self, dim='points', method=None, tolerance=None, Dataset.isel_points DataArray.sel_points """ - obj, pos_indexers = indexing.remap_label_indexers( + pos_indexers, new_indexes = indexing.remap_label_indexers( self, indexers, method=method, tolerance=tolerance ) - return obj.isel_points(dim=dim, **pos_indexers) + return self.isel_points(dim=dim, **pos_indexers) def reindex_like(self, other, method=None, tolerance=None, copy=True): """Conform this object onto the indexes of another object, filling diff --git a/xarray/test/test_indexing.py b/xarray/test/test_indexing.py index 805d245bacf..3d0d140d9c5 100644 --- a/xarray/test/test_indexing.py +++ b/xarray/test/test_indexing.py @@ -85,6 +85,8 @@ def test_convert_label_indexer(self): indexing.convert_label_indexer(index, [0]) with self.assertRaises(KeyError): indexing.convert_label_indexer(index, 0) + with self.assertRaises(ValueError): + indexing.convert_label_indexer(index, {'somelevel': 0}) def test_convert_unsorted_datetime_index_raises(self): index = pd.to_datetime(['2001', '2000', '2002']) @@ -100,9 +102,9 @@ def test_remap_label_indexers(self): def test_indexer(x): return indexing.remap_label_indexers(data, {'x': x}) - self.assertEqual({'x': 0}, 
test_indexer(1)) - self.assertEqual({'x': 0}, test_indexer(np.int32(1))) - self.assertEqual({'x': 0}, test_indexer(Variable([], 1))) + self.assertEqual({'x': 0}, test_indexer(1)[0]) + self.assertEqual({'x': 0}, test_indexer(np.int32(1))[0]) + self.assertEqual({'x': 0}, test_indexer(Variable([], 1))[0]) class TestLazyArray(TestCase): From de03fc9cf1a790a887c46d342a5b27b6aebf027b Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Fri, 25 Mar 2016 13:29:40 +0100 Subject: [PATCH 04/24] removed drop_level option --- xarray/core/dataarray.py | 8 ++------ xarray/core/dataset.py | 12 ++---------- 2 files changed, 4 insertions(+), 16 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 9349efc3f70..0d0effd9556 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -600,7 +600,7 @@ def isel(self, **indexers): ds = self._to_temp_dataset().isel(**indexers) return self._from_temp_dataset(ds) - def sel(self, method=None, tolerance=None, drop_level=True, **indexers): + def sel(self, method=None, tolerance=None, **indexers): """Return a new DataArray whose dataset is given by selecting index labels along the specified dimension(s). 
@@ -612,11 +612,7 @@ def sel(self, method=None, tolerance=None, drop_level=True, **indexers): pos_indexers, new_indexes = indexing.remap_label_indexers( self, indexers, method=method, tolerance=tolerance ) - obj = self.isel(**pos_indexers) - if drop_level: - return obj._replace_indexes(new_indexes) - else: - return obj + return self.isel(**pos_indexers)._replace_indexes(new_indexes) def isel_points(self, dim='points', **indexers): """Return a new DataArray whose dataset is given by pointwise integer diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 0a3295ea444..34cdbd7fc65 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -935,7 +935,7 @@ def isel(self, **indexers): variables[name] = var.isel(**var_indexers) return self._replace_vars_and_dims(variables) - def sel(self, method=None, tolerance=None, drop_level=True, **indexers): + def sel(self, method=None, tolerance=None, **indexers): """Returns a new dataset with each array indexed by tick labels along the specified dimension(s). @@ -966,10 +966,6 @@ def sel(self, method=None, tolerance=None, drop_level=True, **indexers): matches. The values of the index at the matching locations most satisfy the equation ``abs(index[indexer] - target) <= tolerance``. Requires pandas>=0.17. - drop_level : bool - If True (default), rename dimension and replace coordinate - for multi-index reduced into a single index (only if a dict-like - object is provided as indexer). **indexers : {dim: indexer, ...} Keyword arguments with names matching dimensions and values given by scalars, slices or arrays of tick labels. 
For dimensions with @@ -995,11 +991,7 @@ def sel(self, method=None, tolerance=None, drop_level=True, **indexers): pos_indexers, new_indexes = indexing.remap_label_indexers( self, indexers, method=method, tolerance=tolerance ) - obj = self.isel(**pos_indexers) - if drop_level: - return obj._replace_indexes(new_indexes) - else: - return obj + return self.isel(**pos_indexers)._replace_indexes(new_indexes) def isel_points(self, dim='points', **indexers): """Returns a new dataset with each array indexed pointwise along the From f88cd42f3e02498fdd19d3404eef59fd01c83e12 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Fri, 25 Mar 2016 14:25:24 +0100 Subject: [PATCH 05/24] multi-index level drop for DataArray.loc --- xarray/core/dataarray.py | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 0d0effd9556..d50ce80fe52 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -89,23 +89,33 @@ def _remap_key(self, key): def lookup_positions(dim, labels): index = self.data_array.indexes[dim] indexer, new_idx = indexing.convert_label_indexer(index, labels) - # don't replace indexes in this case. 
- return indexer + return indexer, new_idx if utils.is_dict_like(key): - return dict((dim, lookup_positions(dim, labels)) - for dim, labels in iteritems(key)) + pos_indexers, new_indexes = {}, {} + for dim, labels in iteritems(key): + pos_indexers[dim], idx = lookup_positions(dim, labels) + if idx is not None and not isinstance(idx, pd.MultiIndex): + new_indexes[dim] = idx + return pos_indexers, new_indexes + else: # expand the indexer so we can handle Ellipsis + # doesn't support dict-like key for multi-index key = indexing.expanded_indexer(key, self.data_array.ndim) - return tuple(lookup_positions(dim, labels) for dim, labels - in zip(self.data_array.dims, key)) + pos_indexers = tuple( + lookup_positions(dim, labels)[0] for dim, labels + in zip(self.data_array.dims, key) + ) + return pos_indexers, {} def __getitem__(self, key): - return self.data_array[self._remap_key(key)] + pos_indexers, new_indexes = self._remap_key(key) + return self.data_array[pos_indexers]._replace_indexes(new_indexes) def __setitem__(self, key, value): - self.data_array[self._remap_key(key)] = value + pos_indexers, new_indexes = self._remap_key(key) + self.data_array[pos_indexers] = value class _ThisArray(object): From 8b17a7976d0ae94b07f99bd18b7e3847ae5e67ee Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Fri, 25 Mar 2016 17:26:48 +0100 Subject: [PATCH 06/24] mindex level drop for DataArray.loc with Ellipsis --- xarray/core/dataarray.py | 26 +++++--------------------- xarray/core/indexing.py | 3 +-- 2 files changed, 6 insertions(+), 23 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index d50ce80fe52..9a8ef4d6df4 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -86,28 +86,12 @@ def __init__(self, data_array): self.data_array = data_array def _remap_key(self, key): - def lookup_positions(dim, labels): - index = self.data_array.indexes[dim] - indexer, new_idx = indexing.convert_label_indexer(index, labels) - return indexer, new_idx - - 
if utils.is_dict_like(key): - pos_indexers, new_indexes = {}, {} - for dim, labels in iteritems(key): - pos_indexers[dim], idx = lookup_positions(dim, labels) - if idx is not None and not isinstance(idx, pd.MultiIndex): - new_indexes[dim] = idx - return pos_indexers, new_indexes - - else: + if not utils.is_dict_like(key): # expand the indexer so we can handle Ellipsis - # doesn't support dict-like key for multi-index - key = indexing.expanded_indexer(key, self.data_array.ndim) - pos_indexers = tuple( - lookup_positions(dim, labels)[0] for dim, labels - in zip(self.data_array.dims, key) - ) - return pos_indexers, {} + labels = indexing.expanded_indexer(key, self.data_array.ndim) + key = dict(zip(self.data_array.dims, labels)) + + return indexing.remap_label_indexers(self.data_array, key) def __getitem__(self, key): pos_indexers, new_indexes = self._remap_key(key) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 5c575cd0bb0..b6c4eb7b412 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -199,8 +199,7 @@ def remap_label_indexers(data_obj, indexers, method=None, tolerance=None): if method is not None and not isinstance(method, str): raise TypeError('``method`` must be a string') - pos_indexers = dict() - new_indexes = dict() + pos_indexers, new_indexes = {}, {} for dim, label in iteritems(indexers): idxr, new_idx = convert_label_indexer(data_obj[dim].to_index(), label, dim, method, tolerance) From 881da7f7f58908e1b7c4d5d8188535d7ca3aa9fe Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Fri, 27 May 2016 01:05:11 +0200 Subject: [PATCH 07/24] fix unnamed indexes --- xarray/core/dataarray.py | 2 ++ xarray/core/dataset.py | 2 ++ xarray/core/indexing.py | 6 +++--- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 9a8ef4d6df4..9d4ad70f805 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -243,6 +243,8 @@ def _replace_maybe_drop_dims(self, 
variable, name=__default): def _replace_indexes(self, indexes): obj = self for dim, idx in iteritems(indexes): + if idx.name is None: + idx.name = dim + "_unnamed_level" obj = obj.rename({dim: idx.name}) new_coord = Coordinate(idx.name, idx) obj = obj._replace(coords={idx.name: new_coord}) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 34cdbd7fc65..80692ebcbc7 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -422,6 +422,8 @@ def _replace_vars_and_dims(self, variables, coord_names=None, def _replace_indexes(self, indexes): obj = self for dim, idx in iteritems(indexes): + if idx.name is None: + idx.name = dim + "_unnamed_level" obj = obj.rename({dim: idx.name}) new_coord = Coordinate(idx.name, idx) variables = OrderedDict() diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index b6c4eb7b412..dd62235074b 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -140,7 +140,7 @@ def convert_label_indexer(index, label, index_name='', method=None, """Given a pandas.Index and labels (e.g., from __getitem__) for one dimension, return an indexer suitable for indexing an ndarray along that dimension. If label is a dict-like object and a pandas.MultiIndex is given, - also return a new pandas.Index (otherwise return None). + also return a new pandas.Index, otherwise return None. 
""" # backwards compatibility for pandas<0.16 (method) or pandas<0.17 # (tolerance) @@ -174,8 +174,8 @@ def convert_label_indexer(index, label, index_name='', method=None, if not isinstance(index, pd.MultiIndex): raise ValueError('cannot use a dict-like object for selection on a' 'dimension that does not have a MultiIndex') - indexer, new_index = index.get_loc_level(list(label.values()), - level=list(label.keys())) + indexer, new_index = index.get_loc_level(tuple(label.values()), + level=tuple(label.keys())) else: label = _asarray_tuplesafe(label) From b78679369052cd602501db3477aba4b1234a5a47 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Fri, 27 May 2016 17:15:59 +0200 Subject: [PATCH 08/24] added tests for indexing and selection --- xarray/test/test_dataarray.py | 31 +++++++++++++++++++++++-------- xarray/test/test_dataset.py | 25 +++++++++++++++++++++++++ xarray/test/test_indexing.py | 33 +++++++++++++++++++++++++++++++-- 3 files changed, 79 insertions(+), 10 deletions(-) diff --git a/xarray/test/test_dataarray.py b/xarray/test/test_dataarray.py index 20f2d01bdec..c08af203ae4 100644 --- a/xarray/test/test_dataarray.py +++ b/xarray/test/test_dataarray.py @@ -486,7 +486,8 @@ def test_loc_single_boolean(self): self.assertEqual(data.loc[False], 1) def test_multiindex(self): - idx = pd.MultiIndex.from_product([list('abc'), [0, 1]]) + idx = pd.MultiIndex.from_product([list('abc'), [0, 1]], + names=('one', 'two')) data = DataArray(range(6), [('x', idx)]) self.assertDataArrayIdentical(data.sel(x=('a', 0)), data.isel(x=0)) @@ -495,6 +496,20 @@ def test_multiindex(self): self.assertDataArrayIdentical(data.sel(x=[('a', 0), ('c', 1)]), data.isel(x=[0, -1])) self.assertDataArrayIdentical(data.sel(x='a'), data.isel(x=slice(2))) + self.assertVariableNotEqual(data.sel(x={'one': slice(None)}), data) + self.assertDataArrayIdentical(data.isel(x=[0]), + data.sel(x={'one': 'a', 'two': 0})) + self.assertDataArrayIdentical(data.isel(x=[0, 1]), data.sel(x='a')) + 
self.assertVariableIdentical( + data.sel(x={'one': 'a'}), + data.unstack('x').sel(one='a').dropna('two') + ) + + self.assertDataArrayIdentical(data.loc['a'], data[:2]) + self.assertDataArrayIdentical(data.loc[{'one': 'a', 'two': 0}, ...], + data[[0]]) + self.assertDataArrayIdentical(data.loc[{'one': 'a'}, ...], + data.sel(x={'one': 'a'})) def test_time_components(self): dates = pd.date_range('2000-01-01', periods=10) @@ -1818,29 +1833,29 @@ def test_full_like(self): actual = _full_like(DataArray([1, 2, 3]), fill_value=np.nan) self.assertEqual(actual.dtype, np.float) np.testing.assert_equal(actual.values, np.nan) - + def test_dot(self): x = np.linspace(-3, 3, 6) y = np.linspace(-3, 3, 5) - z = range(4) + z = range(4) da_vals = np.arange(6 * 5 * 4).reshape((6, 5, 4)) da = DataArray(da_vals, coords=[x, y, z], dims=['x', 'y', 'z']) - + dm_vals = range(4) dm = DataArray(dm_vals, coords=[z], dims=['z']) - + # nd dot 1d actual = da.dot(dm) expected_vals = np.tensordot(da_vals, dm_vals, [2, 0]) expected = DataArray(expected_vals, coords=[x, y], dims=['x', 'y']) self.assertDataArrayEqual(expected, actual) - + # all shared dims actual = da.dot(da) expected_vals = np.tensordot(da_vals, da_vals, axes=([0, 1, 2], [0, 1, 2])) expected = DataArray(expected_vals) self.assertDataArrayEqual(expected, actual) - + # multiple shared dims dm_vals = np.arange(20 * 5 * 4).reshape((20, 5, 4)) j = np.linspace(-3, 3, 20) @@ -1849,7 +1864,7 @@ def test_dot(self): expected_vals = np.tensordot(da_vals, dm_vals, axes=([1, 2], [1, 2])) expected = DataArray(expected_vals, coords=[x, j], dims=['x', 'j']) self.assertDataArrayEqual(expected, actual) - + with self.assertRaises(NotImplementedError): da.dot(dm.to_dataset(name='dm')) with self.assertRaises(TypeError): diff --git a/xarray/test/test_dataset.py b/xarray/test/test_dataset.py index 69e2a582b4c..f18b8885153 100644 --- a/xarray/test/test_dataset.py +++ b/xarray/test/test_dataset.py @@ -840,6 +840,31 @@ def test_loc(self): with 
self.assertRaises(TypeError): data.loc[dict(dim3='a')] = 0 + def test_multiindex(self): + idx = pd.MultiIndex.from_product([list('abc'), [0, 1]], + names=('one', 'two')) + data = Dataset(data_vars={'var': ('x', range(6))}, coords={'x': idx}) + + self.assertDatasetIdentical(data.sel(x=('a', 0)), data.isel(x=0)) + self.assertDatasetIdentical(data.sel(x=('c', 1)), data.isel(x=-1)) + self.assertDatasetIdentical(data.sel(x=[('a', 0)]), data.isel(x=[0])) + self.assertDatasetIdentical(data.sel(x=[('a', 0), ('c', 1)]), + data.isel(x=[0, -1])) + self.assertDatasetIdentical(data.sel(x='a'), data.isel(x=slice(2))) + self.assertVariableNotEqual(data.sel(x={'one': slice(None)})['var'], + data['var']) + self.assertDatasetIdentical(data.isel(x=[0]), + data.sel(x={'one': 'a', 'two': 0})) + self.assertDatasetIdentical(data.isel(x=[0, 1]), data.sel(x='a')) + self.assertVariableIdentical( + data.sel(x={'one': 'a'})['var'], + data.unstack('x').sel(one='a').dropna('two')['var'] + ) + + self.assertDatasetIdentical(data.loc[{'x': 'a'}], data.sel(x='a')) + self.assertDatasetIdentical(data.loc[{'x': {'one': 'a', 'two': 0}}], + data.sel(x={'one': 'a', 'two': 0})) + def test_reindex_like(self): data = create_test_data() data['letters'] = ('dim3', 10 * ['a']) diff --git a/xarray/test/test_indexing.py b/xarray/test/test_indexing.py index 3d0d140d9c5..be473eb84a2 100644 --- a/xarray/test/test_indexing.py +++ b/xarray/test/test_indexing.py @@ -1,7 +1,7 @@ import numpy as np import pandas as pd -from xarray import Dataset, Variable +from xarray import Dataset, DataArray, Variable from xarray.core import indexing from . 
import TestCase, ReturnItem @@ -85,8 +85,18 @@ def test_convert_label_indexer(self): indexing.convert_label_indexer(index, [0]) with self.assertRaises(KeyError): indexing.convert_label_indexer(index, 0) + with self.assertRaisesRegexp(ValueError, 'does not have a MultiIndex'): + indexing.convert_label_indexer(index, {'one': 0}) + + mindex = pd.MultiIndex(levels=[[1, 2, 3], [4, 5, 6]], + labels=[[0, 1, 2], [0, 1, 2]], + names=['one', 'two']) + with self.assertRaises(KeyError): + indexing.convert_label_indexer(mindex, [0]) + with self.assertRaises(KeyError): + indexing.convert_label_indexer(mindex, 0) with self.assertRaises(ValueError): - indexing.convert_label_indexer(index, {'somelevel': 0}) + indexing.convert_label_indexer(index, {'three': 0}) def test_convert_unsorted_datetime_index_raises(self): index = pd.to_datetime(['2001', '2000', '2002']) @@ -105,6 +115,25 @@ def test_indexer(x): self.assertEqual({'x': 0}, test_indexer(1)[0]) self.assertEqual({'x': 0}, test_indexer(np.int32(1))[0]) self.assertEqual({'x': 0}, test_indexer(Variable([], 1))[0]) + with self.assertRaisesRegexp(ValueError, 'does not have a MultiIndex'): + indexing.remap_label_indexers(data, {'x': {'level': 1}}) + + mindex = pd.MultiIndex(levels=[[1, 2, 3], [4, 5, 6]], + labels=[[0, 1, 2], [0, 1, 2]], + names=['one', 'two']) + s = pd.Series([7, 8, 9], index=mindex) + data = DataArray(s, dims='x') + + pos, idx = indexing.remap_label_indexers(data, {'x': {'one': 1}}) + self.assertArrayEqual(pos['x'], [True, False, False]) + self.assertArrayEqual(idx['x'].values, [4]) + self.assertEqual(idx['x'].name, 'two') + + pos, idx = indexing.remap_label_indexers( + data, {'x': {'one': 1, 'two': 4}} + ) + self.assertArrayEqual(pos['x'], [True, False, False]) + self.assertEqual(len(idx), 0) class TestLazyArray(TestCase): From 3b6d36d21943102786f195ee72f83d2c9fadd04f Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Mon, 30 May 2016 16:05:14 +0200 Subject: [PATCH 09/24] added documentation (indexing) --- 
doc/indexing.rst | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/doc/indexing.rst b/doc/indexing.rst index a64e20610d5..61ae946ced4 100644 --- a/doc/indexing.rst +++ b/doc/indexing.rst @@ -294,6 +294,27 @@ elements that are fully masked: arr2.where(arr2.y < 2, drop=True) +.. _multi-index indexing: + +Multi-index indexing +-------------------- + +The ``loc`` and ``sel`` methods of ``Dataset`` and ``DataArray`` both accept +dictionaries for indexing on multi-index dimensions: + +.. ipython:: python + + idx = pd.MultiIndex.from_product([list('abc'), [0, 1]], + names=('one', 'two')) + da_midx = xr.DataArray(range(6), [('x', idx)]) + da_midx + da_midx.sel(x={'one': 'a', 'two': 0}) + da_midx.loc[{'one': 'a'}, ...] + +As shown in the last example above, xarray handles partial selection on +pandas multi-index ; it automatically renames the dimension and replaces the +coordinate when a single index is returned (level drop). + Multi-dimensional indexing -------------------------- From 0a5a7cf1df7b2039b675a9bc65422495ddadc6f1 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Mon, 30 May 2016 20:45:54 +0200 Subject: [PATCH 10/24] allow multi-index indexing with nested tuples --- doc/indexing.rst | 10 ++++++++++ xarray/core/indexing.py | 18 ++++++++++++++++++ xarray/test/test_dataarray.py | 2 ++ xarray/test/test_dataset.py | 4 +++- xarray/test/test_indexing.py | 25 ++++++++++++++++--------- 5 files changed, 49 insertions(+), 10 deletions(-) diff --git a/doc/indexing.rst b/doc/indexing.rst index 61ae946ced4..64d4f172498 100644 --- a/doc/indexing.rst +++ b/doc/indexing.rst @@ -315,6 +315,16 @@ As shown in the last example above, xarray handles partial selection on pandas multi-index ; it automatically renames the dimension and replaces the coordinate when a single index is returned (level drop). +Like pandas, it is also possible to use tuples of tuples, lists or slices +(in that case xarray always returns the full multi-index): + +.. 
ipython:: python + + da_midx.sel(x=(list('ab'), [0])) + +Indexing with dictionaries uses the ``MultiIndex.get_loc_level`` pandas method +while indexing with nested tuples uses the ``MultiIndex.get_locs`` method. + Multi-dimensional indexing -------------------------- diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index dd62235074b..bd8b26d0715 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -135,6 +135,21 @@ def _asarray_tuplesafe(values): return result +def _is_nested_tuple(tup, index): + """Check for a compatible nested tuple and multiindex (taken from + pandas.core.indexing.is_nested_tuple). + """ + if not isinstance(tup, tuple): + return False + + # are we nested tuple of: tuple,list,slice + for i, k in enumerate(tup): + if isinstance(k, (tuple, list, slice)): + return isinstance(index, pd.MultiIndex) + + return False + + def convert_label_indexer(index, label, index_name='', method=None, tolerance=None): """Given a pandas.Index and labels (e.g., from __getitem__) for one @@ -177,6 +192,9 @@ def convert_label_indexer(index, label, index_name='', method=None, indexer, new_index = index.get_loc_level(tuple(label.values()), level=tuple(label.keys())) + elif _is_nested_tuple(label, index): + indexer = index.get_locs(label) + else: label = _asarray_tuplesafe(label) if label.ndim == 0: diff --git a/xarray/test/test_dataarray.py b/xarray/test/test_dataarray.py index c08af203ae4..58c4bc48679 100644 --- a/xarray/test/test_dataarray.py +++ b/xarray/test/test_dataarray.py @@ -504,6 +504,8 @@ def test_multiindex(self): data.sel(x={'one': 'a'}), data.unstack('x').sel(one='a').dropna('two') ) + self.assertDataArrayIdentical(data.sel(x=('a', slice(None))), + data.isel(x=[0, 1])) self.assertDataArrayIdentical(data.loc['a'], data[:2]) self.assertDataArrayIdentical(data.loc[{'one': 'a', 'two': 0}, ...], diff --git a/xarray/test/test_dataset.py b/xarray/test/test_dataset.py index f18b8885153..2e24d949abb 100644 --- 
a/xarray/test/test_dataset.py +++ b/xarray/test/test_dataset.py @@ -849,7 +849,9 @@ def test_multiindex(self): self.assertDatasetIdentical(data.sel(x=('c', 1)), data.isel(x=-1)) self.assertDatasetIdentical(data.sel(x=[('a', 0)]), data.isel(x=[0])) self.assertDatasetIdentical(data.sel(x=[('a', 0), ('c', 1)]), - data.isel(x=[0, -1])) + data.isel(x=[0, -1])) + self.assertDatasetIdentical(data.sel(x=(['a', 'c'], [0, 1])), + data.isel(x=[0, 1, -2, -1])) self.assertDatasetIdentical(data.sel(x='a'), data.isel(x=slice(2))) self.assertVariableNotEqual(data.sel(x={'one': slice(None)})['var'], data['var']) diff --git a/xarray/test/test_indexing.py b/xarray/test/test_indexing.py index be473eb84a2..f4e5f73021e 100644 --- a/xarray/test/test_indexing.py +++ b/xarray/test/test_indexing.py @@ -88,15 +88,17 @@ def test_convert_label_indexer(self): with self.assertRaisesRegexp(ValueError, 'does not have a MultiIndex'): indexing.convert_label_indexer(index, {'one': 0}) - mindex = pd.MultiIndex(levels=[[1, 2, 3], [4, 5, 6]], - labels=[[0, 1, 2], [0, 1, 2]], + mindex = pd.MultiIndex(levels=[[1, 2, 3], ['a', 'b']], + labels=[[0, 1, 2], [0, 0, 1]], names=['one', 'two']) - with self.assertRaises(KeyError): + with self.assertRaisesRegexp(KeyError, 'not all values found'): indexing.convert_label_indexer(mindex, [0]) with self.assertRaises(KeyError): indexing.convert_label_indexer(mindex, 0) with self.assertRaises(ValueError): indexing.convert_label_indexer(index, {'three': 0}) + with self.assertRaisesRegexp(KeyError, 'index to be fully lexsorted'): + indexing.convert_label_indexer(mindex, (slice(None), 'a', 0)) def test_convert_unsorted_datetime_index_raises(self): index = pd.to_datetime(['2001', '2000', '2002']) @@ -118,23 +120,28 @@ def test_indexer(x): with self.assertRaisesRegexp(ValueError, 'does not have a MultiIndex'): indexing.remap_label_indexers(data, {'x': {'level': 1}}) - mindex = pd.MultiIndex(levels=[[1, 2, 3], [4, 5, 6]], - labels=[[0, 1, 2], [0, 1, 2]], + mindex = 
pd.MultiIndex(levels=[[1, 2, 3], ['a', 'b']], + labels=[[0, 1, 2], [0, 0, 1]], names=['one', 'two']) - s = pd.Series([7, 8, 9], index=mindex) - data = DataArray(s, dims='x') + data = DataArray(range(3), [('x', mindex)]) pos, idx = indexing.remap_label_indexers(data, {'x': {'one': 1}}) self.assertArrayEqual(pos['x'], [True, False, False]) - self.assertArrayEqual(idx['x'].values, [4]) + self.assertArrayEqual(idx['x'].values, ['a']) self.assertEqual(idx['x'].name, 'two') pos, idx = indexing.remap_label_indexers( - data, {'x': {'one': 1, 'two': 4}} + data, {'x': {'one': 1, 'two': 'a'}} ) self.assertArrayEqual(pos['x'], [True, False, False]) self.assertEqual(len(idx), 0) + pos, idx = indexing.remap_label_indexers( + data, {'x': ([1], 'a')} + ) + self.assertArrayEqual(pos['x'], [0]) + self.assertEqual(len(idx), 0) + class TestLazyArray(TestCase): def test_slice_slice(self): From 8a9d48861f8244e0d7d6f92e8e696d58d9977255 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Tue, 31 May 2016 11:21:58 +0200 Subject: [PATCH 11/24] updated what's new --- doc/whats-new.rst | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 1946b5dd67b..e073a0152e1 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -39,10 +39,14 @@ Enhancements attributes are retained in the resampled object. By `Jeremy McGibbon `_. +- DataArray and Dataset methods :py:meth:`sel` and :py:meth:`loc` now + accept dictionaries or nested tuples for indexing on multi-index dimensions. + By `Benoit Bovy `_. + - New (experimental) decorators :py:func:`~xarray.register_dataset_accessor` and :py:func:`~xarray.register_dataarray_accessor` for registering custom xarray extensions without subclassing. They are described in the new documentation - page on :ref:`internals`. By `Stephan Hoyer ` + page on :ref:`internals`. By `Stephan Hoyer `_. - Round trip boolean datatypes. 
Previously, writing boolean datatypes to netCDF formats would raise an error since netCDF does not have a `bool` datatype. From fd26dceae24398dddc1f3f20c4bae6c158405c57 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Sat, 4 Jun 2016 17:42:30 +0200 Subject: [PATCH 12/24] updated doc --- doc/indexing.rst | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/doc/indexing.rst b/doc/indexing.rst index 64d4f172498..3085b136209 100644 --- a/doc/indexing.rst +++ b/doc/indexing.rst @@ -316,15 +316,12 @@ pandas multi-index ; it automatically renames the dimension and replaces the coordinate when a single index is returned (level drop). Like pandas, it is also possible to use tuples of tuples, lists or slices -(in that case xarray always returns the full multi-index): +(for now xarray always returns the full multi-index in that case): .. ipython:: python da_midx.sel(x=(list('ab'), [0])) -Indexing with dictionaries uses the ``MultiIndex.get_loc_level`` pandas method -while indexing with nested tuples uses the ``MultiIndex.get_locs`` method. 
- Multi-dimensional indexing -------------------------- From dabb2ce19ad419e3108e4299418330f10689e2e1 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Sat, 4 Jun 2016 17:43:25 +0200 Subject: [PATCH 13/24] set better default names for multi-index levels --- xarray/core/indexing.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index bd8b26d0715..3e1daca1452 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -187,7 +187,7 @@ def convert_label_indexer(index, label, index_name='', method=None, elif is_dict_like(label): if not isinstance(index, pd.MultiIndex): - raise ValueError('cannot use a dict-like object for selection on a' + raise ValueError('cannot use a dict-like object for selection on a ' 'dimension that does not have a MultiIndex') indexer, new_index = index.get_loc_level(tuple(label.values()), level=tuple(label.keys())) @@ -219,7 +219,17 @@ def remap_label_indexers(data_obj, indexers, method=None, tolerance=None): pos_indexers, new_indexes = {}, {} for dim, label in iteritems(indexers): - idxr, new_idx = convert_label_indexer(data_obj[dim].to_index(), label, + index = data_obj[dim].to_index() + + if isinstance(index, pd.MultiIndex): + # set default names for multi-index unnamed levels so that + # we can safely rename dimension / coordinate later + valid_level_names = [name or '{}_level_{}'.format(dim, i) + for i, name in enumerate(index.names)] + index = index.copy() + index.names = valid_level_names + + idxr, new_idx = convert_label_indexer(index, label, dim, method, tolerance) pos_indexers[dim] = idxr if new_idx is not None and not isinstance(new_idx, pd.MultiIndex): From 5f7d6706dd3b576b372d015e72f64024e32d1228 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Sat, 4 Jun 2016 17:45:29 +0200 Subject: [PATCH 14/24] refactored and fixed dim name / coord replacement --- xarray/core/dataarray.py | 22 +++++++--------------- xarray/core/dataset.py | 24 
++++++++++-------------- 2 files changed, 17 insertions(+), 29 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 9d4ad70f805..bc45de6095c 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -90,12 +90,14 @@ def _remap_key(self, key): # expand the indexer so we can handle Ellipsis labels = indexing.expanded_indexer(key, self.data_array.ndim) key = dict(zip(self.data_array.dims, labels)) - return indexing.remap_label_indexers(self.data_array, key) def __getitem__(self, key): pos_indexers, new_indexes = self._remap_key(key) - return self.data_array[pos_indexers]._replace_indexes(new_indexes) + ds = self.data_array[pos_indexers]._to_temp_dataset() + return self.data_array._from_temp_dataset( + ds._replace_indexes(new_indexes) + ) def __setitem__(self, key, value): pos_indexers, new_indexes = self._remap_key(key) @@ -240,16 +242,6 @@ def _replace_maybe_drop_dims(self, variable, name=__default): if set(v.dims) <= allowed_dims) return self._replace(variable, coords, name) - def _replace_indexes(self, indexes): - obj = self - for dim, idx in iteritems(indexes): - if idx.name is None: - idx.name = dim + "_unnamed_level" - obj = obj.rename({dim: idx.name}) - new_coord = Coordinate(idx.name, idx) - obj = obj._replace(coords={idx.name: new_coord}) - return obj - __this_array = _ThisArray() def _to_temp_dataset(self): @@ -605,10 +597,10 @@ def sel(self, method=None, tolerance=None, **indexers): Dataset.sel DataArray.isel """ - pos_indexers, new_indexes = indexing.remap_label_indexers( - self, indexers, method=method, tolerance=tolerance + ds = self._to_temp_dataset().sel( + method=method, tolerance=tolerance, **indexers ) - return self.isel(**pos_indexers)._replace_indexes(new_indexes) + return self._from_temp_dataset(ds) def isel_points(self, dim='points', **indexers): """Return a new DataArray whose dataset is given by pointwise integer diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 
80692ebcbc7..f8a5a93995b 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -420,20 +420,16 @@ def _replace_vars_and_dims(self, variables, coord_names=None, return obj def _replace_indexes(self, indexes): - obj = self - for dim, idx in iteritems(indexes): - if idx.name is None: - idx.name = dim + "_unnamed_level" - obj = obj.rename({dim: idx.name}) - new_coord = Coordinate(idx.name, idx) - variables = OrderedDict() - for k, v in iteritems(obj._variables): - if k == idx.name: - variables[k] = new_coord - else: - variables[k] = v - obj = obj._replace_vars_and_dims(variables) - return obj + variables = OrderedDict() + for k, v in iteritems(self._variables): + if k in indexes.keys(): + idx = indexes[k] + variables[k] = Coordinate(idx.name, idx) + else: + variables[k] = v + obj = self._replace_vars_and_dims(variables) + dim_names = {dim: idx.name for dim, idx in iteritems(indexes)} + return obj.rename(dim_names) def copy(self, deep=False): """Returns a copy of this dataset. From 555f06e9f2f5c838fc324fa9e10f21c4cc635f74 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Sat, 4 Jun 2016 23:32:40 +0200 Subject: [PATCH 15/24] fix remap_label_indexers tests --- xarray/test/test_indexing.py | 33 ++++++++++++++------------------- 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/xarray/test/test_indexing.py b/xarray/test/test_indexing.py index f4e5f73021e..c152d4d8823 100644 --- a/xarray/test/test_indexing.py +++ b/xarray/test/test_indexing.py @@ -113,10 +113,13 @@ def test_remap_label_indexers(self): data = Dataset({'x': ('x', [1, 2, 3])}) def test_indexer(x): - return indexing.remap_label_indexers(data, {'x': x}) - self.assertEqual({'x': 0}, test_indexer(1)[0]) - self.assertEqual({'x': 0}, test_indexer(np.int32(1))[0]) - self.assertEqual({'x': 0}, test_indexer(Variable([], 1))[0]) + pos, idx = indexing.remap_label_indexers(data, {'x': x}) + return pos + + self.assertEqual({'x': 0}, test_indexer(1)) + self.assertEqual({'x': 0}, 
test_indexer(np.int32(1))) + self.assertEqual({'x': 0}, test_indexer(Variable([], 1))) + with self.assertRaisesRegexp(ValueError, 'does not have a MultiIndex'): indexing.remap_label_indexers(data, {'x': {'level': 1}}) @@ -124,24 +127,16 @@ def test_indexer(x): labels=[[0, 1, 2], [0, 0, 1]], names=['one', 'two']) data = DataArray(range(3), [('x', mindex)]) - - pos, idx = indexing.remap_label_indexers(data, {'x': {'one': 1}}) - self.assertArrayEqual(pos['x'], [True, False, False]) + expected_pos = np.array([True, False, False]) + expected_len_idx = (0, 0, 1) + labels = ({'one': 1, 'two': 'a'}, ([1], 'a'), {'one': 1}) + for lbl, lidx in zip(labels, expected_len_idx): + pos, idx = indexing.remap_label_indexers(data, {'x': lbl}) + self.assertTrue(expected_pos[pos['x']]) + self.assertEqual(len(idx), lidx) self.assertArrayEqual(idx['x'].values, ['a']) self.assertEqual(idx['x'].name, 'two') - pos, idx = indexing.remap_label_indexers( - data, {'x': {'one': 1, 'two': 'a'}} - ) - self.assertArrayEqual(pos['x'], [True, False, False]) - self.assertEqual(len(idx), 0) - - pos, idx = indexing.remap_label_indexers( - data, {'x': ([1], 'a')} - ) - self.assertArrayEqual(pos['x'], [0]) - self.assertEqual(len(idx), 0) - class TestLazyArray(TestCase): def test_slice_slice(self): From 8895e0429886a713da19a182a3cd00b721c8f79b Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Tue, 7 Jun 2016 17:38:32 +0200 Subject: [PATCH 16/24] more detailed doc --- doc/indexing.rst | 38 ++++++++++++++++++++++++++++++++------ 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/doc/indexing.rst b/doc/indexing.rst index 3085b136209..086d5936f1c 100644 --- a/doc/indexing.rst +++ b/doc/indexing.rst @@ -294,19 +294,20 @@ elements that are fully masked: arr2.where(arr2.y < 2, drop=True) -.. _multi-index indexing: +.. 
_multi-level indexing: -Multi-index indexing +Multi-level indexing -------------------- The ``loc`` and ``sel`` methods of ``Dataset`` and ``DataArray`` both accept -dictionaries for indexing on multi-index dimensions: +dictionaries for label-based indexing on multi-index dimensions: .. ipython:: python idx = pd.MultiIndex.from_product([list('abc'), [0, 1]], names=('one', 'two')) - da_midx = xr.DataArray(range(6), [('x', idx)]) + da_midx = xr.DataArray(np.random.rand(6, 3), + [('x', idx), ('y', range(3))]) da_midx da_midx.sel(x={'one': 'a', 'two': 0}) da_midx.loc[{'one': 'a'}, ...] @@ -315,13 +316,38 @@ As shown in the last example above, xarray handles partial selection on pandas multi-index ; it automatically renames the dimension and replaces the coordinate when a single index is returned (level drop). -Like pandas, it is also possible to use tuples of tuples, lists or slices -(for now xarray always returns the full multi-index in that case): +Like pandas, it is also possible to slice a multi-indexed dimension by providing +a tuple of multiple indexers (i.e., slices, labels, list of labels, or any +selector allowed by pandas). Note that for now xarray doesn't fully handle +partial selection in that case (no level drop is done): .. ipython:: python da_midx.sel(x=(list('ab'), [0])) +Lists or slices of tuples can be used to select several combinations of +multi-index labels: + +.. ipython:: python + + da_midx.sel(x=[('a', 0), ('b', 1)]) + +A single, flat tuple can be used to select a given combination of +multi-index labels: + +.. ipython:: python + + da_midx.sel(x=('a', 0)) + +Unlike pandas, xarray can't make the distinction between index levels and +dimensions when using ``loc`` in some ambiguous cases. For example, for +``da_midx.loc[{'one': 'a', 'two': 0}]`` and ``da_midx.loc['a', 0]`` xarray +always interprets ('one', 'two') and ('a', 0) as the names and +labels of the 1st and 2nd dimension, respectively. 
You must specify all +dimensions or use the ellipsis in the ``loc`` specifier, e.g. in the example +above, ``da_midx.loc[{'one': 'a', 'two': 0}, :]`` or +``da_midx.loc[('a', 0), ...]``. + Multi-dimensional indexing -------------------------- From e900e9a0ed9d92d8ff3bbcb981ad2520f64540d1 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Wed, 8 Jun 2016 15:30:47 +0200 Subject: [PATCH 17/24] re-written _replace_indexes --- xarray/core/dataset.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index f8a5a93995b..50d2f12d1a1 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -420,16 +420,21 @@ def _replace_vars_and_dims(self, variables, coord_names=None, return obj def _replace_indexes(self, indexes): - variables = OrderedDict() - for k, v in iteritems(self._variables): - if k in indexes.keys(): - idx = indexes[k] - variables[k] = Coordinate(idx.name, idx) - else: - variables[k] = v + if not len(indexes): + return self + variables = self._variables.copy() + for name, idx in indexes.items(): + variables[name] = Coordinate(name, idx) obj = self._replace_vars_and_dims(variables) - dim_names = {dim: idx.name for dim, idx in iteritems(indexes)} - return obj.rename(dim_names) + + # switch from dimension to level names, if necessary + dim_names = {} + for dim, idx in indexes.items(): + if idx.name != dim: + dim_names[dim] = idx.name + if dim_names: + obj = obj.rename(dim_names) + return obj def copy(self, deep=False): """Returns a copy of this dataset. 
@@ -1130,7 +1135,7 @@ def sel_points(self, dim='points', method=None, tolerance=None, Dataset.isel_points DataArray.sel_points """ - pos_indexers, new_indexes = indexing.remap_label_indexers( + pos_indexers, _ = indexing.remap_label_indexers( self, indexers, method=method, tolerance=tolerance ) return self.isel_points(dim=dim, **pos_indexers) From d73cff52002667e95a8d0b0cd91844db05e244ab Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Wed, 8 Jun 2016 15:50:08 +0200 Subject: [PATCH 18/24] avoid creating temp dataset for dataarray sel/loc --- xarray/core/dataarray.py | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index bc45de6095c..0079b9ee923 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -94,13 +94,10 @@ def _remap_key(self, key): def __getitem__(self, key): pos_indexers, new_indexes = self._remap_key(key) - ds = self.data_array[pos_indexers]._to_temp_dataset() - return self.data_array._from_temp_dataset( - ds._replace_indexes(new_indexes) - ) + return self.data_array[pos_indexers]._replace_indexes(new_indexes) def __setitem__(self, key, value): - pos_indexers, new_indexes = self._remap_key(key) + pos_indexers, _ = self._remap_key(key) self.data_array[pos_indexers] = value @@ -242,6 +239,23 @@ def _replace_maybe_drop_dims(self, variable, name=__default): if set(v.dims) <= allowed_dims) return self._replace(variable, coords, name) + def _replace_indexes(self, indexes): + if not len(indexes): + return self + coords = self._coords.copy() + for name, idx in indexes.items(): + coords[name] = Coordinate(name, idx) + obj = self._replace(coords=coords) + + # switch from dimension to level names, if necessary + dim_names = {} + for dim, idx in indexes.items(): + if idx.name != dim: + dim_names[dim] = idx.name + if dim_names: + obj = obj.rename(dim_names) + return obj + __this_array = _ThisArray() def _to_temp_dataset(self): @@ -597,10 +611,10 @@ 
def sel(self, method=None, tolerance=None, **indexers): Dataset.sel DataArray.isel """ - ds = self._to_temp_dataset().sel( - method=method, tolerance=tolerance, **indexers + pos_indexers, new_indexes = indexing.remap_label_indexers( + self, indexers, method=method, tolerance=tolerance ) - return self._from_temp_dataset(ds) + return self.isel(**pos_indexers)._replace_indexes(new_indexes) def isel_points(self, dim='points', **indexers): """Return a new DataArray whose dataset is given by pointwise integer From 31b2c5008f1442834bd7311f501f7c3a59568131 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Wed, 8 Jun 2016 16:52:06 +0200 Subject: [PATCH 19/24] clean up internal function is_nested_tuple --- xarray/core/indexing.py | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 3e1daca1452..d55873c9ee2 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -135,19 +135,10 @@ def _asarray_tuplesafe(values): return result -def _is_nested_tuple(tup, index): - """Check for a compatible nested tuple and multiindex (taken from - pandas.core.indexing.is_nested_tuple). 
- """ - if not isinstance(tup, tuple): - return False - - # are we nested tuple of: tuple,list,slice - for i, k in enumerate(tup): - if isinstance(k, (tuple, list, slice)): - return isinstance(index, pd.MultiIndex) - - return False +def _is_nested_tuple(possible_tuple): + return (isinstance(possible_tuple, tuple) + and any(isinstance(value, (tuple, list, slice)) + for value in possible_tuple)) def convert_label_indexer(index, label, index_name='', method=None, @@ -192,7 +183,7 @@ def convert_label_indexer(index, label, index_name='', method=None, indexer, new_index = index.get_loc_level(tuple(label.values()), level=tuple(label.keys())) - elif _is_nested_tuple(label, index): + elif _is_nested_tuple(label) and isinstance(index, pd.MultiIndex): indexer = index.get_locs(label) else: From 6e17a85ec0a47f6d2709d313fe8fba40d10244e6 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Fri, 10 Jun 2016 17:14:47 +0200 Subject: [PATCH 20/24] better handling of multi-index level drop --- xarray/core/dataarray.py | 2 +- xarray/core/dataset.py | 2 +- xarray/core/indexing.py | 23 +++++++++---- xarray/test/test_dataarray.py | 64 +++++++++++++++++++++-------------- xarray/test/test_dataset.py | 59 ++++++++++++++++++-------------- xarray/test/test_indexing.py | 62 ++++++++++++++++++--------------- 6 files changed, 125 insertions(+), 87 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 0079b9ee923..c996365b190 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -250,7 +250,7 @@ def _replace_indexes(self, indexes): # switch from dimension to level names, if necessary dim_names = {} for dim, idx in indexes.items(): - if idx.name != dim: + if not isinstance(idx, pd.MultiIndex) and idx.name != dim: dim_names[dim] = idx.name if dim_names: obj = obj.rename(dim_names) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 50d2f12d1a1..5139004c93b 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -430,7 +430,7 @@ 
def _replace_indexes(self, indexes): # switch from dimension to level names, if necessary dim_names = {} for dim, idx in indexes.items(): - if idx.name != dim: + if not isinstance(idx, pd.MultiIndex) and idx.name != dim: dim_names[dim] = idx.name if dim_names: obj = obj.rename(dim_names) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index d55873c9ee2..bb9a145f50a 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -177,14 +177,25 @@ def convert_label_indexer(index, label, index_name='', method=None, 'the index is unsorted or non-unique') elif is_dict_like(label): + is_nested_vals = _is_nested_tuple(tuple(label.values())) if not isinstance(index, pd.MultiIndex): raise ValueError('cannot use a dict-like object for selection on a ' 'dimension that does not have a MultiIndex') - indexer, new_index = index.get_loc_level(tuple(label.values()), - level=tuple(label.keys())) - - elif _is_nested_tuple(label) and isinstance(index, pd.MultiIndex): - indexer = index.get_locs(label) + elif len(label) == index.nlevels and not is_nested_vals: + indexer = index.get_loc(tuple((label[k] for k in index.names))) + else: + indexer, new_index = index.get_loc_level(tuple(label.values()), + level=tuple(label.keys())) + + elif isinstance(label, tuple) and isinstance(index, pd.MultiIndex): + if _is_nested_tuple(label): + indexer = index.get_locs(label) + elif len(label) == index.nlevels: + indexer = index.get_loc(label) + else: + indexer, new_index = index.get_loc_level( + label, level=list(range(len(label))) + ) else: label = _asarray_tuplesafe(label) @@ -223,7 +234,7 @@ def remap_label_indexers(data_obj, indexers, method=None, tolerance=None): idxr, new_idx = convert_label_indexer(index, label, dim, method, tolerance) pos_indexers[dim] = idxr - if new_idx is not None and not isinstance(new_idx, pd.MultiIndex): + if new_idx is not None: new_indexes[dim] = new_idx return pos_indexers, new_indexes diff --git a/xarray/test/test_dataarray.py 
b/xarray/test/test_dataarray.py index 58c4bc48679..3ef17a59a83 100644 --- a/xarray/test/test_dataarray.py +++ b/xarray/test/test_dataarray.py @@ -486,32 +486,44 @@ def test_loc_single_boolean(self): self.assertEqual(data.loc[False], 1) def test_multiindex(self): - idx = pd.MultiIndex.from_product([list('abc'), [0, 1]], - names=('one', 'two')) - data = DataArray(range(6), [('x', idx)]) - - self.assertDataArrayIdentical(data.sel(x=('a', 0)), data.isel(x=0)) - self.assertDataArrayIdentical(data.sel(x=('c', 1)), data.isel(x=-1)) - self.assertDataArrayIdentical(data.sel(x=[('a', 0)]), data.isel(x=[0])) - self.assertDataArrayIdentical(data.sel(x=[('a', 0), ('c', 1)]), - data.isel(x=[0, -1])) - self.assertDataArrayIdentical(data.sel(x='a'), data.isel(x=slice(2))) - self.assertVariableNotEqual(data.sel(x={'one': slice(None)}), data) - self.assertDataArrayIdentical(data.isel(x=[0]), - data.sel(x={'one': 'a', 'two': 0})) - self.assertDataArrayIdentical(data.isel(x=[0, 1]), data.sel(x='a')) - self.assertVariableIdentical( - data.sel(x={'one': 'a'}), - data.unstack('x').sel(one='a').dropna('two') - ) - self.assertDataArrayIdentical(data.sel(x=('a', slice(None))), - data.isel(x=[0, 1])) - - self.assertDataArrayIdentical(data.loc['a'], data[:2]) - self.assertDataArrayIdentical(data.loc[{'one': 'a', 'two': 0}, ...], - data[[0]]) - self.assertDataArrayIdentical(data.loc[{'one': 'a'}, ...], - data.sel(x={'one': 'a'})) + mindex = pd.MultiIndex.from_product([['a', 'b'], [1, 2], [-1, -2]], + names=('one', 'two', 'three')) + mdata = DataArray(range(8), [('x', mindex)]) + + def test_sel(lab_indexer, pos_indexer, replaced_idx=False, + renamed_dim=None): + da = mdata.sel(x=lab_indexer) + expected_da = mdata.isel(x=pos_indexer) + if not replaced_idx: + self.assertDataArrayIdentical(da, expected_da) + else: + if renamed_dim: + self.assertEqual(da.dims[0], renamed_dim) + da = da.rename({renamed_dim: 'x'}) + self.assertVariableIdentical(da, expected_da) + self.assertVariableNotEqual(da['x'], 
expected_da['x']) + + test_sel(('a', 1, -1), 0) + test_sel(('b', 2, -2), -1) + test_sel(('a', 1), [0, 1], replaced_idx=True, renamed_dim='three') + test_sel(('a',), range(4), replaced_idx=True) + test_sel([('a', 1, -1), ('b', 2, -2)], [0, 7]) + test_sel(slice('a', 'b'), range(8)) + test_sel(slice(('a', 1), ('b', 1)), range(6)) + test_sel({'one': 'a', 'two': 1, 'three': -1}, 0) + test_sel({'one': 'a', 'two': 1}, [0, 1], replaced_idx=True, + renamed_dim='three') + test_sel({'one': 'a'}, range(4), replaced_idx=True) + + self.assertDataArrayIdentical(mdata.loc['a'], mdata.sel(x='a')) + self.assertDataArrayIdentical(mdata.loc[('a', 1), ...], + mdata.sel(x=('a', 1))) + self.assertDataArrayIdentical(mdata.loc[{'one': 'a'}, ...], + mdata.sel(x={'one': 'a'})) + with self.assertRaises(KeyError): + mdata.loc[{'one': 'a'}] + with self.assertRaises(IndexError): + mdata.loc[('a', 1)] def test_time_components(self): dates = pd.date_range('2000-01-01', periods=10) diff --git a/xarray/test/test_dataset.py b/xarray/test/test_dataset.py index 2e24d949abb..61d791ed324 100644 --- a/xarray/test/test_dataset.py +++ b/xarray/test/test_dataset.py @@ -841,31 +841,40 @@ def test_loc(self): data.loc[dict(dim3='a')] = 0 def test_multiindex(self): - idx = pd.MultiIndex.from_product([list('abc'), [0, 1]], - names=('one', 'two')) - data = Dataset(data_vars={'var': ('x', range(6))}, coords={'x': idx}) - - self.assertDatasetIdentical(data.sel(x=('a', 0)), data.isel(x=0)) - self.assertDatasetIdentical(data.sel(x=('c', 1)), data.isel(x=-1)) - self.assertDatasetIdentical(data.sel(x=[('a', 0)]), data.isel(x=[0])) - self.assertDatasetIdentical(data.sel(x=[('a', 0), ('c', 1)]), - data.isel(x=[0, -1])) - self.assertDatasetIdentical(data.sel(x=(['a', 'c'], [0, 1])), - data.isel(x=[0, 1, -2, -1])) - self.assertDatasetIdentical(data.sel(x='a'), data.isel(x=slice(2))) - self.assertVariableNotEqual(data.sel(x={'one': slice(None)})['var'], - data['var']) - self.assertDatasetIdentical(data.isel(x=[0]), - 
data.sel(x={'one': 'a', 'two': 0})) - self.assertDatasetIdentical(data.isel(x=[0, 1]), data.sel(x='a')) - self.assertVariableIdentical( - data.sel(x={'one': 'a'})['var'], - data.unstack('x').sel(one='a').dropna('two')['var'] - ) - - self.assertDatasetIdentical(data.loc[{'x': 'a'}], data.sel(x='a')) - self.assertDatasetIdentical(data.loc[{'x': {'one': 'a', 'two': 0}}], - data.sel(x={'one': 'a', 'two': 0})) + mindex = pd.MultiIndex.from_product([['a', 'b'], [1, 2], [-1, -2]], + names=('one', 'two', 'three')) + mdata = Dataset(data_vars={'var': ('x', range(8))}, + coords={'x': mindex}) + + def test_sel(lab_indexer, pos_indexer, replaced_idx=False, + renamed_dim=None): + da = mdata.sel(x=lab_indexer) + expected_da = mdata.isel(x=pos_indexer) + if not replaced_idx: + self.assertDatasetIdentical(da, expected_da) + else: + if renamed_dim: + self.assertEqual(da['var'].dims[0], renamed_dim) + da = da.rename({renamed_dim: 'x'}) + self.assertVariableIdentical(da['var'], expected_da['var']) + self.assertVariableNotEqual(da['x'], expected_da['x']) + + test_sel(('a', 1, -1), 0) + test_sel(('b', 2, -2), -1) + test_sel(('a', 1), [0, 1], replaced_idx=True, renamed_dim='three') + test_sel(('a',), range(4), replaced_idx=True) + test_sel([('a', 1, -1), ('b', 2, -2)], [0, 7]) + test_sel(slice('a', 'b'), range(8)) + test_sel(slice(('a', 1), ('b', 1)), range(6)) + test_sel({'one': 'a', 'two': 1, 'three': -1}, 0) + test_sel({'one': 'a', 'two': 1}, [0, 1], replaced_idx=True, + renamed_dim='three') + test_sel({'one': 'a'}, range(4), replaced_idx=True) + + self.assertDatasetIdentical(mdata.loc[{'x': {'one': 'a'}}], + mdata.sel(x={'one': 'a'})) + with self.assertRaises(KeyError): + mdata.loc[{'one': 'a'}] def test_reindex_like(self): data = create_test_data() diff --git a/xarray/test/test_indexing.py b/xarray/test/test_indexing.py index c152d4d8823..cb8e7c06323 100644 --- a/xarray/test/test_indexing.py +++ b/xarray/test/test_indexing.py @@ -88,9 +88,8 @@ def test_convert_label_indexer(self): 
with self.assertRaisesRegexp(ValueError, 'does not have a MultiIndex'): indexing.convert_label_indexer(index, {'one': 0}) - mindex = pd.MultiIndex(levels=[[1, 2, 3], ['a', 'b']], - labels=[[0, 1, 2], [0, 0, 1]], - names=['one', 'two']) + mindex = pd.MultiIndex.from_product([['a', 'b'], [1, 2]], + names=('one', 'two')) with self.assertRaisesRegexp(KeyError, 'not all values found'): indexing.convert_label_indexer(mindex, [0]) with self.assertRaises(KeyError): @@ -98,7 +97,7 @@ def test_convert_label_indexer(self): with self.assertRaises(ValueError): indexing.convert_label_indexer(index, {'three': 0}) with self.assertRaisesRegexp(KeyError, 'index to be fully lexsorted'): - indexing.convert_label_indexer(mindex, (slice(None), 'a', 0)) + indexing.convert_label_indexer(mindex, (slice(None), 1, 'no_level')) def test_convert_unsorted_datetime_index_raises(self): index = pd.to_datetime(['2001', '2000', '2002']) @@ -110,32 +109,39 @@ def test_convert_unsorted_datetime_index_raises(self): def test_remap_label_indexers(self): # TODO: fill in more tests! 
- data = Dataset({'x': ('x', [1, 2, 3])}) - - def test_indexer(x): + def test_indexer(data, x, expected_pos, expected_idx=None): pos, idx = indexing.remap_label_indexers(data, {'x': x}) - return pos - - self.assertEqual({'x': 0}, test_indexer(1)) - self.assertEqual({'x': 0}, test_indexer(np.int32(1))) - self.assertEqual({'x': 0}, test_indexer(Variable([], 1))) + self.assertArrayEqual(pos.get('x'), expected_pos) + self.assertArrayEqual(idx.get('x'), expected_idx) - with self.assertRaisesRegexp(ValueError, 'does not have a MultiIndex'): - indexing.remap_label_indexers(data, {'x': {'level': 1}}) - - mindex = pd.MultiIndex(levels=[[1, 2, 3], ['a', 'b']], - labels=[[0, 1, 2], [0, 0, 1]], - names=['one', 'two']) - data = DataArray(range(3), [('x', mindex)]) - expected_pos = np.array([True, False, False]) - expected_len_idx = (0, 0, 1) - labels = ({'one': 1, 'two': 'a'}, ([1], 'a'), {'one': 1}) - for lbl, lidx in zip(labels, expected_len_idx): - pos, idx = indexing.remap_label_indexers(data, {'x': lbl}) - self.assertTrue(expected_pos[pos['x']]) - self.assertEqual(len(idx), lidx) - self.assertArrayEqual(idx['x'].values, ['a']) - self.assertEqual(idx['x'].name, 'two') + data = Dataset({'x': ('x', [1, 2, 3])}) + mindex = pd.MultiIndex.from_product([['a', 'b'], [1, 2], [-1, -2]], + names=('one', 'two', 'three')) + mdata = DataArray(range(8), [('x', mindex)]) + + test_indexer(data, 1, 0) + test_indexer(data, np.int32(1), 0) + test_indexer(data, Variable([], 1), 0) + test_indexer(mdata, ('a', 1, -1), 0) + test_indexer(mdata, ('a', 1), + [True, True, False, False, False, False, False, False], + [-1, -2]) + test_indexer(mdata, ('a',), + [True, True, True, True, False, False, False, False], + pd.MultiIndex.from_product([[1, 2], [-1, -2]])) + test_indexer(mdata, [('a', 1, -1), ('b', 2, -2)], [0, 7]) + test_indexer(mdata, slice('a', 'b'), slice(0, 8, None)) + test_indexer(mdata, slice(('a', 1), ('b', 1)), slice(0, 6, None)) + test_indexer(mdata, {'one': 'a', 'two': 1, 'three': -1}, 
0) + test_indexer(mdata, {'one': 'a', 'two': 1}, + [True, True, False, False, False, False, False, False], + [-1, -2]) + test_indexer(mdata, {'one': 'a', 'three': -1}, + [True, False, True, False, False, False, False, False], + [1, 2]) + test_indexer(mdata, {'one': 'a'}, + [True, True, True, True, False, False, False, False], + pd.MultiIndex.from_product([[1, 2], [-1, -2]])) class TestLazyArray(TestCase): From 934beef3a5578e1386b0e39f2e33e4666ea2c304 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Fri, 10 Jun 2016 17:42:24 +0200 Subject: [PATCH 21/24] more global handling of unnamed multi-index levels --- xarray/core/dataset.py | 3 --- xarray/core/indexing.py | 9 --------- xarray/core/variable.py | 8 +++++++- xarray/test/test_dataset.py | 4 ---- xarray/test/test_variable.py | 5 +++++ 5 files changed, 12 insertions(+), 17 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 5139004c93b..aec64a574bd 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -1418,9 +1418,6 @@ def unstack(self, dim): obj = self.reindex(copy=False, **{dim: full_idx}) new_dim_names = index.names - if any(name is None for name in new_dim_names): - raise ValueError('cannot unstack dimension with unnamed levels') - new_dim_sizes = [lev.size for lev in index.levels] variables = OrderedDict() diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index bb9a145f50a..65e8d05a2de 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -222,15 +222,6 @@ def remap_label_indexers(data_obj, indexers, method=None, tolerance=None): pos_indexers, new_indexes = {}, {} for dim, label in iteritems(indexers): index = data_obj[dim].to_index() - - if isinstance(index, pd.MultiIndex): - # set default names for multi-index unnamed levels so that - # we can safely rename dimension / coordinate later - valid_level_names = [name or '{}_level_{}'.format(dim, i) - for i, name in enumerate(index.names)] - index = index.copy() - index.names = 
valid_level_names - idxr, new_idx = convert_label_indexer(index, label, dim, method, tolerance) pos_indexers[dim] = idxr diff --git a/xarray/core/variable.py b/xarray/core/variable.py index cb3041ddcd8..c6e9a1ead8b 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1096,7 +1096,13 @@ def to_index(self): # basically free as pandas.Index objects are immutable assert self.ndim == 1 index = self._data_cached().array - if not isinstance(index, pd.MultiIndex): + if isinstance(index, pd.MultiIndex): + # set default names for multi-index unnamed levels so that + # we can safely rename dimension / coordinate later + valid_level_names = [name or '{}_level_{}'.format(self.name, i) + for i, name in enumerate(index.names)] + index = index.set_names(valid_level_names) + else: index = index.set_names(self.name) return index diff --git a/xarray/test/test_dataset.py b/xarray/test/test_dataset.py index 61d791ed324..15d8f3ed0be 100644 --- a/xarray/test/test_dataset.py +++ b/xarray/test/test_dataset.py @@ -1213,10 +1213,6 @@ def test_unstack_errors(self): with self.assertRaisesRegexp(ValueError, 'does not have a MultiIndex'): ds.unstack('x') - ds2 = Dataset({'x': pd.Index([(0, 1)])}) - with self.assertRaisesRegexp(ValueError, 'unnamed levels'): - ds2.unstack('x') - def test_stack_unstack(self): ds = Dataset({'a': ('x', [0, 1]), 'b': (('x', 'y'), [[0, 1], [2, 3]]), diff --git a/xarray/test/test_variable.py b/xarray/test/test_variable.py index 8304b50c315..70592c04a05 100644 --- a/xarray/test/test_variable.py +++ b/xarray/test/test_variable.py @@ -971,6 +971,11 @@ def test_to_index(self): v = Coordinate(['time'], data, {'foo': 'bar'}) self.assertTrue(pd.Index(data, name='time').identical(v.to_index())) + def test_multiindex_default_level_names(self): + midx = pd.MultiIndex.from_product([['a', 'b'], [1, 2]]) + v = Coordinate(['x'], midx, {'foo': 'bar'}) + self.assertEqual(v.to_index().names, ('x_level_0', 'x_level_1')) + def test_data(self): x = Coordinate('x', 
np.arange(3.0)) # data should be initially saved as an ndarray From 03c21bdb5801ecf6742f4ea234175e315af88eed Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Fri, 10 Jun 2016 18:37:29 +0200 Subject: [PATCH 22/24] updated doc --- doc/data-structures.rst | 8 +++---- doc/indexing.rst | 53 +++++++++++++++++------------------------ doc/whats-new.rst | 11 ++++++--- 3 files changed, 33 insertions(+), 39 deletions(-) diff --git a/doc/data-structures.rst b/doc/data-structures.rst index 7705c7954d5..f1dd6a5f482 100644 --- a/doc/data-structures.rst +++ b/doc/data-structures.rst @@ -115,11 +115,9 @@ If you create a ``DataArray`` by supplying a pandas df xr.DataArray(df) -xarray does not (yet!) support labeling coordinate values with a -:py:class:`pandas.MultiIndex` (see :issue:`164`). -However, the alternate ``from_series`` constructor will automatically unpack -any hierarchical indexes it encounters by expanding the series into a -multi-dimensional array, as described in :doc:`pandas`. +Xarray supports labeling coordinate values with a :py:class:`pandas.MultiIndex`. +While it handles multi-indexes with unnamed levels, it is recommended that you +explicitly set the names of the levels. DataArray properties ~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/indexing.rst b/doc/indexing.rst index 086d5936f1c..e8ac7393d0b 100644 --- a/doc/indexing.rst +++ b/doc/indexing.rst @@ -299,54 +299,45 @@ elements that are fully masked: Multi-level indexing -------------------- -The ``loc`` and ``sel`` methods of ``Dataset`` and ``DataArray`` both accept -dictionaries for label-based indexing on multi-index dimensions: +Just like pandas, advanced indexing on multi-level indexes is also possible with +``loc`` and ``sel``. You can slice a multi-index by providing multiple indexers, +i.e., a tuple of slices, labels, list of labels, or any selector allowed by +pandas (see :doc:`pandas`): .. 
ipython:: python - idx = pd.MultiIndex.from_product([list('abc'), [0, 1]], - names=('one', 'two')) - da_midx = xr.DataArray(np.random.rand(6, 3), - [('x', idx), ('y', range(3))]) - da_midx - da_midx.sel(x={'one': 'a', 'two': 0}) - da_midx.loc[{'one': 'a'}, ...] + midx = pd.MultiIndex.from_product([list('abc'), [0, 1]], + names=('one', 'two')) + mda = xr.DataArray(np.random.rand(6, 3), + [('x', midx), ('y', range(3))]) + mda + mda.sel(x=(list('ab'), [0])) -As shown in the last example above, xarray handles partial selection on -pandas multi-index ; it automatically renames the dimension and replaces the -coordinate when a single index is returned (level drop). - -Like pandas, it is also possible to slice a multi-indexed dimension by providing -a tuple of multiple indexers (i.e., slices, labels, list of labels, or any -selector allowed by pandas). Note that for now xarray doesn't fully handle -partial selection in that case (no level drop is done): +You can also select multiple elements by providing a list of labels or tuples or +a slice of tuples: .. ipython:: python - da_midx.sel(x=(list('ab'), [0])) + mda.sel(x=[('a', 0), ('b', 1)]) -Lists or slices of tuples can be used to select several combinations of -multi-index labels: +Additionally, xarray supports dictionaries: .. ipython:: python - da_midx.sel(x=[('a', 0), ('b', 1)]) - -A single, flat tuple can be used to select a given combination of -multi-index labels: - -.. ipython:: python + mda.sel(x={'one': 'a', 'two': 0}) + mda.loc[{'one': 'a'}, ...] - da_midx.sel(x=('a', 0)) +Like pandas, xarray handles partial selection on multi-index (level drop). +As shown in the last example above, it also renames the dimension / coordinate +when the multi-index is reduced to a single index. -Unlike pandas, xarray can't make the distinction between index levels and +Unlike pandas, xarray does not guess whether you provide index levels or dimensions when using ``loc`` in some ambiguous cases. 
For example, for -``da_midx.loc[{'one': 'a', 'two': 0}]`` and ``da_midx.loc['a', 0]`` xarray +``mda.loc[{'one': 'a', 'two': 0}]`` and ``mda.loc['a', 0]`` xarray always interprets ('one', 'two') and ('a', 0) as the names and labels of the 1st and 2nd dimension, respectively. You must specify all dimensions or use the ellipsis in the ``loc`` specifier, e.g. in the example -above, ``da_midx.loc[{'one': 'a', 'two': 0}, :]`` or -``da_midx.loc[('a', 0), ...]``. +above, ``mda.loc[{'one': 'a', 'two': 0}, :]`` or ``mda.loc[('a', 0), ...]``. Multi-dimensional indexing -------------------------- diff --git a/doc/whats-new.rst b/doc/whats-new.rst index e073a0152e1..93b973467ae 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -26,6 +26,9 @@ Breaking changes ~~~~~~~~~~~~~~~~ - Dropped support for Python 2.6 (:issue:`855`). +- Indexing on multi-index now drops levels, which is consistent with pandas. + It also changes the name of the dimension / coordinate when the multi-index is + reduced to a single index. Enhancements ~~~~~~~~~~~~ @@ -39,9 +42,11 @@ Enhancements attributes are retained in the resampled object. By `Jeremy McGibbon `_. -- DataArray and Dataset methods :py:meth:`sel` and :py:meth:`loc` now - accept dictionaries or nested tuples for indexing on multi-index dimensions. - By `Benoit Bovy `_. +- Better multi-index support in DataArray and Dataset :py:meth:`sel` and + :py:meth:`loc` methods, which now behave more closely to pandas and which + also accept dictionaries for indexing based on given level names and labels + (see :ref:`multi-level indexing`). By + `Benoit Bovy `_. 
- New (experimental) decorators :py:func:`~xarray.register_dataset_accessor` and :py:func:`~xarray.register_dataarray_accessor` for registering custom xarray From 030ee25240d8206ec69f30f8b89cd47ac2c8e20a Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Sat, 11 Jun 2016 01:15:48 +0200 Subject: [PATCH 23/24] typos and missing details (docstrings, doc) --- doc/indexing.rst | 4 ++-- xarray/core/indexing.py | 8 ++++---- xarray/test/test_dataset.py | 14 +++++++------- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/doc/indexing.rst b/doc/indexing.rst index e8ac7393d0b..d21adda2c8e 100644 --- a/doc/indexing.rst +++ b/doc/indexing.rst @@ -299,10 +299,10 @@ elements that are fully masked: Multi-level indexing -------------------- -Just like pandas, advanced indexing on multi-level indexes is also possible with +Just like pandas, advanced indexing on multi-level indexes is possible with ``loc`` and ``sel``. You can slice a multi-index by providing multiple indexers, i.e., a tuple of slices, labels, list of labels, or any selector allowed by -pandas (see :doc:`pandas`): +pandas: .. ipython:: python diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 65e8d05a2de..e1719e39075 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -145,8 +145,8 @@ def convert_label_indexer(index, label, index_name='', method=None, tolerance=None): """Given a pandas.Index and labels (e.g., from __getitem__) for one dimension, return an indexer suitable for indexing an ndarray along that - dimension. If label is a dict-like object and a pandas.MultiIndex is given, - also return a new pandas.Index, otherwise return None. + dimension. If `index` is a pandas.MultiIndex and depending on `label`, + return a new pandas.Index or pandas.MultiIndex (otherwise return None). 
""" # backwards compatibility for pandas<0.16 (method) or pandas<0.17 # (tolerance) @@ -213,8 +213,8 @@ def convert_label_indexer(index, label, index_name='', method=None, def remap_label_indexers(data_obj, indexers, method=None, tolerance=None): """Given an xarray data object and label based indexers, return a mapping - of equivalent location based indexers. Also return a mapping of pandas' - single index objects returned from multi-index objects. + of equivalent location based indexers. Also return a mapping of updated + pandas index objects (in case of multi-index level drop). """ if method is not None and not isinstance(method, str): raise TypeError('``method`` must be a string') diff --git a/xarray/test/test_dataset.py b/xarray/test/test_dataset.py index 15d8f3ed0be..84f3c80c9e4 100644 --- a/xarray/test/test_dataset.py +++ b/xarray/test/test_dataset.py @@ -848,16 +848,16 @@ def test_multiindex(self): def test_sel(lab_indexer, pos_indexer, replaced_idx=False, renamed_dim=None): - da = mdata.sel(x=lab_indexer) - expected_da = mdata.isel(x=pos_indexer) + ds = mdata.sel(x=lab_indexer) + expected_ds = mdata.isel(x=pos_indexer) if not replaced_idx: - self.assertDatasetIdentical(da, expected_da) + self.assertDatasetIdentical(ds, expected_ds) else: if renamed_dim: - self.assertEqual(da['var'].dims[0], renamed_dim) - da = da.rename({renamed_dim: 'x'}) - self.assertVariableIdentical(da['var'], expected_da['var']) - self.assertVariableNotEqual(da['x'], expected_da['x']) + self.assertEqual(ds['var'].dims[0], renamed_dim) + ds = ds.rename({renamed_dim: 'x'}) + self.assertVariableIdentical(ds['var'], expected_ds['var']) + self.assertVariableNotEqual(ds['x'], expected_ds['x']) test_sel(('a', 1, -1), 0) test_sel(('b', 2, -2), -1) From 712497c3997e72a36cafc8fb9eaafbecc76af5dc Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Sat, 11 Jun 2016 12:04:00 +0200 Subject: [PATCH 24/24] handle multi-index level drop for scalar labels --- xarray/core/indexing.py | 5 ++++- 
xarray/test/test_dataarray.py | 1 + xarray/test/test_dataset.py | 7 +++++++ xarray/test/test_indexing.py | 2 ++ 4 files changed, 14 insertions(+), 1 deletion(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index e1719e39075..8c685a24b26 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -200,7 +200,10 @@ def convert_label_indexer(index, label, index_name='', method=None, else: label = _asarray_tuplesafe(label) if label.ndim == 0: - indexer = index.get_loc(label.item(), **kwargs) + if isinstance(index, pd.MultiIndex): + indexer, new_index = index.get_loc_level(label.item(), level=0) + else: + indexer = index.get_loc(label.item(), **kwargs) elif label.dtype.kind == 'b': indexer, = np.nonzero(label) else: diff --git a/xarray/test/test_dataarray.py b/xarray/test/test_dataarray.py index 3ef17a59a83..02d7da50188 100644 --- a/xarray/test/test_dataarray.py +++ b/xarray/test/test_dataarray.py @@ -507,6 +507,7 @@ def test_sel(lab_indexer, pos_indexer, replaced_idx=False, test_sel(('b', 2, -2), -1) test_sel(('a', 1), [0, 1], replaced_idx=True, renamed_dim='three') test_sel(('a',), range(4), replaced_idx=True) + test_sel('a', range(4), replaced_idx=True) test_sel([('a', 1, -1), ('b', 2, -2)], [0, 7]) test_sel(slice('a', 'b'), range(8)) test_sel(slice(('a', 1), ('b', 1)), range(6)) diff --git a/xarray/test/test_dataset.py b/xarray/test/test_dataset.py index 84f3c80c9e4..c44cc373ec3 100644 --- a/xarray/test/test_dataset.py +++ b/xarray/test/test_dataset.py @@ -863,6 +863,7 @@ def test_sel(lab_indexer, pos_indexer, replaced_idx=False, test_sel(('b', 2, -2), -1) test_sel(('a', 1), [0, 1], replaced_idx=True, renamed_dim='three') test_sel(('a',), range(4), replaced_idx=True) + test_sel('a', range(4), replaced_idx=True) test_sel([('a', 1, -1), ('b', 2, -2)], [0, 7]) test_sel(slice('a', 'b'), range(8)) test_sel(slice(('a', 1), ('b', 1)), range(6)) @@ -873,6 +874,12 @@ def test_sel(lab_indexer, pos_indexer, replaced_idx=False, 
self.assertDatasetIdentical(mdata.loc[{'x': {'one': 'a'}}], mdata.sel(x={'one': 'a'})) + self.assertDatasetIdentical(mdata.loc[{'x': 'a'}], + mdata.sel(x='a')) + self.assertDatasetIdentical(mdata.loc[{'x': ('a', 1)}], + mdata.sel(x=('a', 1))) + self.assertDatasetIdentical(mdata.loc[{'x': ('a', 1, -1)}], + mdata.sel(x=('a', 1, -1))) with self.assertRaises(KeyError): mdata.loc[{'one': 'a'}] diff --git a/xarray/test/test_indexing.py b/xarray/test/test_indexing.py index cb8e7c06323..1dca99ec99a 100644 --- a/xarray/test/test_indexing.py +++ b/xarray/test/test_indexing.py @@ -126,6 +126,8 @@ def test_indexer(data, x, expected_pos, expected_idx=None): test_indexer(mdata, ('a', 1), [True, True, False, False, False, False, False, False], [-1, -2]) + test_indexer(mdata, 'a', slice(0, 4, None), + pd.MultiIndex.from_product([[1, 2], [-1, -2]])) test_indexer(mdata, ('a',), [True, True, True, True, False, False, False, False], pd.MultiIndex.from_product([[1, 2], [-1, -2]]))