diff --git a/doc/api.rst b/doc/api.rst
index dbca0e2563f..a625164988a 100644
--- a/doc/api.rst
+++ b/doc/api.rst
@@ -113,6 +113,7 @@ Indexing
    Dataset.reset_index
    Dataset.reorder_levels
 
+
 Computation
 -----------
 
@@ -127,6 +128,8 @@ Computation
    Dataset.resample
    Dataset.diff
    Dataset.quantile
+   Dataset.idxmin
+   Dataset.idxmax
 
 **Aggregation**:
 :py:attr:`~Dataset.all`
@@ -280,6 +283,10 @@ Computation
    DataArray.diff
    DataArray.dot
    DataArray.quantile
+   DataArray.idxmin
+   DataArray.idxmax
+   DataArray.indexes_min
+   DataArray.indexes_max
 
 **Aggregation**:
 :py:attr:`~DataArray.all`
@@ -294,6 +301,8 @@ Computation
 :py:attr:`~DataArray.sum`
 :py:attr:`~DataArray.std`
 :py:attr:`~DataArray.var`
+:py:attr:`~DataArray.indexes_min`
+:py:attr:`~DataArray.indexes_max`
 
 **Missing values**:
 :py:attr:`~DataArray.isnull`
diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index ac4fc507aa7..716578254c7 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -24,6 +24,15 @@ Enhancements
 Bug fixes
 ~~~~~~~~~
 
+- ``.argmin`` and ``.argmax`` now only support 1-dimensional arrays (or a
+  single reduction dimension). Instead, :py:class:`~xarray.DataArray` gains
+  ``.indexes_min`` and ``.indexes_max``, which return a Dataset mapping each
+  reduced dimension to a DataArray of minimum (or maximum) indexes.
+  ``.idxmin`` and ``.idxmax``, which work similarly to pandas'
+  ``.idxmin`` and ``.idxmax``, are also added to both ``DataArray`` and
+  ``Dataset``.
+  By `Keisuke Fujii `_.
+
 .. _whats-new.0.9.6:
 
 v0.9.6 (8 June 2017)
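For orientation, a minimal usage sketch of the API described above (the toy data and the expected indexes mirror the tests added later in this patch; note that, unlike pandas, these methods return positional integer indexes rather than coordinate labels)::

    import xarray as xr

    da = xr.DataArray([[1, 2], [-1, 40], [5, 6]],
                      [('x', ['c', 'b', 'a']), ('y', [1, 0])])

    da.indexes_min()          # Dataset with 0-d index variables: x=1, y=0
    da.indexes_max(dims='x')  # Dataset with variable 'x' = [2, 1] along 'y'
    da.idxmin(dim='x')        # DataArray [1, 0] along 'y': first minimum per column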
diff --git a/xarray/core/common.py b/xarray/core/common.py
index d61e2cdb15f..17d6971ad10 100644
--- a/xarray/core/common.py
+++ b/xarray/core/common.py
@@ -24,20 +24,28 @@ def wrapped_func(self, dim=None, axis=None, keep_attrs=False,
                          allow_lazy=True, **kwargs)
         return wrapped_func
 
-    _reduce_extra_args_docstring = \
-        """dim : str or sequence of str, optional
-        Dimension(s) over which to apply `{name}`.
-        axis : int or sequence of int, optional
-        Axis(es) over which to apply `{name}`. Only one of the 'dim'
-        and 'axis' arguments can be supplied. If neither are supplied, then
-        `{name}` is calculated over axes."""
-
-    _cum_extra_args_docstring = \
-        """dim : str or sequence of str, optional
-        Dimension over which to apply `{name}`.
-        axis : int or sequence of int, optional
-        Axis over which to apply `{name}`. Only one of the 'dim'
-        and 'axis' arguments can be supplied."""
+    _reduce_extra_args_docstring = """\
+dim : str or sequence of str, optional
+    Dimension(s) over which to apply `{name}`.
+axis : int or sequence of int, optional
+    Axis(es) over which to apply `{name}`. Only one of the 'dim'
+    and 'axis' arguments can be supplied. If neither are supplied, then
+    `{name}` is calculated over axes."""
+
+    _reduce1dim_extra_args_docstring = """\
+dim : str, optional
+    Dimension over which to apply `{name}`.
+axis : int, optional
+    Axis over which to apply `{name}`. Only one of the 'dim'
+    and 'axis' arguments can be supplied. If neither are supplied, the
+    data must be 1-dimensional."""
+
+    _cum_extra_args_docstring = """\
+dim : str or sequence of str, optional
+    Dimension over which to apply `{name}`.
+axis : int or sequence of int, optional
+    Axis over which to apply `{name}`. Only one of the 'dim'
+    and 'axis' arguments can be supplied."""
 
 
 class ImplementsDatasetReduce(object):
@@ -56,17 +64,22 @@ def wrapped_func(self, dim=None, keep_attrs=False, **kwargs):
                          **kwargs)
         return wrapped_func
 
-    _reduce_extra_args_docstring = \
-        """dim : str or sequence of str, optional
-        Dimension(s) over which to apply `{name}`. By default `{name}` is
-        applied over all dimensions."""
-
-    _cum_extra_args_docstring = \
-        """dim : str or sequence of str, optional
-        Dimension over which to apply `{name}`.
-        axis : int or sequence of int, optional
-        Axis over which to apply `{name}`. Only one of the 'dim'
-        and 'axis' arguments can be supplied."""
+    _reduce_extra_args_docstring = """\
+dim : str or sequence of str, optional
+    Dimension(s) over which to apply `{name}`. By default `{name}` is
+    applied over all dimensions."""
+
+    _reduce1dim_extra_args_docstring = """\
+dim : str, optional
+    Dimension over which to apply `{name}`. If not supplied, every
+    variable must be 1-dimensional."""
+
+    _cum_extra_args_docstring = """\
+dim : str or sequence of str, optional
+    Dimension over which to apply `{name}`.
+axis : int or sequence of int, optional
+    Axis over which to apply `{name}`. Only one of the 'dim'
+    and 'axis' arguments can be supplied."""
 
 
 class AbstractArray(ImplementsArrayReduce, formatting.ReprMixin):
diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py
index 8700446295c..db3a0c11a44 100644
--- a/xarray/core/dataarray.py
+++ b/xarray/core/dataarray.py
@@ -520,6 +520,89 @@ def indexes(self):
         """
         return Indexes(self._coords, self.sizes)
 
+    def idxmax(self, dim=None, skipna=True, keep_dims=False):
+        """Return indexes of the maximum values along a given dimension.
+
+        Parameters
+        ----------
+        dim : str, optional
+            Dimension along which the maximum index is taken.
+        skipna : bool, optional
+            Exclude NA/null values. If an entire row/column is NA, the
+            result will be the first index.
+        keep_dims : bool, optional
+            If True, the given dimension is kept with size one.
+
+        Returns
+        -------
+        idx : DataArray
+            DataArray holding the index of the first occurrence of the
+            maximum value along the given dimension.
+        """
+        ds = self._to_temp_dataset().idxmax(dim, skipna, keep_dims)
+        return self._from_temp_dataset(ds)
+
+    def idxmin(self, dim=None, skipna=True, keep_dims=False):
+        """Return indexes of the minimum values along a given dimension.
+
+        Parameters
+        ----------
+        dim : str, optional
+            Dimension along which the minimum index is taken.
+        skipna : bool, optional
+            Exclude NA/null values. If an entire row/column is NA, the
+            result will be the first index.
+        keep_dims : bool, optional
+            If True, the given dimension is kept with size one.
+
+        Returns
+        -------
+        idx : DataArray
+            DataArray holding the index of the first occurrence of the
+            minimum value along the given dimension.
+        """
+        ds = self._to_temp_dataset().idxmin(dim, skipna, keep_dims)
+        return self._from_temp_dataset(ds)
+
+    def _indexes_min_max(self, func, dims, skipna):
+        """Shared implementation of indexes_min and indexes_max."""
+        arg_dict = getattr(self.variable, func)(dims, skipna=skipna)
+
+        variables = OrderedDict()
+        for key, item in arg_dict.items():
+            coords = {d: self.coords[d] for d in item.dims}
+            variables[key] = DataArray(item, dims=item.dims, name=key,
+                                       coords=coords)
+        return Dataset(variables)
+
+    def indexes_min(self, dims=None, skipna=True):
+        """Return indexes of the minimum values along the given dimension(s).
+
+        Parameters
+        ----------
+        dims : str or sequence of str, optional
+            Dimension(s) along which the minimum indexes are taken.
+        skipna : bool, optional
+            Exclude NA/null values. If an entire row/column is NA, the
+            result will be the first index.
+
+        Returns
+        -------
+        indexes : Dataset
+            Dataset mapping dimension names to minimum indexes.
+        """
+        return self._indexes_min_max('indexes_min', dims, skipna)
+
+    def indexes_max(self, dims=None, skipna=True):
+        """Return indexes of the maximum values along the given dimension(s).
+
+        Parameters
+        ----------
+        dims : str or sequence of str, optional
+            Dimension(s) along which the maximum indexes are taken.
+        skipna : bool, optional
+            Exclude NA/null values. If an entire row/column is NA, the
+            result will be the first index.
+
+        Returns
+        -------
+        indexes : Dataset
+            Dataset mapping dimension names to maximum indexes.
+        """
+        return self._indexes_min_max('indexes_max', dims, skipna)
+
     @property
     def coords(self):
         """Dictionary-like container of coordinate arrays.
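As a sketch of the DataArray-level behaviour implemented above (illustrative only; the toy array is the same one used in the tests), reducing over every dimension at once yields one 0-d index per dimension, from which the location of the overall minimum can be recovered::

    import xarray as xr

    da = xr.DataArray([[1, 2], [-1, 40], [5, 6]],
                      [('x', ['c', 'b', 'a']), ('y', [1, 0])])
    loc = da.indexes_min(dims=['x', 'y'])      # x=1, y=0
    da.isel(x=int(loc['x']), y=int(loc['y']))  # -> -1, the overall minimum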
+ """ + return self._indexes_min_max('indexes_max', dims, skipna) + @property def coords(self): """Dictionary-like container of coordinate arrays. diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 28524134474..9ea4f954dbc 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -819,6 +819,81 @@ def indexes(self): """ return Indexes(self._variables, self._dims) + def _idx_min_max(self, func, dim, skipna, keep_dims): + """Methods both for idxmin and idxmin""" + if dim is not None and not isinstance(dim, basestring): + raise ValueError('dim should be a string (not array) ' + dim) + + if dim is None and keep_dims: # The reduced dim should be identical. + dim_set = set([v.dims[0] for k, v in iteritems(self._variables) + if len(v.dims)==1]) + if len(dim_set) > 1: + raise ValueError('with keep_dims option, the reduced index' + ' cannot be different in ' + func + '.') + if len(dim_set) == 1: + dim = dim_set[0] + + variables = OrderedDict() + coord_names = [] + for k, v in iteritems(self._variables): + if dim is None and len(v.dims) > 1: + raise ValueError('dim should be specified for more than ' + '1-dimensional array ' + k) + + if k in self._coord_names: # Do not change coordinates + if not keep_dims or k != dim: + variables[k] = v + coord_names.append(k) + elif len(v.dims) == 0: + variables[k] = v + elif dim is not None and dim not in v.dims: + variables[k] = v + else: + d = dim or v.dims[0] + if d in v.dims: + variables[k] = getattr(v, func)(d, skipna, keep_dims)[d] + return self._replace_vars_and_dims(variables, set(coord_names)) + + def idxmax(self, dim=None, skipna=True, keep_dims=False): + """Return indexes of the maximum values along a given dimension. + + Parameters + ---------- + dim : string + Which dimension the maximum index is taken. + skipna: boolean + Exclude NA/null values. If an entire row/column is NA, the result + will be first index. + keep_dims: bool + If True, the given dimension is kept with size one. + + Returns + ------- + idx : DataArray + DataArray which stores the first occurence of the maximum index + """ + return self._idx_min_max('indexes_max', dim, skipna, keep_dims) + + def idxmin(self, dim=None, skipna=True, keep_dims=False): + """Return indexes of the maximum values along a given dimension. + + Parameters + ---------- + dim : string + Which dimension the minimum index is taken. + skipna: boolean + Exclude NA/null values. If an entire row/column is NA, the result + will be first index. + keep_dims: bool + If True, the given dimension is kept with size one. 
diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py
index 66d44ce547b..80b0a254c82 100644
--- a/xarray/core/duck_array_ops.py
+++ b/xarray/core/duck_array_ops.py
@@ -167,7 +167,7 @@ def _ignore_warnings_if(condition):
 
 def _create_nan_agg_method(name, numeric_only=False, np_compat=False,
                            no_bottleneck=False, coerce_strings=False,
-                           keep_dims=False):
+                           keep_dims=False, only_1dim=False):
     def f(values, axis=None, skipna=None, **kwargs):
         # ignore keyword args inserted by np.mean and other numpy aggregators
         # automatically:
@@ -176,6 +176,13 @@ def f(values, axis=None, skipna=None, **kwargs):
 
         values = asarray(values)
 
+        if only_1dim:
+            if ((axis is None and values.ndim > 1) or
+                    (hasattr(axis, '__len__') and len(axis) > 1)):
+                raise ValueError('Method %s is only applicable to '
+                                 '1-dimensional data (or with a single dim '
+                                 'argument).' % name)
+
         if coerce_strings and values.dtype.kind in 'SU':
             values = values.astype(object)
 
@@ -214,13 +221,14 @@ def f(values, axis=None, skipna=None, **kwargs):
                        'or newer to use skipna=True or skipna=None' % name)
                 raise NotImplementedError(msg)
 
     f.numeric_only = numeric_only
+    f.only_1dim = only_1dim
     f.keep_dims = keep_dims
     f.__name__ = name
     return f
 
 
-argmax = _create_nan_agg_method('argmax', coerce_strings=True)
-argmin = _create_nan_agg_method('argmin', coerce_strings=True)
+argmax = _create_nan_agg_method('argmax', coerce_strings=True, only_1dim=True)
+argmin = _create_nan_agg_method('argmin', coerce_strings=True, only_1dim=True)
 max = _create_nan_agg_method('max', coerce_strings=True)
 min = _create_nan_agg_method('min', coerce_strings=True)
 sum = _create_nan_agg_method('sum', numeric_only=True)
diff --git a/xarray/core/ops.py b/xarray/core/ops.py
index 45cc6ae80a6..5a535f21d72 100644
--- a/xarray/core/ops.py
+++ b/xarray/core/ops.py
@@ -81,6 +81,7 @@
 
 _REDUCE_DOCSTRING_TEMPLATE = """\
 Reduce this {cls}'s data by applying `{name}` along some dimension(s).
+{only_1dim}
 
 Parameters
 ----------
@@ -105,6 +106,11 @@
     indicated dimension(s) removed.
 """
 
+_REDUCE_ONLY1DIM_DOCSTRING = """\
+The data must be 1-dimensional, or a single `dim` (or `axis`) argument must
+be passed.
+"""
+
 _ROLLING_REDUCE_DOCSTRING_TEMPLATE = """\
 Reduce this {da_or_ds}'s data windows by applying `{name}` along its dimension.
@@ -206,11 +212,18 @@ def inject_reduce_methods(cls):
                [('count', duck_array_ops.count, False)])
     for name, f, include_skipna in methods:
         numeric_only = getattr(f, 'numeric_only', False)
+        only_1dim = getattr(f, 'only_1dim', False)
+        only_1dim_doc = _REDUCE_ONLY1DIM_DOCSTRING if only_1dim else ''
+        if only_1dim:
+            extra_args = cls._reduce1dim_extra_args_docstring.format(name=name)
+        else:
+            extra_args = cls._reduce_extra_args_docstring.format(name=name)
+
         func = cls._reduce_method(f, include_skipna, numeric_only)
         func.__name__ = name
         func.__doc__ = _REDUCE_DOCSTRING_TEMPLATE.format(
             name=name, cls=cls.__name__,
-            extra_args=cls._reduce_extra_args_docstring.format(name=name))
+            only_1dim=only_1dim_doc, extra_args=extra_args)
         setattr(cls, name, func)
 
 
diff --git a/xarray/core/variable.py b/xarray/core/variable.py
index ad4836b930f..77d63d69819 100644
--- a/xarray/core/variable.py
+++ b/xarray/core/variable.py
@@ -1143,6 +1143,55 @@ def real(self):
     def imag(self):
         return type(self)(self.dims, self.data.imag, self._attrs)
 
+    def _indexes_min_max(self, funcname, dims, skipna, keep_dims):
+        """Return the indexes of the minimum or maximum along `dims`, as an
+        OrderedDict of Variables keyed by dimension name.
+
+        `dims` should be None, a str, or a sequence of str.
+        `funcname` is one of ['argmin', 'argmax'].
+        """
+        if dims is None:
+            dims = self.dims
+        if isinstance(dims, basestring):
+            dims = [dims]
+
+        invalid = [k for k in dims if k not in self.dims]
+        if invalid:
+            raise ValueError("dimensions %r do not exist" % invalid)
+
+        # sort dims into the same order as self.dims
+        dims = [d for d in self.dims if d in dims]
+
+        # shape of the dropped dims (along which argmin/argmax is applied)
+        drop_shape = [self.shape[i] for i, d in enumerate(self.dims)
+                      if d in dims]
+        drop_size = np.prod(drop_shape)
+        # the rest of the dimensions, self.dims - dims
+        kept_dims = [d for d in self.dims if d not in dims]
+        kept_shape = [self.shape[i] for i, d in enumerate(self.dims)
+                      if d not in dims]
+
+        transposed = self.transpose(*(kept_dims + dims))
+        flattened = transposed.data.reshape(kept_shape + [drop_size])
+        flattened_args = getattr(duck_array_ops, funcname)(
+            flattened, axis=-1, skipna=skipna)
+        args = np.unravel_index(flattened_args, drop_shape)
+
+        arg_dict = OrderedDict()
+        for i, d in enumerate(dims):
+            arg_dict[d] = type(self)(kept_dims, args[i])
+            if keep_dims:
+                arg_dict[d] = arg_dict[d].set_dims(self.dims)
+
+        return arg_dict
+
+    def indexes_min(self, dims=None, skipna=True, keep_dims=False):
+        return self._indexes_min_max('argmin', dims=dims, skipna=skipna,
+                                     keep_dims=keep_dims)
+
+    def indexes_max(self, dims=None, skipna=True, keep_dims=False):
+        return self._indexes_min_max('argmax', dims=dims, skipna=skipna,
+                                     keep_dims=keep_dims)
+
     def __array_wrap__(self, obj, context=None):
         return Variable(self.dims, obj)
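The core reduction in ``Variable._indexes_min_max`` above works by moving the reduced dimensions to the end, flattening them into a single axis, taking ``argmin``/``argmax`` there, and unravelling the flat index back into one index per dimension. A plain-NumPy sketch of that idea (not taken from the patch)::

    import numpy as np

    data = np.random.RandomState(0).randn(2, 3, 4)  # dims ('a', 'b', 'c')
    kept_shape, drop_shape = (2,), (3, 4)           # reduce over 'b' and 'c'

    # 'b' and 'c' are already the trailing axes, so no transpose is needed here
    flat = data.reshape(kept_shape + (-1,))         # shape (2, 12)
    flat_arg = flat.argmin(axis=-1)                 # one flat index per 'a'
    b_idx, c_idx = np.unravel_index(flat_arg, drop_shape)
    # b_idx[i], c_idx[i] locate the minimum of data[i] for each 'a'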
diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py
index bff0f10d89a..e7948dd6d31 100644
--- a/xarray/tests/test_dataarray.py
+++ b/xarray/tests/test_dataarray.py
@@ -2567,6 +2567,27 @@ def test_sortby(self):
         actual = da.sortby(['x', 'y'])
         self.assertDataArrayEqual(actual, expected)
 
+    def test_indexes_min(self):
+        da = DataArray([[1, 2], [-1, 40], [5, 6]],
+                       [('x', ['c', 'b', 'a']), ('y', [1, 0])])
+
+        actual = da.indexes_min()
+        expected = {'x': DataArray(1, name='x'), 'y': DataArray(0, name='y')}
+        self.assertDataArrayIdentical(actual['x'], expected['x'])
+        self.assertDataArrayIdentical(actual['y'], expected['y'])
+
+        actual = da.indexes_max()
+        expected = {'x': DataArray(1, name='x'), 'y': DataArray(1, name='y')}
+        self.assertDataArrayIdentical(actual['x'], expected['x'])
+        self.assertDataArrayIdentical(actual['y'], expected['y'])
+
+        actual = da.indexes_min(dims='x')
+        expected = Dataset({'x': DataArray([1, 0], dims=['y'],
+                                           coords={'y': da['y']})})
+        self.assertDatasetIdentical(actual, expected)
+
+        actual = da.indexes_max(dims='x')
+        expected = Dataset({'x': DataArray([2, 1], dims=['y'],
+                                           coords={'y': da['y']})})
+        self.assertDatasetIdentical(actual, expected)
+
 
 @pytest.fixture(params=[1])
 def da(request):
diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py
index 92f86f4fcf3..c9545973eb8 100644
--- a/xarray/tests/test_dataset.py
+++ b/xarray/tests/test_dataset.py
@@ -2861,6 +2861,36 @@ def test_reduce_keep_attrs(self):
         for k, v in ds.data_vars.items():
             self.assertEqual(v.attrs, data[k].attrs)
 
+    def test_reduce_1dim(self):
+        """Test reduce methods that accept only 1-dimensional data."""
+        data = create_test_data()
+        with self.assertRaisesRegexp(ValueError,
+                                     "Method argmax is only applicable to "):
+            data.argmax()
+
+        with self.assertRaisesRegexp(ValueError,
+                                     "Method argmin is only applicable to "):
+            data.argmin()
+
+        with self.assertRaisesRegexp(ValueError,
+                                     "Method argmin is only applicable to "):
+            data.argmin(dim=['dim1', 'dim2'])
+
+        with self.assertRaisesRegexp(ValueError,
+                                     "Method argmax is only applicable to "):
+            data.argmax(dim=['dim1', 'dim2'])
+
+        data['var1'][3, 2] = 1000.0
+        actual = data.argmax(dim='dim1')['var1']
+        self.assertTrue(actual[2].item() == 3)
+        data['var1'][3, 2] = -1000.0
+        actual = data.argmin(dim='dim1')['var1']
+        self.assertTrue(actual[2].item() == 3)
+
+        data = data.expand_dims('dim4', -1)
+        actual = data.argmin(dim='dim1')['var1']
+        self.assertTrue(actual[2].item() == 3)
+
     def test_reduce_argmin(self):
         # regression test for #205
         ds = Dataset({'a': ('x', [0, 1])})
@@ -3419,6 +3449,77 @@ def test_sortby(self):
         actual = ds.sortby(['x', 'y'], ascending=False)
         self.assertDatasetEqual(actual, ds)
 
+    def test_idxmin_max(self):
+        ds = Dataset({'A': DataArray([[7, 2], [3, 4], [5, 6]],
+                                     [('x', ['c', 'b', 'a']),
+                                      ('y', [1, 0])]),
+                      'B': DataArray([[5, 11], [7, 8], [9, 10]],
+                                     dims=['x', 'y'])})
+        actual = ds.idxmax(dim='x')
+        expected = Dataset({'A': DataArray([0, 2], [('y', [1, 0])]),
+                            'B': DataArray([2, 0], dims=['y'])},
+                           coords={'x': ['c', 'b', 'a']})
+        self.assertDataArrayIdentical(actual['x'], expected['x'])
+        self.assertDatasetIdentical(actual, expected)
+
+        actual = ds.idxmin(dim='x')
+        expected = Dataset({'A': DataArray([1, 0], [('y', [1, 0])]),
+                            'B': DataArray([0, 1], dims=['y'])},
+                           coords={'x': ['c', 'b', 'a']})
+        self.assertDataArrayIdentical(actual['x'], expected['x'])
+        self.assertDatasetIdentical(actual, expected)
+
+        actual = ds.idxmax(dim='x', keep_dims=True)
+        expected = Dataset({'A': DataArray([[0, 2]], dims=['x', 'y']),
+                            'B': DataArray([[2, 0]], dims=['x', 'y'])},
+                           coords={'y': [1, 0]})
+        self.assertDatasetIdentical(actual, expected)
+
+        with self.assertRaisesRegexp(ValueError, 'dim should be specified'):
+            ds.idxmin()
+
+    def test_idxmin_max_1dim(self):
+        ds = Dataset({'A': DataArray([1, 0, 2], [('x', ['c', 'b', 'a'])]),
+                      'B': DataArray([1, 0, 2], [('y', [1, 2, 3])])})
+        actual = ds.idxmax()
+        expected = Dataset({'A': 2, 'B': 2},
+                           coords={'x': ['c', 'b', 'a'], 'y': [1, 2, 3]})
+        self.assertDatasetIdentical(actual, expected)
+
+        actual = ds.idxmax('x')
+
+        # Here we only compare array values because actual['A'] does not
+        # have the coordinate 'x' (actual['A'] is 0-dimensional).
+        self.assertTrue(np.allclose(actual['A'].values,
+                                    ds['A'].idxmax().values))
+        self.assertDataArrayIdentical(actual['B'], ds['B'])
+        self.assertDataArrayIdentical(actual['x'], ds['x'])
+        self.assertDataArrayIdentical(actual['y'], ds['y'])
+
+        with self.assertRaisesRegexp(ValueError, 'with keep_dims option'):
+            actual = ds.idxmax(keep_dims=True)
+
+        actual = ds.idxmax('x', keep_dims=True)
+        expected = Dataset({'A': DataArray([2], dims=['x']),
+                            'B': DataArray([1, 0, 2], [('y', [1, 2, 3])])})
+        self.assertDatasetIdentical(actual, expected)
+
+    def test_idxmin_max_skipna(self):
+        ds = Dataset({'A': DataArray([[7, np.nan], [np.nan, 4], [5, 6]],
+                                     [('x', ['c', 'b', 'a']),
+                                      ('y', [1, 0])]),
+                      'B': DataArray([[5, 11], [7, 8], [np.nan, 10]],
+                                     dims=['x', 'y'])})
+        actual = ds.idxmin(dim='x')
+        expected = Dataset({'A': DataArray([2, 1], [('y', [1, 0])]),
+                            'B': DataArray([0, 1], dims=['y'])},
+                           coords={'x': ['c', 'b', 'a']})
+        self.assertDatasetIdentical(actual, expected)
+
+        actual = ds.idxmin(dim='x', skipna=False)
+        expected = Dataset({'A': DataArray([1, 0], [('y', [1, 0])]),
+                            'B': DataArray([2, 1], dims=['y'])},
+                           coords={'x': ['c', 'b', 'a']})
+        self.assertDatasetIdentical(actual, expected)
+
 
 # Py.test tests
diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py
index f5d207d0978..cbe3a4eaa9c 100644
--- a/xarray/tests/test_variable.py
+++ b/xarray/tests/test_variable.py
@@ -1087,6 +1087,55 @@ def test_count(self):
         actual = Variable(['x', 'y'], [[1, 0, np.nan], [1, 1, 1]]).count('y')
         self.assertVariableIdentical(expected, actual)
 
+    def test_argmin_max(self):
+        d = self.d  # shape [10, 3]
+        d[2, 1] = -1000.0
+        v = Variable(['time', 'x'], d)
+        argdict = v.indexes_min()
+        self.assertTrue(argdict['time'] == 2)
+        self.assertTrue(argdict['x'] == 1)
+        # make sure the order of the arguments does not change the result
+        argdict = v.indexes_min(['x', 'time'])
+        self.assertTrue(argdict['time'] == 2)
+        self.assertTrue(argdict['x'] == 1)
+
+        d[2, 1] = 1000.0
+        v = Variable(['time', 'x'], d)
+        argdict = v.indexes_max()
+        self.assertTrue(argdict['time'] == 2)
+        self.assertTrue(argdict['x'] == 1)
+        self.assertTrue('time' not in argdict['time'].dims)
+
+        argdict = v.indexes_max(keep_dims=True)
+        self.assertTrue('time' in argdict['time'].dims)
+
+        d[2, 0] = -1000.0
+        d[2, 1] = -1000.0
+        d[3, 2] = -1000.0
+        v = Variable(['time', 'x'], d)
+        argdict = v.indexes_min('time')
+        self.assertTrue(np.allclose(argdict['time'], [2, 2, 3]))
+        argdict = v.indexes_min('time', keep_dims=True)
+        self.assertTrue('time' in argdict['time'].dims)
+
+        d[2, 0] = 1000.0
+        d[2, 1] = 1000.0
+        d[3, 2] = 1000.0
+        v = Variable(['time', 'x'], d)
+        argdict = v.indexes_max('time')
+        self.assertTrue(np.allclose(argdict['time'], [2, 2, 3]))
+
+        with self.assertRaisesRegexp(ValueError, 'dimensions'):
+            v.indexes_max(['space'])
+
+        # numpy array with order 'F'
+        d = np.random.randn(30).reshape(10, 3, order='F')
+        d[2, 1] = -1000.0
+        v = Variable(['time', 'x'], d)
+        argdict = v.indexes_min()
+        self.assertTrue(argdict['time'] == 2)
+        self.assertTrue(argdict['x'] == 1)
+
 
 class TestIndexVariable(TestCase, VariableSubclassTestCases):
     cls = staticmethod(IndexVariable)