add groupby_bins method

rabernat · rabernat · commit 6cc1144de2f6 · 2016-05-14T22:42:59.000-04:00
diff --git a/xarray/core/common.py b/xarray/core/common.py
@@ -320,7 +320,7 @@ def pipe(self, func, *args, **kwargs):
         else:
             return func(self, *args, **kwargs)
 
-    def groupby(self, group, squeeze=True, bins=None):
+    def groupby(self, group, squeeze=True):
         """Returns a GroupBy object for performing grouped operations.
 
         Parameters
@@ -332,26 +332,67 @@ def groupby(self, group, squeeze=True, bins=None):
             If "group" is a dimension of any arrays in this dataset, `squeeze`
             controls whether the subarrays have a dimension of length 1 along
             that dimension or if the dimension is squeezed out.
-        bins : array-like, optional
-            If `bins` is specified, the groups will be discretized into the
-            specified bins determined by `pandas.cut` applied to the index of
-            `group`.
 
         Returns
         -------
         grouped : GroupBy
             A `GroupBy` object patterned after `pandas.GroupBy` that can be
             iterated over in the form of `(unique_value, grouped_array)` pairs.
-
-        See Also
-        --------
-        pandas.cut
         """
-        from .dataarray import DataArray
+        if isinstance(group, basestring):
+            group = self[group]
+        return self.groupby_cls(self, group, squeeze=squeeze)
+
+    def groupby_bins(self, group, bins, right=True, labels=None, precision=3,
+            include_lowest=False, squeeze=True):
+        """Returns a GroupBy object for performing grouped operations. Rather
+        than using all unique values of `group`, the values are discretized
+        first by applying `pandas.cut` [1]_ to `group`.
 
+        Parameters
+        ----------
+        group : str, DataArray or Coordinate
+            Array whose binned values should be used to group this array. If a
+            string, must be the name of a variable contained in this dataset.
+        bins : int or array of scalars
+            If bins is an int, it defines the number of equal-width bins in the
+            range of `group`. However, in this case, the range of `group` is
+            extended by .1% on each side to include its min or max values. If
+            bins is a sequence it defines the bin edges allowing for non-uniform
+            bin width. No extension of the range is done in this case.
+        right : boolean, optional
+            Indicates whether the bins include the rightmost edge or not. If
+            right == True (the default), then the bins [1,2,3,4] indicate
+            (1,2], (2,3], (3,4].
+        labels : array or boolean, default None
+            Used as labels for the resulting bins. Must be of the same length as
+            the resulting bins. If None (the default), string bin labels are
+            assigned by `pandas.cut`.
+        precision : int
+            The precision at which to store and display the bins labels.
+        include_lowest : bool
+            Whether the first interval should be left-inclusive or not.
+        squeeze : boolean, optional
+            If "group" is a dimension of any arrays in this dataset, `squeeze`
+            controls whether the subarrays have a dimension of length 1 along
+            that dimension or if the dimension is squeezed out.
+
+        Returns
+        -------
+        grouped : GroupBy
+            A `GroupBy` object patterned after `pandas.GroupBy` that can be
+            iterated over in the form of `(unique_value, grouped_array)` pairs.
+
+        References
+        ----------
+        .. [1] http://pandas.pydata.org/pandas-docs/stable/generated/pandas.cut.html
+        """
         if isinstance(group, basestring):
             group = self[group]
-        return self.groupby_cls(self, group, squeeze=squeeze, bins=bins)
+        return self.groupby_cls(self, group, squeeze=squeeze, bins=bins,
+                cut_kwargs={'right': right, 'labels': labels,
+                             'precision': precision,
+                             'include_lowest': include_lowest})
 
     def rolling(self, min_periods=None, center=False, **windows):
         """
diff --git a/xarray/core/groupby.py b/xarray/core/groupby.py
@@ -83,7 +83,8 @@ class GroupBy(object):
     Dataset.groupby
     DataArray.groupby
     """
-    def __init__(self, obj, group, squeeze=False, grouper=None, bins=None):
+    def __init__(self, obj, group, squeeze=False, grouper=None, bins=None,
+                    cut_kwargs={}):
         """Create a GroupBy object
 
         Parameters
@@ -101,6 +102,8 @@ def __init__(self, obj, group, squeeze=False, grouper=None, bins=None):
         bins : array-like, optional
             If `bins` is specified, the groups will be discretized into the
             specified bins by `pandas.cut`.
+        cut_kwargs : dict, optional
+            Extra keyword arguments to pass to `pandas.cut`
         """
         from .dataset import as_dataset
         from .dataarray import DataArray
@@ -138,7 +141,7 @@ def __init__(self, obj, group, squeeze=False, grouper=None, bins=None):
         if grouper is not None and bins is not None:
             raise TypeError("Can't specify both `grouper` and `bins`.")
         if bins is not None:
-            group = DataArray(pd.cut(group.values, bins),
+            group = DataArray(pd.cut(group.values, bins, **cut_kwargs),
                                 group.coords, name=group.name)
         if grouper is not None:
             index = safe_cast_to_index(group)
diff --git a/xarray/test/test_dataarray.py b/xarray/test/test_dataarray.py
@@ -1278,7 +1278,7 @@ def test_groupby_bins(self):
         expected = DataArray([1,5], dims='dim_0', coords={'dim_0': bin_coords})
         # the problem with this is that it overwrites the dimensions of array!
         #actual = array.groupby('dim_0', bins=bins).sum()
-        actual = array.groupby('dim_0', bins=bins).apply(
+        actual = array.groupby_bins('dim_0', bins).apply(
                                     lambda x : x.sum(), shortcut=False)
         self.assertDataArrayIdentical(expected, actual)
         # make sure original array dims are unchanged
@@ -1290,7 +1290,7 @@ def test_groupby_bins_multidim(self):
         bins = [0,15,20]
         bin_coords = ['(0, 15]', '(15, 20]']
         expected = DataArray([16, 40], dims='lat', coords={'lat': bin_coords})
-        actual = array.groupby('lat', bins=bins).apply(
+        actual = array.groupby_bins('lat', bins).apply(
                                     lambda x : x.sum(), shortcut=False)
         self.assertDataArrayIdentical(expected, actual)