Commit 6a8bbf7

Avoid realizing a potentially very large RangeIndex into memory
xref #428
1 parent 9e82b66 commit 6a8bbf7
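
The motivation, sketched with illustrative numbers (none of these appear in the diff): a RangeIndex only stores start/stop/step, so returning it directly in `groups` is essentially free, whereas calling `.to_numpy()` on it realizes every value as an int64 array.

import pandas as pd

idx = pd.RangeIndex(10**9)
print(idx.memory_usage())  # a small constant: only start/stop/step are stored
# idx.to_numpy() would allocate roughly 8 GB of int64 values here; this commit
# avoids that by keeping the pd.Index in `groups` instead of converting it.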

2 files changed: +13 -5 lines
flox/core.py (+5 -5)

@@ -1602,7 +1602,7 @@ def dask_groupby_agg(
     engine: T_Engine = "numpy",
     sort: bool = True,
     chunks_cohorts=None,
-) -> tuple[DaskArray, tuple[np.ndarray | DaskArray]]:
+) -> tuple[DaskArray, tuple[pd.Index | np.ndarray | DaskArray]]:
     import dask.array
     from dask.array.core import slices_from_chunks
     from dask.highlevelgraph import HighLevelGraph
@@ -1730,7 +1730,7 @@ def dask_groupby_agg(
             group_chunks = ((np.nan,),)
         else:
             assert expected_groups is not None
-            groups = (expected_groups.to_numpy(),)
+            groups = (expected_groups,)
             group_chunks = ((len(expected_groups),),)

     elif method == "cohorts":
@@ -1846,7 +1846,7 @@ def cubed_groupby_agg(
     engine: T_Engine = "numpy",
     sort: bool = True,
     chunks_cohorts=None,
-) -> tuple[CubedArray, tuple[np.ndarray | CubedArray]]:
+) -> tuple[CubedArray, tuple[pd.Index | np.ndarray | CubedArray]]:
     import cubed
     import cubed.core.groupby

@@ -1882,7 +1882,7 @@ def _reduction_func(a, by, axis, start_group, num_groups):
         result = cubed.core.groupby.groupby_blockwise(
             array, by, axis=axis, func=_reduction_func, num_groups=num_groups
         )
-        groups = (expected_groups.to_numpy(),)
+        groups = (expected_groups,)
         return (result, groups)

     else:
@@ -1964,7 +1964,7 @@ def _groupby_aggregate(a, **kwargs):
             num_groups=num_groups,
         )

-        groups = (expected_groups.to_numpy(),)
+        groups = (expected_groups,)

         return (result, groups)

flox/dask_array_ops.py (+8)

@@ -4,14 +4,22 @@
 from itertools import product
 from numbers import Integral

+import pandas as pd
 from dask import config
+from dask.base import normalize_token
 from dask.blockwise import lol_tuples
 from toolz import partition_all

 from .lib import ArrayLayer
 from .types import Graph


+# workaround for https://github.com/dask/dask/issues/11862
+@normalize_token.register(pd.RangeIndex)
+def normalize_range_index(x):
+    return normalize_token(type(x)), x.start, x.stop, x.step, x.dtype, x.name
+
+
 # _tree_reduce and partial_reduce are copied from dask.array.reductions
 # They have been modified to work purely with graphs, and without creating new Array layers
 # in the graph. The `block_index` kwarg is new and avoids a concatenation by simply setting the right
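
A minimal sketch of what the normalize_token registration above buys, assuming it has been applied (e.g. by importing flox.dask_array_ops); tokenize is dask's public deterministic-hashing helper:

import pandas as pd
import flox.dask_array_ops  # noqa: F401  (runs the normalize_token registration above)
from dask.base import tokenize

idx = pd.RangeIndex(10**9)
# Only (type, start, stop, step, dtype, name) are hashed, so tokenizing stays cheap
# and never materializes the underlying values.
assert tokenize(idx) == tokenize(pd.RangeIndex(10**9))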
