-import re
-
+import numpy as np
 from dask.array.gufunc import _parse_gufunc_signature, _validate_normalize_axes
 from tlz import concat, merge, unique

+
 def apply_gufunc(
     func,
     signature,
     *args,
     axes=None,
     axis=None,
-    keepdims=False,
     output_dtypes=None,
-    output_sizes=None,
     vectorize=None,
-    allow_rechunk=False,
-    meta=None,
     **kwargs,
 ):
     """
     Apply a generalized ufunc or similar python function to arrays.

-    ``signature`` determines if the function consumes or produces core
-    dimensions. The remaining dimensions in given input arrays (``*args``)
-    are considered loop dimensions and are required to broadcast
-    naturally against each other.
-
-    In other terms, this function is like ``np.vectorize``, but for
-    the blocks of dask arrays. If the function itself shall also
-    be vectorized use ``vectorize=True`` for convenience.
-
-    Parameters
-    ----------
-    func : callable
-        Function to call like ``func(*args, **kwargs)`` on input arrays
-        (``*args``) that returns an array or tuple of arrays. If multiple
-        arguments with non-matching dimensions are supplied, this function is
-        expected to vectorize (broadcast) over axes of positional arguments in
-        the style of NumPy universal functions [1]_ (if this is not the case,
-        set ``vectorize=True``). If this function returns multiple outputs,
-        ``output_core_dims`` has to be set as well.
-    signature: string
-        Specifies what core dimensions are consumed and produced by ``func``.
-        According to the specification of numpy.gufunc signature [2]_
-    *args : numeric
-        Input arrays or scalars to the callable function.
-    axes: List of tuples, optional, keyword only
-        A list of tuples with indices of axes a generalized ufunc should operate on.
-        For instance, for a signature of ``"(i,j),(j,k)->(i,k)"`` appropriate for
-        matrix multiplication, the base elements are two-dimensional matrices
-        and these are taken to be stored in the two last axes of each argument. The
-        corresponding axes keyword would be ``[(-2, -1), (-2, -1), (-2, -1)]``.
-        For simplicity, for generalized ufuncs that operate on 1-dimensional arrays
-        (vectors), a single integer is accepted instead of a single-element tuple,
-        and for generalized ufuncs for which all outputs are scalars, the output
-        tuples can be omitted.
-    axis: int, optional, keyword only
-        A single axis over which a generalized ufunc should operate. This is a short-cut
-        for ufuncs that operate over a single, shared core dimension, equivalent to passing
-        in axes with entries of (axis,) for each single-core-dimension argument and ``()`` for
-        all others. For instance, for a signature ``"(i),(i)->()"``, it is equivalent to passing
-        in ``axes=[(axis,), (axis,), ()]``.
-    keepdims: bool, optional, keyword only
-        If this is set to True, axes which are reduced over will be left in the result as
-        a dimension with size one, so that the result will broadcast correctly against the
-        inputs. This option can only be used for generalized ufuncs that operate on inputs
-        that all have the same number of core dimensions and with outputs that have no core
-        dimensions, i.e., with signatures like ``"(i),(i)->()"`` or ``"(m,m)->()"``.
-        If used, the location of the dimensions in the output can be controlled with axes
-        and axis.
-    output_dtypes : Optional, dtype or list of dtypes, keyword only
-        Valid numpy dtype specification or list thereof.
-        If not given, a call of ``func`` with a small set of data
-        is performed in order to try to automatically determine the
-        output dtypes.
-    output_sizes : dict, optional, keyword only
-        Optional mapping from dimension names to sizes for outputs. Only used if
-        new core dimensions (not found on inputs) appear on outputs.
-    vectorize: bool, keyword only
-        If set to ``True``, ``np.vectorize`` is applied to ``func`` for
-        convenience. Defaults to ``False``.
-    allow_rechunk: Optional, bool, keyword only
-        Allows rechunking, otherwise chunk sizes need to match and core
-        dimensions are to consist only of one chunk.
-        Warning: enabling this can increase memory usage significantly.
-        Defaults to ``False``.
-    meta: Optional, tuple, keyword only
-        tuple of empty ndarrays describing the shape and dtype of the output of the gufunc.
-        Defaults to ``None``.
-    **kwargs : dict
-        Extra keyword arguments to pass to `func`
+    This is a cutdown version of the
+    `equivalent function <https://docs.dask.org/en/stable/generated/dask.array.gufunc.apply_gufunc.html>`_
+    in Dask. Refer there for usage information.

-    Returns
-    -------
-    Single dask.array.Array or tuple of dask.array.Array
+    Current limitations: ``keepdims``, ``output_sizes``, and ``allow_rechunk``
+    are not supported, and neither are multiple outputs.

-    Examples
-    --------
-    >>> import dask.array as da
-    >>> import numpy as np
-    >>> def stats(x):
-    ...     return np.mean(x, axis=-1), np.std(x, axis=-1)
-    >>> a = da.random.normal(size=(10,20,30), chunks=(5, 10, 30))
-    >>> mean, std = da.apply_gufunc(stats, "(i)->(),()", a)
-    >>> mean.compute().shape
-    (10, 20)
+    Cubed assumes that ``func`` will allocate a new output array. However, if it
+    allocates more memory than that, you need to tell Cubed by setting the
+    ``extra_required_mem`` parameter to the amount needed in bytes (per task).
+    """

+    # Currently the following parameters cannot be changed
+    keepdims = False
+    output_sizes = None
+    allow_rechunk = False

-    >>> def outer_product(x, y):
-    ...     return np.einsum("i,j->ij", x, y)
-    >>> a = da.random.normal(size=(20, 30), chunks=(10, 30))
-    >>> b = da.random.normal(size=(10, 1, 40), chunks=(5, 1, 40))
-    >>> c = da.apply_gufunc(outer_product, "(i),(j)->(i,j)", a, b, vectorize=True)
-    >>> c.compute().shape
-    (10, 20, 30, 40)
+    # based on dask's apply_gufunc

-    References
-    ----------
-    .. [1] https://docs.scipy.org/doc/numpy/reference/ufuncs.html
-    .. [2] https://docs.scipy.org/doc/numpy/reference/c-api/generalized-ufuncs.html
-    """
     # Input processing:
-    ## Signature
+
+    # Signature
     if not isinstance(signature, str):
         raise TypeError("`signature` has to be of type string")
-    # NumPy versions before https://github.com/numpy/numpy/pull/19627
-    # would not ignore whitespace characters in `signature` like they
-    # are supposed to. We remove the whitespace here as a workaround.
-    signature = re.sub(r"\s+", "", signature)
     input_coredimss, output_coredimss = _parse_gufunc_signature(signature)

-    ## Determine nout: nout = None for functions of one direct return; nout = int for return tuples
+    # Determine nout: nout = None for functions of one direct return; nout = int for return tuples
     nout = None if not isinstance(output_coredimss, list) else len(output_coredimss)

-    ## Consolidate onto `meta`
-    if meta is not None and output_dtypes is not None:
-        raise ValueError(
-            "Only one of `meta` and `output_dtypes` should be given (`meta` is preferred)."
+    if nout is not None:
+        raise NotImplementedError(
+            "Multiple outputs are not yet supported, see https://github.com/tomwhite/cubed/issues/69"
         )
-    if meta is None:
-        if output_dtypes is None:
-            ## Infer `output_dtypes`
-            if vectorize:
-                tempfunc = np.vectorize(func, signature=signature)
-            else:
-                tempfunc = func
-            output_dtypes = apply_infer_dtype(
-                tempfunc, args, kwargs, "apply_gufunc", "output_dtypes", nout
-            )
-
-        ## Turn `output_dtypes` into `meta`
-        if (
-            nout is None
-            and isinstance(output_dtypes, (tuple, list))
-            and len(output_dtypes) == 1
-        ):
-            output_dtypes = output_dtypes[0]
-        sample = args[0] if args else None
-        if nout is None:
-            meta = meta_from_array(sample, dtype=output_dtypes)
-        else:
-            meta = tuple(meta_from_array(sample, dtype=odt) for odt in output_dtypes)

-    ## Normalize `meta` format
-    meta = meta_from_array(meta)
-    if isinstance(meta, list):
-        meta = tuple(meta)
-
-    ## Validate `meta`
-    if nout is None:
-        if isinstance(meta, tuple):
-            if len(meta) == 1:
-                meta = meta[0]
-            else:
-                raise ValueError(
-                    "For a function with one output, must give a single item for `output_dtypes`/`meta`, "
-                    "not a tuple or list."
-                )
-    else:
-        if not isinstance(meta, tuple):
-            raise ValueError(
-                f"For a function with {nout} outputs, must give a tuple or list for `output_dtypes`/`meta`, "
-                "not a single item."
-            )
-        if len(meta) != nout:
-            raise ValueError(
-                f"For a function with {nout} outputs, must give a tuple or list of {nout} items for "
-                f"`output_dtypes`/`meta`, not {len(meta)}."
-            )
-
-    ## Vectorize function, if required
+    # Vectorize function, if required
     if vectorize:
-        otypes = [x.dtype for x in meta] if isinstance(meta, tuple) else [meta.dtype]
+        otypes = output_dtypes
         func = np.vectorize(func, signature=signature, otypes=otypes)

-    ## Miscellaneous
+    # Miscellaneous
     if output_sizes is None:
         output_sizes = {}

-    ## Axes
+    # Axes
     input_axes, output_axes = _validate_normalize_axes(
         axes, axis, keepdims, input_coredimss, output_coredimss
     )

     # Main code:
-    ## Cast all input arrays to dask
-    args = [asarray(a) for a in args]
+
+    # Cast all input arrays to cubed
+    # args = [asarray(a) for a in args]  # TODO: do we need to support casting?

     if len(input_coredimss) != len(args):
         raise ValueError(
             "According to `signature`, `func` requires %d arguments, but %s given"
             % (len(input_coredimss), len(args))
         )

-    ## Axes: transpose input arguments
-    transposed_args = []
-    for arg, iax in zip(args, input_axes):
-        shape = arg.shape
-        iax = tuple(a if a < 0 else a - len(shape) for a in iax)
-        tidc = tuple(i for i in range(-len(shape) + 0, 0) if i not in iax) + iax
-        transposed_arg = arg.transpose(tidc)
-        transposed_args.append(transposed_arg)
-    args = transposed_args
+    # Note (cubed): since we don't support allow_rechunk=True, there is no need to transpose args (and outputs back again)

-    ## Assess input args for loop dims
+    # Assess input args for loop dims
     input_shapes = [a.shape for a in args]
     input_chunkss = [a.chunks for a in args]
     num_loopdims = [len(s) - len(cd) for s, cd in zip(input_shapes, input_coredimss)]
@@ -238,12 +93,12 @@ def apply_gufunc(
         tuple("__loopdim%d__" % d for d in range(max_loopdims - n, max_loopdims))
         for n in num_loopdims
     ]
-    input_dimss = [l + c for l, c in zip(loop_input_dimss, input_coredimss)]
+    input_dimss = [lp + c for lp, c in zip(loop_input_dimss, input_coredimss)]

     loop_output_dims = max(loop_input_dimss, key=len) if loop_input_dimss else tuple()

-    ## Assess input args for same size and chunk sizes
-    ### Collect sizes and chunksizes of all dims in all arrays
+    # Assess input args for same size and chunk sizes
+    # Collect sizes and chunksizes of all dims in all arrays
     dimsizess = {}
     chunksizess = {}
     for dims, shape, chunksizes in zip(input_dimss, input_shapes, input_chunkss):
@@ -254,14 +109,14 @@ def apply_gufunc(
             chunksizes_ = chunksizess.get(dim, [])
             chunksizes_.append(chunksize)
             chunksizess[dim] = chunksizes_
-    ### Assert correct partitioning, for case:
+    # Assert correct partitioning, for case:
     for dim, sizes in dimsizess.items():
-        #### Check that the arrays have same length for same dimensions or dimension `1`
+        # Check that the arrays have same length for same dimensions or dimension `1`
         if set(sizes) | {1} != {1, max(sizes)}:
             raise ValueError(f"Dimension `'{dim}'` with different lengths in arrays")
         if not allow_rechunk:
             chunksizes = chunksizess[dim]
-            #### Check if core dimensions consist of only one chunk
+            # Check if core dimensions consist of only one chunk
             if (dim in core_shapes) and (chunksizes[0][0] < core_shapes[dim]):
                 raise ValueError(
                     "Core dimension `'{}'` consists of multiple chunks. To fix, rechunk into a single \
@@ -270,7 +125,7 @@ def apply_gufunc(
                         dim
                     )
                 )
-            #### Check if loop dimensions consist of same chunksizes, when they have sizes > 1
+            # Check if loop dimensions consist of same chunksizes, when they have sizes > 1
             relevant_chunksizes = list(
                 unique(c for s, c in zip(sizes, chunksizes) if s > 1)
             )
@@ -279,76 +134,12 @@ def apply_gufunc(
                     f"Dimension `'{dim}'` with different chunksize present"
                 )

-    ## Apply function - use blockwise here
+    # Apply function - use blockwise here
     arginds = list(concat(zip(args, input_dimss)))

-    ### Use existing `blockwise` but only with loopdims to enforce
-    ### concatenation for coredims that appear also at the output
-    ### Modifying `blockwise` could improve things here.
-    tmp = blockwise(
-        func, loop_output_dims, *arginds, concatenate=True, meta=meta, **kwargs
-    )
-
-    # NOTE: we likely could just use `meta` instead of `tmp._meta`,
-    # but we use it and validate it anyway just to be sure nothing odd has happened.
-    metas = tmp._meta
-    if nout is None:
-        assert not isinstance(
-            metas, (list, tuple)
-        ), f"meta changed from single output to multiple output during blockwise: {meta} -> {metas}"
-        metas = (metas,)
-    else:
-        assert isinstance(
-            metas, (list, tuple)
-        ), f"meta changed from multiple output to single output during blockwise: {meta} -> {metas}"
-        assert (
-            len(metas) == nout
-        ), f"Number of outputs changed from {nout} to {len(metas)} during blockwise"
-
-    ## Prepare output shapes
-    loop_output_shape = tmp.shape
-    loop_output_chunks = tmp.chunks
-    keys = list(flatten(tmp.__dask_keys__()))
-    name, token = keys[0][0].split("-")
-
-    ### *) Treat direct output
-    if nout is None:
-        output_coredimss = [output_coredimss]
-
-    ## Split output
-    leaf_arrs = []
-    for i, (ocd, oax, meta) in enumerate(zip(output_coredimss, output_axes, metas)):
-        core_output_shape = tuple(core_shapes[d] for d in ocd)
-        core_chunkinds = len(ocd) * (0,)
-        output_shape = loop_output_shape + core_output_shape
-        output_chunks = loop_output_chunks + core_output_shape
-        leaf_name = "%s_%d-%s" % (name, i, token)
-        leaf_dsk = {
-            (leaf_name,)
-            + key[1:]
-            + core_chunkinds: ((getitem, key, i) if nout else key)
-            for key in keys
-        }
-        graph = HighLevelGraph.from_collections(leaf_name, leaf_dsk, dependencies=[tmp])
-        meta = meta_from_array(meta, len(output_shape))
-        leaf_arr = Array(
-            graph, leaf_name, chunks=output_chunks, shape=output_shape, meta=meta
-        )
-
-        ### Axes:
-        if keepdims:
-            slices = len(leaf_arr.shape) * (slice(None),) + len(oax) * (np.newaxis,)
-            leaf_arr = leaf_arr[slices]
+    from cubed.core.ops import blockwise

-        tidcs = [None] * len(leaf_arr.shape)
-        for ii, oa in zip(range(-len(oax), 0), oax):
-            tidcs[oa] = ii
-        j = 0
-        for ii in range(len(tidcs)):
-            if tidcs[ii] is None:
-                tidcs[ii] = j
-                j += 1
-        leaf_arr = leaf_arr.transpose(tidcs)
-        leaf_arrs.append(leaf_arr)
+    # Note (cubed): use blockwise on all output dimensions, not just loop_output_dims like in original
+    out_ind = loop_output_dims + output_coredimss

-    return (*leaf_arrs,) if nout else leaf_arrs[0]  # Undo *) from above
+    return blockwise(func, out_ind, *arginds, dtype=output_dtypes, **kwargs)
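
For reference, a minimal usage sketch of the cutdown function, in the doctest style of the removed examples. This is a hypothetical example, not part of the commit: it assumes `apply_gufunc` is importable from `cubed.core.gufunc` and that `cubed.array_api.asarray` accepts a `chunks=` argument as elsewhere in the repo. Note that `output_dtypes` is passed straight through to `blockwise` as `dtype`, so it should be supplied, since the dtype-inference path was removed along with `meta`:

>>> import numpy as np
>>> import cubed.array_api as xp
>>> from cubed.core.gufunc import apply_gufunc
>>> def matmul(a, b):  # einsum with "..." broadcasts over the loop dimension
...     return np.einsum("...ij,...jk->...ik", a, b)
>>> a = xp.asarray(np.random.rand(4, 3, 2), chunks=(2, 3, 2))
>>> b = xp.asarray(np.random.rand(4, 2, 5), chunks=(2, 2, 5))
>>> c = apply_gufunc(matmul, "(i,j),(j,k)->(i,k)", a, b, output_dtypes=a.dtype)
>>> c.compute().shape
(4, 3, 5)

This respects the restrictions enforced above: each core dimension (i, j, k) fits in a single chunk, and the loop dimension is chunked identically (2) in both inputs, which matters because `allow_rechunk` is pinned to False.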
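To make the new `blockwise` call concrete, here is a worked trace of the index bookkeeping for the signature used in the sketch above. The values are written out by hand for illustration, following the code in the diff, not produced by running it:

# "(i,j),(j,k)->(i,k)" applied to two 3-d inputs: one loop dimension each
input_coredimss = [("i", "j"), ("j", "k")]
output_coredimss = ("i", "k")  # single output, so a plain tuple and nout is None
loop_input_dimss = [("__loopdim0__",), ("__loopdim0__",)]
input_dimss = [("__loopdim0__", "i", "j"), ("__loopdim0__", "j", "k")]
loop_output_dims = ("__loopdim0__",)
# dask calls blockwise with loop_output_dims only, then splits the core
# dimensions out of each block afterwards; this version instead hands
# blockwise the full output index set in one go:
out_ind = loop_output_dims + output_coredimss  # ("__loopdim0__", "i", "k")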