diff --git a/dace/sdfg/infer_types.py b/dace/sdfg/infer_types.py index cf58cf76cc..97010e95a7 100644 --- a/dace/sdfg/infer_types.py +++ b/dace/sdfg/infer_types.py @@ -116,8 +116,7 @@ def infer_connector_types(sdfg: SDFG): for e in state.out_edges(node): cname = e.src_conn if cname and node.out_connectors[cname] is None: - raise TypeError('Ambiguous or uninferable type in' - ' connector "%s" of node "%s"' % (cname, node)) + raise TypeError('Ambiguous or uninferable type in' ' connector "%s" of node "%s"' % (cname, node)) ############################################################################# @@ -301,6 +300,12 @@ def _set_default_schedule_in_scope(state: SDFGState, else: child_schedule = _determine_child_schedule(parent_schedules) + # Special case for dynamic thread-block neighboring schedules + if child_schedule == dtypes.ScheduleType.GPU_ThreadBlock: + from dace.transformation.helpers import gpu_map_has_explicit_dyn_threadblocks # Avoid import loops + if gpu_map_has_explicit_dyn_threadblocks(state, parent_node): + child_schedule = dtypes.ScheduleType.GPU_ThreadBlock_Dynamic + # Set child schedule type in scope for node in child_nodes[parent_node]: # Set default schedule types @@ -393,6 +398,7 @@ def _get_storage_from_parent(data_name: str, sdfg: SDFG) -> dtypes.StorageType: raise ValueError(f'Could not find data descriptor {data_name} in parent SDFG') + def infer_aliasing(node: nodes.NestedSDFG, sdfg: SDFG, state: SDFGState) -> None: """ Infers aliasing information on nested SDFG arrays based on external edges and connectors. 
diff --git a/dace/transformation/helpers.py b/dace/transformation/helpers.py
index 6ca4602079..b7bf49e62b 100644
--- a/dace/transformation/helpers.py
+++ b/dace/transformation/helpers.py
@@ -934,11 +934,7 @@ def replicate_scope(sdfg: SDFG, state: SDFGState, scope: ScopeSubgraphView) -> S
     return ScopeSubgraphView(state, new_nodes, new_entry)
 
 
-def offset_map(state: SDFGState,
-               entry: nodes.MapEntry,
-               dim: int,
-               offset: symbolic.SymbolicType,
-               negative: bool = True):
+def offset_map(state: SDFGState, entry: nodes.MapEntry, dim: int, offset: symbolic.SymbolicType, negative: bool = True):
     """
     Offsets a map parameter and its contents by a value.
 
@@ -1270,6 +1266,18 @@ def gpu_map_has_explicit_threadblocks(state: SDFGState, entry: nodes.EntryNode)
     return False
 
 
+def gpu_map_has_explicit_dyn_threadblocks(state: SDFGState, entry: nodes.EntryNode) -> bool:
+    """
+    Returns True if the given map scope has an explicit dynamic thread-block map nested within.
+
+    :param state: The state that contains the scope.
+    :param entry: The entry node of the scope to inspect.
+    :return: True if any scope returned by ``get_internal_scopes`` is scheduled as ``GPU_ThreadBlock_Dynamic``.
+    """
+    internal_maps = get_internal_scopes(state, entry)
+    return any(m.schedule == dtypes.ScheduleType.GPU_ThreadBlock_Dynamic for _, m in internal_maps)
+
+
 def reconnect_edge_through_map(
         state: SDFGState, edge: graph.MultiConnectorEdge[Memlet], new_node: Union[nodes.EntryNode, nodes.ExitNode],
         keep_src: bool) -> Tuple[graph.MultiConnectorEdge[Memlet], graph.MultiConnectorEdge[Memlet]]:
diff --git a/tests/dynamic_tb_map_cudatest.py b/tests/dynamic_tb_map_cudatest.py
index b24e5f2ea6..280fecab13 100644
--- a/tests/dynamic_tb_map_cudatest.py
+++ b/tests/dynamic_tb_map_cudatest.py
@@ -12,10 +12,8 @@
 
 @dace.program(dace.uint32[H + 1], dace.uint32[nnz], dace.float32[nnz], dace.float32[W], dace.float32[H])
 def spmv(A_row, A_col, A_val, x, b):
-
     @dace.mapscope(_[0:H])
     def compute_row(i):
-
         @dace.map(_[A_row[i]:A_row[i + 1]])
         def compute(j):
             a << A_val[j]
@@ -292,8 +290,34 @@ def sddvm(D_vals: dace.float32[nnz_D], A2_crd: dace.int32[nnz_A], A2_pos: dace.i
     assert np.allclose(val, ref.data)
 
 
+@pytest.mark.gpu
+def test_dynamic_default_schedule():
+    """
+    Runs a GPU_Device map that holds a shared-memory fill without an explicit schedule next to an explicit
+    GPU_ThreadBlock_Dynamic map, and checks the values written to the output array.
+    """
+    N = dace.symbol('N')
+
+    @dace.program
+    def tester(a: dace.float32[N, 10]):
+        A = dace.ndarray([N, 10], dtype=dace.float32, storage=dace.StorageType.GPU_Global)
+        A[:] = a
+        for i in dace.map[0:N] @ dace.ScheduleType.GPU_Device:
+            smem = np.empty((10, ), dtype=np.float32) @ dace.StorageType.GPU_Shared
+            smem[:] = 1
+            for j in dace.map[0:10] @ dace.ScheduleType.GPU_ThreadBlock_Dynamic:
+                A[i, j] = i * 10 + smem[j]
+        a[:] = A
+
+    a = np.zeros((65, 10), dtype=np.float32)
+    tester(a)
+    # smem is filled with ones, so every A[i, j] is expected to be i * 10 + 1 (independent of j)
+    assert np.allclose(a, np.fromfunction(lambda i, j: i * 10 + 1, (65, 10), dtype=np.float32))
+
+
 if __name__ == '__main__':
     test_dynamic_map()
     test_dynamic_maps()
     test_nested_dynamic_map()
     test_dynamic_map_with_step()
+    test_dynamic_default_schedule()