Merge pull request #895 from spcl/npbench_tests_4

Npbench tests 4

TizianoDeMatteis authored Dec 16, 2021
2 parents 9aa71e7 + 3a82a3b commit 49e3f55
Showing 5 changed files with 482 additions and 23 deletions.
50 changes: 31 additions & 19 deletions dace/libraries/blas/nodes/gemv.py
@@ -10,6 +10,7 @@
 from dace.libraries.blas import blas_helpers
 from dace.frontend.common import op_repository as oprepo
 from dace.libraries.blas import environments
+from dace.sdfg import nodes, utils as sdutils
 import numpy as np
 import warnings
 
@@ -183,31 +184,42 @@ def expansion(node,
 
         node.validate(parent_sdfg, parent_state)
 
-        for e in parent_state.in_edges(node):
-            if e.dst_conn == "_A":
-                desc_a = parent_sdfg.arrays[e.data.data]
-            elif e.dst_conn == "_x":
-                desc_x = parent_sdfg.arrays[e.data.data]
-        for e in parent_state.out_edges(node):
-            if e.src_conn == "_y":
-                desc_y = parent_sdfg.arrays[e.data.data]
 
         sdfg = dace.SDFG("gemv")
         state = sdfg.add_state("gemv")
 
         alpha = node.alpha
         beta = node.beta
 
-        # Create local versions of input data nodes
-        desc_a = desc_a.clone()
-        desc_a.transient = False
-        sdfg.add_datadesc("_A", desc_a)
-        desc_x = desc_x.clone()
-        desc_x.transient = False
-        sdfg.add_datadesc("_x", desc_x)
-        desc_y = desc_y.clone()
-        desc_y.transient = False
-        sdfg.add_datadesc("_y", desc_y)
+        # Get input/output data (the method also considers the presence of view nodes)
+        ((edge_a, desc_a, shape_a, strides_a),
+         (edge_x, desc_x, shape_x, strides_x),
+         (edge_y, desc_y, shape_y, strides_y)) = _get_matmul_operands(
+             node, parent_state, parent_sdfg, name_lhs="_A", name_rhs="_x", name_out="_y")
+
+        # Create local versions of input/output data nodes
+        _, desc_a = sdfg.add_array("_A",
+                                   shape_a,
+                                   desc_a.dtype,
+                                   strides=strides_a,
+                                   storage=desc_a.storage,
+                                   transient=False)
+        _, desc_x = sdfg.add_array("_x",
+                                   shape_x,
+                                   desc_x.dtype,
+                                   strides=strides_x,
+                                   storage=desc_x.storage,
+                                   transient=False)
+        _, desc_y_y = sdfg.add_array("_y",
+                                     shape_y,
+                                     desc_y.dtype,
+                                     strides=strides_y,
+                                     storage=desc_y.storage,
+                                     transient=False)
 
         if node.transA and desc_a.dtype.veclen > 1:
             raise NotImplementedError(
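The point of switching to _get_matmul_operands is that shapes and strides now come from the operand actually connected to the node, not from a clone of the parent descriptor, which matters when an operand is accessed through a view. A minimal NumPy sketch of why (the arrays here are illustrative, not part of the library code):

import numpy as np

# A transposed view shares its parent's memory but swaps the strides.
A = np.zeros((4, 8), dtype=np.float32)
view = A.T
print(A.strides)     # (32, 4) -- row-major parent
print(view.strides)  # (4, 32) -- the view's actual layout

# Cloning only the parent descriptor (the old code path) would record A's
# strides; reading the operand through its edge, as _get_matmul_operands
# does, captures the layout of what is really wired to the library node.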
13 changes: 9 additions & 4 deletions dace/transformation/interstate/sdfg_nesting.py
@@ -1294,6 +1294,7 @@ def apply(self, sdfg):
                         replacement_limit_value = state.ranges[str(
                             to_solve_limit_value)][0][1] + 1
+
                         to_solve_limit_value = replacement_limit_value
 
                         # Range Initial value
@@ -1309,14 +1310,15 @@
 
                         # Note: here we are lenient. We can't evaluate the maximum of the two,
                         # since we don't know the value of symbols, therefore we only take the one
-                        # that is not negative
+                        # that is positive
 
                         newsz_limit = newsz.subs(
                             {s: replacement_limit_value})
                         newsz_initial = newsz.subs(
                             {s: replacement_initial_value})
-                        if newsz_limit.is_negative:
-                            if newsz_initial.is_negative:
+                        if newsz_limit.is_negative or newsz_limit.is_zero:
+                            if newsz_initial.is_negative or newsz_initial.is_zero:
                                 raise ValueError(
                                     f"Can not over-approximate shape for transient {node.data}"
                                 )
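The over-approximation above can be illustrated with a small self-contained SymPy sketch (the shape expression below is hypothetical): substitute the loop symbol's limit and initial values into the size expression and keep whichever result is positive.

import sympy

i = sympy.Symbol('i')
newsz = 10 - i  # hypothetical transient shape that depends on a loop symbol

# Substitute the loop's limit and initial values, as the code above does.
newsz_limit = newsz.subs({i: 10})   # evaluates to 0
newsz_initial = newsz.subs({i: 0})  # evaluates to 10

# Keep the substitution that yields a positive size.
if newsz_limit.is_negative or newsz_limit.is_zero:
    if newsz_initial.is_negative or newsz_initial.is_zero:
        raise ValueError("Cannot over-approximate shape")
    newsz = newsz_initial
print(newsz)  # 10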
@@ -1383,10 +1385,13 @@ def apply(self, sdfg):
                 mem.data = outputs[mem.data]
         outer_state = outer_sdfg.add_state(outer_sdfg.label)
 
-        # Clean up any remaining mentions of input nodes in the nested SDFG
+        # Clean up any remaining mentions of input/output nodes in the nested SDFG
         for before, after in inputs.items():
             nested_sdfg.replace(before, after)
 
+        for before, after in outputs.items():
+            nested_sdfg.replace(before, after)
+
         # Remove from the parent SDFG the symbols that are defined in the nested one
         defined_syms = set()
 
153 changes: 153 additions & 0 deletions tests/npbench/covariance_test.py
@@ -0,0 +1,153 @@
# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved.
# Original application code: NPBench - https://github.com/spcl/npbench

import dace.dtypes
import numpy as np
import dace as dc
import pytest
import argparse
from dace.fpga_testing import fpga_test
from dace.transformation.interstate import FPGATransformSDFG, InlineSDFG
from dace.transformation.dataflow import StreamingMemory, MapFusion, StreamingComposition, PruneConnectors
from dace.transformation.auto.auto_optimize import auto_optimize, fpga_auto_opt
from dace.libraries.standard import Reduce
from dace.libraries.blas import Gemv

M, N = (dc.symbol(s, dtype=dc.int32) for s in ('M', 'N'))


@dc.program
def covariance_kernel(float_n: dc.float32, data: dc.float32[N, M]):

    mean = np.mean(data, axis=0)
    np.subtract(data, mean, out=data)
    cov = np.zeros((M, M), dtype=data.dtype)
    for i in range(M):
        cov[i, i:M] = data[:, i] @ data[:, i:M] / (float_n - 1.0)
        cov[i:M, i] = cov[i, i:M]

    # for i in range(M):
    #     cov[i, i:M] = data[:, i] @ data[:, i:M]

    return cov


def ground_truth(M, N, float_n, data):

    mean = np.empty((M, ), dtype=data.dtype)
    for j in range(M):
        mean[j] = 0.0
        for i in range(N):
            mean[j] += data[i, j]
        mean[j] /= float_n

    for i in range(N):
        for j in range(M):
            data[i, j] -= mean[j]

    cov = np.empty((M, M), dtype=data.dtype)
    for i in range(M):
        for j in range(i, M):
            cov[i, j] = 0.0
            for k in range(N):
                cov[i, j] += data[k, i] * data[k, j]
            cov[i, j] /= float_n - 1.0
            cov[j, i] = cov[i, j]

    return cov


def init_data(M, N):

    float_n = np.float32(N)
    data = np.empty((N, M), dtype=np.float32)
    for i in range(N):
        for j in range(M):
            data[i, j] = (i * j) / M

    return float_n, data


def run_covariance(device_type: dace.dtypes.DeviceType):
    '''
    Runs Covariance for the given device
    :return: the SDFG
    '''

    # Initialize data (polybench small size)
    M, N = (80, 100)
    float_n, data = init_data(M, N)

    gt_data = np.copy(data)

    if device_type in {dace.dtypes.DeviceType.CPU, dace.dtypes.DeviceType.GPU}:
        # Parse the SDFG and apply auto-opt
        sdfg = covariance_kernel.to_sdfg()
        sdfg = auto_optimize(sdfg, device_type)
        dace_res = sdfg(float_n=float_n, data=data, M=M, N=N)

    elif device_type == dace.dtypes.DeviceType.FPGA:
        # Parse SDFG and apply FPGA friendly optimization
        sdfg = covariance_kernel.to_sdfg(strict=False)
        sdfg.apply_strict_transformations()
        applied = sdfg.apply_transformations([FPGATransformSDFG])
        assert applied == 1

        sdfg.apply_transformations([InlineSDFG])

        # Use FPGA expansion for library nodes, and expand them to enable further optimizations
        # Reduce.default_implementation = "FPGAPartialReduction"
        Gemv.default_implementation = "FPGA_Accumulate"

        sdfg.expand_library_nodes()
        sdfg.apply_transformations([InlineSDFG])

        # Other FPGA auto-opt
        fpga_auto_opt.fpga_global_to_local(sdfg)
        fpga_auto_opt.fpga_rr_interleave_containers_to_banks(sdfg)

        # Specialize the SDFG
        sdfg.specialize(dict(N=N, M=M))

        # Run the program
        dace_res = sdfg(float_n=float_n, data=data)

    # Compute ground truth and validate result
    gt_res = ground_truth(M, N, float_n, gt_data)
    assert np.allclose(gt_res, dace_res)
    return sdfg


def test_cpu():
    run_covariance(dace.dtypes.DeviceType.CPU)


@pytest.mark.gpu
def test_gpu():
    run_covariance(dace.dtypes.DeviceType.GPU)


@fpga_test(assert_ii_1=False)
def test_fpga():
    return run_covariance(dace.dtypes.DeviceType.FPGA)


if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument("-t",
                        "--target",
                        default='cpu',
                        choices=['cpu', 'gpu', 'fpga'],
                        help='Target platform')

    args = vars(parser.parse_args())
    target = args["target"]

    if target == "cpu":
        run_covariance(dace.dtypes.DeviceType.CPU)
    elif target == "gpu":
        run_covariance(dace.dtypes.DeviceType.GPU)
    elif target == "fpga":
        run_covariance(dace.dtypes.DeviceType.FPGA)
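The covariance definition used by covariance_kernel (column variables, N-1 normalization) can be cross-checked against NumPy directly; a hedged, self-contained sketch under the same sizes, not part of the test file:

import numpy as np

M, N = 80, 100
rng = np.random.default_rng(0)
data = rng.random((N, M)).astype(np.float32)

# Same computation as covariance_kernel, written densely.
centered = data - np.mean(data, axis=0)
cov = centered.T @ centered / (np.float32(N) - 1.0)

# np.cov with rowvar=False treats columns as variables and divides by N-1.
assert np.allclose(cov, np.cov(data, rowvar=False), atol=1e-4)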
133 changes: 133 additions & 0 deletions tests/npbench/lu_test.py
@@ -0,0 +1,133 @@
# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved.
# Original application code: NPBench - https://github.com/spcl/npbench

import dace.dtypes
import numpy as np
import dace as dc
import pytest
import argparse
from dace.fpga_testing import fpga_test
from dace.transformation.interstate import FPGATransformSDFG, InlineSDFG
from dace.transformation.dataflow import StreamingMemory, MapFusion, StreamingComposition, PruneConnectors
from dace.transformation.auto.auto_optimize import auto_optimize, fpga_auto_opt

N = dc.symbol('N', dtype=dc.int32)


@dc.program
def lu_kernel(A: dc.float32[N, N]):

    for i in range(N):
        for j in range(i):
            A[i, j] -= A[i, :j] @ A[:j, j]
            A[i, j] /= A[j, j]
        for j in range(i, N):
            A[i, j] -= A[i, :i] @ A[:i, j]


def ground_truth(N, A):

    for i in range(N):
        for j in range(i):
            A[i, j] -= A[i, :j] @ A[:j, j]
            A[i, j] /= A[j, j]
        for j in range(i, N):
            A[i, j] -= A[i, :i] @ A[:i, j]


def init_data(N):

    A = np.empty((N, N), dtype=np.float32)
    for i in range(N):
        for j in range(i + 1):
            A[i, j] = (-j % N) / N + 1
        for j in range(i + 1, N):
            A[i, j] = 0.0
        A[i, i] = 1.0

    B = np.empty((N, N), dtype=np.float32)
    B[:] = A @ np.transpose(A)
    A[:] = B

    return A


def run_lu(device_type: dace.dtypes.DeviceType):
    '''
    Runs LU for the given device
    :return: the SDFG
    '''

    # Initialize data (polybench mini size)
    N = 40
    A = init_data(N)
    gt_A = np.copy(A)

    if device_type in {dace.dtypes.DeviceType.CPU, dace.dtypes.DeviceType.GPU}:
        # Parse the SDFG and apply auto-opt
        sdfg = lu_kernel.to_sdfg()
        dace_res = sdfg(A=A, N=N)

    elif device_type == dace.dtypes.DeviceType.FPGA:
        # Parse SDFG and apply FPGA friendly optimization
        sdfg = lu_kernel.to_sdfg(strict=True)

        applied = sdfg.apply_transformations([FPGATransformSDFG])
        assert applied == 1

        # Use FPGA expansion for library nodes, and expand them to enable further optimizations
        from dace.libraries.blas import Dot
        platform = dace.config.Config.get("compiler", "fpga", "vendor")
        if platform == "intel_fpga":
            Dot.default_implementation = "FPGA_Accumulate"
        else:
            Dot.default_implementation = "FPGA_PartialSums"

        sdfg.expand_library_nodes()
        sdfg.apply_transformations_repeated([InlineSDFG])

        fpga_auto_opt.fpga_rr_interleave_containers_to_banks(sdfg)
        fpga_auto_opt.fpga_global_to_local(sdfg)

        sdfg.specialize(dict(N=N))
        dace_res = sdfg(A=A)

    # Compute ground truth and validate result
    ground_truth(N, gt_A)
    diff = np.linalg.norm(gt_A - A) / np.linalg.norm(gt_A)
    assert diff < 1e-5
    return sdfg


def test_cpu():
    run_lu(dace.dtypes.DeviceType.CPU)


@pytest.mark.gpu
def test_gpu():
    run_lu(dace.dtypes.DeviceType.GPU)


@fpga_test(assert_ii_1=False)
def test_fpga():
    return run_lu(dace.dtypes.DeviceType.FPGA)


if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument("-t",
                        "--target",
                        default='cpu',
                        choices=['cpu', 'gpu', 'fpga'],
                        help='Target platform')

    args = vars(parser.parse_args())
    target = args["target"]

    if target == "cpu":
        run_lu(dace.dtypes.DeviceType.CPU)
    elif target == "gpu":
        run_lu(dace.dtypes.DeviceType.GPU)
    elif target == "fpga":
        run_lu(dace.dtypes.DeviceType.FPGA)
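For reference, the in-place loop nest in lu_kernel is a unit-lower-triangular (Doolittle) LU factorization. A small hedged sketch (NumPy only, with a synthetic SPD input like the one init_data builds) that checks L @ U recovers the original matrix:

import numpy as np

N = 4
rng = np.random.default_rng(0)
B = rng.random((N, N))
A0 = B @ B.T + N * np.eye(N)  # symmetric positive definite input
A = A0.copy()

# Same loop nest as lu_kernel, factorizing in place.
for i in range(N):
    for j in range(i):
        A[i, j] -= A[i, :j] @ A[:j, j]
        A[i, j] /= A[j, j]
    for j in range(i, N):
        A[i, j] -= A[i, :i] @ A[:i, j]

L = np.tril(A, -1) + np.eye(N)  # unit-diagonal lower factor
U = np.triu(A)                  # upper factor
assert np.allclose(L @ U, A0)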