Merge pull request #895 from spcl/npbench_tests_4

Npbench tests 4

TizianoDeMatteis authored Dec 16, 2021
2 parents 9aa71e7 + 3a82a3b commit 49e3f55
Showing 5 changed files with 482 additions and 23 deletions.
50 changes: 31 additions & 19 deletions dace/libraries/blas/nodes/gemv.py
@@ -10,6 +10,7 @@
 from dace.libraries.blas import blas_helpers
 from dace.frontend.common import op_repository as oprepo
 from dace.libraries.blas import environments
+from dace.sdfg import nodes, utils as sdutils
 import numpy as np
 import warnings
 
@@ -183,31 +184,42 @@ def expansion(node,
 
         node.validate(parent_sdfg, parent_state)
 
-        for e in parent_state.in_edges(node):
-            if e.dst_conn == "_A":
-                desc_a = parent_sdfg.arrays[e.data.data]
-            elif e.dst_conn == "_x":
-                desc_x = parent_sdfg.arrays[e.data.data]
-        for e in parent_state.out_edges(node):
-            if e.src_conn == "_y":
-                desc_y = parent_sdfg.arrays[e.data.data]
 
         sdfg = dace.SDFG("gemv")
         state = sdfg.add_state("gemv")
 
         alpha = node.alpha
         beta = node.beta
 
-        # Create local versions of input data nodes
-        desc_a = desc_a.clone()
-        desc_a.transient = False
-        sdfg.add_datadesc("_A", desc_a)
-        desc_x = desc_x.clone()
-        desc_x.transient = False
-        sdfg.add_datadesc("_x", desc_x)
-        desc_y = desc_y.clone()
-        desc_y.transient = False
-        sdfg.add_datadesc("_y", desc_y)
+        # Get input/output data (the method also considers the presence of view nodes)
+        ((edge_a, desc_a, shape_a, strides_a),
+         (edge_x, desc_x, shape_x, strides_x),
+         (edge_y, desc_y, shape_y, strides_y)) = _get_matmul_operands(
+             node, parent_state, parent_sdfg, name_lhs="_A", name_rhs="_x", name_out="_y")
+
+        # Create local versions of input/output data nodes
+        _, desc_a = sdfg.add_array("_A",
+                                   shape_a,
+                                   desc_a.dtype,
+                                   strides=strides_a,
+                                   storage=desc_a.storage,
+                                   transient=False)
+        _, desc_x = sdfg.add_array("_x",
+                                   shape_x,
+                                   desc_x.dtype,
+                                   strides=strides_x,
+                                   storage=desc_x.storage,
+                                   transient=False)
+        _, desc_y_y = sdfg.add_array("_y",
+                                     shape_y,
+                                     desc_y.dtype,
+                                     strides=strides_y,
+                                     storage=desc_y.storage,
+                                     transient=False)
 
         if node.transA and desc_a.dtype.veclen > 1:
             raise NotImplementedError(
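The point of switching to _get_matmul_operands is that shapes and strides now come from the operand actually connected to the node, not from a clone of the parent descriptor, which matters when an operand is accessed through a view. A minimal NumPy sketch of why (the arrays here are illustrative, not part of the library code):

import numpy as np

# A transposed view shares its parent's memory but swaps the strides.
A = np.zeros((4, 8), dtype=np.float32)
view = A.T
print(A.strides)     # (32, 4) -- row-major parent
print(view.strides)  # (4, 32) -- the view's actual layout

# Cloning only the parent descriptor (the old code path) would record A's
# strides; reading the operand through its edge, as _get_matmul_operands
# does, captures the layout of what is really wired to the library node.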
13 changes: 9 additions & 4 deletions dace/transformation/interstate/sdfg_nesting.py
@@ -1294,6 +1294,7 @@ def apply(self, sdfg):
                         replacement_limit_value = state.ranges[str(
                             to_solve_limit_value)][0][1] + 1
+
                         to_solve_limit_value = replacement_limit_value
 
                         # Range Initial value
@@ -1309,14 +1310,15 @@
 
                         # Note: here we are lenient. We can't evaluate the maximum of the two,
                         # since we don't know the value of symbols, therefore we only take the one
-                        # that is not negative
+                        # that is positive
 
                         newsz_limit = newsz.subs(
                             {s: replacement_limit_value})
                         newsz_initial = newsz.subs(
                             {s: replacement_initial_value})
-                        if newsz_limit.is_negative:
-                            if newsz_initial.is_negative:
+                        if newsz_limit.is_negative or newsz_limit.is_zero:
+                            if newsz_initial.is_negative or newsz_initial.is_zero:
                                 raise ValueError(
                                     f"Can not over-approximate shape for transient {node.data}"
                                 )
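The over-approximation above can be illustrated with a small self-contained SymPy sketch (the shape expression below is hypothetical): substitute the loop symbol's limit and initial values into the size expression and keep whichever result is positive.

import sympy

i = sympy.Symbol('i')
newsz = 10 - i  # hypothetical transient shape that depends on a loop symbol

# Substitute the loop's limit and initial values, as the code above does.
newsz_limit = newsz.subs({i: 10})   # evaluates to 0
newsz_initial = newsz.subs({i: 0})  # evaluates to 10

# Keep the substitution that yields a positive size.
if newsz_limit.is_negative or newsz_limit.is_zero:
    if newsz_initial.is_negative or newsz_initial.is_zero:
        raise ValueError("Cannot over-approximate shape")
    newsz = newsz_initial
print(newsz)  # 10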
@@ -1383,10 +1385,13 @@ def apply(self, sdfg):
                 mem.data = outputs[mem.data]
         outer_state = outer_sdfg.add_state(outer_sdfg.label)
 
-        # Clean up any remaining mentions of input nodes in the nested SDFG
+        # Clean up any remaining mentions of input/output nodes in the nested SDFG
         for before, after in inputs.items():
             nested_sdfg.replace(before, after)
 
+        for before, after in outputs.items():
+            nested_sdfg.replace(before, after)
+
         # Remove from the parent SDFG the symbols that are defined in the nested one
         defined_syms = set()
 
153 changes: 153 additions & 0 deletions tests/npbench/covariance_test.py
@@ -0,0 +1,153 @@
# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved.
# Original application code: NPBench - https://github.com/spcl/npbench

import dace.dtypes
import numpy as np
import dace as dc
import pytest
import argparse
from dace.fpga_testing import fpga_test
from dace.transformation.interstate import FPGATransformSDFG, InlineSDFG
from dace.transformation.dataflow import StreamingMemory, MapFusion, StreamingComposition, PruneConnectors
from dace.transformation.auto.auto_optimize import auto_optimize, fpga_auto_opt
from dace.libraries.standard import Reduce
from dace.libraries.blas import Gemv

M, N = (dc.symbol(s, dtype=dc.int32) for s in ('M', 'N'))


@dc.program
def covariance_kernel(float_n: dc.float32, data: dc.float32[N, M]):

    mean = np.mean(data, axis=0)
    np.subtract(data, mean, out=data)
    cov = np.zeros((M, M), dtype=data.dtype)
    for i in range(M):
        cov[i, i:M] = data[:, i] @ data[:, i:M] / (float_n - 1.0)
        cov[i:M, i] = cov[i, i:M]

    # for i in range(M):
    #     cov[i, i:M] = data[:, i] @ data[:, i:M]

    return cov


def ground_truth(M, N, float_n, data):

    mean = np.empty((M, ), dtype=data.dtype)
    for j in range(M):
        mean[j] = 0.0
        for i in range(N):
            mean[j] += data[i, j]
        mean[j] /= float_n

    for i in range(N):
        for j in range(M):
            data[i, j] -= mean[j]

    cov = np.empty((M, M), dtype=data.dtype)
    for i in range(M):
        for j in range(i, M):
            cov[i, j] = 0.0
            for k in range(N):
                cov[i, j] += data[k, i] * data[k, j]
            cov[i, j] /= float_n - 1.0
            cov[j, i] = cov[i, j]

    return cov


def init_data(M, N):

    float_n = np.float32(N)
    data = np.empty((N, M), dtype=np.float32)
    for i in range(N):
        for j in range(M):
            data[i, j] = (i * j) / M

    return float_n, data


def run_covariance(device_type: dace.dtypes.DeviceType):
    '''
    Runs Covariance for the given device
    :return: the SDFG
    '''

    # Initialize data (polybench small size)
    M, N = (80, 100)
    float_n, data = init_data(M, N)

    gt_data = np.copy(data)

    if device_type in {dace.dtypes.DeviceType.CPU, dace.dtypes.DeviceType.GPU}:
        # Parse the SDFG and apply auto-opt
        sdfg = covariance_kernel.to_sdfg()
        sdfg = auto_optimize(sdfg, device_type)
        dace_res = sdfg(float_n=float_n, data=data, M=M, N=N)

    elif device_type == dace.dtypes.DeviceType.FPGA:
        # Parse SDFG and apply FPGA friendly optimization
        sdfg = covariance_kernel.to_sdfg(strict=False)
        sdfg.apply_strict_transformations()
        applied = sdfg.apply_transformations([FPGATransformSDFG])
        assert applied == 1

        sdfg.apply_transformations([InlineSDFG])

        # Use FPGA expansion for library nodes, and expand them to enable further optimizations
        # Reduce.default_implementation = "FPGAPartialReduction"
        Gemv.default_implementation = "FPGA_Accumulate"

        sdfg.expand_library_nodes()
        sdfg.apply_transformations([InlineSDFG])

        # Other FPGA auto-opt
        fpga_auto_opt.fpga_global_to_local(sdfg)
        fpga_auto_opt.fpga_rr_interleave_containers_to_banks(sdfg)

        # Specialize the SDFG
        sdfg.specialize(dict(N=N, M=M))

        # Run the program
        dace_res = sdfg(float_n=float_n, data=data)

    # Compute ground truth and validate result
    gt_res = ground_truth(M, N, float_n, gt_data)
    assert np.allclose(gt_res, dace_res)
    return sdfg


def test_cpu():
    run_covariance(dace.dtypes.DeviceType.CPU)


@pytest.mark.gpu
def test_gpu():
    run_covariance(dace.dtypes.DeviceType.GPU)


@fpga_test(assert_ii_1=False)
def test_fpga():
    return run_covariance(dace.dtypes.DeviceType.FPGA)


if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument("-t",
                        "--target",
                        default='cpu',
                        choices=['cpu', 'gpu', 'fpga'],
                        help='Target platform')

    args = vars(parser.parse_args())
    target = args["target"]

    if target == "cpu":
        run_covariance(dace.dtypes.DeviceType.CPU)
    elif target == "gpu":
        run_covariance(dace.dtypes.DeviceType.GPU)
    elif target == "fpga":
        run_covariance(dace.dtypes.DeviceType.FPGA)
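The covariance definition used by covariance_kernel (column variables, N-1 normalization) can be cross-checked against NumPy directly; a hedged, self-contained sketch under the same sizes, not part of the test file:

import numpy as np

M, N = 80, 100
rng = np.random.default_rng(0)
data = rng.random((N, M)).astype(np.float32)

# Same computation as covariance_kernel, written densely.
centered = data - np.mean(data, axis=0)
cov = centered.T @ centered / (np.float32(N) - 1.0)

# np.cov with rowvar=False treats columns as variables and divides by N-1.
assert np.allclose(cov, np.cov(data, rowvar=False), atol=1e-4)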
133 changes: 133 additions & 0 deletions tests/npbench/lu_test.py
@@ -0,0 +1,133 @@
# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved.
# Original application code: NPBench - https://github.com/spcl/npbench

import dace.dtypes
import numpy as np
import dace as dc
import pytest
import argparse
from dace.fpga_testing import fpga_test
from dace.transformation.interstate import FPGATransformSDFG, InlineSDFG
from dace.transformation.dataflow import StreamingMemory, MapFusion, StreamingComposition, PruneConnectors
from dace.transformation.auto.auto_optimize import auto_optimize, fpga_auto_opt

N = dc.symbol('N', dtype=dc.int32)


@dc.program
def lu_kernel(A: dc.float32[N, N]):

    for i in range(N):
        for j in range(i):
            A[i, j] -= A[i, :j] @ A[:j, j]
            A[i, j] /= A[j, j]
        for j in range(i, N):
            A[i, j] -= A[i, :i] @ A[:i, j]


def ground_truth(N, A):

    for i in range(N):
        for j in range(i):
            A[i, j] -= A[i, :j] @ A[:j, j]
            A[i, j] /= A[j, j]
        for j in range(i, N):
            A[i, j] -= A[i, :i] @ A[:i, j]


def init_data(N):

    A = np.empty((N, N), dtype=np.float32)
    for i in range(N):
        for j in range(i + 1):
            A[i, j] = (-j % N) / N + 1
        for j in range(i + 1, N):
            A[i, j] = 0.0
        A[i, i] = 1.0

    B = np.empty((N, N), dtype=np.float32)
    B[:] = A @ np.transpose(A)
    A[:] = B

    return A


def run_lu(device_type: dace.dtypes.DeviceType):
    '''
    Runs LU for the given device
    :return: the SDFG
    '''

    # Initialize data (polybench mini size)
    N = 40
    A = init_data(N)
    gt_A = np.copy(A)

    if device_type in {dace.dtypes.DeviceType.CPU, dace.dtypes.DeviceType.GPU}:
        # Parse the SDFG and apply auto-opt
        sdfg = lu_kernel.to_sdfg()
        dace_res = sdfg(A=A, N=N)

    elif device_type == dace.dtypes.DeviceType.FPGA:
        # Parse SDFG and apply FPGA friendly optimization
        sdfg = lu_kernel.to_sdfg(strict=True)

        applied = sdfg.apply_transformations([FPGATransformSDFG])
        assert applied == 1

        # Use FPGA expansion for library nodes, and expand them to enable further optimizations
        from dace.libraries.blas import Dot
        platform = dace.config.Config.get("compiler", "fpga", "vendor")
        if platform == "intel_fpga":
            Dot.default_implementation = "FPGA_Accumulate"
        else:
            Dot.default_implementation = "FPGA_PartialSums"

        sdfg.expand_library_nodes()
        sdfg.apply_transformations_repeated([InlineSDFG])

        fpga_auto_opt.fpga_rr_interleave_containers_to_banks(sdfg)
        fpga_auto_opt.fpga_global_to_local(sdfg)

        sdfg.specialize(dict(N=N))
        dace_res = sdfg(A=A)

    # Compute ground truth and validate result
    ground_truth(N, gt_A)
    diff = np.linalg.norm(gt_A - A) / np.linalg.norm(gt_A)
    assert diff < 1e-5
    return sdfg


def test_cpu():
    run_lu(dace.dtypes.DeviceType.CPU)


@pytest.mark.gpu
def test_gpu():
    run_lu(dace.dtypes.DeviceType.GPU)


@fpga_test(assert_ii_1=False)
def test_fpga():
    return run_lu(dace.dtypes.DeviceType.FPGA)


if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument("-t",
                        "--target",
                        default='cpu',
                        choices=['cpu', 'gpu', 'fpga'],
                        help='Target platform')

    args = vars(parser.parse_args())
    target = args["target"]

    if target == "cpu":
        run_lu(dace.dtypes.DeviceType.CPU)
    elif target == "gpu":
        run_lu(dace.dtypes.DeviceType.GPU)
    elif target == "fpga":
        run_lu(dace.dtypes.DeviceType.FPGA)
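For reference, the in-place loop nest in lu_kernel is a unit-lower-triangular (Doolittle) LU factorization. A small hedged sketch (NumPy only, with a synthetic SPD input like the one init_data builds) that checks L @ U recovers the original matrix:

import numpy as np

N = 4
rng = np.random.default_rng(0)
B = rng.random((N, N))
A0 = B @ B.T + N * np.eye(N)  # symmetric positive definite input
A = A0.copy()

# Same loop nest as lu_kernel, factorizing in place.
for i in range(N):
    for j in range(i):
        A[i, j] -= A[i, :j] @ A[:j, j]
        A[i, j] /= A[j, j]
    for j in range(i, N):
        A[i, j] -= A[i, :i] @ A[:i, j]

L = np.tril(A, -1) + np.eye(N)  # unit-diagonal lower factor
U = np.triu(A)                  # upper factor
assert np.allclose(L @ U, A0)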