Merge pull request #895 from spcl/npbench_tests_4
Npbench tests 4
Showing 5 changed files with 482 additions and 23 deletions.
@@ -0,0 +1,153 @@
# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved.
# Original application code: NPBench - https://github.com/spcl/npbench

import dace.dtypes
import numpy as np
import dace as dc
import pytest
import argparse
from dace.fpga_testing import fpga_test
from dace.transformation.interstate import FPGATransformSDFG, InlineSDFG
from dace.transformation.dataflow import StreamingMemory, MapFusion, StreamingComposition, PruneConnectors
from dace.transformation.auto.auto_optimize import auto_optimize, fpga_auto_opt
from dace.libraries.standard import Reduce
from dace.libraries.blas import Gemv

M, N = (dc.symbol(s, dtype=dc.int32) for s in ('M', 'N'))


@dc.program
def covariance_kernel(float_n: dc.float32, data: dc.float32[N, M]):
    # Center the columns of data, then build the symmetric covariance matrix
    # one row of the upper triangle at a time.
    mean = np.mean(data, axis=0)
    np.subtract(data, mean, out=data)
    cov = np.zeros((M, M), dtype=data.dtype)
    for i in range(M):
        cov[i, i:M] = data[:, i] @ data[:, i:M] / (float_n - 1.0)
        cov[i:M, i] = cov[i, i:M]

    return cov


def ground_truth(M, N, float_n, data):
    # Scalar reference implementation of the same computation.
    mean = np.empty((M, ), dtype=data.dtype)
    for j in range(M):
        mean[j] = 0.0
        for i in range(N):
            mean[j] += data[i, j]
        mean[j] /= float_n

    for i in range(N):
        for j in range(M):
            data[i, j] -= mean[j]

    cov = np.empty((M, M), dtype=data.dtype)
    for i in range(M):
        for j in range(i, M):
            cov[i, j] = 0.0
            for k in range(N):
                cov[i, j] += data[k, i] * data[k, j]
            cov[i, j] /= float_n - 1.0
            cov[j, i] = cov[i, j]

    return cov


def init_data(M, N):
    float_n = np.float32(N)
    data = np.empty((N, M), dtype=np.float32)
    for i in range(N):
        for j in range(M):
            data[i, j] = (i * j) / M

    return float_n, data


def run_covariance(device_type: dace.dtypes.DeviceType):
    '''
    Runs Covariance for the given device.
    :return: the SDFG
    '''

    # Initialize data (Polybench small size)
    M, N = (80, 100)
    float_n, data = init_data(M, N)

    gt_data = np.copy(data)

    if device_type in {dace.dtypes.DeviceType.CPU, dace.dtypes.DeviceType.GPU}:
        # Parse the SDFG and apply auto-optimization
        sdfg = covariance_kernel.to_sdfg()
        sdfg = auto_optimize(sdfg, device_type)
        dace_res = sdfg(float_n=float_n, data=data, M=M, N=N)

    elif device_type == dace.dtypes.DeviceType.FPGA:
        # Parse the SDFG and apply FPGA-friendly optimizations
        sdfg = covariance_kernel.to_sdfg(strict=False)
        sdfg.apply_strict_transformations()
        applied = sdfg.apply_transformations([FPGATransformSDFG])
        assert applied == 1

        sdfg.apply_transformations([InlineSDFG])

        # Use FPGA expansions for library nodes, and expand them to enable further optimizations
        # Reduce.default_implementation = "FPGAPartialReduction"
        Gemv.default_implementation = "FPGA_Accumulate"

        sdfg.expand_library_nodes()
        sdfg.apply_transformations([InlineSDFG])

        # Other FPGA auto-optimizations
        fpga_auto_opt.fpga_global_to_local(sdfg)
        fpga_auto_opt.fpga_rr_interleave_containers_to_banks(sdfg)

        # Specialize the SDFG (fix the symbolic sizes)
        sdfg.specialize(dict(N=N, M=M))

        # Run the program
        dace_res = sdfg(float_n=float_n, data=data)

    # Compute ground truth and validate the result
    gt_res = ground_truth(M, N, float_n, gt_data)
    assert np.allclose(gt_res, dace_res)
    return sdfg


def test_cpu():
    run_covariance(dace.dtypes.DeviceType.CPU)


@pytest.mark.gpu
def test_gpu():
    run_covariance(dace.dtypes.DeviceType.GPU)


@fpga_test(assert_ii_1=False)
def test_fpga():
    return run_covariance(dace.dtypes.DeviceType.FPGA)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-t",
                        "--target",
                        default='cpu',
                        choices=['cpu', 'gpu', 'fpga'],
                        help='Target platform')

    args = vars(parser.parse_args())
    target = args["target"]

    if target == "cpu":
        run_covariance(dace.dtypes.DeviceType.CPU)
    elif target == "gpu":
        run_covariance(dace.dtypes.DeviceType.GPU)
    elif target == "fpga":
        run_covariance(dace.dtypes.DeviceType.FPGA)
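For reference, the kernel above computes the sample covariance of the columns of data (divisor float_n - 1), which is exactly what NumPy's built-in np.cov returns for column variables. A minimal standalone sketch of that equivalence, not part of the commit, using float64 so the comparison is not limited by float32 accumulation error:

import numpy as np

# Same Polybench "small" sizes as run_covariance, but in float64.
M, N = 80, 100
float_n = float(N)
data = np.fromfunction(lambda i, j: (i * j) / M, (N, M), dtype=np.float64)

# The kernel's computation: center the columns, then form the scaled Gram matrix.
mean = data.mean(axis=0)
centered = data - mean
cov = centered.T @ centered / (float_n - 1.0)

# np.cov treats rows as variables by default; rowvar=False makes the columns
# the variables, matching the layout above, and its default ddof=1 gives the
# same N - 1 divisor.
assert np.allclose(cov, np.cov(data, rowvar=False))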
@@ -0,0 +1,133 @@
# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved.
# Original application code: NPBench - https://github.com/spcl/npbench

import dace.dtypes
import numpy as np
import dace as dc
import pytest
import argparse
from dace.fpga_testing import fpga_test
from dace.transformation.interstate import FPGATransformSDFG, InlineSDFG
from dace.transformation.dataflow import StreamingMemory, MapFusion, StreamingComposition, PruneConnectors
from dace.transformation.auto.auto_optimize import auto_optimize, fpga_auto_opt

N = dc.symbol('N', dtype=dc.int32)


@dc.program
def lu_kernel(A: dc.float32[N, N]):
    # In-place LU factorization of A (no pivoting).
    for i in range(N):
        for j in range(i):
            A[i, j] -= A[i, :j] @ A[:j, j]
            A[i, j] /= A[j, j]
        for j in range(i, N):
            A[i, j] -= A[i, :i] @ A[:i, j]


def ground_truth(N, A):
    # Same algorithm in plain NumPy, used as the reference.
    for i in range(N):
        for j in range(i):
            A[i, j] -= A[i, :j] @ A[:j, j]
            A[i, j] /= A[j, j]
        for j in range(i, N):
            A[i, j] -= A[i, :i] @ A[:i, j]


def init_data(N):
    A = np.empty((N, N), dtype=np.float32)
    for i in range(N):
        for j in range(i + 1):
            A[i, j] = (-j % N) / N + 1
        for j in range(i + 1, N):
            A[i, j] = 0.0
        A[i, i] = 1.0

    # Make the matrix symmetric positive definite
    B = np.empty((N, N), dtype=np.float32)
    B[:] = A @ np.transpose(A)
    A[:] = B

    return A


def run_lu(device_type: dace.dtypes.DeviceType):
    '''
    Runs LU for the given device.
    :return: the SDFG
    '''

    # Initialize data (Polybench mini size)
    N = 40
    A = init_data(N)
    gt_A = np.copy(A)

    if device_type in {dace.dtypes.DeviceType.CPU, dace.dtypes.DeviceType.GPU}:
        # Parse the SDFG (the kernel factorizes A in place)
        sdfg = lu_kernel.to_sdfg()
        sdfg(A=A, N=N)

    elif device_type == dace.dtypes.DeviceType.FPGA:
        # Parse the SDFG and apply FPGA-friendly optimizations
        sdfg = lu_kernel.to_sdfg(strict=True)

        applied = sdfg.apply_transformations([FPGATransformSDFG])
        assert applied == 1

        # Use FPGA expansions for library nodes, and expand them to enable further optimizations
        from dace.libraries.blas import Dot
        platform = dace.config.Config.get("compiler", "fpga", "vendor")
        if platform == "intel_fpga":
            Dot.default_implementation = "FPGA_Accumulate"
        else:
            Dot.default_implementation = "FPGA_PartialSums"

        sdfg.expand_library_nodes()
        sdfg.apply_transformations_repeated([InlineSDFG])

        fpga_auto_opt.fpga_rr_interleave_containers_to_banks(sdfg)
        fpga_auto_opt.fpga_global_to_local(sdfg)

        sdfg.specialize(dict(N=N))
        sdfg(A=A)

    # Compute ground truth and validate the result (both factorize in place)
    ground_truth(N, gt_A)
    diff = np.linalg.norm(gt_A - A) / np.linalg.norm(gt_A)
    assert diff < 1e-5
    return sdfg


def test_cpu():
    run_lu(dace.dtypes.DeviceType.CPU)


@pytest.mark.gpu
def test_gpu():
    run_lu(dace.dtypes.DeviceType.GPU)


@fpga_test(assert_ii_1=False)
def test_fpga():
    return run_lu(dace.dtypes.DeviceType.FPGA)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-t",
                        "--target",
                        default='cpu',
                        choices=['cpu', 'gpu', 'fpga'],
                        help='Target platform')

    args = vars(parser.parse_args())
    target = args["target"]

    if target == "cpu":
        run_lu(dace.dtypes.DeviceType.CPU)
    elif target == "gpu":
        run_lu(dace.dtypes.DeviceType.GPU)
    elif target == "fpga":
        run_lu(dace.dtypes.DeviceType.FPGA)
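The kernel above is an in-place LU factorization without pivoting (Doolittle form): after it runs, A holds a unit-lower-triangular L strictly below the diagonal and an upper-triangular U on and above it. A standalone sketch, not part of the commit, that unpacks the factors and checks L @ U against the original matrix (using an SPD input, as init_data does, so no pivoting is needed):

import numpy as np

def lu_inplace(A):
    # Same update scheme as lu_kernel/ground_truth above, in pure NumPy.
    n = A.shape[0]
    for i in range(n):
        for j in range(i):
            A[i, j] = (A[i, j] - A[i, :j] @ A[:j, j]) / A[j, j]
        for j in range(i, n):
            A[i, j] -= A[i, :i] @ A[:i, j]

rng = np.random.default_rng(0)
B = rng.standard_normal((40, 40))
A = B @ B.T + 40 * np.eye(40)  # SPD, analogous to init_data's A @ A.T construction
packed = A.copy()
lu_inplace(packed)

L = np.tril(packed, k=-1) + np.eye(40)  # L has an implicit unit diagonal
U = np.triu(packed)
assert np.allclose(L @ U, A)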