From a84ee9dd685e67b72b8ab0f1b51ecec95bd1a41b Mon Sep 17 00:00:00 2001
From: Federico Ficarelli <1379149+nazavode@users.noreply.github.com>
Date: Mon, 6 Nov 2023 13:19:21 +0100
Subject: [PATCH] Add kernels/dense/fused

---
 kernels/dense/8x8xf64/Makefile |  1 +
 kernels/dense/8x8xf64/fused.c  | 35 +++++++++++++++++++++++++++++++++++
 2 files changed, 36 insertions(+)
 create mode 100644 kernels/dense/8x8xf64/fused.c

diff --git a/kernels/dense/8x8xf64/Makefile b/kernels/dense/8x8xf64/Makefile
index 26ef0923..d51bb71b 100644
--- a/kernels/dense/8x8xf64/Makefile
+++ b/kernels/dense/8x8xf64/Makefile
@@ -4,6 +4,7 @@ include ../../../snitch/Makefile.rules
 
 TESTS =
 TESTS += baseline.x
+TESTS += fused.x
 TESTS += linalg.x
 
 include ../../Makefile.kernels
diff --git a/kernels/dense/8x8xf64/fused.c b/kernels/dense/8x8xf64/fused.c
new file mode 100644
index 00000000..05a29b4c
--- /dev/null
+++ b/kernels/dense/8x8xf64/fused.c
@@ -0,0 +1,35 @@
+#include "data.h"
+
+#include <stdint.h>
+
+// Fused variant of the baseline dense layer: the matrix product, the bias
+// addition and the ReLU are computed in a single pass per output element.
+// The restrict qualifiers promise that the four buffers do not overlap,
+// which lets the compiler elide redundant loads.
+//
+// All matrices are indexed row-major; M, N and K are not defined in this
+// file (presumably they come from data.h -- confirm).
+// * Inputs:  x[ M x K ]
+// * Weights: w[ K x N ]
+// * Biases:  b[ M x N ]
+// * Outputs: y[ M x N ]
+//
+// NOTE: each accumulator is seeded with the value already stored in y, so
+// the result is relu(Y0 + X W + B), where Y0 is the content of y on entry.
+// It reduces to relu(X W + B) only when the caller passes y zero-filled.
+void dense(const double* restrict x, const double* restrict w, const double* restrict b,
+           double* restrict y) {
+  // Y = relu(X W + B), accumulated on top of the incoming y (see NOTE)
+  for (uint32_t i = 0; i < M; ++i) {
+    for (uint32_t j = 0; j < N; ++j) {
+      // Accumulate in a local so that y[i][j] is stored exactly once.
+      double v = y[i * N + j];
+      for (uint32_t k = 0; k < K; ++k) {
+        v += x[i * K + k] * w[k * N + j];
+      }
+      v += b[i * N + j];
+      // ReLU: anything that is not strictly positive becomes zero.
+      y[i * N + j] = v > 0. ? v : 0.;
+    }
+  }
+}