From d923516ac34d5fb0f34ce6a28ca6bd0d517e9fec Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson
Date: Thu, 11 Nov 2021 07:49:16 -0700
Subject: [PATCH] sve - initial SVE backend framework

---
 Makefile                           |  13 +++
 README.md                          |   4 +
 backends/ceed-backend-list.h       |   2 +
 backends/sve/ceed-sve-blocked.c    |  41 ++++++++
 backends/sve/ceed-sve-serial.c     |  42 ++++++++
 backends/sve/ceed-sve-tensor-f32.c | 153 +++++++++++++++++++++++++++++
 backends/sve/ceed-sve-tensor-f64.c | 153 +++++++++++++++++++++++++++++
 backends/sve/ceed-sve.h            |  13 +++
 8 files changed, 421 insertions(+)
 create mode 100644 backends/sve/ceed-sve-blocked.c
 create mode 100644 backends/sve/ceed-sve-serial.c
 create mode 100644 backends/sve/ceed-sve-tensor-f32.c
 create mode 100644 backends/sve/ceed-sve-tensor-f64.c
 create mode 100644 backends/sve/ceed-sve.h

diff --git a/Makefile b/Makefile
index 85c5a9d481..d635cf9c31 100644
--- a/Makefile
+++ b/Makefile
@@ -276,6 +276,7 @@ blocked.c := $(sort $(wildcard backends/blocked/*.c))
 ceedmemcheck.c := $(sort $(wildcard backends/memcheck/*.c))
 opt.c := $(sort $(wildcard backends/opt/*.c))
 avx.c := $(sort $(wildcard backends/avx/*.c))
+sve.c := $(sort $(wildcard backends/sve/*.c))
 xsmm.c := $(sort $(wildcard backends/xsmm/*.c))
 cuda.c := $(sort $(wildcard backends/cuda/*.c))
 cuda.cpp := $(sort $(wildcard backends/cuda/*.cpp))
@@ -347,6 +348,7 @@ info:
 	$(info ------------------------------------)
 	$(info MEMCHK_STATUS = $(MEMCHK_STATUS)$(call backend_status,$(MEMCHK_BACKENDS)))
 	$(info AVX_STATUS = $(AVX_STATUS)$(call backend_status,$(AVX_BACKENDS)))
+	$(info SVE_STATUS = $(SVE_STATUS)$(call backend_status,$(SVE_BACKENDS)))
 	$(info XSMM_DIR = $(XSMM_DIR)$(call backend_status,$(XSMM_BACKENDS)))
 	$(info OCCA_DIR = $(OCCA_DIR)$(call backend_status,$(OCCA_BACKENDS)))
 	$(info MAGMA_DIR = $(MAGMA_DIR)$(call backend_status,$(MAGMA_BACKENDS)))
@@ -400,6 +402,17 @@ ifneq ($(AVX),)
   BACKENDS_MAKE += $(AVX_BACKENDS)
 endif
 
+# SVE Backends
+SVE_STATUS = Disabled
+SVE_FLAG := $(if $(filter clang,$(CC_VENDOR)),+sve,-msve)
+SVE := $(filter $(SVE_FLAG),$(shell $(CC) $(CFLAGS:-M%=) -v -E -x c /dev/null 2>&1))
+SVE_BACKENDS = /cpu/self/sve/serial /cpu/self/sve/blocked
+ifneq ($(SVE),)
+  SVE_STATUS = Enabled
+  libceed.c += $(sve.c)
+  BACKENDS_MAKE += $(SVE_BACKENDS)
+endif
+
 # Collect list of libraries and paths for use in linking and pkg-config
 PKG_LIBS =
 # Stubs that will not be RPATH'd
diff --git a/README.md b/README.md
index 45f3a08489..89b5a369ca 100644
--- a/README.md
+++ b/README.md
@@ -155,6 +155,8 @@ There are multiple supported backends, which can be selected at runtime in the e
 | `/cpu/self/opt/blocked` | Blocked optimized C implementation | Yes |
 | `/cpu/self/avx/serial` | Serial AVX implementation | Yes |
 | `/cpu/self/avx/blocked` | Blocked AVX implementation | Yes |
+| `/cpu/self/sve/serial` | Serial SVE implementation | Yes |
+| `/cpu/self/sve/blocked` | Blocked SVE implementation | Yes |
 ||
 | **CPU Valgrind** |
 | `/cpu/self/memcheck/*` | Memcheck backends, undefined value checks | Yes |
@@ -200,6 +202,8 @@ The `/cpu/self/opt/*` backends are written in pure C and use partial e-vectors t
 
 The `/cpu/self/avx/*` backends rely upon AVX instructions to provide vectorized CPU performance.
 
+The `/cpu/self/sve/*` backends rely upon Arm SVE instructions to provide vectorized CPU performance.
+
 The `/cpu/self/memcheck/*` backends rely upon the [Valgrind](https://valgrind.org/) Memcheck tool to help verify that user QFunctions have no undefined values. To use, run your code with Valgrind and the Memcheck backends, e.g. `valgrind ./build/ex1 -ceed /cpu/self/ref/memcheck`. A 'development' or 'debugging' version of Valgrind with headers is required to use this backend.
diff --git a/backends/ceed-backend-list.h b/backends/ceed-backend-list.h
index 75b1d1fe75..3d40a2c6e2 100644
--- a/backends/ceed-backend-list.h
+++ b/backends/ceed-backend-list.h
@@ -31,5 +31,7 @@ CEED_BACKEND(CeedRegister_Opt_Blocked, 1, "/cpu/self/opt/blocked")
 CEED_BACKEND(CeedRegister_Opt_Serial, 1, "/cpu/self/opt/serial")
 CEED_BACKEND(CeedRegister_Ref, 1, "/cpu/self/ref/serial")
 CEED_BACKEND(CeedRegister_Ref_Blocked, 1, "/cpu/self/ref/blocked")
+CEED_BACKEND(CeedRegister_Sve_Serial, 1, "/cpu/self/sve/serial")
+CEED_BACKEND(CeedRegister_Sve_Blocked, 1, "/cpu/self/sve/blocked")
 CEED_BACKEND(CeedRegister_Xsmm_Blocked, 1, "/cpu/self/xsmm/blocked")
 CEED_BACKEND(CeedRegister_Xsmm_Serial, 1, "/cpu/self/xsmm/serial")
diff --git a/backends/sve/ceed-sve-blocked.c b/backends/sve/ceed-sve-blocked.c
new file mode 100644
index 0000000000..18cfadc385
--- /dev/null
+++ b/backends/sve/ceed-sve-blocked.c
@@ -0,0 +1,41 @@
+// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED: http://github.com/ceed
+
+#include <ceed.h>
+#include <ceed/backend.h>
+#include <stdbool.h>
+#include <string.h>
+
+#include "ceed-sve.h"
+
+//------------------------------------------------------------------------------
+// Backend Init
+//------------------------------------------------------------------------------
+static int CeedInit_Sve(const char *resource, Ceed ceed) {
+  Ceed ceed_ref;
+
+  CeedCheck(!strcmp(resource, "/cpu/self") || !strcmp(resource, "/cpu/self/sve") || !strcmp(resource, "/cpu/self/sve/blocked"), ceed,
+            CEED_ERROR_BACKEND, "SVE backend cannot use resource: %s", resource);
+  CeedCallBackend(CeedSetDeterministic(ceed, true));
+
+  // Create reference CEED that implementation will be dispatched through unless overridden
+  CeedCallBackend(CeedInit("/cpu/self/opt/blocked", &ceed_ref));
+  CeedCallBackend(CeedSetDelegate(ceed, ceed_ref));
+
+  if (CEED_SCALAR_TYPE == CEED_SCALAR_FP64) {
+    CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", CeedTensorContractCreate_f64_Sve));
+  } else {
+    CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", CeedTensorContractCreate_f32_Sve));
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Backend Register
+//------------------------------------------------------------------------------
+CEED_INTERN int CeedRegister_Sve_Blocked(void) { return CeedRegister("/cpu/self/sve/blocked", CeedInit_Sve, 30); }
+//------------------------------------------------------------------------------
diff --git a/backends/sve/ceed-sve-serial.c b/backends/sve/ceed-sve-serial.c
new file mode 100644
index 0000000000..1519ae60e9
--- /dev/null
+++ b/backends/sve/ceed-sve-serial.c
@@ -0,0 +1,42 @@
+// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED: http://github.com/ceed
+
+#include <ceed.h>
+#include <ceed/backend.h>
+#include <stdbool.h>
+#include <string.h>
+
+#include "ceed-sve.h"
+
+//------------------------------------------------------------------------------
+// Backend Init
+//------------------------------------------------------------------------------
+static int CeedInit_Sve(const char *resource, Ceed ceed) {
+  Ceed ceed_ref;
+
+  CeedCheck(!strcmp(resource, "/cpu/self") || !strcmp(resource, "/cpu/self/sve/serial"), ceed, CEED_ERROR_BACKEND,
+            "SVE backend cannot use resource: %s", resource);
+  CeedCallBackend(CeedSetDeterministic(ceed, true));
+
+  // Create reference CEED that implementation will be dispatched through unless overridden
+  CeedCallBackend(CeedInit("/cpu/self/opt/serial", &ceed_ref));
+  CeedCallBackend(CeedSetDelegate(ceed, ceed_ref));
+
+  if (CEED_SCALAR_TYPE == CEED_SCALAR_FP64) {
+    CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", CeedTensorContractCreate_f64_Sve));
+  } else {
+    CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", CeedTensorContractCreate_f32_Sve));
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Backend Register
+//------------------------------------------------------------------------------
+CEED_INTERN int CeedRegister_Sve_Serial(void) { return CeedRegister("/cpu/self/sve/serial", CeedInit_Sve, 35); }
+
+//------------------------------------------------------------------------------
diff --git a/backends/sve/ceed-sve-tensor-f32.c b/backends/sve/ceed-sve-tensor-f32.c
new file mode 100644
index 0000000000..73c41c9a7d
--- /dev/null
+++ b/backends/sve/ceed-sve-tensor-f32.c
@@ -0,0 +1,153 @@
+// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED: http://github.com/ceed
+
+#include <ceed.h>
+#include <ceed/backend.h>
+#ifdef __ARM_FEATURE_SVE
+#include <arm_sve.h>
+#endif
+#include <stdbool.h>
+
+#include "ceed-sve.h"
+
+//------------------------------------------------------------------------------
+// Blocked Tensor Contract
+//------------------------------------------------------------------------------
+static inline int CeedTensorContract_Sve_Blocked(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const float *restrict t,
+                                                 CeedTransposeMode t_mode, const CeedInt add, const float *restrict u, float *restrict v,
+                                                 const CeedInt JJ) {
+  CeedInt t_stride_0 = B, t_stride_1 = 1;
+
+  if (t_mode == CEED_TRANSPOSE) {
+    t_stride_0 = 1;
+    t_stride_1 = J;
+  }
+
+  for (CeedInt a = 0; a < A; a++) {
+    for (CeedInt b = 0; b < B; b++) {
+      // Blocks of JJ rows
+      for (CeedInt j = 0; j < (J / JJ) * JJ; j += JJ) {
+        for (CeedInt jj = 0; jj < JJ; jj++) {  // unroll
+          // Explicit SVE vectorization over C
+          for (int32_t c = 0; c < C; c += svcntw()) {
+            svbool_t pg = svwhilelt_b32(c, C);
+            // Load u, v into vectors
+            svfloat32_t u_vec = svld1(pg, &u[(a * B + b) * C + c]);
+            svfloat32_t v_vec = svld1(pg, &v[(a * J + j + jj) * C + c]);
+            // Basis matrix value
+            float tq = t[(j + jj) * t_stride_0 + b * t_stride_1];
+
+            // fmadd
+            svst1(pg, &v[(a * J + j + jj) * C + c], svmla_x(pg, v_vec, u_vec, tq));
+          }
+        }
+      }
+    }
+  }
+
+  // Remainder of rows
+  const CeedInt j = (J / JJ) * JJ;
+
+  if (j < J) {
+    for (CeedInt a = 0; a < A; a++) {
+      for (CeedInt b = 0; b < B; b++) {
+        // Blocks of JJ rows
+        for (CeedInt jj = 0; jj < J - j; jj++) {  // not unrolled
+          // Explicit SVE vectorization over C
+          for (int32_t c = 0; c < C; c += svcntw()) {
+            svbool_t pg = svwhilelt_b32(c, C);
+            // Load u, v into vectors
+            svfloat32_t u_vec = svld1(pg, &u[(a * B + b) * C + c]);
+            svfloat32_t v_vec = svld1(pg, &v[(a * J + j + jj) * C + c]);
+            // Basis matrix value
+            float tq = t[(j + jj) * t_stride_0 + b * t_stride_1];
+
+            // fmadd
+            svst1(pg, &v[(a * J + j + jj) * C + c], svmla_x(pg, v_vec, u_vec, tq));
+          }
+        }
+      }
+    }
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Serial Tensor Contract
+//------------------------------------------------------------------------------
+static inline int CeedTensorContract_Sve_Serial(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const float *restrict t,
+                                                CeedTransposeMode t_mode, const CeedInt add, const float *restrict u, float *restrict v,
+                                                const CeedInt JJ) {
+  CeedInt t_stride_0 = B, t_stride_1 = 1;
+
+  if (t_mode == CEED_TRANSPOSE) {
+    t_stride_0 = 1;
+    t_stride_1 = J;
+  }
+
+  for (CeedInt a = 0; a < A; a++) {
+    for (CeedInt b = 0; b < B; b++) {
+      for (CeedInt j = 0; j < (J / JJ) * JJ; j += JJ) {
+        for (CeedInt jj = 0; jj < JJ; jj++) {  // unroll
+          v[a * J + (j + jj)] += t[(j + jj) * t_stride_0 + b * t_stride_1] * u[a * B + b];
+        }
+      }
+    }
+  }
+
+  const CeedInt j = (J / JJ) * JJ;
+
+  if (j < J) {
+    for (CeedInt a = 0; a < A; a++) {
+      for (CeedInt b = 0; b < B; b++) {
+        for (CeedInt jj = 0; jj < J - j; jj++) {  // not unrolled
+          v[a * J + (j + jj)] += t[(j + jj) * t_stride_0 + b * t_stride_1] * u[a * B + b];
+        }
+      }
+    }
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Tensor Contract - Common Sizes
+//------------------------------------------------------------------------------
+static int CeedTensorContract_Sve_Blocked_8(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const float *restrict t,
+                                            CeedTransposeMode t_mode, const CeedInt add, const float *restrict u, float *restrict v) {
+  return CeedTensorContract_Sve_Blocked(contract, A, B, C, J, t, t_mode, add, u, v, 8);
+}
+static int CeedTensorContract_Sve_Serial_8(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const float *restrict t,
+                                           CeedTransposeMode t_mode, const CeedInt add, const float *restrict u, float *restrict v) {
+  return CeedTensorContract_Sve_Serial(contract, A, B, C, J, t, t_mode, add, u, v, 8);
+}
+
+//------------------------------------------------------------------------------
+// Tensor Contract Apply
+//------------------------------------------------------------------------------
+static int CeedTensorContractApply_Sve(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const float *restrict t,
+                                       CeedTransposeMode t_mode, const CeedInt add, const float *restrict u, float *restrict v) {
+  if (!add) {
+    for (CeedInt q = 0; q < A * J * C; q++) v[q] = (float)0.0;
+  }
+
+  if (C == 1) CeedTensorContract_Sve_Serial_8(contract, A, B, C, J, t, t_mode, true, u, v);
+  else CeedTensorContract_Sve_Blocked_8(contract, A, B, C, J, t, t_mode, true, u, v);
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Tensor Contract Create
+//------------------------------------------------------------------------------
+int CeedTensorContractCreate_f32_Sve(CeedBasis basis, CeedTensorContract contract) {
+  Ceed ceed;
+
+  CeedCallBackend(CeedTensorContractGetCeed(contract, &ceed));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "TensorContract", contract, "Apply", CeedTensorContractApply_Sve));
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
diff --git a/backends/sve/ceed-sve-tensor-f64.c b/backends/sve/ceed-sve-tensor-f64.c
new file mode 100644
index 0000000000..8b864f9a13
--- /dev/null
+++ b/backends/sve/ceed-sve-tensor-f64.c
@@ -0,0 +1,153 @@
+// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED: http://github.com/ceed
+
+#include <ceed.h>
+#include <ceed/backend.h>
+#ifdef __ARM_FEATURE_SVE
+#include <arm_sve.h>
+#endif
+#include <stdbool.h>
+
+#include "ceed-sve.h"
+
+//------------------------------------------------------------------------------
+// Blocked Tensor Contract
+//------------------------------------------------------------------------------
+static inline int CeedTensorContract_Sve_Blocked(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const double *restrict t,
+                                                 CeedTransposeMode t_mode, const CeedInt add, const double *restrict u, double *restrict v,
+                                                 const CeedInt JJ) {
+  CeedInt t_stride_0 = B, t_stride_1 = 1;
+
+  if (t_mode == CEED_TRANSPOSE) {
+    t_stride_0 = 1;
+    t_stride_1 = J;
+  }
+
+  for (CeedInt a = 0; a < A; a++) {
+    for (CeedInt b = 0; b < B; b++) {
+      // Blocks of JJ rows
+      for (CeedInt j = 0; j < (J / JJ) * JJ; j += JJ) {
+        for (CeedInt jj = 0; jj < JJ; jj++) {  // unroll
+          // Explicit SVE vectorization over C
+          for (int32_t c = 0; c < C; c += svcntd()) {
+            svbool_t pg = svwhilelt_b64(c, C);
+            // Load u, v into vectors
+            svfloat64_t u_vec = svld1(pg, &u[(a * B + b) * C + c]);
+            svfloat64_t v_vec = svld1(pg, &v[(a * J + j + jj) * C + c]);
+            // Basis matrix value
+            double tq = t[(j + jj) * t_stride_0 + b * t_stride_1];
+
+            // fmadd
+            svst1(pg, &v[(a * J + j + jj) * C + c], svmla_x(pg, v_vec, u_vec, tq));
+          }
+        }
+      }
+    }
+  }
+
+  // Remainder of rows
+  const CeedInt j = (J / JJ) * JJ;
+
+  if (j < J) {
+    for (CeedInt a = 0; a < A; a++) {
+      for (CeedInt b = 0; b < B; b++) {
+        // Blocks of JJ rows
+        for (CeedInt jj = 0; jj < J - j; jj++) {  // not unrolled
+          // Explicit SVE vectorization over C
+          for (int32_t c = 0; c < C; c += svcntd()) {
+            svbool_t pg = svwhilelt_b64(c, C);
+            // Load u, v into vectors
+            svfloat64_t u_vec = svld1(pg, &u[(a * B + b) * C + c]);
+            svfloat64_t v_vec = svld1(pg, &v[(a * J + j + jj) * C + c]);
+            // Basis matrix value
+            double tq = t[(j + jj) * t_stride_0 + b * t_stride_1];
+
+            // fmadd
+            svst1(pg, &v[(a * J + j + jj) * C + c], svmla_x(pg, v_vec, u_vec, tq));
+          }
+        }
+      }
+    }
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Serial Tensor Contract
+//------------------------------------------------------------------------------
+static inline int CeedTensorContract_Sve_Serial(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const double *restrict t,
+                                                CeedTransposeMode t_mode, const CeedInt add, const double *restrict u, double *restrict v,
+                                                const CeedInt JJ) {
+  CeedInt t_stride_0 = B, t_stride_1 = 1;
+
+  if (t_mode == CEED_TRANSPOSE) {
+    t_stride_0 = 1;
+    t_stride_1 = J;
+  }
+
+  for (CeedInt a = 0; a < A; a++) {
+    for (CeedInt b = 0; b < B; b++) {
+      for (CeedInt j = 0; j < (J / JJ) * JJ; j += JJ) {
+        for (CeedInt jj = 0; jj < JJ; jj++) {  // unroll
+          v[a * J + (j + jj)] += t[(j + jj) * t_stride_0 + b * t_stride_1] * u[a * B + b];
+        }
+      }
+    }
+  }
+
+  const CeedInt j = (J / JJ) * JJ;
+
+  if (j < J) {
+    for (CeedInt a = 0; a < A; a++) {
+      for (CeedInt b = 0; b < B; b++) {
+        for (CeedInt jj = 0; jj < J - j; jj++) {  // not unrolled
+          v[a * J + (j + jj)] += t[(j + jj) * t_stride_0 + b * t_stride_1] * u[a * B + b];
+        }
+      }
+    }
+  }
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Tensor Contract - Common Sizes
+//------------------------------------------------------------------------------
+static int CeedTensorContract_Sve_Blocked_8(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const double *restrict t,
+                                            CeedTransposeMode t_mode, const CeedInt add, const double *restrict u, double *restrict v) {
+  return CeedTensorContract_Sve_Blocked(contract, A, B, C, J, t, t_mode, add, u, v, 8);
+}
+static int CeedTensorContract_Sve_Serial_8(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const double *restrict t,
+                                           CeedTransposeMode t_mode, const CeedInt add, const double *restrict u, double *restrict v) {
+  return CeedTensorContract_Sve_Serial(contract, A, B, C, J, t, t_mode, add, u, v, 8);
+}
+
+//------------------------------------------------------------------------------
+// Tensor Contract Apply
+//------------------------------------------------------------------------------
+static int CeedTensorContractApply_Sve(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const double *restrict t,
+                                       CeedTransposeMode t_mode, const CeedInt add, const double *restrict u, double *restrict v) {
+  if (!add) {
+    for (CeedInt q = 0; q < A * J * C; q++) v[q] = (double)0.0;
+  }
+
+  if (C == 1) CeedTensorContract_Sve_Serial_8(contract, A, B, C, J, t, t_mode, true, u, v);
+  else CeedTensorContract_Sve_Blocked_8(contract, A, B, C, J, t, t_mode, true, u, v);
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Tensor Contract Create
+//------------------------------------------------------------------------------
+int CeedTensorContractCreate_f64_Sve(CeedBasis basis, CeedTensorContract contract) {
+  Ceed ceed;
+
+  CeedCallBackend(CeedTensorContractGetCeed(contract, &ceed));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "TensorContract", contract, "Apply", CeedTensorContractApply_Sve));
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
diff --git a/backends/sve/ceed-sve.h b/backends/sve/ceed-sve.h
new file mode 100644
index 0000000000..0a04a4d297
--- /dev/null
+++ b/backends/sve/ceed-sve.h
@@ -0,0 +1,13 @@
+// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED: http://github.com/ceed
+#pragma once
+
+#include <ceed.h>
+#include <ceed/backend.h>
+
+CEED_INTERN int CeedTensorContractCreate_f32_Sve(CeedBasis basis, CeedTensorContract contract);
+CEED_INTERN int CeedTensorContractCreate_f64_Sve(CeedBasis basis, CeedTensorContract contract);
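
Note (not part of the patch): the contraction kernels above all use the same predicated SVE idiom, stepping through the `C` dimension in hardware-vector-sized chunks (`svcntw`/`svcntd`) and letting `svwhilelt` mask off the final partial vector so no scalar tail loop is needed. The sketch below is a minimal standalone illustration of that `svwhilelt`/`svld1`/`svmla_x`/`svst1` pattern; the `axpy_f64_sve` name, the scalar fallback, and the driver are hypothetical and only assume a compiler with SVE enabled (e.g. `gcc -O2 -march=armv8-a+sve`).

```c
#include <stdint.h>
#include <stdio.h>

#ifdef __ARM_FEATURE_SVE
#include <arm_sve.h>

// y[i] += a * x[i], mirroring the predicated fmadd loop in the kernels above
static void axpy_f64_sve(int64_t n, double a, const double *x, double *y) {
  // svcntd() = number of 64-bit lanes in the (hardware-dependent) vector;
  // svwhilelt_b64 builds a predicate that masks lanes past n, so the tail
  // is handled without a remainder loop.
  for (int64_t i = 0; i < n; i += svcntd()) {
    svbool_t    pg    = svwhilelt_b64(i, n);
    svfloat64_t x_vec = svld1(pg, &x[i]);
    svfloat64_t y_vec = svld1(pg, &y[i]);
    svst1(pg, &y[i], svmla_x(pg, y_vec, x_vec, a));  // y_vec + x_vec * a
  }
}
#else
// Scalar fallback for builds without SVE support
static void axpy_f64_sve(int64_t n, double a, const double *x, double *y) {
  for (int64_t i = 0; i < n; i++) y[i] += a * x[i];
}
#endif

int main(void) {
  double x[5] = {1, 2, 3, 4, 5}, y[5] = {0};
  axpy_f64_sve(5, 2.0, x, y);
  for (int i = 0; i < 5; i++) printf("%g ", y[i]);  // expected: 2 4 6 8 10
  printf("\n");
  return 0;
}
```

Once libCEED is rebuilt with this patch on SVE hardware, the new backends are selected at runtime like any other, e.g. `CeedInit("/cpu/self/sve/blocked", &ceed)` in code or `-ceed /cpu/self/sve/serial` on an example's command line.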