Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SVE Backend #842

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,7 @@ blocked.c := $(sort $(wildcard backends/blocked/*.c))
ceedmemcheck.c := $(sort $(wildcard backends/memcheck/*.c))
opt.c := $(sort $(wildcard backends/opt/*.c))
avx.c := $(sort $(wildcard backends/avx/*.c))
sve.c := $(sort $(wildcard backends/sve/*.c))
xsmm.c := $(sort $(wildcard backends/xsmm/*.c))
cuda.c := $(sort $(wildcard backends/cuda/*.c))
cuda.cpp := $(sort $(wildcard backends/cuda/*.cpp))
Expand Down Expand Up @@ -338,6 +339,7 @@ info:
$(info ------------------------------------)
$(info MEMCHK_STATUS = $(MEMCHK_STATUS)$(call backend_status,$(MEMCHK_BACKENDS)))
$(info AVX_STATUS = $(AVX_STATUS)$(call backend_status,$(AVX_BACKENDS)))
$(info SVE_STATUS = $(SVE_STATUS)$(call backend_status,$(SVE_BACKENDS)))
$(info XSMM_DIR = $(XSMM_DIR)$(call backend_status,$(XSMM_BACKENDS)))
$(info OCCA_DIR = $(OCCA_DIR)$(call backend_status,$(OCCA_BACKENDS)))
$(info MAGMA_DIR = $(MAGMA_DIR)$(call backend_status,$(MAGMA_BACKENDS)))
Expand Down Expand Up @@ -391,6 +393,17 @@ ifneq ($(AVX),)
BACKENDS_MAKE += $(AVX_BACKENDS)
endif

# SVE Backends
SVE_STATUS = Disabled
AVX_FLAG := $(if $(filter clang,$(CC_VENDOR)),+sve,-msve)
SVE := $(filter $(SVE_FLAG),$(shell $(CC) $(CFLAGS:-M%=) -v -E -x c /dev/null 2>&1))
SVE_BACKENDS = /cpu/self/sve/serial /cpu/self/sve/blocked
ifneq ($(SVE),)
SVE_STATUS = Enabled
libceed.c += $(sve.c)
BACKENDS_MAKE += $(SVE_BACKENDS)
endif

# Collect list of libraries and paths for use in linking and pkg-config
PKG_LIBS =
# Stubs that will not be RPATH'd
Expand Down
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,8 @@ There are multiple supported backends, which can be selected at runtime in the e
| `/cpu/self/opt/blocked` | Blocked optimized C implementation | Yes |
| `/cpu/self/avx/serial` | Serial AVX implementation | Yes |
| `/cpu/self/avx/blocked` | Blocked AVX implementation | Yes |
| `/cpu/self/sve/serial` | Serial SVE implementation | Yes |
| `/cpu/self/sve/blocked` | Blocked SVE implementation | Yes |
||
| **CPU Valgrind** |
| `/cpu/self/memcheck/*` | Memcheck backends, undefined value checks | Yes |
Expand Down Expand Up @@ -200,6 +202,8 @@ The `/cpu/self/opt/*` backends are written in pure C and use partial e-vectors t

The `/cpu/self/avx/*` backends rely upon AVX instructions to provide vectorized CPU performance.

The `/cpu/self/sve/*` backends rely upon SVE instructions to provide vectorized CPU performance.

The `/cpu/self/memcheck/*` backends rely upon the [Valgrind](https://valgrind.org/) Memcheck tool to help verify that user QFunctions have no undefined values.
To use, run your code with Valgrind and the Memcheck backends, e.g. `valgrind ./build/ex1 -ceed /cpu/self/ref/memcheck`.
A 'development' or 'debugging' version of Valgrind with headers is required to use this backend.
Expand Down
2 changes: 2 additions & 0 deletions backends/ceed-backend-list.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,5 +31,7 @@ CEED_BACKEND(CeedRegister_Opt_Blocked, 1, "/cpu/self/opt/blocked")
CEED_BACKEND(CeedRegister_Opt_Serial, 1, "/cpu/self/opt/serial")
CEED_BACKEND(CeedRegister_Ref, 1, "/cpu/self/ref/serial")
CEED_BACKEND(CeedRegister_Ref_Blocked, 1, "/cpu/self/ref/blocked")
CEED_BACKEND(CeedRegister_Sve_Serial, 1, "/cpu/self/sve/serial")
CEED_BACKEND(CeedRegister_Sve_Blocked, 1, "/cpu/self/sve/blocked")
CEED_BACKEND(CeedRegister_Xsmm_Blocked, 1, "/cpu/self/xsmm/blocked")
CEED_BACKEND(CeedRegister_Xsmm_Serial, 1, "/cpu/self/xsmm/serial")
41 changes: 41 additions & 0 deletions backends/sve/ceed-sve-blocked.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

#include <ceed.h>
#include <ceed/backend.h>
#include <stdbool.h>
#include <string.h>

#include "ceed-sve.h"

//------------------------------------------------------------------------------
// Backend Init
//------------------------------------------------------------------------------
static int CeedInit_Sve(const char *resource, Ceed ceed) {
Ceed ceed_ref;

CeedCheck(!strcmp(resource, "/cpu/self") || !strcmp(resource, "/cpu/self/sve") && strcmp(resource, "/cpu/self/sve/blocked"), ceed,
CEED_ERROR_BACKEND, "SVE backend cannot use resource: %s", resource);
CeedCallBackend(CeedSetDeterministic(ceed, true));

// Create reference CEED that implementation will be dispatched through unless overridden
CeedCallBackend(CeedInit("/cpu/self/opt/blocked", &ceed_ref));
CeedCallBackend(CeedSetDelegate(ceed, ceed_ref));

if (CEED_SCALAR_TYPE == CEED_SCALAR_FP64) {
CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", CeedTensorContractCreate_f64_Sve));
} else {
CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", CeedTensorContractCreate_f32_Sve);
}
return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Backend Register
//------------------------------------------------------------------------------
CEED_INTERN int CeedRegister_Sve_Blocked(void) { return CeedRegister("/cpu/self/sve/blocked", CeedInit_Sve, 30); }
//------------------------------------------------------------------------------
42 changes: 42 additions & 0 deletions backends/sve/ceed-sve-serial.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

#include <ceed.h>
#include <ceed/backend.h>
#include <stdbool.h>
#include <string.h>

#include "ceed-sve.h"

//------------------------------------------------------------------------------
// Backend Init
//------------------------------------------------------------------------------
static int CeedInit_Sve(const char *resource, Ceed ceed) {
Ceed ceed_ref;

CeedCheck(!strcmp(resource, "/cpu/self") || !strcmp(resource, "/cpu/self/sve/serial"), ceed, CEED_ERROR_BACKEND,
"SVE backend cannot use resource: %s", resource);
CeedCallBackend(CeedSetDeterministic(ceed, true));

// Create reference CEED that implementation will be dispatched through unless overridden
CeedCallBackend(CeedInit("/cpu/self/opt/serial", &ceed_ref));
CeedCallBackend(CeedSetDelegate(ceed, ceed_ref));

if (CEED_SCALAR_TYPE == CEED_SCALAR_FP64) {
CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", CeedTensorContractCreate_f64_Sve));
} else {
CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", CeedTensorContractCreate_f32_Sve));
}
return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Backend Register
//------------------------------------------------------------------------------
CEED_INTERN int CeedRegister_Sve_Serial(void) { return CeedRegister("/cpu/self/sve/serial", CeedInit_Sve, 35); }

//------------------------------------------------------------------------------
153 changes: 153 additions & 0 deletions backends/sve/ceed-sve-tensor-f32.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

#include <ceed.h>
#include <ceed/backend.h>
#ifdef __ARM_FEATURE_SVE
#include <arm_sve.h>
#endif
#include <stdbool.h>

#include "ceed-sve.h"

//------------------------------------------------------------------------------
// Blocked Tensor Contract
//------------------------------------------------------------------------------
static inline int CeedTensorContract_Sve_Blocked(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const float *restrict t,
CeedTransposeMode t_mode, const CeedInt add, const float *restrict u, float *restrict v,
const CeedInt JJ) {
CeedInt t_stride_0 = B, t_stride_1 = 1;

if (t_mode == CEED_TRANSPOSE) {
t_stride_0 = 1;
t_stride_1 = J;
}

for (CeedInt a = 0; a < A; a++) {
for (CeedInt b = 0; b < B; b++) {
// Blocks of JJ rows
for (CeedInt j = 0; j < (J / JJ) * JJ; j += JJ) {
for (CeedInt jj = 0; jj < JJ; jj++) { // unroll
// C vectorization by compiler
for (int32_t c = 0; c < C; c += svcntd()) {
svbool_t pg = svwhilelt_b32(c, C);
// Load u, v into vectors
svfloat32_t u_vec = svld1(pg, &u[(a * B + b) * C + c]);
svfloat32_t v_vec = svld1(pg, &v[(a * J + j + jj) * C + c]);
// Basis matrix value
float tq = t[(j + jj) * t_stride_0 + b * t_stride_1];

// fmadd
svst1(pg, &v[(a * J + j + jj) * C + c], svmla_x(pg, v_vec, u_vec, tq));
}
}
}
}
}

// Remainder of rows
const CeedInt j = (J / JJ) * JJ;

if (j < J) {
for (CeedInt a = 0; a < A; a++) {
for (CeedInt b = 0; b < B; b++) {
// Blocks of JJ rows
for (CeedInt jj = 0; jj < J - j; jj++) { // not unrolled
// C vectorization by compiler
for (int32_t c = 0; c < C; c += svcntd()) {
svbool_t pg = svwhilelt_b32(c, C);
// Load u, v into vectors
svfloat32_t u_vec = svld1(pg, &u[(a * B + b) * C + c]);
svfloat32_t v_vec = svld1(pg, &v[(a * J + j + jj) * C + c]);
// Basis matrix value
float tq = t[(j + jj) * t_stride_0 + b * t_stride_1];

// fmadd
svst1(pg, &v[(a * J + j + jj) * C + c], svmla_x(pg, v_vec, u_vec, tq));
}
}
}
}
}
return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Blocked Tensor Contract
//------------------------------------------------------------------------------
static inline int CeedTensorContract_Sve_Serial(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const float *restrict t,
CeedTransposeMode t_mode, const CeedInt add, const float *restrict u, float *restrict v,
const CeedInt JJ) {
CeedInt t_stride_0 = B, t_stride_1 = 1;

if (t_mode == CEED_TRANSPOSE) {
t_stride_0 = 1;
t_stride_1 = J;
}

for (CeedInt a = 0; a < A; a++) {
for (CeedInt b = 0; b < B; b++) {
for (CeedInt j = 0; j < (J / JJ) * JJ; j += JJ) {
for (CeedInt jj = 0; jj < JJ; jj++) { // unroll
v[a * J + (j + jj)] += t[(j + jj) * t_stride_0 + b * t_stride_1] * u[a * B + b];
}
}
}
}

const CeedInt j = (J / JJ) * JJ;

if (j < J) {
for (CeedInt a = 0; a < A; a++) {
for (CeedInt b = 0; b < B; b++) {
for (CeedInt jj = 0; jj < J - j; jj++) { // not unrolled
v[a * J + (j + jj)] += t[(j + jj) * t_stride_0 + b * t_stride_1] * u[a * B + b];
}
}
}
}
return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Tensor Contract - Common Sizes
//------------------------------------------------------------------------------
static int CeedTensorContract_Sve_Blocked_8(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const float *restrict t,
CeedTransposeMode t_mode, const CeedInt add, const float *restrict u, float *restrict v) {
return CeedTensorContract_Sve_Blocked(contract, A, B, C, J, t, t_mode, add, u, v, 8);
}
static int CeedTensorContract_Sve_Serial_8(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const float *restrict t,
CeedTransposeMode t_mode, const CeedInt add, const float *restrict u, float *restrict v) {
return CeedTensorContract_Sve_Serial(contract, A, B, C, J, t, t_mode, add, u, v, 8);
}

//------------------------------------------------------------------------------
// Tensor Contract Apply
//------------------------------------------------------------------------------
static int CeedTensorContractApply_Sve(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const float *restrict t,
CeedTransposeMode t_mode, const CeedInt add, const float *restrict u, float *restrict v) {
if (!add) {
for (CeedInt q = 0; q < A * J * C; q++) v[q] = (float)0.0;
}

if (C == 1) CeedTensorContract_Sve_Serial_8(contract, A, B, C, J, t, t_mode, true, u, v);
else CeedTensorContract_Sve_Blocked_8(contract, A, B, C, J, t, t_mode, true, u, v);
return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Tensor Contract Create
//------------------------------------------------------------------------------
int CeedTensorContractCreate_f32_Sve(CeedBasis basis, CeedTensorContract contract) {
Ceed ceed;

CeedCallBackend(CeedTensorContractGetCeed(contract, &ceed));
CeedCallBackend(CeedSetBackendFunction(ceed, "TensorContract", contract, "Apply", CeedTensorContractApply_Sve));
return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
Loading