Merge pull request #7 from SeisSol/davschneller/sve-multisize
SVE for 128, 256, 1024, and 2048 Bits
krenzland authored Sep 1, 2023
2 parents 5af38e4 + 9bd7d93 commit ad30f48
Showing 10 changed files with 134 additions and 35 deletions.
4 changes: 3 additions & 1 deletion .gitignore
@@ -2,8 +2,10 @@ __pycache__/
 
 # Testing (for now)
 tests/arm/
-tests/arm_sve/
+tests/arm_sve*/
 tests/hsw/
 tests/knl/
 tests/testsuite.cpp
+tests/arm_*testsuite.cpp
+tests/sve*-test
 a.out
2 changes: 1 addition & 1 deletion README.md
@@ -5,6 +5,6 @@ Currently Intel Xeon Phi 'Knights Landing' (AVX512), Haswell/Zen2 (AVX2), and ARM
 
 Usage:
 
-./pspamm M N K LDA LDB LDC ALPHA BETA --arch {arm,arm_sve,knl,hsw}
+./pspamm M N K LDA LDB LDC ALPHA BETA --arch {arm,arm_sve{128,256,512,1024,2048},knl,hsw}
 
 --mtx_filename MTX_FILE_PATH --output_funcname FUNCTION_NAME --output_filename OUTPUT_NAME
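
For example, a hypothetical invocation for a 256-bit SVE target (the size and scalar arguments are made up; the flags are those from the usage line above):

    ./pspamm 8 8 8 8 8 8 1.0 0.0 --arch arm_sve256 --output_funcname gemm --output_filename gemm.h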
3 changes: 2 additions & 1 deletion codegen/architectures/arm/generator.py
@@ -30,7 +30,8 @@ class Generator(AbstractGenerator):
     pspamm_num_total_flops += {flop};
 #endif
-}}}};"""
+}}}};
+"""
 
     def get_v_size(self):
         if self.precision == Precision.DOUBLE:
31 changes: 17 additions & 14 deletions codegen/architectures/arm_sve/generator.py
@@ -29,17 +29,19 @@ class Generator(AbstractGenerator):
     pspamm_num_total_flops += {flop};
 #endif
-}}}}}}}};"""
+}}}}}}}};
+"""
 
     prefetch_reg = None
     prefetch_count = 0
     is_sparse = False
+    v_len = 4 # vector register length: v_len * 128 bit
 
     def get_v_size(self):
         if self.precision == Precision.DOUBLE:
-            return 8
+            return 2 * self.v_len # 128 bit == 2 x 64 bit (double)
         elif self.precision == Precision.SINGLE:
-            return 16
+            return 4 * self.v_len # 128 bit == 4 x 32 bit (float)
         raise NotImplementedError
 
     def get_precision(self):
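
For intuition, the vector-size arithmetic above can be run standalone; a minimal sketch, assuming only what the diff states (an SVE register holds v_len * 128 bits; the function name is illustrative):

```python
# Sketch of get_v_size above: lanes per SVE register for a given v_len.
def v_size(v_len: int, precision: str) -> int:
    if precision == 'd':
        return 2 * v_len  # 128 bit == 2 x 64 bit (double)
    elif precision == 's':
        return 4 * v_len  # 128 bit == 4 x 32 bit (float)
    raise NotImplementedError

assert v_size(4, 'd') == 8    # 512-bit SVE (A64FX): the old hard-coded value
assert v_size(4, 's') == 16   # likewise for single precision
assert v_size(16, 'd') == 32  # 2048-bit SVE, double
```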
@@ -55,7 +57,8 @@ def pred_n_trues(self, num_trues: int, v_size: int, suffix: str = None) -> Register
         set to true"""
         assert (num_trues > 0)
         assert (suffix == "m" or suffix == "z" or suffix is None)
-        # we only use p7 or p0 as predicates
+
+        # we only use p7 or p0 as predicates (1 == p0, 8 == p7)
         num_trues = 8 if num_trues >= v_size else 1
 
         if suffix is None:
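
The predicate selection above thus reduces to a two-way pick between the overhead predicate and the all-true predicate; a minimal sketch (function name illustrative):

```python
# Full rows use the all-true predicate p7; a partial remainder row uses the
# overhead predicate p0 (cf. the comment above: 1 == p0, 8 == p7).
def predicate_register(num_trues: int, v_size: int) -> str:
    return "p7" if num_trues >= v_size else "p0"

assert predicate_register(8, 8) == "p7"  # full 512-bit double vector
assert predicate_register(3, 8) == "p0"  # 3-lane remainder
```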
@@ -125,11 +128,11 @@ def init_registers(self,
                        v_size: int
                        ) -> None:
 
-        bm %= v_size
+        bmmod = bm % v_size
 
         eol = "\\n\\t" # define the "end of line" sequence for easy assembly
-        p_suffix = "d" if v_size == 8 else "s" # determine whether predicate suffix is '.d' or '.s'
-        gen_reg = "x" if v_size == 8 else "w" # determine if 'dup' registers are 64 bit or 32 bit
+        p_suffix = "d" if v_size == 2 * self.v_len else "s" # determine whether predicate suffix is '.d' or '.s'
+        gen_reg = "x" if v_size == 2 * self.v_len else "w" # determine if 'dup' registers are 64 bit or 32 bit
         overhead_counter = 6
 
         # https://developer.arm.com/documentation/102374/0101/Registers-in-AArch64---general-purpose-registers
@@ -145,13 +148,13 @@
         # https://developer.arm.com/documentation/ddi0596/2020-12/Shared-Pseudocode/AArch64-Functions?lang=en#impl-aarch64.DecodePredCount.2
         # 'ptrue' doesnt work for initialising overhead predicate when using single precision -> see valid patterns from above
         # overhead = "\"ptrue p0.{suffix}, #{overhead}{eol}\"\n\t" if bm != 0 else "" # define overhead predicate
-        overhead = "\"mov {gen_reg}{overhead_counter}, #{overhead}{eol}\"\n\t\"whilelo p0.{suffix}, {gen_reg}zr, {gen_reg}{overhead_counter}{eol}\"\n\t" if bm != 0 else ""
+        overhead = "\"mov {gen_reg}{overhead_counter}, #{overhead}{eol}\"\n\t\"whilelo p0.{suffix}, {gen_reg}zr, {gen_reg}{overhead_counter}{eol}\"\n\t" if bmmod != 0 else ""
         all_true = "\"ptrue p7.{suffix}, #31{eol}\"" # define all true predicate
         init_registers = (dup_alpha + dup_beta + comment + overhead + all_true).format(suffix=p_suffix,
                                                                                        gen_reg=gen_reg,
                                                                                        overhead_counter=overhead_counter,
                                                                                        v_size=v_size,
-                                                                                       overhead=bm,
+                                                                                       overhead=bmmod,
                                                                                        eol=eol)
 
         # since .format() doesn't allow partial formatting, we need to re-include the
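
Hand-expanding the overhead and all-true templates for, say, bmmod == 3 in single precision gives the following (a sketch; the register number 6 is overhead_counter from above):

```python
# Expand the two assembly templates above by hand for bmmod == 3, single
# precision ('w' general registers, '.s' predicate suffix).
eol = "\\n\\t"
overhead = ('"mov {gen_reg}{ctr}, #{overhead}{eol}" '
            '"whilelo p0.{suffix}, {gen_reg}zr, {gen_reg}{ctr}{eol}"').format(
    gen_reg="w", ctr=6, overhead=3, suffix="s", eol=eol)
all_true = '"ptrue p7.{suffix}, #31{eol}"'.format(suffix="s", eol=eol)
print(overhead)  # "mov w6, #3\n\t" "whilelo p0.s, wzr, w6\n\t"
print(all_true)  # "ptrue p7.s, #31\n\t"
# whilelo activates exactly the first 3 .s lanes of p0;
# ptrue with pattern #31 (ALL) activates every lane of p7.
```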
@@ -188,12 +191,12 @@ def move_register_block(self,
         b_row, b_col, i, _ = cursor.get_block(cursor_ptr, block_offset)
 
         cur11 = 0
-        #TODO: figure out appropriate threshold
-        threshold = 1 if self.is_sparse else 4 # uses whole 256 byte cache line, as one SVE vector = 64 bytes
+        #TODO: figure out appropriate threshold (the 16 // self.v_len may still not be optimal; especially if 16 % self.v_len != 0, e.g. 384 bit)
+        threshold = 1 if self.is_sparse else (16 // self.v_len) # uses whole 256 byte cache line, as one SVE-512 vector = 64 bytes
 
-        # TODO: if another CPU implements SVE at VL != 64 bytes, rewrite mul_vl (maybe do this dynamically)
-        mul_vl = 64 # A64FX has VL of 64 bytes in memory
-        max_mem_ins_mult = 7 # A64FX allows a maximum positive offset of 7 in memory instructions, e.g. ld1d z1.d, p0/z, [x0, 7, MUL VL]
+        # DONE if another CPU implements SVE at VL != 64 bytes, rewrite mul_vl (maybe do this dynamically)
+        mul_vl = 16 * self.v_len # e.g. A64FX has VL of 64 bytes in memory (thus, use v_len==4)
+        max_mem_ins_mult = 7 # A64FX allows a maximum positive offset of 7 in memory instructions, e.g. ld1d z1.d, p0/z, [x0, 7, MUL VL] (TODO: tune, if ever different)
         max_offset = mul_vl * max_mem_ins_mult # ld1d/st1d instruction encodes the immediate offset using 4 bits, multiplies it with MUL VL
 
         prev_disp = 0
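
A quick sketch of the resulting addressing limits, assuming (as the code does) that the in-memory vector length equals the configured register length:

```python
# Offset arithmetic above, evaluated for each supported register length.
# One SVE vector is 16 * v_len bytes; ld1d/st1d immediates are 4-bit signed
# multiples of the vector length (MUL VL), so at most 7 on the positive side.
for v_len in (1, 2, 4, 8, 16):  # 128 ... 2048 bit
    mul_vl = 16 * v_len         # vector length in bytes
    max_offset = mul_vl * 7     # largest positive immediate offset
    print(f"{128 * v_len:4} bit: mul_vl = {mul_vl:3} B, max_offset = {max_offset} B")
# A64FX (512 bit, v_len == 4): mul_vl == 64, max_offset == 448
```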
16 changes: 15 additions & 1 deletion matmul.py
@@ -98,6 +98,16 @@ def __init__(self,
 
         if arch == 'skx':
             arch = 'knl'
+
+        # hacky implementation of multi-register length
+        if arch.startswith('arm_sve'):
+            if len(arch) == 7:
+                v_len_regs = 4 # compatibility: arm_sve == arm_sve512
+            else:
+                v_len_bits = int(arch[7:])
+                assert v_len_bits % 128 == 0 and v_len_bits <= 2048
+                v_len_regs = v_len_bits // 128
+            arch = 'arm_sve'
 
         self.arch = arch
         assert precision.lower() in ['s', 'd']
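
Standalone, the arch-string parsing added here behaves as follows (a sketch; the helper name is illustrative):

```python
# Mirror of the parsing logic above: split 'arm_sveNNNN' into the base arch
# and the register length in units of 128 bits.
def parse_sve_arch(arch: str):
    if arch.startswith('arm_sve'):
        if len(arch) == 7:   # bare 'arm_sve'
            v_len_regs = 4   # compatibility: arm_sve == arm_sve512
        else:
            v_len_bits = int(arch[7:])
            assert v_len_bits % 128 == 0 and v_len_bits <= 2048
            v_len_regs = v_len_bits // 128
        return 'arm_sve', v_len_regs
    return arch, None

assert parse_sve_arch('arm_sve') == ('arm_sve', 4)
assert parse_sve_arch('arm_sve1024') == ('arm_sve', 8)
```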
@@ -110,10 +120,14 @@
 
         self.generator = architecture.Generator(self.precision)
 
-        self.v_size = self.generator.get_v_size()
         # flag that determines if a matmul kernel uses sve instructions -> needed for sve predicates
         self.is_sve = arch == "arm_sve"
 
+        if self.is_sve:
+            self.generator.v_len = v_len_regs
+
+        self.v_size = self.generator.get_v_size()
+
         if bk == None:
             bk = 2 if arch == 'knl' else 1
28 changes: 28 additions & 0 deletions tests/README.md
@@ -41,3 +41,31 @@ If nothing breaks, the generated testsuite reports the number of successful tests
2. Adjust the Makefile as needed and compile the generated ```sve_testsuite.cpp``` by calling
```make sve_testsuite```
3. Run the compiled executable with ```./sve_testsuite```

#### Notes on Running SVE with QEMU user-static

Run `runall-sve.sh`, which already tests a number of configurations.

For a bit length `BITLEN`, it executes the following commands:
```
# generate tests
python unit_tests_arm_sve.py $BITLEN
# compile: we use Armv8.2 and SVE; the SVE vector length is set explicitly
aarch64-linux-gnu-g++ -static -march=armv8.2-a+sve -msve-vector-bits=${BITLEN} arm_sve${BITLEN}_testsuite.cpp -o sve${BITLEN}-test
# run under QEMU so the tests also run on x86-64; enable all CPU features and cap the SVE register length at ${BITLEN} bits (cf. https://qemu-project.gitlab.io/qemu/system/arm/cpu-features.html); sve-default-vector-length=-1 is needed for 1024 and 2048 bit SVE to work correctly (otherwise, QEMU assumes a 512 bit maximum)
qemu-aarch64-static -cpu max,sve${BITLEN}=on,sve-default-vector-length=-1 ./sve${BITLEN}-test
```


For debugging, e.g. with vector length 512 (cf. https://mariokartwii.com/showthread.php?tid=1998 ):
```
aarch64-linux-gnu-g++ -g -ggdb -static -march=armv9-a+sve -msve-vector-bits=512 sve_testsuite.cpp
qemu-aarch64-static -g 1234 -cpu max,sve512=on ./a.out
```
(here, 1234 is the gdb port and a.out the binary name)

In a separate window, run `aarch64-linux-gnu-gdb --ex "target remote localhost:1234" --ex "file a.out"`.
The two extra commands connect gdb to QEMU and load the compiled binary, so method names etc. are resolved correctly.
To run the program, type `continue`. You may want to set breakpoints etc. before doing so.
16 changes: 16 additions & 0 deletions tests/runall-sve.sh
@@ -0,0 +1,16 @@
#!/usr/bin/env bash
# maybe do PYTHONPATH=$(pwd)/..:$PYTHONPATH

echo "SVE GEMM test. Right now, we do not test all multiples of 128 bit. Mostly powers of two, since gcc may not support others."

for BITLEN in 128 256 512 1024 2048
do
echo ""
echo ""
echo "Testing $BITLEN bit SVE register GEMM"
python unit_tests_arm_sve.py $BITLEN
aarch64-linux-gnu-g++ -static -march=armv8.2-a+sve -msve-vector-bits=${BITLEN} arm_sve${BITLEN}_testsuite.cpp -o sve${BITLEN}-test
qemu-aarch64-static -cpu max,sve${BITLEN}=on,sve-default-vector-length=-1 ./sve${BITLEN}-test
done

echo "All tests done. Bye!"
39 changes: 31 additions & 8 deletions tests/sve_testsuite_generator.py
@@ -3,7 +3,7 @@
 import numpy as np
 import random
 import sys
-import os.path
+import os
 import testsuite_generator as test_generator
 
 SparseKernel = namedtuple('SparseKernel', 'name m n k lda ldb ldc alpha beta block_sizes mtx delta')
@@ -14,7 +14,7 @@
 
 setup_prefetching = """
 template <typename T>
-void setup_prefetch(T* prefetch, T* matrix, unsigned n, unsigned ldc) {
+void setup_prefetch(T*& prefetch, T* matrix, unsigned n, unsigned ldc) {
   posix_memalign(reinterpret_cast<void **>(&prefetch), 64, ldc*n*sizeof(T));
   std::memcpy(prefetch, matrix, ldc*n*sizeof(T));
 }
@@ -24,7 +24,11 @@ def generateMTX(k, n, nnz):
     return test_generator.generateMTX(k, n, nnz)
 
 def make(kernels, arch):
-    f = open('sve_testsuite.cpp', 'w')
+
+    f = open(f'{arch}_testsuite.cpp', 'w')
+
+    if not os.path.exists(arch):
+        os.mkdir(arch)
 
     f.write(test_generator.head_of_testsuite)
@@ -52,11 +56,16 @@ def make(kernels, arch):
                 assert (bm % 8 == 0 and (bn + 1) * (bm / 8) <= 32)
             elif arch == "arm":
                 assert (bm % 2 == 0 and (bn + 1) * (bm / 2) + bn <= 32)
-            elif arch == "arm_sve":
-                # this is for A64fx only, with SVE_vector_bits = 512
-                v_len = 8 if prec == 'd' else 16
+            elif arch.startswith("arm_sve"):
+                veclen = int(arch[7:])
+                assert veclen % 128 == 0 and veclen <= 2048
+                reglen = veclen // 128
+                v_len = 2 * reglen if prec == 'd' else 4 * reglen
                 # this should be the same assertion as in ../scripts/max_arm_sve.py
-                assert ((bn + 1) * (bm / v_len) + bn <= 32)
+                bk = 1
+                if not ((bn + bk) * (bm / v_len) + bn * bk + 2 <= 32):
+                    print(f'Skipping block size {bm}x{bn} for {arch}')
+                    continue
 
             name = kern.name + '_' + str(bm) + '_' + str(bn)

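The register-budget check added above can be exercised on its own; in the sketch below the inequality is copied from the diff, while the sample sizes and the reading of the individual terms are assumptions:

```python
# Roughly: (bn + bk) columns of bm/v_len vector registers for the C and A
# tiles, bn * bk broadcast registers for B, plus 2 extra registers, out of
# the 32 SVE vector registers.
def fits_register_budget(bm: int, bn: int, v_len_bits: int, prec: str = 'd', bk: int = 1) -> bool:
    reglen = v_len_bits // 128
    v_len = 2 * reglen if prec == 'd' else 4 * reglen
    return (bn + bk) * (bm / v_len) + bn * bk + 2 <= 32

assert fits_register_budget(8, 3, 512)       # small block, fits easily
assert not fits_register_budget(64, 8, 128)  # too large for 128-bit SVE
```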
@@ -83,14 +92,28 @@ def make(kernels, arch):
   double* prefetch;
   float* fprefetch;
   """)
 
     for kern in kernels:
 
         block_sizes = list(set(kern.block_sizes))
 
         for bs in block_sizes:
             bm = bs[0]
             bn = bs[1]
 
+            prec = 's' if isinstance(kern, SparseKernelS) or isinstance(kern, DenseKernelS) else 'd'
+
+            if arch.startswith("arm_sve"):
+                veclen = int(arch[7:])
+                assert veclen % 128 == 0 and veclen <= 2048
+                reglen = veclen // 128
+                v_len = 2 * reglen if prec == 'd' else 4 * reglen
+                # this should be the same assertion as in ../scripts/max_arm_sve.py
+                bk = 1
+                if not ((bn + bk) * (bm / v_len) + bn * bk + 2 <= 32):
+                    # print(f'Skipping block size {bm}x{bn} for {arch}')
+                    continue
 
             name = kern.name + '_' + str(bm) + '_' + str(bn)
 
             if isinstance(kern, SparseKernel) or isinstance(kern, SparseKernelS):
12 changes: 8 additions & 4 deletions tests/testsuite_generator.py
@@ -125,12 +125,15 @@
   gemm_ref(M, N, K, LDA, *LDB, LDC, *ALPHA, *BETA, A, B, Cref);
 
-  for(int i = 0; i < M; i++)
-    for(int j = 0; j < N; j++)
+  for(int i = 0; i < M; i++) {
+    for(int j = 0; j < N; j++) {
       // we use the relative error instead of the absolute error because of an issue we found for sparse single precision
       // kernels presumably due to limited precision of floats
-      if(std::abs((C[i + j * LDC] - Cref[i + j * LDC])) / Cref[i + j * LDC] > DELTA)
+      if(std::abs((C[i + j * LDC] - Cref[i + j * LDC])) / Cref[i + j * LDC] > DELTA) {
         return 0;
+      }
+    }
+  }
 
   return 1;
 }
@@ -177,7 +180,8 @@
   printf("\\n%i out of %lu test successful!\\n", correct, results.size());
   return 0;
-}"""
+}
+"""


def generateMTX(k, n, nnz):
18 changes: 13 additions & 5 deletions tests/unit_tests_arm_sve.py
@@ -4,15 +4,23 @@
 
 import scripts.max_arm_sve as max_sve
 
+import sys
+
+v_len = 4
+
+if len(sys.argv) == 2:
+    v_len = int(sys.argv[1]) // 128
+
 blocksize_algs = [max_sve]
-v_size = 8
-v_size_s = 16
+v_size = 2 * v_len
+v_size_s = 4 * v_len
+bitlen = v_len * 128
 kernels = []
 
 # define the maximum allowed difference between elements of our solution and the reference solution for
 # double and single precision
-delta_sp = 1e-6
-delta_dp = 1e-7
+delta_sp = 1e-4 # epsilon is around e-7 => /2 ... For most cases, 1e-6 is enough
+delta_dp = 1e-7 # epsilon is around e-15 => /2
 
 # test cases for double precision multiplication
 kernels.append(generator.DenseKernel("sve_mixed_test1", 9, 9, 9, 9, 9, 9, 1.0, 0.0, [(3, 3)] + [x.getBlocksize(9, 9, 1, v_size) for x in blocksize_algs], delta_dp))
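
For reference, the machine epsilons these tolerance comments allude to (a quick check, not part of the test suite):

```python
import numpy as np
print(np.finfo(np.float32).eps)  # ~1.19e-07, so delta_sp = 1e-4 is generous
print(np.finfo(np.float64).eps)  # ~2.22e-16, so delta_dp = 1e-7 is comfortable
```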
@@ -57,4 +65,4 @@
 kernels.append(generator.SparseKernelS("sve_single_prec_test_S7", 23, 23, 23, 23, 0, 23, 1.5, -0.66, [x.getBlocksize(23, 23, 1, v_size_s) for x in blocksize_algs], generator.generateMTX(23, 23, 52), delta_sp))
 kernels.append(generator.SparseKernelS("sve_single_prec_test_S8", 23, 31, 13, 23, 0, 23, 2.0, 0.0, [x.getBlocksize(23, 31, 1, v_size_s) for x in blocksize_algs], generator.generateMTX(13, 31, 40), delta_sp))
 
-generator.make(kernels, "arm_sve")
+generator.make(kernels, f"arm_sve{bitlen}")
