Merge pull request #7 from SeisSol/davschneller/sve-multisize
SVE for 128, 256, 1024, and 2048 Bits
krenzland authored Sep 1, 2023
2 parents 5af38e4 + 9bd7d93 commit ad30f48
Showing 10 changed files with 134 additions and 35 deletions.
4 changes: 3 additions & 1 deletion .gitignore
@@ -2,8 +2,10 @@ __pycache__/
 
 # Testing (for now)
 tests/arm/
-tests/arm_sve/
+tests/arm_sve*/
 tests/hsw/
 tests/knl/
 tests/testsuite.cpp
+tests/arm_*testsuite.cpp
+tests/sve*-test
 a.out
2 changes: 1 addition & 1 deletion README.md
@@ -5,6 +5,6 @@ Currently Intel Xeon Phi 'Knights Landing' (AVX512), Haswell/Zen2 (AVX2), and ARM
 
 Usage:
 
-./pspamm M N K LDA LDB LDC ALPHA BETA --arch {arm,arm_sve,knl,hsw}
+./pspamm M N K LDA LDB LDC ALPHA BETA --arch {arm,arm_sve{128,256,512,1024,2048},knl,hsw}
 
 --mtx_filename MTX_FILE_PATH --output_funcname FUNCTION_NAME --output_filename OUTPUT_NAME
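
For example, a hypothetical invocation for a 256-bit SVE target (the size and scalar arguments are made up; the flags are those from the usage line above):

    ./pspamm 8 8 8 8 8 8 1.0 0.0 --arch arm_sve256 --output_funcname gemm --output_filename gemm.h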
3 changes: 2 additions & 1 deletion codegen/architectures/arm/generator.py
@@ -30,7 +30,8 @@ class Generator(AbstractGenerator):
     pspamm_num_total_flops += {flop};
 #endif
-}}}};"""
+}}}};
+"""
 
     def get_v_size(self):
         if self.precision == Precision.DOUBLE:
31 changes: 17 additions & 14 deletions codegen/architectures/arm_sve/generator.py
@@ -29,17 +29,19 @@ class Generator(AbstractGenerator):
     pspamm_num_total_flops += {flop};
 #endif
-}}}}}}}};"""
+}}}}}}}};
+"""
 
     prefetch_reg = None
     prefetch_count = 0
     is_sparse = False
+    v_len = 4 # vector register length: v_len * 128 bit
 
     def get_v_size(self):
         if self.precision == Precision.DOUBLE:
-            return 8
+            return 2 * self.v_len # 128 bit == 2 x 64 bit (double)
         elif self.precision == Precision.SINGLE:
-            return 16
+            return 4 * self.v_len # 128 bit == 4 x 32 bit (float)
         raise NotImplementedError
 
     def get_precision(self):
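
For intuition, the vector-size arithmetic above can be run standalone; a minimal sketch, assuming only what the diff states (an SVE register holds v_len * 128 bits; the function name is illustrative):

```python
# Sketch of get_v_size above: lanes per SVE register for a given v_len.
def v_size(v_len: int, precision: str) -> int:
    if precision == 'd':
        return 2 * v_len  # 128 bit == 2 x 64 bit (double)
    elif precision == 's':
        return 4 * v_len  # 128 bit == 4 x 32 bit (float)
    raise NotImplementedError

assert v_size(4, 'd') == 8    # 512-bit SVE (A64FX): the old hard-coded value
assert v_size(4, 's') == 16   # likewise for single precision
assert v_size(16, 'd') == 32  # 2048-bit SVE, double
```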
@@ -55,7 +57,8 @@ def pred_n_trues(self, num_trues: int, v_size: int, suffix: str = None) -> Register
         set to true"""
         assert (num_trues > 0)
         assert (suffix == "m" or suffix == "z" or suffix is None)
-        # we only use p7 or p0 as predicates
+
+        # we only use p7 or p0 as predicates (1 == p0, 8 == p7)
         num_trues = 8 if num_trues >= v_size else 1
 
         if suffix is None:
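
The predicate selection above thus reduces to a two-way pick between the overhead predicate and the all-true predicate; a minimal sketch (function name illustrative):

```python
# Full rows use the all-true predicate p7; a partial remainder row uses the
# overhead predicate p0 (cf. the comment above: 1 == p0, 8 == p7).
def predicate_register(num_trues: int, v_size: int) -> str:
    return "p7" if num_trues >= v_size else "p0"

assert predicate_register(8, 8) == "p7"  # full 512-bit double vector
assert predicate_register(3, 8) == "p0"  # 3-lane remainder
```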
@@ -125,11 +128,11 @@ def init_registers(self,
                        v_size: int
                        ) -> None:
 
-        bm %= v_size
+        bmmod = bm % v_size
 
         eol = "\\n\\t" # define the "end of line" sequence for easy assembly
-        p_suffix = "d" if v_size == 8 else "s" # determine whether predicate suffix is '.d' or '.s'
-        gen_reg = "x" if v_size == 8 else "w" # determine if 'dup' registers are 64 bit or 32 bit
+        p_suffix = "d" if v_size == 2 * self.v_len else "s" # determine whether predicate suffix is '.d' or '.s'
+        gen_reg = "x" if v_size == 2 * self.v_len else "w" # determine if 'dup' registers are 64 bit or 32 bit
         overhead_counter = 6
 
         # https://developer.arm.com/documentation/102374/0101/Registers-in-AArch64---general-purpose-registers
@@ -145,13 +148,13 @@
         # https://developer.arm.com/documentation/ddi0596/2020-12/Shared-Pseudocode/AArch64-Functions?lang=en#impl-aarch64.DecodePredCount.2
         # 'ptrue' doesnt work for initialising overhead predicate when using single precision -> see valid patterns from above
         # overhead = "\"ptrue p0.{suffix}, #{overhead}{eol}\"\n\t" if bm != 0 else "" # define overhead predicate
-        overhead = "\"mov {gen_reg}{overhead_counter}, #{overhead}{eol}\"\n\t\"whilelo p0.{suffix}, {gen_reg}zr, {gen_reg}{overhead_counter}{eol}\"\n\t" if bm != 0 else ""
+        overhead = "\"mov {gen_reg}{overhead_counter}, #{overhead}{eol}\"\n\t\"whilelo p0.{suffix}, {gen_reg}zr, {gen_reg}{overhead_counter}{eol}\"\n\t" if bmmod != 0 else ""
         all_true = "\"ptrue p7.{suffix}, #31{eol}\"" # define all true predicate
         init_registers = (dup_alpha + dup_beta + comment + overhead + all_true).format(suffix=p_suffix,
                                                                                        gen_reg=gen_reg,
                                                                                        overhead_counter=overhead_counter,
                                                                                        v_size=v_size,
-                                                                                       overhead=bm,
+                                                                                       overhead=bmmod,
                                                                                        eol=eol)
 
         # since .format() doesn't allow partial formatting, we need to re-include the
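
Hand-expanding the overhead and all-true templates for, say, bmmod == 3 in single precision gives the following (a sketch; the register number 6 is overhead_counter from above):

```python
# Expand the two assembly templates above by hand for bmmod == 3, single
# precision ('w' general registers, '.s' predicate suffix).
eol = "\\n\\t"
overhead = ('"mov {gen_reg}{ctr}, #{overhead}{eol}" '
            '"whilelo p0.{suffix}, {gen_reg}zr, {gen_reg}{ctr}{eol}"').format(
    gen_reg="w", ctr=6, overhead=3, suffix="s", eol=eol)
all_true = '"ptrue p7.{suffix}, #31{eol}"'.format(suffix="s", eol=eol)
print(overhead)  # "mov w6, #3\n\t" "whilelo p0.s, wzr, w6\n\t"
print(all_true)  # "ptrue p7.s, #31\n\t"
# whilelo activates exactly the first 3 .s lanes of p0;
# ptrue with pattern #31 (ALL) activates every lane of p7.
```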
@@ -188,12 +191,12 @@ def move_register_block(self,
         b_row, b_col, i, _ = cursor.get_block(cursor_ptr, block_offset)
 
         cur11 = 0
-        #TODO: figure out appropriate threshold
-        threshold = 1 if self.is_sparse else 4 # uses whole 256 byte cache line, as one SVE vector = 64 bytes
+        #TODO: figure out appropriate threshold (the 16 // self.v_len may still not be optimal; especially if 16 % self.v_len != 0, e.g. 384 bit)
+        threshold = 1 if self.is_sparse else (16 // self.v_len) # uses whole 256 byte cache line, as one SVE-512 vector = 64 bytes
 
-        # TODO: if another CPU implements SVE at VL != 64 bytes, rewrite mul_vl (maybe do this dynamically)
-        mul_vl = 64 # A64FX has VL of 64 bytes in memory
-        max_mem_ins_mult = 7 # A64FX allows a maximum positive offset of 7 in memory instructions, e.g. ld1d z1.d, p0/z, [x0, 7, MUL VL]
+        # DONE if another CPU implements SVE at VL != 64 bytes, rewrite mul_vl (maybe do this dynamically)
+        mul_vl = 16 * self.v_len # e.g. A64FX has VL of 64 bytes in memory (thus, use v_len==4)
+        max_mem_ins_mult = 7 # A64FX allows a maximum positive offset of 7 in memory instructions, e.g. ld1d z1.d, p0/z, [x0, 7, MUL VL] (TODO: tune, if ever different)
         max_offset = mul_vl * max_mem_ins_mult # ld1d/st1d instruction encodes the immediate offset using 4 bits, multiplies it with MUL VL
 
         prev_disp = 0
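
A quick sketch of the resulting addressing limits, assuming (as the code does) that the in-memory vector length equals the configured register length:

```python
# Offset arithmetic above, evaluated for each supported register length.
# One SVE vector is 16 * v_len bytes; ld1d/st1d immediates are 4-bit signed
# multiples of the vector length (MUL VL), so at most 7 on the positive side.
for v_len in (1, 2, 4, 8, 16):  # 128 ... 2048 bit
    mul_vl = 16 * v_len         # vector length in bytes
    max_offset = mul_vl * 7     # largest positive immediate offset
    print(f"{128 * v_len:4} bit: mul_vl = {mul_vl:3} B, max_offset = {max_offset} B")
# A64FX (512 bit, v_len == 4): mul_vl == 64, max_offset == 448
```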
16 changes: 15 additions & 1 deletion matmul.py
@@ -98,6 +98,16 @@ def __init__(self,
 
         if arch == 'skx':
             arch = 'knl'
+
+        # hacky implementation of multi-register length
+        if arch.startswith('arm_sve'):
+            if len(arch) == 7:
+                v_len_regs = 4 # compatibility: arm_sve == arm_sve512
+            else:
+                v_len_bits = int(arch[7:])
+                assert v_len_bits % 128 == 0 and v_len_bits <= 2048
+                v_len_regs = v_len_bits // 128
+            arch = 'arm_sve'
 
         self.arch = arch
         assert precision.lower() in ['s', 'd']
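
Standalone, the arch-string parsing added here behaves as follows (a sketch; the helper name is illustrative):

```python
# Mirror of the parsing logic above: split 'arm_sveNNNN' into the base arch
# and the register length in units of 128 bits.
def parse_sve_arch(arch: str):
    if arch.startswith('arm_sve'):
        if len(arch) == 7:   # bare 'arm_sve'
            v_len_regs = 4   # compatibility: arm_sve == arm_sve512
        else:
            v_len_bits = int(arch[7:])
            assert v_len_bits % 128 == 0 and v_len_bits <= 2048
            v_len_regs = v_len_bits // 128
        return 'arm_sve', v_len_regs
    return arch, None

assert parse_sve_arch('arm_sve') == ('arm_sve', 4)
assert parse_sve_arch('arm_sve1024') == ('arm_sve', 8)
```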
@@ -110,10 +120,14 @@
 
         self.generator = architecture.Generator(self.precision)
 
-        self.v_size = self.generator.get_v_size()
         # flag that determines if a matmul kernel uses sve instructions -> needed for sve predicates
         self.is_sve = arch == "arm_sve"
 
+        if self.is_sve:
+            self.generator.v_len = v_len_regs
+
+        self.v_size = self.generator.get_v_size()
+
         if bk == None:
             bk = 2 if arch == 'knl' else 1
28 changes: 28 additions & 0 deletions tests/README.md
@@ -41,3 +41,31 @@ If nothing breaks, the generated testsuite reports the number of successful tests
2. Adjust the Makefile as needed and compile the generated ```sve_testsuite.cpp``` by calling
```make sve_testsuite```
3. Run the compiled executable with ```./sve_testsuite```

#### Notes on Running SVE with QEMU user-static

Run `runall-sve.sh`, which already tests a number of configurations.

For a bit length `BITLEN`, it executes the following commands:
```
# generate tests
python unit_tests_arm_sve.py $BITLEN
# compile: we use Armv8.2 and SVE; the SVE vector length is set explicitly
aarch64-linux-gnu-g++ -static -march=armv8.2-a+sve -msve-vector-bits=${BITLEN} arm_sve${BITLEN}_testsuite.cpp -o sve${BITLEN}-test
# run under QEMU so the tests also run on x86-64; enable all CPU features and cap the SVE register length at ${BITLEN} bits (cf. https://qemu-project.gitlab.io/qemu/system/arm/cpu-features.html); sve-default-vector-length=-1 is needed for 1024 and 2048 bit SVE to work correctly (otherwise, QEMU assumes a 512 bit maximum)
qemu-aarch64-static -cpu max,sve${BITLEN}=on,sve-default-vector-length=-1 ./sve${BITLEN}-test
```


For debugging, e.g. with vector length 512 (cf. https://mariokartwii.com/showthread.php?tid=1998 ):
```
aarch64-linux-gnu-g++ -g -ggdb -static -march=armv9-a+sve -msve-vector-bits=512 sve_testsuite.cpp
qemu-aarch64-static -g 1234 -cpu max,sve512=on ./a.out
```
(here, 1234 is the gdb port and a.out the binary name)

In a separate window, run `aarch64-linux-gnu-gdb --ex "target remote localhost:1234" --ex "file a.out"`.
The two extra commands connect gdb to QEMU and load the compiled binary, so method names etc. are resolved correctly.
To run the program, type `continue`. You may want to set breakpoints etc. before doing so.
16 changes: 16 additions & 0 deletions tests/runall-sve.sh
@@ -0,0 +1,16 @@
#!/usr/bin/env bash
# maybe do PYTHONPATH=$(pwd)/..:$PYTHONPATH

echo "SVE GEMM test. Right now, we do not test all multiples of 128 bit. Mostly powers of two, since gcc may not support others."

for BITLEN in 128 256 512 1024 2048
do
echo ""
echo ""
echo "Testing $BITLEN bit SVE register GEMM"
python unit_tests_arm_sve.py $BITLEN
aarch64-linux-gnu-g++ -static -march=armv8.2-a+sve -msve-vector-bits=${BITLEN} arm_sve${BITLEN}_testsuite.cpp -o sve${BITLEN}-test
qemu-aarch64-static -cpu max,sve${BITLEN}=on,sve-default-vector-length=-1 ./sve${BITLEN}-test
done

echo "All tests done. Bye!"
39 changes: 31 additions & 8 deletions tests/sve_testsuite_generator.py
@@ -3,7 +3,7 @@
 import numpy as np
 import random
 import sys
-import os.path
+import os
 import testsuite_generator as test_generator
 
 SparseKernel = namedtuple('SparseKernel', 'name m n k lda ldb ldc alpha beta block_sizes mtx delta')
@@ -14,7 +14,7 @@
 
 setup_prefetching = """
 template <typename T>
-void setup_prefetch(T* prefetch, T* matrix, unsigned n, unsigned ldc) {
+void setup_prefetch(T*& prefetch, T* matrix, unsigned n, unsigned ldc) {
   posix_memalign(reinterpret_cast<void **>(&prefetch), 64, ldc*n*sizeof(T));
   std::memcpy(prefetch, matrix, ldc*n*sizeof(T));
 }
@@ -24,7 +24,11 @@ def generateMTX(k, n, nnz):
     return test_generator.generateMTX(k, n, nnz)
 
 def make(kernels, arch):
-    f = open('sve_testsuite.cpp', 'w')
+
+    f = open(f'{arch}_testsuite.cpp', 'w')
+
+    if not os.path.exists(arch):
+        os.mkdir(arch)
 
     f.write(test_generator.head_of_testsuite)
@@ -52,11 +56,16 @@ def make(kernels, arch):
                 assert (bm % 8 == 0 and (bn + 1) * (bm / 8) <= 32)
             elif arch == "arm":
                 assert (bm % 2 == 0 and (bn + 1) * (bm / 2) + bn <= 32)
-            elif arch == "arm_sve":
-                # this is for A64fx only, with SVE_vector_bits = 512
-                v_len = 8 if prec == 'd' else 16
+            elif arch.startswith("arm_sve"):
+                veclen = int(arch[7:])
+                assert veclen % 128 == 0 and veclen <= 2048
+                reglen = veclen // 128
+                v_len = 2 * reglen if prec == 'd' else 4 * reglen
                 # this should be the same assertion as in ../scripts/max_arm_sve.py
-                assert ((bn + 1) * (bm / v_len) + bn <= 32)
+                bk = 1
+                if not ((bn + bk) * (bm / v_len) + bn * bk + 2 <= 32):
+                    print(f'Skipping block size {bm}x{bn} for {arch}')
+                    continue
 
             name = kern.name + '_' + str(bm) + '_' + str(bn)

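The register-budget check added above can be exercised on its own; in the sketch below the inequality is copied from the diff, while the sample sizes and the reading of the individual terms are assumptions:

```python
# Roughly: (bn + bk) columns of bm/v_len vector registers for the C and A
# tiles, bn * bk broadcast registers for B, plus 2 extra registers, out of
# the 32 SVE vector registers.
def fits_register_budget(bm: int, bn: int, v_len_bits: int, prec: str = 'd', bk: int = 1) -> bool:
    reglen = v_len_bits // 128
    v_len = 2 * reglen if prec == 'd' else 4 * reglen
    return (bn + bk) * (bm / v_len) + bn * bk + 2 <= 32

assert fits_register_budget(8, 3, 512)       # small block, fits easily
assert not fits_register_budget(64, 8, 128)  # too large for 128-bit SVE
```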
@@ -83,14 +92,28 @@ def make(kernels, arch):
   double* prefetch;
   float* fprefetch;
   """)
 
     for kern in kernels:
 
         block_sizes = list(set(kern.block_sizes))
 
         for bs in block_sizes:
             bm = bs[0]
             bn = bs[1]
 
+            prec = 's' if isinstance(kern, SparseKernelS) or isinstance(kern, DenseKernelS) else 'd'
+
+            if arch.startswith("arm_sve"):
+                veclen = int(arch[7:])
+                assert veclen % 128 == 0 and veclen <= 2048
+                reglen = veclen // 128
+                v_len = 2 * reglen if prec == 'd' else 4 * reglen
+                # this should be the same assertion as in ../scripts/max_arm_sve.py
+                bk = 1
+                if not ((bn + bk) * (bm / v_len) + bn * bk + 2 <= 32):
+                    # print(f'Skipping block size {bm}x{bn} for {arch}')
+                    continue
 
             name = kern.name + '_' + str(bm) + '_' + str(bn)
 
             if isinstance(kern, SparseKernel) or isinstance(kern, SparseKernelS):
12 changes: 8 additions & 4 deletions tests/testsuite_generator.py
@@ -125,12 +125,15 @@
   gemm_ref(M, N, K, LDA, *LDB, LDC, *ALPHA, *BETA, A, B, Cref);
 
-  for(int i = 0; i < M; i++)
-    for(int j = 0; j < N; j++)
+  for(int i = 0; i < M; i++) {
+    for(int j = 0; j < N; j++) {
       // we use the relative error instead of the absolute error because of an issue we found for sparse single precision
       // kernels presumably due to limited precision of floats
-      if(std::abs((C[i + j * LDC] - Cref[i + j * LDC])) / Cref[i + j * LDC] > DELTA)
+      if(std::abs((C[i + j * LDC] - Cref[i + j * LDC])) / Cref[i + j * LDC] > DELTA) {
         return 0;
+      }
+    }
+  }
 
   return 1;
 }
@@ -177,7 +180,8 @@
   printf("\\n%i out of %lu test successful!\\n", correct, results.size());
   return 0;
-}"""
+}
+"""


def generateMTX(k, n, nnz):
18 changes: 13 additions & 5 deletions tests/unit_tests_arm_sve.py
@@ -4,15 +4,23 @@
 
 import scripts.max_arm_sve as max_sve
 
+import sys
+
+v_len = 4
+
+if len(sys.argv) == 2:
+    v_len = int(sys.argv[1]) // 128
+
 blocksize_algs = [max_sve]
-v_size = 8
-v_size_s = 16
+v_size = 2 * v_len
+v_size_s = 4 * v_len
+bitlen = v_len * 128
 kernels = []
 
 # define the maximum allowed difference between elements of our solution and the reference solution for
 # double and single precision
-delta_sp = 1e-6
-delta_dp = 1e-7
+delta_sp = 1e-4 # epsilon is around e-7 => /2 ... For most cases, 1e-6 is enough
+delta_dp = 1e-7 # epsilon is around e-15 => /2
 
 # test cases for double precision multiplication
 kernels.append(generator.DenseKernel("sve_mixed_test1", 9, 9, 9, 9, 9, 9, 1.0, 0.0, [(3, 3)] + [x.getBlocksize(9, 9, 1, v_size) for x in blocksize_algs], delta_dp))
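
For reference, the machine epsilons these tolerance comments allude to (a quick check, not part of the test suite):

```python
import numpy as np
print(np.finfo(np.float32).eps)  # ~1.19e-07, so delta_sp = 1e-4 is generous
print(np.finfo(np.float64).eps)  # ~2.22e-16, so delta_dp = 1e-7 is comfortable
```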
@@ -57,4 +65,4 @@
 kernels.append(generator.SparseKernelS("sve_single_prec_test_S7", 23, 23, 23, 23, 0, 23, 1.5, -0.66, [x.getBlocksize(23, 23, 1, v_size_s) for x in blocksize_algs], generator.generateMTX(23, 23, 52), delta_sp))
 kernels.append(generator.SparseKernelS("sve_single_prec_test_S8", 23, 31, 13, 23, 0, 23, 2.0, 0.0, [x.getBlocksize(23, 31, 1, v_size_s) for x in blocksize_algs], generator.generateMTX(13, 31, 40), delta_sp))
 
-generator.make(kernels, "arm_sve")
+generator.make(kernels, f"arm_sve{bitlen}")
