diff --git a/kernels/ssum/8x16xf32/Makefile b/kernels/ssum/8x16xf32/Makefile index 5881c78f..5f251787 100644 --- a/kernels/ssum/8x16xf32/Makefile +++ b/kernels/ssum/8x16xf32/Makefile @@ -1,28 +1,29 @@ -.DEFAULT_GOAL := all +# .DEFAULT_GOAL := all include ../../../snitch/Makefile.rules -PRES = -PRES += pres_0_llvm.x -PRES += pres_1_llvm_clean.x -PRES += pres_2_vectorized.x -PRES += pres_3_ssr_loop.x -PRES += pres_4_ssr_frep.x - -TESTS = $(PRES) -TESTS += baseline.x -TESTS += noalias.x -TESTS += ssr1d.x -TESTS += ssr1d_frep1d.x -TESTS += ssr2d.x -TESTS += linalg.x -TESTS += vector.x -TESTS += scf.x - -CFLAGS += -std=gnu11 -CFLAGS += -Wall -Wextra - -%.x: %.o main.o data.o +# PRES = +# PRES += pres_0_llvm.x +# PRES += pres_1_llvm_clean.x +# PRES += pres_2_vectorized.x +# PRES += pres_3_ssr_loop.x +# PRES += pres_4_ssr_frep.x + +# TESTS = $(PRES) +# TESTS += baseline.x +# TESTS += noalias.x +# TESTS += ssr1d.x +# TESTS += ssr1d_frep1d.x +# TESTS += ssr2d.x +# TESTS += linalg.x +# TESTS += vector.x +# TESTS += scf.x +TESTS = test_frep.x + +# CFLAGS += -std=gnu11 +# CFLAGS += -Wall -Wextra + +%.x: %.o main.o data.o $(SSR_RUNTIME_OBJS) $(LD) $(LDFLAGS) $^ -o $@ sim_%: % @@ -33,18 +34,18 @@ RUN = $(addprefix run_, $(TESTS)) $(RUN): run_%: sim_% mv logs $(subst sim_,,$<).logs -all: $(TESTS) +# all: $(TESTS) -allrun: $(RUN) +# allrun: $(RUN) -RUN_PRES = $(addprefix run_, $(PRES)) -$(RUN_PRES): run_%: sim_% - mv logs $(subst sim_,,$<).logs +# RUN_PRES = $(addprefix run_, $(PRES)) +# $(RUN_PRES): run_%: sim_% +# mv logs $(subst sim_,,$<).logs -all-pres: $(PRES) +# all-pres: $(PRES) -allrun-pres: $(RUN_PRES) +# allrun-pres: $(RUN_PRES) clean: rm -fr *.ll12 *.x *.o *.logs/ logs/ diff --git a/kernels/ssum/8x16xf32/test_frep.mlir b/kernels/ssum/8x16xf32/test_frep.mlir index 8d45e494..73804124 100644 --- a/kernels/ssum/8x16xf32/test_frep.mlir +++ b/kernels/ssum/8x16xf32/test_frep.mlir @@ -27,7 +27,7 @@ module { llvm.call @snrt_ssr_enable() : () -> () // Inline assembly - 
llvm.inline_asm has_side_effects "frep.o $0, 1, 0, 0 \n fadd.d ft2, ft0, ft1", "r" %niter_minus_1 : (i32) -> () + llvm.inline_asm has_side_effects "frep.o $0, 1, 0, 0 \n vfadd.s ft2, ft0, ft1", "r" %niter_minus_1 : (i32) -> () llvm.call @snrt_fpu_fence() : () -> () llvm.call @snrt_ssr_disable() : () -> ()
diff --git a/snitch/Makefile.rules b/snitch/Makefile.rules index 0052aee8..816dfb88 100644 --- a/snitch/Makefile.rules +++ b/snitch/Makefile.rules @@ -12,6 +12,11 @@ MLIROPT = mlir-opt-16 MLIRTRANSLATE = mlir-translate-16 XDSLOPT = xdsl-opt +# FIXME fix upstream runtime for SSRs +# This is a stopgap solution to provide SSRs configuration functions +# as extern C abi symbols +SSR_RUNTIME_OBJS = $(MAKEFILE_RULES_DIRNAME)/snitch-runtime/ssr_api.o + CFLAGS = # Mixing .c and .ll files makes some flags, useful for the former, # unused for the latter (e.g. -I)
diff --git a/snitch/snitch-runtime/ssr_api.c b/snitch/snitch-runtime/ssr_api.c
new file mode 100644
index 00000000..02f0f857
--- /dev/null
+++ b/snitch/snitch-runtime/ssr_api.c
@@ -0,0 +1,98 @@
+#include "ssr_api.h"
+
+#include <stddef.h>
+#include <stdint.h>
+
+/// Write `value` to configuration register `reg` of SSR data mover `dm`.
+/// The register index is shifted above the low five bits, which carry the
+/// data mover index, to form the `scfgw` address operand.
+static void write_ssr_cfg(uint32_t reg, uint32_t dm, uint32_t value) {
+    uint32_t addr = (reg << 5u) | dm;
+    asm volatile(
+        "scfgw %[value], %[addr]\n"
+        ::[value] "r"(value), [addr] "r"(addr));
+}
+
+/// Program the bounds and strides of a `dims`-deep loop nest on data mover
+/// `dm`. Index 0 of `bounds` and `strides` is the inner-most loop.
+static void write_ssr_loop(enum snrt_ssr_dm dm, size_t dims,
+                           const size_t *bounds, const size_t *strides) {
+    // Each bound register receives its bound minus one.
+    for (size_t i = 0; i < dims; ++i) {
+        write_ssr_cfg(REG_BOUNDS + i, dm, bounds[i] - 1);
+    }
+    // Each stride register receives its stride minus the accumulated
+    // stride * (bound - 1) of all the inner loops.
+    size_t covered = 0;
+    for (size_t i = 0; i < dims; ++i) {
+        write_ssr_cfg(REG_STRIDES + i, dm, strides[i] - covered);
+        covered += strides[i] * (bounds[i] - 1);
+    }
+}
+
+void snrt_fpu_fence(void) {
+    // Initialised because the "+r" constraint makes the asm statement read
+    // `tmp` as well as write it.
+    unsigned tmp = 0;
+    asm volatile(
+        "fmv.x.w %0, fa0\n"
+        "mv %0, %0\n"
+        : "+r"(tmp)::"memory");
+}
+
+void snrt_ssr_enable(void) {
+#ifdef __TOOLCHAIN_LLVM__
+    __builtin_ssr_enable();
+#else
+    asm volatile("csrsi 0x7C0, 1\n");
+#endif
+}
+
+void snrt_ssr_disable(void) {
+#ifdef __TOOLCHAIN_LLVM__
+    __builtin_ssr_disable();
+#else
+    asm volatile("csrci 0x7C0, 1\n");
+#endif
+}
+
+void snrt_ssr_loop_1d(enum snrt_ssr_dm dm, size_t b0, size_t s0) {
+    const size_t bounds[] = {b0};
+    const size_t strides[] = {s0};
+    write_ssr_loop(dm, 1, bounds, strides);
+}
+
+void snrt_ssr_loop_2d(enum snrt_ssr_dm dm, size_t b0, size_t b1, size_t s0,
+                      size_t s1) {
+    const size_t bounds[] = {b0, b1};
+    const size_t strides[] = {s0, s1};
+    write_ssr_loop(dm, 2, bounds, strides);
+}
+
+void snrt_ssr_loop_3d(enum snrt_ssr_dm dm, size_t b0, size_t b1, size_t b2,
+                      size_t s0, size_t s1, size_t s2) {
+    const size_t bounds[] = {b0, b1, b2};
+    const size_t strides[] = {s0, s1, s2};
+    write_ssr_loop(dm, 3, bounds, strides);
+}
+
+void snrt_ssr_loop_4d(enum snrt_ssr_dm dm, size_t b0, size_t b1, size_t b2,
+                      size_t b3, size_t s0, size_t s1, size_t s2, size_t s3) {
+    const size_t bounds[] = {b0, b1, b2, b3};
+    const size_t strides[] = {s0, s1, s2, s3};
+    write_ssr_loop(dm, 4, bounds, strides);
+}
+
+void snrt_ssr_repeat(enum snrt_ssr_dm dm, size_t count) {
+    write_ssr_cfg(REG_REPEAT, dm, count - 1);
+}
+
+void snrt_ssr_read(enum snrt_ssr_dm dm, enum snrt_ssr_dim dim,
+                   volatile void *ptr) {
+    write_ssr_cfg(REG_RPTR + dim, dm, (uintptr_t)ptr);
+}
+
+void snrt_ssr_write(enum snrt_ssr_dm dm, enum snrt_ssr_dim dim,
+                    volatile void *ptr) {
+    write_ssr_cfg(REG_WPTR + dim, dm, (uintptr_t)ptr);
+}
diff --git a/snitch/snitch-runtime/ssr_api.h b/snitch/snitch-runtime/ssr_api.h
new file mode 100644
index 00000000..bf741299
--- /dev/null
+++ b/snitch/snitch-runtime/ssr_api.h
@@ -0,0 +1,76 @@
+#ifndef SSR_API_H
+#define SSR_API_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+/// The different SSR data movers.
+enum snrt_ssr_dm {
+    SNRT_SSR_DM0 = 0,
+    SNRT_SSR_DM1 = 1,
+    SNRT_SSR_DM2 = 2,
+    // To write to all SSRs, use index 31
+    SNRT_SSR_DM_ALL = 31,
+};
+
+/// The different dimensions.
+enum snrt_ssr_dim {
+    SNRT_SSR_1D = 0,
+    SNRT_SSR_2D = 1,
+    SNRT_SSR_3D = 2,
+    SNRT_SSR_4D = 3,
+};
+
+/// The SSR configuration registers.
+enum {
+    REG_STATUS = 0,
+    REG_REPEAT = 1,
+    REG_BOUNDS = 2,   // + loop index
+    REG_STRIDES = 6,  // + loop index
+    REG_RPTR = 24,    // + snrt_ssr_dim
+    REG_WPTR = 28,    // + snrt_ssr_dim
+};
+
+/// Synchronize the integer and float pipelines.
+void snrt_fpu_fence(void);
+
+/// Enable SSR.
+void snrt_ssr_enable(void);
+
+/// Disable SSR.
+void snrt_ssr_disable(void);
+
+/// Configure an SSR data mover for a 1D loop nest.
+/// b0: bound (limit of loop)
+/// s0: increment size of the loop
+void snrt_ssr_loop_1d(enum snrt_ssr_dm dm, size_t b0, size_t s0);
+
+/// Configure an SSR data mover for a 2D loop nest.
+/// b0/s0 describe the inner-most loop, b1/s1 the outer-most loop.
+void snrt_ssr_loop_2d(enum snrt_ssr_dm dm, size_t b0, size_t b1, size_t s0,
+                      size_t s1);
+
+/// Configure an SSR data mover for a 3D loop nest.
+/// b0/s0 describe the inner-most loop, b2/s2 the outer-most loop.
+void snrt_ssr_loop_3d(enum snrt_ssr_dm dm, size_t b0, size_t b1, size_t b2,
+                      size_t s0, size_t s1, size_t s2);
+
+/// Configure an SSR data mover for a 4D loop nest.
+/// b0: Inner-most bound (limit of loop)
+/// b3: Outer-most bound (limit of loop)
+/// s0: increment size of inner-most loop
+void snrt_ssr_loop_4d(enum snrt_ssr_dm dm, size_t b0, size_t b1, size_t b2,
+                      size_t b3, size_t s0, size_t s1, size_t s2, size_t s3);
+
+/// Configure the repetition count for a stream.
+void snrt_ssr_repeat(enum snrt_ssr_dm dm, size_t count);
+
+/// Start a streaming read.
+void snrt_ssr_read(enum snrt_ssr_dm dm, enum snrt_ssr_dim dim,
+                   volatile void *ptr);
+
+/// Start a streaming write.
+void snrt_ssr_write(enum snrt_ssr_dm dm, enum snrt_ssr_dim dim,
+                    volatile void *ptr);
+
+#endif  // SSR_API_H