Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Provide SSR runtime APIs as global symbols #33

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 30 additions & 29 deletions kernels/ssum/8x16xf32/Makefile
Original file line number Diff line number Diff line change
@@ -1,28 +1,29 @@
.DEFAULT_GOAL := all
# .DEFAULT_GOAL := all

include ../../../snitch/Makefile.rules

PRES =
PRES += pres_0_llvm.x
PRES += pres_1_llvm_clean.x
PRES += pres_2_vectorized.x
PRES += pres_3_ssr_loop.x
PRES += pres_4_ssr_frep.x

TESTS = $(PRES)
TESTS += baseline.x
TESTS += noalias.x
TESTS += ssr1d.x
TESTS += ssr1d_frep1d.x
TESTS += ssr2d.x
TESTS += linalg.x
TESTS += vector.x
TESTS += scf.x

CFLAGS += -std=gnu11
CFLAGS += -Wall -Wextra

%.x: %.o main.o data.o
# PRES =
# PRES += pres_0_llvm.x
# PRES += pres_1_llvm_clean.x
# PRES += pres_2_vectorized.x
# PRES += pres_3_ssr_loop.x
# PRES += pres_4_ssr_frep.x

# TESTS = $(PRES)
# TESTS += baseline.x
# TESTS += noalias.x
# TESTS += ssr1d.x
# TESTS += ssr1d_frep1d.x
# TESTS += ssr2d.x
# TESTS += linalg.x
# TESTS += vector.x
# TESTS += scf.x
TESTS = test_frep.x

# CFLAGS += -std=gnu11
# CFLAGS += -Wall -Wextra

%.x: %.o main.o data.o $(SSR_RUNTIME_OBJS)
$(LD) $(LDFLAGS) $^ -o $@

sim_%: %
Expand All @@ -33,18 +34,18 @@ RUN = $(addprefix run_, $(TESTS))
$(RUN): run_%: sim_%
mv logs $(subst sim_,,$<).logs

all: $(TESTS)
# all: $(TESTS)

allrun: $(RUN)
# allrun: $(RUN)


RUN_PRES = $(addprefix run_, $(PRES))
$(RUN_PRES): run_%: sim_%
mv logs $(subst sim_,,$<).logs
# RUN_PRES = $(addprefix run_, $(PRES))
# $(RUN_PRES): run_%: sim_%
# mv logs $(subst sim_,,$<).logs

all-pres: $(PRES)
# all-pres: $(PRES)

allrun-pres: $(RUN_PRES)
# allrun-pres: $(RUN_PRES)

clean:
rm -fr *.ll12 *.x *.o *.logs/ logs/
2 changes: 1 addition & 1 deletion kernels/ssum/8x16xf32/test_frep.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ module {
llvm.call @snrt_ssr_enable() : () -> ()

// Inline assembly
llvm.inline_asm has_side_effects "frep.o $0, 1, 0, 0 \n fadd.d ft2, ft0, ft1", "r" %niter_minus_1 : (i32) -> ()
llvm.inline_asm has_side_effects "frep.o $0, 1, 0, 0 \n vfadd.s ft2, ft0, ft1", "r" %niter_minus_1 : (i32) -> ()

llvm.call @snrt_fpu_fence() : () -> ()
llvm.call @snrt_ssr_disable() : () -> ()
Expand Down
5 changes: 5 additions & 0 deletions snitch/Makefile.rules
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,11 @@ MLIROPT = mlir-opt-16
MLIRTRANSLATE = mlir-translate-16
XDSLOPT = xdsl-opt

# FIXME: fix the upstream runtime for SSRs.
# This is a stopgap solution that provides the SSR configuration functions
# as extern symbols with the C ABI.
SSR_RUNTIME_OBJS = $(MAKEFILE_RULES_DIRNAME)/snitch-runtime/ssr_api.o

CFLAGS =
# Mixing .c and .ll files makes some flags, useful for the former,
# unused for the latter (e.g. -I)
Expand Down
105 changes: 105 additions & 0 deletions snitch/snitch-runtime/ssr_api.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
#include "ssr_api.h"

#include <stddef.h>
#include <stdint.h>

/// Write `value` into configuration register `reg` of SSR data mover `dm`.
///
/// The configuration address handed to `scfgw` is the register index shifted
/// left by five bits, OR-ed with the data mover index (so `dm` is expected to
/// fit in the low five bits).
static void write_ssr_cfg(uint32_t reg, uint32_t dm, uint32_t value) {
    const uint32_t reg_field = reg << 5u;
    const uint32_t addr = reg_field | dm;
    asm volatile("scfgw %[value], %[addr]\n"
                 : /* no outputs */
                 : [value] "r"(value), [addr] "r"(addr));
}

/// Synchronize the integer and float pipelines.
///
/// Copies `fa0` into an integer scratch register and then consumes that
/// register with an integer `mv`. The asm is volatile and declares a memory
/// clobber, so the compiler neither drops it nor moves memory accesses
/// across it.
void snrt_fpu_fence(void) {
    // Scratch register only: the first instruction overwrites it before it is
    // ever read, so it is an output-only operand. Binding it as "+r" would
    // make the compiler read the uninitialized variable.
    unsigned tmp;
    asm volatile(
        "fmv.x.w %0, fa0\n"
        "mv %0, %0\n"
        : "=r"(tmp)::"memory");
}

/// Enable SSR.
///
/// Without the LLVM toolchain this sets bit 0 of CSR 0x7C0 via `csrsi`;
/// with it, the dedicated compiler builtin is used instead.
void snrt_ssr_enable(void) {
#if !defined(__TOOLCHAIN_LLVM__)
    asm volatile("csrsi 0x7C0, 1\n");
#else
    __builtin_ssr_enable();
#endif
}

/// Disable SSR.
///
/// Without the LLVM toolchain this clears bit 0 of CSR 0x7C0 via `csrci`;
/// with it, the dedicated compiler builtin is used instead.
void snrt_ssr_disable(void) {
#if !defined(__TOOLCHAIN_LLVM__)
    asm volatile("csrci 0x7C0, 1\n");
#else
    __builtin_ssr_disable();
#endif
}

/// Configure an SSR data mover for a 1D loop nest.
///
/// @param dm  Data mover to configure.
/// @param b0  Loop bound; the value written to the bound register is b0 - 1.
/// @param s0  Loop stride, written unmodified to the stride register.
void snrt_ssr_loop_1d(enum snrt_ssr_dm dm, size_t b0, size_t s0) {
    // The bound register is programmed with the bound minus one.
    --b0;
    write_ssr_cfg(REG_BOUNDS + 0, dm, b0);
    // With a single loop there is no inner loop to compensate for, so the
    // stride is written as-is.
    write_ssr_cfg(REG_STRIDES + 0, dm, s0);
}

/// Configure an SSR data mover for a 2D loop nest.
///
/// All bound registers are written first, then the stride registers. Each
/// bound is written as (bound - 1). The stride of loop 1 is written relative
/// to the distance loop 0 has advanced, i.e. s1 - s0 * (b0 - 1).
///
/// @param dm      Data mover to configure.
/// @param b0, b1  Loop bounds for loop indices 0 and 1.
/// @param s0, s1  Loop strides for loop indices 0 and 1.
void snrt_ssr_loop_2d(enum snrt_ssr_dm dm, size_t b0, size_t b1, size_t s0, size_t s1) {
    --b0;
    --b1;
    write_ssr_cfg(REG_BOUNDS + 0, dm, b0);
    write_ssr_cfg(REG_BOUNDS + 1, dm, b1);

    // Distance advanced by loop 0 (b0 is already decremented here).
    const size_t inner_span = s0 * b0;
    write_ssr_cfg(REG_STRIDES + 0, dm, s0);
    write_ssr_cfg(REG_STRIDES + 1, dm, s1 - inner_span);
}

/// Configure an SSR data mover for a 3D loop nest.
///
/// All bound registers are written first, then the stride registers. Each
/// bound is written as (bound - 1). The stride of loop i is written relative
/// to the distance advanced by all loops below it, i.e. minus the sum of
/// s_j * (b_j - 1) for j < i.
///
/// @param dm          Data mover to configure.
/// @param b0, b1, b2  Loop bounds for loop indices 0 to 2.
/// @param s0, s1, s2  Loop strides for loop indices 0 to 2.
void snrt_ssr_loop_3d(enum snrt_ssr_dm dm, size_t b0, size_t b1, size_t b2, size_t s0,
                      size_t s1, size_t s2) {
    --b0;
    --b1;
    --b2;
    write_ssr_cfg(REG_BOUNDS + 0, dm, b0);
    write_ssr_cfg(REG_BOUNDS + 1, dm, b1);
    write_ssr_cfg(REG_BOUNDS + 2, dm, b2);

    // Running distance advanced by the loops configured so far (bounds are
    // already decremented here).
    size_t inner_span = 0;
    write_ssr_cfg(REG_STRIDES + 0, dm, s0 - inner_span);
    inner_span += s0 * b0;
    write_ssr_cfg(REG_STRIDES + 1, dm, s1 - inner_span);
    inner_span += s1 * b1;
    write_ssr_cfg(REG_STRIDES + 2, dm, s2 - inner_span);
}

/// Configure an SSR data mover for a 4D loop nest.
///
/// All bound registers are written first, then the stride registers. Each
/// bound is written as (bound - 1). The stride of loop i is written relative
/// to the distance advanced by all loops below it, i.e. minus the sum of
/// s_j * (b_j - 1) for j < i.
///
/// @param dm              Data mover to configure.
/// @param b0, b1, b2, b3  Loop bounds for loop indices 0 to 3.
/// @param s0, s1, s2, s3  Loop strides for loop indices 0 to 3.
void snrt_ssr_loop_4d(enum snrt_ssr_dm dm, size_t b0, size_t b1, size_t b2, size_t b3,
                      size_t s0, size_t s1, size_t s2, size_t s3) {
    --b0;
    --b1;
    --b2;
    --b3;
    write_ssr_cfg(REG_BOUNDS + 0, dm, b0);
    write_ssr_cfg(REG_BOUNDS + 1, dm, b1);
    write_ssr_cfg(REG_BOUNDS + 2, dm, b2);
    write_ssr_cfg(REG_BOUNDS + 3, dm, b3);

    // Running distance advanced by the loops configured so far (bounds are
    // already decremented here).
    size_t inner_span = 0;
    write_ssr_cfg(REG_STRIDES + 0, dm, s0 - inner_span);
    inner_span += s0 * b0;
    write_ssr_cfg(REG_STRIDES + 1, dm, s1 - inner_span);
    inner_span += s1 * b1;
    write_ssr_cfg(REG_STRIDES + 2, dm, s2 - inner_span);
    inner_span += s2 * b2;
    write_ssr_cfg(REG_STRIDES + 3, dm, s3 - inner_span);
}

/// Configure the repetition count for a stream.
///
/// The repeat register of data mover `dm` is written with `count - 1`.
void snrt_ssr_repeat(enum snrt_ssr_dm dm, size_t count) {
    const size_t encoded_count = count - 1;
    write_ssr_cfg(REG_REPEAT, dm, encoded_count);
}

/// Start a streaming read.
///
/// Writes the address `ptr` into the read-pointer register selected by `dim`
/// on data mover `dm`.
void snrt_ssr_read(enum snrt_ssr_dm dm, enum snrt_ssr_dim dim, volatile void *ptr) {
    const uintptr_t base = (uintptr_t)ptr;
    write_ssr_cfg(REG_RPTR + dim, dm, base);
}

/// Start a streaming write.
///
/// Writes the address `ptr` into the write-pointer register selected by `dim`
/// on data mover `dm`.
void snrt_ssr_write(enum snrt_ssr_dm dm, enum snrt_ssr_dim dim, volatile void *ptr) {
    const uintptr_t base = (uintptr_t)ptr;
    write_ssr_cfg(REG_WPTR + dim, dm, base);
}
68 changes: 68 additions & 0 deletions snitch/snitch-runtime/ssr_api.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
#ifndef SNRT_SSR_API_H
#define SNRT_SSR_API_H

// Include guard: without it, including this header twice in one translation
// unit redefines the enums below and fails to compile.

#include <stddef.h>
#include <stdint.h>

/// The different SSR data movers.
enum snrt_ssr_dm {
    SNRT_SSR_DM0 = 0,
    SNRT_SSR_DM1 = 1,
    SNRT_SSR_DM2 = 2,
    // To write to all SSRs, use index 31
    SNRT_SSR_DM_ALL = 31,
};

/// The different dimensions.
enum snrt_ssr_dim {
    SNRT_SSR_1D = 0,
    SNRT_SSR_2D = 1,
    SNRT_SSR_3D = 2,
    SNRT_SSR_4D = 3,
};

/// The SSR configuration registers.
enum {
    REG_STATUS = 0,
    REG_REPEAT = 1,
    REG_BOUNDS = 2,   // + loop index
    REG_STRIDES = 6,  // + loop index
    REG_RPTR = 24,    // + snrt_ssr_dim
    REG_WPTR = 28,    // + snrt_ssr_dim
};

/// Synchronize the integer and float pipelines.
void snrt_fpu_fence(void);

/// Enable SSR.
void snrt_ssr_enable(void);

/// Disable SSR.
void snrt_ssr_disable(void);

/// Configure an SSR data mover for a 1D loop nest.
/// b0: bound (limit) of the loop
/// s0: increment size of the loop
void snrt_ssr_loop_1d(enum snrt_ssr_dm dm, size_t b0, size_t s0);

/// Configure an SSR data mover for a 2D loop nest.
/// b0, b1: bounds (limits) for loop indices 0 and 1
/// s0, s1: increment sizes for loop indices 0 and 1
void snrt_ssr_loop_2d(enum snrt_ssr_dm dm, size_t b0, size_t b1,
                      size_t s0, size_t s1);

/// Configure an SSR data mover for a 3D loop nest.
/// b0..b2: bounds (limits) for loop indices 0 to 2
/// s0..s2: increment sizes for loop indices 0 to 2
void snrt_ssr_loop_3d(enum snrt_ssr_dm dm, size_t b0, size_t b1,
                      size_t b2, size_t s0, size_t s1, size_t s2);

/// Configure an SSR data mover for a 4D loop nest.
/// b0: Inner-most bound (limit of loop)
/// b3: Outer-most bound (limit of loop)
/// s0: increment size of inner-most loop
void snrt_ssr_loop_4d(enum snrt_ssr_dm dm, size_t b0, size_t b1,
                      size_t b2, size_t b3, size_t s0, size_t s1,
                      size_t s2, size_t s3);

/// Configure the repetition count for a stream.
void snrt_ssr_repeat(enum snrt_ssr_dm dm, size_t count);

/// Start a streaming read.
void snrt_ssr_read(enum snrt_ssr_dm dm, enum snrt_ssr_dim dim,
                   volatile void *ptr);

/// Start a streaming write.
void snrt_ssr_write(enum snrt_ssr_dm dm, enum snrt_ssr_dim dim,
                    volatile void *ptr);

#endif  // SNRT_SSR_API_H