Skip to content

Commit

Permalink
Merge pull request #15 from RWTH-ACS/share-object-support
Browse files Browse the repository at this point in the history
add several new features that are required for pytorch support

- shared object support
- decode fatbinary
- extract cubin
- send cubin to server
- add registry for transferred cubins and kernel functions so Cricket is able to identify them when launching kernels
switch over old kernel launching functionality to always use the new registry instead of relying on kernel locations being the same on client and server
- use libelf to read kernel information instead of relying on cuobjdump, which does not support in-memory ELFs
- read parameter information using libelf.
- enable reading CUDA ELFs with debugging information and compressed ELFs
- Test with minimal pytorch (deactivated some features, no kernel compression)
- Test with default pytorch (no kernel compression)
- cuDNN implementation
- cudnnBackend not fully working yet
  • Loading branch information
n-eiling authored Jul 19, 2023
2 parents d277976 + 088b6fc commit bcc5c93
Show file tree
Hide file tree
Showing 53 changed files with 7,667 additions and 719 deletions.
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ build/
.clangd
.project
.cproject
*.code-workspace
.settings/
.vscode/
.directory
Expand Down Expand Up @@ -39,3 +40,7 @@ core.*
compile_commands.json
tags
.gdb_history

# perf data
perf.data
main
47 changes: 28 additions & 19 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ stages:
##############################################################################

# Build docker image
prepare:centos8:docker-dev:
prepare:rocky9:docker-dev:
stage: prepare
script:
- docker build
Expand All @@ -31,13 +31,13 @@ prepare:centos8:docker-dev:
tags:
- docker

prepare:centos8:cuda10:
prepare:centos8:cuda11:
stage: prepare
script:
- docker build
--file utils/Dockerfile.cuda10
--tag ${DOCKER_IMAGE_DEV}_cuda10:${DOCKER_TAG}
--tag ${DOCKER_IMAGE_DEV}_cuda10:latest .
--file utils/Dockerfile.cuda11
--tag ${DOCKER_IMAGE_DEV}_cuda11:${DOCKER_TAG}
--tag ${DOCKER_IMAGE_DEV}_cuda11:latest .
tags:
- docker

Expand All @@ -57,7 +57,7 @@ prepare:centos8:cuda10:

build:
stage: build
needs: ["prepare:centos8:docker-dev"]
needs: ["prepare:rocky9:docker-dev"]
script:
- make -j 32 libtirpc
- make -j 32 cuda-gdb
Expand All @@ -68,6 +68,7 @@ build:
paths:
- bin
- tests/bin
- tests/samples/samples-bin
image: ${DOCKER_IMAGE_DEV}:${DOCKER_TAG}
cache:
paths:
Expand All @@ -82,7 +83,7 @@ build:

build:ib:
stage: build
needs: ["prepare:centos8:docker-dev"]
needs: ["prepare:rocky9:docker-dev"]
script:
- make -j 32 libtirpc
- make -j 32 cuda-gdb
Expand All @@ -108,19 +109,19 @@ build:ib:
tags:
- docker

build:cuda10:
build:cuda11:
stage: build
needs: ["prepare:centos8:cuda10"]
needs: ["prepare:centos8:cuda11"]
script:
- make -j 32 libtirpc
- make -j 32 cuda-gdb
- make -j 1 LOG=INFO
- make -j 1 LOG=INFO NOSAMPLES=yes
artifacts:
expire_in: 1 week
paths:
- bin
- tests/bin
image: ${DOCKER_IMAGE_DEV}_cuda10:${DOCKER_TAG}
image: ${DOCKER_IMAGE_DEV}_cuda11:${DOCKER_TAG}
cache:
paths:
- gpu/build
Expand All @@ -130,13 +131,13 @@ build:cuda10:
- submodules/libtirpc
- submodules/cuda-gdb
- submodules/cuda-gdb-src.rpm
key: build_cuda10
key: build_cuda11
tags:
- docker

build:debug:
stage: build
needs: ["prepare:centos8:docker-dev"]
needs: ["prepare:rocky9:docker-dev"]
script:
- make -j 32 libtirpc
- make -j 32 cuda-gdb
Expand Down Expand Up @@ -170,6 +171,7 @@ build:debug:
LDIR: '$CI_BUILDS_DIR/$CI_PROJECT_PATH/bin'
SAMPLES_PATH: '/usr/local/cuda/samples'
PARAMETER: ''
CHDIR: 'tests'
script:
- mkdir ~/.ssh &&
echo "-----BEGIN OPENSSH PRIVATE KEY-----" > ~/.ssh/id_rsa &&
Expand All @@ -179,9 +181,10 @@ build:debug:
echo $KNOWN_HOSTS > ~/.ssh/known_hosts && chmod 600 ~/.ssh/id_rsa
- ssh $GPU_TARGET mkdir -p $RDIR
- scp -r $LDIR/* $GPU_TARGET:$RDIR/
- ssh $GPU_TARGET "LD_PRELOAD=$RDIR/libtirpc.so.3:$RDIR/cricket-server.so $RDIR/$TEST_BINARY" &
- ssh $GPU_TARGET "LD_PRELOAD=$RDIR/libtirpc.so.3 $RDIR/cricket-rpc-server 255" &
- sleep 2
- REMOTE_GPU_ADDRESS="ghost.acs-lab.eonerc.rwth-aachen.de" PATH=$LDIR:$PATH LD_PRELOAD=$LDIR/libtirpc.so.3:$LDIR/cricket-client.so $LDIR/$TEST_BINARY $PARAMETER
- cd $LDIR/$CHDIR
- CRICKET_RPCID=255 REMOTE_GPU_ADDRESS="ghost.acs-lab.eonerc.rwth-aachen.de" PATH=$LDIR:$PATH LD_PRELOAD=$LDIR/libtirpc.so.3:$LDIR/cricket-client.so ./$TEST_BINARY $PARAMETER
after_script:
- ssh $GPU_TARGET rm -rf $RDIR
- ssh $GPU_TARGET pkill -fe -2 $RDIR/test_kernel
Expand Down Expand Up @@ -216,21 +219,27 @@ test:test_programs(2/2):
test:test_kernel:
extends: .remote-gpu
variables:
TEST_BINARY: 'tests/kernel.testapp'
TEST_BINARY: 'kernel.testapp'

test:samples:matrixMul:
extends: .remote-gpu
variables:
TEST_BINARY: 'tests/matrixMul'
TEST_BINARY: 'matrixMul.compressed.sample'

test:samples:bandwidthTest:
extends: .remote-gpu
variables:
TEST_BINARY: 'tests/bandwidthTest'
TEST_BINARY: 'bandwidthTest.sample'

test:samples:nbody:
extends: .remote-gpu
variables:
TEST_BINARY: 'tests/nbody'
TEST_BINARY: 'nbody.uncompressed.sample'
PARAMETER: '-benchmark'

test:samples:mnistCUDNN:
extends: .remote-gpu
variables:
CHDIR: '../tests/samples/samples-bin'
TEST_BINARY: 'mnistCUDNN.sample'

7 changes: 4 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ cuda-gdb:

libtirpc:
@echo -e "\033[36m----> Building libtirpc\033[0m"
$(MAKE) -C submodules libtirpc
$(MAKE) -C submodules libtirpc/install

gpu: cuda-gdb
@echo -e "\033[36m----> Building gpu\033[0m"
Expand All @@ -33,7 +33,7 @@ tests:
@echo -e "\033[36m----> Building test kernels\033[0m"
$(MAKE) -C tests

install-cpu: bin/cricket-client.so bin/cricket-server.so bin/libtirpc.so bin/libtirpc.so.3 bin/tests
install-cpu: bin/cricket-client.so bin/cricket-rpc-server bin/libtirpc.so bin/libtirpc.so.3 bin/tests
@echo -e "\033[36m----> Copying cpu binaries to build/bin\033[0m"

install: install-cpu bin/cricket
Expand All @@ -51,7 +51,8 @@ bin/cricket-client.so: bin

bin/cricket-server.so: bin
$(MAKE) -C cpu cricket-server.so
cp cpu/cricket-server.so bin
mv cpu/cricket-server.so bin/cricket-server.so


bin/cricket-rpc-server: bin
$(MAKE) -C cpu cricket-rpc-server
Expand Down
55 changes: 35 additions & 20 deletions cpu/Makefile
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
#RPC server library
SERVER = cricket-server.so
#Standalone RPC Server
SERVER_BIN = cricket-rpc-server
SERVER = cricket-rpc-server
SERVER_LIB = cricket-server.so
#RPC client library
CLIENT = cricket-client.so

CUDA_SRC = /usr/local/cuda
LIBTIRPC_PREFIX = ../submodules/libtirpc/install
SUBMODULE_LIBS = ../submodules/lib

CC = gcc
LD = gcc
Expand Down Expand Up @@ -39,7 +39,10 @@ SRC_SERVER = $(RPC_XDR) \
cr.c \
gsched_none.c \
oob.c \
mt-memcpy.c
mt-memcpy.c \
cpu-elf2.c \
cpu-server-nvml.c \
cpu-server-cudnn.c

SRC_SERVER_LIB = server-library.c
SRC_SERVER_EXE = server-exe.c
Expand All @@ -55,7 +58,11 @@ SRC_CLIENT = $(RPC_XDR) \
cpu-libwrap.c \
cpu-client-cusolver.c \
oob.c \
mt-memcpy.c
mt-memcpy.c \
cpu-elf2.c \
cpu-client-nvml.c \
cpu-client-cudnn.c \
cpu-client-cublas.c

# cpu-client-driver-hidden.c \
Expand All @@ -72,15 +79,17 @@ RPCGEN_FLAGS = -C -M -N
INC_FLAGS += -I$(LIBTIRPC_PREFIX)/include/tirpc
INC_FLAGS += -I$(CUDA_SRC)/include

LIB_FLAGS += -L$(LIBTIRPC_PREFIX)/lib -L$(CUDA_SRC)/lib64
CC_FLAGS += -std=gnu99 $(INC_FLAGS) -O2
LIB_FLAGS += -L$(LIBTIRPC_PREFIX)/lib
LIB_FLAGS += -L$(CUDA_SRC)/lib64
LIB_FLAGS += -L$(CUDA_SRC)/lib64/stubs
CC_FLAGS += -std=gnu11 $(INC_FLAGS) #-O2
# TODO: use extern in header files instead of direct definition e.g. in cpu-common.h to remove -fcommon flag
CC_FLAGS += -fcommon
LD_FLAGS = $(LIB_FLAGS) -ltirpc -ldl -lcrypto
LD_FLAGS = $(LIB_FLAGS) -ltirpc -ldl -lcrypto -lelf

ifdef WITH_DEBUG
# use ASAN_OPTIONS=protect_shadow_gap=0 LSAN_OPTIONS=fast_unwind_on_malloc=0 when running
CC_FLAGS += -g -ggdb #-fsanitize=address -fsanitize=pointer-compare -fsanitize=pointer-subtract -fsanitize-address-use-after-scope
CC_FLAGS += -g -ggdb #-static-libasan -fsanitize=address -fsanitize=pointer-compare -fsanitize=pointer-subtract -fsanitize-address-use-after-scope
endif

ifdef WITH_IB
Expand All @@ -90,48 +99,54 @@ endif
ifdef LOG
CC_FLAGS += -DLOG_LEVEL=LOG_$(LOG)
endif

ifdef LOGN
CC_FLAGS += -DLOG_LEVEL=$(LOGN)
endif

ifdef WITH_IB
CC_FLAGS += -DWITH_IB=$(WITH_IB)
endif

SERVER_LD_FLAGS = $(LD_FLAGS) -lcudart -lcusolver -lcuda -lcublas -lbfd -lrt -lpthread
SERVER_LD_FLAGS = $(LD_FLAGS) -lcudart -lcusolver -lcuda -lcublas -lrt -lpthread -lnvidia-ml -lcudnn
SERVER_BIN_LD_FLAGS = $(SERVER_LD_FLAGS) -Wl,--unresolved-symbols=ignore-in-object-files
CLIENT_LD_FLAGS = $(LD_FLAGS) -lbfd
CLIENT_LD_FLAGS = $(LD_FLAGS)

# Targets
.PHONY: all clean

all : $(SERVER) $(SERVER_BIN) $(CLIENT)
all : $(SERVER) $(CLIENT)

$(CLIENT) : $(OBJ_CLIENT)
$(LD) $(CC_FLAGS) -shared -o $@ $^ $(CLIENT_LD_FLAGS)

$(SERVER) : $(OBJ_SERVER) $(SRC_SERVER_LIB:%.c=%.o)
$(LD) $(CC_FLAGS) -shared -o $@ $^ $(SERVER_LD_FLAGS)
$(SERVER_LIB) : $(OBJ_SERVER) $(SRC_SERVER_EXE:%.c=%.o)
$(LD) $(CC_FLAGS) -shared -o $@ $^ $(SERVER_BIN_LD_FLAGS)

$(SERVER_BIN) : $(OBJ_SERVER) $(SRC_SERVER_EXE:%.c=%.o)
$(SERVER) : $(OBJ_SERVER) $(SRC_SERVER_EXE:%.c=%.o)
$(LD) $(CC_FLAGS) -o $@ $^ $(SERVER_BIN_LD_FLAGS)

$(RPC_H) : $(RPC_DEF)
$(RPCGEN) $(RPCGEN_FLAGS) -h -o $@ $<
rm -f $@ && $(RPCGEN) $(RPCGEN_FLAGS) -h -o $@ $<

$(RPC_CLIENT) : $(RPC_DEF)
$(RPCGEN) $(RPCGEN_FLAGS) -l -o $@ $<
rm -f $@ && $(RPCGEN) $(RPCGEN_FLAGS) -l -o $@ $<

$(RPC_SERVER) : $(RPC_DEF)
$(RPCGEN) $(RPCGEN_FLAGS) -m -o $@ $<
rm -f $@ && $(RPCGEN) $(RPCGEN_FLAGS) -m -o $@ $<

$(RPC_SERVER_MOD) : $(RPC_SERVER)
./generate_dispatch.sh

$(RPC_XDR) : $(RPC_DEF)
$(RPCGEN) $(RPCGEN_FLAGS) -c -o $@ $<
rm -f $@ && $(RPCGEN) $(RPCGEN_FLAGS) -c -o $@ $<

# Compile one C source file into a position-independent object file.
# $(RPC_H) is listed as a prerequisite so the rpcgen-generated header is
# produced before any source is compiled.
# Only compiler flags are passed here: this is a compile-only step (-c), so
# the linker search paths and libraries in $(LD_FLAGS) belong on the link
# rules, not on this command line.
%.o : %.c $(RPC_H)
	$(CC) $(CC_FLAGS) -c -fpic -o $@ $<

clean:
rm -f $(RPC_H) $(RPC_CLIENT) $(RPC_SERVER) $(RPC_SERVER_BIN) $(RPC_SERVER_MOD) $(RPC_XDR) $(OBJ_CLIENT) $(OBJ_SERVER) $(SERVER) $(CLIENT)
rm -f $(RPC_H) $(RPC_CLIENT) $(RPC_SERVER) $(RPC_SERVER_MOD) $(RPC_XDR) $(OBJ_CLIENT) $(OBJ_SERVER) $(SERVER) $(SERVER_LIB) $(CLIENT) $(SRC_SERVER_EXE:%.c=%.o)




25 changes: 24 additions & 1 deletion cpu/api-recorder.c
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,13 @@

#include "api-recorder.h"
#include "log.h"
#include "list.h"


list api_records;

void api_records_free_args(void)

static void api_records_free_args(void)
{
api_record_t *record;
for (size_t i = 0; i < api_records.length; i++) {
Expand All @@ -22,6 +24,27 @@ void api_records_free_args(void)

}

static void api_records_free_data(void)
{
api_record_t *record;
for (size_t i = 0; i < api_records.length; i++) {
if (list_at(&api_records, i, (void**)&record) != 0) {
LOGE(LOG_ERROR, "list_at %zu returned an error.", i);
continue;
}
free(record->data);
record->data = NULL;
}
}


/**
 * Release everything held by the global `api_records` list.
 *
 * Calls api_records_free_args() and api_records_free_data() for the
 * per-record allocations, then hands the list itself to list_free().
 * NOTE(review): the per-record helpers run first, presumably because they
 * iterate over the list and need it intact - confirm against list_free().
 */
void api_records_free(void)
{
api_records_free_args();
api_records_free_data();
list_free(&api_records);
}

size_t api_records_malloc_get_size(void *ptr)
{
api_record_t *record;
Expand Down
5 changes: 4 additions & 1 deletion cpu/api-recorder.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@
*arguments = ARG
#define RECORD_ARG(NUM, ARG) \
arguments->arg##NUM = ARG
/* Record a named argument: assigns ARG to the member of `arguments` that
 * carries the same name as the expression passed in (arguments->ARG = ARG). */
#define RECORD_NARG(ARG) \
arguments->ARG = ARG
#define RECORD_DATA(SIZE, PTR) \
record->data_size = SIZE; \
record->data = malloc(SIZE); \
Expand All @@ -58,14 +60,15 @@ typedef struct api_record {
void* ptr;
int integer;
ptr_result ptr_result_u;
sz_result sz_result_u;
} result;
void *data;
size_t data_size;
} api_record_t;
extern list api_records;


void api_records_free_args(void);
void api_records_free(void);
void api_records_print(void);
void api_records_print_records(api_record_t *record);

Expand Down
Loading

0 comments on commit bcc5c93

Please sign in to comment.