diff --git a/.gitignore b/.gitignore
index 8c3acb57..11652fae 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,7 @@ build/
 .clangd
 .project
 .cproject
+*.code-workspace
 .settings/
 .vscode/
 .directory
@@ -39,3 +40,7 @@ core.*
 compile_commands.json
 tags
 .gdb_history
+
+# perf data
+perf.data
+main
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 35089bc0..df9b8eb4 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -21,7 +21,7 @@ stages:
 
 ##############################################################################
 # Build docker image
-prepare:centos8:docker-dev:
+prepare:rocky9:docker-dev:
   stage: prepare
   script:
     - docker build
@@ -31,13 +31,13 @@ prepare:centos8:docker-dev:
   tags:
     - docker
 
-prepare:centos8:cuda10:
+prepare:centos8:cuda11:
   stage: prepare
   script:
     - docker build
-      --file utils/Dockerfile.cuda10
-      --tag ${DOCKER_IMAGE_DEV}_cuda10:${DOCKER_TAG}
-      --tag ${DOCKER_IMAGE_DEV}_cuda10:latest .
+      --file utils/Dockerfile.cuda11
+      --tag ${DOCKER_IMAGE_DEV}_cuda11:${DOCKER_TAG}
+      --tag ${DOCKER_IMAGE_DEV}_cuda11:latest .
   tags:
     - docker
 
@@ -57,7 +57,7 @@ prepare:centos8:cuda10:
 
 build:
   stage: build
-  needs: ["prepare:centos8:docker-dev"]
+  needs: ["prepare:rocky9:docker-dev"]
   script:
     - make -j 32 libtirpc
     - make -j 32 cuda-gdb
@@ -68,6 +68,7 @@ build:
     paths:
       - bin
       - tests/bin
+      - tests/samples/samples-bin
   image: ${DOCKER_IMAGE_DEV}:${DOCKER_TAG}
   cache:
     paths:
@@ -82,7 +83,7 @@ build:
 
 build:ib:
   stage: build
-  needs: ["prepare:centos8:docker-dev"]
+  needs: ["prepare:rocky9:docker-dev"]
   script:
     - make -j 32 libtirpc
     - make -j 32 cuda-gdb
@@ -108,19 +109,19 @@ build:ib:
   tags:
     - docker
 
-build:cuda10:
+build:cuda11:
   stage: build
-  needs: ["prepare:centos8:cuda10"]
+  needs: ["prepare:centos8:cuda11"]
   script:
     - make -j 32 libtirpc
     - make -j 32 cuda-gdb
-    - make -j 1 LOG=INFO
+    - make -j 1 LOG=INFO NOSAMPLES=yes
   artifacts:
     expire_in: 1 week
     paths:
      - bin
      - tests/bin
-  image: ${DOCKER_IMAGE_DEV}_cuda10:${DOCKER_TAG}
+  image: ${DOCKER_IMAGE_DEV}_cuda11:${DOCKER_TAG}
   cache:
     paths:
       - gpu/build
@@ -130,13 +131,13 @@ build:cuda10:
       - submodules/libtirpc
       - submodules/cuda-gdb
       - submodules/cuda-gdb-src.rpm
-    key: build_cuda10
+    key: build_cuda11
   tags:
     - docker
 
 build:debug:
   stage: build
-  needs: ["prepare:centos8:docker-dev"]
+  needs: ["prepare:rocky9:docker-dev"]
   script:
     - make -j 32 libtirpc
     - make -j 32 cuda-gdb
@@ -170,6 +171,7 @@ build:debug:
     LDIR: '$CI_BUILDS_DIR/$CI_PROJECT_PATH/bin'
     SAMPLES_PATH: '/usr/local/cuda/samples'
     PARAMETER: ''
+    CHDIR: 'tests'
   script:
     - mkdir ~/.ssh &&
      echo "-----BEGIN OPENSSH PRIVATE KEY-----" > ~/.ssh/id_rsa &&
@@ -179,9 +181,10 @@ build:debug:
      echo $KNOWN_HOSTS > ~/.ssh/known_hosts &&
      chmod 600 ~/.ssh/id_rsa
     - ssh $GPU_TARGET mkdir -p $RDIR
     - scp -r $LDIR/* $GPU_TARGET:$RDIR/
-    - ssh $GPU_TARGET "LD_PRELOAD=$RDIR/libtirpc.so.3:$RDIR/cricket-server.so $RDIR/$TEST_BINARY" &
+    - ssh $GPU_TARGET "LD_PRELOAD=$RDIR/libtirpc.so.3 $RDIR/cricket-rpc-server 255" &
     - sleep 2
-    - REMOTE_GPU_ADDRESS="ghost.acs-lab.eonerc.rwth-aachen.de" PATH=$LDIR:$PATH LD_PRELOAD=$LDIR/libtirpc.so.3:$LDIR/cricket-client.so $LDIR/$TEST_BINARY $PARAMETER
+    - cd $LDIR/$CHDIR
+    - CRICKET_RPCID=255 REMOTE_GPU_ADDRESS="ghost.acs-lab.eonerc.rwth-aachen.de" PATH=$LDIR:$PATH LD_PRELOAD=$LDIR/libtirpc.so.3:$LDIR/cricket-client.so ./$TEST_BINARY $PARAMETER
   after_script:
     - ssh $GPU_TARGET rm -rf $RDIR
     - ssh $GPU_TARGET pkill -fe -2 $RDIR/test_kernel
@@ -216,21 +219,27 @@ test:test_programs(2/2):
 
 test:test_kernel:
   extends: .remote-gpu
   variables:
-    TEST_BINARY: 'tests/kernel.testapp'
+    TEST_BINARY: 'kernel.testapp'
 
 test:samples:matrixMul:
   extends: .remote-gpu
   variables:
-    TEST_BINARY: 'tests/matrixMul'
+    TEST_BINARY: 'matrixMul.compressed.sample'
 
 test:samples:bandwidthTest:
   extends: .remote-gpu
   variables:
-    TEST_BINARY: 'tests/bandwidthTest'
+    TEST_BINARY: 'bandwidthTest.sample'
 
 test:samples:nbody:
   extends: .remote-gpu
   variables:
-    TEST_BINARY: 'tests/nbody'
+    TEST_BINARY: 'nbody.uncompressed.sample'
     PARAMETER: '-benchmark'
+
+test:samples:mnistCUDNN:
+  extends: .remote-gpu
+  variables:
+    CHDIR: '../tests/samples/samples-bin'
+    TEST_BINARY: 'mnistCUDNN.sample'
+
diff --git a/Makefile b/Makefile
index 2e5401fc..7cc6f46c 100644
--- a/Makefile
+++ b/Makefile
@@ -19,7 +19,7 @@ cuda-gdb:
 
 libtirpc:
 	@echo -e "\033[36m----> Building libtirpc\033[0m"
-	$(MAKE) -C submodules libtirpc
+	$(MAKE) -C submodules libtirpc/install
 
 gpu: cuda-gdb
 	@echo -e "\033[36m----> Building gpu\033[0m"
@@ -33,7 +33,7 @@ tests:
 	@echo -e "\033[36m----> Building test kernels\033[0m"
 	$(MAKE) -C tests
 
-install-cpu: bin/cricket-client.so bin/cricket-server.so bin/libtirpc.so bin/libtirpc.so.3 bin/tests
+install-cpu: bin/cricket-client.so bin/cricket-rpc-server bin/libtirpc.so bin/libtirpc.so.3 bin/tests
 	@echo -e "\033[36m----> Copying cpu binaries to build/bin\033[0m"
 
 install: install-cpu bin/cricket
@@ -51,7 +51,8 @@ bin/cricket-client.so: bin
 
 bin/cricket-server.so: bin
 	$(MAKE) -C cpu cricket-server.so
-	cp cpu/cricket-server.so bin
+	mv cpu/cricket-server.so bin/cricket-server.so
+
 bin/cricket-rpc-server: bin
 	$(MAKE) -C cpu cricket-rpc-server
diff --git a/cpu/Makefile b/cpu/Makefile
index a03a7fc9..c2a13b13 100644
--- a/cpu/Makefile
+++ b/cpu/Makefile
@@ -1,12 +1,12 @@
-#RPC server library
-SERVER = cricket-server.so
 #Standalone RPC Server
-SERVER_BIN = cricket-rpc-server
+SERVER = cricket-rpc-server
+SERVER_LIB = cricket-server.so
 #RPC client library
 CLIENT = cricket-client.so
 
 CUDA_SRC = /usr/local/cuda
 LIBTIRPC_PREFIX = ../submodules/libtirpc/install
+SUBMODULE_LIBS = ../submodules/lib
 
 CC = gcc
 LD = gcc
@@ -39,7 +39,10 @@ SRC_SERVER = $(RPC_XDR) \
 	cr.c \
 	gsched_none.c \
 	oob.c \
-	mt-memcpy.c
+	mt-memcpy.c \
+	cpu-elf2.c \
+	cpu-server-nvml.c \
+	cpu-server-cudnn.c
 
 SRC_SERVER_LIB = server-library.c
 SRC_SERVER_EXE = server-exe.c
@@ -55,7 +58,11 @@ SRC_CLIENT = $(RPC_XDR) \
 	cpu-libwrap.c \
 	cpu-client-cusolver.c \
 	oob.c \
-	mt-memcpy.c
+	mt-memcpy.c \
+	cpu-elf2.c \
+	cpu-client-nvml.c \
+	cpu-client-cudnn.c \
+	cpu-client-cublas.c
 
 #	cpu-client-driver-hidden.c \
 
@@ -72,15 +79,17 @@ RPCGEN_FLAGS = -C -M -N
 INC_FLAGS += -I$(LIBTIRPC_PREFIX)/include/tirpc
 INC_FLAGS += -I$(CUDA_SRC)/include
 
-LIB_FLAGS += -L$(LIBTIRPC_PREFIX)/lib -L$(CUDA_SRC)/lib64
-CC_FLAGS += -std=gnu99 $(INC_FLAGS) -O2
+LIB_FLAGS += -L$(LIBTIRPC_PREFIX)/lib
+LIB_FLAGS += -L$(CUDA_SRC)/lib64
+LIB_FLAGS += -L$(CUDA_SRC)/lib64/stubs
+CC_FLAGS += -std=gnu11 $(INC_FLAGS) #-O2
 # TODO: use extern in header files instead of direct definition e.g. in cpu-common.h to remove -fcommon flag
 CC_FLAGS += -fcommon
-LD_FLAGS = $(LIB_FLAGS) -ltirpc -ldl -lcrypto
+LD_FLAGS = $(LIB_FLAGS) -ltirpc -ldl -lcrypto -lelf
 
 ifdef WITH_DEBUG
 # use ASAN_OPTIONS=protect_shadow_gap=0 LSAN_OPTIONS=fast_unwind_on_malloc=0 when running
-CC_FLAGS += -g -ggdb #-fsanitize=address -fsanitize=pointer-compare -fsanitize=pointer-subtract -fsanitize-address-use-after-scope
+CC_FLAGS += -g -ggdb #-static-libasan -fsanitize=address -fsanitize=pointer-compare -fsanitize=pointer-subtract -fsanitize-address-use-after-scope
 endif
 
 ifdef WITH_IB
@@ -90,48 +99,54 @@ endif
 ifdef LOG
 CC_FLAGS += -DLOG_LEVEL=LOG_$(LOG)
 endif
+
+ifdef LOGN
+CC_FLAGS += -DLOG_LEVEL=$(LOGN)
+endif
+
 ifdef WITH_IB
 CC_FLAGS += -DWITH_IB=$(WITH_IB)
 endif
 
-SERVER_LD_FLAGS = $(LD_FLAGS) -lcudart -lcusolver -lcuda -lcublas -lbfd -lrt -lpthread
+SERVER_LD_FLAGS = $(LD_FLAGS) -lcudart -lcusolver -lcuda -lcublas -lrt -lpthread -lnvidia-ml -lcudnn
 SERVER_BIN_LD_FLAGS = $(SERVER_LD_FLAGS) -Wl,--unresolved-symbols=ignore-in-object-files
-CLIENT_LD_FLAGS = $(LD_FLAGS) -lbfd
+CLIENT_LD_FLAGS = $(LD_FLAGS)
 
 # Targets
 .PHONY: all clean
-all : $(SERVER) $(SERVER_BIN) $(CLIENT)
+all : $(SERVER) $(CLIENT)
 
 $(CLIENT) : $(OBJ_CLIENT)
 	$(LD) $(CC_FLAGS) -shared -o $@ $^ $(CLIENT_LD_FLAGS)
 
-$(SERVER) : $(OBJ_SERVER) $(SRC_SERVER_LIB:%.c=%.o)
-	$(LD) $(CC_FLAGS) -shared -o $@ $^ $(SERVER_LD_FLAGS)
+$(SERVER_LIB) : $(OBJ_SERVER) $(SRC_SERVER_EXE:%.c=%.o)
+	$(LD) $(CC_FLAGS) -shared -o $@ $^ $(SERVER_BIN_LD_FLAGS)
 
-$(SERVER_BIN) : $(OBJ_SERVER) $(SRC_SERVER_EXE:%.c=%.o)
+$(SERVER) : $(OBJ_SERVER) $(SRC_SERVER_EXE:%.c=%.o)
 	$(LD) $(CC_FLAGS) -o $@ $^ $(SERVER_BIN_LD_FLAGS)
 
 $(RPC_H) : $(RPC_DEF)
-	$(RPCGEN) $(RPCGEN_FLAGS) -h -o $@ $<
+	rm -f $@ && $(RPCGEN) $(RPCGEN_FLAGS) -h -o $@ $<
 
 $(RPC_CLIENT) : $(RPC_DEF)
-	$(RPCGEN) $(RPCGEN_FLAGS) -l -o $@ $<
+	rm -f $@ && $(RPCGEN) $(RPCGEN_FLAGS) -l -o $@ $<
 
 $(RPC_SERVER) : $(RPC_DEF)
-	$(RPCGEN) $(RPCGEN_FLAGS) -m -o $@ $<
+	rm -f $@ && $(RPCGEN) $(RPCGEN_FLAGS) -m -o $@ $<
 
 $(RPC_SERVER_MOD) : $(RPC_SERVER)
 	./generate_dispatch.sh
 
 $(RPC_XDR) : $(RPC_DEF)
-	$(RPCGEN) $(RPCGEN_FLAGS) -c -o $@ $<
+	rm -f $@ && $(RPCGEN) $(RPCGEN_FLAGS) -c -o $@ $<
 
 %.o : %.c $(RPC_H)
 	$(CC) $(CC_FLAGS) -c -fpic -o $@ $< $(LD_FLAGS)
 
 clean:
-	rm -f $(RPC_H) $(RPC_CLIENT) $(RPC_SERVER) $(RPC_SERVER_BIN) $(RPC_SERVER_MOD) $(RPC_XDR) $(OBJ_CLIENT) $(OBJ_SERVER) $(SERVER) $(CLIENT)
+	rm -f $(RPC_H) $(RPC_CLIENT) $(RPC_SERVER) $(RPC_SERVER_MOD) $(RPC_XDR) $(OBJ_CLIENT) $(OBJ_SERVER) $(SERVER) $(SERVER_LIB) $(CLIENT) $(SRC_SERVER_EXE:%.c=%.o)
+
diff --git a/cpu/api-recorder.c b/cpu/api-recorder.c
index f02cdc37..e67204d2 100644
--- a/cpu/api-recorder.c
+++ b/cpu/api-recorder.c
@@ -4,11 +4,13 @@
 
 #include "api-recorder.h"
 #include "log.h"
+#include "list.h"
 
 list api_records;
 
-void api_records_free_args(void)
+
+static void api_records_free_args(void)
 {
     api_record_t *record;
     for (size_t i = 0; i < api_records.length; i++) {
@@ -22,6 +24,27 @@ void api_records_free_args(void)
 }
 
 
+static void api_records_free_data(void)
+{
+    api_record_t *record;
+    for (size_t i = 0; i < api_records.length; i++) {
+        if (list_at(&api_records, i, (void**)&record) != 0) {
+            LOGE(LOG_ERROR, "list_at %zu returned an error.", i);
+            continue;
+        }
+        free(record->data);
+        record->data = NULL;
+    }
+}
+
+
+void api_records_free(void)
+{
+    api_records_free_args();
+    api_records_free_data();
+    list_free(&api_records);
+}
+
 size_t api_records_malloc_get_size(void *ptr)
 {
     api_record_t *record;
diff --git a/cpu/api-recorder.h b/cpu/api-recorder.h
index 856a3121..37c5e569 100644
--- a/cpu/api-recorder.h
+++ b/cpu/api-recorder.h
@@ -35,6 +35,8 @@
     *arguments = ARG
 #define RECORD_ARG(NUM, ARG) \
     arguments->arg##NUM = ARG
+#define RECORD_NARG(ARG) \
+    arguments->ARG = ARG
 #define RECORD_DATA(SIZE, PTR) \
     record->data_size = SIZE; \
     record->data = malloc(SIZE); \
@@ -58,6 +60,7 @@ typedef struct api_record {
         void* ptr;
         int integer;
         ptr_result ptr_result_u;
+        sz_result sz_result_u;
     } result;
     void *data;
     size_t data_size;
@@ -65,7 +68,7 @@ typedef struct api_record {
 
 extern list api_records;
 
-void api_records_free_args(void);
+void api_records_free(void);
 void api_records_print(void);
 void api_records_print_records(api_record_t *record);
diff --git a/cpu/cpu-client-cublas.c b/cpu/cpu-client-cublas.c
new file mode 100644
index 00000000..f9fbc159
--- /dev/null
+++ b/cpu/cpu-client-cublas.c
@@ -0,0 +1,763 @@
+
+#define _GNU_SOURCE
+#include
+#include
+#include
+#include
+
+//for strerror
+#include
+#include
+
+#include "cpu-libwrap.h"
+#include "cpu_rpc_prot.h"
+#include "cpu-common.h"
+#include "cpu-utils.h"
+#include "log.h"
+
+#ifdef WITH_API_CNT
+extern int api_call_cnt;
+#endif //WITH_API_CNT
+
+cublasStatus_t cublasCreate_v2(cublasHandle_t* handle)
+{
+#ifdef WITH_API_CNT
+    api_call_cnt++;
+#endif //WITH_API_CNT
+    ptr_result result;
+    enum clnt_stat retval_1;
+    retval_1 = rpc_cublascreate_1(&result, clnt);
+    if (retval_1 != RPC_SUCCESS) {
+        clnt_perror (clnt, "call failed");
+    }
+    if (result.err == 0) {
+        *handle = (void*)result.ptr_result_u.ptr;
+    }
+    return result.err;
+}
+
+cublasStatus_t cublasDestroy_v2(cublasHandle_t handle)
+{
+#ifdef WITH_API_CNT
+    api_call_cnt++;
+#endif //WITH_API_CNT
+    int result;
+    enum clnt_stat retval_1;
+    retval_1 = rpc_cublasdestroy_1((ptr)handle, &result, clnt);
+    if (retval_1 != RPC_SUCCESS) {
+        clnt_perror (clnt, "call failed");
+    }
+    return result;
+}
+
+DEF_FN(cublasStatus_t, cublasGetVersion_v2, cublasHandle_t, handle, int*, version);
+DEF_FN(cublasStatus_t, cublasGetProperty, libraryPropertyType, type, int*, value);
+DEF_FN(size_t, cublasGetCudartVersion);
+cublasStatus_t cublasSetWorkspace_v2(cublasHandle_t handle, void* workspace, size_t workspaceSizeInBytes)
+{
+#ifdef WITH_API_CNT
+    api_call_cnt++;
+#endif //WITH_API_CNT
+    int result;
+    enum clnt_stat retval_1;
+    retval_1 = rpc_cublassetworkspace_1(
+        (ptr)handle,
+        (ptr)workspace,
+        workspaceSizeInBytes,
+        &result, clnt);
+    if (retval_1 != RPC_SUCCESS) {
+        clnt_perror (clnt, "call failed");
+    }
+    return result;
+}
+
+cublasStatus_t cublasSetStream_v2(cublasHandle_t handle, cudaStream_t streamId)
+{
+#ifdef WITH_API_CNT
+    api_call_cnt++;
+#endif //WITH_API_CNT
+    int result;
+    enum clnt_stat retval_1;
+    retval_1 = rpc_cublassetstream_1(
+        (ptr)handle,
+        (ptr)streamId,
+        &result, clnt);
+    if (retval_1 != RPC_SUCCESS) {
+        clnt_perror (clnt, "call failed");
+    }
+    return result;
+}
+
+DEF_FN(cublasStatus_t, cublasGetStream_v2, cublasHandle_t, handle, cudaStream_t*, streamId);
+DEF_FN(cublasStatus_t, cublasGetPointerMode_v2, cublasHandle_t, handle, cublasPointerMode_t*, mode);
+DEF_FN(cublasStatus_t, cublasSetPointerMode_v2, cublasHandle_t, handle, cublasPointerMode_t, mode);
+DEF_FN(cublasStatus_t, cublasGetAtomicsMode, cublasHandle_t, handle, cublasAtomicsMode_t*, mode);
+DEF_FN(cublasStatus_t, cublasSetAtomicsMode, cublasHandle_t, handle, cublasAtomicsMode_t, mode);
+DEF_FN(cublasStatus_t, cublasGetMathMode, cublasHandle_t, handle, cublasMath_t*, mode);
+cublasStatus_t cublasSetMathMode(cublasHandle_t handle, cublasMath_t mode)
+{
+#ifdef WITH_API_CNT
+ api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + retval_1 = rpc_cublassetmathmode_1( + (ptr)handle, + (int)mode, + &result, clnt); + if (retval_1 != RPC_SUCCESS) { + clnt_perror (clnt, "call failed"); + } + return result; +} + +DEF_FN(cublasStatus_t, cublasGetSmCountTarget, cublasHandle_t, handle, int*, smCountTarget); +DEF_FN(cublasStatus_t, cublasSetSmCountTarget, cublasHandle_t, handle, int, smCountTarget); +DEF_FN(const char*, cublasGetStatusName, cublasStatus_t, status); +DEF_FN(const char*, cublasGetStatusString, cublasStatus_t, status); +DEF_FN(cublasStatus_t, cublasLoggerConfigure, int, logIsOn, int, logToStdOut, int, logToStdErr, const char*, logFileName); +DEF_FN(cublasStatus_t, cublasSetLoggerCallback, cublasLogCallback, userCallback); +DEF_FN(cublasStatus_t, cublasGetLoggerCallback, cublasLogCallback*, userCallback); +DEF_FN(cublasStatus_t, cublasSetVector, int, n, int, elemSize, const void*, x, int, incx, void*, devicePtr, int, incy); +DEF_FN(cublasStatus_t, cublasSetVector_64, int64_t, n, int64_t, elemSize, const void*, x, int64_t, incx, void*, devicePtr, int64_t, incy); +DEF_FN(cublasStatus_t, cublasGetVector, int, n, int, elemSize, const void*, x, int, incx, void*, y, int, incy); +DEF_FN(cublasStatus_t, cublasGetVector_64, int64_t, n, int64_t, elemSize, const void*, x, int64_t, incx, void*, y, int64_t, incy); +DEF_FN(cublasStatus_t, cublasSetMatrix, int, rows, int, cols, int, elemSize, const void*, A, int, lda, void*, B, int, ldb); +DEF_FN(cublasStatus_t, cublasSetMatrix_64, int64_t, rows, int64_t, cols, int64_t, elemSize, const void*, A, int64_t, lda, void*, B, int64_t, ldb); +DEF_FN(cublasStatus_t, cublasGetMatrix, int, rows, int, cols, int, elemSize, const void*, A, int, lda, void*, B, int, ldb); +DEF_FN(cublasStatus_t, cublasGetMatrix_64, int64_t, rows, int64_t, cols, int64_t, elemSize, const void*, A, int64_t, lda, void*, B, int64_t, ldb); +DEF_FN(cublasStatus_t, cublasSetVectorAsync, , int, n, int, elemSize, const void*, hostPtr, int, incx, void*, devicePtr, int, incy, cudaStream_t, stream); +DEF_FN(cublasStatus_t, cublasSetVectorAsync_64, , int64_t, n, int64_t, elemSize, const void*, hostPtr, int64_t, incx, void*, devicePtr, int64_t, incy, cudaStream_t, stream); +DEF_FN(cublasStatus_t, cublasGetVectorAsync, , int, n, int, elemSize, const void*, devicePtr, int, incx, void*, hostPtr, int, incy, cudaStream_t, stream); +DEF_FN(cublasStatus_t, cublasGetVectorAsync_64, , int64_t, n, int64_t, elemSize, const void*, devicePtr, int64_t, incx, void*, hostPtr, int64_t, incy, cudaStream_t, stream); +DEF_FN(cublasStatus_t, cublasSetMatrixAsync, int, rows, int, cols, int, elemSize, const void*, A, int, lda, void*, B, int, ldb, cudaStream_t, stream); +DEF_FN(cublasStatus_t, cublasSetMatrixAsync_64, int64_t, rows, int64_t, cols, int64_t, elemSize, const void*, A, int64_t, lda, void*, B, int64_t, ldb, cudaStream_t, stream); +DEF_FN(cublasStatus_t, cublasGetMatrixAsync, int, rows, int, cols, int, elemSize, const void*, A, int, lda, void*, B, int, ldb, cudaStream_t, stream); +DEF_FN(cublasStatus_t, cublasGetMatrixAsync_64, int64_t, rows, int64_t, cols, int64_t, elemSize, const void*, A, int64_t, lda, void*, B, int64_t, ldb, cudaStream_t, stream); +void cublasXerbla(const char* srName, int info) { + void (*fun)(const char*, int); + char* error_str; *(void **)(&fun) = dlsym(libwrap_get_sohandle(), "cublasXerbla"); + if ((error_str = dlerror()) != ((void *)0)) { + if (0 > get_log_data()->curr_level) ; + else + loggfe(0, 88, 
"/home/eiling/projects/cricket/cpu/cpu-client-cublas.c", "[libwrap] %s", error_str); + } + if (3 > get_log_data()->curr_level) ; + else + loggf(3, "%s called", "cublasXerbla"); + (*fun)(srName, info); + if (3 > get_log_data()->curr_level) ; + else loggf(3, "%s finished", "cublasXerbla"); +} +DEF_FN(cublasStatus_t, cublasNrm2Ex, cublasHandle_t, handle, int, n, const void*, x, cudaDataType, xType, int, incx, void*, result, cudaDataType, resultType, cudaDataType, executionType); +DEF_FN(cublasStatus_t, cublasNrm2Ex_64, cublasHandle_t, handle, int64_t, n, const void*, x, cudaDataType, xType, int64_t, incx, void*, result, cudaDataType, resultType, cudaDataType, executionType); +DEF_FN(cublasStatus_t, cublasSnrm2_v2, cublasHandle_t, handle, int, n, const float*, x, int, incx, float*, result); +DEF_FN(cublasStatus_t, cublasSnrm2_v2_64, cublasHandle_t, handle, int64_t, n, const float*, x, int64_t, incx, float*, result); +DEF_FN(cublasStatus_t, cublasDnrm2_v2, cublasHandle_t, handle, int, n, const double*, x, int, incx, double*, result); +DEF_FN(cublasStatus_t, cublasDnrm2_v2_64, cublasHandle_t, handle, int64_t, n, const double*, x, int64_t, incx, double*, result); +DEF_FN(cublasStatus_t, cublasScnrm2_v2, cublasHandle_t, handle, int, n, const cuComplex*, x, int, incx, float*, result); +DEF_FN(cublasStatus_t, cublasScnrm2_v2_64, cublasHandle_t, handle, int64_t, n, const cuComplex*, x, int64_t, incx, float*, result); +DEF_FN(cublasStatus_t, cublasDznrm2_v2, cublasHandle_t, handle, int, n, const cuDoubleComplex*, x, int, incx, double*, result); +DEF_FN(cublasStatus_t, cublasDznrm2_v2_64, cublasHandle_t, handle, int64_t, n, const cuDoubleComplex*, x, int64_t, incx, double*, result); +DEF_FN(cublasStatus_t, cublasDotEx, cublasHandle_t, handle, int, n, const void*, x, cudaDataType, xType, int, incx, const void*, y, cudaDataType, yType, int, incy, void*, result, cudaDataType, resultType, cudaDataType, executionType); +DEF_FN(cublasStatus_t, cublasDotEx_64, cublasHandle_t, handle, int64_t, n, const void*, x, cudaDataType, xType, int64_t, incx, const void*, y, cudaDataType, yType, int64_t, incy, void*, result, cudaDataType, resultType, cudaDataType, executionType); +DEF_FN(cublasStatus_t, cublasDotcEx, cublasHandle_t, handle, int, n, const void*, x, cudaDataType, xType, int, incx, const void*, y, cudaDataType, yType, int, incy, void*, result, cudaDataType, resultType, cudaDataType, executionType); +DEF_FN(cublasStatus_t, cublasDotcEx_64, cublasHandle_t, handle, int64_t, n, const void*, x, cudaDataType, xType, int64_t, incx, const void*, y, cudaDataType, yType, int64_t, incy, void*, result, cudaDataType, resultType, cudaDataType, executionType); +DEF_FN(cublasStatus_t, cublasSdot_v2, cublasHandle_t, handle, int, n, const float*, x, int, incx, const float*, y, int, incy, float*, result); +DEF_FN(cublasStatus_t, cublasSdot_v2_64, , cublasHandle_t, handle, int64_t, n, const float*, x, int64_t, incx, const float*, y, int64_t, incy, float*, result); +DEF_FN(cublasStatus_t, cublasDdot_v2, cublasHandle_t, handle, int, n, const double*, x, int, incx, const double*, y, int, incy, double*, result); +DEF_FN(cublasStatus_t, cublasDdot_v2_64, , cublasHandle_t, handle, int64_t, n, const double*, x, int64_t, incx, const double*, y, int64_t, incy, double*, result); +DEF_FN(cublasStatus_t, cublasCdotu_v2, , cublasHandle_t, handle, int, n, const cuComplex*, x, int, incx, const cuComplex*, y, int, incy, cuComplex*, result); +DEF_FN(cublasStatus_t, cublasCdotu_v2_64, cublasHandle_t, handle, int64_t, n, const cuComplex*, x, 
int64_t, incx, const cuComplex*, y, int64_t, incy, cuComplex*, result); +DEF_FN(cublasStatus_t, cublasCdotc_v2, , cublasHandle_t, handle, int, n, const cuComplex*, x, int, incx, const cuComplex*, y, int, incy, cuComplex*, result); +DEF_FN(cublasStatus_t, cublasCdotc_v2_64, cublasHandle_t, handle, int64_t, n, const cuComplex*, x, int64_t, incx, const cuComplex*, y, int64_t, incy, cuComplex*, result); +DEF_FN(cublasStatus_t, cublasZdotu_v2, cublasHandle_t, handle, int, n, const cuDoubleComplex*, x, int, incx, const cuDoubleComplex*, y, int, incy, cuDoubleComplex*, result); +DEF_FN(cublasStatus_t, cublasZdotu_v2_64, cublasHandle_t, handle, int64_t, n, const cuDoubleComplex*, x, int64_t, incx, const cuDoubleComplex*, y, int64_t, incy, cuDoubleComplex*, result); +DEF_FN(cublasStatus_t, cublasZdotc_v2, cublasHandle_t, handle, int, n, const cuDoubleComplex*, x, int, incx, const cuDoubleComplex*, y, int, incy, cuDoubleComplex*, result); +DEF_FN(cublasStatus_t, cublasZdotc_v2_64, cublasHandle_t, handle, int64_t, n, const cuDoubleComplex*, x, int64_t, incx, const cuDoubleComplex*, y, int64_t, incy, cuDoubleComplex*, result); +DEF_FN(cublasStatus_t, cublasScalEx, cublasHandle_t, handle, int, n, const void*, alpha, cudaDataType, alphaType, void*, x, cudaDataType, xType, int, incx, cudaDataType, executionType); +DEF_FN(cublasStatus_t, cublasScalEx_64, cublasHandle_t, handle, int64_t, n, const void*, alpha, cudaDataType, alphaType, void*, x, cudaDataType, xType, int64_t, incx, cudaDataType, executionType); +DEF_FN(cublasStatus_t, cublasSscal_v2, cublasHandle_t, handle, int, n, const float*, alpha, float*, x, int, incx); +DEF_FN(cublasStatus_t, cublasSscal_v2_64, cublasHandle_t, handle, int64_t, n, const float*, alpha, float*, x, int64_t, incx); +DEF_FN(cublasStatus_t, cublasDscal_v2, cublasHandle_t, handle, int, n, const double*, alpha, double*, x, int, incx); +DEF_FN(cublasStatus_t, cublasDscal_v2_64, cublasHandle_t, handle, int64_t, n, const double*, alpha, double*, x, int64_t, incx); +DEF_FN(cublasStatus_t, cublasCscal_v2, cublasHandle_t, handle, int, n, const cuComplex*, alpha, cuComplex*, x, int, incx); +DEF_FN(cublasStatus_t, cublasCscal_v2_64, cublasHandle_t, handle, int64_t, n, const cuComplex*, alpha, cuComplex*, x, int64_t, incx); +DEF_FN(cublasStatus_t, cublasCsscal_v2, cublasHandle_t, handle, int, n, const float*, alpha, cuComplex*, x, int, incx); +DEF_FN(cublasStatus_t, cublasCsscal_v2_64, cublasHandle_t, handle, int64_t, n, const float*, alpha, cuComplex*, x, int64_t, incx); +DEF_FN(cublasStatus_t, cublasZscal_v2, cublasHandle_t, handle, int, n, const cuDoubleComplex*, alpha, cuDoubleComplex*, x, int, incx); +DEF_FN(cublasStatus_t, cublasZscal_v2_64, cublasHandle_t, handle, int64_t, n, const cuDoubleComplex*, alpha, cuDoubleComplex*, x, int64_t, incx); +DEF_FN(cublasStatus_t, cublasZdscal_v2, cublasHandle_t, handle, int, n, const double*, alpha, cuDoubleComplex*, x, int, incx); +DEF_FN(cublasStatus_t, cublasZdscal_v2_64, cublasHandle_t, handle, int64_t, n, const double*, alpha, cuDoubleComplex*, x, int64_t, incx); +DEF_FN(cublasStatus_t, cublasAxpyEx, cublasHandle_t, handle, int, n, const void*, alpha, cudaDataType, alphaType, const void*, x, cudaDataType, xType, int, incx, void*, y, cudaDataType, yType, int, incy, cudaDataType, executiontype); +DEF_FN(cublasStatus_t, cublasAxpyEx_64, cublasHandle_t, handle, int64_t, n, const void*, alpha, cudaDataType, alphaType, const void*, x, cudaDataType, xType, int64_t, incx, void*, y, cudaDataType, yType, int64_t, incy, cudaDataType, executiontype); 
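The DEF_FN declarations in this file are local passthrough stubs: as the expanded cublasXerbla wrapper above shows, they look up the real symbol in the original libcublas via dlsym and call it in-process. Promoting one of them to a remoted call follows the same shape as the hand-written wrappers in this file (scalars by value, device pointers as opaque ptr handles, status code returned by the server). A minimal sketch for cublasSaxpy_v2 follows; the RPC stub rpc_cublassaxpy_1 is a hypothetical name and would need a matching entry in the protocol definition, which this patch does not add.

// Sketch only -- not part of the patch. rpc_cublassaxpy_1 is hypothetical,
// mirroring rpc_cublassgemv_1 used by cublasSgemv_v2 further down.
cublasStatus_t cublasSaxpy_v2(cublasHandle_t handle, int n, const float* alpha,
                              const float* x, int incx, float* y, int incy)
{
#ifdef WITH_API_CNT
    api_call_cnt++;
#endif //WITH_API_CNT
    int result;
    enum clnt_stat retval_1;
    // Scalars travel by value (assumes CUBLAS_POINTER_MODE_HOST, like the
    // existing gemv wrappers); device pointers go as opaque 'ptr' handles.
    retval_1 = rpc_cublassaxpy_1(
        (ptr)handle,
        n,
        *alpha,
        (ptr)x, incx,
        (ptr)y, incy,
        &result, clnt);
    if (retval_1 != RPC_SUCCESS) {
        clnt_perror(clnt, "call failed");
    }
    return result;
}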
+DEF_FN(cublasStatus_t, cublasSaxpy_v2, cublasHandle_t, handle, int, n, const float*, alpha, const float*, x, int, incx, float*, y, int, incy); +DEF_FN(cublasStatus_t, cublasSaxpy_v2_64, , cublasHandle_t, handle, int64_t, n, const float*, alpha, const float*, x, int64_t, incx, float*, y, int64_t, incy); +DEF_FN(cublasStatus_t, cublasDaxpy_v2, cublasHandle_t, handle, int, n, const double*, alpha, const double*, x, int, incx, double*, y, int, incy); +DEF_FN(cublasStatus_t, cublasDaxpy_v2_64, , cublasHandle_t, handle, int64_t, n, const double*, alpha, const double*, x, int64_t, incx, double*, y, int64_t, incy); +DEF_FN(cublasStatus_t, cublasCaxpy_v2, , cublasHandle_t, handle, int, n, const cuComplex*, alpha, const cuComplex*, x, int, incx, cuComplex*, y, int, incy); +DEF_FN(cublasStatus_t, cublasCaxpy_v2_64, cublasHandle_t, handle, int64_t, n, const cuComplex*, alpha, const cuComplex*, x, int64_t, incx, cuComplex*, y, int64_t, incy); +DEF_FN(cublasStatus_t, cublasZaxpy_v2, cublasHandle_t, handle, int, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, x, int, incx, cuDoubleComplex*, y, int, incy); +DEF_FN(cublasStatus_t, cublasZaxpy_v2_64, cublasHandle_t, handle, int64_t, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, x, int64_t, incx, cuDoubleComplex*, y, int64_t, incy); +DEF_FN(cublasStatus_t, cublasCopyEx, , cublasHandle_t, handle, int, n, const void*, x, cudaDataType, xType, int, incx, void*, y, cudaDataType, yType, int, incy); +DEF_FN(cublasStatus_t, cublasCopyEx_64, cublasHandle_t, handle, int64_t, n, const void*, x, cudaDataType, xType, int64_t, incx, void*, y, cudaDataType, yType, int64_t, incy); +DEF_FN(cublasStatus_t, cublasScopy_v2, cublasHandle_t, handle, int, n, const float*, x, int, incx, float*, y, int, incy); +DEF_FN(cublasStatus_t, cublasScopy_v2_64, cublasHandle_t, handle, int64_t, n, const float*, x, int64_t, incx, float*, y, int64_t, incy); +DEF_FN(cublasStatus_t, cublasDcopy_v2, cublasHandle_t, handle, int, n, const double*, x, int, incx, double*, y, int, incy); +DEF_FN(cublasStatus_t, cublasDcopy_v2_64, cublasHandle_t, handle, int64_t, n, const double*, x, int64_t, incx, double*, y, int64_t, incy); +DEF_FN(cublasStatus_t, cublasCcopy_v2, cublasHandle_t, handle, int, n, const cuComplex*, x, int, incx, cuComplex*, y, int, incy); +DEF_FN(cublasStatus_t, cublasCcopy_v2_64, cublasHandle_t, handle, int64_t, n, const cuComplex*, x, int64_t, incx, cuComplex*, y, int64_t, incy); +DEF_FN(cublasStatus_t, cublasZcopy_v2, cublasHandle_t, handle, int, n, const cuDoubleComplex*, x, int, incx, cuDoubleComplex*, y, int, incy); +DEF_FN(cublasStatus_t, cublasZcopy_v2_64, , cublasHandle_t, handle, int64_t, n, const cuDoubleComplex*, x, int64_t, incx, cuDoubleComplex*, y, int64_t, incy); +DEF_FN(cublasStatus_t, cublasSswap_v2, cublasHandle_t, handle, int, n, float*, x, int, incx, float*, y, int, incy); +DEF_FN(cublasStatus_t, cublasSswap_v2_64, cublasHandle_t, handle, int64_t, n, float*, x, int64_t, incx, float*, y, int64_t, incy); +DEF_FN(cublasStatus_t, cublasDswap_v2, cublasHandle_t, handle, int, n, double*, x, int, incx, double*, y, int, incy); +DEF_FN(cublasStatus_t, cublasDswap_v2_64, cublasHandle_t, handle, int64_t, n, double*, x, int64_t, incx, double*, y, int64_t, incy); +DEF_FN(cublasStatus_t, cublasCswap_v2, cublasHandle_t, handle, int, n, cuComplex*, x, int, incx, cuComplex*, y, int, incy); +DEF_FN(cublasStatus_t, cublasCswap_v2_64, cublasHandle_t, handle, int64_t, n, cuComplex*, x, int64_t, incx, cuComplex*, y, int64_t, incy); +DEF_FN(cublasStatus_t, 
cublasZswap_v2, cublasHandle_t, handle, int, n, cuDoubleComplex*, x, int, incx, cuDoubleComplex*, y, int, incy); +DEF_FN(cublasStatus_t, cublasZswap_v2_64, cublasHandle_t, handle, int64_t, n, cuDoubleComplex*, x, int64_t, incx, cuDoubleComplex*, y, int64_t, incy); +DEF_FN(cublasStatus_t, cublasSwapEx, , cublasHandle_t, handle, int, n, void*, x, cudaDataType, xType, int, incx, void*, y, cudaDataType, yType, int, incy); +DEF_FN(cublasStatus_t, cublasSwapEx_64, cublasHandle_t, handle, int64_t, n, void*, x, cudaDataType, xType, int64_t, incx, void*, y, cudaDataType, yType, int64_t, incy); +DEF_FN(cublasStatus_t, cublasIsamax_v2, cublasHandle_t, handle, int, n, const float*, x, int, incx, int*, result); +DEF_FN(cublasStatus_t, cublasIsamax_v2_64, cublasHandle_t, handle, int64_t, n, const float*, x, int64_t, incx, int64_t*, result); +DEF_FN(cublasStatus_t, cublasIdamax_v2, cublasHandle_t, handle, int, n, const double*, x, int, incx, int*, result); +DEF_FN(cublasStatus_t, cublasIdamax_v2_64, cublasHandle_t, handle, int64_t, n, const double*, x, int64_t, incx, int64_t*, result); +DEF_FN(cublasStatus_t, cublasIcamax_v2, cublasHandle_t, handle, int, n, const cuComplex*, x, int, incx, int*, result); +DEF_FN(cublasStatus_t, cublasIcamax_v2_64, cublasHandle_t, handle, int64_t, n, const cuComplex*, x, int64_t, incx, int64_t*, result); +DEF_FN(cublasStatus_t, cublasIzamax_v2, cublasHandle_t, handle, int, n, const cuDoubleComplex*, x, int, incx, int*, result); +DEF_FN(cublasStatus_t, cublasIzamax_v2_64, cublasHandle_t, handle, int64_t, n, const cuDoubleComplex*, x, int64_t, incx, int64_t*, result); +DEF_FN(cublasStatus_t, cublasIamaxEx, cublasHandle_t, handle, int, n, const void*, x, cudaDataType, xType, int, incx, int*, result); +DEF_FN(cublasStatus_t, cublasIamaxEx_64, cublasHandle_t, handle, int64_t, n, const void*, x, cudaDataType, xType, int64_t, incx, int64_t*, result); +DEF_FN(cublasStatus_t, cublasIsamin_v2, cublasHandle_t, handle, int, n, const float*, x, int, incx, int*, result); +DEF_FN(cublasStatus_t, cublasIsamin_v2_64, cublasHandle_t, handle, int64_t, n, const float*, x, int64_t, incx, int64_t*, result); +DEF_FN(cublasStatus_t, cublasIdamin_v2, cublasHandle_t, handle, int, n, const double*, x, int, incx, int*, result); +DEF_FN(cublasStatus_t, cublasIdamin_v2_64, cublasHandle_t, handle, int64_t, n, const double*, x, int64_t, incx, int64_t*, result); +DEF_FN(cublasStatus_t, cublasIcamin_v2, cublasHandle_t, handle, int, n, const cuComplex*, x, int, incx, int*, result); +DEF_FN(cublasStatus_t, cublasIcamin_v2_64, cublasHandle_t, handle, int64_t, n, const cuComplex*, x, int64_t, incx, int64_t*, result); +DEF_FN(cublasStatus_t, cublasIzamin_v2, cublasHandle_t, handle, int, n, const cuDoubleComplex*, x, int, incx, int*, result); +DEF_FN(cublasStatus_t, cublasIzamin_v2_64, cublasHandle_t, handle, int64_t, n, const cuDoubleComplex*, x, int64_t, incx, int64_t*, result); +DEF_FN(cublasStatus_t, cublasIaminEx, cublasHandle_t, handle, int, n, const void*, x, cudaDataType, xType, int, incx, int*, result); +DEF_FN(cublasStatus_t, cublasIaminEx_64, cublasHandle_t, handle, int64_t, n, const void*, x, cudaDataType, xType, int64_t, incx, int64_t*, result); +DEF_FN(cublasStatus_t, cublasAsumEx, cublasHandle_t, handle, int, n, const void*, x, cudaDataType, xType, int, incx, void*, result, cudaDataType, resultType, cudaDataType, executiontype); +DEF_FN(cublasStatus_t, cublasAsumEx_64, cublasHandle_t, handle, int64_t, n, const void*, x, cudaDataType, xType, int64_t, incx, void*, result, cudaDataType, resultType, 
cudaDataType, executiontype); +DEF_FN(cublasStatus_t, cublasSasum_v2, cublasHandle_t, handle, int, n, const float*, x, int, incx, float*, result); +DEF_FN(cublasStatus_t, cublasSasum_v2_64, cublasHandle_t, handle, int64_t, n, const float*, x, int64_t, incx, float*, result); +DEF_FN(cublasStatus_t, cublasDasum_v2, cublasHandle_t, handle, int, n, const double*, x, int, incx, double*, result); +DEF_FN(cublasStatus_t, cublasDasum_v2_64, cublasHandle_t, handle, int64_t, n, const double*, x, int64_t, incx, double*, result); +DEF_FN(cublasStatus_t, cublasScasum_v2, cublasHandle_t, handle, int, n, const cuComplex*, x, int, incx, float*, result); +DEF_FN(cublasStatus_t, cublasScasum_v2_64, cublasHandle_t, handle, int64_t, n, const cuComplex*, x, int64_t, incx, float*, result); +DEF_FN(cublasStatus_t, cublasDzasum_v2, cublasHandle_t, handle, int, n, const cuDoubleComplex*, x, int, incx, double*, result); +DEF_FN(cublasStatus_t, cublasDzasum_v2_64, cublasHandle_t, handle, int64_t, n, const cuDoubleComplex*, x, int64_t, incx, double*, result); +DEF_FN(cublasStatus_t, cublasSrot_v2, cublasHandle_t, handle, int, n, float*, x, int, incx, float*, y, int, incy, const float*, c, const float*, s); +DEF_FN(cublasStatus_t, cublasSrot_v2_64, , cublasHandle_t, handle, int64_t, n, float*, x, int64_t, incx, float*, y, int64_t, incy, const float*, c, const float*, s); +DEF_FN(cublasStatus_t, cublasDrot_v2, cublasHandle_t, handle, int, n, double*, x, int, incx, double*, y, int, incy, const double*, c, const double*, s); +DEF_FN(cublasStatus_t, cublasDrot_v2_64, cublasHandle_t, handle, int64_t, n, double*, x, int64_t, incx, double*, y, int64_t, incy, const double*, c, const double*, s); +DEF_FN(cublasStatus_t, cublasCrot_v2, , cublasHandle_t, handle, int, n, cuComplex*, x, int, incx, cuComplex*, y, int, incy, const float*, c, const cuComplex*, s); +DEF_FN(cublasStatus_t, cublasCrot_v2_64, cublasHandle_t, handle, int64_t, n, cuComplex*, x, int64_t, incx, cuComplex*, y, int64_t, incy, const float*, c, const cuComplex*, s); +DEF_FN(cublasStatus_t, cublasCsrot_v2, , cublasHandle_t, handle, int, n, cuComplex*, x, int, incx, cuComplex*, y, int, incy, const float*, c, const float*, s); +DEF_FN(cublasStatus_t, cublasCsrot_v2_64, cublasHandle_t, handle, int64_t, n, cuComplex*, x, int64_t, incx, cuComplex*, y, int64_t, incy, const float*, c, const float*, s); +DEF_FN(cublasStatus_t, cublasZrot_v2, cublasHandle_t, handle, int, n, cuDoubleComplex*, x, int, incx, cuDoubleComplex*, y, int, incy, const double*, c, const cuDoubleComplex*, s); +DEF_FN(cublasStatus_t, cublasZrot_v2_64, cublasHandle_t, handle, int64_t, n, cuDoubleComplex*, x, int64_t, incx, cuDoubleComplex*, y, int64_t, incy, const double*, c, const cuDoubleComplex*, s); +DEF_FN(cublasStatus_t, cublasZdrot_v2, cublasHandle_t, handle, int, n, cuDoubleComplex*, x, int, incx, cuDoubleComplex*, y, int, incy, const double*, c, const double*, s); +DEF_FN(cublasStatus_t, cublasZdrot_v2_64, cublasHandle_t, handle, int64_t, n, cuDoubleComplex*, x, int64_t, incx, cuDoubleComplex*, y, int64_t, incy, const double*, c, const double*, s); +DEF_FN(cublasStatus_t, cublasRotEx, cublasHandle_t, handle, int, n, void*, x, cudaDataType, xType, int, incx, void*, y, cudaDataType, yType, int, incy, const void*, c, const void*, s, cudaDataType, csType, cudaDataType, executiontype); +DEF_FN(cublasStatus_t, cublasRotEx_64, cublasHandle_t, handle, int64_t, n, void*, x, cudaDataType, xType, int64_t, incx, void*, y, cudaDataType, yType, int64_t, incy, const void*, c, const void*, s, cudaDataType, 
csType, cudaDataType, executiontype); +DEF_FN(cublasStatus_t, cublasSrotg_v2, cublasHandle_t, handle, float*, a, float*, b, float*, c, float*, s); +DEF_FN(cublasStatus_t, cublasDrotg_v2, cublasHandle_t, handle, double*, a, double*, b, double*, c, double*, s); +DEF_FN(cublasStatus_t, cublasCrotg_v2, cublasHandle_t, handle, cuComplex*, a, cuComplex*, b, float*, c, cuComplex*, s); +DEF_FN(cublasStatus_t, cublasZrotg_v2, cublasHandle_t, handle, cuDoubleComplex*, a, cuDoubleComplex*, b, double*, c, cuDoubleComplex*, s); +DEF_FN(cublasStatus_t, cublasRotgEx, cublasHandle_t, handle, void*, a, void*, b, cudaDataType, abType, void*, c, void*, s, cudaDataType, csType, cudaDataType, executiontype); +DEF_FN(cublasStatus_t, cublasSrotm_v2, cublasHandle_t, handle, int, n, float*, x, int, incx, float*, y, int, incy, const float*, param); +DEF_FN(cublasStatus_t, cublasSrotm_v2_64, cublasHandle_t, handle, int64_t, n, float*, x, int64_t, incx, float*, y, int64_t, incy, const float*, param); +DEF_FN(cublasStatus_t, cublasDrotm_v2, cublasHandle_t, handle, int, n, double*, x, int, incx, double*, y, int, incy, const double*, param); +DEF_FN(cublasStatus_t, cublasDrotm_v2_64, , cublasHandle_t, handle, int64_t, n, double*, x, int64_t, incx, double*, y, int64_t, incy, const double*, param); +DEF_FN(cublasStatus_t, cublasRotmEx, cublasHandle_t, handle, int, n, void*, x, cudaDataType, xType, int, incx, void*, y, cudaDataType, yType, int, incy, const void*, param, cudaDataType, paramType, cudaDataType, executiontype); +DEF_FN(cublasStatus_t, cublasRotmEx_64, cublasHandle_t, handle, int64_t, n, void*, x, cudaDataType, xType, int64_t, incx, void*, y, cudaDataType, yType, int64_t, incy, const void*, param, cudaDataType, paramType, cudaDataType, executiontype); +DEF_FN(cublasStatus_t, cublasSrotmg_v2, cublasHandle_t, handle, float*, d1, float*, d2, float*, x1, const float*, y1, float*, param); +DEF_FN(cublasStatus_t, cublasDrotmg_v2, cublasHandle_t, handle, double*, d1, double*, d2, double*, x1, const double*, y1, double*, param); +DEF_FN(cublasStatus_t, cublasRotmgEx, cublasHandle_t, handle, void*, d1, cudaDataType, d1Type, void*, d2, cudaDataType, d2Type, void*, x1, cudaDataType, x1Type, const void*, y1, cudaDataType, y1Type, void*, param, cudaDataType, paramType, cudaDataType, executiontype); + +cublasStatus_t cublasSgemv_v2(cublasHandle_t handle, + cublasOperation_t trans, + int m, int n, + const float *alpha, + const float *A, int lda, + const float *x, int incx, + const float *beta, + float *y, int incy) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + retval_1 = rpc_cublassgemv_1( + (ptr)handle, + (int)trans, + m, n, + *alpha, + (ptr)A, lda, + (ptr)x, incx, + *beta, + (ptr)y, incy, + &result, clnt); + if (retval_1 != RPC_SUCCESS) { + clnt_perror (clnt, "call failed"); + } + return result; +} + +DEF_FN(cublasStatus_t, cublasSgemv_v2_64, cublasHandle_t, handle, cublasOperation_t, trans, int64_t, m, int64_t, n, const float*, alpha, const float*, A, int64_t, lda, const float*, x, int64_t, incx, const float*, beta, float*, y, int64_t, incy); + +cublasStatus_t cublasDgemv_v2(cublasHandle_t handle, +cublasOperation_t trans, + int m, int n, + const double *alpha, + const double *A, int lda, + const double *x, int incx, + const double *beta, + double *y, int incy) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + retval_1 = rpc_cublasdgemv_1( + (ptr)handle, + (int)trans, + m, n, + *alpha, + (ptr)A, lda, + 
(ptr)x, incx, + *beta, + (ptr)y, incy, + &result, clnt); + if (retval_1 != RPC_SUCCESS) { + clnt_perror (clnt, "call failed"); + } + return result; +} +DEF_FN(cublasStatus_t, cublasDgemv_v2_64, cublasHandle_t, handle, cublasOperation_t, trans, int64_t, m, int64_t, n, const double*, alpha, const double*, A, int64_t, lda, const double*, x, int64_t, incx, const double*, beta, double*, y, int64_t, incy); +DEF_FN(cublasStatus_t, cublasCgemv_v2, cublasHandle_t, handle, cublasOperation_t, trans, int, m, int, n, const cuComplex*, alpha, const cuComplex*, A, int, lda, const cuComplex*, x, int, incx, const cuComplex*, beta, cuComplex*, y, int, incy); +DEF_FN(cublasStatus_t, cublasCgemv_v2_64, cublasHandle_t, handle, cublasOperation_t, trans, int64_t, m, int64_t, n, const cuComplex*, alpha, const cuComplex*, A, int64_t, lda, const cuComplex*, x, int64_t, incx, const cuComplex*, beta, cuComplex*, y, int64_t, incy); +DEF_FN(cublasStatus_t, cublasZgemv_v2, cublasHandle_t, handle, cublasOperation_t, trans, int, m, int, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int, lda, const cuDoubleComplex*, x, int, incx, const cuDoubleComplex*, beta, cuDoubleComplex*, y, int, incy); +DEF_FN(cublasStatus_t, cublasZgemv_v2_64, cublasHandle_t, handle, cublasOperation_t, trans, int64_t, m, int64_t, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int64_t, lda, const cuDoubleComplex*, x, int64_t, incx, const cuDoubleComplex*, beta, cuDoubleComplex*, y, int64_t, incy); +DEF_FN(cublasStatus_t, cublasSgbmv_v2, cublasHandle_t, handle, cublasOperation_t, trans, int, m, int, n, int, kl, int, ku, const float*, alpha, const float*, A, int, lda, const float*, x, int, incx, const float*, beta, float*, y, int, incy); +DEF_FN(cublasStatus_t, cublasSgbmv_v2_64, cublasHandle_t, handle, cublasOperation_t, trans, int64_t, m, int64_t, n, int64_t, kl, int64_t, ku, const float*, alpha, const float*, A, int64_t, lda, const float*, x, int64_t, incx, const float*, beta, float*, y, int64_t, incy); +DEF_FN(cublasStatus_t, cublasDgbmv_v2, cublasHandle_t, handle, cublasOperation_t, trans, int, m, int, n, int, kl, int, ku, const double*, alpha, const double*, A, int, lda, const double*, x, int, incx, const double*, beta, double*, y, int, incy); +DEF_FN(cublasStatus_t, cublasDgbmv_v2_64, cublasHandle_t, handle, cublasOperation_t, trans, int64_t, m, int64_t, n, int64_t, kl, int64_t, ku, const double*, alpha, const double*, A, int64_t, lda, const double*, x, int64_t, incx, const double*, beta, double*, y, int64_t, incy); +DEF_FN(cublasStatus_t, cublasCgbmv_v2, cublasHandle_t, handle, cublasOperation_t, trans, int, m, int, n, int, kl, int, ku, const cuComplex*, alpha, const cuComplex*, A, int, lda, const cuComplex*, x, int, incx, const cuComplex*, beta, cuComplex*, y, int, incy); +DEF_FN(cublasStatus_t, cublasCgbmv_v2_64, cublasHandle_t, handle, cublasOperation_t, trans, int64_t, m, int64_t, n, int64_t, kl, int64_t, ku, const cuComplex*, alpha, const cuComplex*, A, int64_t, lda, const cuComplex*, x, int64_t, incx, const cuComplex*, beta, cuComplex*, y, int64_t, incy); +DEF_FN(cublasStatus_t, cublasZgbmv_v2, cublasHandle_t, handle, cublasOperation_t, trans, int, m, int, n, int, kl, int, ku, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int, lda, const cuDoubleComplex*, x, int, incx, const cuDoubleComplex*, beta, cuDoubleComplex*, y, int, incy); +DEF_FN(cublasStatus_t, cublasZgbmv_v2_64, cublasHandle_t, handle, cublasOperation_t, trans, int64_t, m, int64_t, n, int64_t, kl, int64_t, ku, const cuDoubleComplex*, 
alpha, const cuDoubleComplex*, A, int64_t, lda, const cuDoubleComplex*, x, int64_t, incx, const cuDoubleComplex*, beta, cuDoubleComplex*, y, int64_t, incy); +DEF_FN(cublasStatus_t, cublasStrmv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, const float*, A, int, lda, float*, x, int, incx); +DEF_FN(cublasStatus_t, cublasStrmv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, const float*, A, int64_t, lda, float*, x, int64_t, incx); +DEF_FN(cublasStatus_t, cublasDtrmv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, const double*, A, int, lda, double*, x, int, incx); +DEF_FN(cublasStatus_t, cublasDtrmv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, const double*, A, int64_t, lda, double*, x, int64_t, incx); +DEF_FN(cublasStatus_t, cublasCtrmv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, const cuComplex*, A, int, lda, cuComplex*, x, int, incx); +DEF_FN(cublasStatus_t, cublasCtrmv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, const cuComplex*, A, int64_t, lda, cuComplex*, x, int64_t, incx); +DEF_FN(cublasStatus_t, cublasZtrmv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, const cuDoubleComplex*, A, int, lda, cuDoubleComplex*, x, int, incx); +DEF_FN(cublasStatus_t, cublasZtrmv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, const cuDoubleComplex*, A, int64_t, lda, cuDoubleComplex*, x, int64_t, incx); +DEF_FN(cublasStatus_t, cublasStbmv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, int, k, const float*, A, int, lda, float*, x, int, incx); +DEF_FN(cublasStatus_t, cublasStbmv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, int64_t, k, const float*, A, int64_t, lda, float*, x, int64_t, incx); +DEF_FN(cublasStatus_t, cublasDtbmv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, int, k, const double*, A, int, lda, double*, x, int, incx); +DEF_FN(cublasStatus_t, cublasDtbmv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, int64_t, k, const double*, A, int64_t, lda, double*, x, int64_t, incx); +DEF_FN(cublasStatus_t, cublasCtbmv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, int, k, const cuComplex*, A, int, lda, cuComplex*, x, int, incx); +DEF_FN(cublasStatus_t, cublasCtbmv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, int64_t, k, const cuComplex*, A, int64_t, lda, cuComplex*, x, int64_t, incx); +DEF_FN(cublasStatus_t, cublasZtbmv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, int, k, const cuDoubleComplex*, A, int, lda, cuDoubleComplex*, x, int, incx); +DEF_FN(cublasStatus_t, cublasZtbmv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, int64_t, k, const 
cuDoubleComplex*, A, int64_t, lda, cuDoubleComplex*, x, int64_t, incx); +DEF_FN(cublasStatus_t, cublasStpmv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, const float*, AP, float*, x, int, incx); +DEF_FN(cublasStatus_t, cublasStpmv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, const float*, AP, float*, x, int64_t, incx); +DEF_FN(cublasStatus_t, cublasDtpmv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, const double*, AP, double*, x, int, incx); +DEF_FN(cublasStatus_t, cublasDtpmv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, const double*, AP, double*, x, int64_t, incx); +DEF_FN(cublasStatus_t, cublasCtpmv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, const cuComplex*, AP, cuComplex*, x, int, incx); +DEF_FN(cublasStatus_t, cublasCtpmv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, const cuComplex*, AP, cuComplex*, x, int64_t, incx); +DEF_FN(cublasStatus_t, cublasZtpmv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, const cuDoubleComplex*, AP, cuDoubleComplex*, x, int, incx); +DEF_FN(cublasStatus_t, cublasZtpmv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, const cuDoubleComplex*, AP, cuDoubleComplex*, x, int64_t, incx); +DEF_FN(cublasStatus_t, cublasStrsv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, const float*, A, int, lda, float*, x, int, incx); +DEF_FN(cublasStatus_t, cublasStrsv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, const float*, A, int64_t, lda, float*, x, int64_t, incx); +DEF_FN(cublasStatus_t, cublasDtrsv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, const double*, A, int, lda, double*, x, int, incx); +DEF_FN(cublasStatus_t, cublasDtrsv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, const double*, A, int64_t, lda, double*, x, int64_t, incx); +DEF_FN(cublasStatus_t, cublasCtrsv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, const cuComplex*, A, int, lda, cuComplex*, x, int, incx); +DEF_FN(cublasStatus_t, cublasCtrsv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, const cuComplex*, A, int64_t, lda, cuComplex*, x, int64_t, incx); +DEF_FN(cublasStatus_t, cublasZtrsv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, const cuDoubleComplex*, A, int, lda, cuDoubleComplex*, x, int, incx); +DEF_FN(cublasStatus_t, cublasZtrsv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, const cuDoubleComplex*, A, int64_t, lda, cuDoubleComplex*, x, int64_t, incx); +DEF_FN(cublasStatus_t, cublasStpsv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, const float*, AP, float*, x, int, incx); 
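In the portion of the file shown here, the calls forwarded over RPC are handle management (cublasCreate_v2/cublasDestroy_v2), the workspace/stream/math-mode setters, and cublasSgemv_v2/cublasDgemv_v2 further down; every DEF_FN declaration, including cublasSetVector/cublasSetMatrix, still goes through the local passthrough. A small caller that stays on the forwarded paths therefore looks like the sketch below. It uses only the standard CUDA runtime and cuBLAS API; whether it actually runs against a remote GPU depends on LD_PRELOADing cricket-client.so as in the CI jobs above, and on cricket's CUDA runtime client forwarding cudaMalloc/cudaMemcpy, both of which are assumptions about the surrounding setup rather than something this file provides.

// Usage sketch (editorial, not part of the patch): stays on code paths the
// new cuBLAS client wrapper forwards (create, sgemv, destroy). Data movement
// uses plain cudaMalloc/cudaMemcpy because cublasSetVector/cublasSetMatrix
// above remain DEF_FN stubs.
#include <stdio.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>

int main(void)
{
    const int m = 2, n = 2;
    float A[4] = {1, 2, 3, 4};           /* column-major 2x2 */
    float x[2] = {1, 1}, y[2] = {0, 0};
    float alpha = 1.0f, beta = 0.0f;
    float *dA, *dx, *dy;

    cudaMalloc((void**)&dA, sizeof(A));
    cudaMalloc((void**)&dx, sizeof(x));
    cudaMalloc((void**)&dy, sizeof(y));
    cudaMemcpy(dA, A, sizeof(A), cudaMemcpyHostToDevice);
    cudaMemcpy(dx, x, sizeof(x), cudaMemcpyHostToDevice);

    cublasHandle_t handle;
    if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "cublasCreate failed\n");
        return 1;
    }
    /* y = alpha * A * x + beta * y, forwarded via rpc_cublassgemv_1 */
    cublasSgemv(handle, CUBLAS_OP_N, m, n, &alpha, dA, m, dx, 1, &beta, dy, 1);
    cudaMemcpy(y, dy, sizeof(y), cudaMemcpyDeviceToHost);
    printf("y = [%f, %f]\n", y[0], y[1]);

    cublasDestroy(handle);
    cudaFree(dA); cudaFree(dx); cudaFree(dy);
    return 0;
}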
+DEF_FN(cublasStatus_t, cublasStpsv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, const float*, AP, float*, x, int64_t, incx); +DEF_FN(cublasStatus_t, cublasDtpsv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, const double*, AP, double*, x, int, incx); +DEF_FN(cublasStatus_t, cublasDtpsv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, const double*, AP, double*, x, int64_t, incx); +DEF_FN(cublasStatus_t, cublasCtpsv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, const cuComplex*, AP, cuComplex*, x, int, incx); +DEF_FN(cublasStatus_t, cublasCtpsv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, const cuComplex*, AP, cuComplex*, x, int64_t, incx); +DEF_FN(cublasStatus_t, cublasZtpsv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, const cuDoubleComplex*, AP, cuDoubleComplex*, x, int, incx); +DEF_FN(cublasStatus_t, cublasZtpsv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, const cuDoubleComplex*, AP, cuDoubleComplex*, x, int64_t, incx); +DEF_FN(cublasStatus_t, cublasStbsv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, int, k, const float*, A, int, lda, float*, x, int, incx); +DEF_FN(cublasStatus_t, cublasStbsv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, int64_t, k, const float*, A, int64_t, lda, float*, x, int64_t, incx); +DEF_FN(cublasStatus_t, cublasDtbsv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, int, k, const double*, A, int, lda, double*, x, int, incx); +DEF_FN(cublasStatus_t, cublasDtbsv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, int64_t, k, const double*, A, int64_t, lda, double*, x, int64_t, incx); +DEF_FN(cublasStatus_t, cublasCtbsv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, int, k, const cuComplex*, A, int, lda, cuComplex*, x, int, incx); +DEF_FN(cublasStatus_t, cublasCtbsv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, int64_t, k, const cuComplex*, A, int64_t, lda, cuComplex*, x, int64_t, incx); +DEF_FN(cublasStatus_t, cublasZtbsv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, int, k, const cuDoubleComplex*, A, int, lda, cuDoubleComplex*, x, int, incx); +DEF_FN(cublasStatus_t, cublasZtbsv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, int64_t, k, const cuDoubleComplex*, A, int64_t, lda, cuDoubleComplex*, x, int64_t, incx); +DEF_FN(cublasStatus_t, cublasSsymv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const float*, alpha, const float*, A, int, lda, const float*, x, int, incx, const float*, beta, float*, y, int, incy); +DEF_FN(cublasStatus_t, cublasSsymv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const float*, alpha, const float*, A, int64_t, lda, 
const float*, x, int64_t, incx, const float*, beta, float*, y, int64_t, incy); +DEF_FN(cublasStatus_t, cublasDsymv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const double*, alpha, const double*, A, int, lda, const double*, x, int, incx, const double*, beta, double*, y, int, incy); +DEF_FN(cublasStatus_t, cublasDsymv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const double*, alpha, const double*, A, int64_t, lda, const double*, x, int64_t, incx, const double*, beta, double*, y, int64_t, incy); +DEF_FN(cublasStatus_t, cublasCsymv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const cuComplex*, alpha, const cuComplex*, A, int, lda, const cuComplex*, x, int, incx, const cuComplex*, beta, cuComplex*, y, int, incy); +DEF_FN(cublasStatus_t, cublasCsymv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const cuComplex*, alpha, const cuComplex*, A, int64_t, lda, const cuComplex*, x, int64_t, incx, const cuComplex*, beta, cuComplex*, y, int64_t, incy); +DEF_FN(cublasStatus_t, cublasZsymv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int, lda, const cuDoubleComplex*, x, int, incx, const cuDoubleComplex*, beta, cuDoubleComplex*, y, int, incy); +DEF_FN(cublasStatus_t, cublasZsymv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int64_t, lda, const cuDoubleComplex*, x, int64_t, incx, const cuDoubleComplex*, beta, cuDoubleComplex*, y, int64_t, incy); +DEF_FN(cublasStatus_t, cublasChemv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const cuComplex*, alpha, const cuComplex*, A, int, lda, const cuComplex*, x, int, incx, const cuComplex*, beta, cuComplex*, y, int, incy); +DEF_FN(cublasStatus_t, cublasChemv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const cuComplex*, alpha, const cuComplex*, A, int64_t, lda, const cuComplex*, x, int64_t, incx, const cuComplex*, beta, cuComplex*, y, int64_t, incy); +DEF_FN(cublasStatus_t, cublasZhemv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int, lda, const cuDoubleComplex*, x, int, incx, const cuDoubleComplex*, beta, cuDoubleComplex*, y, int, incy); +DEF_FN(cublasStatus_t, cublasZhemv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int64_t, lda, const cuDoubleComplex*, x, int64_t, incx, const cuDoubleComplex*, beta, cuDoubleComplex*, y, int64_t, incy); +DEF_FN(cublasStatus_t, cublasSsbmv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, int, k, const float*, alpha, const float*, A, int, lda, const float*, x, int, incx, const float*, beta, float*, y, int, incy); +DEF_FN(cublasStatus_t, cublasSsbmv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, int64_t, k, const float*, alpha, const float*, A, int64_t, lda, const float*, x, int64_t, incx, const float*, beta, float*, y, int64_t, incy); +DEF_FN(cublasStatus_t, cublasDsbmv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, int, k, const double*, alpha, const double*, A, int, lda, const double*, x, int, incx, const double*, beta, double*, y, int, incy); +DEF_FN(cublasStatus_t, cublasDsbmv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, int64_t, k, const double*, alpha, const double*, A, int64_t, lda, const double*, x, int64_t, incx, const double*, beta, double*, y, int64_t, 
incy); +DEF_FN(cublasStatus_t, cublasChbmv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, int, k, const cuComplex*, alpha, const cuComplex*, A, int, lda, const cuComplex*, x, int, incx, const cuComplex*, beta, cuComplex*, y, int, incy); +DEF_FN(cublasStatus_t, cublasChbmv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, int64_t, k, const cuComplex*, alpha, const cuComplex*, A, int64_t, lda, const cuComplex*, x, int64_t, incx, const cuComplex*, beta, cuComplex*, y, int64_t, incy); +DEF_FN(cublasStatus_t, cublasZhbmv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, int, k, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int, lda, const cuDoubleComplex*, x, int, incx, const cuDoubleComplex*, beta, cuDoubleComplex*, y, int, incy); +DEF_FN(cublasStatus_t, cublasZhbmv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, int64_t, k, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int64_t, lda, const cuDoubleComplex*, x, int64_t, incx, const cuDoubleComplex*, beta, cuDoubleComplex*, y, int64_t, incy); +DEF_FN(cublasStatus_t, cublasSspmv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const float*, alpha, const float*, AP, const float*, x, int, incx, const float*, beta, float*, y, int, incy); +DEF_FN(cublasStatus_t, cublasSspmv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const float*, alpha, const float*, AP, const float*, x, int64_t, incx, const float*, beta, float*, y, int64_t, incy); +DEF_FN(cublasStatus_t, cublasDspmv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const double*, alpha, const double*, AP, const double*, x, int, incx, const double*, beta, double*, y, int, incy); +DEF_FN(cublasStatus_t, cublasDspmv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const double*, alpha, const double*, AP, const double*, x, int64_t, incx, const double*, beta, double*, y, int64_t, incy); +DEF_FN(cublasStatus_t, cublasChpmv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const cuComplex*, alpha, const cuComplex*, AP, const cuComplex*, x, int, incx, const cuComplex*, beta, cuComplex*, y, int, incy); +DEF_FN(cublasStatus_t, cublasChpmv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const cuComplex*, alpha, const cuComplex*, AP, const cuComplex*, x, int64_t, incx, const cuComplex*, beta, cuComplex*, y, int64_t, incy); +DEF_FN(cublasStatus_t, cublasZhpmv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, AP, const cuDoubleComplex*, x, int, incx, const cuDoubleComplex*, beta, cuDoubleComplex*, y, int, incy); +DEF_FN(cublasStatus_t, cublasZhpmv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, AP, const cuDoubleComplex*, x, int64_t, incx, const cuDoubleComplex*, beta, cuDoubleComplex*, y, int64_t, incy); +DEF_FN(cublasStatus_t, cublasSger_v2, cublasHandle_t, handle, int, m, int, n, const float*, alpha, const float*, x, int, incx, const float*, y, int, incy, float*, A, int, lda); +DEF_FN(cublasStatus_t, cublasSger_v2_64, cublasHandle_t, handle, int64_t, m, int64_t, n, const float*, alpha, const float*, x, int64_t, incx, const float*, y, int64_t, incy, float*, A, int64_t, lda); +DEF_FN(cublasStatus_t, cublasDger_v2, cublasHandle_t, handle, int, m, int, n, const double*, alpha, const double*, x, int, incx, const double*, y, int, incy, double*, A, int, lda); +DEF_FN(cublasStatus_t, cublasDger_v2_64, 
cublasHandle_t, handle, int64_t, m, int64_t, n, const double*, alpha, const double*, x, int64_t, incx, const double*, y, int64_t, incy, double*, A, int64_t, lda); +DEF_FN(cublasStatus_t, cublasCgeru_v2, cublasHandle_t, handle, int, m, int, n, const cuComplex*, alpha, const cuComplex*, x, int, incx, const cuComplex*, y, int, incy, cuComplex*, A, int, lda); +DEF_FN(cublasStatus_t, cublasCgeru_v2_64, cublasHandle_t, handle, int64_t, m, int64_t, n, const cuComplex*, alpha, const cuComplex*, x, int64_t, incx, const cuComplex*, y, int64_t, incy, cuComplex*, A, int64_t, lda); +DEF_FN(cublasStatus_t, cublasCgerc_v2, cublasHandle_t, handle, int, m, int, n, const cuComplex*, alpha, const cuComplex*, x, int, incx, const cuComplex*, y, int, incy, cuComplex*, A, int, lda); +DEF_FN(cublasStatus_t, cublasCgerc_v2_64, cublasHandle_t, handle, int64_t, m, int64_t, n, const cuComplex*, alpha, const cuComplex*, x, int64_t, incx, const cuComplex*, y, int64_t, incy, cuComplex*, A, int64_t, lda); +DEF_FN(cublasStatus_t, cublasZgeru_v2, cublasHandle_t, handle, int, m, int, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, x, int, incx, const cuDoubleComplex*, y, int, incy, cuDoubleComplex*, A, int, lda); +DEF_FN(cublasStatus_t, cublasZgeru_v2_64, cublasHandle_t, handle, int64_t, m, int64_t, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, x, int64_t, incx, const cuDoubleComplex*, y, int64_t, incy, cuDoubleComplex*, A, int64_t, lda); +DEF_FN(cublasStatus_t, cublasZgerc_v2, cublasHandle_t, handle, int, m, int, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, x, int, incx, const cuDoubleComplex*, y, int, incy, cuDoubleComplex*, A, int, lda); +DEF_FN(cublasStatus_t, cublasZgerc_v2_64, cublasHandle_t, handle, int64_t, m, int64_t, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, x, int64_t, incx, const cuDoubleComplex*, y, int64_t, incy, cuDoubleComplex*, A, int64_t, lda); +DEF_FN(cublasStatus_t, cublasSsyr_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const float*, alpha, const float*, x, int, incx, float*, A, int, lda); +DEF_FN(cublasStatus_t, cublasSsyr_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const float*, alpha, const float*, x, int64_t, incx, float*, A, int64_t, lda); +DEF_FN(cublasStatus_t, cublasDsyr_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const double*, alpha, const double*, x, int, incx, double*, A, int, lda); +DEF_FN(cublasStatus_t, cublasDsyr_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const double*, alpha, const double*, x, int64_t, incx, double*, A, int64_t, lda); +DEF_FN(cublasStatus_t, cublasCsyr_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const cuComplex*, alpha, const cuComplex*, x, int, incx, cuComplex*, A, int, lda); +DEF_FN(cublasStatus_t, cublasCsyr_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const cuComplex*, alpha, const cuComplex*, x, int64_t, incx, cuComplex*, A, int64_t, lda); +DEF_FN(cublasStatus_t, cublasZsyr_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, x, int, incx, cuDoubleComplex*, A, int, lda); +DEF_FN(cublasStatus_t, cublasZsyr_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, x, int64_t, incx, cuDoubleComplex*, A, int64_t, lda); +DEF_FN(cublasStatus_t, cublasCher_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const float*, alpha, const cuComplex*, x, int, incx, cuComplex*, A, 
int, lda); +DEF_FN(cublasStatus_t, cublasCher_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const float*, alpha, const cuComplex*, x, int64_t, incx, cuComplex*, A, int64_t, lda); +DEF_FN(cublasStatus_t, cublasZher_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const double*, alpha, const cuDoubleComplex*, x, int, incx, cuDoubleComplex*, A, int, lda); +DEF_FN(cublasStatus_t, cublasZher_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const double*, alpha, const cuDoubleComplex*, x, int64_t, incx, cuDoubleComplex*, A, int64_t, lda); +DEF_FN(cublasStatus_t, cublasSspr_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const float*, alpha, const float*, x, int, incx, float*, AP); +DEF_FN(cublasStatus_t, cublasSspr_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const float*, alpha, const float*, x, int64_t, incx, float*, AP); +DEF_FN(cublasStatus_t, cublasDspr_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const double*, alpha, const double*, x, int, incx, double*, AP); +DEF_FN(cublasStatus_t, cublasDspr_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const double*, alpha, const double*, x, int64_t, incx, double*, AP); +DEF_FN(cublasStatus_t, cublasChpr_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const float*, alpha, const cuComplex*, x, int, incx, cuComplex*, AP); +DEF_FN(cublasStatus_t, cublasChpr_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const float*, alpha, const cuComplex*, x, int64_t, incx, cuComplex*, AP); +DEF_FN(cublasStatus_t, cublasZhpr_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const double*, alpha, const cuDoubleComplex*, x, int, incx, cuDoubleComplex*, AP); +DEF_FN(cublasStatus_t, cublasZhpr_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const double*, alpha, const cuDoubleComplex*, x, int64_t, incx, cuDoubleComplex*, AP); +DEF_FN(cublasStatus_t, cublasSsyr2_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const float*, alpha, const float*, x, int, incx, const float*, y, int, incy, float*, A, int, lda); +DEF_FN(cublasStatus_t, cublasSsyr2_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const float*, alpha, const float*, x, int64_t, incx, const float*, y, int64_t, incy, float*, A, int64_t, lda); +DEF_FN(cublasStatus_t, cublasDsyr2_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const double*, alpha, const double*, x, int, incx, const double*, y, int, incy, double*, A, int, lda); +DEF_FN(cublasStatus_t, cublasDsyr2_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const double*, alpha, const double*, x, int64_t, incx, const double*, y, int64_t, incy, double*, A, int64_t, lda); +DEF_FN(cublasStatus_t, cublasCsyr2_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const cuComplex*, alpha, const cuComplex*, x, int, incx, const cuComplex*, y, int, incy, cuComplex*, A, int, lda); +DEF_FN(cublasStatus_t, cublasCsyr2_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const cuComplex*, alpha, const cuComplex*, x, int64_t, incx, const cuComplex*, y, int64_t, incy, cuComplex*, A, int64_t, lda); +DEF_FN(cublasStatus_t, cublasZsyr2_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, x, int, incx, const cuDoubleComplex*, y, int, incy, cuDoubleComplex*, A, int, lda); +DEF_FN(cublasStatus_t, cublasZsyr2_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo,
int64_t, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, x, int64_t, incx, const cuDoubleComplex*, y, int64_t, incy, cuDoubleComplex*, A, int64_t, lda); +DEF_FN(cublasStatus_t, cublasCher2_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const cuComplex*, alpha, const cuComplex*, x, int, incx, const cuComplex*, y, int, incy, cuComplex*, A, int, lda); +DEF_FN(cublasStatus_t, cublasCher2_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const cuComplex*, alpha, const cuComplex*, x, int64_t, incx, const cuComplex*, y, int64_t, incy, cuComplex*, A, int64_t, lda); +DEF_FN(cublasStatus_t, cublasZher2_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, x, int, incx, const cuDoubleComplex*, y, int, incy, cuDoubleComplex*, A, int, lda); +DEF_FN(cublasStatus_t, cublasZher2_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, x, int64_t, incx, const cuDoubleComplex*, y, int64_t, incy, cuDoubleComplex*, A, int64_t, lda); +DEF_FN(cublasStatus_t, cublasSspr2_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const float*, alpha, const float*, x, int, incx, const float*, y, int, incy, float*, AP); +DEF_FN(cublasStatus_t, cublasSspr2_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const float*, alpha, const float*, x, int64_t, incx, const float*, y, int64_t, incy, float*, AP); +DEF_FN(cublasStatus_t, cublasDspr2_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const double*, alpha, const double*, x, int, incx, const double*, y, int, incy, double*, AP); +DEF_FN(cublasStatus_t, cublasDspr2_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const double*, alpha, const double*, x, int64_t, incx, const double*, y, int64_t, incy, double*, AP); +DEF_FN(cublasStatus_t, cublasChpr2_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const cuComplex*, alpha, const cuComplex*, x, int, incx, const cuComplex*, y, int, incy, cuComplex*, AP); +DEF_FN(cublasStatus_t, cublasChpr2_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const cuComplex*, alpha, const cuComplex*, x, int64_t, incx, const cuComplex*, y, int64_t, incy, cuComplex*, AP); +DEF_FN(cublasStatus_t, cublasZhpr2_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, x, int, incx, const cuDoubleComplex*, y, int, incy, cuDoubleComplex*, AP); +DEF_FN(cublasStatus_t, cublasZhpr2_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, x, int64_t, incx, const cuDoubleComplex*, y, int64_t, incy, cuDoubleComplex*, AP); +DEF_FN(cublasStatus_t, cublasSgemvBatched, cublasHandle_t, handle, cublasOperation_t, trans, int, m, int, n, const float*, alpha, const float* const*, Aarray, int, lda, const float* const*, xarray, int, incx, const float*, beta, float* const*, yarray, int, incy, int, batchCount); +DEF_FN(cublasStatus_t, cublasSgemvBatched_64, cublasHandle_t, handle, cublasOperation_t, trans, int64_t, m, int64_t, n, const float*, alpha, const float* const*, Aarray, int64_t, lda, const float* const*, xarray, int64_t, incx, const float*, beta, float* const*, yarray, int64_t, incy, int64_t, batchCount); +DEF_FN(cublasStatus_t, cublasDgemvBatched, cublasHandle_t, handle, cublasOperation_t, trans, int, m, int, n, const double*, alpha, const double* const*, Aarray, int, lda, const double* const*, xarray, 
int, incx, const double*, beta, double* const*, yarray, int, incy, int, batchCount); +DEF_FN(cublasStatus_t, cublasDgemvBatched_64, cublasHandle_t, handle, cublasOperation_t, trans, int64_t, m, int64_t, n, const double*, alpha, const double* const*, Aarray, int64_t, lda, const double* const*, xarray, int64_t, incx, const double*, beta, double* const*, yarray, int64_t, incy, int64_t, batchCount); +DEF_FN(cublasStatus_t, cublasCgemvBatched, cublasHandle_t, handle, cublasOperation_t, trans, int, m, int, n, const cuComplex*, alpha, const cuComplex* const*, Aarray, int, lda, const cuComplex* const*, xarray, int, incx, const cuComplex*, beta, cuComplex* const*, yarray, int, incy, int, batchCount); +DEF_FN(cublasStatus_t, cublasCgemvBatched_64, cublasHandle_t, handle, cublasOperation_t, trans, int64_t, m, int64_t, n, const cuComplex*, alpha, const cuComplex* const*, Aarray, int64_t, lda, const cuComplex* const*, xarray, int64_t, incx, const cuComplex*, beta, cuComplex* const*, yarray, int64_t, incy, int64_t, batchCount); +DEF_FN(cublasStatus_t, cublasZgemvBatched, cublasHandle_t, handle, cublasOperation_t, trans, int, m, int, n, const cuDoubleComplex*, alpha, const cuDoubleComplex* const*, Aarray, int, lda, const cuDoubleComplex* const*, xarray, int, incx, const cuDoubleComplex*, beta, cuDoubleComplex* const*, yarray, int, incy, int, batchCount); +DEF_FN(cublasStatus_t, cublasZgemvBatched_64, cublasHandle_t, handle, cublasOperation_t, trans, int64_t, m, int64_t, n, const cuDoubleComplex*, alpha, const cuDoubleComplex* const*, Aarray, int64_t, lda, const cuDoubleComplex* const*, xarray, int64_t, incx, const cuDoubleComplex*, beta, cuDoubleComplex* const*, yarray, int64_t, incy, int64_t, batchCount); +DEF_FN(cublasStatus_t, cublasSgemvStridedBatched, cublasHandle_t, handle, cublasOperation_t, trans, int, m, int, n, const float*, alpha, const float*, A, int, lda, long long int, strideA, const float*, x, int, incx, long long int, stridex, const float*, beta, float*, y, int, incy, long long int, stridey, int, batchCount); +DEF_FN(cublasStatus_t, cublasSgemvStridedBatched_64, cublasHandle_t, handle, cublasOperation_t, trans, int64_t, m, int64_t, n, const float*, alpha, const float*, A, int64_t, lda, long long int, strideA, const float*, x, int64_t, incx, long long int, stridex, const float*, beta, float*, y, int64_t, incy, long long int, stridey, int64_t, batchCount); +DEF_FN(cublasStatus_t, cublasDgemvStridedBatched, cublasHandle_t, handle, cublasOperation_t, trans, int, m, int, n, const double*, alpha, const double*, A, int, lda, long long int, strideA, const double*, x, int, incx, long long int, stridex, const double*, beta, double*, y, int, incy, long long int, stridey, int, batchCount); +DEF_FN(cublasStatus_t, cublasDgemvStridedBatched_64, cublasHandle_t, handle, cublasOperation_t, trans, int64_t, m, int64_t, n, const double*, alpha, const double*, A, int64_t, lda, long long int, strideA, const double*, x, int64_t, incx, long long int, stridex, const double*, beta, double*, y, int64_t, incy, long long int, stridey, int64_t, batchCount); +DEF_FN(cublasStatus_t, cublasCgemvStridedBatched, cublasHandle_t, handle, cublasOperation_t, trans, int, m, int, n, const cuComplex*, alpha, const cuComplex*, A, int, lda, long long int, strideA, const cuComplex*, x, int, incx, long long int, stridex, const cuComplex*, beta, cuComplex*, y, int, incy, long long int, stridey, int, batchCount); +DEF_FN(cublasStatus_t, cublasCgemvStridedBatched_64, cublasHandle_t, handle, cublasOperation_t, trans, int64_t, m, int64_t, 
n, const cuComplex*, alpha, const cuComplex*, A, int64_t, lda, long long int, strideA, const cuComplex*, x, int64_t, incx, long long int, stridex, const cuComplex*, beta, cuComplex*, y, int64_t, incy, long long int, stridey, int64_t, batchCount); +DEF_FN(cublasStatus_t, cublasZgemvStridedBatched, cublasHandle_t, handle, cublasOperation_t, trans, int, m, int, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int, lda, long long int, strideA, const cuDoubleComplex*, x, int, incx, long long int, stridex, const cuDoubleComplex*, beta, cuDoubleComplex*, y, int, incy, long long int, stridey, int, batchCount); +DEF_FN(cublasStatus_t, cublasZgemvStridedBatched_64, cublasHandle_t, handle, cublasOperation_t, trans, int64_t, m, int64_t, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int64_t, lda, long long int, strideA, const cuDoubleComplex*, x, int64_t, incx, long long int, stridex, const cuDoubleComplex*, beta, cuDoubleComplex*, y, int64_t, incy, long long int, stridey, int64_t, batchCount); + +cublasStatus_t cublasSgemm_v2(cublasHandle_t handle, + cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, + const float *alpha, + const float *A, int lda, + const float *B, int ldb, + const float *beta, + float *C, int ldc) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + retval_1 = rpc_cublassgemm_1( + (ptr)handle, + (int)transa, + (int)transb, + m, n, k, + *alpha, + (ptr)A, lda, + (ptr)B, ldb, + *beta, + (ptr)C, ldc, + &result, clnt); + if (retval_1 != RPC_SUCCESS) { + clnt_perror (clnt, "call failed"); + } + return result; +} +DEF_FN(cublasStatus_t, cublasSgemm_v2_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const float*, alpha, const float*, A, int64_t, lda, const float*, B, int64_t, ldb, const float*, beta, float*, C, int64_t, ldc); + +cublasStatus_t cublasDgemm_v2(cublasHandle_t handle, + cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, + const double *alpha, + const double *A, int lda, + const double *B, int ldb, + const double *beta, + double *C, int ldc) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + retval_1 = rpc_cublasdgemm_1( + (ptr)handle, + (int)transa, + (int)transb, + m, n, k, + *alpha, + (ptr)A, lda, + (ptr)B, ldb, + *beta, + (ptr)C, ldc, + &result, clnt); + if (retval_1 != RPC_SUCCESS) { + clnt_perror (clnt, "call failed"); + } + return result; +} + +DEF_FN(cublasStatus_t, cublasDgemm_v2_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const double*, alpha, const double*, A, int64_t, lda, const double*, B, int64_t, ldb, const double*, beta, double*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasCgemm_v2, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, int, k, const cuComplex*, alpha, const cuComplex*, A, int, lda, const cuComplex*, B, int, ldb, const cuComplex*, beta, cuComplex*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasCgemm_v2_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const cuComplex*, alpha, const cuComplex*, A, int64_t, lda, const cuComplex*, B, int64_t, ldb, const cuComplex*, beta, cuComplex*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasCgemm3m, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, 
int, m, int, n, int, k, const cuComplex*, alpha, const cuComplex*, A, int, lda, const cuComplex*, B, int, ldb, const cuComplex*, beta, cuComplex*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasCgemm3m_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const cuComplex*, alpha, const cuComplex*, A, int64_t, lda, const cuComplex*, B, int64_t, ldb, const cuComplex*, beta, cuComplex*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasCgemm3mEx, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, int, k, const cuComplex*, alpha, const void*, A, cudaDataType, Atype, int, lda, const void*, B, cudaDataType, Btype, int, ldb, const cuComplex*, beta, void*, C, cudaDataType, Ctype, int, ldc); +DEF_FN(cublasStatus_t, cublasCgemm3mEx_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const cuComplex*, alpha, const void*, A, cudaDataType, Atype, int64_t, lda, const void*, B, cudaDataType, Btype, int64_t, ldb, const cuComplex*, beta, void*, C, cudaDataType, Ctype, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasZgemm_v2, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, int, k, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int, lda, const cuDoubleComplex*, B, int, ldb, const cuDoubleComplex*, beta, cuDoubleComplex*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasZgemm_v2_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int64_t, lda, const cuDoubleComplex*, B, int64_t, ldb, const cuDoubleComplex*, beta, cuDoubleComplex*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasZgemm3m, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, int, k, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int, lda, const cuDoubleComplex*, B, int, ldb, const cuDoubleComplex*, beta, cuDoubleComplex*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasZgemm3m_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int64_t, lda, const cuDoubleComplex*, B, int64_t, ldb, const cuDoubleComplex*, beta, cuDoubleComplex*, C, int64_t, ldc); + +cublasStatus_t cublasSgemmEx(cublasHandle_t handle, + cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, + const float *alpha, + const void *A, cudaDataType_t Atype, int lda, + const void *B, cudaDataType_t Btype, int ldb, + const float *beta, + void *C, cudaDataType_t Ctype, int ldc) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + retval_1 = rpc_cublassgemmex_1( + (ptr)handle, + (int)transa, + (int)transb, + m, n, k, + *alpha, + (ptr)A, (int)Atype, lda, + (ptr)B, (int)Btype, ldb, + *beta, + (ptr)C, (int)Ctype, ldc, + &result, clnt); + if (retval_1 != RPC_SUCCESS) { + clnt_perror (clnt, "call failed"); + } + return result; +} + + +DEF_FN(cublasStatus_t, cublasSgemmEx_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const float*, alpha, const void*, A, cudaDataType, Atype, int64_t, lda, const void*, B, cudaDataType, Btype, int64_t, ldb, const float*, beta, void*, C, cudaDataType, Ctype, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasGemmEx, 
cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, int, k, const void*, alpha, const void*, A, cudaDataType, Atype, int, lda, const void*, B, cudaDataType, Btype, int, ldb, const void*, beta, void*, C, cudaDataType, Ctype, int, ldc, cublasComputeType_t, computeType, cublasGemmAlgo_t, algo); +DEF_FN(cublasStatus_t, cublasGemmEx_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const void*, alpha, const void*, A, cudaDataType, Atype, int64_t, lda, const void*, B, cudaDataType, Btype, int64_t, ldb, const void*, beta, void*, C, cudaDataType, Ctype, int64_t, ldc, cublasComputeType_t, computeType, cublasGemmAlgo_t, algo); +DEF_FN(cublasStatus_t, cublasCgemmEx, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, int, k, const cuComplex*, alpha, const void*, A, cudaDataType, Atype, int, lda, const void*, B, cudaDataType, Btype, int, ldb, const cuComplex*, beta, void*, C, cudaDataType, Ctype, int, ldc); +DEF_FN(cublasStatus_t, cublasCgemmEx_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const cuComplex*, alpha, const void*, A, cudaDataType, Atype, int64_t, lda, const void*, B, cudaDataType, Btype, int64_t, ldb, const cuComplex*, beta, void*, C, cudaDataType, Ctype, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasSsyrk_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int, n, int, k, const float*, alpha, const float*, A, int, lda, const float*, beta, float*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasSsyrk_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int64_t, n, int64_t, k, const float*, alpha, const float*, A, int64_t, lda, const float*, beta, float*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasDsyrk_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int, n, int, k, const double*, alpha, const double*, A, int, lda, const double*, beta, double*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasDsyrk_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int64_t, n, int64_t, k, const double*, alpha, const double*, A, int64_t, lda, const double*, beta, double*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasCsyrk_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int, n, int, k, const cuComplex*, alpha, const cuComplex*, A, int, lda, const cuComplex*, beta, cuComplex*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasCsyrk_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int64_t, n, int64_t, k, const cuComplex*, alpha, const cuComplex*, A, int64_t, lda, const cuComplex*, beta, cuComplex*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasZsyrk_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int, n, int, k, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int, lda, const cuDoubleComplex*, beta, cuDoubleComplex*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasZsyrk_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int64_t, n, int64_t, k, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int64_t, lda, const cuDoubleComplex*, beta, cuDoubleComplex*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasCsyrkEx, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int, n, int, k, const cuComplex*, alpha, const void*, A, cudaDataType, 
Atype, int, lda, const cuComplex*, beta, void*, C, cudaDataType, Ctype, int, ldc); +DEF_FN(cublasStatus_t, cublasCsyrkEx_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int64_t, n, int64_t, k, const cuComplex*, alpha, const void*, A, cudaDataType, Atype, int64_t, lda, const cuComplex*, beta, void*, C, cudaDataType, Ctype, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasCsyrk3mEx, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int, n, int, k, const cuComplex*, alpha, const void*, A, cudaDataType, Atype, int, lda, const cuComplex*, beta, void*, C, cudaDataType, Ctype, int, ldc); +DEF_FN(cublasStatus_t, cublasCsyrk3mEx_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int64_t, n, int64_t, k, const cuComplex*, alpha, const void*, A, cudaDataType, Atype, int64_t, lda, const cuComplex*, beta, void*, C, cudaDataType, Ctype, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasCherk_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int, n, int, k, const float*, alpha, const cuComplex*, A, int, lda, const float*, beta, cuComplex*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasCherk_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int64_t, n, int64_t, k, const float*, alpha, const cuComplex*, A, int64_t, lda, const float*, beta, cuComplex*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasZherk_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int, n, int, k, const double*, alpha, const cuDoubleComplex*, A, int, lda, const double*, beta, cuDoubleComplex*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasZherk_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int64_t, n, int64_t, k, const double*, alpha, const cuDoubleComplex*, A, int64_t, lda, const double*, beta, cuDoubleComplex*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasCherkEx, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int, n, int, k, const float*, alpha, const void*, A, cudaDataType, Atype, int, lda, const float*, beta, void*, C, cudaDataType, Ctype, int, ldc); +DEF_FN(cublasStatus_t, cublasCherkEx_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int64_t, n, int64_t, k, const float*, alpha, const void*, A, cudaDataType, Atype, int64_t, lda, const float*, beta, void*, C, cudaDataType, Ctype, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasCherk3mEx, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int, n, int, k, const float*, alpha, const void*, A, cudaDataType, Atype, int, lda, const float*, beta, void*, C, cudaDataType, Ctype, int, ldc); +DEF_FN(cublasStatus_t, cublasCherk3mEx_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int64_t, n, int64_t, k, const float*, alpha, const void*, A, cudaDataType, Atype, int64_t, lda, const float*, beta, void*, C, cudaDataType, Ctype, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasSsyr2k_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int, n, int, k, const float*, alpha, const float*, A, int, lda, const float*, B, int, ldb, const float*, beta, float*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasSsyr2k_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int64_t, n, int64_t, k, const float*, alpha, const float*, A, int64_t, lda, const float*, B, int64_t, ldb, const float*, beta, float*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasDsyr2k_v2, 
cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int, n, int, k, const double*, alpha, const double*, A, int, lda, const double*, B, int, ldb, const double*, beta, double*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasDsyr2k_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int64_t, n, int64_t, k, const double*, alpha, const double*, A, int64_t, lda, const double*, B, int64_t, ldb, const double*, beta, double*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasCsyr2k_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int, n, int, k, const cuComplex*, alpha, const cuComplex*, A, int, lda, const cuComplex*, B, int, ldb, const cuComplex*, beta, cuComplex*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasCsyr2k_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int64_t, n, int64_t, k, const cuComplex*, alpha, const cuComplex*, A, int64_t, lda, const cuComplex*, B, int64_t, ldb, const cuComplex*, beta, cuComplex*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasZsyr2k_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int, n, int, k, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int, lda, const cuDoubleComplex*, B, int, ldb, const cuDoubleComplex*, beta, cuDoubleComplex*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasZsyr2k_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int64_t, n, int64_t, k, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int64_t, lda, const cuDoubleComplex*, B, int64_t, ldb, const cuDoubleComplex*, beta, cuDoubleComplex*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasCher2k_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int, n, int, k, const cuComplex*, alpha, const cuComplex*, A, int, lda, const cuComplex*, B, int, ldb, const float*, beta, cuComplex*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasCher2k_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int64_t, n, int64_t, k, const cuComplex*, alpha, const cuComplex*, A, int64_t, lda, const cuComplex*, B, int64_t, ldb, const float*, beta, cuComplex*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasZher2k_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int, n, int, k, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int, lda, const cuDoubleComplex*, B, int, ldb, const double*, beta, cuDoubleComplex*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasZher2k_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int64_t, n, int64_t, k, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int64_t, lda, const cuDoubleComplex*, B, int64_t, ldb, const double*, beta, cuDoubleComplex*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasSsyrkx, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int, n, int, k, const float*, alpha, const float*, A, int, lda, const float*, B, int, ldb, const float*, beta, float*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasSsyrkx_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int64_t, n, int64_t, k, const float*, alpha, const float*, A, int64_t, lda, const float*, B, int64_t, ldb, const float*, beta, float*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasDsyrkx, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int, n, int, k, const double*, alpha, const double*, A, int, lda, const double*, B, int, ldb, const double*, 
beta, double*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasDsyrkx_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int64_t, n, int64_t, k, const double*, alpha, const double*, A, int64_t, lda, const double*, B, int64_t, ldb, const double*, beta, double*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasCsyrkx, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int, n, int, k, const cuComplex*, alpha, const cuComplex*, A, int, lda, const cuComplex*, B, int, ldb, const cuComplex*, beta, cuComplex*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasCsyrkx_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int64_t, n, int64_t, k, const cuComplex*, alpha, const cuComplex*, A, int64_t, lda, const cuComplex*, B, int64_t, ldb, const cuComplex*, beta, cuComplex*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasZsyrkx, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int, n, int, k, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int, lda, const cuDoubleComplex*, B, int, ldb, const cuDoubleComplex*, beta, cuDoubleComplex*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasZsyrkx_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int64_t, n, int64_t, k, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int64_t, lda, const cuDoubleComplex*, B, int64_t, ldb, const cuDoubleComplex*, beta, cuDoubleComplex*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasCherkx, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int, n, int, k, const cuComplex*, alpha, const cuComplex*, A, int, lda, const cuComplex*, B, int, ldb, const float*, beta, cuComplex*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasCherkx_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int64_t, n, int64_t, k, const cuComplex*, alpha, const cuComplex*, A, int64_t, lda, const cuComplex*, B, int64_t, ldb, const float*, beta, cuComplex*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasZherkx, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int, n, int, k, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int, lda, const cuDoubleComplex*, B, int, ldb, const double*, beta, cuDoubleComplex*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasZherkx_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int64_t, n, int64_t, k, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int64_t, lda, const cuDoubleComplex*, B, int64_t, ldb, const double*, beta, cuDoubleComplex*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasSsymm_v2, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, int, m, int, n, const float*, alpha, const float*, A, int, lda, const float*, B, int, ldb, const float*, beta, float*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasSsymm_v2_64, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, int64_t, m, int64_t, n, const float*, alpha, const float*, A, int64_t, lda, const float*, B, int64_t, ldb, const float*, beta, float*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasDsymm_v2, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, int, m, int, n, const double*, alpha, const double*, A, int, lda, const double*, B, int, ldb, const double*, beta, double*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasDsymm_v2_64, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, int64_t, m, int64_t, n, const double*, alpha, const double*, A, 
int64_t, lda, const double*, B, int64_t, ldb, const double*, beta, double*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasCsymm_v2, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, int, m, int, n, const cuComplex*, alpha, const cuComplex*, A, int, lda, const cuComplex*, B, int, ldb, const cuComplex*, beta, cuComplex*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasCsymm_v2_64, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, int64_t, m, int64_t, n, const cuComplex*, alpha, const cuComplex*, A, int64_t, lda, const cuComplex*, B, int64_t, ldb, const cuComplex*, beta, cuComplex*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasZsymm_v2, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, int, m, int, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int, lda, const cuDoubleComplex*, B, int, ldb, const cuDoubleComplex*, beta, cuDoubleComplex*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasZsymm_v2_64, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, int64_t, m, int64_t, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int64_t, lda, const cuDoubleComplex*, B, int64_t, ldb, const cuDoubleComplex*, beta, cuDoubleComplex*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasChemm_v2, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, int, m, int, n, const cuComplex*, alpha, const cuComplex*, A, int, lda, const cuComplex*, B, int, ldb, const cuComplex*, beta, cuComplex*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasChemm_v2_64, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, int64_t, m, int64_t, n, const cuComplex*, alpha, const cuComplex*, A, int64_t, lda, const cuComplex*, B, int64_t, ldb, const cuComplex*, beta, cuComplex*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasZhemm_v2, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, int, m, int, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int, lda, const cuDoubleComplex*, B, int, ldb, const cuDoubleComplex*, beta, cuDoubleComplex*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasZhemm_v2_64, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, int64_t, m, int64_t, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int64_t, lda, const cuDoubleComplex*, B, int64_t, ldb, const cuDoubleComplex*, beta, cuDoubleComplex*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasStrsm_v2, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, m, int, n, const float*, alpha, const float*, A, int, lda, float*, B, int, ldb); +DEF_FN(cublasStatus_t, cublasStrsm_v2_64, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, m, int64_t, n, const float*, alpha, const float*, A, int64_t, lda, float*, B, int64_t, ldb); +DEF_FN(cublasStatus_t, cublasDtrsm_v2, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, m, int, n, const double*, alpha, const double*, A, int, lda, double*, B, int, ldb); +DEF_FN(cublasStatus_t, cublasDtrsm_v2_64, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, m, int64_t, n, const double*, alpha, const double*, A, int64_t, lda, double*, B, int64_t, ldb); +DEF_FN(cublasStatus_t, cublasCtrsm_v2, cublasHandle_t, handle, cublasSideMode_t, side, 
cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, m, int, n, const cuComplex*, alpha, const cuComplex*, A, int, lda, cuComplex*, B, int, ldb); +DEF_FN(cublasStatus_t, cublasCtrsm_v2_64, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, m, int64_t, n, const cuComplex*, alpha, const cuComplex*, A, int64_t, lda, cuComplex*, B, int64_t, ldb); +DEF_FN(cublasStatus_t, cublasZtrsm_v2, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, m, int, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int, lda, cuDoubleComplex*, B, int, ldb); +DEF_FN(cublasStatus_t, cublasZtrsm_v2_64, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, m, int64_t, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int64_t, lda, cuDoubleComplex*, B, int64_t, ldb); +DEF_FN(cublasStatus_t, cublasStrmm_v2, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, m, int, n, const float*, alpha, const float*, A, int, lda, const float*, B, int, ldb, float*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasStrmm_v2_64, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, m, int64_t, n, const float*, alpha, const float*, A, int64_t, lda, const float*, B, int64_t, ldb, float*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasDtrmm_v2, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, m, int, n, const double*, alpha, const double*, A, int, lda, const double*, B, int, ldb, double*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasDtrmm_v2_64, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, m, int64_t, n, const double*, alpha, const double*, A, int64_t, lda, const double*, B, int64_t, ldb, double*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasCtrmm_v2, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, m, int, n, const cuComplex*, alpha, const cuComplex*, A, int, lda, const cuComplex*, B, int, ldb, cuComplex*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasCtrmm_v2_64, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, m, int64_t, n, const cuComplex*, alpha, const cuComplex*, A, int64_t, lda, const cuComplex*, B, int64_t, ldb, cuComplex*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasZtrmm_v2, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, m, int, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int, lda, const cuDoubleComplex*, B, int, ldb, cuDoubleComplex*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasZtrmm_v2_64, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, m, int64_t, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int64_t, lda, const cuDoubleComplex*, B, int64_t, ldb, cuDoubleComplex*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasSgemmBatched, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, 
transb, int, m, int, n, int, k, const float*, alpha, const float* const*, Aarray, int, lda, const float* const*, Barray, int, ldb, const float*, beta, float* const*, Carray, int, ldc, int, batchCount); +DEF_FN(cublasStatus_t, cublasSgemmBatched_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const float*, alpha, const float* const*, Aarray, int64_t, lda, const float* const*, Barray, int64_t, ldb, const float*, beta, float* const*, Carray, int64_t, ldc, int64_t, batchCount); +DEF_FN(cublasStatus_t, cublasDgemmBatched, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, int, k, const double*, alpha, const double* const*, Aarray, int, lda, const double* const*, Barray, int, ldb, const double*, beta, double* const*, Carray, int, ldc, int, batchCount); +DEF_FN(cublasStatus_t, cublasDgemmBatched_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const double*, alpha, const double* const*, Aarray, int64_t, lda, const double* const*, Barray, int64_t, ldb, const double*, beta, double* const*, Carray, int64_t, ldc, int64_t, batchCount); +DEF_FN(cublasStatus_t, cublasCgemmBatched, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, int, k, const cuComplex*, alpha, const cuComplex* const*, Aarray, int, lda, const cuComplex* const*, Barray, int, ldb, const cuComplex*, beta, cuComplex* const*, Carray, int, ldc, int, batchCount); +DEF_FN(cublasStatus_t, cublasCgemmBatched_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const cuComplex*, alpha, const cuComplex* const*, Aarray, int64_t, lda, const cuComplex* const*, Barray, int64_t, ldb, const cuComplex*, beta, cuComplex* const*, Carray, int64_t, ldc, int64_t, batchCount); +DEF_FN(cublasStatus_t, cublasCgemm3mBatched, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, int, k, const cuComplex*, alpha, const cuComplex* const*, Aarray, int, lda, const cuComplex* const*, Barray, int, ldb, const cuComplex*, beta, cuComplex* const*, Carray, int, ldc, int, batchCount); +DEF_FN(cublasStatus_t, cublasCgemm3mBatched_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const cuComplex*, alpha, const cuComplex* const*, Aarray, int64_t, lda, const cuComplex* const*, Barray, int64_t, ldb, const cuComplex*, beta, cuComplex* const*, Carray, int64_t, ldc, int64_t, batchCount); +DEF_FN(cublasStatus_t, cublasZgemmBatched, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, int, k, const cuDoubleComplex*, alpha, const cuDoubleComplex* const*, Aarray, int, lda, const cuDoubleComplex* const*, Barray, int, ldb, const cuDoubleComplex*, beta, cuDoubleComplex* const*, Carray, int, ldc, int, batchCount); +DEF_FN(cublasStatus_t, cublasZgemmBatched_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const cuDoubleComplex*, alpha, const cuDoubleComplex* const*, Aarray, int64_t, lda, const cuDoubleComplex* const*, Barray, int64_t, ldb, const cuDoubleComplex*, beta, cuDoubleComplex* const*, Carray, int64_t, ldc, int64_t, batchCount); +DEF_FN(cublasStatus_t, cublasSgemmStridedBatched, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, int, k, const 
float*, alpha, const float*, A, int, lda, long long int, strideA, const float*, B, int, ldb, long long int, strideB, const float*, beta, float*, C, int, ldc, long long int, strideC, int, batchCount); +DEF_FN(cublasStatus_t, cublasSgemmStridedBatched_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const float*, alpha, const float*, A, int64_t, lda, long long int, strideA, const float*, B, int64_t, ldb, long long int, strideB, const float*, beta, float*, C, int64_t, ldc, long long int, strideC, int64_t, batchCount); +DEF_FN(cublasStatus_t, cublasDgemmStridedBatched, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, int, k, const double*, alpha, const double*, A, int, lda, long long int, strideA, const double*, B, int, ldb, long long int, strideB, const double*, beta, double*, C, int, ldc, long long int, strideC, int, batchCount); +DEF_FN(cublasStatus_t, cublasDgemmStridedBatched_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const double*, alpha, const double*, A, int64_t, lda, long long int, strideA, const double*, B, int64_t, ldb, long long int, strideB, const double*, beta, double*, C, int64_t, ldc, long long int, strideC, int64_t, batchCount); +DEF_FN(cublasStatus_t, cublasCgemmStridedBatched, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, int, k, const cuComplex*, alpha, const cuComplex*, A, int, lda, long long int, strideA, const cuComplex*, B, int, ldb, long long int, strideB, const cuComplex*, beta, cuComplex*, C, int, ldc, long long int, strideC, int, batchCount); +DEF_FN(cublasStatus_t, cublasCgemmStridedBatched_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const cuComplex*, alpha, const cuComplex*, A, int64_t, lda, long long int, strideA, const cuComplex*, B, int64_t, ldb, long long int, strideB, const cuComplex*, beta, cuComplex*, C, int64_t, ldc, long long int, strideC, int64_t, batchCount); +DEF_FN(cublasStatus_t, cublasCgemm3mStridedBatched, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, int, k, const cuComplex*, alpha, const cuComplex*, A, int, lda, long long int, strideA, const cuComplex*, B, int, ldb, long long int, strideB, const cuComplex*, beta, cuComplex*, C, int, ldc, long long int, strideC, int, batchCount); +DEF_FN(cublasStatus_t, cublasCgemm3mStridedBatched_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const cuComplex*, alpha, const cuComplex*, A, int64_t, lda, long long int, strideA, const cuComplex*, B, int64_t, ldb, long long int, strideB, const cuComplex*, beta, cuComplex*, C, int64_t, ldc, long long int, strideC, int64_t, batchCount); +DEF_FN(cublasStatus_t, cublasZgemmStridedBatched, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, int, k, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int, lda, long long int, strideA, const cuDoubleComplex*, B, int, ldb, long long int, strideB, const cuDoubleComplex*, beta, cuDoubleComplex*, C, int, ldc, long long int, strideC, int, batchCount); +DEF_FN(cublasStatus_t, cublasZgemmStridedBatched_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const cuDoubleComplex*, alpha, const 
cuDoubleComplex*, A, int64_t, lda, long long int, strideA, const cuDoubleComplex*, B, int64_t, ldb, long long int, strideB, const cuDoubleComplex*, beta, cuDoubleComplex*, C, int64_t, ldc, long long int, strideC, int64_t, batchCount); +DEF_FN(cublasStatus_t, cublasGemmBatchedEx, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, int, k, const void*, alpha, const void* const*, Aarray, cudaDataType, Atype, int, lda, const void* const*, Barray, cudaDataType, Btype, int, ldb, const void*, beta, void* const*, Carray, cudaDataType, Ctype, int, ldc, int, batchCount, cublasComputeType_t, computeType, cublasGemmAlgo_t, algo); +DEF_FN(cublasStatus_t, cublasGemmBatchedEx_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const void*, alpha, const void* const*, Aarray, cudaDataType, Atype, int64_t, lda, const void* const*, Barray, cudaDataType, Btype, int64_t, ldb, const void*, beta, void* const*, Carray, cudaDataType, Ctype, int64_t, ldc, int64_t, batchCount, cublasComputeType_t, computeType, cublasGemmAlgo_t, algo); +DEF_FN(cublasStatus_t, cublasGemmStridedBatchedEx, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, int, k, const void*, alpha, const void*, A, cudaDataType, Atype, int, lda, long long int, strideA, const void*, B, cudaDataType, Btype, int, ldb, long long int, strideB, const void*, beta, void*, C, cudaDataType, Ctype, int, ldc, long long int, strideC, int, batchCount, cublasComputeType_t, computeType, cublasGemmAlgo_t, algo); +DEF_FN(cublasStatus_t, cublasGemmStridedBatchedEx_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const void*, alpha, const void*, A, cudaDataType, Atype, int64_t, lda, long long int, strideA, const void*, B, cudaDataType, Btype, int64_t, ldb, long long int, strideB, const void*, beta, void*, C, cudaDataType, Ctype, int64_t, ldc, long long int, strideC, int64_t, batchCount, cublasComputeType_t, computeType, cublasGemmAlgo_t, algo); +DEF_FN(cublasStatus_t, cublasSgeam, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, const float*, alpha, const float*, A, int, lda, const float*, beta, const float*, B, int, ldb, float*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasSgeam_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, const float*, alpha, const float*, A, int64_t, lda, const float*, beta, const float*, B, int64_t, ldb, float*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasDgeam, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, const double*, alpha, const double*, A, int, lda, const double*, beta, const double*, B, int, ldb, double*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasDgeam_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, const double*, alpha, const double*, A, int64_t, lda, const double*, beta, const double*, B, int64_t, ldb, double*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasCgeam, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, const cuComplex*, alpha, const cuComplex*, A, int, lda, const cuComplex*, beta, const cuComplex*, B, int, ldb, cuComplex*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasCgeam_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, 
int64_t, m, int64_t, n, const cuComplex*, alpha, const cuComplex*, A, int64_t, lda, const cuComplex*, beta, const cuComplex*, B, int64_t, ldb, cuComplex*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasZgeam, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int, lda, const cuDoubleComplex*, beta, const cuDoubleComplex*, B, int, ldb, cuDoubleComplex*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasZgeam_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int64_t, lda, const cuDoubleComplex*, beta, const cuDoubleComplex*, B, int64_t, ldb, cuDoubleComplex*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasStrsmBatched, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, m, int, n, const float*, alpha, const float* const*, A, int, lda, float* const*, B, int, ldb, int, batchCount); +DEF_FN(cublasStatus_t, cublasStrsmBatched_64, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, m, int64_t, n, const float*, alpha, const float* const*, A, int64_t, lda, float* const*, B, int64_t, ldb, int64_t, batchCount); +DEF_FN(cublasStatus_t, cublasDtrsmBatched, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, m, int, n, const double*, alpha, const double* const*, A, int, lda, double* const*, B, int, ldb, int, batchCount); +DEF_FN(cublasStatus_t, cublasDtrsmBatched_64, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, m, int64_t, n, const double*, alpha, const double* const*, A, int64_t, lda, double* const*, B, int64_t, ldb, int64_t, batchCount); +DEF_FN(cublasStatus_t, cublasCtrsmBatched, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, m, int, n, const cuComplex*, alpha, const cuComplex* const*, A, int, lda, cuComplex* const*, B, int, ldb, int, batchCount); +DEF_FN(cublasStatus_t, cublasCtrsmBatched_64, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, m, int64_t, n, const cuComplex*, alpha, const cuComplex* const*, A, int64_t, lda, cuComplex* const*, B, int64_t, ldb, int64_t, batchCount); +DEF_FN(cublasStatus_t, cublasZtrsmBatched, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, m, int, n, const cuDoubleComplex*, alpha, const cuDoubleComplex* const*, A, int, lda, cuDoubleComplex* const*, B, int, ldb, int, batchCount); +DEF_FN(cublasStatus_t, cublasZtrsmBatched_64, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, m, int64_t, n, const cuDoubleComplex*, alpha, const cuDoubleComplex* const*, A, int64_t, lda, cuDoubleComplex* const*, B, int64_t, ldb, int64_t, batchCount); +DEF_FN(cublasStatus_t, cublasSdgmm, cublasHandle_t, handle, cublasSideMode_t, mode, int, m, int, n, const float*, A, int, lda, const float*, x, int, incx, float*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasSdgmm_64, cublasHandle_t, handle, cublasSideMode_t, mode, int64_t, m, int64_t, n, const float*, A, int64_t, 
lda, const float*, x, int64_t, incx, float*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasDdgmm, cublasHandle_t, handle, cublasSideMode_t, mode, int, m, int, n, const double*, A, int, lda, const double*, x, int, incx, double*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasDdgmm_64, cublasHandle_t, handle, cublasSideMode_t, mode, int64_t, m, int64_t, n, const double*, A, int64_t, lda, const double*, x, int64_t, incx, double*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasCdgmm, cublasHandle_t, handle, cublasSideMode_t, mode, int, m, int, n, const cuComplex*, A, int, lda, const cuComplex*, x, int, incx, cuComplex*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasCdgmm_64, cublasHandle_t, handle, cublasSideMode_t, mode, int64_t, m, int64_t, n, const cuComplex*, A, int64_t, lda, const cuComplex*, x, int64_t, incx, cuComplex*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasZdgmm, cublasHandle_t, handle, cublasSideMode_t, mode, int, m, int, n, const cuDoubleComplex*, A, int, lda, const cuDoubleComplex*, x, int, incx, cuDoubleComplex*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasZdgmm_64, cublasHandle_t, handle, cublasSideMode_t, mode, int64_t, m, int64_t, n, const cuDoubleComplex*, A, int64_t, lda, const cuDoubleComplex*, x, int64_t, incx, cuDoubleComplex*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasSmatinvBatched, cublasHandle_t, handle, int, n, const float* const*, A, int, lda, float* const*, Ainv, int, lda_inv, int*, info, int, batchSize); +DEF_FN(cublasStatus_t, cublasDmatinvBatched, cublasHandle_t, handle, int, n, const double* const*, A, int, lda, double* const*, Ainv, int, lda_inv, int*, info, int, batchSize); +DEF_FN(cublasStatus_t, cublasCmatinvBatched, cublasHandle_t, handle, int, n, const cuComplex* const*, A, int, lda, cuComplex* const*, Ainv, int, lda_inv, int*, info, int, batchSize); +DEF_FN(cublasStatus_t, cublasZmatinvBatched, cublasHandle_t, handle, int, n, const cuDoubleComplex* const*, A, int, lda, cuDoubleComplex* const*, Ainv, int, lda_inv, int*, info, int, batchSize); +DEF_FN(cublasStatus_t, cublasSgeqrfBatched, cublasHandle_t, handle, int, m, int, n, float* const*, Aarray, int, lda, float* const*, TauArray, int*, info, int, batchSize); +DEF_FN(cublasStatus_t, cublasDgeqrfBatched, cublasHandle_t, handle, int, m, int, n, double* const*, Aarray, int, lda, double* const*, TauArray, int*, info, int, batchSize); +DEF_FN(cublasStatus_t, cublasCgeqrfBatched, cublasHandle_t, handle, int, m, int, n, cuComplex* const*, Aarray, int, lda, cuComplex* const*, TauArray, int*, info, int, batchSize); +DEF_FN(cublasStatus_t, cublasZgeqrfBatched, cublasHandle_t, handle, int, m, int, n, cuDoubleComplex* const*, Aarray, int, lda, cuDoubleComplex* const*, TauArray, int*, info, int, batchSize); +DEF_FN(cublasStatus_t, cublasSgelsBatched, cublasHandle_t, handle, cublasOperation_t, trans, int, m, int, n, int, nrhs, float* const*, Aarray, int, lda, float* const*, Carray, int, ldc, int*, info, int*, devInfoArray, int, batchSize); +DEF_FN(cublasStatus_t, cublasDgelsBatched, cublasHandle_t, handle, cublasOperation_t, trans, int, m, int, n, int, nrhs, double* const*, Aarray, int, lda, double* const*, Carray, int, ldc, int*, info, int*, devInfoArray, int, batchSize); +DEF_FN(cublasStatus_t, cublasCgelsBatched, cublasHandle_t, handle, cublasOperation_t, trans, int, m, int, n, int, nrhs, cuComplex* const*, Aarray, int, lda, cuComplex* const*, Carray, int, ldc, int*, info, int*, devInfoArray, int, batchSize); +DEF_FN(cublasStatus_t, cublasZgelsBatched, cublasHandle_t, handle, cublasOperation_t, trans, 
int, m, int, n, int, nrhs, cuDoubleComplex* const*, Aarray, int, lda, cuDoubleComplex* const*, Carray, int, ldc, int*, info, int*, devInfoArray, int, batchSize); +DEF_FN(cublasStatus_t, cublasStpttr, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const float*, AP, float*, A, int, lda); +DEF_FN(cublasStatus_t, cublasDtpttr, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const double*, AP, double*, A, int, lda); +DEF_FN(cublasStatus_t, cublasCtpttr, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const cuComplex*, AP, cuComplex*, A, int, lda); +DEF_FN(cublasStatus_t, cublasZtpttr, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const cuDoubleComplex*, AP, cuDoubleComplex*, A, int, lda); +DEF_FN(cublasStatus_t, cublasStrttp, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const float*, A, int, lda, float*, AP); +DEF_FN(cublasStatus_t, cublasDtrttp, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const double*, A, int, lda, double*, AP); +DEF_FN(cublasStatus_t, cublasCtrttp, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const cuComplex*, A, int, lda, cuComplex*, AP); +DEF_FN(cublasStatus_t, cublasZtrttp, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const cuDoubleComplex*, A, int, lda, cuDoubleComplex*, AP); +DEF_FN(cublasStatus_t, cublasSgetrfBatched, cublasHandle_t, handle, int, n, float* const*, A, int, lda, int*, P, int*, info, int, batchSize); +DEF_FN(cublasStatus_t, cublasDgetrfBatched, cublasHandle_t, handle, int, n, double* const*, A, int, lda, int*, P, int*, info, int, batchSize); +DEF_FN(cublasStatus_t, cublasCgetrfBatched, cublasHandle_t, handle, int, n, cuComplex* const*, A, int, lda, int*, P, int*, info, int, batchSize); +DEF_FN(cublasStatus_t, cublasZgetrfBatched, cublasHandle_t, handle, int, n, cuDoubleComplex* const*, A, int, lda, int*, P, int*, info, int, batchSize); +DEF_FN(cublasStatus_t, cublasSgetriBatched, cublasHandle_t, handle, int, n, const float* const*, A, int, lda, const int*, P, float* const*, C, int, ldc, int*, info, int, batchSize); +DEF_FN(cublasStatus_t, cublasDgetriBatched, cublasHandle_t, handle, int, n, const double* const*, A, int, lda, const int*, P, double* const*, C, int, ldc, int*, info, int, batchSize); +DEF_FN(cublasStatus_t, cublasCgetriBatched, cublasHandle_t, handle, int, n, const cuComplex* const*, A, int, lda, const int*, P, cuComplex* const*, C, int, ldc, int*, info, int, batchSize); +DEF_FN(cublasStatus_t, cublasZgetriBatched, cublasHandle_t, handle, int, n, const cuDoubleComplex* const*, A, int, lda, const int*, P, cuDoubleComplex* const*, C, int, ldc, int*, info, int, batchSize); +DEF_FN(cublasStatus_t, cublasSgetrsBatched, cublasHandle_t, handle, cublasOperation_t, trans, int, n, int, nrhs, const float* const*, Aarray, int, lda, const int*, devIpiv, float* const*, Barray, int, ldb, int*, info, int, batchSize); +DEF_FN(cublasStatus_t, cublasDgetrsBatched, cublasHandle_t, handle, cublasOperation_t, trans, int, n, int, nrhs, const double* const*, Aarray, int, lda, const int*, devIpiv, double* const*, Barray, int, ldb, int*, info, int, batchSize); +DEF_FN(cublasStatus_t, cublasCgetrsBatched, cublasHandle_t, handle, cublasOperation_t, trans, int, n, int, nrhs, const cuComplex* const*, Aarray, int, lda, const int*, devIpiv, cuComplex* const*, Barray, int, ldb, int*, info, int, batchSize); +DEF_FN(cublasStatus_t, cublasZgetrsBatched, cublasHandle_t, handle, cublasOperation_t, trans, int, n, int, nrhs, const cuDoubleComplex* const*, Aarray, int, lda, const int*, devIpiv, 
cuDoubleComplex* const*, Barray, int, ldb, int*, info, int, batchSize); diff --git a/cpu/cpu-client-cudnn.c b/cpu/cpu-client-cudnn.c new file mode 100644 index 00000000..05136fe2 --- /dev/null +++ b/cpu/cpu-client-cudnn.c @@ -0,0 +1,1854 @@ +#include <cudnn.h> +#include <stdlib.h> +#include <string.h> + +#include "cpu-libwrap.h" +#include "cpu_rpc_prot.h" +#include "cpu-common.h" +#include "cpu-utils.h" +#include "log.h" + +#ifdef WITH_API_CNT +extern int api_call_cnt; +#endif //WITH_API_CNT + +size_t cudnnGetVersion(void) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + size_t result; + enum clnt_stat retval_1; + retval_1 = rpc_cudnngetversion_1(&result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + return result; +} + +size_t cudnnGetMaxDeviceVersion(void) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + size_t result; + enum clnt_stat retval_1; + retval_1 = rpc_cudnngetmaxdeviceversion_1(&result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + return result; +} +size_t cudnnGetCudartVersion(void) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + size_t result; + enum clnt_stat retval_1; + retval_1 = rpc_cudnngetcudartversion_1(&result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + return result; +} + +const char *cudnnGetErrorString(cudnnStatus_t status) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + static char str[128]; + char *result = NULL; + enum clnt_stat retval_1; + retval_1 = rpc_cudnngeterrorstring_1((int)status, &result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result == NULL) { + LOGE(LOG_ERROR, "%s failed (result is NULL)", __FUNCTION__); + } + strncpy(str, result, 128); + return str; +} + +cudnnStatus_t cudnnQueryRuntimeError(cudnnHandle_t handle, cudnnStatus_t* rstatus, cudnnErrQueryMode_t mode, cudnnRuntimeTag_t * tag) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int_result result; + enum clnt_stat retval_1; + retval_1 = rpc_cudnnqueryruntimeerror_1((ptr)handle, (int)mode, &result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result.err != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err); + } else { + *rstatus = (cudnnStatus_t)result.int_result_u.data; + //*tag = NULL; + } + return result.err; +} + +cudnnStatus_t cudnnGetProperty(libraryPropertyType type, int * value) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int_result result; + enum clnt_stat retval_1; + if (value == NULL) { + LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__); + return CUDNN_STATUS_BAD_PARAM; + } + retval_1 = rpc_cudnngetproperty_1((int)type, &result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result.err != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err); + } else { + *value = result.int_result_u.data; + } + return result.err; +} + +cudnnStatus_t cudnnCreate(cudnnHandle_t* handle) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + ptr_result result; + enum clnt_stat retval_1; + if (handle == NULL) { + LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__); + return CUDNN_STATUS_BAD_PARAM; + } + retval_1 = 
rpc_cudnncreate_1(&result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result.err != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err); + } else { + *handle = (cudnnHandle_t)result.ptr_result_u.ptr; + } + return result.err; +} + +cudnnStatus_t cudnnDestroy(cudnnHandle_t handle) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + retval_1 = rpc_cudnndestroy_1((ptr)handle, &result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } + return result; +} + +cudnnStatus_t cudnnSetStream(cudnnHandle_t handle, cudaStream_t streamId) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + retval_1 = rpc_cudnnsetstream_1((ptr)handle, (ptr)streamId, &result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } + return result; +} + +cudnnStatus_t cudnnGetStream(cudnnHandle_t handle, cudaStream_t * streamId) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + ptr_result result; + enum clnt_stat retval_1; + if (streamId == NULL) { + LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__); + return CUDNN_STATUS_BAD_PARAM; + } + retval_1 = rpc_cudnngetstream_1((ptr)handle, &result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result.err != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err); + } else { + *streamId = (cudaStream_t)result.ptr_result_u.ptr; + } + return result.err; +} + +cudnnStatus_t cudnnCreateTensorDescriptor(cudnnTensorDescriptor_t * tensorDesc) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + ptr_result result; + enum clnt_stat retval_1; + if (tensorDesc == NULL) { + LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__); + return CUDNN_STATUS_BAD_PARAM; + } + retval_1 = rpc_cudnncreatetensordescriptor_1(&result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result.err != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err); + } else { + *tensorDesc = (cudnnTensorDescriptor_t)result.ptr_result_u.ptr; + } + return result.err; +} + +cudnnStatus_t cudnnSetTensor4dDescriptor(cudnnTensorDescriptor_t tensorDesc, cudnnTensorFormat_t format, cudnnDataType_t dataType, int n, int c, int h, int w) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + retval_1 = rpc_cudnnsettensor4ddescriptor_1( + (ptr)tensorDesc, + (int)format, + (int)dataType, + n, c, h, w, &result, clnt); + + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } + return result; +} + +cudnnStatus_t cudnnSetTensor4dDescriptorEx(cudnnTensorDescriptor_t tensorDesc, cudnnDataType_t dataType, int n, int c, int h, int w, int nStride, int cStride, int hStride, int wStride) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif 
//WITH_API_CNT + int result; + enum clnt_stat retval_1; + retval_1 = rpc_cudnnsettensor4ddescriptorex_1( + (ptr)tensorDesc, + (int)dataType, + n, c, h, w, nStride, cStride, hStride, wStride, &result, clnt); + + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } + return result; +} + +cudnnStatus_t cudnnGetTensor4dDescriptor(const cudnnTensorDescriptor_t tensorDesc, cudnnDataType_t *dataType, int* n, int* c, int* h, int* w, int* nStride, int* cStride, int* hStride, int* wStride) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int9_result result; + enum clnt_stat retval_1; + if (dataType == NULL || n == NULL || c == NULL || h == NULL || w == NULL || nStride == NULL || cStride == NULL || hStride == NULL || wStride == NULL) { + LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__); + return CUDNN_STATUS_BAD_PARAM; + } + retval_1 = rpc_cudnngettensor4ddescriptor_1( + (ptr)tensorDesc, + &result, clnt); + + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result.err != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } else { + *dataType = (cudnnDataType_t)result.int9_result_u.data[0]; + *n = result.int9_result_u.data[1]; + *c = result.int9_result_u.data[2]; + *h = result.int9_result_u.data[3]; + *w = result.int9_result_u.data[4]; + *nStride = result.int9_result_u.data[5]; + *cStride = result.int9_result_u.data[6]; + *hStride = result.int9_result_u.data[7]; + *wStride = result.int9_result_u.data[8]; + } + return result.err; +} + +cudnnStatus_t cudnnSetTensorNdDescriptor(cudnnTensorDescriptor_t tensorDesc, cudnnDataType_t dataType, int nbDims, const int* dimA, const int* strideA) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + mem_data rpc_dimA = { + .mem_data_len = nbDims * sizeof(int), + .mem_data_val = (char*)dimA + }; + mem_data rpc_strideA = { + .mem_data_len = nbDims * sizeof(int), + .mem_data_val = (char*)strideA + }; + retval_1 = rpc_cudnnsettensornddescriptor_1( + (ptr)tensorDesc, + (int)dataType, + (int)nbDims, + rpc_dimA, rpc_strideA, &result, clnt); + + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } + return result; +} + +cudnnStatus_t cudnnSetTensorNdDescriptorEx(cudnnTensorDescriptor_t tensorDesc, cudnnTensorFormat_t format, cudnnDataType_t dataType, int nbDims, const int* dimA) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + mem_data rpc_dimA = { + .mem_data_len = nbDims * sizeof(int), + .mem_data_val = (char*)dimA + }; + retval_1 = rpc_cudnnsettensornddescriptorex_1( + (ptr)tensorDesc, + (int)format, + (int)dataType, + (int)nbDims, + rpc_dimA, &result, clnt); + + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } + return result; +} + +cudnnStatus_t cudnnGetTensorNdDescriptor(const cudnnTensorDescriptor_t tensorDesc, int nbDimsRequested, cudnnDataType_t *dataType, int* nbDims, int* dimA, int* strideA) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + size_t 
expected_size = nbDimsRequested * sizeof(int) * 2 + sizeof(int) + sizeof(cudnnDataType_t); + mem_result result; + result.mem_result_u.data.mem_data_val = malloc(expected_size); + enum clnt_stat retval_1; + if (dataType == NULL || nbDims == NULL || dimA == NULL || strideA == NULL) { + LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__); + return CUDNN_STATUS_BAD_PARAM; + } + retval_1 = rpc_cudnngettensornddescriptor_1( + (ptr)tensorDesc, + nbDimsRequested, + &result, clnt); + + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result.err != CUDNN_STATUS_SUCCESS || result.mem_result_u.data.mem_data_len != expected_size) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err); + } else { + size_t offset = 0; + *dataType = (cudnnDataType_t)result.mem_result_u.data.mem_data_val[offset]; + offset += sizeof(cudnnDataType_t); + *nbDims = (int)result.mem_result_u.data.mem_data_val[offset]; + offset += sizeof(int); + memcpy(dimA, result.mem_result_u.data.mem_data_val+offset, *nbDims * sizeof(int)); + offset += *nbDims * sizeof(int); + memcpy(strideA, result.mem_result_u.data.mem_data_val+offset, *nbDims * sizeof(int)); + } + free(result.mem_result_u.data.mem_data_val); + return result.err; +} + +cudnnStatus_t cudnnGetTensorSizeInBytes(const cudnnTensorDescriptor_t tensorDesc, size_t* size) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + sz_result result; + enum clnt_stat retval_1; + if (size == NULL) { + LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__); + return CUDNN_STATUS_BAD_PARAM; + } + retval_1 = rpc_cudnngettensorsizeinbytes_1( + (ptr)tensorDesc, + &result, clnt); + + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result.err != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err); + } else { + *size = result.sz_result_u.data; + } + return result.err; +} + +cudnnStatus_t cudnnDestroyTensorDescriptor(cudnnTensorDescriptor_t tensorDesc) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + retval_1 = rpc_cudnndestroytensordescriptor_1( + (ptr)tensorDesc, + &result, clnt); + + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } + return result; +} + +DEF_FN(cudnnStatus_t, cudnnInitTransformDest, const cudnnTensorTransformDescriptor_t, transformDesc, const cudnnTensorDescriptor_t, srcDesc, cudnnTensorDescriptor_t, destDesc, size_t*, destSizeInBytes) +DEF_FN(cudnnStatus_t, cudnnCreateTensorTransformDescriptor, cudnnTensorTransformDescriptor_t *, transformDesc) +DEF_FN(cudnnStatus_t, cudnnSetTensorTransformDescriptor, cudnnTensorTransformDescriptor_t, transformDesc, const uint32_t, nbDims, const cudnnTensorFormat_t, destFormat, const int32_t*, padBeforeA, const int32_t*, padAfterA, const uint32_t*, foldA, const cudnnFoldingDirection_t, direction) +DEF_FN(cudnnStatus_t, cudnnGetTensorTransformDescriptor, cudnnTensorTransformDescriptor_t, transformDesc, uint32_t, nbDimsRequested, cudnnTensorFormat_t *, destFormat, int32_t*, padBeforeA, int32_t*, padAfterA, uint32_t*, foldA, cudnnFoldingDirection_t *, direction) +DEF_FN(cudnnStatus_t, cudnnDestroyTensorTransformDescriptor, cudnnTensorTransformDescriptor_t, transformDesc) + +cudnnStatus_t cudnnTransformTensor(cudnnHandle_t handle, const void * 
alpha, const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta, const cudnnTensorDescriptor_t yDesc, void *y) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + //TODO: Check if we have a float instead of always sending doubles + cudnn_scaling_t rpc_alpha = {.dataType = CUDNN_DATA_DOUBLE, .cudnn_scaling_t_u.d = *((double*)alpha)}; + cudnn_scaling_t rpc_beta = {.dataType = CUDNN_DATA_DOUBLE, .cudnn_scaling_t_u.d = *((double*)beta)}; + retval_1 = rpc_cudnntransformtensor_1( + (ptr)handle, + rpc_alpha, + (ptr)xDesc, + (ptr)x, + rpc_beta, + (ptr)yDesc, + (ptr)y, + &result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } + return result; +} + +DEF_FN(cudnnStatus_t, cudnnTransformTensorEx, cudnnHandle_t, handle, const cudnnTensorTransformDescriptor_t, transDesc, const void *, alpha, const cudnnTensorDescriptor_t, srcDesc, const void *, srcData, const void *, beta, const cudnnTensorDescriptor_t, destDesc, void *, destData) + +cudnnStatus_t cudnnAddTensor(cudnnHandle_t handle, const void * alpha, const cudnnTensorDescriptor_t aDesc, const void * A, const void *beta, const cudnnTensorDescriptor_t cDesc, void * C) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + //TODO: Check if we have a float instead of always sending doubles + cudnn_scaling_t rpc_alpha = {.dataType = CUDNN_DATA_DOUBLE, .cudnn_scaling_t_u.d = *((double*)alpha)}; + cudnn_scaling_t rpc_beta = {.dataType = CUDNN_DATA_DOUBLE, .cudnn_scaling_t_u.d = *((double*)beta)}; + retval_1 = rpc_cudnnaddtensor_1( + (ptr)handle, + rpc_alpha, + (ptr)aDesc, + (ptr)A, + rpc_beta, + (ptr)cDesc, + (ptr)C, + &result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } + return result; +} + +DEF_FN(cudnnStatus_t, cudnnCreateOpTensorDescriptor, cudnnOpTensorDescriptor_t *, opTensorDesc) +DEF_FN(cudnnStatus_t, cudnnSetOpTensorDescriptor, cudnnOpTensorDescriptor_t, opTensorDesc, cudnnOpTensorOp_t, opTensorOp, cudnnDataType_t, opTensorCompType, cudnnNanPropagation_t, opTensorNanOpt) +DEF_FN(cudnnStatus_t, cudnnGetOpTensorDescriptor, const cudnnOpTensorDescriptor_t, opTensorDesc, cudnnOpTensorOp_t *, opTensorOp, cudnnDataType_t *, opTensorCompType, cudnnNanPropagation_t *, opTensorNanOpt) +DEF_FN(cudnnStatus_t, cudnnDestroyOpTensorDescriptor, cudnnOpTensorDescriptor_t, opTensorDesc) +DEF_FN(cudnnStatus_t, cudnnOpTensor, cudnnHandle_t, handle, const cudnnOpTensorDescriptor_t, opTensorDesc, const void *, alpha1, const cudnnTensorDescriptor_t, aDesc, const void *, A, const void *, alpha2, const cudnnTensorDescriptor_t, bDesc, const void *, B, const void *, beta, const cudnnTensorDescriptor_t, cDesc, void *, C) +DEF_FN(cudnnStatus_t, cudnnCreateReduceTensorDescriptor, cudnnReduceTensorDescriptor_t *, reduceTensorDesc) +DEF_FN(cudnnStatus_t, cudnnSetReduceTensorDescriptor, cudnnReduceTensorDescriptor_t, reduceTensorDesc, cudnnReduceTensorOp_t, reduceTensorOp, cudnnDataType_t, reduceTensorCompType, cudnnNanPropagation_t, reduceTensorNanOpt, cudnnReduceTensorIndices_t, reduceTensorIndices, cudnnIndicesType_t, reduceTensorIndicesType) +DEF_FN(cudnnStatus_t, cudnnGetReduceTensorDescriptor, const 
cudnnReduceTensorDescriptor_t, reduceTensorDesc, cudnnReduceTensorOp_t *, reduceTensorOp, cudnnDataType_t *, reduceTensorCompType, cudnnNanPropagation_t *, reduceTensorNanOpt, cudnnReduceTensorIndices_t *, reduceTensorIndices, cudnnIndicesType_t *, reduceTensorIndicesType) +DEF_FN(cudnnStatus_t, cudnnDestroyReduceTensorDescriptor, cudnnReduceTensorDescriptor_t, reduceTensorDesc) +DEF_FN(cudnnStatus_t, cudnnGetReductionIndicesSize, cudnnHandle_t, handle, const cudnnReduceTensorDescriptor_t, reduceTensorDesc, const cudnnTensorDescriptor_t, aDesc, const cudnnTensorDescriptor_t, cDesc, size_t*, sizeInBytes) +DEF_FN(cudnnStatus_t, cudnnGetReductionWorkspaceSize, cudnnHandle_t, handle, const cudnnReduceTensorDescriptor_t, reduceTensorDesc, const cudnnTensorDescriptor_t, aDesc, const cudnnTensorDescriptor_t, cDesc, size_t*, sizeInBytes) +DEF_FN(cudnnStatus_t, cudnnReduceTensor, cudnnHandle_t, handle, const cudnnReduceTensorDescriptor_t, reduceTensorDesc, void *, indices, size_t, indicesSizeInBytes, void *, workspace, size_t, workspaceSizeInBytes, const void *, alpha, const cudnnTensorDescriptor_t, aDesc, const void *, A, const void *, beta, const cudnnTensorDescriptor_t, cDesc, void *, C) +DEF_FN(cudnnStatus_t, cudnnSetTensor, cudnnHandle_t, handle, const cudnnTensorDescriptor_t, yDesc, void *, y, const void *, valuePtr) +DEF_FN(cudnnStatus_t, cudnnScaleTensor, cudnnHandle_t, handle, const cudnnTensorDescriptor_t, yDesc, void *, y, const void *, alpha) + +cudnnStatus_t cudnnCreateFilterDescriptor(cudnnFilterDescriptor_t * filterDesc) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + ptr_result result; + enum clnt_stat retval_1; + if (filterDesc == NULL) { + LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__); + return CUDNN_STATUS_BAD_PARAM; + } + retval_1 = rpc_cudnncreatefilterdescriptor_1(&result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result.err != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err); + } else { + *filterDesc = (cudnnFilterDescriptor_t)result.ptr_result_u.ptr; + } + return result.err; +} + +cudnnStatus_t cudnnSetFilter4dDescriptor(cudnnFilterDescriptor_t filterDesc, cudnnDataType_t dataType, cudnnTensorFormat_t format, int k, int c, int h, int w) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + retval_1 = rpc_cudnnsetfilter4ddescriptor_1( + (ptr)filterDesc, + (int)dataType, + (int)format, + k, c, h, w, &result, clnt); + + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } + return result; +} + +cudnnStatus_t cudnnGetFilter4dDescriptor(const cudnnFilterDescriptor_t filterDesc, cudnnDataType_t *dataType, cudnnTensorFormat_t *format, int* k, int* c, int* h, int* w) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int6_result result; + enum clnt_stat retval_1; + if (dataType == NULL || format == NULL || k == NULL || c == NULL || h == NULL || w == NULL) { + LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__); + return CUDNN_STATUS_BAD_PARAM; + } + retval_1 = rpc_cudnngetfilter4ddescriptor_1( + (ptr)filterDesc, + &result, clnt); + + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result.err != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result 
is %d)", __FUNCTION__, result); + } else { + *dataType = (cudnnDataType_t)result.int6_result_u.data[0]; + *format = (cudnnTensorFormat_t)result.int6_result_u.data[1]; + *k = result.int6_result_u.data[2]; + *c = result.int6_result_u.data[3]; + *h = result.int6_result_u.data[4]; + *w = result.int6_result_u.data[5]; + } + return result.err; +} + +cudnnStatus_t cudnnSetFilterNdDescriptor(cudnnFilterDescriptor_t filterDesc, cudnnDataType_t dataType, cudnnTensorFormat_t format, int nbDims, const int* filterDimA) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + mem_data rpc_filterDimA = { + .mem_data_len = nbDims * sizeof(int), + .mem_data_val = (char*)filterDimA + }; + retval_1 = rpc_cudnnsetfilternddescriptor_1( + (ptr)filterDesc, + (int)dataType, + (int)format, + (int)nbDims, + rpc_filterDimA, &result, clnt); + + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } + return result; +} + +cudnnStatus_t cudnnGetFilterNdDescriptor(const cudnnFilterDescriptor_t filterDesc, int nbDimsRequested, cudnnDataType_t * dataType, cudnnTensorFormat_t * format, int* nbDims, int* filterDimA) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + size_t expected_size = nbDimsRequested * sizeof(int) + sizeof(int) + sizeof(cudnnDataType_t) + sizeof(cudnnTensorFormat_t); + mem_result result; + result.mem_result_u.data.mem_data_val = (char*)malloc(expected_size); + enum clnt_stat retval_1; + if (dataType == NULL || format == NULL || nbDims == NULL || filterDimA == NULL) { + LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__); + return CUDNN_STATUS_BAD_PARAM; + } + retval_1 = rpc_cudnngetfilternddescriptor_1( + (ptr)filterDesc, + nbDimsRequested, + &result, clnt); + + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result.err != CUDNN_STATUS_SUCCESS || result.mem_result_u.data.mem_data_len < expected_size) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err); + } else { + size_t offset = 0; + *dataType = (cudnnDataType_t)result.mem_result_u.data.mem_data_val[offset]; + offset += sizeof(cudnnDataType_t); + *format = (cudnnTensorFormat_t)result.mem_result_u.data.mem_data_val[offset]; + offset += sizeof(cudnnTensorFormat_t); + *nbDims = (int)result.mem_result_u.data.mem_data_val[offset]; + offset += sizeof(int); + memcpy(filterDimA, result.mem_result_u.data.mem_data_val+offset, *nbDims * sizeof(int)); + } + free(result.mem_result_u.data.mem_data_val); + return result.err; +} + +cudnnStatus_t cudnnGetFilterSizeInBytes(const cudnnFilterDescriptor_t filterDesc, size_t* size) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + sz_result result; + enum clnt_stat retval_1; + if (size == NULL) { + LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__); + return CUDNN_STATUS_BAD_PARAM; + } + retval_1 = rpc_cudnngetfiltersizeinbytes_1( + (ptr)filterDesc, + &result, clnt); + + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result.err != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err); + } else { + *size = result.sz_result_u.data; + } + return result.err; +} + +cudnnStatus_t cudnnTransformFilter(cudnnHandle_t handle, const cudnnTensorTransformDescriptor_t transDesc, const void * alpha, const 
cudnnFilterDescriptor_t srcDesc, const void * srcData, const void * beta, const cudnnFilterDescriptor_t destDesc, void * destData) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + //TODO: Check if we have a float instead of always sending doubles + cudnn_scaling_t rpc_alpha = {.dataType = CUDNN_DATA_DOUBLE, .cudnn_scaling_t_u.d = *((double*)alpha)}; + cudnn_scaling_t rpc_beta = {.dataType = CUDNN_DATA_DOUBLE, .cudnn_scaling_t_u.d = *((double*)beta)}; + retval_1 = rpc_cudnntransformfilter_1( + (ptr)handle, + (ptr)transDesc, + rpc_alpha, + (ptr)srcDesc, + (ptr)srcData, + rpc_beta, + (ptr)destDesc, + (ptr)destData, + &result, clnt); + + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } + return result; +} + +cudnnStatus_t cudnnDestroyFilterDescriptor(cudnnFilterDescriptor_t filterDesc) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + retval_1 = rpc_cudnndestroyfilterdescriptor_1( + (ptr)filterDesc, + &result, clnt); + + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } + return result; +} + +cudnnStatus_t cudnnSoftmaxForward(cudnnHandle_t handle, cudnnSoftmaxAlgorithm_t algo, cudnnSoftmaxMode_t mode, const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta, const cudnnTensorDescriptor_t yDesc, void * y) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + //TODO: Check if we have a float instead of always sending doubles + cudnn_scaling_t rpc_alpha = {.dataType = CUDNN_DATA_DOUBLE, .cudnn_scaling_t_u.d = *((double*)alpha)}; + cudnn_scaling_t rpc_beta = {.dataType = CUDNN_DATA_DOUBLE, .cudnn_scaling_t_u.d = *((double*)beta)}; + retval_1 = rpc_cudnnsoftmaxforward_1( + (ptr)handle, + (int)algo, + (int)mode, + rpc_alpha, + (ptr)xDesc, + (ptr)x, + rpc_beta, + (ptr)yDesc, + (ptr)y, + &result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } + return result; +} + +cudnnStatus_t cudnnCreatePoolingDescriptor(cudnnPoolingDescriptor_t *poolingDesc) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + ptr_result result; + enum clnt_stat retval_1; + if (poolingDesc == NULL) { + LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__); + return CUDNN_STATUS_BAD_PARAM; + } + retval_1 = rpc_cudnncreatepoolingdescriptor_1(&result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result.err != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err); + } else { + *poolingDesc = (cudnnPoolingDescriptor_t)result.ptr_result_u.ptr; + } + return result.err; +} + +cudnnStatus_t cudnnSetPooling2dDescriptor(cudnnPoolingDescriptor_t poolingDesc, cudnnPoolingMode_t mode, cudnnNanPropagation_t maxpoolingNanOpt, int windowHeight, int windowWidth, int verticalPadding, int horizontalPadding, int verticalStride, int horizontalStride) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat 
retval_1; + retval_1 = rpc_cudnnsetpooling2ddescriptor_1( + (ptr)poolingDesc, + (int)mode, + (int)maxpoolingNanOpt, + windowHeight, + windowWidth, + verticalPadding, + horizontalPadding, + verticalStride, + horizontalStride, + &result, clnt); + + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } + return result; +} + +cudnnStatus_t cudnnGetPooling2dDescriptor(const cudnnPoolingDescriptor_t poolingDesc, cudnnPoolingMode_t *mode, cudnnNanPropagation_t *maxpoolingNanOpt, int* windowHeight, int* windowWidth, int* verticalPadding, int* horizontalPadding, int* verticalStride, int* horizontalStride) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int8_result result; + enum clnt_stat retval_1; + if (mode == NULL || maxpoolingNanOpt == NULL || windowHeight == NULL || windowWidth == NULL || verticalPadding == NULL || verticalStride == NULL || horizontalPadding == NULL || horizontalStride == NULL) { + LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__); + return CUDNN_STATUS_BAD_PARAM; + } + retval_1 = rpc_cudnngetpooling2ddescriptor_1( + (ptr)poolingDesc, + &result, clnt); + + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result.err != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } else { + *mode = (cudnnPoolingMode_t)result.int8_result_u.data[0]; + *maxpoolingNanOpt = (cudnnNanPropagation_t)result.int8_result_u.data[1]; + *windowHeight = result.int8_result_u.data[2]; + *windowWidth = result.int8_result_u.data[3]; + *verticalPadding = result.int8_result_u.data[4]; + *horizontalPadding = result.int8_result_u.data[5]; + *verticalStride = result.int8_result_u.data[6]; + *horizontalStride = result.int8_result_u.data[7]; + } + return result.err; +} + +cudnnStatus_t cudnnSetPoolingNdDescriptor(cudnnPoolingDescriptor_t poolingDesc, const cudnnPoolingMode_t mode, const cudnnNanPropagation_t maxpoolingNanOpt, int nbDims, const int* windowDimA, const int* paddingA, const int* strideA) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + mem_data rpc_windowDimA = { + .mem_data_len = nbDims * sizeof(int), + .mem_data_val = (char*)windowDimA + }; + mem_data rpc_paddingA = { + .mem_data_len = nbDims * sizeof(int), + .mem_data_val = (char*)paddingA + }; + mem_data rpc_strideA = { + .mem_data_len = nbDims * sizeof(int), + .mem_data_val = (char*)strideA + }; + retval_1 = rpc_cudnnsetpoolingnddescriptor_1( + (ptr)poolingDesc, + (int)mode, + (int)maxpoolingNanOpt, + (int)nbDims, + rpc_windowDimA, + rpc_paddingA, + rpc_strideA, + &result, clnt); + + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } + return result; +} + +cudnnStatus_t cudnnGetPoolingNdDescriptor(const cudnnPoolingDescriptor_t poolingDesc, int nbDimsRequested, cudnnPoolingMode_t * mode, cudnnNanPropagation_t * maxpoolingNanOpt, int* nbDims, int* windowDimA, int* paddingA, int* strideA) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + size_t expected_size = nbDimsRequested * sizeof(int) * 3 + sizeof(int) + sizeof(cudnnPoolingMode_t) + sizeof(cudnnNanPropagation_t); + mem_result result; + result.mem_result_u.data.mem_data_val = 
(char*)malloc(expected_size); + enum clnt_stat retval_1; + if (mode == NULL || maxpoolingNanOpt == NULL || nbDims == NULL || windowDimA == NULL || paddingA == NULL || strideA == NULL) { + LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__); + return CUDNN_STATUS_BAD_PARAM; + } + retval_1 = rpc_cudnngetpoolingnddescriptor_1( + (ptr)poolingDesc, + nbDimsRequested, + &result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result.err != CUDNN_STATUS_SUCCESS || result.mem_result_u.data.mem_data_len != expected_size) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err); + } else { + size_t offset = 0; + *mode = (cudnnPoolingMode_t)result.mem_result_u.data.mem_data_val[offset]; + offset += sizeof(cudnnPoolingMode_t); + *maxpoolingNanOpt = (cudnnNanPropagation_t)result.mem_result_u.data.mem_data_val[offset]; + offset += sizeof(cudnnNanPropagation_t); + *nbDims = (int)result.mem_result_u.data.mem_data_val[offset]; + offset += sizeof(int); + memcpy(windowDimA, result.mem_result_u.data.mem_data_val+offset, *nbDims * sizeof(int)); + offset += *nbDims * sizeof(int); + memcpy(paddingA, result.mem_result_u.data.mem_data_val+offset, *nbDims * sizeof(int)); + offset += *nbDims * sizeof(int); + memcpy(strideA, result.mem_result_u.data.mem_data_val+offset, *nbDims * sizeof(int)); + } + free(result.mem_result_u.data.mem_data_val); + return result.err; +} + +cudnnStatus_t cudnnGetPoolingNdForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc, const cudnnTensorDescriptor_t inputTensorDesc, int nbDims, int* outputTensorDimA) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + mem_result result; + result.mem_result_u.data.mem_data_val = (char*)outputTensorDimA; + enum clnt_stat retval_1; + if (outputTensorDimA == NULL) { + LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__); + return CUDNN_STATUS_BAD_PARAM; + } + retval_1 = rpc_cudnngetpoolingndforwardoutputdim_1( + (ptr)poolingDesc, + (ptr)inputTensorDesc, + nbDims, + &result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + size_t expected_size = nbDims * sizeof(int); + if (result.err != CUDNN_STATUS_SUCCESS || result.mem_result_u.data.mem_data_len != expected_size) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err); + } + return result.err; +} + +cudnnStatus_t cudnnGetPooling2dForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc, const cudnnTensorDescriptor_t inputTensorDesc, int* n, int* c, int* h, int* w) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int4_result result; + enum clnt_stat retval_1; + if (n == NULL || c == NULL || h == NULL || w == NULL) { + LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__); + return CUDNN_STATUS_BAD_PARAM; + } + retval_1 = rpc_cudnngetpooling2dforwardoutputdim_1( + (ptr)poolingDesc, + (ptr)inputTensorDesc, + &result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result.err != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err); + } else { + *n = result.int4_result_u.data[0]; + *c = result.int4_result_u.data[1]; + *h = result.int4_result_u.data[2]; + *w = result.int4_result_u.data[3]; + } + return result.err; +} + +cudnnStatus_t cudnnDestroyPoolingDescriptor(cudnnPoolingDescriptor_t poolingDesc) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum 
clnt_stat retval_1; + retval_1 = rpc_cudnndestroypoolingdescriptor_1( + (ptr)poolingDesc, + &result, clnt); + + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } + return result; +} + +cudnnStatus_t cudnnPoolingForward(cudnnHandle_t handle, const cudnnPoolingDescriptor_t poolingDesc, const void * alpha, const cudnnTensorDescriptor_t xDesc, const void * x, const void * beta, const cudnnTensorDescriptor_t yDesc, void * y) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + //TODO: Check if we have a float instead of always sending doubles + cudnn_scaling_t rpc_alpha = {.dataType = CUDNN_DATA_DOUBLE, .cudnn_scaling_t_u.d = *((double*)alpha)}; + cudnn_scaling_t rpc_beta = {.dataType = CUDNN_DATA_DOUBLE, .cudnn_scaling_t_u.d = *((double*)beta)}; + retval_1 = rpc_cudnnpoolingforward_1( + (ptr)handle, + (ptr)poolingDesc, + rpc_alpha, + (ptr)xDesc, + (ptr)x, + rpc_beta, + (ptr)yDesc, + (ptr)y, + &result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } + return result; +} + +cudnnStatus_t cudnnCreateActivationDescriptor(cudnnActivationDescriptor_t * activationDesc) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + ptr_result result; + enum clnt_stat retval_1; + if (activationDesc == NULL) { + LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__); + return CUDNN_STATUS_BAD_PARAM; + } + retval_1 = rpc_cudnncreateactivationdescriptor_1(&result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result.err != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err); + } else { + *activationDesc = (cudnnActivationDescriptor_t)result.ptr_result_u.ptr; + } + return result.err; +} + +cudnnStatus_t cudnnSetActivationDescriptor(cudnnActivationDescriptor_t activationDesc, cudnnActivationMode_t mode, cudnnNanPropagation_t reluNanOpt, double coef) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + retval_1 = rpc_cudnnsetactivationdescriptor_1( + (ptr)activationDesc, + (int)mode, + (int)reluNanOpt, + coef, + &result, clnt); + + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } + return result; +} + +cudnnStatus_t cudnnGetActivationDescriptor(const cudnnActivationDescriptor_t activationDesc, cudnnActivationMode_t *mode, cudnnNanPropagation_t *reluNanOpt, double *coef) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int2d1_result result; + enum clnt_stat retval_1; + if (mode == NULL || reluNanOpt == NULL || coef == NULL) { + LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__); + return CUDNN_STATUS_BAD_PARAM; + } + retval_1 = rpc_cudnngetactivationdescriptor_1( + (ptr)activationDesc, + &result, clnt); + + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result.err != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } else { + *mode = 
(cudnnActivationMode_t)result.int2d1_result_u.data.i[0]; + *reluNanOpt = (cudnnNanPropagation_t)result.int2d1_result_u.data.i[1]; + *coef = result.int2d1_result_u.data.d; + } + return result.err; +} + +cudnnStatus_t cudnnSetActivationDescriptorSwishBeta(cudnnActivationDescriptor_t activationDesc, double swish_beta) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + retval_1 = rpc_cudnnsetactivationdescriptorswishbeta_1( + (ptr)activationDesc, + swish_beta, + &result, clnt); + + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } + return result; +} + +cudnnStatus_t cudnnGetActivationDescriptorSwishBeta(cudnnActivationDescriptor_t activationDesc, double * swish_beta) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + d_result result; + enum clnt_stat retval_1; + if (swish_beta == NULL) { + LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__); + return CUDNN_STATUS_BAD_PARAM; + } + retval_1 = rpc_cudnngetactivationdescriptorswishbeta_1( + (ptr)activationDesc, + &result, clnt); + + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result.err != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } else { + *swish_beta = result.d_result_u.data; + } + return result.err; +} + +cudnnStatus_t cudnnDestroyActivationDescriptor(cudnnActivationDescriptor_t activationDesc) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + retval_1 = rpc_cudnndestroyactivationdescriptor_1( + (ptr)activationDesc, + &result, clnt); + + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } + return result; +} + +cudnnStatus_t cudnnActivationForward(cudnnHandle_t handle, cudnnActivationDescriptor_t activationDesc, const void * alpha, const cudnnTensorDescriptor_t xDesc, const void * x, const void * beta, const cudnnTensorDescriptor_t yDesc, void * y) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + //TODO: Check if we have a float instead of always sending doubles + cudnn_scaling_t rpc_alpha = {.dataType = CUDNN_DATA_DOUBLE, .cudnn_scaling_t_u.d = *((double*)alpha)}; + cudnn_scaling_t rpc_beta = {.dataType = CUDNN_DATA_DOUBLE, .cudnn_scaling_t_u.d = *((double*)beta)}; + retval_1 = rpc_cudnnactivationforward_1( + (ptr)handle, + (ptr)activationDesc, + rpc_alpha, + (ptr)xDesc, + (ptr)x, + rpc_beta, + (ptr)yDesc, + (ptr)y, + &result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } + return result; +} + +cudnnStatus_t cudnnCreateLRNDescriptor(cudnnLRNDescriptor_t * normDesc) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + ptr_result result; + enum clnt_stat retval_1; + if (normDesc == NULL) { + LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__); + return CUDNN_STATUS_BAD_PARAM; + } + retval_1 = rpc_cudnncreatelrndescriptor_1(&result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if 
(result.err != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err); + } else { + *normDesc = (cudnnLRNDescriptor_t)result.ptr_result_u.ptr; + } + return result.err; +} + +cudnnStatus_t cudnnSetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned lrnN, double lrnAlpha, double lrnBeta, double lrnK) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + retval_1 = rpc_cudnnsetlrndescriptor_1( + (ptr)normDesc, + (int)lrnN, + lrnAlpha, + lrnBeta, + lrnK, + &result, clnt); + + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } + return result; +} + +cudnnStatus_t cudnnGetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned * lrnN, double * lrnAlpha, double * lrnBeta, double * lrnK) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int1d3_result result; + enum clnt_stat retval_1; + if (lrnN == NULL || lrnAlpha == NULL || lrnBeta == NULL || lrnK == NULL) { + LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__); + return CUDNN_STATUS_BAD_PARAM; + } + retval_1 = rpc_cudnngetlrndescriptor_1( + (ptr)normDesc, + &result, clnt); + + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result.err != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } else { + *lrnN = result.int1d3_result_u.data.i; + *lrnAlpha = result.int1d3_result_u.data.d[0]; + *lrnBeta = result.int1d3_result_u.data.d[1]; + *lrnK = result.int1d3_result_u.data.d[2]; + } + return result.err; +} + +cudnnStatus_t cudnnDestroyLRNDescriptor(cudnnLRNDescriptor_t lrnDesc) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + retval_1 = rpc_cudnndestroylrndescriptor_1( + (ptr)lrnDesc, + &result, clnt); + + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } + return result; +} + +cudnnStatus_t cudnnLRNCrossChannelForward(cudnnHandle_t handle, cudnnLRNDescriptor_t normDesc, cudnnLRNMode_t lrnMode, const void * alpha, const cudnnTensorDescriptor_t xDesc, const void * x, const void * beta, const cudnnTensorDescriptor_t yDesc, void * y) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + //TODO: Check if we have a float instead of always sending doubles + cudnn_scaling_t rpc_alpha = {.dataType = CUDNN_DATA_DOUBLE, .cudnn_scaling_t_u.d = *((double*)alpha)}; + cudnn_scaling_t rpc_beta = {.dataType = CUDNN_DATA_DOUBLE, .cudnn_scaling_t_u.d = *((double*)beta)}; + retval_1 = rpc_cudnnlrncrosschannelforward_1( + (ptr)handle, + (ptr)normDesc, + (int)lrnMode, + rpc_alpha, + (ptr)xDesc, + (ptr)x, + rpc_beta, + (ptr)yDesc, + (ptr)y, + &result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } + return result; +} + +DEF_FN(cudnnStatus_t, cudnnDivisiveNormalizationForward, cudnnHandle_t, handle, cudnnLRNDescriptor_t, normDesc, cudnnDivNormMode_t, mode, const void *, alpha, const cudnnTensorDescriptor_t, xDesc, const void *, x, const void *, means, void *, temp, void 
*, temp2, const void *, beta, const cudnnTensorDescriptor_t, yDesc, void *, y) +DEF_FN(cudnnStatus_t, cudnnDeriveBNTensorDescriptor, cudnnTensorDescriptor_t, derivedBnDesc, const cudnnTensorDescriptor_t, xDesc, cudnnBatchNormMode_t, mode) +DEF_FN(cudnnStatus_t, cudnnBatchNormalizationForwardInference, cudnnHandle_t, handle, cudnnBatchNormMode_t, mode, const void *, alpha, const void *, beta, const cudnnTensorDescriptor_t, xDesc, const void *, x, const cudnnTensorDescriptor_t, yDesc, void *, y, const cudnnTensorDescriptor_t, bnScaleBiasMeanVarDesc, const void *, bnScale, const void *, bnBias, const void *, estimatedMean, const void *, estimatedVariance, double, epsilon) +DEF_FN(cudnnStatus_t, cudnnDeriveNormTensorDescriptor, cudnnTensorDescriptor_t, derivedNormScaleBiasDesc, cudnnTensorDescriptor_t, derivedNormMeanVarDesc, const cudnnTensorDescriptor_t, xDesc, cudnnNormMode_t, mode, int, groupCnt) +DEF_FN(cudnnStatus_t, cudnnNormalizationForwardInference, cudnnHandle_t, handle, cudnnNormMode_t, mode, cudnnNormOps_t, normOps, cudnnNormAlgo_t, algo, const void *, alpha, const void *, beta, const cudnnTensorDescriptor_t, xDesc, const void *, x, const cudnnTensorDescriptor_t, normScaleBiasDesc, const void *, normScale, const void *, normBias, const cudnnTensorDescriptor_t, normMeanVarDesc, const void *, estimatedMean, const void *, estimatedVariance, const cudnnTensorDescriptor_t, zDesc, const void *, z, cudnnActivationDescriptor_t, activationDesc, const cudnnTensorDescriptor_t, yDesc, void *, y, double, epsilon, int, groupCnt) +DEF_FN(cudnnStatus_t, cudnnCreateSpatialTransformerDescriptor, cudnnSpatialTransformerDescriptor_t *, stDesc) +DEF_FN(cudnnStatus_t, cudnnSetSpatialTransformerNdDescriptor, cudnnSpatialTransformerDescriptor_t, stDesc, cudnnSamplerType_t, samplerType, cudnnDataType_t, dataType, const int, nbDims, const int*, dimA) +DEF_FN(cudnnStatus_t, cudnnDestroySpatialTransformerDescriptor, cudnnSpatialTransformerDescriptor_t, stDesc) +DEF_FN(cudnnStatus_t, cudnnSpatialTfGridGeneratorForward, cudnnHandle_t, handle, const cudnnSpatialTransformerDescriptor_t, stDesc, const void *, theta, void *, grid) +DEF_FN(cudnnStatus_t, cudnnSpatialTfSamplerForward, cudnnHandle_t, handle, cudnnSpatialTransformerDescriptor_t, stDesc, const void *, alpha, const cudnnTensorDescriptor_t, xDesc, const void *, x, const void *, grid, const void *, beta, cudnnTensorDescriptor_t, yDesc, void *, y) +DEF_FN(cudnnStatus_t, cudnnCreateDropoutDescriptor, cudnnDropoutDescriptor_t *, dropoutDesc) +DEF_FN(cudnnStatus_t, cudnnDestroyDropoutDescriptor, cudnnDropoutDescriptor_t, dropoutDesc) +DEF_FN(cudnnStatus_t, cudnnDropoutGetStatesSize, cudnnHandle_t, handle, size_t *, sizeInBytes) +DEF_FN(cudnnStatus_t, cudnnDropoutGetReserveSpaceSize, cudnnTensorDescriptor_t, xdesc, size_t*, sizeInBytes) +DEF_FN(cudnnStatus_t, cudnnSetDropoutDescriptor, cudnnDropoutDescriptor_t, dropoutDesc, cudnnHandle_t, handle, float, dropout, void *, states, size_t, stateSizeInBytes, unsigned long long, seed) +DEF_FN(cudnnStatus_t, cudnnRestoreDropoutDescriptor, cudnnDropoutDescriptor_t, dropoutDesc, cudnnHandle_t, handle, float, dropout, void *, states, size_t, stateSizeInBytes, unsigned long long, seed) +DEF_FN(cudnnStatus_t, cudnnGetDropoutDescriptor, cudnnDropoutDescriptor_t, dropoutDesc, cudnnHandle_t, handle, float *, dropout, void **, states, unsigned long long *, seed) +DEF_FN(cudnnStatus_t, cudnnDropoutForward, cudnnHandle_t, handle, const cudnnDropoutDescriptor_t, dropoutDesc, const cudnnTensorDescriptor_t, xdesc, const void *, x, 
const cudnnTensorDescriptor_t, ydesc, void *, y, void *, reserveSpace, size_t, reserveSpaceSizeInBytes) +DEF_FN(cudnnStatus_t, cudnnCreateAlgorithmDescriptor, cudnnAlgorithmDescriptor_t *, algoDesc) +DEF_FN(cudnnStatus_t, cudnnSetAlgorithmDescriptor, cudnnAlgorithmDescriptor_t, algoDesc, cudnnAlgorithm_t, algorithm) +DEF_FN(cudnnStatus_t, cudnnGetAlgorithmDescriptor, const cudnnAlgorithmDescriptor_t, algoDesc, cudnnAlgorithm_t *, algorithm) +DEF_FN(cudnnStatus_t, cudnnCopyAlgorithmDescriptor, const cudnnAlgorithmDescriptor_t, src, cudnnAlgorithmDescriptor_t, dest) +DEF_FN(cudnnStatus_t, cudnnDestroyAlgorithmDescriptor, cudnnAlgorithmDescriptor_t, algoDesc) +DEF_FN(cudnnStatus_t, cudnnCreateAlgorithmPerformance, cudnnAlgorithmPerformance_t *, algoPerf, int, numberToCreate) +DEF_FN(cudnnStatus_t, cudnnSetAlgorithmPerformance, cudnnAlgorithmPerformance_t, algoPerf, cudnnAlgorithmDescriptor_t, algoDesc, cudnnStatus_t, status, float, time, size_t, memory) +DEF_FN(cudnnStatus_t, cudnnGetAlgorithmPerformance, const cudnnAlgorithmPerformance_t, algoPerf, cudnnAlgorithmDescriptor_t *, algoDesc, cudnnStatus_t *, status, float *, time, size_t*, memory) +DEF_FN(cudnnStatus_t, cudnnDestroyAlgorithmPerformance, cudnnAlgorithmPerformance_t *, algoPerf, int, numberToDestroy) +DEF_FN(cudnnStatus_t, cudnnGetAlgorithmSpaceSize, cudnnHandle_t, handle, cudnnAlgorithmDescriptor_t, algoDesc, size_t *, algoSpaceSizeInBytes) +DEF_FN(cudnnStatus_t, cudnnSaveAlgorithm, cudnnHandle_t, handle, cudnnAlgorithmDescriptor_t, algoDesc, void *, algoSpace, size_t, algoSpaceSizeInBytes) +DEF_FN(cudnnStatus_t, cudnnRestoreAlgorithm, cudnnHandle_t, handle, void *, algoSpace, size_t, algoSpaceSizeInBytes, cudnnAlgorithmDescriptor_t, algoDesc) +DEF_FN(cudnnStatus_t, cudnnSetCallback, unsigned, mask, void *, udata, cudnnCallback_t, fptr) +DEF_FN(cudnnStatus_t, cudnnGetCallback, unsigned *, mask, void **, udata, cudnnCallback_t *, fptr) +DEF_FN(cudnnStatus_t, cudnnOpsInferVersionCheck) + + +/***************** cudnn_cnn_infer *******************/ + +cudnnStatus_t cudnnCreateConvolutionDescriptor(cudnnConvolutionDescriptor_t* convDesc) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + ptr_result result; + enum clnt_stat retval_1; + if (convDesc == NULL) { + LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__); + return CUDNN_STATUS_BAD_PARAM; + } + retval_1 = rpc_cudnncreateconvolutiondescriptor_1(&result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result.err != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err); + } else { + *convDesc = (cudnnConvolutionDescriptor_t)result.ptr_result_u.ptr; + } + return result.err; +} + +cudnnStatus_t cudnnDestroyConvolutionDescriptor(cudnnConvolutionDescriptor_t convDesc) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + retval_1 = rpc_cudnndestroyconvolutiondescriptor_1( + (ptr)convDesc, + &result, clnt); + + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } + return result; +} +DEF_FN(cudnnStatus_t, cudnnSetConvolutionMathType, cudnnConvolutionDescriptor_t, convDesc, cudnnMathType_t, mathType) +DEF_FN(cudnnStatus_t, cudnnGetConvolutionMathType, cudnnConvolutionDescriptor_t, convDesc, cudnnMathType_t*, mathType) +DEF_FN(cudnnStatus_t, 
cudnnSetConvolutionGroupCount, cudnnConvolutionDescriptor_t, convDesc, int, groupCount) +DEF_FN(cudnnStatus_t, cudnnGetConvolutionGroupCount, cudnnConvolutionDescriptor_t, convDesc, int*, groupCount) +DEF_FN(cudnnStatus_t, cudnnSetConvolutionReorderType, cudnnConvolutionDescriptor_t, convDesc, cudnnReorderType_t, reorderType) +DEF_FN(cudnnStatus_t, cudnnGetConvolutionReorderType, cudnnConvolutionDescriptor_t, convDesc, cudnnReorderType_t*, reorderType) +DEF_FN(cudnnStatus_t, cudnnSetConvolution2dDescriptor, cudnnConvolutionDescriptor_t, convDesc, int, pad_h, int, pad_w, int, u, int, v, int, dilation_h, int, dilation_w, cudnnConvolutionMode_t, mode, cudnnDataType_t, computeType) +DEF_FN(cudnnStatus_t, cudnnGetConvolution2dDescriptor, const cudnnConvolutionDescriptor_t, convDesc, int*, pad_h, int*, pad_w, int*, u, int*, v, int*, dilation_h, int*, dilation_w, cudnnConvolutionMode_t*, mode, cudnnDataType_t*, computeType) + +cudnnStatus_t cudnnSetConvolutionNdDescriptor(cudnnConvolutionDescriptor_t convDesc, int arrayLength, const int* padA, const int* filterStrideA, const int* dilationA, cudnnConvolutionMode_t mode, cudnnDataType_t computeType) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + mem_data rpc_windowDimA = { + .mem_data_len = arrayLength * sizeof(int), + .mem_data_val = (char*)padA + }; + mem_data rpc_paddingA = { + .mem_data_len = arrayLength * sizeof(int), + .mem_data_val = (char*)filterStrideA + }; + mem_data rpc_strideA = { + .mem_data_len = arrayLength * sizeof(int), + .mem_data_val = (char*)dilationA + }; + retval_1 = rpc_cudnnsetconvolutionnddescriptor_1( + (ptr)convDesc, + arrayLength, + rpc_windowDimA, + rpc_paddingA, + rpc_strideA, + mode, + computeType, + &result, clnt); + + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } + return result; +} + +DEF_FN(cudnnStatus_t, cudnnGetConvolutionNdDescriptor, const cudnnConvolutionDescriptor_t, convDesc, int, arrayLengthRequested, int*, arrayLength, int*, padA, int*, strideA, int*, dilationA, cudnnConvolutionMode_t*, mode, cudnnDataType_t*, computeType) +DEF_FN(cudnnStatus_t, cudnnGetConvolution2dForwardOutputDim, const cudnnConvolutionDescriptor_t, convDesc, const cudnnTensorDescriptor_t, inputTensorDesc, const cudnnFilterDescriptor_t, filterDesc, int*, n, int*, c, int*, h, int*, w) + +cudnnStatus_t cudnnGetConvolutionNdForwardOutputDim(const cudnnConvolutionDescriptor_t convDesc, const cudnnTensorDescriptor_t inputTensorDesc, const cudnnFilterDescriptor_t filterDesc, int nbDims, int* tensorOutputDimA) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + mem_result result; + result.mem_result_u.data.mem_data_val = (char*)tensorOutputDimA; + enum clnt_stat retval_1; + if (tensorOutputDimA == NULL) { + LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__); + return CUDNN_STATUS_BAD_PARAM; + } + retval_1 = rpc_cudnngetconvolutionndforwardoutputdim_1( + (ptr)convDesc, + (ptr)inputTensorDesc, + (ptr)filterDesc, + nbDims, + &result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + size_t expected_size = nbDims * sizeof(int); + if (result.err != CUDNN_STATUS_SUCCESS || result.mem_result_u.data.mem_data_len != expected_size) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err); + } + return result.err; +} + 
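/* A minimal sketch of how the recurring "TODO: Check if we have a float
 * instead of always sending doubles" comments in this file could be resolved:
 * derive the scaling-factor type from the tensor descriptor instead of always
 * reading *alpha / *beta as double. This assumes the cudnn_scaling_t RPC union
 * also exposes a float member (written here as cudnn_scaling_t_u.f); the helper
 * name make_cudnn_scaling() is illustrative only and not part of the generated
 * cpu_rpc_prot interface. */
static cudnn_scaling_t make_cudnn_scaling(const void *scale, const cudnnTensorDescriptor_t desc)
{
    cudnn_scaling_t s = {.dataType = CUDNN_DATA_DOUBLE, .cudnn_scaling_t_u.d = 0.0};
    cudnnDataType_t dataType = CUDNN_DATA_DOUBLE;
    int n, c, h, w, nStride, cStride, hStride, wStride;
    /* cuDNN expects double scaling factors only for double tensors; all other
     * data types (half, float, int8, ...) take float alpha/beta. */
    if (cudnnGetTensor4dDescriptor(desc, &dataType, &n, &c, &h, &w,
                                   &nStride, &cStride, &hStride, &wStride) == CUDNN_STATUS_SUCCESS
        && dataType != CUDNN_DATA_DOUBLE) {
        s.dataType = CUDNN_DATA_FLOAT;
        s.cudnn_scaling_t_u.f = *(const float *)scale;   /* assumed union member */
    } else {
        s.cudnn_scaling_t_u.d = *(const double *)scale;
    }
    return s;
}
/* Usage sketch, e.g. inside cudnnSoftmaxForward:
 *     cudnn_scaling_t rpc_alpha = make_cudnn_scaling(alpha, xDesc);
 *     cudnn_scaling_t rpc_beta  = make_cudnn_scaling(beta, yDesc); */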
+DEF_FN(cudnnStatus_t, cudnnGetConvolutionForwardAlgorithmMaxCount, cudnnHandle_t, handle, int*, count) + +cudnnStatus_t cudnnGetConvolutionForwardAlgorithm_v7(cudnnHandle_t handle, const cudnnTensorDescriptor_t srcDesc, const cudnnFilterDescriptor_t filterDesc, const cudnnConvolutionDescriptor_t convDesc, const cudnnTensorDescriptor_t destDesc, const int requestedAlgoCount, int* returnedAlgoCount, cudnnConvolutionFwdAlgoPerf_t* perfResults) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + mem_result result; + result.mem_result_u.data.mem_data_val = (char*)malloc(requestedAlgoCount * sizeof(cudnnConvolutionFwdAlgoPerf_t) + sizeof(int)); + enum clnt_stat retval_1; + if (returnedAlgoCount == NULL || perfResults == NULL) { + LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__); + return CUDNN_STATUS_BAD_PARAM; + } + retval_1 = rpc_cudnngetconvolutionforwardalgorithm_v7_1( + (ptr)handle, + (ptr)srcDesc, + (ptr)filterDesc, + (ptr)convDesc, + (ptr)destDesc, + requestedAlgoCount, + &result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + size_t expected_size = requestedAlgoCount * sizeof(cudnnConvolutionFwdAlgoPerf_t) + sizeof(int); + if (result.err != CUDNN_STATUS_SUCCESS || result.mem_result_u.data.mem_data_len != expected_size) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err); + } else { + *returnedAlgoCount = *(int*)result.mem_result_u.data.mem_data_val; + if (*returnedAlgoCount > requestedAlgoCount) { + LOGE(LOG_ERROR, "%s failed (returnedAlgoCount is %d, requestedAlgoCount is %d)", __FUNCTION__, *returnedAlgoCount, requestedAlgoCount); + return CUDNN_STATUS_INTERNAL_ERROR; + } + memcpy(perfResults, result.mem_result_u.data.mem_data_val + sizeof(int), *returnedAlgoCount * sizeof(cudnnConvolutionFwdAlgoPerf_t)); + } + free(result.mem_result_u.data.mem_data_val); + return result.err; +} + +cudnnStatus_t cudnnFindConvolutionForwardAlgorithm( cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, const cudnnFilterDescriptor_t wDesc, const cudnnConvolutionDescriptor_t convDesc, const cudnnTensorDescriptor_t yDesc, const int requestedAlgoCount, int* returnedAlgoCount, cudnnConvolutionFwdAlgoPerf_t* perfResults) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + mem_result result; + result.mem_result_u.data.mem_data_val = (char*)malloc(requestedAlgoCount * sizeof(cudnnConvolutionFwdAlgoPerf_t) + sizeof(int)); + enum clnt_stat retval_1; + if (returnedAlgoCount == NULL || perfResults == NULL) { + LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__); + return CUDNN_STATUS_BAD_PARAM; + } + retval_1 = rpc_cudnnfindconvolutionforwardalgorithm_1( + (ptr)handle, + (ptr)xDesc, + (ptr)wDesc, + (ptr)convDesc, + (ptr)yDesc, + requestedAlgoCount, + &result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + size_t expected_size = requestedAlgoCount * sizeof(cudnnConvolutionFwdAlgoPerf_t) + sizeof(int); + if (result.err != CUDNN_STATUS_SUCCESS || result.mem_result_u.data.mem_data_len != expected_size) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err); + } else { + *returnedAlgoCount = *(int*)result.mem_result_u.data.mem_data_val; + if (*returnedAlgoCount > requestedAlgoCount) { + LOGE(LOG_ERROR, "%s failed (returnedAlgoCount is %d, requestedAlgoCount is %d)", __FUNCTION__, *returnedAlgoCount, requestedAlgoCount); + return CUDNN_STATUS_INTERNAL_ERROR; + } + memcpy(perfResults, 
result.mem_result_u.data.mem_data_val + sizeof(int), *returnedAlgoCount * sizeof(cudnnConvolutionFwdAlgoPerf_t)); + } + free(result.mem_result_u.data.mem_data_val); + return result.err; +} + +DEF_FN(cudnnStatus_t, cudnnFindConvolutionForwardAlgorithmEx, cudnnHandle_t, handle, const cudnnTensorDescriptor_t, xDesc, const void*, x, const cudnnFilterDescriptor_t, wDesc, const void*, w, const cudnnConvolutionDescriptor_t, convDesc, const cudnnTensorDescriptor_t, yDesc, void*, y, const int, requestedAlgoCount, int*, returnedAlgoCount, cudnnConvolutionFwdAlgoPerf_t*, perfResults, void*, workSpace, size_t, workSpaceSizeInBytes) +DEF_FN(cudnnStatus_t, cudnnIm2Col, cudnnHandle_t, handle, const cudnnTensorDescriptor_t, xDesc, const void*, x, const cudnnFilterDescriptor_t, wDesc, const cudnnConvolutionDescriptor_t, convDesc, void*, colBuffer) +DEF_FN(cudnnStatus_t, cudnnReorderFilterAndBias, cudnnHandle_t, handle, const cudnnFilterDescriptor_t, filterDesc, cudnnReorderType_t, reorderType, const void*, filterData, void*, reorderedFilterData, int, reorderBias, const void*, biasData, void*, reorderedBiasData) + +cudnnStatus_t cudnnGetConvolutionForwardWorkspaceSize( cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, const cudnnFilterDescriptor_t wDesc, const cudnnConvolutionDescriptor_t convDesc, const cudnnTensorDescriptor_t yDesc, cudnnConvolutionFwdAlgo_t algo, size_t* sizeInBytes) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + sz_result result; + enum clnt_stat retval_1; + if (sizeInBytes == NULL) { + LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__); + return CUDNN_STATUS_BAD_PARAM; + } + retval_1 = rpc_cudnngetconvolutionforwardworkspacesize_1( + (ptr)handle, + (ptr)xDesc, + (ptr)wDesc, + (ptr)convDesc, + (ptr)yDesc, + algo, + &result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result.err != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err); + } else { + *sizeInBytes = result.sz_result_u.data; + } + return result.err; +} + +cudnnStatus_t cudnnConvolutionForward(cudnnHandle_t handle, const void* alpha, const cudnnTensorDescriptor_t xDesc, const void* x, const cudnnFilterDescriptor_t wDesc, const void* w, const cudnnConvolutionDescriptor_t convDesc, cudnnConvolutionFwdAlgo_t algo, void* workSpace, size_t workSpaceSizeInBytes, const void* beta, const cudnnTensorDescriptor_t yDesc, void* y) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + //TODO: Check if we have a float instead of always sending doubles + cudnn_scaling_t rpc_alpha = {.dataType = CUDNN_DATA_DOUBLE, .cudnn_scaling_t_u.d = *((double*)alpha)}; + cudnn_scaling_t rpc_beta = {.dataType = CUDNN_DATA_DOUBLE, .cudnn_scaling_t_u.d = *((double*)beta)}; + retval_1 = rpc_cudnnconvolutionforward_1( + (ptr)handle, + rpc_alpha, + (ptr)xDesc, + (ptr)x, + (ptr)wDesc, + (ptr)w, + (ptr)convDesc, + algo, + (ptr)workSpace, + workSpaceSizeInBytes, + rpc_beta, + (ptr)yDesc, + (ptr)y, + &result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } + return result; +} + +DEF_FN(cudnnStatus_t, cudnnConvolutionBiasActivationForward, cudnnHandle_t, handle, const void*, alpha1, const cudnnTensorDescriptor_t, xDesc, const void*, x, const cudnnFilterDescriptor_t, wDesc, const void*, w, const 
cudnnConvolutionDescriptor_t, convDesc, cudnnConvolutionFwdAlgo_t, algo, void*, workSpace, size_t, workSpaceSizeInBytes, const void*, alpha2, const cudnnTensorDescriptor_t, zDesc, const void*, z, const cudnnTensorDescriptor_t, biasDesc, const void*, bias, const cudnnActivationDescriptor_t, activationDesc, const cudnnTensorDescriptor_t, yDesc, void*, y) +DEF_FN(cudnnStatus_t, cudnnGetConvolutionBackwardDataAlgorithmMaxCount, cudnnHandle_t, handle, int*, count) +DEF_FN(cudnnStatus_t, cudnnFindConvolutionBackwardDataAlgorithm, cudnnHandle_t, handle, const cudnnFilterDescriptor_t, wDesc, const cudnnTensorDescriptor_t, dyDesc, const cudnnConvolutionDescriptor_t, convDesc, const cudnnTensorDescriptor_t, dxDesc, const int, requestedAlgoCount, int*, returnedAlgoCount, cudnnConvolutionBwdDataAlgoPerf_t*, perfResults) +DEF_FN(cudnnStatus_t, cudnnFindConvolutionBackwardDataAlgorithmEx, cudnnHandle_t, handle, const cudnnFilterDescriptor_t, wDesc, const void*, w, const cudnnTensorDescriptor_t, dyDesc, const void*, dy, const cudnnConvolutionDescriptor_t, convDesc, const cudnnTensorDescriptor_t, dxDesc, void*, dx, const int, requestedAlgoCount, int*, returnedAlgoCount, cudnnConvolutionBwdDataAlgoPerf_t*, perfResults, void*, workSpace, size_t, workSpaceSizeInBytes) +DEF_FN(cudnnStatus_t, cudnnGetConvolutionBackwardDataAlgorithm_v7, cudnnHandle_t, handle, const cudnnFilterDescriptor_t, filterDesc, const cudnnTensorDescriptor_t, diffDesc, const cudnnConvolutionDescriptor_t, convDesc, const cudnnTensorDescriptor_t, gradDesc, const int, requestedAlgoCount, int*, returnedAlgoCount, cudnnConvolutionBwdDataAlgoPerf_t*, perfResults) +DEF_FN(cudnnStatus_t, cudnnGetConvolutionBackwardDataWorkspaceSize, cudnnHandle_t, handle, const cudnnFilterDescriptor_t, wDesc, const cudnnTensorDescriptor_t, dyDesc, const cudnnConvolutionDescriptor_t, convDesc, const cudnnTensorDescriptor_t, dxDesc, cudnnConvolutionBwdDataAlgo_t, algo, size_t*, sizeInBytes) +DEF_FN(cudnnStatus_t, cudnnConvolutionBackwardData, cudnnHandle_t, handle, const void*, alpha, const cudnnFilterDescriptor_t, wDesc, const void*, w, const cudnnTensorDescriptor_t, dyDesc, const void*, dy, const cudnnConvolutionDescriptor_t, convDesc, cudnnConvolutionBwdDataAlgo_t, algo, void*, workSpace, size_t, workSpaceSizeInBytes, const void*, beta, const cudnnTensorDescriptor_t, dxDesc, void*, dx) +DEF_FN(cudnnStatus_t, cudnnGetFoldedConvBackwardDataDescriptors, const cudnnHandle_t, handle, const cudnnFilterDescriptor_t, filterDesc, const cudnnTensorDescriptor_t, diffDesc, const cudnnConvolutionDescriptor_t, convDesc, const cudnnTensorDescriptor_t, gradDesc, const cudnnTensorFormat_t, transformFormat, cudnnFilterDescriptor_t, foldedFilterDesc, cudnnTensorDescriptor_t, paddedDiffDesc, cudnnConvolutionDescriptor_t, foldedConvDesc, cudnnTensorDescriptor_t, foldedGradDesc, cudnnTensorTransformDescriptor_t, filterFoldTransDesc, cudnnTensorTransformDescriptor_t, diffPadTransDesc, cudnnTensorTransformDescriptor_t, gradFoldTransDesc, cudnnTensorTransformDescriptor_t, gradUnfoldTransDesc) +DEF_FN(cudnnStatus_t, cudnnCnnInferVersionCheck) + +/********************** CUDNN BACKEND API ********************************/ +cudnnStatus_t cudnnBackendCreateDescriptor(cudnnBackendDescriptorType_t descriptorType, cudnnBackendDescriptor_t *descriptor) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + ptr_result result; + enum clnt_stat retval_1; + LOGE(LOG_DEBUG, "%s(%d)", __FUNCTION__, descriptorType); + if (descriptor == NULL) { + LOGE(LOG_ERROR, "%s failed (descriptor is 
NULL)", __FUNCTION__); + return CUDNN_STATUS_BAD_PARAM; + } + retval_1 = rpc_cudnnbackendcreatedescriptor_1( + (int)descriptorType, + &result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result.err != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err); + } else { + *descriptor = (void*)result.ptr_result_u.ptr; + LOGE(LOG_DEBUG, "-> %p", *descriptor); + } + return result.err; +} + +cudnnStatus_t cudnnBackendDestroyDescriptor(cudnnBackendDescriptor_t descriptor) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + LOGE(LOG_DEBUG, "%s(%p)", __FUNCTION__, descriptor); + retval_1 = rpc_cudnnbackenddestroydescriptor_1((ptr)descriptor, &result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } + return result; +} + +cudnnStatus_t cudnnBackendInitialize(cudnnBackendDescriptor_t descriptor) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + LOGE(LOG_DEBUG, "%s(%p)", __FUNCTION__, descriptor); + retval_1 = rpc_cudnnbackendinitialize_1((ptr)descriptor, &result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } + return result; +} + +cudnnStatus_t cudnnBackendFinalize(cudnnBackendDescriptor_t descriptor) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + LOGE(LOG_DEBUG, "%s(%p)", __FUNCTION__, descriptor); + retval_1 = rpc_cudnnbackendfinalize_1((ptr)descriptor, &result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } + return result; +} + +static const size_t backendAttributeSizes[] = { + [CUDNN_TYPE_HANDLE] = sizeof(cudnnHandle_t), + [CUDNN_TYPE_DATA_TYPE] = sizeof(cudnnDataType_t), + [CUDNN_TYPE_BOOLEAN] = sizeof(bool), + [CUDNN_TYPE_INT64] = sizeof(int64_t), + [CUDNN_TYPE_FLOAT] = sizeof(float), + [CUDNN_TYPE_DOUBLE] = sizeof(double), + [CUDNN_TYPE_VOID_PTR] = sizeof(void *), + [CUDNN_TYPE_CONVOLUTION_MODE] = sizeof(cudnnConvolutionMode_t), + [CUDNN_TYPE_HEUR_MODE] = sizeof(cudnnBackendHeurMode_t), + [CUDNN_TYPE_KNOB_TYPE] = sizeof(cudnnBackendKnobType_t), + [CUDNN_TYPE_NAN_PROPOGATION] = sizeof(cudnnNanPropagation_t), + [CUDNN_TYPE_NUMERICAL_NOTE] = sizeof(cudnnBackendNumericalNote_t), + [CUDNN_TYPE_LAYOUT_TYPE] = sizeof(cudnnBackendLayoutType_t), + [CUDNN_TYPE_ATTRIB_NAME] = sizeof(cudnnBackendAttributeName_t), + [CUDNN_TYPE_POINTWISE_MODE] = sizeof(cudnnPointwiseMode_t), + [CUDNN_TYPE_BACKEND_DESCRIPTOR] = sizeof(cudnnBackendDescriptor_t), + [CUDNN_TYPE_GENSTATS_MODE] = sizeof(cudnnGenStatsMode_t), + [CUDNN_TYPE_BN_FINALIZE_STATS_MODE] = sizeof(cudnnBnFinalizeStatsMode_t), + [CUDNN_TYPE_REDUCTION_OPERATOR_TYPE] = sizeof(cudnnReduceTensorOp_t), + [CUDNN_TYPE_BEHAVIOR_NOTE] = sizeof(cudnnBackendBehaviorNote_t), + [CUDNN_TYPE_TENSOR_REORDERING_MODE] = sizeof(cudnnBackendTensorReordering_t), + [CUDNN_TYPE_RESAMPLE_MODE] = sizeof(cudnnResampleMode_t), + [CUDNN_TYPE_PADDING_MODE] = sizeof(cudnnPaddingMode_t), + 
[CUDNN_TYPE_INT32] = sizeof(int32_t), + [CUDNN_TYPE_CHAR] = sizeof(char), + [CUDNN_TYPE_SIGNAL_MODE] = sizeof(cudnnSignalMode_t), + [CUDNN_TYPE_FRACTION] = sizeof(cudnnFraction_t), + [CUDNN_TYPE_NORM_MODE] = sizeof(cudnnBackendNormMode_t), + [CUDNN_TYPE_NORM_FWD_PHASE] = sizeof(cudnnBackendNormFwdPhase_t), + [CUDNN_TYPE_RNG_DISTRIBUTION] = sizeof(cudnnRngDistribution_t), +}; +cudnnStatus_t cudnnBackendSetAttribute(cudnnBackendDescriptor_t descriptor, + cudnnBackendAttributeName_t attributeName, + cudnnBackendAttributeType_t attributeType, + int64_t elementCount, + const void *arrayOfElements) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + LOGE(LOG_DEBUG, "%s(%p, %d, %d, %ld, %p)", __FUNCTION__, descriptor, attributeName, attributeType, elementCount, arrayOfElements); + if (attributeType > CUDNN_TYPE_RNG_DISTRIBUTION) { + LOGE(LOG_ERROR, "%s failed (attributeType is too large %d)", __FUNCTION__, attributeType); + return CUDNN_STATUS_BAD_PARAM; + } + mem_data data = { + .mem_data_len = elementCount * backendAttributeSizes[attributeType], + .mem_data_val = (char *)arrayOfElements + }; + enum clnt_stat retval_1; + retval_1 = rpc_cudnnbackendsetattribute_1( + (ptr)descriptor, + (int)attributeName, + (int)attributeType, + elementCount, + data, + &result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } + return result; +} + +cudnnStatus_t cudnnBackendGetAttribute(cudnnBackendDescriptor_t const descriptor, + cudnnBackendAttributeName_t attributeName, + cudnnBackendAttributeType_t attributeType, + int64_t requestedElementCount, + int64_t *elementCount, + void *arrayOfElements) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + mem_result result; + enum clnt_stat retval_1; + LOGE(LOG_DEBUG, "%s(%p, %d, %d, %ld, %p, %p)", __FUNCTION__, descriptor, attributeName, attributeType, requestedElementCount, elementCount, arrayOfElements); + size_t expected_size = requestedElementCount * backendAttributeSizes[attributeType] + sizeof(int64_t); + result.mem_result_u.data.mem_data_val = malloc(expected_size); + if (result.mem_result_u.data.mem_data_val == NULL) { + LOGE(LOG_ERROR, "%s failed (malloc failed)", __FUNCTION__); + return CUDNN_STATUS_ALLOC_FAILED; + } + retval_1 = rpc_cudnnbackendgetattribute_1( + (ptr)descriptor, + (int)attributeName, + (int)attributeType, + requestedElementCount, + &result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result.err != CUDNN_STATUS_SUCCESS || result.mem_result_u.data.mem_data_len != expected_size) { + LOGE(LOG_ERROR, "%s failed (result is %d, size is %zd, expected %zd)", __FUNCTION__, result.err, result.mem_result_u.data.mem_data_len, expected_size); + if (elementCount != NULL) { + *elementCount = 0; + } + } else { + if (elementCount != NULL) { + *elementCount = *(int64_t*)result.mem_result_u.data.mem_data_val; + LOGE(LOG_DEBUG, "elementCount = %ld", *elementCount); + } + if (arrayOfElements != NULL) { + memcpy(arrayOfElements, result.mem_result_u.data.mem_data_val + sizeof(int64_t), *elementCount * backendAttributeSizes[attributeType]); + } + } + return result.err; +} + +cudnnStatus_t cudnnBackendExecute(cudnnHandle_t handle, cudnnBackendDescriptor_t executionPlan, cudnnBackendDescriptor_t variantPack) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum 
clnt_stat retval_1; + LOGE(LOG_DEBUG, "%s(%p, %p, %p)", __FUNCTION__, handle, executionPlan, variantPack); + retval_1 = rpc_cudnnbackendexecute_1( + (ptr)handle, + (ptr)executionPlan, + (ptr)variantPack, + &result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } + return result; +} \ No newline at end of file diff --git a/cpu/cpu-client-driver.c b/cpu/cpu-client-driver.c index 06f908be..4d131737 100644 --- a/cpu/cpu-client-driver.c +++ b/cpu/cpu-client-driver.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -15,17 +16,19 @@ #include "cpu_rpc_prot.h" #include "cpu-common.h" #include "cpu-utils.h" +#include "cpu-elf2.h" //DEF_FN(CUresult, cuProfilerInitialize, const char*, configFile, const char*, outputFile, CUoutput_mode, outputMode) //DEF_FN(CUresult, cuProfilerStart) //DEF_FN(CUresult, cuProfilerStop) DEF_FN(CUresult, cuVDPAUGetDevice, CUdevice*, pDevice, VdpDevice, vdpDevice, VdpGetProcAddress*, vdpGetProcAddress) +#undef cuVDPAUCtxCreate DEF_FN(CUresult, cuVDPAUCtxCreate, CUcontext*, pCtx, unsigned int, flags, CUdevice, device, VdpDevice, vdpDevice, VdpGetProcAddress*, vdpGetProcAddress) DEF_FN(CUresult, cuGraphicsVDPAURegisterVideoSurface, CUgraphicsResource*, pCudaResource, VdpVideoSurface, vdpSurface, unsigned int, flags) DEF_FN(CUresult, cuGraphicsVDPAURegisterOutputSurface, CUgraphicsResource*, pCudaResource, VdpOutputSurface, vdpSurface, unsigned int, flags) -//DEF_FN(CUresult, cuDeviceTotalMem, size_t*, bytes, CUdevice, dev) +#undef cuDeviceTotalMem CUresult cuDeviceTotalMem(size_t* bytes, CUdevice dev) { enum clnt_stat retval; @@ -41,7 +44,7 @@ CUresult cuDeviceTotalMem(size_t* bytes, CUdevice dev) return result.err; } -//DEF_FN(CUresult, cuCtxCreate, CUcontext*, pctx, unsigned int, flags, CUdevice, dev) +#undef cuCtxCreate CUresult cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev) { DEF_FN_PTR(CUresult, CUcontext*, unsigned int, CUdevice); @@ -51,10 +54,12 @@ CUresult cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev) return ret; } DEF_FN(CUresult, cuCtxSynchronize) +#undef cuModuleGetGlobal DEF_FN(CUresult, cuModuleGetGlobal, CUdeviceptr*, dptr, size_t*, bytes, CUmodule, hmod, const char*, name) +#undef cuMemGetInfo DEF_FN(CUresult, cuMemGetInfo, size_t*, free, size_t*, total) -//DEF_FN(CUresult, cuMemAlloc, CUdeviceptr*, dptr, size_t, bytesize) +#undef cuMemAlloc CUresult cuMemAlloc(CUdeviceptr* dptr, size_t bytesize) { enum clnt_stat retval; @@ -71,30 +76,40 @@ CUresult cuMemAlloc(CUdeviceptr* dptr, size_t bytesize) return result.err; } +#undef cuMemAllocPitch DEF_FN(CUresult, cuMemAllocPitch, CUdeviceptr*, dptr, size_t*, pPitch, size_t, WidthInBytes, size_t, Height, unsigned int, ElementSizeBytes) +#undef cuMemFree DEF_FN(CUresult, cuMemFree, CUdeviceptr, dptr) +#undef cuMemGetAddressRange DEF_FN(CUresult, cuMemGetAddressRange, CUdeviceptr*, pbase, size_t*, psize, CUdeviceptr, dptr) +#undef cuMemHostGetDevicePointer DEF_FN(CUresult, cuMemHostGetDevicePointer, CUdeviceptr*, pdptr, void*, p, unsigned int, Flags) +#undef cuMemHostRegister DEF_FN(CUresult, cuMemHostRegister, void*, p, size_t, bytesize, unsigned int, Flags) +#undef cuMemsetD8 DEF_FN(CUresult, cuMemsetD8, CUdeviceptr, dstDevice, unsigned char, uc, size_t, N); DEF_FN(CUresult, cuMemsetD8_v2_ptds, CUdeviceptr, dstDevice, unsigned char, uc, size_t, N); +#undef cuMemsetD2D8 DEF_FN(CUresult, 
cuMemsetD2D8, CUdeviceptr, dstDevice, size_t, dstPitch, unsigned char, uc, size_t, Width, size_t, Height) DEF_FN(CUresult, cuMemsetD2D8_v2_ptds, CUdeviceptr, dstDevice, size_t, dstPitch, unsigned char, uc, size_t, Width, size_t, Height) +#undef cuEventDestroy DEF_FN(CUresult, cuEventDestroy, CUevent, hEvent) +#undef cuStreamDestroy DEF_FN(CUresult, cuStreamDestroy, CUstream, hStream) +#undef cuGLCtxCreate DEF_FN(CUresult, cuGLCtxCreate, CUcontext*, pCtx, unsigned int, Flags, CUdevice, device) +#undef cuArrayCreate DEF_FN(CUresult, cuArrayCreate, CUarray*, pHandle, const CUDA_ARRAY_DESCRIPTOR*, pAllocateArray) +#undef cuArrayGetDescriptor DEF_FN(CUresult, cuArrayGetDescriptor, CUDA_ARRAY_DESCRIPTOR*, pArrayDescriptor, CUarray, hArray) +#undef cuArray3DCreate DEF_FN(CUresult, cuArray3DCreate, CUarray*, pHandle, const CUDA_ARRAY3D_DESCRIPTOR*, pAllocateArray) +#undef cuArray3DGetDescriptor DEF_FN(CUresult, cuArray3DGetDescriptor, CUDA_ARRAY3D_DESCRIPTOR*, pArrayDescriptor, CUarray, hArray) +#undef cuTexRefSetAddress2D DEF_FN(CUresult, cuTexRefSetAddress2D, CUtexref, hTexRef, const CUDA_ARRAY_DESCRIPTOR*, desc, CUdeviceptr, dptr, size_t, Pitch) +#undef cuTexRefSetAddress DEF_FN(CUresult, cuTexRefSetAddress, size_t*, ByteOffset, CUtexref, hTexRef, CUdeviceptr, dptr, size_t, bytes) - - - - - - DEF_FN(CUresult, cuGLInit) #undef cuGLGetDevices #undef cuGLMapBufferObject_v2 @@ -212,7 +227,7 @@ CUresult cuDeviceGetUuid(CUuuid* uuid, CUdevice dev) } DEF_FN(CUresult, cuDeviceGetLuid, char*, luid, unsigned int*, deviceNodeMask, CUdevice, dev) -//DEF_FN(CUresult, cuDeviceGetAttribute, int*, pi, CUdevice_attribute, attrib, CUdevice, dev) + CUresult cuDeviceGetAttribute(int* pi, CUdevice_attribute attrib, CUdevice dev) { enum clnt_stat retval; @@ -227,9 +242,67 @@ CUresult cuDeviceGetAttribute(int* pi, CUdevice_attribute attrib, CUdevice dev) *pi = result.int_result_u.data; return result.err; } -DEF_FN(CUresult, cuDeviceGetProperties, CUdevprop*, prop, CUdevice, dev) + +CUresult cuDeviceGetProperties(CUdevprop* prop, CUdevice dev) +{ + enum clnt_stat retval; + mem_result result; + if (prop == NULL) { + LOGE(LOG_ERROR, "%s: prop is NULL", __FUNCTION__); + return CUDA_ERROR_INVALID_VALUE; + } + retval = rpc_cudevicegetproperties_1(dev, &result, clnt); + LOGE(LOG_DEBUG, "%s = %d, result len: %d", __FUNCTION__, result.err, + result.mem_result_u.data.mem_data_len); + if (retval != RPC_SUCCESS) { + fprintf(stderr, "[rpc] %s failed.", __FUNCTION__); + return CUDA_ERROR_UNKNOWN; + } + if (result.mem_result_u.data.mem_data_len != sizeof(CUdevprop)) { + LOGE(LOG_ERROR, "%s: size mismatch", __FUNCTION__); + return CUDA_ERROR_INVALID_VALUE; + } + if (memcpy(prop, result.mem_result_u.data.mem_data_val, sizeof(CUdevprop)) == NULL) { + LOGE(LOG_ERROR, "%s: memcpy failed", __FUNCTION__); + return CUDA_ERROR_UNKNOWN; + } + return result.err; +} +CUresult cuDeviceComputeCapability(int* major, int* minor, CUdevice dev) +{ + enum clnt_stat retval; + dint_result result; + if (major == NULL || minor == NULL) { + LOGE(LOG_ERROR, "%s: major or minor is NULL", __FUNCTION__); + return CUDA_ERROR_INVALID_VALUE; + } + retval = rpc_cudevicecomputecapability_1(dev, &result, clnt); + LOGE(LOG_DEBUG, "%s = %d, result %d, %d", __FUNCTION__, result.err, + result.dint_result_u.data.i1, + result.dint_result_u.data.i2); + if (retval != RPC_SUCCESS) { + fprintf(stderr, "[rpc] %s failed.", __FUNCTION__); + return CUDA_ERROR_UNKNOWN; + } + *major = result.dint_result_u.data.i1; + *minor = result.dint_result_u.data.i2; + return result.err; +} + 
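+/* The dint_result RPC type packs two integers (i1, i2) into one reply; cuDeviceComputeCapability above unpacks them as major/minor, and cuDevicePrimaryCtxGetState below unpacks them as flags/active. */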
DEF_FN(CUresult, cuDeviceGetByPCIBusId, CUdevice*, dev, const char*, pciBusId) -DEF_FN(CUresult, cuDeviceGetP2PAttribute, int*, value, CUdevice_P2PAttribute, attrib, CUdevice, srcDevice, CUdevice, dstDevice) +CUresult cuDeviceGetP2PAttribute ( int* value, CUdevice_P2PAttribute attrib, CUdevice srcDevice, CUdevice dstDevice ) +{ + enum clnt_stat retval; + int_result result; + if (value == NULL) { + LOGE(LOG_ERROR, "%s: value is NULL", __FUNCTION__); + return CUDA_ERROR_INVALID_VALUE; + } + retval = rpc_cudevicegetp2pattribute_1((int)attrib, (ptr)srcDevice, (ptr)dstDevice, &result, clnt); + LOGE(LOG_DEBUG, "[rpc] %s(%d, %d, %d) = %d, result %d", __FUNCTION__, attrib, srcDevice, dstDevice, result.err, result.int_result_u.data); + if (retval != RPC_SUCCESS) { + fprintf(stderr, "[rpc] %s failed.", __FUNCTION__); + return CUDA_ERROR_UNKNOWN; + } + *value = result.int_result_u.data; + return result.err; +} + //DEF_FN(CUresult, cuDriverGetVersion, int*, driverVersion) CUresult cuDriverGetVersion(int* driverVersion) { @@ -261,9 +334,31 @@ CUresult cuDevicePrimaryCtxRetain(CUcontext *pctx, CUdevice dev) *pctx = (CUcontext)result.ptr_result_u.ptr; return result.err; } +#undef cuDevicePrimaryCtxRelease DEF_FN(CUresult, cuDevicePrimaryCtxRelease, CUdevice, dev) +#undef cuDevicePrimaryCtxSetFlags DEF_FN(CUresult, cuDevicePrimaryCtxSetFlags, CUdevice, dev, unsigned int, flags) -DEF_FN(CUresult, cuDevicePrimaryCtxGetState, CUdevice, dev, unsigned int*, flags, int*, active) +CUresult cuDevicePrimaryCtxGetState(CUdevice dev, unsigned int* flags, int* active) +{ + enum clnt_stat retval; + dint_result result; + if (flags == NULL || active == NULL) { + LOGE(LOG_ERROR, "%s flags or active is NULL.", __FUNCTION__); + return CUDA_ERROR_INVALID_VALUE; + } + retval = rpc_cudeviceprimaryctxgetstate_1(dev, &result, clnt); + LOGE(LOG_DEBUG, "%s = %d, result %d %d", __FUNCTION__, result.err, + result.dint_result_u.data.i1, + result.dint_result_u.data.i2); + if (retval != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed.", __FUNCTION__); + return CUDA_ERROR_UNKNOWN; + } + *flags = result.dint_result_u.data.i1; + *active = result.dint_result_u.data.i2; + return result.err; +} +#undef cuDevicePrimaryCtxReset DEF_FN(CUresult, cuDevicePrimaryCtxReset, CUdevice, dev) DEF_FN(CUresult, cuCtxGetFlags, unsigned int*, flags) //DEF_FN(CUresult, cuCtxSetCurrent, CUcontext, ctx) @@ -344,8 +439,51 @@ CUresult cuModuleLoad(CUmodule* module, const char* fname) } return result.err; } -DEF_FN(CUresult, cuModuleLoadData, CUmodule*, module, const void*, image) -//DEF_FN(CUresult, cuModuleLoadDataEx, CUmodule*, module, const void*, image, unsigned int, numOptions, CUjit_option*, options, void**, optionValues) + + +CUresult cuModuleLoadData(CUmodule* module, const void* image) +{ + enum clnt_stat retval; + ptr_result result; + mem_data mem; + + if (image == NULL) { + LOGE(LOG_ERROR, "image is NULL!"); + return CUDA_ERROR_INVALID_IMAGE; + } + Elf64_Ehdr *ehdr = (Elf64_Ehdr*)image; + + if (ehdr->e_ident[EI_MAG0] != ELFMAG0 || + ehdr->e_ident[EI_MAG1] != ELFMAG1 || + ehdr->e_ident[EI_MAG2] != ELFMAG2 || + ehdr->e_ident[EI_MAG3] != ELFMAG3) { + LOGE(LOG_ERROR, "image is not an ELF!"); + return CUDA_ERROR_INVALID_IMAGE; + } + + mem.mem_data_len = ehdr->e_shoff + ehdr->e_shnum * ehdr->e_shentsize; + mem.mem_data_val = (uint8_t*)image; + + LOGE(LOG_DEBUG, "image_size = %#0zx", mem.mem_data_len); + + if (elf2_parameter_info(&kernel_infos, mem.mem_data_val, mem.mem_data_len) != 0) { + LOGE(LOG_ERROR, "could not get kernel infos from memory"); + return CUDA_ERROR_INVALID_IMAGE; + } + + retval = rpc_cumoduleloaddata_1(mem, &result, clnt); + printf("[rpc] %s(%p) = %d, result %p\n", __FUNCTION__, image, 
result.err, (void*)result.ptr_result_u.ptr); + if (retval != RPC_SUCCESS) { + fprintf(stderr, "[rpc] %s failed.", __FUNCTION__); + return CUDA_ERROR_UNKNOWN; + } + if (module != NULL) { + *module = (CUmodule)result.ptr_result_u.ptr; + } + return result.err; +} + +DEF_FN(CUresult, cuModuleLoadDataEx, CUmodule*, module, const void*, image, unsigned int, numOptions, CUjit_option*, options, void**, optionValues) DEF_FN(CUresult, cuModuleLoadFatBinary, CUmodule*, module, const void*, fatCubin) CUresult cuModuleUnload(CUmodule hmod) { @@ -360,7 +498,6 @@ CUresult cuModuleUnload(CUmodule hmod) } return result; } -//DEF_FN(CUresult, cuModuleGetFunction, CUfunction*, hfunc, CUmodule, hmod, const char*, name) CUresult cuModuleGetFunction(CUfunction* hfun, CUmodule hmod, const char* name) { enum clnt_stat retval; @@ -373,8 +510,8 @@ CUresult cuModuleGetFunction(CUfunction* hfun, CUmodule hmod, const char* name) return CUDA_ERROR_UNKNOWN; } *hfun = (CUfunction)result.ptr_result_u.ptr; - if ((info = cricketd_utils_search_info(&kernel_infos, (char*)name)) == NULL) { - LOGE(LOG_ERROR, "cannot find kernel %s kernel_info_t"); + if ((info = utils_search_info(&kernel_infos, (char*)name)) == NULL) { + LOGE(LOG_ERROR, "cannot find kernel %s kernel_info_t", name); return CUDA_ERROR_UNKNOWN; } info->host_fun = *hfun; @@ -402,6 +539,7 @@ DEF_FN(CUresult, cuPointerGetAttributes, unsigned int, numAttributes, CUpointer_ DEF_FN(CUresult, cuMemcpy, CUdeviceptr, dst, CUdeviceptr, src, size_t, ByteCount) DEF_FN(CUresult, cuMemcpy_ptds, CUdeviceptr, dst, CUdeviceptr, src, size_t, ByteCount) //DEF_FN(CUresult, cuMemcpyHtoD, CUdeviceptr, dstDevice, const void*, srcHost, size_t, ByteCount) +#undef cuMemcpyHtoD CUresult cuMemcpyHtoD(CUdeviceptr dstDevice, const void* srcHost, size_t ByteCount) { enum clnt_stat retval; @@ -418,34 +556,51 @@ CUresult cuMemcpyHtoD(CUdeviceptr dstDevice, const void* srcHost, size_t ByteCou return result; } DEF_FN(CUresult, cuMemcpyHtoD_v2_ptds, CUdeviceptr, dstDevice, const void*, srcHost, size_t, ByteCount) +#undef cuMemcpyDtoH DEF_FN(CUresult, cuMemcpyDtoH, void*, dstHost, CUdeviceptr, srcDevice, size_t, ByteCount) DEF_FN(CUresult, cuMemcpyDtoH_v2_ptds, void*, dstHost, CUdeviceptr, srcDevice, size_t, ByteCount) +#undef cuMemcpyDtoD DEF_FN(CUresult, cuMemcpyDtoD, CUdeviceptr, dstDevice, CUdeviceptr, srcDevice, size_t, ByteCount) DEF_FN(CUresult, cuMemcpyDtoD_v2_ptds, CUdeviceptr, dstDevice, CUdeviceptr, srcDevice, size_t, ByteCount) +#undef cuMemcpyDtoA DEF_FN(CUresult, cuMemcpyDtoA, CUarray, dstArray, size_t, dstOffset, CUdeviceptr, srcDevice, size_t, ByteCount) +#undef cuMemcpyAtoD DEF_FN(CUresult, cuMemcpyAtoD, CUdeviceptr, dstDevice, CUarray, srcArray, size_t, srcOffset, size_t, ByteCount) +#undef cuMemcpyHtoA DEF_FN(CUresult, cuMemcpyHtoA, CUarray, dstArray, size_t, dstOffset, const void*, srcHost, size_t, ByteCount) +#undef cuMemcpyAtoH DEF_FN(CUresult, cuMemcpyAtoH, void*, dstHost, CUarray, srcArray, size_t, srcOffset, size_t, ByteCount) +#undef cuMemcpyAtoA DEF_FN(CUresult, cuMemcpyAtoA, CUarray, dstArray, size_t, dstOffset, CUarray, srcArray, size_t, srcOffset, size_t, ByteCount) +#undef cuMemcpy2D DEF_FN(CUresult, cuMemcpy2D, const CUDA_MEMCPY2D*, pCopy) +#undef cuMemcpy2DUnaligned DEF_FN(CUresult, cuMemcpy2DUnaligned, const CUDA_MEMCPY2D*, pCopy) DEF_FN(CUresult, cuMemcpy2DUnaligned_v2_ptds, const CUDA_MEMCPY2D*, pCopy) +#undef cuMemcpy3D DEF_FN(CUresult, cuMemcpy3D, const CUDA_MEMCPY3D*, pCopy) DEF_FN(CUresult, cuMemcpy3D_v2_ptds, const CUDA_MEMCPY3D*, pCopy) DEF_FN(CUresult, 
cuMemcpyPeerAsync, CUdeviceptr, dstDevice, CUcontext, dstContext, CUdeviceptr, srcDevice, CUcontext, srcContext, size_t, ByteCount, CUstream, hStream) DEF_FN(CUresult, cuMemcpyPeerAsync_ptsz, CUdeviceptr, dstDevice, CUcontext, dstContext, CUdeviceptr, srcDevice, CUcontext, srcContext, size_t, ByteCount, CUstream, hStream) +#undef cuMemcpyHtoAAsync DEF_FN(CUresult, cuMemcpyHtoAAsync, CUarray, dstArray, size_t, dstOffset, const void*, srcHost, size_t, ByteCount, CUstream, hStream) +#undef cuMemcpyAtoHAsync DEF_FN(CUresult, cuMemcpyAtoHAsync, void*, dstHost, CUarray, srcArray, size_t, srcOffset, size_t, ByteCount, CUstream, hStream) DEF_FN(CUresult, cuMemcpy3DPeerAsync, const CUDA_MEMCPY3D_PEER*, pCopy, CUstream, hStream) DEF_FN(CUresult, cuMemcpy3DPeerAsync_ptsz, const CUDA_MEMCPY3D_PEER*, pCopy, CUstream, hStream) +#undef cuMemcpyHtoDAsync DEF_FN(CUresult, cuMemcpyHtoDAsync, CUdeviceptr, dstDevice, const void*, srcHost, size_t, ByteCount, CUstream, hStream) DEF_FN(CUresult, cuMemcpyHtoDAsync_v2_ptsz, CUdeviceptr, dstDevice, const void*, srcHost, size_t, ByteCount, CUstream, hStream) +#undef cuMemcpyDtoHAsync DEF_FN(CUresult, cuMemcpyDtoHAsync, void*, dstHost, CUdeviceptr, srcDevice, size_t, ByteCount, CUstream, hStream) DEF_FN(CUresult, cuMemcpyDtoHAsync_v2_ptsz, void*, dstHost, CUdeviceptr, srcDevice, size_t, ByteCount, CUstream, hStream) +#undef cuMemcpyDtoDAsync DEF_FN(CUresult, cuMemcpyDtoDAsync, CUdeviceptr, dstDevice, CUdeviceptr, srcDevice, size_t, ByteCount, CUstream, hStream) DEF_FN(CUresult, cuMemcpyDtoDAsync_v2_ptsz, CUdeviceptr, dstDevice, CUdeviceptr, srcDevice, size_t, ByteCount, CUstream, hStream) +#undef cuMemcpy2DAsync DEF_FN(CUresult, cuMemcpy2DAsync, const CUDA_MEMCPY2D*, pCopy, CUstream, hStream) DEF_FN(CUresult, cuMemcpy2DAsync_v2_ptsz, const CUDA_MEMCPY2D*, pCopy, CUstream, hStream) +#undef cuMemcpy3DAsync DEF_FN(CUresult, cuMemcpy3DAsync, const CUDA_MEMCPY3D*, pCopy, CUstream, hStream) DEF_FN(CUresult, cuMemcpy3DAsync_v2_ptsz, const CUDA_MEMCPY3D*, pCopy, CUstream, hStream) DEF_FN(CUresult, cuMemcpyAsync, CUdeviceptr, dst, CUdeviceptr, src, size_t, ByteCount, CUstream, hStream) @@ -567,14 +722,19 @@ DEF_FN(CUresult, cuEventRecord_ptsz, CUevent, hEvent, CUstream, hStream) DEF_FN(CUresult, cuEventQuery, CUevent, hEvent) DEF_FN(CUresult, cuEventSynchronize, CUevent, hEvent) DEF_FN(CUresult, cuEventElapsedTime, float*, pMilliseconds, CUevent, hStart, CUevent, hEnd) +#undef cuStreamWaitValue32 DEF_FN(CUresult, cuStreamWaitValue32, CUstream, stream, CUdeviceptr, addr, cuuint32_t, value, unsigned int, flags) DEF_FN(CUresult, cuStreamWaitValue32_ptsz, CUstream, stream, CUdeviceptr, addr, cuuint32_t, value, unsigned int, flags) +#undef cuStreamWriteValue32 DEF_FN(CUresult, cuStreamWriteValue32, CUstream, stream, CUdeviceptr, addr, cuuint32_t, value, unsigned int, flags) DEF_FN(CUresult, cuStreamWriteValue32_ptsz, CUstream, stream, CUdeviceptr, addr, cuuint32_t, value, unsigned int, flags) +#undef cuStreamWaitValue64 DEF_FN(CUresult, cuStreamWaitValue64, CUstream, stream, CUdeviceptr, addr, cuuint64_t, value, unsigned int, flags) DEF_FN(CUresult, cuStreamWaitValue64_ptsz, CUstream, stream, CUdeviceptr, addr, cuuint64_t, value, unsigned int, flags) +#undef cuStreamWriteValue64 DEF_FN(CUresult, cuStreamWriteValue64, CUstream, stream, CUdeviceptr, addr, cuuint64_t, value, unsigned int, flags) DEF_FN(CUresult, cuStreamWriteValue64_ptsz, CUstream, stream, CUdeviceptr, addr, cuuint64_t, value, unsigned int, flags) +#undef cuStreamBatchMemOp DEF_FN(CUresult, cuStreamBatchMemOp, 
CUstream, stream, unsigned int, count, CUstreamBatchMemOpParams*, paramArray, unsigned int, flags) DEF_FN(CUresult, cuStreamBatchMemOp_ptsz, CUstream, stream, unsigned int, count, CUstreamBatchMemOpParams*, paramArray, unsigned int, flags) DEF_FN(CUresult, cuStreamCreate, CUstream*, phStream, unsigned int, Flags) @@ -600,6 +760,7 @@ DEF_FN(CUresult, cuCtxDisablePeerAccess, CUcontext, peerContext) DEF_FN(CUresult, cuIpcGetEventHandle, CUipcEventHandle*, pHandle, CUevent, event) DEF_FN(CUresult, cuIpcOpenEventHandle, CUevent*, phEvent, CUipcEventHandle, handle) DEF_FN(CUresult, cuIpcGetMemHandle, CUipcMemHandle*, pHandle, CUdeviceptr, dptr) +#undef cuIpcOpenMemHandle DEF_FN(CUresult, cuIpcOpenMemHandle, CUdeviceptr*, pdptr, CUipcMemHandle, handle, unsigned int, Flags) DEF_FN(CUresult, cuIpcCloseMemHandle, CUdeviceptr, dptr) DEF_FN(CUresult, cuGraphicsUnregisterResource, CUgraphicsResource, resource) @@ -609,7 +770,9 @@ DEF_FN(CUresult, cuGraphicsUnmapResources, unsigned int, count, CUgraphicsResour DEF_FN(CUresult, cuGraphicsUnmapResources_ptsz, unsigned int, count, CUgraphicsResource*, resources, CUstream, hStream) DEF_FN(CUresult, cuGraphicsSubResourceGetMappedArray, CUarray*, pArray, CUgraphicsResource, resource, unsigned int, arrayIndex, unsigned int, mipLevel) DEF_FN(CUresult, cuGraphicsResourceGetMappedMipmappedArray, CUmipmappedArray*, pMipmappedArray, CUgraphicsResource, resource) +#undef cuGraphicsResourceGetMappedPointer DEF_FN(CUresult, cuGraphicsResourceGetMappedPointer, CUdeviceptr*, pDevPtr, size_t*, pSize, CUgraphicsResource, resource) +#undef cuGraphicsResourceSetMapFlags DEF_FN(CUresult, cuGraphicsResourceSetMapFlags, CUgraphicsResource, resource, unsigned int, flags) //DEF_FN(CUresult, cuGetExportTable, const void**, ppExportTable, const CUuuid*, pExportTableId) @@ -672,8 +835,11 @@ CUresult cuGetErrorString(CUresult error, const char** pStr) } DEF_FN(CUresult, cuGetErrorName, CUresult, error, const char**, pStr) DEF_FN(CUresult, cuGraphCreate, CUgraph*, phGraph, unsigned int, flags) +#undef cuGraphAddKernelNode DEF_FN(CUresult, cuGraphAddKernelNode, CUgraphNode*, phGraphNode, CUgraph, hGraph, const CUgraphNode*, dependencies, size_t, numDependencies, const CUDA_KERNEL_NODE_PARAMS*, nodeParams) +#undef cuGraphKernelNodeGetParams DEF_FN(CUresult, cuGraphKernelNodeGetParams, CUgraphNode, hNode, CUDA_KERNEL_NODE_PARAMS*, nodeParams) +#undef cuGraphKernelNodeSetParams DEF_FN(CUresult, cuGraphKernelNodeSetParams, CUgraphNode, hNode, const CUDA_KERNEL_NODE_PARAMS*, nodeParams) DEF_FN(CUresult, cuGraphAddMemcpyNode, CUgraphNode*, phGraphNode, CUgraph, hGraph, const CUgraphNode*, dependencies, size_t, numDependencies, const CUDA_MEMCPY3D*, copyParams, CUcontext, ctx) DEF_FN(CUresult, cuGraphMemcpyNodeGetParams, CUgraphNode, hNode, CUDA_MEMCPY3D*, nodeParams) @@ -697,7 +863,12 @@ DEF_FN(CUresult, cuGraphNodeGetDependencies, CUgraphNode, hNode, CUgraphNode*, d DEF_FN(CUresult, cuGraphNodeGetDependentNodes, CUgraphNode, hNode, CUgraphNode*, dependentNodes, size_t*, numDependentNodes) DEF_FN(CUresult, cuGraphAddDependencies, CUgraph, hGraph, const CUgraphNode*, from, const CUgraphNode*, to, size_t, numDependencies) DEF_FN(CUresult, cuGraphRemoveDependencies, CUgraph, hGraph, const CUgraphNode*, from, const CUgraphNode*, to, size_t, numDependencies) +#if CUDA_VERSION >= 12000 +#undef cuGraphInstantiate +DEF_FN(CUresult, cuGraphInstantiate, CUgraphExec*, phGraphExec, CUgraph, hGraph, unsigned long long, flags) +#else DEF_FN(CUresult, cuGraphInstantiate, CUgraphExec*, phGraphExec, 
CUgraph, hGraph, CUgraphNode*, phErrorNode, char*, logBuffer, size_t, bufferSize) +#endif DEF_FN(CUresult, cuGraphLaunch, CUgraphExec, hGraphExec, CUstream, hStream) DEF_FN(CUresult, cuGraphLaunch_ptsz, CUgraphExec, hGraphExec, CUstream, hStream) DEF_FN(CUresult, cuGraphExecDestroy, CUgraphExec, hGraphExec) @@ -705,7 +876,6 @@ DEF_FN(CUresult, cuGraphDestroyNode, CUgraphNode, hNode) DEF_FN(CUresult, cuGraphDestroy, CUgraph, hGraph) DEF_FN(CUresult, cuGraphDestroy_ptsz, CUgraph, hGraph) DEF_FN(CUresult, cuStreamBeginCapture_ptsz, CUstream, hStream) -DEF_FN(CUresult, cuStreamBeginCapture, CUstream, hStream, CUstreamCaptureMode, mode) #undef cuStreamBeginCapture DEF_FN(CUresult, cuStreamBeginCapture, CUstream, hStream, CUstreamCaptureMode, mode) DEF_FN(CUresult, cuStreamBeginCapture_v2_ptsz, CUstream, hStream) @@ -714,6 +884,30 @@ DEF_FN(CUresult, cuStreamEndCapture_ptsz, CUstream, hStream, CUgraph*, phGraph) DEF_FN(CUresult, cuStreamIsCapturing, CUstream, hStream, CUstreamCaptureStatus*, captureStatus) DEF_FN(CUresult, cuStreamIsCapturing_ptsz, CUstream, hStream, CUstreamCaptureStatus*, captureStatus) DEF_FN(CUresult, cuThreadExchangeStreamCaptureMode, CUstreamCaptureMode*, mode) -DEF_FN(CUresult, cuStreamGetCaptureInfo, CUstream, hStream, CUstreamCaptureStatus*, captureStatus, cuuint64_t*, id) +#undef cuStreamGetCaptureInfo +DEF_FN(CUresult, cuStreamGetCaptureInfo, CUstream, hStream, CUstreamCaptureStatus*, captureStatus_out, cuuint64_t*, id_out, CUgraph*, graph_out, const CUgraphNode**, dependencies_out, size_t*, numDependencies_out) DEF_FN(CUresult, cuStreamGetCaptureInfo_ptsz, CUstream, hStream, CUstreamCaptureStatus*, captureStatus, cuuint64_t*, id) +#undef cuGraphExecKernelNodeSetParams DEF_FN(CUresult, cuGraphExecKernelNodeSetParams, CUgraphExec, hGraphExec, CUgraphNode, hNode, const CUDA_KERNEL_NODE_PARAMS*, nodeParams) + +#if CUDA_VERSION >= 12000 +#undef cuGetProcAddress +CUresult cuGetProcAddress(const char* symbol, void** pfn, int cudaVersion, cuuint64_t flags, CUdriverProcAddressQueryResult* symbolStatus) +{ + enum clnt_stat retval; + ptr_result result; + LOGE(LOG_DEBUG, "%s(%s, %d, %llx)", __FUNCTION__, symbol, cudaVersion, flags); + + *pfn = elf2_symbol_address(symbol); + if (*pfn == NULL) { + LOGE(LOG_WARNING, "symbol %s not found.", symbol); + return CUDA_ERROR_UNKNOWN; + } + // Pytorch uses the 11.3 API of this function which does not have the symbolStatus parameter + // Because we do not support API versioning yet and to avoid segfaults, we ignore this parameter for now. 
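+ // Note: the address is resolved locally by elf2_symbol_address() without an RPC round-trip; unknown symbols return CUDA_ERROR_UNKNOWN above.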
+ //*symbolStatus = CU_GET_PROC_ADDRESS_VERSION_NOT_SUFFICIENT; + return cudaSuccess; +} +#endif + + diff --git a/cpu/cpu-client-nvml.c b/cpu/cpu-client-nvml.c new file mode 100644 index 00000000..29f86380 --- /dev/null +++ b/cpu/cpu-client-nvml.c @@ -0,0 +1,211 @@ +#define _GNU_SOURCE +#include +#include +#include + +#include "cpu-libwrap.h" +#include "cpu_rpc_prot.h" +#include "cpu-common.h" +#include "cpu-utils.h" +#include "log.h" + +#ifdef WITH_API_CNT +static int api_call_cnt = 0; +void cpu_nvml_print_api_call_cnt(void) +{ + LOG(LOG_INFO, "nvml api-call-cnt: %d", api_call_cnt); +} +#endif //WITH_API_CNT + +nvmlReturn_t nvmlInitWithFlags ( unsigned int flags ) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + retval_1 = rpc_nvmlinitwithflags_1(flags, &result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "call failed: %s", __FUNCTION__); + return result; + } + return result; +} + +#undef nvmlInit +nvmlReturn_t nvmlInit(void) +{ + return nvmlInitWithFlags(0); +} + +nvmlReturn_t nvmlInit_v2 ( void ) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + retval_1 = rpc_nvmlinit_v2_1(&result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "call failed: %s", __FUNCTION__); + return result; + } + return result; +} +nvmlReturn_t nvmlShutdown ( void ) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + retval_1 = rpc_nvmlshutdown_1(&result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "call failed: %s", __FUNCTION__); + return result; + } + return result; +} + + +DEF_FN(nvmlReturn_t, nvmlDeviceGetAPIRestriction, nvmlDevice_t, device, nvmlRestrictedAPI_t, apiType, nvmlEnableState_t*, isRestricted ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetAdaptiveClockInfoStatus, nvmlDevice_t, device, unsigned int*, adaptiveClockStatus ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetApplicationsClock, nvmlDevice_t, device, nvmlClockType_t, clockType, unsigned int*, clockMHz ) +#if NVML_API_VERSION >= 12 +DEF_FN(nvmlReturn_t, nvmlDeviceGetArchitecture, nvmlDevice_t, device, nvmlDeviceArchitecture_t*, arch ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetAttributes_v2, nvmlDevice_t, device, nvmlDeviceAttributes_t*, attributes ) +#endif +DEF_FN(nvmlReturn_t, nvmlDeviceGetAutoBoostedClocksEnabled, nvmlDevice_t, device, nvmlEnableState_t*, isEnabled, nvmlEnableState_t*, defaultIsEnabled ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetBAR1MemoryInfo, nvmlDevice_t, device, nvmlBAR1Memory_t*, bar1Memory ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetBoardId, nvmlDevice_t, device, unsigned int*, boardId ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetBoardPartNumber, nvmlDevice_t, device, char*, partNumber, unsigned int, length ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetBrand, nvmlDevice_t, device, nvmlBrandType_t*, type ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetBridgeChipInfo, nvmlDevice_t, device, nvmlBridgeChipHierarchy_t*, bridgeHierarchy ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetClock, nvmlDevice_t, device, nvmlClockType_t, clockType, nvmlClockId_t, clockId, unsigned int*, clockMHz ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetClockInfo, nvmlDevice_t, device, nvmlClockType_t, type, unsigned int*, clock ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetComputeMode, nvmlDevice_t, device, nvmlComputeMode_t*, mode ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetComputeRunningProcesses_v3, nvmlDevice_t, device, unsigned int*, infoCount, nvmlProcessInfo_t*, infos ) +nvmlReturn_t nvmlDeviceGetCount_v2(unsigned int* deviceCount ) 
+{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int_result result; + enum clnt_stat retval_1; + if (deviceCount == NULL) { + return NVML_ERROR_INVALID_ARGUMENT; + } + retval_1 = rpc_nvmldevicegetcount_v2_1(&result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "call failed: %s", __FUNCTION__); + } + if (result.err == 0) { + *deviceCount = result.int_result_u.data; + } + return result.err; +} +DEF_FN(nvmlReturn_t, nvmlDeviceGetCudaComputeCapability, nvmlDevice_t, device, int*, major, int*, minor ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetCurrPcieLinkGeneration, nvmlDevice_t, device, unsigned int*, currLinkGen ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetCurrPcieLinkWidth, nvmlDevice_t, device, unsigned int*, currLinkWidth ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetCurrentClocksThrottleReasons, nvmlDevice_t, device, unsigned long long*, clocksThrottleReasons ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetDecoderUtilization, nvmlDevice_t, device, unsigned int*, utilization, unsigned int*, samplingPeriodUs ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetDefaultApplicationsClock, nvmlDevice_t, device, nvmlClockType_t, clockType, unsigned int*, clockMHz ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetDefaultEccMode, nvmlDevice_t, device, nvmlEnableState_t*, defaultMode ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetDetailedEccErrors, nvmlDevice_t, device, nvmlMemoryErrorType_t, errorType, nvmlEccCounterType_t, counterType, nvmlEccErrorCounts_t*, eccCounts ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetDisplayActive, nvmlDevice_t, device, nvmlEnableState_t*, isActive ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetDisplayMode, nvmlDevice_t, device, nvmlEnableState_t*, display ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetDriverModel, nvmlDevice_t, device, nvmlDriverModel_t*, current, nvmlDriverModel_t*, pending ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetEccMode, nvmlDevice_t, device, nvmlEnableState_t*, current, nvmlEnableState_t*, pending ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetEncoderCapacity, nvmlDevice_t, device, nvmlEncoderType_t, encoderQueryType, unsigned int*, encoderCapacity ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetEncoderSessions, nvmlDevice_t, device, unsigned int*, sessionCount, nvmlEncoderSessionInfo_t*, sessionInfos ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetEncoderStats, nvmlDevice_t, device, unsigned int*, sessionCount, unsigned int*, averageFps, unsigned int*, averageLatency ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetEncoderUtilization, nvmlDevice_t, device, unsigned int*, utilization, unsigned int*, samplingPeriodUs ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetEnforcedPowerLimit, nvmlDevice_t, device, unsigned int*, limit ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetFBCSessions, nvmlDevice_t, device, unsigned int*, sessionCount, nvmlFBCSessionInfo_t*, sessionInfo ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetFBCStats, nvmlDevice_t, device, nvmlFBCStats_t*, fbcStats ) +#if NVML_API_VERSION >= 12 +DEF_FN(nvmlReturn_t, nvmlDeviceGetFanControlPolicy_v2, nvmlDevice_t, device, unsigned int, fan, nvmlFanControlPolicy_t*, policy ) +#endif +DEF_FN(nvmlReturn_t, nvmlDeviceGetFanSpeed, nvmlDevice_t, device, unsigned int*, speed ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetFanSpeed_v2, nvmlDevice_t, device, unsigned int, fan, unsigned int*, speed ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetGpuMaxPcieLinkGeneration, nvmlDevice_t, device, unsigned int*, maxLinkGenDevice ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetGpuOperationMode, nvmlDevice_t, device, nvmlGpuOperationMode_t*, current, nvmlGpuOperationMode_t*, pending ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetGraphicsRunningProcesses_v3, nvmlDevice_t, device, unsigned int*, 
infoCount, nvmlProcessInfo_t*, infos ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetHandleByIndex_v2, unsigned int, index, nvmlDevice_t*, device ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetHandleByPciBusId_v2, const char*, pciBusId, nvmlDevice_t*, device ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetHandleBySerial, const char*, serial, nvmlDevice_t*, device ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetHandleByUUID, const char*, uuid, nvmlDevice_t*, device ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetIndex, nvmlDevice_t, device, unsigned int*, index ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetInforomConfigurationChecksum, nvmlDevice_t, device, unsigned int*, checksum ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetInforomImageVersion, nvmlDevice_t, device, char*, version, unsigned int, length ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetInforomVersion, nvmlDevice_t, device, nvmlInforomObject_t, object, char*, version, unsigned int, length ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetIrqNum, nvmlDevice_t, device, unsigned int*, irqNum ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetMPSComputeRunningProcesses_v3, nvmlDevice_t, device, unsigned int*, infoCount, nvmlProcessInfo_t*, infos ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetMaxClockInfo, nvmlDevice_t, device, nvmlClockType_t, type, unsigned int*, clock ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetMaxCustomerBoostClock, nvmlDevice_t, device, nvmlClockType_t, clockType, unsigned int*, clockMHz ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetMaxPcieLinkGeneration, nvmlDevice_t, device, unsigned int*, maxLinkGen ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetMaxPcieLinkWidth, nvmlDevice_t, device, unsigned int*, maxLinkWidth ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetMemoryBusWidth, nvmlDevice_t, device, unsigned int*, busWidth ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetMemoryErrorCounter, nvmlDevice_t, device, nvmlMemoryErrorType_t, errorType, nvmlEccCounterType_t, counterType, nvmlMemoryLocation_t, locationType, unsigned long long*, count ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetMemoryInfo, nvmlDevice_t, device, nvmlMemory_t*, memory ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetMinMaxFanSpeed, nvmlDevice_t, device, unsigned int*, minSpeed, unsigned int*, maxSpeed ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetMinorNumber, nvmlDevice_t, device, unsigned int*, minorNumber ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetMultiGpuBoard, nvmlDevice_t, device, unsigned int*, multiGpuBool ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetName, nvmlDevice_t, device, char*, name, unsigned int, length ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetNumFans, nvmlDevice_t, device, unsigned int*, numFans ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetNumGpuCores, nvmlDevice_t, device, unsigned int*, numCores ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetP2PStatus, nvmlDevice_t, device1, nvmlDevice_t, device2, nvmlGpuP2PCapsIndex_t, p2pIndex, nvmlGpuP2PStatus_t*, p2pStatus ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetPciInfo_v3, nvmlDevice_t, device, nvmlPciInfo_t*, pci ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetPcieLinkMaxSpeed, nvmlDevice_t, device, unsigned int*, maxSpeed ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetPcieReplayCounter, nvmlDevice_t, device, unsigned int*, value ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetPcieSpeed, nvmlDevice_t, device, unsigned int*, pcieSpeed ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetPcieThroughput, nvmlDevice_t, device, nvmlPcieUtilCounter_t, counter, unsigned int*, value ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetPerformanceState, nvmlDevice_t, device, nvmlPstates_t*, pState ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetPersistenceMode, nvmlDevice_t, device, nvmlEnableState_t*, mode ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetPowerManagementDefaultLimit, nvmlDevice_t, device, 
unsigned int*, defaultLimit ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetPowerManagementLimit, nvmlDevice_t, device, unsigned int*, limit ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetPowerManagementLimitConstraints, nvmlDevice_t, device, unsigned int*, minLimit, unsigned int*, maxLimit ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetPowerManagementMode, nvmlDevice_t, device, nvmlEnableState_t*, mode ) +#if NVML_API_VERSION >= 12 +DEF_FN(nvmlReturn_t, nvmlDeviceGetPowerSource, nvmlDevice_t, device, nvmlPowerSource_t*, powerSource ) +#endif +DEF_FN(nvmlReturn_t, nvmlDeviceGetPowerState, nvmlDevice_t, device, nvmlPstates_t*, pState ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetPowerUsage, nvmlDevice_t, device, unsigned int*, power ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetRemappedRows, nvmlDevice_t, device, unsigned int*, corrRows, unsigned int*, uncRows, unsigned int*, isPending, unsigned int*, failureOccurred ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetRetiredPages, nvmlDevice_t, device, nvmlPageRetirementCause_t, cause, unsigned int*, pageCount, unsigned long long*, addresses ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetRetiredPagesPendingStatus, nvmlDevice_t, device, nvmlEnableState_t*, isPending ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetRetiredPages_v2, nvmlDevice_t, device, nvmlPageRetirementCause_t, cause, unsigned int*, pageCount, unsigned long long*, addresses, unsigned long long*, timestamps ) +#if NVML_API_VERSION >= 12 +DEF_FN(nvmlReturn_t, nvmlDeviceGetRowRemapperHistogram, nvmlDevice_t, device, nvmlRowRemapperHistogramValues_t*, values ) +#endif +DEF_FN(nvmlReturn_t, nvmlDeviceGetSamples, nvmlDevice_t, device, nvmlSamplingType_t, type, unsigned long long, lastSeenTimeStamp, nvmlValueType_t*, sampleValType, unsigned int*, sampleCount, nvmlSample_t*, samples ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetSerial, nvmlDevice_t, device, char*, serial, unsigned int, length ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetSupportedClocksThrottleReasons, nvmlDevice_t, device, unsigned long long*, supportedClocksThrottleReasons ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetSupportedGraphicsClocks, nvmlDevice_t, device, unsigned int, memoryClockMHz, unsigned int*, count, unsigned int*, clocksMHz ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetSupportedMemoryClocks, nvmlDevice_t, device, unsigned int*, count, unsigned int*, clocksMHz ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetTargetFanSpeed, nvmlDevice_t, device, unsigned int, fan, unsigned int*, targetSpeed ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetTemperature, nvmlDevice_t, device, nvmlTemperatureSensors_t, sensorType, unsigned int*, temp ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetTemperatureThreshold, nvmlDevice_t, device, nvmlTemperatureThresholds_t, thresholdType, unsigned int*, temp ) +#if NVML_API_VERSION >= 12 +DEF_FN(nvmlReturn_t, nvmlDeviceGetThermalSettings, nvmlDevice_t, device, unsigned int, sensorIndex, nvmlGpuThermalSettings_t*, pThermalSettings ) +#endif +DEF_FN(nvmlReturn_t, nvmlDeviceGetTopologyCommonAncestor, nvmlDevice_t, device1, nvmlDevice_t, device2, nvmlGpuTopologyLevel_t*, pathInfo ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetTopologyNearestGpus, nvmlDevice_t, device, nvmlGpuTopologyLevel_t, level, unsigned int*, count, nvmlDevice_t*, deviceArray ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetTotalEccErrors, nvmlDevice_t, device, nvmlMemoryErrorType_t, errorType, nvmlEccCounterType_t, counterType, unsigned long long*, eccCounts ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetTotalEnergyConsumption, nvmlDevice_t, device, unsigned long long*, energy ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetUUID, nvmlDevice_t, device, char*, uuid, unsigned int, length ) 
+DEF_FN(nvmlReturn_t, nvmlDeviceGetUtilizationRates, nvmlDevice_t, device, nvmlUtilization_t*, utilization ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetVbiosVersion, nvmlDevice_t, device, char*, version, unsigned int, length ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetViolationStatus, nvmlDevice_t, device, nvmlPerfPolicyType_t, perfPolicyType, nvmlViolationTime_t*, violTime ) +DEF_FN(nvmlReturn_t, nvmlDeviceOnSameBoard, nvmlDevice_t, device1, nvmlDevice_t, device2, int*, onSameBoard ) +DEF_FN(nvmlReturn_t, nvmlDeviceResetApplicationsClocks, nvmlDevice_t, device ) +DEF_FN(nvmlReturn_t, nvmlDeviceSetAutoBoostedClocksEnabled, nvmlDevice_t, device, nvmlEnableState_t, enabled ) +DEF_FN(nvmlReturn_t, nvmlDeviceSetDefaultAutoBoostedClocksEnabled, nvmlDevice_t, device, nvmlEnableState_t, enabled, unsigned int, flags ) +DEF_FN(nvmlReturn_t, nvmlDeviceSetDefaultFanSpeed_v2, nvmlDevice_t, device, unsigned int, fan ) +#if NVML_API_VERSION >= 12 +DEF_FN(nvmlReturn_t, nvmlDeviceSetFanControlPolicy, nvmlDevice_t, device, unsigned int, fan, nvmlFanControlPolicy_t, policy ) +#endif +DEF_FN(nvmlReturn_t, nvmlDeviceSetTemperatureThreshold, nvmlDevice_t, device, nvmlTemperatureThresholds_t, thresholdType, int*, temp ) +DEF_FN(nvmlReturn_t, nvmlDeviceValidateInforom, nvmlDevice_t, device ) +DEF_FN(nvmlReturn_t, nvmlSystemGetTopologyGpuSet, unsigned int, cpuNumber, unsigned int*, count, nvmlDevice_t*, deviceArray ) +DEF_FN(nvmlReturn_t, nvmlVgpuInstanceGetMdevUUID, nvmlVgpuInstance_t, vgpuInstance, char*, mdevUuid, unsigned int, size ) diff --git a/cpu/cpu-client-runtime.c b/cpu/cpu-client-runtime.c index 373993a3..cbd1eab0 100644 --- a/cpu/cpu-client-runtime.c +++ b/cpu/cpu-client-runtime.c @@ -1,4 +1,3 @@ -#include "mt-memcpy.h" #define _GNU_SOURCE #include #include @@ -24,6 +23,7 @@ #include "cpu-utils.h" #include "log.h" #include "oob.h" +#include "mt-memcpy.h" #ifdef WITH_IB #include "cpu-ib.h" #endif //WITH_IB @@ -269,12 +269,12 @@ cudaError_t cudaDeviceSynchronize(void) #endif //WITH_API_CNT int result = -1; enum clnt_stat retval_1; - for (int i=0; result != 0 && i < 10; ++i) { - retval_1 = cuda_device_synchronize_1(&result, clnt); - if (retval_1 != RPC_SUCCESS) { - clnt_perror (clnt, "call failed"); - } - } + + struct timeval timeout = {.tv_sec = -1, .tv_usec = 0}; + + return (clnt_call (clnt, CUDA_DEVICE_SYNCHRONIZE, (xdrproc_t) xdr_void, (caddr_t) NULL, + (xdrproc_t) xdr_int, (caddr_t) &result, + timeout)); return result; } @@ -329,15 +329,18 @@ cudaError_t cudaGetDeviceFlags(unsigned int* flags) return result.err; } +#undef cudaGetDeviceProperties cudaError_t cudaGetDeviceProperties(struct cudaDeviceProp* prop, int device) { #ifdef WITH_API_CNT api_call_cnt++; #endif //WITH_API_CNT - mem_result result; - result.mem_result_u.data.mem_data_len = sizeof(struct cudaDeviceProp); - result.mem_result_u.data.mem_data_val = (char*)prop; + cuda_device_prop_result result; enum clnt_stat retval; + if (prop == NULL) { + LOGE(LOG_ERROR, "error: prop == NULL"); + return cudaErrorInvalidValue; + } retval = cuda_get_device_properties_1(device, &result, clnt); if (retval != RPC_SUCCESS) { clnt_perror (clnt, "call failed"); @@ -345,13 +348,21 @@ cudaError_t cudaGetDeviceProperties(struct cudaDeviceProp* prop, int device) if (result.err != 0) { return result.err; } - if (result.mem_result_u.data.mem_data_len != sizeof(struct cudaDeviceProp)) { - LOGE(LOG_ERROR, "error: expected size != retrieved size\n"); + // if (memcpy(prop, result.mem_result_u.data.mem_data_val, sizeof(struct cudaDeviceProp)) == NULL) { + //FIXME: Don't know why, 
but pytorch expects a different definition of cudaDeviceProp, which is only 728 bytes long + if (memcpy(prop, result.cuda_device_prop_result_u.data, 728) == NULL) { + LOGE(LOG_ERROR, "error: memcpy failed"); return result.err; } return result.err; } +cudaError_t cudaGetDeviceProperties_v2(struct cudaDeviceProp* prop, int device) +{ + return cudaGetDeviceProperties(prop, device); +} + + DEF_FN(cudaError_t, cudaIpcCloseMemHandle, void*, devPtr) DEF_FN(cudaError_t, cudaIpcGetEventHandle, cudaIpcEventHandle_t*, handle, cudaEvent_t, event) DEF_FN(cudaError_t, cudaIpcGetMemHandle, cudaIpcMemHandle_t*, handle, void*, devPtr) @@ -572,7 +583,25 @@ cudaError_t cudaStreamGetPriority(cudaStream_t hStream, int* priority) return result.err; } -DEF_FN(cudaError_t, cudaStreamIsCapturing, cudaStream_t, stream, enum cudaStreamCaptureStatus*, pCaptureStatus) +cudaError_t cudaStreamIsCapturing(cudaStream_t stream, enum cudaStreamCaptureStatus* pCaptureStatus) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int_result result; + enum clnt_stat retval_1; + if (pCaptureStatus == NULL) { + return cudaErrorInvalidValue; + } + retval_1 = cuda_stream_is_capturing_1((ptr)stream, &result, clnt); + if (retval_1 != RPC_SUCCESS) { + clnt_perror (clnt, "call failed"); + } + if (result.err == 0) { + *pCaptureStatus = (enum cudaStreamCaptureStatus)result.int_result_u.data; + } + return result.err; +} cudaError_t cudaStreamQuery(cudaStream_t stream) { @@ -752,7 +781,9 @@ DEF_FN(cudaError_t, cudaExternalMemoryGetMappedBuffer, void**, devPtr, cudaExter DEF_FN(cudaError_t, cudaExternalMemoryGetMappedMipmappedArray, cudaMipmappedArray_t*, mipmap, cudaExternalMemory_t, extMem, const struct cudaExternalMemoryMipmappedArrayDesc*, mipmapDesc) DEF_FN(cudaError_t, cudaImportExternalMemory, cudaExternalMemory_t*, extMem_out, const struct cudaExternalMemoryHandleDesc*, memHandleDesc) DEF_FN(cudaError_t, cudaImportExternalSemaphore, cudaExternalSemaphore_t*, extSem_out, const struct cudaExternalSemaphoreHandleDesc*, semHandleDesc) +#undef cudaSignalExternalSemaphoresAsync DEF_FN(cudaError_t, cudaSignalExternalSemaphoresAsync, const cudaExternalSemaphore_t*, extSemArray, const struct cudaExternalSemaphoreSignalParams*, paramsArray, unsigned int, numExtSems, cudaStream_t, stream) +#undef cudaWaitExternalSemaphoresAsync DEF_FN(cudaError_t, cudaWaitExternalSemaphoresAsync, const cudaExternalSemaphore_t*, extSemArray, const struct cudaExternalSemaphoreWaitParams*, paramsArray, unsigned int, numExtSems, cudaStream_t, stream) cudaError_t cudaFuncGetAttributes(struct cudaFuncAttributes* attr, const void* func) @@ -1088,12 +1119,12 @@ cudaError_t cudaFreeArray(cudaArray_t array) } typedef struct host_alloc_info { - int cnt; + int idx; size_t size; void *client_ptr; } host_alloc_info_t; static host_alloc_info_t hainfo[64] = {0}; -static size_t hainfo_cnt = 1; +static size_t hainfo_cnt = 0; static int hainfo_getindex(void *client_ptr) { int i; @@ -1195,44 +1226,49 @@ cudaError_t cudaHostAlloc(void** pHost, size_t size, unsigned int flags) #ifdef WITH_API_CNT api_call_cnt++; #endif //WITH_API_CNT - int ret = cudaErrorMemoryAllocation; + sz_result ret = {.err = cudaErrorMemoryAllocation}; + int reg_ret; int fd_shm; - char shm_name[128]; + char *shm_name = NULL; enum clnt_stat retval_1; if (shm_enabled && connection_is_local == 1) { //Use local shared memory + retval_1 = cuda_host_alloc_1(size, flags, &ret, clnt); + if (retval_1 != RPC_SUCCESS || ret.err != cudaSuccess) { + LOGE(LOG_ERROR, "cudaHostAlloc failed on server-side."); + 
goto out; + } - snprintf(shm_name, 128, "/crickethostalloc-%zu", hainfo_cnt); - if ((fd_shm = shm_open(shm_name, O_RDWR | O_CREAT, S_IRWXU)) == -1) { - LOGE(LOG_ERROR, "ERROR: could not open shared memory \"%s\" with size %d: %s", shm_name, size, strerror(errno)); + if (asprintf(&shm_name, "/crickethostalloc-%zu", ret.sz_result_u.data) == -1) { + LOGE(LOG_ERROR, "ERROR: asprintf failed: %s", strerror(errno)); + ret.err = cudaErrorMemoryAllocation; goto out; } - if (ftruncate(fd_shm, size) == -1) { - LOGE(LOG_ERROR, "ERROR: cannot resize shared memory"); - shm_unlink(shm_name); + + if ((fd_shm = shm_open(shm_name, O_RDWR, S_IREAD | S_IWRITE)) == -1) { + LOGE(LOG_ERROR, "ERROR: could not open shared memory \"%s\" with size %d: %s", shm_name, size, strerror(errno)); + ret.err = cudaErrorMemoryAllocation; goto out; } - LOGE(LOG_DEBUG, "shm opened with name \"%s\", size: %d", shm_name, size); + if ((*pHost = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd_shm, 0)) == MAP_FAILED) { LOGE(LOG_ERROR, "ERROR: mmap returned unexpected pointer: %p", *pHost); shm_unlink(shm_name); + ret.err = cudaErrorMemoryAllocation; goto out; } - hainfo[hainfo_cnt].cnt = hainfo_cnt; + hainfo[hainfo_cnt].idx = ret.sz_result_u.data; hainfo[hainfo_cnt].size = size; hainfo[hainfo_cnt].client_ptr = *pHost; - - retval_1 = cuda_host_alloc_1(hainfo_cnt, size, (uint64_t)*pHost, flags, &ret, clnt); - if (retval_1 != RPC_SUCCESS) { - clnt_perror (clnt, "call failed"); - } - if (ret == cudaSuccess) { - hainfo_cnt++; - } else { - munmap(*pHost, size); - *pHost = NULL; + hainfo_cnt++; + + retval_1 = cuda_host_alloc_regshm_1(ret.sz_result_u.data, (ptr)*pHost, &reg_ret, clnt); + if (retval_1 != RPC_SUCCESS || ret.err != cudaSuccess) { + LOGE(LOG_ERROR, "cudaHostAlloc failed on server-side."); + goto out; } + shm_unlink(shm_name); } else if (socktype == TCP) { //Use infiniband #ifdef WITH_IB @@ -1240,14 +1276,14 @@ cudaError_t cudaHostAlloc(void** pHost, size_t size, unsigned int flags) LOGE(LOG_ERROR, "failed to register infiniband memory region"); goto out; } - hainfo[hainfo_cnt].cnt = hainfo_cnt; + hainfo[hainfo_cnt].idx = hainfo_cnt; hainfo[hainfo_cnt].size = size; hainfo[hainfo_cnt].client_ptr = *pHost; hainfo_cnt++; retval_1 = RPC_SUCCESS; - ret = cudaSuccess; + ret.err = cudaSuccess; #else LOGE(LOG_DEBUG, "cudaHostAlloc is not supported for TCP transports without IB. 
Using malloc instead..."); @@ -1255,7 +1291,7 @@ cudaError_t cudaHostAlloc(void** pHost, size_t size, unsigned int flags) if (*pHost == NULL) { goto out; } else { - ret = cudaSuccess; + ret.err = cudaSuccess; goto out; } #endif //WITH_IB @@ -1264,7 +1300,8 @@ cudaError_t cudaHostAlloc(void** pHost, size_t size, unsigned int flags) goto out; } out: - return ret; + free(shm_name); + return ret.err; } cudaError_t cudaHostGetDevicePointer(void** pDevice, void* pHost, unsigned int flags) @@ -1528,7 +1565,6 @@ extern char server[256]; #define WITH_MT_MEMCPY cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, enum cudaMemcpyKind kind) { - #ifdef WITH_API_CNT api_call_cnt++; memcpy_cnt += count; @@ -1536,9 +1572,9 @@ cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, enum cudaMemcpy int ret = 1; enum clnt_stat retval; if (kind == cudaMemcpyHostToDevice) { -//get index of mem reg (src: cpu reg memregion) + // get index of mem reg (src: cpu reg memregion) int index = hainfo_getindex((void*)src); -// not a cudaHostAlloc'ed memory + // not a cudaHostAlloc'ed memory if (index == -1) { #ifdef WITH_MT_MEMCPY if (count > 2*MT_MEMCPY_MEM_PER_THREAD) { @@ -1572,7 +1608,7 @@ cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, enum cudaMemcpy #endif //WITH_MT_MEMCPY } else { if (shm_enabled && connection_is_local == 1) { //Use local shared memory - retval = cuda_memcpy_shm_1(index, (ptr)dst, count, kind, &ret, clnt); + retval = cuda_memcpy_shm_1(hainfo[index].idx, (ptr)dst, count, kind, &ret, clnt); } else if (socktype == TCP) { //Use infiniband #ifdef WITH_IB //the following commend connects to serverside cuda_memcpy_ib_1_svc, server thread is initialized waiting for client send @@ -1635,7 +1671,7 @@ cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, enum cudaMemcpy #endif //WITH_MT_MEMCPY } else { if (shm_enabled && connection_is_local) { //Use local shared memory - retval = cuda_memcpy_shm_1(index, (ptr)src, count, kind, &ret, clnt); + retval = cuda_memcpy_shm_1(hainfo[index].idx, (ptr)src, count, kind, &ret, clnt); } else if (socktype == TCP) { //Use infiniband #ifdef WITH_IB pthread_t thread = {0}; @@ -1758,7 +1794,19 @@ cudaError_t cudaMemset2D(void* devPtr, size_t pitch, int value, size_t width, si return result; } -DEF_FN(cudaError_t, cudaMemset2DAsync, void*, devPtr, size_t, pitch, int, value, size_t, width, size_t, height, cudaStream_t, stream) +cudaError_t cudaMemset2DAsync(void* devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval; + retval = cuda_memset_2d_async_1((ptr)devPtr, pitch, value, width, height, (ptr)stream, &result, clnt); + if (retval != RPC_SUCCESS) { + clnt_perror (clnt, "call failed"); + } + return result; +} cudaError_t cudaMemset3D(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent) { @@ -1782,8 +1830,42 @@ cudaError_t cudaMemset3D(struct cudaPitchedPtr pitchedDevPtr, int value, struct return result; } -DEF_FN(cudaError_t, cudaMemset3DAsync, struct cudaPitchedPtr, pitchedDevPtr, int, value, struct cudaExtent, extent, cudaStream_t, stream) -DEF_FN(cudaError_t, cudaMemsetAsync, void*, devPtr, int, value, size_t, count, cudaStream_t, stream) +cudaError_t cudaMemset3DAsync(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval; + retval = 
cuda_memset_3d_async_1(pitchedDevPtr.pitch, + (ptr)pitchedDevPtr.ptr, + pitchedDevPtr.xsize, + pitchedDevPtr.ysize, + value, + extent.depth, + extent.height, + extent.width, + (ptr)stream, + &result, clnt); + if (retval != RPC_SUCCESS) { + clnt_perror (clnt, "call failed"); + } + return result; +} + +cudaError_t cudaMemsetAsync(void* devPtr, int value, size_t count, cudaStream_t stream) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval; + retval = cuda_memset_async_1((ptr)devPtr, value, count, (ptr)stream, &result, clnt); + if (retval != RPC_SUCCESS) { + clnt_perror (clnt, "call failed"); + } + return result; +} DEF_FN(struct cudaExtent, make_cudaExtent, size_t, w, size_t, h, size_t, d) DEF_FN(struct cudaPitchedPtr, make_cudaPitchedPtr, void*, d, size_t, p, size_t, xsz, size_t, ysz) @@ -1907,7 +1989,11 @@ DEF_FN(cudaError_t, cudaGraphGetNodes, cudaGraph_t, graph, cudaGraphNode_t*, nod DEF_FN(cudaError_t, cudaGraphGetRootNodes, cudaGraph_t, graph, cudaGraphNode_t*, pRootNodes, size_t*, pNumRootNodes) DEF_FN(cudaError_t, cudaGraphHostNodeGetParams, cudaGraphNode_t, node, struct cudaHostNodeParams*, pNodeParams) DEF_FN(cudaError_t, cudaGraphHostNodeSetParams, cudaGraphNode_t, node, const struct cudaHostNodeParams*, pNodeParams) +#if CUDART_VERSION >= 12000 +DEF_FN(cudaError_t, cudaGraphInstantiate, cudaGraphExec_t*, pGraphExec, cudaGraph_t, graph, unsigned long long, flags) +#else DEF_FN(cudaError_t, cudaGraphInstantiate, cudaGraphExec_t*, pGraphExec, cudaGraph_t, graph, cudaGraphNode_t*, pErrorNode, char*, pLogBuffer, size_t, bufferSize) +#endif DEF_FN(cudaError_t, cudaGraphKernelNodeGetParams, cudaGraphNode_t, node, struct cudaKernelNodeParams*, pNodeParams) DEF_FN(cudaError_t, cudaGraphKernelNodeSetParams, cudaGraphNode_t, node, const struct cudaKernelNodeParams*, pNodeParams) DEF_FN(cudaError_t, cudaGraphLaunch, cudaGraphExec_t, graphExec, cudaStream_t, stream) @@ -1920,6 +2006,33 @@ DEF_FN(cudaError_t, cudaGraphNodeGetDependencies, cudaGraphNode_t, node, cudaGra DEF_FN(cudaError_t, cudaGraphNodeGetDependentNodes, cudaGraphNode_t, node, cudaGraphNode_t*, pDependentNodes, size_t*, pNumDependentNodes) DEF_FN(cudaError_t, cudaGraphNodeGetType, cudaGraphNode_t, node, enum cudaGraphNodeType*, pType) DEF_FN(cudaError_t, cudaGraphRemoveDependencies, cudaGraph_t, graph, const cudaGraphNode_t*, from, const cudaGraphNode_t*, to, size_t, numDependencies) -DEF_FN(cudaError_t, cudaProfilerInitialize, const char*, configFile, const char*, outputFile, cudaOutputMode_t, outputMode) DEF_FN(cudaError_t, cudaProfilerStart, void) DEF_FN(cudaError_t, cudaProfilerStop, void) + +cudaError_t cudaProfilerStart(void) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval; + retval = cuda_profiler_start_1(&result, clnt); + if (retval != RPC_SUCCESS) { + clnt_perror (clnt, "call failed"); + } + return result; +} + +cudaError_t cudaProfilerStop(void) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval; + retval = cuda_profiler_stop_1(&result, clnt); + if (retval != RPC_SUCCESS) { + clnt_perror (clnt, "call failed"); + } + return result; +} \ No newline at end of file diff --git a/cpu/cpu-client.c b/cpu/cpu-client.c index 45f92d51..c4bc68d1 100644 --- a/cpu/cpu-client.c +++ b/cpu/cpu-client.c @@ -1,30 +1,32 @@ #define _GNU_SOURCE -#include -#include #include #include +#include +#include +#include -//For TCP socket -#include -#include +// For TCP socket #include 
#include +#include +#include -#include "cpu-libwrap.h" -#include "cpu_rpc_prot.h" #include "cpu-common.h" +#include "cpu-libwrap.h" #include "cpu-utils.h" +#include "cpu_rpc_prot.h" #include "list.h" +#include "cpu-elf2.h" #ifdef WITH_IB #include "cpu-ib.h" -#endif //WITH_IB +#endif // WITH_IB -//static const char* LIBCUDA_PATH = "/lib64/libcuda.so"; -const char* LIBCUDA_PATH = "/usr/local/cuda/lib64/libcudart.so"; +// static const char* LIBCUDA_PATH = "/lib64/libcuda.so"; +const char *LIBCUDA_PATH = "/usr/local/cuda/lib64/libcudart.so"; CLIENT *clnt = NULL; -list kernel_infos = {0}; +list kernel_infos = { 0 }; char server[256]; @@ -34,30 +36,33 @@ int shm_enabled = 1; int initialized = 0; #ifdef WITH_IB - int ib_device = 0; -#endif //WITH_IB +int ib_device = 0; +#endif // WITH_IB #ifdef WITH_API_CNT extern void cpu_runtime_print_api_call_cnt(void); -#endif //WITH_API_CNT +#endif // WITH_API_CNT static void rpc_connect(void) { int isock; - struct sockaddr_un sock_un = {0}; - struct sockaddr_in sock_in = {0}; - struct sockaddr_in local_addr = {0}; + struct sockaddr_un sock_un = { 0 }; + struct sockaddr_in sock_in = { 0 }; + struct sockaddr_in local_addr = { 0 }; struct hostent *hp; socklen_t sockaddr_len = sizeof(struct sockaddr_in); - unsigned long prog=0, vers=0; + unsigned long prog = 0, vers = 0; char envvar[] = "REMOTE_GPU_ADDRESS"; - if(!getenv(envvar)) { - LOG(LOG_ERROR, "Environment variable %s does not exist. It must contain the address where the server application is listening.", envvar); + if (!getenv(envvar)) { + LOG(LOG_ERROR, + "Environment variable %s does not exist. It must contain the " + "address where the server application is listening.", + envvar); exit(1); } - if(strncpy(server, getenv(envvar), 256) == NULL) { + if (strncpy(server, getenv(envvar), 256) == NULL) { LOGE(LOG_ERROR, "strncpy failed."); exit(1); } @@ -65,23 +70,24 @@ static void rpc_connect(void) #ifdef WITH_IB - if(getenv("IB_DEVICE_ID")) { + if (getenv("IB_DEVICE_ID")) { ib_device = atoi(getenv("IB_DEVICE_ID")); } LOG(LOG_INFO, "Using IB device: %d.", ib_device); -#endif //WITH_IB +#endif // WITH_IB - LOGE(LOG_INFO, "test\n"); - if(getenv("CRICKET_NOHASH")) { - prog=99; - vers=1; - } else if (cpu_utils_md5hash("/proc/self/exe", &prog, &vers) != 0) { - LOGE(LOG_ERROR, "error while creating binary checksum"); - exit(0); + prog = 99; + vers = 1; + const char *env_vers = getenv("CRICKET_RPCID"); + if (env_vers != NULL) { + if (sscanf(env_vers, "%lu", &vers) != 1) { + LOGE(LOG_ERROR, "error parsing CRICKET_RPCID"); + exit(1); + } } - char* cmd = NULL; + char *cmd = NULL; if (cpu_utils_command(&cmd) != 0) { LOGE(LOG_ERROR, "error getting command"); } else { @@ -109,18 +115,19 @@ static void rpc_connect(void) LOGE(LOG_ERROR, "error resolving hostname: %s", server); exit(1); } - sock_in.sin_addr = *(struct in_addr*)hp->h_addr; - //inet_aton("137.226.133.199", &sock_in.sin_addr); + sock_in.sin_addr = *(struct in_addr *)hp->h_addr; + // inet_aton("137.226.133.199", &sock_in.sin_addr); clnt = clnttcp_create(&sock_in, prog, vers, &isock, 0, 0); getsockname(isock, &local_addr, &sockaddr_len); - connection_is_local = (local_addr.sin_addr.s_addr == sock_in.sin_addr.s_addr); + connection_is_local = + (local_addr.sin_addr.s_addr == sock_in.sin_addr.s_addr); break; case UDP: - /* From RPCEGEN documentation: + /* From RPCEGEN documentation: * Warning: since UDP-based RPC messages can only hold up to 8 Kbytes - * of encoded data, this transport cannot be used for procedures that - * take large arguments or return huge 
results. + * of encoded data, this transport cannot be used for procedures that + * take large arguments or return huge results. * -> Sounds like UDP does not make sense for CUDA, because we need to * be able to copy large memory chunks **/ @@ -130,11 +137,12 @@ static void rpc_connect(void) if (clnt == NULL) { clnt_pcreateerror("[rpc] Error"); - exit (1); + exit(1); } } -static void repair_connection(int signo) { +static void repair_connection(int signo) +{ enum clnt_stat retval_1; int result_1; /*LOGE(LOG_INFO, "Trying connection..."); @@ -154,13 +162,14 @@ static void repair_connection(int signo) { } } -void __attribute__ ((constructor)) init_rpc(void) +void __attribute__((constructor)) init_rpc(void) { enum clnt_stat retval_1; int result_1; int_result result_2; char *printmessage_1_arg1 = "hello"; + LOG(LOG_DBG(1), "log level is %d", LOG_LEVEL); init_log(LOG_LEVEL, __FILE__); rpc_connect(); @@ -172,24 +181,29 @@ void __attribute__ ((constructor)) init_rpc(void) retval_1 = rpc_printmessage_1(printmessage_1_arg1, &result_1, clnt); if (retval_1 != RPC_SUCCESS) { - clnt_perror (clnt, "call failed"); + clnt_perror(clnt, "call failed"); } if (list_init(&kernel_infos, sizeof(kernel_info_t)) != 0) { LOGE(LOG_ERROR, "list init failed."); } - if (cpu_utils_parameter_info(&kernel_infos, "/proc/self/exe") != 0) { - LOG(LOG_ERROR, "error while getting parameter size. Check whether cuobjdump binary is in PATH! Trying anyway (will only work if there is no kernel in this binary)"); + if (elf2_init() != 0) { + LOGE(LOG_ERROR, "libelf init failed"); } + + // if (cpu_utils_parameter_info(&kernel_infos, "/proc/self/exe") != 0) { + // LOG(LOG_ERROR, "error while getting parameter size. Check whether " + // "cuobjdump binary is in PATH! Trying anyway (will only " + // "work if there is no kernel in this binary)"); + // } #ifdef WITH_IB if (ib_init(ib_device, server) != 0) { LOG(LOG_ERROR, "initilization of infiniband verbs failed."); } -#endif //WITH_IB - +#endif // WITH_IB } -void __attribute__ ((destructor)) deinit_rpc(void) +void __attribute__((destructor)) deinit_rpc(void) { enum clnt_stat retval_1; int result; @@ -202,151 +216,210 @@ void __attribute__ ((destructor)) deinit_rpc(void) list_free(&kernel_infos); #ifdef WITH_API_CNT cpu_runtime_print_api_call_cnt(); -#endif //WITH_API_CNT +#endif // WITH_API_CNT } if (clnt != NULL) { - clnt_destroy (clnt); + clnt_destroy(clnt); } } -void __cudaRegisterVar(void **fatCubinHandle, char *hostVar, char *deviceAddress, const char *deviceName, int ext, size_t size, int constant, int global) -{ -} -void __cudaRegisterFunction(void **fatCubinHandle, const char *hostFun, char *deviceFun, - const char *deviceName, int thread_limit, uint3 *tid, - uint3 *bid, dim3 *bDim, dim3 *gDim, int *wSize) +static void *(*dlopen_orig)(const char *, int) = NULL; +static int (*dlclose_orig)(void *) = NULL; +static void *dl_handle = NULL; + +void *dlopen(const char *filename, int flag) { - int result; - enum clnt_stat retval_1; + void *ret = NULL; + struct link_map *map; + int has_kernel = 0; + LOG(LOG_DBG(1), "intercepted dlopen(%s, %d)", filename, flag); + + if (filename == NULL) { + return dlopen_orig(filename, flag); + } - printf("__cudaRegisterFunction(fatCubinHandle=%p, hostFun=%p, devFunc=%s, deviceName=%s, thread_limit=%d, tid=[%p], bid=[%p], bDim=[%p], gDim=[%p], wSize=%p)\n", fatCubinHandle, hostFun, deviceFun, deviceName, thread_limit, tid, bid, bDim, gDim, wSize); + if (dlopen_orig == NULL) { + if ((dlopen_orig = dlsym(RTLD_NEXT, "dlopen")) == NULL) { + LOGE(LOG_ERROR, 
"[dlopen] dlsym failed"); + } + } - kernel_info_t *info = cricketd_utils_search_info(&kernel_infos, (char*)deviceName); - if (info == NULL) { - LOGE(LOG_ERROR, "request to register unknown function: \"%s\"", deviceName); - retval_1 = cuda_register_function_1((ptr)fatCubinHandle, (ptr)hostFun, deviceFun, (char*)deviceName, thread_limit, &result, clnt); - if (retval_1 != RPC_SUCCESS) { - LOGE(LOG_ERROR, "call failed."); + static const char *replace_libs[] = { + "libcuda.so.1", + "libcuda.so", + "libnvidia-ml.so.1", + "libcudnn_cnn_infer.so.8" + }; + static const size_t replace_libs_sz = sizeof(replace_libs) / sizeof(char *); + if (filename != NULL) { + for (size_t i=0; i != replace_libs_sz; ++i) { + if (strcmp(filename, replace_libs[i]) == 0) { + LOG(LOG_DEBUG, "replacing dlopen call to %s with cricket-client.so", filename); + dl_handle = dlopen_orig("cricket-client.so", flag); + if (clnt == NULL) { + LOGE(LOG_ERROR, "rpc seems to be uninitialized"); + } + return dl_handle; + } } + } + /* filename is NULL or not in replace_libs list */ + if ((ret = dlopen_orig(filename, flag)) == NULL) { + LOGE(LOG_ERROR, "dlopen failed: ", dlerror()); + } else if (has_kernel) { + dlinfo(ret, RTLD_DI_LINKMAP, &map); + LOGE(LOG_DEBUG, "dlopen to %p", map->l_addr); + } + return ret; +} - return; +int dlclose(void *handle) +{ + if (handle == NULL) { + LOGE(LOG_ERROR, "[dlclose] handle NULL"); + return -1; + } else if (dlclose_orig == NULL) { + if ((dlclose_orig = dlsym(RTLD_NEXT, "dlclose")) == NULL) { + LOGE(LOG_ERROR, "[dlclose] dlsym failed"); + } } - info->host_fun = (void*)hostFun; - if (retval_1 != RPC_SUCCESS) { - clnt_perror (clnt, "call failed"); + // Ignore dlclose call that would close this library + if (dl_handle == handle) { + LOGE(LOG_DEBUG, "[dlclose] ignore close"); + return 0; + } else { + return dlclose_orig(handle); } } -struct __fatCubin { - uint32_t magic; - uint32_t seq; - uint64_t text; - uint64_t data; - uint64_t ptr; - uint64_t ptr2; - uint64_t zero; -}; - -struct rpc_fatCubin { - uint32_t magic; - uint32_t seq; - uint64_t text; - uint64_t data; - uint64_t ptr; - uint64_t ptr2; - uint64_t zero; -}; - -void** __cudaRegisterFatBinary(void *fatCubin) +void __cudaRegisterVar(void **fatCubinHandle, char *hostVar, char *deviceAddress, + const char *deviceName, int ext, size_t size, int constant, + int global); + +void __cudaRegisterVar(void **fatCubinHandle, char *hostVar, char *deviceAddress, + const char *deviceName, int ext, size_t size, int constant, + int global) { - ptr_result result; enum clnt_stat retval_1; - - struct __fatCubin *fat = (struct __fatCubin*)((fatCubin)); - struct rpc_fatCubin rpc_fat = {.magic = fat->magic, - .seq = fat->seq, - .text = fat->text, - .data = fat->data, - .ptr = fat->ptr, - .ptr2 = fat->ptr2, - .zero = fat->zero}; - LOGE(LOG_DEBUG, "__cudaRegisterFatBinary"); - //printf("__cudaRegisterFatBinary(magic: %x, seq: %x, text: %lx, data: %lx, ptr: %lx, ptr2: %lx, zero: %lx\n", - // fat->magic, fat->seq, fat->text, fat->data, fat->ptr, fat->ptr2, fat->zero); - retval_1 = RPC_SUCCESS;//cuda_register_fat_binary_1(rpc_fat, &result, clnt); + int result; + LOGE(LOG_DEBUG, "__cudaRegisterVar(fatCubinHandle=%p, hostVar=%p, deviceAddress=%p, " + "deviceName=%s, ext=%d, size=%zu, constant=%d, global=%d)\n", + fatCubinHandle, hostVar, deviceAddress, deviceName, ext, size, constant, global); + retval_1 = rpc_register_var_1((ptr)fatCubinHandle, (ptr)hostVar, (ptr)deviceAddress, (char*)deviceName, ext, size, constant, global, + &result, clnt); if (retval_1 != RPC_SUCCESS) { - 
clnt_perror (clnt, "call failed"); + LOGE(LOG_ERROR, "call failed."); } - if (result.err != 0) { - return NULL; - } - return (void*)result.ptr_result_u.ptr; } -void __cudaRegisterFatBinaryEnd(void **fatCubinHandle) +void __cudaRegisterFunction(void **fatCubinHandle, const char *hostFun, + char *deviceFun, const char *deviceName, + int thread_limit, uint3 *tid, uint3 *bid, + dim3 *bDim, dim3 *gDim, int *wSize) { - int result; + ptr_result result; enum clnt_stat retval_1; - //printf("__cudaRegisterFatBinaryEnd(fatCubinHandle=%p)\n", fatCubinHandle); + LOGE(LOG_DEBUG, "__cudaRegisterFunction(fatCubinHandle=%p, hostFun=%p, devFunc=%s, " + "deviceName=%s, thread_limit=%d, tid=[%p], bid=[%p], bDim=[%p], " + "gDim=[%p], wSize=%p)\n", + fatCubinHandle, hostFun, deviceFun, deviceName, thread_limit, tid, + bid, bDim, gDim, wSize); - retval_1 = RPC_SUCCESS;//cuda_register_fat_binary_end_1((uint64_t)fatCubinHandle, &result, clnt); - if (retval_1 != RPC_SUCCESS) { - clnt_perror (clnt, "call failed"); + kernel_info_t *info = utils_search_info(&kernel_infos, (char *)deviceName); + if (info == NULL) { + LOGE(LOG_ERROR, "request to register unknown function: \"%s\"", + deviceName); + return; + } else { + LOGE(LOG_DEBUG, "request to register known function: \"%s\"", + deviceName); + retval_1 = rpc_register_function_1((ptr)fatCubinHandle, (ptr)hostFun, + deviceFun, (char*)deviceName, thread_limit, + &result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "call failed."); + exit(1); + } + if (result.err != 0) { + LOGE(LOG_ERROR, "error registering function: %d", result.err); + exit(1); + } + info->host_fun = (void *)hostFun; } } -static void *(*dlopen_orig)(const char *, int) = NULL; -static int (*dlclose_orig)(void *) = NULL; -static void *dl_handle = NULL; -void *dlopen(const char *filename, int flag) +void **__cudaRegisterFatBinary(void *fatCubin) { - LOG(LOG_DEBUG, "intercepted dlopen(%s, %d)", filename, flag); - if (dlopen_orig == NULL) { - if ( (dlopen_orig = dlsym(RTLD_NEXT, "dlopen")) == NULL) { - LOGE(LOG_ERROR, "[dlopen] dlsym failed"); - } + void **result; + int rpc_result; + enum clnt_stat retval_1; + size_t fatbin_size; + LOGE(LOG_DEBUG, "__cudaRegisterFatBinary(fatCubin=%p)", fatCubin); + + mem_data rpc_fat = { .mem_data_len = 0, .mem_data_val = NULL }; + + if (elf2_get_fatbin_info((struct fat_header *)fatCubin, + &kernel_infos, + (uint8_t **)&rpc_fat.mem_data_val, + &fatbin_size) != 0) { + LOGE(LOG_ERROR, "error getting fatbin info"); + return NULL; } + rpc_fat.mem_data_len = fatbin_size; - if (filename != NULL && strcmp(filename, "libcuda.so.1") == 0) { - LOG(LOG_DEBUG, "replacing dlopen call to cuda driver library with cricket-client.so"); - dl_handle = dlopen_orig("cricket-client.so", flag); - if (clnt == NULL) { - LOGE(LOG_ERROR, "rpc seems to be uninitialized"); - } - return dl_handle; - } else { - LOGE(LOG_DEBUG, "request to dlopen \"%s\"", filename); - if (cpu_utils_contains_kernel(filename) == 0) { - LOGE(LOG_ERROR, "file does not contain a kernel"); - } - return dlopen_orig(filename, flag); + // CUDA registers an atexit handler for fatbin cleanup that accesses + // the fatbin data structure. Let's allocate some zeroes to avoid segfaults. 
+ result = (void**)calloc(1, 0x58); + + retval_1 = rpc_elf_load_1(rpc_fat, (ptr)result, &rpc_result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "call failed."); + } + if (rpc_result != 0) { + LOGE(LOG_ERROR, "error registering fatbin: %d", rpc_result); + return NULL; } + LOG(LOG_DEBUG, "fatbin loaded to %p", result); + // we return a bunch of zeroes to avoid segfaults. The memory is + // mapped by the modules resource + return result; } -int dlclose(void *handle) -{ - if (handle == NULL) { - LOGE(LOG_ERROR, "[dlclose] handle NULL"); - return -1; - } else if (dlclose_orig == NULL) { - if ( (dlclose_orig = dlsym(RTLD_NEXT, "dlclose")) == NULL) { - LOGE(LOG_ERROR, "[dlclose] dlsym failed"); - } - } +void __cudaUnregisterFatBinary(void **fatCubinHandle) +{ + int result; + enum clnt_stat retval_1; - // Ignore dlclose call that would close this library - if (dl_handle == handle) { - LOGE(LOG_DEBUG, "[dlclose] ignore close"); - return 0; - } else { - return dlclose_orig(handle); + LOGE(LOG_DEBUG, "__cudaUnregisterFatBinary(fatCubinHandle=%p)", + fatCubinHandle); + + if (fatCubinHandle == NULL) { + LOGE(LOG_WARNING, "fatCubinHandle is NULL - so we have nothing to unload. (This is okay if this binary does not contain a kernel.)"); + return; } + // retval_1 = rpc_elf_unload_1((ptr)fatCubinHandle, &result, clnt); + // if (retval_1 != RPC_SUCCESS || result != 0) { + // LOGE(LOG_ERROR, "call failed."); + // } } +// void __cudaRegisterFatBinaryEnd(void **fatCubinHandle) +// { +// int result; +// enum clnt_stat retval_1; + +// //printf("__cudaRegisterFatBinaryEnd(fatCubinHandle=%p)\n", +// fatCubinHandle); +// retval_1 = +// RPC_SUCCESS;//cuda_register_fat_binary_end_1((uint64_t)fatCubinHandle, +// &result, clnt); if (retval_1 != RPC_SUCCESS) { +// clnt_perror (clnt, "call failed"); +// } +// } diff --git a/cpu/cpu-elf2.c b/cpu/cpu-elf2.c new file mode 100644 index 00000000..89fcb24a --- /dev/null +++ b/cpu/cpu-elf2.c @@ -0,0 +1,999 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpu-common.h" +#include "log.h" +#include "cpu-elf2.h" +#include "cpu-utils.h" + +#define uint16_t unsigned short +#define CRICKET_ELF_NV_INFO_PREFIX ".nv.info" +#define CRICKET_ELF_NV_SHARED_PREFIX ".nv.shared." +#define CRICKET_ELF_NV_TEXT_PREFIX ".nv.text." +#define CRICKET_ELF_TEXT_PREFIX ".text." + +#define CRICKET_ELF_FATBIN ".nv_fatbin" +#define CRICKET_ELF_REGFUN "_ZL24__sti____cudaRegisterAllv" + +#define FATBIN_STRUCT_MAGIC 0x466243b1 +#define FATBIN_TEXT_MAGIC 0xBA55ED50 + +struct __attribute__((__packed__)) fat_elf_header +{ + uint32_t magic; + uint16_t version; + uint16_t header_size; + uint64_t size; +}; +struct __attribute__((__packed__)) fat_text_header +{ + uint16_t kind; + uint16_t unknown1; + uint32_t header_size; + uint64_t size; + uint32_t compressed_size; // Size of compressed data + uint32_t unknown2; // Address size for PTX? + uint16_t minor; + uint16_t major; + uint32_t arch; + uint32_t obj_name_offset; + uint32_t obj_name_len; + uint64_t flags; + uint64_t zero; // Alignment for compression? + uint64_t decompressed_size; // Length of compressed data in decompressed representation. + // There is an uncompressed footer so this is generally smaller + // than size. 
+}; + +#define FATBIN_FLAG_64BIT 0x0000000000000001LL +#define FATBIN_FLAG_DEBUG 0x0000000000000002LL +#define FATBIN_FLAG_LINUX 0x0000000000000010LL +#define FATBIN_FLAG_COMPRESS 0x0000000000002000LL + +int elf2_init(void) +{ + if (elf_version(EV_CURRENT) == EV_NONE) { + LOGE(LOG_ERROR, "ELF library initialization failed: %s", elf_errmsg(-1)); + return -1; + } + return 0; +} + +static int flag_to_str(char** str, uint64_t flag) +{ + return asprintf(str, "64Bit: %s, Debug: %s, Linux: %s, Compress %s", + (flag & FATBIN_FLAG_64BIT) ? "yes" : "no", + (flag & FATBIN_FLAG_DEBUG) ? "yes" : "no", + (flag & FATBIN_FLAG_LINUX) ? "yes" : "no", + (flag & FATBIN_FLAG_COMPRESS) ? "yes" : "no"); +} + +static void print_header(struct fat_text_header *th) +{ + char* flagstr = NULL; + flag_to_str(&flagstr, th->flags); + + LOGE(LOG_DBG(1), "text_header: fatbin_kind: %#x, header_size %#x, size %#zx, compressed_size %#x,\ + minor %#x, major %#x, arch %d, decompressed_size %#zx\n\tflags: %s\n", + th->kind, + th->header_size, + th->size, + th->compressed_size, + th->minor, + th->major, + th->arch, + th->decompressed_size, + flagstr); + LOGE(LOG_DBG(1), "\tunknown fields: unknown1: %#x, unknown2: %#x, zeros: %#zx\n", + th->unknown1, + th->unknown2, + th->zero); + + free(flagstr); +} + +/** Check the header of a fatbin + * Performs some integrity checks and returns the elf header + * @param fatbin_data Pointer to the fatbin data + * @param fatbin_size Size of the fatbin data + * @param decompressed_size Pointer to a variable that will be set to the size of the decompressed data + * @param compressed_data Pointer to a variable that will be set to point to the compressed data +*/ +static int get_elf_header(const uint8_t* fatbin_data, size_t fatbin_size, struct fat_elf_header **elf_header) +{ + struct fat_elf_header *eh = NULL; + + if (fatbin_data == NULL || elf_header == NULL) { + LOGE(LOG_ERROR, "fatbin_data is NULL"); + return 1; + } + + if (fatbin_size < sizeof(struct fat_elf_header)) { + LOGE(LOG_ERROR, "fatbin_size is too small"); + return 1; + } + + eh = (struct fat_elf_header*) fatbin_data; + if (eh->magic != FATBIN_TEXT_MAGIC) { + LOGE(LOG_ERROR, "Invalid magic number: expected %#x but got %#x", FATBIN_TEXT_MAGIC, eh->magic); + return 1; + } + + if (eh->version != 1 || eh->header_size != sizeof(struct fat_elf_header)) { + LOGE(LOG_ERROR, "fatbin text version is wrong or header size is inconsistent.\ + This is a sanity check to avoid reading a new fatbinary format"); + return 1; + } + + *elf_header = eh; + return 0; +} + +/** Check the text header of a fatbin + * Performs some integrity checks and returns the text header + * @param fatbin_data Pointer to the fatbin data + * @param fatbin_size Size of the fatbin data + * @param decompressed_size Pointer to a variable that will be set to the size of the decompressed data + * @param compressed_data Pointer to a variable that will be set to point to the compressed data +*/ +static int get_text_header(const uint8_t* fatbin_data, size_t fatbin_size, struct fat_text_header **text_header) +{ + struct fat_text_header *th = NULL; + + if (fatbin_data == NULL || text_header == NULL) { + LOGE(LOG_ERROR, "fatbin_data is NULL"); + return 1; + } + + if (fatbin_size < sizeof(struct fat_text_header)) { + LOGE(LOG_ERROR, "fatbin_size is too small"); + return 1; + } + + th = (struct fat_text_header*)fatbin_data; + + if(th->obj_name_offset != 0) { + if (((char*)th)[th->obj_name_offset + th->obj_name_len] != '\0') { + LOGE(LOG_WARNING, "Fatbin object name is not null terminated"); 
+ } else { + char *obj_name = (char*)th + th->obj_name_offset; + LOGE(LOG_DEBUG, "Fatbin object name: %s (len:%#x)", obj_name, th->obj_name_len); + } + } + + *text_header = th; + return 0; +} + +/** Decompresses a fatbin file + * @param input Pointer compressed input data + * @param input_size Size of compressed data + * @param output preallocated memory where decompressed output should be stored + * @param output_size size of output buffer. Should be equal to the size of the decompressed data + */ +static size_t decompress(const uint8_t* input, size_t input_size, uint8_t* output, size_t output_size) +{ + size_t ipos = 0, opos = 0; + uint64_t next_nclen; // length of next non-compressed segment + uint64_t next_clen; // length of next compressed segment + uint64_t back_offset; // negative offset where redudant data is located, relative to current opos + + while (ipos < input_size) { + next_nclen = (input[ipos] & 0xf0) >> 4; + next_clen = 4 + (input[ipos] & 0xf); + if (next_nclen == 0xf) { + do { + next_nclen += input[++ipos]; + } while (input[ipos] == 0xff); + } + + if (memcpy(output + opos, input + (++ipos), next_nclen) == NULL) { + LOGE(LOG_ERROR, "copying data"); + return 0; + } +#ifdef FATBIN_DECOMPRESS_DEBUG + printf("%#04zx nocompress (len:%#x):\n", opos, next_nclen); + hexdump(output + opos, next_nclen); +#endif + ipos += next_nclen; + opos += next_nclen; + if (ipos >= input_size || opos >= output_size) { + break; + } + back_offset = input[ipos] + (input[ipos + 1] << 8); + ipos += 2; + if (next_clen == 0xf+4) { + do { + next_clen += input[ipos++]; + } while (input[ipos - 1] == 0xff); + } +#ifdef FATBIN_DECOMPRESS_DEBUG + printf("%#04zx compress (decompressed len: %#x, back_offset %#x):\n", opos, next_clen, back_offset); +#endif + if (next_clen <= back_offset) { + if (memcpy(output + opos, output + opos - back_offset, next_clen) == NULL) { + LOGE(LOG_ERROR, "Error copying data"); + return 0; + } + } else { + if (memcpy(output + opos, output + opos - back_offset, back_offset) == NULL) { + LOGE(LOG_ERROR, "Error copying data"); + return 0; + } + for (size_t i = back_offset; i < next_clen; i++) { + output[opos + i] = output[opos + i - back_offset]; + } + } +#ifdef FATBIN_DECOMPRESS_DEBUG + hexdump(output + opos, next_clen); +#endif + opos += next_clen; + } + LOGE(LOG_DEBUG, "ipos: %#zx, opos: %#zx, ilen: %#zx, olen: %#zx", ipos, opos, input_size, output_size); + return opos; +} + +static ssize_t decompress_section(const uint8_t *input, uint8_t **output, size_t *output_size, + struct fat_elf_header *eh, struct fat_text_header *th, size_t *eh_out_offset) +{ + struct fat_text_header *th_out = NULL; + struct fat_elf_header *eh_out = NULL; + uint8_t *output_pos = 0; + size_t padding; + size_t input_read = 0; + const uint8_t zeroes[6] = {0}; + + if (output == NULL || output_size == NULL || eh == NULL || th == NULL || eh_out_offset == NULL) { + LOGE(LOG_ERROR, "invalid parameters"); + return 1; + } + + if ((*output = realloc(*output, *output_size + th->decompressed_size + eh->header_size + th->header_size)) == NULL) { + LOGE(LOG_ERROR, "Error allocating memory of size %#zx for output buffer: %s", + *output_size + th->decompressed_size + eh->header_size + th->header_size, strerror(errno)); + goto error; + } + output_pos = *output + *output_size; + *output_size += th->decompressed_size + th->header_size; + + if (input == (uint8_t*)eh + eh->header_size + th->header_size) { // We are at the first section + if (memcpy(output_pos, eh, eh->header_size) == NULL) { + LOGE(LOG_ERROR, "Error copying 
data"); + goto error; + } + eh_out = ((struct fat_elf_header*)(output_pos)); + eh_out->size = 0; + *eh_out_offset = output_pos - *output; + output_pos += eh->header_size; + *output_size += eh->header_size; + } + eh_out = ((struct fat_elf_header*)(*output + *eh_out_offset)); // repair pointer in case realloc moved the buffer + eh_out->size += th->decompressed_size + th->header_size; // set size + + if (memcpy(output_pos, th, th->header_size) == NULL) { + LOGE(LOG_ERROR, "Error copying data"); + goto error; + } + th_out = ((struct fat_text_header*)output_pos); + th_out->flags &= ~FATBIN_FLAG_COMPRESS; // clear compressed flag + th_out->compressed_size = 0; // clear compressed size + th_out->decompressed_size = 0; // clear decompressed size + th_out->size = th->decompressed_size; // set size + + output_pos += th->header_size; + + if (decompress(input, th->compressed_size, output_pos, th->decompressed_size) != th->decompressed_size) { + LOGE(LOG_ERROR, "Decompression failed"); + goto error; + } + + input_read += th->compressed_size; + output_pos += th->decompressed_size; + + // if (input_pos != (uint8_t*)th + eh->size) { + // printf("There is %#zx bytes of data remaining\n", (uint8_t*)th + eh->size - input_pos); + // } + + padding = (8 - (size_t)(input + input_read) % 8); + if (memcmp(input + input_read, zeroes, padding) != 0) { + LOGE(LOG_ERROR, "expected %#zx zero bytes, got:", padding); + hexdump(input + input_read, 0x60); + goto error; + } + input_read += padding; + + padding = ((8 - (size_t)th->decompressed_size) % 8); + // Because we always allocated enough memory for one more elf_header and this is smaller than + // the maximal padding of 7, we do not have to reallocate here. + memset(output_pos, 0, padding); + *output_size += padding; + eh_out->size += padding; + th_out->size += padding; + + return input_read; + error: + free(*output); + *output = NULL; + return -1; +} + +static ssize_t decompress_single_section(const uint8_t *input, uint8_t **output, size_t *output_size, + struct fat_elf_header *eh, struct fat_text_header *th) +{ + size_t padding; + size_t input_read = 0; + size_t output_written = 0; + size_t decompress_ret = 0; + const uint8_t zeroes[8] = {0}; + + if (input == NULL || output == NULL || eh == NULL || th == NULL) { + LOGE(LOG_ERROR, "invalid parameters"); + return 1; + } + + // add max padding of 7 bytes + if ((*output = malloc(th->decompressed_size + 7)) == NULL) { + LOGE(LOG_ERROR, "Error allocating memory of size %#zx for output buffer: %s", + th->decompressed_size, strerror(errno)); + goto error; + } + print_header(th); + + if ((decompress_ret = decompress(input, th->compressed_size, *output, th->decompressed_size)) != th->decompressed_size) { + LOGE(LOG_ERROR, "Decompression failed: decompressed size is %#zx, but header says %#zx", + decompress_ret, th->decompressed_size); + LOGE(LOG_ERROR, "input pos: %#zx, output pos: %#zx", input - (uint8_t*)eh, *output); + hexdump(input, 0x160); + if (decompress_ret >= 0x60) + hexdump((*output) + decompress_ret - 0x60, 0x60); + goto error; + } + input_read += th->compressed_size; + output_written += th->decompressed_size; + + padding = ((8 - (size_t)(input + input_read)) % 8); + if (memcmp(input + input_read, zeroes, padding) != 0) { + LOGE(LOG_ERROR, "expected %#zx zero bytes, got:", padding); + hexdump(input + input_read, 0x60); + goto error; + } + input_read += padding; + + padding = ((8 - (size_t)th->decompressed_size) % 8); + // Because we always allocated enough memory for one more elf_header and this is smaller than + 
// the maximal padding of 7, we do not have to reallocate here. + memset(*output, 0, padding); + output_written += padding; + + *output_size = output_written; + return input_read; + error: + free(*output); + *output = NULL; + return -1; +} + +/** Decompresses a fatbin file + * @param fatbin_data Pointer to the fatbin data + * @param fatbin_size Size of the fatbin data + * @param decompressed_data Pointer to a variable that will be set to point to the decompressed data + * @param decompressed_size Pointer to a variable that will be set to the size of the decompressed data + */ +static size_t decompress_fatbin(const uint8_t* fatbin_data, size_t fatbin_size, uint8_t** decompressed_data) +{ + struct fat_elf_header *eh = NULL; + size_t eh_out_offset = 0; + struct fat_text_header *th = NULL; + const uint8_t *input_pos = fatbin_data; + + int i = 0; + uint8_t *output = NULL; + size_t output_size = 0; + ssize_t input_read; + + if (fatbin_data == NULL || decompressed_data == NULL) { + LOGE(LOG_ERROR, "fatbin_data is NULL"); + goto error; + } + + while (input_pos < fatbin_data + fatbin_size) { + if (get_elf_header(input_pos, fatbin_size - (input_pos - fatbin_data), &eh) != 0) { + LOGE(LOG_ERROR, "Something went wrong while checking the header."); + goto error; + } + // printf("elf header no. %d: magic: %#x, version: %#x, header_size: %#x, size: %#zx\n", + // i++, eh->magic, eh->version, eh->header_size, eh->size); + input_pos += eh->header_size; + do { + if (get_text_header(input_pos, fatbin_size - (input_pos - fatbin_data) - eh->header_size, &th) != 0) { + LOGE(LOG_ERROR, "Something went wrong while checking the header."); + goto error; + } + //print_header(th); + input_pos += th->header_size; + + if ((input_read = decompress_section(input_pos, &output, &output_size, eh, th, &eh_out_offset)) < 0) { + LOGE(LOG_ERROR, "Something went wrong while decompressing text section."); + goto error; + } + input_pos += input_read; + + } while (input_pos < (uint8_t*)eh + eh->header_size + eh->size); + + //printf("##### Decompressed data (size %#zx): #####\n", th->decompressed_size); + //hexdump(output_pos, th->decompressed_size); + } + + *decompressed_data = output; + return output_size; + error: + if (output != NULL) { + free(output); + } + *decompressed_data = NULL; + return 0; +} + +int elf2_get_fatbin_info(const struct fat_header *fatbin, list *kernel_infos, uint8_t** fatbin_mem, size_t* fatbin_size) +{ + struct fat_elf_header* eh; + struct fat_text_header* th; + const uint8_t *input_pos = NULL; + const uint8_t *fatbin_data = NULL; + uint8_t *text_data = NULL; + size_t text_data_size = 0; + size_t fatbin_total_size = 0; + int ret = -1; + if (fatbin == NULL || fatbin_mem == NULL || fatbin_size == NULL) { + LOGE(LOG_ERROR, "at least one parameter is NULL"); + goto error; + } + fatbin_data = input_pos = (const uint8_t*)fatbin->text; + if (fatbin->magic != FATBIN_STRUCT_MAGIC) { + LOGE(LOG_ERROR, "fatbin struct magic number is wrong. 
Got %llx, expected %llx.", fatbin->magic, FATBIN_STRUCT_MAGIC); + goto error; + } + LOGE(LOG_DBG(1), "Fatbin: magic: %x, version: %x, text: %lx, data: %lx, ptr: %lx, ptr2: %lx, zero: %lx", + fatbin->magic, fatbin->version, fatbin->text, fatbin->data, fatbin->unknown, fatbin->text2, fatbin->zero); + + if (get_elf_header((uint8_t*)fatbin->text, sizeof(struct fat_elf_header), &eh) != 0) { + LOGE(LOG_ERROR, "Something went wrong while checking the header."); + goto error; + } + // LOGE(LOG_DBG(1), "elf header: magic: %#x, version: %#x, header_size: %#x, size: %#zx", + // eh->magic, eh->version, eh->header_size, eh->size); + + input_pos += eh->header_size; + fatbin_total_size = eh->header_size + eh->size; + do { + if (get_text_header(input_pos, *fatbin_size - (input_pos - fatbin_data) - eh->header_size, &th) != 0) { + LOGE(LOG_ERROR, "Something went wrong while checking the header."); + goto error; + } + //print_header(th); + input_pos += th->header_size; + if (th->kind != 2) { // section does not contain device code (but e.g. PTX) + input_pos += th->size; + continue; + } + if (th->flags & FATBIN_FLAG_DEBUG) { + LOGE(LOG_DEBUG, "fatbin contains debug information."); + } + + if (th->flags & FATBIN_FLAG_COMPRESS) { + ssize_t input_read; + + LOGE(LOG_DEBUG, "fatbin contains compressed device code. Decompressing..."); + if ((input_read = decompress_single_section(input_pos, &text_data, &text_data_size, eh, th)) < 0) { + LOGE(LOG_ERROR, "Something went wrong while decompressing text section."); + goto error; + } + input_pos += input_read; + //hexdump(text_data, text_data_size); + } else { + text_data = (uint8_t*)input_pos; + text_data_size = th->size; + input_pos += th->size; + } + // print_header(th); + if (elf2_parameter_info(kernel_infos, text_data , text_data_size) != 0) { + LOGE(LOG_ERROR, "error getting parameter info"); + goto error; + } + if (th->flags & FATBIN_FLAG_COMPRESS) { + free(text_data); + } + } while (input_pos < (uint8_t*)eh + eh->header_size + eh->size); + + // if (get_elf_header((uint8_t*)fatbin->text2, sizeof(struct fat_elf_header), &eh) != 0) { + // LOGE(LOG_ERROR, "Something went wrong while checking the header."); + // goto error; + // } + // fatbin_total_size += eh->header_size + eh->size; + + *fatbin_mem = (void*)fatbin->text; + *fatbin_size = fatbin_total_size; + ret = 0; + error: + return ret; +} + +static void print_hexmem(void *mem, size_t len) +{ + for (int i=0; iname == NULL || memory == NULL) { + LOGE(LOG_ERROR, "at least one parameter is NULL"); + goto cleanup; + } + kernel->param_num = 0; + kernel->param_offsets = NULL; + kernel->param_sizes = NULL; + + if ((section_name = get_kernel_section_from_kernel_name(kernel->name)) == NULL) { + LOGE(LOG_ERROR, "get_kernel_section_from_kernel_name failed"); + goto cleanup; + } + + if (get_section_by_name(elf, section_name, &section) != 0) { + LOGE(LOG_ERROR, "section %s not found", section_name); + goto cleanup; + } + + if ((data = elf_getdata(section, NULL)) == NULL) { + LOGE(LOG_ERROR, "error getting section data"); + goto cleanup; + } + + //print_hexmem(data->d_buf, data->d_size); + + size_t secpos=0; + int i=0; + while (secpos < data->d_size) { + struct nv_info_kernel_entry *entry = (struct nv_info_kernel_entry*)(data->d_buf+secpos); + // printf("entry %d: format: %#x, attr: %#x, ", i++, entry->format, entry->attribute); + if (entry->format == EIFMT_SVAL && entry->attribute == EIATTR_KPARAM_INFO) { + if (entry->values_size != 0xc) { + LOGE(LOG_ERROR, "EIATTR_KPARAM_INFO values size does not have the expected value of 0xc"); + goto 
cleanup; + } + struct nv_info_kparam_info *kparam = (struct nv_info_kparam_info*)&entry->values; + // printf("kparam: index: %#x, ordinal: %#x, offset: %#x, unknown: %#0x, cbank: %#0x, size: %#0x\n", + // kparam->index, kparam->ordinal, kparam->offset, kparam->unknown, kparam->cbank, kparam->size); + LOGE(LOG_DBG(1), "param %d: offset: %#x, size: %#x", kparam->ordinal, kparam->offset, kparam->size); + if (kparam->ordinal >= kernel->param_num) { + kernel->param_offsets = realloc(kernel->param_offsets, + (kparam->ordinal+1)*sizeof(uint16_t)); + kernel->param_sizes = realloc(kernel->param_sizes, + (kparam->ordinal+1)*sizeof(uint16_t)); + kernel->param_num = kparam->ordinal+1; + } + kernel->param_offsets[kparam->ordinal] = kparam->offset; + kernel->param_sizes[kparam->ordinal] = kparam->size; + secpos += sizeof(struct nv_info_kernel_entry) + entry->values_size-4; + } else if (entry->format == EIFMT_HVAL && entry->attribute == EIATTR_CBANK_PARAM_SIZE) { + kernel->param_size = entry->values_size; + LOGE(LOG_DEBUG, "cbank_param_size: %#0x", entry->values_size); + secpos += sizeof(struct nv_info_kernel_entry)-4; + } else if (entry->format == EIFMT_HVAL) { + // printf("hval: %#x(%d)\n", entry->values_size, entry->values_size); + secpos += sizeof(struct nv_info_kernel_entry)-4; + } else if (entry->format == EIFMT_SVAL) { + // printf("sval_size: %#x ", entry->values_size); + // for (int j=0; j*sizeof(uint32_t) < entry->values_size; j++) { + // printf("val%d: %#x(%d) ", j, (&entry->values)[j], (&entry->values)[j]); + // } + // printf("\n"); + secpos += sizeof(struct nv_info_kernel_entry) + entry->values_size-4; + } else if (entry->format == EIFMT_NVAL) { + // printf("nval\n"); + secpos += sizeof(struct nv_info_kernel_entry)-4; + } else { + LOGE(LOG_WARNING, "unknown format: %#x", entry->format); + secpos += sizeof(struct nv_info_kernel_entry)-4; + } + } + // printf("remaining: %d\n", data->d_size % sizeof(struct nv_info_kernel_entry)); + ret = 0; + cleanup: + free(section_name); + return ret; +} + +static int get_symtab(Elf *elf, Elf_Data **symbol_table_data, size_t *symbol_table_size, GElf_Shdr *symbol_table_shdr) +{ + GElf_Shdr shdr; + Elf_Scn *section = NULL; + + if (elf == NULL || symbol_table_data == NULL || symbol_table_size == NULL) { + LOGE(LOG_ERROR, "invalid argument"); + return -1; + } + + if (get_section_by_name(elf, ".symtab", &section) != 0) { + LOGE(LOG_ERROR, "could not find .symtab section"); + return -1; + } + + if (gelf_getshdr(section, &shdr) == NULL) { + LOGE(LOG_ERROR, "gelf_getshdr failed"); + return -1; + } + + if (symbol_table_shdr != NULL) { + *symbol_table_shdr = shdr; + } + + if(shdr.sh_type != SHT_SYMTAB) { + LOGE(LOG_ERROR, "not a symbol table: %d", shdr.sh_type); + return -1; + } + + if ((*symbol_table_data = elf_getdata(section, NULL)) == NULL) { + LOGE(LOG_ERROR, "elf_getdata failed"); + return -1; + } + + *symbol_table_size = shdr.sh_size / shdr.sh_entsize; + + return 0; +} + +static void print_symtab(Elf *elf) +{ + GElf_Sym sym; + Elf_Data *symbol_table_data = NULL; + GElf_Shdr shdr; + size_t symnum; + int i = 0; + + if (get_symtab(elf, &symbol_table_data, &symnum, &shdr) != 0) { + LOGE(LOG_ERROR, "could not get symbol table"); + return; + } + + LOGE(LOG_DEBUG, "found %d symbols", symnum); + + while (gelf_getsym(symbol_table_data, i, &sym) != NULL) { + printf("sym %d: name: %s, value: %#lx, size: %#lx, info: %#x, other: %#x, shndx: %#x\n", i, + elf_strptr(elf, shdr.sh_link, sym.st_name), + sym.st_value, sym.st_size, sym.st_info, sym.st_other, sym.st_shndx); + i++; + } +} 
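/* Worked example for the compression scheme handled by decompress() above.
 * The format appears to be an LZ4-style token stream; this reading is an
 * assumption derived from the code, not an official format description.
 * Each token byte holds a literal-run length in its high nibble and a match
 * length minus 4 in its low nibble, 0xf in either field is extended by
 * additional bytes, and every match is followed by a 16-bit little-endian
 * back-offset into the output produced so far.
 *
 *   input:  0x32 'a' 'b' 'c' 0x03 0x00
 *           0x32        -> 3 literal bytes, match length 4 + 2 = 6
 *           'a' 'b' 'c' -> literals copied to the output
 *           0x03 0x00   -> back-offset 3
 *   output: "abcabcabc"  (copy "abc", then copy 6 bytes starting 3 bytes
 *           back; the overlapping copy repeats the pattern)
 */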
+ +static int check_elf(Elf *elf) +{ + Elf_Kind ek; + GElf_Ehdr ehdr; + + int elfclass; + char *id; + size_t program_header_num; + size_t sections_num; + size_t section_str_num; + int ret = -1; + + if ((ek = elf_kind(elf)) != ELF_K_ELF) { + LOGE(LOG_ERROR, "elf_kind is not ELF_K_ELF, but %d", ek); + goto cleanup; + } + + if (gelf_getehdr(elf, &ehdr) == NULL) { + LOGE(LOG_ERROR, "gelf_getehdr failed"); + goto cleanup; + } + + if ((elfclass = gelf_getclass(elf)) == ELFCLASSNONE) { + LOGE(LOG_ERROR, "gelf_getclass failed"); + goto cleanup; + } + + if ((id = elf_getident(elf, NULL)) == NULL) { + LOGE(LOG_ERROR, "elf_getident failed"); + goto cleanup; + } + + LOGE(LOG_DBG(1), "elfclass: %d-bit; elf ident[0..%d]: %7s", + (elfclass == ELFCLASS32) ? 32 : 64, + EI_ABIVERSION, id); + + if (elf_getshdrnum(elf, &sections_num) != 0) { + LOGE(LOG_ERROR, "elf_getshdrnum failed"); + goto cleanup; + } + + if (elf_getphdrnum(elf, &program_header_num) != 0) { + LOGE(LOG_ERROR, "elf_getphdrnum failed"); + goto cleanup; + } + + if (elf_getshdrstrndx(elf, &section_str_num) != 0) { + LOGE(LOG_ERROR, "elf_getshdrstrndx failed"); + goto cleanup; + } + + LOGE(LOG_DBG(1), "elf contains %d sections, %d program_headers, string table section: %d", + sections_num, program_header_num, section_str_num); + + ret = 0; +cleanup: + return ret; +} + +int elf2_parameter_info(list *kernel_infos, void* memory, size_t memsize) +{ + struct __attribute__((__packed__)) nv_info_entry{ + uint8_t format; + uint8_t attribute; + uint16_t values_size; + uint32_t kernel_id; + uint32_t value; + }; + + Elf *elf = NULL; + Elf_Scn *section = NULL; + Elf_Data *data = NULL, *symbol_table_data = NULL; + GElf_Shdr symtab_shdr; + size_t symnum; + int i = 0; + GElf_Sym sym; + + int ret = -1; + kernel_info_t *ki = NULL; + const char *kernel_str; + + if (memory == NULL || memsize == 0) { + LOGE(LOG_ERROR, "memory was NULL or memsize was 0"); + return -1; + } + +#define ELF_DUMP_TO_FILE 1 + +#ifdef ELF_DUMP_TO_FILE + FILE* fd2 = fopen("/tmp/cricket-elf-dump", "wb"); + fwrite(memory, memsize, 1, fd2); + fclose(fd2); +#endif + + if ((elf = elf_memory(memory, memsize)) == NULL) { + LOGE(LOG_ERROR, "elf_memory failed"); + goto cleanup; + } + + if (check_elf(elf) != 0) { + LOGE(LOG_ERROR, "check_elf failed"); + goto cleanup; + } + + if (get_symtab(elf, &symbol_table_data, &symnum, &symtab_shdr) != 0) { + LOGE(LOG_ERROR, "could not get symbol table"); + goto cleanup; + } + + if (get_section_by_name(elf, ".nv.info", &section) != 0) { + LOGE(LOG_WARNING, "could not find .nv.info section. This means this binary does not contain any kernels."); + ret = 0; // This is not an error. 
+ goto cleanup; + } + + if ((data = elf_getdata(section, NULL)) == NULL) { + LOGE(LOG_ERROR, "elf_getdata failed"); + goto cleanup; + } + + for (size_t secpos=0; secpos < data->d_size; secpos += sizeof(struct nv_info_entry)) { + struct nv_info_entry *entry = (struct nv_info_entry *)(data->d_buf+secpos); + // LOGE(LOG_DBG(1), "%d: format: %#x, attr: %#x, values_size: %#x kernel: %#x, sval: %#x(%d)", + // i++, entry->format, entry->attribute, entry->values_size, entry->kernel_id, + // entry->value, entry->value); + + if (entry->values_size != 8) { + LOGE(LOG_ERROR, "unexpected values_size: %#x", entry->values_size); + continue; + } + + if (entry->attribute != EIATTR_FRAME_SIZE) { + continue; + } + + if (entry->kernel_id >= symnum) { + LOGE(LOG_ERROR, "kernel_id out of bounds: %#x", entry->kernel_id); + continue; + } + + if (gelf_getsym(symbol_table_data, entry->kernel_id, &sym) == NULL) { + LOGE(LOG_ERROR, "gelf_getsym failed for entry %d", entry->kernel_id); + continue; + } + if ((kernel_str = elf_strptr(elf, symtab_shdr.sh_link, sym.st_name) ) == NULL) { + LOGE(LOG_ERROR, "strptr failed for entry %d", entry->kernel_id); + continue; + } + + if (utils_search_info(kernel_infos, kernel_str) != NULL) { + continue; + } + + LOGE(LOG_DEBUG, "found new kernel: %s (symbol table id: %#x)", kernel_str, entry->kernel_id); + + if (list_append(kernel_infos, (void**)&ki) != 0) { + LOGE(LOG_ERROR, "error on appending to list"); + goto cleanup; + } + + size_t buflen = strlen(kernel_str)+1; + if ((ki->name = malloc(buflen)) == NULL) { + LOGE(LOG_ERROR, "malloc failed"); + goto cleanup; + } + if (strncpy(ki->name, kernel_str, buflen) != ki->name) { + LOGE(LOG_ERROR, "strncpy failed"); + goto cleanup; + } + + if (get_parm_for_kernel(elf, ki, memory, memsize) != 0) { + LOGE(LOG_ERROR, "get_parm_for_kernel failed for kernel %s", kernel_str); + goto cleanup; + } + } + + ret = 0; + cleanup: + if (elf != NULL) { + elf_end(elf); + } + return ret; +} + +void* elf2_symbol_address(const char *symbol) +{ + return dlsym(RTLD_DEFAULT, symbol); +} \ No newline at end of file diff --git a/cpu/cpu-elf2.h b/cpu/cpu-elf2.h new file mode 100644 index 00000000..4223498e --- /dev/null +++ b/cpu/cpu-elf2.h @@ -0,0 +1,25 @@ +#ifndef _ELF_H_ +#define _ELF_H_ + +#include +#include "cpu-common.h" +#include "list.h" + +struct __attribute__((__packed__)) fat_header { + uint32_t magic; + uint32_t version; + uint64_t text; // points to first text section + uint64_t data; // points to outside of the file + uint64_t unknown; + uint64_t text2; // points to second text section + uint64_t zero; +}; + +int elf2_init(void); +int elf2_get_fatbin_info(const struct fat_header *fatbin, list *kernel_infos, uint8_t** fatbin_mem, size_t* fatbin_size); + +int elf2_parameter_info(list *kernel_infos, void* memory, size_t memsize); +void* elf2_symbol_address(const char *symbol); +//int elf2_contains_kernel(void* memory, size_t memsize); + +#endif //_ELF_H_ diff --git a/cpu/cpu-libwrap.h b/cpu/cpu-libwrap.h index 361f4105..5b3a8ba7 100644 --- a/cpu/cpu-libwrap.h +++ b/cpu/cpu-libwrap.h @@ -186,10 +186,24 @@ RET NAME(P1_TYPE P1_NAME, P2_TYPE P2_NAME, P3_TYPE P3_NAME, P4_TYPE P4_NAME, P5_ DEF_FN_PTR(RET, P1_TYPE, P2_TYPE, P3_TYPE, P4_TYPE, P5_TYPE, P6_TYPE, P7_TYPE, P8_TYPE, P9_TYPE, P10_TYPE, P11_TYPE, P12_TYPE, P13_TYPE, P14_TYPE, P15_TYPE, P16_TYPE, P17_TYPE, P18_TYPE, P19_TYPE, P20_TYPE, P21_TYPE); \ DEF_FN_BODY(RET, NAME, P1_NAME, P2_NAME, P3_NAME, P4_NAME, P5_NAME, P6_NAME, P7_NAME, P8_NAME, P9_NAME, P10_NAME, P11_NAME, P12_NAME, P13_NAME, P14_NAME, 
P15_NAME, P16_NAME, P17_NAME, P18_NAME, P19_NAME, P20_NAME, P21_NAME); \ } +#define DEF_FN_22(RET, NAME, P1_TYPE, P1_NAME, P2_TYPE, P2_NAME, P3_TYPE, P3_NAME, P4_TYPE, P4_NAME, P5_TYPE, P5_NAME, P6_TYPE, P6_NAME, P7_TYPE, P7_NAME, P8_TYPE, P8_NAME, P9_TYPE, P9_NAME, P10_TYPE, P10_NAME, P11_TYPE, P11_NAME, P12_TYPE, P12_NAME, P13_TYPE, P13_NAME, P14_TYPE, P14_NAME, P15_TYPE, P15_NAME, P16_TYPE, P16_NAME, P17_TYPE, P17_NAME, P18_TYPE, P18_NAME, P19_TYPE, P19_NAME, P20_TYPE, P20_NAME, P21_TYPE, P21_NAME, P22_TYPE, P22_NAME) \ +RET NAME(P1_TYPE P1_NAME, P2_TYPE P2_NAME, P3_TYPE P3_NAME, P4_TYPE P4_NAME, P5_TYPE P5_NAME, P6_TYPE P6_NAME, P7_TYPE P7_NAME, P8_TYPE P8_NAME, P9_TYPE P9_NAME, P10_TYPE P10_NAME, P11_TYPE P11_NAME, P12_TYPE P12_NAME, P13_TYPE P13_NAME, P14_TYPE P14_NAME, P15_TYPE P15_NAME, P16_TYPE P16_NAME, P17_TYPE P17_NAME, P18_TYPE P18_NAME, P19_TYPE P19_NAME, P20_TYPE P20_NAME, P21_TYPE P21_NAME, P22_TYPE P22_NAME) \ +{ \ + DEF_FN_PTR(RET, P1_TYPE, P2_TYPE, P3_TYPE, P4_TYPE, P5_TYPE, P6_TYPE, P7_TYPE, P8_TYPE, P9_TYPE, P10_TYPE, P11_TYPE, P12_TYPE, P13_TYPE, P14_TYPE, P15_TYPE, P16_TYPE, P17_TYPE, P18_TYPE, P19_TYPE, P20_TYPE, P21_TYPE, P22_TYPE); \ + DEF_FN_BODY(RET, NAME, P1_NAME, P2_NAME, P3_NAME, P4_NAME, P5_NAME, P6_NAME, P7_NAME, P8_NAME, P9_NAME, P10_NAME, P11_NAME, P12_NAME, P13_NAME, P14_NAME, P15_NAME, P16_NAME, P17_NAME, P18_NAME, P19_NAME, P20_NAME, P21_NAME, P22_NAME); \ +} +#define DEF_FN_23(RET, NAME, P1_TYPE, P1_NAME, P2_TYPE, P2_NAME, P3_TYPE, P3_NAME, P4_TYPE, P4_NAME, P5_TYPE, P5_NAME, P6_TYPE, P6_NAME, P7_TYPE, P7_NAME, P8_TYPE, P8_NAME, P9_TYPE, P9_NAME, P10_TYPE, P10_NAME, P11_TYPE, P11_NAME, P12_TYPE, P12_NAME, P13_TYPE, P13_NAME, P14_TYPE, P14_NAME, P15_TYPE, P15_NAME, P16_TYPE, P16_NAME, P17_TYPE, P17_NAME, P18_TYPE, P18_NAME, P19_TYPE, P19_NAME, P20_TYPE, P20_NAME, P21_TYPE, P21_NAME, P22_TYPE, P22_NAME, P23_TYPE, P23_NAME) \ +RET NAME(P1_TYPE P1_NAME, P2_TYPE P2_NAME, P3_TYPE P3_NAME, P4_TYPE P4_NAME, P5_TYPE P5_NAME, P6_TYPE P6_NAME, P7_TYPE P7_NAME, P8_TYPE P8_NAME, P9_TYPE P9_NAME, P10_TYPE P10_NAME, P11_TYPE P11_NAME, P12_TYPE P12_NAME, P13_TYPE P13_NAME, P14_TYPE P14_NAME, P15_TYPE P15_NAME, P16_TYPE P16_NAME, P17_TYPE P17_NAME, P18_TYPE P18_NAME, P19_TYPE P19_NAME, P20_TYPE P20_NAME, P21_TYPE P21_NAME, P22_TYPE P22_NAME, P23_TYPE P23_NAME) \ +{ \ + DEF_FN_PTR(RET, P1_TYPE, P2_TYPE, P3_TYPE, P4_TYPE, P5_TYPE, P6_TYPE, P7_TYPE, P8_TYPE, P9_TYPE, P10_TYPE, P11_TYPE, P12_TYPE, P13_TYPE, P14_TYPE, P15_TYPE, P16_TYPE, P17_TYPE, P18_TYPE, P19_TYPE, P20_TYPE, P21_TYPE, P22_TYPE, P23_TYPE); \ + DEF_FN_BODY(RET, NAME, P1_NAME, P2_NAME, P3_NAME, P4_NAME, P5_NAME, P6_NAME, P7_NAME, P8_NAME, P9_NAME, P10_NAME, P11_NAME, P12_NAME, P13_NAME, P14_NAME, P15_NAME, P16_NAME, P17_NAME, P18_NAME, P19_NAME, P20_NAME, P21_NAME, P22_NAME, P23_NAME); \ +} -#define DEF_FN_X(x, RET, NAME, P1_TYPE, P1_NAME, P2_TYPE, P2_NAME, P3_TYPE, P3_NAME, P4_TYPE, P4_NAME, P5_TYPE, P5_NAME, P6_TYPE, P6_NAME, P7_TYPE, P7_NAME, P8_TYPE, P8_NAME, P9_TYPE, P9_NAME, P10_TYPE, P10_NAME, P11_TYPE, P11_NAME, P12_TYPE, P12_NAME, P13_TYPE, P13_NAME, P14_TYPE, P14_NAME, P15_TYPE, P15_NAME, P16_TYPE, P16_NAME, P17_TYPE, P17_NAME, P18_TYPE, P18_NAME, P19_TYPE, P19_NAME, P20_TYPE, P20_NAME, P21_TYPE, P21_NAME, FUNC, ...) 
FUNC +#define DEF_FN_X(x, RET, NAME, P1_TYPE, P1_NAME, P2_TYPE, P2_NAME, P3_TYPE, P3_NAME, P4_TYPE, P4_NAME, P5_TYPE, P5_NAME, P6_TYPE, P6_NAME, P7_TYPE, P7_NAME, P8_TYPE, P8_NAME, P9_TYPE, P9_NAME, P10_TYPE, P10_NAME, P11_TYPE, P11_NAME, P12_TYPE, P12_NAME, P13_TYPE, P13_NAME, P14_TYPE, P14_NAME, P15_TYPE, P15_NAME, P16_TYPE, P16_NAME, P17_TYPE, P17_NAME, P18_TYPE, P18_NAME, P19_TYPE, P19_NAME, P20_TYPE, P20_NAME, P21_TYPE, P21_NAME, P22_TYPE, P22_NAME, P23_TYPE, P23_NAME, FUNC, ...) FUNC #define DEF_FN(...) DEF_FN_X(,##__VA_ARGS__,\ + DEF_FN_23(__VA_ARGS__),,\ + DEF_FN_22(__VA_ARGS__),,\ DEF_FN_21(__VA_ARGS__),,\ DEF_FN_20(__VA_ARGS__),,\ DEF_FN_19(__VA_ARGS__),,\ diff --git a/cpu/cpu-server-cublas.c b/cpu/cpu-server-cublas.c index 972b2c31..ad54eca0 100644 --- a/cpu/cpu-server-cublas.c +++ b/cpu/cpu-server-cublas.c @@ -16,6 +16,7 @@ #define WITH_RECORDER #include "api-recorder.h" #include "cpu-server-cublas.h" +#include "gsched.h" @@ -43,9 +44,12 @@ bool_t rpc_cublascreate_1_svc(ptr_result *result, struct svc_req *rqstp) RECORD_VOID_API; LOGE(LOG_DEBUG, "cublasCreate_v2"); + GSCHED_RETAIN; result->err = cublasCreate_v2((cublasHandle_t*)&result->ptr_result_u.ptr); - RECORD_RESULT(ptr_result_u, *result); resource_mg_create(&rm_cublas, (void*)result->ptr_result_u.ptr); + GSCHED_RELEASE; + + RECORD_RESULT(ptr_result_u, *result); return 1; } @@ -55,15 +59,33 @@ bool_t rpc_cublasdgemm_1_svc(ptr handle, int transa, int transb, int m, int n, i ptr C, int ldc, int *result, struct svc_req *rqstp) { + RECORD_API(rpc_cublasdgemm_1_argument); + RECORD_ARG(1, handle); + RECORD_ARG(2, transa); + RECORD_ARG(3, transb); + RECORD_ARG(4, m); + RECORD_ARG(5, n); + RECORD_ARG(6, k); + RECORD_ARG(7, alpha); + RECORD_ARG(8, A); + RECORD_ARG(9, lda); + RECORD_ARG(10, B); + RECORD_ARG(11, ldb); + RECORD_ARG(12, beta); + RECORD_ARG(13, C); + RECORD_ARG(14, ldc); LOGE(LOG_DEBUG, "cublasDgemm"); + GSCHED_RETAIN; *result = cublasDgemm(resource_mg_get(&rm_cublas, (void*)handle), (cublasOperation_t) transa, (cublasOperation_t) transb, m, n, k, &alpha, - resource_mg_get(&rm_cublas, (void*)A), lda, - resource_mg_get(&rm_cublas, (void*)B), ldb, &beta, - resource_mg_get(&rm_cublas, (void*)C), ldc + resource_mg_get(&rm_memory, (void*)A), lda, + resource_mg_get(&rm_memory, (void*)B), ldb, &beta, + resource_mg_get(&rm_memory, (void*)C), ldc ); + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); return 1; } @@ -72,7 +94,209 @@ bool_t rpc_cublasdestroy_1_svc(ptr handle, int *result, struct svc_req *rqstp) RECORD_API(ptr); RECORD_SINGLE_ARG(handle); LOGE(LOG_DEBUG, "cublasDestroy_v2"); + GSCHED_RETAIN; *result = cublasDestroy_v2(resource_mg_get(&rm_cublas, (void*)handle)); + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t rpc_cublassetworkspace_1_svc(ptr handle, ptr workspace, size_t workspaceSizeInBytes, int *result, struct svc_req *rqstp) +{ + RECORD_API(rpc_cublassetworkspace_1_argument); + RECORD_NARG(handle); + RECORD_NARG(workspace); + RECORD_NARG(workspaceSizeInBytes); + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + GSCHED_RETAIN; +#if CUBLAS_VERSION >= 11000 + *result = cublasSetWorkspace( + resource_mg_get(&rm_cublas, (void*)handle), + resource_mg_get(&rm_memory, (void*)workspace), + workspaceSizeInBytes); +#else + LOGE(LOG_ERROR, "cublassetworkspace not supported in this version"); + *result = -1; +#endif + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t rpc_cublassetstream_1_svc(ptr handle, ptr streamId, int *result, struct svc_req *rqstp) +{ + 
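+    // The handle and stream ids arrive as plain client-side integers; translate
+    // them back to the live server-side objects via the resource managers before
+    // forwarding the call to cuBLAS.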
RECORD_API(rpc_cublassetstream_1_argument); + RECORD_NARG(handle); + RECORD_NARG(streamId); + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + GSCHED_RETAIN; + *result = cublasSetStream( + resource_mg_get(&rm_cublas, (void*)handle), + resource_mg_get(&rm_streams, (void*)streamId)); + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t rpc_cublassetmathmode_1_svc(ptr handle, int mode, int *result, struct svc_req *rqstp) +{ + RECORD_API(rpc_cublassetmathmode_1_argument); + RECORD_NARG(handle); + RECORD_NARG(mode); + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + GSCHED_RETAIN; + *result = cublasSetMathMode( + resource_mg_get(&rm_cublas, (void*)handle), + (cublasMath_t)mode); + GSCHED_RELEASE; RECORD_RESULT(integer, *result); return 1; } + +bool_t rpc_cublassgemm_1_svc(ptr handle, int transa, int transb, int m, int n, int k, float alpha, + ptr A, int lda, + ptr B, int ldb, float beta, + ptr C, int ldc, + int *result, struct svc_req *rqstp) +{ + RECORD_API(rpc_cublassgemm_1_argument); + RECORD_ARG(1, handle); + RECORD_ARG(2, transa); + RECORD_ARG(3, transb); + RECORD_ARG(4, m); + RECORD_ARG(5, n); + RECORD_ARG(6, k); + RECORD_ARG(7, alpha); + RECORD_ARG(8, A); + RECORD_ARG(9, lda); + RECORD_ARG(10, B); + RECORD_ARG(11, ldb); + RECORD_ARG(12, beta); + RECORD_ARG(13, C); + RECORD_ARG(14, ldc); + LOGE(LOG_DEBUG, "cublasSgemm"); + GSCHED_RETAIN; +#if CUBLAS_VERSION >= 11000 + *result = cublasSgemm(resource_mg_get(&rm_cublas, (void*)handle), + (cublasOperation_t) transa, + (cublasOperation_t) transb, + m, n, k, &alpha, + resource_mg_get(&rm_memory, (void*)A), lda, + resource_mg_get(&rm_memory, (void*)B), ldb, &beta, + resource_mg_get(&rm_memory, (void*)C), ldc + ); +#else + LOGE(LOG_ERROR, "cublassetworkspace not supported in this version"); + *result = -1; +#endif + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t rpc_cublassgemv_1_svc(ptr handle, int trans, int m, + int n, float alpha, + ptr A, int lda, + ptr x, int incx, float beta, + ptr y, int incy, + int *result, struct svc_req *rqstp) +{ + RECORD_API(rpc_cublassgemv_1_argument); + RECORD_ARG(1, handle); + RECORD_ARG(2, trans); + RECORD_ARG(3, m); + RECORD_ARG(4, n); + RECORD_ARG(5, alpha); + RECORD_ARG(6, A); + RECORD_ARG(7, lda); + RECORD_ARG(8, x); + RECORD_ARG(9, incx); + RECORD_ARG(10, beta); + RECORD_ARG(11, y); + RECORD_ARG(12, incy); + LOGE(LOG_DEBUG, "cublasSgemv"); + GSCHED_RETAIN; + *result = cublasSgemv(resource_mg_get(&rm_cublas, (void*)handle), + (cublasOperation_t) trans, + m, n, &alpha, + resource_mg_get(&rm_memory, (void*)A), lda, + resource_mg_get(&rm_memory, (void*)x), incx, &beta, + resource_mg_get(&rm_memory, (void*)y), incy + ); + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t rpc_cublasdgemv_1_svc(ptr handle, int trans, int m, + int n, double alpha, + ptr A, int lda, + ptr x, int incx, double beta, + ptr y, int incy, + int *result, struct svc_req *rqstp) +{ + RECORD_API(rpc_cublasdgemv_1_argument); + RECORD_ARG(1, handle); + RECORD_ARG(2, trans); + RECORD_ARG(3, m); + RECORD_ARG(4, n); + RECORD_ARG(5, alpha); + RECORD_ARG(6, A); + RECORD_ARG(7, lda); + RECORD_ARG(8, x); + RECORD_ARG(9, incx); + RECORD_ARG(10, beta); + RECORD_ARG(11, y); + RECORD_ARG(12, incy); + LOGE(LOG_DEBUG, "cublasDgemv"); + GSCHED_RETAIN; + *result = cublasDgemv(resource_mg_get(&rm_cublas, (void*)handle), + (cublasOperation_t) trans, + m, n, &alpha, + resource_mg_get(&rm_memory, (void*)A), lda, + resource_mg_get(&rm_memory, (void*)x), incx, &beta, + resource_mg_get(&rm_memory, (void*)y), 
incy + ); + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t rpc_cublassgemmex_1_svc(ptr handle, int transa, int transb, int m, int n, int k, float alpha, + ptr A, int Atype, int lda, + ptr B, int Btype, int ldb, float beta, + ptr C, int Ctype, int ldc, + int *result, struct svc_req *rqstp) +{ + RECORD_API(rpc_cublassgemmex_1_argument); + RECORD_ARG(1, handle); + RECORD_ARG(2, transa); + RECORD_ARG(3, transb); + RECORD_ARG(4, m); + RECORD_ARG(5, n); + RECORD_ARG(6, k); + RECORD_ARG(7, alpha); + RECORD_ARG(8, A); + RECORD_ARG(9, Atype); + RECORD_ARG(10, lda); + RECORD_ARG(11, B); + RECORD_ARG(12, Btype); + RECORD_ARG(13, ldb); + RECORD_ARG(14, beta); + RECORD_ARG(15, C); + RECORD_ARG(16, Ctype); + RECORD_ARG(17, ldc); + LOGE(LOG_DEBUG, "cublasSgemmEx"); + GSCHED_RETAIN; + *result = cublasSgemmEx(resource_mg_get(&rm_cublas, (void*)handle), + (cublasOperation_t) transa, + (cublasOperation_t) transb, + m, n, k, &alpha, + resource_mg_get(&rm_memory, (void*)A), (cudaDataType_t)Atype, lda, + resource_mg_get(&rm_memory, (void*)B), (cudaDataType_t)Btype, ldb, &beta, + resource_mg_get(&rm_memory, (void*)C), (cudaDataType_t)Ctype, ldc + ); + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} \ No newline at end of file diff --git a/cpu/cpu-server-cudnn.c b/cpu/cpu-server-cudnn.c new file mode 100644 index 00000000..70e4abce --- /dev/null +++ b/cpu/cpu-server-cudnn.c @@ -0,0 +1,1396 @@ + +#include +#include +#include +#include + +#include "cpu_rpc_prot.h" +#include "cpu-common.h" +#include "cpu-utils.h" +#include "log.h" +#include "resource-mg.h" +#include "gsched.h" + +#define WITH_RECORDER +#include "api-recorder.h" + +#include "cpu-server-cudnn.h" + + + +int server_cudnn_init(int bypass) +{ + int ret = 0; + ret &= resource_mg_init(&rm_cudnn, bypass); + ret &= resource_mg_init(&rm_cudnn_tensors, bypass); + ret &= resource_mg_init(&rm_cudnn_filters, bypass); + ret &= resource_mg_init(&rm_cudnn_poolings, bypass); + ret &= resource_mg_init(&rm_cudnn_activations, bypass); + ret &= resource_mg_init(&rm_cudnn_lrns, bypass); + ret &= resource_mg_init(&rm_cudnn_convs, bypass); + ret &= resource_mg_init(&rm_cudnn_backendds, bypass); + return ret; +} + +int server_cudnn_deinit(void) +{ + resource_mg_free(&rm_cudnn); + resource_mg_free(&rm_cudnn_tensors); + resource_mg_free(&rm_cudnn_filters); + resource_mg_free(&rm_cudnn_poolings); + resource_mg_free(&rm_cudnn_activations); + resource_mg_free(&rm_cudnn_lrns); + resource_mg_free(&rm_cudnn_convs); + resource_mg_free(&rm_cudnn_backendds); + return 0; + +} + +bool_t rpc_cudnngetversion_1_svc(size_t *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + *result = cudnnGetVersion(); + GSCHED_RELEASE; + return 1; +} + +bool_t rpc_cudnngetmaxdeviceversion_1_svc(size_t *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + *result = cudnnGetMaxDeviceVersion(); + GSCHED_RELEASE; + return 1; +} + +bool_t rpc_cudnngetcudartversion_1_svc(size_t *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + *result = cudnnGetCudartVersion(); + GSCHED_RELEASE; + return 1; +} + +bool_t rpc_cudnngeterrorstring_1_svc(int status, char **result, struct svc_req *rqstp) +{ + const char* str; + *result = malloc(128); + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + GSCHED_RETAIN; + str = cudnnGetErrorString((cudnnStatus_t)status); + strncpy(*result, str, 128); + GSCHED_RELEASE; + return 1; +} + +bool_t rpc_cudnnqueryruntimeerror_1_svc(ptr 
handle, int mode, int_result *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + cudnnRuntimeTag_t *tag; + + GSCHED_RETAIN; + result->err = cudnnQueryRuntimeError( + (cudnnHandle_t)resource_mg_get(&rm_cudnn, (void*)handle), + (cudnnStatus_t*)&result->int_result_u.data, (cudnnErrQueryMode_t)mode, tag); + GSCHED_RELEASE; + return 1; +} + +bool_t rpc_cudnngetproperty_1_svc(int type, int_result *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + result->err = cudnnGetProperty((libraryPropertyType)type, &result->int_result_u.data); + GSCHED_RELEASE; + return 1; +} + +bool_t rpc_cudnncreate_1_svc(ptr_result *result, struct svc_req *rqstp) +{ + RECORD_VOID_API; + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + result->err = cudnnCreate((cudnnHandle_t*)&result->ptr_result_u.ptr); + if (resource_mg_create(&rm_cudnn, (void*)result->ptr_result_u.ptr) != 0) { + LOGE(LOG_ERROR, "error in resource manager"); + } + GSCHED_RELEASE; + RECORD_RESULT(ptr_result_u, *result); + return 1; +} + +bool_t rpc_cudnndestroy_1_svc(ptr handle, int *result, struct svc_req *rqstp) +{ + RECORD_API(ptr); + RECORD_SINGLE_ARG(handle); + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + *result = cudnnDestroy( + (cudnnHandle_t)resource_mg_get(&rm_cudnn, (void*)handle)); + // TODO: Remove from resource manager + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t rpc_cudnnsetstream_1_svc(ptr handle, ptr streamId, int *result, struct svc_req *rqstp) +{ + RECORD_API(rpc_cudnnsetstream_1_argument); + RECORD_NARG(handle); + RECORD_NARG(streamId); + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + *result = cudnnSetStream( + (cudnnHandle_t)resource_mg_get(&rm_cudnn, (void*)handle), + (cudaStream_t)resource_mg_get(&rm_streams, (void*)streamId)); + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t rpc_cudnngetstream_1_svc(ptr handle, ptr_result *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + result->err = cudnnGetStream( + (cudnnHandle_t)resource_mg_get(&rm_cudnn, (void*)handle), + (cudaStream_t*)&result->ptr_result_u.ptr); + + GSCHED_RELEASE; + return 1; +} + +bool_t rpc_cudnncreatetensordescriptor_1_svc(ptr_result *result, struct svc_req *rqstp) +{ + RECORD_VOID_API; + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + result->err = cudnnCreateTensorDescriptor((cudnnTensorDescriptor_t*)&result->ptr_result_u.ptr); + if (resource_mg_create(&rm_cudnn_tensors, (void*)result->ptr_result_u.ptr) != 0) { + LOGE(LOG_ERROR, "error in resource manager"); + } + GSCHED_RELEASE; + RECORD_RESULT(ptr_result_u, *result); + return 1; +} + +bool_t rpc_cudnnsettensor4ddescriptor_1_svc(ptr tensorDesc, int format, int dataType, int n, int c, int h, int w, int *result, struct svc_req *rqstp) +{ + RECORD_API(rpc_cudnnsettensor4ddescriptor_1_argument); + RECORD_NARG(tensorDesc); + RECORD_NARG(format); + RECORD_NARG(dataType); + RECORD_NARG(n); + RECORD_NARG(c); + RECORD_NARG(h); + RECORD_NARG(w); + + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + *result = cudnnSetTensor4dDescriptor( + (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)tensorDesc), + (cudnnTensorFormat_t)format, + (cudnnDataType_t)dataType, + n, c, h, w); + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t rpc_cudnnsettensor4ddescriptorex_1_svc(ptr tensorDesc, int dataType, int n, int c, int h, int w, int nStride, int cStride, int hStride, int 
wStride, int *result, struct svc_req *rqstp) +{ + RECORD_API(rpc_cudnnsettensor4ddescriptorex_1_argument); + RECORD_NARG(tensorDesc); + RECORD_NARG(dataType); + RECORD_NARG(n); + RECORD_NARG(c); + RECORD_NARG(h); + RECORD_NARG(w); + RECORD_NARG(nStride); + RECORD_NARG(cStride); + RECORD_NARG(hStride); + RECORD_NARG(wStride); + + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + *result = cudnnSetTensor4dDescriptorEx( + (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)tensorDesc), + (cudnnDataType_t)dataType, + n, c, h, w, nStride, cStride, hStride, wStride); + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t rpc_cudnngettensor4ddescriptor_1_svc(ptr tensorDesc, int9_result *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + result->err = cudnnGetTensor4dDescriptor( + (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)tensorDesc), + (cudnnDataType_t*)&result->int9_result_u.data[0], + &result->int9_result_u.data[1], + &result->int9_result_u.data[2], + &result->int9_result_u.data[3], + &result->int9_result_u.data[4], + &result->int9_result_u.data[5], + &result->int9_result_u.data[6], + &result->int9_result_u.data[7], + &result->int9_result_u.data[8]); + GSCHED_RELEASE; + return 1; +} + +bool_t rpc_cudnnsettensornddescriptor_1_svc(ptr tensorDesc, int dataType, int nbDims, mem_data dimA, mem_data strideA, int *result, struct svc_req *rqstp) +{ + RECORD_API(rpc_cudnnsettensornddescriptor_1_argument); + RECORD_NARG(tensorDesc); + RECORD_NARG(dataType); + RECORD_NARG(nbDims); + RECORD_NARG(dimA); + RECORD_NARG(strideA); + + //TODO: Recording dimA and strideA is not as easy as done here. + + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + if (dimA.mem_data_len != nbDims * sizeof(int) || strideA.mem_data_len != nbDims * sizeof(int)) { + LOGE(LOG_ERROR, "array dimensions not as expected."); + return 0; + } + GSCHED_RETAIN; + *result = cudnnSetTensorNdDescriptor( + (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)tensorDesc), + (cudnnDataType_t)dataType, + nbDims, + (const int*)dimA.mem_data_val, + (const int*)strideA.mem_data_val); + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t rpc_cudnnsettensornddescriptorex_1_svc(ptr tensorDesc, int format, int dataType, int nbDims, mem_data dimA, int *result, struct svc_req *rqstp) +{ + RECORD_API(rpc_cudnnsettensornddescriptorex_1_argument); + RECORD_NARG(tensorDesc); + RECORD_NARG(format); + RECORD_NARG(dataType); + RECORD_NARG(nbDims); + RECORD_NARG(dimA); + + //TODO: Recording dimA and strideA is not as easy as done here. 
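+    // Note that the record above only captures the mem_data pointer/length pair;
+    // replaying this call after a restore would also require a deep copy of dimA.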
+ + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + if (dimA.mem_data_len != nbDims * sizeof(int)) { + LOGE(LOG_ERROR, "array dimensions not as expected."); + return 0; + } + GSCHED_RETAIN; + *result = cudnnSetTensorNdDescriptorEx( + (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)tensorDesc), + (cudnnTensorFormat_t)format, + (cudnnDataType_t)dataType, + nbDims, + (const int*)dimA.mem_data_val); + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t rpc_cudnngettensornddescriptor_1_svc(ptr tensorDesc, int nbDimsRequested, mem_result *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + result->mem_result_u.data.mem_data_len = sizeof(cudnnDataType_t) + sizeof(int) + nbDimsRequested*sizeof(int)*2; + if ((result->mem_result_u.data.mem_data_val = malloc(result->mem_result_u.data.mem_data_len)) == NULL) { + LOGE(LOG_ERROR, "malloc failed"); + return 0; + } + + GSCHED_RETAIN; + result->err = cudnnGetTensorNdDescriptor( + (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)tensorDesc), + nbDimsRequested, + (cudnnDataType_t*)result->mem_result_u.data.mem_data_val, + (int*)&result->mem_result_u.data.mem_data_val[sizeof(cudnnDataType_t)], + (int*)&result->mem_result_u.data.mem_data_val[sizeof(cudnnDataType_t)+sizeof(int)], + (int*)&result->mem_result_u.data.mem_data_val[sizeof(cudnnDataType_t)+sizeof(int)+nbDimsRequested*sizeof(int)]); + + GSCHED_RELEASE; + return 1; +} + +bool_t rpc_cudnngettensorsizeinbytes_1_svc(ptr tensorDesc, sz_result *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + GSCHED_RETAIN; + result->err = cudnnGetTensorSizeInBytes( + (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)tensorDesc), + &result->sz_result_u.data); + GSCHED_RELEASE; + return 1; +} + +bool_t rpc_cudnndestroytensordescriptor_1_svc(ptr tensorDesc, int *result, struct svc_req *rqstp) +{ + RECORD_API(ptr); + RECORD_SINGLE_ARG(tensorDesc); + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + *result = cudnnDestroyTensorDescriptor( + (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)tensorDesc)); + // TODO: Remove from resource manager + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + + +bool_t rpc_cudnncreatefilterdescriptor_1_svc(ptr_result *result, struct svc_req *rqstp) +{ + RECORD_VOID_API; + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + result->err = cudnnCreateFilterDescriptor((cudnnFilterDescriptor_t*)&result->ptr_result_u.ptr); + if (resource_mg_create(&rm_cudnn_filters, (void*)result->ptr_result_u.ptr) != 0) { + LOGE(LOG_ERROR, "error in resource manager"); + } + GSCHED_RELEASE; + RECORD_RESULT(ptr_result_u, *result); + return 1; +} + +bool_t rpc_cudnnsetfilter4ddescriptor_1_svc(ptr filterDesc, int dataType, int format, int k, int c, int h, int w, int *result, struct svc_req *rqstp) +{ + RECORD_API(rpc_cudnnsetfilter4ddescriptor_1_argument); + RECORD_NARG(filterDesc); + RECORD_NARG(dataType); + RECORD_NARG(format); + RECORD_NARG(k); + RECORD_NARG(c); + RECORD_NARG(h); + RECORD_NARG(w); + + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + *result = cudnnSetFilter4dDescriptor( + (cudnnFilterDescriptor_t)resource_mg_get(&rm_cudnn_filters, (void*)filterDesc), + (cudnnDataType_t)dataType, + (cudnnTensorFormat_t)format, + k, c, h, w); + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t rpc_cudnngetfilter4ddescriptor_1_svc(ptr filterDesc, int6_result *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "%s", 
__FUNCTION__); + + GSCHED_RETAIN; + result->err = cudnnGetFilter4dDescriptor( + (cudnnFilterDescriptor_t)resource_mg_get(&rm_cudnn_filters, (void*)filterDesc), + (cudnnDataType_t*)&result->int6_result_u.data[0], + (cudnnTensorFormat_t*)&result->int6_result_u.data[1], + &result->int6_result_u.data[2], + &result->int6_result_u.data[3], + &result->int6_result_u.data[4], + &result->int6_result_u.data[5]); + GSCHED_RELEASE; + return 1; +} + +bool_t rpc_cudnnsetfilternddescriptor_1_svc(ptr filterDesc, int dataType, int format, int nbDims, mem_data filterDimA, int *result, struct svc_req *rqstp) +{ + RECORD_API(rpc_cudnnsetfilternddescriptor_1_argument); + RECORD_NARG(filterDesc); + RECORD_NARG(dataType); + RECORD_NARG(format); + RECORD_NARG(nbDims); + RECORD_NARG(filterDimA); + + //TODO: Recording filterDimA is not as easy as done here. + + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + if (filterDimA.mem_data_len != nbDims * sizeof(int)) { + LOGE(LOG_ERROR, "array dimension not as expected."); + return 0; + } + GSCHED_RETAIN; + *result = cudnnSetFilterNdDescriptor( + (cudnnFilterDescriptor_t)resource_mg_get(&rm_cudnn_filters, (void*)filterDesc), + (cudnnDataType_t)dataType, + (cudnnTensorFormat_t)format, + nbDims, + (const int*)filterDimA.mem_data_val); + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t rpc_cudnngetfilternddescriptor_1_svc(ptr filterDesc, int nbDimsRequested, mem_result *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + result->mem_result_u.data.mem_data_len = sizeof(cudnnDataType_t) + sizeof(cudnnTensorFormat_t) + sizeof(int) + nbDimsRequested*sizeof(int); + if ((result->mem_result_u.data.mem_data_val = malloc(result->mem_result_u.data.mem_data_len)) == NULL) { + LOGE(LOG_ERROR, "malloc failed"); + return 0; + } + + GSCHED_RETAIN; + result->err = cudnnGetFilterNdDescriptor( + (cudnnFilterDescriptor_t)resource_mg_get(&rm_cudnn_filters, (void*)filterDesc), + nbDimsRequested, + (cudnnDataType_t*)result->mem_result_u.data.mem_data_val, + (cudnnTensorFormat_t*)&result->mem_result_u.data.mem_data_val[sizeof(cudnnDataType_t)], + (int*)&result->mem_result_u.data.mem_data_val[sizeof(cudnnDataType_t)+sizeof(cudnnTensorDescriptor_t)], + (int*)&result->mem_result_u.data.mem_data_val[sizeof(cudnnDataType_t)+sizeof(cudnnTensorDescriptor_t)+sizeof(int)]); + GSCHED_RELEASE; + return 1; +} + +bool_t rpc_cudnngetfiltersizeinbytes_1_svc(ptr filterDesc, sz_result *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + GSCHED_RETAIN; + result->err = cudnnGetFilterSizeInBytes( + (cudnnFilterDescriptor_t)resource_mg_get(&rm_cudnn_filters, (void*)filterDesc), + &result->sz_result_u.data); + GSCHED_RELEASE; + return 1; +} + +bool_t rpc_cudnntransformfilter_1_svc(ptr handle, ptr transDesc, cudnn_scaling_t alpha, ptr srcDesc, ptr srcData, cudnn_scaling_t beta, ptr destDesc, ptr destData, int *result, struct svc_req *rqstp) +{ + RECORD_API(rpc_cudnntransformfilter_1_argument); + RECORD_NARG(handle); + RECORD_NARG(transDesc); + RECORD_NARG(alpha); + RECORD_NARG(srcDesc); + RECORD_NARG(srcData); + RECORD_NARG(beta); + RECORD_NARG(destDesc); + RECORD_NARG(destData); + + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + *result = cudnnTransformFilter( + (cudnnHandle_t)resource_mg_get(&rm_cudnn, (void*)handle), + (const cudnnTensorTransformDescriptor_t)resource_mg_get(&rm_cudnn_tensortransform, (void*)transDesc), + (alpha.dataType == CUDNN_DATA_DOUBLE ? 
(const void*)&alpha.cudnn_scaling_t_u.d : (const void*)&alpha.cudnn_scaling_t_u.f), + (const cudnnFilterDescriptor_t)resource_mg_get(&rm_cudnn_filters, (void*)srcDesc), + (const void*)srcData, + (beta.dataType == CUDNN_DATA_DOUBLE ? (const void*)&beta.cudnn_scaling_t_u.d : (const void*)&beta.cudnn_scaling_t_u.f), + (const cudnnFilterDescriptor_t)resource_mg_get(&rm_cudnn_filters, (void*)destDesc), + (void*)destData); + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t rpc_cudnndestroyfilterdescriptor_1_svc(ptr filterDesc, int *result, struct svc_req *rqstp) +{ + RECORD_API(ptr); + RECORD_SINGLE_ARG(filterDesc); + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + *result = cudnnDestroyFilterDescriptor( + (cudnnFilterDescriptor_t)resource_mg_get(&rm_cudnn_filters, (void*)filterDesc)); + // TODO: Remove from resource manager + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t rpc_cudnncreatepoolingdescriptor_1_svc(ptr_result *result, struct svc_req *rqstp) +{ + RECORD_VOID_API; + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + result->err = cudnnCreatePoolingDescriptor((cudnnPoolingDescriptor_t*)&result->ptr_result_u.ptr); + if (resource_mg_create(&rm_cudnn_poolings, (void*)result->ptr_result_u.ptr) != 0) { + LOGE(LOG_ERROR, "error in resource manager"); + } + GSCHED_RELEASE; + RECORD_RESULT(ptr_result_u, *result); + return 1; +} + +bool_t rpc_cudnnsetpooling2ddescriptor_1_svc(ptr poolingDesc, int mode, int maxpoolingNanOpt, int windowHeight, int windowWidth, int verticalPadding, int horizontalPadding, int verticalStride, int horizontalStride, int *result, struct svc_req *rqstp) +{ + RECORD_API(rpc_cudnnsetpooling2ddescriptor_1_argument); + RECORD_NARG(poolingDesc); + RECORD_NARG(mode); + RECORD_NARG(maxpoolingNanOpt); + RECORD_NARG(windowHeight); + RECORD_NARG(windowWidth); + RECORD_NARG(verticalPadding); + RECORD_NARG(horizontalPadding); + RECORD_NARG(verticalStride); + RECORD_NARG(horizontalStride); + + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + *result = cudnnSetPooling2dDescriptor( + (cudnnPoolingDescriptor_t)resource_mg_get(&rm_cudnn_poolings, (void*)poolingDesc), + (cudnnPoolingMode_t)mode, + (cudnnNanPropagation_t)maxpoolingNanOpt, + windowHeight, windowWidth, + verticalPadding, horizontalPadding, + verticalStride, horizontalStride); + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t rpc_cudnngetpooling2ddescriptor_1_svc(ptr poolingDesc, int8_result *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + result->err = cudnnGetPooling2dDescriptor( + (cudnnPoolingDescriptor_t)resource_mg_get(&rm_cudnn_poolings, (void*)poolingDesc), + (cudnnPoolingMode_t*)&result->int8_result_u.data[0], + (cudnnNanPropagation_t*)&result->int8_result_u.data[1], + &result->int8_result_u.data[2], + &result->int8_result_u.data[3], + &result->int8_result_u.data[4], + &result->int8_result_u.data[5], + &result->int8_result_u.data[6], + &result->int8_result_u.data[7]); + GSCHED_RELEASE; + return 1; +} + +bool_t rpc_cudnnsetpoolingnddescriptor_1_svc(ptr poolingDesc, int mode, int maxpoolingNanOpt, int nbDims, mem_data windowDimA, mem_data paddingA, mem_data strideA, int *result, struct svc_req *rqstp) +{ + RECORD_API(rpc_cudnnsetpoolingnddescriptor_1_argument); + RECORD_NARG(poolingDesc); + RECORD_NARG(mode); + RECORD_NARG(maxpoolingNanOpt); + RECORD_NARG(nbDims); + RECORD_NARG(windowDimA); + RECORD_NARG(paddingA); + RECORD_NARG(strideA); + //TODO: Recording 
windowDimA, paddingA and strideA are not as easy as done here. + + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + if (windowDimA.mem_data_len != nbDims * sizeof(int) || + paddingA.mem_data_len != nbDims * sizeof(int) || + strideA.mem_data_len != nbDims * sizeof(int)) { + LOGE(LOG_ERROR, "array dimensions not as expected."); + return 0; + } + GSCHED_RETAIN; + *result = cudnnSetPoolingNdDescriptor( + (cudnnPoolingDescriptor_t)resource_mg_get(&rm_cudnn_poolings, (void*)poolingDesc), + (cudnnPoolingMode_t)mode, + (cudnnNanPropagation_t)maxpoolingNanOpt, + nbDims, + (const int*)windowDimA.mem_data_val, + (const int*)paddingA.mem_data_val, + (const int*)strideA.mem_data_val); + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t rpc_cudnngetpoolingnddescriptor_1_svc(ptr poolingDesc, int nbDimsRequested, mem_result *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + result->mem_result_u.data.mem_data_len = sizeof(cudnnPoolingMode_t) + sizeof(cudnnNanPropagation_t) + nbDimsRequested * sizeof(int) * 3; + if ((result->mem_result_u.data.mem_data_val = malloc(result->mem_result_u.data.mem_data_len)) == NULL) { + LOGE(LOG_ERROR, "malloc failed"); + return 0; + } + + size_t offsets[] = { + 0, + sizeof(cudnnPoolingMode_t), + sizeof(cudnnPoolingMode_t) + sizeof(cudnnNanPropagation_t), + sizeof(cudnnPoolingMode_t) + sizeof(cudnnNanPropagation_t) + sizeof(int), + sizeof(cudnnPoolingMode_t) + sizeof(cudnnNanPropagation_t) + sizeof(int) + sizeof(int) * nbDimsRequested, + sizeof(cudnnPoolingMode_t) + sizeof(cudnnNanPropagation_t) + sizeof(int) + sizeof(int) * nbDimsRequested * 2, + }; + + GSCHED_RETAIN; +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wint-to-pointer-cast" + result->err = cudnnGetPoolingNdDescriptor( + (cudnnPoolingDescriptor_t)resource_mg_get(&rm_cudnn_poolings, (void*)poolingDesc), + nbDimsRequested, + (cudnnPoolingMode_t*)result->mem_result_u.data.mem_data_val[offsets[0]], + (cudnnNanPropagation_t*)result->mem_result_u.data.mem_data_val[offsets[1]], + (int*)result->mem_result_u.data.mem_data_val[offsets[2]], + (int*)result->mem_result_u.data.mem_data_val[offsets[3]], + (int*)result->mem_result_u.data.mem_data_val[offsets[4]], + (int*)result->mem_result_u.data.mem_data_val[offsets[5]]); +#pragma GCC diagnostic pop + + GSCHED_RELEASE; + return 1; +} + +bool_t rpc_cudnngetpoolingndforwardoutputdim_1_svc(ptr poolingDesc, ptr inputTensorDesc, int nbDims, mem_result *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + GSCHED_RETAIN; + result->mem_result_u.data.mem_data_len = sizeof(int) * nbDims; + if ((result->mem_result_u.data.mem_data_val = malloc(result->mem_result_u.data.mem_data_len)) == NULL) { + LOGE(LOG_ERROR, "malloc failed"); + return 0; + } + result->err = cudnnGetPoolingNdForwardOutputDim( + (cudnnPoolingDescriptor_t)resource_mg_get(&rm_cudnn_poolings, (void*)poolingDesc), + (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)inputTensorDesc), + nbDims, + (int*)result->mem_result_u.data.mem_data_val); + GSCHED_RELEASE; + return 1; +} + +bool_t rpc_cudnngetpooling2dforwardoutputdim_1_svc(ptr poolingDesc, ptr inputTensorDesc, int4_result *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + GSCHED_RETAIN; + result->err = cudnnGetPooling2dForwardOutputDim( + (cudnnPoolingDescriptor_t)resource_mg_get(&rm_cudnn_poolings, (void*)poolingDesc), + (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)inputTensorDesc), + (int*)&result->int4_result_u.data[0], + 
(int*)&result->int4_result_u.data[1], + (int*)&result->int4_result_u.data[2], + (int*)&result->int4_result_u.data[3]); + GSCHED_RELEASE; + return 1; +} + +bool_t rpc_cudnndestroypoolingdescriptor_1_svc(ptr poolingDesc, int *result, struct svc_req *rqstp) +{ + RECORD_API(ptr); + RECORD_SINGLE_ARG(poolingDesc); + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + *result = cudnnDestroyPoolingDescriptor( + (cudnnPoolingDescriptor_t)resource_mg_get(&rm_cudnn_poolings, (void*)poolingDesc)); + // TODO: Remove from resource manager + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t rpc_cudnncreateactivationdescriptor_1_svc(ptr_result *result, struct svc_req *rqstp) +{ + RECORD_VOID_API; + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + result->err = cudnnCreateActivationDescriptor((cudnnActivationDescriptor_t*)&result->ptr_result_u.ptr); + if (resource_mg_create(&rm_cudnn_activations, (void*)result->ptr_result_u.ptr) != 0) { + LOGE(LOG_ERROR, "error in resource manager"); + } + GSCHED_RELEASE; + RECORD_RESULT(ptr_result_u, *result); + return 1; +} + +bool_t rpc_cudnnsetactivationdescriptor_1_svc(ptr activationDesc, int mode, int reluNanOpt, double coef, int *result, struct svc_req *rqstp) +{ + RECORD_API(rpc_cudnnsetactivationdescriptor_1_argument); + RECORD_NARG(activationDesc); + RECORD_NARG(mode); + RECORD_NARG(reluNanOpt); + RECORD_NARG(coef); + + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + *result = cudnnSetActivationDescriptor( + (cudnnActivationDescriptor_t)resource_mg_get(&rm_cudnn_activations, (void*)activationDesc), + (cudnnActivationMode_t)mode, + (cudnnNanPropagation_t)reluNanOpt, + coef); + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t rpc_cudnngetactivationdescriptor_1_svc(ptr activationDesc, int2d1_result *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + result->err = cudnnGetActivationDescriptor( + (cudnnActivationDescriptor_t)resource_mg_get(&rm_cudnn_activations, (void*)activationDesc), + (cudnnActivationMode_t*)&result->int2d1_result_u.data.i[0], + (cudnnNanPropagation_t*)&result->int2d1_result_u.data.i[1], + &result->int2d1_result_u.data.d); + GSCHED_RELEASE; + return 1; +} + +bool_t rpc_cudnnsetactivationdescriptorswishbeta_1_svc(ptr activationDesc, double swish_beta, int *result, struct svc_req *rqstp) +{ + RECORD_API(rpc_cudnnsetactivationdescriptorswishbeta_1_argument); + RECORD_NARG(activationDesc); + RECORD_NARG(swish_beta); + + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + *result = cudnnSetActivationDescriptorSwishBeta( + (cudnnActivationDescriptor_t)resource_mg_get(&rm_cudnn_activations, (void*)activationDesc), + swish_beta); + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t rpc_cudnngetactivationdescriptorswishbeta_1_svc(ptr activationDesc, d_result *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + result->err = cudnnGetActivationDescriptorSwishBeta( + (cudnnActivationDescriptor_t)resource_mg_get(&rm_cudnn_activations, (void*)activationDesc), + &result->d_result_u.data); + GSCHED_RELEASE; + return 1; +} + +bool_t rpc_cudnndestroyactivationdescriptor_1_svc(ptr activationDesc, int *result, struct svc_req *rqstp) +{ + RECORD_API(ptr); + RECORD_SINGLE_ARG(activationDesc); + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + *result = cudnnDestroyActivationDescriptor( + (cudnnActivationDescriptor_t)resource_mg_get(&rm_cudnn_activations, 
(void*)activationDesc)); + // TODO: Remove from resource manager + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t rpc_cudnncreatelrndescriptor_1_svc(ptr_result *result, struct svc_req *rqstp) +{ + RECORD_VOID_API; + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + result->err = cudnnCreateLRNDescriptor((cudnnLRNDescriptor_t*)&result->ptr_result_u.ptr); + if (resource_mg_create(&rm_cudnn_lrns, (void*)result->ptr_result_u.ptr) != 0) { + LOGE(LOG_ERROR, "error in resource manager"); + } + GSCHED_RELEASE; + RECORD_RESULT(ptr_result_u, *result); + return 1; +} + +bool_t rpc_cudnnsetlrndescriptor_1_svc(ptr normDesc, unsigned lrnN, double lrnAlpha, double lrnBeta, double lrnK, int *result, struct svc_req *rqstp) +{ + RECORD_API(rpc_cudnnsetlrndescriptor_1_argument); + RECORD_NARG(normDesc); + RECORD_NARG(lrnN); + RECORD_NARG(lrnAlpha); + RECORD_NARG(lrnBeta); + RECORD_NARG(lrnK); + + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + *result = cudnnSetLRNDescriptor( + (cudnnLRNDescriptor_t)resource_mg_get(&rm_cudnn_lrns, (void*)normDesc), + lrnN, + lrnAlpha, + lrnBeta, + lrnK); + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t rpc_cudnngetlrndescriptor_1_svc(ptr normDesc, int1d3_result *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + result->err = cudnnGetLRNDescriptor( + (cudnnLRNDescriptor_t)resource_mg_get(&rm_cudnn_lrns, (void*)normDesc), + (unsigned int*)&result->int1d3_result_u.data.i, + &result->int1d3_result_u.data.d[0], + &result->int1d3_result_u.data.d[1], + &result->int1d3_result_u.data.d[2]); + GSCHED_RELEASE; + return 1; +} + +bool_t rpc_cudnndestroylrndescriptor_1_svc(ptr lrnDesc, int *result, struct svc_req *rqstp) +{ + RECORD_API(ptr); + RECORD_SINGLE_ARG(lrnDesc); + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + *result = cudnnDestroyLRNDescriptor( + (cudnnLRNDescriptor_t)resource_mg_get(&rm_cudnn_lrns, (void*)lrnDesc)); + // TODO: Remove from resource manager + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t rpc_cudnnpoolingforward_1_svc(ptr handle, ptr poolingDesc, cudnn_scaling_t alpha, ptr xDesc, ptr x, cudnn_scaling_t beta, ptr yDesc, ptr y, int *result, struct svc_req *rqstp) +{ + RECORD_API(rpc_cudnnpoolingforward_1_argument); + RECORD_NARG(handle); + RECORD_NARG(poolingDesc); + RECORD_NARG(alpha); + RECORD_NARG(xDesc); + RECORD_NARG(x); + RECORD_NARG(beta); + RECORD_NARG(yDesc); + RECORD_NARG(y); + + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + GSCHED_RETAIN; + *result = cudnnPoolingForward( + (cudnnHandle_t)resource_mg_get(&rm_cudnn, (void*)handle), + (cudnnPoolingDescriptor_t)resource_mg_get(&rm_cudnn_poolings, (void*)poolingDesc), + (alpha.dataType == CUDNN_DATA_DOUBLE ? (const void*)&alpha.cudnn_scaling_t_u.d : (const void*)&alpha.cudnn_scaling_t_u.f), + (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)xDesc), + (const void*)resource_mg_get(&rm_memory, (void*)x), + (beta.dataType == CUDNN_DATA_DOUBLE ? 
(const void*)&beta.cudnn_scaling_t_u.d : (const void*)&beta.cudnn_scaling_t_u.f), + (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)yDesc), + (void*)resource_mg_get(&rm_memory, (void*)y)); + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t rpc_cudnnactivationforward_1_svc(ptr handle, ptr activationDesc, cudnn_scaling_t alpha, ptr xDesc, ptr x, cudnn_scaling_t beta, ptr yDesc, ptr y, int *result, struct svc_req *rqstp) +{ + RECORD_API(rpc_cudnnactivationforward_1_argument); + RECORD_NARG(handle); + RECORD_NARG(activationDesc); + RECORD_NARG(alpha); + RECORD_NARG(xDesc); + RECORD_NARG(x); + RECORD_NARG(beta); + RECORD_NARG(yDesc); + RECORD_NARG(y); + + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + GSCHED_RETAIN; + *result = cudnnActivationForward( + (cudnnHandle_t)resource_mg_get(&rm_cudnn, (void*)handle), + (cudnnActivationDescriptor_t)resource_mg_get(&rm_cudnn_activations, (void*)activationDesc), + (alpha.dataType == CUDNN_DATA_DOUBLE ? (const void*)&alpha.cudnn_scaling_t_u.d : (const void*)&alpha.cudnn_scaling_t_u.f), + (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)xDesc), + (const void*)resource_mg_get(&rm_memory, (void*)x), + (beta.dataType == CUDNN_DATA_DOUBLE ? (const void*)&beta.cudnn_scaling_t_u.d : (const void*)&beta.cudnn_scaling_t_u.f), + (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)yDesc), + (void*)resource_mg_get(&rm_memory, (void*)y)); + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t rpc_cudnnlrncrosschannelforward_1_svc(ptr handle, ptr normDesc, int lrnMode, cudnn_scaling_t alpha, ptr xDesc, ptr x, cudnn_scaling_t beta, ptr yDesc, ptr y, int *result, struct svc_req *rqstp) +{ + RECORD_API(rpc_cudnnlrncrosschannelforward_1_argument); + RECORD_NARG(handle); + RECORD_NARG(normDesc); + RECORD_NARG(lrnMode); + RECORD_NARG(alpha); + RECORD_NARG(xDesc); + RECORD_NARG(x); + RECORD_NARG(beta); + RECORD_NARG(yDesc); + RECORD_NARG(y); + + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + GSCHED_RETAIN; + *result = cudnnLRNCrossChannelForward( + (cudnnHandle_t)resource_mg_get(&rm_cudnn, (void*)handle), + (cudnnLRNDescriptor_t)resource_mg_get(&rm_cudnn_lrns, (void*)normDesc), + (cudnnLRNMode_t)lrnMode, + (alpha.dataType == CUDNN_DATA_DOUBLE ? (const void*)&alpha.cudnn_scaling_t_u.d : (const void*)&alpha.cudnn_scaling_t_u.f), + (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)xDesc), + (const void*)resource_mg_get(&rm_memory, (void*)x), + (beta.dataType == CUDNN_DATA_DOUBLE ? (const void*)&beta.cudnn_scaling_t_u.d : (const void*)&beta.cudnn_scaling_t_u.f), + (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)yDesc), + (void*)resource_mg_get(&rm_memory, (void*)y)); + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t rpc_cudnnsoftmaxforward_1_svc(ptr handle, int algo, int mode, cudnn_scaling_t alpha, ptr xDesc, ptr x, cudnn_scaling_t beta, ptr yDesc, ptr y, int *result, struct svc_req *rqstp) +{ + RECORD_API(rpc_cudnnsoftmaxforward_1_argument); + RECORD_NARG(handle); + RECORD_NARG(algo); + RECORD_NARG(mode); + RECORD_NARG(alpha); + RECORD_NARG(xDesc); + RECORD_NARG(x); + RECORD_NARG(beta); + RECORD_NARG(yDesc); + RECORD_NARG(y); + + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + GSCHED_RETAIN; + *result = cudnnSoftmaxForward( + (cudnnHandle_t)resource_mg_get(&rm_cudnn, (void*)handle), + (cudnnSoftmaxAlgorithm_t)algo, + (cudnnSoftmaxMode_t)mode, + (alpha.dataType == CUDNN_DATA_DOUBLE ? 
(const void*)&alpha.cudnn_scaling_t_u.d : (const void*)&alpha.cudnn_scaling_t_u.f), + (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)xDesc), + (const void*)resource_mg_get(&rm_memory, (void*)x), + (beta.dataType == CUDNN_DATA_DOUBLE ? (const void*)&beta.cudnn_scaling_t_u.d : (const void*)&beta.cudnn_scaling_t_u.f), + (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)yDesc), + (void*)resource_mg_get(&rm_memory, (void*)y)); + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +/* cudnn cnn inference */ +bool_t rpc_cudnngetconvolutionndforwardoutputdim_1_svc(ptr convDesc, ptr inputTensorDesc, ptr filterDesc, int nbDims, mem_result *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + GSCHED_RETAIN; + result->mem_result_u.data.mem_data_len = sizeof(int) * nbDims; + if ((result->mem_result_u.data.mem_data_val = malloc(result->mem_result_u.data.mem_data_len)) == NULL) { + LOGE(LOG_ERROR, "malloc failed"); + return 0; + } + result->err = cudnnGetConvolutionNdForwardOutputDim( + (cudnnConvolutionDescriptor_t)resource_mg_get(&rm_cudnn_convs, (void*)convDesc), + (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)inputTensorDesc), + (cudnnFilterDescriptor_t)resource_mg_get(&rm_cudnn_filters, (void*)filterDesc), + nbDims, + (int*)result->mem_result_u.data.mem_data_val); + GSCHED_RELEASE; + return 1; +} + +bool_t rpc_cudnncreateconvolutiondescriptor_1_svc(ptr_result *result, struct svc_req *rqstp) +{ + RECORD_VOID_API; + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + result->err = cudnnCreateConvolutionDescriptor((cudnnConvolutionDescriptor_t*)&result->ptr_result_u.ptr); + if (resource_mg_create(&rm_cudnn_convs, (void*)result->ptr_result_u.ptr) != 0) { + LOGE(LOG_ERROR, "error in resource manager"); + } + GSCHED_RELEASE; + RECORD_RESULT(ptr_result_u, *result); + return 1; +} + +bool_t rpc_cudnndestroyconvolutiondescriptor_1_svc(ptr convDesc, int *result, struct svc_req *rqstp) +{ + RECORD_API(ptr); + RECORD_SINGLE_ARG(convDesc); + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + *result = cudnnDestroyConvolutionDescriptor( + (cudnnConvolutionDescriptor_t)resource_mg_get(&rm_cudnn_convs, (void*)convDesc)); + // TODO: Remove from resource manager + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t rpc_cudnnsetconvolutionnddescriptor_1_svc(ptr convDesc, int arrayLength, mem_data padA, mem_data filterStrideA, mem_data dilationA, int mode, int computeType, int *result, struct svc_req *rqstp) +{ + RECORD_API(rpc_cudnnsetconvolutionnddescriptor_1_argument); + RECORD_NARG(convDesc); + RECORD_NARG(arrayLength); + RECORD_NARG(padA); + RECORD_NARG(filterStrideA); + RECORD_NARG(dilationA); + RECORD_NARG(mode); + RECORD_NARG(computeType); + //TODO: Recording mem_data is not as easy as done here. 
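+    // padA, filterStrideA and dilationA arrive as raw XDR byte buffers; their
+    // lengths are checked against arrayLength below before casting to int arrays.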
+ + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + if (padA.mem_data_len != arrayLength * sizeof(int) || + filterStrideA.mem_data_len != arrayLength * sizeof(int) || + dilationA.mem_data_len != arrayLength * sizeof(int)) { + LOGE(LOG_ERROR, "array dimensions not as expected."); + return 0; + } + GSCHED_RETAIN; + *result = cudnnSetConvolutionNdDescriptor( + (cudnnConvolutionDescriptor_t)resource_mg_get(&rm_cudnn_convs, (void*)convDesc), + arrayLength, + (const int*)padA.mem_data_val, + (const int*)filterStrideA.mem_data_val, + (const int*)dilationA.mem_data_val, + (cudnnConvolutionMode_t)mode, + (cudnnDataType_t)computeType); + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t rpc_cudnngetconvolutionforwardalgorithm_v7_1_svc(ptr handle, ptr srcDesc, ptr filterDesc, ptr convDesc, ptr destDesc, int requestedAlgoCount, mem_result *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + GSCHED_RETAIN; + result->mem_result_u.data.mem_data_len = sizeof(int) + sizeof(cudnnConvolutionFwdAlgoPerf_t) * requestedAlgoCount; + if ((result->mem_result_u.data.mem_data_val = malloc(result->mem_result_u.data.mem_data_len)) == NULL) { + LOGE(LOG_ERROR, "malloc failed"); + return 0; + } + result->err = cudnnGetConvolutionForwardAlgorithm_v7( + (cudnnHandle_t)resource_mg_get(&rm_cudnn, (void*)handle), + (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)srcDesc), + (cudnnFilterDescriptor_t)resource_mg_get(&rm_cudnn_filters, (void*)filterDesc), + (cudnnConvolutionDescriptor_t)resource_mg_get(&rm_cudnn_convs, (void*)convDesc), + (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)destDesc), + requestedAlgoCount, + (int*)result->mem_result_u.data.mem_data_val, + (cudnnConvolutionFwdAlgoPerf_t*)(result->mem_result_u.data.mem_data_val + sizeof(int))); + GSCHED_RELEASE; + return 1; +} + +bool_t rpc_cudnnfindconvolutionforwardalgorithm_1_svc(ptr handle, ptr xDesc, ptr wDesc, ptr convDesc, ptr yDesc, int requestedAlgoCount, mem_result *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + GSCHED_RETAIN; + result->mem_result_u.data.mem_data_len = sizeof(int) + sizeof(cudnnConvolutionFwdAlgoPerf_t) * requestedAlgoCount; + if ((result->mem_result_u.data.mem_data_val = malloc(result->mem_result_u.data.mem_data_len)) == NULL) { + LOGE(LOG_ERROR, "malloc failed"); + return 0; + } + result->err = cudnnFindConvolutionForwardAlgorithm( + (cudnnHandle_t)resource_mg_get(&rm_cudnn, (void*)handle), + (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)xDesc), + (cudnnFilterDescriptor_t)resource_mg_get(&rm_cudnn_filters, (void*)wDesc), + (cudnnConvolutionDescriptor_t)resource_mg_get(&rm_cudnn_convs, (void*)convDesc), + (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)yDesc), + requestedAlgoCount, + (int*)result->mem_result_u.data.mem_data_val, + (cudnnConvolutionFwdAlgoPerf_t*)(result->mem_result_u.data.mem_data_val + sizeof(int))); + GSCHED_RELEASE; + return 1; +} + +bool_t rpc_cudnngetconvolutionforwardworkspacesize_1_svc(ptr handle, ptr xDesc, ptr wDesc, ptr convDesc, ptr yDesc, int algo, sz_result *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + GSCHED_RETAIN; + result->err = cudnnGetConvolutionForwardWorkspaceSize( + (cudnnHandle_t)resource_mg_get(&rm_cudnn, (void*)handle), + (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)xDesc), + (cudnnFilterDescriptor_t)resource_mg_get(&rm_cudnn_filters, (void*)wDesc), + 
(cudnnConvolutionDescriptor_t)resource_mg_get(&rm_cudnn_convs, (void*)convDesc), + (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)yDesc), + (cudnnConvolutionFwdAlgo_t)algo, + (size_t*)&result->sz_result_u.data); + GSCHED_RELEASE; + return 1; +} + +bool_t rpc_cudnnconvolutionforward_1_svc(ptr handle, cudnn_scaling_t alpha, ptr xDesc, ptr x, ptr wDesc, ptr w, ptr convDesc, int algo, ptr workSpace, size_t workSpaceSizeInBytes, cudnn_scaling_t beta, ptr yDesc, ptr y, int *result, struct svc_req *rqstp) +{ + RECORD_API(rpc_cudnnconvolutionforward_1_argument); + RECORD_NARG(handle); + RECORD_NARG(alpha); + RECORD_NARG(xDesc); + RECORD_NARG(x); + RECORD_NARG(wDesc); + RECORD_NARG(w); + RECORD_NARG(convDesc); + RECORD_NARG(algo); + RECORD_NARG(workSpace); + RECORD_NARG(workSpaceSizeInBytes); + RECORD_NARG(beta); + RECORD_NARG(yDesc); + RECORD_NARG(y); + + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + GSCHED_RETAIN; + *result = cudnnConvolutionForward( + (cudnnHandle_t)resource_mg_get(&rm_cudnn, (void*)handle), + (alpha.dataType == CUDNN_DATA_DOUBLE ? (const void*)&alpha.cudnn_scaling_t_u.d : (const void*)&alpha.cudnn_scaling_t_u.f), + (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)xDesc), + (const void*)resource_mg_get(&rm_memory, (void*)x), + (cudnnFilterDescriptor_t)resource_mg_get(&rm_cudnn_filters, (void*)wDesc), + (const void*)resource_mg_get(&rm_memory, (void*)w), + (cudnnConvolutionDescriptor_t)resource_mg_get(&rm_cudnn_convs, (void*)convDesc), + algo, + (void*)resource_mg_get(&rm_memory, (void*)workSpace), + workSpaceSizeInBytes, + (beta.dataType == CUDNN_DATA_DOUBLE ? (const void*)&beta.cudnn_scaling_t_u.d : (const void*)&beta.cudnn_scaling_t_u.f), + (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)yDesc), + (void*)resource_mg_get(&rm_memory, (void*)y)); + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t rpc_cudnnaddtensor_1_svc(ptr handle, cudnn_scaling_t alpha, ptr aDesc, ptr A, cudnn_scaling_t beta, ptr cDesc, ptr C, int *result, struct svc_req *rqstp) +{ + RECORD_API(rpc_cudnnaddtensor_1_argument); + RECORD_NARG(handle); + RECORD_NARG(alpha); + RECORD_NARG(aDesc); + RECORD_NARG(A); + RECORD_NARG(beta); + RECORD_NARG(cDesc); + RECORD_NARG(C); + + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + GSCHED_RETAIN; + *result = cudnnAddTensor( + (cudnnHandle_t)resource_mg_get(&rm_cudnn, (void*)handle), + (alpha.dataType == CUDNN_DATA_DOUBLE ? (const void*)&alpha.cudnn_scaling_t_u.d : (const void*)&alpha.cudnn_scaling_t_u.f), + (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)aDesc), + (const void*)resource_mg_get(&rm_memory, (void*)A), + (beta.dataType == CUDNN_DATA_DOUBLE ? (const void*)&beta.cudnn_scaling_t_u.d : (const void*)&beta.cudnn_scaling_t_u.f), + (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)cDesc), + (void*)resource_mg_get(&rm_memory, (void*)C)); + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t rpc_cudnntransformtensor_1_svc(ptr handle, cudnn_scaling_t alpha, ptr xDesc, ptr x, cudnn_scaling_t beta, ptr yDesc, ptr y, int *result, struct svc_req *rqstp) +{ + RECORD_API(rpc_cudnntransformtensor_1_argument); + RECORD_NARG(handle); + RECORD_NARG(alpha); + RECORD_NARG(xDesc); + RECORD_NARG(x); + RECORD_NARG(beta); + RECORD_NARG(yDesc); + RECORD_NARG(y); + + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + GSCHED_RETAIN; + *result = cudnnTransformTensor( + (cudnnHandle_t)resource_mg_get(&rm_cudnn, (void*)handle), + (alpha.dataType == CUDNN_DATA_DOUBLE ? 
(const void*)&alpha.cudnn_scaling_t_u.d : (const void*)&alpha.cudnn_scaling_t_u.f), + (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)xDesc), + (const void*)resource_mg_get(&rm_memory, (void*)x), + (beta.dataType == CUDNN_DATA_DOUBLE ? (const void*)&beta.cudnn_scaling_t_u.d : (const void*)&beta.cudnn_scaling_t_u.f), + (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)yDesc), + (void*)resource_mg_get(&rm_memory, (void*)y)); + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +static const size_t backendAttributeSizes[] = { + [CUDNN_TYPE_HANDLE] = sizeof(cudnnHandle_t), + [CUDNN_TYPE_DATA_TYPE] = sizeof(cudnnDataType_t), + [CUDNN_TYPE_BOOLEAN] = sizeof(bool), + [CUDNN_TYPE_INT64] = sizeof(int64_t), + [CUDNN_TYPE_FLOAT] = sizeof(float), + [CUDNN_TYPE_DOUBLE] = sizeof(double), + [CUDNN_TYPE_VOID_PTR] = sizeof(void *), + [CUDNN_TYPE_CONVOLUTION_MODE] = sizeof(cudnnConvolutionMode_t), + [CUDNN_TYPE_HEUR_MODE] = sizeof(cudnnBackendHeurMode_t), + [CUDNN_TYPE_KNOB_TYPE] = sizeof(cudnnBackendKnobType_t), + [CUDNN_TYPE_NAN_PROPOGATION] = sizeof(cudnnNanPropagation_t), + [CUDNN_TYPE_NUMERICAL_NOTE] = sizeof(cudnnBackendNumericalNote_t), + [CUDNN_TYPE_LAYOUT_TYPE] = sizeof(cudnnBackendLayoutType_t), + [CUDNN_TYPE_ATTRIB_NAME] = sizeof(cudnnBackendAttributeName_t), + [CUDNN_TYPE_POINTWISE_MODE] = sizeof(cudnnPointwiseMode_t), + [CUDNN_TYPE_BACKEND_DESCRIPTOR] = sizeof(cudnnBackendDescriptor_t), + [CUDNN_TYPE_GENSTATS_MODE] = sizeof(cudnnGenStatsMode_t), + [CUDNN_TYPE_BN_FINALIZE_STATS_MODE] = sizeof(cudnnBnFinalizeStatsMode_t), + [CUDNN_TYPE_REDUCTION_OPERATOR_TYPE] = sizeof(cudnnReduceTensorOp_t), + [CUDNN_TYPE_BEHAVIOR_NOTE] = sizeof(cudnnBackendBehaviorNote_t), + [CUDNN_TYPE_TENSOR_REORDERING_MODE] = sizeof(cudnnBackendTensorReordering_t), + [CUDNN_TYPE_RESAMPLE_MODE] = sizeof(cudnnResampleMode_t), + [CUDNN_TYPE_PADDING_MODE] = sizeof(cudnnPaddingMode_t), + [CUDNN_TYPE_INT32] = sizeof(int32_t), + [CUDNN_TYPE_CHAR] = sizeof(char), + [CUDNN_TYPE_SIGNAL_MODE] = sizeof(cudnnSignalMode_t), + [CUDNN_TYPE_FRACTION] = sizeof(cudnnFraction_t), + [CUDNN_TYPE_NORM_MODE] = sizeof(cudnnBackendNormMode_t), + [CUDNN_TYPE_NORM_FWD_PHASE] = sizeof(cudnnBackendNormFwdPhase_t), + [CUDNN_TYPE_RNG_DISTRIBUTION] = sizeof(cudnnRngDistribution_t), +}; + +bool_t rpc_cudnnbackendcreatedescriptor_1_svc(int descriptorType, ptr_result *result, struct svc_req *rqstp) +{ + RECORD_API(int); + RECORD_SINGLE_ARG(descriptorType); + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + result->err = cudnnBackendCreateDescriptor( + (cudnnBackendDescriptorType_t)descriptorType, + (cudnnBackendDescriptor_t*)&result->ptr_result_u.ptr); + if (resource_mg_create(&rm_cudnn_backendds, (void*)result->ptr_result_u.ptr) != 0) { + LOGE(LOG_ERROR, "error in resource manager"); + } + GSCHED_RELEASE; + RECORD_RESULT(ptr_result_u, *result); + return 1; +} + +bool_t rpc_cudnnbackenddestroydescriptor_1_svc(ptr descriptor, int *result, struct svc_req *rqstp) +{ + RECORD_API(ptr); + RECORD_SINGLE_ARG(descriptor); + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + *result = cudnnBackendDestroyDescriptor( + (cudnnBackendDescriptor_t)resource_mg_get(&rm_cudnn_backendds, (void*)descriptor)); + // TODO: Remove from resource manager + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t rpc_cudnnbackendinitialize_1_svc(ptr descriptor, int *result, struct svc_req *rqstp) +{ + RECORD_API(ptr); + RECORD_SINGLE_ARG(descriptor); + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + 
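+    // As with the other forwarded calls, GSCHED_RETAIN/GSCHED_RELEASE bracket the
+    // cuDNN invocation so the GPU scheduler can arbitrate access to the device.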
GSCHED_RETAIN; + *result = cudnnBackendInitialize( + (cudnnBackendDescriptor_t)resource_mg_get(&rm_cudnn_backendds, (void*)descriptor)); + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t rpc_cudnnbackendfinalize_1_svc(ptr descriptor, int *result, struct svc_req *rqstp) +{ + RECORD_API(ptr); + RECORD_SINGLE_ARG(descriptor); + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + *result = cudnnBackendFinalize( + (cudnnBackendDescriptor_t)resource_mg_get(&rm_cudnn_backendds, (void*)descriptor)); + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} +bool_t rpc_cudnnbackendsetattribute_1_svc( + ptr descriptor, + int attributeName, + int attributeType, + int64_t elementCount, + mem_data arrayOfElements, + int *result, struct svc_req *rqstp) +{ + RECORD_API(rpc_cudnnbackendsetattribute_1_argument); + RECORD_NARG(descriptor); + RECORD_NARG(attributeName); + RECORD_NARG(attributeType); + RECORD_NARG(elementCount); + RECORD_NARG(arrayOfElements); + + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + if (attributeType < 0 || attributeType >= CUDNN_TYPE_RNG_DISTRIBUTION) { + LOGE(LOG_ERROR, "attributeType out of range."); + return 0; + } + + if (arrayOfElements.mem_data_len != elementCount * backendAttributeSizes[attributeType]) { + LOGE(LOG_ERROR, "array dimensions not as expected."); + return 0; + } + GSCHED_RETAIN; + *result = cudnnBackendSetAttribute( + (cudnnBackendDescriptor_t)resource_mg_get(&rm_cudnn_backendds, (void*)descriptor), + (cudnnBackendAttributeName_t)attributeName, + (cudnnBackendAttributeType_t)attributeType, + elementCount, + arrayOfElements.mem_data_val); + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t rpc_cudnnbackendgetattribute_1_svc(ptr descriptor, int attributeName, int attributeType, int64_t requestedElementCount, mem_result *result, struct svc_req *rqstp) +{ + void *arrayOfElements = NULL; + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + if (attributeType < 0 || attributeType >= CUDNN_TYPE_RNG_DISTRIBUTION) { + LOGE(LOG_ERROR, "attributeType out of range."); + return 0; + } + result->mem_result_u.data.mem_data_len = sizeof(int64_t) + requestedElementCount*sizeof(backendAttributeSizes[attributeType]); + if ((result->mem_result_u.data.mem_data_val = malloc(result->mem_result_u.data.mem_data_len)) == NULL) { + LOGE(LOG_ERROR, "malloc failed"); + return 0; + } + if (requestedElementCount > 0) { + void *data = result->mem_result_u.data.mem_data_val + sizeof(int64_t); + } + + GSCHED_RETAIN; + result->err = cudnnBackendGetAttribute( + (cudnnBackendDescriptor_t)resource_mg_get(&rm_cudnn_backendds, (void*)descriptor), + (cudnnBackendAttributeName_t)attributeName, + (cudnnBackendAttributeType_t)attributeType, + requestedElementCount, + (int64_t*)result->mem_result_u.data.mem_data_val, + arrayOfElements); + + LOGE(LOG_DEBUG, "desc: %p, name: %d, type: %d, requestedElementCount: %zd, elementCount: %zd, arrayOfElements: %p -> %d", descriptor, attributeName, attributeType, requestedElementCount, *result->mem_result_u.data.mem_data_val, arrayOfElements, result->err); + + GSCHED_RELEASE; + return 1; +} +bool_t rpc_cudnnbackendexecute_1_svc(ptr handle, ptr executionPlan, ptr variantPack, int *result, struct svc_req *rqstp) +{ + RECORD_API(rpc_cudnnbackendexecute_1_argument); + RECORD_NARG(handle); + RECORD_NARG(executionPlan); + RECORD_NARG(variantPack); + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + *result = cudnnBackendExecute( + (cudnnHandle_t)resource_mg_get(&rm_cudnn, (void*)handle), + 
(cudnnBackendDescriptor_t)resource_mg_get(&rm_cudnn_backendds, (void*)executionPlan), + (cudnnBackendDescriptor_t)resource_mg_get(&rm_cudnn_backendds, (void*)variantPack)); + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} \ No newline at end of file diff --git a/cpu/cpu-server-cudnn.h b/cpu/cpu-server-cudnn.h new file mode 100644 index 00000000..6c892919 --- /dev/null +++ b/cpu/cpu-server-cudnn.h @@ -0,0 +1,9 @@ +#ifndef _CPU_SERVER_CUDNN_H_ +#define _CPU_SERVER_CUDNN_H_ + +#include "resource-mg.h" + +int server_cudnn_init(int restore); +int server_cudnn_deinit(void); + +#endif // _CPU_SERVER_CUDNN_H_ \ No newline at end of file diff --git a/cpu/cpu-server-driver.c b/cpu/cpu-server-driver.c index f6714b56..4eb2aad4 100644 --- a/cpu/cpu-server-driver.c +++ b/cpu/cpu-server-driver.c @@ -20,16 +20,157 @@ int server_driver_init(int restore) int ret = 0; if (!restore) { - ret &= resource_mg_init(&rm_modules, 1); - ret &= resource_mg_init(&rm_functions, 1); + // we cannot bypass the resource manager for functions and modules + // because CUfunctions and modules are at different locations on server and client + ret &= resource_mg_init(&rm_modules, 0); + ret &= resource_mg_init(&rm_functions, 0); + ret &= resource_mg_init(&rm_globals, 0); } else { ret &= resource_mg_init(&rm_modules, 0); ret &= resource_mg_init(&rm_functions, 0); + ret &= resource_mg_init(&rm_globals, 0); //ret &= server_driver_restore("ckp"); } return ret; } +#include + +// Does not support checkpoint/restart yet +bool_t rpc_elf_load_1_svc(mem_data elf, ptr module_key, int *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "rpc_elf_load(elf: %p, len: %#x, module_key: %#x)", elf.mem_data_val, elf.mem_data_len, module_key); + CUresult res; + CUmodule module = NULL; + + if ((res = cuModuleLoadData(&module, elf.mem_data_val)) != CUDA_SUCCESS) { + LOGE(LOG_ERROR, "cuModuleLoadData failed: %d", res); + *result = res; + return 1; + } + + // We add our module using module_key as key. This means a fatbinaryHandle on the client is translated + // to a CUmodule on the server. 
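The comment above states the central invariant of the new module handling: the client keeps using the handles it obtained locally (fatbinary handle, hostFun pointer), and the server resolves them through the resource manager on every call. A minimal sketch of the whole chain, using resource_mg_get and rm_functions exactly as they are used elsewhere in this patch; the helper name, the fixed launch dimensions and the omitted error handling are illustrative only, and the snippet assumes the includes and globals of this file:

    /* 1. rpc_elf_load (right below):      rm_modules[module_key] -> CUmodule        */
    /* 2. rpc_register_function (further   rm_functions[hostFun]  -> CUfunction      */
    /*    down in this file)                                                          */
    /* 3. launch time: resolve the client handle again before calling the driver API */
    static CUresult launch_by_client_handle(uint64_t hostFun, CUstream stream, void **args)
    {
        CUfunction fn = (CUfunction)resource_mg_get(&rm_functions, (void *)hostFun);
        /* grid/block dimensions elided to 1 for brevity */
        return cuLaunchKernel(fn, 1, 1, 1, 1, 1, 1, 0, stream, args, NULL);
    }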
+ if ((res = resource_mg_add_sorted(&rm_modules, (void*)module_key, (void*)module)) != CUDA_SUCCESS) { + LOGE(LOG_ERROR, "resource_mg_create failed: %d", res); + *result = res; + return 1; + } + + LOGE(LOG_DEBUG, "->module: %p", module); + *result = 0; + return 1; +} + +// Does not support checkpoint/restart yet +// TODO: We should also remove associated function handles +bool_t rpc_elf_unload_1_svc(ptr elf_handle, int *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "rpc_elf_unload(elf_handle: %p)", elf_handle); + CUmodule module = NULL; + CUresult res; + + if ((module = (CUmodule)resource_mg_get(&rm_modules, (void*)elf_handle)) == NULL) { + LOG(LOG_ERROR, "resource_mg_get failed"); + *result = -1; + return 1; + } + + LOGE(LOG_DEBUG,"module: %p", module); + + // if ((res = resource_mg_remove(&rm_modules, (void*)elf_handle)) != CUDA_SUCCESS) { + // LOG(LOG_ERROR, "resource_mg_create failed: %d", res); + // result->err = res; + // return 1; + // } + + if ((res = cuModuleUnload(module)) != CUDA_SUCCESS) { + const char *errstr; + cuGetErrorString(res, &errstr); + LOG(LOG_ERROR, "cuModuleUnload failed: %s (%d)", errstr, res); + *result = res; + return 1; + } + + *result = 0; + return 1; +} + +// Does not support checkpoint/restart yet +bool_t rpc_register_function_1_svc(ptr fatCubinHandle, ptr hostFun, char* deviceFun, + char* deviceName, int thread_limit, ptr_result *result, struct svc_req *rqstp) +{ + void *module = NULL; + RECORD_API(rpc_register_function_1_argument); + RECORD_ARG(1, fatCubinHandle); + RECORD_ARG(2, hostFun); + RECORD_ARG(3, deviceFun); + RECORD_ARG(4, deviceName); + RECORD_ARG(5, thread_limit); + LOG(LOG_DEBUG, "rpc_register_function(fatCubinHandle: %p, hostFun: %p, deviceFun: %s, deviceName: %s, thread_limit: %d)", + fatCubinHandle, hostFun, deviceFun, deviceName, thread_limit); + GSCHED_RETAIN; + //resource_mg_print(&rm_modules); + if ((module = resource_mg_get(&rm_modules, (void*)fatCubinHandle)) == (void*)fatCubinHandle) { + LOGE(LOG_ERROR, "%p not found in resource manager - we cannot call a function from an unknown module.", fatCubinHandle); + result->err = -1; + return 1; + } + result->err = cuModuleGetFunction((CUfunction*)&result->ptr_result_u.ptr, + module, + deviceName); + if (resource_mg_add_sorted(&rm_functions, (void*)hostFun, (void*)result->ptr_result_u.ptr) != 0) { + LOGE(LOG_ERROR, "error in resource manager"); + } + GSCHED_RELEASE; + RECORD_RESULT(ptr_result_u, *result); + return 1; +} + +// Does not support checkpoint/restart yet +bool_t rpc_register_var_1_svc(ptr fatCubinHandle, ptr hostVar, ptr deviceAddress, char *deviceName, int ext, size_t size, + int constant, int global, int *result, struct svc_req *rqstp) +{ + RECORD_API(rpc_register_var_1_argument); + RECORD_ARG(1, fatCubinHandle); + RECORD_ARG(2, hostVar); + RECORD_ARG(3, deviceAddress); + RECORD_ARG(4, deviceName); + RECORD_ARG(5, ext); + RECORD_ARG(6, size); + RECORD_ARG(7, constant); + RECORD_ARG(8, global); + + LOG(LOG_DEBUG, "rpc_register_var(fatCubinHandle: %p, hostVar: %p, deviceAddress: %p, deviceName: %s, " + "ext: %d, size: %d, constant: %d, global: %d)", + fatCubinHandle, hostVar, deviceAddress, deviceName, ext, size, constant, global); + + CUdeviceptr dptr = 0; + size_t d_size = 0; + CUresult res; + void *module = NULL; + GSCHED_RETAIN; + if ((module = resource_mg_get(&rm_modules, (void*)fatCubinHandle)) == (void*)fatCubinHandle) { + LOGE(LOG_ERROR, "%p not found in resource manager - we cannot call a function from an unknown module.", fatCubinHandle); + *result = -1; + return 
1; + } + if ((res = cuModuleGetGlobal(&dptr, &d_size, module, deviceName)) != CUDA_SUCCESS) { + LOGE(LOG_ERROR, "cuModuleGetGlobal failed: %d", res); + *result = 1; + return 1; + } + if (resource_mg_add_sorted(&rm_globals, (void*)hostVar, (void*)dptr) != 0) { + LOGE(LOG_ERROR, "error in resource manager"); + *result = 1; + } else { + *result = 0; + } + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + int server_driver_deinit(void) { resource_mg_free(&rm_modules); @@ -158,6 +299,26 @@ bool_t rpc_cumodulegetfunction_1_svc(uint64_t module, char *name, ptr_result *re return 1; } +bool_t rpc_cumoduleloaddata_1_svc(mem_data mem, ptr_result *result, + struct svc_req *rqstp) +{ + RECORD_API(mem_data); + RECORD_SINGLE_ARG(mem); + LOG(LOG_DEBUG, "%s(%p, %#0zx)", __FUNCTION__, mem.mem_data_val, mem.mem_data_len); + GSCHED_RETAIN; + result->err = cuModuleLoadData((CUmodule*)&result->ptr_result_u.ptr, mem.mem_data_val); + GSCHED_RELEASE; + if (resource_mg_create(&rm_modules, (void*)result->ptr_result_u.ptr) != 0) { + LOGE(LOG_ERROR, "error in resource manager"); + } + if (result->err != 0) { + char *err_str = NULL; + cuGetErrorName(result->err, &err_str); + LOGE(LOG_DEBUG, "cuModuleLoadData result: %s", err_str); + } + RECORD_RESULT(ptr_result_u, *result); + return 1; +} bool_t rpc_cumoduleload_1_svc(char* path, ptr_result *result, struct svc_req *rqstp) { @@ -170,6 +331,11 @@ bool_t rpc_cumoduleload_1_svc(char* path, ptr_result *result, if (resource_mg_create(&rm_modules, (void*)result->ptr_result_u.ptr) != 0) { LOGE(LOG_ERROR, "error in resource manager"); } + if (result->err != 0) { + char *err_str = NULL; + cuGetErrorName(result->err, &err_str); + LOGE(LOG_DEBUG, "cuModuleLoad result: %s", err_str); + } RECORD_RESULT(ptr_result_u, *result); return 1; } @@ -181,7 +347,7 @@ bool_t rpc_cumoduleunload_1_svc(ptr module, int *result, RECORD_SINGLE_ARG(module); LOG(LOG_DEBUG, "%s(%p)", __FUNCTION__, (void*)module); GSCHED_RETAIN; - *result = cuModuleUnload(resource_mg_get(&rm_streams, (void*)module)); + *result = cuModuleUnload(resource_mg_get(&rm_modules, (void*)module)); GSCHED_RELEASE; RECORD_RESULT(integer, *result); return 1; @@ -202,6 +368,45 @@ bool_t rpc_cugeterrorstring_1_svc(int err, str_result *result, return 1; } +bool_t rpc_cudeviceprimaryctxgetstate_1_svc(int dev, dint_result *result, + struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "%s(%d)", __FUNCTION__, dev); + GSCHED_RETAIN; + result->err = cuDevicePrimaryCtxGetState(dev, &(result->dint_result_u.data.i1), + &(result->dint_result_u.data.i2)); + LOGE(LOG_DEBUG, "state: %d, flags: %d", result->dint_result_u.data.i1, + result->dint_result_u.data.i2); + GSCHED_RELEASE; + return 1; +} + +bool_t rpc_cudevicegetproperties_1_svc(int dev, mem_result *result, + struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "%s(%d)", __FUNCTION__, dev); + GSCHED_RETAIN; + if ((result->mem_result_u.data.mem_data_val = malloc(sizeof(CUdevprop))) == NULL) { + result->err = CUDA_ERROR_OUT_OF_MEMORY; + } + result->mem_result_u.data.mem_data_len = sizeof(CUdevprop); + result->err = cuDeviceGetProperties((CUdevprop*)result->mem_result_u.data.mem_data_val, dev); + GSCHED_RELEASE; + return 1; +} + +bool_t rpc_cudevicecomputecapability_1_svc(int dev, dint_result *result, + struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "%s(%d)", __FUNCTION__, dev); + GSCHED_RETAIN; + result->err = cuDeviceComputeCapability(&(result->dint_result_u.data.i1), + &(result->dint_result_u.data.i2), + dev); + GSCHED_RELEASE; + return 1; +} + /* bool_t rpc_cugetexporttable_1_svc(char 
*rpc_uuid, ptr_result *result, struct svc_req *rqstp) @@ -276,7 +481,6 @@ bool_t rpc_culaunchkernel_1_svc(uint64_t f, unsigned int gridDimX, unsigned int void **cuda_args; uint16_t *arg_offsets; size_t param_num; - LOG(LOG_DEBUG, "%s", __FUNCTION__); if (args.mem_data_val == NULL) { LOGE(LOG_ERROR, "param.mem_data_val is NULL"); *result = CUDA_ERROR_INVALID_VALUE; @@ -303,10 +507,15 @@ bool_t rpc_culaunchkernel_1_svc(uint64_t f, unsigned int gridDimX, unsigned int LOGE(LOG_DEBUG, "arg: %p (%d)", *(void**)cuda_args[i], *(int*)cuda_args[i]); } - LOGE(LOG_DEBUG, "cuLaunchKernel(func=%p, gridDim=[%d,%d,%d], blockDim=[%d,%d,%d], args=%p, sharedMem=%d, stream=%p)", f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, cuda_args, sharedMemBytes, (void*)hStream); + LOGE(LOG_DEBUG, "cuLaunchKernel(func=%p->%p, gridDim=[%d,%d,%d], blockDim=[%d,%d,%d], args=%p, sharedMem=%d, stream=%p)", f, resource_mg_get(&rm_functions, (void*)f), gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, cuda_args, sharedMemBytes, (void*)hStream); GSCHED_RETAIN; - *result = cuLaunchKernel((CUfunction)f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, (CUstream)hStream, cuda_args, NULL); + *result = cuLaunchKernel((CUfunction)resource_mg_get(&rm_functions, (void*)f), + gridDimX, gridDimY, gridDimZ, + blockDimX, blockDimY, blockDimZ, + sharedMemBytes, + (CUstream)hStream, + cuda_args, NULL); GSCHED_RELEASE; free(cuda_args); @@ -314,6 +523,15 @@ bool_t rpc_culaunchkernel_1_svc(uint64_t f, unsigned int gridDimX, unsigned int } +bool_t rpc_cudevicegetp2pattribute_1_svc(int attrib, ptr srcDevice, ptr dstDevice, int_result *result, struct svc_req *rqstp) +{ + LOG(LOG_DEBUG, "%s", __FUNCTION__); + GSCHED_RETAIN; + result->err = cuDeviceGetP2PAttribute(&result->int_result_u.data, (CUdevice_P2PAttribute)attrib, (CUdevice)srcDevice, (CUdevice)dstDevice); + GSCHED_RELEASE; + return 1; +} + /* ################## START OF HIDDEN FUNCTIONS IMPL ######################## */ /* diff --git a/cpu/cpu-server-nvml.c b/cpu/cpu-server-nvml.c new file mode 100644 index 00000000..89467618 --- /dev/null +++ b/cpu/cpu-server-nvml.c @@ -0,0 +1,72 @@ +#define _GNU_SOURCE +#include +#include + +#include +#include + +#include "cpu_rpc_prot.h" +#include "cpu-common.h" +#include "cpu-utils.h" +#include "log.h" +#include "resource-mg.h" +#define WITH_RECORDER +#include "api-recorder.h" +#include "gsched.h" + +int server_nvml_init(int restore) +{ + int ret = 0; + if (!restore) { + //ret &= resource_mg_init(&rm_modules, 1); + } else { + //ret &= resource_mg_init(&rm_modules, 0); + //ret &= server_driver_restore("ckp"); + } + return ret; +} + +int server_nvml_deinit(void) +{ + //resource_mg_free(&rm_modules); + return 0; +} + +bool_t rpc_nvmldevicegetcount_v2_1_svc(int_result *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + GSCHED_RETAIN; + // Workaround for pytorch expecting nvmlDeviceGetCount and cudaGetDeviceCount to be the same + //result->err = nvmlDeviceGetCount_v2(&result->int_result_u.data); + result->err = cudaGetDeviceCount(&result->int_result_u.data); + LOGE(LOG_DEBUG, "%s: %d", __FUNCTION__, result->int_result_u.data); + GSCHED_RELEASE; + return 1; +} + +bool_t rpc_nvmlinitwithflags_1_svc(int flags, int *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + GSCHED_RETAIN; + *result = nvmlInitWithFlags(flags); + GSCHED_RELEASE; + return 1; +} + +bool_t rpc_nvmlinit_v2_1_svc(int *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "%s", 
__FUNCTION__); + GSCHED_RETAIN; + *result = nvmlInit_v2(); + GSCHED_RELEASE; + return 1; +} + +bool_t rpc_nvmlshutdown_1_svc(int *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + GSCHED_RETAIN; + *result = nvmlShutdown(); + GSCHED_RELEASE; + return 1; +} \ No newline at end of file diff --git a/cpu/cpu-server-nvml.h b/cpu/cpu-server-nvml.h new file mode 100644 index 00000000..84a8270c --- /dev/null +++ b/cpu/cpu-server-nvml.h @@ -0,0 +1,9 @@ +#ifndef _CPU_SERVER_NVML_H_ +#define _CPU_SERVER_NVML_H_ + +int server_nvml_init(int restore); +int server_nvml_deinit(void); +//int server_nvml_checkpoint(const char *path, int dump_memory, unsigned long prog, unsigned long vers); +//int server_nvml_restore(const char *path); + +#endif //_CPU_SERVER_NVML_H_ diff --git a/cpu/cpu-server-runtime.c b/cpu/cpu-server-runtime.c index 87780856..3d70e0a5 100644 --- a/cpu/cpu-server-runtime.c +++ b/cpu/cpu-server-runtime.c @@ -2,6 +2,8 @@ #include #include #include +#include +#include //for strerror #include @@ -34,13 +36,13 @@ #include "mt-memcpy.h" typedef struct host_alloc_info { - int cnt; + size_t idx; size_t size; void *client_ptr; void *server_ptr; } host_alloc_info_t; static host_alloc_info_t hainfo[64]; -static size_t hainfo_cnt = 1; +static size_t hainfo_cnt = 0; list mt_memcpy_list = {0}; static int hainfo_getserverindex(void *server_ptr) @@ -77,10 +79,21 @@ int server_runtime_init(int restore) ret &= resource_mg_init(&rm_events, 0); ret &= resource_mg_init(&rm_arrays, 0); ret &= resource_mg_init(&rm_memory, 0); + ret &= resource_mg_init(&rm_kernels, 0); ret &= cusolver_init(0, &rm_streams, &rm_memory); ret &= cublas_init(0, &rm_memory); ret &= server_runtime_restore("ckp"); } + + // Make sure runtime API is initialized + // If we don't do this and use the driver API, it might be unintialized + cudaError_t cres; + if ((cres = cudaSetDevice(0)) != cudaSuccess) { + LOG(LOG_ERROR, "cudaSetDevice failed: %d", cres); + ret = 1; + } + cudaDeviceSynchronize(); + return ret; } @@ -90,6 +103,7 @@ int server_runtime_deinit(void) resource_mg_free(&rm_events); resource_mg_free(&rm_arrays); resource_mg_free(&rm_memory); + resource_mg_free(&rm_kernels); cusolver_deinit(); cublas_deinit(); list_free(&mt_memcpy_list); @@ -133,6 +147,42 @@ int server_runtime_restore(const char *path) return 0; } + +/** implementation for CUDA_REGISTER_FUNCTION(ptr, str, str, str, int) + * + */ +bool_t cuda_register_function_1_svc(ptr fatCubinHandle, ptr hostFun, char* deviceFun, char* deviceName, int thread_limit, int* result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "cudaRegisterFunction(%p, %p, %s, %s, %d)", fatCubinHandle, hostFun, deviceFun, deviceName, thread_limit); + + void (*serverFun)(void); + + if ( (serverFun = dlsym(RTLD_NEXT, "dlopen")) == NULL) { + LOGE(LOG_ERROR, "failed to get dlopen %s", dlerror()); + *result = 1; + return 1; + } + + if (resource_mg_add_sorted(&rm_kernels, (void*)hostFun, serverFun) != 0) { + LOGE(LOG_ERROR, "failed to add kernel to resource manager"); + *result = 1; + return 1; + } + LOGE(LOG_DEBUG, "added kernel %p->%p to resource manager", hostFun, serverFun); + // __cudaRegisterFunction(&fatCubinHandle, hostFun, deviceFun, + // deviceName, thread_limit, &tid, &bid, &bDim, &gDim, &wSize); + + // LOGE(LOG_DEBUG, "-> %p, {%d, %d, %d}, {%d, %d, %d}, {%d, %d, %d}, {%d, %d, %d}, %d)", + // fatCubinHandle, + // tid.x, tid.y, tid.z, + // bid.x, bid.y, bid.z, + // bDim.x, bDim.y, bDim.z, + // gDim.x, gDim.y, gDim.z, + // wSize); + *result = 0; + return 1; +} + /* 
############### RUNTIME API ############### */ /* ### Device Management ### */ bool_t cuda_choose_device_1_svc(mem_data prop, int_result *result, struct svc_req *rqstp) @@ -310,19 +360,14 @@ bool_t cuda_get_device_flags_1_svc(int_result *result, struct svc_req *rqstp) return 1; } -bool_t cuda_get_device_properties_1_svc(int device, mem_result *result, struct svc_req *rqstp) +bool_t cuda_get_device_properties_1_svc(int device, cuda_device_prop_result *result, struct svc_req *rqstp) { LOGE(LOG_DEBUG, "cudaGetDeviceProperties"); - result->mem_result_u.data.mem_data_val = malloc(sizeof(struct cudaDeviceProp)); - if (result->mem_result_u.data.mem_data_val == NULL) { - LOGE(LOG_ERROR, "malloc failed."); + if (sizeof(result->cuda_device_prop_result_u.data) != sizeof(struct cudaDeviceProp)) { + LOGE(LOG_ERROR, "cuda_device_prop_result size mismatch"); return 0; } - result->mem_result_u.data.mem_data_len = sizeof(struct cudaDeviceProp); - result->err = cudaGetDeviceProperties((void*)result->mem_result_u.data.mem_data_val, device); - if (result->err != 0) { - free(result->mem_result_u.data.mem_data_val); - } + result->err = cudaGetDeviceProperties((void*)result->cuda_device_prop_result_u.data, device); return 1; } @@ -542,8 +587,14 @@ bool_t cuda_stream_get_priority_1_svc(ptr hStream, int_result *result, struct sv return 1; } -/* Capture API does not make sense without graph API */ -// /* ? CUDA_STREAM_IS_CAPTURING(ptr) = 264;*/ +bool_t cuda_stream_is_capturing_1_svc(ptr stream, int_result *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "cudaStreamIsCapturing"); + result->err = cudaStreamIsCapturing( + resource_mg_get(&rm_streams, (void*)stream), + (enum cudaStreamCaptureStatus*)&result->int_result_u.data); + return 1; +} bool_t cuda_stream_query_1_svc(ptr hStream, int *result, struct svc_req *rqstp) { @@ -770,7 +821,7 @@ bool_t cuda_launch_cooperative_kernel_1_svc(ptr func, rpc_dim3 gridDim, rpc_dim3 LOGE(LOG_DEBUG, "cudaLaunchCooperativeKernel(func=%p, gridDim=[%d,%d,%d], blockDim=[%d,%d,%d], args=%p, sharedMem=%d, stream=%p)", func, cuda_gridDim.x, cuda_gridDim.y, cuda_gridDim.z, cuda_blockDim.x, cuda_blockDim.y, cuda_blockDim.z, cuda_args, sharedMem, (void*)stream); *result = cudaLaunchCooperativeKernel( - (void*)func, + resource_mg_get(&rm_kernels, (void*)func), cuda_gridDim, cuda_blockDim, cuda_args, @@ -781,44 +832,6 @@ bool_t cuda_launch_cooperative_kernel_1_svc(ptr func, rpc_dim3 gridDim, rpc_dim3 return 1; } -bool_t cuda_launch_cooperative_kernel_multi_device_1_svc(ptr func, rpc_dim3 gridDim, rpc_dim3 blockDim, mem_data args, size_t sharedMem, ptr stream, int numDevices, int flags, int *result, struct svc_req *rqstp) -{ - RECORD_API(cuda_launch_cooperative_kernel_multi_device_1_argument); - RECORD_ARG(1, func); - RECORD_ARG(2, gridDim); - RECORD_ARG(3, blockDim); - //TODO: Store parameters explicitly - //RECORD_ARG(4, args); - RECORD_ARG(5, sharedMem); - RECORD_ARG(6, stream); - RECORD_ARG(7, numDevices); - RECORD_ARG(8, flags); - dim3 cuda_gridDim = {gridDim.x, gridDim.y, gridDim.z}; - dim3 cuda_blockDim = {blockDim.x, blockDim.y, blockDim.z}; - void **cuda_args; - uint16_t *arg_offsets; - size_t param_num = *((size_t*)args.mem_data_val); - struct cudaLaunchParams lp; - arg_offsets = (uint16_t*)(args.mem_data_val+sizeof(size_t)); - cuda_args = malloc(param_num*sizeof(void*)); - for (size_t i = 0; i < param_num; ++i) { - cuda_args[i] = args.mem_data_val+sizeof(size_t)+param_num*sizeof(uint16_t)+arg_offsets[i]; - //LOGE(LOG_DEBUG, "arg: %p (%d)\n", *(void**)cuda_args[i], 
*(int*)cuda_args[i]); - } - - LOGE(LOG_DEBUG, "cudaLaunchCooperativeKernelMultiDevice(func=%p, gridDim=[%d,%d,%d], blockDim=[%d,%d,%d], args=%p, sharedMem=%d, stream=%p)", func, cuda_gridDim.x, cuda_gridDim.y, cuda_gridDim.z, cuda_blockDim.x, cuda_blockDim.y, cuda_blockDim.z, cuda_args, sharedMem, (void*)stream); - lp.args = cuda_args; - lp.blockDim = cuda_blockDim; - lp.func = (void*)func; - lp.gridDim = cuda_gridDim; - lp.sharedMem = sharedMem; - lp.stream = resource_mg_get(&rm_streams, (void*)stream); - *result = cudaLaunchCooperativeKernelMultiDevice(&lp, numDevices, flags); - RECORD_RESULT(integer, *result); - LOGE(LOG_DEBUG, "cudaLaunchCooperativeKernelMultiDevice result: %d", *result); - return 1; -} - /* This would require RPCs in the opposite direction. * __host__ cudaError_t cudaLaunchHostFunc ( cudaStream_t stream, cudaHostFn_t fn, void* userData ) * Enqueues a host function call in a stream. @@ -848,15 +861,28 @@ bool_t cuda_launch_kernel_1_svc(ptr func, rpc_dim3 gridDim, rpc_dim3 blockDim, LOGE(LOG_DEBUG, "arg: %p (%d)", *(void**)cuda_args[i], *(int*)cuda_args[i]); } - LOGE(LOG_DEBUG, "cudaLaunchKernel(func=%p, gridDim=[%d,%d,%d], blockDim=[%d,%d,%d], args=%p, sharedMem=%d, stream=%p)", func, cuda_gridDim.x, cuda_gridDim.y, cuda_gridDim.z, cuda_blockDim.x, cuda_blockDim.y, cuda_blockDim.z, cuda_args, sharedMem, (void*)stream); - - *result = cudaLaunchKernel( - (void*)func, - cuda_gridDim, - cuda_blockDim, - cuda_args, - sharedMem, - resource_mg_get(&rm_streams, (void*)stream)); + LOGE(LOG_DEBUG, "cudaLaunchKernel(func=%p, gridDim=[%d,%d,%d], blockDim=[%d,%d,%d], args=%p, sharedMem=%d, stream=%p)", + resource_mg_get(&rm_functions, (void*)func), + cuda_gridDim.x, cuda_gridDim.y, cuda_gridDim.z, + cuda_blockDim.x, cuda_blockDim.y, cuda_blockDim.z, + cuda_args, + sharedMem, + (void*)stream); + + *result = cuLaunchKernel((CUfunction)resource_mg_get(&rm_functions, (void*)func), + gridDim.x, gridDim.y, gridDim.z, + blockDim.x, blockDim.y, blockDim.z, + sharedMem, + resource_mg_get(&rm_streams, (void*)stream), + cuda_args, NULL); + + // *result = cudaLaunchKernel( + // resource_mg_get(&rm_functions, (void*)func), + // cuda_gridDim, + // cuda_blockDim, + // cuda_args, + // sharedMem, + // resource_mg_get(&rm_streams, (void*)stream)); free(cuda_args); RECORD_RESULT(integer, *result); LOGE(LOG_DEBUG, "cudaLaunchKernel result: %d", *result); @@ -1028,8 +1054,8 @@ bool_t cuda_free_host_1_svc(int index, int *result, struct svc_req *rqstp) *result = cudaSuccess; return 1; } - if (hainfo[index].cnt != 0 && - hainfo[index].cnt == index) { + if (hainfo[index].idx != 0 && + hainfo[index].idx == index) { *result = cudaHostUnregister(hainfo[index].server_ptr); munmap(hainfo[index].server_ptr, hainfo[index].size); @@ -1064,31 +1090,39 @@ bool_t cuda_get_symbol_size_1_svc(ptr symbol, u64_result *result, struct svc_req return 1; } -bool_t cuda_host_alloc_1_svc(int client_cnt, size_t size, ptr client_ptr, unsigned int flags, int *result, struct svc_req *rqstp) +bool_t cuda_host_alloc_1_svc(size_t size, unsigned int flags, sz_result *result, struct svc_req *rqstp) { //TODO: Make checkpointable. Implement reattaching of shm segment. 
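The rewritten cudaHostAlloc path below is one half of a two-RPC protocol; its counterpart, CUDA_HOST_ALLOC_REGSHM, follows a few hunks later, and both are declared in the cpu_rpc_prot.x hunk near the end of this patch. The server creates and registers a named shared-memory segment and returns its index; the client is then expected to map the same segment and report its local pointer back. The client side is not part of this patch, so the sketch below only illustrates that expectation: the helper is hypothetical and the RPC invocations are abbreviated to comments.

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/mman.h>
    #include <sys/stat.h>

    /* hypothetical client-side counterpart of cuda_host_alloc_1_svc (sketch only) */
    static void *client_host_alloc(size_t size, size_t shm_index /* from CUDA_HOST_ALLOC */)
    {
        /* 1. RPC CUDA_HOST_ALLOC(size, flags) already ran: the server shm_open()ed
         *    "/crickethostalloc-<index>", ftruncate()d it to size, mmap()ed it and
         *    cudaHostRegister()ed the mapping, then returned <index>.              */
        char name[64];
        snprintf(name, sizeof(name), "/crickethostalloc-%zu", shm_index);

        /* 2. map the same segment locally */
        int fd = shm_open(name, O_RDWR, S_IRWXU);
        if (fd == -1)
            return NULL;
        void *ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

        /* 3. RPC CUDA_HOST_ALLOC_REGSHM(<index>, ptr) so the server can record the
         *    client pointer in hainfo[<index>].client_ptr.                          */
        return ptr == MAP_FAILED ? NULL : ptr;
    }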
int fd_shm; - char shm_name[128]; + char *shm_name = NULL; void *shm_addr; unsigned int register_flags = 0; - *result = cudaErrorMemoryAllocation; RECORD_API(cuda_host_alloc_1_argument); - RECORD_ARG(1, client_cnt); - RECORD_ARG(2, size); - RECORD_ARG(3, client_ptr); - RECORD_ARG(4, flags); + RECORD_ARG(1, size); + RECORD_ARG(2, flags); LOGE(LOG_DEBUG, "cudaHostAlloc"); + result->err = cudaErrorMemoryAllocation; if (socktype == UNIX || (shm_enabled && cpu_utils_is_local_connection(rqstp))) { //Use local shared memory - snprintf(shm_name, 128, "/crickethostalloc-%d", client_cnt); - if ((fd_shm = shm_open(shm_name, O_RDWR, 600)) == -1) { + if (asprintf(&shm_name, "/crickethostalloc-%d", hainfo_cnt) == -1) { + LOGE(LOG_ERROR, "asprintf failed: %s", strerror(errno)); + goto out; + } + if ((fd_shm = shm_open(shm_name, O_RDWR | O_CREAT | O_TRUNC, S_IRWXU)) == -1) { LOGE(LOG_ERROR, "could not open shared memory \"%s\" with size %d: %s", shm_name, size, strerror(errno)); goto out; } + if (ftruncate(fd_shm, size) == -1) { + LOGE(LOG_ERROR, "cannot resize shared memory"); + shm_unlink(shm_name); + goto out; + } + result->sz_result_u.data = hainfo_cnt; + LOGE(LOG_DEBUG, "shm opened with name \"%s\", size: %d", shm_name, size); if ((shm_addr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd_shm, 0)) == MAP_FAILED) { LOGE(LOG_ERROR, "mmap returned unexpected pointer: %p", shm_addr); - goto cleanup; + goto out; } if (flags & cudaHostAllocPortable) { @@ -1101,23 +1135,23 @@ bool_t cuda_host_alloc_1_svc(int client_cnt, size_t size, ptr client_ptr, unsign register_flags |= cudaHostRegisterMapped; } - if ((*result = cudaHostRegister(shm_addr, size, flags)) != cudaSuccess) { + if ((result->err = cudaHostRegister(shm_addr, size, flags)) != cudaSuccess) { LOGE(LOG_ERROR, "cudaHostRegister failed."); munmap(shm_addr, size); - goto cleanup; + goto out; } - - hainfo[hainfo_cnt].cnt = client_cnt; + hainfo[hainfo_cnt].idx = hainfo_cnt; hainfo[hainfo_cnt].size = size; - hainfo[hainfo_cnt].client_ptr = (void*)client_ptr; + hainfo[hainfo_cnt].client_ptr = NULL; hainfo[hainfo_cnt].server_ptr = shm_addr; hainfo_cnt++; } else if (socktype == TCP) { //Use infiniband #ifdef WITH_IB - + LOGE(LOG_ERROR, "infiniband does not yet support cudaHostAlloc."); + goto out; #else - LOGE(LOG_ERROR, "infiniband is disabled."); - goto cleanup; + LOGE(LOG_ERROR, "infiniband is disabled."); + goto out; #endif //WITH_IB } else { @@ -1125,14 +1159,40 @@ bool_t cuda_host_alloc_1_svc(int client_cnt, size_t size, ptr client_ptr, unsign goto out; } + result->err = cudaSuccess; +out: + RECORD_RESULT(sz_result_u, *result); + return 1; +} + +bool_t cuda_host_alloc_regshm_1_svc(size_t hainfo_idx, ptr client_ptr, int *result, struct svc_req *rqstp) +{ + char *shm_name = NULL; + RECORD_API(cuda_host_alloc_regshm_1_argument); + RECORD_ARG(1, hainfo_idx); + RECORD_ARG(2, client_ptr); + + LOGE(LOG_DEBUG, "cudaHostAllocRegShm"); + *result = cudaErrorMemoryAllocation; + + if (socktype != UNIX && !(shm_enabled && cpu_utils_is_local_connection(rqstp))) { + LOGE(LOG_ERROR, "cudaHostAllocRegShm is only supported for local connections."); + goto out; + } + if (asprintf(&shm_name, "/crickethostalloc-%d", hainfo_idx) == -1) { + LOGE(LOG_ERROR, "asprintf failed: %s", strerror(errno)); + goto out; + } + hainfo[hainfo_idx].client_ptr = (void*)client_ptr; *result = cudaSuccess; -cleanup: - shm_unlink(shm_name); out: + shm_unlink(shm_name); + free(shm_name); RECORD_RESULT(integer, *result); return 1; } + bool_t cuda_host_get_device_pointer_1_svc(ptr pHost, 
int flags, ptr_result *result, struct svc_req *rqstp) { LOGE(LOG_DEBUG, "cudaHostGetDevicePointer"); @@ -1165,7 +1225,7 @@ bool_t cuda_malloc_1_svc(size_t argp, ptr_result *result, struct svc_req *rqstp) #ifdef WITH_IB result->err = ib_allocate_memreg((void**)&result->ptr_result_u.ptr, argp, hainfo_cnt, true); if (result->err == 0) { - hainfo[hainfo_cnt].cnt = hainfo_cnt; + hainfo[hainfo_cnt].idx = hainfo_cnt; hainfo[hainfo_cnt].size = argp; hainfo[hainfo_cnt].server_ptr = (void*)result->ptr_result_u.ptr; @@ -1321,7 +1381,7 @@ bool_t cuda_memcpy_htod_1_svc(uint64_t ptr, mem_data mem, size_t size, int *resu RECORD_ARG(2, mem); RECORD_ARG(3, size); - LOGE(LOG_DEBUG, "cudaMemcpyHtoD"); + LOGE(LOG_DEBUG, "cudaMemcpyHtoD(%p, %p, %zu)", (void*)ptr, mem.mem_data_val, size); if (size != mem.mem_data_len) { LOGE(LOG_ERROR, "data size mismatch"); *result = cudaErrorUnknown; @@ -1476,8 +1536,8 @@ bool_t cuda_memcpy_ib_1_svc(int index, ptr device_ptr, size_t size, int kind, in LOGE(LOG_DEBUG, "cudaMemcpyIB"); *result = cudaErrorInitializationError; //anstatt array list (list.c) - if (hainfo[index].cnt == 0 || - hainfo[index].cnt != index) { + if (hainfo[index].idx == 0 || + hainfo[index].idx != index) { LOGE(LOG_ERROR, "inconsistent state"); goto out; @@ -1529,12 +1589,12 @@ bool_t cuda_memcpy_shm_1_svc(int index, ptr device_ptr, size_t size, int kind, i RECORD_ARG(2, device_ptr); RECORD_ARG(3, size); RECORD_ARG(4, kind); - LOGE(LOG_DEBUG, "cudaMemcpyShm"); + LOGE(LOG_DEBUG, "cudaMemcpyShm(index: %d, device_ptr: %p, size: %d, kind: %d)", index, device_ptr, size, kind); *result = cudaErrorInitializationError; - if (hainfo[index].cnt == 0 || - hainfo[index].cnt != index) { + if (index >= hainfo_cnt || + hainfo[index].idx != index) { - LOGE(LOG_ERROR, "inconsistent state"); + LOGE(LOG_ERROR, "inconsistent state: index: %d, hainfo[index].idx: %d", index, hainfo[index].idx); goto out; } if (hainfo[index].size < size) { @@ -1610,63 +1670,27 @@ bool_t cuda_memcpy_dtoh_1_svc(uint64_t ptr, size_t size, mem_result *result, str /* cudaMemcpyPeer ( void* dst, int dstDevice, const void* src, int srcDevice, size_t count ) not implemented yet. 
see cudaMemcpyDtoD */ /* cudaMemcpyPeerAsync ( void* dst, int dstDevice, const void* src, int srcDevice, size_t count, cudaStream_t stream = 0 ) */ -bool_t cuda_memcpy_to_symbol_1_svc(uint64_t ptr, mem_data mem, size_t size, size_t offset, int *result, struct svc_req *rqstp) +bool_t cuda_memcpy_to_symbol_1_svc(uint64_t symbolptr, mem_data mem, size_t size, size_t offset, int *result, struct svc_req *rqstp) { - RECORD_API(cuda_memcpy_to_symbol_1_argument); - RECORD_ARG(1, ptr); - RECORD_ARG(2, mem); - RECORD_ARG(3, size); - RECORD_ARG(4, offset); - - LOGE(LOG_DEBUG, "cudaMemcpyToSymbol"); - if (size != mem.mem_data_len) { - LOGE(LOG_ERROR, "data size mismatch"); - *result = cudaErrorUnknown; + LOGE(LOG_DEBUG, "cudaMemcpyToSymbol(%p, %p, %zu, %zu)", symbolptr, mem.mem_data_val, size, offset); + void *symbol_addr = resource_mg_get(&rm_globals, (void*)symbolptr); + if (symbol_addr == NULL) { + LOGE(LOG_ERROR, "cudaMemcpyToSymbol: symbol not found"); + *result = cudaErrorInvalidSymbol; return 1; } -#ifdef WITH_MEMCPY_REGISTER - if ((*result = cudaHostRegister(mem.mem_data_val, size, cudaHostRegisterMapped)) != cudaSuccess) { - LOGE(LOG_ERROR, "cudaHostRegister failed: %d.", *result); - return 1; - } -#endif - *result = cudaMemcpyToSymbol((void*)ptr, mem.mem_data_val, size, offset, cudaMemcpyHostToDevice); -#ifdef WITH_MEMCPY_REGISTER - cudaHostUnregister(mem.mem_data_val); -#endif - RECORD_RESULT(integer, *result); - return 1; + return cuda_memcpy_htod_1_svc((ptr)(symbol_addr+offset), mem, size, result, rqstp); } bool_t cuda_memcpy_to_symbol_shm_1_svc(int index, ptr device_ptr, size_t size, size_t offset, int kind, int *result, struct svc_req *rqstp) { - RECORD_API(cuda_memcpy_to_symbol_shm_1_argument); - RECORD_ARG(1, index); - RECORD_ARG(2, device_ptr); - RECORD_ARG(3, size); - RECORD_ARG(4, offset); - RECORD_ARG(5, kind); - LOGE(LOG_DEBUG, "cudaMemcpyToSymbolShm"); - *result = cudaErrorInitializationError; - if (hainfo[index].cnt == 0 || - hainfo[index].cnt != index) { - - LOGE(LOG_ERROR, "inconsistent state"); - goto out; - } - if (hainfo[index].size < size) { - LOGE(LOG_ERROR, "requested size is smaller than shared memory segment"); - goto out; - } - - if (kind == cudaMemcpyHostToDevice) { - *result = cudaMemcpyToSymbol((void*)device_ptr, hainfo[index].server_ptr, size, offset, kind); - } else { - LOGE(LOG_ERROR, "a kind different from HostToDevice is unsupported for cudaMemcpyToSymbol"); + void *symbol_addr = resource_mg_get(&rm_globals, (void*)device_ptr); + if (symbol_addr == NULL) { + LOGE(LOG_ERROR, "cudaMemcpyToSymbol: symbol not found"); + *result = cudaErrorInvalidSymbol; + return 1; } -out: - RECORD_RESULT(integer, *result); - return 1; + return cuda_memcpy_shm_1_svc(index, (ptr)(symbol_addr+offset), size, kind, result, rqstp); } /* cudaMemcpyToSymbolAsync ( const void* symbol, const void* src, size_t count, size_t offset, cudaMemcpyKind kind, cudaStream_t stream = 0 ) not implemented yet */ @@ -1706,7 +1730,26 @@ bool_t cuda_memset_2d_1_svc(ptr devPtr, size_t pitch, int value, size_t width, s return 1; } -/* cudaMemset2DAsync ( void* devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream = 0 ) is not implemented */ +bool_t cuda_memset_2d_async_1_svc(ptr devPtr, size_t pitch, int value, size_t width, size_t height, ptr stream, int *result, struct svc_req *rqstp) +{ + RECORD_API(cuda_memset_2d_async_1_argument); + RECORD_ARG(1, devPtr); + RECORD_ARG(2, pitch); + RECORD_ARG(3, value); + RECORD_ARG(4, height); + RECORD_ARG(5, width); + RECORD_ARG(6, 
stream); + LOGE(LOG_DEBUG, "cudaMemset2DAsync"); + *result = cudaMemset2DAsync( + resource_mg_get(&rm_memory, (void*)devPtr), + pitch, + value, + width, + height, + resource_mg_get(&rm_streams, (void*)stream)); + RECORD_RESULT(integer, *result); + return 1; +} bool_t cuda_memset_3d_1_svc(size_t pitch, ptr devPtr, size_t xsize, size_t ysize, int value, size_t depth, size_t height, size_t width, int *result, struct svc_req *rqstp) { @@ -1731,8 +1774,49 @@ bool_t cuda_memset_3d_1_svc(size_t pitch, ptr devPtr, size_t xsize, size_t ysize RECORD_RESULT(integer, *result); return 1; } -/* cudaMemset3DAsync ( cudaPitchedPtr pitchedDevPtr, int value, cudaExtent extent, cudaStream_t stream = 0 ) is not implemented */ -/* cudaMemsetAsync ( void* devPtr, int value, size_t count, cudaStream_t stream = 0 ) is not implemented */ + +bool_t cuda_memset_3d_async_1_svc(size_t pitch, ptr devPtr, size_t xsize, size_t ysize, int value, size_t depth, size_t height, size_t width, ptr stream, int *result, struct svc_req *rqstp) +{ + RECORD_API(cuda_memset_3d_async_1_argument); + RECORD_ARG(1, pitch); + RECORD_ARG(2, devPtr); + RECORD_ARG(3, xsize); + RECORD_ARG(4, ysize); + RECORD_ARG(5, value); + RECORD_ARG(6, depth); + RECORD_ARG(7, height); + RECORD_ARG(8, width); + RECORD_ARG(9, stream); + LOGE(LOG_DEBUG, "cudaMemset3DAsync"); + struct cudaPitchedPtr pptr = {.pitch = pitch, + .ptr = resource_mg_get(&rm_memory, (void*)devPtr), + .xsize = xsize, + .ysize = ysize}; + struct cudaExtent extent = {.depth = depth, + .height = height, + .width = width}; + *result = cudaMemset3DAsync(pptr, value, extent, + resource_mg_get(&rm_streams, (void*)stream)); + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t cuda_memset_async_1_svc(ptr devPtr, int value, size_t count, ptr stream, int *result, struct svc_req *rqstp) +{ + RECORD_API(cuda_memset_async_1_argument); + RECORD_ARG(1, devPtr); + RECORD_ARG(2, value); + RECORD_ARG(3, count); + RECORD_ARG(3, stream); + LOGE(LOG_DEBUG, "cudaMemsetAsync"); + *result = cudaMemsetAsync( + resource_mg_get(&rm_memory, (void*)devPtr), + value, + count, + resource_mg_get(&rm_streams, (void*)stream)); + RECORD_RESULT(integer, *result); + return 1; +} /* cudaMipmappedArrayGetSparseProperties ( cudaArraySparseProperties* sparseProperties, cudaMipmappedArray_t mipmap ) is not implemented */ /* make_cudaExtent ( size_t w, size_t h, size_t d ) should be implemented on the client side */ /* make_cudaPitchedPtr ( void* d, size_t p, size_t xsz, size_t ysz ) should be implemented on the client side */ @@ -1826,3 +1910,21 @@ bool_t cuda_register_fat_binary_end_1_svc(ptr cubinHandle, int *result, struct s *result = 0; return 1; }*/ + +bool_t cuda_profiler_start_1_svc(int *result, struct svc_req *rqstp) +{ + RECORD_VOID_API; + LOGE(LOG_DEBUG, "cudaProfilerStart"); + *result = cudaProfilerStart(); + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t cuda_profiler_stop_1_svc(int *result, struct svc_req *rqstp) +{ + RECORD_VOID_API; + LOGE(LOG_DEBUG, "cudaProfilerStop"); + *result = cudaProfilerStop(); + RECORD_RESULT(integer, *result); + return 1; +} diff --git a/cpu/cpu-server.c b/cpu/cpu-server.c index 460edc45..a8bbbe65 100644 --- a/cpu/cpu-server.c +++ b/cpu/cpu-server.c @@ -1,3 +1,4 @@ +#define _GNU_SOURCE #include #include #include @@ -5,6 +6,8 @@ #include //sigaction #include #include +#include +#include #include "cpu-server.h" #include "cpu_rpc_prot.h" @@ -15,12 +18,15 @@ #include "cpu-server-driver.h" #include "rpc/xdr.h" #include "cr.h" +#include "cpu-elf2.h" #ifdef WITH_IB #include 
"cpu-ib.h" #endif //WITH_IB #define WITH_RECORDER #include "api-recorder.h" #include "gsched.h" +#include "cpu-server-nvml.h" +#include "cpu-server-cudnn.h" INIT_SOCKTYPE @@ -109,27 +115,54 @@ bool_t rpc_checkpoint_1_svc(int *result, struct svc_req *rqstp) return ret == 0; } -/** implementation for CUDA_REGISTER_FUNCTION(ptr, str, str, str, int) - * - */ -bool_t cuda_register_function_1_svc(ptr fatCubinHandle, ptr hostFun, char* deviceFun, char* deviceName, int thread_limit, int* result, struct svc_req *rqstp) +/* Call CUDA initialization function (usually called by __libc_init_main()) +* Address of "_ZL24__sti____cudaRegisterAllv" in static symbol table is e.g. 0x4016c8 +*/ +void cricket_so_register(void* dlhandle, char *path) { - LOGE(LOG_DEBUG, "cudaRegisterFunction(%p, %p, %s, %s, %d)", fatCubinHandle, hostFun, deviceFun, deviceName, thread_limit); - *result = 0; - return 1; -} + // struct link_map *map; + // dlinfo(dlhandle, RTLD_DI_LINKMAP, &map); -void cricket_main_hash(char* app_command) -{ - cricket_main(app_command, 0, 0); + // // add load location of library to offset in symbol table + // void (*cudaRegisterAllv)(void) = + // (void(*)(void)) elf_symbol_address(path, "_ZL24__sti____cudaRegisterAllv"); + + // LOG(LOG_INFO, "found CUDA initialization function at %p + %p = %p", + // map->l_addr, cudaRegisterAllv, map->l_addr + cudaRegisterAllv); + + // cudaRegisterAllv += map->l_addr; + + // if (cudaRegisterAllv == NULL) { + // LOGE(LOG_WARNING, "could not find cudaRegisterAllv initialization function in cubin. Kernels cannot be launched without it!"); + // } else { + // cudaRegisterAllv(); + // } } -void cricket_main_static(size_t prog_num, size_t vers_num) +bool_t rpc_dlopen_1_svc(char *path, int *result, struct svc_req *rqstp) { - cricket_main("", prog_num, vers_num); + void *dlhandle; + + if (path == NULL) { + LOGE(LOG_ERROR, "path is NULL"); + *result = 1; + return 1; + } + if ((dlhandle = dlopen(path, RTLD_LAZY)) == NULL) { + LOGE(LOG_ERROR, "error opening \"%s\": %s. Make sure libraries are present.", path, dlerror()); + *result = 1; + return 1; + } else { + LOG(LOG_INFO, "dlopened \"%s\"", path); + + //cricket_so_register(dlhandle, path); + + } + *result = 0; + return 1; } -void cricket_main(char* app_command, size_t prog_num, size_t vers_num) +void cricket_main(size_t prog_num, size_t vers_num) { int ret = 1; register SVCXPRT *transp; @@ -139,9 +172,10 @@ void cricket_main(char* app_command, size_t prog_num, size_t vers_num) struct sigaction act; char *command = NULL; act.sa_handler = int_handler; - sigaction(SIGINT, &act, NULL); - + printf("welcome to cricket!\n"); init_log(LOG_LEVEL, __FILE__); + LOG(LOG_DBG(1), "log level is %d", LOG_LEVEL); + sigaction(SIGINT, &act, NULL); #ifdef WITH_IB char client[256]; @@ -174,36 +208,16 @@ void cricket_main(char* app_command, size_t prog_num, size_t vers_num) restore = 1; } - if (cpu_utils_command(&command) != 0) { - LOG(LOG_WARNING, "could not retrieve command name. 
This might prevent starting CUDA applications"); - } else { - LOG(LOG_DEBUG, "the command is '%s'", command); - //This is a workaround to make LD_PRELOAD work under GDB supervision - const char *cmp = "cudbgprocess"; - if (strncmp(command, cmp, strlen(cmp)) == 0) { - LOG(LOG_DEBUG, "skipping RPC server"); - return; - } - } - if (restore == 1) { if (cr_restore_rpc_id("ckp", &prog, &vers) != 0) { LOGE(LOG_ERROR, "error while restoring rpc id"); } } else { - if (prog_num == 0) { - if (cpu_utils_md5hash(app_command, &prog, &vers) != 0) { - LOGE(LOG_ERROR, "error while creating binary checksum"); - exit(0); - } - } - else { - prog = prog_num; - vers = vers_num; - } + prog = prog_num; + vers = vers_num; } - LOGE(LOG_DEBUG, "using prog=%d, vers=%d, derived from \"%s\"", prog, vers, app_command); + LOGE(LOG_DEBUG, "using prog=%d, vers=%d", prog, vers); switch (socktype) { @@ -247,16 +261,16 @@ void cricket_main(char* app_command, size_t prog_num, size_t vers_num) /* Call CUDA initialization function (usually called by __libc_init_main()) * Address of "_ZL24__sti____cudaRegisterAllv" in static symbol table is e.g. 0x4016c8 */ - void (*cudaRegisterAllv)(void) = - (void(*)(void)) cricketd_utils_symbol_address("_ZL24__sti____cudaRegisterAllv"); - LOG(LOG_INFO, "found CUDA initialization function at %p", cudaRegisterAllv); - if (cudaRegisterAllv == NULL) { - LOGE(LOG_WARNING, "could not find cudaRegisterAllv initialization function in cubin. Kernels cannot be launched without it!"); - } else { - cudaRegisterAllv(); - } - - sched = &sched_none; + // void (*cudaRegisterAllv)(void) = + // (void(*)(void)) elf_symbol_address(NULL, "_ZL24__sti____cudaRegisterAllv"); + // LOG(LOG_INFO, "found CUDA initialization function at %p", cudaRegisterAllv); + // if (cudaRegisterAllv == NULL) { + // LOGE(LOG_WARNING, "could not find cudaRegisterAllv initialization function in cubin. Kernels cannot be launched without it!"); + // } else { + // cudaRegisterAllv(); + // } + + sched = &sched_none; if (sched->init() != 0) { LOGE(LOG_ERROR, "initializing scheduler failed."); goto cleanup4; @@ -276,6 +290,16 @@ void cricket_main(char* app_command, size_t prog_num, size_t vers_num) LOGE(LOG_ERROR, "initializing server_runtime failed."); goto cleanup2; } + + if (server_nvml_init(restore) != 0) { + LOGE(LOG_ERROR, "initializing server_nvml failed."); + goto cleanup1; + } + + if (server_cudnn_init(restore) != 0) { + LOGE(LOG_ERROR, "initializing server_nvml failed."); + goto cleanup0; + } #ifdef WITH_IB @@ -289,23 +313,29 @@ void cricket_main(char* app_command, size_t prog_num, size_t vers_num) if (signal(SIGUSR1, signal_checkpoint) == SIG_ERR) { LOGE(LOG_ERROR, "An error occurred while setting a signal handler."); - goto cleanup1; + goto cleanup00; } LOG(LOG_INFO, "waiting for RPC requests..."); + // make sure that our output is flushed even for non line-buffered shells + fflush(stdout); + svc_run(); LOG(LOG_DEBUG, "svc_run returned. 
Cleaning up."); ret = 0; //api_records_print(); - cleanup1: + cleanup00: + server_cudnn_deinit(); + cleanup0: server_driver_deinit(); + cleanup1: + server_nvml_deinit(); cleanup2: server_runtime_deinit(); cleanup3: - api_records_free_args(); - list_free(&api_records); + api_records_free(); cleanup4: pmap_unset(prog, vers); svc_destroy(transp); diff --git a/cpu/cpu-server.h b/cpu/cpu-server.h index 9c3fcbb0..3eea0f63 100644 --- a/cpu/cpu-server.h +++ b/cpu/cpu-server.h @@ -3,8 +3,6 @@ #include -void cricket_main(char* app_command, size_t prog_version, size_t vers_num); -void cricket_main_hash(char* app_command); -void cricket_main_static(size_t prog_num, size_t vers_num); +void cricket_main(size_t prog_version, size_t vers_num); #endif //_CPU_SERVER_H_ diff --git a/cpu/cpu-utils.c b/cpu/cpu-utils.c index 15db60b5..9a4371c8 100644 --- a/cpu/cpu-utils.c +++ b/cpu/cpu-utils.c @@ -1,27 +1,19 @@ #define _GNU_SOURCE #include #include +#include #include #include #include #include -#include #include #include "rpc/types.h" - -#include +#include #include "cpu-utils.h" #include "cpu-common.h" #include "log.h" -#define CRICKET_ELF_NV_INFO_PREFIX ".nv.info" -#define CRICKET_ELF_NV_SHARED_PREFIX ".nv.shared." -#define CRICKET_ELF_NV_TEXT_PREFIX ".nv.text." -#define CRICKET_ELF_TEXT_PREFIX ".text." - -#define CRICKET_ELF_FATBIN ".nv_fatbin" -#define CRICKET_ELF_REGFUN "_ZL24__sti____cudaRegisterAllv" int cpu_utils_command(char **command) { @@ -53,109 +45,6 @@ int cpu_utils_command(char **command) } -int cpu_utils_md5hash(char *filename, unsigned long *high, unsigned long *low) -{ - unsigned char c[MD5_DIGEST_LENGTH]; - FILE *fd; - MD5_CTX mdContext; - int bytes; - unsigned char data[1024]; - - if (filename == NULL || high == NULL || low == NULL) { - return -1; - } - - if ((fd = fopen(filename, "rb")) == NULL) { - LOGE(LOG_ERROR, "%s can't be opened.", filename); - return -1; - } - - MD5_Init (&mdContext); - while ((bytes = fread(data, 1, 1024, fd)) != 0) - MD5_Update(&mdContext, data, bytes); - MD5_Final(c, &mdContext); - fclose (fd); - *high = *((unsigned long*)c); - *low = *((unsigned long*)(c+8)); - return 0; -} - -void* cricketd_utils_symbol_address(char *symbol) -{ - bfd *hostbfd = NULL; - asection *section; - FILE *hostbfd_fd = NULL; - void *ret = NULL; - size_t symtab_size, symtab_length; - asymbol **symtab = NULL; - char path[256]; - size_t length; - - - bfd_init(); - - length = readlink("/proc/self/exe", path, sizeof(path)); - - /* Catch some errors: */ - if (length < 0) { - LOGE(LOG_WARNING, "error resolving symlink /proc/self/exe."); - } else if (length >= 256) { - LOGE(LOG_WARNING, "path was too long and was truncated."); - } else { - path[length] = '\0'; - LOG(LOG_DEBUG, "opening '%s'", path); - } - - if ((hostbfd_fd = fopen("/proc/self/exe", "rb")) == NULL) { - LOGE(LOG_ERROR, "fopen failed"); - return NULL; - } - - if ((hostbfd = bfd_openstreamr("/proc/self/exe", NULL, hostbfd_fd)) == NULL) { - LOGE(LOG_ERROR, "bfd_openr failed on %s", - "/proc/self/exe"); - fclose(hostbfd_fd); - goto cleanup; - } - - if (!bfd_check_format(hostbfd, bfd_object)) { - LOGE(LOG_ERROR, "%s has wrong bfd format", - "/proc/self/exe"); - goto cleanup; - } - - if ((symtab_size = bfd_get_symtab_upper_bound(hostbfd)) == -1) { - LOGE(LOG_ERROR, "bfd_get_symtab_upper_bound failed"); - return NULL; - } - - if ((symtab = (asymbol **)malloc(symtab_size)) == NULL) { - LOGE(LOG_ERROR, "malloc symtab failed"); - return NULL; - } - - if ((symtab_length = bfd_canonicalize_symtab(hostbfd, symtab)) == 0) { - LOG(LOG_WARNING, 
"symtab is empty..."); - } else { - //printf("%lu symtab entries\n", symtab_length); - } - - for (int i = 0; i < symtab_length; ++i) { - if (strcmp(bfd_asymbol_name(symtab[i]), CRICKET_ELF_REGFUN) == 0) { - ret = (void*)bfd_asymbol_value(symtab[i]); - break; - } - //printf("%d: %s: %lx\n", i, bfd_asymbol_name(symtab[i]), - // bfd_asymbol_value(symtab[i])); - } - - - cleanup: - free(symtab); - if (hostbfd != NULL) - bfd_close(hostbfd); - return ret; -} int cpu_utils_launch_child(const char *file, char **args) { @@ -173,6 +62,7 @@ int cpu_utils_launch_child(const char *file, char **args) return -1; } else if (pid == 0) { while ((dup2(filedes[1], STDOUT_FILENO) == -1) && (errno == EINTR)) {} + while ((dup2(filedes[1], STDERR_FILENO) == -1) && (errno == EINTR)) {} close(filedes[1]); close(filedes[0]); char *env[] = {NULL}; @@ -183,14 +73,14 @@ int cpu_utils_launch_child(const char *file, char **args) return filedes[0]; } -kernel_info_t* cricketd_utils_search_info(list *kernel_infos, char *kernelname) +kernel_info_t* utils_search_info(list *kernel_infos, const char *kernelname) { kernel_info_t *info = NULL; if (kernel_infos == NULL) { LOGE(LOG_ERROR, "list is NULL."); return NULL; } - LOGE(LOG_DEBUG, "searching for %s in %d entries", kernelname, kernel_infos->length); + LOGE(LOG_DBG(1), "searching for %s in %d entries", kernelname, kernel_infos->length); for (int i=0; i < kernel_infos->length; ++i) { if (list_at(kernel_infos, i, (void**)&info) != 0) { LOGE(LOG_ERROR, "no element at index %d", i); @@ -204,10 +94,6 @@ kernel_info_t* cricketd_utils_search_info(list *kernel_infos, char *kernelname) int cpu_utils_is_local_connection(struct svc_req *rqstp) { - LOGE(LOG_DEBUG, "%p", rqstp); - LOGE(LOG_DEBUG, "%p", rqstp->rq_xprt); - LOGE(LOG_DEBUG, "%p", rqstp->rq_xprt->xp_fd); - struct sockaddr_in remote_addr = {0}; struct sockaddr_in local_addr = {0}; struct hostent *hp; @@ -364,6 +250,7 @@ int cpu_utils_contains_kernel(const char *path) // Line does not start with .nv.info. so continue searching. continue; }*/ + line[strlen(line)-1] = '\0'; LOGE(LOG_DEBUG, "output: \"%s\"", line); } ret = 0; @@ -371,10 +258,10 @@ int cpu_utils_contains_kernel(const char *path) cleanup: close(output); wait(&child_exit); - LOG(LOG_DEBUG, "child exit code: %d", child_exit); + LOG(LOG_DBG(1), "child exit code: %d", child_exit); out: free(line); - return (ret != 0 ? 
ret : child_exit); + return ret == 0 && child_exit == 0; } int cpu_utils_parameter_info(list *kernel_infos, char *path) @@ -392,6 +279,11 @@ int cpu_utils_parameter_info(list *kernel_infos, char *path) char *kernelname; struct stat filestat = {0}; + if (path == NULL) { + LOGE(LOG_ERROR, "path is NULL."); + goto out; + } + if (kernel_infos == NULL) { LOGE(LOG_ERROR, "list is NULL."); goto out; @@ -441,13 +333,14 @@ int cpu_utils_parameter_info(list *kernel_infos, char *path) goto cleanup2; } - if ((buf->name = malloc(strlen(kernelname))) == NULL) { + size_t buflen = strlen(kernelname); + if ((buf->name = malloc(buflen)) == NULL) { LOGE(LOG_ERROR, "malloc failed"); goto cleanup2; } //copy string and remove trailing \n - strncpy(buf->name, kernelname, strlen(kernelname)-1); - buf->name[strlen(kernelname)-1] = '\0'; + strncpy(buf->name, kernelname, buflen-1); + buf->name[buflen-1] = '\0'; if (cpu_utils_read_pars(buf, fdesc) != 0) { LOGE(LOG_ERROR, "reading paramter infos failed.\n"); @@ -470,10 +363,10 @@ int cpu_utils_parameter_info(list *kernel_infos, char *path) cleanup1: close(output); wait(&child_exit); - LOG(LOG_DEBUG, "child exit code: %d", child_exit); + LOG(LOG_DBG(1), "child exit code: %d", child_exit); out: free(line); - return (ret != 0 ? ret : child_exit); + return ret == 0 && child_exit == 0; } void kernel_infos_free(kernel_info_t *infos, size_t kernelnum) @@ -484,3 +377,35 @@ void kernel_infos_free(kernel_info_t *infos, size_t kernelnum) free(infos[i].param_sizes); } } + +void hexdump(const uint8_t* data, size_t size) +{ + size_t pos = 0; + while (pos < size) { + printf("%#05zx: ", pos); + for (int i = 0; i < 16; i++) { + if (pos + i < size) { + printf("%02x", data[pos + i]); + } else { + printf(" "); + } + if (i % 4 == 3) { + printf(" "); + } + } + printf(" | "); + for (int i = 0; i < 16; i++) { + if (pos + i < size) { + if (data[pos + i] >= 0x20 && data[pos + i] <= 0x7e) { + printf("%c", data[pos + i]); + } else { + printf("."); + } + } else { + printf(" "); + } + } + printf("\n"); + pos += 16; + } +} \ No newline at end of file diff --git a/cpu/cpu-utils.h b/cpu/cpu-utils.h index c0ed97b8..6b1261ef 100644 --- a/cpu/cpu-utils.h +++ b/cpu/cpu-utils.h @@ -5,16 +5,18 @@ #include "cpu-common.h" #include "list.h" + + void kernel_infos_free(kernel_info_t *infos, size_t kernelnum); int cpu_utils_is_local_connection(struct svc_req *rqstp); int cpu_utils_command(char **command); int cpu_utils_md5hash(char *filename, unsigned long *high, unsigned long *low); -void* cricketd_utils_symbol_address(char *symbol); int cricketd_utils_launch_child(const char *file, char **args); int cpu_utils_parameter_info(list *kernel_infos, char *path); -int cpu_utils_contains_kernel(const char *path); -kernel_info_t* cricketd_utils_search_info(list *kernel_infos, char *kernelname); +kernel_info_t* utils_search_info(list *kernel_infos, const char *kernelname); +void hexdump(const uint8_t* data, size_t size); + #endif //_CPU_UTILS_H_ diff --git a/cpu/cpu_rpc_prot.x b/cpu/cpu_rpc_prot.x index f5c405e4..6d505842 100644 --- a/cpu/cpu_rpc_prot.x +++ b/cpu/cpu_rpc_prot.x @@ -1,6 +1,8 @@ typedef opaque mem_data<>; + typedef unsigned hyper size_t; typedef unsigned hyper ptr; +typedef opaque rpc_cuda_device_prop[1032]; struct dint { int i1; @@ -38,6 +40,24 @@ struct rpc_dim3 { unsigned int z; }; +struct int2d1 { + int i[2]; + double d; +}; + +struct int1d3 { + int i; + double d[3]; +}; + +union cudnn_scaling_t switch (int dataType) { +case 2: +case 0: + float f; +case 1: + double d; +}; + union int_result switch 
(int err) { case 0: int data; @@ -80,6 +100,13 @@ default: void; }; +union sz_result switch (int err) { +case 0: + size_t data; +default: + void; +}; + union ptr_result switch (int err) { case 0: ptr ptr; @@ -108,6 +135,8 @@ default: void; }; +/* memory allocated for RPC. */ +/* Freed rpc_cd_prog_1_freeresult by after RPC. */ union mem_result switch (int err) { case 0: mem_data data; @@ -115,12 +144,79 @@ default: void; }; +union cuda_device_prop_result switch (int err) { +case 0: + rpc_cuda_device_prop data; +default: + void; +}; + +union int3_result switch (int err) { +case 0: + int data[3]; +default: + void; +}; + +union int4_result switch (int err) { +case 0: + int data[4]; +default: + void; +}; + +union int5_result switch (int err) { +case 0: + int data[5]; +default: + void; +}; + +union int6_result switch (int err) { +case 0: + int data[6]; +default: + void; +}; + +union int8_result switch (int err) { +case 0: + int data[8]; +default: + void; +}; + +union int9_result switch (int err) { +case 0: + int data[9]; +default: + void; +}; + +union int2d1_result switch (int err) { +case 0: + int2d1 data; +default: + void; +}; + +union int1d3_result switch (int err) { +case 0: + int1d3 data; +default: + void; +}; + program RPC_CD_PROG { version RPC_CD_VERS { int rpc_checkpoint(void) = 0; int rpc_deinit(void) = 1; int rpc_printmessage(string) = 2; - int CUDA_REGISTER_FUNCTION(ptr, ptr, string, string, int) = 50; + int rpc_dlopen(string) = 3; + ptr_result rpc_register_function(ptr, ptr, string, string, int) = 50; + int rpc_elf_load(mem_data, ptr) = 51; + int rpc_elf_unload(ptr) = 52; + int rpc_register_var(ptr, ptr, ptr, string, int, size_t, int, int) = 53; /* RUNTIME API */ /* ### Device Management ### */ @@ -143,7 +239,7 @@ program RPC_CD_PROG { int_result CUDA_GET_DEVICE(void) = 117; int_result CUDA_GET_DEVICE_COUNT(void) = 118; int_result CUDA_GET_DEVICE_FLAGS(void) = 119; - mem_result CUDA_GET_DEVICE_PROPERTIES(int) = 120; + cuda_device_prop_result CUDA_GET_DEVICE_PROPERTIES(int) = 120; /*int CUDA_IPC_CLOSE_MEM_HANDLE(ptr) = 121;*/ /*ptr_result CUDA_IPC_GET_EVENT_HANDLE(int) = 122;*/ /*ptr_result CUDA_IPC_GET_MEM_HANDLE(ptr) = 123;*/ @@ -174,7 +270,7 @@ program RPC_CD_PROG { /* ? CUDA_STREAM_GET_CAPTURE_INFO(ptr) = 261;*/ int_result CUDA_STREAM_GET_FLAGS(ptr) = 262; int_result CUDA_STREAM_GET_PRIORITY(ptr) = 263; - /* ? CUDA_STREAM_IS_CAPTURING(ptr) = 264;*/ + int_result CUDA_STREAM_IS_CAPTURING(ptr) = 264; int CUDA_STREAM_QUERY(ptr) = 265; /*int CUDA_STREAM_SET_ATTRIBUTE(ptr, int, ?) 
= 266;*/ int CUDA_STREAM_SYNCHRONIZE(ptr) = 267; @@ -201,8 +297,6 @@ program RPC_CD_PROG { int CUDA_FUNC_SET_SHARED_MEM_CONFIG(ptr, int) = 313; int CUDA_LAUNCH_COOPERATIVE_KERNEL(ptr, rpc_dim3, rpc_dim3, mem_data, size_t, ptr) = 314; - int CUDA_LAUNCH_COOPERATIVE_KERNEL_MULTI_DEVICE(ptr, - rpc_dim3, rpc_dim3, mem_data, size_t, ptr, int, int) = 315; /*int CUDA_LAUNCH_HOST_FUNC(ptr, ptr, mem_data) = 316;*/ int CUDA_LAUNCH_KERNEL(ptr, rpc_dim3, rpc_dim3, mem_data, size_t, ptr) = 317; @@ -225,7 +319,8 @@ program RPC_CD_PROG { /*ptr_result CUDA_GET_MIPMAPPED_ARRAY_LEVEL(ptr, int) = 406;*/ ptr_result CUDA_GET_SYMBOL_ADDRESS(ptr) = 407; u64_result CUDA_GET_SYMBOL_SIZE(ptr) = 408; - int CUDA_HOST_ALLOC(int, size_t, ptr, unsigned int) = 409; + sz_result CUDA_HOST_ALLOC(size_t, unsigned int) = 409; + int CUDA_HOST_ALLOC_REGSHM(size_t, ptr) = 477; ptr_result CUDA_HOST_GET_DEVICE_POINTER(ptr, int) = 410; int_result CUDA_HOST_GET_FLAGS(ptr) = 411; /*int CUDA_HOST_REGISTER(ptr, size_t, int) = 412;*/ @@ -263,13 +358,13 @@ program RPC_CD_PROG { int CUDA_MEMCPY_MT_SYNC(int) = 451; int CUDA_MEMSET(ptr, int, size_t) = 470; int CUDA_MEMSET_2D(ptr, size_t, int, size_t, size_t) = 471; - /*int CUDA_MEMSET_2D_ASYNC(ptr, size_t, - int, size_t, size_t, int) = 472;*/ + int CUDA_MEMSET_2D_ASYNC(ptr, size_t, + int, size_t, size_t, ptr) = 472; int CUDA_MEMSET_3D(size_t, ptr, size_t, size_t, int, size_t, size_t, size_t) = 473; - /*int CUDA_MEMSET_3D_ASYNC(size_t, ptr, size_t, size_t, int, - size_t, size_t, size_t, int) = 474;*/ - /*int CUDA_MEMSET_ASYNC(ptr, int, size_t, int) = 475;*/ + int CUDA_MEMSET_3D_ASYNC(size_t, ptr, size_t, size_t, int, + size_t, size_t, size_t, ptr) = 474; + int CUDA_MEMSET_ASYNC(ptr, int, size_t, ptr) = 475; /*? CUDA_MIPMAPPED_ARRAY_GET_SPARSE_PROPERTIES(ptr) = 476;*/ /* make_ APIs can be copied on the client side */ @@ -298,6 +393,8 @@ program RPC_CD_PROG { /* NOT IMPLEMENTED */ /* ### Profiler Control ### */ + int CUDA_PROFILER_START(void) = 701; + int CUDA_PROFILER_STOP(void) = 702; /* NOT IMPLEMENTED */ /* DRIVER API */ @@ -323,6 +420,11 @@ program RPC_CD_PROG { ptr_result rpc_cuModuleLoad(string<>) = 1019; str_result rpc_cuGetErrorString(int) = 1020; int rpc_cuModuleUnload(ptr) = 1021; + dint_result rpc_cuDevicePrimaryCtxGetState(int) = 1022; + mem_result rpc_cuDeviceGetProperties(int) = 1023; + dint_result rpc_cuDeviceComputeCapability(int) = 1024; + int_result rpc_cuDeviceGetP2PAttribute(int, ptr, ptr) = 1025; + ptr_result rpc_cuModuleLoadData(mem_data mem) = 1026; /* HIDDEN DRIVER API */ /* ptr_result rpc_hidden_get_device_ctx(int) = 1101; @@ -349,5 +451,124 @@ program RPC_CD_PROG { int rpc_cublasDgemm(ptr, int, int, int, int, int, double, ptr, int, ptr, int, double, ptr, int) = 3002; int rpc_cublasDestroy(ptr) = 3003; + int rpc_cublasSgemm(ptr, int, int, int, int, int, float, + ptr, int, ptr, int, float, ptr, int) = 3004; + int rpc_cublasSgemv(ptr, int, int, int, float, + ptr, int, ptr, int, float, ptr, int) = 3005; + int rpc_cublasDgemv(ptr, int, int, int, double, + ptr, int, ptr, int, double, ptr, int) = 3006; + int rpc_cublasSgemmEx(ptr, int, int, int, int, int, float, + ptr, int, int, ptr, int, int, float, ptr, int, int) = 3007; + int rpc_cublasSetStream(ptr handle, ptr streamId) = 3008; + int rpc_cublasSetWorkspace(ptr handle, ptr workspace, size_t workspaceSizeInBytes) = 3009; + int rpc_cublasSetMathMode(ptr handle, int mode) = 3010; + + /* NVML */ + int_result rpc_nvmlDeviceGetCount_v2(void) = 4000; + int rpc_nvmlInitWithFlags(int) = 4001; + int rpc_nvmlInit_v2(void) = 4002; + 
int rpc_nvmlShutdown(void) = 4003; + + /* CUDNN */ + size_t rpc_cudnnGetVersion(void) = 5000; + size_t rpc_cudnnGetMaxDeviceVersion(void) = 5001; + size_t rpc_cudnnGetCudartVersion(void) = 5002; + string rpc_cudnnGetErrorString (int status) = 5003; + int_result rpc_cudnnQueryRuntimeError(ptr handle, int mode) = 5004; + int_result rpc_cudnnGetProperty(int type) = 5005; + ptr_result rpc_cudnnCreate(void) = 5006; + int rpc_cudnnDestroy(ptr handle) = 5007; + int rpc_cudnnSetStream(ptr handle, ptr streamId) = 5008; + ptr_result rpc_cudnnGetStream(ptr handle) = 5009; + ptr_result rpc_cudnnCreateTensorDescriptor(void) = 5010; + int rpc_cudnnSetTensor4dDescriptor(ptr tensorDesc, int format, int dataType, int n, int c, int h, int w) = 5011; + int rpc_cudnnSetTensor4dDescriptorEx(ptr tensorDesc, int dataType, int n, int c, int h, int w, int nStride, int cStride, int hStride, int wStride) = 5012; + int9_result rpc_cudnnGetTensor4dDescriptor(ptr tensorDesc) = 5013; + int rpc_cudnnSetTensorNdDescriptor(ptr tensorDesc, int dataType, int nbDims, mem_data dimA, mem_data strideA) = 5014; + int rpc_cudnnSetTensorNdDescriptorEx(ptr tensorDesc, int format, int dataType, int nbDims, mem_data dimA) = 5015; + mem_result rpc_cudnnGetTensorNdDescriptor(ptr tensorDesc, int nbDimsRequested) = 5016; + sz_result rpc_cudnnGetTensorSizeInBytes(ptr tensorDesc) = 5017; + int rpc_cudnnDestroyTensorDescriptor(ptr tensorDesc) = 5018; + /* + sz_result rpc_cudnnInitTransformDest(ptr transformDesc, ptr srcDesc, ptr destDesc) = 5019; + ptr_result rpc_cudnnCreateTensorTransformDescriptor(void) = 5020; + int rpc_cudnnSetTensorTransformDescriptor(ptr transformDesc, uint32_t nbDims, int destFormat, mem_data padBeforeA, mem_data padAfterA, mem_data foldA, int direction) = 5021; + mem_result rpc_cudnnGetTensorTransformDescriptor(ptr transformDesc, uint32_t nbDimsRequested) = 5022; + int rpc_cudnnDestroyTensorTransformDescriptor(ptr transformDesc) = 5023; + */ + int rpc_cudnnTransformTensor(ptr handle, cudnn_scaling_t alpha, ptr xDesc, ptr x, cudnn_scaling_t beta, ptr yDesc, ptr y) = 5024; + /* + ptr_result rpc_cudnnTransformTensorEx(ptr handle, ptr transDesc, cudnn_scaling_t alpha, ptr srcDesc, cudnn_scaling_t srcData, cudnn_scaling_t beta, ptr destDesc) = 5025; + */ + int rpc_cudnnAddTensor(ptr handle, cudnn_scaling_t alpha, ptr aDesc, ptr A, cudnn_scaling_t beta, ptr cDesc, ptr C) = 5026; + /* + ptr_result rpc_cudnnCreateOpTensorDescriptor(void) = 5027; + int rpc_cudnnSetOpTensorDescriptor(ptr opTensorDesc, int opTensorOp, int opTensorCompType, int opTensorNanOpt) = 5028; + int3_result rpc_cudnnGetOpTensorDescriptor(ptr opTensorDesc) = 5029; + int rpc_cudnnDestroyOpTensorDescriptor(ptr opTensorDesc) = 5030; + mem_result rpc_cudnnOpTensor(ptr handle, ptr opTensorDesc, cudnn_scaling_t alpha1, ptr aDesc, mem_data A, cudnn_scaling_t alpha2, ptr bDesc, mem_data B, cudnn_scaling_t beta, ptr cDesc) = 5031; + ptr_result rpc_cudnnCreateReduceTensorDescriptor(void) = 5032; + int rpc_cudnnSetReduceTensorDescriptor(ptr reduceTensorDesc, int reduceTensorOp, int reduceTensorCompType, int reduceTensorNanOpt, int reduceTensorIndices, int reduceTensorIndicesType) = 5033; + int5_result rpc_cudnnGetReduceTensorDescriptor(ptr reduceTensorDesc) = 5034; + int rpc_cudnnDestroyReduceTensorDescriptor(ptr reduceTensorDesc) = 5035; + sz_result rpc_cudnnGetReductionIndicesSize(ptr handle, ptr reduceTensorDesc, ptr aDesc, ptr cDesc) = 5036; + sz_result rpc_cudnnGetReductionWorkspaceSize(ptr handle, ptr reduceTensorDesc, ptr aDesc, ptr cDesc) = 5037; + 
mem_result rpc_cudnnReduceTensor(ptr handle, ptr reduceTensorDesc, ptr indices, size_t indicesSizeInBytes, ptr workspace, size_t workspaceSizeInBytes, cudnn_scaling_t alpha, ptr aDesc, ptr A, cudnn_scaling_t beta, ptr cDesc, ptr C) = 5038; + int rpc_cudnnSetTensor(ptr handle, ptr yDesc, ptr y, mem_data valuePtr) = 5039; + int rpc_cudnnScaleTensor(ptr handle, ptr yDesc, ptr y, cudnn_scaling_t alpha) = 5040; */ + + ptr_result rpc_cudnnCreateFilterDescriptor(void) = 5041; + int rpc_cudnnSetFilter4dDescriptor(ptr filterDesc, int dataType, int format, int k, int c, int h, int w) = 5042; + int6_result rpc_cudnnGetFilter4dDescriptor(ptr filterDesc) = 5043; + int rpc_cudnnSetFilterNdDescriptor(ptr filterDesc, int dataType, int format, int nbDims, mem_data filterDimA) = 5044; + mem_result rpc_cudnnGetFilterNdDescriptor(ptr filterDesc, int nbDimsRequested) = 5045; + sz_result rpc_cudnnGetFilterSizeInBytes(ptr filterDesc) = 5046; + int rpc_cudnnTransformFilter(ptr handle, ptr transDesc, cudnn_scaling_t alpha, ptr srcDesc, ptr srcData, cudnn_scaling_t beta, ptr destDesc, ptr destData) = 5047; + int rpc_cudnnDestroyFilterDescriptor(ptr filterDesc) = 5048; + int rpc_cudnnSoftmaxForward(ptr handle, int algo, int mode, cudnn_scaling_t alpha, ptr xDesc, ptr x, cudnn_scaling_t beta, ptr yDesc, ptr y) = 5049; + ptr_result rpc_cudnnCreatePoolingDescriptor(void) = 5050; + int rpc_cudnnSetPooling2dDescriptor(ptr poolingDesc, int mode, int maxpoolingNanOpt, int windowHeight, int windowWidth, int verticalPadding, int horizontalPadding, int verticalStride, int horizontalStride) = 5051; + int8_result rpc_cudnnGetPooling2dDescriptor(ptr poolingDesc) = 5052; + int rpc_cudnnSetPoolingNdDescriptor(ptr poolingDesc, int mode, int maxpoolingNanOpt, int nbDims, mem_data windowDimA, mem_data paddingA, mem_data strideA) = 5053; + mem_result rpc_cudnnGetPoolingNdDescriptor(ptr poolingDesc, int nbDimsRequested) = 5054; + mem_result rpc_cudnnGetPoolingNdForwardOutputDim(ptr poolingDesc, ptr inputTensorDesc, int nbDims) = 5055; + int4_result rpc_cudnnGetPooling2dForwardOutputDim(ptr poolingDesc, ptr inputTensorDesc) = 5056; + int rpc_cudnnDestroyPoolingDescriptor(ptr poolingDesc) = 5057; + int rpc_cudnnPoolingForward(ptr handle, ptr poolingDesc, cudnn_scaling_t alpha, ptr xDesc, ptr x, cudnn_scaling_t beta, ptr yDesc, ptr y) = 5058; + ptr_result rpc_cudnnCreateActivationDescriptor(void) = 5059; + int rpc_cudnnSetActivationDescriptor(ptr activationDesc, int mode, int reluNanOpt, double coef) = 5060; + int2d1_result rpc_cudnnGetActivationDescriptor(ptr activationDesc) = 5061; + int rpc_cudnnSetActivationDescriptorSwishBeta(ptr activationDesc, double swish_beta) = 5062; + d_result rpc_cudnnGetActivationDescriptorSwishBeta(ptr activationDesc) = 5063; + int rpc_cudnnDestroyActivationDescriptor(ptr activationDesc) = 5064; + int rpc_cudnnActivationForward(ptr handle, ptr activationDesc, cudnn_scaling_t alpha, ptr xDesc, ptr x, cudnn_scaling_t beta, ptr yDesc, ptr y) = 5065; + ptr_result rpc_cudnnCreateLRNDescriptor(void) = 5066; + int rpc_cudnnSetLRNDescriptor(ptr normDesc, unsigned lrnN, double lrnAlpha, double lrnBeta, double lrnK) = 5067; + int1d3_result rpc_cudnnGetLRNDescriptor(ptr normDesc) = 5068; + int rpc_cudnnDestroyLRNDescriptor(ptr lrnDesc) = 5069; + int rpc_cudnnLRNCrossChannelForward(ptr handle, ptr normDesc, int lrnMode, cudnn_scaling_t alpha, ptr xDesc, ptr x, cudnn_scaling_t beta, ptr yDesc, ptr y) = 5070; + /* cudnn cnn inference */ + ptr_result rpc_cudnnCreateConvolutionDescriptor(void) = 5301; + int 
rpc_cudnnDestroyConvolutionDescriptor(ptr convDesc) = 5302; + mem_result rpc_cudnnGetConvolutionNdForwardOutputDim(ptr convDesc, ptr inputTensorDesc, ptr filterDesc, int nbDims) = 5303; + int rpc_cudnnSetConvolutionNdDescriptor(ptr convDesc, int arrayLength, mem_data padA, mem_data filterStrideA, mem_data dilationA, int mode, int computeType) = 5304; + mem_result rpc_cudnnGetConvolutionForwardAlgorithm_v7(ptr handle, ptr srcDesc, ptr filterDesc, ptr convDesc, ptr destDesc, int requestedAlgoCount) = 5305; + mem_result rpc_cudnnFindConvolutionForwardAlgorithm(ptr handle, ptr xDesc, ptr wDesc, ptr convDesc, ptr yDesc, int requestedAlgoCount) = 5306; + sz_result rpc_cudnnGetConvolutionForwardWorkspaceSize(ptr handle, ptr xDesc, ptr wDesc, ptr convDesc, ptr yDesc, int algo) = 5307; + int rpc_cudnnConvolutionForward(ptr handle, cudnn_scaling_t alpha, ptr xDesc, ptr x, ptr wDesc, ptr w, ptr convDesc, int algo, ptr workSpace, size_t workSpaceSizeInBytes, cudnn_scaling_t beta, ptr yDesc, ptr y) = 5308; + ptr_result rpc_cudnnBackendCreateDescriptor(int descriptorType) = 5309; + int rpc_cudnnBackendDestroyDescriptor(ptr descriptor) = 5310; + int rpc_cudnnBackendInitialize(ptr descriptor) = 5311; + int rpc_cudnnBackendFinalize(ptr descriptor) = 5312; + int rpc_cudnnBackendSetAttribute(ptr descriptor, + int attributeName, + int attributeType, + hyper elementCount, + mem_data arrayOfElements) = 5313; + mem_result rpc_cudnnBackendGetAttribute(ptr descriptor, + int attributeName, + int attributeType, + hyper requestedElementCount) = 5314; + int rpc_cudnnBackendExecute(ptr handle, ptr executionPlan, ptr variantPack) = 5315; } = 1; } = 99; diff --git a/cpu/cr.c b/cpu/cr.c index e14f58f5..7e1e2e74 100644 --- a/cpu/cr.c +++ b/cpu/cr.c @@ -754,7 +754,6 @@ static int cr_restore_resources(const char *path, api_record_t *record, resource break; case CUDA_LAUNCH_KERNEL: case CUDA_LAUNCH_COOPERATIVE_KERNEL: - case CUDA_LAUNCH_COOPERATIVE_KERNEL_MULTI_DEVICE: break; case rpc_cusolverDnCreate: if (cr_restore_cusolver(record, rm_cusolver) != 0) { @@ -821,9 +820,6 @@ int cr_launch_kernel(void) } else if (record->function == CUDA_LAUNCH_COOPERATIVE_KERNEL) { LOGE(LOG_ERROR, "not yet supported"); goto cleanup; - } else if (record->function == CUDA_LAUNCH_COOPERATIVE_KERNEL_MULTI_DEVICE) { - LOGE(LOG_ERROR, "not yet supported"); - goto cleanup; } } ret = 0; diff --git a/cpu/gsched_none.c b/cpu/gsched_none.c index 8b509089..6cb1152f 100644 --- a/cpu/gsched_none.c +++ b/cpu/gsched_none.c @@ -23,7 +23,7 @@ int gsched_none_init(void) pthread_mutex_init(&mutex_device, NULL); pthread_mutex_init(&mutex_ids, NULL); if ((res = cudaGetDeviceCount(&cuda_max_devices)) != cudaSuccess) { - LOGE(LOG_ERROR, "cudaGetDeviceCount failed: %s", cudaGetErrorString(res)); + LOGE(LOG_ERROR, "cudaGetDeviceCount failed: %s (%d)", cudaGetErrorString(res), res); return 1; } return 0; diff --git a/cpu/log.c b/cpu/log.c index e104890e..6e4807ee 100644 --- a/cpu/log.c +++ b/cpu/log.c @@ -20,6 +20,8 @@ #include #include +static struct timeval start_time = {0}; + struct log_data* get_log_data() { static struct log_data log_data; return &log_data; @@ -46,6 +48,7 @@ void init_log(char log_level, const char* proj_root) { get_log_data()->curr_level=log_level; get_log_data()->project_offset = str_find_last_of(proj_root, '/'); + gettimeofday(&start_time, 0); } void now_time(char* buf) @@ -57,9 +60,23 @@ void now_time(char* buf) sprintf(buf, "%s.%06ld", buffer, (long)tv.tv_usec); } +void delta_time(char* buf) +{ + struct timeval tv; + gettimeofday(&tv, 0); + 
timersub(&tv, &start_time, &tv); + char buffer[100]; + strftime(buffer, sizeof(buffer), "%X", localtime(&tv.tv_sec)); + sprintf(buf, "+%s.%06ld", buffer, (long)tv.tv_usec); +} + const char* to_string(log_level level) { +#ifdef NOCOLORS static const char* const buffer[] = {"ERROR", "WARNING", "INFO", "DEBUG"}; +#else + static const char* const buffer[] = {"\033[1m\033[31mERROR\033[0m", "\033[33mWARNING\033[0m", "\033[34mINFO\033[0m", "\033[32mDEBUG\033[0m"}; +#endif //NOCOLORS if(level > LOG_DEBUG){ return buffer[LOG_DEBUG]; } @@ -71,9 +88,13 @@ void loggf(log_level level, const char* formatstr, ... ) va_list vararg; va_start(vararg, formatstr); - char time[100]; + char time[64]; +#ifdef DELTA_TIME + delta_time(time); +#else now_time(time); - printf("%s (%s):\t", time, to_string(level)); +#endif //DELTA_TIME + printf("%s %s:\t", time, to_string(level)); vprintf(formatstr, vararg); printf("\n"); } @@ -84,11 +105,19 @@ void loggfe(log_level level, int line, const char* file, const char* formatstr, va_start(vararg, formatstr); char time[64]; +#ifdef DELTA_TIME + delta_time(time); +#else now_time(time); +#endif //DELTA_TIME printf("%s %7s: ", time, to_string(level)); vprintf(formatstr, vararg); char stripped[64]; strcpy(stripped, file); str_strip(stripped, get_log_data()->project_offset); - printf("\tin %s(%d)\n", stripped, line); +#ifdef NOCOLORS + printf("\tin %s:%d\n", stripped, line); +#else + printf("\tin \033[4m%s:%d\033[0m\n", stripped, line); +#endif //NOCOLORS } diff --git a/cpu/log.h b/cpu/log.h index 81ce80be..379c5865 100644 --- a/cpu/log.h +++ b/cpu/log.h @@ -38,6 +38,8 @@ else loggfe(level, __LINE__, __FILE__, __VA_ARGS__) #define LOG_DEBUG 3 #define LOG_DBG(i) LOG_DEBUG + i +#define DELTA_TIME 1 + typedef char log_level; struct log_data{ diff --git a/cpu/resource-mg.c b/cpu/resource-mg.c index e07e6a5f..f78503fe 100644 --- a/cpu/resource-mg.c +++ b/cpu/resource-mg.c @@ -75,6 +75,28 @@ static void* resource_mg_search_map(resource_mg *mg, void *client_address) LOGE(LOG_DEBUG, "no find: %p", client_address); return client_address; } + +void resource_mg_print(resource_mg *mg) +{ + size_t i; + resource_mg_map_elem *elem; + if (mg == NULL) { + LOGE(LOG_ERROR, "resource manager mg is NULL"); + return; + } + LOG(LOG_DEBUG, "new_res:"); + for (i = 0; i < mg->new_res.length; i++) { + LOG(LOG_DEBUG, "%p", *(void**)list_get(&mg->new_res, i)); + } + if (mg->bypass == 0) { + LOG(LOG_DEBUG, "map_res:"); + for (i = 0; i < mg->map_res.length; i++) { + elem = list_get(&mg->map_res, i); + LOG(LOG_DEBUG, "%p -> %p", elem->client_address, elem->cuda_address); + } + } +} + inline void* resource_mg_get(resource_mg *mg, void* client_address) { if (mg->bypass) { @@ -85,6 +107,7 @@ inline void* resource_mg_get(resource_mg *mg, void* client_address) return 0; } +#include int resource_mg_add_sorted(resource_mg *mg, void* client_address, void* cuda_address) { ssize_t start = 0; @@ -124,10 +147,11 @@ int resource_mg_add_sorted(resource_mg *mg, void* client_address, void* cuda_add return 0; } } - if (end < 0) { + if (end < 0LL) { end = 0; } - if (mid_elem->client_address < client_address) { + resource_mg_map_elem *end_elem = list_get(&mg->map_res, end); + if (end_elem->client_address < client_address) { end++; } return list_insert(&mg->map_res, end, &new_elem); diff --git a/cpu/resource-mg.h b/cpu/resource-mg.h index aa8bff25..ee8c44fa 100644 --- a/cpu/resource-mg.h +++ b/cpu/resource-mg.h @@ -28,15 +28,28 @@ resource_mg rm_streams; resource_mg rm_events; resource_mg rm_arrays; resource_mg rm_memory; 
+resource_mg rm_kernels; //Driver API RMs resource_mg rm_modules; resource_mg rm_functions; +resource_mg rm_globals; //Other RMs resource_mg rm_cusolver; resource_mg rm_cublas; +//CUDNN RMs +resource_mg rm_cudnn; +resource_mg rm_cudnn_tensors; +resource_mg rm_cudnn_filters; +resource_mg rm_cudnn_tensortransform; +resource_mg rm_cudnn_poolings; +resource_mg rm_cudnn_activations; +resource_mg rm_cudnn_lrns; +resource_mg rm_cudnn_convs; +resource_mg rm_cudnn_backendds; + /** initializes the resource manager * @@ -54,4 +67,6 @@ int resource_mg_create(resource_mg *mg, void* cuda_address); void* resource_mg_get(resource_mg *mg, void* client_address); +void resource_mg_print(resource_mg *mg); + #endif //_RESOURCE_MG_H_ diff --git a/cpu/server-exe.c index 8e358be6..a174e0a2 100644 --- a/cpu/server-exe.c +++ b/cpu/server-exe.c @@ -3,17 +3,22 @@ #include "log.h" #include +#include <stdint.h> int main(int argc, char** argv) { - - //TODO: Check if command path exists if (argc == 1) { - cricket_main_static(RPC_CD_PROG, RPC_CD_VERS); + cricket_main(RPC_CD_PROG, RPC_CD_VERS); } else if (argc == 2) { - cricket_main_hash(argv[1]); + uint64_t vers; + if (sscanf(argv[1], "%lu", &vers) != 1) { + printf("version string could not be converted to number\n"); + printf("usage: %s [unique rpc version]\n", argv[0]); + return 1; + } + cricket_main(RPC_CD_PROG, vers); } else { - LOGE(LOG_ERROR, "usage: %s [command]", argv[0]); + printf("usage: %s\n", argv[0]); } return 0; } diff --git a/cpu/server-library.c deleted file mode 100644 index cd5e57a1..00000000 --- a/cpu/server-library.c +++ /dev/null @@ -1,10 +0,0 @@ - -#include "cpu-server.h" -#include "log.h" - -/* shared object constructor; executes before main and thus hijacks main program */ -void __attribute__ ((constructor)) library_constr(void) -{ - cricket_main_hash("/proc/self/exe"); -} - diff --git a/docs/pytorch.md new file mode 100644 index 00000000..ebb8a3b5 --- /dev/null +++ b/docs/pytorch.md @@ -0,0 +1,130 @@ +# Cricket PyTorch + +Get the PyTorch sources +``` +git clone git@github.com:pytorch/pytorch.git +cd pytorch +git checkout v1.13.1 +git submodule update --init --recursive +``` + +Patch the sources: +- link cudart dynamically when building the Docker image +- link cudart dynamically when building ATen +- link cudart dynamically when building nccl +- deactivate building for some old CUDA versions.
(optional) +- add cricket dependencies to dockerfile +``` +diff --git a/Dockerfile b/Dockerfile +index 815a9108ce9..53ec7689493 100644 +--- a/Dockerfile ++++ b/Dockerfile +@@ -53,7 +53,7 @@ WORKDIR /opt/pytorch + COPY --from=conda /opt/conda /opt/conda + COPY --from=submodule-update /opt/pytorch /opt/pytorch + RUN --mount=type=cache,target=/opt/ccache \ +- TORCH_CUDA_ARCH_LIST="3.5 5.2 6.0 6.1 7.0+PTX 8.0" TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \ ++ TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0 8.0" TORCH_NVCC_FLAGS="-Xfatbin -compress-all -cudart shared" \ + CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" \ + python setup.py install + +@@ -93,3 +93,13 @@ WORKDIR /workspace + FROM official as dev + # Should override the already installed version from the official-image stage + COPY --from=build /opt/conda /opt/conda ++RUN apt-get update && apt-get install -y --no-install-recommends \ ++ rpcbind \ ++ git \ ++ automake \ ++ libtool \ ++ libssl-dev \ ++ inetutils-ping \ ++ vim \ ++ libgl1-mesa-dev \ ++ gdb && \ ++ rm -rf /var/lib/apt/lists/* +diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt +index 3055e290094..4cc14c794b0 100644 +--- a/aten/src/ATen/CMakeLists.txt ++++ b/aten/src/ATen/CMakeLists.txt +@@ -458,7 +458,7 @@ if(USE_CUDA AND NOT USE_ROCM) + endif() + if($ENV{ATEN_STATIC_CUDA}) + list(APPEND ATen_CUDA_DEPENDENCY_LIBS "${CUDA_TOOLKIT_ROOT_DIR}/lib64/libculibos.a") +- list(APPEND ATen_CUDA_DEPENDENCY_LIBS "${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcudart_static.a") ++ list(APPEND ATen_CUDA_DEPENDENCY_LIBS "${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcudart.so") + endif($ENV{ATEN_STATIC_CUDA}) + endif() +``` +`third_party/nccl/nccl` +``` +diff --git a/makefiles/common.mk b/makefiles/common.mk +index 1a1c2b6..c781b39 100644 +--- a/makefiles/common.mk ++++ b/makefiles/common.mk +@@ -54,7 +54,7 @@ CXXFLAGS := -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisi + # Maxrregcount needs to be set accordingly to NCCL_MAX_NTHREADS (otherwise it will cause kernel launch errors) + # 512 : 120, 640 : 96, 768 : 80, 1024 : 60 + # We would not have to set this if we used __launch_bounds__, but this only works on kernels, not on functions. +-NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all ++NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -cudart shared + # Use addprefix so that we can specify more than one path +-NVLDFLAGS := -L${CUDA_LIB} -lcudart -lrt ++NVLDFLAGS := -L${CUDA_LIB} -lcudart -lrt -cudart shared + + ########## GCOV ########## + GCOV ?= 0 # disable by default. 
+diff --git a/src/Makefile b/src/Makefile +index d658c35..5bd9876 100644 +--- a/src/Makefile ++++ b/src/Makefile +@@ -28,7 +28,7 @@ LIBDIR := $(BUILDDIR)/lib + OBJDIR := $(BUILDDIR)/obj + PKGDIR := $(BUILDDIR)/lib/pkgconfig + ##### target files +-CUDARTLIB ?= cudart_static ++CUDARTLIB ?= cudart + INCTARGETS := $(INCEXPORTS:%=$(INCDIR)/%) + LIBSONAME := $(LIBNAME:%=%.$(NCCL_MAJOR)) + LIBTARGET := $(LIBNAME:%=%.$(NCCL_MAJOR).$(NCCL_MINOR).$(NCCL_PATCH)) +``` + +Avoid `CMake Error: File /opt/pytorch/build_variables.bzl does not exist.` (https://github.com/pytorch/pytorch/pull/85947): +``` +diff --git a/.gitignore b/.gitignore +index 3e6f3831c4c..db6d9c3527e 100644 +--- a/.gitignore ++++ b/.gitignore +@@ -214,6 +214,7 @@ build_host_protoc + build_android + build_ios + /build_* ++!/build_variables.bzl + .build_debug/* + .build_release/* + .build_profile/* +``` + +build pytorch +``` +# only necessary when building on an NFS share +EXTRA_DOCKER_BUILD_FLAGS='--storage-opt "overlay.mount_program=/usr/bin/fuse-overlayfs"' + +make -f docker.Makefile +``` + +launch cricket server (outside of docker container) +``` +/bin/cricket-rpc-server +``` + +launch docker container, torch +``` +sudo docker run --gpus all --rm -it -v /cricket:/cricket --ipc=host pytorch:latest +LD_LIBRARY_PATH=/cricket/cpu REMOTE_GPU_ADDRESS= LD_PRELOAD=/cricket/cpu/cricket-client.so python3 /cricket/tests/test_apps/pytorch_minimal.py +``` +or under gdb supervision: +``` +LD_LIBRARY_PATH=/cricket/cpu gdb -x /cricket/tests/gdb_client_cmds python3 +(gdb) run /cricket/tests/test_apps/pytorch_minimal.py +``` + diff --git a/submodules/Makefile b/submodules/Makefile index e08c8fc9..e30870da 100644 --- a/submodules/Makefile +++ b/submodules/Makefile @@ -10,7 +10,7 @@ clean: cd cuda-gdb && git apply -R ../cuda-gdb.patch rm -rf lib -libtirpc: +libtirpc/install: @echo -e "\033[36m----> autogen libtirpc\033[0m" if [ ! -f "libtirpc/configure" ]; then cd libtirpc && ./bootstrap; fi @echo -e "\033[36m----> Configuring libtirpc\033[0m" @@ -36,12 +36,17 @@ else endif cuda-gdb/build: +ifeq (,$(wildcard ./cuda-gdb/build)) @echo -e "\033[36m----> Configuring cuda-gdb\033[0m" + @echo -e "\033[36m----> extracting cuda-gdb\033[0m" mkdir -p cuda-gdb/build && cd cuda-gdb/build && \ - ../configure --disable-werror --program-prefix=cuda- --enable-cuda --with-python=no --enable-targets="x86_64-apple-darwin,x86_64-unknown-linux-gnu,arm-elf-linux-gnu,m68k-unknown-linux-gnu" CFLAGS='-I/usr/local/cuda/include' LDFLAGS='-lpthread' + ../configure --disable-werror --program-prefix=cuda- --enable-cuda --with-python=no --enable-targets="x86_64-apple-darwin,x86_64-unknown-linux-gnu,arm-elf-linux-gnu,m68k-unknown-linux-gnu" CFLAGS='-I/usr/local/cuda/include -fPIC' LDFLAGS='-lpthread' @echo -e "\033[36m----> Building cuda-gdb\033[0m" CPATH=/usr/local/cuda/include $(MAKE) -C cuda-gdb/build CPATH=/usr/local/cuda/include $(MAKE) -C cuda-gdb/build/gdb libgdb.a +else + @echo -e "\033[36m----> cuda-gdb/build directory present. Skipping building of cuda-gdb\033[0m" +endif lib: mkdir -p lib diff --git a/tests/.gitignore b/tests/.gitignore index d5dbc051..c08de5b9 100644 --- a/tests/.gitignore +++ b/tests/.gitignore @@ -3,3 +3,4 @@ test-cricket test_api test_cpu test_kernel +test_kernel_call diff --git a/tests/Makefile b/tests/Makefile index 8adc5da7..3048d6d5 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -1,7 +1,11 @@ #MIT License... 
.PHONY: all clean test_apps cpu gpu samples +ifdef NOSAMPLES +all: test_apps cpu gpu bin +else all: test_apps cpu gpu samples bin +endif test_apps: @echo -e "\033[36m----> Building tests/test_apps\033[0m" @@ -20,13 +24,13 @@ samples: @echo -e "\033[36m----> Building tests/samples\033[0m" $(MAKE) -C samples -bin: cpu samples test_apps +bin: cpu test_apps mkdir -p bin cp cpu/unit/*.test bin cp test_apps/*.testapp bin - cp samples/matrixMul/matrixMul bin - cp samples/bandwidthTest/bandwidthTest bin - cp samples/nbody/nbody bin +ifndef NOSAMPLES + cp samples/samples-bin/*.sample bin +endif clean: @echo -e "\033[31m----> Cleaning up tests/test_apps\033[0m" diff --git a/tests/cpu/cubin/Makefile index ca8418a3..c42ee827 100644 --- a/tests/cpu/cubin/Makefile +++ b/tests/cpu/cubin/Makefile @@ -8,11 +8,12 @@ LDFLAGS = -arch=$(ARCH) -cudart shared -lcuda BINARY = main CUBIN = kernel.cubin FATBIN = kernel.fatbin +SHARED = kernel.so FILES := main.o .PHONY: all depend clean -all : $(BINARY) $(CUBIN) $(FATBIN) +all : $(BINARY) $(CUBIN) $(FATBIN) $(SHARED) $(BINARY) : $(FILES) $(LD) $(LDFLAGS) -o $@ $< @@ -26,6 +27,9 @@ $(BINARY) : $(FILES) %.o : %.cpp $(CC) $(CFLAGS) -c -o $@ $< +%.so : %.cu + $(CC) $(CFLAGS) --compiler-options '-fPIC' -o $@ $< + clean : rm -f *.o *.cubin *.fatbin $(BINARY) diff --git a/tests/cpu/cubin/main b/tests/cpu/cubin/main deleted file mode 100755 index f10f1de5..00000000 Binary files a/tests/cpu/cubin/main and /dev/null differ diff --git a/tests/cpu/cubin/main.cpp index c9137243..6bad89b8 100644 --- a/tests/cpu/cubin/main.cpp +++ b/tests/cpu/cubin/main.cpp @@ -3,6 +3,9 @@ #include #include #include +#include <fcntl.h> +#include <sys/stat.h> +#include <sys/mman.h> #define printCudaErrors(err) __printCudaErrors (err, __FILE__, __LINE__) @@ -66,6 +69,46 @@ void check_free_mem(int *mem, size_t len) cudaFree(mem); } +int getModuleFromCubin(CUmodule *module, const char *cubin) +{ + CUresult err; + if ((err = cuModuleLoad(module, "kernel.cubin")) != CUDA_SUCCESS) { + printCudaErrors(err); + return 1; + } + return 0; +} + +int getModuleFromCubinInMemory(CUmodule *module, const char *cubin) +{ + int fd = open(cubin, O_RDONLY); + if (fd < 0) { + printf("error\n"); + return 1; + } + struct stat st; + if (fstat(fd, &st) < 0) { + printf("error\n"); + return 1; + } + printf("size: %#0zx\n", (size_t)st.st_size); + void *buf = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0); + if (buf == MAP_FAILED) { + printf("error\n"); + return 1; + } + CUresult err; + if ((err = cuModuleLoadData(module, buf)) != CUDA_SUCCESS) { + printCudaErrors(err); + return 1; + } + return 0; +} + +int getModuleFromShared(CUmodule **module, const char *cubin) +{ + return 0; +} int main(int argc, char** argv) { @@ -83,9 +126,18 @@ CUmodule module; CUfunction func; printf("testing cubin...\n"); - if ((err = cuModuleLoad(&module, "kernel.cubin")) != CUDA_SUCCESS) { - printCudaErrors(err); + if (getModuleFromCubinInMemory(&module, "kernel.cubin") != 0) { + printf("error\n"); + return 1; } + // if (getModuleFromCubin(&module, "kernel.cubin") != 0) { + // printf("error\n"); + // return 1; + // } + // if ((err = getModuleFromShared(&module, "kernel.so")) != 0) { + // printf("error\n"); + // return 1; + // } if ((err = cuModuleGetFunction(&func, module, "kernel")) != CUDA_SUCCESS) { printCudaErrors(err); diff --git a/tests/cpu/unit/Makefile index 6e22bb27..e7de359c 100644 --- a/tests/cpu/unit/Makefile +++ b/tests/cpu/unit/Makefile @@ -16,7 +16,7 @@
INC_FLAGS += -I../../../cpu/ LIB_FLAGS += -L$(LIBTIRPC_PREFIX)/lib LIB_FLAGS += -L$(CUDA_SRC)/lib64 -LIB_FLAGS += -L../../../cpu/ +LIB_FLAGS += -L../../../bin/ CC_FLAGS += -std=gnu99 $(INC_FLAGS) -g -ggdb -fsanitize=address -fsanitize=pointer-compare -fsanitize=pointer-subtract -fsanitize-address-use-after-scope LD_FLAGS = $(LIB_FLAGS) @@ -32,11 +32,14 @@ CLIENT_LD_FLAGS = $(LD_FLAGS) -l:cricket-client.so all : $(BIN_CLIENT_TESTS) $(BIN_SERVER_TESTS) -$(BIN_SERVER_TESTS) : %.test:%.o +../../../bin/cricket-server.so: + $(MAKE) -C ../../../ bin/cricket-server.so + +$(BIN_SERVER_TESTS) : %.test:%.o ../../../bin/cricket-server.so $(LD) $(CC_FLAGS) -o $@ $< $(SERVER_LD_FLAGS) $(OBJ_SERVER_TESTS) : %.o:%.c - $(CC) $(CC_FLAGS) -c -o $@ $< $(SERVER_LD_FLAGS) + $(CC) $(CC_FLAGS) -c -o $@ $< clean: rm -f $(OBJ_SERVER_TESTS) $(OBJ_CLIENT_TESTS) $(BIN_SERVER_TESTS) $(BIN_CLIENT_TESTS) diff --git a/tests/gdb_client_cmds b/tests/gdb_client_cmds new file mode 100644 index 00000000..825cfdd6 --- /dev/null +++ b/tests/gdb_client_cmds @@ -0,0 +1,3 @@ +python gdb.execute("set environment CRICKET_NOHASH=yes") +python gdb.execute("set environment REMOTE_GPU_ADDRESS=localhost") +python gdb.execute("set environment LD_PRELOAD=../../cpu/cricket-client.so") \ No newline at end of file diff --git a/tests/samples/.gitignore b/tests/samples/.gitignore new file mode 100644 index 00000000..33a20c36 --- /dev/null +++ b/tests/samples/.gitignore @@ -0,0 +1,2 @@ +samples-bin +samples diff --git a/tests/samples/Makefile b/tests/samples/Makefile index 38aa9512..c7c83f63 100644 --- a/tests/samples/Makefile +++ b/tests/samples/Makefile @@ -1,55 +1,101 @@ CC = gcc LD = gcc -CFLAGS = -Wall -std=gnu99 -ARCH = sm_61 -CUDA_DIR = /usr/local/cuda +CFLAGS = -Wall -std=gnu99 -g -ggdb +SAMPLES = samples-bin/matrixMul.compressed.sample \ + samples-bin/matrixMul.uncompressed.sample \ + samples-bin/nbody.uncompressed.sample \ + samples-bin/nbody.compressed.sample \ + samples-bin/bandwidthTest.sample \ + samples-bin/mnistCUDNN.sample + +CUDA_PATH = /usr/local/cuda +SMS = 75 60 +CUDA_SAMPLES_RELEASE ?= 12.1 +CUDA_SAMPLES_URL = https://github.com/NVIDIA/cuda-samples/archive/refs/tags/v${CUDA_SAMPLES_RELEASE}.tar.gz +CUDNN_SAMPLES_URL = https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/libcudnn8-samples-8.9.2.26-1.cuda12.1.x86_64.rpm PWD = $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST)))) .PHONY: all clean distclean -all : matrixMul/matrixMul bandwidthTest/bandwidthTest nbody/nbody - -matrixMul : - mkdir -p $(PWD)/matrixMul - cp -r $(CUDA_DIR)/samples/0_Simple/matrixMul $(PWD) - make -C matrixMul clean - -matrixMul/matrixMul : matrixMul - make -C matrixMul \ - NVCCFLAGS="-m64 -cudart shared" \ - GENCODE_FLAGS="-arch=$(ARCH)" \ - CPATH="$(CUDA_DIR)/samples/common/inc" - -bandwidthTest : - mkdir -p $(PWD)/bandwidthTest - cp -r $(CUDA_DIR)/samples/1_Utilities/bandwidthTest $(PWD) - make -C bandwidthTest clean - -bandwidthTest/bandwidthTest : bandwidthTest - make -C bandwidthTest \ - NVCCFLAGS="-m64 -cudart shared" \ - GENCODE_FLAGS="-arch=$(ARCH)" \ - CPATH="$(CUDA_DIR)/samples/common/inc" - -nbody : - mkdir -p $(PWD)/nbody - cp -r $(CUDA_DIR)/samples/5_Simulations/nbody $(PWD) - make -C nbody clean - -nbody/nbody : nbody - make -C nbody \ - NVCCFLAGS="-m64 -cudart shared" \ - GENCODE_FLAGS="-arch=$(ARCH)" \ - CPATH="$(CUDA_DIR)/samples/common/inc" +all : $(SAMPLES) + +samples: + mkdir -p $@ + wget ${CUDA_SAMPLES_URL} -O - | tar -xz --strip-components=1 -C $@ + +cudnn-samples: + mkdir -p $@ + wget ${CUDNN_SAMPLES_URL} 
-O - | rpm2archive - | tar zxf - --strip-components=4 -C $@ + +samples-bin: + mkdir -p $@ + +samples-bin/data: + cp -R cudnn-samples/mnistCUDNN/data $@ + +samples-bin/mnistCUDNN.sample : cudnn-samples samples-bin samples-bin/data + make -C cudnn-samples/mnistCUDNN \ + clean + make -C cudnn-samples/mnistCUDNN \ + NVCCFLAGS="-cudart shared --no-compress -G" \ + SMS="${SMS}" \ + CUDA_PATH=${CUDA_PATH} \ + DEBUG=1 + cp cudnn-samples/mnistCUDNN/mnistCUDNN $@ + +samples-bin/nbody.uncompressed.sample : samples samples-bin + make -C samples/Samples/5_Domain_Specific/nbody \ + clean + make -C samples/Samples/5_Domain_Specific/nbody \ + NVCCFLAGS="-cudart shared --no-compress -g -G" \ + SMS="${SMS}" \ + CPATH="samples/Common" \ + CUDA_PATH=${CUDA_PATH} + cp samples/Samples/5_Domain_Specific/nbody/nbody $@ + +samples-bin/nbody.compressed.sample : samples samples-bin + make -C samples/Samples/5_Domain_Specific/nbody \ + clean + make -C samples/Samples/5_Domain_Specific/nbody \ + NVCCFLAGS="-cudart shared -Xfatbin --compress-all -g -G" \ + SMS="${SMS}" \ + CPATH="samples/Common" \ + CUDA_PATH=${CUDA_PATH} + cp samples/Samples/5_Domain_Specific/nbody/nbody $@ + +samples-bin/matrixMul.compressed.sample : samples samples-bin + make -C samples/Samples/0_Introduction/matrixMul \ + clean + make -C samples/Samples/0_Introduction/matrixMul \ + NVCCFLAGS="-cudart shared -Xfatbin --compress-all" \ + SMS="${SMS}" \ + CPATH="samples/Common" \ + CUDA_PATH=${CUDA_PATH} + cp samples/Samples/0_Introduction/matrixMul/matrixMul $@ + +samples-bin/matrixMul.uncompressed.sample : samples samples-bin + make -C samples/Samples/0_Introduction/matrixMul \ + clean + make -C samples/Samples/0_Introduction/matrixMul \ + NVCCFLAGS="-cudart shared --no-compress" \ + SMS="${SMS}" \ + CPATH="samples/Common" \ + CUDA_PATH=${CUDA_PATH} + cp samples/Samples/0_Introduction/matrixMul/matrixMul $@ + +samples-bin/bandwidthTest.sample : samples samples-bin + make -C samples/Samples/1_Utilities/bandwidthTest \ + clean + make -C samples/Samples/1_Utilities/bandwidthTest \ + NVCCFLAGS="-cudart shared --no-compress" \ + SMS="${SMS}" \ + CPATH="samples/Common" \ + CUDA_PATH=${CUDA_PATH} + cp samples/Samples/1_Utilities/bandwidthTest/bandwidthTest $@ clean : - rm -f *.elf *.hex *.o *.d .depend *~ - make -C matrixMul clean - make -C bandwidthTest clean - make -C nbody clean + rm -rf samples-bin distclean : clean - rm -r matrixMul - rm -r bandwidthTest - rm -r nbody \ No newline at end of file + rm -rf samples \ No newline at end of file diff --git a/tests/test_apps/Makefile b/tests/test_apps/Makefile index 73a1d32c..dafae5a3 100644 --- a/tests/test_apps/Makefile +++ b/tests/test_apps/Makefile @@ -10,6 +10,7 @@ CFLAGS = -arch=$(ARCH) -cudart shared #CFLAGS = -arch=$(ARCH) LD = nvcc -ccbin g++ LDFLAGS = -arch=$(ARCH) -cudart shared +DEBUG_FLAGS = #-g -G #LDFLAGS = -lcuda -arch=$(ARCH) TEST_CPU_BIN = cpu.testapp TEST_CPU_O = test_cpu.o @@ -19,14 +20,19 @@ TEST_KERNEL_BIN = kernel.testapp TEST_KERNEL_O = test_kernel.o BINARY = cricket.testapp +TEST_KERNEL_LIB_O = test_kernel_lib.o +TEST_KERNEL_LIB = test_kernel.so +TEST_KERNEL_LIB_CALL_O = test_kernel_call.o +TEST_KERNEL_LIB_CALL = test_kernel_call + LIBCUDA_WRAPPER = libcuda.so.1 LIBCUDA_OBJ = libcuda.o -LIBCUDA_LIBS = -ldl +LIBCUDA_LIBS = -ldl -I../../cpu FILES := matmul.cu .PHONY: all depend clean -all : $(TEST_KERNEL_BIN) +all : $(TEST_KERNEL_BIN) $(BINARY) $(TEST_CPU_BIN) $(TEST_API_BIN) $(TEST_KERNEL_LIB) $(TEST_KERNEL_LIB_CALL) $(TEST_CPU_O) : $(FILES) $(CC) -DTEST_CPU $(CFLAGS) -dc -o $@ $< @@ 
-55,11 +61,23 @@ $(BINARY) : $(FILES) $(LIBCUDA_OBJ) : $(LIBCUDA_OBJ:.o=.c) $(HOST_CC) -c -fpic -o $@ $< $(LIBCUDA_LIBS) +$(TEST_KERNEL_LIB_O) : $(FILES) + $(CC) $(CFLAGS) $(DEBUG_FLAGS) -dc --compiler-options '-fPIC' -o $@ $< + +$(TEST_KERNEL_LIB) : $(TEST_KERNEL_LIB_O) + $(LD) $(LDFLAGS) $(DEBUG_FLAGS) -shared -o lib$@ $^ + +$(TEST_KERNEL_LIB_CALL_O) : $(TEST_KERNEL_LIB_CALL_O:.o=.c) + $(HOST_CC) -c -o $@ $< + +$(TEST_KERNEL_LIB_CALL) : $(TEST_KERNEL_LIB_CALL_O) + $(HOST_LD) -o $@ $< -I. -ldl + $(LIBCUDA_WRAPPER) : $(LIBCUDA_OBJ) $(HOST_LD) -shared -o $@ $^ clean : - rm -f *.elf *.hex *.o *.d .depend *~ $(BINARY) $(LIBCUDA_WRAPPER) $(TEST_CPU_BIN) $(TEST_API_BIN) $(TEST_KERNEL_BIN) + rm -f *.elf *.hex *.o *.d .depend *~ $(BINARY) $(LIBCUDA_WRAPPER) $(TEST_CPU_BIN) $(TEST_API_BIN) $(TEST_KERNEL_BIN) $(TEST_KERNEL_LIB) $(TEST_KERNEL_LIB_CALL) diff --git a/tests/test_apps/matmul.cu index 7790ae7b..b4960c39 100644 --- a/tests/test_apps/matmul.cu +++ b/tests/test_apps/matmul.cu @@ -8,7 +8,7 @@ #include "cricket-cuda.h" #define N 32 -#define ITERATIONS 1024*128*8*16 +#define ITERATIONS 1024*128*4 const int blocksize = 32; #ifndef RANDOM_INIT @@ -173,7 +173,6 @@ int main() #endif //RANDOM_INIT uint16_t *res; uint16_t *dev_A, *dev_x, *dev_res; - uint16_t *dev_ptr; struct timeval begin, end; struct timeval messb, messa; const int A_size = N*N*sizeof(uint16_t); @@ -253,11 +252,9 @@ int main() */ cudaMalloc( (void**)&dev_x, x_size ); cudaMalloc( (void**)&dev_res, x_size ); - cudaMalloc( (void**)&dev_ptr, A_size ); printf("Mallocs done\n"); - cudaMemcpy( dev_ptr, A, A_size, cudaMemcpyHostToDevice ); cudaMemcpy( dev_A, A, A_size, cudaMemcpyHostToDevice ); cudaMemcpy( dev_x, x, x_size, cudaMemcpyHostToDevice ); @@ -265,7 +262,7 @@ int main() dim3 dimBlock( blocksize, 1 ); dim3 dimGrid( 1, 1); kernel<<<dimGrid, dimBlock>>>(dev_A, dev_x, dev_res, 0, 0, 0, 0); - //kernel_no_param<<<dimGrid, dimBlock>>>(); + kernel_no_param<<<dimGrid, dimBlock>>>(); //void *args = NULL; //int result = cudaLaunchKernel((void*)kernel_no_param, dimGrid, dimBlock, &args, 0LL, NULL); @@ -305,7 +302,7 @@ int main() gettimeofday(&end, NULL); printf("elapsed time: %0u.%06u\n", (end.tv_sec - begin.tv_sec), (end.tv_usec - begin.tv_usec)); - + free(res); return (success ? 0 : 1); } diff --git a/tests/test_apps/pytorch_minimal.py b/tests/test_apps/pytorch_minimal.py new file mode 100644 index 00000000..d6f49e2d --- /dev/null +++ b/tests/test_apps/pytorch_minimal.py @@ -0,0 +1,73 @@ +# BSD 3-Clause License +# +# Copyright (c) 2017-2022, Pytorch contributors +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import torch +import math + + +dtype = torch.float +device = torch.device("cuda:0") + +# Create random input and output data +x = torch.linspace(-math.pi, math.pi, 2000, device=device, dtype=dtype) +y = torch.sin(x) + +# Randomly initialize weights +a = torch.randn((), device=device, dtype=dtype) +b = torch.randn((), device=device, dtype=dtype) +c = torch.randn((), device=device, dtype=dtype) +d = torch.randn((), device=device, dtype=dtype) + +learning_rate = 1e-6 +for t in range(2000): + # Forward pass: compute predicted y + y_pred = a + b * x + c * x ** 2 + d * x ** 3 + + # Compute and print loss + loss = (y_pred - y).pow(2).sum().item() + if t % 100 == 99: + print(t, loss) + + # Backprop to compute gradients of a, b, c, d with respect to loss + grad_y_pred = 2.0 * (y_pred - y) + grad_a = grad_y_pred.sum() + grad_b = (grad_y_pred * x).sum() + grad_c = (grad_y_pred * x ** 2).sum() + grad_d = (grad_y_pred * x ** 3).sum() + + # Update weights using gradient descent + a -= learning_rate * grad_a + b -= learning_rate * grad_b + c -= learning_rate * grad_c + d -= learning_rate * grad_d + + +print(f'Result: y = {a.item()} + {b.item()} x + {c.item()} x^2 + {d.item()} x^3') + diff --git a/tests/test_apps/test_kernel_call.c b/tests/test_apps/test_kernel_call.c new file mode 100644 index 00000000..ce538a2c --- /dev/null +++ b/tests/test_apps/test_kernel_call.c @@ -0,0 +1,26 @@ +#include +#include + +int main(int argc, char** argv) +{ + void *dlhandle; + + if ((dlhandle = dlopen("./libtest_kernel.so", RTLD_LAZY)) == NULL) { + printf("error opening library\n"); + return 1; + } + + int (*fn)(void); + + printf("kernel: %p\n", dlsym(dlhandle, "_Z6kernelPtS_S_csix")); + + if ((fn = dlsym(dlhandle, "main")) == NULL) { + printf("dlsym failed\n"); + return 1; + } + + fn(); + + return 0; +} + diff --git a/tests/test_apps/yolo.py b/tests/test_apps/yolo.py new file mode 100644 index 00000000..1d929155 --- /dev/null +++ b/tests/test_apps/yolo.py @@ -0,0 +1,12 @@ +import torch + +model = torch.hub.load("ultralytics/yolov5", "yolov5s", device='cuda:0') # or yolov5n - yolov5x6, custom + +# Images +img = "https://ultralytics.com/images/zidane.jpg" # or file, Path, PIL, OpenCV, numpy, list + +# Inference +results = model(img) + +# Results +results.print() # or .show(), .save(), .crop(), .pandas(), etc. 
\ No newline at end of file diff --git a/utils/Dockerfile b/utils/Dockerfile index 30fddb78..66cfcae5 100644 --- a/utils/Dockerfile +++ b/utils/Dockerfile @@ -1,4 +1,4 @@ -FROM centos:8 +FROM rockylinux:8 LABEL \ org.label-schema.schema-version = "1.0" \ @@ -9,25 +9,25 @@ LABEL \ org.label-schema.author.email = "niklas.eiling@eonerc.rwth-aachen.de" \ org.label-schema.vcs-url = "https://git.rwth-aachen.de/niklas.eiling/cricket" -RUN cd /etc/yum.repos.d/ && sed -i 's/mirrorlist/#mirrorlist/g' /etc/yum.repos.d/CentOS-* && sed -i 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-* && yum update -y - RUN dnf -y update && \ - dnf install -y epel-release dnf-plugins-core && \ - dnf install -y https://rpms.remirepo.net/enterprise/remi-release-8.rpm && \ - dnf config-manager --set-enabled powertools && \ - dnf config-manager --set-enabled remi + dnf install -y epel-release dnf-plugins-core && \ + dnf install -y https://rpms.remirepo.net/enterprise/remi-release-8.rpm && \ + dnf config-manager --set-enabled powertools && \ + dnf config-manager --set-enabled remi RUN dnf install -y make bash git gcc autoconf libtool automake rpcgen \ ncurses-devel zlib-devel binutils-devel mesa-libGL-devel \ libvdpau-devel mesa-libEGL-devel openssl-devel rpcbind \ texinfo bison flex python3 which libibverbs libibverbs-devel \ - libasan cppcheck wget expat-devel xz-devel mesa-libGLU-devel freeglut-devel + libasan cppcheck wget expat-devel xz-devel mesa-libGLU-devel freeglut-devel \ + elfutils-libelf-devel cpio openssl-devel openssl-libs \ + freeimage freeimage-devel ENV LD_LIBRARY_PATH="/usr/local/lib:/usr/local/lib64:${LD_LIBRARY_PATH}" RUN dnf -y config-manager --add-repo http://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo && \ - dnf --refresh -y install cuda-compiler-11-1 cuda-libraries-devel-11-1 cuda-samples-11-1 cuda-driver-devel-11-1 && \ - ln -s cuda-11.1 /usr/local/cuda && \ + dnf --refresh -y install cuda-compiler-12-1 cuda-libraries-devel-12-1 cuda-driver-devel-12-1 cuda-profiler-api-12-1 cuda-nvml-devel-12-1 nvidia-driver-NVML-530.30.02 libcudnn8-devel && \ + ln -s cuda-12.1 /usr/local/cuda && \ ln -s libcuda.so /usr/local/cuda/targets/x86_64-linux/lib/stubs/libcuda.so.1 ENV PATH="/usr/local/cuda/bin:${PATH}" diff --git a/utils/Dockerfile.cuda10 b/utils/Dockerfile.cuda11 similarity index 63% rename from utils/Dockerfile.cuda10 rename to utils/Dockerfile.cuda11 index 2dcec62b..a261bb98 100644 --- a/utils/Dockerfile.cuda10 +++ b/utils/Dockerfile.cuda11 @@ -9,6 +9,8 @@ LABEL \ org.label-schema.author.email = "niklas.eiling@eonerc.rwth-aachen.de" \ org.label-schema.vcs-url = "https://git.rwth-aachen.de/niklas.eiling/cricket" +RUN cd /etc/yum.repos.d/ && sed -i 's/mirrorlist/#mirrorlist/g' /etc/yum.repos.d/CentOS-* && sed -i 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-* && yum update -y + RUN dnf -y update && \ dnf install -y epel-release dnf-plugins-core && \ dnf install -y https://rpms.remirepo.net/enterprise/remi-release-8.rpm && \ @@ -18,16 +20,17 @@ RUN dnf -y update && \ RUN dnf install -y make bash git gcc autoconf libtool automake rpcgen \ ncurses-devel zlib-devel binutils-devel mesa-libGL-devel \ libvdpau-devel mesa-libEGL-devel openssl-devel rpcbind \ - texinfo bison flex python3 which libibverbs libasan \ - cppcheck wget expat-devel xz-devel + texinfo bison flex python3 which libibverbs libibverbs-devel \ + libasan cppcheck wget expat-devel xz-devel mesa-libGLU-devel 
freeglut-devel \ + elfutils-libelf-devel cpio openssl-devel openssl-libs \ + freeimage freeimage-devel ENV LD_LIBRARY_PATH="/usr/local/lib:/usr/local/lib64:${LD_LIBRARY_PATH}" -RUN dnf -y install https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-repo-rhel8-10.2.89-1.x86_64.rpm && \ - dnf --refresh -y install cuda-compiler-10-2 cuda-libraries-dev-10-2 cuda-samples-10-2 cuda-driver-dev-10-2 && \ - ln -s cuda-10.2 /usr/local/cuda && \ +RUN dnf -y config-manager --add-repo http://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo && \ + dnf --refresh -y install cuda-compiler-11-1 cuda-libraries-devel-11-1 cuda-samples-11-1 cuda-driver-devel-11-1 cuda-nvprof-11-1 cuda-nvml-devel-11-1 nvidia-driver-NVML-530.30.02 libcudnn8-devel && \ + ln -s cuda-11.1 /usr/local/cuda && \ ln -s libcuda.so /usr/local/cuda/targets/x86_64-linux/lib/stubs/libcuda.so.1 - ENV PATH="/usr/local/cuda/bin:${PATH}" ENV LIBRARY_PATH="/usr/local/cuda/targets/x86_64-linux/lib/stubs:$(LIBRARY_PATH}"
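For reference, a minimal end-to-end launch sketch for the standalone RPC server introduced in this patch (`cpu/server-exe.c`): the server registers `RPC_CD_PROG` with either the default `RPC_CD_VERS` or a numeric version passed as its only argument, presumably so that several server instances can run side by side. The install prefix, host name, and test binary below are illustrative placeholders, not values defined by this patch; as in the `docs/pytorch.md` example above, the client may additionally need libtirpc on its `LD_LIBRARY_PATH`.
```
# On the GPU host: start the server with the default RPC version ...
./bin/cricket-rpc-server

# ... or with an explicit, unique numeric RPC version (parsed via sscanf "%lu")
./bin/cricket-rpc-server 42

# On the client side: preload the client library and point it at the GPU host
REMOTE_GPU_ADDRESS=gpu-host.example.com \
LD_PRELOAD=./bin/cricket-client.so \
./tests/bin/kernel.testapp
```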