diff --git a/.gitignore b/.gitignore
index 8c3acb57..11652fae 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,7 @@ build/
 .clangd
 .project
 .cproject
+*.code-workspace
 .settings/
 .vscode/
 .directory
@@ -39,3 +40,7 @@ core.*
 compile_commands.json
 tags
 .gdb_history
+
+# perf data
+perf.data
+main
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 35089bc0..df9b8eb4 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -21,7 +21,7 @@ stages:
 
 ##############################################################################
 # Build docker image
-prepare:centos8:docker-dev:
+prepare:rocky9:docker-dev:
   stage: prepare
   script:
     - docker build
@@ -31,13 +31,13 @@ prepare:centos8:docker-dev:
   tags:
     - docker
 
-prepare:centos8:cuda10:
+prepare:centos8:cuda11:
   stage: prepare
   script:
     - docker build
-      --file utils/Dockerfile.cuda10
-      --tag ${DOCKER_IMAGE_DEV}_cuda10:${DOCKER_TAG}
-      --tag ${DOCKER_IMAGE_DEV}_cuda10:latest .
+      --file utils/Dockerfile.cuda11
+      --tag ${DOCKER_IMAGE_DEV}_cuda11:${DOCKER_TAG}
+      --tag ${DOCKER_IMAGE_DEV}_cuda11:latest .
   tags:
     - docker
 
@@ -57,7 +57,7 @@ prepare:centos8:cuda10:
 
 build:
   stage: build
-  needs: ["prepare:centos8:docker-dev"]
+  needs: ["prepare:rocky9:docker-dev"]
   script:
     - make -j 32 libtirpc
     - make -j 32 cuda-gdb
@@ -68,6 +68,7 @@ build:
     paths:
       - bin
       - tests/bin
+      - tests/samples/samples-bin
   image: ${DOCKER_IMAGE_DEV}:${DOCKER_TAG}
   cache:
     paths:
@@ -82,7 +83,7 @@ build:
 
 build:ib:
   stage: build
-  needs: ["prepare:centos8:docker-dev"]
+  needs: ["prepare:rocky9:docker-dev"]
   script:
     - make -j 32 libtirpc
     - make -j 32 cuda-gdb
@@ -108,19 +109,19 @@ build:ib:
   tags:
     - docker
 
-build:cuda10:
+build:cuda11:
   stage: build
-  needs: ["prepare:centos8:cuda10"]
+  needs: ["prepare:centos8:cuda11"]
   script:
     - make -j 32 libtirpc
     - make -j 32 cuda-gdb
-    - make -j 1 LOG=INFO
+    - make -j 1 LOG=INFO NOSAMPLES=yes
   artifacts:
     expire_in: 1 week
     paths:
      - bin
      - tests/bin
-  image: ${DOCKER_IMAGE_DEV}_cuda10:${DOCKER_TAG}
+  image: ${DOCKER_IMAGE_DEV}_cuda11:${DOCKER_TAG}
   cache:
     paths:
       - gpu/build
@@ -130,13 +131,13 @@ build:cuda10:
       - submodules/libtirpc
       - submodules/cuda-gdb
       - submodules/cuda-gdb-src.rpm
-    key: build_cuda10
+    key: build_cuda11
   tags:
     - docker
 
 build:debug:
   stage: build
-  needs: ["prepare:centos8:docker-dev"]
+  needs: ["prepare:rocky9:docker-dev"]
   script:
     - make -j 32 libtirpc
     - make -j 32 cuda-gdb
@@ -170,6 +171,7 @@ build:debug:
     LDIR: '$CI_BUILDS_DIR/$CI_PROJECT_PATH/bin'
     SAMPLES_PATH: '/usr/local/cuda/samples'
     PARAMETER: ''
+    CHDIR: 'tests'
   script:
     - mkdir ~/.ssh &&
      echo "-----BEGIN OPENSSH PRIVATE KEY-----" > ~/.ssh/id_rsa &&
@@ -179,9 +181,10 @@ build:debug:
      echo $KNOWN_HOSTS > ~/.ssh/known_hosts &&
      chmod 600 ~/.ssh/id_rsa
     - ssh $GPU_TARGET mkdir -p $RDIR
     - scp -r $LDIR/* $GPU_TARGET:$RDIR/
-    - ssh $GPU_TARGET "LD_PRELOAD=$RDIR/libtirpc.so.3:$RDIR/cricket-server.so $RDIR/$TEST_BINARY" &
+    - ssh $GPU_TARGET "LD_PRELOAD=$RDIR/libtirpc.so.3 $RDIR/cricket-rpc-server 255" &
     - sleep 2
-    - REMOTE_GPU_ADDRESS="ghost.acs-lab.eonerc.rwth-aachen.de" PATH=$LDIR:$PATH LD_PRELOAD=$LDIR/libtirpc.so.3:$LDIR/cricket-client.so $LDIR/$TEST_BINARY $PARAMETER
+    - cd $LDIR/$CHDIR
+    - CRICKET_RPCID=255 REMOTE_GPU_ADDRESS="ghost.acs-lab.eonerc.rwth-aachen.de" PATH=$LDIR:$PATH LD_PRELOAD=$LDIR/libtirpc.so.3:$LDIR/cricket-client.so ./$TEST_BINARY $PARAMETER
   after_script:
     - ssh $GPU_TARGET rm -rf $RDIR
     - ssh $GPU_TARGET pkill -fe -2 $RDIR/test_kernel
@@ -216,21 +219,27 @@ test:test_programs(2/2):
 
 test:test_kernel:
   extends: .remote-gpu
   variables:
-    TEST_BINARY: 'tests/kernel.testapp'
+    TEST_BINARY: 'kernel.testapp'
 
 test:samples:matrixMul:
   extends: .remote-gpu
   variables:
-    TEST_BINARY: 'tests/matrixMul'
+    TEST_BINARY: 'matrixMul.compressed.sample'
 
 test:samples:bandwidthTest:
   extends: .remote-gpu
   variables:
-    TEST_BINARY: 'tests/bandwidthTest'
+    TEST_BINARY: 'bandwidthTest.sample'
 
 test:samples:nbody:
   extends: .remote-gpu
   variables:
-    TEST_BINARY: 'tests/nbody'
+    TEST_BINARY: 'nbody.uncompressed.sample'
     PARAMETER: '-benchmark'
+
+test:samples:mnistCUDNN:
+  extends: .remote-gpu
+  variables:
+    CHDIR: '../tests/samples/samples-bin'
+    TEST_BINARY: 'mnistCUDNN.sample'
+
diff --git a/Makefile b/Makefile
index 2e5401fc..7cc6f46c 100644
--- a/Makefile
+++ b/Makefile
@@ -19,7 +19,7 @@ cuda-gdb:
 
 libtirpc:
 	@echo -e "\033[36m----> Building libtirpc\033[0m"
-	$(MAKE) -C submodules libtirpc
+	$(MAKE) -C submodules libtirpc/install
 
 gpu: cuda-gdb
 	@echo -e "\033[36m----> Building gpu\033[0m"
@@ -33,7 +33,7 @@ tests:
 	@echo -e "\033[36m----> Building test kernels\033[0m"
 	$(MAKE) -C tests
 
-install-cpu: bin/cricket-client.so bin/cricket-server.so bin/libtirpc.so bin/libtirpc.so.3 bin/tests
+install-cpu: bin/cricket-client.so bin/cricket-rpc-server bin/libtirpc.so bin/libtirpc.so.3 bin/tests
 	@echo -e "\033[36m----> Copying cpu binaries to build/bin\033[0m"
 
 install: install-cpu bin/cricket
@@ -51,7 +51,8 @@ bin/cricket-client.so: bin
 
 bin/cricket-server.so: bin
 	$(MAKE) -C cpu cricket-server.so
-	cp cpu/cricket-server.so bin
+	mv cpu/cricket-server.so bin/cricket-server.so
+
 bin/cricket-rpc-server: bin
 	$(MAKE) -C cpu cricket-rpc-server
diff --git a/cpu/Makefile b/cpu/Makefile
index a03a7fc9..c2a13b13 100644
--- a/cpu/Makefile
+++ b/cpu/Makefile
@@ -1,12 +1,12 @@
-#RPC server library
-SERVER = cricket-server.so
 #Standalone RPC Server
-SERVER_BIN = cricket-rpc-server
+SERVER = cricket-rpc-server
+SERVER_LIB = cricket-server.so
 #RPC client library
 CLIENT = cricket-client.so
 
 CUDA_SRC = /usr/local/cuda
 LIBTIRPC_PREFIX = ../submodules/libtirpc/install
+SUBMODULE_LIBS = ../submodules/lib
 
 CC = gcc
 LD = gcc
@@ -39,7 +39,10 @@ SRC_SERVER = $(RPC_XDR) \
 	cr.c \
 	gsched_none.c \
 	oob.c \
-	mt-memcpy.c
+	mt-memcpy.c \
+	cpu-elf2.c \
+	cpu-server-nvml.c \
+	cpu-server-cudnn.c
 
 SRC_SERVER_LIB = server-library.c
 SRC_SERVER_EXE = server-exe.c
@@ -55,7 +58,11 @@ SRC_CLIENT = $(RPC_XDR) \
 	cpu-libwrap.c \
 	cpu-client-cusolver.c \
 	oob.c \
-	mt-memcpy.c
+	mt-memcpy.c \
+	cpu-elf2.c \
+	cpu-client-nvml.c \
+	cpu-client-cudnn.c \
+	cpu-client-cublas.c
 
 #	cpu-client-driver-hidden.c \
 
@@ -72,15 +79,17 @@ RPCGEN_FLAGS = -C -M -N
 INC_FLAGS += -I$(LIBTIRPC_PREFIX)/include/tirpc
 INC_FLAGS += -I$(CUDA_SRC)/include
 
-LIB_FLAGS += -L$(LIBTIRPC_PREFIX)/lib -L$(CUDA_SRC)/lib64
-CC_FLAGS += -std=gnu99 $(INC_FLAGS) -O2
+LIB_FLAGS += -L$(LIBTIRPC_PREFIX)/lib
+LIB_FLAGS += -L$(CUDA_SRC)/lib64
+LIB_FLAGS += -L$(CUDA_SRC)/lib64/stubs
+CC_FLAGS += -std=gnu11 $(INC_FLAGS) #-O2
 # TODO: use extern in header files instead of direct definition e.g. in cpu-common.h to remove -fcommon flag
 CC_FLAGS += -fcommon
-LD_FLAGS = $(LIB_FLAGS) -ltirpc -ldl -lcrypto
+LD_FLAGS = $(LIB_FLAGS) -ltirpc -ldl -lcrypto -lelf
 
 ifdef WITH_DEBUG
 # use ASAN_OPTIONS=protect_shadow_gap=0 LSAN_OPTIONS=fast_unwind_on_malloc=0 when running
-CC_FLAGS += -g -ggdb #-fsanitize=address -fsanitize=pointer-compare -fsanitize=pointer-subtract -fsanitize-address-use-after-scope
+CC_FLAGS += -g -ggdb #-static-libasan -fsanitize=address -fsanitize=pointer-compare -fsanitize=pointer-subtract -fsanitize-address-use-after-scope
 endif
 
 ifdef WITH_IB
@@ -90,48 +99,54 @@ endif
 ifdef LOG
 CC_FLAGS += -DLOG_LEVEL=LOG_$(LOG)
 endif
+
+ifdef LOGN
+CC_FLAGS += -DLOG_LEVEL=$(LOGN)
+endif
+
 ifdef WITH_IB
 CC_FLAGS += -DWITH_IB=$(WITH_IB)
 endif
 
-SERVER_LD_FLAGS = $(LD_FLAGS) -lcudart -lcusolver -lcuda -lcublas -lbfd -lrt -lpthread
+SERVER_LD_FLAGS = $(LD_FLAGS) -lcudart -lcusolver -lcuda -lcublas -lrt -lpthread -lnvidia-ml -lcudnn
 SERVER_BIN_LD_FLAGS = $(SERVER_LD_FLAGS) -Wl,--unresolved-symbols=ignore-in-object-files
-CLIENT_LD_FLAGS = $(LD_FLAGS) -lbfd
+CLIENT_LD_FLAGS = $(LD_FLAGS)
 
 # Targets
 .PHONY: all clean
-all : $(SERVER) $(SERVER_BIN) $(CLIENT)
+all : $(SERVER) $(CLIENT)
 
 $(CLIENT) : $(OBJ_CLIENT)
 	$(LD) $(CC_FLAGS) -shared -o $@ $^ $(CLIENT_LD_FLAGS)
 
-$(SERVER) : $(OBJ_SERVER) $(SRC_SERVER_LIB:%.c=%.o)
-	$(LD) $(CC_FLAGS) -shared -o $@ $^ $(SERVER_LD_FLAGS)
+$(SERVER_LIB) : $(OBJ_SERVER) $(SRC_SERVER_EXE:%.c=%.o)
+	$(LD) $(CC_FLAGS) -shared -o $@ $^ $(SERVER_BIN_LD_FLAGS)
 
-$(SERVER_BIN) : $(OBJ_SERVER) $(SRC_SERVER_EXE:%.c=%.o)
+$(SERVER) : $(OBJ_SERVER) $(SRC_SERVER_EXE:%.c=%.o)
 	$(LD) $(CC_FLAGS) -o $@ $^ $(SERVER_BIN_LD_FLAGS)
 
 $(RPC_H) : $(RPC_DEF)
-	$(RPCGEN) $(RPCGEN_FLAGS) -h -o $@ $<
+	rm -f $@ && $(RPCGEN) $(RPCGEN_FLAGS) -h -o $@ $<
 
 $(RPC_CLIENT) : $(RPC_DEF)
-	$(RPCGEN) $(RPCGEN_FLAGS) -l -o $@ $<
+	rm -f $@ && $(RPCGEN) $(RPCGEN_FLAGS) -l -o $@ $<
 
 $(RPC_SERVER) : $(RPC_DEF)
-	$(RPCGEN) $(RPCGEN_FLAGS) -m -o $@ $<
+	rm -f $@ && $(RPCGEN) $(RPCGEN_FLAGS) -m -o $@ $<
 
 $(RPC_SERVER_MOD) : $(RPC_SERVER)
 	./generate_dispatch.sh
 
 $(RPC_XDR) : $(RPC_DEF)
-	$(RPCGEN) $(RPCGEN_FLAGS) -c -o $@ $<
+	rm -f $@ && $(RPCGEN) $(RPCGEN_FLAGS) -c -o $@ $<
 
 %.o : %.c $(RPC_H)
 	$(CC) $(CC_FLAGS) -c -fpic -o $@ $< $(LD_FLAGS)
 
 clean:
-	rm -f $(RPC_H) $(RPC_CLIENT) $(RPC_SERVER) $(RPC_SERVER_BIN) $(RPC_SERVER_MOD) $(RPC_XDR) $(OBJ_CLIENT) $(OBJ_SERVER) $(SERVER) $(CLIENT)
+	rm -f $(RPC_H) $(RPC_CLIENT) $(RPC_SERVER) $(RPC_SERVER_MOD) $(RPC_XDR) $(OBJ_CLIENT) $(OBJ_SERVER) $(SERVER) $(SERVER_LIB) $(CLIENT) $(SRC_SERVER_EXE:%.c=%.o)
+
diff --git a/cpu/api-recorder.c b/cpu/api-recorder.c
index f02cdc37..e67204d2 100644
--- a/cpu/api-recorder.c
+++ b/cpu/api-recorder.c
@@ -4,11 +4,13 @@
 
 #include "api-recorder.h"
 #include "log.h"
+#include "list.h"
 
 list api_records;
 
-void api_records_free_args(void)
+
+static void api_records_free_args(void)
 {
     api_record_t *record;
     for (size_t i = 0; i < api_records.length; i++) {
@@ -22,6 +24,27 @@ void api_records_free_args(void)
 }
 
 
+static void api_records_free_data(void)
+{
+    api_record_t *record;
+    for (size_t i = 0; i < api_records.length; i++) {
+        if (list_at(&api_records, i, (void**)&record) != 0) {
+            LOGE(LOG_ERROR, "list_at %zu returned an error.", i);
+            continue;
+        }
+        free(record->data);
+        record->data = NULL;
+    }
+}
+
+
+void api_records_free(void)
+{
+    api_records_free_args();
+    api_records_free_data();
+    list_free(&api_records);
+}
+
 size_t api_records_malloc_get_size(void *ptr)
 {
     api_record_t *record;
diff --git a/cpu/api-recorder.h b/cpu/api-recorder.h
index 856a3121..37c5e569 100644
--- a/cpu/api-recorder.h
+++ b/cpu/api-recorder.h
@@ -35,6 +35,8 @@
     *arguments = ARG
 #define RECORD_ARG(NUM, ARG) \
     arguments->arg##NUM = ARG
+#define RECORD_NARG(ARG) \
+    arguments->ARG = ARG
 #define RECORD_DATA(SIZE, PTR) \
     record->data_size = SIZE; \
     record->data = malloc(SIZE); \
@@ -58,6 +60,7 @@ typedef struct api_record {
         void* ptr;
         int integer;
         ptr_result ptr_result_u;
+        sz_result sz_result_u;
     } result;
     void *data;
     size_t data_size;
@@ -65,7 +68,7 @@ typedef struct api_record {
 
 extern list api_records;
 
-void api_records_free_args(void);
+void api_records_free(void);
 void api_records_print(void);
 void api_records_print_records(api_record_t *record);
diff --git a/cpu/cpu-client-cublas.c b/cpu/cpu-client-cublas.c
new file mode 100644
index 00000000..f9fbc159
--- /dev/null
+++ b/cpu/cpu-client-cublas.c
@@ -0,0 +1,763 @@
+
+#define _GNU_SOURCE
+#include
+#include
+#include
+#include
+
+//for strerror
+#include
+#include
+
+#include "cpu-libwrap.h"
+#include "cpu_rpc_prot.h"
+#include "cpu-common.h"
+#include "cpu-utils.h"
+#include "log.h"
+
+#ifdef WITH_API_CNT
+extern int api_call_cnt;
+#endif //WITH_API_CNT
+
+cublasStatus_t cublasCreate_v2(cublasHandle_t* handle)
+{
+#ifdef WITH_API_CNT
+    api_call_cnt++;
+#endif //WITH_API_CNT
+    ptr_result result;
+    enum clnt_stat retval_1;
+    retval_1 = rpc_cublascreate_1(&result, clnt);
+    if (retval_1 != RPC_SUCCESS) {
+        clnt_perror (clnt, "call failed");
+    }
+    if (result.err == 0) {
+        *handle = (void*)result.ptr_result_u.ptr;
+    }
+    return result.err;
+}
+
+cublasStatus_t cublasDestroy_v2(cublasHandle_t handle)
+{
+#ifdef WITH_API_CNT
+    api_call_cnt++;
+#endif //WITH_API_CNT
+    int result;
+    enum clnt_stat retval_1;
+    retval_1 = rpc_cublasdestroy_1((ptr)handle, &result, clnt);
+    if (retval_1 != RPC_SUCCESS) {
+        clnt_perror (clnt, "call failed");
+    }
+    return result;
+}
+
+DEF_FN(cublasStatus_t, cublasGetVersion_v2, cublasHandle_t, handle, int*, version);
+DEF_FN(cublasStatus_t, cublasGetProperty, libraryPropertyType, type, int*, value);
+DEF_FN(size_t, cublasGetCudartVersion);
+cublasStatus_t cublasSetWorkspace_v2(cublasHandle_t handle, void* workspace, size_t workspaceSizeInBytes)
+{
+#ifdef WITH_API_CNT
+    api_call_cnt++;
+#endif //WITH_API_CNT
+    int result;
+    enum clnt_stat retval_1;
+    retval_1 = rpc_cublassetworkspace_1(
+        (ptr)handle,
+        (ptr)workspace,
+        workspaceSizeInBytes,
+        &result, clnt);
+    if (retval_1 != RPC_SUCCESS) {
+        clnt_perror (clnt, "call failed");
+    }
+    return result;
+}
+
+cublasStatus_t cublasSetStream_v2(cublasHandle_t handle, cudaStream_t streamId)
+{
+#ifdef WITH_API_CNT
+    api_call_cnt++;
+#endif //WITH_API_CNT
+    int result;
+    enum clnt_stat retval_1;
+    retval_1 = rpc_cublassetstream_1(
+        (ptr)handle,
+        (ptr)streamId,
+        &result, clnt);
+    if (retval_1 != RPC_SUCCESS) {
+        clnt_perror (clnt, "call failed");
+    }
+    return result;
+}
+
+DEF_FN(cublasStatus_t, cublasGetStream_v2, cublasHandle_t, handle, cudaStream_t*, streamId);
+DEF_FN(cublasStatus_t, cublasGetPointerMode_v2, cublasHandle_t, handle, cublasPointerMode_t*, mode);
+DEF_FN(cublasStatus_t, cublasSetPointerMode_v2, cublasHandle_t, handle, cublasPointerMode_t, mode);
+DEF_FN(cublasStatus_t, cublasGetAtomicsMode, cublasHandle_t, handle, cublasAtomicsMode_t*, mode);
+DEF_FN(cublasStatus_t, cublasSetAtomicsMode, cublasHandle_t, handle, cublasAtomicsMode_t, mode);
+DEF_FN(cublasStatus_t, cublasGetMathMode, cublasHandle_t, handle, cublasMath_t*, mode);
+cublasStatus_t cublasSetMathMode(cublasHandle_t handle, cublasMath_t mode)
+{
+#ifdef WITH_API_CNT
+ api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + retval_1 = rpc_cublassetmathmode_1( + (ptr)handle, + (int)mode, + &result, clnt); + if (retval_1 != RPC_SUCCESS) { + clnt_perror (clnt, "call failed"); + } + return result; +} + +DEF_FN(cublasStatus_t, cublasGetSmCountTarget, cublasHandle_t, handle, int*, smCountTarget); +DEF_FN(cublasStatus_t, cublasSetSmCountTarget, cublasHandle_t, handle, int, smCountTarget); +DEF_FN(const char*, cublasGetStatusName, cublasStatus_t, status); +DEF_FN(const char*, cublasGetStatusString, cublasStatus_t, status); +DEF_FN(cublasStatus_t, cublasLoggerConfigure, int, logIsOn, int, logToStdOut, int, logToStdErr, const char*, logFileName); +DEF_FN(cublasStatus_t, cublasSetLoggerCallback, cublasLogCallback, userCallback); +DEF_FN(cublasStatus_t, cublasGetLoggerCallback, cublasLogCallback*, userCallback); +DEF_FN(cublasStatus_t, cublasSetVector, int, n, int, elemSize, const void*, x, int, incx, void*, devicePtr, int, incy); +DEF_FN(cublasStatus_t, cublasSetVector_64, int64_t, n, int64_t, elemSize, const void*, x, int64_t, incx, void*, devicePtr, int64_t, incy); +DEF_FN(cublasStatus_t, cublasGetVector, int, n, int, elemSize, const void*, x, int, incx, void*, y, int, incy); +DEF_FN(cublasStatus_t, cublasGetVector_64, int64_t, n, int64_t, elemSize, const void*, x, int64_t, incx, void*, y, int64_t, incy); +DEF_FN(cublasStatus_t, cublasSetMatrix, int, rows, int, cols, int, elemSize, const void*, A, int, lda, void*, B, int, ldb); +DEF_FN(cublasStatus_t, cublasSetMatrix_64, int64_t, rows, int64_t, cols, int64_t, elemSize, const void*, A, int64_t, lda, void*, B, int64_t, ldb); +DEF_FN(cublasStatus_t, cublasGetMatrix, int, rows, int, cols, int, elemSize, const void*, A, int, lda, void*, B, int, ldb); +DEF_FN(cublasStatus_t, cublasGetMatrix_64, int64_t, rows, int64_t, cols, int64_t, elemSize, const void*, A, int64_t, lda, void*, B, int64_t, ldb); +DEF_FN(cublasStatus_t, cublasSetVectorAsync, , int, n, int, elemSize, const void*, hostPtr, int, incx, void*, devicePtr, int, incy, cudaStream_t, stream); +DEF_FN(cublasStatus_t, cublasSetVectorAsync_64, , int64_t, n, int64_t, elemSize, const void*, hostPtr, int64_t, incx, void*, devicePtr, int64_t, incy, cudaStream_t, stream); +DEF_FN(cublasStatus_t, cublasGetVectorAsync, , int, n, int, elemSize, const void*, devicePtr, int, incx, void*, hostPtr, int, incy, cudaStream_t, stream); +DEF_FN(cublasStatus_t, cublasGetVectorAsync_64, , int64_t, n, int64_t, elemSize, const void*, devicePtr, int64_t, incx, void*, hostPtr, int64_t, incy, cudaStream_t, stream); +DEF_FN(cublasStatus_t, cublasSetMatrixAsync, int, rows, int, cols, int, elemSize, const void*, A, int, lda, void*, B, int, ldb, cudaStream_t, stream); +DEF_FN(cublasStatus_t, cublasSetMatrixAsync_64, int64_t, rows, int64_t, cols, int64_t, elemSize, const void*, A, int64_t, lda, void*, B, int64_t, ldb, cudaStream_t, stream); +DEF_FN(cublasStatus_t, cublasGetMatrixAsync, int, rows, int, cols, int, elemSize, const void*, A, int, lda, void*, B, int, ldb, cudaStream_t, stream); +DEF_FN(cublasStatus_t, cublasGetMatrixAsync_64, int64_t, rows, int64_t, cols, int64_t, elemSize, const void*, A, int64_t, lda, void*, B, int64_t, ldb, cudaStream_t, stream); +void cublasXerbla(const char* srName, int info) { + void (*fun)(const char*, int); + char* error_str; *(void **)(&fun) = dlsym(libwrap_get_sohandle(), "cublasXerbla"); + if ((error_str = dlerror()) != ((void *)0)) { + if (0 > get_log_data()->curr_level) ; + else + loggfe(0, 88, 
"/home/eiling/projects/cricket/cpu/cpu-client-cublas.c", "[libwrap] %s", error_str); + } + if (3 > get_log_data()->curr_level) ; + else + loggf(3, "%s called", "cublasXerbla"); + (*fun)(srName, info); + if (3 > get_log_data()->curr_level) ; + else loggf(3, "%s finished", "cublasXerbla"); +} +DEF_FN(cublasStatus_t, cublasNrm2Ex, cublasHandle_t, handle, int, n, const void*, x, cudaDataType, xType, int, incx, void*, result, cudaDataType, resultType, cudaDataType, executionType); +DEF_FN(cublasStatus_t, cublasNrm2Ex_64, cublasHandle_t, handle, int64_t, n, const void*, x, cudaDataType, xType, int64_t, incx, void*, result, cudaDataType, resultType, cudaDataType, executionType); +DEF_FN(cublasStatus_t, cublasSnrm2_v2, cublasHandle_t, handle, int, n, const float*, x, int, incx, float*, result); +DEF_FN(cublasStatus_t, cublasSnrm2_v2_64, cublasHandle_t, handle, int64_t, n, const float*, x, int64_t, incx, float*, result); +DEF_FN(cublasStatus_t, cublasDnrm2_v2, cublasHandle_t, handle, int, n, const double*, x, int, incx, double*, result); +DEF_FN(cublasStatus_t, cublasDnrm2_v2_64, cublasHandle_t, handle, int64_t, n, const double*, x, int64_t, incx, double*, result); +DEF_FN(cublasStatus_t, cublasScnrm2_v2, cublasHandle_t, handle, int, n, const cuComplex*, x, int, incx, float*, result); +DEF_FN(cublasStatus_t, cublasScnrm2_v2_64, cublasHandle_t, handle, int64_t, n, const cuComplex*, x, int64_t, incx, float*, result); +DEF_FN(cublasStatus_t, cublasDznrm2_v2, cublasHandle_t, handle, int, n, const cuDoubleComplex*, x, int, incx, double*, result); +DEF_FN(cublasStatus_t, cublasDznrm2_v2_64, cublasHandle_t, handle, int64_t, n, const cuDoubleComplex*, x, int64_t, incx, double*, result); +DEF_FN(cublasStatus_t, cublasDotEx, cublasHandle_t, handle, int, n, const void*, x, cudaDataType, xType, int, incx, const void*, y, cudaDataType, yType, int, incy, void*, result, cudaDataType, resultType, cudaDataType, executionType); +DEF_FN(cublasStatus_t, cublasDotEx_64, cublasHandle_t, handle, int64_t, n, const void*, x, cudaDataType, xType, int64_t, incx, const void*, y, cudaDataType, yType, int64_t, incy, void*, result, cudaDataType, resultType, cudaDataType, executionType); +DEF_FN(cublasStatus_t, cublasDotcEx, cublasHandle_t, handle, int, n, const void*, x, cudaDataType, xType, int, incx, const void*, y, cudaDataType, yType, int, incy, void*, result, cudaDataType, resultType, cudaDataType, executionType); +DEF_FN(cublasStatus_t, cublasDotcEx_64, cublasHandle_t, handle, int64_t, n, const void*, x, cudaDataType, xType, int64_t, incx, const void*, y, cudaDataType, yType, int64_t, incy, void*, result, cudaDataType, resultType, cudaDataType, executionType); +DEF_FN(cublasStatus_t, cublasSdot_v2, cublasHandle_t, handle, int, n, const float*, x, int, incx, const float*, y, int, incy, float*, result); +DEF_FN(cublasStatus_t, cublasSdot_v2_64, , cublasHandle_t, handle, int64_t, n, const float*, x, int64_t, incx, const float*, y, int64_t, incy, float*, result); +DEF_FN(cublasStatus_t, cublasDdot_v2, cublasHandle_t, handle, int, n, const double*, x, int, incx, const double*, y, int, incy, double*, result); +DEF_FN(cublasStatus_t, cublasDdot_v2_64, , cublasHandle_t, handle, int64_t, n, const double*, x, int64_t, incx, const double*, y, int64_t, incy, double*, result); +DEF_FN(cublasStatus_t, cublasCdotu_v2, , cublasHandle_t, handle, int, n, const cuComplex*, x, int, incx, const cuComplex*, y, int, incy, cuComplex*, result); +DEF_FN(cublasStatus_t, cublasCdotu_v2_64, cublasHandle_t, handle, int64_t, n, const cuComplex*, x, 
int64_t, incx, const cuComplex*, y, int64_t, incy, cuComplex*, result); +DEF_FN(cublasStatus_t, cublasCdotc_v2, , cublasHandle_t, handle, int, n, const cuComplex*, x, int, incx, const cuComplex*, y, int, incy, cuComplex*, result); +DEF_FN(cublasStatus_t, cublasCdotc_v2_64, cublasHandle_t, handle, int64_t, n, const cuComplex*, x, int64_t, incx, const cuComplex*, y, int64_t, incy, cuComplex*, result); +DEF_FN(cublasStatus_t, cublasZdotu_v2, cublasHandle_t, handle, int, n, const cuDoubleComplex*, x, int, incx, const cuDoubleComplex*, y, int, incy, cuDoubleComplex*, result); +DEF_FN(cublasStatus_t, cublasZdotu_v2_64, cublasHandle_t, handle, int64_t, n, const cuDoubleComplex*, x, int64_t, incx, const cuDoubleComplex*, y, int64_t, incy, cuDoubleComplex*, result); +DEF_FN(cublasStatus_t, cublasZdotc_v2, cublasHandle_t, handle, int, n, const cuDoubleComplex*, x, int, incx, const cuDoubleComplex*, y, int, incy, cuDoubleComplex*, result); +DEF_FN(cublasStatus_t, cublasZdotc_v2_64, cublasHandle_t, handle, int64_t, n, const cuDoubleComplex*, x, int64_t, incx, const cuDoubleComplex*, y, int64_t, incy, cuDoubleComplex*, result); +DEF_FN(cublasStatus_t, cublasScalEx, cublasHandle_t, handle, int, n, const void*, alpha, cudaDataType, alphaType, void*, x, cudaDataType, xType, int, incx, cudaDataType, executionType); +DEF_FN(cublasStatus_t, cublasScalEx_64, cublasHandle_t, handle, int64_t, n, const void*, alpha, cudaDataType, alphaType, void*, x, cudaDataType, xType, int64_t, incx, cudaDataType, executionType); +DEF_FN(cublasStatus_t, cublasSscal_v2, cublasHandle_t, handle, int, n, const float*, alpha, float*, x, int, incx); +DEF_FN(cublasStatus_t, cublasSscal_v2_64, cublasHandle_t, handle, int64_t, n, const float*, alpha, float*, x, int64_t, incx); +DEF_FN(cublasStatus_t, cublasDscal_v2, cublasHandle_t, handle, int, n, const double*, alpha, double*, x, int, incx); +DEF_FN(cublasStatus_t, cublasDscal_v2_64, cublasHandle_t, handle, int64_t, n, const double*, alpha, double*, x, int64_t, incx); +DEF_FN(cublasStatus_t, cublasCscal_v2, cublasHandle_t, handle, int, n, const cuComplex*, alpha, cuComplex*, x, int, incx); +DEF_FN(cublasStatus_t, cublasCscal_v2_64, cublasHandle_t, handle, int64_t, n, const cuComplex*, alpha, cuComplex*, x, int64_t, incx); +DEF_FN(cublasStatus_t, cublasCsscal_v2, cublasHandle_t, handle, int, n, const float*, alpha, cuComplex*, x, int, incx); +DEF_FN(cublasStatus_t, cublasCsscal_v2_64, cublasHandle_t, handle, int64_t, n, const float*, alpha, cuComplex*, x, int64_t, incx); +DEF_FN(cublasStatus_t, cublasZscal_v2, cublasHandle_t, handle, int, n, const cuDoubleComplex*, alpha, cuDoubleComplex*, x, int, incx); +DEF_FN(cublasStatus_t, cublasZscal_v2_64, cublasHandle_t, handle, int64_t, n, const cuDoubleComplex*, alpha, cuDoubleComplex*, x, int64_t, incx); +DEF_FN(cublasStatus_t, cublasZdscal_v2, cublasHandle_t, handle, int, n, const double*, alpha, cuDoubleComplex*, x, int, incx); +DEF_FN(cublasStatus_t, cublasZdscal_v2_64, cublasHandle_t, handle, int64_t, n, const double*, alpha, cuDoubleComplex*, x, int64_t, incx); +DEF_FN(cublasStatus_t, cublasAxpyEx, cublasHandle_t, handle, int, n, const void*, alpha, cudaDataType, alphaType, const void*, x, cudaDataType, xType, int, incx, void*, y, cudaDataType, yType, int, incy, cudaDataType, executiontype); +DEF_FN(cublasStatus_t, cublasAxpyEx_64, cublasHandle_t, handle, int64_t, n, const void*, alpha, cudaDataType, alphaType, const void*, x, cudaDataType, xType, int64_t, incx, void*, y, cudaDataType, yType, int64_t, incy, cudaDataType, executiontype); 
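The DEF_FN declarations in this file are local passthrough stubs: as the expanded cublasXerbla wrapper above shows, they look up the real symbol in the original libcublas via dlsym and call it in-process. Promoting one of them to a remoted call follows the same shape as the hand-written wrappers in this file (scalars by value, device pointers as opaque ptr handles, status code returned by the server). A minimal sketch for cublasSaxpy_v2 follows; the RPC stub rpc_cublassaxpy_1 is a hypothetical name and would need a matching entry in the protocol definition, which this patch does not add.

// Sketch only -- not part of the patch. rpc_cublassaxpy_1 is hypothetical,
// mirroring rpc_cublassgemv_1 used by cublasSgemv_v2 further down.
cublasStatus_t cublasSaxpy_v2(cublasHandle_t handle, int n, const float* alpha,
                              const float* x, int incx, float* y, int incy)
{
#ifdef WITH_API_CNT
    api_call_cnt++;
#endif //WITH_API_CNT
    int result;
    enum clnt_stat retval_1;
    // Scalars travel by value (assumes CUBLAS_POINTER_MODE_HOST, like the
    // existing gemv wrappers); device pointers go as opaque 'ptr' handles.
    retval_1 = rpc_cublassaxpy_1(
        (ptr)handle,
        n,
        *alpha,
        (ptr)x, incx,
        (ptr)y, incy,
        &result, clnt);
    if (retval_1 != RPC_SUCCESS) {
        clnt_perror(clnt, "call failed");
    }
    return result;
}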
+DEF_FN(cublasStatus_t, cublasSaxpy_v2, cublasHandle_t, handle, int, n, const float*, alpha, const float*, x, int, incx, float*, y, int, incy); +DEF_FN(cublasStatus_t, cublasSaxpy_v2_64, , cublasHandle_t, handle, int64_t, n, const float*, alpha, const float*, x, int64_t, incx, float*, y, int64_t, incy); +DEF_FN(cublasStatus_t, cublasDaxpy_v2, cublasHandle_t, handle, int, n, const double*, alpha, const double*, x, int, incx, double*, y, int, incy); +DEF_FN(cublasStatus_t, cublasDaxpy_v2_64, , cublasHandle_t, handle, int64_t, n, const double*, alpha, const double*, x, int64_t, incx, double*, y, int64_t, incy); +DEF_FN(cublasStatus_t, cublasCaxpy_v2, , cublasHandle_t, handle, int, n, const cuComplex*, alpha, const cuComplex*, x, int, incx, cuComplex*, y, int, incy); +DEF_FN(cublasStatus_t, cublasCaxpy_v2_64, cublasHandle_t, handle, int64_t, n, const cuComplex*, alpha, const cuComplex*, x, int64_t, incx, cuComplex*, y, int64_t, incy); +DEF_FN(cublasStatus_t, cublasZaxpy_v2, cublasHandle_t, handle, int, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, x, int, incx, cuDoubleComplex*, y, int, incy); +DEF_FN(cublasStatus_t, cublasZaxpy_v2_64, cublasHandle_t, handle, int64_t, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, x, int64_t, incx, cuDoubleComplex*, y, int64_t, incy); +DEF_FN(cublasStatus_t, cublasCopyEx, , cublasHandle_t, handle, int, n, const void*, x, cudaDataType, xType, int, incx, void*, y, cudaDataType, yType, int, incy); +DEF_FN(cublasStatus_t, cublasCopyEx_64, cublasHandle_t, handle, int64_t, n, const void*, x, cudaDataType, xType, int64_t, incx, void*, y, cudaDataType, yType, int64_t, incy); +DEF_FN(cublasStatus_t, cublasScopy_v2, cublasHandle_t, handle, int, n, const float*, x, int, incx, float*, y, int, incy); +DEF_FN(cublasStatus_t, cublasScopy_v2_64, cublasHandle_t, handle, int64_t, n, const float*, x, int64_t, incx, float*, y, int64_t, incy); +DEF_FN(cublasStatus_t, cublasDcopy_v2, cublasHandle_t, handle, int, n, const double*, x, int, incx, double*, y, int, incy); +DEF_FN(cublasStatus_t, cublasDcopy_v2_64, cublasHandle_t, handle, int64_t, n, const double*, x, int64_t, incx, double*, y, int64_t, incy); +DEF_FN(cublasStatus_t, cublasCcopy_v2, cublasHandle_t, handle, int, n, const cuComplex*, x, int, incx, cuComplex*, y, int, incy); +DEF_FN(cublasStatus_t, cublasCcopy_v2_64, cublasHandle_t, handle, int64_t, n, const cuComplex*, x, int64_t, incx, cuComplex*, y, int64_t, incy); +DEF_FN(cublasStatus_t, cublasZcopy_v2, cublasHandle_t, handle, int, n, const cuDoubleComplex*, x, int, incx, cuDoubleComplex*, y, int, incy); +DEF_FN(cublasStatus_t, cublasZcopy_v2_64, , cublasHandle_t, handle, int64_t, n, const cuDoubleComplex*, x, int64_t, incx, cuDoubleComplex*, y, int64_t, incy); +DEF_FN(cublasStatus_t, cublasSswap_v2, cublasHandle_t, handle, int, n, float*, x, int, incx, float*, y, int, incy); +DEF_FN(cublasStatus_t, cublasSswap_v2_64, cublasHandle_t, handle, int64_t, n, float*, x, int64_t, incx, float*, y, int64_t, incy); +DEF_FN(cublasStatus_t, cublasDswap_v2, cublasHandle_t, handle, int, n, double*, x, int, incx, double*, y, int, incy); +DEF_FN(cublasStatus_t, cublasDswap_v2_64, cublasHandle_t, handle, int64_t, n, double*, x, int64_t, incx, double*, y, int64_t, incy); +DEF_FN(cublasStatus_t, cublasCswap_v2, cublasHandle_t, handle, int, n, cuComplex*, x, int, incx, cuComplex*, y, int, incy); +DEF_FN(cublasStatus_t, cublasCswap_v2_64, cublasHandle_t, handle, int64_t, n, cuComplex*, x, int64_t, incx, cuComplex*, y, int64_t, incy); +DEF_FN(cublasStatus_t, 
cublasZswap_v2, cublasHandle_t, handle, int, n, cuDoubleComplex*, x, int, incx, cuDoubleComplex*, y, int, incy); +DEF_FN(cublasStatus_t, cublasZswap_v2_64, cublasHandle_t, handle, int64_t, n, cuDoubleComplex*, x, int64_t, incx, cuDoubleComplex*, y, int64_t, incy); +DEF_FN(cublasStatus_t, cublasSwapEx, , cublasHandle_t, handle, int, n, void*, x, cudaDataType, xType, int, incx, void*, y, cudaDataType, yType, int, incy); +DEF_FN(cublasStatus_t, cublasSwapEx_64, cublasHandle_t, handle, int64_t, n, void*, x, cudaDataType, xType, int64_t, incx, void*, y, cudaDataType, yType, int64_t, incy); +DEF_FN(cublasStatus_t, cublasIsamax_v2, cublasHandle_t, handle, int, n, const float*, x, int, incx, int*, result); +DEF_FN(cublasStatus_t, cublasIsamax_v2_64, cublasHandle_t, handle, int64_t, n, const float*, x, int64_t, incx, int64_t*, result); +DEF_FN(cublasStatus_t, cublasIdamax_v2, cublasHandle_t, handle, int, n, const double*, x, int, incx, int*, result); +DEF_FN(cublasStatus_t, cublasIdamax_v2_64, cublasHandle_t, handle, int64_t, n, const double*, x, int64_t, incx, int64_t*, result); +DEF_FN(cublasStatus_t, cublasIcamax_v2, cublasHandle_t, handle, int, n, const cuComplex*, x, int, incx, int*, result); +DEF_FN(cublasStatus_t, cublasIcamax_v2_64, cublasHandle_t, handle, int64_t, n, const cuComplex*, x, int64_t, incx, int64_t*, result); +DEF_FN(cublasStatus_t, cublasIzamax_v2, cublasHandle_t, handle, int, n, const cuDoubleComplex*, x, int, incx, int*, result); +DEF_FN(cublasStatus_t, cublasIzamax_v2_64, cublasHandle_t, handle, int64_t, n, const cuDoubleComplex*, x, int64_t, incx, int64_t*, result); +DEF_FN(cublasStatus_t, cublasIamaxEx, cublasHandle_t, handle, int, n, const void*, x, cudaDataType, xType, int, incx, int*, result); +DEF_FN(cublasStatus_t, cublasIamaxEx_64, cublasHandle_t, handle, int64_t, n, const void*, x, cudaDataType, xType, int64_t, incx, int64_t*, result); +DEF_FN(cublasStatus_t, cublasIsamin_v2, cublasHandle_t, handle, int, n, const float*, x, int, incx, int*, result); +DEF_FN(cublasStatus_t, cublasIsamin_v2_64, cublasHandle_t, handle, int64_t, n, const float*, x, int64_t, incx, int64_t*, result); +DEF_FN(cublasStatus_t, cublasIdamin_v2, cublasHandle_t, handle, int, n, const double*, x, int, incx, int*, result); +DEF_FN(cublasStatus_t, cublasIdamin_v2_64, cublasHandle_t, handle, int64_t, n, const double*, x, int64_t, incx, int64_t*, result); +DEF_FN(cublasStatus_t, cublasIcamin_v2, cublasHandle_t, handle, int, n, const cuComplex*, x, int, incx, int*, result); +DEF_FN(cublasStatus_t, cublasIcamin_v2_64, cublasHandle_t, handle, int64_t, n, const cuComplex*, x, int64_t, incx, int64_t*, result); +DEF_FN(cublasStatus_t, cublasIzamin_v2, cublasHandle_t, handle, int, n, const cuDoubleComplex*, x, int, incx, int*, result); +DEF_FN(cublasStatus_t, cublasIzamin_v2_64, cublasHandle_t, handle, int64_t, n, const cuDoubleComplex*, x, int64_t, incx, int64_t*, result); +DEF_FN(cublasStatus_t, cublasIaminEx, cublasHandle_t, handle, int, n, const void*, x, cudaDataType, xType, int, incx, int*, result); +DEF_FN(cublasStatus_t, cublasIaminEx_64, cublasHandle_t, handle, int64_t, n, const void*, x, cudaDataType, xType, int64_t, incx, int64_t*, result); +DEF_FN(cublasStatus_t, cublasAsumEx, cublasHandle_t, handle, int, n, const void*, x, cudaDataType, xType, int, incx, void*, result, cudaDataType, resultType, cudaDataType, executiontype); +DEF_FN(cublasStatus_t, cublasAsumEx_64, cublasHandle_t, handle, int64_t, n, const void*, x, cudaDataType, xType, int64_t, incx, void*, result, cudaDataType, resultType, 
cudaDataType, executiontype); +DEF_FN(cublasStatus_t, cublasSasum_v2, cublasHandle_t, handle, int, n, const float*, x, int, incx, float*, result); +DEF_FN(cublasStatus_t, cublasSasum_v2_64, cublasHandle_t, handle, int64_t, n, const float*, x, int64_t, incx, float*, result); +DEF_FN(cublasStatus_t, cublasDasum_v2, cublasHandle_t, handle, int, n, const double*, x, int, incx, double*, result); +DEF_FN(cublasStatus_t, cublasDasum_v2_64, cublasHandle_t, handle, int64_t, n, const double*, x, int64_t, incx, double*, result); +DEF_FN(cublasStatus_t, cublasScasum_v2, cublasHandle_t, handle, int, n, const cuComplex*, x, int, incx, float*, result); +DEF_FN(cublasStatus_t, cublasScasum_v2_64, cublasHandle_t, handle, int64_t, n, const cuComplex*, x, int64_t, incx, float*, result); +DEF_FN(cublasStatus_t, cublasDzasum_v2, cublasHandle_t, handle, int, n, const cuDoubleComplex*, x, int, incx, double*, result); +DEF_FN(cublasStatus_t, cublasDzasum_v2_64, cublasHandle_t, handle, int64_t, n, const cuDoubleComplex*, x, int64_t, incx, double*, result); +DEF_FN(cublasStatus_t, cublasSrot_v2, cublasHandle_t, handle, int, n, float*, x, int, incx, float*, y, int, incy, const float*, c, const float*, s); +DEF_FN(cublasStatus_t, cublasSrot_v2_64, , cublasHandle_t, handle, int64_t, n, float*, x, int64_t, incx, float*, y, int64_t, incy, const float*, c, const float*, s); +DEF_FN(cublasStatus_t, cublasDrot_v2, cublasHandle_t, handle, int, n, double*, x, int, incx, double*, y, int, incy, const double*, c, const double*, s); +DEF_FN(cublasStatus_t, cublasDrot_v2_64, cublasHandle_t, handle, int64_t, n, double*, x, int64_t, incx, double*, y, int64_t, incy, const double*, c, const double*, s); +DEF_FN(cublasStatus_t, cublasCrot_v2, , cublasHandle_t, handle, int, n, cuComplex*, x, int, incx, cuComplex*, y, int, incy, const float*, c, const cuComplex*, s); +DEF_FN(cublasStatus_t, cublasCrot_v2_64, cublasHandle_t, handle, int64_t, n, cuComplex*, x, int64_t, incx, cuComplex*, y, int64_t, incy, const float*, c, const cuComplex*, s); +DEF_FN(cublasStatus_t, cublasCsrot_v2, , cublasHandle_t, handle, int, n, cuComplex*, x, int, incx, cuComplex*, y, int, incy, const float*, c, const float*, s); +DEF_FN(cublasStatus_t, cublasCsrot_v2_64, cublasHandle_t, handle, int64_t, n, cuComplex*, x, int64_t, incx, cuComplex*, y, int64_t, incy, const float*, c, const float*, s); +DEF_FN(cublasStatus_t, cublasZrot_v2, cublasHandle_t, handle, int, n, cuDoubleComplex*, x, int, incx, cuDoubleComplex*, y, int, incy, const double*, c, const cuDoubleComplex*, s); +DEF_FN(cublasStatus_t, cublasZrot_v2_64, cublasHandle_t, handle, int64_t, n, cuDoubleComplex*, x, int64_t, incx, cuDoubleComplex*, y, int64_t, incy, const double*, c, const cuDoubleComplex*, s); +DEF_FN(cublasStatus_t, cublasZdrot_v2, cublasHandle_t, handle, int, n, cuDoubleComplex*, x, int, incx, cuDoubleComplex*, y, int, incy, const double*, c, const double*, s); +DEF_FN(cublasStatus_t, cublasZdrot_v2_64, cublasHandle_t, handle, int64_t, n, cuDoubleComplex*, x, int64_t, incx, cuDoubleComplex*, y, int64_t, incy, const double*, c, const double*, s); +DEF_FN(cublasStatus_t, cublasRotEx, cublasHandle_t, handle, int, n, void*, x, cudaDataType, xType, int, incx, void*, y, cudaDataType, yType, int, incy, const void*, c, const void*, s, cudaDataType, csType, cudaDataType, executiontype); +DEF_FN(cublasStatus_t, cublasRotEx_64, cublasHandle_t, handle, int64_t, n, void*, x, cudaDataType, xType, int64_t, incx, void*, y, cudaDataType, yType, int64_t, incy, const void*, c, const void*, s, cudaDataType, 
csType, cudaDataType, executiontype); +DEF_FN(cublasStatus_t, cublasSrotg_v2, cublasHandle_t, handle, float*, a, float*, b, float*, c, float*, s); +DEF_FN(cublasStatus_t, cublasDrotg_v2, cublasHandle_t, handle, double*, a, double*, b, double*, c, double*, s); +DEF_FN(cublasStatus_t, cublasCrotg_v2, cublasHandle_t, handle, cuComplex*, a, cuComplex*, b, float*, c, cuComplex*, s); +DEF_FN(cublasStatus_t, cublasZrotg_v2, cublasHandle_t, handle, cuDoubleComplex*, a, cuDoubleComplex*, b, double*, c, cuDoubleComplex*, s); +DEF_FN(cublasStatus_t, cublasRotgEx, cublasHandle_t, handle, void*, a, void*, b, cudaDataType, abType, void*, c, void*, s, cudaDataType, csType, cudaDataType, executiontype); +DEF_FN(cublasStatus_t, cublasSrotm_v2, cublasHandle_t, handle, int, n, float*, x, int, incx, float*, y, int, incy, const float*, param); +DEF_FN(cublasStatus_t, cublasSrotm_v2_64, cublasHandle_t, handle, int64_t, n, float*, x, int64_t, incx, float*, y, int64_t, incy, const float*, param); +DEF_FN(cublasStatus_t, cublasDrotm_v2, cublasHandle_t, handle, int, n, double*, x, int, incx, double*, y, int, incy, const double*, param); +DEF_FN(cublasStatus_t, cublasDrotm_v2_64, , cublasHandle_t, handle, int64_t, n, double*, x, int64_t, incx, double*, y, int64_t, incy, const double*, param); +DEF_FN(cublasStatus_t, cublasRotmEx, cublasHandle_t, handle, int, n, void*, x, cudaDataType, xType, int, incx, void*, y, cudaDataType, yType, int, incy, const void*, param, cudaDataType, paramType, cudaDataType, executiontype); +DEF_FN(cublasStatus_t, cublasRotmEx_64, cublasHandle_t, handle, int64_t, n, void*, x, cudaDataType, xType, int64_t, incx, void*, y, cudaDataType, yType, int64_t, incy, const void*, param, cudaDataType, paramType, cudaDataType, executiontype); +DEF_FN(cublasStatus_t, cublasSrotmg_v2, cublasHandle_t, handle, float*, d1, float*, d2, float*, x1, const float*, y1, float*, param); +DEF_FN(cublasStatus_t, cublasDrotmg_v2, cublasHandle_t, handle, double*, d1, double*, d2, double*, x1, const double*, y1, double*, param); +DEF_FN(cublasStatus_t, cublasRotmgEx, cublasHandle_t, handle, void*, d1, cudaDataType, d1Type, void*, d2, cudaDataType, d2Type, void*, x1, cudaDataType, x1Type, const void*, y1, cudaDataType, y1Type, void*, param, cudaDataType, paramType, cudaDataType, executiontype); + +cublasStatus_t cublasSgemv_v2(cublasHandle_t handle, + cublasOperation_t trans, + int m, int n, + const float *alpha, + const float *A, int lda, + const float *x, int incx, + const float *beta, + float *y, int incy) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + retval_1 = rpc_cublassgemv_1( + (ptr)handle, + (int)trans, + m, n, + *alpha, + (ptr)A, lda, + (ptr)x, incx, + *beta, + (ptr)y, incy, + &result, clnt); + if (retval_1 != RPC_SUCCESS) { + clnt_perror (clnt, "call failed"); + } + return result; +} + +DEF_FN(cublasStatus_t, cublasSgemv_v2_64, cublasHandle_t, handle, cublasOperation_t, trans, int64_t, m, int64_t, n, const float*, alpha, const float*, A, int64_t, lda, const float*, x, int64_t, incx, const float*, beta, float*, y, int64_t, incy); + +cublasStatus_t cublasDgemv_v2(cublasHandle_t handle, +cublasOperation_t trans, + int m, int n, + const double *alpha, + const double *A, int lda, + const double *x, int incx, + const double *beta, + double *y, int incy) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + retval_1 = rpc_cublasdgemv_1( + (ptr)handle, + (int)trans, + m, n, + *alpha, + (ptr)A, lda, + 
(ptr)x, incx, + *beta, + (ptr)y, incy, + &result, clnt); + if (retval_1 != RPC_SUCCESS) { + clnt_perror (clnt, "call failed"); + } + return result; +} +DEF_FN(cublasStatus_t, cublasDgemv_v2_64, cublasHandle_t, handle, cublasOperation_t, trans, int64_t, m, int64_t, n, const double*, alpha, const double*, A, int64_t, lda, const double*, x, int64_t, incx, const double*, beta, double*, y, int64_t, incy); +DEF_FN(cublasStatus_t, cublasCgemv_v2, cublasHandle_t, handle, cublasOperation_t, trans, int, m, int, n, const cuComplex*, alpha, const cuComplex*, A, int, lda, const cuComplex*, x, int, incx, const cuComplex*, beta, cuComplex*, y, int, incy); +DEF_FN(cublasStatus_t, cublasCgemv_v2_64, cublasHandle_t, handle, cublasOperation_t, trans, int64_t, m, int64_t, n, const cuComplex*, alpha, const cuComplex*, A, int64_t, lda, const cuComplex*, x, int64_t, incx, const cuComplex*, beta, cuComplex*, y, int64_t, incy); +DEF_FN(cublasStatus_t, cublasZgemv_v2, cublasHandle_t, handle, cublasOperation_t, trans, int, m, int, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int, lda, const cuDoubleComplex*, x, int, incx, const cuDoubleComplex*, beta, cuDoubleComplex*, y, int, incy); +DEF_FN(cublasStatus_t, cublasZgemv_v2_64, cublasHandle_t, handle, cublasOperation_t, trans, int64_t, m, int64_t, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int64_t, lda, const cuDoubleComplex*, x, int64_t, incx, const cuDoubleComplex*, beta, cuDoubleComplex*, y, int64_t, incy); +DEF_FN(cublasStatus_t, cublasSgbmv_v2, cublasHandle_t, handle, cublasOperation_t, trans, int, m, int, n, int, kl, int, ku, const float*, alpha, const float*, A, int, lda, const float*, x, int, incx, const float*, beta, float*, y, int, incy); +DEF_FN(cublasStatus_t, cublasSgbmv_v2_64, cublasHandle_t, handle, cublasOperation_t, trans, int64_t, m, int64_t, n, int64_t, kl, int64_t, ku, const float*, alpha, const float*, A, int64_t, lda, const float*, x, int64_t, incx, const float*, beta, float*, y, int64_t, incy); +DEF_FN(cublasStatus_t, cublasDgbmv_v2, cublasHandle_t, handle, cublasOperation_t, trans, int, m, int, n, int, kl, int, ku, const double*, alpha, const double*, A, int, lda, const double*, x, int, incx, const double*, beta, double*, y, int, incy); +DEF_FN(cublasStatus_t, cublasDgbmv_v2_64, cublasHandle_t, handle, cublasOperation_t, trans, int64_t, m, int64_t, n, int64_t, kl, int64_t, ku, const double*, alpha, const double*, A, int64_t, lda, const double*, x, int64_t, incx, const double*, beta, double*, y, int64_t, incy); +DEF_FN(cublasStatus_t, cublasCgbmv_v2, cublasHandle_t, handle, cublasOperation_t, trans, int, m, int, n, int, kl, int, ku, const cuComplex*, alpha, const cuComplex*, A, int, lda, const cuComplex*, x, int, incx, const cuComplex*, beta, cuComplex*, y, int, incy); +DEF_FN(cublasStatus_t, cublasCgbmv_v2_64, cublasHandle_t, handle, cublasOperation_t, trans, int64_t, m, int64_t, n, int64_t, kl, int64_t, ku, const cuComplex*, alpha, const cuComplex*, A, int64_t, lda, const cuComplex*, x, int64_t, incx, const cuComplex*, beta, cuComplex*, y, int64_t, incy); +DEF_FN(cublasStatus_t, cublasZgbmv_v2, cublasHandle_t, handle, cublasOperation_t, trans, int, m, int, n, int, kl, int, ku, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int, lda, const cuDoubleComplex*, x, int, incx, const cuDoubleComplex*, beta, cuDoubleComplex*, y, int, incy); +DEF_FN(cublasStatus_t, cublasZgbmv_v2_64, cublasHandle_t, handle, cublasOperation_t, trans, int64_t, m, int64_t, n, int64_t, kl, int64_t, ku, const cuDoubleComplex*, 
alpha, const cuDoubleComplex*, A, int64_t, lda, const cuDoubleComplex*, x, int64_t, incx, const cuDoubleComplex*, beta, cuDoubleComplex*, y, int64_t, incy); +DEF_FN(cublasStatus_t, cublasStrmv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, const float*, A, int, lda, float*, x, int, incx); +DEF_FN(cublasStatus_t, cublasStrmv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, const float*, A, int64_t, lda, float*, x, int64_t, incx); +DEF_FN(cublasStatus_t, cublasDtrmv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, const double*, A, int, lda, double*, x, int, incx); +DEF_FN(cublasStatus_t, cublasDtrmv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, const double*, A, int64_t, lda, double*, x, int64_t, incx); +DEF_FN(cublasStatus_t, cublasCtrmv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, const cuComplex*, A, int, lda, cuComplex*, x, int, incx); +DEF_FN(cublasStatus_t, cublasCtrmv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, const cuComplex*, A, int64_t, lda, cuComplex*, x, int64_t, incx); +DEF_FN(cublasStatus_t, cublasZtrmv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, const cuDoubleComplex*, A, int, lda, cuDoubleComplex*, x, int, incx); +DEF_FN(cublasStatus_t, cublasZtrmv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, const cuDoubleComplex*, A, int64_t, lda, cuDoubleComplex*, x, int64_t, incx); +DEF_FN(cublasStatus_t, cublasStbmv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, int, k, const float*, A, int, lda, float*, x, int, incx); +DEF_FN(cublasStatus_t, cublasStbmv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, int64_t, k, const float*, A, int64_t, lda, float*, x, int64_t, incx); +DEF_FN(cublasStatus_t, cublasDtbmv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, int, k, const double*, A, int, lda, double*, x, int, incx); +DEF_FN(cublasStatus_t, cublasDtbmv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, int64_t, k, const double*, A, int64_t, lda, double*, x, int64_t, incx); +DEF_FN(cublasStatus_t, cublasCtbmv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, int, k, const cuComplex*, A, int, lda, cuComplex*, x, int, incx); +DEF_FN(cublasStatus_t, cublasCtbmv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, int64_t, k, const cuComplex*, A, int64_t, lda, cuComplex*, x, int64_t, incx); +DEF_FN(cublasStatus_t, cublasZtbmv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, int, k, const cuDoubleComplex*, A, int, lda, cuDoubleComplex*, x, int, incx); +DEF_FN(cublasStatus_t, cublasZtbmv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, int64_t, k, const 
cuDoubleComplex*, A, int64_t, lda, cuDoubleComplex*, x, int64_t, incx); +DEF_FN(cublasStatus_t, cublasStpmv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, const float*, AP, float*, x, int, incx); +DEF_FN(cublasStatus_t, cublasStpmv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, const float*, AP, float*, x, int64_t, incx); +DEF_FN(cublasStatus_t, cublasDtpmv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, const double*, AP, double*, x, int, incx); +DEF_FN(cublasStatus_t, cublasDtpmv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, const double*, AP, double*, x, int64_t, incx); +DEF_FN(cublasStatus_t, cublasCtpmv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, const cuComplex*, AP, cuComplex*, x, int, incx); +DEF_FN(cublasStatus_t, cublasCtpmv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, const cuComplex*, AP, cuComplex*, x, int64_t, incx); +DEF_FN(cublasStatus_t, cublasZtpmv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, const cuDoubleComplex*, AP, cuDoubleComplex*, x, int, incx); +DEF_FN(cublasStatus_t, cublasZtpmv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, const cuDoubleComplex*, AP, cuDoubleComplex*, x, int64_t, incx); +DEF_FN(cublasStatus_t, cublasStrsv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, const float*, A, int, lda, float*, x, int, incx); +DEF_FN(cublasStatus_t, cublasStrsv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, const float*, A, int64_t, lda, float*, x, int64_t, incx); +DEF_FN(cublasStatus_t, cublasDtrsv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, const double*, A, int, lda, double*, x, int, incx); +DEF_FN(cublasStatus_t, cublasDtrsv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, const double*, A, int64_t, lda, double*, x, int64_t, incx); +DEF_FN(cublasStatus_t, cublasCtrsv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, const cuComplex*, A, int, lda, cuComplex*, x, int, incx); +DEF_FN(cublasStatus_t, cublasCtrsv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, const cuComplex*, A, int64_t, lda, cuComplex*, x, int64_t, incx); +DEF_FN(cublasStatus_t, cublasZtrsv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, const cuDoubleComplex*, A, int, lda, cuDoubleComplex*, x, int, incx); +DEF_FN(cublasStatus_t, cublasZtrsv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, const cuDoubleComplex*, A, int64_t, lda, cuDoubleComplex*, x, int64_t, incx); +DEF_FN(cublasStatus_t, cublasStpsv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, const float*, AP, float*, x, int, incx); 
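In the portion of the file shown here, the calls forwarded over RPC are handle management (cublasCreate_v2/cublasDestroy_v2), the workspace/stream/math-mode setters, and cublasSgemv_v2/cublasDgemv_v2 further down; every DEF_FN declaration, including cublasSetVector/cublasSetMatrix, still goes through the local passthrough. A small caller that stays on the forwarded paths therefore looks like the sketch below. It uses only the standard CUDA runtime and cuBLAS API; whether it actually runs against a remote GPU depends on LD_PRELOADing cricket-client.so as in the CI jobs above, and on cricket's CUDA runtime client forwarding cudaMalloc/cudaMemcpy, both of which are assumptions about the surrounding setup rather than something this file provides.

// Usage sketch (editorial, not part of the patch): stays on code paths the
// new cuBLAS client wrapper forwards (create, sgemv, destroy). Data movement
// uses plain cudaMalloc/cudaMemcpy because cublasSetVector/cublasSetMatrix
// above remain DEF_FN stubs.
#include <stdio.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>

int main(void)
{
    const int m = 2, n = 2;
    float A[4] = {1, 2, 3, 4};           /* column-major 2x2 */
    float x[2] = {1, 1}, y[2] = {0, 0};
    float alpha = 1.0f, beta = 0.0f;
    float *dA, *dx, *dy;

    cudaMalloc((void**)&dA, sizeof(A));
    cudaMalloc((void**)&dx, sizeof(x));
    cudaMalloc((void**)&dy, sizeof(y));
    cudaMemcpy(dA, A, sizeof(A), cudaMemcpyHostToDevice);
    cudaMemcpy(dx, x, sizeof(x), cudaMemcpyHostToDevice);

    cublasHandle_t handle;
    if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "cublasCreate failed\n");
        return 1;
    }
    /* y = alpha * A * x + beta * y, forwarded via rpc_cublassgemv_1 */
    cublasSgemv(handle, CUBLAS_OP_N, m, n, &alpha, dA, m, dx, 1, &beta, dy, 1);
    cudaMemcpy(y, dy, sizeof(y), cudaMemcpyDeviceToHost);
    printf("y = [%f, %f]\n", y[0], y[1]);

    cublasDestroy(handle);
    cudaFree(dA); cudaFree(dx); cudaFree(dy);
    return 0;
}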
+DEF_FN(cublasStatus_t, cublasStpsv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, const float*, AP, float*, x, int64_t, incx); +DEF_FN(cublasStatus_t, cublasDtpsv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, const double*, AP, double*, x, int, incx); +DEF_FN(cublasStatus_t, cublasDtpsv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, const double*, AP, double*, x, int64_t, incx); +DEF_FN(cublasStatus_t, cublasCtpsv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, const cuComplex*, AP, cuComplex*, x, int, incx); +DEF_FN(cublasStatus_t, cublasCtpsv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, const cuComplex*, AP, cuComplex*, x, int64_t, incx); +DEF_FN(cublasStatus_t, cublasZtpsv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, const cuDoubleComplex*, AP, cuDoubleComplex*, x, int, incx); +DEF_FN(cublasStatus_t, cublasZtpsv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, const cuDoubleComplex*, AP, cuDoubleComplex*, x, int64_t, incx); +DEF_FN(cublasStatus_t, cublasStbsv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, int, k, const float*, A, int, lda, float*, x, int, incx); +DEF_FN(cublasStatus_t, cublasStbsv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, int64_t, k, const float*, A, int64_t, lda, float*, x, int64_t, incx); +DEF_FN(cublasStatus_t, cublasDtbsv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, int, k, const double*, A, int, lda, double*, x, int, incx); +DEF_FN(cublasStatus_t, cublasDtbsv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, int64_t, k, const double*, A, int64_t, lda, double*, x, int64_t, incx); +DEF_FN(cublasStatus_t, cublasCtbsv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, int, k, const cuComplex*, A, int, lda, cuComplex*, x, int, incx); +DEF_FN(cublasStatus_t, cublasCtbsv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, int64_t, k, const cuComplex*, A, int64_t, lda, cuComplex*, x, int64_t, incx); +DEF_FN(cublasStatus_t, cublasZtbsv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, int, k, const cuDoubleComplex*, A, int, lda, cuDoubleComplex*, x, int, incx); +DEF_FN(cublasStatus_t, cublasZtbsv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, int64_t, k, const cuDoubleComplex*, A, int64_t, lda, cuDoubleComplex*, x, int64_t, incx); +DEF_FN(cublasStatus_t, cublasSsymv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const float*, alpha, const float*, A, int, lda, const float*, x, int, incx, const float*, beta, float*, y, int, incy); +DEF_FN(cublasStatus_t, cublasSsymv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const float*, alpha, const float*, A, int64_t, lda, 
const float*, x, int64_t, incx, const float*, beta, float*, y, int64_t, incy); +DEF_FN(cublasStatus_t, cublasDsymv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const double*, alpha, const double*, A, int, lda, const double*, x, int, incx, const double*, beta, double*, y, int, incy); +DEF_FN(cublasStatus_t, cublasDsymv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const double*, alpha, const double*, A, int64_t, lda, const double*, x, int64_t, incx, const double*, beta, double*, y, int64_t, incy); +DEF_FN(cublasStatus_t, cublasCsymv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const cuComplex*, alpha, const cuComplex*, A, int, lda, const cuComplex*, x, int, incx, const cuComplex*, beta, cuComplex*, y, int, incy); +DEF_FN(cublasStatus_t, cublasCsymv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const cuComplex*, alpha, const cuComplex*, A, int64_t, lda, const cuComplex*, x, int64_t, incx, const cuComplex*, beta, cuComplex*, y, int64_t, incy); +DEF_FN(cublasStatus_t, cublasZsymv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int, lda, const cuDoubleComplex*, x, int, incx, const cuDoubleComplex*, beta, cuDoubleComplex*, y, int, incy); +DEF_FN(cublasStatus_t, cublasZsymv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int64_t, lda, const cuDoubleComplex*, x, int64_t, incx, const cuDoubleComplex*, beta, cuDoubleComplex*, y, int64_t, incy); +DEF_FN(cublasStatus_t, cublasChemv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const cuComplex*, alpha, const cuComplex*, A, int, lda, const cuComplex*, x, int, incx, const cuComplex*, beta, cuComplex*, y, int, incy); +DEF_FN(cublasStatus_t, cublasChemv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const cuComplex*, alpha, const cuComplex*, A, int64_t, lda, const cuComplex*, x, int64_t, incx, const cuComplex*, beta, cuComplex*, y, int64_t, incy); +DEF_FN(cublasStatus_t, cublasZhemv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int, lda, const cuDoubleComplex*, x, int, incx, const cuDoubleComplex*, beta, cuDoubleComplex*, y, int, incy); +DEF_FN(cublasStatus_t, cublasZhemv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int64_t, lda, const cuDoubleComplex*, x, int64_t, incx, const cuDoubleComplex*, beta, cuDoubleComplex*, y, int64_t, incy); +DEF_FN(cublasStatus_t, cublasSsbmv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, int, k, const float*, alpha, const float*, A, int, lda, const float*, x, int, incx, const float*, beta, float*, y, int, incy); +DEF_FN(cublasStatus_t, cublasSsbmv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, int64_t, k, const float*, alpha, const float*, A, int64_t, lda, const float*, x, int64_t, incx, const float*, beta, float*, y, int64_t, incy); +DEF_FN(cublasStatus_t, cublasDsbmv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, int, k, const double*, alpha, const double*, A, int, lda, const double*, x, int, incx, const double*, beta, double*, y, int, incy); +DEF_FN(cublasStatus_t, cublasDsbmv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, int64_t, k, const double*, alpha, const double*, A, int64_t, lda, const double*, x, int64_t, incx, const double*, beta, double*, y, int64_t, 
incy); +DEF_FN(cublasStatus_t, cublasChbmv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, int, k, const cuComplex*, alpha, const cuComplex*, A, int, lda, const cuComplex*, x, int, incx, const cuComplex*, beta, cuComplex*, y, int, incy); +DEF_FN(cublasStatus_t, cublasChbmv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, int64_t, k, const cuComplex*, alpha, const cuComplex*, A, int64_t, lda, const cuComplex*, x, int64_t, incx, const cuComplex*, beta, cuComplex*, y, int64_t, incy); +DEF_FN(cublasStatus_t, cublasZhbmv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, int, k, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int, lda, const cuDoubleComplex*, x, int, incx, const cuDoubleComplex*, beta, cuDoubleComplex*, y, int, incy); +DEF_FN(cublasStatus_t, cublasZhbmv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, int64_t, k, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int64_t, lda, const cuDoubleComplex*, x, int64_t, incx, const cuDoubleComplex*, beta, cuDoubleComplex*, y, int64_t, incy); +DEF_FN(cublasStatus_t, cublasSspmv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const float*, alpha, const float*, AP, const float*, x, int, incx, const float*, beta, float*, y, int, incy); +DEF_FN(cublasStatus_t, cublasSspmv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const float*, alpha, const float*, AP, const float*, x, int64_t, incx, const float*, beta, float*, y, int64_t, incy); +DEF_FN(cublasStatus_t, cublasDspmv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const double*, alpha, const double*, AP, const double*, x, int, incx, const double*, beta, double*, y, int, incy); +DEF_FN(cublasStatus_t, cublasDspmv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const double*, alpha, const double*, AP, const double*, x, int64_t, incx, const double*, beta, double*, y, int64_t, incy); +DEF_FN(cublasStatus_t, cublasChpmv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const cuComplex*, alpha, const cuComplex*, AP, const cuComplex*, x, int, incx, const cuComplex*, beta, cuComplex*, y, int, incy); +DEF_FN(cublasStatus_t, cublasChpmv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const cuComplex*, alpha, const cuComplex*, AP, const cuComplex*, x, int64_t, incx, const cuComplex*, beta, cuComplex*, y, int64_t, incy); +DEF_FN(cublasStatus_t, cublasZhpmv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, AP, const cuDoubleComplex*, x, int, incx, const cuDoubleComplex*, beta, cuDoubleComplex*, y, int, incy); +DEF_FN(cublasStatus_t, cublasZhpmv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, AP, const cuDoubleComplex*, x, int64_t, incx, const cuDoubleComplex*, beta, cuDoubleComplex*, y, int64_t, incy); +DEF_FN(cublasStatus_t, cublasSger_v2, cublasHandle_t, handle, int, m, int, n, const float*, alpha, const float*, x, int, incx, const float*, y, int, incy, float*, A, int, lda); +DEF_FN(cublasStatus_t, cublasSger_v2_64, cublasHandle_t, handle, int64_t, m, int64_t, n, const float*, alpha, const float*, x, int64_t, incx, const float*, y, int64_t, incy, float*, A, int64_t, lda); +DEF_FN(cublasStatus_t, cublasDger_v2, cublasHandle_t, handle, int, m, int, n, const double*, alpha, const double*, x, int, incx, const double*, y, int, incy, double*, A, int, lda); +DEF_FN(cublasStatus_t, cublasDger_v2_64, 
cublasHandle_t, handle, int64_t, m, int64_t, n, const double*, alpha, const double*, x, int64_t, incx, const double*, y, int64_t, incy, double*, A, int64_t, lda); +DEF_FN(cublasStatus_t, cublasCgeru_v2, cublasHandle_t, handle, int, m, int, n, const cuComplex*, alpha, const cuComplex*, x, int, incx, const cuComplex*, y, int, incy, cuComplex*, A, int, lda); +DEF_FN(cublasStatus_t, cublasCgeru_v2_64, cublasHandle_t, handle, int64_t, m, int64_t, n, const cuComplex*, alpha, const cuComplex*, x, int64_t, incx, const cuComplex*, y, int64_t, incy, cuComplex*, A, int64_t, lda); +DEF_FN(cublasStatus_t, cublasCgerc_v2, cublasHandle_t, handle, int, m, int, n, const cuComplex*, alpha, const cuComplex*, x, int, incx, const cuComplex*, y, int, incy, cuComplex*, A, int, lda); +DEF_FN(cublasStatus_t, cublasCgerc_v2_64, cublasHandle_t, handle, int64_t, m, int64_t, n, const cuComplex*, alpha, const cuComplex*, x, int64_t, incx, const cuComplex*, y, int64_t, incy, cuComplex*, A, int64_t, lda); +DEF_FN(cublasStatus_t, cublasZgeru_v2, cublasHandle_t, handle, int, m, int, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, x, int, incx, const cuDoubleComplex*, y, int, incy, cuDoubleComplex*, A, int, lda); +DEF_FN(cublasStatus_t, cublasZgeru_v2_64, cublasHandle_t, handle, int64_t, m, int64_t, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, x, int64_t, incx, const cuDoubleComplex*, y, int64_t, incy, cuDoubleComplex*, A, int64_t, lda); +DEF_FN(cublasStatus_t, cublasZgerc_v2, cublasHandle_t, handle, int, m, int, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, x, int, incx, const cuDoubleComplex*, y, int, incy, cuDoubleComplex*, A, int, lda); +DEF_FN(cublasStatus_t, cublasZgerc_v2_64, cublasHandle_t, handle, int64_t, m, int64_t, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, x, int64_t, incx, const cuDoubleComplex*, y, int64_t, incy, cuDoubleComplex*, A, int64_t, lda); +DEF_FN(cublasStatus_t, cublasSsyr_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const float*, alpha, const float*, x, int, incx, float*, A, int, lda); +DEF_FN(cublasStatus_t, cublasSsyr_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const float*, alpha, const float*, x, int64_t, incx, float*, A, int64_t, lda); +DEF_FN(cublasStatus_t, cublasDsyr_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const double*, alpha, const double*, x, int, incx, double*, A, int, lda); +DEF_FN(cublasStatus_t, cublasDsyr_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const double*, alpha, const double*, x, int64_t, incx, double*, A, int64_t, lda); +DEF_FN(cublasStatus_t, cublasCsyr_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const cuComplex*, alpha, const cuComplex*, x, int, incx, cuComplex*, A, int, lda); +DEF_FN(cublasStatus_t, cublasCsyr_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const cuComplex*, alpha, const cuComplex*, x, int64_t, incx, cuComplex*, A, int64_t, lda); +DEF_FN(cublasStatus_t, cublasZsyr_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, x, int, incx, cuDoubleComplex*, A, int, lda); +DEF_FN(cublasStatus_t, cublasZsyr_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, x, int64_t, incx, cuDoubleComplex*, A, int64_t, lda); +DEF_FN(cublasStatus_t, cublasCher_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const float*, alpha, const cuComplex*, x, int, incx, cuComplex*, A, 
int, lda); +DEF_FN(cublasStatus_t, cublasCher_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const float*, alpha, const cuComplex*, x, int64_t, incx, cuComplex*, A, int64_t, lda); +DEF_FN(cublasStatus_t, cublasZher_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const double*, alpha, const cuDoubleComplex*, x, int, incx, cuDoubleComplex*, A, int, lda); +DEF_FN(cublasStatus_t, cublasZher_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const double*, alpha, const cuDoubleComplex*, x, int64_t, incx, cuDoubleComplex*, A, int64_t, lda); +DEF_FN(cublasStatus_t, cublasSspr_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const float*, alpha, const float*, x, int, incx, float*, AP); +DEF_FN(cublasStatus_t, cublasSspr_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const float*, alpha, const float*, x, int64_t, incx, float*, AP); +DEF_FN(cublasStatus_t, cublasDspr_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const double*, alpha, const double*, x, int, incx, double*, AP); +DEF_FN(cublasStatus_t, cublasDspr_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const double*, alpha, const double*, x, int64_t, incx, double*, AP); +DEF_FN(cublasStatus_t, cublasChpr_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const float*, alpha, const cuComplex*, x, int, incx, cuComplex*, AP); +DEF_FN(cublasStatus_t, cublasChpr_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const float*, alpha, const cuComplex*, x, int64_t, incx, cuComplex*, AP); +DEF_FN(cublasStatus_t, cublasZhpr_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const double*, alpha, const cuDoubleComplex*, x, int, incx, cuDoubleComplex*, AP); +DEF_FN(cublasStatus_t, cublasZhpr_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const double*, alpha, const cuDoubleComplex*, x, int64_t, incx, cuDoubleComplex*, AP); +DEF_FN(cublasStatus_t, cublasSsyr2_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const float*, alpha, const float*, x, int, incx, const float*, y, int, incy, float*, A, int, lda); +DEF_FN(cublasStatus_t, cublasSsyr2_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const float*, alpha, const float*, x, int64_t, incx, const float*, y, int64_t, incy, float*, A, int64_t, lda); +DEF_FN(cublasStatus_t, cublasDsyr2_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const double*, alpha, const double*, x, int, incx, const double*, y, int, incy, double*, A, int, lda); +DEF_FN(cublasStatus_t, cublasDsyr2_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const double*, alpha, const double*, x, int64_t, incx, const double*, y, int64_t, incy, double*, A, int64_t, lda); +DEF_FN(cublasStatus_t, cublasCsyr2_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const cuComplex*, alpha, const cuComplex*, x, int, incx, const cuComplex*, y, int, incy, cuComplex*, A, int, lda); +DEF_FN(cublasStatus_t, cublasCsyr2_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const cuComplex*, alpha, const cuComplex*, x, int64_t, incx, const cuComplex*, y, int64_t, incy, cuComplex*, A, int64_t, lda); +DEF_FN(cublasStatus_t, cublasZsyr2_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, x, int, incx, const cuDoubleComplex*, y, int, incy, cuDoubleComplex*, A, int, lda); +DEF_FN(cublasStatus_t, cublasZsyr2_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo,
int64_t, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, x, int64_t, incx, const cuDoubleComplex*, y, int64_t, incy, cuDoubleComplex*, A, int64_t, lda); +DEF_FN(cublasStatus_t, cublasCher2_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const cuComplex*, alpha, const cuComplex*, x, int, incx, const cuComplex*, y, int, incy, cuComplex*, A, int, lda); +DEF_FN(cublasStatus_t, cublasCher2_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const cuComplex*, alpha, const cuComplex*, x, int64_t, incx, const cuComplex*, y, int64_t, incy, cuComplex*, A, int64_t, lda); +DEF_FN(cublasStatus_t, cublasZher2_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, x, int, incx, const cuDoubleComplex*, y, int, incy, cuDoubleComplex*, A, int, lda); +DEF_FN(cublasStatus_t, cublasZher2_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, x, int64_t, incx, const cuDoubleComplex*, y, int64_t, incy, cuDoubleComplex*, A, int64_t, lda); +DEF_FN(cublasStatus_t, cublasSspr2_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const float*, alpha, const float*, x, int, incx, const float*, y, int, incy, float*, AP); +DEF_FN(cublasStatus_t, cublasSspr2_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const float*, alpha, const float*, x, int64_t, incx, const float*, y, int64_t, incy, float*, AP); +DEF_FN(cublasStatus_t, cublasDspr2_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const double*, alpha, const double*, x, int, incx, const double*, y, int, incy, double*, AP); +DEF_FN(cublasStatus_t, cublasDspr2_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const double*, alpha, const double*, x, int64_t, incx, const double*, y, int64_t, incy, double*, AP); +DEF_FN(cublasStatus_t, cublasChpr2_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const cuComplex*, alpha, const cuComplex*, x, int, incx, const cuComplex*, y, int, incy, cuComplex*, AP); +DEF_FN(cublasStatus_t, cublasChpr2_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const cuComplex*, alpha, const cuComplex*, x, int64_t, incx, const cuComplex*, y, int64_t, incy, cuComplex*, AP); +DEF_FN(cublasStatus_t, cublasZhpr2_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, x, int, incx, const cuDoubleComplex*, y, int, incy, cuDoubleComplex*, AP); +DEF_FN(cublasStatus_t, cublasZhpr2_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, x, int64_t, incx, const cuDoubleComplex*, y, int64_t, incy, cuDoubleComplex*, AP); +DEF_FN(cublasStatus_t, cublasSgemvBatched, cublasHandle_t, handle, cublasOperation_t, trans, int, m, int, n, const float*, alpha, const float* const*, Aarray, int, lda, const float* const*, xarray, int, incx, const float*, beta, float* const*, yarray, int, incy, int, batchCount); +DEF_FN(cublasStatus_t, cublasSgemvBatched_64, cublasHandle_t, handle, cublasOperation_t, trans, int64_t, m, int64_t, n, const float*, alpha, const float* const*, Aarray, int64_t, lda, const float* const*, xarray, int64_t, incx, const float*, beta, float* const*, yarray, int64_t, incy, int64_t, batchCount); +DEF_FN(cublasStatus_t, cublasDgemvBatched, cublasHandle_t, handle, cublasOperation_t, trans, int, m, int, n, const double*, alpha, const double* const*, Aarray, int, lda, const double* const*, xarray, 
int, incx, const double*, beta, double* const*, yarray, int, incy, int, batchCount); +DEF_FN(cublasStatus_t, cublasDgemvBatched_64, cublasHandle_t, handle, cublasOperation_t, trans, int64_t, m, int64_t, n, const double*, alpha, const double* const*, Aarray, int64_t, lda, const double* const*, xarray, int64_t, incx, const double*, beta, double* const*, yarray, int64_t, incy, int64_t, batchCount); +DEF_FN(cublasStatus_t, cublasCgemvBatched, cublasHandle_t, handle, cublasOperation_t, trans, int, m, int, n, const cuComplex*, alpha, const cuComplex* const*, Aarray, int, lda, const cuComplex* const*, xarray, int, incx, const cuComplex*, beta, cuComplex* const*, yarray, int, incy, int, batchCount); +DEF_FN(cublasStatus_t, cublasCgemvBatched_64, cublasHandle_t, handle, cublasOperation_t, trans, int64_t, m, int64_t, n, const cuComplex*, alpha, const cuComplex* const*, Aarray, int64_t, lda, const cuComplex* const*, xarray, int64_t, incx, const cuComplex*, beta, cuComplex* const*, yarray, int64_t, incy, int64_t, batchCount); +DEF_FN(cublasStatus_t, cublasZgemvBatched, cublasHandle_t, handle, cublasOperation_t, trans, int, m, int, n, const cuDoubleComplex*, alpha, const cuDoubleComplex* const*, Aarray, int, lda, const cuDoubleComplex* const*, xarray, int, incx, const cuDoubleComplex*, beta, cuDoubleComplex* const*, yarray, int, incy, int, batchCount); +DEF_FN(cublasStatus_t, cublasZgemvBatched_64, cublasHandle_t, handle, cublasOperation_t, trans, int64_t, m, int64_t, n, const cuDoubleComplex*, alpha, const cuDoubleComplex* const*, Aarray, int64_t, lda, const cuDoubleComplex* const*, xarray, int64_t, incx, const cuDoubleComplex*, beta, cuDoubleComplex* const*, yarray, int64_t, incy, int64_t, batchCount); +DEF_FN(cublasStatus_t, cublasSgemvStridedBatched, cublasHandle_t, handle, cublasOperation_t, trans, int, m, int, n, const float*, alpha, const float*, A, int, lda, long long int, strideA, const float*, x, int, incx, long long int, stridex, const float*, beta, float*, y, int, incy, long long int, stridey, int, batchCount); +DEF_FN(cublasStatus_t, cublasSgemvStridedBatched_64, cublasHandle_t, handle, cublasOperation_t, trans, int64_t, m, int64_t, n, const float*, alpha, const float*, A, int64_t, lda, long long int, strideA, const float*, x, int64_t, incx, long long int, stridex, const float*, beta, float*, y, int64_t, incy, long long int, stridey, int64_t, batchCount); +DEF_FN(cublasStatus_t, cublasDgemvStridedBatched, cublasHandle_t, handle, cublasOperation_t, trans, int, m, int, n, const double*, alpha, const double*, A, int, lda, long long int, strideA, const double*, x, int, incx, long long int, stridex, const double*, beta, double*, y, int, incy, long long int, stridey, int, batchCount); +DEF_FN(cublasStatus_t, cublasDgemvStridedBatched_64, cublasHandle_t, handle, cublasOperation_t, trans, int64_t, m, int64_t, n, const double*, alpha, const double*, A, int64_t, lda, long long int, strideA, const double*, x, int64_t, incx, long long int, stridex, const double*, beta, double*, y, int64_t, incy, long long int, stridey, int64_t, batchCount); +DEF_FN(cublasStatus_t, cublasCgemvStridedBatched, cublasHandle_t, handle, cublasOperation_t, trans, int, m, int, n, const cuComplex*, alpha, const cuComplex*, A, int, lda, long long int, strideA, const cuComplex*, x, int, incx, long long int, stridex, const cuComplex*, beta, cuComplex*, y, int, incy, long long int, stridey, int, batchCount); +DEF_FN(cublasStatus_t, cublasCgemvStridedBatched_64, cublasHandle_t, handle, cublasOperation_t, trans, int64_t, m, int64_t, 
n, const cuComplex*, alpha, const cuComplex*, A, int64_t, lda, long long int, strideA, const cuComplex*, x, int64_t, incx, long long int, stridex, const cuComplex*, beta, cuComplex*, y, int64_t, incy, long long int, stridey, int64_t, batchCount); +DEF_FN(cublasStatus_t, cublasZgemvStridedBatched, cublasHandle_t, handle, cublasOperation_t, trans, int, m, int, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int, lda, long long int, strideA, const cuDoubleComplex*, x, int, incx, long long int, stridex, const cuDoubleComplex*, beta, cuDoubleComplex*, y, int, incy, long long int, stridey, int, batchCount); +DEF_FN(cublasStatus_t, cublasZgemvStridedBatched_64, cublasHandle_t, handle, cublasOperation_t, trans, int64_t, m, int64_t, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int64_t, lda, long long int, strideA, const cuDoubleComplex*, x, int64_t, incx, long long int, stridex, const cuDoubleComplex*, beta, cuDoubleComplex*, y, int64_t, incy, long long int, stridey, int64_t, batchCount); + +cublasStatus_t cublasSgemm_v2(cublasHandle_t handle, + cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, + const float *alpha, + const float *A, int lda, + const float *B, int ldb, + const float *beta, + float *C, int ldc) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + retval_1 = rpc_cublassgemm_1( + (ptr)handle, + (int)transa, + (int)transb, + m, n, k, + *alpha, + (ptr)A, lda, + (ptr)B, ldb, + *beta, + (ptr)C, ldc, + &result, clnt); + if (retval_1 != RPC_SUCCESS) { + clnt_perror (clnt, "call failed"); + } + return result; +} +DEF_FN(cublasStatus_t, cublasSgemm_v2_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const float*, alpha, const float*, A, int64_t, lda, const float*, B, int64_t, ldb, const float*, beta, float*, C, int64_t, ldc); + +cublasStatus_t cublasDgemm_v2(cublasHandle_t handle, + cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, + const double *alpha, + const double *A, int lda, + const double *B, int ldb, + const double *beta, + double *C, int ldc) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + retval_1 = rpc_cublasdgemm_1( + (ptr)handle, + (int)transa, + (int)transb, + m, n, k, + *alpha, + (ptr)A, lda, + (ptr)B, ldb, + *beta, + (ptr)C, ldc, + &result, clnt); + if (retval_1 != RPC_SUCCESS) { + clnt_perror (clnt, "call failed"); + } + return result; +} + +DEF_FN(cublasStatus_t, cublasDgemm_v2_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const double*, alpha, const double*, A, int64_t, lda, const double*, B, int64_t, ldb, const double*, beta, double*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasCgemm_v2, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, int, k, const cuComplex*, alpha, const cuComplex*, A, int, lda, const cuComplex*, B, int, ldb, const cuComplex*, beta, cuComplex*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasCgemm_v2_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const cuComplex*, alpha, const cuComplex*, A, int64_t, lda, const cuComplex*, B, int64_t, ldb, const cuComplex*, beta, cuComplex*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasCgemm3m, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, 
int, m, int, n, int, k, const cuComplex*, alpha, const cuComplex*, A, int, lda, const cuComplex*, B, int, ldb, const cuComplex*, beta, cuComplex*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasCgemm3m_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const cuComplex*, alpha, const cuComplex*, A, int64_t, lda, const cuComplex*, B, int64_t, ldb, const cuComplex*, beta, cuComplex*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasCgemm3mEx, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, int, k, const cuComplex*, alpha, const void*, A, cudaDataType, Atype, int, lda, const void*, B, cudaDataType, Btype, int, ldb, const cuComplex*, beta, void*, C, cudaDataType, Ctype, int, ldc); +DEF_FN(cublasStatus_t, cublasCgemm3mEx_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const cuComplex*, alpha, const void*, A, cudaDataType, Atype, int64_t, lda, const void*, B, cudaDataType, Btype, int64_t, ldb, const cuComplex*, beta, void*, C, cudaDataType, Ctype, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasZgemm_v2, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, int, k, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int, lda, const cuDoubleComplex*, B, int, ldb, const cuDoubleComplex*, beta, cuDoubleComplex*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasZgemm_v2_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int64_t, lda, const cuDoubleComplex*, B, int64_t, ldb, const cuDoubleComplex*, beta, cuDoubleComplex*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasZgemm3m, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, int, k, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int, lda, const cuDoubleComplex*, B, int, ldb, const cuDoubleComplex*, beta, cuDoubleComplex*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasZgemm3m_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int64_t, lda, const cuDoubleComplex*, B, int64_t, ldb, const cuDoubleComplex*, beta, cuDoubleComplex*, C, int64_t, ldc); + +cublasStatus_t cublasSgemmEx(cublasHandle_t handle, + cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, + const float *alpha, + const void *A, cudaDataType_t Atype, int lda, + const void *B, cudaDataType_t Btype, int ldb, + const float *beta, + void *C, cudaDataType_t Ctype, int ldc) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + retval_1 = rpc_cublassgemmex_1( + (ptr)handle, + (int)transa, + (int)transb, + m, n, k, + *alpha, + (ptr)A, (int)Atype, lda, + (ptr)B, (int)Btype, ldb, + *beta, + (ptr)C, (int)Ctype, ldc, + &result, clnt); + if (retval_1 != RPC_SUCCESS) { + clnt_perror (clnt, "call failed"); + } + return result; +} + + +DEF_FN(cublasStatus_t, cublasSgemmEx_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const float*, alpha, const void*, A, cudaDataType, Atype, int64_t, lda, const void*, B, cudaDataType, Btype, int64_t, ldb, const float*, beta, void*, C, cudaDataType, Ctype, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasGemmEx, 
cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, int, k, const void*, alpha, const void*, A, cudaDataType, Atype, int, lda, const void*, B, cudaDataType, Btype, int, ldb, const void*, beta, void*, C, cudaDataType, Ctype, int, ldc, cublasComputeType_t, computeType, cublasGemmAlgo_t, algo); +DEF_FN(cublasStatus_t, cublasGemmEx_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const void*, alpha, const void*, A, cudaDataType, Atype, int64_t, lda, const void*, B, cudaDataType, Btype, int64_t, ldb, const void*, beta, void*, C, cudaDataType, Ctype, int64_t, ldc, cublasComputeType_t, computeType, cublasGemmAlgo_t, algo); +DEF_FN(cublasStatus_t, cublasCgemmEx, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, int, k, const cuComplex*, alpha, const void*, A, cudaDataType, Atype, int, lda, const void*, B, cudaDataType, Btype, int, ldb, const cuComplex*, beta, void*, C, cudaDataType, Ctype, int, ldc); +DEF_FN(cublasStatus_t, cublasCgemmEx_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const cuComplex*, alpha, const void*, A, cudaDataType, Atype, int64_t, lda, const void*, B, cudaDataType, Btype, int64_t, ldb, const cuComplex*, beta, void*, C, cudaDataType, Ctype, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasSsyrk_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int, n, int, k, const float*, alpha, const float*, A, int, lda, const float*, beta, float*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasSsyrk_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int64_t, n, int64_t, k, const float*, alpha, const float*, A, int64_t, lda, const float*, beta, float*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasDsyrk_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int, n, int, k, const double*, alpha, const double*, A, int, lda, const double*, beta, double*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasDsyrk_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int64_t, n, int64_t, k, const double*, alpha, const double*, A, int64_t, lda, const double*, beta, double*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasCsyrk_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int, n, int, k, const cuComplex*, alpha, const cuComplex*, A, int, lda, const cuComplex*, beta, cuComplex*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasCsyrk_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int64_t, n, int64_t, k, const cuComplex*, alpha, const cuComplex*, A, int64_t, lda, const cuComplex*, beta, cuComplex*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasZsyrk_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int, n, int, k, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int, lda, const cuDoubleComplex*, beta, cuDoubleComplex*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasZsyrk_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int64_t, n, int64_t, k, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int64_t, lda, const cuDoubleComplex*, beta, cuDoubleComplex*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasCsyrkEx, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int, n, int, k, const cuComplex*, alpha, const void*, A, cudaDataType, 
Atype, int, lda, const cuComplex*, beta, void*, C, cudaDataType, Ctype, int, ldc); +DEF_FN(cublasStatus_t, cublasCsyrkEx_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int64_t, n, int64_t, k, const cuComplex*, alpha, const void*, A, cudaDataType, Atype, int64_t, lda, const cuComplex*, beta, void*, C, cudaDataType, Ctype, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasCsyrk3mEx, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int, n, int, k, const cuComplex*, alpha, const void*, A, cudaDataType, Atype, int, lda, const cuComplex*, beta, void*, C, cudaDataType, Ctype, int, ldc); +DEF_FN(cublasStatus_t, cublasCsyrk3mEx_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int64_t, n, int64_t, k, const cuComplex*, alpha, const void*, A, cudaDataType, Atype, int64_t, lda, const cuComplex*, beta, void*, C, cudaDataType, Ctype, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasCherk_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int, n, int, k, const float*, alpha, const cuComplex*, A, int, lda, const float*, beta, cuComplex*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasCherk_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int64_t, n, int64_t, k, const float*, alpha, const cuComplex*, A, int64_t, lda, const float*, beta, cuComplex*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasZherk_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int, n, int, k, const double*, alpha, const cuDoubleComplex*, A, int, lda, const double*, beta, cuDoubleComplex*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasZherk_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int64_t, n, int64_t, k, const double*, alpha, const cuDoubleComplex*, A, int64_t, lda, const double*, beta, cuDoubleComplex*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasCherkEx, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int, n, int, k, const float*, alpha, const void*, A, cudaDataType, Atype, int, lda, const float*, beta, void*, C, cudaDataType, Ctype, int, ldc); +DEF_FN(cublasStatus_t, cublasCherkEx_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int64_t, n, int64_t, k, const float*, alpha, const void*, A, cudaDataType, Atype, int64_t, lda, const float*, beta, void*, C, cudaDataType, Ctype, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasCherk3mEx, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int, n, int, k, const float*, alpha, const void*, A, cudaDataType, Atype, int, lda, const float*, beta, void*, C, cudaDataType, Ctype, int, ldc); +DEF_FN(cublasStatus_t, cublasCherk3mEx_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int64_t, n, int64_t, k, const float*, alpha, const void*, A, cudaDataType, Atype, int64_t, lda, const float*, beta, void*, C, cudaDataType, Ctype, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasSsyr2k_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int, n, int, k, const float*, alpha, const float*, A, int, lda, const float*, B, int, ldb, const float*, beta, float*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasSsyr2k_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int64_t, n, int64_t, k, const float*, alpha, const float*, A, int64_t, lda, const float*, B, int64_t, ldb, const float*, beta, float*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasDsyr2k_v2, 
cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int, n, int, k, const double*, alpha, const double*, A, int, lda, const double*, B, int, ldb, const double*, beta, double*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasDsyr2k_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int64_t, n, int64_t, k, const double*, alpha, const double*, A, int64_t, lda, const double*, B, int64_t, ldb, const double*, beta, double*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasCsyr2k_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int, n, int, k, const cuComplex*, alpha, const cuComplex*, A, int, lda, const cuComplex*, B, int, ldb, const cuComplex*, beta, cuComplex*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasCsyr2k_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int64_t, n, int64_t, k, const cuComplex*, alpha, const cuComplex*, A, int64_t, lda, const cuComplex*, B, int64_t, ldb, const cuComplex*, beta, cuComplex*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasZsyr2k_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int, n, int, k, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int, lda, const cuDoubleComplex*, B, int, ldb, const cuDoubleComplex*, beta, cuDoubleComplex*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasZsyr2k_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int64_t, n, int64_t, k, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int64_t, lda, const cuDoubleComplex*, B, int64_t, ldb, const cuDoubleComplex*, beta, cuDoubleComplex*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasCher2k_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int, n, int, k, const cuComplex*, alpha, const cuComplex*, A, int, lda, const cuComplex*, B, int, ldb, const float*, beta, cuComplex*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasCher2k_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int64_t, n, int64_t, k, const cuComplex*, alpha, const cuComplex*, A, int64_t, lda, const cuComplex*, B, int64_t, ldb, const float*, beta, cuComplex*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasZher2k_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int, n, int, k, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int, lda, const cuDoubleComplex*, B, int, ldb, const double*, beta, cuDoubleComplex*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasZher2k_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int64_t, n, int64_t, k, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int64_t, lda, const cuDoubleComplex*, B, int64_t, ldb, const double*, beta, cuDoubleComplex*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasSsyrkx, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int, n, int, k, const float*, alpha, const float*, A, int, lda, const float*, B, int, ldb, const float*, beta, float*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasSsyrkx_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int64_t, n, int64_t, k, const float*, alpha, const float*, A, int64_t, lda, const float*, B, int64_t, ldb, const float*, beta, float*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasDsyrkx, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int, n, int, k, const double*, alpha, const double*, A, int, lda, const double*, B, int, ldb, const double*, 
beta, double*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasDsyrkx_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int64_t, n, int64_t, k, const double*, alpha, const double*, A, int64_t, lda, const double*, B, int64_t, ldb, const double*, beta, double*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasCsyrkx, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int, n, int, k, const cuComplex*, alpha, const cuComplex*, A, int, lda, const cuComplex*, B, int, ldb, const cuComplex*, beta, cuComplex*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasCsyrkx_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int64_t, n, int64_t, k, const cuComplex*, alpha, const cuComplex*, A, int64_t, lda, const cuComplex*, B, int64_t, ldb, const cuComplex*, beta, cuComplex*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasZsyrkx, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int, n, int, k, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int, lda, const cuDoubleComplex*, B, int, ldb, const cuDoubleComplex*, beta, cuDoubleComplex*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasZsyrkx_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int64_t, n, int64_t, k, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int64_t, lda, const cuDoubleComplex*, B, int64_t, ldb, const cuDoubleComplex*, beta, cuDoubleComplex*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasCherkx, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int, n, int, k, const cuComplex*, alpha, const cuComplex*, A, int, lda, const cuComplex*, B, int, ldb, const float*, beta, cuComplex*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasCherkx_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int64_t, n, int64_t, k, const cuComplex*, alpha, const cuComplex*, A, int64_t, lda, const cuComplex*, B, int64_t, ldb, const float*, beta, cuComplex*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasZherkx, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int, n, int, k, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int, lda, const cuDoubleComplex*, B, int, ldb, const double*, beta, cuDoubleComplex*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasZherkx_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int64_t, n, int64_t, k, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int64_t, lda, const cuDoubleComplex*, B, int64_t, ldb, const double*, beta, cuDoubleComplex*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasSsymm_v2, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, int, m, int, n, const float*, alpha, const float*, A, int, lda, const float*, B, int, ldb, const float*, beta, float*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasSsymm_v2_64, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, int64_t, m, int64_t, n, const float*, alpha, const float*, A, int64_t, lda, const float*, B, int64_t, ldb, const float*, beta, float*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasDsymm_v2, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, int, m, int, n, const double*, alpha, const double*, A, int, lda, const double*, B, int, ldb, const double*, beta, double*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasDsymm_v2_64, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, int64_t, m, int64_t, n, const double*, alpha, const double*, A, 
int64_t, lda, const double*, B, int64_t, ldb, const double*, beta, double*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasCsymm_v2, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, int, m, int, n, const cuComplex*, alpha, const cuComplex*, A, int, lda, const cuComplex*, B, int, ldb, const cuComplex*, beta, cuComplex*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasCsymm_v2_64, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, int64_t, m, int64_t, n, const cuComplex*, alpha, const cuComplex*, A, int64_t, lda, const cuComplex*, B, int64_t, ldb, const cuComplex*, beta, cuComplex*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasZsymm_v2, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, int, m, int, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int, lda, const cuDoubleComplex*, B, int, ldb, const cuDoubleComplex*, beta, cuDoubleComplex*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasZsymm_v2_64, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, int64_t, m, int64_t, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int64_t, lda, const cuDoubleComplex*, B, int64_t, ldb, const cuDoubleComplex*, beta, cuDoubleComplex*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasChemm_v2, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, int, m, int, n, const cuComplex*, alpha, const cuComplex*, A, int, lda, const cuComplex*, B, int, ldb, const cuComplex*, beta, cuComplex*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasChemm_v2_64, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, int64_t, m, int64_t, n, const cuComplex*, alpha, const cuComplex*, A, int64_t, lda, const cuComplex*, B, int64_t, ldb, const cuComplex*, beta, cuComplex*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasZhemm_v2, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, int, m, int, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int, lda, const cuDoubleComplex*, B, int, ldb, const cuDoubleComplex*, beta, cuDoubleComplex*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasZhemm_v2_64, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, int64_t, m, int64_t, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int64_t, lda, const cuDoubleComplex*, B, int64_t, ldb, const cuDoubleComplex*, beta, cuDoubleComplex*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasStrsm_v2, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, m, int, n, const float*, alpha, const float*, A, int, lda, float*, B, int, ldb); +DEF_FN(cublasStatus_t, cublasStrsm_v2_64, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, m, int64_t, n, const float*, alpha, const float*, A, int64_t, lda, float*, B, int64_t, ldb); +DEF_FN(cublasStatus_t, cublasDtrsm_v2, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, m, int, n, const double*, alpha, const double*, A, int, lda, double*, B, int, ldb); +DEF_FN(cublasStatus_t, cublasDtrsm_v2_64, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, m, int64_t, n, const double*, alpha, const double*, A, int64_t, lda, double*, B, int64_t, ldb); +DEF_FN(cublasStatus_t, cublasCtrsm_v2, cublasHandle_t, handle, cublasSideMode_t, side, 
cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, m, int, n, const cuComplex*, alpha, const cuComplex*, A, int, lda, cuComplex*, B, int, ldb); +DEF_FN(cublasStatus_t, cublasCtrsm_v2_64, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, m, int64_t, n, const cuComplex*, alpha, const cuComplex*, A, int64_t, lda, cuComplex*, B, int64_t, ldb); +DEF_FN(cublasStatus_t, cublasZtrsm_v2, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, m, int, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int, lda, cuDoubleComplex*, B, int, ldb); +DEF_FN(cublasStatus_t, cublasZtrsm_v2_64, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, m, int64_t, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int64_t, lda, cuDoubleComplex*, B, int64_t, ldb); +DEF_FN(cublasStatus_t, cublasStrmm_v2, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, m, int, n, const float*, alpha, const float*, A, int, lda, const float*, B, int, ldb, float*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasStrmm_v2_64, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, m, int64_t, n, const float*, alpha, const float*, A, int64_t, lda, const float*, B, int64_t, ldb, float*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasDtrmm_v2, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, m, int, n, const double*, alpha, const double*, A, int, lda, const double*, B, int, ldb, double*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasDtrmm_v2_64, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, m, int64_t, n, const double*, alpha, const double*, A, int64_t, lda, const double*, B, int64_t, ldb, double*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasCtrmm_v2, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, m, int, n, const cuComplex*, alpha, const cuComplex*, A, int, lda, const cuComplex*, B, int, ldb, cuComplex*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasCtrmm_v2_64, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, m, int64_t, n, const cuComplex*, alpha, const cuComplex*, A, int64_t, lda, const cuComplex*, B, int64_t, ldb, cuComplex*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasZtrmm_v2, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, m, int, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int, lda, const cuDoubleComplex*, B, int, ldb, cuDoubleComplex*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasZtrmm_v2_64, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, m, int64_t, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int64_t, lda, const cuDoubleComplex*, B, int64_t, ldb, cuDoubleComplex*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasSgemmBatched, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, 
transb, int, m, int, n, int, k, const float*, alpha, const float* const*, Aarray, int, lda, const float* const*, Barray, int, ldb, const float*, beta, float* const*, Carray, int, ldc, int, batchCount); +DEF_FN(cublasStatus_t, cublasSgemmBatched_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const float*, alpha, const float* const*, Aarray, int64_t, lda, const float* const*, Barray, int64_t, ldb, const float*, beta, float* const*, Carray, int64_t, ldc, int64_t, batchCount); +DEF_FN(cublasStatus_t, cublasDgemmBatched, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, int, k, const double*, alpha, const double* const*, Aarray, int, lda, const double* const*, Barray, int, ldb, const double*, beta, double* const*, Carray, int, ldc, int, batchCount); +DEF_FN(cublasStatus_t, cublasDgemmBatched_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const double*, alpha, const double* const*, Aarray, int64_t, lda, const double* const*, Barray, int64_t, ldb, const double*, beta, double* const*, Carray, int64_t, ldc, int64_t, batchCount); +DEF_FN(cublasStatus_t, cublasCgemmBatched, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, int, k, const cuComplex*, alpha, const cuComplex* const*, Aarray, int, lda, const cuComplex* const*, Barray, int, ldb, const cuComplex*, beta, cuComplex* const*, Carray, int, ldc, int, batchCount); +DEF_FN(cublasStatus_t, cublasCgemmBatched_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const cuComplex*, alpha, const cuComplex* const*, Aarray, int64_t, lda, const cuComplex* const*, Barray, int64_t, ldb, const cuComplex*, beta, cuComplex* const*, Carray, int64_t, ldc, int64_t, batchCount); +DEF_FN(cublasStatus_t, cublasCgemm3mBatched, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, int, k, const cuComplex*, alpha, const cuComplex* const*, Aarray, int, lda, const cuComplex* const*, Barray, int, ldb, const cuComplex*, beta, cuComplex* const*, Carray, int, ldc, int, batchCount); +DEF_FN(cublasStatus_t, cublasCgemm3mBatched_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const cuComplex*, alpha, const cuComplex* const*, Aarray, int64_t, lda, const cuComplex* const*, Barray, int64_t, ldb, const cuComplex*, beta, cuComplex* const*, Carray, int64_t, ldc, int64_t, batchCount); +DEF_FN(cublasStatus_t, cublasZgemmBatched, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, int, k, const cuDoubleComplex*, alpha, const cuDoubleComplex* const*, Aarray, int, lda, const cuDoubleComplex* const*, Barray, int, ldb, const cuDoubleComplex*, beta, cuDoubleComplex* const*, Carray, int, ldc, int, batchCount); +DEF_FN(cublasStatus_t, cublasZgemmBatched_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const cuDoubleComplex*, alpha, const cuDoubleComplex* const*, Aarray, int64_t, lda, const cuDoubleComplex* const*, Barray, int64_t, ldb, const cuDoubleComplex*, beta, cuDoubleComplex* const*, Carray, int64_t, ldc, int64_t, batchCount); +DEF_FN(cublasStatus_t, cublasSgemmStridedBatched, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, int, k, const 
float*, alpha, const float*, A, int, lda, long long int, strideA, const float*, B, int, ldb, long long int, strideB, const float*, beta, float*, C, int, ldc, long long int, strideC, int, batchCount); +DEF_FN(cublasStatus_t, cublasSgemmStridedBatched_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const float*, alpha, const float*, A, int64_t, lda, long long int, strideA, const float*, B, int64_t, ldb, long long int, strideB, const float*, beta, float*, C, int64_t, ldc, long long int, strideC, int64_t, batchCount); +DEF_FN(cublasStatus_t, cublasDgemmStridedBatched, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, int, k, const double*, alpha, const double*, A, int, lda, long long int, strideA, const double*, B, int, ldb, long long int, strideB, const double*, beta, double*, C, int, ldc, long long int, strideC, int, batchCount); +DEF_FN(cublasStatus_t, cublasDgemmStridedBatched_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const double*, alpha, const double*, A, int64_t, lda, long long int, strideA, const double*, B, int64_t, ldb, long long int, strideB, const double*, beta, double*, C, int64_t, ldc, long long int, strideC, int64_t, batchCount); +DEF_FN(cublasStatus_t, cublasCgemmStridedBatched, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, int, k, const cuComplex*, alpha, const cuComplex*, A, int, lda, long long int, strideA, const cuComplex*, B, int, ldb, long long int, strideB, const cuComplex*, beta, cuComplex*, C, int, ldc, long long int, strideC, int, batchCount); +DEF_FN(cublasStatus_t, cublasCgemmStridedBatched_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const cuComplex*, alpha, const cuComplex*, A, int64_t, lda, long long int, strideA, const cuComplex*, B, int64_t, ldb, long long int, strideB, const cuComplex*, beta, cuComplex*, C, int64_t, ldc, long long int, strideC, int64_t, batchCount); +DEF_FN(cublasStatus_t, cublasCgemm3mStridedBatched, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, int, k, const cuComplex*, alpha, const cuComplex*, A, int, lda, long long int, strideA, const cuComplex*, B, int, ldb, long long int, strideB, const cuComplex*, beta, cuComplex*, C, int, ldc, long long int, strideC, int, batchCount); +DEF_FN(cublasStatus_t, cublasCgemm3mStridedBatched_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const cuComplex*, alpha, const cuComplex*, A, int64_t, lda, long long int, strideA, const cuComplex*, B, int64_t, ldb, long long int, strideB, const cuComplex*, beta, cuComplex*, C, int64_t, ldc, long long int, strideC, int64_t, batchCount); +DEF_FN(cublasStatus_t, cublasZgemmStridedBatched, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, int, k, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int, lda, long long int, strideA, const cuDoubleComplex*, B, int, ldb, long long int, strideB, const cuDoubleComplex*, beta, cuDoubleComplex*, C, int, ldc, long long int, strideC, int, batchCount); +DEF_FN(cublasStatus_t, cublasZgemmStridedBatched_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const cuDoubleComplex*, alpha, const 
cuDoubleComplex*, A, int64_t, lda, long long int, strideA, const cuDoubleComplex*, B, int64_t, ldb, long long int, strideB, const cuDoubleComplex*, beta, cuDoubleComplex*, C, int64_t, ldc, long long int, strideC, int64_t, batchCount); +DEF_FN(cublasStatus_t, cublasGemmBatchedEx, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, int, k, const void*, alpha, const void* const*, Aarray, cudaDataType, Atype, int, lda, const void* const*, Barray, cudaDataType, Btype, int, ldb, const void*, beta, void* const*, Carray, cudaDataType, Ctype, int, ldc, int, batchCount, cublasComputeType_t, computeType, cublasGemmAlgo_t, algo); +DEF_FN(cublasStatus_t, cublasGemmBatchedEx_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const void*, alpha, const void* const*, Aarray, cudaDataType, Atype, int64_t, lda, const void* const*, Barray, cudaDataType, Btype, int64_t, ldb, const void*, beta, void* const*, Carray, cudaDataType, Ctype, int64_t, ldc, int64_t, batchCount, cublasComputeType_t, computeType, cublasGemmAlgo_t, algo); +DEF_FN(cublasStatus_t, cublasGemmStridedBatchedEx, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, int, k, const void*, alpha, const void*, A, cudaDataType, Atype, int, lda, long long int, strideA, const void*, B, cudaDataType, Btype, int, ldb, long long int, strideB, const void*, beta, void*, C, cudaDataType, Ctype, int, ldc, long long int, strideC, int, batchCount, cublasComputeType_t, computeType, cublasGemmAlgo_t, algo); +DEF_FN(cublasStatus_t, cublasGemmStridedBatchedEx_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const void*, alpha, const void*, A, cudaDataType, Atype, int64_t, lda, long long int, strideA, const void*, B, cudaDataType, Btype, int64_t, ldb, long long int, strideB, const void*, beta, void*, C, cudaDataType, Ctype, int64_t, ldc, long long int, strideC, int64_t, batchCount, cublasComputeType_t, computeType, cublasGemmAlgo_t, algo); +DEF_FN(cublasStatus_t, cublasSgeam, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, const float*, alpha, const float*, A, int, lda, const float*, beta, const float*, B, int, ldb, float*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasSgeam_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, const float*, alpha, const float*, A, int64_t, lda, const float*, beta, const float*, B, int64_t, ldb, float*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasDgeam, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, const double*, alpha, const double*, A, int, lda, const double*, beta, const double*, B, int, ldb, double*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasDgeam_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, const double*, alpha, const double*, A, int64_t, lda, const double*, beta, const double*, B, int64_t, ldb, double*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasCgeam, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, const cuComplex*, alpha, const cuComplex*, A, int, lda, const cuComplex*, beta, const cuComplex*, B, int, ldb, cuComplex*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasCgeam_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, 
int64_t, m, int64_t, n, const cuComplex*, alpha, const cuComplex*, A, int64_t, lda, const cuComplex*, beta, const cuComplex*, B, int64_t, ldb, cuComplex*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasZgeam, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int, lda, const cuDoubleComplex*, beta, const cuDoubleComplex*, B, int, ldb, cuDoubleComplex*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasZgeam_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int64_t, lda, const cuDoubleComplex*, beta, const cuDoubleComplex*, B, int64_t, ldb, cuDoubleComplex*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasStrsmBatched, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, m, int, n, const float*, alpha, const float* const*, A, int, lda, float* const*, B, int, ldb, int, batchCount); +DEF_FN(cublasStatus_t, cublasStrsmBatched_64, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, m, int64_t, n, const float*, alpha, const float* const*, A, int64_t, lda, float* const*, B, int64_t, ldb, int64_t, batchCount); +DEF_FN(cublasStatus_t, cublasDtrsmBatched, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, m, int, n, const double*, alpha, const double* const*, A, int, lda, double* const*, B, int, ldb, int, batchCount); +DEF_FN(cublasStatus_t, cublasDtrsmBatched_64, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, m, int64_t, n, const double*, alpha, const double* const*, A, int64_t, lda, double* const*, B, int64_t, ldb, int64_t, batchCount); +DEF_FN(cublasStatus_t, cublasCtrsmBatched, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, m, int, n, const cuComplex*, alpha, const cuComplex* const*, A, int, lda, cuComplex* const*, B, int, ldb, int, batchCount); +DEF_FN(cublasStatus_t, cublasCtrsmBatched_64, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, m, int64_t, n, const cuComplex*, alpha, const cuComplex* const*, A, int64_t, lda, cuComplex* const*, B, int64_t, ldb, int64_t, batchCount); +DEF_FN(cublasStatus_t, cublasZtrsmBatched, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, m, int, n, const cuDoubleComplex*, alpha, const cuDoubleComplex* const*, A, int, lda, cuDoubleComplex* const*, B, int, ldb, int, batchCount); +DEF_FN(cublasStatus_t, cublasZtrsmBatched_64, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, m, int64_t, n, const cuDoubleComplex*, alpha, const cuDoubleComplex* const*, A, int64_t, lda, cuDoubleComplex* const*, B, int64_t, ldb, int64_t, batchCount); +DEF_FN(cublasStatus_t, cublasSdgmm, cublasHandle_t, handle, cublasSideMode_t, mode, int, m, int, n, const float*, A, int, lda, const float*, x, int, incx, float*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasSdgmm_64, cublasHandle_t, handle, cublasSideMode_t, mode, int64_t, m, int64_t, n, const float*, A, int64_t, 
lda, const float*, x, int64_t, incx, float*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasDdgmm, cublasHandle_t, handle, cublasSideMode_t, mode, int, m, int, n, const double*, A, int, lda, const double*, x, int, incx, double*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasDdgmm_64, cublasHandle_t, handle, cublasSideMode_t, mode, int64_t, m, int64_t, n, const double*, A, int64_t, lda, const double*, x, int64_t, incx, double*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasCdgmm, cublasHandle_t, handle, cublasSideMode_t, mode, int, m, int, n, const cuComplex*, A, int, lda, const cuComplex*, x, int, incx, cuComplex*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasCdgmm_64, cublasHandle_t, handle, cublasSideMode_t, mode, int64_t, m, int64_t, n, const cuComplex*, A, int64_t, lda, const cuComplex*, x, int64_t, incx, cuComplex*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasZdgmm, cublasHandle_t, handle, cublasSideMode_t, mode, int, m, int, n, const cuDoubleComplex*, A, int, lda, const cuDoubleComplex*, x, int, incx, cuDoubleComplex*, C, int, ldc); +DEF_FN(cublasStatus_t, cublasZdgmm_64, cublasHandle_t, handle, cublasSideMode_t, mode, int64_t, m, int64_t, n, const cuDoubleComplex*, A, int64_t, lda, const cuDoubleComplex*, x, int64_t, incx, cuDoubleComplex*, C, int64_t, ldc); +DEF_FN(cublasStatus_t, cublasSmatinvBatched, cublasHandle_t, handle, int, n, const float* const*, A, int, lda, float* const*, Ainv, int, lda_inv, int*, info, int, batchSize); +DEF_FN(cublasStatus_t, cublasDmatinvBatched, cublasHandle_t, handle, int, n, const double* const*, A, int, lda, double* const*, Ainv, int, lda_inv, int*, info, int, batchSize); +DEF_FN(cublasStatus_t, cublasCmatinvBatched, cublasHandle_t, handle, int, n, const cuComplex* const*, A, int, lda, cuComplex* const*, Ainv, int, lda_inv, int*, info, int, batchSize); +DEF_FN(cublasStatus_t, cublasZmatinvBatched, cublasHandle_t, handle, int, n, const cuDoubleComplex* const*, A, int, lda, cuDoubleComplex* const*, Ainv, int, lda_inv, int*, info, int, batchSize); +DEF_FN(cublasStatus_t, cublasSgeqrfBatched, cublasHandle_t, handle, int, m, int, n, float* const*, Aarray, int, lda, float* const*, TauArray, int*, info, int, batchSize); +DEF_FN(cublasStatus_t, cublasDgeqrfBatched, cublasHandle_t, handle, int, m, int, n, double* const*, Aarray, int, lda, double* const*, TauArray, int*, info, int, batchSize); +DEF_FN(cublasStatus_t, cublasCgeqrfBatched, cublasHandle_t, handle, int, m, int, n, cuComplex* const*, Aarray, int, lda, cuComplex* const*, TauArray, int*, info, int, batchSize); +DEF_FN(cublasStatus_t, cublasZgeqrfBatched, cublasHandle_t, handle, int, m, int, n, cuDoubleComplex* const*, Aarray, int, lda, cuDoubleComplex* const*, TauArray, int*, info, int, batchSize); +DEF_FN(cublasStatus_t, cublasSgelsBatched, cublasHandle_t, handle, cublasOperation_t, trans, int, m, int, n, int, nrhs, float* const*, Aarray, int, lda, float* const*, Carray, int, ldc, int*, info, int*, devInfoArray, int, batchSize); +DEF_FN(cublasStatus_t, cublasDgelsBatched, cublasHandle_t, handle, cublasOperation_t, trans, int, m, int, n, int, nrhs, double* const*, Aarray, int, lda, double* const*, Carray, int, ldc, int*, info, int*, devInfoArray, int, batchSize); +DEF_FN(cublasStatus_t, cublasCgelsBatched, cublasHandle_t, handle, cublasOperation_t, trans, int, m, int, n, int, nrhs, cuComplex* const*, Aarray, int, lda, cuComplex* const*, Carray, int, ldc, int*, info, int*, devInfoArray, int, batchSize); +DEF_FN(cublasStatus_t, cublasZgelsBatched, cublasHandle_t, handle, cublasOperation_t, trans, 
int, m, int, n, int, nrhs, cuDoubleComplex* const*, Aarray, int, lda, cuDoubleComplex* const*, Carray, int, ldc, int*, info, int*, devInfoArray, int, batchSize); +DEF_FN(cublasStatus_t, cublasStpttr, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const float*, AP, float*, A, int, lda); +DEF_FN(cublasStatus_t, cublasDtpttr, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const double*, AP, double*, A, int, lda); +DEF_FN(cublasStatus_t, cublasCtpttr, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const cuComplex*, AP, cuComplex*, A, int, lda); +DEF_FN(cublasStatus_t, cublasZtpttr, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const cuDoubleComplex*, AP, cuDoubleComplex*, A, int, lda); +DEF_FN(cublasStatus_t, cublasStrttp, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const float*, A, int, lda, float*, AP); +DEF_FN(cublasStatus_t, cublasDtrttp, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const double*, A, int, lda, double*, AP); +DEF_FN(cublasStatus_t, cublasCtrttp, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const cuComplex*, A, int, lda, cuComplex*, AP); +DEF_FN(cublasStatus_t, cublasZtrttp, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const cuDoubleComplex*, A, int, lda, cuDoubleComplex*, AP); +DEF_FN(cublasStatus_t, cublasSgetrfBatched, cublasHandle_t, handle, int, n, float* const*, A, int, lda, int*, P, int*, info, int, batchSize); +DEF_FN(cublasStatus_t, cublasDgetrfBatched, cublasHandle_t, handle, int, n, double* const*, A, int, lda, int*, P, int*, info, int, batchSize); +DEF_FN(cublasStatus_t, cublasCgetrfBatched, cublasHandle_t, handle, int, n, cuComplex* const*, A, int, lda, int*, P, int*, info, int, batchSize); +DEF_FN(cublasStatus_t, cublasZgetrfBatched, cublasHandle_t, handle, int, n, cuDoubleComplex* const*, A, int, lda, int*, P, int*, info, int, batchSize); +DEF_FN(cublasStatus_t, cublasSgetriBatched, cublasHandle_t, handle, int, n, const float* const*, A, int, lda, const int*, P, float* const*, C, int, ldc, int*, info, int, batchSize); +DEF_FN(cublasStatus_t, cublasDgetriBatched, cublasHandle_t, handle, int, n, const double* const*, A, int, lda, const int*, P, double* const*, C, int, ldc, int*, info, int, batchSize); +DEF_FN(cublasStatus_t, cublasCgetriBatched, cublasHandle_t, handle, int, n, const cuComplex* const*, A, int, lda, const int*, P, cuComplex* const*, C, int, ldc, int*, info, int, batchSize); +DEF_FN(cublasStatus_t, cublasZgetriBatched, cublasHandle_t, handle, int, n, const cuDoubleComplex* const*, A, int, lda, const int*, P, cuDoubleComplex* const*, C, int, ldc, int*, info, int, batchSize); +DEF_FN(cublasStatus_t, cublasSgetrsBatched, cublasHandle_t, handle, cublasOperation_t, trans, int, n, int, nrhs, const float* const*, Aarray, int, lda, const int*, devIpiv, float* const*, Barray, int, ldb, int*, info, int, batchSize); +DEF_FN(cublasStatus_t, cublasDgetrsBatched, cublasHandle_t, handle, cublasOperation_t, trans, int, n, int, nrhs, const double* const*, Aarray, int, lda, const int*, devIpiv, double* const*, Barray, int, ldb, int*, info, int, batchSize); +DEF_FN(cublasStatus_t, cublasCgetrsBatched, cublasHandle_t, handle, cublasOperation_t, trans, int, n, int, nrhs, const cuComplex* const*, Aarray, int, lda, const int*, devIpiv, cuComplex* const*, Barray, int, ldb, int*, info, int, batchSize); +DEF_FN(cublasStatus_t, cublasZgetrsBatched, cublasHandle_t, handle, cublasOperation_t, trans, int, n, int, nrhs, const cuDoubleComplex* const*, Aarray, int, lda, const int*, devIpiv, 
cuDoubleComplex* const*, Barray, int, ldb, int*, info, int, batchSize); diff --git a/cpu/cpu-client-cudnn.c b/cpu/cpu-client-cudnn.c new file mode 100644 index 00000000..05136fe2 --- /dev/null +++ b/cpu/cpu-client-cudnn.c @@ -0,0 +1,1854 @@ +#include <cudnn.h> +#include <stdlib.h> +#include <string.h> + +#include "cpu-libwrap.h" +#include "cpu_rpc_prot.h" +#include "cpu-common.h" +#include "cpu-utils.h" +#include "log.h" + +#ifdef WITH_API_CNT +extern int api_call_cnt; +#endif //WITH_API_CNT + +size_t cudnnGetVersion(void) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + size_t result; + enum clnt_stat retval_1; + retval_1 = rpc_cudnngetversion_1(&result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + return result; +} + +size_t cudnnGetMaxDeviceVersion(void) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + size_t result; + enum clnt_stat retval_1; + retval_1 = rpc_cudnngetmaxdeviceversion_1(&result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + return result; +} +size_t cudnnGetCudartVersion(void) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + size_t result; + enum clnt_stat retval_1; + retval_1 = rpc_cudnngetcudartversion_1(&result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + return result; +} + +const char *cudnnGetErrorString(cudnnStatus_t status) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + static char str[128]; + char *result = NULL; + enum clnt_stat retval_1; + retval_1 = rpc_cudnngeterrorstring_1((int)status, &result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result == NULL) { + LOGE(LOG_ERROR, "%s failed (result is NULL)", __FUNCTION__); + } + strncpy(str, result, 128); + return str; +} + +cudnnStatus_t cudnnQueryRuntimeError(cudnnHandle_t handle, cudnnStatus_t* rstatus, cudnnErrQueryMode_t mode, cudnnRuntimeTag_t * tag) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int_result result; + enum clnt_stat retval_1; + retval_1 = rpc_cudnnqueryruntimeerror_1((ptr)handle, (int)mode, &result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result.err != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err); + } else { + *rstatus = (cudnnStatus_t)result.int_result_u.data; + //*tag = NULL; + } + return result.err; +} + +cudnnStatus_t cudnnGetProperty(libraryPropertyType type, int * value) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int_result result; + enum clnt_stat retval_1; + if (value == NULL) { + LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__); + return CUDNN_STATUS_BAD_PARAM; + } + retval_1 = rpc_cudnngetproperty_1((int)type, &result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result.err != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err); + } else { + *value = result.int_result_u.data; + } + return result.err; +} + +cudnnStatus_t cudnnCreate(cudnnHandle_t* handle) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + ptr_result result; + enum clnt_stat retval_1; + if (handle == NULL) { + LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__); + return CUDNN_STATUS_BAD_PARAM; + } + retval_1 = 
rpc_cudnncreate_1(&result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result.err != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err); + } else { + *handle = (cudnnHandle_t)result.ptr_result_u.ptr; + } + return result.err; +} + +cudnnStatus_t cudnnDestroy(cudnnHandle_t handle) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + retval_1 = rpc_cudnndestroy_1((ptr)handle, &result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } + return result; +} + +cudnnStatus_t cudnnSetStream(cudnnHandle_t handle, cudaStream_t streamId) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + retval_1 = rpc_cudnnsetstream_1((ptr)handle, (ptr)streamId, &result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } + return result; +} + +cudnnStatus_t cudnnGetStream(cudnnHandle_t handle, cudaStream_t * streamId) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + ptr_result result; + enum clnt_stat retval_1; + if (streamId == NULL) { + LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__); + return CUDNN_STATUS_BAD_PARAM; + } + retval_1 = rpc_cudnngetstream_1((ptr)handle, &result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result.err != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err); + } else { + *streamId = (cudaStream_t)result.ptr_result_u.ptr; + } + return result.err; +} + +cudnnStatus_t cudnnCreateTensorDescriptor(cudnnTensorDescriptor_t * tensorDesc) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + ptr_result result; + enum clnt_stat retval_1; + if (tensorDesc == NULL) { + LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__); + return CUDNN_STATUS_BAD_PARAM; + } + retval_1 = rpc_cudnncreatetensordescriptor_1(&result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result.err != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err); + } else { + *tensorDesc = (cudnnTensorDescriptor_t)result.ptr_result_u.ptr; + } + return result.err; +} + +cudnnStatus_t cudnnSetTensor4dDescriptor(cudnnTensorDescriptor_t tensorDesc, cudnnTensorFormat_t format, cudnnDataType_t dataType, int n, int c, int h, int w) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + retval_1 = rpc_cudnnsettensor4ddescriptor_1( + (ptr)tensorDesc, + (int)format, + (int)dataType, + n, c, h, w, &result, clnt); + + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } + return result; +} + +cudnnStatus_t cudnnSetTensor4dDescriptorEx(cudnnTensorDescriptor_t tensorDesc, cudnnDataType_t dataType, int n, int c, int h, int w, int nStride, int cStride, int hStride, int wStride) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif 
//WITH_API_CNT + int result; + enum clnt_stat retval_1; + retval_1 = rpc_cudnnsettensor4ddescriptorex_1( + (ptr)tensorDesc, + (int)dataType, + n, c, h, w, nStride, cStride, hStride, wStride, &result, clnt); + + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } + return result; +} + +cudnnStatus_t cudnnGetTensor4dDescriptor(const cudnnTensorDescriptor_t tensorDesc, cudnnDataType_t *dataType, int* n, int* c, int* h, int* w, int* nStride, int* cStride, int* hStride, int* wStride) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int9_result result; + enum clnt_stat retval_1; + if (dataType == NULL || n == NULL || c == NULL || h == NULL || w == NULL || nStride == NULL || cStride == NULL || hStride == NULL || wStride == NULL) { + LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__); + return CUDNN_STATUS_BAD_PARAM; + } + retval_1 = rpc_cudnngettensor4ddescriptor_1( + (ptr)tensorDesc, + &result, clnt); + + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result.err != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } else { + *dataType = (cudnnDataType_t)result.int9_result_u.data[0]; + *n = result.int9_result_u.data[1]; + *c = result.int9_result_u.data[2]; + *h = result.int9_result_u.data[3]; + *w = result.int9_result_u.data[4]; + *nStride = result.int9_result_u.data[5]; + *cStride = result.int9_result_u.data[6]; + *hStride = result.int9_result_u.data[7]; + *wStride = result.int9_result_u.data[8]; + } + return result.err; +} + +cudnnStatus_t cudnnSetTensorNdDescriptor(cudnnTensorDescriptor_t tensorDesc, cudnnDataType_t dataType, int nbDims, const int* dimA, const int* strideA) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + mem_data rpc_dimA = { + .mem_data_len = nbDims * sizeof(int), + .mem_data_val = (char*)dimA + }; + mem_data rpc_strideA = { + .mem_data_len = nbDims * sizeof(int), + .mem_data_val = (char*)strideA + }; + retval_1 = rpc_cudnnsettensornddescriptor_1( + (ptr)tensorDesc, + (int)dataType, + (int)nbDims, + rpc_dimA, rpc_strideA, &result, clnt); + + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } + return result; +} + +cudnnStatus_t cudnnSetTensorNdDescriptorEx(cudnnTensorDescriptor_t tensorDesc, cudnnTensorFormat_t format, cudnnDataType_t dataType, int nbDims, const int* dimA) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + mem_data rpc_dimA = { + .mem_data_len = nbDims * sizeof(int), + .mem_data_val = (char*)dimA + }; + retval_1 = rpc_cudnnsettensornddescriptorex_1( + (ptr)tensorDesc, + (int)format, + (int)dataType, + (int)nbDims, + rpc_dimA, &result, clnt); + + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } + return result; +} + +cudnnStatus_t cudnnGetTensorNdDescriptor(const cudnnTensorDescriptor_t tensorDesc, int nbDimsRequested, cudnnDataType_t *dataType, int* nbDims, int* dimA, int* strideA) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + size_t 
expected_size = nbDimsRequested * sizeof(int) * 2 + sizeof(int) + sizeof(cudnnDataType_t); + mem_result result; + result.mem_result_u.data.mem_data_val = malloc(expected_size); + enum clnt_stat retval_1; + if (dataType == NULL || nbDims == NULL || dimA == NULL || strideA == NULL) { + LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__); + return CUDNN_STATUS_BAD_PARAM; + } + retval_1 = rpc_cudnngettensornddescriptor_1( + (ptr)tensorDesc, + nbDimsRequested, + &result, clnt); + + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result.err != CUDNN_STATUS_SUCCESS || result.mem_result_u.data.mem_data_len != expected_size) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err); + } else { + size_t offset = 0; + *dataType = (cudnnDataType_t)result.mem_result_u.data.mem_data_val[offset]; + offset += sizeof(cudnnDataType_t); + *nbDims = (int)result.mem_result_u.data.mem_data_val[offset]; + offset += sizeof(int); + memcpy(dimA, result.mem_result_u.data.mem_data_val+offset, *nbDims * sizeof(int)); + offset += *nbDims * sizeof(int); + memcpy(strideA, result.mem_result_u.data.mem_data_val+offset, *nbDims * sizeof(int)); + } + free(result.mem_result_u.data.mem_data_val); + return result.err; +} + +cudnnStatus_t cudnnGetTensorSizeInBytes(const cudnnTensorDescriptor_t tensorDesc, size_t* size) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + sz_result result; + enum clnt_stat retval_1; + if (size == NULL) { + LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__); + return CUDNN_STATUS_BAD_PARAM; + } + retval_1 = rpc_cudnngettensorsizeinbytes_1( + (ptr)tensorDesc, + &result, clnt); + + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result.err != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err); + } else { + *size = result.sz_result_u.data; + } + return result.err; +} + +cudnnStatus_t cudnnDestroyTensorDescriptor(cudnnTensorDescriptor_t tensorDesc) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + retval_1 = rpc_cudnndestroytensordescriptor_1( + (ptr)tensorDesc, + &result, clnt); + + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } + return result; +} + +DEF_FN(cudnnStatus_t, cudnnInitTransformDest, const cudnnTensorTransformDescriptor_t, transformDesc, const cudnnTensorDescriptor_t, srcDesc, cudnnTensorDescriptor_t, destDesc, size_t*, destSizeInBytes) +DEF_FN(cudnnStatus_t, cudnnCreateTensorTransformDescriptor, cudnnTensorTransformDescriptor_t *, transformDesc) +DEF_FN(cudnnStatus_t, cudnnSetTensorTransformDescriptor, cudnnTensorTransformDescriptor_t, transformDesc, const uint32_t, nbDims, const cudnnTensorFormat_t, destFormat, const int32_t*, padBeforeA, const int32_t*, padAfterA, const uint32_t*, foldA, const cudnnFoldingDirection_t, direction) +DEF_FN(cudnnStatus_t, cudnnGetTensorTransformDescriptor, cudnnTensorTransformDescriptor_t, transformDesc, uint32_t, nbDimsRequested, cudnnTensorFormat_t *, destFormat, int32_t*, padBeforeA, int32_t*, padAfterA, uint32_t*, foldA, cudnnFoldingDirection_t *, direction) +DEF_FN(cudnnStatus_t, cudnnDestroyTensorTransformDescriptor, cudnnTensorTransformDescriptor_t, transformDesc) + +cudnnStatus_t cudnnTransformTensor(cudnnHandle_t handle, const void * 
alpha, const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta, const cudnnTensorDescriptor_t yDesc, void *y) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + //TODO: Check if we have a float instead of always sending doubles + cudnn_scaling_t rpc_alpha = {.dataType = CUDNN_DATA_DOUBLE, .cudnn_scaling_t_u.d = *((double*)alpha)}; + cudnn_scaling_t rpc_beta = {.dataType = CUDNN_DATA_DOUBLE, .cudnn_scaling_t_u.d = *((double*)beta)}; + retval_1 = rpc_cudnntransformtensor_1( + (ptr)handle, + rpc_alpha, + (ptr)xDesc, + (ptr)x, + rpc_beta, + (ptr)yDesc, + (ptr)y, + &result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } + return result; +} + +DEF_FN(cudnnStatus_t, cudnnTransformTensorEx, cudnnHandle_t, handle, const cudnnTensorTransformDescriptor_t, transDesc, const void *, alpha, const cudnnTensorDescriptor_t, srcDesc, const void *, srcData, const void *, beta, const cudnnTensorDescriptor_t, destDesc, void *, destData) + +cudnnStatus_t cudnnAddTensor(cudnnHandle_t handle, const void * alpha, const cudnnTensorDescriptor_t aDesc, const void * A, const void *beta, const cudnnTensorDescriptor_t cDesc, void * C) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + //TODO: Check if we have a float instead of always sending doubles + cudnn_scaling_t rpc_alpha = {.dataType = CUDNN_DATA_DOUBLE, .cudnn_scaling_t_u.d = *((double*)alpha)}; + cudnn_scaling_t rpc_beta = {.dataType = CUDNN_DATA_DOUBLE, .cudnn_scaling_t_u.d = *((double*)beta)}; + retval_1 = rpc_cudnnaddtensor_1( + (ptr)handle, + rpc_alpha, + (ptr)aDesc, + (ptr)A, + rpc_beta, + (ptr)cDesc, + (ptr)C, + &result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } + return result; +} + +DEF_FN(cudnnStatus_t, cudnnCreateOpTensorDescriptor, cudnnOpTensorDescriptor_t *, opTensorDesc) +DEF_FN(cudnnStatus_t, cudnnSetOpTensorDescriptor, cudnnOpTensorDescriptor_t, opTensorDesc, cudnnOpTensorOp_t, opTensorOp, cudnnDataType_t, opTensorCompType, cudnnNanPropagation_t, opTensorNanOpt) +DEF_FN(cudnnStatus_t, cudnnGetOpTensorDescriptor, const cudnnOpTensorDescriptor_t, opTensorDesc, cudnnOpTensorOp_t *, opTensorOp, cudnnDataType_t *, opTensorCompType, cudnnNanPropagation_t *, opTensorNanOpt) +DEF_FN(cudnnStatus_t, cudnnDestroyOpTensorDescriptor, cudnnOpTensorDescriptor_t, opTensorDesc) +DEF_FN(cudnnStatus_t, cudnnOpTensor, cudnnHandle_t, handle, const cudnnOpTensorDescriptor_t, opTensorDesc, const void *, alpha1, const cudnnTensorDescriptor_t, aDesc, const void *, A, const void *, alpha2, const cudnnTensorDescriptor_t, bDesc, const void *, B, const void *, beta, const cudnnTensorDescriptor_t, cDesc, void *, C) +DEF_FN(cudnnStatus_t, cudnnCreateReduceTensorDescriptor, cudnnReduceTensorDescriptor_t *, reduceTensorDesc) +DEF_FN(cudnnStatus_t, cudnnSetReduceTensorDescriptor, cudnnReduceTensorDescriptor_t, reduceTensorDesc, cudnnReduceTensorOp_t, reduceTensorOp, cudnnDataType_t, reduceTensorCompType, cudnnNanPropagation_t, reduceTensorNanOpt, cudnnReduceTensorIndices_t, reduceTensorIndices, cudnnIndicesType_t, reduceTensorIndicesType) +DEF_FN(cudnnStatus_t, cudnnGetReduceTensorDescriptor, const 
cudnnReduceTensorDescriptor_t, reduceTensorDesc, cudnnReduceTensorOp_t *, reduceTensorOp, cudnnDataType_t *, reduceTensorCompType, cudnnNanPropagation_t *, reduceTensorNanOpt, cudnnReduceTensorIndices_t *, reduceTensorIndices, cudnnIndicesType_t *, reduceTensorIndicesType) +DEF_FN(cudnnStatus_t, cudnnDestroyReduceTensorDescriptor, cudnnReduceTensorDescriptor_t, reduceTensorDesc) +DEF_FN(cudnnStatus_t, cudnnGetReductionIndicesSize, cudnnHandle_t, handle, const cudnnReduceTensorDescriptor_t, reduceTensorDesc, const cudnnTensorDescriptor_t, aDesc, const cudnnTensorDescriptor_t, cDesc, size_t*, sizeInBytes) +DEF_FN(cudnnStatus_t, cudnnGetReductionWorkspaceSize, cudnnHandle_t, handle, const cudnnReduceTensorDescriptor_t, reduceTensorDesc, const cudnnTensorDescriptor_t, aDesc, const cudnnTensorDescriptor_t, cDesc, size_t*, sizeInBytes) +DEF_FN(cudnnStatus_t, cudnnReduceTensor, cudnnHandle_t, handle, const cudnnReduceTensorDescriptor_t, reduceTensorDesc, void *, indices, size_t, indicesSizeInBytes, void *, workspace, size_t, workspaceSizeInBytes, const void *, alpha, const cudnnTensorDescriptor_t, aDesc, const void *, A, const void *, beta, const cudnnTensorDescriptor_t, cDesc, void *, C) +DEF_FN(cudnnStatus_t, cudnnSetTensor, cudnnHandle_t, handle, const cudnnTensorDescriptor_t, yDesc, void *, y, const void *, valuePtr) +DEF_FN(cudnnStatus_t, cudnnScaleTensor, cudnnHandle_t, handle, const cudnnTensorDescriptor_t, yDesc, void *, y, const void *, alpha) + +cudnnStatus_t cudnnCreateFilterDescriptor(cudnnFilterDescriptor_t * filterDesc) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + ptr_result result; + enum clnt_stat retval_1; + if (filterDesc == NULL) { + LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__); + return CUDNN_STATUS_BAD_PARAM; + } + retval_1 = rpc_cudnncreatefilterdescriptor_1(&result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result.err != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err); + } else { + *filterDesc = (cudnnFilterDescriptor_t)result.ptr_result_u.ptr; + } + return result.err; +} + +cudnnStatus_t cudnnSetFilter4dDescriptor(cudnnFilterDescriptor_t filterDesc, cudnnDataType_t dataType, cudnnTensorFormat_t format, int k, int c, int h, int w) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + retval_1 = rpc_cudnnsetfilter4ddescriptor_1( + (ptr)filterDesc, + (int)dataType, + (int)format, + k, c, h, w, &result, clnt); + + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } + return result; +} + +cudnnStatus_t cudnnGetFilter4dDescriptor(const cudnnFilterDescriptor_t filterDesc, cudnnDataType_t *dataType, cudnnTensorFormat_t *format, int* k, int* c, int* h, int* w) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int6_result result; + enum clnt_stat retval_1; + if (dataType == NULL || format == NULL || k == NULL || c == NULL || h == NULL || w == NULL) { + LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__); + return CUDNN_STATUS_BAD_PARAM; + } + retval_1 = rpc_cudnngetfilter4ddescriptor_1( + (ptr)filterDesc, + &result, clnt); + + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result.err != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result 
is %d)", __FUNCTION__, result); + } else { + *dataType = (cudnnDataType_t)result.int6_result_u.data[0]; + *format = (cudnnTensorFormat_t)result.int6_result_u.data[1]; + *k = result.int6_result_u.data[2]; + *c = result.int6_result_u.data[3]; + *h = result.int6_result_u.data[4]; + *w = result.int6_result_u.data[5]; + } + return result.err; +} + +cudnnStatus_t cudnnSetFilterNdDescriptor(cudnnFilterDescriptor_t filterDesc, cudnnDataType_t dataType, cudnnTensorFormat_t format, int nbDims, const int* filterDimA) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + mem_data rpc_filterDimA = { + .mem_data_len = nbDims * sizeof(int), + .mem_data_val = (char*)filterDimA + }; + retval_1 = rpc_cudnnsetfilternddescriptor_1( + (ptr)filterDesc, + (int)dataType, + (int)format, + (int)nbDims, + rpc_filterDimA, &result, clnt); + + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } + return result; +} + +cudnnStatus_t cudnnGetFilterNdDescriptor(const cudnnFilterDescriptor_t filterDesc, int nbDimsRequested, cudnnDataType_t * dataType, cudnnTensorFormat_t * format, int* nbDims, int* filterDimA) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + size_t expected_size = nbDimsRequested * sizeof(int) + sizeof(int) + sizeof(cudnnDataType_t) + sizeof(cudnnTensorFormat_t); + mem_result result; + result.mem_result_u.data.mem_data_val = (char*)malloc(expected_size); + enum clnt_stat retval_1; + if (dataType == NULL || format == NULL || nbDims == NULL || filterDimA == NULL) { + LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__); + return CUDNN_STATUS_BAD_PARAM; + } + retval_1 = rpc_cudnngetfilternddescriptor_1( + (ptr)filterDesc, + nbDimsRequested, + &result, clnt); + + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result.err != CUDNN_STATUS_SUCCESS || result.mem_result_u.data.mem_data_len < expected_size) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err); + } else { + size_t offset = 0; + *dataType = (cudnnDataType_t)result.mem_result_u.data.mem_data_val[offset]; + offset += sizeof(cudnnDataType_t); + *format = (cudnnTensorFormat_t)result.mem_result_u.data.mem_data_val[offset]; + offset += sizeof(cudnnTensorFormat_t); + *nbDims = (int)result.mem_result_u.data.mem_data_val[offset]; + offset += sizeof(int); + memcpy(filterDimA, result.mem_result_u.data.mem_data_val+offset, *nbDims * sizeof(int)); + } + free(result.mem_result_u.data.mem_data_val); + return result.err; +} + +cudnnStatus_t cudnnGetFilterSizeInBytes(const cudnnFilterDescriptor_t filterDesc, size_t* size) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + sz_result result; + enum clnt_stat retval_1; + if (size == NULL) { + LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__); + return CUDNN_STATUS_BAD_PARAM; + } + retval_1 = rpc_cudnngetfiltersizeinbytes_1( + (ptr)filterDesc, + &result, clnt); + + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result.err != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err); + } else { + *size = result.sz_result_u.data; + } + return result.err; +} + +cudnnStatus_t cudnnTransformFilter(cudnnHandle_t handle, const cudnnTensorTransformDescriptor_t transDesc, const void * alpha, const 
cudnnFilterDescriptor_t srcDesc, const void * srcData, const void * beta, const cudnnFilterDescriptor_t destDesc, void * destData) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + //TODO: Check if we have a float instead of always sending doubles + cudnn_scaling_t rpc_alpha = {.dataType = CUDNN_DATA_DOUBLE, .cudnn_scaling_t_u.d = *((double*)alpha)}; + cudnn_scaling_t rpc_beta = {.dataType = CUDNN_DATA_DOUBLE, .cudnn_scaling_t_u.d = *((double*)beta)}; + retval_1 = rpc_cudnntransformfilter_1( + (ptr)handle, + (ptr)transDesc, + rpc_alpha, + (ptr)srcDesc, + (ptr)srcData, + rpc_beta, + (ptr)destDesc, + (ptr)destData, + &result, clnt); + + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } + return result; +} + +cudnnStatus_t cudnnDestroyFilterDescriptor(cudnnFilterDescriptor_t filterDesc) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + retval_1 = rpc_cudnndestroyfilterdescriptor_1( + (ptr)filterDesc, + &result, clnt); + + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } + return result; +} + +cudnnStatus_t cudnnSoftmaxForward(cudnnHandle_t handle, cudnnSoftmaxAlgorithm_t algo, cudnnSoftmaxMode_t mode, const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta, const cudnnTensorDescriptor_t yDesc, void * y) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + //TODO: Check if we have a float instead of always sending doubles + cudnn_scaling_t rpc_alpha = {.dataType = CUDNN_DATA_DOUBLE, .cudnn_scaling_t_u.d = *((double*)alpha)}; + cudnn_scaling_t rpc_beta = {.dataType = CUDNN_DATA_DOUBLE, .cudnn_scaling_t_u.d = *((double*)beta)}; + retval_1 = rpc_cudnnsoftmaxforward_1( + (ptr)handle, + (int)algo, + (int)mode, + rpc_alpha, + (ptr)xDesc, + (ptr)x, + rpc_beta, + (ptr)yDesc, + (ptr)y, + &result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } + return result; +} + +cudnnStatus_t cudnnCreatePoolingDescriptor(cudnnPoolingDescriptor_t *poolingDesc) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + ptr_result result; + enum clnt_stat retval_1; + if (poolingDesc == NULL) { + LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__); + return CUDNN_STATUS_BAD_PARAM; + } + retval_1 = rpc_cudnncreatepoolingdescriptor_1(&result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result.err != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err); + } else { + *poolingDesc = (cudnnPoolingDescriptor_t)result.ptr_result_u.ptr; + } + return result.err; +} + +cudnnStatus_t cudnnSetPooling2dDescriptor(cudnnPoolingDescriptor_t poolingDesc, cudnnPoolingMode_t mode, cudnnNanPropagation_t maxpoolingNanOpt, int windowHeight, int windowWidth, int verticalPadding, int horizontalPadding, int verticalStride, int horizontalStride) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat 
retval_1; + retval_1 = rpc_cudnnsetpooling2ddescriptor_1( + (ptr)poolingDesc, + (int)mode, + (int)maxpoolingNanOpt, + windowHeight, + windowWidth, + verticalPadding, + horizontalPadding, + verticalStride, + horizontalStride, + &result, clnt); + + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } + return result; +} + +cudnnStatus_t cudnnGetPooling2dDescriptor(const cudnnPoolingDescriptor_t poolingDesc, cudnnPoolingMode_t *mode, cudnnNanPropagation_t *maxpoolingNanOpt, int* windowHeight, int* windowWidth, int* verticalPadding, int* horizontalPadding, int* verticalStride, int* horizontalStride) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int8_result result; + enum clnt_stat retval_1; + if (mode == NULL || maxpoolingNanOpt == NULL || windowHeight == NULL || windowWidth == NULL || verticalPadding == NULL || verticalStride == NULL || horizontalPadding == NULL || horizontalStride == NULL) { + LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__); + return CUDNN_STATUS_BAD_PARAM; + } + retval_1 = rpc_cudnngetpooling2ddescriptor_1( + (ptr)poolingDesc, + &result, clnt); + + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result.err != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } else { + *mode = (cudnnPoolingMode_t)result.int8_result_u.data[0]; + *maxpoolingNanOpt = (cudnnNanPropagation_t)result.int8_result_u.data[1]; + *windowHeight = result.int8_result_u.data[2]; + *windowWidth = result.int8_result_u.data[3]; + *verticalPadding = result.int8_result_u.data[4]; + *horizontalPadding = result.int8_result_u.data[5]; + *verticalStride = result.int8_result_u.data[6]; + *horizontalStride = result.int8_result_u.data[7]; + } + return result.err; +} + +cudnnStatus_t cudnnSetPoolingNdDescriptor(cudnnPoolingDescriptor_t poolingDesc, const cudnnPoolingMode_t mode, const cudnnNanPropagation_t maxpoolingNanOpt, int nbDims, const int* windowDimA, const int* paddingA, const int* strideA) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + mem_data rpc_windowDimA = { + .mem_data_len = nbDims * sizeof(int), + .mem_data_val = (char*)windowDimA + }; + mem_data rpc_paddingA = { + .mem_data_len = nbDims * sizeof(int), + .mem_data_val = (char*)paddingA + }; + mem_data rpc_strideA = { + .mem_data_len = nbDims * sizeof(int), + .mem_data_val = (char*)strideA + }; + retval_1 = rpc_cudnnsetpoolingnddescriptor_1( + (ptr)poolingDesc, + (int)mode, + (int)maxpoolingNanOpt, + (int)nbDims, + rpc_windowDimA, + rpc_paddingA, + rpc_strideA, + &result, clnt); + + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } + return result; +} + +cudnnStatus_t cudnnGetPoolingNdDescriptor(const cudnnPoolingDescriptor_t poolingDesc, int nbDimsRequested, cudnnPoolingMode_t * mode, cudnnNanPropagation_t * maxpoolingNanOpt, int* nbDims, int* windowDimA, int* paddingA, int* strideA) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + size_t expected_size = nbDimsRequested * sizeof(int) * 3 + sizeof(int) + sizeof(cudnnPoolingMode_t) + sizeof(cudnnNanPropagation_t); + mem_result result; + result.mem_result_u.data.mem_data_val = 
(char*)malloc(expected_size); + enum clnt_stat retval_1; + if (mode == NULL || maxpoolingNanOpt == NULL || nbDims == NULL || windowDimA == NULL || paddingA == NULL || strideA == NULL) { + LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__); + return CUDNN_STATUS_BAD_PARAM; + } + retval_1 = rpc_cudnngetpoolingnddescriptor_1( + (ptr)poolingDesc, + nbDimsRequested, + &result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result.err != CUDNN_STATUS_SUCCESS || result.mem_result_u.data.mem_data_len != expected_size) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err); + } else { + size_t offset = 0; + *mode = (cudnnPoolingMode_t)result.mem_result_u.data.mem_data_val[offset]; + offset += sizeof(cudnnPoolingMode_t); + *maxpoolingNanOpt = (cudnnNanPropagation_t)result.mem_result_u.data.mem_data_val[offset]; + offset += sizeof(cudnnNanPropagation_t); + *nbDims = (int)result.mem_result_u.data.mem_data_val[offset]; + offset += sizeof(int); + memcpy(windowDimA, result.mem_result_u.data.mem_data_val+offset, *nbDims * sizeof(int)); + offset += *nbDims * sizeof(int); + memcpy(paddingA, result.mem_result_u.data.mem_data_val+offset, *nbDims * sizeof(int)); + offset += *nbDims * sizeof(int); + memcpy(strideA, result.mem_result_u.data.mem_data_val+offset, *nbDims * sizeof(int)); + } + free(result.mem_result_u.data.mem_data_val); + return result.err; +} + +cudnnStatus_t cudnnGetPoolingNdForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc, const cudnnTensorDescriptor_t inputTensorDesc, int nbDims, int* outputTensorDimA) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + mem_result result; + result.mem_result_u.data.mem_data_val = (char*)outputTensorDimA; + enum clnt_stat retval_1; + if (outputTensorDimA == NULL) { + LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__); + return CUDNN_STATUS_BAD_PARAM; + } + retval_1 = rpc_cudnngetpoolingndforwardoutputdim_1( + (ptr)poolingDesc, + (ptr)inputTensorDesc, + nbDims, + &result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + size_t expected_size = nbDims * sizeof(int); + if (result.err != CUDNN_STATUS_SUCCESS || result.mem_result_u.data.mem_data_len != expected_size) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err); + } + return result.err; +} + +cudnnStatus_t cudnnGetPooling2dForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc, const cudnnTensorDescriptor_t inputTensorDesc, int* n, int* c, int* h, int* w) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int4_result result; + enum clnt_stat retval_1; + if (n == NULL || c == NULL || h == NULL || w == NULL) { + LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__); + return CUDNN_STATUS_BAD_PARAM; + } + retval_1 = rpc_cudnngetpooling2dforwardoutputdim_1( + (ptr)poolingDesc, + (ptr)inputTensorDesc, + &result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result.err != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err); + } else { + *n = result.int4_result_u.data[0]; + *c = result.int4_result_u.data[1]; + *h = result.int4_result_u.data[2]; + *w = result.int4_result_u.data[3]; + } + return result.err; +} + +cudnnStatus_t cudnnDestroyPoolingDescriptor(cudnnPoolingDescriptor_t poolingDesc) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum 
clnt_stat retval_1; + retval_1 = rpc_cudnndestroypoolingdescriptor_1( + (ptr)poolingDesc, + &result, clnt); + + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } + return result; +} + +cudnnStatus_t cudnnPoolingForward(cudnnHandle_t handle, const cudnnPoolingDescriptor_t poolingDesc, const void * alpha, const cudnnTensorDescriptor_t xDesc, const void * x, const void * beta, const cudnnTensorDescriptor_t yDesc, void * y) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + //TODO: Check if we have a float instead of always sending doubles + cudnn_scaling_t rpc_alpha = {.dataType = CUDNN_DATA_DOUBLE, .cudnn_scaling_t_u.d = *((double*)alpha)}; + cudnn_scaling_t rpc_beta = {.dataType = CUDNN_DATA_DOUBLE, .cudnn_scaling_t_u.d = *((double*)beta)}; + retval_1 = rpc_cudnnpoolingforward_1( + (ptr)handle, + (ptr)poolingDesc, + rpc_alpha, + (ptr)xDesc, + (ptr)x, + rpc_beta, + (ptr)yDesc, + (ptr)y, + &result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } + return result; +} + +cudnnStatus_t cudnnCreateActivationDescriptor(cudnnActivationDescriptor_t * activationDesc) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + ptr_result result; + enum clnt_stat retval_1; + if (activationDesc == NULL) { + LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__); + return CUDNN_STATUS_BAD_PARAM; + } + retval_1 = rpc_cudnncreateactivationdescriptor_1(&result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result.err != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err); + } else { + *activationDesc = (cudnnActivationDescriptor_t)result.ptr_result_u.ptr; + } + return result.err; +} + +cudnnStatus_t cudnnSetActivationDescriptor(cudnnActivationDescriptor_t activationDesc, cudnnActivationMode_t mode, cudnnNanPropagation_t reluNanOpt, double coef) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + retval_1 = rpc_cudnnsetactivationdescriptor_1( + (ptr)activationDesc, + (int)mode, + (int)reluNanOpt, + coef, + &result, clnt); + + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } + return result; +} + +cudnnStatus_t cudnnGetActivationDescriptor(const cudnnActivationDescriptor_t activationDesc, cudnnActivationMode_t *mode, cudnnNanPropagation_t *reluNanOpt, double *coef) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int2d1_result result; + enum clnt_stat retval_1; + if (mode == NULL || reluNanOpt == NULL || coef == NULL) { + LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__); + return CUDNN_STATUS_BAD_PARAM; + } + retval_1 = rpc_cudnngetactivationdescriptor_1( + (ptr)activationDesc, + &result, clnt); + + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result.err != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } else { + *mode = 
(cudnnActivationMode_t)result.int2d1_result_u.data.i[0]; + *reluNanOpt = (cudnnNanPropagation_t)result.int2d1_result_u.data.i[1]; + *coef = result.int2d1_result_u.data.d; + } + return result.err; +} + +cudnnStatus_t cudnnSetActivationDescriptorSwishBeta(cudnnActivationDescriptor_t activationDesc, double swish_beta) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + retval_1 = rpc_cudnnsetactivationdescriptorswishbeta_1( + (ptr)activationDesc, + swish_beta, + &result, clnt); + + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } + return result; +} + +cudnnStatus_t cudnnGetActivationDescriptorSwishBeta(cudnnActivationDescriptor_t activationDesc, double * swish_beta) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + d_result result; + enum clnt_stat retval_1; + if (swish_beta == NULL) { + LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__); + return CUDNN_STATUS_BAD_PARAM; + } + retval_1 = rpc_cudnngetactivationdescriptorswishbeta_1( + (ptr)activationDesc, + &result, clnt); + + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result.err != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } else { + *swish_beta = result.d_result_u.data; + } + return result.err; +} + +cudnnStatus_t cudnnDestroyActivationDescriptor(cudnnActivationDescriptor_t activationDesc) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + retval_1 = rpc_cudnndestroyactivationdescriptor_1( + (ptr)activationDesc, + &result, clnt); + + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } + return result; +} + +cudnnStatus_t cudnnActivationForward(cudnnHandle_t handle, cudnnActivationDescriptor_t activationDesc, const void * alpha, const cudnnTensorDescriptor_t xDesc, const void * x, const void * beta, const cudnnTensorDescriptor_t yDesc, void * y) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + //TODO: Check if we have a float instead of always sending doubles + cudnn_scaling_t rpc_alpha = {.dataType = CUDNN_DATA_DOUBLE, .cudnn_scaling_t_u.d = *((double*)alpha)}; + cudnn_scaling_t rpc_beta = {.dataType = CUDNN_DATA_DOUBLE, .cudnn_scaling_t_u.d = *((double*)beta)}; + retval_1 = rpc_cudnnactivationforward_1( + (ptr)handle, + (ptr)activationDesc, + rpc_alpha, + (ptr)xDesc, + (ptr)x, + rpc_beta, + (ptr)yDesc, + (ptr)y, + &result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } + return result; +} + +cudnnStatus_t cudnnCreateLRNDescriptor(cudnnLRNDescriptor_t * normDesc) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + ptr_result result; + enum clnt_stat retval_1; + if (normDesc == NULL) { + LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__); + return CUDNN_STATUS_BAD_PARAM; + } + retval_1 = rpc_cudnncreatelrndescriptor_1(&result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if 
(result.err != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err); + } else { + *normDesc = (cudnnLRNDescriptor_t)result.ptr_result_u.ptr; + } + return result.err; +} + +cudnnStatus_t cudnnSetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned lrnN, double lrnAlpha, double lrnBeta, double lrnK) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + retval_1 = rpc_cudnnsetlrndescriptor_1( + (ptr)normDesc, + (int)lrnN, + lrnAlpha, + lrnBeta, + lrnK, + &result, clnt); + + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } + return result; +} + +cudnnStatus_t cudnnGetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned * lrnN, double * lrnAlpha, double * lrnBeta, double * lrnK) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int1d3_result result; + enum clnt_stat retval_1; + if (lrnN == NULL || lrnAlpha == NULL || lrnBeta == NULL || lrnK == NULL) { + LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__); + return CUDNN_STATUS_BAD_PARAM; + } + retval_1 = rpc_cudnngetlrndescriptor_1( + (ptr)normDesc, + &result, clnt); + + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result.err != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } else { + *lrnN = result.int1d3_result_u.data.i; + *lrnAlpha = result.int1d3_result_u.data.d[0]; + *lrnBeta = result.int1d3_result_u.data.d[1]; + *lrnK = result.int1d3_result_u.data.d[2]; + } + return result.err; +} + +cudnnStatus_t cudnnDestroyLRNDescriptor(cudnnLRNDescriptor_t lrnDesc) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + retval_1 = rpc_cudnndestroylrndescriptor_1( + (ptr)lrnDesc, + &result, clnt); + + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } + return result; +} + +cudnnStatus_t cudnnLRNCrossChannelForward(cudnnHandle_t handle, cudnnLRNDescriptor_t normDesc, cudnnLRNMode_t lrnMode, const void * alpha, const cudnnTensorDescriptor_t xDesc, const void * x, const void * beta, const cudnnTensorDescriptor_t yDesc, void * y) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + //TODO: Check if we have a float instead of always sending doubles + cudnn_scaling_t rpc_alpha = {.dataType = CUDNN_DATA_DOUBLE, .cudnn_scaling_t_u.d = *((double*)alpha)}; + cudnn_scaling_t rpc_beta = {.dataType = CUDNN_DATA_DOUBLE, .cudnn_scaling_t_u.d = *((double*)beta)}; + retval_1 = rpc_cudnnlrncrosschannelforward_1( + (ptr)handle, + (ptr)normDesc, + (int)lrnMode, + rpc_alpha, + (ptr)xDesc, + (ptr)x, + rpc_beta, + (ptr)yDesc, + (ptr)y, + &result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } + return result; +} + +DEF_FN(cudnnStatus_t, cudnnDivisiveNormalizationForward, cudnnHandle_t, handle, cudnnLRNDescriptor_t, normDesc, cudnnDivNormMode_t, mode, const void *, alpha, const cudnnTensorDescriptor_t, xDesc, const void *, x, const void *, means, void *, temp, void 
*, temp2, const void *, beta, const cudnnTensorDescriptor_t, yDesc, void *, y) +DEF_FN(cudnnStatus_t, cudnnDeriveBNTensorDescriptor, cudnnTensorDescriptor_t, derivedBnDesc, const cudnnTensorDescriptor_t, xDesc, cudnnBatchNormMode_t, mode) +DEF_FN(cudnnStatus_t, cudnnBatchNormalizationForwardInference, cudnnHandle_t, handle, cudnnBatchNormMode_t, mode, const void *, alpha, const void *, beta, const cudnnTensorDescriptor_t, xDesc, const void *, x, const cudnnTensorDescriptor_t, yDesc, void *, y, const cudnnTensorDescriptor_t, bnScaleBiasMeanVarDesc, const void *, bnScale, const void *, bnBias, const void *, estimatedMean, const void *, estimatedVariance, double, epsilon) +DEF_FN(cudnnStatus_t, cudnnDeriveNormTensorDescriptor, cudnnTensorDescriptor_t, derivedNormScaleBiasDesc, cudnnTensorDescriptor_t, derivedNormMeanVarDesc, const cudnnTensorDescriptor_t, xDesc, cudnnNormMode_t, mode, int, groupCnt) +DEF_FN(cudnnStatus_t, cudnnNormalizationForwardInference, cudnnHandle_t, handle, cudnnNormMode_t, mode, cudnnNormOps_t, normOps, cudnnNormAlgo_t, algo, const void *, alpha, const void *, beta, const cudnnTensorDescriptor_t, xDesc, const void *, x, const cudnnTensorDescriptor_t, normScaleBiasDesc, const void *, normScale, const void *, normBias, const cudnnTensorDescriptor_t, normMeanVarDesc, const void *, estimatedMean, const void *, estimatedVariance, const cudnnTensorDescriptor_t, zDesc, const void *, z, cudnnActivationDescriptor_t, activationDesc, const cudnnTensorDescriptor_t, yDesc, void *, y, double, epsilon, int, groupCnt) +DEF_FN(cudnnStatus_t, cudnnCreateSpatialTransformerDescriptor, cudnnSpatialTransformerDescriptor_t *, stDesc) +DEF_FN(cudnnStatus_t, cudnnSetSpatialTransformerNdDescriptor, cudnnSpatialTransformerDescriptor_t, stDesc, cudnnSamplerType_t, samplerType, cudnnDataType_t, dataType, const int, nbDims, const int*, dimA) +DEF_FN(cudnnStatus_t, cudnnDestroySpatialTransformerDescriptor, cudnnSpatialTransformerDescriptor_t, stDesc) +DEF_FN(cudnnStatus_t, cudnnSpatialTfGridGeneratorForward, cudnnHandle_t, handle, const cudnnSpatialTransformerDescriptor_t, stDesc, const void *, theta, void *, grid) +DEF_FN(cudnnStatus_t, cudnnSpatialTfSamplerForward, cudnnHandle_t, handle, cudnnSpatialTransformerDescriptor_t, stDesc, const void *, alpha, const cudnnTensorDescriptor_t, xDesc, const void *, x, const void *, grid, const void *, beta, cudnnTensorDescriptor_t, yDesc, void *, y) +DEF_FN(cudnnStatus_t, cudnnCreateDropoutDescriptor, cudnnDropoutDescriptor_t *, dropoutDesc) +DEF_FN(cudnnStatus_t, cudnnDestroyDropoutDescriptor, cudnnDropoutDescriptor_t, dropoutDesc) +DEF_FN(cudnnStatus_t, cudnnDropoutGetStatesSize, cudnnHandle_t, handle, size_t *, sizeInBytes) +DEF_FN(cudnnStatus_t, cudnnDropoutGetReserveSpaceSize, cudnnTensorDescriptor_t, xdesc, size_t*, sizeInBytes) +DEF_FN(cudnnStatus_t, cudnnSetDropoutDescriptor, cudnnDropoutDescriptor_t, dropoutDesc, cudnnHandle_t, handle, float, dropout, void *, states, size_t, stateSizeInBytes, unsigned long long, seed) +DEF_FN(cudnnStatus_t, cudnnRestoreDropoutDescriptor, cudnnDropoutDescriptor_t, dropoutDesc, cudnnHandle_t, handle, float, dropout, void *, states, size_t, stateSizeInBytes, unsigned long long, seed) +DEF_FN(cudnnStatus_t, cudnnGetDropoutDescriptor, cudnnDropoutDescriptor_t, dropoutDesc, cudnnHandle_t, handle, float *, dropout, void **, states, unsigned long long *, seed) +DEF_FN(cudnnStatus_t, cudnnDropoutForward, cudnnHandle_t, handle, const cudnnDropoutDescriptor_t, dropoutDesc, const cudnnTensorDescriptor_t, xdesc, const void *, x, 
const cudnnTensorDescriptor_t, ydesc, void *, y, void *, reserveSpace, size_t, reserveSpaceSizeInBytes) +DEF_FN(cudnnStatus_t, cudnnCreateAlgorithmDescriptor, cudnnAlgorithmDescriptor_t *, algoDesc) +DEF_FN(cudnnStatus_t, cudnnSetAlgorithmDescriptor, cudnnAlgorithmDescriptor_t, algoDesc, cudnnAlgorithm_t, algorithm) +DEF_FN(cudnnStatus_t, cudnnGetAlgorithmDescriptor, const cudnnAlgorithmDescriptor_t, algoDesc, cudnnAlgorithm_t *, algorithm) +DEF_FN(cudnnStatus_t, cudnnCopyAlgorithmDescriptor, const cudnnAlgorithmDescriptor_t, src, cudnnAlgorithmDescriptor_t, dest) +DEF_FN(cudnnStatus_t, cudnnDestroyAlgorithmDescriptor, cudnnAlgorithmDescriptor_t, algoDesc) +DEF_FN(cudnnStatus_t, cudnnCreateAlgorithmPerformance, cudnnAlgorithmPerformance_t *, algoPerf, int, numberToCreate) +DEF_FN(cudnnStatus_t, cudnnSetAlgorithmPerformance, cudnnAlgorithmPerformance_t, algoPerf, cudnnAlgorithmDescriptor_t, algoDesc, cudnnStatus_t, status, float, time, size_t, memory) +DEF_FN(cudnnStatus_t, cudnnGetAlgorithmPerformance, const cudnnAlgorithmPerformance_t, algoPerf, cudnnAlgorithmDescriptor_t *, algoDesc, cudnnStatus_t *, status, float *, time, size_t*, memory) +DEF_FN(cudnnStatus_t, cudnnDestroyAlgorithmPerformance, cudnnAlgorithmPerformance_t *, algoPerf, int, numberToDestroy) +DEF_FN(cudnnStatus_t, cudnnGetAlgorithmSpaceSize, cudnnHandle_t, handle, cudnnAlgorithmDescriptor_t, algoDesc, size_t *, algoSpaceSizeInBytes) +DEF_FN(cudnnStatus_t, cudnnSaveAlgorithm, cudnnHandle_t, handle, cudnnAlgorithmDescriptor_t, algoDesc, void *, algoSpace, size_t, algoSpaceSizeInBytes) +DEF_FN(cudnnStatus_t, cudnnRestoreAlgorithm, cudnnHandle_t, handle, void *, algoSpace, size_t, algoSpaceSizeInBytes, cudnnAlgorithmDescriptor_t, algoDesc) +DEF_FN(cudnnStatus_t, cudnnSetCallback, unsigned, mask, void *, udata, cudnnCallback_t, fptr) +DEF_FN(cudnnStatus_t, cudnnGetCallback, unsigned *, mask, void **, udata, cudnnCallback_t *, fptr) +DEF_FN(cudnnStatus_t, cudnnOpsInferVersionCheck) + + +/***************** cudnn_cnn_infer *******************/ + +cudnnStatus_t cudnnCreateConvolutionDescriptor(cudnnConvolutionDescriptor_t* convDesc) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + ptr_result result; + enum clnt_stat retval_1; + if (convDesc == NULL) { + LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__); + return CUDNN_STATUS_BAD_PARAM; + } + retval_1 = rpc_cudnncreateconvolutiondescriptor_1(&result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result.err != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err); + } else { + *convDesc = (cudnnConvolutionDescriptor_t)result.ptr_result_u.ptr; + } + return result.err; +} + +cudnnStatus_t cudnnDestroyConvolutionDescriptor(cudnnConvolutionDescriptor_t convDesc) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + retval_1 = rpc_cudnndestroyconvolutiondescriptor_1( + (ptr)convDesc, + &result, clnt); + + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } + return result; +} +DEF_FN(cudnnStatus_t, cudnnSetConvolutionMathType, cudnnConvolutionDescriptor_t, convDesc, cudnnMathType_t, mathType) +DEF_FN(cudnnStatus_t, cudnnGetConvolutionMathType, cudnnConvolutionDescriptor_t, convDesc, cudnnMathType_t*, mathType) +DEF_FN(cudnnStatus_t, 
cudnnSetConvolutionGroupCount, cudnnConvolutionDescriptor_t, convDesc, int, groupCount) +DEF_FN(cudnnStatus_t, cudnnGetConvolutionGroupCount, cudnnConvolutionDescriptor_t, convDesc, int*, groupCount) +DEF_FN(cudnnStatus_t, cudnnSetConvolutionReorderType, cudnnConvolutionDescriptor_t, convDesc, cudnnReorderType_t, reorderType) +DEF_FN(cudnnStatus_t, cudnnGetConvolutionReorderType, cudnnConvolutionDescriptor_t, convDesc, cudnnReorderType_t*, reorderType) +DEF_FN(cudnnStatus_t, cudnnSetConvolution2dDescriptor, cudnnConvolutionDescriptor_t, convDesc, int, pad_h, int, pad_w, int, u, int, v, int, dilation_h, int, dilation_w, cudnnConvolutionMode_t, mode, cudnnDataType_t, computeType) +DEF_FN(cudnnStatus_t, cudnnGetConvolution2dDescriptor, const cudnnConvolutionDescriptor_t, convDesc, int*, pad_h, int*, pad_w, int*, u, int*, v, int*, dilation_h, int*, dilation_w, cudnnConvolutionMode_t*, mode, cudnnDataType_t*, computeType) + +cudnnStatus_t cudnnSetConvolutionNdDescriptor(cudnnConvolutionDescriptor_t convDesc, int arrayLength, const int* padA, const int* filterStrideA, const int* dilationA, cudnnConvolutionMode_t mode, cudnnDataType_t computeType) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + mem_data rpc_windowDimA = { + .mem_data_len = arrayLength * sizeof(int), + .mem_data_val = (char*)padA + }; + mem_data rpc_paddingA = { + .mem_data_len = arrayLength * sizeof(int), + .mem_data_val = (char*)filterStrideA + }; + mem_data rpc_strideA = { + .mem_data_len = arrayLength * sizeof(int), + .mem_data_val = (char*)dilationA + }; + retval_1 = rpc_cudnnsetconvolutionnddescriptor_1( + (ptr)convDesc, + arrayLength, + rpc_windowDimA, + rpc_paddingA, + rpc_strideA, + mode, + computeType, + &result, clnt); + + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } + return result; +} + +DEF_FN(cudnnStatus_t, cudnnGetConvolutionNdDescriptor, const cudnnConvolutionDescriptor_t, convDesc, int, arrayLengthRequested, int*, arrayLength, int*, padA, int*, strideA, int*, dilationA, cudnnConvolutionMode_t*, mode, cudnnDataType_t*, computeType) +DEF_FN(cudnnStatus_t, cudnnGetConvolution2dForwardOutputDim, const cudnnConvolutionDescriptor_t, convDesc, const cudnnTensorDescriptor_t, inputTensorDesc, const cudnnFilterDescriptor_t, filterDesc, int*, n, int*, c, int*, h, int*, w) + +cudnnStatus_t cudnnGetConvolutionNdForwardOutputDim(const cudnnConvolutionDescriptor_t convDesc, const cudnnTensorDescriptor_t inputTensorDesc, const cudnnFilterDescriptor_t filterDesc, int nbDims, int* tensorOutputDimA) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + mem_result result; + result.mem_result_u.data.mem_data_val = (char*)tensorOutputDimA; + enum clnt_stat retval_1; + if (tensorOutputDimA == NULL) { + LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__); + return CUDNN_STATUS_BAD_PARAM; + } + retval_1 = rpc_cudnngetconvolutionndforwardoutputdim_1( + (ptr)convDesc, + (ptr)inputTensorDesc, + (ptr)filterDesc, + nbDims, + &result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + size_t expected_size = nbDims * sizeof(int); + if (result.err != CUDNN_STATUS_SUCCESS || result.mem_result_u.data.mem_data_len != expected_size) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err); + } + return result.err; +} + 
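/* A minimal sketch of how the recurring "TODO: Check if we have a float
 * instead of always sending doubles" comments in this file could be resolved:
 * derive the scaling-factor type from the tensor descriptor instead of always
 * reading *alpha / *beta as double. This assumes the cudnn_scaling_t RPC union
 * also exposes a float member (written here as cudnn_scaling_t_u.f); the helper
 * name make_cudnn_scaling() is illustrative only and not part of the generated
 * cpu_rpc_prot interface. */
static cudnn_scaling_t make_cudnn_scaling(const void *scale, const cudnnTensorDescriptor_t desc)
{
    cudnn_scaling_t s = {.dataType = CUDNN_DATA_DOUBLE, .cudnn_scaling_t_u.d = 0.0};
    cudnnDataType_t dataType = CUDNN_DATA_DOUBLE;
    int n, c, h, w, nStride, cStride, hStride, wStride;
    /* cuDNN expects double scaling factors only for double tensors; all other
     * data types (half, float, int8, ...) take float alpha/beta. */
    if (cudnnGetTensor4dDescriptor(desc, &dataType, &n, &c, &h, &w,
                                   &nStride, &cStride, &hStride, &wStride) == CUDNN_STATUS_SUCCESS
        && dataType != CUDNN_DATA_DOUBLE) {
        s.dataType = CUDNN_DATA_FLOAT;
        s.cudnn_scaling_t_u.f = *(const float *)scale;   /* assumed union member */
    } else {
        s.cudnn_scaling_t_u.d = *(const double *)scale;
    }
    return s;
}
/* Usage sketch, e.g. inside cudnnSoftmaxForward:
 *     cudnn_scaling_t rpc_alpha = make_cudnn_scaling(alpha, xDesc);
 *     cudnn_scaling_t rpc_beta  = make_cudnn_scaling(beta, yDesc); */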
+DEF_FN(cudnnStatus_t, cudnnGetConvolutionForwardAlgorithmMaxCount, cudnnHandle_t, handle, int*, count) + +cudnnStatus_t cudnnGetConvolutionForwardAlgorithm_v7(cudnnHandle_t handle, const cudnnTensorDescriptor_t srcDesc, const cudnnFilterDescriptor_t filterDesc, const cudnnConvolutionDescriptor_t convDesc, const cudnnTensorDescriptor_t destDesc, const int requestedAlgoCount, int* returnedAlgoCount, cudnnConvolutionFwdAlgoPerf_t* perfResults) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + mem_result result; + result.mem_result_u.data.mem_data_val = (char*)malloc(requestedAlgoCount * sizeof(cudnnConvolutionFwdAlgoPerf_t) + sizeof(int)); + enum clnt_stat retval_1; + if (returnedAlgoCount == NULL || perfResults == NULL) { + LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__); + return CUDNN_STATUS_BAD_PARAM; + } + retval_1 = rpc_cudnngetconvolutionforwardalgorithm_v7_1( + (ptr)handle, + (ptr)srcDesc, + (ptr)filterDesc, + (ptr)convDesc, + (ptr)destDesc, + requestedAlgoCount, + &result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + size_t expected_size = requestedAlgoCount * sizeof(cudnnConvolutionFwdAlgoPerf_t) + sizeof(int); + if (result.err != CUDNN_STATUS_SUCCESS || result.mem_result_u.data.mem_data_len != expected_size) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err); + } else { + *returnedAlgoCount = *(int*)result.mem_result_u.data.mem_data_val; + if (*returnedAlgoCount > requestedAlgoCount) { + LOGE(LOG_ERROR, "%s failed (returnedAlgoCount is %d, requestedAlgoCount is %d)", __FUNCTION__, *returnedAlgoCount, requestedAlgoCount); + return CUDNN_STATUS_INTERNAL_ERROR; + } + memcpy(perfResults, result.mem_result_u.data.mem_data_val + sizeof(int), *returnedAlgoCount * sizeof(cudnnConvolutionFwdAlgoPerf_t)); + } + free(result.mem_result_u.data.mem_data_val); + return result.err; +} + +cudnnStatus_t cudnnFindConvolutionForwardAlgorithm( cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, const cudnnFilterDescriptor_t wDesc, const cudnnConvolutionDescriptor_t convDesc, const cudnnTensorDescriptor_t yDesc, const int requestedAlgoCount, int* returnedAlgoCount, cudnnConvolutionFwdAlgoPerf_t* perfResults) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + mem_result result; + result.mem_result_u.data.mem_data_val = (char*)malloc(requestedAlgoCount * sizeof(cudnnConvolutionFwdAlgoPerf_t) + sizeof(int)); + enum clnt_stat retval_1; + if (returnedAlgoCount == NULL || perfResults == NULL) { + LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__); + return CUDNN_STATUS_BAD_PARAM; + } + retval_1 = rpc_cudnnfindconvolutionforwardalgorithm_1( + (ptr)handle, + (ptr)xDesc, + (ptr)wDesc, + (ptr)convDesc, + (ptr)yDesc, + requestedAlgoCount, + &result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + size_t expected_size = requestedAlgoCount * sizeof(cudnnConvolutionFwdAlgoPerf_t) + sizeof(int); + if (result.err != CUDNN_STATUS_SUCCESS || result.mem_result_u.data.mem_data_len != expected_size) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err); + } else { + *returnedAlgoCount = *(int*)result.mem_result_u.data.mem_data_val; + if (*returnedAlgoCount > requestedAlgoCount) { + LOGE(LOG_ERROR, "%s failed (returnedAlgoCount is %d, requestedAlgoCount is %d)", __FUNCTION__, *returnedAlgoCount, requestedAlgoCount); + return CUDNN_STATUS_INTERNAL_ERROR; + } + memcpy(perfResults, 
result.mem_result_u.data.mem_data_val + sizeof(int), *returnedAlgoCount * sizeof(cudnnConvolutionFwdAlgoPerf_t)); + } + free(result.mem_result_u.data.mem_data_val); + return result.err; +} + +DEF_FN(cudnnStatus_t, cudnnFindConvolutionForwardAlgorithmEx, cudnnHandle_t, handle, const cudnnTensorDescriptor_t, xDesc, const void*, x, const cudnnFilterDescriptor_t, wDesc, const void*, w, const cudnnConvolutionDescriptor_t, convDesc, const cudnnTensorDescriptor_t, yDesc, void*, y, const int, requestedAlgoCount, int*, returnedAlgoCount, cudnnConvolutionFwdAlgoPerf_t*, perfResults, void*, workSpace, size_t, workSpaceSizeInBytes) +DEF_FN(cudnnStatus_t, cudnnIm2Col, cudnnHandle_t, handle, const cudnnTensorDescriptor_t, xDesc, const void*, x, const cudnnFilterDescriptor_t, wDesc, const cudnnConvolutionDescriptor_t, convDesc, void*, colBuffer) +DEF_FN(cudnnStatus_t, cudnnReorderFilterAndBias, cudnnHandle_t, handle, const cudnnFilterDescriptor_t, filterDesc, cudnnReorderType_t, reorderType, const void*, filterData, void*, reorderedFilterData, int, reorderBias, const void*, biasData, void*, reorderedBiasData) + +cudnnStatus_t cudnnGetConvolutionForwardWorkspaceSize( cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, const cudnnFilterDescriptor_t wDesc, const cudnnConvolutionDescriptor_t convDesc, const cudnnTensorDescriptor_t yDesc, cudnnConvolutionFwdAlgo_t algo, size_t* sizeInBytes) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + sz_result result; + enum clnt_stat retval_1; + if (sizeInBytes == NULL) { + LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__); + return CUDNN_STATUS_BAD_PARAM; + } + retval_1 = rpc_cudnngetconvolutionforwardworkspacesize_1( + (ptr)handle, + (ptr)xDesc, + (ptr)wDesc, + (ptr)convDesc, + (ptr)yDesc, + algo, + &result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result.err != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err); + } else { + *sizeInBytes = result.sz_result_u.data; + } + return result.err; +} + +cudnnStatus_t cudnnConvolutionForward(cudnnHandle_t handle, const void* alpha, const cudnnTensorDescriptor_t xDesc, const void* x, const cudnnFilterDescriptor_t wDesc, const void* w, const cudnnConvolutionDescriptor_t convDesc, cudnnConvolutionFwdAlgo_t algo, void* workSpace, size_t workSpaceSizeInBytes, const void* beta, const cudnnTensorDescriptor_t yDesc, void* y) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + //TODO: Check if we have a float instead of always sending doubles + cudnn_scaling_t rpc_alpha = {.dataType = CUDNN_DATA_DOUBLE, .cudnn_scaling_t_u.d = *((double*)alpha)}; + cudnn_scaling_t rpc_beta = {.dataType = CUDNN_DATA_DOUBLE, .cudnn_scaling_t_u.d = *((double*)beta)}; + retval_1 = rpc_cudnnconvolutionforward_1( + (ptr)handle, + rpc_alpha, + (ptr)xDesc, + (ptr)x, + (ptr)wDesc, + (ptr)w, + (ptr)convDesc, + algo, + (ptr)workSpace, + workSpaceSizeInBytes, + rpc_beta, + (ptr)yDesc, + (ptr)y, + &result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } + return result; +} + +DEF_FN(cudnnStatus_t, cudnnConvolutionBiasActivationForward, cudnnHandle_t, handle, const void*, alpha1, const cudnnTensorDescriptor_t, xDesc, const void*, x, const cudnnFilterDescriptor_t, wDesc, const void*, w, const 
cudnnConvolutionDescriptor_t, convDesc, cudnnConvolutionFwdAlgo_t, algo, void*, workSpace, size_t, workSpaceSizeInBytes, const void*, alpha2, const cudnnTensorDescriptor_t, zDesc, const void*, z, const cudnnTensorDescriptor_t, biasDesc, const void*, bias, const cudnnActivationDescriptor_t, activationDesc, const cudnnTensorDescriptor_t, yDesc, void*, y) +DEF_FN(cudnnStatus_t, cudnnGetConvolutionBackwardDataAlgorithmMaxCount, cudnnHandle_t, handle, int*, count) +DEF_FN(cudnnStatus_t, cudnnFindConvolutionBackwardDataAlgorithm, cudnnHandle_t, handle, const cudnnFilterDescriptor_t, wDesc, const cudnnTensorDescriptor_t, dyDesc, const cudnnConvolutionDescriptor_t, convDesc, const cudnnTensorDescriptor_t, dxDesc, const int, requestedAlgoCount, int*, returnedAlgoCount, cudnnConvolutionBwdDataAlgoPerf_t*, perfResults) +DEF_FN(cudnnStatus_t, cudnnFindConvolutionBackwardDataAlgorithmEx, cudnnHandle_t, handle, const cudnnFilterDescriptor_t, wDesc, const void*, w, const cudnnTensorDescriptor_t, dyDesc, const void*, dy, const cudnnConvolutionDescriptor_t, convDesc, const cudnnTensorDescriptor_t, dxDesc, void*, dx, const int, requestedAlgoCount, int*, returnedAlgoCount, cudnnConvolutionBwdDataAlgoPerf_t*, perfResults, void*, workSpace, size_t, workSpaceSizeInBytes) +DEF_FN(cudnnStatus_t, cudnnGetConvolutionBackwardDataAlgorithm_v7, cudnnHandle_t, handle, const cudnnFilterDescriptor_t, filterDesc, const cudnnTensorDescriptor_t, diffDesc, const cudnnConvolutionDescriptor_t, convDesc, const cudnnTensorDescriptor_t, gradDesc, const int, requestedAlgoCount, int*, returnedAlgoCount, cudnnConvolutionBwdDataAlgoPerf_t*, perfResults) +DEF_FN(cudnnStatus_t, cudnnGetConvolutionBackwardDataWorkspaceSize, cudnnHandle_t, handle, const cudnnFilterDescriptor_t, wDesc, const cudnnTensorDescriptor_t, dyDesc, const cudnnConvolutionDescriptor_t, convDesc, const cudnnTensorDescriptor_t, dxDesc, cudnnConvolutionBwdDataAlgo_t, algo, size_t*, sizeInBytes) +DEF_FN(cudnnStatus_t, cudnnConvolutionBackwardData, cudnnHandle_t, handle, const void*, alpha, const cudnnFilterDescriptor_t, wDesc, const void*, w, const cudnnTensorDescriptor_t, dyDesc, const void*, dy, const cudnnConvolutionDescriptor_t, convDesc, cudnnConvolutionBwdDataAlgo_t, algo, void*, workSpace, size_t, workSpaceSizeInBytes, const void*, beta, const cudnnTensorDescriptor_t, dxDesc, void*, dx) +DEF_FN(cudnnStatus_t, cudnnGetFoldedConvBackwardDataDescriptors, const cudnnHandle_t, handle, const cudnnFilterDescriptor_t, filterDesc, const cudnnTensorDescriptor_t, diffDesc, const cudnnConvolutionDescriptor_t, convDesc, const cudnnTensorDescriptor_t, gradDesc, const cudnnTensorFormat_t, transformFormat, cudnnFilterDescriptor_t, foldedFilterDesc, cudnnTensorDescriptor_t, paddedDiffDesc, cudnnConvolutionDescriptor_t, foldedConvDesc, cudnnTensorDescriptor_t, foldedGradDesc, cudnnTensorTransformDescriptor_t, filterFoldTransDesc, cudnnTensorTransformDescriptor_t, diffPadTransDesc, cudnnTensorTransformDescriptor_t, gradFoldTransDesc, cudnnTensorTransformDescriptor_t, gradUnfoldTransDesc) +DEF_FN(cudnnStatus_t, cudnnCnnInferVersionCheck) + +/********************** CUDNN BACKEND API ********************************/ +cudnnStatus_t cudnnBackendCreateDescriptor(cudnnBackendDescriptorType_t descriptorType, cudnnBackendDescriptor_t *descriptor) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + ptr_result result; + enum clnt_stat retval_1; + LOGE(LOG_DEBUG, "%s(%d)", __FUNCTION__, descriptorType); + if (descriptor == NULL) { + LOGE(LOG_ERROR, "%s failed (descriptor is 
NULL)", __FUNCTION__); + return CUDNN_STATUS_BAD_PARAM; + } + retval_1 = rpc_cudnnbackendcreatedescriptor_1( + (int)descriptorType, + &result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result.err != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err); + } else { + *descriptor = (void*)result.ptr_result_u.ptr; + LOGE(LOG_DEBUG, "-> %p", *descriptor); + } + return result.err; +} + +cudnnStatus_t cudnnBackendDestroyDescriptor(cudnnBackendDescriptor_t descriptor) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + LOGE(LOG_DEBUG, "%s(%p)", __FUNCTION__, descriptor); + retval_1 = rpc_cudnnbackenddestroydescriptor_1((ptr)descriptor, &result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } + return result; +} + +cudnnStatus_t cudnnBackendInitialize(cudnnBackendDescriptor_t descriptor) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + LOGE(LOG_DEBUG, "%s(%p)", __FUNCTION__, descriptor); + retval_1 = rpc_cudnnbackendinitialize_1((ptr)descriptor, &result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } + return result; +} + +cudnnStatus_t cudnnBackendFinalize(cudnnBackendDescriptor_t descriptor) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + LOGE(LOG_DEBUG, "%s(%p)", __FUNCTION__, descriptor); + retval_1 = rpc_cudnnbackendfinalize_1((ptr)descriptor, &result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } + return result; +} + +static const size_t backendAttributeSizes[] = { + [CUDNN_TYPE_HANDLE] = sizeof(cudnnHandle_t), + [CUDNN_TYPE_DATA_TYPE] = sizeof(cudnnDataType_t), + [CUDNN_TYPE_BOOLEAN] = sizeof(bool), + [CUDNN_TYPE_INT64] = sizeof(int64_t), + [CUDNN_TYPE_FLOAT] = sizeof(float), + [CUDNN_TYPE_DOUBLE] = sizeof(double), + [CUDNN_TYPE_VOID_PTR] = sizeof(void *), + [CUDNN_TYPE_CONVOLUTION_MODE] = sizeof(cudnnConvolutionMode_t), + [CUDNN_TYPE_HEUR_MODE] = sizeof(cudnnBackendHeurMode_t), + [CUDNN_TYPE_KNOB_TYPE] = sizeof(cudnnBackendKnobType_t), + [CUDNN_TYPE_NAN_PROPOGATION] = sizeof(cudnnNanPropagation_t), + [CUDNN_TYPE_NUMERICAL_NOTE] = sizeof(cudnnBackendNumericalNote_t), + [CUDNN_TYPE_LAYOUT_TYPE] = sizeof(cudnnBackendLayoutType_t), + [CUDNN_TYPE_ATTRIB_NAME] = sizeof(cudnnBackendAttributeName_t), + [CUDNN_TYPE_POINTWISE_MODE] = sizeof(cudnnPointwiseMode_t), + [CUDNN_TYPE_BACKEND_DESCRIPTOR] = sizeof(cudnnBackendDescriptor_t), + [CUDNN_TYPE_GENSTATS_MODE] = sizeof(cudnnGenStatsMode_t), + [CUDNN_TYPE_BN_FINALIZE_STATS_MODE] = sizeof(cudnnBnFinalizeStatsMode_t), + [CUDNN_TYPE_REDUCTION_OPERATOR_TYPE] = sizeof(cudnnReduceTensorOp_t), + [CUDNN_TYPE_BEHAVIOR_NOTE] = sizeof(cudnnBackendBehaviorNote_t), + [CUDNN_TYPE_TENSOR_REORDERING_MODE] = sizeof(cudnnBackendTensorReordering_t), + [CUDNN_TYPE_RESAMPLE_MODE] = sizeof(cudnnResampleMode_t), + [CUDNN_TYPE_PADDING_MODE] = sizeof(cudnnPaddingMode_t), + 
[CUDNN_TYPE_INT32] = sizeof(int32_t), + [CUDNN_TYPE_CHAR] = sizeof(char), + [CUDNN_TYPE_SIGNAL_MODE] = sizeof(cudnnSignalMode_t), + [CUDNN_TYPE_FRACTION] = sizeof(cudnnFraction_t), + [CUDNN_TYPE_NORM_MODE] = sizeof(cudnnBackendNormMode_t), + [CUDNN_TYPE_NORM_FWD_PHASE] = sizeof(cudnnBackendNormFwdPhase_t), + [CUDNN_TYPE_RNG_DISTRIBUTION] = sizeof(cudnnRngDistribution_t), +}; +cudnnStatus_t cudnnBackendSetAttribute(cudnnBackendDescriptor_t descriptor, + cudnnBackendAttributeName_t attributeName, + cudnnBackendAttributeType_t attributeType, + int64_t elementCount, + const void *arrayOfElements) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + LOGE(LOG_DEBUG, "%s(%p, %d, %d, %ld, %p)", __FUNCTION__, descriptor, attributeName, attributeType, elementCount, arrayOfElements); + if (attributeType > CUDNN_TYPE_RNG_DISTRIBUTION) { + LOGE(LOG_ERROR, "%s failed (attributeType is too large %d)", __FUNCTION__, attributeType); + return CUDNN_STATUS_BAD_PARAM; + } + mem_data data = { + .mem_data_len = elementCount * backendAttributeSizes[attributeType], + .mem_data_val = (char *)arrayOfElements + }; + enum clnt_stat retval_1; + retval_1 = rpc_cudnnbackendsetattribute_1( + (ptr)descriptor, + (int)attributeName, + (int)attributeType, + elementCount, + data, + &result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } + return result; +} + +cudnnStatus_t cudnnBackendGetAttribute(cudnnBackendDescriptor_t const descriptor, + cudnnBackendAttributeName_t attributeName, + cudnnBackendAttributeType_t attributeType, + int64_t requestedElementCount, + int64_t *elementCount, + void *arrayOfElements) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + mem_result result; + enum clnt_stat retval_1; + LOGE(LOG_DEBUG, "%s(%p, %d, %d, %ld, %p, %p)", __FUNCTION__, descriptor, attributeName, attributeType, requestedElementCount, elementCount, arrayOfElements); + size_t expected_size = requestedElementCount * backendAttributeSizes[attributeType] + sizeof(int64_t); + result.mem_result_u.data.mem_data_val = malloc(expected_size); + if (result.mem_result_u.data.mem_data_val == NULL) { + LOGE(LOG_ERROR, "%s failed (malloc failed)", __FUNCTION__); + return CUDNN_STATUS_ALLOC_FAILED; + } + retval_1 = rpc_cudnnbackendgetattribute_1( + (ptr)descriptor, + (int)attributeName, + (int)attributeType, + requestedElementCount, + &result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result.err != CUDNN_STATUS_SUCCESS || result.mem_result_u.data.mem_data_len != expected_size) { + LOGE(LOG_ERROR, "%s failed (result is %d, size is %zd, expected %zd)", __FUNCTION__, result.err, result.mem_result_u.data.mem_data_len, expected_size); + if (elementCount != NULL) { + *elementCount = 0; + } + } else { + if (elementCount != NULL) { + *elementCount = *(int64_t*)result.mem_result_u.data.mem_data_val; + LOGE(LOG_DEBUG, "elementCount = %ld", *elementCount); + } + if (arrayOfElements != NULL) { + memcpy(arrayOfElements, result.mem_result_u.data.mem_data_val + sizeof(int64_t), *elementCount * backendAttributeSizes[attributeType]); + } + } + return result.err; +} + +cudnnStatus_t cudnnBackendExecute(cudnnHandle_t handle, cudnnBackendDescriptor_t executionPlan, cudnnBackendDescriptor_t variantPack) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum 
clnt_stat retval_1; + LOGE(LOG_DEBUG, "%s(%p, %p, %p)", __FUNCTION__, handle, executionPlan, variantPack); + retval_1 = rpc_cudnnbackendexecute_1( + (ptr)handle, + (ptr)executionPlan, + (ptr)variantPack, + &result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1); + } + if (result != CUDNN_STATUS_SUCCESS) { + LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result); + } + return result; +} \ No newline at end of file diff --git a/cpu/cpu-client-driver.c b/cpu/cpu-client-driver.c index 06f908be..4d131737 100644 --- a/cpu/cpu-client-driver.c +++ b/cpu/cpu-client-driver.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -15,17 +16,19 @@ #include "cpu_rpc_prot.h" #include "cpu-common.h" #include "cpu-utils.h" +#include "cpu-elf2.h" //DEF_FN(CUresult, cuProfilerInitialize, const char*, configFile, const char*, outputFile, CUoutput_mode, outputMode) //DEF_FN(CUresult, cuProfilerStart) //DEF_FN(CUresult, cuProfilerStop) DEF_FN(CUresult, cuVDPAUGetDevice, CUdevice*, pDevice, VdpDevice, vdpDevice, VdpGetProcAddress*, vdpGetProcAddress) +#undef cuVDPAUCtxCreate DEF_FN(CUresult, cuVDPAUCtxCreate, CUcontext*, pCtx, unsigned int, flags, CUdevice, device, VdpDevice, vdpDevice, VdpGetProcAddress*, vdpGetProcAddress) DEF_FN(CUresult, cuGraphicsVDPAURegisterVideoSurface, CUgraphicsResource*, pCudaResource, VdpVideoSurface, vdpSurface, unsigned int, flags) DEF_FN(CUresult, cuGraphicsVDPAURegisterOutputSurface, CUgraphicsResource*, pCudaResource, VdpOutputSurface, vdpSurface, unsigned int, flags) -//DEF_FN(CUresult, cuDeviceTotalMem, size_t*, bytes, CUdevice, dev) +#undef cuDeviceTotalMem CUresult cuDeviceTotalMem(size_t* bytes, CUdevice dev) { enum clnt_stat retval; @@ -41,7 +44,7 @@ CUresult cuDeviceTotalMem(size_t* bytes, CUdevice dev) return result.err; } -//DEF_FN(CUresult, cuCtxCreate, CUcontext*, pctx, unsigned int, flags, CUdevice, dev) +#undef cuCtxCreate CUresult cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev) { DEF_FN_PTR(CUresult, CUcontext*, unsigned int, CUdevice); @@ -51,10 +54,12 @@ CUresult cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev) return ret; } DEF_FN(CUresult, cuCtxSynchronize) +#undef cuModuleGetGlobal DEF_FN(CUresult, cuModuleGetGlobal, CUdeviceptr*, dptr, size_t*, bytes, CUmodule, hmod, const char*, name) +#undef cuMemGetInfo DEF_FN(CUresult, cuMemGetInfo, size_t*, free, size_t*, total) -//DEF_FN(CUresult, cuMemAlloc, CUdeviceptr*, dptr, size_t, bytesize) +#undef cuMemAlloc CUresult cuMemAlloc(CUdeviceptr* dptr, size_t bytesize) { enum clnt_stat retval; @@ -71,30 +76,40 @@ CUresult cuMemAlloc(CUdeviceptr* dptr, size_t bytesize) return result.err; } +#undef cuMemAllocPitch DEF_FN(CUresult, cuMemAllocPitch, CUdeviceptr*, dptr, size_t*, pPitch, size_t, WidthInBytes, size_t, Height, unsigned int, ElementSizeBytes) +#undef cuMemFree DEF_FN(CUresult, cuMemFree, CUdeviceptr, dptr) +#undef cuMemGetAddressRange DEF_FN(CUresult, cuMemGetAddressRange, CUdeviceptr*, pbase, size_t*, psize, CUdeviceptr, dptr) +#undef cuMemHostGetDevicePointer DEF_FN(CUresult, cuMemHostGetDevicePointer, CUdeviceptr*, pdptr, void*, p, unsigned int, Flags) +#undef cuMemHostRegister DEF_FN(CUresult, cuMemHostRegister, void*, p, size_t, bytesize, unsigned int, Flags) +#undef cuMemsetD8 DEF_FN(CUresult, cuMemsetD8, CUdeviceptr, dstDevice, unsigned char, uc, size_t, N); DEF_FN(CUresult, cuMemsetD8_v2_ptds, CUdeviceptr, dstDevice, unsigned char, uc, size_t, N); +#undef cuMemsetD2D8 DEF_FN(CUresult, 
cuMemsetD2D8, CUdeviceptr, dstDevice, size_t, dstPitch, unsigned char, uc, size_t, Width, size_t, Height) DEF_FN(CUresult, cuMemsetD2D8_v2_ptds, CUdeviceptr, dstDevice, size_t, dstPitch, unsigned char, uc, size_t, Width, size_t, Height) +#undef cuEventDestroy DEF_FN(CUresult, cuEventDestroy, CUevent, hEvent) +#undef cuStreamDestroy DEF_FN(CUresult, cuStreamDestroy, CUstream, hStream) +#undef cuGLCtxCreate DEF_FN(CUresult, cuGLCtxCreate, CUcontext*, pCtx, unsigned int, Flags, CUdevice, device) +#undef cuArrayCreate DEF_FN(CUresult, cuArrayCreate, CUarray*, pHandle, const CUDA_ARRAY_DESCRIPTOR*, pAllocateArray) +#undef cuArrayGetDescriptor DEF_FN(CUresult, cuArrayGetDescriptor, CUDA_ARRAY_DESCRIPTOR*, pArrayDescriptor, CUarray, hArray) +#undef cuArray3DCreate DEF_FN(CUresult, cuArray3DCreate, CUarray*, pHandle, const CUDA_ARRAY3D_DESCRIPTOR*, pAllocateArray) +#undef cuArray3DGetDescriptor DEF_FN(CUresult, cuArray3DGetDescriptor, CUDA_ARRAY3D_DESCRIPTOR*, pArrayDescriptor, CUarray, hArray) +#undef cuTexRefSetAddress2D DEF_FN(CUresult, cuTexRefSetAddress2D, CUtexref, hTexRef, const CUDA_ARRAY_DESCRIPTOR*, desc, CUdeviceptr, dptr, size_t, Pitch) +#undef cuTexRefSetAddress DEF_FN(CUresult, cuTexRefSetAddress, size_t*, ByteOffset, CUtexref, hTexRef, CUdeviceptr, dptr, size_t, bytes) - - - - - - DEF_FN(CUresult, cuGLInit) #undef cuGLGetDevices #undef cuGLMapBufferObject_v2 @@ -212,7 +227,7 @@ CUresult cuDeviceGetUuid(CUuuid* uuid, CUdevice dev) } DEF_FN(CUresult, cuDeviceGetLuid, char*, luid, unsigned int*, deviceNodeMask, CUdevice, dev) -//DEF_FN(CUresult, cuDeviceGetAttribute, int*, pi, CUdevice_attribute, attrib, CUdevice, dev) + CUresult cuDeviceGetAttribute(int* pi, CUdevice_attribute attrib, CUdevice dev) { enum clnt_stat retval; @@ -227,9 +242,67 @@ CUresult cuDeviceGetAttribute(int* pi, CUdevice_attribute attrib, CUdevice dev) *pi = result.int_result_u.data; return result.err; } -DEF_FN(CUresult, cuDeviceGetProperties, CUdevprop*, prop, CUdevice, dev) + +CUresult cuDeviceGetProperties(CUdevprop* prop, CUdevice dev) +{ + enum clnt_stat retval; + mem_result result; + if (prop == NULL) { + LOGE(LOG_ERROR, "%s: prop is NULL", __FUNCTION__); + return CUDA_ERROR_INVALID_VALUE; + } + retval = rpc_cudevicegetproperties_1(dev, &result, clnt); + LOGE(LOG_DEBUG, "%s = %d, result len: %d", __FUNCTION__, result.err, + result.mem_result_u.data.mem_data_len); + if (retval != RPC_SUCCESS) { + fprintf(stderr, "[rpc] %s failed.", __FUNCTION__); + return CUDA_ERROR_UNKNOWN; + } + if (result.mem_result_u.data.mem_data_len != sizeof(CUdevprop)) { + LOGE(LOG_ERROR, "%s: size mismatch", __FUNCTION__); + return CUDA_ERROR_INVALID_VALUE; + } + if (memcpy(prop, result.mem_result_u.data.mem_data_val, sizeof(CUdevprop)) == NULL) { + LOGE(LOG_ERROR, "%s: memcpy failed", __FUNCTION__); + return CUDA_ERROR_UNKNOWN; + } + return result.err; +} +CUresult cuDeviceComputeCapability(int* major, int* minor, CUdevice dev) +{ + enum clnt_stat retval; + dint_result result; + if (major == NULL || minor == NULL) { + LOGE(LOG_ERROR, "%s: major or minor is NULL", __FUNCTION__); + return CUDA_ERROR_INVALID_VALUE; + } + retval = rpc_cudevicecomputecapability_1(dev, &result, clnt); + LOGE(LOG_DEBUG, "%s = %d, result %d, %d", __FUNCTION__, result.err, + result.dint_result_u.data.i1, + result.dint_result_u.data.i2); + if (retval != RPC_SUCCESS) { + fprintf(stderr, "[rpc] %s failed.", __FUNCTION__); + return CUDA_ERROR_UNKNOWN; + } + *major = result.dint_result_u.data.i1; + *minor = result.dint_result_u.data.i2; + return result.err; +} + 
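+/* The dint_result RPC type packs two integers (i1, i2) into one reply; cuDeviceComputeCapability above unpacks them as major/minor, and cuDevicePrimaryCtxGetState below unpacks them as flags/active. */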
DEF_FN(CUresult, cuDeviceGetByPCIBusId, CUdevice*, dev, const char*, pciBusId) -DEF_FN(CUresult, cuDeviceGetP2PAttribute, int*, value, CUdevice_P2PAttribute, attrib, CUdevice, srcDevice, CUdevice, dstDevice) +CUresult cuDeviceGetP2PAttribute ( int* value, CUdevice_P2PAttribute attrib, CUdevice srcDevice, CUdevice dstDevice ) +{ + enum clnt_stat retval; + int_result result; + if (value == NULL) { + LOGE(LOG_ERROR, "%s: value is NULL", __FUNCTION__); + return CUDA_ERROR_INVALID_VALUE; + } + retval = rpc_cudevicegetp2pattribute_1((int)attrib, (ptr)srcDevice, (ptr)dstDevice, &result, clnt); + LOGE(LOG_DEBUG, "[rpc] %s(%d, %d, %d) = %d, result %d", __FUNCTION__, attrib, srcDevice, dstDevice, result.err, result.int_result_u.data); + if (retval != RPC_SUCCESS) { + fprintf(stderr, "[rpc] %s failed.", __FUNCTION__); + return CUDA_ERROR_UNKNOWN; + } + *value = result.int_result_u.data; + return result.err; +} + //DEF_FN(CUresult, cuDriverGetVersion, int*, driverVersion) CUresult cuDriverGetVersion(int* driverVersion) { @@ -261,9 +334,31 @@ CUresult cuDevicePrimaryCtxRetain(CUcontext *pctx, CUdevice dev) *pctx = (CUcontext)result.ptr_result_u.ptr; return result.err; } +#undef cuDevicePrimaryCtxRelease DEF_FN(CUresult, cuDevicePrimaryCtxRelease, CUdevice, dev) +#undef cuDevicePrimaryCtxSetFlags DEF_FN(CUresult, cuDevicePrimaryCtxSetFlags, CUdevice, dev, unsigned int, flags) -DEF_FN(CUresult, cuDevicePrimaryCtxGetState, CUdevice, dev, unsigned int*, flags, int*, active) +CUresult cuDevicePrimaryCtxGetState(CUdevice dev, unsigned int* flags, int* active) +{ + enum clnt_stat retval; + dint_result result; + if (flags == NULL || active == NULL) { + LOGE(LOG_ERROR, "%s flags or active is NULL.", __FUNCTION__); + return CUDA_ERROR_INVALID_VALUE; + } + retval = rpc_cudeviceprimaryctxgetstate_1(dev, &result, clnt); + LOGE(LOG_DEBUG, "%s = %d, result %d %d", __FUNCTION__, result.err, + result.dint_result_u.data.i1, + result.dint_result_u.data.i2); + if (retval != RPC_SUCCESS) { + LOGE(LOG_ERROR, "%s failed.", __FUNCTION__); + return CUDA_ERROR_UNKNOWN; + } + *flags = result.dint_result_u.data.i1; + *active = result.dint_result_u.data.i2; + return result.err; +} +#undef cuDevicePrimaryCtxReset DEF_FN(CUresult, cuDevicePrimaryCtxReset, CUdevice, dev) DEF_FN(CUresult, cuCtxGetFlags, unsigned int*, flags) //DEF_FN(CUresult, cuCtxSetCurrent, CUcontext, ctx) @@ -344,8 +439,51 @@ CUresult cuModuleLoad(CUmodule* module, const char* fname) } return result.err; } -DEF_FN(CUresult, cuModuleLoadData, CUmodule*, module, const void*, image) -//DEF_FN(CUresult, cuModuleLoadDataEx, CUmodule*, module, const void*, image, unsigned int, numOptions, CUjit_option*, options, void**, optionValues) + + +CUresult cuModuleLoadData(CUmodule* module, const void* image) +{ + enum clnt_stat retval; + ptr_result result; + mem_data mem; + + if (image == NULL) { + LOGE(LOG_ERROR, "image is NULL!"); + return CUDA_ERROR_INVALID_IMAGE; + } + Elf64_Ehdr *ehdr = (Elf64_Ehdr*)image; + + if (ehdr->e_ident[EI_MAG0] != ELFMAG0 || + ehdr->e_ident[EI_MAG1] != ELFMAG1 || + ehdr->e_ident[EI_MAG2] != ELFMAG2 || + ehdr->e_ident[EI_MAG3] != ELFMAG3) { + LOGE(LOG_ERROR, "image is not an ELF!"); + return CUDA_ERROR_INVALID_IMAGE; + } + + mem.mem_data_len = ehdr->e_shoff + ehdr->e_shnum * ehdr->e_shentsize; + mem.mem_data_val = (uint8_t*)image; + + LOGE(LOG_DEBUG, "image_size = %#0zx", mem.mem_data_len); + + if (elf2_parameter_info(&kernel_infos, mem.mem_data_val, mem.mem_data_len) != 0) { + LOGE(LOG_ERROR, "could not get kernel infos from memory"); + return CUDA_ERROR_INVALID_IMAGE; + } + + retval = rpc_cumoduleloaddata_1(mem, &result, clnt); + printf("[rpc] %s(%p) = %d, result %p\n", __FUNCTION__, image, 
result.err, (void*)result.ptr_result_u.ptr); + if (retval != RPC_SUCCESS) { + fprintf(stderr, "[rpc] %s failed.", __FUNCTION__); + return CUDA_ERROR_UNKNOWN; + } + if (module != NULL) { + *module = (CUmodule)result.ptr_result_u.ptr; + } + return result.err; +} + +DEF_FN(CUresult, cuModuleLoadDataEx, CUmodule*, module, const void*, image, unsigned int, numOptions, CUjit_option*, options, void**, optionValues) DEF_FN(CUresult, cuModuleLoadFatBinary, CUmodule*, module, const void*, fatCubin) CUresult cuModuleUnload(CUmodule hmod) { @@ -360,7 +498,6 @@ CUresult cuModuleUnload(CUmodule hmod) } return result; } -//DEF_FN(CUresult, cuModuleGetFunction, CUfunction*, hfunc, CUmodule, hmod, const char*, name) CUresult cuModuleGetFunction(CUfunction* hfun, CUmodule hmod, const char* name) { enum clnt_stat retval; @@ -373,8 +510,8 @@ CUresult cuModuleGetFunction(CUfunction* hfun, CUmodule hmod, const char* name) return CUDA_ERROR_UNKNOWN; } *hfun = (CUfunction)result.ptr_result_u.ptr; - if ((info = cricketd_utils_search_info(&kernel_infos, (char*)name)) == NULL) { - LOGE(LOG_ERROR, "cannot find kernel %s kernel_info_t"); + if ((info = utils_search_info(&kernel_infos, (char*)name)) == NULL) { + LOGE(LOG_ERROR, "cannot find kernel %s kernel_info_t", name); return CUDA_ERROR_UNKNOWN; } info->host_fun = *hfun; @@ -402,6 +539,7 @@ DEF_FN(CUresult, cuPointerGetAttributes, unsigned int, numAttributes, CUpointer_ DEF_FN(CUresult, cuMemcpy, CUdeviceptr, dst, CUdeviceptr, src, size_t, ByteCount) DEF_FN(CUresult, cuMemcpy_ptds, CUdeviceptr, dst, CUdeviceptr, src, size_t, ByteCount) //DEF_FN(CUresult, cuMemcpyHtoD, CUdeviceptr, dstDevice, const void*, srcHost, size_t, ByteCount) +#undef cuMemcpyHtoD CUresult cuMemcpyHtoD(CUdeviceptr dstDevice, const void* srcHost, size_t ByteCount) { enum clnt_stat retval; @@ -418,34 +556,51 @@ CUresult cuMemcpyHtoD(CUdeviceptr dstDevice, const void* srcHost, size_t ByteCou return result; } DEF_FN(CUresult, cuMemcpyHtoD_v2_ptds, CUdeviceptr, dstDevice, const void*, srcHost, size_t, ByteCount) +#undef cuMemcpyDtoH DEF_FN(CUresult, cuMemcpyDtoH, void*, dstHost, CUdeviceptr, srcDevice, size_t, ByteCount) DEF_FN(CUresult, cuMemcpyDtoH_v2_ptds, void*, dstHost, CUdeviceptr, srcDevice, size_t, ByteCount) +#undef cuMemcpyDtoD DEF_FN(CUresult, cuMemcpyDtoD, CUdeviceptr, dstDevice, CUdeviceptr, srcDevice, size_t, ByteCount) DEF_FN(CUresult, cuMemcpyDtoD_v2_ptds, CUdeviceptr, dstDevice, CUdeviceptr, srcDevice, size_t, ByteCount) +#undef cuMemcpyDtoA DEF_FN(CUresult, cuMemcpyDtoA, CUarray, dstArray, size_t, dstOffset, CUdeviceptr, srcDevice, size_t, ByteCount) +#undef cuMemcpyAtoD DEF_FN(CUresult, cuMemcpyAtoD, CUdeviceptr, dstDevice, CUarray, srcArray, size_t, srcOffset, size_t, ByteCount) +#undef cuMemcpyHtoA DEF_FN(CUresult, cuMemcpyHtoA, CUarray, dstArray, size_t, dstOffset, const void*, srcHost, size_t, ByteCount) +#undef cuMemcpyAtoH DEF_FN(CUresult, cuMemcpyAtoH, void*, dstHost, CUarray, srcArray, size_t, srcOffset, size_t, ByteCount) +#undef cuMemcpyAtoA DEF_FN(CUresult, cuMemcpyAtoA, CUarray, dstArray, size_t, dstOffset, CUarray, srcArray, size_t, srcOffset, size_t, ByteCount) +#undef cuMemcpy2D DEF_FN(CUresult, cuMemcpy2D, const CUDA_MEMCPY2D*, pCopy) +#undef cuMemcpy2DUnaligned DEF_FN(CUresult, cuMemcpy2DUnaligned, const CUDA_MEMCPY2D*, pCopy) DEF_FN(CUresult, cuMemcpy2DUnaligned_v2_ptds, const CUDA_MEMCPY2D*, pCopy) +#undef cuMemcpy3D DEF_FN(CUresult, cuMemcpy3D, const CUDA_MEMCPY3D*, pCopy) DEF_FN(CUresult, cuMemcpy3D_v2_ptds, const CUDA_MEMCPY3D*, pCopy) DEF_FN(CUresult, 
cuMemcpyPeerAsync, CUdeviceptr, dstDevice, CUcontext, dstContext, CUdeviceptr, srcDevice, CUcontext, srcContext, size_t, ByteCount, CUstream, hStream) DEF_FN(CUresult, cuMemcpyPeerAsync_ptsz, CUdeviceptr, dstDevice, CUcontext, dstContext, CUdeviceptr, srcDevice, CUcontext, srcContext, size_t, ByteCount, CUstream, hStream) +#undef cuMemcpyHtoAAsync DEF_FN(CUresult, cuMemcpyHtoAAsync, CUarray, dstArray, size_t, dstOffset, const void*, srcHost, size_t, ByteCount, CUstream, hStream) +#undef cuMemcpyAtoHAsync DEF_FN(CUresult, cuMemcpyAtoHAsync, void*, dstHost, CUarray, srcArray, size_t, srcOffset, size_t, ByteCount, CUstream, hStream) DEF_FN(CUresult, cuMemcpy3DPeerAsync, const CUDA_MEMCPY3D_PEER*, pCopy, CUstream, hStream) DEF_FN(CUresult, cuMemcpy3DPeerAsync_ptsz, const CUDA_MEMCPY3D_PEER*, pCopy, CUstream, hStream) +#undef cuMemcpyHtoDAsync DEF_FN(CUresult, cuMemcpyHtoDAsync, CUdeviceptr, dstDevice, const void*, srcHost, size_t, ByteCount, CUstream, hStream) DEF_FN(CUresult, cuMemcpyHtoDAsync_v2_ptsz, CUdeviceptr, dstDevice, const void*, srcHost, size_t, ByteCount, CUstream, hStream) +#undef cuMemcpyDtoHAsync DEF_FN(CUresult, cuMemcpyDtoHAsync, void*, dstHost, CUdeviceptr, srcDevice, size_t, ByteCount, CUstream, hStream) DEF_FN(CUresult, cuMemcpyDtoHAsync_v2_ptsz, void*, dstHost, CUdeviceptr, srcDevice, size_t, ByteCount, CUstream, hStream) +#undef cuMemcpyDtoDAsync DEF_FN(CUresult, cuMemcpyDtoDAsync, CUdeviceptr, dstDevice, CUdeviceptr, srcDevice, size_t, ByteCount, CUstream, hStream) DEF_FN(CUresult, cuMemcpyDtoDAsync_v2_ptsz, CUdeviceptr, dstDevice, CUdeviceptr, srcDevice, size_t, ByteCount, CUstream, hStream) +#undef cuMemcpy2DAsync DEF_FN(CUresult, cuMemcpy2DAsync, const CUDA_MEMCPY2D*, pCopy, CUstream, hStream) DEF_FN(CUresult, cuMemcpy2DAsync_v2_ptsz, const CUDA_MEMCPY2D*, pCopy, CUstream, hStream) +#undef cuMemcpy3DAsync DEF_FN(CUresult, cuMemcpy3DAsync, const CUDA_MEMCPY3D*, pCopy, CUstream, hStream) DEF_FN(CUresult, cuMemcpy3DAsync_v2_ptsz, const CUDA_MEMCPY3D*, pCopy, CUstream, hStream) DEF_FN(CUresult, cuMemcpyAsync, CUdeviceptr, dst, CUdeviceptr, src, size_t, ByteCount, CUstream, hStream) @@ -567,14 +722,19 @@ DEF_FN(CUresult, cuEventRecord_ptsz, CUevent, hEvent, CUstream, hStream) DEF_FN(CUresult, cuEventQuery, CUevent, hEvent) DEF_FN(CUresult, cuEventSynchronize, CUevent, hEvent) DEF_FN(CUresult, cuEventElapsedTime, float*, pMilliseconds, CUevent, hStart, CUevent, hEnd) +#undef cuStreamWaitValue32 DEF_FN(CUresult, cuStreamWaitValue32, CUstream, stream, CUdeviceptr, addr, cuuint32_t, value, unsigned int, flags) DEF_FN(CUresult, cuStreamWaitValue32_ptsz, CUstream, stream, CUdeviceptr, addr, cuuint32_t, value, unsigned int, flags) +#undef cuStreamWriteValue32 DEF_FN(CUresult, cuStreamWriteValue32, CUstream, stream, CUdeviceptr, addr, cuuint32_t, value, unsigned int, flags) DEF_FN(CUresult, cuStreamWriteValue32_ptsz, CUstream, stream, CUdeviceptr, addr, cuuint32_t, value, unsigned int, flags) +#undef cuStreamWaitValue64 DEF_FN(CUresult, cuStreamWaitValue64, CUstream, stream, CUdeviceptr, addr, cuuint64_t, value, unsigned int, flags) DEF_FN(CUresult, cuStreamWaitValue64_ptsz, CUstream, stream, CUdeviceptr, addr, cuuint64_t, value, unsigned int, flags) +#undef cuStreamWriteValue64 DEF_FN(CUresult, cuStreamWriteValue64, CUstream, stream, CUdeviceptr, addr, cuuint64_t, value, unsigned int, flags) DEF_FN(CUresult, cuStreamWriteValue64_ptsz, CUstream, stream, CUdeviceptr, addr, cuuint64_t, value, unsigned int, flags) +#undef cuStreamBatchMemOp DEF_FN(CUresult, cuStreamBatchMemOp, 
CUstream, stream, unsigned int, count, CUstreamBatchMemOpParams*, paramArray, unsigned int, flags) DEF_FN(CUresult, cuStreamBatchMemOp_ptsz, CUstream, stream, unsigned int, count, CUstreamBatchMemOpParams*, paramArray, unsigned int, flags) DEF_FN(CUresult, cuStreamCreate, CUstream*, phStream, unsigned int, Flags) @@ -600,6 +760,7 @@ DEF_FN(CUresult, cuCtxDisablePeerAccess, CUcontext, peerContext) DEF_FN(CUresult, cuIpcGetEventHandle, CUipcEventHandle*, pHandle, CUevent, event) DEF_FN(CUresult, cuIpcOpenEventHandle, CUevent*, phEvent, CUipcEventHandle, handle) DEF_FN(CUresult, cuIpcGetMemHandle, CUipcMemHandle*, pHandle, CUdeviceptr, dptr) +#undef cuIpcOpenMemHandle DEF_FN(CUresult, cuIpcOpenMemHandle, CUdeviceptr*, pdptr, CUipcMemHandle, handle, unsigned int, Flags) DEF_FN(CUresult, cuIpcCloseMemHandle, CUdeviceptr, dptr) DEF_FN(CUresult, cuGraphicsUnregisterResource, CUgraphicsResource, resource) @@ -609,7 +770,9 @@ DEF_FN(CUresult, cuGraphicsUnmapResources, unsigned int, count, CUgraphicsResour DEF_FN(CUresult, cuGraphicsUnmapResources_ptsz, unsigned int, count, CUgraphicsResource*, resources, CUstream, hStream) DEF_FN(CUresult, cuGraphicsSubResourceGetMappedArray, CUarray*, pArray, CUgraphicsResource, resource, unsigned int, arrayIndex, unsigned int, mipLevel) DEF_FN(CUresult, cuGraphicsResourceGetMappedMipmappedArray, CUmipmappedArray*, pMipmappedArray, CUgraphicsResource, resource) +#undef cuGraphicsResourceGetMappedPointer DEF_FN(CUresult, cuGraphicsResourceGetMappedPointer, CUdeviceptr*, pDevPtr, size_t*, pSize, CUgraphicsResource, resource) +#undef cuGraphicsResourceSetMapFlags DEF_FN(CUresult, cuGraphicsResourceSetMapFlags, CUgraphicsResource, resource, unsigned int, flags) //DEF_FN(CUresult, cuGetExportTable, const void**, ppExportTable, const CUuuid*, pExportTableId) @@ -672,8 +835,11 @@ CUresult cuGetErrorString(CUresult error, const char** pStr) } DEF_FN(CUresult, cuGetErrorName, CUresult, error, const char**, pStr) DEF_FN(CUresult, cuGraphCreate, CUgraph*, phGraph, unsigned int, flags) +#undef cuGraphAddKernelNode DEF_FN(CUresult, cuGraphAddKernelNode, CUgraphNode*, phGraphNode, CUgraph, hGraph, const CUgraphNode*, dependencies, size_t, numDependencies, const CUDA_KERNEL_NODE_PARAMS*, nodeParams) +#undef cuGraphKernelNodeGetParams DEF_FN(CUresult, cuGraphKernelNodeGetParams, CUgraphNode, hNode, CUDA_KERNEL_NODE_PARAMS*, nodeParams) +#undef cuGraphKernelNodeSetParams DEF_FN(CUresult, cuGraphKernelNodeSetParams, CUgraphNode, hNode, const CUDA_KERNEL_NODE_PARAMS*, nodeParams) DEF_FN(CUresult, cuGraphAddMemcpyNode, CUgraphNode*, phGraphNode, CUgraph, hGraph, const CUgraphNode*, dependencies, size_t, numDependencies, const CUDA_MEMCPY3D*, copyParams, CUcontext, ctx) DEF_FN(CUresult, cuGraphMemcpyNodeGetParams, CUgraphNode, hNode, CUDA_MEMCPY3D*, nodeParams) @@ -697,7 +863,12 @@ DEF_FN(CUresult, cuGraphNodeGetDependencies, CUgraphNode, hNode, CUgraphNode*, d DEF_FN(CUresult, cuGraphNodeGetDependentNodes, CUgraphNode, hNode, CUgraphNode*, dependentNodes, size_t*, numDependentNodes) DEF_FN(CUresult, cuGraphAddDependencies, CUgraph, hGraph, const CUgraphNode*, from, const CUgraphNode*, to, size_t, numDependencies) DEF_FN(CUresult, cuGraphRemoveDependencies, CUgraph, hGraph, const CUgraphNode*, from, const CUgraphNode*, to, size_t, numDependencies) +#if CUDA_VERSION >= 12000 +#undef cuGraphInstantiate +DEF_FN(CUresult, cuGraphInstantiate, CUgraphExec*, phGraphExec, CUgraph, hGraph, unsigned long long, flags) +#else DEF_FN(CUresult, cuGraphInstantiate, CUgraphExec*, phGraphExec, 
CUgraph, hGraph, CUgraphNode*, phErrorNode, char*, logBuffer, size_t, bufferSize) +#endif DEF_FN(CUresult, cuGraphLaunch, CUgraphExec, hGraphExec, CUstream, hStream) DEF_FN(CUresult, cuGraphLaunch_ptsz, CUgraphExec, hGraphExec, CUstream, hStream) DEF_FN(CUresult, cuGraphExecDestroy, CUgraphExec, hGraphExec) @@ -705,7 +876,6 @@ DEF_FN(CUresult, cuGraphDestroyNode, CUgraphNode, hNode) DEF_FN(CUresult, cuGraphDestroy, CUgraph, hGraph) DEF_FN(CUresult, cuGraphDestroy_ptsz, CUgraph, hGraph) DEF_FN(CUresult, cuStreamBeginCapture_ptsz, CUstream, hStream) -DEF_FN(CUresult, cuStreamBeginCapture, CUstream, hStream, CUstreamCaptureMode, mode) #undef cuStreamBeginCapture DEF_FN(CUresult, cuStreamBeginCapture, CUstream, hStream, CUstreamCaptureMode, mode) DEF_FN(CUresult, cuStreamBeginCapture_v2_ptsz, CUstream, hStream) @@ -714,6 +884,30 @@ DEF_FN(CUresult, cuStreamEndCapture_ptsz, CUstream, hStream, CUgraph*, phGraph) DEF_FN(CUresult, cuStreamIsCapturing, CUstream, hStream, CUstreamCaptureStatus*, captureStatus) DEF_FN(CUresult, cuStreamIsCapturing_ptsz, CUstream, hStream, CUstreamCaptureStatus*, captureStatus) DEF_FN(CUresult, cuThreadExchangeStreamCaptureMode, CUstreamCaptureMode*, mode) -DEF_FN(CUresult, cuStreamGetCaptureInfo, CUstream, hStream, CUstreamCaptureStatus*, captureStatus, cuuint64_t*, id) +#undef cuStreamGetCaptureInfo +DEF_FN(CUresult, cuStreamGetCaptureInfo, CUstream, hStream, CUstreamCaptureStatus*, captureStatus_out, cuuint64_t*, id_out, CUgraph*, graph_out, const CUgraphNode**, dependencies_out, size_t*, numDependencies_out) DEF_FN(CUresult, cuStreamGetCaptureInfo_ptsz, CUstream, hStream, CUstreamCaptureStatus*, captureStatus, cuuint64_t*, id) +#undef cuGraphExecKernelNodeSetParams DEF_FN(CUresult, cuGraphExecKernelNodeSetParams, CUgraphExec, hGraphExec, CUgraphNode, hNode, const CUDA_KERNEL_NODE_PARAMS*, nodeParams) + +#if CUDA_VERSION >= 12000 +#undef cuGetProcAddress +CUresult cuGetProcAddress(const char* symbol, void** pfn, int cudaVersion, cuuint64_t flags, CUdriverProcAddressQueryResult* symbolStatus) +{ + enum clnt_stat retval; + ptr_result result; + LOGE(LOG_DEBUG, "%s(%s, %d, %llx)", __FUNCTION__, symbol, cudaVersion, flags); + + *pfn = elf2_symbol_address(symbol); + if (*pfn == NULL) { + LOGE(LOG_WARNING, "symbol %s not found.", symbol); + return CUDA_ERROR_UNKNOWN; + } + // Pytorch uses the 11.3 API of this function which does not have the symbolStatus parameter + // Because we do not support API versioning yet and to avoid segfaults, we ignore this parameter for now. 
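+ // Note: the address is resolved locally by elf2_symbol_address() without an RPC round-trip; unknown symbols return CUDA_ERROR_UNKNOWN above.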
+ //*symbolStatus = CU_GET_PROC_ADDRESS_VERSION_NOT_SUFFICIENT; + return cudaSuccess; +} +#endif + + diff --git a/cpu/cpu-client-nvml.c b/cpu/cpu-client-nvml.c new file mode 100644 index 00000000..29f86380 --- /dev/null +++ b/cpu/cpu-client-nvml.c @@ -0,0 +1,211 @@ +#define _GNU_SOURCE +#include +#include +#include + +#include "cpu-libwrap.h" +#include "cpu_rpc_prot.h" +#include "cpu-common.h" +#include "cpu-utils.h" +#include "log.h" + +#ifdef WITH_API_CNT +static int api_call_cnt = 0; +void cpu_nvml_print_api_call_cnt(void) +{ + LOG(LOG_INFO, "nvml api-call-cnt: %d", api_call_cnt); +} +#endif //WITH_API_CNT + +nvmlReturn_t nvmlInitWithFlags ( unsigned int flags ) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + retval_1 = rpc_nvmlinitwithflags_1(flags, &result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "call failed: %s", __FUNCTION__); + return result; + } + return result; +} + +#undef nvmlInit +nvmlReturn_t nvmlInit(void) +{ + return nvmlInitWithFlags(0); +} + +nvmlReturn_t nvmlInit_v2 ( void ) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + retval_1 = rpc_nvmlinit_v2_1(&result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "call failed: %s", __FUNCTION__); + return result; + } + return result; +} +nvmlReturn_t nvmlShutdown ( void ) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval_1; + retval_1 = rpc_nvmlshutdown_1(&result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "call failed: %s", __FUNCTION__); + return result; + } + return result; +} + + +DEF_FN(nvmlReturn_t, nvmlDeviceGetAPIRestriction, nvmlDevice_t, device, nvmlRestrictedAPI_t, apiType, nvmlEnableState_t*, isRestricted ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetAdaptiveClockInfoStatus, nvmlDevice_t, device, unsigned int*, adaptiveClockStatus ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetApplicationsClock, nvmlDevice_t, device, nvmlClockType_t, clockType, unsigned int*, clockMHz ) +#if NVML_API_VERSION >= 12 +DEF_FN(nvmlReturn_t, nvmlDeviceGetArchitecture, nvmlDevice_t, device, nvmlDeviceArchitecture_t*, arch ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetAttributes_v2, nvmlDevice_t, device, nvmlDeviceAttributes_t*, attributes ) +#endif +DEF_FN(nvmlReturn_t, nvmlDeviceGetAutoBoostedClocksEnabled, nvmlDevice_t, device, nvmlEnableState_t*, isEnabled, nvmlEnableState_t*, defaultIsEnabled ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetBAR1MemoryInfo, nvmlDevice_t, device, nvmlBAR1Memory_t*, bar1Memory ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetBoardId, nvmlDevice_t, device, unsigned int*, boardId ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetBoardPartNumber, nvmlDevice_t, device, char*, partNumber, unsigned int, length ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetBrand, nvmlDevice_t, device, nvmlBrandType_t*, type ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetBridgeChipInfo, nvmlDevice_t, device, nvmlBridgeChipHierarchy_t*, bridgeHierarchy ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetClock, nvmlDevice_t, device, nvmlClockType_t, clockType, nvmlClockId_t, clockId, unsigned int*, clockMHz ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetClockInfo, nvmlDevice_t, device, nvmlClockType_t, type, unsigned int*, clock ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetComputeMode, nvmlDevice_t, device, nvmlComputeMode_t*, mode ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetComputeRunningProcesses_v3, nvmlDevice_t, device, unsigned int*, infoCount, nvmlProcessInfo_t*, infos ) +nvmlReturn_t nvmlDeviceGetCount_v2(unsigned int* deviceCount ) 
+{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int_result result; + enum clnt_stat retval_1; + if (deviceCount == NULL) { + return NVML_ERROR_INVALID_ARGUMENT; + } + retval_1 = rpc_nvmldevicegetcount_v2_1(&result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "call failed: %s", __FUNCTION__); + } + if (result.err == 0) { + *deviceCount = result.int_result_u.data; + } + return result.err; +} +DEF_FN(nvmlReturn_t, nvmlDeviceGetCudaComputeCapability, nvmlDevice_t, device, int*, major, int*, minor ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetCurrPcieLinkGeneration, nvmlDevice_t, device, unsigned int*, currLinkGen ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetCurrPcieLinkWidth, nvmlDevice_t, device, unsigned int*, currLinkWidth ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetCurrentClocksThrottleReasons, nvmlDevice_t, device, unsigned long long*, clocksThrottleReasons ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetDecoderUtilization, nvmlDevice_t, device, unsigned int*, utilization, unsigned int*, samplingPeriodUs ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetDefaultApplicationsClock, nvmlDevice_t, device, nvmlClockType_t, clockType, unsigned int*, clockMHz ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetDefaultEccMode, nvmlDevice_t, device, nvmlEnableState_t*, defaultMode ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetDetailedEccErrors, nvmlDevice_t, device, nvmlMemoryErrorType_t, errorType, nvmlEccCounterType_t, counterType, nvmlEccErrorCounts_t*, eccCounts ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetDisplayActive, nvmlDevice_t, device, nvmlEnableState_t*, isActive ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetDisplayMode, nvmlDevice_t, device, nvmlEnableState_t*, display ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetDriverModel, nvmlDevice_t, device, nvmlDriverModel_t*, current, nvmlDriverModel_t*, pending ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetEccMode, nvmlDevice_t, device, nvmlEnableState_t*, current, nvmlEnableState_t*, pending ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetEncoderCapacity, nvmlDevice_t, device, nvmlEncoderType_t, encoderQueryType, unsigned int*, encoderCapacity ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetEncoderSessions, nvmlDevice_t, device, unsigned int*, sessionCount, nvmlEncoderSessionInfo_t*, sessionInfos ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetEncoderStats, nvmlDevice_t, device, unsigned int*, sessionCount, unsigned int*, averageFps, unsigned int*, averageLatency ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetEncoderUtilization, nvmlDevice_t, device, unsigned int*, utilization, unsigned int*, samplingPeriodUs ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetEnforcedPowerLimit, nvmlDevice_t, device, unsigned int*, limit ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetFBCSessions, nvmlDevice_t, device, unsigned int*, sessionCount, nvmlFBCSessionInfo_t*, sessionInfo ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetFBCStats, nvmlDevice_t, device, nvmlFBCStats_t*, fbcStats ) +#if NVML_API_VERSION >= 12 +DEF_FN(nvmlReturn_t, nvmlDeviceGetFanControlPolicy_v2, nvmlDevice_t, device, unsigned int, fan, nvmlFanControlPolicy_t*, policy ) +#endif +DEF_FN(nvmlReturn_t, nvmlDeviceGetFanSpeed, nvmlDevice_t, device, unsigned int*, speed ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetFanSpeed_v2, nvmlDevice_t, device, unsigned int, fan, unsigned int*, speed ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetGpuMaxPcieLinkGeneration, nvmlDevice_t, device, unsigned int*, maxLinkGenDevice ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetGpuOperationMode, nvmlDevice_t, device, nvmlGpuOperationMode_t*, current, nvmlGpuOperationMode_t*, pending ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetGraphicsRunningProcesses_v3, nvmlDevice_t, device, unsigned int*, 
infoCount, nvmlProcessInfo_t*, infos ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetHandleByIndex_v2, unsigned int, index, nvmlDevice_t*, device ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetHandleByPciBusId_v2, const char*, pciBusId, nvmlDevice_t*, device ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetHandleBySerial, const char*, serial, nvmlDevice_t*, device ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetHandleByUUID, const char*, uuid, nvmlDevice_t*, device ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetIndex, nvmlDevice_t, device, unsigned int*, index ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetInforomConfigurationChecksum, nvmlDevice_t, device, unsigned int*, checksum ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetInforomImageVersion, nvmlDevice_t, device, char*, version, unsigned int, length ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetInforomVersion, nvmlDevice_t, device, nvmlInforomObject_t, object, char*, version, unsigned int, length ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetIrqNum, nvmlDevice_t, device, unsigned int*, irqNum ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetMPSComputeRunningProcesses_v3, nvmlDevice_t, device, unsigned int*, infoCount, nvmlProcessInfo_t*, infos ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetMaxClockInfo, nvmlDevice_t, device, nvmlClockType_t, type, unsigned int*, clock ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetMaxCustomerBoostClock, nvmlDevice_t, device, nvmlClockType_t, clockType, unsigned int*, clockMHz ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetMaxPcieLinkGeneration, nvmlDevice_t, device, unsigned int*, maxLinkGen ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetMaxPcieLinkWidth, nvmlDevice_t, device, unsigned int*, maxLinkWidth ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetMemoryBusWidth, nvmlDevice_t, device, unsigned int*, busWidth ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetMemoryErrorCounter, nvmlDevice_t, device, nvmlMemoryErrorType_t, errorType, nvmlEccCounterType_t, counterType, nvmlMemoryLocation_t, locationType, unsigned long long*, count ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetMemoryInfo, nvmlDevice_t, device, nvmlMemory_t*, memory ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetMinMaxFanSpeed, nvmlDevice_t, device, unsigned int*, minSpeed, unsigned int*, maxSpeed ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetMinorNumber, nvmlDevice_t, device, unsigned int*, minorNumber ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetMultiGpuBoard, nvmlDevice_t, device, unsigned int*, multiGpuBool ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetName, nvmlDevice_t, device, char*, name, unsigned int, length ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetNumFans, nvmlDevice_t, device, unsigned int*, numFans ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetNumGpuCores, nvmlDevice_t, device, unsigned int*, numCores ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetP2PStatus, nvmlDevice_t, device1, nvmlDevice_t, device2, nvmlGpuP2PCapsIndex_t, p2pIndex, nvmlGpuP2PStatus_t*, p2pStatus ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetPciInfo_v3, nvmlDevice_t, device, nvmlPciInfo_t*, pci ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetPcieLinkMaxSpeed, nvmlDevice_t, device, unsigned int*, maxSpeed ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetPcieReplayCounter, nvmlDevice_t, device, unsigned int*, value ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetPcieSpeed, nvmlDevice_t, device, unsigned int*, pcieSpeed ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetPcieThroughput, nvmlDevice_t, device, nvmlPcieUtilCounter_t, counter, unsigned int*, value ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetPerformanceState, nvmlDevice_t, device, nvmlPstates_t*, pState ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetPersistenceMode, nvmlDevice_t, device, nvmlEnableState_t*, mode ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetPowerManagementDefaultLimit, nvmlDevice_t, device, 
unsigned int*, defaultLimit ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetPowerManagementLimit, nvmlDevice_t, device, unsigned int*, limit ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetPowerManagementLimitConstraints, nvmlDevice_t, device, unsigned int*, minLimit, unsigned int*, maxLimit ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetPowerManagementMode, nvmlDevice_t, device, nvmlEnableState_t*, mode ) +#if NVML_API_VERSION >= 12 +DEF_FN(nvmlReturn_t, nvmlDeviceGetPowerSource, nvmlDevice_t, device, nvmlPowerSource_t*, powerSource ) +#endif +DEF_FN(nvmlReturn_t, nvmlDeviceGetPowerState, nvmlDevice_t, device, nvmlPstates_t*, pState ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetPowerUsage, nvmlDevice_t, device, unsigned int*, power ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetRemappedRows, nvmlDevice_t, device, unsigned int*, corrRows, unsigned int*, uncRows, unsigned int*, isPending, unsigned int*, failureOccurred ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetRetiredPages, nvmlDevice_t, device, nvmlPageRetirementCause_t, cause, unsigned int*, pageCount, unsigned long long*, addresses ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetRetiredPagesPendingStatus, nvmlDevice_t, device, nvmlEnableState_t*, isPending ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetRetiredPages_v2, nvmlDevice_t, device, nvmlPageRetirementCause_t, cause, unsigned int*, pageCount, unsigned long long*, addresses, unsigned long long*, timestamps ) +#if NVML_API_VERSION >= 12 +DEF_FN(nvmlReturn_t, nvmlDeviceGetRowRemapperHistogram, nvmlDevice_t, device, nvmlRowRemapperHistogramValues_t*, values ) +#endif +DEF_FN(nvmlReturn_t, nvmlDeviceGetSamples, nvmlDevice_t, device, nvmlSamplingType_t, type, unsigned long long, lastSeenTimeStamp, nvmlValueType_t*, sampleValType, unsigned int*, sampleCount, nvmlSample_t*, samples ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetSerial, nvmlDevice_t, device, char*, serial, unsigned int, length ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetSupportedClocksThrottleReasons, nvmlDevice_t, device, unsigned long long*, supportedClocksThrottleReasons ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetSupportedGraphicsClocks, nvmlDevice_t, device, unsigned int, memoryClockMHz, unsigned int*, count, unsigned int*, clocksMHz ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetSupportedMemoryClocks, nvmlDevice_t, device, unsigned int*, count, unsigned int*, clocksMHz ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetTargetFanSpeed, nvmlDevice_t, device, unsigned int, fan, unsigned int*, targetSpeed ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetTemperature, nvmlDevice_t, device, nvmlTemperatureSensors_t, sensorType, unsigned int*, temp ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetTemperatureThreshold, nvmlDevice_t, device, nvmlTemperatureThresholds_t, thresholdType, unsigned int*, temp ) +#if NVML_API_VERSION >= 12 +DEF_FN(nvmlReturn_t, nvmlDeviceGetThermalSettings, nvmlDevice_t, device, unsigned int, sensorIndex, nvmlGpuThermalSettings_t*, pThermalSettings ) +#endif +DEF_FN(nvmlReturn_t, nvmlDeviceGetTopologyCommonAncestor, nvmlDevice_t, device1, nvmlDevice_t, device2, nvmlGpuTopologyLevel_t*, pathInfo ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetTopologyNearestGpus, nvmlDevice_t, device, nvmlGpuTopologyLevel_t, level, unsigned int*, count, nvmlDevice_t*, deviceArray ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetTotalEccErrors, nvmlDevice_t, device, nvmlMemoryErrorType_t, errorType, nvmlEccCounterType_t, counterType, unsigned long long*, eccCounts ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetTotalEnergyConsumption, nvmlDevice_t, device, unsigned long long*, energy ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetUUID, nvmlDevice_t, device, char*, uuid, unsigned int, length ) 
+DEF_FN(nvmlReturn_t, nvmlDeviceGetUtilizationRates, nvmlDevice_t, device, nvmlUtilization_t*, utilization ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetVbiosVersion, nvmlDevice_t, device, char*, version, unsigned int, length ) +DEF_FN(nvmlReturn_t, nvmlDeviceGetViolationStatus, nvmlDevice_t, device, nvmlPerfPolicyType_t, perfPolicyType, nvmlViolationTime_t*, violTime ) +DEF_FN(nvmlReturn_t, nvmlDeviceOnSameBoard, nvmlDevice_t, device1, nvmlDevice_t, device2, int*, onSameBoard ) +DEF_FN(nvmlReturn_t, nvmlDeviceResetApplicationsClocks, nvmlDevice_t, device ) +DEF_FN(nvmlReturn_t, nvmlDeviceSetAutoBoostedClocksEnabled, nvmlDevice_t, device, nvmlEnableState_t, enabled ) +DEF_FN(nvmlReturn_t, nvmlDeviceSetDefaultAutoBoostedClocksEnabled, nvmlDevice_t, device, nvmlEnableState_t, enabled, unsigned int, flags ) +DEF_FN(nvmlReturn_t, nvmlDeviceSetDefaultFanSpeed_v2, nvmlDevice_t, device, unsigned int, fan ) +#if NVML_API_VERSION >= 12 +DEF_FN(nvmlReturn_t, nvmlDeviceSetFanControlPolicy, nvmlDevice_t, device, unsigned int, fan, nvmlFanControlPolicy_t, policy ) +#endif +DEF_FN(nvmlReturn_t, nvmlDeviceSetTemperatureThreshold, nvmlDevice_t, device, nvmlTemperatureThresholds_t, thresholdType, int*, temp ) +DEF_FN(nvmlReturn_t, nvmlDeviceValidateInforom, nvmlDevice_t, device ) +DEF_FN(nvmlReturn_t, nvmlSystemGetTopologyGpuSet, unsigned int, cpuNumber, unsigned int*, count, nvmlDevice_t*, deviceArray ) +DEF_FN(nvmlReturn_t, nvmlVgpuInstanceGetMdevUUID, nvmlVgpuInstance_t, vgpuInstance, char*, mdevUuid, unsigned int, size ) diff --git a/cpu/cpu-client-runtime.c b/cpu/cpu-client-runtime.c index 373993a3..cbd1eab0 100644 --- a/cpu/cpu-client-runtime.c +++ b/cpu/cpu-client-runtime.c @@ -1,4 +1,3 @@ -#include "mt-memcpy.h" #define _GNU_SOURCE #include #include @@ -24,6 +23,7 @@ #include "cpu-utils.h" #include "log.h" #include "oob.h" +#include "mt-memcpy.h" #ifdef WITH_IB #include "cpu-ib.h" #endif //WITH_IB @@ -269,12 +269,12 @@ cudaError_t cudaDeviceSynchronize(void) #endif //WITH_API_CNT int result = -1; enum clnt_stat retval_1; - for (int i=0; result != 0 && i < 10; ++i) { - retval_1 = cuda_device_synchronize_1(&result, clnt); - if (retval_1 != RPC_SUCCESS) { - clnt_perror (clnt, "call failed"); - } - } + + struct timeval timeout = {.tv_sec = -1, .tv_usec = 0}; + + return (clnt_call (clnt, CUDA_DEVICE_SYNCHRONIZE, (xdrproc_t) xdr_void, (caddr_t) NULL, + (xdrproc_t) xdr_int, (caddr_t) &result, + timeout)); return result; } @@ -329,15 +329,18 @@ cudaError_t cudaGetDeviceFlags(unsigned int* flags) return result.err; } +#undef cudaGetDeviceProperties cudaError_t cudaGetDeviceProperties(struct cudaDeviceProp* prop, int device) { #ifdef WITH_API_CNT api_call_cnt++; #endif //WITH_API_CNT - mem_result result; - result.mem_result_u.data.mem_data_len = sizeof(struct cudaDeviceProp); - result.mem_result_u.data.mem_data_val = (char*)prop; + cuda_device_prop_result result; enum clnt_stat retval; + if (prop == NULL) { + LOGE(LOG_ERROR, "error: prop == NULL"); + return cudaErrorInvalidValue; + } retval = cuda_get_device_properties_1(device, &result, clnt); if (retval != RPC_SUCCESS) { clnt_perror (clnt, "call failed"); @@ -345,13 +348,21 @@ cudaError_t cudaGetDeviceProperties(struct cudaDeviceProp* prop, int device) if (result.err != 0) { return result.err; } - if (result.mem_result_u.data.mem_data_len != sizeof(struct cudaDeviceProp)) { - LOGE(LOG_ERROR, "error: expected size != retrieved size\n"); + // if (memcpy(prop, result.mem_result_u.data.mem_data_val, sizeof(struct cudaDeviceProp)) == NULL) { + //FIXME: Don't know why, 
but pytorch expects a different definition of cudaDeviceProp, which is only 728 bytes long + if (memcpy(prop, result.cuda_device_prop_result_u.data, 728) == NULL) { + LOGE(LOG_ERROR, "error: memcpy failed"); return result.err; } return result.err; } +cudaError_t cudaGetDeviceProperties_v2(struct cudaDeviceProp* prop, int device) +{ + return cudaGetDeviceProperties(prop, device); +} + + DEF_FN(cudaError_t, cudaIpcCloseMemHandle, void*, devPtr) DEF_FN(cudaError_t, cudaIpcGetEventHandle, cudaIpcEventHandle_t*, handle, cudaEvent_t, event) DEF_FN(cudaError_t, cudaIpcGetMemHandle, cudaIpcMemHandle_t*, handle, void*, devPtr) @@ -572,7 +583,25 @@ cudaError_t cudaStreamGetPriority(cudaStream_t hStream, int* priority) return result.err; } -DEF_FN(cudaError_t, cudaStreamIsCapturing, cudaStream_t, stream, enum cudaStreamCaptureStatus*, pCaptureStatus) +cudaError_t cudaStreamIsCapturing(cudaStream_t stream, enum cudaStreamCaptureStatus* pCaptureStatus) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int_result result; + enum clnt_stat retval_1; + if (pCaptureStatus == NULL) { + return cudaErrorInvalidValue; + } + retval_1 = cuda_stream_is_capturing_1((ptr)stream, &result, clnt); + if (retval_1 != RPC_SUCCESS) { + clnt_perror (clnt, "call failed"); + } + if (result.err == 0) { + *pCaptureStatus = (enum cudaStreamCaptureStatus)result.int_result_u.data; + } + return result.err; +} cudaError_t cudaStreamQuery(cudaStream_t stream) { @@ -752,7 +781,9 @@ DEF_FN(cudaError_t, cudaExternalMemoryGetMappedBuffer, void**, devPtr, cudaExter DEF_FN(cudaError_t, cudaExternalMemoryGetMappedMipmappedArray, cudaMipmappedArray_t*, mipmap, cudaExternalMemory_t, extMem, const struct cudaExternalMemoryMipmappedArrayDesc*, mipmapDesc) DEF_FN(cudaError_t, cudaImportExternalMemory, cudaExternalMemory_t*, extMem_out, const struct cudaExternalMemoryHandleDesc*, memHandleDesc) DEF_FN(cudaError_t, cudaImportExternalSemaphore, cudaExternalSemaphore_t*, extSem_out, const struct cudaExternalSemaphoreHandleDesc*, semHandleDesc) +#undef cudaSignalExternalSemaphoresAsync DEF_FN(cudaError_t, cudaSignalExternalSemaphoresAsync, const cudaExternalSemaphore_t*, extSemArray, const struct cudaExternalSemaphoreSignalParams*, paramsArray, unsigned int, numExtSems, cudaStream_t, stream) +#undef cudaWaitExternalSemaphoresAsync DEF_FN(cudaError_t, cudaWaitExternalSemaphoresAsync, const cudaExternalSemaphore_t*, extSemArray, const struct cudaExternalSemaphoreWaitParams*, paramsArray, unsigned int, numExtSems, cudaStream_t, stream) cudaError_t cudaFuncGetAttributes(struct cudaFuncAttributes* attr, const void* func) @@ -1088,12 +1119,12 @@ cudaError_t cudaFreeArray(cudaArray_t array) } typedef struct host_alloc_info { - int cnt; + int idx; size_t size; void *client_ptr; } host_alloc_info_t; static host_alloc_info_t hainfo[64] = {0}; -static size_t hainfo_cnt = 1; +static size_t hainfo_cnt = 0; static int hainfo_getindex(void *client_ptr) { int i; @@ -1195,44 +1226,49 @@ cudaError_t cudaHostAlloc(void** pHost, size_t size, unsigned int flags) #ifdef WITH_API_CNT api_call_cnt++; #endif //WITH_API_CNT - int ret = cudaErrorMemoryAllocation; + sz_result ret = {.err = cudaErrorMemoryAllocation}; + int reg_ret; int fd_shm; - char shm_name[128]; + char *shm_name = NULL; enum clnt_stat retval_1; if (shm_enabled && connection_is_local == 1) { //Use local shared memory + retval_1 = cuda_host_alloc_1(size, flags, &ret, clnt); + if (retval_1 != RPC_SUCCESS || ret.err != cudaSuccess) { + LOGE(LOG_ERROR, "cudaHostAlloc failed on server-side."); + 
goto out; + } - snprintf(shm_name, 128, "/crickethostalloc-%zu", hainfo_cnt); - if ((fd_shm = shm_open(shm_name, O_RDWR | O_CREAT, S_IRWXU)) == -1) { - LOGE(LOG_ERROR, "ERROR: could not open shared memory \"%s\" with size %d: %s", shm_name, size, strerror(errno)); + if (asprintf(&shm_name, "/crickethostalloc-%zu", ret.sz_result_u.data) == -1) { + LOGE(LOG_ERROR, "ERROR: asprintf failed: %s", strerror(errno)); + ret.err = cudaErrorMemoryAllocation; goto out; } - if (ftruncate(fd_shm, size) == -1) { - LOGE(LOG_ERROR, "ERROR: cannot resize shared memory"); - shm_unlink(shm_name); + + if ((fd_shm = shm_open(shm_name, O_RDWR, S_IREAD | S_IWRITE)) == -1) { + LOGE(LOG_ERROR, "ERROR: could not open shared memory \"%s\" with size %d: %s", shm_name, size, strerror(errno)); + ret.err = cudaErrorMemoryAllocation; goto out; } - LOGE(LOG_DEBUG, "shm opened with name \"%s\", size: %d", shm_name, size); + if ((*pHost = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd_shm, 0)) == MAP_FAILED) { LOGE(LOG_ERROR, "ERROR: mmap returned unexpected pointer: %p", *pHost); shm_unlink(shm_name); + ret.err = cudaErrorMemoryAllocation; goto out; } - hainfo[hainfo_cnt].cnt = hainfo_cnt; + hainfo[hainfo_cnt].idx = ret.sz_result_u.data; hainfo[hainfo_cnt].size = size; hainfo[hainfo_cnt].client_ptr = *pHost; - - retval_1 = cuda_host_alloc_1(hainfo_cnt, size, (uint64_t)*pHost, flags, &ret, clnt); - if (retval_1 != RPC_SUCCESS) { - clnt_perror (clnt, "call failed"); - } - if (ret == cudaSuccess) { - hainfo_cnt++; - } else { - munmap(*pHost, size); - *pHost = NULL; + hainfo_cnt++; + + retval_1 = cuda_host_alloc_regshm_1(ret.sz_result_u.data, (ptr)*pHost, &reg_ret, clnt); + if (retval_1 != RPC_SUCCESS || ret.err != cudaSuccess) { + LOGE(LOG_ERROR, "cudaHostAlloc failed on server-side."); + goto out; } + shm_unlink(shm_name); } else if (socktype == TCP) { //Use infiniband #ifdef WITH_IB @@ -1240,14 +1276,14 @@ cudaError_t cudaHostAlloc(void** pHost, size_t size, unsigned int flags) LOGE(LOG_ERROR, "failed to register infiniband memory region"); goto out; } - hainfo[hainfo_cnt].cnt = hainfo_cnt; + hainfo[hainfo_cnt].idx = hainfo_cnt; hainfo[hainfo_cnt].size = size; hainfo[hainfo_cnt].client_ptr = *pHost; hainfo_cnt++; retval_1 = RPC_SUCCESS; - ret = cudaSuccess; + ret.err = cudaSuccess; #else LOGE(LOG_DEBUG, "cudaHostAlloc is not supported for TCP transports without IB. 
Using malloc instead..."); @@ -1255,7 +1291,7 @@ cudaError_t cudaHostAlloc(void** pHost, size_t size, unsigned int flags) if (*pHost == NULL) { goto out; } else { - ret = cudaSuccess; + ret.err = cudaSuccess; goto out; } #endif //WITH_IB @@ -1264,7 +1300,8 @@ cudaError_t cudaHostAlloc(void** pHost, size_t size, unsigned int flags) goto out; } out: - return ret; + free(shm_name); + return ret.err; } cudaError_t cudaHostGetDevicePointer(void** pDevice, void* pHost, unsigned int flags) @@ -1528,7 +1565,6 @@ extern char server[256]; #define WITH_MT_MEMCPY cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, enum cudaMemcpyKind kind) { - #ifdef WITH_API_CNT api_call_cnt++; memcpy_cnt += count; @@ -1536,9 +1572,9 @@ cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, enum cudaMemcpy int ret = 1; enum clnt_stat retval; if (kind == cudaMemcpyHostToDevice) { -//get index of mem reg (src: cpu reg memregion) + // get index of mem reg (src: cpu reg memregion) int index = hainfo_getindex((void*)src); -// not a cudaHostAlloc'ed memory + // not a cudaHostAlloc'ed memory if (index == -1) { #ifdef WITH_MT_MEMCPY if (count > 2*MT_MEMCPY_MEM_PER_THREAD) { @@ -1572,7 +1608,7 @@ cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, enum cudaMemcpy #endif //WITH_MT_MEMCPY } else { if (shm_enabled && connection_is_local == 1) { //Use local shared memory - retval = cuda_memcpy_shm_1(index, (ptr)dst, count, kind, &ret, clnt); + retval = cuda_memcpy_shm_1(hainfo[index].idx, (ptr)dst, count, kind, &ret, clnt); } else if (socktype == TCP) { //Use infiniband #ifdef WITH_IB //the following commend connects to serverside cuda_memcpy_ib_1_svc, server thread is initialized waiting for client send @@ -1635,7 +1671,7 @@ cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, enum cudaMemcpy #endif //WITH_MT_MEMCPY } else { if (shm_enabled && connection_is_local) { //Use local shared memory - retval = cuda_memcpy_shm_1(index, (ptr)src, count, kind, &ret, clnt); + retval = cuda_memcpy_shm_1(hainfo[index].idx, (ptr)src, count, kind, &ret, clnt); } else if (socktype == TCP) { //Use infiniband #ifdef WITH_IB pthread_t thread = {0}; @@ -1758,7 +1794,19 @@ cudaError_t cudaMemset2D(void* devPtr, size_t pitch, int value, size_t width, si return result; } -DEF_FN(cudaError_t, cudaMemset2DAsync, void*, devPtr, size_t, pitch, int, value, size_t, width, size_t, height, cudaStream_t, stream) +cudaError_t cudaMemset2DAsync(void* devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval; + retval = cuda_memset_2d_async_1((ptr)devPtr, pitch, value, width, height, (ptr)stream, &result, clnt); + if (retval != RPC_SUCCESS) { + clnt_perror (clnt, "call failed"); + } + return result; +} cudaError_t cudaMemset3D(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent) { @@ -1782,8 +1830,42 @@ cudaError_t cudaMemset3D(struct cudaPitchedPtr pitchedDevPtr, int value, struct return result; } -DEF_FN(cudaError_t, cudaMemset3DAsync, struct cudaPitchedPtr, pitchedDevPtr, int, value, struct cudaExtent, extent, cudaStream_t, stream) -DEF_FN(cudaError_t, cudaMemsetAsync, void*, devPtr, int, value, size_t, count, cudaStream_t, stream) +cudaError_t cudaMemset3DAsync(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval; + retval = 
cuda_memset_3d_async_1(pitchedDevPtr.pitch, + (ptr)pitchedDevPtr.ptr, + pitchedDevPtr.xsize, + pitchedDevPtr.ysize, + value, + extent.depth, + extent.height, + extent.width, + (ptr)stream, + &result, clnt); + if (retval != RPC_SUCCESS) { + clnt_perror (clnt, "call failed"); + } + return result; +} + +cudaError_t cudaMemsetAsync(void* devPtr, int value, size_t count, cudaStream_t stream) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval; + retval = cuda_memset_async_1((ptr)devPtr, value, count, (ptr)stream, &result, clnt); + if (retval != RPC_SUCCESS) { + clnt_perror (clnt, "call failed"); + } + return result; +} DEF_FN(struct cudaExtent, make_cudaExtent, size_t, w, size_t, h, size_t, d) DEF_FN(struct cudaPitchedPtr, make_cudaPitchedPtr, void*, d, size_t, p, size_t, xsz, size_t, ysz) @@ -1907,7 +1989,11 @@ DEF_FN(cudaError_t, cudaGraphGetNodes, cudaGraph_t, graph, cudaGraphNode_t*, nod DEF_FN(cudaError_t, cudaGraphGetRootNodes, cudaGraph_t, graph, cudaGraphNode_t*, pRootNodes, size_t*, pNumRootNodes) DEF_FN(cudaError_t, cudaGraphHostNodeGetParams, cudaGraphNode_t, node, struct cudaHostNodeParams*, pNodeParams) DEF_FN(cudaError_t, cudaGraphHostNodeSetParams, cudaGraphNode_t, node, const struct cudaHostNodeParams*, pNodeParams) +#if CUDART_VERSION >= 12000 +DEF_FN(cudaError_t, cudaGraphInstantiate, cudaGraphExec_t*, pGraphExec, cudaGraph_t, graph, unsigned long long, flags) +#else DEF_FN(cudaError_t, cudaGraphInstantiate, cudaGraphExec_t*, pGraphExec, cudaGraph_t, graph, cudaGraphNode_t*, pErrorNode, char*, pLogBuffer, size_t, bufferSize) +#endif DEF_FN(cudaError_t, cudaGraphKernelNodeGetParams, cudaGraphNode_t, node, struct cudaKernelNodeParams*, pNodeParams) DEF_FN(cudaError_t, cudaGraphKernelNodeSetParams, cudaGraphNode_t, node, const struct cudaKernelNodeParams*, pNodeParams) DEF_FN(cudaError_t, cudaGraphLaunch, cudaGraphExec_t, graphExec, cudaStream_t, stream) @@ -1920,6 +2006,33 @@ DEF_FN(cudaError_t, cudaGraphNodeGetDependencies, cudaGraphNode_t, node, cudaGra DEF_FN(cudaError_t, cudaGraphNodeGetDependentNodes, cudaGraphNode_t, node, cudaGraphNode_t*, pDependentNodes, size_t*, pNumDependentNodes) DEF_FN(cudaError_t, cudaGraphNodeGetType, cudaGraphNode_t, node, enum cudaGraphNodeType*, pType) DEF_FN(cudaError_t, cudaGraphRemoveDependencies, cudaGraph_t, graph, const cudaGraphNode_t*, from, const cudaGraphNode_t*, to, size_t, numDependencies) -DEF_FN(cudaError_t, cudaProfilerInitialize, const char*, configFile, const char*, outputFile, cudaOutputMode_t, outputMode) DEF_FN(cudaError_t, cudaProfilerStart, void) DEF_FN(cudaError_t, cudaProfilerStop, void) + +cudaError_t cudaProfilerStart(void) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval; + retval = cuda_profiler_start_1(&result, clnt); + if (retval != RPC_SUCCESS) { + clnt_perror (clnt, "call failed"); + } + return result; +} + +cudaError_t cudaProfilerStop(void) +{ +#ifdef WITH_API_CNT + api_call_cnt++; +#endif //WITH_API_CNT + int result; + enum clnt_stat retval; + retval = cuda_profiler_stop_1(&result, clnt); + if (retval != RPC_SUCCESS) { + clnt_perror (clnt, "call failed"); + } + return result; +} \ No newline at end of file diff --git a/cpu/cpu-client.c b/cpu/cpu-client.c index 45f92d51..c4bc68d1 100644 --- a/cpu/cpu-client.c +++ b/cpu/cpu-client.c @@ -1,30 +1,32 @@ #define _GNU_SOURCE -#include -#include #include #include +#include +#include +#include -//For TCP socket -#include -#include +// For TCP socket #include 
#include +#include +#include -#include "cpu-libwrap.h" -#include "cpu_rpc_prot.h" #include "cpu-common.h" +#include "cpu-libwrap.h" #include "cpu-utils.h" +#include "cpu_rpc_prot.h" #include "list.h" +#include "cpu-elf2.h" #ifdef WITH_IB #include "cpu-ib.h" -#endif //WITH_IB +#endif // WITH_IB -//static const char* LIBCUDA_PATH = "/lib64/libcuda.so"; -const char* LIBCUDA_PATH = "/usr/local/cuda/lib64/libcudart.so"; +// static const char* LIBCUDA_PATH = "/lib64/libcuda.so"; +const char *LIBCUDA_PATH = "/usr/local/cuda/lib64/libcudart.so"; CLIENT *clnt = NULL; -list kernel_infos = {0}; +list kernel_infos = { 0 }; char server[256]; @@ -34,30 +36,33 @@ int shm_enabled = 1; int initialized = 0; #ifdef WITH_IB - int ib_device = 0; -#endif //WITH_IB +int ib_device = 0; +#endif // WITH_IB #ifdef WITH_API_CNT extern void cpu_runtime_print_api_call_cnt(void); -#endif //WITH_API_CNT +#endif // WITH_API_CNT static void rpc_connect(void) { int isock; - struct sockaddr_un sock_un = {0}; - struct sockaddr_in sock_in = {0}; - struct sockaddr_in local_addr = {0}; + struct sockaddr_un sock_un = { 0 }; + struct sockaddr_in sock_in = { 0 }; + struct sockaddr_in local_addr = { 0 }; struct hostent *hp; socklen_t sockaddr_len = sizeof(struct sockaddr_in); - unsigned long prog=0, vers=0; + unsigned long prog = 0, vers = 0; char envvar[] = "REMOTE_GPU_ADDRESS"; - if(!getenv(envvar)) { - LOG(LOG_ERROR, "Environment variable %s does not exist. It must contain the address where the server application is listening.", envvar); + if (!getenv(envvar)) { + LOG(LOG_ERROR, + "Environment variable %s does not exist. It must contain the " + "address where the server application is listening.", + envvar); exit(1); } - if(strncpy(server, getenv(envvar), 256) == NULL) { + if (strncpy(server, getenv(envvar), 256) == NULL) { LOGE(LOG_ERROR, "strncpy failed."); exit(1); } @@ -65,23 +70,24 @@ static void rpc_connect(void) #ifdef WITH_IB - if(getenv("IB_DEVICE_ID")) { + if (getenv("IB_DEVICE_ID")) { ib_device = atoi(getenv("IB_DEVICE_ID")); } LOG(LOG_INFO, "Using IB device: %d.", ib_device); -#endif //WITH_IB +#endif // WITH_IB - LOGE(LOG_INFO, "test\n"); - if(getenv("CRICKET_NOHASH")) { - prog=99; - vers=1; - } else if (cpu_utils_md5hash("/proc/self/exe", &prog, &vers) != 0) { - LOGE(LOG_ERROR, "error while creating binary checksum"); - exit(0); + prog = 99; + vers = 1; + const char *env_vers = getenv("CRICKET_RPCID"); + if (env_vers != NULL) { + if (sscanf(env_vers, "%lu", &vers) != 1) { + LOGE(LOG_ERROR, "error parsing CRICKET_RPCID"); + exit(1); + } } - char* cmd = NULL; + char *cmd = NULL; if (cpu_utils_command(&cmd) != 0) { LOGE(LOG_ERROR, "error getting command"); } else { @@ -109,18 +115,19 @@ static void rpc_connect(void) LOGE(LOG_ERROR, "error resolving hostname: %s", server); exit(1); } - sock_in.sin_addr = *(struct in_addr*)hp->h_addr; - //inet_aton("137.226.133.199", &sock_in.sin_addr); + sock_in.sin_addr = *(struct in_addr *)hp->h_addr; + // inet_aton("137.226.133.199", &sock_in.sin_addr); clnt = clnttcp_create(&sock_in, prog, vers, &isock, 0, 0); getsockname(isock, &local_addr, &sockaddr_len); - connection_is_local = (local_addr.sin_addr.s_addr == sock_in.sin_addr.s_addr); + connection_is_local = + (local_addr.sin_addr.s_addr == sock_in.sin_addr.s_addr); break; case UDP: - /* From RPCEGEN documentation: + /* From RPCEGEN documentation: * Warning: since UDP-based RPC messages can only hold up to 8 Kbytes - * of encoded data, this transport cannot be used for procedures that - * take large arguments or return huge 
results. + * of encoded data, this transport cannot be used for procedures that + * take large arguments or return huge results. * -> Sounds like UDP does not make sense for CUDA, because we need to * be able to copy large memory chunks **/ @@ -130,11 +137,12 @@ static void rpc_connect(void) if (clnt == NULL) { clnt_pcreateerror("[rpc] Error"); - exit (1); + exit(1); } } -static void repair_connection(int signo) { +static void repair_connection(int signo) +{ enum clnt_stat retval_1; int result_1; /*LOGE(LOG_INFO, "Trying connection..."); @@ -154,13 +162,14 @@ static void repair_connection(int signo) { } } -void __attribute__ ((constructor)) init_rpc(void) +void __attribute__((constructor)) init_rpc(void) { enum clnt_stat retval_1; int result_1; int_result result_2; char *printmessage_1_arg1 = "hello"; + LOG(LOG_DBG(1), "log level is %d", LOG_LEVEL); init_log(LOG_LEVEL, __FILE__); rpc_connect(); @@ -172,24 +181,29 @@ void __attribute__ ((constructor)) init_rpc(void) retval_1 = rpc_printmessage_1(printmessage_1_arg1, &result_1, clnt); if (retval_1 != RPC_SUCCESS) { - clnt_perror (clnt, "call failed"); + clnt_perror(clnt, "call failed"); } if (list_init(&kernel_infos, sizeof(kernel_info_t)) != 0) { LOGE(LOG_ERROR, "list init failed."); } - if (cpu_utils_parameter_info(&kernel_infos, "/proc/self/exe") != 0) { - LOG(LOG_ERROR, "error while getting parameter size. Check whether cuobjdump binary is in PATH! Trying anyway (will only work if there is no kernel in this binary)"); + if (elf2_init() != 0) { + LOGE(LOG_ERROR, "libelf init failed"); } + + // if (cpu_utils_parameter_info(&kernel_infos, "/proc/self/exe") != 0) { + // LOG(LOG_ERROR, "error while getting parameter size. Check whether " + // "cuobjdump binary is in PATH! Trying anyway (will only " + // "work if there is no kernel in this binary)"); + // } #ifdef WITH_IB if (ib_init(ib_device, server) != 0) { LOG(LOG_ERROR, "initilization of infiniband verbs failed."); } -#endif //WITH_IB - +#endif // WITH_IB } -void __attribute__ ((destructor)) deinit_rpc(void) +void __attribute__((destructor)) deinit_rpc(void) { enum clnt_stat retval_1; int result; @@ -202,151 +216,210 @@ void __attribute__ ((destructor)) deinit_rpc(void) list_free(&kernel_infos); #ifdef WITH_API_CNT cpu_runtime_print_api_call_cnt(); -#endif //WITH_API_CNT +#endif // WITH_API_CNT } if (clnt != NULL) { - clnt_destroy (clnt); + clnt_destroy(clnt); } } -void __cudaRegisterVar(void **fatCubinHandle, char *hostVar, char *deviceAddress, const char *deviceName, int ext, size_t size, int constant, int global) -{ -} -void __cudaRegisterFunction(void **fatCubinHandle, const char *hostFun, char *deviceFun, - const char *deviceName, int thread_limit, uint3 *tid, - uint3 *bid, dim3 *bDim, dim3 *gDim, int *wSize) +static void *(*dlopen_orig)(const char *, int) = NULL; +static int (*dlclose_orig)(void *) = NULL; +static void *dl_handle = NULL; + +void *dlopen(const char *filename, int flag) { - int result; - enum clnt_stat retval_1; + void *ret = NULL; + struct link_map *map; + int has_kernel = 0; + LOG(LOG_DBG(1), "intercepted dlopen(%s, %d)", filename, flag); + + if (filename == NULL) { + return dlopen_orig(filename, flag); + } - printf("__cudaRegisterFunction(fatCubinHandle=%p, hostFun=%p, devFunc=%s, deviceName=%s, thread_limit=%d, tid=[%p], bid=[%p], bDim=[%p], gDim=[%p], wSize=%p)\n", fatCubinHandle, hostFun, deviceFun, deviceName, thread_limit, tid, bid, bDim, gDim, wSize); + if (dlopen_orig == NULL) { + if ((dlopen_orig = dlsym(RTLD_NEXT, "dlopen")) == NULL) { + LOGE(LOG_ERROR, 
"[dlopen] dlsym failed"); + } + } - kernel_info_t *info = cricketd_utils_search_info(&kernel_infos, (char*)deviceName); - if (info == NULL) { - LOGE(LOG_ERROR, "request to register unknown function: \"%s\"", deviceName); - retval_1 = cuda_register_function_1((ptr)fatCubinHandle, (ptr)hostFun, deviceFun, (char*)deviceName, thread_limit, &result, clnt); - if (retval_1 != RPC_SUCCESS) { - LOGE(LOG_ERROR, "call failed."); + static const char *replace_libs[] = { + "libcuda.so.1", + "libcuda.so", + "libnvidia-ml.so.1", + "libcudnn_cnn_infer.so.8" + }; + static const size_t replace_libs_sz = sizeof(replace_libs) / sizeof(char *); + if (filename != NULL) { + for (size_t i=0; i != replace_libs_sz; ++i) { + if (strcmp(filename, replace_libs[i]) == 0) { + LOG(LOG_DEBUG, "replacing dlopen call to %s with cricket-client.so", filename); + dl_handle = dlopen_orig("cricket-client.so", flag); + if (clnt == NULL) { + LOGE(LOG_ERROR, "rpc seems to be uninitialized"); + } + return dl_handle; + } } + } + /* filename is NULL or not in replace_libs list */ + if ((ret = dlopen_orig(filename, flag)) == NULL) { + LOGE(LOG_ERROR, "dlopen failed: ", dlerror()); + } else if (has_kernel) { + dlinfo(ret, RTLD_DI_LINKMAP, &map); + LOGE(LOG_DEBUG, "dlopen to %p", map->l_addr); + } + return ret; +} - return; +int dlclose(void *handle) +{ + if (handle == NULL) { + LOGE(LOG_ERROR, "[dlclose] handle NULL"); + return -1; + } else if (dlclose_orig == NULL) { + if ((dlclose_orig = dlsym(RTLD_NEXT, "dlclose")) == NULL) { + LOGE(LOG_ERROR, "[dlclose] dlsym failed"); + } } - info->host_fun = (void*)hostFun; - if (retval_1 != RPC_SUCCESS) { - clnt_perror (clnt, "call failed"); + // Ignore dlclose call that would close this library + if (dl_handle == handle) { + LOGE(LOG_DEBUG, "[dlclose] ignore close"); + return 0; + } else { + return dlclose_orig(handle); } } -struct __fatCubin { - uint32_t magic; - uint32_t seq; - uint64_t text; - uint64_t data; - uint64_t ptr; - uint64_t ptr2; - uint64_t zero; -}; - -struct rpc_fatCubin { - uint32_t magic; - uint32_t seq; - uint64_t text; - uint64_t data; - uint64_t ptr; - uint64_t ptr2; - uint64_t zero; -}; - -void** __cudaRegisterFatBinary(void *fatCubin) +void __cudaRegisterVar(void **fatCubinHandle, char *hostVar, char *deviceAddress, + const char *deviceName, int ext, size_t size, int constant, + int global); + +void __cudaRegisterVar(void **fatCubinHandle, char *hostVar, char *deviceAddress, + const char *deviceName, int ext, size_t size, int constant, + int global) { - ptr_result result; enum clnt_stat retval_1; - - struct __fatCubin *fat = (struct __fatCubin*)((fatCubin)); - struct rpc_fatCubin rpc_fat = {.magic = fat->magic, - .seq = fat->seq, - .text = fat->text, - .data = fat->data, - .ptr = fat->ptr, - .ptr2 = fat->ptr2, - .zero = fat->zero}; - LOGE(LOG_DEBUG, "__cudaRegisterFatBinary"); - //printf("__cudaRegisterFatBinary(magic: %x, seq: %x, text: %lx, data: %lx, ptr: %lx, ptr2: %lx, zero: %lx\n", - // fat->magic, fat->seq, fat->text, fat->data, fat->ptr, fat->ptr2, fat->zero); - retval_1 = RPC_SUCCESS;//cuda_register_fat_binary_1(rpc_fat, &result, clnt); + int result; + LOGE(LOG_DEBUG, "__cudaRegisterVar(fatCubinHandle=%p, hostVar=%p, deviceAddress=%p, " + "deviceName=%s, ext=%d, size=%zu, constant=%d, global=%d)\n", + fatCubinHandle, hostVar, deviceAddress, deviceName, ext, size, constant, global); + retval_1 = rpc_register_var_1((ptr)fatCubinHandle, (ptr)hostVar, (ptr)deviceAddress, (char*)deviceName, ext, size, constant, global, + &result, clnt); if (retval_1 != RPC_SUCCESS) { - 
clnt_perror (clnt, "call failed"); + LOGE(LOG_ERROR, "call failed."); } - if (result.err != 0) { - return NULL; - } - return (void*)result.ptr_result_u.ptr; } -void __cudaRegisterFatBinaryEnd(void **fatCubinHandle) +void __cudaRegisterFunction(void **fatCubinHandle, const char *hostFun, + char *deviceFun, const char *deviceName, + int thread_limit, uint3 *tid, uint3 *bid, + dim3 *bDim, dim3 *gDim, int *wSize) { - int result; + ptr_result result; enum clnt_stat retval_1; - //printf("__cudaRegisterFatBinaryEnd(fatCubinHandle=%p)\n", fatCubinHandle); + LOGE(LOG_DEBUG, "__cudaRegisterFunction(fatCubinHandle=%p, hostFun=%p, devFunc=%s, " + "deviceName=%s, thread_limit=%d, tid=[%p], bid=[%p], bDim=[%p], " + "gDim=[%p], wSize=%p)\n", + fatCubinHandle, hostFun, deviceFun, deviceName, thread_limit, tid, + bid, bDim, gDim, wSize); - retval_1 = RPC_SUCCESS;//cuda_register_fat_binary_end_1((uint64_t)fatCubinHandle, &result, clnt); - if (retval_1 != RPC_SUCCESS) { - clnt_perror (clnt, "call failed"); + kernel_info_t *info = utils_search_info(&kernel_infos, (char *)deviceName); + if (info == NULL) { + LOGE(LOG_ERROR, "request to register unknown function: \"%s\"", + deviceName); + return; + } else { + LOGE(LOG_DEBUG, "request to register known function: \"%s\"", + deviceName); + retval_1 = rpc_register_function_1((ptr)fatCubinHandle, (ptr)hostFun, + deviceFun, (char*)deviceName, thread_limit, + &result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "call failed."); + exit(1); + } + if (result.err != 0) { + LOGE(LOG_ERROR, "error registering function: %d", result.err); + exit(1); + } + info->host_fun = (void *)hostFun; } } -static void *(*dlopen_orig)(const char *, int) = NULL; -static int (*dlclose_orig)(void *) = NULL; -static void *dl_handle = NULL; -void *dlopen(const char *filename, int flag) +void **__cudaRegisterFatBinary(void *fatCubin) { - LOG(LOG_DEBUG, "intercepted dlopen(%s, %d)", filename, flag); - if (dlopen_orig == NULL) { - if ( (dlopen_orig = dlsym(RTLD_NEXT, "dlopen")) == NULL) { - LOGE(LOG_ERROR, "[dlopen] dlsym failed"); - } + void **result; + int rpc_result; + enum clnt_stat retval_1; + size_t fatbin_size; + LOGE(LOG_DEBUG, "__cudaRegisterFatBinary(fatCubin=%p)", fatCubin); + + mem_data rpc_fat = { .mem_data_len = 0, .mem_data_val = NULL }; + + if (elf2_get_fatbin_info((struct fat_header *)fatCubin, + &kernel_infos, + (uint8_t **)&rpc_fat.mem_data_val, + &fatbin_size) != 0) { + LOGE(LOG_ERROR, "error getting fatbin info"); + return NULL; } + rpc_fat.mem_data_len = fatbin_size; - if (filename != NULL && strcmp(filename, "libcuda.so.1") == 0) { - LOG(LOG_DEBUG, "replacing dlopen call to cuda driver library with cricket-client.so"); - dl_handle = dlopen_orig("cricket-client.so", flag); - if (clnt == NULL) { - LOGE(LOG_ERROR, "rpc seems to be uninitialized"); - } - return dl_handle; - } else { - LOGE(LOG_DEBUG, "request to dlopen \"%s\"", filename); - if (cpu_utils_contains_kernel(filename) == 0) { - LOGE(LOG_ERROR, "file does not contain a kernel"); - } - return dlopen_orig(filename, flag); + // CUDA registers an atexit handler for fatbin cleanup that accesses + // the fatbin data structure. Let's allocate some zeroes to avoid segfaults. 
+ result = (void**)calloc(1, 0x58); + + retval_1 = rpc_elf_load_1(rpc_fat, (ptr)result, &rpc_result, clnt); + if (retval_1 != RPC_SUCCESS) { + LOGE(LOG_ERROR, "call failed."); + } + if (rpc_result != 0) { + LOGE(LOG_ERROR, "error registering fatbin: %d", rpc_result); + return NULL; } + LOG(LOG_DEBUG, "fatbin loaded to %p", result); + // we return a bunch of zeroes to avoid segfaults. The memory is + // mapped by the modules resource + return result; } -int dlclose(void *handle) -{ - if (handle == NULL) { - LOGE(LOG_ERROR, "[dlclose] handle NULL"); - return -1; - } else if (dlclose_orig == NULL) { - if ( (dlclose_orig = dlsym(RTLD_NEXT, "dlclose")) == NULL) { - LOGE(LOG_ERROR, "[dlclose] dlsym failed"); - } - } +void __cudaUnregisterFatBinary(void **fatCubinHandle) +{ + int result; + enum clnt_stat retval_1; - // Ignore dlclose call that would close this library - if (dl_handle == handle) { - LOGE(LOG_DEBUG, "[dlclose] ignore close"); - return 0; - } else { - return dlclose_orig(handle); + LOGE(LOG_DEBUG, "__cudaUnregisterFatBinary(fatCubinHandle=%p)", + fatCubinHandle); + + if (fatCubinHandle == NULL) { + LOGE(LOG_WARNING, "fatCubinHandle is NULL - so we have nothing to unload. (This is okay if this binary does not contain a kernel.)"); + return; } + // retval_1 = rpc_elf_unload_1((ptr)fatCubinHandle, &result, clnt); + // if (retval_1 != RPC_SUCCESS || result != 0) { + // LOGE(LOG_ERROR, "call failed."); + // } } +// void __cudaRegisterFatBinaryEnd(void **fatCubinHandle) +// { +// int result; +// enum clnt_stat retval_1; + +// //printf("__cudaRegisterFatBinaryEnd(fatCubinHandle=%p)\n", +// fatCubinHandle); +// retval_1 = +// RPC_SUCCESS;//cuda_register_fat_binary_end_1((uint64_t)fatCubinHandle, +// &result, clnt); if (retval_1 != RPC_SUCCESS) { +// clnt_perror (clnt, "call failed"); +// } +// } diff --git a/cpu/cpu-elf2.c b/cpu/cpu-elf2.c new file mode 100644 index 00000000..89fcb24a --- /dev/null +++ b/cpu/cpu-elf2.c @@ -0,0 +1,999 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpu-common.h" +#include "log.h" +#include "cpu-elf2.h" +#include "cpu-utils.h" + +#define uint16_t unsigned short +#define CRICKET_ELF_NV_INFO_PREFIX ".nv.info" +#define CRICKET_ELF_NV_SHARED_PREFIX ".nv.shared." +#define CRICKET_ELF_NV_TEXT_PREFIX ".nv.text." +#define CRICKET_ELF_TEXT_PREFIX ".text." + +#define CRICKET_ELF_FATBIN ".nv_fatbin" +#define CRICKET_ELF_REGFUN "_ZL24__sti____cudaRegisterAllv" + +#define FATBIN_STRUCT_MAGIC 0x466243b1 +#define FATBIN_TEXT_MAGIC 0xBA55ED50 + +struct __attribute__((__packed__)) fat_elf_header +{ + uint32_t magic; + uint16_t version; + uint16_t header_size; + uint64_t size; +}; +struct __attribute__((__packed__)) fat_text_header +{ + uint16_t kind; + uint16_t unknown1; + uint32_t header_size; + uint64_t size; + uint32_t compressed_size; // Size of compressed data + uint32_t unknown2; // Address size for PTX? + uint16_t minor; + uint16_t major; + uint32_t arch; + uint32_t obj_name_offset; + uint32_t obj_name_len; + uint64_t flags; + uint64_t zero; // Alignment for compression? + uint64_t decompressed_size; // Length of compressed data in decompressed representation. + // There is an uncompressed footer so this is generally smaller + // than size. 
+}; + +#define FATBIN_FLAG_64BIT 0x0000000000000001LL +#define FATBIN_FLAG_DEBUG 0x0000000000000002LL +#define FATBIN_FLAG_LINUX 0x0000000000000010LL +#define FATBIN_FLAG_COMPRESS 0x0000000000002000LL + +int elf2_init(void) +{ + if (elf_version(EV_CURRENT) == EV_NONE) { + LOGE(LOG_ERROR, "ELF library initialization failed: %s", elf_errmsg(-1)); + return -1; + } + return 0; +} + +static int flag_to_str(char** str, uint64_t flag) +{ + return asprintf(str, "64Bit: %s, Debug: %s, Linux: %s, Compress %s", + (flag & FATBIN_FLAG_64BIT) ? "yes" : "no", + (flag & FATBIN_FLAG_DEBUG) ? "yes" : "no", + (flag & FATBIN_FLAG_LINUX) ? "yes" : "no", + (flag & FATBIN_FLAG_COMPRESS) ? "yes" : "no"); +} + +static void print_header(struct fat_text_header *th) +{ + char* flagstr = NULL; + flag_to_str(&flagstr, th->flags); + + LOGE(LOG_DBG(1), "text_header: fatbin_kind: %#x, header_size %#x, size %#zx, compressed_size %#x,\ + minor %#x, major %#x, arch %d, decompressed_size %#zx\n\tflags: %s\n", + th->kind, + th->header_size, + th->size, + th->compressed_size, + th->minor, + th->major, + th->arch, + th->decompressed_size, + flagstr); + LOGE(LOG_DBG(1), "\tunknown fields: unknown1: %#x, unknown2: %#x, zeros: %#zx\n", + th->unknown1, + th->unknown2, + th->zero); + + free(flagstr); +} + +/** Check the header of a fatbin + * Performs some integrity checks and returns the elf header + * @param fatbin_data Pointer to the fatbin data + * @param fatbin_size Size of the fatbin data + * @param decompressed_size Pointer to a variable that will be set to the size of the decompressed data + * @param compressed_data Pointer to a variable that will be set to point to the compressed data +*/ +static int get_elf_header(const uint8_t* fatbin_data, size_t fatbin_size, struct fat_elf_header **elf_header) +{ + struct fat_elf_header *eh = NULL; + + if (fatbin_data == NULL || elf_header == NULL) { + LOGE(LOG_ERROR, "fatbin_data is NULL"); + return 1; + } + + if (fatbin_size < sizeof(struct fat_elf_header)) { + LOGE(LOG_ERROR, "fatbin_size is too small"); + return 1; + } + + eh = (struct fat_elf_header*) fatbin_data; + if (eh->magic != FATBIN_TEXT_MAGIC) { + LOGE(LOG_ERROR, "Invalid magic number: expected %#x but got %#x", FATBIN_TEXT_MAGIC, eh->magic); + return 1; + } + + if (eh->version != 1 || eh->header_size != sizeof(struct fat_elf_header)) { + LOGE(LOG_ERROR, "fatbin text version is wrong or header size is inconsistent.\ + This is a sanity check to avoid reading a new fatbinary format"); + return 1; + } + + *elf_header = eh; + return 0; +} + +/** Check the text header of a fatbin + * Performs some integrity checks and returns the text header + * @param fatbin_data Pointer to the fatbin data + * @param fatbin_size Size of the fatbin data + * @param decompressed_size Pointer to a variable that will be set to the size of the decompressed data + * @param compressed_data Pointer to a variable that will be set to point to the compressed data +*/ +static int get_text_header(const uint8_t* fatbin_data, size_t fatbin_size, struct fat_text_header **text_header) +{ + struct fat_text_header *th = NULL; + + if (fatbin_data == NULL || text_header == NULL) { + LOGE(LOG_ERROR, "fatbin_data is NULL"); + return 1; + } + + if (fatbin_size < sizeof(struct fat_text_header)) { + LOGE(LOG_ERROR, "fatbin_size is too small"); + return 1; + } + + th = (struct fat_text_header*)fatbin_data; + + if(th->obj_name_offset != 0) { + if (((char*)th)[th->obj_name_offset + th->obj_name_len] != '\0') { + LOGE(LOG_WARNING, "Fatbin object name is not null terminated"); 
+ } else { + char *obj_name = (char*)th + th->obj_name_offset; + LOGE(LOG_DEBUG, "Fatbin object name: %s (len:%#x)", obj_name, th->obj_name_len); + } + } + + *text_header = th; + return 0; +} + +/** Decompresses a fatbin file + * @param input Pointer compressed input data + * @param input_size Size of compressed data + * @param output preallocated memory where decompressed output should be stored + * @param output_size size of output buffer. Should be equal to the size of the decompressed data + */ +static size_t decompress(const uint8_t* input, size_t input_size, uint8_t* output, size_t output_size) +{ + size_t ipos = 0, opos = 0; + uint64_t next_nclen; // length of next non-compressed segment + uint64_t next_clen; // length of next compressed segment + uint64_t back_offset; // negative offset where redudant data is located, relative to current opos + + while (ipos < input_size) { + next_nclen = (input[ipos] & 0xf0) >> 4; + next_clen = 4 + (input[ipos] & 0xf); + if (next_nclen == 0xf) { + do { + next_nclen += input[++ipos]; + } while (input[ipos] == 0xff); + } + + if (memcpy(output + opos, input + (++ipos), next_nclen) == NULL) { + LOGE(LOG_ERROR, "copying data"); + return 0; + } +#ifdef FATBIN_DECOMPRESS_DEBUG + printf("%#04zx nocompress (len:%#x):\n", opos, next_nclen); + hexdump(output + opos, next_nclen); +#endif + ipos += next_nclen; + opos += next_nclen; + if (ipos >= input_size || opos >= output_size) { + break; + } + back_offset = input[ipos] + (input[ipos + 1] << 8); + ipos += 2; + if (next_clen == 0xf+4) { + do { + next_clen += input[ipos++]; + } while (input[ipos - 1] == 0xff); + } +#ifdef FATBIN_DECOMPRESS_DEBUG + printf("%#04zx compress (decompressed len: %#x, back_offset %#x):\n", opos, next_clen, back_offset); +#endif + if (next_clen <= back_offset) { + if (memcpy(output + opos, output + opos - back_offset, next_clen) == NULL) { + LOGE(LOG_ERROR, "Error copying data"); + return 0; + } + } else { + if (memcpy(output + opos, output + opos - back_offset, back_offset) == NULL) { + LOGE(LOG_ERROR, "Error copying data"); + return 0; + } + for (size_t i = back_offset; i < next_clen; i++) { + output[opos + i] = output[opos + i - back_offset]; + } + } +#ifdef FATBIN_DECOMPRESS_DEBUG + hexdump(output + opos, next_clen); +#endif + opos += next_clen; + } + LOGE(LOG_DEBUG, "ipos: %#zx, opos: %#zx, ilen: %#zx, olen: %#zx", ipos, opos, input_size, output_size); + return opos; +} + +static ssize_t decompress_section(const uint8_t *input, uint8_t **output, size_t *output_size, + struct fat_elf_header *eh, struct fat_text_header *th, size_t *eh_out_offset) +{ + struct fat_text_header *th_out = NULL; + struct fat_elf_header *eh_out = NULL; + uint8_t *output_pos = 0; + size_t padding; + size_t input_read = 0; + const uint8_t zeroes[6] = {0}; + + if (output == NULL || output_size == NULL || eh == NULL || th == NULL || eh_out_offset == NULL) { + LOGE(LOG_ERROR, "invalid parameters"); + return 1; + } + + if ((*output = realloc(*output, *output_size + th->decompressed_size + eh->header_size + th->header_size)) == NULL) { + LOGE(LOG_ERROR, "Error allocating memory of size %#zx for output buffer: %s", + *output_size + th->decompressed_size + eh->header_size + th->header_size, strerror(errno)); + goto error; + } + output_pos = *output + *output_size; + *output_size += th->decompressed_size + th->header_size; + + if (input == (uint8_t*)eh + eh->header_size + th->header_size) { // We are at the first section + if (memcpy(output_pos, eh, eh->header_size) == NULL) { + LOGE(LOG_ERROR, "Error copying 
data"); + goto error; + } + eh_out = ((struct fat_elf_header*)(output_pos)); + eh_out->size = 0; + *eh_out_offset = output_pos - *output; + output_pos += eh->header_size; + *output_size += eh->header_size; + } + eh_out = ((struct fat_elf_header*)(*output + *eh_out_offset)); // repair pointer in case realloc moved the buffer + eh_out->size += th->decompressed_size + th->header_size; // set size + + if (memcpy(output_pos, th, th->header_size) == NULL) { + LOGE(LOG_ERROR, "Error copying data"); + goto error; + } + th_out = ((struct fat_text_header*)output_pos); + th_out->flags &= ~FATBIN_FLAG_COMPRESS; // clear compressed flag + th_out->compressed_size = 0; // clear compressed size + th_out->decompressed_size = 0; // clear decompressed size + th_out->size = th->decompressed_size; // set size + + output_pos += th->header_size; + + if (decompress(input, th->compressed_size, output_pos, th->decompressed_size) != th->decompressed_size) { + LOGE(LOG_ERROR, "Decompression failed"); + goto error; + } + + input_read += th->compressed_size; + output_pos += th->decompressed_size; + + // if (input_pos != (uint8_t*)th + eh->size) { + // printf("There is %#zx bytes of data remaining\n", (uint8_t*)th + eh->size - input_pos); + // } + + padding = (8 - (size_t)(input + input_read) % 8); + if (memcmp(input + input_read, zeroes, padding) != 0) { + LOGE(LOG_ERROR, "expected %#zx zero bytes, got:", padding); + hexdump(input + input_read, 0x60); + goto error; + } + input_read += padding; + + padding = ((8 - (size_t)th->decompressed_size) % 8); + // Because we always allocated enough memory for one more elf_header and this is smaller than + // the maximal padding of 7, we do not have to reallocate here. + memset(output_pos, 0, padding); + *output_size += padding; + eh_out->size += padding; + th_out->size += padding; + + return input_read; + error: + free(*output); + *output = NULL; + return -1; +} + +static ssize_t decompress_single_section(const uint8_t *input, uint8_t **output, size_t *output_size, + struct fat_elf_header *eh, struct fat_text_header *th) +{ + size_t padding; + size_t input_read = 0; + size_t output_written = 0; + size_t decompress_ret = 0; + const uint8_t zeroes[8] = {0}; + + if (input == NULL || output == NULL || eh == NULL || th == NULL) { + LOGE(LOG_ERROR, "invalid parameters"); + return 1; + } + + // add max padding of 7 bytes + if ((*output = malloc(th->decompressed_size + 7)) == NULL) { + LOGE(LOG_ERROR, "Error allocating memory of size %#zx for output buffer: %s", + th->decompressed_size, strerror(errno)); + goto error; + } + print_header(th); + + if ((decompress_ret = decompress(input, th->compressed_size, *output, th->decompressed_size)) != th->decompressed_size) { + LOGE(LOG_ERROR, "Decompression failed: decompressed size is %#zx, but header says %#zx", + decompress_ret, th->decompressed_size); + LOGE(LOG_ERROR, "input pos: %#zx, output pos: %#zx", input - (uint8_t*)eh, *output); + hexdump(input, 0x160); + if (decompress_ret >= 0x60) + hexdump((*output) + decompress_ret - 0x60, 0x60); + goto error; + } + input_read += th->compressed_size; + output_written += th->decompressed_size; + + padding = ((8 - (size_t)(input + input_read)) % 8); + if (memcmp(input + input_read, zeroes, padding) != 0) { + LOGE(LOG_ERROR, "expected %#zx zero bytes, got:", padding); + hexdump(input + input_read, 0x60); + goto error; + } + input_read += padding; + + padding = ((8 - (size_t)th->decompressed_size) % 8); + // Because we always allocated enough memory for one more elf_header and this is smaller than + 
// the maximal padding of 7, we do not have to reallocate here. + memset(*output, 0, padding); + output_written += padding; + + *output_size = output_written; + return input_read; + error: + free(*output); + *output = NULL; + return -1; +} + +/** Decompresses a fatbin file + * @param fatbin_data Pointer to the fatbin data + * @param fatbin_size Size of the fatbin data + * @param decompressed_data Pointer to a variable that will be set to point to the decompressed data + * @param decompressed_size Pointer to a variable that will be set to the size of the decompressed data + */ +static size_t decompress_fatbin(const uint8_t* fatbin_data, size_t fatbin_size, uint8_t** decompressed_data) +{ + struct fat_elf_header *eh = NULL; + size_t eh_out_offset = 0; + struct fat_text_header *th = NULL; + const uint8_t *input_pos = fatbin_data; + + int i = 0; + uint8_t *output = NULL; + size_t output_size = 0; + ssize_t input_read; + + if (fatbin_data == NULL || decompressed_data == NULL) { + LOGE(LOG_ERROR, "fatbin_data is NULL"); + goto error; + } + + while (input_pos < fatbin_data + fatbin_size) { + if (get_elf_header(input_pos, fatbin_size - (input_pos - fatbin_data), &eh) != 0) { + LOGE(LOG_ERROR, "Something went wrong while checking the header."); + goto error; + } + // printf("elf header no. %d: magic: %#x, version: %#x, header_size: %#x, size: %#zx\n", + // i++, eh->magic, eh->version, eh->header_size, eh->size); + input_pos += eh->header_size; + do { + if (get_text_header(input_pos, fatbin_size - (input_pos - fatbin_data) - eh->header_size, &th) != 0) { + LOGE(LOG_ERROR, "Something went wrong while checking the header."); + goto error; + } + //print_header(th); + input_pos += th->header_size; + + if ((input_read = decompress_section(input_pos, &output, &output_size, eh, th, &eh_out_offset)) < 0) { + LOGE(LOG_ERROR, "Something went wrong while decompressing text section."); + goto error; + } + input_pos += input_read; + + } while (input_pos < (uint8_t*)eh + eh->header_size + eh->size); + + //printf("##### Decompressed data (size %#zx): #####\n", th->decompressed_size); + //hexdump(output_pos, th->decompressed_size); + } + + *decompressed_data = output; + return output_size; + error: + if (output != NULL) { + free(output); + } + *decompressed_data = NULL; + return 0; +} + +int elf2_get_fatbin_info(const struct fat_header *fatbin, list *kernel_infos, uint8_t** fatbin_mem, size_t* fatbin_size) +{ + struct fat_elf_header* eh; + struct fat_text_header* th; + const uint8_t *input_pos = NULL; + const uint8_t *fatbin_data = NULL; + uint8_t *text_data = NULL; + size_t text_data_size = 0; + size_t fatbin_total_size = 0; + int ret = -1; + if (fatbin == NULL || fatbin_mem == NULL || fatbin_size == NULL) { + LOGE(LOG_ERROR, "at least one parameter is NULL"); + goto error; + } + fatbin_data = input_pos = (const uint8_t*)fatbin->text; + if (fatbin->magic != FATBIN_STRUCT_MAGIC) { + LOGE(LOG_ERROR, "fatbin struct magic number is wrong. 
Got %llx, expected %llx.", fatbin->magic, FATBIN_STRUCT_MAGIC); + goto error; + } + LOGE(LOG_DBG(1), "Fatbin: magic: %x, version: %x, text: %lx, data: %lx, ptr: %lx, ptr2: %lx, zero: %lx", + fatbin->magic, fatbin->version, fatbin->text, fatbin->data, fatbin->unknown, fatbin->text2, fatbin->zero); + + if (get_elf_header((uint8_t*)fatbin->text, sizeof(struct fat_elf_header), &eh) != 0) { + LOGE(LOG_ERROR, "Something went wrong while checking the header."); + goto error; + } + // LOGE(LOG_DBG(1), "elf header: magic: %#x, version: %#x, header_size: %#x, size: %#zx", + // eh->magic, eh->version, eh->header_size, eh->size); + + input_pos += eh->header_size; + fatbin_total_size = eh->header_size + eh->size; + do { + if (get_text_header(input_pos, *fatbin_size - (input_pos - fatbin_data) - eh->header_size, &th) != 0) { + LOGE(LOG_ERROR, "Something went wrong while checking the header."); + goto error; + } + //print_header(th); + input_pos += th->header_size; + if (th->kind != 2) { // section does not contain device code (but e.g. PTX) + input_pos += th->size; + continue; + } + if (th->flags & FATBIN_FLAG_DEBUG) { + LOGE(LOG_DEBUG, "fatbin contains debug information."); + } + + if (th->flags & FATBIN_FLAG_COMPRESS) { + ssize_t input_read; + + LOGE(LOG_DEBUG, "fatbin contains compressed device code. Decompressing..."); + if ((input_read = decompress_single_section(input_pos, &text_data, &text_data_size, eh, th)) < 0) { + LOGE(LOG_ERROR, "Something went wrong while decompressing text section."); + goto error; + } + input_pos += input_read; + //hexdump(text_data, text_data_size); + } else { + text_data = (uint8_t*)input_pos; + text_data_size = th->size; + input_pos += th->size; + } + // print_header(th); + if (elf2_parameter_info(kernel_infos, text_data , text_data_size) != 0) { + LOGE(LOG_ERROR, "error getting parameter info"); + goto error; + } + if (th->flags & FATBIN_FLAG_COMPRESS) { + free(text_data); + } + } while (input_pos < (uint8_t*)eh + eh->header_size + eh->size); + + // if (get_elf_header((uint8_t*)fatbin->text2, sizeof(struct fat_elf_header), &eh) != 0) { + // LOGE(LOG_ERROR, "Something went wrong while checking the header."); + // goto error; + // } + // fatbin_total_size += eh->header_size + eh->size; + + *fatbin_mem = (void*)fatbin->text; + *fatbin_size = fatbin_total_size; + ret = 0; + error: + return ret; +} + +static void print_hexmem(void *mem, size_t len) +{ + for (int i=0; iname == NULL || memory == NULL) { + LOGE(LOG_ERROR, "at least one parameter is NULL"); + goto cleanup; + } + kernel->param_num = 0; + kernel->param_offsets = NULL; + kernel->param_sizes = NULL; + + if ((section_name = get_kernel_section_from_kernel_name(kernel->name)) == NULL) { + LOGE(LOG_ERROR, "get_kernel_section_from_kernel_name failed"); + goto cleanup; + } + + if (get_section_by_name(elf, section_name, &section) != 0) { + LOGE(LOG_ERROR, "section %s not found", section_name); + goto cleanup; + } + + if ((data = elf_getdata(section, NULL)) == NULL) { + LOGE(LOG_ERROR, "error getting section data"); + goto cleanup; + } + + //print_hexmem(data->d_buf, data->d_size); + + size_t secpos=0; + int i=0; + while (secpos < data->d_size) { + struct nv_info_kernel_entry *entry = (struct nv_info_kernel_entry*)(data->d_buf+secpos); + // printf("entry %d: format: %#x, attr: %#x, ", i++, entry->format, entry->attribute); + if (entry->format == EIFMT_SVAL && entry->attribute == EIATTR_KPARAM_INFO) { + if (entry->values_size != 0xc) { + LOGE(LOG_ERROR, "EIATTR_KPARAM_INFO values size does not have the expected value of 0xc"); + goto 
cleanup; + } + struct nv_info_kparam_info *kparam = (struct nv_info_kparam_info*)&entry->values; + // printf("kparam: index: %#x, ordinal: %#x, offset: %#x, unknown: %#0x, cbank: %#0x, size: %#0x\n", + // kparam->index, kparam->ordinal, kparam->offset, kparam->unknown, kparam->cbank, kparam->size); + LOGE(LOG_DBG(1), "param %d: offset: %#x, size: %#x", kparam->ordinal, kparam->offset, kparam->size); + if (kparam->ordinal >= kernel->param_num) { + kernel->param_offsets = realloc(kernel->param_offsets, + (kparam->ordinal+1)*sizeof(uint16_t)); + kernel->param_sizes = realloc(kernel->param_sizes, + (kparam->ordinal+1)*sizeof(uint16_t)); + kernel->param_num = kparam->ordinal+1; + } + kernel->param_offsets[kparam->ordinal] = kparam->offset; + kernel->param_sizes[kparam->ordinal] = kparam->size; + secpos += sizeof(struct nv_info_kernel_entry) + entry->values_size-4; + } else if (entry->format == EIFMT_HVAL && entry->attribute == EIATTR_CBANK_PARAM_SIZE) { + kernel->param_size = entry->values_size; + LOGE(LOG_DEBUG, "cbank_param_size: %#0x", entry->values_size); + secpos += sizeof(struct nv_info_kernel_entry)-4; + } else if (entry->format == EIFMT_HVAL) { + // printf("hval: %#x(%d)\n", entry->values_size, entry->values_size); + secpos += sizeof(struct nv_info_kernel_entry)-4; + } else if (entry->format == EIFMT_SVAL) { + // printf("sval_size: %#x ", entry->values_size); + // for (int j=0; j*sizeof(uint32_t) < entry->values_size; j++) { + // printf("val%d: %#x(%d) ", j, (&entry->values)[j], (&entry->values)[j]); + // } + // printf("\n"); + secpos += sizeof(struct nv_info_kernel_entry) + entry->values_size-4; + } else if (entry->format == EIFMT_NVAL) { + // printf("nval\n"); + secpos += sizeof(struct nv_info_kernel_entry)-4; + } else { + LOGE(LOG_WARNING, "unknown format: %#x", entry->format); + secpos += sizeof(struct nv_info_kernel_entry)-4; + } + } + // printf("remaining: %d\n", data->d_size % sizeof(struct nv_info_kernel_entry)); + ret = 0; + cleanup: + free(section_name); + return ret; +} + +static int get_symtab(Elf *elf, Elf_Data **symbol_table_data, size_t *symbol_table_size, GElf_Shdr *symbol_table_shdr) +{ + GElf_Shdr shdr; + Elf_Scn *section = NULL; + + if (elf == NULL || symbol_table_data == NULL || symbol_table_size == NULL) { + LOGE(LOG_ERROR, "invalid argument"); + return -1; + } + + if (get_section_by_name(elf, ".symtab", &section) != 0) { + LOGE(LOG_ERROR, "could not find .symtab section"); + return -1; + } + + if (gelf_getshdr(section, &shdr) == NULL) { + LOGE(LOG_ERROR, "gelf_getshdr failed"); + return -1; + } + + if (symbol_table_shdr != NULL) { + *symbol_table_shdr = shdr; + } + + if(shdr.sh_type != SHT_SYMTAB) { + LOGE(LOG_ERROR, "not a symbol table: %d", shdr.sh_type); + return -1; + } + + if ((*symbol_table_data = elf_getdata(section, NULL)) == NULL) { + LOGE(LOG_ERROR, "elf_getdata failed"); + return -1; + } + + *symbol_table_size = shdr.sh_size / shdr.sh_entsize; + + return 0; +} + +static void print_symtab(Elf *elf) +{ + GElf_Sym sym; + Elf_Data *symbol_table_data = NULL; + GElf_Shdr shdr; + size_t symnum; + int i = 0; + + if (get_symtab(elf, &symbol_table_data, &symnum, &shdr) != 0) { + LOGE(LOG_ERROR, "could not get symbol table"); + return; + } + + LOGE(LOG_DEBUG, "found %d symbols", symnum); + + while (gelf_getsym(symbol_table_data, i, &sym) != NULL) { + printf("sym %d: name: %s, value: %#lx, size: %#lx, info: %#x, other: %#x, shndx: %#x\n", i, + elf_strptr(elf, shdr.sh_link, sym.st_name), + sym.st_value, sym.st_size, sym.st_info, sym.st_other, sym.st_shndx); + i++; + } +} 
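/* Worked example for the compression scheme handled by decompress() above.
 * The format appears to be an LZ4-style token stream; this reading is an
 * assumption derived from the code, not an official format description.
 * Each token byte holds a literal-run length in its high nibble and a match
 * length minus 4 in its low nibble, 0xf in either field is extended by
 * additional bytes, and every match is followed by a 16-bit little-endian
 * back-offset into the output produced so far.
 *
 *   input:  0x32 'a' 'b' 'c' 0x03 0x00
 *           0x32        -> 3 literal bytes, match length 4 + 2 = 6
 *           'a' 'b' 'c' -> literals copied to the output
 *           0x03 0x00   -> back-offset 3
 *   output: "abcabcabc"  (copy "abc", then copy 6 bytes starting 3 bytes
 *           back; the overlapping copy repeats the pattern)
 */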
+ +static int check_elf(Elf *elf) +{ + Elf_Kind ek; + GElf_Ehdr ehdr; + + int elfclass; + char *id; + size_t program_header_num; + size_t sections_num; + size_t section_str_num; + int ret = -1; + + if ((ek = elf_kind(elf)) != ELF_K_ELF) { + LOGE(LOG_ERROR, "elf_kind is not ELF_K_ELF, but %d", ek); + goto cleanup; + } + + if (gelf_getehdr(elf, &ehdr) == NULL) { + LOGE(LOG_ERROR, "gelf_getehdr failed"); + goto cleanup; + } + + if ((elfclass = gelf_getclass(elf)) == ELFCLASSNONE) { + LOGE(LOG_ERROR, "gelf_getclass failed"); + goto cleanup; + } + + if ((id = elf_getident(elf, NULL)) == NULL) { + LOGE(LOG_ERROR, "elf_getident failed"); + goto cleanup; + } + + LOGE(LOG_DBG(1), "elfclass: %d-bit; elf ident[0..%d]: %7s", + (elfclass == ELFCLASS32) ? 32 : 64, + EI_ABIVERSION, id); + + if (elf_getshdrnum(elf, &sections_num) != 0) { + LOGE(LOG_ERROR, "elf_getshdrnum failed"); + goto cleanup; + } + + if (elf_getphdrnum(elf, &program_header_num) != 0) { + LOGE(LOG_ERROR, "elf_getphdrnum failed"); + goto cleanup; + } + + if (elf_getshdrstrndx(elf, &section_str_num) != 0) { + LOGE(LOG_ERROR, "elf_getshdrstrndx failed"); + goto cleanup; + } + + LOGE(LOG_DBG(1), "elf contains %d sections, %d program_headers, string table section: %d", + sections_num, program_header_num, section_str_num); + + ret = 0; +cleanup: + return ret; +} + +int elf2_parameter_info(list *kernel_infos, void* memory, size_t memsize) +{ + struct __attribute__((__packed__)) nv_info_entry{ + uint8_t format; + uint8_t attribute; + uint16_t values_size; + uint32_t kernel_id; + uint32_t value; + }; + + Elf *elf = NULL; + Elf_Scn *section = NULL; + Elf_Data *data = NULL, *symbol_table_data = NULL; + GElf_Shdr symtab_shdr; + size_t symnum; + int i = 0; + GElf_Sym sym; + + int ret = -1; + kernel_info_t *ki = NULL; + const char *kernel_str; + + if (memory == NULL || memsize == 0) { + LOGE(LOG_ERROR, "memory was NULL or memsize was 0"); + return -1; + } + +#define ELF_DUMP_TO_FILE 1 + +#ifdef ELF_DUMP_TO_FILE + FILE* fd2 = fopen("/tmp/cricket-elf-dump", "wb"); + fwrite(memory, memsize, 1, fd2); + fclose(fd2); +#endif + + if ((elf = elf_memory(memory, memsize)) == NULL) { + LOGE(LOG_ERROR, "elf_memory failed"); + goto cleanup; + } + + if (check_elf(elf) != 0) { + LOGE(LOG_ERROR, "check_elf failed"); + goto cleanup; + } + + if (get_symtab(elf, &symbol_table_data, &symnum, &symtab_shdr) != 0) { + LOGE(LOG_ERROR, "could not get symbol table"); + goto cleanup; + } + + if (get_section_by_name(elf, ".nv.info", &section) != 0) { + LOGE(LOG_WARNING, "could not find .nv.info section. This means this binary does not contain any kernels."); + ret = 0; // This is not an error. 
+ goto cleanup; + } + + if ((data = elf_getdata(section, NULL)) == NULL) { + LOGE(LOG_ERROR, "elf_getdata failed"); + goto cleanup; + } + + for (size_t secpos=0; secpos < data->d_size; secpos += sizeof(struct nv_info_entry)) { + struct nv_info_entry *entry = (struct nv_info_entry *)(data->d_buf+secpos); + // LOGE(LOG_DBG(1), "%d: format: %#x, attr: %#x, values_size: %#x kernel: %#x, sval: %#x(%d)", + // i++, entry->format, entry->attribute, entry->values_size, entry->kernel_id, + // entry->value, entry->value); + + if (entry->values_size != 8) { + LOGE(LOG_ERROR, "unexpected values_size: %#x", entry->values_size); + continue; + } + + if (entry->attribute != EIATTR_FRAME_SIZE) { + continue; + } + + if (entry->kernel_id >= symnum) { + LOGE(LOG_ERROR, "kernel_id out of bounds: %#x", entry->kernel_id); + continue; + } + + if (gelf_getsym(symbol_table_data, entry->kernel_id, &sym) == NULL) { + LOGE(LOG_ERROR, "gelf_getsym failed for entry %d", entry->kernel_id); + continue; + } + if ((kernel_str = elf_strptr(elf, symtab_shdr.sh_link, sym.st_name) ) == NULL) { + LOGE(LOG_ERROR, "strptr failed for entry %d", entry->kernel_id); + continue; + } + + if (utils_search_info(kernel_infos, kernel_str) != NULL) { + continue; + } + + LOGE(LOG_DEBUG, "found new kernel: %s (symbol table id: %#x)", kernel_str, entry->kernel_id); + + if (list_append(kernel_infos, (void**)&ki) != 0) { + LOGE(LOG_ERROR, "error on appending to list"); + goto cleanup; + } + + size_t buflen = strlen(kernel_str)+1; + if ((ki->name = malloc(buflen)) == NULL) { + LOGE(LOG_ERROR, "malloc failed"); + goto cleanup; + } + if (strncpy(ki->name, kernel_str, buflen) != ki->name) { + LOGE(LOG_ERROR, "strncpy failed"); + goto cleanup; + } + + if (get_parm_for_kernel(elf, ki, memory, memsize) != 0) { + LOGE(LOG_ERROR, "get_parm_for_kernel failed for kernel %s", kernel_str); + goto cleanup; + } + } + + ret = 0; + cleanup: + if (elf != NULL) { + elf_end(elf); + } + return ret; +} + +void* elf2_symbol_address(const char *symbol) +{ + return dlsym(RTLD_DEFAULT, symbol); +} \ No newline at end of file diff --git a/cpu/cpu-elf2.h b/cpu/cpu-elf2.h new file mode 100644 index 00000000..4223498e --- /dev/null +++ b/cpu/cpu-elf2.h @@ -0,0 +1,25 @@ +#ifndef _ELF_H_ +#define _ELF_H_ + +#include +#include "cpu-common.h" +#include "list.h" + +struct __attribute__((__packed__)) fat_header { + uint32_t magic; + uint32_t version; + uint64_t text; // points to first text section + uint64_t data; // points to outside of the file + uint64_t unknown; + uint64_t text2; // points to second text section + uint64_t zero; +}; + +int elf2_init(void); +int elf2_get_fatbin_info(const struct fat_header *fatbin, list *kernel_infos, uint8_t** fatbin_mem, size_t* fatbin_size); + +int elf2_parameter_info(list *kernel_infos, void* memory, size_t memsize); +void* elf2_symbol_address(const char *symbol); +//int elf2_contains_kernel(void* memory, size_t memsize); + +#endif //_ELF_H_ diff --git a/cpu/cpu-libwrap.h b/cpu/cpu-libwrap.h index 361f4105..5b3a8ba7 100644 --- a/cpu/cpu-libwrap.h +++ b/cpu/cpu-libwrap.h @@ -186,10 +186,24 @@ RET NAME(P1_TYPE P1_NAME, P2_TYPE P2_NAME, P3_TYPE P3_NAME, P4_TYPE P4_NAME, P5_ DEF_FN_PTR(RET, P1_TYPE, P2_TYPE, P3_TYPE, P4_TYPE, P5_TYPE, P6_TYPE, P7_TYPE, P8_TYPE, P9_TYPE, P10_TYPE, P11_TYPE, P12_TYPE, P13_TYPE, P14_TYPE, P15_TYPE, P16_TYPE, P17_TYPE, P18_TYPE, P19_TYPE, P20_TYPE, P21_TYPE); \ DEF_FN_BODY(RET, NAME, P1_NAME, P2_NAME, P3_NAME, P4_NAME, P5_NAME, P6_NAME, P7_NAME, P8_NAME, P9_NAME, P10_NAME, P11_NAME, P12_NAME, P13_NAME, P14_NAME, 
P15_NAME, P16_NAME, P17_NAME, P18_NAME, P19_NAME, P20_NAME, P21_NAME); \ } +#define DEF_FN_22(RET, NAME, P1_TYPE, P1_NAME, P2_TYPE, P2_NAME, P3_TYPE, P3_NAME, P4_TYPE, P4_NAME, P5_TYPE, P5_NAME, P6_TYPE, P6_NAME, P7_TYPE, P7_NAME, P8_TYPE, P8_NAME, P9_TYPE, P9_NAME, P10_TYPE, P10_NAME, P11_TYPE, P11_NAME, P12_TYPE, P12_NAME, P13_TYPE, P13_NAME, P14_TYPE, P14_NAME, P15_TYPE, P15_NAME, P16_TYPE, P16_NAME, P17_TYPE, P17_NAME, P18_TYPE, P18_NAME, P19_TYPE, P19_NAME, P20_TYPE, P20_NAME, P21_TYPE, P21_NAME, P22_TYPE, P22_NAME) \ +RET NAME(P1_TYPE P1_NAME, P2_TYPE P2_NAME, P3_TYPE P3_NAME, P4_TYPE P4_NAME, P5_TYPE P5_NAME, P6_TYPE P6_NAME, P7_TYPE P7_NAME, P8_TYPE P8_NAME, P9_TYPE P9_NAME, P10_TYPE P10_NAME, P11_TYPE P11_NAME, P12_TYPE P12_NAME, P13_TYPE P13_NAME, P14_TYPE P14_NAME, P15_TYPE P15_NAME, P16_TYPE P16_NAME, P17_TYPE P17_NAME, P18_TYPE P18_NAME, P19_TYPE P19_NAME, P20_TYPE P20_NAME, P21_TYPE P21_NAME, P22_TYPE P22_NAME) \ +{ \ + DEF_FN_PTR(RET, P1_TYPE, P2_TYPE, P3_TYPE, P4_TYPE, P5_TYPE, P6_TYPE, P7_TYPE, P8_TYPE, P9_TYPE, P10_TYPE, P11_TYPE, P12_TYPE, P13_TYPE, P14_TYPE, P15_TYPE, P16_TYPE, P17_TYPE, P18_TYPE, P19_TYPE, P20_TYPE, P21_TYPE, P22_TYPE); \ + DEF_FN_BODY(RET, NAME, P1_NAME, P2_NAME, P3_NAME, P4_NAME, P5_NAME, P6_NAME, P7_NAME, P8_NAME, P9_NAME, P10_NAME, P11_NAME, P12_NAME, P13_NAME, P14_NAME, P15_NAME, P16_NAME, P17_NAME, P18_NAME, P19_NAME, P20_NAME, P21_NAME, P22_NAME); \ +} +#define DEF_FN_23(RET, NAME, P1_TYPE, P1_NAME, P2_TYPE, P2_NAME, P3_TYPE, P3_NAME, P4_TYPE, P4_NAME, P5_TYPE, P5_NAME, P6_TYPE, P6_NAME, P7_TYPE, P7_NAME, P8_TYPE, P8_NAME, P9_TYPE, P9_NAME, P10_TYPE, P10_NAME, P11_TYPE, P11_NAME, P12_TYPE, P12_NAME, P13_TYPE, P13_NAME, P14_TYPE, P14_NAME, P15_TYPE, P15_NAME, P16_TYPE, P16_NAME, P17_TYPE, P17_NAME, P18_TYPE, P18_NAME, P19_TYPE, P19_NAME, P20_TYPE, P20_NAME, P21_TYPE, P21_NAME, P22_TYPE, P22_NAME, P23_TYPE, P23_NAME) \ +RET NAME(P1_TYPE P1_NAME, P2_TYPE P2_NAME, P3_TYPE P3_NAME, P4_TYPE P4_NAME, P5_TYPE P5_NAME, P6_TYPE P6_NAME, P7_TYPE P7_NAME, P8_TYPE P8_NAME, P9_TYPE P9_NAME, P10_TYPE P10_NAME, P11_TYPE P11_NAME, P12_TYPE P12_NAME, P13_TYPE P13_NAME, P14_TYPE P14_NAME, P15_TYPE P15_NAME, P16_TYPE P16_NAME, P17_TYPE P17_NAME, P18_TYPE P18_NAME, P19_TYPE P19_NAME, P20_TYPE P20_NAME, P21_TYPE P21_NAME, P22_TYPE P22_NAME, P23_TYPE P23_NAME) \ +{ \ + DEF_FN_PTR(RET, P1_TYPE, P2_TYPE, P3_TYPE, P4_TYPE, P5_TYPE, P6_TYPE, P7_TYPE, P8_TYPE, P9_TYPE, P10_TYPE, P11_TYPE, P12_TYPE, P13_TYPE, P14_TYPE, P15_TYPE, P16_TYPE, P17_TYPE, P18_TYPE, P19_TYPE, P20_TYPE, P21_TYPE, P22_TYPE, P23_TYPE); \ + DEF_FN_BODY(RET, NAME, P1_NAME, P2_NAME, P3_NAME, P4_NAME, P5_NAME, P6_NAME, P7_NAME, P8_NAME, P9_NAME, P10_NAME, P11_NAME, P12_NAME, P13_NAME, P14_NAME, P15_NAME, P16_NAME, P17_NAME, P18_NAME, P19_NAME, P20_NAME, P21_NAME, P22_NAME, P23_NAME); \ +} -#define DEF_FN_X(x, RET, NAME, P1_TYPE, P1_NAME, P2_TYPE, P2_NAME, P3_TYPE, P3_NAME, P4_TYPE, P4_NAME, P5_TYPE, P5_NAME, P6_TYPE, P6_NAME, P7_TYPE, P7_NAME, P8_TYPE, P8_NAME, P9_TYPE, P9_NAME, P10_TYPE, P10_NAME, P11_TYPE, P11_NAME, P12_TYPE, P12_NAME, P13_TYPE, P13_NAME, P14_TYPE, P14_NAME, P15_TYPE, P15_NAME, P16_TYPE, P16_NAME, P17_TYPE, P17_NAME, P18_TYPE, P18_NAME, P19_TYPE, P19_NAME, P20_TYPE, P20_NAME, P21_TYPE, P21_NAME, FUNC, ...) 
FUNC +#define DEF_FN_X(x, RET, NAME, P1_TYPE, P1_NAME, P2_TYPE, P2_NAME, P3_TYPE, P3_NAME, P4_TYPE, P4_NAME, P5_TYPE, P5_NAME, P6_TYPE, P6_NAME, P7_TYPE, P7_NAME, P8_TYPE, P8_NAME, P9_TYPE, P9_NAME, P10_TYPE, P10_NAME, P11_TYPE, P11_NAME, P12_TYPE, P12_NAME, P13_TYPE, P13_NAME, P14_TYPE, P14_NAME, P15_TYPE, P15_NAME, P16_TYPE, P16_NAME, P17_TYPE, P17_NAME, P18_TYPE, P18_NAME, P19_TYPE, P19_NAME, P20_TYPE, P20_NAME, P21_TYPE, P21_NAME, P22_TYPE, P22_NAME, P23_TYPE, P23_NAME, FUNC, ...) FUNC #define DEF_FN(...) DEF_FN_X(,##__VA_ARGS__,\ + DEF_FN_23(__VA_ARGS__),,\ + DEF_FN_22(__VA_ARGS__),,\ DEF_FN_21(__VA_ARGS__),,\ DEF_FN_20(__VA_ARGS__),,\ DEF_FN_19(__VA_ARGS__),,\ diff --git a/cpu/cpu-server-cublas.c b/cpu/cpu-server-cublas.c index 972b2c31..ad54eca0 100644 --- a/cpu/cpu-server-cublas.c +++ b/cpu/cpu-server-cublas.c @@ -16,6 +16,7 @@ #define WITH_RECORDER #include "api-recorder.h" #include "cpu-server-cublas.h" +#include "gsched.h" @@ -43,9 +44,12 @@ bool_t rpc_cublascreate_1_svc(ptr_result *result, struct svc_req *rqstp) RECORD_VOID_API; LOGE(LOG_DEBUG, "cublasCreate_v2"); + GSCHED_RETAIN; result->err = cublasCreate_v2((cublasHandle_t*)&result->ptr_result_u.ptr); - RECORD_RESULT(ptr_result_u, *result); resource_mg_create(&rm_cublas, (void*)result->ptr_result_u.ptr); + GSCHED_RELEASE; + + RECORD_RESULT(ptr_result_u, *result); return 1; } @@ -55,15 +59,33 @@ bool_t rpc_cublasdgemm_1_svc(ptr handle, int transa, int transb, int m, int n, i ptr C, int ldc, int *result, struct svc_req *rqstp) { + RECORD_API(rpc_cublasdgemm_1_argument); + RECORD_ARG(1, handle); + RECORD_ARG(2, transa); + RECORD_ARG(3, transb); + RECORD_ARG(4, m); + RECORD_ARG(5, n); + RECORD_ARG(6, k); + RECORD_ARG(7, alpha); + RECORD_ARG(8, A); + RECORD_ARG(9, lda); + RECORD_ARG(10, B); + RECORD_ARG(11, ldb); + RECORD_ARG(12, beta); + RECORD_ARG(13, C); + RECORD_ARG(14, ldc); LOGE(LOG_DEBUG, "cublasDgemm"); + GSCHED_RETAIN; *result = cublasDgemm(resource_mg_get(&rm_cublas, (void*)handle), (cublasOperation_t) transa, (cublasOperation_t) transb, m, n, k, &alpha, - resource_mg_get(&rm_cublas, (void*)A), lda, - resource_mg_get(&rm_cublas, (void*)B), ldb, &beta, - resource_mg_get(&rm_cublas, (void*)C), ldc + resource_mg_get(&rm_memory, (void*)A), lda, + resource_mg_get(&rm_memory, (void*)B), ldb, &beta, + resource_mg_get(&rm_memory, (void*)C), ldc ); + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); return 1; } @@ -72,7 +94,209 @@ bool_t rpc_cublasdestroy_1_svc(ptr handle, int *result, struct svc_req *rqstp) RECORD_API(ptr); RECORD_SINGLE_ARG(handle); LOGE(LOG_DEBUG, "cublasDestroy_v2"); + GSCHED_RETAIN; *result = cublasDestroy_v2(resource_mg_get(&rm_cublas, (void*)handle)); + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t rpc_cublassetworkspace_1_svc(ptr handle, ptr workspace, size_t workspaceSizeInBytes, int *result, struct svc_req *rqstp) +{ + RECORD_API(rpc_cublassetworkspace_1_argument); + RECORD_NARG(handle); + RECORD_NARG(workspace); + RECORD_NARG(workspaceSizeInBytes); + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + GSCHED_RETAIN; +#if CUBLAS_VERSION >= 11000 + *result = cublasSetWorkspace( + resource_mg_get(&rm_cublas, (void*)handle), + resource_mg_get(&rm_memory, (void*)workspace), + workspaceSizeInBytes); +#else + LOGE(LOG_ERROR, "cublassetworkspace not supported in this version"); + *result = -1; +#endif + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t rpc_cublassetstream_1_svc(ptr handle, ptr streamId, int *result, struct svc_req *rqstp) +{ + 
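+    // The handle and stream ids arrive as plain client-side integers; translate
+    // them back to the live server-side objects via the resource managers before
+    // forwarding the call to cuBLAS.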
RECORD_API(rpc_cublassetstream_1_argument); + RECORD_NARG(handle); + RECORD_NARG(streamId); + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + GSCHED_RETAIN; + *result = cublasSetStream( + resource_mg_get(&rm_cublas, (void*)handle), + resource_mg_get(&rm_streams, (void*)streamId)); + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t rpc_cublassetmathmode_1_svc(ptr handle, int mode, int *result, struct svc_req *rqstp) +{ + RECORD_API(rpc_cublassetmathmode_1_argument); + RECORD_NARG(handle); + RECORD_NARG(mode); + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + GSCHED_RETAIN; + *result = cublasSetMathMode( + resource_mg_get(&rm_cublas, (void*)handle), + (cublasMath_t)mode); + GSCHED_RELEASE; RECORD_RESULT(integer, *result); return 1; } + +bool_t rpc_cublassgemm_1_svc(ptr handle, int transa, int transb, int m, int n, int k, float alpha, + ptr A, int lda, + ptr B, int ldb, float beta, + ptr C, int ldc, + int *result, struct svc_req *rqstp) +{ + RECORD_API(rpc_cublassgemm_1_argument); + RECORD_ARG(1, handle); + RECORD_ARG(2, transa); + RECORD_ARG(3, transb); + RECORD_ARG(4, m); + RECORD_ARG(5, n); + RECORD_ARG(6, k); + RECORD_ARG(7, alpha); + RECORD_ARG(8, A); + RECORD_ARG(9, lda); + RECORD_ARG(10, B); + RECORD_ARG(11, ldb); + RECORD_ARG(12, beta); + RECORD_ARG(13, C); + RECORD_ARG(14, ldc); + LOGE(LOG_DEBUG, "cublasSgemm"); + GSCHED_RETAIN; +#if CUBLAS_VERSION >= 11000 + *result = cublasSgemm(resource_mg_get(&rm_cublas, (void*)handle), + (cublasOperation_t) transa, + (cublasOperation_t) transb, + m, n, k, &alpha, + resource_mg_get(&rm_memory, (void*)A), lda, + resource_mg_get(&rm_memory, (void*)B), ldb, &beta, + resource_mg_get(&rm_memory, (void*)C), ldc + ); +#else + LOGE(LOG_ERROR, "cublassetworkspace not supported in this version"); + *result = -1; +#endif + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t rpc_cublassgemv_1_svc(ptr handle, int trans, int m, + int n, float alpha, + ptr A, int lda, + ptr x, int incx, float beta, + ptr y, int incy, + int *result, struct svc_req *rqstp) +{ + RECORD_API(rpc_cublassgemv_1_argument); + RECORD_ARG(1, handle); + RECORD_ARG(2, trans); + RECORD_ARG(3, m); + RECORD_ARG(4, n); + RECORD_ARG(5, alpha); + RECORD_ARG(6, A); + RECORD_ARG(7, lda); + RECORD_ARG(8, x); + RECORD_ARG(9, incx); + RECORD_ARG(10, beta); + RECORD_ARG(11, y); + RECORD_ARG(12, incy); + LOGE(LOG_DEBUG, "cublasSgemv"); + GSCHED_RETAIN; + *result = cublasSgemv(resource_mg_get(&rm_cublas, (void*)handle), + (cublasOperation_t) trans, + m, n, &alpha, + resource_mg_get(&rm_memory, (void*)A), lda, + resource_mg_get(&rm_memory, (void*)x), incx, &beta, + resource_mg_get(&rm_memory, (void*)y), incy + ); + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t rpc_cublasdgemv_1_svc(ptr handle, int trans, int m, + int n, double alpha, + ptr A, int lda, + ptr x, int incx, double beta, + ptr y, int incy, + int *result, struct svc_req *rqstp) +{ + RECORD_API(rpc_cublasdgemv_1_argument); + RECORD_ARG(1, handle); + RECORD_ARG(2, trans); + RECORD_ARG(3, m); + RECORD_ARG(4, n); + RECORD_ARG(5, alpha); + RECORD_ARG(6, A); + RECORD_ARG(7, lda); + RECORD_ARG(8, x); + RECORD_ARG(9, incx); + RECORD_ARG(10, beta); + RECORD_ARG(11, y); + RECORD_ARG(12, incy); + LOGE(LOG_DEBUG, "cublasDgemv"); + GSCHED_RETAIN; + *result = cublasDgemv(resource_mg_get(&rm_cublas, (void*)handle), + (cublasOperation_t) trans, + m, n, &alpha, + resource_mg_get(&rm_memory, (void*)A), lda, + resource_mg_get(&rm_memory, (void*)x), incx, &beta, + resource_mg_get(&rm_memory, (void*)y), 
incy + ); + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t rpc_cublassgemmex_1_svc(ptr handle, int transa, int transb, int m, int n, int k, float alpha, + ptr A, int Atype, int lda, + ptr B, int Btype, int ldb, float beta, + ptr C, int Ctype, int ldc, + int *result, struct svc_req *rqstp) +{ + RECORD_API(rpc_cublassgemmex_1_argument); + RECORD_ARG(1, handle); + RECORD_ARG(2, transa); + RECORD_ARG(3, transb); + RECORD_ARG(4, m); + RECORD_ARG(5, n); + RECORD_ARG(6, k); + RECORD_ARG(7, alpha); + RECORD_ARG(8, A); + RECORD_ARG(9, Atype); + RECORD_ARG(10, lda); + RECORD_ARG(11, B); + RECORD_ARG(12, Btype); + RECORD_ARG(13, ldb); + RECORD_ARG(14, beta); + RECORD_ARG(15, C); + RECORD_ARG(16, Ctype); + RECORD_ARG(17, ldc); + LOGE(LOG_DEBUG, "cublasSgemmEx"); + GSCHED_RETAIN; + *result = cublasSgemmEx(resource_mg_get(&rm_cublas, (void*)handle), + (cublasOperation_t) transa, + (cublasOperation_t) transb, + m, n, k, &alpha, + resource_mg_get(&rm_memory, (void*)A), (cudaDataType_t)Atype, lda, + resource_mg_get(&rm_memory, (void*)B), (cudaDataType_t)Btype, ldb, &beta, + resource_mg_get(&rm_memory, (void*)C), (cudaDataType_t)Ctype, ldc + ); + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} \ No newline at end of file diff --git a/cpu/cpu-server-cudnn.c b/cpu/cpu-server-cudnn.c new file mode 100644 index 00000000..70e4abce --- /dev/null +++ b/cpu/cpu-server-cudnn.c @@ -0,0 +1,1396 @@ + +#include +#include +#include +#include + +#include "cpu_rpc_prot.h" +#include "cpu-common.h" +#include "cpu-utils.h" +#include "log.h" +#include "resource-mg.h" +#include "gsched.h" + +#define WITH_RECORDER +#include "api-recorder.h" + +#include "cpu-server-cudnn.h" + + + +int server_cudnn_init(int bypass) +{ + int ret = 0; + ret &= resource_mg_init(&rm_cudnn, bypass); + ret &= resource_mg_init(&rm_cudnn_tensors, bypass); + ret &= resource_mg_init(&rm_cudnn_filters, bypass); + ret &= resource_mg_init(&rm_cudnn_poolings, bypass); + ret &= resource_mg_init(&rm_cudnn_activations, bypass); + ret &= resource_mg_init(&rm_cudnn_lrns, bypass); + ret &= resource_mg_init(&rm_cudnn_convs, bypass); + ret &= resource_mg_init(&rm_cudnn_backendds, bypass); + return ret; +} + +int server_cudnn_deinit(void) +{ + resource_mg_free(&rm_cudnn); + resource_mg_free(&rm_cudnn_tensors); + resource_mg_free(&rm_cudnn_filters); + resource_mg_free(&rm_cudnn_poolings); + resource_mg_free(&rm_cudnn_activations); + resource_mg_free(&rm_cudnn_lrns); + resource_mg_free(&rm_cudnn_convs); + resource_mg_free(&rm_cudnn_backendds); + return 0; + +} + +bool_t rpc_cudnngetversion_1_svc(size_t *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + *result = cudnnGetVersion(); + GSCHED_RELEASE; + return 1; +} + +bool_t rpc_cudnngetmaxdeviceversion_1_svc(size_t *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + *result = cudnnGetMaxDeviceVersion(); + GSCHED_RELEASE; + return 1; +} + +bool_t rpc_cudnngetcudartversion_1_svc(size_t *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + *result = cudnnGetCudartVersion(); + GSCHED_RELEASE; + return 1; +} + +bool_t rpc_cudnngeterrorstring_1_svc(int status, char **result, struct svc_req *rqstp) +{ + const char* str; + *result = malloc(128); + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + GSCHED_RETAIN; + str = cudnnGetErrorString((cudnnStatus_t)status); + strncpy(*result, str, 128); + GSCHED_RELEASE; + return 1; +} + +bool_t rpc_cudnnqueryruntimeerror_1_svc(ptr 
handle, int mode, int_result *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + cudnnRuntimeTag_t *tag; + + GSCHED_RETAIN; + result->err = cudnnQueryRuntimeError( + (cudnnHandle_t)resource_mg_get(&rm_cudnn, (void*)handle), + (cudnnStatus_t*)&result->int_result_u.data, (cudnnErrQueryMode_t)mode, tag); + GSCHED_RELEASE; + return 1; +} + +bool_t rpc_cudnngetproperty_1_svc(int type, int_result *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + result->err = cudnnGetProperty((libraryPropertyType)type, &result->int_result_u.data); + GSCHED_RELEASE; + return 1; +} + +bool_t rpc_cudnncreate_1_svc(ptr_result *result, struct svc_req *rqstp) +{ + RECORD_VOID_API; + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + result->err = cudnnCreate((cudnnHandle_t*)&result->ptr_result_u.ptr); + if (resource_mg_create(&rm_cudnn, (void*)result->ptr_result_u.ptr) != 0) { + LOGE(LOG_ERROR, "error in resource manager"); + } + GSCHED_RELEASE; + RECORD_RESULT(ptr_result_u, *result); + return 1; +} + +bool_t rpc_cudnndestroy_1_svc(ptr handle, int *result, struct svc_req *rqstp) +{ + RECORD_API(ptr); + RECORD_SINGLE_ARG(handle); + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + *result = cudnnDestroy( + (cudnnHandle_t)resource_mg_get(&rm_cudnn, (void*)handle)); + // TODO: Remove from resource manager + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t rpc_cudnnsetstream_1_svc(ptr handle, ptr streamId, int *result, struct svc_req *rqstp) +{ + RECORD_API(rpc_cudnnsetstream_1_argument); + RECORD_NARG(handle); + RECORD_NARG(streamId); + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + *result = cudnnSetStream( + (cudnnHandle_t)resource_mg_get(&rm_cudnn, (void*)handle), + (cudaStream_t)resource_mg_get(&rm_streams, (void*)streamId)); + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t rpc_cudnngetstream_1_svc(ptr handle, ptr_result *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + result->err = cudnnGetStream( + (cudnnHandle_t)resource_mg_get(&rm_cudnn, (void*)handle), + (cudaStream_t*)&result->ptr_result_u.ptr); + + GSCHED_RELEASE; + return 1; +} + +bool_t rpc_cudnncreatetensordescriptor_1_svc(ptr_result *result, struct svc_req *rqstp) +{ + RECORD_VOID_API; + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + result->err = cudnnCreateTensorDescriptor((cudnnTensorDescriptor_t*)&result->ptr_result_u.ptr); + if (resource_mg_create(&rm_cudnn_tensors, (void*)result->ptr_result_u.ptr) != 0) { + LOGE(LOG_ERROR, "error in resource manager"); + } + GSCHED_RELEASE; + RECORD_RESULT(ptr_result_u, *result); + return 1; +} + +bool_t rpc_cudnnsettensor4ddescriptor_1_svc(ptr tensorDesc, int format, int dataType, int n, int c, int h, int w, int *result, struct svc_req *rqstp) +{ + RECORD_API(rpc_cudnnsettensor4ddescriptor_1_argument); + RECORD_NARG(tensorDesc); + RECORD_NARG(format); + RECORD_NARG(dataType); + RECORD_NARG(n); + RECORD_NARG(c); + RECORD_NARG(h); + RECORD_NARG(w); + + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + *result = cudnnSetTensor4dDescriptor( + (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)tensorDesc), + (cudnnTensorFormat_t)format, + (cudnnDataType_t)dataType, + n, c, h, w); + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t rpc_cudnnsettensor4ddescriptorex_1_svc(ptr tensorDesc, int dataType, int n, int c, int h, int w, int nStride, int cStride, int hStride, int 
wStride, int *result, struct svc_req *rqstp) +{ + RECORD_API(rpc_cudnnsettensor4ddescriptorex_1_argument); + RECORD_NARG(tensorDesc); + RECORD_NARG(dataType); + RECORD_NARG(n); + RECORD_NARG(c); + RECORD_NARG(h); + RECORD_NARG(w); + RECORD_NARG(nStride); + RECORD_NARG(cStride); + RECORD_NARG(hStride); + RECORD_NARG(wStride); + + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + *result = cudnnSetTensor4dDescriptorEx( + (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)tensorDesc), + (cudnnDataType_t)dataType, + n, c, h, w, nStride, cStride, hStride, wStride); + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t rpc_cudnngettensor4ddescriptor_1_svc(ptr tensorDesc, int9_result *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + result->err = cudnnGetTensor4dDescriptor( + (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)tensorDesc), + (cudnnDataType_t*)&result->int9_result_u.data[0], + &result->int9_result_u.data[1], + &result->int9_result_u.data[2], + &result->int9_result_u.data[3], + &result->int9_result_u.data[4], + &result->int9_result_u.data[5], + &result->int9_result_u.data[6], + &result->int9_result_u.data[7], + &result->int9_result_u.data[8]); + GSCHED_RELEASE; + return 1; +} + +bool_t rpc_cudnnsettensornddescriptor_1_svc(ptr tensorDesc, int dataType, int nbDims, mem_data dimA, mem_data strideA, int *result, struct svc_req *rqstp) +{ + RECORD_API(rpc_cudnnsettensornddescriptor_1_argument); + RECORD_NARG(tensorDesc); + RECORD_NARG(dataType); + RECORD_NARG(nbDims); + RECORD_NARG(dimA); + RECORD_NARG(strideA); + + //TODO: Recording dimA and strideA is not as easy as done here. + + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + if (dimA.mem_data_len != nbDims * sizeof(int) || strideA.mem_data_len != nbDims * sizeof(int)) { + LOGE(LOG_ERROR, "array dimensions not as expected."); + return 0; + } + GSCHED_RETAIN; + *result = cudnnSetTensorNdDescriptor( + (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)tensorDesc), + (cudnnDataType_t)dataType, + nbDims, + (const int*)dimA.mem_data_val, + (const int*)strideA.mem_data_val); + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t rpc_cudnnsettensornddescriptorex_1_svc(ptr tensorDesc, int format, int dataType, int nbDims, mem_data dimA, int *result, struct svc_req *rqstp) +{ + RECORD_API(rpc_cudnnsettensornddescriptorex_1_argument); + RECORD_NARG(tensorDesc); + RECORD_NARG(format); + RECORD_NARG(dataType); + RECORD_NARG(nbDims); + RECORD_NARG(dimA); + + //TODO: Recording dimA and strideA is not as easy as done here. 
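+    // Note that the record above only captures the mem_data pointer/length pair;
+    // replaying this call after a restore would also require a deep copy of dimA.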
+ + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + if (dimA.mem_data_len != nbDims * sizeof(int)) { + LOGE(LOG_ERROR, "array dimensions not as expected."); + return 0; + } + GSCHED_RETAIN; + *result = cudnnSetTensorNdDescriptorEx( + (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)tensorDesc), + (cudnnTensorFormat_t)format, + (cudnnDataType_t)dataType, + nbDims, + (const int*)dimA.mem_data_val); + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t rpc_cudnngettensornddescriptor_1_svc(ptr tensorDesc, int nbDimsRequested, mem_result *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + result->mem_result_u.data.mem_data_len = sizeof(cudnnDataType_t) + sizeof(int) + nbDimsRequested*sizeof(int)*2; + if ((result->mem_result_u.data.mem_data_val = malloc(result->mem_result_u.data.mem_data_len)) == NULL) { + LOGE(LOG_ERROR, "malloc failed"); + return 0; + } + + GSCHED_RETAIN; + result->err = cudnnGetTensorNdDescriptor( + (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)tensorDesc), + nbDimsRequested, + (cudnnDataType_t*)result->mem_result_u.data.mem_data_val, + (int*)&result->mem_result_u.data.mem_data_val[sizeof(cudnnDataType_t)], + (int*)&result->mem_result_u.data.mem_data_val[sizeof(cudnnDataType_t)+sizeof(int)], + (int*)&result->mem_result_u.data.mem_data_val[sizeof(cudnnDataType_t)+sizeof(int)+nbDimsRequested*sizeof(int)]); + + GSCHED_RELEASE; + return 1; +} + +bool_t rpc_cudnngettensorsizeinbytes_1_svc(ptr tensorDesc, sz_result *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + GSCHED_RETAIN; + result->err = cudnnGetTensorSizeInBytes( + (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)tensorDesc), + &result->sz_result_u.data); + GSCHED_RELEASE; + return 1; +} + +bool_t rpc_cudnndestroytensordescriptor_1_svc(ptr tensorDesc, int *result, struct svc_req *rqstp) +{ + RECORD_API(ptr); + RECORD_SINGLE_ARG(tensorDesc); + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + *result = cudnnDestroyTensorDescriptor( + (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)tensorDesc)); + // TODO: Remove from resource manager + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + + +bool_t rpc_cudnncreatefilterdescriptor_1_svc(ptr_result *result, struct svc_req *rqstp) +{ + RECORD_VOID_API; + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + result->err = cudnnCreateFilterDescriptor((cudnnFilterDescriptor_t*)&result->ptr_result_u.ptr); + if (resource_mg_create(&rm_cudnn_filters, (void*)result->ptr_result_u.ptr) != 0) { + LOGE(LOG_ERROR, "error in resource manager"); + } + GSCHED_RELEASE; + RECORD_RESULT(ptr_result_u, *result); + return 1; +} + +bool_t rpc_cudnnsetfilter4ddescriptor_1_svc(ptr filterDesc, int dataType, int format, int k, int c, int h, int w, int *result, struct svc_req *rqstp) +{ + RECORD_API(rpc_cudnnsetfilter4ddescriptor_1_argument); + RECORD_NARG(filterDesc); + RECORD_NARG(dataType); + RECORD_NARG(format); + RECORD_NARG(k); + RECORD_NARG(c); + RECORD_NARG(h); + RECORD_NARG(w); + + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + *result = cudnnSetFilter4dDescriptor( + (cudnnFilterDescriptor_t)resource_mg_get(&rm_cudnn_filters, (void*)filterDesc), + (cudnnDataType_t)dataType, + (cudnnTensorFormat_t)format, + k, c, h, w); + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t rpc_cudnngetfilter4ddescriptor_1_svc(ptr filterDesc, int6_result *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "%s", 
__FUNCTION__); + + GSCHED_RETAIN; + result->err = cudnnGetFilter4dDescriptor( + (cudnnFilterDescriptor_t)resource_mg_get(&rm_cudnn_filters, (void*)filterDesc), + (cudnnDataType_t*)&result->int6_result_u.data[0], + (cudnnTensorFormat_t*)&result->int6_result_u.data[1], + &result->int6_result_u.data[2], + &result->int6_result_u.data[3], + &result->int6_result_u.data[4], + &result->int6_result_u.data[5]); + GSCHED_RELEASE; + return 1; +} + +bool_t rpc_cudnnsetfilternddescriptor_1_svc(ptr filterDesc, int dataType, int format, int nbDims, mem_data filterDimA, int *result, struct svc_req *rqstp) +{ + RECORD_API(rpc_cudnnsetfilternddescriptor_1_argument); + RECORD_NARG(filterDesc); + RECORD_NARG(dataType); + RECORD_NARG(format); + RECORD_NARG(nbDims); + RECORD_NARG(filterDimA); + + //TODO: Recording filterDimA is not as easy as done here. + + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + if (filterDimA.mem_data_len != nbDims * sizeof(int)) { + LOGE(LOG_ERROR, "array dimension not as expected."); + return 0; + } + GSCHED_RETAIN; + *result = cudnnSetFilterNdDescriptor( + (cudnnFilterDescriptor_t)resource_mg_get(&rm_cudnn_filters, (void*)filterDesc), + (cudnnDataType_t)dataType, + (cudnnTensorFormat_t)format, + nbDims, + (const int*)filterDimA.mem_data_val); + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t rpc_cudnngetfilternddescriptor_1_svc(ptr filterDesc, int nbDimsRequested, mem_result *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + result->mem_result_u.data.mem_data_len = sizeof(cudnnDataType_t) + sizeof(cudnnTensorFormat_t) + sizeof(int) + nbDimsRequested*sizeof(int); + if ((result->mem_result_u.data.mem_data_val = malloc(result->mem_result_u.data.mem_data_len)) == NULL) { + LOGE(LOG_ERROR, "malloc failed"); + return 0; + } + + GSCHED_RETAIN; + result->err = cudnnGetFilterNdDescriptor( + (cudnnFilterDescriptor_t)resource_mg_get(&rm_cudnn_filters, (void*)filterDesc), + nbDimsRequested, + (cudnnDataType_t*)result->mem_result_u.data.mem_data_val, + (cudnnTensorFormat_t*)&result->mem_result_u.data.mem_data_val[sizeof(cudnnDataType_t)], + (int*)&result->mem_result_u.data.mem_data_val[sizeof(cudnnDataType_t)+sizeof(cudnnTensorDescriptor_t)], + (int*)&result->mem_result_u.data.mem_data_val[sizeof(cudnnDataType_t)+sizeof(cudnnTensorDescriptor_t)+sizeof(int)]); + GSCHED_RELEASE; + return 1; +} + +bool_t rpc_cudnngetfiltersizeinbytes_1_svc(ptr filterDesc, sz_result *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + GSCHED_RETAIN; + result->err = cudnnGetFilterSizeInBytes( + (cudnnFilterDescriptor_t)resource_mg_get(&rm_cudnn_filters, (void*)filterDesc), + &result->sz_result_u.data); + GSCHED_RELEASE; + return 1; +} + +bool_t rpc_cudnntransformfilter_1_svc(ptr handle, ptr transDesc, cudnn_scaling_t alpha, ptr srcDesc, ptr srcData, cudnn_scaling_t beta, ptr destDesc, ptr destData, int *result, struct svc_req *rqstp) +{ + RECORD_API(rpc_cudnntransformfilter_1_argument); + RECORD_NARG(handle); + RECORD_NARG(transDesc); + RECORD_NARG(alpha); + RECORD_NARG(srcDesc); + RECORD_NARG(srcData); + RECORD_NARG(beta); + RECORD_NARG(destDesc); + RECORD_NARG(destData); + + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + *result = cudnnTransformFilter( + (cudnnHandle_t)resource_mg_get(&rm_cudnn, (void*)handle), + (const cudnnTensorTransformDescriptor_t)resource_mg_get(&rm_cudnn_tensortransform, (void*)transDesc), + (alpha.dataType == CUDNN_DATA_DOUBLE ? 
(const void*)&alpha.cudnn_scaling_t_u.d : (const void*)&alpha.cudnn_scaling_t_u.f), + (const cudnnFilterDescriptor_t)resource_mg_get(&rm_cudnn_filters, (void*)srcDesc), + (const void*)srcData, + (beta.dataType == CUDNN_DATA_DOUBLE ? (const void*)&beta.cudnn_scaling_t_u.d : (const void*)&beta.cudnn_scaling_t_u.f), + (const cudnnFilterDescriptor_t)resource_mg_get(&rm_cudnn_filters, (void*)destDesc), + (void*)destData); + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t rpc_cudnndestroyfilterdescriptor_1_svc(ptr filterDesc, int *result, struct svc_req *rqstp) +{ + RECORD_API(ptr); + RECORD_SINGLE_ARG(filterDesc); + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + *result = cudnnDestroyFilterDescriptor( + (cudnnFilterDescriptor_t)resource_mg_get(&rm_cudnn_filters, (void*)filterDesc)); + // TODO: Remove from resource manager + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t rpc_cudnncreatepoolingdescriptor_1_svc(ptr_result *result, struct svc_req *rqstp) +{ + RECORD_VOID_API; + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + result->err = cudnnCreatePoolingDescriptor((cudnnPoolingDescriptor_t*)&result->ptr_result_u.ptr); + if (resource_mg_create(&rm_cudnn_poolings, (void*)result->ptr_result_u.ptr) != 0) { + LOGE(LOG_ERROR, "error in resource manager"); + } + GSCHED_RELEASE; + RECORD_RESULT(ptr_result_u, *result); + return 1; +} + +bool_t rpc_cudnnsetpooling2ddescriptor_1_svc(ptr poolingDesc, int mode, int maxpoolingNanOpt, int windowHeight, int windowWidth, int verticalPadding, int horizontalPadding, int verticalStride, int horizontalStride, int *result, struct svc_req *rqstp) +{ + RECORD_API(rpc_cudnnsetpooling2ddescriptor_1_argument); + RECORD_NARG(poolingDesc); + RECORD_NARG(mode); + RECORD_NARG(maxpoolingNanOpt); + RECORD_NARG(windowHeight); + RECORD_NARG(windowWidth); + RECORD_NARG(verticalPadding); + RECORD_NARG(horizontalPadding); + RECORD_NARG(verticalStride); + RECORD_NARG(horizontalStride); + + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + *result = cudnnSetPooling2dDescriptor( + (cudnnPoolingDescriptor_t)resource_mg_get(&rm_cudnn_poolings, (void*)poolingDesc), + (cudnnPoolingMode_t)mode, + (cudnnNanPropagation_t)maxpoolingNanOpt, + windowHeight, windowWidth, + verticalPadding, horizontalPadding, + verticalStride, horizontalStride); + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t rpc_cudnngetpooling2ddescriptor_1_svc(ptr poolingDesc, int8_result *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + result->err = cudnnGetPooling2dDescriptor( + (cudnnPoolingDescriptor_t)resource_mg_get(&rm_cudnn_poolings, (void*)poolingDesc), + (cudnnPoolingMode_t*)&result->int8_result_u.data[0], + (cudnnNanPropagation_t*)&result->int8_result_u.data[1], + &result->int8_result_u.data[2], + &result->int8_result_u.data[3], + &result->int8_result_u.data[4], + &result->int8_result_u.data[5], + &result->int8_result_u.data[6], + &result->int8_result_u.data[7]); + GSCHED_RELEASE; + return 1; +} + +bool_t rpc_cudnnsetpoolingnddescriptor_1_svc(ptr poolingDesc, int mode, int maxpoolingNanOpt, int nbDims, mem_data windowDimA, mem_data paddingA, mem_data strideA, int *result, struct svc_req *rqstp) +{ + RECORD_API(rpc_cudnnsetpoolingnddescriptor_1_argument); + RECORD_NARG(poolingDesc); + RECORD_NARG(mode); + RECORD_NARG(maxpoolingNanOpt); + RECORD_NARG(nbDims); + RECORD_NARG(windowDimA); + RECORD_NARG(paddingA); + RECORD_NARG(strideA); + //TODO: Recording 
windowDimA, paddingA and strideA are not as easy as done here. + + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + if (windowDimA.mem_data_len != nbDims * sizeof(int) || + paddingA.mem_data_len != nbDims * sizeof(int) || + strideA.mem_data_len != nbDims * sizeof(int)) { + LOGE(LOG_ERROR, "array dimensions not as expected."); + return 0; + } + GSCHED_RETAIN; + *result = cudnnSetPoolingNdDescriptor( + (cudnnPoolingDescriptor_t)resource_mg_get(&rm_cudnn_poolings, (void*)poolingDesc), + (cudnnPoolingMode_t)mode, + (cudnnNanPropagation_t)maxpoolingNanOpt, + nbDims, + (const int*)windowDimA.mem_data_val, + (const int*)paddingA.mem_data_val, + (const int*)strideA.mem_data_val); + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t rpc_cudnngetpoolingnddescriptor_1_svc(ptr poolingDesc, int nbDimsRequested, mem_result *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + result->mem_result_u.data.mem_data_len = sizeof(cudnnPoolingMode_t) + sizeof(cudnnNanPropagation_t) + nbDimsRequested * sizeof(int) * 3; + if ((result->mem_result_u.data.mem_data_val = malloc(result->mem_result_u.data.mem_data_len)) == NULL) { + LOGE(LOG_ERROR, "malloc failed"); + return 0; + } + + size_t offsets[] = { + 0, + sizeof(cudnnPoolingMode_t), + sizeof(cudnnPoolingMode_t) + sizeof(cudnnNanPropagation_t), + sizeof(cudnnPoolingMode_t) + sizeof(cudnnNanPropagation_t) + sizeof(int), + sizeof(cudnnPoolingMode_t) + sizeof(cudnnNanPropagation_t) + sizeof(int) + sizeof(int) * nbDimsRequested, + sizeof(cudnnPoolingMode_t) + sizeof(cudnnNanPropagation_t) + sizeof(int) + sizeof(int) * nbDimsRequested * 2, + }; + + GSCHED_RETAIN; +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wint-to-pointer-cast" + result->err = cudnnGetPoolingNdDescriptor( + (cudnnPoolingDescriptor_t)resource_mg_get(&rm_cudnn_poolings, (void*)poolingDesc), + nbDimsRequested, + (cudnnPoolingMode_t*)result->mem_result_u.data.mem_data_val[offsets[0]], + (cudnnNanPropagation_t*)result->mem_result_u.data.mem_data_val[offsets[1]], + (int*)result->mem_result_u.data.mem_data_val[offsets[2]], + (int*)result->mem_result_u.data.mem_data_val[offsets[3]], + (int*)result->mem_result_u.data.mem_data_val[offsets[4]], + (int*)result->mem_result_u.data.mem_data_val[offsets[5]]); +#pragma GCC diagnostic pop + + GSCHED_RELEASE; + return 1; +} + +bool_t rpc_cudnngetpoolingndforwardoutputdim_1_svc(ptr poolingDesc, ptr inputTensorDesc, int nbDims, mem_result *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + GSCHED_RETAIN; + result->mem_result_u.data.mem_data_len = sizeof(int) * nbDims; + if ((result->mem_result_u.data.mem_data_val = malloc(result->mem_result_u.data.mem_data_len)) == NULL) { + LOGE(LOG_ERROR, "malloc failed"); + return 0; + } + result->err = cudnnGetPoolingNdForwardOutputDim( + (cudnnPoolingDescriptor_t)resource_mg_get(&rm_cudnn_poolings, (void*)poolingDesc), + (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)inputTensorDesc), + nbDims, + (int*)result->mem_result_u.data.mem_data_val); + GSCHED_RELEASE; + return 1; +} + +bool_t rpc_cudnngetpooling2dforwardoutputdim_1_svc(ptr poolingDesc, ptr inputTensorDesc, int4_result *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + GSCHED_RETAIN; + result->err = cudnnGetPooling2dForwardOutputDim( + (cudnnPoolingDescriptor_t)resource_mg_get(&rm_cudnn_poolings, (void*)poolingDesc), + (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)inputTensorDesc), + (int*)&result->int4_result_u.data[0], + 
(int*)&result->int4_result_u.data[1], + (int*)&result->int4_result_u.data[2], + (int*)&result->int4_result_u.data[3]); + GSCHED_RELEASE; + return 1; +} + +bool_t rpc_cudnndestroypoolingdescriptor_1_svc(ptr poolingDesc, int *result, struct svc_req *rqstp) +{ + RECORD_API(ptr); + RECORD_SINGLE_ARG(poolingDesc); + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + *result = cudnnDestroyPoolingDescriptor( + (cudnnPoolingDescriptor_t)resource_mg_get(&rm_cudnn_poolings, (void*)poolingDesc)); + // TODO: Remove from resource manager + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t rpc_cudnncreateactivationdescriptor_1_svc(ptr_result *result, struct svc_req *rqstp) +{ + RECORD_VOID_API; + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + result->err = cudnnCreateActivationDescriptor((cudnnActivationDescriptor_t*)&result->ptr_result_u.ptr); + if (resource_mg_create(&rm_cudnn_activations, (void*)result->ptr_result_u.ptr) != 0) { + LOGE(LOG_ERROR, "error in resource manager"); + } + GSCHED_RELEASE; + RECORD_RESULT(ptr_result_u, *result); + return 1; +} + +bool_t rpc_cudnnsetactivationdescriptor_1_svc(ptr activationDesc, int mode, int reluNanOpt, double coef, int *result, struct svc_req *rqstp) +{ + RECORD_API(rpc_cudnnsetactivationdescriptor_1_argument); + RECORD_NARG(activationDesc); + RECORD_NARG(mode); + RECORD_NARG(reluNanOpt); + RECORD_NARG(coef); + + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + *result = cudnnSetActivationDescriptor( + (cudnnActivationDescriptor_t)resource_mg_get(&rm_cudnn_activations, (void*)activationDesc), + (cudnnActivationMode_t)mode, + (cudnnNanPropagation_t)reluNanOpt, + coef); + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t rpc_cudnngetactivationdescriptor_1_svc(ptr activationDesc, int2d1_result *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + result->err = cudnnGetActivationDescriptor( + (cudnnActivationDescriptor_t)resource_mg_get(&rm_cudnn_activations, (void*)activationDesc), + (cudnnActivationMode_t*)&result->int2d1_result_u.data.i[0], + (cudnnNanPropagation_t*)&result->int2d1_result_u.data.i[1], + &result->int2d1_result_u.data.d); + GSCHED_RELEASE; + return 1; +} + +bool_t rpc_cudnnsetactivationdescriptorswishbeta_1_svc(ptr activationDesc, double swish_beta, int *result, struct svc_req *rqstp) +{ + RECORD_API(rpc_cudnnsetactivationdescriptorswishbeta_1_argument); + RECORD_NARG(activationDesc); + RECORD_NARG(swish_beta); + + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + *result = cudnnSetActivationDescriptorSwishBeta( + (cudnnActivationDescriptor_t)resource_mg_get(&rm_cudnn_activations, (void*)activationDesc), + swish_beta); + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t rpc_cudnngetactivationdescriptorswishbeta_1_svc(ptr activationDesc, d_result *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + result->err = cudnnGetActivationDescriptorSwishBeta( + (cudnnActivationDescriptor_t)resource_mg_get(&rm_cudnn_activations, (void*)activationDesc), + &result->d_result_u.data); + GSCHED_RELEASE; + return 1; +} + +bool_t rpc_cudnndestroyactivationdescriptor_1_svc(ptr activationDesc, int *result, struct svc_req *rqstp) +{ + RECORD_API(ptr); + RECORD_SINGLE_ARG(activationDesc); + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + *result = cudnnDestroyActivationDescriptor( + (cudnnActivationDescriptor_t)resource_mg_get(&rm_cudnn_activations, 
(void*)activationDesc)); + // TODO: Remove from resource manager + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t rpc_cudnncreatelrndescriptor_1_svc(ptr_result *result, struct svc_req *rqstp) +{ + RECORD_VOID_API; + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + result->err = cudnnCreateLRNDescriptor((cudnnLRNDescriptor_t*)&result->ptr_result_u.ptr); + if (resource_mg_create(&rm_cudnn_lrns, (void*)result->ptr_result_u.ptr) != 0) { + LOGE(LOG_ERROR, "error in resource manager"); + } + GSCHED_RELEASE; + RECORD_RESULT(ptr_result_u, *result); + return 1; +} + +bool_t rpc_cudnnsetlrndescriptor_1_svc(ptr normDesc, unsigned lrnN, double lrnAlpha, double lrnBeta, double lrnK, int *result, struct svc_req *rqstp) +{ + RECORD_API(rpc_cudnnsetlrndescriptor_1_argument); + RECORD_NARG(normDesc); + RECORD_NARG(lrnN); + RECORD_NARG(lrnAlpha); + RECORD_NARG(lrnBeta); + RECORD_NARG(lrnK); + + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + *result = cudnnSetLRNDescriptor( + (cudnnLRNDescriptor_t)resource_mg_get(&rm_cudnn_lrns, (void*)normDesc), + lrnN, + lrnAlpha, + lrnBeta, + lrnK); + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t rpc_cudnngetlrndescriptor_1_svc(ptr normDesc, int1d3_result *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + result->err = cudnnGetLRNDescriptor( + (cudnnLRNDescriptor_t)resource_mg_get(&rm_cudnn_lrns, (void*)normDesc), + (unsigned int*)&result->int1d3_result_u.data.i, + &result->int1d3_result_u.data.d[0], + &result->int1d3_result_u.data.d[1], + &result->int1d3_result_u.data.d[2]); + GSCHED_RELEASE; + return 1; +} + +bool_t rpc_cudnndestroylrndescriptor_1_svc(ptr lrnDesc, int *result, struct svc_req *rqstp) +{ + RECORD_API(ptr); + RECORD_SINGLE_ARG(lrnDesc); + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + *result = cudnnDestroyLRNDescriptor( + (cudnnLRNDescriptor_t)resource_mg_get(&rm_cudnn_lrns, (void*)lrnDesc)); + // TODO: Remove from resource manager + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t rpc_cudnnpoolingforward_1_svc(ptr handle, ptr poolingDesc, cudnn_scaling_t alpha, ptr xDesc, ptr x, cudnn_scaling_t beta, ptr yDesc, ptr y, int *result, struct svc_req *rqstp) +{ + RECORD_API(rpc_cudnnpoolingforward_1_argument); + RECORD_NARG(handle); + RECORD_NARG(poolingDesc); + RECORD_NARG(alpha); + RECORD_NARG(xDesc); + RECORD_NARG(x); + RECORD_NARG(beta); + RECORD_NARG(yDesc); + RECORD_NARG(y); + + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + GSCHED_RETAIN; + *result = cudnnPoolingForward( + (cudnnHandle_t)resource_mg_get(&rm_cudnn, (void*)handle), + (cudnnPoolingDescriptor_t)resource_mg_get(&rm_cudnn_poolings, (void*)poolingDesc), + (alpha.dataType == CUDNN_DATA_DOUBLE ? (const void*)&alpha.cudnn_scaling_t_u.d : (const void*)&alpha.cudnn_scaling_t_u.f), + (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)xDesc), + (const void*)resource_mg_get(&rm_memory, (void*)x), + (beta.dataType == CUDNN_DATA_DOUBLE ? 
(const void*)&beta.cudnn_scaling_t_u.d : (const void*)&beta.cudnn_scaling_t_u.f), + (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)yDesc), + (void*)resource_mg_get(&rm_memory, (void*)y)); + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t rpc_cudnnactivationforward_1_svc(ptr handle, ptr activationDesc, cudnn_scaling_t alpha, ptr xDesc, ptr x, cudnn_scaling_t beta, ptr yDesc, ptr y, int *result, struct svc_req *rqstp) +{ + RECORD_API(rpc_cudnnactivationforward_1_argument); + RECORD_NARG(handle); + RECORD_NARG(activationDesc); + RECORD_NARG(alpha); + RECORD_NARG(xDesc); + RECORD_NARG(x); + RECORD_NARG(beta); + RECORD_NARG(yDesc); + RECORD_NARG(y); + + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + GSCHED_RETAIN; + *result = cudnnActivationForward( + (cudnnHandle_t)resource_mg_get(&rm_cudnn, (void*)handle), + (cudnnActivationDescriptor_t)resource_mg_get(&rm_cudnn_activations, (void*)activationDesc), + (alpha.dataType == CUDNN_DATA_DOUBLE ? (const void*)&alpha.cudnn_scaling_t_u.d : (const void*)&alpha.cudnn_scaling_t_u.f), + (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)xDesc), + (const void*)resource_mg_get(&rm_memory, (void*)x), + (beta.dataType == CUDNN_DATA_DOUBLE ? (const void*)&beta.cudnn_scaling_t_u.d : (const void*)&beta.cudnn_scaling_t_u.f), + (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)yDesc), + (void*)resource_mg_get(&rm_memory, (void*)y)); + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t rpc_cudnnlrncrosschannelforward_1_svc(ptr handle, ptr normDesc, int lrnMode, cudnn_scaling_t alpha, ptr xDesc, ptr x, cudnn_scaling_t beta, ptr yDesc, ptr y, int *result, struct svc_req *rqstp) +{ + RECORD_API(rpc_cudnnlrncrosschannelforward_1_argument); + RECORD_NARG(handle); + RECORD_NARG(normDesc); + RECORD_NARG(lrnMode); + RECORD_NARG(alpha); + RECORD_NARG(xDesc); + RECORD_NARG(x); + RECORD_NARG(beta); + RECORD_NARG(yDesc); + RECORD_NARG(y); + + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + GSCHED_RETAIN; + *result = cudnnLRNCrossChannelForward( + (cudnnHandle_t)resource_mg_get(&rm_cudnn, (void*)handle), + (cudnnLRNDescriptor_t)resource_mg_get(&rm_cudnn_lrns, (void*)normDesc), + (cudnnLRNMode_t)lrnMode, + (alpha.dataType == CUDNN_DATA_DOUBLE ? (const void*)&alpha.cudnn_scaling_t_u.d : (const void*)&alpha.cudnn_scaling_t_u.f), + (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)xDesc), + (const void*)resource_mg_get(&rm_memory, (void*)x), + (beta.dataType == CUDNN_DATA_DOUBLE ? (const void*)&beta.cudnn_scaling_t_u.d : (const void*)&beta.cudnn_scaling_t_u.f), + (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)yDesc), + (void*)resource_mg_get(&rm_memory, (void*)y)); + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t rpc_cudnnsoftmaxforward_1_svc(ptr handle, int algo, int mode, cudnn_scaling_t alpha, ptr xDesc, ptr x, cudnn_scaling_t beta, ptr yDesc, ptr y, int *result, struct svc_req *rqstp) +{ + RECORD_API(rpc_cudnnsoftmaxforward_1_argument); + RECORD_NARG(handle); + RECORD_NARG(algo); + RECORD_NARG(mode); + RECORD_NARG(alpha); + RECORD_NARG(xDesc); + RECORD_NARG(x); + RECORD_NARG(beta); + RECORD_NARG(yDesc); + RECORD_NARG(y); + + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + GSCHED_RETAIN; + *result = cudnnSoftmaxForward( + (cudnnHandle_t)resource_mg_get(&rm_cudnn, (void*)handle), + (cudnnSoftmaxAlgorithm_t)algo, + (cudnnSoftmaxMode_t)mode, + (alpha.dataType == CUDNN_DATA_DOUBLE ? 
(const void*)&alpha.cudnn_scaling_t_u.d : (const void*)&alpha.cudnn_scaling_t_u.f), + (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)xDesc), + (const void*)resource_mg_get(&rm_memory, (void*)x), + (beta.dataType == CUDNN_DATA_DOUBLE ? (const void*)&beta.cudnn_scaling_t_u.d : (const void*)&beta.cudnn_scaling_t_u.f), + (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)yDesc), + (void*)resource_mg_get(&rm_memory, (void*)y)); + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +/* cudnn cnn inference */ +bool_t rpc_cudnngetconvolutionndforwardoutputdim_1_svc(ptr convDesc, ptr inputTensorDesc, ptr filterDesc, int nbDims, mem_result *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + GSCHED_RETAIN; + result->mem_result_u.data.mem_data_len = sizeof(int) * nbDims; + if ((result->mem_result_u.data.mem_data_val = malloc(result->mem_result_u.data.mem_data_len)) == NULL) { + LOGE(LOG_ERROR, "malloc failed"); + return 0; + } + result->err = cudnnGetConvolutionNdForwardOutputDim( + (cudnnConvolutionDescriptor_t)resource_mg_get(&rm_cudnn_convs, (void*)convDesc), + (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)inputTensorDesc), + (cudnnFilterDescriptor_t)resource_mg_get(&rm_cudnn_filters, (void*)filterDesc), + nbDims, + (int*)result->mem_result_u.data.mem_data_val); + GSCHED_RELEASE; + return 1; +} + +bool_t rpc_cudnncreateconvolutiondescriptor_1_svc(ptr_result *result, struct svc_req *rqstp) +{ + RECORD_VOID_API; + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + result->err = cudnnCreateConvolutionDescriptor((cudnnConvolutionDescriptor_t*)&result->ptr_result_u.ptr); + if (resource_mg_create(&rm_cudnn_convs, (void*)result->ptr_result_u.ptr) != 0) { + LOGE(LOG_ERROR, "error in resource manager"); + } + GSCHED_RELEASE; + RECORD_RESULT(ptr_result_u, *result); + return 1; +} + +bool_t rpc_cudnndestroyconvolutiondescriptor_1_svc(ptr convDesc, int *result, struct svc_req *rqstp) +{ + RECORD_API(ptr); + RECORD_SINGLE_ARG(convDesc); + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + *result = cudnnDestroyConvolutionDescriptor( + (cudnnConvolutionDescriptor_t)resource_mg_get(&rm_cudnn_convs, (void*)convDesc)); + // TODO: Remove from resource manager + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t rpc_cudnnsetconvolutionnddescriptor_1_svc(ptr convDesc, int arrayLength, mem_data padA, mem_data filterStrideA, mem_data dilationA, int mode, int computeType, int *result, struct svc_req *rqstp) +{ + RECORD_API(rpc_cudnnsetconvolutionnddescriptor_1_argument); + RECORD_NARG(convDesc); + RECORD_NARG(arrayLength); + RECORD_NARG(padA); + RECORD_NARG(filterStrideA); + RECORD_NARG(dilationA); + RECORD_NARG(mode); + RECORD_NARG(computeType); + //TODO: Recording mem_data is not as easy as done here. 
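+    // padA, filterStrideA and dilationA arrive as raw XDR byte buffers; their
+    // lengths are checked against arrayLength below before casting to int arrays.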
+ + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + if (padA.mem_data_len != arrayLength * sizeof(int) || + filterStrideA.mem_data_len != arrayLength * sizeof(int) || + dilationA.mem_data_len != arrayLength * sizeof(int)) { + LOGE(LOG_ERROR, "array dimensions not as expected."); + return 0; + } + GSCHED_RETAIN; + *result = cudnnSetConvolutionNdDescriptor( + (cudnnConvolutionDescriptor_t)resource_mg_get(&rm_cudnn_convs, (void*)convDesc), + arrayLength, + (const int*)padA.mem_data_val, + (const int*)filterStrideA.mem_data_val, + (const int*)dilationA.mem_data_val, + (cudnnConvolutionMode_t)mode, + (cudnnDataType_t)computeType); + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t rpc_cudnngetconvolutionforwardalgorithm_v7_1_svc(ptr handle, ptr srcDesc, ptr filterDesc, ptr convDesc, ptr destDesc, int requestedAlgoCount, mem_result *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + GSCHED_RETAIN; + result->mem_result_u.data.mem_data_len = sizeof(int) + sizeof(cudnnConvolutionFwdAlgoPerf_t) * requestedAlgoCount; + if ((result->mem_result_u.data.mem_data_val = malloc(result->mem_result_u.data.mem_data_len)) == NULL) { + LOGE(LOG_ERROR, "malloc failed"); + return 0; + } + result->err = cudnnGetConvolutionForwardAlgorithm_v7( + (cudnnHandle_t)resource_mg_get(&rm_cudnn, (void*)handle), + (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)srcDesc), + (cudnnFilterDescriptor_t)resource_mg_get(&rm_cudnn_filters, (void*)filterDesc), + (cudnnConvolutionDescriptor_t)resource_mg_get(&rm_cudnn_convs, (void*)convDesc), + (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)destDesc), + requestedAlgoCount, + (int*)result->mem_result_u.data.mem_data_val, + (cudnnConvolutionFwdAlgoPerf_t*)(result->mem_result_u.data.mem_data_val + sizeof(int))); + GSCHED_RELEASE; + return 1; +} + +bool_t rpc_cudnnfindconvolutionforwardalgorithm_1_svc(ptr handle, ptr xDesc, ptr wDesc, ptr convDesc, ptr yDesc, int requestedAlgoCount, mem_result *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + GSCHED_RETAIN; + result->mem_result_u.data.mem_data_len = sizeof(int) + sizeof(cudnnConvolutionFwdAlgoPerf_t) * requestedAlgoCount; + if ((result->mem_result_u.data.mem_data_val = malloc(result->mem_result_u.data.mem_data_len)) == NULL) { + LOGE(LOG_ERROR, "malloc failed"); + return 0; + } + result->err = cudnnFindConvolutionForwardAlgorithm( + (cudnnHandle_t)resource_mg_get(&rm_cudnn, (void*)handle), + (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)xDesc), + (cudnnFilterDescriptor_t)resource_mg_get(&rm_cudnn_filters, (void*)wDesc), + (cudnnConvolutionDescriptor_t)resource_mg_get(&rm_cudnn_convs, (void*)convDesc), + (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)yDesc), + requestedAlgoCount, + (int*)result->mem_result_u.data.mem_data_val, + (cudnnConvolutionFwdAlgoPerf_t*)(result->mem_result_u.data.mem_data_val + sizeof(int))); + GSCHED_RELEASE; + return 1; +} + +bool_t rpc_cudnngetconvolutionforwardworkspacesize_1_svc(ptr handle, ptr xDesc, ptr wDesc, ptr convDesc, ptr yDesc, int algo, sz_result *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + GSCHED_RETAIN; + result->err = cudnnGetConvolutionForwardWorkspaceSize( + (cudnnHandle_t)resource_mg_get(&rm_cudnn, (void*)handle), + (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)xDesc), + (cudnnFilterDescriptor_t)resource_mg_get(&rm_cudnn_filters, (void*)wDesc), + 
(cudnnConvolutionDescriptor_t)resource_mg_get(&rm_cudnn_convs, (void*)convDesc), + (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)yDesc), + (cudnnConvolutionFwdAlgo_t)algo, + (size_t*)&result->sz_result_u.data); + GSCHED_RELEASE; + return 1; +} + +bool_t rpc_cudnnconvolutionforward_1_svc(ptr handle, cudnn_scaling_t alpha, ptr xDesc, ptr x, ptr wDesc, ptr w, ptr convDesc, int algo, ptr workSpace, size_t workSpaceSizeInBytes, cudnn_scaling_t beta, ptr yDesc, ptr y, int *result, struct svc_req *rqstp) +{ + RECORD_API(rpc_cudnnconvolutionforward_1_argument); + RECORD_NARG(handle); + RECORD_NARG(alpha); + RECORD_NARG(xDesc); + RECORD_NARG(x); + RECORD_NARG(wDesc); + RECORD_NARG(w); + RECORD_NARG(convDesc); + RECORD_NARG(algo); + RECORD_NARG(workSpace); + RECORD_NARG(workSpaceSizeInBytes); + RECORD_NARG(beta); + RECORD_NARG(yDesc); + RECORD_NARG(y); + + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + GSCHED_RETAIN; + *result = cudnnConvolutionForward( + (cudnnHandle_t)resource_mg_get(&rm_cudnn, (void*)handle), + (alpha.dataType == CUDNN_DATA_DOUBLE ? (const void*)&alpha.cudnn_scaling_t_u.d : (const void*)&alpha.cudnn_scaling_t_u.f), + (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)xDesc), + (const void*)resource_mg_get(&rm_memory, (void*)x), + (cudnnFilterDescriptor_t)resource_mg_get(&rm_cudnn_filters, (void*)wDesc), + (const void*)resource_mg_get(&rm_memory, (void*)w), + (cudnnConvolutionDescriptor_t)resource_mg_get(&rm_cudnn_convs, (void*)convDesc), + algo, + (void*)resource_mg_get(&rm_memory, (void*)workSpace), + workSpaceSizeInBytes, + (beta.dataType == CUDNN_DATA_DOUBLE ? (const void*)&beta.cudnn_scaling_t_u.d : (const void*)&beta.cudnn_scaling_t_u.f), + (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)yDesc), + (void*)resource_mg_get(&rm_memory, (void*)y)); + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t rpc_cudnnaddtensor_1_svc(ptr handle, cudnn_scaling_t alpha, ptr aDesc, ptr A, cudnn_scaling_t beta, ptr cDesc, ptr C, int *result, struct svc_req *rqstp) +{ + RECORD_API(rpc_cudnnaddtensor_1_argument); + RECORD_NARG(handle); + RECORD_NARG(alpha); + RECORD_NARG(aDesc); + RECORD_NARG(A); + RECORD_NARG(beta); + RECORD_NARG(cDesc); + RECORD_NARG(C); + + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + GSCHED_RETAIN; + *result = cudnnAddTensor( + (cudnnHandle_t)resource_mg_get(&rm_cudnn, (void*)handle), + (alpha.dataType == CUDNN_DATA_DOUBLE ? (const void*)&alpha.cudnn_scaling_t_u.d : (const void*)&alpha.cudnn_scaling_t_u.f), + (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)aDesc), + (const void*)resource_mg_get(&rm_memory, (void*)A), + (beta.dataType == CUDNN_DATA_DOUBLE ? (const void*)&beta.cudnn_scaling_t_u.d : (const void*)&beta.cudnn_scaling_t_u.f), + (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)cDesc), + (void*)resource_mg_get(&rm_memory, (void*)C)); + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t rpc_cudnntransformtensor_1_svc(ptr handle, cudnn_scaling_t alpha, ptr xDesc, ptr x, cudnn_scaling_t beta, ptr yDesc, ptr y, int *result, struct svc_req *rqstp) +{ + RECORD_API(rpc_cudnntransformtensor_1_argument); + RECORD_NARG(handle); + RECORD_NARG(alpha); + RECORD_NARG(xDesc); + RECORD_NARG(x); + RECORD_NARG(beta); + RECORD_NARG(yDesc); + RECORD_NARG(y); + + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + GSCHED_RETAIN; + *result = cudnnTransformTensor( + (cudnnHandle_t)resource_mg_get(&rm_cudnn, (void*)handle), + (alpha.dataType == CUDNN_DATA_DOUBLE ? 
(const void*)&alpha.cudnn_scaling_t_u.d : (const void*)&alpha.cudnn_scaling_t_u.f), + (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)xDesc), + (const void*)resource_mg_get(&rm_memory, (void*)x), + (beta.dataType == CUDNN_DATA_DOUBLE ? (const void*)&beta.cudnn_scaling_t_u.d : (const void*)&beta.cudnn_scaling_t_u.f), + (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)yDesc), + (void*)resource_mg_get(&rm_memory, (void*)y)); + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +static const size_t backendAttributeSizes[] = { + [CUDNN_TYPE_HANDLE] = sizeof(cudnnHandle_t), + [CUDNN_TYPE_DATA_TYPE] = sizeof(cudnnDataType_t), + [CUDNN_TYPE_BOOLEAN] = sizeof(bool), + [CUDNN_TYPE_INT64] = sizeof(int64_t), + [CUDNN_TYPE_FLOAT] = sizeof(float), + [CUDNN_TYPE_DOUBLE] = sizeof(double), + [CUDNN_TYPE_VOID_PTR] = sizeof(void *), + [CUDNN_TYPE_CONVOLUTION_MODE] = sizeof(cudnnConvolutionMode_t), + [CUDNN_TYPE_HEUR_MODE] = sizeof(cudnnBackendHeurMode_t), + [CUDNN_TYPE_KNOB_TYPE] = sizeof(cudnnBackendKnobType_t), + [CUDNN_TYPE_NAN_PROPOGATION] = sizeof(cudnnNanPropagation_t), + [CUDNN_TYPE_NUMERICAL_NOTE] = sizeof(cudnnBackendNumericalNote_t), + [CUDNN_TYPE_LAYOUT_TYPE] = sizeof(cudnnBackendLayoutType_t), + [CUDNN_TYPE_ATTRIB_NAME] = sizeof(cudnnBackendAttributeName_t), + [CUDNN_TYPE_POINTWISE_MODE] = sizeof(cudnnPointwiseMode_t), + [CUDNN_TYPE_BACKEND_DESCRIPTOR] = sizeof(cudnnBackendDescriptor_t), + [CUDNN_TYPE_GENSTATS_MODE] = sizeof(cudnnGenStatsMode_t), + [CUDNN_TYPE_BN_FINALIZE_STATS_MODE] = sizeof(cudnnBnFinalizeStatsMode_t), + [CUDNN_TYPE_REDUCTION_OPERATOR_TYPE] = sizeof(cudnnReduceTensorOp_t), + [CUDNN_TYPE_BEHAVIOR_NOTE] = sizeof(cudnnBackendBehaviorNote_t), + [CUDNN_TYPE_TENSOR_REORDERING_MODE] = sizeof(cudnnBackendTensorReordering_t), + [CUDNN_TYPE_RESAMPLE_MODE] = sizeof(cudnnResampleMode_t), + [CUDNN_TYPE_PADDING_MODE] = sizeof(cudnnPaddingMode_t), + [CUDNN_TYPE_INT32] = sizeof(int32_t), + [CUDNN_TYPE_CHAR] = sizeof(char), + [CUDNN_TYPE_SIGNAL_MODE] = sizeof(cudnnSignalMode_t), + [CUDNN_TYPE_FRACTION] = sizeof(cudnnFraction_t), + [CUDNN_TYPE_NORM_MODE] = sizeof(cudnnBackendNormMode_t), + [CUDNN_TYPE_NORM_FWD_PHASE] = sizeof(cudnnBackendNormFwdPhase_t), + [CUDNN_TYPE_RNG_DISTRIBUTION] = sizeof(cudnnRngDistribution_t), +}; + +bool_t rpc_cudnnbackendcreatedescriptor_1_svc(int descriptorType, ptr_result *result, struct svc_req *rqstp) +{ + RECORD_API(int); + RECORD_SINGLE_ARG(descriptorType); + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + result->err = cudnnBackendCreateDescriptor( + (cudnnBackendDescriptorType_t)descriptorType, + (cudnnBackendDescriptor_t*)&result->ptr_result_u.ptr); + if (resource_mg_create(&rm_cudnn_backendds, (void*)result->ptr_result_u.ptr) != 0) { + LOGE(LOG_ERROR, "error in resource manager"); + } + GSCHED_RELEASE; + RECORD_RESULT(ptr_result_u, *result); + return 1; +} + +bool_t rpc_cudnnbackenddestroydescriptor_1_svc(ptr descriptor, int *result, struct svc_req *rqstp) +{ + RECORD_API(ptr); + RECORD_SINGLE_ARG(descriptor); + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + *result = cudnnBackendDestroyDescriptor( + (cudnnBackendDescriptor_t)resource_mg_get(&rm_cudnn_backendds, (void*)descriptor)); + // TODO: Remove from resource manager + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t rpc_cudnnbackendinitialize_1_svc(ptr descriptor, int *result, struct svc_req *rqstp) +{ + RECORD_API(ptr); + RECORD_SINGLE_ARG(descriptor); + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + 
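+    // As with the other forwarded calls, GSCHED_RETAIN/GSCHED_RELEASE bracket the
+    // cuDNN invocation so the GPU scheduler can arbitrate access to the device.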
GSCHED_RETAIN; + *result = cudnnBackendInitialize( + (cudnnBackendDescriptor_t)resource_mg_get(&rm_cudnn_backendds, (void*)descriptor)); + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t rpc_cudnnbackendfinalize_1_svc(ptr descriptor, int *result, struct svc_req *rqstp) +{ + RECORD_API(ptr); + RECORD_SINGLE_ARG(descriptor); + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + *result = cudnnBackendFinalize( + (cudnnBackendDescriptor_t)resource_mg_get(&rm_cudnn_backendds, (void*)descriptor)); + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} +bool_t rpc_cudnnbackendsetattribute_1_svc( + ptr descriptor, + int attributeName, + int attributeType, + int64_t elementCount, + mem_data arrayOfElements, + int *result, struct svc_req *rqstp) +{ + RECORD_API(rpc_cudnnbackendsetattribute_1_argument); + RECORD_NARG(descriptor); + RECORD_NARG(attributeName); + RECORD_NARG(attributeType); + RECORD_NARG(elementCount); + RECORD_NARG(arrayOfElements); + + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + if (attributeType < 0 || attributeType >= CUDNN_TYPE_RNG_DISTRIBUTION) { + LOGE(LOG_ERROR, "attributeType out of range."); + return 0; + } + + if (arrayOfElements.mem_data_len != elementCount * backendAttributeSizes[attributeType]) { + LOGE(LOG_ERROR, "array dimensions not as expected."); + return 0; + } + GSCHED_RETAIN; + *result = cudnnBackendSetAttribute( + (cudnnBackendDescriptor_t)resource_mg_get(&rm_cudnn_backendds, (void*)descriptor), + (cudnnBackendAttributeName_t)attributeName, + (cudnnBackendAttributeType_t)attributeType, + elementCount, + arrayOfElements.mem_data_val); + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t rpc_cudnnbackendgetattribute_1_svc(ptr descriptor, int attributeName, int attributeType, int64_t requestedElementCount, mem_result *result, struct svc_req *rqstp) +{ + void *arrayOfElements = NULL; + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + if (attributeType < 0 || attributeType >= CUDNN_TYPE_RNG_DISTRIBUTION) { + LOGE(LOG_ERROR, "attributeType out of range."); + return 0; + } + result->mem_result_u.data.mem_data_len = sizeof(int64_t) + requestedElementCount*sizeof(backendAttributeSizes[attributeType]); + if ((result->mem_result_u.data.mem_data_val = malloc(result->mem_result_u.data.mem_data_len)) == NULL) { + LOGE(LOG_ERROR, "malloc failed"); + return 0; + } + if (requestedElementCount > 0) { + void *data = result->mem_result_u.data.mem_data_val + sizeof(int64_t); + } + + GSCHED_RETAIN; + result->err = cudnnBackendGetAttribute( + (cudnnBackendDescriptor_t)resource_mg_get(&rm_cudnn_backendds, (void*)descriptor), + (cudnnBackendAttributeName_t)attributeName, + (cudnnBackendAttributeType_t)attributeType, + requestedElementCount, + (int64_t*)result->mem_result_u.data.mem_data_val, + arrayOfElements); + + LOGE(LOG_DEBUG, "desc: %p, name: %d, type: %d, requestedElementCount: %zd, elementCount: %zd, arrayOfElements: %p -> %d", descriptor, attributeName, attributeType, requestedElementCount, *result->mem_result_u.data.mem_data_val, arrayOfElements, result->err); + + GSCHED_RELEASE; + return 1; +} +bool_t rpc_cudnnbackendexecute_1_svc(ptr handle, ptr executionPlan, ptr variantPack, int *result, struct svc_req *rqstp) +{ + RECORD_API(rpc_cudnnbackendexecute_1_argument); + RECORD_NARG(handle); + RECORD_NARG(executionPlan); + RECORD_NARG(variantPack); + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + + GSCHED_RETAIN; + *result = cudnnBackendExecute( + (cudnnHandle_t)resource_mg_get(&rm_cudnn, (void*)handle), + 
(cudnnBackendDescriptor_t)resource_mg_get(&rm_cudnn_backendds, (void*)executionPlan), + (cudnnBackendDescriptor_t)resource_mg_get(&rm_cudnn_backendds, (void*)variantPack)); + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} \ No newline at end of file diff --git a/cpu/cpu-server-cudnn.h b/cpu/cpu-server-cudnn.h new file mode 100644 index 00000000..6c892919 --- /dev/null +++ b/cpu/cpu-server-cudnn.h @@ -0,0 +1,9 @@ +#ifndef _CPU_SERVER_CUDNN_H_ +#define _CPU_SERVER_CUDNN_H_ + +#include "resource-mg.h" + +int server_cudnn_init(int restore); +int server_cudnn_deinit(void); + +#endif // _CPU_SERVER_CUDNN_H_ \ No newline at end of file diff --git a/cpu/cpu-server-driver.c b/cpu/cpu-server-driver.c index f6714b56..4eb2aad4 100644 --- a/cpu/cpu-server-driver.c +++ b/cpu/cpu-server-driver.c @@ -20,16 +20,157 @@ int server_driver_init(int restore) int ret = 0; if (!restore) { - ret &= resource_mg_init(&rm_modules, 1); - ret &= resource_mg_init(&rm_functions, 1); + // we cannot bypass the resource manager for functions and modules + // because CUfunctions and modules are at different locations on server and client + ret &= resource_mg_init(&rm_modules, 0); + ret &= resource_mg_init(&rm_functions, 0); + ret &= resource_mg_init(&rm_globals, 0); } else { ret &= resource_mg_init(&rm_modules, 0); ret &= resource_mg_init(&rm_functions, 0); + ret &= resource_mg_init(&rm_globals, 0); //ret &= server_driver_restore("ckp"); } return ret; } +#include + +// Does not support checkpoint/restart yet +bool_t rpc_elf_load_1_svc(mem_data elf, ptr module_key, int *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "rpc_elf_load(elf: %p, len: %#x, module_key: %#x)", elf.mem_data_val, elf.mem_data_len, module_key); + CUresult res; + CUmodule module = NULL; + + if ((res = cuModuleLoadData(&module, elf.mem_data_val)) != CUDA_SUCCESS) { + LOGE(LOG_ERROR, "cuModuleLoadData failed: %d", res); + *result = res; + return 1; + } + + // We add our module using module_key as key. This means a fatbinaryHandle on the client is translated + // to a CUmodule on the server. 
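The comment above states the central invariant of the new module handling: the client keeps using the handles it obtained locally (fatbinary handle, hostFun pointer), and the server resolves them through the resource manager on every call. A minimal sketch of the whole chain, using resource_mg_get and rm_functions exactly as they are used elsewhere in this patch; the helper name, the fixed launch dimensions and the omitted error handling are illustrative only, and the snippet assumes the includes and globals of this file:

    /* 1. rpc_elf_load (right below):      rm_modules[module_key] -> CUmodule        */
    /* 2. rpc_register_function (further   rm_functions[hostFun]  -> CUfunction      */
    /*    down in this file)                                                          */
    /* 3. launch time: resolve the client handle again before calling the driver API */
    static CUresult launch_by_client_handle(uint64_t hostFun, CUstream stream, void **args)
    {
        CUfunction fn = (CUfunction)resource_mg_get(&rm_functions, (void *)hostFun);
        /* grid/block dimensions elided to 1 for brevity */
        return cuLaunchKernel(fn, 1, 1, 1, 1, 1, 1, 0, stream, args, NULL);
    }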
+ if ((res = resource_mg_add_sorted(&rm_modules, (void*)module_key, (void*)module)) != CUDA_SUCCESS) { + LOGE(LOG_ERROR, "resource_mg_create failed: %d", res); + *result = res; + return 1; + } + + LOGE(LOG_DEBUG, "->module: %p", module); + *result = 0; + return 1; +} + +// Does not support checkpoint/restart yet +// TODO: We should also remove associated function handles +bool_t rpc_elf_unload_1_svc(ptr elf_handle, int *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "rpc_elf_unload(elf_handle: %p)", elf_handle); + CUmodule module = NULL; + CUresult res; + + if ((module = (CUmodule)resource_mg_get(&rm_modules, (void*)elf_handle)) == NULL) { + LOG(LOG_ERROR, "resource_mg_get failed"); + *result = -1; + return 1; + } + + LOGE(LOG_DEBUG,"module: %p", module); + + // if ((res = resource_mg_remove(&rm_modules, (void*)elf_handle)) != CUDA_SUCCESS) { + // LOG(LOG_ERROR, "resource_mg_create failed: %d", res); + // result->err = res; + // return 1; + // } + + if ((res = cuModuleUnload(module)) != CUDA_SUCCESS) { + const char *errstr; + cuGetErrorString(res, &errstr); + LOG(LOG_ERROR, "cuModuleUnload failed: %s (%d)", errstr, res); + *result = res; + return 1; + } + + *result = 0; + return 1; +} + +// Does not support checkpoint/restart yet +bool_t rpc_register_function_1_svc(ptr fatCubinHandle, ptr hostFun, char* deviceFun, + char* deviceName, int thread_limit, ptr_result *result, struct svc_req *rqstp) +{ + void *module = NULL; + RECORD_API(rpc_register_function_1_argument); + RECORD_ARG(1, fatCubinHandle); + RECORD_ARG(2, hostFun); + RECORD_ARG(3, deviceFun); + RECORD_ARG(4, deviceName); + RECORD_ARG(5, thread_limit); + LOG(LOG_DEBUG, "rpc_register_function(fatCubinHandle: %p, hostFun: %p, deviceFun: %s, deviceName: %s, thread_limit: %d)", + fatCubinHandle, hostFun, deviceFun, deviceName, thread_limit); + GSCHED_RETAIN; + //resource_mg_print(&rm_modules); + if ((module = resource_mg_get(&rm_modules, (void*)fatCubinHandle)) == (void*)fatCubinHandle) { + LOGE(LOG_ERROR, "%p not found in resource manager - we cannot call a function from an unknown module.", fatCubinHandle); + result->err = -1; + return 1; + } + result->err = cuModuleGetFunction((CUfunction*)&result->ptr_result_u.ptr, + module, + deviceName); + if (resource_mg_add_sorted(&rm_functions, (void*)hostFun, (void*)result->ptr_result_u.ptr) != 0) { + LOGE(LOG_ERROR, "error in resource manager"); + } + GSCHED_RELEASE; + RECORD_RESULT(ptr_result_u, *result); + return 1; +} + +// Does not support checkpoint/restart yet +bool_t rpc_register_var_1_svc(ptr fatCubinHandle, ptr hostVar, ptr deviceAddress, char *deviceName, int ext, size_t size, + int constant, int global, int *result, struct svc_req *rqstp) +{ + RECORD_API(rpc_register_var_1_argument); + RECORD_ARG(1, fatCubinHandle); + RECORD_ARG(2, hostVar); + RECORD_ARG(3, deviceAddress); + RECORD_ARG(4, deviceName); + RECORD_ARG(5, ext); + RECORD_ARG(6, size); + RECORD_ARG(7, constant); + RECORD_ARG(8, global); + + LOG(LOG_DEBUG, "rpc_register_var(fatCubinHandle: %p, hostVar: %p, deviceAddress: %p, deviceName: %s, " + "ext: %d, size: %d, constant: %d, global: %d)", + fatCubinHandle, hostVar, deviceAddress, deviceName, ext, size, constant, global); + + CUdeviceptr dptr = 0; + size_t d_size = 0; + CUresult res; + void *module = NULL; + GSCHED_RETAIN; + if ((module = resource_mg_get(&rm_modules, (void*)fatCubinHandle)) == (void*)fatCubinHandle) { + LOGE(LOG_ERROR, "%p not found in resource manager - we cannot call a function from an unknown module.", fatCubinHandle); + *result = -1; + return 
1; + } + if ((res = cuModuleGetGlobal(&dptr, &d_size, module, deviceName)) != CUDA_SUCCESS) { + LOGE(LOG_ERROR, "cuModuleGetGlobal failed: %d", res); + *result = 1; + return 1; + } + if (resource_mg_add_sorted(&rm_globals, (void*)hostVar, (void*)dptr) != 0) { + LOGE(LOG_ERROR, "error in resource manager"); + *result = 1; + } else { + *result = 0; + } + GSCHED_RELEASE; + RECORD_RESULT(integer, *result); + return 1; +} + int server_driver_deinit(void) { resource_mg_free(&rm_modules); @@ -158,6 +299,26 @@ bool_t rpc_cumodulegetfunction_1_svc(uint64_t module, char *name, ptr_result *re return 1; } +bool_t rpc_cumoduleloaddata_1_svc(mem_data mem, ptr_result *result, + struct svc_req *rqstp) +{ + RECORD_API(mem_data); + RECORD_SINGLE_ARG(mem); + LOG(LOG_DEBUG, "%s(%p, %#0zx)", __FUNCTION__, mem.mem_data_val, mem.mem_data_len); + GSCHED_RETAIN; + result->err = cuModuleLoadData((CUmodule*)&result->ptr_result_u.ptr, mem.mem_data_val); + GSCHED_RELEASE; + if (resource_mg_create(&rm_modules, (void*)result->ptr_result_u.ptr) != 0) { + LOGE(LOG_ERROR, "error in resource manager"); + } + if (result->err != 0) { + char *err_str = NULL; + cuGetErrorName(result->err, &err_str); + LOGE(LOG_DEBUG, "cuModuleLoadData result: %s", err_str); + } + RECORD_RESULT(ptr_result_u, *result); + return 1; +} bool_t rpc_cumoduleload_1_svc(char* path, ptr_result *result, struct svc_req *rqstp) { @@ -170,6 +331,11 @@ bool_t rpc_cumoduleload_1_svc(char* path, ptr_result *result, if (resource_mg_create(&rm_modules, (void*)result->ptr_result_u.ptr) != 0) { LOGE(LOG_ERROR, "error in resource manager"); } + if (result->err != 0) { + char *err_str = NULL; + cuGetErrorName(result->err, &err_str); + LOGE(LOG_DEBUG, "cuModuleLoad result: %s", err_str); + } RECORD_RESULT(ptr_result_u, *result); return 1; } @@ -181,7 +347,7 @@ bool_t rpc_cumoduleunload_1_svc(ptr module, int *result, RECORD_SINGLE_ARG(module); LOG(LOG_DEBUG, "%s(%p)", __FUNCTION__, (void*)module); GSCHED_RETAIN; - *result = cuModuleUnload(resource_mg_get(&rm_streams, (void*)module)); + *result = cuModuleUnload(resource_mg_get(&rm_modules, (void*)module)); GSCHED_RELEASE; RECORD_RESULT(integer, *result); return 1; @@ -202,6 +368,45 @@ bool_t rpc_cugeterrorstring_1_svc(int err, str_result *result, return 1; } +bool_t rpc_cudeviceprimaryctxgetstate_1_svc(int dev, dint_result *result, + struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "%s(%d)", __FUNCTION__, dev); + GSCHED_RETAIN; + result->err = cuDevicePrimaryCtxGetState(dev, &(result->dint_result_u.data.i1), + &(result->dint_result_u.data.i2)); + LOGE(LOG_DEBUG, "state: %d, flags: %d", result->dint_result_u.data.i1, + result->dint_result_u.data.i2); + GSCHED_RELEASE; + return 1; +} + +bool_t rpc_cudevicegetproperties_1_svc(int dev, mem_result *result, + struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "%s(%d)", __FUNCTION__, dev); + GSCHED_RETAIN; + if ((result->mem_result_u.data.mem_data_val = malloc(sizeof(CUdevprop))) == NULL) { + result->err = CUDA_ERROR_OUT_OF_MEMORY; + } + result->mem_result_u.data.mem_data_len = sizeof(CUdevprop); + result->err = cuDeviceGetProperties((CUdevprop*)result->mem_result_u.data.mem_data_val, dev); + GSCHED_RELEASE; + return 1; +} + +bool_t rpc_cudevicecomputecapability_1_svc(int dev, dint_result *result, + struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "%s(%d)", __FUNCTION__, dev); + GSCHED_RETAIN; + result->err = cuDeviceComputeCapability(&(result->dint_result_u.data.i1), + &(result->dint_result_u.data.i2), + dev); + GSCHED_RELEASE; + return 1; +} + /* bool_t rpc_cugetexporttable_1_svc(char 
*rpc_uuid, ptr_result *result, struct svc_req *rqstp) @@ -276,7 +481,6 @@ bool_t rpc_culaunchkernel_1_svc(uint64_t f, unsigned int gridDimX, unsigned int void **cuda_args; uint16_t *arg_offsets; size_t param_num; - LOG(LOG_DEBUG, "%s", __FUNCTION__); if (args.mem_data_val == NULL) { LOGE(LOG_ERROR, "param.mem_data_val is NULL"); *result = CUDA_ERROR_INVALID_VALUE; @@ -303,10 +507,15 @@ bool_t rpc_culaunchkernel_1_svc(uint64_t f, unsigned int gridDimX, unsigned int LOGE(LOG_DEBUG, "arg: %p (%d)", *(void**)cuda_args[i], *(int*)cuda_args[i]); } - LOGE(LOG_DEBUG, "cuLaunchKernel(func=%p, gridDim=[%d,%d,%d], blockDim=[%d,%d,%d], args=%p, sharedMem=%d, stream=%p)", f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, cuda_args, sharedMemBytes, (void*)hStream); + LOGE(LOG_DEBUG, "cuLaunchKernel(func=%p->%p, gridDim=[%d,%d,%d], blockDim=[%d,%d,%d], args=%p, sharedMem=%d, stream=%p)", f, resource_mg_get(&rm_functions, (void*)f), gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, cuda_args, sharedMemBytes, (void*)hStream); GSCHED_RETAIN; - *result = cuLaunchKernel((CUfunction)f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, (CUstream)hStream, cuda_args, NULL); + *result = cuLaunchKernel((CUfunction)resource_mg_get(&rm_functions, (void*)f), + gridDimX, gridDimY, gridDimZ, + blockDimX, blockDimY, blockDimZ, + sharedMemBytes, + (CUstream)hStream, + cuda_args, NULL); GSCHED_RELEASE; free(cuda_args); @@ -314,6 +523,15 @@ bool_t rpc_culaunchkernel_1_svc(uint64_t f, unsigned int gridDimX, unsigned int } +bool_t rpc_cudevicegetp2pattribute_1_svc(int attrib, ptr srcDevice, ptr dstDevice, int_result *result, struct svc_req *rqstp) +{ + LOG(LOG_DEBUG, "%s", __FUNCTION__); + GSCHED_RETAIN; + result->err = cuDeviceGetP2PAttribute(&result->int_result_u.data, (CUdevice_P2PAttribute)attrib, (CUdevice)srcDevice, (CUdevice)dstDevice); + GSCHED_RELEASE; + return 1; +} + /* ################## START OF HIDDEN FUNCTIONS IMPL ######################## */ /* diff --git a/cpu/cpu-server-nvml.c b/cpu/cpu-server-nvml.c new file mode 100644 index 00000000..89467618 --- /dev/null +++ b/cpu/cpu-server-nvml.c @@ -0,0 +1,72 @@ +#define _GNU_SOURCE +#include +#include + +#include +#include + +#include "cpu_rpc_prot.h" +#include "cpu-common.h" +#include "cpu-utils.h" +#include "log.h" +#include "resource-mg.h" +#define WITH_RECORDER +#include "api-recorder.h" +#include "gsched.h" + +int server_nvml_init(int restore) +{ + int ret = 0; + if (!restore) { + //ret &= resource_mg_init(&rm_modules, 1); + } else { + //ret &= resource_mg_init(&rm_modules, 0); + //ret &= server_driver_restore("ckp"); + } + return ret; +} + +int server_nvml_deinit(void) +{ + //resource_mg_free(&rm_modules); + return 0; +} + +bool_t rpc_nvmldevicegetcount_v2_1_svc(int_result *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + GSCHED_RETAIN; + // Workaround for pytorch expecting nvmlDeviceGetCount and cudaGetDeviceCount to be the same + //result->err = nvmlDeviceGetCount_v2(&result->int_result_u.data); + result->err = cudaGetDeviceCount(&result->int_result_u.data); + LOGE(LOG_DEBUG, "%s: %d", __FUNCTION__, result->int_result_u.data); + GSCHED_RELEASE; + return 1; +} + +bool_t rpc_nvmlinitwithflags_1_svc(int flags, int *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + GSCHED_RETAIN; + *result = nvmlInitWithFlags(flags); + GSCHED_RELEASE; + return 1; +} + +bool_t rpc_nvmlinit_v2_1_svc(int *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "%s", 
__FUNCTION__); + GSCHED_RETAIN; + *result = nvmlInit_v2(); + GSCHED_RELEASE; + return 1; +} + +bool_t rpc_nvmlshutdown_1_svc(int *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "%s", __FUNCTION__); + GSCHED_RETAIN; + *result = nvmlShutdown(); + GSCHED_RELEASE; + return 1; +} \ No newline at end of file diff --git a/cpu/cpu-server-nvml.h b/cpu/cpu-server-nvml.h new file mode 100644 index 00000000..84a8270c --- /dev/null +++ b/cpu/cpu-server-nvml.h @@ -0,0 +1,9 @@ +#ifndef _CPU_SERVER_NVML_H_ +#define _CPU_SERVER_NVML_H_ + +int server_nvml_init(int restore); +int server_nvml_deinit(void); +//int server_nvml_checkpoint(const char *path, int dump_memory, unsigned long prog, unsigned long vers); +//int server_nvml_restore(const char *path); + +#endif //_CPU_SERVER_NVML_H_ diff --git a/cpu/cpu-server-runtime.c b/cpu/cpu-server-runtime.c index 87780856..3d70e0a5 100644 --- a/cpu/cpu-server-runtime.c +++ b/cpu/cpu-server-runtime.c @@ -2,6 +2,8 @@ #include #include #include +#include +#include //for strerror #include @@ -34,13 +36,13 @@ #include "mt-memcpy.h" typedef struct host_alloc_info { - int cnt; + size_t idx; size_t size; void *client_ptr; void *server_ptr; } host_alloc_info_t; static host_alloc_info_t hainfo[64]; -static size_t hainfo_cnt = 1; +static size_t hainfo_cnt = 0; list mt_memcpy_list = {0}; static int hainfo_getserverindex(void *server_ptr) @@ -77,10 +79,21 @@ int server_runtime_init(int restore) ret &= resource_mg_init(&rm_events, 0); ret &= resource_mg_init(&rm_arrays, 0); ret &= resource_mg_init(&rm_memory, 0); + ret &= resource_mg_init(&rm_kernels, 0); ret &= cusolver_init(0, &rm_streams, &rm_memory); ret &= cublas_init(0, &rm_memory); ret &= server_runtime_restore("ckp"); } + + // Make sure runtime API is initialized + // If we don't do this and use the driver API, it might be unintialized + cudaError_t cres; + if ((cres = cudaSetDevice(0)) != cudaSuccess) { + LOG(LOG_ERROR, "cudaSetDevice failed: %d", cres); + ret = 1; + } + cudaDeviceSynchronize(); + return ret; } @@ -90,6 +103,7 @@ int server_runtime_deinit(void) resource_mg_free(&rm_events); resource_mg_free(&rm_arrays); resource_mg_free(&rm_memory); + resource_mg_free(&rm_kernels); cusolver_deinit(); cublas_deinit(); list_free(&mt_memcpy_list); @@ -133,6 +147,42 @@ int server_runtime_restore(const char *path) return 0; } + +/** implementation for CUDA_REGISTER_FUNCTION(ptr, str, str, str, int) + * + */ +bool_t cuda_register_function_1_svc(ptr fatCubinHandle, ptr hostFun, char* deviceFun, char* deviceName, int thread_limit, int* result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "cudaRegisterFunction(%p, %p, %s, %s, %d)", fatCubinHandle, hostFun, deviceFun, deviceName, thread_limit); + + void (*serverFun)(void); + + if ( (serverFun = dlsym(RTLD_NEXT, "dlopen")) == NULL) { + LOGE(LOG_ERROR, "failed to get dlopen %s", dlerror()); + *result = 1; + return 1; + } + + if (resource_mg_add_sorted(&rm_kernels, (void*)hostFun, serverFun) != 0) { + LOGE(LOG_ERROR, "failed to add kernel to resource manager"); + *result = 1; + return 1; + } + LOGE(LOG_DEBUG, "added kernel %p->%p to resource manager", hostFun, serverFun); + // __cudaRegisterFunction(&fatCubinHandle, hostFun, deviceFun, + // deviceName, thread_limit, &tid, &bid, &bDim, &gDim, &wSize); + + // LOGE(LOG_DEBUG, "-> %p, {%d, %d, %d}, {%d, %d, %d}, {%d, %d, %d}, {%d, %d, %d}, %d)", + // fatCubinHandle, + // tid.x, tid.y, tid.z, + // bid.x, bid.y, bid.z, + // bDim.x, bDim.y, bDim.z, + // gDim.x, gDim.y, gDim.z, + // wSize); + *result = 0; + return 1; +} + /* 
############### RUNTIME API ############### */ /* ### Device Management ### */ bool_t cuda_choose_device_1_svc(mem_data prop, int_result *result, struct svc_req *rqstp) @@ -310,19 +360,14 @@ bool_t cuda_get_device_flags_1_svc(int_result *result, struct svc_req *rqstp) return 1; } -bool_t cuda_get_device_properties_1_svc(int device, mem_result *result, struct svc_req *rqstp) +bool_t cuda_get_device_properties_1_svc(int device, cuda_device_prop_result *result, struct svc_req *rqstp) { LOGE(LOG_DEBUG, "cudaGetDeviceProperties"); - result->mem_result_u.data.mem_data_val = malloc(sizeof(struct cudaDeviceProp)); - if (result->mem_result_u.data.mem_data_val == NULL) { - LOGE(LOG_ERROR, "malloc failed."); + if (sizeof(result->cuda_device_prop_result_u.data) != sizeof(struct cudaDeviceProp)) { + LOGE(LOG_ERROR, "cuda_device_prop_result size mismatch"); return 0; } - result->mem_result_u.data.mem_data_len = sizeof(struct cudaDeviceProp); - result->err = cudaGetDeviceProperties((void*)result->mem_result_u.data.mem_data_val, device); - if (result->err != 0) { - free(result->mem_result_u.data.mem_data_val); - } + result->err = cudaGetDeviceProperties((void*)result->cuda_device_prop_result_u.data, device); return 1; } @@ -542,8 +587,14 @@ bool_t cuda_stream_get_priority_1_svc(ptr hStream, int_result *result, struct sv return 1; } -/* Capture API does not make sense without graph API */ -// /* ? CUDA_STREAM_IS_CAPTURING(ptr) = 264;*/ +bool_t cuda_stream_is_capturing_1_svc(ptr stream, int_result *result, struct svc_req *rqstp) +{ + LOGE(LOG_DEBUG, "cudaStreamIsCapturing"); + result->err = cudaStreamIsCapturing( + resource_mg_get(&rm_streams, (void*)stream), + (enum cudaStreamCaptureStatus*)&result->int_result_u.data); + return 1; +} bool_t cuda_stream_query_1_svc(ptr hStream, int *result, struct svc_req *rqstp) { @@ -770,7 +821,7 @@ bool_t cuda_launch_cooperative_kernel_1_svc(ptr func, rpc_dim3 gridDim, rpc_dim3 LOGE(LOG_DEBUG, "cudaLaunchCooperativeKernel(func=%p, gridDim=[%d,%d,%d], blockDim=[%d,%d,%d], args=%p, sharedMem=%d, stream=%p)", func, cuda_gridDim.x, cuda_gridDim.y, cuda_gridDim.z, cuda_blockDim.x, cuda_blockDim.y, cuda_blockDim.z, cuda_args, sharedMem, (void*)stream); *result = cudaLaunchCooperativeKernel( - (void*)func, + resource_mg_get(&rm_kernels, (void*)func), cuda_gridDim, cuda_blockDim, cuda_args, @@ -781,44 +832,6 @@ bool_t cuda_launch_cooperative_kernel_1_svc(ptr func, rpc_dim3 gridDim, rpc_dim3 return 1; } -bool_t cuda_launch_cooperative_kernel_multi_device_1_svc(ptr func, rpc_dim3 gridDim, rpc_dim3 blockDim, mem_data args, size_t sharedMem, ptr stream, int numDevices, int flags, int *result, struct svc_req *rqstp) -{ - RECORD_API(cuda_launch_cooperative_kernel_multi_device_1_argument); - RECORD_ARG(1, func); - RECORD_ARG(2, gridDim); - RECORD_ARG(3, blockDim); - //TODO: Store parameters explicitly - //RECORD_ARG(4, args); - RECORD_ARG(5, sharedMem); - RECORD_ARG(6, stream); - RECORD_ARG(7, numDevices); - RECORD_ARG(8, flags); - dim3 cuda_gridDim = {gridDim.x, gridDim.y, gridDim.z}; - dim3 cuda_blockDim = {blockDim.x, blockDim.y, blockDim.z}; - void **cuda_args; - uint16_t *arg_offsets; - size_t param_num = *((size_t*)args.mem_data_val); - struct cudaLaunchParams lp; - arg_offsets = (uint16_t*)(args.mem_data_val+sizeof(size_t)); - cuda_args = malloc(param_num*sizeof(void*)); - for (size_t i = 0; i < param_num; ++i) { - cuda_args[i] = args.mem_data_val+sizeof(size_t)+param_num*sizeof(uint16_t)+arg_offsets[i]; - //LOGE(LOG_DEBUG, "arg: %p (%d)\n", *(void**)cuda_args[i], 
*(int*)cuda_args[i]); - } - - LOGE(LOG_DEBUG, "cudaLaunchCooperativeKernelMultiDevice(func=%p, gridDim=[%d,%d,%d], blockDim=[%d,%d,%d], args=%p, sharedMem=%d, stream=%p)", func, cuda_gridDim.x, cuda_gridDim.y, cuda_gridDim.z, cuda_blockDim.x, cuda_blockDim.y, cuda_blockDim.z, cuda_args, sharedMem, (void*)stream); - lp.args = cuda_args; - lp.blockDim = cuda_blockDim; - lp.func = (void*)func; - lp.gridDim = cuda_gridDim; - lp.sharedMem = sharedMem; - lp.stream = resource_mg_get(&rm_streams, (void*)stream); - *result = cudaLaunchCooperativeKernelMultiDevice(&lp, numDevices, flags); - RECORD_RESULT(integer, *result); - LOGE(LOG_DEBUG, "cudaLaunchCooperativeKernelMultiDevice result: %d", *result); - return 1; -} - /* This would require RPCs in the opposite direction. * __host__ cudaError_t cudaLaunchHostFunc ( cudaStream_t stream, cudaHostFn_t fn, void* userData ) * Enqueues a host function call in a stream. @@ -848,15 +861,28 @@ bool_t cuda_launch_kernel_1_svc(ptr func, rpc_dim3 gridDim, rpc_dim3 blockDim, LOGE(LOG_DEBUG, "arg: %p (%d)", *(void**)cuda_args[i], *(int*)cuda_args[i]); } - LOGE(LOG_DEBUG, "cudaLaunchKernel(func=%p, gridDim=[%d,%d,%d], blockDim=[%d,%d,%d], args=%p, sharedMem=%d, stream=%p)", func, cuda_gridDim.x, cuda_gridDim.y, cuda_gridDim.z, cuda_blockDim.x, cuda_blockDim.y, cuda_blockDim.z, cuda_args, sharedMem, (void*)stream); - - *result = cudaLaunchKernel( - (void*)func, - cuda_gridDim, - cuda_blockDim, - cuda_args, - sharedMem, - resource_mg_get(&rm_streams, (void*)stream)); + LOGE(LOG_DEBUG, "cudaLaunchKernel(func=%p, gridDim=[%d,%d,%d], blockDim=[%d,%d,%d], args=%p, sharedMem=%d, stream=%p)", + resource_mg_get(&rm_functions, (void*)func), + cuda_gridDim.x, cuda_gridDim.y, cuda_gridDim.z, + cuda_blockDim.x, cuda_blockDim.y, cuda_blockDim.z, + cuda_args, + sharedMem, + (void*)stream); + + *result = cuLaunchKernel((CUfunction)resource_mg_get(&rm_functions, (void*)func), + gridDim.x, gridDim.y, gridDim.z, + blockDim.x, blockDim.y, blockDim.z, + sharedMem, + resource_mg_get(&rm_streams, (void*)stream), + cuda_args, NULL); + + // *result = cudaLaunchKernel( + // resource_mg_get(&rm_functions, (void*)func), + // cuda_gridDim, + // cuda_blockDim, + // cuda_args, + // sharedMem, + // resource_mg_get(&rm_streams, (void*)stream)); free(cuda_args); RECORD_RESULT(integer, *result); LOGE(LOG_DEBUG, "cudaLaunchKernel result: %d", *result); @@ -1028,8 +1054,8 @@ bool_t cuda_free_host_1_svc(int index, int *result, struct svc_req *rqstp) *result = cudaSuccess; return 1; } - if (hainfo[index].cnt != 0 && - hainfo[index].cnt == index) { + if (hainfo[index].idx != 0 && + hainfo[index].idx == index) { *result = cudaHostUnregister(hainfo[index].server_ptr); munmap(hainfo[index].server_ptr, hainfo[index].size); @@ -1064,31 +1090,39 @@ bool_t cuda_get_symbol_size_1_svc(ptr symbol, u64_result *result, struct svc_req return 1; } -bool_t cuda_host_alloc_1_svc(int client_cnt, size_t size, ptr client_ptr, unsigned int flags, int *result, struct svc_req *rqstp) +bool_t cuda_host_alloc_1_svc(size_t size, unsigned int flags, sz_result *result, struct svc_req *rqstp) { //TODO: Make checkpointable. Implement reattaching of shm segment. 
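The rewritten cudaHostAlloc path below is one half of a two-RPC protocol; its counterpart, CUDA_HOST_ALLOC_REGSHM, follows a few hunks later, and both are declared in the cpu_rpc_prot.x hunk near the end of this patch. The server creates and registers a named shared-memory segment and returns its index; the client is then expected to map the same segment and report its local pointer back. The client side is not part of this patch, so the sketch below only illustrates that expectation: the helper is hypothetical and the RPC invocations are abbreviated to comments.

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/mman.h>
    #include <sys/stat.h>

    /* hypothetical client-side counterpart of cuda_host_alloc_1_svc (sketch only) */
    static void *client_host_alloc(size_t size, size_t shm_index /* from CUDA_HOST_ALLOC */)
    {
        /* 1. RPC CUDA_HOST_ALLOC(size, flags) already ran: the server shm_open()ed
         *    "/crickethostalloc-<index>", ftruncate()d it to size, mmap()ed it and
         *    cudaHostRegister()ed the mapping, then returned <index>.              */
        char name[64];
        snprintf(name, sizeof(name), "/crickethostalloc-%zu", shm_index);

        /* 2. map the same segment locally */
        int fd = shm_open(name, O_RDWR, S_IRWXU);
        if (fd == -1)
            return NULL;
        void *ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

        /* 3. RPC CUDA_HOST_ALLOC_REGSHM(<index>, ptr) so the server can record the
         *    client pointer in hainfo[<index>].client_ptr.                          */
        return ptr == MAP_FAILED ? NULL : ptr;
    }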
int fd_shm; - char shm_name[128]; + char *shm_name = NULL; void *shm_addr; unsigned int register_flags = 0; - *result = cudaErrorMemoryAllocation; RECORD_API(cuda_host_alloc_1_argument); - RECORD_ARG(1, client_cnt); - RECORD_ARG(2, size); - RECORD_ARG(3, client_ptr); - RECORD_ARG(4, flags); + RECORD_ARG(1, size); + RECORD_ARG(2, flags); LOGE(LOG_DEBUG, "cudaHostAlloc"); + result->err = cudaErrorMemoryAllocation; if (socktype == UNIX || (shm_enabled && cpu_utils_is_local_connection(rqstp))) { //Use local shared memory - snprintf(shm_name, 128, "/crickethostalloc-%d", client_cnt); - if ((fd_shm = shm_open(shm_name, O_RDWR, 600)) == -1) { + if (asprintf(&shm_name, "/crickethostalloc-%d", hainfo_cnt) == -1) { + LOGE(LOG_ERROR, "asprintf failed: %s", strerror(errno)); + goto out; + } + if ((fd_shm = shm_open(shm_name, O_RDWR | O_CREAT | O_TRUNC, S_IRWXU)) == -1) { LOGE(LOG_ERROR, "could not open shared memory \"%s\" with size %d: %s", shm_name, size, strerror(errno)); goto out; } + if (ftruncate(fd_shm, size) == -1) { + LOGE(LOG_ERROR, "cannot resize shared memory"); + shm_unlink(shm_name); + goto out; + } + result->sz_result_u.data = hainfo_cnt; + LOGE(LOG_DEBUG, "shm opened with name \"%s\", size: %d", shm_name, size); if ((shm_addr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd_shm, 0)) == MAP_FAILED) { LOGE(LOG_ERROR, "mmap returned unexpected pointer: %p", shm_addr); - goto cleanup; + goto out; } if (flags & cudaHostAllocPortable) { @@ -1101,23 +1135,23 @@ bool_t cuda_host_alloc_1_svc(int client_cnt, size_t size, ptr client_ptr, unsign register_flags |= cudaHostRegisterMapped; } - if ((*result = cudaHostRegister(shm_addr, size, flags)) != cudaSuccess) { + if ((result->err = cudaHostRegister(shm_addr, size, flags)) != cudaSuccess) { LOGE(LOG_ERROR, "cudaHostRegister failed."); munmap(shm_addr, size); - goto cleanup; + goto out; } - - hainfo[hainfo_cnt].cnt = client_cnt; + hainfo[hainfo_cnt].idx = hainfo_cnt; hainfo[hainfo_cnt].size = size; - hainfo[hainfo_cnt].client_ptr = (void*)client_ptr; + hainfo[hainfo_cnt].client_ptr = NULL; hainfo[hainfo_cnt].server_ptr = shm_addr; hainfo_cnt++; } else if (socktype == TCP) { //Use infiniband #ifdef WITH_IB - + LOGE(LOG_ERROR, "infiniband does not yet support cudaHostAlloc."); + goto out; #else - LOGE(LOG_ERROR, "infiniband is disabled."); - goto cleanup; + LOGE(LOG_ERROR, "infiniband is disabled."); + goto out; #endif //WITH_IB } else { @@ -1125,14 +1159,40 @@ bool_t cuda_host_alloc_1_svc(int client_cnt, size_t size, ptr client_ptr, unsign goto out; } + result->err = cudaSuccess; +out: + RECORD_RESULT(sz_result_u, *result); + return 1; +} + +bool_t cuda_host_alloc_regshm_1_svc(size_t hainfo_idx, ptr client_ptr, int *result, struct svc_req *rqstp) +{ + char *shm_name = NULL; + RECORD_API(cuda_host_alloc_regshm_1_argument); + RECORD_ARG(1, hainfo_idx); + RECORD_ARG(2, client_ptr); + + LOGE(LOG_DEBUG, "cudaHostAllocRegShm"); + *result = cudaErrorMemoryAllocation; + + if (socktype != UNIX && !(shm_enabled && cpu_utils_is_local_connection(rqstp))) { + LOGE(LOG_ERROR, "cudaHostAllocRegShm is only supported for local connections."); + goto out; + } + if (asprintf(&shm_name, "/crickethostalloc-%d", hainfo_idx) == -1) { + LOGE(LOG_ERROR, "asprintf failed: %s", strerror(errno)); + goto out; + } + hainfo[hainfo_idx].client_ptr = (void*)client_ptr; *result = cudaSuccess; -cleanup: - shm_unlink(shm_name); out: + shm_unlink(shm_name); + free(shm_name); RECORD_RESULT(integer, *result); return 1; } + bool_t cuda_host_get_device_pointer_1_svc(ptr pHost, 
int flags, ptr_result *result, struct svc_req *rqstp) { LOGE(LOG_DEBUG, "cudaHostGetDevicePointer"); @@ -1165,7 +1225,7 @@ bool_t cuda_malloc_1_svc(size_t argp, ptr_result *result, struct svc_req *rqstp) #ifdef WITH_IB result->err = ib_allocate_memreg((void**)&result->ptr_result_u.ptr, argp, hainfo_cnt, true); if (result->err == 0) { - hainfo[hainfo_cnt].cnt = hainfo_cnt; + hainfo[hainfo_cnt].idx = hainfo_cnt; hainfo[hainfo_cnt].size = argp; hainfo[hainfo_cnt].server_ptr = (void*)result->ptr_result_u.ptr; @@ -1321,7 +1381,7 @@ bool_t cuda_memcpy_htod_1_svc(uint64_t ptr, mem_data mem, size_t size, int *resu RECORD_ARG(2, mem); RECORD_ARG(3, size); - LOGE(LOG_DEBUG, "cudaMemcpyHtoD"); + LOGE(LOG_DEBUG, "cudaMemcpyHtoD(%p, %p, %zu)", (void*)ptr, mem.mem_data_val, size); if (size != mem.mem_data_len) { LOGE(LOG_ERROR, "data size mismatch"); *result = cudaErrorUnknown; @@ -1476,8 +1536,8 @@ bool_t cuda_memcpy_ib_1_svc(int index, ptr device_ptr, size_t size, int kind, in LOGE(LOG_DEBUG, "cudaMemcpyIB"); *result = cudaErrorInitializationError; //anstatt array list (list.c) - if (hainfo[index].cnt == 0 || - hainfo[index].cnt != index) { + if (hainfo[index].idx == 0 || + hainfo[index].idx != index) { LOGE(LOG_ERROR, "inconsistent state"); goto out; @@ -1529,12 +1589,12 @@ bool_t cuda_memcpy_shm_1_svc(int index, ptr device_ptr, size_t size, int kind, i RECORD_ARG(2, device_ptr); RECORD_ARG(3, size); RECORD_ARG(4, kind); - LOGE(LOG_DEBUG, "cudaMemcpyShm"); + LOGE(LOG_DEBUG, "cudaMemcpyShm(index: %d, device_ptr: %p, size: %d, kind: %d)", index, device_ptr, size, kind); *result = cudaErrorInitializationError; - if (hainfo[index].cnt == 0 || - hainfo[index].cnt != index) { + if (index >= hainfo_cnt || + hainfo[index].idx != index) { - LOGE(LOG_ERROR, "inconsistent state"); + LOGE(LOG_ERROR, "inconsistent state: index: %d, hainfo[index].idx: %d", index, hainfo[index].idx); goto out; } if (hainfo[index].size < size) { @@ -1610,63 +1670,27 @@ bool_t cuda_memcpy_dtoh_1_svc(uint64_t ptr, size_t size, mem_result *result, str /* cudaMemcpyPeer ( void* dst, int dstDevice, const void* src, int srcDevice, size_t count ) not implemented yet. 
see cudaMemcpyDtoD */ /* cudaMemcpyPeerAsync ( void* dst, int dstDevice, const void* src, int srcDevice, size_t count, cudaStream_t stream = 0 ) */ -bool_t cuda_memcpy_to_symbol_1_svc(uint64_t ptr, mem_data mem, size_t size, size_t offset, int *result, struct svc_req *rqstp) +bool_t cuda_memcpy_to_symbol_1_svc(uint64_t symbolptr, mem_data mem, size_t size, size_t offset, int *result, struct svc_req *rqstp) { - RECORD_API(cuda_memcpy_to_symbol_1_argument); - RECORD_ARG(1, ptr); - RECORD_ARG(2, mem); - RECORD_ARG(3, size); - RECORD_ARG(4, offset); - - LOGE(LOG_DEBUG, "cudaMemcpyToSymbol"); - if (size != mem.mem_data_len) { - LOGE(LOG_ERROR, "data size mismatch"); - *result = cudaErrorUnknown; + LOGE(LOG_DEBUG, "cudaMemcpyToSymbol(%p, %p, %zu, %zu)", symbolptr, mem.mem_data_val, size, offset); + void *symbol_addr = resource_mg_get(&rm_globals, (void*)symbolptr); + if (symbol_addr == NULL) { + LOGE(LOG_ERROR, "cudaMemcpyToSymbol: symbol not found"); + *result = cudaErrorInvalidSymbol; return 1; } -#ifdef WITH_MEMCPY_REGISTER - if ((*result = cudaHostRegister(mem.mem_data_val, size, cudaHostRegisterMapped)) != cudaSuccess) { - LOGE(LOG_ERROR, "cudaHostRegister failed: %d.", *result); - return 1; - } -#endif - *result = cudaMemcpyToSymbol((void*)ptr, mem.mem_data_val, size, offset, cudaMemcpyHostToDevice); -#ifdef WITH_MEMCPY_REGISTER - cudaHostUnregister(mem.mem_data_val); -#endif - RECORD_RESULT(integer, *result); - return 1; + return cuda_memcpy_htod_1_svc((ptr)(symbol_addr+offset), mem, size, result, rqstp); } bool_t cuda_memcpy_to_symbol_shm_1_svc(int index, ptr device_ptr, size_t size, size_t offset, int kind, int *result, struct svc_req *rqstp) { - RECORD_API(cuda_memcpy_to_symbol_shm_1_argument); - RECORD_ARG(1, index); - RECORD_ARG(2, device_ptr); - RECORD_ARG(3, size); - RECORD_ARG(4, offset); - RECORD_ARG(5, kind); - LOGE(LOG_DEBUG, "cudaMemcpyToSymbolShm"); - *result = cudaErrorInitializationError; - if (hainfo[index].cnt == 0 || - hainfo[index].cnt != index) { - - LOGE(LOG_ERROR, "inconsistent state"); - goto out; - } - if (hainfo[index].size < size) { - LOGE(LOG_ERROR, "requested size is smaller than shared memory segment"); - goto out; - } - - if (kind == cudaMemcpyHostToDevice) { - *result = cudaMemcpyToSymbol((void*)device_ptr, hainfo[index].server_ptr, size, offset, kind); - } else { - LOGE(LOG_ERROR, "a kind different from HostToDevice is unsupported for cudaMemcpyToSymbol"); + void *symbol_addr = resource_mg_get(&rm_globals, (void*)device_ptr); + if (symbol_addr == NULL) { + LOGE(LOG_ERROR, "cudaMemcpyToSymbol: symbol not found"); + *result = cudaErrorInvalidSymbol; + return 1; } -out: - RECORD_RESULT(integer, *result); - return 1; + return cuda_memcpy_shm_1_svc(index, (ptr)(symbol_addr+offset), size, kind, result, rqstp); } /* cudaMemcpyToSymbolAsync ( const void* symbol, const void* src, size_t count, size_t offset, cudaMemcpyKind kind, cudaStream_t stream = 0 ) not implemented yet */ @@ -1706,7 +1730,26 @@ bool_t cuda_memset_2d_1_svc(ptr devPtr, size_t pitch, int value, size_t width, s return 1; } -/* cudaMemset2DAsync ( void* devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream = 0 ) is not implemented */ +bool_t cuda_memset_2d_async_1_svc(ptr devPtr, size_t pitch, int value, size_t width, size_t height, ptr stream, int *result, struct svc_req *rqstp) +{ + RECORD_API(cuda_memset_2d_async_1_argument); + RECORD_ARG(1, devPtr); + RECORD_ARG(2, pitch); + RECORD_ARG(3, value); + RECORD_ARG(4, height); + RECORD_ARG(5, width); + RECORD_ARG(6, 
stream); + LOGE(LOG_DEBUG, "cudaMemset2DAsync"); + *result = cudaMemset2DAsync( + resource_mg_get(&rm_memory, (void*)devPtr), + pitch, + value, + width, + height, + resource_mg_get(&rm_streams, (void*)stream)); + RECORD_RESULT(integer, *result); + return 1; +} bool_t cuda_memset_3d_1_svc(size_t pitch, ptr devPtr, size_t xsize, size_t ysize, int value, size_t depth, size_t height, size_t width, int *result, struct svc_req *rqstp) { @@ -1731,8 +1774,49 @@ bool_t cuda_memset_3d_1_svc(size_t pitch, ptr devPtr, size_t xsize, size_t ysize RECORD_RESULT(integer, *result); return 1; } -/* cudaMemset3DAsync ( cudaPitchedPtr pitchedDevPtr, int value, cudaExtent extent, cudaStream_t stream = 0 ) is not implemented */ -/* cudaMemsetAsync ( void* devPtr, int value, size_t count, cudaStream_t stream = 0 ) is not implemented */ + +bool_t cuda_memset_3d_async_1_svc(size_t pitch, ptr devPtr, size_t xsize, size_t ysize, int value, size_t depth, size_t height, size_t width, ptr stream, int *result, struct svc_req *rqstp) +{ + RECORD_API(cuda_memset_3d_async_1_argument); + RECORD_ARG(1, pitch); + RECORD_ARG(2, devPtr); + RECORD_ARG(3, xsize); + RECORD_ARG(4, ysize); + RECORD_ARG(5, value); + RECORD_ARG(6, depth); + RECORD_ARG(7, height); + RECORD_ARG(8, width); + RECORD_ARG(9, stream); + LOGE(LOG_DEBUG, "cudaMemset3DAsync"); + struct cudaPitchedPtr pptr = {.pitch = pitch, + .ptr = resource_mg_get(&rm_memory, (void*)devPtr), + .xsize = xsize, + .ysize = ysize}; + struct cudaExtent extent = {.depth = depth, + .height = height, + .width = width}; + *result = cudaMemset3DAsync(pptr, value, extent, + resource_mg_get(&rm_streams, (void*)stream)); + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t cuda_memset_async_1_svc(ptr devPtr, int value, size_t count, ptr stream, int *result, struct svc_req *rqstp) +{ + RECORD_API(cuda_memset_async_1_argument); + RECORD_ARG(1, devPtr); + RECORD_ARG(2, value); + RECORD_ARG(3, count); + RECORD_ARG(3, stream); + LOGE(LOG_DEBUG, "cudaMemsetAsync"); + *result = cudaMemsetAsync( + resource_mg_get(&rm_memory, (void*)devPtr), + value, + count, + resource_mg_get(&rm_streams, (void*)stream)); + RECORD_RESULT(integer, *result); + return 1; +} /* cudaMipmappedArrayGetSparseProperties ( cudaArraySparseProperties* sparseProperties, cudaMipmappedArray_t mipmap ) is not implemented */ /* make_cudaExtent ( size_t w, size_t h, size_t d ) should be implemented on the client side */ /* make_cudaPitchedPtr ( void* d, size_t p, size_t xsz, size_t ysz ) should be implemented on the client side */ @@ -1826,3 +1910,21 @@ bool_t cuda_register_fat_binary_end_1_svc(ptr cubinHandle, int *result, struct s *result = 0; return 1; }*/ + +bool_t cuda_profiler_start_1_svc(int *result, struct svc_req *rqstp) +{ + RECORD_VOID_API; + LOGE(LOG_DEBUG, "cudaProfilerStart"); + *result = cudaProfilerStart(); + RECORD_RESULT(integer, *result); + return 1; +} + +bool_t cuda_profiler_stop_1_svc(int *result, struct svc_req *rqstp) +{ + RECORD_VOID_API; + LOGE(LOG_DEBUG, "cudaProfilerStop"); + *result = cudaProfilerStop(); + RECORD_RESULT(integer, *result); + return 1; +} diff --git a/cpu/cpu-server.c b/cpu/cpu-server.c index 460edc45..a8bbbe65 100644 --- a/cpu/cpu-server.c +++ b/cpu/cpu-server.c @@ -1,3 +1,4 @@ +#define _GNU_SOURCE #include #include #include @@ -5,6 +6,8 @@ #include //sigaction #include #include +#include +#include #include "cpu-server.h" #include "cpu_rpc_prot.h" @@ -15,12 +18,15 @@ #include "cpu-server-driver.h" #include "rpc/xdr.h" #include "cr.h" +#include "cpu-elf2.h" #ifdef WITH_IB #include 
"cpu-ib.h" #endif //WITH_IB #define WITH_RECORDER #include "api-recorder.h" #include "gsched.h" +#include "cpu-server-nvml.h" +#include "cpu-server-cudnn.h" INIT_SOCKTYPE @@ -109,27 +115,54 @@ bool_t rpc_checkpoint_1_svc(int *result, struct svc_req *rqstp) return ret == 0; } -/** implementation for CUDA_REGISTER_FUNCTION(ptr, str, str, str, int) - * - */ -bool_t cuda_register_function_1_svc(ptr fatCubinHandle, ptr hostFun, char* deviceFun, char* deviceName, int thread_limit, int* result, struct svc_req *rqstp) +/* Call CUDA initialization function (usually called by __libc_init_main()) +* Address of "_ZL24__sti____cudaRegisterAllv" in static symbol table is e.g. 0x4016c8 +*/ +void cricket_so_register(void* dlhandle, char *path) { - LOGE(LOG_DEBUG, "cudaRegisterFunction(%p, %p, %s, %s, %d)", fatCubinHandle, hostFun, deviceFun, deviceName, thread_limit); - *result = 0; - return 1; -} + // struct link_map *map; + // dlinfo(dlhandle, RTLD_DI_LINKMAP, &map); -void cricket_main_hash(char* app_command) -{ - cricket_main(app_command, 0, 0); + // // add load location of library to offset in symbol table + // void (*cudaRegisterAllv)(void) = + // (void(*)(void)) elf_symbol_address(path, "_ZL24__sti____cudaRegisterAllv"); + + // LOG(LOG_INFO, "found CUDA initialization function at %p + %p = %p", + // map->l_addr, cudaRegisterAllv, map->l_addr + cudaRegisterAllv); + + // cudaRegisterAllv += map->l_addr; + + // if (cudaRegisterAllv == NULL) { + // LOGE(LOG_WARNING, "could not find cudaRegisterAllv initialization function in cubin. Kernels cannot be launched without it!"); + // } else { + // cudaRegisterAllv(); + // } } -void cricket_main_static(size_t prog_num, size_t vers_num) +bool_t rpc_dlopen_1_svc(char *path, int *result, struct svc_req *rqstp) { - cricket_main("", prog_num, vers_num); + void *dlhandle; + + if (path == NULL) { + LOGE(LOG_ERROR, "path is NULL"); + *result = 1; + return 1; + } + if ((dlhandle = dlopen(path, RTLD_LAZY)) == NULL) { + LOGE(LOG_ERROR, "error opening \"%s\": %s. Make sure libraries are present.", path, dlerror()); + *result = 1; + return 1; + } else { + LOG(LOG_INFO, "dlopened \"%s\"", path); + + //cricket_so_register(dlhandle, path); + + } + *result = 0; + return 1; } -void cricket_main(char* app_command, size_t prog_num, size_t vers_num) +void cricket_main(size_t prog_num, size_t vers_num) { int ret = 1; register SVCXPRT *transp; @@ -139,9 +172,10 @@ void cricket_main(char* app_command, size_t prog_num, size_t vers_num) struct sigaction act; char *command = NULL; act.sa_handler = int_handler; - sigaction(SIGINT, &act, NULL); - + printf("welcome to cricket!\n"); init_log(LOG_LEVEL, __FILE__); + LOG(LOG_DBG(1), "log level is %d", LOG_LEVEL); + sigaction(SIGINT, &act, NULL); #ifdef WITH_IB char client[256]; @@ -174,36 +208,16 @@ void cricket_main(char* app_command, size_t prog_num, size_t vers_num) restore = 1; } - if (cpu_utils_command(&command) != 0) { - LOG(LOG_WARNING, "could not retrieve command name. 
This might prevent starting CUDA applications"); - } else { - LOG(LOG_DEBUG, "the command is '%s'", command); - //This is a workaround to make LD_PRELOAD work under GDB supervision - const char *cmp = "cudbgprocess"; - if (strncmp(command, cmp, strlen(cmp)) == 0) { - LOG(LOG_DEBUG, "skipping RPC server"); - return; - } - } - if (restore == 1) { if (cr_restore_rpc_id("ckp", &prog, &vers) != 0) { LOGE(LOG_ERROR, "error while restoring rpc id"); } } else { - if (prog_num == 0) { - if (cpu_utils_md5hash(app_command, &prog, &vers) != 0) { - LOGE(LOG_ERROR, "error while creating binary checksum"); - exit(0); - } - } - else { - prog = prog_num; - vers = vers_num; - } + prog = prog_num; + vers = vers_num; } - LOGE(LOG_DEBUG, "using prog=%d, vers=%d, derived from \"%s\"", prog, vers, app_command); + LOGE(LOG_DEBUG, "using prog=%d, vers=%d", prog, vers); switch (socktype) { @@ -247,16 +261,16 @@ void cricket_main(char* app_command, size_t prog_num, size_t vers_num) /* Call CUDA initialization function (usually called by __libc_init_main()) * Address of "_ZL24__sti____cudaRegisterAllv" in static symbol table is e.g. 0x4016c8 */ - void (*cudaRegisterAllv)(void) = - (void(*)(void)) cricketd_utils_symbol_address("_ZL24__sti____cudaRegisterAllv"); - LOG(LOG_INFO, "found CUDA initialization function at %p", cudaRegisterAllv); - if (cudaRegisterAllv == NULL) { - LOGE(LOG_WARNING, "could not find cudaRegisterAllv initialization function in cubin. Kernels cannot be launched without it!"); - } else { - cudaRegisterAllv(); - } - - sched = &sched_none; + // void (*cudaRegisterAllv)(void) = + // (void(*)(void)) elf_symbol_address(NULL, "_ZL24__sti____cudaRegisterAllv"); + // LOG(LOG_INFO, "found CUDA initialization function at %p", cudaRegisterAllv); + // if (cudaRegisterAllv == NULL) { + // LOGE(LOG_WARNING, "could not find cudaRegisterAllv initialization function in cubin. Kernels cannot be launched without it!"); + // } else { + // cudaRegisterAllv(); + // } + + sched = &sched_none; if (sched->init() != 0) { LOGE(LOG_ERROR, "initializing scheduler failed."); goto cleanup4; @@ -276,6 +290,16 @@ void cricket_main(char* app_command, size_t prog_num, size_t vers_num) LOGE(LOG_ERROR, "initializing server_runtime failed."); goto cleanup2; } + + if (server_nvml_init(restore) != 0) { + LOGE(LOG_ERROR, "initializing server_nvml failed."); + goto cleanup1; + } + + if (server_cudnn_init(restore) != 0) { + LOGE(LOG_ERROR, "initializing server_nvml failed."); + goto cleanup0; + } #ifdef WITH_IB @@ -289,23 +313,29 @@ void cricket_main(char* app_command, size_t prog_num, size_t vers_num) if (signal(SIGUSR1, signal_checkpoint) == SIG_ERR) { LOGE(LOG_ERROR, "An error occurred while setting a signal handler."); - goto cleanup1; + goto cleanup00; } LOG(LOG_INFO, "waiting for RPC requests..."); + // make sure that our output is flushed even for non line-buffered shells + fflush(stdout); + svc_run(); LOG(LOG_DEBUG, "svc_run returned. 
Cleaning up."); ret = 0; //api_records_print(); - cleanup1: + cleanup00: + server_cudnn_deinit(); + cleanup0: server_driver_deinit(); + cleanup1: + server_nvml_deinit(); cleanup2: server_runtime_deinit(); cleanup3: - api_records_free_args(); - list_free(&api_records); + api_records_free(); cleanup4: pmap_unset(prog, vers); svc_destroy(transp); diff --git a/cpu/cpu-server.h b/cpu/cpu-server.h index 9c3fcbb0..3eea0f63 100644 --- a/cpu/cpu-server.h +++ b/cpu/cpu-server.h @@ -3,8 +3,6 @@ #include -void cricket_main(char* app_command, size_t prog_version, size_t vers_num); -void cricket_main_hash(char* app_command); -void cricket_main_static(size_t prog_num, size_t vers_num); +void cricket_main(size_t prog_version, size_t vers_num); #endif //_CPU_SERVER_H_ diff --git a/cpu/cpu-utils.c b/cpu/cpu-utils.c index 15db60b5..9a4371c8 100644 --- a/cpu/cpu-utils.c +++ b/cpu/cpu-utils.c @@ -1,27 +1,19 @@ #define _GNU_SOURCE #include #include +#include #include #include #include #include -#include #include #include "rpc/types.h" - -#include +#include #include "cpu-utils.h" #include "cpu-common.h" #include "log.h" -#define CRICKET_ELF_NV_INFO_PREFIX ".nv.info" -#define CRICKET_ELF_NV_SHARED_PREFIX ".nv.shared." -#define CRICKET_ELF_NV_TEXT_PREFIX ".nv.text." -#define CRICKET_ELF_TEXT_PREFIX ".text." - -#define CRICKET_ELF_FATBIN ".nv_fatbin" -#define CRICKET_ELF_REGFUN "_ZL24__sti____cudaRegisterAllv" int cpu_utils_command(char **command) { @@ -53,109 +45,6 @@ int cpu_utils_command(char **command) } -int cpu_utils_md5hash(char *filename, unsigned long *high, unsigned long *low) -{ - unsigned char c[MD5_DIGEST_LENGTH]; - FILE *fd; - MD5_CTX mdContext; - int bytes; - unsigned char data[1024]; - - if (filename == NULL || high == NULL || low == NULL) { - return -1; - } - - if ((fd = fopen(filename, "rb")) == NULL) { - LOGE(LOG_ERROR, "%s can't be opened.", filename); - return -1; - } - - MD5_Init (&mdContext); - while ((bytes = fread(data, 1, 1024, fd)) != 0) - MD5_Update(&mdContext, data, bytes); - MD5_Final(c, &mdContext); - fclose (fd); - *high = *((unsigned long*)c); - *low = *((unsigned long*)(c+8)); - return 0; -} - -void* cricketd_utils_symbol_address(char *symbol) -{ - bfd *hostbfd = NULL; - asection *section; - FILE *hostbfd_fd = NULL; - void *ret = NULL; - size_t symtab_size, symtab_length; - asymbol **symtab = NULL; - char path[256]; - size_t length; - - - bfd_init(); - - length = readlink("/proc/self/exe", path, sizeof(path)); - - /* Catch some errors: */ - if (length < 0) { - LOGE(LOG_WARNING, "error resolving symlink /proc/self/exe."); - } else if (length >= 256) { - LOGE(LOG_WARNING, "path was too long and was truncated."); - } else { - path[length] = '\0'; - LOG(LOG_DEBUG, "opening '%s'", path); - } - - if ((hostbfd_fd = fopen("/proc/self/exe", "rb")) == NULL) { - LOGE(LOG_ERROR, "fopen failed"); - return NULL; - } - - if ((hostbfd = bfd_openstreamr("/proc/self/exe", NULL, hostbfd_fd)) == NULL) { - LOGE(LOG_ERROR, "bfd_openr failed on %s", - "/proc/self/exe"); - fclose(hostbfd_fd); - goto cleanup; - } - - if (!bfd_check_format(hostbfd, bfd_object)) { - LOGE(LOG_ERROR, "%s has wrong bfd format", - "/proc/self/exe"); - goto cleanup; - } - - if ((symtab_size = bfd_get_symtab_upper_bound(hostbfd)) == -1) { - LOGE(LOG_ERROR, "bfd_get_symtab_upper_bound failed"); - return NULL; - } - - if ((symtab = (asymbol **)malloc(symtab_size)) == NULL) { - LOGE(LOG_ERROR, "malloc symtab failed"); - return NULL; - } - - if ((symtab_length = bfd_canonicalize_symtab(hostbfd, symtab)) == 0) { - LOG(LOG_WARNING, 
"symtab is empty..."); - } else { - //printf("%lu symtab entries\n", symtab_length); - } - - for (int i = 0; i < symtab_length; ++i) { - if (strcmp(bfd_asymbol_name(symtab[i]), CRICKET_ELF_REGFUN) == 0) { - ret = (void*)bfd_asymbol_value(symtab[i]); - break; - } - //printf("%d: %s: %lx\n", i, bfd_asymbol_name(symtab[i]), - // bfd_asymbol_value(symtab[i])); - } - - - cleanup: - free(symtab); - if (hostbfd != NULL) - bfd_close(hostbfd); - return ret; -} int cpu_utils_launch_child(const char *file, char **args) { @@ -173,6 +62,7 @@ int cpu_utils_launch_child(const char *file, char **args) return -1; } else if (pid == 0) { while ((dup2(filedes[1], STDOUT_FILENO) == -1) && (errno == EINTR)) {} + while ((dup2(filedes[1], STDERR_FILENO) == -1) && (errno == EINTR)) {} close(filedes[1]); close(filedes[0]); char *env[] = {NULL}; @@ -183,14 +73,14 @@ int cpu_utils_launch_child(const char *file, char **args) return filedes[0]; } -kernel_info_t* cricketd_utils_search_info(list *kernel_infos, char *kernelname) +kernel_info_t* utils_search_info(list *kernel_infos, const char *kernelname) { kernel_info_t *info = NULL; if (kernel_infos == NULL) { LOGE(LOG_ERROR, "list is NULL."); return NULL; } - LOGE(LOG_DEBUG, "searching for %s in %d entries", kernelname, kernel_infos->length); + LOGE(LOG_DBG(1), "searching for %s in %d entries", kernelname, kernel_infos->length); for (int i=0; i < kernel_infos->length; ++i) { if (list_at(kernel_infos, i, (void**)&info) != 0) { LOGE(LOG_ERROR, "no element at index %d", i); @@ -204,10 +94,6 @@ kernel_info_t* cricketd_utils_search_info(list *kernel_infos, char *kernelname) int cpu_utils_is_local_connection(struct svc_req *rqstp) { - LOGE(LOG_DEBUG, "%p", rqstp); - LOGE(LOG_DEBUG, "%p", rqstp->rq_xprt); - LOGE(LOG_DEBUG, "%p", rqstp->rq_xprt->xp_fd); - struct sockaddr_in remote_addr = {0}; struct sockaddr_in local_addr = {0}; struct hostent *hp; @@ -364,6 +250,7 @@ int cpu_utils_contains_kernel(const char *path) // Line does not start with .nv.info. so continue searching. continue; }*/ + line[strlen(line)-1] = '\0'; LOGE(LOG_DEBUG, "output: \"%s\"", line); } ret = 0; @@ -371,10 +258,10 @@ int cpu_utils_contains_kernel(const char *path) cleanup: close(output); wait(&child_exit); - LOG(LOG_DEBUG, "child exit code: %d", child_exit); + LOG(LOG_DBG(1), "child exit code: %d", child_exit); out: free(line); - return (ret != 0 ? 
ret : child_exit); + return ret == 0 && child_exit == 0; } int cpu_utils_parameter_info(list *kernel_infos, char *path) @@ -392,6 +279,11 @@ int cpu_utils_parameter_info(list *kernel_infos, char *path) char *kernelname; struct stat filestat = {0}; + if (path == NULL) { + LOGE(LOG_ERROR, "path is NULL."); + goto out; + } + if (kernel_infos == NULL) { LOGE(LOG_ERROR, "list is NULL."); goto out; @@ -441,13 +333,14 @@ int cpu_utils_parameter_info(list *kernel_infos, char *path) goto cleanup2; } - if ((buf->name = malloc(strlen(kernelname))) == NULL) { + size_t buflen = strlen(kernelname); + if ((buf->name = malloc(buflen)) == NULL) { LOGE(LOG_ERROR, "malloc failed"); goto cleanup2; } //copy string and remove trailing \n - strncpy(buf->name, kernelname, strlen(kernelname)-1); - buf->name[strlen(kernelname)-1] = '\0'; + strncpy(buf->name, kernelname, buflen-1); + buf->name[buflen-1] = '\0'; if (cpu_utils_read_pars(buf, fdesc) != 0) { LOGE(LOG_ERROR, "reading paramter infos failed.\n"); @@ -470,10 +363,10 @@ int cpu_utils_parameter_info(list *kernel_infos, char *path) cleanup1: close(output); wait(&child_exit); - LOG(LOG_DEBUG, "child exit code: %d", child_exit); + LOG(LOG_DBG(1), "child exit code: %d", child_exit); out: free(line); - return (ret != 0 ? ret : child_exit); + return ret == 0 && child_exit == 0; } void kernel_infos_free(kernel_info_t *infos, size_t kernelnum) @@ -484,3 +377,35 @@ void kernel_infos_free(kernel_info_t *infos, size_t kernelnum) free(infos[i].param_sizes); } } + +void hexdump(const uint8_t* data, size_t size) +{ + size_t pos = 0; + while (pos < size) { + printf("%#05zx: ", pos); + for (int i = 0; i < 16; i++) { + if (pos + i < size) { + printf("%02x", data[pos + i]); + } else { + printf(" "); + } + if (i % 4 == 3) { + printf(" "); + } + } + printf(" | "); + for (int i = 0; i < 16; i++) { + if (pos + i < size) { + if (data[pos + i] >= 0x20 && data[pos + i] <= 0x7e) { + printf("%c", data[pos + i]); + } else { + printf("."); + } + } else { + printf(" "); + } + } + printf("\n"); + pos += 16; + } +} \ No newline at end of file diff --git a/cpu/cpu-utils.h b/cpu/cpu-utils.h index c0ed97b8..6b1261ef 100644 --- a/cpu/cpu-utils.h +++ b/cpu/cpu-utils.h @@ -5,16 +5,18 @@ #include "cpu-common.h" #include "list.h" + + void kernel_infos_free(kernel_info_t *infos, size_t kernelnum); int cpu_utils_is_local_connection(struct svc_req *rqstp); int cpu_utils_command(char **command); int cpu_utils_md5hash(char *filename, unsigned long *high, unsigned long *low); -void* cricketd_utils_symbol_address(char *symbol); int cricketd_utils_launch_child(const char *file, char **args); int cpu_utils_parameter_info(list *kernel_infos, char *path); -int cpu_utils_contains_kernel(const char *path); -kernel_info_t* cricketd_utils_search_info(list *kernel_infos, char *kernelname); +kernel_info_t* utils_search_info(list *kernel_infos, const char *kernelname); +void hexdump(const uint8_t* data, size_t size); + #endif //_CPU_UTILS_H_ diff --git a/cpu/cpu_rpc_prot.x b/cpu/cpu_rpc_prot.x index f5c405e4..6d505842 100644 --- a/cpu/cpu_rpc_prot.x +++ b/cpu/cpu_rpc_prot.x @@ -1,6 +1,8 @@ typedef opaque mem_data<>; + typedef unsigned hyper size_t; typedef unsigned hyper ptr; +typedef opaque rpc_cuda_device_prop[1032]; struct dint { int i1; @@ -38,6 +40,24 @@ struct rpc_dim3 { unsigned int z; }; +struct int2d1 { + int i[2]; + double d; +}; + +struct int1d3 { + int i; + double d[3]; +}; + +union cudnn_scaling_t switch (int dataType) { +case 2: +case 0: + float f; +case 1: + double d; +}; + union int_result switch 
(int err) { case 0: int data; @@ -80,6 +100,13 @@ default: void; }; +union sz_result switch (int err) { +case 0: + size_t data; +default: + void; +}; + union ptr_result switch (int err) { case 0: ptr ptr; @@ -108,6 +135,8 @@ default: void; }; +/* memory allocated for RPC. */ +/* Freed rpc_cd_prog_1_freeresult by after RPC. */ union mem_result switch (int err) { case 0: mem_data data; @@ -115,12 +144,79 @@ default: void; }; +union cuda_device_prop_result switch (int err) { +case 0: + rpc_cuda_device_prop data; +default: + void; +}; + +union int3_result switch (int err) { +case 0: + int data[3]; +default: + void; +}; + +union int4_result switch (int err) { +case 0: + int data[4]; +default: + void; +}; + +union int5_result switch (int err) { +case 0: + int data[5]; +default: + void; +}; + +union int6_result switch (int err) { +case 0: + int data[6]; +default: + void; +}; + +union int8_result switch (int err) { +case 0: + int data[8]; +default: + void; +}; + +union int9_result switch (int err) { +case 0: + int data[9]; +default: + void; +}; + +union int2d1_result switch (int err) { +case 0: + int2d1 data; +default: + void; +}; + +union int1d3_result switch (int err) { +case 0: + int1d3 data; +default: + void; +}; + program RPC_CD_PROG { version RPC_CD_VERS { int rpc_checkpoint(void) = 0; int rpc_deinit(void) = 1; int rpc_printmessage(string) = 2; - int CUDA_REGISTER_FUNCTION(ptr, ptr, string, string, int) = 50; + int rpc_dlopen(string) = 3; + ptr_result rpc_register_function(ptr, ptr, string, string, int) = 50; + int rpc_elf_load(mem_data, ptr) = 51; + int rpc_elf_unload(ptr) = 52; + int rpc_register_var(ptr, ptr, ptr, string, int, size_t, int, int) = 53; /* RUNTIME API */ /* ### Device Management ### */ @@ -143,7 +239,7 @@ program RPC_CD_PROG { int_result CUDA_GET_DEVICE(void) = 117; int_result CUDA_GET_DEVICE_COUNT(void) = 118; int_result CUDA_GET_DEVICE_FLAGS(void) = 119; - mem_result CUDA_GET_DEVICE_PROPERTIES(int) = 120; + cuda_device_prop_result CUDA_GET_DEVICE_PROPERTIES(int) = 120; /*int CUDA_IPC_CLOSE_MEM_HANDLE(ptr) = 121;*/ /*ptr_result CUDA_IPC_GET_EVENT_HANDLE(int) = 122;*/ /*ptr_result CUDA_IPC_GET_MEM_HANDLE(ptr) = 123;*/ @@ -174,7 +270,7 @@ program RPC_CD_PROG { /* ? CUDA_STREAM_GET_CAPTURE_INFO(ptr) = 261;*/ int_result CUDA_STREAM_GET_FLAGS(ptr) = 262; int_result CUDA_STREAM_GET_PRIORITY(ptr) = 263; - /* ? CUDA_STREAM_IS_CAPTURING(ptr) = 264;*/ + int_result CUDA_STREAM_IS_CAPTURING(ptr) = 264; int CUDA_STREAM_QUERY(ptr) = 265; /*int CUDA_STREAM_SET_ATTRIBUTE(ptr, int, ?) 
= 266;*/ int CUDA_STREAM_SYNCHRONIZE(ptr) = 267; @@ -201,8 +297,6 @@ program RPC_CD_PROG { int CUDA_FUNC_SET_SHARED_MEM_CONFIG(ptr, int) = 313; int CUDA_LAUNCH_COOPERATIVE_KERNEL(ptr, rpc_dim3, rpc_dim3, mem_data, size_t, ptr) = 314; - int CUDA_LAUNCH_COOPERATIVE_KERNEL_MULTI_DEVICE(ptr, - rpc_dim3, rpc_dim3, mem_data, size_t, ptr, int, int) = 315; /*int CUDA_LAUNCH_HOST_FUNC(ptr, ptr, mem_data) = 316;*/ int CUDA_LAUNCH_KERNEL(ptr, rpc_dim3, rpc_dim3, mem_data, size_t, ptr) = 317; @@ -225,7 +319,8 @@ program RPC_CD_PROG { /*ptr_result CUDA_GET_MIPMAPPED_ARRAY_LEVEL(ptr, int) = 406;*/ ptr_result CUDA_GET_SYMBOL_ADDRESS(ptr) = 407; u64_result CUDA_GET_SYMBOL_SIZE(ptr) = 408; - int CUDA_HOST_ALLOC(int, size_t, ptr, unsigned int) = 409; + sz_result CUDA_HOST_ALLOC(size_t, unsigned int) = 409; + int CUDA_HOST_ALLOC_REGSHM(size_t, ptr) = 477; ptr_result CUDA_HOST_GET_DEVICE_POINTER(ptr, int) = 410; int_result CUDA_HOST_GET_FLAGS(ptr) = 411; /*int CUDA_HOST_REGISTER(ptr, size_t, int) = 412;*/ @@ -263,13 +358,13 @@ program RPC_CD_PROG { int CUDA_MEMCPY_MT_SYNC(int) = 451; int CUDA_MEMSET(ptr, int, size_t) = 470; int CUDA_MEMSET_2D(ptr, size_t, int, size_t, size_t) = 471; - /*int CUDA_MEMSET_2D_ASYNC(ptr, size_t, - int, size_t, size_t, int) = 472;*/ + int CUDA_MEMSET_2D_ASYNC(ptr, size_t, + int, size_t, size_t, ptr) = 472; int CUDA_MEMSET_3D(size_t, ptr, size_t, size_t, int, size_t, size_t, size_t) = 473; - /*int CUDA_MEMSET_3D_ASYNC(size_t, ptr, size_t, size_t, int, - size_t, size_t, size_t, int) = 474;*/ - /*int CUDA_MEMSET_ASYNC(ptr, int, size_t, int) = 475;*/ + int CUDA_MEMSET_3D_ASYNC(size_t, ptr, size_t, size_t, int, + size_t, size_t, size_t, ptr) = 474; + int CUDA_MEMSET_ASYNC(ptr, int, size_t, ptr) = 475; /*? CUDA_MIPMAPPED_ARRAY_GET_SPARSE_PROPERTIES(ptr) = 476;*/ /* make_ APIs can be copied on the client side */ @@ -298,6 +393,8 @@ program RPC_CD_PROG { /* NOT IMPLEMENTED */ /* ### Profiler Control ### */ + int CUDA_PROFILER_START(void) = 701; + int CUDA_PROFILER_STOP(void) = 702; /* NOT IMPLEMENTED */ /* DRIVER API */ @@ -323,6 +420,11 @@ program RPC_CD_PROG { ptr_result rpc_cuModuleLoad(string<>) = 1019; str_result rpc_cuGetErrorString(int) = 1020; int rpc_cuModuleUnload(ptr) = 1021; + dint_result rpc_cuDevicePrimaryCtxGetState(int) = 1022; + mem_result rpc_cuDeviceGetProperties(int) = 1023; + dint_result rpc_cuDeviceComputeCapability(int) = 1024; + int_result rpc_cuDeviceGetP2PAttribute(int, ptr, ptr) = 1025; + ptr_result rpc_cuModuleLoadData(mem_data mem) = 1026; /* HIDDEN DRIVER API */ /* ptr_result rpc_hidden_get_device_ctx(int) = 1101; @@ -349,5 +451,124 @@ program RPC_CD_PROG { int rpc_cublasDgemm(ptr, int, int, int, int, int, double, ptr, int, ptr, int, double, ptr, int) = 3002; int rpc_cublasDestroy(ptr) = 3003; + int rpc_cublasSgemm(ptr, int, int, int, int, int, float, + ptr, int, ptr, int, float, ptr, int) = 3004; + int rpc_cublasSgemv(ptr, int, int, int, float, + ptr, int, ptr, int, float, ptr, int) = 3005; + int rpc_cublasDgemv(ptr, int, int, int, double, + ptr, int, ptr, int, double, ptr, int) = 3006; + int rpc_cublasSgemmEx(ptr, int, int, int, int, int, float, + ptr, int, int, ptr, int, int, float, ptr, int, int) = 3007; + int rpc_cublasSetStream(ptr handle, ptr streamId) = 3008; + int rpc_cublasSetWorkspace(ptr handle, ptr workspace, size_t workspaceSizeInBytes) = 3009; + int rpc_cublasSetMathMode(ptr handle, int mode) = 3010; + + /* NVML */ + int_result rpc_nvmlDeviceGetCount_v2(void) = 4000; + int rpc_nvmlInitWithFlags(int) = 4001; + int rpc_nvmlInit_v2(void) = 4002; + 
int rpc_nvmlShutdown(void) = 4003; + + /* CUDNN */ + size_t rpc_cudnnGetVersion(void) = 5000; + size_t rpc_cudnnGetMaxDeviceVersion(void) = 5001; + size_t rpc_cudnnGetCudartVersion(void) = 5002; + string rpc_cudnnGetErrorString (int status) = 5003; + int_result rpc_cudnnQueryRuntimeError(ptr handle, int mode) = 5004; + int_result rpc_cudnnGetProperty(int type) = 5005; + ptr_result rpc_cudnnCreate(void) = 5006; + int rpc_cudnnDestroy(ptr handle) = 5007; + int rpc_cudnnSetStream(ptr handle, ptr streamId) = 5008; + ptr_result rpc_cudnnGetStream(ptr handle) = 5009; + ptr_result rpc_cudnnCreateTensorDescriptor(void) = 5010; + int rpc_cudnnSetTensor4dDescriptor(ptr tensorDesc, int format, int dataType, int n, int c, int h, int w) = 5011; + int rpc_cudnnSetTensor4dDescriptorEx(ptr tensorDesc, int dataType, int n, int c, int h, int w, int nStride, int cStride, int hStride, int wStride) = 5012; + int9_result rpc_cudnnGetTensor4dDescriptor(ptr tensorDesc) = 5013; + int rpc_cudnnSetTensorNdDescriptor(ptr tensorDesc, int dataType, int nbDims, mem_data dimA, mem_data strideA) = 5014; + int rpc_cudnnSetTensorNdDescriptorEx(ptr tensorDesc, int format, int dataType, int nbDims, mem_data dimA) = 5015; + mem_result rpc_cudnnGetTensorNdDescriptor(ptr tensorDesc, int nbDimsRequested) = 5016; + sz_result rpc_cudnnGetTensorSizeInBytes(ptr tensorDesc) = 5017; + int rpc_cudnnDestroyTensorDescriptor(ptr tensorDesc) = 5018; + /* + sz_result rpc_cudnnInitTransformDest(ptr transformDesc, ptr srcDesc, ptr destDesc) = 5019; + ptr_result rpc_cudnnCreateTensorTransformDescriptor(void) = 5020; + int rpc_cudnnSetTensorTransformDescriptor(ptr transformDesc, uint32_t nbDims, int destFormat, mem_data padBeforeA, mem_data padAfterA, mem_data foldA, int direction) = 5021; + mem_result rpc_cudnnGetTensorTransformDescriptor(ptr transformDesc, uint32_t nbDimsRequested) = 5022; + int rpc_cudnnDestroyTensorTransformDescriptor(ptr transformDesc) = 5023; + */ + int rpc_cudnnTransformTensor(ptr handle, cudnn_scaling_t alpha, ptr xDesc, ptr x, cudnn_scaling_t beta, ptr yDesc, ptr y) = 5024; + /* + ptr_result rpc_cudnnTransformTensorEx(ptr handle, ptr transDesc, cudnn_scaling_t alpha, ptr srcDesc, cudnn_scaling_t srcData, cudnn_scaling_t beta, ptr destDesc) = 5025; + */ + int rpc_cudnnAddTensor(ptr handle, cudnn_scaling_t alpha, ptr aDesc, ptr A, cudnn_scaling_t beta, ptr cDesc, ptr C) = 5026; + /* + ptr_result rpc_cudnnCreateOpTensorDescriptor(void) = 5027; + int rpc_cudnnSetOpTensorDescriptor(ptr opTensorDesc, int opTensorOp, int opTensorCompType, int opTensorNanOpt) = 5028; + int3_result rpc_cudnnGetOpTensorDescriptor(ptr opTensorDesc) = 5029; + int rpc_cudnnDestroyOpTensorDescriptor(ptr opTensorDesc) = 5030; + mem_result rpc_cudnnOpTensor(ptr handle, ptr opTensorDesc, cudnn_scaling_t alpha1, ptr aDesc, mem_data A, cudnn_scaling_t alpha2, ptr bDesc, mem_data B, cudnn_scaling_t beta, ptr cDesc) = 5031; + ptr_result rpc_cudnnCreateReduceTensorDescriptor(void) = 5032; + int rpc_cudnnSetReduceTensorDescriptor(ptr reduceTensorDesc, int reduceTensorOp, int reduceTensorCompType, int reduceTensorNanOpt, int reduceTensorIndices, int reduceTensorIndicesType) = 5033; + int5_result rpc_cudnnGetReduceTensorDescriptor(ptr reduceTensorDesc) = 5034; + int rpc_cudnnDestroyReduceTensorDescriptor(ptr reduceTensorDesc) = 5035; + sz_result rpc_cudnnGetReductionIndicesSize(ptr handle, ptr reduceTensorDesc, ptr aDesc, ptr cDesc) = 5036; + sz_result rpc_cudnnGetReductionWorkspaceSize(ptr handle, ptr reduceTensorDesc, ptr aDesc, ptr cDesc) = 5037; + 
mem_result rpc_cudnnReduceTensor(ptr handle, ptr reduceTensorDesc, ptr indices, size_t indicesSizeInBytes, ptr workspace, size_t workspaceSizeInBytes, cudnn_scaling_t alpha, ptr aDesc, ptr A, cudnn_scaling_t beta, ptr cDesc, ptr C) = 5038; + int rpc_cudnnSetTensor(ptr handle, ptr yDesc, ptr y, mem_data valuePtr) = 5039; + int rpc_cudnnScaleTensor(ptr handle, ptr yDesc, ptr y, cudnn_scaling_t alpha) = 5040; */ + + ptr_result rpc_cudnnCreateFilterDescriptor(void) = 5041; + int rpc_cudnnSetFilter4dDescriptor(ptr filterDesc, int dataType, int format, int k, int c, int h, int w) = 5042; + int6_result rpc_cudnnGetFilter4dDescriptor(ptr filterDesc) = 5043; + int rpc_cudnnSetFilterNdDescriptor(ptr filterDesc, int dataType, int format, int nbDims, mem_data filterDimA) = 5044; + mem_result rpc_cudnnGetFilterNdDescriptor(ptr filterDesc, int nbDimsRequested) = 5045; + sz_result rpc_cudnnGetFilterSizeInBytes(ptr filterDesc) = 5046; + int rpc_cudnnTransformFilter(ptr handle, ptr transDesc, cudnn_scaling_t alpha, ptr srcDesc, ptr srcData, cudnn_scaling_t beta, ptr destDesc, ptr destData) = 5047; + int rpc_cudnnDestroyFilterDescriptor(ptr filterDesc) = 5048; + int rpc_cudnnSoftmaxForward(ptr handle, int algo, int mode, cudnn_scaling_t alpha, ptr xDesc, ptr x, cudnn_scaling_t beta, ptr yDesc, ptr y) = 5049; + ptr_result rpc_cudnnCreatePoolingDescriptor(void) = 5050; + int rpc_cudnnSetPooling2dDescriptor(ptr poolingDesc, int mode, int maxpoolingNanOpt, int windowHeight, int windowWidth, int verticalPadding, int horizontalPadding, int verticalStride, int horizontalStride) = 5051; + int8_result rpc_cudnnGetPooling2dDescriptor(ptr poolingDesc) = 5052; + int rpc_cudnnSetPoolingNdDescriptor(ptr poolingDesc, int mode, int maxpoolingNanOpt, int nbDims, mem_data windowDimA, mem_data paddingA, mem_data strideA) = 5053; + mem_result rpc_cudnnGetPoolingNdDescriptor(ptr poolingDesc, int nbDimsRequested) = 5054; + mem_result rpc_cudnnGetPoolingNdForwardOutputDim(ptr poolingDesc, ptr inputTensorDesc, int nbDims) = 5055; + int4_result rpc_cudnnGetPooling2dForwardOutputDim(ptr poolingDesc, ptr inputTensorDesc) = 5056; + int rpc_cudnnDestroyPoolingDescriptor(ptr poolingDesc) = 5057; + int rpc_cudnnPoolingForward(ptr handle, ptr poolingDesc, cudnn_scaling_t alpha, ptr xDesc, ptr x, cudnn_scaling_t beta, ptr yDesc, ptr y) = 5058; + ptr_result rpc_cudnnCreateActivationDescriptor(void) = 5059; + int rpc_cudnnSetActivationDescriptor(ptr activationDesc, int mode, int reluNanOpt, double coef) = 5060; + int2d1_result rpc_cudnnGetActivationDescriptor(ptr activationDesc) = 5061; + int rpc_cudnnSetActivationDescriptorSwishBeta(ptr activationDesc, double swish_beta) = 5062; + d_result rpc_cudnnGetActivationDescriptorSwishBeta(ptr activationDesc) = 5063; + int rpc_cudnnDestroyActivationDescriptor(ptr activationDesc) = 5064; + int rpc_cudnnActivationForward(ptr handle, ptr activationDesc, cudnn_scaling_t alpha, ptr xDesc, ptr x, cudnn_scaling_t beta, ptr yDesc, ptr y) = 5065; + ptr_result rpc_cudnnCreateLRNDescriptor(void) = 5066; + int rpc_cudnnSetLRNDescriptor(ptr normDesc, unsigned lrnN, double lrnAlpha, double lrnBeta, double lrnK) = 5067; + int1d3_result rpc_cudnnGetLRNDescriptor(ptr normDesc) = 5068; + int rpc_cudnnDestroyLRNDescriptor(ptr lrnDesc) = 5069; + int rpc_cudnnLRNCrossChannelForward(ptr handle, ptr normDesc, int lrnMode, cudnn_scaling_t alpha, ptr xDesc, ptr x, cudnn_scaling_t beta, ptr yDesc, ptr y) = 5070; + /* cudnn cnn inference */ + ptr_result rpc_cudnnCreateConvolutionDescriptor(void) = 5301; + int 
rpc_cudnnDestroyConvolutionDescriptor(ptr convDesc) = 5302; + mem_result rpc_cudnnGetConvolutionNdForwardOutputDim(ptr convDesc, ptr inputTensorDesc, ptr filterDesc, int nbDims) = 5303; + int rpc_cudnnSetConvolutionNdDescriptor(ptr convDesc, int arrayLength, mem_data padA, mem_data filterStrideA, mem_data dilationA, int mode, int computeType) = 5304; + mem_result rpc_cudnnGetConvolutionForwardAlgorithm_v7(ptr handle, ptr srcDesc, ptr filterDesc, ptr convDesc, ptr destDesc, int requestedAlgoCount) = 5305; + mem_result rpc_cudnnFindConvolutionForwardAlgorithm(ptr handle, ptr xDesc, ptr wDesc, ptr convDesc, ptr yDesc, int requestedAlgoCount) = 5306; + sz_result rpc_cudnnGetConvolutionForwardWorkspaceSize(ptr handle, ptr xDesc, ptr wDesc, ptr convDesc, ptr yDesc, int algo) = 5307; + int rpc_cudnnConvolutionForward(ptr handle, cudnn_scaling_t alpha, ptr xDesc, ptr x, ptr wDesc, ptr w, ptr convDesc, int algo, ptr workSpace, size_t workSpaceSizeInBytes, cudnn_scaling_t beta, ptr yDesc, ptr y) = 5308; + ptr_result rpc_cudnnBackendCreateDescriptor(int descriptorType) = 5309; + int rpc_cudnnBackendDestroyDescriptor(ptr descriptor) = 5310; + int rpc_cudnnBackendInitialize(ptr descriptor) = 5311; + int rpc_cudnnBackendFinalize(ptr descriptor) = 5312; + int rpc_cudnnBackendSetAttribute(ptr descriptor, + int attributeName, + int attributeType, + hyper elementCount, + mem_data arrayOfElements) = 5313; + mem_result rpc_cudnnBackendGetAttribute(ptr descriptor, + int attributeName, + int attributeType, + hyper requestedElementCount) = 5314; + int rpc_cudnnBackendExecute(ptr handle, ptr executionPlan, ptr variantPack) = 5315; } = 1; } = 99; diff --git a/cpu/cr.c b/cpu/cr.c index e14f58f5..7e1e2e74 100644 --- a/cpu/cr.c +++ b/cpu/cr.c @@ -754,7 +754,6 @@ static int cr_restore_resources(const char *path, api_record_t *record, resource break; case CUDA_LAUNCH_KERNEL: case CUDA_LAUNCH_COOPERATIVE_KERNEL: - case CUDA_LAUNCH_COOPERATIVE_KERNEL_MULTI_DEVICE: break; case rpc_cusolverDnCreate: if (cr_restore_cusolver(record, rm_cusolver) != 0) { @@ -821,9 +820,6 @@ int cr_launch_kernel(void) } else if (record->function == CUDA_LAUNCH_COOPERATIVE_KERNEL) { LOGE(LOG_ERROR, "not yet supported"); goto cleanup; - } else if (record->function == CUDA_LAUNCH_COOPERATIVE_KERNEL_MULTI_DEVICE) { - LOGE(LOG_ERROR, "not yet supported"); - goto cleanup; } } ret = 0; diff --git a/cpu/gsched_none.c b/cpu/gsched_none.c index 8b509089..6cb1152f 100644 --- a/cpu/gsched_none.c +++ b/cpu/gsched_none.c @@ -23,7 +23,7 @@ int gsched_none_init(void) pthread_mutex_init(&mutex_device, NULL); pthread_mutex_init(&mutex_ids, NULL); if ((res = cudaGetDeviceCount(&cuda_max_devices)) != cudaSuccess) { - LOGE(LOG_ERROR, "cudaGetDeviceCount failed: %s", cudaGetErrorString(res)); + LOGE(LOG_ERROR, "cudaGetDeviceCount failed: %s (%d)", cudaGetErrorString(res), res); return 1; } return 0; diff --git a/cpu/log.c b/cpu/log.c index e104890e..6e4807ee 100644 --- a/cpu/log.c +++ b/cpu/log.c @@ -20,6 +20,8 @@ #include #include +static struct timeval start_time = {0}; + struct log_data* get_log_data() { static struct log_data log_data; return &log_data; @@ -46,6 +48,7 @@ void init_log(char log_level, const char* proj_root) { get_log_data()->curr_level=log_level; get_log_data()->project_offset = str_find_last_of(proj_root, '/'); + gettimeofday(&start_time, 0); } void now_time(char* buf) @@ -57,9 +60,23 @@ void now_time(char* buf) sprintf(buf, "%s.%06ld", buffer, (long)tv.tv_usec); } +void delta_time(char* buf) +{ + struct timeval tv; + gettimeofday(&tv, 0); + 
timersub(&tv, &start_time, &tv); + char buffer[100]; + strftime(buffer, sizeof(buffer), "%X", localtime(&tv.tv_sec)); + sprintf(buf, "+%s.%06ld", buffer, (long)tv.tv_usec); +} + const char* to_string(log_level level) { +#ifdef NOCOLORS static const char* const buffer[] = {"ERROR", "WARNING", "INFO", "DEBUG"}; +#else + static const char* const buffer[] = {"\033[1m\033[31mERROR\033[0m", "\033[33mWARNING\033[0m", "\033[34mINFO\033[0m", "\033[32mDEBUG\033[0m"}; +#endif //NOCOLORS if(level > LOG_DEBUG){ return buffer[LOG_DEBUG]; } @@ -71,9 +88,13 @@ void loggf(log_level level, const char* formatstr, ... ) va_list vararg; va_start(vararg, formatstr); - char time[100]; + char time[64]; +#ifdef DELTA_TIME + delta_time(time); +#else now_time(time); - printf("%s (%s):\t", time, to_string(level)); +#endif //DELTA_TIME + printf("%s %s:\t", time, to_string(level)); vprintf(formatstr, vararg); printf("\n"); } @@ -84,11 +105,19 @@ void loggfe(log_level level, int line, const char* file, const char* formatstr, va_start(vararg, formatstr); char time[64]; +#ifdef DELTA_TIME + delta_time(time); +#else now_time(time); +#endif //DELTA_TIME printf("%s %7s: ", time, to_string(level)); vprintf(formatstr, vararg); char stripped[64]; strcpy(stripped, file); str_strip(stripped, get_log_data()->project_offset); - printf("\tin %s(%d)\n", stripped, line); +#ifdef NOCOLORS + printf("\tin %s:%d\n", stripped, line); +#else + printf("\tin \033[4m%s:%d\033[0m\n", stripped, line); +#endif //NOCOLORS } diff --git a/cpu/log.h b/cpu/log.h index 81ce80be..379c5865 100644 --- a/cpu/log.h +++ b/cpu/log.h @@ -38,6 +38,8 @@ else loggfe(level, __LINE__, __FILE__, __VA_ARGS__) #define LOG_DEBUG 3 #define LOG_DBG(i) LOG_DEBUG + i +#define DELTA_TIME 1 + typedef char log_level; struct log_data{ diff --git a/cpu/resource-mg.c b/cpu/resource-mg.c index e07e6a5f..f78503fe 100644 --- a/cpu/resource-mg.c +++ b/cpu/resource-mg.c @@ -75,6 +75,28 @@ static void* resource_mg_search_map(resource_mg *mg, void *client_address) LOGE(LOG_DEBUG, "no find: %p", client_address); return client_address; } + +void resource_mg_print(resource_mg *mg) +{ + size_t i; + resource_mg_map_elem *elem; + if (mg == NULL) { + LOGE(LOG_ERROR, "resource manager mg is NULL"); + return; + } + LOG(LOG_DEBUG, "new_res:"); + for (i = 0; i < mg->new_res.length; i++) { + LOG(LOG_DEBUG, "%p", *(void**)list_get(&mg->new_res, i)); + } + if (mg->bypass == 0) { + LOG(LOG_DEBUG, "map_res:"); + for (i = 0; i < mg->map_res.length; i++) { + elem = list_get(&mg->map_res, i); + LOG(LOG_DEBUG, "%p -> %p", elem->client_address, elem->cuda_address); + } + } +} + inline void* resource_mg_get(resource_mg *mg, void* client_address) { if (mg->bypass) { @@ -85,6 +107,7 @@ inline void* resource_mg_get(resource_mg *mg, void* client_address) return 0; } +#include int resource_mg_add_sorted(resource_mg *mg, void* client_address, void* cuda_address) { ssize_t start = 0; @@ -124,10 +147,11 @@ int resource_mg_add_sorted(resource_mg *mg, void* client_address, void* cuda_add return 0; } } - if (end < 0) { + if (end < 0LL) { end = 0; } - if (mid_elem->client_address < client_address) { + resource_mg_map_elem *end_elem = list_get(&mg->map_res, end); + if (end_elem->client_address < client_address) { end++; } return list_insert(&mg->map_res, end, &new_elem); diff --git a/cpu/resource-mg.h b/cpu/resource-mg.h index aa8bff25..ee8c44fa 100644 --- a/cpu/resource-mg.h +++ b/cpu/resource-mg.h @@ -28,15 +28,28 @@ resource_mg rm_streams; resource_mg rm_events; resource_mg rm_arrays; resource_mg rm_memory; 
+resource_mg rm_kernels; //Driver API RMs resource_mg rm_modules; resource_mg rm_functions; +resource_mg rm_globals; //Other RMs resource_mg rm_cusolver; resource_mg rm_cublas; +//CUDNN RMs +resource_mg rm_cudnn; +resource_mg rm_cudnn_tensors; +resource_mg rm_cudnn_filters; +resource_mg rm_cudnn_tensortransform; +resource_mg rm_cudnn_poolings; +resource_mg rm_cudnn_activations; +resource_mg rm_cudnn_lrns; +resource_mg rm_cudnn_convs; +resource_mg rm_cudnn_backendds; + /** initializes the resource manager * @@ -54,4 +67,6 @@ int resource_mg_create(resource_mg *mg, void* cuda_address); void* resource_mg_get(resource_mg *mg, void* client_address); +void resource_mg_print(resource_mg *mg); + #endif //_RESOURCE_MG_H_ diff --git a/cpu/server-exe.c index 8e358be6..a174e0a2 100644 --- a/cpu/server-exe.c +++ b/cpu/server-exe.c @@ -3,17 +3,22 @@ #include "log.h" #include +#include <stdint.h> int main(int argc, char** argv) { - - //TODO: Check if command path exists if (argc == 1) { - cricket_main_static(RPC_CD_PROG, RPC_CD_VERS); + cricket_main(RPC_CD_PROG, RPC_CD_VERS); } else if (argc == 2) { - cricket_main_hash(argv[1]); + uint64_t vers; + if (sscanf(argv[1], "%lu", &vers) != 1) { + printf("version string could not be converted to number\n"); + printf("usage: %s [unique rpc version]\n", argv[0]); + return 1; + } + cricket_main(RPC_CD_PROG, vers); } else { - LOGE(LOG_ERROR, "usage: %s [command]", argv[0]); + printf("usage: %s\n", argv[0]); } return 0; } diff --git a/cpu/server-library.c deleted file mode 100644 index cd5e57a1..00000000 --- a/cpu/server-library.c +++ /dev/null @@ -1,10 +0,0 @@ - -#include "cpu-server.h" -#include "log.h" - -/* shared object constructor; executes before main and thus hijacks main program */ -void __attribute__ ((constructor)) library_constr(void) -{ - cricket_main_hash("/proc/self/exe"); -} - diff --git a/docs/pytorch.md new file mode 100644 index 00000000..ebb8a3b5 --- /dev/null +++ b/docs/pytorch.md @@ -0,0 +1,130 @@ +# Cricket PyTorch + +Get the PyTorch sources +``` +git clone git@github.com:pytorch/pytorch.git +cd pytorch +git checkout v1.13.1 +git submodule update --init --recursive +``` + +Patch the sources: +- link cudart dynamically when building the Docker image +- link cudart dynamically when building ATen +- link cudart dynamically when building nccl +- deactivate building for some old CUDA versions.
(optional) +- add cricket dependencies to dockerfile +``` +diff --git a/Dockerfile b/Dockerfile +index 815a9108ce9..53ec7689493 100644 +--- a/Dockerfile ++++ b/Dockerfile +@@ -53,7 +53,7 @@ WORKDIR /opt/pytorch + COPY --from=conda /opt/conda /opt/conda + COPY --from=submodule-update /opt/pytorch /opt/pytorch + RUN --mount=type=cache,target=/opt/ccache \ +- TORCH_CUDA_ARCH_LIST="3.5 5.2 6.0 6.1 7.0+PTX 8.0" TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \ ++ TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0 8.0" TORCH_NVCC_FLAGS="-Xfatbin -compress-all -cudart shared" \ + CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" \ + python setup.py install + +@@ -93,3 +93,13 @@ WORKDIR /workspace + FROM official as dev + # Should override the already installed version from the official-image stage + COPY --from=build /opt/conda /opt/conda ++RUN apt-get update && apt-get install -y --no-install-recommends \ ++ rpcbind \ ++ git \ ++ automake \ ++ libtool \ ++ libssl-dev \ ++ inetutils-ping \ ++ vim \ ++ libgl1-mesa-dev \ ++ gdb && \ ++ rm -rf /var/lib/apt/lists/* +diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt +index 3055e290094..4cc14c794b0 100644 +--- a/aten/src/ATen/CMakeLists.txt ++++ b/aten/src/ATen/CMakeLists.txt +@@ -458,7 +458,7 @@ if(USE_CUDA AND NOT USE_ROCM) + endif() + if($ENV{ATEN_STATIC_CUDA}) + list(APPEND ATen_CUDA_DEPENDENCY_LIBS "${CUDA_TOOLKIT_ROOT_DIR}/lib64/libculibos.a") +- list(APPEND ATen_CUDA_DEPENDENCY_LIBS "${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcudart_static.a") ++ list(APPEND ATen_CUDA_DEPENDENCY_LIBS "${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcudart.so") + endif($ENV{ATEN_STATIC_CUDA}) + endif() +``` +`third_party/nccl/nccl` +``` +diff --git a/makefiles/common.mk b/makefiles/common.mk +index 1a1c2b6..c781b39 100644 +--- a/makefiles/common.mk ++++ b/makefiles/common.mk +@@ -54,7 +54,7 @@ CXXFLAGS := -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisi + # Maxrregcount needs to be set accordingly to NCCL_MAX_NTHREADS (otherwise it will cause kernel launch errors) + # 512 : 120, 640 : 96, 768 : 80, 1024 : 60 + # We would not have to set this if we used __launch_bounds__, but this only works on kernels, not on functions. +-NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all ++NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -cudart shared + # Use addprefix so that we can specify more than one path +-NVLDFLAGS := -L${CUDA_LIB} -lcudart -lrt ++NVLDFLAGS := -L${CUDA_LIB} -lcudart -lrt -cudart shared + + ########## GCOV ########## + GCOV ?= 0 # disable by default. 
+diff --git a/src/Makefile b/src/Makefile +index d658c35..5bd9876 100644 +--- a/src/Makefile ++++ b/src/Makefile +@@ -28,7 +28,7 @@ LIBDIR := $(BUILDDIR)/lib + OBJDIR := $(BUILDDIR)/obj + PKGDIR := $(BUILDDIR)/lib/pkgconfig + ##### target files +-CUDARTLIB ?= cudart_static ++CUDARTLIB ?= cudart + INCTARGETS := $(INCEXPORTS:%=$(INCDIR)/%) + LIBSONAME := $(LIBNAME:%=%.$(NCCL_MAJOR)) + LIBTARGET := $(LIBNAME:%=%.$(NCCL_MAJOR).$(NCCL_MINOR).$(NCCL_PATCH)) +``` + +Avoid `CMake Error: File /opt/pytorch/build_variables.bzl does not exist.` (https://github.com/pytorch/pytorch/pull/85947): +``` +diff --git a/.gitignore b/.gitignore +index 3e6f3831c4c..db6d9c3527e 100644 +--- a/.gitignore ++++ b/.gitignore +@@ -214,6 +214,7 @@ build_host_protoc + build_android + build_ios + /build_* ++!/build_variables.bzl + .build_debug/* + .build_release/* + .build_profile/* +``` + +build pytorch +``` +# only necessary when building on an NFS share +EXTRA_DOCKER_BUILD_FLAGS='--storage-opt "overlay.mount_program=/usr/bin/fuse-overlayfs"' + +make -f docker.Makefile +``` + +launch cricket server (outside of docker container) +``` +/bin/cricket-rpc-server +``` + +launch docker container, torch +``` +sudo docker run --gpus all --rm -it -v /cricket:/cricket --ipc=host pytorch:latest +LD_LIBRARY_PATH=/cricket/cpu REMOTE_GPU_ADDRESS= LD_PRELOAD=/cricket/cpu/cricket-client.so python3 /cricket/tests/test_apps/pytorch_minimal.py +``` +or under gdb supervision: +``` +LD_LIBRARY_PATH=/cricket/cpu gdb -x /cricket/tests/gdb_client_cmds python3 +(gdb) run /cricket/tests/test_apps/pytorch_minimal.py +``` + diff --git a/submodules/Makefile b/submodules/Makefile index e08c8fc9..e30870da 100644 --- a/submodules/Makefile +++ b/submodules/Makefile @@ -10,7 +10,7 @@ clean: cd cuda-gdb && git apply -R ../cuda-gdb.patch rm -rf lib -libtirpc: +libtirpc/install: @echo -e "\033[36m----> autogen libtirpc\033[0m" if [ ! -f "libtirpc/configure" ]; then cd libtirpc && ./bootstrap; fi @echo -e "\033[36m----> Configuring libtirpc\033[0m" @@ -36,12 +36,17 @@ else endif cuda-gdb/build: +ifeq (,$(wildcard ./cuda-gdb/build)) @echo -e "\033[36m----> Configuring cuda-gdb\033[0m" + @echo -e "\033[36m----> extracting cuda-gdb\033[0m" mkdir -p cuda-gdb/build && cd cuda-gdb/build && \ - ../configure --disable-werror --program-prefix=cuda- --enable-cuda --with-python=no --enable-targets="x86_64-apple-darwin,x86_64-unknown-linux-gnu,arm-elf-linux-gnu,m68k-unknown-linux-gnu" CFLAGS='-I/usr/local/cuda/include' LDFLAGS='-lpthread' + ../configure --disable-werror --program-prefix=cuda- --enable-cuda --with-python=no --enable-targets="x86_64-apple-darwin,x86_64-unknown-linux-gnu,arm-elf-linux-gnu,m68k-unknown-linux-gnu" CFLAGS='-I/usr/local/cuda/include -fPIC' LDFLAGS='-lpthread' @echo -e "\033[36m----> Building cuda-gdb\033[0m" CPATH=/usr/local/cuda/include $(MAKE) -C cuda-gdb/build CPATH=/usr/local/cuda/include $(MAKE) -C cuda-gdb/build/gdb libgdb.a +else + @echo -e "\033[36m----> cuda-gdb/build directory present. Skipping building of cuda-gdb\033[0m" +endif lib: mkdir -p lib diff --git a/tests/.gitignore b/tests/.gitignore index d5dbc051..c08de5b9 100644 --- a/tests/.gitignore +++ b/tests/.gitignore @@ -3,3 +3,4 @@ test-cricket test_api test_cpu test_kernel +test_kernel_call diff --git a/tests/Makefile b/tests/Makefile index 8adc5da7..3048d6d5 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -1,7 +1,11 @@ #MIT License... 
.PHONY: all clean test_apps cpu gpu samples +ifdef NOSAMPLES +all: test_apps cpu gpu bin +else all: test_apps cpu gpu samples bin +endif test_apps: @echo -e "\033[36m----> Building tests/test_apps\033[0m" @@ -20,13 +24,13 @@ samples: @echo -e "\033[36m----> Building tests/samples\033[0m" $(MAKE) -C samples -bin: cpu samples test_apps +bin: cpu test_apps mkdir -p bin cp cpu/unit/*.test bin cp test_apps/*.testapp bin - cp samples/matrixMul/matrixMul bin - cp samples/bandwidthTest/bandwidthTest bin - cp samples/nbody/nbody bin +ifndef NOSAMPLES + cp samples/samples-bin/*.sample bin +endif clean: @echo -e "\033[31m----> Cleaning up tests/test_apps\033[0m" diff --git a/tests/cpu/cubin/Makefile index ca8418a3..c42ee827 100644 --- a/tests/cpu/cubin/Makefile +++ b/tests/cpu/cubin/Makefile @@ -8,11 +8,12 @@ LDFLAGS = -arch=$(ARCH) -cudart shared -lcuda BINARY = main CUBIN = kernel.cubin FATBIN = kernel.fatbin +SHARED = kernel.so FILES := main.o .PHONY: all depend clean -all : $(BINARY) $(CUBIN) $(FATBIN) +all : $(BINARY) $(CUBIN) $(FATBIN) $(SHARED) $(BINARY) : $(FILES) $(LD) $(LDFLAGS) -o $@ $< @@ -26,6 +27,9 @@ $(BINARY) : $(FILES) %.o : %.cpp $(CC) $(CFLAGS) -c -o $@ $< +%.so : %.cu + $(CC) $(CFLAGS) --compiler-options '-fPIC' -o $@ $< + clean : rm -f *.o *.cubin *.fatbin $(BINARY) diff --git a/tests/cpu/cubin/main b/tests/cpu/cubin/main deleted file mode 100755 index f10f1de5..00000000 Binary files a/tests/cpu/cubin/main and /dev/null differ diff --git a/tests/cpu/cubin/main.cpp index c9137243..6bad89b8 100644 --- a/tests/cpu/cubin/main.cpp +++ b/tests/cpu/cubin/main.cpp @@ -3,6 +3,9 @@ #include #include #include +#include <fcntl.h> +#include <sys/stat.h> +#include <sys/mman.h> #define printCudaErrors(err) __printCudaErrors (err, __FILE__, __LINE__) @@ -66,6 +69,46 @@ void check_free_mem(int *mem, size_t len) cudaFree(mem); } +int getModuleFromCubin(CUmodule *module, const char *cubin) +{ + CUresult err; + if ((err = cuModuleLoad(module, "kernel.cubin")) != CUDA_SUCCESS) { + printCudaErrors(err); + return 1; + } + return 0; +} + +int getModuleFromCubinInMemory(CUmodule *module, const char *cubin) +{ + int fd = open(cubin, O_RDONLY); + if (fd < 0) { + printf("error\n"); + return 1; + } + struct stat st; + if (fstat(fd, &st) < 0) { + printf("error\n"); + return 1; + } + printf("size: %#0zx\n", (size_t)st.st_size); + void *buf = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0); + if (buf == MAP_FAILED) { + printf("error\n"); + return 1; + } + CUresult err; + if ((err = cuModuleLoadData(module, buf)) != CUDA_SUCCESS) { + printCudaErrors(err); + return 1; + } + return 0; +} + +int getModuleFromShared(CUmodule **module, const char *cubin) +{ + return 0; +} int main(int argc, char** argv) { @@ -83,9 +126,18 @@ CUmodule module; CUfunction func; printf("testing cubin...\n"); - if ((err = cuModuleLoad(&module, "kernel.cubin")) != CUDA_SUCCESS) { - printCudaErrors(err); + if (getModuleFromCubinInMemory(&module, "kernel.cubin") != 0) { + printf("error\n"); + return 1; } + // if (getModuleFromCubin(&module, "kernel.cubin") != 0) { + // printf("error\n"); + // return 1; + // } + // if ((err = getModuleFromShared(&module, "kernel.so")) != 0) { + // printf("error\n"); + // return 1; + // } if ((err = cuModuleGetFunction(&func, module, "kernel")) != CUDA_SUCCESS) { printCudaErrors(err); diff --git a/tests/cpu/unit/Makefile index 6e22bb27..e7de359c 100644 --- a/tests/cpu/unit/Makefile +++ b/tests/cpu/unit/Makefile @@ -16,7 +16,7 @@
INC_FLAGS += -I../../../cpu/ LIB_FLAGS += -L$(LIBTIRPC_PREFIX)/lib LIB_FLAGS += -L$(CUDA_SRC)/lib64 -LIB_FLAGS += -L../../../cpu/ +LIB_FLAGS += -L../../../bin/ CC_FLAGS += -std=gnu99 $(INC_FLAGS) -g -ggdb -fsanitize=address -fsanitize=pointer-compare -fsanitize=pointer-subtract -fsanitize-address-use-after-scope LD_FLAGS = $(LIB_FLAGS) @@ -32,11 +32,14 @@ CLIENT_LD_FLAGS = $(LD_FLAGS) -l:cricket-client.so all : $(BIN_CLIENT_TESTS) $(BIN_SERVER_TESTS) -$(BIN_SERVER_TESTS) : %.test:%.o +../../../bin/cricket-server.so: + $(MAKE) -C ../../../ bin/cricket-server.so + +$(BIN_SERVER_TESTS) : %.test:%.o ../../../bin/cricket-server.so $(LD) $(CC_FLAGS) -o $@ $< $(SERVER_LD_FLAGS) $(OBJ_SERVER_TESTS) : %.o:%.c - $(CC) $(CC_FLAGS) -c -o $@ $< $(SERVER_LD_FLAGS) + $(CC) $(CC_FLAGS) -c -o $@ $< clean: rm -f $(OBJ_SERVER_TESTS) $(OBJ_CLIENT_TESTS) $(BIN_SERVER_TESTS) $(BIN_CLIENT_TESTS) diff --git a/tests/gdb_client_cmds b/tests/gdb_client_cmds new file mode 100644 index 00000000..825cfdd6 --- /dev/null +++ b/tests/gdb_client_cmds @@ -0,0 +1,3 @@ +python gdb.execute("set environment CRICKET_NOHASH=yes") +python gdb.execute("set environment REMOTE_GPU_ADDRESS=localhost") +python gdb.execute("set environment LD_PRELOAD=../../cpu/cricket-client.so") \ No newline at end of file diff --git a/tests/samples/.gitignore b/tests/samples/.gitignore new file mode 100644 index 00000000..33a20c36 --- /dev/null +++ b/tests/samples/.gitignore @@ -0,0 +1,2 @@ +samples-bin +samples diff --git a/tests/samples/Makefile b/tests/samples/Makefile index 38aa9512..c7c83f63 100644 --- a/tests/samples/Makefile +++ b/tests/samples/Makefile @@ -1,55 +1,101 @@ CC = gcc LD = gcc -CFLAGS = -Wall -std=gnu99 -ARCH = sm_61 -CUDA_DIR = /usr/local/cuda +CFLAGS = -Wall -std=gnu99 -g -ggdb +SAMPLES = samples-bin/matrixMul.compressed.sample \ + samples-bin/matrixMul.uncompressed.sample \ + samples-bin/nbody.uncompressed.sample \ + samples-bin/nbody.compressed.sample \ + samples-bin/bandwidthTest.sample \ + samples-bin/mnistCUDNN.sample + +CUDA_PATH = /usr/local/cuda +SMS = 75 60 +CUDA_SAMPLES_RELEASE ?= 12.1 +CUDA_SAMPLES_URL = https://github.com/NVIDIA/cuda-samples/archive/refs/tags/v${CUDA_SAMPLES_RELEASE}.tar.gz +CUDNN_SAMPLES_URL = https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/libcudnn8-samples-8.9.2.26-1.cuda12.1.x86_64.rpm PWD = $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST)))) .PHONY: all clean distclean -all : matrixMul/matrixMul bandwidthTest/bandwidthTest nbody/nbody - -matrixMul : - mkdir -p $(PWD)/matrixMul - cp -r $(CUDA_DIR)/samples/0_Simple/matrixMul $(PWD) - make -C matrixMul clean - -matrixMul/matrixMul : matrixMul - make -C matrixMul \ - NVCCFLAGS="-m64 -cudart shared" \ - GENCODE_FLAGS="-arch=$(ARCH)" \ - CPATH="$(CUDA_DIR)/samples/common/inc" - -bandwidthTest : - mkdir -p $(PWD)/bandwidthTest - cp -r $(CUDA_DIR)/samples/1_Utilities/bandwidthTest $(PWD) - make -C bandwidthTest clean - -bandwidthTest/bandwidthTest : bandwidthTest - make -C bandwidthTest \ - NVCCFLAGS="-m64 -cudart shared" \ - GENCODE_FLAGS="-arch=$(ARCH)" \ - CPATH="$(CUDA_DIR)/samples/common/inc" - -nbody : - mkdir -p $(PWD)/nbody - cp -r $(CUDA_DIR)/samples/5_Simulations/nbody $(PWD) - make -C nbody clean - -nbody/nbody : nbody - make -C nbody \ - NVCCFLAGS="-m64 -cudart shared" \ - GENCODE_FLAGS="-arch=$(ARCH)" \ - CPATH="$(CUDA_DIR)/samples/common/inc" +all : $(SAMPLES) + +samples: + mkdir -p $@ + wget ${CUDA_SAMPLES_URL} -O - | tar -xz --strip-components=1 -C $@ + +cudnn-samples: + mkdir -p $@ + wget ${CUDNN_SAMPLES_URL} 
-O - | rpm2archive - | tar zxf - --strip-components=4 -C $@ + +samples-bin: + mkdir -p $@ + +samples-bin/data: + cp -R cudnn-samples/mnistCUDNN/data $@ + +samples-bin/mnistCUDNN.sample : cudnn-samples samples-bin samples-bin/data + make -C cudnn-samples/mnistCUDNN \ + clean + make -C cudnn-samples/mnistCUDNN \ + NVCCFLAGS="-cudart shared --no-compress -G" \ + SMS="${SMS}" \ + CUDA_PATH=${CUDA_PATH} \ + DEBUG=1 + cp cudnn-samples/mnistCUDNN/mnistCUDNN $@ + +samples-bin/nbody.uncompressed.sample : samples samples-bin + make -C samples/Samples/5_Domain_Specific/nbody \ + clean + make -C samples/Samples/5_Domain_Specific/nbody \ + NVCCFLAGS="-cudart shared --no-compress -g -G" \ + SMS="${SMS}" \ + CPATH="samples/Common" \ + CUDA_PATH=${CUDA_PATH} + cp samples/Samples/5_Domain_Specific/nbody/nbody $@ + +samples-bin/nbody.compressed.sample : samples samples-bin + make -C samples/Samples/5_Domain_Specific/nbody \ + clean + make -C samples/Samples/5_Domain_Specific/nbody \ + NVCCFLAGS="-cudart shared -Xfatbin --compress-all -g -G" \ + SMS="${SMS}" \ + CPATH="samples/Common" \ + CUDA_PATH=${CUDA_PATH} + cp samples/Samples/5_Domain_Specific/nbody/nbody $@ + +samples-bin/matrixMul.compressed.sample : samples samples-bin + make -C samples/Samples/0_Introduction/matrixMul \ + clean + make -C samples/Samples/0_Introduction/matrixMul \ + NVCCFLAGS="-cudart shared -Xfatbin --compress-all" \ + SMS="${SMS}" \ + CPATH="samples/Common" \ + CUDA_PATH=${CUDA_PATH} + cp samples/Samples/0_Introduction/matrixMul/matrixMul $@ + +samples-bin/matrixMul.uncompressed.sample : samples samples-bin + make -C samples/Samples/0_Introduction/matrixMul \ + clean + make -C samples/Samples/0_Introduction/matrixMul \ + NVCCFLAGS="-cudart shared --no-compress" \ + SMS="${SMS}" \ + CPATH="samples/Common" \ + CUDA_PATH=${CUDA_PATH} + cp samples/Samples/0_Introduction/matrixMul/matrixMul $@ + +samples-bin/bandwidthTest.sample : samples samples-bin + make -C samples/Samples/1_Utilities/bandwidthTest \ + clean + make -C samples/Samples/1_Utilities/bandwidthTest \ + NVCCFLAGS="-cudart shared --no-compress" \ + SMS="${SMS}" \ + CPATH="samples/Common" \ + CUDA_PATH=${CUDA_PATH} + cp samples/Samples/1_Utilities/bandwidthTest/bandwidthTest $@ clean : - rm -f *.elf *.hex *.o *.d .depend *~ - make -C matrixMul clean - make -C bandwidthTest clean - make -C nbody clean + rm -rf samples-bin distclean : clean - rm -r matrixMul - rm -r bandwidthTest - rm -r nbody \ No newline at end of file + rm -rf samples \ No newline at end of file diff --git a/tests/test_apps/Makefile b/tests/test_apps/Makefile index 73a1d32c..dafae5a3 100644 --- a/tests/test_apps/Makefile +++ b/tests/test_apps/Makefile @@ -10,6 +10,7 @@ CFLAGS = -arch=$(ARCH) -cudart shared #CFLAGS = -arch=$(ARCH) LD = nvcc -ccbin g++ LDFLAGS = -arch=$(ARCH) -cudart shared +DEBUG_FLAGS = #-g -G #LDFLAGS = -lcuda -arch=$(ARCH) TEST_CPU_BIN = cpu.testapp TEST_CPU_O = test_cpu.o @@ -19,14 +20,19 @@ TEST_KERNEL_BIN = kernel.testapp TEST_KERNEL_O = test_kernel.o BINARY = cricket.testapp +TEST_KERNEL_LIB_O = test_kernel_lib.o +TEST_KERNEL_LIB = test_kernel.so +TEST_KERNEL_LIB_CALL_O = test_kernel_call.o +TEST_KERNEL_LIB_CALL = test_kernel_call + LIBCUDA_WRAPPER = libcuda.so.1 LIBCUDA_OBJ = libcuda.o -LIBCUDA_LIBS = -ldl +LIBCUDA_LIBS = -ldl -I../../cpu FILES := matmul.cu .PHONY: all depend clean -all : $(TEST_KERNEL_BIN) +all : $(TEST_KERNEL_BIN) $(BINARY) $(TEST_CPU_BIN) $(TEST_API_BIN) $(TEST_KERNEL_LIB) $(TEST_KERNEL_LIB_CALL) $(TEST_CPU_O) : $(FILES) $(CC) -DTEST_CPU $(CFLAGS) -dc -o $@ $< @@ 
-55,11 +61,23 @@ $(BINARY) : $(FILES) $(LIBCUDA_OBJ) : $(LIBCUDA_OBJ:.o=.c) $(HOST_CC) -c -fpic -o $@ $< $(LIBCUDA_LIBS) +$(TEST_KERNEL_LIB_O) : $(FILES) + $(CC) $(CFLAGS) $(DEBUG_FLAGS) -dc --compiler-options '-fPIC' -o $@ $< + +$(TEST_KERNEL_LIB) : $(TEST_KERNEL_LIB_O) + $(LD) $(LDFLAGS) $(DEBUG_FLAGS) -shared -o lib$@ $^ + +$(TEST_KERNEL_LIB_CALL_O) : $(TEST_KERNEL_LIB_CALL_O:.o=.c) + $(HOST_CC) -c -o $@ $< + +$(TEST_KERNEL_LIB_CALL) : $(TEST_KERNEL_LIB_CALL_O) + $(HOST_LD) -o $@ $< -I. -ldl + $(LIBCUDA_WRAPPER) : $(LIBCUDA_OBJ) $(HOST_LD) -shared -o $@ $^ clean : - rm -f *.elf *.hex *.o *.d .depend *~ $(BINARY) $(LIBCUDA_WRAPPER) $(TEST_CPU_BIN) $(TEST_API_BIN) $(TEST_KERNEL_BIN) + rm -f *.elf *.hex *.o *.d .depend *~ $(BINARY) $(LIBCUDA_WRAPPER) $(TEST_CPU_BIN) $(TEST_API_BIN) $(TEST_KERNEL_BIN) $(TEST_KERNEL_LIB) $(TEST_KERNEL_LIB_CALL) diff --git a/tests/test_apps/matmul.cu index 7790ae7b..b4960c39 100644 --- a/tests/test_apps/matmul.cu +++ b/tests/test_apps/matmul.cu @@ -8,7 +8,7 @@ #include "cricket-cuda.h" #define N 32 -#define ITERATIONS 1024*128*8*16 +#define ITERATIONS 1024*128*4 const int blocksize = 32; #ifndef RANDOM_INIT @@ -173,7 +173,6 @@ int main() #endif //RANDOM_INIT uint16_t *res; uint16_t *dev_A, *dev_x, *dev_res; - uint16_t *dev_ptr; struct timeval begin, end; struct timeval messb, messa; const int A_size = N*N*sizeof(uint16_t); @@ -253,11 +252,9 @@ int main() */ cudaMalloc( (void**)&dev_x, x_size ); cudaMalloc( (void**)&dev_res, x_size ); - cudaMalloc( (void**)&dev_ptr, A_size ); printf("Mallocs done\n"); - cudaMemcpy( dev_ptr, A, A_size, cudaMemcpyHostToDevice ); cudaMemcpy( dev_A, A, A_size, cudaMemcpyHostToDevice ); cudaMemcpy( dev_x, x, x_size, cudaMemcpyHostToDevice ); @@ -265,7 +262,7 @@ int main() dim3 dimBlock( blocksize, 1 ); dim3 dimGrid( 1, 1); kernel<<<dimGrid, dimBlock>>>(dev_A, dev_x, dev_res, 0, 0, 0, 0); - //kernel_no_param<<<dimGrid, dimBlock>>>(); + kernel_no_param<<<dimGrid, dimBlock>>>(); //void *args = NULL; //int result = cudaLaunchKernel((void*)kernel_no_param, dimGrid, dimBlock, &args, 0LL, NULL); @@ -305,7 +302,7 @@ int main() gettimeofday(&end, NULL); printf("elapsed time: %0u.%06u\n", (end.tv_sec - begin.tv_sec), (end.tv_usec - begin.tv_usec)); - + free(res); return (success ? 0 : 1); } diff --git a/tests/test_apps/pytorch_minimal.py b/tests/test_apps/pytorch_minimal.py new file mode 100644 index 00000000..d6f49e2d --- /dev/null +++ b/tests/test_apps/pytorch_minimal.py @@ -0,0 +1,73 @@ +# BSD 3-Clause License +# +# Copyright (c) 2017-2022, Pytorch contributors +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import torch +import math + + +dtype = torch.float +device = torch.device("cuda:0") + +# Create random input and output data +x = torch.linspace(-math.pi, math.pi, 2000, device=device, dtype=dtype) +y = torch.sin(x) + +# Randomly initialize weights +a = torch.randn((), device=device, dtype=dtype) +b = torch.randn((), device=device, dtype=dtype) +c = torch.randn((), device=device, dtype=dtype) +d = torch.randn((), device=device, dtype=dtype) + +learning_rate = 1e-6 +for t in range(2000): + # Forward pass: compute predicted y + y_pred = a + b * x + c * x ** 2 + d * x ** 3 + + # Compute and print loss + loss = (y_pred - y).pow(2).sum().item() + if t % 100 == 99: + print(t, loss) + + # Backprop to compute gradients of a, b, c, d with respect to loss + grad_y_pred = 2.0 * (y_pred - y) + grad_a = grad_y_pred.sum() + grad_b = (grad_y_pred * x).sum() + grad_c = (grad_y_pred * x ** 2).sum() + grad_d = (grad_y_pred * x ** 3).sum() + + # Update weights using gradient descent + a -= learning_rate * grad_a + b -= learning_rate * grad_b + c -= learning_rate * grad_c + d -= learning_rate * grad_d + + +print(f'Result: y = {a.item()} + {b.item()} x + {c.item()} x^2 + {d.item()} x^3') + diff --git a/tests/test_apps/test_kernel_call.c b/tests/test_apps/test_kernel_call.c new file mode 100644 index 00000000..ce538a2c --- /dev/null +++ b/tests/test_apps/test_kernel_call.c @@ -0,0 +1,26 @@ +#include +#include + +int main(int argc, char** argv) +{ + void *dlhandle; + + if ((dlhandle = dlopen("./libtest_kernel.so", RTLD_LAZY)) == NULL) { + printf("error opening library\n"); + return 1; + } + + int (*fn)(void); + + printf("kernel: %p\n", dlsym(dlhandle, "_Z6kernelPtS_S_csix")); + + if ((fn = dlsym(dlhandle, "main")) == NULL) { + printf("dlsym failed\n"); + return 1; + } + + fn(); + + return 0; +} + diff --git a/tests/test_apps/yolo.py b/tests/test_apps/yolo.py new file mode 100644 index 00000000..1d929155 --- /dev/null +++ b/tests/test_apps/yolo.py @@ -0,0 +1,12 @@ +import torch + +model = torch.hub.load("ultralytics/yolov5", "yolov5s", device='cuda:0') # or yolov5n - yolov5x6, custom + +# Images +img = "https://ultralytics.com/images/zidane.jpg" # or file, Path, PIL, OpenCV, numpy, list + +# Inference +results = model(img) + +# Results +results.print() # or .show(), .save(), .crop(), .pandas(), etc. 
\ No newline at end of file diff --git a/utils/Dockerfile b/utils/Dockerfile index 30fddb78..66cfcae5 100644 --- a/utils/Dockerfile +++ b/utils/Dockerfile @@ -1,4 +1,4 @@ -FROM centos:8 +FROM rockylinux:8 LABEL \ org.label-schema.schema-version = "1.0" \ @@ -9,25 +9,25 @@ LABEL \ org.label-schema.author.email = "niklas.eiling@eonerc.rwth-aachen.de" \ org.label-schema.vcs-url = "https://git.rwth-aachen.de/niklas.eiling/cricket" -RUN cd /etc/yum.repos.d/ && sed -i 's/mirrorlist/#mirrorlist/g' /etc/yum.repos.d/CentOS-* && sed -i 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-* && yum update -y - RUN dnf -y update && \ - dnf install -y epel-release dnf-plugins-core && \ - dnf install -y https://rpms.remirepo.net/enterprise/remi-release-8.rpm && \ - dnf config-manager --set-enabled powertools && \ - dnf config-manager --set-enabled remi + dnf install -y epel-release dnf-plugins-core && \ + dnf install -y https://rpms.remirepo.net/enterprise/remi-release-8.rpm && \ + dnf config-manager --set-enabled powertools && \ + dnf config-manager --set-enabled remi RUN dnf install -y make bash git gcc autoconf libtool automake rpcgen \ ncurses-devel zlib-devel binutils-devel mesa-libGL-devel \ libvdpau-devel mesa-libEGL-devel openssl-devel rpcbind \ texinfo bison flex python3 which libibverbs libibverbs-devel \ - libasan cppcheck wget expat-devel xz-devel mesa-libGLU-devel freeglut-devel + libasan cppcheck wget expat-devel xz-devel mesa-libGLU-devel freeglut-devel \ + elfutils-libelf-devel cpio openssl-devel openssl-libs \ + freeimage freeimage-devel ENV LD_LIBRARY_PATH="/usr/local/lib:/usr/local/lib64:${LD_LIBRARY_PATH}" RUN dnf -y config-manager --add-repo http://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo && \ - dnf --refresh -y install cuda-compiler-11-1 cuda-libraries-devel-11-1 cuda-samples-11-1 cuda-driver-devel-11-1 && \ - ln -s cuda-11.1 /usr/local/cuda && \ + dnf --refresh -y install cuda-compiler-12-1 cuda-libraries-devel-12-1 cuda-driver-devel-12-1 cuda-profiler-api-12-1 cuda-nvml-devel-12-1 nvidia-driver-NVML-530.30.02 libcudnn8-devel && \ + ln -s cuda-12.1 /usr/local/cuda && \ ln -s libcuda.so /usr/local/cuda/targets/x86_64-linux/lib/stubs/libcuda.so.1 ENV PATH="/usr/local/cuda/bin:${PATH}" diff --git a/utils/Dockerfile.cuda10 b/utils/Dockerfile.cuda11 similarity index 63% rename from utils/Dockerfile.cuda10 rename to utils/Dockerfile.cuda11 index 2dcec62b..a261bb98 100644 --- a/utils/Dockerfile.cuda10 +++ b/utils/Dockerfile.cuda11 @@ -9,6 +9,8 @@ LABEL \ org.label-schema.author.email = "niklas.eiling@eonerc.rwth-aachen.de" \ org.label-schema.vcs-url = "https://git.rwth-aachen.de/niklas.eiling/cricket" +RUN cd /etc/yum.repos.d/ && sed -i 's/mirrorlist/#mirrorlist/g' /etc/yum.repos.d/CentOS-* && sed -i 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-* && yum update -y + RUN dnf -y update && \ dnf install -y epel-release dnf-plugins-core && \ dnf install -y https://rpms.remirepo.net/enterprise/remi-release-8.rpm && \ @@ -18,16 +20,17 @@ RUN dnf -y update && \ RUN dnf install -y make bash git gcc autoconf libtool automake rpcgen \ ncurses-devel zlib-devel binutils-devel mesa-libGL-devel \ libvdpau-devel mesa-libEGL-devel openssl-devel rpcbind \ - texinfo bison flex python3 which libibverbs libasan \ - cppcheck wget expat-devel xz-devel + texinfo bison flex python3 which libibverbs libibverbs-devel \ + libasan cppcheck wget expat-devel xz-devel mesa-libGLU-devel 
freeglut-devel \ + elfutils-libelf-devel cpio openssl-devel openssl-libs \ + freeimage freeimage-devel ENV LD_LIBRARY_PATH="/usr/local/lib:/usr/local/lib64:${LD_LIBRARY_PATH}" -RUN dnf -y install https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-repo-rhel8-10.2.89-1.x86_64.rpm && \ - dnf --refresh -y install cuda-compiler-10-2 cuda-libraries-dev-10-2 cuda-samples-10-2 cuda-driver-dev-10-2 && \ - ln -s cuda-10.2 /usr/local/cuda && \ +RUN dnf -y config-manager --add-repo http://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo && \ + dnf --refresh -y install cuda-compiler-11-1 cuda-libraries-devel-11-1 cuda-samples-11-1 cuda-driver-devel-11-1 cuda-nvprof-11-1 cuda-nvml-devel-11-1 nvidia-driver-NVML-530.30.02 libcudnn8-devel && \ + ln -s cuda-11.1 /usr/local/cuda && \ ln -s libcuda.so /usr/local/cuda/targets/x86_64-linux/lib/stubs/libcuda.so.1 - ENV PATH="/usr/local/cuda/bin:${PATH}" ENV LIBRARY_PATH="/usr/local/cuda/targets/x86_64-linux/lib/stubs:$(LIBRARY_PATH}"
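For reference, a minimal end-to-end launch sketch for the standalone RPC server introduced in this patch (`cpu/server-exe.c`): the server registers `RPC_CD_PROG` with either the default `RPC_CD_VERS` or a numeric version passed as its only argument, presumably so that several server instances can run side by side. The install prefix, host name, and test binary below are illustrative placeholders, not values defined by this patch; as in the `docs/pytorch.md` example above, the client may additionally need libtirpc on its `LD_LIBRARY_PATH`.
```
# On the GPU host: start the server with the default RPC version ...
./bin/cricket-rpc-server

# ... or with an explicit, unique numeric RPC version (parsed via sscanf "%lu")
./bin/cricket-rpc-server 42

# On the client side: preload the client library and point it at the GPU host
REMOTE_GPU_ADDRESS=gpu-host.example.com \
LD_PRELOAD=./bin/cricket-client.so \
./tests/bin/kernel.testapp
```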