From 66642d8379b1fe0fcb4c4b0e0dcb9109c0dfbbc6 Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Wed, 15 Feb 2023 14:16:16 +0100
Subject: [PATCH 01/83] add perf outputs to gitignore

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 .gitignore | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.gitignore b/.gitignore
index 8c3acb57..814b57cb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -39,3 +39,6 @@ core.*
 compile_commands.json
 tags
 .gdb_history
+
+# perf data
+perf.data

From 403ec5ff024dffcedc15da5dbfb315a78dff223b Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Wed, 15 Feb 2023 14:53:07 +0100
Subject: [PATCH 02/83] fix Makefile errors that made rebuilding in a
 non-clean directory unreliable

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 Makefile            | 2 +-
 cpu/Makefile        | 8 ++++----
 submodules/Makefile | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/Makefile b/Makefile
index 2e5401fc..9007482d 100644
--- a/Makefile
+++ b/Makefile
@@ -19,7 +19,7 @@ cuda-gdb:
 
 libtirpc:
 	@echo -e "\033[36m----> Building libtirpc\033[0m"
-	$(MAKE) -C submodules libtirpc
+	$(MAKE) -C submodules libtirpc/install
 
 gpu: cuda-gdb
 	@echo -e "\033[36m----> Building gpu\033[0m"
diff --git a/cpu/Makefile b/cpu/Makefile
index a03a7fc9..b9c26bc5 100644
--- a/cpu/Makefile
+++ b/cpu/Makefile
@@ -113,19 +113,19 @@ $(SERVER_BIN) : $(OBJ_SERVER) $(SRC_SERVER_EXE:%.c=%.o)
 	$(LD) $(CC_FLAGS) -o $@ $^ $(SERVER_BIN_LD_FLAGS)
 
 $(RPC_H) : $(RPC_DEF)
-	$(RPCGEN) $(RPCGEN_FLAGS) -h -o $@ $<
+	rm $@ && $(RPCGEN) $(RPCGEN_FLAGS) -h -o $@ $<
 
 $(RPC_CLIENT) : $(RPC_DEF)
-	$(RPCGEN) $(RPCGEN_FLAGS) -l -o $@ $<
+	rm $@ && $(RPCGEN) $(RPCGEN_FLAGS) -l -o $@ $<
 
 $(RPC_SERVER) : $(RPC_DEF)
-	$(RPCGEN) $(RPCGEN_FLAGS) -m -o $@ $<
+	rm $@ && $(RPCGEN) $(RPCGEN_FLAGS) -m -o $@ $<
 
 $(RPC_SERVER_MOD) : $(RPC_SERVER)
 	./generate_dispatch.sh
 
 $(RPC_XDR) : $(RPC_DEF)
-	$(RPCGEN) $(RPCGEN_FLAGS) -c -o $@ $<
+	rm $@ && $(RPCGEN) $(RPCGEN_FLAGS) -c -o $@ $<
 
 %.o : %.c $(RPC_H)
 	$(CC) $(CC_FLAGS) -c -fpic -o $@ $< $(LD_FLAGS) 
diff --git a/submodules/Makefile b/submodules/Makefile
index e08c8fc9..27d55680 100644
--- a/submodules/Makefile
+++ b/submodules/Makefile
@@ -10,7 +10,7 @@ clean:
 	cd cuda-gdb && git apply -R ../cuda-gdb.patch
 	rm -rf lib
 
-libtirpc:
+libtirpc/install:
 	@echo -e "\033[36m----> autogen libtirpc\033[0m"
 	if [ ! -f "libtirpc/configure" ]; then cd libtirpc && ./bootstrap; fi
 	@echo -e "\033[36m----> Configuring libtirpc\033[0m"

From 0e13cbfd062e0f10557268c9286ae27735a94c11 Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Wed, 15 Feb 2023 16:31:36 +0100
Subject: [PATCH 03/83] add test program for cuda code loaded using libdl

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 tests/test_apps/Makefile           | 23 ++++++++++++++++++++---
 tests/test_apps/test_kernel_call.c | 24 ++++++++++++++++++++++++
 2 files changed, 44 insertions(+), 3 deletions(-)
 create mode 100644 tests/test_apps/test_kernel_call.c

diff --git a/tests/test_apps/Makefile b/tests/test_apps/Makefile
index 73a1d32c..f33b0fd8 100644
--- a/tests/test_apps/Makefile
+++ b/tests/test_apps/Makefile
@@ -19,14 +19,19 @@ TEST_KERNEL_BIN = kernel.testapp
 TEST_KERNEL_O = test_kernel.o
 BINARY = cricket.testapp
 
+TEST_KERNEL_LIB_O = test_kernel_lib.o
+TEST_KERNEL_LIB = test_kernel.so
+TEST_KERNEL_LIB_CALL_O = test_kernel_call.o
+TEST_KERNEL_LIB_CALL = test_kernel_call
+
 LIBCUDA_WRAPPER = libcuda.so.1
 LIBCUDA_OBJ = libcuda.o
-LIBCUDA_LIBS = -ldl
+LIBCUDA_LIBS = -ldl -I../../cpu
 
 FILES := matmul.cu
 
 .PHONY: all depend clean 
-all : $(TEST_KERNEL_BIN)
+all : $(TEST_KERNEL_BIN) $(BINARY) $(TEST_CPU_BIN) $(TEST_API_BIN) $(TEST_KERNEL_LIB) $(TEST_KERNEL_LIB_CALL)
 
 $(TEST_CPU_O) : $(FILES)
 	$(CC) -DTEST_CPU $(CFLAGS) -dc -o $@ $<
@@ -55,11 +60,23 @@ $(BINARY) : $(FILES)
 $(LIBCUDA_OBJ) : $(LIBCUDA_OBJ:.o=.c)
 	$(HOST_CC) -c -fpic -o $@ $< $(LIBCUDA_LIBS)
 
+$(TEST_KERNEL_LIB_O) : $(FILES)
+	$(CC) -c --compiler-options '-fPIC' -o $@ $<
+
+$(TEST_KERNEL_LIB) : $(TEST_KERNEL_LIB_O)
+	$(LD) -shared -o lib$@ $^
+
+$(TEST_KERNEL_LIB_CALL_O) : $(TEST_KERNEL_LIB_CALL_O:.o=.c)
+	$(HOST_CC) -c -o $@ $<
+
+$(TEST_KERNEL_LIB_CALL) : $(TEST_KERNEL_LIB_CALL_O)
+	$(HOST_LD) -o $@ $< -I. -ldl
+
 $(LIBCUDA_WRAPPER) : $(LIBCUDA_OBJ)
 	$(HOST_LD) -shared -o $@ $^
 
 
 clean :
-	rm -f *.elf *.hex *.o *.d .depend *~ $(BINARY) $(LIBCUDA_WRAPPER) $(TEST_CPU_BIN) $(TEST_API_BIN) $(TEST_KERNEL_BIN)
+	rm -f *.elf *.hex *.o *.d .depend *~ $(BINARY) $(LIBCUDA_WRAPPER) $(TEST_CPU_BIN) $(TEST_API_BIN) $(TEST_KERNEL_BIN) $(TEST_KERNEL_LIB) $(TEST_KERNEL_LIB_CALL) 
 
 
diff --git a/tests/test_apps/test_kernel_call.c b/tests/test_apps/test_kernel_call.c
new file mode 100644
index 00000000..b702b042
--- /dev/null
+++ b/tests/test_apps/test_kernel_call.c
@@ -0,0 +1,24 @@
+#include <dlfcn.h>
+#include <stdio.h>
+
+int main(int argc, char** argv)
+{
+    void *dlhandle;
+
+    if ((dlhandle = dlopen("./libtest_kernel.so", RTLD_LAZY)) == NULL) {
+        printf("error opening library\n");
+        return 1;
+    }
+
+    int (*fn)(void);
+
+    if ((fn = dlsym(dlhandle, "main")) == NULL) {
+        printf("dlsym failed\n");
+        return 1;
+    }
+
+    fn();
+
+    return 0;
+}
+

From cb391b35bf24e56be50141b05cf8f5b2bb8f50be Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Wed, 15 Feb 2023 18:19:15 +0100
Subject: [PATCH 04/83] when the client dlopens libraries containing cuda
 kernels, also open them at the server.

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 cpu/Makefile             |   8 +-
 cpu/cpu-client.c         | 188 +++++++++++++++++++++------------------
 cpu/cpu-server-runtime.c |  21 +++++
 cpu/cpu-server.c         |  22 +++--
 cpu/cpu-utils.c          |   1 +
 cpu/cpu_rpc_prot.x       |   1 +
 tests/test_apps/Makefile |   4 +-
 7 files changed, 145 insertions(+), 100 deletions(-)

diff --git a/cpu/Makefile b/cpu/Makefile
index b9c26bc5..3ebb1491 100644
--- a/cpu/Makefile
+++ b/cpu/Makefile
@@ -113,19 +113,19 @@ $(SERVER_BIN) : $(OBJ_SERVER) $(SRC_SERVER_EXE:%.c=%.o)
 	$(LD) $(CC_FLAGS) -o $@ $^ $(SERVER_BIN_LD_FLAGS)
 
 $(RPC_H) : $(RPC_DEF)
-	rm $@ && $(RPCGEN) $(RPCGEN_FLAGS) -h -o $@ $<
+	rm -f $@ && $(RPCGEN) $(RPCGEN_FLAGS) -h -o $@ $<
 
 $(RPC_CLIENT) : $(RPC_DEF)
-	rm $@ && $(RPCGEN) $(RPCGEN_FLAGS) -l -o $@ $<
+	rm -f $@ && $(RPCGEN) $(RPCGEN_FLAGS) -l -o $@ $<
 
 $(RPC_SERVER) : $(RPC_DEF)
-	rm $@ && $(RPCGEN) $(RPCGEN_FLAGS) -m -o $@ $<
+	rm -f $@ && $(RPCGEN) $(RPCGEN_FLAGS) -m -o $@ $<
 
 $(RPC_SERVER_MOD) : $(RPC_SERVER)
 	./generate_dispatch.sh
 
 $(RPC_XDR) : $(RPC_DEF)
-	rm $@ && $(RPCGEN) $(RPCGEN_FLAGS) -c -o $@ $<
+	rm -f $@ && $(RPCGEN) $(RPCGEN_FLAGS) -c -o $@ $<
 
 %.o : %.c $(RPC_H)
 	$(CC) $(CC_FLAGS) -c -fpic -o $@ $< $(LD_FLAGS) 
diff --git a/cpu/cpu-client.c b/cpu/cpu-client.c
index 45f92d51..80890caf 100644
--- a/cpu/cpu-client.c
+++ b/cpu/cpu-client.c
@@ -210,95 +210,6 @@ void __attribute__ ((destructor)) deinit_rpc(void)
     }
 }
 
-void __cudaRegisterVar(void **fatCubinHandle, char *hostVar, char *deviceAddress, const char *deviceName, int ext, size_t size, int constant, int global)
-{
-}
-
-void __cudaRegisterFunction(void **fatCubinHandle, const char *hostFun, char *deviceFun,
-                            const char *deviceName, int thread_limit, uint3 *tid,
-                            uint3 *bid, dim3 *bDim, dim3 *gDim, int *wSize)
-{
-    int result;
-    enum clnt_stat retval_1;
-
-    printf("__cudaRegisterFunction(fatCubinHandle=%p, hostFun=%p, devFunc=%s, deviceName=%s, thread_limit=%d, tid=[%p], bid=[%p], bDim=[%p], gDim=[%p], wSize=%p)\n", fatCubinHandle, hostFun, deviceFun, deviceName, thread_limit, tid, bid, bDim, gDim, wSize);
-
-    kernel_info_t *info = cricketd_utils_search_info(&kernel_infos, (char*)deviceName);
-    if (info == NULL) {
-        LOGE(LOG_ERROR, "request to register unknown function: \"%s\"", deviceName);
-        retval_1 = cuda_register_function_1((ptr)fatCubinHandle, (ptr)hostFun, deviceFun, (char*)deviceName, thread_limit, &result, clnt);
-        if (retval_1 != RPC_SUCCESS) {
-            LOGE(LOG_ERROR, "call failed.");
-        }
-
-        return;
-    }
-    info->host_fun = (void*)hostFun;
-
-    if (retval_1 != RPC_SUCCESS) {
-        clnt_perror (clnt, "call failed");
-    }
-}
-
-struct __fatCubin {
-    uint32_t magic;
-    uint32_t seq;
-    uint64_t text;
-    uint64_t data;
-    uint64_t ptr;
-    uint64_t ptr2;
-    uint64_t zero;
-};
-
-struct rpc_fatCubin {
-    uint32_t magic;
-    uint32_t seq;
-    uint64_t text;
-    uint64_t data;
-    uint64_t ptr;
-    uint64_t ptr2;
-    uint64_t zero;
-};
-
-void** __cudaRegisterFatBinary(void *fatCubin)
-{
-    ptr_result result;
-    enum clnt_stat retval_1;
-
-    struct __fatCubin *fat = (struct __fatCubin*)((fatCubin));
-    struct rpc_fatCubin rpc_fat = {.magic = fat->magic,
-                                   .seq   = fat->seq,
-                                   .text  = fat->text,
-                                   .data  = fat->data,
-                                   .ptr   = fat->ptr,
-                                   .ptr2  = fat->ptr2,
-                                   .zero  = fat->zero};
-    LOGE(LOG_DEBUG, "__cudaRegisterFatBinary");
-    //printf("__cudaRegisterFatBinary(magic: %x, seq: %x, text: %lx, data: %lx, ptr: %lx, ptr2: %lx, zero: %lx\n",
-    //       fat->magic, fat->seq, fat->text, fat->data, fat->ptr, fat->ptr2, fat->zero);
-    retval_1 = RPC_SUCCESS;//cuda_register_fat_binary_1(rpc_fat, &result, clnt);
-    if (retval_1 != RPC_SUCCESS) {
-        clnt_perror (clnt, "call failed");
-    }
-    if (result.err != 0) {
-        return NULL;
-    }
-    return (void*)result.ptr_result_u.ptr;
-}
-
-void __cudaRegisterFatBinaryEnd(void **fatCubinHandle)
-{
-    int result;
-    enum clnt_stat retval_1;
-
-    //printf("__cudaRegisterFatBinaryEnd(fatCubinHandle=%p)\n", fatCubinHandle);
-
-    retval_1 = RPC_SUCCESS;//cuda_register_fat_binary_end_1((uint64_t)fatCubinHandle, &result, clnt);
-    if (retval_1 != RPC_SUCCESS) {
-        clnt_perror (clnt, "call failed");
-    }
-}
-
 static void *(*dlopen_orig)(const char *, int) = NULL;
 static int   (*dlclose_orig)(void *) = NULL;
 static void *dl_handle = NULL;
@@ -323,6 +234,15 @@ void *dlopen(const char *filename, int flag)
         LOGE(LOG_DEBUG, "request to dlopen \"%s\"", filename);
         if (cpu_utils_contains_kernel(filename) == 0) {
             LOGE(LOG_ERROR, "file does not contain a kernel");
+        } else {
+            LOGE(LOG_DEBUG, "file contains a kernel");
+            int result;
+            enum clnt_stat retval_1;
+            retval_1 = rpc_dlopen_1((char*)filename, &result, clnt);
+            if (retval_1 != RPC_SUCCESS) {
+                LOGE(LOG_ERROR, "error calling rpc_dlopen");
+            }
+            cpu_utils_parameter_info(&kernel_infos, (char*)filename);
         }
         return dlopen_orig(filename, flag);
     }
@@ -349,4 +269,94 @@ int dlclose(void *handle)
 
 }
 
+// void __cudaRegisterVar(void **fatCubinHandle, char *hostVar, char *deviceAddress, const char *deviceName, int ext, size_t size, int constant, int global)
+// {
+// }
+
+void __cudaRegisterFunction(void **fatCubinHandle, const char *hostFun, char *deviceFun,
+                            const char *deviceName, int thread_limit, uint3 *tid,
+                            uint3 *bid, dim3 *bDim, dim3 *gDim, int *wSize)
+{
+    int result;
+    enum clnt_stat retval_1;
+
+    printf("__cudaRegisterFunction(fatCubinHandle=%p, hostFun=%p, devFunc=%s, deviceName=%s, thread_limit=%d, tid=[%p], bid=[%p], bDim=[%p], gDim=[%p], wSize=%p)\n", fatCubinHandle, hostFun, deviceFun, deviceName, thread_limit, tid, bid, bDim, gDim, wSize);
+
+    kernel_info_t *info = cricketd_utils_search_info(&kernel_infos, (char*)deviceName);
+    if (info == NULL) {
+        LOGE(LOG_ERROR, "request to register unknown function: \"%s\"", deviceName);
+
+
+        return;
+    } else {
+        LOGE(LOG_DEBUG, "request to register known function: \"%s\"", deviceName);
+        retval_1 = cuda_register_function_1((ptr)fatCubinHandle, (ptr)hostFun, deviceFun, (char*)deviceName, thread_limit, &result, clnt);
+        if (retval_1 != RPC_SUCCESS) {
+            LOGE(LOG_ERROR, "call failed.");
+        }
+        info->host_fun = (void*)hostFun;
+    }
+}
+
+// struct __fatCubin {
+//     uint32_t magic;
+//     uint32_t seq;
+//     uint64_t text;
+//     uint64_t data;
+//     uint64_t ptr;
+//     uint64_t ptr2;
+//     uint64_t zero;
+// };
+
+// struct rpc_fatCubin {
+//     uint32_t magic;
+//     uint32_t seq;
+//     uint64_t text;
+//     uint64_t data;
+//     uint64_t ptr;
+//     uint64_t ptr2;
+//     uint64_t zero;
+// };
+
+// void** __cudaRegisterFatBinary(void *fatCubin)
+// {
+//     ptr_result result;
+//     enum clnt_stat retval_1;
+
+//     struct __fatCubin *fat = (struct __fatCubin*)((fatCubin));
+//     struct rpc_fatCubin rpc_fat = {.magic = fat->magic,
+//                                    .seq   = fat->seq,
+//                                    .text  = fat->text,
+//                                    .data  = fat->data,
+//                                    .ptr   = fat->ptr,
+//                                    .ptr2  = fat->ptr2,
+//                                    .zero  = fat->zero};
+//     LOGE(LOG_DEBUG, "__cudaRegisterFatBinary");
+//     //printf("__cudaRegisterFatBinary(magic: %x, seq: %x, text: %lx, data: %lx, ptr: %lx, ptr2: %lx, zero: %lx\n",
+//     //       fat->magic, fat->seq, fat->text, fat->data, fat->ptr, fat->ptr2, fat->zero);
+//     retval_1 = RPC_SUCCESS;//cuda_register_fat_binary_1(rpc_fat, &result, clnt);
+//     if (retval_1 != RPC_SUCCESS) {
+//         clnt_perror (clnt, "call failed");
+//     }
+//     if (result.err != 0) {
+//         return NULL;
+//     }
+//     return (void*)result.ptr_result_u.ptr;
+// }
+
+// void __cudaRegisterFatBinaryEnd(void **fatCubinHandle)
+// {
+//     int result;
+//     enum clnt_stat retval_1;
+
+//     //printf("__cudaRegisterFatBinaryEnd(fatCubinHandle=%p)\n", fatCubinHandle);
+
+//     retval_1 = RPC_SUCCESS;//cuda_register_fat_binary_end_1((uint64_t)fatCubinHandle, &result, clnt);
+//     if (retval_1 != RPC_SUCCESS) {
+//         clnt_perror (clnt, "call failed");
+//     }
+// }
+
+
+
 
diff --git a/cpu/cpu-server-runtime.c b/cpu/cpu-server-runtime.c
index 87780856..930a7e3b 100644
--- a/cpu/cpu-server-runtime.c
+++ b/cpu/cpu-server-runtime.c
@@ -133,6 +133,27 @@ int server_runtime_restore(const char *path)
     return 0;
 }
 
+
+/** implementation for CUDA_REGISTER_FUNCTION(ptr, str, str, str, int)
+ *
+ */
+bool_t cuda_register_function_1_svc(ptr fatCubinHandle, ptr hostFun, char* deviceFun, char* deviceName, int thread_limit, int* result, struct svc_req *rqstp)
+{
+    LOGE(LOG_DEBUG, "cudaRegisterFunction(%p, %p, %s, %s, %d)", fatCubinHandle, hostFun, deviceFun, deviceName, thread_limit);
+    // __cudaRegisterFunction(&fatCubinHandle, hostFun, deviceFun,
+    //                         deviceName, thread_limit, &tid, &bid, &bDim, &gDim, &wSize);
+
+    // LOGE(LOG_DEBUG, "-> %p, {%d, %d, %d}, {%d, %d, %d}, {%d, %d, %d}, {%d, %d, %d}, %d)",
+    //                 fatCubinHandle, 
+    //                 tid.x, tid.y, tid.z,
+    //                 bid.x, bid.y, bid.z,
+    //                 bDim.x, bDim.y, bDim.z,
+    //                 gDim.x, gDim.y, gDim.z,
+    //                 wSize);
+    *result = 0;
+    return 1;
+}
+
 /* ############### RUNTIME API ############### */
 /* ### Device Management ### */
 bool_t cuda_choose_device_1_svc(mem_data prop, int_result *result, struct svc_req *rqstp)
diff --git a/cpu/cpu-server.c b/cpu/cpu-server.c
index 460edc45..44a29ae0 100644
--- a/cpu/cpu-server.c
+++ b/cpu/cpu-server.c
@@ -5,6 +5,7 @@
 #include <signal.h> //sigaction
 #include <sys/types.h>
 #include <sys/stat.h>
+#include <dlfcn.h>
 
 #include "cpu-server.h"
 #include "cpu_rpc_prot.h"
@@ -109,16 +110,27 @@ bool_t rpc_checkpoint_1_svc(int *result, struct svc_req *rqstp)
     return ret == 0;
 }
 
-/** implementation for CUDA_REGISTER_FUNCTION(ptr, str, str, str, int)
- *
- */
-bool_t cuda_register_function_1_svc(ptr fatCubinHandle, ptr hostFun, char* deviceFun, char* deviceName, int thread_limit, int* result, struct svc_req *rqstp)
+bool_t rpc_dlopen_1_svc(char *path, int *result, struct svc_req *rqstp)
 {
-    LOGE(LOG_DEBUG, "cudaRegisterFunction(%p, %p, %s, %s, %d)", fatCubinHandle, hostFun, deviceFun, deviceName, thread_limit);
+    void *dlhandle;
+
+    if (path == NULL) {
+        LOGE(LOG_ERROR, "path is NULL");
+        *result = 1;
+        return 1;
+    }
+    if ((dlhandle = dlopen(path, RTLD_LAZY)) == NULL) {
+        LOGE(LOG_ERROR, "error opening \"%s\": %s. Make sure libraries are present.", path, dlerror());
+        *result = 1;
+        return 1;
+    } else {
+        LOG(LOG_INFO, "dlopened \"%s\"", path);
+    }
     *result = 0;
     return 1;
 }
 
+
 void cricket_main_hash(char* app_command)
 {
     cricket_main(app_command, 0, 0);
diff --git a/cpu/cpu-utils.c b/cpu/cpu-utils.c
index 15db60b5..e7a67cdd 100644
--- a/cpu/cpu-utils.c
+++ b/cpu/cpu-utils.c
@@ -364,6 +364,7 @@ int cpu_utils_contains_kernel(const char *path)
             // Line does not start with .nv.info. so continue searching.
             continue;
         }*/
+        line[strlen(line)-1] = '\0';
         LOGE(LOG_DEBUG, "output: \"%s\"", line);
     }
     ret = 0;
diff --git a/cpu/cpu_rpc_prot.x b/cpu/cpu_rpc_prot.x
index f5c405e4..be3b7d2a 100644
--- a/cpu/cpu_rpc_prot.x
+++ b/cpu/cpu_rpc_prot.x
@@ -120,6 +120,7 @@ program RPC_CD_PROG {
         int          rpc_checkpoint(void)                                         = 0;
         int          rpc_deinit(void)                                             = 1;
         int          rpc_printmessage(string)                                     = 2;
+        int          rpc_dlopen(string)                                           = 3;
         int          CUDA_REGISTER_FUNCTION(ptr, ptr, string, string, int)       = 50;
 
         /* RUNTIME API */
diff --git a/tests/test_apps/Makefile b/tests/test_apps/Makefile
index f33b0fd8..0acd9513 100644
--- a/tests/test_apps/Makefile
+++ b/tests/test_apps/Makefile
@@ -61,10 +61,10 @@ $(LIBCUDA_OBJ) : $(LIBCUDA_OBJ:.o=.c)
 	$(HOST_CC) -c -fpic -o $@ $< $(LIBCUDA_LIBS)
 
 $(TEST_KERNEL_LIB_O) : $(FILES)
-	$(CC) -c --compiler-options '-fPIC' -o $@ $<
+	$(CC) $(CFLAGS) -dc --compiler-options '-fPIC' -o $@ $<
 
 $(TEST_KERNEL_LIB) : $(TEST_KERNEL_LIB_O)
-	$(LD) -shared -o lib$@ $^
+	$(LD) $(LDFLAGS) -shared -o lib$@ $^
 
 $(TEST_KERNEL_LIB_CALL_O) : $(TEST_KERNEL_LIB_CALL_O:.o=.c)
 	$(HOST_CC) -c -o $@ $<

From 905fefeb9172c2ab3da2fa57887f799e96ef683d Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Thu, 16 Feb 2023 22:55:50 +0100
Subject: [PATCH 05/83] add decoding of fatbinary data

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 cpu/cpu-client.c                   |  88 ++++++++++----------
 cpu/cpu-server-runtime.c           |  29 ++++++-
 cpu/cpu-server.c                   |  31 ++++++-
 cpu/cpu-utils.c                    | 125 +++++++++++++++++++++++++++--
 cpu/cpu-utils.h                    |  15 +++-
 cpu/resource-mg.h                  |   1 +
 tests/cpu/cubin/Makefile           |   6 +-
 tests/cpu/cubin/main.cpp           |  22 ++++-
 tests/test_apps/Makefile           |   4 +-
 tests/test_apps/test_kernel_call.c |   2 +
 10 files changed, 260 insertions(+), 63 deletions(-)

diff --git a/cpu/cpu-client.c b/cpu/cpu-client.c
index 80890caf..23523d5b 100644
--- a/cpu/cpu-client.c
+++ b/cpu/cpu-client.c
@@ -3,6 +3,7 @@
 #include <stdlib.h>
 #include <cuda.h>
 #include <driver_types.h>
+#include <link.h>
 
 //For TCP socket
 #include <sys/socket.h>
@@ -216,6 +217,8 @@ static void *dl_handle = NULL;
 
 void *dlopen(const char *filename, int flag)
 {
+    void *ret = NULL;
+    struct link_map *map;
     LOG(LOG_DEBUG, "intercepted dlopen(%s, %d)", filename, flag);
     if (dlopen_orig == NULL) {
         if ( (dlopen_orig = dlsym(RTLD_NEXT, "dlopen")) == NULL) {
@@ -244,7 +247,10 @@ void *dlopen(const char *filename, int flag)
             }
             cpu_utils_parameter_info(&kernel_infos, (char*)filename);
         }
-        return dlopen_orig(filename, flag);
+        ret = dlopen_orig(filename, flag);
+        if (ret == NULL || dlinfo(ret, RTLD_DI_LINKMAP, &map) != 0) return ret; // load or lookup failed: map is not valid, do not dereference it
+        LOGE(LOG_DEBUG, "dlopen \"%s\" to  %p", filename, (void*)map->l_addr);
+        return ret;
     }
 }
 
@@ -298,51 +304,43 @@ void __cudaRegisterFunction(void **fatCubinHandle, const char *hostFun, char *de
     }
 }
 
-// struct __fatCubin {
-//     uint32_t magic;
-//     uint32_t seq;
-//     uint64_t text;
-//     uint64_t data;
-//     uint64_t ptr;
-//     uint64_t ptr2;
-//     uint64_t zero;
-// };
-
-// struct rpc_fatCubin {
-//     uint32_t magic;
-//     uint32_t seq;
-//     uint64_t text;
-//     uint64_t data;
-//     uint64_t ptr;
-//     uint64_t ptr2;
-//     uint64_t zero;
-// };
-
-// void** __cudaRegisterFatBinary(void *fatCubin)
-// {
-//     ptr_result result;
-//     enum clnt_stat retval_1;
+struct rpc_fatCubin {
+    uint32_t magic;
+    uint32_t seq;
+    uint64_t text;
+    uint64_t data;
+    uint64_t ptr;
+    uint64_t ptr2;
+    uint64_t zero;
+};
+
+void** __cudaRegisterFatBinary(void *fatCubin)
+{
+    ptr_result result;
+    enum clnt_stat retval_1;
 
-//     struct __fatCubin *fat = (struct __fatCubin*)((fatCubin));
-//     struct rpc_fatCubin rpc_fat = {.magic = fat->magic,
-//                                    .seq   = fat->seq,
-//                                    .text  = fat->text,
-//                                    .data  = fat->data,
-//                                    .ptr   = fat->ptr,
-//                                    .ptr2  = fat->ptr2,
-//                                    .zero  = fat->zero};
-//     LOGE(LOG_DEBUG, "__cudaRegisterFatBinary");
-//     //printf("__cudaRegisterFatBinary(magic: %x, seq: %x, text: %lx, data: %lx, ptr: %lx, ptr2: %lx, zero: %lx\n",
-//     //       fat->magic, fat->seq, fat->text, fat->data, fat->ptr, fat->ptr2, fat->zero);
-//     retval_1 = RPC_SUCCESS;//cuda_register_fat_binary_1(rpc_fat, &result, clnt);
-//     if (retval_1 != RPC_SUCCESS) {
-//         clnt_perror (clnt, "call failed");
-//     }
-//     if (result.err != 0) {
-//         return NULL;
-//     }
-//     return (void*)result.ptr_result_u.ptr;
-// }
+    if (cpu_utils_get_fatbin_info((struct __fatCubin*)fatCubin) != 0) {
+        LOGE(LOG_ERROR, "error getting fatbin info");
+    }
+
+    struct __fatCubin *fat = (struct __fatCubin*)((fatCubin));
+    struct rpc_fatCubin rpc_fat = {.magic = fat->magic,
+                                   .seq   = fat->seq,
+                                   .text  = fat->text,
+                                   .data  = fat->data,
+                                   .ptr   = fat->ptr,
+                                   .ptr2  = fat->ptr2,
+                                   .zero  = fat->zero};
+
+    retval_1 = RPC_SUCCESS;//cuda_register_fat_binary_1(rpc_fat, &result, clnt);
+    if (retval_1 != RPC_SUCCESS) {
+        clnt_perror (clnt, "call failed");
+    }
+    if (result.err != 0) {
+        return NULL;
+    }
+    return (void*)result.ptr_result_u.ptr;
+}
 
 // void __cudaRegisterFatBinaryEnd(void **fatCubinHandle)
 // {
diff --git a/cpu/cpu-server-runtime.c b/cpu/cpu-server-runtime.c
index 930a7e3b..8c6c8ff5 100644
--- a/cpu/cpu-server-runtime.c
+++ b/cpu/cpu-server-runtime.c
@@ -2,6 +2,7 @@
 #include <cuda_runtime_api.h>
 #include <cuda.h>
 #include <driver_types.h>
+#include <dlfcn.h>
 
 //for strerror
 #include <string.h>
@@ -70,6 +71,9 @@ int server_runtime_init(int restore)
         ret &= resource_mg_init(&rm_events, 1);
         ret &= resource_mg_init(&rm_arrays, 1);
         ret &= resource_mg_init(&rm_memory, 1);
+         // We cannot bypass this RM, because we need translations when a kernel in
+         // a shared object is launched.
+        ret &= resource_mg_init(&rm_kernels, 0);
         ret &= cusolver_init(1, &rm_streams, &rm_memory);
         ret &= cublas_init(1, &rm_memory);
     } else {
@@ -77,6 +81,7 @@ int server_runtime_init(int restore)
         ret &= resource_mg_init(&rm_events, 0);
         ret &= resource_mg_init(&rm_arrays, 0);
         ret &= resource_mg_init(&rm_memory, 0);
+        ret &= resource_mg_init(&rm_kernels, 0);
         ret &= cusolver_init(0, &rm_streams, &rm_memory);
         ret &= cublas_init(0, &rm_memory);
         ret &= server_runtime_restore("ckp");
@@ -90,6 +95,7 @@ int server_runtime_deinit(void)
     resource_mg_free(&rm_events);
     resource_mg_free(&rm_arrays);
     resource_mg_free(&rm_memory);
+    resource_mg_free(&rm_kernels);
     cusolver_deinit();
     cublas_deinit();
     list_free(&mt_memcpy_list);
@@ -140,6 +146,21 @@ int server_runtime_restore(const char *path)
 bool_t cuda_register_function_1_svc(ptr fatCubinHandle, ptr hostFun, char* deviceFun, char* deviceName, int thread_limit, int* result, struct svc_req *rqstp)
 {
     LOGE(LOG_DEBUG, "cudaRegisterFunction(%p, %p, %s, %s, %d)", fatCubinHandle, hostFun, deviceFun, deviceName, thread_limit);
+    
+    void (*serverFun)(void);
+
+    if ( (serverFun = dlsym(RTLD_NEXT, "dlopen")) == NULL) {
+        LOGE(LOG_ERROR, "failed to get dlopen %s", dlerror());
+        *result = 1;
+        return 1;
+    }
+    
+    if (resource_mg_add_sorted(&rm_kernels, (void*)hostFun, serverFun) != 0) {
+        LOGE(LOG_ERROR, "failed to add kernel to resource manager");
+        *result = 1;
+        return 1;
+    }
+    LOGE(LOG_DEBUG, "added kernel %p->%p to resource manager", hostFun, serverFun);
     // __cudaRegisterFunction(&fatCubinHandle, hostFun, deviceFun,
     //                         deviceName, thread_limit, &tid, &bid, &bDim, &gDim, &wSize);
 
@@ -791,7 +812,7 @@ bool_t cuda_launch_cooperative_kernel_1_svc(ptr func, rpc_dim3 gridDim, rpc_dim3
     LOGE(LOG_DEBUG, "cudaLaunchCooperativeKernel(func=%p, gridDim=[%d,%d,%d], blockDim=[%d,%d,%d], args=%p, sharedMem=%d, stream=%p)", func, cuda_gridDim.x, cuda_gridDim.y, cuda_gridDim.z, cuda_blockDim.x, cuda_blockDim.y, cuda_blockDim.z, cuda_args, sharedMem, (void*)stream);
 
     *result = cudaLaunchCooperativeKernel(
-      (void*)func,
+      resource_mg_get(&rm_kernels, (void*)func),
       cuda_gridDim,
       cuda_blockDim,
       cuda_args,
@@ -830,7 +851,7 @@ bool_t cuda_launch_cooperative_kernel_multi_device_1_svc(ptr func, rpc_dim3 grid
     LOGE(LOG_DEBUG, "cudaLaunchCooperativeKernelMultiDevice(func=%p, gridDim=[%d,%d,%d], blockDim=[%d,%d,%d], args=%p, sharedMem=%d, stream=%p)", func, cuda_gridDim.x, cuda_gridDim.y, cuda_gridDim.z, cuda_blockDim.x, cuda_blockDim.y, cuda_blockDim.z, cuda_args, sharedMem, (void*)stream);
     lp.args = cuda_args;
     lp.blockDim = cuda_blockDim;
-    lp.func = (void*)func;
+    lp.func = resource_mg_get(&rm_kernels, (void*)func);
     lp.gridDim = cuda_gridDim;
     lp.sharedMem = sharedMem;
     lp.stream = resource_mg_get(&rm_streams, (void*)stream);
@@ -869,10 +890,10 @@ bool_t cuda_launch_kernel_1_svc(ptr func, rpc_dim3 gridDim, rpc_dim3 blockDim,
         LOGE(LOG_DEBUG, "arg: %p (%d)", *(void**)cuda_args[i], *(int*)cuda_args[i]);
     }
 
-    LOGE(LOG_DEBUG, "cudaLaunchKernel(func=%p, gridDim=[%d,%d,%d], blockDim=[%d,%d,%d], args=%p, sharedMem=%d, stream=%p)", func, cuda_gridDim.x, cuda_gridDim.y, cuda_gridDim.z, cuda_blockDim.x, cuda_blockDim.y, cuda_blockDim.z, cuda_args, sharedMem, (void*)stream);
+    LOGE(LOG_DEBUG, "cudaLaunchKernel(func=%p, gridDim=[%d,%d,%d], blockDim=[%d,%d,%d], args=%p, sharedMem=%d, stream=%p)", resource_mg_get(&rm_kernels, (void*)func), cuda_gridDim.x, cuda_gridDim.y, cuda_gridDim.z, cuda_blockDim.x, cuda_blockDim.y, cuda_blockDim.z, cuda_args, sharedMem, (void*)stream);
 
     *result = cudaLaunchKernel(
-      (void*)func,
+      resource_mg_get(&rm_kernels, (void*)func),
       cuda_gridDim,
       cuda_blockDim,
       cuda_args,
diff --git a/cpu/cpu-server.c b/cpu/cpu-server.c
index 44a29ae0..1170aea8 100644
--- a/cpu/cpu-server.c
+++ b/cpu/cpu-server.c
@@ -1,3 +1,4 @@
+#define _GNU_SOURCE
 #include <stdlib.h>
 #include <stdio.h>
 #include <sys/socket.h>
@@ -6,6 +7,7 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <dlfcn.h>
+#include <link.h>
 
 #include "cpu-server.h"
 #include "cpu_rpc_prot.h"
@@ -110,6 +112,30 @@ bool_t rpc_checkpoint_1_svc(int *result, struct svc_req *rqstp)
     return ret == 0;
 }
 
+/* Calls the CUDA registration function "_ZL24__sti____cudaRegisterAllv" of the
+* shared object at path. The symbol table yields an offset (e.g. 0x4016c8), which
+* is relocated by the load address of dlhandle before the call. */
+void cricket_so_register(void* dlhandle, char *path)
+{
+    struct link_map *map = NULL;
+    void (*cudaRegisterAllv)(void) =
+        (void(*)(void)) cricketd_utils_symbol_address(path, "_ZL24__sti____cudaRegisterAllv");
+
+    // check the raw lookup result: after adding l_addr it is never NULL
+    if (cudaRegisterAllv == NULL) {
+        LOGE(LOG_WARNING, "could not find cudaRegisterAllv initialization function in cubin. Kernels cannot be launched without it!");
+        return;
+    }
+    if (dlinfo(dlhandle, RTLD_DI_LINKMAP, &map) != 0 || map == NULL) {
+        LOGE(LOG_ERROR, "could not get load address of \"%s\": %s", path, dlerror());
+        return;
+    }
+    LOG(LOG_INFO, "found CUDA initialization function at %p + %p = %p",
+        (void*)map->l_addr, cudaRegisterAllv, map->l_addr + cudaRegisterAllv);
+    cudaRegisterAllv += map->l_addr;  // add load location of library to offset in symbol table
+    cudaRegisterAllv();
+}
+
 bool_t rpc_dlopen_1_svc(char *path, int *result, struct svc_req *rqstp)
 {
     void *dlhandle;
@@ -125,6 +151,9 @@ bool_t rpc_dlopen_1_svc(char *path, int *result, struct svc_req *rqstp)
         return 1;
     } else {
         LOG(LOG_INFO, "dlopened \"%s\"", path);
+
+       //cricket_so_register(dlhandle, path);
+
     }
     *result = 0;
     return 1;
@@ -260,7 +289,7 @@ void cricket_main(char* app_command, size_t prog_num, size_t vers_num)
      * Address of "_ZL24__sti____cudaRegisterAllv" in static symbol table is e.g. 0x4016c8
      */
     void (*cudaRegisterAllv)(void) =
-        (void(*)(void)) cricketd_utils_symbol_address("_ZL24__sti____cudaRegisterAllv");
+        (void(*)(void)) cricketd_utils_symbol_address(NULL, "_ZL24__sti____cudaRegisterAllv");
     LOG(LOG_INFO, "found CUDA initialization function at %p", cudaRegisterAllv);
     if (cudaRegisterAllv == NULL) {
         LOGE(LOG_WARNING, "could not find cudaRegisterAllv initialization function in cubin. Kernels cannot be launched without it!");
diff --git a/cpu/cpu-utils.c b/cpu/cpu-utils.c
index e7a67cdd..d99d6858 100644
--- a/cpu/cpu-utils.c
+++ b/cpu/cpu-utils.c
@@ -23,6 +23,113 @@
 #define CRICKET_ELF_FATBIN ".nv_fatbin"
 #define CRICKET_ELF_REGFUN "_ZL24__sti____cudaRegisterAllv"
 
+#define FATBIN_STRUCT_MAGIC 0x466243b1
+#define FATBIN_TEXT_MAGIC   0xBA55ED50
+
+struct  __attribute__((__packed__)) fat_text_header
+{
+  unsigned int           magic;
+  unsigned short         version;
+  unsigned short         header_size;
+  unsigned long long int fat_size;
+};
+
+#define FATBIN_FLAG_64BIT     0x0000000000000001LL
+#define FATBIN_FLAG_DEBUG     0x0000000000000002LL
+#define FATBIN_FLAG_LINUX     0x0000000000000010LL
+#define FATBIN_FLAG_COMPRESS  0x0000000000002000LL
+
+static int cricket_fatbin_flag_to_str(char** str, uint64_t flag)
+{
+    return asprintf(str, "64Bit: %s, Debug: %s, Linux: %s, Compress %s",
+        (flag & FATBIN_FLAG_64BIT) ? "yes" : "no",
+        (flag & FATBIN_FLAG_DEBUG) ? "yes" : "no",
+        (flag & FATBIN_FLAG_LINUX) ? "yes" : "no",
+        (flag & FATBIN_FLAG_COMPRESS) ? "yes" : "no");
+}
+
+int cpu_utils_get_fatbin_info(struct __fatCubin *fatbin)
+{
+    void *fatbin_ptr = NULL, *fatbin_elfinfo = NULL;
+    struct fat_text_header* fatbin_text = NULL;
+    if (fatbin == NULL) {
+        LOGE(LOG_ERROR, "fatbin is NULL");
+        return -1;
+    }
+    if (fatbin->magic != FATBIN_STRUCT_MAGIC) {
+        LOGE(LOG_ERROR, "fatbin struct magic number is wrong. Got %llx, expected %llx.", fatbin->magic, FATBIN_STRUCT_MAGIC);
+        return -1;
+    }
+    LOG(LOG_DEBUG, "Fatbin: magic: %x, version: %x, text: %lx, data: %lx, ptr: %lx, ptr2: %lx, zero: %lx",
+           fatbin->magic, fatbin->version, fatbin->text, fatbin->data, fatbin->ptr, fatbin->ptr2, fatbin->zero);
+
+    fatbin_text = (struct fat_text_header*)fatbin->text;
+    if (fatbin_text->magic != FATBIN_TEXT_MAGIC) {
+        LOGE(LOG_ERROR, "fatbin text magic number is wrong. Got %x, expected %x.", *((uint32_t*)fatbin_text), FATBIN_TEXT_MAGIC);
+        return -1;
+    }
+    LOG(LOG_DEBUG, "Fatbin.text: magic: %x, version: %d, header_size: %p, fat_size: %p",
+        fatbin_text->magic, fatbin_text->version, fatbin_text->header_size, fatbin_text->fat_size);
+
+    if (fatbin_text->version != 1 || fatbin_text->header_size != sizeof(struct fat_text_header)) {
+        LOGE(LOG_ERROR, "fatbin text version is wrong or header size is inconsistent.\
+            This is a sanity check to avoid reading a new fatbinary format");
+        return -1;
+    }
+    fatbin_ptr = fatbin_elfinfo = (void*)fatbin_text + fatbin_text->header_size;
+
+    struct __attribute__((__packed__)) fatbin_header {
+        short kind;
+        short unknown1;
+        uint32_t header_size;
+        uint64_t fatbin_size;
+        uint64_t some_offset;
+        short minor;
+        short major;
+        uint32_t arch;
+        uint32_t obj_name_offset;
+        uint32_t obj_name_len;
+        uint64_t flags;
+        uint64_t zero;
+        uint64_t unknown2;
+    } *fatbin_header = (struct fatbin_header*)(fatbin_ptr);
+    LOGE(LOG_DEBUG, "Fatbin header: fatbin_offset: %#x, header_size %#x, fatbin_size %#x, some_offset %#x.\
+        minor %#x, major %#x, arch %d, flags %#x",
+        fatbin_header->kind,
+        fatbin_header->header_size,
+        fatbin_header->fatbin_size,
+        fatbin_header->some_offset,
+        fatbin_header->minor,
+        fatbin_header->major,
+        fatbin_header->arch,
+        fatbin_header->flags);
+    LOGE(LOG_DEBUG, "unknown fields: unknown1: %#x, unknown2: %#x, zeros: %#x",
+        fatbin_header->unknown1,
+        fatbin_header->unknown2,
+        fatbin_header->zero);
+    fatbin_ptr += sizeof(struct fatbin_header);
+
+    char *flag_str = NULL;
+    cricket_fatbin_flag_to_str(&flag_str, fatbin_header->flags);
+    LOGE(LOG_DEBUG, "Fatbin flags: %s", flag_str);
+    free(flag_str);
+
+    if(fatbin_header->obj_name_offset != 0) {
+        if (((char*)fatbin_elfinfo)[fatbin_header->obj_name_offset + fatbin_header->obj_name_len] != '\0') {
+            LOGE(LOG_DEBUG, "Fatbin object name is not null terminated");
+        } else {
+            char *obj_name = (char*)fatbin_elfinfo + fatbin_header->obj_name_offset;
+            LOGE(LOG_DEBUG, "Fatbin object name: %s (len:%#x)", obj_name, fatbin_header->obj_name_len);
+        }
+        fatbin_ptr += fatbin_header->obj_name_len+1;
+    }
+
+    for (int i=0; i<64; i++) {
+        printf("%02x ", ((uint8_t*)fatbin_ptr)[i]);
+    }
+    return 0;
+}
+
 int cpu_utils_command(char **command)
 {
     FILE* fd;
@@ -80,7 +187,7 @@ int cpu_utils_md5hash(char *filename, unsigned long *high, unsigned long *low)
     return 0;
 }
 
-void* cricketd_utils_symbol_address(char *symbol)
+void* cricketd_utils_symbol_address(const char* file, char *symbol)
 {
     bfd *hostbfd = NULL;
     asection *section;
@@ -90,15 +197,19 @@ void* cricketd_utils_symbol_address(char *symbol)
     asymbol **symtab = NULL;
     char path[256];
     size_t length;
+    const char self[] = "/proc/self/exe";
+    if (file == NULL) {
+        file = self;
+    }
 
 
     bfd_init();
 
-    length = readlink("/proc/self/exe", path, sizeof(path));
+    length = readlink(file, path, sizeof(path));
 
     /* Catch some errors: */
     if (length < 0) {
-        LOGE(LOG_WARNING, "error resolving symlink /proc/self/exe.");
+        LOGE(LOG_WARNING, "error resolving symlink %s.", file);
     } else if (length >= 256) {
         LOGE(LOG_WARNING, "path was too long and was truncated.");
     } else {
@@ -106,21 +217,21 @@ void* cricketd_utils_symbol_address(char *symbol)
         LOG(LOG_DEBUG, "opening '%s'", path);
     }
 
-    if ((hostbfd_fd = fopen("/proc/self/exe", "rb")) == NULL) {
+    if ((hostbfd_fd = fopen(file, "rb")) == NULL) {
         LOGE(LOG_ERROR, "fopen failed");
         return NULL;
     }
 
-    if ((hostbfd = bfd_openstreamr("/proc/self/exe", NULL, hostbfd_fd)) == NULL) {
+    if ((hostbfd = bfd_openstreamr(file, NULL, hostbfd_fd)) == NULL) {
         LOGE(LOG_ERROR, "bfd_openr failed on %s",
-             "/proc/self/exe");
+             file);
         fclose(hostbfd_fd);
         goto cleanup;
     }
 
     if (!bfd_check_format(hostbfd, bfd_object)) {
         LOGE(LOG_ERROR, "%s has wrong bfd format",
-             "/proc/self/exe");
+             file);
         goto cleanup;
     }
 
diff --git a/cpu/cpu-utils.h b/cpu/cpu-utils.h
index c0ed97b8..67594cec 100644
--- a/cpu/cpu-utils.h
+++ b/cpu/cpu-utils.h
@@ -5,13 +5,26 @@
 #include "cpu-common.h"
 #include "list.h"
 
+struct __fatCubin {
+    uint32_t magic;
+    uint32_t version;
+    uint64_t text;
+    uint64_t data;
+    uint64_t ptr;
+    uint64_t ptr2;
+    uint64_t zero;
+};
+
+
+int cpu_utils_get_fatbin_info(struct __fatCubin *fatbin);
+
 void kernel_infos_free(kernel_info_t *infos, size_t kernelnum);
 
 
 int cpu_utils_is_local_connection(struct svc_req *rqstp);
 int cpu_utils_command(char **command);
 int cpu_utils_md5hash(char *filename, unsigned long *high, unsigned long *low);
-void* cricketd_utils_symbol_address(char *symbol);
+void* cricketd_utils_symbol_address(const char* file, char *symbol);
 int cricketd_utils_launch_child(const char *file, char **args);
 int cpu_utils_parameter_info(list *kernel_infos, char *path);
 int cpu_utils_contains_kernel(const char *path);
diff --git a/cpu/resource-mg.h b/cpu/resource-mg.h
index aa8bff25..0b134da7 100644
--- a/cpu/resource-mg.h
+++ b/cpu/resource-mg.h
@@ -28,6 +28,7 @@ resource_mg rm_streams;
 resource_mg rm_events;
 resource_mg rm_arrays;
 resource_mg rm_memory;
+resource_mg rm_kernels;
 
 //Driver API RMs
 resource_mg rm_modules;
diff --git a/tests/cpu/cubin/Makefile b/tests/cpu/cubin/Makefile
index ca8418a3..c42ee827 100644
--- a/tests/cpu/cubin/Makefile
+++ b/tests/cpu/cubin/Makefile
@@ -8,11 +8,12 @@ LDFLAGS = -arch=$(ARCH) -cudart shared -lcuda
 BINARY = main
 CUBIN = kernel.cubin
 FATBIN = kernel.fatbin
+SHARED = kernel.so
 
 FILES := main.o
 
 .PHONY: all depend clean 
-all : $(BINARY) $(CUBIN) $(FATBIN)
+all : $(BINARY) $(CUBIN) $(FATBIN) $(SHARED)
 
 $(BINARY) : $(FILES)
 	$(LD) $(LDFLAGS) -o $@ $< 
@@ -26,6 +27,9 @@ $(BINARY) : $(FILES)
 %.o : %.cpp
 	$(CC) $(CFLAGS) -c -o $@ $<
 
+%.so : %.cu
+	$(CC) $(CFLAGS) -shared --compiler-options '-fPIC' -o $@ $<
+
 clean :
 	rm -f *.o *.cubin *.fatbin $(BINARY)
 
diff --git a/tests/cpu/cubin/main.cpp b/tests/cpu/cubin/main.cpp
index c9137243..9b30c077 100644
--- a/tests/cpu/cubin/main.cpp
+++ b/tests/cpu/cubin/main.cpp
@@ -66,6 +66,19 @@ void check_free_mem(int *mem, size_t len)
     cudaFree(mem);
 }
 
+int getModuleFromCubin(CUmodule **module, const char *cubin)
+{
+    if ((err = cuModuleLoad(module, "kernel.cubin")) != CUDA_SUCCESS) {
+        printCudaErrors(err);
+        return 1;
+    }
+    return 0;
+}
+
+int getModuleFromShared(CUmodule **module, const char *cubin)
+{
+
+}
 
 int main(int argc, char** argv)
 {
@@ -83,8 +96,13 @@ int main(int argc, char** argv)
     CUmodule module;
     CUfunction func;
     printf("testing cubin...\n");
-    if ((err = cuModuleLoad(&module, "kernel.cubin")) != CUDA_SUCCESS) {
-        printCudaErrors(err);
+    // if ((err = getModuleFromCubin(&module, "kernel.cubin")) != 0) {
+    //     printf("error\n");
+    //     return 1;
+    // }
+    if ((err = getModuleFromShared(&module, "kernel.so")) != 0) {
+        printf("error\n");
+        return 1;
     }
 
     if ((err = cuModuleGetFunction(&func, module, "kernel")) != CUDA_SUCCESS) {
diff --git a/tests/test_apps/Makefile b/tests/test_apps/Makefile
index 0acd9513..b85845a2 100644
--- a/tests/test_apps/Makefile
+++ b/tests/test_apps/Makefile
@@ -61,10 +61,10 @@ $(LIBCUDA_OBJ) : $(LIBCUDA_OBJ:.o=.c)
 	$(HOST_CC) -c -fpic -o $@ $< $(LIBCUDA_LIBS)
 
 $(TEST_KERNEL_LIB_O) : $(FILES)
-	$(CC) $(CFLAGS) -dc --compiler-options '-fPIC' -o $@ $<
+	$(CC) $(CFLAGS) -g -G -dc --compiler-options '-fPIC' -o $@ $<
 
 $(TEST_KERNEL_LIB) : $(TEST_KERNEL_LIB_O)
-	$(LD) $(LDFLAGS) -shared -o lib$@ $^
+	$(LD) $(LDFLAGS) -g -G -shared -o lib$@ $^
 
 $(TEST_KERNEL_LIB_CALL_O) : $(TEST_KERNEL_LIB_CALL_O:.o=.c)
 	$(HOST_CC) -c -o $@ $<
diff --git a/tests/test_apps/test_kernel_call.c b/tests/test_apps/test_kernel_call.c
index b702b042..ce538a2c 100644
--- a/tests/test_apps/test_kernel_call.c
+++ b/tests/test_apps/test_kernel_call.c
@@ -12,6 +12,8 @@ int main(int argc, char** argv)
 
     int (*fn)(void);
 
+    printf("kernel: %p\n", dlsym(dlhandle, "_Z6kernelPtS_S_csix"));
+
     if ((fn = dlsym(dlhandle, "main")) == NULL) {
         printf("dlsym failed\n");
         return 1;

From 0997d44e31de0ce8589aa0d9d18bfde6ecbdf568 Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Fri, 17 Feb 2023 21:00:23 +0100
Subject: [PATCH 06/83] add decoding of embedded fatbinaries

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 cpu/cpu-client.c         |  60 ++++++--------
 cpu/cpu-server-driver.c  |  30 +++++++
 cpu/cpu-server.c         |   1 +
 cpu/cpu-utils.c          | 164 +++++++++++++++++++++++++--------------
 cpu/cpu-utils.h          |  10 +--
 cpu/cpu_rpc_prot.x       |   1 +
 tests/cpu/cubin/main.cpp |  10 +--
 7 files changed, 172 insertions(+), 104 deletions(-)

diff --git a/cpu/cpu-client.c b/cpu/cpu-client.c
index 23523d5b..ecacc809 100644
--- a/cpu/cpu-client.c
+++ b/cpu/cpu-client.c
@@ -235,18 +235,18 @@ void *dlopen(const char *filename, int flag)
         return dl_handle;
     } else {
         LOGE(LOG_DEBUG, "request to dlopen \"%s\"", filename);
-        if (cpu_utils_contains_kernel(filename) == 0) {
-            LOGE(LOG_ERROR, "file does not contain a kernel");
-        } else {
-            LOGE(LOG_DEBUG, "file contains a kernel");
-            int result;
-            enum clnt_stat retval_1;
-            retval_1 = rpc_dlopen_1((char*)filename, &result, clnt);
-            if (retval_1 != RPC_SUCCESS) {
-                LOGE(LOG_ERROR, "error calling rpc_dlopen");
-            }
-            cpu_utils_parameter_info(&kernel_infos, (char*)filename);
-        }
+        // if (cpu_utils_contains_kernel(filename) == 0) {
+        //     LOGE(LOG_ERROR, "file does not contain a kernel");
+        // } else {
+        //     LOGE(LOG_DEBUG, "file contains a kernel");
+        //     int result;
+        //     enum clnt_stat retval_1;
+        //     retval_1 = rpc_dlopen_1((char*)filename, &result, clnt);
+        //     if (retval_1 != RPC_SUCCESS) {
+        //         LOGE(LOG_ERROR, "error calling rpc_dlopen");
+        //     }
+        //     cpu_utils_parameter_info(&kernel_infos, (char*)filename);
+        // }
         ret = dlopen_orig(filename, flag);
         dlinfo(ret, RTLD_DI_LINKMAP, &map);
         LOGE(LOG_DEBUG, "dlopen \"%s\" to  %p", filename, map->l_addr);
@@ -304,42 +304,30 @@ void __cudaRegisterFunction(void **fatCubinHandle, const char *hostFun, char *de
     }
 }
 
-struct rpc_fatCubin {
-    uint32_t magic;
-    uint32_t seq;
-    uint64_t text;
-    uint64_t data;
-    uint64_t ptr;
-    uint64_t ptr2;
-    uint64_t zero;
-};
+
 
 void** __cudaRegisterFatBinary(void *fatCubin)
 {
-    ptr_result result;
+    int result;
     enum clnt_stat retval_1;
 
-    if (cpu_utils_get_fatbin_info((struct __fatCubin*)fatCubin) != 0) {
+    mem_data rpc_fat = {
+        .mem_data_len = 0,
+        .mem_data_val = NULL};
+
+    if (cpu_utils_get_fatbin_info((struct fat_header*)fatCubin, (void**)&rpc_fat.mem_data_val, &rpc_fat.mem_data_len) != 0) {
         LOGE(LOG_ERROR, "error getting fatbin info");
+        return NULL;
     }
 
-    struct __fatCubin *fat = (struct __fatCubin*)((fatCubin));
-    struct rpc_fatCubin rpc_fat = {.magic = fat->magic,
-                                   .seq   = fat->seq,
-                                   .text  = fat->text,
-                                   .data  = fat->data,
-                                   .ptr   = fat->ptr,
-                                   .ptr2  = fat->ptr2,
-                                   .zero  = fat->zero};
-
-    retval_1 = RPC_SUCCESS;//cuda_register_fat_binary_1(rpc_fat, &result, clnt);
+    retval_1 = rpc_loadelf_1(rpc_fat, &result, clnt);
     if (retval_1 != RPC_SUCCESS) {
-        clnt_perror (clnt, "call failed");
+        LOGE(LOG_ERROR, "call failed.");
     }
-    if (result.err != 0) {
+    if (result != 0) {
         return NULL;
     }
-    return (void*)result.ptr_result_u.ptr;
+    return NULL;
 }
 
 // void __cudaRegisterFatBinaryEnd(void **fatCubinHandle)
diff --git a/cpu/cpu-server-driver.c b/cpu/cpu-server-driver.c
index f6714b56..fbdb58d7 100644
--- a/cpu/cpu-server-driver.c
+++ b/cpu/cpu-server-driver.c
@@ -30,6 +30,36 @@ int server_driver_init(int restore)
     return ret;
 }
 
+#include <cuda_runtime_api.h>
+
+bool_t rpc_loadelf_1_svc(mem_data elf, int *result, struct svc_req *rqstp)
+{
+    LOG(LOG_DEBUG, "rpc_loadelf(elf: %p, len: %#x)", elf.mem_data_val, elf.mem_data_len);
+    CUresult res;
+    CUmodule module;
+    cudaError_t cres;
+
+    for (int i=0; i<64; i++) {
+        printf("%02x ", ((uint8_t*)elf.mem_data_val)[i]);
+    }
+    
+    if ((cres = cudaSetDevice(0)) != cudaSuccess) {
+        LOG(LOG_ERROR, "cudaSetDevice failed: %d", cres);
+        *result = cres;
+        return 1;
+    }
+
+    cudaDeviceSynchronize();
+    
+    if ((res =cuModuleLoadData (&module, elf.mem_data_val)) != CUDA_SUCCESS) {
+        LOG(LOG_ERROR, "cuModuleLoadFatBinary failed: %d", res);
+        *result = res;
+        return 1;
+    } 
+    *result = 0;
+    return 1;
+}
+
 int server_driver_deinit(void)
 {
     resource_mg_free(&rm_modules);
diff --git a/cpu/cpu-server.c b/cpu/cpu-server.c
index 1170aea8..c3c78a9a 100644
--- a/cpu/cpu-server.c
+++ b/cpu/cpu-server.c
@@ -160,6 +160,7 @@ bool_t rpc_dlopen_1_svc(char *path, int *result, struct svc_req *rqstp)
 }
 
 
+
 void cricket_main_hash(char* app_command)
 {
     cricket_main(app_command, 0, 0);
diff --git a/cpu/cpu-utils.c b/cpu/cpu-utils.c
index d99d6858..3218a5ac 100644
--- a/cpu/cpu-utils.c
+++ b/cpu/cpu-utils.c
@@ -1,6 +1,7 @@
 #define _GNU_SOURCE
 #include <stdlib.h>
 #include <stdio.h>
+#include <stdint.h>
 #include <unistd.h>
 #include <errno.h>
 #include <string.h>
@@ -15,6 +16,7 @@
 #include "cpu-common.h"
 #include "log.h"
 
+#define uint16_t unsigned short
 #define CRICKET_ELF_NV_INFO_PREFIX ".nv.info"
 #define CRICKET_ELF_NV_SHARED_PREFIX ".nv.shared."
 #define CRICKET_ELF_NV_TEXT_PREFIX ".nv.text."
@@ -26,12 +28,28 @@
 #define FATBIN_STRUCT_MAGIC 0x466243b1
 #define FATBIN_TEXT_MAGIC   0xBA55ED50
 
+struct  __attribute__((__packed__)) fat_elf_header
+{
+    uint32_t magic;
+    uint16_t version;
+    uint16_t header_size;
+    uint64_t fat_size;
+};
 struct  __attribute__((__packed__)) fat_text_header
 {
-  unsigned int           magic;
-  unsigned short         version;
-  unsigned short         header_size;
-  unsigned long long int fat_size;
+    uint16_t kind;
+    uint16_t unknown1;
+    uint32_t header_size;
+    uint64_t fatbin_size;
+    uint64_t some_offset; //Compression related information
+    uint16_t minor;
+    uint16_t major;
+    uint32_t arch;
+    uint32_t obj_name_offset;
+    uint32_t obj_name_len;
+    uint64_t flags;
+    uint64_t zero;
+    uint64_t unknown2;
 };
 
 #define FATBIN_FLAG_64BIT     0x0000000000000001LL
@@ -48,85 +66,115 @@ static int cricket_fatbin_flag_to_str(char** str, uint64_t flag)
         (flag & FATBIN_FLAG_COMPRESS) ? "yes" : "no");
 }
 
-int cpu_utils_get_fatbin_info(struct __fatCubin *fatbin)
+static int cpu_utils_fat_header_decode(void *fat, 
+                                       struct fat_elf_header **fat_elf_header,
+                                       struct fat_text_header **fat_text_header,
+                                       void **fat_text_body_ptr)
 {
-    void *fatbin_ptr = NULL, *fatbin_elfinfo = NULL;
-    struct fat_text_header* fatbin_text = NULL;
-    if (fatbin == NULL) {
-        LOGE(LOG_ERROR, "fatbin is NULL");
-        return -1;
-    }
-    if (fatbin->magic != FATBIN_STRUCT_MAGIC) {
-        LOGE(LOG_ERROR, "fatbin struct magic number is wrong. Got %llx, expected %llx.", fatbin->magic, FATBIN_STRUCT_MAGIC);
+    struct fat_elf_header* feh;
+    struct fat_text_header* fth;
+    void *fat_ptr = NULL;
+    void *fat_text_header_ptr = NULL;
+
+    if (fat == NULL || fat_elf_header == NULL || fat_text_header == NULL || fat_text_body_ptr == NULL) {
+        LOGE(LOG_ERROR, "at least one parameter is NULL");
         return -1;
     }
-    LOG(LOG_DEBUG, "Fatbin: magic: %x, version: %x, text: %lx, data: %lx, ptr: %lx, ptr2: %lx, zero: %lx",
-           fatbin->magic, fatbin->version, fatbin->text, fatbin->data, fatbin->ptr, fatbin->ptr2, fatbin->zero);
 
-    fatbin_text = (struct fat_text_header*)fatbin->text;
-    if (fatbin_text->magic != FATBIN_TEXT_MAGIC) {
-        LOGE(LOG_ERROR, "fatbin text magic number is wrong. Got %x, expected %x.", *((uint32_t*)fatbin_text), FATBIN_TEXT_MAGIC);
+    feh = (struct fat_elf_header*)fat;
+    if (feh->magic != FATBIN_TEXT_MAGIC) {
+        LOGE(LOG_ERROR, "fatbin text magic number is wrong. Got %x, expected %x.", *((uint32_t*)feh), FATBIN_TEXT_MAGIC);
         return -1;
     }
-    LOG(LOG_DEBUG, "Fatbin.text: magic: %x, version: %d, header_size: %p, fat_size: %p",
-        fatbin_text->magic, fatbin_text->version, fatbin_text->header_size, fatbin_text->fat_size);
+    LOGE(LOG_DEBUG, "fat_elf_header: magic: %x, version: %d, header_size: %p, fat_size: %p",
+        feh->magic, feh->version, feh->header_size, feh->fat_size);
 
-    if (fatbin_text->version != 1 || fatbin_text->header_size != sizeof(struct fat_text_header)) {
+    if (feh->version != 1 || feh->header_size != sizeof(struct fat_elf_header)) {
         LOGE(LOG_ERROR, "fatbin text version is wrong or header size is inconsistent.\
             This is a sanity check to avoid reading a new fatbinary format");
         return -1;
     }
-    fatbin_ptr = fatbin_elfinfo = (void*)fatbin_text + fatbin_text->header_size;
-
-    struct __attribute__((__packed__)) fatbin_header {
-        short kind;
-        short unknown1;
-        uint32_t header_size;
-        uint64_t fatbin_size;
-        uint64_t some_offset;
-        short minor;
-        short major;
-        uint32_t arch;
-        uint32_t obj_name_offset;
-        uint32_t obj_name_len;
-        uint64_t flags;
-        uint64_t zero;
-        uint64_t unknown2;
-    } *fatbin_header = (struct fatbin_header*)(fatbin_ptr);
-    LOGE(LOG_DEBUG, "Fatbin header: fatbin_offset: %#x, header_size %#x, fatbin_size %#x, some_offset %#x.\
+    fat_ptr = fat_text_header_ptr = (void*)feh + feh->header_size;
+
+    fth = (struct fat_text_header*)(fat_text_header_ptr);
+    LOGE(LOG_DEBUG, "fat_text_header: fatbin_kind: %#x, header_size %#x, fatbin_size %#x, some_offset %#x.\
         minor %#x, major %#x, arch %d, flags %#x",
-        fatbin_header->kind,
-        fatbin_header->header_size,
-        fatbin_header->fatbin_size,
-        fatbin_header->some_offset,
-        fatbin_header->minor,
-        fatbin_header->major,
-        fatbin_header->arch,
-        fatbin_header->flags);
+        fth->kind,
+        fth->header_size,
+        fth->fatbin_size,
+        fth->some_offset,
+        fth->minor,
+        fth->major,
+        fth->arch,
+        fth->flags);
     LOGE(LOG_DEBUG, "unknown fields: unknown1: %#x, unknown2: %#x, zeros: %#x",
-        fatbin_header->unknown1,
-        fatbin_header->unknown2,
-        fatbin_header->zero);
-    fatbin_ptr += sizeof(struct fatbin_header);
+        fth->unknown1,
+        fth->unknown2,
+        fth->zero);
+    fat_ptr += sizeof(struct fat_header);
+    *fat_text_body_ptr = fat_text_header_ptr + fth->header_size;
+    if (fth->flags & FATBIN_FLAG_DEBUG) {
+        *fat_text_body_ptr += 1;
+    }
 
     char *flag_str = NULL;
-    cricket_fatbin_flag_to_str(&flag_str, fatbin_header->flags);
+    cricket_fatbin_flag_to_str(&flag_str, fth->flags);
     LOGE(LOG_DEBUG, "Fatbin flags: %s", flag_str);
     free(flag_str);
 
-    if(fatbin_header->obj_name_offset != 0) {
-        if (((char*)fatbin_elfinfo)[fatbin_header->obj_name_offset + fatbin_header->obj_name_len] != '\0') {
+    if(fth->obj_name_offset != 0) {
+        if (((char*)fat_text_header_ptr)[fth->obj_name_offset + fth->obj_name_len] != '\0') {
             LOGE(LOG_DEBUG, "Fatbin object name is not null terminated");
         } else {
-            char *obj_name = (char*)fatbin_elfinfo + fatbin_header->obj_name_offset;
-            LOGE(LOG_DEBUG, "Fatbin object name: %s (len:%#x)", obj_name, fatbin_header->obj_name_len);
+            char *obj_name = (char*)fat_text_header_ptr + fth->obj_name_offset;
+            LOGE(LOG_DEBUG, "Fatbin object name: %s (len:%#x)", obj_name, fth->obj_name_len);
         }
-        fatbin_ptr += fatbin_header->obj_name_len+1;
+        fat_ptr += fth->obj_name_len+1;
+    }
+    *fat_elf_header = feh;
+    *fat_text_header = fth;
+    return 0;
+}
+
+int cpu_utils_get_fatbin_info(struct fat_header *fatbin, void** fatbin_mem, unsigned* fatbin_size)
+{
+    struct fat_elf_header* fat_elf_header;
+    struct fat_text_header* fat_text_header;
+    void *fat_ptr = NULL;
+    void *fat_text_body_ptr = NULL;
+    unsigned fatbin_total_size = 0;
+    if (fatbin == NULL || fatbin_mem == NULL || fatbin_size == NULL) {
+        LOGE(LOG_ERROR, "at least one parameter is NULL");
+        return -1;
+    }
+    if (fatbin->magic != FATBIN_STRUCT_MAGIC) {
+        LOGE(LOG_ERROR, "fatbin struct magic number is wrong. Got %llx, expected %llx.", fatbin->magic, FATBIN_STRUCT_MAGIC);
+        return -1;
     }
+    LOG(LOG_DEBUG, "Fatbin: magic: %x, version: %x, text: %lx, data: %lx, ptr: %lx, ptr2: %lx, zero: %lx",
+           fatbin->magic, fatbin->version, fatbin->text, fatbin->data, fatbin->unknown, fatbin->text2, fatbin->zero);
+
+    if (cpu_utils_fat_header_decode((void*)fatbin->text, &fat_elf_header, &fat_text_header, &fat_text_body_ptr) != 0) {
+        LOGE(LOG_ERROR, "fatbin header decode failed");
+        return -1;
+    }
+
+    fatbin_total_size = fat_elf_header->header_size + fat_elf_header->fat_size;
+
+    if (cpu_utils_fat_header_decode((void*)fatbin->text2, &fat_elf_header, &fat_text_header, &fat_text_body_ptr) != 0) {
+        LOGE(LOG_ERROR, "fatbin header decode failed");
+        return -1;
+    }
+    fatbin_total_size += fat_elf_header->header_size + fat_elf_header->fat_size;
+
+    fat_ptr = fatbin->data;
 
     for (int i=0; i<64; i++) {
-        printf("%02x ", ((uint8_t*)fatbin_ptr)[i]);
+        printf("%02x ", ((uint8_t*)fat_ptr)[i]);
     }
+
+    *fatbin_mem = fatbin->text;
+    *fatbin_size = fatbin_total_size;
     return 0;
 }
 
diff --git a/cpu/cpu-utils.h b/cpu/cpu-utils.h
index 67594cec..3afd0a0c 100644
--- a/cpu/cpu-utils.h
+++ b/cpu/cpu-utils.h
@@ -5,18 +5,18 @@
 #include "cpu-common.h"
 #include "list.h"
 
-struct __fatCubin {
+struct fat_header {
     uint32_t magic;
     uint32_t version;
     uint64_t text;
-    uint64_t data;
-    uint64_t ptr;
-    uint64_t ptr2;
+    uint64_t data;  // points to outside of the file
+    uint64_t unknown;
+    uint64_t text2;
     uint64_t zero;
 };
 
 
-int cpu_utils_get_fatbin_info(struct __fatCubin *fatbin);
+int cpu_utils_get_fatbin_info(struct fat_header *fatbin, void** fatbin_mem, unsigned* fatbin_size);
 
 void kernel_infos_free(kernel_info_t *infos, size_t kernelnum);
 
diff --git a/cpu/cpu_rpc_prot.x b/cpu/cpu_rpc_prot.x
index be3b7d2a..8b849cb9 100644
--- a/cpu/cpu_rpc_prot.x
+++ b/cpu/cpu_rpc_prot.x
@@ -122,6 +122,7 @@ program RPC_CD_PROG {
         int          rpc_printmessage(string)                                     = 2;
         int          rpc_dlopen(string)                                           = 3;
         int          CUDA_REGISTER_FUNCTION(ptr, ptr, string, string, int)       = 50;
+        int          rpc_loadelf(mem_data)                                       = 51;
 
         /* RUNTIME API */
         /* ### Device Management ### */
diff --git a/tests/cpu/cubin/main.cpp b/tests/cpu/cubin/main.cpp
index 9b30c077..c8b815e8 100644
--- a/tests/cpu/cubin/main.cpp
+++ b/tests/cpu/cubin/main.cpp
@@ -96,14 +96,14 @@ int main(int argc, char** argv)
     CUmodule module;
     CUfunction func;
     printf("testing cubin...\n");
-    // if ((err = getModuleFromCubin(&module, "kernel.cubin")) != 0) {
-    //     printf("error\n");
-    //     return 1;
-    // }
-    if ((err = getModuleFromShared(&module, "kernel.so")) != 0) {
+    if ((err = getModuleFromCubin(&module, "kernel.cubin")) != 0) {
         printf("error\n");
         return 1;
     }
+    // if ((err = getModuleFromShared(&module, "kernel.so")) != 0) {
+    //     printf("error\n");
+    //     return 1;
+    // }
 
     if ((err = cuModuleGetFunction(&func, module, "kernel")) != CUDA_SUCCESS) {
         printCudaErrors(err);

From 4ff4b5cfc79a4352aed626204eb495403ecea107 Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Fri, 17 Feb 2023 21:18:01 +0100
Subject: [PATCH 07/83] add temporary test code that launches a kernel on the
 server from an elf retrieved via RPC.

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 cpu/cpu-client.c        |  1 +
 cpu/cpu-server-driver.c | 19 +++++++++++++++++--
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/cpu/cpu-client.c b/cpu/cpu-client.c
index ecacc809..d05bfe47 100644
--- a/cpu/cpu-client.c
+++ b/cpu/cpu-client.c
@@ -327,6 +327,7 @@ void** __cudaRegisterFatBinary(void *fatCubin)
     if (result != 0) {
         return NULL;
     }
+    //TODO: return a handle that can be used to idenfity the fatbin for registerFunction
     return NULL;
 }
 
diff --git a/cpu/cpu-server-driver.c b/cpu/cpu-server-driver.c
index fbdb58d7..50b11c30 100644
--- a/cpu/cpu-server-driver.c
+++ b/cpu/cpu-server-driver.c
@@ -51,11 +51,26 @@ bool_t rpc_loadelf_1_svc(mem_data elf, int *result, struct svc_req *rqstp)
 
     cudaDeviceSynchronize();
     
-    if ((res =cuModuleLoadData (&module, elf.mem_data_val)) != CUDA_SUCCESS) {
+    if ((res = cuModuleLoadData (&module, elf.mem_data_val)) != CUDA_SUCCESS) {
         LOG(LOG_ERROR, "cuModuleLoadFatBinary failed: %d", res);
         *result = res;
         return 1;
-    } 
+    }
+
+    CUfunction func;
+    if ((res = cuModuleGetFunction(&func, module, "_Z15kernel_no_paramv")) != CUDA_SUCCESS) {
+        LOG(LOG_ERROR, "cuModuleGetFunction failed: %d", res);
+        *result = res;
+        return 1;
+    }
+    int zero = 0;
+    void *params[] = {NULL, NULL, NULL, &zero, &zero, &zero, &zero, NULL};
+    if ((res = cuLaunchKernel(func, 1, 1, 1, 32, 1, 1, 0, CU_STREAM_DEFAULT, params, NULL)) != CUDA_SUCCESS) {
+        LOG(LOG_ERROR, "cuLaunchKernel failed: %d", res);
+        *result = res;
+        return 1;
+    }
+
     *result = 0;
     return 1;
 }

From e72c11c104dbe21c30443b40797b15e11bf15b43 Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Sat, 18 Feb 2023 12:58:05 +0100
Subject: [PATCH 08/83] add registry for transferred cubins and kernel functions
 so Cricket is able to identify them when launching kernels

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 cpu/cpu-client.c          | 196 +++++++++++++++++++++-----------------
 cpu/cpu-server-driver.c   |  85 +++++++++++------
 cpu/cpu-server-runtime.c  |  38 +++++---
 cpu/cpu-utils.c           |  17 ++--
 cpu/cpu_rpc_prot.x        |   4 +-
 tests/test_apps/matmul.cu |   4 +-
 6 files changed, 199 insertions(+), 145 deletions(-)

diff --git a/cpu/cpu-client.c b/cpu/cpu-client.c
index d05bfe47..267c700a 100644
--- a/cpu/cpu-client.c
+++ b/cpu/cpu-client.c
@@ -1,31 +1,31 @@
 #define _GNU_SOURCE
-#include <stdio.h>
-#include <stdlib.h>
 #include <cuda.h>
 #include <driver_types.h>
 #include <link.h>
+#include <stdio.h>
+#include <stdlib.h>
 
-//For TCP socket
-#include <sys/socket.h>
-#include <netinet/in.h>
+// For TCP socket
 #include <arpa/inet.h>
 #include <netdb.h>
+#include <netinet/in.h>
+#include <sys/socket.h>
 
-#include "cpu-libwrap.h"
-#include "cpu_rpc_prot.h"
 #include "cpu-common.h"
+#include "cpu-libwrap.h"
 #include "cpu-utils.h"
+#include "cpu_rpc_prot.h"
 #include "list.h"
 #ifdef WITH_IB
 #include "cpu-ib.h"
-#endif //WITH_IB
+#endif // WITH_IB
 
-//static const char* LIBCUDA_PATH = "/lib64/libcuda.so";
-const char* LIBCUDA_PATH = "/usr/local/cuda/lib64/libcudart.so";
+// static const char* LIBCUDA_PATH = "/lib64/libcuda.so";
+const char *LIBCUDA_PATH = "/usr/local/cuda/lib64/libcudart.so";
 
 CLIENT *clnt = NULL;
 
-list kernel_infos = {0};
+list kernel_infos = { 0 };
 
 char server[256];
 
@@ -35,30 +35,33 @@ int shm_enabled = 1;
 int initialized = 0;
 
 #ifdef WITH_IB
-    int ib_device = 0;
-#endif //WITH_IB
+int ib_device = 0;
+#endif // WITH_IB
 
 #ifdef WITH_API_CNT
 extern void cpu_runtime_print_api_call_cnt(void);
-#endif //WITH_API_CNT
+#endif // WITH_API_CNT
 
 static void rpc_connect(void)
 {
     int isock;
-    struct sockaddr_un sock_un = {0};
-    struct sockaddr_in sock_in = {0};
-    struct sockaddr_in local_addr = {0};
+    struct sockaddr_un sock_un = { 0 };
+    struct sockaddr_in sock_in = { 0 };
+    struct sockaddr_in local_addr = { 0 };
     struct hostent *hp;
     socklen_t sockaddr_len = sizeof(struct sockaddr_in);
-    unsigned long prog=0, vers=0;
+    unsigned long prog = 0, vers = 0;
 
     char envvar[] = "REMOTE_GPU_ADDRESS";
 
-    if(!getenv(envvar)) {
-        LOG(LOG_ERROR, "Environment variable %s does not exist. It must contain the address where the server application is listening.", envvar);
+    if (!getenv(envvar)) {
+        LOG(LOG_ERROR,
+            "Environment variable %s does not exist. It must contain the "
+            "address where the server application is listening.",
+            envvar);
         exit(1);
     }
-    if(strncpy(server, getenv(envvar), 256) == NULL) {
+    if (strncpy(server, getenv(envvar), 256) == NULL) {
         LOGE(LOG_ERROR, "strncpy failed.");
         exit(1);
     }
@@ -66,23 +69,23 @@ static void rpc_connect(void)
 
 #ifdef WITH_IB
 
-    if(getenv("IB_DEVICE_ID")) {
+    if (getenv("IB_DEVICE_ID")) {
         ib_device = atoi(getenv("IB_DEVICE_ID"));
     }
     LOG(LOG_INFO, "Using IB device: %d.", ib_device);
 
-#endif //WITH_IB
+#endif // WITH_IB
 
     LOGE(LOG_INFO, "test\n");
-    if(getenv("CRICKET_NOHASH")) {
-        prog=99;
-        vers=1;
+    if (getenv("CRICKET_NOHASH")) {
+        prog = 99;
+        vers = 1;
     } else if (cpu_utils_md5hash("/proc/self/exe", &prog, &vers) != 0) {
         LOGE(LOG_ERROR, "error while creating binary checksum");
         exit(0);
     }
 
-    char* cmd = NULL;
+    char *cmd = NULL;
     if (cpu_utils_command(&cmd) != 0) {
         LOGE(LOG_ERROR, "error getting command");
     } else {
@@ -110,18 +113,19 @@ static void rpc_connect(void)
             LOGE(LOG_ERROR, "error resolving hostname: %s", server);
             exit(1);
         }
-        sock_in.sin_addr = *(struct in_addr*)hp->h_addr;
-        //inet_aton("137.226.133.199", &sock_in.sin_addr);
+        sock_in.sin_addr = *(struct in_addr *)hp->h_addr;
+        // inet_aton("137.226.133.199", &sock_in.sin_addr);
 
         clnt = clnttcp_create(&sock_in, prog, vers, &isock, 0, 0);
         getsockname(isock, &local_addr, &sockaddr_len);
-        connection_is_local = (local_addr.sin_addr.s_addr == sock_in.sin_addr.s_addr);
+        connection_is_local =
+            (local_addr.sin_addr.s_addr == sock_in.sin_addr.s_addr);
         break;
     case UDP:
-        /* From RPCEGEN documentation: 
+        /* From RPCEGEN documentation:
          * Warning: since UDP-based RPC messages can only hold up to 8 Kbytes
-         * of encoded data, this transport cannot be used for procedures that 
-         * take large arguments or return huge results. 
+         * of encoded data, this transport cannot be used for procedures that
+         * take large arguments or return huge results.
          * -> Sounds like UDP does not make sense for CUDA, because we need to
          *    be able to copy large memory chunks
          **/
@@ -131,11 +135,12 @@ static void rpc_connect(void)
 
     if (clnt == NULL) {
         clnt_pcreateerror("[rpc] Error");
-        exit (1);
+        exit(1);
     }
 }
 
-static void repair_connection(int signo) {
+static void repair_connection(int signo)
+{
     enum clnt_stat retval_1;
     int result_1;
     /*LOGE(LOG_INFO, "Trying connection...");
@@ -155,7 +160,7 @@ static void repair_connection(int signo) {
     }
 }
 
-void __attribute__ ((constructor)) init_rpc(void)
+void __attribute__((constructor)) init_rpc(void)
 {
     enum clnt_stat retval_1;
     int result_1;
@@ -173,7 +178,7 @@ void __attribute__ ((constructor)) init_rpc(void)
 
     retval_1 = rpc_printmessage_1(printmessage_1_arg1, &result_1, clnt);
     if (retval_1 != RPC_SUCCESS) {
-        clnt_perror (clnt, "call failed");
+        clnt_perror(clnt, "call failed");
     }
 
     if (list_init(&kernel_infos, sizeof(kernel_info_t)) != 0) {
@@ -181,16 +186,17 @@ void __attribute__ ((constructor)) init_rpc(void)
     }
 
     if (cpu_utils_parameter_info(&kernel_infos, "/proc/self/exe") != 0) {
-        LOG(LOG_ERROR, "error while getting parameter size. Check whether cuobjdump binary is in PATH! Trying anyway (will only work if there is no kernel in this binary)");
+        LOG(LOG_ERROR, "error while getting parameter size. Check whether "
+                       "cuobjdump binary is in PATH! Trying anyway (will only "
+                       "work if there is no kernel in this binary)");
     }
 #ifdef WITH_IB
     if (ib_init(ib_device, server) != 0) {
         LOG(LOG_ERROR, "initilization of infiniband verbs failed.");
     }
-#endif //WITH_IB
-
+#endif // WITH_IB
 }
-void __attribute__ ((destructor)) deinit_rpc(void)
+void __attribute__((destructor)) deinit_rpc(void)
 {
     enum clnt_stat retval_1;
     int result;
@@ -203,16 +209,16 @@ void __attribute__ ((destructor)) deinit_rpc(void)
         list_free(&kernel_infos);
 #ifdef WITH_API_CNT
         cpu_runtime_print_api_call_cnt();
-#endif //WITH_API_CNT
+#endif // WITH_API_CNT
     }
 
     if (clnt != NULL) {
-        clnt_destroy (clnt);
+        clnt_destroy(clnt);
     }
 }
 
 static void *(*dlopen_orig)(const char *, int) = NULL;
-static int   (*dlclose_orig)(void *) = NULL;
+static int (*dlclose_orig)(void *) = NULL;
 static void *dl_handle = NULL;
 
 void *dlopen(const char *filename, int flag)
@@ -221,13 +227,14 @@ void *dlopen(const char *filename, int flag)
     struct link_map *map;
     LOG(LOG_DEBUG, "intercepted dlopen(%s, %d)", filename, flag);
     if (dlopen_orig == NULL) {
-        if ( (dlopen_orig = dlsym(RTLD_NEXT, "dlopen")) == NULL) {
+        if ((dlopen_orig = dlsym(RTLD_NEXT, "dlopen")) == NULL) {
             LOGE(LOG_ERROR, "[dlopen] dlsym failed");
         }
     }
 
     if (filename != NULL && strcmp(filename, "libcuda.so.1") == 0) {
-        LOG(LOG_DEBUG, "replacing dlopen call to cuda driver library with cricket-client.so");
+        LOG(LOG_DEBUG, "replacing dlopen call to cuda driver library with "
+                       "cricket-client.so");
         dl_handle = dlopen_orig("cricket-client.so", flag);
         if (clnt == NULL) {
             LOGE(LOG_ERROR, "rpc seems to be uninitialized");
@@ -235,18 +242,18 @@ void *dlopen(const char *filename, int flag)
         return dl_handle;
     } else {
         LOGE(LOG_DEBUG, "request to dlopen \"%s\"", filename);
-        // if (cpu_utils_contains_kernel(filename) == 0) {
-        //     LOGE(LOG_ERROR, "file does not contain a kernel");
-        // } else {
-        //     LOGE(LOG_DEBUG, "file contains a kernel");
-        //     int result;
-        //     enum clnt_stat retval_1;
-        //     retval_1 = rpc_dlopen_1((char*)filename, &result, clnt);
-        //     if (retval_1 != RPC_SUCCESS) {
-        //         LOGE(LOG_ERROR, "error calling rpc_dlopen");
-        //     }
-        //     cpu_utils_parameter_info(&kernel_infos, (char*)filename);
-        // }
+        if (cpu_utils_contains_kernel(filename) == 0) {
+            LOGE(LOG_ERROR, "file does not contain a kernel");
+        } else {
+            LOGE(LOG_DEBUG, "file contains a kernel");
+            int result;
+            enum clnt_stat retval_1;
+            retval_1 = rpc_dlopen_1((char *)filename, &result, clnt);
+            if (retval_1 != RPC_SUCCESS) {
+                LOGE(LOG_ERROR, "error calling rpc_dlopen");
+            }
+            cpu_utils_parameter_info(&kernel_infos, (char *)filename);
+        }
         ret = dlopen_orig(filename, flag);
         dlinfo(ret, RTLD_DI_LINKMAP, &map);
         LOGE(LOG_DEBUG, "dlopen \"%s\" to  %p", filename, map->l_addr);
@@ -260,7 +267,7 @@ int dlclose(void *handle)
         LOGE(LOG_ERROR, "[dlclose] handle NULL");
         return -1;
     } else if (dlclose_orig == NULL) {
-        if ( (dlclose_orig = dlsym(RTLD_NEXT, "dlclose")) == NULL) {
+        if ((dlclose_orig = dlsym(RTLD_NEXT, "dlclose")) == NULL) {
             LOGE(LOG_ERROR, "[dlclose] dlsym failed");
         }
     }
@@ -272,50 +279,59 @@ int dlclose(void *handle)
     } else {
         return dlclose_orig(handle);
     }
-
 }
 
-// void __cudaRegisterVar(void **fatCubinHandle, char *hostVar, char *deviceAddress, const char *deviceName, int ext, size_t size, int constant, int global)
+// void __cudaRegisterVar(void **fatCubinHandle, char *hostVar, char
+// *deviceAddress, const char *deviceName, int ext, size_t size, int constant,
+// int global)
 // {
 // }
 
-void __cudaRegisterFunction(void **fatCubinHandle, const char *hostFun, char *deviceFun,
-                            const char *deviceName, int thread_limit, uint3 *tid,
-                            uint3 *bid, dim3 *bDim, dim3 *gDim, int *wSize)
+void __cudaRegisterFunction(void **fatCubinHandle, const char *hostFun,
+                            char *deviceFun, const char *deviceName,
+                            int thread_limit, uint3 *tid, uint3 *bid,
+                            dim3 *bDim, dim3 *gDim, int *wSize)
 {
-    int result;
+    ptr_result result;
     enum clnt_stat retval_1;
 
-    printf("__cudaRegisterFunction(fatCubinHandle=%p, hostFun=%p, devFunc=%s, deviceName=%s, thread_limit=%d, tid=[%p], bid=[%p], bDim=[%p], gDim=[%p], wSize=%p)\n", fatCubinHandle, hostFun, deviceFun, deviceName, thread_limit, tid, bid, bDim, gDim, wSize);
+    printf("__cudaRegisterFunction(fatCubinHandle=%p, hostFun=%p, devFunc=%s, "
+           "deviceName=%s, thread_limit=%d, tid=[%p], bid=[%p], bDim=[%p], "
+           "gDim=[%p], wSize=%p)\n",
+           fatCubinHandle, hostFun, deviceFun, deviceName, thread_limit, tid,
+           bid, bDim, gDim, wSize);
 
-    kernel_info_t *info = cricketd_utils_search_info(&kernel_infos, (char*)deviceName);
+    kernel_info_t *info =
+        cricketd_utils_search_info(&kernel_infos, (char *)deviceName);
     if (info == NULL) {
-        LOGE(LOG_ERROR, "request to register unknown function: \"%s\"", deviceName);
-
-
+        LOGE(LOG_ERROR, "request to register unknown function: \"%s\"",
+             deviceName);
         return;
     } else {
-        LOGE(LOG_DEBUG, "request to register known function: \"%s\"", deviceName);
-        retval_1 = cuda_register_function_1((ptr)fatCubinHandle, (ptr)hostFun, deviceFun, (char*)deviceName, thread_limit, &result, clnt);
+        LOGE(LOG_DEBUG, "request to register known function: \"%s\"",
+             deviceName);
+        retval_1 = rpc_register_function_1((ptr)fatCubinHandle, (ptr)hostFun,
+                                           deviceFun, (char*)deviceName, thread_limit,
+                                           &result, clnt);
         if (retval_1 != RPC_SUCCESS) {
             LOGE(LOG_ERROR, "call failed.");
         }
-        info->host_fun = (void*)hostFun;
+        info->host_fun = (void *)hostFun;
     }
 }
 
 
-
-void** __cudaRegisterFatBinary(void *fatCubin)
+void **__cudaRegisterFatBinary(void *fatCubin)
 {
-    int result;
+    ptr_result result;
     enum clnt_stat retval_1;
+    LOGE(LOG_DEBUG, "__cudaRegisterFatBinary(fatCubin=%p)", fatCubin);
 
-    mem_data rpc_fat = {
-        .mem_data_len = 0,
-        .mem_data_val = NULL};
+    mem_data rpc_fat = { .mem_data_len = 0, .mem_data_val = NULL };
 
-    if (cpu_utils_get_fatbin_info((struct fat_header*)fatCubin, (void**)&rpc_fat.mem_data_val, &rpc_fat.mem_data_len) != 0) {
+    if (cpu_utils_get_fatbin_info((struct fat_header *)fatCubin,
+                                  (void **)&rpc_fat.mem_data_val,
+                                  &rpc_fat.mem_data_len) != 0) {
         LOGE(LOG_ERROR, "error getting fatbin info");
         return NULL;
     }
@@ -324,11 +340,13 @@ void** __cudaRegisterFatBinary(void *fatCubin)
     if (retval_1 != RPC_SUCCESS) {
         LOGE(LOG_ERROR, "call failed.");
     }
-    if (result != 0) {
+    if (result.err != 0) {
         return NULL;
     }
-    //TODO: return a handle that can be used to idenfity the fatbin for registerFunction
-    return NULL;
+    LOG(LOG_DEBUG, "fatbin loaded to %p", result.ptr_result_u.ptr);
+    // return a handle that can be used to idenfity the fatbin for
+    // registerFunction
+    return (void **)result.ptr_result_u.ptr;
 }
 
 // void __cudaRegisterFatBinaryEnd(void **fatCubinHandle)
@@ -336,14 +354,12 @@ void** __cudaRegisterFatBinary(void *fatCubin)
 //     int result;
 //     enum clnt_stat retval_1;
 
-//     //printf("__cudaRegisterFatBinaryEnd(fatCubinHandle=%p)\n", fatCubinHandle);
+//     //printf("__cudaRegisterFatBinaryEnd(fatCubinHandle=%p)\n",
+//     fatCubinHandle);
 
-//     retval_1 = RPC_SUCCESS;//cuda_register_fat_binary_end_1((uint64_t)fatCubinHandle, &result, clnt);
-//     if (retval_1 != RPC_SUCCESS) {
+//     retval_1 =
+//     RPC_SUCCESS;//cuda_register_fat_binary_end_1((uint64_t)fatCubinHandle,
+//     &result, clnt); if (retval_1 != RPC_SUCCESS) {
 //         clnt_perror (clnt, "call failed");
 //     }
 // }
-
-
-
-
diff --git a/cpu/cpu-server-driver.c b/cpu/cpu-server-driver.c
index 50b11c30..6b0f7bab 100644
--- a/cpu/cpu-server-driver.c
+++ b/cpu/cpu-server-driver.c
@@ -21,7 +21,9 @@ int server_driver_init(int restore)
     int ret = 0;
     if (!restore) {
         ret &= resource_mg_init(&rm_modules, 1);
-        ret &= resource_mg_init(&rm_functions, 1);
+        // we cannot bypass the resource manager for functions
+        // because CUfunctions are at different locations on server and client
+        ret &= resource_mg_init(&rm_functions, 0);
     } else {
         ret &= resource_mg_init(&rm_modules, 0);
         ret &= resource_mg_init(&rm_functions, 0);
@@ -32,49 +34,66 @@ int server_driver_init(int restore)
 
 #include <cuda_runtime_api.h>
 
-bool_t rpc_loadelf_1_svc(mem_data elf, int *result, struct svc_req *rqstp)
+// Does not support checkpoint/restart yet
+bool_t rpc_loadelf_1_svc(mem_data elf, ptr_result *result, struct svc_req *rqstp)
 {
     LOG(LOG_DEBUG, "rpc_loadelf(elf: %p, len: %#x)", elf.mem_data_val, elf.mem_data_len);
     CUresult res;
     CUmodule module;
-    cudaError_t cres;
-
-    for (int i=0; i<64; i++) {
-        printf("%02x ", ((uint8_t*)elf.mem_data_val)[i]);
-    }
-    
-    if ((cres = cudaSetDevice(0)) != cudaSuccess) {
-        LOG(LOG_ERROR, "cudaSetDevice failed: %d", cres);
-        *result = cres;
-        return 1;
-    }
-
-    cudaDeviceSynchronize();
     
     if ((res = cuModuleLoadData (&module, elf.mem_data_val)) != CUDA_SUCCESS) {
         LOG(LOG_ERROR, "cuModuleLoadFatBinary failed: %d", res);
-        *result = res;
+        result->err = res;
         return 1;
     }
 
-    CUfunction func;
-    if ((res = cuModuleGetFunction(&func, module, "_Z15kernel_no_paramv")) != CUDA_SUCCESS) {
-        LOG(LOG_ERROR, "cuModuleGetFunction failed: %d", res);
-        *result = res;
-        return 1;
-    }
-    int zero = 0;
-    void *params[] = {NULL, NULL, NULL, &zero, &zero, &zero, &zero, NULL};
-    if ((res = cuLaunchKernel(func, 1, 1, 1, 32, 1, 1, 0, CU_STREAM_DEFAULT, params, NULL)) != CUDA_SUCCESS) {
-        LOG(LOG_ERROR, "cuLaunchKernel failed: %d", res);
-        *result = res;
+    if ((res = resource_mg_create(&rm_modules, (void*)module)) != CUDA_SUCCESS) {
+        LOG(LOG_ERROR, "resource_mg_create failed: %d", res);
+        result->err = res;
         return 1;
     }
 
-    *result = 0;
+    LOG(LOG_DEBUG, "->module: %p", module);
+    result->err = 0;
+    result->ptr_result_u.ptr = (ptr)module;
     return 1;
 }
 
+// Does not support checkpoint/restart yet
+bool_t rpc_register_function_1_svc(ptr fatCubinHandle, ptr hostFun, char* deviceFun,
+                            char* deviceName, int thread_limit, ptr_result *result, struct svc_req *rqstp)
+{
+    RECORD_API(rpc_register_function_1_argument);
+    RECORD_ARG(1, fatCubinHandle);
+    RECORD_ARG(2, hostFun);
+    RECORD_ARG(3, deviceFun);
+    RECORD_ARG(4, deviceName);
+    RECORD_ARG(5, thread_limit);
+    LOG(LOG_DEBUG, "rpc_register_function(fatCubinHandle: %p, hostFun: %p, deviceFun: %s, deviceName: %s, thread_limit: %d)",
+        fatCubinHandle, hostFun, deviceFun, deviceName, thread_limit);
+    GSCHED_RETAIN;
+    result->err = cuModuleGetFunction((CUfunction*)&result->ptr_result_u.ptr,
+                    resource_mg_get(&rm_streams, (void*)fatCubinHandle),
+                    deviceName);
+    GSCHED_RELEASE;
+    if (resource_mg_add_sorted(&rm_functions, (void*)hostFun, (void*)result->ptr_result_u.ptr) != 0) {
+        LOGE(LOG_ERROR, "error in resource manager");
+    }
+    RECORD_RESULT(ptr_result_u, *result);
+    return 1;
+
+    // int zero = 0;
+    // void *params[] = {NULL, NULL, NULL, &zero, &zero, &zero, &zero, NULL};
+    // if ((res = cuLaunchKernel(func, 1, 1, 1, 32, 1, 1, 0, CU_STREAM_DEFAULT, params, NULL)) != CUDA_SUCCESS) {
+    //     LOG(LOG_ERROR, "cuLaunchKernel failed: %d", res);
+    //     result->err = res;
+    //     return 1;
+    // }
+
+    // result->err = 0;
+    // return 1;
+}
+
 int server_driver_deinit(void)
 {
     resource_mg_free(&rm_modules);
@@ -321,7 +340,6 @@ bool_t rpc_culaunchkernel_1_svc(uint64_t f, unsigned int gridDimX, unsigned int
     void **cuda_args;
     uint16_t *arg_offsets;
     size_t param_num;
-    LOG(LOG_DEBUG, "%s", __FUNCTION__);
     if (args.mem_data_val == NULL) {
         LOGE(LOG_ERROR, "param.mem_data_val is NULL");
         *result = CUDA_ERROR_INVALID_VALUE;
@@ -348,10 +366,15 @@ bool_t rpc_culaunchkernel_1_svc(uint64_t f, unsigned int gridDimX, unsigned int
         LOGE(LOG_DEBUG, "arg: %p (%d)", *(void**)cuda_args[i], *(int*)cuda_args[i]);
     }
 
-    LOGE(LOG_DEBUG, "cuLaunchKernel(func=%p, gridDim=[%d,%d,%d], blockDim=[%d,%d,%d], args=%p, sharedMem=%d, stream=%p)", f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, cuda_args, sharedMemBytes, (void*)hStream);
+    LOGE(LOG_DEBUG, "cuLaunchKernel(func=%p->%p, gridDim=[%d,%d,%d], blockDim=[%d,%d,%d], args=%p, sharedMem=%d, stream=%p)", f, resource_mg_get(&rm_functions, (void*)f), gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, cuda_args, sharedMemBytes, (void*)hStream);
 
     GSCHED_RETAIN;
-    *result = cuLaunchKernel((CUfunction)f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, (CUstream)hStream, cuda_args, NULL);
+    *result = cuLaunchKernel((CUfunction)resource_mg_get(&rm_functions, (void*)f),
+                              gridDimX, gridDimY, gridDimZ,
+                              blockDimX, blockDimY, blockDimZ,
+                              sharedMemBytes,
+                              (CUstream)hStream,
+                              cuda_args, NULL);
     GSCHED_RELEASE;
 
     free(cuda_args);
diff --git a/cpu/cpu-server-runtime.c b/cpu/cpu-server-runtime.c
index 8c6c8ff5..96c49842 100644
--- a/cpu/cpu-server-runtime.c
+++ b/cpu/cpu-server-runtime.c
@@ -71,9 +71,6 @@ int server_runtime_init(int restore)
         ret &= resource_mg_init(&rm_events, 1);
         ret &= resource_mg_init(&rm_arrays, 1);
         ret &= resource_mg_init(&rm_memory, 1);
-         // We cannot bypass this RM, because we need translations when a kernel in
-         // a shared object is launched.
-        ret &= resource_mg_init(&rm_kernels, 0);
         ret &= cusolver_init(1, &rm_streams, &rm_memory);
         ret &= cublas_init(1, &rm_memory);
     } else {
@@ -86,6 +83,16 @@ int server_runtime_init(int restore)
         ret &= cublas_init(0, &rm_memory);
         ret &= server_runtime_restore("ckp");
     }
+    
+    // Make sure runtime API is initialized
+    // If we don't do this and use the driver API, it might be unintialized
+    cudaError_t cres;
+    if ((cres = cudaSetDevice(0)) != cudaSuccess) {
+        LOG(LOG_ERROR, "cudaSetDevice failed: %d", cres);
+        ret = 1;
+    }
+    cudaDeviceSynchronize();
+
     return ret;
 }
 
@@ -890,15 +897,22 @@ bool_t cuda_launch_kernel_1_svc(ptr func, rpc_dim3 gridDim, rpc_dim3 blockDim,
         LOGE(LOG_DEBUG, "arg: %p (%d)", *(void**)cuda_args[i], *(int*)cuda_args[i]);
     }
 
-    LOGE(LOG_DEBUG, "cudaLaunchKernel(func=%p, gridDim=[%d,%d,%d], blockDim=[%d,%d,%d], args=%p, sharedMem=%d, stream=%p)", resource_mg_get(&rm_kernels, (void*)func), cuda_gridDim.x, cuda_gridDim.y, cuda_gridDim.z, cuda_blockDim.x, cuda_blockDim.y, cuda_blockDim.z, cuda_args, sharedMem, (void*)stream);
-
-    *result = cudaLaunchKernel(
-      resource_mg_get(&rm_kernels, (void*)func),
-      cuda_gridDim,
-      cuda_blockDim,
-      cuda_args,
-      sharedMem,
-      resource_mg_get(&rm_streams, (void*)stream));
+    LOGE(LOG_DEBUG, "cudaLaunchKernel(func=%p, gridDim=[%d,%d,%d], blockDim=[%d,%d,%d], args=%p, sharedMem=%d, stream=%p)", resource_mg_get(&rm_functions, (void*)func), cuda_gridDim.x, cuda_gridDim.y, cuda_gridDim.z, cuda_blockDim.x, cuda_blockDim.y, cuda_blockDim.z, cuda_args, sharedMem, (void*)stream);
+
+    *result = cuLaunchKernel((CUfunction)resource_mg_get(&rm_functions, (void*)func),
+                            gridDim.x, gridDim.y, gridDim.z,
+                            blockDim.y, blockDim.y, blockDim.z,
+                            sharedMem,
+                            resource_mg_get(&rm_streams, (void*)stream),
+                            cuda_args, NULL);
+
+    // *result = cudaLaunchKernel(
+    //   resource_mg_get(&rm_functions, (void*)func),
+    //   cuda_gridDim,
+    //   cuda_blockDim,
+    //   cuda_args,
+    //   sharedMem,
+    //   resource_mg_get(&rm_streams, (void*)stream));
     free(cuda_args);
     RECORD_RESULT(integer, *result);
     LOGE(LOG_DEBUG, "cudaLaunchKernel result: %d", *result);
diff --git a/cpu/cpu-utils.c b/cpu/cpu-utils.c
index 3218a5ac..3043548d 100644
--- a/cpu/cpu-utils.c
+++ b/cpu/cpu-utils.c
@@ -167,13 +167,13 @@ int cpu_utils_get_fatbin_info(struct fat_header *fatbin, void** fatbin_mem, unsi
     }
     fatbin_total_size += fat_elf_header->header_size + fat_elf_header->fat_size;
 
-    fat_ptr = fatbin->data;
+    // fat_ptr = (void*)fatbin->data;
 
-    for (int i=0; i<64; i++) {
-        printf("%02x ", ((uint8_t*)fat_ptr)[i]);
-    }
+    // for (int i=0; i<64; i++) {
+    //     printf("%02x ", ((uint8_t*)fat_ptr)[i]);
+    // }
 
-    *fatbin_mem = fatbin->text;
+    *fatbin_mem = (void*)fatbin->text;
     *fatbin_size = fatbin_total_size;
     return 0;
 }
@@ -601,13 +601,14 @@ int cpu_utils_parameter_info(list *kernel_infos, char *path)
             goto cleanup2;
         }
 
-        if ((buf->name = malloc(strlen(kernelname))) == NULL) {
+        size_t buflen = strlen(kernelname);
+        if ((buf->name = malloc(buflen)) == NULL) {
             LOGE(LOG_ERROR, "malloc failed");
             goto cleanup2;
         }
         //copy string and remove trailing \n
-        strncpy(buf->name, kernelname, strlen(kernelname)-1);
-        buf->name[strlen(kernelname)-1] = '\0';
+        strncpy(buf->name, kernelname, buflen-1);
+        buf->name[buflen-1] = '\0';
 
         if (cpu_utils_read_pars(buf, fdesc) != 0) {
             LOGE(LOG_ERROR, "reading paramter infos failed.\n");
diff --git a/cpu/cpu_rpc_prot.x b/cpu/cpu_rpc_prot.x
index 8b849cb9..7e41fb91 100644
--- a/cpu/cpu_rpc_prot.x
+++ b/cpu/cpu_rpc_prot.x
@@ -121,8 +121,8 @@ program RPC_CD_PROG {
         int          rpc_deinit(void)                                             = 1;
         int          rpc_printmessage(string)                                     = 2;
         int          rpc_dlopen(string)                                           = 3;
-        int          CUDA_REGISTER_FUNCTION(ptr, ptr, string, string, int)       = 50;
-        int          rpc_loadelf(mem_data)                                       = 51;
+        ptr_result   rpc_register_function(ptr, ptr, string, string, int)        = 50;
+        ptr_result   rpc_loadelf(mem_data)                                       = 51;
 
         /* RUNTIME API */
         /* ### Device Management ### */
diff --git a/tests/test_apps/matmul.cu b/tests/test_apps/matmul.cu
index 7790ae7b..64c67d57 100644
--- a/tests/test_apps/matmul.cu
+++ b/tests/test_apps/matmul.cu
@@ -264,8 +264,8 @@ int main()
 
     dim3 dimBlock( blocksize, 1 );
     dim3 dimGrid( 1, 1);
-    kernel<<<dimGrid, dimBlock>>>(dev_A, dev_x, dev_res, 0, 0, 0, 0);
-    //kernel_no_param<<<dimGrid, dimBlock>>>();
+    //kernel<<<dimGrid, dimBlock>>>(dev_A, dev_x, dev_res, 0, 0, 0, 0);
+    kernel_no_param<<<dimGrid, dimBlock>>>();
     //void *args = NULL;
     //int result = cudaLaunchKernel((void*)kernel_no_param, dimGrid, dimBlock, &args, 0LL, NULL);
 

From 15eb759068215609ce18e6599e4c7616317ad2bb Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Tue, 21 Feb 2023 11:02:56 +0100
Subject: [PATCH 09/83] fix segfault on cleanup because CUDA accesses
 nonexistent fatCubinHandle

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 cpu/cpu-client.c          | 48 +++++++++++++++++++++++++++----
 cpu/cpu-server-driver.c   | 60 +++++++++++++++++++++++++++++++--------
 cpu/cpu_rpc_prot.x        |  3 +-
 tests/test_apps/matmul.cu |  4 +--
 4 files changed, 95 insertions(+), 20 deletions(-)

diff --git a/cpu/cpu-client.c b/cpu/cpu-client.c
index 267c700a..d3566673 100644
--- a/cpu/cpu-client.c
+++ b/cpu/cpu-client.c
@@ -323,7 +323,8 @@ void __cudaRegisterFunction(void **fatCubinHandle, const char *hostFun,
 
 void **__cudaRegisterFatBinary(void *fatCubin)
 {
-    ptr_result result;
+    void **result;
+    int rpc_result;
     enum clnt_stat retval_1;
     LOGE(LOG_DEBUG, "__cudaRegisterFatBinary(fatCubin=%p)", fatCubin);
 
@@ -336,17 +337,54 @@ void **__cudaRegisterFatBinary(void *fatCubin)
         return NULL;
     }
 
-    retval_1 = rpc_loadelf_1(rpc_fat, &result, clnt);
+    // CUDA registers atexit handler for fatbin cleanup. These access
+    // fatbin data structurees 
+    result = (void**)calloc(1, 0x58);
+
+    // void ** (*cudaRegisterFatBinaryOrig)(void *) = NULL;
+
+    // if ((cudaRegisterFatBinaryOrig = dlsym(RTLD_NEXT, "__cudaRegisterFatBinary") ) == NULL) {
+    //     LOGE(LOG_ERROR, "dlsym failed");
+    //     return NULL;
+    // }
+
+    // if ((result = cudaRegisterFatBinaryOrig(fatCubin)) == NULL) {
+    //     LOGE(LOG_ERROR, "error calling original cudaRegisterFatBinary");
+    //     return NULL;
+    // }
+
+    retval_1 = rpc_elf_load_1(rpc_fat, (ptr)result, &rpc_result, clnt);
     if (retval_1 != RPC_SUCCESS) {
         LOGE(LOG_ERROR, "call failed.");
     }
-    if (result.err != 0) {
+    if (rpc_result != 0) {
         return NULL;
     }
-    LOG(LOG_DEBUG, "fatbin loaded to %p", result.ptr_result_u.ptr);
+    LOG(LOG_DEBUG, "fatbin loaded to %p", result);
     // return a handle that can be used to idenfity the fatbin for
     // registerFunction
-    return (void **)result.ptr_result_u.ptr;
+    // TODO: We have to return a proper fatbinary handle because CUDA will segfault
+    // atexit() otherwise.
+    return result;
+}
+
+void __cudaUnregisterFatBinary(void **fatCubinHandle)
+{  
+    int result;
+    enum clnt_stat retval_1;
+
+    LOGE(LOG_DEBUG, "__cudaUnregisterFatBinary(fatCubinHandle=%p)",
+         fatCubinHandle);
+
+    if (fatCubinHandle == NULL) {
+        LOGE(LOG_ERROR, "fatCubinHandle is NULL");
+        return;
+    }
+
+    retval_1 = rpc_elf_unload_1((ptr)fatCubinHandle, &result, clnt);
+    if (retval_1 != RPC_SUCCESS || result != 0) {
+        LOGE(LOG_ERROR, "call failed.");
+    }
 }
 
 // void __cudaRegisterFatBinaryEnd(void **fatCubinHandle)
diff --git a/cpu/cpu-server-driver.c b/cpu/cpu-server-driver.c
index 6b0f7bab..d86d57d5 100644
--- a/cpu/cpu-server-driver.c
+++ b/cpu/cpu-server-driver.c
@@ -20,9 +20,9 @@ int server_driver_init(int restore)
    
     int ret = 0;
     if (!restore) {
-        ret &= resource_mg_init(&rm_modules, 1);
-        // we cannot bypass the resource manager for functions
-        // because CUfunctions are at different locations on server and client
+        // we cannot bypass the resource manager for functions and modules
+        // because CUfunctions and modules are at different locations on server and client
+        ret &= resource_mg_init(&rm_modules, 0);
         ret &= resource_mg_init(&rm_functions, 0);
     } else {
         ret &= resource_mg_init(&rm_modules, 0);
@@ -35,30 +35,66 @@ int server_driver_init(int restore)
 #include <cuda_runtime_api.h>
 
 // Does not support checkpoint/restart yet
-bool_t rpc_loadelf_1_svc(mem_data elf, ptr_result *result, struct svc_req *rqstp)
+bool_t rpc_elf_load_1_svc(mem_data elf, ptr module_key, int *result, struct svc_req *rqstp)
 {
-    LOG(LOG_DEBUG, "rpc_loadelf(elf: %p, len: %#x)", elf.mem_data_val, elf.mem_data_len);
+    LOG(LOG_DEBUG, "rpc_elf_load(elf: %p, len: %#x)", elf.mem_data_val, elf.mem_data_len);
     CUresult res;
     CUmodule module;
     
-    if ((res = cuModuleLoadData (&module, elf.mem_data_val)) != CUDA_SUCCESS) {
+    if ((res = cuModuleLoadData(&module, elf.mem_data_val)) != CUDA_SUCCESS) {
         LOG(LOG_ERROR, "cuModuleLoadFatBinary failed: %d", res);
-        result->err = res;
+        *result = res;
         return 1;
     }
 
-    if ((res = resource_mg_create(&rm_modules, (void*)module)) != CUDA_SUCCESS) {
+    // We add our module using module_key as key. This means a fatbinaryHandle on the client is translated
+    // to a CUmodule on the server.
+    if ((res = resource_mg_add_sorted(&rm_modules, (void*)module_key, (void*)module)) != CUDA_SUCCESS) {
         LOG(LOG_ERROR, "resource_mg_create failed: %d", res);
-        result->err = res;
+        *result = res;
         return 1;
     }
 
     LOG(LOG_DEBUG, "->module: %p", module);
-    result->err = 0;
-    result->ptr_result_u.ptr = (ptr)module;
+    *result = 0;
     return 1;
 }
 
+// Does not support checkpoint/restart yet
+// TODO: We should also remove associated function handles
+bool_t rpc_elf_unload_1_svc(ptr elf_handle, int *result, struct svc_req *rqstp)
+{
+    LOG(LOG_DEBUG, "rpc_elf_unload(elf_handle: %p)", elf_handle);
+    CUmodule module = NULL;
+    CUresult res;
+    
+    if ((module = (CUmodule)resource_mg_get(&rm_modules, (void*)elf_handle)) == NULL) {
+        LOG(LOG_ERROR, "resource_mg_get failed");
+        *result = -1;
+        return 1;
+    }
+
+    // if ((res = resource_mg_remove(&rm_modules, (void*)elf_handle)) != CUDA_SUCCESS) {
+    //     LOG(LOG_ERROR, "resource_mg_create failed: %d", res);
+    //     result->err = res;
+    //     return 1;
+    // }
+
+    if ((res = cuModuleUnload(module)) != CUDA_SUCCESS) {
+        LOG(LOG_ERROR, "cuModuleUnload failed: %d", res);
+        *result = res;
+        return 1;
+    }
+
+    //TODO: Free memory of module
+
+    *result = 0;
+    return 1;
+}
+
+
+
+
 // Does not support checkpoint/restart yet
 bool_t rpc_register_function_1_svc(ptr fatCubinHandle, ptr hostFun, char* deviceFun,
                             char* deviceName, int thread_limit, ptr_result *result, struct svc_req *rqstp)
@@ -73,7 +109,7 @@ bool_t rpc_register_function_1_svc(ptr fatCubinHandle, ptr hostFun, char* device
         fatCubinHandle, hostFun, deviceFun, deviceName, thread_limit);
     GSCHED_RETAIN;
     result->err = cuModuleGetFunction((CUfunction*)&result->ptr_result_u.ptr,
-                    resource_mg_get(&rm_streams, (void*)fatCubinHandle),
+                    resource_mg_get(&rm_modules, (void*)fatCubinHandle),
                     deviceName);
     GSCHED_RELEASE;
     if (resource_mg_add_sorted(&rm_functions, (void*)hostFun, (void*)result->ptr_result_u.ptr) != 0) {
diff --git a/cpu/cpu_rpc_prot.x b/cpu/cpu_rpc_prot.x
index 7e41fb91..72fa0bfe 100644
--- a/cpu/cpu_rpc_prot.x
+++ b/cpu/cpu_rpc_prot.x
@@ -122,7 +122,8 @@ program RPC_CD_PROG {
         int          rpc_printmessage(string)                                     = 2;
         int          rpc_dlopen(string)                                           = 3;
         ptr_result   rpc_register_function(ptr, ptr, string, string, int)        = 50;
-        ptr_result   rpc_loadelf(mem_data)                                       = 51;
+        int          rpc_elf_load(mem_data, ptr)                                 = 51;
+        int          rpc_elf_unload(ptr)                                         = 52;
 
         /* RUNTIME API */
         /* ### Device Management ### */
diff --git a/tests/test_apps/matmul.cu b/tests/test_apps/matmul.cu
index 64c67d57..7790ae7b 100644
--- a/tests/test_apps/matmul.cu
+++ b/tests/test_apps/matmul.cu
@@ -264,8 +264,8 @@ int main()
 
     dim3 dimBlock( blocksize, 1 );
     dim3 dimGrid( 1, 1);
-    //kernel<<<dimGrid, dimBlock>>>(dev_A, dev_x, dev_res, 0, 0, 0, 0);
-    kernel_no_param<<<dimGrid, dimBlock>>>();
+    kernel<<<dimGrid, dimBlock>>>(dev_A, dev_x, dev_res, 0, 0, 0, 0);
+    //kernel_no_param<<<dimGrid, dimBlock>>>();
     //void *args = NULL;
     //int result = cudaLaunchKernel((void*)kernel_no_param, dimGrid, dimBlock, &args, 0LL, NULL);
 

From ff493f6a8048a4cd3a2fd6abf24129af612125b0 Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Tue, 21 Feb 2023 12:25:47 +0100
Subject: [PATCH 10/83] code cleanup. pass blockDim.x to cuLaunchKernel

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 cpu/cpu-client.c         | 22 ++++------------------
 cpu/cpu-server-runtime.c |  2 +-
 2 files changed, 5 insertions(+), 19 deletions(-)

diff --git a/cpu/cpu-client.c b/cpu/cpu-client.c
index d3566673..07544637 100644
--- a/cpu/cpu-client.c
+++ b/cpu/cpu-client.c
@@ -337,22 +337,10 @@ void **__cudaRegisterFatBinary(void *fatCubin)
         return NULL;
     }
 
-    // CUDA registers atexit handler for fatbin cleanup. These access
-    // fatbin data structurees 
+    // CUDA registers an atexit handler for fatbin cleanup that accesses
+    // the fatbin data structure. Let's allocate some zeroes to avoid segfaults.
     result = (void**)calloc(1, 0x58);
 
-    // void ** (*cudaRegisterFatBinaryOrig)(void *) = NULL;
-
-    // if ((cudaRegisterFatBinaryOrig = dlsym(RTLD_NEXT, "__cudaRegisterFatBinary") ) == NULL) {
-    //     LOGE(LOG_ERROR, "dlsym failed");
-    //     return NULL;
-    // }
-
-    // if ((result = cudaRegisterFatBinaryOrig(fatCubin)) == NULL) {
-    //     LOGE(LOG_ERROR, "error calling original cudaRegisterFatBinary");
-    //     return NULL;
-    // }
-
     retval_1 = rpc_elf_load_1(rpc_fat, (ptr)result, &rpc_result, clnt);
     if (retval_1 != RPC_SUCCESS) {
         LOGE(LOG_ERROR, "call failed.");
@@ -361,10 +349,8 @@ void **__cudaRegisterFatBinary(void *fatCubin)
         return NULL;
     }
     LOG(LOG_DEBUG, "fatbin loaded to %p", result);
-    // return a handle that can be used to idenfity the fatbin for
-    // registerFunction
-    // TODO: We have to return a proper fatbinary handle because CUDA will segfault
-    // atexit() otherwise.
+    // we return a bunch of zeroes to avoid segfaults. The memory is
+    // mapped by the modules resource 
     return result;
 }
 
diff --git a/cpu/cpu-server-runtime.c b/cpu/cpu-server-runtime.c
index 96c49842..86240970 100644
--- a/cpu/cpu-server-runtime.c
+++ b/cpu/cpu-server-runtime.c
@@ -901,7 +901,7 @@ bool_t cuda_launch_kernel_1_svc(ptr func, rpc_dim3 gridDim, rpc_dim3 blockDim,
 
     *result = cuLaunchKernel((CUfunction)resource_mg_get(&rm_functions, (void*)func),
                             gridDim.x, gridDim.y, gridDim.z,
-                            blockDim.y, blockDim.y, blockDim.z,
+                            blockDim.x, blockDim.y, blockDim.z,
                             sharedMem,
                             resource_mg_get(&rm_streams, (void*)stream),
                             cuda_args, NULL);

From 98832e232503f3aa8ccf6ad16df5b591b5dc728f Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Tue, 21 Feb 2023 15:32:34 +0100
Subject: [PATCH 11/83] use an infinite timeout for kernel calls

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 cpu/cpu-client-runtime.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/cpu/cpu-client-runtime.c b/cpu/cpu-client-runtime.c
index 373993a3..cf486890 100644
--- a/cpu/cpu-client-runtime.c
+++ b/cpu/cpu-client-runtime.c
@@ -969,7 +969,20 @@ cudaError_t cudaLaunchKernel(const void* func, dim3 gridDim, dim3 blockDim, void
                args[j],
                size);
     }
-    retval_1 = cuda_launch_kernel_1((uint64_t)func, rpc_gridDim, rpc_blockDim, rpc_args, sharedMem, (uint64_t)stream, &result, clnt);
+    cuda_launch_kernel_1_argument arg = {
+        .arg1 = (ptr)func,
+        .arg2 = rpc_gridDim,
+        .arg3 = rpc_blockDim,
+        .arg4 = rpc_args,
+        .arg5 = sharedMem,
+        .arg6 = (ptr)stream
+    };
+    struct timeval timeout = {.tv_sec = -1, .tv_usec = 0};
+    // We call the RPC explictly, so we can set the timeout to infinite
+	retval_1 = clnt_call(clnt, CUDA_LAUNCH_KERNEL, (xdrproc_t)xdr_cuda_launch_kernel_1_argument, (caddr_t)&arg,
+		                 (xdrproc_t)xdr_int, (caddr_t)&result,
+		                 timeout);
+    //retval_1 = cuda_launch_kernel_1((uint64_t)func, rpc_gridDim, rpc_blockDim, rpc_args, sharedMem, (uint64_t)stream, &result, clnt);
     if (retval_1 != RPC_SUCCESS) {
         clnt_perror (clnt, "call failed");
     }

From 6eeef6cd788ccc135f7c80f5399d8b76cffb26d2 Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Tue, 21 Feb 2023 16:13:34 +0100
Subject: [PATCH 12/83] remove timeout for cudaDeviceSynchronize

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 cpu/cpu-client-runtime.c | 27 +++++++--------------------
 cpu/cpu-client.c         |  6 ------
 cpu/cpu-server-driver.c  |  3 ---
 3 files changed, 7 insertions(+), 29 deletions(-)

diff --git a/cpu/cpu-client-runtime.c b/cpu/cpu-client-runtime.c
index cf486890..12a25902 100644
--- a/cpu/cpu-client-runtime.c
+++ b/cpu/cpu-client-runtime.c
@@ -269,12 +269,12 @@ cudaError_t cudaDeviceSynchronize(void)
 #endif //WITH_API_CNT
     int result = -1;
     enum clnt_stat retval_1;
-    for (int i=0; result != 0 && i < 10; ++i) {
-        retval_1 = cuda_device_synchronize_1(&result, clnt);
-        if (retval_1 != RPC_SUCCESS) {
-            clnt_perror (clnt, "call failed");
-        }
-    }
+
+    struct timeval timeout = {.tv_sec = -1, .tv_usec = 0};
+
+    return (clnt_call (clnt, CUDA_DEVICE_SYNCHRONIZE, (xdrproc_t) xdr_void, (caddr_t) NULL,
+		    (xdrproc_t) xdr_int, (caddr_t) &result,
+		    timeout));
     return result;
 }
 
@@ -969,20 +969,7 @@ cudaError_t cudaLaunchKernel(const void* func, dim3 gridDim, dim3 blockDim, void
                args[j],
                size);
     }
-    cuda_launch_kernel_1_argument arg = {
-        .arg1 = (ptr)func,
-        .arg2 = rpc_gridDim,
-        .arg3 = rpc_blockDim,
-        .arg4 = rpc_args,
-        .arg5 = sharedMem,
-        .arg6 = (ptr)stream
-    };
-    struct timeval timeout = {.tv_sec = -1, .tv_usec = 0};
-    // We call the RPC explictly, so we can set the timeout to infinite
-	retval_1 = clnt_call(clnt, CUDA_LAUNCH_KERNEL, (xdrproc_t)xdr_cuda_launch_kernel_1_argument, (caddr_t)&arg,
-		                 (xdrproc_t)xdr_int, (caddr_t)&result,
-		                 timeout);
-    //retval_1 = cuda_launch_kernel_1((uint64_t)func, rpc_gridDim, rpc_blockDim, rpc_args, sharedMem, (uint64_t)stream, &result, clnt);
+    retval_1 = cuda_launch_kernel_1((uint64_t)func, rpc_gridDim, rpc_blockDim, rpc_args, sharedMem, (uint64_t)stream, &result, clnt);
     if (retval_1 != RPC_SUCCESS) {
         clnt_perror (clnt, "call failed");
     }
diff --git a/cpu/cpu-client.c b/cpu/cpu-client.c
index 07544637..d3ca92bd 100644
--- a/cpu/cpu-client.c
+++ b/cpu/cpu-client.c
@@ -246,12 +246,6 @@ void *dlopen(const char *filename, int flag)
             LOGE(LOG_ERROR, "file does not contain a kernel");
         } else {
             LOGE(LOG_DEBUG, "file contains a kernel");
-            int result;
-            enum clnt_stat retval_1;
-            retval_1 = rpc_dlopen_1((char *)filename, &result, clnt);
-            if (retval_1 != RPC_SUCCESS) {
-                LOGE(LOG_ERROR, "error calling rpc_dlopen");
-            }
             cpu_utils_parameter_info(&kernel_infos, (char *)filename);
         }
         ret = dlopen_orig(filename, flag);
diff --git a/cpu/cpu-server-driver.c b/cpu/cpu-server-driver.c
index d86d57d5..4781d144 100644
--- a/cpu/cpu-server-driver.c
+++ b/cpu/cpu-server-driver.c
@@ -92,9 +92,6 @@ bool_t rpc_elf_unload_1_svc(ptr elf_handle, int *result, struct svc_req *rqstp)
     return 1;
 }
 
-
-
-
 // Does not support checkpoint/restart yet
 bool_t rpc_register_function_1_svc(ptr fatCubinHandle, ptr hostFun, char* deviceFun,
                             char* deviceName, int thread_limit, ptr_result *result, struct svc_req *rqstp)

From 74179ef74d726891bfffaa74f5a75512a2550d82 Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Thu, 9 Mar 2023 09:36:26 +0100
Subject: [PATCH 13/83] make cpu_utils_contains_kernel return the right value

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 cpu/cpu-client.c | 9 ++++++---
 cpu/cpu-utils.c  | 2 +-
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/cpu/cpu-client.c b/cpu/cpu-client.c
index d3ca92bd..b47b6e6c 100644
--- a/cpu/cpu-client.c
+++ b/cpu/cpu-client.c
@@ -248,9 +248,12 @@ void *dlopen(const char *filename, int flag)
             LOGE(LOG_DEBUG, "file contains a kernel");
             cpu_utils_parameter_info(&kernel_infos, (char *)filename);
         }
-        ret = dlopen_orig(filename, flag);
-        dlinfo(ret, RTLD_DI_LINKMAP, &map);
-        LOGE(LOG_DEBUG, "dlopen \"%s\" to  %p", filename, map->l_addr);
+        if ((ret = dlopen_orig(filename, flag)) == NULL) {
+            LOGE(LOG_ERROR, "dlopen failed");
+        } else {
+            dlinfo(ret, RTLD_DI_LINKMAP, &map);
+            LOGE(LOG_DEBUG, "dlopen \"%s\" to  %p", filename, map->l_addr);
+        }
         return ret;
     }
 }
diff --git a/cpu/cpu-utils.c b/cpu/cpu-utils.c
index 3043548d..6f1ea16e 100644
--- a/cpu/cpu-utils.c
+++ b/cpu/cpu-utils.c
@@ -534,7 +534,7 @@ int cpu_utils_contains_kernel(const char *path)
     LOG(LOG_DEBUG, "child exit code: %d", child_exit);
  out:
     free(line);
-    return (ret != 0 ? ret : child_exit);
+    return (ret != 0 ? 0 : child_exit);
 }
 
 int cpu_utils_parameter_info(list *kernel_infos, char *path)

From 36ead03e507b33498d67990f203e645399e72d76 Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Thu, 9 Mar 2023 15:57:26 +0100
Subject: [PATCH 14/83] add __cudaRegisterVar client stub (logs only)

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 cpu/cpu-client.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/cpu/cpu-client.c b/cpu/cpu-client.c
index b47b6e6c..c47b9343 100644
--- a/cpu/cpu-client.c
+++ b/cpu/cpu-client.c
@@ -278,11 +278,14 @@ int dlclose(void *handle)
     }
 }
 
-// void __cudaRegisterVar(void **fatCubinHandle, char *hostVar, char
-// *deviceAddress, const char *deviceName, int ext, size_t size, int constant,
-// int global)
-// {
-// }
+void __cudaRegisterVar(void **fatCubinHandle, char *hostVar, char
+                       *deviceAddress, const char *deviceName, int ext, size_t size, int constant,
+                       int global)
+{
+    LOGE(LOG_DEBUG, "__cudaRegisterVar(fatCubinHandle=%p, hostVar=%p, deviceAddress=%p, "
+           "deviceName=%s, ext=%d, size=%zu, constant=%d, global=%d)\n",
+           fatCubinHandle, hostVar, deviceAddress, deviceName, ext, size, constant, global);
+}
 
 void __cudaRegisterFunction(void **fatCubinHandle, const char *hostFun,
                             char *deviceFun, const char *deviceName,
@@ -292,14 +295,13 @@ void __cudaRegisterFunction(void **fatCubinHandle, const char *hostFun,
     ptr_result result;
     enum clnt_stat retval_1;
 
-    printf("__cudaRegisterFunction(fatCubinHandle=%p, hostFun=%p, devFunc=%s, "
+    LOGE(LOG_DEBUG, "__cudaRegisterFunction(fatCubinHandle=%p, hostFun=%p, devFunc=%s, "
            "deviceName=%s, thread_limit=%d, tid=[%p], bid=[%p], bDim=[%p], "
            "gDim=[%p], wSize=%p)\n",
            fatCubinHandle, hostFun, deviceFun, deviceName, thread_limit, tid,
            bid, bDim, gDim, wSize);
 
-    kernel_info_t *info =
-        cricketd_utils_search_info(&kernel_infos, (char *)deviceName);
+    kernel_info_t *info = cricketd_utils_search_info(&kernel_infos, (char *)deviceName);
     if (info == NULL) {
         LOGE(LOG_ERROR, "request to register unknown function: \"%s\"",
              deviceName);

From babb70c6029621039facaed2ba40e7b1e0214fab Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Thu, 9 Mar 2023 16:03:44 +0100
Subject: [PATCH 15/83] add gdb commands file for debugging client apps

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 tests/gdb_client_cmds | 3 +++
 1 file changed, 3 insertions(+)
 create mode 100644 tests/gdb_client_cmds

diff --git a/tests/gdb_client_cmds b/tests/gdb_client_cmds
new file mode 100644
index 00000000..825cfdd6
--- /dev/null
+++ b/tests/gdb_client_cmds
@@ -0,0 +1,3 @@
+python gdb.execute("set environment CRICKET_NOHASH=yes")
+python gdb.execute("set environment REMOTE_GPU_ADDRESS=localhost")
+python gdb.execute("set environment LD_PRELOAD=../../cpu/cricket-client.so")
\ No newline at end of file

From d1f61732a9b9f4aa6df116412dfe3fad69788895 Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Fri, 10 Mar 2023 14:17:29 +0100
Subject: [PATCH 16/83] fix cpu_utils_contains_kernel and
 cpu_utils_parameter_info returning the wrong value

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 cpu/cpu-client.c | 3 +--
 cpu/cpu-utils.c  | 4 ++--
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/cpu/cpu-client.c b/cpu/cpu-client.c
index c47b9343..15b5cd42 100644
--- a/cpu/cpu-client.c
+++ b/cpu/cpu-client.c
@@ -242,11 +242,10 @@ void *dlopen(const char *filename, int flag)
         return dl_handle;
     } else {
         LOGE(LOG_DEBUG, "request to dlopen \"%s\"", filename);
-        if (cpu_utils_contains_kernel(filename) == 0) {
+        if (cpu_utils_parameter_info(&kernel_infos, (char *)filename) == 0) {
             LOGE(LOG_ERROR, "file does not contain a kernel");
         } else {
             LOGE(LOG_DEBUG, "file contains a kernel");
-            cpu_utils_parameter_info(&kernel_infos, (char *)filename);
         }
         if ((ret = dlopen_orig(filename, flag)) == NULL) {
             LOGE(LOG_ERROR, "dlopen failed");
diff --git a/cpu/cpu-utils.c b/cpu/cpu-utils.c
index 6f1ea16e..0445bb2e 100644
--- a/cpu/cpu-utils.c
+++ b/cpu/cpu-utils.c
@@ -534,7 +534,7 @@ int cpu_utils_contains_kernel(const char *path)
     LOG(LOG_DEBUG, "child exit code: %d", child_exit);
  out:
     free(line);
-    return (ret != 0 ? 0 : child_exit);
+    return ret == 0 && child_exit == 0;
 }
 
 int cpu_utils_parameter_info(list *kernel_infos, char *path)
@@ -634,7 +634,7 @@ int cpu_utils_parameter_info(list *kernel_infos, char *path)
     LOG(LOG_DEBUG, "child exit code: %d", child_exit);
  out:
     free(line);
-    return (ret != 0 ? ret : child_exit);
+    return ret == 0 && child_exit == 0;
 }
 
 void kernel_infos_free(kernel_info_t *infos, size_t kernelnum)

From 9cb6aafa966438052a58ecfb508d400315a08d6c Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Fri, 10 Mar 2023 15:07:24 +0100
Subject: [PATCH 17/83] make cpu_utils_launch_child also redirect stderr of
 child processes to pipe

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 cpu/cpu-client.c | 7 +++----
 cpu/cpu-utils.c  | 3 ++-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/cpu/cpu-client.c b/cpu/cpu-client.c
index 15b5cd42..e31e8683 100644
--- a/cpu/cpu-client.c
+++ b/cpu/cpu-client.c
@@ -225,7 +225,7 @@ void *dlopen(const char *filename, int flag)
 {
     void *ret = NULL;
     struct link_map *map;
-    LOG(LOG_DEBUG, "intercepted dlopen(%s, %d)", filename, flag);
+    LOG(LOG_DBG(1), "intercepted dlopen(%s, %d)", filename, flag);
     if (dlopen_orig == NULL) {
         if ((dlopen_orig = dlsym(RTLD_NEXT, "dlopen")) == NULL) {
             LOGE(LOG_ERROR, "[dlopen] dlsym failed");
@@ -241,11 +241,10 @@ void *dlopen(const char *filename, int flag)
         }
         return dl_handle;
     } else {
-        LOGE(LOG_DEBUG, "request to dlopen \"%s\"", filename);
         if (cpu_utils_parameter_info(&kernel_infos, (char *)filename) == 0) {
-            LOGE(LOG_ERROR, "file does not contain a kernel");
+            LOGE(LOG_DEBUG, "dlopen file \"%s\", but does not contain a kernel", filename);
         } else {
-            LOGE(LOG_DEBUG, "file contains a kernel");
+            LOGE(LOG_DEBUG, "dlopen file \"%s\", contains a kernel", filename);
         }
         if ((ret = dlopen_orig(filename, flag)) == NULL) {
             LOGE(LOG_ERROR, "dlopen failed");
diff --git a/cpu/cpu-utils.c b/cpu/cpu-utils.c
index 0445bb2e..be9a460b 100644
--- a/cpu/cpu-utils.c
+++ b/cpu/cpu-utils.c
@@ -332,6 +332,7 @@ int cpu_utils_launch_child(const char *file, char **args)
         return -1;
     } else if (pid == 0) {
         while ((dup2(filedes[1], STDOUT_FILENO) == -1) && (errno == EINTR)) {}
+        while ((dup2(filedes[1], STDERR_FILENO) == -1) && (errno == EINTR)) {}
         close(filedes[1]);
         close(filedes[0]);
         char *env[] = {NULL};
@@ -631,7 +632,7 @@ int cpu_utils_parameter_info(list *kernel_infos, char *path)
  cleanup1:
     close(output);
     wait(&child_exit);
-    LOG(LOG_DEBUG, "child exit code: %d", child_exit);
+    LOG(LOG_DBG(1), "child exit code: %d", child_exit);
  out:
     free(line);
     return ret == 0 && child_exit == 0;

From ac36e855a2fe292a89838a997b0a7a3580be116e Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Fri, 10 Mar 2023 15:30:35 +0100
Subject: [PATCH 18/83] reduce debugging output verbosity and add some NULL
 checks

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 cpu/cpu-client.c | 15 +++++++++++----
 cpu/cpu-utils.c  |  5 +++++
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/cpu/cpu-client.c b/cpu/cpu-client.c
index e31e8683..a3a5e340 100644
--- a/cpu/cpu-client.c
+++ b/cpu/cpu-client.c
@@ -225,7 +225,14 @@ void *dlopen(const char *filename, int flag)
 {
     void *ret = NULL;
     struct link_map *map;
+    int has_kernel = 0;
     LOG(LOG_DBG(1), "intercepted dlopen(%s, %d)", filename, flag);
+
+    if (filename == NULL) {
+        LOG(LOG_WARNING, "dlopen called with NULL filename");
+        return ret;
+    }
+
     if (dlopen_orig == NULL) {
         if ((dlopen_orig = dlsym(RTLD_NEXT, "dlopen")) == NULL) {
             LOGE(LOG_ERROR, "[dlopen] dlsym failed");
@@ -241,16 +248,16 @@ void *dlopen(const char *filename, int flag)
         }
         return dl_handle;
     } else {
-        if (cpu_utils_parameter_info(&kernel_infos, (char *)filename) == 0) {
-            LOGE(LOG_DEBUG, "dlopen file \"%s\", but does not contain a kernel", filename);
+        if ((has_kernel = cpu_utils_parameter_info(&kernel_infos, (char *)filename)) == 0) {
+            LOGE(LOG_DBG(1), "dlopen file \"%s\", but does not contain a kernel", filename);
         } else {
             LOGE(LOG_DEBUG, "dlopen file \"%s\", contains a kernel", filename);
         }
         if ((ret = dlopen_orig(filename, flag)) == NULL) {
             LOGE(LOG_ERROR, "dlopen failed");
-        } else {
+        } else if (has_kernel) {
             dlinfo(ret, RTLD_DI_LINKMAP, &map);
-            LOGE(LOG_DEBUG, "dlopen \"%s\" to  %p", filename, map->l_addr);
+            LOGE(LOG_DEBUG, "dlopen to  %p", map->l_addr);
         }
         return ret;
     }
diff --git a/cpu/cpu-utils.c b/cpu/cpu-utils.c
index be9a460b..fb847850 100644
--- a/cpu/cpu-utils.c
+++ b/cpu/cpu-utils.c
@@ -553,6 +553,11 @@ int cpu_utils_parameter_info(list *kernel_infos, char *path)
     char *kernelname;
     struct stat filestat = {0};
 
+    if (path == NULL) {
+        LOGE(LOG_ERROR, "path is NULL.");
+        goto out;
+    }
+
     if (kernel_infos == NULL) {
         LOGE(LOG_ERROR, "list is NULL.");
         goto out;

From 07ed9312002df601c64b2b4ba39d1f2174316c72 Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Fri, 10 Mar 2023 15:36:23 +0100
Subject: [PATCH 19/83] make dlopen return a handle to the main program if it
 is called with a NULL filename

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 cpu/cpu-client.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/cpu/cpu-client.c b/cpu/cpu-client.c
index a3a5e340..cb725b52 100644
--- a/cpu/cpu-client.c
+++ b/cpu/cpu-client.c
@@ -229,8 +229,7 @@ void *dlopen(const char *filename, int flag)
     LOG(LOG_DBG(1), "intercepted dlopen(%s, %d)", filename, flag);
 
     if (filename == NULL) {
-        LOG(LOG_WARNING, "dlopen called with NULL filename");
-        return ret;
+        return dlopen_orig(filename, flag);
     }
 
     if (dlopen_orig == NULL) {

From e9b2c1c1834bedf815e5292e9eaada7f97fb5ea9 Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Fri, 10 Mar 2023 16:07:21 +0100
Subject: [PATCH 20/83] fix ci error by making tests/cpu/cubin/main.cpp compile

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 tests/.gitignore         |   1 +
 tests/cpu/cubin/main     | Bin 23184 -> 0 bytes
 tests/cpu/cubin/main.cpp |   7 ++++---
 3 files changed, 5 insertions(+), 3 deletions(-)
 delete mode 100755 tests/cpu/cubin/main

diff --git a/tests/.gitignore b/tests/.gitignore
index d5dbc051..c08de5b9 100644
--- a/tests/.gitignore
+++ b/tests/.gitignore
@@ -3,3 +3,4 @@ test-cricket
 test_api
 test_cpu
 test_kernel
+test_kernel_call
diff --git a/tests/cpu/cubin/main b/tests/cpu/cubin/main
deleted file mode 100755
index f10f1de59057402075999a006cae4f2e97bf76cb..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 23184
zcmeHPe{dYteSdfI582p~3^o`;;)Plr8{)HMY$NldxwC!pD#*65C1QS<)A?>?9X{PD
zcY8$6j2ky1G#6p;Oq-OZwBx2TnIz30Zdx+NY4H&gOfpRBBrb6aWYm-dDUeqD2oc0^
z{e0iP@80g-ZbF-oq*J|-?z`X5_s4th``*5N`*!zzG~V4A4uu4#u=oQ(+|<0Ngsfj7
zj_j1I!aBq}5fz$PF6IE4h5x9O5Vek&PAk_;8<k!IDA{#{EubSDl?O~Ow_w4POGtFK
zl366GIE%JiVzEpGN=b?AsI14{Lmbh57kx>UqlF44lO}%&CcBEVt0+6BnzCcc^`pAc
zf3K4FhNIw!j}j4E<Zq3VE~TCynrS+oS}a-6s61erwP3-N?QVu0mGhmZ7NxjT<rfr}
z9#wv(+zxB9nY|m<t;wcWXEV9d$m)@f4XZb-ixl&bwX)vipLDlw?~^%ESd}X&r|J<#
z^6y^r+0PE#{=$o`AAaD9tMlI-dG49X<D|O;|J-i!K)E1%<11s_HNe%UoiMzx9u|)R
z^TWRpfZrP6=WKxf`2ai`;D@@EUwO6#;8z9UO#%4L0r|!Q@L~AT@b66<0r-{kKLhZG
z1MoyZzV|_2!@oEE1pv(pg8u;gEE3DaQLSG}`MeY8m#I=ZNj<S$q)OZJ=~C9{%VqP)
zw9R%~On0)BOAX#+6mmw^mTfi6PQYd+p92OiUMS=XyUao+w~w6{OzFNo(k2pJz1tFL
zqhRdI6iuVhyKQqepEG)sd$Wc}B=!yEbBUsvESQOerRu9l1@0^u1~N=1w<WXLe9AgB
zhEl@^;D_+O9NTs#%}tqHvT#5u?J~?Q#@(5eWnEw3qM1%zeKi$$1C@WsKH8T`C5l#@
zC}<XLjATqPTr!ISc1EEf28K~{b3hcchB1tykgBTEP%@JfTf4h9ZBDF>tc|Rvh6v-V
zZm@r4(@4}h_br*FXURe$B<{o^jTajJ$5%gtVWB}{U?B~O4QOi~uS&}@^C*`O0i!g&
zykAlp`{*3tG~vLV{dUTMU&P9w(+*q>y;92ki>^5|?r>VF`rTXrswh&VIdC5DNU+j@
zqk?v7bKpFdld!{qJFj<92ksp2bq7vm;?(27?Q;ae-3}Z9wNt+XpKqf=3_9=&9Qd#U
zU*Nz;9QZ;9e#n70JMd39aBgc7A93J|9Qv;|zxPdj^rc39Z1$>4G@*aIVuq_{^wGx~
zPdT)NsII>f;%e&({5CI-;)wV`(wQ8uBDF3fo|<s-jN~sQo|<m5BKi5mHxPeJ@?qks
zsV0v~{>_WQQxi=dk^C#fQ`1ZylKhW}rzV*kmi!Nhr>2<fm;ApGPfak{Bl+(TPobZT
zO8%R~Q^+UVB>y$yDb$mi<WCS!A)aiJ{FjNRQ(;m_{)@y@s3)i1u2v~-?LX-%fBR0o
z`~!XT`Kg}XuG5vamZ;EAKUQfba{7*ih|`j9F44qmM@awFK%{y3J@A0$&=233fkXZ9
z?7c)bPMb^61fPa3HNm{<c=Pf@q<BhE=x>zz>(>%l^|t=l+YNeoN`GwP{d(vL{o8Mw
zi{W5MIcTho4>T{|QnRQ058e0{#G-UjpI-jq#79u8CuYA0B(&$$ky8V;!cqH@{)?X^
zMR@&Z@>)JUu@x0RvgZ_`yU=Mm^zy+c^>V4Am&f(;DSa$n(MKmjdN~gEYx?NdLS5zW
z#J)<^u=z8wM<hR53F(Iyf1HkwQ$Ll4p)-%r@d-lppS`T_ue9i^9w+akl?K!_tUop$
zhWm3eD`k#j6yj+rUk|!hIevEZ;91f9smeVSbMaXG?1>fd_UIp>WuWmDXy~Wo6EcUf
z_=J9ayi!~W7Zc^j@C(O%6XgX*^wIc4=*92XWiux9vC_FmsFzcIm%LT2*5~&=M}D1@
zU-{W1`q-Z7Z`gs;5xC<B+%Y?F$5i0X*@1dj0(S;AI>vEFRgQx{^9Y@!R2}_1Rp+VE
zgHP4M_tc3Ej_Q0Es!zmU2zf$ythj+3QSioQp{{)sX#0(4%u9hzl%F{HNd*5m+=z*<
z{~Vnz{sNkt0$e$C@C9McKlv%hC+<S-&_`TOPux$(uCSrI^_LHR2|+u0QhK^w7WVw;
z!Sl7ko<Gs;DC{@UOjJ+}7?nvs9G_Ah<vKLn$-kytUqgW)7=QH3;3(@q0Z@)lj~<+^
z`JF!TL5JUF@;kO|9A@WEoRntYBFjg7EpK;N{tJ#z(hxWClQ*l?)A6%3{K3sx^l3B;
z6%X}4jBZ)RsZ{Hf7@zNgUf;2ucZKh2yekxwJup^&QqmLD9UtXB`6@hP_-?DOLz2|g
zd2kbDOfea=W0F8jz>ZT@9)%OxU{;?+@LSMjrxE=rSm8g8V|mKWf~Kvcl@GoIo$)d{
zz`>V<siVx(V{u?l!)X_lZ0z9lSm`BwZ1(;|ni$)L0`!e{mEVY!rykvHwSd)8x}kiM
z>NN4n8yIx#ELzG3&r>nVeQ2Wd!i<i^&%^gWK?wyPE1iey7U_DLO6uqOW$0S2|IBjD
z<6Z27UFGk^Zs{t&9_x#h--`8iA8tLnP!oOn;nhTUb+7s%T`MO}Pb0U--fA$Hw?CuC
z=kD^G-R0M}l>aMMUHq&*dMc!Ee6I9kx{ltyCw50{PwdWE!a2su>jA1qlzj3PH)P-4
zL5S2^T-lmgrCq0O?%NX6c9wExX2{TDJzW|mmHC3!n!ZkJEk@=GnnXrY!@~l&mNX%l
zFN&p9$|x2GO4%!fX%tONy|q+nFBBsYv0u*DBa&AP^6U9YANmtUVd|PlVG#;NK`R)@
zoM|p*Yt)H2Ip-9N5&fvOE^-aerNX##EIAd_aa2w6Zkm0Qh)OJq^#XK&NO7=e7R=;c
z95FpGWbu+Fii5fIh=}Cwj%0EJd3lh++<#voIXo!2;X>Zbn+Jvs$rX%jBx&83yt~eQ
zYFUHU;;?~(T!Y1Si92ViH6cwyh{IJ_Qk<|=yA8tO#+LbOu7p@kPa73&QFN)InxZQe
zZBw*EQbb%)%N~6}@s@o^SoR@d*@uK>9}<>5no;V9g=HUB_F-Y!hn0O;*@u;VSlOfe
zi|GQ2iQEE^Updhb;_155XH<FKqbL*7cP5%J`q}@bQu<}~bZ?}(w$rVBv!g4NbSNJg
zJkX^Ir+$Jm4J+0rx+*B^aT(QVYx|-8ggVsO_bdBx+oN^B`7`~y&3ikkY)@ivrgU8E
zmo#))!}VDlQ6o*gTCE-<^rzM8CqUXj9|HXm=sD2ypdH9>{m-h^J3zA-J`aITV)#7@
z+Jv!w8hRfBT?+mTXazKi3u>bZ-eSlpg+8)VghpCImo1pvcmz5aspQ(`@N3oT8dBIY
zw}r4a{ND$R&m@`zZoz*C%Gv@u(bUq}wDbeb7ks$!ka+*H>#w?I#g#y%FRIrZl=)%;
zVofdghBq&qb2Bnj^7}xWQ1+Bl9==m5Tn_9@_`ex)y7b%j;aj9!1N0RB{}E-|NabMv
z@zI8cH6dGP3ZieL4A**fR$Drh&tk})f4y2g=+tQ#kqRVV2Rw#Yx8m5&r{Agv#iAGT
zMwHVad1q701L1hn()%0YP1@mE@urpc&DNXR?wzAIb&Ss4+7!(-b;O$5VofVIHEElg
zmTqcl+0@i1<Npx+w80O@n%vE#nF!28U?u`H5txa<?_vbb%#X@q9aWSrp_IIo*3>C6
zPB;Hf{=FW4rH7|AcS;K^B91pk<q@ytcBpyPrAp4vQ!i6I%{eG7vxqpZo}1G3of6A`
z`gS!>{Da{r5&PNhnOPRkYuuHv#q*l9oI40PN5pS!-sxPUC!FV(Le2lwv`idQ_2c;&
z-{)OIMo1lVqmnXSnPc%>?hz@K^L=@=RPngSk@Y#G^l_^rdCqrA@!OSLnf=xXKgaeK
z;C0pBbLK^*=wd~GU(pSUb}D*{q63QFt?0dqep=DbDf$&fpHTE^MSr9yWx93q=IgYT
zctX5N>xit6v}tSG*0#5;Z(paa+-amW9jgTrU%hUXpngH~XX@}<V=F?}3~#)a?m?ET
z^Tej4L&?&cT`1*pw7h_aVV+E2YiZiL$y_dvMF?Nr3tYNaZQb!f@x{l{=-hv;{H~JS
zpxLE+1)dM4_9s%=WN}ak`<aw1-&LuB1g!!X)_p%PKL3=z&K)~@yE<<b^1fQm`^mSo
zy;?q*+r2vx+qEkWtsUEULF6gVTslAO`LDFQLUwVFul!KPObir~Lq=jakLQ6#q2AY=
z3xV0|DV8s*ZUrNm&FsrT9N13$z6n!r>}T3(rk!TaCpA{goKG|3-nyCV6J1Z#weDTN
zK51$m(aQ4)nK|dbVrCwJQ2Zw65j0Pt`5wKxx;0imPx7`G&zqFK@AzTK{O3(U^Glwu
zFrAsVpiwAU?J_gJoVgFc`8hLhnYmx%oA;S{OKp6-`qwsZp__Ks`bV<e{)U6sKKyi8
z#?()T=R8dPba;L6TWa0NPlwkGqiXHYPp55uREFniMT4%p^r|AI^~%?8`?^c(DDT3$
zj$AMDti`;ebsaMNU7C+^d$p?e3Yd?%>idi4W4~<v$N6yo_*L@@>W4h<V>;6h>8wOj
z^S|G8Kb&bNA^tCn7c=cN(@xeHv10bG<GITJ_v^3UT#B!KO#in!Jg56^V^j|9etvjP
z7gyJJsTlCnfQIh~N9&~Y-B_<L-XKh0e3t#(*y*eRGoOmD{v5&2mEHPth2Qho20W$a
z=eTbDd4it@yYcz<b7MDtf&Kj4jV}=6DyD9Hp*?qZ<4uC+&~Ch0=st08z#Rd<|LE3V
zB+f04Iw5)7fG<4xJ7cG?0Z%3Sm-y)~w!fox>o2jN_q*|n@f~%)6OzXb;u8BiKBusO
z-X>Mg?cMmL;z3`$_C2UupWjMwqV#US<)We;b%B_+h*{$0stW{jBLU%s_VZ+xZwtVa
z3g>wt>&th@;m3;~4$%Kx0DdX}e<lF`SpdF(D(q99R)zCADw$KdQQ>?qXrV&f8sMiC
zfPX3g|4IP<OaT7#0DL)3JbmhQLjb-<;okb*qi}D1zX05?pFFJe`F@Av^9_abSj_mp
zE8N>3-Uz^#(1p&YUTXvJT>*GL0Dm9=|9k-cb%lHD`@F*WK8))dnq`&Wi)#w^#^*-h
zesR`;hs8p{>jmup0}Ag~>j#YQQTS00UIZS(TS}kT`lTv$CrTfac>Or`1?7j|!?Lu6
zcv$M!kB9WBlwbM3FZGv+V=7;|rc#<#{(1d_=kgcfW}eE)@3FJ~8hMjnKh9q#^=soi
zzn`=NxL<j81E+ZYYg1H~qgDRmd=Du90qOrDvEUL*U#Uyt=l_qPPdf;BuP>jGe(J9a
ze<Sr5*Ih^cLE-$q8RvUi`l-KO9KZ`)7qXIrJ_r34ajAGn#h*oYLSF;!y}mpT{Y6;e
z;p;5IBGaqDuSDG5R&h%y{5^Ql#xLLNCB9J5H@1}cJ+>(DMd4=A<GGGsiiW;0Tz}o)
zMt(33@OXt{$YccMI~st0Iso4*<5s&)tx^6TCOlj}pE)J{EEF%P^Gkjs0GsatUlhL7
z-lsvi5&sd8@4SHax-0-+3EXcVh#La*w*}yL0H^-oZFl<Cpath$#nVqR0e(gT@Wayo
zvbz5E=K=bEEAfSb$5lfG?yJD5za^Wa5|Q6(;OIHvnnc_`Zzw;l3!+jWzstZ;GxaO@
zp)bEFasOE^aS6G8t_i?<1MuMh`~l#a%d^G&RRAv5l!^t~Y-SDK&|8CDW)d6Ltu7ep
z!K7*JJhP@_O(atLgC@46NfpfY2tKJ(cb17vE@MVgUOr*u(q1ewm^Zx`>G|Q2QZAd>
zZ)6Yn8UzXPIUx1}DGmD?1;`?DTMkckWU<@=n46hY0-p-zQ_lLjad*AlGIu@Qvidl<
z<v#InD@X_Rad9iv$Hy&~P5`-M=+^PX%Po`++;K~!vsTcOra_yVP!NYRhE<eWLj|c(
zKQgidt7SLK@2VL9tm}I<j0)nX>uv+Levr&3_8!CfxYrcwV_uW_$Gax?#JVQ%i?g+T
zOsZJ2!Xlfo76P@qj4gJ>#~~a*pzGSE=30u$7Q;Z%EMbR;l&EddlQ4(yvMo00DPlL9
zbUv{!o8OzvCemiUP)sCCBO;X_8qOM~k&axucH??~r3B@XNEQmo1E{8nt$3`LW7DOf
zA=<FV!O5DsRIy{$eiXm7S0rxhUfX7E?}JT!Hq$50dNP-mdk4Aq{INFt=`%A~X|&cd
z>cN=~XGR*<uq}!KNw*};B&oHpM@f=ud!Sxp#H@X7ow%*Nt+v}xS0=uZIJwtRPfu@R
z2zBV(8QT_5#J6vuy^9if4LcFn6{v67DH2<5-5%T4wHY$%_jV1_Zoq2d#MbT|n_}IG
z9i5%K;=PI9*rx6{yc%g)(ykpi>C%~8qEs|sep~ys3B2}<*NiuFnTqY}?D8StrJ<gi
zQW<NHB)MHuG)nstLEEu4liZ#NO<c?;231h~PjkzU+-u3tkKFkv8g*`T6u2XkpFQf+
zK3p({lLaF&WDNE6WGK4WDN4nbitXOE$<GT7Snfjb=2mhyD{HqVuY%hx>gn3@3b8@>
zt<9Vub8VHSwtaFtOeFRei%OsNi1JFQ4HA6zQahsm$#&dn?V?q5bTVqcU$}YJZD{gA
z18>F;U}r2DI2kuP4>|+wQlp}dUAxjrds8gGP*T8YyDdLi-3C|QI=;g#z3_=Xf9=m-
z|J(Zk$oJ=Q{m<dW{zz<7*J?AlPjUNlrO4h=2HWaoaA_cJkUk5Dqz~la&!T3*lH6?+
zinL3aR8vd>GK>jHGEhguSxji?3>6WRhJc6+T%?ef=N`5npdkW-2W>KdVFUxsE@R&i
z1~m~e2Gu||n1)~GEN|BNZh7NlWSShxV4k&)CQ<l_$jq&b&~-!vW2@!bu4p7_VsEv*
zCG7LyGcEidL$uO|DLd|Et;Flhyw4<mf271xmye159{w?9w(WU6n)h+kAaf?Z4`i$z
z2Tprlw>DLQR<NR?Ui&S;@o99P%l}|FDm5SU*z<fH(`Hvv-_6_QuY{vgUQzZfisJT5
z)}8Hnec%@04fyB$dA<CMDk$HtN~g*P9}}hT&gh#bw&!*A?UW(X`z&JLZ?GQI{gBf)
zRLmcm6O|T+l)c4S-?5MmmtQL3=Og&1qOyHO*;kbPh*I>%kL!Prvfrfi_&cx;YE+~{
z)VYwzo@_XOw!a@3si^Gib^qf^(Yq2&Wp$=97^giy9~f8mat{E9fqdr&Jkvk**z>yL
z6zwpE6z76jtZk=Q@fjRA^XKOc9V!EUkAn5R<v*(IIsUv}qAUA9WJ5(cehh!!W6#eo
zo>cZNo(#SAfA6v9=N#u2TRCt$aeiL=$36DEUd8*dX4z0tPjSR|JofzDq+flQcue&l
zcH_<eSqPf&Pv;;dUgzTdS-twqGo649y@$m1{N3g8_mH<*f3>(8&-SlDKwX6G`FRcR
zf2*m6<!;1w-2dK!Eme{2d4I-n^<s<$nKQ9H(=d#k_V)81&9ZvXX^dmGV|szdp4W38
zRQBHXqP(1G5diYV<v*(ROYuXRBtj93!LOIU820#+`*_$&IAD7o|5!%4l$b8Zff{Ng
z&d;Zw)Fdm8cm@!^`tZ4DSy;cjZ?LGoz@qO3p|vB98=n_6OmN*TP3!llZc!e`SjOdM
Up1%L{v)_HQrLo>);9<po1KzUVTL1t6

diff --git a/tests/cpu/cubin/main.cpp b/tests/cpu/cubin/main.cpp
index c8b815e8..5b04b744 100644
--- a/tests/cpu/cubin/main.cpp
+++ b/tests/cpu/cubin/main.cpp
@@ -66,8 +66,9 @@ void check_free_mem(int *mem, size_t len)
     cudaFree(mem);
 }
 
-int getModuleFromCubin(CUmodule **module, const char *cubin)
+int getModuleFromCubin(CUmodule *module, const char *cubin)
 {
+    CUresult err;
     if ((err = cuModuleLoad(module, "kernel.cubin")) != CUDA_SUCCESS) {
         printCudaErrors(err);
         return 1;
@@ -77,7 +78,7 @@ int getModuleFromCubin(CUmodule **module, const char *cubin)
 
 int getModuleFromShared(CUmodule **module, const char *cubin)
 {
-
+    return 0;
 }
 
 int main(int argc, char** argv)
@@ -96,7 +97,7 @@ int main(int argc, char** argv)
     CUmodule module;
     CUfunction func;
     printf("testing cubin...\n");
-    if ((err = getModuleFromCubin(&module, "kernel.cubin")) != 0) {
+    if (getModuleFromCubin(&module, "kernel.cubin") != 0) {
         printf("error\n");
         return 1;
     }

From dec25d1e1f7afdb53dd9a29b6d66387f07bb813a Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Fri, 24 Mar 2023 12:30:32 +0100
Subject: [PATCH 21/83] parse kernel parameter infos from in-memory elf using
 libbfd

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 cpu/Makefile              |  18 +-
 cpu/bfd_extracts.h        |  64 ++++
 cpu/cpu-client-driver.c   |   2 +-
 cpu/cpu-client.c          |  20 +-
 cpu/cpu-elf.c             | 616 ++++++++++++++++++++++++++++++++++++++
 cpu/cpu-elf.h             |  25 ++
 cpu/cpu-server.c          |   1 +
 cpu/cpu-utils.c           | 245 +--------------
 cpu/cpu-utils.h           |  15 +-
 submodules/Makefile       |   2 +-
 tests/test_apps/Makefile  |   5 +-
 tests/test_apps/matmul.cu |   4 +-
 12 files changed, 739 insertions(+), 278 deletions(-)
 create mode 100644 cpu/bfd_extracts.h
 create mode 100644 cpu/cpu-elf.c
 create mode 100644 cpu/cpu-elf.h

diff --git a/cpu/Makefile b/cpu/Makefile
index 3ebb1491..d2d0527b 100644
--- a/cpu/Makefile
+++ b/cpu/Makefile
@@ -7,6 +7,8 @@ CLIENT = cricket-client.so
 
 CUDA_SRC = /usr/local/cuda
 LIBTIRPC_PREFIX = ../submodules/libtirpc/install
+SUBMODULE_LIBS = ../submodules/lib
+BFD_INC_PREFIX = ../submodules/cuda-gdb/bfd
 
 CC = gcc
 LD = gcc
@@ -39,7 +41,8 @@ SRC_SERVER = $(RPC_XDR)                 \
 			 cr.c 					    \
 			 gsched_none.c 			    \
 			 oob.c 					    \
-			 mt-memcpy.c
+			 mt-memcpy.c				\
+			 cpu-elf.c
 
 SRC_SERVER_LIB = server-library.c
 SRC_SERVER_EXE = server-exe.c
@@ -55,7 +58,8 @@ SRC_CLIENT = $(RPC_XDR)                 \
 			 cpu-libwrap.c              \
 			 cpu-client-cusolver.c 		\
 			 oob.c 					    \
-			 mt-memcpy.c
+			 mt-memcpy.c				\
+			 cpu-elf.c
 
 # 			 cpu-client-driver-hidden.c \
 
@@ -71,12 +75,14 @@ OBJ_CLIENT = $(SRC_CLIENT:%.c=%.o)
 RPCGEN_FLAGS = -C -M -N
 INC_FLAGS += -I$(LIBTIRPC_PREFIX)/include/tirpc
 INC_FLAGS += -I$(CUDA_SRC)/include
+INC_FLAGS += -I$(BFD_INC_PREFIX)
 
-LIB_FLAGS += -L$(LIBTIRPC_PREFIX)/lib -L$(CUDA_SRC)/lib64
+LIB_FLAGS += -L$(LIBTIRPC_PREFIX)/lib
+LIB_FLAGS += -L$(CUDA_SRC)/lib64
 CC_FLAGS += -std=gnu99 $(INC_FLAGS) -O2
 # TODO: use extern in header files instead of direct definition e.g. in cpu-common.h to remove -fcommon flag
 CC_FLAGS += -fcommon
-LD_FLAGS = $(LIB_FLAGS) -ltirpc -ldl -lcrypto
+LD_FLAGS = $(LIB_FLAGS) -ltirpc -ldl -lcrypto -lbfd
 
 ifdef WITH_DEBUG
 # use ASAN_OPTIONS=protect_shadow_gap=0  LSAN_OPTIONS=fast_unwind_on_malloc=0 when running
@@ -94,9 +100,9 @@ ifdef WITH_IB
 CC_FLAGS += -DWITH_IB=$(WITH_IB)
 endif
 
-SERVER_LD_FLAGS = $(LD_FLAGS) -lcudart -lcusolver -lcuda -lcublas -lbfd -lrt -lpthread
+SERVER_LD_FLAGS = $(LD_FLAGS) -lcudart -lcusolver -lcuda -lcublas -lrt -lpthread
 SERVER_BIN_LD_FLAGS = $(SERVER_LD_FLAGS) -Wl,--unresolved-symbols=ignore-in-object-files
-CLIENT_LD_FLAGS = $(LD_FLAGS) -lbfd
+CLIENT_LD_FLAGS = $(LD_FLAGS)
 
 # Targets
 .PHONY: all clean
diff --git a/cpu/bfd_extracts.h b/cpu/bfd_extracts.h
new file mode 100644
index 00000000..1ce5f46e
--- /dev/null
+++ b/cpu/bfd_extracts.h
@@ -0,0 +1,64 @@
+/* DO NOT EDIT!  -*- buffer-read-only: t -*-  This file is automatically 
+   generated from "libbfd-in.h", "init.c", "libbfd.c", "bfdio.c", 
+   "bfdwin.c", "cache.c", "reloc.c", "archures.c" and "elf.c".
+   Run "make headers" in your build bfd/ to regenerate.  */
+
+/* libbfd.h -- Declarations used by bfd library *implementation*.
+   (This include file is not for users of the library.)
+
+   Copyright 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998,
+   1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009,
+   2010, 2011, 2012
+   Free Software Foundation, Inc.
+
+   Written by Cygnus Support.
+
+   This file is part of BFD, the Binary File Descriptor library.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston,
+   MA 02110-1301, USA.  */
+   
+#include <bfd.h>
+/* Extracted from bfdio.c.  */
+struct bfd_iovec
+{
+  /* To avoid problems with macros, a "b" rather than "f"
+     prefix is prepended to each method name.  */
+  /* Attempt to read/write NBYTES on ABFD's IOSTREAM storing/fetching
+     bytes starting at PTR.  Return the number of bytes actually
+     transfered (a read past end-of-file returns less than NBYTES),
+     or -1 (setting <<bfd_error>>) if an error occurs.  */
+  file_ptr (*bread) (struct bfd *abfd, void *ptr, file_ptr nbytes);
+  file_ptr (*bwrite) (struct bfd *abfd, const void *ptr,
+                      file_ptr nbytes);
+  /* Return the current IOSTREAM file offset, or -1 (setting <<bfd_error>>
+     if an error occurs.  */
+  file_ptr (*btell) (struct bfd *abfd);
+  /* For the following, on successful completion a value of 0 is returned.
+     Otherwise, a value of -1 is returned (and  <<bfd_error>> is set).  */
+  int (*bseek) (struct bfd *abfd, file_ptr offset, int whence);
+  int (*bclose) (struct bfd *abfd);
+  int (*bflush) (struct bfd *abfd);
+  int (*bstat) (struct bfd *abfd, struct stat *sb);
+  /* Mmap a part of the files. ADDR, LEN, PROT, FLAGS and OFFSET are the usual
+     mmap parameter, except that LEN and OFFSET do not need to be page
+     aligned.  Returns (void *)-1 on failure, mmapped address on success.
+     Also write in MAP_ADDR the address of the page aligned buffer and in
+     MAP_LEN the size mapped (a page multiple).  Use unmap with MAP_ADDR and
+     MAP_LEN to unmap.  */
+  void *(*bmmap) (struct bfd *abfd, void *addr, bfd_size_type len,
+                  int prot, int flags, file_ptr offset,
+                  void **map_addr, bfd_size_type *map_len);
+};
\ No newline at end of file
diff --git a/cpu/cpu-client-driver.c b/cpu/cpu-client-driver.c
index 06f908be..a7499e0e 100644
--- a/cpu/cpu-client-driver.c
+++ b/cpu/cpu-client-driver.c
@@ -373,7 +373,7 @@ CUresult cuModuleGetFunction(CUfunction* hfun, CUmodule hmod, const char* name)
         return CUDA_ERROR_UNKNOWN;
 	}
     *hfun = (CUfunction)result.ptr_result_u.ptr;
-    if ((info = cricketd_utils_search_info(&kernel_infos, (char*)name)) == NULL) {
+    if ((info = utils_search_info(&kernel_infos, (char*)name)) == NULL) {
         LOGE(LOG_ERROR, "cannot find kernel %s kernel_info_t");
         return CUDA_ERROR_UNKNOWN;
     }
diff --git a/cpu/cpu-client.c b/cpu/cpu-client.c
index cb725b52..21acc4a7 100644
--- a/cpu/cpu-client.c
+++ b/cpu/cpu-client.c
@@ -16,6 +16,7 @@
 #include "cpu-utils.h"
 #include "cpu_rpc_prot.h"
 #include "list.h"
+#include "cpu-elf.h"
 #ifdef WITH_IB
 #include "cpu-ib.h"
 #endif // WITH_IB
@@ -247,11 +248,11 @@ void *dlopen(const char *filename, int flag)
         }
         return dl_handle;
     } else {
-        if ((has_kernel = cpu_utils_parameter_info(&kernel_infos, (char *)filename)) == 0) {
-            LOGE(LOG_DBG(1), "dlopen file \"%s\", but does not contain a kernel", filename);
-        } else {
-            LOGE(LOG_DEBUG, "dlopen file \"%s\", contains a kernel", filename);
-        }
+        // if ((has_kernel = cpu_utils_parameter_info(&kernel_infos, (char *)filename)) == 0) {
+        //     LOGE(LOG_DBG(1), "dlopen file \"%s\", but does not contain a kernel", filename);
+        // } else {
+        //     LOGE(LOG_DEBUG, "dlopen file \"%s\", contains a kernel", filename);
+        // }
         if ((ret = dlopen_orig(filename, flag)) == NULL) {
             LOGE(LOG_ERROR, "dlopen failed");
         } else if (has_kernel) {
@@ -305,7 +306,7 @@ void __cudaRegisterFunction(void **fatCubinHandle, const char *hostFun,
            fatCubinHandle, hostFun, deviceFun, deviceName, thread_limit, tid,
            bid, bDim, gDim, wSize);
 
-    kernel_info_t *info = cricketd_utils_search_info(&kernel_infos, (char *)deviceName);
+    kernel_info_t *info = utils_search_info(&kernel_infos, (char *)deviceName);
     if (info == NULL) {
         LOGE(LOG_ERROR, "request to register unknown function: \"%s\"",
              deviceName);
@@ -333,9 +334,10 @@ void **__cudaRegisterFatBinary(void *fatCubin)
 
     mem_data rpc_fat = { .mem_data_len = 0, .mem_data_val = NULL };
 
-    if (cpu_utils_get_fatbin_info((struct fat_header *)fatCubin,
-                                  (void **)&rpc_fat.mem_data_val,
-                                  &rpc_fat.mem_data_len) != 0) {
+    if (elf_get_fatbin_info((struct fat_header *)fatCubin,
+                                &kernel_infos,
+                                (void **)&rpc_fat.mem_data_val,
+                                &rpc_fat.mem_data_len) != 0) {
         LOGE(LOG_ERROR, "error getting fatbin info");
         return NULL;
     }
diff --git a/cpu/cpu-elf.c b/cpu/cpu-elf.c
new file mode 100644
index 00000000..adf495fb
--- /dev/null
+++ b/cpu/cpu-elf.c
@@ -0,0 +1,616 @@
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+
+#include "cpu-common.h"
+#include "log.h"
+#include "cpu-elf.h"
+#include "cpu-utils.h"
+
+#include "bfd_extracts.h"
+
+#define uint16_t unsigned short
+#define CRICKET_ELF_NV_INFO_PREFIX ".nv.info"
+#define CRICKET_ELF_NV_SHARED_PREFIX ".nv.shared."
+#define CRICKET_ELF_NV_TEXT_PREFIX ".nv.text."
+#define CRICKET_ELF_TEXT_PREFIX ".text."
+
+#define CRICKET_ELF_FATBIN ".nv_fatbin"
+#define CRICKET_ELF_REGFUN "_ZL24__sti____cudaRegisterAllv"
+
+#define FATBIN_STRUCT_MAGIC 0x466243b1
+#define FATBIN_TEXT_MAGIC   0xBA55ED50
+
+struct  __attribute__((__packed__)) fat_elf_header
+{
+    uint32_t magic;
+    uint16_t version;
+    uint16_t header_size;
+    uint64_t fat_size;
+};
+struct  __attribute__((__packed__)) fat_text_header
+{
+    uint16_t kind;
+    uint16_t unknown1;
+    uint32_t header_size;
+    uint64_t fatbin_size;
+    uint64_t some_offset; //Compression related information
+    uint16_t minor;
+    uint16_t major;
+    uint32_t arch;
+    uint32_t obj_name_offset;
+    uint32_t obj_name_len;
+    uint64_t flags;
+    uint64_t zero;
+    uint64_t unknown2;
+};
+
+#define FATBIN_FLAG_64BIT     0x0000000000000001LL
+#define FATBIN_FLAG_DEBUG     0x0000000000000002LL
+#define FATBIN_FLAG_LINUX     0x0000000000000010LL
+#define FATBIN_FLAG_COMPRESS  0x0000000000002000LL
+/* Format the fatbin flag bits of `flag` into a newly allocated string in *str; returns asprintf's result. */
+static int flag_to_str(char** str, uint64_t flag)
+{
+    const char *bit64 = (flag & FATBIN_FLAG_64BIT) ? "yes" : "no";
+    const char *debug = (flag & FATBIN_FLAG_DEBUG) ? "yes" : "no";
+    const char *linux_os = (flag & FATBIN_FLAG_LINUX) ? "yes" : "no";
+    const char *compress = (flag & FATBIN_FLAG_COMPRESS) ? "yes" : "no";
+    return asprintf(str, "64Bit: %s, Debug: %s, Linux: %s, Compress %s", bit64, debug, linux_os, compress);
+}
+/* Decode the headers of the fatbin text blob at `fat`. On success (0) the elf header, the text header
+ * and the text body pointer are stored in the out-parameters; returns -1 if a sanity check fails. */
+static int fat_header_decode(void *fat,
+                                       struct fat_elf_header **fat_elf_header,
+                                       struct fat_text_header **fat_text_header,
+                                       void **fat_text_body_ptr)
+{
+    struct fat_elf_header* feh;
+    struct fat_text_header* fth;
+    void *fat_text_header_ptr = NULL;
+    char *flag_str = NULL;
+
+    if (fat == NULL || fat_elf_header == NULL || fat_text_header == NULL || fat_text_body_ptr == NULL) {
+        LOGE(LOG_ERROR, "at least one parameter is NULL");
+        return -1;
+    }
+
+    feh = (struct fat_elf_header*)fat;
+    if (feh->magic != FATBIN_TEXT_MAGIC) {
+        LOGE(LOG_ERROR, "fatbin text magic number is wrong. Got %x, expected %x.", feh->magic, FATBIN_TEXT_MAGIC);
+        return -1;
+    }
+    LOGE(LOG_DBG(1), "fat_elf_header: magic: %x, version: %d, header_size: %#x, fat_size: %#llx",
+        feh->magic, feh->version, feh->header_size, (unsigned long long)feh->fat_size);
+
+    if (feh->version != 1 || feh->header_size != sizeof(struct fat_elf_header)) {
+        LOGE(LOG_ERROR, "fatbin text version is wrong or header size is inconsistent.\
+            This is a sanity check to avoid reading a new fatbinary format");
+        return -1;
+    }
+    fat_text_header_ptr = (void*)feh + feh->header_size;
+    // 64-bit header fields are cast to unsigned long long so that they match the %llx specifiers
+    fth = (struct fat_text_header*)(fat_text_header_ptr);
+    LOGE(LOG_DBG(1), "fat_text_header: fatbin_kind: %#x, header_size %#x, fatbin_size %#llx, some_offset %#llx.\
+        minor %#x, major %#x, arch %u, flags %#llx",
+        fth->kind,
+        fth->header_size,
+        (unsigned long long)fth->fatbin_size,
+        (unsigned long long)fth->some_offset,
+        fth->minor,
+        fth->major,
+        fth->arch,
+        (unsigned long long)fth->flags);
+    LOGE(LOG_DBG(1), "unknown fields: unknown1: %#x, unknown2: %#llx, zeros: %#llx",
+        fth->unknown1,
+        (unsigned long long)fth->unknown2,
+        (unsigned long long)fth->zero);
+    *fat_text_body_ptr = fat_text_header_ptr + fth->header_size;
+    if (fth->flags & FATBIN_FLAG_DEBUG) {
+        *fat_text_body_ptr += 1;
+    }
+
+    // asprintf leaves the string undefined on failure, so only log and free it on success
+    if (flag_to_str(&flag_str, fth->flags) != -1) {
+        LOGE(LOG_DBG(1), "Fatbin flags: %s", flag_str);
+        free(flag_str);
+    }
+
+    if(fth->obj_name_offset != 0) {
+        if (((char*)fat_text_header_ptr)[fth->obj_name_offset + fth->obj_name_len] != '\0') {
+            LOGE(LOG_DEBUG, "Fatbin object name is not null terminated");
+        } else {
+            char *obj_name = (char*)fat_text_header_ptr + fth->obj_name_offset;
+            LOGE(LOG_DEBUG, "Fatbin object name: %s (len:%#x)", obj_name, fth->obj_name_len);
+        }
+    }
+    *fat_elf_header = feh;
+    *fat_text_header = fth;
+    return 0;
+}
+/* Validate the fat_header `fatbin`, collect the kernel infos of its first text blob into kernel_infos
+ * and report fatbin->text together with the summed size of both text blobs. Returns 0, or -1 on error. */
+int elf_get_fatbin_info(struct fat_header *fatbin, list *kernel_infos, void** fatbin_mem, unsigned* fatbin_size)
+{
+    struct fat_elf_header* fat_elf_header;
+    struct fat_text_header* fat_text_header;
+    void *fat_text_body_ptr = NULL;
+    unsigned fatbin_total_size = 0;
+    if (fatbin == NULL || fatbin_mem == NULL || fatbin_size == NULL) {
+        LOGE(LOG_ERROR, "at least one parameter is NULL");
+        return -1;
+    }
+    if (fatbin->magic != FATBIN_STRUCT_MAGIC) {
+        // both values are 32 bit wide, so %x is the matching conversion specifier
+        LOGE(LOG_ERROR, "fatbin struct magic number is wrong. Got %x, expected %x.", fatbin->magic, FATBIN_STRUCT_MAGIC);
+        return -1;
+    }
+    LOG(LOG_DBG(1), "Fatbin: magic: %x, version: %x, text: %lx, data: %lx, ptr: %lx, ptr2: %lx, zero: %lx",
+           fatbin->magic, fatbin->version, fatbin->text, fatbin->data, fatbin->unknown, fatbin->text2, fatbin->zero);
+
+    if (fat_header_decode((void*)fatbin->text, &fat_elf_header, &fat_text_header, &fat_text_body_ptr) != 0) {
+        LOGE(LOG_ERROR, "fatbin header decode failed");
+        return -1;
+    }
+
+
+    fatbin_total_size = fat_elf_header->header_size + fat_elf_header->fat_size;
+
+    // for (int i=0; i<64; i++) {
+    //     printf("%02x ", ((uint8_t*)fat_text_body_ptr)[i]);
+    // }
+    // printf("\n");
+
+    if (elf_parameter_info(kernel_infos, fat_text_body_ptr, fat_elf_header->fat_size) != 0) {
+        LOGE(LOG_ERROR, "error getting symbol table");
+        return -1;
+    }
+
+    if (fat_header_decode((void*)fatbin->text2, &fat_elf_header, &fat_text_header, &fat_text_body_ptr) != 0) {
+        LOGE(LOG_ERROR, "fatbin header decode failed");
+        return -1;
+    }
+    fatbin_total_size += fat_elf_header->header_size + fat_elf_header->fat_size;
+
+    // if (cricketd_utils_symtab(fat_text_body_ptr, fat_elf_header->fat_size) == NULL) {
+    //     LOGE(LOG_ERROR, "error getting symbol table");
+    //     return -1;
+    // }
+
+    // for (int i=0; i<64; i++) {
+    //     printf("%02x ", ((uint8_t*)fatbin->text)[i]);
+    // }
+    // printf("\n");
+
+    *fatbin_mem = (void*)fatbin->text;
+    *fatbin_size = fatbin_total_size;
+    return 0;
+}
+
+size_t cudabfd_size = 0;
+int (*orig_cudabfd_stat)(struct bfd *abfd, struct stat* sb);
+int cudabfd_stat(struct bfd *bfd, struct stat *sb)
+{
+    // Replacement for the iovec bstat callback: orig_cudabfd_stat is not called, only st_size is set.
+    sb->st_size = cudabfd_size;
+    return 0;
+}
+/* Debug helper: print name and size of each section in the list that starts at `sections`. */
+static void print_sections(asection *sections)
+{
+    for (asection *section = sections; section != NULL; section = section->next) {
+        printf("section: %s (len: %#lx)\n", section->name, (unsigned long)section->size);
+    }
+}
+/* Debug helper: print `len` bytes starting at `mem` as space-separated two-digit hex values. */
+static void print_hexmem(void *mem, size_t len)
+{
+    for (size_t i=0; i<len; i++) {
+        printf("%02x ", ((uint8_t*)mem)[i]);
+    }
+    printf("\n");
+}
+
+struct symtab {
+    asymbol **symtab;
+    size_t symtab_size;
+    size_t symtab_length;
+};
+/* Read the symbol table of `bfd` into `st` (release it with symtab_free); a bfd without symbols yields an empty table. Returns 0, or -1 on error. */
+static int symtab_init(bfd *bfd, struct symtab *st)
+{
+    long bound, count = 0; // signed, because libbfd reports errors as negative values
+    if (st == NULL || bfd == NULL) {
+        LOGE(LOG_ERROR, "at least one parameter is NULL");
+        return -1;
+    }
+    memset(st, 0, sizeof(struct symtab));
+    if ((bound = bfd_get_symtab_upper_bound(bfd)) < 0) {
+        LOGE(LOG_ERROR, "bfd_get_symtab_upper_bound failed");
+        return -1;
+    }
+    if (bound > 0 && (st->symtab = (asymbol **)malloc(bound)) == NULL) {
+        LOGE(LOG_ERROR, "malloc symtab failed");
+        return -1;
+    }
+    if (bound > 0 && (count = bfd_canonicalize_symtab(bfd, st->symtab)) < 0) {
+        LOGE(LOG_ERROR, "bfd_canonicalize_symtab failed");
+        free(st->symtab); st->symtab = NULL;
+        return -1;
+    }
+    st->symtab_size = bound;
+    if ((st->symtab_length = count) == 0) {
+        LOG(LOG_WARNING, "symtab is empty...");
+    } else {
+        LOGE(LOG_DBG(1), "%lu symtab entries", st->symtab_length);
+    }
+    return 0;
+}
+/* Release the symbol array held by `st` and reset all of its fields to zero; a NULL `st` is ignored. */
+static void symtab_free(struct symtab* st)
+{
+    if (st != NULL) {
+        asymbol **entries = st->symtab;
+        memset(st, 0, sizeof(*st));
+        free(entries);
+    }
+}
+/* Store in *sym the name of the symbol with the 1-based ELF index `index`. Returns 0, or -1 on error. */
+static int symtab_symbol_at(struct symtab* st, size_t index, const char** sym)
+{
+    asymbol *entry;
+    if (sym == NULL || st == NULL) {
+        LOGE(LOG_ERROR, "at least one parameter is NULL");
+        return -1;
+    }
+    if (index == 0 || st->symtab_length+1 <= index) {
+        LOGE(LOG_ERROR, "index out of bounds");
+        return -1;
+    }
+    // The .nv.info sections (like readelf) count the all-zero first entry of the symbol
+    // table, which libbfd leaves out. Shift the index by one to compensate for that.
+    entry = st->symtab[index-1];
+    *sym = bfd_asymbol_name(entry);
+    return 0;
+}
+/* Debug helper: print every symbol of `st` together with its 1-based symbol table index. */
+static void symtab_print(struct symtab* st)
+{
+    const char* name;
+    for (int idx = 1; st->symtab_length+1 > idx; ++idx) {
+        symtab_symbol_at(st, idx, &name);
+        printf("%#x: name: %s\n", idx, name);
+    }
+}
+
+#define EIATTR_PARAM_CBANK              0xa
+#define EIATTR_EXTERNS                  0xf
+#define EIATTR_FRAME_SIZE               0x11
+#define EIATTR_MIN_STACK_SIZE           0x12
+#define EIATTR_KPARAM_INFO              0x17
+#define EIATTR_CBANK_PARAM_SIZE         0x19
+#define EIATTR_MAX_REG_COUNT            0x1b
+#define EIATTR_EXIT_INSTR_OFFSETS       0x1c
+#define EIATTR_S2RCTAID_INSTR_OFFSETS   0x1d
+#define EIATTR_CRS_STACK_SIZE           0x1e
+#define EIATTR_SW1850030_WAR            0x2a
+#define EIATTR_REGCOUNT                 0x2f
+#define EIATTR_SW2393858_WAR            0x30
+#define EIATTR_INDIRECT_BRANCH_TARGETS  0x34
+#define EIATTR_CUDA_API_VERSION         0x37
+
+#define EIFMT_NVAL                      0x1
+#define EIFMT_HVAL                      0x3
+#define EIFMT_SVAL                      0x4
+/* Fill kernel->param_* from the ".nv.info.<kernel name>" section of the ELF image at `memory`
+ * (memsize bytes), which is opened as `bfd`. Returns 0 on success and -1 on error. */
+static int get_parm_for_kernel(bfd *bfd,  kernel_info_t *kernel, void* memory, size_t memsize)
+{
+    struct __attribute__((__packed__)) nv_info_kernel_entry {
+        uint8_t format;
+        uint8_t attribute;
+        uint16_t values_size;
+        uint32_t values;
+    };
+    struct __attribute__((__packed__)) nv_info_kparam_info {
+        uint32_t index;
+        uint16_t ordinal;
+        uint16_t offset;
+        uint16_t unknown : 12;
+        uint8_t  cbank : 6;
+        uint16_t size : 14;
+        // missing are "space" (possible padding info?), and "Pointee's logAlignment"
+        // these were always 0 in the kernels I tested
+    };
+    // format, attribute and values_size form the entry header; `values` is the start of the payload
+    const size_t entry_header_size = sizeof(struct nv_info_kernel_entry) - sizeof(uint32_t);
+    asection *section = NULL;
+    int ret = -1;
+    char *section_name = NULL;
+    size_t secpos = 0;
+    if (bfd == NULL || kernel == NULL || kernel->name == NULL || memory == NULL) {
+        LOGE(LOG_ERROR, "at least one parameter is NULL");
+        return ret;
+    }
+    kernel->param_num = 0;
+    kernel->param_offsets = NULL;
+    kernel->param_sizes = NULL;
+    if (asprintf(&section_name, ".nv.info.%s", kernel->name) == -1) {
+        LOGE(LOG_ERROR, "asprintf failed");
+        return ret;
+    }
+    if ((section = bfd_get_section_by_name(bfd, section_name))== NULL) {
+        LOGE(LOG_ERROR, "%s section not found", section_name);
+        goto cleanup;
+    }
+    LOGE(LOG_DBG(1), "name: %s, index: %d, size 0x%lx, pos:%p", section->name,
+        section->index, section->size, (void *)section->filepos);
+    // the entries are read directly from `memory`, so the section has to lie inside the image
+    if (section->filepos < 0 || (size_t)section->filepos > memsize || section->size > memsize - (size_t)section->filepos) {
+        LOGE(LOG_ERROR, "%s section exceeds the size of the ELF image", section_name);
+        goto cleanup;
+    }
+    while (secpos < section->size) {
+        struct nv_info_kernel_entry *entry = (struct nv_info_kernel_entry*)(memory+section->filepos+secpos);
+        size_t remaining = section->size - secpos;
+        if (remaining < entry_header_size ||
+            (entry->format == EIFMT_SVAL && entry->values_size > remaining - entry_header_size)) {
+            LOGE(LOG_ERROR, "truncated entry in section %s", section_name);
+            goto cleanup;
+        }
+        if (entry->format == EIFMT_SVAL && entry->attribute == EIATTR_KPARAM_INFO) {
+            if (entry->values_size != 0xc) {
+                LOGE(LOG_ERROR, "EIATTR_KPARAM_INFO values size has not the expected value of 0xc");
+                goto cleanup;
+            }
+            struct nv_info_kparam_info *kparam = (struct nv_info_kparam_info*)&entry->values;
+            LOGE(LOG_DEBUG, "param %d: offset: %#x, size: %#x", kparam->ordinal, kparam->offset, kparam->size);
+            if (kparam->ordinal >= kernel->param_num) {
+                size_t new_num = (size_t)kparam->ordinal+1;
+                void *offsets = realloc(kernel->param_offsets, new_num*sizeof(uint16_t));
+                void *sizes = realloc(kernel->param_sizes, new_num*sizeof(uint16_t));
+                // realloc keeps the old block on failure, so only store the pointers that are valid
+                if (offsets != NULL) kernel->param_offsets = offsets;
+                if (sizes != NULL) kernel->param_sizes = sizes;
+                if (offsets == NULL || sizes == NULL) {
+                    LOGE(LOG_ERROR, "realloc failed");
+                    goto cleanup;
+                }
+                kernel->param_num = new_num;
+            }
+            kernel->param_offsets[kparam->ordinal] = kparam->offset;
+            kernel->param_sizes[kparam->ordinal] = kparam->size;
+            secpos += entry_header_size + entry->values_size;
+        } else if (entry->format == EIFMT_HVAL && entry->attribute == EIATTR_CBANK_PARAM_SIZE) {
+            kernel->param_size = entry->values_size;
+            LOGE(LOG_DEBUG, "cbank_param_size: %#0x", entry->values_size);
+            secpos += entry_header_size;
+        } else if (entry->format == EIFMT_SVAL) {
+            secpos += entry_header_size + entry->values_size;
+        } else {
+            // HVAL and NVAL entries consist of the header only; unknown formats are skipped alike
+            if (entry->format != EIFMT_HVAL && entry->format != EIFMT_NVAL) {
+                LOGE(LOG_WARNING, "unknown format: %#x", entry->format);
+            }
+            secpos += entry_header_size;
+        }
+    }
+    ret = 0;
+ cleanup:
+    free(section_name);
+    return ret;
+}
+/* Parse the ELF image at `memory` (memsize bytes) with libbfd and append a kernel_info_t, including
+ * its parameter layout, to kernel_infos for every kernel not yet in the list. Returns 0 or -1. */
+int elf_parameter_info(list *kernel_infos, void* memory, size_t memsize)
+{
+    struct __attribute__((__packed__)) nv_info_entry{
+        uint8_t format;
+        uint8_t attribute;
+        uint16_t values_size;
+        uint32_t kernel_id;
+        uint32_t value;
+    };
+
+    bfd *bfd = NULL;
+    FILE *fd = NULL;
+    asection *section = NULL;
+    int ret = -1;
+    struct symtab symtab = {0};
+    struct bfd_iovec *iovec = NULL;
+    const struct bfd_iovec *orig_iovec = NULL;
+    const char *kernel_str;
+    kernel_info_t *ki = NULL;
+
+    if (memory == NULL || memsize == 0) {
+        LOGE(LOG_ERROR, "memory was NULL or memsize was 0");
+        return -1;
+    }
+
+    if ((fd = fmemopen(memory, memsize, "rb")) == NULL) {
+        LOGE(LOG_ERROR, "fmemopen failed");
+        goto cleanup;
+    }
+
+    bfd_init();
+
+    if ((bfd = bfd_openstreamr("", "elf64-little", fd)) == NULL) {
+        LOGE(LOG_ERROR, "bfd_openr failed");
+        goto cleanup;
+    }
+
+    //We change the iovec of cudabfd so we can report the correct filesize
+    //because in-memory files always report a file size of 0, which creates
+    //problems elsewhere
+    if ((iovec = (struct bfd_iovec*)malloc(sizeof(struct bfd_iovec))) == NULL) {
+        LOGE(LOG_ERROR, "malloc failed");
+        goto cleanup;
+    }
+    cudabfd_size = memsize;
+    orig_cudabfd_stat = bfd->iovec->bstat;
+    orig_iovec = bfd->iovec;
+    memcpy(iovec, bfd->iovec, sizeof(struct bfd_iovec));
+    iovec->bstat = cudabfd_stat;
+    bfd->iovec = iovec;
+
+    if (!bfd_check_format(bfd, bfd_object)) {
+        LOGE(LOG_ERROR, "bfd has wrong format");
+        goto cleanup;
+    }
+    // print_sections(bfd->sections);
+
+    if  (symtab_init(bfd, &symtab) != 0) {
+        LOGE(LOG_ERROR, "symtab_init failed");
+        goto cleanup;
+    }
+    // symtab_print(&symtab);
+
+    section = bfd_get_section_by_name(bfd, ".nv.info");
+    if (section == NULL) {
+        LOGE(LOG_ERROR, ".nv.info section not found");
+        goto cleanup;
+    }
+
+    LOGE(LOG_DBG(1), "name: %s, index: %d, size 0x%lx, pos:%p", section->name,
+        section->index, section->size, (void *)section->filepos);
+    // the entries are read directly from `memory`, so the section has to lie inside the image
+    if (section->filepos < 0 || (size_t)section->filepos > memsize || section->size > memsize - (size_t)section->filepos) {
+        LOGE(LOG_ERROR, ".nv.info section exceeds the size of the ELF image");
+        goto cleanup;
+    }
+    // entries are read with a fixed stride; the loop stops before a trailing partial entry
+    for (size_t secpos=0, i=0; secpos + sizeof(struct nv_info_entry) <= section->size;
+         secpos += sizeof(struct nv_info_entry), i++) {
+        struct nv_info_entry *entry = (struct nv_info_entry*)(memory+section->filepos+secpos);
+        if (entry->values_size != 8) {
+            LOGE(LOG_ERROR, "unexpected values_size: %#x", entry->values_size);
+            continue;
+        }
+        if (entry->attribute != EIATTR_FRAME_SIZE) {
+            continue;
+        }
+        if (symtab_symbol_at(&symtab, entry->kernel_id, &kernel_str) != 0) {
+            LOGE(LOG_ERROR, "symtab_symbol_at failed for entry %zu", i);
+            continue;
+        }
+        if (utils_search_info(kernel_infos, kernel_str) != NULL) {
+            continue;
+        }
+        LOGE(LOG_DEBUG, "found new kernel: %s (symbol table id: %#x)", kernel_str, entry->kernel_id);
+        if (list_append(kernel_infos, (void**)&ki) != 0) {
+            LOGE(LOG_ERROR, "error on appending to list");
+            goto cleanup;
+        }
+        size_t buflen = strlen(kernel_str)+1;
+        if ((ki->name = malloc(buflen)) == NULL) {
+            LOGE(LOG_ERROR, "malloc failed");
+            goto cleanup;
+        }
+        memcpy(ki->name, kernel_str, buflen);
+        if (get_parm_for_kernel(bfd, ki, memory, memsize) != 0) {
+            LOGE(LOG_ERROR, "get_parm_for_kernel failed for kernel %s", kernel_str);
+            goto cleanup;
+        }
+    }
+    ret = 0;
+ cleanup:
+    symtab_free(&symtab);
+    if (bfd != NULL) {
+        // bfd_close must not see the freed copy: restore the original iovec, which also
+        // leaves closing fd to bfd_close
+        if (orig_iovec != NULL) {
+            bfd->iovec = orig_iovec;
+        }
+        bfd_close(bfd);
+    } else if (fd != NULL) {
+        fclose(fd);
+    }
+    free(iovec);
+    return ret;
+}
+
+/* Return the value of the symbol CRICKET_ELF_REGFUN from the symbol table of `file` (defaults to
+ * /proc/self/exe), or NULL if it cannot be found. NOTE(review): `symbol` is not used by the lookup. */
+void* elf_symbol_address(const char* file, char *symbol)
+{
+    bfd *hostbfd = NULL;
+    FILE *hostbfd_fd = NULL;
+    void *ret = NULL;
+    // signed, because libbfd and readlink report errors as negative values
+    long symtab_size, symtab_length = 0;
+    ssize_t length;
+    asymbol **symtab = NULL;
+    char path[256];
+    const char self[] = "/proc/self/exe";
+    if (file == NULL) {
+        file = self;
+    }
+
+    bfd_init();
+
+    length = readlink(file, path, sizeof(path));
+
+    /* Catch some errors: */
+    if (length < 0) {
+        LOGE(LOG_WARNING, "error resolving symlink %s.", file);
+    } else if ((size_t)length >= sizeof(path)) {
+        LOGE(LOG_WARNING, "path was too long and was truncated.");
+    } else {
+        path[length] = '\0';
+        LOG(LOG_DEBUG, "opening '%s'", path);
+    }
+
+    if ((hostbfd_fd = fopen(file, "rb")) == NULL) {
+        LOGE(LOG_ERROR, "fopen failed");
+        return NULL;
+    }
+
+    if ((hostbfd = bfd_openstreamr(file, NULL, hostbfd_fd)) == NULL) {
+        LOGE(LOG_ERROR, "bfd_openr failed on %s",
+             file);
+        fclose(hostbfd_fd);
+        goto cleanup;
+    }
+
+    if (!bfd_check_format(hostbfd, bfd_object)) {
+        LOGE(LOG_ERROR, "%s has wrong bfd format",
+             file);
+        goto cleanup;
+    }
+
+    // from here on every error path goes through cleanup so that hostbfd and its file are closed
+    if ((symtab_size = bfd_get_symtab_upper_bound(hostbfd)) < 0) {
+        LOGE(LOG_ERROR, "bfd_get_symtab_upper_bound failed");
+        goto cleanup;
+    }
+
+    if (symtab_size > 0 && (symtab = (asymbol **)malloc(symtab_size)) == NULL) {
+        LOGE(LOG_ERROR, "malloc symtab failed");
+        goto cleanup;
+    }
+
+    if (symtab_size > 0 && (symtab_length = bfd_canonicalize_symtab(hostbfd, symtab)) < 0) {
+        LOGE(LOG_ERROR, "bfd_canonicalize_symtab failed");
+        goto cleanup;
+    }
+    if (symtab_length == 0) {
+        LOG(LOG_WARNING, "symtab is empty...");
+    }
+
+    for (long i = 0; i < symtab_length; ++i) {
+        if (strcmp(bfd_asymbol_name(symtab[i]), CRICKET_ELF_REGFUN) == 0) {
+            ret = (void*)bfd_asymbol_value(symtab[i]);
+            break;
+        }
+    }
+
+ cleanup:
+    free(symtab);
+    if (hostbfd != NULL)
+        bfd_close(hostbfd);
+    return ret;
+}
\ No newline at end of file
diff --git a/cpu/cpu-elf.h b/cpu/cpu-elf.h
new file mode 100644
index 00000000..d5c5dd32
--- /dev/null
+++ b/cpu/cpu-elf.h
@@ -0,0 +1,25 @@
+#ifndef _ELF_H_
+#define _ELF_H_
+
+#include <stdint.h>
+#include "cpu-common.h"
+#include "list.h"
+
+struct fat_header {
+    uint32_t magic;
+    uint32_t version;
+    uint64_t text;
+    uint64_t data;  // points to outside of the file
+    uint64_t unknown;
+    uint64_t text2;
+    uint64_t zero;
+};
+
+
+int elf_get_fatbin_info(struct fat_header *fatbin, list *kernel_infos, void** fatbin_mem, unsigned* fatbin_size);
+
+int elf_parameter_info(list *kernel_infos, void* memory, size_t memsize);
+void* elf_symbol_address(const char* file, char *symbol);
+int elf_contains_kernel(void* memory, size_t memsize);
+
+#endif //_ELF_H_
diff --git a/cpu/cpu-server.c b/cpu/cpu-server.c
index c3c78a9a..c2c77b10 100644
--- a/cpu/cpu-server.c
+++ b/cpu/cpu-server.c
@@ -18,6 +18,7 @@
 #include "cpu-server-driver.h"
 #include "rpc/xdr.h"
 #include "cr.h"
+#include "cpu-elf.h"
 #ifdef WITH_IB
 #include "cpu-ib.h"
 #endif //WITH_IB
diff --git a/cpu/cpu-utils.c b/cpu/cpu-utils.c
index fb847850..3538557b 100644
--- a/cpu/cpu-utils.c
+++ b/cpu/cpu-utils.c
@@ -9,174 +9,12 @@
 #include <openssl/md5.h>
 #include <linux/limits.h>
 #include "rpc/types.h"
-
 #include <bfd.h>
 
 #include "cpu-utils.h"
 #include "cpu-common.h"
 #include "log.h"
 
-#define uint16_t unsigned short
-#define CRICKET_ELF_NV_INFO_PREFIX ".nv.info"
-#define CRICKET_ELF_NV_SHARED_PREFIX ".nv.shared."
-#define CRICKET_ELF_NV_TEXT_PREFIX ".nv.text."
-#define CRICKET_ELF_TEXT_PREFIX ".text."
-
-#define CRICKET_ELF_FATBIN ".nv_fatbin"
-#define CRICKET_ELF_REGFUN "_ZL24__sti____cudaRegisterAllv"
-
-#define FATBIN_STRUCT_MAGIC 0x466243b1
-#define FATBIN_TEXT_MAGIC   0xBA55ED50
-
-struct  __attribute__((__packed__)) fat_elf_header
-{
-    uint32_t magic;
-    uint16_t version;
-    uint16_t header_size;
-    uint64_t fat_size;
-};
-struct  __attribute__((__packed__)) fat_text_header
-{
-    uint16_t kind;
-    uint16_t unknown1;
-    uint32_t header_size;
-    uint64_t fatbin_size;
-    uint64_t some_offset; //Compression related information
-    uint16_t minor;
-    uint16_t major;
-    uint32_t arch;
-    uint32_t obj_name_offset;
-    uint32_t obj_name_len;
-    uint64_t flags;
-    uint64_t zero;
-    uint64_t unknown2;
-};
-
-#define FATBIN_FLAG_64BIT     0x0000000000000001LL
-#define FATBIN_FLAG_DEBUG     0x0000000000000002LL
-#define FATBIN_FLAG_LINUX     0x0000000000000010LL
-#define FATBIN_FLAG_COMPRESS  0x0000000000002000LL
-
-static int cricket_fatbin_flag_to_str(char** str, uint64_t flag)
-{
-    return asprintf(str, "64Bit: %s, Debug: %s, Linux: %s, Compress %s",
-        (flag & FATBIN_FLAG_64BIT) ? "yes" : "no",
-        (flag & FATBIN_FLAG_DEBUG) ? "yes" : "no",
-        (flag & FATBIN_FLAG_LINUX) ? "yes" : "no",
-        (flag & FATBIN_FLAG_COMPRESS) ? "yes" : "no");
-}
-
-static int cpu_utils_fat_header_decode(void *fat, 
-                                       struct fat_elf_header **fat_elf_header,
-                                       struct fat_text_header **fat_text_header,
-                                       void **fat_text_body_ptr)
-{
-    struct fat_elf_header* feh;
-    struct fat_text_header* fth;
-    void *fat_ptr = NULL;
-    void *fat_text_header_ptr = NULL;
-
-    if (fat == NULL || fat_elf_header == NULL || fat_text_header == NULL || fat_text_body_ptr == NULL) {
-        LOGE(LOG_ERROR, "at least one parameter is NULL");
-        return -1;
-    }
-
-    feh = (struct fat_elf_header*)fat;
-    if (feh->magic != FATBIN_TEXT_MAGIC) {
-        LOGE(LOG_ERROR, "fatbin text magic number is wrong. Got %x, expected %x.", *((uint32_t*)feh), FATBIN_TEXT_MAGIC);
-        return -1;
-    }
-    LOGE(LOG_DEBUG, "fat_elf_header: magic: %x, version: %d, header_size: %p, fat_size: %p",
-        feh->magic, feh->version, feh->header_size, feh->fat_size);
-
-    if (feh->version != 1 || feh->header_size != sizeof(struct fat_elf_header)) {
-        LOGE(LOG_ERROR, "fatbin text version is wrong or header size is inconsistent.\
-            This is a sanity check to avoid reading a new fatbinary format");
-        return -1;
-    }
-    fat_ptr = fat_text_header_ptr = (void*)feh + feh->header_size;
-
-    fth = (struct fat_text_header*)(fat_text_header_ptr);
-    LOGE(LOG_DEBUG, "fat_text_header: fatbin_kind: %#x, header_size %#x, fatbin_size %#x, some_offset %#x.\
-        minor %#x, major %#x, arch %d, flags %#x",
-        fth->kind,
-        fth->header_size,
-        fth->fatbin_size,
-        fth->some_offset,
-        fth->minor,
-        fth->major,
-        fth->arch,
-        fth->flags);
-    LOGE(LOG_DEBUG, "unknown fields: unknown1: %#x, unknown2: %#x, zeros: %#x",
-        fth->unknown1,
-        fth->unknown2,
-        fth->zero);
-    fat_ptr += sizeof(struct fat_header);
-    *fat_text_body_ptr = fat_text_header_ptr + fth->header_size;
-    if (fth->flags & FATBIN_FLAG_DEBUG) {
-        *fat_text_body_ptr += 1;
-    }
-
-    char *flag_str = NULL;
-    cricket_fatbin_flag_to_str(&flag_str, fth->flags);
-    LOGE(LOG_DEBUG, "Fatbin flags: %s", flag_str);
-    free(flag_str);
-
-    if(fth->obj_name_offset != 0) {
-        if (((char*)fat_text_header_ptr)[fth->obj_name_offset + fth->obj_name_len] != '\0') {
-            LOGE(LOG_DEBUG, "Fatbin object name is not null terminated");
-        } else {
-            char *obj_name = (char*)fat_text_header_ptr + fth->obj_name_offset;
-            LOGE(LOG_DEBUG, "Fatbin object name: %s (len:%#x)", obj_name, fth->obj_name_len);
-        }
-        fat_ptr += fth->obj_name_len+1;
-    }
-    *fat_elf_header = feh;
-    *fat_text_header = fth;
-    return 0;
-}
-
-int cpu_utils_get_fatbin_info(struct fat_header *fatbin, void** fatbin_mem, unsigned* fatbin_size)
-{
-    struct fat_elf_header* fat_elf_header;
-    struct fat_text_header* fat_text_header;
-    void *fat_ptr = NULL;
-    void *fat_text_body_ptr = NULL;
-    unsigned fatbin_total_size = 0;
-    if (fatbin == NULL || fatbin_mem == NULL || fatbin_size == NULL) {
-        LOGE(LOG_ERROR, "at least one parameter is NULL");
-        return -1;
-    }
-    if (fatbin->magic != FATBIN_STRUCT_MAGIC) {
-        LOGE(LOG_ERROR, "fatbin struct magic number is wrong. Got %llx, expected %llx.", fatbin->magic, FATBIN_STRUCT_MAGIC);
-        return -1;
-    }
-    LOG(LOG_DEBUG, "Fatbin: magic: %x, version: %x, text: %lx, data: %lx, ptr: %lx, ptr2: %lx, zero: %lx",
-           fatbin->magic, fatbin->version, fatbin->text, fatbin->data, fatbin->unknown, fatbin->text2, fatbin->zero);
-
-    if (cpu_utils_fat_header_decode((void*)fatbin->text, &fat_elf_header, &fat_text_header, &fat_text_body_ptr) != 0) {
-        LOGE(LOG_ERROR, "fatbin header decode failed");
-        return -1;
-    }
-
-    fatbin_total_size = fat_elf_header->header_size + fat_elf_header->fat_size;
-
-    if (cpu_utils_fat_header_decode((void*)fatbin->text2, &fat_elf_header, &fat_text_header, &fat_text_body_ptr) != 0) {
-        LOGE(LOG_ERROR, "fatbin header decode failed");
-        return -1;
-    }
-    fatbin_total_size += fat_elf_header->header_size + fat_elf_header->fat_size;
-
-    // fat_ptr = (void*)fatbin->data;
-
-    // for (int i=0; i<64; i++) {
-    //     printf("%02x ", ((uint8_t*)fat_ptr)[i]);
-    // }
-
-    *fatbin_mem = (void*)fatbin->text;
-    *fatbin_size = fatbin_total_size;
-    return 0;
-}
 
 int cpu_utils_command(char **command)
 {
@@ -235,86 +73,7 @@ int cpu_utils_md5hash(char *filename, unsigned long *high, unsigned long *low)
     return 0;
 }
 
-void* cricketd_utils_symbol_address(const char* file, char *symbol)
-{
-    bfd *hostbfd = NULL;
-    asection *section;
-    FILE *hostbfd_fd = NULL;
-    void *ret = NULL;
-    size_t symtab_size, symtab_length;
-    asymbol **symtab = NULL;
-    char path[256];
-    size_t length;
-    const char self[] = "/proc/self/exe";
-    if (file == NULL) {
-        file = self;
-    }
-
-
-    bfd_init();
-
-    length = readlink(file, path, sizeof(path));
-
-    /* Catch some errors: */
-    if (length < 0) {
-        LOGE(LOG_WARNING, "error resolving symlink %s.", file);
-    } else if (length >= 256) {
-        LOGE(LOG_WARNING, "path was too long and was truncated.");
-    } else {
-        path[length] = '\0';
-        LOG(LOG_DEBUG, "opening '%s'", path);
-    }
-
-    if ((hostbfd_fd = fopen(file, "rb")) == NULL) {
-        LOGE(LOG_ERROR, "fopen failed");
-        return NULL;
-    }
-
-    if ((hostbfd = bfd_openstreamr(file, NULL, hostbfd_fd)) == NULL) {
-        LOGE(LOG_ERROR, "bfd_openr failed on %s",
-             file);
-        fclose(hostbfd_fd);
-        goto cleanup;
-    }
 
-    if (!bfd_check_format(hostbfd, bfd_object)) {
-        LOGE(LOG_ERROR, "%s has wrong bfd format",
-             file);
-        goto cleanup;
-    }
-
-    if ((symtab_size = bfd_get_symtab_upper_bound(hostbfd)) == -1) {
-        LOGE(LOG_ERROR, "bfd_get_symtab_upper_bound failed");
-        return NULL;
-    }
-
-    if ((symtab = (asymbol **)malloc(symtab_size)) == NULL) {
-        LOGE(LOG_ERROR, "malloc symtab failed");
-        return NULL;
-    }
-
-    if ((symtab_length = bfd_canonicalize_symtab(hostbfd, symtab)) == 0) {
-        LOG(LOG_WARNING, "symtab is empty...");
-    } else {
-        //printf("%lu symtab entries\n", symtab_length);
-    }
-
-    for (int i = 0; i < symtab_length; ++i) {
-        if (strcmp(bfd_asymbol_name(symtab[i]), CRICKET_ELF_REGFUN) == 0) {
-            ret = (void*)bfd_asymbol_value(symtab[i]);
-            break;
-        }
-        //printf("%d: %s: %lx\n", i, bfd_asymbol_name(symtab[i]),
-        //       bfd_asymbol_value(symtab[i]));
-    }
-
-
- cleanup:
-    free(symtab);
-    if (hostbfd != NULL)
-        bfd_close(hostbfd);
-    return ret;
-}
 
 int cpu_utils_launch_child(const char *file, char **args)
 {
@@ -343,14 +102,14 @@ int cpu_utils_launch_child(const char *file, char **args)
     return filedes[0];
 }
 
-kernel_info_t* cricketd_utils_search_info(list *kernel_infos, char *kernelname)
+kernel_info_t* utils_search_info(list *kernel_infos, const char *kernelname)
 {
     kernel_info_t *info = NULL;
     if (kernel_infos == NULL) {
         LOGE(LOG_ERROR, "list is NULL.");
         return NULL;
     }
-    LOGE(LOG_DEBUG, "searching for %s in %d entries", kernelname, kernel_infos->length);
+    LOGE(LOG_DBG(1), "searching for %s in %d entries", kernelname, kernel_infos->length);
     for (int i=0; i < kernel_infos->length; ++i) {
         if (list_at(kernel_infos, i, (void**)&info) != 0) {
             LOGE(LOG_ERROR, "no element at index %d", i);
diff --git a/cpu/cpu-utils.h b/cpu/cpu-utils.h
index 3afd0a0c..e40bea2e 100644
--- a/cpu/cpu-utils.h
+++ b/cpu/cpu-utils.h
@@ -5,29 +5,16 @@
 #include "cpu-common.h"
 #include "list.h"
 
-struct fat_header {
-    uint32_t magic;
-    uint32_t version;
-    uint64_t text;
-    uint64_t data;  // points to outside of the file
-    uint64_t unknown;
-    uint64_t text2;
-    uint64_t zero;
-};
 
 
-int cpu_utils_get_fatbin_info(struct fat_header *fatbin, void** fatbin_mem, unsigned* fatbin_size);
-
 void kernel_infos_free(kernel_info_t *infos, size_t kernelnum);
 
 
 int cpu_utils_is_local_connection(struct svc_req *rqstp);
 int cpu_utils_command(char **command);
 int cpu_utils_md5hash(char *filename, unsigned long *high, unsigned long *low);
-void* cricketd_utils_symbol_address(const char* file, char *symbol);
 int cricketd_utils_launch_child(const char *file, char **args);
 int cpu_utils_parameter_info(list *kernel_infos, char *path);
-int cpu_utils_contains_kernel(const char *path);
-kernel_info_t* cricketd_utils_search_info(list *kernel_infos, char *kernelname);
+kernel_info_t* utils_search_info(list *kernel_infos, const char *kernelname);
 
 #endif //_CPU_UTILS_H_
diff --git a/submodules/Makefile b/submodules/Makefile
index 27d55680..54fb160a 100644
--- a/submodules/Makefile
+++ b/submodules/Makefile
@@ -38,7 +38,7 @@ endif
 cuda-gdb/build:
 	@echo -e "\033[36m----> Configuring cuda-gdb\033[0m"
 	mkdir -p cuda-gdb/build && cd cuda-gdb/build && \
-		../configure --disable-werror --program-prefix=cuda- --enable-cuda --with-python=no --enable-targets="x86_64-apple-darwin,x86_64-unknown-linux-gnu,arm-elf-linux-gnu,m68k-unknown-linux-gnu" CFLAGS='-I/usr/local/cuda/include' LDFLAGS='-lpthread'
+		../configure --disable-werror --program-prefix=cuda- --enable-cuda --with-python=no --enable-targets="x86_64-apple-darwin,x86_64-unknown-linux-gnu,arm-elf-linux-gnu,m68k-unknown-linux-gnu" CFLAGS='-I/usr/local/cuda/include -fPIC' LDFLAGS='-lpthread'
 	@echo -e "\033[36m----> Building cuda-gdb\033[0m"
 	CPATH=/usr/local/cuda/include $(MAKE) -C cuda-gdb/build
 	CPATH=/usr/local/cuda/include $(MAKE) -C cuda-gdb/build/gdb libgdb.a
diff --git a/tests/test_apps/Makefile b/tests/test_apps/Makefile
index b85845a2..dafae5a3 100644
--- a/tests/test_apps/Makefile
+++ b/tests/test_apps/Makefile
@@ -10,6 +10,7 @@ CFLAGS = -arch=$(ARCH) -cudart shared
 #CFLAGS = -arch=$(ARCH)
 LD = nvcc -ccbin g++
 LDFLAGS = -arch=$(ARCH) -cudart shared
+DEBUG_FLAGS = #-g -G
 #LDFLAGS = -lcuda -arch=$(ARCH)
 TEST_CPU_BIN = cpu.testapp
 TEST_CPU_O = test_cpu.o
@@ -61,10 +62,10 @@ $(LIBCUDA_OBJ) : $(LIBCUDA_OBJ:.o=.c)
 	$(HOST_CC) -c -fpic -o $@ $< $(LIBCUDA_LIBS)
 
 $(TEST_KERNEL_LIB_O) : $(FILES)
-	$(CC) $(CFLAGS) -g -G -dc --compiler-options '-fPIC' -o $@ $<
+	$(CC) $(CFLAGS) $(DEBUG_FLAGS) -dc --compiler-options '-fPIC' -o $@ $<
 
 $(TEST_KERNEL_LIB) : $(TEST_KERNEL_LIB_O)
-	$(LD) $(LDFLAGS) -g -G -shared -o lib$@ $^
+	$(LD) $(LDFLAGS) $(DEBUG_FLAGS) -shared -o lib$@ $^
 
 $(TEST_KERNEL_LIB_CALL_O) : $(TEST_KERNEL_LIB_CALL_O:.o=.c)
 	$(HOST_CC) -c -o $@ $<
diff --git a/tests/test_apps/matmul.cu b/tests/test_apps/matmul.cu
index 7790ae7b..ea5f89ba 100644
--- a/tests/test_apps/matmul.cu
+++ b/tests/test_apps/matmul.cu
@@ -8,7 +8,7 @@
 #include "cricket-cuda.h"
 
 #define N 32
-#define ITERATIONS 1024*128*8*16
+#define ITERATIONS 1024*128*4
 const int blocksize = 32;
 
 #ifndef RANDOM_INIT
@@ -265,7 +265,7 @@ int main()
     dim3 dimBlock( blocksize, 1 );
     dim3 dimGrid( 1, 1);
     kernel<<<dimGrid, dimBlock>>>(dev_A, dev_x, dev_res, 0, 0, 0, 0);
-    //kernel_no_param<<<dimGrid, dimBlock>>>();
+    kernel_no_param<<<dimGrid, dimBlock>>>();
     //void *args = NULL;
     //int result = cudaLaunchKernel((void*)kernel_no_param, dimGrid, dimBlock, &args, 0LL, NULL);
 

From 09b34f612aa9623d6d624d06dd69ea092a209c5f Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Fri, 24 Mar 2023 14:25:23 +0100
Subject: [PATCH 22/83] fix cpu-server not using the new name of
 elf_symbol_address

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 cpu/cpu-server.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpu/cpu-server.c b/cpu/cpu-server.c
index c2c77b10..d750d0d6 100644
--- a/cpu/cpu-server.c
+++ b/cpu/cpu-server.c
@@ -123,7 +123,7 @@ void cricket_so_register(void* dlhandle, char *path)
 
     // add load location of library to offset in symbol table
     void (*cudaRegisterAllv)(void) = 
-        (void(*)(void)) cricketd_utils_symbol_address(path, "_ZL24__sti____cudaRegisterAllv");
+        (void(*)(void)) elf_symbol_address(path, "_ZL24__sti____cudaRegisterAllv");
     
     LOG(LOG_INFO, "found CUDA initialization function at %p + %p = %p", 
         map->l_addr, cudaRegisterAllv, map->l_addr + cudaRegisterAllv);
@@ -291,7 +291,7 @@ void cricket_main(char* app_command, size_t prog_num, size_t vers_num)
      * Address of "_ZL24__sti____cudaRegisterAllv" in static symbol table is e.g. 0x4016c8
      */
     void (*cudaRegisterAllv)(void) =
-        (void(*)(void)) cricketd_utils_symbol_address(NULL, "_ZL24__sti____cudaRegisterAllv");
+        (void(*)(void)) elf_symbol_address(NULL, "_ZL24__sti____cudaRegisterAllv");
     LOG(LOG_INFO, "found CUDA initialization function at %p", cudaRegisterAllv);
     if (cudaRegisterAllv == NULL) {
         LOGE(LOG_WARNING, "could not find cudaRegisterAllv initialization function in cubin. Kernels cannot be launched without it!");

From 701d4bd2a9028107fc1795bd68d6b57750c1cc73 Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Fri, 24 Mar 2023 15:23:14 +0100
Subject: [PATCH 23/83] add possibility to dump elfs

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 cpu/cpu-elf.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/cpu/cpu-elf.c b/cpu/cpu-elf.c
index adf495fb..82a69474 100644
--- a/cpu/cpu-elf.c
+++ b/cpu/cpu-elf.c
@@ -405,6 +405,8 @@ static int get_parm_for_kernel(bfd *bfd,  kernel_info_t *kernel, void* memory, s
     return ret;
 }
 
+#define ELF_DUMP_TO_FILE 1
+
 int elf_parameter_info(list *kernel_infos, void* memory, size_t memsize)
 {
     struct __attribute__((__packed__)) nv_info_entry{
@@ -416,7 +418,7 @@ int elf_parameter_info(list *kernel_infos, void* memory, size_t memsize)
     };
 
     bfd *bfd = NULL;
-    FILE *fd, *fd2 = NULL;
+    FILE *fd = NULL;
     asection *section = NULL;
     int ret = -1;
     struct symtab symtab = {0};
@@ -431,6 +433,12 @@ int elf_parameter_info(list *kernel_infos, void* memory, size_t memsize)
         return -1;
     }
 
+#ifdef ELF_DUMP_TO_FILE
+    FILE* fd2 = fopen("/tmp/cricket-elf-dump", "wb");
+    fwrite(memory, memsize, 1, fd2);
+    fclose(fd2);
+#endif
+
     if ((fd = fmemopen(memory, memsize, "rb")) == NULL) {
         LOGE(LOG_ERROR, "fmemopen failed");
         goto cleanup;

From 89f78e6ccecf197d088f7461035b4bae85079dd7 Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Mon, 27 Mar 2023 11:40:13 +0200
Subject: [PATCH 24/83] make higher log levels configurable from makefile

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 cpu/Makefile     | 5 +++++
 cpu/cpu-client.c | 1 +
 cpu/cpu-server.c | 2 +-
 3 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/cpu/Makefile b/cpu/Makefile
index d2d0527b..285492ec 100644
--- a/cpu/Makefile
+++ b/cpu/Makefile
@@ -96,6 +96,11 @@ endif
 ifdef LOG
 CC_FLAGS += -DLOG_LEVEL=LOG_$(LOG)
 endif
+
+ifdef LOGN
+CC_FLAGS += -DLOG_LEVEL=$(LOGN)
+endif
+
 ifdef WITH_IB
 CC_FLAGS += -DWITH_IB=$(WITH_IB)
 endif
diff --git a/cpu/cpu-client.c b/cpu/cpu-client.c
index 21acc4a7..530c3fa9 100644
--- a/cpu/cpu-client.c
+++ b/cpu/cpu-client.c
@@ -168,6 +168,7 @@ void __attribute__((constructor)) init_rpc(void)
     int_result result_2;
     char *printmessage_1_arg1 = "hello";
 
+    LOG(LOG_DBG(1), "log level is %d", LOG_LEVEL);
     init_log(LOG_LEVEL, __FILE__);
     rpc_connect();
 
diff --git a/cpu/cpu-server.c b/cpu/cpu-server.c
index d750d0d6..7d247e0f 100644
--- a/cpu/cpu-server.c
+++ b/cpu/cpu-server.c
@@ -183,7 +183,7 @@ void cricket_main(char* app_command, size_t prog_num, size_t vers_num)
     char *command = NULL;
     act.sa_handler = int_handler;
     sigaction(SIGINT, &act, NULL);
-
+    LOG(LOG_DBG(1), "log level is %d", LOG_LEVEL);
     init_log(LOG_LEVEL, __FILE__);
 
     #ifdef WITH_IB

From 4d7dc55d26a6ad0f44dde0ae0d25ff3679d188f4 Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Thu, 30 Mar 2023 11:47:58 +0200
Subject: [PATCH 25/83] add comments and additional error handling

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 cpu/cpu-elf.c   | 24 +++++++++++++++++-------
 cpu/cpu-utils.c |  2 +-
 2 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/cpu/cpu-elf.c b/cpu/cpu-elf.c
index 82a69474..1faa6f39 100644
--- a/cpu/cpu-elf.c
+++ b/cpu/cpu-elf.c
@@ -45,8 +45,8 @@ struct  __attribute__((__packed__)) fat_text_header
     uint32_t obj_name_offset;
     uint32_t obj_name_len;
     uint64_t flags;
-    uint64_t zero;
-    uint64_t unknown2;
+    uint64_t zero;      //Alignment for compression?
+    uint64_t unknown2;  //Compression related information (deflated size?)
 };
 
 #define FATBIN_FLAG_64BIT     0x0000000000000001LL
@@ -64,9 +64,9 @@ static int flag_to_str(char** str, uint64_t flag)
 }
 
 static int fat_header_decode(void *fat, 
-                                       struct fat_elf_header **fat_elf_header,
-                                       struct fat_text_header **fat_text_header,
-                                       void **fat_text_body_ptr)
+                            struct fat_elf_header **fat_elf_header,
+                            struct fat_text_header **fat_text_header,
+                            void **fat_text_body_ptr)
 {
     struct fat_elf_header* feh;
     struct fat_text_header* fth;
@@ -110,7 +110,8 @@ static int fat_header_decode(void *fat,
         fth->zero);
     fat_ptr += sizeof(struct fat_header);
     *fat_text_body_ptr = fat_text_header_ptr + fth->header_size;
-    if (fth->flags & FATBIN_FLAG_DEBUG) {
+    if (fth->flags & FATBIN_FLAG_DEBUG || fth->flags & FATBIN_FLAG_COMPRESS) {
+       LOGE(LOG_DBG(1), "skipping extra byte \"%#02x\"", *((uint8_t*)*fat_text_body_ptr));
         *fat_text_body_ptr += 1;
     }
 
@@ -164,6 +165,15 @@ int elf_get_fatbin_info(struct fat_header *fatbin, list *kernel_infos, void** fa
     // }
     // printf("\n");
 
+    if (fat_text_header->flags & FATBIN_FLAG_COMPRESS) {
+        LOGE(LOG_WARNING, "fatbin contains compressed device code. This is not supported yet.");
+        return -1;
+    }
+    if (fat_text_header->flags & FATBIN_FLAG_DEBUG) {
+        LOGE(LOG_WARNING, "fatbin contains debug information. This is not supported yet.");
+        return -1;
+    }
+
     if (elf_parameter_info(kernel_infos, fat_text_body_ptr, fat_elf_header->fat_size) != 0) {
         LOGE(LOG_ERROR, "error getting symbol table");
         return -1;
@@ -405,7 +415,7 @@ static int get_parm_for_kernel(bfd *bfd,  kernel_info_t *kernel, void* memory, s
     return ret;
 }
 
-#define ELF_DUMP_TO_FILE 1
+//#define ELF_DUMP_TO_FILE 1
 
 int elf_parameter_info(list *kernel_infos, void* memory, size_t memsize)
 {
diff --git a/cpu/cpu-utils.c b/cpu/cpu-utils.c
index 3538557b..173f2952 100644
--- a/cpu/cpu-utils.c
+++ b/cpu/cpu-utils.c
@@ -291,7 +291,7 @@ int cpu_utils_contains_kernel(const char *path)
  cleanup:
     close(output);
     wait(&child_exit);
-    LOG(LOG_DEBUG, "child exit code: %d", child_exit);
+    LOG(LOG_DBG(1), "child exit code: %d", child_exit);
  out:
     free(line);
     return ret == 0 && child_exit == 0;

From d9870e03839985015ff77b46cc5b59cb265bce70 Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Thu, 30 Mar 2023 15:39:36 +0200
Subject: [PATCH 26/83] add elf_init function to avoid multiple initializations
 of libbfd

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 cpu/cpu-client.c | 2 ++
 cpu/cpu-elf.c    | 6 +++++-
 cpu/cpu-elf.h    | 2 +-
 3 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/cpu/cpu-client.c b/cpu/cpu-client.c
index 530c3fa9..bdc53d6b 100644
--- a/cpu/cpu-client.c
+++ b/cpu/cpu-client.c
@@ -187,6 +187,8 @@ void __attribute__((constructor)) init_rpc(void)
         LOGE(LOG_ERROR, "list init failed.");
     }
 
+    elf_init();
+
     if (cpu_utils_parameter_info(&kernel_infos, "/proc/self/exe") != 0) {
         LOG(LOG_ERROR, "error while getting parameter size. Check whether "
                        "cuobjdump binary is in PATH! Trying anyway (will only "
diff --git a/cpu/cpu-elf.c b/cpu/cpu-elf.c
index 1faa6f39..a0c7a3be 100644
--- a/cpu/cpu-elf.c
+++ b/cpu/cpu-elf.c
@@ -54,6 +54,11 @@ struct  __attribute__((__packed__)) fat_text_header
 #define FATBIN_FLAG_LINUX     0x0000000000000010LL
 #define FATBIN_FLAG_COMPRESS  0x0000000000002000LL
 
+void elf_init(void)
+{
+    bfd_init();
+}
+
 static int flag_to_str(char** str, uint64_t flag)
 {
     return asprintf(str, "64Bit: %s, Debug: %s, Linux: %s, Compress %s",
@@ -454,7 +459,6 @@ int elf_parameter_info(list *kernel_infos, void* memory, size_t memsize)
         goto cleanup;
     }
 
-    bfd_init();
 
     if ((bfd = bfd_openstreamr("", "elf64-little", fd)) == NULL) {
         LOGE(LOG_ERROR, "bfd_openr failed");
diff --git a/cpu/cpu-elf.h b/cpu/cpu-elf.h
index d5c5dd32..4c9abe4f 100644
--- a/cpu/cpu-elf.h
+++ b/cpu/cpu-elf.h
@@ -15,7 +15,7 @@ struct fat_header {
     uint64_t zero;
 };
 
-
+void elf_init(void);
 int elf_get_fatbin_info(struct fat_header *fatbin, list *kernel_infos, void** fatbin_mem, unsigned* fatbin_size);
 
 int elf_parameter_info(list *kernel_infos, void* memory, size_t memsize);

From 45e7e1819680a49e73c7e3d32edc4f6ce133847f Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Tue, 11 Apr 2023 11:31:45 +0200
Subject: [PATCH 27/83] use libelf instead of libbfd for elf manipulation
 because of better stability. New code is located in cpu-elf2.c

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 cpu/Makefile     |   8 +-
 cpu/cpu-client.c |   8 +-
 cpu/cpu-elf.c    |   4 +-
 cpu/cpu-elf2.c   | 625 +++++++++++++++++++++++++++++++++++++++++++++++
 cpu/cpu-elf2.h   |  25 ++
 cpu/cpu-server.c |  16 +-
 cpu/cpu-utils.c  |   2 +-
 7 files changed, 669 insertions(+), 19 deletions(-)
 create mode 100644 cpu/cpu-elf2.c
 create mode 100644 cpu/cpu-elf2.h

diff --git a/cpu/Makefile b/cpu/Makefile
index 285492ec..a2d38223 100644
--- a/cpu/Makefile
+++ b/cpu/Makefile
@@ -8,7 +8,6 @@ CLIENT = cricket-client.so
 CUDA_SRC = /usr/local/cuda
 LIBTIRPC_PREFIX = ../submodules/libtirpc/install
 SUBMODULE_LIBS = ../submodules/lib
-BFD_INC_PREFIX = ../submodules/cuda-gdb/bfd
 
 CC = gcc
 LD = gcc
@@ -42,7 +41,7 @@ SRC_SERVER = $(RPC_XDR)                 \
 			 gsched_none.c 			    \
 			 oob.c 					    \
 			 mt-memcpy.c				\
-			 cpu-elf.c
+			 cpu-elf2.c
 
 SRC_SERVER_LIB = server-library.c
 SRC_SERVER_EXE = server-exe.c
@@ -59,7 +58,7 @@ SRC_CLIENT = $(RPC_XDR)                 \
 			 cpu-client-cusolver.c 		\
 			 oob.c 					    \
 			 mt-memcpy.c				\
-			 cpu-elf.c
+			 cpu-elf2.c
 
 # 			 cpu-client-driver-hidden.c \
 
@@ -75,14 +74,13 @@ OBJ_CLIENT = $(SRC_CLIENT:%.c=%.o)
 RPCGEN_FLAGS = -C -M -N
 INC_FLAGS += -I$(LIBTIRPC_PREFIX)/include/tirpc
 INC_FLAGS += -I$(CUDA_SRC)/include
-INC_FLAGS += -I$(BFD_INC_PREFIX)
 
 LIB_FLAGS += -L$(LIBTIRPC_PREFIX)/lib
 LIB_FLAGS += -L$(CUDA_SRC)/lib64
 CC_FLAGS += -std=gnu99 $(INC_FLAGS) -O2
 # TODO: use extern in header files instead of direct definition e.g. in cpu-common.h to remove -fcommon flag
 CC_FLAGS += -fcommon
-LD_FLAGS = $(LIB_FLAGS) -ltirpc -ldl -lcrypto -lbfd
+LD_FLAGS = $(LIB_FLAGS) -ltirpc -ldl -lcrypto -lelf
 
 ifdef WITH_DEBUG
 # use ASAN_OPTIONS=protect_shadow_gap=0  LSAN_OPTIONS=fast_unwind_on_malloc=0 when running
diff --git a/cpu/cpu-client.c b/cpu/cpu-client.c
index bdc53d6b..20d38f12 100644
--- a/cpu/cpu-client.c
+++ b/cpu/cpu-client.c
@@ -16,7 +16,7 @@
 #include "cpu-utils.h"
 #include "cpu_rpc_prot.h"
 #include "list.h"
-#include "cpu-elf.h"
+#include "cpu-elf2.h"
 #ifdef WITH_IB
 #include "cpu-ib.h"
 #endif // WITH_IB
@@ -187,7 +187,9 @@ void __attribute__((constructor)) init_rpc(void)
         LOGE(LOG_ERROR, "list init failed.");
     }
 
-    elf_init();
+    if (elf2_init() != 0) {
+        LOGE(LOG_ERROR, "libelf init failed");
+    }
 
     if (cpu_utils_parameter_info(&kernel_infos, "/proc/self/exe") != 0) {
         LOG(LOG_ERROR, "error while getting parameter size. Check whether "
@@ -337,7 +339,7 @@ void **__cudaRegisterFatBinary(void *fatCubin)
 
     mem_data rpc_fat = { .mem_data_len = 0, .mem_data_val = NULL };
 
-    if (elf_get_fatbin_info((struct fat_header *)fatCubin,
+    if (elf2_get_fatbin_info((struct fat_header *)fatCubin,
                                 &kernel_infos,
                                 (void **)&rpc_fat.mem_data_val,
                                 &rpc_fat.mem_data_len) != 0) {
diff --git a/cpu/cpu-elf.c b/cpu/cpu-elf.c
index a0c7a3be..866144cb 100644
--- a/cpu/cpu-elf.c
+++ b/cpu/cpu-elf.c
@@ -38,7 +38,7 @@ struct  __attribute__((__packed__)) fat_text_header
     uint16_t unknown1;
     uint32_t header_size;
     uint64_t fatbin_size;
-    uint64_t some_offset; //Compression related information
+    uint64_t compressed_size; //Compression related information
     uint16_t minor;
     uint16_t major;
     uint32_t arch;
@@ -104,7 +104,7 @@ static int fat_header_decode(void *fat,
         fth->kind,
         fth->header_size,
         fth->fatbin_size,
-        fth->some_offset,
+        fth->compressed_size,
         fth->minor,
         fth->major,
         fth->arch,
diff --git a/cpu/cpu-elf2.c b/cpu/cpu-elf2.c
new file mode 100644
index 00000000..097f93c7
--- /dev/null
+++ b/cpu/cpu-elf2.c
@@ -0,0 +1,625 @@
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <libelf.h>
+#include <gelf.h>
+
+#include "cpu-common.h"
+#include "log.h"
+#include "cpu-elf2.h"
+#include "cpu-utils.h"
+
+#define uint16_t unsigned short
+#define CRICKET_ELF_NV_INFO_PREFIX ".nv.info"
+#define CRICKET_ELF_NV_SHARED_PREFIX ".nv.shared."
+#define CRICKET_ELF_NV_TEXT_PREFIX ".nv.text."
+#define CRICKET_ELF_TEXT_PREFIX ".text."
+
+#define CRICKET_ELF_FATBIN ".nv_fatbin"
+#define CRICKET_ELF_REGFUN "_ZL24__sti____cudaRegisterAllv"
+
+#define FATBIN_STRUCT_MAGIC 0x466243b1
+#define FATBIN_TEXT_MAGIC   0xBA55ED50
+
+struct  __attribute__((__packed__)) fat_elf_header
+{
+    uint32_t magic;        // fat_header_decode() expects FATBIN_TEXT_MAGIC here
+    uint16_t version;      // fat_header_decode() only accepts version 1
+    uint16_t header_size;  // must equal sizeof(struct fat_elf_header); the text header follows at this offset
+    uint64_t fat_size;     // added to header_size to compute the fatbin size reported to callers
+};
+struct  __attribute__((__packed__)) fat_text_header
+{
+    uint16_t kind;
+    uint16_t unknown1;
+    uint32_t header_size;       // offset from this header to the text body (see fat_header_decode)
+    uint64_t fatbin_size;
+    uint64_t compressed_size; // Compression related information
+    uint16_t minor;
+    uint16_t major;
+    uint32_t arch;
+    uint32_t obj_name_offset;   // offset of the object name relative to this header; 0 means no name
+    uint32_t obj_name_len;      // length of the object name, not counting the terminating NUL
+    uint64_t flags;             // tested against the FATBIN_FLAG_* bits
+    uint64_t zero;      // Alignment for compression?
+    uint64_t decompressed_len;  // Length of compressed data (NOTE(review): the name says "decompressed" - confirm).
+                              // There is an uncompressed footer so this is generally smaller than fatbin_size
+};
+
+#define FATBIN_FLAG_64BIT     0x0000000000000001LL
+#define FATBIN_FLAG_DEBUG     0x0000000000000002LL
+#define FATBIN_FLAG_LINUX     0x0000000000000010LL
+#define FATBIN_FLAG_COMPRESS  0x0000000000002000LL
+
+/* Negotiate the libelf API version. Returns 0 on success and -1 if libelf rejects EV_CURRENT. */
+int elf2_init(void)
+{
+    if (elf_version(EV_CURRENT) != EV_NONE) return 0; /* the success path used to fall off the end */
+    LOGE(LOG_ERROR, "ELF library initialization failed: %s", elf_errmsg(-1));
+    return -1;
+}
+
+static int flag_to_str(char** str, uint64_t flag) // *str receives a string allocated by asprintf
+{
+    return asprintf(str, "64Bit: %s, Debug: %s, Linux: %s, Compress %s", // asprintf's result is returned unchanged
+        (flag & FATBIN_FLAG_64BIT) ? "yes" : "no",
+        (flag & FATBIN_FLAG_DEBUG) ? "yes" : "no",
+        (flag & FATBIN_FLAG_LINUX) ? "yes" : "no",
+        (flag & FATBIN_FLAG_COMPRESS) ? "yes" : "no");
+}
+
+/* Validate the fatbin header at `fat` and locate the text header and text body behind it.
+ * Returns 0 and sets all three out-pointers on success; -1 on NULL arguments or a bad header. */
+static int fat_header_decode(void *fat,
+                            struct fat_elf_header **fat_elf_header,
+                            struct fat_text_header **fat_text_header,
+                            void **fat_text_body_ptr)
+{
+    struct fat_elf_header* feh;
+    struct fat_text_header* fth;
+    void *fat_text_header_ptr = NULL;
+
+    if (fat == NULL || fat_elf_header == NULL || fat_text_header == NULL || fat_text_body_ptr == NULL) {
+        LOGE(LOG_ERROR, "at least one parameter is NULL");
+        return -1;
+    }
+
+    feh = (struct fat_elf_header*)fat;
+    if (feh->magic != FATBIN_TEXT_MAGIC) {
+        LOGE(LOG_ERROR, "fatbin text magic number is wrong. Got %x, expected %x.", *((uint32_t*)feh), FATBIN_TEXT_MAGIC);
+        return -1;
+    }
+    LOGE(LOG_DBG(1), "fat_elf_header: magic: %x, version: %d, header_size: %#x, fat_size: %#llx",
+        feh->magic, feh->version, feh->header_size, (unsigned long long)feh->fat_size); // cast matches %llx
+
+    if (feh->version != 1 || feh->header_size != sizeof(struct fat_elf_header)) {
+        LOGE(LOG_ERROR, "fatbin text version is wrong or header size is inconsistent.\
+            This is a sanity check to avoid reading a new fatbinary format");
+        return -1;
+    }
+    fat_text_header_ptr = (void*)feh + feh->header_size;
+
+    fth = (struct fat_text_header*)(fat_text_header_ptr);
+    LOGE(LOG_DBG(1), "fat_text_header: fatbin_kind: %#x, header_size %#x, fatbin_size %#llx, compressed_size %#llx,\
+        minor %#x, major %#x, arch %d, flags %#llx, decompressed_len %#llx",
+        fth->kind,
+        fth->header_size,
+        (unsigned long long)fth->fatbin_size,
+        (unsigned long long)fth->compressed_size,
+        fth->minor,
+        fth->major,
+        fth->arch,
+        (unsigned long long)fth->flags,
+        (unsigned long long)fth->decompressed_len);
+    LOGE(LOG_DBG(1), "unknown fields: unknown1: %#x, zeros: %#llx",
+        fth->unknown1,
+        (unsigned long long)fth->zero);
+    *fat_text_body_ptr = fat_text_header_ptr + fth->header_size;
+    if (fth->flags & FATBIN_FLAG_DEBUG || fth->flags & FATBIN_FLAG_COMPRESS) {
+        LOGE(LOG_DBG(1), "skipping extra byte \"%#02x\"", *((uint8_t*)*fat_text_body_ptr));
+        *fat_text_body_ptr += 1;
+    }
+
+    char *flag_str = NULL;
+    if (flag_to_str(&flag_str, fth->flags) >= 0) { // flag_str is undefined if asprintf failed
+        LOGE(LOG_DBG(1), "Fatbin flags: %s", flag_str);
+        free(flag_str);
+    }
+
+    if(fth->obj_name_offset != 0) {
+        if (((char*)fat_text_header_ptr)[fth->obj_name_offset + fth->obj_name_len] != '\0') {
+            LOGE(LOG_DEBUG, "Fatbin object name is not null terminated");
+        } else {
+            char *obj_name = (char*)fat_text_header_ptr + fth->obj_name_offset;
+            LOGE(LOG_DEBUG, "Fatbin object name: %s (len:%#x)", obj_name, fth->obj_name_len);
+        }
+    }
+    *fat_elf_header = feh;
+    *fat_text_header = fth;
+    return 0;
+}
+
+/* Decode the fatbin wrapper handed to __cudaRegisterFatBinary.
+ *
+ * Checks the wrapper magic, decodes the header at fatbin->text, collects the kernel
+ * parameter info of the ELF that follows it into kernel_infos and finally decodes the
+ * header at fatbin->text2. On success *fatbin_mem is set to fatbin->text and
+ * *fatbin_size to the sum of header_size + fat_size of both decoded headers.
+ *
+ * Returns 0 on success; -1 on NULL arguments, a wrong magic, decode or ELF errors and
+ * for fatbins with the debug flag. Compressed fatbins only trigger a warning.
+ */
+int elf2_get_fatbin_info(struct fat_header *fatbin, list *kernel_infos, void** fatbin_mem, unsigned* fatbin_size)
+{
+    struct fat_elf_header* fat_elf_header;
+    struct fat_text_header* fat_text_header;
+    void *fat_text_body_ptr = NULL;
+    unsigned fatbin_total_size = 0;
+
+    if (fatbin == NULL || fatbin_mem == NULL || fatbin_size == NULL) {
+        LOGE(LOG_ERROR, "at least one parameter is NULL");
+        return -1;
+    }
+
+    // magic is a 32-bit field, so it has to be printed with %x rather than %llx
+    if (fatbin->magic != FATBIN_STRUCT_MAGIC) {
+        LOGE(LOG_ERROR, "fatbin struct magic number is wrong. Got %x, expected %x.", fatbin->magic, FATBIN_STRUCT_MAGIC);
+        return -1;
+    }
+    LOG(LOG_DBG(1), "Fatbin: magic: %x, version: %x, text: %lx, data: %lx, ptr: %lx, ptr2: %lx, zero: %lx",
+           fatbin->magic, fatbin->version, fatbin->text, fatbin->data, fatbin->unknown, fatbin->text2, fatbin->zero);
+
+    // first header, located at fatbin->text
+    if (fat_header_decode((void*)fatbin->text, &fat_elf_header, &fat_text_header, &fat_text_body_ptr) != 0) {
+        LOGE(LOG_ERROR, "fatbin header decode failed");
+        return -1;
+    }
+
+    fatbin_total_size = fat_elf_header->header_size + fat_elf_header->fat_size;
+
+    // compression is only reported; the ELF parsing below is still attempted
+    if (fat_text_header->flags & FATBIN_FLAG_COMPRESS) {
+        LOGE(LOG_WARNING, "fatbin contains compressed device code. This is not supported yet.");
+        //return -1;
+    }
+    if (fat_text_header->flags & FATBIN_FLAG_DEBUG) {
+        LOGE(LOG_WARNING, "fatbin contains debug information. This is not supported yet.");
+        return -1;
+    }
+
+    // the text body is handed to libelf as an in-memory ELF image
+    if (elf2_parameter_info(kernel_infos, fat_text_body_ptr, fat_elf_header->fat_size) != 0) {
+        LOGE(LOG_ERROR, "error getting parameter info");
+        return -1;
+    }
+
+    // second header, located at fatbin->text2; its size is added to the total
+    if (fat_header_decode((void*)fatbin->text2, &fat_elf_header, &fat_text_header, &fat_text_body_ptr) != 0) {
+        LOGE(LOG_ERROR, "fatbin header decode failed");
+        return -1;
+    }
+    fatbin_total_size += fat_elf_header->header_size + fat_elf_header->fat_size;
+
+    // callers receive the undecoded fatbin starting at fatbin->text
+    *fatbin_mem = (void*)fatbin->text;
+    *fatbin_size = fatbin_total_size;
+    return 0;
+}
+
+/* Debugging aid: print len bytes starting at mem as space-separated hex, followed by a newline. */
+static void print_hexmem(void *mem, size_t len)
+{
+    for (size_t i = 0; i < len; i++) // size_t index: avoids the signed/unsigned comparison with len
+        printf("%02x ", ((uint8_t*)mem)[i]);
+    printf("\n");
+}
+
+#define EIATTR_PARAM_CBANK              0xa
+#define EIATTR_EXTERNS                  0xf
+#define EIATTR_FRAME_SIZE               0x11
+#define EIATTR_MIN_STACK_SIZE           0x12
+#define EIATTR_KPARAM_INFO              0x17
+#define EIATTR_CBANK_PARAM_SIZE         0x19
+#define EIATTR_MAX_REG_COUNT            0x1b
+#define EIATTR_EXIT_INSTR_OFFSETS       0x1c
+#define EIATTR_S2RCTAID_INSTR_OFFSETS   0x1d
+#define EIATTR_CRS_STACK_SIZE           0x1e
+#define EIATTR_SW1850030_WAR            0x2a
+#define EIATTR_REGCOUNT                 0x2f
+#define EIATTR_SW2393858_WAR            0x30
+#define EIATTR_INDIRECT_BRANCH_TARGETS  0x34
+#define EIATTR_CUDA_API_VERSION         0x37
+
+#define EIFMT_NVAL                      0x1
+#define EIFMT_HVAL                      0x3
+#define EIFMT_SVAL                      0x4
+
+
+/* Store the section named `name` of `elf` in *section; returns 0 if found, -1 on errors or no match. */
+static int get_section_by_name(Elf *elf, const char *name, Elf_Scn **section)
+{
+    Elf_Scn *scn = NULL;
+    GElf_Shdr shdr;
+    char *section_name = NULL;
+    size_t str_section_index;
+
+    if (elf == NULL || name == NULL || section == NULL) {
+        LOGE(LOG_ERROR, "invalid argument");
+        return -1;
+    }
+
+    if (elf_getshdrstrndx(elf, &str_section_index) != 0) {
+        LOGE(LOG_ERROR, "elf_getshdrstrndx failed");
+        return -1;
+    }
+
+    while ((scn = elf_nextscn(elf, scn)) != NULL) {
+        if (gelf_getshdr(scn, &shdr) != &shdr) {
+            LOGE(LOG_ERROR, "gelf_getshdr failed");
+            return -1;
+        }
+        if ((section_name = elf_strptr(elf, str_section_index, shdr.sh_name)) == NULL) {
+            LOGE(LOG_ERROR, "elf_strptr failed");
+            return -1;
+        }
+        if (strcmp(section_name, name) == 0) {
+            *section = scn;
+            return 0;
+        }
+    }
+    return -1;
+}
+
+/* Read the parameter layout of `kernel` from the ".nv.info.<kernel name>" section of `elf`.
+ * EIATTR_KPARAM_INFO entries fill kernel->param_offsets/param_sizes (indexed by ordinal) and
+ * param_num; EIATTR_CBANK_PARAM_SIZE sets param_size. `memory` must be non-NULL but is not read,
+ * `memsize` is unused. Returns 0 on success and -1 on any error. */
+static int get_parm_for_kernel(Elf *elf, kernel_info_t *kernel, void* memory, size_t memsize)
+{
+    struct __attribute__((__packed__)) nv_info_kernel_entry {
+        uint8_t format;
+        uint8_t attribute;
+        uint16_t values_size;
+        uint32_t values;
+    };
+    struct __attribute__((__packed__)) nv_info_kparam_info {
+        uint32_t index;
+        uint16_t ordinal;
+        uint16_t offset;
+        uint16_t unknown : 12;
+        uint8_t  cbank : 6;
+        uint16_t size : 14;
+        // missing are "space" (possible padding info?), and "Pointee's logAlignment"
+        // these were always 0 in the kernels I tested
+    };
+
+    // size of an entry without its `values` payload
+    const size_t header_size = sizeof(struct nv_info_kernel_entry) - sizeof(uint32_t);
+    int ret = -1;
+    char *section_name = NULL;
+    Elf_Scn *section = NULL;
+    Elf_Data *data = NULL;
+    size_t secpos = 0;
+
+    if (kernel == NULL || kernel->name == NULL || memory == NULL) {
+        LOGE(LOG_ERROR, "at least one parameter is NULL");
+        goto cleanup;
+    }
+    kernel->param_num = 0;
+    kernel->param_offsets = NULL;
+    kernel->param_sizes = NULL;
+
+    if (asprintf(&section_name, ".nv.info.%s", kernel->name) == -1) {
+        section_name = NULL; // contents are undefined after a failed asprintf; keeps free() below safe
+        LOGE(LOG_ERROR, "asprintf failed");
+        goto cleanup;
+    }
+
+    if (get_section_by_name(elf, section_name, &section) != 0) {
+        LOGE(LOG_ERROR, "section %s not found", section_name);
+        goto cleanup;
+    }
+
+    if ((data = elf_getdata(section, NULL)) == NULL) {
+        LOGE(LOG_ERROR, "error getting section data");
+        goto cleanup;
+    }
+
+    // stop once less than an entry header remains so no entry is read past the section end
+    while (secpos + header_size <= data->d_size) {
+        struct nv_info_kernel_entry *entry = (struct nv_info_kernel_entry*)(data->d_buf+secpos);
+        // only EIFMT_SVAL entries are followed by values_size bytes of payload
+        size_t payload = (entry->format == EIFMT_SVAL) ? entry->values_size : 0;
+        if (entry->format == EIFMT_SVAL && entry->attribute == EIATTR_KPARAM_INFO) {
+            if (entry->values_size != 0xc) {
+                LOGE(LOG_ERROR, "EIATTR_KPARAM_INFO values size has not the expected value of 0xc");
+                goto cleanup;
+            }
+            if (payload > data->d_size - secpos - header_size) {
+                LOGE(LOG_ERROR, "EIATTR_KPARAM_INFO entry exceeds the section size");
+                goto cleanup;
+            }
+            struct nv_info_kparam_info *kparam = (struct nv_info_kparam_info*)&entry->values;
+            LOGE(LOG_DBG(1), "param %d: offset: %#x, size: %#x", kparam->ordinal, kparam->offset, kparam->size);
+            if (kparam->ordinal >= kernel->param_num) {
+                size_t new_num = (size_t)kparam->ordinal + 1;
+                void *grown = realloc(kernel->param_offsets, new_num * sizeof(*kernel->param_offsets));
+                if (grown != NULL) {
+                    kernel->param_offsets = grown; // stays valid even if the second realloc fails
+                    grown = realloc(kernel->param_sizes, new_num * sizeof(*kernel->param_sizes));
+                }
+                if (grown == NULL) {
+                    LOGE(LOG_ERROR, "realloc failed");
+                    goto cleanup;
+                }
+                kernel->param_sizes = grown;
+                kernel->param_num = new_num;
+            }
+            kernel->param_offsets[kparam->ordinal] = kparam->offset;
+            kernel->param_sizes[kparam->ordinal] = kparam->size;
+        } else if (entry->format == EIFMT_HVAL && entry->attribute == EIATTR_CBANK_PARAM_SIZE) {
+            kernel->param_size = entry->values_size;
+            LOGE(LOG_DEBUG, "cbank_param_size: %#0x", entry->values_size);
+        } else if (entry->format != EIFMT_HVAL && entry->format != EIFMT_SVAL && entry->format != EIFMT_NVAL) {
+            LOGE(LOG_WARNING, "unknown format: %#x", entry->format);
+        }
+        secpos += header_size + payload;
+    }
+    ret = 0;
+ cleanup:
+    free(section_name);
+    return ret;
+}
+
+
+/* Locate the ".symtab" section of `elf`. On success (return 0) *symbol_table_data holds its data,
+ * *symbol_table_size its number of entries and, if non-NULL, *symbol_table_shdr its header; -1 on error. */
+static int get_symtab(Elf *elf, Elf_Data **symbol_table_data, size_t *symbol_table_size, GElf_Shdr *symbol_table_shdr)
+{
+    GElf_Shdr shdr;
+    Elf_Scn *section = NULL;
+
+    if (elf == NULL || symbol_table_data == NULL || symbol_table_size == NULL) {
+        LOGE(LOG_ERROR, "invalid argument");
+        return -1;
+    }
+
+    if (get_section_by_name(elf, ".symtab", &section) != 0) {
+        LOGE(LOG_ERROR, "could not find .symtab section");
+        return -1;
+    }
+    if (gelf_getshdr(section, &shdr) == NULL) {
+        LOGE(LOG_ERROR, "gelf_getshdr failed");
+        return -1;
+    }
+    if (symbol_table_shdr != NULL) {
+        *symbol_table_shdr = shdr;
+    }
+    if(shdr.sh_type != SHT_SYMTAB) {
+        LOGE(LOG_ERROR, "not a symbol table: %d", shdr.sh_type);
+        return -1;
+    }
+    if (shdr.sh_entsize == 0) { // would divide by zero when computing the entry count below
+        LOGE(LOG_ERROR, "symbol table has an entry size of 0");
+        return -1;
+    }
+    if ((*symbol_table_data = elf_getdata(section, NULL)) == NULL) {
+        LOGE(LOG_ERROR, "elf_getdata failed");
+        return -1;
+    }
+    *symbol_table_size = shdr.sh_size / shdr.sh_entsize;
+    return 0;
+}
+
+/* Debugging aid: print every entry of the symbol table of `elf` to stdout. */
+static void print_symtab(Elf *elf)
+{
+    GElf_Sym sym;
+    Elf_Data *symbol_table_data = NULL;
+    GElf_Shdr shdr;
+    size_t symnum;
+
+    if (get_symtab(elf, &symbol_table_data, &symnum, &shdr) != 0) {
+        LOGE(LOG_ERROR, "could not get symbol table");
+        return;
+    }
+
+    LOGE(LOG_DEBUG, "found %zu symbols", symnum);
+
+    // 64-bit symbol fields are cast to unsigned long long to match the %llx conversions
+    for (int i = 0; gelf_getsym(symbol_table_data, i, &sym) != NULL; i++) {
+        printf("sym %d: name: %s, value: %#llx, size: %#llx, info: %#x, other: %#x, shndx: %#x\n", i,
+               elf_strptr(elf, shdr.sh_link, sym.st_name), (unsigned long long)sym.st_value,
+               (unsigned long long)sym.st_size, sym.st_info, sym.st_other, sym.st_shndx);
+    }
+}
+
+/* Sanity-check that `elf` is an ELF object and log its class, ident and section/program header
+ * counts at debug level. Returns 0 if every libelf query succeeds, -1 otherwise. */
+static int check_elf(Elf *elf)
+{
+    Elf_Kind ek;
+    GElf_Ehdr ehdr;
+    int elfclass;
+    char *id;
+    size_t program_header_num;
+    size_t sections_num;
+    size_t section_str_num;
+    int ret = -1;
+
+    if ((ek = elf_kind(elf)) != ELF_K_ELF) {
+        LOGE(LOG_ERROR, "elf_kind is not ELF_K_ELF, but %d", ek);
+        goto cleanup;
+    }
+
+    if (gelf_getehdr(elf, &ehdr) == NULL) {
+        LOGE(LOG_ERROR, "gelf_getehdr failed");
+        goto cleanup;
+    }
+
+    if ((elfclass = gelf_getclass(elf)) == ELFCLASSNONE) {
+        LOGE(LOG_ERROR, "gelf_getclass failed");
+        goto cleanup;
+    }
+
+    if ((id = elf_getident(elf, NULL)) == NULL) {
+        LOGE(LOG_ERROR, "elf_getident failed");
+        goto cleanup;
+    }
+
+    LOGE(LOG_DBG(1), "elfclass: %d-bit; elf ident[0..%d]: %7s",
+        (elfclass == ELFCLASS32) ? 32 : 64,
+        EI_ABIVERSION, id);
+
+    if (elf_getshdrnum(elf, &sections_num) != 0) {
+        LOGE(LOG_ERROR, "elf_getshdrnum failed");
+        goto cleanup;
+    }
+
+    if (elf_getphdrnum(elf, &program_header_num) != 0) {
+        LOGE(LOG_ERROR, "elf_getphdrnum failed");
+        goto cleanup;
+    }
+
+    if (elf_getshdrstrndx(elf, &section_str_num) != 0) {
+        LOGE(LOG_ERROR, "elf_getshdrstrndx failed");
+        goto cleanup;
+    }
+
+    LOGE(LOG_DBG(1), "elf contains %zu sections, %zu program_headers, string table section: %zu",
+        sections_num, program_header_num, section_str_num); // the counters are size_t, hence %zu
+    ret = 0;
+cleanup:
+    return ret;
+}
+
+/* Collect the kernels contained in the ELF image at `memory` (`memsize` bytes).
+ *
+ * Walks the ".nv.info" section; every EIATTR_FRAME_SIZE entry names a kernel through its
+ * symbol table index. Kernels not yet present in kernel_infos are appended and their
+ * parameter layout is read with get_parm_for_kernel(). Returns 0 on success, -1 on error.
+ */
+int elf2_parameter_info(list *kernel_infos, void* memory, size_t memsize)
+{
+    struct __attribute__((__packed__)) nv_info_entry{
+        uint8_t format;
+        uint8_t attribute;
+        uint16_t values_size;
+        uint32_t kernel_id;
+        uint32_t value;
+    };
+
+    Elf *elf = NULL;
+    Elf_Scn *section = NULL;
+    Elf_Data *data = NULL, *symbol_table_data = NULL;
+    GElf_Shdr symtab_shdr;
+    size_t symnum;
+    int i = 0;
+    GElf_Sym sym;
+
+    int ret = -1;
+    kernel_info_t *ki = NULL;
+    const char *kernel_str;
+
+    if (memory == NULL || memsize == 0) {
+        LOGE(LOG_ERROR, "memory was NULL or memsize was 0");
+        return -1;
+    }
+
+//#define ELF_DUMP_TO_FILE 1
+
+#ifdef ELF_DUMP_TO_FILE
+    FILE* fd2 = fopen("/tmp/cricket-elf-dump", "wb");
+    fwrite(memory-1, memsize, 1, fd2);
+    fclose(fd2);
+#endif
+
+    if ((elf = elf_memory(memory, memsize)) == NULL) {
+        LOGE(LOG_ERROR, "elf_memory failed");
+        goto cleanup;
+    }
+
+    if (check_elf(elf) != 0) {
+        LOGE(LOG_ERROR, "check_elf failed");
+        goto cleanup;
+    }
+
+    if (get_symtab(elf, &symbol_table_data, &symnum, &symtab_shdr) != 0) {
+        LOGE(LOG_ERROR, "could not get symbol table");
+        goto cleanup;
+    }
+
+    if (get_section_by_name(elf, ".nv.info", &section) != 0) {
+        LOGE(LOG_ERROR, "could not find .nv.info section");
+        goto cleanup;
+    }
+
+    if ((data = elf_getdata(section, NULL)) == NULL) {
+        LOGE(LOG_ERROR, "elf_getdata failed");
+        goto cleanup;
+    }
+
+    // only visit entries that lie completely inside the section; a trailing partial entry is ignored
+    for (size_t secpos=0; secpos + sizeof(struct nv_info_entry) <= data->d_size; secpos += sizeof(struct nv_info_entry)) {
+        struct nv_info_entry *entry = (struct nv_info_entry *)(data->d_buf+secpos);
+        LOGE(LOG_DBG(1), "%d: format: %#x, attr: %#x, values_size: %#x kernel: %#x, sval: %#x(%d)",
+        i++, entry->format, entry->attribute, entry->values_size, entry->kernel_id,
+        entry->value, entry->value);
+
+        if (entry->values_size != 8) {
+            LOGE(LOG_ERROR, "unexpected values_size: %#x", entry->values_size);
+            continue;
+        }
+
+        if (entry->attribute != EIATTR_FRAME_SIZE) {
+            continue;
+        }
+
+        if (entry->kernel_id >= symnum) {
+            LOGE(LOG_ERROR, "kernel_id out of bounds: %#x", entry->kernel_id);
+            continue;
+        }
+
+        if (gelf_getsym(symbol_table_data, entry->kernel_id, &sym) == NULL) {
+            LOGE(LOG_ERROR, "gelf_getsym failed for entry %d", entry->kernel_id);
+            continue;
+        }
+        if ((kernel_str = elf_strptr(elf, symtab_shdr.sh_link, sym.st_name) ) == NULL) {
+            LOGE(LOG_ERROR, "strptr failed for entry %d", entry->kernel_id);
+            continue;
+        }
+
+        if (utils_search_info(kernel_infos, kernel_str) != NULL) {
+            continue;
+        }
+
+        LOGE(LOG_DEBUG, "found new kernel: %s (symbol table id: %#x)", kernel_str, entry->kernel_id);
+
+        if (list_append(kernel_infos, (void**)&ki) != 0) {
+            LOGE(LOG_ERROR, "error on appending to list");
+            goto cleanup;
+        }
+
+        // the list entry gets its own heap copy of the kernel name
+        if ((ki->name = strdup(kernel_str)) == NULL) {
+            LOGE(LOG_ERROR, "strdup failed");
+            goto cleanup;
+        }
+
+        if (get_parm_for_kernel(elf, ki, memory, memsize) != 0) {
+            LOGE(LOG_ERROR, "get_parm_for_kernel failed for kernel %s", kernel_str);
+            goto cleanup;
+        }
+    }
+
+    ret = 0;
+ cleanup:
+    if (elf != NULL) {
+        elf_end(elf);
+    }
+    return ret;
+}
\ No newline at end of file
diff --git a/cpu/cpu-elf2.h b/cpu/cpu-elf2.h
new file mode 100644
index 00000000..c7309d71
--- /dev/null
+++ b/cpu/cpu-elf2.h
@@ -0,0 +1,25 @@
+#ifndef _ELF_H_
+#define _ELF_H_
+
+#include <stdint.h>
+#include "cpu-common.h"
+#include "list.h"
+
+struct fat_header {
+    uint32_t magic;     // checked against FATBIN_STRUCT_MAGIC by elf2_get_fatbin_info
+    uint32_t version;
+    uint64_t text;      // address of the first fatbin header; handed back to callers as *fatbin_mem
+    uint64_t data;  // points to outside of the file
+    uint64_t unknown;
+    uint64_t text2;     // points to footer of text section
+    uint64_t zero;
+};
+
+int elf2_init(void);    // the caller in cpu-client.c treats a non-zero result as failure
+int elf2_get_fatbin_info(struct fat_header *fatbin, list *kernel_infos, void** fatbin_mem, unsigned* fatbin_size);
+
+int elf2_parameter_info(list *kernel_infos, void* memory, size_t memsize);
+void* elf2_symbol_address(const char* file, char *symbol);  // NOTE(review): no definition in cpu-elf2.c as added here - confirm
+int elf2_contains_kernel(void* memory, size_t memsize);     // NOTE(review): no definition in cpu-elf2.c as added here - confirm
+
+#endif //_ELF_H_
diff --git a/cpu/cpu-server.c b/cpu/cpu-server.c
index 7d247e0f..d28ee15f 100644
--- a/cpu/cpu-server.c
+++ b/cpu/cpu-server.c
@@ -290,14 +290,14 @@ void cricket_main(char* app_command, size_t prog_num, size_t vers_num)
     /* Call CUDA initialization function (usually called by __libc_init_main())
      * Address of "_ZL24__sti____cudaRegisterAllv" in static symbol table is e.g. 0x4016c8
      */
-    void (*cudaRegisterAllv)(void) =
-        (void(*)(void)) elf_symbol_address(NULL, "_ZL24__sti____cudaRegisterAllv");
-    LOG(LOG_INFO, "found CUDA initialization function at %p", cudaRegisterAllv);
-    if (cudaRegisterAllv == NULL) {
-        LOGE(LOG_WARNING, "could not find cudaRegisterAllv initialization function in cubin. Kernels cannot be launched without it!");
-    } else {
-        cudaRegisterAllv();
-    }
+    // void (*cudaRegisterAllv)(void) =
+    //     (void(*)(void)) elf_symbol_address(NULL, "_ZL24__sti____cudaRegisterAllv");
+    // LOG(LOG_INFO, "found CUDA initialization function at %p", cudaRegisterAllv);
+    // if (cudaRegisterAllv == NULL) {
+    //     LOGE(LOG_WARNING, "could not find cudaRegisterAllv initialization function in cubin. Kernels cannot be launched without it!");
+    // } else {
+    //     cudaRegisterAllv();
+    // }
 
     sched = &sched_none; 
     if (sched->init() != 0) {
diff --git a/cpu/cpu-utils.c b/cpu/cpu-utils.c
index 173f2952..f7955353 100644
--- a/cpu/cpu-utils.c
+++ b/cpu/cpu-utils.c
@@ -9,7 +9,7 @@
 #include <openssl/md5.h>
 #include <linux/limits.h>
 #include "rpc/types.h"
-#include <bfd.h>
+#include <sys/stat.h>
 
 #include "cpu-utils.h"
 #include "cpu-common.h"

From 8de247f317a7cdb4b1220664902d04db8acac961 Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Tue, 11 Apr 2023 15:06:53 +0200
Subject: [PATCH 28/83] add colors to log.c

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 cpu/log.c | 35 ++++++++++++++++++++++++++++++++---
 cpu/log.h |  2 ++
 2 files changed, 34 insertions(+), 3 deletions(-)

diff --git a/cpu/log.c b/cpu/log.c
index e104890e..6e4807ee 100644
--- a/cpu/log.c
+++ b/cpu/log.c
@@ -20,6 +20,8 @@
 #include <stdarg.h>
 #include <string.h>
 
+static struct timeval start_time = {0};
+
 struct log_data* get_log_data() {
 	static struct log_data log_data;
 	return &log_data;
@@ -46,6 +48,7 @@ void init_log(char log_level, const char* proj_root)
 {
 	get_log_data()->curr_level=log_level;
 	get_log_data()->project_offset = str_find_last_of(proj_root, '/');
+	gettimeofday(&start_time, 0);
 }
 
 void now_time(char* buf)
@@ -57,9 +60,23 @@ void now_time(char* buf)
 	sprintf(buf, "%s.%06ld", buffer, (long)tv.tv_usec);
 }
 
+void delta_time(char* buf) // formats the time elapsed since init_log() into buf, prefixed with '+'
+{
+	struct timeval tv;
+	gettimeofday(&tv, 0);
+	timersub(&tv, &start_time, &tv);
+	char buffer[100];
+	strftime(buffer, sizeof(buffer), "%X", gmtime(&tv.tv_sec)); // a duration: localtime would add the UTC offset
+	sprintf(buf, "+%s.%06ld", buffer, (long)tv.tv_usec);
+}
+
 const char* to_string(log_level level)
 {
+#ifdef NOCOLORS
 	static const char* const buffer[] = {"ERROR", "WARNING", "INFO", "DEBUG"};
+#else
+	static const char* const buffer[] = {"\033[1m\033[31mERROR\033[0m", "\033[33mWARNING\033[0m", "\033[34mINFO\033[0m", "\033[32mDEBUG\033[0m"};
+#endif //NOCOLORS
 	if(level > LOG_DEBUG){
 		return buffer[LOG_DEBUG];
 	}
@@ -71,9 +88,13 @@ void loggf(log_level level, const char* formatstr, ... )
 	va_list vararg;
 	va_start(vararg, formatstr);
 	
-	char time[100];
+	char time[64];
+#ifdef DELTA_TIME
+	delta_time(time);
+#else
 	now_time(time);
-	printf("%s (%s):\t", time, to_string(level));
+#endif //DELTA_TIME
+	printf("%s %s:\t", time, to_string(level));
 	vprintf(formatstr, vararg);
 	printf("\n");
 }
@@ -84,11 +105,19 @@ void loggfe(log_level level, int line, const char* file, const char* formatstr,
 	va_start(vararg, formatstr);
 	
 	char time[64];
+#ifdef DELTA_TIME
+	delta_time(time);
+#else
 	now_time(time);
+#endif //DELTA_TIME
 	printf("%s %7s: ", time, to_string(level));
 	vprintf(formatstr, vararg);
 	char stripped[64];
 	strcpy(stripped, file);
 	str_strip(stripped, get_log_data()->project_offset);
-	printf("\tin %s(%d)\n", stripped, line);
+#ifdef NOCOLORS
+	printf("\tin %s:%d\n", stripped, line);
+#else
+	printf("\tin \033[4m%s:%d\033[0m\n", stripped, line);
+#endif //NOCOLORS
 }
diff --git a/cpu/log.h b/cpu/log.h
index 81ce80be..379c5865 100644
--- a/cpu/log.h
+++ b/cpu/log.h
@@ -38,6 +38,8 @@ else loggfe(level, __LINE__, __FILE__, __VA_ARGS__)
 #define LOG_DEBUG 3
 #define LOG_DBG(i) LOG_DEBUG + i
 
+#define DELTA_TIME 1
+
 typedef char log_level;
 
 struct log_data{

From 6acdf43013ae9bace72bc4c4d095063644a124ae Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Thu, 4 May 2023 11:34:30 +0200
Subject: [PATCH 29/83] migrate to new elf handling. add decompression support
 for cuda fatbinaries

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 .gitignore               |   1 +
 cpu/cpu-client.c         |  25 +-
 cpu/cpu-elf.c            | 638 ---------------------------------------
 cpu/cpu-elf.h            |  25 --
 cpu/cpu-elf2.c           | 576 ++++++++++++++++++++++++++++-------
 cpu/cpu-elf2.h           |  12 +-
 cpu/cpu-server-runtime.c |   8 +-
 cpu/cpu-utils.c          |  32 ++
 cpu/cpu-utils.h          |   2 +
 9 files changed, 526 insertions(+), 793 deletions(-)
 delete mode 100644 cpu/cpu-elf.c
 delete mode 100644 cpu/cpu-elf.h

diff --git a/.gitignore b/.gitignore
index 814b57cb..f855bf18 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,7 @@ build/
 .clangd
 .project
 .cproject
+*.code-workspace
 .settings/
 .vscode/
 .directory
diff --git a/cpu/cpu-client.c b/cpu/cpu-client.c
index 20d38f12..3746cc9d 100644
--- a/cpu/cpu-client.c
+++ b/cpu/cpu-client.c
@@ -77,11 +77,10 @@ static void rpc_connect(void)
 
 #endif // WITH_IB
 
-    LOGE(LOG_INFO, "test\n");
-    if (getenv("CRICKET_NOHASH")) {
-        prog = 99;
-        vers = 1;
-    } else if (cpu_utils_md5hash("/proc/self/exe", &prog, &vers) != 0) {
+    //TODO: This is not necessary anymore. We should fix a static prog/vers
+    prog = 99;
+    vers = 1;
+    if (getenv("CRICKET_HASH") && cpu_utils_md5hash("/proc/self/exe", &prog, &vers) != 0) {
         LOGE(LOG_ERROR, "error while creating binary checksum");
         exit(0);
     }
@@ -191,11 +190,11 @@ void __attribute__((constructor)) init_rpc(void)
         LOGE(LOG_ERROR, "libelf init failed");
     }
 
-    if (cpu_utils_parameter_info(&kernel_infos, "/proc/self/exe") != 0) {
-        LOG(LOG_ERROR, "error while getting parameter size. Check whether "
-                       "cuobjdump binary is in PATH! Trying anyway (will only "
-                       "work if there is no kernel in this binary)");
-    }
+    // if (cpu_utils_parameter_info(&kernel_infos, "/proc/self/exe") != 0) {
+    //     LOG(LOG_ERROR, "error while getting parameter size. Check whether "
+    //                    "cuobjdump binary is in PATH! Trying anyway (will only "
+    //                    "work if there is no kernel in this binary)");
+    // }
 #ifdef WITH_IB
     if (ib_init(ib_device, server) != 0) {
         LOG(LOG_ERROR, "initilization of infiniband verbs failed.");
@@ -335,17 +334,19 @@ void **__cudaRegisterFatBinary(void *fatCubin)
     void **result;
     int rpc_result;
     enum clnt_stat retval_1;
+    size_t fatbin_size;
     LOGE(LOG_DEBUG, "__cudaRegisterFatBinary(fatCubin=%p)", fatCubin);
 
     mem_data rpc_fat = { .mem_data_len = 0, .mem_data_val = NULL };
 
     if (elf2_get_fatbin_info((struct fat_header *)fatCubin,
                                 &kernel_infos,
-                                (void **)&rpc_fat.mem_data_val,
-                                &rpc_fat.mem_data_len) != 0) {
+                                (uint8_t **)&rpc_fat.mem_data_val,
+                                &fatbin_size) != 0) {
         LOGE(LOG_ERROR, "error getting fatbin info");
         return NULL;
     }
+    rpc_fat.mem_data_len = fatbin_size;
 
     // CUDA registers an atexit handler for fatbin cleanup that accesses
     // the fatbin data structure. Let's allocate some zeroes to avoid segfaults.
diff --git a/cpu/cpu-elf.c b/cpu/cpu-elf.c
deleted file mode 100644
index 866144cb..00000000
--- a/cpu/cpu-elf.c
+++ /dev/null
@@ -1,638 +0,0 @@
-#define _GNU_SOURCE
-#include <stdlib.h>
-#include <stdio.h>
-#include <stdint.h>
-#include <unistd.h>
-#include <errno.h>
-#include <string.h>
-
-#include "cpu-common.h"
-#include "log.h"
-#include "cpu-elf.h"
-#include "cpu-utils.h"
-
-#include "bfd_extracts.h"
-
-#define uint16_t unsigned short
-#define CRICKET_ELF_NV_INFO_PREFIX ".nv.info"
-#define CRICKET_ELF_NV_SHARED_PREFIX ".nv.shared."
-#define CRICKET_ELF_NV_TEXT_PREFIX ".nv.text."
-#define CRICKET_ELF_TEXT_PREFIX ".text."
-
-#define CRICKET_ELF_FATBIN ".nv_fatbin"
-#define CRICKET_ELF_REGFUN "_ZL24__sti____cudaRegisterAllv"
-
-#define FATBIN_STRUCT_MAGIC 0x466243b1
-#define FATBIN_TEXT_MAGIC   0xBA55ED50
-
-struct  __attribute__((__packed__)) fat_elf_header
-{
-    uint32_t magic;
-    uint16_t version;
-    uint16_t header_size;
-    uint64_t fat_size;
-};
-struct  __attribute__((__packed__)) fat_text_header
-{
-    uint16_t kind;
-    uint16_t unknown1;
-    uint32_t header_size;
-    uint64_t fatbin_size;
-    uint64_t compressed_size; //Compression related information
-    uint16_t minor;
-    uint16_t major;
-    uint32_t arch;
-    uint32_t obj_name_offset;
-    uint32_t obj_name_len;
-    uint64_t flags;
-    uint64_t zero;      //Alignment for compression?
-    uint64_t unknown2;  //Compression related information (deflated size?)
-};
-
-#define FATBIN_FLAG_64BIT     0x0000000000000001LL
-#define FATBIN_FLAG_DEBUG     0x0000000000000002LL
-#define FATBIN_FLAG_LINUX     0x0000000000000010LL
-#define FATBIN_FLAG_COMPRESS  0x0000000000002000LL
-
-void elf_init(void)
-{
-    bfd_init();
-}
-
-static int flag_to_str(char** str, uint64_t flag)
-{
-    return asprintf(str, "64Bit: %s, Debug: %s, Linux: %s, Compress %s",
-        (flag & FATBIN_FLAG_64BIT) ? "yes" : "no",
-        (flag & FATBIN_FLAG_DEBUG) ? "yes" : "no",
-        (flag & FATBIN_FLAG_LINUX) ? "yes" : "no",
-        (flag & FATBIN_FLAG_COMPRESS) ? "yes" : "no");
-}
-
-static int fat_header_decode(void *fat, 
-                            struct fat_elf_header **fat_elf_header,
-                            struct fat_text_header **fat_text_header,
-                            void **fat_text_body_ptr)
-{
-    struct fat_elf_header* feh;
-    struct fat_text_header* fth;
-    void *fat_ptr = NULL;
-    void *fat_text_header_ptr = NULL;
-
-    if (fat == NULL || fat_elf_header == NULL || fat_text_header == NULL || fat_text_body_ptr == NULL) {
-        LOGE(LOG_ERROR, "at least one parameter is NULL");
-        return -1;
-    }
-
-    feh = (struct fat_elf_header*)fat;
-    if (feh->magic != FATBIN_TEXT_MAGIC) {
-        LOGE(LOG_ERROR, "fatbin text magic number is wrong. Got %x, expected %x.", *((uint32_t*)feh), FATBIN_TEXT_MAGIC);
-        return -1;
-    }
-    LOGE(LOG_DBG(1), "fat_elf_header: magic: %x, version: %d, header_size: %p, fat_size: %p",
-        feh->magic, feh->version, feh->header_size, feh->fat_size);
-
-    if (feh->version != 1 || feh->header_size != sizeof(struct fat_elf_header)) {
-        LOGE(LOG_ERROR, "fatbin text version is wrong or header size is inconsistent.\
-            This is a sanity check to avoid reading a new fatbinary format");
-        return -1;
-    }
-    fat_ptr = fat_text_header_ptr = (void*)feh + feh->header_size;
-
-    fth = (struct fat_text_header*)(fat_text_header_ptr);
-    LOGE(LOG_DBG(1), "fat_text_header: fatbin_kind: %#x, header_size %#x, fatbin_size %#x, some_offset %#x.\
-        minor %#x, major %#x, arch %d, flags %#x",
-        fth->kind,
-        fth->header_size,
-        fth->fatbin_size,
-        fth->compressed_size,
-        fth->minor,
-        fth->major,
-        fth->arch,
-        fth->flags);
-    LOGE(LOG_DBG(1), "unknown fields: unknown1: %#x, unknown2: %#x, zeros: %#x",
-        fth->unknown1,
-        fth->unknown2,
-        fth->zero);
-    fat_ptr += sizeof(struct fat_header);
-    *fat_text_body_ptr = fat_text_header_ptr + fth->header_size;
-    if (fth->flags & FATBIN_FLAG_DEBUG || fth->flags & FATBIN_FLAG_COMPRESS) {
-       LOGE(LOG_DBG(1), "skipping extra byte \"%#02x\"", *((uint8_t*)*fat_text_body_ptr));
-        *fat_text_body_ptr += 1;
-    }
-
-    char *flag_str = NULL;
-    flag_to_str(&flag_str, fth->flags);
-    LOGE(LOG_DBG(1), "Fatbin flags: %s", flag_str);
-    free(flag_str);
-
-    if(fth->obj_name_offset != 0) {
-        if (((char*)fat_text_header_ptr)[fth->obj_name_offset + fth->obj_name_len] != '\0') {
-            LOGE(LOG_DEBUG, "Fatbin object name is not null terminated");
-        } else {
-            char *obj_name = (char*)fat_text_header_ptr + fth->obj_name_offset;
-            LOGE(LOG_DEBUG, "Fatbin object name: %s (len:%#x)", obj_name, fth->obj_name_len);
-        }
-        fat_ptr += fth->obj_name_len+1;
-    }
-    *fat_elf_header = feh;
-    *fat_text_header = fth;
-    return 0;
-}
-
-int elf_get_fatbin_info(struct fat_header *fatbin, list *kernel_infos, void** fatbin_mem, unsigned* fatbin_size)
-{
-    struct fat_elf_header* fat_elf_header;
-    struct fat_text_header* fat_text_header;
-    void *fat_ptr = NULL;
-    void *fat_text_body_ptr = NULL;
-    unsigned fatbin_total_size = 0;
-    if (fatbin == NULL || fatbin_mem == NULL || fatbin_size == NULL) {
-        LOGE(LOG_ERROR, "at least one parameter is NULL");
-        return -1;
-    }
-    if (fatbin->magic != FATBIN_STRUCT_MAGIC) {
-        LOGE(LOG_ERROR, "fatbin struct magic number is wrong. Got %llx, expected %llx.", fatbin->magic, FATBIN_STRUCT_MAGIC);
-        return -1;
-    }
-    LOG(LOG_DBG(1), "Fatbin: magic: %x, version: %x, text: %lx, data: %lx, ptr: %lx, ptr2: %lx, zero: %lx",
-           fatbin->magic, fatbin->version, fatbin->text, fatbin->data, fatbin->unknown, fatbin->text2, fatbin->zero);
-
-    if (fat_header_decode((void*)fatbin->text, &fat_elf_header, &fat_text_header, &fat_text_body_ptr) != 0) {
-        LOGE(LOG_ERROR, "fatbin header decode failed");
-        return -1;
-    }
-
-
-    fatbin_total_size = fat_elf_header->header_size + fat_elf_header->fat_size;
-
-    // for (int i=0; i<64; i++) {
-    //     printf("%02x ", ((uint8_t*)fat_text_body_ptr)[i]);
-    // }
-    // printf("\n");
-
-    if (fat_text_header->flags & FATBIN_FLAG_COMPRESS) {
-        LOGE(LOG_WARNING, "fatbin contains compressed device code. This is not supported yet.");
-        return -1;
-    }
-    if (fat_text_header->flags & FATBIN_FLAG_DEBUG) {
-        LOGE(LOG_WARNING, "fatbin contains debug information. This is not supported yet.");
-        return -1;
-    }
-
-    if (elf_parameter_info(kernel_infos, fat_text_body_ptr, fat_elf_header->fat_size) != 0) {
-        LOGE(LOG_ERROR, "error getting symbol table");
-        return -1;
-    }
-
-    if (fat_header_decode((void*)fatbin->text2, &fat_elf_header, &fat_text_header, &fat_text_body_ptr) != 0) {
-        LOGE(LOG_ERROR, "fatbin header decode failed");
-        return -1;
-    }
-    fatbin_total_size += fat_elf_header->header_size + fat_elf_header->fat_size;
-
-    // if (cricketd_utils_symtab(fat_text_body_ptr, fat_elf_header->fat_size) == NULL) {
-    //     LOGE(LOG_ERROR, "error getting symbol table");
-    //     return -1;
-    // }
-    fat_ptr = (void*)fatbin->data;
-
-    // for (int i=0; i<64; i++) {
-    //     printf("%02x ", ((uint8_t*)fatbin->text)[i]);
-    // }
-    // printf("\n");
-
-    *fatbin_mem = (void*)fatbin->text;
-    *fatbin_size = fatbin_total_size;
-    return 0;
-}
-
-size_t cudabfd_size = 0;
-int (*orig_cudabfd_stat)(struct bfd *abfd, struct stat* sb);
-int cudabfd_stat(struct bfd *bfd, struct stat *sb)
-{
-    //int ret = orig_cudabfd_stat(bfd, sb);
-    sb->st_size = cudabfd_size;
-    return 0;
-}
-
-static void print_sections(asection *sections)
-{
-    for (asection *section = sections; section != NULL; section = section->next) {
-        printf("section: %s (len: %#x)\n", section->name, section->size);
-    }
-}
-
-static void print_hexmem(void *mem, size_t len)
-{
-    for (int i=0; i<len; i++) {
-        printf("%02x ", ((uint8_t*)mem)[i]);
-    }
-    printf("\n");
-}
-
-struct symtab {
-    asymbol **symtab;
-    size_t symtab_size;
-    size_t symtab_length;
-};
-
-static int symtab_init(bfd *bfd, struct symtab *st)
-{
-    if (st == NULL || bfd == NULL) {
-        LOGE(LOG_ERROR, "at least one parameter is NULL");
-        return -1;
-    }
-
-    if (memset(st, 0, sizeof(struct symtab)) == NULL) {
-        LOGE(LOG_ERROR, "memset failed");
-        return -1;
-    }
-
-    if ((st->symtab_size = bfd_get_symtab_upper_bound(bfd)) == -1) {
-        LOGE(LOG_ERROR, "bfd_get_symtab_upper_bound failed");
-        return -1;
-    }
-
-    if ((st->symtab = (asymbol **)malloc(st->symtab_size)) == NULL) {
-        LOGE(LOG_ERROR, "malloc symtab failed");
-        return -1;
-    }
-
-    if ((st->symtab_length = bfd_canonicalize_symtab(bfd, st->symtab)) == 0) {
-        LOG(LOG_WARNING, "symtab is empty...");
-    } else {
-        LOGE(LOG_DBG(1), "%lu symtab entries", st->symtab_length);
-    }
-    return 0;
-}
-
-static void symtab_free(struct symtab* st)
-{
-    if (st == NULL) {
-        return;
-    }
-    free(st->symtab);
-    memset(st, 0, sizeof(struct symtab));
-}
-
-static int symtab_symbol_at(struct symtab* st, size_t index, const char** sym)
-{
-    if (st == NULL || sym == NULL) {
-        LOGE(LOG_ERROR, "at least one parameter is NULL");
-        return -1;
-    }
-
-    if (index >= st->symtab_length+1 || index == 0) {
-        LOGE(LOG_ERROR, "index out of bounds");
-        return -1;
-    }
-    // The first entry of any symbol table is for undefined symbols and is always zero.
-    // Libbfd ignores this entry, but readelf does not so there is a difference of one
-    // between libbfd indices and those referenced by the .nv.info sections.
-    *sym = bfd_asymbol_name(st->symtab[index-1]);
-    return 0;
-}
-
-static void symtab_print(struct symtab* st)
-{
-    const char* sym;
-    for (int i = 1; i < st->symtab_length+1; ++i) {
-        symtab_symbol_at(st, i, &sym);
-        printf("%#x: name: %s\n", i, sym);
-    }
-}
-
-#define EIATTR_PARAM_CBANK              0xa
-#define EIATTR_EXTERNS                  0xf
-#define EIATTR_FRAME_SIZE               0x11
-#define EIATTR_MIN_STACK_SIZE           0x12
-#define EIATTR_KPARAM_INFO              0x17
-#define EIATTR_CBANK_PARAM_SIZE         0x19
-#define EIATTR_MAX_REG_COUNT            0x1b
-#define EIATTR_EXIT_INSTR_OFFSETS       0x1c
-#define EIATTR_S2RCTAID_INSTR_OFFSETS   0x1d
-#define EIATTR_CRS_STACK_SIZE           0x1e
-#define EIATTR_SW1850030_WAR            0x2a
-#define EIATTR_REGCOUNT                 0x2f
-#define EIATTR_SW2393858_WAR            0x30
-#define EIATTR_INDIRECT_BRANCH_TARGETS  0x34
-#define EIATTR_CUDA_API_VERSION         0x37
-
-#define EIFMT_NVAL                      0x1
-#define EIFMT_HVAL                      0x3
-#define EIFMT_SVAL                      0x4
-
-static int get_parm_for_kernel(bfd *bfd,  kernel_info_t *kernel, void* memory, size_t memsize)
-{
-    struct __attribute__((__packed__)) nv_info_kernel_entry {
-        uint8_t format;
-        uint8_t attribute;
-        uint16_t values_size;
-        uint32_t values;
-    };
-    struct __attribute__((__packed__)) nv_info_kparam_info {
-        uint32_t index;
-        uint16_t ordinal;
-        uint16_t offset;
-        uint16_t unknown : 12;
-        uint8_t  cbank : 6;
-        uint16_t size : 14;
-        // missing are "space" (possible padding info?), and "Pointee's logAlignment"
-        // these were always 0 in the kernels I tested
-    };
-    asection *section = NULL;
-    int ret = -1;
-    char *section_name = NULL;
-
-    if (bfd == NULL || kernel == NULL || kernel->name == NULL || memory == NULL) {
-        LOGE(LOG_ERROR, "at least one parameter is NULL");
-        return ret;
-    }
-    kernel->param_num = 0;
-    kernel->param_offsets = NULL;
-    kernel->param_sizes = NULL;
-
-    if (asprintf(&section_name, ".nv.info.%s", kernel->name) == -1) {
-        LOGE(LOG_ERROR, "asprintf failed");
-        return ret;
-    }
-
-    if ((section = bfd_get_section_by_name(bfd, section_name))== NULL) {
-        LOGE(LOG_ERROR, "%s section not found", section_name);
-        goto cleanup;
-    }
-
-    LOGE(LOG_DBG(1), "name: %s, index: %d, size 0x%lx, pos:%p", section->name,
-        section->index, section->size, (void *)section->filepos);
-
-    //print_hexmem(memory+section->filepos, section->size);
-
-    size_t secpos=0;
-    int i=0;
-    while (secpos < section->size) {
-        struct nv_info_kernel_entry *entry = (struct nv_info_kernel_entry*)(memory+section->filepos+secpos);
-        // printf("entry %d: format: %#x, attr: %#x, ", i++, entry->format, entry->attribute);
-        if (entry->format == EIFMT_SVAL && entry->attribute == EIATTR_KPARAM_INFO) {
-            if (entry->values_size != 0xc) {
-                LOGE(LOG_ERROR, "EIATTR_KPARAM_INFO values size has not the expected value of 0xc");
-                goto cleanup;
-            }
-            struct nv_info_kparam_info *kparam = (struct nv_info_kparam_info*)&entry->values;
-            // printf("kparam: index: %#x, ordinal: %#x, offset: %#x, unknown: %#0x, cbank: %#0x, size: %#0x\n",
-            //     kparam->index, kparam->ordinal, kparam->offset, kparam->unknown, kparam->cbank, kparam->size);
-            LOGE(LOG_DEBUG, "param %d: offset: %#x, size: %#x", kparam->ordinal, kparam->offset, kparam->size);
-            if (kparam->ordinal >= kernel->param_num) {
-                kernel->param_offsets = realloc(kernel->param_offsets,
-                                              (kparam->ordinal+1)*sizeof(uint16_t));
-                kernel->param_sizes = realloc(kernel->param_sizes,
-                                            (kparam->ordinal+1)*sizeof(uint16_t));
-                kernel->param_num = kparam->ordinal+1;
-            }
-            kernel->param_offsets[kparam->ordinal] = kparam->offset;
-            kernel->param_sizes[kparam->ordinal] = kparam->size;
-            secpos += sizeof(struct nv_info_kernel_entry) + entry->values_size-4;
-        } else if (entry->format == EIFMT_HVAL && entry->attribute == EIATTR_CBANK_PARAM_SIZE) {
-            kernel->param_size = entry->values_size;
-            LOGE(LOG_DEBUG, "cbank_param_size: %#0x", entry->values_size);
-            secpos += sizeof(struct nv_info_kernel_entry)-4;
-        } else if (entry->format == EIFMT_HVAL) {
-            // printf("hval: %#x(%d)\n", entry->values_size, entry->values_size);
-            secpos += sizeof(struct nv_info_kernel_entry)-4;
-        } else if (entry->format == EIFMT_SVAL) {
-            // printf("sval_size: %#x ", entry->values_size);
-            // for (int j=0; j*sizeof(uint32_t) < entry->values_size; j++) {
-            //     printf("val%d: %#x(%d) ", j, (&entry->values)[j], (&entry->values)[j]);
-            // }
-            // printf("\n");
-            secpos += sizeof(struct nv_info_kernel_entry) + entry->values_size-4;
-        } else if (entry->format == EIFMT_NVAL) {
-            // printf("nval\n");
-            secpos += sizeof(struct nv_info_kernel_entry)-4;
-        } else {
-            LOGE(LOG_WARNING, "unknown format: %#x", entry->format);
-            secpos += sizeof(struct nv_info_kernel_entry)-4;
-        }
-    }
-    // printf("remaining: %d\n", section->size % sizeof(struct nv_info_kernel_entry));
-    ret = 0;
- cleanup:
-    free(section_name);
-    return ret;
-}
-
-//#define ELF_DUMP_TO_FILE 1
-
-int elf_parameter_info(list *kernel_infos, void* memory, size_t memsize)
-{
-    struct __attribute__((__packed__)) nv_info_entry{
-        uint8_t format;
-        uint8_t attribute;
-        uint16_t values_size;
-        uint32_t kernel_id;
-        uint32_t value;
-    };
-
-    bfd *bfd = NULL;
-    FILE *fd = NULL;
-    asection *section = NULL;
-    int ret = -1;
-    struct symtab symtab = {0};
-    char path[256];
-    struct bfd_iovec *iovec = NULL;
-    const struct bfd_iovec *orig_iovec = NULL;
-
-    kernel_info_t *ki = NULL;
-
-    if (memory == NULL || memsize == 0) {
-        LOGE(LOG_ERROR, "memory was NULL or memsize was 0");
-        return -1;
-    }
-
-#ifdef ELF_DUMP_TO_FILE
-    FILE* fd2 = fopen("/tmp/cricket-elf-dump", "wb");
-    fwrite(memory, memsize, 1, fd2);
-    fclose(fd2);
-#endif
-
-    if ((fd = fmemopen(memory, memsize, "rb")) == NULL) {
-        LOGE(LOG_ERROR, "fmemopen failed");
-        goto cleanup;
-    }
-
-
-    if ((bfd = bfd_openstreamr("", "elf64-little", fd)) == NULL) {
-        LOGE(LOG_ERROR, "bfd_openr failed");
-        goto cleanup;
-    }
-
-    //We change the iovec of cudabfd so we can report the correct filesize
-    //because in-memory files always report a file size of 0, which creates 
-    //problems elsewhere
-    cudabfd_size = memsize;
-    orig_cudabfd_stat = bfd->iovec->bstat;
-    orig_iovec = bfd->iovec;
-    iovec = (struct bfd_iovec*)malloc(sizeof(struct bfd_iovec));
-    memcpy(iovec, bfd->iovec, sizeof(struct bfd_iovec));
-    iovec->bstat = cudabfd_stat;
-    bfd->iovec = iovec;
-
-    if (!bfd_check_format(bfd, bfd_object)) {
-        LOGE(LOG_ERROR, "bfd has wrong format");
-        goto cleanup;
-    }
-    // print_sections(bfd->sections);
-
-    if  (symtab_init(bfd, &symtab) != 0) {
-        LOGE(LOG_ERROR, "symtab_init failed");
-        goto cleanup;
-    }
-    // symtab_print(&symtab);
-
-    section = bfd_get_section_by_name(bfd, ".nv.info");
-    if (section == NULL) {
-        LOGE(LOG_ERROR, ".nv.info section not found");
-        goto cleanup;
-    }
-
-    LOGE(LOG_DBG(1), "name: %s, index: %d, size 0x%lx, pos:%p", section->name,
-        section->index, section->size, (void *)section->filepos);
-    //print_hexmem(memory+section->filepos, section->size); 
-    int i = 0;
-    const char *kernel_str;
-    for (size_t secpos=0; secpos < section->size; secpos += sizeof(struct nv_info_entry)) {
-        struct nv_info_entry *entry = (struct nv_info_entry*)(memory+section->filepos+secpos);
-        if (entry->values_size != 8) {
-            LOGE(LOG_ERROR, "unexpected values_size: %#x", entry->values_size);
-            continue;
-        }
-        // printf("%d: format: %#x, attr: %#x, values_size: %#x kernel: %#x, sval: %#x(%d)\n", 
-        //         i++, entry->format, entry->attribute, entry->values_size, entry->kernel_id, 
-        //         entry->value, entry->value);
-        if (entry->attribute != EIATTR_FRAME_SIZE) {
-            continue;
-        }
-        if (symtab_symbol_at(&symtab, entry->kernel_id, &kernel_str) != 0) {
-            LOGE(LOG_ERROR, "symtab_symbol_at failed for entry %d", i);
-            continue;
-        }
-        if (utils_search_info(kernel_infos, kernel_str) != NULL) {
-            continue;
-        }
-
-        LOGE(LOG_DEBUG, "found new kernel: %s (symbol table id: %#x)", kernel_str, entry->kernel_id);
-
-        if (list_append(kernel_infos, (void**)&ki) != 0) {
-            LOGE(LOG_ERROR, "error on appending to list");
-            goto cleanup;
-        }
-
-        size_t buflen = strlen(kernel_str)+1;
-        if ((ki->name = malloc(buflen)) == NULL) {
-            LOGE(LOG_ERROR, "malloc failed");
-            goto cleanup;
-        }
-        if (strncpy(ki->name, kernel_str, buflen) != ki->name) {
-            LOGE(LOG_ERROR, "strncpy failed");
-            goto cleanup;
-        }
-
-        if (get_parm_for_kernel(bfd, ki, memory, memsize) != 0) {
-            LOGE(LOG_ERROR, "get_parm_for_kernel failed for kernel %s", kernel_str);
-            goto cleanup;
-        }
-    }
-
-    ret = 0;
- cleanup:
-    free(iovec);
-    if (fd != NULL)
-        fclose(fd);
-    symtab_free(&symtab);
-    if (bfd != NULL) {
-        // Also closes fd
-        bfd_close(bfd);
-    }
-    return ret;
-}
-
-
-void* elf_symbol_address(const char* file, char *symbol)
-{
-    bfd *hostbfd = NULL;
-    asection *section;
-    FILE *hostbfd_fd = NULL;
-    void *ret = NULL;
-    size_t symtab_size, symtab_length;
-    asymbol **symtab = NULL;
-    char path[256];
-    size_t length;
-    const char self[] = "/proc/self/exe";
-    if (file == NULL) {
-        file = self;
-    }
-
-
-    bfd_init();
-
-    length = readlink(file, path, sizeof(path));
-
-    /* Catch some errors: */
-    if (length < 0) {
-        LOGE(LOG_WARNING, "error resolving symlink %s.", file);
-    } else if (length >= 256) {
-        LOGE(LOG_WARNING, "path was too long and was truncated.");
-    } else {
-        path[length] = '\0';
-        LOG(LOG_DEBUG, "opening '%s'", path);
-    }
-
-    if ((hostbfd_fd = fopen(file, "rb")) == NULL) {
-        LOGE(LOG_ERROR, "fopen failed");
-        return NULL;
-    }
-
-    if ((hostbfd = bfd_openstreamr(file, NULL, hostbfd_fd)) == NULL) {
-        LOGE(LOG_ERROR, "bfd_openr failed on %s",
-             file);
-        fclose(hostbfd_fd);
-        goto cleanup;
-    }
-
-    if (!bfd_check_format(hostbfd, bfd_object)) {
-        LOGE(LOG_ERROR, "%s has wrong bfd format",
-             file);
-        goto cleanup;
-    }
-
-    if ((symtab_size = bfd_get_symtab_upper_bound(hostbfd)) == -1) {
-        LOGE(LOG_ERROR, "bfd_get_symtab_upper_bound failed");
-        return NULL;
-    }
-
-    if ((symtab = (asymbol **)malloc(symtab_size)) == NULL) {
-        LOGE(LOG_ERROR, "malloc symtab failed");
-        return NULL;
-    }
-
-    if ((symtab_length = bfd_canonicalize_symtab(hostbfd, symtab)) == 0) {
-        LOG(LOG_WARNING, "symtab is empty...");
-    } else {
-        //printf("%lu symtab entries\n", symtab_length);
-    }
-
-    for (int i = 0; i < symtab_length; ++i) {
-        if (strcmp(bfd_asymbol_name(symtab[i]), CRICKET_ELF_REGFUN) == 0) {
-            ret = (void*)bfd_asymbol_value(symtab[i]);
-            break;
-        }
-        //printf("%d: %s: %lx\n", i, bfd_asymbol_name(symtab[i]),
-        //       bfd_asymbol_value(symtab[i]));
-    }
-
-
- cleanup:
-    free(symtab);
-    if (hostbfd != NULL)
-        bfd_close(hostbfd);
-    return ret;
-}
\ No newline at end of file
diff --git a/cpu/cpu-elf.h b/cpu/cpu-elf.h
deleted file mode 100644
index 4c9abe4f..00000000
--- a/cpu/cpu-elf.h
+++ /dev/null
@@ -1,25 +0,0 @@
-#ifndef _ELF_H_
-#define _ELF_H_
-
-#include <stdint.h>
-#include "cpu-common.h"
-#include "list.h"
-
-struct fat_header {
-    uint32_t magic;
-    uint32_t version;
-    uint64_t text;
-    uint64_t data;  // points to outside of the file
-    uint64_t unknown;
-    uint64_t text2;
-    uint64_t zero;
-};
-
-void elf_init(void);
-int elf_get_fatbin_info(struct fat_header *fatbin, list *kernel_infos, void** fatbin_mem, unsigned* fatbin_size);
-
-int elf_parameter_info(list *kernel_infos, void* memory, size_t memsize);
-void* elf_symbol_address(const char* file, char *symbol);
-int elf_contains_kernel(void* memory, size_t memsize);
-
-#endif //_ELF_H_
diff --git a/cpu/cpu-elf2.c b/cpu/cpu-elf2.c
index 097f93c7..13dd6fc5 100644
--- a/cpu/cpu-elf2.c
+++ b/cpu/cpu-elf2.c
@@ -30,24 +30,26 @@ struct  __attribute__((__packed__)) fat_elf_header
     uint32_t magic;
     uint16_t version;
     uint16_t header_size;
-    uint64_t fat_size;
+    uint64_t size;
 };
 struct  __attribute__((__packed__)) fat_text_header
 {
     uint16_t kind;
     uint16_t unknown1;
     uint32_t header_size;
-    uint64_t fatbin_size;
-    uint64_t compressed_size; // Compression related information
+    uint64_t size;
+    uint32_t compressed_size;       // Size of compressed data
+    uint32_t unknown2;              // Address size for PTX?
     uint16_t minor;
     uint16_t major;
     uint32_t arch;
     uint32_t obj_name_offset;
     uint32_t obj_name_len;
     uint64_t flags;
-    uint64_t zero;      // Alignment for compression?
-    uint64_t decompressed_len;  // Length of compressed data. There is an uncompressed footer
-                              // so this is generally smaller than fatbin_size
+    uint64_t zero;                  // Alignment for compression?
+    uint64_t decompressed_size;     // Length of compressed data in decompressed representation.
+                                    // There is an uncompressed footer so this is generally smaller
+                                    // than size.
 };
 
 #define FATBIN_FLAG_64BIT     0x0000000000000001LL
@@ -61,6 +63,7 @@ int elf2_init(void)
         LOGE(LOG_ERROR, "ELF library initialization failed: %s", elf_errmsg(-1));
         return -1;
     }
+    return 0;
 }
 
 static int flag_to_str(char** str, uint64_t flag)
@@ -72,142 +75,434 @@ static int flag_to_str(char** str, uint64_t flag)
         (flag & FATBIN_FLAG_COMPRESS) ? "yes" : "no");
 }
 
-static int fat_header_decode(void *fat, 
-                            struct fat_elf_header **fat_elf_header,
-                            struct fat_text_header **fat_text_header,
-                            void **fat_text_body_ptr)
+static void print_header(struct fat_text_header *th)
 {
-    struct fat_elf_header* feh;
-    struct fat_text_header* fth;
-    void *fat_ptr = NULL;
-    void *fat_text_header_ptr = NULL;
+    char* flagstr = NULL;
+    flag_to_str(&flagstr, th->flags);
+
+    LOGE(LOG_DBG(1), "text_header: fatbin_kind: %#x, header_size %#x, size %#zx, compressed_size %#x,\
+ minor %#x, major %#x, arch %d, decompressed_size %#zx\n\tflags: %s\n",
+        th->kind,
+        th->header_size,
+        th->size,
+        th->compressed_size,
+        th->minor,
+        th->major,
+        th->arch,
+        th->decompressed_size,
+        flagstr);
+    LOGE(LOG_DBG(1), "\tunknown fields: unknown1: %#x, unknown2: %#x, zeros: %#zx\n",
+        th->unknown1,
+        th->unknown2,
+        th->zero);
+}
 
-    if (fat == NULL || fat_elf_header == NULL || fat_text_header == NULL || fat_text_body_ptr == NULL) {
-        LOGE(LOG_ERROR, "at least one parameter is NULL");
-        return -1;
+/** Check the header of a fatbin
+ * Performs some integrity checks and returns the elf header
+ * @param fatbin_data Pointer to the fatbin data
+ * @param fatbin_size Size of the fatbin data
+ * @param elf_header Pointer to a variable that will be set to point to the elf header at the start of fatbin_data
+ * @return 0 on success, 1 if a parameter is NULL, fatbin_size is too small, or the magic/version/header_size check fails
+*/
+static int get_elf_header(const uint8_t* fatbin_data, size_t fatbin_size, struct fat_elf_header **elf_header)
+{
+    struct fat_elf_header *eh = NULL;
+
+    if (fatbin_data == NULL || elf_header == NULL) {
+        LOGE(LOG_ERROR, "fatbin_data is NULL");
+        return 1;
     }
 
-    feh = (struct fat_elf_header*)fat;
-    if (feh->magic != FATBIN_TEXT_MAGIC) {
-        LOGE(LOG_ERROR, "fatbin text magic number is wrong. Got %x, expected %x.", *((uint32_t*)feh), FATBIN_TEXT_MAGIC);
-        return -1;
+    if (fatbin_size < sizeof(struct fat_elf_header)) {
+        LOGE(LOG_ERROR, "fatbin_size is too small");
+        return 1;
     }
-    LOGE(LOG_DBG(1), "fat_elf_header: magic: %x, version: %d, header_size: %p, fat_size: %p",
-        feh->magic, feh->version, feh->header_size, feh->fat_size);
 
-    if (feh->version != 1 || feh->header_size != sizeof(struct fat_elf_header)) {
+    eh = (struct fat_elf_header*) fatbin_data;
+    if (eh->magic != FATBIN_TEXT_MAGIC) {
+        LOGE(LOG_ERROR, "Invalid magic  number: expected %#x but got %#x", FATBIN_TEXT_MAGIC, eh->magic);
+        return 1;
+    }
+
+    if (eh->version != 1 || eh->header_size != sizeof(struct fat_elf_header)) {
         LOGE(LOG_ERROR, "fatbin text version is wrong or header size is inconsistent.\
             This is a sanity check to avoid reading a new fatbinary format");
-        return -1;
+        return 1;
     }
-    fat_ptr = fat_text_header_ptr = (void*)feh + feh->header_size;
-
-    fth = (struct fat_text_header*)(fat_text_header_ptr);
-    LOGE(LOG_DBG(1), "fat_text_header: fatbin_kind: %#x, header_size %#x, fatbin_size %#x, compressed_size %#x,\
-        minor %#x, major %#x, arch %d, flags %#x, compressed_len %#x",
-        fth->kind,
-        fth->header_size,
-        fth->fatbin_size,
-        fth->compressed_size,
-        fth->minor,
-        fth->major,
-        fth->arch,
-        fth->flags,
-        fth->decompressed_len);
-    LOGE(LOG_DBG(1), "unknown fields: unknown1: %#x, zeros: %#x",
-        fth->unknown1,
-        fth->zero);
-    fat_ptr += sizeof(struct fat_header);
-    *fat_text_body_ptr = fat_text_header_ptr + fth->header_size;
-    if (fth->flags & FATBIN_FLAG_DEBUG || fth->flags & FATBIN_FLAG_COMPRESS) {
-       LOGE(LOG_DBG(1), "skipping extra byte \"%#02x\"", *((uint8_t*)*fat_text_body_ptr));
-        *fat_text_body_ptr += 1;
-    }
-
-    char *flag_str = NULL;
-    flag_to_str(&flag_str, fth->flags);
-    LOGE(LOG_DBG(1), "Fatbin flags: %s", flag_str);
-    free(flag_str);
-
-    if(fth->obj_name_offset != 0) {
-        if (((char*)fat_text_header_ptr)[fth->obj_name_offset + fth->obj_name_len] != '\0') {
-            LOGE(LOG_DEBUG, "Fatbin object name is not null terminated");
+    
+    *elf_header = eh;
+    return 0;
+}
+
+/** Check the text header of a fatbin
+ * Performs some integrity checks and returns the text header
+ * @param fatbin_data Pointer to the fatbin data
+ * @param fatbin_size Size of the fatbin data
+ * @param text_header Pointer to a variable that will be set to point to the text header at the start of fatbin_data
+ * @return 0 on success, 1 if a parameter is NULL or fatbin_size is too small
+*/
+static int get_text_header(const uint8_t* fatbin_data, size_t fatbin_size, struct fat_text_header **text_header)
+{
+    struct fat_text_header *th = NULL;
+
+    if (fatbin_data == NULL || text_header == NULL) {
+        LOGE(LOG_ERROR, "fatbin_data is NULL");
+        return 1;
+    }
+
+    if (fatbin_size < sizeof(struct fat_text_header)) {
+        LOGE(LOG_ERROR, "fatbin_size is too small");
+        return 1;
+    }
+
+    th = (struct fat_text_header*)fatbin_data;
+
+    if(th->obj_name_offset != 0) {
+        if (((char*)th)[th->obj_name_offset + th->obj_name_len] != '\0') {
+            LOGE(LOG_WARNING, "Fatbin object name is not null terminated");
         } else {
-            char *obj_name = (char*)fat_text_header_ptr + fth->obj_name_offset;
-            LOGE(LOG_DEBUG, "Fatbin object name: %s (len:%#x)", obj_name, fth->obj_name_len);
+            char *obj_name = (char*)th + th->obj_name_offset;
+            LOGE(LOG_DEBUG, "Fatbin object name: %s (len:%#x)", obj_name, th->obj_name_len);
         }
-        fat_ptr += fth->obj_name_len+1;
     }
-    *fat_elf_header = feh;
-    *fat_text_header = fth;
+
+    *text_header = th;
     return 0;
 }
 
-int elf2_get_fatbin_info(struct fat_header *fatbin, list *kernel_infos, void** fatbin_mem, unsigned* fatbin_size)
+/** Decompresses a fatbin file
+ * @param input Pointer to the compressed input data
+ * @param input_size Size of compressed data
+ * @param output preallocated memory where decompressed output should be stored
+ * @param output_size size of output buffer. Should be equal to the size of the decompressed data
+ */
+static size_t decompress(const uint8_t* input, size_t input_size, uint8_t* output, size_t output_size)
 {
-    struct fat_elf_header* fat_elf_header;
-    struct fat_text_header* fat_text_header;
-    void *fat_ptr = NULL;
-    void *fat_text_body_ptr = NULL;
-    unsigned fatbin_total_size = 0;
-    if (fatbin == NULL || fatbin_mem == NULL || fatbin_size == NULL) {
-        LOGE(LOG_ERROR, "at least one parameter is NULL");
-        return -1;
+    size_t ipos = 0, opos = 0;  
+    uint16_t next_nclen;  // length of next non-compressed segment
+    uint16_t next_clen;   // length of next compressed segment
+    uint16_t back_offset; // negative offset where redundant data is located, relative to current opos
+
+    while (ipos < input_size) {
+        next_nclen = (input[ipos] & 0xf0) >> 4;
+        next_clen = 4 + (input[ipos] & 0xf);
+        if (next_nclen == 0xf) {
+            next_nclen += input[++ipos];
+        }
+        
+        if (memcpy(output + opos, input + (++ipos), next_nclen) == NULL) {
+            LOGE(LOG_ERROR, "copying data");
+            return 0;
+        }
+#ifdef FATBIN_DECOMPRESS_DEBUG
+        printf("%#04zx nocompress (len:%#x):\n", opos, next_nclen);
+        hexdump(output + opos, next_nclen);
+#endif
+        ipos += next_nclen;
+        opos += next_nclen;
+        if (ipos >= input_size || opos >= output_size) {
+            break;
+        }
+        back_offset = input[ipos] + (input[ipos + 1] << 8);       
+        ipos += 2;
+        if (next_clen == 0xf+4) {
+            do {
+                next_clen += input[ipos++];
+            } while (input[ipos - 1] == 0xff);
+        }
+#ifdef FATBIN_DECOMPRESS_DEBUG
+        printf("%#04zx compress (decompressed len: %#x, back_offset %#x):\n", opos, next_clen, back_offset);
+#endif
+        if (next_clen <= back_offset) {
+            if (memcpy(output + opos, output + opos - back_offset, next_clen) == NULL) {
+                LOGE(LOG_ERROR, "Error copying data");
+                return 0;
+            }
+        } else {
+            if (memcpy(output + opos, output + opos - back_offset, back_offset) == NULL) {
+                LOGE(LOG_ERROR, "Error copying data");
+                return 0;
+            }
+            for (size_t i = back_offset; i < next_clen; i++) {
+                output[opos + i] = output[opos + i - back_offset];
+            }
+        }
+#ifdef FATBIN_DECOMPRESS_DEBUG
+        hexdump(output + opos, next_clen);
+#endif
+        opos += next_clen;
     }
-    if (fatbin->magic != FATBIN_STRUCT_MAGIC) {
-        LOGE(LOG_ERROR, "fatbin struct magic number is wrong. Got %llx, expected %llx.", fatbin->magic, FATBIN_STRUCT_MAGIC);
-        return -1;
+    return opos;
+}
+
+static ssize_t decompress_section(const uint8_t *input, uint8_t **output, size_t *output_size,
+                                  struct fat_elf_header *eh, struct fat_text_header *th, size_t *eh_out_offset)
+{   // returns the number of input bytes consumed (payload + padding) or -1 on error; after the error label *output is freed and NULL
+    struct fat_text_header *th_out = NULL;
+    struct fat_elf_header *eh_out = NULL;
+    uint8_t *output_pos = 0;
+    size_t padding;
+    size_t input_read = 0;
+    const uint8_t zeroes[8] = {0};  // the input padding computed below is 1..8 bytes, so 8 zeroes are needed
+
+    if (output == NULL || output_size == NULL || eh == NULL || th == NULL || eh_out_offset == NULL) {
+        LOGE(LOG_ERROR, "invalid parameters");
+        return -1;  // callers test for < 0; a positive value would be taken as "bytes consumed"
+    }
+
+    if ((output_pos = realloc(*output, *output_size + th->decompressed_size + eh->header_size + th->header_size + 7)) == NULL) {
+        LOGE(LOG_ERROR, "Error allocating memory of size %#zx for output buffer: %s", 
+                *output_size + th->decompressed_size + eh->header_size + th->header_size + 7, strerror(errno));
+        goto error;
+    }
+    *output = output_pos;           // commit only on success so a failed realloc does not leak the old buffer
+    output_pos += *output_size;
+    *output_size += th->decompressed_size + th->header_size;
+    if (input == (uint8_t*)eh + eh->header_size + th->header_size) { // We are at the first section
+        if (memcpy(output_pos, eh, eh->header_size) == NULL) {
+            LOGE(LOG_ERROR, "Error copying data");
+            goto error;
+        }
+        eh_out = ((struct fat_elf_header*)(output_pos));
+        eh_out->size = 0;
+        *eh_out_offset = output_pos - *output;
+        output_pos += eh->header_size;
+        *output_size += eh->header_size;
     }
-    LOG(LOG_DBG(1), "Fatbin: magic: %x, version: %x, text: %lx, data: %lx, ptr: %lx, ptr2: %lx, zero: %lx",
-           fatbin->magic, fatbin->version, fatbin->text, fatbin->data, fatbin->unknown, fatbin->text2, fatbin->zero);
+    eh_out = ((struct fat_elf_header*)(*output + *eh_out_offset)); // repair pointer in case realloc moved the buffer
+    eh_out->size += th->decompressed_size + th->header_size;       // set size
 
-    if (fat_header_decode((void*)fatbin->text, &fat_elf_header, &fat_text_header, &fat_text_body_ptr) != 0) {
-        LOGE(LOG_ERROR, "fatbin header decode failed");
-        return -1;
+    if (memcpy(output_pos, th, th->header_size) == NULL) {
+        LOGE(LOG_ERROR, "Error copying data");
+        goto error;
     }
+    th_out = ((struct fat_text_header*)output_pos);
+    th_out->flags &= ~FATBIN_FLAG_COMPRESS;  // clear compressed flag
+    th_out->compressed_size = 0;             // clear compressed size
+    th_out->decompressed_size = 0;           // clear decompressed size
+    th_out->size = th->decompressed_size;    // set size
 
+    output_pos += th->header_size;
 
-    fatbin_total_size = fat_elf_header->header_size + fat_elf_header->fat_size;
+    if (decompress(input, th->compressed_size, output_pos, th->decompressed_size) != th->decompressed_size) {
+        LOGE(LOG_ERROR, "Decompression failed");
+        goto error;
+    }
+
+    input_read += th->compressed_size;
+    output_pos += th->decompressed_size;
 
-    // for (int i=0; i<64; i++) {
-    //     printf("%02x ", ((uint8_t*)fat_text_body_ptr)[i]);
+    // if (input_pos != (uint8_t*)th + eh->size) {
+    //     printf("There is %#zx bytes of data remaining\n", (uint8_t*)th + eh->size - input_pos);
     // }
-    // printf("\n");
+    
+    padding = (8 - (size_t)(input + input_read) % 8);
+    if (memcmp(input + input_read, zeroes, padding) != 0) {
+        LOGE(LOG_ERROR, "expected %#zx zero bytes, got:", padding);
+        hexdump(input + input_read, 0x60);
+        goto error;
+    }
+    input_read += padding;
+
+    padding = ((8 - (size_t)th->decompressed_size) % 8);
+    // The buffer was allocated with 7 spare bytes (the maximal padding), so the zero padding
+    // always fits and we do not have to reallocate here.
+    memset(output_pos, 0, padding);
+    *output_size += padding;
+    eh_out->size += padding;
+    th_out->size += padding;
+
+    return input_read;
+ error:
+    free(*output);
+    *output = NULL;
+    return -1;
+}
 
-    if (fat_text_header->flags & FATBIN_FLAG_COMPRESS) {
-        LOGE(LOG_WARNING, "fatbin contains compressed device code. This is not supported yet.");
-        //return -1;
+static ssize_t decompress_single_section(const uint8_t *input, uint8_t **output, size_t *output_size,
+                                         struct fat_elf_header *eh, struct fat_text_header *th)
+{   // returns the number of input bytes consumed (payload + padding) or -1 on error; on success *output is a malloc'd buffer
+    size_t padding;
+    size_t input_read = 0;
+    size_t output_written = 0;
+    const uint8_t zeroes[8] = {0};  // the input padding computed below is 1..8 bytes, so 8 zeroes are needed
+
+    if (input == NULL || output == NULL || output_size == NULL || eh == NULL || th == NULL) {
+        LOGE(LOG_ERROR, "invalid parameters");
+        return -1;  // callers test for < 0; a positive value would be taken as "bytes consumed"
     }
-    if (fat_text_header->flags & FATBIN_FLAG_DEBUG) {
-        LOGE(LOG_WARNING, "fatbin contains debug information. This is not supported yet.");
-        return -1;
+
+    // add max padding of 7 bytes
+    if ((*output = malloc(th->decompressed_size + 7)) == NULL) {
+        LOGE(LOG_ERROR, "Error allocating memory of size %#zx for output buffer: %s", 
+                th->decompressed_size, strerror(errno));
+        goto error;
     }
 
-    if (elf2_parameter_info(kernel_infos, fat_text_body_ptr, fat_elf_header->fat_size) != 0) {
-        LOGE(LOG_ERROR, "error getting parameter info");
-        return -1;
+    if (decompress(input, th->compressed_size, *output, th->decompressed_size) != th->decompressed_size) {
+        LOGE(LOG_ERROR, "Decompression failed");
+        goto error;
     }
+    input_read += th->compressed_size;
+    output_written += th->decompressed_size;
 
-    if (fat_header_decode((void*)fatbin->text2, &fat_elf_header, &fat_text_header, &fat_text_body_ptr) != 0) {
-        LOGE(LOG_ERROR, "fatbin header decode failed");
-        return -1;
+    padding = (8 - (size_t)(input + input_read) % 8);
+    if (memcmp(input + input_read, zeroes, padding) != 0) {
+        LOGE(LOG_ERROR, "expected %#zx zero bytes, got:", padding);
+        hexdump(input + input_read, 0x60);
+        goto error;
     }
-    fatbin_total_size += fat_elf_header->header_size + fat_elf_header->fat_size;
+    input_read += padding;
 
-    // if (cricketd_utils_symtab(fat_text_body_ptr, fat_elf_header->fat_size) == NULL) {
-    //     LOGE(LOG_ERROR, "error getting symbol table");
-    //     return -1;
-    // }
-    fat_ptr = (void*)fatbin->data;
+    padding = ((8 - (size_t)th->decompressed_size) % 8);
+    // The buffer was allocated with 7 spare bytes (the maximal padding), so the zero padding
+    // appended behind the decompressed data always fits.
+    memset(*output + output_written, 0, padding);
+    output_written += padding;
 
-    // for (int i=0; i<64; i++) {
-    //     printf("%02x ", ((uint8_t*)fatbin->text)[i]);
-    // }
-    // printf("\n");
+    *output_size = output_written;
+    return input_read;
+ error:
+    free(*output);
+    *output = NULL;
+    return -1;
+}
+
+/** Decompresses a fatbin file
+ * @param fatbin_data Pointer to the fatbin data
+ * @param fatbin_size Size of the fatbin data
+ * @param decompressed_data Pointer to a variable that will be set to point to the decompressed data (heap buffer)
+ * @return Size of the decompressed data, or 0 on error (*decompressed_data is then set to NULL)
+ */
+static size_t decompress_fatbin(const uint8_t* fatbin_data, size_t fatbin_size, uint8_t** decompressed_data)
+{
+    struct fat_elf_header *eh = NULL;
+    size_t eh_out_offset = 0;
+    struct fat_text_header *th = NULL;
+    const uint8_t *input_pos = fatbin_data;
+
+    int i = 0;
+    uint8_t *output = NULL;
+    size_t output_size = 0;
+    ssize_t input_read;
+
+    if (fatbin_data == NULL || decompressed_data == NULL) {
+        LOGE(LOG_ERROR, "fatbin_data is NULL");
+        goto error;
+    }
+
+    while (input_pos < fatbin_data + fatbin_size) {
+        if (get_elf_header(input_pos, fatbin_size - (input_pos - fatbin_data), &eh) != 0) {
+            LOGE(LOG_ERROR, "Something went wrong while checking the header.");
+            goto error;
+        }
+        // printf("elf header no. %d: magic: %#x, version: %#x, header_size: %#x, size: %#zx\n",
+        //        i++, eh->magic, eh->version, eh->header_size, eh->size);
+        input_pos += eh->header_size;
+        do {
+            if (get_text_header(input_pos, fatbin_size - (input_pos - fatbin_data) - eh->header_size, &th) != 0) {
+                LOGE(LOG_ERROR, "Something went wrong while checking the header.");
+                goto error;
+            }
+            //print_header(th);
+            input_pos += th->header_size;
+
+            if ((input_read = decompress_section(input_pos, &output, &output_size, eh, th, &eh_out_offset)) < 0) {
+                LOGE(LOG_ERROR, "Something went wrong while decompressing text section.");
+                goto error;
+            }
+            input_pos += input_read;
+
+        } while (input_pos < (uint8_t*)eh + eh->header_size + eh->size);
+
+        //printf("##### Decompressed data (size %#zx): #####\n", th->decompressed_size);
+        //hexdump(output_pos, th->decompressed_size);
+    }
+
+    *decompressed_data = output;
+    return output_size;
+ error:
+    free(output);                    // free(NULL) is a no-op
+    if (decompressed_data != NULL) { // this label is also reached when decompressed_data itself is NULL
+        *decompressed_data = NULL;
+    }
+    return 0;
+}
+
+int elf2_get_fatbin_info(const struct fat_header *fatbin, list *kernel_infos, uint8_t** fatbin_mem, size_t* fatbin_size)
+{   // returns 0 on success, -1 on error; *fatbin_size is read as the input size and overwritten with the total size
+    struct fat_elf_header* eh;
+    struct fat_text_header* th;
+    const uint8_t *input_pos = NULL;
+    const uint8_t *fatbin_data = NULL;
+    uint8_t *text_data = NULL;
+    size_t text_data_size = 0;
+    size_t fatbin_total_size = 0;
+    int ret = -1;
+    if (fatbin == NULL || fatbin_mem == NULL || fatbin_size == NULL) {
+        LOGE(LOG_ERROR, "at least one parameter is NULL");
+        goto error;
+    }
+    fatbin_data = input_pos = (const uint8_t*)fatbin->text;
+    if (fatbin->magic != FATBIN_STRUCT_MAGIC) {
+        LOGE(LOG_ERROR, "fatbin struct magic number is wrong. Got %llx, expected %llx.", fatbin->magic, FATBIN_STRUCT_MAGIC);
+        goto error;
+    }
+    LOG(LOG_DBG(1), "Fatbin: magic: %x, version: %x, text: %lx, data: %lx, ptr: %lx, ptr2: %lx, zero: %lx",
+           fatbin->magic, fatbin->version, fatbin->text, fatbin->data, fatbin->unknown, fatbin->text2, fatbin->zero);
+
+    if (get_elf_header((uint8_t*)fatbin->text, sizeof(struct fat_elf_header), &eh) != 0) {
+        LOGE(LOG_ERROR, "Something went wrong while checking the header.");
+        goto error;
+    }
+    input_pos += eh->header_size;
+    fatbin_total_size = eh->header_size + eh->size;
+    do {
+        if (get_text_header(input_pos, *fatbin_size - (input_pos - fatbin_data) - eh->header_size, &th) != 0) {
+            LOGE(LOG_ERROR, "Something went wrong while checking the header.");
+            goto error;
+        }
+        //print_header(th);
+        input_pos += th->header_size;
+        if (th->flags & FATBIN_FLAG_DEBUG) {
+            LOGE(LOG_DEBUG, "fatbin contains debug information. This is not supported, yet.");
+            goto error;
+        }
+
+        if (th->flags & FATBIN_FLAG_COMPRESS) {
+            ssize_t input_read;
+
+            LOGE(LOG_DEBUG, "fatbin contains compressed device code. Decompressing...");
+            if ((input_read = decompress_single_section(input_pos, &text_data, &text_data_size, eh, th)) < 0) {
+                LOGE(LOG_ERROR, "Something went wrong while decompressing text section.");
+                goto error;
+            }
+            input_pos += input_read;
+        } else {
+            text_data = (uint8_t*)input_pos;
+            text_data_size = th->size;
+            input_pos += th->size;
+        }
+        int param_ret = elf2_parameter_info(kernel_infos, text_data , text_data_size);
+        if (th->flags & FATBIN_FLAG_COMPRESS)
+            free(text_data);    // decompressed copy is owned here: release it on success and on failure
+        if (param_ret != 0) {
+            LOGE(LOG_ERROR, "error getting parameter info");
+            goto error;
+        }
+    } while (input_pos < (uint8_t*)eh + eh->header_size + eh->size);
+
+    if (get_elf_header((uint8_t*)fatbin->text2, sizeof(struct fat_elf_header), &eh) != 0) {
+        LOGE(LOG_ERROR, "Something went wrong while checking the header.");
+        goto error;
+    }
+    fatbin_total_size += eh->header_size + eh->size;
 
     *fatbin_mem = (void*)fatbin->text;
     *fatbin_size = fatbin_total_size;
-    return 0;
+    ret = 0;
+ error:    
+    return ret;
 }
 
 static void print_hexmem(void *mem, size_t len)
@@ -238,7 +533,6 @@ static void print_hexmem(void *mem, size_t len)
 #define EIFMT_HVAL                      0x3
 #define EIFMT_SVAL                      0x4
 
-
 static int get_section_by_name(Elf *elf, const char *name, Elf_Scn **section)
 {
     Elf_Scn *scn = NULL;
@@ -274,6 +568,65 @@ static int get_section_by_name(Elf *elf, const char *name, Elf_Scn **section)
     return -1;
 }
 
+static int print_sections(Elf *elf)
+{   // debug helper: prints name, flags and type of every section; returns 0 on success, -1 on error
+    Elf_Scn *scn = NULL;
+    GElf_Shdr shdr;
+    char *section_name = NULL;
+    size_t str_section_index;
+
+    if (elf == NULL) {
+        LOGE(LOG_ERROR, "invalid argument");
+        return -1;
+    }
+
+    if (elf_getshdrstrndx(elf, &str_section_index) != 0) {
+        LOGE(LOG_ERROR, "elf_getshdrstrndx failed");
+        return -1;
+    }
+
+    while ((scn = elf_nextscn(elf, scn)) != NULL) {
+        if (gelf_getshdr(scn, &shdr) != &shdr) {
+            LOGE(LOG_ERROR, "gelf_getshdr failed");
+            return -1;
+        }
+        if ((section_name = elf_strptr(elf, str_section_index, shdr.sh_name)) == NULL) {
+            LOGE(LOG_ERROR, "elf_strptr failed");
+            return -1;
+        }
+        printf("%s, %#0lx %#0x\n", section_name, shdr.sh_flags, shdr.sh_type);
+    }
+    return 0;   // all sections were printed
+}
+
+static char* get_kernel_section_from_kernel_name(const char *kernel_name)
+{
+    char *section_name = NULL;
+    if (kernel_name == NULL) {
+        LOGE(LOG_ERROR, "invalid argument");
+        return NULL;
+    }
+
+    if (kernel_name[0] == '$') {
+        const char *p;
+        if ((p = strchr(kernel_name+1, '$')) == NULL) {
+            LOGE(LOG_ERROR, "invalid kernel name");
+            return NULL;
+        }
+        int len = (p - kernel_name) - 1;
+        if (asprintf(&section_name, ".nv.info.%.*s", len, kernel_name+1) == -1) {
+            LOGE(LOG_ERROR, "asprintf failed");
+            return NULL;
+        }
+    } else {
+        if (asprintf(&section_name, ".nv.info.%s", kernel_name) == -1) {
+            LOGE(LOG_ERROR, "asprintf failed");
+            return NULL;
+        }
+    }
+    return section_name;
+}
+
 static int get_parm_for_kernel(Elf *elf, kernel_info_t *kernel, void* memory, size_t memsize)
 {
     struct __attribute__((__packed__)) nv_info_kernel_entry {
@@ -305,11 +658,13 @@ static int get_parm_for_kernel(Elf *elf, kernel_info_t *kernel, void* memory, si
     kernel->param_offsets = NULL;
     kernel->param_sizes = NULL;
 
-    if (asprintf(&section_name, ".nv.info.%s", kernel->name) == -1) {
-        LOGE(LOG_ERROR, "asprintf failed");
+    if ((section_name = get_kernel_section_from_kernel_name(kernel->name)) == NULL) {
+        LOGE(LOG_ERROR, "get_kernel_section_from_kernel_name failed");
         goto cleanup;
     }
 
+    print_sections(elf);
+
     if (get_section_by_name(elf, section_name, &section) != 0) {
         LOGE(LOG_ERROR, "section %s not found", section_name);
         goto cleanup;
@@ -375,7 +730,6 @@ static int get_parm_for_kernel(Elf *elf, kernel_info_t *kernel, void* memory, si
     return ret;
 }
 
-
 static int get_symtab(Elf *elf, Elf_Data **symbol_table_data, size_t *symbol_table_size, GElf_Shdr *symbol_table_shdr)
 {
     GElf_Shdr shdr;
@@ -431,7 +785,7 @@ static void print_symtab(Elf *elf)
     LOGE(LOG_DEBUG, "found %d symbols", symnum);
 
     while (gelf_getsym(symbol_table_data, i, &sym) != NULL) {
-        printf("sym %d: name: %s, value: %#x, size: %#x, info: %#x, other: %#x, shndx: %#x\n", i,
+        printf("sym %d: name: %s, value: %#lx, size: %#lx, info: %#x, other: %#x, shndx: %#x\n", i,
                elf_strptr(elf, shdr.sh_link, sym.st_name),
                sym.st_value, sym.st_size, sym.st_info, sym.st_other, sym.st_shndx);
         i++;
diff --git a/cpu/cpu-elf2.h b/cpu/cpu-elf2.h
index c7309d71..0cbedb49 100644
--- a/cpu/cpu-elf2.h
+++ b/cpu/cpu-elf2.h
@@ -8,18 +8,18 @@
 struct fat_header {
     uint32_t magic;
     uint32_t version;
-    uint64_t text;
-    uint64_t data;  // points to outside of the file
+    uint64_t text;      // points to first text section
+    uint64_t data;      // points to outside of the file
     uint64_t unknown;
-    uint64_t text2;     // points to footer of text section
+    uint64_t text2;     // points to second text section
     uint64_t zero;
 };
 
 int elf2_init(void);
-int elf2_get_fatbin_info(struct fat_header *fatbin, list *kernel_infos, void** fatbin_mem, unsigned* fatbin_size);
+int elf2_get_fatbin_info(const struct fat_header *fatbin, list *kernel_infos, uint8_t** fatbin_mem, size_t* fatbin_size);
 
 int elf2_parameter_info(list *kernel_infos, void* memory, size_t memsize);
-void* elf2_symbol_address(const char* file, char *symbol);
-int elf2_contains_kernel(void* memory, size_t memsize);
+//void* elf2_symbol_address(const char* file, char *symbol);
+//int elf2_contains_kernel(void* memory, size_t memsize);
 
 #endif //_ELF_H_
diff --git a/cpu/cpu-server-runtime.c b/cpu/cpu-server-runtime.c
index 86240970..035846fd 100644
--- a/cpu/cpu-server-runtime.c
+++ b/cpu/cpu-server-runtime.c
@@ -897,7 +897,13 @@ bool_t cuda_launch_kernel_1_svc(ptr func, rpc_dim3 gridDim, rpc_dim3 blockDim,
         LOGE(LOG_DEBUG, "arg: %p (%d)", *(void**)cuda_args[i], *(int*)cuda_args[i]);
     }
 
-    LOGE(LOG_DEBUG, "cudaLaunchKernel(func=%p, gridDim=[%d,%d,%d], blockDim=[%d,%d,%d], args=%p, sharedMem=%d, stream=%p)", resource_mg_get(&rm_functions, (void*)func), cuda_gridDim.x, cuda_gridDim.y, cuda_gridDim.z, cuda_blockDim.x, cuda_blockDim.y, cuda_blockDim.z, cuda_args, sharedMem, (void*)stream);
+    LOGE(LOG_DEBUG, "cudaLaunchKernel(func=%p, gridDim=[%d,%d,%d], blockDim=[%d,%d,%d], args=%p, sharedMem=%d, stream=%p)",
+                    resource_mg_get(&rm_functions, (void*)func),
+                    cuda_gridDim.x, cuda_gridDim.y, cuda_gridDim.z,
+                    cuda_blockDim.x, cuda_blockDim.y, cuda_blockDim.z,
+                    cuda_args,
+                    sharedMem,
+                    (void*)stream);
 
     *result = cuLaunchKernel((CUfunction)resource_mg_get(&rm_functions, (void*)func),
                             gridDim.x, gridDim.y, gridDim.z,
diff --git a/cpu/cpu-utils.c b/cpu/cpu-utils.c
index f7955353..098593c3 100644
--- a/cpu/cpu-utils.c
+++ b/cpu/cpu-utils.c
@@ -410,3 +410,35 @@ void kernel_infos_free(kernel_info_t *infos, size_t kernelnum)
         free(infos[i].param_sizes);
     }
 }
+
+void hexdump(const uint8_t* data, size_t size)
+{
+    size_t pos = 0;
+    while (pos < size) {
+        printf("%#05zx: ", pos);
+        for (int i = 0; i < 16; i++) {
+            if (pos + i < size) {
+                printf("%02x", data[pos + i]);
+            } else {
+                printf("  ");
+            }
+            if (i % 4 == 3) {
+                printf(" ");
+            }
+        }
+        printf(" | ");
+        for (int i = 0; i < 16; i++) {
+            if (pos + i < size) {
+                if (data[pos + i] >= 0x20 && data[pos + i] <= 0x7e) {
+                    printf("%c", data[pos + i]);
+                } else {
+                    printf(".");
+                }
+            } else {
+                printf(" ");
+            }
+        }
+        printf("\n");
+        pos += 16;
+    }
+}
\ No newline at end of file
diff --git a/cpu/cpu-utils.h b/cpu/cpu-utils.h
index e40bea2e..6b1261ef 100644
--- a/cpu/cpu-utils.h
+++ b/cpu/cpu-utils.h
@@ -16,5 +16,7 @@ int cpu_utils_md5hash(char *filename, unsigned long *high, unsigned long *low);
 int cricketd_utils_launch_child(const char *file, char **args);
 int cpu_utils_parameter_info(list *kernel_infos, char *path);
 kernel_info_t* utils_search_info(list *kernel_infos, const char *kernelname);
+void hexdump(const uint8_t* data, size_t size);
+
 
 #endif //_CPU_UTILS_H_

From 975cd31bfd8ab7d74afdeaf3b3eb85dddd7e08c9 Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Wed, 10 May 2023 11:15:12 +0200
Subject: [PATCH 30/83] port to CUDA 12.1

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 cpu/cpu-client-driver.c  |   4 +-
 cpu/cpu-client-runtime.c |  31 ++++++++++-
 cpu/cpu-server-driver.c  |   2 +-
 cpu/cpu-server-runtime.c |  57 +++++++------------
 cpu/cpu-server.c         |  28 +++++-----
 cpu/cpu_rpc_prot.x       |   4 +-
 cpu/cr.c                 |   4 --
 tests/samples/Makefile   | 115 ++++++++++++++++++++++++---------------
 8 files changed, 138 insertions(+), 107 deletions(-)

diff --git a/cpu/cpu-client-driver.c b/cpu/cpu-client-driver.c
index a7499e0e..a37810c4 100644
--- a/cpu/cpu-client-driver.c
+++ b/cpu/cpu-client-driver.c
@@ -697,7 +697,7 @@ DEF_FN(CUresult, cuGraphNodeGetDependencies, CUgraphNode, hNode, CUgraphNode*, d
 DEF_FN(CUresult, cuGraphNodeGetDependentNodes, CUgraphNode, hNode, CUgraphNode*, dependentNodes, size_t*, numDependentNodes)
 DEF_FN(CUresult, cuGraphAddDependencies, CUgraph, hGraph, const CUgraphNode*, from, const CUgraphNode*, to, size_t, numDependencies)
 DEF_FN(CUresult, cuGraphRemoveDependencies, CUgraph, hGraph, const CUgraphNode*, from, const CUgraphNode*, to, size_t, numDependencies)
-DEF_FN(CUresult, cuGraphInstantiate, CUgraphExec*, phGraphExec, CUgraph, hGraph, CUgraphNode*, phErrorNode, char*, logBuffer, size_t, bufferSize)
+DEF_FN(CUresult, cuGraphInstantiate, CUgraphExec*, phGraphExec, CUgraph, hGraph, unsigned long long, flags)
 DEF_FN(CUresult, cuGraphLaunch, CUgraphExec, hGraphExec, CUstream, hStream)
 DEF_FN(CUresult, cuGraphLaunch_ptsz, CUgraphExec, hGraphExec, CUstream, hStream)
 DEF_FN(CUresult, cuGraphExecDestroy, CUgraphExec, hGraphExec)
@@ -714,6 +714,6 @@ DEF_FN(CUresult, cuStreamEndCapture_ptsz, CUstream, hStream, CUgraph*, phGraph)
 DEF_FN(CUresult, cuStreamIsCapturing, CUstream, hStream, CUstreamCaptureStatus*, captureStatus)
 DEF_FN(CUresult, cuStreamIsCapturing_ptsz, CUstream, hStream, CUstreamCaptureStatus*, captureStatus)
 DEF_FN(CUresult, cuThreadExchangeStreamCaptureMode, CUstreamCaptureMode*, mode)
-DEF_FN(CUresult, cuStreamGetCaptureInfo, CUstream, hStream, CUstreamCaptureStatus*, captureStatus, cuuint64_t*, id)
+DEF_FN(CUresult, cuStreamGetCaptureInfo, CUstream, hStream, CUstreamCaptureStatus*, captureStatus_out, cuuint64_t*, id_out, CUgraph*, graph_out, const CUgraphNode**, dependencies_out, size_t*, numDependencies_out)
 DEF_FN(CUresult, cuStreamGetCaptureInfo_ptsz, CUstream, hStream, CUstreamCaptureStatus*, captureStatus, cuuint64_t*, id)
 DEF_FN(CUresult, cuGraphExecKernelNodeSetParams, CUgraphExec, hGraphExec, CUgraphNode, hNode, const CUDA_KERNEL_NODE_PARAMS*, nodeParams)
diff --git a/cpu/cpu-client-runtime.c b/cpu/cpu-client-runtime.c
index 12a25902..f5e46b48 100644
--- a/cpu/cpu-client-runtime.c
+++ b/cpu/cpu-client-runtime.c
@@ -1907,7 +1907,7 @@ DEF_FN(cudaError_t, cudaGraphGetNodes, cudaGraph_t, graph, cudaGraphNode_t*, nod
 DEF_FN(cudaError_t, cudaGraphGetRootNodes, cudaGraph_t, graph, cudaGraphNode_t*, pRootNodes, size_t*, pNumRootNodes)
 DEF_FN(cudaError_t, cudaGraphHostNodeGetParams, cudaGraphNode_t, node, struct cudaHostNodeParams*, pNodeParams)
 DEF_FN(cudaError_t, cudaGraphHostNodeSetParams, cudaGraphNode_t, node, const struct cudaHostNodeParams*, pNodeParams)
-DEF_FN(cudaError_t, cudaGraphInstantiate, cudaGraphExec_t*, pGraphExec, cudaGraph_t, graph, cudaGraphNode_t*, pErrorNode, char*, pLogBuffer, size_t, bufferSize)
+DEF_FN(cudaError_t, cudaGraphInstantiate, cudaGraphExec_t*, pGraphExec, cudaGraph_t, graph, unsigned long long, flags)
 DEF_FN(cudaError_t, cudaGraphKernelNodeGetParams, cudaGraphNode_t, node, struct cudaKernelNodeParams*, pNodeParams)
 DEF_FN(cudaError_t, cudaGraphKernelNodeSetParams, cudaGraphNode_t, node, const struct cudaKernelNodeParams*, pNodeParams)
 DEF_FN(cudaError_t, cudaGraphLaunch, cudaGraphExec_t, graphExec, cudaStream_t, stream)
@@ -1920,6 +1920,33 @@ DEF_FN(cudaError_t, cudaGraphNodeGetDependencies, cudaGraphNode_t, node, cudaGra
 DEF_FN(cudaError_t, cudaGraphNodeGetDependentNodes, cudaGraphNode_t, node, cudaGraphNode_t*, pDependentNodes, size_t*, pNumDependentNodes)
 DEF_FN(cudaError_t, cudaGraphNodeGetType, cudaGraphNode_t, node, enum cudaGraphNodeType*, pType)
 DEF_FN(cudaError_t, cudaGraphRemoveDependencies, cudaGraph_t, graph, const cudaGraphNode_t*, from, const cudaGraphNode_t*, to, size_t, numDependencies)
-DEF_FN(cudaError_t, cudaProfilerInitialize, const char*, configFile, const char*, outputFile, cudaOutputMode_t, outputMode)
 DEF_FN(cudaError_t, cudaProfilerStart, void)
 DEF_FN(cudaError_t, cudaProfilerStop, void)
+
+cudaError_t cudaProfilerStart(void)
+{
+#ifdef WITH_API_CNT
+    api_call_cnt++;
+#endif //WITH_API_CNT
+    int result;
+    enum clnt_stat retval;
+    retval = cuda_profiler_start_1(&result, clnt);
+    if (retval != RPC_SUCCESS) {
+        clnt_perror (clnt, "call failed");
+    }
+    return result;
+}
+
+cudaError_t cudaProfilerStop(void)
+{
+#ifdef WITH_API_CNT
+    api_call_cnt++;
+#endif //WITH_API_CNT
+    int result;
+    enum clnt_stat retval;
+    retval = cuda_profiler_stop_1(&result, clnt);
+    if (retval != RPC_SUCCESS) {
+        clnt_perror (clnt, "call failed");
+    }
+    return result;
+}
\ No newline at end of file
diff --git a/cpu/cpu-server-driver.c b/cpu/cpu-server-driver.c
index 4781d144..00da6579 100644
--- a/cpu/cpu-server-driver.c
+++ b/cpu/cpu-server-driver.c
@@ -37,7 +37,7 @@ int server_driver_init(int restore)
 // Does not support checkpoint/restart yet
 bool_t rpc_elf_load_1_svc(mem_data elf, ptr module_key, int *result, struct svc_req *rqstp)
 {
-    LOG(LOG_DEBUG, "rpc_elf_load(elf: %p, len: %#x)", elf.mem_data_val, elf.mem_data_len);
+    LOG(LOG_DEBUG, "rpc_elf_load(elf: %p, len: %#x, key: %p)", elf.mem_data_val, elf.mem_data_len, (void*)module_key);
     CUresult res;
     CUmodule module;
     
diff --git a/cpu/cpu-server-runtime.c b/cpu/cpu-server-runtime.c
index 035846fd..f437c6f8 100644
--- a/cpu/cpu-server-runtime.c
+++ b/cpu/cpu-server-runtime.c
@@ -830,44 +830,6 @@ bool_t cuda_launch_cooperative_kernel_1_svc(ptr func, rpc_dim3 gridDim, rpc_dim3
     return 1;
 }
 
-bool_t cuda_launch_cooperative_kernel_multi_device_1_svc(ptr func, rpc_dim3 gridDim, rpc_dim3 blockDim, mem_data args, size_t sharedMem, ptr stream, int numDevices, int flags, int *result, struct svc_req *rqstp)
-{
-    RECORD_API(cuda_launch_cooperative_kernel_multi_device_1_argument);
-    RECORD_ARG(1, func);
-    RECORD_ARG(2, gridDim);
-    RECORD_ARG(3, blockDim);
-    //TODO: Store parameters explicitly
-    //RECORD_ARG(4, args);
-    RECORD_ARG(5, sharedMem);
-    RECORD_ARG(6, stream);
-    RECORD_ARG(7, numDevices);
-    RECORD_ARG(8, flags);
-    dim3 cuda_gridDim = {gridDim.x, gridDim.y, gridDim.z};
-    dim3 cuda_blockDim = {blockDim.x, blockDim.y, blockDim.z};
-    void **cuda_args;
-    uint16_t *arg_offsets;
-    size_t param_num = *((size_t*)args.mem_data_val);
-    struct cudaLaunchParams lp;
-    arg_offsets = (uint16_t*)(args.mem_data_val+sizeof(size_t));
-    cuda_args = malloc(param_num*sizeof(void*));
-    for (size_t i = 0; i < param_num; ++i) {
-        cuda_args[i] = args.mem_data_val+sizeof(size_t)+param_num*sizeof(uint16_t)+arg_offsets[i];
-        //LOGE(LOG_DEBUG, "arg: %p (%d)\n", *(void**)cuda_args[i], *(int*)cuda_args[i]);
-    }
-
-    LOGE(LOG_DEBUG, "cudaLaunchCooperativeKernelMultiDevice(func=%p, gridDim=[%d,%d,%d], blockDim=[%d,%d,%d], args=%p, sharedMem=%d, stream=%p)", func, cuda_gridDim.x, cuda_gridDim.y, cuda_gridDim.z, cuda_blockDim.x, cuda_blockDim.y, cuda_blockDim.z, cuda_args, sharedMem, (void*)stream);
-    lp.args = cuda_args;
-    lp.blockDim = cuda_blockDim;
-    lp.func = resource_mg_get(&rm_kernels, (void*)func);
-    lp.gridDim = cuda_gridDim;
-    lp.sharedMem = sharedMem;
-    lp.stream = resource_mg_get(&rm_streams, (void*)stream);
-    *result = cudaLaunchCooperativeKernelMultiDevice(&lp, numDevices, flags);
-    RECORD_RESULT(integer, *result);
-    LOGE(LOG_DEBUG, "cudaLaunchCooperativeKernelMultiDevice result: %d", *result);
-    return 1;
-}
-
 /* This would require RPCs in the opposite direction.
  * __host__ cudaError_t cudaLaunchHostFunc ( cudaStream_t stream, cudaHostFn_t fn, void* userData )
  *   Enqueues a host function call in a stream.
@@ -1888,3 +1850,22 @@ bool_t cuda_register_fat_binary_end_1_svc(ptr cubinHandle, int *result, struct s
     *result = 0;
     return 1;
 }*/
+#include <cuda_profiler_api.h>
+
+bool_t cuda_profiler_start_1_svc(int *result, struct svc_req *rqstp)
+{
+    RECORD_VOID_API;
+    LOGE(LOG_DEBUG, "cudaProfilerStart");
+    *result = cudaProfilerStart();
+    RECORD_RESULT(integer, *result);
+    return 1;
+}
+
+bool_t cuda_profiler_stop_1_svc(int *result, struct svc_req *rqstp)
+{
+    RECORD_VOID_API;
+    LOGE(LOG_DEBUG, "cudaProfilerStop");
+    *result = cudaProfilerStop();
+    RECORD_RESULT(integer, *result);
+    return 1;
+}
diff --git a/cpu/cpu-server.c b/cpu/cpu-server.c
index d28ee15f..e5182324 100644
--- a/cpu/cpu-server.c
+++ b/cpu/cpu-server.c
@@ -18,7 +18,7 @@
 #include "cpu-server-driver.h"
 #include "rpc/xdr.h"
 #include "cr.h"
-#include "cpu-elf.h"
+#include "cpu-elf2.h"
 #ifdef WITH_IB
 #include "cpu-ib.h"
 #endif //WITH_IB
@@ -118,23 +118,23 @@ bool_t rpc_checkpoint_1_svc(int *result, struct svc_req *rqstp)
 */
 void cricket_so_register(void* dlhandle, char *path)
 {
-    struct link_map *map;
-    dlinfo(dlhandle, RTLD_DI_LINKMAP, &map);
+    // struct link_map *map;
+    // dlinfo(dlhandle, RTLD_DI_LINKMAP, &map);
 
-    // add load location of library to offset in symbol table
-    void (*cudaRegisterAllv)(void) = 
-        (void(*)(void)) elf_symbol_address(path, "_ZL24__sti____cudaRegisterAllv");
+    // // add load location of library to offset in symbol table
+    // void (*cudaRegisterAllv)(void) = 
+    //     (void(*)(void)) elf_symbol_address(path, "_ZL24__sti____cudaRegisterAllv");
     
-    LOG(LOG_INFO, "found CUDA initialization function at %p + %p = %p", 
-        map->l_addr, cudaRegisterAllv, map->l_addr + cudaRegisterAllv);
+    // LOG(LOG_INFO, "found CUDA initialization function at %p + %p = %p", 
+    //     map->l_addr, cudaRegisterAllv, map->l_addr + cudaRegisterAllv);
 
-    cudaRegisterAllv += map->l_addr;
+    // cudaRegisterAllv += map->l_addr;
     
-    if (cudaRegisterAllv == NULL) {
-        LOGE(LOG_WARNING, "could not find cudaRegisterAllv initialization function in cubin. Kernels cannot be launched without it!");
-    } else {
-        cudaRegisterAllv();
-    }
+    // if (cudaRegisterAllv == NULL) {
+    //     LOGE(LOG_WARNING, "could not find cudaRegisterAllv initialization function in cubin. Kernels cannot be launched without it!");
+    // } else {
+    //     cudaRegisterAllv();
+    // }
 }
 
 bool_t rpc_dlopen_1_svc(char *path, int *result, struct svc_req *rqstp)
diff --git a/cpu/cpu_rpc_prot.x b/cpu/cpu_rpc_prot.x
index 72fa0bfe..45495011 100644
--- a/cpu/cpu_rpc_prot.x
+++ b/cpu/cpu_rpc_prot.x
@@ -204,8 +204,6 @@ program RPC_CD_PROG {
         int          CUDA_FUNC_SET_SHARED_MEM_CONFIG(ptr, int)                  = 313;
         int          CUDA_LAUNCH_COOPERATIVE_KERNEL(ptr, rpc_dim3, 
                           rpc_dim3, mem_data, size_t, ptr)                      = 314;
-        int          CUDA_LAUNCH_COOPERATIVE_KERNEL_MULTI_DEVICE(ptr,
-                          rpc_dim3, rpc_dim3, mem_data, size_t, ptr, int, int)  = 315;
         /*int        CUDA_LAUNCH_HOST_FUNC(ptr, ptr, mem_data)                  = 316;*/
         int          CUDA_LAUNCH_KERNEL(ptr, rpc_dim3, rpc_dim3,
                           mem_data, size_t, ptr)                                = 317;
@@ -301,6 +299,8 @@ program RPC_CD_PROG {
         /* NOT IMPLEMENTED */
 
         /* ### Profiler Control ### */
+        int          CUDA_PROFILER_START(void)                                  = 701;
+        int          CUDA_PROFILER_STOP(void)                                   = 702;
         /* NOT IMPLEMENTED */
 
         /* DRIVER API */
diff --git a/cpu/cr.c b/cpu/cr.c
index e14f58f5..7e1e2e74 100644
--- a/cpu/cr.c
+++ b/cpu/cr.c
@@ -754,7 +754,6 @@ static int cr_restore_resources(const char *path, api_record_t *record, resource
         break;
     case CUDA_LAUNCH_KERNEL:
     case CUDA_LAUNCH_COOPERATIVE_KERNEL:
-    case CUDA_LAUNCH_COOPERATIVE_KERNEL_MULTI_DEVICE:
         break;
     case rpc_cusolverDnCreate:
         if (cr_restore_cusolver(record, rm_cusolver) != 0) {
@@ -821,9 +820,6 @@ int cr_launch_kernel(void)
         } else if (record->function == CUDA_LAUNCH_COOPERATIVE_KERNEL) {
             LOGE(LOG_ERROR, "not yet supported");
             goto cleanup;
-        } else if (record->function == CUDA_LAUNCH_COOPERATIVE_KERNEL_MULTI_DEVICE) {
-            LOGE(LOG_ERROR, "not yet supported");
-            goto cleanup;
         }
     }
     ret = 0;
diff --git a/tests/samples/Makefile b/tests/samples/Makefile
index 38aa9512..1bb16a04 100644
--- a/tests/samples/Makefile
+++ b/tests/samples/Makefile
@@ -1,55 +1,82 @@
 CC = gcc
 LD = gcc
-CFLAGS = -Wall -std=gnu99
-ARCH = sm_61
-CUDA_DIR = /usr/local/cuda
+CFLAGS = -Wall -std=gnu99 -g -ggdb
+SAMPLES = samples-bin/matrixMul.compressed.sample \
+		  samples-bin/matrixMul.uncompressed.sample \
+		  samples-bin/nbody.uncompressed.sample \
+		  samples-bin/nbody.compressed.sample \
+		  samples-bin/bandwidthTest.sample
+
+CUDA_PATH = /usr/local/cuda
+SMS = 75 60
+CUDA_SAMPLES_RELEASE = 12.1
+CUDA_SAMPLES_URL = https://github.com/NVIDIA/cuda-samples/archive/refs/tags/v${CUDA_SAMPLES_RELEASE}.tar.gz
 
 PWD = $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
 
 .PHONY: all clean distclean
 
-all : matrixMul/matrixMul bandwidthTest/bandwidthTest nbody/nbody
-
-matrixMul :
-	mkdir -p $(PWD)/matrixMul
-	cp -r $(CUDA_DIR)/samples/0_Simple/matrixMul $(PWD)
-	make -C matrixMul clean
-
-matrixMul/matrixMul : matrixMul
-	make -C matrixMul \
-		NVCCFLAGS="-m64 -cudart shared" \
-		GENCODE_FLAGS="-arch=$(ARCH)" \
-		CPATH="$(CUDA_DIR)/samples/common/inc"
-
-bandwidthTest :
-	mkdir -p $(PWD)/bandwidthTest
-	cp -r $(CUDA_DIR)/samples/1_Utilities/bandwidthTest $(PWD)
-	make -C bandwidthTest clean
-
-bandwidthTest/bandwidthTest : bandwidthTest
-	make -C bandwidthTest \
-		NVCCFLAGS="-m64 -cudart shared" \
-		GENCODE_FLAGS="-arch=$(ARCH)" \
-		CPATH="$(CUDA_DIR)/samples/common/inc"
-
-nbody :
-	mkdir -p $(PWD)/nbody
-	cp -r $(CUDA_DIR)/samples/5_Simulations/nbody $(PWD)
-	make -C nbody clean
-
-nbody/nbody : nbody
-	make -C nbody \
-		NVCCFLAGS="-m64 -cudart shared" \
-		GENCODE_FLAGS="-arch=$(ARCH)" \
-		CPATH="$(CUDA_DIR)/samples/common/inc"
+all : $(SAMPLES)
+
+samples: # CUDA samples source tree; removed again when download/extraction fails so a retry does not see a stale empty dir
+	mkdir -p $@
+	wget ${CUDA_SAMPLES_URL} -O - | tar -xz --strip-components=1 -C $@ || { rm -rf $@; exit 1; }
+
+samples-bin:
+	mkdir -p $@
+
+samples-bin/nbody.uncompressed.sample : | samples samples-bin # order-only: the dirs must exist, but their mtimes must not trigger rebuilds
+	$(MAKE) -C samples/Samples/5_Domain_Specific/nbody \
+		clean
+	$(MAKE) -C samples/Samples/5_Domain_Specific/nbody \
+		NVCCFLAGS="-cudart shared --no-compress -g -G" \
+		SMS="${SMS}" \
+		CPATH="samples/Common" \
+		CUDA_PATH=${CUDA_PATH}
+	cp samples/Samples/5_Domain_Specific/nbody/nbody $@
+
+samples-bin/nbody.compressed.sample : | samples samples-bin # order-only: the dirs must exist, but their mtimes must not trigger rebuilds
+	$(MAKE) -C samples/Samples/5_Domain_Specific/nbody \
+		clean
+	$(MAKE) -C samples/Samples/5_Domain_Specific/nbody \
+		NVCCFLAGS="-cudart shared -Xfatbin --compress-all -g -G" \
+		SMS="${SMS}" \
+		CPATH="samples/Common" \
+		CUDA_PATH=${CUDA_PATH}
+	cp samples/Samples/5_Domain_Specific/nbody/nbody $@
+
+samples-bin/matrixMul.compressed.sample : | samples samples-bin # order-only: the dirs must exist, but their mtimes must not trigger rebuilds
+	$(MAKE) -C samples/Samples/0_Introduction/matrixMul \
+		clean
+	$(MAKE) -C samples/Samples/0_Introduction/matrixMul \
+		NVCCFLAGS="-cudart shared -Xfatbin --compress-all" \
+		SMS="${SMS}" \
+		CPATH="samples/Common" \
+		CUDA_PATH=${CUDA_PATH}
+	cp samples/Samples/0_Introduction/matrixMul/matrixMul $@
+
+samples-bin/matrixMul.uncompressed.sample : | samples samples-bin # order-only: the dirs must exist, but their mtimes must not trigger rebuilds
+	$(MAKE) -C samples/Samples/0_Introduction/matrixMul \
+		clean
+	$(MAKE) -C samples/Samples/0_Introduction/matrixMul \
+		NVCCFLAGS="-cudart shared --no-compress" \
+		SMS="${SMS}" \
+		CPATH="samples/Common" \
+		CUDA_PATH=${CUDA_PATH}
+	cp samples/Samples/0_Introduction/matrixMul/matrixMul $@
+
+samples-bin/bandwidthTest.sample : | samples samples-bin # order-only: the dirs must exist, but their mtimes must not trigger rebuilds
+	$(MAKE) -C samples/Samples/1_Utilities/bandwidthTest \
+		clean
+	$(MAKE) -C samples/Samples/1_Utilities/bandwidthTest \
+		NVCCFLAGS="-cudart shared --no-compress" \
+		SMS="${SMS}" \
+		CPATH="samples/Common" \
+		CUDA_PATH=${CUDA_PATH}
+	cp samples/Samples/1_Utilities/bandwidthTest/bandwidthTest $@
 
 clean :
-	rm -f *.elf *.hex *.o *.d .depend *~
-	make -C matrixMul clean
-	make -C bandwidthTest clean
-	make -C nbody clean
+	rm -rf samples-bin
 
 distclean : clean
-	rm -r matrixMul
-	rm -r bandwidthTest
-	rm -r nbody
\ No newline at end of file
+	rm -rf samples
\ No newline at end of file

From eeb8e484947f79e5c14f9e5e4571a242005f2420 Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Wed, 10 May 2023 11:16:09 +0200
Subject: [PATCH 31/83] fix elf handling to work with a wider variety of CUDA
 kernels

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 cpu/cpu-elf2.c | 30 +++++++++++++++++++-----------
 cpu/cpu-elf2.h |  2 +-
 2 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/cpu/cpu-elf2.c b/cpu/cpu-elf2.c
index 13dd6fc5..6c055f69 100644
--- a/cpu/cpu-elf2.c
+++ b/cpu/cpu-elf2.c
@@ -464,8 +464,16 @@ int elf2_get_fatbin_info(const struct fat_header *fatbin, list *kernel_infos, ui
         }
         //print_header(th);
         input_pos += th->header_size;
+        if (th->kind != 2) { // section does not cotain device code (but e.g. PTX)
+            if (th->flags & FATBIN_FLAG_COMPRESS) {
+                input_pos += th->decompressed_size;
+            } else {
+                input_pos += th->size;
+            }
+            continue;
+        }
         if (th->flags & FATBIN_FLAG_DEBUG) {
-            LOGE(LOG_DEBUG, "fatbin contains debug information. This is not supported, yet.");
+            LOGE(LOG_DEBUG, "fatbin contains debug information.");
             goto error;
         }
 
@@ -478,11 +486,13 @@ int elf2_get_fatbin_info(const struct fat_header *fatbin, list *kernel_infos, ui
                 goto error;
             }
             input_pos += input_read;
+            hexdump(text_data, text_data_size);
         } else {
             text_data = (uint8_t*)input_pos;
             text_data_size = th->size;
             input_pos += th->size;
         }
+        print_header(th);
         if (elf2_parameter_info(kernel_infos, text_data , text_data_size) != 0) {
             LOGE(LOG_ERROR, "error getting parameter info");
             goto error;
@@ -492,11 +502,11 @@ int elf2_get_fatbin_info(const struct fat_header *fatbin, list *kernel_infos, ui
         }
     } while (input_pos < (uint8_t*)eh + eh->header_size + eh->size);
 
-    if (get_elf_header((uint8_t*)fatbin->text2, sizeof(struct fat_elf_header), &eh) != 0) {
-        LOGE(LOG_ERROR, "Something went wrong while checking the header.");
-        goto error;
-    }
-    fatbin_total_size += eh->header_size + eh->size;
+    // if (get_elf_header((uint8_t*)fatbin->text2, sizeof(struct fat_elf_header), &eh) != 0) {
+    //     LOGE(LOG_ERROR, "Something went wrong while checking the header.");
+    //     goto error;
+    // }
+    // fatbin_total_size += eh->header_size + eh->size;
 
     *fatbin_mem = (void*)fatbin->text;
     *fatbin_size = fatbin_total_size;
@@ -663,8 +673,6 @@ static int get_parm_for_kernel(Elf *elf, kernel_info_t *kernel, void* memory, si
         goto cleanup;
     }
 
-    print_sections(elf);
-
     if (get_section_by_name(elf, section_name, &section) != 0) {
         LOGE(LOG_ERROR, "section %s not found", section_name);
         goto cleanup;
@@ -916,9 +924,9 @@ int elf2_parameter_info(list *kernel_infos, void* memory, size_t memsize)
 
     for (size_t secpos=0; secpos < data->d_size; secpos += sizeof(struct nv_info_entry)) {
         struct nv_info_entry *entry = (struct nv_info_entry *)(data->d_buf+secpos);
-        LOGE(LOG_DBG(1), "%d: format: %#x, attr: %#x, values_size: %#x kernel: %#x, sval: %#x(%d)", 
-        i++, entry->format, entry->attribute, entry->values_size, entry->kernel_id, 
-        entry->value, entry->value);
+        // LOGE(LOG_DBG(1), "%d: format: %#x, attr: %#x, values_size: %#x kernel: %#x, sval: %#x(%d)", 
+        // i++, entry->format, entry->attribute, entry->values_size, entry->kernel_id, 
+        // entry->value, entry->value);
 
         if (entry->values_size != 8) {
             LOGE(LOG_ERROR, "unexpected values_size: %#x", entry->values_size);
diff --git a/cpu/cpu-elf2.h b/cpu/cpu-elf2.h
index 0cbedb49..f484500f 100644
--- a/cpu/cpu-elf2.h
+++ b/cpu/cpu-elf2.h
@@ -19,7 +19,7 @@ int elf2_init(void);
 int elf2_get_fatbin_info(const struct fat_header *fatbin, list *kernel_infos, uint8_t** fatbin_mem, size_t* fatbin_size);
 
 int elf2_parameter_info(list *kernel_infos, void* memory, size_t memsize);
-//void* elf2_symbol_address(const char* file, char *symbol);
+void* elf2_symbol_address(const char* file, char *symbol);
 //int elf2_contains_kernel(void* memory, size_t memsize);
 
 #endif //_ELF_H_

From 66eb96106ca88942ac68ea6b6c0ca549744fb749 Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Thu, 11 May 2023 11:47:30 +0200
Subject: [PATCH 32/83] fix memory leaks identified by gcc sanitizer

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 cpu/Makefile              |  2 +-
 cpu/api-recorder.c        | 25 ++++++++++++++++++++++++-
 cpu/api-recorder.h        |  2 +-
 cpu/cpu-elf2.c            |  2 ++
 cpu/cpu-server-driver.c   | 12 +++++++++---
 cpu/cpu-server.c          |  3 +--
 tests/test_apps/matmul.cu |  5 +----
 7 files changed, 39 insertions(+), 12 deletions(-)

diff --git a/cpu/Makefile b/cpu/Makefile
index a2d38223..9b81ecd4 100644
--- a/cpu/Makefile
+++ b/cpu/Makefile
@@ -84,7 +84,7 @@ LD_FLAGS = $(LIB_FLAGS) -ltirpc -ldl -lcrypto -lelf
 
 ifdef WITH_DEBUG
 # use ASAN_OPTIONS=protect_shadow_gap=0  LSAN_OPTIONS=fast_unwind_on_malloc=0 when running
-CC_FLAGS += -g -ggdb #-fsanitize=address -fsanitize=pointer-compare -fsanitize=pointer-subtract -fsanitize-address-use-after-scope
+CC_FLAGS += -g -ggdb -fsanitize=address -fsanitize=pointer-compare -fsanitize=pointer-subtract -fsanitize-address-use-after-scope
 endif
 
 ifdef WITH_IB
diff --git a/cpu/api-recorder.c b/cpu/api-recorder.c
index f02cdc37..e67204d2 100644
--- a/cpu/api-recorder.c
+++ b/cpu/api-recorder.c
@@ -4,11 +4,13 @@
 
 #include "api-recorder.h"
 #include "log.h"
+#include "list.h"
 
 
 list api_records;
 
-void api_records_free_args(void)
+
+static void api_records_free_args(void)
 {
     api_record_t *record;
     for (size_t i = 0; i < api_records.length; i++) {
@@ -22,6 +24,27 @@ void api_records_free_args(void)
 
 }
 
+static void api_records_free_data(void)
+{
+    api_record_t *record;
+    for (size_t i = 0; i < api_records.length; i++) {
+        if (list_at(&api_records, i, (void**)&record) != 0) {
+            LOGE(LOG_ERROR, "list_at %zu returned an error.", i);
+            continue;
+        }
+        free(record->data);
+        record->data = NULL;
+    }
+}
+
+
+void api_records_free(void)
+{
+    api_records_free_args();
+    api_records_free_data();
+    list_free(&api_records);
+}
+
 size_t api_records_malloc_get_size(void *ptr)
 {
     api_record_t *record;
diff --git a/cpu/api-recorder.h b/cpu/api-recorder.h
index 856a3121..627694ae 100644
--- a/cpu/api-recorder.h
+++ b/cpu/api-recorder.h
@@ -65,7 +65,7 @@ typedef struct api_record {
 extern list api_records;
 
 
-void api_records_free_args(void);
+void api_records_free(void);
 void api_records_print(void);
 void api_records_print_records(api_record_t *record);
 
diff --git a/cpu/cpu-elf2.c b/cpu/cpu-elf2.c
index 6c055f69..42126b4b 100644
--- a/cpu/cpu-elf2.c
+++ b/cpu/cpu-elf2.c
@@ -95,6 +95,8 @@ static void print_header(struct fat_text_header *th)
         th->unknown1,
         th->unknown2,
         th->zero);
+
+    free(flagstr);
 }
 
 /** Check the header of a fatbin
diff --git a/cpu/cpu-server-driver.c b/cpu/cpu-server-driver.c
index 00da6579..3b2c3e8f 100644
--- a/cpu/cpu-server-driver.c
+++ b/cpu/cpu-server-driver.c
@@ -37,12 +37,12 @@ int server_driver_init(int restore)
 // Does not support checkpoint/restart yet
 bool_t rpc_elf_load_1_svc(mem_data elf, ptr module_key, int *result, struct svc_req *rqstp)
 {
-    LOG(LOG_DEBUG, "rpc_elf_load(elf: %p, len: %#x, key: %#x)", elf.mem_data_val, elf.mem_data_len);
+    LOG(LOG_DEBUG, "rpc_elf_load(elf: %p, len: %#x, module_key: %#x)", elf.mem_data_val, elf.mem_data_len, module_key);
     CUresult res;
     CUmodule module;
     
     if ((res = cuModuleLoadData(&module, elf.mem_data_val)) != CUDA_SUCCESS) {
-        LOG(LOG_ERROR, "cuModuleLoadFatBinary failed: %d", res);
+        LOG(LOG_ERROR, "cuModuleLoadData failed: %d", res);
         *result = res;
         return 1;
     }
@@ -96,6 +96,7 @@ bool_t rpc_elf_unload_1_svc(ptr elf_handle, int *result, struct svc_req *rqstp)
 bool_t rpc_register_function_1_svc(ptr fatCubinHandle, ptr hostFun, char* deviceFun,
                             char* deviceName, int thread_limit, ptr_result *result, struct svc_req *rqstp)
 {
+    void *module = NULL;
     RECORD_API(rpc_register_function_1_argument);
     RECORD_ARG(1, fatCubinHandle);
     RECORD_ARG(2, hostFun);
@@ -105,8 +106,13 @@ bool_t rpc_register_function_1_svc(ptr fatCubinHandle, ptr hostFun, char* device
     LOG(LOG_DEBUG, "rpc_register_function(fatCubinHandle: %p, hostFun: %p, deviceFun: %s, deviceName: %s, thread_limit: %d)",
         fatCubinHandle, hostFun, deviceFun, deviceName, thread_limit);
     GSCHED_RETAIN;
+    if ((module = resource_mg_get(&rm_modules, (void*)fatCubinHandle)) == fatCubinHandle) {
+        LOG(LOG_ERROR, "%p not found in resource manager - we cannot call a function from an unknown module.", fatCubinHandle);
+        result->err = -1;
+        return 1;
+    }
     result->err = cuModuleGetFunction((CUfunction*)&result->ptr_result_u.ptr,
-                    resource_mg_get(&rm_modules, (void*)fatCubinHandle),
+                    module,
                     deviceName);
     GSCHED_RELEASE;
     if (resource_mg_add_sorted(&rm_functions, (void*)hostFun, (void*)result->ptr_result_u.ptr) != 0) {
diff --git a/cpu/cpu-server.c b/cpu/cpu-server.c
index e5182324..9e309d8e 100644
--- a/cpu/cpu-server.c
+++ b/cpu/cpu-server.c
@@ -347,8 +347,7 @@ void cricket_main(char* app_command, size_t prog_num, size_t vers_num)
  cleanup2:
     server_runtime_deinit();
  cleanup3:
-    api_records_free_args();
-    list_free(&api_records);
+    api_records_free();
  cleanup4:
     pmap_unset(prog, vers);
     svc_destroy(transp);
diff --git a/tests/test_apps/matmul.cu b/tests/test_apps/matmul.cu
index ea5f89ba..b4960c39 100644
--- a/tests/test_apps/matmul.cu
+++ b/tests/test_apps/matmul.cu
@@ -173,7 +173,6 @@ int main()
 #endif //RANDOM_INIT
     uint16_t *res;
     uint16_t *dev_A, *dev_x, *dev_res;
-    uint16_t *dev_ptr;
     struct timeval begin, end;
     struct timeval messb, messa;
     const int A_size = N*N*sizeof(uint16_t);
@@ -253,11 +252,9 @@ int main()
    */
     cudaMalloc( (void**)&dev_x, x_size );
     cudaMalloc( (void**)&dev_res, x_size );
-    cudaMalloc( (void**)&dev_ptr, A_size );
 
     printf("Mallocs done\n");
 
-    cudaMemcpy( dev_ptr, A, A_size, cudaMemcpyHostToDevice );
     cudaMemcpy( dev_A, A, A_size, cudaMemcpyHostToDevice );
     cudaMemcpy( dev_x, x, x_size, cudaMemcpyHostToDevice );
 
@@ -305,7 +302,7 @@ int main()
     gettimeofday(&end, NULL);
 
     printf("elapsed time: %0u.%06u\n", (end.tv_sec - begin.tv_sec), (end.tv_usec - begin.tv_usec));
-
+    free(res);
 
     return (success ? 0 : 1);
 }

From b2306876e192c1f7362fed1d5d11dd8fa831bd33 Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Fri, 12 May 2023 12:26:51 +0200
Subject: [PATCH 33/83] clean up of unneeded code paths relating to old
 LD_PRELOADing of server. Cleanup of shm support

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 cpu/Makefile             |  16 +++----
 cpu/api-recorder.h       |   1 +
 cpu/bfd_extracts.h       |  64 -------------------------
 cpu/cpu-client-runtime.c |  58 +++++++++++++----------
 cpu/cpu-client.c         |  10 ++--
 cpu/cpu-server-driver.c  |   2 +-
 cpu/cpu-server-runtime.c | 100 ++++++++++++++++++++++++++-------------
 cpu/cpu-server.c         |  42 ++--------------
 cpu/cpu-server.h         |   4 +-
 cpu/cpu-utils.c          |   4 --
 cpu/cpu_rpc_prot.x       |  10 +++-
 cpu/gsched_none.c        |   2 +-
 cpu/server-exe.c         |  15 ++++--
 cpu/server-library.c     |  10 ----
 14 files changed, 139 insertions(+), 199 deletions(-)
 delete mode 100644 cpu/bfd_extracts.h
 delete mode 100644 cpu/server-library.c

diff --git a/cpu/Makefile b/cpu/Makefile
index 9b81ecd4..8f29d031 100644
--- a/cpu/Makefile
+++ b/cpu/Makefile
@@ -1,7 +1,5 @@
-#RPC server library
-SERVER = cricket-server.so
 #Standalone RPC Server
-SERVER_BIN = cricket-rpc-server
+SERVER = cricket-rpc-server
 #RPC client library
 CLIENT = cricket-client.so
 
@@ -84,7 +82,7 @@ LD_FLAGS = $(LIB_FLAGS) -ltirpc -ldl -lcrypto -lelf
 
 ifdef WITH_DEBUG
 # use ASAN_OPTIONS=protect_shadow_gap=0  LSAN_OPTIONS=fast_unwind_on_malloc=0 when running
-CC_FLAGS += -g -ggdb -fsanitize=address -fsanitize=pointer-compare -fsanitize=pointer-subtract -fsanitize-address-use-after-scope
+CC_FLAGS += -g -ggdb #-fsanitize=address -fsanitize=pointer-compare -fsanitize=pointer-subtract -fsanitize-address-use-after-scope
 endif
 
 ifdef WITH_IB
@@ -110,15 +108,12 @@ CLIENT_LD_FLAGS = $(LD_FLAGS)
 # Targets
 .PHONY: all clean
 
-all : $(SERVER) $(SERVER_BIN) $(CLIENT)
+all : $(SERVER) $(CLIENT)
 
 $(CLIENT) : $(OBJ_CLIENT)
 	$(LD) $(CC_FLAGS) -shared -o $@ $^ $(CLIENT_LD_FLAGS)
 
-$(SERVER) : $(OBJ_SERVER) $(SRC_SERVER_LIB:%.c=%.o)
-	$(LD) $(CC_FLAGS) -shared -o $@ $^ $(SERVER_LD_FLAGS)
-
-$(SERVER_BIN) : $(OBJ_SERVER) $(SRC_SERVER_EXE:%.c=%.o)
+$(SERVER) : $(OBJ_SERVER) $(SRC_SERVER_EXE:%.c=%.o)
 	$(LD) $(CC_FLAGS) -o $@ $^ $(SERVER_BIN_LD_FLAGS)
 
 $(RPC_H) : $(RPC_DEF)
@@ -140,7 +135,8 @@ $(RPC_XDR) : $(RPC_DEF)
 	$(CC) $(CC_FLAGS) -c -fpic -o $@ $< $(LD_FLAGS) 
 
 clean:
-	 rm -f $(RPC_H) $(RPC_CLIENT) $(RPC_SERVER) $(RPC_SERVER_BIN) $(RPC_SERVER_MOD) $(RPC_XDR) $(OBJ_CLIENT) $(OBJ_SERVER) $(SERVER) $(CLIENT)
+	 rm -f $(RPC_H) $(RPC_CLIENT) $(RPC_SERVER) $(RPC_SERVER_MOD) $(RPC_XDR) $(OBJ_CLIENT) $(OBJ_SERVER) $(SERVER) $(CLIENT) $(SRC_SERVER_EXE:%.c=%.o)
+
 
 
 
diff --git a/cpu/api-recorder.h b/cpu/api-recorder.h
index 627694ae..c642fbfc 100644
--- a/cpu/api-recorder.h
+++ b/cpu/api-recorder.h
@@ -58,6 +58,7 @@ typedef struct api_record {
         void* ptr;
         int integer;
         ptr_result ptr_result_u;
+        sz_result sz_result_u;
     } result;
     void *data;
     size_t data_size;
diff --git a/cpu/bfd_extracts.h b/cpu/bfd_extracts.h
deleted file mode 100644
index 1ce5f46e..00000000
--- a/cpu/bfd_extracts.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/* DO NOT EDIT!  -*- buffer-read-only: t -*-  This file is automatically 
-   generated from "libbfd-in.h", "init.c", "libbfd.c", "bfdio.c", 
-   "bfdwin.c", "cache.c", "reloc.c", "archures.c" and "elf.c".
-   Run "make headers" in your build bfd/ to regenerate.  */
-
-/* libbfd.h -- Declarations used by bfd library *implementation*.
-   (This include file is not for users of the library.)
-
-   Copyright 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998,
-   1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009,
-   2010, 2011, 2012
-   Free Software Foundation, Inc.
-
-   Written by Cygnus Support.
-
-   This file is part of BFD, the Binary File Descriptor library.
-
-   This program is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published by
-   the Free Software Foundation; either version 3 of the License, or
-   (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-   GNU General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston,
-   MA 02110-1301, USA.  */
-   
-#include <bfd.h>
-/* Extracted from bfdio.c.  */
-struct bfd_iovec
-{
-  /* To avoid problems with macros, a "b" rather than "f"
-     prefix is prepended to each method name.  */
-  /* Attempt to read/write NBYTES on ABFD's IOSTREAM storing/fetching
-     bytes starting at PTR.  Return the number of bytes actually
-     transfered (a read past end-of-file returns less than NBYTES),
-     or -1 (setting <<bfd_error>>) if an error occurs.  */
-  file_ptr (*bread) (struct bfd *abfd, void *ptr, file_ptr nbytes);
-  file_ptr (*bwrite) (struct bfd *abfd, const void *ptr,
-                      file_ptr nbytes);
-  /* Return the current IOSTREAM file offset, or -1 (setting <<bfd_error>>
-     if an error occurs.  */
-  file_ptr (*btell) (struct bfd *abfd);
-  /* For the following, on successful completion a value of 0 is returned.
-     Otherwise, a value of -1 is returned (and  <<bfd_error>> is set).  */
-  int (*bseek) (struct bfd *abfd, file_ptr offset, int whence);
-  int (*bclose) (struct bfd *abfd);
-  int (*bflush) (struct bfd *abfd);
-  int (*bstat) (struct bfd *abfd, struct stat *sb);
-  /* Mmap a part of the files. ADDR, LEN, PROT, FLAGS and OFFSET are the usual
-     mmap parameter, except that LEN and OFFSET do not need to be page
-     aligned.  Returns (void *)-1 on failure, mmapped address on success.
-     Also write in MAP_ADDR the address of the page aligned buffer and in
-     MAP_LEN the size mapped (a page multiple).  Use unmap with MAP_ADDR and
-     MAP_LEN to unmap.  */
-  void *(*bmmap) (struct bfd *abfd, void *addr, bfd_size_type len,
-                  int prot, int flags, file_ptr offset,
-                  void **map_addr, bfd_size_type *map_len);
-};
\ No newline at end of file
diff --git a/cpu/cpu-client-runtime.c b/cpu/cpu-client-runtime.c
index f5e46b48..8d1f8ba2 100644
--- a/cpu/cpu-client-runtime.c
+++ b/cpu/cpu-client-runtime.c
@@ -1,4 +1,3 @@
-#include "mt-memcpy.h"
 #define _GNU_SOURCE
 #include <stdio.h>
 #include <stdlib.h>
@@ -24,6 +23,7 @@
 #include "cpu-utils.h"
 #include "log.h"
 #include "oob.h"
+#include "mt-memcpy.h"
 #ifdef WITH_IB
 #include "cpu-ib.h"
 #endif //WITH_IB
@@ -1088,12 +1088,12 @@ cudaError_t cudaFreeArray(cudaArray_t array)
 }
 
 typedef struct host_alloc_info {
-    int cnt;
+    int idx;
     size_t size;
     void *client_ptr;
 } host_alloc_info_t;
 static host_alloc_info_t hainfo[64] = {0};
-static size_t hainfo_cnt = 1;
+static size_t hainfo_cnt = 0;
 static int hainfo_getindex(void *client_ptr)
 {
     int i;
@@ -1195,44 +1195,51 @@ cudaError_t cudaHostAlloc(void** pHost, size_t size, unsigned int flags)
 #ifdef WITH_API_CNT
     api_call_cnt++;
 #endif //WITH_API_CNT
-    int ret = cudaErrorMemoryAllocation;
+    sz_result ret = {.err = cudaErrorMemoryAllocation};
+    int reg_ret;
     int fd_shm;
-    char shm_name[128];
+    char *shm_name = NULL;
     enum clnt_stat retval_1;
     
     if (shm_enabled && connection_is_local == 1) { //Use local shared memory
+        retval_1 = cuda_host_alloc_1(size, flags, &ret, clnt);
+        if (retval_1 != RPC_SUCCESS || ret.err != cudaSuccess) {
+            LOGE(LOG_ERROR, "cudaHostAlloc failed on server-side.");
+            goto out;
+        }
 
-        snprintf(shm_name, 128, "/crickethostalloc-%zu", hainfo_cnt);
-        if ((fd_shm = shm_open(shm_name, O_RDWR | O_CREAT, S_IRWXU)) == -1) {
-            LOGE(LOG_ERROR, "ERROR: could not open shared memory \"%s\" with size %d: %s", shm_name, size, strerror(errno));
+        if (asprintf(&shm_name, "/crickethostalloc-%zu", ret.sz_result_u.data) == -1) {
+            LOGE(LOG_ERROR, "ERROR: asprintf failed: %s", strerror(errno));
+            ret.err = cudaErrorMemoryAllocation;
             goto out;
         }
-        if (ftruncate(fd_shm, size) == -1) {
-            LOGE(LOG_ERROR, "ERROR: cannot resize shared memory");
-            shm_unlink(shm_name);
+        
+        if ((fd_shm = shm_open(shm_name, O_RDWR, S_IREAD | S_IWRITE)) == -1) {
+            LOGE(LOG_ERROR, "ERROR: could not open shared memory \"%s\" with size %zu: %s", shm_name, size, strerror(errno));
+            ret.err = cudaErrorMemoryAllocation;
             goto out;
         }
-        LOGE(LOG_DEBUG, "shm opened with name \"%s\", size: %d", shm_name, size);
+
         if ((*pHost = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd_shm, 0)) == MAP_FAILED) {
             LOGE(LOG_ERROR, "ERROR: mmap returned unexpected pointer: %p", *pHost);
             shm_unlink(shm_name);
+            ret.err = cudaErrorMemoryAllocation;
             goto out;
         }
 
-        hainfo[hainfo_cnt].cnt = hainfo_cnt;
+        hainfo[hainfo_cnt].idx = ret.sz_result_u.data;
         hainfo[hainfo_cnt].size = size;
         hainfo[hainfo_cnt].client_ptr = *pHost;
-
-        retval_1 = cuda_host_alloc_1(hainfo_cnt, size, (uint64_t)*pHost, flags, &ret, clnt);
-        if (retval_1 != RPC_SUCCESS) {
-            clnt_perror (clnt, "call failed");
-        }
-        if (ret == cudaSuccess) {
-            hainfo_cnt++;
-        } else {
-            munmap(*pHost, size);
-            *pHost = NULL;
+        hainfo_cnt++;
+        
+        retval_1 = cuda_host_alloc_regshm_1(ret.sz_result_u.data, (ptr)*pHost, &reg_ret, clnt);
+        // The registration status is reported in reg_ret; ret.err still holds cudaSuccess from cuda_host_alloc_1.
+        if (retval_1 != RPC_SUCCESS || reg_ret != cudaSuccess) {
+            LOGE(LOG_ERROR, "cudaHostAlloc failed on server-side.");
+            ret.err = (retval_1 != RPC_SUCCESS) ? cudaErrorMemoryAllocation : reg_ret;
+            goto out;
         }
+
         shm_unlink(shm_name);
     } else if (socktype == TCP) { //Use infiniband
 #ifdef WITH_IB
@@ -1240,7 +1245,7 @@ cudaError_t cudaHostAlloc(void** pHost, size_t size, unsigned int flags)
             LOGE(LOG_ERROR, "failed to register infiniband memory region");
             goto out;
         }
-        hainfo[hainfo_cnt].cnt = hainfo_cnt;
+        hainfo[hainfo_cnt].idx = hainfo_cnt;
         hainfo[hainfo_cnt].size = size;
         hainfo[hainfo_cnt].client_ptr = *pHost;
 
@@ -1255,7 +1260,7 @@ cudaError_t cudaHostAlloc(void** pHost, size_t size, unsigned int flags)
         if (*pHost == NULL) {
             goto out;
         } else {
-            ret = cudaSuccess;
+            ret.err = cudaSuccess;
             goto out;
         }
 #endif //WITH_IB
@@ -1264,7 +1269,8 @@ cudaError_t cudaHostAlloc(void** pHost, size_t size, unsigned int flags)
         goto out;
     }
 out:
-    return ret;
+    free(shm_name);
+    return ret.err;
 }
 
 cudaError_t cudaHostGetDevicePointer(void** pDevice, void* pHost, unsigned int flags)
diff --git a/cpu/cpu-client.c b/cpu/cpu-client.c
index 3746cc9d..09b35e59 100644
--- a/cpu/cpu-client.c
+++ b/cpu/cpu-client.c
@@ -77,12 +77,14 @@ static void rpc_connect(void)
 
 #endif // WITH_IB
 
-    //TODO: This is not necessary anymore. We should fix a static prog/vers
     prog = 99;
     vers = 1;
-    if (getenv("CRICKET_HASH") && cpu_utils_md5hash("/proc/self/exe", &prog, &vers) != 0) {
-        LOGE(LOG_ERROR, "error while creating binary checksum");
-        exit(0);
+    const char *env_vers = getenv("CRICKET_RPCID");
+    if (env_vers != NULL) {
+        if (sscanf(env_vers, "%lu", &vers) != 1) {
+            LOGE(LOG_ERROR, "error parsing CRICKET_RPCID");
+            exit(1);
+        }
     }
 
     char *cmd = NULL;
diff --git a/cpu/cpu-server-driver.c b/cpu/cpu-server-driver.c
index 3b2c3e8f..fb143f00 100644
--- a/cpu/cpu-server-driver.c
+++ b/cpu/cpu-server-driver.c
@@ -106,7 +106,8 @@ bool_t rpc_register_function_1_svc(ptr fatCubinHandle, ptr hostFun, char* device
     LOG(LOG_DEBUG, "rpc_register_function(fatCubinHandle: %p, hostFun: %p, deviceFun: %s, deviceName: %s, thread_limit: %d)",
         fatCubinHandle, hostFun, deviceFun, deviceName, thread_limit);
     GSCHED_RETAIN;
-    if ((module = resource_mg_get(&rm_modules, (void*)fatCubinHandle)) == fatCubinHandle) {
+    if ((module = resource_mg_get(&rm_modules, (void*)fatCubinHandle)) == (void*)fatCubinHandle) {
         LOG(LOG_ERROR, "%p not found in resource manager - we cannot call a function from an unknown module.", fatCubinHandle);
         result->err = -1;
+        GSCHED_RELEASE; // balance the GSCHED_RETAIN above on this early-exit path
         return 1;
diff --git a/cpu/cpu-server-runtime.c b/cpu/cpu-server-runtime.c
index f437c6f8..b1ba19be 100644
--- a/cpu/cpu-server-runtime.c
+++ b/cpu/cpu-server-runtime.c
@@ -35,13 +35,13 @@
 #include "mt-memcpy.h"
 
 typedef struct host_alloc_info {
-    int cnt;
+    size_t idx;
     size_t size;
     void *client_ptr;
     void *server_ptr;
 } host_alloc_info_t;
 static host_alloc_info_t hainfo[64];
-static size_t hainfo_cnt = 1;
+static size_t hainfo_cnt = 0;
 list mt_memcpy_list = {0};
 
 static int hainfo_getserverindex(void *server_ptr)
@@ -1052,8 +1052,8 @@ bool_t cuda_free_host_1_svc(int index, int *result, struct svc_req *rqstp)
         *result = cudaSuccess;
         return 1;
     }
-    if (hainfo[index].cnt != 0 &&
-        hainfo[index].cnt == index) {
+    if (hainfo[index].idx != 0 &&
+        hainfo[index].idx == index) {
 
         *result = cudaHostUnregister(hainfo[index].server_ptr);
         munmap(hainfo[index].server_ptr, hainfo[index].size);
@@ -1088,31 +1088,39 @@ bool_t cuda_get_symbol_size_1_svc(ptr symbol, u64_result *result, struct svc_req
     return 1;
 }
 
-bool_t cuda_host_alloc_1_svc(int client_cnt, size_t size, ptr client_ptr, unsigned int flags, int *result, struct svc_req *rqstp)
+bool_t cuda_host_alloc_1_svc(size_t size, unsigned int flags, sz_result *result, struct svc_req *rqstp)
 {
     //TODO: Make checkpointable. Implement reattaching of shm segment.
     int fd_shm;
-    char shm_name[128];
+    char *shm_name = NULL;
     void *shm_addr;
     unsigned int register_flags = 0;
-    *result = cudaErrorMemoryAllocation;
     RECORD_API(cuda_host_alloc_1_argument);
-    RECORD_ARG(1, client_cnt);
-    RECORD_ARG(2, size);
-    RECORD_ARG(3, client_ptr);
-    RECORD_ARG(4, flags);
+    RECORD_ARG(1, size);
+    RECORD_ARG(2, flags);
 
     LOGE(LOG_DEBUG, "cudaHostAlloc");
+    result->err = cudaErrorMemoryAllocation;
 
     if (socktype == UNIX || (shm_enabled && cpu_utils_is_local_connection(rqstp))) { //Use local shared memory
-        snprintf(shm_name, 128, "/crickethostalloc-%d", client_cnt);
-        if ((fd_shm = shm_open(shm_name, O_RDWR, 600)) == -1) {
+        if (asprintf(&shm_name, "/crickethostalloc-%d", hainfo_cnt) == -1) {
+            LOGE(LOG_ERROR, "asprintf failed: %s", strerror(errno));
+            goto out;
+        }
+        if ((fd_shm = shm_open(shm_name, O_RDWR | O_CREAT | O_TRUNC, S_IRWXU)) == -1) {
             LOGE(LOG_ERROR, "could not open shared memory \"%s\" with size %d: %s", shm_name, size, strerror(errno));
             goto out;
         }
+        if (ftruncate(fd_shm, size) == -1) {
+            LOGE(LOG_ERROR, "cannot resize shared memory");
+            shm_unlink(shm_name);
+            goto out;
+        }
+        result->sz_result_u.data = hainfo_cnt;
+        LOGE(LOG_DEBUG, "shm opened with name \"%s\", size: %d", shm_name, size);
         if ((shm_addr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd_shm, 0)) == MAP_FAILED) {
             LOGE(LOG_ERROR, "mmap returned unexpected pointer: %p", shm_addr);
-            goto cleanup;
+            goto out;
         }
 
         if (flags & cudaHostAllocPortable) {
@@ -1125,23 +1133,23 @@ bool_t cuda_host_alloc_1_svc(int client_cnt, size_t size, ptr client_ptr, unsign
             register_flags |= cudaHostRegisterMapped;
         }
 
-        if ((*result = cudaHostRegister(shm_addr, size, flags)) != cudaSuccess) {
+        if ((result->err = cudaHostRegister(shm_addr, size, flags)) != cudaSuccess) {
             LOGE(LOG_ERROR, "cudaHostRegister failed.");
             munmap(shm_addr, size);
-            goto cleanup;
+            goto out;
         }
-
-        hainfo[hainfo_cnt].cnt = client_cnt;
+        hainfo[hainfo_cnt].idx = hainfo_cnt;
         hainfo[hainfo_cnt].size = size;
-        hainfo[hainfo_cnt].client_ptr = (void*)client_ptr;
+        hainfo[hainfo_cnt].client_ptr = NULL;
         hainfo[hainfo_cnt].server_ptr = shm_addr;
         hainfo_cnt++;
     } else if (socktype == TCP) { //Use infiniband
 #ifdef WITH_IB
-   
+        LOGE(LOG_ERROR, "infiniband does not yet support cudaHostAlloc.");
+        goto cleanup;
 #else
-                LOGE(LOG_ERROR, "infiniband is disabled.");
-                goto cleanup;
+        LOGE(LOG_ERROR, "infiniband is disabled.");
+        goto out;
 #endif //WITH_IB
 
     } else {
@@ -1149,14 +1157,40 @@ bool_t cuda_host_alloc_1_svc(int client_cnt, size_t size, ptr client_ptr, unsign
         goto out;
     }
 
+    result->err = cudaSuccess;
+out:
+    RECORD_RESULT(sz_result_u, *result);
+    return 1;
+}
+
+bool_t cuda_host_alloc_regshm_1_svc(size_t hainfo_idx, ptr client_ptr, int *result, struct svc_req *rqstp)
+{
+    char *shm_name = NULL;
+    RECORD_API(cuda_host_alloc_regshm_1_argument);
+    RECORD_ARG(1, hainfo_idx);
+    RECORD_ARG(2, client_ptr);
+
+    LOGE(LOG_DEBUG, "cudaHostAllocRegShm");
+    *result = cudaErrorMemoryAllocation;
+
+    if (socktype != UNIX && !(shm_enabled && cpu_utils_is_local_connection(rqstp))) {
+        LOGE(LOG_ERROR, "cudaHostAllocRegShm is only supported for local connections.");
+        goto out;
+    }
+    if (asprintf(&shm_name, "/crickethostalloc-%d", hainfo_idx) == -1) {
+        LOGE(LOG_ERROR, "asprintf failed: %s", strerror(errno));
+        goto out;
+    }
+    hainfo[hainfo_idx].client_ptr = (void*)client_ptr;
     *result = cudaSuccess;
-cleanup:
-    shm_unlink(shm_name);
 out:
+    shm_unlink(shm_name);
+    free(shm_name);
     RECORD_RESULT(integer, *result);
     return 1;
 }
 
+
 bool_t cuda_host_get_device_pointer_1_svc(ptr pHost, int flags, ptr_result *result, struct svc_req *rqstp)
 {
     LOGE(LOG_DEBUG, "cudaHostGetDevicePointer");
@@ -1189,7 +1223,7 @@ bool_t cuda_malloc_1_svc(size_t argp, ptr_result *result, struct svc_req *rqstp)
 #ifdef WITH_IB
         result->err = ib_allocate_memreg((void**)&result->ptr_result_u.ptr, argp, hainfo_cnt, true);
             if (result->err == 0) {
-                hainfo[hainfo_cnt].cnt = hainfo_cnt;
+                hainfo[hainfo_cnt].idx = hainfo_cnt;
                 hainfo[hainfo_cnt].size = argp;
                 hainfo[hainfo_cnt].server_ptr = (void*)result->ptr_result_u.ptr;
 
@@ -1500,8 +1534,8 @@ bool_t cuda_memcpy_ib_1_svc(int index, ptr device_ptr, size_t size, int kind, in
     LOGE(LOG_DEBUG, "cudaMemcpyIB");
     *result = cudaErrorInitializationError;
     //anstatt array list (list.c)
-    if (hainfo[index].cnt == 0 ||
-        hainfo[index].cnt != index) {
+    if (hainfo[index].idx == 0 ||
+        hainfo[index].idx != index) {
 
         LOGE(LOG_ERROR, "inconsistent state");
         goto out;
@@ -1553,12 +1587,12 @@ bool_t cuda_memcpy_shm_1_svc(int index, ptr device_ptr, size_t size, int kind, i
     RECORD_ARG(2, device_ptr);
     RECORD_ARG(3, size);
     RECORD_ARG(4, kind);
-    LOGE(LOG_DEBUG, "cudaMemcpyShm");
+    LOGE(LOG_DEBUG, "cudaMemcpyShm(index: %d, device_ptr: %p, size: %d, kind: %d)", index, device_ptr, size, kind);
     *result = cudaErrorInitializationError;
-    if (hainfo[index].cnt == 0 ||
-        hainfo[index].cnt != index) {
+    if (index >= hainfo_cnt ||
+        hainfo[index].idx != index) {
 
-        LOGE(LOG_ERROR, "inconsistent state");
+        LOGE(LOG_ERROR, "inconsistent state: index: %d, hainfo[index].idx: %d", index, hainfo[index].idx);
         goto out;
     }
     if (hainfo[index].size < size) {
@@ -1672,8 +1706,8 @@ bool_t cuda_memcpy_to_symbol_shm_1_svc(int index, ptr device_ptr, size_t size, s
     RECORD_ARG(5, kind);
     LOGE(LOG_DEBUG, "cudaMemcpyToSymbolShm");
     *result = cudaErrorInitializationError;
-    if (hainfo[index].cnt == 0 ||
-        hainfo[index].cnt != index) {
+    if (hainfo[index].idx == 0 ||
+        hainfo[index].idx != index) {
 
         LOGE(LOG_ERROR, "inconsistent state");
         goto out;
diff --git a/cpu/cpu-server.c b/cpu/cpu-server.c
index 9e309d8e..9b0dccbd 100644
--- a/cpu/cpu-server.c
+++ b/cpu/cpu-server.c
@@ -160,19 +160,7 @@ bool_t rpc_dlopen_1_svc(char *path, int *result, struct svc_req *rqstp)
     return 1;
 }
 
-
-
-void cricket_main_hash(char* app_command)
-{
-    cricket_main(app_command, 0, 0);
-}
-
-void cricket_main_static(size_t prog_num, size_t vers_num)
-{
-    cricket_main("", prog_num, vers_num);
-}
-
-void cricket_main(char* app_command, size_t prog_num, size_t vers_num)
+void cricket_main(size_t prog_num, size_t vers_num)
 {
     int ret = 1;
     register SVCXPRT *transp;
@@ -217,36 +205,16 @@ void cricket_main(char* app_command, size_t prog_num, size_t vers_num)
             restore = 1;
     }
 
-    if (cpu_utils_command(&command) != 0) {
-        LOG(LOG_WARNING, "could not retrieve command name. This might prevent starting CUDA applications");
-    } else {
-        LOG(LOG_DEBUG, "the command is '%s'", command);
-        //This is a workaround to make LD_PRELOAD work under GDB supervision
-        const char *cmp = "cudbgprocess";
-        if (strncmp(command, cmp, strlen(cmp)) == 0) {
-            LOG(LOG_DEBUG, "skipping RPC server");
-            return;
-        }
-    }
-
     if (restore == 1) {
         if (cr_restore_rpc_id("ckp", &prog, &vers) != 0) {
             LOGE(LOG_ERROR, "error while restoring rpc id");
         }
     } else {
-        if (prog_num == 0) {
-            if (cpu_utils_md5hash(app_command, &prog, &vers) != 0) {
-                LOGE(LOG_ERROR, "error while creating binary checksum");
-                exit(0);
-            }
-        }
-        else {
-            prog = prog_num;
-            vers = vers_num;
-        }
+        prog = prog_num;
+        vers = vers_num;
     }
 
-    LOGE(LOG_DEBUG, "using prog=%d, vers=%d, derived from \"%s\"", prog, vers, app_command);
+    LOGE(LOG_DEBUG, "using prog=%d, vers=%d", prog, vers);
 
 
     switch (socktype) {
@@ -299,7 +267,7 @@ void cricket_main(char* app_command, size_t prog_num, size_t vers_num)
     //     cudaRegisterAllv();
     // }
 
-    sched = &sched_none; 
+    sched = &sched_none;
     if (sched->init() != 0) {
         LOGE(LOG_ERROR, "initializing scheduler failed.");
         goto cleanup4;
diff --git a/cpu/cpu-server.h b/cpu/cpu-server.h
index 9c3fcbb0..3eea0f63 100644
--- a/cpu/cpu-server.h
+++ b/cpu/cpu-server.h
@@ -3,8 +3,6 @@
 
 #include <stddef.h>
 
-void cricket_main(char* app_command, size_t prog_version, size_t vers_num);
-void cricket_main_hash(char* app_command);
-void cricket_main_static(size_t prog_num, size_t vers_num);
+void cricket_main(size_t prog_version, size_t vers_num);
 
 #endif //_CPU_SERVER_H_
diff --git a/cpu/cpu-utils.c b/cpu/cpu-utils.c
index 098593c3..cf67ce07 100644
--- a/cpu/cpu-utils.c
+++ b/cpu/cpu-utils.c
@@ -123,10 +123,6 @@ kernel_info_t* utils_search_info(list *kernel_infos, const char *kernelname)
 
 int cpu_utils_is_local_connection(struct svc_req *rqstp)
 {
-    LOGE(LOG_DEBUG, "%p", rqstp);
-    LOGE(LOG_DEBUG, "%p", rqstp->rq_xprt);
-    LOGE(LOG_DEBUG, "%p", rqstp->rq_xprt->xp_fd);
-
     struct sockaddr_in remote_addr = {0};
     struct sockaddr_in local_addr = {0};
     struct hostent *hp;
diff --git a/cpu/cpu_rpc_prot.x b/cpu/cpu_rpc_prot.x
index 45495011..2a147a7c 100644
--- a/cpu/cpu_rpc_prot.x
+++ b/cpu/cpu_rpc_prot.x
@@ -80,6 +80,13 @@ default:
     void;
 };
 
+union sz_result switch (int err) {
+case 0:
+    size_t data;
+default:
+    void;
+};
+
 union ptr_result switch (int err) {
 case 0:
     ptr ptr;
@@ -226,7 +233,8 @@ program RPC_CD_PROG {
         /*ptr_result CUDA_GET_MIPMAPPED_ARRAY_LEVEL(ptr, int)                   = 406;*/
         ptr_result   CUDA_GET_SYMBOL_ADDRESS(ptr)                               = 407;
         u64_result   CUDA_GET_SYMBOL_SIZE(ptr)                                  = 408;
-        int          CUDA_HOST_ALLOC(int, size_t, ptr, unsigned int)            = 409;
+        sz_result    CUDA_HOST_ALLOC(size_t, unsigned int)                 = 409;
+        int          CUDA_HOST_ALLOC_REGSHM(size_t, ptr)                        = 477;
         ptr_result   CUDA_HOST_GET_DEVICE_POINTER(ptr, int)                     = 410;
         int_result   CUDA_HOST_GET_FLAGS(ptr)                                   = 411;
         /*int        CUDA_HOST_REGISTER(ptr, size_t, int)                       = 412;*/
diff --git a/cpu/gsched_none.c b/cpu/gsched_none.c
index 8b509089..6cb1152f 100644
--- a/cpu/gsched_none.c
+++ b/cpu/gsched_none.c
@@ -23,7 +23,7 @@ int gsched_none_init(void)
     pthread_mutex_init(&mutex_device, NULL);
     pthread_mutex_init(&mutex_ids, NULL);
     if ((res = cudaGetDeviceCount(&cuda_max_devices)) != cudaSuccess) {
-        LOGE(LOG_ERROR, "cudaGetDeviceCount failed: %s", cudaGetErrorString(res));
+        LOGE(LOG_ERROR, "cudaGetDeviceCount failed: %s (%d)", cudaGetErrorString(res), res);
         return 1;
     }
     return 0;
diff --git a/cpu/server-exe.c b/cpu/server-exe.c
index 8e358be6..805dd9ee 100644
--- a/cpu/server-exe.c
+++ b/cpu/server-exe.c
@@ -3,17 +3,22 @@
 #include "log.h"
 
 #include <stdlib.h>
+#include <stdint.h>
 
 int main(int argc, char** argv)
 {
-
-    //TODO: Check if command path exists
     if (argc == 1) {
-        cricket_main_static(RPC_CD_PROG, RPC_CD_VERS);
+        cricket_main(RPC_CD_PROG, RPC_CD_VERS);
     } else if (argc == 2) {
-        cricket_main_hash(argv[1]);
+        uint64_t vers;
+        if (sscanf(argv[1], "%lu", &vers) != 1) {
+            LOGE(LOG_ERROR, "version string could not be converted to number");
+            LOGE(LOG_INFO, "usage: %s [unique rpc version]", argv[0]);
+            return 1;
+        }
+        cricket_main(RPC_CD_PROG, vers);
     } else {
-        LOGE(LOG_ERROR, "usage: %s [command]", argv[0]);
+        LOGE(LOG_INFO, "usage: %s", argv[0]);
     }
     return 0;
 }
diff --git a/cpu/server-library.c b/cpu/server-library.c
deleted file mode 100644
index cd5e57a1..00000000
--- a/cpu/server-library.c
+++ /dev/null
@@ -1,10 +0,0 @@
-
-#include "cpu-server.h"
-#include "log.h"
-
-/* shared object constructor; executes before main and thus hijacks main program */
-void __attribute__ ((constructor)) library_constr(void)
-{
-    cricket_main_hash("/proc/self/exe");
-}
-

From 6e461546c432993a22bc1c9fb9f64a834cb3b85f Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Fri, 12 May 2023 15:51:07 +0200
Subject: [PATCH 34/83] fix cudaMemcpy to use the correct shm index references

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 cpu/cpu-client-runtime.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpu/cpu-client-runtime.c b/cpu/cpu-client-runtime.c
index 8d1f8ba2..c8d68b83 100644
--- a/cpu/cpu-client-runtime.c
+++ b/cpu/cpu-client-runtime.c
@@ -1578,7 +1578,7 @@ cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, enum cudaMemcpy
 #endif //WITH_MT_MEMCPY
         } else {
             if (shm_enabled && connection_is_local == 1) { //Use local shared memory
-                retval = cuda_memcpy_shm_1(index, (ptr)dst, count, kind, &ret, clnt);
+                retval = cuda_memcpy_shm_1(hainfo[index].idx, (ptr)dst, count, kind, &ret, clnt);
             } else if (socktype == TCP) { //Use infiniband
 #ifdef WITH_IB
                 //the following commend connects to serverside cuda_memcpy_ib_1_svc, server thread is initialized waiting for client send
@@ -1641,7 +1641,7 @@ cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, enum cudaMemcpy
 #endif //WITH_MT_MEMCPY
         } else {
             if (shm_enabled && connection_is_local) { //Use local shared memory
-                retval = cuda_memcpy_shm_1(index, (ptr)src, count, kind, &ret, clnt);
+                retval = cuda_memcpy_shm_1(hainfo[index].idx, (ptr)src, count, kind, &ret, clnt);
             } else if (socktype == TCP) { //Use infiniband
 #ifdef WITH_IB
                 pthread_t thread = {0};

From 4a4bd02f3f679a0ec1e7a1174813ba0fc6d49c70 Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Mon, 15 May 2023 15:21:00 +0200
Subject: [PATCH 35/83] fix resource manager add_sorted function inserting at
 the wrong location in the list, leading to binary search failing sometimes.

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 cpu/cpu-client-runtime.c |  5 ++---
 cpu/cpu-server-driver.c  |  3 ++-
 cpu/resource-mg.c        | 28 ++++++++++++++++++++++++++--
 cpu/resource-mg.h        |  2 ++
 4 files changed, 32 insertions(+), 6 deletions(-)

diff --git a/cpu/cpu-client-runtime.c b/cpu/cpu-client-runtime.c
index c8d68b83..df3148b4 100644
--- a/cpu/cpu-client-runtime.c
+++ b/cpu/cpu-client-runtime.c
@@ -1534,7 +1534,6 @@ extern char server[256];
 #define WITH_MT_MEMCPY
 cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, enum cudaMemcpyKind kind)
 {
-    
 #ifdef WITH_API_CNT
     api_call_cnt++;
     memcpy_cnt += count;
@@ -1542,9 +1541,9 @@ cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, enum cudaMemcpy
     int ret = 1;
     enum clnt_stat retval;
     if (kind == cudaMemcpyHostToDevice) {
-//get index of mem reg (src: cpu reg memregion)
+        // get index of mem reg (src: cpu reg memregion)
         int index = hainfo_getindex((void*)src);
-//         not a cudaHostAlloc'ed memory 
+        // not a cudaHostAlloc'ed memory 
         if (index == -1) {
 #ifdef WITH_MT_MEMCPY
             if (count > 2*MT_MEMCPY_MEM_PER_THREAD) {
diff --git a/cpu/cpu-server-driver.c b/cpu/cpu-server-driver.c
index fb143f00..1cb7d324 100644
--- a/cpu/cpu-server-driver.c
+++ b/cpu/cpu-server-driver.c
@@ -106,8 +106,9 @@ bool_t rpc_register_function_1_svc(ptr fatCubinHandle, ptr hostFun, char* device
     LOG(LOG_DEBUG, "rpc_register_function(fatCubinHandle: %p, hostFun: %p, deviceFun: %s, deviceName: %s, thread_limit: %d)",
         fatCubinHandle, hostFun, deviceFun, deviceName, thread_limit);
     GSCHED_RETAIN;
+    //resource_mg_print(&rm_modules);
     if ((module = resource_mg_get(&rm_modules, (void*)fatCubinHandle)) == (void*)fatCubinHandle) {
-        LOG(LOG_ERROR, "%p not found in resource manager - we cannot call a function from an unknown module.", fatCubinHandle);
+        LOGE(LOG_ERROR, "%p not found in resource manager - we cannot call a function from an unknown module.", fatCubinHandle);
         result->err = -1;
         return 1;
     }
diff --git a/cpu/resource-mg.c b/cpu/resource-mg.c
index e07e6a5f..f78503fe 100644
--- a/cpu/resource-mg.c
+++ b/cpu/resource-mg.c
@@ -75,6 +75,28 @@ static void* resource_mg_search_map(resource_mg *mg, void *client_address)
     LOGE(LOG_DEBUG, "no find: %p", client_address);
     return client_address;
 }
+
+void resource_mg_print(resource_mg *mg)
+{
+    size_t i;
+    resource_mg_map_elem *elem;
+    if (mg == NULL) {
+        LOGE(LOG_ERROR, "resource manager mg is NULL");
+        return;
+    }
+    LOG(LOG_DEBUG, "new_res:");
+    for (i = 0; i < mg->new_res.length; i++) {
+        LOG(LOG_DEBUG, "%p", *(void**)list_get(&mg->new_res, i));
+    }
+    if (mg->bypass == 0) {
+        LOG(LOG_DEBUG, "map_res:");
+        for (i = 0; i < mg->map_res.length; i++) {
+            elem = list_get(&mg->map_res, i);
+            LOG(LOG_DEBUG, "%p -> %p", elem->client_address, elem->cuda_address);
+        }
+    }
+}
+
 inline void* resource_mg_get(resource_mg *mg, void* client_address)
 {
     if (mg->bypass) {
@@ -85,6 +107,7 @@ inline void* resource_mg_get(resource_mg *mg, void* client_address)
     return 0;
 }
 
+#include <stdio.h>
 int resource_mg_add_sorted(resource_mg *mg, void* client_address, void* cuda_address)
 {
     ssize_t start = 0;
@@ -124,10 +147,11 @@ int resource_mg_add_sorted(resource_mg *mg, void* client_address, void* cuda_add
             return 0;
         }
     }
-    if (end < 0) {
+    if (end < 0LL) {
         end = 0;
     }
-    if (mid_elem->client_address < client_address) {
+    resource_mg_map_elem *end_elem = list_get(&mg->map_res, end);
+    if (end_elem->client_address < client_address) {
         end++;
     }
     return list_insert(&mg->map_res, end, &new_elem);
diff --git a/cpu/resource-mg.h b/cpu/resource-mg.h
index 0b134da7..6f85155c 100644
--- a/cpu/resource-mg.h
+++ b/cpu/resource-mg.h
@@ -55,4 +55,6 @@ int resource_mg_create(resource_mg *mg, void* cuda_address);
 
 void* resource_mg_get(resource_mg *mg, void* client_address);
 
+void resource_mg_print(resource_mg *mg);
+
 #endif //_RESOURCE_MG_H_

From bde65005533f4f87c846ff9513da5dccf3c482ed Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Tue, 16 May 2023 10:08:05 +0200
Subject: [PATCH 36/83] fix wrong decoding of compressed kernels

When walking through compressed cubins, we added the decompressed size to the current location instead of the compressed size, which led to jumping over important parts of the cubin.

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 cpu/cpu-elf2.c | 15 +++++++--------
 cpu/cpu-elf2.h |  2 +-
 2 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/cpu/cpu-elf2.c b/cpu/cpu-elf2.c
index 42126b4b..1ada98f9 100644
--- a/cpu/cpu-elf2.c
+++ b/cpu/cpu-elf2.c
@@ -450,13 +450,16 @@ int elf2_get_fatbin_info(const struct fat_header *fatbin, list *kernel_infos, ui
         LOGE(LOG_ERROR, "fatbin struct magic number is wrong. Got %llx, expected %llx.", fatbin->magic, FATBIN_STRUCT_MAGIC);
         goto error;
     }
-    LOG(LOG_DBG(1), "Fatbin: magic: %x, version: %x, text: %lx, data: %lx, ptr: %lx, ptr2: %lx, zero: %lx",
+    LOGE(LOG_DBG(1), "Fatbin: magic: %x, version: %x, text: %lx, data: %lx, ptr: %lx, ptr2: %lx, zero: %lx",
            fatbin->magic, fatbin->version, fatbin->text, fatbin->data, fatbin->unknown, fatbin->text2, fatbin->zero);
 
     if (get_elf_header((uint8_t*)fatbin->text, sizeof(struct fat_elf_header), &eh) != 0) {
         LOGE(LOG_ERROR, "Something went wrong while checking the header.");
         goto error;
     }
+    LOGE(LOG_DBG(1), "elf header: magic: %#x, version: %#x, header_size: %#x, size: %#zx",
+           eh->magic, eh->version, eh->header_size, eh->size); 
+
     input_pos += eh->header_size;
     fatbin_total_size = eh->header_size + eh->size;
     do {
@@ -464,14 +467,10 @@ int elf2_get_fatbin_info(const struct fat_header *fatbin, list *kernel_infos, ui
             fprintf(stderr, "Something went wrong while checking the header.\n");
             goto error;
         }
-        //print_header(th);
+        print_header(th);
         input_pos += th->header_size;
         if (th->kind != 2) { // section does not cotain device code (but e.g. PTX)
-            if (th->flags & FATBIN_FLAG_COMPRESS) {
-                input_pos += th->decompressed_size;
-            } else {
-                input_pos += th->size;
-            }
+            input_pos += th->size;
             continue;
         }
         if (th->flags & FATBIN_FLAG_DEBUG) {
@@ -513,7 +512,7 @@ int elf2_get_fatbin_info(const struct fat_header *fatbin, list *kernel_infos, ui
     *fatbin_mem = (void*)fatbin->text;
     *fatbin_size = fatbin_total_size;
     ret = 0;
- error:    
+ error:
     return ret;
 }
 
diff --git a/cpu/cpu-elf2.h b/cpu/cpu-elf2.h
index f484500f..a170cb37 100644
--- a/cpu/cpu-elf2.h
+++ b/cpu/cpu-elf2.h
@@ -5,7 +5,7 @@
 #include "cpu-common.h"
 #include "list.h"
 
-struct fat_header {
+struct __attribute__((__packed__)) fat_header {
     uint32_t magic;
     uint32_t version;
     uint64_t text;      // points to first text section

From 33e0fe4b5be7745fe0817e3dc4c105edd215d62e Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Tue, 16 May 2023 10:36:26 +0200
Subject: [PATCH 37/83] update dockerfiles so they install cuda profiler api
 and add new Dockerfile for CUDA 12.1

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 .gitlab-ci.yml           | 54 +++++++++++++++++++++++++++++++++-------
 Makefile                 |  5 ++--
 cpu/Makefile             |  6 ++++-
 cpu/cpu-client-driver.c  |  4 +++
 cpu/cpu-client-runtime.c |  6 ++++-
 cpu/cpu-elf2.c           |  6 ++---
 cpu/cpu-server-runtime.c |  4 +--
 submodules/Makefile      |  5 ++++
 tests/Makefile           | 12 ++++++---
 tests/cpu/unit/Makefile  |  9 ++++---
 tests/samples/Makefile   |  2 +-
 utils/Dockerfile         | 19 +++++++-------
 utils/Dockerfile.cuda10  |  9 ++++---
 utils/Dockerfile.cuda11  | 43 ++++++++++++++++++++++++++++++++
 14 files changed, 145 insertions(+), 39 deletions(-)
 create mode 100644 utils/Dockerfile.cuda11

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 35089bc0..a8506d82 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -21,7 +21,7 @@ stages:
 ##############################################################################
 
 # Build docker image
-prepare:centos8:docker-dev:
+prepare:rocky9:docker-dev:
   stage: prepare
   script:
     - docker build
@@ -31,6 +31,16 @@ prepare:centos8:docker-dev:
   tags:
     - docker
 
+prepare:centos8:cuda11:
+  stage: prepare
+  script:
+    - docker build
+        --file utils/Dockerfile.cuda11
+        --tag ${DOCKER_IMAGE_DEV}_cuda11:${DOCKER_TAG}
+        --tag ${DOCKER_IMAGE_DEV}_cuda11:latest .
+  tags:
+    - docker
+
 prepare:centos8:cuda10:
   stage: prepare
   script:
@@ -57,7 +67,7 @@ prepare:centos8:cuda10:
 
 build:
   stage: build
-  needs: ["prepare:centos8:docker-dev"]
+  needs: ["prepare:rocky9:docker-dev"]
   script:
    - make -j 32 libtirpc
    - make -j 32 cuda-gdb
@@ -82,7 +92,7 @@ build:
 
 build:ib:
   stage: build
-  needs: ["prepare:centos8:docker-dev"]
+  needs: ["prepare:rocky9:docker-dev"]
   script:
    - make -j 32 libtirpc
    - make -j 32 cuda-gdb
@@ -108,13 +118,39 @@ build:ib:
   tags:
     - docker
 
+build:cuda11:
+  stage: build
+  needs: ["prepare:centos8:cuda11"]
+  script:
+   - make -j 32 libtirpc
+   - make -j 32 cuda-gdb
+   - make -j 1 LOG=INFO NOSAMPLES=yes
+  artifacts:
+    expire_in: 1 week
+    paths:
+      - bin
+      - tests/bin
+  image: ${DOCKER_IMAGE_DEV}_cuda11:${DOCKER_TAG}
+  cache:
+    paths:
+      - gpu/build
+      - cpu/*.o
+      - tests/cpu/*.o
+      - tests/test_apps/*.o
+      - submodules/libtirpc
+      - submodules/cuda-gdb
+      - submodules/cuda-gdb-src.rpm
+    key: build_cuda11
+  tags:
+    - docker
+
 build:cuda10:
   stage: build
   needs: ["prepare:centos8:cuda10"]
   script:
    - make -j 32 libtirpc
    - make -j 32 cuda-gdb
-   - make -j 1 LOG=INFO
+   - make -j 1 LOG=INFO NOSAMPLES=yes
   artifacts:
     expire_in: 1 week
     paths:
@@ -136,7 +172,7 @@ build:cuda10:
 
 build:debug:
   stage: build
-  needs: ["prepare:centos8:docker-dev"]
+  needs: ["prepare:rocky9:docker-dev"]
   script:
    - make -j 32 libtirpc
    - make -j 32 cuda-gdb
@@ -179,7 +215,7 @@ build:debug:
       echo $KNOWN_HOSTS > ~/.ssh/known_hosts && chmod 600 ~/.ssh/id_rsa
     - ssh $GPU_TARGET mkdir -p $RDIR
     - scp -r $LDIR/* $GPU_TARGET:$RDIR/
-    - ssh $GPU_TARGET "LD_PRELOAD=$RDIR/libtirpc.so.3:$RDIR/cricket-server.so $RDIR/$TEST_BINARY" &
+    - ssh $GPU_TARGET "LD_PRELOAD=$RDIR/libtirpc.so.3 $RDIR/cricket-rpc-server" &
     - sleep 2
     - REMOTE_GPU_ADDRESS="ghost.acs-lab.eonerc.rwth-aachen.de" PATH=$LDIR:$PATH LD_PRELOAD=$LDIR/libtirpc.so.3:$LDIR/cricket-client.so $LDIR/$TEST_BINARY $PARAMETER
   after_script:
@@ -221,16 +257,16 @@ test:test_kernel:
 test:samples:matrixMul:
     extends: .remote-gpu
     variables:
-      TEST_BINARY: 'tests/matrixMul'
+      TEST_BINARY: 'tests/matrixMul.compressed.sample'
 
 test:samples:bandwidthTest:
     extends: .remote-gpu
     variables:
-      TEST_BINARY: 'tests/bandwidthTest'
+      TEST_BINARY: 'tests/bandwidthTest.sample'
 
 test:samples:nbody:
     extends: .remote-gpu
     variables:
-      TEST_BINARY: 'tests/nbody'
+      TEST_BINARY: 'tests/nbody.uncompressed.sample'
       PARAMETER: '-benchmark'
 
diff --git a/Makefile b/Makefile
index 9007482d..7cc6f46c 100644
--- a/Makefile
+++ b/Makefile
@@ -33,7 +33,7 @@ tests:
 	@echo -e "\033[36m----> Building test kernels\033[0m"
 	$(MAKE) -C tests
 
-install-cpu: bin/cricket-client.so bin/cricket-server.so bin/libtirpc.so bin/libtirpc.so.3 bin/tests
+install-cpu: bin/cricket-client.so bin/cricket-rpc-server bin/libtirpc.so bin/libtirpc.so.3 bin/tests
 	@echo -e "\033[36m----> Copying cpu binaries to build/bin\033[0m"
 
 install: install-cpu bin/cricket
@@ -51,7 +51,8 @@ bin/cricket-client.so: bin
 
 bin/cricket-server.so: bin
 	$(MAKE) -C cpu cricket-server.so
-	cp cpu/cricket-server.so bin
+	mv cpu/cricket-server.so bin/cricket-server.so
+
 
 bin/cricket-rpc-server: bin
 	$(MAKE) -C cpu cricket-rpc-server
diff --git a/cpu/Makefile b/cpu/Makefile
index 8f29d031..3369aeb9 100644
--- a/cpu/Makefile
+++ b/cpu/Makefile
@@ -1,5 +1,6 @@
 #Standalone RPC Server
 SERVER = cricket-rpc-server
+SERVER_LIB = cricket-server.so
 #RPC client library
 CLIENT = cricket-client.so
 
@@ -113,6 +114,9 @@ all : $(SERVER) $(CLIENT)
 $(CLIENT) : $(OBJ_CLIENT)
 	$(LD) $(CC_FLAGS) -shared -o $@ $^ $(CLIENT_LD_FLAGS)
 
+$(SERVER_LIB) : $(OBJ_SERVER) $(SRC_SERVER_EXE:%.c=%.o)
+	$(LD) $(CC_FLAGS) -shared -o $@ $^ $(SERVER_BIN_LD_FLAGS)
+
 $(SERVER) : $(OBJ_SERVER) $(SRC_SERVER_EXE:%.c=%.o)
 	$(LD) $(CC_FLAGS) -o $@ $^ $(SERVER_BIN_LD_FLAGS)
 
@@ -135,7 +139,7 @@ $(RPC_XDR) : $(RPC_DEF)
 	$(CC) $(CC_FLAGS) -c -fpic -o $@ $< $(LD_FLAGS) 
 
 clean:
-	 rm -f $(RPC_H) $(RPC_CLIENT) $(RPC_SERVER) $(RPC_SERVER_MOD) $(RPC_XDR) $(OBJ_CLIENT) $(OBJ_SERVER) $(SERVER) $(CLIENT) $(SRC_SERVER_EXE:%.c=%.o)
+	 rm -f $(RPC_H) $(RPC_CLIENT) $(RPC_SERVER) $(RPC_SERVER_MOD) $(RPC_XDR) $(OBJ_CLIENT) $(OBJ_SERVER) $(SERVER) $(SERVER_LIB) $(CLIENT) $(SRC_SERVER_EXE:%.c=%.o)
 
 
 
diff --git a/cpu/cpu-client-driver.c b/cpu/cpu-client-driver.c
index a37810c4..f180394f 100644
--- a/cpu/cpu-client-driver.c
+++ b/cpu/cpu-client-driver.c
@@ -697,7 +697,11 @@ DEF_FN(CUresult, cuGraphNodeGetDependencies, CUgraphNode, hNode, CUgraphNode*, d
 DEF_FN(CUresult, cuGraphNodeGetDependentNodes, CUgraphNode, hNode, CUgraphNode*, dependentNodes, size_t*, numDependentNodes)
 DEF_FN(CUresult, cuGraphAddDependencies, CUgraph, hGraph, const CUgraphNode*, from, const CUgraphNode*, to, size_t, numDependencies)
 DEF_FN(CUresult, cuGraphRemoveDependencies, CUgraph, hGraph, const CUgraphNode*, from, const CUgraphNode*, to, size_t, numDependencies)
+#if CUDA_VERSION >= 12000
 DEF_FN(CUresult, cuGraphInstantiate, CUgraphExec*, phGraphExec, CUgraph, hGraph, unsigned long long, flags)
+#else
+DEF_FN(CUresult, cuGraphInstantiate, CUgraphExec*, phGraphExec, CUgraph, hGraph, CUgraphNode*, phErrorNode, char*, logBuffer, size_t, bufferSize)
+#endif
 DEF_FN(CUresult, cuGraphLaunch, CUgraphExec, hGraphExec, CUstream, hStream)
 DEF_FN(CUresult, cuGraphLaunch_ptsz, CUgraphExec, hGraphExec, CUstream, hStream)
 DEF_FN(CUresult, cuGraphExecDestroy, CUgraphExec, hGraphExec)
diff --git a/cpu/cpu-client-runtime.c b/cpu/cpu-client-runtime.c
index df3148b4..9ccf5e9e 100644
--- a/cpu/cpu-client-runtime.c
+++ b/cpu/cpu-client-runtime.c
@@ -1252,7 +1252,7 @@ cudaError_t cudaHostAlloc(void** pHost, size_t size, unsigned int flags)
         hainfo_cnt++;
 
         retval_1 = RPC_SUCCESS;
-        ret = cudaSuccess;
+        ret.err = cudaSuccess;
 
 #else
         LOGE(LOG_DEBUG, "cudaHostAlloc is not supported for TCP transports without IB. Using malloc instead...");
@@ -1912,7 +1912,11 @@ DEF_FN(cudaError_t, cudaGraphGetNodes, cudaGraph_t, graph, cudaGraphNode_t*, nod
 DEF_FN(cudaError_t, cudaGraphGetRootNodes, cudaGraph_t, graph, cudaGraphNode_t*, pRootNodes, size_t*, pNumRootNodes)
 DEF_FN(cudaError_t, cudaGraphHostNodeGetParams, cudaGraphNode_t, node, struct cudaHostNodeParams*, pNodeParams)
 DEF_FN(cudaError_t, cudaGraphHostNodeSetParams, cudaGraphNode_t, node, const struct cudaHostNodeParams*, pNodeParams)
+#if CUDART_VERSION >= 12000
 DEF_FN(cudaError_t, cudaGraphInstantiate, cudaGraphExec_t*, pGraphExec, cudaGraph_t, graph, unsigned long long, flags)
+#else
+DEF_FN(cudaError_t, cudaGraphInstantiate, cudaGraphExec_t*, pGraphExec, cudaGraph_t, graph, cudaGraphNode_t*, pErrorNode, char*, pLogBuffer, size_t, bufferSize)
+#endif
 DEF_FN(cudaError_t, cudaGraphKernelNodeGetParams, cudaGraphNode_t, node, struct cudaKernelNodeParams*, pNodeParams)
 DEF_FN(cudaError_t, cudaGraphKernelNodeSetParams, cudaGraphNode_t, node, const struct cudaKernelNodeParams*, pNodeParams)
 DEF_FN(cudaError_t, cudaGraphLaunch, cudaGraphExec_t, graphExec, cudaStream_t, stream)
diff --git a/cpu/cpu-elf2.c b/cpu/cpu-elf2.c
index 1ada98f9..799b6103 100644
--- a/cpu/cpu-elf2.c
+++ b/cpu/cpu-elf2.c
@@ -457,8 +457,8 @@ int elf2_get_fatbin_info(const struct fat_header *fatbin, list *kernel_infos, ui
         LOGE(LOG_ERROR, "Something went wrong while checking the header.");
         goto error;
     }
-    LOGE(LOG_DBG(1), "elf header: magic: %#x, version: %#x, header_size: %#x, size: %#zx",
-           eh->magic, eh->version, eh->header_size, eh->size); 
+    // LOGE(LOG_DBG(1), "elf header: magic: %#x, version: %#x, header_size: %#x, size: %#zx",
+    //        eh->magic, eh->version, eh->header_size, eh->size); 
 
     input_pos += eh->header_size;
     fatbin_total_size = eh->header_size + eh->size;
@@ -467,7 +467,7 @@ int elf2_get_fatbin_info(const struct fat_header *fatbin, list *kernel_infos, ui
             fprintf(stderr, "Something went wrong while checking the header.\n");
             goto error;
         }
-        print_header(th);
+        //print_header(th);
         input_pos += th->header_size;
         if (th->kind != 2) { // section does not cotain device code (but e.g. PTX)
             input_pos += th->size;
diff --git a/cpu/cpu-server-runtime.c b/cpu/cpu-server-runtime.c
index b1ba19be..e53e1182 100644
--- a/cpu/cpu-server-runtime.c
+++ b/cpu/cpu-server-runtime.c
@@ -3,6 +3,7 @@
 #include <cuda.h>
 #include <driver_types.h>
 #include <dlfcn.h>
+#include <cuda_profiler_api.h>
 
 //for strerror
 #include <string.h>
@@ -1146,7 +1147,7 @@ bool_t cuda_host_alloc_1_svc(size_t size, unsigned int flags, sz_result *result,
     } else if (socktype == TCP) { //Use infiniband
 #ifdef WITH_IB
         LOGE(LOG_ERROR, "infiniband does not yet support cudaHostAlloc.");
-        goto cleanup;
+        goto out;
 #else
         LOGE(LOG_ERROR, "infiniband is disabled.");
         goto out;
@@ -1884,7 +1885,6 @@ bool_t cuda_register_fat_binary_end_1_svc(ptr cubinHandle, int *result, struct s
     *result = 0;
     return 1;
 }*/
-#include <cuda_profiler_api.h>
 
 bool_t cuda_profiler_start_1_svc(int *result, struct svc_req *rqstp)
 {
diff --git a/submodules/Makefile b/submodules/Makefile
index 54fb160a..e30870da 100644
--- a/submodules/Makefile
+++ b/submodules/Makefile
@@ -36,12 +36,17 @@ else
 endif
 
 cuda-gdb/build:
+ifeq (,$(wildcard ./cuda-gdb/build))
 	@echo -e "\033[36m----> Configuring cuda-gdb\033[0m"
+	@echo -e "\033[36m----> extracting cuda-gdb\033[0m"
 	mkdir -p cuda-gdb/build && cd cuda-gdb/build && \
 		../configure --disable-werror --program-prefix=cuda- --enable-cuda --with-python=no --enable-targets="x86_64-apple-darwin,x86_64-unknown-linux-gnu,arm-elf-linux-gnu,m68k-unknown-linux-gnu" CFLAGS='-I/usr/local/cuda/include -fPIC' LDFLAGS='-lpthread'
 	@echo -e "\033[36m----> Building cuda-gdb\033[0m"
 	CPATH=/usr/local/cuda/include $(MAKE) -C cuda-gdb/build
 	CPATH=/usr/local/cuda/include $(MAKE) -C cuda-gdb/build/gdb libgdb.a
+else
+	@echo -e "\033[36m----> cuda-gdb/build directory present. Skipping building of cuda-gdb\033[0m"
+endif
 
 lib:
 	mkdir -p lib
diff --git a/tests/Makefile b/tests/Makefile
index 8adc5da7..3048d6d5 100644
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -1,7 +1,11 @@
 #MIT License...
 .PHONY: all clean test_apps cpu gpu samples
 
+ifdef NOSAMPLES
+all: test_apps cpu gpu bin
+else
 all: test_apps cpu gpu samples bin
+endif
 
 test_apps:
 	@echo -e "\033[36m----> Building tests/test_apps\033[0m"
@@ -20,13 +24,13 @@ samples:
 	@echo -e "\033[36m----> Building tests/samples\033[0m"
 	$(MAKE) -C samples
 
-bin: cpu samples test_apps
+bin: cpu test_apps
 	mkdir -p bin
 	cp cpu/unit/*.test bin
 	cp test_apps/*.testapp bin
-	cp samples/matrixMul/matrixMul bin
-	cp samples/bandwidthTest/bandwidthTest bin
-	cp samples/nbody/nbody bin
+ifndef NOSAMPLES
+	cp samples/samples-bin/*.sample bin
+endif
 
 clean:
 	@echo -e "\033[31m----> Cleaning up tests/test_apps\033[0m"
diff --git a/tests/cpu/unit/Makefile b/tests/cpu/unit/Makefile
index 6e22bb27..e7de359c 100644
--- a/tests/cpu/unit/Makefile
+++ b/tests/cpu/unit/Makefile
@@ -16,7 +16,7 @@ INC_FLAGS += -I../../../cpu/
 
 LIB_FLAGS += -L$(LIBTIRPC_PREFIX)/lib
 LIB_FLAGS += -L$(CUDA_SRC)/lib64
-LIB_FLAGS += -L../../../cpu/
+LIB_FLAGS += -L../../../bin/
 CC_FLAGS += -std=gnu99 $(INC_FLAGS) -g -ggdb -fsanitize=address -fsanitize=pointer-compare -fsanitize=pointer-subtract -fsanitize-address-use-after-scope
 LD_FLAGS = $(LIB_FLAGS)
 
@@ -32,11 +32,14 @@ CLIENT_LD_FLAGS = $(LD_FLAGS) -l:cricket-client.so
 
 all : $(BIN_CLIENT_TESTS) $(BIN_SERVER_TESTS)
 
-$(BIN_SERVER_TESTS) : %.test:%.o
+../../../bin/cricket-server.so:
+	$(MAKE) -C ../../../ bin/cricket-server.so
+
+$(BIN_SERVER_TESTS) : %.test:%.o ../../../bin/cricket-server.so
 	$(LD) $(CC_FLAGS) -o $@ $< $(SERVER_LD_FLAGS)
 
 $(OBJ_SERVER_TESTS) : %.o:%.c
-	$(CC) $(CC_FLAGS) -c -o $@ $< $(SERVER_LD_FLAGS)
+	$(CC) $(CC_FLAGS) -c -o $@ $<
 
 clean:
 	 rm -f $(OBJ_SERVER_TESTS) $(OBJ_CLIENT_TESTS) $(BIN_SERVER_TESTS) $(BIN_CLIENT_TESTS)
diff --git a/tests/samples/Makefile b/tests/samples/Makefile
index 1bb16a04..ab1ae4aa 100644
--- a/tests/samples/Makefile
+++ b/tests/samples/Makefile
@@ -9,7 +9,7 @@ SAMPLES = samples-bin/matrixMul.compressed.sample \
 
 CUDA_PATH = /usr/local/cuda
 SMS = 75 60
-CUDA_SAMPLES_RELEASE = 12.1
+CUDA_SAMPLES_RELEASE ?= 12.1
 CUDA_SAMPLES_URL = https://github.com/NVIDIA/cuda-samples/archive/refs/tags/v${CUDA_SAMPLES_RELEASE}.tar.gz
 
 PWD = $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
diff --git a/utils/Dockerfile b/utils/Dockerfile
index 30fddb78..65d134bc 100644
--- a/utils/Dockerfile
+++ b/utils/Dockerfile
@@ -1,4 +1,4 @@
-FROM centos:8
+FROM rockylinux:8
 
 LABEL \
 	org.label-schema.schema-version = "1.0" \
@@ -9,25 +9,24 @@ LABEL \
 	org.label-schema.author.email = "niklas.eiling@eonerc.rwth-aachen.de" \
 	org.label-schema.vcs-url = "https://git.rwth-aachen.de/niklas.eiling/cricket"
 
-RUN cd /etc/yum.repos.d/ && sed -i 's/mirrorlist/#mirrorlist/g' /etc/yum.repos.d/CentOS-* && sed -i 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-* && yum update -y
-
 RUN dnf -y update && \
-    dnf install -y epel-release dnf-plugins-core && \
-    dnf install -y https://rpms.remirepo.net/enterprise/remi-release-8.rpm && \
-    dnf config-manager --set-enabled powertools && \
-    dnf config-manager --set-enabled remi
+	dnf install -y epel-release dnf-plugins-core && \
+	dnf install -y https://rpms.remirepo.net/enterprise/remi-release-8.rpm && \
+	dnf config-manager --set-enabled powertools && \
+	dnf config-manager --set-enabled remi
 
 RUN dnf install -y make bash git gcc autoconf libtool automake rpcgen \
                    ncurses-devel zlib-devel binutils-devel mesa-libGL-devel \
                    libvdpau-devel mesa-libEGL-devel openssl-devel rpcbind \
                    texinfo bison flex python3 which libibverbs libibverbs-devel \
-                   libasan cppcheck wget expat-devel xz-devel mesa-libGLU-devel freeglut-devel
+                   libasan cppcheck wget expat-devel xz-devel mesa-libGLU-devel freeglut-devel \
+                   elfutils-libelf-devel cpio openssl-devel openssl-libs
 
 ENV LD_LIBRARY_PATH="/usr/local/lib:/usr/local/lib64:${LD_LIBRARY_PATH}"
 
 RUN dnf -y config-manager --add-repo http://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo && \
-    dnf --refresh -y install cuda-compiler-11-1 cuda-libraries-devel-11-1 cuda-samples-11-1 cuda-driver-devel-11-1 && \
-    ln -s cuda-11.1 /usr/local/cuda && \
+    dnf --refresh -y install cuda-compiler-12-1 cuda-libraries-devel-12-1 cuda-driver-devel-12-1 cuda-profiler-api-12-1 && \
+    ln -s cuda-12.1 /usr/local/cuda && \
     ln -s libcuda.so /usr/local/cuda/targets/x86_64-linux/lib/stubs/libcuda.so.1
 
 ENV PATH="/usr/local/cuda/bin:${PATH}"
diff --git a/utils/Dockerfile.cuda10 b/utils/Dockerfile.cuda10
index 2dcec62b..4bd94bb1 100644
--- a/utils/Dockerfile.cuda10
+++ b/utils/Dockerfile.cuda10
@@ -8,6 +8,8 @@ LABEL \
 	org.label-schema.author.name = "Niklas Eiling" \
 	org.label-schema.author.email = "niklas.eiling@eonerc.rwth-aachen.de" \
 	org.label-schema.vcs-url = "https://git.rwth-aachen.de/niklas.eiling/cricket"
+    
+RUN cd /etc/yum.repos.d/ && sed -i 's/mirrorlist/#mirrorlist/g' /etc/yum.repos.d/CentOS-* && sed -i 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-* && yum update -y
 
 RUN dnf -y update && \
     dnf install -y epel-release dnf-plugins-core && \
@@ -19,12 +21,13 @@ RUN dnf install -y make bash git gcc autoconf libtool automake rpcgen \
                    ncurses-devel zlib-devel binutils-devel mesa-libGL-devel \
                    libvdpau-devel mesa-libEGL-devel openssl-devel rpcbind \
                    texinfo bison flex python3 which libibverbs libasan \
-                   cppcheck wget expat-devel xz-devel
+                   cppcheck wget expat-devel xz-devel elfutils-libelf-devel
 
 ENV LD_LIBRARY_PATH="/usr/local/lib:/usr/local/lib64:${LD_LIBRARY_PATH}"
 
-RUN dnf -y install https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-repo-rhel8-10.2.89-1.x86_64.rpm && \
-    dnf --refresh -y install cuda-compiler-10-2 cuda-libraries-dev-10-2 cuda-samples-10-2 cuda-driver-dev-10-2 && \
+RUN dnf --refresh -y install https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-repo-rhel8-10.2.89-1.x86_64.rpm && \
+    rpm --import https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/D42D0685.pub && \
+    dnf --refresh -y install cuda-compiler-10-2 cuda-libraries-dev-10-2 cuda-samples-10-2 cuda-driver-dev-10-2 cuda-misc-headers-10-2 && \
     ln -s cuda-10.2 /usr/local/cuda && \
     ln -s libcuda.so /usr/local/cuda/targets/x86_64-linux/lib/stubs/libcuda.so.1
     
diff --git a/utils/Dockerfile.cuda11 b/utils/Dockerfile.cuda11
new file mode 100644
index 00000000..b84a2782
--- /dev/null
+++ b/utils/Dockerfile.cuda11
@@ -0,0 +1,43 @@
+FROM centos:8
+
+LABEL \
+	org.label-schema.schema-version = "1.0" \
+	org.label-schema.name = "cricket" \
+	org.label-schema.license = "MIT" \
+	org.label-schema.vendor = "Institute for Automation of Complex Power Systems, RWTH Aachen University" \
+	org.label-schema.author.name = "Niklas Eiling" \
+	org.label-schema.author.email = "niklas.eiling@eonerc.rwth-aachen.de" \
+	org.label-schema.vcs-url = "https://git.rwth-aachen.de/niklas.eiling/cricket"
+
+RUN cd /etc/yum.repos.d/ && sed -i 's/mirrorlist/#mirrorlist/g' /etc/yum.repos.d/CentOS-* && sed -i 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-* && yum update -y
+
+RUN dnf -y update && \
+    dnf install -y epel-release dnf-plugins-core && \
+    dnf install -y https://rpms.remirepo.net/enterprise/remi-release-8.rpm && \
+    dnf config-manager --set-enabled powertools && \
+    dnf config-manager --set-enabled remi
+
+RUN dnf install -y make bash git gcc autoconf libtool automake rpcgen \
+                   ncurses-devel zlib-devel binutils-devel mesa-libGL-devel \
+                   libvdpau-devel mesa-libEGL-devel openssl-devel rpcbind \
+                   texinfo bison flex python3 which libibverbs libibverbs-devel \
+                   libasan cppcheck wget expat-devel xz-devel mesa-libGLU-devel freeglut-devel \
+                   elfutils-libelf-devel cpio openssl-devel openssl-libs
+
+ENV LD_LIBRARY_PATH="/usr/local/lib:/usr/local/lib64:${LD_LIBRARY_PATH}"
+
+RUN dnf -y config-manager --add-repo http://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo && \
+    dnf --refresh -y install cuda-compiler-11-1 cuda-libraries-devel-11-1 cuda-samples-11-1 cuda-driver-devel-11-1 cuda-nvprof-11-1 && \
+    ln -s cuda-11.1 /usr/local/cuda && \
+    ln -s libcuda.so /usr/local/cuda/targets/x86_64-linux/lib/stubs/libcuda.so.1
+
+ENV PATH="/usr/local/cuda/bin:${PATH}"
+ENV LIBRARY_PATH="/usr/local/cuda/targets/x86_64-linux/lib/stubs:${LIBRARY_PATH}"
+ENV LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/targets/x86_64-linux/lib/stubs:${LD_LIBRARY_PATH}"
+
+#COPY --chown=root .ssh /root/.ssh
+
+WORKDIR /cricket
+
+ENTRYPOINT /bin/bash
+

From dcd900950a494dbf9efa53a4b0b8df30ee8beebd Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Tue, 16 May 2023 17:17:31 +0200
Subject: [PATCH 38/83] if a binary does not contain any kernel cricket should
 not show any errors.

I replaced them with warnings for now. We should probably do better sanity checking before assuming that a missing .nv.info section only occurs in binaries without kernels.

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 cpu/cpu-client.c | 2 +-
 cpu/cpu-elf2.c   | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/cpu/cpu-client.c b/cpu/cpu-client.c
index 09b35e59..1403d1d3 100644
--- a/cpu/cpu-client.c
+++ b/cpu/cpu-client.c
@@ -376,7 +376,7 @@ void __cudaUnregisterFatBinary(void **fatCubinHandle)
          fatCubinHandle);
 
     if (fatCubinHandle == NULL) {
-        LOGE(LOG_ERROR, "fatCubinHandle is NULL");
+        LOGE(LOG_WARNING, "fatCubinHandle is NULL - so we have nothing to unload. (This is okay if this binary does not contain a kernel.)");
         return;
     }
 
diff --git a/cpu/cpu-elf2.c b/cpu/cpu-elf2.c
index 799b6103..d460d953 100644
--- a/cpu/cpu-elf2.c
+++ b/cpu/cpu-elf2.c
@@ -914,7 +914,8 @@ int elf2_parameter_info(list *kernel_infos, void* memory, size_t memsize)
     }
 
     if (get_section_by_name(elf, ".nv.info", &section) != 0) {
-        LOGE(LOG_ERROR, "could not find .nv.info section");
+        LOGE(LOG_WARNING, "could not find .nv.info section. This means this binary does not contain any kernels.");
+        ret = 0;    // This is not an error.
         goto cleanup;
     }
 

From d944cd95a0e472ae86d8d76b54feef64688ac51b Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Tue, 16 May 2023 17:25:31 +0200
Subject: [PATCH 39/83] cricket supports binaries with debug symbols so we
 should not throw an error when we find binaries with debug symbols

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 cpu/cpu-elf2.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/cpu/cpu-elf2.c b/cpu/cpu-elf2.c
index d460d953..52c8ee7d 100644
--- a/cpu/cpu-elf2.c
+++ b/cpu/cpu-elf2.c
@@ -475,7 +475,6 @@ int elf2_get_fatbin_info(const struct fat_header *fatbin, list *kernel_infos, ui
         }
         if (th->flags & FATBIN_FLAG_DEBUG) {
             LOGE(LOG_DEBUG, "fatbin contains debug information.");
-            goto error;
         }
 
         if (th->flags & FATBIN_FLAG_COMPRESS) {

From a0473acc26e17848946f2c8901fd1bfd07a7d937 Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Thu, 18 May 2023 14:48:06 +0200
Subject: [PATCH 40/83] implement cudaRegisterVar API so that we support
 cudaMemcpyToSymbol

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 cpu/cpu-client.c         | 15 ++++++++--
 cpu/cpu-server-driver.c  | 57 ++++++++++++++++++++++++++++-------
 cpu/cpu-server-runtime.c | 64 +++++++++-------------------------------
 cpu/cpu_rpc_prot.x       |  1 +
 cpu/resource-mg.h        |  1 +
 5 files changed, 75 insertions(+), 63 deletions(-)

diff --git a/cpu/cpu-client.c b/cpu/cpu-client.c
index 1403d1d3..6cdaf81c 100644
--- a/cpu/cpu-client.c
+++ b/cpu/cpu-client.c
@@ -289,13 +289,24 @@ int dlclose(void *handle)
     }
 }
 
-void __cudaRegisterVar(void **fatCubinHandle, char *hostVar, char
-                       *deviceAddress, const char *deviceName, int ext, size_t size, int constant,
+void __cudaRegisterVar(void **fatCubinHandle, char *hostVar, char *deviceAddress,
+                       const char *deviceName, int ext, size_t size, int constant,
+                       int global);
+
+void __cudaRegisterVar(void **fatCubinHandle, char *hostVar, char *deviceAddress,
+                       const char *deviceName, int ext, size_t size, int constant,
                        int global)
 {
+    enum clnt_stat retval_1;
+    int result;
     LOGE(LOG_DEBUG, "__cudaRegisterVar(fatCubinHandle=%p, hostVar=%p, deviceAddress=%p, "
            "deviceName=%s, ext=%d, size=%zu, constant=%d, global=%d)\n",
            fatCubinHandle, hostVar, deviceAddress, deviceName, ext, size, constant, global);
+    retval_1 = rpc_register_var_1((ptr)fatCubinHandle, (ptr)hostVar, (ptr)deviceAddress, (char*)deviceName, ext, size, constant, global,
+                                       &result, clnt);
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "call failed.");
+    }
 }
 
 void __cudaRegisterFunction(void **fatCubinHandle, const char *hostFun,
diff --git a/cpu/cpu-server-driver.c b/cpu/cpu-server-driver.c
index 1cb7d324..5894c126 100644
--- a/cpu/cpu-server-driver.c
+++ b/cpu/cpu-server-driver.c
@@ -24,9 +24,11 @@ int server_driver_init(int restore)
         // because CUfunctions and modules are at different locations on server and client
         ret &= resource_mg_init(&rm_modules, 0);
         ret &= resource_mg_init(&rm_functions, 0);
+        ret &= resource_mg_init(&rm_globals, 0);
     } else {
         ret &= resource_mg_init(&rm_modules, 0);
         ret &= resource_mg_init(&rm_functions, 0);
+        ret &= resource_mg_init(&rm_globals, 0);
         //ret &= server_driver_restore("ckp");
     }
     return ret;
@@ -115,23 +117,56 @@ bool_t rpc_register_function_1_svc(ptr fatCubinHandle, ptr hostFun, char* device
     result->err = cuModuleGetFunction((CUfunction*)&result->ptr_result_u.ptr,
                     module,
                     deviceName);
-    GSCHED_RELEASE;
     if (resource_mg_add_sorted(&rm_functions, (void*)hostFun, (void*)result->ptr_result_u.ptr) != 0) {
         LOGE(LOG_ERROR, "error in resource manager");
     }
+    GSCHED_RELEASE;
     RECORD_RESULT(ptr_result_u, *result);
     return 1;
+}
 
-    // int zero = 0;
-    // void *params[] = {NULL, NULL, NULL, &zero, &zero, &zero, &zero, NULL};
-    // if ((res = cuLaunchKernel(func, 1, 1, 1, 32, 1, 1, 0, CU_STREAM_DEFAULT, params, NULL)) != CUDA_SUCCESS) {
-    //     LOG(LOG_ERROR, "cuLaunchKernel failed: %d", res);
-    //     result->err = res;
-    //     return 1;
-    // }
-
-    // result->err = 0;
-    // return 1;
+// Does not support checkpoint/restart yet
+bool_t rpc_register_var_1_svc(ptr fatCubinHandle, ptr hostVar, ptr deviceAddress, char *deviceName, int ext, size_t size,
+                        int constant, int global, int *result, struct svc_req *rqstp)
+{
+    RECORD_API(rpc_register_var_1_argument);
+    RECORD_ARG(1, fatCubinHandle);
+    RECORD_ARG(2, hostVar);
+    RECORD_ARG(3, deviceAddress);
+    RECORD_ARG(4, deviceName);
+    RECORD_ARG(5, ext);
+    RECORD_ARG(6, size);
+    RECORD_ARG(7, constant);
+    RECORD_ARG(8, global);
+    
+    LOG(LOG_DEBUG, "rpc_register_var(fatCubinHandle: %p, hostVar: %p, deviceAddress: %p, deviceName: %s, "
+                   "ext: %d, size: %d, constant: %d, global: %d)",
+                   fatCubinHandle, hostVar, deviceAddress, deviceName, ext, size, constant, global);
+    
+    CUdeviceptr dptr = 0;
+    size_t d_size = 0;
+    CUresult res;
+    void *module = NULL;
+    GSCHED_RETAIN;
+    if ((module = resource_mg_get(&rm_modules, (void*)fatCubinHandle)) == (void*)fatCubinHandle) {
+        LOGE(LOG_ERROR, "%p not found in resource manager - we cannot call a function from an unknown module.", fatCubinHandle);
+        *result = -1;
+        return 1;
+    }
+    if ((res = cuModuleGetGlobal(&dptr, &d_size, module, deviceName)) != CUDA_SUCCESS) {
+        LOGE(LOG_ERROR, "cuModuleGetGlobal failed: %d", res);
+        *result = 1;
+        return 1;
+    }
+    if (resource_mg_add_sorted(&rm_globals, (void*)hostVar, (void*)dptr) != 0) {
+        LOGE(LOG_ERROR, "error in resource manager");
+        *result = 1;
+    } else {
+        *result = 0;
+    }
+    GSCHED_RELEASE;
+    RECORD_RESULT(integer, *result);
+    return 1;
 }
 
 int server_driver_deinit(void)
diff --git a/cpu/cpu-server-runtime.c b/cpu/cpu-server-runtime.c
index e53e1182..a5a94058 100644
--- a/cpu/cpu-server-runtime.c
+++ b/cpu/cpu-server-runtime.c
@@ -1380,7 +1380,7 @@ bool_t cuda_memcpy_htod_1_svc(uint64_t ptr, mem_data mem, size_t size, int *resu
     RECORD_ARG(2, mem);
     RECORD_ARG(3, size);
 
-    LOGE(LOG_DEBUG, "cudaMemcpyHtoD");
+    LOGE(LOG_DEBUG, "cudaMemcpyHtoD(%p, %p, %zu)", (void*)ptr, mem.mem_data_val, size);
     if (size != mem.mem_data_len) {
         LOGE(LOG_ERROR, "data size mismatch");
         *result = cudaErrorUnknown;
@@ -1669,63 +1669,27 @@ bool_t cuda_memcpy_dtoh_1_svc(uint64_t ptr, size_t size, mem_result *result, str
 /* cudaMemcpyPeer ( void* dst, int  dstDevice, const void* src, int  srcDevice, size_t count ) not implemented yet. see cudaMemcpyDtoD */
 /* cudaMemcpyPeerAsync ( void* dst, int  dstDevice, const void* src, int  srcDevice, size_t count, cudaStream_t stream = 0 ) */
 
-bool_t cuda_memcpy_to_symbol_1_svc(uint64_t ptr, mem_data mem, size_t size, size_t offset, int *result, struct svc_req *rqstp)
+bool_t cuda_memcpy_to_symbol_1_svc(uint64_t symbolptr, mem_data mem, size_t size, size_t offset, int *result, struct svc_req *rqstp)
 {
-    RECORD_API(cuda_memcpy_to_symbol_1_argument);
-    RECORD_ARG(1, ptr);
-    RECORD_ARG(2, mem);
-    RECORD_ARG(3, size);
-    RECORD_ARG(4, offset);
-
-    LOGE(LOG_DEBUG, "cudaMemcpyToSymbol");
-    if (size != mem.mem_data_len) {
-        LOGE(LOG_ERROR, "data size mismatch");
-        *result = cudaErrorUnknown;
-        return 1;
-    }
-#ifdef WITH_MEMCPY_REGISTER
-    if ((*result = cudaHostRegister(mem.mem_data_val, size, cudaHostRegisterMapped)) != cudaSuccess) {
-        LOGE(LOG_ERROR, "cudaHostRegister failed: %d.", *result);
+    LOGE(LOG_DEBUG, "cudaMemcpyToSymbol(%p, %p, %zu, %zu)", symbolptr, mem.mem_data_val, size, offset);
+    void *symbol_addr = resource_mg_get(&rm_globals, (void*)symbolptr);
+    if (symbol_addr == NULL) {
+        LOGE(LOG_ERROR, "cudaMemcpyToSymbol: symbol not found");
+        *result = cudaErrorInvalidSymbol;
         return 1;
     }
-#endif
-    *result = cudaMemcpyToSymbol((void*)ptr, mem.mem_data_val, size, offset, cudaMemcpyHostToDevice);
-#ifdef WITH_MEMCPY_REGISTER
-    cudaHostUnregister(mem.mem_data_val);
-#endif
-    RECORD_RESULT(integer, *result);
-    return 1;
+    return cuda_memcpy_htod_1_svc((ptr)(symbol_addr+offset), mem, size, result, rqstp);
 }
 
 bool_t cuda_memcpy_to_symbol_shm_1_svc(int index, ptr device_ptr, size_t size, size_t offset, int kind, int *result, struct svc_req *rqstp)
 {
-    RECORD_API(cuda_memcpy_to_symbol_shm_1_argument);
-    RECORD_ARG(1, index);
-    RECORD_ARG(2, device_ptr);
-    RECORD_ARG(3, size);
-    RECORD_ARG(4, offset);
-    RECORD_ARG(5, kind);
-    LOGE(LOG_DEBUG, "cudaMemcpyToSymbolShm");
-    *result = cudaErrorInitializationError;
-    if (hainfo[index].idx == 0 ||
-        hainfo[index].idx != index) {
-
-        LOGE(LOG_ERROR, "inconsistent state");
-        goto out;
-    }
-    if (hainfo[index].size < size) {
-        LOGE(LOG_ERROR, "requested size is smaller than shared memory segment");
-        goto out;
-    }
-
-    if (kind == cudaMemcpyHostToDevice) {
-        *result = cudaMemcpyToSymbol((void*)device_ptr, hainfo[index].server_ptr, size, offset, kind);
-    } else {
-        LOGE(LOG_ERROR, "a kind different from HostToDevice is unsupported for cudaMemcpyToSymbol");
+    void *symbol_addr = resource_mg_get(&rm_globals, (void*)device_ptr);
+    if (symbol_addr == NULL) {
+        LOGE(LOG_ERROR, "cudaMemcpyToSymbol: symbol not found");
+        *result = cudaErrorInvalidSymbol;
+        return 1;
     }
-out:
-    RECORD_RESULT(integer, *result);
-    return 1;
+    return cuda_memcpy_shm_1_svc(index, (ptr)(symbol_addr+offset), size, kind, result, rqstp);
 }
 
 /* cudaMemcpyToSymbolAsync ( const void* symbol, const void* src, size_t count, size_t offset, cudaMemcpyKind kind, cudaStream_t stream = 0 ) not implemented yet */
diff --git a/cpu/cpu_rpc_prot.x b/cpu/cpu_rpc_prot.x
index 2a147a7c..fd84c248 100644
--- a/cpu/cpu_rpc_prot.x
+++ b/cpu/cpu_rpc_prot.x
@@ -131,6 +131,7 @@ program RPC_CD_PROG {
         ptr_result   rpc_register_function(ptr, ptr, string, string, int)        = 50;
         int          rpc_elf_load(mem_data, ptr)                                 = 51;
         int          rpc_elf_unload(ptr)                                         = 52;
+        int          rpc_register_var(ptr, ptr, ptr, string, int, size_t, int, int) = 53;
 
         /* RUNTIME API */
         /* ### Device Management ### */
diff --git a/cpu/resource-mg.h b/cpu/resource-mg.h
index 6f85155c..5b542a83 100644
--- a/cpu/resource-mg.h
+++ b/cpu/resource-mg.h
@@ -33,6 +33,7 @@ resource_mg rm_kernels;
 //Driver API RMs
 resource_mg rm_modules;
 resource_mg rm_functions;
+resource_mg rm_globals;
 
 //Other RMs
 resource_mg rm_cusolver;

From 0df2fd3205d7ecdca1f52276ae11bff2156989b1 Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Thu, 1 Jun 2023 09:10:06 +0200
Subject: [PATCH 41/83] add some driver apis, fix shadowing CUDA functions not
 working when their name is redefined using macros

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 .gitignore               |   1 +
 .gitlab-ci.yml           |   4 +-
 cpu/cpu-client-driver.c  | 159 ++++++++++++++++++++++++++++++++++++---
 cpu/cpu-client-runtime.c |  35 ++++++++-
 cpu/cpu-client.c         |  10 ++-
 cpu/cpu-elf2.c           |  32 ++++----
 cpu/cpu-server-driver.c  |  47 +++++++++++-
 cpu/cpu-server-runtime.c |  10 ++-
 cpu/cpu_rpc_prot.x       |   5 +-
 tests/samples/.gitignore |   2 +
 10 files changed, 263 insertions(+), 42 deletions(-)
 create mode 100644 tests/samples/.gitignore

diff --git a/.gitignore b/.gitignore
index f855bf18..11652fae 100644
--- a/.gitignore
+++ b/.gitignore
@@ -43,3 +43,4 @@ tags
 
 # perf data
 perf.data
+main
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index a8506d82..b3908604 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -215,9 +215,9 @@ build:debug:
       echo $KNOWN_HOSTS > ~/.ssh/known_hosts && chmod 600 ~/.ssh/id_rsa
     - ssh $GPU_TARGET mkdir -p $RDIR
     - scp -r $LDIR/* $GPU_TARGET:$RDIR/
-    - ssh $GPU_TARGET "LD_PRELOAD=$RDIR/libtirpc.so.3 $RDIR/cricket-rpc-server" &
+    - ssh $GPU_TARGET "LD_PRELOAD=$RDIR/libtirpc.so.3 $RDIR/cricket-rpc-server 255" &
     - sleep 2
-    - REMOTE_GPU_ADDRESS="ghost.acs-lab.eonerc.rwth-aachen.de" PATH=$LDIR:$PATH LD_PRELOAD=$LDIR/libtirpc.so.3:$LDIR/cricket-client.so $LDIR/$TEST_BINARY $PARAMETER
+    - CRICKET_RPCID=255 REMOTE_GPU_ADDRESS="ghost.acs-lab.eonerc.rwth-aachen.de" PATH=$LDIR:$PATH LD_PRELOAD=$LDIR/libtirpc.so.3:$LDIR/cricket-client.so $LDIR/$TEST_BINARY $PARAMETER
   after_script:
     - ssh $GPU_TARGET rm -rf $RDIR
     - ssh $GPU_TARGET pkill -fe -2 $RDIR/test_kernel
diff --git a/cpu/cpu-client-driver.c b/cpu/cpu-client-driver.c
index f180394f..149b2ce7 100644
--- a/cpu/cpu-client-driver.c
+++ b/cpu/cpu-client-driver.c
@@ -21,11 +21,12 @@
 //DEF_FN(CUresult, cuProfilerStart)
 //DEF_FN(CUresult, cuProfilerStop)
 DEF_FN(CUresult, cuVDPAUGetDevice, CUdevice*, pDevice, VdpDevice, vdpDevice, VdpGetProcAddress*, vdpGetProcAddress)
+#undef cuVDPAUCtxCreate
 DEF_FN(CUresult, cuVDPAUCtxCreate, CUcontext*, pCtx, unsigned int, flags, CUdevice, device, VdpDevice, vdpDevice, VdpGetProcAddress*, vdpGetProcAddress)
 DEF_FN(CUresult, cuGraphicsVDPAURegisterVideoSurface, CUgraphicsResource*, pCudaResource, VdpVideoSurface, vdpSurface, unsigned int, flags)
 DEF_FN(CUresult, cuGraphicsVDPAURegisterOutputSurface, CUgraphicsResource*, pCudaResource, VdpOutputSurface, vdpSurface, unsigned int, flags)
 
-//DEF_FN(CUresult, cuDeviceTotalMem, size_t*, bytes, CUdevice, dev)
+#undef cuDeviceTotalMem
 CUresult cuDeviceTotalMem(size_t* bytes, CUdevice dev)
 {
 	enum clnt_stat retval;
@@ -41,7 +42,7 @@ CUresult cuDeviceTotalMem(size_t* bytes, CUdevice dev)
     return result.err;
 }
 
-//DEF_FN(CUresult, cuCtxCreate, CUcontext*, pctx, unsigned int, flags, CUdevice, dev)
+#undef cuCtxCreate
 CUresult cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev)
 {
     DEF_FN_PTR(CUresult, CUcontext*, unsigned int, CUdevice);
@@ -51,10 +52,12 @@ CUresult cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev)
     return ret;
 }
 DEF_FN(CUresult, cuCtxSynchronize)
+#undef cuModuleGetGlobal
 DEF_FN(CUresult, cuModuleGetGlobal, CUdeviceptr*, dptr, size_t*, bytes, CUmodule, hmod, const char*, name)
+#undef cuMemGetInfo
 DEF_FN(CUresult, cuMemGetInfo, size_t*, free, size_t*, total)
 
-//DEF_FN(CUresult, cuMemAlloc, CUdeviceptr*, dptr, size_t, bytesize)
+#undef cuMemAlloc
 CUresult cuMemAlloc(CUdeviceptr* dptr, size_t bytesize)
 {
 	enum clnt_stat retval;
@@ -71,30 +74,40 @@ CUresult cuMemAlloc(CUdeviceptr* dptr, size_t bytesize)
     return result.err;
 }
 
+#undef cuMemAllocPitch
 DEF_FN(CUresult, cuMemAllocPitch, CUdeviceptr*, dptr, size_t*, pPitch, size_t, WidthInBytes, size_t, Height, unsigned int, ElementSizeBytes)
+#undef cuMemFree
 DEF_FN(CUresult, cuMemFree, CUdeviceptr, dptr)
+#undef cuMemGetAddressRange
 DEF_FN(CUresult, cuMemGetAddressRange, CUdeviceptr*, pbase, size_t*, psize, CUdeviceptr, dptr)
+#undef cuMemHostGetDevicePointer
 DEF_FN(CUresult, cuMemHostGetDevicePointer, CUdeviceptr*, pdptr, void*, p, unsigned int, Flags)
+#undef cuMemHostRegister
 DEF_FN(CUresult, cuMemHostRegister, void*, p, size_t, bytesize, unsigned int, Flags)
+#undef cuMemsetD8
 DEF_FN(CUresult, cuMemsetD8, CUdeviceptr, dstDevice, unsigned char, uc, size_t, N);
 DEF_FN(CUresult, cuMemsetD8_v2_ptds, CUdeviceptr, dstDevice, unsigned char, uc, size_t, N);
+#undef cuMemsetD2D8
 DEF_FN(CUresult, cuMemsetD2D8, CUdeviceptr, dstDevice, size_t, dstPitch, unsigned char, uc, size_t, Width, size_t, Height)
 DEF_FN(CUresult, cuMemsetD2D8_v2_ptds, CUdeviceptr, dstDevice, size_t, dstPitch, unsigned char, uc, size_t, Width, size_t, Height)
+#undef cuEventDestroy
 DEF_FN(CUresult, cuEventDestroy, CUevent, hEvent)
+#undef cuStreamDestroy
 DEF_FN(CUresult, cuStreamDestroy, CUstream, hStream)
+#undef cuGLCtxCreate
 DEF_FN(CUresult, cuGLCtxCreate, CUcontext*, pCtx, unsigned int, Flags, CUdevice, device)
+#undef cuArrayCreate
 DEF_FN(CUresult, cuArrayCreate, CUarray*, pHandle, const CUDA_ARRAY_DESCRIPTOR*, pAllocateArray)
+#undef cuArrayGetDescriptor
 DEF_FN(CUresult, cuArrayGetDescriptor, CUDA_ARRAY_DESCRIPTOR*, pArrayDescriptor, CUarray, hArray)
+#undef cuArray3DCreate
 DEF_FN(CUresult, cuArray3DCreate, CUarray*, pHandle, const CUDA_ARRAY3D_DESCRIPTOR*, pAllocateArray)
+#undef cuArray3DGetDescriptor
 DEF_FN(CUresult, cuArray3DGetDescriptor, CUDA_ARRAY3D_DESCRIPTOR*, pArrayDescriptor, CUarray, hArray)
+#undef cuTexRefSetAddress2D
 DEF_FN(CUresult, cuTexRefSetAddress2D, CUtexref, hTexRef, const CUDA_ARRAY_DESCRIPTOR*, desc, CUdeviceptr, dptr, size_t, Pitch)
+#undef cuTexRefSetAddress
 DEF_FN(CUresult, cuTexRefSetAddress, size_t*, ByteOffset, CUtexref, hTexRef, CUdeviceptr, dptr, size_t, bytes)
-
-
-
-
-
-
 DEF_FN(CUresult, cuGLInit)
 #undef cuGLGetDevices
 #undef cuGLMapBufferObject_v2
@@ -227,7 +240,53 @@ CUresult cuDeviceGetAttribute(int* pi, CUdevice_attribute attrib, CUdevice dev)
     *pi = result.int_result_u.data;
     return result.err;
 }
-DEF_FN(CUresult, cuDeviceGetProperties, CUdevprop*, prop, CUdevice, dev)
+
+CUresult cuDeviceGetProperties(CUdevprop* prop, CUdevice dev)
+{
+    /* Fetch the CUdevprop of dev from the server and copy it into *prop. */
+    CUresult ret;
+    mem_result result;
+    if (prop == NULL) {
+        LOGE(LOG_ERROR, "%s: prop is NULL", __FUNCTION__);
+        return CUDA_ERROR_INVALID_VALUE;
+    }
+    memset(&result, 0, sizeof(result)); /* zeroed so XDR decoding allocates the payload itself */
+    if (rpc_cudevicegetproperties_1(dev, &result, clnt) != RPC_SUCCESS) {
+        fprintf(stderr, "[rpc] %s failed.", __FUNCTION__); /* result is not logged: the call did not complete */
+        return CUDA_ERROR_UNKNOWN;
+    }
+    LOGE(LOG_DEBUG, "%s = %d, result len: %d", __FUNCTION__, result.err, result.mem_result_u.data.mem_data_len);
+    ret = result.err; /* a server-side error is passed through instead of being reported as a size mismatch */
+    if (ret == CUDA_SUCCESS && result.mem_result_u.data.mem_data_len != sizeof(CUdevprop)) {
+        LOGE(LOG_ERROR, "%s: size mismatch", __FUNCTION__);
+        ret = CUDA_ERROR_INVALID_VALUE;
+    } else if (ret == CUDA_SUCCESS) {
+        memcpy(prop, result.mem_result_u.data.mem_data_val, sizeof(CUdevprop));
+    }
+    xdr_free((xdrproc_t)xdr_mem_result, (char*)&result); /* release the payload allocated while decoding */
+    return ret;
+}
+CUresult cuDeviceComputeCapability(int* major, int* minor, CUdevice dev)
+{
+    /* Asks the server for the compute capability of dev: i1 carries major, i2 carries minor. */
+    enum clnt_stat retval;
+    dint_result result = {0}; /* zeroed so i1/i2 hold defined values even if the reply has no payload */
+    if (major == NULL || minor == NULL) {
+        LOGE(LOG_ERROR, "%s: major or minor is NULL", __FUNCTION__);
+        return CUDA_ERROR_INVALID_VALUE;
+    }
+    retval = rpc_cudevicecomputecapability_1(dev, &result, clnt);
+    if (retval != RPC_SUCCESS) { /* checked before logging: result is only meaningful after a successful call */
+        fprintf(stderr, "[rpc] %s failed.", __FUNCTION__);
+        return CUDA_ERROR_UNKNOWN;
+    }
+    LOGE(LOG_DEBUG, "%s = %d, result %d, %d", __FUNCTION__, result.err,
+         result.dint_result_u.data.i1, result.dint_result_u.data.i2);
+    *major = result.dint_result_u.data.i1;
+    *minor = result.dint_result_u.data.i2;
+    return result.err;
+}
+
 DEF_FN(CUresult, cuDeviceGetByPCIBusId, CUdevice*, dev, const char*, pciBusId)
 DEF_FN(CUresult, cuDeviceGetP2PAttribute, int*, value, CUdevice_P2PAttribute, attrib, CUdevice, srcDevice, CUdevice, dstDevice)
 //DEF_FN(CUresult, cuDriverGetVersion, int*, driverVersion)
@@ -261,9 +320,31 @@ CUresult cuDevicePrimaryCtxRetain(CUcontext *pctx, CUdevice dev)
     *pctx = (CUcontext)result.ptr_result_u.ptr;
     return result.err;
 }
+#undef cuDevicePrimaryCtxRelease
 DEF_FN(CUresult, cuDevicePrimaryCtxRelease, CUdevice, dev)
+#undef cuDevicePrimaryCtxSetFlags
 DEF_FN(CUresult, cuDevicePrimaryCtxSetFlags, CUdevice, dev, unsigned int, flags)
-DEF_FN(CUresult, cuDevicePrimaryCtxGetState, CUdevice, dev, unsigned int*, flags, int*, active)
+CUresult cuDevicePrimaryCtxGetState(CUdevice dev, unsigned int* flags, int* active)
+{
+    /* Asks the server for the primary context state of dev: i1 carries flags, i2 carries active. */
+    enum clnt_stat retval;
+    dint_result result = {0}; /* zeroed so i1/i2 hold defined values even if the reply has no payload */
+    if (flags == NULL || active == NULL) {
+        LOGE(LOG_ERROR, "%s flags or active is NULL.", __FUNCTION__);
+        return CUDA_ERROR_INVALID_VALUE;
+    }
+    retval = rpc_cudeviceprimaryctxgetstate_1(dev, &result, clnt);
+    if (retval != RPC_SUCCESS) { /* checked before logging: result is only meaningful after a successful call */
+        LOGE(LOG_ERROR, "%s failed.", __FUNCTION__);
+        return CUDA_ERROR_UNKNOWN;
+    }
+    LOGE(LOG_DEBUG, "%s = %d, result %d %d", __FUNCTION__, result.err,
+         result.dint_result_u.data.i1, result.dint_result_u.data.i2);
+    *flags = result.dint_result_u.data.i1;
+    *active = result.dint_result_u.data.i2;
+    return result.err;
+}
+#undef cuDevicePrimaryCtxReset
 DEF_FN(CUresult, cuDevicePrimaryCtxReset, CUdevice, dev)
 DEF_FN(CUresult, cuCtxGetFlags, unsigned int*, flags)
 //DEF_FN(CUresult, cuCtxSetCurrent, CUcontext, ctx)
@@ -402,6 +483,7 @@ DEF_FN(CUresult, cuPointerGetAttributes, unsigned int, numAttributes, CUpointer_
 DEF_FN(CUresult, cuMemcpy, CUdeviceptr, dst, CUdeviceptr, src, size_t, ByteCount)
 DEF_FN(CUresult, cuMemcpy_ptds, CUdeviceptr, dst, CUdeviceptr, src, size_t, ByteCount)
 //DEF_FN(CUresult, cuMemcpyHtoD, CUdeviceptr, dstDevice, const void*, srcHost, size_t, ByteCount)
+#undef cuMemcpyHtoD
 CUresult cuMemcpyHtoD(CUdeviceptr dstDevice, const void* srcHost, size_t ByteCount)
 {
 	enum clnt_stat retval;
@@ -418,34 +500,51 @@ CUresult cuMemcpyHtoD(CUdeviceptr dstDevice, const void* srcHost, size_t ByteCou
     return result;
 }
 DEF_FN(CUresult, cuMemcpyHtoD_v2_ptds, CUdeviceptr, dstDevice, const void*, srcHost, size_t, ByteCount)
+#undef cuMemcpyDtoH
 DEF_FN(CUresult, cuMemcpyDtoH, void*, dstHost, CUdeviceptr, srcDevice, size_t, ByteCount)
 DEF_FN(CUresult, cuMemcpyDtoH_v2_ptds, void*, dstHost, CUdeviceptr, srcDevice, size_t, ByteCount)
+#undef cuMemcpyDtoD
 DEF_FN(CUresult, cuMemcpyDtoD, CUdeviceptr, dstDevice, CUdeviceptr, srcDevice, size_t, ByteCount)
 DEF_FN(CUresult, cuMemcpyDtoD_v2_ptds, CUdeviceptr, dstDevice, CUdeviceptr, srcDevice, size_t, ByteCount)
+#undef cuMemcpyDtoA
 DEF_FN(CUresult, cuMemcpyDtoA, CUarray, dstArray, size_t, dstOffset, CUdeviceptr, srcDevice, size_t, ByteCount)
+#undef cuMemcpyAtoD
 DEF_FN(CUresult, cuMemcpyAtoD, CUdeviceptr, dstDevice, CUarray, srcArray, size_t, srcOffset, size_t, ByteCount)
+#undef cuMemcpyHtoA
 DEF_FN(CUresult, cuMemcpyHtoA, CUarray, dstArray, size_t, dstOffset, const void*, srcHost, size_t, ByteCount)
+#undef cuMemcpyAtoH
 DEF_FN(CUresult, cuMemcpyAtoH, void*, dstHost, CUarray, srcArray, size_t, srcOffset, size_t, ByteCount)
+#undef cuMemcpyAtoA
 DEF_FN(CUresult, cuMemcpyAtoA, CUarray, dstArray, size_t, dstOffset, CUarray, srcArray, size_t, srcOffset, size_t, ByteCount)
+#undef cuMemcpy2D
 DEF_FN(CUresult, cuMemcpy2D, const CUDA_MEMCPY2D*, pCopy)
+#undef cuMemcpy2DUnaligned
 DEF_FN(CUresult, cuMemcpy2DUnaligned, const CUDA_MEMCPY2D*, pCopy)
 DEF_FN(CUresult, cuMemcpy2DUnaligned_v2_ptds, const CUDA_MEMCPY2D*, pCopy)
+#undef cuMemcpy3D
 DEF_FN(CUresult, cuMemcpy3D, const CUDA_MEMCPY3D*, pCopy)
 DEF_FN(CUresult, cuMemcpy3D_v2_ptds, const CUDA_MEMCPY3D*, pCopy)
 DEF_FN(CUresult, cuMemcpyPeerAsync, CUdeviceptr, dstDevice, CUcontext, dstContext, CUdeviceptr, srcDevice, CUcontext, srcContext, size_t, ByteCount, CUstream, hStream)
 DEF_FN(CUresult, cuMemcpyPeerAsync_ptsz, CUdeviceptr, dstDevice, CUcontext, dstContext, CUdeviceptr, srcDevice, CUcontext, srcContext, size_t, ByteCount, CUstream, hStream)
+#undef cuMemcpyHtoAAsync
 DEF_FN(CUresult, cuMemcpyHtoAAsync, CUarray, dstArray, size_t, dstOffset, const void*, srcHost, size_t, ByteCount, CUstream, hStream)
+#undef cuMemcpyAtoHAsync
 DEF_FN(CUresult, cuMemcpyAtoHAsync, void*, dstHost, CUarray, srcArray, size_t, srcOffset, size_t, ByteCount, CUstream, hStream)
 DEF_FN(CUresult, cuMemcpy3DPeerAsync, const CUDA_MEMCPY3D_PEER*, pCopy, CUstream, hStream)
 DEF_FN(CUresult, cuMemcpy3DPeerAsync_ptsz, const CUDA_MEMCPY3D_PEER*, pCopy, CUstream, hStream)
+#undef cuMemcpyHtoDAsync
 DEF_FN(CUresult, cuMemcpyHtoDAsync, CUdeviceptr, dstDevice, const void*, srcHost, size_t, ByteCount, CUstream, hStream)
 DEF_FN(CUresult, cuMemcpyHtoDAsync_v2_ptsz, CUdeviceptr, dstDevice, const void*, srcHost, size_t, ByteCount, CUstream, hStream)
+#undef cuMemcpyDtoHAsync
 DEF_FN(CUresult, cuMemcpyDtoHAsync, void*, dstHost, CUdeviceptr, srcDevice, size_t, ByteCount, CUstream, hStream)
 DEF_FN(CUresult, cuMemcpyDtoHAsync_v2_ptsz, void*, dstHost, CUdeviceptr, srcDevice, size_t, ByteCount, CUstream, hStream)
+#undef cuMemcpyDtoDAsync
 DEF_FN(CUresult, cuMemcpyDtoDAsync, CUdeviceptr, dstDevice, CUdeviceptr, srcDevice, size_t, ByteCount, CUstream, hStream)
 DEF_FN(CUresult, cuMemcpyDtoDAsync_v2_ptsz, CUdeviceptr, dstDevice, CUdeviceptr, srcDevice, size_t, ByteCount, CUstream, hStream)
+#undef cuMemcpy2DAsync
 DEF_FN(CUresult, cuMemcpy2DAsync, const CUDA_MEMCPY2D*, pCopy, CUstream, hStream)
 DEF_FN(CUresult, cuMemcpy2DAsync_v2_ptsz, const CUDA_MEMCPY2D*, pCopy, CUstream, hStream)
+#undef cuMemcpy3DAsync
 DEF_FN(CUresult, cuMemcpy3DAsync, const CUDA_MEMCPY3D*, pCopy, CUstream, hStream)
 DEF_FN(CUresult, cuMemcpy3DAsync_v2_ptsz, const CUDA_MEMCPY3D*, pCopy, CUstream, hStream)
 DEF_FN(CUresult, cuMemcpyAsync, CUdeviceptr, dst, CUdeviceptr, src, size_t, ByteCount, CUstream, hStream)
@@ -567,14 +666,19 @@ DEF_FN(CUresult, cuEventRecord_ptsz, CUevent, hEvent, CUstream, hStream)
 DEF_FN(CUresult, cuEventQuery, CUevent, hEvent)
 DEF_FN(CUresult, cuEventSynchronize, CUevent, hEvent)
 DEF_FN(CUresult, cuEventElapsedTime, float*, pMilliseconds, CUevent, hStart, CUevent, hEnd)
+#undef cuStreamWaitValue32
 DEF_FN(CUresult, cuStreamWaitValue32, CUstream, stream, CUdeviceptr, addr, cuuint32_t, value, unsigned int, flags)
 DEF_FN(CUresult, cuStreamWaitValue32_ptsz, CUstream, stream, CUdeviceptr, addr, cuuint32_t, value, unsigned int, flags)
+#undef cuStreamWriteValue32
 DEF_FN(CUresult, cuStreamWriteValue32, CUstream, stream, CUdeviceptr, addr, cuuint32_t, value, unsigned int, flags)
 DEF_FN(CUresult, cuStreamWriteValue32_ptsz, CUstream, stream, CUdeviceptr, addr, cuuint32_t, value, unsigned int, flags)
+#undef cuStreamWaitValue64
 DEF_FN(CUresult, cuStreamWaitValue64, CUstream, stream, CUdeviceptr, addr, cuuint64_t, value, unsigned int, flags)
 DEF_FN(CUresult, cuStreamWaitValue64_ptsz, CUstream, stream, CUdeviceptr, addr, cuuint64_t, value, unsigned int, flags)
+#undef cuStreamWriteValue64
 DEF_FN(CUresult, cuStreamWriteValue64, CUstream, stream, CUdeviceptr, addr, cuuint64_t, value, unsigned int, flags)
 DEF_FN(CUresult, cuStreamWriteValue64_ptsz, CUstream, stream, CUdeviceptr, addr, cuuint64_t, value, unsigned int, flags)
+#undef cuStreamBatchMemOp
 DEF_FN(CUresult, cuStreamBatchMemOp, CUstream, stream, unsigned int, count, CUstreamBatchMemOpParams*, paramArray, unsigned int, flags)
 DEF_FN(CUresult, cuStreamBatchMemOp_ptsz, CUstream, stream, unsigned int, count, CUstreamBatchMemOpParams*, paramArray, unsigned int, flags)
 DEF_FN(CUresult, cuStreamCreate, CUstream*, phStream, unsigned int, Flags)
@@ -600,6 +704,7 @@ DEF_FN(CUresult, cuCtxDisablePeerAccess, CUcontext, peerContext)
 DEF_FN(CUresult, cuIpcGetEventHandle, CUipcEventHandle*, pHandle, CUevent, event)
 DEF_FN(CUresult, cuIpcOpenEventHandle, CUevent*, phEvent, CUipcEventHandle, handle)
 DEF_FN(CUresult, cuIpcGetMemHandle, CUipcMemHandle*, pHandle, CUdeviceptr, dptr)
+#undef cuIpcOpenMemHandle
 DEF_FN(CUresult, cuIpcOpenMemHandle, CUdeviceptr*, pdptr, CUipcMemHandle, handle, unsigned int, Flags)
 DEF_FN(CUresult, cuIpcCloseMemHandle, CUdeviceptr, dptr)
 DEF_FN(CUresult, cuGraphicsUnregisterResource, CUgraphicsResource, resource)
@@ -609,7 +714,9 @@ DEF_FN(CUresult, cuGraphicsUnmapResources, unsigned int, count, CUgraphicsResour
 DEF_FN(CUresult, cuGraphicsUnmapResources_ptsz, unsigned int, count, CUgraphicsResource*, resources, CUstream, hStream)
 DEF_FN(CUresult, cuGraphicsSubResourceGetMappedArray, CUarray*, pArray, CUgraphicsResource, resource, unsigned int, arrayIndex, unsigned int, mipLevel)
 DEF_FN(CUresult, cuGraphicsResourceGetMappedMipmappedArray, CUmipmappedArray*, pMipmappedArray, CUgraphicsResource, resource)
+#undef cuGraphicsResourceGetMappedPointer
 DEF_FN(CUresult, cuGraphicsResourceGetMappedPointer, CUdeviceptr*, pDevPtr, size_t*, pSize, CUgraphicsResource, resource)
+#undef cuGraphicsResourceSetMapFlags
 DEF_FN(CUresult, cuGraphicsResourceSetMapFlags, CUgraphicsResource, resource, unsigned int, flags)
 //DEF_FN(CUresult, cuGetExportTable, const void**, ppExportTable, const CUuuid*, pExportTableId)
 
@@ -672,8 +779,11 @@ CUresult cuGetErrorString(CUresult error, const char** pStr)
 }
 DEF_FN(CUresult, cuGetErrorName, CUresult, error, const char**, pStr)
 DEF_FN(CUresult, cuGraphCreate, CUgraph*, phGraph, unsigned int, flags)
+#undef cuGraphAddKernelNode
 DEF_FN(CUresult, cuGraphAddKernelNode, CUgraphNode*, phGraphNode, CUgraph, hGraph, const CUgraphNode*, dependencies, size_t, numDependencies, const CUDA_KERNEL_NODE_PARAMS*, nodeParams)
+#undef cuGraphKernelNodeGetParams
 DEF_FN(CUresult, cuGraphKernelNodeGetParams, CUgraphNode, hNode, CUDA_KERNEL_NODE_PARAMS*, nodeParams)
+#undef cuGraphKernelNodeSetParams
 DEF_FN(CUresult, cuGraphKernelNodeSetParams, CUgraphNode, hNode, const CUDA_KERNEL_NODE_PARAMS*, nodeParams)
 DEF_FN(CUresult, cuGraphAddMemcpyNode, CUgraphNode*, phGraphNode, CUgraph, hGraph, const CUgraphNode*, dependencies, size_t, numDependencies, const CUDA_MEMCPY3D*, copyParams, CUcontext, ctx)
 DEF_FN(CUresult, cuGraphMemcpyNodeGetParams, CUgraphNode, hNode, CUDA_MEMCPY3D*, nodeParams)
@@ -698,6 +808,7 @@ DEF_FN(CUresult, cuGraphNodeGetDependentNodes, CUgraphNode, hNode, CUgraphNode*,
 DEF_FN(CUresult, cuGraphAddDependencies, CUgraph, hGraph, const CUgraphNode*, from, const CUgraphNode*, to, size_t, numDependencies)
 DEF_FN(CUresult, cuGraphRemoveDependencies, CUgraph, hGraph, const CUgraphNode*, from, const CUgraphNode*, to, size_t, numDependencies)
 #if CUDA_VERSION >= 12000
+#undef cuGraphInstantiate
 DEF_FN(CUresult, cuGraphInstantiate, CUgraphExec*, phGraphExec, CUgraph, hGraph, unsigned long long, flags)
 #else
 DEF_FN(CUresult, cuGraphInstantiate, CUgraphExec*, phGraphExec, CUgraph, hGraph, CUgraphNode*, phErrorNode, char*, logBuffer, size_t, bufferSize)
@@ -709,7 +820,6 @@ DEF_FN(CUresult, cuGraphDestroyNode, CUgraphNode, hNode)
 DEF_FN(CUresult, cuGraphDestroy, CUgraph, hGraph)
 DEF_FN(CUresult, cuGraphDestroy_ptsz, CUgraph, hGraph)
 DEF_FN(CUresult, cuStreamBeginCapture_ptsz, CUstream, hStream)
-DEF_FN(CUresult, cuStreamBeginCapture, CUstream, hStream, CUstreamCaptureMode, mode)
 #undef cuStreamBeginCapture
 DEF_FN(CUresult, cuStreamBeginCapture, CUstream, hStream, CUstreamCaptureMode, mode)
 DEF_FN(CUresult, cuStreamBeginCapture_v2_ptsz, CUstream, hStream)
@@ -718,6 +828,31 @@ DEF_FN(CUresult, cuStreamEndCapture_ptsz, CUstream, hStream, CUgraph*, phGraph)
 DEF_FN(CUresult, cuStreamIsCapturing, CUstream, hStream, CUstreamCaptureStatus*, captureStatus)
 DEF_FN(CUresult, cuStreamIsCapturing_ptsz, CUstream, hStream, CUstreamCaptureStatus*, captureStatus)
 DEF_FN(CUresult, cuThreadExchangeStreamCaptureMode, CUstreamCaptureMode*, mode)
+#undef cuStreamGetCaptureInfo
 DEF_FN(CUresult, cuStreamGetCaptureInfo, CUstream, hStream, CUstreamCaptureStatus*, captureStatus_out, cuuint64_t*, id_out, CUgraph*. graph_out, const CUgraphNode**, dependencies_out, size_t*, numDependencies_out)
 DEF_FN(CUresult, cuStreamGetCaptureInfo_ptsz, CUstream, hStream, CUstreamCaptureStatus*, captureStatus, cuuint64_t*, id)
+#undef cuGraphExecKernelNodeSetParams
 DEF_FN(CUresult, cuGraphExecKernelNodeSetParams, CUgraphExec, hGraphExec, CUgraphNode, hNode, const CUDA_KERNEL_NODE_PARAMS*, nodeParams)
+
+#if CUDA_VERSION >= 12000
+#undef cuGetProcAddress
+CUresult cuGetProcAddress(const char* symbol, void** pfn, int cudaVersion, cuuint64_t flags, CUdriverProcAddressQueryResult* symbolStatus)
+{
+    /* Stub: nothing is sent to the server. Every query yields a NULL entry point
+     * and CU_GET_PROC_ADDRESS_VERSION_NOT_SUFFICIENT while the call itself
+     * reports success. */
+    LOGE(LOG_DEBUG, "%s(%s, %d, %llx)", __FUNCTION__, symbol != NULL ? symbol : "(null)",
+         cudaVersion, (unsigned long long)flags); /* cast keeps %llx matched to its argument */
+
+    if (pfn == NULL) {
+        LOGE(LOG_ERROR, "%s: pfn is NULL", __FUNCTION__);
+        return CUDA_ERROR_INVALID_VALUE;
+    }
+    *pfn = NULL;
+    if (symbolStatus != NULL) { /* guard in case a caller omits the status pointer */
+        *symbolStatus = CU_GET_PROC_ADDRESS_VERSION_NOT_SUFFICIENT;
+    }
+
+    return CUDA_SUCCESS; /* the CUresult constant rather than the runtime API's cudaSuccess */
+}
+#endif
\ No newline at end of file
diff --git a/cpu/cpu-client-runtime.c b/cpu/cpu-client-runtime.c
index 9ccf5e9e..4eafc639 100644
--- a/cpu/cpu-client-runtime.c
+++ b/cpu/cpu-client-runtime.c
@@ -329,15 +329,18 @@ cudaError_t cudaGetDeviceFlags(unsigned int* flags)
     return result.err;
 }
 
+#undef cudaGetDeviceProperties
 cudaError_t cudaGetDeviceProperties(struct cudaDeviceProp* prop, int device)
 {
 #ifdef WITH_API_CNT
     api_call_cnt++;
 #endif //WITH_API_CNT
     mem_result result;
-    result.mem_result_u.data.mem_data_len = sizeof(struct cudaDeviceProp);
-    result.mem_result_u.data.mem_data_val = (char*)prop;
     enum clnt_stat retval;
+    if (prop == NULL) {
+        LOGE(LOG_ERROR, "error: prop == NULL\n");
+        return cudaErrorInvalidValue;
+    }
     retval = cuda_get_device_properties_1(device, &result, clnt);
     if (retval != RPC_SUCCESS) {
         clnt_perror (clnt, "call failed");
@@ -346,7 +349,11 @@ cudaError_t cudaGetDeviceProperties(struct cudaDeviceProp* prop, int device)
         return result.err;
     }
     if (result.mem_result_u.data.mem_data_len != sizeof(struct cudaDeviceProp)) {
-        LOGE(LOG_ERROR, "error: expected size != retrieved size\n");
+        LOGE(LOG_ERROR, "error: expected size != retrieved size");
+        return result.err;
+    }
+    if (memcpy(prop, result.mem_result_u.data.mem_data_val, sizeof(struct cudaDeviceProp)) == NULL) {
+        LOGE(LOG_ERROR, "error: memcpy failed");
         return result.err;
     }
     return result.err;
@@ -572,7 +579,25 @@ cudaError_t cudaStreamGetPriority(cudaStream_t hStream, int* priority)
     return result.err;
 }
 
-DEF_FN(cudaError_t, cudaStreamIsCapturing, cudaStream_t, stream, enum cudaStreamCaptureStatus*, pCaptureStatus)
+cudaError_t cudaStreamIsCapturing(cudaStream_t stream, enum cudaStreamCaptureStatus* pCaptureStatus)
+{
+#ifdef WITH_API_CNT
+    api_call_cnt++;
+#endif //WITH_API_CNT
+    /* Asks the server whether stream is capturing; *pCaptureStatus is written only when err is 0. */
+    int_result result;
+    if (pCaptureStatus == NULL) {
+        return cudaErrorInvalidValue;
+    }
+    if (cuda_stream_is_capturing_1((ptr)stream, &result, clnt) != RPC_SUCCESS) {
+        clnt_perror (clnt, "call failed");
+        return cudaErrorUnknown; /* the call did not complete, so result must not be read */
+    }
+    if (result.err == 0) {
+        *pCaptureStatus = (enum cudaStreamCaptureStatus)result.int_result_u.data;
+    }
+    return result.err;
+}
 
 cudaError_t cudaStreamQuery(cudaStream_t stream)
 {
@@ -752,7 +777,9 @@ DEF_FN(cudaError_t, cudaExternalMemoryGetMappedBuffer, void**, devPtr, cudaExter
 DEF_FN(cudaError_t, cudaExternalMemoryGetMappedMipmappedArray, cudaMipmappedArray_t*, mipmap, cudaExternalMemory_t, extMem, const struct cudaExternalMemoryMipmappedArrayDesc*, mipmapDesc)
 DEF_FN(cudaError_t, cudaImportExternalMemory, cudaExternalMemory_t*, extMem_out, const struct cudaExternalMemoryHandleDesc*, memHandleDesc)
 DEF_FN(cudaError_t, cudaImportExternalSemaphore, cudaExternalSemaphore_t*, extSem_out, const struct cudaExternalSemaphoreHandleDesc*, semHandleDesc)
+#undef cudaSignalExternalSemaphoresAsync
 DEF_FN(cudaError_t, cudaSignalExternalSemaphoresAsync, const cudaExternalSemaphore_t*, extSemArray, const struct cudaExternalSemaphoreSignalParams*, paramsArray, unsigned int,  numExtSems, cudaStream_t, stream)
+#undef cudaWaitExternalSemaphoresAsync
 DEF_FN(cudaError_t, cudaWaitExternalSemaphoresAsync, const cudaExternalSemaphore_t*, extSemArray, const struct cudaExternalSemaphoreWaitParams*, paramsArray, unsigned int,  numExtSems, cudaStream_t, stream)
 
 cudaError_t cudaFuncGetAttributes(struct cudaFuncAttributes* attr, const void* func)
diff --git a/cpu/cpu-client.c b/cpu/cpu-client.c
index 6cdaf81c..fb535b44 100644
--- a/cpu/cpu-client.c
+++ b/cpu/cpu-client.c
@@ -245,7 +245,9 @@ void *dlopen(const char *filename, int flag)
         }
     }
 
-    if (filename != NULL && strcmp(filename, "libcuda.so.1") == 0) {
+    if (filename != NULL && 
+        (strcmp(filename, "libcuda.so.1") == 0 ||
+        strcmp(filename, "libcuda.so") == 0)) {
         LOG(LOG_DEBUG, "replacing dlopen call to cuda driver library with "
                        "cricket-client.so");
         dl_handle = dlopen_orig("cricket-client.so", flag);
@@ -336,6 +338,11 @@ void __cudaRegisterFunction(void **fatCubinHandle, const char *hostFun,
                                            &result, clnt);
         if (retval_1 != RPC_SUCCESS) {
             LOGE(LOG_ERROR, "call failed.");
+            exit(1);
+        }
+        if (result.err != 0) {
+            LOGE(LOG_ERROR, "error registering function: %d", result.err);
+            exit(1);
         }
         info->host_fun = (void *)hostFun;
     }
@@ -370,6 +377,7 @@ void **__cudaRegisterFatBinary(void *fatCubin)
         LOGE(LOG_ERROR, "call failed.");
     }
     if (rpc_result != 0) {
+        LOGE(LOG_ERROR, "error registering fatbin: %d", rpc_result);
         return NULL;
     }
     LOG(LOG_DEBUG, "fatbin loaded to %p", result);
diff --git a/cpu/cpu-elf2.c b/cpu/cpu-elf2.c
index 52c8ee7d..21e80fb3 100644
--- a/cpu/cpu-elf2.c
+++ b/cpu/cpu-elf2.c
@@ -327,6 +327,7 @@ static ssize_t decompress_single_section(const uint8_t *input, uint8_t **output,
     size_t padding;
     size_t input_read = 0;
     size_t output_written = 0;
+    size_t decompress_ret = 0;
     const uint8_t zeroes[6] = {0};
 
     if (input == NULL || output == NULL || eh == NULL || th == NULL) {
@@ -340,10 +341,12 @@ static ssize_t decompress_single_section(const uint8_t *input, uint8_t **output,
                 th->decompressed_size, strerror(errno));
         goto error;
     }
+    print_header(th);
 
-    if (decompress(input, th->compressed_size, *output, th->decompressed_size) != th->decompressed_size) {
-        LOGE(LOG_ERROR, "Decompression failed");
-        goto error;
+    if ((decompress_ret = decompress(input, th->compressed_size, *output, th->decompressed_size)) != th->decompressed_size) {
+        LOGE(LOG_ERROR, "Decompression failed: decompressed size is %#zx, but header says %#zx", 
+                decompress_ret, th->decompressed_size);
+        //goto error;
     }
     input_read += th->compressed_size;
     output_written += th->decompressed_size;
@@ -464,7 +467,7 @@ int elf2_get_fatbin_info(const struct fat_header *fatbin, list *kernel_infos, ui
     fatbin_total_size = eh->header_size + eh->size;
     do {
         if (get_text_header(input_pos, *fatbin_size - (input_pos - fatbin_data) - eh->header_size, &th) != 0) {
-            fprintf(stderr, "Something went wrong while checking the header.\n");
+            LOGE(LOG_ERROR, "Something went wrong while checking the header.");
             goto error;
         }
         //print_header(th);
@@ -482,17 +485,17 @@ int elf2_get_fatbin_info(const struct fat_header *fatbin, list *kernel_infos, ui
 
             LOGE(LOG_DEBUG, "fatbin contains compressed device code. Decompressing...");
             if ((input_read = decompress_single_section(input_pos, &text_data, &text_data_size, eh, th)) < 0) {
-                fprintf(stderr, "Something went wrong while decompressing text section.\n");
+                LOGE(LOG_ERROR, "Something went wrong while decompressing text section.");
                 goto error;
             }
             input_pos += input_read;
-            hexdump(text_data, text_data_size);
+            //hexdump(text_data, text_data_size);
         } else {
             text_data = (uint8_t*)input_pos;
             text_data_size = th->size;
             input_pos += th->size;
         }
-        print_header(th);
+        // print_header(th);
         if (elf2_parameter_info(kernel_infos, text_data , text_data_size) != 0) {
             LOGE(LOG_ERROR, "error getting parameter info");
             goto error;
@@ -556,7 +559,7 @@ static int get_section_by_name(Elf *elf, const char *name, Elf_Scn **section)
     }
 
     if (elf_getshdrstrndx(elf, &str_section_index) != 0) {
-        LOGE(LOG_ERROR, "elf_getshstrndx Wfailed");
+        LOGE(LOG_ERROR, "elf_getshstrndx failed");
         return -1;
     }
 
@@ -569,7 +572,6 @@ static int get_section_by_name(Elf *elf, const char *name, Elf_Scn **section)
             LOGE(LOG_ERROR, "elf_strptr failed");
             return -1;
         }
-        //printf("%s, %#0x %#0x\n", section_name, shdr.sh_flags, shdr.sh_type);
         if (strcmp(section_name, name) == 0) {
             *section = scn;
             return 0;
@@ -604,7 +606,6 @@ static int print_sections(Elf *elf)
             LOGE(LOG_ERROR, "elf_strptr failed");
             return -1;
         }
-        printf("%s, %#0lx %#0x\n", section_name, shdr.sh_flags, shdr.sh_type);
     }
     return -1;
 }
@@ -749,7 +750,7 @@ static int get_symtab(Elf *elf, Elf_Data **symbol_table_data, size_t *symbol_tab
     }
 
     if (get_section_by_name(elf, ".symtab", &section) != 0) {
-        LOGE(LOG_ERROR, "could not find .nv.info section");
+        LOGE(LOG_ERROR, "could not find .symtab section");
         return -1;
     }
 
@@ -886,15 +887,16 @@ int elf2_parameter_info(list *kernel_infos, void* memory, size_t memsize)
         return -1;
     }
 
-//#define ELF_DUMP_TO_FILE 1
+    hexdump(memory, 0x10);
+
+#define ELF_DUMP_TO_FILE 1
 
 #ifdef ELF_DUMP_TO_FILE
     FILE* fd2 = fopen("/tmp/cricket-elf-dump", "wb");
-    fwrite(memory-1, memsize, 1, fd2);
+    fwrite(memory, memsize, 1, fd2);
     fclose(fd2);
 #endif
 
-
     if ((elf = elf_memory(memory, memsize)) == NULL) {
         LOGE(LOG_ERROR, "elf_memory failed");
         goto cleanup;
@@ -905,8 +907,6 @@ int elf2_parameter_info(list *kernel_infos, void* memory, size_t memsize)
         goto cleanup;
     }
 
-    //print_symtab(elf);
-
     if (get_symtab(elf, &symbol_table_data, &symnum, &symtab_shdr) != 0) {
         LOGE(LOG_ERROR, "could not get symbol table");
         goto cleanup;
diff --git a/cpu/cpu-server-driver.c b/cpu/cpu-server-driver.c
index 5894c126..1a1ea85e 100644
--- a/cpu/cpu-server-driver.c
+++ b/cpu/cpu-server-driver.c
@@ -39,12 +39,12 @@ int server_driver_init(int restore)
 // Does not support checkpoint/restart yet
 bool_t rpc_elf_load_1_svc(mem_data elf, ptr module_key, int *result, struct svc_req *rqstp)
 {
-    LOG(LOG_DEBUG, "rpc_elf_load(elf: %p, len: %#x, module_key: %#x)", elf.mem_data_val, elf.mem_data_len, module_key);
+    LOGE(LOG_DEBUG, "rpc_elf_load(elf: %p, len: %#x, module_key: %#x)", elf.mem_data_val, elf.mem_data_len, module_key);
     CUresult res;
     CUmodule module;
     
     if ((res = cuModuleLoadData(&module, elf.mem_data_val)) != CUDA_SUCCESS) {
-        LOG(LOG_ERROR, "cuModuleLoadData failed: %d", res);
+        LOGE(LOG_ERROR, "cuModuleLoadData failed: %d", res);
         *result = res;
         return 1;
     }
@@ -52,12 +52,12 @@ bool_t rpc_elf_load_1_svc(mem_data elf, ptr module_key, int *result, struct svc_
     // We add our module using module_key as key. This means a fatbinaryHandle on the client is translated
     // to a CUmodule on the server.
     if ((res = resource_mg_add_sorted(&rm_modules, (void*)module_key, (void*)module)) != CUDA_SUCCESS) {
-        LOG(LOG_ERROR, "resource_mg_create failed: %d", res);
+        LOGE(LOG_ERROR, "resource_mg_create failed: %d", res);
         *result = res;
         return 1;
     }
 
-    LOG(LOG_DEBUG, "->module: %p", module);
+    LOGE(LOG_DEBUG, "->module: %p", module);
     *result = 0;
     return 1;
 }
@@ -341,6 +341,45 @@ bool_t rpc_cugeterrorstring_1_svc(int err, str_result *result,
     return 1;
 }
 
+bool_t rpc_cudeviceprimaryctxgetstate_1_svc(int dev, dint_result *result, struct svc_req *rqstp)
+{
+    /* Serves rpc_cuDevicePrimaryCtxGetState: i1 returns the flags, i2 the active state. */
+    LOGE(LOG_DEBUG, "%s(%d)", __FUNCTION__, dev);
+    GSCHED_RETAIN;
+    result->err = cuDevicePrimaryCtxGetState(dev, (unsigned int*)&(result->dint_result_u.data.i1),
+                                            &(result->dint_result_u.data.i2));
+    LOGE(LOG_DEBUG, "flags: %d, state: %d", result->dint_result_u.data.i1,
+                                           result->dint_result_u.data.i2);
+    GSCHED_RELEASE;
+    return 1;
+}
+
+bool_t rpc_cudevicegetproperties_1_svc(int dev, mem_result *result, struct svc_req *rqstp)
+{   /* Serves rpc_cuDeviceGetProperties: returns the CUdevprop of dev as a byte buffer. */
+    LOGE(LOG_DEBUG, "%s(%d)", __FUNCTION__, dev);
+    GSCHED_RETAIN;
+    if ((result->mem_result_u.data.mem_data_val = malloc(sizeof(CUdevprop))) == NULL) {
+        result->err = CUDA_ERROR_OUT_OF_MEMORY; /* reported as-is; the driver is not called with a NULL buffer */
+    } else {
+        result->mem_result_u.data.mem_data_len = sizeof(CUdevprop);
+        result->err = cuDeviceGetProperties((CUdevprop*)result->mem_result_u.data.mem_data_val, dev);
+    }
+    GSCHED_RELEASE;
+    return 1;
+}
+
+bool_t rpc_cudevicecomputecapability_1_svc(int dev, dint_result *result, struct svc_req *rqstp)
+{
+    /* Serves rpc_cuDeviceComputeCapability: i1 receives the major, i2 the minor version of dev. */
+    int *major = &(result->dint_result_u.data.i1);
+    int *minor = &(result->dint_result_u.data.i2);
+    LOGE(LOG_DEBUG, "%s(%d)", __FUNCTION__, dev);
+    GSCHED_RETAIN;
+    result->err = cuDeviceComputeCapability(major, minor, dev);
+    GSCHED_RELEASE;
+    return 1;
+}
+
 /*
 bool_t rpc_cugetexporttable_1_svc(char *rpc_uuid, ptr_result *result,
                                   struct svc_req *rqstp)
diff --git a/cpu/cpu-server-runtime.c b/cpu/cpu-server-runtime.c
index a5a94058..d939e0b0 100644
--- a/cpu/cpu-server-runtime.c
+++ b/cpu/cpu-server-runtime.c
@@ -592,8 +592,14 @@ bool_t cuda_stream_get_priority_1_svc(ptr hStream, int_result *result, struct sv
     return 1;
 }
 
-/* Capture API does not make sense without graph API */
-//        /* ?         CUDA_STREAM_IS_CAPTURING(ptr)                      = 264;*/
+bool_t cuda_stream_is_capturing_1_svc(ptr stream, int_result *result, struct svc_req *rqstp)
+{
+    /* Looks stream up in rm_streams and stores its capture status in the result's data field. */
+    LOGE(LOG_DEBUG, "cudaStreamIsCapturing");
+    cudaStream_t looked_up = resource_mg_get(&rm_streams, (void*)stream);
+    result->err = cudaStreamIsCapturing(looked_up, (enum cudaStreamCaptureStatus*)&result->int_result_u.data);
+    return 1;
+}
 
 bool_t cuda_stream_query_1_svc(ptr hStream, int *result, struct svc_req *rqstp)
 {
diff --git a/cpu/cpu_rpc_prot.x b/cpu/cpu_rpc_prot.x
index fd84c248..69363a20 100644
--- a/cpu/cpu_rpc_prot.x
+++ b/cpu/cpu_rpc_prot.x
@@ -185,7 +185,7 @@ program RPC_CD_PROG {
         /* ?         CUDA_STREAM_GET_CAPTURE_INFO(ptr)                          = 261;*/
         int_result   CUDA_STREAM_GET_FLAGS(ptr)                                 = 262;
         int_result   CUDA_STREAM_GET_PRIORITY(ptr)                              = 263;
-        /* ?         CUDA_STREAM_IS_CAPTURING(ptr)                              = 264;*/
+        int_result   CUDA_STREAM_IS_CAPTURING(ptr)                              = 264;
         int          CUDA_STREAM_QUERY(ptr)                                     = 265;
         /*int        CUDA_STREAM_SET_ATTRIBUTE(ptr, int, ?)                     = 266;*/
         int          CUDA_STREAM_SYNCHRONIZE(ptr)                               = 267;
@@ -335,6 +335,9 @@ program RPC_CD_PROG {
         ptr_result   rpc_cuModuleLoad(string<>)                                = 1019;
         str_result   rpc_cuGetErrorString(int)                                 = 1020;
         int          rpc_cuModuleUnload(ptr)                                   = 1021;
+        dint_result  rpc_cuDevicePrimaryCtxGetState(int)                       = 1022;
+        mem_result   rpc_cuDeviceGetProperties(int)                            = 1023;
+        dint_result  rpc_cuDeviceComputeCapability(int)                        = 1024;
 
         /* HIDDEN DRIVER API */
 /*        ptr_result   rpc_hidden_get_device_ctx(int)                            = 1101;
diff --git a/tests/samples/.gitignore b/tests/samples/.gitignore
new file mode 100644
index 00000000..33a20c36
--- /dev/null
+++ b/tests/samples/.gitignore
@@ -0,0 +1,2 @@
+samples-bin
+samples

From 0641cccaf58d794a373cd20ae25c9e147be01d99 Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Fri, 2 Jun 2023 13:40:44 +0200
Subject: [PATCH 42/83] add nvml support

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 cpu/Makefile                       |   9 +-
 cpu/cpu-client-nvml.c              | 199 +++++++++++++++++++++++++++++
 cpu/cpu-client-runtime.c           |   4 +-
 cpu/cpu-client.c                   |   5 +-
 cpu/cpu-server-nvml.c              |  71 ++++++++++
 cpu/cpu_rpc_prot.x                 |   6 +
 tests/test_apps/pytorch_minimal.py |   9 ++
 7 files changed, 297 insertions(+), 6 deletions(-)
 create mode 100644 cpu/cpu-client-nvml.c
 create mode 100644 cpu/cpu-server-nvml.c
 create mode 100644 tests/test_apps/pytorch_minimal.py

diff --git a/cpu/Makefile b/cpu/Makefile
index 3369aeb9..8af92730 100644
--- a/cpu/Makefile
+++ b/cpu/Makefile
@@ -40,7 +40,8 @@ SRC_SERVER = $(RPC_XDR)                 \
 			 gsched_none.c 			    \
 			 oob.c 					    \
 			 mt-memcpy.c				\
-			 cpu-elf2.c
+			 cpu-elf2.c					\
+			 cpu-server-nvml.c
 
 SRC_SERVER_LIB = server-library.c
 SRC_SERVER_EXE = server-exe.c
@@ -57,7 +58,8 @@ SRC_CLIENT = $(RPC_XDR)                 \
 			 cpu-client-cusolver.c 		\
 			 oob.c 					    \
 			 mt-memcpy.c				\
-			 cpu-elf2.c
+			 cpu-elf2.c					\
+			 cpu-client-nvml.c
 
 # 			 cpu-client-driver-hidden.c \
 
@@ -76,6 +78,7 @@ INC_FLAGS += -I$(CUDA_SRC)/include
 
 LIB_FLAGS += -L$(LIBTIRPC_PREFIX)/lib
 LIB_FLAGS += -L$(CUDA_SRC)/lib64
+LIB_FLAGS += -L$(CUDA_SRC)/lib64/stubs
 CC_FLAGS += -std=gnu99 $(INC_FLAGS) -O2
 # TODO: use extern in header files instead of direct definition e.g. in cpu-common.h to remove -fcommon flag
 CC_FLAGS += -fcommon
@@ -102,7 +105,7 @@ ifdef WITH_IB
 CC_FLAGS += -DWITH_IB=$(WITH_IB)
 endif
 
-SERVER_LD_FLAGS = $(LD_FLAGS) -lcudart -lcusolver -lcuda -lcublas -lrt -lpthread
+SERVER_LD_FLAGS = $(LD_FLAGS) -lcudart -lcusolver -lcuda -lcublas -lrt -lpthread -lnvidia-ml
 SERVER_BIN_LD_FLAGS = $(SERVER_LD_FLAGS) -Wl,--unresolved-symbols=ignore-in-object-files
 CLIENT_LD_FLAGS = $(LD_FLAGS)
 
diff --git a/cpu/cpu-client-nvml.c b/cpu/cpu-client-nvml.c
new file mode 100644
index 00000000..0e97f5b7
--- /dev/null
+++ b/cpu/cpu-client-nvml.c
@@ -0,0 +1,199 @@
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <nvml.h>
+
+#include "cpu-libwrap.h"
+#include "cpu_rpc_prot.h"
+#include "cpu-common.h"
+#include "cpu-utils.h"
+#include "log.h"
+
+#ifdef WITH_API_CNT
+static int api_call_cnt = 0;
+void cpu_nvml_print_api_call_cnt(void)
+{
+    LOG(LOG_INFO, "nvml api-call-cnt: %d", api_call_cnt);
+}
+#endif //WITH_API_CNT
+
+nvmlReturn_t nvmlInitWithFlags ( unsigned int  flags )
+{
+#ifdef WITH_API_CNT
+    api_call_cnt++;
+#endif //WITH_API_CNT
+    int result; // NVML status filled in by the server; only valid if the RPC succeeds
+    enum clnt_stat retval_1;
+    retval_1 = rpc_nvmlinitwithflags_1(flags, &result, clnt);
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "call failed: %s", __FUNCTION__);
+        return NVML_ERROR_UNKNOWN; // result is uninitialized here, do not return it
+    }
+    return result;
+}
+
+#undef nvmlInit
+nvmlReturn_t nvmlInit(void)
+{
+    return nvmlInitWithFlags(0);
+}
+
+nvmlReturn_t nvmlInit_v2 ( void )
+{
+#ifdef WITH_API_CNT
+    api_call_cnt++;
+#endif //WITH_API_CNT
+    int result; // NVML status filled in by the server; only valid if the RPC succeeds
+    enum clnt_stat retval_1;
+    retval_1 = rpc_nvmlinit_v2_1(&result, clnt);
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "call failed: %s", __FUNCTION__);
+        return NVML_ERROR_UNKNOWN; // result is uninitialized here, do not return it
+    }
+    return result;
+}
+nvmlReturn_t nvmlShutdown ( void )
+{
+#ifdef WITH_API_CNT
+    api_call_cnt++;
+#endif //WITH_API_CNT
+    int result; // NVML status filled in by the server; only valid if the RPC succeeds
+    enum clnt_stat retval_1;
+    retval_1 = rpc_nvmlshutdown_1(&result, clnt);
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "call failed: %s", __FUNCTION__);
+        return NVML_ERROR_UNKNOWN; // result is uninitialized here, do not return it
+    }
+    return result;
+}
+
+
+DEF_FN(nvmlReturn_t, nvmlDeviceGetAPIRestriction, nvmlDevice_t, device, nvmlRestrictedAPI_t, apiType, nvmlEnableState_t*, isRestricted )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetAdaptiveClockInfoStatus, nvmlDevice_t, device, unsigned int*, adaptiveClockStatus )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetApplicationsClock, nvmlDevice_t, device, nvmlClockType_t, clockType, unsigned int*, clockMHz )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetArchitecture, nvmlDevice_t, device, nvmlDeviceArchitecture_t*, arch )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetAttributes_v2, nvmlDevice_t, device, nvmlDeviceAttributes_t*, attributes )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetAutoBoostedClocksEnabled, nvmlDevice_t, device, nvmlEnableState_t*, isEnabled, nvmlEnableState_t*, defaultIsEnabled )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetBAR1MemoryInfo, nvmlDevice_t, device, nvmlBAR1Memory_t*, bar1Memory )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetBoardId, nvmlDevice_t, device, unsigned int*, boardId )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetBoardPartNumber, nvmlDevice_t, device, char*, partNumber, unsigned int,  length )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetBrand, nvmlDevice_t, device, nvmlBrandType_t*, type )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetBridgeChipInfo, nvmlDevice_t, device, nvmlBridgeChipHierarchy_t*, bridgeHierarchy )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetClock, nvmlDevice_t, device, nvmlClockType_t, clockType, nvmlClockId_t, clockId, unsigned int*, clockMHz )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetClockInfo, nvmlDevice_t, device, nvmlClockType_t, type, unsigned int*, clock )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetComputeMode, nvmlDevice_t, device, nvmlComputeMode_t*, mode )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetComputeRunningProcesses_v3, nvmlDevice_t, device, unsigned int*, infoCount, nvmlProcessInfo_t*, infos )
+nvmlReturn_t nvmlDeviceGetCount_v2(unsigned int* deviceCount )
+{
+#ifdef WITH_API_CNT
+    api_call_cnt++;
+#endif //WITH_API_CNT
+    int_result result; // err + device count as reported by the server
+    if (deviceCount == NULL) {
+        return NVML_ERROR_INVALID_ARGUMENT;
+    }
+    // A failed RPC leaves result unwritten, so bail out instead of reading it.
+    if (rpc_nvmldevicegetcount_v2_1(&result, clnt) != RPC_SUCCESS) {
+        clnt_perror (clnt, "call failed");
+        return NVML_ERROR_UNKNOWN;
+    }
+    if (result.err == 0) {
+        *deviceCount = result.int_result_u.data;
+    }
+    return result.err;
+}
+DEF_FN(nvmlReturn_t, nvmlDeviceGetCudaComputeCapability, nvmlDevice_t, device, int*, major, int*, minor )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetCurrPcieLinkGeneration, nvmlDevice_t, device, unsigned int*, currLinkGen )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetCurrPcieLinkWidth, nvmlDevice_t, device, unsigned int*, currLinkWidth )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetCurrentClocksThrottleReasons, nvmlDevice_t, device, unsigned long long*, clocksThrottleReasons )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetDecoderUtilization, nvmlDevice_t, device, unsigned int*, utilization, unsigned int*, samplingPeriodUs )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetDefaultApplicationsClock, nvmlDevice_t, device, nvmlClockType_t, clockType, unsigned int*, clockMHz )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetDefaultEccMode, nvmlDevice_t, device, nvmlEnableState_t*, defaultMode )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetDetailedEccErrors, nvmlDevice_t, device, nvmlMemoryErrorType_t, errorType, nvmlEccCounterType_t, counterType, nvmlEccErrorCounts_t*, eccCounts )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetDisplayActive, nvmlDevice_t, device, nvmlEnableState_t*, isActive )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetDisplayMode, nvmlDevice_t, device, nvmlEnableState_t*, display )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetDriverModel, nvmlDevice_t, device, nvmlDriverModel_t*, current, nvmlDriverModel_t*, pending )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetEccMode, nvmlDevice_t, device, nvmlEnableState_t*, current, nvmlEnableState_t*, pending )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetEncoderCapacity, nvmlDevice_t, device, nvmlEncoderType_t, encoderQueryType, unsigned int*, encoderCapacity )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetEncoderSessions, nvmlDevice_t, device, unsigned int*, sessionCount, nvmlEncoderSessionInfo_t*, sessionInfos )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetEncoderStats, nvmlDevice_t, device, unsigned int*, sessionCount, unsigned int*, averageFps, unsigned int*, averageLatency )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetEncoderUtilization, nvmlDevice_t, device, unsigned int*, utilization, unsigned int*, samplingPeriodUs )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetEnforcedPowerLimit, nvmlDevice_t, device, unsigned int*, limit )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetFBCSessions, nvmlDevice_t, device, unsigned int*, sessionCount, nvmlFBCSessionInfo_t*, sessionInfo )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetFBCStats, nvmlDevice_t, device, nvmlFBCStats_t*, fbcStats )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetFanControlPolicy_v2, nvmlDevice_t, device, unsigned int,  fan, nvmlFanControlPolicy_t*, policy )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetFanSpeed, nvmlDevice_t, device, unsigned int*, speed )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetFanSpeed_v2, nvmlDevice_t, device, unsigned int,  fan, unsigned int*, speed )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetGpuMaxPcieLinkGeneration, nvmlDevice_t, device, unsigned int*, maxLinkGenDevice )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetGpuOperationMode, nvmlDevice_t, device, nvmlGpuOperationMode_t*, current, nvmlGpuOperationMode_t*, pending )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetGraphicsRunningProcesses_v3, nvmlDevice_t, device, unsigned int*, infoCount, nvmlProcessInfo_t*, infos )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetHandleByIndex_v2, unsigned int,  index, nvmlDevice_t*, device )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetHandleByPciBusId_v2, const char*, pciBusId, nvmlDevice_t*, device )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetHandleBySerial, const char*, serial, nvmlDevice_t*, device )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetHandleByUUID, const char*, uuid, nvmlDevice_t*, device )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetIndex, nvmlDevice_t, device, unsigned int*, index )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetInforomConfigurationChecksum, nvmlDevice_t, device, unsigned int*, checksum )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetInforomImageVersion, nvmlDevice_t, device, char*, version, unsigned int,  length )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetInforomVersion, nvmlDevice_t, device, nvmlInforomObject_t, object, char*, version, unsigned int,  length )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetIrqNum, nvmlDevice_t, device, unsigned int*, irqNum )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetMPSComputeRunningProcesses_v3, nvmlDevice_t, device, unsigned int*, infoCount, nvmlProcessInfo_t*, infos )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetMaxClockInfo, nvmlDevice_t, device, nvmlClockType_t, type, unsigned int*, clock )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetMaxCustomerBoostClock, nvmlDevice_t, device, nvmlClockType_t, clockType, unsigned int*, clockMHz )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetMaxPcieLinkGeneration, nvmlDevice_t, device, unsigned int*, maxLinkGen )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetMaxPcieLinkWidth, nvmlDevice_t, device, unsigned int*, maxLinkWidth )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetMemoryBusWidth, nvmlDevice_t, device, unsigned int*, busWidth )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetMemoryErrorCounter, nvmlDevice_t, device, nvmlMemoryErrorType_t, errorType, nvmlEccCounterType_t, counterType, nvmlMemoryLocation_t, locationType, unsigned long long*, count )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetMemoryInfo, nvmlDevice_t, device, nvmlMemory_t*, memory )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetMinMaxFanSpeed, nvmlDevice_t, device, unsigned int*, minSpeed, unsigned int*, maxSpeed )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetMinorNumber, nvmlDevice_t, device, unsigned int*, minorNumber )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetMultiGpuBoard, nvmlDevice_t, device, unsigned int*, multiGpuBool )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetName, nvmlDevice_t, device, char*, name, unsigned int,  length )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetNumFans, nvmlDevice_t, device, unsigned int*, numFans )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetNumGpuCores, nvmlDevice_t, device, unsigned int*, numCores )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetP2PStatus, nvmlDevice_t, device1, nvmlDevice_t, device2, nvmlGpuP2PCapsIndex_t, p2pIndex, nvmlGpuP2PStatus_t*, p2pStatus )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetPciInfo_v3, nvmlDevice_t, device, nvmlPciInfo_t*, pci )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetPcieLinkMaxSpeed, nvmlDevice_t, device, unsigned int*, maxSpeed )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetPcieReplayCounter, nvmlDevice_t, device, unsigned int*, value )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetPcieSpeed, nvmlDevice_t, device, unsigned int*, pcieSpeed )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetPcieThroughput, nvmlDevice_t, device, nvmlPcieUtilCounter_t, counter, unsigned int*, value )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetPerformanceState, nvmlDevice_t, device, nvmlPstates_t*, pState )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetPersistenceMode, nvmlDevice_t, device, nvmlEnableState_t*, mode )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetPowerManagementDefaultLimit, nvmlDevice_t, device, unsigned int*, defaultLimit )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetPowerManagementLimit, nvmlDevice_t, device, unsigned int*, limit )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetPowerManagementLimitConstraints, nvmlDevice_t, device, unsigned int*, minLimit, unsigned int*, maxLimit )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetPowerManagementMode, nvmlDevice_t, device, nvmlEnableState_t*, mode )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetPowerSource, nvmlDevice_t, device, nvmlPowerSource_t*, powerSource )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetPowerState, nvmlDevice_t, device, nvmlPstates_t*, pState )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetPowerUsage, nvmlDevice_t, device, unsigned int*, power )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetRemappedRows, nvmlDevice_t, device, unsigned int*, corrRows, unsigned int*, uncRows, unsigned int*, isPending, unsigned int*, failureOccurred )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetRetiredPages, nvmlDevice_t, device, nvmlPageRetirementCause_t, cause, unsigned int*, pageCount, unsigned long long*, addresses )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetRetiredPagesPendingStatus, nvmlDevice_t, device, nvmlEnableState_t*, isPending )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetRetiredPages_v2, nvmlDevice_t, device, nvmlPageRetirementCause_t, cause, unsigned int*, pageCount, unsigned long long*, addresses, unsigned long long*, timestamps )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetRowRemapperHistogram, nvmlDevice_t, device, nvmlRowRemapperHistogramValues_t*, values )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetSamples, nvmlDevice_t, device, nvmlSamplingType_t, type, unsigned long long, lastSeenTimeStamp, nvmlValueType_t*, sampleValType, unsigned int*, sampleCount, nvmlSample_t*, samples )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetSerial, nvmlDevice_t, device, char*, serial, unsigned int,  length )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetSupportedClocksThrottleReasons, nvmlDevice_t, device, unsigned long long*, supportedClocksThrottleReasons )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetSupportedGraphicsClocks, nvmlDevice_t, device, unsigned int,  memoryClockMHz, unsigned int*, count, unsigned int*, clocksMHz )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetSupportedMemoryClocks, nvmlDevice_t, device, unsigned int*, count, unsigned int*, clocksMHz )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetTargetFanSpeed, nvmlDevice_t, device, unsigned int,  fan, unsigned int*, targetSpeed )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetTemperature, nvmlDevice_t, device, nvmlTemperatureSensors_t, sensorType, unsigned int*, temp )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetTemperatureThreshold, nvmlDevice_t, device, nvmlTemperatureThresholds_t, thresholdType, unsigned int*, temp )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetThermalSettings, nvmlDevice_t, device, unsigned int,  sensorIndex, nvmlGpuThermalSettings_t*, pThermalSettings )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetTopologyCommonAncestor, nvmlDevice_t, device1, nvmlDevice_t, device2, nvmlGpuTopologyLevel_t*, pathInfo )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetTopologyNearestGpus, nvmlDevice_t, device, nvmlGpuTopologyLevel_t, level, unsigned int*, count, nvmlDevice_t*, deviceArray )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetTotalEccErrors, nvmlDevice_t, device, nvmlMemoryErrorType_t, errorType, nvmlEccCounterType_t, counterType, unsigned long long*, eccCounts )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetTotalEnergyConsumption, nvmlDevice_t, device, unsigned long long*, energy )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetUUID, nvmlDevice_t, device, char*, uuid, unsigned int,  length )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetUtilizationRates, nvmlDevice_t, device, nvmlUtilization_t*, utilization )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetVbiosVersion, nvmlDevice_t, device, char*, version, unsigned int,  length )
+DEF_FN(nvmlReturn_t, nvmlDeviceGetViolationStatus, nvmlDevice_t, device, nvmlPerfPolicyType_t, perfPolicyType, nvmlViolationTime_t*, violTime )
+DEF_FN(nvmlReturn_t, nvmlDeviceOnSameBoard, nvmlDevice_t, device1, nvmlDevice_t, device2, int*, onSameBoard )
+DEF_FN(nvmlReturn_t, nvmlDeviceResetApplicationsClocks, nvmlDevice_t, device )
+DEF_FN(nvmlReturn_t, nvmlDeviceSetAutoBoostedClocksEnabled, nvmlDevice_t, device, nvmlEnableState_t, enabled )
+DEF_FN(nvmlReturn_t, nvmlDeviceSetDefaultAutoBoostedClocksEnabled, nvmlDevice_t, device, nvmlEnableState_t, enabled, unsigned int,  flags )
+DEF_FN(nvmlReturn_t, nvmlDeviceSetDefaultFanSpeed_v2, nvmlDevice_t, device, unsigned int,  fan )
+DEF_FN(nvmlReturn_t, nvmlDeviceSetFanControlPolicy, nvmlDevice_t, device, unsigned int,  fan, nvmlFanControlPolicy_t, policy )
+DEF_FN(nvmlReturn_t, nvmlDeviceSetTemperatureThreshold, nvmlDevice_t, device, nvmlTemperatureThresholds_t, thresholdType, int*, temp )
+DEF_FN(nvmlReturn_t, nvmlDeviceValidateInforom, nvmlDevice_t, device )
+DEF_FN(nvmlReturn_t, nvmlSystemGetTopologyGpuSet, unsigned int,  cpuNumber, unsigned int*, count, nvmlDevice_t*, deviceArray )
+DEF_FN(nvmlReturn_t, nvmlVgpuInstanceGetMdevUUID, nvmlVgpuInstance_t, vgpuInstance, char*, mdevUuid, unsigned int,  size )
diff --git a/cpu/cpu-client-runtime.c b/cpu/cpu-client-runtime.c
index 4eafc639..004d7d31 100644
--- a/cpu/cpu-client-runtime.c
+++ b/cpu/cpu-client-runtime.c
@@ -352,7 +352,9 @@ cudaError_t cudaGetDeviceProperties(struct cudaDeviceProp* prop, int device)
         LOGE(LOG_ERROR, "error: expected size != retrieved size");
         return result.err;
     }
-    if (memcpy(prop, result.mem_result_u.data.mem_data_val, sizeof(struct cudaDeviceProp)) == NULL) {
+    // if (memcpy(prop, result.mem_result_u.data.mem_data_val, sizeof(struct cudaDeviceProp)) == NULL) {
+    //FIXME: Don't know why, but pytorch expects a different definition of cudaDeviceProp, which is only 728 bytes long
+    if (memcpy(prop, result.mem_result_u.data.mem_data_val, 728) == NULL) {
         LOGE(LOG_ERROR, "error: memcpy failed");
         return result.err;
     }
diff --git a/cpu/cpu-client.c b/cpu/cpu-client.c
index fb535b44..4cd62f09 100644
--- a/cpu/cpu-client.c
+++ b/cpu/cpu-client.c
@@ -247,8 +247,9 @@ void *dlopen(const char *filename, int flag)
 
     if (filename != NULL && 
         (strcmp(filename, "libcuda.so.1") == 0 ||
-        strcmp(filename, "libcuda.so") == 0)) {
-        LOG(LOG_DEBUG, "replacing dlopen call to cuda driver library with "
+        strcmp(filename, "libcuda.so") == 0 ||
+        strcmp(filename, "libnvidia-ml.so.1") == 0)) { /* every strcmp stays behind the filename != NULL guard */
+        LOG(LOG_DEBUG, "replacing dlopen call to cuda library with "
                        "cricket-client.so");
         dl_handle = dlopen_orig("cricket-client.so", flag);
         if (clnt == NULL) {
diff --git a/cpu/cpu-server-nvml.c b/cpu/cpu-server-nvml.c
new file mode 100644
index 00000000..b3152246
--- /dev/null
+++ b/cpu/cpu-server-nvml.c
@@ -0,0 +1,71 @@
+#include <stdlib.h>
+#include <stdio.h>
+
+#include <nvml.h>
+#include <cuda_runtime_api.h>
+
+#include "cpu_rpc_prot.h"
+#include "cpu-common.h"
+#include "cpu-utils.h"
+#include "log.h"
+#include "resource-mg.h"
+#define WITH_RECORDER
+#include "api-recorder.h"
+#include "gsched.h"
+
+int server_nvml_init(int restore)
+{
+    int ret = 0;
+    if (!restore) {
+        //ret &= resource_mg_init(&rm_modules, 1);
+    } else {
+        //ret &= resource_mg_init(&rm_modules, 0);
+        //ret &= server_driver_restore("ckp");
+    }
+    return ret;
+}
+
+int server_nvml_deinit(void)
+{
+    //resource_mg_free(&rm_modules);
+    return 0;
+}
+
+bool_t rpc_nvmldevicegetcount_v2_1_svc(int_result *result, struct svc_req *rqstp)
+{
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+    GSCHED_RETAIN;
+    // Workaround for pytorch expecting nvmlDeviceGetCount and cudaGetDeviceCount to be the same
+    //result->err = nvmlDeviceGetCount_v2(&result->int_result_u.data);
+    result->err = cudaGetDeviceCount(&result->int_result_u.data);
+    LOGE(LOG_DEBUG, "%s: %d", __FUNCTION__, result->int_result_u.data);
+    GSCHED_RELEASE;
+    return 1;
+}
+
+bool_t rpc_nvmlinitwithflags_1_svc(int flags, int *result, struct svc_req *rqstp)
+{
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+    GSCHED_RETAIN;
+    *result = nvmlInitWithFlags(flags);
+    GSCHED_RELEASE;
+    return 1;
+}
+
+bool_t rpc_nvmlinit_v2_1_svc(int *result, struct svc_req *rqstp)
+{
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+    GSCHED_RETAIN;
+    *result = nvmlInit_v2();
+    GSCHED_RELEASE;
+    return 1;
+}
+
+bool_t rpc_nvmlshutdown_1_svc(int *result, struct svc_req *rqstp)
+{
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+    GSCHED_RETAIN;
+    *result = nvmlShutdown();
+    GSCHED_RELEASE;
+    return 1;
+}
\ No newline at end of file
diff --git a/cpu/cpu_rpc_prot.x b/cpu/cpu_rpc_prot.x
index 69363a20..07453e92 100644
--- a/cpu/cpu_rpc_prot.x
+++ b/cpu/cpu_rpc_prot.x
@@ -364,5 +364,11 @@ program RPC_CD_PROG {
         int          rpc_cublasDgemm(ptr, int, int, int, int, int, double,
                          ptr, int, ptr, int, double, ptr, int)                 = 3002;
         int          rpc_cublasDestroy(ptr)                                    = 3003;
+
+        /* NVML */
+        int_result   rpc_nvmlDeviceGetCount_v2(void)                           = 4000;
+        int          rpc_nvmlInitWithFlags(int)                                = 4001;
+        int          rpc_nvmlInit_v2(void)                                     = 4002;
+        int          rpc_nvmlShutdown(void)                                    = 4003;
     } = 1;
 } = 99;
diff --git a/tests/test_apps/pytorch_minimal.py b/tests/test_apps/pytorch_minimal.py
new file mode 100644
index 00000000..9903cb34
--- /dev/null
+++ b/tests/test_apps/pytorch_minimal.py
@@ -0,0 +1,9 @@
+import torch
+import math
+
+
+dtype = torch.float
+device = torch.device("cuda:0")
+
+# Create random input and output data
+x = torch.linspace(-math.pi, math.pi, 2000, device=device, dtype=dtype)

From c9b972654ace0a835f1a38d020089fb28b54807e Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Fri, 2 Jun 2023 13:56:10 +0200
Subject: [PATCH 43/83] add nvml to Dockerfiles

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 utils/Dockerfile        | 2 +-
 utils/Dockerfile.cuda10 | 2 +-
 utils/Dockerfile.cuda11 | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/utils/Dockerfile b/utils/Dockerfile
index 65d134bc..b129e705 100644
--- a/utils/Dockerfile
+++ b/utils/Dockerfile
@@ -25,7 +25,7 @@ RUN dnf install -y make bash git gcc autoconf libtool automake rpcgen \
 ENV LD_LIBRARY_PATH="/usr/local/lib:/usr/local/lib64:${LD_LIBRARY_PATH}"
 
 RUN dnf -y config-manager --add-repo http://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo && \
-    dnf --refresh -y install cuda-compiler-12-1 cuda-libraries-devel-12-1 cuda-driver-devel-12-1 cuda-profiler-api-12-1 && \
+    dnf --refresh -y install cuda-compiler-12-1 cuda-libraries-devel-12-1 cuda-driver-devel-12-1 cuda-profiler-api-12-1 cuda-nvml-devel-12-1 && \
     ln -s cuda-12.1 /usr/local/cuda && \
     ln -s libcuda.so /usr/local/cuda/targets/x86_64-linux/lib/stubs/libcuda.so.1
 
diff --git a/utils/Dockerfile.cuda10 b/utils/Dockerfile.cuda10
index 4bd94bb1..0b929603 100644
--- a/utils/Dockerfile.cuda10
+++ b/utils/Dockerfile.cuda10
@@ -27,7 +27,7 @@ ENV LD_LIBRARY_PATH="/usr/local/lib:/usr/local/lib64:${LD_LIBRARY_PATH}"
 
 RUN dnf --refresh -y install https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-repo-rhel8-10.2.89-1.x86_64.rpm && \
     rpm --import https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/D42D0685.pub && \
-    dnf --refresh -y install cuda-compiler-10-2 cuda-libraries-dev-10-2 cuda-samples-10-2 cuda-driver-dev-10-2 cuda-misc-headers-10-2 && \
+    dnf --refresh -y install cuda-compiler-10-2 cuda-libraries-dev-10-2 cuda-samples-10-2 cuda-driver-dev-10-2 cuda-misc-headers-10-2 cuda-nvml-dev-10-2 && \
     ln -s cuda-10.2 /usr/local/cuda && \
     ln -s libcuda.so /usr/local/cuda/targets/x86_64-linux/lib/stubs/libcuda.so.1
     
diff --git a/utils/Dockerfile.cuda11 b/utils/Dockerfile.cuda11
index b84a2782..fcee0ca6 100644
--- a/utils/Dockerfile.cuda11
+++ b/utils/Dockerfile.cuda11
@@ -27,7 +27,7 @@ RUN dnf install -y make bash git gcc autoconf libtool automake rpcgen \
 ENV LD_LIBRARY_PATH="/usr/local/lib:/usr/local/lib64:${LD_LIBRARY_PATH}"
 
 RUN dnf -y config-manager --add-repo http://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo && \
-    dnf --refresh -y install cuda-compiler-11-1 cuda-libraries-devel-11-1 cuda-samples-11-1 cuda-driver-devel-11-1 cuda-nvprof-11-1 && \
+    dnf --refresh -y install cuda-compiler-11-1 cuda-libraries-devel-11-1 cuda-samples-11-1 cuda-driver-devel-11-1 cuda-nvprof-11-1 cuda-nvml-devel-11-1 && \
     ln -s cuda-11.1 /usr/local/cuda && \
     ln -s libcuda.so /usr/local/cuda/targets/x86_64-linux/lib/stubs/libcuda.so.1
 

From 3b541b36c76ad03f88e2bfb57c3e8e1e56288255 Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Fri, 2 Jun 2023 14:09:21 +0200
Subject: [PATCH 44/83] add license to pytorch_minimal.py

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 tests/test_apps/pytorch_minimal.py | 64 ++++++++++++++++++++++++++++++
 1 file changed, 64 insertions(+)

diff --git a/tests/test_apps/pytorch_minimal.py b/tests/test_apps/pytorch_minimal.py
index 9903cb34..d6f49e2d 100644
--- a/tests/test_apps/pytorch_minimal.py
+++ b/tests/test_apps/pytorch_minimal.py
@@ -1,3 +1,33 @@
+# BSD 3-Clause License
+# 
+# Copyright (c) 2017-2022, Pytorch contributors
+# All rights reserved.
+# 
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# 
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+# 
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# 
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+# 
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
 import torch
 import math
 
@@ -7,3 +37,37 @@
 
 # Create random input and output data
 x = torch.linspace(-math.pi, math.pi, 2000, device=device, dtype=dtype)
+y = torch.sin(x)
+
+# Randomly initialize weights
+a = torch.randn((), device=device, dtype=dtype)
+b = torch.randn((), device=device, dtype=dtype)
+c = torch.randn((), device=device, dtype=dtype)
+d = torch.randn((), device=device, dtype=dtype)
+
+learning_rate = 1e-6
+for t in range(2000):
+    # Forward pass: compute predicted y
+    y_pred = a + b * x + c * x ** 2 + d * x ** 3
+
+    # Compute and print loss
+    loss = (y_pred - y).pow(2).sum().item()
+    if t % 100 == 99:
+        print(t, loss)
+
+    # Backprop to compute gradients of a, b, c, d with respect to loss
+    grad_y_pred = 2.0 * (y_pred - y)
+    grad_a = grad_y_pred.sum()
+    grad_b = (grad_y_pred * x).sum()
+    grad_c = (grad_y_pred * x ** 2).sum()
+    grad_d = (grad_y_pred * x ** 3).sum()
+
+    # Update weights using gradient descent
+    a -= learning_rate * grad_a
+    b -= learning_rate * grad_b
+    c -= learning_rate * grad_c
+    d -= learning_rate * grad_d
+
+
+print(f'Result: y = {a.item()} + {b.item()} x + {c.item()} x^2 + {d.item()} x^3')
+

From 433930b7ad3a477a688eb484912ae198f16bb815 Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Fri, 2 Jun 2023 14:13:59 +0200
Subject: [PATCH 45/83] add nvml library to dockerfiles

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 utils/Dockerfile        | 2 +-
 utils/Dockerfile.cuda10 | 2 +-
 utils/Dockerfile.cuda11 | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/utils/Dockerfile b/utils/Dockerfile
index b129e705..1f981e11 100644
--- a/utils/Dockerfile
+++ b/utils/Dockerfile
@@ -25,7 +25,7 @@ RUN dnf install -y make bash git gcc autoconf libtool automake rpcgen \
 ENV LD_LIBRARY_PATH="/usr/local/lib:/usr/local/lib64:${LD_LIBRARY_PATH}"
 
 RUN dnf -y config-manager --add-repo http://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo && \
-    dnf --refresh -y install cuda-compiler-12-1 cuda-libraries-devel-12-1 cuda-driver-devel-12-1 cuda-profiler-api-12-1 cuda-nvml-devel-12-1 && \
+    dnf --refresh -y install cuda-compiler-12-1 cuda-libraries-devel-12-1 cuda-driver-devel-12-1 cuda-profiler-api-12-1 cuda-nvml-devel-12-1 nvidia-driver-NVML-530.30.02 && \
     ln -s cuda-12.1 /usr/local/cuda && \
     ln -s libcuda.so /usr/local/cuda/targets/x86_64-linux/lib/stubs/libcuda.so.1
 
diff --git a/utils/Dockerfile.cuda10 b/utils/Dockerfile.cuda10
index 0b929603..5654597f 100644
--- a/utils/Dockerfile.cuda10
+++ b/utils/Dockerfile.cuda10
@@ -27,7 +27,7 @@ ENV LD_LIBRARY_PATH="/usr/local/lib:/usr/local/lib64:${LD_LIBRARY_PATH}"
 
 RUN dnf --refresh -y install https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-repo-rhel8-10.2.89-1.x86_64.rpm && \
     rpm --import https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/D42D0685.pub && \
-    dnf --refresh -y install cuda-compiler-10-2 cuda-libraries-dev-10-2 cuda-samples-10-2 cuda-driver-dev-10-2 cuda-misc-headers-10-2 cuda-nvml-dev-10-2 && \
+    dnf --refresh -y install cuda-compiler-10-2 cuda-libraries-dev-10-2 cuda-samples-10-2 cuda-driver-dev-10-2 cuda-misc-headers-10-2 cuda-nvml-dev-10-2 nvidia-driver-NVML-530.30.02 && \
     ln -s cuda-10.2 /usr/local/cuda && \
     ln -s libcuda.so /usr/local/cuda/targets/x86_64-linux/lib/stubs/libcuda.so.1
     
diff --git a/utils/Dockerfile.cuda11 b/utils/Dockerfile.cuda11
index fcee0ca6..12f2944d 100644
--- a/utils/Dockerfile.cuda11
+++ b/utils/Dockerfile.cuda11
@@ -27,7 +27,7 @@ RUN dnf install -y make bash git gcc autoconf libtool automake rpcgen \
 ENV LD_LIBRARY_PATH="/usr/local/lib:/usr/local/lib64:${LD_LIBRARY_PATH}"
 
 RUN dnf -y config-manager --add-repo http://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo && \
-    dnf --refresh -y install cuda-compiler-11-1 cuda-libraries-devel-11-1 cuda-samples-11-1 cuda-driver-devel-11-1 cuda-nvprof-11-1 cuda-nvml-devel-11-1 && \
+    dnf --refresh -y install cuda-compiler-11-1 cuda-libraries-devel-11-1 cuda-samples-11-1 cuda-driver-devel-11-1 cuda-nvprof-11-1 cuda-nvml-devel-11-1 nvidia-driver-NVML-530.30.02 && \
     ln -s cuda-11.1 /usr/local/cuda && \
     ln -s libcuda.so /usr/local/cuda/targets/x86_64-linux/lib/stubs/libcuda.so.1
 

From f249f8fac995cc347621ccc4e5679eded3db1982 Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Fri, 2 Jun 2023 14:31:12 +0200
Subject: [PATCH 46/83] exclude some nvml definitions when compiling with an
 old CUDA version to make the CI happy

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 cpu/cpu-client-nvml.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/cpu/cpu-client-nvml.c b/cpu/cpu-client-nvml.c
index 0e97f5b7..f9045ee8 100644
--- a/cpu/cpu-client-nvml.c
+++ b/cpu/cpu-client-nvml.c
@@ -71,8 +71,10 @@ nvmlReturn_t nvmlShutdown ( void )
 DEF_FN(nvmlReturn_t, nvmlDeviceGetAPIRestriction, nvmlDevice_t, device, nvmlRestrictedAPI_t, apiType, nvmlEnableState_t*, isRestricted )
 DEF_FN(nvmlReturn_t, nvmlDeviceGetAdaptiveClockInfoStatus, nvmlDevice_t, device, unsigned int*, adaptiveClockStatus )
 DEF_FN(nvmlReturn_t, nvmlDeviceGetApplicationsClock, nvmlDevice_t, device, nvmlClockType_t, clockType, unsigned int*, clockMHz )
+#if NVML_API_VERSION >= 12
 DEF_FN(nvmlReturn_t, nvmlDeviceGetArchitecture, nvmlDevice_t, device, nvmlDeviceArchitecture_t*, arch )
 DEF_FN(nvmlReturn_t, nvmlDeviceGetAttributes_v2, nvmlDevice_t, device, nvmlDeviceAttributes_t*, attributes )
+#endif
 DEF_FN(nvmlReturn_t, nvmlDeviceGetAutoBoostedClocksEnabled, nvmlDevice_t, device, nvmlEnableState_t*, isEnabled, nvmlEnableState_t*, defaultIsEnabled )
 DEF_FN(nvmlReturn_t, nvmlDeviceGetBAR1MemoryInfo, nvmlDevice_t, device, nvmlBAR1Memory_t*, bar1Memory )
 DEF_FN(nvmlReturn_t, nvmlDeviceGetBoardId, nvmlDevice_t, device, unsigned int*, boardId )
@@ -121,7 +123,9 @@ DEF_FN(nvmlReturn_t, nvmlDeviceGetEncoderUtilization, nvmlDevice_t, device, unsi
 DEF_FN(nvmlReturn_t, nvmlDeviceGetEnforcedPowerLimit, nvmlDevice_t, device, unsigned int*, limit )
 DEF_FN(nvmlReturn_t, nvmlDeviceGetFBCSessions, nvmlDevice_t, device, unsigned int*, sessionCount, nvmlFBCSessionInfo_t*, sessionInfo )
 DEF_FN(nvmlReturn_t, nvmlDeviceGetFBCStats, nvmlDevice_t, device, nvmlFBCStats_t*, fbcStats )
+#if NVML_API_VERSION >= 12
 DEF_FN(nvmlReturn_t, nvmlDeviceGetFanControlPolicy_v2, nvmlDevice_t, device, unsigned int,  fan, nvmlFanControlPolicy_t*, policy )
+#endif
 DEF_FN(nvmlReturn_t, nvmlDeviceGetFanSpeed, nvmlDevice_t, device, unsigned int*, speed )
 DEF_FN(nvmlReturn_t, nvmlDeviceGetFanSpeed_v2, nvmlDevice_t, device, unsigned int,  fan, unsigned int*, speed )
 DEF_FN(nvmlReturn_t, nvmlDeviceGetGpuMaxPcieLinkGeneration, nvmlDevice_t, device, unsigned int*, maxLinkGenDevice )
@@ -162,14 +166,18 @@ DEF_FN(nvmlReturn_t, nvmlDeviceGetPowerManagementDefaultLimit, nvmlDevice_t, dev
 DEF_FN(nvmlReturn_t, nvmlDeviceGetPowerManagementLimit, nvmlDevice_t, device, unsigned int*, limit )
 DEF_FN(nvmlReturn_t, nvmlDeviceGetPowerManagementLimitConstraints, nvmlDevice_t, device, unsigned int*, minLimit, unsigned int*, maxLimit )
 DEF_FN(nvmlReturn_t, nvmlDeviceGetPowerManagementMode, nvmlDevice_t, device, nvmlEnableState_t*, mode )
+#if NVML_API_VERSION >= 12
 DEF_FN(nvmlReturn_t, nvmlDeviceGetPowerSource, nvmlDevice_t, device, nvmlPowerSource_t*, powerSource )
+#endif
 DEF_FN(nvmlReturn_t, nvmlDeviceGetPowerState, nvmlDevice_t, device, nvmlPstates_t*, pState )
 DEF_FN(nvmlReturn_t, nvmlDeviceGetPowerUsage, nvmlDevice_t, device, unsigned int*, power )
 DEF_FN(nvmlReturn_t, nvmlDeviceGetRemappedRows, nvmlDevice_t, device, unsigned int*, corrRows, unsigned int*, uncRows, unsigned int*, isPending, unsigned int*, failureOccurred )
 DEF_FN(nvmlReturn_t, nvmlDeviceGetRetiredPages, nvmlDevice_t, device, nvmlPageRetirementCause_t, cause, unsigned int*, pageCount, unsigned long long*, addresses )
 DEF_FN(nvmlReturn_t, nvmlDeviceGetRetiredPagesPendingStatus, nvmlDevice_t, device, nvmlEnableState_t*, isPending )
 DEF_FN(nvmlReturn_t, nvmlDeviceGetRetiredPages_v2, nvmlDevice_t, device, nvmlPageRetirementCause_t, cause, unsigned int*, pageCount, unsigned long long*, addresses, unsigned long long*, timestamps )
+#if NVML_API_VERSION >= 12
 DEF_FN(nvmlReturn_t, nvmlDeviceGetRowRemapperHistogram, nvmlDevice_t, device, nvmlRowRemapperHistogramValues_t*, values )
+#endif
 DEF_FN(nvmlReturn_t, nvmlDeviceGetSamples, nvmlDevice_t, device, nvmlSamplingType_t, type, unsigned long long, lastSeenTimeStamp, nvmlValueType_t*, sampleValType, unsigned int*, sampleCount, nvmlSample_t*, samples )
 DEF_FN(nvmlReturn_t, nvmlDeviceGetSerial, nvmlDevice_t, device, char*, serial, unsigned int,  length )
 DEF_FN(nvmlReturn_t, nvmlDeviceGetSupportedClocksThrottleReasons, nvmlDevice_t, device, unsigned long long*, supportedClocksThrottleReasons )
@@ -178,7 +186,9 @@ DEF_FN(nvmlReturn_t, nvmlDeviceGetSupportedMemoryClocks, nvmlDevice_t, device, u
 DEF_FN(nvmlReturn_t, nvmlDeviceGetTargetFanSpeed, nvmlDevice_t, device, unsigned int,  fan, unsigned int*, targetSpeed )
 DEF_FN(nvmlReturn_t, nvmlDeviceGetTemperature, nvmlDevice_t, device, nvmlTemperatureSensors_t, sensorType, unsigned int*, temp )
 DEF_FN(nvmlReturn_t, nvmlDeviceGetTemperatureThreshold, nvmlDevice_t, device, nvmlTemperatureThresholds_t, thresholdType, unsigned int*, temp )
+#if NVML_API_VERSION >= 12
 DEF_FN(nvmlReturn_t, nvmlDeviceGetThermalSettings, nvmlDevice_t, device, unsigned int,  sensorIndex, nvmlGpuThermalSettings_t*, pThermalSettings )
+#endif
 DEF_FN(nvmlReturn_t, nvmlDeviceGetTopologyCommonAncestor, nvmlDevice_t, device1, nvmlDevice_t, device2, nvmlGpuTopologyLevel_t*, pathInfo )
 DEF_FN(nvmlReturn_t, nvmlDeviceGetTopologyNearestGpus, nvmlDevice_t, device, nvmlGpuTopologyLevel_t, level, unsigned int*, count, nvmlDevice_t*, deviceArray )
 DEF_FN(nvmlReturn_t, nvmlDeviceGetTotalEccErrors, nvmlDevice_t, device, nvmlMemoryErrorType_t, errorType, nvmlEccCounterType_t, counterType, unsigned long long*, eccCounts )
@@ -192,7 +202,9 @@ DEF_FN(nvmlReturn_t, nvmlDeviceResetApplicationsClocks, nvmlDevice_t, device )
 DEF_FN(nvmlReturn_t, nvmlDeviceSetAutoBoostedClocksEnabled, nvmlDevice_t, device, nvmlEnableState_t, enabled )
 DEF_FN(nvmlReturn_t, nvmlDeviceSetDefaultAutoBoostedClocksEnabled, nvmlDevice_t, device, nvmlEnableState_t, enabled, unsigned int,  flags )
 DEF_FN(nvmlReturn_t, nvmlDeviceSetDefaultFanSpeed_v2, nvmlDevice_t, device, unsigned int,  fan )
+#if NVML_API_VERSION >= 12
 DEF_FN(nvmlReturn_t, nvmlDeviceSetFanControlPolicy, nvmlDevice_t, device, unsigned int,  fan, nvmlFanControlPolicy_t, policy )
+#endif
 DEF_FN(nvmlReturn_t, nvmlDeviceSetTemperatureThreshold, nvmlDevice_t, device, nvmlTemperatureThresholds_t, thresholdType, int*, temp )
 DEF_FN(nvmlReturn_t, nvmlDeviceValidateInforom, nvmlDevice_t, device )
 DEF_FN(nvmlReturn_t, nvmlSystemGetTopologyGpuSet, unsigned int,  cpuNumber, unsigned int*, count, nvmlDevice_t*, deviceArray )

From 6860540d846bef09210c5dbf89f157ac6b120a94 Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Tue, 6 Jun 2023 11:42:40 +0200
Subject: [PATCH 47/83] add cpu-server-nvml header file and initialization of
 nvml part

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 cpu/cpu-server-nvml.h |  9 +++++++++
 cpu/cpu-server.c      | 12 ++++++++++--
 2 files changed, 19 insertions(+), 2 deletions(-)
 create mode 100644 cpu/cpu-server-nvml.h

diff --git a/cpu/cpu-server-nvml.h b/cpu/cpu-server-nvml.h
new file mode 100644
index 00000000..84a8270c
--- /dev/null
+++ b/cpu/cpu-server-nvml.h
@@ -0,0 +1,9 @@
+#ifndef _CPU_SERVER_NVML_H_
+#define _CPU_SERVER_NVML_H_
+
+int server_nvml_init(int restore);
+int server_nvml_deinit(void);
+//int server_nvml_checkpoint(const char *path, int dump_memory, unsigned long prog, unsigned long vers);
+//int server_nvml_restore(const char *path);
+
+#endif //_CPU_SERVER_NVML_H_
diff --git a/cpu/cpu-server.c b/cpu/cpu-server.c
index 9b0dccbd..86bcd561 100644
--- a/cpu/cpu-server.c
+++ b/cpu/cpu-server.c
@@ -25,6 +25,7 @@
 #define WITH_RECORDER
 #include "api-recorder.h"
 #include "gsched.h"
+#include "cpu-server-nvml.h"
 
 INIT_SOCKTYPE
 
@@ -287,6 +288,11 @@ void cricket_main(size_t prog_num, size_t vers_num)
         LOGE(LOG_ERROR, "initializing server_runtime failed.");
         goto cleanup2;        
     }
+    
+    if (server_nvml_init(restore) != 0) {
+        LOGE(LOG_ERROR, "initializing server_nvml failed.");
+        goto cleanup1;
+    }
 
 #ifdef WITH_IB
 
@@ -300,7 +306,7 @@ void cricket_main(size_t prog_num, size_t vers_num)
 
     if (signal(SIGUSR1, signal_checkpoint) == SIG_ERR) {
         LOGE(LOG_ERROR, "An error occurred while setting a signal handler.");
-        goto cleanup1;
+        goto cleanup0;
     }
 
     LOG(LOG_INFO, "waiting for RPC requests...");
@@ -310,8 +316,10 @@ void cricket_main(size_t prog_num, size_t vers_num)
     LOG(LOG_DEBUG, "svc_run returned. Cleaning up.");
     ret = 0;
     //api_records_print();
- cleanup1:
+ cleanup0:
     server_driver_deinit();
+ cleanup1:
+    server_nvml_deinit();
  cleanup2:
     server_runtime_deinit();
  cleanup3:

From c849bd71c88f0310553a1c4d28332362d9ebc6fc Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Tue, 6 Jun 2023 11:43:37 +0200
Subject: [PATCH 48/83] change c standard to gnu11, improve logging

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 cpu/Makefile             | 4 ++--
 cpu/cpu-client-nvml.c    | 2 +-
 cpu/cpu-client-runtime.c | 2 +-
 cpu/cpu-server-nvml.c    | 1 +
 4 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/cpu/Makefile b/cpu/Makefile
index 8af92730..ed23f83f 100644
--- a/cpu/Makefile
+++ b/cpu/Makefile
@@ -79,14 +79,14 @@ INC_FLAGS += -I$(CUDA_SRC)/include
 LIB_FLAGS += -L$(LIBTIRPC_PREFIX)/lib
 LIB_FLAGS += -L$(CUDA_SRC)/lib64
 LIB_FLAGS += -L$(CUDA_SRC)/lib64/stubs
-CC_FLAGS += -std=gnu99 $(INC_FLAGS) -O2
+CC_FLAGS += -std=gnu11 $(INC_FLAGS) #-O2
 # TODO: use extern in header files instead of direct definition e.g. in cpu-common.h to remove -fcommon flag
 CC_FLAGS += -fcommon
 LD_FLAGS = $(LIB_FLAGS) -ltirpc -ldl -lcrypto -lelf
 
 ifdef WITH_DEBUG
 # use ASAN_OPTIONS=protect_shadow_gap=0  LSAN_OPTIONS=fast_unwind_on_malloc=0 when running
-CC_FLAGS += -g -ggdb #-fsanitize=address -fsanitize=pointer-compare -fsanitize=pointer-subtract -fsanitize-address-use-after-scope
+CC_FLAGS += -g -ggdb #-static-libasan -fsanitize=address -fsanitize=pointer-compare -fsanitize=pointer-subtract -fsanitize-address-use-after-scope
 endif
 
 ifdef WITH_IB
diff --git a/cpu/cpu-client-nvml.c b/cpu/cpu-client-nvml.c
index f9045ee8..29f86380 100644
--- a/cpu/cpu-client-nvml.c
+++ b/cpu/cpu-client-nvml.c
@@ -97,7 +97,7 @@ nvmlReturn_t nvmlDeviceGetCount_v2(unsigned int* deviceCount )
     }
     retval_1 = rpc_nvmldevicegetcount_v2_1(&result, clnt);
     if (retval_1 != RPC_SUCCESS) {
-        clnt_perror (clnt, "call failed");
+        LOGE(LOG_ERROR, "call failed: %s", __FUNCTION__);
     }
     if (result.err == 0) {
         *deviceCount = result.int_result_u.data;
diff --git a/cpu/cpu-client-runtime.c b/cpu/cpu-client-runtime.c
index 004d7d31..cb22b142 100644
--- a/cpu/cpu-client-runtime.c
+++ b/cpu/cpu-client-runtime.c
@@ -338,7 +338,7 @@ cudaError_t cudaGetDeviceProperties(struct cudaDeviceProp* prop, int device)
     mem_result result;
     enum clnt_stat retval;
     if (prop == NULL) {
-        LOGE(LOG_ERROR, "error: prop == NULL\n");
+        LOGE(LOG_ERROR, "error: prop == NULL");
         return cudaErrorInvalidValue;
     }
     retval = cuda_get_device_properties_1(device, &result, clnt);
diff --git a/cpu/cpu-server-nvml.c b/cpu/cpu-server-nvml.c
index b3152246..89467618 100644
--- a/cpu/cpu-server-nvml.c
+++ b/cpu/cpu-server-nvml.c
@@ -1,3 +1,4 @@
+#define _GNU_SOURCE
 #include <stdlib.h>
 #include <stdio.h>
 

From 4c7890409a74ab4143950b76322c7761be40adf5 Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Wed, 7 Jun 2023 14:20:49 +0200
Subject: [PATCH 49/83] add documentation on how to use pytorch to
 docs/pytorch.md

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 docs/pytorch.md | 120 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 120 insertions(+)
 create mode 100644 docs/pytorch.md

diff --git a/docs/pytorch.md b/docs/pytorch.md
new file mode 100644
index 00000000..517d2ef9
--- /dev/null
+++ b/docs/pytorch.md
@@ -0,0 +1,120 @@
+# Cricket pyTorch
+
+Get pytorch sources
+```
+git clone git@github.com:pytorch/pytorch.git
+git checkout v1.13.1
+git submodule update --init --recursive
+```
+
+patch sources.
+- link cudart dynamically when building docker image
+- link cudart dynamically when building ATen
+- link cudart dynamically when building nccl
+- deactivate building for some old cuda versions. (optional)
+- add cricket and dependencies to dockerfile
+- deactivate compression we do not fully support fatbin compression
+- remove compression from nccl as well
+```
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+index e2e1f69457e..f6e5542f341 100644
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -554,7 +554,6 @@ if(MSVC)
+   string(APPEND CMAKE_CUDA_FLAGS " -Xcompiler /w -w")
+ endif(MSVC)
+
+-string(APPEND CMAKE_CUDA_FLAGS " -Xfatbin -compress-all")
+
+ if(NOT MSVC)
+   string(APPEND CMAKE_CUDA_FLAGS_DEBUG " -g -lineinfo --source-in-ptx")
+
+diff --git a/Dockerfile b/Dockerfile
+index 815a9108ce9..53ec7689493 100644
+--- a/Dockerfile
++++ b/Dockerfile
+@@ -53,7 +53,7 @@ WORKDIR /opt/pytorch
+ COPY --from=conda /opt/conda /opt/conda
+ COPY --from=submodule-update /opt/pytorch /opt/pytorch
+ RUN --mount=type=cache,target=/opt/ccache \
+-    TORCH_CUDA_ARCH_LIST="3.5 5.2 6.0 6.1 7.0+PTX 8.0" TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \
++    TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0 8.0" TORCH_NVCC_FLAGS="-cudart shared --no-compress" \
+     CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" \
+     python setup.py install
+
+@@ -93,3 +93,13 @@ WORKDIR /workspace
+ FROM official as dev
+ # Should override the already installed version from the official-image stage
+ COPY --from=build /opt/conda /opt/conda
++RUN apt-get update && apt-get install -y --no-install-recommends \
++        rpcbind \
++        git \
++        automake \
++        libtool \
++        libssl-dev \
++        inetutils-ping \
++        vim \
++        gdb && \
++    rm -rf /var/lib/apt/lists/*
+diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt
+index 3055e290094..4cc14c794b0 100644
+--- a/aten/src/ATen/CMakeLists.txt
++++ b/aten/src/ATen/CMakeLists.txt
+@@ -458,7 +458,7 @@ if(USE_CUDA AND NOT USE_ROCM)
+   endif()
+   if($ENV{ATEN_STATIC_CUDA})
+     list(APPEND ATen_CUDA_DEPENDENCY_LIBS "${CUDA_TOOLKIT_ROOT_DIR}/lib64/libculibos.a")
+-    list(APPEND ATen_CUDA_DEPENDENCY_LIBS "${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcudart_static.a")
++    list(APPEND ATen_CUDA_DEPENDENCY_LIBS "${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcudart.so")
+   endif($ENV{ATEN_STATIC_CUDA})
+ endif()
+```
+`third_party/nccl/nccl`
+```
+diff --git a/makefiles/common.mk b/makefiles/common.mk
+index 1a1c2b6..c781b39 100644
+--- a/makefiles/common.mk
++++ b/makefiles/common.mk
+@@ -54,7 +54,7 @@ CXXFLAGS   := -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisi                                                                                             
+ # Maxrregcount needs to be set accordingly to NCCL_MAX_NTHREADS (otherwise it will cause kernel launch errors)                                                                                
+ # 512 : 120, 640 : 96, 768 : 80, 1024 : 60
+ # We would not have to set this if we used __launch_bounds__, but this only works on kernels, not on functions.                                                                               
+-NVCUFLAGS  := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all                                                                 
++NVCUFLAGS  := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 --no-compress -cudart shared                                                           
+ # Use addprefix so that we can specify more than one path
+ NVLDFLAGS  := -L${CUDA_LIB} -lcudart -lrt
+```
+
+Avoid `CMake Error: File /opt/pytorch/build_variables.bzl does not exist.` (https://github.com/pytorch/pytorch/pull/85947):
+```
+diff --git a/.gitignore b/.gitignore
+index 3e6f3831c4c..db6d9c3527e 100644
+--- a/.gitignore
++++ b/.gitignore
+@@ -214,6 +214,7 @@ build_host_protoc
+ build_android
+ build_ios
+ /build_*
++!/build_variables.bzl
+ .build_debug/*
+ .build_release/*
+ .build_profile/*
+```
+
+build pytorch
+```
+# only necessary when building on an NFS share
+EXTRA_DOCKER_BUILD_FLAGS='--storage-opt "overlay.mount_program=/usr/bin/fuse-overlayfs"'
+
+make -f docker.Makefile
+```
+
+launch docker container, torch
+```
+sudo docker run --gpus all --rm -it -v /home/eiling/projects/cricket:/cricket --ipc=host pytorch:latest
+REMOTE_GPU_ADDRESS=<cricket server address> LD_PRELOAD=cricket/cpu/cricket-client.so python3
+
+LD_LIBRARY_PATH=/cricket/cpu gdb -x /cricket/tests/gdb_client_cmds python3
+(gdb) run /cricket/tests/test_apps/pytorch_minimal.py 
+```
+

From 5c647484e2d4f97be463d76fa712f4a56d22fd6d Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Wed, 7 Jun 2023 14:21:34 +0200
Subject: [PATCH 50/83] fix elf decompression handling padding wrong in some
 circumstances

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 cpu/cpu-elf2.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/cpu/cpu-elf2.c b/cpu/cpu-elf2.c
index 21e80fb3..98f576c2 100644
--- a/cpu/cpu-elf2.c
+++ b/cpu/cpu-elf2.c
@@ -234,6 +234,7 @@ static size_t decompress(const uint8_t* input, size_t input_size, uint8_t* outpu
 #endif
         opos += next_clen;
     }
+    LOGE(LOG_DEBUG, "ipos: %#zx, opos: %#zx, ilen: %#zx, olen: %#zx", ipos, opos, input_size, output_size);
     return opos;
 }
 
@@ -328,7 +329,7 @@ static ssize_t decompress_single_section(const uint8_t *input, uint8_t **output,
     size_t input_read = 0;
     size_t output_written = 0;
     size_t decompress_ret = 0;
-    const uint8_t zeroes[6] = {0};
+    const uint8_t zeroes[8] = {0};
 
     if (input == NULL || output == NULL || eh == NULL || th == NULL) {
         LOGE(LOG_ERROR, "invalid parameters");
@@ -346,12 +347,16 @@ static ssize_t decompress_single_section(const uint8_t *input, uint8_t **output,
     if ((decompress_ret = decompress(input, th->compressed_size, *output, th->decompressed_size)) != th->decompressed_size) {
         LOGE(LOG_ERROR, "Decompression failed: decompressed size is %#zx, but header says %#zx", 
                 decompress_ret, th->decompressed_size);
-        //goto error;
+        LOGE(LOG_ERROR, "input pos: %#zx, output pos: %#zx", input - (uint8_t*)eh, *output);
+        hexdump(input, 0x160);
+        if (decompress_ret >= 0x160)
+            hexdump((*output), 0x160);
+        goto error;
     }
     input_read += th->compressed_size;
     output_written += th->decompressed_size;
 
-    padding = (8 - (size_t)(input + input_read) % 8);
+    padding = ((8 - (size_t)(input + input_read)) % 8);
     if (memcmp(input + input_read, zeroes, padding) != 0) {
         LOGE(LOG_ERROR, "expected %#zx zero bytes, got:", padding);
         hexdump(input + input_read, 0x60);

From 1c7d39f74244d3e7588a32eb35c4533a1d8862d3 Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Wed, 7 Jun 2023 15:52:34 +0200
Subject: [PATCH 51/83] fix decompression not working for long uncompressed lz4
 segments

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 cpu/cpu-elf2.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/cpu/cpu-elf2.c b/cpu/cpu-elf2.c
index 98f576c2..5e7531e1 100644
--- a/cpu/cpu-elf2.c
+++ b/cpu/cpu-elf2.c
@@ -189,7 +189,9 @@ static size_t decompress(const uint8_t* input, size_t input_size, uint8_t* outpu
         next_nclen = (input[ipos] & 0xf0) >> 4;
         next_clen = 4 + (input[ipos] & 0xf);
         if (next_nclen == 0xf) {
-            next_nclen += input[++ipos];
+            do {
+                next_nclen += input[++ipos];
+            } while (input[ipos] == 0xff);
         }
         
         if (memcpy(output + opos, input + (++ipos), next_nclen) == NULL) {
@@ -349,8 +351,8 @@ static ssize_t decompress_single_section(const uint8_t *input, uint8_t **output,
                 decompress_ret, th->decompressed_size);
         LOGE(LOG_ERROR, "input pos: %#zx, output pos: %#zx", input - (uint8_t*)eh, *output);
         hexdump(input, 0x160);
-        if (decompress_ret >= 0x160)
-            hexdump((*output), 0x160);
+        if (decompress_ret >= 0x60)
+            hexdump((*output) + decompress_ret - 0x60, 0x60);
         goto error;
     }
     input_read += th->compressed_size;

From c9f09b9a5a53af41d93f2b6acfd0ee8550ec60b6 Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Mon, 12 Jun 2023 14:40:10 +0200
Subject: [PATCH 52/83] fix potential segfault because of missing variadic
 parameter in logging

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 cpu/cpu-client-driver.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpu/cpu-client-driver.c b/cpu/cpu-client-driver.c
index 149b2ce7..5c1e4b74 100644
--- a/cpu/cpu-client-driver.c
+++ b/cpu/cpu-client-driver.c
@@ -455,7 +455,7 @@ CUresult cuModuleGetFunction(CUfunction* hfun, CUmodule hmod, const char* name)
 	}
     *hfun = (CUfunction)result.ptr_result_u.ptr;
     if ((info = utils_search_info(&kernel_infos, (char*)name)) == NULL) {
-        LOGE(LOG_ERROR, "cannot find kernel %s kernel_info_t");
+        LOGE(LOG_ERROR, "cannot find kernel %s kernel_info_t", name);
         return CUDA_ERROR_UNKNOWN;
     }
     info->host_fun = *hfun;

From c709acf81f655cf882eba2b3758f8d376b08e8ed Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Mon, 12 Jun 2023 14:40:46 +0200
Subject: [PATCH 53/83] use uint64_t for decompressions to fix overflowing of
 range and length specifiers for very long compressed segments

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 cpu/cpu-elf2.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cpu/cpu-elf2.c b/cpu/cpu-elf2.c
index 5e7531e1..dd0abc35 100644
--- a/cpu/cpu-elf2.c
+++ b/cpu/cpu-elf2.c
@@ -181,9 +181,9 @@ static int get_text_header(const uint8_t* fatbin_data, size_t fatbin_size, struc
 static size_t decompress(const uint8_t* input, size_t input_size, uint8_t* output, size_t output_size)
 {
     size_t ipos = 0, opos = 0;  
-    uint16_t next_nclen;  // length of next non-compressed segment
-    uint16_t next_clen;   // length of next compressed segment
-    uint16_t back_offset; // negative offset where redudant data is located, relative to current opos
+    uint64_t next_nclen;  // length of next non-compressed segment
+    uint64_t next_clen;   // length of next compressed segment
+    uint64_t back_offset; // negative offset where redundant data is located, relative to current opos
 
     while (ipos < input_size) {
         next_nclen = (input[ipos] & 0xf0) >> 4;

From 56ce0602d7dc49ae92a6d63836bb8ab9e172a0b6 Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Mon, 12 Jun 2023 14:57:59 +0200
Subject: [PATCH 54/83] update docs to not deactivate compression as we now
 support compressed pytorch kernels

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 docs/pytorch.md | 48 ++++++++++++++++++++++++++----------------------
 1 file changed, 26 insertions(+), 22 deletions(-)

diff --git a/docs/pytorch.md b/docs/pytorch.md
index 517d2ef9..1ef0865c 100644
--- a/docs/pytorch.md
+++ b/docs/pytorch.md
@@ -12,23 +12,8 @@ patch sources.
 - link cudart dynamically when building ATen
 - link cudart dynamically when building nccl
 - deactivate building for some old cuda versions. (optional)
-- add cricket and dependencies to dockerfile
-- deactivate compression we do not fully support fatbin compression
-- remove compression from nccl as well
+- add cricket dependencies to dockerfile
 ```
-diff --git a/CMakeLists.txt b/CMakeLists.txt
-index e2e1f69457e..f6e5542f341 100644
---- a/CMakeLists.txt
-+++ b/CMakeLists.txt
-@@ -554,7 +554,6 @@ if(MSVC)
-   string(APPEND CMAKE_CUDA_FLAGS " -Xcompiler /w -w")
- endif(MSVC)
-
--string(APPEND CMAKE_CUDA_FLAGS " -Xfatbin -compress-all")
-
- if(NOT MSVC)
-   string(APPEND CMAKE_CUDA_FLAGS_DEBUG " -g -lineinfo --source-in-ptx")
-
 diff --git a/Dockerfile b/Dockerfile
 index 815a9108ce9..53ec7689493 100644
 --- a/Dockerfile
@@ -38,7 +23,7 @@ index 815a9108ce9..53ec7689493 100644
  COPY --from=submodule-update /opt/pytorch /opt/pytorch
  RUN --mount=type=cache,target=/opt/ccache \
 -    TORCH_CUDA_ARCH_LIST="3.5 5.2 6.0 6.1 7.0+PTX 8.0" TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \
-+    TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0 8.0" TORCH_NVCC_FLAGS="-cudart shared --no-compress" \
++    TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0 8.0" TORCH_NVCC_FLAGS="-Xfatbin -compress-all -cudart shared" \
      CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" \
      python setup.py install
 
@@ -80,9 +65,26 @@ index 1a1c2b6..c781b39 100644
  # 512 : 120, 640 : 96, 768 : 80, 1024 : 60
  # We would not have to set this if we used __launch_bounds__, but this only works on kernels, not on functions.                                                                               
 -NVCUFLAGS  := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all                                                                 
-+NVCUFLAGS  := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 --no-compress -cudart shared                                                           
++NVCUFLAGS  := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -cudart shared                                                           
  # Use addprefix so that we can specify more than one path
- NVLDFLAGS  := -L${CUDA_LIB} -lcudart -lrt
+-NVLDFLAGS  := -L${CUDA_LIB} -lcudart -lrt
++NVLDFLAGS  := -L${CUDA_LIB} -lcudart -lrt -cudart shared
+ 
+ ########## GCOV ##########
+ GCOV ?= 0 # disable by default.
+diff --git a/src/Makefile b/src/Makefile
+index d658c35..5bd9876 100644
+--- a/src/Makefile
++++ b/src/Makefile
+@@ -28,7 +28,7 @@ LIBDIR := $(BUILDDIR)/lib
+ OBJDIR := $(BUILDDIR)/obj
+ PKGDIR := $(BUILDDIR)/lib/pkgconfig
+ ##### target files
+-CUDARTLIB  ?= cudart_static
++CUDARTLIB  ?= cudart
+ INCTARGETS := $(INCEXPORTS:%=$(INCDIR)/%)
+ LIBSONAME  := $(LIBNAME:%=%.$(NCCL_MAJOR))
+ LIBTARGET  := $(LIBNAME:%=%.$(NCCL_MAJOR).$(NCCL_MINOR).$(NCCL_PATCH))
 ```
 
 Avoid `CMake Error: File /opt/pytorch/build_variables.bzl does not exist.` (https://github.com/pytorch/pytorch/pull/85947):
@@ -111,9 +113,11 @@ make -f docker.Makefile
 
 launch docker container, torch
 ```
-sudo docker run --gpus all --rm -it -v /home/eiling/projects/cricket:/cricket --ipc=host pytorch:latest
-REMOTE_GPU_ADDRESS=<cricket server address> LD_PRELOAD=cricket/cpu/cricket-client.so python3
-
+sudo docker run --gpus all --rm -it -v <path-to-cricket>/cricket:/cricket --ipc=host pytorch:latest
+LD_LIBRARY_PATH=/cricket/cpu REMOTE_GPU_ADDRESS=<cricket server address> LD_PRELOAD=/cricket/cpu/cricket-client.so python3 /cricket/tests/test_apps/pytorch_minimal.py
+```
+or under gdb supervision:
+```
 LD_LIBRARY_PATH=/cricket/cpu gdb -x /cricket/tests/gdb_client_cmds python3
 (gdb) run /cricket/tests/test_apps/pytorch_minimal.py 
 ```

From da4682e5afdbd033d55d6c36100fcf959478bced Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Tue, 13 Jun 2023 09:29:51 +0200
Subject: [PATCH 55/83] add v2 implementation of cudaGetDeviceProperties

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 cpu/cpu-client-runtime.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/cpu/cpu-client-runtime.c b/cpu/cpu-client-runtime.c
index cb22b142..b33459e3 100644
--- a/cpu/cpu-client-runtime.c
+++ b/cpu/cpu-client-runtime.c
@@ -360,6 +360,11 @@ cudaError_t cudaGetDeviceProperties(struct cudaDeviceProp* prop, int device)
     }
     return result.err;
 }
+cudaError_t cudaGetDeviceProperties_v2(struct cudaDeviceProp* prop, int device)
+{
+    return cudaGetDeviceProperties(prop, device);
+}
+
 
 DEF_FN(cudaError_t, cudaIpcCloseMemHandle, void*, devPtr)
 DEF_FN(cudaError_t, cudaIpcGetEventHandle, cudaIpcEventHandle_t*, handle, cudaEvent_t, event)

From 9f4e797d4b2a9a2115037bb1f1ce4a0c10686773 Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Tue, 13 Jun 2023 12:24:04 +0200
Subject: [PATCH 56/83] add libgl dependency to pytorch documentation

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 docs/pytorch.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/pytorch.md b/docs/pytorch.md
index 1ef0865c..dbf54a9e 100644
--- a/docs/pytorch.md
+++ b/docs/pytorch.md
@@ -39,6 +39,7 @@ index 815a9108ce9..53ec7689493 100644
 +        libssl-dev \
 +        inetutils-ping \
 +        vim \
++        libgl1-mesa-dev \
 +        gdb && \
 +    rm -rf /var/lib/apt/lists/*
 diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt

From 8de9fb8a96eefed1182ae7ee771e88581f0eaada Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Tue, 13 Jun 2023 12:24:30 +0200
Subject: [PATCH 57/83] improve support for cuGetProcAddress

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 cpu/cpu-client-driver.c | 20 +++++++++-----------
 cpu/cpu-elf2.c          |  6 ++++++
 cpu/cpu-elf2.h          |  2 +-
 3 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/cpu/cpu-client-driver.c b/cpu/cpu-client-driver.c
index 5c1e4b74..7d639df2 100644
--- a/cpu/cpu-client-driver.c
+++ b/cpu/cpu-client-driver.c
@@ -15,6 +15,7 @@
 #include "cpu_rpc_prot.h"
 #include "cpu-common.h"
 #include "cpu-utils.h"
+#include "cpu-elf2.h"
 
 
 //DEF_FN(CUresult, cuProfilerInitialize, const char*, configFile, const char*, outputFile, CUoutput_mode, outputMode)
@@ -842,17 +843,14 @@ CUresult cuGetProcAddress(const char* symbol, void** pfn, int cudaVersion, cuuin
     ptr_result result;
     LOGE(LOG_DEBUG, "%s(%s, %d, %llx)", __FUNCTION__, symbol, cudaVersion, flags);
 
-    *pfn = NULL;
-    *symbolStatus = CU_GET_PROC_ADDRESS_VERSION_NOT_SUFFICIENT;
-	// if (retval != RPC_SUCCESS) {
-	// 	fprintf(stderr, "[rpc] %s failed.", __FUNCTION__);
-    //     return CUDA_ERROR_UNKNOWN;
-	// }
-    // if (pStr != NULL) {
-    //    if ((*pStr = malloc(128)) != NULL) {
-    //        strncpy((char*)(*pStr), result.str_result_u.str, 128);
-    //     }
-    // }
+    *pfn = elf2_symbol_address(symbol); // dlsym-based lookup of the requested symbol
+    if (*pfn == NULL) { // lookup failed: warn and report an error to the caller
+        LOGE(LOG_WARNING, "symbol %s not found.", symbol);
+        return CUDA_ERROR_UNKNOWN;
+    }
+    // Pytorch uses the 11.3 API of this function which does not have the symbolStatus parameter
+    // Because we do not support API versioning yet and to avoid segfaults, we ignore this parameter for now.
+    //*symbolStatus = CU_GET_PROC_ADDRESS_VERSION_NOT_SUFFICIENT;
     return cudaSuccess;
 }
 #endif
\ No newline at end of file
diff --git a/cpu/cpu-elf2.c b/cpu/cpu-elf2.c
index dd0abc35..37a1e486 100644
--- a/cpu/cpu-elf2.c
+++ b/cpu/cpu-elf2.c
@@ -7,6 +7,7 @@
 #include <string.h>
 #include <libelf.h>
 #include <gelf.h>
+#include <dlfcn.h>
 
 #include "cpu-common.h"
 #include "log.h"
@@ -992,4 +993,9 @@ int elf2_parameter_info(list *kernel_infos, void* memory, size_t memsize)
         elf_end(elf);
     }
     return ret;
+}
+
+void* elf2_symbol_address(const char *symbol)
+{
+    return dlsym(RTLD_DEFAULT, symbol);
 }
\ No newline at end of file
diff --git a/cpu/cpu-elf2.h b/cpu/cpu-elf2.h
index a170cb37..4223498e 100644
--- a/cpu/cpu-elf2.h
+++ b/cpu/cpu-elf2.h
@@ -19,7 +19,7 @@ int elf2_init(void);
 int elf2_get_fatbin_info(const struct fat_header *fatbin, list *kernel_infos, uint8_t** fatbin_mem, size_t* fatbin_size);
 
 int elf2_parameter_info(list *kernel_infos, void* memory, size_t memsize);
-void* elf2_symbol_address(const char* file, char *symbol);
+void* elf2_symbol_address(const char *symbol);
 //int elf2_contains_kernel(void* memory, size_t memsize);
 
 #endif //_ELF_H_

From d41d195df2b697c396dbf45270495f04aeb7ecea Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Wed, 14 Jun 2023 14:43:05 +0200
Subject: [PATCH 58/83] add cuDNN tests to tests/samples

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 tests/samples/Makefile  | 14 ++++++++++++++
 utils/Dockerfile        |  3 ++-
 utils/Dockerfile.cuda10 |  3 ++-
 utils/Dockerfile.cuda11 |  3 ++-
 4 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/tests/samples/Makefile b/tests/samples/Makefile
index ab1ae4aa..97c3bcc2 100644
--- a/tests/samples/Makefile
+++ b/tests/samples/Makefile
@@ -11,6 +11,7 @@ CUDA_PATH = /usr/local/cuda
 SMS = 75 60
 CUDA_SAMPLES_RELEASE ?= 12.1
 CUDA_SAMPLES_URL = https://github.com/NVIDIA/cuda-samples/archive/refs/tags/v${CUDA_SAMPLES_RELEASE}.tar.gz
+CUDNN_SAMPLES_URL = https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/libcudnn8-samples-8.9.2.26-1.cuda12.1.x86_64.rpm
 
 PWD = $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
 
@@ -22,9 +23,22 @@ samples:
 	mkdir -p $@
 	wget ${CUDA_SAMPLES_URL} -O - | tar -xz --strip-components=1 -C $@
 
+cudnn-samples:
+	mkdir -p $@
+	wget ${CUDNN_SAMPLES_URL} -O - | rpm2archive - | tar zxf - --strip-components=4 -C $@
+
 samples-bin:
 	mkdir -p $@
 
+samples-bin/mnistCUDNN.sample : | cudnn-samples samples-bin # directories are order-only: they must exist, but their mtimes must not force a rebuild
+	$(MAKE) -C cudnn-samples/mnistCUDNN \
+		clean
+	$(MAKE) -C cudnn-samples/mnistCUDNN \
+		NVCCFLAGS="-cudart shared --no-compress -g -G" \
+		SMS="${SMS}" \
+		CUDA_PATH=${CUDA_PATH}
+	cp cudnn-samples/mnistCUDNN/mnistCUDNN $@
+
 samples-bin/nbody.uncompressed.sample : samples samples-bin
 	make -C samples/Samples/5_Domain_Specific/nbody \
 		clean
diff --git a/utils/Dockerfile b/utils/Dockerfile
index 1f981e11..a31884fe 100644
--- a/utils/Dockerfile
+++ b/utils/Dockerfile
@@ -20,7 +20,8 @@ RUN dnf install -y make bash git gcc autoconf libtool automake rpcgen \
                    libvdpau-devel mesa-libEGL-devel openssl-devel rpcbind \
                    texinfo bison flex python3 which libibverbs libibverbs-devel \
                    libasan cppcheck wget expat-devel xz-devel mesa-libGLU-devel freeglut-devel \
-                   elfutils-libelf-devel cpio openssl-devel openssl-libs
+                   elfutils-libelf-devel cpio openssl-devel openssl-libs \
+				   freeimage freeimage-devel
 
 ENV LD_LIBRARY_PATH="/usr/local/lib:/usr/local/lib64:${LD_LIBRARY_PATH}"
 
diff --git a/utils/Dockerfile.cuda10 b/utils/Dockerfile.cuda10
index 5654597f..02ff496f 100644
--- a/utils/Dockerfile.cuda10
+++ b/utils/Dockerfile.cuda10
@@ -21,7 +21,8 @@ RUN dnf install -y make bash git gcc autoconf libtool automake rpcgen \
                    ncurses-devel zlib-devel binutils-devel mesa-libGL-devel \
                    libvdpau-devel mesa-libEGL-devel openssl-devel rpcbind \
                    texinfo bison flex python3 which libibverbs libasan \
-                   cppcheck wget expat-devel xz-devel elfutils-libelf-devel
+                   cppcheck wget expat-devel xz-devel elfutils-libelf-devel \
+                   freeimage freeimage-devel
 
 ENV LD_LIBRARY_PATH="/usr/local/lib:/usr/local/lib64:${LD_LIBRARY_PATH}"
 
diff --git a/utils/Dockerfile.cuda11 b/utils/Dockerfile.cuda11
index 12f2944d..a31eb196 100644
--- a/utils/Dockerfile.cuda11
+++ b/utils/Dockerfile.cuda11
@@ -22,7 +22,8 @@ RUN dnf install -y make bash git gcc autoconf libtool automake rpcgen \
                    libvdpau-devel mesa-libEGL-devel openssl-devel rpcbind \
                    texinfo bison flex python3 which libibverbs libibverbs-devel \
                    libasan cppcheck wget expat-devel xz-devel mesa-libGLU-devel freeglut-devel \
-                   elfutils-libelf-devel cpio openssl-devel openssl-libs
+                   elfutils-libelf-devel cpio openssl-devel openssl-libs \
+                   freeimage freeimage-devel
 
 ENV LD_LIBRARY_PATH="/usr/local/lib:/usr/local/lib64:${LD_LIBRARY_PATH}"
 

From 523d86e9927825dec6c08ad6a03dc76ff2b98803 Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Wed, 14 Jun 2023 14:44:00 +0200
Subject: [PATCH 59/83] use fixed size rpc array instead of opaque variable
 length array for cudaDeviceProp

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 cpu/cpu-client-driver.c  |  2 +-
 cpu/cpu-client-runtime.c |  9 +++------
 cpu/cpu-server-runtime.c | 13 ++++---------
 cpu/cpu_rpc_prot.x       | 10 +++++++++-
 4 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/cpu/cpu-client-driver.c b/cpu/cpu-client-driver.c
index 7d639df2..1c6d8590 100644
--- a/cpu/cpu-client-driver.c
+++ b/cpu/cpu-client-driver.c
@@ -845,7 +845,7 @@ CUresult cuGetProcAddress(const char* symbol, void** pfn, int cudaVersion, cuuin
 
     *pfn = elf2_symbol_address(symbol);
     if (*pfn == NULL) {
-        LOGE(LOG_WARNING, "symbol %s found.", symbol);
+        LOGE(LOG_WARNING, "symbol %s not found.", symbol);
         return CUDA_ERROR_UNKNOWN;
     }
     // Pytorch uses the 11.3 API of this function which does not have the symbolStatus parameter
diff --git a/cpu/cpu-client-runtime.c b/cpu/cpu-client-runtime.c
index b33459e3..1c6316b6 100644
--- a/cpu/cpu-client-runtime.c
+++ b/cpu/cpu-client-runtime.c
@@ -335,7 +335,7 @@ cudaError_t cudaGetDeviceProperties(struct cudaDeviceProp* prop, int device)
 #ifdef WITH_API_CNT
     api_call_cnt++;
 #endif //WITH_API_CNT
-    mem_result result;
+    cuda_device_prop_result result;
     enum clnt_stat retval;
     if (prop == NULL) {
         LOGE(LOG_ERROR, "error: prop == NULL");
@@ -348,18 +348,15 @@ cudaError_t cudaGetDeviceProperties(struct cudaDeviceProp* prop, int device)
     if (result.err != 0) {
         return result.err;
     }
-    if (result.mem_result_u.data.mem_data_len != sizeof(struct cudaDeviceProp)) {
-        LOGE(LOG_ERROR, "error: expected size != retrieved size");
-        return result.err;
-    }
     // if (memcpy(prop, result.mem_result_u.data.mem_data_val, sizeof(struct cudaDeviceProp)) == NULL) {
     //FIXME: Don't know why, but pytorch expects a different definition of cudaDeviceProp, which is only 728 bytes long
-    if (memcpy(prop, result.mem_result_u.data.mem_data_val, 728) == NULL) {
+    if (memcpy(prop, result.cuda_device_prop_result_u.data, 728) == NULL) {
         LOGE(LOG_ERROR, "error: memcpy failed");
         return result.err;
     }
     return result.err;
 }
+
 cudaError_t cudaGetDeviceProperties_v2(struct cudaDeviceProp* prop, int device)
 {
     return cudaGetDeviceProperties(prop, device);
diff --git a/cpu/cpu-server-runtime.c b/cpu/cpu-server-runtime.c
index d939e0b0..66cc937a 100644
--- a/cpu/cpu-server-runtime.c
+++ b/cpu/cpu-server-runtime.c
@@ -360,19 +360,14 @@ bool_t cuda_get_device_flags_1_svc(int_result *result, struct svc_req *rqstp)
     return 1;
 }
 
-bool_t cuda_get_device_properties_1_svc(int device, mem_result *result, struct svc_req *rqstp)
+bool_t cuda_get_device_properties_1_svc(int device, cuda_device_prop_result *result, struct svc_req *rqstp)
 {
     LOGE(LOG_DEBUG, "cudaGetDeviceProperties");
-    result->mem_result_u.data.mem_data_val = malloc(sizeof(struct cudaDeviceProp));
-    if (result->mem_result_u.data.mem_data_val == NULL) {
-        LOGE(LOG_ERROR, "malloc failed.");
+    if (sizeof(result->cuda_device_prop_result_u.data) != sizeof(struct cudaDeviceProp)) {
+        LOGE(LOG_ERROR, "cuda_device_prop_result size mismatch");
         return 0;
     }
-    result->mem_result_u.data.mem_data_len = sizeof(struct cudaDeviceProp);
-    result->err = cudaGetDeviceProperties((void*)result->mem_result_u.data.mem_data_val, device);
-    if (result->err != 0) {
-        free(result->mem_result_u.data.mem_data_val);
-    }
+    result->err = cudaGetDeviceProperties((void*)result->cuda_device_prop_result_u.data, device);
     return 1;
 }
 
diff --git a/cpu/cpu_rpc_prot.x b/cpu/cpu_rpc_prot.x
index 07453e92..5619be66 100644
--- a/cpu/cpu_rpc_prot.x
+++ b/cpu/cpu_rpc_prot.x
@@ -1,6 +1,7 @@
 typedef opaque mem_data<>;
 typedef unsigned hyper size_t;
 typedef unsigned hyper ptr;
+typedef opaque rpc_cuda_device_prop[1032];
 
 struct dint {
     int i1;
@@ -122,6 +123,13 @@ default:
     void;
 };
 
+union cuda_device_prop_result switch (int err) {
+case 0:
+    rpc_cuda_device_prop data;
+default:
+    void;
+};
+
 program RPC_CD_PROG {
     version RPC_CD_VERS {
         int          rpc_checkpoint(void)                                         = 0;
@@ -154,7 +162,7 @@ program RPC_CD_PROG {
         int_result   CUDA_GET_DEVICE(void)                                      = 117;
         int_result   CUDA_GET_DEVICE_COUNT(void)                                = 118;
         int_result   CUDA_GET_DEVICE_FLAGS(void)                                = 119;
-        mem_result   CUDA_GET_DEVICE_PROPERTIES(int)                            = 120;
+        cuda_device_prop_result CUDA_GET_DEVICE_PROPERTIES(int)                 = 120;
         /*int        CUDA_IPC_CLOSE_MEM_HANDLE(ptr)                             = 121;*/
         /*ptr_result CUDA_IPC_GET_EVENT_HANDLE(int)                             = 122;*/
         /*ptr_result CUDA_IPC_GET_MEM_HANDLE(ptr)                               = 123;*/

From d786d9c4b06629026e3de15940487309ae4545d0 Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Fri, 16 Jun 2023 17:12:01 +0200
Subject: [PATCH 60/83] add cuDNN API stubs

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 cpu/Makefile            |   3 +-
 cpu/cpu-client-cudnn.c  | 114 ++++++++++++++++++++++++++++++++++++++++
 cpu/cpu-client-driver.c |  16 +++++-
 cpu/cpu-client.c        |   1 +
 cpu/cpu-server-driver.c |   9 ++++
 cpu/cpu_rpc_prot.x      |   1 +
 6 files changed, 141 insertions(+), 3 deletions(-)
 create mode 100644 cpu/cpu-client-cudnn.c

diff --git a/cpu/Makefile b/cpu/Makefile
index ed23f83f..78d0425a 100644
--- a/cpu/Makefile
+++ b/cpu/Makefile
@@ -59,7 +59,8 @@ SRC_CLIENT = $(RPC_XDR)                 \
 			 oob.c 					    \
 			 mt-memcpy.c				\
 			 cpu-elf2.c					\
-			 cpu-client-nvml.c
+			 cpu-client-nvml.c          \
+			 cpu-client-cudnn.c
 
 # 			 cpu-client-driver-hidden.c \
 
diff --git a/cpu/cpu-client-cudnn.c b/cpu/cpu-client-cudnn.c
new file mode 100644
index 00000000..e3ab5f3f
--- /dev/null
+++ b/cpu/cpu-client-cudnn.c
@@ -0,0 +1,114 @@
+#include <cuda_runtime.h>
+#include <cudnn.h>
+#include <stdint.h>
+
+#include "cpu-libwrap.h"
+#include "cpu_rpc_prot.h"
+#include "cpu-common.h"
+#include "cpu-utils.h"
+#include "log.h"
+
+DEF_FN(size_t, cudnnGetVersion, void)
+DEF_FN(size_t, cudnnGetMaxDeviceVersion, void)
+DEF_FN(size_t, cudnnGetCudartVersion, void)
+DEF_FN(const char *, cudnnGetErrorString, cudnnStatus_t, status)
+DEF_FN(cudnnStatus_t, cudnnQueryRuntimeError, cudnnHandle_t, handle, cudnnStatus_t*, rstatus, cudnnErrQueryMode_t, mode, cudnnRuntimeTag_t *, tag)
+DEF_FN(cudnnStatus_t, cudnnGetProperty, libraryPropertyType, type, int *, value)
+DEF_FN(cudnnStatus_t, cudnnCreate, cudnnHandle_t*, handle)
+DEF_FN(cudnnStatus_t, cudnnDestroy, cudnnHandle_t, handle)
+DEF_FN(cudnnStatus_t, cudnnSetStream, cudnnHandle_t, handle, cudaStream_t, streamId)
+DEF_FN(cudnnStatus_t, cudnnGetStream, cudnnHandle_t, handle, cudaStream_t *, streamId)
+DEF_FN(cudnnStatus_t, cudnnCreateTensorDescriptor, cudnnTensorDescriptor_t *, tensorDesc)
+DEF_FN(cudnnStatus_t, cudnnSetTensor4dDescriptor, cudnnTensorDescriptor_t, tensorDesc, cudnnTensorFormat_t, format, cudnnDataType_t, dataType, int, n, int, c, int, h, int, w) 
+DEF_FN(cudnnStatus_t, cudnnSetTensor4dDescriptorEx, cudnnTensorDescriptor_t, tensorDesc, cudnnDataType_t, dataType, int, n, int, c, int, h, int, w, int, nStride, int, cStride, int, hStride, int, wStride)
+DEF_FN(cudnnStatus_t, cudnnGetTensor4dDescriptor, const cudnnTensorDescriptor_t, tensorDesc, cudnnDataType_t *, dataType, int*, n, int*, c, int*, h, int*, w, int*, nStride, int*, cStride, int*, hStride, int*, wStride)
+DEF_FN(cudnnStatus_t, cudnnSetTensorNdDescriptor, cudnnTensorDescriptor_t, tensorDesc, cudnnDataType_t, dataType, int, nbDims, const int*, dimA, const int*, strideA)
+DEF_FN(cudnnStatus_t, cudnnSetTensorNdDescriptorEx, cudnnTensorDescriptor_t, tensorDesc, cudnnTensorFormat_t, format, cudnnDataType_t, dataType, int, nbDims, const int*, dimA)
+DEF_FN(cudnnStatus_t, cudnnGetTensorNdDescriptor, const cudnnTensorDescriptor_t, tensorDesc, int, nbDimsRequested, cudnnDataType_t *, dataType, int*, nbDims, int*, dimA, int*, strideA)
+DEF_FN(cudnnStatus_t, cudnnGetTensorSizeInBytes, const cudnnTensorDescriptor_t, tensorDesc, size_t*, size)
+DEF_FN(cudnnStatus_t, cudnnDestroyTensorDescriptor, cudnnTensorDescriptor_t, tensorDesc)
+DEF_FN(cudnnStatus_t, cudnnInitTransformDest, const cudnnTensorTransformDescriptor_t, transformDesc, const cudnnTensorDescriptor_t, srcDesc, cudnnTensorDescriptor_t, destDesc, size_t*, destSizeInBytes)
+DEF_FN(cudnnStatus_t, cudnnCreateTensorTransformDescriptor, cudnnTensorTransformDescriptor_t *, transformDesc)
+DEF_FN(cudnnStatus_t, cudnnSetTensorTransformDescriptor, cudnnTensorTransformDescriptor_t, transformDesc, const uint32_t, nbDims, const cudnnTensorFormat_t, destFormat, const int32_t*, padBeforeA, const int32_t*, padAfterA, const uint32_t*, foldA, const cudnnFoldingDirection_t,  direction)
+DEF_FN(cudnnStatus_t, cudnnGetTensorTransformDescriptor, cudnnTensorTransformDescriptor_t, transformDesc, uint32_t, nbDimsRequested, cudnnTensorFormat_t *, destFormat, int32_t*, padBeforeA, int32_t*, padAfterA, uint32_t*, foldA, cudnnFoldingDirection_t *, direction)
+DEF_FN(cudnnStatus_t, cudnnDestroyTensorTransformDescriptor, cudnnTensorTransformDescriptor_t, transformDesc)
+DEF_FN(cudnnStatus_t, cudnnTransformTensor, cudnnHandle_t, handle, const void *, alpha, const cudnnTensorDescriptor_t, xDesc, const void *, x, const void *, beta, const cudnnTensorDescriptor_t, yDesc, void *, y)
+DEF_FN(cudnnStatus_t, cudnnTransformTensorEx, cudnnHandle_t, handle, const cudnnTensorTransformDescriptor_t, transDesc, const void *, alpha, const cudnnTensorDescriptor_t, srcDesc, const void *, srcData, const void *, beta, const cudnnTensorDescriptor_t, destDesc, void *, destData)
+DEF_FN(cudnnStatus_t, cudnnAddTensor, cudnnHandle_t, handle, const void *, alpha, const cudnnTensorDescriptor_t, aDesc, const void *, A, const void *, beta, const cudnnTensorDescriptor_t, cDesc, void *, C)
+DEF_FN(cudnnStatus_t, cudnnCreateOpTensorDescriptor, cudnnOpTensorDescriptor_t *, opTensorDesc)
+DEF_FN(cudnnStatus_t, cudnnSetOpTensorDescriptor, cudnnOpTensorDescriptor_t, opTensorDesc, cudnnOpTensorOp_t, opTensorOp, cudnnDataType_t, opTensorCompType, cudnnNanPropagation_t, opTensorNanOpt)
+DEF_FN(cudnnStatus_t, cudnnGetOpTensorDescriptor, const cudnnOpTensorDescriptor_t, opTensorDesc, cudnnOpTensorOp_t *, opTensorOp, cudnnDataType_t *, opTensorCompType, cudnnNanPropagation_t *, opTensorNanOpt)
+DEF_FN(cudnnStatus_t, cudnnDestroyOpTensorDescriptor, cudnnOpTensorDescriptor_t, opTensorDesc)
+DEF_FN(cudnnStatus_t, cudnnOpTensor, cudnnHandle_t, handle, const cudnnOpTensorDescriptor_t, opTensorDesc, const void *, alpha1, const cudnnTensorDescriptor_t, aDesc, const void *, A, const void *, alpha2, const cudnnTensorDescriptor_t, bDesc, const void *, B, const void *, beta, const cudnnTensorDescriptor_t,  cDesc, void *, C)
+DEF_FN(cudnnStatus_t, cudnnCreateReduceTensorDescriptor, cudnnReduceTensorDescriptor_t *, reduceTensorDesc)
+DEF_FN(cudnnStatus_t, cudnnSetReduceTensorDescriptor, cudnnReduceTensorDescriptor_t, reduceTensorDesc, cudnnReduceTensorOp_t, reduceTensorOp, cudnnDataType_t, reduceTensorCompType, cudnnNanPropagation_t, reduceTensorNanOpt, cudnnReduceTensorIndices_t, reduceTensorIndices, cudnnIndicesType_t, reduceTensorIndicesType)
+DEF_FN(cudnnStatus_t, cudnnGetReduceTensorDescriptor, const cudnnReduceTensorDescriptor_t, reduceTensorDesc, cudnnReduceTensorOp_t *, reduceTensorOp, cudnnDataType_t *, reduceTensorCompType, cudnnNanPropagation_t *, reduceTensorNanOpt, cudnnReduceTensorIndices_t *, reduceTensorIndices, cudnnIndicesType_t *, reduceTensorIndicesType)
+DEF_FN(cudnnStatus_t, cudnnDestroyReduceTensorDescriptor, cudnnReduceTensorDescriptor_t, reduceTensorDesc)
+DEF_FN(cudnnStatus_t, cudnnGetReductionIndicesSize, cudnnHandle_t, handle, const cudnnReduceTensorDescriptor_t, reduceTensorDesc, const cudnnTensorDescriptor_t, aDesc, const cudnnTensorDescriptor_t, cDesc, size_t*, sizeInBytes)
+DEF_FN(cudnnStatus_t, cudnnGetReductionWorkspaceSize, cudnnHandle_t, handle, const cudnnReduceTensorDescriptor_t, reduceTensorDesc, const cudnnTensorDescriptor_t, aDesc, const cudnnTensorDescriptor_t, cDesc, size_t*, sizeInBytes)
+DEF_FN(cudnnStatus_t, cudnnReduceTensor, cudnnHandle_t, handle, const cudnnReduceTensorDescriptor_t, reduceTensorDesc, void *, indices, size_t, indicesSizeInBytes, void *, workspace, size_t, workspaceSizeInBytes, const void *, alpha, const cudnnTensorDescriptor_t, aDesc, const void *, A, const void *, beta, const cudnnTensorDescriptor_t, cDesc, void *, C)
+DEF_FN(cudnnStatus_t, cudnnSetTensor, cudnnHandle_t, handle, const cudnnTensorDescriptor_t, yDesc, void *, y, const void *, valuePtr)
+DEF_FN(cudnnStatus_t, cudnnScaleTensor, cudnnHandle_t, handle, const cudnnTensorDescriptor_t, yDesc, void *, y, const void *, alpha)
+DEF_FN(cudnnStatus_t, cudnnCreateFilterDescriptor, cudnnFilterDescriptor_t *, filterDesc)
+DEF_FN(cudnnStatus_t, cudnnSetFilter4dDescriptor, cudnnFilterDescriptor_t, filterDesc, cudnnDataType_t, dataType, cudnnTensorFormat_t, format, int, k, int, c, int, h, int, w) 
+DEF_FN(cudnnStatus_t, cudnnGetFilter4dDescriptor, const cudnnFilterDescriptor_t, filterDesc, cudnnDataType_t *, dataType, cudnnTensorFormat_t *, format, int*, k, int*, c, int*, h, int*, w) 
+DEF_FN(cudnnStatus_t, cudnnSetFilterNdDescriptor, cudnnFilterDescriptor_t, filterDesc, cudnnDataType_t, dataType, cudnnTensorFormat_t, format, int, nbDims, const int*, filterDimA)
+DEF_FN(cudnnStatus_t, cudnnGetFilterNdDescriptor, const cudnnFilterDescriptor_t, filterDesc, int, nbDimsRequested, cudnnDataType_t *, dataType, cudnnTensorFormat_t *, format, int*, nbDims, int*, filterDimA)
+DEF_FN(cudnnStatus_t, cudnnGetFilterSizeInBytes, const cudnnFilterDescriptor_t, filterDesc, size_t*, size)
+DEF_FN(cudnnStatus_t, cudnnTransformFilter, cudnnHandle_t, handle, const cudnnTensorTransformDescriptor_t, transDesc, const void *, alpha, const cudnnFilterDescriptor_t, srcDesc, const void *, srcData, const void *, beta, const cudnnFilterDescriptor_t, destDesc, void *, destData)
+DEF_FN(cudnnStatus_t, cudnnDestroyFilterDescriptor, cudnnFilterDescriptor_t, filterDesc)
+DEF_FN(cudnnStatus_t, cudnnSoftmaxForward, cudnnHandle_t, handle, cudnnSoftmaxAlgorithm_t, algo, cudnnSoftmaxMode_t, mode, const void *,alpha, const cudnnTensorDescriptor_t, xDesc, const void *, x, const void *, beta, const cudnnTensorDescriptor_t, yDesc, void *, y)
+DEF_FN(cudnnStatus_t, cudnnCreatePoolingDescriptor, cudnnPoolingDescriptor_t *, poolingDesc)
+DEF_FN(cudnnStatus_t, cudnnSetPooling2dDescriptor, cudnnPoolingDescriptor_t, poolingDesc, cudnnPoolingMode_t, mode, cudnnNanPropagation_t, maxpoolingNanOpt, int, windowHeight, int, windowWidth, int, verticalPadding, int, horizontalPadding, int, verticalStride, int, horizontalStride)
+DEF_FN(cudnnStatus_t, cudnnGetPooling2dDescriptor, const cudnnPoolingDescriptor_t, poolingDesc, cudnnPoolingMode_t *, mode, cudnnNanPropagation_t *, maxpoolingNanOpt, int*, windowHeight, int*, windowWidth, int*, verticalPadding, int*, horizontalPadding, int*, verticalStride, int*, horizontalStride)
+DEF_FN(cudnnStatus_t, cudnnSetPoolingNdDescriptor, cudnnPoolingDescriptor_t, poolingDesc, const cudnnPoolingMode_t, mode, const cudnnNanPropagation_t, maxpoolingNanOpt, int, nbDims, const int*, windowDimA, const int*, paddingA, const int*, strideA)
+DEF_FN(cudnnStatus_t, cudnnGetPoolingNdDescriptor, const cudnnPoolingDescriptor_t, poolingDesc, int, nbDimsRequested, cudnnPoolingMode_t *, mode, cudnnNanPropagation_t *, maxpoolingNanOpt, int*, nbDims, int*, windowDimA, int*, paddingA, int*, strideA)
+DEF_FN(cudnnStatus_t, cudnnGetPoolingNdForwardOutputDim, const cudnnPoolingDescriptor_t, poolingDesc, const cudnnTensorDescriptor_t, inputTensorDesc, int, nbDims, int*, outputTensorDimA)
+DEF_FN(cudnnStatus_t, cudnnGetPooling2dForwardOutputDim, const cudnnPoolingDescriptor_t, poolingDesc, const cudnnTensorDescriptor_t, inputTensorDesc, int*, n, int*, c, int*, h, int*, w)
+DEF_FN(cudnnStatus_t, cudnnDestroyPoolingDescriptor, cudnnPoolingDescriptor_t, poolingDesc)
+DEF_FN(cudnnStatus_t, cudnnPoolingForward, cudnnHandle_t, handle, const cudnnPoolingDescriptor_t, poolingDesc, const void *, alpha, const cudnnTensorDescriptor_t, xDesc, const void *, x, const void *, beta, const cudnnTensorDescriptor_t, yDesc, void *, y)
+DEF_FN(cudnnStatus_t, cudnnCreateActivationDescriptor, cudnnActivationDescriptor_t *, activationDesc)
+DEF_FN(cudnnStatus_t, cudnnSetActivationDescriptor, cudnnActivationDescriptor_t, activationDesc, cudnnActivationMode_t, mode, cudnnNanPropagation_t, reluNanOpt, double, coef) 
+DEF_FN(cudnnStatus_t, cudnnGetActivationDescriptor, const cudnnActivationDescriptor_t, activationDesc, cudnnActivationMode_t *, mode, cudnnNanPropagation_t *, reluNanOpt, double *, coef) 
+DEF_FN(cudnnStatus_t, cudnnSetActivationDescriptorSwishBeta, cudnnActivationDescriptor_t, activationDesc, double, swish_beta)
+DEF_FN(cudnnStatus_t, cudnnGetActivationDescriptorSwishBeta, cudnnActivationDescriptor_t, activationDesc, double *, swish_beta)
+DEF_FN(cudnnStatus_t, cudnnDestroyActivationDescriptor, cudnnActivationDescriptor_t, activationDesc)
+DEF_FN(cudnnStatus_t, cudnnActivationForward, cudnnHandle_t, handle, cudnnActivationDescriptor_t, activationDesc, const void *, alpha, const cudnnTensorDescriptor_t, xDesc, const void *, x, const void *, beta, const cudnnTensorDescriptor_t, yDesc, void *, y)
+DEF_FN(cudnnStatus_t, cudnnCreateLRNDescriptor, cudnnLRNDescriptor_t *, normDesc)
+DEF_FN(cudnnStatus_t, cudnnSetLRNDescriptor, cudnnLRNDescriptor_t, normDesc, unsigned, lrnN, double, lrnAlpha, double, lrnBeta, double, lrnK)
+DEF_FN(cudnnStatus_t, cudnnGetLRNDescriptor, cudnnLRNDescriptor_t, normDesc, unsigned *, lrnN, double *, lrnAlpha, double *, lrnBeta, double *, lrnK)
+DEF_FN(cudnnStatus_t, cudnnDestroyLRNDescriptor, cudnnLRNDescriptor_t, lrnDesc)
+DEF_FN(cudnnStatus_t, cudnnLRNCrossChannelForward, cudnnHandle_t, handle, cudnnLRNDescriptor_t, normDesc, cudnnLRNMode_t, lrnMode, const void *, alpha, const cudnnTensorDescriptor_t, xDesc, const void *, x, const void *, beta, const cudnnTensorDescriptor_t, yDesc, void *, y)
+DEF_FN(cudnnStatus_t, cudnnDivisiveNormalizationForward, cudnnHandle_t, handle, cudnnLRNDescriptor_t, normDesc, cudnnDivNormMode_t, mode, const void *, alpha, const cudnnTensorDescriptor_t, xDesc, const void *, x, const void *, means, void *, temp, void *, temp2, const void *, beta, const cudnnTensorDescriptor_t, yDesc, void *, y)
+DEF_FN(cudnnStatus_t, cudnnDeriveBNTensorDescriptor, cudnnTensorDescriptor_t, derivedBnDesc, const cudnnTensorDescriptor_t, xDesc, cudnnBatchNormMode_t, mode)
+DEF_FN(cudnnStatus_t, cudnnBatchNormalizationForwardInference, cudnnHandle_t, handle, cudnnBatchNormMode_t, mode, const void *, alpha, const void *, beta, const cudnnTensorDescriptor_t, xDesc, const void *, x, const cudnnTensorDescriptor_t, yDesc, void *, y, const cudnnTensorDescriptor_t,  bnScaleBiasMeanVarDesc, const void *, bnScale, const void *, bnBias, const void *, estimatedMean, const void *, estimatedVariance, double, epsilon)
+DEF_FN(cudnnStatus_t, cudnnDeriveNormTensorDescriptor, cudnnTensorDescriptor_t, derivedNormScaleBiasDesc, cudnnTensorDescriptor_t, derivedNormMeanVarDesc, const cudnnTensorDescriptor_t, xDesc, cudnnNormMode_t, mode, int, groupCnt) 
+DEF_FN(cudnnStatus_t, cudnnNormalizationForwardInference, cudnnHandle_t, handle, cudnnNormMode_t, mode, cudnnNormOps_t, normOps, cudnnNormAlgo_t, algo, const void *, alpha, const void *, beta, const cudnnTensorDescriptor_t, xDesc, const void *, x, const cudnnTensorDescriptor_t, normScaleBiasDesc, const void *, normScale, const void *, normBias, const cudnnTensorDescriptor_t, normMeanVarDesc, const void *, estimatedMean, const void *, estimatedVariance, const cudnnTensorDescriptor_t, zDesc, const void *, z, cudnnActivationDescriptor_t, activationDesc, const cudnnTensorDescriptor_t, yDesc, void *, y, double, epsilon, int, groupCnt)
+DEF_FN(cudnnStatus_t, cudnnCreateSpatialTransformerDescriptor, cudnnSpatialTransformerDescriptor_t *, stDesc)
+DEF_FN(cudnnStatus_t, cudnnSetSpatialTransformerNdDescriptor, cudnnSpatialTransformerDescriptor_t, stDesc, cudnnSamplerType_t, samplerType, cudnnDataType_t, dataType, const int, nbDims, const int*, dimA)
+DEF_FN(cudnnStatus_t, cudnnDestroySpatialTransformerDescriptor, cudnnSpatialTransformerDescriptor_t, stDesc)
+DEF_FN(cudnnStatus_t, cudnnSpatialTfGridGeneratorForward, cudnnHandle_t, handle, const cudnnSpatialTransformerDescriptor_t, stDesc, const void *, theta, void *, grid)
+DEF_FN(cudnnStatus_t, cudnnSpatialTfSamplerForward, cudnnHandle_t, handle, cudnnSpatialTransformerDescriptor_t, stDesc, const void *, alpha, const cudnnTensorDescriptor_t, xDesc, const void *, x, const void *, grid, const void *, beta, cudnnTensorDescriptor_t, yDesc, void *, y)
+DEF_FN(cudnnStatus_t, cudnnCreateDropoutDescriptor, cudnnDropoutDescriptor_t *, dropoutDesc)
+DEF_FN(cudnnStatus_t, cudnnDestroyDropoutDescriptor, cudnnDropoutDescriptor_t, dropoutDesc)
+DEF_FN(cudnnStatus_t, cudnnDropoutGetStatesSize, cudnnHandle_t, handle, size_t *, sizeInBytes)
+DEF_FN(cudnnStatus_t, cudnnDropoutGetReserveSpaceSize, cudnnTensorDescriptor_t, xdesc, size_t*, sizeInBytes)
+DEF_FN(cudnnStatus_t, cudnnSetDropoutDescriptor, cudnnDropoutDescriptor_t, dropoutDesc, cudnnHandle_t, handle, float, dropout, void *, states, size_t, stateSizeInBytes, unsigned long long, seed)
+DEF_FN(cudnnStatus_t, cudnnRestoreDropoutDescriptor, cudnnDropoutDescriptor_t, dropoutDesc, cudnnHandle_t, handle, float, dropout, void *, states, size_t, stateSizeInBytes, unsigned long long, seed)
+DEF_FN(cudnnStatus_t, cudnnGetDropoutDescriptor, cudnnDropoutDescriptor_t, dropoutDesc, cudnnHandle_t, handle, float *, dropout, void **, states, unsigned long long *, seed)
+DEF_FN(cudnnStatus_t, cudnnDropoutForward, cudnnHandle_t, handle, const cudnnDropoutDescriptor_t, dropoutDesc, const cudnnTensorDescriptor_t, xdesc, const void *, x, const cudnnTensorDescriptor_t, ydesc, void *, y, void *, reserveSpace, size_t, reserveSpaceSizeInBytes)
+DEF_FN(cudnnStatus_t, cudnnCreateAlgorithmDescriptor, cudnnAlgorithmDescriptor_t *, algoDesc)
+DEF_FN(cudnnStatus_t, cudnnSetAlgorithmDescriptor, cudnnAlgorithmDescriptor_t, algoDesc, cudnnAlgorithm_t, algorithm)
+DEF_FN(cudnnStatus_t, cudnnGetAlgorithmDescriptor, const cudnnAlgorithmDescriptor_t, algoDesc, cudnnAlgorithm_t *, algorithm)
+DEF_FN(cudnnStatus_t, cudnnCopyAlgorithmDescriptor, const cudnnAlgorithmDescriptor_t, src, cudnnAlgorithmDescriptor_t, dest)
+DEF_FN(cudnnStatus_t, cudnnDestroyAlgorithmDescriptor, cudnnAlgorithmDescriptor_t, algoDesc)
+DEF_FN(cudnnStatus_t, cudnnCreateAlgorithmPerformance, cudnnAlgorithmPerformance_t *, algoPerf, int, numberToCreate)
+DEF_FN(cudnnStatus_t, cudnnSetAlgorithmPerformance, cudnnAlgorithmPerformance_t, algoPerf, cudnnAlgorithmDescriptor_t, algoDesc, cudnnStatus_t, status, float, time, size_t, memory)
+DEF_FN(cudnnStatus_t, cudnnGetAlgorithmPerformance, const cudnnAlgorithmPerformance_t, algoPerf, cudnnAlgorithmDescriptor_t *, algoDesc, cudnnStatus_t *, status, float *, time, size_t*, memory)
+DEF_FN(cudnnStatus_t, cudnnDestroyAlgorithmPerformance, cudnnAlgorithmPerformance_t *, algoPerf, int, numberToDestroy)
+DEF_FN(cudnnStatus_t, cudnnGetAlgorithmSpaceSize, cudnnHandle_t, handle, cudnnAlgorithmDescriptor_t, algoDesc, size_t *, algoSpaceSizeInBytes)
+DEF_FN(cudnnStatus_t, cudnnSaveAlgorithm, cudnnHandle_t, handle, cudnnAlgorithmDescriptor_t, algoDesc, void *, algoSpace, size_t, algoSpaceSizeInBytes)
+DEF_FN(cudnnStatus_t, cudnnRestoreAlgorithm, cudnnHandle_t, handle, void *, algoSpace, size_t, algoSpaceSizeInBytes, cudnnAlgorithmDescriptor_t, algoDesc)
+DEF_FN(cudnnStatus_t, cudnnSetCallback, unsigned, mask, void *, udata, cudnnCallback_t, fptr)
+DEF_FN(cudnnStatus_t, cudnnGetCallback, unsigned *, mask, void **, udata, cudnnCallback_t *, fptr)
+DEF_FN(cudnnStatus_t, cudnnOpsInferVersionCheck, void)
\ No newline at end of file
diff --git a/cpu/cpu-client-driver.c b/cpu/cpu-client-driver.c
index 1c6d8590..f0a00c27 100644
--- a/cpu/cpu-client-driver.c
+++ b/cpu/cpu-client-driver.c
@@ -226,7 +226,7 @@ CUresult cuDeviceGetUuid(CUuuid* uuid, CUdevice dev)
 }
 
 DEF_FN(CUresult, cuDeviceGetLuid, char*, luid, unsigned int*, deviceNodeMask, CUdevice, dev)
-//DEF_FN(CUresult, cuDeviceGetAttribute, int*, pi, CUdevice_attribute, attrib, CUdevice, dev)
+
 CUresult cuDeviceGetAttribute(int* pi, CUdevice_attribute attrib, CUdevice dev)
 {
 	enum clnt_stat retval;
@@ -289,7 +289,29 @@ CUresult cuDeviceComputeCapability(int* major, int* minor, CUdevice dev)
 } 
 
 DEF_FN(CUresult, cuDeviceGetByPCIBusId, CUdevice*, dev, const char*, pciBusId)
-DEF_FN(CUresult, cuDeviceGetP2PAttribute, int*, value, CUdevice_P2PAttribute, attrib, CUdevice, srcDevice, CUdevice, dstDevice)
+/* Queries a peer-to-peer attribute for the (srcDevice, dstDevice) pair by
+ * forwarding the call through the rpc_cudevicegetp2pattribute_1 RPC.
+ * On success the attribute reported by the server is stored in *value.
+ * Returns CUDA_ERROR_UNKNOWN if the RPC itself fails, otherwise the error
+ * code carried in the RPC result.
+ */
+CUresult cuDeviceGetP2PAttribute ( int* value, CUdevice_P2PAttribute attrib, CUdevice srcDevice, CUdevice dstDevice ) 
+{
+	enum clnt_stat retval;
+    int_result result;
+    retval = rpc_cudevicegetp2pattribute_1((int)attrib, (ptr)srcDevice, (ptr)dstDevice, &result, clnt);
+	if (retval != RPC_SUCCESS) {
+		fprintf(stderr, "[rpc] %s failed.", __FUNCTION__);
+        return CUDA_ERROR_UNKNOWN;
+	}
+    // Hand the attribute back to the caller; it is only meaningful on success.
+    if (result.err == 0) {
+        *value = result.int_result_u.data;
+    }
+    // All logged values are integers: use %d (not %p/%s) and only read the payload on success.
+    LOGE(LOG_DEBUG, "[rpc] %s(%d, %d, %d) = %d, result %d", __FUNCTION__, (int)attrib, (int)srcDevice, (int)dstDevice, result.err, result.err == 0 ? result.int_result_u.data : 0);
+    return result.err;
+}
+
 //DEF_FN(CUresult, cuDriverGetVersion, int*, driverVersion)
 CUresult cuDriverGetVersion(int* driverVersion)
 {
diff --git a/cpu/cpu-client.c b/cpu/cpu-client.c
index 4cd62f09..c5543101 100644
--- a/cpu/cpu-client.c
+++ b/cpu/cpu-client.c
@@ -224,6 +224,7 @@ void __attribute__((destructor)) deinit_rpc(void)
     }
 }
 
+
 static void *(*dlopen_orig)(const char *, int) = NULL;
 static int (*dlclose_orig)(void *) = NULL;
 static void *dl_handle = NULL;
diff --git a/cpu/cpu-server-driver.c b/cpu/cpu-server-driver.c
index 1a1ea85e..5f51c6ad 100644
--- a/cpu/cpu-server-driver.c
+++ b/cpu/cpu-server-driver.c
@@ -496,6 +496,15 @@ bool_t rpc_culaunchkernel_1_svc(uint64_t f, unsigned int gridDimX, unsigned int
 
 }
 
+bool_t rpc_cudevicegetp2pattribute_1_svc(int attrib, ptr srcDevice, ptr dstDevice, int_result *result, struct svc_req *rqstp)
+{
+    LOG(LOG_DEBUG, "%s", __FUNCTION__);
+    GSCHED_RETAIN;
+    result->err = cuDeviceGetP2PAttribute(&result->int_result_u.data, (CUdevice_P2PAttribute)attrib, (CUdevice)srcDevice, (CUdevice)dstDevice);
+    GSCHED_RELEASE;
+    return 1;
+}
+
 /* ################## START OF HIDDEN FUNCTIONS IMPL ######################## */
 
 /*
diff --git a/cpu/cpu_rpc_prot.x b/cpu/cpu_rpc_prot.x
index 5619be66..60f5c986 100644
--- a/cpu/cpu_rpc_prot.x
+++ b/cpu/cpu_rpc_prot.x
@@ -346,6 +346,7 @@ program RPC_CD_PROG {
         dint_result  rpc_cuDevicePrimaryCtxGetState(int)                       = 1022;
         mem_result   rpc_cuDeviceGetProperties(int)                            = 1023;
         dint_result  rpc_cuDeviceComputeCapability(int)                        = 1024;
+        int_result   rpc_cuDeviceGetP2PAttribute(int, ptr, ptr)                = 1025; 
 
         /* HIDDEN DRIVER API */
 /*        ptr_result   rpc_hidden_get_device_ctx(int)                            = 1101;

From 2de0e2e350e914cca80ead0418394c14184dfdf5 Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Tue, 20 Jun 2023 10:09:48 +0200
Subject: [PATCH 61/83] add cuDNN implementation

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 cpu/Makefile           |   3 +-
 cpu/api-recorder.h     |   2 +
 cpu/cpu-client-cudnn.c | 215 ++++++++++++++++++++++++++++++++++++++---
 cpu/cpu-server-cudnn.c | 157 ++++++++++++++++++++++++++++++
 cpu/cpu-server-cudnn.h |   9 ++
 cpu/cpu-server.c       |   6 ++
 cpu/cpu_rpc_prot.x     |  13 +++
 cpu/resource-mg.h      |   1 +
 8 files changed, 394 insertions(+), 12 deletions(-)
 create mode 100644 cpu/cpu-server-cudnn.c
 create mode 100644 cpu/cpu-server-cudnn.h

diff --git a/cpu/Makefile b/cpu/Makefile
index 78d0425a..6ef358d4 100644
--- a/cpu/Makefile
+++ b/cpu/Makefile
@@ -41,7 +41,8 @@ SRC_SERVER = $(RPC_XDR)                 \
 			 oob.c 					    \
 			 mt-memcpy.c				\
 			 cpu-elf2.c					\
-			 cpu-server-nvml.c
+			 cpu-server-nvml.c			\
+			 cpu-server-cudnn.c
 
 SRC_SERVER_LIB = server-library.c
 SRC_SERVER_EXE = server-exe.c
diff --git a/cpu/api-recorder.h b/cpu/api-recorder.h
index c642fbfc..37c5e569 100644
--- a/cpu/api-recorder.h
+++ b/cpu/api-recorder.h
@@ -35,6 +35,8 @@
     *arguments = ARG
 #define RECORD_ARG(NUM, ARG) \
     arguments->arg##NUM = ARG
+#define RECORD_NARG(ARG) \
+    arguments->ARG = ARG
 #define RECORD_DATA(SIZE, PTR) \
     record->data_size = SIZE; \
     record->data = malloc(SIZE); \
diff --git a/cpu/cpu-client-cudnn.c b/cpu/cpu-client-cudnn.c
index e3ab5f3f..46a159f6 100644
--- a/cpu/cpu-client-cudnn.c
+++ b/cpu/cpu-client-cudnn.c
@@ -8,17 +8,210 @@
 #include "cpu-utils.h"
 #include "log.h"
 
-DEF_FN(size_t, cudnnGetVersion, void)
-DEF_FN(size_t, cudnnGetMaxDeviceVersion, void)
-DEF_FN(size_t, cudnnGetCudartVersion, void)
-DEF_FN(const char *cudnnGetErrorString, cudnnStatus_t, status)
-DEF_FN(cudnnStatus_t, cudnnQueryRuntimeError, cudnnHandle_t, handle, cudnnStatus_t*, rstatus, cudnnErrQueryMode_t  mode, cudnnRuntimeTag_t *, tag)
-DEF_FN(cudnnStatus_t, cudnnGetProperty, libraryPropertyType, type, int *, value)
-DEF_FN(cudnnStatus_t, cudnnCreate, cudnnHandle_t*, handle)
-DEF_FN(cudnnStatus_t, cudnnDestroy, cudnnHandle_t, handle)
-DEF_FN(cudnnStatus_t, cudnnSetStream, cudnnHandle_t, handle, cudaStream_t, streamId)
-DEF_FN(cudnnStatus_t, cudnnGetStream, cudnnHandle_t, handle, cudaStream_t *, streamId)
-DEF_FN(cudnnStatus_t, cudnnCreateTensorDescriptor, cudnnTensorDescriptor_t *, tensorDesc)
+static size_t cudnn_call_cnt = 0;
+
+/* Asks the server for its cuDNN version via RPC; yields 0 when the RPC fails. */
+size_t cudnnGetVersion(void)
+{
+#ifdef WITH_API_CNT
+    cudnn_call_cnt++;
+#endif //WITH_API_CNT
+    size_t result = 0; /* returned as-is on RPC failure, so it must start from a defined value */
+    enum clnt_stat retval_1 = rpc_cudnngetversion_1(&result, clnt);
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
+    }
+    return result;
+}
+/* Asks the server for cudnnGetMaxDeviceVersion() via RPC; yields 0 when the RPC fails. */
+size_t cudnnGetMaxDeviceVersion(void)
+{
+#ifdef WITH_API_CNT
+    cudnn_call_cnt++;
+#endif //WITH_API_CNT
+    size_t result = 0; /* returned as-is on RPC failure, so it must start from a defined value */
+    enum clnt_stat retval_1 = rpc_cudnngetmaxdeviceversion_1(&result, clnt);
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
+    }
+    return result;
+}
+/* Asks the server for cudnnGetCudartVersion() via RPC; yields 0 when the RPC fails. */
+size_t cudnnGetCudartVersion(void)
+{
+#ifdef WITH_API_CNT
+    cudnn_call_cnt++;
+#endif //WITH_API_CNT
+    size_t result = 0; /* returned as-is on RPC failure, so it must start from a defined value */
+    enum clnt_stat retval_1 = rpc_cudnngetcudartversion_1(&result, clnt);
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
+    }
+    return result;
+}
+/* Fetches the message text for `status` via RPC; returns NULL when no string was received. */
+const char *cudnnGetErrorString(cudnnStatus_t status)
+{
+#ifdef WITH_API_CNT
+    cudnn_call_cnt++;
+#endif //WITH_API_CNT
+    char *result = NULL; /* XDR string decoding allocates only for a NULL pointer; a garbage pointer would be written through */
+    enum clnt_stat retval_1 = rpc_cudnngeterrorstring_1((int)status, &result, clnt);
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
+    }
+    if (result == NULL) {
+        LOGE(LOG_ERROR, "%s failed (result is NULL)", __FUNCTION__);
+    }
+    return result;
+}
+
+/* Queries the handle's runtime error state via RPC; `tag` is not forwarded and is left untouched. */
+cudnnStatus_t cudnnQueryRuntimeError(cudnnHandle_t handle, cudnnStatus_t* rstatus, cudnnErrQueryMode_t  mode, cudnnRuntimeTag_t * tag)
+{
+#ifdef WITH_API_CNT
+    cudnn_call_cnt++;
+#endif //WITH_API_CNT
+    int_result result;
+    enum clnt_stat retval_1 = rpc_cudnnqueryruntimeerror_1((ptr)handle, (int)mode, &result, clnt);
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
+        return CUDNN_STATUS_INTERNAL_ERROR; /* no reply was received, so result must not be read */
+    }
+    if (result.err != CUDNN_STATUS_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err);
+    } else if (rstatus != NULL) {
+        *rstatus = (cudnnStatus_t)result.int_result_u.data;
+    }
+    return result.err;
+}
+
+cudnnStatus_t cudnnGetProperty(libraryPropertyType type, int * value)
+{
+#ifdef WITH_API_CNT
+    cudnn_call_cnt++;
+#endif //WITH_API_CNT
+    int_result result; /* server reply: status plus, on success, the requested property value */
+    if (value == NULL) {
+        LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__);
+        return CUDNN_STATUS_BAD_PARAM;
+    }
+    enum clnt_stat retval_1 = rpc_cudnngetproperty_1((int)type, &result, clnt);
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
+        return CUDNN_STATUS_INTERNAL_ERROR; /* no reply was received, so result must not be read */
+    }
+    if (result.err != CUDNN_STATUS_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err);
+    } else {
+        *value = result.int_result_u.data;
+    }
+    return result.err;
+}
+
+cudnnStatus_t cudnnCreate(cudnnHandle_t* handle)
+{
+#ifdef WITH_API_CNT
+    cudnn_call_cnt++;
+#endif //WITH_API_CNT
+    ptr_result result; /* server reply: status plus, on success, the handle value to give the caller */
+    if (handle == NULL) {
+        LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__);
+        return CUDNN_STATUS_BAD_PARAM;
+    }
+    enum clnt_stat retval_1 = rpc_cudnncreate_1(&result, clnt);
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
+        return CUDNN_STATUS_INTERNAL_ERROR; /* no reply was received, so result must not be read */
+    }
+    if (result.err != CUDNN_STATUS_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err);
+    } else {
+        *handle = (cudnnHandle_t)result.ptr_result_u.ptr;
+    }
+    return result.err;
+}
+
+cudnnStatus_t cudnnDestroy(cudnnHandle_t handle)
+{
+#ifdef WITH_API_CNT
+    cudnn_call_cnt++;
+#endif //WITH_API_CNT
+    int result; /* cuDNN status reported by the server for the destroy request */
+    enum clnt_stat retval_1 = rpc_cudnndestroy_1((ptr)handle, &result, clnt);
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
+        return CUDNN_STATUS_INTERNAL_ERROR; /* no reply was received, so result must not be read */
+    }
+    if (result != CUDNN_STATUS_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result);
+    }
+    return result;
+}
+
+cudnnStatus_t cudnnSetStream(cudnnHandle_t handle, cudaStream_t streamId)
+{
+#ifdef WITH_API_CNT
+    cudnn_call_cnt++;
+#endif //WITH_API_CNT
+    int result; /* cuDNN status reported by the server for the set-stream request */
+    enum clnt_stat retval_1 = rpc_cudnnsetstream_1((ptr)handle, (ptr)streamId, &result, clnt);
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
+        return CUDNN_STATUS_INTERNAL_ERROR; /* no reply was received, so result must not be read */
+    }
+    if (result != CUDNN_STATUS_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result);
+    }
+    return result;
+}
+
+cudnnStatus_t cudnnGetStream(cudnnHandle_t handle, cudaStream_t * streamId)
+{
+#ifdef WITH_API_CNT
+    cudnn_call_cnt++;
+#endif //WITH_API_CNT
+    ptr_result result; /* server reply: status plus, on success, the stream value to give the caller */
+    if (streamId == NULL) {
+        LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__);
+        return CUDNN_STATUS_BAD_PARAM;
+    }
+    enum clnt_stat retval_1 = rpc_cudnngetstream_1((ptr)handle, &result, clnt);
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
+        return CUDNN_STATUS_INTERNAL_ERROR; /* no reply was received, so result must not be read */
+    }
+    if (result.err != CUDNN_STATUS_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err);
+    } else {
+        *streamId = (cudaStream_t)result.ptr_result_u.ptr;
+    }
+    return result.err;
+}
+
+cudnnStatus_t cudnnCreateTensorDescriptor(cudnnTensorDescriptor_t * tensorDesc)
+{
+#ifdef WITH_API_CNT
+    cudnn_call_cnt++;
+#endif //WITH_API_CNT
+    ptr_result result; /* server reply: status plus, on success, the descriptor value to give the caller */
+    if (tensorDesc == NULL) {
+        LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__);
+        return CUDNN_STATUS_BAD_PARAM;
+    }
+    enum clnt_stat retval_1 = rpc_cudnncreatetensordescriptor_1(&result, clnt);
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
+        return CUDNN_STATUS_INTERNAL_ERROR; /* no reply was received, so result must not be read */
+    }
+    if (result.err != CUDNN_STATUS_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err);
+    } else {
+        *tensorDesc = (cudnnTensorDescriptor_t)result.ptr_result_u.ptr;
+    }
+    return result.err;
+}
+
 DEF_FN(cudnnStatus_t, cudnnSetTensor4dDescriptor, cudnnTensorDescriptor_t, tensorDesc, cudnnTensorFormat_t, format, cudnnDataType_t, dataType, int, n, int, c, int, h, int, w) 
 DEF_FN(cudnnStatus_t, cudnnSetTensor4dDescriptorEx, cudnnTensorDescriptor_t, tensorDesc, cudnnDataType_t, dataType, int, n, int, c, int, h, int, w, int, nStride, int, cStride, int, hStride, int, wStride)
 DEF_FN(cudnnStatus_t, cudnnGetTensor4dDescriptor, const cudnnTensorDescriptor_t, tensorDesc, cudnnDataType_t *, dataType, int*, n, int*, c, int*, h, int*, w, int*, nStride, int*, cStride, int*, hStride, int*, wStride)
diff --git a/cpu/cpu-server-cudnn.c b/cpu/cpu-server-cudnn.c
new file mode 100644
index 00000000..639f8710
--- /dev/null
+++ b/cpu/cpu-server-cudnn.c
@@ -0,0 +1,157 @@
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <cuda.h>
+#include <cudnn.h>
+
+#include "cpu_rpc_prot.h"
+#include "cpu-common.h"
+#include "cpu-utils.h"
+#include "log.h"
+#include "resource-mg.h"
+#include "gsched.h"
+
+#define WITH_RECORDER
+#include "api-recorder.h"
+
+#include "cpu-server-cudnn.h"
+
+
+
+int server_cudnn_init(int bypass, resource_mg *memory)
+{
+    int ret = 0;
+    ret &= resource_mg_init(&rm_cudnn, bypass);
+    return ret;
+}
+
+int server_cudnn_deinit(void)
+{
+    resource_mg_free(&rm_cudnn);
+    return 0;
+
+}
+
+bool_t rpc_cudnngetversion_1_svc(size_t *result, struct svc_req *rqstp)
+{
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+    /* Stores the local cudnnGetVersion() value in *result; rqstp is unused and 1 is always returned. */
+    GSCHED_RETAIN;
+    *result = cudnnGetVersion();
+    GSCHED_RELEASE;
+    return 1;
+}
+
+bool_t rpc_cudnngetmaxdeviceversion_1_svc(size_t *result, struct svc_req *rqstp)
+{
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+    /* Stores the local cudnnGetMaxDeviceVersion() value in *result; rqstp is unused and 1 is always returned. */
+    GSCHED_RETAIN;
+    *result = cudnnGetMaxDeviceVersion();
+    GSCHED_RELEASE;
+    return 1;
+}
+
+bool_t rpc_cudnngetcudartversion_1_svc(size_t *result, struct svc_req *rqstp)
+{
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+    /* Stores the local cudnnGetCudartVersion() value in *result; rqstp is unused and 1 is always returned. */
+    GSCHED_RETAIN;
+    *result = cudnnGetCudartVersion();
+    GSCHED_RELEASE;
+    return 1;
+}
+
+bool_t rpc_cudnngeterrorstring_1_svc(int status, char **result, struct svc_req *rqstp)
+{
+    /* Copies cuDNN's message for `status` into a freshly allocated 128-byte buffer at *result. */
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+    if ((*result = malloc(128)) == NULL) return 0; /* report failure instead of copying into NULL */
+    GSCHED_RETAIN;
+    /* snprintf always NUL-terminates; strncpy would not when the source fills the buffer */
+    snprintf(*result, 128, "%s", cudnnGetErrorString((cudnnStatus_t)status));
+    GSCHED_RELEASE;
+    return 1;
+}
+
+bool_t rpc_cudnnqueryruntimeerror_1_svc(ptr handle, int mode, int_result *result, struct svc_req *rqstp)
+{
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+    cudnnRuntimeTag_t *tag;
+
+    GSCHED_RETAIN;
+    result->err = cudnnQueryRuntimeError((cudnnHandle_t)handle, (cudnnStatus_t*)&result->int_result_u.data, (cudnnErrQueryMode_t)mode, tag);
+    GSCHED_RELEASE;
+    return 1;
+}
+
+bool_t rpc_cudnngetproperty_1_svc(int type, int_result *result, struct svc_req *rqstp)
+{
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+    /* cudnnGetProperty() writes the value straight into the reply union; its status goes to result->err. */
+    GSCHED_RETAIN;
+    result->err = cudnnGetProperty((libraryPropertyType)type, &result->int_result_u.data); 
+    GSCHED_RELEASE;
+    return 1;
+}
+
+bool_t rpc_cudnncreate_1_svc(ptr_result *result, struct svc_req *rqstp)
+{
+    RECORD_VOID_API;
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+
+    GSCHED_RETAIN;
+    result->err = cudnnCreate((cudnnHandle_t*)result->ptr_result_u.ptr);
+    GSCHED_RELEASE;
+    RECORD_RESULT(ptr_result_u, *result);
+    return 1;
+}
+
+bool_t rpc_cudnndestroy_1_svc(ptr handle, int *result, struct svc_req *rqstp)
+{
+    RECORD_API(ptr);
+    RECORD_SINGLE_ARG(handle);
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+
+    GSCHED_RETAIN;
+    *result = cudnnDestroy((cudnnHandle_t)handle);
+    GSCHED_RELEASE;
+    RECORD_RESULT(integer, *result);
+    return 1;
+}
+
+bool_t rpc_cudnnsetstream_1_svc(ptr handle, ptr streamId, int *result, struct svc_req *rqstp)
+{
+    RECORD_API(rpc_cudnnsetstream_1_argument);
+    RECORD_NARG(handle);
+    RECORD_NARG(streamId);
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+
+    GSCHED_RETAIN;
+    *result = cudnnSetStream((cudnnHandle_t)handle, (cudaStream_t)streamId);
+    GSCHED_RELEASE;
+    RECORD_RESULT(integer, *result);
+    return 1;
+}
+
+bool_t rpc_cudnngetstream_1_svc(ptr handle, ptr_result *result, struct svc_req *rqstp)
+{
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+
+    GSCHED_RETAIN;
+    result->err = cudnnGetStream((cudnnHandle_t)handle, (cudaStream_t*)&result->ptr_result_u.ptr);
+    GSCHED_RELEASE;
+    return 1;
+}
+
+bool_t rpc_cudnncreatetensordescriptor_1_svc(ptr_result *result, struct svc_req *rqstp)
+{
+    RECORD_VOID_API;
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+
+    GSCHED_RETAIN;
+    result->err = cudnnCreateTensorDescriptor((cudnnTensorDescriptor_t*)&result->ptr_result_u.ptr);
+    GSCHED_RELEASE;
+    RECORD_RESULT(ptr_result_u, *result);
+    return 1;
+}
\ No newline at end of file
diff --git a/cpu/cpu-server-cudnn.h b/cpu/cpu-server-cudnn.h
new file mode 100644
index 00000000..19586223
--- /dev/null
+++ b/cpu/cpu-server-cudnn.h
@@ -0,0 +1,9 @@
+#ifndef _CPU_SERVER_CUDNN_H_
+#define _CPU_SERVER_CUDNN_H_
+
+#include "resource-mg.h"
+
+int server_cudnn_init(int restore, resource_mg *memory);
+int server_cudnn_deinit(void);
+
+#endif // _CPU_SERVER_CUDNN_H_
\ No newline at end of file
diff --git a/cpu/cpu-server.c b/cpu/cpu-server.c
index 86bcd561..e6f41c32 100644
--- a/cpu/cpu-server.c
+++ b/cpu/cpu-server.c
@@ -26,6 +26,7 @@
 #include "api-recorder.h"
 #include "gsched.h"
 #include "cpu-server-nvml.h"
+#include "cpu-server-cudnn.h"
 
 INIT_SOCKTYPE
 
@@ -294,6 +295,11 @@ void cricket_main(size_t prog_num, size_t vers_num)
         goto cleanup1;
     }
 
+    if (server_cudnn_init(restore) != 0) {
+        LOGE(LOG_ERROR, "initializing server_nvml failed.");
+        goto cleanup1;
+    }
+
 #ifdef WITH_IB
 
     if (ib_init(ib_device, client) != 0) {
diff --git a/cpu/cpu_rpc_prot.x b/cpu/cpu_rpc_prot.x
index 60f5c986..a28519b4 100644
--- a/cpu/cpu_rpc_prot.x
+++ b/cpu/cpu_rpc_prot.x
@@ -379,5 +379,18 @@ program RPC_CD_PROG {
         int          rpc_nvmlInitWithFlags(int)                                = 4001;
         int          rpc_nvmlInit_v2(void)                                     = 4002;
         int          rpc_nvmlShutdown(void)                                    = 4003;
+        
+        /* CUDNN */
+        size_t      rpc_cudnnGetVersion(void) = 5000;
+        size_t      rpc_cudnnGetMaxDeviceVersion(void) = 5001;
+        size_t      rpc_cudnnGetCudartVersion(void) = 5002;
+        string      rpc_cudnnGetErrorString (int status) = 5003;
+        int_result  rpc_cudnnQueryRuntimeError(ptr handle, int mode) = 5004;
+        int_result  rpc_cudnnGetProperty(int type) = 5005;
+        ptr_result  rpc_cudnnCreate(void) = 5006;
+        int         rpc_cudnnDestroy(ptr handle) = 5007;
+        int         rpc_cudnnSetStream(ptr handle, ptr streamId) = 5008;
+        ptr_result  rpc_cudnnGetStream(ptr handle) = 5009;
+        ptr_result  rpc_cudnnCreateTensorDescriptor(void) = 5010;
     } = 1;
 } = 99;
diff --git a/cpu/resource-mg.h b/cpu/resource-mg.h
index 5b542a83..160e39f9 100644
--- a/cpu/resource-mg.h
+++ b/cpu/resource-mg.h
@@ -38,6 +38,7 @@ resource_mg rm_globals;
 //Other RMs
 resource_mg rm_cusolver;
 resource_mg rm_cublas;
+resource_mg rm_cudnn;
 
 
 /** initializes the resource manager

From 0a01b0799d0714e73aef0b7c52dac8ba7e6338ba Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Tue, 20 Jun 2023 10:31:58 +0200
Subject: [PATCH 62/83] use resource managers for cudnn api

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 cpu/Makefile           |  2 +-
 cpu/cpu-server-cudnn.c | 26 ++++++++++++++++++++------
 cpu/cpu-server-cudnn.h |  2 +-
 cpu/cpu-server.c       |  6 ++++--
 cpu/resource-mg.h      |  3 +++
 5 files changed, 29 insertions(+), 10 deletions(-)

diff --git a/cpu/Makefile b/cpu/Makefile
index 6ef358d4..01d9e546 100644
--- a/cpu/Makefile
+++ b/cpu/Makefile
@@ -107,7 +107,7 @@ ifdef WITH_IB
 CC_FLAGS += -DWITH_IB=$(WITH_IB)
 endif
 
-SERVER_LD_FLAGS = $(LD_FLAGS) -lcudart -lcusolver -lcuda -lcublas -lrt -lpthread -lnvidia-ml
+SERVER_LD_FLAGS = $(LD_FLAGS) -lcudart -lcusolver -lcuda -lcublas -lrt -lpthread -lnvidia-ml -lcudnn
 SERVER_BIN_LD_FLAGS = $(SERVER_LD_FLAGS) -Wl,--unresolved-symbols=ignore-in-object-files
 CLIENT_LD_FLAGS = $(LD_FLAGS)
 
diff --git a/cpu/cpu-server-cudnn.c b/cpu/cpu-server-cudnn.c
index 639f8710..a845aa8c 100644
--- a/cpu/cpu-server-cudnn.c
+++ b/cpu/cpu-server-cudnn.c
@@ -18,7 +18,7 @@
 
 
 
-int server_cudnn_init(int bypass, resource_mg *memory)
+int server_cudnn_init(int bypass)
 {
     int ret = 0;
     ret &= resource_mg_init(&rm_cudnn, bypass);
@@ -80,7 +80,9 @@ bool_t rpc_cudnnqueryruntimeerror_1_svc(ptr handle, int mode, int_result *result
     cudnnRuntimeTag_t *tag;
 
     GSCHED_RETAIN;
-    result->err = cudnnQueryRuntimeError((cudnnHandle_t)handle, (cudnnStatus_t*)&result->int_result_u.data, (cudnnErrQueryMode_t)mode, tag);
+    result->err = cudnnQueryRuntimeError(
+        (cudnnHandle_t)resource_mg_get(&rm_cudnn, (void*)handle),
+        (cudnnStatus_t*)&result->int_result_u.data, (cudnnErrQueryMode_t)mode, tag);
     GSCHED_RELEASE;
     return 1;
 }
@@ -101,7 +103,10 @@ bool_t rpc_cudnncreate_1_svc(ptr_result *result, struct svc_req *rqstp)
     LOGE(LOG_DEBUG, "%s", __FUNCTION__);
 
     GSCHED_RETAIN;
-    result->err = cudnnCreate((cudnnHandle_t*)result->ptr_result_u.ptr);
+    result->err = cudnnCreate((cudnnHandle_t*)&result->ptr_result_u.ptr);
+    if (resource_mg_create(&rm_cudnn, (void*)result->ptr_result_u.ptr) != 0) {
+        LOGE(LOG_ERROR, "error in resource manager");
+    }
     GSCHED_RELEASE;
     RECORD_RESULT(ptr_result_u, *result);
     return 1;
@@ -114,7 +119,8 @@ bool_t rpc_cudnndestroy_1_svc(ptr handle, int *result, struct svc_req *rqstp)
     LOGE(LOG_DEBUG, "%s", __FUNCTION__);
 
     GSCHED_RETAIN;
-    *result = cudnnDestroy((cudnnHandle_t)handle);
+    *result = cudnnDestroy(
+        (cudnnHandle_t)resource_mg_get(&rm_cudnn, (void*)handle));
     GSCHED_RELEASE;
     RECORD_RESULT(integer, *result);
     return 1;
@@ -128,7 +134,9 @@ bool_t rpc_cudnnsetstream_1_svc(ptr handle, ptr streamId, int *result, struct sv
     LOGE(LOG_DEBUG, "%s", __FUNCTION__);
 
     GSCHED_RETAIN;
-    *result = cudnnSetStream((cudnnHandle_t)handle, (cudaStream_t)streamId);
+    *result = cudnnSetStream(
+        (cudnnHandle_t)resource_mg_get(&rm_cudnn, (void*)handle),
+        (cudaStream_t)resource_mg_get(&rm_streams, (void*)streamId));
     GSCHED_RELEASE;
     RECORD_RESULT(integer, *result);
     return 1;
@@ -139,7 +147,10 @@ bool_t rpc_cudnngetstream_1_svc(ptr handle, ptr_result *result, struct svc_req *
     LOGE(LOG_DEBUG, "%s", __FUNCTION__);
 
     GSCHED_RETAIN;
-    result->err = cudnnGetStream((cudnnHandle_t)handle, (cudaStream_t*)&result->ptr_result_u.ptr);
+    result->err = cudnnGetStream(
+        (cudnnHandle_t)resource_mg_get(&rm_cudnn, (void*)handle),
+        (cudaStream_t*)&result->ptr_result_u.ptr);
+
     GSCHED_RELEASE;
     return 1;
 }
@@ -151,6 +162,9 @@ bool_t rpc_cudnncreatetensordescriptor_1_svc(ptr_result *result, struct svc_req
 
     GSCHED_RETAIN;
     result->err = cudnnCreateTensorDescriptor((cudnnTensorDescriptor_t*)&result->ptr_result_u.ptr);
+    if (resource_mg_create(&rm_cudnn_tensors, (void*)result->ptr_result_u.ptr) != 0) {
+        LOGE(LOG_ERROR, "error in resource manager");
+    }
     GSCHED_RELEASE;
     RECORD_RESULT(ptr_result_u, *result);
     return 1;
diff --git a/cpu/cpu-server-cudnn.h b/cpu/cpu-server-cudnn.h
index 19586223..6c892919 100644
--- a/cpu/cpu-server-cudnn.h
+++ b/cpu/cpu-server-cudnn.h
@@ -3,7 +3,7 @@
 
 #include "resource-mg.h"
 
-int server_cudnn_init(int restore, resource_mg *memory);
+int server_cudnn_init(int restore);
 int server_cudnn_deinit(void);
 
 #endif // _CPU_SERVER_CUDNN_H_
\ No newline at end of file
diff --git a/cpu/cpu-server.c b/cpu/cpu-server.c
index e6f41c32..ce31f32c 100644
--- a/cpu/cpu-server.c
+++ b/cpu/cpu-server.c
@@ -297,7 +297,7 @@ void cricket_main(size_t prog_num, size_t vers_num)
 
     if (server_cudnn_init(restore) != 0) {
         LOGE(LOG_ERROR, "initializing server_nvml failed.");
-        goto cleanup1;
+        goto cleanup0;
     }
 
 #ifdef WITH_IB
@@ -312,7 +312,7 @@ void cricket_main(size_t prog_num, size_t vers_num)
 
     if (signal(SIGUSR1, signal_checkpoint) == SIG_ERR) {
         LOGE(LOG_ERROR, "An error occurred while setting a signal handler.");
-        goto cleanup0;
+        goto cleanup00;
     }
 
     LOG(LOG_INFO, "waiting for RPC requests...");
@@ -322,6 +322,8 @@ void cricket_main(size_t prog_num, size_t vers_num)
     LOG(LOG_DEBUG, "svc_run returned. Cleaning up.");
     ret = 0;
     //api_records_print();
+ cleanup00:
+    server_cudnn_cleanup();
  cleanup0:
     server_driver_deinit();
  cleanup1:
diff --git a/cpu/resource-mg.h b/cpu/resource-mg.h
index 160e39f9..b9b63891 100644
--- a/cpu/resource-mg.h
+++ b/cpu/resource-mg.h
@@ -38,7 +38,10 @@ resource_mg rm_globals;
 //Other RMs
 resource_mg rm_cusolver;
 resource_mg rm_cublas;
+
+//CUDNN RMs
 resource_mg rm_cudnn;
+resource_mg rm_cudnn_tensors;
 
 
 /** initializes the resource manager

From 9de7292904eab9f47ec90a424a1ef954cadd4500 Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Tue, 20 Jun 2023 14:51:19 +0200
Subject: [PATCH 63/83] add more cuDNN APIs

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 cpu/cpu-client-cudnn.c | 78 ++++++++++++++++++++++++++++++++++++++++--
 cpu/cpu-server.c       |  2 +-
 cpu/cpu_rpc_prot.x     | 75 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 151 insertions(+), 4 deletions(-)

diff --git a/cpu/cpu-client-cudnn.c b/cpu/cpu-client-cudnn.c
index 46a159f6..32697620 100644
--- a/cpu/cpu-client-cudnn.c
+++ b/cpu/cpu-client-cudnn.c
@@ -242,9 +242,81 @@ DEF_FN(cudnnStatus_t, cudnnGetReductionWorkspaceSize, cudnnHandle_t, handle, con
 DEF_FN(cudnnStatus_t, cudnnReduceTensor, cudnnHandle_t, handle, const cudnnReduceTensorDescriptor_t, reduceTensorDesc, void *, indices, size_t, indicesSizeInBytes, void *, workspace, size_t, workspaceSizeInBytes, const void *, alpha, const cudnnTensorDescriptor_t, aDesc, const void *, A, const void *, beta, const cudnnTensorDescriptor_t, cDesc, void *, C)
 DEF_FN(cudnnStatus_t, cudnnSetTensor, cudnnHandle_t, handle, const cudnnTensorDescriptor_t, yDesc, void *, y, const void *, valuePtr)
 DEF_FN(cudnnStatus_t, cudnnScaleTensor, cudnnHandle_t, handle, const cudnnTensorDescriptor_t, yDesc, void *, y, const void *, alpha)
-DEF_FN(cudnnStatus_t, cudnnCreateFilterDescriptor, cudnnFilterDescriptor_t *, filterDesc)
-DEF_FN(cudnnStatus_t, cudnnSetFilter4dDescriptor, cudnnFilterDescriptor_t, filterDesc, cudnnDataType_t, dataType, cudnnTensorFormat_t, format, int, k, int, c, int, h, int, w) 
-DEF_FN(cudnnStatus_t, cudnnGetFilter4dDescriptor, const cudnnFilterDescriptor_t, filterDesc, cudnnDataType_t *, dataType, cudnnTensorFormat_t *, format, int*, k, int*, c, int*, h, int*, w) 
+
+cudnnStatus_t cudnnCreateFilterDescriptor(cudnnFilterDescriptor_t * filterDesc)
+{
+#ifdef WITH_API_CNT
+    cudnn_call_cnt++;
+#endif //WITH_API_CNT
+    ptr_result result; /* server reply: status plus, on success, the descriptor value to give the caller */
+    if (filterDesc == NULL) {
+        LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__);
+        return CUDNN_STATUS_BAD_PARAM;
+    }
+    enum clnt_stat retval_1 = rpc_cudnncreatefilterdescriptor_1(&result, clnt);
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
+        return CUDNN_STATUS_INTERNAL_ERROR; /* no reply was received, so result must not be read */
+    }
+    if (result.err != CUDNN_STATUS_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err);
+    } else {
+        *filterDesc = (cudnnFilterDescriptor_t)result.ptr_result_u.ptr;
+    }
+    return result.err;
+}
+
+/* Sends a 4-D filter descriptor configuration (data type, format, k/c/h/w) to the server via RPC. */
+cudnnStatus_t cudnnSetFilter4dDescriptor(cudnnFilterDescriptor_t filterDesc, cudnnDataType_t dataType, cudnnTensorFormat_t format, int k, int c, int h, int w) 
+{
+#ifdef WITH_API_CNT
+    cudnn_call_cnt++;
+#endif //WITH_API_CNT
+    int result;
+    enum clnt_stat retval_1 = rpc_cudnnsetfilter4ddescriptor_1(
+        (ptr)filterDesc,
+        (int)dataType,
+        (int)format,
+        k, c, h, w, &result, clnt);
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
+        return CUDNN_STATUS_INTERNAL_ERROR; /* no reply was received, so result must not be read */
+    }
+    if (result != CUDNN_STATUS_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result);
+    } 
+    return result;
+}
+
+/* Reads back a 4-D filter descriptor via RPC; the six outputs are written only when the call succeeds. */
+cudnnStatus_t cudnnGetFilter4dDescriptor(const cudnnFilterDescriptor_t filterDesc, cudnnDataType_t *dataType, cudnnTensorFormat_t *format, int* k, int* c, int* h, int* w) 
+{
+#ifdef WITH_API_CNT
+    cudnn_call_cnt++;
+#endif //WITH_API_CNT
+    int6_result result;
+    if (dataType == NULL || format == NULL || k == NULL || c == NULL || h == NULL || w == NULL) {
+        LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__);
+        return CUDNN_STATUS_BAD_PARAM;
+    }
+    enum clnt_stat retval_1 = rpc_cudnngetfilter4ddescriptor_1((ptr)filterDesc, &result, clnt);
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
+        return CUDNN_STATUS_INTERNAL_ERROR; /* no reply was received, so result must not be read */
+    }
+    if (result.err != CUDNN_STATUS_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err);
+        return result.err; /* the data[] arm of the union is only present when err == 0 */
+    }
+    /* reply layout: data[0] = data type, data[1] = tensor format, data[2..5] = k, c, h, w */
+    *dataType = (cudnnDataType_t)result.int6_result_u.data[0];
+    *format = (cudnnTensorFormat_t)result.int6_result_u.data[1];
+    *k = result.int6_result_u.data[2];
+    *c = result.int6_result_u.data[3];
+    *h = result.int6_result_u.data[4];
+    *w = result.int6_result_u.data[5];
+    return result.err;
+}
 DEF_FN(cudnnStatus_t, cudnnSetFilterNdDescriptor, cudnnFilterDescriptor_t, filterDesc, cudnnDataType_t, dataType, cudnnTensorFormat_t, format, int, nbDims, const int*, filterDimA)
 DEF_FN(cudnnStatus_t, cudnnGetFilterNdDescriptor, const cudnnFilterDescriptor_t, filterDesc, int, nbDimsRequested, cudnnDataType_t *, dataType, cudnnTensorFormat_t *, format, int*, nbDims, int*, filterDimA)
 DEF_FN(cudnnStatus_t, cudnnGetFilterSizeInBytes, const cudnnFilterDescriptor_t, filterDesc, size_t*, size)
diff --git a/cpu/cpu-server.c b/cpu/cpu-server.c
index ce31f32c..744f7cfc 100644
--- a/cpu/cpu-server.c
+++ b/cpu/cpu-server.c
@@ -323,7 +323,7 @@ void cricket_main(size_t prog_num, size_t vers_num)
     ret = 0;
     //api_records_print();
  cleanup00:
-    server_cudnn_cleanup();
+    server_cudnn_deinit();
  cleanup0:
     server_driver_deinit();
  cleanup1:
diff --git a/cpu/cpu_rpc_prot.x b/cpu/cpu_rpc_prot.x
index a28519b4..fdfa7051 100644
--- a/cpu/cpu_rpc_prot.x
+++ b/cpu/cpu_rpc_prot.x
@@ -39,6 +39,14 @@ struct rpc_dim3 {
     unsigned int z;
 };
 
+union cudnn_scaling_t switch (int dataType) {
+case 2:
+case 0:
+    float f;
+case 1:
+    double d;
+};
+
 union int_result switch (int err) {
 case 0:
     int data;
@@ -130,6 +138,34 @@ default:
     void;
 };
 
+union int3_result switch (int err) {
+case 0:
+    int data[3];
+default:
+    void;
+};
+
+union int5_result switch (int err) {
+case 0:
+    int data[5];
+default:
+    void;
+};
+
+union int6_result switch (int err) {
+case 0:
+    int data[6];
+default:
+    void;
+};
+
+union int9_result switch (int err) {
+case 0:
+    int data[9];
+default:
+    void;
+};
+
 program RPC_CD_PROG {
     version RPC_CD_VERS {
         int          rpc_checkpoint(void)                                         = 0;
@@ -392,5 +428,44 @@ program RPC_CD_PROG {
         int         rpc_cudnnSetStream(ptr handle, ptr streamId) = 5008;
         ptr_result  rpc_cudnnGetStream(ptr handle) = 5009;
         ptr_result  rpc_cudnnCreateTensorDescriptor(void) = 5010;
+        
+        int         rpc_cudnnSetTensor4dDescriptor(ptr tensorDesc, int format, int dataType, int n, int c, int h, int w) = 5011;
+        int         rpc_cudnnSetTensor4dDescriptorEx(ptr tensorDesc, int dataType, int n, int c, int h, int w, int nStride, int cStride, int hStride, int wStride) = 5012;
+        int9_result rpc_cudnnGetTensor4dDescriptor(ptr tensorDesc) = 5013;
+        int         rpc_cudnnSetTensorNdDescriptor(ptr tensorDesc, int dataType, int nbDims, mem_data dimA, mem_data strideA) = 5014;
+        int         rpc_cudnnSetTensorNdDescriptorEx(ptr tensorDesc, int format, int dataType, int nbDims, mem_data dimA) = 5015;
+        mem_result  rpc_cudnnGetTensorNdDescriptor(ptr tensorDesc, int nbDimsRequested) = 5016;
+        sz_result   rpc_cudnnGetTensorSizeInBytes(ptr tensorDesc) = 5017;
+        int         rpc_cudnnDestroyTensorDescriptor(ptr tensorDesc) = 5018;
+        sz_result   rpc_cudnnInitTransformDest(ptr transformDesc, ptr srcDesc, ptr destDesc) = 5019;
+        ptr_result  rpc_cudnnCreateTensorTransformDescriptor(void) = 5020;
+        int         rpc_cudnnSetTensorTransformDescriptor(ptr transformDesc, uint32_t nbDims, int destFormat, mem_data padBeforeA, mem_data padAfterA, mem_data foldA, int direction) = 5021;
+        mem_result  rpc_cudnnGetTensorTransformDescriptor(ptr transformDesc, uint32_t nbDimsRequested) = 5022;
+        int         rpc_cudnnDestroyTensorTransformDescriptor(ptr transformDesc) = 5023;
+        ptr_result  rpc_cudnnTransformTensor(ptr handle, cudnn_scaling_t alpha, ptr xDesc, cudnn_scaling_t x, cudnn_scaling_t beta, ptr yDesc) = 5024;
+        ptr_result  rpc_cudnnTransformTensorEx(ptr handle, ptr transDesc, cudnn_scaling_t alpha, ptr srcDesc, cudnn_scaling_t srcData, cudnn_scaling_t beta, ptr destDesc) = 5025;
+        ptr_result  rpc_cudnnAddTensor(ptr handle, cudnn_scaling_t alpha, ptr aDesc, ptr A, cudnn_scaling_t beta, ptr cDesc, ptr C) = 5026;
+        ptr_result  rpc_cudnnCreateOpTensorDescriptor(void) = 5027;
+        int         rpc_cudnnSetOpTensorDescriptor(ptr opTensorDesc, int opTensorOp, int opTensorCompType, int opTensorNanOpt) = 5028;
+        int3_result rpc_cudnnGetOpTensorDescriptor(ptr opTensorDesc) = 5029;
+        int         rpc_cudnnDestroyOpTensorDescriptor(ptr opTensorDesc) = 5030;
+        mem_result  rpc_cudnnOpTensor(ptr handle, ptr opTensorDesc, cudnn_scaling_t alpha1, ptr aDesc, mem_data A, cudnn_scaling_t alpha2, ptr bDesc, mem_data B, cudnn_scaling_t beta, ptr  cDesc) = 5031;
+        ptr_result  rpc_cudnnCreateReduceTensorDescriptor(void) = 5032;
+        int         rpc_cudnnSetReduceTensorDescriptor(ptr reduceTensorDesc, int reduceTensorOp, int reduceTensorCompType, int reduceTensorNanOpt, int reduceTensorIndices, int reduceTensorIndicesType) = 5033;
+        int5_result rpc_cudnnGetReduceTensorDescriptor(ptr reduceTensorDesc) = 5034;
+        int         rpc_cudnnDestroyReduceTensorDescriptor(ptr reduceTensorDesc) = 5035;
+        sz_result   rpc_cudnnGetReductionIndicesSize(ptr handle, ptr reduceTensorDesc, ptr aDesc, ptr cDesc) = 5036;
+        sz_result   rpc_cudnnGetReductionWorkspaceSize(ptr handle, ptr reduceTensorDesc, ptr aDesc, ptr cDesc) = 5037;
+        mem_result  rpc_cudnnReduceTensor(ptr handle, ptr reduceTensorDesc, ptr indices, size_t indicesSizeInBytes, ptr workspace, size_t workspaceSizeInBytes, cudnn_scaling_t alpha, ptr aDesc, ptr A, cudnn_scaling_t beta, ptr cDesc, ptr C) = 5038;
+        int         rpc_cudnnSetTensor(ptr handle, ptr yDesc, ptr y, mem_data valuePtr) = 5039;
+        int         rpc_cudnnScaleTensor(ptr handle, ptr yDesc, ptr y, cudnn_scaling_t alpha) = 5040;
+        ptr_result  rpc_cudnnCreateFilterDescriptor(void) = 5041;
+        int         rpc_cudnnSetFilter4dDescriptor(ptr filterDesc, int dataType, int format, int k, int c, int h, int w) = 5042;
+        int6_result rpc_cudnnGetFilter4dDescriptor(ptr filterDesc) = 5043;
+        int         rpc_cudnnSetFilterNdDescriptor(ptr filterDesc, int dataType, int format, int nbDims, mem_data filterDimA) = 5044;
+        mem_result  rpc_cudnnGetFilterNdDescriptor(ptr filterDesc, int nbDimsRequested) = 5045;
+        sz_result   rpc_cudnnGetFilterSizeInBytes(ptr filterDesc) = 5046;
+        int         rpc_cudnnTransformFilter(ptr handle, ptr transDesc, cudnn_scaling_t, ptr srcDesc, ptr srcData, cudnn_scaling_t beta, ptr destDesc, ptr destData) = 5047;
+        int         rpc_cudnnDestroyFilterDescriptor(ptr filterDesc) = 5048;
     } = 1;
 } = 99;

From 32796d7caffb54f9f6e511e81035bb14bf0828fd Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Tue, 20 Jun 2023 18:25:04 +0200
Subject: [PATCH 64/83] add cudnn activation and pooling apis

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 cpu/cpu-client-cudnn.c | 550 +++++++++++++++++++++++++++++++++++++++--
 cpu/cpu-server-cudnn.c | 444 +++++++++++++++++++++++++++++++++
 cpu/cpu_rpc_prot.x     |  45 +++-
 cpu/resource-mg.h      |   4 +
 4 files changed, 1017 insertions(+), 26 deletions(-)

diff --git a/cpu/cpu-client-cudnn.c b/cpu/cpu-client-cudnn.c
index 32697620..c519a172 100644
--- a/cpu/cpu-client-cudnn.c
+++ b/cpu/cpu-client-cudnn.c
@@ -308,36 +308,536 @@ cudnnStatus_t cudnnGetFilter4dDescriptor(const cudnnFilterDescriptor_t filterDes
     }
     if (result.err != CUDNN_STATUS_SUCCESS) {
         LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result);
+    } else {
+        *dataType = (cudnnDataType_t)result.int6_result_u.data[0];
+        *format = (cudnnTensorFormat_t)result.int6_result_u.data[1];
+        *k = result.int6_result_u.data[2];
+        *c = result.int6_result_u.data[3];
+        *h = result.int6_result_u.data[4];
+        *w = result.int6_result_u.data[5];
+    }
+    return result.err;
+}
+
+cudnnStatus_t cudnnSetFilterNdDescriptor(cudnnFilterDescriptor_t filterDesc, cudnnDataType_t dataType, cudnnTensorFormat_t format, int nbDims, const int* filterDimA)
+{
+#ifdef WITH_API_CNT
+    cudnn_call_cnt++;
+#endif //WITH_API_CNT
+    // Forwards the call via RPC (filterDimA is sent as nbDims raw ints); result starts as an error so a failed RPC never returns uninitialized data.
+    int result = CUDNN_STATUS_INTERNAL_ERROR;
+    mem_data rpc_filterDimA = {
+        .mem_data_len = nbDims * sizeof(int),
+        .mem_data_val = (char*)filterDimA
+    };
+    enum clnt_stat retval_1 = rpc_cudnnsetfilternddescriptor_1(
+        (ptr)filterDesc,
+        (int)dataType,
+        (int)format,
+        (int)nbDims,
+        rpc_filterDimA, &result, clnt);
+
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
+    }
+    if (result != CUDNN_STATUS_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result);
     } 
-    *dataType = (cudnnDataType_t)result.int6_result_u.data[0];
-    *format = (cudnnTensorFormat_t)result.int6_result_u.data[1];
-    *k = result.int6_result_u.data[2];
-    *c = result.int6_result_u.data[3];
-    *h = result.int6_result_u.data[4];
-    *w = result.int6_result_u.data[5];
+    return result;
+}
+
+cudnnStatus_t cudnnGetFilterNdDescriptor(const cudnnFilterDescriptor_t filterDesc, int nbDimsRequested, cudnnDataType_t * dataType, cudnnTensorFormat_t * format, int* nbDims, int* filterDimA)
+{
+#ifdef WITH_API_CNT
+    cudnn_call_cnt++;
+#endif //WITH_API_CNT
+    // Fetches the descriptor via RPC; result starts as an error (remaining fields zeroed) so a failed RPC is never read uninitialized.
+    mem_result result = { .err = CUDNN_STATUS_INTERNAL_ERROR };
+    if (dataType == NULL || format == NULL || nbDims == NULL || filterDimA == NULL) {
+        LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__);
+        return CUDNN_STATUS_BAD_PARAM;
+    }
+    enum clnt_stat retval_1 = rpc_cudnngetfilternddescriptor_1(
+        (ptr)filterDesc,
+        nbDimsRequested,
+        &result, clnt);
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
+    }
+    size_t expected_size = nbDimsRequested * sizeof(int) + sizeof(int) + sizeof(cudnnDataType_t) + sizeof(cudnnTensorFormat_t);
+    if (result.err != CUDNN_STATUS_SUCCESS || result.mem_result_u.data.mem_data_len < expected_size) {
+        LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err);
+    } else {
+        // Payload: dataType | format | nbDims | dims[]. Copy whole values (not single chars) and never more dims than were requested/validated.
+        const char *payload = result.mem_result_u.data.mem_data_val;
+        memcpy(dataType, payload, sizeof(cudnnDataType_t));
+        payload += sizeof(cudnnDataType_t);
+        memcpy(format, payload, sizeof(cudnnTensorFormat_t));
+        payload += sizeof(cudnnTensorFormat_t);
+        memcpy(nbDims, payload, sizeof(int));
+        payload += sizeof(int);
+        memcpy(filterDimA, payload, (*nbDims < nbDimsRequested ? *nbDims : nbDimsRequested) * sizeof(int));
+    }
     return result.err;
 }
-DEF_FN(cudnnStatus_t, cudnnSetFilterNdDescriptor, cudnnFilterDescriptor_t, filterDesc, cudnnDataType_t, dataType, cudnnTensorFormat_t, format, int, nbDims, const int*, filterDimA)
-DEF_FN(cudnnStatus_t, cudnnGetFilterNdDescriptor, const cudnnFilterDescriptor_t, filterDesc, int, nbDimsRequested, cudnnDataType_t *, dataType, cudnnTensorFormat_t *, format, int*, nbDims, int*, filterDimA)
-DEF_FN(cudnnStatus_t, cudnnGetFilterSizeInBytes, const cudnnFilterDescriptor_t, filterDesc, size_t*, size)
-DEF_FN(cudnnStatus_t, cudnnTransformFilter, cudnnHandle_t, handle, const cudnnTensorTransformDescriptor_t, transDesc, const void *, alpha, const cudnnFilterDescriptor_t, srcDesc, const void *, srcData, const void *, beta, const cudnnFilterDescriptor_t, destDesc, void *, destData)
-DEF_FN(cudnnStatus_t, cudnnDestroyFilterDescriptor, cudnnFilterDescriptor_t, filterDesc)
+
+cudnnStatus_t cudnnGetFilterSizeInBytes(const cudnnFilterDescriptor_t filterDesc, size_t* size)
+{
+#ifdef WITH_API_CNT
+    cudnn_call_cnt++;
+#endif //WITH_API_CNT
+    // Queries the filter size via RPC; result starts as an error so a failed RPC is never read uninitialized. *size is written only on success.
+    sz_result result = { .err = CUDNN_STATUS_INTERNAL_ERROR };
+    if (size == NULL) {
+        LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__);
+        return CUDNN_STATUS_BAD_PARAM;
+    }
+    enum clnt_stat retval_1 = rpc_cudnngetfiltersizeinbytes_1(
+        (ptr)filterDesc,
+        &result, clnt);
+
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
+    }
+    if (result.err != CUDNN_STATUS_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err);
+    } else {
+        *size = result.sz_result_u.data;
+    }
+    return result.err;
+}
+
+cudnnStatus_t cudnnTransformFilter(cudnnHandle_t handle, const cudnnTensorTransformDescriptor_t transDesc, const void * alpha, const cudnnFilterDescriptor_t srcDesc, const void * srcData, const void * beta, const cudnnFilterDescriptor_t destDesc, void * destData)
+{
+#ifdef WITH_API_CNT
+    cudnn_call_cnt++;
+#endif //WITH_API_CNT
+    // Forwards the call via RPC; result starts as an error so a failed RPC never returns uninitialized data.
+    int result = CUDNN_STATUS_INTERNAL_ERROR;
+    //TODO: Check if we have a float instead of always sending doubles
+    // NOTE(review): alpha/beta are dereferenced as double unconditionally; a caller passing float scalars would be misread - confirm against callers.
+    cudnn_scaling_t rpc_alpha = {.dataType = CUDNN_DATA_DOUBLE, .cudnn_scaling_t_u.d = *((double*)alpha)};
+    cudnn_scaling_t rpc_beta = {.dataType = CUDNN_DATA_DOUBLE, .cudnn_scaling_t_u.d = *((double*)beta)};
+    enum clnt_stat retval_1 = rpc_cudnntransformfilter_1(
+        (ptr)handle,
+        (ptr)transDesc,
+        rpc_alpha,
+        (ptr)srcDesc,
+        (ptr)srcData,
+        rpc_beta,
+        (ptr)destDesc,
+        (ptr)destData,
+        &result, clnt);
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
+    }
+    if (result != CUDNN_STATUS_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result);
+    }
+    return result;
+}
+
+cudnnStatus_t cudnnDestroyFilterDescriptor(cudnnFilterDescriptor_t filterDesc)
+{
+#ifdef WITH_API_CNT
+    cudnn_call_cnt++;
+#endif //WITH_API_CNT
+    // Forwards the call via RPC; result starts as an error so a failed RPC never returns uninitialized data.
+    int result = CUDNN_STATUS_INTERNAL_ERROR;
+    enum clnt_stat retval_1 = rpc_cudnndestroyfilterdescriptor_1(
+        (ptr)filterDesc,
+        &result, clnt);
+
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
+    }
+    if (result != CUDNN_STATUS_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result);
+    }
+    return result;
+}
+
 DEF_FN(cudnnStatus_t, cudnnSoftmaxForward, cudnnHandle_t, handle, cudnnSoftmaxAlgorithm_t, algo, cudnnSoftmaxMode_t, mode, const void *,alpha, const cudnnTensorDescriptor_t, xDesc, const void *, x, const void *, beta, const cudnnTensorDescriptor_t, yDesc, void *, y)
-DEF_FN(cudnnStatus_t, cudnnCreatePoolingDescriptor, cudnnPoolingDescriptor_t *, poolingDesc)
-DEF_FN(cudnnStatus_t, cudnnSetPooling2dDescriptor, cudnnPoolingDescriptor_t, poolingDesc, cudnnPoolingMode_t, mode, cudnnNanPropagation_t, maxpoolingNanOpt, int, windowHeight, int, windowWidth, int, verticalPadding, int, horizontalPadding, int, verticalStride, int, horizontalStride)
-DEF_FN(cudnnStatus_t, cudnnGetPooling2dDescriptor, const cudnnPoolingDescriptor_t, poolingDesc, cudnnPoolingMode_t *, mode, cudnnNanPropagation_t *, maxpoolingNanOpt, int*, windowHeight, int*, windowWidth, int*, verticalPadding, int*, horizontalPadding, int*, verticalStride, int*, horizontalStride)
-DEF_FN(cudnnStatus_t, cudnnSetPoolingNdDescriptor, cudnnPoolingDescriptor_t, poolingDesc, const cudnnPoolingMode_t, mode, const cudnnNanPropagation_t, maxpoolingNanOpt, int, nbDims, const int*, windowDimA, const int*, paddingA, const int*, strideA)
-DEF_FN(cudnnStatus_t, cudnnGetPoolingNdDescriptor, const cudnnPoolingDescriptor_t, poolingDesc, int, nbDimsRequested, cudnnPoolingMode_t *, mode, cudnnNanPropagation_t *, maxpoolingNanOpt, int*, nbDims, int*, windowDimA, int*, paddingA, int*, strideA)
-DEF_FN(cudnnStatus_t, cudnnGetPoolingNdForwardOutputDim, const cudnnPoolingDescriptor_t, poolingDesc, const cudnnTensorDescriptor_t, inputTensorDesc, int, nbDims, int*, outputTensorDimA)
-DEF_FN(cudnnStatus_t, cudnnGetPooling2dForwardOutputDim, const cudnnPoolingDescriptor_t, poolingDesc, const cudnnTensorDescriptor_t, inputTensorDesc, int*, n, int*, c, int*, h, int*, w)
-DEF_FN(cudnnStatus_t, cudnnDestroyPoolingDescriptor, cudnnPoolingDescriptor_t, poolingDesc)
+cudnnStatus_t cudnnCreatePoolingDescriptor(cudnnPoolingDescriptor_t *poolingDesc)
+{
+#ifdef WITH_API_CNT
+    cudnn_call_cnt++;
+#endif //WITH_API_CNT
+    // Creates the descriptor via RPC; result starts as an error so a failed RPC is never read uninitialized. *poolingDesc is written only on success.
+    ptr_result result = { .err = CUDNN_STATUS_INTERNAL_ERROR };
+    if (poolingDesc == NULL) {
+        LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__);
+        return CUDNN_STATUS_BAD_PARAM;
+    }
+    enum clnt_stat retval_1 = rpc_cudnncreatepoolingdescriptor_1(&result, clnt);
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
+    }
+    if (result.err != CUDNN_STATUS_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err);
+    } else {
+        *poolingDesc = (cudnnPoolingDescriptor_t)result.ptr_result_u.ptr;
+    }
+    return result.err;
+}
+
+cudnnStatus_t cudnnSetPooling2dDescriptor(cudnnPoolingDescriptor_t poolingDesc, cudnnPoolingMode_t mode, cudnnNanPropagation_t maxpoolingNanOpt, int windowHeight, int windowWidth, int verticalPadding, int horizontalPadding, int verticalStride, int horizontalStride)
+{
+#ifdef WITH_API_CNT
+    cudnn_call_cnt++;
+#endif //WITH_API_CNT
+    // Forwards the call via RPC; result starts as an error so a failed RPC never returns uninitialized data.
+    int result = CUDNN_STATUS_INTERNAL_ERROR;
+    enum clnt_stat retval_1 = rpc_cudnnsetpooling2ddescriptor_1(
+        (ptr)poolingDesc,
+        (int)mode,
+        (int)maxpoolingNanOpt,
+        windowHeight,
+        windowWidth,
+        verticalPadding,
+        horizontalPadding,
+        verticalStride,
+        horizontalStride,
+        &result, clnt);
+
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
+    }
+    if (result != CUDNN_STATUS_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result);
+    }
+    return result;
+}
+    
+cudnnStatus_t cudnnGetPooling2dDescriptor(const cudnnPoolingDescriptor_t poolingDesc, cudnnPoolingMode_t *mode, cudnnNanPropagation_t *maxpoolingNanOpt, int* windowHeight, int* windowWidth, int* verticalPadding, int* horizontalPadding, int* verticalStride, int* horizontalStride)
+{
+#ifdef WITH_API_CNT
+    cudnn_call_cnt++;
+#endif //WITH_API_CNT
+    // Fetches the descriptor via RPC; result starts as an error so a failed RPC is never read uninitialized. Outputs are written only on success.
+    int8_result result = { .err = CUDNN_STATUS_INTERNAL_ERROR };
+    if (mode == NULL || maxpoolingNanOpt == NULL || windowHeight == NULL || windowWidth == NULL || verticalPadding == NULL || verticalStride == NULL || horizontalPadding == NULL || horizontalStride == NULL) {
+        LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__);
+        return CUDNN_STATUS_BAD_PARAM;
+    }
+    enum clnt_stat retval_1 = rpc_cudnngetpooling2ddescriptor_1(
+        (ptr)poolingDesc,
+        &result, clnt);
+
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
+    }
+    if (result.err != CUDNN_STATUS_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err);
+    } else {
+        *mode = (cudnnPoolingMode_t)result.int8_result_u.data[0];
+        *maxpoolingNanOpt = (cudnnNanPropagation_t)result.int8_result_u.data[1];
+        *windowHeight = result.int8_result_u.data[2];
+        *windowWidth = result.int8_result_u.data[3];
+        *verticalPadding = result.int8_result_u.data[4];
+        *horizontalPadding = result.int8_result_u.data[5];
+        *verticalStride = result.int8_result_u.data[6];
+        *horizontalStride = result.int8_result_u.data[7];
+    }
+    return result.err;
+}
+
+cudnnStatus_t cudnnSetPoolingNdDescriptor(cudnnPoolingDescriptor_t poolingDesc, const cudnnPoolingMode_t mode, const cudnnNanPropagation_t maxpoolingNanOpt, int nbDims, const int* windowDimA, const int* paddingA, const int* strideA)
+{
+#ifdef WITH_API_CNT
+    cudnn_call_cnt++;
+#endif //WITH_API_CNT
+    // Forwards the call via RPC (each array is sent as nbDims raw ints); result starts as an error so a failed RPC never returns uninitialized data.
+    int result = CUDNN_STATUS_INTERNAL_ERROR;
+    mem_data rpc_windowDimA = {
+        .mem_data_len = nbDims * sizeof(int),
+        .mem_data_val = (char*)windowDimA
+    };
+    mem_data rpc_paddingA = {
+        .mem_data_len = nbDims * sizeof(int),
+        .mem_data_val = (char*)paddingA
+    };
+    mem_data rpc_strideA = {
+        .mem_data_len = nbDims * sizeof(int),
+        .mem_data_val = (char*)strideA
+    };
+    enum clnt_stat retval_1 = rpc_cudnnsetpoolingnddescriptor_1(
+        (ptr)poolingDesc,
+        (int)mode,
+        (int)maxpoolingNanOpt,
+        (int)nbDims,
+        rpc_windowDimA,
+        rpc_paddingA,
+        rpc_strideA,
+        &result, clnt);
+
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
+    }
+    if (result != CUDNN_STATUS_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result);
+    }
+    return result;
+}
+
+cudnnStatus_t cudnnGetPoolingNdDescriptor(const cudnnPoolingDescriptor_t poolingDesc, int nbDimsRequested, cudnnPoolingMode_t * mode, cudnnNanPropagation_t * maxpoolingNanOpt, int* nbDims, int* windowDimA, int* paddingA, int* strideA)
+{
+#ifdef WITH_API_CNT
+    cudnn_call_cnt++;
+#endif //WITH_API_CNT
+    // Fetches the descriptor via RPC; result starts as an error (remaining fields zeroed) so a failed RPC is never read uninitialized.
+    mem_result result = { .err = CUDNN_STATUS_INTERNAL_ERROR };
+    if (mode == NULL || maxpoolingNanOpt == NULL || nbDims == NULL || windowDimA == NULL || paddingA == NULL || strideA == NULL) {
+        LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__);
+        return CUDNN_STATUS_BAD_PARAM;
+    }
+    enum clnt_stat retval_1 = rpc_cudnngetpoolingnddescriptor_1(
+        (ptr)poolingDesc,
+        nbDimsRequested,
+        &result, clnt);
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
+    }
+    const size_t dims_bytes = nbDimsRequested * sizeof(int);
+    size_t expected_size = dims_bytes * 3 + sizeof(int) + sizeof(cudnnPoolingMode_t) + sizeof(cudnnNanPropagation_t);
+    if (result.err != CUDNN_STATUS_SUCCESS || result.mem_result_u.data.mem_data_len != expected_size) {
+        LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err);
+    } else {
+        // Payload: mode | nanOpt | nbDims | window[] | padding[] | stride[]; arrays hold nbDimsRequested ints each (that is what the size check validated).
+        const char *payload = result.mem_result_u.data.mem_data_val;
+        memcpy(mode, payload, sizeof(cudnnPoolingMode_t));
+        payload += sizeof(cudnnPoolingMode_t);
+        memcpy(maxpoolingNanOpt, payload, sizeof(cudnnNanPropagation_t));
+        payload += sizeof(cudnnNanPropagation_t);
+        memcpy(nbDims, payload, sizeof(int));
+        payload += sizeof(int);
+        memcpy(windowDimA, payload, dims_bytes);
+        memcpy(paddingA, payload + dims_bytes, dims_bytes);
+        memcpy(strideA, payload + 2 * dims_bytes, dims_bytes);
+    }
+    return result.err;
+}
+
+cudnnStatus_t cudnnGetPoolingNdForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc, const cudnnTensorDescriptor_t inputTensorDesc, int nbDims, int* outputTensorDimA)
+{
+#ifdef WITH_API_CNT
+    cudnn_call_cnt++;
+#endif //WITH_API_CNT
+    // Queries the output dims via RPC; result starts as an error (remaining fields zeroed) so a failed RPC is never read uninitialized.
+    mem_result result = { .err = CUDNN_STATUS_INTERNAL_ERROR };
+    if (outputTensorDimA == NULL) {
+        LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__);
+        return CUDNN_STATUS_BAD_PARAM;
+    }
+    enum clnt_stat retval_1 = rpc_cudnngetpoolingndforwardoutputdim_1(
+        (ptr)poolingDesc,
+        (ptr)inputTensorDesc,
+        nbDims,
+        &result, clnt);
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
+    }
+    size_t expected_size = nbDims * sizeof(int);
+    if (result.err != CUDNN_STATUS_SUCCESS || result.mem_result_u.data.mem_data_len != expected_size) {
+        LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err);
+    } else {
+        memcpy(outputTensorDimA, result.mem_result_u.data.mem_data_val, expected_size);
+    }
+    return result.err;
+}
+
+cudnnStatus_t cudnnGetPooling2dForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc, const cudnnTensorDescriptor_t inputTensorDesc, int* n, int* c, int* h, int* w)
+{
+#ifdef WITH_API_CNT
+    cudnn_call_cnt++;
+#endif //WITH_API_CNT
+    // Queries the output dims via RPC; result starts as an error so a failed RPC is never read uninitialized. Outputs are written only on success.
+    int4_result result = { .err = CUDNN_STATUS_INTERNAL_ERROR };
+    if (n == NULL || c == NULL || h == NULL || w == NULL) {
+        LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__);
+        return CUDNN_STATUS_BAD_PARAM;
+    }
+    enum clnt_stat retval_1 = rpc_cudnngetpooling2dforwardoutputdim_1(
+        (ptr)poolingDesc,
+        (ptr)inputTensorDesc,
+        &result, clnt);
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
+    }
+    if (result.err != CUDNN_STATUS_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err);
+    } else {
+        *n = result.int4_result_u.data[0];
+        *c = result.int4_result_u.data[1];
+        *h = result.int4_result_u.data[2];
+        *w = result.int4_result_u.data[3];
+    }
+    return result.err;
+}
+
+cudnnStatus_t cudnnDestroyPoolingDescriptor(cudnnPoolingDescriptor_t poolingDesc)
+{
+#ifdef WITH_API_CNT
+    cudnn_call_cnt++;
+#endif //WITH_API_CNT
+    // Forwards the call via RPC; result starts as an error so a failed RPC never returns uninitialized data.
+    int result = CUDNN_STATUS_INTERNAL_ERROR;
+    enum clnt_stat retval_1 = rpc_cudnndestroypoolingdescriptor_1(
+        (ptr)poolingDesc,
+        &result, clnt);
+
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
+    }
+    if (result != CUDNN_STATUS_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result);
+    }
+    return result;
+}
+
 DEF_FN(cudnnStatus_t, cudnnPoolingForward, cudnnHandle_t, handle, const cudnnPoolingDescriptor_t, poolingDesc, const void *, alpha, const cudnnTensorDescriptor_t, xDesc, const void *, x, const void *, beta, const cudnnTensorDescriptor_t, yDesc, void *, y)
-DEF_FN(cudnnStatus_t, cudnnCreateActivationDescriptor, cudnnActivationDescriptor_t *, activationDesc)
-DEF_FN(cudnnStatus_t, cudnnSetActivationDescriptor, cudnnActivationDescriptor_t, activationDesc, cudnnActivationMode_t, mode, cudnnNanPropagation_t, reluNanOpt, double, coef) 
-DEF_FN(cudnnStatus_t, cudnnGetActivationDescriptor, const cudnnActivationDescriptor_t, activationDesc, cudnnActivationMode_t *, mode, cudnnNanPropagation_t *, reluNanOpt, double *, coef) 
-DEF_FN(cudnnStatus_t, cudnnSetActivationDescriptorSwishBeta, cudnnActivationDescriptor_t, activationDesc, double, swish_beta)
-DEF_FN(cudnnStatus_t, cudnnGetActivationDescriptorSwishBeta, cudnnActivationDescriptor_t, activationDesc, double *, swish_beta)
-DEF_FN(cudnnStatus_t, cudnnDestroyActivationDescriptor, cudnnActivationDescriptor_t, activationDesc)
+
+cudnnStatus_t cudnnCreateActivationDescriptor(cudnnActivationDescriptor_t * activationDesc)
+{
+#ifdef WITH_API_CNT
+    cudnn_call_cnt++;
+#endif //WITH_API_CNT
+    // Creates the descriptor via RPC; result starts as an error so a failed RPC is never read uninitialized. *activationDesc is written only on success.
+    ptr_result result = { .err = CUDNN_STATUS_INTERNAL_ERROR };
+    if (activationDesc == NULL) {
+        LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__);
+        return CUDNN_STATUS_BAD_PARAM;
+    }
+    enum clnt_stat retval_1 = rpc_cudnncreateactivationdescriptor_1(&result, clnt);
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
+    }
+    if (result.err != CUDNN_STATUS_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err);
+    } else {
+        *activationDesc = (cudnnActivationDescriptor_t)result.ptr_result_u.ptr;
+    }
+    return result.err;
+}
+
+cudnnStatus_t cudnnSetActivationDescriptor(cudnnActivationDescriptor_t activationDesc, cudnnActivationMode_t mode, cudnnNanPropagation_t reluNanOpt, double coef) 
+{
+#ifdef WITH_API_CNT
+    cudnn_call_cnt++;
+#endif //WITH_API_CNT
+    // Forwards the call via RPC; result starts as an error so a failed RPC never returns uninitialized data.
+    int result = CUDNN_STATUS_INTERNAL_ERROR;
+    enum clnt_stat retval_1 = rpc_cudnnsetactivationdescriptor_1(
+        (ptr)activationDesc,
+        (int)mode,
+        (int)reluNanOpt,
+        coef,
+        &result, clnt);
+
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
+    }
+    if (result != CUDNN_STATUS_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result);
+    }
+    return result;
+}
+
+cudnnStatus_t cudnnGetActivationDescriptor(const cudnnActivationDescriptor_t activationDesc, cudnnActivationMode_t *mode, cudnnNanPropagation_t *reluNanOpt, double *coef) 
+{
+#ifdef WITH_API_CNT
+    cudnn_call_cnt++;
+#endif //WITH_API_CNT
+    // Fetches the descriptor via RPC; result starts as an error so a failed RPC is never read uninitialized. Outputs are written only on success.
+    int2d1_result result = { .err = CUDNN_STATUS_INTERNAL_ERROR };
+    if (mode == NULL || reluNanOpt == NULL || coef == NULL) {
+        LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__);
+        return CUDNN_STATUS_BAD_PARAM;
+    }
+    enum clnt_stat retval_1 = rpc_cudnngetactivationdescriptor_1(
+        (ptr)activationDesc,
+        &result, clnt);
+
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
+    }
+    if (result.err != CUDNN_STATUS_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err);
+    } else {
+        *mode = (cudnnActivationMode_t)result.int2d1_result_u.data.i[0];
+        *reluNanOpt = (cudnnNanPropagation_t)result.int2d1_result_u.data.i[1];
+        *coef = result.int2d1_result_u.data.d;
+    }
+    return result.err;
+}
+
+cudnnStatus_t cudnnSetActivationDescriptorSwishBeta(cudnnActivationDescriptor_t activationDesc, double swish_beta)
+{
+#ifdef WITH_API_CNT
+    cudnn_call_cnt++;
+#endif //WITH_API_CNT
+    // Forwards the call via RPC; result starts as an error so a failed RPC never returns uninitialized data.
+    int result = CUDNN_STATUS_INTERNAL_ERROR;
+    enum clnt_stat retval_1 = rpc_cudnnsetactivationdescriptorswishbeta_1(
+        (ptr)activationDesc,
+        swish_beta,
+        &result, clnt);
+
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
+    }
+    if (result != CUDNN_STATUS_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result);
+    }
+    return result;
+}
+    
+cudnnStatus_t cudnnGetActivationDescriptorSwishBeta(cudnnActivationDescriptor_t activationDesc, double * swish_beta)
+{
+#ifdef WITH_API_CNT
+    cudnn_call_cnt++;
+#endif //WITH_API_CNT
+    // Fetches swish beta via RPC; result starts as an error so a failed RPC is never read uninitialized. *swish_beta is written only on success.
+    d_result result = { .err = CUDNN_STATUS_INTERNAL_ERROR };
+    if (swish_beta == NULL) {
+        LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__);
+        return CUDNN_STATUS_BAD_PARAM;
+    }
+    enum clnt_stat retval_1 = rpc_cudnngetactivationdescriptorswishbeta_1(
+        (ptr)activationDesc,
+        &result, clnt);
+
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
+    }
+    if (result.err != CUDNN_STATUS_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err);
+    } else {
+        *swish_beta = result.d_result_u.data;
+    }
+    return result.err;
+}
+
+cudnnStatus_t cudnnDestroyActivationDescriptor(cudnnActivationDescriptor_t activationDesc)
+{
+#ifdef WITH_API_CNT
+    cudnn_call_cnt++;
+#endif //WITH_API_CNT
+    // Forwards the call via RPC; result starts as an error so a failed RPC never returns uninitialized data.
+    int result = CUDNN_STATUS_INTERNAL_ERROR;
+    enum clnt_stat retval_1 = rpc_cudnndestroyactivationdescriptor_1(
+        (ptr)activationDesc,
+        &result, clnt);
+
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
+    }
+    if (result != CUDNN_STATUS_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result);
+    }
+    return result;
+}
+
 DEF_FN(cudnnStatus_t, cudnnActivationForward, cudnnHandle_t, handle, cudnnActivationDescriptor_t, activationDesc, const void *, alpha, const cudnnTensorDescriptor_t, xDesc, const void *, x, const void *, beta, const cudnnTensorDescriptor_t, yDesc, void *, y)
 DEF_FN(cudnnStatus_t, cudnnCreateLRNDescriptor, cudnnLRNDescriptor_t *, normDesc)
 DEF_FN(cudnnStatus_t, cudnnSetLRNDescriptor, cudnnLRNDescriptor_t, normDesc, unsigned, lrnN, double, lrnAlpha, double, lrnBeta, double, lrnK)
diff --git a/cpu/cpu-server-cudnn.c b/cpu/cpu-server-cudnn.c
index a845aa8c..511626b5 100644
--- a/cpu/cpu-server-cudnn.c
+++ b/cpu/cpu-server-cudnn.c
@@ -22,12 +22,20 @@ int server_cudnn_init(int bypass)
 {
     int ret = 0;
     ret &= resource_mg_init(&rm_cudnn, bypass);
+    ret &= resource_mg_init(&rm_cudnn_tensors, bypass);
+    ret &= resource_mg_init(&rm_cudnn_filters, bypass);
+    ret &= resource_mg_init(&rm_cudnn_poolings, bypass);
+    ret &= resource_mg_init(&rm_cudnn_activations, bypass);
     return ret;
 }
 
 int server_cudnn_deinit(void)
 {
     resource_mg_free(&rm_cudnn);
+    resource_mg_free(&rm_cudnn_tensors); // these four mirror the resource_mg_init calls in server_cudnn_init
+    resource_mg_free(&rm_cudnn_filters);
+    resource_mg_free(&rm_cudnn_poolings);
+    resource_mg_free(&rm_cudnn_activations);
     return 0;
 
 }
@@ -121,6 +129,7 @@ bool_t rpc_cudnndestroy_1_svc(ptr handle, int *result, struct svc_req *rqstp)
     GSCHED_RETAIN;
     *result = cudnnDestroy(
         (cudnnHandle_t)resource_mg_get(&rm_cudnn, (void*)handle));
+    // TODO: Remove from resource manager
     GSCHED_RELEASE;
     RECORD_RESULT(integer, *result);
     return 1;
@@ -168,4 +177,439 @@ bool_t rpc_cudnncreatetensordescriptor_1_svc(ptr_result *result, struct svc_req
     GSCHED_RELEASE;
     RECORD_RESULT(ptr_result_u, *result);
     return 1;
+}
+
+bool_t rpc_cudnncreatefilterdescriptor_1_svc(ptr_result *result, struct svc_req *rqstp)
+{ // RPC handler: creates a cuDNN filter descriptor and hands its handle back in *result.
+    RECORD_VOID_API;
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+
+    GSCHED_RETAIN;
+    result->err = cudnnCreateFilterDescriptor((cudnnFilterDescriptor_t*)&result->ptr_result_u.ptr); // status -> err, new handle -> ptr
+    if (resource_mg_create(&rm_cudnn_filters, (void*)result->ptr_result_u.ptr) != 0) { // registered regardless of the cuDNN status
+        LOGE(LOG_ERROR, "error in resource manager");
+    }
+    GSCHED_RELEASE;
+    RECORD_RESULT(ptr_result_u, *result);
+    return 1; // always 1: cuDNN / resource-manager failures are visible only via result->err and the log
+}
+
+bool_t rpc_cudnnsetfilter4ddescriptor_1_svc(ptr filterDesc, int dataType, int format, int k, int c, int h, int w, int *result, struct svc_req *rqstp)
+{ // RPC handler: forwards its arguments to cudnnSetFilter4dDescriptor and stores the status in *result.
+    RECORD_API(rpc_cudnnsetfilter4ddescriptor_1_argument);
+    RECORD_NARG(filterDesc);
+    RECORD_NARG(dataType);
+    RECORD_NARG(format);
+    RECORD_NARG(k);
+    RECORD_NARG(c);
+    RECORD_NARG(h);
+    RECORD_NARG(w);
+
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+
+    GSCHED_RETAIN;
+    *result = cudnnSetFilter4dDescriptor(
+        (cudnnFilterDescriptor_t)resource_mg_get(&rm_cudnn_filters, (void*)filterDesc), // descriptor looked up in the filter resource manager
+        (cudnnDataType_t)dataType, // enums arrive as plain ints and are cast back to the cuDNN types
+        (cudnnTensorFormat_t)format,
+        k, c, h, w);
+    GSCHED_RELEASE;
+    RECORD_RESULT(integer, *result);
+    return 1; // always 1; the cuDNN status travels in *result
+}
+
+bool_t rpc_cudnngetfilter4ddescriptor_1_svc(ptr filterDesc, int6_result *result, struct svc_req *rqstp)
+{ // RPC handler: reads back a 4d filter descriptor; status in result->err, the six outputs in data[0..5]. No RECORD_* calls here.
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+
+    GSCHED_RETAIN;
+    result->err = cudnnGetFilter4dDescriptor(
+        (cudnnFilterDescriptor_t)resource_mg_get(&rm_cudnn_filters, (void*)filterDesc),
+        (cudnnDataType_t*)&result->int6_result_u.data[0], // data[0]: data type, written through an enum pointer into an int slot
+        (cudnnTensorFormat_t*)&result->int6_result_u.data[1], // data[1]: tensor format
+        &result->int6_result_u.data[2], // data[2..5]: remaining int outputs in the cuDNN call's parameter order
+        &result->int6_result_u.data[3],
+        &result->int6_result_u.data[4],
+        &result->int6_result_u.data[5]);
+    GSCHED_RELEASE;
+    return 1;
+}
+
+bool_t rpc_cudnnsetfilternddescriptor_1_svc(ptr filterDesc, int dataType, int format, int nbDims, mem_data filterDimA, int *result, struct svc_req *rqstp)
+{ // RPC handler: forwards to cudnnSetFilterNdDescriptor; filterDimA carries the dimension array as raw bytes.
+    RECORD_API(rpc_cudnnsetfilternddescriptor_1_argument);
+    RECORD_NARG(filterDesc);
+    RECORD_NARG(dataType);
+    RECORD_NARG(format);
+    RECORD_NARG(nbDims);
+    RECORD_NARG(filterDimA);
+    
+    //TODO: Recording filterDimA is not as easy as done here.
+
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+
+    if (filterDimA.mem_data_len != nbDims * sizeof(int)) { // payload must hold exactly nbDims ints
+        LOGE(LOG_ERROR, "array dimension not as expected.");
+        return 0; // bails out before cuDNN is called; *result is left untouched
+    }
+    GSCHED_RETAIN;
+    *result = cudnnSetFilterNdDescriptor(
+        (cudnnFilterDescriptor_t)resource_mg_get(&rm_cudnn_filters, (void*)filterDesc),
+        (cudnnDataType_t)dataType,
+        (cudnnTensorFormat_t)format,
+        nbDims,
+        (const int*)filterDimA.mem_data_val); // byte payload reinterpreted as an int array
+    GSCHED_RELEASE;
+    RECORD_RESULT(integer, *result);
+    return 1;
+}
+
+bool_t rpc_cudnngetfilternddescriptor_1_svc(ptr filterDesc, int nbDimsRequested, mem_result *result, struct svc_req *rqstp)
+{ // RPC handler: reads back an Nd filter descriptor; all outputs are packed into one malloc'ed byte buffer in *result.
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+    result->mem_result_u.data.mem_data_len = sizeof(cudnnDataType_t) + sizeof(cudnnTensorFormat_t) + sizeof(int) + nbDimsRequested*sizeof(int);
+    if ((result->mem_result_u.data.mem_data_val = malloc(result->mem_result_u.data.mem_data_len)) == NULL) {
+        LOGE(LOG_ERROR, "malloc failed");
+        return 0; // returns before GSCHED_RETAIN, so nothing has to be released
+    }
+    // Buffer layout: [dataType][format][nbDims][filterDimA[nbDimsRequested]]; the offsets below must add up to mem_data_len.
+    GSCHED_RETAIN;
+    result->err = cudnnGetFilterNdDescriptor(
+        (cudnnFilterDescriptor_t)resource_mg_get(&rm_cudnn_filters, (void*)filterDesc),
+        nbDimsRequested,
+        (cudnnDataType_t*)result->mem_result_u.data.mem_data_val,
+        (cudnnTensorFormat_t*)&result->mem_result_u.data.mem_data_val[sizeof(cudnnDataType_t)],
+        (int*)&result->mem_result_u.data.mem_data_val[sizeof(cudnnDataType_t)+sizeof(cudnnTensorFormat_t)], // skip the format enum (not the descriptor handle type) so the slot stays inside the buffer
+        (int*)&result->mem_result_u.data.mem_data_val[sizeof(cudnnDataType_t)+sizeof(cudnnTensorFormat_t)+sizeof(int)]);
+    GSCHED_RELEASE;
+    return 1; // cuDNN status is reported through result->err
+}
+
+bool_t rpc_cudnngetfiltersizeinbytes_1_svc(ptr filterDesc, sz_result *result, struct svc_req *rqstp)
+{ // RPC handler: wraps cudnnGetFilterSizeInBytes; status in result->err, size in result->sz_result_u.data.
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+    GSCHED_RETAIN;
+    result->err = cudnnGetFilterSizeInBytes(
+        (cudnnFilterDescriptor_t)resource_mg_get(&rm_cudnn_filters, (void*)filterDesc),
+        &result->sz_result_u.data); // written directly by cuDNN
+    GSCHED_RELEASE;
+    return 1;
+}
+
+bool_t rpc_cudnntransformfilter_1_svc(ptr handle, ptr transDesc, cudnn_scaling_t alpha, ptr srcDesc, ptr srcData, cudnn_scaling_t beta, ptr destDesc, ptr destData, int *result, struct svc_req *rqstp)
+{ // RPC handler: forwards to cudnnTransformFilter and stores the status in *result.
+    RECORD_API(rpc_cudnntransformfilter_1_argument);
+    RECORD_NARG(handle);
+    RECORD_NARG(transDesc);
+    RECORD_NARG(alpha);
+    RECORD_NARG(srcDesc);
+    RECORD_NARG(srcData);
+    RECORD_NARG(beta);
+    RECORD_NARG(destDesc);
+    RECORD_NARG(destData);
+    
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+
+    GSCHED_RETAIN;
+    *result = cudnnTransformFilter(
+        (cudnnHandle_t)resource_mg_get(&rm_cudnn, (void*)handle),
+        (const cudnnTensorTransformDescriptor_t)resource_mg_get(&rm_cudnn_tensortransform, (void*)transDesc), // NOTE(review): server_cudnn_init in this patch does not initialize rm_cudnn_tensortransform - confirm
+        (alpha.dataType == CUDNN_DATA_DOUBLE ? (const void*)&alpha.cudnn_scaling_t_u.d : (const void*)&alpha.cudnn_scaling_t_u.f), // pass the double or the float member of the scaling union, chosen by its dataType tag
+        (const cudnnFilterDescriptor_t)resource_mg_get(&rm_cudnn_filters, (void*)srcDesc),
+        (const void*)srcData, // srcData/destData are passed through as raw pointers, without a resource-manager lookup
+        (beta.dataType == CUDNN_DATA_DOUBLE ? (const void*)&beta.cudnn_scaling_t_u.d : (const void*)&beta.cudnn_scaling_t_u.f),
+        (const cudnnFilterDescriptor_t)resource_mg_get(&rm_cudnn_filters, (void*)destDesc),
+        (void*)destData);
+    GSCHED_RELEASE;
+    RECORD_RESULT(integer, *result);
+    return 1;
+}
+
+bool_t rpc_cudnndestroyfilterdescriptor_1_svc(ptr filterDesc, int *result, struct svc_req *rqstp)
+{ // RPC handler: destroys the filter descriptor identified by filterDesc; status in *result.
+    RECORD_API(ptr);
+    RECORD_SINGLE_ARG(filterDesc);
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+
+    GSCHED_RETAIN;
+    *result = cudnnDestroyFilterDescriptor(
+        (cudnnFilterDescriptor_t)resource_mg_get(&rm_cudnn_filters, (void*)filterDesc));
+    // TODO: Remove from resource manager (the rm_cudnn_filters entry currently outlives the descriptor)
+    GSCHED_RELEASE;
+    RECORD_RESULT(integer, *result);
+    return 1;
+}
+
+bool_t rpc_cudnncreatepoolingdescriptor_1_svc(ptr_result *result, struct svc_req *rqstp)
+{ // RPC handler: creates a cuDNN pooling descriptor and hands its handle back in *result.
+    RECORD_VOID_API;
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+
+    GSCHED_RETAIN;
+    result->err = cudnnCreatePoolingDescriptor((cudnnPoolingDescriptor_t*)&result->ptr_result_u.ptr); // status -> err, new handle -> ptr
+    if (resource_mg_create(&rm_cudnn_poolings, (void*)result->ptr_result_u.ptr) != 0) { // registered regardless of the cuDNN status
+        LOGE(LOG_ERROR, "error in resource manager");
+    }
+    GSCHED_RELEASE;
+    RECORD_RESULT(ptr_result_u, *result);
+    return 1; // always 1; failures are visible only via result->err and the log
+}
+
+bool_t rpc_cudnnsetpooling2ddescriptor_1_svc(ptr poolingDesc, int mode, int maxpoolingNanOpt, int windowHeight, int windowWidth, int verticalPadding, int horizontalPadding, int verticalStride, int horizontalStride, int *result, struct svc_req *rqstp)
+{ // RPC handler: forwards its arguments to cudnnSetPooling2dDescriptor and stores the status in *result.
+    RECORD_API(rpc_cudnnsetpooling2ddescriptor_1_argument);
+    RECORD_NARG(poolingDesc);
+    RECORD_NARG(mode);
+    RECORD_NARG(maxpoolingNanOpt);
+    RECORD_NARG(windowHeight);
+    RECORD_NARG(windowWidth);
+    RECORD_NARG(verticalPadding);
+    RECORD_NARG(horizontalPadding);
+    RECORD_NARG(verticalStride);
+    RECORD_NARG(horizontalStride);
+
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+
+    GSCHED_RETAIN;
+    *result = cudnnSetPooling2dDescriptor(
+        (cudnnPoolingDescriptor_t)resource_mg_get(&rm_cudnn_poolings, (void*)poolingDesc), // descriptor looked up in the pooling resource manager
+        (cudnnPoolingMode_t)mode, // enums arrive as plain ints and are cast back to the cuDNN types
+        (cudnnNanPropagation_t)maxpoolingNanOpt,
+        windowHeight, windowWidth,
+        verticalPadding, horizontalPadding,
+        verticalStride, horizontalStride);
+    GSCHED_RELEASE;
+    RECORD_RESULT(integer, *result);
+    return 1;
+}
+
+bool_t rpc_cudnngetpooling2ddescriptor_1_svc(ptr poolingDesc, int8_result *result, struct svc_req *rqstp)
+{ // RPC handler: reads back a 2d pooling descriptor; status in result->err, the eight outputs in data[0..7]. No RECORD_* calls here.
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+
+    GSCHED_RETAIN;
+    result->err = cudnnGetPooling2dDescriptor(
+        (cudnnPoolingDescriptor_t)resource_mg_get(&rm_cudnn_poolings, (void*)poolingDesc),
+        (cudnnPoolingMode_t*)&result->int8_result_u.data[0], // data[0]: pooling mode, written through an enum pointer into an int slot
+        (cudnnNanPropagation_t*)&result->int8_result_u.data[1], // data[1]: NaN propagation option
+        &result->int8_result_u.data[2], // data[2..7]: remaining int outputs in the cuDNN call's parameter order
+        &result->int8_result_u.data[3],
+        &result->int8_result_u.data[4],
+        &result->int8_result_u.data[5],
+        &result->int8_result_u.data[6],
+        &result->int8_result_u.data[7]);
+    GSCHED_RELEASE;
+    return 1;
+}
+
+bool_t rpc_cudnnsetpoolingnddescriptor_1_svc(ptr poolingDesc, int mode, int maxpoolingNanOpt, int nbDims, mem_data windowDimA, mem_data paddingA, mem_data strideA, int *result, struct svc_req *rqstp)
+{ // RPC handler: forwards to cudnnSetPoolingNdDescriptor; the three arrays arrive as raw byte payloads.
+    RECORD_API(rpc_cudnnsetpoolingnddescriptor_1_argument);
+    RECORD_NARG(poolingDesc);
+    RECORD_NARG(mode);
+    RECORD_NARG(maxpoolingNanOpt);
+    RECORD_NARG(nbDims);
+    RECORD_NARG(windowDimA);
+    RECORD_NARG(paddingA);
+    RECORD_NARG(strideA);
+    //TODO: Recording windowDimA, paddingA and strideA are not as easy as done here.
+
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+
+    if (windowDimA.mem_data_len != nbDims * sizeof(int) || // each payload must hold exactly nbDims ints
+        paddingA.mem_data_len != nbDims * sizeof(int) ||
+        strideA.mem_data_len != nbDims * sizeof(int)) {
+        LOGE(LOG_ERROR, "array dimensions not as expected.");
+        return 0; // bails out before cuDNN is called; *result is left untouched
+    }
+    GSCHED_RETAIN;
+    *result = cudnnSetPoolingNdDescriptor(
+        (cudnnPoolingDescriptor_t)resource_mg_get(&rm_cudnn_poolings, (void*)poolingDesc),
+        (cudnnPoolingMode_t)mode,
+        (cudnnNanPropagation_t)maxpoolingNanOpt,
+        nbDims,
+        (const int*)windowDimA.mem_data_val, // byte payloads reinterpreted as int arrays
+        (const int*)paddingA.mem_data_val,
+        (const int*)strideA.mem_data_val);
+    GSCHED_RELEASE;
+    RECORD_RESULT(integer, *result);
+    return 1;
+}
+
+bool_t rpc_cudnngetpoolingnddescriptor_1_svc(ptr poolingDesc, int nbDimsRequested, mem_result *result, struct svc_req *rqstp)
+{ // RPC handler: reads back an Nd pooling descriptor; all outputs are packed into one malloc'ed byte buffer in *result.
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+    result->mem_result_u.data.mem_data_len = sizeof(cudnnPoolingMode_t) + sizeof(cudnnNanPropagation_t) + sizeof(int) + nbDimsRequested * sizeof(int) * 3; // includes the nbDims slot reserved by offsets[2]; NOTE(review): the client-side decoder must expect the same length - confirm
+    if ((result->mem_result_u.data.mem_data_val = malloc(result->mem_result_u.data.mem_data_len)) == NULL) {
+        LOGE(LOG_ERROR, "malloc failed");
+        return 0; // returns before GSCHED_RETAIN, so nothing has to be released
+    }
+    // Byte offsets of each output inside the buffer: [mode][nanOpt][nbDims][windowDimA][paddingA][strideA].
+    size_t offsets[] = {
+        0,
+        sizeof(cudnnPoolingMode_t),
+        sizeof(cudnnPoolingMode_t) + sizeof(cudnnNanPropagation_t),
+        sizeof(cudnnPoolingMode_t) + sizeof(cudnnNanPropagation_t) + sizeof(int),
+        sizeof(cudnnPoolingMode_t) + sizeof(cudnnNanPropagation_t) + sizeof(int) + sizeof(int) * nbDimsRequested,
+        sizeof(cudnnPoolingMode_t) + sizeof(cudnnNanPropagation_t) + sizeof(int) + sizeof(int) * nbDimsRequested * 2,
+    };
+    
+    GSCHED_RETAIN;
+    // Every output argument is the ADDRESS of its slot in the buffer (note the '&').
+    // No integer is cast to a pointer any more, so no -Wint-to-pointer-cast suppression is needed.
+    result->err = cudnnGetPoolingNdDescriptor(
+        (cudnnPoolingDescriptor_t)resource_mg_get(&rm_cudnn_poolings, (void*)poolingDesc),
+        nbDimsRequested,
+        (cudnnPoolingMode_t*)&result->mem_result_u.data.mem_data_val[offsets[0]],
+        (cudnnNanPropagation_t*)&result->mem_result_u.data.mem_data_val[offsets[1]],
+        (int*)&result->mem_result_u.data.mem_data_val[offsets[2]],
+        (int*)&result->mem_result_u.data.mem_data_val[offsets[3]],
+        (int*)&result->mem_result_u.data.mem_data_val[offsets[4]],
+        (int*)&result->mem_result_u.data.mem_data_val[offsets[5]]);
+
+
+    GSCHED_RELEASE;
+    return 1; // cuDNN status is reported through result->err
+}
+
+bool_t rpc_cudnngetPoolingNdForwardOutputDim_1_svc(ptr poolingDesc, ptr inputTensorDesc, int nbDims, mem_result *result, struct svc_req *rqstp)
+{ // RPC handler: returns nbDims output dimensions as a malloc'ed int array in *result. NOTE(review): sibling handlers use all-lowercase names - confirm this matches the rpcgen prototype
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+    result->mem_result_u.data.mem_data_len = sizeof(int) * nbDims;
+    if ((result->mem_result_u.data.mem_data_val = malloc(result->mem_result_u.data.mem_data_len)) == NULL) {
+        LOGE(LOG_ERROR, "malloc failed");
+        return 0; // allocation happens before GSCHED_RETAIN so this early return cannot skip the matching GSCHED_RELEASE
+    }
+    GSCHED_RETAIN;
+    result->err = cudnnGetPoolingNdForwardOutputDim(
+        (cudnnPoolingDescriptor_t)resource_mg_get(&rm_cudnn_poolings, (void*)poolingDesc),
+        (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)inputTensorDesc),
+        nbDims,
+        (int*)result->mem_result_u.data.mem_data_val); // write into the allocated payload, not over the mem_data length/pointer struct itself
+    GSCHED_RELEASE;
+    return 1; // cuDNN status is reported through result->err
+}
+
+bool_t rpc_cudnngetPooling2dForwardOutputDim_1_svc(ptr poolingDesc, ptr inputTensorDesc, int4_result *result, struct svc_req *rqstp)
+{ // RPC handler: wraps cudnnGetPooling2dForwardOutputDim; status in result->err. NOTE(review): sibling handlers use all-lowercase names - confirm this matches the rpcgen prototype
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+    GSCHED_RETAIN;
+    result->err = cudnnGetPooling2dForwardOutputDim(
+        (cudnnPoolingDescriptor_t)resource_mg_get(&rm_cudnn_poolings, (void*)poolingDesc),
+        (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)inputTensorDesc), // input tensor comes from the tensor resource manager
+        (int*)&result->int4_result_u.data[0], // data[0..3]: the four int outputs in the cuDNN call's parameter order
+        (int*)&result->int4_result_u.data[1],
+        (int*)&result->int4_result_u.data[2],
+        (int*)&result->int4_result_u.data[3]);
+    GSCHED_RELEASE;
+    return 1;
+}
+
+bool_t rpc_cudnndestroypoolingdescriptor_1_svc(ptr poolingDesc, int *result, struct svc_req *rqstp)
+{ // RPC handler: destroys the pooling descriptor identified by poolingDesc; status in *result.
+    RECORD_API(ptr);
+    RECORD_SINGLE_ARG(poolingDesc);
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+
+    GSCHED_RETAIN;
+    *result = cudnnDestroyPoolingDescriptor(
+        (cudnnPoolingDescriptor_t)resource_mg_get(&rm_cudnn_poolings, (void*)poolingDesc));
+    // TODO: Remove from resource manager (the rm_cudnn_poolings entry currently outlives the descriptor)
+    GSCHED_RELEASE;
+    RECORD_RESULT(integer, *result);
+    return 1;
+}
+
+bool_t rpc_cudnncreateactivationdescriptor_1_svc(ptr_result *result, struct svc_req *rqstp)
+{ // RPC handler: creates a cuDNN activation descriptor and hands its handle back in *result.
+    RECORD_VOID_API;
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+
+    GSCHED_RETAIN;
+    result->err = cudnnCreateActivationDescriptor((cudnnActivationDescriptor_t*)&result->ptr_result_u.ptr); // status -> err, new handle -> ptr
+    if (resource_mg_create(&rm_cudnn_activations, (void*)result->ptr_result_u.ptr) != 0) { // registered regardless of the cuDNN status
+        LOGE(LOG_ERROR, "error in resource manager");
+    }
+    GSCHED_RELEASE;
+    RECORD_RESULT(ptr_result_u, *result);
+    return 1; // always 1; failures are visible only via result->err and the log
+}
+
+bool_t rpc_cudnnsetactivationdescriptor_1_svc(ptr activationDesc, int mode, int reluNanOpt, double coef, int *result, struct svc_req *rqstp)
+{ // RPC handler: forwards its arguments to cudnnSetActivationDescriptor and stores the status in *result.
+    RECORD_API(rpc_cudnnsetactivationdescriptor_1_argument);
+    RECORD_NARG(activationDesc);
+    RECORD_NARG(mode);
+    RECORD_NARG(reluNanOpt);
+    RECORD_NARG(coef);
+
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+
+    GSCHED_RETAIN;
+    *result = cudnnSetActivationDescriptor(
+        (cudnnActivationDescriptor_t)resource_mg_get(&rm_cudnn_activations, (void*)activationDesc), // descriptor looked up in the activation resource manager
+        (cudnnActivationMode_t)mode, // enums arrive as plain ints and are cast back to the cuDNN types
+        (cudnnNanPropagation_t)reluNanOpt,
+        coef);
+    GSCHED_RELEASE;
+    RECORD_RESULT(integer, *result);
+    return 1;
+}
+
+bool_t rpc_cudnngetactivationdescriptor_1_svc(ptr activationDesc, int2d1_result *result, struct svc_req *rqstp)
+{ // RPC handler: reads back an activation descriptor; status in result->err, outputs in result->int2d1_result_u.data. No RECORD_* calls here.
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+
+    GSCHED_RETAIN;
+    result->err = cudnnGetActivationDescriptor(
+        (cudnnActivationDescriptor_t)resource_mg_get(&rm_cudnn_activations, (void*)activationDesc),
+        (cudnnActivationMode_t*)&result->int2d1_result_u.data.i[0], // i[0]: activation mode, written through an enum pointer into an int slot
+        (cudnnNanPropagation_t*)&result->int2d1_result_u.data.i[1], // i[1]: NaN propagation option
+        &result->int2d1_result_u.data.d); // d: the double-valued output
+    GSCHED_RELEASE;
+    return 1;
+}
+
+bool_t rpc_cudnnsetactivationdescriptorswishbeta_1_svc(ptr activationDesc, double swish_beta, int *result, struct svc_req *rqstp)
+{ // RPC handler: forwards swish_beta to cudnnSetActivationDescriptorSwishBeta and stores the status in *result.
+    RECORD_API(rpc_cudnnsetactivationdescriptorswishbeta_1_argument);
+    RECORD_NARG(activationDesc);
+    RECORD_NARG(swish_beta);
+
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+
+    GSCHED_RETAIN;
+    *result = cudnnSetActivationDescriptorSwishBeta(
+        (cudnnActivationDescriptor_t)resource_mg_get(&rm_cudnn_activations, (void*)activationDesc), // descriptor looked up in the activation resource manager
+        swish_beta);
+    GSCHED_RELEASE;
+    RECORD_RESULT(integer, *result);
+    return 1;
+}
+
+bool_t rpc_cudnngetactivationdescriptorswishbeta_1_svc(ptr activationDesc, d_result *result, struct svc_req *rqstp)
+{ // RPC handler: wraps cudnnGetActivationDescriptorSwishBeta; status in result->err, value in result->d_result_u.data.
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+
+    GSCHED_RETAIN;
+    result->err = cudnnGetActivationDescriptorSwishBeta(
+        (cudnnActivationDescriptor_t)resource_mg_get(&rm_cudnn_activations, (void*)activationDesc),
+        &result->d_result_u.data); // written directly by cuDNN
+    GSCHED_RELEASE;
+    return 1;
+}
+
+bool_t rpc_cudnndestroyactivationdescriptor_1_svc(ptr activationDesc, int *result, struct svc_req *rqstp)
+{ // RPC handler: destroys the activation descriptor identified by activationDesc; status in *result.
+    RECORD_API(ptr);
+    RECORD_SINGLE_ARG(activationDesc);
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+
+    GSCHED_RETAIN;
+    *result = cudnnDestroyActivationDescriptor(
+        (cudnnActivationDescriptor_t)resource_mg_get(&rm_cudnn_activations, (void*)activationDesc));
+    // TODO: Remove from resource manager (the rm_cudnn_activations entry currently outlives the descriptor)
+    GSCHED_RELEASE;
+    RECORD_RESULT(integer, *result);
+    return 1;
 }
\ No newline at end of file
diff --git a/cpu/cpu_rpc_prot.x b/cpu/cpu_rpc_prot.x
index fdfa7051..361fdcf4 100644
--- a/cpu/cpu_rpc_prot.x
+++ b/cpu/cpu_rpc_prot.x
@@ -1,4 +1,5 @@
 typedef opaque mem_data<>;
+
 typedef unsigned hyper size_t;
 typedef unsigned hyper ptr;
 typedef opaque rpc_cuda_device_prop[1032];
@@ -39,6 +40,11 @@ struct rpc_dim3 {
     unsigned int z;
 };
 
+struct int2d1 { /* two ints plus one double; payload type of int2d1_result */
+    int i[2];
+    double d;
+};
+
 union cudnn_scaling_t switch (int dataType) {
 case 2:
 case 0:
@@ -124,6 +130,8 @@ default:
     void;
 };
 
+/* memory allocated for RPC. */
+/* Freed by rpc_cd_prog_1_freeresult after RPC. */
 union mem_result switch (int err) {
 case 0:
     mem_data data;
@@ -145,6 +153,13 @@ default:
     void;
 };
 
+union int4_result switch (int err) { /* carries four ints when err == 0, nothing otherwise */
+case 0:
+    int data[4];
+default:
+    void;
+};
+
 union int5_result switch (int err) {
 case 0:
     int data[5];
@@ -159,6 +174,13 @@ default:
     void;
 };
 
+union int8_result switch (int err) { /* carries eight ints when err == 0, nothing otherwise */
+case 0:
+    int data[8];
+default:
+    void;
+};
+
 union int9_result switch (int err) {
 case 0:
     int data[9];
@@ -166,6 +188,13 @@ default:
     void;
 };
 
+union int2d1_result switch (int err) { /* carries one int2d1 (two ints, one double) when err == 0, nothing otherwise */
+case 0:
+    int2d1 data;
+default:
+    void;
+};
+
 program RPC_CD_PROG {
     version RPC_CD_VERS {
         int          rpc_checkpoint(void)                                         = 0;
@@ -465,7 +494,21 @@ program RPC_CD_PROG {
         int         rpc_cudnnSetFilterNdDescriptor(ptr filterDesc, int dataType, int format, int nbDims, mem_data filterDimA) = 5044;
         mem_result  rpc_cudnnGetFilterNdDescriptor(ptr filterDesc, int nbDimsRequested) = 5045;
         sz_result   rpc_cudnnGetFilterSizeInBytes(ptr filterDesc) = 5046;
-        int         rpc_cudnnTransformFilter(ptr handle, ptr transDesc, cudnn_scaling_t, ptr srcDesc, ptr srcData, cudnn_scaling_t beta, ptr destDesc, ptr destData) = 5047;
+        int         rpc_cudnnTransformFilter(ptr handle, ptr transDesc, cudnn_scaling_t alpha, ptr srcDesc, ptr srcData, cudnn_scaling_t beta, ptr destDesc, ptr destData) = 5047;
         int         rpc_cudnnDestroyFilterDescriptor(ptr filterDesc) = 5048;
+        ptr_result  rpc_cudnnCreatePoolingDescriptor(void) = 5050;
+        int         rpc_cudnnSetPooling2dDescriptor(ptr poolingDesc, int mode, int maxpoolingNanOpt, int windowHeight, int windowWidth, int verticalPadding, int horizontalPadding, int verticalStride, int horizontalStride) = 5051;
+        int8_result rpc_cudnnGetPooling2dDescriptor(ptr poolingDesc) = 5052;
+        int         rpc_cudnnSetPoolingNdDescriptor(ptr poolingDesc, int mode, int maxpoolingNanOpt, int nbDims, mem_data windowDimA, mem_data paddingA, mem_data strideA) = 5053;
+        mem_result  rpc_cudnnGetPoolingNdDescriptor(ptr poolingDesc, int nbDimsRequested) = 5054;
+        mem_result  rpc_cudnnGetPoolingNdForwardOutputDim(ptr poolingDesc, ptr inputTensorDesc, int nbDims) = 5055;
+        int4_result rpc_cudnnGetPooling2dForwardOutputDim(ptr poolingDesc, ptr inputTensorDesc) = 5056;
+        int         rpc_cudnnDestroyPoolingDescriptor(ptr poolingDesc) = 5057;
+        ptr_result  rpc_cudnnCreateActivationDescriptor(void) = 5059;
+        int         rpc_cudnnSetActivationDescriptor(ptr activationDesc, int mode, int reluNanOpt, double coef) = 5060;
+        int2d1_result rpc_cudnnGetActivationDescriptor(ptr activationDesc) = 5061;
+        int         rpc_cudnnSetActivationDescriptorSwishBeta(ptr activationDesc, double swish_beta) = 5062;
+        d_result    rpc_cudnnGetActivationDescriptorSwishBeta(ptr activationDesc) = 5063;
+        int         rpc_cudnnDestroyActivationDescriptor(ptr activationDesc) = 5064;
     } = 1;
 } = 99;
diff --git a/cpu/resource-mg.h b/cpu/resource-mg.h
index b9b63891..b3d3326e 100644
--- a/cpu/resource-mg.h
+++ b/cpu/resource-mg.h
@@ -42,6 +42,10 @@ resource_mg rm_cublas;
 //CUDNN RMs
 resource_mg rm_cudnn;
 resource_mg rm_cudnn_tensors;
+resource_mg rm_cudnn_filters; // cuDNN filter descriptors
+resource_mg rm_cudnn_tensortransform; // NOTE(review): used by rpc_cudnntransformfilter_1_svc but not set up in server_cudnn_init in this patch - confirm
+resource_mg rm_cudnn_poolings; // cuDNN pooling descriptors
+resource_mg rm_cudnn_activations; // cuDNN activation descriptors
 
 
 /** initializes the resource manager

From e1803697a8527f2e0f176804685b8f37cbb4e7ab Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Wed, 21 Jun 2023 09:46:55 +0200
Subject: [PATCH 65/83] implement cudaMemset Async APIs

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 cpu/cpu-client-runtime.c | 52 +++++++++++++++++++++++++++++--
 cpu/cpu-server-runtime.c | 66 ++++++++++++++++++++++++++++++++++++++--
 cpu/cpu_rpc_prot.x       | 10 +++---
 3 files changed, 117 insertions(+), 11 deletions(-)

diff --git a/cpu/cpu-client-runtime.c b/cpu/cpu-client-runtime.c
index 1c6316b6..cbd1eab0 100644
--- a/cpu/cpu-client-runtime.c
+++ b/cpu/cpu-client-runtime.c
@@ -1794,7 +1794,19 @@ cudaError_t cudaMemset2D(void* devPtr, size_t pitch, int value, size_t width, si
     return result;
 }
 
-DEF_FN(cudaError_t, cudaMemset2DAsync, void*, devPtr, size_t, pitch, int,  value, size_t, width, size_t, height, cudaStream_t, stream)
+cudaError_t cudaMemset2DAsync(void* devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream)
+{ // Client stub: ships the call to the server via cuda_memset_2d_async_1 and returns the status the server reported.
+#ifdef WITH_API_CNT
+    api_call_cnt++;
+#endif //WITH_API_CNT
+    int result = cudaErrorUnknown; // defined fallback: returned unchanged when the RPC itself fails and never fills it in
+    enum clnt_stat retval;
+    retval = cuda_memset_2d_async_1((ptr)devPtr, pitch, value, width, height, (ptr)stream, &result, clnt);
+    if (retval != RPC_SUCCESS) {
+        clnt_perror (clnt, "call failed"); // transport failure is only logged; the caller sees it through the fallback status
+    }
+    return result;
+}
 
 cudaError_t cudaMemset3D(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent)
 {
@@ -1818,8 +1830,42 @@ cudaError_t cudaMemset3D(struct cudaPitchedPtr pitchedDevPtr, int value, struct
     return result;
 }
 
-DEF_FN(cudaError_t, cudaMemset3DAsync, struct cudaPitchedPtr, pitchedDevPtr, int,  value, struct cudaExtent, extent, cudaStream_t, stream)
-DEF_FN(cudaError_t, cudaMemsetAsync, void*, devPtr, int,  value, size_t, count, cudaStream_t, stream)
+cudaError_t cudaMemset3DAsync(struct cudaPitchedPtr pitchedDevPtr, int  value, struct cudaExtent extent, cudaStream_t stream)
+{ // Client stub: flattens the pitched pointer and extent into scalar RPC arguments for cuda_memset_3d_async_1.
+#ifdef WITH_API_CNT
+    api_call_cnt++;
+#endif //WITH_API_CNT
+    int result = cudaErrorUnknown; // defined fallback: returned unchanged when the RPC itself fails and never fills it in
+    enum clnt_stat retval;
+    retval = cuda_memset_3d_async_1(pitchedDevPtr.pitch,
+                              (ptr)pitchedDevPtr.ptr,
+                              pitchedDevPtr.xsize,
+                              pitchedDevPtr.ysize,
+                              value,
+                              extent.depth, // extent is sent as depth, height, width - the order the server handler expects
+                              extent.height,
+                              extent.width,
+                              (ptr)stream,
+                              &result, clnt);
+    if (retval != RPC_SUCCESS) {
+        clnt_perror (clnt, "call failed"); // transport failure is only logged; the caller sees it through the fallback status
+    }
+    return result;
+}
+
+cudaError_t cudaMemsetAsync(void* devPtr, int value, size_t count, cudaStream_t stream)
+{ // Client stub: ships the call to the server via cuda_memset_async_1 and returns the status the server reported.
+#ifdef WITH_API_CNT
+    api_call_cnt++;
+#endif //WITH_API_CNT
+    int result = cudaErrorUnknown; // defined fallback: returned unchanged when the RPC itself fails and never fills it in
+    enum clnt_stat retval;
+    retval = cuda_memset_async_1((ptr)devPtr, value, count, (ptr)stream, &result, clnt);
+    if (retval != RPC_SUCCESS) {
+        clnt_perror (clnt, "call failed"); // transport failure is only logged; the caller sees it through the fallback status
+    }
+    return result;
+}
 
 DEF_FN(struct cudaExtent, make_cudaExtent, size_t, w, size_t, h, size_t, d)
 DEF_FN(struct cudaPitchedPtr, make_cudaPitchedPtr, void*, d, size_t, p, size_t, xsz, size_t, ysz)
diff --git a/cpu/cpu-server-runtime.c b/cpu/cpu-server-runtime.c
index 66cc937a..394314a2 100644
--- a/cpu/cpu-server-runtime.c
+++ b/cpu/cpu-server-runtime.c
@@ -1730,7 +1730,26 @@ bool_t cuda_memset_2d_1_svc(ptr devPtr, size_t pitch, int value, size_t width, s
     return 1;
 }
 
-/* cudaMemset2DAsync ( void* devPtr, size_t pitch, int  value, size_t width, size_t height, cudaStream_t stream = 0 ) is not implemented */
+bool_t cuda_memset_2d_async_1_svc(ptr devPtr, size_t pitch, int value, size_t width, size_t height, ptr stream, int *result, struct svc_req *rqstp)
+{ // RPC handler: records the call, then runs cudaMemset2DAsync on the looked-up memory and stream; status in *result.
+    RECORD_API(cuda_memset_2d_async_1_argument);
+    RECORD_ARG(1, devPtr);
+    RECORD_ARG(2, pitch);
+    RECORD_ARG(3, value);
+    RECORD_ARG(4, width); // slot numbers follow the parameter order (width is the 4th argument, height the 5th)
+    RECORD_ARG(5, height);
+    RECORD_ARG(6, stream);
+    LOGE(LOG_DEBUG, "cudaMemset2DAsync");
+    *result = cudaMemset2DAsync(
+      resource_mg_get(&rm_memory, (void*)devPtr), // device pointer looked up in the memory resource manager
+      pitch,
+      value,
+      width,
+      height,
+      resource_mg_get(&rm_streams, (void*)stream)); // stream looked up in the stream resource manager
+    RECORD_RESULT(integer, *result);
+    return 1;
+}
 
 bool_t cuda_memset_3d_1_svc(size_t pitch, ptr devPtr, size_t xsize, size_t ysize, int value, size_t depth, size_t height, size_t width, int *result, struct svc_req *rqstp)
 {
@@ -1755,8 +1774,49 @@ bool_t cuda_memset_3d_1_svc(size_t pitch, ptr devPtr, size_t xsize, size_t ysize
     RECORD_RESULT(integer, *result);
     return 1;
 }
-/* cudaMemset3DAsync ( cudaPitchedPtr pitchedDevPtr, int  value, cudaExtent extent, cudaStream_t stream = 0 ) is not implemented */
-/* cudaMemsetAsync ( void* devPtr, int  value, size_t count, cudaStream_t stream = 0 ) is not implemented */
+
+bool_t cuda_memset_3d_async_1_svc(size_t pitch, ptr devPtr, size_t xsize, size_t ysize, int value, size_t depth, size_t height, size_t width, ptr stream, int *result, struct svc_req *rqstp)
+{ // RPC handler: rebuilds the pitched pointer and extent from scalar arguments and runs cudaMemset3DAsync; status in *result.
+    RECORD_API(cuda_memset_3d_async_1_argument);
+    RECORD_ARG(1, pitch);
+    RECORD_ARG(2, devPtr);
+    RECORD_ARG(3, xsize);
+    RECORD_ARG(4, ysize);
+    RECORD_ARG(5, value);
+    RECORD_ARG(6, depth);
+    RECORD_ARG(7, height);
+    RECORD_ARG(8, width);
+    RECORD_ARG(9, stream);
+    LOGE(LOG_DEBUG, "cudaMemset3DAsync");
+    struct cudaPitchedPtr pptr = {.pitch = pitch,
+                                  .ptr = resource_mg_get(&rm_memory, (void*)devPtr), // device pointer looked up in the memory resource manager
+                                  .xsize = xsize,
+                                  .ysize = ysize};
+    struct cudaExtent extent = {.depth = depth,
+                                .height = height,
+                                .width = width};
+    *result = cudaMemset3DAsync(pptr, value, extent,
+                resource_mg_get(&rm_streams, (void*)stream)); // stream looked up in the stream resource manager
+    RECORD_RESULT(integer, *result);
+    return 1;
+}
+
+bool_t cuda_memset_async_1_svc(ptr devPtr, int value, size_t count, ptr stream, int *result, struct svc_req *rqstp)
+{ // RPC handler for CUDA_MEMSET_ASYNC; spelled "async" to match the client stub cuda_memset_async_1 and cuda_memset_async_1_argument.
+    RECORD_API(cuda_memset_async_1_argument);
+    RECORD_ARG(1, devPtr);
+    RECORD_ARG(2, value);
+    RECORD_ARG(3, count);
+    RECORD_ARG(4, stream); // stream is the 4th argument; it must not reuse slot 3, which holds count
+    LOGE(LOG_DEBUG, "cudaMemsetAsync");
+    *result = cudaMemsetAsync(
+      resource_mg_get(&rm_memory, (void*)devPtr), // device pointer looked up in the memory resource manager
+      value,
+      count,
+      resource_mg_get(&rm_streams, (void*)stream)); // stream looked up in the stream resource manager
+    RECORD_RESULT(integer, *result);
+    return 1;
+}
 /* cudaMipmappedArrayGetSparseProperties ( cudaArraySparseProperties* sparseProperties, cudaMipmappedArray_t mipmap ) is not implemented */
 /* make_cudaExtent ( size_t w, size_t h, size_t d ) should be implemented on the client side */
 /* make_cudaPitchedPtr ( void* d, size_t p, size_t xsz, size_t ysz ) should be implemented on the client side */
diff --git a/cpu/cpu_rpc_prot.x b/cpu/cpu_rpc_prot.x
index 361fdcf4..2b7a4018 100644
--- a/cpu/cpu_rpc_prot.x
+++ b/cpu/cpu_rpc_prot.x
@@ -346,13 +346,13 @@ program RPC_CD_PROG {
         int          CUDA_MEMCPY_MT_SYNC(int)                                   = 451;
         int          CUDA_MEMSET(ptr, int, size_t)                              = 470;
         int          CUDA_MEMSET_2D(ptr, size_t, int, size_t, size_t)           = 471;
-        /*int        CUDA_MEMSET_2D_ASYNC(ptr, size_t,
-                         int, size_t, size_t, int)                              = 472;*/
+        int          CUDA_MEMSET_2D_ASYNC(ptr, size_t,
+                         int, size_t, size_t, ptr)                              = 472;
         int          CUDA_MEMSET_3D(size_t, ptr, size_t, size_t, int, size_t,
                          size_t, size_t)                                        = 473;
-        /*int        CUDA_MEMSET_3D_ASYNC(size_t, ptr, size_t, size_t, int, 
-                         size_t, size_t, size_t, int)                           = 474;*/
-        /*int        CUDA_MEMSET_ASYNC(ptr, int, size_t, int)                   = 475;*/
+        int          CUDA_MEMSET_3D_ASYNC(size_t, ptr, size_t, size_t, int, 
+                         size_t, size_t, size_t, ptr)                           = 474;
+        int          CUDA_MEMSET_ASYNC(ptr, int, size_t, ptr)                   = 475;
         /*?          CUDA_MIPMAPPED_ARRAY_GET_SPARSE_PROPERTIES(ptr)            = 476;*/
         /* make_ APIs can be copied on the client side */
 

From 14838e67f44a11279ab6019da2cbc0941a0ada16 Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Wed, 21 Jun 2023 09:50:16 +0200
Subject: [PATCH 66/83] add cudnn dependency to Dockerfiles

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 utils/Dockerfile        | 2 +-
 utils/Dockerfile.cuda10 | 2 +-
 utils/Dockerfile.cuda11 | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/utils/Dockerfile b/utils/Dockerfile
index a31884fe..66cfcae5 100644
--- a/utils/Dockerfile
+++ b/utils/Dockerfile
@@ -26,7 +26,7 @@ RUN dnf install -y make bash git gcc autoconf libtool automake rpcgen \
 ENV LD_LIBRARY_PATH="/usr/local/lib:/usr/local/lib64:${LD_LIBRARY_PATH}"
 
 RUN dnf -y config-manager --add-repo http://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo && \
-    dnf --refresh -y install cuda-compiler-12-1 cuda-libraries-devel-12-1 cuda-driver-devel-12-1 cuda-profiler-api-12-1 cuda-nvml-devel-12-1 nvidia-driver-NVML-530.30.02 && \
+    dnf --refresh -y install cuda-compiler-12-1 cuda-libraries-devel-12-1 cuda-driver-devel-12-1 cuda-profiler-api-12-1 cuda-nvml-devel-12-1 nvidia-driver-NVML-530.30.02 libcudnn8-devel && \
     ln -s cuda-12.1 /usr/local/cuda && \
     ln -s libcuda.so /usr/local/cuda/targets/x86_64-linux/lib/stubs/libcuda.so.1
 
diff --git a/utils/Dockerfile.cuda10 b/utils/Dockerfile.cuda10
index 02ff496f..391c130e 100644
--- a/utils/Dockerfile.cuda10
+++ b/utils/Dockerfile.cuda10
@@ -28,7 +28,7 @@ ENV LD_LIBRARY_PATH="/usr/local/lib:/usr/local/lib64:${LD_LIBRARY_PATH}"
 
 RUN dnf --refresh -y install https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-repo-rhel8-10.2.89-1.x86_64.rpm && \
     rpm --import https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/D42D0685.pub && \
-    dnf --refresh -y install cuda-compiler-10-2 cuda-libraries-dev-10-2 cuda-samples-10-2 cuda-driver-dev-10-2 cuda-misc-headers-10-2 cuda-nvml-dev-10-2 nvidia-driver-NVML-530.30.02 && \
+    dnf --refresh -y install cuda-compiler-10-2 cuda-libraries-dev-10-2 cuda-samples-10-2 cuda-driver-dev-10-2 cuda-misc-headers-10-2 cuda-nvml-dev-10-2 nvidia-driver-NVML-530.30.02 libcudnn8-devel && \
     ln -s cuda-10.2 /usr/local/cuda && \
     ln -s libcuda.so /usr/local/cuda/targets/x86_64-linux/lib/stubs/libcuda.so.1
     
diff --git a/utils/Dockerfile.cuda11 b/utils/Dockerfile.cuda11
index a31eb196..a261bb98 100644
--- a/utils/Dockerfile.cuda11
+++ b/utils/Dockerfile.cuda11
@@ -28,7 +28,7 @@ RUN dnf install -y make bash git gcc autoconf libtool automake rpcgen \
 ENV LD_LIBRARY_PATH="/usr/local/lib:/usr/local/lib64:${LD_LIBRARY_PATH}"
 
 RUN dnf -y config-manager --add-repo http://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo && \
-    dnf --refresh -y install cuda-compiler-11-1 cuda-libraries-devel-11-1 cuda-samples-11-1 cuda-driver-devel-11-1 cuda-nvprof-11-1 cuda-nvml-devel-11-1 nvidia-driver-NVML-530.30.02 && \
+    dnf --refresh -y install cuda-compiler-11-1 cuda-libraries-devel-11-1 cuda-samples-11-1 cuda-driver-devel-11-1 cuda-nvprof-11-1 cuda-nvml-devel-11-1 nvidia-driver-NVML-530.30.02 libcudnn8-devel && \
     ln -s cuda-11.1 /usr/local/cuda && \
     ln -s libcuda.so /usr/local/cuda/targets/x86_64-linux/lib/stubs/libcuda.so.1
 

From b392420a77d7ea43105355bed1731cca40c6cba2 Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Wed, 21 Jun 2023 10:12:37 +0200
Subject: [PATCH 67/83] add cudnn LRN api

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 cpu/cpu-client-cudnn.c | 100 +++++++++++++++++++++++++++++++++++++++--
 cpu/cpu-server-cudnn.c |   2 +
 cpu/cpu_rpc_prot.x     |  21 ++++++++-
 cpu/resource-mg.h      |   1 +
 4 files changed, 118 insertions(+), 6 deletions(-)

diff --git a/cpu/cpu-client-cudnn.c b/cpu/cpu-client-cudnn.c
index c519a172..10057cb8 100644
--- a/cpu/cpu-client-cudnn.c
+++ b/cpu/cpu-client-cudnn.c
@@ -839,10 +839,102 @@ cudnnStatus_t cudnnDestroyActivationDescriptor(cudnnActivationDescriptor_t activ
 }
 
 DEF_FN(cudnnStatus_t, cudnnActivationForward, cudnnHandle_t, handle, cudnnActivationDescriptor_t, activationDesc, const void *, alpha, const cudnnTensorDescriptor_t, xDesc, const void *, x, const void *, beta, const cudnnTensorDescriptor_t, yDesc, void *, y)
-DEF_FN(cudnnStatus_t, cudnnCreateLRNDescriptor, cudnnLRNDescriptor_t *, normDesc)
-DEF_FN(cudnnStatus_t, cudnnSetLRNDescriptor, cudnnLRNDescriptor_t, normDesc, unsigned, lrnN, double, lrnAlpha, double, lrnBeta, double, lrnK)
-DEF_FN(cudnnStatus_t, cudnnGetLRNDescriptor, cudnnLRNDescriptor_t, normDesc, unsigned *, lrnN, double *, lrnAlpha, double *, lrnBeta, double *, lrnK)
-DEF_FN(cudnnStatus_t, cudnnDestroyLRNDescriptor, cudnnLRNDescriptor_t, lrnDesc)
+
+cudnnStatus_t cudnnCreateLRNDescriptor(cudnnLRNDescriptor_t * normDesc)
+{
+#ifdef WITH_API_CNT
+    cudnn_call_cnt++;
+#endif //WITH_API_CNT
+    ptr_result result;
+    enum clnt_stat retval_1;
+    if (normDesc == NULL) {
+        LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__);
+        return CUDNN_STATUS_BAD_PARAM;
+    }
+    retval_1 = rpc_cudnncreatelrndescriptor_1(&result, clnt); /* descriptor is created on the server; only its handle comes back */
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
+        result.err = CUDNN_STATUS_INTERNAL_ERROR; /* RPC itself failed: result may be unset, so never read it */
+    } else if (result.err != CUDNN_STATUS_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err);
+    } else {
+        *normDesc = (cudnnLRNDescriptor_t)result.ptr_result_u.ptr; /* payload is only valid when err == 0 */
+    }
+    return result.err;
+}
+
+cudnnStatus_t cudnnSetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned lrnN, double lrnAlpha, double lrnBeta, double lrnK)
+{
+#ifdef WITH_API_CNT
+    cudnn_call_cnt++;
+#endif //WITH_API_CNT
+    int result;
+    enum clnt_stat retval_1;
+    retval_1 = rpc_cudnnsetlrndescriptor_1( /* forwards all LRN parameters to the server by value */
+        (ptr)normDesc,
+        (int)lrnN,
+        lrnAlpha,
+        lrnBeta,
+        lrnK,
+        &result, clnt);
+
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
+        result = CUDNN_STATUS_INTERNAL_ERROR; /* RPC itself failed: result may be unset, so never return it as-is */
+    } else if (result != CUDNN_STATUS_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result);
+    }
+    return result;
+}
+
+cudnnStatus_t cudnnGetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned * lrnN, double * lrnAlpha, double * lrnBeta, double * lrnK)
+{
+#ifdef WITH_API_CNT
+    cudnn_call_cnt++;
+#endif //WITH_API_CNT
+    int1d3_result result;
+    enum clnt_stat retval_1;
+    if (lrnN == NULL || lrnAlpha == NULL || lrnBeta == NULL || lrnK == NULL) {
+        LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__);
+        return CUDNN_STATUS_BAD_PARAM;
+    }
+    retval_1 = rpc_cudnngetlrndescriptor_1( /* server returns lrnN in data.i and the three doubles in data.d[] */
+        (ptr)normDesc,
+        &result, clnt);
+
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
+        result.err = CUDNN_STATUS_INTERNAL_ERROR; /* RPC itself failed: result may be unset, so never read it */
+    } else if (result.err != CUDNN_STATUS_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err); /* log the status code, not the whole union */
+    } else {
+        *lrnN = result.int1d3_result_u.data.i;
+        *lrnAlpha = result.int1d3_result_u.data.d[0];
+        *lrnBeta = result.int1d3_result_u.data.d[1];
+        *lrnK = result.int1d3_result_u.data.d[2];
+    }
+    return result.err;
+}
+
+cudnnStatus_t cudnnDestroyLRNDescriptor(cudnnLRNDescriptor_t lrnDesc)
+{
+#ifdef WITH_API_CNT
+    cudnn_call_cnt++;
+#endif //WITH_API_CNT
+    int result;
+    enum clnt_stat retval_1;
+    retval_1 = rpc_cudnndestroylrndescriptor_1( /* asks the server to destroy the descriptor behind this handle */
+        (ptr)lrnDesc,
+        &result, clnt);
+
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
+        result = CUDNN_STATUS_INTERNAL_ERROR; /* RPC itself failed: result may be unset, so never return it as-is */
+    } else if (result != CUDNN_STATUS_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result);
+    }
+    return result;
+}
 DEF_FN(cudnnStatus_t, cudnnLRNCrossChannelForward, cudnnHandle_t, handle, cudnnLRNDescriptor_t, normDesc, cudnnLRNMode_t, lrnMode, const void *, alpha, const cudnnTensorDescriptor_t, xDesc, const void *, x, const void *, beta, const cudnnTensorDescriptor_t, yDesc, void *, y)
 DEF_FN(cudnnStatus_t, cudnnDivisiveNormalizationForward, cudnnHandle_t, handle, cudnnLRNDescriptor_t, normDesc, cudnnDivNormMode_t, mode, const void *, alpha, const cudnnTensorDescriptor_t, xDesc, const void *, x, const void *, means, void *, temp, void *, temp2, const void *, beta, const cudnnTensorDescriptor_t, yDesc, void *, y)
 DEF_FN(cudnnStatus_t, cudnnDeriveBNTensorDescriptor, cudnnTensorDescriptor_t, derivedBnDesc, const cudnnTensorDescriptor_t, xDesc, cudnnBatchNormMode_t, mode)
diff --git a/cpu/cpu-server-cudnn.c b/cpu/cpu-server-cudnn.c
index 511626b5..6d722712 100644
--- a/cpu/cpu-server-cudnn.c
+++ b/cpu/cpu-server-cudnn.c
@@ -26,6 +26,7 @@ int server_cudnn_init(int bypass)
     ret &= resource_mg_init(&rm_cudnn_filters, bypass);
     ret &= resource_mg_init(&rm_cudnn_poolings, bypass);
     ret &= resource_mg_init(&rm_cudnn_activations, bypass);
+    ret &= resource_mg_init(&rm_cudnn_lrns, bypass);
     return ret;
 }
 
@@ -36,6 +37,7 @@ int server_cudnn_deinit(void)
     resource_mg_free(&rm_cudnn_filters);
     resource_mg_free(&rm_cudnn_poolings);
     resource_mg_free(&rm_cudnn_activations);
+    resource_mg_free(&rm_cudnn_lrns);
     return 0;
 
 }
diff --git a/cpu/cpu_rpc_prot.x b/cpu/cpu_rpc_prot.x
index 2b7a4018..d840d549 100644
--- a/cpu/cpu_rpc_prot.x
+++ b/cpu/cpu_rpc_prot.x
@@ -45,6 +45,11 @@ struct int2d1 {
     double d;
 };
 
+struct int1d3 { /* one int plus three doubles; payload type of int1d3_result */
+    int i;       /* for rpc_cudnnGetLRNDescriptor: lrnN */
+    double d[3]; /* for rpc_cudnnGetLRNDescriptor: lrnAlpha, lrnBeta, lrnK (in this order) */
+};
+
 union cudnn_scaling_t switch (int dataType) {
 case 2:
 case 0:
@@ -195,6 +200,13 @@ default:
     void;
 };
 
+union int1d3_result switch (int err) { /* RPC result: status code, plus an int1d3 payload on success */
+case 0:
+    int1d3 data; /* only transmitted when err == 0 */
+default:
+    void; /* any non-zero err carries no payload */
+};
+
 program RPC_CD_PROG {
     version RPC_CD_VERS {
         int          rpc_checkpoint(void)                                         = 0;
@@ -457,7 +469,7 @@ program RPC_CD_PROG {
         int         rpc_cudnnSetStream(ptr handle, ptr streamId) = 5008;
         ptr_result  rpc_cudnnGetStream(ptr handle) = 5009;
         ptr_result  rpc_cudnnCreateTensorDescriptor(void) = 5010;
-        
+        /*
         int         rpc_cudnnSetTensor4dDescriptor(ptr tensorDesc, int format, int dataType, int n, int c, int h, int w) = 5011;
         int         rpc_cudnnSetTensor4dDescriptorEx(ptr tensorDesc, int dataType, int n, int c, int h, int w, int nStride, int cStride, int hStride, int wStride) = 5012;
         int9_result rpc_cudnnGetTensor4dDescriptor(ptr tensorDesc) = 5013;
@@ -487,7 +499,8 @@ program RPC_CD_PROG {
         sz_result   rpc_cudnnGetReductionWorkspaceSize(ptr handle, ptr reduceTensorDesc, ptr aDesc, ptr cDesc) = 5037;
         mem_result  rpc_cudnnReduceTensor(ptr handle, ptr reduceTensorDesc, ptr indices, size_t indicesSizeInBytes, ptr workspace, size_t workspaceSizeInBytes, cudnn_scaling_t alpha, ptr aDesc, ptr A, cudnn_scaling_t beta, ptr cDesc, ptr C) = 5038;
         int         rpc_cudnnSetTensor(ptr handle, ptr yDesc, ptr y, mem_data valuePtr) = 5039;
-        int         rpc_cudnnScaleTensor(ptr handle, ptr yDesc, ptr y, cudnn_scaling_t alpha) = 5040;
+        int         rpc_cudnnScaleTensor(ptr handle, ptr yDesc, ptr y, cudnn_scaling_t alpha) = 5040; */
+        
         ptr_result  rpc_cudnnCreateFilterDescriptor(void) = 5041;
         int         rpc_cudnnSetFilter4dDescriptor(ptr filterDesc, int dataType, int format, int k, int c, int h, int w) = 5042;
         int6_result rpc_cudnnGetFilter4dDescriptor(ptr filterDesc) = 5043;
@@ -510,5 +523,9 @@ program RPC_CD_PROG {
         int         rpc_cudnnSetActivationDescriptorSwishBeta(ptr activationDesc, double swish_beta) = 5062;
         d_result    rpc_cudnnGetActivationDescriptorSwishBeta(ptr activationDesc) = 5063;
         int         rpc_cudnnDestroyActivationDescriptor(ptr activationDesc) = 5064;
+        ptr_result  rpc_cudnnCreateLRNDescriptor(void) = 5066;
+        int         rpc_cudnnSetLRNDescriptor(ptr normDesc, unsigned lrnN, double lrnAlpha, double lrnBeta, double lrnK) = 5067;
+        int1d3_result rpc_cudnnGetLRNDescriptor(ptr normDesc) = 5068;
+        int         rpc_cudnnDestroyLRNDescriptor(ptr lrnDesc) = 5069;
     } = 1;
 } = 99;
diff --git a/cpu/resource-mg.h b/cpu/resource-mg.h
index b3d3326e..29b2dcae 100644
--- a/cpu/resource-mg.h
+++ b/cpu/resource-mg.h
@@ -46,6 +46,7 @@ resource_mg rm_cudnn_filters;
 resource_mg rm_cudnn_tensortransform;
 resource_mg rm_cudnn_poolings;
 resource_mg rm_cudnn_activations;
+resource_mg rm_cudnn_lrns;
 
 
 /** initializes the resource manager

From 26e19bde0c19d760367e958b9f8eaa9d55071280 Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Wed, 21 Jun 2023 10:41:48 +0200
Subject: [PATCH 68/83] add server side cudnn lrn implementations, fix some
 function names

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 cpu/cpu-server-cudnn.c   | 72 ++++++++++++++++++++++++++++++++++++++--
 cpu/cpu-server-runtime.c |  2 +-
 2 files changed, 71 insertions(+), 3 deletions(-)

diff --git a/cpu/cpu-server-cudnn.c b/cpu/cpu-server-cudnn.c
index 6d722712..e71e55ee 100644
--- a/cpu/cpu-server-cudnn.c
+++ b/cpu/cpu-server-cudnn.c
@@ -474,7 +474,7 @@ bool_t rpc_cudnngetpoolingnddescriptor_1_svc(ptr poolingDesc, int nbDimsRequeste
     return 1;
 }
 
-bool_t rpc_cudnngetPoolingNdForwardOutputDim_1_svc(ptr poolingDesc, ptr inputTensorDesc, int nbDims, mem_result *result, struct svc_req *rqstp)
+bool_t rpc_cudnngetpoolingndforwardoutputdim_1_svc(ptr poolingDesc, ptr inputTensorDesc, int nbDims, mem_result *result, struct svc_req *rqstp)
 {
     LOGE(LOG_DEBUG, "%s", __FUNCTION__);
     GSCHED_RETAIN;
@@ -492,7 +492,7 @@ bool_t rpc_cudnngetPoolingNdForwardOutputDim_1_svc(ptr poolingDesc, ptr inputTen
     return 1;
 }
 
-bool_t rpc_cudnngetPooling2dForwardOutputDim_1_svc(ptr poolingDesc, ptr inputTensorDesc, int4_result *result, struct svc_req *rqstp)
+bool_t rpc_cudnngetpooling2dforwardoutputdim_1_svc(ptr poolingDesc, ptr inputTensorDesc, int4_result *result, struct svc_req *rqstp)
 {
     LOGE(LOG_DEBUG, "%s", __FUNCTION__);
     GSCHED_RETAIN;
@@ -614,4 +614,72 @@ bool_t rpc_cudnndestroyactivationdescriptor_1_svc(ptr activationDesc, int *resul
     GSCHED_RELEASE;
     RECORD_RESULT(integer, *result);
     return 1;
+}
+
+bool_t rpc_cudnncreatelrndescriptor_1_svc(ptr_result *result, struct svc_req *rqstp)
+{ /* Server stub: creates a cuDNN LRN descriptor and hands its handle back in result. */
+    RECORD_VOID_API;
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+
+    GSCHED_RETAIN;
+    result->err = cudnnCreateLRNDescriptor((cudnnLRNDescriptor_t*)&result->ptr_result_u.ptr);
+    if (result->err == CUDNN_STATUS_SUCCESS && resource_mg_create(&rm_cudnn_lrns, (void*)result->ptr_result_u.ptr) != 0) { /* never register a handle cuDNN failed to create */
+        LOGE(LOG_ERROR, "error in resource manager");
+    }
+    GSCHED_RELEASE;
+    RECORD_RESULT(ptr_result_u, *result);
+    return 1;
+}
+
+bool_t rpc_cudnnsetlrndescriptor_1_svc(ptr normDesc, unsigned lrnN, double lrnAlpha, double lrnBeta, double lrnK, int *result, struct svc_req *rqstp)
+{ /* Server stub: applies cudnnSetLRNDescriptor to the descriptor looked up for normDesc; the cuDNN status goes to *result. */
+    RECORD_API(rpc_cudnnsetlrndescriptor_1_argument);
+    RECORD_NARG(normDesc);
+    RECORD_NARG(lrnN);
+    RECORD_NARG(lrnAlpha);
+    RECORD_NARG(lrnBeta);
+    RECORD_NARG(lrnK);
+
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+
+    GSCHED_RETAIN;
+    *result = cudnnSetLRNDescriptor(
+        (cudnnLRNDescriptor_t)resource_mg_get(&rm_cudnn_lrns, (void*)normDesc), /* translate the client-side handle via the LRN resource manager */
+        lrnN,
+        lrnAlpha,
+        lrnBeta,
+        lrnK);
+    GSCHED_RELEASE;
+    RECORD_RESULT(integer, *result);
+    return 1;
+}
+
+bool_t rpc_cudnngetlrndescriptor_1_svc(ptr normDesc, int1d3_result *result, struct svc_req *rqstp)
+{ /* Server stub: reads the LRN parameters of the descriptor looked up for normDesc into result. */
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+
+    GSCHED_RETAIN;
+    result->err = cudnnGetLRNDescriptor(
+        (cudnnLRNDescriptor_t)resource_mg_get(&rm_cudnn_lrns, (void*)normDesc), /* translate the client-side handle via the LRN resource manager */
+        (unsigned int*)&result->int1d3_result_u.data.i, /* lrnN, stored in the int slot */
+        &result->int1d3_result_u.data.d[0],             /* lrnAlpha */
+        &result->int1d3_result_u.data.d[1],             /* lrnBeta */
+        &result->int1d3_result_u.data.d[2]);            /* lrnK */
+    GSCHED_RELEASE;
+    return 1;
+}
+
+bool_t rpc_cudnndestroylrndescriptor_1_svc(ptr lrnDesc, int *result, struct svc_req *rqstp)
+{ /* Server stub: destroys the cuDNN LRN descriptor looked up for lrnDesc; the cuDNN status goes to *result. */
+    RECORD_API(ptr);
+    RECORD_SINGLE_ARG(lrnDesc);
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+
+    GSCHED_RETAIN;
+    *result = cudnnDestroyLRNDescriptor(
+        (cudnnLRNDescriptor_t)resource_mg_get(&rm_cudnn_lrns, (void*)lrnDesc)); /* translate the client-side handle via the LRN resource manager */
+    // TODO: remove lrnDesc from rm_cudnn_lrns; its entry is currently left behind after the descriptor is destroyed
+    GSCHED_RELEASE;
+    RECORD_RESULT(integer, *result);
+    return 1;
+}
\ No newline at end of file
diff --git a/cpu/cpu-server-runtime.c b/cpu/cpu-server-runtime.c
index 394314a2..3d70e0a5 100644
--- a/cpu/cpu-server-runtime.c
+++ b/cpu/cpu-server-runtime.c
@@ -1801,7 +1801,7 @@ bool_t cuda_memset_3d_async_1_svc(size_t pitch, ptr devPtr, size_t xsize, size_t
     return 1;
 }
 
-bool_t cuda_memset_asycn_1_svc(ptr devPtr, int value, size_t count, ptr stream, int *result, struct svc_req *rqstp)
+bool_t cuda_memset_async_1_svc(ptr devPtr, int value, size_t count, ptr stream, int *result, struct svc_req *rqstp)
 {
     RECORD_API(cuda_memset_async_1_argument);
     RECORD_ARG(1, devPtr);

From 15fc3a2d5f1bb079d057cda1ea4688d72a307084 Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Wed, 21 Jun 2023 15:51:10 +0200
Subject: [PATCH 69/83] add basic cuBLAS support

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 cpu/Makefile            |   3 +-
 cpu/cpu-client-cublas.c | 173 ++++++++++++++++++++++++++++++++++++++++
 cpu/cpu-client-cudnn.c  |  78 +++++++++---------
 cpu/cpu-server-cublas.c | 136 ++++++++++++++++++++++++++++++-
 cpu/cpu_rpc_prot.x      |   6 ++
 5 files changed, 353 insertions(+), 43 deletions(-)
 create mode 100644 cpu/cpu-client-cublas.c

diff --git a/cpu/Makefile b/cpu/Makefile
index 01d9e546..c2a13b13 100644
--- a/cpu/Makefile
+++ b/cpu/Makefile
@@ -61,7 +61,8 @@ SRC_CLIENT = $(RPC_XDR)                 \
 			 mt-memcpy.c				\
 			 cpu-elf2.c					\
 			 cpu-client-nvml.c          \
-			 cpu-client-cudnn.c
+			 cpu-client-cudnn.c			\
+			 cpu-client-cublas.c
 
 # 			 cpu-client-driver-hidden.c \
 
diff --git a/cpu/cpu-client-cublas.c b/cpu/cpu-client-cublas.c
new file mode 100644
index 00000000..c8ffa2bf
--- /dev/null
+++ b/cpu/cpu-client-cublas.c
@@ -0,0 +1,173 @@
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <cuda.h>
+#include <cublas_v2.h>
+
+//for strerror
+#include <string.h>
+#include <errno.h>
+
+#include "cpu-libwrap.h"
+#include "cpu_rpc_prot.h"
+#include "cpu-common.h"
+#include "cpu-utils.h"
+#include "log.h"
+
+#ifdef WITH_API_CNT
+extern int api_call_cnt;
+#endif //WITH_API_CNT
+
+cublasStatus_t cublasCreate(cublasHandle_t* handle)
+{ /* Creates a cuBLAS handle on the server via RPC and stores the returned handle value in *handle. */
+#ifdef WITH_API_CNT
+    api_call_cnt++;
+#endif //WITH_API_CNT
+    ptr_result result;
+    enum clnt_stat retval_1;
+    retval_1 = rpc_cublascreate_1(&result, clnt);
+    if (retval_1 != RPC_SUCCESS) {
+        clnt_perror (clnt, "call failed");
+        result.err = CUBLAS_STATUS_INTERNAL_ERROR; /* RPC itself failed: result may be unset, so never read it */
+    } else if (result.err == 0) {
+        *handle = (void*)result.ptr_result_u.ptr; /* payload is only valid when err == 0 */
+    }
+    return result.err;
+}
+
+cublasStatus_t cublasDestroy(cublasHandle_t handle)
+{ /* Asks the server via RPC to destroy the cuBLAS handle and returns the status it reports. */
+#ifdef WITH_API_CNT
+    api_call_cnt++;
+#endif //WITH_API_CNT
+    int result;
+    enum clnt_stat retval_1 = rpc_cublasdestroy_1((ptr)handle, &result, clnt);
+    if (retval_1 != RPC_SUCCESS) {
+        clnt_perror (clnt, "call failed");
+        return CUBLAS_STATUS_INTERNAL_ERROR; /* RPC itself failed: result may be unset, so do not return it */
+    }
+    return result;
+}
+
+cublasStatus_t cublasDgemm(cublasHandle_t handle,
+                           cublasOperation_t transa, cublasOperation_t transb,
+                           int m, int n, int k,
+                           const double          *alpha,
+                           const double          *A, int lda,
+                           const double          *B, int ldb,
+                           const double          *beta,
+                           double          *C, int ldc)
+{ /* Forwards a double-precision GEMM to the server via rpc_cublasdgemm_1 and returns the status it reports. */
+#ifdef WITH_API_CNT
+    api_call_cnt++;
+#endif //WITH_API_CNT
+    int result;
+    enum clnt_stat retval_1 = rpc_cublasdgemm_1(
+        (ptr)handle,
+        (int)transa,
+        (int)transb,
+        m, n, k,
+        *alpha, /* scalar is read here, so alpha must be dereferenceable in this process */
+        (ptr)A, lda, /* matrix pointers are forwarded as opaque values, never dereferenced here */
+        (ptr)B, ldb,
+        *beta, /* same requirement as alpha */
+        (ptr)C, ldc,
+        &result, clnt);
+    if (retval_1 != RPC_SUCCESS) {
+        clnt_perror (clnt, "call failed");
+        return CUBLAS_STATUS_INTERNAL_ERROR; /* RPC itself failed: result may be unset, so do not return it */
+    }
+    return result;
+}
+
+cublasStatus_t cublasSgemm(cublasHandle_t handle,
+                           cublasOperation_t transa, cublasOperation_t transb,
+                           int m, int n, int k,
+                           const float *alpha,
+                           const float *A, int lda,
+                           const float *B, int ldb,
+                           const float *beta,
+                           float *C, int ldc)
+{ /* Forwards a single-precision GEMM to the server via rpc_cublassgemm_1 and returns the status it reports. */
+#ifdef WITH_API_CNT
+    api_call_cnt++;
+#endif //WITH_API_CNT
+    int result;
+    enum clnt_stat retval_1 = rpc_cublassgemm_1(
+        (ptr)handle,
+        (int)transa,
+        (int)transb,
+        m, n, k,
+        *alpha, /* scalar is read here, so alpha must be dereferenceable in this process */
+        (ptr)A, lda, /* matrix pointers are forwarded as opaque values, never dereferenced here */
+        (ptr)B, ldb,
+        *beta, /* same requirement as alpha */
+        (ptr)C, ldc,
+        &result, clnt);
+    if (retval_1 != RPC_SUCCESS) {
+        clnt_perror (clnt, "call failed");
+        return CUBLAS_STATUS_INTERNAL_ERROR; /* RPC itself failed: result may be unset, so do not return it */
+    }
+    return result;
+}
+
+cublasStatus_t cublasDgemv(cublasHandle_t handle,
+                           cublasOperation_t trans,
+                           int m, int n,
+                           const double          *alpha,
+                           const double          *A, int lda,
+                           const double          *x, int incx,
+                           const double          *beta,
+                           double          *y, int incy)
+{ /* Forwards a double-precision GEMV to the server via rpc_cublasdgemv_1 and returns the status it reports. */
+#ifdef WITH_API_CNT
+    api_call_cnt++;
+#endif //WITH_API_CNT
+    int result;
+    enum clnt_stat retval_1 = rpc_cublasdgemv_1(
+        (ptr)handle,
+        (int)trans,
+        m, n,
+        *alpha, /* scalar is read here, so alpha must be dereferenceable in this process */
+        (ptr)A, lda, /* matrix/vector pointers are forwarded as opaque values, never dereferenced here */
+        (ptr)x, incx,
+        *beta, /* same requirement as alpha */
+        (ptr)y, incy,
+        &result, clnt);
+    if (retval_1 != RPC_SUCCESS) {
+        clnt_perror (clnt, "call failed");
+        return CUBLAS_STATUS_INTERNAL_ERROR; /* RPC itself failed: result may be unset, so do not return it */
+    }
+    return result;
+}
+
+cublasStatus_t cublasSgemv(cublasHandle_t handle,
+                           cublasOperation_t trans,
+                           int m, int n,
+                           const float          *alpha,
+                           const float          *A, int lda,
+                           const float          *x, int incx,
+                           const float          *beta,
+                           float          *y, int incy)
+{ /* Forwards a single-precision GEMV to the server via rpc_cublassgemv_1 and returns the status it reports. */
+#ifdef WITH_API_CNT
+    api_call_cnt++;
+#endif //WITH_API_CNT
+    int result;
+    enum clnt_stat retval_1 = rpc_cublassgemv_1(
+        (ptr)handle,
+        (int)trans,
+        m, n,
+        *alpha, /* scalar is read here, so alpha must be dereferenceable in this process */
+        (ptr)A, lda, /* matrix/vector pointers are forwarded as opaque values, never dereferenced here */
+        (ptr)x, incx,
+        *beta, /* same requirement as alpha */
+        (ptr)y, incy,
+        &result, clnt);
+    if (retval_1 != RPC_SUCCESS) {
+        clnt_perror (clnt, "call failed");
+        return CUBLAS_STATUS_INTERNAL_ERROR; /* RPC itself failed: result may be unset, so do not return it */
+    }
+    return result;
+}
\ No newline at end of file
diff --git a/cpu/cpu-client-cudnn.c b/cpu/cpu-client-cudnn.c
index 10057cb8..248a1dfe 100644
--- a/cpu/cpu-client-cudnn.c
+++ b/cpu/cpu-client-cudnn.c
@@ -8,12 +8,14 @@
 #include "cpu-utils.h"
 #include "log.h"
 
-static size_t cudnn_call_cnt = 0;
+#ifdef WITH_API_CNT
+extern int api_call_cnt;
+#endif //WITH_API_CNT
 
 size_t cudnnGetVersion(void)
 {
 #ifdef WITH_API_CNT
-    cudnn_call_cnt++;
+    api_call_cnt++;
 #endif //WITH_API_CNT
     size_t result;
     enum clnt_stat retval_1;
@@ -26,7 +28,7 @@ size_t cudnnGetVersion(void)
 size_t cudnnGetMaxDeviceVersion(void)
 {
 #ifdef WITH_API_CNT
-    cudnn_call_cnt++;
+    api_call_cnt++;
 #endif //WITH_API_CNT
     size_t result;
     enum clnt_stat retval_1;
@@ -39,7 +41,7 @@ size_t cudnnGetMaxDeviceVersion(void)
 size_t cudnnGetCudartVersion(void)
 {
 #ifdef WITH_API_CNT
-    cudnn_call_cnt++;
+    api_call_cnt++;
 #endif //WITH_API_CNT
     size_t result;
     enum clnt_stat retval_1;
@@ -52,7 +54,7 @@ size_t cudnnGetCudartVersion(void)
 const char *cudnnGetErrorString(cudnnStatus_t status)
 {
 #ifdef WITH_API_CNT
-    cudnn_call_cnt++;
+    api_call_cnt++;
 #endif //WITH_API_CNT
     char *result;
     enum clnt_stat retval_1;
@@ -69,7 +71,7 @@ const char *cudnnGetErrorString(cudnnStatus_t status)
 cudnnStatus_t cudnnQueryRuntimeError(cudnnHandle_t handle, cudnnStatus_t* rstatus, cudnnErrQueryMode_t  mode, cudnnRuntimeTag_t * tag)
 {
 #ifdef WITH_API_CNT
-    cudnn_call_cnt++;
+    api_call_cnt++;
 #endif //WITH_API_CNT
     int_result result;
     enum clnt_stat retval_1;
@@ -89,7 +91,7 @@ cudnnStatus_t cudnnQueryRuntimeError(cudnnHandle_t handle, cudnnStatus_t* rstatu
 cudnnStatus_t cudnnGetProperty(libraryPropertyType type, int * value)
 {
 #ifdef WITH_API_CNT
-    cudnn_call_cnt++;
+    api_call_cnt++;
 #endif //WITH_API_CNT
     int_result result;
     enum clnt_stat retval_1;
@@ -112,7 +114,7 @@ cudnnStatus_t cudnnGetProperty(libraryPropertyType type, int * value)
 cudnnStatus_t cudnnCreate(cudnnHandle_t* handle)
 {
 #ifdef WITH_API_CNT
-    cudnn_call_cnt++;
+    api_call_cnt++;
 #endif //WITH_API_CNT
     ptr_result result;
     enum clnt_stat retval_1;
@@ -135,7 +137,7 @@ cudnnStatus_t cudnnCreate(cudnnHandle_t* handle)
 cudnnStatus_t cudnnDestroy(cudnnHandle_t handle)
 {
 #ifdef WITH_API_CNT
-    cudnn_call_cnt++;
+    api_call_cnt++;
 #endif //WITH_API_CNT
     int result;
     enum clnt_stat retval_1;
@@ -152,7 +154,7 @@ cudnnStatus_t cudnnDestroy(cudnnHandle_t handle)
 cudnnStatus_t cudnnSetStream(cudnnHandle_t handle, cudaStream_t streamId)
 {
 #ifdef WITH_API_CNT
-    cudnn_call_cnt++;
+    api_call_cnt++;
 #endif //WITH_API_CNT
     int result;
     enum clnt_stat retval_1;
@@ -169,7 +171,7 @@ cudnnStatus_t cudnnSetStream(cudnnHandle_t handle, cudaStream_t streamId)
 cudnnStatus_t cudnnGetStream(cudnnHandle_t handle, cudaStream_t * streamId)
 {
 #ifdef WITH_API_CNT
-    cudnn_call_cnt++;
+    api_call_cnt++;
 #endif //WITH_API_CNT
     ptr_result result;
     enum clnt_stat retval_1;
@@ -192,7 +194,7 @@ cudnnStatus_t cudnnGetStream(cudnnHandle_t handle, cudaStream_t * streamId)
 cudnnStatus_t cudnnCreateTensorDescriptor(cudnnTensorDescriptor_t * tensorDesc)
 {
 #ifdef WITH_API_CNT
-    cudnn_call_cnt++;
+    api_call_cnt++;
 #endif //WITH_API_CNT
     ptr_result result;
     enum clnt_stat retval_1;
@@ -246,7 +248,7 @@ DEF_FN(cudnnStatus_t, cudnnScaleTensor, cudnnHandle_t, handle, const cudnnTensor
 cudnnStatus_t cudnnCreateFilterDescriptor(cudnnFilterDescriptor_t * filterDesc)
 {
 #ifdef WITH_API_CNT
-    cudnn_call_cnt++;
+    api_call_cnt++;
 #endif //WITH_API_CNT
     ptr_result result;
     enum clnt_stat retval_1;
@@ -269,7 +271,7 @@ cudnnStatus_t cudnnCreateFilterDescriptor(cudnnFilterDescriptor_t * filterDesc)
 cudnnStatus_t cudnnSetFilter4dDescriptor(cudnnFilterDescriptor_t filterDesc, cudnnDataType_t dataType, cudnnTensorFormat_t format, int k, int c, int h, int w) 
 {
 #ifdef WITH_API_CNT
-    cudnn_call_cnt++;
+    api_call_cnt++;
 #endif //WITH_API_CNT
     int result;
     enum clnt_stat retval_1;
@@ -291,7 +293,7 @@ cudnnStatus_t cudnnSetFilter4dDescriptor(cudnnFilterDescriptor_t filterDesc, cud
 cudnnStatus_t cudnnGetFilter4dDescriptor(const cudnnFilterDescriptor_t filterDesc, cudnnDataType_t *dataType, cudnnTensorFormat_t *format, int* k, int* c, int* h, int* w) 
 {
 #ifdef WITH_API_CNT
-    cudnn_call_cnt++;
+    api_call_cnt++;
 #endif //WITH_API_CNT
     int6_result result;
     enum clnt_stat retval_1;
@@ -322,7 +324,7 @@ cudnnStatus_t cudnnGetFilter4dDescriptor(const cudnnFilterDescriptor_t filterDes
 cudnnStatus_t cudnnSetFilterNdDescriptor(cudnnFilterDescriptor_t filterDesc, cudnnDataType_t dataType, cudnnTensorFormat_t format, int nbDims, const int* filterDimA)
 {
 #ifdef WITH_API_CNT
-    cudnn_call_cnt++;
+    api_call_cnt++;
 #endif //WITH_API_CNT
     int result;
     enum clnt_stat retval_1;
@@ -349,7 +351,7 @@ cudnnStatus_t cudnnSetFilterNdDescriptor(cudnnFilterDescriptor_t filterDesc, cud
 cudnnStatus_t cudnnGetFilterNdDescriptor(const cudnnFilterDescriptor_t filterDesc, int nbDimsRequested, cudnnDataType_t * dataType, cudnnTensorFormat_t * format, int* nbDims, int* filterDimA)
 {
 #ifdef WITH_API_CNT
-    cudnn_call_cnt++;
+    api_call_cnt++;
 #endif //WITH_API_CNT
     mem_result result;
     enum clnt_stat retval_1;
@@ -384,7 +386,7 @@ cudnnStatus_t cudnnGetFilterNdDescriptor(const cudnnFilterDescriptor_t filterDes
 cudnnStatus_t cudnnGetFilterSizeInBytes(const cudnnFilterDescriptor_t filterDesc, size_t* size)
 {
 #ifdef WITH_API_CNT
-    cudnn_call_cnt++;
+    api_call_cnt++;
 #endif //WITH_API_CNT
     sz_result result;
     enum clnt_stat retval_1;
@@ -410,7 +412,7 @@ cudnnStatus_t cudnnGetFilterSizeInBytes(const cudnnFilterDescriptor_t filterDesc
 cudnnStatus_t cudnnTransformFilter(cudnnHandle_t handle, const cudnnTensorTransformDescriptor_t transDesc, const void * alpha, const cudnnFilterDescriptor_t srcDesc, const void * srcData, const void * beta, const cudnnFilterDescriptor_t destDesc, void * destData)
 {
 #ifdef WITH_API_CNT
-    cudnn_call_cnt++;
+    api_call_cnt++;
 #endif //WITH_API_CNT
     int result;
     enum clnt_stat retval_1;
@@ -440,7 +442,7 @@ cudnnStatus_t cudnnTransformFilter(cudnnHandle_t handle, const cudnnTensorTransf
 cudnnStatus_t cudnnDestroyFilterDescriptor(cudnnFilterDescriptor_t filterDesc)
 {
 #ifdef WITH_API_CNT
-    cudnn_call_cnt++;
+    api_call_cnt++;
 #endif //WITH_API_CNT
     int result;
     enum clnt_stat retval_1;
@@ -461,7 +463,7 @@ DEF_FN(cudnnStatus_t, cudnnSoftmaxForward, cudnnHandle_t, handle, cudnnSoftmaxAl
 cudnnStatus_t cudnnCreatePoolingDescriptor(cudnnPoolingDescriptor_t *poolingDesc)
 {
 #ifdef WITH_API_CNT
-    cudnn_call_cnt++;
+    api_call_cnt++;
 #endif //WITH_API_CNT
     ptr_result result;
     enum clnt_stat retval_1;
@@ -484,7 +486,7 @@ cudnnStatus_t cudnnCreatePoolingDescriptor(cudnnPoolingDescriptor_t *poolingDesc
 cudnnStatus_t cudnnSetPooling2dDescriptor(cudnnPoolingDescriptor_t poolingDesc, cudnnPoolingMode_t mode, cudnnNanPropagation_t maxpoolingNanOpt, int windowHeight, int windowWidth, int verticalPadding, int horizontalPadding, int verticalStride, int horizontalStride)
 {
 #ifdef WITH_API_CNT
-    cudnn_call_cnt++;
+    api_call_cnt++;
 #endif //WITH_API_CNT
     int result;
     enum clnt_stat retval_1;
@@ -512,7 +514,7 @@ cudnnStatus_t cudnnSetPooling2dDescriptor(cudnnPoolingDescriptor_t poolingDesc,
 cudnnStatus_t cudnnGetPooling2dDescriptor(const cudnnPoolingDescriptor_t poolingDesc, cudnnPoolingMode_t *mode, cudnnNanPropagation_t *maxpoolingNanOpt, int* windowHeight, int* windowWidth, int* verticalPadding, int* horizontalPadding, int* verticalStride, int* horizontalStride)
 {
 #ifdef WITH_API_CNT
-    cudnn_call_cnt++;
+    api_call_cnt++;
 #endif //WITH_API_CNT
     int8_result result;
     enum clnt_stat retval_1;
@@ -545,7 +547,7 @@ cudnnStatus_t cudnnGetPooling2dDescriptor(const cudnnPoolingDescriptor_t pooling
 cudnnStatus_t cudnnSetPoolingNdDescriptor(cudnnPoolingDescriptor_t poolingDesc, const cudnnPoolingMode_t mode, const cudnnNanPropagation_t maxpoolingNanOpt, int nbDims, const int* windowDimA, const int* paddingA, const int* strideA)
 {
 #ifdef WITH_API_CNT
-    cudnn_call_cnt++;
+    api_call_cnt++;
 #endif //WITH_API_CNT
     int result;
     enum clnt_stat retval_1;
@@ -583,7 +585,7 @@ cudnnStatus_t cudnnSetPoolingNdDescriptor(cudnnPoolingDescriptor_t poolingDesc,
 cudnnStatus_t cudnnGetPoolingNdDescriptor(const cudnnPoolingDescriptor_t poolingDesc, int nbDimsRequested, cudnnPoolingMode_t * mode, cudnnNanPropagation_t * maxpoolingNanOpt, int* nbDims, int* windowDimA, int* paddingA, int* strideA)
 {
 #ifdef WITH_API_CNT
-    cudnn_call_cnt++;
+    api_call_cnt++;
 #endif //WITH_API_CNT
     mem_result result;
     enum clnt_stat retval_1;
@@ -621,7 +623,7 @@ cudnnStatus_t cudnnGetPoolingNdDescriptor(const cudnnPoolingDescriptor_t pooling
 cudnnStatus_t cudnnGetPoolingNdForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc, const cudnnTensorDescriptor_t inputTensorDesc, int nbDims, int* outputTensorDimA)
 {
 #ifdef WITH_API_CNT
-    cudnn_call_cnt++;
+    api_call_cnt++;
 #endif //WITH_API_CNT
     mem_result result;
     enum clnt_stat retval_1;
@@ -649,7 +651,7 @@ cudnnStatus_t cudnnGetPoolingNdForwardOutputDim(const cudnnPoolingDescriptor_t p
 cudnnStatus_t cudnnGetPooling2dForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc, const cudnnTensorDescriptor_t inputTensorDesc, int* n, int* c, int* h, int* w)
 {
 #ifdef WITH_API_CNT
-    cudnn_call_cnt++;
+    api_call_cnt++;
 #endif //WITH_API_CNT
     int4_result result;
     enum clnt_stat retval_1;
@@ -678,7 +680,7 @@ cudnnStatus_t cudnnGetPooling2dForwardOutputDim(const cudnnPoolingDescriptor_t p
 cudnnStatus_t cudnnDestroyPoolingDescriptor(cudnnPoolingDescriptor_t poolingDesc)
 {
 #ifdef WITH_API_CNT
-    cudnn_call_cnt++;
+    api_call_cnt++;
 #endif //WITH_API_CNT
     int result;
     enum clnt_stat retval_1;
@@ -700,7 +702,7 @@ DEF_FN(cudnnStatus_t, cudnnPoolingForward, cudnnHandle_t, handle, const cudnnPoo
 cudnnStatus_t cudnnCreateActivationDescriptor(cudnnActivationDescriptor_t * activationDesc)
 {
 #ifdef WITH_API_CNT
-    cudnn_call_cnt++;
+    api_call_cnt++;
 #endif //WITH_API_CNT
     ptr_result result;
     enum clnt_stat retval_1;
@@ -723,7 +725,7 @@ cudnnStatus_t cudnnCreateActivationDescriptor(cudnnActivationDescriptor_t * acti
 cudnnStatus_t cudnnSetActivationDescriptor(cudnnActivationDescriptor_t activationDesc, cudnnActivationMode_t mode, cudnnNanPropagation_t reluNanOpt, double coef) 
 {
 #ifdef WITH_API_CNT
-    cudnn_call_cnt++;
+    api_call_cnt++;
 #endif //WITH_API_CNT
     int result;
     enum clnt_stat retval_1;
@@ -746,7 +748,7 @@ cudnnStatus_t cudnnSetActivationDescriptor(cudnnActivationDescriptor_t activatio
 cudnnStatus_t cudnnGetActivationDescriptor(const cudnnActivationDescriptor_t activationDesc, cudnnActivationMode_t *mode, cudnnNanPropagation_t *reluNanOpt, double *coef) 
 {
 #ifdef WITH_API_CNT
-    cudnn_call_cnt++;
+    api_call_cnt++;
 #endif //WITH_API_CNT
     int2d1_result result;
     enum clnt_stat retval_1;
@@ -774,7 +776,7 @@ cudnnStatus_t cudnnGetActivationDescriptor(const cudnnActivationDescriptor_t act
 cudnnStatus_t cudnnSetActivationDescriptorSwishBeta(cudnnActivationDescriptor_t activationDesc, double swish_beta)
 {
 #ifdef WITH_API_CNT
-    cudnn_call_cnt++;
+    api_call_cnt++;
 #endif //WITH_API_CNT
     int result;
     enum clnt_stat retval_1;
@@ -795,7 +797,7 @@ cudnnStatus_t cudnnSetActivationDescriptorSwishBeta(cudnnActivationDescriptor_t
 cudnnStatus_t cudnnGetActivationDescriptorSwishBeta(cudnnActivationDescriptor_t activationDesc, double * swish_beta)
 {
 #ifdef WITH_API_CNT
-    cudnn_call_cnt++;
+    api_call_cnt++;
 #endif //WITH_API_CNT
     d_result result;
     enum clnt_stat retval_1;
@@ -821,7 +823,7 @@ cudnnStatus_t cudnnGetActivationDescriptorSwishBeta(cudnnActivationDescriptor_t
 cudnnStatus_t cudnnDestroyActivationDescriptor(cudnnActivationDescriptor_t activationDesc)
 {
 #ifdef WITH_API_CNT
-    cudnn_call_cnt++;
+    api_call_cnt++;
 #endif //WITH_API_CNT
     int result;
     enum clnt_stat retval_1;
@@ -843,7 +845,7 @@ DEF_FN(cudnnStatus_t, cudnnActivationForward, cudnnHandle_t, handle, cudnnActiva
 cudnnStatus_t cudnnCreateLRNDescriptor(cudnnLRNDescriptor_t * normDesc)
 {
 #ifdef WITH_API_CNT
-    cudnn_call_cnt++;
+    api_call_cnt++;
 #endif //WITH_API_CNT
     ptr_result result;
     enum clnt_stat retval_1;
@@ -866,7 +868,7 @@ cudnnStatus_t cudnnCreateLRNDescriptor(cudnnLRNDescriptor_t * normDesc)
 cudnnStatus_t cudnnSetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned lrnN, double lrnAlpha, double lrnBeta, double lrnK)
 {
 #ifdef WITH_API_CNT
-    cudnn_call_cnt++;
+    api_call_cnt++;
 #endif //WITH_API_CNT
     int result;
     enum clnt_stat retval_1;
@@ -890,7 +892,7 @@ cudnnStatus_t cudnnSetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned lrnN
 cudnnStatus_t cudnnGetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned * lrnN, double * lrnAlpha, double * lrnBeta, double * lrnK)
 {
 #ifdef WITH_API_CNT
-    cudnn_call_cnt++;
+    api_call_cnt++;
 #endif //WITH_API_CNT
     int1d3_result result;
     enum clnt_stat retval_1;
@@ -919,7 +921,7 @@ cudnnStatus_t cudnnGetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned * lr
 cudnnStatus_t cudnnDestroyLRNDescriptor(cudnnLRNDescriptor_t lrnDesc)
 {
 #ifdef WITH_API_CNT
-    cudnn_call_cnt++;
+    api_call_cnt++;
 #endif //WITH_API_CNT
     int result;
     enum clnt_stat retval_1;
diff --git a/cpu/cpu-server-cublas.c b/cpu/cpu-server-cublas.c
index 972b2c31..c0d8104d 100644
--- a/cpu/cpu-server-cublas.c
+++ b/cpu/cpu-server-cublas.c
@@ -16,6 +16,7 @@
 #define WITH_RECORDER
 #include "api-recorder.h"
 #include "cpu-server-cublas.h"
+#include "gsched.h"
 
 
 
@@ -43,9 +44,12 @@ bool_t rpc_cublascreate_1_svc(ptr_result *result, struct svc_req *rqstp)
     RECORD_VOID_API;
     LOGE(LOG_DEBUG, "cublasCreate_v2");
 
+    GSCHED_RETAIN;
     result->err = cublasCreate_v2((cublasHandle_t*)&result->ptr_result_u.ptr);
-    RECORD_RESULT(ptr_result_u, *result);
     resource_mg_create(&rm_cublas, (void*)result->ptr_result_u.ptr);
+    GSCHED_RELEASE;
+    
+    RECORD_RESULT(ptr_result_u, *result);
     return 1;
 }
 
@@ -55,15 +59,33 @@ bool_t rpc_cublasdgemm_1_svc(ptr handle, int transa, int transb, int m, int n, i
             ptr C, int ldc,
             int *result, struct svc_req *rqstp)
 {
+    RECORD_API(rpc_cublasdgemm_1_argument);
+    RECORD_ARG(1, handle);
+    RECORD_ARG(2, transa);
+    RECORD_ARG(3, transb);
+    RECORD_ARG(4, m);
+    RECORD_ARG(5, n);
+    RECORD_ARG(6, k);
+    RECORD_ARG(7, alpha);
+    RECORD_ARG(8, A);
+    RECORD_ARG(9, lda);
+    RECORD_ARG(10, B);
+    RECORD_ARG(11, ldb);
+    RECORD_ARG(12, beta);
+    RECORD_ARG(13, C);
+    RECORD_ARG(14, ldc);
     LOGE(LOG_DEBUG, "cublasDgemm");
+    GSCHED_RETAIN;
     *result = cublasDgemm(resource_mg_get(&rm_cublas, (void*)handle),
                     (cublasOperation_t) transa,
                     (cublasOperation_t) transb,
                     m, n, k, &alpha,
-                    resource_mg_get(&rm_cublas, (void*)A), lda,
-                    resource_mg_get(&rm_cublas, (void*)B), ldb, &beta,
-                    resource_mg_get(&rm_cublas, (void*)C), ldc
+                    resource_mg_get(&rm_memory, (void*)A), lda,
+                    resource_mg_get(&rm_memory, (void*)B), ldb, &beta,
+                    resource_mg_get(&rm_memory, (void*)C), ldc
     );
+    GSCHED_RELEASE;
+    RECORD_RESULT(integer, *result);
     return 1;
 }
 
@@ -72,7 +94,113 @@ bool_t rpc_cublasdestroy_1_svc(ptr handle, int *result, struct svc_req *rqstp)
     RECORD_API(ptr);
     RECORD_SINGLE_ARG(handle);
     LOGE(LOG_DEBUG, "cublasDestroy_v2");
+    GSCHED_RETAIN;
     *result = cublasDestroy_v2(resource_mg_get(&rm_cublas, (void*)handle));
+    GSCHED_RELEASE;
+    RECORD_RESULT(integer, *result);
+    return 1;
+}
+
+bool_t rpc_cublassgemm_1_svc(ptr handle, int transa, int transb, int m, int n, int k, float alpha,
+            ptr A, int lda,
+            ptr B, int ldb, float beta,
+            ptr C, int ldc,
+            int *result, struct svc_req *rqstp)
+{
+    RECORD_API(rpc_cublassgemm_1_argument);
+    RECORD_ARG(1, handle);
+    RECORD_ARG(2, transa);
+    RECORD_ARG(3, transb);
+    RECORD_ARG(4, m);
+    RECORD_ARG(5, n);
+    RECORD_ARG(6, k);
+    RECORD_ARG(7, alpha);
+    RECORD_ARG(8, A);
+    RECORD_ARG(9, lda);
+    RECORD_ARG(10, B);
+    RECORD_ARG(11, ldb);
+    RECORD_ARG(12, beta);
+    RECORD_ARG(13, C);
+    RECORD_ARG(14, ldc);
+    LOGE(LOG_DEBUG, "cublasSgemm");
+    GSCHED_RETAIN;
+    *result = cublasSgemm(resource_mg_get(&rm_cublas, (void*)handle),
+                    (cublasOperation_t) transa,
+                    (cublasOperation_t) transb,
+                    m, n, k, &alpha,
+                    resource_mg_get(&rm_memory, (void*)A), lda,
+                    resource_mg_get(&rm_memory, (void*)B), ldb, &beta,
+                    resource_mg_get(&rm_memory, (void*)C), ldc
+    );
+    GSCHED_RELEASE;
     RECORD_RESULT(integer, *result);
     return 1;
 }
+
+bool_t rpc_cublassgemv_1_svc(ptr handle, int trans, int m, 
+            int n, float alpha,
+            ptr A, int lda,
+            ptr x, int incx, float beta,
+            ptr y, int incy,
+            int *result, struct svc_req *rqstp)
+{
+    RECORD_API(rpc_cublassgemv_1_argument);
+    RECORD_ARG(1, handle);
+    RECORD_ARG(2, trans);
+    RECORD_ARG(3, m);
+    RECORD_ARG(4, n);
+    RECORD_ARG(5, alpha);
+    RECORD_ARG(6, A);
+    RECORD_ARG(7, lda);
+    RECORD_ARG(8, x);
+    RECORD_ARG(9, incx);
+    RECORD_ARG(10, beta);
+    RECORD_ARG(11, y);
+    RECORD_ARG(12, incy);
+    LOGE(LOG_DEBUG, "cublasSgemv");
+    GSCHED_RETAIN;
+    *result = cublasSgemv(resource_mg_get(&rm_cublas, (void*)handle),
+                    (cublasOperation_t) trans,
+                    m, n, &alpha,
+                    resource_mg_get(&rm_memory, (void*)A), lda,
+                    resource_mg_get(&rm_memory, (void*)x), incx, &beta,
+                    resource_mg_get(&rm_memory, (void*)y), incy
+    );
+    GSCHED_RELEASE;
+    RECORD_RESULT(integer, *result);
+    return 1;
+}
+
+bool_t rpc_cublasdgemv_1_svc(ptr handle, int trans, int m, 
+            int n, double alpha,
+            ptr A, int lda,
+            ptr x, int incx, double beta,
+            ptr y, int incy,
+            int *result, struct svc_req *rqstp)
+{
+    RECORD_API(rpc_cublasdgemv_1_argument);
+    RECORD_ARG(1, handle);
+    RECORD_ARG(2, trans);
+    RECORD_ARG(3, m);
+    RECORD_ARG(4, n);
+    RECORD_ARG(5, alpha);
+    RECORD_ARG(6, A);
+    RECORD_ARG(7, lda);
+    RECORD_ARG(8, x);
+    RECORD_ARG(9, incx);
+    RECORD_ARG(10, beta);
+    RECORD_ARG(11, y);
+    RECORD_ARG(12, incy);
+    LOGE(LOG_DEBUG, "cublasDgemv");
+    GSCHED_RETAIN;
+    *result = cublasDgemv(resource_mg_get(&rm_cublas, (void*)handle),
+                    (cublasOperation_t) trans,
+                    m, n, &alpha,
+                    resource_mg_get(&rm_memory, (void*)A), lda,
+                    resource_mg_get(&rm_memory, (void*)x), incx, &beta,
+                    resource_mg_get(&rm_memory, (void*)y), incy
+    );
+    GSCHED_RELEASE;
+    RECORD_RESULT(integer, *result);
+    return 1;
+}
\ No newline at end of file
diff --git a/cpu/cpu_rpc_prot.x b/cpu/cpu_rpc_prot.x
index d840d549..0300bd6f 100644
--- a/cpu/cpu_rpc_prot.x
+++ b/cpu/cpu_rpc_prot.x
@@ -450,6 +450,12 @@ program RPC_CD_PROG {
         int          rpc_cublasDgemm(ptr, int, int, int, int, int, double,
                          ptr, int, ptr, int, double, ptr, int)                 = 3002;
         int          rpc_cublasDestroy(ptr)                                    = 3003;
+        int          rpc_cublasSgemm(ptr, int, int, int, int, int, float,
+                         ptr, int, ptr, int, float, ptr, int)                 = 3004;
+        int          rpc_cublasSgemv(ptr, int, int, int, float,
+                         ptr, int, ptr, int, float, ptr, int)                 = 3005;
+        int          rpc_cublasDgemv(ptr, int, int, int, double,
+                         ptr, int, ptr, int, double, ptr, int)                 = 3006;
 
         /* NVML */
         int_result   rpc_nvmlDeviceGetCount_v2(void)                           = 4000;

From 762cada09866dde2ae5f2cc380954829ab25d2f2 Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Wed, 21 Jun 2023 17:35:26 +0200
Subject: [PATCH 70/83] implement cudnn tensor functions

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 cpu/cpu-client-cudnn.c  | 223 ++++++++++++++++++++++++++++++++++++++--
 cpu/cpu-server-cudnn.c  | 124 ++++++++++++++++++++++
 cpu/cpu-server-driver.c |   2 +-
 cpu/cpu_rpc_prot.x      |   2 +-
 4 files changed, 341 insertions(+), 10 deletions(-)

diff --git a/cpu/cpu-client-cudnn.c b/cpu/cpu-client-cudnn.c
index 248a1dfe..03d7fd5d 100644
--- a/cpu/cpu-client-cudnn.c
+++ b/cpu/cpu-client-cudnn.c
@@ -214,14 +214,221 @@ cudnnStatus_t cudnnCreateTensorDescriptor(cudnnTensorDescriptor_t * tensorDesc)
     return result.err;
 }
 
-DEF_FN(cudnnStatus_t, cudnnSetTensor4dDescriptor, cudnnTensorDescriptor_t, tensorDesc, cudnnTensorFormat_t, format, cudnnDataType_t, dataType, int, n, int, c, int, h, int, w) 
-DEF_FN(cudnnStatus_t, cudnnSetTensor4dDescriptorEx, cudnnTensorDescriptor_t, tensorDesc, cudnnDataType_t, dataType, int, n, int, c, int, h, int, w, int, nStride, int, cStride, int, hStride, int, wStride)
-DEF_FN(cudnnStatus_t, cudnnGetTensor4dDescriptor, const cudnnTensorDescriptor_t, tensorDesc, cudnnDataType_t *, dataType, int*, n, int*, c, int*, h, int*, w, int*, nStride, int*, cStride, int*, hStride, int*, wStride)
-DEF_FN(cudnnStatus_t, cudnnSetTensorNdDescriptor, cudnnTensorDescriptor_t, tensorDesc, cudnnDataType_t, dataType, int, nbDims, const int*, dimA, const int*, strideA)
-DEF_FN(cudnnStatus_t, cudnnSetTensorNdDescriptorEx, cudnnTensorDescriptor_t, tensorDesc, cudnnTensorFormat_t, format, cudnnDataType_t, dataType, int, nbDims, const int*, dimA)
-DEF_FN(cudnnStatus_t, cudnnGetTensorNdDescriptor, const cudnnTensorDescriptor_t, tensorDesc, int, nbDimsRequested, cudnnDataType_t *, dataType, int*, nbDims, int*, dimA, int*, strideA)
-DEF_FN(cudnnStatus_t, cudnnGetTensorSizeInBytes, const cudnnTensorDescriptor_t, tensorDesc, size_t*, size)
-DEF_FN(cudnnStatus_t, cudnnDestroyTensorDescriptor, cudnnTensorDescriptor_t, tensorDesc)
+cudnnStatus_t cudnnSetTensor4dDescriptor(cudnnTensorDescriptor_t tensorDesc, cudnnTensorFormat_t format, cudnnDataType_t dataType, int n, int c, int h, int w) 
+{
+#ifdef WITH_API_CNT
+    api_call_cnt++;
+#endif //WITH_API_CNT
+    int result;
+    enum clnt_stat retval_1;
+    retval_1 = rpc_cudnnsettensor4ddescriptor_1(
+        (ptr)tensorDesc,
+        (int)format,
+        (int)dataType,
+        n, c, h, w, &result, clnt);
+
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
+    }
+    if (result != CUDNN_STATUS_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result);
+    } 
+    return result;
+}
+
+cudnnStatus_t cudnnSetTensor4dDescriptorEx(cudnnTensorDescriptor_t tensorDesc, cudnnDataType_t dataType, int n, int c, int h, int w, int nStride, int cStride, int hStride, int wStride)
+{
+#ifdef WITH_API_CNT
+    api_call_cnt++;
+#endif //WITH_API_CNT
+    int result;
+    enum clnt_stat retval_1;
+    retval_1 = rpc_cudnnsettensor4ddescriptorex_1(
+        (ptr)tensorDesc,
+        (int)dataType,
+        n, c, h, w, nStride, cStride, hStride, wStride, &result, clnt);
+
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
+    }
+    if (result != CUDNN_STATUS_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result);
+    } 
+    return result;
+}
+
+cudnnStatus_t cudnnGetTensor4dDescriptor(const cudnnTensorDescriptor_t tensorDesc, cudnnDataType_t *dataType, int* n, int* c, int* h, int* w, int* nStride, int* cStride, int* hStride, int* wStride)
+{
+#ifdef WITH_API_CNT
+    api_call_cnt++;
+#endif //WITH_API_CNT
+    int9_result result;
+    enum clnt_stat retval_1;
+    if (dataType == NULL || n == NULL || c == NULL || h == NULL || w == NULL || nStride == NULL || cStride == NULL || hStride == NULL || wStride == NULL) { 
+        LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__);
+        return CUDNN_STATUS_BAD_PARAM;
+    }
+    retval_1 = rpc_cudnngettensor4ddescriptor_1(
+        (ptr)tensorDesc,
+        &result, clnt);
+
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
+    }
+    if (result.err != CUDNN_STATUS_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err); // %d takes the int status code, not the result struct
+    } else {
+        *dataType = (cudnnDataType_t)result.int9_result_u.data[0];
+        *n = result.int9_result_u.data[1];
+        *c = result.int9_result_u.data[2];
+        *h = result.int9_result_u.data[3];
+        *w = result.int9_result_u.data[4];
+        *nStride = result.int9_result_u.data[5];
+        *cStride = result.int9_result_u.data[6];
+        *hStride = result.int9_result_u.data[7];
+        *wStride = result.int9_result_u.data[8];
+    }
+    return result.err;
+}
+
+cudnnStatus_t cudnnSetTensorNdDescriptor(cudnnTensorDescriptor_t tensorDesc, cudnnDataType_t dataType, int nbDims, const int* dimA, const int* strideA)
+{
+#ifdef WITH_API_CNT
+    api_call_cnt++;
+#endif //WITH_API_CNT
+    int result;
+    enum clnt_stat retval_1;
+    mem_data rpc_dimA = {
+        .mem_data_len = nbDims * sizeof(int),
+        .mem_data_val = (char*)dimA
+    };
+    mem_data rpc_strideA = {
+        .mem_data_len = nbDims * sizeof(int),
+        .mem_data_val = (char*)strideA
+    };
+    retval_1 = rpc_cudnnsettensornddescriptor_1(
+        (ptr)tensorDesc,
+        (int)dataType,
+        (int)nbDims,
+        rpc_dimA, rpc_strideA, &result, clnt);
+
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
+    }
+    if (result != CUDNN_STATUS_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result);
+    } 
+    return result;
+}
+
+cudnnStatus_t cudnnSetTensorNdDescriptorEx(cudnnTensorDescriptor_t tensorDesc, cudnnTensorFormat_t format, cudnnDataType_t dataType, int nbDims, const int* dimA)
+{
+#ifdef WITH_API_CNT
+    api_call_cnt++;
+#endif //WITH_API_CNT
+    int result;
+    enum clnt_stat retval_1;
+    mem_data rpc_dimA = {
+        .mem_data_len = nbDims * sizeof(int),
+        .mem_data_val = (char*)dimA
+    };
+    retval_1 = rpc_cudnnsettensornddescriptorex_1(
+        (ptr)tensorDesc,
+        (int)format,
+        (int)dataType,
+        (int)nbDims,
+        rpc_dimA, &result, clnt);
+
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
+    }
+    if (result != CUDNN_STATUS_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result);
+    } 
+    return result;
+}
+
+cudnnStatus_t cudnnGetTensorNdDescriptor(const cudnnTensorDescriptor_t tensorDesc, int nbDimsRequested, cudnnDataType_t *dataType, int* nbDims, int* dimA, int* strideA)
+{
+#ifdef WITH_API_CNT
+    api_call_cnt++;
+#endif //WITH_API_CNT
+    mem_result result;
+    enum clnt_stat retval_1;
+    if (dataType == NULL || nbDims == NULL || dimA == NULL || strideA == NULL) { 
+        LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__);
+        return CUDNN_STATUS_BAD_PARAM;
+    }
+    retval_1 = rpc_cudnngettensornddescriptor_1(
+        (ptr)tensorDesc,
+        nbDimsRequested,
+        &result, clnt);
+
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
+    }
+    size_t expected_size = nbDimsRequested * sizeof(int) * 2 + sizeof(int) + sizeof(cudnnDataType_t);
+    if (result.err != CUDNN_STATUS_SUCCESS || result.mem_result_u.data.mem_data_len != expected_size) {
+        LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err);
+    } else {
+        size_t offset = 0;
+        memcpy(dataType, result.mem_result_u.data.mem_data_val+offset, sizeof(cudnnDataType_t)); // copy the whole field; indexing the char buffer would read only one byte
+        offset += sizeof(cudnnDataType_t);
+        memcpy(nbDims, result.mem_result_u.data.mem_data_val+offset, sizeof(int)); // nbDims is packed as a full int right after dataType
+        offset += sizeof(int);
+        memcpy(dimA, result.mem_result_u.data.mem_data_val+offset, *nbDims * sizeof(int));
+        offset += *nbDims * sizeof(int);
+        memcpy(strideA, result.mem_result_u.data.mem_data_val+offset, *nbDims * sizeof(int));
+    }
+    return result.err;
+}
+
+cudnnStatus_t cudnnGetTensorSizeInBytes(const cudnnTensorDescriptor_t tensorDesc, size_t* size)
+{
+#ifdef WITH_API_CNT
+    api_call_cnt++;
+#endif //WITH_API_CNT
+    sz_result result;
+    enum clnt_stat retval_1;
+    if (size == NULL) {
+        LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__);
+        return CUDNN_STATUS_BAD_PARAM;
+    }
+    retval_1 = rpc_cudnngettensorsizeinbytes_1(
+        (ptr)tensorDesc,
+        &result, clnt);
+
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
+    }
+    if (result.err != CUDNN_STATUS_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err);
+    } else {
+        *size = result.sz_result_u.data;
+    }
+    return result.err;
+}
+
+cudnnStatus_t cudnnDestroyTensorDescriptor(cudnnTensorDescriptor_t tensorDesc)
+{
+#ifdef WITH_API_CNT
+    api_call_cnt++;
+#endif //WITH_API_CNT
+    int result;
+    enum clnt_stat retval_1;
+    retval_1 = rpc_cudnndestroytensordescriptor_1(
+        (ptr)tensorDesc,
+        &result, clnt);
+
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
+    }
+    if (result != CUDNN_STATUS_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result);
+    }
+    return result;
+}
+
 DEF_FN(cudnnStatus_t, cudnnInitTransformDest, const cudnnTensorTransformDescriptor_t, transformDesc, const cudnnTensorDescriptor_t, srcDesc, cudnnTensorDescriptor_t, destDesc, size_t*, destSizeInBytes)
 DEF_FN(cudnnStatus_t, cudnnCreateTensorTransformDescriptor, cudnnTensorTransformDescriptor_t *, transformDesc)
 DEF_FN(cudnnStatus_t, cudnnSetTensorTransformDescriptor, cudnnTensorTransformDescriptor_t, transformDesc, const uint32_t, nbDims, const cudnnTensorFormat_t, destFormat, const int32_t*, padBeforeA, const int32_t*, padAfterA, const uint32_t*, foldA, const cudnnFoldingDirection_t,  direction)
diff --git a/cpu/cpu-server-cudnn.c b/cpu/cpu-server-cudnn.c
index e71e55ee..1f5ebf09 100644
--- a/cpu/cpu-server-cudnn.c
+++ b/cpu/cpu-server-cudnn.c
@@ -181,6 +181,130 @@ bool_t rpc_cudnncreatetensordescriptor_1_svc(ptr_result *result, struct svc_req
     return 1;
 }
 
+bool_t rpc_cudnnsettensor4ddescriptor_1_svc(ptr tensorDesc, int format, int dataType, int n, int c, int h, int w, int *result, struct svc_req *rqstp)
+{
+    RECORD_API(rpc_cudnnsettensor4ddescriptor_1_argument);
+    RECORD_NARG(tensorDesc);
+    RECORD_NARG(format);
+    RECORD_NARG(dataType);
+    RECORD_NARG(n);
+    RECORD_NARG(c);
+    RECORD_NARG(h);
+    RECORD_NARG(w);
+
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+
+    GSCHED_RETAIN;
+    *result = cudnnSetTensor4dDescriptor(
+        (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)tensorDesc),
+        (cudnnTensorFormat_t)format,
+        (cudnnDataType_t)dataType,
+        n, c, h, w);
+    GSCHED_RELEASE;
+    RECORD_RESULT(integer, *result);
+    return 1;
+}
+
+bool_t rpc_cudnngettensor4ddescriptor_1_svc(ptr tensorDesc, int9_result *result, struct svc_req *rqstp)
+{
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+
+    GSCHED_RETAIN;
+    result->err = cudnnGetTensor4dDescriptor(
+        (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)tensorDesc),
+        (cudnnDataType_t*)&result->int9_result_u.data[0],
+        &result->int9_result_u.data[1],
+        &result->int9_result_u.data[2],
+        &result->int9_result_u.data[3],
+        &result->int9_result_u.data[4],
+        &result->int9_result_u.data[5],
+        &result->int9_result_u.data[6],
+        &result->int9_result_u.data[7],
+        &result->int9_result_u.data[8]);
+    GSCHED_RELEASE;
+    return 1;
+}
+
+bool_t rpc_cudnnsettensornddescriptor_1_svc(ptr tensorDesc, int dataType, int nbDims, mem_data dimA, mem_data strideA, int *result, struct svc_req *rqstp)
+{
+    RECORD_API(rpc_cudnnsettensornddescriptor_1_argument);
+    RECORD_NARG(tensorDesc);
+    RECORD_NARG(dataType);
+    RECORD_NARG(nbDims);
+    RECORD_NARG(dimA);
+    RECORD_NARG(strideA);
+    
+    //TODO: Recording dimA and strideA is not as easy as done here.
+
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+
+    if (dimA.mem_data_len != nbDims * sizeof(int) || strideA.mem_data_len != nbDims * sizeof(int)) {
+        LOGE(LOG_ERROR, "array dimensions not as expected.");
+        return 0;
+    }
+    GSCHED_RETAIN;
+    *result = cudnnSetTensorNdDescriptor(
+        (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)tensorDesc),
+        (cudnnDataType_t)dataType,
+        nbDims,
+        (const int*)dimA.mem_data_val,
+        (const int*)strideA.mem_data_val);
+    GSCHED_RELEASE;
+    RECORD_RESULT(integer, *result);
+    return 1;
+}
+
+bool_t rpc_cudnnsettensornddescriptorex_1_svc(ptr tensorDesc, int format, int dataType, int nbDims, mem_data dimA, int *result, struct svc_req *rqstp)
+{
+    RECORD_API(rpc_cudnnsettensornddescriptorex_1_argument);
+    RECORD_NARG(tensorDesc);
+    RECORD_NARG(format);
+    RECORD_NARG(dataType);
+    RECORD_NARG(nbDims);
+    RECORD_NARG(dimA);
+    
+    //TODO: Recording dimA and strideA is not as easy as done here.
+
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+
+    if (dimA.mem_data_len != nbDims * sizeof(int)) {
+        LOGE(LOG_ERROR, "array dimensions not as expected.");
+        return 0;
+    }
+    GSCHED_RETAIN;
+    *result = cudnnSetTensorNdDescriptorEx(
+        (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)tensorDesc),
+        (cudnnTensorFormat_t)format,   
+        (cudnnDataType_t)dataType,
+        nbDims,
+        (const int*)dimA.mem_data_val);
+    GSCHED_RELEASE;
+    RECORD_RESULT(integer, *result);
+    return 1;
+}
+
+bool_t rpc_cudnngettensornddescriptor_1_svc(ptr tensorDesc, int nbDimsRequested, mem_result *result, struct svc_req *rqstp)
+{
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+    result->mem_result_u.data.mem_data_len = sizeof(cudnnDataType_t) + sizeof(int) + nbDimsRequested*sizeof(int)*2;
+    if ((result->mem_result_u.data.mem_data_val = malloc(result->mem_result_u.data.mem_data_len)) == NULL) {
+        LOGE(LOG_ERROR, "malloc failed");
+        return 0;
+    }
+    
+    GSCHED_RETAIN;
+    result->err = cudnnGetTensorNdDescriptor(
+        (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)tensorDesc),
+        nbDimsRequested,
+        (cudnnDataType_t*)result->mem_result_u.data.mem_data_val,
+        (int*)&result->mem_result_u.data.mem_data_val[sizeof(cudnnDataType_t)],
+        (int*)&result->mem_result_u.data.mem_data_val[sizeof(cudnnDataType_t)+sizeof(int)],
+        (int*)&result->mem_result_u.data.mem_data_val[sizeof(cudnnDataType_t)+sizeof(int)+nbDimsRequested*sizeof(int)]);
+
+    GSCHED_RELEASE;
+    return 1;
+}
+
 bool_t rpc_cudnncreatefilterdescriptor_1_svc(ptr_result *result, struct svc_req *rqstp)
 {
     RECORD_VOID_API;
diff --git a/cpu/cpu-server-driver.c b/cpu/cpu-server-driver.c
index 5f51c6ad..39759457 100644
--- a/cpu/cpu-server-driver.c
+++ b/cpu/cpu-server-driver.c
@@ -320,7 +320,7 @@ bool_t rpc_cumoduleunload_1_svc(ptr module, int *result,
     RECORD_SINGLE_ARG(module);
     LOG(LOG_DEBUG, "%s(%p)", __FUNCTION__, (void*)module);
     GSCHED_RETAIN;
-    *result = cuModuleUnload(resource_mg_get(&rm_streams, (void*)module));
+    *result = cuModuleUnload(resource_mg_get(&rm_modules, (void*)module));
     GSCHED_RELEASE;
     RECORD_RESULT(integer, *result);
     return 1;
diff --git a/cpu/cpu_rpc_prot.x b/cpu/cpu_rpc_prot.x
index 0300bd6f..490ad9be 100644
--- a/cpu/cpu_rpc_prot.x
+++ b/cpu/cpu_rpc_prot.x
@@ -475,7 +475,6 @@ program RPC_CD_PROG {
         int         rpc_cudnnSetStream(ptr handle, ptr streamId) = 5008;
         ptr_result  rpc_cudnnGetStream(ptr handle) = 5009;
         ptr_result  rpc_cudnnCreateTensorDescriptor(void) = 5010;
-        /*
         int         rpc_cudnnSetTensor4dDescriptor(ptr tensorDesc, int format, int dataType, int n, int c, int h, int w) = 5011;
         int         rpc_cudnnSetTensor4dDescriptorEx(ptr tensorDesc, int dataType, int n, int c, int h, int w, int nStride, int cStride, int hStride, int wStride) = 5012;
         int9_result rpc_cudnnGetTensor4dDescriptor(ptr tensorDesc) = 5013;
@@ -484,6 +483,7 @@ program RPC_CD_PROG {
         mem_result  rpc_cudnnGetTensorNdDescriptor(ptr tensorDesc, int nbDimsRequested) = 5016;
         sz_result   rpc_cudnnGetTensorSizeInBytes(ptr tensorDesc) = 5017;
         int         rpc_cudnnDestroyTensorDescriptor(ptr tensorDesc) = 5018;
+        /*
         sz_result   rpc_cudnnInitTransformDest(ptr transformDesc, ptr srcDesc, ptr destDesc) = 5019;
         ptr_result  rpc_cudnnCreateTensorTransformDescriptor(void) = 5020;
         int         rpc_cudnnSetTensorTransformDescriptor(ptr transformDesc, uint32_t nbDims, int destFormat, mem_data padBeforeA, mem_data padAfterA, mem_data foldA, int direction) = 5021;

From 5d381a70d298cc33c49703a0714f5897a3ff47fc Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Thu, 22 Jun 2023 10:31:22 +0200
Subject: [PATCH 71/83] implement three more cudnn tensor APIs

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 cpu/cpu-server-cudnn.c | 53 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)

diff --git a/cpu/cpu-server-cudnn.c b/cpu/cpu-server-cudnn.c
index 1f5ebf09..228e9920 100644
--- a/cpu/cpu-server-cudnn.c
+++ b/cpu/cpu-server-cudnn.c
@@ -205,6 +205,32 @@ bool_t rpc_cudnnsettensor4ddescriptor_1_svc(ptr tensorDesc, int format, int data
     return 1;
 }
 
+bool_t rpc_cudnnsettensor4ddescriptorex_1_svc(ptr tensorDesc, int dataType, int n, int c, int h, int w, int nStride, int cStride, int hStride, int wStride, int *result, struct svc_req *rqstp)
+{
+    RECORD_API(rpc_cudnnsettensor4ddescriptorex_1_argument);
+    RECORD_NARG(tensorDesc);
+    RECORD_NARG(dataType);
+    RECORD_NARG(n);
+    RECORD_NARG(c);
+    RECORD_NARG(h);
+    RECORD_NARG(w);
+    RECORD_NARG(nStride);
+    RECORD_NARG(cStride);
+    RECORD_NARG(hStride);
+    RECORD_NARG(wStride);
+
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+
+    GSCHED_RETAIN;
+    *result = cudnnSetTensor4dDescriptorEx(
+        (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)tensorDesc),
+        (cudnnDataType_t)dataType,
+        n, c, h, w, nStride, cStride, hStride, wStride);
+    GSCHED_RELEASE;
+    RECORD_RESULT(integer, *result);
+    return 1;
+}
+
 bool_t rpc_cudnngettensor4ddescriptor_1_svc(ptr tensorDesc, int9_result *result, struct svc_req *rqstp)
 {
     LOGE(LOG_DEBUG, "%s", __FUNCTION__);
@@ -305,6 +331,33 @@ bool_t rpc_cudnngettensornddescriptor_1_svc(ptr tensorDesc, int nbDimsRequested,
     return 1;
 }
 
+bool_t rpc_cudnngettensorsizeinbytes_1_svc(ptr tensorDesc, sz_result *result, struct svc_req *rqstp)
+{
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+    GSCHED_RETAIN;
+    result->err = cudnnGetTensorSizeInBytes(
+        (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)tensorDesc),
+        &result->sz_result_u.data);
+    GSCHED_RELEASE;
+    return 1;
+}
+
+bool_t rpc_cudnndestroytensordescriptor_1_svc(ptr tensorDesc, int *result, struct svc_req *rqstp)
+{
+    RECORD_API(ptr);
+    RECORD_SINGLE_ARG(tensorDesc);
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+
+    GSCHED_RETAIN;
+    *result = cudnnDestroyTensorDescriptor(
+        (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)tensorDesc));
+    // TODO: Remove from resource manager
+    GSCHED_RELEASE;
+    RECORD_RESULT(integer, *result);
+    return 1;
+}
+
+
 bool_t rpc_cudnncreatefilterdescriptor_1_svc(ptr_result *result, struct svc_req *rqstp)
 {
     RECORD_VOID_API;

From 6da2f8d80ca0e49e7e1eac64d2e37b6330f75b61 Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Mon, 26 Jun 2023 10:36:17 +0200
Subject: [PATCH 72/83] add cublas and cudnn functions to support mnistCUDNN
 sample Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>

---
 cpu/cpu-client-cublas.c |  31 +++
 cpu/cpu-client-cudnn.c  | 486 ++++++++++++++++++++++++++++++++++++++--
 cpu/cpu-server-cublas.c |  39 ++++
 cpu/cpu-server-cudnn.c  | 355 ++++++++++++++++++++++++++++-
 cpu/cpu_rpc_prot.x      |  23 +-
 cpu/resource-mg.h       |   1 +
 6 files changed, 916 insertions(+), 19 deletions(-)

diff --git a/cpu/cpu-client-cublas.c b/cpu/cpu-client-cublas.c
index c8ffa2bf..39e87636 100644
--- a/cpu/cpu-client-cublas.c
+++ b/cpu/cpu-client-cublas.c
@@ -112,6 +112,37 @@ cublasStatus_t cublasSgemm(cublasHandle_t handle,
     return result;
 }
 
+cublasStatus_t cublasSgemmEx(cublasHandle_t handle,
+                           cublasOperation_t transa, cublasOperation_t transb,
+                           int m, int n, int k,
+                           const float *alpha,
+                           const void *A, cudaDataType_t Atype, int lda,
+                           const void *B, cudaDataType_t Btype, int ldb,
+                           const float *beta,
+                           void *C, cudaDataType_t Ctype, int ldc)
+{
+#ifdef WITH_API_CNT
+    api_call_cnt++;
+#endif //WITH_API_CNT
+    int result;
+    enum clnt_stat retval_1;
+    retval_1 = rpc_cublassgemmex_1(
+        (ptr)handle,
+        (int)transa,
+        (int)transb,
+        m, n, k,
+        *alpha,
+        (ptr)A, (int)Atype, lda,
+        (ptr)B, (int)Btype, ldb,
+        *beta,
+        (ptr)C, (int)Ctype, ldc,
+         &result, clnt);
+    if (retval_1 != RPC_SUCCESS) {
+        clnt_perror (clnt, "call failed");
+    }
+    return result;
+}
+
 cublasStatus_t cublasDgemv(cublasHandle_t handle,
                            cublasOperation_t trans,
                            int m, int n,
diff --git a/cpu/cpu-client-cudnn.c b/cpu/cpu-client-cudnn.c
index 03d7fd5d..e7c8c691 100644
--- a/cpu/cpu-client-cudnn.c
+++ b/cpu/cpu-client-cudnn.c
@@ -1,4 +1,3 @@
-#include <cuda_runtime.h>
 #include <cudnn.h>
 #include <stdint.h>
 
@@ -25,6 +24,7 @@ size_t cudnnGetVersion(void)
     }
     return result;
 }
+
 size_t cudnnGetMaxDeviceVersion(void)
 {
 #ifdef WITH_API_CNT
@@ -51,12 +51,14 @@ size_t cudnnGetCudartVersion(void)
     }
     return result;
 }
+
 const char *cudnnGetErrorString(cudnnStatus_t status)
 {
 #ifdef WITH_API_CNT
     api_call_cnt++;
 #endif //WITH_API_CNT
-    char *result;
+    static char str[128];
+    char *result = NULL;
     enum clnt_stat retval_1;
     retval_1 = rpc_cudnngeterrorstring_1((int)status, &result, clnt);
     if (retval_1 != RPC_SUCCESS) {
@@ -65,7 +67,8 @@ const char *cudnnGetErrorString(cudnnStatus_t status)
     if (result == NULL) {
         LOGE(LOG_ERROR, "%s failed (result is NULL)", __FUNCTION__);
     }
-    return result;
+    strncpy(str, result != NULL ? result : "", sizeof(str) - 1); /* NULL-safe; the last byte of the static buffer is never written, so str stays NUL-terminated */
+    return str;
 }
 
 cudnnStatus_t cudnnQueryRuntimeError(cudnnHandle_t handle, cudnnStatus_t* rstatus, cudnnErrQueryMode_t  mode, cudnnRuntimeTag_t * tag)
@@ -353,7 +356,9 @@ cudnnStatus_t cudnnGetTensorNdDescriptor(const cudnnTensorDescriptor_t tensorDes
 #ifdef WITH_API_CNT
     api_call_cnt++;
 #endif //WITH_API_CNT
+    size_t expected_size = nbDimsRequested * sizeof(int) * 2 + sizeof(int) + sizeof(cudnnDataType_t);
     mem_result result;
+    result.mem_result_u.data.mem_data_val = malloc(expected_size);
     enum clnt_stat retval_1;
     if (dataType == NULL || nbDims == NULL || dimA == NULL || strideA == NULL) { 
         LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__);
@@ -367,7 +372,6 @@ cudnnStatus_t cudnnGetTensorNdDescriptor(const cudnnTensorDescriptor_t tensorDes
     if (retval_1 != RPC_SUCCESS) {
         LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
     }
-    size_t expected_size = nbDimsRequested * sizeof(int) * 2 + sizeof(int) + sizeof(cudnnDataType_t);
     if (result.err != CUDNN_STATUS_SUCCESS || result.mem_result_u.data.mem_data_len != expected_size) {
         LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err);
     } else {
@@ -380,6 +384,7 @@ cudnnStatus_t cudnnGetTensorNdDescriptor(const cudnnTensorDescriptor_t tensorDes
         offset += *nbDims * sizeof(int);
         memcpy(strideA, result.mem_result_u.data.mem_data_val+offset, *nbDims * sizeof(int));
     }
+    free(result.mem_result_u.data.mem_data_val);
     return result.err;
 }
 
@@ -434,9 +439,65 @@ DEF_FN(cudnnStatus_t, cudnnCreateTensorTransformDescriptor, cudnnTensorTransform
 DEF_FN(cudnnStatus_t, cudnnSetTensorTransformDescriptor, cudnnTensorTransformDescriptor_t, transformDesc, const uint32_t, nbDims, const cudnnTensorFormat_t, destFormat, const int32_t*, padBeforeA, const int32_t*, padAfterA, const uint32_t*, foldA, const cudnnFoldingDirection_t,  direction)
 DEF_FN(cudnnStatus_t, cudnnGetTensorTransformDescriptor, cudnnTensorTransformDescriptor_t, transformDesc, uint32_t, nbDimsRequested, cudnnTensorFormat_t *, destFormat, int32_t*, padBeforeA, int32_t*, padAfterA, uint32_t*, foldA, cudnnFoldingDirection_t *, direction)
 DEF_FN(cudnnStatus_t, cudnnDestroyTensorTransformDescriptor, cudnnTensorTransformDescriptor_t, transformDesc)
-DEF_FN(cudnnStatus_t, cudnnTransformTensor, cudnnHandle_t, handle, const void *, alpha, const cudnnTensorDescriptor_t, xDesc, const void *, x, const void *, beta, const cudnnTensorDescriptor_t, yDesc, void *, y)
+
+/* RPC client stub for cudnnTransformTensor: handle, descriptors and data pointers
+ * are cast to ptr; alpha/beta are wrapped in cudnn_scaling_t tagged as double.
+ * Returns the server status, or CUDNN_STATUS_INTERNAL_ERROR if the RPC itself fails. */
+cudnnStatus_t cudnnTransformTensor(cudnnHandle_t handle, const void * alpha, const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta, const cudnnTensorDescriptor_t yDesc, void *y)
+{
+#ifdef WITH_API_CNT
+    api_call_cnt++;
+#endif //WITH_API_CNT
+    int result = CUDNN_STATUS_INTERNAL_ERROR; /* pre-set: never read uninitialized if the RPC fails */
+    enum clnt_stat retval_1;
+    //TODO: Check if we have a float instead of always sending doubles; until then
+    //      a caller that passes float scalars has them misread by the double cast.
+    cudnn_scaling_t rpc_alpha = {.dataType = CUDNN_DATA_DOUBLE, .cudnn_scaling_t_u.d = *((double*)alpha)};
+    cudnn_scaling_t rpc_beta = {.dataType = CUDNN_DATA_DOUBLE, .cudnn_scaling_t_u.d = *((double*)beta)};
+    retval_1 = rpc_cudnntransformtensor_1(
+        (ptr)handle,
+        rpc_alpha, (ptr)xDesc, (ptr)x,
+        rpc_beta, (ptr)yDesc, (ptr)y,
+        &result, clnt);
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
+    }
+    if (result != CUDNN_STATUS_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result);
+    }
+    return result;
+}
+
 DEF_FN(cudnnStatus_t, cudnnTransformTensorEx, cudnnHandle_t, handle, const cudnnTensorTransformDescriptor_t, transDesc, const void *, alpha, const cudnnTensorDescriptor_t, srcDesc, const void *, srcData, const void *, beta, const cudnnTensorDescriptor_t, destDesc, void *, destData)
-DEF_FN(cudnnStatus_t, cudnnAddTensor, cudnnHandle_t, handle, const void *, alpha, const cudnnTensorDescriptor_t, aDesc, const void *, A, const void *, ,beta, const cudnnTensorDescriptor_t, cDesc, void *, C)
+    
+/* RPC client stub for cudnnAddTensor: handle, descriptors and data pointers are
+ * cast to ptr; alpha/beta are wrapped in cudnn_scaling_t tagged as double.
+ * Returns the server status, or CUDNN_STATUS_INTERNAL_ERROR if the RPC itself fails. */
+cudnnStatus_t cudnnAddTensor(cudnnHandle_t handle, const void * alpha, const cudnnTensorDescriptor_t aDesc, const void * A, const void *beta, const cudnnTensorDescriptor_t cDesc, void * C)
+{
+#ifdef WITH_API_CNT
+    api_call_cnt++;
+#endif //WITH_API_CNT
+    int result = CUDNN_STATUS_INTERNAL_ERROR; /* pre-set: never read uninitialized if the RPC fails */
+    enum clnt_stat retval_1;
+    //TODO: Check if we have a float instead of always sending doubles; until then
+    //      a caller that passes float scalars has them misread by the double cast.
+    cudnn_scaling_t rpc_alpha = {.dataType = CUDNN_DATA_DOUBLE, .cudnn_scaling_t_u.d = *((double*)alpha)};
+    cudnn_scaling_t rpc_beta = {.dataType = CUDNN_DATA_DOUBLE, .cudnn_scaling_t_u.d = *((double*)beta)};
+    retval_1 = rpc_cudnnaddtensor_1(
+        (ptr)handle,
+        rpc_alpha, (ptr)aDesc, (ptr)A,
+        rpc_beta, (ptr)cDesc, (ptr)C,
+        &result, clnt);
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
+    }
+    if (result != CUDNN_STATUS_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result);
+    }
+    return result;
+}
+    
 DEF_FN(cudnnStatus_t, cudnnCreateOpTensorDescriptor, cudnnOpTensorDescriptor_t *, opTensorDesc)
 DEF_FN(cudnnStatus_t, cudnnSetOpTensorDescriptor, cudnnOpTensorDescriptor_t, opTensorDesc, cudnnOpTensorOp_t, opTensorOp, cudnnDataType_t, opTensorCompType, cudnnNanPropagation_t, opTensorNanOpt)
 DEF_FN(cudnnStatus_t, cudnnGetOpTensorDescriptor, const cudnnOpTensorDescriptor_t, opTensorDesc, cudnnOpTensorOp_t *, opTensorOp, cudnnDataType_t *, opTensorCompType, cudnnNanPropagation_t *, opTensorNanOpt)
@@ -560,7 +621,9 @@ cudnnStatus_t cudnnGetFilterNdDescriptor(const cudnnFilterDescriptor_t filterDes
 #ifdef WITH_API_CNT
     api_call_cnt++;
 #endif //WITH_API_CNT
+    size_t expected_size = nbDimsRequested * sizeof(int) + sizeof(int) + sizeof(cudnnDataType_t) + sizeof(cudnnTensorFormat_t);
     mem_result result;
+    result.mem_result_u.data.mem_data_val = (char*)malloc(expected_size);
     enum clnt_stat retval_1;
     if (dataType == NULL || format == NULL || nbDims == NULL || filterDimA == NULL) {
         LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__);
@@ -574,7 +637,6 @@ cudnnStatus_t cudnnGetFilterNdDescriptor(const cudnnFilterDescriptor_t filterDes
     if (retval_1 != RPC_SUCCESS) {
         LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
     }
-    size_t expected_size = nbDimsRequested * sizeof(int) + sizeof(int) + sizeof(cudnnDataType_t) + sizeof(cudnnTensorFormat_t);
     if (result.err != CUDNN_STATUS_SUCCESS || result.mem_result_u.data.mem_data_len < expected_size) {
         LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err);
     } else {
@@ -587,6 +649,7 @@ cudnnStatus_t cudnnGetFilterNdDescriptor(const cudnnFilterDescriptor_t filterDes
         offset += sizeof(int);
         memcpy(filterDimA, result.mem_result_u.data.mem_data_val+offset, *nbDims * sizeof(int));
     }
+    free(result.mem_result_u.data.mem_data_val);
     return result.err;
 }
 
@@ -666,7 +729,36 @@ cudnnStatus_t cudnnDestroyFilterDescriptor(cudnnFilterDescriptor_t filterDesc)
     return result;
 }
 
-DEF_FN(cudnnStatus_t, cudnnSoftmaxForward, cudnnHandle_t, handle, cudnnSoftmaxAlgorithm_t, algo, cudnnSoftmaxMode_t, mode, const void *,alpha, const cudnnTensorDescriptor_t, xDesc, const void *, x, const void *, beta, const cudnnTensorDescriptor_t, yDesc, void *, y)
+/* RPC client stub for cudnnSoftmaxForward: algo/mode are sent as int, handle,
+ * descriptors and data pointers as ptr, alpha/beta as cudnn_scaling_t (double).
+ * Returns the server status, or CUDNN_STATUS_INTERNAL_ERROR if the RPC itself fails. */
+cudnnStatus_t cudnnSoftmaxForward(cudnnHandle_t handle, cudnnSoftmaxAlgorithm_t algo, cudnnSoftmaxMode_t mode, const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta, const cudnnTensorDescriptor_t yDesc, void * y)
+{
+#ifdef WITH_API_CNT
+    api_call_cnt++;
+#endif //WITH_API_CNT
+    int result = CUDNN_STATUS_INTERNAL_ERROR; /* pre-set: never read uninitialized if the RPC fails */
+    enum clnt_stat retval_1;
+    //TODO: Check if we have a float instead of always sending doubles; until then
+    //      a caller that passes float scalars has them misread by the double cast.
+    cudnn_scaling_t rpc_alpha = {.dataType = CUDNN_DATA_DOUBLE, .cudnn_scaling_t_u.d = *((double*)alpha)};
+    cudnn_scaling_t rpc_beta = {.dataType = CUDNN_DATA_DOUBLE, .cudnn_scaling_t_u.d = *((double*)beta)};
+    retval_1 = rpc_cudnnsoftmaxforward_1(
+        (ptr)handle,
+        (int)algo,
+        (int)mode,
+        rpc_alpha, (ptr)xDesc, (ptr)x,
+        rpc_beta, (ptr)yDesc, (ptr)y,
+        &result, clnt);
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
+    }
+    if (result != CUDNN_STATUS_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result);
+    }
+    return result;
+}
+    
 cudnnStatus_t cudnnCreatePoolingDescriptor(cudnnPoolingDescriptor_t *poolingDesc)
 {
 #ifdef WITH_API_CNT
@@ -794,7 +886,9 @@ cudnnStatus_t cudnnGetPoolingNdDescriptor(const cudnnPoolingDescriptor_t pooling
 #ifdef WITH_API_CNT
     api_call_cnt++;
 #endif //WITH_API_CNT
+    size_t expected_size = nbDimsRequested * sizeof(int) * 3 + sizeof(int) + sizeof(cudnnPoolingMode_t) + sizeof(cudnnNanPropagation_t);
     mem_result result;
+    result.mem_result_u.data.mem_data_val = (char*)malloc(expected_size);
     enum clnt_stat retval_1;
     if (mode == NULL || maxpoolingNanOpt == NULL || nbDims == NULL || windowDimA == NULL || paddingA == NULL || strideA == NULL) {
         LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__);
@@ -807,7 +901,6 @@ cudnnStatus_t cudnnGetPoolingNdDescriptor(const cudnnPoolingDescriptor_t pooling
     if (retval_1 != RPC_SUCCESS) {
         LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
     }
-    size_t expected_size = nbDimsRequested * sizeof(int) * 3 + sizeof(int) + sizeof(cudnnPoolingMode_t) + sizeof(cudnnNanPropagation_t);
     if (result.err != CUDNN_STATUS_SUCCESS || result.mem_result_u.data.mem_data_len != expected_size) {
         LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err);
     } else {
@@ -824,6 +917,7 @@ cudnnStatus_t cudnnGetPoolingNdDescriptor(const cudnnPoolingDescriptor_t pooling
         offset += *nbDims * sizeof(int);
         memcpy(strideA, result.mem_result_u.data.mem_data_val+offset, *nbDims * sizeof(int));
     }
+    free(result.mem_result_u.data.mem_data_val);
     return result.err;
 }
 
@@ -833,6 +927,7 @@ cudnnStatus_t cudnnGetPoolingNdForwardOutputDim(const cudnnPoolingDescriptor_t p
     api_call_cnt++;
 #endif //WITH_API_CNT
     mem_result result;
+    result.mem_result_u.data.mem_data_val = (char*)outputTensorDimA;
     enum clnt_stat retval_1;
     if (outputTensorDimA == NULL) {
         LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__);
@@ -849,8 +944,6 @@ cudnnStatus_t cudnnGetPoolingNdForwardOutputDim(const cudnnPoolingDescriptor_t p
     size_t expected_size = nbDims * sizeof(int);
     if (result.err != CUDNN_STATUS_SUCCESS || result.mem_result_u.data.mem_data_len != expected_size) {
         LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err);
-    } else {
-        memcpy(outputTensorDimA, result.mem_result_u.data.mem_data_val, nbDims * sizeof(int));
     }
     return result.err;
 }
@@ -904,7 +997,34 @@ cudnnStatus_t cudnnDestroyPoolingDescriptor(cudnnPoolingDescriptor_t poolingDesc
     return result;
 }
 
-DEF_FN(cudnnStatus_t, cudnnPoolingForward, cudnnHandle_t, handle, const cudnnPoolingDescriptor_t, poolingDesc, const void *, alpha, const cudnnTensorDescriptor_t, xDesc, const void *, x, const void *, beta, const cudnnTensorDescriptor_t, yDesc, void *, y)
+/* RPC client stub for cudnnPoolingForward: handle, descriptors and data pointers
+ * are cast to ptr; alpha/beta are wrapped in cudnn_scaling_t tagged as double.
+ * Returns the server status, or CUDNN_STATUS_INTERNAL_ERROR if the RPC itself fails. */
+cudnnStatus_t cudnnPoolingForward(cudnnHandle_t handle, const cudnnPoolingDescriptor_t poolingDesc, const void * alpha, const cudnnTensorDescriptor_t xDesc, const void * x, const void * beta, const cudnnTensorDescriptor_t yDesc, void * y)
+{
+#ifdef WITH_API_CNT
+    api_call_cnt++;
+#endif //WITH_API_CNT
+    int result = CUDNN_STATUS_INTERNAL_ERROR; /* pre-set: never read uninitialized if the RPC fails */
+    enum clnt_stat retval_1;
+    //TODO: Check if we have a float instead of always sending doubles; until then
+    //      a caller that passes float scalars has them misread by the double cast.
+    cudnn_scaling_t rpc_alpha = {.dataType = CUDNN_DATA_DOUBLE, .cudnn_scaling_t_u.d = *((double*)alpha)};
+    cudnn_scaling_t rpc_beta = {.dataType = CUDNN_DATA_DOUBLE, .cudnn_scaling_t_u.d = *((double*)beta)};
+    retval_1 = rpc_cudnnpoolingforward_1(
+        (ptr)handle,
+        (ptr)poolingDesc,
+        rpc_alpha, (ptr)xDesc, (ptr)x,
+        rpc_beta, (ptr)yDesc, (ptr)y,
+        &result, clnt);
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
+    }
+    if (result != CUDNN_STATUS_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result);
+    }
+    return result;
+}
 
 cudnnStatus_t cudnnCreateActivationDescriptor(cudnnActivationDescriptor_t * activationDesc)
 {
@@ -1047,7 +1167,34 @@ cudnnStatus_t cudnnDestroyActivationDescriptor(cudnnActivationDescriptor_t activ
     return result;
 }
 
-DEF_FN(cudnnStatus_t, cudnnActivationForward, cudnnHandle_t, handle, cudnnActivationDescriptor_t, activationDesc, const void *, alpha, const cudnnTensorDescriptor_t, xDesc, const void *, x, const void *, beta, const cudnnTensorDescriptor_t, yDesc, void *, y)
+/* RPC client stub for cudnnActivationForward: handle, descriptors and data pointers
+ * are cast to ptr; alpha/beta are wrapped in cudnn_scaling_t tagged as double.
+ * Returns the server status, or CUDNN_STATUS_INTERNAL_ERROR if the RPC itself fails. */
+cudnnStatus_t cudnnActivationForward(cudnnHandle_t handle, cudnnActivationDescriptor_t activationDesc, const void * alpha, const cudnnTensorDescriptor_t xDesc, const void * x, const void * beta, const cudnnTensorDescriptor_t yDesc, void * y)
+{
+#ifdef WITH_API_CNT
+    api_call_cnt++;
+#endif //WITH_API_CNT
+    int result = CUDNN_STATUS_INTERNAL_ERROR; /* pre-set: never read uninitialized if the RPC fails */
+    enum clnt_stat retval_1;
+    //TODO: Check if we have a float instead of always sending doubles; until then
+    //      a caller that passes float scalars has them misread by the double cast.
+    cudnn_scaling_t rpc_alpha = {.dataType = CUDNN_DATA_DOUBLE, .cudnn_scaling_t_u.d = *((double*)alpha)};
+    cudnn_scaling_t rpc_beta = {.dataType = CUDNN_DATA_DOUBLE, .cudnn_scaling_t_u.d = *((double*)beta)};
+    retval_1 = rpc_cudnnactivationforward_1(
+        (ptr)handle,
+        (ptr)activationDesc,
+        rpc_alpha, (ptr)xDesc, (ptr)x,
+        rpc_beta, (ptr)yDesc, (ptr)y,
+        &result, clnt);
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
+    }
+    if (result != CUDNN_STATUS_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result);
+    }
+    return result;
+}
 
 cudnnStatus_t cudnnCreateLRNDescriptor(cudnnLRNDescriptor_t * normDesc)
 {
@@ -1144,7 +1291,37 @@ cudnnStatus_t cudnnDestroyLRNDescriptor(cudnnLRNDescriptor_t lrnDesc)
     }
     return result;
 }
-DEF_FN(cudnnStatus_t, cudnnLRNCrossChannelForward, cudnnHandle_t, handle, cudnnLRNDescriptor_t, normDesc, cudnnLRNMode_t, lrnMode, const void *, alpha, const cudnnTensorDescriptor_t, xDesc, const void *, x, const void *, beta, const cudnnTensorDescriptor_t, yDesc, void *, y)
+
+/* RPC client stub for cudnnLRNCrossChannelForward: lrnMode is sent as int, handle,
+ * descriptors and data pointers as ptr, alpha/beta as cudnn_scaling_t (double).
+ * Returns the server status, or CUDNN_STATUS_INTERNAL_ERROR if the RPC itself fails. */
+cudnnStatus_t cudnnLRNCrossChannelForward(cudnnHandle_t handle, cudnnLRNDescriptor_t normDesc, cudnnLRNMode_t lrnMode, const void * alpha, const cudnnTensorDescriptor_t xDesc, const void * x, const void * beta, const cudnnTensorDescriptor_t yDesc, void * y)
+{
+#ifdef WITH_API_CNT
+    api_call_cnt++;
+#endif //WITH_API_CNT
+    int result = CUDNN_STATUS_INTERNAL_ERROR; /* pre-set: never read uninitialized if the RPC fails */
+    enum clnt_stat retval_1;
+    //TODO: Check if we have a float instead of always sending doubles; until then
+    //      a caller that passes float scalars has them misread by the double cast.
+    cudnn_scaling_t rpc_alpha = {.dataType = CUDNN_DATA_DOUBLE, .cudnn_scaling_t_u.d = *((double*)alpha)};
+    cudnn_scaling_t rpc_beta = {.dataType = CUDNN_DATA_DOUBLE, .cudnn_scaling_t_u.d = *((double*)beta)};
+    retval_1 = rpc_cudnnlrncrosschannelforward_1(
+        (ptr)handle,
+        (ptr)normDesc,
+        (int)lrnMode,
+        rpc_alpha, (ptr)xDesc, (ptr)x,
+        rpc_beta, (ptr)yDesc, (ptr)y,
+        &result, clnt);
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
+    }
+    if (result != CUDNN_STATUS_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result);
+    }
+    return result;
+}
+
 DEF_FN(cudnnStatus_t, cudnnDivisiveNormalizationForward, cudnnHandle_t, handle, cudnnLRNDescriptor_t, normDesc, cudnnDivNormMode_t, mode, const void *, alpha, const cudnnTensorDescriptor_t, xDesc, const void *, x, const void *, means, void *, temp, void *, temp2, const void *, beta, const cudnnTensorDescriptor_t, yDesc, void *, y)
 DEF_FN(cudnnStatus_t, cudnnDeriveBNTensorDescriptor, cudnnTensorDescriptor_t, derivedBnDesc, const cudnnTensorDescriptor_t, xDesc, cudnnBatchNormMode_t, mode)
 DEF_FN(cudnnStatus_t, cudnnBatchNormalizationForwardInference, cudnnHandle_t, handle, cudnnBatchNormMode_t, mode, const void *, alpha, const void *, beta, const cudnnTensorDescriptor_t, xDesc, const void *, x, const cudnnTensorDescriptor_t, yDesc, void *, y, const cudnnTensorDescriptor_t,  bnScaleBiasMeanVarDesc, const void *, bnScale, const void *, bnBias, const void *, estimatedMean, const void *, estimatedVariance, double, epsilon)
@@ -1177,4 +1354,283 @@ DEF_FN(cudnnStatus_t, cudnnSaveAlgorithm, cudnnHandle_t, handle, cudnnAlgorithmD
 DEF_FN(cudnnStatus_t, cudnnRestoreAlgorithm, cudnnHandle_t, handle, void *, algoSpace, size_t, algoSpaceSizeInBytes, cudnnAlgorithmDescriptor_t, algoDesc)
 DEF_FN(cudnnStatus_t, cudnnSetCallback, unsigned, mask, void *, udata, cudnnCallback_t, fptr)
 DEF_FN(cudnnStatus_t, cudnnGetCallback, unsigned *, mask, void **, udata, cudnnCallback_t *, fptr)
-DEF_FN(cudnnStatus_t, cudnnOpsInferVersionCheck)
\ No newline at end of file
+DEF_FN(cudnnStatus_t, cudnnOpsInferVersionCheck)
+
+
+/***************** cudnn_cnn_infer *******************/
+
+/* RPC client stub: requests a convolution descriptor from the server and, on success, stores the returned handle in *convDesc. */
+cudnnStatus_t cudnnCreateConvolutionDescriptor(cudnnConvolutionDescriptor_t* convDesc)
+{
+#ifdef WITH_API_CNT
+    api_call_cnt++;
+#endif //WITH_API_CNT
+    ptr_result result = { .err = CUDNN_STATUS_INTERNAL_ERROR }; /* pre-set: err is never read uninitialized if the RPC fails */
+    if (convDesc == NULL) {
+        LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__);
+        return CUDNN_STATUS_BAD_PARAM;
+    }
+    enum clnt_stat retval_1 = rpc_cudnncreateconvolutiondescriptor_1(&result, clnt);
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
+    }
+    if (result.err != CUDNN_STATUS_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err);
+    } else {
+        *convDesc = (cudnnConvolutionDescriptor_t)result.ptr_result_u.ptr;
+    }
+    return result.err;
+}
+    
+/* RPC client stub: asks the server to destroy the convolution descriptor
+ * identified by convDesc (sent as ptr). Returns the server status, or
+ * CUDNN_STATUS_INTERNAL_ERROR when the RPC itself fails. */
+cudnnStatus_t cudnnDestroyConvolutionDescriptor(cudnnConvolutionDescriptor_t convDesc)
+{
+#ifdef WITH_API_CNT
+    api_call_cnt++;
+#endif //WITH_API_CNT
+    int result = CUDNN_STATUS_INTERNAL_ERROR; /* pre-set: never read uninitialized if the RPC fails */
+    enum clnt_stat retval_1;
+    retval_1 = rpc_cudnndestroyconvolutiondescriptor_1((ptr)convDesc, &result, clnt);
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
+    }
+    if (result != CUDNN_STATUS_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result);
+    }
+    return result;
+}
+DEF_FN(cudnnStatus_t, cudnnSetConvolutionMathType,  cudnnConvolutionDescriptor_t, convDesc,  cudnnMathType_t, mathType)
+DEF_FN(cudnnStatus_t, cudnnGetConvolutionMathType,  cudnnConvolutionDescriptor_t, convDesc,  cudnnMathType_t*, mathType)
+DEF_FN(cudnnStatus_t, cudnnSetConvolutionGroupCount,  cudnnConvolutionDescriptor_t, convDesc,  int, groupCount)
+DEF_FN(cudnnStatus_t, cudnnGetConvolutionGroupCount,  cudnnConvolutionDescriptor_t, convDesc,  int*, groupCount)
+DEF_FN(cudnnStatus_t, cudnnSetConvolutionReorderType,  cudnnConvolutionDescriptor_t, convDesc,  cudnnReorderType_t, reorderType)
+DEF_FN(cudnnStatus_t, cudnnGetConvolutionReorderType,  cudnnConvolutionDescriptor_t, convDesc,  cudnnReorderType_t*, reorderType)
+DEF_FN(cudnnStatus_t, cudnnSetConvolution2dDescriptor,  cudnnConvolutionDescriptor_t, convDesc,  int, pad_h,  int, pad_w,  int, u, int, v, int, dilation_h,  int, dilation_w,  cudnnConvolutionMode_t, mode,  cudnnDataType_t, computeType)
+DEF_FN(cudnnStatus_t, cudnnGetConvolution2dDescriptor,  const cudnnConvolutionDescriptor_t, convDesc,  int*, pad_h,  int*, pad_w,  int*, u,  int*, v,  int*, dilation_h,  int*, dilation_w,  cudnnConvolutionMode_t*, mode,  cudnnDataType_t*, computeType)
+    
+/* RPC client stub for cudnnSetConvolutionNdDescriptor: padA, filterStrideA and
+ * dilationA are each shipped as a byte buffer of arrayLength ints. Returns
+ * CUDNN_STATUS_BAD_PARAM for NULL arrays or a negative length, else the server status. */
+cudnnStatus_t cudnnSetConvolutionNdDescriptor(cudnnConvolutionDescriptor_t convDesc,  int arrayLength,  const int* padA,  const int* filterStrideA,  const int* dilationA,  cudnnConvolutionMode_t mode,  cudnnDataType_t computeType)
+{
+#ifdef WITH_API_CNT
+    api_call_cnt++;
+#endif //WITH_API_CNT
+    int result = CUDNN_STATUS_INTERNAL_ERROR; /* pre-set: never read uninitialized if the RPC fails */
+    enum clnt_stat retval_1;
+    if (padA == NULL || filterStrideA == NULL || dilationA == NULL || arrayLength < 0) {
+        LOGE(LOG_ERROR, "%s failed (invalid argument)", __FUNCTION__);
+        return CUDNN_STATUS_BAD_PARAM;
+    }
+    /* Locals are named after the array they wrap; the casts drop const only
+     * because mem_data_val is a plain char*. */
+    mem_data rpc_padA = {.mem_data_len = arrayLength * sizeof(int), .mem_data_val = (char*)padA};
+    mem_data rpc_filterStrideA = {.mem_data_len = arrayLength * sizeof(int), .mem_data_val = (char*)filterStrideA};
+    mem_data rpc_dilationA = {.mem_data_len = arrayLength * sizeof(int), .mem_data_val = (char*)dilationA};
+    retval_1 = rpc_cudnnsetconvolutionnddescriptor_1(
+        (ptr)convDesc,
+        arrayLength,
+        rpc_padA,
+        rpc_filterStrideA,
+        rpc_dilationA,
+        mode,
+        computeType,
+        &result, clnt);
+
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
+    }
+    if (result != CUDNN_STATUS_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result);
+    }
+    return result;
+}
+
+DEF_FN(cudnnStatus_t, cudnnGetConvolutionNdDescriptor,  const cudnnConvolutionDescriptor_t, convDesc,  int, arrayLengthRequested,  int*, arrayLength,  int*, padA,  int*, strideA,  int*, dilationA,  cudnnConvolutionMode_t*, mode,  cudnnDataType_t*, computeType)
+DEF_FN(cudnnStatus_t, cudnnGetConvolution2dForwardOutputDim,  const cudnnConvolutionDescriptor_t, convDesc,  const cudnnTensorDescriptor_t, inputTensorDesc,  const cudnnFilterDescriptor_t, filterDesc,  int*, n,  int*, c,  int*, h,  int*, w)
+
+/* RPC client stub: queries the Nd convolution output dimensions. The reply buffer
+ * is pointed at tensorOutputDimA, presumably so the reply is decoded straight into it
+ * (NOTE(review): a reply longer than nbDims ints would overrun that array - confirm). */
+cudnnStatus_t cudnnGetConvolutionNdForwardOutputDim(const cudnnConvolutionDescriptor_t convDesc,  const cudnnTensorDescriptor_t inputTensorDesc,  const cudnnFilterDescriptor_t filterDesc,  int nbDims,  int* tensorOutputDimA)
+{
+#ifdef WITH_API_CNT
+    api_call_cnt++;
+#endif //WITH_API_CNT
+    mem_result result = { .err = CUDNN_STATUS_INTERNAL_ERROR }; /* pre-set: err is never read uninitialized if the RPC fails */
+    result.mem_result_u.data.mem_data_val = (char*)tensorOutputDimA;
+    enum clnt_stat retval_1;
+    if (tensorOutputDimA == NULL) {
+        LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__);
+        return CUDNN_STATUS_BAD_PARAM;
+    }
+    retval_1 = rpc_cudnngetconvolutionndforwardoutputdim_1(
+        (ptr)convDesc, (ptr)inputTensorDesc, (ptr)filterDesc, nbDims,
+        &result, clnt);
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
+    }
+    size_t expected_size = nbDims * sizeof(int);
+    if (result.err != CUDNN_STATUS_SUCCESS || result.mem_result_u.data.mem_data_len != expected_size) {
+        LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err);
+    }
+    return result.err;
+}
+
+DEF_FN(cudnnStatus_t, cudnnGetConvolutionForwardAlgorithmMaxCount,  cudnnHandle_t, handle,  int*, count)
+
+/* RPC client stub: the reply is an int count followed by up to requestedAlgoCount
+ * cudnnConvolutionFwdAlgoPerf_t records, copied into *returnedAlgoCount/perfResults.
+ * The reply buffer is allocated after validation and freed on every later path. */
+cudnnStatus_t cudnnGetConvolutionForwardAlgorithm_v7(cudnnHandle_t handle,  const cudnnTensorDescriptor_t srcDesc,  const cudnnFilterDescriptor_t filterDesc,  const cudnnConvolutionDescriptor_t convDesc,  const cudnnTensorDescriptor_t destDesc,  const int requestedAlgoCount,  int* returnedAlgoCount,  cudnnConvolutionFwdAlgoPerf_t* perfResults)
+{
+#ifdef WITH_API_CNT
+    api_call_cnt++;
+#endif //WITH_API_CNT
+    size_t expected_size = requestedAlgoCount * sizeof(cudnnConvolutionFwdAlgoPerf_t) + sizeof(int);
+    mem_result result = { .err = CUDNN_STATUS_INTERNAL_ERROR }; /* pre-set: err is never read uninitialized if the RPC fails */
+    enum clnt_stat retval_1;
+    if (returnedAlgoCount == NULL || perfResults == NULL) {
+        LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__);
+        return CUDNN_STATUS_BAD_PARAM;
+    }
+    /* Allocate only after validation so the early return above cannot leak. */
+    result.mem_result_u.data.mem_data_val = (char*)malloc(expected_size);
+    retval_1 = rpc_cudnngetconvolutionforwardalgorithm_v7_1(
+        (ptr)handle, (ptr)srcDesc, (ptr)filterDesc, (ptr)convDesc, (ptr)destDesc,
+        requestedAlgoCount, &result, clnt);
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
+    }
+    if (result.err != CUDNN_STATUS_SUCCESS || result.mem_result_u.data.mem_data_len != expected_size) {
+        LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err);
+    } else {
+        *returnedAlgoCount = *(int*)result.mem_result_u.data.mem_data_val;
+        if (*returnedAlgoCount < 0 || *returnedAlgoCount > requestedAlgoCount) {
+            LOGE(LOG_ERROR, "%s failed (returnedAlgoCount is %d, requestedAlgoCount is %d)", __FUNCTION__, *returnedAlgoCount, requestedAlgoCount);
+            result.err = CUDNN_STATUS_INTERNAL_ERROR; /* fall through to free() instead of returning early */
+        } else {
+            memcpy(perfResults, result.mem_result_u.data.mem_data_val + sizeof(int), *returnedAlgoCount * sizeof(cudnnConvolutionFwdAlgoPerf_t));
+        }
+    }
+    free(result.mem_result_u.data.mem_data_val);
+    return result.err;
+}
+
+/* RPC client stub: the reply is an int count followed by up to requestedAlgoCount
+ * cudnnConvolutionFwdAlgoPerf_t records, copied into *returnedAlgoCount/perfResults.
+ * The reply buffer is allocated after validation and freed on every later path. */
+cudnnStatus_t cudnnFindConvolutionForwardAlgorithm( cudnnHandle_t handle,  const cudnnTensorDescriptor_t xDesc,  const cudnnFilterDescriptor_t wDesc,  const cudnnConvolutionDescriptor_t convDesc,  const cudnnTensorDescriptor_t yDesc,  const int requestedAlgoCount,  int* returnedAlgoCount,  cudnnConvolutionFwdAlgoPerf_t* perfResults)
+{
+#ifdef WITH_API_CNT
+    api_call_cnt++;
+#endif //WITH_API_CNT
+    size_t expected_size = requestedAlgoCount * sizeof(cudnnConvolutionFwdAlgoPerf_t) + sizeof(int);
+    mem_result result = { .err = CUDNN_STATUS_INTERNAL_ERROR }; /* pre-set: err is never read uninitialized if the RPC fails */
+    enum clnt_stat retval_1;
+    if (returnedAlgoCount == NULL || perfResults == NULL) {
+        LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__);
+        return CUDNN_STATUS_BAD_PARAM;
+    }
+    /* Allocate only after validation so the early return above cannot leak. */
+    result.mem_result_u.data.mem_data_val = (char*)malloc(expected_size);
+    retval_1 = rpc_cudnnfindconvolutionforwardalgorithm_1(
+        (ptr)handle, (ptr)xDesc, (ptr)wDesc, (ptr)convDesc, (ptr)yDesc,
+        requestedAlgoCount, &result, clnt);
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
+    }
+    if (result.err != CUDNN_STATUS_SUCCESS || result.mem_result_u.data.mem_data_len != expected_size) {
+        LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err);
+    } else {
+        *returnedAlgoCount = *(int*)result.mem_result_u.data.mem_data_val;
+        if (*returnedAlgoCount < 0 || *returnedAlgoCount > requestedAlgoCount) {
+            LOGE(LOG_ERROR, "%s failed (returnedAlgoCount is %d, requestedAlgoCount is %d)", __FUNCTION__, *returnedAlgoCount, requestedAlgoCount);
+            result.err = CUDNN_STATUS_INTERNAL_ERROR; /* fall through to free() instead of returning early */
+        } else {
+            memcpy(perfResults, result.mem_result_u.data.mem_data_val + sizeof(int), *returnedAlgoCount * sizeof(cudnnConvolutionFwdAlgoPerf_t));
+        }
+    }
+    free(result.mem_result_u.data.mem_data_val);
+    return result.err;
+}
+    
+DEF_FN(cudnnStatus_t, cudnnFindConvolutionForwardAlgorithmEx,  cudnnHandle_t, handle,  const cudnnTensorDescriptor_t, xDesc,  const void*, x,  const cudnnFilterDescriptor_t, wDesc,  const void*, w,  const cudnnConvolutionDescriptor_t, convDesc,  const cudnnTensorDescriptor_t, yDesc,  void*, y,  const int, requestedAlgoCount,  int*, returnedAlgoCount,  cudnnConvolutionFwdAlgoPerf_t*, perfResults,  void*, workSpace,  size_t, workSpaceSizeInBytes)
+DEF_FN(cudnnStatus_t, cudnnIm2Col,  cudnnHandle_t, handle,  const cudnnTensorDescriptor_t, xDesc,  const void*, x,  const cudnnFilterDescriptor_t, wDesc,  const cudnnConvolutionDescriptor_t, convDesc,  void*, colBuffer)
+DEF_FN(cudnnStatus_t, cudnnReorderFilterAndBias,  cudnnHandle_t, handle,  const cudnnFilterDescriptor_t, filterDesc,  cudnnReorderType_t, reorderType,  const void*, filterData,  void*, reorderedFilterData,  int, reorderBias,  const void*, biasData,  void*, reorderedBiasData)
+
+/* RPC client stub: asks the server for the workspace size required by algo and,
+ * on success, stores it in *sizeInBytes. Returns CUDNN_STATUS_BAD_PARAM for a NULL
+ * output pointer and CUDNN_STATUS_INTERNAL_ERROR when the RPC itself fails. */
+cudnnStatus_t cudnnGetConvolutionForwardWorkspaceSize( cudnnHandle_t handle,  const cudnnTensorDescriptor_t xDesc,  const cudnnFilterDescriptor_t wDesc,  const cudnnConvolutionDescriptor_t convDesc,  const cudnnTensorDescriptor_t yDesc,  cudnnConvolutionFwdAlgo_t algo,  size_t* sizeInBytes)
+{
+#ifdef WITH_API_CNT
+    api_call_cnt++;
+#endif //WITH_API_CNT
+    sz_result result = { .err = CUDNN_STATUS_INTERNAL_ERROR }; /* pre-set: err is never read uninitialized if the RPC fails */
+    enum clnt_stat retval_1;
+    if (sizeInBytes == NULL) {
+        LOGE(LOG_ERROR, "%s failed (value is NULL)", __FUNCTION__);
+        return CUDNN_STATUS_BAD_PARAM;
+    }
+    retval_1 = rpc_cudnngetconvolutionforwardworkspacesize_1(
+        (ptr)handle,
+        (ptr)xDesc, (ptr)wDesc, (ptr)convDesc, (ptr)yDesc,
+        algo,
+        &result, clnt);
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
+    }
+    if (result.err != CUDNN_STATUS_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err);
+    } else {
+        *sizeInBytes = result.sz_result_u.data;
+    }
+    return result.err;
+}
+
+/* RPC client stub for cudnnConvolutionForward: handle, descriptors, data and
+ * workspace pointers are cast to ptr; alpha/beta are sent as cudnn_scaling_t (double).
+ * Returns the server status, or CUDNN_STATUS_INTERNAL_ERROR if the RPC itself fails. */
+cudnnStatus_t cudnnConvolutionForward(cudnnHandle_t handle,  const void* alpha,  const cudnnTensorDescriptor_t xDesc,  const void* x,  const cudnnFilterDescriptor_t wDesc,  const void* w,  const cudnnConvolutionDescriptor_t convDesc,  cudnnConvolutionFwdAlgo_t algo,  void* workSpace,  size_t workSpaceSizeInBytes,  const void* beta,  const cudnnTensorDescriptor_t yDesc,  void* y)
+{
+#ifdef WITH_API_CNT
+    api_call_cnt++;
+#endif //WITH_API_CNT
+    int result = CUDNN_STATUS_INTERNAL_ERROR; /* pre-set: never read uninitialized if the RPC fails */
+    enum clnt_stat retval_1;
+    //TODO: Check if we have a float instead of always sending doubles; until then
+    //      a caller that passes float scalars has them misread by the double cast.
+    cudnn_scaling_t rpc_alpha = {.dataType = CUDNN_DATA_DOUBLE, .cudnn_scaling_t_u.d = *((double*)alpha)};
+    cudnn_scaling_t rpc_beta = {.dataType = CUDNN_DATA_DOUBLE, .cudnn_scaling_t_u.d = *((double*)beta)};
+    retval_1 = rpc_cudnnconvolutionforward_1(
+        (ptr)handle,
+        rpc_alpha,
+        (ptr)xDesc, (ptr)x,
+        (ptr)wDesc, (ptr)w,
+        (ptr)convDesc,
+        algo,
+        (ptr)workSpace, workSpaceSizeInBytes,
+        rpc_beta,
+        (ptr)yDesc, (ptr)y,
+        &result, clnt);
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
+    }
+    if (result != CUDNN_STATUS_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result);
+    }
+    return result;
+}
+
+DEF_FN(cudnnStatus_t, cudnnConvolutionBiasActivationForward,  cudnnHandle_t, handle,  const void*, alpha1,  const cudnnTensorDescriptor_t, xDesc,  const void*, x,  const cudnnFilterDescriptor_t, wDesc,  const void*, w,  const cudnnConvolutionDescriptor_t, convDesc,  cudnnConvolutionFwdAlgo_t, algo,  void*, workSpace,  size_t, workSpaceSizeInBytes,  const void*, alpha2,  const cudnnTensorDescriptor_t, zDesc,  const void*, z,  const cudnnTensorDescriptor_t, biasDesc,  const void*, bias,  const cudnnActivationDescriptor_t, activationDesc,  const cudnnTensorDescriptor_t, yDesc,  void*, y)
+DEF_FN(cudnnStatus_t, cudnnGetConvolutionBackwardDataAlgorithmMaxCount,  cudnnHandle_t, handle,  int*, count)
+DEF_FN(cudnnStatus_t, cudnnFindConvolutionBackwardDataAlgorithm,  cudnnHandle_t, handle,  const cudnnFilterDescriptor_t, wDesc,  const cudnnTensorDescriptor_t, dyDesc,  const cudnnConvolutionDescriptor_t, convDesc,  const cudnnTensorDescriptor_t, dxDesc,  const int, requestedAlgoCount,  int*, returnedAlgoCount,  cudnnConvolutionBwdDataAlgoPerf_t*, perfResults)
+DEF_FN(cudnnStatus_t, cudnnFindConvolutionBackwardDataAlgorithmEx,  cudnnHandle_t, handle,  const cudnnFilterDescriptor_t, wDesc,  const void*, w,  const cudnnTensorDescriptor_t, dyDesc,  const void*, dy,  const cudnnConvolutionDescriptor_t, convDesc,  const cudnnTensorDescriptor_t, dxDesc,  void*, dx,  const int, requestedAlgoCount,  int*, returnedAlgoCount,  cudnnConvolutionBwdDataAlgoPerf_t*, perfResults,  void*, workSpace,  size_t, workSpaceSizeInBytes)
+DEF_FN(cudnnStatus_t, cudnnGetConvolutionBackwardDataAlgorithm_v7,  cudnnHandle_t, handle,  const cudnnFilterDescriptor_t, filterDesc,  const cudnnTensorDescriptor_t, diffDesc,  const cudnnConvolutionDescriptor_t, convDesc,  const cudnnTensorDescriptor_t, gradDesc,  const int, requestedAlgoCount,  int*, returnedAlgoCount,  cudnnConvolutionBwdDataAlgoPerf_t*, perfResults)
+DEF_FN(cudnnStatus_t, cudnnGetConvolutionBackwardDataWorkspaceSize,  cudnnHandle_t, handle,  const cudnnFilterDescriptor_t, wDesc,  const cudnnTensorDescriptor_t, dyDesc,  const cudnnConvolutionDescriptor_t, convDesc,  const cudnnTensorDescriptor_t, dxDesc,  cudnnConvolutionBwdDataAlgo_t, algo,  size_t*, sizeInBytes)
+DEF_FN(cudnnStatus_t, cudnnConvolutionBackwardData,  cudnnHandle_t, handle,  const void*, alpha,  const cudnnFilterDescriptor_t, wDesc,  const void*, w,  const cudnnTensorDescriptor_t, dyDesc,  const void*, dy,  const cudnnConvolutionDescriptor_t, convDesc,  cudnnConvolutionBwdDataAlgo_t, algo,  void*, workSpace,  size_t, workSpaceSizeInBytes,  const void*, beta,  const cudnnTensorDescriptor_t, dxDesc,  void*, dx)
+DEF_FN(cudnnStatus_t, cudnnGetFoldedConvBackwardDataDescriptors,  const cudnnHandle_t, handle,  const cudnnFilterDescriptor_t, filterDesc,  const cudnnTensorDescriptor_t, diffDesc,  const cudnnConvolutionDescriptor_t, convDesc,  const cudnnTensorDescriptor_t, gradDesc,  const cudnnTensorFormat_t, transformFormat,  cudnnFilterDescriptor_t, foldedFilterDesc,  cudnnTensorDescriptor_t, paddedDiffDesc,  cudnnConvolutionDescriptor_t, foldedConvDesc,  cudnnTensorDescriptor_t, foldedGradDesc,  cudnnTensorTransformDescriptor_t, filterFoldTransDesc,  cudnnTensorTransformDescriptor_t, diffPadTransDesc,  cudnnTensorTransformDescriptor_t, gradFoldTransDesc,  cudnnTensorTransformDescriptor_t, gradUnfoldTransDesc)
+DEF_FN(cudnnStatus_t, cudnnCnnInferVersionCheck)
diff --git a/cpu/cpu-server-cublas.c b/cpu/cpu-server-cublas.c
index c0d8104d..e93f5036 100644
--- a/cpu/cpu-server-cublas.c
+++ b/cpu/cpu-server-cublas.c
@@ -203,4 +203,43 @@ bool_t rpc_cublasdgemv_1_svc(ptr handle, int trans, int m,
     GSCHED_RELEASE;
     RECORD_RESULT(integer, *result);
     return 1;
+}
+
+bool_t rpc_cublassgemmex_1_svc(ptr handle, int transa, int transb, int m, int n, int k, float alpha,
+            ptr A, int Atype, int lda,
+            ptr B, int Btype, int ldb, float beta,
+            ptr C, int Ctype, int ldc,
+            int *result, struct svc_req *rqstp)
+{   /* Server side of rpc_cublasSgemmEx: records the arguments, then runs cublasSgemmEx and stores its status in *result. */
+    RECORD_API(rpc_cublassgemmex_1_argument);
+    RECORD_ARG(1, handle);
+    RECORD_ARG(2, transa);
+    RECORD_ARG(3, transb);
+    RECORD_ARG(4, m);
+    RECORD_ARG(5, n);
+    RECORD_ARG(6, k);
+    RECORD_ARG(7, alpha);
+    RECORD_ARG(8, A);
+    RECORD_ARG(9, Atype);
+    RECORD_ARG(10, lda);
+    RECORD_ARG(11, B);
+    RECORD_ARG(12, Btype);
+    RECORD_ARG(13, ldb);
+    RECORD_ARG(14, beta);
+    RECORD_ARG(15, C);
+    RECORD_ARG(16, Ctype);
+    RECORD_ARG(17, ldc);
+    LOGE(LOG_DEBUG, "cublasSgemmEx");
+    GSCHED_RETAIN;
+    *result = cublasSgemmEx(resource_mg_get(&rm_cublas, (void*)handle), /* handle looked up in rm_cublas */
+                    (cublasOperation_t) transa,
+                    (cublasOperation_t) transb,
+                    m, n, k, &alpha, /* alpha/beta arrive by value and are passed by address */
+                    resource_mg_get(&rm_memory, (void*)A), (cudaDataType_t)Atype, lda, /* A, B, C looked up in rm_memory */
+                    resource_mg_get(&rm_memory, (void*)B), (cudaDataType_t)Btype, ldb, &beta,
+                    resource_mg_get(&rm_memory, (void*)C), (cudaDataType_t)Ctype, ldc
+    );
+    GSCHED_RELEASE;
+    RECORD_RESULT(integer, *result);
+    return 1;
 }
\ No newline at end of file
diff --git a/cpu/cpu-server-cudnn.c b/cpu/cpu-server-cudnn.c
index 228e9920..16a11b06 100644
--- a/cpu/cpu-server-cudnn.c
+++ b/cpu/cpu-server-cudnn.c
@@ -1,7 +1,6 @@
 
 #include <stdio.h>
 #include <stdlib.h>
-#include <cuda.h>
 #include <cudnn.h>
 
 #include "cpu_rpc_prot.h"
@@ -27,6 +26,7 @@ int server_cudnn_init(int bypass)
     ret &= resource_mg_init(&rm_cudnn_poolings, bypass);
     ret &= resource_mg_init(&rm_cudnn_activations, bypass);
     ret &= resource_mg_init(&rm_cudnn_lrns, bypass);
+    ret &= resource_mg_init(&rm_cudnn_convs, bypass);
     return ret;
 }
 
@@ -38,6 +38,7 @@ int server_cudnn_deinit(void)
     resource_mg_free(&rm_cudnn_poolings);
     resource_mg_free(&rm_cudnn_activations);
     resource_mg_free(&rm_cudnn_lrns);
+    resource_mg_free(&rm_cudnn_convs);
     return 0;
 
 }
@@ -664,7 +665,7 @@ bool_t rpc_cudnngetpoolingndforwardoutputdim_1_svc(ptr poolingDesc, ptr inputTen
         (cudnnPoolingDescriptor_t)resource_mg_get(&rm_cudnn_poolings, (void*)poolingDesc),
         (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)inputTensorDesc),
         nbDims,
-        (int*)&result->mem_result_u.data);
+        (int*)result->mem_result_u.data.mem_data_val);
     GSCHED_RELEASE;
     return 1;
 }
@@ -859,4 +860,354 @@ bool_t rpc_cudnndestroylrndescriptor_1_svc(ptr lrnDesc, int *result, struct svc_
     GSCHED_RELEASE;
     RECORD_RESULT(integer, *result);
     return 1;
+}
+
+bool_t rpc_cudnnpoolingforward_1_svc(ptr handle, ptr poolingDesc,           cudnn_scaling_t alpha, ptr xDesc, ptr x, cudnn_scaling_t beta, ptr yDesc, ptr y, int *result, struct svc_req *rqstp)
+{
+    RECORD_API(rpc_cudnnpoolingforward_1_argument);
+    RECORD_NARG(handle);
+    RECORD_NARG(poolingDesc);
+    RECORD_NARG(alpha);
+    RECORD_NARG(xDesc);
+    RECORD_NARG(x);
+    RECORD_NARG(beta);
+    RECORD_NARG(yDesc);
+    RECORD_NARG(y);
+    /* Runs cudnnPoolingForward; alpha/beta are tagged unions: the double member is passed for CUDNN_DATA_DOUBLE, the float member otherwise. */
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+    GSCHED_RETAIN;
+    *result = cudnnPoolingForward(
+        (cudnnHandle_t)resource_mg_get(&rm_cudnn, (void*)handle),
+        (cudnnPoolingDescriptor_t)resource_mg_get(&rm_cudnn_poolings, (void*)poolingDesc),
+        (alpha.dataType == CUDNN_DATA_DOUBLE ? (const void*)&alpha.cudnn_scaling_t_u.d : (const void*)&alpha.cudnn_scaling_t_u.f),
+        (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)xDesc),
+        (const void*)resource_mg_get(&rm_memory, (void*)x),
+        (beta.dataType == CUDNN_DATA_DOUBLE ? (const void*)&beta.cudnn_scaling_t_u.d : (const void*)&beta.cudnn_scaling_t_u.f),
+        (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)yDesc),
+        (void*)resource_mg_get(&rm_memory, (void*)y));
+    GSCHED_RELEASE;
+    RECORD_RESULT(integer, *result);
+    return 1;
+}
+
+bool_t rpc_cudnnactivationforward_1_svc(ptr handle, ptr activationDesc, cudnn_scaling_t alpha, ptr xDesc, ptr x, cudnn_scaling_t beta, ptr yDesc, ptr y, int *result, struct svc_req *rqstp)
+{
+    RECORD_API(rpc_cudnnactivationforward_1_argument);
+    RECORD_NARG(handle);
+    RECORD_NARG(activationDesc);
+    RECORD_NARG(alpha);
+    RECORD_NARG(xDesc);
+    RECORD_NARG(x);
+    RECORD_NARG(beta);
+    RECORD_NARG(yDesc);
+    RECORD_NARG(y);
+    /* Runs cudnnActivationForward; alpha/beta are tagged unions: the double member is passed for CUDNN_DATA_DOUBLE, the float member otherwise. */
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+    GSCHED_RETAIN;
+    *result = cudnnActivationForward(
+        (cudnnHandle_t)resource_mg_get(&rm_cudnn, (void*)handle),
+        (cudnnActivationDescriptor_t)resource_mg_get(&rm_cudnn_activations, (void*)activationDesc),
+        (alpha.dataType == CUDNN_DATA_DOUBLE ? (const void*)&alpha.cudnn_scaling_t_u.d : (const void*)&alpha.cudnn_scaling_t_u.f),
+        (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)xDesc),
+        (const void*)resource_mg_get(&rm_memory, (void*)x),
+        (beta.dataType == CUDNN_DATA_DOUBLE ? (const void*)&beta.cudnn_scaling_t_u.d : (const void*)&beta.cudnn_scaling_t_u.f),
+        (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)yDesc),
+        (void*)resource_mg_get(&rm_memory, (void*)y));
+    GSCHED_RELEASE;
+    RECORD_RESULT(integer, *result);
+    return 1;
+}
+
+bool_t rpc_cudnnlrncrosschannelforward_1_svc(ptr handle, ptr normDesc, int lrnMode, cudnn_scaling_t alpha, ptr xDesc, ptr x, cudnn_scaling_t beta, ptr yDesc, ptr y, int *result, struct svc_req *rqstp)
+{
+    RECORD_API(rpc_cudnnlrncrosschannelforward_1_argument);
+    RECORD_NARG(handle);
+    RECORD_NARG(normDesc);
+    RECORD_NARG(lrnMode);
+    RECORD_NARG(alpha);
+    RECORD_NARG(xDesc);
+    RECORD_NARG(x);
+    RECORD_NARG(beta);
+    RECORD_NARG(yDesc);
+    RECORD_NARG(y);
+    /* Runs cudnnLRNCrossChannelForward; alpha/beta are tagged unions: the double member is passed for CUDNN_DATA_DOUBLE, the float member otherwise. */
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+    GSCHED_RETAIN;
+    *result = cudnnLRNCrossChannelForward(
+        (cudnnHandle_t)resource_mg_get(&rm_cudnn, (void*)handle),
+        (cudnnLRNDescriptor_t)resource_mg_get(&rm_cudnn_lrns, (void*)normDesc),
+        (cudnnLRNMode_t)lrnMode,
+        (alpha.dataType == CUDNN_DATA_DOUBLE ? (const void*)&alpha.cudnn_scaling_t_u.d : (const void*)&alpha.cudnn_scaling_t_u.f),
+        (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)xDesc),
+        (const void*)resource_mg_get(&rm_memory, (void*)x),
+        (beta.dataType == CUDNN_DATA_DOUBLE ? (const void*)&beta.cudnn_scaling_t_u.d : (const void*)&beta.cudnn_scaling_t_u.f),
+        (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)yDesc),
+        (void*)resource_mg_get(&rm_memory, (void*)y));
+    GSCHED_RELEASE;
+    RECORD_RESULT(integer, *result);
+    return 1;
+}
+
+bool_t rpc_cudnnsoftmaxforward_1_svc(ptr handle, int algo, int mode, cudnn_scaling_t alpha, ptr xDesc, ptr x, cudnn_scaling_t beta, ptr yDesc, ptr y, int *result, struct svc_req *rqstp)
+{
+    RECORD_API(rpc_cudnnsoftmaxforward_1_argument);
+    RECORD_NARG(handle);
+    RECORD_NARG(algo);
+    RECORD_NARG(mode);
+    RECORD_NARG(alpha);
+    RECORD_NARG(xDesc);
+    RECORD_NARG(x);
+    RECORD_NARG(beta);
+    RECORD_NARG(yDesc);
+    RECORD_NARG(y);
+    /* Runs cudnnSoftmaxForward; alpha/beta are tagged unions: the double member is passed for CUDNN_DATA_DOUBLE, the float member otherwise. */
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+    GSCHED_RETAIN;
+    *result = cudnnSoftmaxForward(
+        (cudnnHandle_t)resource_mg_get(&rm_cudnn, (void*)handle),
+        (cudnnSoftmaxAlgorithm_t)algo,
+        (cudnnSoftmaxMode_t)mode,
+        (alpha.dataType == CUDNN_DATA_DOUBLE ? (const void*)&alpha.cudnn_scaling_t_u.d : (const void*)&alpha.cudnn_scaling_t_u.f),
+        (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)xDesc),
+        (const void*)resource_mg_get(&rm_memory, (void*)x),
+        (beta.dataType == CUDNN_DATA_DOUBLE ? (const void*)&beta.cudnn_scaling_t_u.d : (const void*)&beta.cudnn_scaling_t_u.f),
+        (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)yDesc),
+        (void*)resource_mg_get(&rm_memory, (void*)y));
+    GSCHED_RELEASE;
+    RECORD_RESULT(integer, *result);
+    return 1;
+}
+
+/* cudnn cnn inference */
+bool_t rpc_cudnngetconvolutionndforwardoutputdim_1_svc(ptr convDesc, ptr inputTensorDesc, ptr filterDesc, int nbDims, mem_result *result, struct svc_req *rqstp)
+{   /* Fills result with nbDims ints from cudnnGetConvolutionNdForwardOutputDim; returns 0 only if the buffer cannot be allocated. */
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+    result->mem_result_u.data.mem_data_len = sizeof(int) * nbDims;
+    if ((result->mem_result_u.data.mem_data_val = malloc(result->mem_result_u.data.mem_data_len)) == NULL) {
+        LOGE(LOG_ERROR, "malloc failed");
+        return 0; /* nothing retained yet, so nothing to release on this path */
+    }
+    GSCHED_RETAIN; /* taken after the allocation so the early return above cannot skip GSCHED_RELEASE */
+    result->err = cudnnGetConvolutionNdForwardOutputDim(
+        (cudnnConvolutionDescriptor_t)resource_mg_get(&rm_cudnn_convs, (void*)convDesc),
+        (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)inputTensorDesc),
+        (cudnnFilterDescriptor_t)resource_mg_get(&rm_cudnn_filters, (void*)filterDesc),
+        nbDims,
+        (int*)result->mem_result_u.data.mem_data_val);
+    GSCHED_RELEASE;
+    return 1;
+}
+
+bool_t rpc_cudnncreateconvolutiondescriptor_1_svc(ptr_result *result, struct svc_req *rqstp)
+{
+    RECORD_VOID_API;
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+    /* Creates a convolution descriptor and registers it in rm_cudnn_convs. NOTE(review): registration runs whatever result->err is - confirm intended. */
+    GSCHED_RETAIN;
+    result->err = cudnnCreateConvolutionDescriptor((cudnnConvolutionDescriptor_t*)&result->ptr_result_u.ptr);
+    if (resource_mg_create(&rm_cudnn_convs, (void*)result->ptr_result_u.ptr) != 0) {
+        LOGE(LOG_ERROR, "error in resource manager"); /* logged only; the RPC still returns 1 */
+    }
+    GSCHED_RELEASE;
+    RECORD_RESULT(ptr_result_u, *result);
+    return 1;
+}
+
+bool_t rpc_cudnndestroyconvolutiondescriptor_1_svc(ptr convDesc, int *result, struct svc_req *rqstp)
+{
+    RECORD_API(ptr);
+    RECORD_SINGLE_ARG(convDesc);
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+    /* Looks convDesc up in rm_cudnn_convs and destroys it; the rm_cudnn_convs entry itself is not removed here. */
+    GSCHED_RETAIN;
+    *result = cudnnDestroyConvolutionDescriptor(
+        (cudnnConvolutionDescriptor_t)resource_mg_get(&rm_cudnn_convs, (void*)convDesc));
+    // TODO: Remove from resource manager
+    GSCHED_RELEASE;
+    RECORD_RESULT(integer, *result);
+    return 1;
+}
+
+bool_t rpc_cudnnsetconvolutionnddescriptor_1_svc(ptr convDesc, int arrayLength, mem_data padA, mem_data filterStrideA, mem_data dilationA, int mode, int computeType, int *result, struct svc_req *rqstp)
+{
+    RECORD_API(rpc_cudnnsetconvolutionnddescriptor_1_argument);
+    RECORD_NARG(convDesc);
+    RECORD_NARG(arrayLength);
+    RECORD_NARG(padA);
+    RECORD_NARG(filterStrideA);
+    RECORD_NARG(dilationA);
+    RECORD_NARG(mode);
+    RECORD_NARG(computeType);
+    //TODO: Recording mem_data is not as easy as done here.
+
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+    /* Each array must carry exactly arrayLength ints; a mismatch returns 0 before GSCHED_RETAIN and leaves *result unwritten. */
+    if (padA.mem_data_len != arrayLength * sizeof(int) ||
+        filterStrideA.mem_data_len != arrayLength * sizeof(int) ||
+        dilationA.mem_data_len != arrayLength * sizeof(int)) {
+        LOGE(LOG_ERROR, "array dimensions not as expected.");
+        return 0;
+    }
+    GSCHED_RETAIN;
+    *result = cudnnSetConvolutionNdDescriptor(
+        (cudnnConvolutionDescriptor_t)resource_mg_get(&rm_cudnn_convs, (void*)convDesc),
+        arrayLength,
+        (const int*)padA.mem_data_val, /* the three byte buffers are reinterpreted as int arrays */
+        (const int*)filterStrideA.mem_data_val,
+        (const int*)dilationA.mem_data_val,
+        (cudnnConvolutionMode_t)mode,
+        (cudnnDataType_t)computeType);
+    GSCHED_RELEASE;
+    RECORD_RESULT(integer, *result);
+    return 1;
+}
+
+bool_t rpc_cudnngetconvolutionforwardalgorithm_v7_1_svc(ptr handle, ptr srcDesc, ptr filterDesc, ptr convDesc, ptr destDesc, int requestedAlgoCount, mem_result *result, struct svc_req *rqstp)
+{   /* Result buffer layout: one int (returned algorithm count) followed by requestedAlgoCount cudnnConvolutionFwdAlgoPerf_t entries. */
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+    result->mem_result_u.data.mem_data_len = sizeof(int) + sizeof(cudnnConvolutionFwdAlgoPerf_t) * requestedAlgoCount;
+    if ((result->mem_result_u.data.mem_data_val = malloc(result->mem_result_u.data.mem_data_len)) == NULL) {
+        LOGE(LOG_ERROR, "malloc failed");
+        return 0; /* nothing retained yet, so nothing to release on this path */
+    }
+    GSCHED_RETAIN; /* taken after the allocation so the early return above cannot skip GSCHED_RELEASE */
+    result->err = cudnnGetConvolutionForwardAlgorithm_v7(
+        (cudnnHandle_t)resource_mg_get(&rm_cudnn, (void*)handle),
+        (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)srcDesc),
+        (cudnnFilterDescriptor_t)resource_mg_get(&rm_cudnn_filters, (void*)filterDesc),
+        (cudnnConvolutionDescriptor_t)resource_mg_get(&rm_cudnn_convs, (void*)convDesc),
+        (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)destDesc),
+        requestedAlgoCount,
+        (int*)result->mem_result_u.data.mem_data_val,
+        (cudnnConvolutionFwdAlgoPerf_t*)(result->mem_result_u.data.mem_data_val + sizeof(int)));
+    GSCHED_RELEASE;
+    return 1;
+}
+
+bool_t rpc_cudnnfindconvolutionforwardalgorithm_1_svc(ptr handle, ptr xDesc, ptr wDesc, ptr convDesc, ptr yDesc, int requestedAlgoCount, mem_result *result, struct svc_req *rqstp)
+{   /* Result buffer layout: one int (returned algorithm count) followed by requestedAlgoCount cudnnConvolutionFwdAlgoPerf_t entries. */
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+    result->mem_result_u.data.mem_data_len = sizeof(int) + sizeof(cudnnConvolutionFwdAlgoPerf_t) * requestedAlgoCount;
+    if ((result->mem_result_u.data.mem_data_val = malloc(result->mem_result_u.data.mem_data_len)) == NULL) {
+        LOGE(LOG_ERROR, "malloc failed");
+        return 0; /* nothing retained yet, so nothing to release on this path */
+    }
+    GSCHED_RETAIN; /* taken after the allocation so the early return above cannot skip GSCHED_RELEASE */
+    result->err = cudnnFindConvolutionForwardAlgorithm(
+        (cudnnHandle_t)resource_mg_get(&rm_cudnn, (void*)handle),
+        (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)xDesc),
+        (cudnnFilterDescriptor_t)resource_mg_get(&rm_cudnn_filters, (void*)wDesc),
+        (cudnnConvolutionDescriptor_t)resource_mg_get(&rm_cudnn_convs, (void*)convDesc),
+        (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)yDesc),
+        requestedAlgoCount,
+        (int*)result->mem_result_u.data.mem_data_val,
+        (cudnnConvolutionFwdAlgoPerf_t*)(result->mem_result_u.data.mem_data_val + sizeof(int)));
+    GSCHED_RELEASE;
+    return 1;
+}
+
+bool_t rpc_cudnngetconvolutionforwardworkspacesize_1_svc(ptr handle, ptr xDesc, ptr wDesc, ptr convDesc, ptr yDesc, int algo, sz_result *result, struct svc_req *rqstp)
+{   /* Stores the status of cudnnGetConvolutionForwardWorkspaceSize in result->err and the size in result->sz_result_u.data. */
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+    GSCHED_RETAIN;
+    result->err = cudnnGetConvolutionForwardWorkspaceSize(
+        (cudnnHandle_t)resource_mg_get(&rm_cudnn, (void*)handle),
+        (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)xDesc),
+        (cudnnFilterDescriptor_t)resource_mg_get(&rm_cudnn_filters, (void*)wDesc),
+        (cudnnConvolutionDescriptor_t)resource_mg_get(&rm_cudnn_convs, (void*)convDesc),
+        (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)yDesc),
+        (cudnnConvolutionFwdAlgo_t)algo,
+        (size_t*)&result->sz_result_u.data); /* NOTE(review): cast assumes the union member is size_t-sized - confirm against the .x definition */
+    GSCHED_RELEASE;
+    return 1;
+}
+
+bool_t rpc_cudnnconvolutionforward_1_svc(ptr handle, cudnn_scaling_t alpha, ptr xDesc, ptr x, ptr wDesc, ptr w, ptr convDesc, int algo, ptr workSpace, size_t workSpaceSizeInBytes, cudnn_scaling_t beta, ptr yDesc, ptr y, int *result, struct svc_req *rqstp)
+{
+    RECORD_API(rpc_cudnnconvolutionforward_1_argument);
+    RECORD_NARG(handle);
+    RECORD_NARG(alpha);
+    RECORD_NARG(xDesc);
+    RECORD_NARG(x);
+    RECORD_NARG(wDesc);
+    RECORD_NARG(w);
+    RECORD_NARG(convDesc);
+    RECORD_NARG(algo);
+    RECORD_NARG(workSpace);
+    RECORD_NARG(workSpaceSizeInBytes);
+    RECORD_NARG(beta);
+    RECORD_NARG(yDesc);
+    RECORD_NARG(y);
+    /* Runs cudnnConvolutionForward; alpha/beta are tagged unions: the double member is passed for CUDNN_DATA_DOUBLE, the float member otherwise. */
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+    GSCHED_RETAIN;
+    *result = cudnnConvolutionForward(
+        (cudnnHandle_t)resource_mg_get(&rm_cudnn, (void*)handle),
+        (alpha.dataType == CUDNN_DATA_DOUBLE ? (const void*)&alpha.cudnn_scaling_t_u.d : (const void*)&alpha.cudnn_scaling_t_u.f),
+        (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)xDesc),
+        (const void*)resource_mg_get(&rm_memory, (void*)x),
+        (cudnnFilterDescriptor_t)resource_mg_get(&rm_cudnn_filters, (void*)wDesc),
+        (const void*)resource_mg_get(&rm_memory, (void*)w),
+        (cudnnConvolutionDescriptor_t)resource_mg_get(&rm_cudnn_convs, (void*)convDesc),
+        algo, /* passed as a plain int, without the enum cast used for the other enum arguments in this file */
+        (void*)resource_mg_get(&rm_memory, (void*)workSpace),
+        workSpaceSizeInBytes,
+        (beta.dataType == CUDNN_DATA_DOUBLE ? (const void*)&beta.cudnn_scaling_t_u.d : (const void*)&beta.cudnn_scaling_t_u.f),
+        (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)yDesc),
+        (void*)resource_mg_get(&rm_memory, (void*)y));
+    GSCHED_RELEASE;
+    RECORD_RESULT(integer, *result);
+    return 1;
+}
+
+bool_t rpc_cudnnaddtensor_1_svc(ptr handle, cudnn_scaling_t alpha, ptr aDesc, ptr A, cudnn_scaling_t beta, ptr cDesc, ptr C, int *result, struct svc_req *rqstp)
+{
+    RECORD_API(rpc_cudnnaddtensor_1_argument);
+    RECORD_NARG(handle);
+    RECORD_NARG(alpha);
+    RECORD_NARG(aDesc);
+    RECORD_NARG(A);
+    RECORD_NARG(beta);
+    RECORD_NARG(cDesc);
+    RECORD_NARG(C);
+    /* Runs cudnnAddTensor; alpha/beta are tagged unions: the double member is passed for CUDNN_DATA_DOUBLE, the float member otherwise. */
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+    GSCHED_RETAIN;
+    *result = cudnnAddTensor(
+        (cudnnHandle_t)resource_mg_get(&rm_cudnn, (void*)handle),
+        (alpha.dataType == CUDNN_DATA_DOUBLE ? (const void*)&alpha.cudnn_scaling_t_u.d : (const void*)&alpha.cudnn_scaling_t_u.f),
+        (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)aDesc),
+        (const void*)resource_mg_get(&rm_memory, (void*)A),
+        (beta.dataType == CUDNN_DATA_DOUBLE ? (const void*)&beta.cudnn_scaling_t_u.d : (const void*)&beta.cudnn_scaling_t_u.f),
+        (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)cDesc),
+        (void*)resource_mg_get(&rm_memory, (void*)C));
+    GSCHED_RELEASE;
+    RECORD_RESULT(integer, *result);
+    return 1;
+}
+
+bool_t rpc_cudnntransformtensor_1_svc(ptr handle, cudnn_scaling_t alpha, ptr xDesc, ptr x, cudnn_scaling_t beta, ptr yDesc, ptr y, int *result, struct svc_req *rqstp)
+{
+    RECORD_API(rpc_cudnntransformtensor_1_argument);
+    RECORD_NARG(handle);
+    RECORD_NARG(alpha);
+    RECORD_NARG(xDesc);
+    RECORD_NARG(x);
+    RECORD_NARG(beta);
+    RECORD_NARG(yDesc);
+    RECORD_NARG(y);
+    /* Runs cudnnTransformTensor; alpha/beta are tagged unions: the double member is passed for CUDNN_DATA_DOUBLE, the float member otherwise. */
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+    GSCHED_RETAIN;
+    *result = cudnnTransformTensor(
+        (cudnnHandle_t)resource_mg_get(&rm_cudnn, (void*)handle),
+        (alpha.dataType == CUDNN_DATA_DOUBLE ? (const void*)&alpha.cudnn_scaling_t_u.d : (const void*)&alpha.cudnn_scaling_t_u.f),
+        (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)xDesc),
+        (const void*)resource_mg_get(&rm_memory, (void*)x),
+        (beta.dataType == CUDNN_DATA_DOUBLE ? (const void*)&beta.cudnn_scaling_t_u.d : (const void*)&beta.cudnn_scaling_t_u.f),
+        (cudnnTensorDescriptor_t)resource_mg_get(&rm_cudnn_tensors, (void*)yDesc),
+        (void*)resource_mg_get(&rm_memory, (void*)y));
+    GSCHED_RELEASE;
+    RECORD_RESULT(integer, *result);
+    return 1;
 }
\ No newline at end of file
diff --git a/cpu/cpu_rpc_prot.x b/cpu/cpu_rpc_prot.x
index 490ad9be..987b89af 100644
--- a/cpu/cpu_rpc_prot.x
+++ b/cpu/cpu_rpc_prot.x
@@ -456,6 +456,8 @@ program RPC_CD_PROG {
                          ptr, int, ptr, int, float, ptr, int)                 = 3005;
         int          rpc_cublasDgemv(ptr, int, int, int, double,
                          ptr, int, ptr, int, double, ptr, int)                 = 3006;
+        int          rpc_cublasSgemmEx(ptr, int, int, int, int, int, float,
+                         ptr, int, int, ptr, int, int, float, ptr, int, int)                 = 3007;
 
         /* NVML */
         int_result   rpc_nvmlDeviceGetCount_v2(void)                           = 4000;
@@ -489,9 +491,13 @@ program RPC_CD_PROG {
         int         rpc_cudnnSetTensorTransformDescriptor(ptr transformDesc, uint32_t nbDims, int destFormat, mem_data padBeforeA, mem_data padAfterA, mem_data foldA, int direction) = 5021;
         mem_result  rpc_cudnnGetTensorTransformDescriptor(ptr transformDesc, uint32_t nbDimsRequested) = 5022;
         int         rpc_cudnnDestroyTensorTransformDescriptor(ptr transformDesc) = 5023;
-        ptr_result  rpc_cudnnTransformTensor(ptr handle, cudnn_scaling_t alpha, ptr xDesc, cudnn_scaling_t x, cudnn_scaling_t beta, ptr yDesc) = 5024;
+        */
+        int         rpc_cudnnTransformTensor(ptr handle, cudnn_scaling_t alpha, ptr xDesc, ptr x, cudnn_scaling_t beta, ptr yDesc, ptr y) = 5024;
+        /*
         ptr_result  rpc_cudnnTransformTensorEx(ptr handle, ptr transDesc, cudnn_scaling_t alpha, ptr srcDesc, cudnn_scaling_t srcData, cudnn_scaling_t beta, ptr destDesc) = 5025;
-        ptr_result  rpc_cudnnAddTensor(ptr handle, cudnn_scaling_t alpha, ptr aDesc, ptr A, cudnn_scaling_t beta, ptr cDesc, ptr C) = 5026;
+        */
+        int  rpc_cudnnAddTensor(ptr handle, cudnn_scaling_t alpha, ptr aDesc, ptr A, cudnn_scaling_t beta, ptr cDesc, ptr C) = 5026;
+        /*
         ptr_result  rpc_cudnnCreateOpTensorDescriptor(void) = 5027;
         int         rpc_cudnnSetOpTensorDescriptor(ptr opTensorDesc, int opTensorOp, int opTensorCompType, int opTensorNanOpt) = 5028;
         int3_result rpc_cudnnGetOpTensorDescriptor(ptr opTensorDesc) = 5029;
@@ -515,6 +521,7 @@ program RPC_CD_PROG {
         sz_result   rpc_cudnnGetFilterSizeInBytes(ptr filterDesc) = 5046;
         int         rpc_cudnnTransformFilter(ptr handle, ptr transDesc, cudnn_scaling_t alpha, ptr srcDesc, ptr srcData, cudnn_scaling_t beta, ptr destDesc, ptr destData) = 5047;
         int         rpc_cudnnDestroyFilterDescriptor(ptr filterDesc) = 5048;
+        int         rpc_cudnnSoftmaxForward(ptr handle, int algo, int mode, cudnn_scaling_t alpha, ptr xDesc, ptr x, cudnn_scaling_t beta, ptr yDesc, ptr y) = 5049;
         ptr_result  rpc_cudnnCreatePoolingDescriptor(void) = 5050;
         int         rpc_cudnnSetPooling2dDescriptor(ptr poolingDesc, int mode, int maxpoolingNanOpt, int windowHeight, int windowWidth, int verticalPadding, int horizontalPadding, int verticalStride, int horizontalStride) = 5051;
         int8_result rpc_cudnnGetPooling2dDescriptor(ptr poolingDesc) = 5052;
@@ -523,15 +530,27 @@ program RPC_CD_PROG {
         mem_result  rpc_cudnnGetPoolingNdForwardOutputDim(ptr poolingDesc, ptr inputTensorDesc, int nbDims) = 5055;
         int4_result rpc_cudnnGetPooling2dForwardOutputDim(ptr poolingDesc, ptr inputTensorDesc) = 5056;
         int         rpc_cudnnDestroyPoolingDescriptor(ptr poolingDesc) = 5057;
+        int         rpc_cudnnPoolingForward(ptr handle, ptr poolingDesc, cudnn_scaling_t alpha, ptr xDesc, ptr x, cudnn_scaling_t beta, ptr yDesc, ptr y) = 5058;
         ptr_result  rpc_cudnnCreateActivationDescriptor(void) = 5059;
         int         rpc_cudnnSetActivationDescriptor(ptr activationDesc, int mode, int reluNanOpt, double coef) = 5060;
         int2d1_result rpc_cudnnGetActivationDescriptor(ptr activationDesc) = 5061;
         int         rpc_cudnnSetActivationDescriptorSwishBeta(ptr activationDesc, double swish_beta) = 5062;
         d_result    rpc_cudnnGetActivationDescriptorSwishBeta(ptr activationDesc) = 5063;
         int         rpc_cudnnDestroyActivationDescriptor(ptr activationDesc) = 5064;
+        int         rpc_cudnnActivationForward(ptr handle, ptr activationDesc, cudnn_scaling_t alpha, ptr xDesc, ptr x, cudnn_scaling_t beta, ptr yDesc, ptr y) = 5065;
         ptr_result  rpc_cudnnCreateLRNDescriptor(void) = 5066;
         int         rpc_cudnnSetLRNDescriptor(ptr normDesc, unsigned lrnN, double lrnAlpha, double lrnBeta, double lrnK) = 5067;
         int1d3_result rpc_cudnnGetLRNDescriptor(ptr normDesc) = 5068;
         int         rpc_cudnnDestroyLRNDescriptor(ptr lrnDesc) = 5069;
+        int         rpc_cudnnLRNCrossChannelForward(ptr handle, ptr normDesc, int lrnMode, cudnn_scaling_t alpha, ptr xDesc, ptr x, cudnn_scaling_t beta, ptr yDesc, ptr y) = 5070;
+        /* cudnn cnn inference */
+        ptr_result  rpc_cudnnCreateConvolutionDescriptor(void) = 5301;
+        int         rpc_cudnnDestroyConvolutionDescriptor(ptr convDesc) = 5302;
+        mem_result  rpc_cudnnGetConvolutionNdForwardOutputDim(ptr convDesc, ptr inputTensorDesc, ptr filterDesc, int nbDims) = 5303;
+        int         rpc_cudnnSetConvolutionNdDescriptor(ptr convDesc, int arrayLength, mem_data padA,  mem_data filterStrideA, mem_data dilationA,  int mode,  int computeType) = 5304;
+        mem_result rpc_cudnnGetConvolutionForwardAlgorithm_v7(ptr handle, ptr srcDesc, ptr filterDesc, ptr convDesc, ptr destDesc, int requestedAlgoCount) = 5305;
+        mem_result rpc_cudnnFindConvolutionForwardAlgorithm(ptr handle, ptr xDesc, ptr wDesc, ptr convDesc, ptr yDesc, int requestedAlgoCount) = 5306;
+        sz_result rpc_cudnnGetConvolutionForwardWorkspaceSize(ptr handle, ptr xDesc, ptr wDesc, ptr convDesc, ptr yDesc, int algo) = 5307;
+        int rpc_cudnnConvolutionForward(ptr handle, cudnn_scaling_t alpha, ptr xDesc, ptr x, ptr wDesc, ptr w, ptr convDesc, int algo, ptr workSpace, size_t workSpaceSizeInBytes, cudnn_scaling_t beta, ptr yDesc, ptr y) = 5308;
     } = 1;
 } = 99;
diff --git a/cpu/resource-mg.h b/cpu/resource-mg.h
index 29b2dcae..0e5f42e4 100644
--- a/cpu/resource-mg.h
+++ b/cpu/resource-mg.h
@@ -47,6 +47,7 @@ resource_mg rm_cudnn_tensortransform;
 resource_mg rm_cudnn_poolings;
 resource_mg rm_cudnn_activations;
 resource_mg rm_cudnn_lrns;
+resource_mg rm_cudnn_convs;
 
 
 /** initializes the resource manager

From 8a911ca9fa7b0c415bef048edcafb664b56ba5a7 Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Mon, 26 Jun 2023 11:15:25 +0200
Subject: [PATCH 73/83] fix faulty if statement when intercepting dlopen calls

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 cpu/cpu-client.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/cpu/cpu-client.c b/cpu/cpu-client.c
index c5543101..ab3d57e3 100644
--- a/cpu/cpu-client.c
+++ b/cpu/cpu-client.c
@@ -248,10 +248,9 @@ void *dlopen(const char *filename, int flag)
 
     if (filename != NULL && 
         (strcmp(filename, "libcuda.so.1") == 0 ||
-        strcmp(filename, "libcuda.so") == 0) ||
-        strcmp(filename, "libnvidia-ml.so.1") == 0) {
-        LOG(LOG_DEBUG, "replacing dlopen call to cuda library with "
-                       "cricket-client.so");
+        strcmp(filename, "libcuda.so") == 0 ||
+        strcmp(filename, "libnvidia-ml.so.1") == 0)) { /* every name gets its own == 0 test, all inside the NULL-guarded group */
+        LOG(LOG_DEBUG, "replacing dlopen call to %s with cricket-client.so", filename);
         dl_handle = dlopen_orig("cricket-client.so", flag);
         if (clnt == NULL) {
             LOGE(LOG_ERROR, "rpc seems to be uninitialized");
@@ -401,10 +400,10 @@ void __cudaUnregisterFatBinary(void **fatCubinHandle)
         return;
     }
 
-    retval_1 = rpc_elf_unload_1((ptr)fatCubinHandle, &result, clnt);
-    if (retval_1 != RPC_SUCCESS || result != 0) {
-        LOGE(LOG_ERROR, "call failed.");
-    }
+    // retval_1 = rpc_elf_unload_1((ptr)fatCubinHandle, &result, clnt);
+    // if (retval_1 != RPC_SUCCESS || result != 0) {
+    //     LOGE(LOG_ERROR, "call failed.");
+    // }
 }
 
 // void __cudaRegisterFatBinaryEnd(void **fatCubinHandle)

From 122b72102ac629048bc3eb44276504c1ceed20d6 Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Mon, 26 Jun 2023 11:16:18 +0200
Subject: [PATCH 74/83] improve logging for unloading of modules

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 cpu/cpu-server-driver.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/cpu/cpu-server-driver.c b/cpu/cpu-server-driver.c
index 39759457..0277e414 100644
--- a/cpu/cpu-server-driver.c
+++ b/cpu/cpu-server-driver.c
@@ -41,7 +41,7 @@ bool_t rpc_elf_load_1_svc(mem_data elf, ptr module_key, int *result, struct svc_
 {
     LOGE(LOG_DEBUG, "rpc_elf_load(elf: %p, len: %#x, module_key: %#x)", elf.mem_data_val, elf.mem_data_len, module_key);
     CUresult res;
-    CUmodule module;
+    CUmodule module = NULL;
     
     if ((res = cuModuleLoadData(&module, elf.mem_data_val)) != CUDA_SUCCESS) {
         LOGE(LOG_ERROR, "cuModuleLoadData failed: %d", res);
@@ -66,7 +66,7 @@ bool_t rpc_elf_load_1_svc(mem_data elf, ptr module_key, int *result, struct svc_
 // TODO: We should also remove associated function handles
 bool_t rpc_elf_unload_1_svc(ptr elf_handle, int *result, struct svc_req *rqstp)
 {
-    LOG(LOG_DEBUG, "rpc_elf_unload(elf_handle: %p)", elf_handle);
+    LOGE(LOG_DEBUG, "rpc_elf_unload(elf_handle: %p)", elf_handle);
     CUmodule module = NULL;
     CUresult res;
     
@@ -76,6 +76,8 @@ bool_t rpc_elf_unload_1_svc(ptr elf_handle, int *result, struct svc_req *rqstp)
         return 1;
     }
 
+    LOGE(LOG_DEBUG,"module: %p", module);
+
     // if ((res = resource_mg_remove(&rm_modules, (void*)elf_handle)) != CUDA_SUCCESS) {
     //     LOG(LOG_ERROR, "resource_mg_create failed: %d", res);
     //     result->err = res;
@@ -83,13 +85,13 @@ bool_t rpc_elf_unload_1_svc(ptr elf_handle, int *result, struct svc_req *rqstp)
     // }
 
     if ((res = cuModuleUnload(module)) != CUDA_SUCCESS) {
-        LOG(LOG_ERROR, "cuModuleUnload failed: %d", res);
+        const char *errstr = NULL; /* stays NULL if the lookup below does not provide a string */
+        cuGetErrorString(res, &errstr); /* per the CUDA driver docs, *pStr is set to NULL for unrecognised codes */
+        LOG(LOG_ERROR, "cuModuleUnload failed: %s (%d)", errstr ? errstr : "unknown error", res);
         *result = res;
         return 1;
     }
 
-    //TODO: Free memory of module
-
     *result = 0;
     return 1;
 }

From e8813ea58f5c38d1730fcaa644c245f8b3a229f8 Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Mon, 26 Jun 2023 11:16:38 +0200
Subject: [PATCH 75/83] improve docs/pytorch.md

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 docs/pytorch.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/docs/pytorch.md b/docs/pytorch.md
index dbf54a9e..ebb8a3b5 100644
--- a/docs/pytorch.md
+++ b/docs/pytorch.md
@@ -112,6 +112,11 @@ EXTRA_DOCKER_BUILD_FLAGS='--storage-opt "overlay.mount_program=/usr/bin/fuse-ove
 make -f docker.Makefile
 ```
 
+launch cricket server (outside of docker container)
+```
+<path to cricket>/bin/cricket-rpc-server
+```
+
 launch docker container, torch
 ```
 sudo docker run --gpus all --rm -it -v <patch-to-cricket>/cricket:/cricket --ipc=host pytorch:latest

From e5dbebfc63abb352c0524f6db7aaac2b343dc2ea Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Thu, 29 Jun 2023 14:50:03 +0200
Subject: [PATCH 76/83] improve cublas implementation, add cudnnBackend
 implementation

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 cpu/cpu-client-cublas.c | 685 ++++++++++++++++++++++++++++++++++++----
 cpu/cpu-client-cudnn.c  | 218 +++++++++++++
 cpu/cpu-client-driver.c |   7 +-
 cpu/cpu-client.c        |  47 +--
 cpu/cpu-libwrap.h       |  16 +-
 cpu/cpu-server-cublas.c |  47 +++
 cpu/cpu-server-cudnn.c  | 183 +++++++++++
 cpu/cpu_rpc_prot.x      |  17 +
 cpu/resource-mg.h       |   1 +
 tests/samples/Makefile  |   5 +-
 tests/test_apps/yolo.py |  12 +
 11 files changed, 1147 insertions(+), 91 deletions(-)
 create mode 100644 tests/test_apps/yolo.py

diff --git a/cpu/cpu-client-cublas.c b/cpu/cpu-client-cublas.c
index 39e87636..f9fbc159 100644
--- a/cpu/cpu-client-cublas.c
+++ b/cpu/cpu-client-cublas.c
@@ -19,7 +19,7 @@
 extern int api_call_cnt;
 #endif //WITH_API_CNT
 
-cublasStatus_t cublasCreate(cublasHandle_t* handle)
+cublasStatus_t cublasCreate_v2(cublasHandle_t* handle)
 {
 #ifdef WITH_API_CNT
     api_call_cnt++;
@@ -36,7 +36,7 @@ cublasStatus_t cublasCreate(cublasHandle_t* handle)
     return result.err;
 }
 
-cublasStatus_t cublasDestroy(cublasHandle_t handle)
+cublasStatus_t cublasDestroy_v2(cublasHandle_t handle)
 {
 #ifdef WITH_API_CNT
     api_call_cnt++;
@@ -50,30 +50,257 @@ cublasStatus_t cublasDestroy(cublasHandle_t handle)
     return result;
 }
 
-cublasStatus_t cublasDgemm(cublasHandle_t handle,
-                           cublasOperation_t transa, cublasOperation_t transb,
-                           int m, int n, int k,
-                           const double          *alpha,
-                           const double          *A, int lda,
-                           const double          *B, int ldb,
-                           const double          *beta,
-                           double          *C, int ldc)
+DEF_FN(cublasStatus_t, cublasGetVersion_v2, cublasHandle_t, handle, int*, version);
+DEF_FN(cublasStatus_t, cublasGetProperty, libraryPropertyType, type, int*, value);
+DEF_FN(size_t, cublasGetCudartVersion);
+cublasStatus_t cublasSetWorkspace_v2(cublasHandle_t handle, void* workspace, size_t workspaceSizeInBytes) /* forwards the call through rpc_cublassetworkspace_1 */
 {
 #ifdef WITH_API_CNT
     api_call_cnt++;
 #endif //WITH_API_CNT
     int result;
     enum clnt_stat retval_1;
-    retval_1 = rpc_cublasdgemm_1(
+    retval_1 = rpc_cublassetworkspace_1( /* result is written through &result; retval_1 only reports the RPC transport status */
         (ptr)handle,
-        (int)transa,
-        (int)transb,
-        m, n, k,
+        (ptr)workspace, /* only the pointer value is sent; the buffer contents are not copied here */
+        workspaceSizeInBytes,
+        &result, clnt);
+    if (retval_1 != RPC_SUCCESS) {
+        clnt_perror (clnt, "call failed"); /* the failure is only logged; execution continues to the return */
+    }
+    return result; /* NOTE(review): result has no initializer, so it looks uninitialized when the RPC failed - confirm */
+}
+
+/* Forwards cublasSetStream_v2 over RPC through rpc_cublassetstream_1. */
+cublasStatus_t cublasSetStream_v2(cublasHandle_t handle, cudaStream_t streamId)
+{
+#ifdef WITH_API_CNT
+    api_call_cnt++;
+#endif //WITH_API_CNT
+    /* Pre-set an error status so a failed RPC never returns an uninitialized value. */
+    int result = CUBLAS_STATUS_INTERNAL_ERROR;
+    enum clnt_stat retval_1;
+    /* Handle and stream are both sent as opaque (ptr) values. */
+    retval_1 = rpc_cublassetstream_1((ptr)handle, (ptr)streamId, &result, clnt);
+    if (retval_1 != RPC_SUCCESS) {
+        clnt_perror (clnt, "call failed");
+    }
+    return result;
+}
+
+DEF_FN(cublasStatus_t, cublasGetStream_v2, cublasHandle_t, handle, cudaStream_t*, streamId);
+DEF_FN(cublasStatus_t, cublasGetPointerMode_v2, cublasHandle_t, handle, cublasPointerMode_t*, mode);
+DEF_FN(cublasStatus_t, cublasSetPointerMode_v2, cublasHandle_t, handle, cublasPointerMode_t, mode);
+DEF_FN(cublasStatus_t, cublasGetAtomicsMode, cublasHandle_t, handle, cublasAtomicsMode_t*, mode);
+DEF_FN(cublasStatus_t, cublasSetAtomicsMode, cublasHandle_t, handle, cublasAtomicsMode_t, mode);
+DEF_FN(cublasStatus_t, cublasGetMathMode, cublasHandle_t, handle, cublasMath_t*, mode);
+/* Forwards cublasSetMathMode over RPC through rpc_cublassetmathmode_1. */
+cublasStatus_t cublasSetMathMode(cublasHandle_t handle, cublasMath_t mode)
+{
+#ifdef WITH_API_CNT
+    api_call_cnt++;
+#endif //WITH_API_CNT
+    /* Pre-set an error status so a failed RPC never returns an uninitialized value. */
+    int result = CUBLAS_STATUS_INTERNAL_ERROR;
+    enum clnt_stat retval_1;
+    /* The handle is sent as an opaque (ptr) value, the math mode as a plain int. */
+    retval_1 = rpc_cublassetmathmode_1((ptr)handle, (int)mode, &result, clnt);
+    if (retval_1 != RPC_SUCCESS) {
+        clnt_perror (clnt, "call failed");
+    }
+    return result;
+}
+
+DEF_FN(cublasStatus_t, cublasGetSmCountTarget, cublasHandle_t, handle, int*, smCountTarget);
+DEF_FN(cublasStatus_t, cublasSetSmCountTarget, cublasHandle_t, handle, int, smCountTarget);
+DEF_FN(const char*, cublasGetStatusName, cublasStatus_t, status);
+DEF_FN(const char*, cublasGetStatusString, cublasStatus_t, status);
+DEF_FN(cublasStatus_t, cublasLoggerConfigure, int, logIsOn, int, logToStdOut, int, logToStdErr, const char*, logFileName);
+DEF_FN(cublasStatus_t, cublasSetLoggerCallback, cublasLogCallback, userCallback);
+DEF_FN(cublasStatus_t, cublasGetLoggerCallback, cublasLogCallback*, userCallback);
+DEF_FN(cublasStatus_t, cublasSetVector, int, n, int, elemSize, const void*, x, int, incx, void*, devicePtr, int, incy);
+DEF_FN(cublasStatus_t, cublasSetVector_64, int64_t, n, int64_t, elemSize, const void*, x, int64_t, incx, void*, devicePtr, int64_t, incy);
+DEF_FN(cublasStatus_t, cublasGetVector, int, n, int, elemSize, const void*, x, int, incx, void*, y, int, incy);
+DEF_FN(cublasStatus_t, cublasGetVector_64, int64_t, n, int64_t, elemSize, const void*, x, int64_t, incx, void*, y, int64_t, incy);
+DEF_FN(cublasStatus_t, cublasSetMatrix, int, rows, int, cols, int, elemSize, const void*, A, int, lda, void*, B, int, ldb);
+DEF_FN(cublasStatus_t, cublasSetMatrix_64, int64_t, rows, int64_t, cols, int64_t, elemSize, const void*, A, int64_t, lda, void*, B, int64_t, ldb);
+DEF_FN(cublasStatus_t, cublasGetMatrix, int, rows, int, cols, int, elemSize, const void*, A, int, lda, void*, B, int, ldb);
+DEF_FN(cublasStatus_t, cublasGetMatrix_64, int64_t, rows, int64_t, cols, int64_t, elemSize, const void*, A, int64_t, lda, void*, B, int64_t, ldb);
+DEF_FN(cublasStatus_t, cublasSetVectorAsync, int, n, int, elemSize, const void*, hostPtr, int, incx, void*, devicePtr, int, incy, cudaStream_t, stream);
+DEF_FN(cublasStatus_t, cublasSetVectorAsync_64, int64_t, n, int64_t, elemSize, const void*, hostPtr, int64_t, incx, void*, devicePtr, int64_t, incy, cudaStream_t, stream);
+DEF_FN(cublasStatus_t, cublasGetVectorAsync, int, n, int, elemSize, const void*, devicePtr, int, incx, void*, hostPtr, int, incy, cudaStream_t, stream);
+DEF_FN(cublasStatus_t, cublasGetVectorAsync_64, int64_t, n, int64_t, elemSize, const void*, devicePtr, int64_t, incx, void*, hostPtr, int64_t, incy, cudaStream_t, stream);
+DEF_FN(cublasStatus_t, cublasSetMatrixAsync, int, rows, int, cols, int, elemSize, const void*, A, int, lda, void*, B, int, ldb, cudaStream_t, stream);
+DEF_FN(cublasStatus_t, cublasSetMatrixAsync_64, int64_t, rows, int64_t, cols, int64_t, elemSize, const void*, A, int64_t, lda, void*, B, int64_t, ldb, cudaStream_t, stream);
+DEF_FN(cublasStatus_t, cublasGetMatrixAsync, int, rows, int, cols, int, elemSize, const void*, A, int, lda, void*, B, int, ldb, cudaStream_t, stream);
+DEF_FN(cublasStatus_t, cublasGetMatrixAsync_64, int64_t, rows, int64_t, cols, int64_t, elemSize, const void*, A, int64_t, lda, void*, B, int64_t, ldb, cudaStream_t, stream);
+/* Looks up the real cublasXerbla with dlsym and forwards the call to it. */
+/* NOTE(review): appears to be a hand-expanded DEF_FN body for a void function - confirm. */
+void cublasXerbla(const char* srName, int info) {
+    void (*fun)(const char*, int);
+    char* error_str;
+    *(void **)(&fun) = dlsym(libwrap_get_sohandle(), "cublasXerbla");
+    if ((error_str = dlerror()) != ((void *)0)) {
+        /* Report this file/line rather than a developer-local path and a stale line number. */
+        if (get_log_data()->curr_level >= 0) loggfe(0, __LINE__, __FILE__, "[libwrap] %s", error_str);
+    }
+    if (fun == ((void *)0)) return; /* lookup failed: there is nothing to forward to */
+    if (get_log_data()->curr_level >= 3) loggf(3, "%s called", "cublasXerbla");
+    (*fun)(srName, info);
+    if (get_log_data()->curr_level >= 3) loggf(3, "%s finished", "cublasXerbla");
+}
+DEF_FN(cublasStatus_t, cublasNrm2Ex, cublasHandle_t, handle, int, n, const void*, x, cudaDataType, xType, int, incx, void*, result, cudaDataType, resultType, cudaDataType, executionType);
+DEF_FN(cublasStatus_t, cublasNrm2Ex_64, cublasHandle_t, handle, int64_t, n, const void*, x, cudaDataType, xType, int64_t, incx, void*, result, cudaDataType, resultType, cudaDataType, executionType);
+DEF_FN(cublasStatus_t, cublasSnrm2_v2, cublasHandle_t, handle, int, n, const float*, x, int, incx, float*, result);
+DEF_FN(cublasStatus_t, cublasSnrm2_v2_64, cublasHandle_t, handle, int64_t, n, const float*, x, int64_t, incx, float*, result);
+DEF_FN(cublasStatus_t, cublasDnrm2_v2, cublasHandle_t, handle, int, n, const double*, x, int, incx, double*, result);
+DEF_FN(cublasStatus_t, cublasDnrm2_v2_64, cublasHandle_t, handle, int64_t, n, const double*, x, int64_t, incx, double*, result);
+DEF_FN(cublasStatus_t, cublasScnrm2_v2, cublasHandle_t, handle, int, n, const cuComplex*, x, int, incx, float*, result);
+DEF_FN(cublasStatus_t, cublasScnrm2_v2_64, cublasHandle_t, handle, int64_t, n, const cuComplex*, x, int64_t, incx, float*, result);
+DEF_FN(cublasStatus_t, cublasDznrm2_v2, cublasHandle_t, handle, int, n, const cuDoubleComplex*, x, int, incx, double*, result);
+DEF_FN(cublasStatus_t, cublasDznrm2_v2_64, cublasHandle_t, handle, int64_t, n, const cuDoubleComplex*, x, int64_t, incx, double*, result);
+DEF_FN(cublasStatus_t, cublasDotEx, cublasHandle_t, handle, int, n, const void*, x, cudaDataType, xType, int, incx, const void*, y, cudaDataType, yType, int, incy, void*, result, cudaDataType, resultType, cudaDataType, executionType);
+DEF_FN(cublasStatus_t, cublasDotEx_64, cublasHandle_t, handle, int64_t, n, const void*, x, cudaDataType, xType, int64_t, incx, const void*, y, cudaDataType, yType, int64_t, incy, void*, result, cudaDataType, resultType, cudaDataType, executionType);
+DEF_FN(cublasStatus_t, cublasDotcEx, cublasHandle_t, handle, int, n, const void*, x, cudaDataType, xType, int, incx, const void*, y, cudaDataType, yType, int, incy, void*, result, cudaDataType, resultType, cudaDataType, executionType);
+DEF_FN(cublasStatus_t, cublasDotcEx_64, cublasHandle_t, handle, int64_t, n, const void*, x, cudaDataType, xType, int64_t, incx, const void*, y, cudaDataType, yType, int64_t, incy, void*, result, cudaDataType, resultType, cudaDataType, executionType);
+DEF_FN(cublasStatus_t, cublasSdot_v2, cublasHandle_t, handle, int, n, const float*, x, int, incx, const float*, y, int, incy, float*, result);
+DEF_FN(cublasStatus_t, cublasSdot_v2_64, cublasHandle_t, handle, int64_t, n, const float*, x, int64_t, incx, const float*, y, int64_t, incy, float*, result);
+DEF_FN(cublasStatus_t, cublasDdot_v2, cublasHandle_t, handle, int, n, const double*, x, int, incx, const double*, y, int, incy, double*, result);
+DEF_FN(cublasStatus_t, cublasDdot_v2_64, cublasHandle_t, handle, int64_t, n, const double*, x, int64_t, incx, const double*, y, int64_t, incy, double*, result);
+DEF_FN(cublasStatus_t, cublasCdotu_v2, cublasHandle_t, handle, int, n, const cuComplex*, x, int, incx, const cuComplex*, y, int, incy, cuComplex*, result);
+DEF_FN(cublasStatus_t, cublasCdotu_v2_64, cublasHandle_t, handle, int64_t, n, const cuComplex*, x, int64_t, incx, const cuComplex*, y, int64_t, incy, cuComplex*, result);
+DEF_FN(cublasStatus_t, cublasCdotc_v2, cublasHandle_t, handle, int, n, const cuComplex*, x, int, incx, const cuComplex*, y, int, incy, cuComplex*, result);
+DEF_FN(cublasStatus_t, cublasCdotc_v2_64, cublasHandle_t, handle, int64_t, n, const cuComplex*, x, int64_t, incx, const cuComplex*, y, int64_t, incy, cuComplex*, result);
+DEF_FN(cublasStatus_t, cublasZdotu_v2, cublasHandle_t, handle, int, n, const cuDoubleComplex*, x, int, incx, const cuDoubleComplex*, y, int, incy, cuDoubleComplex*, result);
+DEF_FN(cublasStatus_t, cublasZdotu_v2_64, cublasHandle_t, handle, int64_t, n, const cuDoubleComplex*, x, int64_t, incx, const cuDoubleComplex*, y, int64_t, incy, cuDoubleComplex*, result);
+DEF_FN(cublasStatus_t, cublasZdotc_v2, cublasHandle_t, handle, int, n, const cuDoubleComplex*, x, int, incx, const cuDoubleComplex*, y, int, incy, cuDoubleComplex*, result);
+DEF_FN(cublasStatus_t, cublasZdotc_v2_64, cublasHandle_t, handle, int64_t, n, const cuDoubleComplex*, x, int64_t, incx, const cuDoubleComplex*, y, int64_t, incy, cuDoubleComplex*, result);
+DEF_FN(cublasStatus_t, cublasScalEx, cublasHandle_t, handle, int, n, const void*, alpha, cudaDataType, alphaType, void*, x, cudaDataType, xType, int, incx, cudaDataType, executionType);
+DEF_FN(cublasStatus_t, cublasScalEx_64, cublasHandle_t, handle, int64_t, n, const void*, alpha, cudaDataType, alphaType, void*, x, cudaDataType, xType, int64_t, incx, cudaDataType, executionType);
+DEF_FN(cublasStatus_t, cublasSscal_v2, cublasHandle_t, handle, int, n, const float*, alpha, float*, x, int, incx);
+DEF_FN(cublasStatus_t, cublasSscal_v2_64, cublasHandle_t, handle, int64_t, n, const float*, alpha, float*, x, int64_t, incx);
+DEF_FN(cublasStatus_t, cublasDscal_v2, cublasHandle_t, handle, int, n, const double*, alpha, double*, x, int, incx);
+DEF_FN(cublasStatus_t, cublasDscal_v2_64, cublasHandle_t, handle, int64_t, n, const double*, alpha, double*, x, int64_t, incx);
+DEF_FN(cublasStatus_t, cublasCscal_v2, cublasHandle_t, handle, int, n, const cuComplex*, alpha, cuComplex*, x, int, incx);
+DEF_FN(cublasStatus_t, cublasCscal_v2_64, cublasHandle_t, handle, int64_t, n, const cuComplex*, alpha, cuComplex*, x, int64_t, incx);
+DEF_FN(cublasStatus_t, cublasCsscal_v2, cublasHandle_t, handle, int, n, const float*, alpha, cuComplex*, x, int, incx);
+DEF_FN(cublasStatus_t, cublasCsscal_v2_64, cublasHandle_t, handle, int64_t, n, const float*, alpha, cuComplex*, x, int64_t, incx);
+DEF_FN(cublasStatus_t, cublasZscal_v2, cublasHandle_t, handle, int, n, const cuDoubleComplex*, alpha, cuDoubleComplex*, x, int, incx);
+DEF_FN(cublasStatus_t, cublasZscal_v2_64, cublasHandle_t, handle, int64_t, n, const cuDoubleComplex*, alpha, cuDoubleComplex*, x, int64_t, incx);
+DEF_FN(cublasStatus_t, cublasZdscal_v2, cublasHandle_t, handle, int, n, const double*, alpha, cuDoubleComplex*, x, int, incx);
+DEF_FN(cublasStatus_t, cublasZdscal_v2_64, cublasHandle_t, handle, int64_t, n, const double*, alpha, cuDoubleComplex*, x, int64_t, incx);
+DEF_FN(cublasStatus_t, cublasAxpyEx, cublasHandle_t, handle, int, n, const void*, alpha, cudaDataType, alphaType, const void*, x, cudaDataType, xType, int, incx, void*, y, cudaDataType, yType, int, incy, cudaDataType, executiontype);
+DEF_FN(cublasStatus_t, cublasAxpyEx_64, cublasHandle_t, handle, int64_t, n, const void*, alpha, cudaDataType, alphaType, const void*, x, cudaDataType, xType, int64_t, incx, void*, y, cudaDataType, yType, int64_t, incy, cudaDataType, executiontype);
+DEF_FN(cublasStatus_t, cublasSaxpy_v2, cublasHandle_t, handle, int, n, const float*, alpha, const float*, x, int, incx, float*, y, int, incy);
+DEF_FN(cublasStatus_t, cublasSaxpy_v2_64, cublasHandle_t, handle, int64_t, n, const float*, alpha, const float*, x, int64_t, incx, float*, y, int64_t, incy);
+DEF_FN(cublasStatus_t, cublasDaxpy_v2, cublasHandle_t, handle, int, n, const double*, alpha, const double*, x, int, incx, double*, y, int, incy);
+DEF_FN(cublasStatus_t, cublasDaxpy_v2_64, cublasHandle_t, handle, int64_t, n, const double*, alpha, const double*, x, int64_t, incx, double*, y, int64_t, incy);
+DEF_FN(cublasStatus_t, cublasCaxpy_v2, cublasHandle_t, handle, int, n, const cuComplex*, alpha, const cuComplex*, x, int, incx, cuComplex*, y, int, incy);
+DEF_FN(cublasStatus_t, cublasCaxpy_v2_64, cublasHandle_t, handle, int64_t, n, const cuComplex*, alpha, const cuComplex*, x, int64_t, incx, cuComplex*, y, int64_t, incy);
+DEF_FN(cublasStatus_t, cublasZaxpy_v2, cublasHandle_t, handle, int, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, x, int, incx, cuDoubleComplex*, y, int, incy);
+DEF_FN(cublasStatus_t, cublasZaxpy_v2_64, cublasHandle_t, handle, int64_t, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, x, int64_t, incx, cuDoubleComplex*, y, int64_t, incy);
+DEF_FN(cublasStatus_t, cublasCopyEx, cublasHandle_t, handle, int, n, const void*, x, cudaDataType, xType, int, incx, void*, y, cudaDataType, yType, int, incy);
+DEF_FN(cublasStatus_t, cublasCopyEx_64, cublasHandle_t, handle, int64_t, n, const void*, x, cudaDataType, xType, int64_t, incx, void*, y, cudaDataType, yType, int64_t, incy);
+DEF_FN(cublasStatus_t, cublasScopy_v2, cublasHandle_t, handle, int, n, const float*, x, int, incx, float*, y, int, incy);
+DEF_FN(cublasStatus_t, cublasScopy_v2_64, cublasHandle_t, handle, int64_t, n, const float*, x, int64_t, incx, float*, y, int64_t, incy);
+DEF_FN(cublasStatus_t, cublasDcopy_v2, cublasHandle_t, handle, int, n, const double*, x, int, incx, double*, y, int, incy);
+DEF_FN(cublasStatus_t, cublasDcopy_v2_64, cublasHandle_t, handle, int64_t, n, const double*, x, int64_t, incx, double*, y, int64_t, incy);
+DEF_FN(cublasStatus_t, cublasCcopy_v2, cublasHandle_t, handle, int, n, const cuComplex*, x, int, incx, cuComplex*, y, int, incy);
+DEF_FN(cublasStatus_t, cublasCcopy_v2_64, cublasHandle_t, handle, int64_t, n, const cuComplex*, x, int64_t, incx, cuComplex*, y, int64_t, incy);
+DEF_FN(cublasStatus_t, cublasZcopy_v2, cublasHandle_t, handle, int, n, const cuDoubleComplex*, x, int, incx, cuDoubleComplex*, y, int, incy);
+DEF_FN(cublasStatus_t, cublasZcopy_v2_64, cublasHandle_t, handle, int64_t, n, const cuDoubleComplex*, x, int64_t, incx, cuDoubleComplex*, y, int64_t, incy);
+DEF_FN(cublasStatus_t, cublasSswap_v2, cublasHandle_t, handle, int, n, float*, x, int, incx, float*, y, int, incy);
+DEF_FN(cublasStatus_t, cublasSswap_v2_64, cublasHandle_t, handle, int64_t, n, float*, x, int64_t, incx, float*, y, int64_t, incy);
+DEF_FN(cublasStatus_t, cublasDswap_v2, cublasHandle_t, handle, int, n, double*, x, int, incx, double*, y, int, incy);
+DEF_FN(cublasStatus_t, cublasDswap_v2_64, cublasHandle_t, handle, int64_t, n, double*, x, int64_t, incx, double*, y, int64_t, incy);
+DEF_FN(cublasStatus_t, cublasCswap_v2, cublasHandle_t, handle, int, n, cuComplex*, x, int, incx, cuComplex*, y, int, incy);
+DEF_FN(cublasStatus_t, cublasCswap_v2_64, cublasHandle_t, handle, int64_t, n, cuComplex*, x, int64_t, incx, cuComplex*, y, int64_t, incy);
+DEF_FN(cublasStatus_t, cublasZswap_v2, cublasHandle_t, handle, int, n, cuDoubleComplex*, x, int, incx, cuDoubleComplex*, y, int, incy);
+DEF_FN(cublasStatus_t, cublasZswap_v2_64, cublasHandle_t, handle, int64_t, n, cuDoubleComplex*, x, int64_t, incx, cuDoubleComplex*, y, int64_t, incy);
+DEF_FN(cublasStatus_t, cublasSwapEx, cublasHandle_t, handle, int, n, void*, x, cudaDataType, xType, int, incx, void*, y, cudaDataType, yType, int, incy);
+DEF_FN(cublasStatus_t, cublasSwapEx_64, cublasHandle_t, handle, int64_t, n, void*, x, cudaDataType, xType, int64_t, incx, void*, y, cudaDataType, yType, int64_t, incy);
+DEF_FN(cublasStatus_t, cublasIsamax_v2, cublasHandle_t, handle, int, n, const float*, x, int, incx, int*, result);
+DEF_FN(cublasStatus_t, cublasIsamax_v2_64, cublasHandle_t, handle, int64_t, n, const float*, x, int64_t, incx, int64_t*, result);
+DEF_FN(cublasStatus_t, cublasIdamax_v2, cublasHandle_t, handle, int, n, const double*, x, int, incx, int*, result);
+DEF_FN(cublasStatus_t, cublasIdamax_v2_64, cublasHandle_t, handle, int64_t, n, const double*, x, int64_t, incx, int64_t*, result);
+DEF_FN(cublasStatus_t, cublasIcamax_v2, cublasHandle_t, handle, int, n, const cuComplex*, x, int, incx, int*, result);
+DEF_FN(cublasStatus_t, cublasIcamax_v2_64, cublasHandle_t, handle, int64_t, n, const cuComplex*, x, int64_t, incx, int64_t*, result);
+DEF_FN(cublasStatus_t, cublasIzamax_v2, cublasHandle_t, handle, int, n, const cuDoubleComplex*, x, int, incx, int*, result);
+DEF_FN(cublasStatus_t, cublasIzamax_v2_64, cublasHandle_t, handle, int64_t, n, const cuDoubleComplex*, x, int64_t, incx, int64_t*, result);
+DEF_FN(cublasStatus_t, cublasIamaxEx, cublasHandle_t, handle, int, n, const void*, x, cudaDataType, xType, int, incx, int*, result);
+DEF_FN(cublasStatus_t, cublasIamaxEx_64, cublasHandle_t, handle, int64_t, n, const void*, x, cudaDataType, xType, int64_t, incx, int64_t*, result);
+DEF_FN(cublasStatus_t, cublasIsamin_v2, cublasHandle_t, handle, int, n, const float*, x, int, incx, int*, result);
+DEF_FN(cublasStatus_t, cublasIsamin_v2_64, cublasHandle_t, handle, int64_t, n, const float*, x, int64_t, incx, int64_t*, result);
+DEF_FN(cublasStatus_t, cublasIdamin_v2, cublasHandle_t, handle, int, n, const double*, x, int, incx, int*, result);
+DEF_FN(cublasStatus_t, cublasIdamin_v2_64, cublasHandle_t, handle, int64_t, n, const double*, x, int64_t, incx, int64_t*, result);
+DEF_FN(cublasStatus_t, cublasIcamin_v2, cublasHandle_t, handle, int, n, const cuComplex*, x, int, incx, int*, result);
+DEF_FN(cublasStatus_t, cublasIcamin_v2_64, cublasHandle_t, handle, int64_t, n, const cuComplex*, x, int64_t, incx, int64_t*, result);
+DEF_FN(cublasStatus_t, cublasIzamin_v2, cublasHandle_t, handle, int, n, const cuDoubleComplex*, x, int, incx, int*, result);
+DEF_FN(cublasStatus_t, cublasIzamin_v2_64, cublasHandle_t, handle, int64_t, n, const cuDoubleComplex*, x, int64_t, incx, int64_t*, result);
+DEF_FN(cublasStatus_t, cublasIaminEx, cublasHandle_t, handle, int, n, const void*, x, cudaDataType, xType, int, incx, int*, result);
+DEF_FN(cublasStatus_t, cublasIaminEx_64, cublasHandle_t, handle, int64_t, n, const void*, x, cudaDataType, xType, int64_t, incx, int64_t*, result);
+DEF_FN(cublasStatus_t, cublasAsumEx, cublasHandle_t, handle, int, n, const void*, x, cudaDataType, xType, int, incx, void*, result, cudaDataType, resultType, cudaDataType, executiontype);
+DEF_FN(cublasStatus_t, cublasAsumEx_64, cublasHandle_t, handle, int64_t, n, const void*, x, cudaDataType, xType, int64_t, incx, void*, result, cudaDataType, resultType, cudaDataType, executiontype);
+DEF_FN(cublasStatus_t, cublasSasum_v2, cublasHandle_t, handle, int, n, const float*, x, int, incx, float*, result);
+DEF_FN(cublasStatus_t, cublasSasum_v2_64, cublasHandle_t, handle, int64_t, n, const float*, x, int64_t, incx, float*, result);
+DEF_FN(cublasStatus_t, cublasDasum_v2, cublasHandle_t, handle, int, n, const double*, x, int, incx, double*, result);
+DEF_FN(cublasStatus_t, cublasDasum_v2_64, cublasHandle_t, handle, int64_t, n, const double*, x, int64_t, incx, double*, result);
+DEF_FN(cublasStatus_t, cublasScasum_v2, cublasHandle_t, handle, int, n, const cuComplex*, x, int, incx, float*, result);
+DEF_FN(cublasStatus_t, cublasScasum_v2_64, cublasHandle_t, handle, int64_t, n, const cuComplex*, x, int64_t, incx, float*, result);
+DEF_FN(cublasStatus_t, cublasDzasum_v2, cublasHandle_t, handle, int, n, const cuDoubleComplex*, x, int, incx, double*, result);
+DEF_FN(cublasStatus_t, cublasDzasum_v2_64, cublasHandle_t, handle, int64_t, n, const cuDoubleComplex*, x, int64_t, incx, double*, result);
+DEF_FN(cublasStatus_t, cublasSrot_v2, cublasHandle_t, handle, int, n, float*, x, int, incx, float*, y, int, incy, const float*, c, const float*, s);
+DEF_FN(cublasStatus_t, cublasSrot_v2_64, cublasHandle_t, handle, int64_t, n, float*, x, int64_t, incx, float*, y, int64_t, incy, const float*, c, const float*, s);
+DEF_FN(cublasStatus_t, cublasDrot_v2, cublasHandle_t, handle, int, n, double*, x, int, incx, double*, y, int, incy, const double*, c, const double*, s);
+DEF_FN(cublasStatus_t, cublasDrot_v2_64, cublasHandle_t, handle, int64_t, n, double*, x, int64_t, incx, double*, y, int64_t, incy, const double*, c, const double*, s);
+DEF_FN(cublasStatus_t, cublasCrot_v2, cublasHandle_t, handle, int, n, cuComplex*, x, int, incx, cuComplex*, y, int, incy, const float*, c, const cuComplex*, s);
+DEF_FN(cublasStatus_t, cublasCrot_v2_64, cublasHandle_t, handle, int64_t, n, cuComplex*, x, int64_t, incx, cuComplex*, y, int64_t, incy, const float*, c, const cuComplex*, s);
+DEF_FN(cublasStatus_t, cublasCsrot_v2, cublasHandle_t, handle, int, n, cuComplex*, x, int, incx, cuComplex*, y, int, incy, const float*, c, const float*, s);
+DEF_FN(cublasStatus_t, cublasCsrot_v2_64, cublasHandle_t, handle, int64_t, n, cuComplex*, x, int64_t, incx, cuComplex*, y, int64_t, incy, const float*, c, const float*, s);
+DEF_FN(cublasStatus_t, cublasZrot_v2, cublasHandle_t, handle, int, n, cuDoubleComplex*, x, int, incx, cuDoubleComplex*, y, int, incy, const double*, c, const cuDoubleComplex*, s);
+DEF_FN(cublasStatus_t, cublasZrot_v2_64, cublasHandle_t, handle, int64_t, n, cuDoubleComplex*, x, int64_t, incx, cuDoubleComplex*, y, int64_t, incy, const double*, c, const cuDoubleComplex*, s);
+DEF_FN(cublasStatus_t, cublasZdrot_v2, cublasHandle_t, handle, int, n, cuDoubleComplex*, x, int, incx, cuDoubleComplex*, y, int, incy, const double*, c, const double*, s);
+DEF_FN(cublasStatus_t, cublasZdrot_v2_64, cublasHandle_t, handle, int64_t, n, cuDoubleComplex*, x, int64_t, incx, cuDoubleComplex*, y, int64_t, incy, const double*, c, const double*, s);
+DEF_FN(cublasStatus_t, cublasRotEx, cublasHandle_t, handle, int, n, void*, x, cudaDataType, xType, int, incx, void*, y, cudaDataType, yType, int, incy, const void*, c, const void*, s, cudaDataType, csType, cudaDataType, executiontype);
+DEF_FN(cublasStatus_t, cublasRotEx_64, cublasHandle_t, handle, int64_t, n, void*, x, cudaDataType, xType, int64_t, incx, void*, y, cudaDataType, yType, int64_t, incy, const void*, c, const void*, s, cudaDataType, csType, cudaDataType, executiontype);
+DEF_FN(cublasStatus_t, cublasSrotg_v2, cublasHandle_t, handle, float*, a, float*, b, float*, c, float*, s);
+DEF_FN(cublasStatus_t, cublasDrotg_v2, cublasHandle_t, handle, double*, a, double*, b, double*, c, double*, s);
+DEF_FN(cublasStatus_t, cublasCrotg_v2, cublasHandle_t, handle, cuComplex*, a, cuComplex*, b, float*, c, cuComplex*, s);
+DEF_FN(cublasStatus_t, cublasZrotg_v2, cublasHandle_t, handle, cuDoubleComplex*, a, cuDoubleComplex*, b, double*, c, cuDoubleComplex*, s);
+DEF_FN(cublasStatus_t, cublasRotgEx, cublasHandle_t, handle, void*, a, void*, b, cudaDataType, abType, void*, c, void*, s, cudaDataType, csType, cudaDataType, executiontype);
+DEF_FN(cublasStatus_t, cublasSrotm_v2, cublasHandle_t, handle, int, n, float*, x, int, incx, float*, y, int, incy, const float*, param);
+DEF_FN(cublasStatus_t, cublasSrotm_v2_64, cublasHandle_t, handle, int64_t, n, float*, x, int64_t, incx, float*, y, int64_t, incy, const float*, param);
+DEF_FN(cublasStatus_t, cublasDrotm_v2, cublasHandle_t, handle, int, n, double*, x, int, incx, double*, y, int, incy, const double*, param);
+DEF_FN(cublasStatus_t, cublasDrotm_v2_64, cublasHandle_t, handle, int64_t, n, double*, x, int64_t, incx, double*, y, int64_t, incy, const double*, param);
+DEF_FN(cublasStatus_t, cublasRotmEx, cublasHandle_t, handle, int, n, void*, x, cudaDataType, xType, int, incx, void*, y, cudaDataType, yType, int, incy, const void*, param, cudaDataType, paramType, cudaDataType, executiontype);
+DEF_FN(cublasStatus_t, cublasRotmEx_64, cublasHandle_t, handle, int64_t, n, void*, x, cudaDataType, xType, int64_t, incx, void*, y, cudaDataType, yType, int64_t, incy, const void*, param, cudaDataType, paramType, cudaDataType, executiontype);
+DEF_FN(cublasStatus_t, cublasSrotmg_v2, cublasHandle_t, handle, float*, d1, float*, d2, float*, x1, const float*, y1, float*, param);
+DEF_FN(cublasStatus_t, cublasDrotmg_v2, cublasHandle_t, handle, double*, d1, double*, d2, double*, x1, const double*, y1, double*, param);
+DEF_FN(cublasStatus_t, cublasRotmgEx, cublasHandle_t, handle, void*, d1, cudaDataType, d1Type, void*, d2, cudaDataType, d2Type, void*, x1, cudaDataType, x1Type, const void*, y1, cudaDataType, y1Type, void*, param, cudaDataType, paramType, cudaDataType, executiontype);
+
+cublasStatus_t cublasSgemv_v2(cublasHandle_t handle, /* forwards the call over RPC through rpc_cublassgemv_1 */
+                           cublasOperation_t trans,
+                           int m, int n,
+                           const float          *alpha, /* dereferenced on the client below: must be a host pointer */
+                           const float          *A, int lda,
+                           const float          *x, int incx,
+                           const float          *beta, /* dereferenced on the client below: must be a host pointer */
+                           float          *y, int incy)
+{
+#ifdef WITH_API_CNT
+    api_call_cnt++;
+#endif //WITH_API_CNT
+    int result = CUBLAS_STATUS_INTERNAL_ERROR; /* returned as-is if the RPC fails, instead of an uninitialized value */
+    enum clnt_stat retval_1;
+    retval_1 = rpc_cublassgemv_1( /* A, x and y are sent as opaque (ptr) values; alpha and beta by value */
+        (ptr)handle,
+        (int)trans,
+        m, n,
         *alpha,
         (ptr)A, lda,
-        (ptr)B, ldb,
+        (ptr)x, incx,
         *beta,
-        (ptr)C, ldc,
+        (ptr)y, incy,
          &result, clnt);
     if (retval_1 != RPC_SUCCESS) {
         clnt_perror (clnt, "call failed");
@@ -81,91 +308,251 @@ cublasStatus_t cublasDgemm(cublasHandle_t handle,
     return result;
 }
 
-cublasStatus_t cublasSgemm(cublasHandle_t handle,
-                           cublasOperation_t transa, cublasOperation_t transb,
-                           int m, int n, int k,
-                           const float *alpha,
-                           const float *A, int lda,
-                           const float *B, int ldb,
-                           const float *beta,
-                           float *C, int ldc)
+DEF_FN(cublasStatus_t, cublasSgemv_v2_64, cublasHandle_t, handle, cublasOperation_t, trans, int64_t, m, int64_t, n, const float*, alpha, const float*, A, int64_t, lda, const float*, x, int64_t, incx, const float*, beta, float*, y, int64_t, incy);
+
+cublasStatus_t cublasDgemv_v2(cublasHandle_t handle,
+cublasOperation_t trans,
+                           int m, int n,
+                           const double          *alpha,
+                           const double          *A, int lda,
+                           const double          *x, int incx,
+                           const double          *beta,
+                           double          *y, int incy)
 {
 #ifdef WITH_API_CNT
     api_call_cnt++;
 #endif //WITH_API_CNT
     int result;
     enum clnt_stat retval_1;
-    retval_1 = rpc_cublassgemm_1(
+    retval_1 = rpc_cublasdgemv_1( /* alpha/beta are dereferenced and sent by value; handle, A, x, y are passed as ptr casts */
         (ptr)handle,
-        (int)transa,
-        (int)transb,
-        m, n, k,
+        (int)trans,
+        m, n,
         *alpha,
         (ptr)A, lda,
-        (ptr)B, ldb,
+        (ptr)x, incx,
         *beta,
-        (ptr)C, ldc,
+        (ptr)y, incy,  /* NOTE(review): result (next argument) has no initializer and is returned even when the call fails - confirm the stub always writes it */
          &result, clnt);
     if (retval_1 != RPC_SUCCESS) {
         clnt_perror (clnt, "call failed");
     }
     return result;
 }
+DEF_FN(cublasStatus_t, cublasDgemv_v2_64, cublasHandle_t, handle, cublasOperation_t, trans, int64_t, m, int64_t, n, const double*, alpha, const double*, A, int64_t, lda, const double*, x, int64_t, incx, const double*, beta, double*, y, int64_t, incy);
+DEF_FN(cublasStatus_t, cublasCgemv_v2, cublasHandle_t, handle, cublasOperation_t, trans, int, m, int, n, const cuComplex*, alpha, const cuComplex*, A, int, lda, const cuComplex*, x, int, incx, const cuComplex*, beta, cuComplex*, y, int, incy);
+DEF_FN(cublasStatus_t, cublasCgemv_v2_64, cublasHandle_t, handle, cublasOperation_t, trans, int64_t, m, int64_t, n, const cuComplex*, alpha, const cuComplex*, A, int64_t, lda, const cuComplex*, x, int64_t, incx, const cuComplex*, beta, cuComplex*, y, int64_t, incy);
+DEF_FN(cublasStatus_t, cublasZgemv_v2, cublasHandle_t, handle, cublasOperation_t, trans, int, m, int, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int, lda, const cuDoubleComplex*, x, int, incx, const cuDoubleComplex*, beta, cuDoubleComplex*, y, int, incy);
+DEF_FN(cublasStatus_t, cublasZgemv_v2_64, cublasHandle_t, handle, cublasOperation_t, trans, int64_t, m, int64_t, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int64_t, lda, const cuDoubleComplex*, x, int64_t, incx, const cuDoubleComplex*, beta, cuDoubleComplex*, y, int64_t, incy);
+DEF_FN(cublasStatus_t, cublasSgbmv_v2, cublasHandle_t, handle, cublasOperation_t, trans, int, m, int, n, int, kl, int, ku, const float*, alpha, const float*, A, int, lda, const float*, x, int, incx, const float*, beta, float*, y, int, incy);
+DEF_FN(cublasStatus_t, cublasSgbmv_v2_64, cublasHandle_t, handle, cublasOperation_t, trans, int64_t, m, int64_t, n, int64_t, kl, int64_t, ku, const float*, alpha, const float*, A, int64_t, lda, const float*, x, int64_t, incx, const float*, beta, float*, y, int64_t, incy);
+DEF_FN(cublasStatus_t, cublasDgbmv_v2, cublasHandle_t, handle, cublasOperation_t, trans, int, m, int, n, int, kl, int, ku, const double*, alpha, const double*, A, int, lda, const double*, x, int, incx, const double*, beta, double*, y, int, incy);
+DEF_FN(cublasStatus_t, cublasDgbmv_v2_64, cublasHandle_t, handle, cublasOperation_t, trans, int64_t, m, int64_t, n, int64_t, kl, int64_t, ku, const double*, alpha, const double*, A, int64_t, lda, const double*, x, int64_t, incx, const double*, beta, double*, y, int64_t, incy);
+DEF_FN(cublasStatus_t, cublasCgbmv_v2, cublasHandle_t, handle, cublasOperation_t, trans, int, m, int, n, int, kl, int, ku, const cuComplex*, alpha, const cuComplex*, A, int, lda, const cuComplex*, x, int, incx, const cuComplex*, beta, cuComplex*, y, int, incy);
+DEF_FN(cublasStatus_t, cublasCgbmv_v2_64, cublasHandle_t, handle, cublasOperation_t, trans, int64_t, m, int64_t, n, int64_t, kl, int64_t, ku, const cuComplex*, alpha, const cuComplex*, A, int64_t, lda, const cuComplex*, x, int64_t, incx, const cuComplex*, beta, cuComplex*, y, int64_t, incy);
+DEF_FN(cublasStatus_t, cublasZgbmv_v2, cublasHandle_t, handle, cublasOperation_t, trans, int, m, int, n, int, kl, int, ku, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int, lda, const cuDoubleComplex*, x, int, incx, const cuDoubleComplex*, beta, cuDoubleComplex*, y, int, incy);
+DEF_FN(cublasStatus_t, cublasZgbmv_v2_64, cublasHandle_t, handle, cublasOperation_t, trans, int64_t, m, int64_t, n, int64_t, kl, int64_t, ku, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int64_t, lda, const cuDoubleComplex*, x, int64_t, incx, const cuDoubleComplex*, beta, cuDoubleComplex*, y, int64_t, incy);
+DEF_FN(cublasStatus_t, cublasStrmv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, const float*, A, int, lda, float*, x, int, incx);
+DEF_FN(cublasStatus_t, cublasStrmv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, const float*, A, int64_t, lda, float*, x, int64_t, incx);
+DEF_FN(cublasStatus_t, cublasDtrmv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, const double*, A, int, lda, double*, x, int, incx);
+DEF_FN(cublasStatus_t, cublasDtrmv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, const double*, A, int64_t, lda, double*, x, int64_t, incx);
+DEF_FN(cublasStatus_t, cublasCtrmv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, const cuComplex*, A, int, lda, cuComplex*, x, int, incx);
+DEF_FN(cublasStatus_t, cublasCtrmv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, const cuComplex*, A, int64_t, lda, cuComplex*, x, int64_t, incx);
+DEF_FN(cublasStatus_t, cublasZtrmv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, const cuDoubleComplex*, A, int, lda, cuDoubleComplex*, x, int, incx);
+DEF_FN(cublasStatus_t, cublasZtrmv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, const cuDoubleComplex*, A, int64_t, lda, cuDoubleComplex*, x, int64_t, incx);
+DEF_FN(cublasStatus_t, cublasStbmv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, int, k, const float*, A, int, lda, float*, x, int, incx);
+DEF_FN(cublasStatus_t, cublasStbmv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, int64_t, k, const float*, A, int64_t, lda, float*, x, int64_t, incx);
+DEF_FN(cublasStatus_t, cublasDtbmv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, int, k, const double*, A, int, lda, double*, x, int, incx);
+DEF_FN(cublasStatus_t, cublasDtbmv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, int64_t, k, const double*, A, int64_t, lda, double*, x, int64_t, incx);
+DEF_FN(cublasStatus_t, cublasCtbmv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, int, k, const cuComplex*, A, int, lda, cuComplex*, x, int, incx);
+DEF_FN(cublasStatus_t, cublasCtbmv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, int64_t, k, const cuComplex*, A, int64_t, lda, cuComplex*, x, int64_t, incx);
+DEF_FN(cublasStatus_t, cublasZtbmv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, int, k, const cuDoubleComplex*, A, int, lda, cuDoubleComplex*, x, int, incx);
+DEF_FN(cublasStatus_t, cublasZtbmv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, int64_t, k, const cuDoubleComplex*, A, int64_t, lda, cuDoubleComplex*, x, int64_t, incx);
+DEF_FN(cublasStatus_t, cublasStpmv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, const float*, AP, float*, x, int, incx);
+DEF_FN(cublasStatus_t, cublasStpmv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, const float*, AP, float*, x, int64_t, incx);
+DEF_FN(cublasStatus_t, cublasDtpmv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, const double*, AP, double*, x, int, incx);
+DEF_FN(cublasStatus_t, cublasDtpmv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, const double*, AP, double*, x, int64_t, incx);
+DEF_FN(cublasStatus_t, cublasCtpmv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, const cuComplex*, AP, cuComplex*, x, int, incx);
+DEF_FN(cublasStatus_t, cublasCtpmv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, const cuComplex*, AP, cuComplex*, x, int64_t, incx);
+DEF_FN(cublasStatus_t, cublasZtpmv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, const cuDoubleComplex*, AP, cuDoubleComplex*, x, int, incx);
+DEF_FN(cublasStatus_t, cublasZtpmv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, const cuDoubleComplex*, AP, cuDoubleComplex*, x, int64_t, incx);
+DEF_FN(cublasStatus_t, cublasStrsv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, const float*, A, int, lda, float*, x, int, incx);
+DEF_FN(cublasStatus_t, cublasStrsv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, const float*, A, int64_t, lda, float*, x, int64_t, incx);
+DEF_FN(cublasStatus_t, cublasDtrsv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, const double*, A, int, lda, double*, x, int, incx);
+DEF_FN(cublasStatus_t, cublasDtrsv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, const double*, A, int64_t, lda, double*, x, int64_t, incx);
+DEF_FN(cublasStatus_t, cublasCtrsv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, const cuComplex*, A, int, lda, cuComplex*, x, int, incx);
+DEF_FN(cublasStatus_t, cublasCtrsv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, const cuComplex*, A, int64_t, lda, cuComplex*, x, int64_t, incx);
+DEF_FN(cublasStatus_t, cublasZtrsv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, const cuDoubleComplex*, A, int, lda, cuDoubleComplex*, x, int, incx);
+DEF_FN(cublasStatus_t, cublasZtrsv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, const cuDoubleComplex*, A, int64_t, lda, cuDoubleComplex*, x, int64_t, incx);
+DEF_FN(cublasStatus_t, cublasStpsv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, const float*, AP, float*, x, int, incx);
+DEF_FN(cublasStatus_t, cublasStpsv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, const float*, AP, float*, x, int64_t, incx);
+DEF_FN(cublasStatus_t, cublasDtpsv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, const double*, AP, double*, x, int, incx);
+DEF_FN(cublasStatus_t, cublasDtpsv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, const double*, AP, double*, x, int64_t, incx);
+DEF_FN(cublasStatus_t, cublasCtpsv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, const cuComplex*, AP, cuComplex*, x, int, incx);
+DEF_FN(cublasStatus_t, cublasCtpsv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, const cuComplex*, AP, cuComplex*, x, int64_t, incx);
+DEF_FN(cublasStatus_t, cublasZtpsv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, const cuDoubleComplex*, AP, cuDoubleComplex*, x, int, incx);
+DEF_FN(cublasStatus_t, cublasZtpsv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, const cuDoubleComplex*, AP, cuDoubleComplex*, x, int64_t, incx);
+DEF_FN(cublasStatus_t, cublasStbsv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, int, k, const float*, A, int, lda, float*, x, int, incx);
+DEF_FN(cublasStatus_t, cublasStbsv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, int64_t, k, const float*, A, int64_t, lda, float*, x, int64_t, incx);
+DEF_FN(cublasStatus_t, cublasDtbsv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, int, k, const double*, A, int, lda, double*, x, int, incx);
+DEF_FN(cublasStatus_t, cublasDtbsv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, int64_t, k, const double*, A, int64_t, lda, double*, x, int64_t, incx);
+DEF_FN(cublasStatus_t, cublasCtbsv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, int, k, const cuComplex*, A, int, lda, cuComplex*, x, int, incx);
+DEF_FN(cublasStatus_t, cublasCtbsv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, int64_t, k, const cuComplex*, A, int64_t, lda, cuComplex*, x, int64_t, incx);
+DEF_FN(cublasStatus_t, cublasZtbsv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, n, int, k, const cuDoubleComplex*, A, int, lda, cuDoubleComplex*, x, int, incx);
+DEF_FN(cublasStatus_t, cublasZtbsv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, n, int64_t, k, const cuDoubleComplex*, A, int64_t, lda, cuDoubleComplex*, x, int64_t, incx);
+DEF_FN(cublasStatus_t, cublasSsymv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const float*, alpha, const float*, A, int, lda, const float*, x, int, incx, const float*, beta, float*, y, int, incy);
+DEF_FN(cublasStatus_t, cublasSsymv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const float*, alpha, const float*, A, int64_t, lda, const float*, x, int64_t, incx, const float*, beta, float*, y, int64_t, incy);
+DEF_FN(cublasStatus_t, cublasDsymv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const double*, alpha, const double*, A, int, lda, const double*, x, int, incx, const double*, beta, double*, y, int, incy);
+DEF_FN(cublasStatus_t, cublasDsymv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const double*, alpha, const double*, A, int64_t, lda, const double*, x, int64_t, incx, const double*, beta, double*, y, int64_t, incy);
+DEF_FN(cublasStatus_t, cublasCsymv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const cuComplex*, alpha, const cuComplex*, A, int, lda, const cuComplex*, x, int, incx, const cuComplex*, beta, cuComplex*, y, int, incy);
+DEF_FN(cublasStatus_t, cublasCsymv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const cuComplex*, alpha, const cuComplex*, A, int64_t, lda, const cuComplex*, x, int64_t, incx, const cuComplex*, beta, cuComplex*, y, int64_t, incy);
+DEF_FN(cublasStatus_t, cublasZsymv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int, lda, const cuDoubleComplex*, x, int, incx, const cuDoubleComplex*, beta, cuDoubleComplex*, y, int, incy);
+DEF_FN(cublasStatus_t, cublasZsymv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int64_t, lda, const cuDoubleComplex*, x, int64_t, incx, const cuDoubleComplex*, beta, cuDoubleComplex*, y, int64_t, incy);
+DEF_FN(cublasStatus_t, cublasChemv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const cuComplex*, alpha, const cuComplex*, A, int, lda, const cuComplex*, x, int, incx, const cuComplex*, beta, cuComplex*, y, int, incy);
+DEF_FN(cublasStatus_t, cublasChemv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const cuComplex*, alpha, const cuComplex*, A, int64_t, lda, const cuComplex*, x, int64_t, incx, const cuComplex*, beta, cuComplex*, y, int64_t, incy);
+DEF_FN(cublasStatus_t, cublasZhemv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int, lda, const cuDoubleComplex*, x, int, incx, const cuDoubleComplex*, beta, cuDoubleComplex*, y, int, incy);
+DEF_FN(cublasStatus_t, cublasZhemv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int64_t, lda, const cuDoubleComplex*, x, int64_t, incx, const cuDoubleComplex*, beta, cuDoubleComplex*, y, int64_t, incy);
+DEF_FN(cublasStatus_t, cublasSsbmv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, int, k, const float*, alpha, const float*, A, int, lda, const float*, x, int, incx, const float*, beta, float*, y, int, incy);
+DEF_FN(cublasStatus_t, cublasSsbmv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, int64_t, k, const float*, alpha, const float*, A, int64_t, lda, const float*, x, int64_t, incx, const float*, beta, float*, y, int64_t, incy);
+DEF_FN(cublasStatus_t, cublasDsbmv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, int, k, const double*, alpha, const double*, A, int, lda, const double*, x, int, incx, const double*, beta, double*, y, int, incy);
+DEF_FN(cublasStatus_t, cublasDsbmv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, int64_t, k, const double*, alpha, const double*, A, int64_t, lda, const double*, x, int64_t, incx, const double*, beta, double*, y, int64_t, incy);
+DEF_FN(cublasStatus_t, cublasChbmv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, int, k, const cuComplex*, alpha, const cuComplex*, A, int, lda, const cuComplex*, x, int, incx, const cuComplex*, beta, cuComplex*, y, int, incy);
+DEF_FN(cublasStatus_t, cublasChbmv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, int64_t, k, const cuComplex*, alpha, const cuComplex*, A, int64_t, lda, const cuComplex*, x, int64_t, incx, const cuComplex*, beta, cuComplex*, y, int64_t, incy);
+DEF_FN(cublasStatus_t, cublasZhbmv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, int, k, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int, lda, const cuDoubleComplex*, x, int, incx, const cuDoubleComplex*, beta, cuDoubleComplex*, y, int, incy);
+DEF_FN(cublasStatus_t, cublasZhbmv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, int64_t, k, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int64_t, lda, const cuDoubleComplex*, x, int64_t, incx, const cuDoubleComplex*, beta, cuDoubleComplex*, y, int64_t, incy);
+DEF_FN(cublasStatus_t, cublasSspmv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const float*, alpha, const float*, AP, const float*, x, int, incx, const float*, beta, float*, y, int, incy);
+DEF_FN(cublasStatus_t, cublasSspmv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const float*, alpha, const float*, AP, const float*, x, int64_t, incx, const float*, beta, float*, y, int64_t, incy);
+DEF_FN(cublasStatus_t, cublasDspmv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const double*, alpha, const double*, AP, const double*, x, int, incx, const double*, beta, double*, y, int, incy);
+DEF_FN(cublasStatus_t, cublasDspmv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const double*, alpha, const double*, AP, const double*, x, int64_t, incx, const double*, beta, double*, y, int64_t, incy);
+DEF_FN(cublasStatus_t, cublasChpmv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const cuComplex*, alpha, const cuComplex*, AP, const cuComplex*, x, int, incx, const cuComplex*, beta, cuComplex*, y, int, incy);
+DEF_FN(cublasStatus_t, cublasChpmv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const cuComplex*, alpha, const cuComplex*, AP, const cuComplex*, x, int64_t, incx, const cuComplex*, beta, cuComplex*, y, int64_t, incy);
+DEF_FN(cublasStatus_t, cublasZhpmv_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, AP, const cuDoubleComplex*, x, int, incx, const cuDoubleComplex*, beta, cuDoubleComplex*, y, int, incy);
+DEF_FN(cublasStatus_t, cublasZhpmv_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, AP, const cuDoubleComplex*, x, int64_t, incx, const cuDoubleComplex*, beta, cuDoubleComplex*, y, int64_t, incy);
+DEF_FN(cublasStatus_t, cublasSger_v2, cublasHandle_t, handle, int, m, int, n, const float*, alpha, const float*, x, int, incx, const float*, y, int, incy, float*, A, int, lda);
+DEF_FN(cublasStatus_t, cublasSger_v2_64, cublasHandle_t, handle, int64_t, m, int64_t, n, const float*, alpha, const float*, x, int64_t, incx, const float*, y, int64_t, incy, float*, A, int64_t, lda);
+DEF_FN(cublasStatus_t, cublasDger_v2, cublasHandle_t, handle, int, m, int, n, const double*, alpha, const double*, x, int, incx, const double*, y, int, incy, double*, A, int, lda);
+DEF_FN(cublasStatus_t, cublasDger_v2_64, cublasHandle_t, handle, int64_t, m, int64_t, n, const double*, alpha, const double*, x, int64_t, incx, const double*, y, int64_t, incy, double*, A, int64_t, lda);
+DEF_FN(cublasStatus_t, cublasCgeru_v2, cublasHandle_t, handle, int, m, int, n, const cuComplex*, alpha, const cuComplex*, x, int, incx, const cuComplex*, y, int, incy, cuComplex*, A, int, lda);
+DEF_FN(cublasStatus_t, cublasCgeru_v2_64, cublasHandle_t, handle, int64_t, m, int64_t, n, const cuComplex*, alpha, const cuComplex*, x, int64_t, incx, const cuComplex*, y, int64_t, incy, cuComplex*, A, int64_t, lda);
+DEF_FN(cublasStatus_t, cublasCgerc_v2, cublasHandle_t, handle, int, m, int, n, const cuComplex*, alpha, const cuComplex*, x, int, incx, const cuComplex*, y, int, incy, cuComplex*, A, int, lda);
+DEF_FN(cublasStatus_t, cublasCgerc_v2_64, cublasHandle_t, handle, int64_t, m, int64_t, n, const cuComplex*, alpha, const cuComplex*, x, int64_t, incx, const cuComplex*, y, int64_t, incy, cuComplex*, A, int64_t, lda);
+DEF_FN(cublasStatus_t, cublasZgeru_v2, cublasHandle_t, handle, int, m, int, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, x, int, incx, const cuDoubleComplex*, y, int, incy, cuDoubleComplex*, A, int, lda);
+DEF_FN(cublasStatus_t, cublasZgeru_v2_64, cublasHandle_t, handle, int64_t, m, int64_t, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, x, int64_t, incx, const cuDoubleComplex*, y, int64_t, incy, cuDoubleComplex*, A, int64_t, lda);
+DEF_FN(cublasStatus_t, cublasZgerc_v2, cublasHandle_t, handle, int, m, int, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, x, int, incx, const cuDoubleComplex*, y, int, incy, cuDoubleComplex*, A, int, lda);
+DEF_FN(cublasStatus_t, cublasZgerc_v2_64, cublasHandle_t, handle, int64_t, m, int64_t, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, x, int64_t, incx, const cuDoubleComplex*, y, int64_t, incy, cuDoubleComplex*, A, int64_t, lda);
+DEF_FN(cublasStatus_t, cublasSsyr_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const float*, alpha, const float*, x, int, incx, float*, A, int, lda);
+DEF_FN(cublasStatus_t, cublasSsyr_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const float*, alpha, const float*, x, int64_t, incx, float*, A, int64_t, lda);
+DEF_FN(cublasStatus_t, cublasDsyr_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const double*, alpha, const double*, x, int, incx, double*, A, int, lda);
+DEF_FN(cublasStatus_t, cublasDsyr_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const double*, alpha, const double*, x, int64_t, incx, double*, A, int64_t, lda);
+DEF_FN(cublasStatus_t, cublasCsyr_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const cuComplex*, alpha, const cuComplex*, x, int, incx, cuComplex*, A, int, lda);
+DEF_FN(cublasStatus_t, cublasCsyr_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const cuComplex*, alpha, const cuComplex*, x, int64_t, incx, cuComplex*, A, int64_t, lda);
+DEF_FN(cublasStatus_t, cublasZsyr_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, x, int, incx, cuDoubleComplex*, A, int, lda);
+DEF_FN(cublasStatus_t, cublasZsyr_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, x, int64_t, incx, cuDoubleComplex*, A, int64_t, lda);
+DEF_FN(cublasStatus_t, cublasCher_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const float*, alpha, const cuComplex*, x, int, incx, cuComplex*, A, int, lda);
+DEF_FN(cublasStatus_t, cublasCher_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const float*, alpha, const cuComplex*, x, int64_t, incx, cuComplex*, A, int64_t, lda);
+DEF_FN(cublasStatus_t, cublasZher_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const double*, alpha, const cuDoubleComplex*, x, int, incx, cuDoubleComplex*, A, int, lda);
+DEF_FN(cublasStatus_t, cublasZher_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const double*, alpha, const cuDoubleComplex*, x, int64_t, incx, cuDoubleComplex*, A, int64_t, lda);
+DEF_FN(cublasStatus_t, cublasSspr_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const float*, alpha, const float*, x, int, incx, float*, AP);
+DEF_FN(cublasStatus_t, cublasSspr_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const float*, alpha, const float*, x, int64_t, incx, float*, AP);
+DEF_FN(cublasStatus_t, cublasDspr_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const double*, alpha, const double*, x, int, incx, double*, AP);
+DEF_FN(cublasStatus_t, cublasDspr_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const double*, alpha, const double*, x, int64_t, incx, double*, AP);
+DEF_FN(cublasStatus_t, cublasChpr_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const float*, alpha, const cuComplex*, x, int, incx, cuComplex*, AP);
+DEF_FN(cublasStatus_t, cublasChpr_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const float*, alpha, const cuComplex*, x, int64_t, incx, cuComplex*, AP);
+DEF_FN(cublasStatus_t, cublasZhpr_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const double*, alpha, const cuDoubleComplex*, x, int, incx, cuDoubleComplex*, AP);
+DEF_FN(cublasStatus_t, cublasZhpr_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const double*, alpha, const cuDoubleComplex*, x, int64_t, incx, cuDoubleComplex*, AP);
+DEF_FN(cublasStatus_t, cublasSsyr2_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const float*, alpha, const float*, x, int, incx, const float*, y, int, incy, float*, A, int, lda);
+DEF_FN(cublasStatus_t, cublasSsyr2_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const float*, alpha, const float*, x, int64_t, incx, const float*, y, int64_t, incy, float*, A, int64_t, lda);
+DEF_FN(cublasStatus_t, cublasDsyr2_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const double*, alpha, const double*, x, int, incx, const double*, y, int, incy, double*, A, int, lda);
+DEF_FN(cublasStatus_t, cublasDsyr2_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const double*, alpha, const double*, x, int64_t, incx, const double*, y, int64_t, incy, double*, A, int64_t, lda);
+DEF_FN(cublasStatus_t, cublasCsyr2_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const cuComplex*, alpha, const cuComplex*, x, int, incx, const cuComplex*, y, int, incy, cuComplex*, A, int, lda);
+DEF_FN(cublasStatus_t, cublasCsyr2_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const cuComplex*, alpha, const cuComplex*, x, int64_t, incx, const cuComplex*, y, int64_t, incy, cuComplex*, A, int64_t, lda);
+DEF_FN(cublasStatus_t, cublasZsyr2_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, x, int, incx, const cuDoubleComplex*, y, int, incy, cuDoubleComplex*, A, int, lda);
+DEF_FN(cublasStatus_t, cublasZsyr2_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, x, int64_t, incx, const cuDoubleComplex*, y, int64_t, incy, cuDoubleComplex*, A, int64_t, lda);
+DEF_FN(cublasStatus_t, cublasCher2_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const cuComplex*, alpha, const cuComplex*, x, int, incx, const cuComplex*, y, int, incy, cuComplex*, A, int, lda);
+DEF_FN(cublasStatus_t, cublasCher2_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const cuComplex*, alpha, const cuComplex*, x, int64_t, incx, const cuComplex*, y, int64_t, incy, cuComplex*, A, int64_t, lda);
+DEF_FN(cublasStatus_t, cublasZher2_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, x, int, incx, const cuDoubleComplex*, y, int, incy, cuDoubleComplex*, A, int, lda);
+DEF_FN(cublasStatus_t, cublasZher2_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, x, int64_t, incx, const cuDoubleComplex*, y, int64_t, incy, cuDoubleComplex*, A, int64_t, lda);
+DEF_FN(cublasStatus_t, cublasSspr2_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const float*, alpha, const float*, x, int, incx, const float*, y, int, incy, float*, AP);
+DEF_FN(cublasStatus_t, cublasSspr2_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const float*, alpha, const float*, x, int64_t, incx, const float*, y, int64_t, incy, float*, AP);
+DEF_FN(cublasStatus_t, cublasDspr2_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const double*, alpha, const double*, x, int, incx, const double*, y, int, incy, double*, AP);
+DEF_FN(cublasStatus_t, cublasDspr2_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const double*, alpha, const double*, x, int64_t, incx, const double*, y, int64_t, incy, double*, AP);
+DEF_FN(cublasStatus_t, cublasChpr2_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const cuComplex*, alpha, const cuComplex*, x, int, incx, const cuComplex*, y, int, incy, cuComplex*, AP);
+DEF_FN(cublasStatus_t, cublasChpr2_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const cuComplex*, alpha, const cuComplex*, x, int64_t, incx, const cuComplex*, y, int64_t, incy, cuComplex*, AP);
+DEF_FN(cublasStatus_t, cublasZhpr2_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, x, int, incx, const cuDoubleComplex*, y, int, incy, cuDoubleComplex*, AP);
+DEF_FN(cublasStatus_t, cublasZhpr2_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, int64_t, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, x, int64_t, incx, const cuDoubleComplex*, y, int64_t, incy, cuDoubleComplex*, AP);
+DEF_FN(cublasStatus_t, cublasSgemvBatched, cublasHandle_t, handle, cublasOperation_t, trans, int, m, int, n, const float*, alpha, const float* const*,  Aarray, int, lda, const float* const*,  xarray, int, incx, const float*, beta, float* const*,  yarray, int, incy, int, batchCount);
+DEF_FN(cublasStatus_t, cublasSgemvBatched_64, cublasHandle_t, handle, cublasOperation_t, trans, int64_t, m, int64_t, n, const float*, alpha, const float* const*,  Aarray, int64_t, lda, const float* const*,  xarray, int64_t, incx, const float*, beta, float* const*,  yarray, int64_t, incy, int64_t, batchCount);
+DEF_FN(cublasStatus_t, cublasDgemvBatched, cublasHandle_t, handle, cublasOperation_t, trans, int, m, int, n, const double*, alpha, const double* const*,  Aarray, int, lda, const double* const*,  xarray, int, incx, const double*, beta, double* const*,  yarray, int, incy, int, batchCount);
+DEF_FN(cublasStatus_t, cublasDgemvBatched_64, cublasHandle_t, handle, cublasOperation_t, trans, int64_t, m, int64_t, n, const double*, alpha, const double* const*,  Aarray, int64_t, lda, const double* const*,  xarray, int64_t, incx, const double*, beta, double* const*,  yarray, int64_t, incy, int64_t, batchCount);
+DEF_FN(cublasStatus_t, cublasCgemvBatched, cublasHandle_t, handle, cublasOperation_t, trans, int, m, int, n, const cuComplex*, alpha, const cuComplex* const*,  Aarray, int, lda, const cuComplex* const*,  xarray, int, incx, const cuComplex*, beta, cuComplex* const*,  yarray, int, incy, int, batchCount);
+DEF_FN(cublasStatus_t, cublasCgemvBatched_64, cublasHandle_t, handle, cublasOperation_t, trans, int64_t, m, int64_t, n, const cuComplex*, alpha, const cuComplex* const*,  Aarray, int64_t, lda, const cuComplex* const*,  xarray, int64_t, incx, const cuComplex*, beta, cuComplex* const*,  yarray, int64_t, incy, int64_t, batchCount);
+DEF_FN(cublasStatus_t, cublasZgemvBatched, cublasHandle_t, handle, cublasOperation_t, trans, int, m, int, n, const cuDoubleComplex*, alpha, const cuDoubleComplex* const*,  Aarray, int, lda, const cuDoubleComplex* const*,  xarray, int, incx, const cuDoubleComplex*, beta, cuDoubleComplex* const*,  yarray, int, incy, int, batchCount);
+DEF_FN(cublasStatus_t, cublasZgemvBatched_64, cublasHandle_t, handle, cublasOperation_t, trans, int64_t, m, int64_t, n, const cuDoubleComplex*, alpha, const cuDoubleComplex* const*,  Aarray, int64_t, lda, const cuDoubleComplex* const*,  xarray, int64_t, incx, const cuDoubleComplex*, beta, cuDoubleComplex* const*,  yarray, int64_t, incy, int64_t, batchCount);
+DEF_FN(cublasStatus_t, cublasSgemvStridedBatched, cublasHandle_t, handle, cublasOperation_t, trans, int, m, int, n, const float*, alpha, const float*, A, int, lda, long long int, strideA, const float*, x, int, incx, long long int, stridex, const float*, beta, float*, y, int, incy, long long int, stridey, int, batchCount);
+DEF_FN(cublasStatus_t, cublasSgemvStridedBatched_64, cublasHandle_t, handle, cublasOperation_t, trans, int64_t, m, int64_t, n, const float*, alpha, const float*, A, int64_t, lda, long long int, strideA, const float*, x, int64_t, incx, long long int, stridex, const float*, beta, float*, y, int64_t, incy, long long int, stridey, int64_t, batchCount);
+DEF_FN(cublasStatus_t, cublasDgemvStridedBatched, cublasHandle_t, handle, cublasOperation_t, trans, int, m, int, n, const double*, alpha, const double*, A, int, lda, long long int, strideA, const double*, x, int, incx, long long int, stridex, const double*, beta, double*, y, int, incy, long long int, stridey, int, batchCount);
+DEF_FN(cublasStatus_t, cublasDgemvStridedBatched_64, cublasHandle_t, handle, cublasOperation_t, trans, int64_t, m, int64_t, n, const double*, alpha, const double*, A, int64_t, lda, long long int, strideA, const double*, x, int64_t, incx, long long int, stridex, const double*, beta, double*, y, int64_t, incy, long long int, stridey, int64_t, batchCount);
+DEF_FN(cublasStatus_t, cublasCgemvStridedBatched, cublasHandle_t, handle, cublasOperation_t, trans, int, m, int, n, const cuComplex*, alpha, const cuComplex*, A, int, lda, long long int, strideA, const cuComplex*, x, int, incx, long long int, stridex, const cuComplex*, beta, cuComplex*, y, int, incy, long long int, stridey, int, batchCount);
+DEF_FN(cublasStatus_t, cublasCgemvStridedBatched_64, cublasHandle_t, handle, cublasOperation_t, trans, int64_t, m, int64_t, n, const cuComplex*, alpha, const cuComplex*, A, int64_t, lda, long long int, strideA, const cuComplex*, x, int64_t, incx, long long int, stridex, const cuComplex*, beta, cuComplex*, y, int64_t, incy, long long int, stridey, int64_t, batchCount);
+DEF_FN(cublasStatus_t, cublasZgemvStridedBatched, cublasHandle_t, handle, cublasOperation_t, trans, int, m, int, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int, lda, long long int, strideA, const cuDoubleComplex*, x, int, incx, long long int, stridex, const cuDoubleComplex*, beta, cuDoubleComplex*, y, int, incy, long long int, stridey, int, batchCount);
+DEF_FN(cublasStatus_t, cublasZgemvStridedBatched_64, cublasHandle_t, handle, cublasOperation_t, trans, int64_t, m, int64_t, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int64_t, lda, long long int, strideA, const cuDoubleComplex*, x, int64_t, incx, long long int, stridex, const cuDoubleComplex*, beta, cuDoubleComplex*, y, int64_t, incy, long long int, stridey, int64_t, batchCount);
 
-cublasStatus_t cublasSgemmEx(cublasHandle_t handle,
+cublasStatus_t cublasSgemm_v2(cublasHandle_t handle,
                            cublasOperation_t transa, cublasOperation_t transb,
                            int m, int n, int k,
                            const float *alpha,
-                           const void *A, cudaDataType_t Atype, int lda,
-                           const void *B, cudaDataType_t Btype, int ldb,
+                           const float *A, int lda,
+                           const float *B, int ldb,
                            const float *beta,
-                           void *C, cudaDataType_t Ctype, int ldc)
+                           float *C, int ldc)
 {
 #ifdef WITH_API_CNT
     api_call_cnt++;
 #endif //WITH_API_CNT
     int result;
     enum clnt_stat retval_1;
-    retval_1 = rpc_cublassgemmex_1(
+    retval_1 = rpc_cublassgemm_1(
         (ptr)handle,
         (int)transa,
         (int)transb,
         m, n, k,
         *alpha,
-        (ptr)A, (int)Atype, lda,
-        (ptr)B, (int)Btype, ldb,
+        (ptr)A, lda,
+        (ptr)B, ldb,
         *beta,
-        (ptr)C, (int)Ctype, ldc,
+        (ptr)C, ldc,
          &result, clnt);
     if (retval_1 != RPC_SUCCESS) {
         clnt_perror (clnt, "call failed");
     }
     return result;
 }
+DEF_FN(cublasStatus_t, cublasSgemm_v2_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const float*, alpha, const float*, A, int64_t, lda, const float*, B, int64_t, ldb, const float*, beta, float*, C, int64_t, ldc);
 
-cublasStatus_t cublasDgemv(cublasHandle_t handle,
-                           cublasOperation_t trans,
-                           int m, int n,
+cublasStatus_t cublasDgemm_v2(cublasHandle_t handle,
+                           cublasOperation_t transa, cublasOperation_t transb,
+                           int m, int n, int k,
                            const double          *alpha,
                            const double          *A, int lda,
-                           const double          *x, int incx,
+                           const double          *B, int ldb,
                            const double          *beta,
-                           double          *y, int incy)
+                           double          *C, int ldc)
 {
 #ifdef WITH_API_CNT
     api_call_cnt++;
 #endif //WITH_API_CNT
     int result;
     enum clnt_stat retval_1;
-    retval_1 = rpc_cublasdgemv_1(
+    retval_1 = rpc_cublasdgemm_1(
         (ptr)handle,
-        (int)trans,
-        m, n,
+        (int)transa,
+        (int)transb,
+        m, n, k,
         *alpha,
         (ptr)A, lda,
-        (ptr)x, incx,
+        (ptr)B, ldb,
         *beta,
-        (ptr)y, incy,
+        (ptr)C, ldc,
          &result, clnt);
     if (retval_1 != RPC_SUCCESS) {
         clnt_perror (clnt, "call failed");
@@ -173,32 +560,204 @@ cublasStatus_t cublasDgemv(cublasHandle_t handle,
     return result;
 }
 
-cublasStatus_t cublasSgemv(cublasHandle_t handle,
-                           cublasOperation_t trans,
-                           int m, int n,
-                           const float          *alpha,
-                           const float          *A, int lda,
-                           const float          *x, int incx,
-                           const float          *beta,
-                           float          *y, int incy)
+DEF_FN(cublasStatus_t, cublasDgemm_v2_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const double*, alpha, const double*, A, int64_t, lda, const double*, B, int64_t, ldb, const double*, beta, double*, C, int64_t, ldc);
+DEF_FN(cublasStatus_t, cublasCgemm_v2, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, int, k, const cuComplex*, alpha, const cuComplex*, A, int, lda, const cuComplex*, B, int, ldb, const cuComplex*, beta, cuComplex*, C, int, ldc);
+DEF_FN(cublasStatus_t, cublasCgemm_v2_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const cuComplex*, alpha, const cuComplex*, A, int64_t, lda, const cuComplex*, B, int64_t, ldb, const cuComplex*, beta, cuComplex*, C, int64_t, ldc);
+DEF_FN(cublasStatus_t, cublasCgemm3m, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, int, k, const cuComplex*, alpha, const cuComplex*, A, int, lda, const cuComplex*, B, int, ldb, const cuComplex*, beta, cuComplex*, C, int, ldc);
+DEF_FN(cublasStatus_t, cublasCgemm3m_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const cuComplex*, alpha, const cuComplex*, A, int64_t, lda, const cuComplex*, B, int64_t, ldb, const cuComplex*, beta, cuComplex*, C, int64_t, ldc);
+DEF_FN(cublasStatus_t, cublasCgemm3mEx, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, int, k, const cuComplex*, alpha, const void*, A, cudaDataType, Atype, int, lda, const void*, B, cudaDataType, Btype, int, ldb, const cuComplex*, beta, void*, C, cudaDataType, Ctype, int, ldc);
+DEF_FN(cublasStatus_t, cublasCgemm3mEx_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const cuComplex*, alpha, const void*, A, cudaDataType, Atype, int64_t, lda, const void*, B, cudaDataType, Btype, int64_t, ldb, const cuComplex*, beta, void*, C, cudaDataType, Ctype, int64_t, ldc);
+DEF_FN(cublasStatus_t, cublasZgemm_v2, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, int, k, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int, lda, const cuDoubleComplex*, B, int, ldb, const cuDoubleComplex*, beta, cuDoubleComplex*, C, int, ldc);
+DEF_FN(cublasStatus_t, cublasZgemm_v2_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int64_t, lda, const cuDoubleComplex*, B, int64_t, ldb, const cuDoubleComplex*, beta, cuDoubleComplex*, C, int64_t, ldc);
+DEF_FN(cublasStatus_t, cublasZgemm3m, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, int, k, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int, lda, const cuDoubleComplex*, B, int, ldb, const cuDoubleComplex*, beta, cuDoubleComplex*, C, int, ldc);
+DEF_FN(cublasStatus_t, cublasZgemm3m_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int64_t, lda, const cuDoubleComplex*, B, int64_t, ldb, const cuDoubleComplex*, beta, cuDoubleComplex*, C, int64_t, ldc);
+
+cublasStatus_t cublasSgemmEx(cublasHandle_t handle,
+                           cublasOperation_t transa, cublasOperation_t transb,
+                           int m, int n, int k,
+                           const float *alpha,
+                           const void *A, cudaDataType_t Atype, int lda,
+                           const void *B, cudaDataType_t Btype, int ldb,
+                           const float *beta,
+                           void *C, cudaDataType_t Ctype, int ldc)
 {
 #ifdef WITH_API_CNT
     api_call_cnt++;
 #endif //WITH_API_CNT
     int result;
     enum clnt_stat retval_1;
-    retval_1 = rpc_cublassgemv_1(
+    retval_1 = rpc_cublassgemmex_1(
         (ptr)handle,
-        (int)trans,
-        m, n,
+        (int)transa,
+        (int)transb,
+        m, n, k,
         *alpha,
-        (ptr)A, lda,
-        (ptr)x, incx,
+        (ptr)A, (int)Atype, lda,
+        (ptr)B, (int)Btype, ldb,
         *beta,
-        (ptr)y, incy,
+        (ptr)C, (int)Ctype, ldc,
          &result, clnt);
     if (retval_1 != RPC_SUCCESS) {
         clnt_perror (clnt, "call failed");
     }
     return result;
-}
\ No newline at end of file
+}
+
+
+DEF_FN(cublasStatus_t, cublasSgemmEx_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const float*, alpha, const void*, A, cudaDataType, Atype, int64_t, lda, const void*, B, cudaDataType, Btype, int64_t, ldb, const float*, beta, void*, C, cudaDataType, Ctype, int64_t, ldc);
+DEF_FN(cublasStatus_t, cublasGemmEx, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, int, k, const void*, alpha, const void*, A, cudaDataType, Atype, int, lda, const void*, B, cudaDataType, Btype, int, ldb, const void*, beta, void*, C, cudaDataType, Ctype, int, ldc, cublasComputeType_t, computeType, cublasGemmAlgo_t, algo);
+DEF_FN(cublasStatus_t, cublasGemmEx_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const void*, alpha, const void*, A, cudaDataType, Atype, int64_t, lda, const void*, B, cudaDataType, Btype, int64_t, ldb, const void*, beta, void*, C, cudaDataType, Ctype, int64_t, ldc, cublasComputeType_t, computeType, cublasGemmAlgo_t, algo);
+DEF_FN(cublasStatus_t, cublasCgemmEx, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, int, k, const cuComplex*, alpha, const void*, A, cudaDataType, Atype, int, lda, const void*, B, cudaDataType, Btype, int, ldb, const cuComplex*, beta, void*, C, cudaDataType, Ctype, int, ldc);
+DEF_FN(cublasStatus_t, cublasCgemmEx_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const cuComplex*, alpha, const void*, A, cudaDataType, Atype, int64_t, lda, const void*, B, cudaDataType, Btype, int64_t, ldb, const cuComplex*, beta, void*, C, cudaDataType, Ctype, int64_t, ldc);
+DEF_FN(cublasStatus_t, cublasSsyrk_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int, n, int, k, const float*, alpha, const float*, A, int, lda, const float*, beta, float*, C, int, ldc);
+DEF_FN(cublasStatus_t, cublasSsyrk_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int64_t, n, int64_t, k, const float*, alpha, const float*, A, int64_t, lda, const float*, beta, float*, C, int64_t, ldc);
+DEF_FN(cublasStatus_t, cublasDsyrk_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int, n, int, k, const double*, alpha, const double*, A, int, lda, const double*, beta, double*, C, int, ldc);
+DEF_FN(cublasStatus_t, cublasDsyrk_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int64_t, n, int64_t, k, const double*, alpha, const double*, A, int64_t, lda, const double*, beta, double*, C, int64_t, ldc);
+DEF_FN(cublasStatus_t, cublasCsyrk_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int, n, int, k, const cuComplex*, alpha, const cuComplex*, A, int, lda, const cuComplex*, beta, cuComplex*, C, int, ldc);
+DEF_FN(cublasStatus_t, cublasCsyrk_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int64_t, n, int64_t, k, const cuComplex*, alpha, const cuComplex*, A, int64_t, lda, const cuComplex*, beta, cuComplex*, C, int64_t, ldc);
+DEF_FN(cublasStatus_t, cublasZsyrk_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int, n, int, k, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int, lda, const cuDoubleComplex*, beta, cuDoubleComplex*, C, int, ldc);
+DEF_FN(cublasStatus_t, cublasZsyrk_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int64_t, n, int64_t, k, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int64_t, lda, const cuDoubleComplex*, beta, cuDoubleComplex*, C, int64_t, ldc);
+DEF_FN(cublasStatus_t, cublasCsyrkEx, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int, n, int, k, const cuComplex*, alpha, const void*, A, cudaDataType, Atype, int, lda, const cuComplex*, beta, void*, C, cudaDataType, Ctype, int, ldc);
+DEF_FN(cublasStatus_t, cublasCsyrkEx_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int64_t, n, int64_t, k, const cuComplex*, alpha, const void*, A, cudaDataType, Atype, int64_t, lda, const cuComplex*, beta, void*, C, cudaDataType, Ctype, int64_t, ldc);
+DEF_FN(cublasStatus_t, cublasCsyrk3mEx, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int, n, int, k, const cuComplex*, alpha, const void*, A, cudaDataType, Atype, int, lda, const cuComplex*, beta, void*, C, cudaDataType, Ctype, int, ldc);
+DEF_FN(cublasStatus_t, cublasCsyrk3mEx_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int64_t, n, int64_t, k, const cuComplex*, alpha, const void*, A, cudaDataType, Atype, int64_t, lda, const cuComplex*, beta, void*, C, cudaDataType, Ctype, int64_t, ldc);
+DEF_FN(cublasStatus_t, cublasCherk_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int, n, int, k, const float*, alpha, const cuComplex*, A, int, lda, const float*, beta, cuComplex*, C, int, ldc);
+DEF_FN(cublasStatus_t, cublasCherk_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int64_t, n, int64_t, k, const float*, alpha, const cuComplex*, A, int64_t, lda, const float*, beta, cuComplex*, C, int64_t, ldc);
+DEF_FN(cublasStatus_t, cublasZherk_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int, n, int, k, const double*, alpha, const cuDoubleComplex*, A, int, lda, const double*, beta, cuDoubleComplex*, C, int, ldc);
+DEF_FN(cublasStatus_t, cublasZherk_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int64_t, n, int64_t, k, const double*, alpha, const cuDoubleComplex*, A, int64_t, lda, const double*, beta, cuDoubleComplex*, C, int64_t, ldc);
+DEF_FN(cublasStatus_t, cublasCherkEx, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int, n, int, k, const float*, alpha, const void*, A, cudaDataType, Atype, int, lda, const float*, beta, void*, C, cudaDataType, Ctype, int, ldc);
+DEF_FN(cublasStatus_t, cublasCherkEx_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int64_t, n, int64_t, k, const float*, alpha, const void*, A, cudaDataType, Atype, int64_t, lda, const float*, beta, void*, C, cudaDataType, Ctype, int64_t, ldc);
+DEF_FN(cublasStatus_t, cublasCherk3mEx, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int, n, int, k, const float*, alpha, const void*, A, cudaDataType, Atype, int, lda, const float*, beta, void*, C, cudaDataType, Ctype, int, ldc);
+DEF_FN(cublasStatus_t, cublasCherk3mEx_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int64_t, n, int64_t, k, const float*, alpha, const void*, A, cudaDataType, Atype, int64_t, lda, const float*, beta, void*, C, cudaDataType, Ctype, int64_t, ldc);
+DEF_FN(cublasStatus_t, cublasSsyr2k_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int, n, int, k, const float*, alpha, const float*, A, int, lda, const float*, B, int, ldb, const float*, beta, float*, C, int, ldc);
+DEF_FN(cublasStatus_t, cublasSsyr2k_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int64_t, n, int64_t, k, const float*, alpha, const float*, A, int64_t, lda, const float*, B, int64_t, ldb, const float*, beta, float*, C, int64_t, ldc);
+DEF_FN(cublasStatus_t, cublasDsyr2k_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int, n, int, k, const double*, alpha, const double*, A, int, lda, const double*, B, int, ldb, const double*, beta, double*, C, int, ldc);
+DEF_FN(cublasStatus_t, cublasDsyr2k_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int64_t, n, int64_t, k, const double*, alpha, const double*, A, int64_t, lda, const double*, B, int64_t, ldb, const double*, beta, double*, C, int64_t, ldc);
+DEF_FN(cublasStatus_t, cublasCsyr2k_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int, n, int, k, const cuComplex*, alpha, const cuComplex*, A, int, lda, const cuComplex*, B, int, ldb, const cuComplex*, beta, cuComplex*, C, int, ldc);
+DEF_FN(cublasStatus_t, cublasCsyr2k_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int64_t, n, int64_t, k, const cuComplex*, alpha, const cuComplex*, A, int64_t, lda, const cuComplex*, B, int64_t, ldb, const cuComplex*, beta, cuComplex*, C, int64_t, ldc);
+DEF_FN(cublasStatus_t, cublasZsyr2k_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int, n, int, k, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int, lda, const cuDoubleComplex*, B, int, ldb, const cuDoubleComplex*, beta, cuDoubleComplex*, C, int, ldc);
+DEF_FN(cublasStatus_t, cublasZsyr2k_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int64_t, n, int64_t, k, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int64_t, lda, const cuDoubleComplex*, B, int64_t, ldb, const cuDoubleComplex*, beta, cuDoubleComplex*, C, int64_t, ldc);
+DEF_FN(cublasStatus_t, cublasCher2k_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int, n, int, k, const cuComplex*, alpha, const cuComplex*, A, int, lda, const cuComplex*, B, int, ldb, const float*, beta, cuComplex*, C, int, ldc);
+DEF_FN(cublasStatus_t, cublasCher2k_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int64_t, n, int64_t, k, const cuComplex*, alpha, const cuComplex*, A, int64_t, lda, const cuComplex*, B, int64_t, ldb, const float*, beta, cuComplex*, C, int64_t, ldc);
+DEF_FN(cublasStatus_t, cublasZher2k_v2, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int, n, int, k, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int, lda, const cuDoubleComplex*, B, int, ldb, const double*, beta, cuDoubleComplex*, C, int, ldc);
+DEF_FN(cublasStatus_t, cublasZher2k_v2_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int64_t, n, int64_t, k, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int64_t, lda, const cuDoubleComplex*, B, int64_t, ldb, const double*, beta, cuDoubleComplex*, C, int64_t, ldc);
+DEF_FN(cublasStatus_t, cublasSsyrkx, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int, n, int, k, const float*, alpha, const float*, A, int, lda, const float*, B, int, ldb, const float*, beta, float*, C, int, ldc);
+DEF_FN(cublasStatus_t, cublasSsyrkx_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int64_t, n, int64_t, k, const float*, alpha, const float*, A, int64_t, lda, const float*, B, int64_t, ldb, const float*, beta, float*, C, int64_t, ldc);
+DEF_FN(cublasStatus_t, cublasDsyrkx, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int, n, int, k, const double*, alpha, const double*, A, int, lda, const double*, B, int, ldb, const double*, beta, double*, C, int, ldc);
+DEF_FN(cublasStatus_t, cublasDsyrkx_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int64_t, n, int64_t, k, const double*, alpha, const double*, A, int64_t, lda, const double*, B, int64_t, ldb, const double*, beta, double*, C, int64_t, ldc);
+DEF_FN(cublasStatus_t, cublasCsyrkx, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int, n, int, k, const cuComplex*, alpha, const cuComplex*, A, int, lda, const cuComplex*, B, int, ldb, const cuComplex*, beta, cuComplex*, C, int, ldc);
+DEF_FN(cublasStatus_t, cublasCsyrkx_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int64_t, n, int64_t, k, const cuComplex*, alpha, const cuComplex*, A, int64_t, lda, const cuComplex*, B, int64_t, ldb, const cuComplex*, beta, cuComplex*, C, int64_t, ldc);
+DEF_FN(cublasStatus_t, cublasZsyrkx, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int, n, int, k, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int, lda, const cuDoubleComplex*, B, int, ldb, const cuDoubleComplex*, beta, cuDoubleComplex*, C, int, ldc);
+DEF_FN(cublasStatus_t, cublasZsyrkx_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int64_t, n, int64_t, k, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int64_t, lda, const cuDoubleComplex*, B, int64_t, ldb, const cuDoubleComplex*, beta, cuDoubleComplex*, C, int64_t, ldc);
+DEF_FN(cublasStatus_t, cublasCherkx, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int, n, int, k, const cuComplex*, alpha, const cuComplex*, A, int, lda, const cuComplex*, B, int, ldb, const float*, beta, cuComplex*, C, int, ldc);
+DEF_FN(cublasStatus_t, cublasCherkx_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int64_t, n, int64_t, k, const cuComplex*, alpha, const cuComplex*, A, int64_t, lda, const cuComplex*, B, int64_t, ldb, const float*, beta, cuComplex*, C, int64_t, ldc);
+DEF_FN(cublasStatus_t, cublasZherkx, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int, n, int, k, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int, lda, const cuDoubleComplex*, B, int, ldb, const double*, beta, cuDoubleComplex*, C, int, ldc);
+DEF_FN(cublasStatus_t, cublasZherkx_64, cublasHandle_t, handle, cublasFillMode_t, uplo, cublasOperation_t, trans, int64_t, n, int64_t, k, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int64_t, lda, const cuDoubleComplex*, B, int64_t, ldb, const double*, beta, cuDoubleComplex*, C, int64_t, ldc);
+DEF_FN(cublasStatus_t, cublasSsymm_v2, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, int, m, int, n, const float*, alpha, const float*, A, int, lda, const float*, B, int, ldb, const float*, beta, float*, C, int, ldc);
+DEF_FN(cublasStatus_t, cublasSsymm_v2_64, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, int64_t, m, int64_t, n, const float*, alpha, const float*, A, int64_t, lda, const float*, B, int64_t, ldb, const float*, beta, float*, C, int64_t, ldc);
+DEF_FN(cublasStatus_t, cublasDsymm_v2, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, int, m, int, n, const double*, alpha, const double*, A, int, lda, const double*, B, int, ldb, const double*, beta, double*, C, int, ldc);
+DEF_FN(cublasStatus_t, cublasDsymm_v2_64, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, int64_t, m, int64_t, n, const double*, alpha, const double*, A, int64_t, lda, const double*, B, int64_t, ldb, const double*, beta, double*, C, int64_t, ldc);
+DEF_FN(cublasStatus_t, cublasCsymm_v2, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, int, m, int, n, const cuComplex*, alpha, const cuComplex*, A, int, lda, const cuComplex*, B, int, ldb, const cuComplex*, beta, cuComplex*, C, int, ldc);
+DEF_FN(cublasStatus_t, cublasCsymm_v2_64, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, int64_t, m, int64_t, n, const cuComplex*, alpha, const cuComplex*, A, int64_t, lda, const cuComplex*, B, int64_t, ldb, const cuComplex*, beta, cuComplex*, C, int64_t, ldc);
+DEF_FN(cublasStatus_t, cublasZsymm_v2, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, int, m, int, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int, lda, const cuDoubleComplex*, B, int, ldb, const cuDoubleComplex*, beta, cuDoubleComplex*, C, int, ldc);
+DEF_FN(cublasStatus_t, cublasZsymm_v2_64, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, int64_t, m, int64_t, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int64_t, lda, const cuDoubleComplex*, B, int64_t, ldb, const cuDoubleComplex*, beta, cuDoubleComplex*, C, int64_t, ldc);
+DEF_FN(cublasStatus_t, cublasChemm_v2, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, int, m, int, n, const cuComplex*, alpha, const cuComplex*, A, int, lda, const cuComplex*, B, int, ldb, const cuComplex*, beta, cuComplex*, C, int, ldc);
+DEF_FN(cublasStatus_t, cublasChemm_v2_64, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, int64_t, m, int64_t, n, const cuComplex*, alpha, const cuComplex*, A, int64_t, lda, const cuComplex*, B, int64_t, ldb, const cuComplex*, beta, cuComplex*, C, int64_t, ldc);
+DEF_FN(cublasStatus_t, cublasZhemm_v2, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, int, m, int, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int, lda, const cuDoubleComplex*, B, int, ldb, const cuDoubleComplex*, beta, cuDoubleComplex*, C, int, ldc);
+DEF_FN(cublasStatus_t, cublasZhemm_v2_64, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, int64_t, m, int64_t, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int64_t, lda, const cuDoubleComplex*, B, int64_t, ldb, const cuDoubleComplex*, beta, cuDoubleComplex*, C, int64_t, ldc);
+DEF_FN(cublasStatus_t, cublasStrsm_v2, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, m, int, n, const float*, alpha, const float*, A, int, lda, float*, B, int, ldb);
+DEF_FN(cublasStatus_t, cublasStrsm_v2_64, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, m, int64_t, n, const float*, alpha, const float*, A, int64_t, lda, float*, B, int64_t, ldb);
+DEF_FN(cublasStatus_t, cublasDtrsm_v2, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, m, int, n, const double*, alpha, const double*, A, int, lda, double*, B, int, ldb);
+DEF_FN(cublasStatus_t, cublasDtrsm_v2_64, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, m, int64_t, n, const double*, alpha, const double*, A, int64_t, lda, double*, B, int64_t, ldb);
+DEF_FN(cublasStatus_t, cublasCtrsm_v2, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, m, int, n, const cuComplex*, alpha, const cuComplex*, A, int, lda, cuComplex*, B, int, ldb);
+DEF_FN(cublasStatus_t, cublasCtrsm_v2_64, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, m, int64_t, n, const cuComplex*, alpha, const cuComplex*, A, int64_t, lda, cuComplex*, B, int64_t, ldb);
+DEF_FN(cublasStatus_t, cublasZtrsm_v2, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, m, int, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int, lda, cuDoubleComplex*, B, int, ldb);
+DEF_FN(cublasStatus_t, cublasZtrsm_v2_64, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, m, int64_t, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int64_t, lda, cuDoubleComplex*, B, int64_t, ldb);
+DEF_FN(cublasStatus_t, cublasStrmm_v2, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, m, int, n, const float*, alpha, const float*, A, int, lda, const float*, B, int, ldb, float*, C, int, ldc);
+DEF_FN(cublasStatus_t, cublasStrmm_v2_64, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, m, int64_t, n, const float*, alpha, const float*, A, int64_t, lda, const float*, B, int64_t, ldb, float*, C, int64_t, ldc);
+DEF_FN(cublasStatus_t, cublasDtrmm_v2, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, m, int, n, const double*, alpha, const double*, A, int, lda, const double*, B, int, ldb, double*, C, int, ldc);
+DEF_FN(cublasStatus_t, cublasDtrmm_v2_64, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, m, int64_t, n, const double*, alpha, const double*, A, int64_t, lda, const double*, B, int64_t, ldb, double*, C, int64_t, ldc);
+DEF_FN(cublasStatus_t, cublasCtrmm_v2, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, m, int, n, const cuComplex*, alpha, const cuComplex*, A, int, lda, const cuComplex*, B, int, ldb, cuComplex*, C, int, ldc);
+DEF_FN(cublasStatus_t, cublasCtrmm_v2_64, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, m, int64_t, n, const cuComplex*, alpha, const cuComplex*, A, int64_t, lda, const cuComplex*, B, int64_t, ldb, cuComplex*, C, int64_t, ldc);
+DEF_FN(cublasStatus_t, cublasZtrmm_v2, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, m, int, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int, lda, const cuDoubleComplex*, B, int, ldb, cuDoubleComplex*, C, int, ldc);
+DEF_FN(cublasStatus_t, cublasZtrmm_v2_64, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, m, int64_t, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int64_t, lda, const cuDoubleComplex*, B, int64_t, ldb, cuDoubleComplex*, C, int64_t, ldc);
+DEF_FN(cublasStatus_t, cublasSgemmBatched, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, int, k, const float*, alpha, const float* const*,  Aarray, int, lda, const float* const*,  Barray, int, ldb, const float*, beta, float* const*,  Carray, int, ldc, int, batchCount);
+DEF_FN(cublasStatus_t, cublasSgemmBatched_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const float*, alpha, const float* const*,  Aarray, int64_t, lda, const float* const*,  Barray, int64_t, ldb, const float*, beta, float* const*,  Carray, int64_t, ldc, int64_t, batchCount);
+DEF_FN(cublasStatus_t, cublasDgemmBatched, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, int, k, const double*, alpha, const double* const*,  Aarray, int, lda, const double* const*,  Barray, int, ldb, const double*, beta, double* const*,  Carray, int, ldc, int, batchCount);
+DEF_FN(cublasStatus_t, cublasDgemmBatched_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const double*, alpha, const double* const*,  Aarray, int64_t, lda, const double* const*,  Barray, int64_t, ldb, const double*, beta, double* const*,  Carray, int64_t, ldc, int64_t, batchCount);
+DEF_FN(cublasStatus_t, cublasCgemmBatched, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, int, k, const cuComplex*, alpha, const cuComplex* const*,  Aarray, int, lda, const cuComplex* const*,  Barray, int, ldb, const cuComplex*, beta, cuComplex* const*,  Carray, int, ldc, int, batchCount);
+DEF_FN(cublasStatus_t, cublasCgemmBatched_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const cuComplex*, alpha, const cuComplex* const*,  Aarray, int64_t, lda, const cuComplex* const*,  Barray, int64_t, ldb, const cuComplex*, beta, cuComplex* const*,  Carray, int64_t, ldc, int64_t, batchCount);
+DEF_FN(cublasStatus_t, cublasCgemm3mBatched, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, int, k, const cuComplex*, alpha, const cuComplex* const*,  Aarray, int, lda, const cuComplex* const*,  Barray, int, ldb, const cuComplex*, beta, cuComplex* const*,  Carray, int, ldc, int, batchCount);
+DEF_FN(cublasStatus_t, cublasCgemm3mBatched_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const cuComplex*, alpha, const cuComplex* const*,  Aarray, int64_t, lda, const cuComplex* const*,  Barray, int64_t, ldb, const cuComplex*, beta, cuComplex* const*,  Carray, int64_t, ldc, int64_t, batchCount);
+DEF_FN(cublasStatus_t, cublasZgemmBatched, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, int, k, const cuDoubleComplex*, alpha, const cuDoubleComplex* const*,  Aarray, int, lda, const cuDoubleComplex* const*,  Barray, int, ldb, const cuDoubleComplex*, beta, cuDoubleComplex* const*,  Carray, int, ldc, int, batchCount);
+DEF_FN(cublasStatus_t, cublasZgemmBatched_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const cuDoubleComplex*, alpha, const cuDoubleComplex* const*,  Aarray, int64_t, lda, const cuDoubleComplex* const*,  Barray, int64_t, ldb, const cuDoubleComplex*, beta, cuDoubleComplex* const*,  Carray, int64_t, ldc, int64_t, batchCount);
+DEF_FN(cublasStatus_t, cublasSgemmStridedBatched, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, int, k, const float*, alpha, const float*, A, int, lda, long long int, strideA, const float*, B, int, ldb, long long int, strideB, const float*, beta, float*, C, int, ldc, long long int, strideC, int, batchCount);
+DEF_FN(cublasStatus_t, cublasSgemmStridedBatched_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const float*, alpha, const float*, A, int64_t, lda, long long int, strideA, const float*, B, int64_t, ldb, long long int, strideB, const float*, beta, float*, C, int64_t, ldc, long long int, strideC, int64_t, batchCount);
+DEF_FN(cublasStatus_t, cublasDgemmStridedBatched, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, int, k, const double*, alpha, const double*, A, int, lda, long long int, strideA, const double*, B, int, ldb, long long int, strideB, const double*, beta, double*, C, int, ldc, long long int, strideC, int, batchCount);
+DEF_FN(cublasStatus_t, cublasDgemmStridedBatched_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const double*, alpha, const double*, A, int64_t, lda, long long int, strideA, const double*, B, int64_t, ldb, long long int, strideB, const double*, beta, double*, C, int64_t, ldc, long long int, strideC, int64_t, batchCount);
+DEF_FN(cublasStatus_t, cublasCgemmStridedBatched, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, int, k, const cuComplex*, alpha, const cuComplex*, A, int, lda, long long int, strideA, const cuComplex*, B, int, ldb, long long int, strideB, const cuComplex*, beta, cuComplex*, C, int, ldc, long long int, strideC, int, batchCount);
+DEF_FN(cublasStatus_t, cublasCgemmStridedBatched_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const cuComplex*, alpha, const cuComplex*, A, int64_t, lda, long long int, strideA, const cuComplex*, B, int64_t, ldb, long long int, strideB, const cuComplex*, beta, cuComplex*, C, int64_t, ldc, long long int, strideC, int64_t, batchCount);
+DEF_FN(cublasStatus_t, cublasCgemm3mStridedBatched, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, int, k, const cuComplex*, alpha, const cuComplex*, A, int, lda, long long int, strideA, const cuComplex*, B, int, ldb, long long int, strideB, const cuComplex*, beta, cuComplex*, C, int, ldc, long long int, strideC, int, batchCount);
+DEF_FN(cublasStatus_t, cublasCgemm3mStridedBatched_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const cuComplex*, alpha, const cuComplex*, A, int64_t, lda, long long int, strideA, const cuComplex*, B, int64_t, ldb, long long int, strideB, const cuComplex*, beta, cuComplex*, C, int64_t, ldc, long long int, strideC, int64_t, batchCount);
+DEF_FN(cublasStatus_t, cublasZgemmStridedBatched, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, int, k, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int, lda, long long int, strideA, const cuDoubleComplex*, B, int, ldb, long long int, strideB, const cuDoubleComplex*, beta, cuDoubleComplex*, C, int, ldc, long long int, strideC, int, batchCount);
+DEF_FN(cublasStatus_t, cublasZgemmStridedBatched_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int64_t, lda, long long int, strideA, const cuDoubleComplex*, B, int64_t, ldb, long long int, strideB, const cuDoubleComplex*, beta, cuDoubleComplex*, C, int64_t, ldc, long long int, strideC, int64_t, batchCount);
+DEF_FN(cublasStatus_t, cublasGemmBatchedEx, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, int, k, const void*, alpha, const void* const*,  Aarray, cudaDataType, Atype, int, lda, const void* const*,  Barray, cudaDataType, Btype, int, ldb, const void*, beta, void* const*,  Carray, cudaDataType, Ctype, int, ldc, int, batchCount, cublasComputeType_t, computeType, cublasGemmAlgo_t, algo);
+DEF_FN(cublasStatus_t, cublasGemmBatchedEx_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const void*, alpha, const void* const*,  Aarray, cudaDataType, Atype, int64_t, lda, const void* const*,  Barray, cudaDataType, Btype, int64_t, ldb, const void*, beta, void* const*,  Carray, cudaDataType, Ctype, int64_t, ldc, int64_t, batchCount, cublasComputeType_t, computeType, cublasGemmAlgo_t, algo);
+DEF_FN(cublasStatus_t, cublasGemmStridedBatchedEx, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, int, k, const void*, alpha, const void*, A, cudaDataType, Atype, int, lda, long long int, strideA, const void*, B, cudaDataType, Btype, int, ldb, long long int, strideB, const void*, beta, void*, C, cudaDataType, Ctype, int, ldc, long long int, strideC, int, batchCount, cublasComputeType_t, computeType, cublasGemmAlgo_t, algo);
+DEF_FN(cublasStatus_t, cublasGemmStridedBatchedEx_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, int64_t, k, const void*, alpha, const void*, A, cudaDataType, Atype, int64_t, lda, long long int, strideA, const void*, B, cudaDataType, Btype, int64_t, ldb, long long int, strideB, const void*, beta, void*, C, cudaDataType, Ctype, int64_t, ldc, long long int, strideC, int64_t, batchCount, cublasComputeType_t, computeType, cublasGemmAlgo_t, algo);
+DEF_FN(cublasStatus_t, cublasSgeam, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, const float*, alpha, const float*, A, int, lda, const float*, beta, const float*, B, int, ldb, float*, C, int, ldc);
+DEF_FN(cublasStatus_t, cublasSgeam_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, const float*, alpha, const float*, A, int64_t, lda, const float*, beta, const float*, B, int64_t, ldb, float*, C, int64_t, ldc);
+DEF_FN(cublasStatus_t, cublasDgeam, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, const double*, alpha, const double*, A, int, lda, const double*, beta, const double*, B, int, ldb, double*, C, int, ldc);
+DEF_FN(cublasStatus_t, cublasDgeam_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, const double*, alpha, const double*, A, int64_t, lda, const double*, beta, const double*, B, int64_t, ldb, double*, C, int64_t, ldc);
+DEF_FN(cublasStatus_t, cublasCgeam, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, const cuComplex*, alpha, const cuComplex*, A, int, lda, const cuComplex*, beta, const cuComplex*, B, int, ldb, cuComplex*, C, int, ldc);
+DEF_FN(cublasStatus_t, cublasCgeam_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, const cuComplex*, alpha, const cuComplex*, A, int64_t, lda, const cuComplex*, beta, const cuComplex*, B, int64_t, ldb, cuComplex*, C, int64_t, ldc);
+DEF_FN(cublasStatus_t, cublasZgeam, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int, m, int, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int, lda, const cuDoubleComplex*, beta, const cuDoubleComplex*, B, int, ldb, cuDoubleComplex*, C, int, ldc);
+DEF_FN(cublasStatus_t, cublasZgeam_64, cublasHandle_t, handle, cublasOperation_t, transa, cublasOperation_t, transb, int64_t, m, int64_t, n, const cuDoubleComplex*, alpha, const cuDoubleComplex*, A, int64_t, lda, const cuDoubleComplex*, beta, const cuDoubleComplex*, B, int64_t, ldb, cuDoubleComplex*, C, int64_t, ldc);
+DEF_FN(cublasStatus_t, cublasStrsmBatched, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, m, int, n, const float*, alpha, const float* const*,  A, int, lda, float* const*,  B, int, ldb, int, batchCount);
+DEF_FN(cublasStatus_t, cublasStrsmBatched_64, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, m, int64_t, n, const float*, alpha, const float* const*,  A, int64_t, lda, float* const*,  B, int64_t, ldb, int64_t, batchCount);
+DEF_FN(cublasStatus_t, cublasDtrsmBatched, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, m, int, n, const double*, alpha, const double* const*,  A, int, lda, double* const*,  B, int, ldb, int, batchCount);
+DEF_FN(cublasStatus_t, cublasDtrsmBatched_64, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, m, int64_t, n, const double*, alpha, const double* const*,  A, int64_t, lda, double* const*,  B, int64_t, ldb, int64_t, batchCount);
+DEF_FN(cublasStatus_t, cublasCtrsmBatched, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, m, int, n, const cuComplex*, alpha, const cuComplex* const*,  A, int, lda, cuComplex* const*,  B, int, ldb, int, batchCount);
+DEF_FN(cublasStatus_t, cublasCtrsmBatched_64, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, m, int64_t, n, const cuComplex*, alpha, const cuComplex* const*,  A, int64_t, lda, cuComplex* const*,  B, int64_t, ldb, int64_t, batchCount);
+DEF_FN(cublasStatus_t, cublasZtrsmBatched, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int, m, int, n, const cuDoubleComplex*, alpha, const cuDoubleComplex* const*,  A, int, lda, cuDoubleComplex* const*,  B, int, ldb, int, batchCount);
+DEF_FN(cublasStatus_t, cublasZtrsmBatched_64, cublasHandle_t, handle, cublasSideMode_t, side, cublasFillMode_t, uplo, cublasOperation_t, trans, cublasDiagType_t, diag, int64_t, m, int64_t, n, const cuDoubleComplex*, alpha, const cuDoubleComplex* const*,  A, int64_t, lda, cuDoubleComplex* const*,  B, int64_t, ldb, int64_t, batchCount);
+DEF_FN(cublasStatus_t, cublasSdgmm, cublasHandle_t, handle, cublasSideMode_t, mode, int, m, int, n, const float*, A, int, lda, const float*, x, int, incx, float*, C, int, ldc);
+DEF_FN(cublasStatus_t, cublasSdgmm_64, cublasHandle_t, handle, cublasSideMode_t, mode, int64_t, m, int64_t, n, const float*, A, int64_t, lda, const float*, x, int64_t, incx, float*, C, int64_t, ldc);
+DEF_FN(cublasStatus_t, cublasDdgmm, cublasHandle_t, handle, cublasSideMode_t, mode, int, m, int, n, const double*, A, int, lda, const double*, x, int, incx, double*, C, int, ldc);
+DEF_FN(cublasStatus_t, cublasDdgmm_64, cublasHandle_t, handle, cublasSideMode_t, mode, int64_t, m, int64_t, n, const double*, A, int64_t, lda, const double*, x, int64_t, incx, double*, C, int64_t, ldc);
+DEF_FN(cublasStatus_t, cublasCdgmm, cublasHandle_t, handle, cublasSideMode_t, mode, int, m, int, n, const cuComplex*, A, int, lda, const cuComplex*, x, int, incx, cuComplex*, C, int, ldc);
+DEF_FN(cublasStatus_t, cublasCdgmm_64, cublasHandle_t, handle, cublasSideMode_t, mode, int64_t, m, int64_t, n, const cuComplex*, A, int64_t, lda, const cuComplex*, x, int64_t, incx, cuComplex*, C, int64_t, ldc);
+DEF_FN(cublasStatus_t, cublasZdgmm, cublasHandle_t, handle, cublasSideMode_t, mode, int, m, int, n, const cuDoubleComplex*, A, int, lda, const cuDoubleComplex*, x, int, incx, cuDoubleComplex*, C, int, ldc);
+DEF_FN(cublasStatus_t, cublasZdgmm_64, cublasHandle_t, handle, cublasSideMode_t, mode, int64_t, m, int64_t, n, const cuDoubleComplex*, A, int64_t, lda, const cuDoubleComplex*, x, int64_t, incx, cuDoubleComplex*, C, int64_t, ldc);
+DEF_FN(cublasStatus_t, cublasSmatinvBatched, cublasHandle_t, handle, int, n, const float* const*,  A, int, lda, float* const*,  Ainv, int, lda_inv, int*, info, int, batchSize);
+DEF_FN(cublasStatus_t, cublasDmatinvBatched, cublasHandle_t, handle, int, n, const double* const*,  A, int, lda, double* const*,  Ainv, int, lda_inv, int*, info, int, batchSize);
+DEF_FN(cublasStatus_t, cublasCmatinvBatched, cublasHandle_t, handle, int, n, const cuComplex* const*,  A, int, lda, cuComplex* const*,  Ainv, int, lda_inv, int*, info, int, batchSize);
+DEF_FN(cublasStatus_t, cublasZmatinvBatched, cublasHandle_t, handle, int, n, const cuDoubleComplex* const*,  A, int, lda, cuDoubleComplex* const*,  Ainv, int, lda_inv, int*, info, int, batchSize);
+DEF_FN(cublasStatus_t, cublasSgeqrfBatched, cublasHandle_t, handle, int, m, int, n, float* const*,  Aarray, int, lda, float* const*,  TauArray, int*, info, int, batchSize);
+DEF_FN(cublasStatus_t, cublasDgeqrfBatched, cublasHandle_t, handle, int, m, int, n, double* const*,  Aarray, int, lda, double* const*,  TauArray, int*, info, int, batchSize);
+DEF_FN(cublasStatus_t, cublasCgeqrfBatched, cublasHandle_t, handle, int, m, int, n, cuComplex* const*,  Aarray, int, lda, cuComplex* const*,  TauArray, int*, info, int, batchSize);
+DEF_FN(cublasStatus_t, cublasZgeqrfBatched, cublasHandle_t, handle, int, m, int, n, cuDoubleComplex* const*,  Aarray, int, lda, cuDoubleComplex* const*,  TauArray, int*, info, int, batchSize);
+DEF_FN(cublasStatus_t, cublasSgelsBatched, cublasHandle_t, handle, cublasOperation_t, trans, int, m, int, n, int, nrhs, float* const*,  Aarray, int, lda, float* const*,  Carray, int, ldc, int*, info, int*, devInfoArray, int, batchSize);
+DEF_FN(cublasStatus_t, cublasDgelsBatched, cublasHandle_t, handle, cublasOperation_t, trans, int, m, int, n, int, nrhs, double* const*,  Aarray, int, lda, double* const*,  Carray, int, ldc, int*, info, int*, devInfoArray, int, batchSize);
+DEF_FN(cublasStatus_t, cublasCgelsBatched, cublasHandle_t, handle, cublasOperation_t, trans, int, m, int, n, int, nrhs, cuComplex* const*,  Aarray, int, lda, cuComplex* const*,  Carray, int, ldc, int*, info, int*, devInfoArray, int, batchSize);
+DEF_FN(cublasStatus_t, cublasZgelsBatched, cublasHandle_t, handle, cublasOperation_t, trans, int, m, int, n, int, nrhs, cuDoubleComplex* const*,  Aarray, int, lda, cuDoubleComplex* const*,  Carray, int, ldc, int*, info, int*, devInfoArray, int, batchSize);
+DEF_FN(cublasStatus_t, cublasStpttr, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const float*, AP, float*, A, int, lda);
+DEF_FN(cublasStatus_t, cublasDtpttr, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const double*, AP, double*, A, int, lda);
+DEF_FN(cublasStatus_t, cublasCtpttr, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const cuComplex*, AP, cuComplex*, A, int, lda);
+DEF_FN(cublasStatus_t, cublasZtpttr, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const cuDoubleComplex*, AP, cuDoubleComplex*, A, int, lda);
+DEF_FN(cublasStatus_t, cublasStrttp, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const float*, A, int, lda, float*, AP);
+DEF_FN(cublasStatus_t, cublasDtrttp, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const double*, A, int, lda, double*, AP);
+DEF_FN(cublasStatus_t, cublasCtrttp, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const cuComplex*, A, int, lda, cuComplex*, AP);
+DEF_FN(cublasStatus_t, cublasZtrttp, cublasHandle_t, handle, cublasFillMode_t, uplo, int, n, const cuDoubleComplex*, A, int, lda, cuDoubleComplex*, AP);
+DEF_FN(cublasStatus_t, cublasSgetrfBatched, cublasHandle_t, handle, int, n, float* const*,  A, int, lda, int*, P, int*, info, int, batchSize);
+DEF_FN(cublasStatus_t, cublasDgetrfBatched, cublasHandle_t, handle, int, n, double* const*,  A, int, lda, int*, P, int*, info, int, batchSize);
+DEF_FN(cublasStatus_t, cublasCgetrfBatched, cublasHandle_t, handle, int, n, cuComplex* const*,  A, int, lda, int*, P, int*, info, int, batchSize);
+DEF_FN(cublasStatus_t, cublasZgetrfBatched, cublasHandle_t, handle, int, n, cuDoubleComplex* const*,  A, int, lda, int*, P, int*, info, int, batchSize);
+DEF_FN(cublasStatus_t, cublasSgetriBatched, cublasHandle_t, handle, int, n, const float* const*,  A, int, lda, const int*, P, float* const*,  C, int, ldc, int*, info, int, batchSize);
+DEF_FN(cublasStatus_t, cublasDgetriBatched, cublasHandle_t, handle, int, n, const double* const*,  A, int, lda, const int*, P, double* const*,  C, int, ldc, int*, info, int, batchSize);
+DEF_FN(cublasStatus_t, cublasCgetriBatched, cublasHandle_t, handle, int, n, const cuComplex* const*,  A, int, lda, const int*, P, cuComplex* const*,  C, int, ldc, int*, info, int, batchSize);
+DEF_FN(cublasStatus_t, cublasZgetriBatched, cublasHandle_t, handle, int, n, const cuDoubleComplex* const*,  A, int, lda, const int*, P, cuDoubleComplex* const*,  C, int, ldc, int*, info, int, batchSize);
+DEF_FN(cublasStatus_t, cublasSgetrsBatched, cublasHandle_t, handle, cublasOperation_t, trans, int, n, int, nrhs, const float* const*,  Aarray, int, lda, const int*, devIpiv, float* const*,  Barray, int, ldb, int*, info, int, batchSize);
+DEF_FN(cublasStatus_t, cublasDgetrsBatched, cublasHandle_t, handle, cublasOperation_t, trans, int, n, int, nrhs, const double* const*,  Aarray, int, lda, const int*, devIpiv, double* const*,  Barray, int, ldb, int*, info, int, batchSize);
+DEF_FN(cublasStatus_t, cublasCgetrsBatched, cublasHandle_t, handle, cublasOperation_t, trans, int, n, int, nrhs, const cuComplex* const*,  Aarray, int, lda, const int*, devIpiv, cuComplex* const*,  Barray, int, ldb, int*, info, int, batchSize);
+DEF_FN(cublasStatus_t, cublasZgetrsBatched, cublasHandle_t, handle, cublasOperation_t, trans, int, n, int, nrhs, const cuDoubleComplex* const*,  Aarray, int, lda, const int*, devIpiv, cuDoubleComplex* const*,  Barray, int, ldb, int*, info, int, batchSize);
diff --git a/cpu/cpu-client-cudnn.c b/cpu/cpu-client-cudnn.c
index e7c8c691..05136fe2 100644
--- a/cpu/cpu-client-cudnn.c
+++ b/cpu/cpu-client-cudnn.c
@@ -1,5 +1,6 @@
 #include <cudnn.h>
 #include <stdint.h>
+#include <stdbool.h>
 
 #include "cpu-libwrap.h"
 #include "cpu_rpc_prot.h"
@@ -1634,3 +1635,220 @@ DEF_FN(cudnnStatus_t, cudnnGetConvolutionBackwardDataWorkspaceSize,  cudnnHandle
 DEF_FN(cudnnStatus_t, cudnnConvolutionBackwardData,  cudnnHandle_t, handle,  const void*, alpha,  const cudnnFilterDescriptor_t, wDesc,  const void*, w,  const cudnnTensorDescriptor_t, dyDesc,  const void*, dy,  const cudnnConvolutionDescriptor_t, convDesc,  cudnnConvolutionBwdDataAlgo_t, algo,  void*, workSpace,  size_t, workSpaceSizeInBytes,  const void*, beta,  const cudnnTensorDescriptor_t, dxDesc,  void*, dx)
 DEF_FN(cudnnStatus_t, cudnnGetFoldedConvBackwardDataDescriptors,  const cudnnHandle_t, handle,  const cudnnFilterDescriptor_t, filterDesc,  const cudnnTensorDescriptor_t, diffDesc,  const cudnnConvolutionDescriptor_t, convDesc,  const cudnnTensorDescriptor_t, gradDesc,  const cudnnTensorFormat_t, transformFormat,  cudnnFilterDescriptor_t, foldedFilterDesc,  cudnnTensorDescriptor_t, paddedDiffDesc,  cudnnConvolutionDescriptor_t, foldedConvDesc,  cudnnTensorDescriptor_t, foldedGradDesc,  cudnnTensorTransformDescriptor_t, filterFoldTransDesc,  cudnnTensorTransformDescriptor_t, diffPadTransDesc,  cudnnTensorTransformDescriptor_t, gradFoldTransDesc,  cudnnTensorTransformDescriptor_t, gradUnfoldTransDesc)
 DEF_FN(cudnnStatus_t, cudnnCnnInferVersionCheck)
+
+/********************** CUDNN BACKEND API ********************************/
+cudnnStatus_t cudnnBackendCreateDescriptor(cudnnBackendDescriptorType_t descriptorType, cudnnBackendDescriptor_t *descriptor)
+{   /* RPC stub: forwards descriptorType via rpc_cudnnbackendcreatedescriptor_1 and, on success, stores the returned pointer in *descriptor. */
+#ifdef WITH_API_CNT
+    api_call_cnt++;
+#endif //WITH_API_CNT
+    ptr_result result = { .err = CUDNN_STATUS_INTERNAL_ERROR }; /* keeps the returned status defined when the RPC fails before filling in result */
+    enum clnt_stat retval_1;
+    LOGE(LOG_DEBUG, "%s(%d)", __FUNCTION__, descriptorType);
+    if (descriptor == NULL) {
+        LOGE(LOG_ERROR, "%s failed (descriptor is NULL)", __FUNCTION__);
+        return CUDNN_STATUS_BAD_PARAM;
+    }
+    retval_1 = rpc_cudnnbackendcreatedescriptor_1(
+        (int)descriptorType,
+        &result, clnt);
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
+    }
+    if (result.err != CUDNN_STATUS_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result.err);
+    } else {
+        *descriptor = (void*)result.ptr_result_u.ptr;
+        LOGE(LOG_DEBUG, "-> %p", *descriptor);
+    }
+    return result.err;
+}
+
+cudnnStatus_t cudnnBackendDestroyDescriptor(cudnnBackendDescriptor_t descriptor)
+{   /* RPC stub: forwards descriptor via rpc_cudnnbackenddestroydescriptor_1 and returns the status it reports. */
+#ifdef WITH_API_CNT
+    api_call_cnt++;
+#endif //WITH_API_CNT
+    int result = CUDNN_STATUS_INTERNAL_ERROR; /* keeps the returned status defined when the RPC fails before writing result */
+    enum clnt_stat retval_1;
+    LOGE(LOG_DEBUG, "%s(%p)", __FUNCTION__, descriptor);
+    retval_1 = rpc_cudnnbackenddestroydescriptor_1((ptr)descriptor, &result, clnt);
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
+    }
+    if (result != CUDNN_STATUS_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result);
+    }
+    return result;
+}
+
+cudnnStatus_t cudnnBackendInitialize(cudnnBackendDescriptor_t descriptor)
+{   /* RPC stub: forwards descriptor via rpc_cudnnbackendinitialize_1 and returns the status it reports. */
+#ifdef WITH_API_CNT
+    api_call_cnt++;
+#endif //WITH_API_CNT
+    int result = CUDNN_STATUS_INTERNAL_ERROR; /* keeps the returned status defined when the RPC fails before writing result */
+    enum clnt_stat retval_1;
+    LOGE(LOG_DEBUG, "%s(%p)", __FUNCTION__, descriptor);
+    retval_1 = rpc_cudnnbackendinitialize_1((ptr)descriptor, &result, clnt);
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
+    }
+    if (result != CUDNN_STATUS_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result);
+    }
+    return result;
+}
+
+cudnnStatus_t cudnnBackendFinalize(cudnnBackendDescriptor_t descriptor)
+{   /* RPC stub: forwards descriptor via rpc_cudnnbackendfinalize_1 and returns the status it reports. */
+#ifdef WITH_API_CNT
+    api_call_cnt++;
+#endif //WITH_API_CNT
+    int result = CUDNN_STATUS_INTERNAL_ERROR; /* keeps the returned status defined when the RPC fails before writing result */
+    enum clnt_stat retval_1;
+    LOGE(LOG_DEBUG, "%s(%p)", __FUNCTION__, descriptor);
+    retval_1 = rpc_cudnnbackendfinalize_1((ptr)descriptor, &result, clnt);
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
+    }
+    if (result != CUDNN_STATUS_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result);
+    }
+    return result;
+}
+
+static const size_t backendAttributeSizes[] = { /* size in bytes of one element, indexed by cudnnBackendAttributeType_t; multiplied by an element count to size attribute payloads */
+    [CUDNN_TYPE_HANDLE] = sizeof(cudnnHandle_t),
+    [CUDNN_TYPE_DATA_TYPE] = sizeof(cudnnDataType_t),
+    [CUDNN_TYPE_BOOLEAN] = sizeof(bool),
+    [CUDNN_TYPE_INT64] = sizeof(int64_t),
+    [CUDNN_TYPE_FLOAT] = sizeof(float),
+    [CUDNN_TYPE_DOUBLE] = sizeof(double),
+    [CUDNN_TYPE_VOID_PTR] = sizeof(void *),
+    [CUDNN_TYPE_CONVOLUTION_MODE] = sizeof(cudnnConvolutionMode_t),
+    [CUDNN_TYPE_HEUR_MODE] = sizeof(cudnnBackendHeurMode_t),
+    [CUDNN_TYPE_KNOB_TYPE] = sizeof(cudnnBackendKnobType_t),
+    [CUDNN_TYPE_NAN_PROPOGATION] = sizeof(cudnnNanPropagation_t),
+    [CUDNN_TYPE_NUMERICAL_NOTE] = sizeof(cudnnBackendNumericalNote_t),
+    [CUDNN_TYPE_LAYOUT_TYPE] = sizeof(cudnnBackendLayoutType_t),
+    [CUDNN_TYPE_ATTRIB_NAME] = sizeof(cudnnBackendAttributeName_t),
+    [CUDNN_TYPE_POINTWISE_MODE] = sizeof(cudnnPointwiseMode_t),
+    [CUDNN_TYPE_BACKEND_DESCRIPTOR] = sizeof(cudnnBackendDescriptor_t),
+    [CUDNN_TYPE_GENSTATS_MODE] = sizeof(cudnnGenStatsMode_t),
+    [CUDNN_TYPE_BN_FINALIZE_STATS_MODE] = sizeof(cudnnBnFinalizeStatsMode_t),
+    [CUDNN_TYPE_REDUCTION_OPERATOR_TYPE] = sizeof(cudnnReduceTensorOp_t),
+    [CUDNN_TYPE_BEHAVIOR_NOTE] = sizeof(cudnnBackendBehaviorNote_t),
+    [CUDNN_TYPE_TENSOR_REORDERING_MODE] = sizeof(cudnnBackendTensorReordering_t),
+    [CUDNN_TYPE_RESAMPLE_MODE] = sizeof(cudnnResampleMode_t),
+    [CUDNN_TYPE_PADDING_MODE] = sizeof(cudnnPaddingMode_t),
+    [CUDNN_TYPE_INT32] = sizeof(int32_t),
+    [CUDNN_TYPE_CHAR] = sizeof(char),
+    [CUDNN_TYPE_SIGNAL_MODE] = sizeof(cudnnSignalMode_t),
+    [CUDNN_TYPE_FRACTION] = sizeof(cudnnFraction_t),
+    [CUDNN_TYPE_NORM_MODE] = sizeof(cudnnBackendNormMode_t),
+    [CUDNN_TYPE_NORM_FWD_PHASE] = sizeof(cudnnBackendNormFwdPhase_t),
+    [CUDNN_TYPE_RNG_DISTRIBUTION] = sizeof(cudnnRngDistribution_t),
+};
+cudnnStatus_t cudnnBackendSetAttribute(cudnnBackendDescriptor_t descriptor,
+                         cudnnBackendAttributeName_t attributeName,
+                         cudnnBackendAttributeType_t attributeType,
+                         int64_t elementCount,
+                         const void *arrayOfElements)
+{   /* RPC stub: sends elementCount elements of arrayOfElements (byte size taken from backendAttributeSizes) via rpc_cudnnbackendsetattribute_1. */
+#ifdef WITH_API_CNT
+    api_call_cnt++;
+#endif //WITH_API_CNT
+    int result = CUDNN_STATUS_INTERNAL_ERROR; /* keeps the returned status defined when the RPC fails before writing result */
+    LOGE(LOG_DEBUG, "%s(%p, %d, %d, %ld, %p)", __FUNCTION__, descriptor, attributeName, attributeType, elementCount, arrayOfElements);
+    if ((size_t)attributeType >= sizeof(backendAttributeSizes) / sizeof(backendAttributeSizes[0]) || elementCount < 0) { /* size_t cast also rejects negative types; bound follows the table itself */
+        LOGE(LOG_ERROR, "%s failed (attributeType %d or elementCount %ld is out of range)", __FUNCTION__, attributeType, elementCount);
+        return CUDNN_STATUS_BAD_PARAM;
+    }
+    mem_data data = {
+        .mem_data_len = elementCount * backendAttributeSizes[attributeType],
+        .mem_data_val = (char *)arrayOfElements
+    };
+    enum clnt_stat retval_1;
+    retval_1 = rpc_cudnnbackendsetattribute_1(
+        (ptr)descriptor,
+        (int)attributeName,
+        (int)attributeType,
+        elementCount,
+        data,
+        &result, clnt);
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
+    }
+    if (result != CUDNN_STATUS_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result);
+    }
+    return result;
+}
+
+cudnnStatus_t cudnnBackendGetAttribute(cudnnBackendDescriptor_t const descriptor,
+                         cudnnBackendAttributeName_t attributeName,
+                         cudnnBackendAttributeType_t attributeType,
+                         int64_t requestedElementCount,
+                         int64_t *elementCount,
+                         void *arrayOfElements)
+{   /* RPC stub: the reply buffer is read as an int64_t element count followed by the element data; both outputs are optional (may be NULL). */
+#ifdef WITH_API_CNT
+    api_call_cnt++;
+#endif //WITH_API_CNT
+    mem_result result = { .err = CUDNN_STATUS_INTERNAL_ERROR }; /* keeps the returned status defined when the RPC fails before filling in result */
+    LOGE(LOG_DEBUG, "%s(%p, %d, %d, %ld, %p, %p)", __FUNCTION__, descriptor, attributeName, attributeType, requestedElementCount, elementCount, arrayOfElements);
+    if ((size_t)attributeType >= sizeof(backendAttributeSizes) / sizeof(backendAttributeSizes[0]) || requestedElementCount < 0) { /* never index the size table out of bounds */
+        LOGE(LOG_ERROR, "%s failed (attributeType %d or requestedElementCount %ld is out of range)", __FUNCTION__, attributeType, requestedElementCount);
+        return CUDNN_STATUS_BAD_PARAM;
+    }
+    size_t expected_size = requestedElementCount * backendAttributeSizes[attributeType] + sizeof(int64_t);
+    char *reply_buf = result.mem_result_u.data.mem_data_val = malloc(expected_size); /* local copy so the buffer can be released before returning */
+    if (reply_buf == NULL) {
+        LOGE(LOG_ERROR, "%s failed (malloc failed)", __FUNCTION__);
+        return CUDNN_STATUS_ALLOC_FAILED;
+    }
+    enum clnt_stat retval_1 = rpc_cudnnbackendgetattribute_1(
+        (ptr)descriptor,
+        (int)attributeName,
+        (int)attributeType,
+        requestedElementCount,
+        &result, clnt);
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
+    }
+    if (result.err != CUDNN_STATUS_SUCCESS || result.mem_result_u.data.mem_data_len != expected_size) {
+        LOGE(LOG_ERROR, "%s failed (result is %d, size is %zu, expected %zu)", __FUNCTION__, result.err, (size_t)result.mem_result_u.data.mem_data_len, expected_size);
+        if (elementCount != NULL) *elementCount = 0;
+    } else {
+        int64_t count = *(int64_t*)result.mem_result_u.data.mem_data_val; /* element count reported in the reply header */
+        if (elementCount != NULL) *elementCount = count;
+        LOGE(LOG_DEBUG, "elementCount = %ld", count);
+        if (count > requestedElementCount) count = requestedElementCount; /* the reply was sized for requestedElementCount elements, so never copy more */
+        if (arrayOfElements != NULL && count > 0) memcpy(arrayOfElements, result.mem_result_u.data.mem_data_val + sizeof(int64_t), count * backendAttributeSizes[attributeType]);
+    }
+    free(reply_buf); /* the original leaked this buffer on every call */
+    return result.err;
+}
+
+cudnnStatus_t cudnnBackendExecute(cudnnHandle_t handle, cudnnBackendDescriptor_t executionPlan, cudnnBackendDescriptor_t variantPack)
+{   /* RPC stub: forwards the three handles via rpc_cudnnbackendexecute_1 and returns the status it reports. */
+#ifdef WITH_API_CNT
+    api_call_cnt++;
+#endif //WITH_API_CNT
+    int result = CUDNN_STATUS_INTERNAL_ERROR; /* keeps the returned status defined when the RPC fails before writing result */
+    enum clnt_stat retval_1;
+    LOGE(LOG_DEBUG, "%s(%p, %p, %p)", __FUNCTION__, handle, executionPlan, variantPack);
+    retval_1 = rpc_cudnnbackendexecute_1(
+        (ptr)handle,
+        (ptr)executionPlan,
+        (ptr)variantPack,
+        &result, clnt);
+    if (retval_1 != RPC_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (%d)", __FUNCTION__, retval_1);
+    }
+    if (result != CUDNN_STATUS_SUCCESS) {
+        LOGE(LOG_ERROR, "%s failed (result is %d)", __FUNCTION__, result);
+    }
+    return result;
+}
\ No newline at end of file
diff --git a/cpu/cpu-client-driver.c b/cpu/cpu-client-driver.c
index f0a00c27..45d44ba8 100644
--- a/cpu/cpu-client-driver.c
+++ b/cpu/cpu-client-driver.c
@@ -439,7 +439,7 @@ CUresult cuModuleLoad(CUmodule* module, const char* fname)
     return result.err;
 }
 DEF_FN(CUresult, cuModuleLoadData, CUmodule*, module, const void*, image)
-//DEF_FN(CUresult, cuModuleLoadDataEx, CUmodule*, module, const void*, image, unsigned int, numOptions, CUjit_option*, options, void**, optionValues)
+DEF_FN(CUresult, cuModuleLoadDataEx, CUmodule*, module, const void*, image, unsigned int, numOptions, CUjit_option*, options, void**, optionValues)
 DEF_FN(CUresult, cuModuleLoadFatBinary, CUmodule*, module, const void*, fatCubin)
 CUresult cuModuleUnload(CUmodule hmod)
 {
@@ -454,7 +454,6 @@ CUresult cuModuleUnload(CUmodule hmod)
 	}
     return result;
 }
-//DEF_FN(CUresult, cuModuleGetFunction, CUfunction*, hfunc, CUmodule, hmod, const char*, name)
 CUresult cuModuleGetFunction(CUfunction* hfun, CUmodule hmod, const char* name)
 {
 	enum clnt_stat retval;
@@ -865,4 +864,6 @@ CUresult cuGetProcAddress(const char* symbol, void** pfn, int cudaVersion, cuuin
     //*symbolStatus = CU_GET_PROC_ADDRESS_VERSION_NOT_SUFFICIENT;
     return cudaSuccess;
 }
-#endif
\ No newline at end of file
+#endif
+
+
diff --git a/cpu/cpu-client.c b/cpu/cpu-client.c
index ab3d57e3..c4bc68d1 100644
--- a/cpu/cpu-client.c
+++ b/cpu/cpu-client.c
@@ -246,30 +246,33 @@ void *dlopen(const char *filename, int flag)
         }
     }
 
-    if (filename != NULL && 
-        (strcmp(filename, "libcuda.so.1") == 0 ||
-        strcmp(filename, "libcuda.so") == 0 ||
-        strcmp(filename, "libnvidia-ml.so.1")) == 0) {
-        LOG(LOG_DEBUG, "replacing dlopen call to %s with cricket-client.so", filename);
-        dl_handle = dlopen_orig("cricket-client.so", flag);
-        if (clnt == NULL) {
-            LOGE(LOG_ERROR, "rpc seems to be uninitialized");
+    static const char *replace_libs[] = {
+        "libcuda.so.1",
+        "libcuda.so",
+        "libnvidia-ml.so.1",
+        "libcudnn_cnn_infer.so.8"
+    };
+    static const size_t replace_libs_sz = sizeof(replace_libs) / sizeof(char *);
+    if (filename != NULL) {
+        for (size_t i=0; i != replace_libs_sz; ++i) {
+            if (strcmp(filename, replace_libs[i]) == 0) {
+                LOG(LOG_DEBUG, "replacing dlopen call to %s with cricket-client.so", filename);
+                dl_handle = dlopen_orig("cricket-client.so", flag);
+                if (clnt == NULL) {
+                    LOGE(LOG_ERROR, "rpc seems to be uninitialized");
+                }
+                return dl_handle;
+            }
         }
-        return dl_handle;
-    } else {
-        // if ((has_kernel = cpu_utils_parameter_info(&kernel_infos, (char *)filename)) == 0) {
-        //     LOGE(LOG_DBG(1), "dlopen file \"%s\", but does not contain a kernel", filename);
-        // } else {
-        //     LOGE(LOG_DEBUG, "dlopen file \"%s\", contains a kernel", filename);
-        // }
-        if ((ret = dlopen_orig(filename, flag)) == NULL) {
-            LOGE(LOG_ERROR, "dlopen failed");
-        } else if (has_kernel) {
-            dlinfo(ret, RTLD_DI_LINKMAP, &map);
-            LOGE(LOG_DEBUG, "dlopen to  %p", map->l_addr);
-        }
-        return ret;
     }
+    /* filename is NULL or not in replace_libs list: forward to the real dlopen */
+    if ((ret = dlopen_orig(filename, flag)) == NULL) {
+        LOGE(LOG_ERROR, "dlopen failed: %s", dlerror());
+    } else if (has_kernel) {
+        dlinfo(ret, RTLD_DI_LINKMAP, &map);
+        LOGE(LOG_DEBUG, "dlopen to  %p", map->l_addr);
+    }
+    return ret;
 }
 
 int dlclose(void *handle)
diff --git a/cpu/cpu-libwrap.h b/cpu/cpu-libwrap.h
index 361f4105..5b3a8ba7 100644
--- a/cpu/cpu-libwrap.h
+++ b/cpu/cpu-libwrap.h
@@ -186,10 +186,24 @@ RET NAME(P1_TYPE P1_NAME, P2_TYPE P2_NAME, P3_TYPE P3_NAME, P4_TYPE P4_NAME, P5_
     DEF_FN_PTR(RET, P1_TYPE, P2_TYPE, P3_TYPE, P4_TYPE, P5_TYPE, P6_TYPE, P7_TYPE, P8_TYPE, P9_TYPE, P10_TYPE, P11_TYPE, P12_TYPE, P13_TYPE, P14_TYPE, P15_TYPE, P16_TYPE, P17_TYPE, P18_TYPE, P19_TYPE, P20_TYPE, P21_TYPE); \
     DEF_FN_BODY(RET, NAME, P1_NAME, P2_NAME, P3_NAME, P4_NAME, P5_NAME, P6_NAME, P7_NAME, P8_NAME, P9_NAME, P10_NAME, P11_NAME, P12_NAME, P13_NAME, P14_NAME, P15_NAME, P16_NAME, P17_NAME, P18_NAME, P19_NAME, P20_NAME, P21_NAME); \
 }
+#define DEF_FN_22(RET, NAME, P1_TYPE, P1_NAME, P2_TYPE, P2_NAME, P3_TYPE, P3_NAME, P4_TYPE, P4_NAME, P5_TYPE, P5_NAME, P6_TYPE, P6_NAME, P7_TYPE, P7_NAME, P8_TYPE, P8_NAME, P9_TYPE, P9_NAME, P10_TYPE, P10_NAME, P11_TYPE, P11_NAME, P12_TYPE, P12_NAME, P13_TYPE, P13_NAME, P14_TYPE, P14_NAME, P15_TYPE, P15_NAME, P16_TYPE, P16_NAME, P17_TYPE, P17_NAME, P18_TYPE, P18_NAME, P19_TYPE, P19_NAME, P20_TYPE, P20_NAME, P21_TYPE, P21_NAME, P22_TYPE, P22_NAME) \
+RET NAME(P1_TYPE P1_NAME, P2_TYPE P2_NAME, P3_TYPE P3_NAME, P4_TYPE P4_NAME, P5_TYPE P5_NAME, P6_TYPE P6_NAME, P7_TYPE P7_NAME, P8_TYPE P8_NAME, P9_TYPE P9_NAME, P10_TYPE P10_NAME, P11_TYPE P11_NAME, P12_TYPE P12_NAME, P13_TYPE P13_NAME, P14_TYPE P14_NAME, P15_TYPE P15_NAME, P16_TYPE P16_NAME, P17_TYPE P17_NAME, P18_TYPE P18_NAME, P19_TYPE P19_NAME, P20_TYPE P20_NAME, P21_TYPE P21_NAME, P22_TYPE P22_NAME) \
+{ \
+    DEF_FN_PTR(RET, P1_TYPE, P2_TYPE, P3_TYPE, P4_TYPE, P5_TYPE, P6_TYPE, P7_TYPE, P8_TYPE, P9_TYPE, P10_TYPE, P11_TYPE, P12_TYPE, P13_TYPE, P14_TYPE, P15_TYPE, P16_TYPE, P17_TYPE, P18_TYPE, P19_TYPE, P20_TYPE, P21_TYPE, P22_TYPE); \
+    DEF_FN_BODY(RET, NAME, P1_NAME, P2_NAME, P3_NAME, P4_NAME, P5_NAME, P6_NAME, P7_NAME, P8_NAME, P9_NAME, P10_NAME, P11_NAME, P12_NAME, P13_NAME, P14_NAME, P15_NAME, P16_NAME, P17_NAME, P18_NAME, P19_NAME, P20_NAME, P21_NAME, P22_NAME); \
+}
+#define DEF_FN_23(RET, NAME, P1_TYPE, P1_NAME, P2_TYPE, P2_NAME, P3_TYPE, P3_NAME, P4_TYPE, P4_NAME, P5_TYPE, P5_NAME, P6_TYPE, P6_NAME, P7_TYPE, P7_NAME, P8_TYPE, P8_NAME, P9_TYPE, P9_NAME, P10_TYPE, P10_NAME, P11_TYPE, P11_NAME, P12_TYPE, P12_NAME, P13_TYPE, P13_NAME, P14_TYPE, P14_NAME, P15_TYPE, P15_NAME, P16_TYPE, P16_NAME, P17_TYPE, P17_NAME, P18_TYPE, P18_NAME, P19_TYPE, P19_NAME, P20_TYPE, P20_NAME, P21_TYPE, P21_NAME, P22_TYPE, P22_NAME, P23_TYPE, P23_NAME) \
+RET NAME(P1_TYPE P1_NAME, P2_TYPE P2_NAME, P3_TYPE P3_NAME, P4_TYPE P4_NAME, P5_TYPE P5_NAME, P6_TYPE P6_NAME, P7_TYPE P7_NAME, P8_TYPE P8_NAME, P9_TYPE P9_NAME, P10_TYPE P10_NAME, P11_TYPE P11_NAME, P12_TYPE P12_NAME, P13_TYPE P13_NAME, P14_TYPE P14_NAME, P15_TYPE P15_NAME, P16_TYPE P16_NAME, P17_TYPE P17_NAME, P18_TYPE P18_NAME, P19_TYPE P19_NAME, P20_TYPE P20_NAME, P21_TYPE P21_NAME, P22_TYPE P22_NAME, P23_TYPE P23_NAME) \
+{ \
+    DEF_FN_PTR(RET, P1_TYPE, P2_TYPE, P3_TYPE, P4_TYPE, P5_TYPE, P6_TYPE, P7_TYPE, P8_TYPE, P9_TYPE, P10_TYPE, P11_TYPE, P12_TYPE, P13_TYPE, P14_TYPE, P15_TYPE, P16_TYPE, P17_TYPE, P18_TYPE, P19_TYPE, P20_TYPE, P21_TYPE, P22_TYPE, P23_TYPE); \
+    DEF_FN_BODY(RET, NAME, P1_NAME, P2_NAME, P3_NAME, P4_NAME, P5_NAME, P6_NAME, P7_NAME, P8_NAME, P9_NAME, P10_NAME, P11_NAME, P12_NAME, P13_NAME, P14_NAME, P15_NAME, P16_NAME, P17_NAME, P18_NAME, P19_NAME, P20_NAME, P21_NAME, P22_NAME, P23_NAME); \
+}
 
-#define DEF_FN_X(x, RET, NAME, P1_TYPE, P1_NAME, P2_TYPE, P2_NAME, P3_TYPE, P3_NAME, P4_TYPE, P4_NAME, P5_TYPE, P5_NAME, P6_TYPE, P6_NAME, P7_TYPE, P7_NAME, P8_TYPE, P8_NAME, P9_TYPE, P9_NAME, P10_TYPE, P10_NAME, P11_TYPE, P11_NAME, P12_TYPE, P12_NAME, P13_TYPE, P13_NAME, P14_TYPE, P14_NAME, P15_TYPE, P15_NAME, P16_TYPE, P16_NAME, P17_TYPE, P17_NAME, P18_TYPE, P18_NAME, P19_TYPE, P19_NAME, P20_TYPE, P20_NAME, P21_TYPE, P21_NAME, FUNC, ...) FUNC
+#define DEF_FN_X(x, RET, NAME, P1_TYPE, P1_NAME, P2_TYPE, P2_NAME, P3_TYPE, P3_NAME, P4_TYPE, P4_NAME, P5_TYPE, P5_NAME, P6_TYPE, P6_NAME, P7_TYPE, P7_NAME, P8_TYPE, P8_NAME, P9_TYPE, P9_NAME, P10_TYPE, P10_NAME, P11_TYPE, P11_NAME, P12_TYPE, P12_NAME, P13_TYPE, P13_NAME, P14_TYPE, P14_NAME, P15_TYPE, P15_NAME, P16_TYPE, P16_NAME, P17_TYPE, P17_NAME, P18_TYPE, P18_NAME, P19_TYPE, P19_NAME, P20_TYPE, P20_NAME, P21_TYPE, P21_NAME, P22_TYPE, P22_NAME, P23_TYPE, P23_NAME, FUNC, ...) FUNC
 
 #define DEF_FN(...) DEF_FN_X(,##__VA_ARGS__,\
+                    DEF_FN_23(__VA_ARGS__),,\
+                    DEF_FN_22(__VA_ARGS__),,\
                     DEF_FN_21(__VA_ARGS__),,\
                     DEF_FN_20(__VA_ARGS__),,\
                     DEF_FN_19(__VA_ARGS__),,\
diff --git a/cpu/cpu-server-cublas.c b/cpu/cpu-server-cublas.c
index e93f5036..f3b9dbab 100644
--- a/cpu/cpu-server-cublas.c
+++ b/cpu/cpu-server-cublas.c
@@ -101,6 +101,53 @@ bool_t rpc_cublasdestroy_1_svc(ptr handle, int *result, struct svc_req *rqstp)
     return 1;
 }
 
+bool_t rpc_cublassetworkspace_1_svc(ptr handle, ptr workspace, size_t workspaceSizeInBytes, int *result, struct svc_req *rqstp)
+{
+    RECORD_API(rpc_cublassetworkspace_1_argument);
+    RECORD_NARG(handle);
+    RECORD_NARG(workspace);
+    RECORD_NARG(workspaceSizeInBytes);
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+    GSCHED_RETAIN;
+    *result = cublasSetWorkspace(
+        resource_mg_get(&rm_cublas, (void*)handle),
+        resource_mg_get(&rm_memory, (void*)workspace),
+        workspaceSizeInBytes);
+    GSCHED_RELEASE;
+    RECORD_RESULT(integer, *result);
+    return 1;
+}
+
+bool_t rpc_cublassetstream_1_svc(ptr handle, ptr streamId, int *result, struct svc_req *rqstp)
+{
+    RECORD_API(rpc_cublassetstream_1_argument);
+    RECORD_NARG(handle);
+    RECORD_NARG(streamId);
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+    GSCHED_RETAIN;
+    *result = cublasSetStream(
+        resource_mg_get(&rm_cublas, (void*)handle),
+        resource_mg_get(&rm_streams, (void*)streamId));
+    GSCHED_RELEASE;
+    RECORD_RESULT(integer, *result);
+    return 1;
+}
+
+bool_t rpc_cublassetmathmode_1_svc(ptr handle, int mode, int *result, struct svc_req *rqstp)
+{
+    RECORD_API(rpc_cublassetmathmode_1_argument);
+    RECORD_NARG(handle);
+    RECORD_NARG(mode);
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+    GSCHED_RETAIN;
+    *result = cublasSetMathMode(
+        resource_mg_get(&rm_cublas, (void*)handle),
+        (cublasMath_t)mode);
+    GSCHED_RELEASE;
+    RECORD_RESULT(integer, *result);
+    return 1;
+}
+
 bool_t rpc_cublassgemm_1_svc(ptr handle, int transa, int transb, int m, int n, int k, float alpha,
             ptr A, int lda,
             ptr B, int ldb, float beta,
diff --git a/cpu/cpu-server-cudnn.c b/cpu/cpu-server-cudnn.c
index 16a11b06..70e4abce 100644
--- a/cpu/cpu-server-cudnn.c
+++ b/cpu/cpu-server-cudnn.c
@@ -2,6 +2,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <cudnn.h>
+#include <stdbool.h>
 
 #include "cpu_rpc_prot.h"
 #include "cpu-common.h"
@@ -27,6 +28,7 @@ int server_cudnn_init(int bypass)
     ret &= resource_mg_init(&rm_cudnn_activations, bypass);
     ret &= resource_mg_init(&rm_cudnn_lrns, bypass);
     ret &= resource_mg_init(&rm_cudnn_convs, bypass);
+    ret &= resource_mg_init(&rm_cudnn_backendds, bypass);
     return ret;
 }
 
@@ -39,6 +41,7 @@ int server_cudnn_deinit(void)
     resource_mg_free(&rm_cudnn_activations);
     resource_mg_free(&rm_cudnn_lrns);
     resource_mg_free(&rm_cudnn_convs);
+    resource_mg_free(&rm_cudnn_backendds);
     return 0;
 
 }
@@ -1210,4 +1213,184 @@ bool_t rpc_cudnntransformtensor_1_svc(ptr handle, cudnn_scaling_t alpha, ptr xDe
     GSCHED_RELEASE;
     RECORD_RESULT(integer, *result);
     return 1;
+}
+
+static const size_t backendAttributeSizes[] = {
+    [CUDNN_TYPE_HANDLE] = sizeof(cudnnHandle_t),
+    [CUDNN_TYPE_DATA_TYPE] = sizeof(cudnnDataType_t),
+    [CUDNN_TYPE_BOOLEAN] = sizeof(bool),
+    [CUDNN_TYPE_INT64] = sizeof(int64_t),
+    [CUDNN_TYPE_FLOAT] = sizeof(float),
+    [CUDNN_TYPE_DOUBLE] = sizeof(double),
+    [CUDNN_TYPE_VOID_PTR] = sizeof(void *),
+    [CUDNN_TYPE_CONVOLUTION_MODE] = sizeof(cudnnConvolutionMode_t),
+    [CUDNN_TYPE_HEUR_MODE] = sizeof(cudnnBackendHeurMode_t),
+    [CUDNN_TYPE_KNOB_TYPE] = sizeof(cudnnBackendKnobType_t),
+    [CUDNN_TYPE_NAN_PROPOGATION] = sizeof(cudnnNanPropagation_t),
+    [CUDNN_TYPE_NUMERICAL_NOTE] = sizeof(cudnnBackendNumericalNote_t),
+    [CUDNN_TYPE_LAYOUT_TYPE] = sizeof(cudnnBackendLayoutType_t),
+    [CUDNN_TYPE_ATTRIB_NAME] = sizeof(cudnnBackendAttributeName_t),
+    [CUDNN_TYPE_POINTWISE_MODE] = sizeof(cudnnPointwiseMode_t),
+    [CUDNN_TYPE_BACKEND_DESCRIPTOR] = sizeof(cudnnBackendDescriptor_t),
+    [CUDNN_TYPE_GENSTATS_MODE] = sizeof(cudnnGenStatsMode_t),
+    [CUDNN_TYPE_BN_FINALIZE_STATS_MODE] = sizeof(cudnnBnFinalizeStatsMode_t),
+    [CUDNN_TYPE_REDUCTION_OPERATOR_TYPE] = sizeof(cudnnReduceTensorOp_t),
+    [CUDNN_TYPE_BEHAVIOR_NOTE] = sizeof(cudnnBackendBehaviorNote_t),
+    [CUDNN_TYPE_TENSOR_REORDERING_MODE] = sizeof(cudnnBackendTensorReordering_t),
+    [CUDNN_TYPE_RESAMPLE_MODE] = sizeof(cudnnResampleMode_t),
+    [CUDNN_TYPE_PADDING_MODE] = sizeof(cudnnPaddingMode_t),
+    [CUDNN_TYPE_INT32] = sizeof(int32_t),
+    [CUDNN_TYPE_CHAR] = sizeof(char),
+    [CUDNN_TYPE_SIGNAL_MODE] = sizeof(cudnnSignalMode_t),
+    [CUDNN_TYPE_FRACTION] = sizeof(cudnnFraction_t),
+    [CUDNN_TYPE_NORM_MODE] = sizeof(cudnnBackendNormMode_t),
+    [CUDNN_TYPE_NORM_FWD_PHASE] = sizeof(cudnnBackendNormFwdPhase_t),
+    [CUDNN_TYPE_RNG_DISTRIBUTION] = sizeof(cudnnRngDistribution_t),
+};
+
+bool_t rpc_cudnnbackendcreatedescriptor_1_svc(int descriptorType, ptr_result *result, struct svc_req *rqstp)
+{
+    RECORD_API(int);
+    RECORD_SINGLE_ARG(descriptorType);
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+
+    GSCHED_RETAIN;
+    result->err = cudnnBackendCreateDescriptor(
+        (cudnnBackendDescriptorType_t)descriptorType,
+        (cudnnBackendDescriptor_t*)&result->ptr_result_u.ptr);
+    if (resource_mg_create(&rm_cudnn_backendds, (void*)result->ptr_result_u.ptr) != 0) {
+        LOGE(LOG_ERROR, "error in resource manager");
+    }
+    GSCHED_RELEASE;
+    RECORD_RESULT(ptr_result_u, *result);
+    return 1;
+}
+
+bool_t rpc_cudnnbackenddestroydescriptor_1_svc(ptr descriptor, int *result, struct svc_req *rqstp)
+{
+    RECORD_API(ptr);
+    RECORD_SINGLE_ARG(descriptor);
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+
+    GSCHED_RETAIN;
+    *result = cudnnBackendDestroyDescriptor(
+        (cudnnBackendDescriptor_t)resource_mg_get(&rm_cudnn_backendds, (void*)descriptor));
+    // TODO: Remove from resource manager
+    GSCHED_RELEASE;
+    RECORD_RESULT(integer, *result);
+    return 1;
+}
+
+bool_t rpc_cudnnbackendinitialize_1_svc(ptr descriptor, int *result, struct svc_req *rqstp)
+{
+    RECORD_API(ptr);
+    RECORD_SINGLE_ARG(descriptor);
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+
+    GSCHED_RETAIN;
+    *result = cudnnBackendInitialize(
+        (cudnnBackendDescriptor_t)resource_mg_get(&rm_cudnn_backendds, (void*)descriptor));
+    GSCHED_RELEASE;
+    RECORD_RESULT(integer, *result);
+    return 1;
+}
+
+bool_t rpc_cudnnbackendfinalize_1_svc(ptr descriptor, int *result, struct svc_req *rqstp)
+{
+    RECORD_API(ptr);
+    RECORD_SINGLE_ARG(descriptor);
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+
+    GSCHED_RETAIN;
+    *result = cudnnBackendFinalize(
+        (cudnnBackendDescriptor_t)resource_mg_get(&rm_cudnn_backendds, (void*)descriptor));
+    GSCHED_RELEASE;
+    RECORD_RESULT(integer, *result);
+    return 1;
+}
+bool_t rpc_cudnnbackendsetattribute_1_svc(
+                         ptr descriptor,
+                         int attributeName,
+                         int attributeType,
+                         int64_t elementCount,
+                         mem_data arrayOfElements,
+                         int *result, struct svc_req *rqstp)
+{
+    RECORD_API(rpc_cudnnbackendsetattribute_1_argument);
+    RECORD_NARG(descriptor);
+    RECORD_NARG(attributeName);
+    RECORD_NARG(attributeType);
+    RECORD_NARG(elementCount);
+    RECORD_NARG(arrayOfElements);
+    /* RPC handler: validates the payload and forwards it to cudnnBackendSetAttribute. Returns 0 on invalid input. */
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+    /* CUDNN_TYPE_RNG_DISTRIBUTION is the last entry of backendAttributeSizes and is itself a valid type. */
+    if (attributeType < 0 || attributeType > CUDNN_TYPE_RNG_DISTRIBUTION) {
+        LOGE(LOG_ERROR, "attributeType out of range.");
+        return 0;
+    }
+    /* A negative count would wrap in the unsigned multiplication below, so reject it explicitly. */
+    if (elementCount < 0 || arrayOfElements.mem_data_len != elementCount * backendAttributeSizes[attributeType]) {
+        LOGE(LOG_ERROR, "array dimensions not as expected.");
+        return 0;
+    }
+    GSCHED_RETAIN;
+    *result = cudnnBackendSetAttribute(
+        (cudnnBackendDescriptor_t)resource_mg_get(&rm_cudnn_backendds, (void*)descriptor),
+        (cudnnBackendAttributeName_t)attributeName,
+        (cudnnBackendAttributeType_t)attributeType,
+        elementCount,
+        arrayOfElements.mem_data_val);
+    GSCHED_RELEASE;
+    RECORD_RESULT(integer, *result);
+    return 1;
+}
+
+bool_t rpc_cudnnbackendgetattribute_1_svc(ptr descriptor, int attributeName, int attributeType, int64_t requestedElementCount, mem_result *result, struct svc_req *rqstp)
+{
+    void *arrayOfElements = NULL; /* stays NULL when only the element count is queried */
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+    if (attributeType < 0 || attributeType > CUDNN_TYPE_RNG_DISTRIBUTION) {
+        LOGE(LOG_ERROR, "attributeType out of range.");
+        return 0;
+    }
+    if (requestedElementCount < 0) {
+        LOGE(LOG_ERROR, "requestedElementCount must not be negative.");
+        return 0;
+    }
+    /* Reply layout: [int64_t elementCount][requestedElementCount elements, each backendAttributeSizes[attributeType] bytes]. */
+    result->mem_result_u.data.mem_data_len = sizeof(int64_t) + requestedElementCount*backendAttributeSizes[attributeType];
+    if ((result->mem_result_u.data.mem_data_val = calloc(1, result->mem_result_u.data.mem_data_len)) == NULL) {
+        LOGE(LOG_ERROR, "malloc failed");
+        return 0;
+    }
+    if (requestedElementCount > 0) { arrayOfElements = result->mem_result_u.data.mem_data_val + sizeof(int64_t); }
+    GSCHED_RETAIN;
+    result->err = cudnnBackendGetAttribute(
+        (cudnnBackendDescriptor_t)resource_mg_get(&rm_cudnn_backendds, (void*)descriptor),
+        (cudnnBackendAttributeName_t)attributeName,
+        (cudnnBackendAttributeType_t)attributeType,
+        requestedElementCount,
+        (int64_t*)result->mem_result_u.data.mem_data_val,
+        arrayOfElements);
+    LOGE(LOG_DEBUG, "desc: %p, name: %d, type: %d, requestedElementCount: %ld, elementCount: %ld, arrayOfElements: %p -> %d", (void*)descriptor, attributeName, attributeType, (long)requestedElementCount, (long)*(int64_t*)result->mem_result_u.data.mem_data_val, arrayOfElements, result->err);
+    GSCHED_RELEASE;
+    return 1;
+}
+bool_t rpc_cudnnbackendexecute_1_svc(ptr handle, ptr executionPlan, ptr variantPack, int *result, struct svc_req *rqstp)
+{
+    RECORD_API(rpc_cudnnbackendexecute_1_argument);
+    RECORD_NARG(handle);
+    RECORD_NARG(executionPlan);
+    RECORD_NARG(variantPack);
+    LOGE(LOG_DEBUG, "%s", __FUNCTION__);
+
+    GSCHED_RETAIN;
+    *result = cudnnBackendExecute(
+        (cudnnHandle_t)resource_mg_get(&rm_cudnn, (void*)handle),
+        (cudnnBackendDescriptor_t)resource_mg_get(&rm_cudnn_backendds, (void*)executionPlan),
+        (cudnnBackendDescriptor_t)resource_mg_get(&rm_cudnn_backendds, (void*)variantPack));
+    GSCHED_RELEASE;
+    RECORD_RESULT(integer, *result);
+    return 1;
 }
\ No newline at end of file
diff --git a/cpu/cpu_rpc_prot.x b/cpu/cpu_rpc_prot.x
index 987b89af..2ec9afa4 100644
--- a/cpu/cpu_rpc_prot.x
+++ b/cpu/cpu_rpc_prot.x
@@ -458,6 +458,9 @@ program RPC_CD_PROG {
                          ptr, int, ptr, int, double, ptr, int)                 = 3006;
         int          rpc_cublasSgemmEx(ptr, int, int, int, int, int, float,
                          ptr, int, int, ptr, int, int, float, ptr, int, int)                 = 3007;
+        int          rpc_cublasSetStream(ptr handle, ptr streamId)                             = 3008;
+        int          rpc_cublasSetWorkspace(ptr handle, ptr workspace, size_t workspaceSizeInBytes) = 3009;
+        int          rpc_cublasSetMathMode(ptr handle, int mode) = 3010;
 
         /* NVML */
         int_result   rpc_nvmlDeviceGetCount_v2(void)                           = 4000;
@@ -552,5 +555,19 @@ program RPC_CD_PROG {
         mem_result rpc_cudnnFindConvolutionForwardAlgorithm(ptr handle, ptr xDesc, ptr wDesc, ptr convDesc, ptr yDesc, int requestedAlgoCount) = 5306;
         sz_result rpc_cudnnGetConvolutionForwardWorkspaceSize(ptr handle, ptr xDesc, ptr wDesc, ptr convDesc, ptr yDesc, int algo) = 5307;
         int rpc_cudnnConvolutionForward(ptr handle, cudnn_scaling_t alpha, ptr xDesc, ptr x, ptr wDesc, ptr w, ptr convDesc, int algo, ptr workSpace, size_t workSpaceSizeInBytes, cudnn_scaling_t beta, ptr yDesc, ptr y) = 5308;
+        ptr_result rpc_cudnnBackendCreateDescriptor(int descriptorType) = 5309;
+        int rpc_cudnnBackendDestroyDescriptor(ptr descriptor) = 5310;
+        int rpc_cudnnBackendInitialize(ptr descriptor) = 5311;
+        int rpc_cudnnBackendFinalize(ptr descriptor) = 5312;
+        int rpc_cudnnBackendSetAttribute(ptr descriptor,
+                         int attributeName,
+                         int attributeType,
+                         int64_t elementCount,
+                         mem_data arrayOfElements) = 5313;
+        mem_result rpc_cudnnBackendGetAttribute(ptr descriptor,
+                            int attributeName,
+                            int attributeType,
+                            int64_t requestedElementCount) = 5314;
+        int rpc_cudnnBackendExecute(ptr handle, ptr executionPlan, ptr variantPack) = 5315;
     } = 1;
 } = 99;
diff --git a/cpu/resource-mg.h b/cpu/resource-mg.h
index 0e5f42e4..ee8c44fa 100644
--- a/cpu/resource-mg.h
+++ b/cpu/resource-mg.h
@@ -48,6 +48,7 @@ resource_mg rm_cudnn_poolings;
 resource_mg rm_cudnn_activations;
 resource_mg rm_cudnn_lrns;
 resource_mg rm_cudnn_convs;
+resource_mg rm_cudnn_backendds;
 
 
 /** initializes the resource manager
diff --git a/tests/samples/Makefile b/tests/samples/Makefile
index 97c3bcc2..d52596db 100644
--- a/tests/samples/Makefile
+++ b/tests/samples/Makefile
@@ -34,9 +34,10 @@ samples-bin/mnistCUDNN.sample : cudnn-samples samples-bin
 	make -C cudnn-samples/mnistCUDNN \
 		clean
 	make -C cudnn-samples/mnistCUDNN \
-		NVCCFLAGS="-cudart shared --no-compress -g -G" \
+		NVCCFLAGS="-cudart shared --no-compress -G" \
 		SMS="${SMS}" \
-		CUDA_PATH=${CUDA_PATH}
+		CUDA_PATH=${CUDA_PATH} \
+		DEBUG=1
 	cp cudnn-samples/mnistCUDNN/mnistCUDNN $@
 
 samples-bin/nbody.uncompressed.sample : samples samples-bin
diff --git a/tests/test_apps/yolo.py b/tests/test_apps/yolo.py
new file mode 100644
index 00000000..1d929155
--- /dev/null
+++ b/tests/test_apps/yolo.py
@@ -0,0 +1,12 @@
+import torch
+
+model = torch.hub.load("ultralytics/yolov5", "yolov5s", device='cuda:0')  # or yolov5n - yolov5x6, custom
+
+# Images
+img = "https://ultralytics.com/images/zidane.jpg"  # or file, Path, PIL, OpenCV, numpy, list
+
+# Inference
+results = model(img)
+
+# Results
+results.print()  # or .show(), .save(), .crop(), .pandas(), etc.
\ No newline at end of file

From ce21d8a856ccbf9048e1b8c97488d2b2e0829e2b Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Thu, 13 Jul 2023 12:12:11 +0200
Subject: [PATCH 77/83] improve debug output for cuModuleLoad

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 cpu/cpu-server-driver.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/cpu/cpu-server-driver.c b/cpu/cpu-server-driver.c
index 0277e414..305368b5 100644
--- a/cpu/cpu-server-driver.c
+++ b/cpu/cpu-server-driver.c
@@ -311,6 +311,9 @@ bool_t rpc_cumoduleload_1_svc(char* path, ptr_result *result,
     if (resource_mg_create(&rm_modules, (void*)result->ptr_result_u.ptr) != 0) {
         LOGE(LOG_ERROR, "error in resource manager");
     }
+    char *err_str = NULL;
+    cuGetErrorName(result->err, &err_str);
+    LOGE(LOG_DEBUG, "cuModuleLoad result: %s", err_str);
     RECORD_RESULT(ptr_result_u, *result);
     return 1;
 }

From 481dec9e9b4e55ea7cbf918bc6ac3e548e5e04a1 Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Thu, 13 Jul 2023 15:47:43 +0200
Subject: [PATCH 78/83] add support for cuModuleLoadData

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 cpu/cpu-client-driver.c  | 46 +++++++++++++++++++++++++++++++++++++++-
 cpu/cpu-elf2.c           |  2 --
 cpu/cpu-server-driver.c  | 28 +++++++++++++++++++++---
 cpu/cpu_rpc_prot.x       |  5 +++--
 tests/cpu/cubin/main.cpp | 35 +++++++++++++++++++++++++++++-
 5 files changed, 107 insertions(+), 9 deletions(-)

diff --git a/cpu/cpu-client-driver.c b/cpu/cpu-client-driver.c
index 45d44ba8..4d131737 100644
--- a/cpu/cpu-client-driver.c
+++ b/cpu/cpu-client-driver.c
@@ -6,6 +6,7 @@
 #include <cudaEGL.h>
 #include <vdpau/vdpau.h>
 #include <cudaVDPAU.h>
+#include <elf.h>
 
 #include <driver_types.h>
 #include <string.h>
@@ -438,7 +439,50 @@ CUresult cuModuleLoad(CUmodule* module, const char* fname)
     }
     return result.err;
 }
-DEF_FN(CUresult, cuModuleLoadData, CUmodule*, module, const void*, image)
+
+
+CUresult cuModuleLoadData(CUmodule* module, const void* image)
+{
+	enum clnt_stat retval;
+    ptr_result result;
+    mem_data mem;
+
+    if (image == NULL) {
+        LOGE(LOG_ERROR, "image is NULL!");
+        return CUDA_ERROR_INVALID_IMAGE;
+    }
+    Elf64_Ehdr *ehdr = (Elf64_Ehdr*)image;
+
+    if (ehdr->e_ident[EI_MAG0] != ELFMAG0 ||
+        ehdr->e_ident[EI_MAG1] != ELFMAG1 ||
+        ehdr->e_ident[EI_MAG2] != ELFMAG2 ||
+        ehdr->e_ident[EI_MAG3] != ELFMAG3) {
+        LOGE(LOG_ERROR, "image is not an ELF!");
+        return CUDA_ERROR_INVALID_IMAGE;
+    }
+
+    mem.mem_data_len = ehdr->e_shoff + ehdr->e_shnum * ehdr->e_shentsize;
+    mem.mem_data_val = (uint8_t*)image;
+
+    LOGE(LOG_DEBUG, "image_size = %#0zx", mem.mem_data_len);
+    
+    if (elf2_parameter_info(&kernel_infos, mem.mem_data_val, mem.mem_data_len) != 0) {
+        LOGE(LOG_ERROR, "could not get kernel infos from memory");
+        return CUDA_ERROR_INVALID_IMAGE;
+    }
+
+    retval = rpc_cumoduleloaddata_1(mem, &result, clnt);
+    printf("[rpc] %s(%p) = %d, result %p\n", __FUNCTION__, image, result.err, (void*)result.ptr_result_u.ptr);
+	if (retval != RPC_SUCCESS) {
+		fprintf(stderr, "[rpc] %s failed.", __FUNCTION__);
+        return CUDA_ERROR_UNKNOWN;
+	}
+    if (module != NULL) {
+       *module = (CUmodule)result.ptr_result_u.ptr;
+    }
+    return result.err;
+}
+
 DEF_FN(CUresult, cuModuleLoadDataEx, CUmodule*, module, const void*, image, unsigned int, numOptions, CUjit_option*, options, void**, optionValues)
 DEF_FN(CUresult, cuModuleLoadFatBinary, CUmodule*, module, const void*, fatCubin)
 CUresult cuModuleUnload(CUmodule hmod)
diff --git a/cpu/cpu-elf2.c b/cpu/cpu-elf2.c
index 37a1e486..89fcb24a 100644
--- a/cpu/cpu-elf2.c
+++ b/cpu/cpu-elf2.c
@@ -895,8 +895,6 @@ int elf2_parameter_info(list *kernel_infos, void* memory, size_t memsize)
         return -1;
     }
 
-    hexdump(memory, 0x10);
-
 #define ELF_DUMP_TO_FILE 1
 
 #ifdef ELF_DUMP_TO_FILE
diff --git a/cpu/cpu-server-driver.c b/cpu/cpu-server-driver.c
index 305368b5..4eb2aad4 100644
--- a/cpu/cpu-server-driver.c
+++ b/cpu/cpu-server-driver.c
@@ -299,6 +299,26 @@ bool_t rpc_cumodulegetfunction_1_svc(uint64_t module, char *name, ptr_result *re
     return 1;
 }
 
+bool_t rpc_cumoduleloaddata_1_svc(mem_data mem, ptr_result *result,
+                                     struct svc_req *rqstp)
+{
+    RECORD_API(mem_data);
+    RECORD_SINGLE_ARG(mem);
+    LOG(LOG_DEBUG, "%s(%p, %#0zx)", __FUNCTION__, mem.mem_data_val, mem.mem_data_len);
+    GSCHED_RETAIN;
+    result->err = cuModuleLoadData((CUmodule*)&result->ptr_result_u.ptr, mem.mem_data_val);
+    GSCHED_RELEASE;
+    if (resource_mg_create(&rm_modules, (void*)result->ptr_result_u.ptr) != 0) {
+        LOGE(LOG_ERROR, "error in resource manager");
+    }
+    if (result->err != 0) {
+        char *err_str = NULL;
+        cuGetErrorName(result->err, &err_str);
+        LOGE(LOG_DEBUG, "cuModuleLoadData result: %s", err_str);
+    }
+    RECORD_RESULT(ptr_result_u, *result);
+    return 1;
+}
 bool_t rpc_cumoduleload_1_svc(char* path, ptr_result *result,
                                      struct svc_req *rqstp)
 {
@@ -311,9 +331,11 @@ bool_t rpc_cumoduleload_1_svc(char* path, ptr_result *result,
     if (resource_mg_create(&rm_modules, (void*)result->ptr_result_u.ptr) != 0) {
         LOGE(LOG_ERROR, "error in resource manager");
     }
-    char *err_str = NULL;
-    cuGetErrorName(result->err, &err_str);
-    LOGE(LOG_DEBUG, "cuModuleLoad result: %s", err_str);
+    if (result->err != 0) {
+        char *err_str = NULL;
+        cuGetErrorName(result->err, &err_str);
+        LOGE(LOG_DEBUG, "cuModuleLoad result: %s", err_str);
+    }
     RECORD_RESULT(ptr_result_u, *result);
     return 1;
 }
diff --git a/cpu/cpu_rpc_prot.x b/cpu/cpu_rpc_prot.x
index 2ec9afa4..6d505842 100644
--- a/cpu/cpu_rpc_prot.x
+++ b/cpu/cpu_rpc_prot.x
@@ -424,6 +424,7 @@ program RPC_CD_PROG {
         mem_result   rpc_cuDeviceGetProperties(int)                            = 1023;
         dint_result  rpc_cuDeviceComputeCapability(int)                        = 1024;
         int_result   rpc_cuDeviceGetP2PAttribute(int, ptr, ptr)                = 1025; 
+        ptr_result   rpc_cuModuleLoadData(mem_data mem)                        = 1026;
 
         /* HIDDEN DRIVER API */
 /*        ptr_result   rpc_hidden_get_device_ctx(int)                            = 1101;
@@ -562,12 +563,12 @@ program RPC_CD_PROG {
         int rpc_cudnnBackendSetAttribute(ptr descriptor,
                          int attributeName,
                          int attributeType,
-                         int64_t elementCount,
+                         hyper elementCount,
                          mem_data arrayOfElements) = 5313;
         mem_result rpc_cudnnBackendGetAttribute(ptr descriptor,
                             int attributeName,
                             int attributeType,
-                            int64_t requestedElementCount) = 5314;
+                            hyper requestedElementCount) = 5314;
         int rpc_cudnnBackendExecute(ptr handle, ptr executionPlan, ptr variantPack) = 5315;
     } = 1;
 } = 99;
diff --git a/tests/cpu/cubin/main.cpp b/tests/cpu/cubin/main.cpp
index 5b04b744..6bad89b8 100644
--- a/tests/cpu/cubin/main.cpp
+++ b/tests/cpu/cubin/main.cpp
@@ -3,6 +3,9 @@
 #include <cuda_runtime.h>
 #include <cuda.h>
 #include <unistd.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <fcntl.h>
 
 
 #define printCudaErrors(err) __printCudaErrors (err, __FILE__, __LINE__)
@@ -76,6 +79,32 @@ int getModuleFromCubin(CUmodule *module, const char *cubin)
     return 0;
 }
 
+int getModuleFromCubinInMemory(CUmodule *module, const char *cubin)
+{
+    int fd = open(cubin, O_RDONLY); // map the cubin read-only, then load it via cuModuleLoadData
+    if (fd < 0) {
+        printf("error\n");
+        return 1;
+    }
+    struct stat st;
+    void *buf = MAP_FAILED;
+    if (fstat(fd, &st) == 0) {
+        printf("size: %#0zx\n", (size_t)st.st_size);
+        buf = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
+    }
+    close(fd); // the mapping (if any) stays valid after the descriptor is closed
+    if (buf == MAP_FAILED) {
+        printf("error\n");
+        return 1;
+    }
+    CUresult err;
+    if ((err = cuModuleLoadData(module, buf)) != CUDA_SUCCESS) {
+        printCudaErrors(err);
+        return 1;
+    }
+    return 0;
+}
+
 int getModuleFromShared(CUmodule **module, const char *cubin)
 {
     return 0;
@@ -97,10 +126,14 @@ int main(int argc, char** argv)
     CUmodule module;
     CUfunction func;
     printf("testing cubin...\n");
-    if (getModuleFromCubin(&module, "kernel.cubin") != 0) {
+    if (getModuleFromCubinInMemory(&module, "kernel.cubin") != 0) {
         printf("error\n");
         return 1;
     }
+    // if (getModuleFromCubin(&module, "kernel.cubin") != 0) {
+    //     printf("error\n");
+    //     return 1;
+    // }
     // if ((err = getModuleFromShared(&module, "kernel.so")) != 0) {
     //     printf("error\n");
     //     return 1;

From fbf7dad8457e923b3f1972d22cde920e082b75b6 Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Thu, 13 Jul 2023 15:52:14 +0200
Subject: [PATCH 79/83] cublas: remove usage of new APIs if we compile for CUDA
 10

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 cpu/cpu-server-cublas.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/cpu/cpu-server-cublas.c b/cpu/cpu-server-cublas.c
index f3b9dbab..ad54eca0 100644
--- a/cpu/cpu-server-cublas.c
+++ b/cpu/cpu-server-cublas.c
@@ -109,10 +109,15 @@ bool_t rpc_cublassetworkspace_1_svc(ptr handle, ptr workspace, size_t workspaceS
     RECORD_NARG(workspaceSizeInBytes);
     LOGE(LOG_DEBUG, "%s", __FUNCTION__);
     GSCHED_RETAIN;
+#if CUBLAS_VERSION >= 11000
     *result = cublasSetWorkspace(
         resource_mg_get(&rm_cublas, (void*)handle),
         resource_mg_get(&rm_memory, (void*)workspace),
         workspaceSizeInBytes);
+#else
+    LOGE(LOG_ERROR, "cublassetworkspace not supported in this version");
+    *result = -1;
+#endif
     GSCHED_RELEASE;
     RECORD_RESULT(integer, *result);
     return 1;
@@ -171,6 +176,7 @@ bool_t rpc_cublassgemm_1_svc(ptr handle, int transa, int transb, int m, int n, i
     RECORD_ARG(14, ldc);
     LOGE(LOG_DEBUG, "cublasSgemm");
     GSCHED_RETAIN;
+#if CUBLAS_VERSION >= 11000
     *result = cublasSgemm(resource_mg_get(&rm_cublas, (void*)handle),
                     (cublasOperation_t) transa,
                     (cublasOperation_t) transb,
@@ -179,6 +185,10 @@ bool_t rpc_cublassgemm_1_svc(ptr handle, int transa, int transb, int m, int n, i
                     resource_mg_get(&rm_memory, (void*)B), ldb, &beta,
                     resource_mg_get(&rm_memory, (void*)C), ldc
     );
+#else /* CUBLAS_VERSION < 11000: report the call as unsupported */
+    LOGE(LOG_ERROR, "cublassgemm not supported in this version");
+    *result = -1;
+#endif /* CUBLAS_VERSION >= 11000 */
     GSCHED_RELEASE;
     RECORD_RESULT(integer, *result);
     return 1;

From bf3a15e1590d7caafaa6eb9b85fbd9534dcdc097 Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Mon, 17 Jul 2023 14:37:42 +0200
Subject: [PATCH 80/83] fix using logger function before initialization

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 cpu/server-exe.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cpu/server-exe.c b/cpu/server-exe.c
index 805dd9ee..a174e0a2 100644
--- a/cpu/server-exe.c
+++ b/cpu/server-exe.c
@@ -12,13 +12,13 @@ int main(int argc, char** argv)
     } else if (argc == 2) {
         uint64_t vers;
         if (sscanf(argv[1], "%lu", &vers) != 1) {
-            LOGE(LOG_ERROR, "version string could not be converted to number");
-            LOGE(LOG_INFO, "usage: %s [unique rpc version]", argv[0]);
+            printf("version string could not be converted to number\n");
+            printf("usage: %s [unique rpc version]\n", argv[0]);
             return 1;
         }
         cricket_main(RPC_CD_PROG, vers);
     } else {
-        LOGE(LOG_INFO, "usage: %s", argv[0]);
+        printf("usage: %s\n", argv[0]);
     }
     return 0;
 }

From f30d9b0f3c736469053c1bb9adc98914a2b518da Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Mon, 17 Jul 2023 14:53:07 +0200
Subject: [PATCH 81/83] fix no output on weird shells, e.g. ssh

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 cpu/cpu-server.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/cpu/cpu-server.c b/cpu/cpu-server.c
index 744f7cfc..a8bbbe65 100644
--- a/cpu/cpu-server.c
+++ b/cpu/cpu-server.c
@@ -172,9 +172,10 @@ void cricket_main(size_t prog_num, size_t vers_num)
     struct sigaction act;
     char *command = NULL;
     act.sa_handler = int_handler;
-    sigaction(SIGINT, &act, NULL);
-    LOG(LOG_DBG(1), "log level is %d", LOG_LEVEL);
+    printf("welcome to cricket!\n");
     init_log(LOG_LEVEL, __FILE__);
+    LOG(LOG_DBG(1), "log level is %d", LOG_LEVEL);
+    sigaction(SIGINT, &act, NULL);
 
     #ifdef WITH_IB
     char client[256];
@@ -317,6 +318,9 @@ void cricket_main(size_t prog_num, size_t vers_num)
 
     LOG(LOG_INFO, "waiting for RPC requests...");
 
+    // make sure that our output is flushed even when stdout is not line-buffered (e.g. not a terminal)
+    fflush(stdout);
+
     svc_run();
 
     LOG(LOG_DEBUG, "svc_run returned. Cleaning up.");

From 07db2ba3c16756df10192cc4d5c4feecd4051eb3 Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Tue, 18 Jul 2023 14:24:05 +0200
Subject: [PATCH 82/83] remove md5

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 cpu/cpu-utils.c | 29 -----------------------------
 1 file changed, 29 deletions(-)

diff --git a/cpu/cpu-utils.c b/cpu/cpu-utils.c
index cf67ce07..9a4371c8 100644
--- a/cpu/cpu-utils.c
+++ b/cpu/cpu-utils.c
@@ -6,7 +6,6 @@
 #include <errno.h>
 #include <string.h>
 #include <sys/wait.h>
-#include <openssl/md5.h>
 #include <linux/limits.h>
 #include "rpc/types.h"
 #include <sys/stat.h>
@@ -46,34 +45,6 @@ int cpu_utils_command(char **command)
 
 }
 
-int cpu_utils_md5hash(char *filename, unsigned long *high, unsigned long *low)
-{
-    unsigned char c[MD5_DIGEST_LENGTH];
-    FILE *fd;
-    MD5_CTX mdContext;
-    int bytes;
-    unsigned char data[1024];
-
-    if (filename == NULL || high == NULL || low == NULL) {
-        return -1;
-    }
-
-    if ((fd = fopen(filename, "rb")) == NULL) {
-        LOGE(LOG_ERROR, "%s can't be opened.", filename);
-        return -1;
-    }
-
-    MD5_Init (&mdContext);
-    while ((bytes = fread(data, 1, 1024, fd)) != 0)
-        MD5_Update(&mdContext, data, bytes);
-    MD5_Final(c, &mdContext);
-    fclose (fd);
-    *high = *((unsigned long*)c);
-    *low  = *((unsigned long*)(c+8));
-    return 0;
-}
-
-
 
 int cpu_utils_launch_child(const char *file, char **args)
 {

From 088b6fc96a58ef668b28a0715cd5f61d9b72f588 Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Tue, 18 Jul 2023 16:28:01 +0200
Subject: [PATCH 83/83] remove cuda 10 support, add cudnn CI test

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 .gitlab-ci.yml          | 55 +++++++++++------------------------------
 tests/samples/Makefile  |  8 ++++--
 utils/Dockerfile.cuda10 | 45 ---------------------------------
 3 files changed, 20 insertions(+), 88 deletions(-)
 delete mode 100644 utils/Dockerfile.cuda10

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index b3908604..df9b8eb4 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -41,16 +41,6 @@ prepare:centos8:cuda11:
   tags:
     - docker
 
-prepare:centos8:cuda10:
-  stage: prepare
-  script:
-    - docker build
-        --file utils/Dockerfile.cuda10
-        --tag ${DOCKER_IMAGE_DEV}_cuda10:${DOCKER_TAG}
-        --tag ${DOCKER_IMAGE_DEV}_cuda10:latest .
-  tags:
-    - docker
-
 # check if styleguide is fulfilled
 #style_check:
 #  stage: build
@@ -78,6 +68,7 @@ build:
     paths:
       - bin
       - tests/bin
+      - tests/samples/samples-bin
   image: ${DOCKER_IMAGE_DEV}:${DOCKER_TAG}
   cache:
     paths:
@@ -144,32 +135,6 @@ build:cuda11:
   tags:
     - docker
 
-build:cuda10:
-  stage: build
-  needs: ["prepare:centos8:cuda10"]
-  script:
-   - make -j 32 libtirpc
-   - make -j 32 cuda-gdb
-   - make -j 1 LOG=INFO NOSAMPLES=yes
-  artifacts:
-    expire_in: 1 week
-    paths:
-      - bin
-      - tests/bin
-  image: ${DOCKER_IMAGE_DEV}_cuda10:${DOCKER_TAG}
-  cache:
-    paths:
-      - gpu/build
-      - cpu/*.o
-      - tests/cpu/*.o
-      - tests/test_apps/*.o
-      - submodules/libtirpc
-      - submodules/cuda-gdb
-      - submodules/cuda-gdb-src.rpm
-    key: build_cuda10
-  tags:
-    - docker
-
 build:debug:
   stage: build
   needs: ["prepare:rocky9:docker-dev"]
@@ -206,6 +171,7 @@ build:debug:
     LDIR: '$CI_BUILDS_DIR/$CI_PROJECT_PATH/bin'
     SAMPLES_PATH: '/usr/local/cuda/samples'
     PARAMETER: ''
+    CHDIR: 'tests'
   script:
     - mkdir ~/.ssh &&
       echo "-----BEGIN OPENSSH PRIVATE KEY-----" > ~/.ssh/id_rsa &&
@@ -217,7 +183,8 @@ build:debug:
     - scp -r $LDIR/* $GPU_TARGET:$RDIR/
     - ssh $GPU_TARGET "LD_PRELOAD=$RDIR/libtirpc.so.3 $RDIR/cricket-rpc-server 255" &
     - sleep 2
-    - CRICKET_RPCID=255 REMOTE_GPU_ADDRESS="ghost.acs-lab.eonerc.rwth-aachen.de" PATH=$LDIR:$PATH LD_PRELOAD=$LDIR/libtirpc.so.3:$LDIR/cricket-client.so $LDIR/$TEST_BINARY $PARAMETER
+    - cd $LDIR/$CHDIR
+    - CRICKET_RPCID=255 REMOTE_GPU_ADDRESS="ghost.acs-lab.eonerc.rwth-aachen.de" PATH=$LDIR:$PATH LD_PRELOAD=$LDIR/libtirpc.so.3:$LDIR/cricket-client.so ./$TEST_BINARY $PARAMETER
   after_script:
     - ssh $GPU_TARGET rm -rf $RDIR
     - ssh $GPU_TARGET pkill -fe -2 $RDIR/test_kernel
@@ -252,21 +219,27 @@ test:test_programs(2/2):
 test:test_kernel:
     extends: .remote-gpu
     variables:
-      TEST_BINARY: 'tests/kernel.testapp'
+      TEST_BINARY: 'kernel.testapp'
 
 test:samples:matrixMul:
     extends: .remote-gpu
     variables:
-      TEST_BINARY: 'tests/matrixMul.compressed.sample'
+      TEST_BINARY: 'matrixMul.compressed.sample'
 
 test:samples:bandwidthTest:
     extends: .remote-gpu
     variables:
-      TEST_BINARY: 'tests/bandwidthTest.sample'
+      TEST_BINARY: 'bandwidthTest.sample'
 
 test:samples:nbody:
     extends: .remote-gpu
     variables:
-      TEST_BINARY: 'tests/nbody.uncompressed.sample'
+      TEST_BINARY: 'nbody.uncompressed.sample'
       PARAMETER: '-benchmark'
 
+test:samples:mnistCUDNN:
+    extends: .remote-gpu
+    variables:
+      CHDIR: '../tests/samples/samples-bin'
+      TEST_BINARY: 'mnistCUDNN.sample'
+
diff --git a/tests/samples/Makefile b/tests/samples/Makefile
index d52596db..c7c83f63 100644
--- a/tests/samples/Makefile
+++ b/tests/samples/Makefile
@@ -5,7 +5,8 @@ SAMPLES = samples-bin/matrixMul.compressed.sample \
 		  samples-bin/matrixMul.uncompressed.sample \
 		  samples-bin/nbody.uncompressed.sample \
 		  samples-bin/nbody.compressed.sample \
-		  samples-bin/bandwidthTest.sample
+		  samples-bin/bandwidthTest.sample \
+		  samples-bin/mnistCUDNN.sample
 
 CUDA_PATH = /usr/local/cuda
 SMS = 75 60
@@ -30,7 +31,10 @@ cudnn-samples:
 samples-bin:
 	mkdir -p $@
 
-samples-bin/mnistCUDNN.sample : cudnn-samples samples-bin
+samples-bin/data: | cudnn-samples samples-bin  # order-only: copy source and output dir must exist first
+	cp -R cudnn-samples/mnistCUDNN/data $@
+
+samples-bin/mnistCUDNN.sample : cudnn-samples samples-bin samples-bin/data
 	make -C cudnn-samples/mnistCUDNN \
 		clean
 	make -C cudnn-samples/mnistCUDNN \
diff --git a/utils/Dockerfile.cuda10 b/utils/Dockerfile.cuda10
deleted file mode 100644
index 391c130e..00000000
--- a/utils/Dockerfile.cuda10
+++ /dev/null
@@ -1,45 +0,0 @@
-FROM centos:8
-
-LABEL \
-	org.label-schema.schema-version = "1.0" \
-	org.label-schema.name = "cricket" \
-	org.label-schema.license = "MIT" \
-	org.label-schema.vendor = "Institute for Automation of Complex Power Systems, RWTH Aachen University" \
-	org.label-schema.author.name = "Niklas Eiling" \
-	org.label-schema.author.email = "niklas.eiling@eonerc.rwth-aachen.de" \
-	org.label-schema.vcs-url = "https://git.rwth-aachen.de/niklas.eiling/cricket"
-    
-RUN cd /etc/yum.repos.d/ && sed -i 's/mirrorlist/#mirrorlist/g' /etc/yum.repos.d/CentOS-* && sed -i 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-* && yum update -y
-
-RUN dnf -y update && \
-    dnf install -y epel-release dnf-plugins-core && \
-    dnf install -y https://rpms.remirepo.net/enterprise/remi-release-8.rpm && \
-    dnf config-manager --set-enabled powertools && \
-    dnf config-manager --set-enabled remi
-
-RUN dnf install -y make bash git gcc autoconf libtool automake rpcgen \
-                   ncurses-devel zlib-devel binutils-devel mesa-libGL-devel \
-                   libvdpau-devel mesa-libEGL-devel openssl-devel rpcbind \
-                   texinfo bison flex python3 which libibverbs libasan \
-                   cppcheck wget expat-devel xz-devel elfutils-libelf-devel \
-                   freeimage freeimage-devel
-
-ENV LD_LIBRARY_PATH="/usr/local/lib:/usr/local/lib64:${LD_LIBRARY_PATH}"
-
-RUN dnf --refresh -y install https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-repo-rhel8-10.2.89-1.x86_64.rpm && \
-    rpm --import https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/D42D0685.pub && \
-    dnf --refresh -y install cuda-compiler-10-2 cuda-libraries-dev-10-2 cuda-samples-10-2 cuda-driver-dev-10-2 cuda-misc-headers-10-2 cuda-nvml-dev-10-2 nvidia-driver-NVML-530.30.02 libcudnn8-devel && \
-    ln -s cuda-10.2 /usr/local/cuda && \
-    ln -s libcuda.so /usr/local/cuda/targets/x86_64-linux/lib/stubs/libcuda.so.1
-    
-
-ENV PATH="/usr/local/cuda/bin:${PATH}"
-ENV LIBRARY_PATH="/usr/local/cuda/targets/x86_64-linux/lib/stubs:$(LIBRARY_PATH}"
-ENV LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/targets/x86_64-linux/lib/stubs:${LD_LIBRARY_PATH}"
-
-#COPY --chown=root .ssh /root/.ssh
-
-WORKDIR /cricket
-
-ENTRYPOINT /bin/bash
-