Skip to content

Commit

Permalink
Merge branch 'main' of github.com:kevmo314/scuda
Browse files Browse the repository at this point in the history
  • Loading branch information
brodeynewman committed Nov 18, 2024
2 parents 11f8e43 + 5c31f28 commit 8e3d836
Show file tree
Hide file tree
Showing 17 changed files with 11,090 additions and 7,354 deletions.
17 changes: 15 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,27 @@
SCUDA is a GPU over IP bridge allowing GPUs on remote machines to be attached
to CPU-only machines.

## Demo
## Demos

### CUBLAS Matrix Multiplication

The demo below shows an NVIDIA GeForce RTX 4090 running on a remote machine (right pane).
The left pane is a Mac running a docker container with nvidia utils installed.

The docker container runs this [matrixMulCUBLAS](https://github.com/zchee/cuda-sample/blob/master/0_Simple/matrixMulCUBLAS/matrixMulCUBLAS.cpp) example.

You can view the docker image used [here](./deploy/Dockerfile.cublas-test).

https://github.com/user-attachments/assets/4bf130c5-5544-442f-b1a5-6216255ab499

### Simple torch example

The demo below shows an NVIDIA GeForce RTX 4090 running on a remote machine (right pane).
The left pane is a Mac running a docker container with nvidia utils installed.

The docker container runs `python3 -c "import torch; print(torch.cuda.is_available())"` to check if cuda is available.

You can view the docker image used [here](./Dockerfile.test).
You can view the docker image used [here](./deploy/Dockerfile.torch-test).

https://github.com/user-attachments/assets/035950bb-3cc1-4c73-9ad5-b00871a159ec

Expand Down
37 changes: 21 additions & 16 deletions client.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,28 +86,31 @@ int rpc_open()
memset(&hints, 0, sizeof(hints));
hints.ai_family = AF_INET;
hints.ai_socktype = SOCK_STREAM;
if (getaddrinfo(server_ip, port, &hints, &res) != 0)
if (getaddrinfo(host, port, &hints, &res) != 0)
{
#ifdef VERBOSE
std::cout << "getaddrinfo of " << host << " port " << port << " failed" << std::endl;
#endif
return -1;
continue;
}

int flag = 1;
int sockfd = socket(res->ai_family, res->ai_socktype, res->ai_protocol);
if (sockfd == -1 ||
setsockopt(sockfd, IPPROTO_TCP, TCP_NODELAY, (char *)&flag, sizeof(int)) < 0 ||
connect(sockfd, res->ai_addr, res->ai_addrlen) < 0)
if (sockfd == -1)
{
#ifdef VERBOSE
std::cout << "connect to " << host << " port " << port << " failed" << std::endl;
#endif
return -1;
printf("socket creation failed...\n");
exit(1);
}

int opts = setsockopt(sockfd, IPPROTO_TCP, TCP_NODELAY, (char *)&flag, sizeof(int));
if (connect(sockfd, res->ai_addr, res->ai_addrlen) < 0)
{
std::cerr << "Connecting to " << host << " port " << port << " failed: "
<< strerror(errno) << std::endl;
exit(1);
}

conns[nconns++] = {sockfd, 0, 0, 0, 0, PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER};
}

if (pthread_mutex_unlock(&conn_mutex) < 0)
return -1;
if (nconns == 0)
Expand Down Expand Up @@ -159,6 +162,8 @@ int rpc_end_request(const int index)
int rpc_wait_for_response(const int index)
{
int wait_for_request_id = rpc_end_request(index);
if (wait_for_request_id < 0)
return -1;

if (pthread_mutex_lock(&conns[index].read_mutex) < 0)
return -1;
Expand Down Expand Up @@ -206,13 +211,13 @@ int rpc_read(const int index, void *data, size_t size)
}
size -= bytesRead;
}
return size;
}
else if (read(conns[index].connfd, data, size) < 0)
{

ssize_t n = recv(conns[index].connfd, data, size, MSG_WAITALL);
if (n < 0)
pthread_mutex_unlock(&conns[index].read_mutex);
return -1;
}
return 0;
return n;
}

int rpc_end_response(const int index, void *result)
Expand Down
59 changes: 46 additions & 13 deletions codegen/annotations.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#include <nvml.h>
#include <cuda.h>
#include <cublas_v2.h>
#include <cuda_runtime.h>

/**
Expand Down Expand Up @@ -960,23 +961,23 @@ nvmlReturn_t nvmlEventSetWait_v2(nvmlEventSet_t set, nvmlEventData_t *data, unsi
*/
nvmlReturn_t nvmlEventSetFree(nvmlEventSet_t set);
/**
* @param pciInfo SEND_ONLY DEREFERENCE
* @param pciInfo SEND_RECV
* @param newState SEND_ONLY
*/
nvmlReturn_t nvmlDeviceModifyDrainState(nvmlPciInfo_t *pciInfo, nvmlEnableState_t newState);
/**
* @param pciInfo SEND_ONLY DEREFERENCE
* @param pciInfo SEND_RECV
* @param currentState RECV_ONLY
*/
nvmlReturn_t nvmlDeviceQueryDrainState(nvmlPciInfo_t *pciInfo, nvmlEnableState_t *currentState);
/**
* @param pciInfo SEND_ONLY DEREFERENCE
* @param pciInfo SEND_RECV
* @param gpuState SEND_ONLY
* @param linkState SEND_ONLY
*/
nvmlReturn_t nvmlDeviceRemoveGpu_v2(nvmlPciInfo_t *pciInfo, nvmlDetachGpuState_t gpuState, nvmlPcieLinkState_t linkState);
/**
* @param pciInfo SEND_ONLY DEREFERENCE
* @param pciInfo SEND_RECV
*/
nvmlReturn_t nvmlDeviceDiscoverGpus(nvmlPciInfo_t *pciInfo);
/**
Expand Down Expand Up @@ -1552,7 +1553,7 @@ nvmlReturn_t nvmlDeviceGetGpuFabricInfo(nvmlDevice_t device, nvmlGpuFabricInfo_t
*/
nvmlReturn_t nvmlGpmMetricsGet(nvmlGpmMetricsGet_t *metricsGet);
/**
* @param gpmSample RECV_ONLY
* @param gpmSample SEND_ONLY
*/
nvmlReturn_t nvmlGpmSampleFree(nvmlGpmSample_t gpmSample);
/**
Expand All @@ -1561,13 +1562,13 @@ nvmlReturn_t nvmlGpmSampleFree(nvmlGpmSample_t gpmSample);
nvmlReturn_t nvmlGpmSampleAlloc(nvmlGpmSample_t *gpmSample);
/**
* @param device SEND_ONLY
* @param gpmSample RECV_ONLY SIZE:sizeof(nvmlGpmSample_t)
* @param gpmSample SEND_ONLY
*/
nvmlReturn_t nvmlGpmSampleGet(nvmlDevice_t device, nvmlGpmSample_t gpmSample);
/**
* @param device SEND_ONLY
* @param gpuInstanceId SEND_ONLY
* @param gpmSample RECV_ONLY SIZE:sizeof(nvmlGpmSample_t)
* @param gpmSample SEND_ONLY
*/
nvmlReturn_t nvmlGpmMigSampleGet(nvmlDevice_t device, unsigned int gpuInstanceId, nvmlGpmSample_t gpmSample);
/**
Expand Down Expand Up @@ -2040,29 +2041,29 @@ CUresult cuMemFree_v2(CUdeviceptr dptr);
*/
CUresult cuMemGetAddressRange_v2(CUdeviceptr *pbase, size_t *psize, CUdeviceptr dptr);
/**
* @param pp SEND_RECV
* @param pp RECV_ONLY
* @param bytesize SEND_ONLY
*/
CUresult cuMemAllocHost_v2(void **pp, size_t bytesize);
/**
* @param p SEND_RECV
* @param p SEND_ONLY
*/
CUresult cuMemFreeHost(void *p);
/**
* @param pp SEND_RECV
* @param pp RECV_ONLY
* @param bytesize SEND_ONLY
* @param Flags SEND_ONLY
*/
CUresult cuMemHostAlloc(void **pp, size_t bytesize, unsigned int Flags);
/**
* @param pdptr SEND_RECV
* @param p SEND_RECV
* @param p SEND_ONLY
* @param Flags SEND_ONLY
*/
CUresult cuMemHostGetDevicePointer_v2(CUdeviceptr *pdptr, void *p, unsigned int Flags);
/**
* @param pFlags SEND_RECV
* @param p SEND_RECV
* @param p SEND_ONLY
*/
CUresult cuMemHostGetFlags(unsigned int *pFlags, void *p);
/**
Expand Down Expand Up @@ -3236,8 +3237,8 @@ CUresult cuGraphMemAllocNodeGetParams(CUgraphNode hNode, CUDA_MEM_ALLOC_NODE_PAR
/**
* @param phGraphNode SEND_RECV
* @param hGraph SEND_ONLY
* @param dependencies SEND_RECV
* @param numDependencies SEND_ONLY
* @param dependencies SEND_ONLY LENGTH:numDependencies
* @param dptr SEND_ONLY
*/
CUresult cuGraphAddMemFreeNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUdeviceptr dptr);
Expand Down Expand Up @@ -5544,3 +5545,35 @@ cudaError_t cudaGetExportTable(const void **ppExportTable, const cudaUUID_t *pEx
* @param symbolPtr SEND_RECV
*/
cudaError_t cudaGetFuncBySymbol(cudaFunction_t *functionPtr, const void *symbolPtr);
/**
* @param handle RECV_ONLY
*/
cublasStatus_t cublasCreate_v2(cublasHandle_t *handle);
/**
* @param handle SEND_ONLY
*/
cublasStatus_t cublasDestroy_v2(cublasHandle_t handle);
/**
* @param handle SEND_ONLY
* @param transa SEND_ONLY
* @param transb SEND_ONLY
* @param m SEND_ONLY
* @param n SEND_ONLY
* @param k SEND_ONLY
* @param alpha SEND_ONLY NULLABLE
* @param A SEND_ONLY
* @param lda SEND_ONLY
* @param B SEND_ONLY
* @param ldb SEND_ONLY
* @param beta SEND_ONLY NULLABLE
* @param C SEND_ONLY
* @param ldc SEND_ONLY
*/
cublasStatus_t cublasSgemm_v2(cublasHandle_t handle,
cublasOperation_t transa, cublasOperation_t transb,
int m, int n, int k,
const float *alpha,
const float *A, int lda,
const float *B, int ldb,
const float *beta,
float *C, int ldc);
Loading

0 comments on commit 8e3d836

Please sign in to comment.