Skip to content

Commit

Permalink
Merge branch 'main' of github.com:kevmo314/scuda
Browse files Browse the repository at this point in the history
  • Loading branch information
brodeynewman committed Nov 18, 2024
2 parents 11f8e43 + 5c31f28 commit 8e3d836
Show file tree
Hide file tree
Showing 17 changed files with 11,090 additions and 7,354 deletions.
17 changes: 15 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,27 @@
SCUDA is a GPU over IP bridge allowing GPUs on remote machines to be attached
to CPU-only machines.

## Demo
## Demos

### CUBLAS Matrix Multiplication

The demo below shows an NVIDIA GeForce RTX 4090 running on a remote machine (right pane).
The left pane is a Mac running a docker container with nvidia utils installed.

The docker container runs this [matrixMulCUBLAS](https://github.com/zchee/cuda-sample/blob/master/0_Simple/matrixMulCUBLAS/matrixMulCUBLAS.cpp) example.

You can view the docker image used [here](./deploy/Dockerfile.cublas-test).

https://github.com/user-attachments/assets/4bf130c5-5544-442f-b1a5-6216255ab499

### Simple torch example

The demo below shows an NVIDIA GeForce RTX 4090 running on a remote machine (right pane).
The left pane is a Mac running a docker container with nvidia utils installed.

The docker container runs `python3 -c "import torch; print(torch.cuda.is_available())"` to check if cuda is available.

You can view the docker image used [here](./Dockerfile.test).
You can view the docker image used [here](./deploy/Dockerfile.torch-test).

https://github.com/user-attachments/assets/035950bb-3cc1-4c73-9ad5-b00871a159ec

Expand Down
37 changes: 21 additions & 16 deletions client.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,28 +86,31 @@ int rpc_open()
memset(&hints, 0, sizeof(hints));
hints.ai_family = AF_INET;
hints.ai_socktype = SOCK_STREAM;
if (getaddrinfo(server_ip, port, &hints, &res) != 0)
if (getaddrinfo(host, port, &hints, &res) != 0)
{
#ifdef VERBOSE
std::cout << "getaddrinfo of " << host << " port " << port << " failed" << std::endl;
#endif
return -1;
continue;
}

int flag = 1;
int sockfd = socket(res->ai_family, res->ai_socktype, res->ai_protocol);
if (sockfd == -1 ||
setsockopt(sockfd, IPPROTO_TCP, TCP_NODELAY, (char *)&flag, sizeof(int)) < 0 ||
connect(sockfd, res->ai_addr, res->ai_addrlen) < 0)
if (sockfd == -1)
{
#ifdef VERBOSE
std::cout << "connect to " << host << " port " << port << " failed" << std::endl;
#endif
return -1;
printf("socket creation failed...\n");
exit(1);
}

int opts = setsockopt(sockfd, IPPROTO_TCP, TCP_NODELAY, (char *)&flag, sizeof(int));
if (connect(sockfd, res->ai_addr, res->ai_addrlen) < 0)
{
std::cerr << "Connecting to " << host << " port " << port << " failed: "
<< strerror(errno) << std::endl;
exit(1);
}

conns[nconns++] = {sockfd, 0, 0, 0, 0, PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER};
}

if (pthread_mutex_unlock(&conn_mutex) < 0)
return -1;
if (nconns == 0)
Expand Down Expand Up @@ -159,6 +162,8 @@ int rpc_end_request(const int index)
int rpc_wait_for_response(const int index)
{
int wait_for_request_id = rpc_end_request(index);
if (wait_for_request_id < 0)
return -1;

if (pthread_mutex_lock(&conns[index].read_mutex) < 0)
return -1;
Expand Down Expand Up @@ -206,13 +211,13 @@ int rpc_read(const int index, void *data, size_t size)
}
size -= bytesRead;
}
return size;
}
else if (read(conns[index].connfd, data, size) < 0)
{

ssize_t n = recv(conns[index].connfd, data, size, MSG_WAITALL);
if (n < 0)
pthread_mutex_unlock(&conns[index].read_mutex);
return -1;
}
return 0;
return n;
}

int rpc_end_response(const int index, void *result)
Expand Down
59 changes: 46 additions & 13 deletions codegen/annotations.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#include <nvml.h>
#include <cuda.h>
#include <cublas_v2.h>
#include <cuda_runtime.h>

/**
Expand Down Expand Up @@ -960,23 +961,23 @@ nvmlReturn_t nvmlEventSetWait_v2(nvmlEventSet_t set, nvmlEventData_t *data, unsi
*/
nvmlReturn_t nvmlEventSetFree(nvmlEventSet_t set);
/**
* @param pciInfo SEND_ONLY DEREFERENCE
* @param pciInfo SEND_RECV
* @param newState SEND_ONLY
*/
nvmlReturn_t nvmlDeviceModifyDrainState(nvmlPciInfo_t *pciInfo, nvmlEnableState_t newState);
/**
* @param pciInfo SEND_ONLY DEREFERENCE
* @param pciInfo SEND_RECV
* @param currentState RECV_ONLY
*/
nvmlReturn_t nvmlDeviceQueryDrainState(nvmlPciInfo_t *pciInfo, nvmlEnableState_t *currentState);
/**
* @param pciInfo SEND_ONLY DEREFERENCE
* @param pciInfo SEND_RECV
* @param gpuState SEND_ONLY
* @param linkState SEND_ONLY
*/
nvmlReturn_t nvmlDeviceRemoveGpu_v2(nvmlPciInfo_t *pciInfo, nvmlDetachGpuState_t gpuState, nvmlPcieLinkState_t linkState);
/**
* @param pciInfo SEND_ONLY DEREFERENCE
* @param pciInfo SEND_RECV
*/
nvmlReturn_t nvmlDeviceDiscoverGpus(nvmlPciInfo_t *pciInfo);
/**
Expand Down Expand Up @@ -1552,7 +1553,7 @@ nvmlReturn_t nvmlDeviceGetGpuFabricInfo(nvmlDevice_t device, nvmlGpuFabricInfo_t
*/
nvmlReturn_t nvmlGpmMetricsGet(nvmlGpmMetricsGet_t *metricsGet);
/**
* @param gpmSample RECV_ONLY
* @param gpmSample SEND_ONLY
*/
nvmlReturn_t nvmlGpmSampleFree(nvmlGpmSample_t gpmSample);
/**
Expand All @@ -1561,13 +1562,13 @@ nvmlReturn_t nvmlGpmSampleFree(nvmlGpmSample_t gpmSample);
nvmlReturn_t nvmlGpmSampleAlloc(nvmlGpmSample_t *gpmSample);
/**
* @param device SEND_ONLY
* @param gpmSample RECV_ONLY SIZE:sizeof(nvmlGpmSample_t)
* @param gpmSample SEND_ONLY
*/
nvmlReturn_t nvmlGpmSampleGet(nvmlDevice_t device, nvmlGpmSample_t gpmSample);
/**
* @param device SEND_ONLY
* @param gpuInstanceId SEND_ONLY
* @param gpmSample RECV_ONLY SIZE:sizeof(nvmlGpmSample_t)
* @param gpmSample SEND_ONLY
*/
nvmlReturn_t nvmlGpmMigSampleGet(nvmlDevice_t device, unsigned int gpuInstanceId, nvmlGpmSample_t gpmSample);
/**
Expand Down Expand Up @@ -2040,29 +2041,29 @@ CUresult cuMemFree_v2(CUdeviceptr dptr);
*/
CUresult cuMemGetAddressRange_v2(CUdeviceptr *pbase, size_t *psize, CUdeviceptr dptr);
/**
* @param pp SEND_RECV
* @param pp RECV_ONLY
* @param bytesize SEND_ONLY
*/
CUresult cuMemAllocHost_v2(void **pp, size_t bytesize);
/**
* @param p SEND_RECV
* @param p SEND_ONLY
*/
CUresult cuMemFreeHost(void *p);
/**
* @param pp SEND_RECV
* @param pp RECV_ONLY
* @param bytesize SEND_ONLY
* @param Flags SEND_ONLY
*/
CUresult cuMemHostAlloc(void **pp, size_t bytesize, unsigned int Flags);
/**
* @param pdptr SEND_RECV
* @param p SEND_RECV
* @param p SEND_ONLY
* @param Flags SEND_ONLY
*/
CUresult cuMemHostGetDevicePointer_v2(CUdeviceptr *pdptr, void *p, unsigned int Flags);
/**
* @param pFlags SEND_RECV
* @param p SEND_RECV
* @param p SEND_ONLY
*/
CUresult cuMemHostGetFlags(unsigned int *pFlags, void *p);
/**
Expand Down Expand Up @@ -3236,8 +3237,8 @@ CUresult cuGraphMemAllocNodeGetParams(CUgraphNode hNode, CUDA_MEM_ALLOC_NODE_PAR
/**
* @param phGraphNode SEND_RECV
* @param hGraph SEND_ONLY
* @param dependencies SEND_RECV
* @param numDependencies SEND_ONLY
* @param dependencies SEND_ONLY LENGTH:numDependencies
* @param dptr SEND_ONLY
*/
CUresult cuGraphAddMemFreeNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUdeviceptr dptr);
Expand Down Expand Up @@ -5544,3 +5545,35 @@ cudaError_t cudaGetExportTable(const void **ppExportTable, const cudaUUID_t *pEx
* @param symbolPtr SEND_RECV
*/
cudaError_t cudaGetFuncBySymbol(cudaFunction_t *functionPtr, const void *symbolPtr);
/**
* @param handle RECV_ONLY
*/
cublasStatus_t cublasCreate_v2(cublasHandle_t *handle);
/**
* @param handle SEND_ONLY
*/
cublasStatus_t cublasDestroy_v2(cublasHandle_t handle);
/**
* @param handle SEND_ONLY
* @param transa SEND_ONLY
* @param transb SEND_ONLY
* @param m SEND_ONLY
* @param n SEND_ONLY
* @param k SEND_ONLY
* @param alpha SEND_ONLY NULLABLE
* @param A SEND_ONLY
* @param lda SEND_ONLY
* @param B SEND_ONLY
* @param ldb SEND_ONLY
* @param beta SEND_ONLY NULLABLE
* @param C SEND_ONLY
* @param ldc SEND_ONLY
*/
cublasStatus_t cublasSgemm_v2(cublasHandle_t handle,
cublasOperation_t transa, cublasOperation_t transb,
int m, int n, int k,
const float *alpha,
const float *A, int lda,
const float *B, int ldb,
const float *beta,
float *C, int ldc);
Loading

0 comments on commit 8e3d836

Please sign in to comment.