Skip to content

Commit

Permalink
chore: cudnn wip
Browse files Browse the repository at this point in the history
  • Loading branch information
brodeynewman committed Nov 19, 2024
1 parent 8e3d836 commit bb19397
Show file tree
Hide file tree
Showing 7 changed files with 132 additions and 1 deletion.
5 changes: 5 additions & 0 deletions codegen/annotations.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#include <nvml.h>
#include <cuda.h>
#include <cudnn.h>
#include <cublas_v2.h>
#include <cuda_runtime.h>

Expand Down Expand Up @@ -5545,6 +5546,10 @@ cudaError_t cudaGetExportTable(const void **ppExportTable, const cudaUUID_t *pEx
* @param symbolPtr SEND_RECV
*/
cudaError_t cudaGetFuncBySymbol(cudaFunction_t *functionPtr, const void *symbolPtr);
/**
* @param handle RECV_ONLY
*/
cudnnStatus_t cudnnCreate(cudnnHandle_t *handle);
/**
* @param handle RECV_ONLY
*/
Expand Down
6 changes: 6 additions & 0 deletions codegen/codegen.py
Original file line number Diff line number Diff line change
Expand Up @@ -605,6 +605,8 @@ def error_const(return_type: str) -> str:
return "cudaErrorDevicesUnavailable"
if return_type == "cublasStatus_t":
return "CUBLAS_STATUS_NOT_INITIALIZED"
if return_type == "cudnnStatus_t":
return "CUDNN_STATUS_NOT_INITIALIZED"
raise NotImplementedError("Unknown return type: %s" % return_type)


Expand All @@ -618,6 +620,7 @@ def main():
options = ParserOptions(preprocessor=make_gcc_preprocessor(defines=["CUBLASAPI="]))

nvml_ast: ParsedData = parse_file("/usr/include/nvml.h", options=options)
cudnn_graph_ast: ParsedData = parse_file("/usr/include/cudnn_graph.h", options=options)
cuda_ast: ParsedData = parse_file("/usr/include/cuda.h", options=options)
cublas_ast: ParsedData = parse_file("/usr/include/cublas_api.h", options=options)
cudart_ast: ParsedData = parse_file(
Expand All @@ -632,6 +635,7 @@ def main():
+ cuda_ast.namespace.functions
+ cudart_ast.namespace.functions
+ cublas_ast.namespace.functions
+ cudnn_graph_ast.namespace.functions
)

functions_with_annotations: list[tuple[Function, Function, list[Operation]]] = []
Expand Down Expand Up @@ -675,6 +679,7 @@ def main():
f.write(
"#include <nvml.h>\n"
"#include <cuda.h>\n"
"#include <cudnn.h>\n"
"#include <cublas_v2.h>\n"
"#include <cuda_runtime_api.h>\n\n"
"#include <cstring>\n"
Expand Down Expand Up @@ -816,6 +821,7 @@ def main():
f.write(
"#include <nvml.h>\n"
"#include <cuda.h>\n"
"#include <cudnn.h>\n"
"#include <cublas_v2.h>\n"
"#include <cuda_runtime_api.h>\n\n"
"#include <cstring>\n"
Expand Down
1 change: 1 addition & 0 deletions codegen/gen_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -888,3 +888,4 @@
#define RPC_cublasCreate_v2 887
#define RPC_cublasDestroy_v2 888
#define RPC_cublasSgemm_v2 889
#define RPC_cudnnCreate 890
16 changes: 16 additions & 0 deletions codegen/gen_client.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
#include <nvml.h>
#include <cuda.h>
#include <cudnn.h>
#include <iostream>
#include <cublas_v2.h>
#include <cuda_runtime_api.h>

Expand Down Expand Up @@ -9113,6 +9115,8 @@ cudaError_t cudaOccupancyMaxActiveClusters(int* numClusters, const void* func, c

cudaError_t cudaMallocManaged(void** devPtr, size_t size, unsigned int flags)
{
std::cout << "calling cudaMallocManaged" << std::endl;

cudaError_t return_value;
if (rpc_start_request(0, RPC_cudaMallocManaged) < 0 ||
rpc_write(0, devPtr, sizeof(void*)) < 0 ||
Expand Down Expand Up @@ -11160,6 +11164,17 @@ cublasStatus_t cublasSgemm_v2(cublasHandle_t handle, cublasOperation_t transa, c
return return_value;
}

// Client-side RPC stub for cudnnCreate.
//
// The handle parameter is RECV_ONLY: no request payload is written, the
// server creates the handle remotely and streams it back along with the
// cuDNN status code. Any transport failure is reported as
// CUDNN_STATUS_NOT_INITIALIZED, mirroring the other generated stubs.
cudnnStatus_t cudnnCreate(cudnnHandle_t* handle)
{
    cudnnStatus_t status;

    // Issue the request, then collect the remote handle and status.
    if (rpc_start_request(0, RPC_cudnnCreate) < 0)
        return CUDNN_STATUS_NOT_INITIALIZED;
    if (rpc_wait_for_response(0) < 0)
        return CUDNN_STATUS_NOT_INITIALIZED;
    if (rpc_read(0, handle, sizeof(cudnnHandle_t)) < 0)
        return CUDNN_STATUS_NOT_INITIALIZED;
    if (rpc_end_response(0, &status) < 0)
        return CUDNN_STATUS_NOT_INITIALIZED;

    return status;
}

std::unordered_map<std::string, void *> functionMap = {
{"__cudaRegisterVar", (void *)__cudaRegisterVar},
{"__cudaRegisterFunction", (void *)__cudaRegisterFunction},
Expand Down Expand Up @@ -12018,6 +12033,7 @@ std::unordered_map<std::string, void *> functionMap = {
{"cublasCreate_v2", (void *)cublasCreate_v2},
{"cublasDestroy_v2", (void *)cublasDestroy_v2},
{"cublasSgemm_v2", (void *)cublasSgemm_v2},
{"cudnnCreate", (void *)cudnnCreate},
{"cuMemcpy_ptds", (void *)cuMemcpy},
{"cuMemcpyAsync_ptsz", (void *)cuMemcpyAsync},
{"cuMemcpyPeer_ptds", (void *)cuMemcpyPeer},
Expand Down
32 changes: 32 additions & 0 deletions codegen/gen_server.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
#include <nvml.h>
#include <iostream>
#include <cuda.h>
#include <cudnn.h>
#include <cublas_v2.h>
#include <cuda_runtime_api.h>

Expand Down Expand Up @@ -19392,6 +19394,9 @@ int handle_cudaMallocManaged(void *conn)
unsigned int flags;
int request_id;
cudaError_t result;

std::cout << "Calling handle_cudaMallocManaged" << std::endl;

if (
rpc_read(conn, &devPtr, sizeof(void*)) < 0 ||
rpc_read(conn, &size, sizeof(size_t)) < 0 ||
Expand All @@ -19404,6 +19409,8 @@ int handle_cudaMallocManaged(void *conn)
goto ERROR_0;
result = cudaMallocManaged(&devPtr, size, flags);

std::cout << "Calling result:: " << result << std::endl;

if (rpc_start_response(conn, request_id) < 0 ||
rpc_write(conn, &devPtr, sizeof(void*)) < 0 ||
rpc_end_response(conn, &result) < 0)
Expand Down Expand Up @@ -23682,6 +23689,30 @@ int handle_cublasSgemm_v2(void *conn)
return -1;
}

// Server-side handler for cudnnCreate.
//
// The request carries no payload (the handle is created here), so the
// handler ends the request immediately, creates a local cuDNN handle, and
// ships the handle plus the cuDNN status back to the client.
// Returns 0 on success, -1 on any transport failure.
int handle_cudnnCreate(void *conn)
{
    cudnnHandle_t handle;
    cudnnStatus_t result;

    // No request parameters to read; finalize the request to get its id.
    int request_id = rpc_end_request(conn);
    if (request_id < 0)
        return -1;

    result = cudnnCreate(&handle);

    // Reply with the freshly created handle and the API status.
    if (rpc_start_response(conn, request_id) < 0)
        return -1;
    if (rpc_write(conn, &handle, sizeof(cudnnHandle_t)) < 0)
        return -1;
    if (rpc_end_response(conn, &result) < 0)
        return -1;

    return 0;
}

static RequestHandler opHandlers[] = {
handle___cudaRegisterVar,
handle___cudaRegisterFunction,
Expand Down Expand Up @@ -24573,6 +24604,7 @@ static RequestHandler opHandlers[] = {
handle_cublasCreate_v2,
handle_cublasDestroy_v2,
handle_cublasSgemm_v2,
handle_cudnnCreate,
};

RequestHandler get_handler(const int op)
Expand Down
3 changes: 2 additions & 1 deletion local.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ build() {
echo "building vector file for test..."

nvcc --cudart=shared -lnvidia-ml -lcuda ./test/vector_add.cu -o vector.o
nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn ./test/cudnn.cu -o cudnn.o

if [ ! -f "$libscuda_path" ]; then
echo "libscuda.so not found. build may have failed."
Expand All @@ -36,7 +37,7 @@ server() {
echo "building server..."

if [[ "$(uname)" == "Linux" ]]; then
nvcc --compiler-options -g,-Wno-deprecated-declarations -o $server_out_path $server_path -lnvidia-ml -lcuda -lcublas --cudart=shared
nvcc --compiler-options -g,-Wno-deprecated-declarations -o $server_out_path $server_path -lnvidia-ml -lcuda -lcublas -lcudnn --cudart=shared
else
echo "No compiler options set for os "$(uname)""
fi
Expand Down
70 changes: 70 additions & 0 deletions test/cudnn.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
#include <iostream>
#include <cuda_runtime.h>
#include <cudnn.h>

/**
* Minimal example to apply sigmoid activation on a tensor
* using cuDNN.
**/
// Abort main with a diagnostic if a CUDA runtime call fails.
// (return-from-macro is acceptable here because it is only used in main.)
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            std::cerr << "CUDA error " << __FILE__ << ":" << __LINE__       \
                      << ": " << cudaGetErrorString(err_) << std::endl;     \
            return 1;                                                       \
        }                                                                   \
    } while (0)

// Abort main with a diagnostic if a cuDNN call fails.
#define CUDNN_CHECK(call)                                                   \
    do {                                                                    \
        cudnnStatus_t st_ = (call);                                         \
        if (st_ != CUDNN_STATUS_SUCCESS) {                                  \
            std::cerr << "cuDNN error " << __FILE__ << ":" << __LINE__      \
                      << ": " << cudnnGetErrorString(st_) << std::endl;     \
            return 1;                                                       \
        }                                                                   \
    } while (0)

int main(int argc, char** argv)
{
    int numGPUs;
    CUDA_CHECK(cudaGetDeviceCount(&numGPUs));
    std::cout << "Found " << numGPUs << " GPUs." << std::endl;
    CUDA_CHECK(cudaSetDevice(0)); // use GPU0
    int device;
    struct cudaDeviceProp devProp;
    CUDA_CHECK(cudaGetDevice(&device));
    CUDA_CHECK(cudaGetDeviceProperties(&devProp, device));
    std::cout << "Compute capability:" << devProp.major << "." << devProp.minor << std::endl;

    cudnnHandle_t handle_;
    CUDNN_CHECK(cudnnCreate(&handle_));
    std::cout << "Created cuDNN handle" << std::endl;

    // Describe a 1x1x1x10 float tensor in NCHW layout.
    cudnnDataType_t dtype = CUDNN_DATA_FLOAT;
    cudnnTensorFormat_t format = CUDNN_TENSOR_NCHW;

    int n = 1, c = 1, h = 1, w = 10;
    int NUM_ELEMENTS = n*c*h*w;
    cudnnTensorDescriptor_t x_desc;

    CUDNN_CHECK(cudnnCreateTensorDescriptor(&x_desc));
    CUDNN_CHECK(cudnnSetTensor4dDescriptor(x_desc, format, dtype, n, c, h, w));

    // Managed memory lets both host and device touch x directly.
    float *x;
    CUDA_CHECK(cudaMallocManaged(&x, NUM_ELEMENTS * sizeof(float)));
    for(int i=0;i<NUM_ELEMENTS;i++) x[i] = i * 1.00f;
    std::cout << "Original array: ";
    for(int i=0;i<NUM_ELEMENTS;i++) std::cout << x[i] << " ";

    // In-place sigmoid: x = alpha * sigmoid(x) + beta * x.
    float alpha[1] = {1};
    float beta[1] = {0.0};
    cudnnActivationDescriptor_t sigmoid_activation;
    cudnnActivationMode_t mode = CUDNN_ACTIVATION_SIGMOID;
    cudnnNanPropagation_t prop = CUDNN_NOT_PROPAGATE_NAN;
    CUDNN_CHECK(cudnnCreateActivationDescriptor(&sigmoid_activation));
    CUDNN_CHECK(cudnnSetActivationDescriptor(sigmoid_activation, mode, prop, 0.0f));

    CUDNN_CHECK(cudnnActivationForward(
        handle_,
        sigmoid_activation,
        alpha,
        x_desc,
        x,
        beta,
        x_desc,
        x
    ));

    // cudnnActivationForward is asynchronous with respect to the host:
    // wait for the GPU before reading the managed buffer back, otherwise
    // the "New array" printout can show stale values.
    CUDA_CHECK(cudaDeviceSynchronize());

    // Release the descriptors (previously leaked) before the handle
    // they were created under, then the handle itself.
    CUDN N_CHECK(cudnnDestroyActivationDescriptor(sigmoid_activation));
    CUDNN_CHECK(cudnnDestroyTensorDescriptor(x_desc));
    CUDNN_CHECK(cudnnDestroy(handle_));
    std::cout << std::endl << "Destroyed cuDNN handle." << std::endl;
    std::cout << "New array: ";
    for(int i=0;i<NUM_ELEMENTS;i++) std::cout << x[i] << " ";
    std::cout << std::endl;
    CUDA_CHECK(cudaFree(x));
    return 0;
}

0 comments on commit bb19397

Please sign in to comment.