Skip to content

Commit

Permalink
chore: cudnn wip
Browse files Browse the repository at this point in the history
  • Loading branch information
brodeynewman committed Nov 19, 2024
1 parent 8e3d836 commit bb19397
Show file tree
Hide file tree
Showing 7 changed files with 132 additions and 1 deletion.
5 changes: 5 additions & 0 deletions codegen/annotations.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#include <nvml.h>
#include <cuda.h>
#include <cudnn.h>
#include <cublas_v2.h>
#include <cuda_runtime.h>

Expand Down Expand Up @@ -5545,6 +5546,10 @@ cudaError_t cudaGetExportTable(const void **ppExportTable, const cudaUUID_t *pEx
* @param symbolPtr SEND_RECV
*/
cudaError_t cudaGetFuncBySymbol(cudaFunction_t *functionPtr, const void *symbolPtr);
/**
* @param handle RECV_ONLY
*/
cudnnStatus_t cudnnCreate(cudnnHandle_t *handle);
/**
* @param handle RECV_ONLY
*/
Expand Down
6 changes: 6 additions & 0 deletions codegen/codegen.py
Original file line number Diff line number Diff line change
Expand Up @@ -605,6 +605,8 @@ def error_const(return_type: str) -> str:
return "cudaErrorDevicesUnavailable"
if return_type == "cublasStatus_t":
return "CUBLAS_STATUS_NOT_INITIALIZED"
if return_type == "cudnnStatus_t":
return "CUDNN_STATUS_NOT_INITIALIZED"
raise NotImplementedError("Unknown return type: %s" % return_type)


Expand All @@ -618,6 +620,7 @@ def main():
options = ParserOptions(preprocessor=make_gcc_preprocessor(defines=["CUBLASAPI="]))

nvml_ast: ParsedData = parse_file("/usr/include/nvml.h", options=options)
cudnn_graph_ast: ParsedData = parse_file("/usr/include/cudnn_graph.h", options=options)
cuda_ast: ParsedData = parse_file("/usr/include/cuda.h", options=options)
cublas_ast: ParsedData = parse_file("/usr/include/cublas_api.h", options=options)
cudart_ast: ParsedData = parse_file(
Expand All @@ -632,6 +635,7 @@ def main():
+ cuda_ast.namespace.functions
+ cudart_ast.namespace.functions
+ cublas_ast.namespace.functions
+ cudnn_graph_ast.namespace.functions
)

functions_with_annotations: list[tuple[Function, Function, list[Operation]]] = []
Expand Down Expand Up @@ -675,6 +679,7 @@ def main():
f.write(
"#include <nvml.h>\n"
"#include <cuda.h>\n"
"#include <cudnn.h>\n"
"#include <cublas_v2.h>\n"
"#include <cuda_runtime_api.h>\n\n"
"#include <cstring>\n"
Expand Down Expand Up @@ -816,6 +821,7 @@ def main():
f.write(
"#include <nvml.h>\n"
"#include <cuda.h>\n"
"#include <cudnn.h>\n"
"#include <cublas_v2.h>\n"
"#include <cuda_runtime_api.h>\n\n"
"#include <cstring>\n"
Expand Down
1 change: 1 addition & 0 deletions codegen/gen_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -888,3 +888,4 @@
#define RPC_cublasCreate_v2 887
#define RPC_cublasDestroy_v2 888
#define RPC_cublasSgemm_v2 889
#define RPC_cudnnCreate 890
16 changes: 16 additions & 0 deletions codegen/gen_client.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
#include <nvml.h>
#include <cuda.h>
#include <cudnn.h>
#include <iostream>
#include <cublas_v2.h>
#include <cuda_runtime_api.h>

Expand Down Expand Up @@ -9113,6 +9115,8 @@ cudaError_t cudaOccupancyMaxActiveClusters(int* numClusters, const void* func, c

cudaError_t cudaMallocManaged(void** devPtr, size_t size, unsigned int flags)
{
std::cout << "calling cudaMallocManaged" << std::endl;

cudaError_t return_value;
if (rpc_start_request(0, RPC_cudaMallocManaged) < 0 ||
rpc_write(0, devPtr, sizeof(void*)) < 0 ||
Expand Down Expand Up @@ -11160,6 +11164,17 @@ cublasStatus_t cublasSgemm_v2(cublasHandle_t handle, cublasOperation_t transa, c
return return_value;
}

// Client-side RPC stub for cudnnCreate.
//
// The handle parameter is RECV_ONLY: no request payload is written, the
// server creates the handle remotely and streams it back along with the
// cuDNN status code. Any transport failure is reported as
// CUDNN_STATUS_NOT_INITIALIZED, mirroring the other generated stubs.
cudnnStatus_t cudnnCreate(cudnnHandle_t* handle)
{
    cudnnStatus_t status;

    // Issue the request, then collect the remote handle and status.
    if (rpc_start_request(0, RPC_cudnnCreate) < 0)
        return CUDNN_STATUS_NOT_INITIALIZED;
    if (rpc_wait_for_response(0) < 0)
        return CUDNN_STATUS_NOT_INITIALIZED;
    if (rpc_read(0, handle, sizeof(cudnnHandle_t)) < 0)
        return CUDNN_STATUS_NOT_INITIALIZED;
    if (rpc_end_response(0, &status) < 0)
        return CUDNN_STATUS_NOT_INITIALIZED;

    return status;
}

std::unordered_map<std::string, void *> functionMap = {
{"__cudaRegisterVar", (void *)__cudaRegisterVar},
{"__cudaRegisterFunction", (void *)__cudaRegisterFunction},
Expand Down Expand Up @@ -12018,6 +12033,7 @@ std::unordered_map<std::string, void *> functionMap = {
{"cublasCreate_v2", (void *)cublasCreate_v2},
{"cublasDestroy_v2", (void *)cublasDestroy_v2},
{"cublasSgemm_v2", (void *)cublasSgemm_v2},
{"cudnnCreate", (void *)cudnnCreate},
{"cuMemcpy_ptds", (void *)cuMemcpy},
{"cuMemcpyAsync_ptsz", (void *)cuMemcpyAsync},
{"cuMemcpyPeer_ptds", (void *)cuMemcpyPeer},
Expand Down
32 changes: 32 additions & 0 deletions codegen/gen_server.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
#include <nvml.h>
#include <iostream>
#include <cuda.h>
#include <cudnn.h>
#include <cublas_v2.h>
#include <cuda_runtime_api.h>

Expand Down Expand Up @@ -19392,6 +19394,9 @@ int handle_cudaMallocManaged(void *conn)
unsigned int flags;
int request_id;
cudaError_t result;

std::cout << "Calling handle_cudaMallocManaged" << std::endl;

if (
rpc_read(conn, &devPtr, sizeof(void*)) < 0 ||
rpc_read(conn, &size, sizeof(size_t)) < 0 ||
Expand All @@ -19404,6 +19409,8 @@ int handle_cudaMallocManaged(void *conn)
goto ERROR_0;
result = cudaMallocManaged(&devPtr, size, flags);

std::cout << "Calling result:: " << result << std::endl;

if (rpc_start_response(conn, request_id) < 0 ||
rpc_write(conn, &devPtr, sizeof(void*)) < 0 ||
rpc_end_response(conn, &result) < 0)
Expand Down Expand Up @@ -23682,6 +23689,30 @@ int handle_cublasSgemm_v2(void *conn)
return -1;
}

// Server-side handler for cudnnCreate.
//
// The request carries no payload (the handle is created here), so the
// handler ends the request immediately, creates a local cuDNN handle, and
// ships the handle plus the cuDNN status back to the client.
// Returns 0 on success, -1 on any transport failure.
int handle_cudnnCreate(void *conn)
{
    cudnnHandle_t handle;
    cudnnStatus_t result;

    // No request parameters to read; finalize the request to get its id.
    int request_id = rpc_end_request(conn);
    if (request_id < 0)
        return -1;

    result = cudnnCreate(&handle);

    // Reply with the freshly created handle and the API status.
    if (rpc_start_response(conn, request_id) < 0)
        return -1;
    if (rpc_write(conn, &handle, sizeof(cudnnHandle_t)) < 0)
        return -1;
    if (rpc_end_response(conn, &result) < 0)
        return -1;

    return 0;
}

static RequestHandler opHandlers[] = {
handle___cudaRegisterVar,
handle___cudaRegisterFunction,
Expand Down Expand Up @@ -24573,6 +24604,7 @@ static RequestHandler opHandlers[] = {
handle_cublasCreate_v2,
handle_cublasDestroy_v2,
handle_cublasSgemm_v2,
handle_cudnnCreate,
};

RequestHandler get_handler(const int op)
Expand Down
3 changes: 2 additions & 1 deletion local.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ build() {
echo "building vector file for test..."

nvcc --cudart=shared -lnvidia-ml -lcuda ./test/vector_add.cu -o vector.o
nvcc --cudart=shared -lnvidia-ml -lcuda -lcudnn ./test/cudnn.cu -o cudnn.o

if [ ! -f "$libscuda_path" ]; then
echo "libscuda.so not found. build may have failed."
Expand All @@ -36,7 +37,7 @@ server() {
echo "building server..."

if [[ "$(uname)" == "Linux" ]]; then
nvcc --compiler-options -g,-Wno-deprecated-declarations -o $server_out_path $server_path -lnvidia-ml -lcuda -lcublas --cudart=shared
nvcc --compiler-options -g,-Wno-deprecated-declarations -o $server_out_path $server_path -lnvidia-ml -lcuda -lcublas -lcudnn --cudart=shared
else
echo "No compiler options set for os "$(uname)""
fi
Expand Down
70 changes: 70 additions & 0 deletions test/cudnn.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
#include <iostream>
#include <cuda_runtime.h>
#include <cudnn.h>

/**
* Minimal example to apply sigmoid activation on a tensor
* using cuDNN.
**/
// Abort main with a diagnostic if a CUDA runtime call fails.
// (return-from-macro is acceptable here because it is only used in main.)
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            std::cerr << "CUDA error " << __FILE__ << ":" << __LINE__       \
                      << ": " << cudaGetErrorString(err_) << std::endl;     \
            return 1;                                                       \
        }                                                                   \
    } while (0)

// Abort main with a diagnostic if a cuDNN call fails.
#define CUDNN_CHECK(call)                                                   \
    do {                                                                    \
        cudnnStatus_t st_ = (call);                                         \
        if (st_ != CUDNN_STATUS_SUCCESS) {                                  \
            std::cerr << "cuDNN error " << __FILE__ << ":" << __LINE__      \
                      << ": " << cudnnGetErrorString(st_) << std::endl;     \
            return 1;                                                       \
        }                                                                   \
    } while (0)

int main(int argc, char** argv)
{
    int numGPUs;
    CUDA_CHECK(cudaGetDeviceCount(&numGPUs));
    std::cout << "Found " << numGPUs << " GPUs." << std::endl;
    CUDA_CHECK(cudaSetDevice(0)); // use GPU0
    int device;
    struct cudaDeviceProp devProp;
    CUDA_CHECK(cudaGetDevice(&device));
    CUDA_CHECK(cudaGetDeviceProperties(&devProp, device));
    std::cout << "Compute capability:" << devProp.major << "." << devProp.minor << std::endl;

    cudnnHandle_t handle_;
    CUDNN_CHECK(cudnnCreate(&handle_));
    std::cout << "Created cuDNN handle" << std::endl;

    // Describe a 1x1x1x10 float tensor in NCHW layout.
    cudnnDataType_t dtype = CUDNN_DATA_FLOAT;
    cudnnTensorFormat_t format = CUDNN_TENSOR_NCHW;

    int n = 1, c = 1, h = 1, w = 10;
    int NUM_ELEMENTS = n*c*h*w;
    cudnnTensorDescriptor_t x_desc;

    CUDNN_CHECK(cudnnCreateTensorDescriptor(&x_desc));
    CUDNN_CHECK(cudnnSetTensor4dDescriptor(x_desc, format, dtype, n, c, h, w));

    // Managed memory lets both host and device touch x directly.
    float *x;
    CUDA_CHECK(cudaMallocManaged(&x, NUM_ELEMENTS * sizeof(float)));
    for(int i=0;i<NUM_ELEMENTS;i++) x[i] = i * 1.00f;
    std::cout << "Original array: ";
    for(int i=0;i<NUM_ELEMENTS;i++) std::cout << x[i] << " ";

    // In-place sigmoid: x = alpha * sigmoid(x) + beta * x.
    float alpha[1] = {1};
    float beta[1] = {0.0};
    cudnnActivationDescriptor_t sigmoid_activation;
    cudnnActivationMode_t mode = CUDNN_ACTIVATION_SIGMOID;
    cudnnNanPropagation_t prop = CUDNN_NOT_PROPAGATE_NAN;
    CUDNN_CHECK(cudnnCreateActivationDescriptor(&sigmoid_activation));
    CUDNN_CHECK(cudnnSetActivationDescriptor(sigmoid_activation, mode, prop, 0.0f));

    CUDNN_CHECK(cudnnActivationForward(
        handle_,
        sigmoid_activation,
        alpha,
        x_desc,
        x,
        beta,
        x_desc,
        x
    ));

    // cudnnActivationForward is asynchronous with respect to the host:
    // wait for the GPU before reading the managed buffer back, otherwise
    // the "New array" printout can show stale values.
    CUDA_CHECK(cudaDeviceSynchronize());

    // Release the descriptors (previously leaked) before the handle
    // they were created under, then the handle itself.
    CUDN N_CHECK(cudnnDestroyActivationDescriptor(sigmoid_activation));
    CUDNN_CHECK(cudnnDestroyTensorDescriptor(x_desc));
    CUDNN_CHECK(cudnnDestroy(handle_));
    std::cout << std::endl << "Destroyed cuDNN handle." << std::endl;
    std::cout << "New array: ";
    for(int i=0;i<NUM_ELEMENTS;i++) std::cout << x[i] << " ";
    std::cout << std::endl;
    CUDA_CHECK(cudaFree(x));
    return 0;
}

0 comments on commit bb19397

Please sign in to comment.