Merge pull request #5 from monofuel/hip-cpu-varargs

re-work launch kernel
monofuel · Sep 2, 2024 · 4a4b304 · 4a4b304
2 parents 77ab137 + bc52fbb
commit 4a4b304
Show file tree

Hide file tree

Showing 16 changed files with 102 additions and 87 deletions.
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -41,6 +41,6 @@ jobs:
         nim cpp examples/vector_sum_hip_amd.nim
         #nim cpp examples/vector_sum_hip_nvidia.nim
     
-    - name: Execute CPU Example
-      run: |
-        ./examples/vector_sum_cpu
+    # - name: Execute CPU Example
+    #   run: |
+    #     ./examples/vector_sum_cpu
diff --git a/.gitignore b/.gitignore
@@ -2,6 +2,7 @@
 *
 !*/
 !*.*
+!Dockerfile
 
 # normal ignores:
 *.exe

diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,16 @@
+FROM ubuntu:22.04
+
+RUN apt update && apt install -y git gdb gcc g++ make libtbb2-dev && apt clean
+
+RUN git clone --branch devel https://github.com/nim-lang/Nim.git --depth 1 /opt/Nim
+
+WORKDIR /opt/Nim
+RUN sh ./build_all.sh 
+
+RUN ./bin/nim c koch
+RUN ./koch boot -d:release
+RUN ./koch tools
+
+RUN mkdir -p /root/.nimble/bin
+
+ENV PATH="/opt/Nim/bin/:/root/.nimble/bin:${PATH}"
diff --git a/README.md b/README.md
@@ -20,7 +20,7 @@ proc addKernel*(a, b: cint; c: ptr[cint]) {.hippoGlobal.} =
 var c: int32
 var dev_c: ptr[int32]
 handleError(hipMalloc(cast[ptr pointer](addr dev_c), sizeof(int32).cint))
-handleError(launchKernel(addKernel,args = (2,7,dev_c)))
+hippoLaunchKernel(addKernel,args = (2,7,dev_c))
 handleError(hipMemcpy(addr c, dev_c, sizeof(int32).cint, hipMemcpyDeviceToHost))
 echo "2 + 7 = ", c
 handleError(hipFree(dev_c))

diff --git a/docker-compose.yaml b/docker-compose.yaml
@@ -0,0 +1,9 @@
+version: '3.8'
+
+services:
+  hippo-build:
+    build:
+      context: .
+      dockerfile: ./Dockerfile
+    volumes:
+      - .:/p/hippo
diff --git a/examples/vector_sum_cpu.nim b/examples/vector_sum_cpu.nim
@@ -34,11 +34,11 @@ proc main() =
   handleError(hipMemcpy(dev_b, addr b[0], sizeof(int32)*N, hipMemcpyHostToDevice))
 
   # launch kernel
-  handleError(launchKernel(
+  hippoLaunchKernel(
     addkernel,
     gridDim = newDim3(N.uint32),
     args = (dev_a, dev_b, dev_c)
-  ))
+  )
 
   # copy result back to host
   handleError(hipMemcpy(addr c[0], dev_c, sizeof(int32)*N, hipMemcpyDeviceToHost))

diff --git a/examples/vector_sum_cuda.nim b/examples/vector_sum_cuda.nim
@@ -35,11 +35,11 @@ proc main() =
   handleError(cudaMemcpy(dev_b, addr b[0], sizeof(int32)*N, cudaMemcpyHostToDevice))
 
   # launch kernel
-  handleError(launchKernel(
+  hippoLaunchKernel(
     addkernel,
     gridDim = newDim3(N.uint32),
     args = (dev_a, dev_b, dev_c)
-  ))
+  )
 
   # copy result back to host
   handleError(cudaMemcpy(addr c[0], dev_c, sizeof(int32)*N, cudaMemcpyDeviceToHost))

diff --git a/examples/vector_sum_hip_amd.nim b/examples/vector_sum_hip_amd.nim
@@ -34,11 +34,11 @@ proc main() =
   handleError(hipMemcpy(dev_b, addr b[0], sizeof(int32)*N, hipMemcpyHostToDevice))
 
   # launch kernel
-  handleError(launchKernel(
+  hippoLaunchKernel(
     addkernel,
     gridDim = newDim3(N.uint32),
     args = (dev_a, dev_b, dev_c)
-  ))
+  )
 
   # copy result back to host
   handleError(hipMemcpy(addr c[0], dev_c, sizeof(int32)*N, hipMemcpyDeviceToHost))

diff --git a/examples/vector_sum_hip_nvidia.nim b/examples/vector_sum_hip_nvidia.nim
@@ -37,11 +37,11 @@ proc main() =
   handleError(hipMemcpy(dev_b, addr b[0], sizeof(int32)*N, hipMemcpyHostToDevice))
 
   # launch kernel
-  handleError(launchKernel(
+  hippoLaunchKernel(
     addkernel,
     gridDim = newDim3(N.uint32),
     args = (dev_a, dev_b, dev_c)
-  ))
+  )
 
   # copy result back to host
   handleError(hipMemcpy(addr c[0], dev_c, sizeof(int32)*N, hipMemcpyDeviceToHost))

diff --git a/hippo.nimble b/hippo.nimble
@@ -1,4 +1,4 @@
-version     = "0.5.2"
+version     = "0.5.5"
 author      = "Andrew Brower"
 description = "HIP library for Nim"
 license     = "MIT"

diff --git a/src/cuda.nim b/src/cuda.nim
@@ -55,9 +55,6 @@ proc cudaLaunchKernel*(function_address: pointer; numBlocks: Dim3; dimBlocks: Di
 proc cudaLaunchKernel*(function_address: pointer; numBlocks: Dim3; dimBlocks: Dim3;
                      args: ptr pointer, sharedMemBytes: uint32_t, stream: cudaStream_t): cudaError_t {.
     importcpp: "cudaLaunchKernel(@)", header: "cuda_runtime.h".}
-# proc cudaLaunchKernel*(function_address: pointer; numBlocks: dim3; dimBlocks: dim3;
-#                      args: ptr pointer; sharedMemBytes: csize_t; stream: cudaStream_t): cint {.
-#     importcpp: "cudaLaunchKernel(@)", header: "cuda_runtime.h".}
 proc cudaDeviceSynchronize*(): cudaError_t {.header: "cuda_runtime.h",importcpp: "cudaDeviceSynchronize(@)".}
 proc cudaSyncthreads*() {.importcpp: "__syncthreads()", header: "cuda_runtime.h".}
 proc hippoSyncthreads*() {.importcpp: "__syncthreads()", header: "cuda_runtime.h".}

diff --git a/src/hip.nim b/src/hip.nim
@@ -52,9 +52,9 @@ proc hipFree*(`ptr`: pointer): hipError_t {.header: "hip/hip_runtime.h",importcp
 proc hipLaunchKernel*(function_address: pointer; numBlocks: Dim3; dimBlocks: Dim3;
                      args: ptr pointer): hipError_t {.
     importcpp: "hipLaunchKernel(@)", header: "hip/hip_runtime.h".}
-# proc hipLaunchKernel*(function_address: pointer; numBlocks: dim3; dimBlocks: dim3;
-#                      args: ptr pointer; sharedMemBytes: csize_t; stream: hipStream_t): cint {.
-#     importcpp: "hipLaunchKernel(@)", header: "hip/hip_runtime.h".}
+proc hipLaunchKernel*(function_address: pointer; numBlocks: Dim3; dimBlocks: Dim3;
+                     args: ptr pointer; sharedMemBytes: csize_t; stream: hipStream_t): cint {.
+    importcpp: "hipLaunchKernel(@)", header: "hip/hip_runtime.h".}
 proc hipDeviceSynchronize*(): hipError_t {.header: "hip/hip_runtime.h",importcpp: "hipDeviceSynchronize(@)".}
 proc hipSyncthreads*() {.importcpp: "__syncthreads()", header: "hip/hip_runtime.h".}
 proc hippoSyncthreads*() {.importcpp: "__syncthreads()", header: "hip/hip_runtime.h".}
@@ -68,7 +68,6 @@ proc hipLaunchKernelGGL*(
   ) {.
     importcpp: "hipLaunchKernelGGL(@)", header: "hip/hip_runtime.h", varargs.}
 
-
 type ConstCString* {.importc: "const char*".} = object
 converter toCString*(self: ConstCString): cstring {.importc: "(char*)", noconv, nodecl.}
 converter toConstCString*(self: cstring): ConstCString {.importc: "(const char*)", noconv, nodecl.}

diff --git a/src/hippo.nim b/src/hippo.nim
@@ -134,88 +134,81 @@ proc `=destroy`*(mem: var GpuMemory) =
 # -------------------
 # Kernel Execution
 
-proc launchKernel*(
-  kernel: proc,
-  gridDim: Dim3 = newDim3(1,1,1), # default to a grid of 1 block
-  blockDim: Dim3 = newDim3(1,1,1),  # default to 1 thread per block
-  sharedMemBytes: uint32 = 0,
-  stream: HippoStream = nil,
-  args: tuple
-): HippoError =
-  # launchKernel is designed to be similar to `kernel`<<<blockDim, gridDim>>>(args)
-
-  # this function is horrible but it works
-  # needs to be refactored to handle all the different runtimes and arguments better
-
-  # having some issues between hip and hip-cpu, so defining different versions of launchKernel
+when HippoRuntime == "HIP" or HippoRuntime == "HIP_CPU":
+  macro hipLaunchKernelGGLWithTuple(
+    kernel: proc,
+    gridDim: Dim3 = newDim3(1,1,1),
+    blockDim: Dim3 = newDim3(1,1,1),
+    sharedMemBytes: uint32 = 0,
+    stream: HippoStream = nil,
+    args: tuple
+    ): untyped =
+
+    var callNode = newCall(bindSym"hipLaunchKernelGGL")
+
+    # add the fixed vars
+    callNode.add kernel
+    callNode.add gridDim
+    callNode.add blockDim
+    callNode.add sharedMemBytes
+    callNode.add stream
+
+    # add every value of the tuple
+    for child in args:
+      callNode.add child
+    result = callNode
+
+template hippoLaunchKernel*(
+  kernel: proc,                     ## The GPU kernel procedure to launch
+  gridDim: Dim3 = newDim3(1,1,1),   ## default to a grid of 1 block
+  blockDim: Dim3 = newDim3(1,1,1),  ## default to 1 thread per block
+  sharedMemBytes: uint32 = 0,       ## dynamic shared memory amount to allocate
+  stream: HippoStream = nil,        ## Which device stream to run under (defaults to null)
+  args: tuple,                ## Arguments to pass to the GPU kernel
+) =
+  var result: HippoError
+  ## Launch a kernel on the GPU.
+  ## also checks if launchKernel() returns an error.
+  ## Important: this only checks if the kernel launch was successful, not the kernel itself.
+  # 
+  # This code is kinda gross, the launch kernel functions have a lot of different signatures.
+  var kernelArgs: seq[pointer]
+  for key, arg in args.fieldPairs:
+    let a1 = arg
+    kernelArgs.add(cast[pointer](addr a1))
   when HippoRuntime == "HIP" and HipPlatform == "amd":
-    # This branch works for all args
-    echo "executing HIP"
-    var kernelArgs: seq[pointer]
-    for key, arg in args.fieldPairs:
-      kernelArgs.add(cast[pointer](addr arg))
     result = hipLaunchKernel(
       cast[pointer](kernel),
       gridDim,
       blockDim,
       cast[ptr pointer](addr kernelArgs[0]),
+      sharedMemBytes,
+      stream
     )
-  elif HippoRuntime == "HIP" and HipPlatform == "nvidia":
-    # TODO fix args on this branch
-    hipLaunchKernelGGL(
-      kernel,
-      gridDim,
-      blockDim,
-      0, # TODO
-      nil, # TODO
-      # TODO handle args properly
-      cast[ptr[cint]](args[0]),
-      cast[ptr[cint]](args[1]),
-      cast[ptr[cint]](args[2])
-      )
-    result = hipGetLastError()
-  elif HippoRuntime == "HIP_CPU":
-    # TODO fix args on this branch
-    echo "executing kernel on CPU"
-    hipLaunchKernelGGL(
+  elif (HippoRuntime == "HIP" and HipPlatform == "nvidia") or HippoRuntime == "HIP_CPU":
+    hipLaunchKernelGGLWithTuple(
       kernel,
       gridDim,
       blockDim,
-      0, # TODO
-      nil, # TODO
-      # TODO handle args properly
-      args[0],
-      args[1],
-      args[2]
+      sharedMemBytes,
+      stream,
+      args
     )
     result = hipGetLastError()
   elif HippoRuntime == "CUDA":
-    # This branch works for all args
-    echo "executing CUDA"
-    var kernelArgs: seq[pointer]
-    for key, arg in args.fieldPairs:
-      kernelArgs.add(cast[pointer](addr arg))
     result = cudaLaunchKernel(
       kernel,
       gridDim,
       blockDim,
-      cast[ptr pointer](addr kernelArgs[0])
-      #sharedMemBytes,
-      #stream
+      cast[ptr pointer](addr kernelArgs[0]),
+      sharedMemBytes,
+      stream
     )
   else:
     raise newException(Exception, &"Unknown runtime: {HippoRuntime}")
 
-template hippoLaunchKernel*(
-  kernel: proc,                     ## The GPU kernel procedure to launch
-  gridDim: Dim3 = newDim3(1,1,1),   ## default to a grid of 1 block
-  blockDim: Dim3 = newDim3(1,1,1),  ## default to 1 thread per block
-  sharedMemBytes: uint32 = 0,       ## dynamic shared memory amount to allocate
-  stream: HippoStream = nil,        ## Which device stream to run under (defaults to null)
-  args: tuple,                      ## Arguments to pass to the GPU kernel
-) =
-  ## Launch a kernel on the GPU and check for errors
-  handleError(launchKernel(kernel, gridDim, blockDim, sharedMemBytes, stream, args))
+  handleError(result)
+
 
 
 # -------------------

diff --git a/tests/hip/call_params.nim b/tests/hip/call_params.nim
@@ -8,10 +8,10 @@ proc main() =
   var c: int32
   var dev_c: ptr[int32]
   handleError(hipMalloc(cast[ptr pointer](addr dev_c), sizeof(int32).cint))
-  handleError(launchKernel(
+  hippoLaunchKernel(
     addKernel,
     args = (2,7,dev_c)
-  ))
+  )
   handleError(hipMemcpy(addr c, dev_c, sizeof(int32).cint, hipMemcpyDeviceToHost))
   echo "2 + 7 = ", c
   handleError(hipFree(dev_c))

diff --git a/tests/hip/dot.nim b/tests/hip/dot.nim
@@ -62,12 +62,12 @@ proc main() =
   handleError(hipMemcpy(dev_b, addr b[0], sizeof(float64)*N, hipMemcpyHostToDevice))
 
   # launch kernel
-  handleError(launchKernel(
+  hippoLaunchKernel(
     dot,
     gridDim = newDim3(BlocksPerGrid.uint32),
     blockDim = newDim3(ThreadsPerBlock.uint32),
     args = (dev_a, dev_b, dev_partial_c)
-  ))
+  )
 
   # copy memory back from GPU to CPU
   handleError(hipMemcpy(addr partial_c[0], dev_partial_c, BlocksPerGrid * sizeof(float64), hipMemcpyDeviceToHost))

diff --git a/tests/hip/vector_sum.nim b/tests/hip/vector_sum.nim
@@ -29,11 +29,11 @@ proc main() =
   handleError(hipMemcpy(dev_b, addr b[0], sizeof(int32)*N, hipMemcpyHostToDevice))
 
   # launch kernel
-  handleError(launchKernel(
+  hippoLaunchKernel(
     addkernel,
     gridDim = newDim3(N.uint32),
     args = (dev_a, dev_b, dev_c)
-  ))
+  )
 
   # copy result back to host
   handleError(hipMemcpy(addr c[0], dev_c, sizeof(int32)*N, hipMemcpyDeviceToHost))