Skip to content

Commit

Permalink
Merge pull request #5 from monofuel/hip-cpu-varargs
Browse files Browse the repository at this point in the history
re-work launch kernel
  • Loading branch information
monofuel authored Sep 2, 2024
2 parents 77ab137 + bc52fbb commit 4a4b304
Show file tree
Hide file tree
Showing 16 changed files with 102 additions and 87 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,6 @@ jobs:
nim cpp examples/vector_sum_hip_amd.nim
#nim cpp examples/vector_sum_hip_nvidia.nim
- name: Execute CPU Example
run: |
./examples/vector_sum_cpu
# - name: Execute CPU Example
# run: |
# ./examples/vector_sum_cpu
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
*
!*/
!*.*
!Dockerfile

# normal ignores:
*.exe
Expand Down
16 changes: 16 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
FROM ubuntu:22.04

RUN apt update && apt install -y git gdb gcc g++ make libtbb2-dev && apt clean

RUN git clone --branch devel https://github.com/nim-lang/Nim.git --depth 1 /opt/Nim

WORKDIR /opt/Nim
RUN sh ./build_all.sh

RUN ./bin/nim c koch
RUN ./koch boot -d:release
RUN ./koch tools

RUN mkdir -p /root/.nimble/bin

ENV PATH="/opt/Nim/bin/:/root/.nimble/bin:${PATH}"
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ proc addKernel*(a, b: cint; c: ptr[cint]) {.hippoGlobal.} =
var c: int32
var dev_c: ptr[int32]
handleError(hipMalloc(cast[ptr pointer](addr dev_c), sizeof(int32).cint))
handleError(launchKernel(addKernel,args = (2,7,dev_c)))
hippoLaunchKernel(addKernel,args = (2,7,dev_c))
handleError(hipMemcpy(addr c, dev_c, sizeof(int32).cint, hipMemcpyDeviceToHost))
echo "2 + 7 = ", c
handleError(hipFree(dev_c))
Expand Down
9 changes: 9 additions & 0 deletions docker-compose.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
version: '3.8'

services:
hippo-build:
build:
context: .
dockerfile: ./Dockerfile
volumes:
- .:/p/hippo
4 changes: 2 additions & 2 deletions examples/vector_sum_cpu.nim
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,11 @@ proc main() =
handleError(hipMemcpy(dev_b, addr b[0], sizeof(int32)*N, hipMemcpyHostToDevice))

# launch kernel
handleError(launchKernel(
hippoLaunchKernel(
addkernel,
gridDim = newDim3(N.uint32),
args = (dev_a, dev_b, dev_c)
))
)

# copy result back to host
handleError(hipMemcpy(addr c[0], dev_c, sizeof(int32)*N, hipMemcpyDeviceToHost))
Expand Down
4 changes: 2 additions & 2 deletions examples/vector_sum_cuda.nim
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,11 @@ proc main() =
handleError(cudaMemcpy(dev_b, addr b[0], sizeof(int32)*N, cudaMemcpyHostToDevice))

# launch kernel
handleError(launchKernel(
hippoLaunchKernel(
addkernel,
gridDim = newDim3(N.uint32),
args = (dev_a, dev_b, dev_c)
))
)

# copy result back to host
handleError(cudaMemcpy(addr c[0], dev_c, sizeof(int32)*N, cudaMemcpyDeviceToHost))
Expand Down
4 changes: 2 additions & 2 deletions examples/vector_sum_hip_amd.nim
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,11 @@ proc main() =
handleError(hipMemcpy(dev_b, addr b[0], sizeof(int32)*N, hipMemcpyHostToDevice))

# launch kernel
handleError(launchKernel(
hippoLaunchKernel(
addkernel,
gridDim = newDim3(N.uint32),
args = (dev_a, dev_b, dev_c)
))
)

# copy result back to host
handleError(hipMemcpy(addr c[0], dev_c, sizeof(int32)*N, hipMemcpyDeviceToHost))
Expand Down
4 changes: 2 additions & 2 deletions examples/vector_sum_hip_nvidia.nim
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,11 @@ proc main() =
handleError(hipMemcpy(dev_b, addr b[0], sizeof(int32)*N, hipMemcpyHostToDevice))

# launch kernel
handleError(launchKernel(
hippoLaunchKernel(
addkernel,
gridDim = newDim3(N.uint32),
args = (dev_a, dev_b, dev_c)
))
)

# copy result back to host
handleError(hipMemcpy(addr c[0], dev_c, sizeof(int32)*N, hipMemcpyDeviceToHost))
Expand Down
2 changes: 1 addition & 1 deletion hippo.nimble
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
version = "0.5.2"
version = "0.5.5"
author = "Andrew Brower"
description = "HIP library for Nim"
license = "MIT"
Expand Down
3 changes: 0 additions & 3 deletions src/cuda.nim
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,6 @@ proc cudaLaunchKernel*(function_address: pointer; numBlocks: Dim3; dimBlocks: Di
proc cudaLaunchKernel*(function_address: pointer; numBlocks: Dim3; dimBlocks: Dim3;
args: ptr pointer, sharedMemBytes: uint32_t, stream: cudaStream_t): cudaError_t {.
importcpp: "cudaLaunchKernel(@)", header: "cuda_runtime.h".}
# proc cudaLaunchKernel*(function_address: pointer; numBlocks: dim3; dimBlocks: dim3;
# args: ptr pointer; sharedMemBytes: csize_t; stream: cudaStream_t): cint {.
# importcpp: "cudaLaunchKernel(@)", header: "cuda_runtime.h".}
proc cudaDeviceSynchronize*(): cudaError_t {.header: "cuda_runtime.h",importcpp: "cudaDeviceSynchronize(@)".}
proc cudaSyncthreads*() {.importcpp: "__syncthreads()", header: "cuda_runtime.h".}
proc hippoSyncthreads*() {.importcpp: "__syncthreads()", header: "cuda_runtime.h".}
Expand Down
7 changes: 3 additions & 4 deletions src/hip.nim
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,9 @@ proc hipFree*(`ptr`: pointer): hipError_t {.header: "hip/hip_runtime.h",importcp
proc hipLaunchKernel*(function_address: pointer; numBlocks: Dim3; dimBlocks: Dim3;
args: ptr pointer): hipError_t {.
importcpp: "hipLaunchKernel(@)", header: "hip/hip_runtime.h".}
# proc hipLaunchKernel*(function_address: pointer; numBlocks: dim3; dimBlocks: dim3;
# args: ptr pointer; sharedMemBytes: csize_t; stream: hipStream_t): cint {.
# importcpp: "hipLaunchKernel(@)", header: "hip/hip_runtime.h".}
proc hipLaunchKernel*(function_address: pointer; numBlocks: Dim3; dimBlocks: Dim3;
args: ptr pointer; sharedMemBytes: csize_t; stream: hipStream_t): cint {.
importcpp: "hipLaunchKernel(@)", header: "hip/hip_runtime.h".}
proc hipDeviceSynchronize*(): hipError_t {.header: "hip/hip_runtime.h",importcpp: "hipDeviceSynchronize(@)".}
proc hipSyncthreads*() {.importcpp: "__syncthreads()", header: "hip/hip_runtime.h".}
proc hippoSyncthreads*() {.importcpp: "__syncthreads()", header: "hip/hip_runtime.h".}
Expand All @@ -68,7 +68,6 @@ proc hipLaunchKernelGGL*(
) {.
importcpp: "hipLaunchKernelGGL(@)", header: "hip/hip_runtime.h", varargs.}


type ConstCString* {.importc: "const char*".} = object
converter toCString*(self: ConstCString): cstring {.importc: "(char*)", noconv, nodecl.}
converter toConstCString*(self: cstring): ConstCString {.importc: "(const char*)", noconv, nodecl.}
Expand Down
115 changes: 54 additions & 61 deletions src/hippo.nim
Original file line number Diff line number Diff line change
Expand Up @@ -134,88 +134,81 @@ proc `=destroy`*(mem: var GpuMemory) =
# -------------------
# Kernel Execution

proc launchKernel*(
kernel: proc,
gridDim: Dim3 = newDim3(1,1,1), # default to a grid of 1 block
blockDim: Dim3 = newDim3(1,1,1), # default to 1 thread per block
sharedMemBytes: uint32 = 0,
stream: HippoStream = nil,
args: tuple
): HippoError =
# launchKernel is designed to be similar to `kernel`<<<blockDim, gridDim>>>(args)

# this function is horrible but it works
# needs to be refactored to handle all the different runtimes and arguments better

# having some issues between hip and hip-cpu, so defining different versions of launchKernel
when HippoRuntime == "HIP" or HippoRuntime == "HIP_CPU":
macro hipLaunchKernelGGLWithTuple(
kernel: proc,
gridDim: Dim3 = newDim3(1,1,1),
blockDim: Dim3 = newDim3(1,1,1),
sharedMemBytes: uint32 = 0,
stream: HippoStream = nil,
args: tuple
): untyped =

var callNode = newCall(bindSym"hipLaunchKernelGGL")

# add the fixed vars
callNode.add kernel
callNode.add gridDim
callNode.add blockDim
callNode.add sharedMemBytes
callNode.add stream

# add every value of the tuple
for child in args:
callNode.add child
result = callNode

template hippoLaunchKernel*(
kernel: proc, ## The GPU kernel procedure to launch
gridDim: Dim3 = newDim3(1,1,1), ## default to a grid of 1 block
blockDim: Dim3 = newDim3(1,1,1), ## default to 1 thread per block
sharedMemBytes: uint32 = 0, ## dynamic shared memory amount to allocate
stream: HippoStream = nil, ## Which device stream to run under (defaults to null)
args: tuple, ## Arguments to pass to the GPU kernel
) =
var result: HippoError
## Launch a kernel on the GPU.
## also checks if launchKernel() returns an error.
## Important: this only checks if the kernel launch was successful, not the kernel itself.
#
# This code is kinda gross, the launch kernel functions have a lot of different signatures.
var kernelArgs: seq[pointer]
for key, arg in args.fieldPairs:
let a1 = arg
kernelArgs.add(cast[pointer](addr a1))
when HippoRuntime == "HIP" and HipPlatform == "amd":
# This branch works for all args
echo "executing HIP"
var kernelArgs: seq[pointer]
for key, arg in args.fieldPairs:
kernelArgs.add(cast[pointer](addr arg))
result = hipLaunchKernel(
cast[pointer](kernel),
gridDim,
blockDim,
cast[ptr pointer](addr kernelArgs[0]),
sharedMemBytes,
stream
)
elif HippoRuntime == "HIP" and HipPlatform == "nvidia":
# TODO fix args on this branch
hipLaunchKernelGGL(
kernel,
gridDim,
blockDim,
0, # TODO
nil, # TODO
# TODO handle args properly
cast[ptr[cint]](args[0]),
cast[ptr[cint]](args[1]),
cast[ptr[cint]](args[2])
)
result = hipGetLastError()
elif HippoRuntime == "HIP_CPU":
# TODO fix args on this branch
echo "executing kernel on CPU"
hipLaunchKernelGGL(
elif (HippoRuntime == "HIP" and HipPlatform == "nvidia") or HippoRuntime == "HIP_CPU":
hipLaunchKernelGGLWithTuple(
kernel,
gridDim,
blockDim,
0, # TODO
nil, # TODO
# TODO handle args properly
args[0],
args[1],
args[2]
sharedMemBytes,
stream,
args
)
result = hipGetLastError()
elif HippoRuntime == "CUDA":
# This branch works for all args
echo "executing CUDA"
var kernelArgs: seq[pointer]
for key, arg in args.fieldPairs:
kernelArgs.add(cast[pointer](addr arg))
result = cudaLaunchKernel(
kernel,
gridDim,
blockDim,
cast[ptr pointer](addr kernelArgs[0])
#sharedMemBytes,
#stream
cast[ptr pointer](addr kernelArgs[0]),
sharedMemBytes,
stream
)
else:
raise newException(Exception, &"Unknown runtime: {HippoRuntime}")

template hippoLaunchKernel*(
kernel: proc, ## The GPU kernel procedure to launch
gridDim: Dim3 = newDim3(1,1,1), ## default to a grid of 1 block
blockDim: Dim3 = newDim3(1,1,1), ## default to 1 thread per block
sharedMemBytes: uint32 = 0, ## dynamic shared memory amount to allocate
stream: HippoStream = nil, ## Which device stream to run under (defaults to null)
args: tuple, ## Arguments to pass to the GPU kernel
) =
## Launch a kernel on the GPU and check for errors
handleError(launchKernel(kernel, gridDim, blockDim, sharedMemBytes, stream, args))
handleError(result)



# -------------------
Expand Down
4 changes: 2 additions & 2 deletions tests/hip/call_params.nim
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,10 @@ proc main() =
var c: int32
var dev_c: ptr[int32]
handleError(hipMalloc(cast[ptr pointer](addr dev_c), sizeof(int32).cint))
handleError(launchKernel(
hippoLaunchKernel(
addKernel,
args = (2,7,dev_c)
))
)
handleError(hipMemcpy(addr c, dev_c, sizeof(int32).cint, hipMemcpyDeviceToHost))
echo "2 + 7 = ", c
handleError(hipFree(dev_c))
Expand Down
4 changes: 2 additions & 2 deletions tests/hip/dot.nim
Original file line number Diff line number Diff line change
Expand Up @@ -62,12 +62,12 @@ proc main() =
handleError(hipMemcpy(dev_b, addr b[0], sizeof(float64)*N, hipMemcpyHostToDevice))

# launch kernel
handleError(launchKernel(
hippoLaunchKernel(
dot,
gridDim = newDim3(BlocksPerGrid.uint32),
blockDim = newDim3(ThreadsPerBlock.uint32),
args = (dev_a, dev_b, dev_partial_c)
))
)

# copy memory back from GPU to CPU
handleError(hipMemcpy(addr partial_c[0], dev_partial_c, BlocksPerGrid * sizeof(float64), hipMemcpyDeviceToHost))
Expand Down
4 changes: 2 additions & 2 deletions tests/hip/vector_sum.nim
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,11 @@ proc main() =
handleError(hipMemcpy(dev_b, addr b[0], sizeof(int32)*N, hipMemcpyHostToDevice))

# launch kernel
handleError(launchKernel(
hippoLaunchKernel(
addkernel,
gridDim = newDim3(N.uint32),
args = (dev_a, dev_b, dev_c)
))
)

# copy result back to host
handleError(hipMemcpy(addr c[0], dev_c, sizeof(int32)*N, hipMemcpyDeviceToHost))
Expand Down

0 comments on commit 4a4b304

Please sign in to comment.