# cuda.nim — Nim bindings and launch helpers for the CUDA runtime
# (wraps cuda_runtime.h; 224 lines in the original file).
import macros
import inline
import expr
#macro dumpType(x:typed): auto =
# result = newEmptyNode()
# echo x.getType.treerepr
proc addChildrenFrom*(dst,src: NimNode): NimNode =
  ## Append every child of `src` onto `dst` and return `dst`,
  ## so the call can be chained when building AST.
  result = dst
  for child in src:
    result.add child
macro procInst*(p: typed): auto =
  ## Given a typed call expression, return its first child — the symbol of
  ## the (instantiated) proc being called.  Used by `>>` to obtain the
  ## concrete instantiation of a generic kernel proc.
  #echo "begin procInst:"
  #echo p.treerepr
  result = p[0]
macro makeCall*(p: proc, x: tuple): NimNode =
  ## Build a call node that invokes `p` with the fields of tuple `x`
  ## as its arguments.
  result = addChildrenFrom(newCall(p), x)
type
  # FFI mirrors of CUDA runtime types; the actual layout comes from
  # cuda_runtime.h via importc.
  CudaDim3* {.importc:"dim3",header:"cuda_runtime.h".} = object
    # NOTE(review): CUDA's dim3 fields are `unsigned int`; `cint` is
    # assumed ABI-compatible here — confirm on the target platform.
    x*, y*, z*: cint
  # Opaque status code returned by every CUDA runtime call.
  cudaError_t* {.importc,header:"cuda_runtime.h".} = object
  # Opaque direction selector for cudaMemcpy (host<->device).
  cudaMemcpyKind* {.importc,header:"cuda_runtime.h".} = object
var
  # Sentinel constants imported from the CUDA runtime header.  Declared
  # as `var` (not `const`) because their values live on the C side.
  cudaSuccess*{.importC,header:"cuda_runtime.h".}: cudaError_t
  cudaErrorNotSupported*{.importC,header:"cuda_runtime.h".}: cudaError_t
  cudaMemcpyHostToDevice*{.importC,header:"cuda_runtime.h".}: cudaMemcpyKind
  cudaMemcpyDeviceToHost*{.importC,header:"cuda_runtime.h".}: cudaMemcpyKind
#template toPointer*(x: pointer): pointer = x
#template toPointer*[T](x: ptr T): pointer = pointer(x)
#template toPointer*(x: seq): pointer = toPointer(x[0])
#template toPointer*(x: not (pointer|seq)): pointer = pointer(unsafeAddr(x))
template toPointer*(x: typed): pointer =
  ## Best-effort conversion of `x` to a raw `pointer`: pointers and
  ## `ptr` values pass through, a `seq` yields the address of its first
  ## element, and anything else yields the address of `x` itself.
  ## The `when` branch order matters: `pointer` must be tested before
  ## the more general `ptr`.
  #dumpType: x
  when x is pointer: x
  elif x is ptr: x
  elif x is seq: toPointer(x[0])
  else: pointer(unsafeAddr(x))
template dataAddr*(x: typed): pointer =
  ## Address of the underlying data of `x`, used as the `varargs`
  ## converter for `cudaLaunch`: for a `seq` or `array` this recurses to
  ## the first element's address; otherwise it is the address of `x`.
  #dumpType: x
  when x is seq: dataAddr(x[0])
  elif x is array: dataAddr(x[0])
  #elif x is ptr: x
  else: pointer(unsafeAddr(x))
  #else: x
# Returns (and clears) the last error produced by a CUDA runtime call.
proc cudaGetLastError*(): cudaError_t
  {.importC,header:"cuda_runtime.h".}
# Raw binding for cudaGetErrorString; renamed with an X suffix so the
# friendlier cstring wrapper below can take the original name.
proc cudaGetErrorStringX*(error: cudaError_t): ptr char
  {.importC:"cudaGetErrorString",header:"cuda_runtime.h".}
proc cudaGetErrorString*(error: cudaError_t): cstring =
  ## Description string for a CUDA error code, as a `cstring`.
  ## The `codegendecl` pragma makes the generated C declare `s` as
  ## `const char *`, matching cudaGetErrorString's return type so the
  ## C compiler accepts the assignment without a cast.
  var s {.codegendecl:"const $# $#".} = cudaGetErrorStringX(error)
  result = s
proc `$`*(error: cudaError_t): string =
  ## Human-readable description of a CUDA error code, so status values
  ## can be passed straight to `echo` and string formatting.
  $cudaGetErrorString(error)
converter toBool*(e: cudaError_t): bool =
  ## Implicit truthiness for CUDA status codes: true when `e` is any
  ## failure code (i.e. not cudaSuccess), enabling `if err: ...`.
  ## NOTE(review): broad converters can trigger surprising implicit
  ## conversions; kept here for call-site brevity.
  cast[cint](e) != cast[cint](cudaSuccess)
# Allocates `size` bytes of device memory; the address is written to `p[]`.
proc cudaMalloc*(p:ptr pointer, size: csize): cudaError_t
  {.importC,header:"cuda_runtime.h".}
# Convenience overload: accepts a `pointer` lvalue and takes its address,
# so callers need not spell out the `ptr pointer` conversion.
template cudaMalloc*(p:pointer, size: csize): cudaError_t =
  cudaMalloc((ptr pointer)(p.addr), size)
# Frees device memory previously allocated with cudaMalloc/cudaMallocManaged.
proc cudaFree*(p: pointer): cudaError_t
  {.importC,header:"cuda_runtime.h".}
# Allocates unified (managed) memory accessible from both host and device.
proc cudaMallocManaged*(p: ptr pointer, size: csize): cudaError_t
  {.importC,header:"cuda_runtime.h".}
# Raw binding for cudaMemcpy; X suffix frees the original name for the
# template below that applies toPointer to its arguments.
proc cudaMemcpyX*(dst,src: pointer, count: csize, kind: cudaMemcpyKind):
  cudaError_t {.importC:"cudaMemcpy",header:"cuda_runtime.h".}
template cudaMemcpy*(dst,src: typed, count: csize,
                     kind: cudaMemcpyKind): cudaError_t =
  ## Copy `count` bytes between host/device in the direction given by
  ## `kind`.  `dst` and `src` may be any values accepted by `toPointer`
  ## (pointers, ptrs, seqs, or addressable lvalues).
  let pdst = toPointer(dst)
  let psrc = toPointer(src)
  cudaMemcpyX(pdst, psrc, count, kind)
# Launches kernel `p` with grid dims `gd`, block dims `bd`, and an array
# of pointers to the kernel arguments.  Not exported; use cudaLaunch.
proc cudaLaunchKernel(p:pointer, gd,bd: CudaDim3, args: ptr pointer):
  cudaError_t {.importC,header:"cuda_runtime.h".}
# Destroys all allocations and resets the device's state for this process.
proc cudaDeviceReset*(): cudaError_t
  {.importC,header:"cuda_runtime.h".}
# Blocks the host until all previously issued device work has completed.
proc cudaDeviceSynchronize*(): cudaError_t
  {.importC,header:"cuda_runtime.h".}
#proc printf*(fmt:cstring):cint {.importc,varargs,header:"<stdio.h>",discardable.}
#proc fprintf*(stream:ptr FILE,fmt:cstring):cint {.importc,varargs,header:"<stdio.h>".}
#proc malloc*(size: csize):pointer {.importc,header:"<stdlib.h>".}
template cudaDefs(body: untyped): untyped {.dirty.} =
  ## Wraps a kernel body: declares the CUDA built-in index variables
  ## (gridDim, blockIdx, blockDim, threadIdx) as importC globals so the
  ## generated C references the real CUDA built-ins, defines accessor
  ## templates for them, and runs `inlineProcs` over `body` so the
  ## kernel is self-contained.  `dirty` so the names bind at the
  ## instantiation site.
  var gridDim{.global,importC,noDecl.}: CudaDim3
  var blockIdx{.global,importC,noDecl.}: CudaDim3
  var blockDim{.global,importC,noDecl.}: CudaDim3
  var threadIdx{.global,importC,noDecl.}: CudaDim3
  template getGridDim: untyped {.used.} = gridDim
  template getBlockIdx: untyped {.used.} = blockIdx
  template getBlockDim: untyped {.used.} = blockDim
  template getThreadIdx: untyped {.used.} = threadIdx
  # Linear 1-D thread id and total thread count for grid-wide indexing.
  template getThreadNum: untyped {.used.} = blockDim.x * blockIdx.x + threadIdx.x
  template getNumThreads: untyped {.used.} = gridDim.x * blockDim.x
  bind inlineProcs
  inlineProcs:
    body
template cudaLaunch*(p: proc; blocksPerGrid,threadsPerBlock: SomeInteger;
                     arg: varargs[pointer,dataAddr]) =
  ## Launch kernel `p` on a 1-D grid of `blocksPerGrid` blocks with
  ## `threadsPerBlock` threads each.  Every element of `arg` is converted
  ## to a raw `pointer` by `dataAddr`, collected into a local array, and
  ## passed to cudaLaunchKernel.  On failure the error is echoed and the
  ## process exits with the raw CUDA error code.
  var pp: proc = p
  var gridDim, blockDim: CudaDim3
  gridDim.x = blocksPerGrid
  gridDim.y = 1
  gridDim.z = 1
  blockDim.x = threadsPerBlock
  blockDim.y = 1
  blockDim.z = 1
  # Size the argument array with at least one slot so `addr args[0]`
  # remains valid for zero-argument kernels (an array[0, pointer] cannot
  # be indexed and would fail to compile).
  var args: array[max(arg.len, 1), pointer]
  for i in 0..<arg.len: args[i] = arg[i]
  #echo "really launching kernel"
  let err = cudaLaunchKernel(pp, gridDim, blockDim, addr args[0])
  if err:
    echo err
    quit cast[cint](err)
#macro `<<`*(x:varargs[untyped]): auto =
# result = newEmptyNode()
# echo x.treerepr
# Pairs a kernel proc with its argument tuple; consumed by `>>` to form
# the launch syntax `kernel << (grid, block) >> (args...)`.
template `<<`*(p: proc, x: tuple): untyped = (p,x)
template getInst*(p: untyped): untyped =
  ## Resolve `p` (a typed kernel call) to its concrete proc
  ## instantiation via the procInst macro.
  #when compiles((var t=p; t)): p
  #else:
  procInst(p)
  #var t =
  #t
macro `>>`*(px: tuple, y: any): auto =
  ## Second half of the `<<`/`>>` launch syntax.  `px` is the
  ## (kernel, (blocksPerGrid, threadsPerBlock)) tuple produced by `<<`,
  ## and `y` holds the kernel arguments.  Builds a `cudaLaunch` call:
  ## the instantiated kernel symbol, then the two launch dimensions,
  ## then each argument.
  ## NOTE(review): the `any` parameter type is deprecated in newer Nim;
  ## confirm against the toolchain before modernizing.
  #echo "begin >>:"
  #echo px.treerepr
  #echo "kernel type:"
  #echo px[0].getTypeImpl.treerepr
  #echo "kernel args:"
  #echo y.treerepr
  #var a = y
  #if y.kind != nnkPar: a = newNimNode(nnkPar).addChildrenFrom(y)
  result = newCall(ident("cudaLaunch"))
  # Build a dummy call of the kernel with its args so getInst/procInst
  # can resolve the concrete (possibly generic) instantiation.
  let krnl = newCall(px[0]).addChildrenFrom(y)
  #echo "kernel inst call:"
  #echo krnl.treerepr
  result.add getAst(getInst(krnl))[0]
  # px[1] is the (blocksPerGrid, threadsPerBlock) tuple from `<<`.
  result.add px[1][0]
  result.add px[1][1]
  for c in y:
    result.add c
  #echo "kernel launch body:"
  #echo result.treerepr
macro cuda*(s,p: untyped): auto =
  ## Turn proc definition `p` into a CUDA kernel: prepends the C
  ## qualifier `s` (e.g. "__global__") to the generated declaration via
  ## codegenDecl, wraps the body in cudaDefs (built-in index variables +
  ## proc inlining), and disables runtime checks/stacktraces, which are
  ## unavailable in device code.
  #echo "begin cuda:"
  #echo s.treerepr
  let ss = s.strVal
  #echo "proc:"
  #echo p.treerepr
  p.expectKind nnkProcDef
  result = p
  # if p.kind == nnkProcDef:
  # result = p
  # else:
  # result = p[0]
  result.addPragma parseExpr("{.codegenDecl:\""&ss&" $# $#$#\".}")[0]
  result.body = getAst(cudaDefs(result.body))
  var sl = newStmtList()
  sl.add( quote do:
    {.push checks: off.}
    {.push stacktrace: off.} )
  sl.add result
  result = sl
  #echo "end cuda:"
  #echo result.treerepr
# Marks a proc as a CUDA __global__ kernel entry point.
template cudaGlobal*(p: untyped): auto = cuda("__global__",p)
template onGpu*(nn,tpb: untyped, body: untyped): untyped =
  ## Run `body` on the GPU across `nn` work items with `tpb` threads per
  ## block.  Captured variables are packed into a tuple (packVars /
  ## substVars — presumably from the imported `expr` module; confirm),
  ## which is passed by value to a generated kernel; the launch is
  ## followed by a device synchronize.
  block:
    var v = packVars(body, getGpuPtr)
    # bycopy so the packed argument tuple is passed by value to the kernel.
    type ByCopy {.bycopy.} [T] = object
      d: T
    proc kern(xx: ByCopy[type(v)]) {.cudaGlobal.} =
      template deref(k: int): untyped = xx.d[k]
      substVars(body, deref)
    let ni = nn.int32
    let threadsPerBlock = tpb.int32
    # Round up so every one of the ni items is covered by a block.
    let blocksPerGrid = (ni+threadsPerBlock-1) div threadsPerBlock
    #echo "launching kernel"
    cudaLaunch(kern, blocksPerGrid, threadsPerBlock, v)
    discard cudaDeviceSynchronize()
# Overload: default of 64 threads per block.
template onGpu*(nn: untyped, body: untyped): untyped = onGpu(nn, 64, body)
# Overload: default grid of 512*64 work items, 64 threads per block.
template onGpu*(body: untyped): untyped = onGpu(512*64, 64, body)
when isMainModule:
  # Self-test: vector addition both via cudaLaunch and via onGpu.
  # array[0,float32] acts as an unchecked-length view type for kernel
  # parameters (bounds checks are pushed off inside kernels).
  type FltArr = array[0,float32]
  proc vectorAdd*(A: FltArr; B: FltArr; C: var FltArr; n: int32)
      {.cudaGlobal.} =
    # One element per thread; guard against the overhang of the last block.
    var i = blockDim.x * blockIdx.x + threadIdx.x
    if i < n:
      C[i] = A[i] + B[i]
  proc test =
    var n = 50000.cint
    var
      a = newSeq[float32](n)
      b = newSeq[float32](n)
      c = newSeq[float32](n)
    var threadsPerBlock: cint = 256
    # Round up so all n elements are covered.
    var blocksPerGrid: cint = (n + threadsPerBlock - 1) div threadsPerBlock
    cudaLaunch(vectorAdd, blocksPerGrid, threadsPerBlock, a, b, c, n)
    # Hooks used by onGpu's packVars: ints are passed by value, seqs by
    # the address of their first element.
    template getGpuPtr(x: int): untyped = x
    template getGpuPtr[T](x: seq[T]): untyped = addr(x[0])
    # Unchecked pointer indexing helpers for use inside the kernel body.
    template `[]`(x: ptr SomeNumber, i: SomeInteger): untyped {.used.} =
      cast[ptr array[0,type(x[])]](x)[][i]
    template `[]=`(x: ptr SomeNumber, i: SomeInteger, y:untyped): untyped {.used.} =
      cast[ptr array[0,type(x[])]](x)[][i] = y
    onGpu(n):
      let i = getBlockDim().x * getBlockIdx().x + getThreadIdx().x
      if i < n:
        c[i] = a[i] + b[i]
  test()