# parallelmcca.py — GPU-parallel MCCA / log-likelihood transform (PyCUDA).
# Sample source code from the Tutorial Introduction in the documentation.
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
import numpy
import time
class gpuProcessor:
    """GPU-accelerated log-likelihood transform over per-row count triples.

    Each input row carries three float counts; the CUDA kernel rebuilds a
    2x2 contingency table per row (using the corpus size for the remaining
    cell) and writes a G-style log-likelihood score for that row.
    """

    # CUDA kernel source, kept byte-identical to the original.
    # One thread handles one input row; the literal 512 inside the kernel
    # must stay in sync with the block size used at launch time.
    # NOTE(review): only k12 == 0 and k21 == 0 are guarded — a row where
    # k11 or k22 is 0 still reaches log(0) and yields NaN/Inf. Confirm
    # whether upstream data can contain such rows.
    _KERNEL_SRC = """
__global__ void transformVector(float *dest, float *a, float cSize, float vCnt)
{
int tid = (blockIdx.x * 512 + threadIdx.x)*3;
int vIdx = blockIdx.x * 512 + threadIdx.x;
// Make sure only real things run
if(vIdx < vCnt){
float k11 = a[tid+2];
float k12 = a[tid+1] - k11;
float k21 = a[tid] - k11;
float k22 = cSize - a[tid+1] - a[tid];
float C1 = k11 + k12;
float C2 = k21 + k22;
float R1 = k11 + k21;
float R2 = k12 + k22;
float N = k11 + k12 + k21 + k22;
if( k12 == 0 || k21 == 0){
dest[vIdx]=0;
} else {
dest[vIdx] = k11*log((k11*N)/(C1*R1)) + k12*log((k12*N)/(C1*R2)) + k21*log((k21*N)/(C2*R1)) + k22*log((k22*N)/(C2*R2));
}
}
}
"""

    def _get_kernel(self):
        """Compile the kernel once per instance and cache the entry point.

        The original recompiled the SourceModule on every call; compilation
        is expensive and the source never changes, so cache the function.
        """
        if not hasattr(self, "_transform_fn"):
            self._transform_fn = SourceModule(self._KERNEL_SRC).get_function(
                "transformVector")
        return self._transform_fn

    def doParallelTransform(self, inArray, corpusSize):
        """Run the log-likelihood transform for every row of *inArray*.

        Parameters:
            inArray: sequence of rows, each with three numeric counts
                (converted to a float32 ndarray of shape (vecCnt, 3)).
            corpusSize: total corpus count used for the k22 cell.

        Returns:
            numpy.ndarray of shape (vecCnt, 1), float32 — one score per row.
        """
        setupstart = time.time()
        a = numpy.array(inArray, numpy.float32)
        # print statements converted to functions for Python 3 compatibility;
        # output is unchanged.
        print("Corpus size:", corpusSize)
        print("Input Size:", a.shape)
        # Total vector (row) count.
        vecCnt = a.shape[0]
        # Allocate the input array on the device.
        a_gpu = cuda.mem_alloc(a.size * a.dtype.itemsize)
        # Allocate the results array on host and device (one float per row).
        dest = numpy.zeros(shape=(vecCnt, 1), dtype=numpy.float32)
        dest_gpu = cuda.mem_alloc(vecCnt * a.dtype.itemsize)
        # Copy the input to the GPU.
        cuda.memcpy_htod(a_gpu, a)
        func = self._get_kernel()
        # Kernel scalars are passed as float32 to match the float parameters.
        cSize = numpy.float32(corpusSize)
        vCnt = numpy.float32(vecCnt)
        # Ceil-divide the rows into 512-thread blocks.
        gridx = int((vecCnt + 511) // 512)
        print("executing kernel block(512,1,1) grid(" + str(gridx) + ",1)")
        # Time the kernel launch separately from the full setup.
        start = time.time()
        func(dest_gpu, a_gpu, cSize, vCnt, block=(512, 1, 1), grid=(gridx, 1))
        end = time.time()
        # Copy the result back to the host.
        cuda.memcpy_dtoh(dest, dest_gpu)
        setupend = time.time()
        print("elapsed time on GPU:", end - start)
        print("elapsed time on GPU & startup:", setupend - setupstart)
        return dest