Skip to content

Commit

Permalink
Fixed bug for keypoints_gpu1.cl (8,4,4)
Browse files Browse the repository at this point in the history
  • Loading branch information
Pierre Paleo committed Aug 29, 2013
1 parent a31f2a5 commit 0396e62
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 40 deletions.
72 changes: 38 additions & 34 deletions openCL/keypoints_gpu1.cl
Original file line number Diff line number Diff line change
Expand Up @@ -57,23 +57,23 @@ This kernel has to be run with as (8,4,4) workgroup size
*/



__kernel void descriptor(
__global keypoint* keypoints,
__global unsigned char *descriptors,
__global float* grad,
__global float* orim,
int octsize,
int keypoints_start,
__global int* keypoints_end,
// int keypoints_end,
__global int* keypoints_end, //passing counter value to avoid to read it each time
int grad_width,
int grad_height)
{

int lid0 = get_local_id(0); //[0,8[
int lid1 = get_local_id(1); //[0,2[
int lid2 = get_local_id(2); //[0,2[
int lid = (lid0*2+lid1)*2+lid2; //[0,32[, to expand to [0,128[
int lid1 = get_local_id(1); //[0,4[
int lid2 = get_local_id(2); //[0,4[
int lid = (lid0*4+lid1)*4+lid2; //[0,128[
int groupid = get_group_id(0);
keypoint k = keypoints[groupid];
if (!(keypoints_start <= groupid && groupid < *keypoints_end && k.s1 >=0.0f))
Expand All @@ -91,14 +91,14 @@ __kernel void descriptor(
float spacing = k.s2/octsize * 3.0f;
int radius = (int) ((1.414f * spacing * 2.5f) + 0.5f);

int imin = (lid1 == 0 ? -64 : 0),
jmin = (lid2 == 0 ? -64 : 0);
int imax = imin+64,
jmax = jmin+64;
int imin = -64 +32*lid1,
jmin = -64 +32*lid2;
int imax = imin+32,
jmax = jmin+32;

//memset
for (i=0; i < 4; i++) histogram[4*lid+i] = 0.0f;
for (j=0; j < 4; j++) for (i=0; i < 8; i++) hist2[(lid*4+j)*8+i] = 0.0f;
histogram[lid] = 0.0f;
for (i=0; i < 8; i++) hist2[lid*8+i] = 0.0f;

for (i=imin; i < imax; i++) {
for (j2=jmin/8; j2 < jmax/8; j2++) {
Expand Down Expand Up @@ -169,9 +169,8 @@ __kernel void descriptor(


barrier(CLK_LOCAL_MEM_FENCE);
for (i=0; i < 4; i++)
histogram[4*lid+i]
+= hist2[(lid*4+i)*8]+hist2[(lid*4+i)*8+1]+hist2[(lid*4+i)*8+2]+hist2[(lid*4+i)*8+3]+hist2[(lid*4+i)*8+4]+hist2[(lid*4+i)*8+5]+hist2[(lid*4+i)*8+6]+hist2[(lid*4+i)*8+7];
histogram[lid]
+= hist2[lid*8]+hist2[lid*8+1]+hist2[lid*8+2]+hist2[lid*8+3]+hist2[lid*8+4]+hist2[lid*8+5]+hist2[lid*8+6]+hist2[lid*8+7];

barrier(CLK_LOCAL_MEM_FENCE);
//memset of 128 values of hist2 before re-use
Expand All @@ -181,10 +180,10 @@ __kernel void descriptor(
Normalization and thre work shared by the 16 threads (8 values per thread)
*/


for (i=0; i < 4; i++)
if (lid*4+i < 64) hist2[lid*4+i] += hist2[lid*4+i+64];

//parallel reduction to normalize vector
if (lid < 64) {
hist2[lid] += hist2[lid+64];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (lid < 32) {
hist2[lid] += hist2[lid+32];
Expand Down Expand Up @@ -215,43 +214,47 @@ __kernel void descriptor(
//Threshold to 0.2 of the norm, for invariance to illumination
__local int changed[1];
if (lid == 0) changed[0] = 0;
for (i=0; i < 4; i++) {
if (histogram[lid*4+i] > 0.2f) {
histogram[lid*4+i] = 0.2f;
atomic_inc(changed);
}
if (histogram[lid] > 0.2f) {
histogram[lid] = 0.2f;
atomic_inc(changed);
}
barrier(CLK_LOCAL_MEM_FENCE);
//if values have changed, we have to re-normalize
if (changed[0]) {
hist2[lid] = histogram[lid]*histogram[lid];

for (i=0; i < 4; i++)
if (lid*4+i < 64) hist2[lid*4+i] += hist2[lid*4+i+64];
if (lid < 64) {
hist2[lid] += hist2[lid+64];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (lid < 32)
if (lid < 32) {
hist2[lid] += hist2[lid+32];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (lid < 16)
if (lid < 16) {
hist2[lid] += hist2[lid+16];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (lid < 8)
if (lid < 8) {
hist2[lid] += hist2[lid+8];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (lid < 4)
if (lid < 4) {
hist2[lid] += hist2[lid+4];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (lid < 2)
if (lid < 2) {
hist2[lid] += hist2[lid+2];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (lid == 0) hist2[0] = rsqrt(hist2[0]+hist2[1]);
barrier(CLK_LOCAL_MEM_FENCE);
histogram[lid] *= hist2[0];
}



barrier(CLK_LOCAL_MEM_FENCE);
//finally, cast to integer
for (i=0; i < 4; i++)
descriptors[128*groupid+(lid*4+i)]
descriptors[128*groupid+lid]
= (unsigned char) MIN(255,(unsigned char)(512.0f*histogram[lid]));

}
Expand All @@ -260,3 +263,4 @@ __kernel void descriptor(




12 changes: 6 additions & 6 deletions test/test_keypoints_old.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,10 +78,10 @@
class test_keypoints(unittest.TestCase):
def setUp(self):

kernel_file = "keypoints_cpu.cl" if USE_CPU else "keypoints_gpu2.cl"
kernel_file = "keypoints_cpu.cl" if USE_CPU else "keypoints_gpu1.cl"
kernel_path = os.path.join(os.path.dirname(os.path.abspath(sift.__file__)), kernel_file)
kernel_src = open(kernel_path).read()
self.program = pyopencl.Program(ctx, kernel_src).build()
self.program = pyopencl.Program(ctx, kernel_src).build(options="-cl-nv-arch sm_20")
self.wg = (1, 128)


Expand Down Expand Up @@ -194,10 +194,10 @@ def test_descriptor(self):
wg = 1,
shape = keypoints.shape[0]*wg[0],
else:
wg = (8, 8, 8)
shape = int(keypoints.shape[0]*wg[0]), 8, 8
# wg = (4, 4, 8)
# shape = int(keypoints.shape[0]*wg[0]), 4, 8
# wg = (8, 8, 8)
# shape = int(keypoints.shape[0]*wg[0]), 8, 8
wg = (8, 4, 4)
shape = int(keypoints.shape[0]*wg[0]), 4, 4

gpu_keypoints = pyopencl.array.to_device(queue, keypoints)
#NOTE: for the following line, use pyopencl.array.empty instead of pyopencl.array.zeros if the keypoints are compacted
Expand Down

0 comments on commit 0396e62

Please sign in to comment.