We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
The generated OpenCL code has a syntax error.
Following is my kernel in Java.
public static void scan(KernelContext context, IntArray input, IntArray sum) { int[] temp = context.allocateIntLocalArray(4 * 128); int gid2 = context.globalIdx << 1; int group = context.groupIdx; int item = context.localIdx; int n = context.localGroupSizeX << 1; temp[2 * item] = input.get(gid2); temp[2 * item + 1] = input.get(gid2 + 1); int decale = 1; for (int d = n >> 1; d > 0; d = d >> 1) { context.localBarrier(); if (item < d) { int ai = decale * ((item << 1) + 1) - 1; int bi = decale * ((item << 1) + 2) - 1; temp[bi] += temp[ai]; } decale = decale << 1; } if (item == 0) { sum.set(group, temp[n - 1]); temp[n - 1] = 0; } for (int d = 1; d < n; d = d << 1) { decale = decale >> 1; context.localBarrier(); if (item < d) { int ai = decale * ((item << 1) + 1) - 1; int bi = decale * ((item << 1) + 2) - 1; int t = temp[ai]; temp[ai] = temp[bi]; temp[bi] += t; } } context.localBarrier(); input.set(gid2, temp[item << 1]); input.set(gid2 + 1, temp[(item << 1) + 1]); }
Successfully compile the code to OpenCL kernel without any error.
This is the generated OpenCL code which is wrong.
#pragma OPENCL EXTENSION cl_khr_fp64 : enable #pragma OPENCL EXTENSION cl_khr_fp16 : enable #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable __kernel void scan(__global long *_kernel_context, __constant uchar *_constant_region, __local uchar *_local_region, __global int *_atomics, __global uchar *input, __global uchar *sum) { ulong ul_8, ul_43, ul_1, ul_0, ul_15; long l_14, l_13, l_42, l_41, l_7, l_6; int i_25, i_27, i_28, i_29, i_30, i_31, i_32, i_17, i_18, i_19, i_20, i_21, i_22, i_23, i_24, i_9, i_10, i_11, i_12, i_16, i_3, i_4, i_5, i_57, i_58, i_59, i_60, i_49, i_50, i_51, i_52, i_53, i_54, i_55, i_56, i_44, i_45, i_46, i_47, i_33, i_34, i_35, i_37, i_38, i_39, i_40; bool b_36, b_26, b_48; // BLOCK 0 ul_0 = (ulong) input; ul_1 = (ulong) sum; __local int adi_2[512]; i_3 = get_global_id(0); i_4 = i_3 << 1; i_5 = i_4 + 6; l_6 = (long) i_5; l_7 = l_6 << 2; ul_8 = ul_0 + l_7; i_9 = *((__global int *) ul_8); i_10 = get_local_id(0); i_11 = i_10 << 1; adi_2[i_11] = i_9; i_12 = i_4 + 7; l_13 = (long) i_12; l_14 = l_13 << 2; ul_15 = ul_0 + l_14; i_16 = *((__global int *) ul_15); i_17 = i_11 + 1; adi_2[i_17] = i_16; i_18 = i_11 + 2; i_19 = get_local_size(0); i_20 = i_19 << 1; i_21 = i_20 >> 1; // BLOCK 1 MERGES [0 5 ] i_22 = 1; i_23 = i_21; for(;i_23 >= 1;) { // BLOCK 2 barrier(CLK_LOCAL_MEM_FENCE); i_24 = i_23 >> 1; i_25 = i_22 << 1; b_26 = i_10 < i_23; if(b_26) { // BLOCK 3 i_27 = i_18 * i_22; i_28 = i_27 + -1; i_29 = adi_2[i_28]; i_30 = i_22 * i_17; i_31 = i_30 + -1; i_32 = adi_2[i_31]; i_33 = i_29 + i_32; adi_2[i_28] = i_33; } // B3 else { // BLOCK 4 } // B4 // BLOCK 5 MERGES [4 3 ] i_34 = i_25; i_35 = i_24; i_22 = i_34; i_23 = i_35; } // B5 // BLOCK 6 b_36 = i_10 == 0; if(b_36) { // BLOCK 7 i_37 = i_20 + -1; i_38 = adi_2[i_37]; i_39 = get_group_id(0); i_40 = i_39 + 6; l_41 = (long) i_40; l_42 = l_41 << 2; ul_43 = ul_1 + l_42; *((__global int *) ul_43) = i_38; adi_2[i_37] = 0; } // B7 else { // BLOCK 8 } // B8 // BLOCK 9 MERGES [8 7 ] // BLOCK 10 MERGES [9 14 ] i_44 = i_22; i_45 = 1; for(;i_45 < i_20;) { // BLOCK 11 barrier(CLK_LOCAL_MEM_FENCE); i_46 = i_45 << 1; i_47 = i_44 >> 1; b_48 = i_10 < i_45; if(b_48) { // BLOCK 12 i_49 = i_47 * i_17; i_50 = i_49 + -1; i_51 = adi_2[i_50]; i_52 = i_18 * i_47; i_53 = i_52 + -1; i_54 = adi_2[i_53]; adi_2[i_50] = i_54; i_55 = adi_2[i_53]; i_56 = i_51 + i_55; adi_2[i_53] = i_56; } // B12 else { // BLOCK 13 } // B13 // BLOCK 14 MERGES [13 12 ] i_57 = i_47; i_58 = i_46; i_44 = i_57; i_45 = i_58; } // B14 // BLOCK 15 barrier(CLK_LOCAL_MEM_FENCE); i_59 = adi_2[i_11]; *((__global int *) ul_8) = i_59; i_60 = adi_2[i_17]; *((__global int *) ul_15) = i_60; return; } // B15 } // kernel
The text was updated successfully, but these errors were encountered:
Thank you for the report. We will work on this.
Sorry, something went wrong.
@ShAlireza can you also provide the exact input sizes that you run the scan kernel? Thanks
Sure. IntArray input = new IntArray(256 * 128 * 4); IntArray sum = new IntArray(256);
And here is my GridScheduler: int totalLocalScanItems = 256 * 128 * 4 / 2; int localItems = totalLocalScanItems / 256;
WorkerGrid scanWorker = new WorkerGrid1D(totalLocalScanItems); scanWorker.setLocalWork(localItems, 1, 1);
Thank you @ShAlireza, I managed to reprorduce it locally. We will work on the issue and we will get back to you.
mikepapadim
No branches or pull requests
Describe the bug
The generated OpenCL code has a syntax error.
How To Reproduce
Following is my kernel in Java.
Expected behavior
Successfully compile the code to OpenCL kernel without any error.
Computing system setup (please complete the following information):
Additional context
This is the generated OpenCL code which is wrong.
The text was updated successfully, but these errors were encountered: