Skip to content

Commit

Permalink
allocations: Make buffer kernel more efficient for multiple allocations
Browse files Browse the repository at this point in the history
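- Fix malloc sizes for 'access_string' and 'kernel_string': size them from the pattern the current test actually uses ('used_pattern') instead of always 'read_pattern'.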
- Fix typo in 'number_of_work_itmes'.
  • Loading branch information
Sreelakshmi Haridas Maruthur committed Jan 17, 2025
1 parent 2ff5cda commit 31d7ec4
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 22 deletions.
57 changes: 38 additions & 19 deletions test_conformance/allocations/allocation_execute.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,18 @@ const char *buffer_kernel_pattern = {
"\tint tid = get_global_id(0);\n"
"\tuint r = 0;\n"
"\t%s i;\n"
"\tfor(i=(%s)tid*(%s)per_item; i<(%s)(1+tid)*(%s)per_item; i++) {\n"
"%s"
"\t}\n"
"\tresult[tid] = r;\n"
"}\n"
};

const char *accumulate_pattern = {
"\t%s end%d = min((%s)(1+tid)*(%s)per_item, array_sizes[%d]);\n"
"\tfor(i=(%s)tid*(%s)per_item; i<end%d; i++) {\n"
"\t\tr += buffer%d[i];\n"
"\t}\n"
};

const char *image_kernel_pattern = {
"__kernel void sample_test(%s __global uint *result)\n"
"{\n"
Expand Down Expand Up @@ -159,7 +164,7 @@ int check_image(cl_command_queue queue, cl_mem mem)
int execute_kernel(cl_context context, cl_command_queue *queue,
cl_device_id device_id, int test, cl_mem mems[],
int number_of_mems_used, int verify_checksum,
unsigned int number_of_work_itmes)
unsigned int number_of_work_items)
{

char *argument_string;
Expand All @@ -174,24 +179,38 @@ int execute_kernel(cl_context context, cl_command_queue *queue,
cl_uint per_item;
cl_uint per_item_uint;
cl_uint final_result;
std::vector<cl_uint> returned_results(number_of_work_itmes);
std::vector<cl_uint> returned_results(number_of_work_items);
clEventWrapper event;
cl_int event_status;

// Allocate memory for the kernel source
char *used_pattern = nullptr;
if (test == BUFFER || test == BUFFER_NON_BLOCKING)
{
used_pattern = (char *)accumulate_pattern;
}
else if (test == IMAGE_READ || test == IMAGE_READ_NON_BLOCKING)
{
used_pattern = (char *)read_pattern;
}
else if (test == IMAGE_WRITE || test == IMAGE_WRITE_NON_BLOCKING)
{
used_pattern = (char *)write_pattern;
}
argument_string =
(char *)malloc(sizeof(char) * MAX_NUMBER_TO_ALLOCATE * 64);
access_string = (char *)malloc(sizeof(char) * MAX_NUMBER_TO_ALLOCATE
* (strlen(read_pattern) + 10));
* (strlen(used_pattern) + 10));
kernel_string = (char *)malloc(sizeof(char) * MAX_NUMBER_TO_ALLOCATE
* (strlen(read_pattern) + 10 + 64)
* (strlen(used_pattern) + 10 + 64)
+ 1024);

argument_string[0] = '\0';
access_string[0] = '\0';
kernel_string[0] = '\0';

// Zero the results.
for (i = 0; i < number_of_work_itmes; i++) returned_results[i] = 0;
for (i = 0; i < number_of_work_items; i++) returned_results[i] = 0;

// detect if device supports ulong/int64
// detect whether profile of the device is embedded
Expand All @@ -209,13 +228,6 @@ int execute_kernel(cl_context context, cl_command_queue *queue,
// Build the kernel source
if (test == BUFFER || test == BUFFER_NON_BLOCKING)
{
for (i = 0; i < number_of_mems_used; i++)
{
sprintf(argument_string + strlen(argument_string),
" __global uint *buffer%d, ", i);
sprintf(access_string + strlen(access_string),
"\t\tif (i<array_sizes[%d]) r += buffer%d[i];\n", i, i);
}
char type[10];
if (support64)
{
Expand All @@ -225,8 +237,15 @@ int execute_kernel(cl_context context, cl_command_queue *queue,
{
sprintf(type, "uint");
}
for (i = 0; i < number_of_mems_used; i++)
{
sprintf(argument_string + strlen(argument_string),
" __global uint *buffer%d, ", i);
sprintf(access_string + strlen(access_string), accumulate_pattern,
type, i, type, type, i, type, type, i, i);
}
sprintf(kernel_string, buffer_kernel_pattern, argument_string, type,
type, type, type, type, type, access_string);
type, access_string);
}
else if (test == IMAGE_READ || test == IMAGE_READ_NON_BLOCKING)
{
Expand Down Expand Up @@ -282,14 +301,14 @@ int execute_kernel(cl_context context, cl_command_queue *queue,
// Set the result
result_mem =
clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
sizeof(cl_uint) * number_of_work_itmes,
sizeof(cl_uint) * number_of_work_items,
returned_results.data(), &error);
test_error(error, "clCreateBuffer failed");
error = clSetKernelArg(kernel, i, sizeof(result_mem), &result_mem);
test_error(error, "clSetKernelArg failed");

// Thread dimensions for execution
global_dims[0] = number_of_work_itmes;
global_dims[0] = number_of_work_items;
global_dims[1] = 1;
global_dims[2] = 1;

Expand Down Expand Up @@ -427,7 +446,7 @@ int execute_kernel(cl_context context, cl_command_queue *queue,
// Verify the checksum.
// Read back the result
error = clEnqueueReadBuffer(*queue, result_mem, CL_TRUE, 0,
sizeof(cl_uint) * number_of_work_itmes,
sizeof(cl_uint) * number_of_work_items,
returned_results.data(), 0, NULL, NULL);
test_error_abort(error, "clEnqueueReadBuffer failed");
final_result = 0;
Expand All @@ -436,7 +455,7 @@ int execute_kernel(cl_context context, cl_command_queue *queue,
{
// For buffers or read images we are just looking at the sum of what
// each thread summed up
for (i = 0; i < number_of_work_itmes; i++)
for (i = 0; i < number_of_work_items; i++)
{
final_result += returned_results[i];
}
Expand Down
6 changes: 3 additions & 3 deletions test_conformance/allocations/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ int doTest(cl_device_id device, cl_context context, cl_command_queue queue,
int number_of_mems_used;
cl_ulong max_individual_allocation_size = g_max_individual_allocation_size;
cl_ulong global_mem_size = g_global_mem_size;
unsigned int number_of_work_itmes = 8192 * 32;
unsigned int number_of_work_items = 8192 * 32;
const bool allocate_image =
(alloc_type != BUFFER) && (alloc_type != BUFFER_NON_BLOCKING);

Expand Down Expand Up @@ -183,7 +183,7 @@ int doTest(cl_device_id device, cl_context context, cl_command_queue queue,
g_reduction_percentage);
g_max_size = (size_t)((double)g_max_size
* (double)g_reduction_percentage / 100.0);
number_of_work_itmes = 8192 * 2;
number_of_work_items = 8192 * 2;
}

// Round to nearest MB.
Expand Down Expand Up @@ -220,7 +220,7 @@ int doTest(cl_device_id device, cl_context context, cl_command_queue queue,
error =
execute_kernel(context, &queue, device, alloc_type, mems,
number_of_mems_used, g_write_allocations,
number_of_work_itmes);
number_of_work_items);
}

// If we failed to allocate more than 1/8th of the requested amount
Expand Down

0 comments on commit 31d7ec4

Please sign in to comment.