-
Notifications
You must be signed in to change notification settings - Fork 10
Host Performance
Hüseyin Tuğrul BÜYÜKIŞIK edited this page Jul 4, 2017
·
7 revisions
Different host-device interaction scenarios were tested with a simple kernel working on 3 arrays (all array reads/writes are disabled, so only kernel executions on device-side arrays are measured).
// Benchmark kernel: each work-item updates one element, y[i] += x[i] * z[i].
// The element index is the work-item's global id in dimension 0.
__kernel void test(__global int * x, __global float *y, __global float *z)
{
    const int gid = get_global_id(0);
    // Integer multiplier read from x; it is converted to float by the multiply
    const int scale = x[gid];
    y[gid] += scale * z[gid];
}
// Dynamic Parallelism feature
// Device-side launcher: every work-item of this kernel enqueues its own
// child run of test() on the default device queue, using the same buffers.
__kernel void testDynamic(__global int * x, __global float *y, __global float *z)
{
    const int parentId = get_global_id(0);

    // Child range: 1024 work-items in groups of 64, starting at offset
    // parentId * 1024 so that children of different parents do not collide on data
    const ndrange_t childRange = ndrange_1D(parentId * 1024, 1024, 64);

    // Block literal that wraps the call to the child kernel
    void (^childTask)(void) = ^{ test(x, y, z); };

    // The return code of enqueue_kernel is not checked, as in the original
    enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_NO_WAIT, childRange, childTask);
}
Test-1: simple compute.
// Test-1 driver: times 1000 consecutive compute() calls on the "test" kernel
// and prints the elapsed milliseconds; the measurement is repeated 5 times.
Stopwatch stopw = new Stopwatch();
for (int j = 0; j < 5; j++)
{
    stopw.Start();
    for (int i = 0; i < 1000; i++)
    {
        // offset=1024 * i for non-collision on data
        // NOTE(review): 1024 and 64 look like the global and local range sizes — confirm against the compute() signature
        gpuData0.nextParam(gpuData1, gpuData2).compute(gpgpu, i, "test", 1024, 64, 1024 * i);
    }
    // Stop the timer before reading it (the original called Start() a second
    // time here, which leaves the stopwatch running while it is read)
    stopw.Stop();
    Console.WriteLine(stopw.ElapsedMilliseconds);
    // Clear the accumulated time so the next repetition starts from zero
    stopw.Reset();
}
result: 180 ms on average.
Test-2: enqueue mode.
// Test-2 driver: the same 1000 compute() calls as Test-1, but issued while
// gpgpu.enqueueMode is switched on; timing comes from the library's own report.
// NOTE(review): presumably enqueue mode defers host-device synchronization
// until the flag is cleared — confirm against the library documentation.
for (int run = 0; run < 5; run++)
{
    gpgpu.enqueueMode = true;

    for (int step = 0; step < 1000; step++)
    {
        // offset=1024 * step for non-collision on data
        var kernelArgs = gpuData0.nextParam(gpuData1, gpuData2);
        kernelArgs.compute(gpgpu, step, "test", 1024, 64, 1024 * step);
    }

    gpgpu.enqueueMode = false;

    // Report is requested only after enqueue mode has been switched off
    gpgpu.lastComputePerformanceReport();
}
result: 24 ms on average.
Test-3: async enqueue mode (it only helps read-write-compute operations, so it is not useful for this benchmark).
// Test-3 driver: like the enqueue-mode benchmark, but every compute() call is
// additionally bracketed by gpgpu.enqueueModeAsyncEnable = true / false.
for (int run = 0; run < 5; run++)
{
    gpgpu.enqueueMode = true;

    for (int step = 0; step < 1000; step++)
    {
        // The async flag is raised only for the duration of this one call
        gpgpu.enqueueModeAsyncEnable = true;
        var kernelArgs = gpuData0.nextParam(gpuData1, gpuData2);
        kernelArgs.compute(gpgpu, step, "test", 1024, 64, 1024 * step);
        gpgpu.enqueueModeAsyncEnable = false;
    }

    gpgpu.enqueueMode = false;

    // Report is requested only after enqueue mode has been switched off
    gpgpu.lastComputePerformanceReport();
}
result: 25 ms on average (but it becomes much faster than non-async enqueue mode when array reads/writes are enabled).
Test-4: dynamic parallelism (OpenCL 2.0).
// Test-4 driver: a single host-side compute() call per repetition launches the
// "testDynamic" kernel; gpgpu.performanceFeed is switched on beforehand.
// NOTE(review): 1000 and 100 look like the global and local range sizes — confirm against the compute() signature
gpgpu.performanceFeed = true;
for (int run = 0; run < 5; run++)
{
    var kernelArgs = gpuData0.nextParam(gpuData1, gpuData2);
    kernelArgs.compute(gpgpu, 2543, "testDynamic", 1000, 100);
}
result: 1.7 ms on average (15x the speed of enqueue mode)
Host-side code is also cleaner with dynamic parallelism.