-
Notifications
You must be signed in to change notification settings - Fork 10
Host Performance
Hüseyin Tuğrul BÜYÜKIŞIK edited this page Jul 4, 2017
·
7 revisions
With a simple kernel working on 3 arrays, different host-device interaction scenarios were tested.
/*
 * Element-wise multiply-accumulate over three buffers.
 * Each work-item uses its global id (dimension 0) as the element index and
 * performs y[i] += x[i] * z[i], with the integer from x promoted to float.
 */
__kernel void test(__global int * x, __global float *y, __global float *z)
{
    const int gid = get_global_id(0);
    const int factor = x[gid];
    y[gid] = y[gid] + factor * z[gid];
}
/*
 * Device-side enqueue (dynamic parallelism) variant.
 * Every work-item of this parent kernel submits one child launch of test()
 * to the default device queue. The child range is 1-D with a global offset
 * of id*1024, a global size of 1024 and a local size of 64.
 * The launch uses CLK_ENQUEUE_FLAGS_NO_WAIT, and the status returned by
 * enqueue_kernel is not checked.
 */
__kernel void testDynamic(__global int * x, __global float *y, __global float *z)
{
    int parentId = get_global_id(0);

    /* Child range: offset = parentId*1024, global size = 1024, local size = 64. */
    ndrange_t childRange = ndrange_1D(parentId*1024, 1024, 64);

    /* Block literal wrapping the call to test() on the same three buffers. */
    void (^childWork)(void) = ^{ test(x, y, z); };

    queue_t deviceQueue = get_default_queue();
    enqueue_kernel(deviceQueue, CLK_ENQUEUE_FLAGS_NO_WAIT, childRange, childWork);
}
Test-1: simple compute.
// Test-1 timing harness: runs 5 measurement rounds, each issuing 1000
// compute() calls on the "test" kernel, and prints the elapsed wall-clock
// milliseconds of every round.
Stopwatch stopw = new Stopwatch();
for (int j = 0; j < 5; j++)
{
    stopw.Start();
    for (int i = 0; i < 1000; i++)
    {
        // NOTE(review): the arguments presumably mean (cruncher, compute id,
        // kernel name, global range, local range, global offset) — confirm
        // against the compute() API; only the literal values are visible here.
        gpuData0.nextParam(gpuData1, gpuData2).compute(gpgpu, i, "test", 1024, 64, 1024 * i);
    }
    // Stop the timer before reading it. The original called Start() a second
    // time here, which does nothing on a running Stopwatch, so the elapsed
    // value was sampled while the timer was still running.
    stopw.Stop();
    Console.WriteLine(stopw.ElapsedMilliseconds);
    // Clear the accumulated time so the next round starts from zero.
    stopw.Reset();
}
result: 180 ms on average.