Skip to content

Host Performance

Hüseyin Tuğrul BÜYÜKIŞIK edited this page Jul 4, 2017 · 7 revisions

With a simple kernel working on 3 arrays, different host-device interaction scenarios were tested.

// Element-wise multiply-accumulate: each work-item adds x[i] * z[i] to y[i],
// where i is the work-item's global id in dimension 0.
__kernel void test(__global int * x, __global float *y, __global float *z)
{
    const int gid = get_global_id(0);
    y[gid] = y[gid] + x[gid] * z[gid];
}

// Dynamic parallelism (device-side enqueue): every work-item of this kernel
// enqueues a child launch that runs test() on the same three buffers.
__kernel void testDynamic(__global int * x, __global float *y, __global float *z)
{
     const int parentId = get_global_id(0);

     // Child range: offset parentId*1024, 1024 work-items, work-groups of 64.
     const ndrange_t childRange = ndrange_1D(parentId*1024, 1024, 64);

     // Block executed by the child launch; it captures the kernel arguments.
     void (^childTask)(void) = ^{ test(x, y, z); };

     // NO_WAIT: the child may start without waiting for this kernel to finish.
     enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_NO_WAIT, childRange, childTask);
}

Test-1: simple compute.

// Test-1 benchmark: 5 timed rounds, each issuing 1000 "test" kernel computes.
// One elapsed-milliseconds value is printed per round.
Stopwatch stopw = new Stopwatch();
for (int j = 0; j < 5; j++)
{
    stopw.Start();
    for (int i = 0; i < 1000; i++)
    {
        // NOTE(review): the numeric arguments presumably are compute id,
        // global size (1024), local size (64) and global offset (1024 * i);
        // confirm against the compute() API.
        gpuData0.nextParam(gpuData1, gpuData2).compute(gpgpu, i, "test", 1024, 64, 1024 * i);
    }
    // Stop the timer before reading it so the printed value covers only this round.
    stopw.Stop();
    Console.WriteLine(stopw.ElapsedMilliseconds);
    // Zero the elapsed time so the next round is measured independently.
    stopw.Reset();
}

Result: 180 ms on average for each batch of 1000 compute calls.