Skip to content

Host Performance

Hüseyin Tuğrul BÜYÜKIŞIK edited this page Jul 4, 2017 · 7 revisions

Different host-device interaction scenarios were tested using a simple kernel that works on 3 arrays (all array reads/writes are disabled, so only kernel executions on device-side arrays are measured).

__kernel void test(__global int * x, __global float *y, __global float *z)
    int id=get_global_id(0);
    int n=x[id];

// Dynamic Parallelism feature
__kernel void testDynamic(__global int * x, __global float *y, __global float *z)
     int id=get_global_id(0);
     queue_t defaultQueue=get_default_queue();
     void (^compute)(void)=^{test(x,y,z);};
     // offset=1024 * id for non-collision on data
     ndrange_t nd = ndrange_1D(id*1024,1024,64);

Test-1: simple compute.

// Test-1: every kernel launch is issued as its own compute() call.
// 5 outer repetitions, each issuing 1000 launches of the "test" kernel.
// NOTE(review): the Stopwatch is created here, but the calls that start/stop it
// and read the elapsed time are not shown in this snippet.
Stopwatch stopw = new Stopwatch();
for (int j = 0; j < 5; j++)
    for (int i = 0; i < 1000; i++)
        // offset=1024 * i for non-collision on data
        // NOTE(review): 1024 and 64 are presumably the global and local range of each launch,
        // and i the compute id - confirm against the compute() API.
        gpuData0.nextParam(gpuData1, gpuData2).compute(gpgpu, i, "test", 1024, 64, 1024 * i);

result: 180 ms on average.

Test-2: enqueue mode.

// Test-2: same 5 x 1000 launches of the "test" kernel as Test-1, but each batch of
// 1000 compute() calls is wrapped between enqueueMode = true and enqueueMode = false.
// NOTE(review): presumably the calls are only enqueued while the flag is set and
// synchronization happens once when it is cleared - confirm against the library.
for (int j = 0; j < 5; j++)
     // switch enqueue mode on before issuing the batch
     gpgpu.enqueueMode = true;
     for (int i = 0; i < 1000; i++)
         // offset=1024 * i for non-collision on data
          gpuData0.nextParam(gpuData1, gpuData2).compute(gpgpu, i, "test", 1024, 64, 1024 * i);
     // switch enqueue mode off after the batch has been issued
     gpgpu.enqueueMode = false;

result: 24 ms on average.

Test-3: async enqueue mode (it only benefits read-write-compute operations, so it is not useful for this benchmark).

// Test-3: same batch structure as the enqueue-mode test, with enqueueModeAsyncEnable
// additionally set to true right before each compute() call and back to false right after it.
// NOTE(review): the exact effect of the async flag is not visible here; the surrounding text
// says it matters only when array reads/writes are enabled - confirm against the library.
for (int j = 0; j < 5; j++)
     // switch enqueue mode on before issuing the batch
     gpgpu.enqueueMode = true;
     for (int i = 0; i < 1000; i++)
          // async flag is toggled per call, around a single compute()
          gpgpu.enqueueModeAsyncEnable = true;
          gpuData0.nextParam(gpuData1, gpuData2).compute(gpgpu, i, "test", 1024, 64, 1024 * i);
          gpgpu.enqueueModeAsyncEnable = false;
     // switch enqueue mode off after the batch has been issued
     gpgpu.enqueueMode = false;

result: 25 ms on average (but it becomes much faster than non-async enqueue mode when array reads/writes are enabled).