From 0f8023e21a4898ecff2eedb876adcfb68da44542 Mon Sep 17 00:00:00 2001 From: Mehmet Yusufoglu Date: Fri, 19 Jan 2024 23:36:54 +0100 Subject: [PATCH] Remove linearized index calculation --- example/convolution1D/src/convolution1D.cpp | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/example/convolution1D/src/convolution1D.cpp b/example/convolution1D/src/convolution1D.cpp index eca4800b4a3..370fce19648 100644 --- a/example/convolution1D/src/convolution1D.cpp +++ b/example/convolution1D/src/convolution1D.cpp @@ -55,24 +55,22 @@ struct ConvolutionKernel Vec const globalThreadIdx = alpaka::getIdx(acc); Vec const globalThreadExtent = alpaka::getWorkDiv(acc); - // Map the three dimensional thread index into a - // one dimensional thread index space - auto const linearizedGlobalThreadIdx = alpaka::mapIdx<1u>(globalThreadIdx, globalThreadExtent)[0]; - - if(linearizedGlobalThreadIdx < static_cast(inputSize)) + // Since the kernel is launched 1-D, calculating linearizedGlobalThreadIdx is unnecessary. + // globalThreadIdx[0] can be used to map all the threads. + if(globalThreadIdx[0] < static_cast(inputSize)) { uint32_t const halfFilterSize = filterSize / 2; TElem result = 0.0f; // Calculate sum of multiplications of corresponding elements for(size_t i = 0; i < filterSize; ++i) { - int inputIndex = linearizedGlobalThreadIdx - halfFilterSize + i; + int inputIndex = globalThreadIdx[0] - halfFilterSize + i; if(inputIndex >= 0 && inputIndex < inputSize) { result += input[inputIndex] * filter[i]; } } - output[linearizedGlobalThreadIdx] = result; + output[globalThreadIdx[0]] = result; } } }; @@ -178,7 +176,7 @@ auto main() -> int alpaka::wait(queue); // Allocate memory on host - BufAcc resultGpuHost = alpaka::allocBuf(devHost, InputSize); + auto resultGpuHost = alpaka::allocBuf(devHost, InputSize); // Copy from device memory to host alpaka::memcpy(queue, resultGpuHost, outputDeviceMemory, InputSize);