Skip to content

Commit

Permalink
Remove linearized index calculation
Browse files Browse the repository at this point in the history
  • Loading branch information
Mehmet Yusufoglu committed Jan 19, 2024
1 parent 3901fcc commit 0f8023e
Showing 1 changed file with 6 additions and 8 deletions.
14 changes: 6 additions & 8 deletions example/convolution1D/src/convolution1D.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,24 +55,22 @@ struct ConvolutionKernel
Vec const globalThreadIdx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
Vec const globalThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);

// Map the three dimensional thread index into a
// one dimensional thread index space
auto const linearizedGlobalThreadIdx = alpaka::mapIdx<1u>(globalThreadIdx, globalThreadExtent)[0];

if(linearizedGlobalThreadIdx < static_cast<unsigned>(inputSize))
// The kernel is launched with a one-dimensional work division, so no linearization of the
// thread index is needed: globalThreadIdx[0] can be used directly to map all the threads.
if(globalThreadIdx[0] < static_cast<unsigned>(inputSize))
{
uint32_t const halfFilterSize = filterSize / 2;
TElem result = 0.0f;
// Calculate sum of multiplications of corresponding elements
for(size_t i = 0; i < filterSize; ++i)
{
int inputIndex = linearizedGlobalThreadIdx - halfFilterSize + i;
int inputIndex = globalThreadIdx[0] - halfFilterSize + i;
if(inputIndex >= 0 && inputIndex < inputSize)
{
result += input[inputIndex] * filter[i];
}
}
output[linearizedGlobalThreadIdx] = result;
output[globalThreadIdx[0]] = result;
}
}
};
Expand Down Expand Up @@ -178,7 +176,7 @@ auto main() -> int
alpaka::wait(queue);

// Allocate memory on host
BufAcc resultGpuHost = alpaka::allocBuf<DataType, Idx>(devHost, InputSize);
auto resultGpuHost = alpaka::allocBuf<DataType, Idx>(devHost, InputSize);
// Copy from device memory to host
alpaka::memcpy(queue, resultGpuHost, outputDeviceMemory, InputSize);

Expand Down

0 comments on commit 0f8023e

Please sign in to comment.