diff --git a/example/convolution1D/src/convolution1D.cpp b/example/convolution1D/src/convolution1D.cpp index 370fce19648..294796ba5b7 100644 --- a/example/convolution1D/src/convolution1D.cpp +++ b/example/convolution1D/src/convolution1D.cpp @@ -10,23 +10,16 @@ #include #include #include + //! Convolution Example //! -//! 1D convolution example: Creates two 1D arrays, calculates the convolution integral using those arrays. +//! 1D convolution example: Creates two 1D arrays, applies convolution filter. //! Array sizes are hardcoded. //! -// Size of 1D arrays to be used in convolution integral -// In signal processing domain, the term "kernel" is used for the matrix or array used in convolution integral. -// Here instead of "kernel" the term "filter" is used because kernel has a different meaning in GPU programming. -constexpr size_t FilterSize = 3; -constexpr size_t InputSize = 8; - -constexpr float ExpectedOutput[InputSize] = {0.8f, 1.4f, 2.0f, 2.6f, 3.2f, 3.8f, 4.4f, 2.3f}; - /** - * @brief The ConvolutionKernel function object - * Calculates 1D convolution integral using input and filter arrays. + * @brief The ConvolutionKernel function-object + * Calculates 1D convolution using input and filter arrays. 
*/ struct ConvolutionKernel { @@ -43,7 +36,7 @@ struct ConvolutionKernel TAcc const& acc, TElem const* const input, TElem const* const filter, - TElem* output, + TElem* const output, const std::size_t inputSize, const std::size_t filterSize) const -> void { @@ -64,7 +57,7 @@ struct ConvolutionKernel // Calculate sum of multiplications of corresponding elements for(size_t i = 0; i < filterSize; ++i) { - int inputIndex = globalThreadIdx[0] - halfFilterSize + i; + uint32_t inputIndex = globalThreadIdx[0] - halfFilterSize + i; if(inputIndex >= 0 && inputIndex < inputSize) { result += input[inputIndex] * filter[i]; @@ -93,6 +86,13 @@ auto main() -> int #if defined(ALPAKA_CI) && !defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED) return EXIT_SUCCESS; #else + // Sizes of the 1D arrays used in the convolution. + // The term "filter" is used here instead of "convolution kernel" because "kernel" has a different meaning in GPU + // programming. Also, the filter array is not reversed: this is implemented like a convolutional layer in a CNN. 
+ constexpr size_t filterSize = 3; + constexpr size_t inputSize = 8; + constexpr std::array expectedOutput = {0.8f, 1.4f, 2.0f, 2.6f, 3.2f, 3.8f, 4.4f, 2.3f}; + // Define the index domain using Dim = alpaka::DimInt<1u>; // Index type @@ -118,36 +118,36 @@ auto main() -> int QueueAcc queue(devAcc); // Allocate memory host input - auto hostInputMemory = alpaka::allocBuf(devHost, InputSize); + auto hostInputMemory = alpaka::allocBuf(devHost, inputSize); DataType* nativeHostInputMemory = alpaka::getPtrNative(hostInputMemory); // Fill array with data - for(size_t i = 0; i < InputSize; i++) + for(size_t i = 0; i < inputSize; i++) nativeHostInputMemory[i] = static_cast(i + 1); // Allocate memory host filter - auto hostFilterMemory = alpaka::allocBuf(devHost, FilterSize); + auto hostFilterMemory = alpaka::allocBuf(devHost, filterSize); DataType* nativeHostFilterMemory = alpaka::getPtrNative(hostFilterMemory); // Fill array with any data - for(size_t i = 0; i < FilterSize; i++) + for(size_t i = 0; i < filterSize; i++) nativeHostFilterMemory[i] = static_cast(i + 1) / 10.0f; // Allocate memory in device - BufAcc inputDeviceMemory = alpaka::allocBuf(devAcc, InputSize); - BufAcc filterDeviceMemory = alpaka::allocBuf(devAcc, FilterSize); - BufAcc outputDeviceMemory = alpaka::allocBuf(devAcc, static_cast(InputSize)); + BufAcc inputDeviceMemory = alpaka::allocBuf(devAcc, inputSize); + BufAcc filterDeviceMemory = alpaka::allocBuf(devAcc, filterSize); + BufAcc outputDeviceMemory = alpaka::allocBuf(devAcc, static_cast(inputSize)); // Copy input and filter (convolution kernel array) from host to device - alpaka::memcpy(queue, inputDeviceMemory, hostInputMemory, InputSize); - alpaka::memcpy(queue, filterDeviceMemory, hostFilterMemory, FilterSize); + alpaka::memcpy(queue, inputDeviceMemory, hostInputMemory, inputSize); + alpaka::memcpy(queue, filterDeviceMemory, hostFilterMemory, filterSize); using Vec = alpaka::Vec; using WorkDiv = alpaka::WorkDivMembers; auto const elementsPerThread = 
Vec::all(static_cast(1)); // Grid size - auto const threadsPerGrid = InputSize; + auto const threadsPerGrid = inputSize; WorkDiv const workDiv = alpaka::getValidWorkDiv( devAcc, threadsPerGrid, @@ -171,22 +171,22 @@ auto main() -> int nativeInputDeviceMemory, nativeFilterDeviceMemory, nativeOutputDeviceMemory, - InputSize, - FilterSize); + inputSize, + filterSize); alpaka::wait(queue); // Allocate memory on host - auto resultGpuHost = alpaka::allocBuf(devHost, InputSize); + auto resultGpuHost = alpaka::allocBuf(devHost, inputSize); // Copy from device memory to host - alpaka::memcpy(queue, resultGpuHost, outputDeviceMemory, InputSize); + alpaka::memcpy(queue, resultGpuHost, outputDeviceMemory, inputSize); bool allEqual{true}; // Print result array at the host - for(size_t i{0}; i < InputSize; i++) + for(size_t i{0}; i < inputSize; i++) { std::cout << "output[" << i << "]:" << std::setprecision(3) << resultGpuHost[i] << "\n"; // Compare with the reference output - bool fuzzyEqual = FuzzyEqual(resultGpuHost[i], ExpectedOutput[i]); + bool fuzzyEqual = FuzzyEqual(resultGpuHost[i], expectedOutput[i]); allEqual = allEqual && fuzzyEqual; } if(!allEqual)