Skip to content

Commit

Permalink
Add const, change some code positions
Browse files Browse the repository at this point in the history
  • Loading branch information
Mehmet Yusufoglu committed Jan 22, 2024
1 parent 0f8023e commit c7665f3
Showing 1 changed file with 29 additions and 29 deletions.
58 changes: 29 additions & 29 deletions example/convolution1D/src/convolution1D.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,23 +10,16 @@
#include <iostream>
#include <limits>
#include <type_traits>

//! Convolution Example
//!
//! 1D convolution example: Creates two 1D arrays, calculates the convolution integral using those arrays.
//! 1D convolution example: Creates two 1D arrays (an input and a filter) and applies the convolution filter.
//! Array sizes are hardcoded.
//!

// Size of 1D arrays to be used in convolution integral
// In signal processing domain, the term "kernel" is used for the matrix or array used in convolution integral.
// Here instead of "kernel" the term "filter" is used because kernel has a different meaning in GPU programming.
constexpr size_t FilterSize = 3;
constexpr size_t InputSize = 8;

constexpr float ExpectedOutput[InputSize] = {0.8f, 1.4f, 2.0f, 2.6f, 3.2f, 3.8f, 4.4f, 2.3f};

/**
* @brief The ConvolutionKernel function object
* Calculates 1D convolution integral using input and filter arrays.
* @brief The ConvolutionKernel function-object
* Calculates 1D convolution using input and filter arrays.
*/
struct ConvolutionKernel
{
Expand All @@ -43,7 +36,7 @@ struct ConvolutionKernel
TAcc const& acc,
TElem const* const input,
TElem const* const filter,
TElem* output,
TElem* const output,
const std::size_t inputSize,
const std::size_t filterSize) const -> void
{
Expand All @@ -64,7 +57,7 @@ struct ConvolutionKernel
// Calculate sum of multiplications of corresponding elements
for(size_t i = 0; i < filterSize; ++i)
{
int inputIndex = globalThreadIdx[0] - halfFilterSize + i;
uint32_t inputIndex = globalThreadIdx[0] - halfFilterSize + i;
if(inputIndex >= 0 && inputIndex < inputSize)
{
result += input[inputIndex] * filter[i];
Expand Down Expand Up @@ -93,6 +86,13 @@ auto main() -> int
#if defined(ALPAKA_CI) && !defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED)
return EXIT_SUCCESS;
#else
// Sizes of the 1D arrays used in the convolution integral.
// The term "filter" is used here instead of "convolution kernel" because "kernel" has a different meaning in GPU
// programming. Also note that the filter array is not reversed: the operation is implemented like a convolutional
// layer in a CNN.
constexpr size_t filterSize = 3;
constexpr size_t inputSize = 8;
constexpr std::array<float, inputSize> expectedOutput = {0.8f, 1.4f, 2.0f, 2.6f, 3.2f, 3.8f, 4.4f, 2.3f};

// Define the index domain
using Dim = alpaka::DimInt<1u>;
// Index type
Expand All @@ -118,36 +118,36 @@ auto main() -> int
QueueAcc queue(devAcc);

// Allocate host memory for the input
auto hostInputMemory = alpaka::allocBuf<DataType, Idx>(devHost, InputSize);
auto hostInputMemory = alpaka::allocBuf<DataType, Idx>(devHost, inputSize);
DataType* nativeHostInputMemory = alpaka::getPtrNative(hostInputMemory);

// Fill array with data
for(size_t i = 0; i < InputSize; i++)
for(size_t i = 0; i < inputSize; i++)
nativeHostInputMemory[i] = static_cast<DataType>(i + 1);

// Allocate host memory for the filter
auto hostFilterMemory = alpaka::allocBuf<DataType, Idx>(devHost, FilterSize);
auto hostFilterMemory = alpaka::allocBuf<DataType, Idx>(devHost, filterSize);
DataType* nativeHostFilterMemory = alpaka::getPtrNative(hostFilterMemory);

// Fill array with any data
for(size_t i = 0; i < FilterSize; i++)
for(size_t i = 0; i < filterSize; i++)
nativeHostFilterMemory[i] = static_cast<DataType>(i + 1) / 10.0f;

// Allocate memory on the device
BufAcc inputDeviceMemory = alpaka::allocBuf<DataType, Idx>(devAcc, InputSize);
BufAcc filterDeviceMemory = alpaka::allocBuf<DataType, Idx>(devAcc, FilterSize);
BufAcc outputDeviceMemory = alpaka::allocBuf<DataType, Idx>(devAcc, static_cast<Idx>(InputSize));
BufAcc inputDeviceMemory = alpaka::allocBuf<DataType, Idx>(devAcc, inputSize);
BufAcc filterDeviceMemory = alpaka::allocBuf<DataType, Idx>(devAcc, filterSize);
BufAcc outputDeviceMemory = alpaka::allocBuf<DataType, Idx>(devAcc, static_cast<Idx>(inputSize));

// Copy input and filter (convolution kernel array) from host to device
alpaka::memcpy(queue, inputDeviceMemory, hostInputMemory, InputSize);
alpaka::memcpy(queue, filterDeviceMemory, hostFilterMemory, FilterSize);
alpaka::memcpy(queue, inputDeviceMemory, hostInputMemory, inputSize);
alpaka::memcpy(queue, filterDeviceMemory, hostFilterMemory, filterSize);

using Vec = alpaka::Vec<Dim, Idx>;
using WorkDiv = alpaka::WorkDivMembers<Dim, Idx>;

auto const elementsPerThread = Vec::all(static_cast<Idx>(1));
// Grid size
auto const threadsPerGrid = InputSize;
auto const threadsPerGrid = inputSize;
WorkDiv const workDiv = alpaka::getValidWorkDiv<DevAcc>(
devAcc,
threadsPerGrid,
Expand All @@ -171,22 +171,22 @@ auto main() -> int
nativeInputDeviceMemory,
nativeFilterDeviceMemory,
nativeOutputDeviceMemory,
InputSize,
FilterSize);
inputSize,
filterSize);
alpaka::wait(queue);

// Allocate memory on host
auto resultGpuHost = alpaka::allocBuf<DataType, Idx>(devHost, InputSize);
auto resultGpuHost = alpaka::allocBuf<DataType, Idx>(devHost, inputSize);
// Copy from device memory to host
alpaka::memcpy(queue, resultGpuHost, outputDeviceMemory, InputSize);
alpaka::memcpy(queue, resultGpuHost, outputDeviceMemory, inputSize);

bool allEqual{true};
// Print result array at the host
for(size_t i{0}; i < InputSize; i++)
for(size_t i{0}; i < inputSize; i++)
{
std::cout << "output[" << i << "]:" << std::setprecision(3) << resultGpuHost[i] << "\n";
// Compare with the reference output
bool fuzzyEqual = FuzzyEqual<DataType>(resultGpuHost[i], ExpectedOutput[i]);
bool fuzzyEqual = FuzzyEqual<DataType>(resultGpuHost[i], expectedOutput[i]);
allEqual = allEqual && fuzzyEqual;
}
if(!allEqual)
Expand Down

0 comments on commit c7665f3

Please sign in to comment.