Skip to content

Commit

Permalink
1D convolutional filter using global memory
Browse files Browse the repository at this point in the history
  • Loading branch information
Mehmet Yusufoglu committed Jan 24, 2024
1 parent b161b2f commit 7343be9
Show file tree
Hide file tree
Showing 3 changed files with 226 additions and 0 deletions.
1 change: 1 addition & 0 deletions example/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ project("alpakaExamples" LANGUAGES CXX)

add_subdirectory("bufferCopy/")
add_subdirectory("complex/")
add_subdirectory("convolution1D/")
add_subdirectory("counterBasedRng/")
add_subdirectory("heatEquation/")
add_subdirectory("helloWorld/")
Expand Down
47 changes: 47 additions & 0 deletions example/convolution1D/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#
# Copyright 2023 Erik Zenker, Benjamin Worpitz, Jan Stephan
# SPDX-License-Identifier: ISC
#

################################################################################
# Required CMake version.
# Build script for the convolution1D example. It can be configured on its own
# or pulled in from a parent project that already provides the alpaka target.

cmake_minimum_required(VERSION 3.22)

# Let IDE generators group targets into folders (see the FOLDER property below).
set_property(GLOBAL PROPERTY USE_FOLDERS ON)

################################################################################
# Project.

# The same name is used for the project, the executable target and the test.
set(_TARGET_NAME convolution1D)

project(${_TARGET_NAME} LANGUAGES CXX)

#-------------------------------------------------------------------------------
# Find alpaka.
# Skipped entirely when an enclosing project has already defined alpaka::alpaka.

if(NOT TARGET alpaka::alpaka)
option(alpaka_USE_SOURCE_TREE "Use alpaka's source tree instead of an alpaka installation" OFF)

if(alpaka_USE_SOURCE_TREE)
# Don't build the examples recursively
set(alpaka_BUILD_EXAMPLES OFF)
# Two directories up from this file; presumably the alpaka repository root.
add_subdirectory("${CMAKE_CURRENT_LIST_DIR}/../.." "${CMAKE_BINARY_DIR}/alpaka")
else()
# Otherwise an installed alpaka package is mandatory.
find_package(alpaka REQUIRED)
endif()
endif()

#-------------------------------------------------------------------------------
# Add executable.
# NOTE(review): alpaka_add_executable is not defined in this file; it is
# presumably provided by the alpaka package or source tree found above.

alpaka_add_executable(
${_TARGET_NAME}
src/convolution1D.cpp)
target_link_libraries(
${_TARGET_NAME}
PUBLIC alpaka::alpaka)

# Place the target in the "example" folder of IDE project views.
set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER example)

# Register the executable as a test: it is run without arguments.
add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME})
178 changes: 178 additions & 0 deletions example/convolution1D/src/convolution1D.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
/* Copyright 2023 Benjamin Worpitz, Erik Zenker, Bernhard Manfred Gruber, Jan Stephan
* SPDX-License-Identifier: ISC
*/

#include <alpaka/alpaka.hpp>
#include <alpaka/example/ExampleDefaultAcc.hpp>

#include <algorithm>
#include <array>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <limits>
#include <type_traits>

//! Convolution Example
//!
//! 1D convolution example: creates a 1D input array and a 1D filter array, then applies the filter to the input.
//! The array sizes are hardcoded.
//!

/**
 * @brief The ConvolutionKernel function-object
 * Calculates a 1D convolution of an input array with a filter array.
 * The filter is applied as-is (it is not reversed) and input elements outside
 * the array are treated as if they contributed nothing to the sum.
 */
struct ConvolutionKernel
{
    /** @brief Main convolution code: each thread computes one output element.
     * @param acc Accelerator, used to query the global thread index
     * @param input Input array, first input of the convolution integral
     * @param filter Filter array, second input of the convolution integral
     * @param output Output array to be filled; must hold at least inputSize elements
     * @param inputSize Number of elements in input (and output)
     * @param filterSize Number of elements in filter
     *
     * The filter window is anchored at filter index filterSize / 2. For odd
     * filter sizes the window is symmetric around the current element. For even
     * filter sizes it has one tap less on the right-hand side, so that no filter
     * element beyond filterSize - 1 is ever read. An empty filter yields zeros.
     */
    template<typename TAcc, typename TElem>
    ALPAKA_FN_ACC auto operator()(
        TAcc const& acc,
        TElem const* const input,
        TElem const* const filter,
        TElem* const output,
        const std::size_t inputSize,
        const std::size_t filterSize) const -> void
    {
        // The kernel is launched 1-D, so the first (and only) component of the global
        // thread index identifies the output element; no linearization is required.
        auto const globalThreadIdxX
            = static_cast<std::size_t>(alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0]);

        // Surplus threads (grid larger than the input) have nothing to do.
        if(globalThreadIdxX >= inputSize)
            return;

        // An empty filter has no taps: the sum over zero products is zero.
        if(filterSize == 0)
        {
            output[globalThreadIdxX] = static_cast<TElem>(0);
            return;
        }

        // Number of filter taps left and right of the anchor tap.
        // leftTaps + rightTaps + 1 == filterSize, which bounds the filter index below.
        std::size_t const leftTaps = filterSize / 2;
        std::size_t const rightTaps = filterSize - leftTaps - 1;

        // Clamp the window to [0, inputSize - 1] using unsigned arithmetic that
        // can neither underflow nor overflow.
        std::size_t const lastInputIdx = inputSize - 1;
        std::size_t const start = globalThreadIdxX > leftTaps ? globalThreadIdxX - leftTaps : 0;
        std::size_t const stop
            = (lastInputIdx - globalThreadIdxX > rightTaps) ? globalThreadIdxX + rightTaps : lastInputIdx;

        // Sum of products of corresponding input and filter elements.
        TElem result = static_cast<TElem>(0);
        for(std::size_t i = start; i <= stop; ++i)
        {
            // i >= globalThreadIdxX - leftTaps, hence the subtraction cannot wrap around.
            result += input[i] * filter[i + leftTaps - globalThreadIdxX];
        }
        output[globalThreadIdxX] = result;
    }
};

/**
 * @brief Approximate equality of two floats.
 * @param a First value
 * @param b Second value
 * @return true if a and b differ by less than ten machine epsilons, scaled by
 * the larger magnitude of the two values (but never by less than 1).
 *
 * For magnitudes up to 1 this is an absolute tolerance of 10 * epsilon. For
 * larger values the tolerance grows with the magnitude, because a fixed
 * absolute tolerance would eventually be smaller than the spacing between
 * neighbouring floats and turn the check into an exact comparison.
 * Any comparison involving NaN yields false.
 */
auto FuzzyEqual(float a, float b) -> bool
{
    constexpr float tolerance = std::numeric_limits<float>::epsilon() * 10.0f;
    // Floor of 1 keeps the plain absolute tolerance for small values.
    float const scale = std::fmax(1.0f, std::fmax(std::fabs(a), std::fabs(b)));
    return std::fabs(a - b) < tolerance * scale;
}

auto main() -> int
{
// Size of 1D arrays to be used in convolution integral
// Here instead of "convolution kernel" the term "filter" is used because kernel has a different meaning in GPU
// programming. Secondly filter array is not reversed. Implemented like a convolutional layer in CNN.
constexpr size_t filterSize = 3;
using DataType = float;
constexpr size_t inputSize = 8;
constexpr std::array<DataType, inputSize> expectedOutput = {0.8f, 1.4f, 2.0f, 2.6f, 3.2f, 3.8f, 4.4f, 2.3f};

// Define the index domain
using Dim = alpaka::DimInt<1u>;
// Index type
using Idx = std::size_t;

// Define the accelerator
using DevAcc = alpaka::ExampleDefaultAcc<Dim, Idx>;
using QueueProperty = alpaka::Blocking;
using QueueAcc = alpaka::Queue<DevAcc, QueueProperty>;
using BufAcc = alpaka::Buf<DevAcc, DataType, Dim, Idx>;

std::cout << "Using alpaka accelerator: " << alpaka::getAccName<DevAcc>() << '\n';

auto const platformHost = alpaka::PlatformCpu{};
auto const devHost = alpaka::getDevByIdx(platformHost, 0);

// Select a device
auto const platformAcc = alpaka::Platform<DevAcc>{};
auto const devAcc = alpaka::getDevByIdx(platformAcc, 0);

// Create a queue on the device
QueueAcc queue(devAcc);

// Allocate memory host input
auto hostInputMemory = alpaka::allocBuf<DataType, Idx>(devHost, inputSize);

// Fill array with data
for(size_t i = 0; i < inputSize; i++)
hostInputMemory[i] = static_cast<DataType>(i + 1);

// Allocate memory host filter
auto hostFilterMemory = alpaka::allocBuf<DataType, Idx>(devHost, filterSize);

// Fill array with any data
for(size_t i = 0; i < filterSize; i++)
hostFilterMemory[i] = static_cast<DataType>(i + 1) / 10.0f;

// Allocate memory in device
BufAcc inputDeviceMemory = alpaka::allocBuf<DataType, Idx>(devAcc, inputSize);
BufAcc filterDeviceMemory = alpaka::allocBuf<DataType, Idx>(devAcc, filterSize);
BufAcc outputDeviceMemory = alpaka::allocBuf<DataType, Idx>(devAcc, static_cast<Idx>(inputSize));

// Copy input and filter (convolution kernel array) from host to device
alpaka::memcpy(queue, inputDeviceMemory, hostInputMemory, inputSize);
alpaka::memcpy(queue, filterDeviceMemory, hostFilterMemory, filterSize);
// Make sure memcpy finished.
alpaka::wait(queue);
using Vec = alpaka::Vec<Dim, Idx>;
using WorkDiv = alpaka::WorkDivMembers<Dim, Idx>;

auto const elementsPerThread = Vec::all(static_cast<Idx>(1));
// Grid size
auto const threadsPerGrid = inputSize;
WorkDiv const workDiv = alpaka::getValidWorkDiv<DevAcc>(
devAcc,
threadsPerGrid,
elementsPerThread,
false,
alpaka::GridBlockExtentSubDivRestrictions::Unrestricted);

// Instantiate the kernel (gpu code) function-object
ConvolutionKernel convolutionKernel;

// Native pointers needed for the kernel execution function
DataType* nativeFilterDeviceMemory = alpaka::getPtrNative(filterDeviceMemory);
DataType* nativeInputDeviceMemory = alpaka::getPtrNative(inputDeviceMemory);
DataType* nativeOutputDeviceMemory = alpaka::getPtrNative(outputDeviceMemory);

// Run the kernel
alpaka::exec<DevAcc>(
queue,
workDiv,
convolutionKernel,
nativeInputDeviceMemory,
nativeFilterDeviceMemory,
nativeOutputDeviceMemory,
inputSize,
filterSize);

// Allocate memory on host
auto resultGpuHost = alpaka::allocBuf<DataType, Idx>(devHost, inputSize);
// Copy from device memory to host
alpaka::memcpy(queue, resultGpuHost, outputDeviceMemory, inputSize);
alpaka::wait(queue);

bool allEqual{true};
// Print result array at the host
for(size_t i{0}; i < inputSize; i++)
{
std::cout << "output[" << i << "]:" << std::setprecision(3) << resultGpuHost[i] << "\n";
// Compare with the reference output
bool fuzzyEqual = FuzzyEqual(resultGpuHost[i], expectedOutput[i]);
allEqual = allEqual && fuzzyEqual;
}
if(!allEqual)
{
std::cout << "Error: Some convolution results doesn't match!\n";
return EXIT_FAILURE;
}
std::cout << "All results are correct!\n";
return EXIT_SUCCESS;
}

0 comments on commit 7343be9

Please sign in to comment.