-
Notifications
You must be signed in to change notification settings - Fork 75
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
8fefd70
commit 710b2ec
Showing
3 changed files
with
258 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
# | ||
# Copyright 2023 Benjamin Worpitz, Jan Stephan | ||
# SPDX-License-Identifier: ISC | ||
# | ||
|
||
################################################################################ | ||
# Required CMake version. | ||
|
||
cmake_minimum_required(VERSION 3.25) | ||
|
||
set_property(GLOBAL PROPERTY USE_FOLDERS ON) | ||
|
||
################################################################################ | ||
# Project. | ||
|
||
set(_TARGET_NAME useBLASInAlpaka) | ||
|
||
project(${_TARGET_NAME} LANGUAGES CXX) | ||
|
||
# Check if Alpaka accelerator is CUDA-only | ||
if(NOT alpaka_ACC_GPU_CUDA_ONLY_MODE) | ||
# Print a warning and skip target creation | ||
message(WARNING "Skipping build of 'useBLASInAlpaka' because alpaka_ACC_GPU_CUDA_ONLY_MODE is not enabled.") | ||
return() | ||
endif() | ||
|
||
# Add cuBLAS library | ||
find_package(CUDA REQUIRED) | ||
set(CUDA_LIBRARIES ${CUDA_LIBRARIES} cublas) | ||
|
||
#------------------------------------------------------------------------------- | ||
# Find alpaka. | ||
|
||
if(NOT TARGET alpaka::alpaka) | ||
option(alpaka_USE_SOURCE_TREE "Use alpaka's source tree instead of an alpaka installation" OFF) | ||
|
||
if(alpaka_USE_SOURCE_TREE) | ||
# Don't build the examples recursively | ||
set(alpaka_BUILD_EXAMPLES OFF) | ||
add_subdirectory("${CMAKE_CURRENT_LIST_DIR}/../.." "${CMAKE_BINARY_DIR}/alpaka") | ||
else() | ||
find_package(alpaka REQUIRED) | ||
endif() | ||
endif() | ||
|
||
#------------------------------------------------------------------------------- | ||
# Add executable. | ||
|
||
alpaka_add_executable( | ||
${_TARGET_NAME} | ||
src/useBLASInAlpaka.cpp) | ||
target_link_libraries( | ||
${_TARGET_NAME} | ||
PUBLIC alpaka::alpaka ${CUDA_LIBRARIES}) | ||
|
||
set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER example) | ||
set_target_properties(${_TARGET_NAME} PROPERTIES CUDA_STANDARD 14) | ||
add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME}) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,198 @@ | ||
/* Copyright 2023 Mehmet Yusufoglu, Rene Widera, | ||
* SPDX-License-Identifier: ISC | ||
*/ | ||
/* | ||
* This example uses cuBLAS library functions in alpaka. cuBLAS function is called by using alpaka buffers and queue. | ||
* Since the code needs only AccGpuCuda backend. Make sure the correct alpaka cmake backend flag is set for alpaka. | ||
*/ | ||
#include <alpaka/alpaka.hpp> | ||
#include <alpaka/example/ExecuteForEachAccTag.hpp> | ||
|
||
#include <cublas_v2.h> | ||
|
||
#include <cmath> | ||
#include <iostream> | ||
|
||
// Index type | ||
using Idx = std::size_t; | ||
// Set data type | ||
using DataType = float; | ||
|
||
// Initialize the matrix in column-major order (1D buffer) | ||
void initializeMatrix(DataType* buffer, Idx rows, Idx cols, int value) | ||
{ | ||
for(Idx j = 0; j < cols; ++j) | ||
{ | ||
for(Idx i = 0; i < rows; ++i) | ||
{ | ||
buffer[i + j * rows] = static_cast<DataType>(value); | ||
} | ||
} | ||
} | ||
|
||
// In standard projects, you typically do not execute the code with any available accelerator. | ||
// Instead, a single accelerator is selected once from the active accelerators and the kernels are executed with the | ||
// selected accelerator only. If you use the example as the starting point for your project, you can rename the | ||
// example() function to main() and move the accelerator tag to the function body. | ||
template<alpaka::concepts::Tag TAccTag> | ||
auto example(TAccTag const&) -> int | ||
{ | ||
using Dim1D = alpaka::DimInt<1>; | ||
|
||
// Define matrix dimensions, A is MxK and B is KxN | ||
Idx const M = 4; // Rows in A and C | ||
Idx const N = 2; // Columns in B and C | ||
Idx const K = 3; // Columns in A and rows in B | ||
|
||
// Define the accelerator and queue | ||
// Use Cuda Accelerator. Cmake Acc flags should be set to Cuda-Only | ||
using Acc = alpaka::TagToAcc<TAccTag, Dim1D, Idx>; | ||
using Queue = alpaka::Queue<Acc, alpaka::Blocking>; | ||
|
||
auto const platformHost = alpaka::PlatformCpu{}; | ||
auto const devHost = alpaka::getDevByIdx(platformHost, 0); | ||
auto const platformAcc = alpaka::Platform<Acc>{}; | ||
auto const devAcc = alpaka::getDevByIdx(platformAcc, 0); | ||
|
||
Queue queue(devAcc); | ||
|
||
// Allocate 1D host memory | ||
auto bufHostA = alpaka::allocBuf<DataType, Idx>(devHost, M * K); | ||
auto bufHostB = alpaka::allocBuf<DataType, Idx>(devHost, K * N); | ||
auto bufHostC = alpaka::allocBuf<DataType, Idx>(devHost, M * N); | ||
|
||
DataType* hostA = alpaka::getPtrNative(bufHostA); | ||
DataType* hostB = alpaka::getPtrNative(bufHostB); | ||
DataType* hostC = alpaka::getPtrNative(bufHostC); | ||
|
||
// Initialize host matrices | ||
initializeMatrix(hostA, M, K, 1); // All elements in A are 1 | ||
initializeMatrix(hostB, K, N, 2); // All elements in B are 2 | ||
std::fill(hostC, hostC + (M * N), 0); // Initialize C with 0s | ||
|
||
// Print initialized matrices | ||
std::cout << "Matrix A (Host):" << std::endl; | ||
for(Idx i = 0; i < M; ++i) | ||
{ | ||
for(Idx j = 0; j < K; ++j) | ||
{ | ||
std::cout << hostA[i + j * M] << " "; | ||
} | ||
std::cout << std::endl; | ||
} | ||
|
||
std::cout << "Matrix B (Host):" << std::endl; | ||
for(Idx i = 0; i < K; ++i) | ||
{ | ||
for(Idx j = 0; j < N; ++j) | ||
{ | ||
std::cout << hostB[i + j * K] << " "; | ||
} | ||
std::cout << std::endl; | ||
} | ||
|
||
// Allocate 1D device memory | ||
auto bufDevA = alpaka::allocBuf<DataType, Idx>(devAcc, M * K); | ||
auto bufDevB = alpaka::allocBuf<DataType, Idx>(devAcc, K * N); | ||
auto bufDevC = alpaka::allocBuf<DataType, Idx>(devAcc, M * N); | ||
|
||
// Copy data to device | ||
alpaka::memcpy(queue, bufDevA, bufHostA); | ||
alpaka::memcpy(queue, bufDevB, bufHostB); | ||
alpaka::memcpy(queue, bufDevC, bufHostC); | ||
alpaka::wait(queue); | ||
|
||
std::cout << "Copied matrices A and B to the device." << std::endl; | ||
|
||
// Get the native CUDA stream from Alpaka queue | ||
auto alpakaStream = alpaka::getNativeHandle(queue); | ||
|
||
// cuBLAS setup | ||
cublasHandle_t cublasHandle; | ||
cublasCreate(&cublasHandle); | ||
cublasSetStream(cublasHandle, alpakaStream); | ||
|
||
// Perform matrix multiplication: C = A * B | ||
float alpha = 1.0f, beta = 0.0f; // Set beta to 0.0f to overwrite C | ||
cublasSgemm( | ||
cublasHandle, | ||
CUBLAS_OP_N, | ||
CUBLAS_OP_N, // No transpose for A and B | ||
M, | ||
N, | ||
K, // Dimensions: C = A * B | ||
&alpha, | ||
std::data(bufDevA), | ||
M, // Leading dimension of A | ||
std::data(bufDevB), | ||
K, // Leading dimension of B | ||
&beta, | ||
std::data(bufDevC), | ||
M // Leading dimension of C | ||
); | ||
|
||
alpaka::wait(queue); // Wait for multiplication to complete | ||
std::cout << "Matrix multiplication completed." << std::endl; | ||
|
||
// Copy result back to host | ||
alpaka::memcpy(queue, bufHostC, bufDevC); | ||
alpaka::wait(queue); | ||
std::cout << "Copied result matrix C back to the host." << std::endl; | ||
|
||
// Print result matrix C | ||
std::cout << "Matrix C (Host):" << std::endl; | ||
for(Idx i = 0; i < M; ++i) | ||
{ | ||
for(Idx j = 0; j < N; ++j) | ||
{ | ||
std::cout << hostC[i + j * M] << " "; | ||
} | ||
std::cout << std::endl; | ||
} | ||
|
||
// Verify the result | ||
bool success = true; | ||
DataType expectedValue = 2 * K; // Expected value for all elements in C | ||
for(Idx i = 0; i < M; ++i) | ||
{ | ||
for(Idx j = 0; j < N; ++j) | ||
{ | ||
if(std::fabs(hostC[i + j * M] - expectedValue) > 1e-5f) | ||
{ // Allow small floating-point errors | ||
std::cout << "Mismatch at (" << i << ", " << j << "): " << hostC[i + j * M] << " != " << expectedValue | ||
<< std::endl; | ||
success = false; | ||
} | ||
} | ||
} | ||
|
||
std::cout << "Multiplication of matrices of size " << M << "x" << K << " and " << K << "x" << N | ||
<< (success ? " succeeded!" : " failed!") << std::endl; | ||
|
||
if(!success) | ||
{ | ||
return EXIT_FAILURE; | ||
} | ||
|
||
// Cleanup cuBLAS | ||
cublasDestroy(cublasHandle); | ||
return EXIT_SUCCESS; | ||
} | ||
|
||
auto main() -> int | ||
{ | ||
std::cout << "Check enabled accelerator tags:" << std::endl; | ||
alpaka::printTagNames<alpaka::EnabledAccTags>(); | ||
// Execute the example once for each enabled accelerator. | ||
// If you would like to execute it for a single accelerator only you can use the following code. | ||
// \code{.cpp} | ||
// auto tag = TagCpuSerial; | ||
// return example(tag); | ||
// \endcode | ||
// | ||
// valid tags: | ||
// TagCpuSerial, TagGpuHipRt, TagGpuCudaRt, TagCpuOmp2Blocks, TagCpuTbbBlocks, | ||
// TagCpuOmp2Threads, TagCpuSycl, TagCpuTbbBlocks, TagCpuThreads, | ||
// TagFpgaSyclIntel, TagGenericSycl, TagGpuSyclIntel | ||
return alpaka::executeForEachAccTag([=](auto const& tag) { return example(tag); }); | ||
} |