diff --git a/example/useBLASInAlpaka/CMakeLists.txt b/example/useBLASInAlpaka/CMakeLists.txt index ce291265839..9545b6cf2dd 100644 --- a/example/useBLASInAlpaka/CMakeLists.txt +++ b/example/useBLASInAlpaka/CMakeLists.txt @@ -2,7 +2,6 @@ # Copyright 2023 Benjamin Worpitz, Jan Stephan # SPDX-License-Identifier: ISC # - ################################################################################ # Required CMake version. @@ -17,8 +16,6 @@ set(_TARGET_NAME useBLASInAlpaka) project(${_TARGET_NAME} LANGUAGES CXX) - - # Add cuBLAS library find_package(CUDA REQUIRED) set(CUDA_LIBRARIES ${CUDA_LIBRARIES} cublas) @@ -47,7 +44,6 @@ alpaka_add_executable( target_link_libraries( ${_TARGET_NAME} PUBLIC alpaka::alpaka ${CUDA_LIBRARIES}) - set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER example) set_target_properties(${_TARGET_NAME} PROPERTIES CUDA_STANDARD 14) add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME}) diff --git a/example/useBLASInAlpaka/src/useBLASInAlpaka.cpp b/example/useBLASInAlpaka/src/useBLASInAlpaka.cpp index c279ad28948..890e5ae827e 100644 --- a/example/useBLASInAlpaka/src/useBLASInAlpaka.cpp +++ b/example/useBLASInAlpaka/src/useBLASInAlpaka.cpp @@ -1,37 +1,39 @@ #include + #include -#include + #include +#include // Index type using Idx = std::size_t; // Set data type using DataType = float; -// Initialize the matrix in column-major order -template -inline void initializeMatrix(TMdSpan& span, int value) { - auto const numRows = span.extent(0); - auto const numCols = span.extent(1); - for (Idx j = 0; j < numCols; ++j) { - for (Idx i = 0; i < numRows; ++i) { - span(i, j) = static_cast(value); +// Initialize the matrix in column-major order (1D buffer) +void initializeMatrix(DataType* buffer, Idx rows, Idx cols, int value) +{ + for(Idx j = 0; j < cols; ++j) + { + for(Idx i = 0; i < rows; ++i) + { + buffer[i + j * rows] = static_cast(value); } } } -int main() { - using Dim = alpaka::DimInt<2>; +int main() +{ + using Dim1D = alpaka::DimInt<1>; - // Define matrix dimensions, A is MxK and B is KxN - Idx const M = 3; // Rows in A and C + // Define matrix dimensions, A is MxK and B is KxN + Idx const M = 4; // Rows in A and C Idx const N = 2; // Columns in B and C - Idx const K = 1; // Columns in A and rows in B + Idx const K = 3; // Columns in A and rows in B - // Define device and queue - using Acc = alpaka::AccGpuCudaRt; + // Define device and queue + using Acc = alpaka::AccGpuCudaRt; using Queue = alpaka::Queue; - using Vec = alpaka::Vec; auto const platformHost = alpaka::PlatformCpu{}; auto const devHost = alpaka::getDevByIdx(platformHost, 0); @@ -40,127 +42,125 @@ int main() { Queue queue(devAcc); - // Define the 2D extents (dimensions) - Vec const extentA(static_cast(M), static_cast(K)); - Vec const extentB(static_cast(K), static_cast(N)); - Vec const extentC(static_cast(M), static_cast(N)); - - // Allocate host memory - auto bufHostA = alpaka::allocBuf(devHost, extentA); - auto bufHostB = alpaka::allocBuf(devHost, extentB); - auto bufHostC = alpaka::allocBuf(devHost, extentC); + // Allocate 1D host memory + auto bufHostA = alpaka::allocBuf(devHost, M * K); + auto bufHostB = alpaka::allocBuf(devHost, K * N); + auto bufHostC = alpaka::allocBuf(devHost, M * N); - // Create mdspan views for host buffers - auto mdHostA = alpaka::experimental::getMdSpan(bufHostA); - auto mdHostB = alpaka::experimental::getMdSpan(bufHostB); - auto mdHostC = alpaka::experimental::getMdSpan(bufHostC); + DataType* hostA = alpaka::getPtrNative(bufHostA); + DataType* hostB = alpaka::getPtrNative(bufHostB); + DataType* hostC = alpaka::getPtrNative(bufHostC); - // Initialize host matrices - initializeMatrix(mdHostA, 1); // All elements in A are 1 - initializeMatrix(mdHostB, 2); // All elements in B are 2 + // Initialize host matrices + initializeMatrix(hostA, M, K, 1); // All elements in A are 1 + initializeMatrix(hostB, K, N, 2); // All elements in B are 2 + std::fill(hostC, hostC + (M * N), 0); // Initialize C with 0s - // Print initialized matrices on the host + // Print initialized matrices std::cout << "Matrix A (Host):" << std::endl; - for (Idx i = 0; i < M; ++i) { - for (Idx j = 0; j < K; ++j) { - std::cout << mdHostA(i, j) << ""; + for(Idx i = 0; i < M; ++i) + { + for(Idx j = 0; j < K; ++j) + { + std::cout << hostA[i + j * M] << " "; } std::cout << std::endl; } std::cout << "Matrix B (Host):" << std::endl; - for (Idx i = 0; i < K; ++i) { - for (Idx j = 0; j < N; ++j) { - std::cout << mdHostB(i, j) << ""; + for(Idx i = 0; i < K; ++i) + { + for(Idx j = 0; j < N; ++j) + { + std::cout << hostB[i + j * K] << " "; } std::cout << std::endl; } - // Allocate device memory - auto bufDevA = alpaka::allocBuf(devAcc, extentA); - auto bufDevB = alpaka::allocBuf(devAcc, extentB); - auto bufDevC = alpaka::allocBuf(devAcc, extentC); + // Allocate 1D device memory + auto bufDevA = alpaka::allocBuf(devAcc, M * K); + auto bufDevB = alpaka::allocBuf(devAcc, K * N); + auto bufDevC = alpaka::allocBuf(devAcc, M * N); - // Copy data to device + // Copy data to device alpaka::memcpy(queue, bufDevA, bufHostA); alpaka::memcpy(queue, bufDevB, bufHostB); + alpaka::memcpy(queue, bufDevC, bufHostC); // Initialize device C with zeros alpaka::wait(queue); - - - std::cout << "Copied matrices A and B to the device." << std::endl; - // Get the native CUDA stream from Alpaka queue + // Get the native CUDA stream from Alpaka queue auto alpakaStream = alpaka::getNativeHandle(queue); - // cuBLAS setup + // cuBLAS setup cublasHandle_t cublasHandle; cublasCreate(&cublasHandle); cublasSetStream(cublasHandle, alpakaStream); - auto pitchA = alpaka::getPitchesInBytes(bufDevA); - auto pitchB = alpaka::getPitchesInBytes(bufDevB); - auto pitchC = alpaka::getPitchesInBytes(bufDevC); - - std::cout << "pitchA" << pitchA << std::endl; - std::cout << "pitchB" << pitchB << std::endl; - std::cout << "pitchC" << pitchC << std::endl; - // Perform matrix multiplication: C = A * B // Perform matrix multiplication: C = A * B float alpha = 1.0f, beta = 0.0f; // Set beta to 0.0f to overwrite C cublasSgemm( cublasHandle, - CUBLAS_OP_N, CUBLAS_OP_N, // No transpose - M, N, K, // Dimensions + CUBLAS_OP_N, + CUBLAS_OP_N, // No transpose for A and B + M, + N, + K, // Dimensions: C = A * B &alpha, - alpaka::getPtrNative(bufDevA), M, // Leading dimension (rows of A) - alpaka::getPtrNative(bufDevB), K, // Leading dimension (rows of B) + alpaka::getPtrNative(bufDevA), + M, // Leading dimension of A + alpaka::getPtrNative(bufDevB), + K, // Leading dimension of B &beta, - alpaka::getPtrNative(bufDevC), M // Leading dimension (rows of C) - ); - - - + alpaka::getPtrNative(bufDevC), + M // Leading dimension of C + ); alpaka::wait(queue); // Wait for multiplication to complete std::cout << "Matrix multiplication completed." << std::endl; - // Copy result back to host + // Copy result back to host alpaka::memcpy(queue, bufHostC, bufDevC); alpaka::wait(queue); std::cout << "Copied result matrix C back to the host." << std::endl; - // Print result matrix C + // Print result matrix C std::cout << "Matrix C (Host):" << std::endl; - for (Idx i = 0; i < M; ++i) { - for (Idx j = 0; j < N; ++j) { - std::cout << mdHostC(i, j) << " "; + for(Idx i = 0; i < M; ++i) + { + for(Idx j = 0; j < N; ++j) + { + std::cout << hostC[i + j * M] << " "; } std::cout << std::endl; } - // Verify the result + // Verify the result bool success = true; DataType expectedValue = 2 * K; // Expected value for all elements in C - for (Idx i = 0; i < M; ++i) { - for (Idx j = 0; j < N; ++j) { - if (std::fabs(mdHostC(i, j) - expectedValue) > 1e-5f) { // Allow small floating-point errors - std::cout << "Mismatch at (" << i << ", " << j << "): " - << mdHostC(i, j) << " != " << expectedValue << std::endl; + for(Idx i = 0; i < M; ++i) + { + for(Idx j = 0; j < N; ++j) + { + if(std::fabs(hostC[i + j * M] - expectedValue) > 1e-5f) + { // Allow small floating-point errors + std::cout << "Mismatch at (" << i << ", " << j << "): " << hostC[i + j * M] << " != " << expectedValue + << std::endl; success = false; } } } std::cout << "Multiplication of matrices of size " << M << "x" << K << " and " << K << "x" << N - << " using mdspan " << (success ? "succeeded" : "failed") << "!" << std::endl; + << (success ? " succeeded!" : " failed!") << std::endl; - if (!success) { + if(!success) + { return EXIT_FAILURE; } - // Cleanup cuBLAS + // Cleanup cuBLAS cublasDestroy(cublasHandle); return EXIT_SUCCESS;