use 1D arrays as matrices

alpaka-group · Nov 22, 2024 · ae8e4ed · ae8e4ed
1 parent 6b29767
commit ae8e4ed
Show file tree

Hide file tree

Showing 2 changed files with 81 additions and 85 deletions.
diff --git a/example/useBLASInAlpaka/CMakeLists.txt b/example/useBLASInAlpaka/CMakeLists.txt
@@ -2,7 +2,6 @@
 # Copyright 2023 Benjamin Worpitz, Jan Stephan
 # SPDX-License-Identifier: ISC
 #
-
 ################################################################################
 # Required CMake version.
 
@@ -17,8 +16,6 @@ set(_TARGET_NAME useBLASInAlpaka)
 
 project(${_TARGET_NAME} LANGUAGES CXX)
 
-
-
 # Add cuBLAS library
 find_package(CUDA REQUIRED)
 set(CUDA_LIBRARIES ${CUDA_LIBRARIES} cublas)
@@ -47,7 +44,6 @@ alpaka_add_executable(
 target_link_libraries(
     ${_TARGET_NAME}
     PUBLIC alpaka::alpaka ${CUDA_LIBRARIES})
-
 set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER example)
 set_target_properties(${_TARGET_NAME}  PROPERTIES CUDA_STANDARD 14)
 add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME})
diff --git a/example/useBLASInAlpaka/src/useBLASInAlpaka.cpp b/example/useBLASInAlpaka/src/useBLASInAlpaka.cpp
@@ -1,37 +1,39 @@
 #include <alpaka/alpaka.hpp>
+
 #include <cublas_v2.h>
-#include <iostream>
+
 #include <cmath>
+#include <iostream>
 
 // Index type
 using Idx = std::size_t;
 // Set data type
 using DataType = float;
 
-// Initialize the matrix in column-major order
-template<typename TMdSpan>
-inline void initializeMatrix(TMdSpan& span, int value) {
-    auto const numRows = span.extent(0);
-    auto const numCols = span.extent(1);
-    for (Idx j = 0; j < numCols; ++j) {
-        for (Idx i = 0; i < numRows; ++i) {
-            span(i, j) = static_cast<DataType>(value);
+// Fill all rows * cols elements of a column-major matrix held in a flat 1D buffer with `value` (converted to DataType); the indexing below requires `buffer` to hold at least rows * cols elements.
+void initializeMatrix(DataType* buffer, Idx rows, Idx cols, int value)
+{
+    for(Idx j = 0; j < cols; ++j) // outer loop: column index j
+    {
+        for(Idx i = 0; i < rows; ++i) // inner loop: row index i, so writes within a column are contiguous
+        {
+            buffer[i + j * rows] = static_cast<DataType>(value); // column-major: element (i, j) is at offset i + j * rows
         }
     }
 }
 
-int main() {
-    using Dim = alpaka::DimInt<2>;
+int main()
+{
+    using Dim1D = alpaka::DimInt<1>;
 
-           // Define matrix dimensions, A is MxK and B is KxN
-    Idx const M = 3; // Rows in A and C
+    // Define matrix dimensions, A is MxK and B is KxN
+    Idx const M = 4; // Rows in A and C
     Idx const N = 2; // Columns in B and C
-    Idx const K = 1; // Columns in A and rows in B
+    Idx const K = 3; // Columns in A and rows in B
 
-           // Define device and queue
-    using Acc = alpaka::AccGpuCudaRt<Dim, Idx>;
+    // Define device and queue
+    using Acc = alpaka::AccGpuCudaRt<Dim1D, Idx>;
     using Queue = alpaka::Queue<Acc, alpaka::Blocking>;
-    using Vec = alpaka::Vec<Dim, Idx>;
 
     auto const platformHost = alpaka::PlatformCpu{};
     auto const devHost = alpaka::getDevByIdx(platformHost, 0);
@@ -40,127 +42,125 @@ int main() {
 
     Queue queue(devAcc);
 
-           // Define the 2D extents (dimensions)
-    Vec const extentA(static_cast<Idx>(M), static_cast<Idx>(K));
-    Vec const extentB(static_cast<Idx>(K), static_cast<Idx>(N));
-    Vec const extentC(static_cast<Idx>(M), static_cast<Idx>(N));
-
-           // Allocate host memory
-    auto bufHostA = alpaka::allocBuf<DataType, Idx>(devHost, extentA);
-    auto bufHostB = alpaka::allocBuf<DataType, Idx>(devHost, extentB);
-    auto bufHostC = alpaka::allocBuf<DataType, Idx>(devHost, extentC);
+    // Allocate 1D host memory
+    auto bufHostA = alpaka::allocBuf<DataType, Idx>(devHost, M * K);
+    auto bufHostB = alpaka::allocBuf<DataType, Idx>(devHost, K * N);
+    auto bufHostC = alpaka::allocBuf<DataType, Idx>(devHost, M * N);
 
-           // Create mdspan views for host buffers
-    auto mdHostA = alpaka::experimental::getMdSpan(bufHostA);
-    auto mdHostB = alpaka::experimental::getMdSpan(bufHostB);
-    auto mdHostC = alpaka::experimental::getMdSpan(bufHostC);
+    DataType* hostA = alpaka::getPtrNative(bufHostA);
+    DataType* hostB = alpaka::getPtrNative(bufHostB);
+    DataType* hostC = alpaka::getPtrNative(bufHostC);
 
-           // Initialize host matrices
-    initializeMatrix(mdHostA, 1); // All elements in A are 1
-    initializeMatrix(mdHostB, 2); // All elements in B are 2
+    // Initialize host matrices
+    initializeMatrix(hostA, M, K, 1); // All elements in A are 1
+    initializeMatrix(hostB, K, N, 2); // All elements in B are 2
+    std::fill(hostC, hostC + (M * N), 0); // Initialize C with 0s
 
-           // Print initialized matrices on the host
+    // Print initialized matrices
     std::cout << "Matrix A (Host):" << std::endl;
-    for (Idx i = 0; i < M; ++i) {
-        for (Idx j = 0; j < K; ++j) {
-            std::cout << mdHostA(i, j) << "";
+    for(Idx i = 0; i < M; ++i)
+    {
+        for(Idx j = 0; j < K; ++j)
+        {
+            std::cout << hostA[i + j * M] << " ";
         }
         std::cout << std::endl;
     }
 
     std::cout << "Matrix B (Host):" << std::endl;
-    for (Idx i = 0; i < K; ++i) {
-        for (Idx j = 0; j < N; ++j) {
-            std::cout << mdHostB(i, j) << "";
+    for(Idx i = 0; i < K; ++i)
+    {
+        for(Idx j = 0; j < N; ++j)
+        {
+            std::cout << hostB[i + j * K] << " ";
         }
         std::cout << std::endl;
     }
 
-           // Allocate device memory
-    auto bufDevA = alpaka::allocBuf<DataType, Idx>(devAcc, extentA);
-    auto bufDevB = alpaka::allocBuf<DataType, Idx>(devAcc, extentB);
-    auto bufDevC = alpaka::allocBuf<DataType, Idx>(devAcc, extentC);
+    // Allocate 1D device memory
+    auto bufDevA = alpaka::allocBuf<DataType, Idx>(devAcc, M * K);
+    auto bufDevB = alpaka::allocBuf<DataType, Idx>(devAcc, K * N);
+    auto bufDevC = alpaka::allocBuf<DataType, Idx>(devAcc, M * N);
 
-           // Copy data to device
+    // Copy data to device
     alpaka::memcpy(queue, bufDevA, bufHostA);
     alpaka::memcpy(queue, bufDevB, bufHostB);
+    alpaka::memcpy(queue, bufDevC, bufHostC); // Initialize device C with zeros
     alpaka::wait(queue);
 
-
-
-
     std::cout << "Copied matrices A and B to the device." << std::endl;
 
-           // Get the native CUDA stream from Alpaka queue
+    // Get the native CUDA stream from Alpaka queue
     auto alpakaStream = alpaka::getNativeHandle(queue);
 
-           // cuBLAS setup
+    // cuBLAS setup
     cublasHandle_t cublasHandle;
     cublasCreate(&cublasHandle);
     cublasSetStream(cublasHandle, alpakaStream);
-    auto pitchA = alpaka::getPitchesInBytes(bufDevA);
-    auto pitchB = alpaka::getPitchesInBytes(bufDevB);
-    auto pitchC = alpaka::getPitchesInBytes(bufDevC);
-
-    std::cout << "pitchA" << pitchA  << std::endl;
-    std::cout << "pitchB" << pitchB  << std::endl;
-    std::cout << "pitchC" << pitchC  << std::endl;
 
-           // Perform matrix multiplication: C = A * B
     // Perform matrix multiplication: C = A * B
     float alpha = 1.0f, beta = 0.0f; // Set beta to 0.0f to overwrite C
     cublasSgemm(
         cublasHandle,
-        CUBLAS_OP_N, CUBLAS_OP_N, // No transpose
-        M, N, K,                 // Dimensions
+        CUBLAS_OP_N,
+        CUBLAS_OP_N, // No transpose for A and B
+        M,
+        N,
+        K, // Dimensions: C = A * B
         &alpha,
-        alpaka::getPtrNative(bufDevA), M, // Leading dimension (rows of A)
-        alpaka::getPtrNative(bufDevB), K, // Leading dimension (rows of B)
+        alpaka::getPtrNative(bufDevA),
+        M, // Leading dimension of A
+        alpaka::getPtrNative(bufDevB),
+        K, // Leading dimension of B
         &beta,
-        alpaka::getPtrNative(bufDevC), M // Leading dimension (rows of C)
-        );
-
-
-
+        alpaka::getPtrNative(bufDevC),
+        M // Leading dimension of C
+    );
 
     alpaka::wait(queue); // Wait for multiplication to complete
     std::cout << "Matrix multiplication completed." << std::endl;
 
-           // Copy result back to host
+    // Copy result back to host
     alpaka::memcpy(queue, bufHostC, bufDevC);
     alpaka::wait(queue);
     std::cout << "Copied result matrix C back to the host." << std::endl;
 
-           // Print result matrix C
+    // Print result matrix C
     std::cout << "Matrix C (Host):" << std::endl;
-    for (Idx i = 0; i < M; ++i) {
-        for (Idx j = 0; j < N; ++j) {
-            std::cout << mdHostC(i, j) << " ";
+    for(Idx i = 0; i < M; ++i)
+    {
+        for(Idx j = 0; j < N; ++j)
+        {
+            std::cout << hostC[i + j * M] << " ";
         }
         std::cout << std::endl;
     }
 
-           // Verify the result
+    // Verify the result
     bool success = true;
     DataType expectedValue = 2 * K; // Expected value for all elements in C
-    for (Idx i = 0; i < M; ++i) {
-        for (Idx j = 0; j < N; ++j) {
-            if (std::fabs(mdHostC(i, j) - expectedValue) > 1e-5f) { // Allow small floating-point errors
-                std::cout << "Mismatch at (" << i << ", " << j << "): "
-                          << mdHostC(i, j) << " != " << expectedValue << std::endl;
+    for(Idx i = 0; i < M; ++i)
+    {
+        for(Idx j = 0; j < N; ++j)
+        {
+            if(std::fabs(hostC[i + j * M] - expectedValue) > 1e-5f)
+            { // Allow small floating-point errors
+                std::cout << "Mismatch at (" << i << ", " << j << "): " << hostC[i + j * M] << " != " << expectedValue
+                          << std::endl;
                 success = false;
             }
         }
     }
 
     std::cout << "Multiplication of matrices of size " << M << "x" << K << " and " << K << "x" << N
-              << " using mdspan " << (success ? "succeeded" : "failed") << "!" << std::endl;
+              << (success ? " succeeded!" : " failed!") << std::endl;
 
-    if (!success) {
+    if(!success)
+    {
         return EXIT_FAILURE;
     }
 
-           // Cleanup cuBLAS
+    // Cleanup cuBLAS
     cublasDestroy(cublasHandle);
 
     return EXIT_SUCCESS;