From 710b2ec122752914c6fd9a5b517dc5cf8e6f95d5 Mon Sep 17 00:00:00 2001
From: mehmet yusufoglu
Date: Thu, 21 Nov 2024 18:21:46 +0100
Subject: [PATCH] Run cuBLAS functions from alpaka

---
 example/CMakeLists.txt                      |   2 +
 example/useBLASInAlpaka/CMakeLists.txt      |  58 +++++
 .../useBLASInAlpaka/src/useBLASInAlpaka.cpp | 198 ++++++++++++++++++
 3 files changed, 258 insertions(+)
 create mode 100644 example/useBLASInAlpaka/CMakeLists.txt
 create mode 100644 example/useBLASInAlpaka/src/useBLASInAlpaka.cpp

diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt
index 541bb7201cd..2c8eae6f892 100644
--- a/example/CMakeLists.txt
+++ b/example/CMakeLists.txt
@@ -36,3 +36,5 @@ add_subdirectory("randomCells2D/")
 add_subdirectory("reduce/")
 add_subdirectory("tagSpecialization/")
 add_subdirectory("vectorAdd/")
+add_subdirectory("useBLASInAlpaka/")
+
diff --git a/example/useBLASInAlpaka/CMakeLists.txt b/example/useBLASInAlpaka/CMakeLists.txt
new file mode 100644
index 00000000000..3793d72db2e
--- /dev/null
+++ b/example/useBLASInAlpaka/CMakeLists.txt
@@ -0,0 +1,58 @@
+#
+# Copyright 2023 Benjamin Worpitz, Jan Stephan
+# SPDX-License-Identifier: ISC
+#
+
+################################################################################
+# Required CMake version.
+
+cmake_minimum_required(VERSION 3.25)
+
+set_property(GLOBAL PROPERTY USE_FOLDERS ON)
+
+################################################################################
+# Project.
+
+set(_TARGET_NAME useBLASInAlpaka)
+
+project(${_TARGET_NAME} LANGUAGES CXX)
+
+# Check if the alpaka accelerator is CUDA-only
+if(NOT alpaka_ACC_GPU_CUDA_ONLY_MODE)
+    # Print a warning and skip target creation
+    message(WARNING "Skipping build of 'useBLASInAlpaka' because alpaka_ACC_GPU_CUDA_ONLY_MODE is not enabled.")
+    return()
+endif()
+
+# Add the cuBLAS library
+find_package(CUDA REQUIRED)
+set(CUDA_LIBRARIES ${CUDA_LIBRARIES} cublas)
+
+#-------------------------------------------------------------------------------
+# Find alpaka.
+
+if(NOT TARGET alpaka::alpaka)
+    option(alpaka_USE_SOURCE_TREE "Use alpaka's source tree instead of an alpaka installation" OFF)
+
+    if(alpaka_USE_SOURCE_TREE)
+        # Don't build the examples recursively
+        set(alpaka_BUILD_EXAMPLES OFF)
+        add_subdirectory("${CMAKE_CURRENT_LIST_DIR}/../.." "${CMAKE_BINARY_DIR}/alpaka")
+    else()
+        find_package(alpaka REQUIRED)
+    endif()
+endif()
+
+#-------------------------------------------------------------------------------
+# Add executable.
+
+alpaka_add_executable(
+    ${_TARGET_NAME}
+    src/useBLASInAlpaka.cpp)
+target_link_libraries(
+    ${_TARGET_NAME}
+    PUBLIC alpaka::alpaka ${CUDA_LIBRARIES})
+
+set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER example)
+set_target_properties(${_TARGET_NAME} PROPERTIES CUDA_STANDARD 14)
+add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME})
diff --git a/example/useBLASInAlpaka/src/useBLASInAlpaka.cpp b/example/useBLASInAlpaka/src/useBLASInAlpaka.cpp
new file mode 100644
index 00000000000..14b93d5d127
--- /dev/null
+++ b/example/useBLASInAlpaka/src/useBLASInAlpaka.cpp
@@ -0,0 +1,198 @@
+/* Copyright 2023 Mehmet Yusufoglu, Rene Widera
+ * SPDX-License-Identifier: ISC
+ */
+/*
+ * This example calls cuBLAS library functions from alpaka: the cuBLAS call works on alpaka buffers and runs on the
+ * native CUDA stream of an alpaka queue. Since the code needs only the AccGpuCuda backend, make sure the
+ * corresponding alpaka CMake backend flag (CUDA-only mode) is set.
+ */ +#include +#include + +#include + +#include +#include + +// Index type +using Idx = std::size_t; +// Set data type +using DataType = float; + +// Initialize the matrix in column-major order (1D buffer) +void initializeMatrix(DataType* buffer, Idx rows, Idx cols, int value) +{ + for(Idx j = 0; j < cols; ++j) + { + for(Idx i = 0; i < rows; ++i) + { + buffer[i + j * rows] = static_cast(value); + } + } +} + +// In standard projects, you typically do not execute the code with any available accelerator. +// Instead, a single accelerator is selected once from the active accelerators and the kernels are executed with the +// selected accelerator only. If you use the example as the starting point for your project, you can rename the +// example() function to main() and move the accelerator tag to the function body. +template +auto example(TAccTag const&) -> int +{ + using Dim1D = alpaka::DimInt<1>; + + // Define matrix dimensions, A is MxK and B is KxN + Idx const M = 4; // Rows in A and C + Idx const N = 2; // Columns in B and C + Idx const K = 3; // Columns in A and rows in B + + // Define the accelerator and queue + // Use Cuda Accelerator. Cmake Acc flags should be set to Cuda-Only + using Acc = alpaka::TagToAcc; + using Queue = alpaka::Queue; + + auto const platformHost = alpaka::PlatformCpu{}; + auto const devHost = alpaka::getDevByIdx(platformHost, 0); + auto const platformAcc = alpaka::Platform{}; + auto const devAcc = alpaka::getDevByIdx(platformAcc, 0); + + Queue queue(devAcc); + + // Allocate 1D host memory + auto bufHostA = alpaka::allocBuf(devHost, M * K); + auto bufHostB = alpaka::allocBuf(devHost, K * N); + auto bufHostC = alpaka::allocBuf(devHost, M * N); + + DataType* hostA = alpaka::getPtrNative(bufHostA); + DataType* hostB = alpaka::getPtrNative(bufHostB); + DataType* hostC = alpaka::getPtrNative(bufHostC); + + // Initialize host matrices + initializeMatrix(hostA, M, K, 1); // All elements in A are 1 + initializeMatrix(hostB, K, N, 2); // All elements in B are 2 + std::fill(hostC, hostC + (M * N), 0); // Initialize C with 0s + + // Print initialized matrices + std::cout << "Matrix A (Host):" << std::endl; + for(Idx i = 0; i < M; ++i) + { + for(Idx j = 0; j < K; ++j) + { + std::cout << hostA[i + j * M] << " "; + } + std::cout << std::endl; + } + + std::cout << "Matrix B (Host):" << std::endl; + for(Idx i = 0; i < K; ++i) + { + for(Idx j = 0; j < N; ++j) + { + std::cout << hostB[i + j * K] << " "; + } + std::cout << std::endl; + } + + // Allocate 1D device memory + auto bufDevA = alpaka::allocBuf(devAcc, M * K); + auto bufDevB = alpaka::allocBuf(devAcc, K * N); + auto bufDevC = alpaka::allocBuf(devAcc, M * N); + + // Copy data to device + alpaka::memcpy(queue, bufDevA, bufHostA); + alpaka::memcpy(queue, bufDevB, bufHostB); + alpaka::memcpy(queue, bufDevC, bufHostC); + alpaka::wait(queue); + + std::cout << "Copied matrices A and B to the device." 
+    std::cout << "Copied matrices A and B to the device." << std::endl;
+
+    // Get the native CUDA stream from the alpaka queue
+    auto alpakaStream = alpaka::getNativeHandle(queue);
+
+    // cuBLAS setup
+    cublasHandle_t cublasHandle;
+    cublasCreate(&cublasHandle);
+    cublasSetStream(cublasHandle, alpakaStream);
+
+    // Perform the matrix multiplication: C = A * B
+    float alpha = 1.0f, beta = 0.0f; // Set beta to 0.0f to overwrite C
+    cublasSgemm(
+        cublasHandle,
+        CUBLAS_OP_N,
+        CUBLAS_OP_N, // No transpose for A and B
+        M,
+        N,
+        K, // Dimensions: C = A * B
+        &alpha,
+        std::data(bufDevA),
+        M, // Leading dimension of A
+        std::data(bufDevB),
+        K, // Leading dimension of B
+        &beta,
+        std::data(bufDevC),
+        M // Leading dimension of C
+    );
+
+    alpaka::wait(queue); // Wait for the multiplication to complete
+    std::cout << "Matrix multiplication completed." << std::endl;
+
+    // Copy the result back to the host
+    alpaka::memcpy(queue, bufHostC, bufDevC);
+    alpaka::wait(queue);
+    std::cout << "Copied result matrix C back to the host." << std::endl;
+
+    // Print result matrix C
+    std::cout << "Matrix C (Host):" << std::endl;
+    for(Idx i = 0; i < M; ++i)
+    {
+        for(Idx j = 0; j < N; ++j)
+        {
+            std::cout << hostC[i + j * M] << " ";
+        }
+        std::cout << std::endl;
+    }
+
+    // Verify the result
+    bool success = true;
+    DataType expectedValue = 2 * K; // Expected value for all elements in C
+    for(Idx i = 0; i < M; ++i)
+    {
+        for(Idx j = 0; j < N; ++j)
+        {
+            if(std::fabs(hostC[i + j * M] - expectedValue) > 1e-5f)
+            { // Allow small floating-point errors
+                std::cout << "Mismatch at (" << i << ", " << j << "): " << hostC[i + j * M]
+                          << " != " << expectedValue << std::endl;
+                success = false;
+            }
+        }
+    }
+
+    std::cout << "Multiplication of matrices of size " << M << "x" << K << " and " << K << "x" << N
+              << (success ? " succeeded!" : " failed!") << std::endl;
+
+    // Cleanup cuBLAS (also on the failure path, before returning)
+    cublasDestroy(cublasHandle);
+
+    if(!success)
+    {
+        return EXIT_FAILURE;
+    }
+
+    return EXIT_SUCCESS;
+}
+
+auto main() -> int
+{
+    std::cout << "Check enabled accelerator tags:" << std::endl;
+    alpaka::printTagNames();
+    // Execute the example once for each enabled accelerator.
+    // If you would like to execute it for a single accelerator only you can use the following code.
+    //  \code{.cpp}
+    //  auto tag = TagCpuSerial;
+    //  return example(tag);
+    //  \endcode
+    //
+    // valid tags:
+    //  TagCpuSerial, TagGpuHipRt, TagGpuCudaRt, TagCpuOmp2Blocks, TagCpuTbbBlocks,
+    //  TagCpuOmp2Threads, TagCpuSycl, TagCpuThreads,
+    //  TagFpgaSyclIntel, TagGenericSycl, TagGpuSyclIntel
+    return alpaka::executeForEachAccTag([=](auto const& tag) { return example(tag); });
+}
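
Note (outside the patch): the cuBLAS calls above (cublasCreate, cublasSetStream, cublasSgemm, cublasDestroy) discard their cublasStatus_t return values. A minimal sketch of how those status codes could be checked is given below; checkCublas is a hypothetical helper introduced only for illustration, while the cuBLAS types and constants are the standard ones from cublas_v2.h.

#include <cublas_v2.h>

#include <cstdio>
#include <cstdlib>

// Abort with a message if a cuBLAS call did not return CUBLAS_STATUS_SUCCESS.
void checkCublas(cublasStatus_t status, char const* what)
{
    if(status != CUBLAS_STATUS_SUCCESS)
    {
        std::fprintf(stderr, "cuBLAS call '%s' failed with status %d\n", what, static_cast<int>(status));
        std::exit(EXIT_FAILURE);
    }
}

// Possible usage with the calls from the example:
//   checkCublas(cublasCreate(&cublasHandle), "cublasCreate");
//   checkCublas(cublasSetStream(cublasHandle, alpakaStream), "cublasSetStream");
//   checkCublas(cublasSgemm(/* arguments as in the example */), "cublasSgemm");
//   checkCublas(cublasDestroy(cublasHandle), "cublasDestroy");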