From af6182900c4b8724ea22d6b67f93d63570673d31 Mon Sep 17 00:00:00 2001 From: TeachRaccooon Date: Fri, 2 Aug 2024 16:05:42 -0400 Subject: [PATCH] Update --- CMakeLists.txt | 1 + RandLAPACK/CMakeLists.txt | 10 +++++----- test/CMakeLists.txt | 2 +- test/drivers/test_cqrrp_gpu.cu | 14 ++++++++++---- 4 files changed, 17 insertions(+), 10 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 77fd3288..39bf2a95 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,6 +7,7 @@ set(CMAKE_CUDA_STANDARD 20) set(CMAKE_CUDA_STANDARD_REQUIRED ON) set(CMAKE_CUDA_SEPARABLE_COMPILATION ON) set(CMAKE_CUDA_VISIBILITY_PRESET hidden) +set(CMAKE_CUDA_ARCHITECTURES native) list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/CMake") list(APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}) diff --git a/RandLAPACK/CMakeLists.txt b/RandLAPACK/CMakeLists.txt index 3957616a..055e4a12 100644 --- a/RandLAPACK/CMakeLists.txt +++ b/RandLAPACK/CMakeLists.txt @@ -25,21 +25,23 @@ set(RandLAPACK_cxx_sources rl_cqrrp_gpu.hh ) -set(RandLAPACK_libs +add_library(RandLAPACK INTERFACE) + +target_link_libraries(RandLAPACK INTERFACE RandBLAS lapackpp blaspp Random123 ) + if (RandLAPACK_HAS_OpenMP) - list(APPEND RandLAPACK_libs OpenMP::OpenMP_CXX) + target_link_libraries(RandLAPACK INTERFACE OpenMP::OpenMP_CXX) endif() configure_file(${CMAKE_CURRENT_SOURCE_DIR}/rl_config.hh.in rl_config.hh) install(FILES ${CMAKE_CURRENT_BINARY_DIR}/rl_config.hh DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/RandLAPACK) -add_library(RandLAPACK INTERFACE) set(RandLAPACK_cxx_opts -Wall -Wextra) if (SANITIZE_ADDRESS) @@ -69,8 +71,6 @@ target_include_directories( $ ) -target_link_libraries(RandLAPACK INTERFACE ${RandLAPACK_libs}) - install( FILES "${CMAKE_CURRENT_SOURCE_DIR}/../RandLAPACK.hh" DESTINATION include diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 3d0a3e2b..b26309ba 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -30,7 +30,7 @@ if (GTest_FOUND) add_executable(RandLAPACK_tests_gpu ${RandLAPACK_test_cu_srcs}) add_executable(RandLAPACK_tests ${RandLAPACK_test_srcs}) - target_link_libraries(RandLAPACK_tests_gpu RandLAPACK GTest::GTest GTest::Main) + target_link_libraries(RandLAPACK_tests_gpu RandLAPACK GTest::GTest GTest::Main CUDA::cusolver) set_property(TARGET RandLAPACK_tests_gpu PROPERTY CUDA_ARCHITECTURES native) target_link_libraries(RandLAPACK_tests RandLAPACK GTest::GTest GTest::Main) diff --git a/test/drivers/test_cqrrp_gpu.cu b/test/drivers/test_cqrrp_gpu.cu index 067c2102..8fa529ea 100644 --- a/test/drivers/test_cqrrp_gpu.cu +++ b/test/drivers/test_cqrrp_gpu.cu @@ -10,6 +10,7 @@ #include #include #include +#include // Use cuda kernels. #ifndef USE_CUDA @@ -281,12 +282,16 @@ class TestCQRRP : public ::testing::Test blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, d, n, m, 1.0, S, d, all_data.A.data(), m, 0.0, all_data.A_sk, d); free(S); cudaMemcpy(all_data.A_sk_device, all_data.A_sk, d * n * sizeof(double), cudaMemcpyHostToDevice); - + RandLAPACK::CQRRP_blocked_GPU CQRRP_GPU(true, tol, block_size); + auto start = std::chrono::steady_clock::now(); CQRRP_GPU.call(m, n, all_data.A_device, m, all_data.A_sk_device, d, all_data.tau_device, all_data.J_device); - + auto stop = std::chrono::steady_clock::now(); + auto diff = std::chrono::duration_cast(stop - start).count(); auto rank = CQRRP_GPU.rank; printf("RANK AS RETURNED BY CQRRP GPU %4ld\n", rank); + printf(" BLOCK SIZE = %ld\n", block_size); + printf(" TIME (MS) = %ld\n", diff); cudaFree(all_data.A_sk_device); free(all_data.A_sk); @@ -377,7 +382,8 @@ TEST_F(TestCQRRP, CQRRP_GPU_benchmark_16k) { int64_t n = std::pow(2, 14); double d_factor = 1.25; int64_t b_sz_start = 32; - int64_t b_sz_end = 4096; + int64_t b_sz_end = 192; + int64_t b_sz_incr = 8; double tol = std::pow(std::numeric_limits::epsilon(), 0.85); auto state = RandBLAS::RNGState(); @@ -387,7 +393,7 @@ TEST_F(TestCQRRP, CQRRP_GPU_benchmark_16k) { cudaMemcpy(all_data.A_device, all_data.A.data(), m * n * sizeof(double), cudaMemcpyHostToDevice); #if !defined(__APPLE__) - for (;b_sz_start <= b_sz_end; b_sz_start *= 2) { + for (;b_sz_start <= b_sz_end; b_sz_start += b_sz_incr) { bench_CQRRP(d_factor, tol, b_sz_start, all_data, state); } #endif