
Commit af61829
Update
TeachRaccooon committed Aug 2, 2024
1 parent 39e2e3e commit af61829
Showing 4 changed files with 17 additions and 10 deletions.
CMakeLists.txt: 1 addition & 0 deletions
@@ -7,6 +7,7 @@ set(CMAKE_CUDA_STANDARD 20)
 set(CMAKE_CUDA_STANDARD_REQUIRED ON)
 set(CMAKE_CUDA_SEPARABLE_COMPILATION ON)
 set(CMAKE_CUDA_VISIBILITY_PRESET hidden)
+set(CMAKE_CUDA_ARCHITECTURES native)
 
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/CMake")
 list(APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR})
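The added set(CMAKE_CUDA_ARCHITECTURES native) line compiles CUDA code for whatever GPU CMake detects on the build machine; "native" requires CMake 3.24 or newer and a visible CUDA device at configure time. A minimal sketch, not part of this commit, of a variant that keeps native as a fallback while letting a user or CI pass an explicit architecture list:

    # Sketch only (assumption: callers may pass -DCMAKE_CUDA_ARCHITECTURES=...);
    # "native" needs CMake >= 3.24 and a CUDA-capable device on the build host.
    if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
        set(CMAKE_CUDA_ARCHITECTURES native)
    endif()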
RandLAPACK/CMakeLists.txt: 5 additions & 5 deletions
@@ -25,21 +25,23 @@ set(RandLAPACK_cxx_sources
     rl_cqrrp_gpu.hh
 )
 
-set(RandLAPACK_libs
+add_library(RandLAPACK INTERFACE)
+
+target_link_libraries(RandLAPACK INTERFACE
     RandBLAS
     lapackpp
     blaspp
     Random123
 )
 
 if (RandLAPACK_HAS_OpenMP)
-    list(APPEND RandLAPACK_libs OpenMP::OpenMP_CXX)
+    target_link_libraries(RandLAPACK INTERFACE OpenMP::OpenMP_CXX)
 endif()
 
 configure_file(${CMAKE_CURRENT_SOURCE_DIR}/rl_config.hh.in rl_config.hh)
 install(FILES ${CMAKE_CURRENT_BINARY_DIR}/rl_config.hh
         DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/RandLAPACK)
 
-add_library(RandLAPACK INTERFACE)
-
 set(RandLAPACK_cxx_opts -Wall -Wextra)
 if (SANITIZE_ADDRESS)
@@ -69,8 +71,6 @@ target_include_directories(
     $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/RandLAPACK/gpu_functions>
 )
 
-target_link_libraries(RandLAPACK INTERFACE ${RandLAPACK_libs})
-
 install(
     FILES "${CMAKE_CURRENT_SOURCE_DIR}/../RandLAPACK.hh"
     DESTINATION include
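These two hunks declare RandLAPACK as an INTERFACE (header-only) target up front and attach its dependencies directly with target_link_libraries, instead of accumulating them in the RandLAPACK_libs list variable and linking it later. INTERFACE link entries are usage requirements, so they propagate to anything that links RandLAPACK. A minimal consumer sketch under that assumption (hypothetical project; assumes the installed package is discoverable via find_package):

    cmake_minimum_required(VERSION 3.24)
    project(randlapack_demo CXX)

    # Hypothetical consumer: linking the INTERFACE target transitively
    # pulls in RandBLAS, lapackpp, blaspp, and Random123.
    find_package(RandLAPACK REQUIRED)
    add_executable(demo main.cc)
    target_link_libraries(demo PRIVATE RandLAPACK)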
test/CMakeLists.txt: 1 addition & 1 deletion
@@ -30,7 +30,7 @@ if (GTest_FOUND)
     add_executable(RandLAPACK_tests_gpu ${RandLAPACK_test_cu_srcs})
     add_executable(RandLAPACK_tests ${RandLAPACK_test_srcs})
 
-    target_link_libraries(RandLAPACK_tests_gpu RandLAPACK GTest::GTest GTest::Main)
+    target_link_libraries(RandLAPACK_tests_gpu RandLAPACK GTest::GTest GTest::Main CUDA::cusolver)
     set_property(TARGET RandLAPACK_tests_gpu PROPERTY CUDA_ARCHITECTURES native)
 
     target_link_libraries(RandLAPACK_tests RandLAPACK GTest::GTest GTest::Main)
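CUDA::cusolver is an imported target defined by CMake's FindCUDAToolkit module (CMake 3.17 and newer), so a find_package call such as the sketch below, assumed here because it sits outside this diff, must run before that link line:

    # Assumed prerequisite (not visible in this diff): creates imported
    # targets such as CUDA::cusolver and CUDA::cudart.
    find_package(CUDAToolkit REQUIRED)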
test/drivers/test_cqrrp_gpu.cu: 10 additions & 4 deletions
@@ -10,6 +10,7 @@
 #include <RandBLAS.hh>
 #include <fstream>
 #include <gtest/gtest.h>
+#include <chrono>
 
 // Use cuda kernels.
 #ifndef USE_CUDA
@@ -281,12 +282,16 @@ class TestCQRRP : public ::testing::Test
     blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, d, n, m, 1.0, S, d, all_data.A.data(), m, 0.0, all_data.A_sk, d);
     free(S);
     cudaMemcpy(all_data.A_sk_device, all_data.A_sk, d * n * sizeof(double), cudaMemcpyHostToDevice);
 
     RandLAPACK::CQRRP_blocked_GPU<double, r123::Philox4x32> CQRRP_GPU(true, tol, block_size);
+    auto start = std::chrono::steady_clock::now();
     CQRRP_GPU.call(m, n, all_data.A_device, m, all_data.A_sk_device, d, all_data.tau_device, all_data.J_device);
 
+    auto stop = std::chrono::steady_clock::now();
+    auto diff = std::chrono::duration_cast<std::chrono::milliseconds>(stop - start).count();
     auto rank = CQRRP_GPU.rank;
     printf("RANK AS RETURNED BY CQRRP GPU %4ld\n", rank);
+    printf(" BLOCK SIZE = %ld\n", block_size);
+    printf(" TIME (MS) = %ld\n", diff);
 
     cudaFree(all_data.A_sk_device);
     free(all_data.A_sk);
@@ -377,7 +382,8 @@ TEST_F(TestCQRRP, CQRRP_GPU_benchmark_16k) {
     int64_t n = std::pow(2, 14);
     double d_factor = 1.25;
     int64_t b_sz_start = 32;
-    int64_t b_sz_end = 4096;
+    int64_t b_sz_end = 192;
+    int64_t b_sz_incr = 8;
     double tol = std::pow(std::numeric_limits<double>::epsilon(), 0.85);
     auto state = RandBLAS::RNGState();
 
@@ -387,7 +393,7 @@
     cudaMemcpy(all_data.A_device, all_data.A.data(), m * n * sizeof(double), cudaMemcpyHostToDevice);
 
 #if !defined(__APPLE__)
-    for (;b_sz_start <= b_sz_end; b_sz_start *= 2) {
+    for (;b_sz_start <= b_sz_end; b_sz_start += b_sz_incr) {
         bench_CQRRP(d_factor, tol, b_sz_start, all_data, state);
     }
 #endif
