From 15fc7becb76cd15144610d4354c5864cb0bfb493 Mon Sep 17 00:00:00 2001 From: pgorlani <92453485+pgorlani@users.noreply.github.com> Date: Wed, 24 Jan 2024 13:27:39 +0000 Subject: [PATCH] Fixes for SPR/SPR2 BLAS 2 routines (#492) This patch fixes the functional correctness of spr/spr2 tests by calculating the required square root using integer arithmetic instead of using sycl::sqrt function. Also: * Removed unnecessary code and improved documentation * Improved unittests by initializing the input matrix with random numbers --- include/operations/blas2_trees.h | 93 ++++++++---------------- src/interface/blas2_interface.hpp | 8 +-- src/operations/blas2/spr.hpp | 95 +++++++++++++++++++++---- test/unittest/blas2/blas2_spr2_test.cpp | 23 +++--- test/unittest/blas2/blas2_spr_test.cpp | 25 +++---- 5 files changed, 132 insertions(+), 112 deletions(-) diff --git a/include/operations/blas2_trees.h b/include/operations/blas2_trees.h index 759bbdd88..34937283e 100644 --- a/include/operations/blas2_trees.h +++ b/include/operations/blas2_trees.h @@ -580,10 +580,22 @@ GerCol make_ger_col( lhs_, scalar_, rhs_1_, rhs_2_, nWG_row_, nWG_col_, local_memory_size_); } -/**** SPR N COLS x (N + 1)/2 ROWS FOR PACKED MATRIX ****/ -/* This class performs rank 1/2 update for symmetric packed matrices. For more - * details on matrix refer to the explanation here: - * https://spec.oneapi.io/versions/1.1-rev-1/elements/oneMKL/source/domains/matrix-storage.html#matrix-storage +/** + * @struct Spr + * @brief Tree node representing a rank 1/2 update for symmetric packed + * matrices, i.e., + * + * Spr : lhs_ = alpha_ * rhs_1_ * rhs_2_' + lhs_ + * Spr2: lhs_ = alpha_ * rhs_1_ * rhs_2_' + alpha_ * rhs_2_ * rhs_1_' + lhs_ + * + * + * @tparam Single true for SPR, false for SPR2 + * @tparam isUpper specifies whether the triangular input matrix is upper + * @param alpha_ scaling factor for vector multiplication + * @param N_ matrix size + * @param lhs_ input/output matrix + * @param rhs_1_ input vector + * @param rhs_2_ input vector */ template @@ -591,82 +603,33 @@ struct Spr { using value_t = typename rhs_1_t::value_t; using index_t = typename rhs_1_t::index_t; + value_t alpha_; + index_t N_; lhs_t lhs_; rhs_1_t rhs_1_; rhs_2_t rhs_2_; - value_t alpha_; - index_t N_, incX_1_, incX_2_; - // cl::sycl::sqrt(float) gives incorrect results when - // the operand becomes big. The sqrt_overflow_limit was - // identified empirically by testing the spr operator - // for matrix sizes up to 16384x16384 on the integrated - // Intel GPU. To make the experiment generic and to reduce - // the chances of failing tests on different hardware we opt - // for a more naive limit. (1048576 = 1024 * 1024) - static constexpr index_t sqrt_overflow_limit = 1048576; - - Spr(lhs_t &_l, index_t N_, value_t _alpha, rhs_1_t &_r1, index_t _incX_1, - rhs_2_t &_r2, index_t _incX_2); + + Spr(lhs_t &_l, index_t N_, value_t _alpha, rhs_1_t &_r1, rhs_2_t &_r2); index_t get_size() const; bool valid_thread(cl::sycl::nd_item<1> ndItem) const; value_t eval(cl::sycl::nd_item<1> ndItem); void bind(cl::sycl::handler &h); void adjust_access_displacement(); - // Row-Col index calculation for Upper Packed Matrix - template - PORTBLAS_ALWAYS_INLINE static typename std::enable_if::type - compute_row_col(const int64_t id, const index_t size, index_t &row, - index_t &col) { - int64_t internal = 1 + 8 * id; - float val = internal * 1.f; - float sqrt = 0.f; - float divisor = id >= sqrt_overflow_limit ? size * 1.f : 1.f; - val = internal / (divisor * divisor); - sqrt = cl::sycl::sqrt(val) * divisor; - col = static_cast((-1 + sqrt) / 2); - row = id - col * (col + 1) / 2; - // adjust the row/col if out of bounds - if (row > col) { - int diff = row - col; - col += diff; - row -= col; - } else if (row < 0) { - col--; - row = id - col * (col + 1) / 2; - } - } - - // Row-Col index calculation for Lower Packed Matrix - template - PORTBLAS_ALWAYS_INLINE static typename std::enable_if::type - compute_row_col(const int64_t id, const index_t size, index_t &row, - index_t &col) { - index_t temp = 2 * size + 1; - int64_t internal = temp * temp - 8 * id; - float val = internal * 1.f; - float sqrt = 0.f; - float divisor = internal >= sqrt_overflow_limit ? 2.f * size : 1.f; - val = internal / (divisor * divisor); - sqrt = cl::sycl::sqrt(val) * divisor; - col = static_cast((temp - sqrt) / 2); - row = id - (col * (temp - col)) / 2 + col; - // adjust row-col if out of bounds - if (row < 0 || col < 0 || row >= size || col >= size || row < col) { - index_t diff = id < size || row < col ? -1 : row >= size ? 1 : 0; - col += diff; - row = id - (col * (temp - col)) / 2 + col; - } - } + index_t int_sqrt(int64_t s); + void compute_row_col(const int64_t id, const index_t size, index_t &row, + index_t &col); }; +/*! + @brief Generator/factory for SPR/SPR2 trees. + */ template Spr make_spr( lhs_t &lhs_, typename rhs_1_t::index_t _N, typename lhs_t::value_t alpha_, - rhs_1_t &rhs_1_, typename rhs_1_t::index_t incX_1, rhs_2_t &rhs_2_, - typename rhs_1_t::index_t incX_2) { + rhs_1_t &rhs_1_, rhs_2_t &rhs_2_) { return Spr(lhs_, _N, alpha_, rhs_1_, - incX_1, rhs_2_, incX_2); + rhs_2_); } } // namespace blas diff --git a/src/interface/blas2_interface.hpp b/src/interface/blas2_interface.hpp index 3e6ee2e31..96340af0b 100644 --- a/src/interface/blas2_interface.hpp +++ b/src/interface/blas2_interface.hpp @@ -988,12 +988,12 @@ typename sb_handle_t::event_t _spr_impl( const index_t globalSize = localSize * nWGPerCol; if (Upper) { - auto spr = make_spr(mA, _N, _alpha, vx, _incx, vx, _incx); + auto spr = make_spr(mA, _N, _alpha, vx, vx); return ret = concatenate_vectors( ret, sb_handle.execute(spr, localSize, globalSize, _dependencies)); } else { - auto spr = make_spr(mA, _N, _alpha, vx, _incx, vx, _incx); + auto spr = make_spr(mA, _N, _alpha, vx, vx); return ret = concatenate_vectors( ret, sb_handle.execute(spr, localSize, globalSize, _dependencies)); @@ -1048,12 +1048,12 @@ typename sb_handle_t::event_t _spr2_impl( const index_t globalSize = localSize * nWGPerCol; if (Upper) { - auto spr2 = make_spr(mA, _N, _alpha, vx, _incx, vy, _incy); + auto spr2 = make_spr(mA, _N, _alpha, vx, vy); return ret = concatenate_vectors( ret, sb_handle.execute(spr2, localSize, globalSize, _dependencies)); } else { - auto spr2 = make_spr(mA, _N, _alpha, vx, _incx, vy, _incy); + auto spr2 = make_spr(mA, _N, _alpha, vx, vy); return ret = concatenate_vectors( ret, sb_handle.execute(spr2, localSize, globalSize, _dependencies)); diff --git a/src/operations/blas2/spr.hpp b/src/operations/blas2/spr.hpp index 2ba6d752a..ee317a92f 100644 --- a/src/operations/blas2/spr.hpp +++ b/src/operations/blas2/spr.hpp @@ -30,21 +30,64 @@ namespace blas { -/**** SPR N COLS x (N + 1)/2 ROWS FOR PACKED MATRIX ****/ - template PORTBLAS_INLINE Spr::Spr( lhs_t& _l, typename rhs_1_t::index_t _N, value_t _alpha, rhs_1_t& _r1, - typename rhs_1_t::index_t _incX_1, rhs_2_t& _r2, - typename rhs_1_t::index_t _incX_2) - : lhs_(_l), - N_(_N), - alpha_(_alpha), - rhs_1_(_r1), - incX_1_(_incX_1), - rhs_2_(_r2), - incX_2_(_incX_2) {} + rhs_2_t& _r2) + : lhs_(_l), N_(_N), alpha_(_alpha), rhs_1_(_r1), rhs_2_(_r2) {} + +/*! + * @brief Compute the integer square root of an integer value by means of a + * fixed-point iteration method. + */ +template +PORTBLAS_ALWAYS_INLINE typename rhs_1_t::index_t +Spr::int_sqrt(int64_t val) { + using index_t = typename rhs_1_t::index_t; + + if (val < 2) return val; + + // Compute x0 as 2^(floor(log2(val)/2) + 1) + index_t p = 0; + int64_t tmp = val; + while (tmp) { + ++p; + tmp >>= 1; + } + index_t x0 = 2 << (p / 2); + index_t x1 = (x0 + val / x0) / 2; + +#pragma unroll 5 + while (x1 < x0) { + x0 = x1; + x1 = (x0 + val / x0) / 2; + } + return x0; +} + +/*! + * @brief Map a global work-item index to triangular matrix coordinates. + */ +template +PORTBLAS_ALWAYS_INLINE void +Spr::compute_row_col( + const int64_t id, const typename rhs_1_t::index_t size, + typename rhs_1_t::index_t& row, typename rhs_1_t::index_t& col) { + using index_t = typename rhs_1_t::index_t; + if constexpr (isUpper) { + const index_t i = (int_sqrt(8L * id + 1L) - 1) / 2; + col = i; + row = id - (i * (i + 1)) / 2; + } else { + const index_t rid = size * (size + 1) / 2 - id - 1; + const index_t i = (int_sqrt(8L * rid + 1L) - 1) / 2; + col = size - 1 - i; + row = size - 1 - (rid - i * (i + 1) / 2); + } +} template @@ -58,12 +101,34 @@ typename rhs_1_t::value_t Spr::eval( index_t row = 0, col = 0; - if (global_idx < lhs_size) { - value_t lhs_val = lhs_.eval(global_idx); - - Spr::compute_row_col( + if (!id) { + Spr::compute_row_col( global_idx, N_, row, col); + } + + row = cl::sycl::group_broadcast(ndItem.get_group(), row); + col = cl::sycl::group_broadcast(ndItem.get_group(), col); + if (global_idx < lhs_size) { + if constexpr (isUpper) { + if (id) { + row += id; + while (row > col) { + ++col; + row -= col; + } + } + } else { + if (id) { + row += id; + while (row >= N_) { + ++col; + row = row - N_ + col; + } + } + } + + value_t lhs_val = lhs_.eval(global_idx); value_t rhs_1_val = rhs_1_.eval(row); value_t rhs_2_val = rhs_2_.eval(col); if constexpr (!Single) { diff --git a/test/unittest/blas2/blas2_spr2_test.cpp b/test/unittest/blas2/blas2_spr2_test.cpp index baa4999e9..68d29f555 100644 --- a/test/unittest/blas2/blas2_spr2_test.cpp +++ b/test/unittest/blas2/blas2_spr2_test.cpp @@ -27,16 +27,16 @@ template using combination_t = - std::tuple; + std::tuple; template void run_test(const combination_t combi) { std::string alloc; index_t n; scalar_t alpha; - char layout, uplo; + char uplo; index_t incX, incY; - std::tie(alloc, layout, uplo, n, alpha, incX, incY) = combi; + std::tie(alloc, uplo, n, alpha, incX, incY) = combi; const size_t x_size = 1 + (n - 1) * incX; const size_t y_size = 1 + (n - 1) * incY; @@ -50,11 +50,10 @@ void run_test(const combination_t combi) { fill_random(vy_cpu); // Output matrix - std::vector a_mp(m_size, 7.0); - std::vector a_cpu_mp(m_size, 7.0); + std::vector a_mp(m_size); + fill_random(a_mp); + std::vector a_cpu_mp = a_mp; - uplo = (uplo == 'u' && layout == 'c') || (uplo == 'l' && layout == 'r') ? 'u' - : 'l'; // SYSTEM SPR2 reference_blas::spr2(&uplo, n, alpha, vx_cpu.data(), incX, vy_cpu.data(), incY, a_cpu_mp.data()); @@ -97,9 +96,9 @@ void run_test(const combination_t combi) { std::string alloc; index_t n; scalar_t alpha; - char layout, uplo; + char uplo; index_t incX, incY; - std::tie(alloc, layout, uplo, n, alpha, incX, incY) = combi; + std::tie(alloc, uplo, n, alpha, incX, incY) = combi; if (alloc == "usm") { #ifdef SB_ENABLE_USM @@ -116,7 +115,6 @@ void run_test(const combination_t combi) { template const auto combi = ::testing::Combine(::testing::Values("usm", "buf"), // allocation type - ::testing::Values('r', 'c'), // matrix layout ::testing::Values('u', 'l'), // UPLO ::testing::Values(1024, 2048, 4096, 8192, 16384), // n ::testing::Values(0.0, 1.0, 1.5), // alpha @@ -128,7 +126,6 @@ const auto combi = template const auto combi = ::testing::Combine(::testing::Values("usm", "buf"), // allocation type - ::testing::Values('r', 'c'), // matrix layout ::testing::Values('u', 'l'), // UPLO ::testing::Values(14, 63, 257, 1010), // n ::testing::Values(1.0), // alpha @@ -141,10 +138,10 @@ template static std::string generate_name( const ::testing::TestParamInfo>& info) { std::string alloc; - char layout, uplo; + char uplo; index_t n, incX, incY; T alpha; - BLAS_GENERATE_NAME(info.param, alloc, layout, uplo, n, alpha, incX, incY); + BLAS_GENERATE_NAME(info.param, alloc, uplo, n, alpha, incX, incY); } BLAS_REGISTER_TEST_ALL(Spr2, combination_t, combi, generate_name); diff --git a/test/unittest/blas2/blas2_spr_test.cpp b/test/unittest/blas2/blas2_spr_test.cpp index 12f0f403b..40429dcec 100644 --- a/test/unittest/blas2/blas2_spr_test.cpp +++ b/test/unittest/blas2/blas2_spr_test.cpp @@ -26,8 +26,7 @@ #include "blas_test.hpp" template -using combination_t = - std::tuple; +using combination_t = std::tuple; template void run_test(const combination_t combi) { @@ -35,9 +34,9 @@ void run_test(const combination_t combi) { index_t n; index_t lda_mul; index_t incX; - char layout, uplo; + char uplo; scalar_t alpha; - std::tie(alloc, layout, uplo, n, alpha, incX) = combi; + std::tie(alloc, uplo, n, alpha, incX) = combi; index_t mA_size = n * n; index_t x_size = 1 + (n - 1) * std::abs(incX); @@ -46,11 +45,9 @@ void run_test(const combination_t combi) { fill_random(x_v); // Output matrix - std::vector a_mp(mA_size, 7.0); - std::vector a_cpu_mp(mA_size, 7.0); - - uplo = (uplo == 'u' && layout == 'c') || (uplo == 'l' && layout == 'r') ? 'u' - : 'l'; + std::vector a_mp(mA_size); + fill_random(a_mp); + std::vector a_cpu_mp = a_mp; // SYSTEM SPR reference_blas::spr(&uplo, n, alpha, x_v.data(), incX, @@ -91,9 +88,9 @@ void run_test(const combination_t combi) { index_t n; index_t lda_mul; index_t incX; - char layout, uplo; + char uplo; scalar_t alpha; - std::tie(alloc, layout, uplo, n, alpha, incX) = combi; + std::tie(alloc, uplo, n, alpha, incX) = combi; if (alloc == "usm") { #ifdef SB_ENABLE_USM @@ -110,7 +107,6 @@ void run_test(const combination_t combi) { template const auto combi = ::testing::Combine(::testing::Values("usm", "buf"), // allocation type - ::testing::Values('r', 'c'), // matrix layout ::testing::Values('u', 'l'), // UPLO ::testing::Values(1024, 2048, 4096, 8192, 16384), // n ::testing::Values(0.0, 1.0, 1.5), // alpha @@ -121,7 +117,6 @@ const auto combi = template const auto combi = ::testing::Combine(::testing::Values("usm", "buf"), // allocation type - ::testing::Values('r', 'c'), // matrix layout ::testing::Values('u', 'l'), // UPLO ::testing::Values(14, 63, 257, 1010), // n ::testing::Values(1.0), // alpha @@ -133,10 +128,10 @@ template static std::string generate_name( const ::testing::TestParamInfo>& info) { std::string alloc; - char layout, uplo; + char uplo; int n, incX; T alpha; - BLAS_GENERATE_NAME(info.param, alloc, layout, uplo, n, alpha, incX); + BLAS_GENERATE_NAME(info.param, alloc, uplo, n, alpha, incX); } BLAS_REGISTER_TEST_ALL(Spr, combination_t, combi, generate_name);