From 15fc7becb76cd15144610d4354c5864cb0bfb493 Mon Sep 17 00:00:00 2001
From: pgorlani <92453485+pgorlani@users.noreply.github.com>
Date: Wed, 24 Jan 2024 13:27:39 +0000
Subject: [PATCH] Fixes for SPR/SPR2 BLAS 2 routines (#492)

This patch fixes the functional correctness of spr/spr2 tests by
calculating the required square root using integer arithmetic
instead of using sycl::sqrt function. Also:

* Removed unnecessary code and improved documentation
* Improved unittests by initializing the input matrix with random numbers
---
 include/operations/blas2_trees.h        | 93 ++++++++----------------
 src/interface/blas2_interface.hpp       |  8 +--
 src/operations/blas2/spr.hpp            | 95 +++++++++++++++++++++----
 test/unittest/blas2/blas2_spr2_test.cpp | 23 +++---
 test/unittest/blas2/blas2_spr_test.cpp  | 25 +++----
 5 files changed, 132 insertions(+), 112 deletions(-)
diff --git a/include/operations/blas2_trees.h b/include/operations/blas2_trees.h
index 759bbdd88..34937283e 100644
--- a/include/operations/blas2_trees.h
+++ b/include/operations/blas2_trees.h
@@ -580,10 +580,22 @@ GerCol<Single, Lower, Diag, Upper, lhs_t, rhs_1_t, rhs_2_t> make_ger_col(
       lhs_, scalar_, rhs_1_, rhs_2_, nWG_row_, nWG_col_, local_memory_size_);
 }
 
-/**** SPR N COLS x (N + 1)/2 ROWS FOR PACKED MATRIX ****/
-/* This class performs rank 1/2 update for symmetric packed matrices. For more
- * details on matrix refer to the explanation here:
- * https://spec.oneapi.io/versions/1.1-rev-1/elements/oneMKL/source/domains/matrix-storage.html#matrix-storage
+/**
+ * @struct Spr
+ * @brief Tree node representing a rank 1/2 update for symmetric packed
+ * matrices, i.e.,
+ *
+ *  Spr : lhs_ = alpha_ * rhs_1_ * rhs_2_' + lhs_
+ *  Spr2: lhs_ = alpha_ * rhs_1_ * rhs_2_' + alpha_ * rhs_2_ * rhs_1_' + lhs_
+ *
+ *
+ * @tparam Single   true for SPR, false for SPR2
+ * @tparam isUpper  specifies whether the triangular input matrix is upper
+ * @param alpha_  scaling factor for vector multiplication
+ * @param N_      matrix size
+ * @param lhs_    input/output matrix
+ * @param rhs_1_  input vector
+ * @param rhs_2_  input vector
  */
 template <bool Single, bool isUpper, typename lhs_t, typename rhs_1_t,
           typename rhs_2_t>
@@ -591,82 +603,33 @@ struct Spr {
   using value_t = typename rhs_1_t::value_t;
   using index_t = typename rhs_1_t::index_t;
 
+  value_t alpha_;
+  index_t N_;
   lhs_t lhs_;
   rhs_1_t rhs_1_;
   rhs_2_t rhs_2_;
-  value_t alpha_;
-  index_t N_, incX_1_, incX_2_;
-  // cl::sycl::sqrt(float) gives incorrect results when
-  // the operand becomes big. The sqrt_overflow_limit was
-  // identified empirically by testing the spr operator
-  // for matrix sizes up to 16384x16384 on the integrated
-  // Intel GPU. To make the experiment generic and to reduce
-  // the chances of failing tests on different hardware we opt
-  // for a more naive limit. (1048576 = 1024 * 1024)
-  static constexpr index_t sqrt_overflow_limit = 1048576;
-
-  Spr(lhs_t &_l, index_t N_, value_t _alpha, rhs_1_t &_r1, index_t _incX_1,
-      rhs_2_t &_r2, index_t _incX_2);
+
+  Spr(lhs_t &_l, index_t N_, value_t _alpha, rhs_1_t &_r1, rhs_2_t &_r2);
   index_t get_size() const;
   bool valid_thread(cl::sycl::nd_item<1> ndItem) const;
   value_t eval(cl::sycl::nd_item<1> ndItem);
   void bind(cl::sycl::handler &h);
   void adjust_access_displacement();
-  // Row-Col index calculation for Upper Packed Matrix
-  template <bool Upper>
-  PORTBLAS_ALWAYS_INLINE static typename std::enable_if<Upper>::type
-  compute_row_col(const int64_t id, const index_t size, index_t &row,
-                  index_t &col) {
-    int64_t internal = 1 + 8 * id;
-    float val = internal * 1.f;
-    float sqrt = 0.f;
-    float divisor = id >= sqrt_overflow_limit ? size * 1.f : 1.f;
-    val = internal / (divisor * divisor);
-    sqrt = cl::sycl::sqrt(val) * divisor;
-    col = static_cast<index_t>((-1 + sqrt) / 2);
-    row = id - col * (col + 1) / 2;
-    // adjust the row/col if out of bounds
-    if (row > col) {
-      int diff = row - col;
-      col += diff;
-      row -= col;
-    } else if (row < 0) {
-      col--;
-      row = id - col * (col + 1) / 2;
-    }
-  }
-
-  // Row-Col index calculation for Lower Packed Matrix
-  template <bool Upper>
-  PORTBLAS_ALWAYS_INLINE static typename std::enable_if<!Upper>::type
-  compute_row_col(const int64_t id, const index_t size, index_t &row,
-                  index_t &col) {
-    index_t temp = 2 * size + 1;
-    int64_t internal = temp * temp - 8 * id;
-    float val = internal * 1.f;
-    float sqrt = 0.f;
-    float divisor = internal >= sqrt_overflow_limit ? 2.f * size : 1.f;
-    val = internal / (divisor * divisor);
-    sqrt = cl::sycl::sqrt(val) * divisor;
-    col = static_cast<index_t>((temp - sqrt) / 2);
-    row = id - (col * (temp - col)) / 2 + col;
-    // adjust row-col if out of bounds
-    if (row < 0 || col < 0 || row >= size || col >= size || row < col) {
-      index_t diff = id < size || row < col ? -1 : row >= size ? 1 : 0;
-      col += diff;
-      row = id - (col * (temp - col)) / 2 + col;
-    }
-  }
+  index_t int_sqrt(int64_t s);
+  void compute_row_col(const int64_t id, const index_t size, index_t &row,
+                       index_t &col);
 };
 
+/*!
+ @brief Generator/factory for SPR/SPR2 trees.
+ */
 template <bool Single, bool isUpper, typename lhs_t, typename rhs_1_t,
           typename rhs_2_t>
 Spr<Single, isUpper, lhs_t, rhs_1_t, rhs_2_t> make_spr(
     lhs_t &lhs_, typename rhs_1_t::index_t _N, typename lhs_t::value_t alpha_,
-    rhs_1_t &rhs_1_, typename rhs_1_t::index_t incX_1, rhs_2_t &rhs_2_,
-    typename rhs_1_t::index_t incX_2) {
+    rhs_1_t &rhs_1_, rhs_2_t &rhs_2_) {
   return Spr<Single, isUpper, lhs_t, rhs_1_t, rhs_2_t>(lhs_, _N, alpha_, rhs_1_,
-                                                       incX_1, rhs_2_, incX_2);
+                                                       rhs_2_);
 }
 
 }  // namespace blas
diff --git a/src/interface/blas2_interface.hpp b/src/interface/blas2_interface.hpp
index 3e6ee2e31..96340af0b 100644
--- a/src/interface/blas2_interface.hpp
+++ b/src/interface/blas2_interface.hpp
@@ -988,12 +988,12 @@ typename sb_handle_t::event_t _spr_impl(
   const index_t globalSize = localSize * nWGPerCol;
 
   if (Upper) {
-    auto spr = make_spr<true, true>(mA, _N, _alpha, vx, _incx, vx, _incx);
+    auto spr = make_spr<true, true>(mA, _N, _alpha, vx, vx);
     return ret = concatenate_vectors(
                ret,
                sb_handle.execute(spr, localSize, globalSize, _dependencies));
   } else {
-    auto spr = make_spr<true, false>(mA, _N, _alpha, vx, _incx, vx, _incx);
+    auto spr = make_spr<true, false>(mA, _N, _alpha, vx, vx);
     return ret = concatenate_vectors(
                ret,
                sb_handle.execute(spr, localSize, globalSize, _dependencies));
@@ -1048,12 +1048,12 @@ typename sb_handle_t::event_t _spr2_impl(
   const index_t globalSize = localSize * nWGPerCol;
 
   if (Upper) {
-    auto spr2 = make_spr<false, true>(mA, _N, _alpha, vx, _incx, vy, _incy);
+    auto spr2 = make_spr<false, true>(mA, _N, _alpha, vx, vy);
     return ret = concatenate_vectors(
                ret,
                sb_handle.execute(spr2, localSize, globalSize, _dependencies));
   } else {
-    auto spr2 = make_spr<false, false>(mA, _N, _alpha, vx, _incx, vy, _incy);
+    auto spr2 = make_spr<false, false>(mA, _N, _alpha, vx, vy);
     return ret = concatenate_vectors(
                ret,
                sb_handle.execute(spr2, localSize, globalSize, _dependencies));
diff --git a/src/operations/blas2/spr.hpp b/src/operations/blas2/spr.hpp
index 2ba6d752a..ee317a92f 100644
--- a/src/operations/blas2/spr.hpp
+++ b/src/operations/blas2/spr.hpp
@@ -30,21 +30,64 @@
 
 namespace blas {
 
-/**** SPR N COLS x (N + 1)/2 ROWS FOR PACKED MATRIX ****/
-
 template <bool Single, bool isUpper, typename lhs_t, typename rhs_1_t,
           typename rhs_2_t>
 PORTBLAS_INLINE Spr<Single, isUpper, lhs_t, rhs_1_t, rhs_2_t>::Spr(
     lhs_t& _l, typename rhs_1_t::index_t _N, value_t _alpha, rhs_1_t& _r1,
-    typename rhs_1_t::index_t _incX_1, rhs_2_t& _r2,
-    typename rhs_1_t::index_t _incX_2)
-    : lhs_(_l),
-      N_(_N),
-      alpha_(_alpha),
-      rhs_1_(_r1),
-      incX_1_(_incX_1),
-      rhs_2_(_r2),
-      incX_2_(_incX_2) {}
+    rhs_2_t& _r2)
+    : lhs_(_l), N_(_N), alpha_(_alpha), rhs_1_(_r1), rhs_2_(_r2) {}
+
+/*!
+ * @brief Compute the integer square root of an integer value by means of a
+ * fixed-point iteration method.
+ */
+template <bool Single, bool isUpper, typename lhs_t, typename rhs_1_t,
+          typename rhs_2_t>
+PORTBLAS_ALWAYS_INLINE typename rhs_1_t::index_t
+Spr<Single, isUpper, lhs_t, rhs_1_t, rhs_2_t>::int_sqrt(int64_t val) {
+  using index_t = typename rhs_1_t::index_t;
+
+  if (val < 2) return val;
+
+  // Compute x0 as 2^(floor(log2(val)/2) + 1)
+  index_t p = 0;
+  int64_t tmp = val;
+  while (tmp) {
+    ++p;
+    tmp >>= 1;
+  }
+  index_t x0 = 2 << (p / 2);
+  index_t x1 = (x0 + val / x0) / 2;
+
+#pragma unroll 5
+  while (x1 < x0) {
+    x0 = x1;
+    x1 = (x0 + val / x0) / 2;
+  }
+  return x0;
+}
+
+/*!
+ * @brief Map a global work-item index to triangular matrix coordinates.
+ */
+template <bool Single, bool isUpper, typename lhs_t, typename rhs_1_t,
+          typename rhs_2_t>
+PORTBLAS_ALWAYS_INLINE void
+Spr<Single, isUpper, lhs_t, rhs_1_t, rhs_2_t>::compute_row_col(
+    const int64_t id, const typename rhs_1_t::index_t size,
+    typename rhs_1_t::index_t& row, typename rhs_1_t::index_t& col) {
+  using index_t = typename rhs_1_t::index_t;
+  if constexpr (isUpper) {
+    const index_t i = (int_sqrt(8L * id + 1L) - 1) / 2;
+    col = i;
+    row = id - (i * (i + 1)) / 2;
+  } else {
+    const index_t rid = size * (size + 1) / 2 - id - 1;
+    const index_t i = (int_sqrt(8L * rid + 1L) - 1) / 2;
+    col = size - 1 - i;
+    row = size - 1 - (rid - i * (i + 1) / 2);
+  }
+}
 
 template <bool Single, bool isUpper, typename lhs_t, typename rhs_1_t,
           typename rhs_2_t>
@@ -58,12 +101,34 @@ typename rhs_1_t::value_t Spr<Single, isUpper, lhs_t, rhs_1_t, rhs_2_t>::eval(
 
   index_t row = 0, col = 0;
 
-  if (global_idx < lhs_size) {
-    value_t lhs_val = lhs_.eval(global_idx);
-
-    Spr<Single, isUpper, lhs_t, rhs_1_t, rhs_2_t>::compute_row_col<isUpper>(
+  if (!id) {
+    Spr<Single, isUpper, lhs_t, rhs_1_t, rhs_2_t>::compute_row_col(
         global_idx, N_, row, col);
+  }
+
+  row = cl::sycl::group_broadcast(ndItem.get_group(), row);
+  col = cl::sycl::group_broadcast(ndItem.get_group(), col);
 
+  if (global_idx < lhs_size) {
+    if constexpr (isUpper) {
+      if (id) {
+        row += id;
+        while (row > col) {
+          ++col;
+          row -= col;
+        }
+      }
+    } else {
+      if (id) {
+        row += id;
+        while (row >= N_) {
+          ++col;
+          row = row - N_ + col;
+        }
+      }
+    }
+
+    value_t lhs_val = lhs_.eval(global_idx);
     value_t rhs_1_val = rhs_1_.eval(row);
     value_t rhs_2_val = rhs_2_.eval(col);
     if constexpr (!Single) {
diff --git a/test/unittest/blas2/blas2_spr2_test.cpp b/test/unittest/blas2/blas2_spr2_test.cpp
index baa4999e9..68d29f555 100644
--- a/test/unittest/blas2/blas2_spr2_test.cpp
+++ b/test/unittest/blas2/blas2_spr2_test.cpp
@@ -27,16 +27,16 @@
 
 template <typename scalar_t>
 using combination_t =
-    std::tuple<std::string, char, char, index_t, scalar_t, index_t, index_t>;
+    std::tuple<std::string, char, index_t, scalar_t, index_t, index_t>;
 
 template <typename scalar_t, helper::AllocType mem_alloc>
 void run_test(const combination_t<scalar_t> combi) {
   std::string alloc;
   index_t n;
   scalar_t alpha;
-  char layout, uplo;
+  char uplo;
   index_t incX, incY;
-  std::tie(alloc, layout, uplo, n, alpha, incX, incY) = combi;
+  std::tie(alloc, uplo, n, alpha, incX, incY) = combi;
 
   const size_t x_size = 1 + (n - 1) * incX;
   const size_t y_size = 1 + (n - 1) * incY;
@@ -50,11 +50,10 @@ void run_test(const combination_t<scalar_t> combi) {
   fill_random(vy_cpu);
 
   // Output matrix
-  std::vector<scalar_t> a_mp(m_size, 7.0);
-  std::vector<scalar_t> a_cpu_mp(m_size, 7.0);
+  std::vector<scalar_t> a_mp(m_size);
+  fill_random(a_mp);
+  std::vector<scalar_t> a_cpu_mp = a_mp;
 
-  uplo = (uplo == 'u' && layout == 'c') || (uplo == 'l' && layout == 'r') ? 'u'
-                                                                          : 'l';
   // SYSTEM SPR2
   reference_blas::spr2<scalar_t>(&uplo, n, alpha, vx_cpu.data(), incX,
                                  vy_cpu.data(), incY, a_cpu_mp.data());
@@ -97,9 +96,9 @@ void run_test(const combination_t<scalar_t> combi) {
   std::string alloc;
   index_t n;
   scalar_t alpha;
-  char layout, uplo;
+  char uplo;
   index_t incX, incY;
-  std::tie(alloc, layout, uplo, n, alpha, incX, incY) = combi;
+  std::tie(alloc, uplo, n, alpha, incX, incY) = combi;
 
   if (alloc == "usm") {
 #ifdef SB_ENABLE_USM
@@ -116,7 +115,6 @@ void run_test(const combination_t<scalar_t> combi) {
 template <typename scalar_t>
 const auto combi =
     ::testing::Combine(::testing::Values("usm", "buf"),  // allocation type
-                       ::testing::Values('r', 'c'),      // matrix layout
                        ::testing::Values('u', 'l'),      // UPLO
                        ::testing::Values(1024, 2048, 4096, 8192, 16384),  // n
                        ::testing::Values<scalar_t>(0.0, 1.0, 1.5),  // alpha
@@ -128,7 +126,6 @@ const auto combi =
 template <typename scalar_t>
 const auto combi =
     ::testing::Combine(::testing::Values("usm", "buf"),       // allocation type
-                       ::testing::Values('r', 'c'),           // matrix layout
                        ::testing::Values('u', 'l'),           // UPLO
                        ::testing::Values(14, 63, 257, 1010),  // n
                        ::testing::Values<scalar_t>(1.0),      // alpha
@@ -141,10 +138,10 @@ template <class T>
 static std::string generate_name(
     const ::testing::TestParamInfo<combination_t<T>>& info) {
   std::string alloc;
-  char layout, uplo;
+  char uplo;
   index_t n, incX, incY;
   T alpha;
-  BLAS_GENERATE_NAME(info.param, alloc, layout, uplo, n, alpha, incX, incY);
+  BLAS_GENERATE_NAME(info.param, alloc, uplo, n, alpha, incX, incY);
 }
 
 BLAS_REGISTER_TEST_ALL(Spr2, combination_t, combi, generate_name);
diff --git a/test/unittest/blas2/blas2_spr_test.cpp b/test/unittest/blas2/blas2_spr_test.cpp
index 12f0f403b..40429dcec 100644
--- a/test/unittest/blas2/blas2_spr_test.cpp
+++ b/test/unittest/blas2/blas2_spr_test.cpp
@@ -26,8 +26,7 @@
 #include "blas_test.hpp"
 
 template <typename scalar_t>
-using combination_t =
-    std::tuple<std::string, char, char, index_t, scalar_t, index_t>;
+using combination_t = std::tuple<std::string, char, index_t, scalar_t, index_t>;
 
 template <typename scalar_t, helper::AllocType mem_alloc>
 void run_test(const combination_t<scalar_t> combi) {
@@ -35,9 +34,9 @@ void run_test(const combination_t<scalar_t> combi) {
   index_t n;
   index_t lda_mul;
   index_t incX;
-  char layout, uplo;
+  char uplo;
   scalar_t alpha;
-  std::tie(alloc, layout, uplo, n, alpha, incX) = combi;
+  std::tie(alloc, uplo, n, alpha, incX) = combi;
   index_t mA_size = n * n;
   index_t x_size = 1 + (n - 1) * std::abs(incX);
 
@@ -46,11 +45,9 @@ void run_test(const combination_t<scalar_t> combi) {
   fill_random(x_v);
 
   // Output matrix
-  std::vector<scalar_t> a_mp(mA_size, 7.0);
-  std::vector<scalar_t> a_cpu_mp(mA_size, 7.0);
-
-  uplo = (uplo == 'u' && layout == 'c') || (uplo == 'l' && layout == 'r') ? 'u'
-                                                                          : 'l';
+  std::vector<scalar_t> a_mp(mA_size);
+  fill_random(a_mp);
+  std::vector<scalar_t> a_cpu_mp = a_mp;
 
   // SYSTEM SPR
   reference_blas::spr<scalar_t>(&uplo, n, alpha, x_v.data(), incX,
@@ -91,9 +88,9 @@ void run_test(const combination_t<scalar_t> combi) {
   index_t n;
   index_t lda_mul;
   index_t incX;
-  char layout, uplo;
+  char uplo;
   scalar_t alpha;
-  std::tie(alloc, layout, uplo, n, alpha, incX) = combi;
+  std::tie(alloc, uplo, n, alpha, incX) = combi;
 
   if (alloc == "usm") {
 #ifdef SB_ENABLE_USM
@@ -110,7 +107,6 @@ void run_test(const combination_t<scalar_t> combi) {
 template <typename scalar_t>
 const auto combi =
     ::testing::Combine(::testing::Values("usm", "buf"),  // allocation type
-                       ::testing::Values('r', 'c'),      // matrix layout
                        ::testing::Values('u', 'l'),      // UPLO
                        ::testing::Values(1024, 2048, 4096, 8192, 16384),  // n
                        ::testing::Values<scalar_t>(0.0, 1.0, 1.5),  // alpha
@@ -121,7 +117,6 @@ const auto combi =
 template <typename scalar_t>
 const auto combi =
     ::testing::Combine(::testing::Values("usm", "buf"),       // allocation type
-                       ::testing::Values('r', 'c'),           // matrix layout
                        ::testing::Values('u', 'l'),           // UPLO
                        ::testing::Values(14, 63, 257, 1010),  // n
                        ::testing::Values<scalar_t>(1.0),      // alpha
@@ -133,10 +128,10 @@ template <class T>
 static std::string generate_name(
     const ::testing::TestParamInfo<combination_t<T>>& info) {
   std::string alloc;
-  char layout, uplo;
+  char uplo;
   int n, incX;
   T alpha;
-  BLAS_GENERATE_NAME(info.param, alloc, layout, uplo, n, alpha, incX);
+  BLAS_GENERATE_NAME(info.param, alloc, uplo, n, alpha, incX);
 }
 
 BLAS_REGISTER_TEST_ALL(Spr, combination_t, combi, generate_name);