Skip to content

Commit aabb88a

Browse files
authored Jul 12, 2024
NVPL BLAS Support (#665)
1 parent 940c4b8 commit aabb88a

File tree

18 files changed

+1334
-698
lines changed

18 files changed

+1334
-698
lines changed
 

‎CMakeLists.txt

+20-15
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,20 @@ set(WARN_FLAGS ${WARN_FLAGS} $<$<COMPILE_LANGUAGE:CXX>:-Werror>)
172172
set (CUTLASS_INC "")
173173
target_compile_definitions(matx INTERFACE MATX_ENABLE_CUTLASS=0)
174174

175+
if (MATX_NVTX_FLAGS)
176+
add_definitions(-DMATX_NVTX_FLAGS)
177+
target_compile_definitions(matx INTERFACE MATX_NVTX_FLAGS)
178+
endif()
179+
if (MATX_BUILD_32_BIT)
180+
set(INT_TYPE "lp64")
181+
add_definitions(-DINDEX_32_BIT)
182+
target_compile_definitions(matx INTERFACE INDEX_32_BIT)
183+
else()
184+
set(INT_TYPE "ilp64")
185+
add_definitions(-DINDEX_64_BIT)
186+
target_compile_definitions(matx INTERFACE INDEX_64_BIT)
187+
endif()
188+
175189
# Host support
176190
if (MATX_EN_NVPL OR MATX_EN_X86_FFTW)
177191
message(STATUS "Enabling OpenMP support")
@@ -180,9 +194,12 @@ if (MATX_EN_NVPL OR MATX_EN_X86_FFTW)
180194
target_compile_options(matx INTERFACE ${OpenMP_CXX_FLAGS})
181195
target_compile_definitions(matx INTERFACE MATX_EN_OMP=1)
182196
if (MATX_EN_NVPL)
183-
message(STATUS "Enabling NVPL library support for ARM CPUs")
184-
find_package(nvpl REQUIRED COMPONENTS fft)
185-
target_link_libraries(matx INTERFACE nvpl::fftw)
197+
message(STATUS "Enabling NVPL library support for ARM CPUs with ${INT_TYPE} interface")
198+
find_package(nvpl REQUIRED COMPONENTS fft blas)
199+
if (NOT MATX_BUILD_32_BIT)
200+
target_compile_definitions(matx INTERFACE NVPL_ILP64)
201+
endif()
202+
target_link_libraries(matx INTERFACE nvpl::fftw nvpl::blas_${INT_TYPE}_omp)
186203
target_compile_definitions(matx INTERFACE MATX_EN_NVPL=1)
187204
else()
188205
if (MATX_EN_X86_FFTW)
@@ -316,18 +333,6 @@ if (NOT_SUBPROJECT)
316333
endif()
317334

318335

319-
if (MATX_NVTX_FLAGS)
320-
add_definitions(-DMATX_NVTX_FLAGS)
321-
target_compile_definitions(matx INTERFACE MATX_NVTX_FLAGS)
322-
endif()
323-
if (MATX_BUILD_32_BIT)
324-
add_definitions(-DINDEX_32_BIT)
325-
target_compile_definitions(matx INTERFACE INDEX_32_BIT)
326-
else()
327-
add_definitions(-DINDEX_64_BIT)
328-
target_compile_definitions(matx INTERFACE INDEX_64_BIT)
329-
endif()
330-
331336
if (MATX_BUILD_EXAMPLES)
332337
add_subdirectory(examples)
333338
endif()

‎include/matx/executors/support.h

+24
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,13 @@ namespace matx {
4646
#define MATX_EN_CPU_FFT 0
4747
#endif
4848

49+
// MatMul
50+
#if defined(MATX_EN_NVPL)
51+
#define MATX_EN_CPU_MATMUL 1
52+
#else
53+
#define MATX_EN_CPU_MATMUL 0
54+
#endif
55+
4956
template <typename Exec, typename T>
5057
constexpr bool CheckFFTSupport() {
5158
if constexpr (is_host_executor_v<Exec>) {
@@ -70,5 +77,22 @@ constexpr bool CheckDirect1DConvSupport() {
7077
}
7178
}
7279

80+
template <typename Exec, typename T>
81+
constexpr bool CheckMatMulSupport() {
82+
if constexpr (is_host_executor_v<Exec>) {
83+
if constexpr (std::is_same_v<T, float> ||
84+
std::is_same_v<T, double> ||
85+
std::is_same_v<T, cuda::std::complex<float>> ||
86+
std::is_same_v<T, cuda::std::complex<double>>) {
87+
return MATX_EN_CPU_MATMUL;
88+
} else {
89+
return false;
90+
}
91+
}
92+
else {
93+
return true;
94+
}
95+
}
96+
7397
}; // detail
7498
}; // matx

‎include/matx/kernels/channelize_poly.cuh

+1-1
Original file line numberDiff line numberDiff line change
@@ -256,7 +256,7 @@ __global__ void ChannelizePoly1D_Smem(OutType output, InType input, FilterType f
256256
__syncthreads();
257257

258258
// Load next elems_per_channel_per_cta elements for each channel
259-
const index_t next_last_elem = cuda::std::min(next_start_elem + by - 1, last_elem);
259+
const index_t next_last_elem = cuda::std::min(next_start_elem + static_cast<index_t>(by) - 1, last_elem);
260260
const uint32_t out_samples_this_iter = static_cast<uint32_t>(next_last_elem - next_start_elem + 1);
261261
if (ty < out_samples_this_iter) {
262262
indims[InRank-1] = (next_start_elem + ty) * num_channels + chan;

‎include/matx/operators/cov.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ namespace matx
8585
template <typename Out, typename Executor>
8686
void Exec(Out &&out, Executor &&ex) const {
8787
static_assert(is_cuda_executor_v<Executor>, "cov() only supports the CUDA executor currently");
88-
cov_impl(cuda::std::get<0>(out), a_, ex.getStream());
88+
cov_impl(cuda::std::get<0>(out), a_, ex);
8989
}
9090

9191
template <typename ShapeType, typename Executor>

‎include/matx/operators/matmul.h

+6-4
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,10 @@
3535

3636
#include "matx/core/type_utils.h"
3737
#include "matx/operators/base_operator.h"
38-
#include "matx/transforms/matmul.h"
38+
#include "matx/transforms/matmul/matmul_cuda.h"
39+
#ifdef MATX_EN_CPU_MATMUL
40+
#include "matx/transforms/matmul/matmul_cblas.h"
41+
#endif
3942

4043
namespace matx
4144
{
@@ -108,12 +111,11 @@ namespace matx
108111

109112
template <typename Out, typename Executor>
110113
void Exec(Out &&out, Executor &&ex) const {
111-
static_assert(is_cuda_executor_v<Executor>, "matmul() only supports the CUDA executor currently");
112114
if constexpr (!std::is_same_v<PermDims, no_permute_t>) {
113-
matmul_impl(permute(cuda::std::get<0>(out), perm_), a_, b_, ex.getStream(), alpha_, beta_);
115+
matmul_impl(permute(cuda::std::get<0>(out), perm_), a_, b_, ex, alpha_, beta_);
114116
}
115117
else {
116-
matmul_impl(cuda::std::get<0>(out), a_, b_, ex.getStream(), alpha_, beta_);
118+
matmul_impl(cuda::std::get<0>(out), a_, b_, ex, alpha_, beta_);
117119
}
118120
}
119121

‎include/matx/operators/matvec.h

+1-2
Original file line numberDiff line numberDiff line change
@@ -89,8 +89,7 @@ namespace matx
8989

9090
template <typename Out, typename Executor>
9191
void Exec(Out &&out, Executor &&ex) const{
92-
static_assert(is_cuda_executor_v<Executor>, "matvec() only supports the CUDA executor currently");
93-
matvec_impl(cuda::std::get<0>(out), a_, b_, ex.getStream(), alpha_, beta_);
92+
matvec_impl(cuda::std::get<0>(out), a_, b_, ex, alpha_, beta_);
9493
}
9594

9695
template <typename ShapeType, typename Executor>

‎include/matx/operators/outer.h

+1-2
Original file line numberDiff line numberDiff line change
@@ -98,8 +98,7 @@ namespace matx
9898

9999
template <typename Out, typename Executor>
100100
void Exec(Out &&out, Executor &&ex) const{
101-
static_assert(is_cuda_executor_v<Executor>, "outer() only supports the CUDA executor currently");
102-
outer_impl(cuda::std::get<0>(out), a_, b_, ex.getStream(), alpha_, beta_);
101+
outer_impl(cuda::std::get<0>(out), a_, b_, ex, alpha_, beta_);
103102
}
104103

105104
template <typename ShapeType, typename Executor>

‎include/matx/operators/qr.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ namespace detail {
6464
static_assert(is_cuda_executor_v<Executor>, "svd() only supports the CUDA executor currently");
6565
static_assert(cuda::std::tuple_size_v<remove_cvref_t<Out>> == 3, "Must use mtie with 3 outputs on qr(). ie: (mtie(Q, R) = qr(A))");
6666

67-
qr_impl(cuda::std::get<0>(out), cuda::std::get<1>(out), a_, ex.getStream());
67+
qr_impl(cuda::std::get<0>(out), cuda::std::get<1>(out), a_, ex);
6868
}
6969

7070
static __MATX_INLINE__ constexpr __MATX_HOST__ __MATX_DEVICE__ int32_t Rank()

‎include/matx/operators/svd.h

+2-2
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,7 @@ namespace detail {
127127
static_assert(is_cuda_executor_v<Executor>, "svdpi() only supports the CUDA executor currently");
128128
static_assert(cuda::std::tuple_size_v<remove_cvref_t<Out>> == 4, "Must use mtie with 3 outputs on svdpi(). ie: (mtie(U, S, V) = svdpi(A))");
129129

130-
svdpi_impl(cuda::std::get<0>(out), cuda::std::get<1>(out), cuda::std::get<2>(out), a_, x_, iterations_, ex.getStream(), k_);
130+
svdpi_impl(cuda::std::get<0>(out), cuda::std::get<1>(out), cuda::std::get<2>(out), a_, x_, iterations_, ex, k_);
131131
}
132132

133133
static __MATX_INLINE__ constexpr __MATX_HOST__ __MATX_DEVICE__ int32_t Rank()
@@ -204,7 +204,7 @@ namespace detail {
204204
static_assert(is_cuda_executor_v<Executor>, "svdbpi() only supports the CUDA executor currently");
205205
static_assert(cuda::std::tuple_size_v<remove_cvref_t<Out>> == 4, "Must use mtie with 3 outputs on svdbpi(). ie: (mtie(U, S, V) = svdbpi(A))");
206206

207-
svdbpi_impl(cuda::std::get<0>(out), cuda::std::get<1>(out), cuda::std::get<2>(out), a_, max_iters_, tol_, ex.getStream());
207+
svdbpi_impl(cuda::std::get<0>(out), cuda::std::get<1>(out), cuda::std::get<2>(out), a_, max_iters_, tol_, ex);
208208
}
209209

210210
static __MATX_INLINE__ constexpr __MATX_HOST__ __MATX_DEVICE__ int32_t Rank()

‎include/matx/transforms/cov.h

+14-10
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@
3939
#include "matx/core/error.h"
4040
#include "matx/core/nvtx.h"
4141
#include "matx/core/tensor.h"
42-
#include "matx/transforms/matmul.h"
42+
#include "matx/transforms/matmul/matmul_cuda.h"
4343
#include "matx/transforms/transpose.h"
4444

4545
namespace matx {
@@ -137,16 +137,18 @@ template <typename TensorTypeC, typename TensorTypeA> class matxCovHandle_t {
137137
* Output covariance matrix
138138
* @param a
139139
* Input tensor A
140-
* @param stream
141-
* CUDA stream
140+
* @param exec
141+
* CUDA executor
142142
*
143143
*/
144144
inline void Exec(TensorTypeC &c, const TensorTypeA &a,
145-
cudaStream_t stream)
145+
const cudaExecutor &exec)
146146
{
147147
MATX_NVTX_START("", matx::MATX_NVTX_LOG_INTERNAL)
148+
const auto stream = exec.getStream();
149+
148150
// Calculate a matrix of means
149-
matmul_impl(means, onesM, a, stream,
151+
matmul_impl(means, onesM, a, exec,
150152
1.0f / static_cast<float>(a.Size(RANK - 2)));
151153

152154
// Subtract the means from the observations to get the deviations
@@ -165,7 +167,7 @@ template <typename TensorTypeC, typename TensorTypeA> class matxCovHandle_t {
165167
}
166168

167169
// Multiply by itself and scale by N-1 for the final covariance
168-
matmul_impl(c, devsT, devs, stream,
170+
matmul_impl(c, devsT, devs, exec,
169171
1.0f / static_cast<float>(a.Size(RANK - 2) - 1));
170172
}
171173

@@ -224,14 +226,16 @@ using cov_cache_t = std::unordered_map<CovParams_t, std::any, CovParamsKeyHash,
224226
* Covariance matrix output view
225227
* @param a
226228
* Covariance matrix input view
227-
* @param stream
228-
* CUDA stream
229+
* @param exec
230+
* CUDA executor
229231
*/
230232
template <typename TensorTypeC, typename TensorTypeA>
231233
void cov_impl(TensorTypeC &c, const TensorTypeA &a,
232-
cudaStream_t stream = 0)
234+
const cudaExecutor &exec)
233235
{
234236
MATX_NVTX_START("", matx::MATX_NVTX_LOG_API)
237+
const auto stream = exec.getStream();
238+
235239
// Get parameters required by these tensors
236240
auto params = detail::matxCovHandle_t<TensorTypeC, TensorTypeA>::GetCovParams(c, a, stream);
237241

@@ -243,7 +247,7 @@ void cov_impl(TensorTypeC &c, const TensorTypeA &a,
243247
return std::make_shared<cache_val_type>(c, a);
244248
},
245249
[&](std::shared_ptr<cache_val_type> ctype) {
246-
ctype->Exec(c, a, stream);
250+
ctype->Exec(c, a, exec);
247251
}
248252
);
249253
}

0 commit comments

Comments (0)
Please sign in to comment.