Add configurable scaling modes for pwelch, using custom reduction kernel that performs better than CUB when in memory FFT bin powers are {batches, nfft}

tmartin-gh · tmartin-gh · commit 7c494187ccb6 · 2025-03-03T14:37:55.000-08:00
diff --git a/examples/pwelch.cu b/examples/pwelch.cu
@@ -50,18 +50,15 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
   MATX_ENTER_HANDLER();
   using complex = cuda::std::complex<float>;
 
-  float exec_time_ms;
-  const int num_iterations = 100;
-  index_t signal_size = 256;
-  index_t nperseg = 32;
-  index_t nfft = nperseg;
-  index_t noverlap = 8;
-  float ftone = 3.0;
+  const int num_iterations = 500;
+  index_t signal_size = 256000;
+  index_t nperseg = 512;
+  index_t noverlap = 256;
+  index_t nfft = 65536;
+
+  float ftone = 2048.0;
   cudaStream_t stream;
   cudaStreamCreate(&stream);
-  cudaEvent_t start, stop;
-  cudaEventCreate(&start);
-  cudaEventCreate(&stop);
   cudaExecutor exec{stream};
 
   // Create input signal as a complex exponential
@@ -71,31 +68,30 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
   auto x = make_tensor<complex>({signal_size});
   (x = tmp_x).run(exec); // pre-compute x, tmp_x is otherwise lazily evaluated
 
+  // Create window
+  auto w = make_tensor<complex>({nperseg});
+  (w = flattop<0>({nperseg})).run(exec);
+
   // Create output tensor
   auto Pxx  = make_tensor<typename complex::value_type>({nfft});
 
   // Run one time to pre-cache the FFT plan
-  (Pxx = pwelch(x, nperseg, noverlap, nfft)).run(exec);
+  (Pxx = pwelch(x, w, nperseg, noverlap, nfft)).run(exec);
   exec.sync();
 
   // Start the timing
-  cudaEventRecord(start, stream);
-
-  // Start the timing
-  cudaEventRecord(start, stream);
+  exec.start_timer();
 
   for (int iteration = 0; iteration < num_iterations; iteration++) {
     // Use the PWelch operator
-    (Pxx = pwelch(x, nperseg, noverlap, nfft)).run(exec);
+    (Pxx = pwelch(x, w, nperseg, noverlap, nfft)).run(exec);
   }
-
-  cudaEventRecord(stop, stream);
   exec.sync();
-  cudaEventElapsedTime(&exec_time_ms, start, stop);
+  exec.stop_timer();
 
-  printf("Output Pxx:\n");
-  print(Pxx);
-  printf("PWelchOp avg runtime = %.3f ms\n", exec_time_ms / num_iterations);
+  printf("Pxx(0) = %f\n", Pxx(0));
+  printf("Pxx(ftone) = %f\n", Pxx(2048));
+  printf("PWelchOp avg runtime = %.3f ms\n", exec.get_time_ms() / num_iterations);
 
   MATX_CUDA_CHECK_LAST_ERROR();
   MATX_EXIT_HANDLER();
diff --git a/include/matx/operators/pwelch.h b/include/matx/operators/pwelch.h
@@ -43,17 +43,6 @@ namespace matx
     template <typename OpX, typename OpW>
     class PWelchOp : public BaseOp<PWelchOp<OpX,OpW>>
     {
-      private:
-        typename detail::base_type_t<OpX> x_;
-        typename detail::base_type_t<OpW> w_;
-
-        index_t nperseg_;
-        index_t noverlap_;
-        index_t nfft_;
-        cuda::std::array<index_t, 1> out_dims_;
-        mutable detail::tensor_impl_t<typename remove_cvref_t<OpX>::value_type, 1> tmp_out_;
-        mutable typename remove_cvref_t<OpX>::value_type *ptr = nullptr; 
-
       public:
         using matxop = bool;
         using value_type = typename OpX::value_type::value_type;
@@ -66,9 +55,23 @@ namespace matx
           return "pwelch(" + get_type_str(x_) + "," + get_type_str(w_) + ")";
         }
 
-        __MATX_INLINE__ PWelchOp(const OpX &x, const OpW &w, index_t nperseg, index_t noverlap, index_t nfft) :
-              x_(x), w_(w), nperseg_(nperseg), noverlap_(noverlap), nfft_(nfft) {
-
+        __MATX_INLINE__ PWelchOp(
+              const OpX &x,
+              const OpW &w,
+              index_t nperseg,
+              index_t noverlap,
+              index_t nfft,
+              PwelchOutputScaleMode output_scale_mode,
+              value_type fs
+          ) :
+              x_(x),
+              w_(w),
+              nperseg_(nperseg),
+              noverlap_(noverlap),
+              nfft_(nfft),
+              output_scale_mode_(output_scale_mode),
+              fs_(fs)
+        {
           MATX_STATIC_ASSERT_STR(OpX::Rank() == 1, matxInvalidDim, "pwelch:  Only input rank of 1 is supported presently");
           for (int r = 0; r < OpX::Rank(); r++) {
             out_dims_[r] = nfft_;
@@ -96,25 +99,25 @@ namespace matx
         template <typename Out, typename Executor>
         void Exec(Out &&out, Executor &&ex)  const{
           static_assert(is_cuda_executor_v<Executor>, "pwelch() only supports the CUDA executor currently");
-          pwelch_impl(cuda::std::get<0>(out), x_, w_, nperseg_, noverlap_, nfft_, ex.getStream());
+          pwelch_impl(cuda::std::get<0>(out), x_, w_, nperseg_, noverlap_, nfft_, output_scale_mode_, fs_, ex.getStream());
         }
 
         template <typename ShapeType, typename Executor>
         __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept
         {
           if constexpr (is_matx_op<OpX>()) {
             x_.PreRun(std::forward<ShapeType>(shape), std::forward<Executor>(ex));
-          } 
+          }
 
           if constexpr (is_matx_op<OpW>()) {
             w_.PreRun(Shape(w_), std::forward<Executor>(ex));
-          }                 
-        }      
+          }
+        }
 
         template <typename ShapeType, typename Executor>
         __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept
         {
-          InnerPreRun(std::forward<ShapeType>(shape), std::forward<Executor>(ex));    
+          InnerPreRun(std::forward<ShapeType>(shape), std::forward<Executor>(ex));
 
           detail::AllocateTempTensor(tmp_out_, std::forward<Executor>(ex), out_dims_, &ptr);
 
@@ -133,7 +136,20 @@ namespace matx
           }
 
           matxFree(ptr);
-        }               
+        }
+
+      private:
+        typename detail::base_type_t<OpX> x_;
+        typename detail::base_type_t<OpW> w_;
+
+        index_t nperseg_;
+        index_t noverlap_;
+        index_t nfft_;
+        PwelchOutputScaleMode output_scale_mode_;
+        value_type fs_;
+        cuda::std::array<index_t, 1> out_dims_;
+        mutable detail::tensor_impl_t<typename remove_cvref_t<OpX>::value_type, 1> tmp_out_;
+        mutable typename remove_cvref_t<OpX>::value_type *ptr = nullptr;
     };
   }
 
@@ -154,22 +170,41 @@ namespace matx
    *   Number of points to overlap between segments.  Defaults to 0
    * @param nfft
    *   Length of FFT used per segment.  nfft >= nperseg.  Defaults to nfft = nperseg
+   * @param output_scale_mode
+   *   Output scale mode.  Defaults to PwelchOutputScaleMode_Spectrum
+   * @param fs
+   *   Sampling frequency.  Defaults to 1
    *
    * @returns Operator with power spectral density of x
    *
    */
 
   template <typename xType, typename wType>
-    __MATX_INLINE__ auto pwelch(const xType& x, const wType& w, index_t nperseg, index_t noverlap, index_t nfft)
+    __MATX_INLINE__ auto pwelch(
+        const xType& x,
+        const wType& w,
+        index_t nperseg,
+        index_t noverlap,
+        index_t nfft,
+        PwelchOutputScaleMode output_scale_mode = PwelchOutputScaleMode_Spectrum,
+        typename xType::value_type::value_type fs = 1
+    )
   {
     MATX_NVTX_START("", matx::MATX_NVTX_LOG_API)
 
-    return detail::PWelchOp(x, w, nperseg, noverlap, nfft);
+    return detail::PWelchOp(x, w, nperseg, noverlap, nfft, output_scale_mode, fs);
   }
 
   template <typename xType>
-    __MATX_INLINE__ auto pwelch(const xType& x, index_t nperseg, index_t noverlap, index_t nfft)
+    __MATX_INLINE__ auto pwelch(
+        const xType& x,
+        index_t nperseg,
+        index_t noverlap,
+        index_t nfft,
+        PwelchOutputScaleMode output_scale_mode = PwelchOutputScaleMode_Spectrum,
+        typename xType::value_type::value_type fs = 1
+    )
   {
-    return detail::PWelchOp(x, std::nullopt, nperseg, noverlap, nfft);
+    return detail::PWelchOp(x, std::nullopt, nperseg, noverlap, nfft, output_scale_mode, fs);
   }
 }
diff --git a/include/matx/transforms/pwelch.h b/include/matx/transforms/pwelch.h
@@ -34,8 +34,71 @@
 
 namespace matx
 {
+
+  enum PwelchOutputScaleMode {
+    PwelchOutputScaleMode_Spectrum,
+    PwelchOutputScaleMode_Density,
+    PwelchOutputScaleMode_Spectrum_dB,
+    PwelchOutputScaleMode_Density_dB
+  };
+
+  namespace detail {
+    template<PwelchOutputScaleMode OUTPUT_SCALE_MODE, typename T_IN, typename T_OUT>
+    __global__ void pwelch_kernel(const T_IN t_in, T_OUT t_out, typename T_OUT::value_type fs)
+    {
+      const index_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+      const index_t batches = t_in.Shape()[0];
+      const index_t nfft = t_in.Shape()[1];
+
+      if (tid < nfft)
+      {
+        typename T_OUT::value_type pxx = 0;
+        constexpr typename T_OUT::value_type ten = 10;
+
+        for (index_t batch = 0; batch < batches; batch++)
+        {
+          pxx += cuda::std::norm(t_in(batch, tid));
+        }
+
+        if constexpr (OUTPUT_SCALE_MODE == PwelchOutputScaleMode_Spectrum)
+        {
+          t_out(tid) = pxx / batches;
+        }
+        else if constexpr (OUTPUT_SCALE_MODE == PwelchOutputScaleMode_Density)
+        {
+          t_out(tid) = pxx / (batches * fs);
+        }
+        else if constexpr (OUTPUT_SCALE_MODE == PwelchOutputScaleMode_Spectrum_dB)
+        {
+          pxx /= batches;
+          if (pxx != 0)
+          {
+            t_out(tid) = ten * cuda::std::log10(pxx);
+          }
+          else
+          {
+            t_out(tid) = cuda::std::numeric_limits<typename T_OUT::value_type>::lowest();
+          }
+        }
+        else if constexpr (OUTPUT_SCALE_MODE == PwelchOutputScaleMode_Density_dB)
+        {
+          pxx /= (batches * fs);
+          if (pxx != 0)
+          {
+            t_out(tid) = ten * cuda::std::log10(pxx);
+          }
+          else
+          {
+            t_out(tid) = cuda::std::numeric_limits<typename T_OUT::value_type>::lowest();
+          }
+        }
+      }
+    }
+  };
+
+  extern int g_pwelch_alg_mode;
   template <typename PxxType, typename xType, typename wType>
-    __MATX_INLINE__ void pwelch_impl(PxxType Pxx, const xType& x, const wType& w, index_t nperseg, index_t noverlap, index_t nfft, cudaStream_t stream=0)
+    __MATX_INLINE__ void pwelch_impl(PxxType Pxx, const xType& x, const wType& w, index_t nperseg, index_t noverlap, index_t nfft, PwelchOutputScaleMode output_scale_mode, typename PxxType::value_type fs, cudaStream_t stream=0)
     {
       MATX_NVTX_START("", matx::MATX_NVTX_LOG_API)
 
@@ -59,13 +122,25 @@ namespace matx
         (X_with_overlaps = fft(x_with_overlaps * w,nfft)).run(stream);
       }
 
-      // Compute magnitude squared in-place
-      (X_with_overlaps = conj(X_with_overlaps) * X_with_overlaps).run(stream);
-      auto mag_sq_X_with_overlaps = X_with_overlaps.RealView();
+      int tpb = 512;
+      int bpk = (static_cast<int>(nfft) + tpb - 1) / tpb;
 
-      // Perform the reduction across 'batches' rows and normalize
-      auto norm_factor = static_cast<typename PxxType::value_type>(1.) / static_cast<typename PxxType::value_type>(batches);
-      (Pxx = sum(mag_sq_X_with_overlaps, {0}) * norm_factor).run(stream);
+      if (output_scale_mode == PwelchOutputScaleMode_Spectrum)
+      {
+        detail::pwelch_kernel<PwelchOutputScaleMode_Spectrum><<<bpk, tpb, 0, stream>>>(X_with_overlaps, Pxx, fs);
+      }
+      else if (output_scale_mode == PwelchOutputScaleMode_Density)
+      {
+        detail::pwelch_kernel<PwelchOutputScaleMode_Density><<<bpk, tpb, 0, stream>>>(X_with_overlaps, Pxx, fs);
+      }
+      else if (output_scale_mode == PwelchOutputScaleMode_Spectrum_dB)
+      {
+        detail::pwelch_kernel<PwelchOutputScaleMode_Spectrum_dB><<<bpk, tpb, 0, stream>>>(X_with_overlaps, Pxx, fs);
+      }
+      else //if (output_scale_mode == PwelchOutputScaleMode_Density_dB)
+      {
+        detail::pwelch_kernel<PwelchOutputScaleMode_Density_dB><<<bpk, tpb, 0, stream>>>(X_with_overlaps, Pxx, fs);
+      }
     }
 
 } // end namespace matx
diff --git a/test/00_operators/PWelch.cu b/test/00_operators/PWelch.cu
@@ -47,18 +47,23 @@ struct TestParams {
     index_t nperseg;
     index_t noverlap;
     index_t nfft;
+    PwelchOutputScaleMode output_scale_mode;
+    float fs;
     float ftone;
     float sigma;
 };
 
 const std::vector<TestParams> CONFIGS = {
-  {"none", 8, 8, 2, 8, 0., 0.},
-  {"none", 16, 8, 4, 8, 1., 0.},
-  {"none", 16, 8, 4, 8, 2., 1.},
-  {"none", 16384, 256, 64, 256, 63., 0.},
-  {"boxcar", 8, 8, 2, 8, 0., 0.},
-  {"hann", 16, 8, 4, 8, 1., 0.},
-  {"flattop", 1024, 64, 32, 128, 2., 1.},
+  {"none", 8, 8, 2, 8, PwelchOutputScaleMode_Spectrum, 1.0, 0., 0.},
+  {"none", 16, 8, 4, 8, PwelchOutputScaleMode_Spectrum, 1.0, 1., 0.},
+  {"none", 16, 8, 4, 8, PwelchOutputScaleMode_Spectrum, 1.0, 2., 1.},
+  {"none", 16384, 256, 64, 256, PwelchOutputScaleMode_Spectrum, 1.0, 63., 0.},
+  {"boxcar", 8, 8, 2, 8, PwelchOutputScaleMode_Spectrum, 1.0, 0., 0.},
+  {"hann", 16, 8, 4, 8, PwelchOutputScaleMode_Spectrum, 1.0, 1., 0.},
+  {"flattop", 1024, 64, 32, 128, PwelchOutputScaleMode_Spectrum, 2.0, 2., 1.},
+  {"flattop", 1024, 64, 32, 128, PwelchOutputScaleMode_Density, 2.0, 2., 1.},
+  {"flattop", 1024, 64, 32, 128, PwelchOutputScaleMode_Spectrum_dB, 2.0, 2., 1.},
+  {"flattop", 1024, 64, 32, 128, PwelchOutputScaleMode_Density_dB, 2.0, 2., 1.},
 };
 
 class PWelchComplexExponentialTest : public ::testing::TestWithParam<TestParams>
@@ -87,11 +92,26 @@ void helper(PWelchComplexExponentialTest& test)
     "nperseg"_a=test.params.nperseg,
     "noverlap"_a=test.params.noverlap,
     "nfft"_a=test.params.nfft,
+    "scaling"_a="spectrum",
+    "fs"_a =test.params.fs,
     "ftone"_a=test.params.ftone,
     "sigma"_a=test.params.sigma,
     "window_name"_a=test.params.window_name
   );
 
+  if (test.params.output_scale_mode == PwelchOutputScaleMode_Density)
+  {
+    cfg["scaling"] = "density";
+  }
+  else if (test.params.output_scale_mode == PwelchOutputScaleMode_Density_dB)
+  {
+    cfg["scaling"] = "density_dB";
+  }
+  else if (test.params.output_scale_mode == PwelchOutputScaleMode_Spectrum_dB)
+  {
+    cfg["scaling"] = "spectrum_dB";
+  }
+
   test.pb->template InitAndRunTVGeneratorWithCfg<TypeParam>(
       "00_operators", "pwelch_operators", "pwelch_complex_exponential", cfg);
 
@@ -104,7 +124,7 @@ void helper(PWelchComplexExponentialTest& test)
 
   if (test.params.window_name == "none")
   {
-    (Pxx = pwelch(x, test.params.nperseg, test.params.noverlap, test.params.nfft)).run(exec);
+    (Pxx = pwelch(x, test.params.nperseg, test.params.noverlap, test.params.nfft, test.params.output_scale_mode, test.params.fs)).run(exec);
   }
   else
   {
@@ -125,7 +145,7 @@ void helper(PWelchComplexExponentialTest& test)
     {
       ASSERT_TRUE(false) << "Unknown window parameter name " + test.params.window_name;
     }
-    (Pxx = pwelch(x, w, test.params.nperseg, test.params.noverlap, test.params.nfft)).run(exec);
+    (Pxx = pwelch(x, w, test.params.nperseg, test.params.noverlap, test.params.nfft, test.params.output_scale_mode, test.params.fs)).run(exec);
   }
 
   exec.sync();
diff --git a/test/test_vectors/generators/00_operators.py b/test/test_vectors/generators/00_operators.py