NVIDIA · tmartin-gh · Mar 5, 2025 · Mar 3, 2025 · Mar 4, 2025 · Mar 4, 2025
diff --git a/docs_input/api/signalimage/general/pwelch.rst b/docs_input/api/signalimage/general/pwelch.rst
@@ -5,8 +5,8 @@ pwelch
 
 Estimate the power spectral density of a signal using Welch's method [1]_
 
-.. doxygenfunction:: pwelch(const xType& x, const wType& w, index_t nperseg, index_t noverlap, index_t nfft)
-.. doxygenfunction:: pwelch(const xType& x, index_t nperseg, index_t noverlap, index_t nfft)
+.. doxygenfunction:: pwelch(const xType& x, const wType& w, index_t nperseg, index_t noverlap, index_t nfft, PwelchOutputScaleMode output_scale_mode, fsType fs)
+.. doxygenfunction:: pwelch(const xType& x, index_t nperseg, index_t noverlap, index_t nfft, PwelchOutputScaleMode output_scale_mode, fsType fs)
 
 Examples
 ~~~~~~~~

diff --git a/examples/pwelch.cu b/examples/pwelch.cu
@@ -50,18 +50,15 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
   MATX_ENTER_HANDLER();
   using complex = cuda::std::complex<float>;
 
-  float exec_time_ms;
-  const int num_iterations = 100;
-  index_t signal_size = 256;
-  index_t nperseg = 32;
-  index_t nfft = nperseg;
-  index_t noverlap = 8;
-  float ftone = 3.0;
+  const int num_iterations = 500;
+  index_t signal_size = 256000;
+  index_t nperseg = 512;
+  index_t noverlap = 256;
+  index_t nfft = 65536;
+
+  float ftone = 2048.0;
   cudaStream_t stream;
   cudaStreamCreate(&stream);
-  cudaEvent_t start, stop;
-  cudaEventCreate(&start);
-  cudaEventCreate(&stop);
   cudaExecutor exec{stream};
 
   // Create input signal as a complex exponential
@@ -71,31 +68,30 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
   auto x = make_tensor<complex>({signal_size});
   (x = tmp_x).run(exec); // pre-compute x, tmp_x is otherwise lazily evaluated
 
+  // Create window
+  auto w = make_tensor<float>({nperseg});
+  (w = flattop<0>({nperseg})).run(exec);
+
   // Create output tensor
   auto Pxx  = make_tensor<typename complex::value_type>({nfft});
 
   // Run one time to pre-cache the FFT plan
-  (Pxx = pwelch(x, nperseg, noverlap, nfft)).run(exec);
+  (Pxx = pwelch(x, w, nperseg, noverlap, nfft)).run(exec);
   exec.sync();
 
   // Start the timing
-  cudaEventRecord(start, stream);
-
-  // Start the timing
-  cudaEventRecord(start, stream);
+  exec.start_timer();
 
   for (int iteration = 0; iteration < num_iterations; iteration++) {
     // Use the PWelch operator
-    (Pxx = pwelch(x, nperseg, noverlap, nfft)).run(exec);
+    (Pxx = pwelch(x, w, nperseg, noverlap, nfft)).run(exec);
   }
-
-  cudaEventRecord(stop, stream);
   exec.sync();
-  cudaEventElapsedTime(&exec_time_ms, start, stop);
+  exec.stop_timer();
 
-  printf("Output Pxx:\n");
-  print(Pxx);
-  printf("PWelchOp avg runtime = %.3f ms\n", exec_time_ms / num_iterations);
+  printf("Pxx(0) = %f\n", Pxx(0));
+  printf("Pxx(ftone) = %f\n", Pxx(2048));
+  printf("PWelchOp avg runtime = %.3f ms\n", exec.get_time_ms() / num_iterations);
 
   MATX_CUDA_CHECK_LAST_ERROR();
   MATX_EXIT_HANDLER();

diff --git a/include/matx/kernels/pwelch.cuh b/include/matx/kernels/pwelch.cuh
@@ -0,0 +1,93 @@
+////////////////////////////////////////////////////////////////////////////////
+// BSD 3-Clause License
+//
+// Copyright (c) 2023, NVIDIA Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice, this
+//    list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright notice,
+//    this list of conditions and the following disclaimer in the documentation
+//    and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the copyright holder nor the names of its
+//    contributors may be used to endorse or promote products derived from
+//    this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+/////////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+namespace matx {
+
+  enum PwelchOutputScaleMode {
+    PwelchOutputScaleMode_Spectrum,
+    PwelchOutputScaleMode_Density,
+    PwelchOutputScaleMode_Spectrum_dB,
+    PwelchOutputScaleMode_Density_dB
+  };
+
+  namespace detail {
+
+#ifdef __CUDACC__
+    template<PwelchOutputScaleMode OUTPUT_SCALE_MODE, typename T_IN, typename T_OUT, typename fsType>
+    __global__ void pwelch_kernel(const T_IN t_in, T_OUT t_out, fsType fs)
+    {
+      const index_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+      const index_t batches = t_in.Shape()[0];
+      const index_t nfft = t_in.Shape()[1];
+
+      if (tid < nfft)
+      {
+        typename T_OUT::value_type pxx = 0;
+        constexpr typename T_OUT::value_type ten = 10;
+
+        for (index_t batch = 0; batch < batches; batch++)
+        {
+          pxx += cuda::std::norm(t_in(batch, tid));
+        }
+
+        if constexpr (OUTPUT_SCALE_MODE == PwelchOutputScaleMode_Spectrum) {
+          t_out(tid) = pxx / batches;
+        }
+        else if constexpr (OUTPUT_SCALE_MODE == PwelchOutputScaleMode_Density) {
+          t_out(tid) = pxx / (batches * fs);
+        }
+        else if constexpr (OUTPUT_SCALE_MODE == PwelchOutputScaleMode_Spectrum_dB) {
+          pxx /= batches;
+          if (pxx != 0) {
+            t_out(tid) = ten * cuda::std::log10(pxx);
+          }
+          else {
+            t_out(tid) = cuda::std::numeric_limits<typename T_OUT::value_type>::lowest();
+          }
+        }
+        else if constexpr (OUTPUT_SCALE_MODE == PwelchOutputScaleMode_Density_dB) {
+          pxx /= (batches * fs);
+          if (pxx != 0) {
+            t_out(tid) = ten * cuda::std::log10(pxx);
+          }
+          else {
+            t_out(tid) = cuda::std::numeric_limits<typename T_OUT::value_type>::lowest();
+          }
+        }
+      }
+    }
+#endif
+
+  };
+};
diff --git a/include/matx/operators/pwelch.h b/include/matx/operators/pwelch.h
@@ -40,35 +40,38 @@
 namespace matx
 {
   namespace detail {
-    template <typename OpX, typename OpW>
-    class PWelchOp : public BaseOp<PWelchOp<OpX,OpW>>
+    template <typename OpX, typename OpW, typename fsType>
+    class PWelchOp : public BaseOp<PWelchOp<OpX,OpW,fsType>>
     {
-      private:
-        typename detail::base_type_t<OpX> x_;
-        typename detail::base_type_t<OpW> w_;
-
-        index_t nperseg_;
-        index_t noverlap_;
-        index_t nfft_;
-        cuda::std::array<index_t, 1> out_dims_;
-        mutable detail::tensor_impl_t<typename remove_cvref_t<OpX>::value_type, 1> tmp_out_;
-        mutable typename remove_cvref_t<OpX>::value_type *ptr = nullptr; 
-
       public:
+        static_assert(is_complex_v<typename OpX::value_type>, "pwelch() must have a complex input type");
         using matxop = bool;
         using value_type = typename OpX::value_type::value_type;
         using matx_transform_op = bool;
         using pwelch_xform_op = bool;
 
-        static_assert(is_complex_v<typename OpX::value_type>, "pwelch() must have a complex input type");
 
         __MATX_INLINE__ std::string str() const {
           return "pwelch(" + get_type_str(x_) + "," + get_type_str(w_) + ")";
         }
 
-        __MATX_INLINE__ PWelchOp(const OpX &x, const OpW &w, index_t nperseg, index_t noverlap, index_t nfft) :
-              x_(x), w_(w), nperseg_(nperseg), noverlap_(noverlap), nfft_(nfft) {
-
+        __MATX_INLINE__ PWelchOp(
+              const OpX &x,
+              const OpW &w,
+              index_t nperseg,
+              index_t noverlap,
+              index_t nfft,
+              PwelchOutputScaleMode output_scale_mode,
+              fsType fs
+          ) :
+              x_(x),
+              w_(w),
+              nperseg_(nperseg),
+              noverlap_(noverlap),
+              nfft_(nfft),
+              output_scale_mode_(output_scale_mode),
+              fs_(fs)
+        {
           MATX_STATIC_ASSERT_STR(OpX::Rank() == 1, matxInvalidDim, "pwelch:  Only input rank of 1 is supported presently");
           for (int r = 0; r < OpX::Rank(); r++) {
             out_dims_[r] = nfft_;
@@ -96,25 +99,25 @@ namespace matx
         template <typename Out, typename Executor>
         void Exec(Out &&out, Executor &&ex)  const{
           static_assert(is_cuda_executor_v<Executor>, "pwelch() only supports the CUDA executor currently");
-          pwelch_impl(cuda::std::get<0>(out), x_, w_, nperseg_, noverlap_, nfft_, ex.getStream());
+          pwelch_impl(cuda::std::get<0>(out), x_, w_, nperseg_, noverlap_, nfft_, output_scale_mode_, fs_, ex.getStream());
         }
 
         template <typename ShapeType, typename Executor>
         __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept
         {
           if constexpr (is_matx_op<OpX>()) {
             x_.PreRun(std::forward<ShapeType>(shape), std::forward<Executor>(ex));
-          } 
+          }
 
           if constexpr (is_matx_op<OpW>()) {
             w_.PreRun(Shape(w_), std::forward<Executor>(ex));
-          }                 
-        }      
+          }
+        }
 
         template <typename ShapeType, typename Executor>
         __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept
         {
-          InnerPreRun(std::forward<ShapeType>(shape), std::forward<Executor>(ex));    
+          InnerPreRun(std::forward<ShapeType>(shape), std::forward<Executor>(ex));
 
           detail::AllocateTempTensor(tmp_out_, std::forward<Executor>(ex), out_dims_, &ptr);
 
@@ -133,7 +136,20 @@ namespace matx
           }
 
           matxFree(ptr);
-        }               
+        }
+
+      private:
+        typename detail::base_type_t<OpX> x_;
+        typename detail::base_type_t<OpW> w_;
+
+        index_t nperseg_;
+        index_t noverlap_;
+        index_t nfft_;
+        PwelchOutputScaleMode output_scale_mode_;
+        fsType fs_;
+        cuda::std::array<index_t, 1> out_dims_;
+        mutable detail::tensor_impl_t<typename remove_cvref_t<OpX>::value_type, 1> tmp_out_;
+        mutable typename remove_cvref_t<OpX>::value_type *ptr = nullptr;
     };
   }
 
@@ -144,6 +160,8 @@ namespace matx
    *   Input time domain data type
    * @tparam wType
    *   Input window type
+   * @tparam fsType
+   *   Sampling frequency type
    * @param x
    *   Input time domain tensor
    * @param w
@@ -154,22 +172,69 @@ namespace matx
    *   Number of points to overlap between segments.  Defaults to 0
    * @param nfft
    *   Length of FFT used per segment.  nfft >= nperseg.  Defaults to nfft = nperseg
+   * @param output_scale_mode
+   *   Output scale mode.  Defaults to PwelchOutputScaleMode_Spectrum
+   * @param fs
+   *   Sampling frequency.  Defaults to 1
    *
    * @returns Operator with power spectral density of x
    *
    */
 
-  template <typename xType, typename wType>
-    __MATX_INLINE__ auto pwelch(const xType& x, const wType& w, index_t nperseg, index_t noverlap, index_t nfft)
+  template <
+      typename xType,
+      typename wType,
+      typename fsType>
+    __MATX_INLINE__ auto pwelch(
+        const xType& x,
+        const wType& w,
+        index_t nperseg,
+        index_t noverlap,
+        index_t nfft,
+        PwelchOutputScaleMode output_scale_mode = PwelchOutputScaleMode_Spectrum,
+        fsType fs = 1
+    )
+  {
+    return detail::PWelchOp(x, w, nperseg, noverlap, nfft, output_scale_mode, fs);
+  }
+
+  template <
+      typename xType,
+      typename fsType>
+    __MATX_INLINE__ auto pwelch(
+        const xType& x,
+        index_t nperseg,
+        index_t noverlap,
+        index_t nfft,
+        PwelchOutputScaleMode output_scale_mode = PwelchOutputScaleMode_Spectrum,
+        fsType fs = 1
+    )
   {
-    MATX_NVTX_START("", matx::MATX_NVTX_LOG_API)
+    return detail::PWelchOp(x, std::nullopt, nperseg, noverlap, nfft, output_scale_mode, fs);
+  }
 
-    return detail::PWelchOp(x, w, nperseg, noverlap, nfft);
+  template <typename xType, typename wType>
+    __MATX_INLINE__ auto pwelch(
+        const xType& x,
+        const wType& w,
+        index_t nperseg,
+        index_t noverlap,
+        index_t nfft,
+        PwelchOutputScaleMode output_scale_mode = PwelchOutputScaleMode_Spectrum
+    )
+  {
+    return detail::PWelchOp(x, w, nperseg, noverlap, nfft, output_scale_mode, 1.f);
   }
 
   template <typename xType>
-    __MATX_INLINE__ auto pwelch(const xType& x, index_t nperseg, index_t noverlap, index_t nfft)
+    __MATX_INLINE__ auto pwelch(
+        const xType& x,
+        index_t nperseg,
+        index_t noverlap,
+        index_t nfft,
+        PwelchOutputScaleMode output_scale_mode = PwelchOutputScaleMode_Spectrum
+    )
   {
-    return detail::PWelchOp(x, std::nullopt, nperseg, noverlap, nfft);
+    return detail::PWelchOp(x, std::nullopt, nperseg, noverlap, nfft, output_scale_mode, 1.f);
   }
 }