Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add configurable Pwelch scaling and improve performance #897

Merged
merged 4 commits into from
Mar 5, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions docs_input/api/signalimage/general/pwelch.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ pwelch

Estimate the power spectral density of a signal using Welch's method [1]_

.. doxygenfunction:: pwelch(const xType& x, const wType& w, index_t nperseg, index_t noverlap, index_t nfft)
.. doxygenfunction:: pwelch(const xType& x, index_t nperseg, index_t noverlap, index_t nfft)
.. doxygenfunction:: pwelch(const xType& x, const wType& w, index_t nperseg, index_t noverlap, index_t nfft, PwelchOutputScaleMode output_scale_mode, fsType fs)
.. doxygenfunction:: pwelch(const xType& x, index_t nperseg, index_t noverlap, index_t nfft, PwelchOutputScaleMode output_scale_mode, fsType fs)

Examples
~~~~~~~~
Expand Down
40 changes: 18 additions & 22 deletions examples/pwelch.cu
Original file line number Diff line number Diff line change
Expand Up @@ -50,18 +50,15 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
MATX_ENTER_HANDLER();
using complex = cuda::std::complex<float>;

float exec_time_ms;
const int num_iterations = 100;
index_t signal_size = 256;
index_t nperseg = 32;
index_t nfft = nperseg;
index_t noverlap = 8;
float ftone = 3.0;
const int num_iterations = 500;
index_t signal_size = 256000;
index_t nperseg = 512;
index_t noverlap = 256;
index_t nfft = 65536;

float ftone = 2048.0;
cudaStream_t stream;
cudaStreamCreate(&stream);
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaExecutor exec{stream};

// Create input signal as a complex exponential
Expand All @@ -71,31 +68,30 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
auto x = make_tensor<complex>({signal_size});
(x = tmp_x).run(exec); // pre-compute x, tmp_x is otherwise lazily evaluated

// Create window
auto w = make_tensor<float>({nperseg});
(w = flattop<0>({nperseg})).run(exec);

// Create output tensor
auto Pxx = make_tensor<typename complex::value_type>({nfft});

// Run one time to pre-cache the FFT plan
(Pxx = pwelch(x, nperseg, noverlap, nfft)).run(exec);
(Pxx = pwelch(x, w, nperseg, noverlap, nfft)).run(exec);
exec.sync();

// Start the timing
cudaEventRecord(start, stream);

// Start the timing
cudaEventRecord(start, stream);
exec.start_timer();

for (int iteration = 0; iteration < num_iterations; iteration++) {
// Use the PWelch operator
(Pxx = pwelch(x, nperseg, noverlap, nfft)).run(exec);
(Pxx = pwelch(x, w, nperseg, noverlap, nfft)).run(exec);
}

cudaEventRecord(stop, stream);
exec.sync();
cudaEventElapsedTime(&exec_time_ms, start, stop);
exec.stop_timer();

printf("Output Pxx:\n");
print(Pxx);
printf("PWelchOp avg runtime = %.3f ms\n", exec_time_ms / num_iterations);
printf("Pxx(0) = %f\n", Pxx(0));
printf("Pxx(ftone) = %f\n", Pxx(2048));
printf("PWelchOp avg runtime = %.3f ms\n", exec.get_time_ms() / num_iterations);

MATX_CUDA_CHECK_LAST_ERROR();
MATX_EXIT_HANDLER();
Expand Down
93 changes: 93 additions & 0 deletions include/matx/kernels/pwelch.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
////////////////////////////////////////////////////////////////////////////////
// BSD 3-Clause License
//
// Copyright (c) 2023, NVIDIA Corporation
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// 1. Redistributions of source code must retain the above copyright notice, this
// list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// 3. Neither the name of the copyright holder nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/////////////////////////////////////////////////////////////////////////////////

#pragma once

namespace matx {

enum PwelchOutputScaleMode {
PwelchOutputScaleMode_Spectrum,
PwelchOutputScaleMode_Density,
PwelchOutputScaleMode_Spectrum_dB,
PwelchOutputScaleMode_Density_dB
};

namespace detail {

#ifdef __CUDACC__
template<PwelchOutputScaleMode OUTPUT_SCALE_MODE, typename T_IN, typename T_OUT, typename fsType>
__global__ void pwelch_kernel(const T_IN t_in, T_OUT t_out, fsType fs)
{
const index_t tid = blockIdx.x * blockDim.x + threadIdx.x;
const index_t batches = t_in.Shape()[0];
const index_t nfft = t_in.Shape()[1];

if (tid < nfft)
{
typename T_OUT::value_type pxx = 0;
constexpr typename T_OUT::value_type ten = 10;

for (index_t batch = 0; batch < batches; batch++)
{
pxx += cuda::std::norm(t_in(batch, tid));
}

if constexpr (OUTPUT_SCALE_MODE == PwelchOutputScaleMode_Spectrum) {
t_out(tid) = pxx / batches;
}
else if constexpr (OUTPUT_SCALE_MODE == PwelchOutputScaleMode_Density) {
t_out(tid) = pxx / (batches * fs);
}
else if constexpr (OUTPUT_SCALE_MODE == PwelchOutputScaleMode_Spectrum_dB) {
pxx /= batches;
if (pxx != 0) {
t_out(tid) = ten * cuda::std::log10(pxx);
}
else {
t_out(tid) = cuda::std::numeric_limits<typename T_OUT::value_type>::lowest();
}
}
else if constexpr (OUTPUT_SCALE_MODE == PwelchOutputScaleMode_Density_dB) {
pxx /= (batches * fs);
if (pxx != 0) {
t_out(tid) = ten * cuda::std::log10(pxx);
}
else {
t_out(tid) = cuda::std::numeric_limits<typename T_OUT::value_type>::lowest();
}
}
}
}
#endif

};
};
123 changes: 94 additions & 29 deletions include/matx/operators/pwelch.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,35 +40,38 @@
namespace matx
{
namespace detail {
template <typename OpX, typename OpW>
class PWelchOp : public BaseOp<PWelchOp<OpX,OpW>>
template <typename OpX, typename OpW, typename fsType>
class PWelchOp : public BaseOp<PWelchOp<OpX,OpW,fsType>>
{
private:
typename detail::base_type_t<OpX> x_;
typename detail::base_type_t<OpW> w_;

index_t nperseg_;
index_t noverlap_;
index_t nfft_;
cuda::std::array<index_t, 1> out_dims_;
mutable detail::tensor_impl_t<typename remove_cvref_t<OpX>::value_type, 1> tmp_out_;
mutable typename remove_cvref_t<OpX>::value_type *ptr = nullptr;

public:
static_assert(is_complex_v<typename OpX::value_type>, "pwelch() must have a complex input type");
using matxop = bool;
using value_type = typename OpX::value_type::value_type;
using matx_transform_op = bool;
using pwelch_xform_op = bool;

static_assert(is_complex_v<typename OpX::value_type>, "pwelch() must have a complex input type");

__MATX_INLINE__ std::string str() const {
return "pwelch(" + get_type_str(x_) + "," + get_type_str(w_) + ")";
}

__MATX_INLINE__ PWelchOp(const OpX &x, const OpW &w, index_t nperseg, index_t noverlap, index_t nfft) :
x_(x), w_(w), nperseg_(nperseg), noverlap_(noverlap), nfft_(nfft) {

__MATX_INLINE__ PWelchOp(
const OpX &x,
const OpW &w,
index_t nperseg,
index_t noverlap,
index_t nfft,
PwelchOutputScaleMode output_scale_mode,
fsType fs
) :
x_(x),
w_(w),
nperseg_(nperseg),
noverlap_(noverlap),
nfft_(nfft),
output_scale_mode_(output_scale_mode),
fs_(fs)
{
MATX_STATIC_ASSERT_STR(OpX::Rank() == 1, matxInvalidDim, "pwelch: Only input rank of 1 is supported presently");
for (int r = 0; r < OpX::Rank(); r++) {
out_dims_[r] = nfft_;
Expand Down Expand Up @@ -96,25 +99,25 @@ namespace matx
template <typename Out, typename Executor>
void Exec(Out &&out, Executor &&ex) const{
static_assert(is_cuda_executor_v<Executor>, "pwelch() only supports the CUDA executor currently");
pwelch_impl(cuda::std::get<0>(out), x_, w_, nperseg_, noverlap_, nfft_, ex.getStream());
pwelch_impl(cuda::std::get<0>(out), x_, w_, nperseg_, noverlap_, nfft_, output_scale_mode_, fs_, ex.getStream());
}

template <typename ShapeType, typename Executor>
__MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept
{
if constexpr (is_matx_op<OpX>()) {
x_.PreRun(std::forward<ShapeType>(shape), std::forward<Executor>(ex));
}
}

if constexpr (is_matx_op<OpW>()) {
w_.PreRun(Shape(w_), std::forward<Executor>(ex));
}
}
}
}

template <typename ShapeType, typename Executor>
__MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept
{
InnerPreRun(std::forward<ShapeType>(shape), std::forward<Executor>(ex));
InnerPreRun(std::forward<ShapeType>(shape), std::forward<Executor>(ex));

detail::AllocateTempTensor(tmp_out_, std::forward<Executor>(ex), out_dims_, &ptr);

Expand All @@ -133,7 +136,20 @@ namespace matx
}

matxFree(ptr);
}
}

private:
typename detail::base_type_t<OpX> x_;
typename detail::base_type_t<OpW> w_;

index_t nperseg_;
index_t noverlap_;
index_t nfft_;
PwelchOutputScaleMode output_scale_mode_;
fsType fs_;
cuda::std::array<index_t, 1> out_dims_;
mutable detail::tensor_impl_t<typename remove_cvref_t<OpX>::value_type, 1> tmp_out_;
mutable typename remove_cvref_t<OpX>::value_type *ptr = nullptr;
};
}

Expand All @@ -144,6 +160,8 @@ namespace matx
* Input time domain data type
* @tparam wType
* Input window type
* @tparam fsType
* Sampling frequency type
* @param x
* Input time domain tensor
* @param w
Expand All @@ -154,22 +172,69 @@ namespace matx
* Number of points to overlap between segments. Defaults to 0
* @param nfft
* Length of FFT used per segment. nfft >= nperseg. Defaults to nfft = nperseg
* @param output_scale_mode
* Output scale mode. Defaults to PwelchOutputScaleMode_Spectrum
* @param fs
* Sampling frequency. Defaults to 1
*
* @returns Operator with power spectral density of x
*
*/

template <typename xType, typename wType>
__MATX_INLINE__ auto pwelch(const xType& x, const wType& w, index_t nperseg, index_t noverlap, index_t nfft)
template <
typename xType,
typename wType,
typename fsType>
__MATX_INLINE__ auto pwelch(
const xType& x,
const wType& w,
index_t nperseg,
index_t noverlap,
index_t nfft,
PwelchOutputScaleMode output_scale_mode = PwelchOutputScaleMode_Spectrum,
fsType fs = 1
)
{
return detail::PWelchOp(x, w, nperseg, noverlap, nfft, output_scale_mode, fs);
}

template <
typename xType,
typename fsType>
__MATX_INLINE__ auto pwelch(
const xType& x,
index_t nperseg,
index_t noverlap,
index_t nfft,
PwelchOutputScaleMode output_scale_mode = PwelchOutputScaleMode_Spectrum,
fsType fs = 1
)
{
MATX_NVTX_START("", matx::MATX_NVTX_LOG_API)
return detail::PWelchOp(x, std::nullopt, nperseg, noverlap, nfft, output_scale_mode, fs);
}

return detail::PWelchOp(x, w, nperseg, noverlap, nfft);
template <typename xType, typename wType>
__MATX_INLINE__ auto pwelch(
const xType& x,
const wType& w,
index_t nperseg,
index_t noverlap,
index_t nfft,
PwelchOutputScaleMode output_scale_mode = PwelchOutputScaleMode_Spectrum
)
{
return detail::PWelchOp(x, w, nperseg, noverlap, nfft, output_scale_mode, 1.f);
}

template <typename xType>
__MATX_INLINE__ auto pwelch(const xType& x, index_t nperseg, index_t noverlap, index_t nfft)
__MATX_INLINE__ auto pwelch(
const xType& x,
index_t nperseg,
index_t noverlap,
index_t nfft,
PwelchOutputScaleMode output_scale_mode = PwelchOutputScaleMode_Spectrum
)
{
return detail::PWelchOp(x, std::nullopt, nperseg, noverlap, nfft);
return detail::PWelchOp(x, std::nullopt, nperseg, noverlap, nfft, output_scale_mode, 1.f);
}
}
Loading