Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add configurable Pwelch scaling and improve performance #897

Merged
merged 4 commits into from
Mar 5, 2025
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 18 additions & 22 deletions examples/pwelch.cu
Original file line number Diff line number Diff line change
Expand Up @@ -50,18 +50,15 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
MATX_ENTER_HANDLER();
using complex = cuda::std::complex<float>;

float exec_time_ms;
const int num_iterations = 100;
index_t signal_size = 256;
index_t nperseg = 32;
index_t nfft = nperseg;
index_t noverlap = 8;
float ftone = 3.0;
const int num_iterations = 500;
index_t signal_size = 256000;
index_t nperseg = 512;
index_t noverlap = 256;
index_t nfft = 65536;

float ftone = 2048.0;
cudaStream_t stream;
cudaStreamCreate(&stream);
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaExecutor exec{stream};

// Create input signal as a complex exponential
Expand All @@ -71,31 +68,30 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
auto x = make_tensor<complex>({signal_size});
(x = tmp_x).run(exec); // pre-compute x, tmp_x is otherwise lazily evaluated

// Create window
auto w = make_tensor<complex>({nperseg});
(w = flattop<0>({nperseg})).run(exec);

// Create output tensor
auto Pxx = make_tensor<typename complex::value_type>({nfft});

// Run one time to pre-cache the FFT plan
(Pxx = pwelch(x, nperseg, noverlap, nfft)).run(exec);
(Pxx = pwelch(x, w, nperseg, noverlap, nfft)).run(exec);
exec.sync();

// Start the timing
cudaEventRecord(start, stream);

// Start the timing
cudaEventRecord(start, stream);
exec.start_timer();

for (int iteration = 0; iteration < num_iterations; iteration++) {
// Use the PWelch operator
(Pxx = pwelch(x, nperseg, noverlap, nfft)).run(exec);
(Pxx = pwelch(x, w, nperseg, noverlap, nfft)).run(exec);
}

cudaEventRecord(stop, stream);
exec.sync();
cudaEventElapsedTime(&exec_time_ms, start, stop);
exec.stop_timer();

printf("Output Pxx:\n");
print(Pxx);
printf("PWelchOp avg runtime = %.3f ms\n", exec_time_ms / num_iterations);
printf("Pxx(0) = %f\n", Pxx(0));
printf("Pxx(ftone) = %f\n", Pxx(2048));
printf("PWelchOp avg runtime = %.3f ms\n", exec.get_time_ms() / num_iterations);

MATX_CUDA_CHECK_LAST_ERROR();
MATX_EXIT_HANDLER();
Expand Down
83 changes: 59 additions & 24 deletions include/matx/operators/pwelch.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,17 +43,6 @@ namespace matx
template <typename OpX, typename OpW>
class PWelchOp : public BaseOp<PWelchOp<OpX,OpW>>
{
private:
typename detail::base_type_t<OpX> x_;
typename detail::base_type_t<OpW> w_;

index_t nperseg_;
index_t noverlap_;
index_t nfft_;
cuda::std::array<index_t, 1> out_dims_;
mutable detail::tensor_impl_t<typename remove_cvref_t<OpX>::value_type, 1> tmp_out_;
mutable typename remove_cvref_t<OpX>::value_type *ptr = nullptr;

public:
using matxop = bool;
using value_type = typename OpX::value_type::value_type;
Expand All @@ -66,9 +55,23 @@ namespace matx
return "pwelch(" + get_type_str(x_) + "," + get_type_str(w_) + ")";
}

__MATX_INLINE__ PWelchOp(const OpX &x, const OpW &w, index_t nperseg, index_t noverlap, index_t nfft) :
x_(x), w_(w), nperseg_(nperseg), noverlap_(noverlap), nfft_(nfft) {

__MATX_INLINE__ PWelchOp(
const OpX &x,
const OpW &w,
index_t nperseg,
index_t noverlap,
index_t nfft,
PwelchOutputScaleMode output_scale_mode,
value_type fs
) :
x_(x),
w_(w),
nperseg_(nperseg),
noverlap_(noverlap),
nfft_(nfft),
output_scale_mode_(output_scale_mode),
fs_(fs)
{
MATX_STATIC_ASSERT_STR(OpX::Rank() == 1, matxInvalidDim, "pwelch: Only input rank of 1 is supported presently");
for (int r = 0; r < OpX::Rank(); r++) {
out_dims_[r] = nfft_;
Expand Down Expand Up @@ -96,25 +99,25 @@ namespace matx
template <typename Out, typename Executor>
void Exec(Out &&out, Executor &&ex) const{
static_assert(is_cuda_executor_v<Executor>, "pwelch() only supports the CUDA executor currently");
pwelch_impl(cuda::std::get<0>(out), x_, w_, nperseg_, noverlap_, nfft_, ex.getStream());
pwelch_impl(cuda::std::get<0>(out), x_, w_, nperseg_, noverlap_, nfft_, output_scale_mode_, fs_, ex.getStream());
}

template <typename ShapeType, typename Executor>
__MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept
{
if constexpr (is_matx_op<OpX>()) {
x_.PreRun(std::forward<ShapeType>(shape), std::forward<Executor>(ex));
}
}

if constexpr (is_matx_op<OpW>()) {
w_.PreRun(Shape(w_), std::forward<Executor>(ex));
}
}
}
}

template <typename ShapeType, typename Executor>
__MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept
{
InnerPreRun(std::forward<ShapeType>(shape), std::forward<Executor>(ex));
InnerPreRun(std::forward<ShapeType>(shape), std::forward<Executor>(ex));

detail::AllocateTempTensor(tmp_out_, std::forward<Executor>(ex), out_dims_, &ptr);

Expand All @@ -133,7 +136,20 @@ namespace matx
}

matxFree(ptr);
}
}

private:
typename detail::base_type_t<OpX> x_;
typename detail::base_type_t<OpW> w_;

index_t nperseg_;
index_t noverlap_;
index_t nfft_;
PwelchOutputScaleMode output_scale_mode_;
value_type fs_;
cuda::std::array<index_t, 1> out_dims_;
mutable detail::tensor_impl_t<typename remove_cvref_t<OpX>::value_type, 1> tmp_out_;
mutable typename remove_cvref_t<OpX>::value_type *ptr = nullptr;
};
}

Expand All @@ -154,22 +170,41 @@ namespace matx
* Number of points to overlap between segments. Defaults to 0
* @param nfft
* Length of FFT used per segment. nfft >= nperseg. Defaults to nfft = nperseg
* @param output_scale_mode
* Output scale mode. Defaults to PwelchOutputScaleMode_Spectrum
* @param fs
* Sampling frequency. Defaults to 1
*
* @returns Operator with power spectral density of x
*
*/

template <typename xType, typename wType>
__MATX_INLINE__ auto pwelch(const xType& x, const wType& w, index_t nperseg, index_t noverlap, index_t nfft)
__MATX_INLINE__ auto pwelch(
const xType& x,
const wType& w,
index_t nperseg,
index_t noverlap,
index_t nfft,
PwelchOutputScaleMode output_scale_mode = PwelchOutputScaleMode_Spectrum,
typename xType::value_type::value_type fs = 1
)
{
MATX_NVTX_START("", matx::MATX_NVTX_LOG_API)

return detail::PWelchOp(x, w, nperseg, noverlap, nfft);
return detail::PWelchOp(x, w, nperseg, noverlap, nfft, output_scale_mode, fs);
}

template <typename xType>
__MATX_INLINE__ auto pwelch(const xType& x, index_t nperseg, index_t noverlap, index_t nfft)
__MATX_INLINE__ auto pwelch(
const xType& x,
index_t nperseg,
index_t noverlap,
index_t nfft,
PwelchOutputScaleMode output_scale_mode = PwelchOutputScaleMode_Spectrum,
typename xType::value_type::value_type fs = 1
)
{
return detail::PWelchOp(x, std::nullopt, nperseg, noverlap, nfft);
return detail::PWelchOp(x, std::nullopt, nperseg, noverlap, nfft, output_scale_mode, fs);
}
}
89 changes: 82 additions & 7 deletions include/matx/transforms/pwelch.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,71 @@

namespace matx
{

enum PwelchOutputScaleMode {
PwelchOutputScaleMode_Spectrum,
PwelchOutputScaleMode_Density,
PwelchOutputScaleMode_Spectrum_dB,
PwelchOutputScaleMode_Density_dB
};

namespace detail {
template<PwelchOutputScaleMode OUTPUT_SCALE_MODE, typename T_IN, typename T_OUT>
__global__ void pwelch_kernel(const T_IN t_in, T_OUT t_out, typename T_OUT::value_type fs)
{
const index_t tid = blockIdx.x * blockDim.x + threadIdx.x;
const index_t batches = t_in.Shape()[0];
const index_t nfft = t_in.Shape()[1];

if (tid < nfft)
{
typename T_OUT::value_type pxx = 0;
constexpr typename T_OUT::value_type ten = 10;

for (index_t batch = 0; batch < batches; batch++)
{
pxx += cuda::std::norm(t_in(batch, tid));
}

if constexpr (OUTPUT_SCALE_MODE == PwelchOutputScaleMode_Spectrum)
{
t_out(tid) = pxx / batches;
}
else if constexpr (OUTPUT_SCALE_MODE == PwelchOutputScaleMode_Density)
{
t_out(tid) = pxx / (batches * fs);
}
else if constexpr (OUTPUT_SCALE_MODE == PwelchOutputScaleMode_Spectrum_dB)
{
pxx /= batches;
if (pxx != 0)
{
t_out(tid) = ten * cuda::std::log10(pxx);
}
else
{
t_out(tid) = cuda::std::numeric_limits<typename T_OUT::value_type>::lowest();
}
}
else if constexpr (OUTPUT_SCALE_MODE == PwelchOutputScaleMode_Density_dB)
{
pxx /= (batches * fs);
if (pxx != 0)
{
t_out(tid) = ten * cuda::std::log10(pxx);
}
else
{
t_out(tid) = cuda::std::numeric_limits<typename T_OUT::value_type>::lowest();
}
}
}
}
};

extern int g_pwelch_alg_mode;
template <typename PxxType, typename xType, typename wType>
__MATX_INLINE__ void pwelch_impl(PxxType Pxx, const xType& x, const wType& w, index_t nperseg, index_t noverlap, index_t nfft, cudaStream_t stream=0)
__MATX_INLINE__ void pwelch_impl(PxxType Pxx, const xType& x, const wType& w, index_t nperseg, index_t noverlap, index_t nfft, PwelchOutputScaleMode output_scale_mode, typename PxxType::value_type fs, cudaStream_t stream=0)
{
MATX_NVTX_START("", matx::MATX_NVTX_LOG_API)

Expand All @@ -59,13 +122,25 @@ namespace matx
(X_with_overlaps = fft(x_with_overlaps * w,nfft)).run(stream);
}

// Compute magnitude squared in-place
(X_with_overlaps = conj(X_with_overlaps) * X_with_overlaps).run(stream);
auto mag_sq_X_with_overlaps = X_with_overlaps.RealView();
int tpb = 512;
int bpk = (static_cast<int>(nfft) + tpb - 1) / tpb;

// Perform the reduction across 'batches' rows and normalize
auto norm_factor = static_cast<typename PxxType::value_type>(1.) / static_cast<typename PxxType::value_type>(batches);
(Pxx = sum(mag_sq_X_with_overlaps, {0}) * norm_factor).run(stream);
if (output_scale_mode == PwelchOutputScaleMode_Spectrum)
{
detail::pwelch_kernel<PwelchOutputScaleMode_Spectrum><<<bpk, tpb, 0, stream>>>(X_with_overlaps, Pxx, fs);
}
else if (output_scale_mode == PwelchOutputScaleMode_Density)
{
detail::pwelch_kernel<PwelchOutputScaleMode_Density><<<bpk, tpb, 0, stream>>>(X_with_overlaps, Pxx, fs);
}
else if (output_scale_mode == PwelchOutputScaleMode_Spectrum_dB)
{
detail::pwelch_kernel<PwelchOutputScaleMode_Spectrum_dB><<<bpk, tpb, 0, stream>>>(X_with_overlaps, Pxx, fs);
}
else //if (output_scale_mode == PwelchOutputScaleMode_Density_dB)
{
detail::pwelch_kernel<PwelchOutputScaleMode_Density_dB><<<bpk, tpb, 0, stream>>>(X_with_overlaps, Pxx, fs);
}
}

} // end namespace matx
38 changes: 29 additions & 9 deletions test/00_operators/PWelch.cu
Original file line number Diff line number Diff line change
Expand Up @@ -47,18 +47,23 @@ struct TestParams {
index_t nperseg;
index_t noverlap;
index_t nfft;
PwelchOutputScaleMode output_scale_mode;
float fs;
float ftone;
float sigma;
};

const std::vector<TestParams> CONFIGS = {
{"none", 8, 8, 2, 8, 0., 0.},
{"none", 16, 8, 4, 8, 1., 0.},
{"none", 16, 8, 4, 8, 2., 1.},
{"none", 16384, 256, 64, 256, 63., 0.},
{"boxcar", 8, 8, 2, 8, 0., 0.},
{"hann", 16, 8, 4, 8, 1., 0.},
{"flattop", 1024, 64, 32, 128, 2., 1.},
{"none", 8, 8, 2, 8, PwelchOutputScaleMode_Spectrum, 1.0, 0., 0.},
{"none", 16, 8, 4, 8, PwelchOutputScaleMode_Spectrum, 1.0, 1., 0.},
{"none", 16, 8, 4, 8, PwelchOutputScaleMode_Spectrum, 1.0, 2., 1.},
{"none", 16384, 256, 64, 256, PwelchOutputScaleMode_Spectrum, 1.0, 63., 0.},
{"boxcar", 8, 8, 2, 8, PwelchOutputScaleMode_Spectrum, 1.0, 0., 0.},
{"hann", 16, 8, 4, 8, PwelchOutputScaleMode_Spectrum, 1.0, 1., 0.},
{"flattop", 1024, 64, 32, 128, PwelchOutputScaleMode_Spectrum, 2.0, 2., 1.},
{"flattop", 1024, 64, 32, 128, PwelchOutputScaleMode_Density, 2.0, 2., 1.},
{"flattop", 1024, 64, 32, 128, PwelchOutputScaleMode_Spectrum_dB, 2.0, 2., 1.},
{"flattop", 1024, 64, 32, 128, PwelchOutputScaleMode_Density_dB, 2.0, 2., 1.},
};

class PWelchComplexExponentialTest : public ::testing::TestWithParam<TestParams>
Expand Down Expand Up @@ -87,11 +92,26 @@ void helper(PWelchComplexExponentialTest& test)
"nperseg"_a=test.params.nperseg,
"noverlap"_a=test.params.noverlap,
"nfft"_a=test.params.nfft,
"scaling"_a="spectrum",
"fs"_a =test.params.fs,
"ftone"_a=test.params.ftone,
"sigma"_a=test.params.sigma,
"window_name"_a=test.params.window_name
);

if (test.params.output_scale_mode == PwelchOutputScaleMode_Density)
{
cfg["scaling"] = "density";
}
else if (test.params.output_scale_mode == PwelchOutputScaleMode_Density_dB)
{
cfg["scaling"] = "density_dB";
}
else if (test.params.output_scale_mode == PwelchOutputScaleMode_Spectrum_dB)
{
cfg["scaling"] = "spectrum_dB";
}

test.pb->template InitAndRunTVGeneratorWithCfg<TypeParam>(
"00_operators", "pwelch_operators", "pwelch_complex_exponential", cfg);

Expand All @@ -104,7 +124,7 @@ void helper(PWelchComplexExponentialTest& test)

if (test.params.window_name == "none")
{
(Pxx = pwelch(x, test.params.nperseg, test.params.noverlap, test.params.nfft)).run(exec);
(Pxx = pwelch(x, test.params.nperseg, test.params.noverlap, test.params.nfft, test.params.output_scale_mode, test.params.fs)).run(exec);
}
else
{
Expand All @@ -125,7 +145,7 @@ void helper(PWelchComplexExponentialTest& test)
{
ASSERT_TRUE(false) << "Unknown window parameter name " + test.params.window_name;
}
(Pxx = pwelch(x, w, test.params.nperseg, test.params.noverlap, test.params.nfft)).run(exec);
(Pxx = pwelch(x, w, test.params.nperseg, test.params.noverlap, test.params.nfft, test.params.output_scale_mode, test.params.fs)).run(exec);
}

exec.sync();
Expand Down
Loading