diff --git a/src/plugins/intel_cpu/src/nodes/rdft.cpp b/src/plugins/intel_cpu/src/nodes/rdft.cpp index 5220f037788123..d39aa9e23343fe 100644 --- a/src/plugins/intel_cpu/src/nodes/rdft.cpp +++ b/src/plugins/intel_cpu/src/nodes/rdft.cpp @@ -838,17 +838,20 @@ struct RDFTJitExecutor : public RDFTExecutor { rdftKernel.reset(new jit_dft_kernel_f32(isInverse, rdftType)); dftKernel.reset(new jit_dft_kernel_f32(isInverse, complex_to_complex)); vlen = cpu_isa_traits::vlen; - primDesc->setImplementationType(jit_avx512); + if (primDesc) + primDesc->setImplementationType(jit_avx512); } else if (mayiuse(cpu::x64::avx2)) { rdftKernel.reset(new jit_dft_kernel_f32(isInverse, rdftType)); dftKernel.reset(new jit_dft_kernel_f32(isInverse, complex_to_complex)); vlen = cpu_isa_traits::vlen; - primDesc->setImplementationType(jit_avx2); + if (primDesc) + primDesc->setImplementationType(jit_avx2); } else if (mayiuse(cpu::x64::sse41)) { rdftKernel.reset(new jit_dft_kernel_f32(isInverse, rdftType)); dftKernel.reset(new jit_dft_kernel_f32(isInverse, complex_to_complex)); vlen = cpu_isa_traits::vlen; - primDesc->setImplementationType(jit_sse42); + if (primDesc) + primDesc->setImplementationType(jit_sse42); } else { OPENVINO_THROW("Can't create RDFT kernel"); } @@ -1075,22 +1078,6 @@ struct RDFTRefExecutor : public RDFTExecutor { } }; -struct RDFTKey { - bool isInverse; - - size_t hash() const { - using namespace dnnl::impl::primitive_hashing; - - size_t seed = 0; - seed = hash_combine(seed, isInverse); - return seed; - } - - bool operator==(const RDFTKey& rhs) const { - return isInverse == rhs.isInverse; - } -}; - void RDFT::createPrimitive() { RDFTKey key{}; key.isInverse = inverse; @@ -1115,6 +1102,22 @@ void RDFT::createPrimitive() { Node::createPrimitive(); } + +std::shared_ptr RDFTExecutor::build(bool inverse, NodeDesc* primDesc) { + std::shared_ptr executor; +#if defined(OPENVINO_ARCH_X86_64) + using namespace dnnl::impl; + using namespace dnnl::impl::cpu::x64; + if (mayiuse(cpu::x64::sse41)) { + executor = std::make_shared(inverse, primDesc); + return executor; + } +#endif + executor = std::make_shared(inverse); + primDesc->setImplementationType(ref_any); + return executor; +} + } // namespace node } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/rdft.h b/src/plugins/intel_cpu/src/nodes/rdft.h index fccd6500a50f7c..0de2fa0095df81 100644 --- a/src/plugins/intel_cpu/src/nodes/rdft.h +++ b/src/plugins/intel_cpu/src/nodes/rdft.h @@ -4,6 +4,7 @@ #pragma once +#include "common/primitive_hashing_utils.hpp" #include "kernels/x64/rdft_kernel.hpp" #include "node.h" @@ -30,6 +31,8 @@ struct RDFTExecutor { const std::vector& outputShape, const std::vector& axes); + static std::shared_ptr build(bool inverse, NodeDesc* primDesc = nullptr); + protected: bool isInverse; @@ -125,6 +128,20 @@ class RDFT : public Node { bool isSignalSizesConstant = false; }; +struct RDFTKey { + bool isInverse; + + size_t hash() const { + size_t seed = 0; + seed = dnnl::impl::hash_combine(seed, isInverse); + return seed; + } + + bool operator==(const RDFTKey& rhs) const { + return isInverse == rhs.isInverse; + } +}; + } // namespace node } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/stft.cpp b/src/plugins/intel_cpu/src/nodes/stft.cpp index 47855a7eff7399..31f3b673f38841 100644 --- a/src/plugins/intel_cpu/src/nodes/stft.cpp +++ b/src/plugins/intel_cpu/src/nodes/stft.cpp @@ -4,6 +4,10 @@ #include "stft.h" +#include "cpu/x64/cpu_isa_traits.hpp" +#include "cpu/x64/jit_generator.hpp" +#include "nodes/common/cpu_memcpy.h" +#include "openvino/core/parallel.hpp" #include "openvino/core/type.hpp" #include "openvino/op/constant.hpp" #include "openvino/op/stft.hpp" @@ -73,15 +77,95 @@ bool STFT::created() const { return getType() == Type::STFT; } +namespace { +static void transpose_out4d(const uint8_t* in, + uint8_t* out, + const VectorDims& in_shape, + const VectorDims& out_shape, + size_t elem_size) { + const std::vector axes_order{0, 2, 1, 3}; + parallel_for3d(out_shape[0], + out_shape[1], + out_shape[2], + [in, out, axes_order, &in_shape, &out_shape, elem_size](size_t i, size_t j, size_t k) { + size_t in_indexes[3]; + in_indexes[axes_order[0]] = i; + in_indexes[axes_order[1]] = j; + in_indexes[axes_order[2]] = k; + size_t in_off = + ((in_indexes[0] * in_shape[1] + in_indexes[1]) * in_shape[2] + in_indexes[2]) * in_shape[3]; + size_t out_off = ((i * out_shape[1] + j) * out_shape[2] + k) * out_shape[3]; + cpu_memcpy(out + out_off * elem_size, in + in_off * elem_size, out_shape[3] * elem_size); + }); +} +} // namespace + void STFT::execute(dnnl::stream strm) { - ov::reference::stft(getSrcDataAtPortAs(DATA_IDX), - getSrcDataAtPortAs(WINDOW_IDX), - getDstDataAtPortAs(0), - ov::Shape{getSrcMemoryAtPort(DATA_IDX)->getStaticDims()}, - ov::Shape{getSrcMemoryAtPort(WINDOW_IDX)->getStaticDims()}, - (getSrcDataAtPortAs(FRAME_SIZE_IDX))[0], - (getSrcDataAtPortAs(FRAME_STEP_IDX))[0], - m_transpose_frames); + const float* signal = getSrcDataAtPortAs(DATA_IDX); + const float* window = getSrcDataAtPortAs(WINDOW_IDX); + float* rdft_result = getDstDataAtPortAs(0); + const VectorDims& signal_shape = getSrcMemoryAtPort(DATA_IDX)->getStaticDims(); + const VectorDims& window_shape = getSrcMemoryAtPort(WINDOW_IDX)->getStaticDims(); + const int64_t frame_size = (getSrcDataAtPortAs(FRAME_SIZE_IDX))[0]; + const int64_t frame_step = (getSrcDataAtPortAs(FRAME_STEP_IDX))[0]; + + const auto is_signal_1D = signal_shape.size() == 1; + const size_t batch_size = is_signal_1D ? 1 : signal_shape[0]; + const size_t signal_axis = is_signal_1D ? 0 : 1; + const auto signal_length = signal_shape[signal_axis]; + const auto num_frames = static_cast((signal_length - frame_size) / frame_step) + 1; + const auto frame_size_dim = static_cast(frame_size); + const auto fft_out_shape = VectorDims{static_cast((frame_size_dim / 2) + 1), 2}; + const auto fft_out_shape_size = shape_size(fft_out_shape); + + const auto window_length = window_shape[0] < frame_size_dim ? window_shape[0] : frame_size_dim; + std::vector pad_window(frame_size, 0); + cpu_parallel_memcpy(pad_window.data() + (frame_size_dim - window_length) / 2, + window, + sizeof(float) * window_shape[0]); + + float* dst = rdft_result; + const auto stft_shape = VectorDims{batch_size, num_frames, fft_out_shape[0], fft_out_shape[1]}; + if (m_transpose_frames) { // Store intermediate results + MemoryPtr dst_mem = + getScratchPadMem(std::make_shared(ov::element::f32, Shape{stft_shape})); + dst = dst_mem->getDataAs(); + } + + parallel_for2d(batch_size, num_frames, [&](size_t batch, size_t frame_idx) { + size_t batch_in_start = batch * signal_length; + size_t batch_frames_out = batch * num_frames; + + const auto frame_start = batch_in_start + frame_idx * frame_step; + const auto frame_end = frame_start + frame_size; + std::vector signal_slice(signal + frame_start, signal + frame_end); + std::transform(signal_slice.begin(), + signal_slice.end(), + pad_window.begin(), + signal_slice.begin(), + std::multiplies()); + + const auto result_idx = (batch_frames_out + frame_idx) * fft_out_shape_size; + auto twiddles = rdft_executor->generateTwiddles({static_cast(signal_slice.size())}, fft_out_shape, {0}); + rdft_executor->execute(signal_slice.data(), + dst + result_idx, + twiddles, + 1, + {0}, + {static_cast(frame_size)}, + {frame_size_dim}, + fft_out_shape, + {1}, + {2, 1}); + }); + if (m_transpose_frames) { + const auto stft_transp_out_shape = VectorDims{batch_size, fft_out_shape[0], num_frames, fft_out_shape[1]}; + transpose_out4d(reinterpret_cast(dst), + reinterpret_cast(rdft_result), + stft_shape, + stft_transp_out_shape, + sizeof(float)); + } } void STFT::executeDynamicImpl(dnnl::stream strm) { @@ -92,6 +176,20 @@ bool STFT::needShapeInfer() const { return !(m_is_frame_size_const && m_is_frame_step_const) || Node::needShapeInfer(); } +void STFT::createPrimitive() { + RDFTKey key{}; + key.isInverse = false; + auto buildExecutor = [&](const RDFTKey& key) -> std::shared_ptr { + return RDFTExecutor::build(key.isInverse, getSelectedPrimitiveDescriptor()); + }; + + auto cache = context->getParamsCache(); + auto result = cache->getOrCreate(key, buildExecutor); + rdft_executor = result.first; + + Node::createPrimitive(); +} + } // namespace node } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/stft.h b/src/plugins/intel_cpu/src/nodes/stft.h index 7b1684cae4b674..608e14661910e2 100644 --- a/src/plugins/intel_cpu/src/nodes/stft.h +++ b/src/plugins/intel_cpu/src/nodes/stft.h @@ -7,6 +7,7 @@ #include #include "node.h" +#include "rdft.h" namespace ov { namespace intel_cpu { @@ -21,6 +22,7 @@ class STFT : public Node { bool created() const override; static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; bool needPrepareParams() const override; + void createPrimitive() override; void execute(dnnl::stream strm) override; void executeDynamicImpl(dnnl::stream strm) override; @@ -35,6 +37,8 @@ class STFT : public Node { /// STFT params bool m_transpose_frames = false; + // RDFT executor + std::shared_ptr rdft_executor = nullptr; bool m_is_frame_size_const = false; bool m_is_frame_step_const = false;