Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ne10 FFTs for Neon #219

Merged
merged 43 commits into from
Dec 9, 2024
Merged
Changes from 1 commit
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
19531d5
Fixes for testing Neon
christophe0606 Oct 1, 2024
f570d6d
Integrated CFFT F32 of Ne10 for Neon version of the CFFT.
christophe0606 Oct 3, 2024
e2ed162
Some cleaning to Ne10 header used in Neon CFFT.
christophe0606 Oct 3, 2024
f5f5061
Added Neon RFFT - initialization not yet implemented
christophe0606 Oct 8, 2024
078ac55
First working version of RFFT Neon (tested on RFFT 32 only so far)
christophe0606 Oct 9, 2024
69ce3dd
Added other lengths for RFFT Neon.
christophe0606 Oct 9, 2024
b07470f
Added RIFFT Neon F32
christophe0606 Oct 10, 2024
0db81f6
Added documentation for Neon API of CFFT and RFFT F32 that are differ…
christophe0606 Oct 10, 2024
7241cda
Updated MFCC F32 Neon version.
christophe0606 Oct 11, 2024
4f59953
Improved doxygen documentation for new Neon API (RFFT, CFFT, MFCC F32)
christophe0606 Oct 11, 2024
e92a5d7
Added CFFT Q31 Neon
christophe0606 Oct 15, 2024
e9316ac
Added RFFT Q31 Neon.
christophe0606 Oct 17, 2024
30098a4
Updated tests for RFFT, CFFT, MFCC Q31 Neon
christophe0606 Oct 17, 2024
8a105b4
Added Q15 Neon implementations for CFFT, RFFT, MFCC.
christophe0606 Oct 25, 2024
e7b3396
Changed API for Neon CFFT and RFFT F32. Input buffer is const and not…
christophe0606 Oct 25, 2024
1cc49ec
Improved doxygen documentation
christophe0606 Oct 25, 2024
47d568d
Update doxygen for Neon version of transforms.
christophe0606 Oct 25, 2024
dae08f4
Doxygen update for Neon version of cfft q31 and q15
christophe0606 Oct 25, 2024
76e6c32
Corrected doxygen
christophe0606 Oct 25, 2024
7d04f56
Added CFFT, RFFT, MFCC Neon for f16.
christophe0606 Oct 30, 2024
f3cc725
Corrected RFFT F16 neon
christophe0606 Oct 31, 2024
dd8ec7d
Corrected f16 tests
christophe0606 Oct 31, 2024
ea8723a
Don't build RFFT F16 neon when HW not supporting it
christophe0606 Nov 7, 2024
c041a25
Corrected F16 tests for Neon and Cortex-A
christophe0606 Nov 7, 2024
2fdbbbf
Added dynamic init for CFFT F32 Neon version
christophe0606 Nov 13, 2024
2ecb108
Added dynamic init for CFFT Q31 Neon
christophe0606 Nov 13, 2024
67f1b52
Added dynamic CFFT Q15 and F16 for Neon
christophe0606 Nov 13, 2024
b9db487
Improved doxygen for Neon init functions
christophe0606 Nov 13, 2024
44ecc31
Correct doxygen issues with init functions for Neon CFFT
christophe0606 Nov 13, 2024
95ba6e6
Added dynamic RFFT F32 for Neon
christophe0606 Nov 14, 2024
1775adb
Added dynamic Neon RFFT Q31 and Q15
christophe0606 Nov 15, 2024
128ace2
Added dynamic RFFT F16 for Neon
christophe0606 Nov 15, 2024
213c815
For cmake build of tests, added option to disable autovectorization
christophe0606 Nov 18, 2024
d001e1d
Improved perf of CFFT F16 Neon
christophe0606 Nov 19, 2024
a1441c6
Added some support for complex Neon instructions for tests.
christophe0606 Dec 2, 2024
9eec50a
Added radix 3 and 5 for Neon implementation of CFFT F32 and F16.
christophe0606 Dec 3, 2024
f9e98ab
Added radix 3 and 5 for Neon CFFT Q31
christophe0606 Dec 4, 2024
9b8e3ef
Added tests for Neon specific implementations of the FFTs
christophe0606 Dec 5, 2024
d24c98f
Corrected test thresholds for Neon.
christophe0606 Dec 5, 2024
c045007
Tuned test thresholds
christophe0606 Dec 6, 2024
4d3e915
Corrected tests causing problems when building with gcc.
christophe0606 Dec 6, 2024
f91cb53
Improved doxygen for FFTs
christophe0606 Dec 6, 2024
e49e650
Improved doxygen documentation for FFTs.
christophe0606 Dec 9, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Added dynamic RFFT F32 for Neon
christophe0606 committed Dec 6, 2024

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature. The key has expired.
commit 95ba6e6ea75a89c5c7f104d1f5d86bdf4b4b9d4e
5 changes: 4 additions & 1 deletion Include/dsp/transform_functions.h
Original file line number Diff line number Diff line change
@@ -731,7 +731,7 @@ void arm_rfft_fast_f64(
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
typedef struct
{
uint16_t nfft;
uint32_t nfft;
const float32_t *r_twiddles;
const uint32_t *r_factors;
const float32_t *r_twiddles_backward;
@@ -764,6 +764,9 @@ arm_status arm_rfft_fast_init_f32 (
uint16_t fftLen);

#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)

extern arm_rfft_fast_instance_f32 *arm_rfft_fast_init_dynamic_f32 (uint32_t fftLen);

void arm_rfft_fast_f32(
const arm_rfft_fast_instance_f32 * S,
const float32_t * p,
193 changes: 185 additions & 8 deletions Ne10/CMSIS_NE10_fft_init.c
Original file line number Diff line number Diff line change
@@ -158,7 +158,7 @@ static void ne10_fft_generate_twiddles_line_float32 (ne10_fft_cpx_float32_t * tw
const ne10_int32_t nfft)
{
ne10_int32_t j, k;
ne10_float32_t phase;
ne10_float64_t phase;
const ne10_float64_t pi = NE10_PI;
//printf("%d %d %d %d\n",mstride,fstride,radix,nfft);

@@ -173,7 +173,7 @@ static void ne10_fft_generate_twiddles_line_float32 (ne10_fft_cpx_float32_t * tw
} // mstride
}

#if 0

// Transposed twiddles matrix [mstride][radix-1]
// First row (k == 0) is ignored because phase == 1, and
// twiddle = (1.0, 0.0).
@@ -189,7 +189,7 @@ static void ne10_fft_generate_twiddles_line_transposed_float32 (
//printf("Transposed\n");

ne10_int32_t j, k;
ne10_float32_t phase;
ne10_float64_t phase;
const ne10_float64_t pi = NE10_PI;

for (j = 0; j < mstride; j++)
@@ -202,7 +202,7 @@ static void ne10_fft_generate_twiddles_line_transposed_float32 (
} // radix
} // mstride
}
#endif



// Twiddles matrix [mstride][radix-1]
@@ -215,7 +215,7 @@ static void ne10_fft_generate_twiddles_line_int32 (ne10_fft_cpx_int32_t * twiddl
const ne10_int32_t nfft)
{
ne10_int32_t j, k;
ne10_float32_t phase;
ne10_float64_t phase;
const ne10_float64_t pi = NE10_PI;

for (j = 0; j < mstride; j++)
@@ -325,7 +325,7 @@ static ne10_fft_cpx_float32_t* ne10_fft_generate_twiddles_float32 (ne10_fft_cpx_
return twiddles;
}

#if 0

static ne10_fft_cpx_float32_t* ne10_fft_generate_twiddles_transposed_float32 (
ne10_fft_cpx_float32_t * twiddles,
const ne10_uint32_t * factors,
@@ -338,7 +338,7 @@ static ne10_fft_cpx_float32_t* ne10_fft_generate_twiddles_transposed_float32 (
twiddles, factors, nfft);
return twiddles;
}
#endif


/**
@addtogroup ComplexFFTF32
@@ -781,4 +781,181 @@ arm_cfft_instance_f16 *arm_cfft_init_dynamic_f16(uint32_t fftLen)
/**
@} end of ComplexFFTF16 group
*/
#endif
#endif

/**
@addtogroup RealFFTF32
@{
*/
/**
* @brief Initialize data structure for a RFFT
*
* @param[in] fftLen The rfft length
*
* @return Pointer to the new structure
*
* @par This function is only available for Neon
* This function is allocating memory. The
* memory must be released when no more used.
* This function can be used with RFFT lengths
* longer than the ones supported on Cortex-M
*/
arm_rfft_fast_instance_f32 *arm_rfft_fast_init_dynamic_f32 (uint32_t nfft)
{
//printf("Alloc r2c\n");
arm_rfft_fast_instance_f32* st = NULL;
ne10_int32_t result;

ne10_uint32_t memneeded = sizeof (arm_rfft_fast_instance_f32)
+ sizeof (ne10_int32_t) * (NE10_MAXFACTORS * 2) /* r_factors */
+ sizeof (ne10_int32_t) * (NE10_MAXFACTORS * 2) /* r_factors_neon */
+ sizeof (ne10_fft_cpx_float32_t) * nfft /* r_twiddles */
+ sizeof (ne10_fft_cpx_float32_t) * nfft/4 /* r_twiddles_neon */
+ sizeof (ne10_fft_cpx_float32_t) * (12 + nfft/32*12) /* r_super_twiddles_neon */
+ NE10_FFT_BYTE_ALIGNMENT; /* 64-bit alignment*/

st = (arm_rfft_fast_instance_f32*) NE10_MALLOC (memneeded);

if (!st)
{
return st;
}

ne10_int32_t i,j;
ne10_fft_cpx_float32_t *tw;
const ne10_float64_t pi = NE10_PI;
ne10_float64_t phase1;

st->nfft = nfft;

ne10_fft_cpx_float32_t *r_twiddles;
uint32_t *r_factors;
ne10_fft_cpx_float32_t *r_twiddles_backward;
ne10_fft_cpx_float32_t *r_twiddles_neon;
ne10_fft_cpx_float32_t *r_twiddles_neon_backward;
uint32_t *r_factors_neon;
ne10_fft_cpx_float32_t *r_super_twiddles_neon;


uintptr_t address = (uintptr_t) st + sizeof (arm_rfft_fast_instance_f32);
NE10_BYTE_ALIGNMENT (address, NE10_FFT_BYTE_ALIGNMENT);

r_twiddles = (ne10_fft_cpx_float32_t*) address;
r_factors = (ne10_uint32_t*) (r_twiddles + nfft);
r_twiddles_neon = (ne10_fft_cpx_float32_t*) (r_factors + (NE10_MAXFACTORS * 2));
r_factors_neon = (ne10_uint32_t*) (r_twiddles_neon + nfft/4);
r_super_twiddles_neon = (ne10_fft_cpx_float32_t*) (r_factors_neon + (NE10_MAXFACTORS * 2));



if (nfft<32)
{
return st;
}

// factors and twiddles for rfft C
ne10_factor (nfft, r_factors, NE10_FACTOR_EIGHT_FIRST_STAGE);

// backward twiddles pointers
r_twiddles_backward = ne10_fft_generate_twiddles_float32 (r_twiddles, r_factors, nfft);

//for(unsigned int i=0;i<nfft;i++)
//{
// printf("%f %f\n",(double)r_twiddles[i].r,(double)r_twiddles[i].i);
//}
//printf("---\n");

// factors and twiddles for rfft neon
result = ne10_factor (nfft/4, r_factors_neon, NE10_FACTOR_EIGHT_FIRST_STAGE);
if (result == NE10_ERR)
{
return st;
}

// Twiddle table is transposed here to improve cache access performance.
r_twiddles_neon_backward = ne10_fft_generate_twiddles_transposed_float32 (
r_twiddles_neon,
r_factors_neon,
nfft/4);

//for(unsigned int i=0;i<nfft/4;i++)
//{
// printf("%f %f\n",(double)r_twiddles_neon[i].r,(double)r_twiddles_neon[i].i);
//}

// nfft/4 x 4
tw = r_super_twiddles_neon;
for (i = 1; i < 4; i ++)
{
for (j = 0; j < 4; j++)
{
phase1 = - 2 * pi * ( (1.0*i * j) / nfft);
tw[4*i-4+j].r = (ne10_float32_t) cos (phase1);
tw[4*i-4+j].i = (ne10_float32_t) sin (phase1);
}
}

ne10_uint32_t k,s;
// [nfft/32] x [3] x [4]
// k s j
for (k=1; k<nfft/32; k++)
{
// transposed
for (s = 1; s < 4; s++)
{
for (j = 0; j < 4; j++)
{
phase1 = - 2 * pi * ( 1.0*((k*4+j) * s) / nfft);
//printf("%d %d %d %f\n",k,j,s,phase1);
tw[12*k+j+4*(s-1)].r = (ne10_float32_t) cos (phase1);
tw[12*k+j+4*(s-1)].i = (ne10_float32_t) sin (phase1);
}
}
}

//printf("---\n");
//for(unsigned int i=0;i<nfft/32*12;i++)
//{
// printf("%f %f\n",(double)r_super_twiddles_neon[i].r,(double)r_super_twiddles_neon[i].i);
//}

ne10_int32_t stage_count = r_factors_neon[0];
//printf("stage %d\n",stage_count);
r_factors_neon[2] = r_factors_neon[2 * (stage_count)]; // first radix
if (stage_count > 1)
{
r_factors_neon[3] = r_factors_neon[2 * (stage_count-1)+1]; // mstride
}
else
{
r_factors_neon[3] = r_factors_neon[2 * (stage_count)+1]; // mstride
}

stage_count = r_factors[0];
//printf("stage %d\n",stage_count);
r_factors[2] = r_factors[2 * (stage_count)]; // first radix
if (stage_count > 1)
{
r_factors[3] = r_factors[2 * (stage_count-1)+1]; // mstride
}
else
{
r_factors[3] = r_factors[2 * (stage_count)+1]; // mstride
}

st->r_twiddles=(float32_t*)r_twiddles;
st->r_factors=r_factors;
st->r_twiddles_backward=(float32_t*)r_twiddles_backward;
st->r_twiddles_neon=(float32_t*)r_twiddles_neon;
st->r_twiddles_neon_backward=(float32_t*)r_twiddles_neon_backward;
st->r_factors_neon=r_factors_neon;
st->r_super_twiddles_neon=(float32_t*)r_super_twiddles_neon;

//printf("%d %d %d %d\n",r_factors[0],r_factors[1],r_factors[2],r_factors[3]);
//printf("%d %d %d %d\n",r_factors_neon[0],r_factors_neon[1],r_factors_neon[2],r_factors_neon[3]);

return st;
}
/**
@} end of RealFFTF32 group
*/