diff --git a/opus/celt/arm/armcpu.c b/opus/celt/arm/armcpu.c index 06a53435..6785121a 100644 --- a/opus/celt/arm/armcpu.c +++ b/opus/celt/arm/armcpu.c @@ -96,7 +96,7 @@ static OPUS_INLINE opus_uint32 opus_cpu_capabilities(void){ /* Linux based */ #include -opus_uint32 opus_cpu_capabilities(void) +static opus_uint32 opus_cpu_capabilities(void) { opus_uint32 flags = 0; FILE *cpuinfo; @@ -169,7 +169,7 @@ opus_uint32 opus_cpu_capabilities(void) #include #include -opus_uint32 opus_cpu_capabilities(void) +static opus_uint32 opus_cpu_capabilities(void) { opus_uint32 flags = 0; @@ -191,6 +191,54 @@ opus_uint32 opus_cpu_capabilities(void) return flags; } +#elif defined(__FreeBSD__) +#include + +static opus_uint32 opus_cpu_capabilities(void) +{ + long hwcap = 0; + opus_uint32 flags = 0; + +# if defined(OPUS_ARM_MAY_HAVE_MEDIA) \ + || defined(OPUS_ARM_MAY_HAVE_NEON) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR) + /* FreeBSD requires armv6+, which always supports media instructions */ + flags |= OPUS_CPU_ARM_MEDIA_FLAG; +# endif + + elf_aux_info(AT_HWCAP, &hwcap, sizeof hwcap); + +# if defined(OPUS_ARM_MAY_HAVE_EDSP) || defined(OPUS_ARM_MAY_HAVE_MEDIA) \ + || defined(OPUS_ARM_MAY_HAVE_NEON) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR) +# ifdef HWCAP_EDSP + if (hwcap & HWCAP_EDSP) + flags |= OPUS_CPU_ARM_EDSP_FLAG; +# endif + +# if defined(OPUS_ARM_MAY_HAVE_NEON) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR) +# ifdef HWCAP_NEON + if (hwcap & HWCAP_NEON) + flags |= OPUS_CPU_ARM_NEON_FLAG; +# elif defined(HWCAP_ASIMD) + if (hwcap & HWCAP_ASIMD) + flags |= OPUS_CPU_ARM_NEON_FLAG | OPUS_CPU_ARM_MEDIA_FLAG | OPUS_CPU_ARM_EDSP_FLAG; +# endif +# endif +# if defined(OPUS_ARM_MAY_HAVE_DOTPROD) && defined(HWCAP_ASIMDDP) + if (hwcap & HWCAP_ASIMDDP) + flags |= OPUS_CPU_ARM_DOTPROD_FLAG; +# endif +# endif + +#if defined(OPUS_ARM_PRESUME_AARCH64_NEON_INTR) + flags |= OPUS_CPU_ARM_EDSP_FLAG | OPUS_CPU_ARM_MEDIA_FLAG | OPUS_CPU_ARM_NEON_FLAG; +# if defined(OPUS_ARM_PRESUME_DOTPROD) + flags |= OPUS_CPU_ARM_DOTPROD_FLAG; +# endif +#endif + + return (flags); +} + #else /* The feature registers which can tell us what the processor supports are * accessible in priveleged modes only, so we can't have a general user-space diff --git a/opus/celt/x86/x86cpu.c b/opus/celt/x86/x86cpu.c index 81cff4f4..2e7c32ae 100644 --- a/opus/celt/x86/x86cpu.c +++ b/opus/celt/x86/x86cpu.c @@ -41,7 +41,7 @@ (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) || \ (defined(OPUS_X86_MAY_HAVE_AVX2) && !defined(OPUS_X86_PRESUME_AVX2))) -#if defined(_WIN32) +#if defined(_MSC_VER) #include static _inline void cpuid(unsigned int CPUInfo[4], unsigned int InfoType) diff --git a/opus/celt/x86/x86cpu.h b/opus/celt/x86/x86cpu.h index 8ae9be8d..1e5b6a4c 100644 --- a/opus/celt/x86/x86cpu.h +++ b/opus/celt/x86/x86cpu.h @@ -68,8 +68,22 @@ int opus_select_arch(void); Use this to work around those restrictions (which should hopefully all get optimized to a single MOVD instruction). GCC implemented _mm_loadu_si32() since GCC 11; HOWEVER, there is a bug! - https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99754 */ -# if !defined(_MSC_VER) && !OPUS_GNUC_PREREQ(11,3) && !(defined(__clang__) && (__clang_major__ >= 8)) + https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99754 + LLVM implemented _mm_loadu_si32() since Clang 8.0, however the + __clang_major__ version number macro is unreliable, as vendors + (specifically, Apple) will use different numbering schemes than upstream. + Clang's advice is "use feature detection", but they do not provide feature + detection support for specific SIMD functions. + We follow the approach from the SIMDe project and instead detect unrelated + features that should be available in the version we want (see + ).*/ +# if defined(__clang__) +# if __has_warning("-Wextra-semi-stmt") || \ + __has_builtin(__builtin_rotateleft32) +# define OPUS_CLANG_8 (1) +# endif +# endif +# if !defined(_MSC_VER) && !OPUS_GNUC_PREREQ(11,3) && !defined(OPUS_CLANG_8) # include # include diff --git a/opus/silk/x86/NSQ_del_dec_avx2.c b/opus/silk/x86/NSQ_del_dec_avx2.c index 43485871..21f00c2d 100644 --- a/opus/silk/x86/NSQ_del_dec_avx2.c +++ b/opus/silk/x86/NSQ_del_dec_avx2.c @@ -73,7 +73,6 @@ static OPUS_INLINE int verify_assumptions(const silk_encoder_state *psEncC) /* Intrinsics not defined on MSVC */ #ifdef _MSC_VER #include -#define __m128i_u __m128i static inline int __builtin_sadd_overflow(opus_int32 a, opus_int32 b, opus_int32* res) { *res = a+b; @@ -959,7 +958,7 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_avx2( { __m256i x = _mm256_cvtepi16_epi64(_mm_loadu_si64(&x16[i])); x = _mm256_slli_epi64(_mm256_mul_epi32(x, _mm256_set1_epi32(inv_gain_Q26)), 16); - _mm_storeu_si128((__m128i_u*)&x_sc_Q10[i], silk_cvtepi64_epi32_high(x)); + _mm_storeu_si128((__m128i*)&x_sc_Q10[i], silk_cvtepi64_epi32_high(x)); } /* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */ @@ -985,8 +984,8 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_avx2( /* Scale long-term shaping state */ for (i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx; i+=4) { - __m128i_u* p = (__m128i_u*)&NSQ->sLTP_shp_Q14[i]; - *p = silk_mm_smulww_epi32(*p, gain_adj_Q16); + opus_int32 *p = &NSQ->sLTP_shp_Q14[i]; + _mm_storeu_si128((__m128i*)p, silk_mm_smulww_epi32(_mm_loadu_si128((__m128i*)p), gain_adj_Q16)); } /* Scale long-term prediction state */ @@ -1041,13 +1040,13 @@ static OPUS_INLINE void silk_LPC_analysis_filter_avx2( /* Allowing wrap around so that two wraps can cancel each other. The rare cases where the result wraps around can only be triggered by invalid streams*/ - __m256i in_v = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i_u*)&in_ptr[-8])); - __m256i B_v = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i_u*)& B[0])); + __m256i in_v = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)&in_ptr[-8])); + __m256i B_v = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)& B[0])); __m256i sum = _mm256_mullo_epi32(in_v, silk_mm256_reverse_epi32(B_v)); if (order > 10) { - in_v = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i_u*)&in_ptr[-16])); - B_v = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i_u*)&B [8])); + in_v = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)&in_ptr[-16])); + B_v = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)&B [8])); B_v = silk_mm256_reverse_epi32(B_v); } else diff --git a/opus/src/opus_private.h b/opus/src/opus_private.h index 364c21ce..279f5f95 100644 --- a/opus/src/opus_private.h +++ b/opus/src/opus_private.h @@ -214,7 +214,7 @@ int opus_multistream_decode_native( opus_int32 opus_packet_extensions_parse(const unsigned char *data, opus_int32 len, opus_extension_data *extensions, opus_int32 *nb_extensions); -opus_int32 opus_packet_extensions_generate(unsigned char *data, opus_int32 len, const opus_extension_data *extensions, int nb_extensions, int pad); +opus_int32 opus_packet_extensions_generate(unsigned char *data, opus_int32 len, const opus_extension_data *extensions, opus_int32 nb_extensions, int pad); opus_int32 opus_packet_extensions_count(const unsigned char *data, opus_int32 len); diff --git a/opus/src/repacketizer.c b/opus/src/repacketizer.c index 6a7a8b3d..79798b02 100644 --- a/opus/src/repacketizer.c +++ b/opus/src/repacketizer.c @@ -155,7 +155,8 @@ opus_int32 opus_repacketizer_out_range_impl(OpusRepacketizer *rp, int begin, int /* incorporate any extensions from the repacketizer padding */ for (i=begin;ipaddings[i], rp->padding_len[i], &all_extensions[ext_count], &frame_ext_count);