From 61d241f855070e5897fa6d0d4463196498c9215d Mon Sep 17 00:00:00 2001 From: nihuini Date: Mon, 23 Dec 2024 11:10:07 +0800 Subject: [PATCH 1/5] fix clang avx512bf16 build --- src/layer/x86/x86_usability.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/layer/x86/x86_usability.h b/src/layer/x86/x86_usability.h index f25b06745e8..8628249e76a 100644 --- a/src/layer/x86/x86_usability.h +++ b/src/layer/x86/x86_usability.h @@ -1490,9 +1490,9 @@ static NCNN_FORCEINLINE __m256i float2bfloat_avx512(const __m512& v0) static NCNN_FORCEINLINE __m512i float2bfloat_avx512(const __m512& v0, const __m512& v1) { #if __AVX512BF16__ - __m256bh _v0 = _mm512_cvtneps_pbh(v0); - __m256bh _v1 = _mm512_cvtneps_pbh(v1); - __m512i _v = _mm512_inserti32x8(_mm512_castsi256_si512((__m256i)_v0), (__m256i)_v1, 1); + __m256i _v0 = (__m256i)_mm512_cvtneps_pbh(v0); + __m256i _v1 = (__m256i)_mm512_cvtneps_pbh(v1); + __m512i _v = _mm512_inserti32x8(_mm512_castsi256_si512(_v0), _v1, 1); #else __m512i _a = _mm512_castps_si512(v0); __m512i _b = _mm512_castps_si512(v1); From 00546c30dbaa8a8ecdf4088fc50e595e28ca506e Mon Sep 17 00:00:00 2001 From: nihuini Date: Mon, 23 Dec 2024 11:36:13 +0800 Subject: [PATCH 2/5] test bf16 insert --- CMakeLists.txt | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5851552b2a5..43b417b0195 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -522,8 +522,8 @@ else() set(CMAKE_REQUIRED_FLAGS "/arch:AVX512") check_cxx_source_compiles("#include \nint main() { __m512i _s, _a, _b; _s = _mm512_dpwssd_epi32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_X86_AVX512_VNNI) - set(CMAKE_REQUIRED_FLAGS "/arch:AVX512") - check_cxx_source_compiles("#include \nint main() { __m256bh _s; __m512bh _a, _b; _s = _mm512_cvtneps_pbh(_mm512_dpbf16_ps(_mm512_cvtpbh_ps(_s), _a, _b)); return 0; }" NCNN_COMPILER_SUPPORT_X86_AVX512_BF16) + set(CMAKE_REQUIRED_FLAGS "/arch:AVX512 /O2") + check_cxx_source_compiles("#include \nint main() { __m512 _s; __m512bh _a, _b; _s = _mm512_dpbf16_ps(_s, _a, _b); return 0; }\n__m512i t(__m512 a) { __m256i _a = (__m256i)_mm512_cvtneps_pbh(a); return _mm512_inserti32x8(_mm512_castsi256_si512(_a), _a, 1); }" NCNN_COMPILER_SUPPORT_X86_AVX512_BF16) set(CMAKE_REQUIRED_FLAGS "/arch:AVX512") check_cxx_source_compiles("#include \nint main() { __m512h _s, _a, _b; _s = _mm512_fmadd_ph(_s, _a, _b); __m512 _s2; _s2 = _mm512_cvtxph_ps(_mm512_cvtxps_ph(_s2)); return 0; }" NCNN_COMPILER_SUPPORT_X86_AVX512_FP16) @@ -559,8 +559,8 @@ else() set(CMAKE_REQUIRED_FLAGS "/arch:AVX512 -mfma -mf16c -mavx512cd -mavx512bw -mavx512dq -mavx512vl -mavx512vnni") check_cxx_source_compiles("#include \nint main() { __m512i _s, _a, _b; _s = _mm512_dpwssd_epi32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_X86_AVX512_VNNI) - set(CMAKE_REQUIRED_FLAGS "/arch:AVX512 -mfma -mf16c -mavx512cd -mavx512bw -mavx512dq -mavx512vl -mavx512bf16") - check_cxx_source_compiles("#include \nint main() { __m256bh _s; __m512bh _a, _b; _s = _mm512_cvtneps_pbh(_mm512_dpbf16_ps(_mm512_cvtpbh_ps(_s), _a, _b)); return 0; }" NCNN_COMPILER_SUPPORT_X86_AVX512_BF16) + set(CMAKE_REQUIRED_FLAGS "/arch:AVX512 -mfma -mf16c -mavx512cd -mavx512bw -mavx512dq -mavx512vl -mavx512bf16 /O2 -O2") + check_cxx_source_compiles("#include \nint main() { __m512 _s; __m512bh _a, _b; _s = _mm512_dpbf16_ps(_s, _a, _b); return 0; }\n__m512i t(__m512 a) { __m256i _a = (__m256i)_mm512_cvtneps_pbh(a); return _mm512_inserti32x8(_mm512_castsi256_si512(_a), _a, 1); }" NCNN_COMPILER_SUPPORT_X86_AVX512_BF16) set(CMAKE_REQUIRED_FLAGS "/arch:AVX512 -mfma -mf16c -mavx512cd -mavx512bw -mavx512dq -mavx512vl -mavx512fp16") check_cxx_source_compiles("#include \nint main() { __m512h _s, _a, _b; _s = _mm512_fmadd_ph(_s, _a, _b); __m512 _s2; _s2 = _mm512_cvtxph_ps(_mm512_cvtxps_ph(_s2)); return 0; }" NCNN_COMPILER_SUPPORT_X86_AVX512_FP16) @@ -594,8 +594,8 @@ else() set(CMAKE_REQUIRED_FLAGS "-mfma -mf16c -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl -mavx512vnni") check_cxx_source_compiles("#include \nint main() { __m512i _s, _a, _b; _s = _mm512_dpwssd_epi32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_X86_AVX512_VNNI) - set(CMAKE_REQUIRED_FLAGS "-mfma -mf16c -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl -mavx512bf16") - check_cxx_source_compiles("#include \nint main() { __m256bh _s; __m512bh _a, _b; _s = _mm512_cvtneps_pbh(_mm512_dpbf16_ps(_mm512_cvtpbh_ps(_s), _a, _b)); return 0; }" NCNN_COMPILER_SUPPORT_X86_AVX512_BF16) + set(CMAKE_REQUIRED_FLAGS "-mfma -mf16c -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl -mavx512bf16 -O2") + check_cxx_source_compiles("#include \nint main() { __m512 _s; __m512bh _a, _b; _s = _mm512_dpbf16_ps(_s, _a, _b); return 0; }\n__m512i t(__m512 a) { __m256i _a = (__m256i)_mm512_cvtneps_pbh(a); return _mm512_inserti32x8(_mm512_castsi256_si512(_a), _a, 1); }" NCNN_COMPILER_SUPPORT_X86_AVX512_BF16) set(CMAKE_REQUIRED_FLAGS "-mfma -mf16c -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl -mavx512fp16") check_cxx_source_compiles("#include \nint main() { __m512h _s, _a, _b; _s = _mm512_fmadd_ph(_s, _a, _b); __m512 _s2; _s2 = _mm512_cvtxph_ps(_mm512_cvtxps_ph(_s2)); return 0; }" NCNN_COMPILER_SUPPORT_X86_AVX512_FP16) From d3b227a4bd5aee9260f97d224d333ea701bcec50 Mon Sep 17 00:00:00 2001 From: nihuini Date: Mon, 23 Dec 2024 11:43:28 +0800 Subject: [PATCH 3/5] s --- CMakeLists.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 43b417b0195..adb2db00e6d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -523,7 +523,7 @@ else() check_cxx_source_compiles("#include \nint main() { __m512i _s, _a, _b; _s = _mm512_dpwssd_epi32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_X86_AVX512_VNNI) set(CMAKE_REQUIRED_FLAGS "/arch:AVX512 /O2") - check_cxx_source_compiles("#include \nint main() { __m512 _s; __m512bh _a, _b; _s = _mm512_dpbf16_ps(_s, _a, _b); return 0; }\n__m512i t(__m512 a) { __m256i _a = (__m256i)_mm512_cvtneps_pbh(a); return _mm512_inserti32x8(_mm512_castsi256_si512(_a), _a, 1); }" NCNN_COMPILER_SUPPORT_X86_AVX512_BF16) + check_cxx_source_compiles("#include \nint main() { __m256bh _s; __m512bh _a, _b; _s = _mm512_cvtneps_pbh(_mm512_dpbf16_ps(_mm512_cvtpbh_ps(_s), _a, _b)); return 0; }\n__m512i t(__m512 a) { __m256i _a = (__m256i)_mm512_cvtneps_pbh(a); return _mm512_inserti32x8(_mm512_castsi256_si512(_a), _a, 1); }" NCNN_COMPILER_SUPPORT_X86_AVX512_BF16) set(CMAKE_REQUIRED_FLAGS "/arch:AVX512") check_cxx_source_compiles("#include \nint main() { __m512h _s, _a, _b; _s = _mm512_fmadd_ph(_s, _a, _b); __m512 _s2; _s2 = _mm512_cvtxph_ps(_mm512_cvtxps_ph(_s2)); return 0; }" NCNN_COMPILER_SUPPORT_X86_AVX512_FP16) @@ -560,7 +560,7 @@ else() check_cxx_source_compiles("#include \nint main() { __m512i _s, _a, _b; _s = _mm512_dpwssd_epi32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_X86_AVX512_VNNI) set(CMAKE_REQUIRED_FLAGS "/arch:AVX512 -mfma -mf16c -mavx512cd -mavx512bw -mavx512dq -mavx512vl -mavx512bf16 /O2 -O2") - check_cxx_source_compiles("#include \nint main() { __m512 _s; __m512bh _a, _b; _s = _mm512_dpbf16_ps(_s, _a, _b); return 0; }\n__m512i t(__m512 a) { __m256i _a = (__m256i)_mm512_cvtneps_pbh(a); return _mm512_inserti32x8(_mm512_castsi256_si512(_a), _a, 1); }" NCNN_COMPILER_SUPPORT_X86_AVX512_BF16) + check_cxx_source_compiles("#include \nint main() { __m256bh _s; __m512bh _a, _b; _s = _mm512_cvtneps_pbh(_mm512_dpbf16_ps(_mm512_cvtpbh_ps(_s), _a, _b)); return 0; }\n__m512i t(__m512 a) { __m256i _a = (__m256i)_mm512_cvtneps_pbh(a); return _mm512_inserti32x8(_mm512_castsi256_si512(_a), _a, 1); }" NCNN_COMPILER_SUPPORT_X86_AVX512_BF16) set(CMAKE_REQUIRED_FLAGS "/arch:AVX512 -mfma -mf16c -mavx512cd -mavx512bw -mavx512dq -mavx512vl -mavx512fp16") check_cxx_source_compiles("#include \nint main() { __m512h _s, _a, _b; _s = _mm512_fmadd_ph(_s, _a, _b); __m512 _s2; _s2 = _mm512_cvtxph_ps(_mm512_cvtxps_ph(_s2)); return 0; }" NCNN_COMPILER_SUPPORT_X86_AVX512_FP16) @@ -595,7 +595,7 @@ else() check_cxx_source_compiles("#include \nint main() { __m512i _s, _a, _b; _s = _mm512_dpwssd_epi32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_X86_AVX512_VNNI) set(CMAKE_REQUIRED_FLAGS "-mfma -mf16c -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl -mavx512bf16 -O2") - check_cxx_source_compiles("#include \nint main() { __m512 _s; __m512bh _a, _b; _s = _mm512_dpbf16_ps(_s, _a, _b); return 0; }\n__m512i t(__m512 a) { __m256i _a = (__m256i)_mm512_cvtneps_pbh(a); return _mm512_inserti32x8(_mm512_castsi256_si512(_a), _a, 1); }" NCNN_COMPILER_SUPPORT_X86_AVX512_BF16) + check_cxx_source_compiles("#include \nint main() { __m256bh _s; __m512bh _a, _b; _s = _mm512_cvtneps_pbh(_mm512_dpbf16_ps(_mm512_cvtpbh_ps(_s), _a, _b)); return 0; }\n__m512i t(__m512 a) { __m256i _a = (__m256i)_mm512_cvtneps_pbh(a); return _mm512_inserti32x8(_mm512_castsi256_si512(_a), _a, 1); }" NCNN_COMPILER_SUPPORT_X86_AVX512_BF16) set(CMAKE_REQUIRED_FLAGS "-mfma -mf16c -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl -mavx512fp16") check_cxx_source_compiles("#include \nint main() { __m512h _s, _a, _b; _s = _mm512_fmadd_ph(_s, _a, _b); __m512 _s2; _s2 = _mm512_cvtxph_ps(_mm512_cvtxps_ph(_s2)); return 0; }" NCNN_COMPILER_SUPPORT_X86_AVX512_FP16) From f7bcea6b7b035ed4b42ada8eeed30de321770b09 Mon Sep 17 00:00:00 2001 From: nihui Date: Mon, 23 Dec 2024 08:04:12 +0000 Subject: [PATCH 4/5] check compiler supports isa with optimization enabled --- CMakeLists.txt | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index adb2db00e6d..2b532d7c245 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -139,6 +139,7 @@ endif() ############################################## include(CheckCXXCompilerFlag) +set(CMAKE_TRY_COMPILE_CONFIGURATION release) # gnu inline assembly in clang msvc does not work actually if(NOT (CMAKE_CXX_COMPILER_ID MATCHES "MSVC" OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_SIMULATE_ID MATCHES "MSVC" AND CMAKE_CXX_COMPILER_FRONTEND_VARIANT MATCHES "MSVC"))) @@ -522,7 +523,7 @@ else() set(CMAKE_REQUIRED_FLAGS "/arch:AVX512") check_cxx_source_compiles("#include \nint main() { __m512i _s, _a, _b; _s = _mm512_dpwssd_epi32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_X86_AVX512_VNNI) - set(CMAKE_REQUIRED_FLAGS "/arch:AVX512 /O2") + set(CMAKE_REQUIRED_FLAGS "/arch:AVX512") check_cxx_source_compiles("#include \nint main() { __m256bh _s; __m512bh _a, _b; _s = _mm512_cvtneps_pbh(_mm512_dpbf16_ps(_mm512_cvtpbh_ps(_s), _a, _b)); return 0; }\n__m512i t(__m512 a) { __m256i _a = (__m256i)_mm512_cvtneps_pbh(a); return _mm512_inserti32x8(_mm512_castsi256_si512(_a), _a, 1); }" NCNN_COMPILER_SUPPORT_X86_AVX512_BF16) set(CMAKE_REQUIRED_FLAGS "/arch:AVX512") @@ -559,7 +560,7 @@ else() set(CMAKE_REQUIRED_FLAGS "/arch:AVX512 -mfma -mf16c -mavx512cd -mavx512bw -mavx512dq -mavx512vl -mavx512vnni") check_cxx_source_compiles("#include \nint main() { __m512i _s, _a, _b; _s = _mm512_dpwssd_epi32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_X86_AVX512_VNNI) - set(CMAKE_REQUIRED_FLAGS "/arch:AVX512 -mfma -mf16c -mavx512cd -mavx512bw -mavx512dq -mavx512vl -mavx512bf16 /O2 -O2") + set(CMAKE_REQUIRED_FLAGS "/arch:AVX512 -mfma -mf16c -mavx512cd -mavx512bw -mavx512dq -mavx512vl -mavx512bf16") check_cxx_source_compiles("#include \nint main() { __m256bh _s; __m512bh _a, _b; _s = _mm512_cvtneps_pbh(_mm512_dpbf16_ps(_mm512_cvtpbh_ps(_s), _a, _b)); return 0; }\n__m512i t(__m512 a) { __m256i _a = (__m256i)_mm512_cvtneps_pbh(a); return _mm512_inserti32x8(_mm512_castsi256_si512(_a), _a, 1); }" NCNN_COMPILER_SUPPORT_X86_AVX512_BF16) set(CMAKE_REQUIRED_FLAGS "/arch:AVX512 -mfma -mf16c -mavx512cd -mavx512bw -mavx512dq -mavx512vl -mavx512fp16") @@ -594,7 +595,7 @@ else() set(CMAKE_REQUIRED_FLAGS "-mfma -mf16c -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl -mavx512vnni") check_cxx_source_compiles("#include \nint main() { __m512i _s, _a, _b; _s = _mm512_dpwssd_epi32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_X86_AVX512_VNNI) - set(CMAKE_REQUIRED_FLAGS "-mfma -mf16c -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl -mavx512bf16 -O2") + set(CMAKE_REQUIRED_FLAGS "-mfma -mf16c -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl -mavx512bf16") check_cxx_source_compiles("#include \nint main() { __m256bh _s; __m512bh _a, _b; _s = _mm512_cvtneps_pbh(_mm512_dpbf16_ps(_mm512_cvtpbh_ps(_s), _a, _b)); return 0; }\n__m512i t(__m512 a) { __m256i _a = (__m256i)_mm512_cvtneps_pbh(a); return _mm512_inserti32x8(_mm512_castsi256_si512(_a), _a, 1); }" NCNN_COMPILER_SUPPORT_X86_AVX512_BF16) set(CMAKE_REQUIRED_FLAGS "-mfma -mf16c -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl -mavx512fp16") From 90543f56495d87be0bb6b15481e00519c0b5adce Mon Sep 17 00:00:00 2001 From: nihui Date: Mon, 23 Dec 2024 08:18:25 +0000 Subject: [PATCH 5/5] f --- src/layer/x86/gemm_int8.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/layer/x86/gemm_int8.h b/src/layer/x86/gemm_int8.h index f9e0050fd55..132cf9d8cb9 100644 --- a/src/layer/x86/gemm_int8.h +++ b/src/layer/x86/gemm_int8.h @@ -2014,7 +2014,7 @@ static void pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int max_ii, i __m256i _pp = combine4x2_epi32(_pp0, _pp1); #if !__AVXVNNIINT8__ - _w_shift = _mm256_dpbusd_epi32(_w_shift, _v127, _pp); + _w_shift = _mm256_comp_dpbusd_epi32(_w_shift, _v127, _pp); #endif // !__AVXVNNIINT8__ _mm256_storeu_si256((__m256i*)pp, _pp); @@ -2108,7 +2108,7 @@ static void pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int max_ii, i __m256i _pp = combine4x2_epi32(_pp0, _pp1); #if !__AVXVNNIINT8__ - _w_shift = _mm256_dpbusd_epi32(_w_shift, _v127, _pp); + _w_shift = _mm256_comp_dpbusd_epi32(_w_shift, _v127, _pp); #endif // !__AVXVNNIINT8__ _mm256_storeu_si256((__m256i*)pp, _pp);