x86 optimization for convolution int8 gemm #5874

Open
wants to merge 22 commits into master
apply code-format changes
nihui authored and github-actions[bot] committed Jan 13, 2025
commit 1e4163305d17add9b31a0584bf69ba3f02931901
14 changes: 6 additions & 8 deletions src/layer/x86/convolution_im2col_gemm_int8.h
@@ -5796,7 +5796,7 @@ static void convolution_im2col_input_tile_conv1x1s1d1_int8(const Mat& bottom_blo
_r7 = _mm_add_epi8(_r7, _v127);
#endif // __AVXVNNIINT8__

-#else // __AVX512VNNI__ || __AVXVNNI__
+#else // __AVX512VNNI__ || __AVXVNNI__
__m128i _t0 = _mm_unpacklo_epi16(_r01, _r23);
__m128i _t1 = _mm_unpackhi_epi16(_r01, _r23);
__m128i _t2 = _mm_unpacklo_epi16(_r45, _r67);
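
Context note (a hedged outline, not text from the PR): the trailing comments being realigned in these hunks all annotate the same preprocessor ladder that picks the int8 packing path. Reconstructed only from the #if/#else/#endif comments visible in the context lines, the structure is roughly:

#if __AVX512VNNI__ || __AVXVNNI__
#if __AVXVNNIINT8__
// s8 x s8 dot products are available, so the input tile can be stored as-is
#else  // __AVXVNNIINT8__
// only u8 x s8 dot products, so the signed input tile is biased by +127 first
#endif // __AVXVNNIINT8__
#else  // __AVX512VNNI__ || __AVXVNNI__
// no VNNI: fall back to 16-bit unpack/interleave packing for the SSE/AVX2 kernels
#endif // __AVX512VNNI__ || __AVXVNNI__
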
@@ -5934,7 +5934,7 @@ static void convolution_im2col_input_tile_conv1x1s1d1_int8(const Mat& bottom_blo
_r3 = _mm_add_epi8(_r3, _v127);
#endif // __AVXVNNIINT8__

-#else // __AVX512VNNI__ || __AVXVNNI__
+#else // __AVX512VNNI__ || __AVXVNNI__
__m128i _t0 = _mm_unpacklo_epi16(_r01, _r23);
__m128i _t1 = _mm_unpackhi_epi16(_r01, _r23);
__m128i _t2 = _mm_unpacklo_epi16(_r45, _r67);
@@ -6039,7 +6039,7 @@ static void convolution_im2col_input_tile_conv1x1s1d1_int8(const Mat& bottom_blo
_r1 = _mm_add_epi8(_r1, _v127);
#endif // __AVXVNNIINT8__

-#else // __AVX512VNNI__ || __AVXVNNI__
+#else // __AVX512VNNI__ || __AVXVNNI__
__m128i _t0 = _mm_unpacklo_epi16(_r01, _r23);
__m128i _t1 = _mm_unpackhi_epi16(_r01, _r23);

@@ -6145,7 +6145,7 @@ static void convolution_im2col_input_tile_conv1x1s1d1_int8(const Mat& bottom_blo
#else // __AVXVNNIINT8__
_r01 = _mm_add_epi8(_r01, _mm_set1_epi8(127));
#endif // __AVXVNNIINT8__
-#else // __AVX512VNNI__ || __AVXVNNI__
+#else // __AVX512VNNI__ || __AVXVNNI__
__m128i _r01 = _mm_unpacklo_epi16(_r0, _r1);
#endif // __AVX512VNNI__ || __AVXVNNI__
_mm_storeu_si128((__m128i*)pp, _r01);
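
Side note (an illustrative sketch with made-up values, not code from this file): in the non-VNNI branch above, _mm_unpacklo_epi16/_mm_unpackhi_epi16 interleave 16-bit lanes, i.e. int8 pairs that are already packed together, so that corresponding pairs from different rows land next to each other in the output tile. A minimal standalone SSE2 illustration:

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

static void print_epi16(const char* name, __m128i v)
{
    int16_t buf[8];
    _mm_storeu_si128((__m128i*)buf, v);
    printf("%s:", name);
    for (int i = 0; i < 8; i++)
        printf(" %d", buf[i]);
    printf("\n");
}

int main(void)
{
    // each 16-bit lane stands in for one already-packed int8 pair (hypothetical values)
    __m128i _r01 = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    __m128i _r23 = _mm_setr_epi16(10, 11, 12, 13, 14, 15, 16, 17);

    // interleave: lane order becomes r01[0], r23[0], r01[1], r23[1], ...
    __m128i _t0 = _mm_unpacklo_epi16(_r01, _r23);
    __m128i _t1 = _mm_unpackhi_epi16(_r01, _r23);

    print_epi16("_t0", _t0); // 0 10 1 11 2 12 3 13
    print_epi16("_t1", _t1); // 4 14 5 15 6 16 7 17
    return 0;
}
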
@@ -6966,7 +6966,6 @@ static void convolution_im2col_input_tile_int8_impl(const Mat& bottom_blob, Mat&
_r0 = _mm256_shuffle_epi32(_r0, _MM_SHUFFLE(3, 1, 2, 0));
_r1 = _mm256_shuffle_epi32(_r1, _MM_SHUFFLE(3, 1, 2, 0));

-
#endif // __AVX512VNNI__ || __AVXVNNI__

_mm256_storeu_si256((__m256i*)pp, _r0);
@@ -7262,7 +7261,7 @@ static void convolution_im2col_input_tile_int8_impl(const Mat& bottom_blob, Mat&

_mm256_storeu_si256((__m256i*)pp, _r0);
_mm256_storeu_si256((__m256i*)(pp + 32), _r1);
-#else // __AVX2__
+#else // __AVX2__

__m128i _vindex0 = _mm_add_epi32(_dxy_offset0, _mm_set1_epi32(puv_offset));
__m128i _vindex1 = _mm_add_epi32(_dxy_offset1, _mm_set1_epi32(puv_offset));
@@ -7450,7 +7449,6 @@ static void convolution_im2col_input_tile_int8_impl(const Mat& bottom_blob, Mat&
_r1 = _mm_add_epi8(_r1, _v127);
#endif // __AVXVNNIINT8__

-
#else // __AVX512VNNI__ || __AVXVNNI__

// 00001111
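
Side note (a minimal scalar sketch, assuming activations are quantized into [-127, 127] as is usual for int8 inference; it is not code from the PR): the _mm_add_epi8(_r1, _v127) in the non-__AVXVNNIINT8__ branch biases the signed input tile into unsigned range so it can feed a u8 x s8 dot product such as vpdpbusd; the bias is then compensated elsewhere, typically by subtracting 127 * sum(weights) from the accumulator. The idea in scalar form:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    // hypothetical data; activations assumed to stay in [-127, 127]
    int8_t a[4] = {-5, 3, -127, 127}; // signed activations (the "_r" tile)
    int8_t w[4] = {2, -7, 1, 5};      // signed weights

    // reference signed dot product
    int32_t ref = 0;
    for (int i = 0; i < 4; i++)
        ref += (int32_t)a[i] * w[i];

    // vpdpbusd-style path: bias activations to unsigned, accumulate, then compensate
    int32_t acc = 0;
    int32_t wsum = 0;
    for (int i = 0; i < 4; i++)
    {
        uint8_t ua = (uint8_t)(a[i] + 127); // same bias as _mm_add_epi8(_r, _v127)
        acc += (int32_t)ua * (int32_t)w[i];
        wsum += w[i];
    }
    acc -= 127 * wsum; // remove the bias contribution

    printf("ref=%d biased+compensated=%d\n", ref, acc); // both print 477
    return 0;
}
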
@@ -7727,7 +7725,7 @@ static void convolution_im2col_input_tile_int8_impl(const Mat& bottom_blob, Mat&
_r0 = _mm_shuffle_epi8(_r0, _mm_setr_epi8(0, 2, 4, 6, 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0));

#if __AVXVNNIINT8__
-#else // __AVXVNNIINT8__
+#else // __AVXVNNIINT8__

__m128i _v127 = _mm_set1_epi8(127);