Skip to content

Commit

Permalink
fix
Browse files (browse the repository at this point in the history)
  • Loading branch information
nihui committed Oct 9, 2023
1 parent 1c5cae2 commit 35e3de9
Showing 1 changed file with 36 additions and 36 deletions.
72 changes: 36 additions & 36 deletions src/layer/x86/convolution_im2col_gemm_int8.h
Original file line number Diff line number Diff line change
Expand Up @@ -6349,14 +6349,14 @@ void convolution_im2col_input_tile_int8(const Mat& bottom_blob, Mat& B, int j, i
_rd = _mm_unpacklo_epi64(_r5, _r7);
_re = _mm_unpackhi_epi64(_r1, _r3);
_rf = _mm_unpackhi_epi64(_r5, _r7);
- _mm_storeu_si128((__m128i*)pp, _r8);
- _mm_storeu_si128((__m128i*)(pp + 16), _r9);
- _mm_storeu_si128((__m128i*)(pp + 32), _ra);
- _mm_storeu_si128((__m128i*)(pp + 48), _rb);
- _mm_storeu_si128((__m128i*)(pp + 64), _rc);
- _mm_storeu_si128((__m128i*)(pp + 80), _rd);
- _mm_storeu_si128((__m128i*)(pp + 96), _re);
- _mm_storeu_si128((__m128i*)(pp + 112), _rf);
+ _mm_store_si128((__m128i*)pp, _r8);
+ _mm_store_si128((__m128i*)(pp + 16), _r9);
+ _mm_store_si128((__m128i*)(pp + 32), _ra);
+ _mm_store_si128((__m128i*)(pp + 48), _rb);
+ _mm_store_si128((__m128i*)(pp + 64), _rc);
+ _mm_store_si128((__m128i*)(pp + 80), _rd);
+ _mm_store_si128((__m128i*)(pp + 96), _re);
+ _mm_store_si128((__m128i*)(pp + 112), _rf);
pp += 128;
}
if (elempack == 1)
Expand Down Expand Up @@ -6636,14 +6636,14 @@ void convolution_im2col_input_tile_int8(const Mat& bottom_blob, Mat& B, int j, i
_rd = _mm_unpacklo_epi64(_r5, _r7);
_re = _mm_unpackhi_epi64(_r1, _r3);
_rf = _mm_unpackhi_epi64(_r5, _r7);
- _mm_storeu_si128((__m128i*)pp, _r8);
- _mm_storeu_si128((__m128i*)(pp + 16), _r9);
- _mm_storeu_si128((__m128i*)(pp + 32), _ra);
- _mm_storeu_si128((__m128i*)(pp + 48), _rb);
- _mm_storeu_si128((__m128i*)(pp + 64), _rc);
- _mm_storeu_si128((__m128i*)(pp + 80), _rd);
- _mm_storeu_si128((__m128i*)(pp + 96), _re);
- _mm_storeu_si128((__m128i*)(pp + 112), _rf);
+ _mm_store_si128((__m128i*)pp, _r8);
+ _mm_store_si128((__m128i*)(pp + 16), _r9);
+ _mm_store_si128((__m128i*)(pp + 32), _ra);
+ _mm_store_si128((__m128i*)(pp + 48), _rb);
+ _mm_store_si128((__m128i*)(pp + 64), _rc);
+ _mm_store_si128((__m128i*)(pp + 80), _rd);
+ _mm_store_si128((__m128i*)(pp + 96), _re);
+ _mm_store_si128((__m128i*)(pp + 112), _rf);
pp += 128;
}
if (elempack == 1)
Expand Down Expand Up @@ -6721,7 +6721,7 @@ void convolution_im2col_input_tile_int8(const Mat& bottom_blob, Mat& B, int j, i
__m128i _r0 = _mm_loadl_epi64((const __m128i*)sptr0);
__m128i _r1 = _mm_loadl_epi64((const __m128i*)sptr1);
__m128i _r01 = _mm_unpacklo_epi8(_r0, _r1);
- _mm_store_si128((__m128i*)pp, _r01);
+ _mm_storeu_si128((__m128i*)pp, _r01);
pp += 16;
}
else if (stride_w == 2)
Expand All @@ -6735,7 +6735,7 @@ void convolution_im2col_input_tile_int8(const Mat& bottom_blob, Mat& B, int j, i
_tmp0 = _mm_shuffle_epi32(_tmp0, _MM_SHUFFLE(3, 1, 2, 0));
_tmp1 = _mm_shuffle_epi32(_tmp1, _MM_SHUFFLE(3, 1, 2, 0));
__m128i _r01 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(_tmp0), _mm_castsi128_ps(_tmp1), _MM_SHUFFLE(1, 0, 1, 0)));
- _mm_store_si128((__m128i*)pp, _r01);
+ _mm_storeu_si128((__m128i*)pp, _r01);
pp += 16;
}
else
Expand Down Expand Up @@ -7670,14 +7670,14 @@ static void convolution_im2col_input_tile_int8(const Mat& bottom_blob, Mat& B, i
_rd = _mm_unpacklo_epi64(_r5, _r7);
_re = _mm_unpackhi_epi64(_r1, _r3);
_rf = _mm_unpackhi_epi64(_r5, _r7);
- _mm_storeu_si128((__m128i*)pp, _r8);
- _mm_storeu_si128((__m128i*)(pp + 16), _r9);
- _mm_storeu_si128((__m128i*)(pp + 32), _ra);
- _mm_storeu_si128((__m128i*)(pp + 48), _rb);
- _mm_storeu_si128((__m128i*)(pp + 64), _rc);
- _mm_storeu_si128((__m128i*)(pp + 80), _rd);
- _mm_storeu_si128((__m128i*)(pp + 96), _re);
- _mm_storeu_si128((__m128i*)(pp + 112), _rf);
+ _mm_store_si128((__m128i*)pp, _r8);
+ _mm_store_si128((__m128i*)(pp + 16), _r9);
+ _mm_store_si128((__m128i*)(pp + 32), _ra);
+ _mm_store_si128((__m128i*)(pp + 48), _rb);
+ _mm_store_si128((__m128i*)(pp + 64), _rc);
+ _mm_store_si128((__m128i*)(pp + 80), _rd);
+ _mm_store_si128((__m128i*)(pp + 96), _re);
+ _mm_store_si128((__m128i*)(pp + 112), _rf);
pp += 128;
}
if (elempack == 1)
Expand Down Expand Up @@ -7957,14 +7957,14 @@ static void convolution_im2col_input_tile_int8(const Mat& bottom_blob, Mat& B, i
_rd = _mm_unpacklo_epi64(_r5, _r7);
_re = _mm_unpackhi_epi64(_r1, _r3);
_rf = _mm_unpackhi_epi64(_r5, _r7);
- _mm_storeu_si128((__m128i*)pp, _r8);
- _mm_storeu_si128((__m128i*)(pp + 16), _r9);
- _mm_storeu_si128((__m128i*)(pp + 32), _ra);
- _mm_storeu_si128((__m128i*)(pp + 48), _rb);
- _mm_storeu_si128((__m128i*)(pp + 64), _rc);
- _mm_storeu_si128((__m128i*)(pp + 80), _rd);
- _mm_storeu_si128((__m128i*)(pp + 96), _re);
- _mm_storeu_si128((__m128i*)(pp + 112), _rf);
+ _mm_store_si128((__m128i*)pp, _r8);
+ _mm_store_si128((__m128i*)(pp + 16), _r9);
+ _mm_store_si128((__m128i*)(pp + 32), _ra);
+ _mm_store_si128((__m128i*)(pp + 48), _rb);
+ _mm_store_si128((__m128i*)(pp + 64), _rc);
+ _mm_store_si128((__m128i*)(pp + 80), _rd);
+ _mm_store_si128((__m128i*)(pp + 96), _re);
+ _mm_store_si128((__m128i*)(pp + 112), _rf);
pp += 128;
}
if (elempack == 1)
Expand Down Expand Up @@ -8042,7 +8042,7 @@ static void convolution_im2col_input_tile_int8(const Mat& bottom_blob, Mat& B, i
__m128i _r0 = _mm_loadl_epi64((const __m128i*)sptr0);
__m128i _r1 = _mm_loadl_epi64((const __m128i*)sptr1);
__m128i _r01 = _mm_unpacklo_epi8(_r0, _r1);
- _mm_store_si128((__m128i*)pp, _r01);
+ _mm_storeu_si128((__m128i*)pp, _r01);
pp += 16;
}
else if (stride_w == 2)
Expand All @@ -8056,7 +8056,7 @@ static void convolution_im2col_input_tile_int8(const Mat& bottom_blob, Mat& B, i
_tmp0 = _mm_shuffle_epi32(_tmp0, _MM_SHUFFLE(3, 1, 2, 0));
_tmp1 = _mm_shuffle_epi32(_tmp1, _MM_SHUFFLE(3, 1, 2, 0));
__m128i _r01 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(_tmp0), _mm_castsi128_ps(_tmp1), _MM_SHUFFLE(1, 0, 1, 0)));
- _mm_store_si128((__m128i*)pp, _r01);
+ _mm_storeu_si128((__m128i*)pp, _r01);
pp += 16;
}
else
Expand Down

0 comments on commit 35e3de9

Please sign in to comment.