Skip to content

Commit

Permalink
zzz
Browse files Browse the repository at this point in the history
  • Loading branch information
nihui committed Jan 16, 2025
1 parent 659c457 commit 2aa1233
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 23 deletions.
24 changes: 13 additions & 11 deletions src/layer/x86/convolution_im2col_gemm_int8.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ static void convolution_gemm_transB_packed_tile_int8(const Mat& AT_tile, const M
#endif
}

static void convolution_im2col_gemm_get_optimal_tile_mnk_int8(int M, int N, int K, int& TILE_M, int& TILE_N, int& TILE_K, int nT)
static NCNN_FORCEINLINE void convolution_im2col_gemm_get_optimal_tile_mnk_int8(int M, int N, int K, int& TILE_M, int& TILE_N, int& TILE_K, int nT)
{
// resolve optimal tile size from cache size
const size_t l2_cache_size_int8 = (int)(get_cpu_level2_cache_size() / sizeof(signed char));
Expand Down Expand Up @@ -205,11 +205,13 @@ static void convolution_im2col_gemm_get_optimal_tile_mnk_int8(int M, int N, int
}
}

static void convolution_im2col_input_tile_conv1x1s1d1_int8(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk)
static NCNN_FORCEINLINE void convolution_im2col_input_tile_conv1x1s1d1_int8(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk)
{
const int elempack = bottom_blob.elempack;
const int cstep = (int)bottom_blob.cstep;

// NCNN_LOGE("convolution_im2col_input_tile_conv1x1s1d1_int8 %d %d %d %d @%d", j, max_jj, k, max_kk, elempack);

signed char* pp = B;

int jj = 0;
Expand Down Expand Up @@ -820,7 +822,7 @@ static void convolution_im2col_input_tile_int8_impl(const Mat& bottom_blob, Mat&
__m128i _uv = _mm_sub_epi32(_puv, _mm_mullo_epi32(_p, _mm_set1_epi32(maxk)));
__m128i _u = div_kernel_w._mm_comp_div_epu32(_uv);
__m128i _v = _mm_sub_epi32(_uv, _mm_mullo_epi32(_u, _mm_set1_epi32(kernel_w)));
_p = _mm_mullo_epi32(_p, _mm_set1_epi32(bottom_blob.cstep));
_p = _mm_mullo_epi32(_p, _mm_set1_epi32(cstep));
_u = _mm_mullo_epi32(_u, _mm_set1_epi32(dilation_h));
_v = _mm_mullo_epi32(_v, _mm_set1_epi32(dilation_w));
_u = _mm_mullo_epi32(_u, _mm_set1_epi32(w));
Expand Down Expand Up @@ -1031,7 +1033,7 @@ static void convolution_im2col_input_tile_int8_impl(const Mat& bottom_blob, Mat&
__m128i _uv = _mm_sub_epi32(_puv, _mm_mullo_epi32(_p, _mm_set1_epi32(maxk)));
__m128i _u = div_kernel_w._mm_comp_div_epu32(_uv);
__m128i _v = _mm_sub_epi32(_uv, _mm_mullo_epi32(_u, _mm_set1_epi32(kernel_w)));
_p = _mm_mullo_epi32(_p, _mm_set1_epi32(bottom_blob.cstep));
_p = _mm_mullo_epi32(_p, _mm_set1_epi32(cstep));
_u = _mm_mullo_epi32(_u, _mm_set1_epi32(dilation_h));
_v = _mm_mullo_epi32(_v, _mm_set1_epi32(dilation_w));
_u = _mm_mullo_epi32(_u, _mm_set1_epi32(w));
Expand Down Expand Up @@ -1316,7 +1318,7 @@ static void convolution_im2col_input_tile_int8_impl(const Mat& bottom_blob, Mat&
__m128i _uv = _mm_sub_epi32(_puv, _mm_mullo_epi32(_p, _mm_set1_epi32(maxk)));
__m128i _u = div_kernel_w._mm_comp_div_epu32(_uv);
__m128i _v = _mm_sub_epi32(_uv, _mm_mullo_epi32(_u, _mm_set1_epi32(kernel_w)));
_p = _mm_mullo_epi32(_p, _mm_set1_epi32(bottom_blob.cstep));
_p = _mm_mullo_epi32(_p, _mm_set1_epi32(cstep));
_u = _mm_mullo_epi32(_u, _mm_set1_epi32(dilation_h));
_v = _mm_mullo_epi32(_v, _mm_set1_epi32(dilation_w));
_u = _mm_mullo_epi32(_u, _mm_set1_epi32(w));
Expand Down Expand Up @@ -1523,7 +1525,7 @@ static void convolution_im2col_input_tile_int8_impl(const Mat& bottom_blob, Mat&
__m128i _uv = _mm_sub_epi32(_puv, _mm_mullo_epi32(_p, _mm_set1_epi32(maxk)));
__m128i _u = div_kernel_w._mm_comp_div_epu32(_uv);
__m128i _v = _mm_sub_epi32(_uv, _mm_mullo_epi32(_u, _mm_set1_epi32(kernel_w)));
_p = _mm_mullo_epi32(_p, _mm_set1_epi32(bottom_blob.cstep));
_p = _mm_mullo_epi32(_p, _mm_set1_epi32(cstep));
_u = _mm_mullo_epi32(_u, _mm_set1_epi32(dilation_h));
_v = _mm_mullo_epi32(_v, _mm_set1_epi32(dilation_w));
_u = _mm_mullo_epi32(_u, _mm_set1_epi32(w));
Expand Down Expand Up @@ -1835,7 +1837,7 @@ static void convolution_im2col_input_tile_int8_impl(const Mat& bottom_blob, Mat&
__m128i _uv = _mm_sub_epi32(_puv, _mm_mullo_epi32(_p, _mm_set1_epi32(maxk)));
__m128i _u = div_kernel_w._mm_comp_div_epu32(_uv);
__m128i _v = _mm_sub_epi32(_uv, _mm_mullo_epi32(_u, _mm_set1_epi32(kernel_w)));
_p = _mm_mullo_epi32(_p, _mm_set1_epi32(bottom_blob.cstep));
_p = _mm_mullo_epi32(_p, _mm_set1_epi32(cstep));
_u = _mm_mullo_epi32(_u, _mm_set1_epi32(dilation_h));
_v = _mm_mullo_epi32(_v, _mm_set1_epi32(dilation_w));
_u = _mm_mullo_epi32(_u, _mm_set1_epi32(w));
Expand Down Expand Up @@ -1974,7 +1976,7 @@ static void convolution_im2col_input_tile_int8_impl(const Mat& bottom_blob, Mat&
__m128i _uv = _mm_sub_epi32(_puv, _mm_mullo_epi32(_p, _mm_set1_epi32(maxk)));
__m128i _u = div_kernel_w._mm_comp_div_epu32(_uv);
__m128i _v = _mm_sub_epi32(_uv, _mm_mullo_epi32(_u, _mm_set1_epi32(kernel_w)));
_p = _mm_mullo_epi32(_p, _mm_set1_epi32(bottom_blob.cstep));
_p = _mm_mullo_epi32(_p, _mm_set1_epi32(cstep));
_u = _mm_mullo_epi32(_u, _mm_set1_epi32(dilation_h));
_v = _mm_mullo_epi32(_v, _mm_set1_epi32(dilation_w));
_u = _mm_mullo_epi32(_u, _mm_set1_epi32(w));
Expand Down Expand Up @@ -2197,7 +2199,7 @@ static void convolution_im2col_input_tile_int8_impl(const Mat& bottom_blob, Mat&
__m128i _uv = _mm_sub_epi32(_puv, _mm_mullo_epi32(_p, _mm_set1_epi32(maxk)));
__m128i _u = div_kernel_w._mm_comp_div_epu32(_uv);
__m128i _v = _mm_sub_epi32(_uv, _mm_mullo_epi32(_u, _mm_set1_epi32(kernel_w)));
_p = _mm_mullo_epi32(_p, _mm_set1_epi32(bottom_blob.cstep));
_p = _mm_mullo_epi32(_p, _mm_set1_epi32(cstep));
_u = _mm_mullo_epi32(_u, _mm_set1_epi32(dilation_h));
_v = _mm_mullo_epi32(_v, _mm_set1_epi32(dilation_w));
_u = _mm_mullo_epi32(_u, _mm_set1_epi32(w));
Expand Down Expand Up @@ -2321,7 +2323,7 @@ static void convolution_im2col_input_tile_int8_impl(const Mat& bottom_blob, Mat&
__m128i _uv = _mm_sub_epi32(_puv, _mm_mullo_epi32(_p, _mm_set1_epi32(maxk)));
__m128i _u = div_kernel_w._mm_comp_div_epu32(_uv);
__m128i _v = _mm_sub_epi32(_uv, _mm_mullo_epi32(_u, _mm_set1_epi32(kernel_w)));
_p = _mm_mullo_epi32(_p, _mm_set1_epi32(bottom_blob.cstep));
_p = _mm_mullo_epi32(_p, _mm_set1_epi32(cstep));
_u = _mm_mullo_epi32(_u, _mm_set1_epi32(dilation_h));
_v = _mm_mullo_epi32(_v, _mm_set1_epi32(dilation_w));
_u = _mm_mullo_epi32(_u, _mm_set1_epi32(w));
Expand Down Expand Up @@ -2481,7 +2483,7 @@ static void convolution_im2col_input_tile_int8_impl(const Mat& bottom_blob, Mat&
__m128i _uv = _mm_sub_epi32(_puv, _mm_mullo_epi32(_p, _mm_set1_epi32(maxk)));
__m128i _u = div_kernel_w._mm_comp_div_epu32(_uv);
__m128i _v = _mm_sub_epi32(_uv, _mm_mullo_epi32(_u, _mm_set1_epi32(kernel_w)));
_p = _mm_mullo_epi32(_p, _mm_set1_epi32(bottom_blob.cstep));
_p = _mm_mullo_epi32(_p, _mm_set1_epi32(cstep));
_u = _mm_mullo_epi32(_u, _mm_set1_epi32(dilation_h));
_v = _mm_mullo_epi32(_v, _mm_set1_epi32(dilation_w));
_u = _mm_mullo_epi32(_u, _mm_set1_epi32(w));
Expand Down
18 changes: 6 additions & 12 deletions src/layer/x86/x86_usability.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@
class FastDivider_epu32
{
public:
FastDivider_epu32(unsigned int d)
NCNN_FORCEINLINE FastDivider_epu32(unsigned int d)
{
unsigned int m, sh1, sh2;
if (d == 1)
Expand All @@ -54,13 +54,7 @@ class FastDivider_epu32
else
{
// sh = ceil(log2(d))
#ifdef _MSC_VER
unsigned long index;
_BitScanReverse(&index, d - 1);
uint32_t sh = index + 1;
#else
uint32_t sh = 32 - __builtin_clz(d - 1);
#endif
uint32_t sh = portable_ceil_log2(d);
uint32_t m0 = sh == 32 ? 0 : 1 << sh;

m = 1 + uint32_t((uint64_t(m0 - d) << 32) / d);
Expand All @@ -81,7 +75,7 @@ class FastDivider_epu32

#if __AVX2__
#if __AVX512F__
__m512i _mm512_comp_div_epu32(__m512i x) const
NCNN_FORCEINLINE __m512i _mm512_comp_div_epu32(__m512i x) const
{
// xm = (x * multiplier) >> 32
__m512i xm_low = _mm512_srli_epi64(_mm512_mul_epu32(x, _multiplier), 32);
Expand All @@ -93,13 +87,13 @@ class FastDivider_epu32
}
#endif // __AVX512F__

__m256i _mm256_comp_div_epu32(__m256i x) const
NCNN_FORCEINLINE __m256i _mm256_comp_div_epu32(__m256i x) const
{
// xm = (x * multiplier) >> 32
#if __AVX512F__
__m256i xm_low = _mm256_srli_epi64(_mm256_mul_epu32(x, _mm512_castsi512_si256(_multiplier)), 32);
__m256i xm_high = _mm256_mul_epu32(_mm256_srli_epi64(x, 32), _mm512_castsi512_si256(_multiplier));
#elif __AVX2__
#else
__m256i xm_low = _mm256_srli_epi64(_mm256_mul_epu32(x, _multiplier), 32);
__m256i xm_high = _mm256_mul_epu32(_mm256_srli_epi64(x, 32), _multiplier);
#endif
Expand All @@ -109,7 +103,7 @@ class FastDivider_epu32
}
#endif // __AVX2__

__m128i _mm_comp_div_epu32(__m128i x) const
NCNN_FORCEINLINE __m128i _mm_comp_div_epu32(__m128i x) const
{
// xm = (x * multiplier) >> 32
#if __AVX512F__
Expand Down

0 comments on commit 2aa1233

Please sign in to comment.