diff --git a/src/layer/x86/convolution_im2col_gemm_int8.h b/src/layer/x86/convolution_im2col_gemm_int8.h index 32ef9c64767..ca186c5cbb2 100644 --- a/src/layer/x86/convolution_im2col_gemm_int8.h +++ b/src/layer/x86/convolution_im2col_gemm_int8.h @@ -6141,14 +6141,13 @@ static void convolution_im2col_input_tile_conv1x1s1d1_int8(const Mat& bottom_blo } } -static void convolution_im2col_input_tile_int8(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h) +template<int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h> +#if __AVX512F__ +void convolution_im2col_input_tile_int8_avx512(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk) +#else // __AVX512F__ +void convolution_im2col_input_tile_int8(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk) +#endif // __AVX512F__ { - if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) - { - convolution_im2col_input_tile_conv1x1s1d1_int8(bottom_blob, B, j, max_jj, k, max_kk); - return; - } - const int w = bottom_blob.w; // const int channels = bottom_blob.c; const int elempack = bottom_blob.elempack; @@ -6205,288 +6204,468 @@ static void convolution_im2col_input_tile_int8(const Mat& bottom_blob, Mat& B, i int dxe = (j + jj + 14) % outw; int dxf = (j + jj + 15) % outw; - int kk = 0; - if (elempack == 1) + if (dy0 == dyf) { - for (; kk + 1 < max_kk; kk += 2) + int kk = 0; + if (elempack == 1) { - int p0 = (k + kk) / maxk; - int p1 = (k + kk + 1) / maxk; - int uv0 = (k + kk) % maxk; - int uv1 = (k + kk + 1) % maxk; - int u0 = uv0 / kernel_w; - int u1 = uv1 / kernel_w; - int v0 = uv0 % kernel_w; - int v1 = uv1 % kernel_w; - - const Mat img0 = bottom_blob.channel(p0); - const Mat img1 = bottom_blob.channel(p1); - - int x00 = stride_w * dx0 + dilation_w * v0; - int x01 = stride_w * dx1 + dilation_w * v0; - int x02 = stride_w * dx2 + dilation_w * v0; - int x03 = stride_w * dx3 + dilation_w * v0; - int x04 = stride_w * dx4 + dilation_w * v0; - int x05 = stride_w * dx5 + dilation_w * v0; - int x06 = stride_w * dx6 + dilation_w * v0; - int x07 = stride_w * dx7 + dilation_w * v0; - int x08 = stride_w * dx8 + dilation_w * v0; - int x09 = stride_w * dx9 + dilation_w * v0; - int x0a = stride_w * dxa + dilation_w * v0; - int x0b = stride_w * dxb + dilation_w * v0; - int x0c = stride_w * dxc + dilation_w * v0; - int x0d = stride_w * dxd + dilation_w * v0; - int x0e = stride_w * dxe + dilation_w * v0; - int x0f = stride_w * dxf + dilation_w * v0; - int y00 = stride_h * dy0 + dilation_h * u0; - int y01 = stride_h * dy1 + dilation_h * u0; - int y02 = stride_h * dy2 + dilation_h * u0; - int y03 = stride_h * dy3 + dilation_h * u0; - int y04 = stride_h * dy4 + dilation_h * u0; - int y05 = stride_h * dy5 + dilation_h * u0; - int y06 = stride_h * dy6 + dilation_h * u0; - int y07 = stride_h * dy7 + dilation_h * u0; - int y08 = stride_h * dy8 + dilation_h * u0; - int y09 = stride_h * dy9 + dilation_h * u0; - int y0a = stride_h * dya + dilation_h * u0; - int y0b = stride_h * dyb + dilation_h * u0; - int y0c = stride_h * dyc + dilation_h * u0; - int y0d = stride_h * dyd + dilation_h * u0; - int y0e = stride_h * dye + dilation_h * u0; - int y0f = stride_h * dyf + dilation_h * u0; - - int x10 = stride_w * dx0 + dilation_w * v1; - int x11 = stride_w * dx1 + dilation_w * v1; - int x12 = stride_w * dx2 + dilation_w * v1; - int x13 = stride_w * dx3 + dilation_w * v1; - int x14 = stride_w * dx4 + dilation_w * v1; - int x15 = stride_w * dx5 +
dilation_w * v1; - int x16 = stride_w * dx6 + dilation_w * v1; - int x17 = stride_w * dx7 + dilation_w * v1; - int x18 = stride_w * dx8 + dilation_w * v1; - int x19 = stride_w * dx9 + dilation_w * v1; - int x1a = stride_w * dxa + dilation_w * v1; - int x1b = stride_w * dxb + dilation_w * v1; - int x1c = stride_w * dxc + dilation_w * v1; - int x1d = stride_w * dxd + dilation_w * v1; - int x1e = stride_w * dxe + dilation_w * v1; - int x1f = stride_w * dxf + dilation_w * v1; - int y10 = stride_h * dy0 + dilation_h * u1; - int y11 = stride_h * dy1 + dilation_h * u1; - int y12 = stride_h * dy2 + dilation_h * u1; - int y13 = stride_h * dy3 + dilation_h * u1; - int y14 = stride_h * dy4 + dilation_h * u1; - int y15 = stride_h * dy5 + dilation_h * u1; - int y16 = stride_h * dy6 + dilation_h * u1; - int y17 = stride_h * dy7 + dilation_h * u1; - int y18 = stride_h * dy8 + dilation_h * u1; - int y19 = stride_h * dy9 + dilation_h * u1; - int y1a = stride_h * dya + dilation_h * u1; - int y1b = stride_h * dyb + dilation_h * u1; - int y1c = stride_h * dyc + dilation_h * u1; - int y1d = stride_h * dyd + dilation_h * u1; - int y1e = stride_h * dye + dilation_h * u1; - int y1f = stride_h * dyf + dilation_h * u1; - - const signed char* sptr00 = img0.row(y00) + x00; - const signed char* sptr01 = img0.row(y01) + x01; - const signed char* sptr02 = img0.row(y02) + x02; - const signed char* sptr03 = img0.row(y03) + x03; - const signed char* sptr04 = img0.row(y04) + x04; - const signed char* sptr05 = img0.row(y05) + x05; - const signed char* sptr06 = img0.row(y06) + x06; - const signed char* sptr07 = img0.row(y07) + x07; - const signed char* sptr08 = img0.row(y08) + x08; - const signed char* sptr09 = img0.row(y09) + x09; - const signed char* sptr0a = img0.row(y0a) + x0a; - const signed char* sptr0b = img0.row(y0b) + x0b; - const signed char* sptr0c = img0.row(y0c) + x0c; - const signed char* sptr0d = img0.row(y0d) + x0d; - const signed char* sptr0e = img0.row(y0e) + x0e; - const signed char* sptr0f = img0.row(y0f) + x0f; - - const signed char* sptr10 = img1.row(y10) + x10; - const signed char* sptr11 = img1.row(y11) + x11; - const signed char* sptr12 = img1.row(y12) + x12; - const signed char* sptr13 = img1.row(y13) + x13; - const signed char* sptr14 = img1.row(y14) + x14; - const signed char* sptr15 = img1.row(y15) + x15; - const signed char* sptr16 = img1.row(y16) + x16; - const signed char* sptr17 = img1.row(y17) + x17; - const signed char* sptr18 = img1.row(y18) + x18; - const signed char* sptr19 = img1.row(y19) + x19; - const signed char* sptr1a = img1.row(y1a) + x1a; - const signed char* sptr1b = img1.row(y1b) + x1b; - const signed char* sptr1c = img1.row(y1c) + x1c; - const signed char* sptr1d = img1.row(y1d) + x1d; - const signed char* sptr1e = img1.row(y1e) + x1e; - const signed char* sptr1f = img1.row(y1f) + x1f; - - pp[0] = sptr00[0]; - pp[1] = sptr10[0]; - pp[2] = sptr01[0]; - pp[3] = sptr11[0]; - pp[4] = sptr02[0]; - pp[5] = sptr12[0]; - pp[6] = sptr03[0]; - pp[7] = sptr13[0]; - pp[8] = sptr04[0]; - pp[9] = sptr14[0]; - pp[10] = sptr05[0]; - pp[11] = sptr15[0]; - pp[12] = sptr06[0]; - pp[13] = sptr16[0]; - pp[14] = sptr07[0]; - pp[15] = sptr17[0]; - pp[16 + 0] = sptr08[0]; - pp[16 + 1] = sptr18[0]; - pp[16 + 2] = sptr09[0]; - pp[16 + 3] = sptr19[0]; - pp[16 + 4] = sptr0a[0]; - pp[16 + 5] = sptr1a[0]; - pp[16 + 6] = sptr0b[0]; - pp[16 + 7] = sptr1b[0]; - pp[16 + 8] = sptr0c[0]; - pp[16 + 9] = sptr1c[0]; - pp[16 + 10] = sptr0d[0]; - pp[16 + 11] = sptr1d[0]; - pp[16 + 12] = sptr0e[0]; - pp[16 + 13] = 
sptr1e[0]; - pp[16 + 14] = sptr0f[0]; - pp[16 + 15] = sptr1f[0]; - pp += 32; + for (; kk + 1 < max_kk; kk += 2) + { + int p0 = (k + kk) / maxk; + int p1 = (k + kk + 1) / maxk; + int uv0 = (k + kk) % maxk; + int uv1 = (k + kk + 1) % maxk; + int u0 = uv0 / kernel_w; + int u1 = uv1 / kernel_w; + int v0 = uv0 % kernel_w; + int v1 = uv1 % kernel_w; + + const Mat img0 = bottom_blob.channel(p0); + const Mat img1 = bottom_blob.channel(p1); + + int x00 = stride_w * dx0 + dilation_w * v0; + int y00 = stride_h * dy0 + dilation_h * u0; + int x10 = stride_w * dx0 + dilation_w * v1; + int y10 = stride_h * dy0 + dilation_h * u1; + + const signed char* sptr0 = img0.row(y00) + x00; + const signed char* sptr1 = img1.row(y10) + x10; + + if (stride_w == 1) + { + __m128i _r0 = _mm_loadu_si128((const __m128i*)sptr0); + __m128i _r1 = _mm_loadu_si128((const __m128i*)sptr1); + __m128i _tmp0 = _mm_unpacklo_epi8(_r0, _r1); + __m128i _tmp1 = _mm_unpackhi_epi8(_r0, _r1); + _mm_store_si128((__m128i*)pp, _tmp0); + _mm_store_si128((__m128i*)(pp + 16), _tmp1); + pp += 32; + } + else if (stride_w == 2) + { + __m256i _r0 = _mm256_loadu_si256((const __m256i*)sptr0); + __m256i _r1 = _mm256_loadu_si256((const __m256i*)sptr1); + __m256i _tmp0 = _mm256_unpacklo_epi8(_r0, _r1); + __m256i _tmp1 = _mm256_unpackhi_epi8(_r0, _r1); + _tmp0 = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(_tmp0, _MM_SHUFFLE(3, 1, 2, 0)), _MM_SHUFFLE(3, 1, 2, 0)); + _tmp1 = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(_tmp1, _MM_SHUFFLE(3, 1, 2, 0)), _MM_SHUFFLE(3, 1, 2, 0)); + _tmp0 = _mm256_shuffle_epi32(_tmp0, _MM_SHUFFLE(3, 1, 2, 0)); + _tmp1 = _mm256_shuffle_epi32(_tmp1, _MM_SHUFFLE(3, 1, 2, 0)); + __m256i _r01 = _mm256_unpacklo_epi64(_tmp0, _tmp1); + _mm256_storeu_si256((__m256i*)pp, _r01); + pp += 32; + } + else + { + pp[0] = sptr0[0]; + pp[1] = sptr1[0]; + pp[2] = sptr0[stride_w]; + pp[3] = sptr1[stride_w]; + pp[4] = sptr0[stride_w * 2]; + pp[5] = sptr1[stride_w * 2]; + pp[6] = sptr0[stride_w * 3]; + pp[7] = sptr1[stride_w * 3]; + pp[8] = sptr0[stride_w * 4]; + pp[9] = sptr1[stride_w * 4]; + pp[10] = sptr0[stride_w * 5]; + pp[11] = sptr1[stride_w * 5]; + pp[12] = sptr0[stride_w * 6]; + pp[13] = sptr1[stride_w * 6]; + pp[14] = sptr0[stride_w * 7]; + pp[15] = sptr1[stride_w * 7]; + pp[16 + 0] = sptr0[stride_w * 8]; + pp[16 + 1] = sptr1[stride_w * 8]; + pp[16 + 2] = sptr0[stride_w * 9]; + pp[16 + 3] = sptr1[stride_w * 9]; + pp[16 + 4] = sptr0[stride_w * 10]; + pp[16 + 5] = sptr1[stride_w * 10]; + pp[16 + 6] = sptr0[stride_w * 11]; + pp[16 + 7] = sptr1[stride_w * 11]; + pp[16 + 8] = sptr0[stride_w * 12]; + pp[16 + 9] = sptr1[stride_w * 12]; + pp[16 + 10] = sptr0[stride_w * 13]; + pp[16 + 11] = sptr1[stride_w * 13]; + pp[16 + 12] = sptr0[stride_w * 14]; + pp[16 + 13] = sptr1[stride_w * 14]; + pp[16 + 14] = sptr0[stride_w * 15]; + pp[16 + 15] = sptr1[stride_w * 15]; + pp += 32; + } + } } - } - for (; kk < max_kk / elempack; kk++) - { - int p = (k / elempack + kk) / maxk; - int uv = (k / elempack + kk) % maxk; - int u = uv / kernel_w; - int v = uv % kernel_w; + for (; kk < max_kk / elempack; kk++) + { + int p = (k / elempack + kk) / maxk; + int uv = (k / elempack + kk) % maxk; + int u = uv / kernel_w; + int v = uv % kernel_w; - const Mat img = bottom_blob.channel(p); + const Mat img = bottom_blob.channel(p); - int x0 = stride_w * dx0 + dilation_w * v; - int x1 = stride_w * dx1 + dilation_w * v; - int x2 = stride_w * dx2 + dilation_w * v; - int x3 = stride_w * dx3 + dilation_w * v; - int x4 = stride_w * dx4 + dilation_w * v; - int x5 = stride_w * dx5 + 
dilation_w * v; - int x6 = stride_w * dx6 + dilation_w * v; - int x7 = stride_w * dx7 + dilation_w * v; - int x8 = stride_w * dx8 + dilation_w * v; - int x9 = stride_w * dx9 + dilation_w * v; - int xa = stride_w * dxa + dilation_w * v; - int xb = stride_w * dxb + dilation_w * v; - int xc = stride_w * dxc + dilation_w * v; - int xd = stride_w * dxd + dilation_w * v; - int xe = stride_w * dxe + dilation_w * v; - int xf = stride_w * dxf + dilation_w * v; - int y0 = stride_h * dy0 + dilation_h * u; - int y1 = stride_h * dy1 + dilation_h * u; - int y2 = stride_h * dy2 + dilation_h * u; - int y3 = stride_h * dy3 + dilation_h * u; - int y4 = stride_h * dy4 + dilation_h * u; - int y5 = stride_h * dy5 + dilation_h * u; - int y6 = stride_h * dy6 + dilation_h * u; - int y7 = stride_h * dy7 + dilation_h * u; - int y8 = stride_h * dy8 + dilation_h * u; - int y9 = stride_h * dy9 + dilation_h * u; - int ya = stride_h * dya + dilation_h * u; - int yb = stride_h * dyb + dilation_h * u; - int yc = stride_h * dyc + dilation_h * u; - int yd = stride_h * dyd + dilation_h * u; - int ye = stride_h * dye + dilation_h * u; - int yf = stride_h * dyf + dilation_h * u; - - const signed char* sptr0 = img.row(y0) + x0 * elempack; - const signed char* sptr1 = img.row(y1) + x1 * elempack; - const signed char* sptr2 = img.row(y2) + x2 * elempack; - const signed char* sptr3 = img.row(y3) + x3 * elempack; - const signed char* sptr4 = img.row(y4) + x4 * elempack; - const signed char* sptr5 = img.row(y5) + x5 * elempack; - const signed char* sptr6 = img.row(y6) + x6 * elempack; - const signed char* sptr7 = img.row(y7) + x7 * elempack; - const signed char* sptr8 = img.row(y8) + x8 * elempack; - const signed char* sptr9 = img.row(y9) + x9 * elempack; - const signed char* sptra = img.row(ya) + xa * elempack; - const signed char* sptrb = img.row(yb) + xb * elempack; - const signed char* sptrc = img.row(yc) + xc * elempack; - const signed char* sptrd = img.row(yd) + xd * elempack; - const signed char* sptre = img.row(ye) + xe * elempack; - const signed char* sptrf = img.row(yf) + xf * elempack; + int x0 = stride_w * dx0 + dilation_w * v; + int y0 = stride_h * dy0 + dilation_h * u; - if (elempack == 8) - { - __m128i _r0 = _mm_loadl_epi64((const __m128i*)sptr0); - __m128i _r1 = _mm_loadl_epi64((const __m128i*)sptr1); - __m128i _r2 = _mm_loadl_epi64((const __m128i*)sptr2); - __m128i _r3 = _mm_loadl_epi64((const __m128i*)sptr3); - __m128i _r4 = _mm_loadl_epi64((const __m128i*)sptr4); - __m128i _r5 = _mm_loadl_epi64((const __m128i*)sptr5); - __m128i _r6 = _mm_loadl_epi64((const __m128i*)sptr6); - __m128i _r7 = _mm_loadl_epi64((const __m128i*)sptr7); - __m128i _r8 = _mm_loadl_epi64((const __m128i*)sptr8); - __m128i _r9 = _mm_loadl_epi64((const __m128i*)sptr9); - __m128i _ra = _mm_loadl_epi64((const __m128i*)sptra); - __m128i _rb = _mm_loadl_epi64((const __m128i*)sptrb); - __m128i _rc = _mm_loadl_epi64((const __m128i*)sptrc); - __m128i _rd = _mm_loadl_epi64((const __m128i*)sptrd); - __m128i _re = _mm_loadl_epi64((const __m128i*)sptre); - __m128i _rf = _mm_loadl_epi64((const __m128i*)sptrf); - __m128i _r01 = _mm_unpacklo_epi16(_r0, _r1); - __m128i _r23 = _mm_unpacklo_epi16(_r2, _r3); - __m128i _r45 = _mm_unpacklo_epi16(_r4, _r5); - __m128i _r67 = _mm_unpacklo_epi16(_r6, _r7); - __m128i _r89 = _mm_unpacklo_epi16(_r8, _r9); - __m128i _rab = _mm_unpacklo_epi16(_ra, _rb); - __m128i _rcd = _mm_unpacklo_epi16(_rc, _rd); - __m128i _ref = _mm_unpacklo_epi16(_re, _rf); - _r0 = _mm_unpacklo_epi32(_r01, _r23); - _r1 = _mm_unpackhi_epi32(_r01, _r23); 
- _r2 = _mm_unpacklo_epi32(_r45, _r67); - _r3 = _mm_unpackhi_epi32(_r45, _r67); - _r4 = _mm_unpacklo_epi32(_r89, _rab); - _r5 = _mm_unpackhi_epi32(_r89, _rab); - _r6 = _mm_unpacklo_epi32(_rcd, _ref); - _r7 = _mm_unpackhi_epi32(_rcd, _ref); - _r8 = _mm_unpacklo_epi64(_r0, _r2); - _r9 = _mm_unpacklo_epi64(_r4, _r6); - _ra = _mm_unpackhi_epi64(_r0, _r2); - _rb = _mm_unpackhi_epi64(_r4, _r6); - _rc = _mm_unpacklo_epi64(_r1, _r3); - _rd = _mm_unpacklo_epi64(_r5, _r7); - _re = _mm_unpackhi_epi64(_r1, _r3); - _rf = _mm_unpackhi_epi64(_r5, _r7); - _mm_storeu_si128((__m128i*)pp, _r8); - _mm_storeu_si128((__m128i*)(pp + 16), _r9); - _mm_storeu_si128((__m128i*)(pp + 32), _ra); - _mm_storeu_si128((__m128i*)(pp + 48), _rb); - _mm_storeu_si128((__m128i*)(pp + 64), _rc); - _mm_storeu_si128((__m128i*)(pp + 80), _rd); - _mm_storeu_si128((__m128i*)(pp + 96), _re); - _mm_storeu_si128((__m128i*)(pp + 112), _rf); - pp += 128; + const signed char* sptr = img.row(y0) + x0 * elempack; + + if (elempack == 8) + { + __m128i _r0 = _mm_loadl_epi64((const __m128i*)sptr); + __m128i _r1 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 8)); + __m128i _r2 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 16)); + __m128i _r3 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 24)); + __m128i _r4 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 32)); + __m128i _r5 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 40)); + __m128i _r6 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 48)); + __m128i _r7 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 56)); + __m128i _r8 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 64)); + __m128i _r9 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 72)); + __m128i _ra = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 80)); + __m128i _rb = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 88)); + __m128i _rc = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 96)); + __m128i _rd = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 104)); + __m128i _re = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 112)); + __m128i _rf = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 120)); + __m128i _r01 = _mm_unpacklo_epi16(_r0, _r1); + __m128i _r23 = _mm_unpacklo_epi16(_r2, _r3); + __m128i _r45 = _mm_unpacklo_epi16(_r4, _r5); + __m128i _r67 = _mm_unpacklo_epi16(_r6, _r7); + __m128i _r89 = _mm_unpacklo_epi16(_r8, _r9); + __m128i _rab = _mm_unpacklo_epi16(_ra, _rb); + __m128i _rcd = _mm_unpacklo_epi16(_rc, _rd); + __m128i _ref = _mm_unpacklo_epi16(_re, _rf); + _r0 = _mm_unpacklo_epi32(_r01, _r23); + _r1 = _mm_unpackhi_epi32(_r01, _r23); + _r2 = _mm_unpacklo_epi32(_r45, _r67); + _r3 = _mm_unpackhi_epi32(_r45, _r67); + _r4 = _mm_unpacklo_epi32(_r89, _rab); + _r5 = _mm_unpackhi_epi32(_r89, _rab); + _r6 = _mm_unpacklo_epi32(_rcd, _ref); + _r7 = _mm_unpackhi_epi32(_rcd, _ref); + _r8 = _mm_unpacklo_epi64(_r0, _r2); + _r9 = _mm_unpacklo_epi64(_r4, _r6); + _ra = _mm_unpackhi_epi64(_r0, _r2); + _rb = _mm_unpackhi_epi64(_r4, _r6); + _rc = _mm_unpacklo_epi64(_r1, _r3); + _rd = _mm_unpacklo_epi64(_r5, _r7); + _re = _mm_unpackhi_epi64(_r1, _r3); + _rf = _mm_unpackhi_epi64(_r5, _r7); + _mm_storeu_si128((__m128i*)pp, _r8); + _mm_storeu_si128((__m128i*)(pp + 16), _r9); + _mm_storeu_si128((__m128i*)(pp + 32), _ra); + _mm_storeu_si128((__m128i*)(pp + 48), _rb); + _mm_storeu_si128((__m128i*)(pp + 64), _rc); + _mm_storeu_si128((__m128i*)(pp + 80), _rd); + _mm_storeu_si128((__m128i*)(pp + 96), _re); + _mm_storeu_si128((__m128i*)(pp + 112), _rf); + pp += 128; + } + if (elempack == 1) 
+ { + pp[0] = sptr[0]; + pp[1] = sptr[stride_w]; + pp[2] = sptr[stride_w * 2]; + pp[3] = sptr[stride_w * 3]; + pp[4] = sptr[stride_w * 4]; + pp[5] = sptr[stride_w * 5]; + pp[6] = sptr[stride_w * 6]; + pp[7] = sptr[stride_w * 7]; + pp[8] = sptr[stride_w * 8]; + pp[9] = sptr[stride_w * 9]; + pp[10] = sptr[stride_w * 10]; + pp[11] = sptr[stride_w * 11]; + pp[12] = sptr[stride_w * 12]; + pp[13] = sptr[stride_w * 13]; + pp[14] = sptr[stride_w * 14]; + pp[15] = sptr[stride_w * 15]; + pp += 16; + } } + } + else + { + int kk = 0; if (elempack == 1) { - pp[0] = sptr0[0]; - pp[1] = sptr1[0]; - pp[2] = sptr2[0]; - pp[3] = sptr3[0]; - pp[4] = sptr4[0]; - pp[5] = sptr5[0]; - pp[6] = sptr6[0]; - pp[7] = sptr7[0]; - pp[8] = sptr8[0]; - pp[9] = sptr9[0]; - pp[10] = sptra[0]; - pp[11] = sptrb[0]; - pp[12] = sptrc[0]; - pp[13] = sptrd[0]; - pp[14] = sptre[0]; - pp[15] = sptrf[0]; - pp += 16; + for (; kk + 1 < max_kk; kk += 2) + { + int p0 = (k + kk) / maxk; + int p1 = (k + kk + 1) / maxk; + int uv0 = (k + kk) % maxk; + int uv1 = (k + kk + 1) % maxk; + int u0 = uv0 / kernel_w; + int u1 = uv1 / kernel_w; + int v0 = uv0 % kernel_w; + int v1 = uv1 % kernel_w; + + const Mat img0 = bottom_blob.channel(p0); + const Mat img1 = bottom_blob.channel(p1); + + int x00 = stride_w * dx0 + dilation_w * v0; + int x01 = stride_w * dx1 + dilation_w * v0; + int x02 = stride_w * dx2 + dilation_w * v0; + int x03 = stride_w * dx3 + dilation_w * v0; + int x04 = stride_w * dx4 + dilation_w * v0; + int x05 = stride_w * dx5 + dilation_w * v0; + int x06 = stride_w * dx6 + dilation_w * v0; + int x07 = stride_w * dx7 + dilation_w * v0; + int x08 = stride_w * dx8 + dilation_w * v0; + int x09 = stride_w * dx9 + dilation_w * v0; + int x0a = stride_w * dxa + dilation_w * v0; + int x0b = stride_w * dxb + dilation_w * v0; + int x0c = stride_w * dxc + dilation_w * v0; + int x0d = stride_w * dxd + dilation_w * v0; + int x0e = stride_w * dxe + dilation_w * v0; + int x0f = stride_w * dxf + dilation_w * v0; + int y00 = stride_h * dy0 + dilation_h * u0; + int y01 = stride_h * dy1 + dilation_h * u0; + int y02 = stride_h * dy2 + dilation_h * u0; + int y03 = stride_h * dy3 + dilation_h * u0; + int y04 = stride_h * dy4 + dilation_h * u0; + int y05 = stride_h * dy5 + dilation_h * u0; + int y06 = stride_h * dy6 + dilation_h * u0; + int y07 = stride_h * dy7 + dilation_h * u0; + int y08 = stride_h * dy8 + dilation_h * u0; + int y09 = stride_h * dy9 + dilation_h * u0; + int y0a = stride_h * dya + dilation_h * u0; + int y0b = stride_h * dyb + dilation_h * u0; + int y0c = stride_h * dyc + dilation_h * u0; + int y0d = stride_h * dyd + dilation_h * u0; + int y0e = stride_h * dye + dilation_h * u0; + int y0f = stride_h * dyf + dilation_h * u0; + + int x10 = stride_w * dx0 + dilation_w * v1; + int x11 = stride_w * dx1 + dilation_w * v1; + int x12 = stride_w * dx2 + dilation_w * v1; + int x13 = stride_w * dx3 + dilation_w * v1; + int x14 = stride_w * dx4 + dilation_w * v1; + int x15 = stride_w * dx5 + dilation_w * v1; + int x16 = stride_w * dx6 + dilation_w * v1; + int x17 = stride_w * dx7 + dilation_w * v1; + int x18 = stride_w * dx8 + dilation_w * v1; + int x19 = stride_w * dx9 + dilation_w * v1; + int x1a = stride_w * dxa + dilation_w * v1; + int x1b = stride_w * dxb + dilation_w * v1; + int x1c = stride_w * dxc + dilation_w * v1; + int x1d = stride_w * dxd + dilation_w * v1; + int x1e = stride_w * dxe + dilation_w * v1; + int x1f = stride_w * dxf + dilation_w * v1; + int y10 = stride_h * dy0 + dilation_h * u1; + int y11 = stride_h * dy1 + dilation_h * u1; + 
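// note: dy0 != dyf in this branch, so the 16 output positions span more than one output row and each position carries its own source y coordinate
+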
int y12 = stride_h * dy2 + dilation_h * u1; + int y13 = stride_h * dy3 + dilation_h * u1; + int y14 = stride_h * dy4 + dilation_h * u1; + int y15 = stride_h * dy5 + dilation_h * u1; + int y16 = stride_h * dy6 + dilation_h * u1; + int y17 = stride_h * dy7 + dilation_h * u1; + int y18 = stride_h * dy8 + dilation_h * u1; + int y19 = stride_h * dy9 + dilation_h * u1; + int y1a = stride_h * dya + dilation_h * u1; + int y1b = stride_h * dyb + dilation_h * u1; + int y1c = stride_h * dyc + dilation_h * u1; + int y1d = stride_h * dyd + dilation_h * u1; + int y1e = stride_h * dye + dilation_h * u1; + int y1f = stride_h * dyf + dilation_h * u1; + + const signed char* sptr00 = img0.row(y00) + x00; + const signed char* sptr01 = img0.row(y01) + x01; + const signed char* sptr02 = img0.row(y02) + x02; + const signed char* sptr03 = img0.row(y03) + x03; + const signed char* sptr04 = img0.row(y04) + x04; + const signed char* sptr05 = img0.row(y05) + x05; + const signed char* sptr06 = img0.row(y06) + x06; + const signed char* sptr07 = img0.row(y07) + x07; + const signed char* sptr08 = img0.row(y08) + x08; + const signed char* sptr09 = img0.row(y09) + x09; + const signed char* sptr0a = img0.row(y0a) + x0a; + const signed char* sptr0b = img0.row(y0b) + x0b; + const signed char* sptr0c = img0.row(y0c) + x0c; + const signed char* sptr0d = img0.row(y0d) + x0d; + const signed char* sptr0e = img0.row(y0e) + x0e; + const signed char* sptr0f = img0.row(y0f) + x0f; + + const signed char* sptr10 = img1.row(y10) + x10; + const signed char* sptr11 = img1.row(y11) + x11; + const signed char* sptr12 = img1.row(y12) + x12; + const signed char* sptr13 = img1.row(y13) + x13; + const signed char* sptr14 = img1.row(y14) + x14; + const signed char* sptr15 = img1.row(y15) + x15; + const signed char* sptr16 = img1.row(y16) + x16; + const signed char* sptr17 = img1.row(y17) + x17; + const signed char* sptr18 = img1.row(y18) + x18; + const signed char* sptr19 = img1.row(y19) + x19; + const signed char* sptr1a = img1.row(y1a) + x1a; + const signed char* sptr1b = img1.row(y1b) + x1b; + const signed char* sptr1c = img1.row(y1c) + x1c; + const signed char* sptr1d = img1.row(y1d) + x1d; + const signed char* sptr1e = img1.row(y1e) + x1e; + const signed char* sptr1f = img1.row(y1f) + x1f; + + pp[0] = sptr00[0]; + pp[1] = sptr10[0]; + pp[2] = sptr01[0]; + pp[3] = sptr11[0]; + pp[4] = sptr02[0]; + pp[5] = sptr12[0]; + pp[6] = sptr03[0]; + pp[7] = sptr13[0]; + pp[8] = sptr04[0]; + pp[9] = sptr14[0]; + pp[10] = sptr05[0]; + pp[11] = sptr15[0]; + pp[12] = sptr06[0]; + pp[13] = sptr16[0]; + pp[14] = sptr07[0]; + pp[15] = sptr17[0]; + pp[16 + 0] = sptr08[0]; + pp[16 + 1] = sptr18[0]; + pp[16 + 2] = sptr09[0]; + pp[16 + 3] = sptr19[0]; + pp[16 + 4] = sptr0a[0]; + pp[16 + 5] = sptr1a[0]; + pp[16 + 6] = sptr0b[0]; + pp[16 + 7] = sptr1b[0]; + pp[16 + 8] = sptr0c[0]; + pp[16 + 9] = sptr1c[0]; + pp[16 + 10] = sptr0d[0]; + pp[16 + 11] = sptr1d[0]; + pp[16 + 12] = sptr0e[0]; + pp[16 + 13] = sptr1e[0]; + pp[16 + 14] = sptr0f[0]; + pp[16 + 15] = sptr1f[0]; + pp += 32; + } + } + for (; kk < max_kk / elempack; kk++) + { + int p = (k / elempack + kk) / maxk; + int uv = (k / elempack + kk) % maxk; + int u = uv / kernel_w; + int v = uv % kernel_w; + + const Mat img = bottom_blob.channel(p); + + int x0 = stride_w * dx0 + dilation_w * v; + int x1 = stride_w * dx1 + dilation_w * v; + int x2 = stride_w * dx2 + dilation_w * v; + int x3 = stride_w * dx3 + dilation_w * v; + int x4 = stride_w * dx4 + dilation_w * v; + int x5 = stride_w * dx5 + dilation_w * v; + int x6 = 
stride_w * dx6 + dilation_w * v; + int x7 = stride_w * dx7 + dilation_w * v; + int x8 = stride_w * dx8 + dilation_w * v; + int x9 = stride_w * dx9 + dilation_w * v; + int xa = stride_w * dxa + dilation_w * v; + int xb = stride_w * dxb + dilation_w * v; + int xc = stride_w * dxc + dilation_w * v; + int xd = stride_w * dxd + dilation_w * v; + int xe = stride_w * dxe + dilation_w * v; + int xf = stride_w * dxf + dilation_w * v; + int y0 = stride_h * dy0 + dilation_h * u; + int y1 = stride_h * dy1 + dilation_h * u; + int y2 = stride_h * dy2 + dilation_h * u; + int y3 = stride_h * dy3 + dilation_h * u; + int y4 = stride_h * dy4 + dilation_h * u; + int y5 = stride_h * dy5 + dilation_h * u; + int y6 = stride_h * dy6 + dilation_h * u; + int y7 = stride_h * dy7 + dilation_h * u; + int y8 = stride_h * dy8 + dilation_h * u; + int y9 = stride_h * dy9 + dilation_h * u; + int ya = stride_h * dya + dilation_h * u; + int yb = stride_h * dyb + dilation_h * u; + int yc = stride_h * dyc + dilation_h * u; + int yd = stride_h * dyd + dilation_h * u; + int ye = stride_h * dye + dilation_h * u; + int yf = stride_h * dyf + dilation_h * u; + + const signed char* sptr0 = img.row(y0) + x0 * elempack; + const signed char* sptr1 = img.row(y1) + x1 * elempack; + const signed char* sptr2 = img.row(y2) + x2 * elempack; + const signed char* sptr3 = img.row(y3) + x3 * elempack; + const signed char* sptr4 = img.row(y4) + x4 * elempack; + const signed char* sptr5 = img.row(y5) + x5 * elempack; + const signed char* sptr6 = img.row(y6) + x6 * elempack; + const signed char* sptr7 = img.row(y7) + x7 * elempack; + const signed char* sptr8 = img.row(y8) + x8 * elempack; + const signed char* sptr9 = img.row(y9) + x9 * elempack; + const signed char* sptra = img.row(ya) + xa * elempack; + const signed char* sptrb = img.row(yb) + xb * elempack; + const signed char* sptrc = img.row(yc) + xc * elempack; + const signed char* sptrd = img.row(yd) + xd * elempack; + const signed char* sptre = img.row(ye) + xe * elempack; + const signed char* sptrf = img.row(yf) + xf * elempack; + + if (elempack == 8) + { + __m128i _r0 = _mm_loadl_epi64((const __m128i*)sptr0); + __m128i _r1 = _mm_loadl_epi64((const __m128i*)sptr1); + __m128i _r2 = _mm_loadl_epi64((const __m128i*)sptr2); + __m128i _r3 = _mm_loadl_epi64((const __m128i*)sptr3); + __m128i _r4 = _mm_loadl_epi64((const __m128i*)sptr4); + __m128i _r5 = _mm_loadl_epi64((const __m128i*)sptr5); + __m128i _r6 = _mm_loadl_epi64((const __m128i*)sptr6); + __m128i _r7 = _mm_loadl_epi64((const __m128i*)sptr7); + __m128i _r8 = _mm_loadl_epi64((const __m128i*)sptr8); + __m128i _r9 = _mm_loadl_epi64((const __m128i*)sptr9); + __m128i _ra = _mm_loadl_epi64((const __m128i*)sptra); + __m128i _rb = _mm_loadl_epi64((const __m128i*)sptrb); + __m128i _rc = _mm_loadl_epi64((const __m128i*)sptrc); + __m128i _rd = _mm_loadl_epi64((const __m128i*)sptrd); + __m128i _re = _mm_loadl_epi64((const __m128i*)sptre); + __m128i _rf = _mm_loadl_epi64((const __m128i*)sptrf); + __m128i _r01 = _mm_unpacklo_epi16(_r0, _r1); + __m128i _r23 = _mm_unpacklo_epi16(_r2, _r3); + __m128i _r45 = _mm_unpacklo_epi16(_r4, _r5); + __m128i _r67 = _mm_unpacklo_epi16(_r6, _r7); + __m128i _r89 = _mm_unpacklo_epi16(_r8, _r9); + __m128i _rab = _mm_unpacklo_epi16(_ra, _rb); + __m128i _rcd = _mm_unpacklo_epi16(_rc, _rd); + __m128i _ref = _mm_unpacklo_epi16(_re, _rf); + _r0 = _mm_unpacklo_epi32(_r01, _r23); + _r1 = _mm_unpackhi_epi32(_r01, _r23); + _r2 = _mm_unpacklo_epi32(_r45, _r67); + _r3 = _mm_unpackhi_epi32(_r45, _r67); + _r4 = _mm_unpacklo_epi32(_r89, 
_rab); + _r5 = _mm_unpackhi_epi32(_r89, _rab); + _r6 = _mm_unpacklo_epi32(_rcd, _ref); + _r7 = _mm_unpackhi_epi32(_rcd, _ref); + _r8 = _mm_unpacklo_epi64(_r0, _r2); + _r9 = _mm_unpacklo_epi64(_r4, _r6); + _ra = _mm_unpackhi_epi64(_r0, _r2); + _rb = _mm_unpackhi_epi64(_r4, _r6); + _rc = _mm_unpacklo_epi64(_r1, _r3); + _rd = _mm_unpacklo_epi64(_r5, _r7); + _re = _mm_unpackhi_epi64(_r1, _r3); + _rf = _mm_unpackhi_epi64(_r5, _r7); + _mm_storeu_si128((__m128i*)pp, _r8); + _mm_storeu_si128((__m128i*)(pp + 16), _r9); + _mm_storeu_si128((__m128i*)(pp + 32), _ra); + _mm_storeu_si128((__m128i*)(pp + 48), _rb); + _mm_storeu_si128((__m128i*)(pp + 64), _rc); + _mm_storeu_si128((__m128i*)(pp + 80), _rd); + _mm_storeu_si128((__m128i*)(pp + 96), _re); + _mm_storeu_si128((__m128i*)(pp + 112), _rf); + pp += 128; + } + if (elempack == 1) + { + pp[0] = sptr0[0]; + pp[1] = sptr1[0]; + pp[2] = sptr2[0]; + pp[3] = sptr3[0]; + pp[4] = sptr4[0]; + pp[5] = sptr5[0]; + pp[6] = sptr6[0]; + pp[7] = sptr7[0]; + pp[8] = sptr8[0]; + pp[9] = sptr9[0]; + pp[10] = sptra[0]; + pp[11] = sptrb[0]; + pp[12] = sptrc[0]; + pp[13] = sptrd[0]; + pp[14] = sptre[0]; + pp[15] = sptrf[0]; + pp += 16; + } } } } @@ -6510,168 +6689,298 @@ static void convolution_im2col_input_tile_int8(const Mat& bottom_blob, Mat& B, i int dx6 = (j + jj + 6) % outw; int dx7 = (j + jj + 7) % outw; - int kk = 0; - if (elempack == 1) + if (dy0 == dy7) { - for (; kk + 1 < max_kk; kk += 2) + int kk = 0; + if (elempack == 1) { - int p0 = (k + kk) / maxk; - int p1 = (k + kk + 1) / maxk; - int uv0 = (k + kk) % maxk; - int uv1 = (k + kk + 1) % maxk; - int u0 = uv0 / kernel_w; - int u1 = uv1 / kernel_w; - int v0 = uv0 % kernel_w; - int v1 = uv1 % kernel_w; - - const Mat img0 = bottom_blob.channel(p0); - const Mat img1 = bottom_blob.channel(p1); - - int x00 = stride_w * dx0 + dilation_w * v0; - int x01 = stride_w * dx1 + dilation_w * v0; - int x02 = stride_w * dx2 + dilation_w * v0; - int x03 = stride_w * dx3 + dilation_w * v0; - int x04 = stride_w * dx4 + dilation_w * v0; - int x05 = stride_w * dx5 + dilation_w * v0; - int x06 = stride_w * dx6 + dilation_w * v0; - int x07 = stride_w * dx7 + dilation_w * v0; - int y00 = stride_h * dy0 + dilation_h * u0; - int y01 = stride_h * dy1 + dilation_h * u0; - int y02 = stride_h * dy2 + dilation_h * u0; - int y03 = stride_h * dy3 + dilation_h * u0; - int y04 = stride_h * dy4 + dilation_h * u0; - int y05 = stride_h * dy5 + dilation_h * u0; - int y06 = stride_h * dy6 + dilation_h * u0; - int y07 = stride_h * dy7 + dilation_h * u0; - - int x10 = stride_w * dx0 + dilation_w * v1; - int x11 = stride_w * dx1 + dilation_w * v1; - int x12 = stride_w * dx2 + dilation_w * v1; - int x13 = stride_w * dx3 + dilation_w * v1; - int x14 = stride_w * dx4 + dilation_w * v1; - int x15 = stride_w * dx5 + dilation_w * v1; - int x16 = stride_w * dx6 + dilation_w * v1; - int x17 = stride_w * dx7 + dilation_w * v1; - int y10 = stride_h * dy0 + dilation_h * u1; - int y11 = stride_h * dy1 + dilation_h * u1; - int y12 = stride_h * dy2 + dilation_h * u1; - int y13 = stride_h * dy3 + dilation_h * u1; - int y14 = stride_h * dy4 + dilation_h * u1; - int y15 = stride_h * dy5 + dilation_h * u1; - int y16 = stride_h * dy6 + dilation_h * u1; - int y17 = stride_h * dy7 + dilation_h * u1; - - const signed char* sptr00 = img0.row(y00) + x00; - const signed char* sptr01 = img0.row(y01) + x01; - const signed char* sptr02 = img0.row(y02) + x02; - const signed char* sptr03 = img0.row(y03) + x03; - const signed char* sptr04 = img0.row(y04) + x04; - const signed char* 
sptr05 = img0.row(y05) + x05; - const signed char* sptr06 = img0.row(y06) + x06; - const signed char* sptr07 = img0.row(y07) + x07; - - const signed char* sptr10 = img1.row(y10) + x10; - const signed char* sptr11 = img1.row(y11) + x11; - const signed char* sptr12 = img1.row(y12) + x12; - const signed char* sptr13 = img1.row(y13) + x13; - const signed char* sptr14 = img1.row(y14) + x14; - const signed char* sptr15 = img1.row(y15) + x15; - const signed char* sptr16 = img1.row(y16) + x16; - const signed char* sptr17 = img1.row(y17) + x17; - - pp[0] = sptr00[0]; - pp[1] = sptr10[0]; - pp[2] = sptr01[0]; - pp[3] = sptr11[0]; - pp[4] = sptr02[0]; - pp[5] = sptr12[0]; - pp[6] = sptr03[0]; - pp[7] = sptr13[0]; - pp[8] = sptr04[0]; - pp[9] = sptr14[0]; - pp[10] = sptr05[0]; - pp[11] = sptr15[0]; - pp[12] = sptr06[0]; - pp[13] = sptr16[0]; - pp[14] = sptr07[0]; - pp[15] = sptr17[0]; - pp += 16; + for (; kk + 1 < max_kk; kk += 2) + { + int p0 = (k + kk) / maxk; + int p1 = (k + kk + 1) / maxk; + int uv0 = (k + kk) % maxk; + int uv1 = (k + kk + 1) % maxk; + int u0 = uv0 / kernel_w; + int u1 = uv1 / kernel_w; + int v0 = uv0 % kernel_w; + int v1 = uv1 % kernel_w; + + const Mat img0 = bottom_blob.channel(p0); + const Mat img1 = bottom_blob.channel(p1); + + int x00 = stride_w * dx0 + dilation_w * v0; + int y00 = stride_h * dy0 + dilation_h * u0; + int x10 = stride_w * dx0 + dilation_w * v1; + int y10 = stride_h * dy0 + dilation_h * u1; + + const signed char* sptr0 = img0.row(y00) + x00; + const signed char* sptr1 = img1.row(y10) + x10; + + if (stride_w == 1) + { + __m128i _r0 = _mm_loadl_epi64((const __m128i*)sptr0); + __m128i _r1 = _mm_loadl_epi64((const __m128i*)sptr1); + __m128i _r01 = _mm_unpacklo_epi8(_r0, _r1); + _mm_store_si128((__m128i*)pp, _r01); + pp += 16; + } + else if (stride_w == 2) + { + __m128i _r0 = _mm_loadu_si128((const __m128i*)sptr0); + __m128i _r1 = _mm_loadu_si128((const __m128i*)sptr1); + __m128i _tmp0 = _mm_unpacklo_epi8(_r0, _r1); + __m128i _tmp1 = _mm_unpackhi_epi8(_r0, _r1); + _tmp0 = _mm_shufflehi_epi16(_mm_shufflelo_epi16(_tmp0, _MM_SHUFFLE(3, 1, 2, 0)), _MM_SHUFFLE(3, 1, 2, 0)); + _tmp1 = _mm_shufflehi_epi16(_mm_shufflelo_epi16(_tmp1, _MM_SHUFFLE(3, 1, 2, 0)), _MM_SHUFFLE(3, 1, 2, 0)); + _tmp0 = _mm_shuffle_epi32(_tmp0, _MM_SHUFFLE(3, 1, 2, 0)); + _tmp1 = _mm_shuffle_epi32(_tmp1, _MM_SHUFFLE(3, 1, 2, 0)); + __m128i _r01 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(_tmp0), _mm_castsi128_ps(_tmp1), _MM_SHUFFLE(1, 0, 1, 0))); + _mm_store_si128((__m128i*)pp, _r01); + pp += 16; + } + else + { + pp[0] = sptr0[0]; + pp[1] = sptr1[0]; + pp[2] = sptr0[stride_w]; + pp[3] = sptr1[stride_w]; + pp[4] = sptr0[stride_w * 2]; + pp[5] = sptr1[stride_w * 2]; + pp[6] = sptr0[stride_w * 3]; + pp[7] = sptr1[stride_w * 3]; + pp[8] = sptr0[stride_w * 4]; + pp[9] = sptr1[stride_w * 4]; + pp[10] = sptr0[stride_w * 5]; + pp[11] = sptr1[stride_w * 5]; + pp[12] = sptr0[stride_w * 6]; + pp[13] = sptr1[stride_w * 6]; + pp[14] = sptr0[stride_w * 7]; + pp[15] = sptr1[stride_w * 7]; + pp += 16; + } + } } - } - for (; kk < max_kk / elempack; kk++) - { - int p = (k / elempack + kk) / maxk; - int uv = (k / elempack + kk) % maxk; - int u = uv / kernel_w; - int v = uv % kernel_w; + for (; kk < max_kk / elempack; kk++) + { + int p = (k / elempack + kk) / maxk; + int uv = (k / elempack + kk) % maxk; + int u = uv / kernel_w; + int v = uv % kernel_w; - const Mat img = bottom_blob.channel(p); + const Mat img = bottom_blob.channel(p); - int x0 = stride_w * dx0 + dilation_w * v; - int x1 = stride_w * dx1 + 
dilation_w * v; - int x2 = stride_w * dx2 + dilation_w * v; - int x3 = stride_w * dx3 + dilation_w * v; - int x4 = stride_w * dx4 + dilation_w * v; - int x5 = stride_w * dx5 + dilation_w * v; - int x6 = stride_w * dx6 + dilation_w * v; - int x7 = stride_w * dx7 + dilation_w * v; - int y0 = stride_h * dy0 + dilation_h * u; - int y1 = stride_h * dy1 + dilation_h * u; - int y2 = stride_h * dy2 + dilation_h * u; - int y3 = stride_h * dy3 + dilation_h * u; - int y4 = stride_h * dy4 + dilation_h * u; - int y5 = stride_h * dy5 + dilation_h * u; - int y6 = stride_h * dy6 + dilation_h * u; - int y7 = stride_h * dy7 + dilation_h * u; - - const signed char* sptr0 = img.row(y0) + x0 * elempack; - const signed char* sptr1 = img.row(y1) + x1 * elempack; - const signed char* sptr2 = img.row(y2) + x2 * elempack; - const signed char* sptr3 = img.row(y3) + x3 * elempack; - const signed char* sptr4 = img.row(y4) + x4 * elempack; - const signed char* sptr5 = img.row(y5) + x5 * elempack; - const signed char* sptr6 = img.row(y6) + x6 * elempack; - const signed char* sptr7 = img.row(y7) + x7 * elempack; + int x0 = stride_w * dx0 + dilation_w * v; + int y0 = stride_h * dy0 + dilation_h * u; - if (elempack == 8) - { - __m128i _r0 = _mm_loadl_epi64((const __m128i*)sptr0); - __m128i _r1 = _mm_loadl_epi64((const __m128i*)sptr1); - __m128i _r2 = _mm_loadl_epi64((const __m128i*)sptr2); - __m128i _r3 = _mm_loadl_epi64((const __m128i*)sptr3); - __m128i _r4 = _mm_loadl_epi64((const __m128i*)sptr4); - __m128i _r5 = _mm_loadl_epi64((const __m128i*)sptr5); - __m128i _r6 = _mm_loadl_epi64((const __m128i*)sptr6); - __m128i _r7 = _mm_loadl_epi64((const __m128i*)sptr7); - __m128i _r01 = _mm_unpacklo_epi16(_r0, _r1); - __m128i _r23 = _mm_unpacklo_epi16(_r2, _r3); - __m128i _r45 = _mm_unpacklo_epi16(_r4, _r5); - __m128i _r67 = _mm_unpacklo_epi16(_r6, _r7); - _r0 = _mm_unpacklo_epi32(_r01, _r23); - _r1 = _mm_unpackhi_epi32(_r01, _r23); - _r2 = _mm_unpacklo_epi32(_r45, _r67); - _r3 = _mm_unpackhi_epi32(_r45, _r67); - _r4 = _mm_unpacklo_epi64(_r0, _r2); - _r5 = _mm_unpackhi_epi64(_r0, _r2); - _r6 = _mm_unpacklo_epi64(_r1, _r3); - _r7 = _mm_unpackhi_epi64(_r1, _r3); - _mm_storeu_si128((__m128i*)pp, _r4); - _mm_storeu_si128((__m128i*)(pp + 16), _r5); - _mm_storeu_si128((__m128i*)(pp + 32), _r6); - _mm_storeu_si128((__m128i*)(pp + 48), _r7); - pp += 64; + const signed char* sptr = img.row(y0) + x0 * elempack; + + if (elempack == 8) + { + __m128i _r0 = _mm_loadl_epi64((const __m128i*)sptr); + __m128i _r1 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 8)); + __m128i _r2 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 16)); + __m128i _r3 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 24)); + __m128i _r4 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 32)); + __m128i _r5 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 40)); + __m128i _r6 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 48)); + __m128i _r7 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 56)); + __m128i _r01 = _mm_unpacklo_epi16(_r0, _r1); + __m128i _r23 = _mm_unpacklo_epi16(_r2, _r3); + __m128i _r45 = _mm_unpacklo_epi16(_r4, _r5); + __m128i _r67 = _mm_unpacklo_epi16(_r6, _r7); + _r0 = _mm_unpacklo_epi32(_r01, _r23); + _r1 = _mm_unpackhi_epi32(_r01, _r23); + _r2 = _mm_unpacklo_epi32(_r45, _r67); + _r3 = _mm_unpackhi_epi32(_r45, _r67); + _r4 = _mm_unpacklo_epi64(_r0, _r2); + _r5 = _mm_unpackhi_epi64(_r0, _r2); + _r6 = _mm_unpacklo_epi64(_r1, _r3); + _r7 = _mm_unpackhi_epi64(_r1, _r3); + _mm_storeu_si128((__m128i*)pp, _r4); + 
_mm_storeu_si128((__m128i*)(pp + 16), _r5); + _mm_storeu_si128((__m128i*)(pp + 32), _r6); + _mm_storeu_si128((__m128i*)(pp + 48), _r7); + pp += 64; + } + if (elempack == 1) + { + pp[0] = sptr[0]; + pp[1] = sptr[stride_w]; + pp[2] = sptr[stride_w * 2]; + pp[3] = sptr[stride_w * 3]; + pp[4] = sptr[stride_w * 4]; + pp[5] = sptr[stride_w * 5]; + pp[6] = sptr[stride_w * 6]; + pp[7] = sptr[stride_w * 7]; + pp += 8; + } } + } + else + { + int kk = 0; if (elempack == 1) { - pp[0] = sptr0[0]; - pp[1] = sptr1[0]; - pp[2] = sptr2[0]; - pp[3] = sptr3[0]; - pp[4] = sptr4[0]; - pp[5] = sptr5[0]; - pp[6] = sptr6[0]; - pp[7] = sptr7[0]; - pp += 8; + for (; kk + 1 < max_kk; kk += 2) + { + int p0 = (k + kk) / maxk; + int p1 = (k + kk + 1) / maxk; + int uv0 = (k + kk) % maxk; + int uv1 = (k + kk + 1) % maxk; + int u0 = uv0 / kernel_w; + int u1 = uv1 / kernel_w; + int v0 = uv0 % kernel_w; + int v1 = uv1 % kernel_w; + + const Mat img0 = bottom_blob.channel(p0); + const Mat img1 = bottom_blob.channel(p1); + + int x00 = stride_w * dx0 + dilation_w * v0; + int x01 = stride_w * dx1 + dilation_w * v0; + int x02 = stride_w * dx2 + dilation_w * v0; + int x03 = stride_w * dx3 + dilation_w * v0; + int x04 = stride_w * dx4 + dilation_w * v0; + int x05 = stride_w * dx5 + dilation_w * v0; + int x06 = stride_w * dx6 + dilation_w * v0; + int x07 = stride_w * dx7 + dilation_w * v0; + int y00 = stride_h * dy0 + dilation_h * u0; + int y01 = stride_h * dy1 + dilation_h * u0; + int y02 = stride_h * dy2 + dilation_h * u0; + int y03 = stride_h * dy3 + dilation_h * u0; + int y04 = stride_h * dy4 + dilation_h * u0; + int y05 = stride_h * dy5 + dilation_h * u0; + int y06 = stride_h * dy6 + dilation_h * u0; + int y07 = stride_h * dy7 + dilation_h * u0; + + int x10 = stride_w * dx0 + dilation_w * v1; + int x11 = stride_w * dx1 + dilation_w * v1; + int x12 = stride_w * dx2 + dilation_w * v1; + int x13 = stride_w * dx3 + dilation_w * v1; + int x14 = stride_w * dx4 + dilation_w * v1; + int x15 = stride_w * dx5 + dilation_w * v1; + int x16 = stride_w * dx6 + dilation_w * v1; + int x17 = stride_w * dx7 + dilation_w * v1; + int y10 = stride_h * dy0 + dilation_h * u1; + int y11 = stride_h * dy1 + dilation_h * u1; + int y12 = stride_h * dy2 + dilation_h * u1; + int y13 = stride_h * dy3 + dilation_h * u1; + int y14 = stride_h * dy4 + dilation_h * u1; + int y15 = stride_h * dy5 + dilation_h * u1; + int y16 = stride_h * dy6 + dilation_h * u1; + int y17 = stride_h * dy7 + dilation_h * u1; + + const signed char* sptr00 = img0.row(y00) + x00; + const signed char* sptr01 = img0.row(y01) + x01; + const signed char* sptr02 = img0.row(y02) + x02; + const signed char* sptr03 = img0.row(y03) + x03; + const signed char* sptr04 = img0.row(y04) + x04; + const signed char* sptr05 = img0.row(y05) + x05; + const signed char* sptr06 = img0.row(y06) + x06; + const signed char* sptr07 = img0.row(y07) + x07; + + const signed char* sptr10 = img1.row(y10) + x10; + const signed char* sptr11 = img1.row(y11) + x11; + const signed char* sptr12 = img1.row(y12) + x12; + const signed char* sptr13 = img1.row(y13) + x13; + const signed char* sptr14 = img1.row(y14) + x14; + const signed char* sptr15 = img1.row(y15) + x15; + const signed char* sptr16 = img1.row(y16) + x16; + const signed char* sptr17 = img1.row(y17) + x17; + + pp[0] = sptr00[0]; + pp[1] = sptr10[0]; + pp[2] = sptr01[0]; + pp[3] = sptr11[0]; + pp[4] = sptr02[0]; + pp[5] = sptr12[0]; + pp[6] = sptr03[0]; + pp[7] = sptr13[0]; + pp[8] = sptr04[0]; + pp[9] = sptr14[0]; + pp[10] = sptr05[0]; + pp[11] = sptr15[0]; + 
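// scalar path: interleave slices kk and kk+1 per output position (k packed in pairs, presumably to match the layout the int8 GEMM microkernel loads)
+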
pp[12] = sptr06[0]; + pp[13] = sptr16[0]; + pp[14] = sptr07[0]; + pp[15] = sptr17[0]; + pp += 16; + } + } + for (; kk < max_kk / elempack; kk++) + { + int p = (k / elempack + kk) / maxk; + int uv = (k / elempack + kk) % maxk; + int u = uv / kernel_w; + int v = uv % kernel_w; + + const Mat img = bottom_blob.channel(p); + + int x0 = stride_w * dx0 + dilation_w * v; + int x1 = stride_w * dx1 + dilation_w * v; + int x2 = stride_w * dx2 + dilation_w * v; + int x3 = stride_w * dx3 + dilation_w * v; + int x4 = stride_w * dx4 + dilation_w * v; + int x5 = stride_w * dx5 + dilation_w * v; + int x6 = stride_w * dx6 + dilation_w * v; + int x7 = stride_w * dx7 + dilation_w * v; + int y0 = stride_h * dy0 + dilation_h * u; + int y1 = stride_h * dy1 + dilation_h * u; + int y2 = stride_h * dy2 + dilation_h * u; + int y3 = stride_h * dy3 + dilation_h * u; + int y4 = stride_h * dy4 + dilation_h * u; + int y5 = stride_h * dy5 + dilation_h * u; + int y6 = stride_h * dy6 + dilation_h * u; + int y7 = stride_h * dy7 + dilation_h * u; + + const signed char* sptr0 = img.row(y0) + x0 * elempack; + const signed char* sptr1 = img.row(y1) + x1 * elempack; + const signed char* sptr2 = img.row(y2) + x2 * elempack; + const signed char* sptr3 = img.row(y3) + x3 * elempack; + const signed char* sptr4 = img.row(y4) + x4 * elempack; + const signed char* sptr5 = img.row(y5) + x5 * elempack; + const signed char* sptr6 = img.row(y6) + x6 * elempack; + const signed char* sptr7 = img.row(y7) + x7 * elempack; + + if (elempack == 8) + { + __m128i _r0 = _mm_loadl_epi64((const __m128i*)sptr0); + __m128i _r1 = _mm_loadl_epi64((const __m128i*)sptr1); + __m128i _r2 = _mm_loadl_epi64((const __m128i*)sptr2); + __m128i _r3 = _mm_loadl_epi64((const __m128i*)sptr3); + __m128i _r4 = _mm_loadl_epi64((const __m128i*)sptr4); + __m128i _r5 = _mm_loadl_epi64((const __m128i*)sptr5); + __m128i _r6 = _mm_loadl_epi64((const __m128i*)sptr6); + __m128i _r7 = _mm_loadl_epi64((const __m128i*)sptr7); + __m128i _r01 = _mm_unpacklo_epi16(_r0, _r1); + __m128i _r23 = _mm_unpacklo_epi16(_r2, _r3); + __m128i _r45 = _mm_unpacklo_epi16(_r4, _r5); + __m128i _r67 = _mm_unpacklo_epi16(_r6, _r7); + _r0 = _mm_unpacklo_epi32(_r01, _r23); + _r1 = _mm_unpackhi_epi32(_r01, _r23); + _r2 = _mm_unpacklo_epi32(_r45, _r67); + _r3 = _mm_unpackhi_epi32(_r45, _r67); + _r4 = _mm_unpacklo_epi64(_r0, _r2); + _r5 = _mm_unpackhi_epi64(_r0, _r2); + _r6 = _mm_unpacklo_epi64(_r1, _r3); + _r7 = _mm_unpackhi_epi64(_r1, _r3); + _mm_storeu_si128((__m128i*)pp, _r4); + _mm_storeu_si128((__m128i*)(pp + 16), _r5); + _mm_storeu_si128((__m128i*)(pp + 32), _r6); + _mm_storeu_si128((__m128i*)(pp + 48), _r7); + pp += 64; + } + if (elempack == 1) + { + pp[0] = sptr0[0]; + pp[1] = sptr1[0]; + pp[2] = sptr2[0]; + pp[3] = sptr3[0]; + pp[4] = sptr4[0]; + pp[5] = sptr5[0]; + pp[6] = sptr6[0]; + pp[7] = sptr7[0]; + pp += 8; + } } } } @@ -6687,106 +6996,206 @@ static void convolution_im2col_input_tile_int8(const Mat& bottom_blob, Mat& B, i int dx2 = (j + jj + 2) % outw; int dx3 = (j + jj + 3) % outw; - int kk = 0; - if (elempack == 1) + if (dy0 == dy3) { - for (; kk + 1 < max_kk; kk += 2) + int kk = 0; + if (elempack == 1) { - int p0 = (k + kk) / maxk; - int p1 = (k + kk + 1) / maxk; - int uv0 = (k + kk) % maxk; - int uv1 = (k + kk + 1) % maxk; - int u0 = uv0 / kernel_w; - int u1 = uv1 / kernel_w; - int v0 = uv0 % kernel_w; - int v1 = uv1 % kernel_w; - - const Mat img0 = bottom_blob.channel(p0); - const Mat img1 = bottom_blob.channel(p1); - - int x00 = stride_w * dx0 + dilation_w * v0; - int x01 = stride_w * 
dx1 + dilation_w * v0; - int x02 = stride_w * dx2 + dilation_w * v0; - int x03 = stride_w * dx3 + dilation_w * v0; - int y00 = stride_h * dy0 + dilation_h * u0; - int y01 = stride_h * dy1 + dilation_h * u0; - int y02 = stride_h * dy2 + dilation_h * u0; - int y03 = stride_h * dy3 + dilation_h * u0; - - int x10 = stride_w * dx0 + dilation_w * v1; - int x11 = stride_w * dx1 + dilation_w * v1; - int x12 = stride_w * dx2 + dilation_w * v1; - int x13 = stride_w * dx3 + dilation_w * v1; - int y10 = stride_h * dy0 + dilation_h * u1; - int y11 = stride_h * dy1 + dilation_h * u1; - int y12 = stride_h * dy2 + dilation_h * u1; - int y13 = stride_h * dy3 + dilation_h * u1; - - const signed char* sptr00 = img0.row(y00) + x00; - const signed char* sptr01 = img0.row(y01) + x01; - const signed char* sptr02 = img0.row(y02) + x02; - const signed char* sptr03 = img0.row(y03) + x03; - - const signed char* sptr10 = img1.row(y10) + x10; - const signed char* sptr11 = img1.row(y11) + x11; - const signed char* sptr12 = img1.row(y12) + x12; - const signed char* sptr13 = img1.row(y13) + x13; - - pp[0] = sptr00[0]; - pp[1] = sptr10[0]; - pp[2] = sptr01[0]; - pp[3] = sptr11[0]; - pp[4] = sptr02[0]; - pp[5] = sptr12[0]; - pp[6] = sptr03[0]; - pp[7] = sptr13[0]; - pp += 8; + for (; kk + 1 < max_kk; kk += 2) + { + int p0 = (k + kk) / maxk; + int p1 = (k + kk + 1) / maxk; + int uv0 = (k + kk) % maxk; + int uv1 = (k + kk + 1) % maxk; + int u0 = uv0 / kernel_w; + int u1 = uv1 / kernel_w; + int v0 = uv0 % kernel_w; + int v1 = uv1 % kernel_w; + + const Mat img0 = bottom_blob.channel(p0); + const Mat img1 = bottom_blob.channel(p1); + + int x00 = stride_w * dx0 + dilation_w * v0; + int y00 = stride_h * dy0 + dilation_h * u0; + int x10 = stride_w * dx0 + dilation_w * v1; + int y10 = stride_h * dy0 + dilation_h * u1; + + const signed char* sptr0 = img0.row(y00) + x00; + const signed char* sptr1 = img1.row(y10) + x10; + + if (stride_w == 1) + { + __m128i _r0 = _mm_loadl_epi64((const __m128i*)sptr0); + __m128i _r1 = _mm_loadl_epi64((const __m128i*)sptr1); + __m128i _r01 = _mm_unpacklo_epi8(_r0, _r1); + _mm_storel_epi64((__m128i*)pp, _r01); + pp += 8; + } + else if (stride_w == 2) + { + __m128i _r0 = _mm_loadl_epi64((const __m128i*)sptr0); + __m128i _r1 = _mm_loadl_epi64((const __m128i*)sptr1); + __m128i _r01 = _mm_unpacklo_epi8(_r0, _r1); + _r01 = _mm_shufflehi_epi16(_mm_shufflelo_epi16(_r01, _MM_SHUFFLE(3, 1, 2, 0)), _MM_SHUFFLE(3, 1, 2, 0)); + _r01 = _mm_shuffle_epi32(_r01, _MM_SHUFFLE(3, 1, 2, 0)); + _mm_storel_epi64((__m128i*)pp, _r01); + pp += 8; + } + else + { + pp[0] = sptr0[0]; + pp[1] = sptr1[0]; + pp[2] = sptr0[stride_w]; + pp[3] = sptr1[stride_w]; + pp[4] = sptr0[stride_w * 2]; + pp[5] = sptr1[stride_w * 2]; + pp[6] = sptr0[stride_w * 3]; + pp[7] = sptr1[stride_w * 3]; + pp += 8; + } + } } - } - for (; kk < max_kk / elempack; kk++) - { - int p = (k / elempack + kk) / maxk; - int uv = (k / elempack + kk) % maxk; - int u = uv / kernel_w; - int v = uv % kernel_w; + for (; kk < max_kk / elempack; kk++) + { + int p = (k / elempack + kk) / maxk; + int uv = (k / elempack + kk) % maxk; + int u = uv / kernel_w; + int v = uv % kernel_w; - const Mat img = bottom_blob.channel(p); + const Mat img = bottom_blob.channel(p); - int x0 = stride_w * dx0 + dilation_w * v; - int x1 = stride_w * dx1 + dilation_w * v; - int x2 = stride_w * dx2 + dilation_w * v; - int x3 = stride_w * dx3 + dilation_w * v; - int y0 = stride_h * dy0 + dilation_h * u; - int y1 = stride_h * dy1 + dilation_h * u; - int y2 = stride_h * dy2 + dilation_h * u; - int y3 = 
stride_h * dy3 + dilation_h * u; + int x0 = stride_w * dx0 + dilation_w * v; + int y0 = stride_h * dy0 + dilation_h * u; - const signed char* sptr0 = img.row(y0) + x0 * elempack; - const signed char* sptr1 = img.row(y1) + x1 * elempack; - const signed char* sptr2 = img.row(y2) + x2 * elempack; - const signed char* sptr3 = img.row(y3) + x3 * elempack; + const signed char* sptr = img.row(y0) + x0 * elempack; - if (elempack == 8) - { - __m128i _r0 = _mm_loadl_epi64((const __m128i*)sptr0); - __m128i _r1 = _mm_loadl_epi64((const __m128i*)sptr1); - __m128i _r2 = _mm_loadl_epi64((const __m128i*)sptr2); - __m128i _r3 = _mm_loadl_epi64((const __m128i*)sptr3); - __m128i _r01 = _mm_unpacklo_epi16(_r0, _r1); - __m128i _r23 = _mm_unpacklo_epi16(_r2, _r3); - _r0 = _mm_unpacklo_epi32(_r01, _r23); - _r1 = _mm_unpackhi_epi32(_r01, _r23); - _mm_storeu_si128((__m128i*)pp, _r0); - _mm_storeu_si128((__m128i*)(pp + 16), _r1); - pp += 32; + if (elempack == 8) + { + __m128i _r0 = _mm_loadl_epi64((const __m128i*)sptr); + __m128i _r1 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 8)); + __m128i _r2 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 16)); + __m128i _r3 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 24)); + __m128i _r01 = _mm_unpacklo_epi16(_r0, _r1); + __m128i _r23 = _mm_unpacklo_epi16(_r2, _r3); + _r0 = _mm_unpacklo_epi32(_r01, _r23); + _r1 = _mm_unpackhi_epi32(_r01, _r23); + _mm_storeu_si128((__m128i*)pp, _r0); + _mm_storeu_si128((__m128i*)(pp + 16), _r1); + pp += 32; + } + if (elempack == 1) + { + pp[0] = sptr[0]; + pp[1] = sptr[stride_w]; + pp[2] = sptr[stride_w * 2]; + pp[3] = sptr[stride_w * 3]; + pp += 4; + } } + } + else + { + int kk = 0; if (elempack == 1) { - pp[0] = sptr0[0]; - pp[1] = sptr1[0]; - pp[2] = sptr2[0]; - pp[3] = sptr3[0]; - pp += 4; + for (; kk + 1 < max_kk; kk += 2) + { + int p0 = (k + kk) / maxk; + int p1 = (k + kk + 1) / maxk; + int uv0 = (k + kk) % maxk; + int uv1 = (k + kk + 1) % maxk; + int u0 = uv0 / kernel_w; + int u1 = uv1 / kernel_w; + int v0 = uv0 % kernel_w; + int v1 = uv1 % kernel_w; + + const Mat img0 = bottom_blob.channel(p0); + const Mat img1 = bottom_blob.channel(p1); + + int x00 = stride_w * dx0 + dilation_w * v0; + int x01 = stride_w * dx1 + dilation_w * v0; + int x02 = stride_w * dx2 + dilation_w * v0; + int x03 = stride_w * dx3 + dilation_w * v0; + int y00 = stride_h * dy0 + dilation_h * u0; + int y01 = stride_h * dy1 + dilation_h * u0; + int y02 = stride_h * dy2 + dilation_h * u0; + int y03 = stride_h * dy3 + dilation_h * u0; + + int x10 = stride_w * dx0 + dilation_w * v1; + int x11 = stride_w * dx1 + dilation_w * v1; + int x12 = stride_w * dx2 + dilation_w * v1; + int x13 = stride_w * dx3 + dilation_w * v1; + int y10 = stride_h * dy0 + dilation_h * u1; + int y11 = stride_h * dy1 + dilation_h * u1; + int y12 = stride_h * dy2 + dilation_h * u1; + int y13 = stride_h * dy3 + dilation_h * u1; + + const signed char* sptr00 = img0.row(y00) + x00; + const signed char* sptr01 = img0.row(y01) + x01; + const signed char* sptr02 = img0.row(y02) + x02; + const signed char* sptr03 = img0.row(y03) + x03; + + const signed char* sptr10 = img1.row(y10) + x10; + const signed char* sptr11 = img1.row(y11) + x11; + const signed char* sptr12 = img1.row(y12) + x12; + const signed char* sptr13 = img1.row(y13) + x13; + + pp[0] = sptr00[0]; + pp[1] = sptr10[0]; + pp[2] = sptr01[0]; + pp[3] = sptr11[0]; + pp[4] = sptr02[0]; + pp[5] = sptr12[0]; + pp[6] = sptr03[0]; + pp[7] = sptr13[0]; + pp += 8; + } + } + for (; kk < max_kk / elempack; kk++) + { + int p = (k / 
elempack + kk) / maxk; + int uv = (k / elempack + kk) % maxk; + int u = uv / kernel_w; + int v = uv % kernel_w; + + const Mat img = bottom_blob.channel(p); + + int x0 = stride_w * dx0 + dilation_w * v; + int x1 = stride_w * dx1 + dilation_w * v; + int x2 = stride_w * dx2 + dilation_w * v; + int x3 = stride_w * dx3 + dilation_w * v; + int y0 = stride_h * dy0 + dilation_h * u; + int y1 = stride_h * dy1 + dilation_h * u; + int y2 = stride_h * dy2 + dilation_h * u; + int y3 = stride_h * dy3 + dilation_h * u; + + const signed char* sptr0 = img.row(y0) + x0 * elempack; + const signed char* sptr1 = img.row(y1) + x1 * elempack; + const signed char* sptr2 = img.row(y2) + x2 * elempack; + const signed char* sptr3 = img.row(y3) + x3 * elempack; + + if (elempack == 8) + { + __m128i _r0 = _mm_loadl_epi64((const __m128i*)sptr0); + __m128i _r1 = _mm_loadl_epi64((const __m128i*)sptr1); + __m128i _r2 = _mm_loadl_epi64((const __m128i*)sptr2); + __m128i _r3 = _mm_loadl_epi64((const __m128i*)sptr3); + __m128i _r01 = _mm_unpacklo_epi16(_r0, _r1); + __m128i _r23 = _mm_unpacklo_epi16(_r2, _r3); + _r0 = _mm_unpacklo_epi32(_r01, _r23); + _r1 = _mm_unpackhi_epi32(_r01, _r23); + _mm_storeu_si128((__m128i*)pp, _r0); + _mm_storeu_si128((__m128i*)(pp + 16), _r1); + pp += 32; + } + if (elempack == 1) + { + pp[0] = sptr0[0]; + pp[1] = sptr1[0]; + pp[2] = sptr2[0]; + pp[3] = sptr3[0]; + pp += 4; + } } } } @@ -6798,44 +7207,154 @@ static void convolution_im2col_input_tile_int8(const Mat& bottom_blob, Mat& B, i int dx0 = (j + jj) % outw; int dx1 = (j + jj + 1) % outw; - int kk = 0; - if (elempack == 1) + if (dy0 == dy1) { - for (; kk + 1 < max_kk; kk += 2) + int kk = 0; + if (elempack == 1) { - int p0 = (k + kk) / maxk; - int p1 = (k + kk + 1) / maxk; - int uv0 = (k + kk) % maxk; - int uv1 = (k + kk + 1) % maxk; - int u0 = uv0 / kernel_w; - int u1 = uv1 / kernel_w; - int v0 = uv0 % kernel_w; - int v1 = uv1 % kernel_w; - - const Mat img0 = bottom_blob.channel(p0); - const Mat img1 = bottom_blob.channel(p1); - - int x00 = stride_w * dx0 + dilation_w * v0; - int x01 = stride_w * dx1 + dilation_w * v0; - int y00 = stride_h * dy0 + dilation_h * u0; - int y01 = stride_h * dy1 + dilation_h * u0; - int x10 = stride_w * dx0 + dilation_w * v1; - int x11 = stride_w * dx1 + dilation_w * v1; - int y10 = stride_h * dy0 + dilation_h * u1; - int y11 = stride_h * dy1 + dilation_h * u1; - - const signed char* sptr00 = img0.row(y00) + x00; - const signed char* sptr01 = img0.row(y01) + x01; - const signed char* sptr10 = img1.row(y10) + x10; - const signed char* sptr11 = img1.row(y11) + x11; - - pp[0] = sptr00[0]; - pp[1] = sptr10[0]; - pp[2] = sptr01[0]; - pp[3] = sptr11[0]; - pp += 4; + for (; kk + 1 < max_kk; kk += 2) + { + int p0 = (k + kk) / maxk; + int p1 = (k + kk + 1) / maxk; + int uv0 = (k + kk) % maxk; + int uv1 = (k + kk + 1) % maxk; + int u0 = uv0 / kernel_w; + int u1 = uv1 / kernel_w; + int v0 = uv0 % kernel_w; + int v1 = uv1 % kernel_w; + + const Mat img0 = bottom_blob.channel(p0); + const Mat img1 = bottom_blob.channel(p1); + + int x00 = stride_w * dx0 + dilation_w * v0; + int y00 = stride_h * dy0 + dilation_h * u0; + int x10 = stride_w * dx0 + dilation_w * v1; + int y10 = stride_h * dy0 + dilation_h * u1; + + const signed char* sptr0 = img0.row(y00) + x00; + const signed char* sptr1 = img1.row(y10) + x10; + + pp[0] = sptr0[0]; + pp[1] = sptr1[0]; + pp[2] = sptr0[stride_w]; + pp[3] = sptr1[stride_w]; + pp += 4; + } + } + for (; kk < max_kk / elempack; kk++) + { + int p = (k / elempack + kk) / maxk; + int uv = (k / elempack + kk) 
% maxk; + int u = uv / kernel_w; + int v = uv % kernel_w; + + const Mat img = bottom_blob.channel(p); + + int x0 = stride_w * dx0 + dilation_w * v; + int y0 = stride_h * dy0 + dilation_h * u; + + const signed char* sptr = img.row(y0) + x0 * elempack; + +#if __SSE2__ + if (elempack == 8) + { + __m128i _r0 = _mm_loadl_epi64((const __m128i*)sptr); + __m128i _r1 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 8)); + __m128i _r01 = _mm_unpacklo_epi16(_r0, _r1); + _mm_storeu_si128((__m128i*)pp, _r01); + pp += 16; + } +#endif // __SSE2__ + if (elempack == 1) + { + pp[0] = sptr[0]; + pp[1] = sptr[stride_w]; + pp += 2; + } + } + } + else + { + int kk = 0; + if (elempack == 1) + { + for (; kk + 1 < max_kk; kk += 2) + { + int p0 = (k + kk) / maxk; + int p1 = (k + kk + 1) / maxk; + int uv0 = (k + kk) % maxk; + int uv1 = (k + kk + 1) % maxk; + int u0 = uv0 / kernel_w; + int u1 = uv1 / kernel_w; + int v0 = uv0 % kernel_w; + int v1 = uv1 % kernel_w; + + const Mat img0 = bottom_blob.channel(p0); + const Mat img1 = bottom_blob.channel(p1); + + int x00 = stride_w * dx0 + dilation_w * v0; + int x01 = stride_w * dx1 + dilation_w * v0; + int y00 = stride_h * dy0 + dilation_h * u0; + int y01 = stride_h * dy1 + dilation_h * u0; + int x10 = stride_w * dx0 + dilation_w * v1; + int x11 = stride_w * dx1 + dilation_w * v1; + int y10 = stride_h * dy0 + dilation_h * u1; + int y11 = stride_h * dy1 + dilation_h * u1; + + const signed char* sptr00 = img0.row(y00) + x00; + const signed char* sptr01 = img0.row(y01) + x01; + const signed char* sptr10 = img1.row(y10) + x10; + const signed char* sptr11 = img1.row(y11) + x11; + + pp[0] = sptr00[0]; + pp[1] = sptr10[0]; + pp[2] = sptr01[0]; + pp[3] = sptr11[0]; + pp += 4; + } + } + for (; kk < max_kk / elempack; kk++) + { + int p = (k / elempack + kk) / maxk; + int uv = (k / elempack + kk) % maxk; + int u = uv / kernel_w; + int v = uv % kernel_w; + + const Mat img = bottom_blob.channel(p); + + int x0 = stride_w * dx0 + dilation_w * v; + int x1 = stride_w * dx1 + dilation_w * v; + int y0 = stride_h * dy0 + dilation_h * u; + int y1 = stride_h * dy1 + dilation_h * u; + + const signed char* sptr0 = img.row(y0) + x0 * elempack; + const signed char* sptr1 = img.row(y1) + x1 * elempack; + +#if __SSE2__ + if (elempack == 8) + { + __m128i _r0 = _mm_loadl_epi64((const __m128i*)sptr0); + __m128i _r1 = _mm_loadl_epi64((const __m128i*)sptr1); + __m128i _r01 = _mm_unpacklo_epi16(_r0, _r1); + _mm_storeu_si128((__m128i*)pp, _r01); + pp += 16; + } +#endif // __SSE2__ + if (elempack == 1) + { + pp[0] = sptr0[0]; + pp[1] = sptr1[0]; + pp += 2; + } } } + } + for (; jj < max_jj; jj++) + { + int dy = (j + jj) / outw; + int dx = (j + jj) % outw; + + int kk = 0; for (; kk < max_kk / elempack; kk++) { int p = (k / elempack + kk) / maxk; @@ -6845,29 +7364,1309 @@ static void convolution_im2col_input_tile_int8(const Mat& bottom_blob, Mat& B, i const Mat img = bottom_blob.channel(p); - int x0 = stride_w * dx0 + dilation_w * v; - int x1 = stride_w * dx1 + dilation_w * v; - int y0 = stride_h * dy0 + dilation_h * u; - int y1 = stride_h * dy1 + dilation_h * u; + int x = stride_w * dx + dilation_w * v; + int y = stride_h * dy + dilation_h * u; - const signed char* sptr0 = img.row(y0) + x0 * elempack; - const signed char* sptr1 = img.row(y1) + x1 * elempack; + const signed char* sptr = img.row(y) + x * elempack; #if __SSE2__ if (elempack == 8) { - __m128i _r0 = _mm_loadl_epi64((const __m128i*)sptr0); - __m128i _r1 = _mm_loadl_epi64((const __m128i*)sptr1); - __m128i _r01 = _mm_unpacklo_epi16(_r0, _r1); - 
_mm_storeu_si128((__m128i*)pp, _r01); - pp += 16; + _mm_storel_epi64((__m128i*)pp, _mm_loadl_epi64((const __m128i*)sptr)); + pp += 8; } #endif // __SSE2__ if (elempack == 1) { - pp[0] = sptr0[0]; - pp[1] = sptr1[0]; - pp += 2; + pp[0] = sptr[0]; + pp += 1; + } + } + } +} + +#if __AVX512F__ +template void convolution_im2col_input_tile_int8_avx512<1, 1, 1, 1, 2, 2>(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk); +template void convolution_im2col_input_tile_int8_avx512<3, 3, 1, 1, 1, 1>(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk); +template void convolution_im2col_input_tile_int8_avx512<3, 3, 1, 1, 2, 2>(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk); +template void convolution_im2col_input_tile_int8_avx512<5, 5, 1, 1, 1, 1>(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk); +template void convolution_im2col_input_tile_int8_avx512<5, 5, 1, 1, 2, 2>(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk); +template void convolution_im2col_input_tile_int8_avx512<7, 7, 1, 1, 2, 2>(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk); +#else // __AVX512F__ +template void convolution_im2col_input_tile_int8<1, 1, 1, 1, 2, 2>(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk); +template void convolution_im2col_input_tile_int8<3, 3, 1, 1, 1, 1>(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk); +template void convolution_im2col_input_tile_int8<3, 3, 1, 1, 2, 2>(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk); +template void convolution_im2col_input_tile_int8<5, 5, 1, 1, 1, 1>(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk); +template void convolution_im2col_input_tile_int8<5, 5, 1, 1, 2, 2>(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk); +template void convolution_im2col_input_tile_int8<7, 7, 1, 1, 2, 2>(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk); +#endif // __AVX512F__ + +static void convolution_im2col_input_tile_int8(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h) +{ + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + convolution_im2col_input_tile_conv1x1s1d1_int8(bottom_blob, B, j, max_jj, k, max_kk); + return; + } + + if (kernel_w == 1 && kernel_h == 1 && stride_w == 2 && stride_h == 2) + { +#if __AVX512F__ + convolution_im2col_input_tile_int8_avx512<1, 1, 1, 1, 2, 2>(bottom_blob, B, j, max_jj, k, max_kk); +#else // __AVX512F__ + convolution_im2col_input_tile_int8<1, 1, 1, 1, 2, 2>(bottom_blob, B, j, max_jj, k, max_kk); +#endif // __AVX512F__ + return; + } + + if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { +#if __AVX512F__ + convolution_im2col_input_tile_int8_avx512<3, 3, 1, 1, 1, 1>(bottom_blob, B, j, max_jj, k, max_kk); +#else // __AVX512F__ + convolution_im2col_input_tile_int8<3, 3, 1, 1, 1, 1>(bottom_blob, B, j, max_jj, k, max_kk); +#endif // __AVX512F__ + return; + } + + if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { +#if __AVX512F__ + convolution_im2col_input_tile_int8_avx512<3, 3, 1, 1, 2, 2>(bottom_blob, B, j, max_jj, k, max_kk); +#else // __AVX512F__ + convolution_im2col_input_tile_int8<3, 3, 1, 1, 2, 2>(bottom_blob, B, j, max_jj, k, 
max_kk); +#endif // __AVX512F__ + return; + } + + if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { +#if __AVX512F__ + convolution_im2col_input_tile_int8_avx512<5, 5, 1, 1, 1, 1>(bottom_blob, B, j, max_jj, k, max_kk); +#else // __AVX512F__ + convolution_im2col_input_tile_int8<5, 5, 1, 1, 1, 1>(bottom_blob, B, j, max_jj, k, max_kk); +#endif // __AVX512F__ + return; + } + + if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { +#if __AVX512F__ + convolution_im2col_input_tile_int8_avx512<5, 5, 1, 1, 2, 2>(bottom_blob, B, j, max_jj, k, max_kk); +#else // __AVX512F__ + convolution_im2col_input_tile_int8<5, 5, 1, 1, 2, 2>(bottom_blob, B, j, max_jj, k, max_kk); +#endif // __AVX512F__ + return; + } + + if (kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { +#if __AVX512F__ + convolution_im2col_input_tile_int8_avx512<7, 7, 1, 1, 2, 2>(bottom_blob, B, j, max_jj, k, max_kk); +#else // __AVX512F__ + convolution_im2col_input_tile_int8<7, 7, 1, 1, 2, 2>(bottom_blob, B, j, max_jj, k, max_kk); +#endif // __AVX512F__ + return; + } + + const int w = bottom_blob.w; + // const int channels = bottom_blob.c; + const int elempack = bottom_blob.elempack; + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int outw = (w - kernel_extent_w) / stride_w + 1; + + // j max_jj outw*outh split w and h + + // k max_kk pa*maxk*(inch/pa) split inch + + // k/max_kk shall be multiple of maxk + + const int maxk = kernel_w * kernel_h; + + signed char* pp = B; + + int jj = 0; +#if __SSE2__ +#if defined(__x86_64__) || defined(_M_X64) +#if __AVX512F__ + for (; jj + 15 < max_jj; jj += 16) + { + int dy0 = (j + jj) / outw; + int dy1 = (j + jj + 1) / outw; + int dy2 = (j + jj + 2) / outw; + int dy3 = (j + jj + 3) / outw; + int dy4 = (j + jj + 4) / outw; + int dy5 = (j + jj + 5) / outw; + int dy6 = (j + jj + 6) / outw; + int dy7 = (j + jj + 7) / outw; + int dy8 = (j + jj + 8) / outw; + int dy9 = (j + jj + 9) / outw; + int dya = (j + jj + 10) / outw; + int dyb = (j + jj + 11) / outw; + int dyc = (j + jj + 12) / outw; + int dyd = (j + jj + 13) / outw; + int dye = (j + jj + 14) / outw; + int dyf = (j + jj + 15) / outw; + int dx0 = (j + jj) % outw; + int dx1 = (j + jj + 1) % outw; + int dx2 = (j + jj + 2) % outw; + int dx3 = (j + jj + 3) % outw; + int dx4 = (j + jj + 4) % outw; + int dx5 = (j + jj + 5) % outw; + int dx6 = (j + jj + 6) % outw; + int dx7 = (j + jj + 7) % outw; + int dx8 = (j + jj + 8) % outw; + int dx9 = (j + jj + 9) % outw; + int dxa = (j + jj + 10) % outw; + int dxb = (j + jj + 11) % outw; + int dxc = (j + jj + 12) % outw; + int dxd = (j + jj + 13) % outw; + int dxe = (j + jj + 14) % outw; + int dxf = (j + jj + 15) % outw; + + if (dy0 == dyf) + { + int kk = 0; + if (elempack == 1) + { + for (; kk + 1 < max_kk; kk += 2) + { + int p0 = (k + kk) / maxk; + int p1 = (k + kk + 1) / maxk; + int uv0 = (k + kk) % maxk; + int uv1 = (k + kk + 1) % maxk; + int u0 = uv0 / kernel_w; + int u1 = uv1 / kernel_w; + int v0 = uv0 % kernel_w; + int v1 = uv1 % kernel_w; + + const Mat img0 = bottom_blob.channel(p0); + const Mat img1 = bottom_blob.channel(p1); + + int x00 = stride_w * dx0 + dilation_w * v0; + int y00 = stride_h * dy0 + dilation_h * u0; + int x10 = stride_w * dx0 + dilation_w * v1; + int y10 = stride_h * dy0 + dilation_h * u1; + + const signed char* sptr0 = img0.row(y00) + x00; + const signed char* sptr1 = img1.row(y10) + x10; + + if 
(stride_w == 1) + { + __m128i _r0 = _mm_loadu_si128((const __m128i*)sptr0); + __m128i _r1 = _mm_loadu_si128((const __m128i*)sptr1); + __m128i _tmp0 = _mm_unpacklo_epi8(_r0, _r1); + __m128i _tmp1 = _mm_unpackhi_epi8(_r0, _r1); + _mm_store_si128((__m128i*)pp, _tmp0); + _mm_store_si128((__m128i*)(pp + 16), _tmp1); + pp += 32; + } + else if (stride_w == 2) + { + __m256i _r0 = _mm256_loadu_si256((const __m256i*)sptr0); + __m256i _r1 = _mm256_loadu_si256((const __m256i*)sptr1); + __m256i _tmp0 = _mm256_unpacklo_epi8(_r0, _r1); + __m256i _tmp1 = _mm256_unpackhi_epi8(_r0, _r1); + _tmp0 = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(_tmp0, _MM_SHUFFLE(3, 1, 2, 0)), _MM_SHUFFLE(3, 1, 2, 0)); + _tmp1 = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(_tmp1, _MM_SHUFFLE(3, 1, 2, 0)), _MM_SHUFFLE(3, 1, 2, 0)); + _tmp0 = _mm256_shuffle_epi32(_tmp0, _MM_SHUFFLE(3, 1, 2, 0)); + _tmp1 = _mm256_shuffle_epi32(_tmp1, _MM_SHUFFLE(3, 1, 2, 0)); + __m256i _r01 = _mm256_unpacklo_epi64(_tmp0, _tmp1); + _mm256_storeu_si256((__m256i*)pp, _r01); + pp += 32; + } + else + { + pp[0] = sptr0[0]; + pp[1] = sptr1[0]; + pp[2] = sptr0[stride_w]; + pp[3] = sptr1[stride_w]; + pp[4] = sptr0[stride_w * 2]; + pp[5] = sptr1[stride_w * 2]; + pp[6] = sptr0[stride_w * 3]; + pp[7] = sptr1[stride_w * 3]; + pp[8] = sptr0[stride_w * 4]; + pp[9] = sptr1[stride_w * 4]; + pp[10] = sptr0[stride_w * 5]; + pp[11] = sptr1[stride_w * 5]; + pp[12] = sptr0[stride_w * 6]; + pp[13] = sptr1[stride_w * 6]; + pp[14] = sptr0[stride_w * 7]; + pp[15] = sptr1[stride_w * 7]; + pp[16 + 0] = sptr0[stride_w * 8]; + pp[16 + 1] = sptr1[stride_w * 8]; + pp[16 + 2] = sptr0[stride_w * 9]; + pp[16 + 3] = sptr1[stride_w * 9]; + pp[16 + 4] = sptr0[stride_w * 10]; + pp[16 + 5] = sptr1[stride_w * 10]; + pp[16 + 6] = sptr0[stride_w * 11]; + pp[16 + 7] = sptr1[stride_w * 11]; + pp[16 + 8] = sptr0[stride_w * 12]; + pp[16 + 9] = sptr1[stride_w * 12]; + pp[16 + 10] = sptr0[stride_w * 13]; + pp[16 + 11] = sptr1[stride_w * 13]; + pp[16 + 12] = sptr0[stride_w * 14]; + pp[16 + 13] = sptr1[stride_w * 14]; + pp[16 + 14] = sptr0[stride_w * 15]; + pp[16 + 15] = sptr1[stride_w * 15]; + pp += 32; + } + } + } + for (; kk < max_kk / elempack; kk++) + { + int p = (k / elempack + kk) / maxk; + int uv = (k / elempack + kk) % maxk; + int u = uv / kernel_w; + int v = uv % kernel_w; + + const Mat img = bottom_blob.channel(p); + + int x0 = stride_w * dx0 + dilation_w * v; + int y0 = stride_h * dy0 + dilation_h * u; + + const signed char* sptr = img.row(y0) + x0 * elempack; + + if (elempack == 8) + { + __m128i _r0 = _mm_loadl_epi64((const __m128i*)sptr); + __m128i _r1 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 8)); + __m128i _r2 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 16)); + __m128i _r3 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 24)); + __m128i _r4 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 32)); + __m128i _r5 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 40)); + __m128i _r6 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 48)); + __m128i _r7 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 56)); + __m128i _r8 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 64)); + __m128i _r9 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 72)); + __m128i _ra = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 80)); + __m128i _rb = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 88)); + __m128i _rc = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 96)); + __m128i _rd = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 104)); + __m128i 
_re = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 112)); + __m128i _rf = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 120)); + __m128i _r01 = _mm_unpacklo_epi16(_r0, _r1); + __m128i _r23 = _mm_unpacklo_epi16(_r2, _r3); + __m128i _r45 = _mm_unpacklo_epi16(_r4, _r5); + __m128i _r67 = _mm_unpacklo_epi16(_r6, _r7); + __m128i _r89 = _mm_unpacklo_epi16(_r8, _r9); + __m128i _rab = _mm_unpacklo_epi16(_ra, _rb); + __m128i _rcd = _mm_unpacklo_epi16(_rc, _rd); + __m128i _ref = _mm_unpacklo_epi16(_re, _rf); + _r0 = _mm_unpacklo_epi32(_r01, _r23); + _r1 = _mm_unpackhi_epi32(_r01, _r23); + _r2 = _mm_unpacklo_epi32(_r45, _r67); + _r3 = _mm_unpackhi_epi32(_r45, _r67); + _r4 = _mm_unpacklo_epi32(_r89, _rab); + _r5 = _mm_unpackhi_epi32(_r89, _rab); + _r6 = _mm_unpacklo_epi32(_rcd, _ref); + _r7 = _mm_unpackhi_epi32(_rcd, _ref); + _r8 = _mm_unpacklo_epi64(_r0, _r2); + _r9 = _mm_unpacklo_epi64(_r4, _r6); + _ra = _mm_unpackhi_epi64(_r0, _r2); + _rb = _mm_unpackhi_epi64(_r4, _r6); + _rc = _mm_unpacklo_epi64(_r1, _r3); + _rd = _mm_unpacklo_epi64(_r5, _r7); + _re = _mm_unpackhi_epi64(_r1, _r3); + _rf = _mm_unpackhi_epi64(_r5, _r7); + _mm_storeu_si128((__m128i*)pp, _r8); + _mm_storeu_si128((__m128i*)(pp + 16), _r9); + _mm_storeu_si128((__m128i*)(pp + 32), _ra); + _mm_storeu_si128((__m128i*)(pp + 48), _rb); + _mm_storeu_si128((__m128i*)(pp + 64), _rc); + _mm_storeu_si128((__m128i*)(pp + 80), _rd); + _mm_storeu_si128((__m128i*)(pp + 96), _re); + _mm_storeu_si128((__m128i*)(pp + 112), _rf); + pp += 128; + } + if (elempack == 1) + { + pp[0] = sptr[0]; + pp[1] = sptr[stride_w]; + pp[2] = sptr[stride_w * 2]; + pp[3] = sptr[stride_w * 3]; + pp[4] = sptr[stride_w * 4]; + pp[5] = sptr[stride_w * 5]; + pp[6] = sptr[stride_w * 6]; + pp[7] = sptr[stride_w * 7]; + pp[8] = sptr[stride_w * 8]; + pp[9] = sptr[stride_w * 9]; + pp[10] = sptr[stride_w * 10]; + pp[11] = sptr[stride_w * 11]; + pp[12] = sptr[stride_w * 12]; + pp[13] = sptr[stride_w * 13]; + pp[14] = sptr[stride_w * 14]; + pp[15] = sptr[stride_w * 15]; + pp += 16; + } + } + } + else + { + int kk = 0; + if (elempack == 1) + { + for (; kk + 1 < max_kk; kk += 2) + { + int p0 = (k + kk) / maxk; + int p1 = (k + kk + 1) / maxk; + int uv0 = (k + kk) % maxk; + int uv1 = (k + kk + 1) % maxk; + int u0 = uv0 / kernel_w; + int u1 = uv1 / kernel_w; + int v0 = uv0 % kernel_w; + int v1 = uv1 % kernel_w; + + const Mat img0 = bottom_blob.channel(p0); + const Mat img1 = bottom_blob.channel(p1); + + int x00 = stride_w * dx0 + dilation_w * v0; + int x01 = stride_w * dx1 + dilation_w * v0; + int x02 = stride_w * dx2 + dilation_w * v0; + int x03 = stride_w * dx3 + dilation_w * v0; + int x04 = stride_w * dx4 + dilation_w * v0; + int x05 = stride_w * dx5 + dilation_w * v0; + int x06 = stride_w * dx6 + dilation_w * v0; + int x07 = stride_w * dx7 + dilation_w * v0; + int x08 = stride_w * dx8 + dilation_w * v0; + int x09 = stride_w * dx9 + dilation_w * v0; + int x0a = stride_w * dxa + dilation_w * v0; + int x0b = stride_w * dxb + dilation_w * v0; + int x0c = stride_w * dxc + dilation_w * v0; + int x0d = stride_w * dxd + dilation_w * v0; + int x0e = stride_w * dxe + dilation_w * v0; + int x0f = stride_w * dxf + dilation_w * v0; + int y00 = stride_h * dy0 + dilation_h * u0; + int y01 = stride_h * dy1 + dilation_h * u0; + int y02 = stride_h * dy2 + dilation_h * u0; + int y03 = stride_h * dy3 + dilation_h * u0; + int y04 = stride_h * dy4 + dilation_h * u0; + int y05 = stride_h * dy5 + dilation_h * u0; + int y06 = stride_h * dy6 + dilation_h * u0; + int y07 = stride_h * dy7 + dilation_h * 
u0; + int y08 = stride_h * dy8 + dilation_h * u0; + int y09 = stride_h * dy9 + dilation_h * u0; + int y0a = stride_h * dya + dilation_h * u0; + int y0b = stride_h * dyb + dilation_h * u0; + int y0c = stride_h * dyc + dilation_h * u0; + int y0d = stride_h * dyd + dilation_h * u0; + int y0e = stride_h * dye + dilation_h * u0; + int y0f = stride_h * dyf + dilation_h * u0; + + int x10 = stride_w * dx0 + dilation_w * v1; + int x11 = stride_w * dx1 + dilation_w * v1; + int x12 = stride_w * dx2 + dilation_w * v1; + int x13 = stride_w * dx3 + dilation_w * v1; + int x14 = stride_w * dx4 + dilation_w * v1; + int x15 = stride_w * dx5 + dilation_w * v1; + int x16 = stride_w * dx6 + dilation_w * v1; + int x17 = stride_w * dx7 + dilation_w * v1; + int x18 = stride_w * dx8 + dilation_w * v1; + int x19 = stride_w * dx9 + dilation_w * v1; + int x1a = stride_w * dxa + dilation_w * v1; + int x1b = stride_w * dxb + dilation_w * v1; + int x1c = stride_w * dxc + dilation_w * v1; + int x1d = stride_w * dxd + dilation_w * v1; + int x1e = stride_w * dxe + dilation_w * v1; + int x1f = stride_w * dxf + dilation_w * v1; + int y10 = stride_h * dy0 + dilation_h * u1; + int y11 = stride_h * dy1 + dilation_h * u1; + int y12 = stride_h * dy2 + dilation_h * u1; + int y13 = stride_h * dy3 + dilation_h * u1; + int y14 = stride_h * dy4 + dilation_h * u1; + int y15 = stride_h * dy5 + dilation_h * u1; + int y16 = stride_h * dy6 + dilation_h * u1; + int y17 = stride_h * dy7 + dilation_h * u1; + int y18 = stride_h * dy8 + dilation_h * u1; + int y19 = stride_h * dy9 + dilation_h * u1; + int y1a = stride_h * dya + dilation_h * u1; + int y1b = stride_h * dyb + dilation_h * u1; + int y1c = stride_h * dyc + dilation_h * u1; + int y1d = stride_h * dyd + dilation_h * u1; + int y1e = stride_h * dye + dilation_h * u1; + int y1f = stride_h * dyf + dilation_h * u1; + + const signed char* sptr00 = img0.row(y00) + x00; + const signed char* sptr01 = img0.row(y01) + x01; + const signed char* sptr02 = img0.row(y02) + x02; + const signed char* sptr03 = img0.row(y03) + x03; + const signed char* sptr04 = img0.row(y04) + x04; + const signed char* sptr05 = img0.row(y05) + x05; + const signed char* sptr06 = img0.row(y06) + x06; + const signed char* sptr07 = img0.row(y07) + x07; + const signed char* sptr08 = img0.row(y08) + x08; + const signed char* sptr09 = img0.row(y09) + x09; + const signed char* sptr0a = img0.row(y0a) + x0a; + const signed char* sptr0b = img0.row(y0b) + x0b; + const signed char* sptr0c = img0.row(y0c) + x0c; + const signed char* sptr0d = img0.row(y0d) + x0d; + const signed char* sptr0e = img0.row(y0e) + x0e; + const signed char* sptr0f = img0.row(y0f) + x0f; + + const signed char* sptr10 = img1.row(y10) + x10; + const signed char* sptr11 = img1.row(y11) + x11; + const signed char* sptr12 = img1.row(y12) + x12; + const signed char* sptr13 = img1.row(y13) + x13; + const signed char* sptr14 = img1.row(y14) + x14; + const signed char* sptr15 = img1.row(y15) + x15; + const signed char* sptr16 = img1.row(y16) + x16; + const signed char* sptr17 = img1.row(y17) + x17; + const signed char* sptr18 = img1.row(y18) + x18; + const signed char* sptr19 = img1.row(y19) + x19; + const signed char* sptr1a = img1.row(y1a) + x1a; + const signed char* sptr1b = img1.row(y1b) + x1b; + const signed char* sptr1c = img1.row(y1c) + x1c; + const signed char* sptr1d = img1.row(y1d) + x1d; + const signed char* sptr1e = img1.row(y1e) + x1e; + const signed char* sptr1f = img1.row(y1f) + x1f; + + pp[0] = sptr00[0]; + pp[1] = sptr10[0]; + pp[2] = sptr01[0]; + pp[3] 
= sptr11[0]; + pp[4] = sptr02[0]; + pp[5] = sptr12[0]; + pp[6] = sptr03[0]; + pp[7] = sptr13[0]; + pp[8] = sptr04[0]; + pp[9] = sptr14[0]; + pp[10] = sptr05[0]; + pp[11] = sptr15[0]; + pp[12] = sptr06[0]; + pp[13] = sptr16[0]; + pp[14] = sptr07[0]; + pp[15] = sptr17[0]; + pp[16 + 0] = sptr08[0]; + pp[16 + 1] = sptr18[0]; + pp[16 + 2] = sptr09[0]; + pp[16 + 3] = sptr19[0]; + pp[16 + 4] = sptr0a[0]; + pp[16 + 5] = sptr1a[0]; + pp[16 + 6] = sptr0b[0]; + pp[16 + 7] = sptr1b[0]; + pp[16 + 8] = sptr0c[0]; + pp[16 + 9] = sptr1c[0]; + pp[16 + 10] = sptr0d[0]; + pp[16 + 11] = sptr1d[0]; + pp[16 + 12] = sptr0e[0]; + pp[16 + 13] = sptr1e[0]; + pp[16 + 14] = sptr0f[0]; + pp[16 + 15] = sptr1f[0]; + pp += 32; + } + } + for (; kk < max_kk / elempack; kk++) + { + int p = (k / elempack + kk) / maxk; + int uv = (k / elempack + kk) % maxk; + int u = uv / kernel_w; + int v = uv % kernel_w; + + const Mat img = bottom_blob.channel(p); + + int x0 = stride_w * dx0 + dilation_w * v; + int x1 = stride_w * dx1 + dilation_w * v; + int x2 = stride_w * dx2 + dilation_w * v; + int x3 = stride_w * dx3 + dilation_w * v; + int x4 = stride_w * dx4 + dilation_w * v; + int x5 = stride_w * dx5 + dilation_w * v; + int x6 = stride_w * dx6 + dilation_w * v; + int x7 = stride_w * dx7 + dilation_w * v; + int x8 = stride_w * dx8 + dilation_w * v; + int x9 = stride_w * dx9 + dilation_w * v; + int xa = stride_w * dxa + dilation_w * v; + int xb = stride_w * dxb + dilation_w * v; + int xc = stride_w * dxc + dilation_w * v; + int xd = stride_w * dxd + dilation_w * v; + int xe = stride_w * dxe + dilation_w * v; + int xf = stride_w * dxf + dilation_w * v; + int y0 = stride_h * dy0 + dilation_h * u; + int y1 = stride_h * dy1 + dilation_h * u; + int y2 = stride_h * dy2 + dilation_h * u; + int y3 = stride_h * dy3 + dilation_h * u; + int y4 = stride_h * dy4 + dilation_h * u; + int y5 = stride_h * dy5 + dilation_h * u; + int y6 = stride_h * dy6 + dilation_h * u; + int y7 = stride_h * dy7 + dilation_h * u; + int y8 = stride_h * dy8 + dilation_h * u; + int y9 = stride_h * dy9 + dilation_h * u; + int ya = stride_h * dya + dilation_h * u; + int yb = stride_h * dyb + dilation_h * u; + int yc = stride_h * dyc + dilation_h * u; + int yd = stride_h * dyd + dilation_h * u; + int ye = stride_h * dye + dilation_h * u; + int yf = stride_h * dyf + dilation_h * u; + + const signed char* sptr0 = img.row(y0) + x0 * elempack; + const signed char* sptr1 = img.row(y1) + x1 * elempack; + const signed char* sptr2 = img.row(y2) + x2 * elempack; + const signed char* sptr3 = img.row(y3) + x3 * elempack; + const signed char* sptr4 = img.row(y4) + x4 * elempack; + const signed char* sptr5 = img.row(y5) + x5 * elempack; + const signed char* sptr6 = img.row(y6) + x6 * elempack; + const signed char* sptr7 = img.row(y7) + x7 * elempack; + const signed char* sptr8 = img.row(y8) + x8 * elempack; + const signed char* sptr9 = img.row(y9) + x9 * elempack; + const signed char* sptra = img.row(ya) + xa * elempack; + const signed char* sptrb = img.row(yb) + xb * elempack; + const signed char* sptrc = img.row(yc) + xc * elempack; + const signed char* sptrd = img.row(yd) + xd * elempack; + const signed char* sptre = img.row(ye) + xe * elempack; + const signed char* sptrf = img.row(yf) + xf * elempack; + + if (elempack == 8) + { + __m128i _r0 = _mm_loadl_epi64((const __m128i*)sptr0); + __m128i _r1 = _mm_loadl_epi64((const __m128i*)sptr1); + __m128i _r2 = _mm_loadl_epi64((const __m128i*)sptr2); + __m128i _r3 = _mm_loadl_epi64((const __m128i*)sptr3); + __m128i _r4 = 
_mm_loadl_epi64((const __m128i*)sptr4); + __m128i _r5 = _mm_loadl_epi64((const __m128i*)sptr5); + __m128i _r6 = _mm_loadl_epi64((const __m128i*)sptr6); + __m128i _r7 = _mm_loadl_epi64((const __m128i*)sptr7); + __m128i _r8 = _mm_loadl_epi64((const __m128i*)sptr8); + __m128i _r9 = _mm_loadl_epi64((const __m128i*)sptr9); + __m128i _ra = _mm_loadl_epi64((const __m128i*)sptra); + __m128i _rb = _mm_loadl_epi64((const __m128i*)sptrb); + __m128i _rc = _mm_loadl_epi64((const __m128i*)sptrc); + __m128i _rd = _mm_loadl_epi64((const __m128i*)sptrd); + __m128i _re = _mm_loadl_epi64((const __m128i*)sptre); + __m128i _rf = _mm_loadl_epi64((const __m128i*)sptrf); + __m128i _r01 = _mm_unpacklo_epi16(_r0, _r1); + __m128i _r23 = _mm_unpacklo_epi16(_r2, _r3); + __m128i _r45 = _mm_unpacklo_epi16(_r4, _r5); + __m128i _r67 = _mm_unpacklo_epi16(_r6, _r7); + __m128i _r89 = _mm_unpacklo_epi16(_r8, _r9); + __m128i _rab = _mm_unpacklo_epi16(_ra, _rb); + __m128i _rcd = _mm_unpacklo_epi16(_rc, _rd); + __m128i _ref = _mm_unpacklo_epi16(_re, _rf); + _r0 = _mm_unpacklo_epi32(_r01, _r23); + _r1 = _mm_unpackhi_epi32(_r01, _r23); + _r2 = _mm_unpacklo_epi32(_r45, _r67); + _r3 = _mm_unpackhi_epi32(_r45, _r67); + _r4 = _mm_unpacklo_epi32(_r89, _rab); + _r5 = _mm_unpackhi_epi32(_r89, _rab); + _r6 = _mm_unpacklo_epi32(_rcd, _ref); + _r7 = _mm_unpackhi_epi32(_rcd, _ref); + _r8 = _mm_unpacklo_epi64(_r0, _r2); + _r9 = _mm_unpacklo_epi64(_r4, _r6); + _ra = _mm_unpackhi_epi64(_r0, _r2); + _rb = _mm_unpackhi_epi64(_r4, _r6); + _rc = _mm_unpacklo_epi64(_r1, _r3); + _rd = _mm_unpacklo_epi64(_r5, _r7); + _re = _mm_unpackhi_epi64(_r1, _r3); + _rf = _mm_unpackhi_epi64(_r5, _r7); + _mm_storeu_si128((__m128i*)pp, _r8); + _mm_storeu_si128((__m128i*)(pp + 16), _r9); + _mm_storeu_si128((__m128i*)(pp + 32), _ra); + _mm_storeu_si128((__m128i*)(pp + 48), _rb); + _mm_storeu_si128((__m128i*)(pp + 64), _rc); + _mm_storeu_si128((__m128i*)(pp + 80), _rd); + _mm_storeu_si128((__m128i*)(pp + 96), _re); + _mm_storeu_si128((__m128i*)(pp + 112), _rf); + pp += 128; + } + if (elempack == 1) + { + pp[0] = sptr0[0]; + pp[1] = sptr1[0]; + pp[2] = sptr2[0]; + pp[3] = sptr3[0]; + pp[4] = sptr4[0]; + pp[5] = sptr5[0]; + pp[6] = sptr6[0]; + pp[7] = sptr7[0]; + pp[8] = sptr8[0]; + pp[9] = sptr9[0]; + pp[10] = sptra[0]; + pp[11] = sptrb[0]; + pp[12] = sptrc[0]; + pp[13] = sptrd[0]; + pp[14] = sptre[0]; + pp[15] = sptrf[0]; + pp += 16; + } + } + } + } +#endif // __AVX512F__ + for (; jj + 7 < max_jj; jj += 8) + { + int dy0 = (j + jj) / outw; + int dy1 = (j + jj + 1) / outw; + int dy2 = (j + jj + 2) / outw; + int dy3 = (j + jj + 3) / outw; + int dy4 = (j + jj + 4) / outw; + int dy5 = (j + jj + 5) / outw; + int dy6 = (j + jj + 6) / outw; + int dy7 = (j + jj + 7) / outw; + int dx0 = (j + jj) % outw; + int dx1 = (j + jj + 1) % outw; + int dx2 = (j + jj + 2) % outw; + int dx3 = (j + jj + 3) % outw; + int dx4 = (j + jj + 4) % outw; + int dx5 = (j + jj + 5) % outw; + int dx6 = (j + jj + 6) % outw; + int dx7 = (j + jj + 7) % outw; + + if (dy0 == dy7) + { + int kk = 0; + if (elempack == 1) + { + for (; kk + 1 < max_kk; kk += 2) + { + int p0 = (k + kk) / maxk; + int p1 = (k + kk + 1) / maxk; + int uv0 = (k + kk) % maxk; + int uv1 = (k + kk + 1) % maxk; + int u0 = uv0 / kernel_w; + int u1 = uv1 / kernel_w; + int v0 = uv0 % kernel_w; + int v1 = uv1 % kernel_w; + + const Mat img0 = bottom_blob.channel(p0); + const Mat img1 = bottom_blob.channel(p1); + + int x00 = stride_w * dx0 + dilation_w * v0; + int y00 = stride_h * dy0 + dilation_h * u0; + int x10 = stride_w * dx0 + dilation_w * 
v1; + int y10 = stride_h * dy0 + dilation_h * u1; + + const signed char* sptr0 = img0.row(y00) + x00; + const signed char* sptr1 = img1.row(y10) + x10; + + if (stride_w == 1) + { + __m128i _r0 = _mm_loadl_epi64((const __m128i*)sptr0); + __m128i _r1 = _mm_loadl_epi64((const __m128i*)sptr1); + __m128i _r01 = _mm_unpacklo_epi8(_r0, _r1); + _mm_store_si128((__m128i*)pp, _r01); + pp += 16; + } + else if (stride_w == 2) + { + __m128i _r0 = _mm_loadu_si128((const __m128i*)sptr0); + __m128i _r1 = _mm_loadu_si128((const __m128i*)sptr1); + __m128i _tmp0 = _mm_unpacklo_epi8(_r0, _r1); + __m128i _tmp1 = _mm_unpackhi_epi8(_r0, _r1); + _tmp0 = _mm_shufflehi_epi16(_mm_shufflelo_epi16(_tmp0, _MM_SHUFFLE(3, 1, 2, 0)), _MM_SHUFFLE(3, 1, 2, 0)); + _tmp1 = _mm_shufflehi_epi16(_mm_shufflelo_epi16(_tmp1, _MM_SHUFFLE(3, 1, 2, 0)), _MM_SHUFFLE(3, 1, 2, 0)); + _tmp0 = _mm_shuffle_epi32(_tmp0, _MM_SHUFFLE(3, 1, 2, 0)); + _tmp1 = _mm_shuffle_epi32(_tmp1, _MM_SHUFFLE(3, 1, 2, 0)); + __m128i _r01 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(_tmp0), _mm_castsi128_ps(_tmp1), _MM_SHUFFLE(1, 0, 1, 0))); + _mm_store_si128((__m128i*)pp, _r01); + pp += 16; + } + else + { + pp[0] = sptr0[0]; + pp[1] = sptr1[0]; + pp[2] = sptr0[stride_w]; + pp[3] = sptr1[stride_w]; + pp[4] = sptr0[stride_w * 2]; + pp[5] = sptr1[stride_w * 2]; + pp[6] = sptr0[stride_w * 3]; + pp[7] = sptr1[stride_w * 3]; + pp[8] = sptr0[stride_w * 4]; + pp[9] = sptr1[stride_w * 4]; + pp[10] = sptr0[stride_w * 5]; + pp[11] = sptr1[stride_w * 5]; + pp[12] = sptr0[stride_w * 6]; + pp[13] = sptr1[stride_w * 6]; + pp[14] = sptr0[stride_w * 7]; + pp[15] = sptr1[stride_w * 7]; + pp += 16; + } + } + } + for (; kk < max_kk / elempack; kk++) + { + int p = (k / elempack + kk) / maxk; + int uv = (k / elempack + kk) % maxk; + int u = uv / kernel_w; + int v = uv % kernel_w; + + const Mat img = bottom_blob.channel(p); + + int x0 = stride_w * dx0 + dilation_w * v; + int y0 = stride_h * dy0 + dilation_h * u; + + const signed char* sptr = img.row(y0) + x0 * elempack; + + if (elempack == 8) + { + __m128i _r0 = _mm_loadl_epi64((const __m128i*)sptr); + __m128i _r1 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 8)); + __m128i _r2 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 16)); + __m128i _r3 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 24)); + __m128i _r4 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 32)); + __m128i _r5 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 40)); + __m128i _r6 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 48)); + __m128i _r7 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 56)); + __m128i _r01 = _mm_unpacklo_epi16(_r0, _r1); + __m128i _r23 = _mm_unpacklo_epi16(_r2, _r3); + __m128i _r45 = _mm_unpacklo_epi16(_r4, _r5); + __m128i _r67 = _mm_unpacklo_epi16(_r6, _r7); + _r0 = _mm_unpacklo_epi32(_r01, _r23); + _r1 = _mm_unpackhi_epi32(_r01, _r23); + _r2 = _mm_unpacklo_epi32(_r45, _r67); + _r3 = _mm_unpackhi_epi32(_r45, _r67); + _r4 = _mm_unpacklo_epi64(_r0, _r2); + _r5 = _mm_unpackhi_epi64(_r0, _r2); + _r6 = _mm_unpacklo_epi64(_r1, _r3); + _r7 = _mm_unpackhi_epi64(_r1, _r3); + _mm_storeu_si128((__m128i*)pp, _r4); + _mm_storeu_si128((__m128i*)(pp + 16), _r5); + _mm_storeu_si128((__m128i*)(pp + 32), _r6); + _mm_storeu_si128((__m128i*)(pp + 48), _r7); + pp += 64; + } + if (elempack == 1) + { + pp[0] = sptr[0]; + pp[1] = sptr[stride_w]; + pp[2] = sptr[stride_w * 2]; + pp[3] = sptr[stride_w * 3]; + pp[4] = sptr[stride_w * 4]; + pp[5] = sptr[stride_w * 5]; + pp[6] = sptr[stride_w * 6]; + pp[7] = sptr[stride_w * 7]; 
+ pp += 8; + } + } + } + else + { + int kk = 0; + if (elempack == 1) + { + for (; kk + 1 < max_kk; kk += 2) + { + int p0 = (k + kk) / maxk; + int p1 = (k + kk + 1) / maxk; + int uv0 = (k + kk) % maxk; + int uv1 = (k + kk + 1) % maxk; + int u0 = uv0 / kernel_w; + int u1 = uv1 / kernel_w; + int v0 = uv0 % kernel_w; + int v1 = uv1 % kernel_w; + + const Mat img0 = bottom_blob.channel(p0); + const Mat img1 = bottom_blob.channel(p1); + + int x00 = stride_w * dx0 + dilation_w * v0; + int x01 = stride_w * dx1 + dilation_w * v0; + int x02 = stride_w * dx2 + dilation_w * v0; + int x03 = stride_w * dx3 + dilation_w * v0; + int x04 = stride_w * dx4 + dilation_w * v0; + int x05 = stride_w * dx5 + dilation_w * v0; + int x06 = stride_w * dx6 + dilation_w * v0; + int x07 = stride_w * dx7 + dilation_w * v0; + int y00 = stride_h * dy0 + dilation_h * u0; + int y01 = stride_h * dy1 + dilation_h * u0; + int y02 = stride_h * dy2 + dilation_h * u0; + int y03 = stride_h * dy3 + dilation_h * u0; + int y04 = stride_h * dy4 + dilation_h * u0; + int y05 = stride_h * dy5 + dilation_h * u0; + int y06 = stride_h * dy6 + dilation_h * u0; + int y07 = stride_h * dy7 + dilation_h * u0; + + int x10 = stride_w * dx0 + dilation_w * v1; + int x11 = stride_w * dx1 + dilation_w * v1; + int x12 = stride_w * dx2 + dilation_w * v1; + int x13 = stride_w * dx3 + dilation_w * v1; + int x14 = stride_w * dx4 + dilation_w * v1; + int x15 = stride_w * dx5 + dilation_w * v1; + int x16 = stride_w * dx6 + dilation_w * v1; + int x17 = stride_w * dx7 + dilation_w * v1; + int y10 = stride_h * dy0 + dilation_h * u1; + int y11 = stride_h * dy1 + dilation_h * u1; + int y12 = stride_h * dy2 + dilation_h * u1; + int y13 = stride_h * dy3 + dilation_h * u1; + int y14 = stride_h * dy4 + dilation_h * u1; + int y15 = stride_h * dy5 + dilation_h * u1; + int y16 = stride_h * dy6 + dilation_h * u1; + int y17 = stride_h * dy7 + dilation_h * u1; + + const signed char* sptr00 = img0.row(y00) + x00; + const signed char* sptr01 = img0.row(y01) + x01; + const signed char* sptr02 = img0.row(y02) + x02; + const signed char* sptr03 = img0.row(y03) + x03; + const signed char* sptr04 = img0.row(y04) + x04; + const signed char* sptr05 = img0.row(y05) + x05; + const signed char* sptr06 = img0.row(y06) + x06; + const signed char* sptr07 = img0.row(y07) + x07; + + const signed char* sptr10 = img1.row(y10) + x10; + const signed char* sptr11 = img1.row(y11) + x11; + const signed char* sptr12 = img1.row(y12) + x12; + const signed char* sptr13 = img1.row(y13) + x13; + const signed char* sptr14 = img1.row(y14) + x14; + const signed char* sptr15 = img1.row(y15) + x15; + const signed char* sptr16 = img1.row(y16) + x16; + const signed char* sptr17 = img1.row(y17) + x17; + + pp[0] = sptr00[0]; + pp[1] = sptr10[0]; + pp[2] = sptr01[0]; + pp[3] = sptr11[0]; + pp[4] = sptr02[0]; + pp[5] = sptr12[0]; + pp[6] = sptr03[0]; + pp[7] = sptr13[0]; + pp[8] = sptr04[0]; + pp[9] = sptr14[0]; + pp[10] = sptr05[0]; + pp[11] = sptr15[0]; + pp[12] = sptr06[0]; + pp[13] = sptr16[0]; + pp[14] = sptr07[0]; + pp[15] = sptr17[0]; + pp += 16; + } + } + for (; kk < max_kk / elempack; kk++) + { + int p = (k / elempack + kk) / maxk; + int uv = (k / elempack + kk) % maxk; + int u = uv / kernel_w; + int v = uv % kernel_w; + + const Mat img = bottom_blob.channel(p); + + int x0 = stride_w * dx0 + dilation_w * v; + int x1 = stride_w * dx1 + dilation_w * v; + int x2 = stride_w * dx2 + dilation_w * v; + int x3 = stride_w * dx3 + dilation_w * v; + int x4 = stride_w * dx4 + dilation_w * v; + int x5 = stride_w * dx5 
+ dilation_w * v; + int x6 = stride_w * dx6 + dilation_w * v; + int x7 = stride_w * dx7 + dilation_w * v; + int y0 = stride_h * dy0 + dilation_h * u; + int y1 = stride_h * dy1 + dilation_h * u; + int y2 = stride_h * dy2 + dilation_h * u; + int y3 = stride_h * dy3 + dilation_h * u; + int y4 = stride_h * dy4 + dilation_h * u; + int y5 = stride_h * dy5 + dilation_h * u; + int y6 = stride_h * dy6 + dilation_h * u; + int y7 = stride_h * dy7 + dilation_h * u; + + const signed char* sptr0 = img.row(y0) + x0 * elempack; + const signed char* sptr1 = img.row(y1) + x1 * elempack; + const signed char* sptr2 = img.row(y2) + x2 * elempack; + const signed char* sptr3 = img.row(y3) + x3 * elempack; + const signed char* sptr4 = img.row(y4) + x4 * elempack; + const signed char* sptr5 = img.row(y5) + x5 * elempack; + const signed char* sptr6 = img.row(y6) + x6 * elempack; + const signed char* sptr7 = img.row(y7) + x7 * elempack; + + if (elempack == 8) + { + __m128i _r0 = _mm_loadl_epi64((const __m128i*)sptr0); + __m128i _r1 = _mm_loadl_epi64((const __m128i*)sptr1); + __m128i _r2 = _mm_loadl_epi64((const __m128i*)sptr2); + __m128i _r3 = _mm_loadl_epi64((const __m128i*)sptr3); + __m128i _r4 = _mm_loadl_epi64((const __m128i*)sptr4); + __m128i _r5 = _mm_loadl_epi64((const __m128i*)sptr5); + __m128i _r6 = _mm_loadl_epi64((const __m128i*)sptr6); + __m128i _r7 = _mm_loadl_epi64((const __m128i*)sptr7); + __m128i _r01 = _mm_unpacklo_epi16(_r0, _r1); + __m128i _r23 = _mm_unpacklo_epi16(_r2, _r3); + __m128i _r45 = _mm_unpacklo_epi16(_r4, _r5); + __m128i _r67 = _mm_unpacklo_epi16(_r6, _r7); + _r0 = _mm_unpacklo_epi32(_r01, _r23); + _r1 = _mm_unpackhi_epi32(_r01, _r23); + _r2 = _mm_unpacklo_epi32(_r45, _r67); + _r3 = _mm_unpackhi_epi32(_r45, _r67); + _r4 = _mm_unpacklo_epi64(_r0, _r2); + _r5 = _mm_unpackhi_epi64(_r0, _r2); + _r6 = _mm_unpacklo_epi64(_r1, _r3); + _r7 = _mm_unpackhi_epi64(_r1, _r3); + _mm_storeu_si128((__m128i*)pp, _r4); + _mm_storeu_si128((__m128i*)(pp + 16), _r5); + _mm_storeu_si128((__m128i*)(pp + 32), _r6); + _mm_storeu_si128((__m128i*)(pp + 48), _r7); + pp += 64; + } + if (elempack == 1) + { + pp[0] = sptr0[0]; + pp[1] = sptr1[0]; + pp[2] = sptr2[0]; + pp[3] = sptr3[0]; + pp[4] = sptr4[0]; + pp[5] = sptr5[0]; + pp[6] = sptr6[0]; + pp[7] = sptr7[0]; + pp += 8; + } + } + } + } +#endif // defined(__x86_64__) || defined(_M_X64) + for (; jj + 3 < max_jj; jj += 4) + { + int dy0 = (j + jj) / outw; + int dy1 = (j + jj + 1) / outw; + int dy2 = (j + jj + 2) / outw; + int dy3 = (j + jj + 3) / outw; + int dx0 = (j + jj) % outw; + int dx1 = (j + jj + 1) % outw; + int dx2 = (j + jj + 2) % outw; + int dx3 = (j + jj + 3) % outw; + + if (dy0 == dy3) + { + int kk = 0; + if (elempack == 1) + { + for (; kk + 1 < max_kk; kk += 2) + { + int p0 = (k + kk) / maxk; + int p1 = (k + kk + 1) / maxk; + int uv0 = (k + kk) % maxk; + int uv1 = (k + kk + 1) % maxk; + int u0 = uv0 / kernel_w; + int u1 = uv1 / kernel_w; + int v0 = uv0 % kernel_w; + int v1 = uv1 % kernel_w; + + const Mat img0 = bottom_blob.channel(p0); + const Mat img1 = bottom_blob.channel(p1); + + int x00 = stride_w * dx0 + dilation_w * v0; + int y00 = stride_h * dy0 + dilation_h * u0; + int x10 = stride_w * dx0 + dilation_w * v1; + int y10 = stride_h * dy0 + dilation_h * u1; + + const signed char* sptr0 = img0.row(y00) + x00; + const signed char* sptr1 = img1.row(y10) + x10; + + if (stride_w == 1) + { + __m128i _r0 = _mm_loadl_epi64((const __m128i*)sptr0); + __m128i _r1 = _mm_loadl_epi64((const __m128i*)sptr1); + __m128i _r01 = _mm_unpacklo_epi8(_r0, _r1); + 
_mm_storel_epi64((__m128i*)pp, _r01); + pp += 8; + } + else if (stride_w == 2) + { + __m128i _r0 = _mm_loadl_epi64((const __m128i*)sptr0); + __m128i _r1 = _mm_loadl_epi64((const __m128i*)sptr1); + __m128i _r01 = _mm_unpacklo_epi8(_r0, _r1); + _r01 = _mm_shufflehi_epi16(_mm_shufflelo_epi16(_r01, _MM_SHUFFLE(3, 1, 2, 0)), _MM_SHUFFLE(3, 1, 2, 0)); + _r01 = _mm_shuffle_epi32(_r01, _MM_SHUFFLE(3, 1, 2, 0)); + _mm_storel_epi64((__m128i*)pp, _r01); + pp += 8; + } + else + { + pp[0] = sptr0[0]; + pp[1] = sptr1[0]; + pp[2] = sptr0[stride_w]; + pp[3] = sptr1[stride_w]; + pp[4] = sptr0[stride_w * 2]; + pp[5] = sptr1[stride_w * 2]; + pp[6] = sptr0[stride_w * 3]; + pp[7] = sptr1[stride_w * 3]; + pp += 8; + } + } + } + for (; kk < max_kk / elempack; kk++) + { + int p = (k / elempack + kk) / maxk; + int uv = (k / elempack + kk) % maxk; + int u = uv / kernel_w; + int v = uv % kernel_w; + + const Mat img = bottom_blob.channel(p); + + int x0 = stride_w * dx0 + dilation_w * v; + int y0 = stride_h * dy0 + dilation_h * u; + + const signed char* sptr = img.row(y0) + x0 * elempack; + + if (elempack == 8) + { + __m128i _r0 = _mm_loadl_epi64((const __m128i*)sptr); + __m128i _r1 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 8)); + __m128i _r2 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 16)); + __m128i _r3 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 24)); + __m128i _r01 = _mm_unpacklo_epi16(_r0, _r1); + __m128i _r23 = _mm_unpacklo_epi16(_r2, _r3); + _r0 = _mm_unpacklo_epi32(_r01, _r23); + _r1 = _mm_unpackhi_epi32(_r01, _r23); + _mm_storeu_si128((__m128i*)pp, _r0); + _mm_storeu_si128((__m128i*)(pp + 16), _r1); + pp += 32; + } + if (elempack == 1) + { + pp[0] = sptr[0]; + pp[1] = sptr[stride_w]; + pp[2] = sptr[stride_w * 2]; + pp[3] = sptr[stride_w * 3]; + pp += 4; + } + } + } + else + { + int kk = 0; + if (elempack == 1) + { + for (; kk + 1 < max_kk; kk += 2) + { + int p0 = (k + kk) / maxk; + int p1 = (k + kk + 1) / maxk; + int uv0 = (k + kk) % maxk; + int uv1 = (k + kk + 1) % maxk; + int u0 = uv0 / kernel_w; + int u1 = uv1 / kernel_w; + int v0 = uv0 % kernel_w; + int v1 = uv1 % kernel_w; + + const Mat img0 = bottom_blob.channel(p0); + const Mat img1 = bottom_blob.channel(p1); + + int x00 = stride_w * dx0 + dilation_w * v0; + int x01 = stride_w * dx1 + dilation_w * v0; + int x02 = stride_w * dx2 + dilation_w * v0; + int x03 = stride_w * dx3 + dilation_w * v0; + int y00 = stride_h * dy0 + dilation_h * u0; + int y01 = stride_h * dy1 + dilation_h * u0; + int y02 = stride_h * dy2 + dilation_h * u0; + int y03 = stride_h * dy3 + dilation_h * u0; + + int x10 = stride_w * dx0 + dilation_w * v1; + int x11 = stride_w * dx1 + dilation_w * v1; + int x12 = stride_w * dx2 + dilation_w * v1; + int x13 = stride_w * dx3 + dilation_w * v1; + int y10 = stride_h * dy0 + dilation_h * u1; + int y11 = stride_h * dy1 + dilation_h * u1; + int y12 = stride_h * dy2 + dilation_h * u1; + int y13 = stride_h * dy3 + dilation_h * u1; + + const signed char* sptr00 = img0.row(y00) + x00; + const signed char* sptr01 = img0.row(y01) + x01; + const signed char* sptr02 = img0.row(y02) + x02; + const signed char* sptr03 = img0.row(y03) + x03; + + const signed char* sptr10 = img1.row(y10) + x10; + const signed char* sptr11 = img1.row(y11) + x11; + const signed char* sptr12 = img1.row(y12) + x12; + const signed char* sptr13 = img1.row(y13) + x13; + + pp[0] = sptr00[0]; + pp[1] = sptr10[0]; + pp[2] = sptr01[0]; + pp[3] = sptr11[0]; + pp[4] = sptr02[0]; + pp[5] = sptr12[0]; + pp[6] = sptr03[0]; + pp[7] = sptr13[0]; + pp += 8; + } + } 
+ for (; kk < max_kk / elempack; kk++) + { + int p = (k / elempack + kk) / maxk; + int uv = (k / elempack + kk) % maxk; + int u = uv / kernel_w; + int v = uv % kernel_w; + + const Mat img = bottom_blob.channel(p); + + int x0 = stride_w * dx0 + dilation_w * v; + int x1 = stride_w * dx1 + dilation_w * v; + int x2 = stride_w * dx2 + dilation_w * v; + int x3 = stride_w * dx3 + dilation_w * v; + int y0 = stride_h * dy0 + dilation_h * u; + int y1 = stride_h * dy1 + dilation_h * u; + int y2 = stride_h * dy2 + dilation_h * u; + int y3 = stride_h * dy3 + dilation_h * u; + + const signed char* sptr0 = img.row(y0) + x0 * elempack; + const signed char* sptr1 = img.row(y1) + x1 * elempack; + const signed char* sptr2 = img.row(y2) + x2 * elempack; + const signed char* sptr3 = img.row(y3) + x3 * elempack; + + if (elempack == 8) + { + __m128i _r0 = _mm_loadl_epi64((const __m128i*)sptr0); + __m128i _r1 = _mm_loadl_epi64((const __m128i*)sptr1); + __m128i _r2 = _mm_loadl_epi64((const __m128i*)sptr2); + __m128i _r3 = _mm_loadl_epi64((const __m128i*)sptr3); + __m128i _r01 = _mm_unpacklo_epi16(_r0, _r1); + __m128i _r23 = _mm_unpacklo_epi16(_r2, _r3); + _r0 = _mm_unpacklo_epi32(_r01, _r23); + _r1 = _mm_unpackhi_epi32(_r01, _r23); + _mm_storeu_si128((__m128i*)pp, _r0); + _mm_storeu_si128((__m128i*)(pp + 16), _r1); + pp += 32; + } + if (elempack == 1) + { + pp[0] = sptr0[0]; + pp[1] = sptr1[0]; + pp[2] = sptr2[0]; + pp[3] = sptr3[0]; + pp += 4; + } + } + } + } +#endif // __SSE2__ + for (; jj + 1 < max_jj; jj += 2) + { + int dy0 = (j + jj) / outw; + int dy1 = (j + jj + 1) / outw; + int dx0 = (j + jj) % outw; + int dx1 = (j + jj + 1) % outw; + + if (dy0 == dy1) + { + int kk = 0; + if (elempack == 1) + { + for (; kk + 1 < max_kk; kk += 2) + { + int p0 = (k + kk) / maxk; + int p1 = (k + kk + 1) / maxk; + int uv0 = (k + kk) % maxk; + int uv1 = (k + kk + 1) % maxk; + int u0 = uv0 / kernel_w; + int u1 = uv1 / kernel_w; + int v0 = uv0 % kernel_w; + int v1 = uv1 % kernel_w; + + const Mat img0 = bottom_blob.channel(p0); + const Mat img1 = bottom_blob.channel(p1); + + int x00 = stride_w * dx0 + dilation_w * v0; + int y00 = stride_h * dy0 + dilation_h * u0; + int x10 = stride_w * dx0 + dilation_w * v1; + int y10 = stride_h * dy0 + dilation_h * u1; + + const signed char* sptr0 = img0.row(y00) + x00; + const signed char* sptr1 = img1.row(y10) + x10; + + pp[0] = sptr0[0]; + pp[1] = sptr1[0]; + pp[2] = sptr0[stride_w]; + pp[3] = sptr1[stride_w]; + pp += 4; + } + } + for (; kk < max_kk / elempack; kk++) + { + int p = (k / elempack + kk) / maxk; + int uv = (k / elempack + kk) % maxk; + int u = uv / kernel_w; + int v = uv % kernel_w; + + const Mat img = bottom_blob.channel(p); + + int x0 = stride_w * dx0 + dilation_w * v; + int y0 = stride_h * dy0 + dilation_h * u; + + const signed char* sptr = img.row(y0) + x0 * elempack; + +#if __SSE2__ + if (elempack == 8) + { + __m128i _r0 = _mm_loadl_epi64((const __m128i*)sptr); + __m128i _r1 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 8)); + __m128i _r01 = _mm_unpacklo_epi16(_r0, _r1); + _mm_storeu_si128((__m128i*)pp, _r01); + pp += 16; + } +#endif // __SSE2__ + if (elempack == 1) + { + pp[0] = sptr[0]; + pp[1] = sptr[stride_w]; + pp += 2; + } + } + } + else + { + int kk = 0; + if (elempack == 1) + { + for (; kk + 1 < max_kk; kk += 2) + { + int p0 = (k + kk) / maxk; + int p1 = (k + kk + 1) / maxk; + int uv0 = (k + kk) % maxk; + int uv1 = (k + kk + 1) % maxk; + int u0 = uv0 / kernel_w; + int u1 = uv1 / kernel_w; + int v0 = uv0 % kernel_w; + int v1 = uv1 % kernel_w; + + const Mat img0 = 
bottom_blob.channel(p0); + const Mat img1 = bottom_blob.channel(p1); + + int x00 = stride_w * dx0 + dilation_w * v0; + int x01 = stride_w * dx1 + dilation_w * v0; + int y00 = stride_h * dy0 + dilation_h * u0; + int y01 = stride_h * dy1 + dilation_h * u0; + int x10 = stride_w * dx0 + dilation_w * v1; + int x11 = stride_w * dx1 + dilation_w * v1; + int y10 = stride_h * dy0 + dilation_h * u1; + int y11 = stride_h * dy1 + dilation_h * u1; + + const signed char* sptr00 = img0.row(y00) + x00; + const signed char* sptr01 = img0.row(y01) + x01; + const signed char* sptr10 = img1.row(y10) + x10; + const signed char* sptr11 = img1.row(y11) + x11; + + pp[0] = sptr00[0]; + pp[1] = sptr10[0]; + pp[2] = sptr01[0]; + pp[3] = sptr11[0]; + pp += 4; + } + } + for (; kk < max_kk / elempack; kk++) + { + int p = (k / elempack + kk) / maxk; + int uv = (k / elempack + kk) % maxk; + int u = uv / kernel_w; + int v = uv % kernel_w; + + const Mat img = bottom_blob.channel(p); + + int x0 = stride_w * dx0 + dilation_w * v; + int x1 = stride_w * dx1 + dilation_w * v; + int y0 = stride_h * dy0 + dilation_h * u; + int y1 = stride_h * dy1 + dilation_h * u; + + const signed char* sptr0 = img.row(y0) + x0 * elempack; + const signed char* sptr1 = img.row(y1) + x1 * elempack; + +#if __SSE2__ + if (elempack == 8) + { + __m128i _r0 = _mm_loadl_epi64((const __m128i*)sptr0); + __m128i _r1 = _mm_loadl_epi64((const __m128i*)sptr1); + __m128i _r01 = _mm_unpacklo_epi16(_r0, _r1); + _mm_storeu_si128((__m128i*)pp, _r01); + pp += 16; + } +#endif // __SSE2__ + if (elempack == 1) + { + pp[0] = sptr0[0]; + pp[1] = sptr1[0]; + pp += 2; + } } } }