From afbf2e1a6a34df4ff0c7f8e4e1821b85826f45d0 Mon Sep 17 00:00:00 2001 From: Yolanda Chen Date: Mon, 21 Oct 2024 14:40:59 +0800 Subject: [PATCH] Update 1x8c8 4x8c8 with simplified int8 convert --- ...-qc8w-gemm-1x8c8-minmax-fp32-wasmsdot-u2.c | 16 ++--- ...qs8-qc8w-gemm-1x8c8-minmax-fp32-wasmsdot.c | 16 ++--- ...qc8w-gemm-1x8c8-minmax-fp32-wasmusdot-u2.c | 16 ++--- ...s8-qc8w-gemm-1x8c8-minmax-fp32-wasmusdot.c | 16 ++--- ...-qc8w-gemm-4x8c8-minmax-fp32-wasmsdot-u2.c | 56 ++++++++++-------- ...qs8-qc8w-gemm-4x8c8-minmax-fp32-wasmsdot.c | 56 ++++++++++-------- ...qc8w-gemm-4x8c8-minmax-fp32-wasmusdot-u2.c | 56 ++++++++++-------- ...s8-qc8w-gemm-4x8c8-minmax-fp32-wasmusdot.c | 56 ++++++++++-------- ...qc8w-igemm-4x8c8-minmax-fp32-wasmsdot-u2.c | 58 +++++++++++-------- ...s8-qc8w-igemm-4x8c8-minmax-fp32-wasmsdot.c | 58 +++++++++++-------- ...c8w-igemm-4x8c8-minmax-fp32-wasmusdot-u2.c | 58 +++++++++++-------- ...8-qc8w-igemm-4x8c8-minmax-fp32-wasmusdot.c | 58 +++++++++++-------- 12 files changed, 292 insertions(+), 228 deletions(-) diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c8-minmax-fp32-wasmsdot-u2.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c8-minmax-fp32-wasmsdot-u2.c index 797b9fefaa8..51b7253c3d1 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c8-minmax-fp32-wasmsdot-u2.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c8-minmax-fp32-wasmsdot-u2.c @@ -125,12 +125,12 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__wasmsdot_u2( v128_t vacc0x01234567 = wasm_i16x8_narrow_i32x4(vacc0x0123, vacc0x4567); - v128_t vacc0x01234567_0x01234567 = wasm_i8x16_narrow_i16x8(vacc0x01234567, vacc0x01234567); + vacc0x01234567 = wasm_i8x16_narrow_i16x8(vacc0x01234567, vacc0x01234567); - vacc0x01234567_0x01234567 = wasm_i8x16_min(vacc0x01234567_0x01234567, voutput_max); + vacc0x01234567 = wasm_i8x16_min(vacc0x01234567, voutput_max); if XNN_LIKELY(nc >= 8) { - wasm_v128_store64_lane(c0, vacc0x01234567_0x01234567, 0); + wasm_v128_store64_lane(c0, vacc0x01234567, 0); c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); @@ -139,19 +139,19 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__wasmsdot_u2( nc -= 8; } else { if (nc & 4) { - wasm_v128_store32_lane(c0, vacc0x01234567_0x01234567, 0); + wasm_v128_store32_lane(c0, vacc0x01234567, 0); c0 += 4; - vacc0x01234567_0x01234567 = wasm_u64x2_shr(vacc0x01234567_0x01234567, 32); + vacc0x01234567 = wasm_u64x2_shr(vacc0x01234567, 32); } if (nc & 2) { - wasm_v128_store16_lane(c0, vacc0x01234567_0x01234567, 0); + wasm_v128_store16_lane(c0, vacc0x01234567, 0); c0 += 2; - vacc0x01234567_0x01234567 = wasm_u32x4_shr(vacc0x01234567_0x01234567, 16); + vacc0x01234567 = wasm_u32x4_shr(vacc0x01234567, 16); } if (nc & 1) { - wasm_v128_store8_lane(c0, vacc0x01234567_0x01234567, 0); + wasm_v128_store8_lane(c0, vacc0x01234567, 0); } nc = 0; diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c8-minmax-fp32-wasmsdot.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c8-minmax-fp32-wasmsdot.c index c5a85632103..ca5f04bfbdc 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c8-minmax-fp32-wasmsdot.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c8-minmax-fp32-wasmsdot.c @@ -100,12 +100,12 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__wasmsdot( v128_t vacc0x01234567 = wasm_i16x8_narrow_i32x4(vacc0x0123, vacc0x4567); - v128_t vacc0x01234567_0x01234567 = wasm_i8x16_narrow_i16x8(vacc0x01234567, vacc0x01234567); + vacc0x01234567 = wasm_i8x16_narrow_i16x8(vacc0x01234567, vacc0x01234567); - vacc0x01234567_0x01234567 = wasm_i8x16_min(vacc0x01234567_0x01234567, voutput_max); + vacc0x01234567 = wasm_i8x16_min(vacc0x01234567, voutput_max); if XNN_LIKELY(nc >= 8) { - wasm_v128_store64_lane(c0, vacc0x01234567_0x01234567, 0); + wasm_v128_store64_lane(c0, vacc0x01234567, 0); c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); @@ -114,19 +114,19 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__wasmsdot( nc -= 8; } else { if (nc & 4) { - wasm_v128_store32_lane(c0, vacc0x01234567_0x01234567, 0); + wasm_v128_store32_lane(c0, vacc0x01234567, 0); c0 += 4; - vacc0x01234567_0x01234567 = wasm_u64x2_shr(vacc0x01234567_0x01234567, 32); + vacc0x01234567 = wasm_u64x2_shr(vacc0x01234567, 32); } if (nc & 2) { - wasm_v128_store16_lane(c0, vacc0x01234567_0x01234567, 0); + wasm_v128_store16_lane(c0, vacc0x01234567, 0); c0 += 2; - vacc0x01234567_0x01234567 = wasm_u32x4_shr(vacc0x01234567_0x01234567, 16); + vacc0x01234567 = wasm_u32x4_shr(vacc0x01234567, 16); } if (nc & 1) { - wasm_v128_store8_lane(c0, vacc0x01234567_0x01234567, 0); + wasm_v128_store8_lane(c0, vacc0x01234567, 0); } nc = 0; diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c8-minmax-fp32-wasmusdot-u2.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c8-minmax-fp32-wasmusdot-u2.c index 394e62dcf48..9014c867678 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c8-minmax-fp32-wasmusdot-u2.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c8-minmax-fp32-wasmusdot-u2.c @@ -126,12 +126,12 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__wasmusdot_u2( v128_t vacc0x01234567 = wasm_i16x8_narrow_i32x4(vacc0x0123, vacc0x4567); - v128_t vacc0x01234567_0x01234567 = wasm_i8x16_narrow_i16x8(vacc0x01234567, vacc0x01234567); + vacc0x01234567 = wasm_i8x16_narrow_i16x8(vacc0x01234567, vacc0x01234567); - vacc0x01234567_0x01234567 = wasm_i8x16_min(vacc0x01234567_0x01234567, voutput_max); + vacc0x01234567 = wasm_i8x16_min(vacc0x01234567, voutput_max); if XNN_LIKELY(nc >= 8) { - wasm_v128_store64_lane(c0, vacc0x01234567_0x01234567, 0); + wasm_v128_store64_lane(c0, vacc0x01234567, 0); c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); @@ -140,19 +140,19 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__wasmusdot_u2( nc -= 8; } else { if (nc & 4) { - wasm_v128_store32_lane(c0, vacc0x01234567_0x01234567, 0); + wasm_v128_store32_lane(c0, vacc0x01234567, 0); c0 += 4; - vacc0x01234567_0x01234567 = wasm_u64x2_shr(vacc0x01234567_0x01234567, 32); + vacc0x01234567 = wasm_u64x2_shr(vacc0x01234567, 32); } if (nc & 2) { - wasm_v128_store16_lane(c0, vacc0x01234567_0x01234567, 0); + wasm_v128_store16_lane(c0, vacc0x01234567, 0); c0 += 2; - vacc0x01234567_0x01234567 = wasm_u32x4_shr(vacc0x01234567_0x01234567, 16); + vacc0x01234567 = wasm_u32x4_shr(vacc0x01234567, 16); } if (nc & 1) { - wasm_v128_store8_lane(c0, vacc0x01234567_0x01234567, 0); + wasm_v128_store8_lane(c0, vacc0x01234567, 0); } nc = 0; diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c8-minmax-fp32-wasmusdot.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c8-minmax-fp32-wasmusdot.c index 43ea2693f25..24a0d6bf494 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c8-minmax-fp32-wasmusdot.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c8-minmax-fp32-wasmusdot.c @@ -101,12 +101,12 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__wasmusdot( v128_t vacc0x01234567 = wasm_i16x8_narrow_i32x4(vacc0x0123, vacc0x4567); - v128_t vacc0x01234567_0x01234567 = wasm_i8x16_narrow_i16x8(vacc0x01234567, vacc0x01234567); + vacc0x01234567 = wasm_i8x16_narrow_i16x8(vacc0x01234567, vacc0x01234567); - vacc0x01234567_0x01234567 = wasm_i8x16_min(vacc0x01234567_0x01234567, voutput_max); + vacc0x01234567 = wasm_i8x16_min(vacc0x01234567, voutput_max); if XNN_LIKELY(nc >= 8) { - wasm_v128_store64_lane(c0, vacc0x01234567_0x01234567, 0); + wasm_v128_store64_lane(c0, vacc0x01234567, 0); c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); @@ -115,19 +115,19 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__wasmusdot( nc -= 8; } else { if (nc & 4) { - wasm_v128_store32_lane(c0, vacc0x01234567_0x01234567, 0); + wasm_v128_store32_lane(c0, vacc0x01234567, 0); c0 += 4; - vacc0x01234567_0x01234567 = wasm_u64x2_shr(vacc0x01234567_0x01234567, 32); + vacc0x01234567 = wasm_u64x2_shr(vacc0x01234567, 32); } if (nc & 2) { - wasm_v128_store16_lane(c0, vacc0x01234567_0x01234567, 0); + wasm_v128_store16_lane(c0, vacc0x01234567, 0); c0 += 2; - vacc0x01234567_0x01234567 = wasm_u32x4_shr(vacc0x01234567_0x01234567, 16); + vacc0x01234567 = wasm_u32x4_shr(vacc0x01234567, 16); } if (nc & 1) { - wasm_v128_store8_lane(c0, vacc0x01234567_0x01234567, 0); + wasm_v128_store8_lane(c0, vacc0x01234567, 0); } nc = 0; diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8c8-minmax-fp32-wasmsdot-u2.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8c8-minmax-fp32-wasmsdot-u2.c index d3f97539817..0f6fb4ab2bd 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8c8-minmax-fp32-wasmsdot-u2.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8c8-minmax-fp32-wasmsdot-u2.c @@ -245,17 +245,21 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x8c8__wasmsdot_u2( v128_t vacc2x01234567 = wasm_i16x8_narrow_i32x4(vacc2x0123, vacc2x4567); v128_t vacc3x01234567 = wasm_i16x8_narrow_i32x4(vacc3x0123, vacc3x4567); - v128_t vacc0x01234567_1x01234567 = wasm_i8x16_narrow_i16x8(vacc0x01234567, vacc1x01234567); - v128_t vacc2x01234567_3x01234567 = wasm_i8x16_narrow_i16x8(vacc2x01234567, vacc3x01234567); + vacc0x01234567 = wasm_i8x16_narrow_i16x8(vacc0x01234567, vacc0x01234567); + vacc1x01234567 = wasm_i8x16_narrow_i16x8(vacc1x01234567, vacc1x01234567); + vacc2x01234567 = wasm_i8x16_narrow_i16x8(vacc2x01234567, vacc2x01234567); + vacc3x01234567 = wasm_i8x16_narrow_i16x8(vacc3x01234567, vacc3x01234567); - vacc0x01234567_1x01234567 = wasm_i8x16_min(vacc0x01234567_1x01234567, voutput_max); - vacc2x01234567_3x01234567 = wasm_i8x16_min(vacc2x01234567_3x01234567, voutput_max); + vacc0x01234567 = wasm_i8x16_min(vacc0x01234567, voutput_max); + vacc1x01234567 = wasm_i8x16_min(vacc1x01234567, voutput_max); + vacc2x01234567 = wasm_i8x16_min(vacc2x01234567, voutput_max); + vacc3x01234567 = wasm_i8x16_min(vacc3x01234567, voutput_max); if XNN_LIKELY(nc >= 8) { - wasm_v128_store64_lane(c0, vacc0x01234567_1x01234567, 0); - wasm_v128_store64_lane(c1, vacc0x01234567_1x01234567, 1); - wasm_v128_store64_lane(c2, vacc2x01234567_3x01234567, 0); - wasm_v128_store64_lane(c3, vacc2x01234567_3x01234567, 1); + wasm_v128_store64_lane(c0, vacc0x01234567, 0); + wasm_v128_store64_lane(c1, vacc1x01234567, 0); + wasm_v128_store64_lane(c2, vacc2x01234567, 0); + wasm_v128_store64_lane(c3, vacc3x01234567, 0); c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); c1 = (int8_t*) ((uintptr_t) c1 + cn_stride); @@ -270,36 +274,40 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x8c8__wasmsdot_u2( nc -= 8; } else { if (nc & 4) { - wasm_v128_store32_lane(c0, vacc0x01234567_1x01234567, 0); + wasm_v128_store32_lane(c0, vacc0x01234567, 0); c0 += 4; - wasm_v128_store32_lane(c1, vacc0x01234567_1x01234567, 2); + wasm_v128_store32_lane(c1, vacc1x01234567, 0); c1 += 4; - wasm_v128_store32_lane(c2, vacc2x01234567_3x01234567, 0); + wasm_v128_store32_lane(c2, vacc2x01234567, 0); c2 += 4; - wasm_v128_store32_lane(c3, vacc2x01234567_3x01234567, 2); + wasm_v128_store32_lane(c3, vacc3x01234567, 0); c3 += 4; - vacc0x01234567_1x01234567 = wasm_u64x2_shr(vacc0x01234567_1x01234567, 32); - vacc2x01234567_3x01234567 = wasm_u64x2_shr(vacc2x01234567_3x01234567, 32); + vacc0x01234567 = wasm_u64x2_shr(vacc0x01234567, 32); + vacc1x01234567 = wasm_u64x2_shr(vacc1x01234567, 32); + vacc2x01234567 = wasm_u64x2_shr(vacc2x01234567, 32); + vacc3x01234567 = wasm_u64x2_shr(vacc3x01234567, 32); } if (nc & 2) { - wasm_v128_store16_lane(c0, vacc0x01234567_1x01234567, 0); + wasm_v128_store16_lane(c0, vacc0x01234567, 0); c0 += 2; - wasm_v128_store16_lane(c1, vacc0x01234567_1x01234567, 4); + wasm_v128_store16_lane(c1, vacc1x01234567, 0); c1 += 2; - wasm_v128_store16_lane(c2, vacc2x01234567_3x01234567, 0); + wasm_v128_store16_lane(c2, vacc2x01234567, 0); c2 += 2; - wasm_v128_store16_lane(c3, vacc2x01234567_3x01234567, 4); + wasm_v128_store16_lane(c3, vacc3x01234567, 0); c3 += 2; - vacc0x01234567_1x01234567 = wasm_u32x4_shr(vacc0x01234567_1x01234567, 16); - vacc2x01234567_3x01234567 = wasm_u32x4_shr(vacc2x01234567_3x01234567, 16); + vacc0x01234567 = wasm_u32x4_shr(vacc0x01234567, 16); + vacc1x01234567 = wasm_u32x4_shr(vacc1x01234567, 16); + vacc2x01234567 = wasm_u32x4_shr(vacc2x01234567, 16); + vacc3x01234567 = wasm_u32x4_shr(vacc3x01234567, 16); } if (nc & 1) { - wasm_v128_store8_lane(c0, vacc0x01234567_1x01234567, 0); - wasm_v128_store8_lane(c1, vacc0x01234567_1x01234567, 8); - wasm_v128_store8_lane(c2, vacc2x01234567_3x01234567, 0); - wasm_v128_store8_lane(c3, vacc2x01234567_3x01234567, 8); + wasm_v128_store8_lane(c0, vacc0x01234567, 0); + wasm_v128_store8_lane(c1, vacc1x01234567, 0); + wasm_v128_store8_lane(c2, vacc2x01234567, 0); + wasm_v128_store8_lane(c3, vacc3x01234567, 0); } nc = 0; diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8c8-minmax-fp32-wasmsdot.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8c8-minmax-fp32-wasmsdot.c index 8fdd2726a0d..0203c4dccaa 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8c8-minmax-fp32-wasmsdot.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8c8-minmax-fp32-wasmsdot.c @@ -187,17 +187,21 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x8c8__wasmsdot( v128_t vacc2x01234567 = wasm_i16x8_narrow_i32x4(vacc2x0123, vacc2x4567); v128_t vacc3x01234567 = wasm_i16x8_narrow_i32x4(vacc3x0123, vacc3x4567); - v128_t vacc0x01234567_1x01234567 = wasm_i8x16_narrow_i16x8(vacc0x01234567, vacc1x01234567); - v128_t vacc2x01234567_3x01234567 = wasm_i8x16_narrow_i16x8(vacc2x01234567, vacc3x01234567); + vacc0x01234567 = wasm_i8x16_narrow_i16x8(vacc0x01234567, vacc0x01234567); + vacc1x01234567 = wasm_i8x16_narrow_i16x8(vacc1x01234567, vacc1x01234567); + vacc2x01234567 = wasm_i8x16_narrow_i16x8(vacc2x01234567, vacc2x01234567); + vacc3x01234567 = wasm_i8x16_narrow_i16x8(vacc3x01234567, vacc3x01234567); - vacc0x01234567_1x01234567 = wasm_i8x16_min(vacc0x01234567_1x01234567, voutput_max); - vacc2x01234567_3x01234567 = wasm_i8x16_min(vacc2x01234567_3x01234567, voutput_max); + vacc0x01234567 = wasm_i8x16_min(vacc0x01234567, voutput_max); + vacc1x01234567 = wasm_i8x16_min(vacc1x01234567, voutput_max); + vacc2x01234567 = wasm_i8x16_min(vacc2x01234567, voutput_max); + vacc3x01234567 = wasm_i8x16_min(vacc3x01234567, voutput_max); if XNN_LIKELY(nc >= 8) { - wasm_v128_store64_lane(c0, vacc0x01234567_1x01234567, 0); - wasm_v128_store64_lane(c1, vacc0x01234567_1x01234567, 1); - wasm_v128_store64_lane(c2, vacc2x01234567_3x01234567, 0); - wasm_v128_store64_lane(c3, vacc2x01234567_3x01234567, 1); + wasm_v128_store64_lane(c0, vacc0x01234567, 0); + wasm_v128_store64_lane(c1, vacc1x01234567, 0); + wasm_v128_store64_lane(c2, vacc2x01234567, 0); + wasm_v128_store64_lane(c3, vacc3x01234567, 0); c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); c1 = (int8_t*) ((uintptr_t) c1 + cn_stride); @@ -212,36 +216,40 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x8c8__wasmsdot( nc -= 8; } else { if (nc & 4) { - wasm_v128_store32_lane(c0, vacc0x01234567_1x01234567, 0); + wasm_v128_store32_lane(c0, vacc0x01234567, 0); c0 += 4; - wasm_v128_store32_lane(c1, vacc0x01234567_1x01234567, 2); + wasm_v128_store32_lane(c1, vacc1x01234567, 0); c1 += 4; - wasm_v128_store32_lane(c2, vacc2x01234567_3x01234567, 0); + wasm_v128_store32_lane(c2, vacc2x01234567, 0); c2 += 4; - wasm_v128_store32_lane(c3, vacc2x01234567_3x01234567, 2); + wasm_v128_store32_lane(c3, vacc3x01234567, 0); c3 += 4; - vacc0x01234567_1x01234567 = wasm_u64x2_shr(vacc0x01234567_1x01234567, 32); - vacc2x01234567_3x01234567 = wasm_u64x2_shr(vacc2x01234567_3x01234567, 32); + vacc0x01234567 = wasm_u64x2_shr(vacc0x01234567, 32); + vacc1x01234567 = wasm_u64x2_shr(vacc1x01234567, 32); + vacc2x01234567 = wasm_u64x2_shr(vacc2x01234567, 32); + vacc3x01234567 = wasm_u64x2_shr(vacc3x01234567, 32); } if (nc & 2) { - wasm_v128_store16_lane(c0, vacc0x01234567_1x01234567, 0); + wasm_v128_store16_lane(c0, vacc0x01234567, 0); c0 += 2; - wasm_v128_store16_lane(c1, vacc0x01234567_1x01234567, 4); + wasm_v128_store16_lane(c1, vacc1x01234567, 0); c1 += 2; - wasm_v128_store16_lane(c2, vacc2x01234567_3x01234567, 0); + wasm_v128_store16_lane(c2, vacc2x01234567, 0); c2 += 2; - wasm_v128_store16_lane(c3, vacc2x01234567_3x01234567, 4); + wasm_v128_store16_lane(c3, vacc3x01234567, 0); c3 += 2; - vacc0x01234567_1x01234567 = wasm_u32x4_shr(vacc0x01234567_1x01234567, 16); - vacc2x01234567_3x01234567 = wasm_u32x4_shr(vacc2x01234567_3x01234567, 16); + vacc0x01234567 = wasm_u32x4_shr(vacc0x01234567, 16); + vacc1x01234567 = wasm_u32x4_shr(vacc1x01234567, 16); + vacc2x01234567 = wasm_u32x4_shr(vacc2x01234567, 16); + vacc3x01234567 = wasm_u32x4_shr(vacc3x01234567, 16); } if (nc & 1) { - wasm_v128_store8_lane(c0, vacc0x01234567_1x01234567, 0); - wasm_v128_store8_lane(c1, vacc0x01234567_1x01234567, 8); - wasm_v128_store8_lane(c2, vacc2x01234567_3x01234567, 0); - wasm_v128_store8_lane(c3, vacc2x01234567_3x01234567, 8); + wasm_v128_store8_lane(c0, vacc0x01234567, 0); + wasm_v128_store8_lane(c1, vacc1x01234567, 0); + wasm_v128_store8_lane(c2, vacc2x01234567, 0); + wasm_v128_store8_lane(c3, vacc3x01234567, 0); } nc = 0; diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8c8-minmax-fp32-wasmusdot-u2.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8c8-minmax-fp32-wasmusdot-u2.c index 4d87cc6ceee..c01994cb0f5 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8c8-minmax-fp32-wasmusdot-u2.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8c8-minmax-fp32-wasmusdot-u2.c @@ -246,17 +246,21 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x8c8__wasmusdot_u2( v128_t vacc2x01234567 = wasm_i16x8_narrow_i32x4(vacc2x0123, vacc2x4567); v128_t vacc3x01234567 = wasm_i16x8_narrow_i32x4(vacc3x0123, vacc3x4567); - v128_t vacc0x01234567_1x01234567 = wasm_i8x16_narrow_i16x8(vacc0x01234567, vacc1x01234567); - v128_t vacc2x01234567_3x01234567 = wasm_i8x16_narrow_i16x8(vacc2x01234567, vacc3x01234567); + vacc0x01234567 = wasm_i8x16_narrow_i16x8(vacc0x01234567, vacc0x01234567); + vacc1x01234567 = wasm_i8x16_narrow_i16x8(vacc1x01234567, vacc1x01234567); + vacc2x01234567 = wasm_i8x16_narrow_i16x8(vacc2x01234567, vacc2x01234567); + vacc3x01234567 = wasm_i8x16_narrow_i16x8(vacc3x01234567, vacc3x01234567); - vacc0x01234567_1x01234567 = wasm_i8x16_min(vacc0x01234567_1x01234567, voutput_max); - vacc2x01234567_3x01234567 = wasm_i8x16_min(vacc2x01234567_3x01234567, voutput_max); + vacc0x01234567 = wasm_i8x16_min(vacc0x01234567, voutput_max); + vacc1x01234567 = wasm_i8x16_min(vacc1x01234567, voutput_max); + vacc2x01234567 = wasm_i8x16_min(vacc2x01234567, voutput_max); + vacc3x01234567 = wasm_i8x16_min(vacc3x01234567, voutput_max); if XNN_LIKELY(nc >= 8) { - wasm_v128_store64_lane(c0, vacc0x01234567_1x01234567, 0); - wasm_v128_store64_lane(c1, vacc0x01234567_1x01234567, 1); - wasm_v128_store64_lane(c2, vacc2x01234567_3x01234567, 0); - wasm_v128_store64_lane(c3, vacc2x01234567_3x01234567, 1); + wasm_v128_store64_lane(c0, vacc0x01234567, 0); + wasm_v128_store64_lane(c1, vacc1x01234567, 0); + wasm_v128_store64_lane(c2, vacc2x01234567, 0); + wasm_v128_store64_lane(c3, vacc3x01234567, 0); c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); c1 = (int8_t*) ((uintptr_t) c1 + cn_stride); @@ -271,36 +275,40 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x8c8__wasmusdot_u2( nc -= 8; } else { if (nc & 4) { - wasm_v128_store32_lane(c0, vacc0x01234567_1x01234567, 0); + wasm_v128_store32_lane(c0, vacc0x01234567, 0); c0 += 4; - wasm_v128_store32_lane(c1, vacc0x01234567_1x01234567, 2); + wasm_v128_store32_lane(c1, vacc1x01234567, 0); c1 += 4; - wasm_v128_store32_lane(c2, vacc2x01234567_3x01234567, 0); + wasm_v128_store32_lane(c2, vacc2x01234567, 0); c2 += 4; - wasm_v128_store32_lane(c3, vacc2x01234567_3x01234567, 2); + wasm_v128_store32_lane(c3, vacc3x01234567, 0); c3 += 4; - vacc0x01234567_1x01234567 = wasm_u64x2_shr(vacc0x01234567_1x01234567, 32); - vacc2x01234567_3x01234567 = wasm_u64x2_shr(vacc2x01234567_3x01234567, 32); + vacc0x01234567 = wasm_u64x2_shr(vacc0x01234567, 32); + vacc1x01234567 = wasm_u64x2_shr(vacc1x01234567, 32); + vacc2x01234567 = wasm_u64x2_shr(vacc2x01234567, 32); + vacc3x01234567 = wasm_u64x2_shr(vacc3x01234567, 32); } if (nc & 2) { - wasm_v128_store16_lane(c0, vacc0x01234567_1x01234567, 0); + wasm_v128_store16_lane(c0, vacc0x01234567, 0); c0 += 2; - wasm_v128_store16_lane(c1, vacc0x01234567_1x01234567, 4); + wasm_v128_store16_lane(c1, vacc1x01234567, 0); c1 += 2; - wasm_v128_store16_lane(c2, vacc2x01234567_3x01234567, 0); + wasm_v128_store16_lane(c2, vacc2x01234567, 0); c2 += 2; - wasm_v128_store16_lane(c3, vacc2x01234567_3x01234567, 4); + wasm_v128_store16_lane(c3, vacc3x01234567, 0); c3 += 2; - vacc0x01234567_1x01234567 = wasm_u32x4_shr(vacc0x01234567_1x01234567, 16); - vacc2x01234567_3x01234567 = wasm_u32x4_shr(vacc2x01234567_3x01234567, 16); + vacc0x01234567 = wasm_u32x4_shr(vacc0x01234567, 16); + vacc1x01234567 = wasm_u32x4_shr(vacc1x01234567, 16); + vacc2x01234567 = wasm_u32x4_shr(vacc2x01234567, 16); + vacc3x01234567 = wasm_u32x4_shr(vacc3x01234567, 16); } if (nc & 1) { - wasm_v128_store8_lane(c0, vacc0x01234567_1x01234567, 0); - wasm_v128_store8_lane(c1, vacc0x01234567_1x01234567, 8); - wasm_v128_store8_lane(c2, vacc2x01234567_3x01234567, 0); - wasm_v128_store8_lane(c3, vacc2x01234567_3x01234567, 8); + wasm_v128_store8_lane(c0, vacc0x01234567, 0); + wasm_v128_store8_lane(c1, vacc1x01234567, 0); + wasm_v128_store8_lane(c2, vacc2x01234567, 0); + wasm_v128_store8_lane(c3, vacc3x01234567, 0); } nc = 0; diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8c8-minmax-fp32-wasmusdot.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8c8-minmax-fp32-wasmusdot.c index 3a3e79ff560..df8ed793a64 100644 --- a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8c8-minmax-fp32-wasmusdot.c +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8c8-minmax-fp32-wasmusdot.c @@ -188,17 +188,21 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x8c8__wasmusdot( v128_t vacc2x01234567 = wasm_i16x8_narrow_i32x4(vacc2x0123, vacc2x4567); v128_t vacc3x01234567 = wasm_i16x8_narrow_i32x4(vacc3x0123, vacc3x4567); - v128_t vacc0x01234567_1x01234567 = wasm_i8x16_narrow_i16x8(vacc0x01234567, vacc1x01234567); - v128_t vacc2x01234567_3x01234567 = wasm_i8x16_narrow_i16x8(vacc2x01234567, vacc3x01234567); + vacc0x01234567 = wasm_i8x16_narrow_i16x8(vacc0x01234567, vacc0x01234567); + vacc1x01234567 = wasm_i8x16_narrow_i16x8(vacc1x01234567, vacc1x01234567); + vacc2x01234567 = wasm_i8x16_narrow_i16x8(vacc2x01234567, vacc2x01234567); + vacc3x01234567 = wasm_i8x16_narrow_i16x8(vacc3x01234567, vacc3x01234567); - vacc0x01234567_1x01234567 = wasm_i8x16_min(vacc0x01234567_1x01234567, voutput_max); - vacc2x01234567_3x01234567 = wasm_i8x16_min(vacc2x01234567_3x01234567, voutput_max); + vacc0x01234567 = wasm_i8x16_min(vacc0x01234567, voutput_max); + vacc1x01234567 = wasm_i8x16_min(vacc1x01234567, voutput_max); + vacc2x01234567 = wasm_i8x16_min(vacc2x01234567, voutput_max); + vacc3x01234567 = wasm_i8x16_min(vacc3x01234567, voutput_max); if XNN_LIKELY(nc >= 8) { - wasm_v128_store64_lane(c0, vacc0x01234567_1x01234567, 0); - wasm_v128_store64_lane(c1, vacc0x01234567_1x01234567, 1); - wasm_v128_store64_lane(c2, vacc2x01234567_3x01234567, 0); - wasm_v128_store64_lane(c3, vacc2x01234567_3x01234567, 1); + wasm_v128_store64_lane(c0, vacc0x01234567, 0); + wasm_v128_store64_lane(c1, vacc1x01234567, 0); + wasm_v128_store64_lane(c2, vacc2x01234567, 0); + wasm_v128_store64_lane(c3, vacc3x01234567, 0); c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); c1 = (int8_t*) ((uintptr_t) c1 + cn_stride); @@ -213,36 +217,40 @@ void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x8c8__wasmusdot( nc -= 8; } else { if (nc & 4) { - wasm_v128_store32_lane(c0, vacc0x01234567_1x01234567, 0); + wasm_v128_store32_lane(c0, vacc0x01234567, 0); c0 += 4; - wasm_v128_store32_lane(c1, vacc0x01234567_1x01234567, 2); + wasm_v128_store32_lane(c1, vacc1x01234567, 0); c1 += 4; - wasm_v128_store32_lane(c2, vacc2x01234567_3x01234567, 0); + wasm_v128_store32_lane(c2, vacc2x01234567, 0); c2 += 4; - wasm_v128_store32_lane(c3, vacc2x01234567_3x01234567, 2); + wasm_v128_store32_lane(c3, vacc3x01234567, 0); c3 += 4; - vacc0x01234567_1x01234567 = wasm_u64x2_shr(vacc0x01234567_1x01234567, 32); - vacc2x01234567_3x01234567 = wasm_u64x2_shr(vacc2x01234567_3x01234567, 32); + vacc0x01234567 = wasm_u64x2_shr(vacc0x01234567, 32); + vacc1x01234567 = wasm_u64x2_shr(vacc1x01234567, 32); + vacc2x01234567 = wasm_u64x2_shr(vacc2x01234567, 32); + vacc3x01234567 = wasm_u64x2_shr(vacc3x01234567, 32); } if (nc & 2) { - wasm_v128_store16_lane(c0, vacc0x01234567_1x01234567, 0); + wasm_v128_store16_lane(c0, vacc0x01234567, 0); c0 += 2; - wasm_v128_store16_lane(c1, vacc0x01234567_1x01234567, 4); + wasm_v128_store16_lane(c1, vacc1x01234567, 0); c1 += 2; - wasm_v128_store16_lane(c2, vacc2x01234567_3x01234567, 0); + wasm_v128_store16_lane(c2, vacc2x01234567, 0); c2 += 2; - wasm_v128_store16_lane(c3, vacc2x01234567_3x01234567, 4); + wasm_v128_store16_lane(c3, vacc3x01234567, 0); c3 += 2; - vacc0x01234567_1x01234567 = wasm_u32x4_shr(vacc0x01234567_1x01234567, 16); - vacc2x01234567_3x01234567 = wasm_u32x4_shr(vacc2x01234567_3x01234567, 16); + vacc0x01234567 = wasm_u32x4_shr(vacc0x01234567, 16); + vacc1x01234567 = wasm_u32x4_shr(vacc1x01234567, 16); + vacc2x01234567 = wasm_u32x4_shr(vacc2x01234567, 16); + vacc3x01234567 = wasm_u32x4_shr(vacc3x01234567, 16); } if (nc & 1) { - wasm_v128_store8_lane(c0, vacc0x01234567_1x01234567, 0); - wasm_v128_store8_lane(c1, vacc0x01234567_1x01234567, 8); - wasm_v128_store8_lane(c2, vacc2x01234567_3x01234567, 0); - wasm_v128_store8_lane(c3, vacc2x01234567_3x01234567, 8); + wasm_v128_store8_lane(c0, vacc0x01234567, 0); + wasm_v128_store8_lane(c1, vacc1x01234567, 0); + wasm_v128_store8_lane(c2, vacc2x01234567, 0); + wasm_v128_store8_lane(c3, vacc3x01234567, 0); } nc = 0; diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8c8-minmax-fp32-wasmsdot-u2.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8c8-minmax-fp32-wasmsdot-u2.c index 4462dd2e535..5be270f4d9c 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8c8-minmax-fp32-wasmsdot-u2.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8c8-minmax-fp32-wasmsdot-u2.c @@ -2,7 +2,7 @@ // Template: src/qs8-igemm/c8-wasmdot.c.in // Generator: tools/xngen // -// Copyright 2023 Google LLC +// Copyright 2024 Google LLC // // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. @@ -263,17 +263,21 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x8c8__wasmsdot_u2( v128_t vacc2x01234567 = wasm_i16x8_narrow_i32x4(vacc2x0123, vacc2x4567); v128_t vacc3x01234567 = wasm_i16x8_narrow_i32x4(vacc3x0123, vacc3x4567); - v128_t vout0x01234567_1x01234567 = wasm_i8x16_narrow_i16x8(vacc0x01234567, vacc1x01234567); - v128_t vout2x01234567_3x01234567 = wasm_i8x16_narrow_i16x8(vacc2x01234567, vacc3x01234567); + v128_t vout0x01234567 = wasm_i8x16_narrow_i16x8(vacc0x01234567, vacc0x01234567); + v128_t vout1x01234567 = wasm_i8x16_narrow_i16x8(vacc1x01234567, vacc1x01234567); + v128_t vout2x01234567 = wasm_i8x16_narrow_i16x8(vacc2x01234567, vacc2x01234567); + v128_t vout3x01234567 = wasm_i8x16_narrow_i16x8(vacc3x01234567, vacc3x01234567); - vout0x01234567_1x01234567 = wasm_i8x16_min(vout0x01234567_1x01234567, voutput_max); - vout2x01234567_3x01234567 = wasm_i8x16_min(vout2x01234567_3x01234567, voutput_max); + vout0x01234567 = wasm_i8x16_min(vout0x01234567, voutput_max); + vout1x01234567 = wasm_i8x16_min(vout1x01234567, voutput_max); + vout2x01234567 = wasm_i8x16_min(vout2x01234567, voutput_max); + vout3x01234567 = wasm_i8x16_min(vout3x01234567, voutput_max); if (nc >= 8) { - wasm_v128_store64_lane(c3, vout2x01234567_3x01234567, 1); - wasm_v128_store64_lane(c2, vout2x01234567_3x01234567, 0); - wasm_v128_store64_lane(c1, vout0x01234567_1x01234567, 1); - wasm_v128_store64_lane(c0, vout0x01234567_1x01234567, 0); + wasm_v128_store64_lane(c3, vout3x01234567, 0); + wasm_v128_store64_lane(c2, vout2x01234567, 0); + wasm_v128_store64_lane(c1, vout1x01234567, 0); + wasm_v128_store64_lane(c0, vout0x01234567, 0); c3 = (int8_t*) ((uintptr_t) c3 + cn_stride); c2 = (int8_t*) ((uintptr_t) c2 + cn_stride); @@ -285,36 +289,40 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x8c8__wasmsdot_u2( nc -= 8; } else { if (nc & 4) { - wasm_v128_store32_lane(c3, vout2x01234567_3x01234567, 2); + wasm_v128_store32_lane(c3, vout3x01234567, 0); c3 += 4; - wasm_v128_store32_lane(c2, vout2x01234567_3x01234567, 0); + wasm_v128_store32_lane(c2, vout2x01234567, 0); c2 += 4; - wasm_v128_store32_lane(c1, vout0x01234567_1x01234567, 2); + wasm_v128_store32_lane(c1, vout1x01234567, 0); c1 += 4; - wasm_v128_store32_lane(c0, vout0x01234567_1x01234567, 0); + wasm_v128_store32_lane(c0, vout0x01234567, 0); c0 += 4; - vout0x01234567_1x01234567 = wasm_u64x2_shr(vout0x01234567_1x01234567, 32); - vout2x01234567_3x01234567 = wasm_u64x2_shr(vout2x01234567_3x01234567, 32); + vout0x01234567 = wasm_u64x2_shr(vout0x01234567, 32); + vout1x01234567 = wasm_u64x2_shr(vout1x01234567, 32); + vout2x01234567 = wasm_u64x2_shr(vout2x01234567, 32); + vout3x01234567 = wasm_u64x2_shr(vout3x01234567, 32); } if (nc & 2) { - wasm_v128_store16_lane(c3, vout2x01234567_3x01234567, 4); + wasm_v128_store16_lane(c3, vout3x01234567, 0); c3 += 2; - wasm_v128_store16_lane(c2, vout2x01234567_3x01234567, 0); + wasm_v128_store16_lane(c2, vout2x01234567, 0); c2 += 2; - wasm_v128_store16_lane(c1, vout0x01234567_1x01234567, 4); + wasm_v128_store16_lane(c1, vout1x01234567, 0); c1 += 2; - wasm_v128_store16_lane(c0, vout0x01234567_1x01234567, 0); + wasm_v128_store16_lane(c0, vout0x01234567, 0); c0 += 2; - vout0x01234567_1x01234567 = wasm_u32x4_shr(vout0x01234567_1x01234567, 16); - vout2x01234567_3x01234567 = wasm_u32x4_shr(vout2x01234567_3x01234567, 16); + vout0x01234567 = wasm_u32x4_shr(vout0x01234567, 16); + vout1x01234567 = wasm_u32x4_shr(vout1x01234567, 16); + vout2x01234567 = wasm_u32x4_shr(vout2x01234567, 16); + vout3x01234567 = wasm_u32x4_shr(vout3x01234567, 16); } if (nc & 1) { - wasm_v128_store8_lane(c3, vout2x01234567_3x01234567, 8); - wasm_v128_store8_lane(c2, vout2x01234567_3x01234567, 0); - wasm_v128_store8_lane(c1, vout0x01234567_1x01234567, 8); - wasm_v128_store8_lane(c0, vout0x01234567_1x01234567, 0); + wasm_v128_store8_lane(c3, vout3x01234567, 0); + wasm_v128_store8_lane(c2, vout2x01234567, 0); + wasm_v128_store8_lane(c1, vout1x01234567, 0); + wasm_v128_store8_lane(c0, vout0x01234567, 0); } nc = 0; diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8c8-minmax-fp32-wasmsdot.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8c8-minmax-fp32-wasmsdot.c index f54e642bd18..83bfaf58dcb 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8c8-minmax-fp32-wasmsdot.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8c8-minmax-fp32-wasmsdot.c @@ -2,7 +2,7 @@ // Template: src/qs8-igemm/c8-wasmdot.c.in // Generator: tools/xngen // -// Copyright 2023 Google LLC +// Copyright 2024 Google LLC // // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. @@ -205,17 +205,21 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x8c8__wasmsdot( v128_t vacc2x01234567 = wasm_i16x8_narrow_i32x4(vacc2x0123, vacc2x4567); v128_t vacc3x01234567 = wasm_i16x8_narrow_i32x4(vacc3x0123, vacc3x4567); - v128_t vout0x01234567_1x01234567 = wasm_i8x16_narrow_i16x8(vacc0x01234567, vacc1x01234567); - v128_t vout2x01234567_3x01234567 = wasm_i8x16_narrow_i16x8(vacc2x01234567, vacc3x01234567); + v128_t vout0x01234567 = wasm_i8x16_narrow_i16x8(vacc0x01234567, vacc0x01234567); + v128_t vout1x01234567 = wasm_i8x16_narrow_i16x8(vacc1x01234567, vacc1x01234567); + v128_t vout2x01234567 = wasm_i8x16_narrow_i16x8(vacc2x01234567, vacc2x01234567); + v128_t vout3x01234567 = wasm_i8x16_narrow_i16x8(vacc3x01234567, vacc3x01234567); - vout0x01234567_1x01234567 = wasm_i8x16_min(vout0x01234567_1x01234567, voutput_max); - vout2x01234567_3x01234567 = wasm_i8x16_min(vout2x01234567_3x01234567, voutput_max); + vout0x01234567 = wasm_i8x16_min(vout0x01234567, voutput_max); + vout1x01234567 = wasm_i8x16_min(vout1x01234567, voutput_max); + vout2x01234567 = wasm_i8x16_min(vout2x01234567, voutput_max); + vout3x01234567 = wasm_i8x16_min(vout3x01234567, voutput_max); if (nc >= 8) { - wasm_v128_store64_lane(c3, vout2x01234567_3x01234567, 1); - wasm_v128_store64_lane(c2, vout2x01234567_3x01234567, 0); - wasm_v128_store64_lane(c1, vout0x01234567_1x01234567, 1); - wasm_v128_store64_lane(c0, vout0x01234567_1x01234567, 0); + wasm_v128_store64_lane(c3, vout3x01234567, 0); + wasm_v128_store64_lane(c2, vout2x01234567, 0); + wasm_v128_store64_lane(c1, vout1x01234567, 0); + wasm_v128_store64_lane(c0, vout0x01234567, 0); c3 = (int8_t*) ((uintptr_t) c3 + cn_stride); c2 = (int8_t*) ((uintptr_t) c2 + cn_stride); @@ -227,36 +231,40 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x8c8__wasmsdot( nc -= 8; } else { if (nc & 4) { - wasm_v128_store32_lane(c3, vout2x01234567_3x01234567, 2); + wasm_v128_store32_lane(c3, vout3x01234567, 0); c3 += 4; - wasm_v128_store32_lane(c2, vout2x01234567_3x01234567, 0); + wasm_v128_store32_lane(c2, vout2x01234567, 0); c2 += 4; - wasm_v128_store32_lane(c1, vout0x01234567_1x01234567, 2); + wasm_v128_store32_lane(c1, vout1x01234567, 0); c1 += 4; - wasm_v128_store32_lane(c0, vout0x01234567_1x01234567, 0); + wasm_v128_store32_lane(c0, vout0x01234567, 0); c0 += 4; - vout0x01234567_1x01234567 = wasm_u64x2_shr(vout0x01234567_1x01234567, 32); - vout2x01234567_3x01234567 = wasm_u64x2_shr(vout2x01234567_3x01234567, 32); + vout0x01234567 = wasm_u64x2_shr(vout0x01234567, 32); + vout1x01234567 = wasm_u64x2_shr(vout1x01234567, 32); + vout2x01234567 = wasm_u64x2_shr(vout2x01234567, 32); + vout3x01234567 = wasm_u64x2_shr(vout3x01234567, 32); } if (nc & 2) { - wasm_v128_store16_lane(c3, vout2x01234567_3x01234567, 4); + wasm_v128_store16_lane(c3, vout3x01234567, 0); c3 += 2; - wasm_v128_store16_lane(c2, vout2x01234567_3x01234567, 0); + wasm_v128_store16_lane(c2, vout2x01234567, 0); c2 += 2; - wasm_v128_store16_lane(c1, vout0x01234567_1x01234567, 4); + wasm_v128_store16_lane(c1, vout1x01234567, 0); c1 += 2; - wasm_v128_store16_lane(c0, vout0x01234567_1x01234567, 0); + wasm_v128_store16_lane(c0, vout0x01234567, 0); c0 += 2; - vout0x01234567_1x01234567 = wasm_u32x4_shr(vout0x01234567_1x01234567, 16); - vout2x01234567_3x01234567 = wasm_u32x4_shr(vout2x01234567_3x01234567, 16); + vout0x01234567 = wasm_u32x4_shr(vout0x01234567, 16); + vout1x01234567 = wasm_u32x4_shr(vout1x01234567, 16); + vout2x01234567 = wasm_u32x4_shr(vout2x01234567, 16); + vout3x01234567 = wasm_u32x4_shr(vout3x01234567, 16); } if (nc & 1) { - wasm_v128_store8_lane(c3, vout2x01234567_3x01234567, 8); - wasm_v128_store8_lane(c2, vout2x01234567_3x01234567, 0); - wasm_v128_store8_lane(c1, vout0x01234567_1x01234567, 8); - wasm_v128_store8_lane(c0, vout0x01234567_1x01234567, 0); + wasm_v128_store8_lane(c3, vout3x01234567, 0); + wasm_v128_store8_lane(c2, vout2x01234567, 0); + wasm_v128_store8_lane(c1, vout1x01234567, 0); + wasm_v128_store8_lane(c0, vout0x01234567, 0); } nc = 0; diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8c8-minmax-fp32-wasmusdot-u2.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8c8-minmax-fp32-wasmusdot-u2.c index 4257b17a28a..315c20b4adf 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8c8-minmax-fp32-wasmusdot-u2.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8c8-minmax-fp32-wasmusdot-u2.c @@ -2,7 +2,7 @@ // Template: src/qs8-igemm/c8-wasmdot.c.in // Generator: tools/xngen // -// Copyright 2023 Google LLC +// Copyright 2024 Google LLC // // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. @@ -265,17 +265,21 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x8c8__wasmusdot_u2( v128_t vacc2x01234567 = wasm_i16x8_narrow_i32x4(vacc2x0123, vacc2x4567); v128_t vacc3x01234567 = wasm_i16x8_narrow_i32x4(vacc3x0123, vacc3x4567); - v128_t vout0x01234567_1x01234567 = wasm_i8x16_narrow_i16x8(vacc0x01234567, vacc1x01234567); - v128_t vout2x01234567_3x01234567 = wasm_i8x16_narrow_i16x8(vacc2x01234567, vacc3x01234567); + v128_t vout0x01234567 = wasm_i8x16_narrow_i16x8(vacc0x01234567, vacc0x01234567); + v128_t vout1x01234567 = wasm_i8x16_narrow_i16x8(vacc1x01234567, vacc1x01234567); + v128_t vout2x01234567 = wasm_i8x16_narrow_i16x8(vacc2x01234567, vacc2x01234567); + v128_t vout3x01234567 = wasm_i8x16_narrow_i16x8(vacc3x01234567, vacc3x01234567); - vout0x01234567_1x01234567 = wasm_i8x16_min(vout0x01234567_1x01234567, voutput_max); - vout2x01234567_3x01234567 = wasm_i8x16_min(vout2x01234567_3x01234567, voutput_max); + vout0x01234567 = wasm_i8x16_min(vout0x01234567, voutput_max); + vout1x01234567 = wasm_i8x16_min(vout1x01234567, voutput_max); + vout2x01234567 = wasm_i8x16_min(vout2x01234567, voutput_max); + vout3x01234567 = wasm_i8x16_min(vout3x01234567, voutput_max); if (nc >= 8) { - wasm_v128_store64_lane(c3, vout2x01234567_3x01234567, 1); - wasm_v128_store64_lane(c2, vout2x01234567_3x01234567, 0); - wasm_v128_store64_lane(c1, vout0x01234567_1x01234567, 1); - wasm_v128_store64_lane(c0, vout0x01234567_1x01234567, 0); + wasm_v128_store64_lane(c3, vout3x01234567, 0); + wasm_v128_store64_lane(c2, vout2x01234567, 0); + wasm_v128_store64_lane(c1, vout1x01234567, 0); + wasm_v128_store64_lane(c0, vout0x01234567, 0); c3 = (int8_t*) ((uintptr_t) c3 + cn_stride); c2 = (int8_t*) ((uintptr_t) c2 + cn_stride); @@ -287,36 +291,40 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x8c8__wasmusdot_u2( nc -= 8; } else { if (nc & 4) { - wasm_v128_store32_lane(c3, vout2x01234567_3x01234567, 2); + wasm_v128_store32_lane(c3, vout3x01234567, 0); c3 += 4; - wasm_v128_store32_lane(c2, vout2x01234567_3x01234567, 0); + wasm_v128_store32_lane(c2, vout2x01234567, 0); c2 += 4; - wasm_v128_store32_lane(c1, vout0x01234567_1x01234567, 2); + wasm_v128_store32_lane(c1, vout1x01234567, 0); c1 += 4; - wasm_v128_store32_lane(c0, vout0x01234567_1x01234567, 0); + wasm_v128_store32_lane(c0, vout0x01234567, 0); c0 += 4; - vout0x01234567_1x01234567 = wasm_u64x2_shr(vout0x01234567_1x01234567, 32); - vout2x01234567_3x01234567 = wasm_u64x2_shr(vout2x01234567_3x01234567, 32); + vout0x01234567 = wasm_u64x2_shr(vout0x01234567, 32); + vout1x01234567 = wasm_u64x2_shr(vout1x01234567, 32); + vout2x01234567 = wasm_u64x2_shr(vout2x01234567, 32); + vout3x01234567 = wasm_u64x2_shr(vout3x01234567, 32); } if (nc & 2) { - wasm_v128_store16_lane(c3, vout2x01234567_3x01234567, 4); + wasm_v128_store16_lane(c3, vout3x01234567, 0); c3 += 2; - wasm_v128_store16_lane(c2, vout2x01234567_3x01234567, 0); + wasm_v128_store16_lane(c2, vout2x01234567, 0); c2 += 2; - wasm_v128_store16_lane(c1, vout0x01234567_1x01234567, 4); + wasm_v128_store16_lane(c1, vout1x01234567, 0); c1 += 2; - wasm_v128_store16_lane(c0, vout0x01234567_1x01234567, 0); + wasm_v128_store16_lane(c0, vout0x01234567, 0); c0 += 2; - vout0x01234567_1x01234567 = wasm_u32x4_shr(vout0x01234567_1x01234567, 16); - vout2x01234567_3x01234567 = wasm_u32x4_shr(vout2x01234567_3x01234567, 16); + vout0x01234567 = wasm_u32x4_shr(vout0x01234567, 16); + vout1x01234567 = wasm_u32x4_shr(vout1x01234567, 16); + vout2x01234567 = wasm_u32x4_shr(vout2x01234567, 16); + vout3x01234567 = wasm_u32x4_shr(vout3x01234567, 16); } if (nc & 1) { - wasm_v128_store8_lane(c3, vout2x01234567_3x01234567, 8); - wasm_v128_store8_lane(c2, vout2x01234567_3x01234567, 0); - wasm_v128_store8_lane(c1, vout0x01234567_1x01234567, 8); - wasm_v128_store8_lane(c0, vout0x01234567_1x01234567, 0); + wasm_v128_store8_lane(c3, vout3x01234567, 0); + wasm_v128_store8_lane(c2, vout2x01234567, 0); + wasm_v128_store8_lane(c1, vout1x01234567, 0); + wasm_v128_store8_lane(c0, vout0x01234567, 0); } nc = 0; diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8c8-minmax-fp32-wasmusdot.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8c8-minmax-fp32-wasmusdot.c index f8bcfe2bc80..05be50ec24d 100644 --- a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8c8-minmax-fp32-wasmusdot.c +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8c8-minmax-fp32-wasmusdot.c @@ -2,7 +2,7 @@ // Template: src/qs8-igemm/c8-wasmdot.c.in // Generator: tools/xngen // -// Copyright 2023 Google LLC +// Copyright 2024 Google LLC // // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. @@ -207,17 +207,21 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x8c8__wasmusdot( v128_t vacc2x01234567 = wasm_i16x8_narrow_i32x4(vacc2x0123, vacc2x4567); v128_t vacc3x01234567 = wasm_i16x8_narrow_i32x4(vacc3x0123, vacc3x4567); - v128_t vout0x01234567_1x01234567 = wasm_i8x16_narrow_i16x8(vacc0x01234567, vacc1x01234567); - v128_t vout2x01234567_3x01234567 = wasm_i8x16_narrow_i16x8(vacc2x01234567, vacc3x01234567); + v128_t vout0x01234567 = wasm_i8x16_narrow_i16x8(vacc0x01234567, vacc0x01234567); + v128_t vout1x01234567 = wasm_i8x16_narrow_i16x8(vacc1x01234567, vacc1x01234567); + v128_t vout2x01234567 = wasm_i8x16_narrow_i16x8(vacc2x01234567, vacc2x01234567); + v128_t vout3x01234567 = wasm_i8x16_narrow_i16x8(vacc3x01234567, vacc3x01234567); - vout0x01234567_1x01234567 = wasm_i8x16_min(vout0x01234567_1x01234567, voutput_max); - vout2x01234567_3x01234567 = wasm_i8x16_min(vout2x01234567_3x01234567, voutput_max); + vout0x01234567 = wasm_i8x16_min(vout0x01234567, voutput_max); + vout1x01234567 = wasm_i8x16_min(vout1x01234567, voutput_max); + vout2x01234567 = wasm_i8x16_min(vout2x01234567, voutput_max); + vout3x01234567 = wasm_i8x16_min(vout3x01234567, voutput_max); if (nc >= 8) { - wasm_v128_store64_lane(c3, vout2x01234567_3x01234567, 1); - wasm_v128_store64_lane(c2, vout2x01234567_3x01234567, 0); - wasm_v128_store64_lane(c1, vout0x01234567_1x01234567, 1); - wasm_v128_store64_lane(c0, vout0x01234567_1x01234567, 0); + wasm_v128_store64_lane(c3, vout3x01234567, 0); + wasm_v128_store64_lane(c2, vout2x01234567, 0); + wasm_v128_store64_lane(c1, vout1x01234567, 0); + wasm_v128_store64_lane(c0, vout0x01234567, 0); c3 = (int8_t*) ((uintptr_t) c3 + cn_stride); c2 = (int8_t*) ((uintptr_t) c2 + cn_stride); @@ -229,36 +233,40 @@ void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x8c8__wasmusdot( nc -= 8; } else { if (nc & 4) { - wasm_v128_store32_lane(c3, vout2x01234567_3x01234567, 2); + wasm_v128_store32_lane(c3, vout3x01234567, 0); c3 += 4; - wasm_v128_store32_lane(c2, vout2x01234567_3x01234567, 0); + wasm_v128_store32_lane(c2, vout2x01234567, 0); c2 += 4; - wasm_v128_store32_lane(c1, vout0x01234567_1x01234567, 2); + wasm_v128_store32_lane(c1, vout1x01234567, 0); c1 += 4; - wasm_v128_store32_lane(c0, vout0x01234567_1x01234567, 0); + wasm_v128_store32_lane(c0, vout0x01234567, 0); c0 += 4; - vout0x01234567_1x01234567 = wasm_u64x2_shr(vout0x01234567_1x01234567, 32); - vout2x01234567_3x01234567 = wasm_u64x2_shr(vout2x01234567_3x01234567, 32); + vout0x01234567 = wasm_u64x2_shr(vout0x01234567, 32); + vout1x01234567 = wasm_u64x2_shr(vout1x01234567, 32); + vout2x01234567 = wasm_u64x2_shr(vout2x01234567, 32); + vout3x01234567 = wasm_u64x2_shr(vout3x01234567, 32); } if (nc & 2) { - wasm_v128_store16_lane(c3, vout2x01234567_3x01234567, 4); + wasm_v128_store16_lane(c3, vout3x01234567, 0); c3 += 2; - wasm_v128_store16_lane(c2, vout2x01234567_3x01234567, 0); + wasm_v128_store16_lane(c2, vout2x01234567, 0); c2 += 2; - wasm_v128_store16_lane(c1, vout0x01234567_1x01234567, 4); + wasm_v128_store16_lane(c1, vout1x01234567, 0); c1 += 2; - wasm_v128_store16_lane(c0, vout0x01234567_1x01234567, 0); + wasm_v128_store16_lane(c0, vout0x01234567, 0); c0 += 2; - vout0x01234567_1x01234567 = wasm_u32x4_shr(vout0x01234567_1x01234567, 16); - vout2x01234567_3x01234567 = wasm_u32x4_shr(vout2x01234567_3x01234567, 16); + vout0x01234567 = wasm_u32x4_shr(vout0x01234567, 16); + vout1x01234567 = wasm_u32x4_shr(vout1x01234567, 16); + vout2x01234567 = wasm_u32x4_shr(vout2x01234567, 16); + vout3x01234567 = wasm_u32x4_shr(vout3x01234567, 16); } if (nc & 1) { - wasm_v128_store8_lane(c3, vout2x01234567_3x01234567, 8); - wasm_v128_store8_lane(c2, vout2x01234567_3x01234567, 0); - wasm_v128_store8_lane(c1, vout0x01234567_1x01234567, 8); - wasm_v128_store8_lane(c0, vout0x01234567_1x01234567, 0); + wasm_v128_store8_lane(c3, vout3x01234567, 0); + wasm_v128_store8_lane(c2, vout2x01234567, 0); + wasm_v128_store8_lane(c1, vout1x01234567, 0); + wasm_v128_store8_lane(c0, vout0x01234567, 0); } nc = 0;