From ea3c31190594889b622852e480e3ce3f21a9ac72 Mon Sep 17 00:00:00 2001
From: Alex Crichton <alex@alexcrichton.com>
Date: Thu, 12 Dec 2024 15:15:57 -0800
Subject: [PATCH] pulley: Get `simd_conversions.wast` test working

Lots of narrowing/extending/conversion-related opcodes are implemented
here. Note that these opcodes are all in the "extended" namespace as the
1-byte namespace has started to overflow.
---
 .../codegen/src/isa/pulley_shared/lower.isle  |  64 +++++
 .../runtests/simd-fcvt-from-sint.clif         |   4 +
 .../runtests/simd-fcvt-from-uint.clif         |   4 +
 .../filetests/runtests/simd-fvdemote.clif     |   4 +
 .../runtests/simd-fvpromote-low.clif          |   4 +
 .../filetests/runtests/simd-iadd.clif         |   4 +
 .../filetests/runtests/simd-imul-i8x16.clif   |   4 +
 .../filetests/runtests/simd-imul.clif         |   4 +
 .../filetests/runtests/simd-isub.clif         |   4 +
 .../filetests/runtests/simd-snarrow.clif      |   4 +
 .../filetests/runtests/simd-swidenhigh.clif   |   4 +
 .../filetests/runtests/simd-swidenlow.clif    |   4 +
 .../filetests/runtests/simd-unarrow.clif      |   4 +
 .../filetests/runtests/simd-uwidenhigh.clif   |   4 +
 .../filetests/runtests/simd-uwidenlow.clif    |   4 +
 crates/wasmtime/src/runtime/vm/interpreter.rs |   4 +
 crates/wast-util/src/lib.rs                   |   7 -
 pulley/src/interp.rs                          | 240 ++++++++++++++++++
 pulley/src/lib.rs                             |  82 ++++++
 19 files changed, 446 insertions(+), 7 deletions(-)

diff --git a/cranelift/codegen/src/isa/pulley_shared/lower.isle b/cranelift/codegen/src/isa/pulley_shared/lower.isle
index fd849c358f3b..5bdd79171d40 100644
--- a/cranelift/codegen/src/isa/pulley_shared/lower.isle
+++ b/cranelift/codegen/src/isa/pulley_shared/lower.isle
@@ -160,6 +160,11 @@
 (rule (lower (has_type $I64 (isub a b))) (pulley_xsub64 a b))
 
+(rule (lower (has_type $I8X16 (isub a b))) (pulley_vsubi8x16 a b))
+(rule (lower (has_type $I16X8 (isub a b))) (pulley_vsubi16x8 a b))
+(rule (lower (has_type $I32X4 (isub a b))) (pulley_vsubi32x4 a b))
+(rule (lower (has_type $I64X2 (isub a b))) (pulley_vsubi64x2 a b))
+
 ;;;; Rules for `imul` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (rule (lower (has_type $I8 (imul a b))) (pulley_xmul32 a b))
@@ -167,6 +172,11 @@
 (rule (lower (has_type $I32 (imul a b))) (pulley_xmul32 a b))
 (rule (lower (has_type $I64 (imul a b))) (pulley_xmul64 a b))
 
+(rule (lower (has_type $I8X16 (imul a b))) (pulley_vmuli8x16 a b))
+(rule (lower (has_type $I16X8 (imul a b))) (pulley_vmuli16x8 a b))
+(rule (lower (has_type $I32X4 (imul a b))) (pulley_vmuli32x4 a b))
+(rule (lower (has_type $I64X2 (imul a b))) (pulley_vmuli64x2 a b))
+
 ;;;; Rules for `umulhi` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (rule (lower (has_type $I8 (umulhi a b)))
@@ -703,6 +713,18 @@
 (rule (lower (has_type $F64 (fcvt_from_sint val @ (value_type $I64))))
   (pulley_f64_from_x64_s val))
 
+(rule (lower (has_type $F32X4 (fcvt_from_sint val @ (value_type $I32X4))))
+  (pulley_vf32x4_from_i32x4_s val))
+
+(rule (lower (has_type $F32X4 (fcvt_from_uint val @ (value_type $I32X4))))
+  (pulley_vf32x4_from_i32x4_u val))
+
+(rule (lower (has_type $F64X2 (fcvt_from_sint val @ (value_type $I64X2))))
+  (pulley_vf64x2_from_i64x2_s val))
+
+(rule (lower (has_type $F64X2 (fcvt_from_uint val @ (value_type $I64X2))))
+  (pulley_vf64x2_from_i64x2_u val))
+
 ;;;; Rules for `fcvt_to_{u,s}int_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (rule (lower (has_type $I32 (fcvt_to_uint_sat val @ (value_type $F32))))
@@ -872,3 +894,45 @@
 (rule (lower (vany_true a @ (value_type $I64X2))) (pulley_vanytrue64x2 a))
 (rule (lower (vany_true a @ (value_type $F32X4))) (pulley_vanytrue32x4 a))
 (rule (lower (vany_true a @ (value_type $F64X2))) (pulley_vanytrue64x2 a))
+
+;;;; Rules for `swiden_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(rule (lower (swiden_low a @ (value_type $I8X16))) (pulley_vwidenlow8x16_s a))
+(rule (lower (swiden_low a @ (value_type $I16X8))) (pulley_vwidenlow16x8_s a))
+(rule (lower (swiden_low a @ (value_type $I32X4))) (pulley_vwidenlow32x4_s a))
+
+;;;; Rules for `swiden_high` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(rule (lower (swiden_high a @ (value_type $I8X16))) (pulley_vwidenhigh8x16_s a))
+(rule (lower (swiden_high a @ (value_type $I16X8))) (pulley_vwidenhigh16x8_s a))
+(rule (lower (swiden_high a @ (value_type $I32X4))) (pulley_vwidenhigh32x4_s a))
+
+;;;; Rules for `uwiden_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(rule (lower (uwiden_low a @ (value_type $I8X16))) (pulley_vwidenlow8x16_u a))
+(rule (lower (uwiden_low a @ (value_type $I16X8))) (pulley_vwidenlow16x8_u a))
+(rule (lower (uwiden_low a @ (value_type $I32X4))) (pulley_vwidenlow32x4_u a))
+
+;;;; Rules for `uwiden_high` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(rule (lower (uwiden_high a @ (value_type $I8X16))) (pulley_vwidenhigh8x16_u a))
+(rule (lower (uwiden_high a @ (value_type $I16X8))) (pulley_vwidenhigh16x8_u a))
+(rule (lower (uwiden_high a @ (value_type $I32X4))) (pulley_vwidenhigh32x4_u a))
+
+;;;; Rules for `snarrow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(rule (lower (snarrow a @ (value_type $I16X8) b)) (pulley_vnarrow16x8_s a b))
+(rule (lower (snarrow a @ (value_type $I32X4) b)) (pulley_vnarrow32x4_s a b))
+
+;;;; Rules for `unarrow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(rule (lower (unarrow a @ (value_type $I16X8) b)) (pulley_vnarrow16x8_u a b))
+(rule (lower (unarrow a @ (value_type $I32X4) b)) (pulley_vnarrow32x4_u a b))
+
+;;;; Rules for `fvpromote_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(rule (lower (fvpromote_low a @ (value_type $F32X4))) (pulley_vfpromotelow a))
+
+;;;; Rules for `fvdemote` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(rule (lower (fvdemote a @ (value_type $F64X2))) (pulley_vfdemote a))
diff --git a/cranelift/filetests/filetests/runtests/simd-fcvt-from-sint.clif b/cranelift/filetests/filetests/runtests/simd-fcvt-from-sint.clif
index 5db608580d4b..3e84f4f4adf6 100644
--- a/cranelift/filetests/filetests/runtests/simd-fcvt-from-sint.clif
+++ b/cranelift/filetests/filetests/runtests/simd-fcvt-from-sint.clif
@@ -9,6 +9,10 @@ target x86_64 sse42 has_avx
 set enable_multi_ret_implicit_sret
 target riscv64 has_v
 target riscv64 has_v has_c has_zcb
+target pulley32
+target pulley32be
+target pulley64
+target pulley64be
 
 function %fcvt_from_sint32(i32x4) -> f32x4 {
 block0(v0: i32x4):
diff --git a/cranelift/filetests/filetests/runtests/simd-fcvt-from-uint.clif b/cranelift/filetests/filetests/runtests/simd-fcvt-from-uint.clif
index 30615b08044d..117bddbae491 100644
--- a/cranelift/filetests/filetests/runtests/simd-fcvt-from-uint.clif
+++ b/cranelift/filetests/filetests/runtests/simd-fcvt-from-uint.clif
@@ -10,6 +10,10 @@ target x86_64 sse42 has_avx has_avx512vl has_avx512f
 set enable_multi_ret_implicit_sret
 target riscv64 has_v
 target riscv64 has_v has_c has_zcb
+target pulley32
+target pulley32be
+target pulley64
+target pulley64be
 
 function %fcvt_from_uint32(i32x4) -> f32x4 {
 block0(v0: i32x4):
diff --git a/cranelift/filetests/filetests/runtests/simd-fvdemote.clif b/cranelift/filetests/filetests/runtests/simd-fvdemote.clif
index 2005c1e57aeb..1cfe9f7b1ae5 100644
--- a/cranelift/filetests/filetests/runtests/simd-fvdemote.clif
+++ b/cranelift/filetests/filetests/runtests/simd-fvdemote.clif
@@ -9,6 +9,10 @@ target x86_64 sse42 has_avx
 set enable_multi_ret_implicit_sret
 target riscv64 has_v
 target riscv64 has_v has_c has_zcb
+target pulley32
+target pulley32be
+target pulley64
+target pulley64be
 
 function %fvdemote(f64x2) -> f32x4 {
 block0(v0: f64x2):
diff --git a/cranelift/filetests/filetests/runtests/simd-fvpromote-low.clif b/cranelift/filetests/filetests/runtests/simd-fvpromote-low.clif
index 6e85037eee2a..4989c464d016 100644
--- a/cranelift/filetests/filetests/runtests/simd-fvpromote-low.clif
+++ b/cranelift/filetests/filetests/runtests/simd-fvpromote-low.clif
@@ -9,6 +9,10 @@ target x86_64 sse42 has_avx
 set enable_multi_ret_implicit_sret
 target riscv64 has_v
 target riscv64 has_v has_c has_zcb
+target pulley32
+target pulley32be
+target pulley64
+target pulley64be
 
 function %fvpromote_low(f32x4) -> f64x2 {
 block0(v0: f32x4):
diff --git a/cranelift/filetests/filetests/runtests/simd-iadd.clif b/cranelift/filetests/filetests/runtests/simd-iadd.clif
index 9f5d5527c337..4bdf0fbbb82f 100644
--- a/cranelift/filetests/filetests/runtests/simd-iadd.clif
+++ b/cranelift/filetests/filetests/runtests/simd-iadd.clif
@@ -7,6 +7,10 @@ target x86_64 skylake
 set enable_multi_ret_implicit_sret
 target riscv64 has_v
 target riscv64 has_v has_c has_zcb
+target pulley32
+target pulley32be
+target pulley64
+target pulley64be
 
 
 function %iadd_i8x16(i8x16, i8x16) -> i8x16 {
diff --git a/cranelift/filetests/filetests/runtests/simd-imul-i8x16.clif b/cranelift/filetests/filetests/runtests/simd-imul-i8x16.clif
index 98a4d2819b47..c8406bcaac03 100644
--- a/cranelift/filetests/filetests/runtests/simd-imul-i8x16.clif
+++ b/cranelift/filetests/filetests/runtests/simd-imul-i8x16.clif
@@ -5,6 +5,10 @@ target s390x
 set enable_multi_ret_implicit_sret
 target riscv64 has_v
 target riscv64 has_v has_c has_zcb
+target pulley32
+target pulley32be
+target pulley64
+target pulley64be
 
 
 function %imul_i8x16(i8x16, i8x16) -> i8x16 {
diff --git a/cranelift/filetests/filetests/runtests/simd-imul.clif b/cranelift/filetests/filetests/runtests/simd-imul.clif
index e00e8e1626ba..160e225d45de 100644
--- a/cranelift/filetests/filetests/runtests/simd-imul.clif
+++ b/cranelift/filetests/filetests/runtests/simd-imul.clif
@@ -7,6 +7,10 @@ target x86_64 skylake
 set enable_multi_ret_implicit_sret
 target riscv64 has_v
 target riscv64 has_v has_c has_zcb
+target pulley32
+target pulley32be
+target pulley64
+target pulley64be
 
 function %imul_i16x8(i16x8, i16x8) -> i16x8 {
 block0(v0:i16x8, v1:i16x8):
diff --git a/cranelift/filetests/filetests/runtests/simd-isub.clif b/cranelift/filetests/filetests/runtests/simd-isub.clif
index 0554a0c16781..2e20bdf8082b 100644
--- a/cranelift/filetests/filetests/runtests/simd-isub.clif
+++ b/cranelift/filetests/filetests/runtests/simd-isub.clif
@@ -7,6 +7,10 @@ target x86_64 skylake
 set enable_multi_ret_implicit_sret
 target riscv64 has_v
 target riscv64 has_v has_c has_zcb
+target pulley32
+target pulley32be
+target pulley64
+target pulley64be
 
 
 function %isub_i8x16(i8x16, i8x16) -> i8x16 {
diff --git a/cranelift/filetests/filetests/runtests/simd-snarrow.clif b/cranelift/filetests/filetests/runtests/simd-snarrow.clif
index d9cf4fccc5c1..07770261cbaa 100644
--- a/cranelift/filetests/filetests/runtests/simd-snarrow.clif
+++ b/cranelift/filetests/filetests/runtests/simd-snarrow.clif
@@ -7,6 +7,10 @@ target x86_64 has_sse3 has_ssse3 has_sse41 has_avx
 set enable_multi_ret_implicit_sret
 target riscv64 has_v
 target riscv64 has_v has_c has_zcb
+target pulley32
+target pulley32be
+target pulley64
+target pulley64be
 
 function %snarrow_i16x8(i16x8, i16x8) -> i8x16 {
 block0(v0: i16x8, v1: i16x8):
diff --git a/cranelift/filetests/filetests/runtests/simd-swidenhigh.clif b/cranelift/filetests/filetests/runtests/simd-swidenhigh.clif
index 04ffdd0d5e83..1a73ef34abd6 100644
--- a/cranelift/filetests/filetests/runtests/simd-swidenhigh.clif
+++ b/cranelift/filetests/filetests/runtests/simd-swidenhigh.clif
@@ -9,6 +9,10 @@ target x86_64 sse41 has_avx
 set enable_multi_ret_implicit_sret
 target riscv64 has_v
 target riscv64 has_v has_c has_zcb
+target pulley32
+target pulley32be
+target pulley64
+target pulley64be
 
 function %swidenhigh_i8x16(i8x16) -> i16x8 {
 block0(v0: i8x16):
diff --git a/cranelift/filetests/filetests/runtests/simd-swidenlow.clif b/cranelift/filetests/filetests/runtests/simd-swidenlow.clif
index e29e7d714aa0..3313f67457b3 100644
--- a/cranelift/filetests/filetests/runtests/simd-swidenlow.clif
+++ b/cranelift/filetests/filetests/runtests/simd-swidenlow.clif
@@ -8,6 +8,10 @@ target x86_64 sse41 has_avx
 set enable_multi_ret_implicit_sret
 target riscv64 has_v
 target riscv64 has_v has_c has_zcb
+target pulley32
+target pulley32be
+target pulley64
+target pulley64be
 
 function %swidenlow_i8x16(i8x16) -> i16x8 {
 block0(v0: i8x16):
diff --git a/cranelift/filetests/filetests/runtests/simd-unarrow.clif b/cranelift/filetests/filetests/runtests/simd-unarrow.clif
index 7ca214f7cc70..3e824e274529 100644
--- a/cranelift/filetests/filetests/runtests/simd-unarrow.clif
+++ b/cranelift/filetests/filetests/runtests/simd-unarrow.clif
@@ -9,6 +9,10 @@ target x86_64 sse42 has_avx
 set enable_multi_ret_implicit_sret
 target riscv64 has_v
 target riscv64 has_v has_c has_zcb
+target pulley32
+target pulley32be
+target pulley64
+target pulley64be
 
 function %unarrow_i16x8(i16x8, i16x8) -> i8x16 {
 block0(v0: i16x8, v1: i16x8):
diff --git a/cranelift/filetests/filetests/runtests/simd-uwidenhigh.clif b/cranelift/filetests/filetests/runtests/simd-uwidenhigh.clif
index 8118d04699f0..da2d55991150 100644
--- a/cranelift/filetests/filetests/runtests/simd-uwidenhigh.clif
+++ b/cranelift/filetests/filetests/runtests/simd-uwidenhigh.clif
@@ -8,6 +8,10 @@ target x86_64 sse41 has_avx
 set enable_multi_ret_implicit_sret
 target riscv64 has_v
 target riscv64 has_v has_c has_zcb
+target pulley32
+target pulley32be
+target pulley64
+target pulley64be
 
 function %uwidenhigh_i8x16(i8x16) -> i16x8 {
 block0(v0: i8x16):
diff --git a/cranelift/filetests/filetests/runtests/simd-uwidenlow.clif b/cranelift/filetests/filetests/runtests/simd-uwidenlow.clif
index 5d6d044666ff..6bc9f491dc2f 100644
--- a/cranelift/filetests/filetests/runtests/simd-uwidenlow.clif
+++ b/cranelift/filetests/filetests/runtests/simd-uwidenlow.clif
@@ -8,6 +8,10 @@ target x86_64 sse41 has_avx
 set enable_multi_ret_implicit_sret
 target riscv64 has_v
 target riscv64 has_v has_c has_zcb
+target pulley32
+target pulley32be
+target pulley64
+target pulley64be
 
 function %uwidenlow_i8x16(i8x16) -> i16x8 {
 block0(v0: i8x16):
diff --git a/crates/wasmtime/src/runtime/vm/interpreter.rs b/crates/wasmtime/src/runtime/vm/interpreter.rs
index 9836d0450742..fbe7a8cd3ea5 100644
--- a/crates/wasmtime/src/runtime/vm/interpreter.rs
+++ b/crates/wasmtime/src/runtime/vm/interpreter.rs
@@ -193,6 +193,10 @@ impl InterpreterRef<'_> {
         clippy::cast_sign_loss,
         reason = "macro-generated code"
     )]
+    #[cfg_attr(
+        not(feature = "component-model"),
+        expect(unused_macro_rules, reason = "macro-code")
+    )]
     unsafe fn call_indirect_host(&mut self, id: u8) {
         let id = u32::from(id);
         let fnptr = self.0[XReg::x0].get_ptr::();
diff --git a/crates/wast-util/src/lib.rs b/crates/wast-util/src/lib.rs
index 01be1a00c81a..07f2b8469257 100644
--- a/crates/wast-util/src/lib.rs
+++ b/crates/wast-util/src/lib.rs
@@ -402,9 +402,7 @@ impl WastTest {
         if config.compiler == Compiler::CraneliftPulley {
             let unsupported = [
                 "misc_testsuite/memory64/simd.wast",
-                "misc_testsuite/simd/almost-extmul.wast",
                 "misc_testsuite/simd/canonicalize-nan.wast",
-                "misc_testsuite/simd/cvt-from-uint.wast",
                 "misc_testsuite/simd/issue6725-no-egraph-panic.wast",
                 "misc_testsuite/simd/issue_3327_bnot_lowering.wast",
                 "misc_testsuite/simd/replace-lane-preserve.wast",
@@ -428,7 +426,6 @@
                 "spec_testsuite/proposals/memory64/i16x8_relaxed_q15mulr_s.wast",
                 "spec_testsuite/proposals/memory64/i32x4_relaxed_trunc.wast",
                 "spec_testsuite/proposals/memory64/i8x16_relaxed_swizzle.wast",
-                "spec_testsuite/simd_conversions.wast",
                 "spec_testsuite/simd_f32x4.wast",
                 "spec_testsuite/simd_f32x4_arith.wast",
                 "spec_testsuite/simd_f32x4_cmp.wast",
@@ -443,7 +440,6 @@
                 "spec_testsuite/simd_i16x8_arith2.wast",
                 "spec_testsuite/simd_i16x8_cmp.wast",
                 "spec_testsuite/simd_i16x8_extadd_pairwise_i8x16.wast",
-                "spec_testsuite/simd_i16x8_extmul_i8x16.wast",
                 "spec_testsuite/simd_i16x8_q15mulr_sat_s.wast",
                 "spec_testsuite/simd_i16x8_sat_arith.wast",
                 "spec_testsuite/simd_i32x4_arith.wast",
@@ -451,18 +447,15 @@
                 "spec_testsuite/simd_i32x4_cmp.wast",
                 "spec_testsuite/simd_i32x4_dot_i16x8.wast",
                 "spec_testsuite/simd_i32x4_extadd_pairwise_i16x8.wast",
-                "spec_testsuite/simd_i32x4_extmul_i16x8.wast",
                 "spec_testsuite/simd_i32x4_trunc_sat_f32x4.wast",
                 "spec_testsuite/simd_i32x4_trunc_sat_f64x2.wast",
                 "spec_testsuite/simd_i64x2_arith.wast",
                 "spec_testsuite/simd_i64x2_arith2.wast",
                 "spec_testsuite/simd_i64x2_cmp.wast",
-                "spec_testsuite/simd_i64x2_extmul_i32x4.wast",
                 "spec_testsuite/simd_i8x16_arith.wast",
                 "spec_testsuite/simd_i8x16_arith2.wast",
                 "spec_testsuite/simd_i8x16_cmp.wast",
                 "spec_testsuite/simd_i8x16_sat_arith.wast",
-                "spec_testsuite/simd_int_to_int_extend.wast",
                 "spec_testsuite/simd_lane.wast",
                 "spec_testsuite/simd_load.wast",
                 "spec_testsuite/simd_load16_lane.wast",
diff --git a/pulley/src/interp.rs b/pulley/src/interp.rs
index 694a47b65efe..34e6df6afd83 100644
--- a/pulley/src/interp.rs
+++ b/pulley/src/interp.rs
@@ -3057,4 +3057,244 @@ impl ExtendedOpVisitor for Interpreter<'_> {
         self.state[dst].set_i64(a.wrapping_abs());
         ControlFlow::Continue(())
     }
+
+    fn vf32x4_from_i32x4_s(&mut self, dst: VReg, src: VReg) -> ControlFlow<Done> {
+        let a = self.state[src].get_i32x4();
+        self.state[dst].set_f32x4(a.map(|i| i as f32));
+        ControlFlow::Continue(())
+    }
+
+    fn vf32x4_from_i32x4_u(&mut self, dst: VReg, src: VReg) -> ControlFlow<Done> {
+        let a = self.state[src].get_u32x4();
+        self.state[dst].set_f32x4(a.map(|i| i as f32));
+        ControlFlow::Continue(())
+    }
+
+    fn vf64x2_from_i64x2_s(&mut self, dst: VReg, src: VReg) -> ControlFlow<Done> {
+        let a = self.state[src].get_i64x2();
+        self.state[dst].set_f64x2(a.map(|i| i as f64));
+        ControlFlow::Continue(())
+    }
+
+    fn vf64x2_from_i64x2_u(&mut self, dst: VReg, src: VReg) -> ControlFlow<Done> {
+        let a = self.state[src].get_u64x2();
+        self.state[dst].set_f64x2(a.map(|i| i as f64));
+        ControlFlow::Continue(())
+    }
+
+    fn vwidenlow8x16_s(&mut self, dst: VReg, src: VReg) -> ControlFlow<Done> {
+        let a = *self.state[src].get_i8x16().first_chunk().unwrap();
+        self.state[dst].set_i16x8(a.map(|i| i.into()));
+        ControlFlow::Continue(())
+    }
+
+    fn vwidenlow8x16_u(&mut self, dst: VReg, src: VReg) -> ControlFlow<Done> {
+        let a = *self.state[src].get_u8x16().first_chunk().unwrap();
+        self.state[dst].set_u16x8(a.map(|i| i.into()));
+        ControlFlow::Continue(())
+    }
+
+    fn vwidenlow16x8_s(&mut self, dst: VReg, src: VReg) -> ControlFlow<Done> {
+        let a = *self.state[src].get_i16x8().first_chunk().unwrap();
+        self.state[dst].set_i32x4(a.map(|i| i.into()));
+        ControlFlow::Continue(())
+    }
+
+    fn vwidenlow16x8_u(&mut self, dst: VReg, src: VReg) -> ControlFlow<Done> {
+        let a = *self.state[src].get_u16x8().first_chunk().unwrap();
+        self.state[dst].set_u32x4(a.map(|i| i.into()));
+        ControlFlow::Continue(())
+    }
+
+    fn vwidenlow32x4_s(&mut self, dst: VReg, src: VReg) -> ControlFlow<Done> {
+        let a = *self.state[src].get_i32x4().first_chunk().unwrap();
+        self.state[dst].set_i64x2(a.map(|i| i.into()));
+        ControlFlow::Continue(())
+    }
+
+    fn vwidenlow32x4_u(&mut self, dst: VReg, src: VReg) -> ControlFlow<Done> {
+        let a = *self.state[src].get_u32x4().first_chunk().unwrap();
+        self.state[dst].set_u64x2(a.map(|i| i.into()));
+        ControlFlow::Continue(())
+    }
+
+    fn vwidenhigh8x16_s(&mut self, dst: VReg, src: VReg) -> ControlFlow<Done> {
+        let a = *self.state[src].get_i8x16().last_chunk().unwrap();
+        self.state[dst].set_i16x8(a.map(|i| i.into()));
+        ControlFlow::Continue(())
+    }
+
+    fn vwidenhigh8x16_u(&mut self, dst: VReg, src: VReg) -> ControlFlow<Done> {
+        let a = *self.state[src].get_u8x16().last_chunk().unwrap();
+        self.state[dst].set_u16x8(a.map(|i| i.into()));
+        ControlFlow::Continue(())
+    }
+
+    fn vwidenhigh16x8_s(&mut self, dst: VReg, src: VReg) -> ControlFlow<Done> {
+        let a = *self.state[src].get_i16x8().last_chunk().unwrap();
+        self.state[dst].set_i32x4(a.map(|i| i.into()));
+        ControlFlow::Continue(())
+    }
+
+    fn vwidenhigh16x8_u(&mut self, dst: VReg, src: VReg) -> ControlFlow<Done> {
+        let a = *self.state[src].get_u16x8().last_chunk().unwrap();
+        self.state[dst].set_u32x4(a.map(|i| i.into()));
+        ControlFlow::Continue(())
+    }
+
+    fn vwidenhigh32x4_s(&mut self, dst: VReg, src: VReg) -> ControlFlow<Done> {
+        let a = *self.state[src].get_i32x4().last_chunk().unwrap();
+        self.state[dst].set_i64x2(a.map(|i| i.into()));
+        ControlFlow::Continue(())
+    }
+
+    fn vwidenhigh32x4_u(&mut self, dst: VReg, src: VReg) -> ControlFlow<Done> {
+        let a = *self.state[src].get_u32x4().last_chunk().unwrap();
+        self.state[dst].set_u64x2(a.map(|i| i.into()));
+        ControlFlow::Continue(())
+    }
+
+    fn vnarrow16x8_s(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let a = self.state[operands.src1].get_i16x8();
+        let b = self.state[operands.src2].get_i16x8();
+        let mut result = [0; 16];
+        for (i, d) in a.iter().chain(&b).zip(&mut result) {
+            *d = (*i)
+                .try_into()
+                .unwrap_or(if *i < 0 { i8::MIN } else { i8::MAX });
+        }
+        self.state[operands.dst].set_i8x16(result);
+        ControlFlow::Continue(())
+    }
+
+    fn vnarrow16x8_u(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let a = self.state[operands.src1].get_i16x8();
+        let b = self.state[operands.src2].get_i16x8();
+        let mut result = [0; 16];
+        for (i, d) in a.iter().chain(&b).zip(&mut result) {
+            *d = (*i)
+                .try_into()
+                .unwrap_or(if *i < 0 { u8::MIN } else { u8::MAX });
+        }
+        self.state[operands.dst].set_u8x16(result);
+        ControlFlow::Continue(())
+    }
+
+    fn vnarrow32x4_s(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let a = self.state[operands.src1].get_i32x4();
+        let b = self.state[operands.src2].get_i32x4();
+        let mut result = [0; 8];
+        for (i, d) in a.iter().chain(&b).zip(&mut result) {
+            *d = (*i)
+                .try_into()
+                .unwrap_or(if *i < 0 { i16::MIN } else { i16::MAX });
+        }
+        self.state[operands.dst].set_i16x8(result);
+        ControlFlow::Continue(())
+    }
+
+    fn vnarrow32x4_u(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let a = self.state[operands.src1].get_i32x4();
+        let b = self.state[operands.src2].get_i32x4();
+        let mut result = [0; 8];
+        for (i, d) in a.iter().chain(&b).zip(&mut result) {
+            *d = (*i)
+                .try_into()
+                .unwrap_or(if *i < 0 { u16::MIN } else { u16::MAX });
+        }
+        self.state[operands.dst].set_u16x8(result);
+        ControlFlow::Continue(())
+    }
+
+    fn vfpromotelow(&mut self, dst: VReg, src: VReg) -> ControlFlow<Done> {
+        let a = self.state[src].get_f32x4();
+        self.state[dst].set_f64x2([a[0].into(), a[1].into()]);
+        ControlFlow::Continue(())
+    }
+
+    fn vfdemote(&mut self, dst: VReg, src: VReg) -> ControlFlow<Done> {
+        let a = self.state[src].get_f64x2();
+        self.state[dst].set_f32x4([a[0] as f32, a[1] as f32, 0.0, 0.0]);
+        ControlFlow::Continue(())
+    }
+
+    fn vsubi8x16(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let mut a = self.state[operands.src1].get_i8x16();
+        let b = self.state[operands.src2].get_i8x16();
+        for (a, b) in a.iter_mut().zip(b) {
+            *a = a.wrapping_sub(b);
+        }
+        self.state[operands.dst].set_i8x16(a);
+        ControlFlow::Continue(())
+    }
+
+    fn vsubi16x8(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let mut a = self.state[operands.src1].get_i16x8();
+        let b = self.state[operands.src2].get_i16x8();
+        for (a, b) in a.iter_mut().zip(b) {
+            *a = a.wrapping_sub(b);
+        }
+        self.state[operands.dst].set_i16x8(a);
+        ControlFlow::Continue(())
+    }
+
+    fn vsubi32x4(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let mut a = self.state[operands.src1].get_i32x4();
+        let b = self.state[operands.src2].get_i32x4();
+        for (a, b) in a.iter_mut().zip(b) {
+            *a = a.wrapping_sub(b);
+        }
+        self.state[operands.dst].set_i32x4(a);
+        ControlFlow::Continue(())
+    }
+
+    fn vsubi64x2(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let mut a = self.state[operands.src1].get_i64x2();
+        let b = self.state[operands.src2].get_i64x2();
+        for (a, b) in a.iter_mut().zip(b) {
+            *a = a.wrapping_sub(b);
+        }
+        self.state[operands.dst].set_i64x2(a);
+        ControlFlow::Continue(())
+    }
+
+    fn vmuli8x16(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let mut a = self.state[operands.src1].get_i8x16();
+        let b = self.state[operands.src2].get_i8x16();
+        for (a, b) in a.iter_mut().zip(b) {
+            *a = a.wrapping_mul(b);
+        }
+        self.state[operands.dst].set_i8x16(a);
+        ControlFlow::Continue(())
+    }
+
+    fn vmuli16x8(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let mut a = self.state[operands.src1].get_i16x8();
+        let b = self.state[operands.src2].get_i16x8();
+        for (a, b) in a.iter_mut().zip(b) {
+            *a = a.wrapping_mul(b);
+        }
+        self.state[operands.dst].set_i16x8(a);
+        ControlFlow::Continue(())
+    }
+
+    fn vmuli32x4(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let mut a = self.state[operands.src1].get_i32x4();
+        let b = self.state[operands.src2].get_i32x4();
+        for (a, b) in a.iter_mut().zip(b) {
+            *a = a.wrapping_mul(b);
+        }
+        self.state[operands.dst].set_i32x4(a);
+        ControlFlow::Continue(())
+    }
+
+    fn vmuli64x2(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let mut a = self.state[operands.src1].get_i64x2();
+        let b = self.state[operands.src2].get_i64x2();
+        for (a, b) in a.iter_mut().zip(b) {
+            *a = a.wrapping_mul(b);
+        }
+        self.state[operands.dst].set_i64x2(a);
+        ControlFlow::Continue(())
+    }
 }
diff --git a/pulley/src/lib.rs b/pulley/src/lib.rs
index baa4c813ca26..a911acd6cbdb 100644
--- a/pulley/src/lib.rs
+++ b/pulley/src/lib.rs
@@ -761,6 +761,88 @@ macro_rules! for_each_extended_op {
             xabs32 = XAbs32 { dst: XReg, src: XReg };
             /// `dst = |src|`
             xabs64 = XAbs64 { dst: XReg, src: XReg };
+
+            /// Converts the four signed `i32` lanes of the input to `f32`.
+            vf32x4_from_i32x4_s = VF32x4FromI32x4S { dst: VReg, src: VReg };
+            /// Converts the four unsigned `i32` lanes of the input to `f32`.
+            vf32x4_from_i32x4_u = VF32x4FromI32x4U { dst: VReg, src: VReg };
+            /// Converts the two signed `i64` lanes of the input to `f64`.
+            vf64x2_from_i64x2_s = VF64x2FromI64x2S { dst: VReg, src: VReg };
+            /// Converts the two unsigned `i64` lanes of the input to `f64`.
+            vf64x2_from_i64x2_u = VF64x2FromI64x2U { dst: VReg, src: VReg };
+
+            /// Widens the low lanes of the input vector, as signed, to twice
+            /// the width.
+            vwidenlow8x16_s = VWidenLow8x16S { dst: VReg, src: VReg };
+            /// Widens the low lanes of the input vector, as unsigned, to twice
+            /// the width.
+            vwidenlow8x16_u = VWidenLow8x16U { dst: VReg, src: VReg };
+            /// Widens the low lanes of the input vector, as signed, to twice
+            /// the width.
+            vwidenlow16x8_s = VWidenLow16x8S { dst: VReg, src: VReg };
+            /// Widens the low lanes of the input vector, as unsigned, to twice
+            /// the width.
+            vwidenlow16x8_u = VWidenLow16x8U { dst: VReg, src: VReg };
+            /// Widens the low lanes of the input vector, as signed, to twice
+            /// the width.
+            vwidenlow32x4_s = VWidenLow32x4S { dst: VReg, src: VReg };
+            /// Widens the low lanes of the input vector, as unsigned, to twice
+            /// the width.
+            vwidenlow32x4_u = VWidenLow32x4U { dst: VReg, src: VReg };
+            /// Widens the high lanes of the input vector, as signed, to twice
+            /// the width.
+            vwidenhigh8x16_s = VWidenHigh8x16S { dst: VReg, src: VReg };
+            /// Widens the high lanes of the input vector, as unsigned, to twice
+            /// the width.
+            vwidenhigh8x16_u = VWidenHigh8x16U { dst: VReg, src: VReg };
+            /// Widens the high lanes of the input vector, as signed, to twice
+            /// the width.
+            vwidenhigh16x8_s = VWidenHigh16x8S { dst: VReg, src: VReg };
+            /// Widens the high lanes of the input vector, as unsigned, to twice
+            /// the width.
+            vwidenhigh16x8_u = VWidenHigh16x8U { dst: VReg, src: VReg };
+            /// Widens the high lanes of the input vector, as signed, to twice
+            /// the width.
+            vwidenhigh32x4_s = VWidenHigh32x4S { dst: VReg, src: VReg };
+            /// Widens the high lanes of the input vector, as unsigned, to twice
+            /// the width.
+            vwidenhigh32x4_u = VWidenHigh32x4U { dst: VReg, src: VReg };
+
+            /// Narrows the two 16x8 vectors, assuming all input lanes are
+            /// signed, to half the width. Narrowing is signed and saturating.
+            vnarrow16x8_s = Vnarrow16x8S { operands: BinaryOperands<VReg> };
+            /// Narrows the two 16x8 vectors, assuming all input lanes are
+            /// signed, to half the width. Narrowing is unsigned and saturating.
+            vnarrow16x8_u = Vnarrow16x8U { operands: BinaryOperands<VReg> };
+            /// Narrows the two 32x4 vectors, assuming all input lanes are
+            /// signed, to half the width. Narrowing is signed and saturating.
+            vnarrow32x4_s = Vnarrow32x4S { operands: BinaryOperands<VReg> };
+            /// Narrows the two 32x4 vectors, assuming all input lanes are
+            /// signed, to half the width. Narrowing is unsigned and saturating.
+            vnarrow32x4_u = Vnarrow32x4U { operands: BinaryOperands<VReg> };
+            /// Promotes the low two lanes of the f32x4 input to f64x2.
+            vfpromotelow = VFpromoteLow { dst: VReg, src: VReg };
+            /// Demotes the two `f64` lanes of the input to `f32`, placing them
+            /// in the low lanes and zeroing the two high lanes.
+            vfdemote = VFdemote { dst: VReg, src: VReg };
+
+            /// `dst = src1 - src2`
+            vsubi8x16 = VSubI8x16 { operands: BinaryOperands<VReg> };
+            /// `dst = src1 - src2`
+            vsubi16x8 = VSubI16x8 { operands: BinaryOperands<VReg> };
+            /// `dst = src1 - src2`
+            vsubi32x4 = VSubI32x4 { operands: BinaryOperands<VReg> };
+            /// `dst = src1 - src2`
+            vsubi64x2 = VSubI64x2 { operands: BinaryOperands<VReg> };
+
+            /// `dst = src1 * src2`
+            vmuli8x16 = VMulI8x16 { operands: BinaryOperands<VReg> };
+            /// `dst = src1 * src2`
+            vmuli16x8 = VMulI16x8 { operands: BinaryOperands<VReg> };
+            /// `dst = src1 * src2`
+            vmuli32x4 = VMulI32x4 { operands: BinaryOperands<VReg> };
+            /// `dst = src1 * src2`
+            vmuli64x2 = VMulI64x2 { operands: BinaryOperands<VReg> };
         }
     };
 }
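
Note on the int-to-float conversion opcodes: the interpreter's lane conversions
use Rust's `as` casts, which round to nearest with ties to even, matching the
rounding WebAssembly requires of `f32x4.convert_i32x4_{s,u}`. A minimal
standalone check of that behavior (this snippet is illustrative only and not
part of the patch):

    // Lane conversion as done by `vf32x4_from_i32x4_u` above: `u32 as f32`
    // rounds to the nearest representable float, ties to even.
    fn main() {
        let lanes: [u32; 4] = [0, 1, u32::MAX, 16_777_217];
        let out = lanes.map(|i| i as f32);
        assert_eq!(out[2], 4_294_967_296.0); // u32::MAX rounds up to 2^32
        assert_eq!(out[3], 16_777_216.0); // 2^24 + 1 rounds down to 2^24
    }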
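Note on the widen opcodes: `first_chunk`/`last_chunk` (stable since Rust 1.77)
select the low or high half of the input lanes, and each lane is then extended
losslessly via `From`. A standalone sketch of the same selection, with
hypothetical free-function names that are not part of the patch:

    // Widen-low/high over plain arrays; sign extension via `i8 -> i16` `From`.
    fn widen_low8x16_s(src: [i8; 16]) -> [i16; 8] {
        let low: [i8; 8] = *src.first_chunk().unwrap();
        low.map(i16::from)
    }

    fn widen_high8x16_s(src: [i8; 16]) -> [i16; 8] {
        let high: [i8; 8] = *src.last_chunk().unwrap();
        high.map(i16::from)
    }

    fn main() {
        let mut v = [0i8; 16];
        v[0] = -1;
        v[15] = -128;
        assert_eq!(widen_low8x16_s(v)[0], -1i16);
        assert_eq!(widen_high8x16_s(v)[7], -128i16);
    }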
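Note on the narrow opcodes: lanes of `src1` fill the low half of the result and
lanes of `src2` the high half, with each lane saturated into the destination
range. A standalone sketch of the saturation logic in `vnarrow16x8_s`, again
with illustrative names that are not part of the patch:

    // Saturating narrow of two i16x8 inputs into one i8x16 result.
    fn narrow16x8_s(a: [i16; 8], b: [i16; 8]) -> [i8; 16] {
        let mut out = [0i8; 16];
        // `try_into` fails exactly when a lane is out of the i8 range, in
        // which case we saturate toward the sign of the input lane.
        for (lane, d) in a.iter().chain(&b).zip(&mut out) {
            *d = (*lane)
                .try_into()
                .unwrap_or(if *lane < 0 { i8::MIN } else { i8::MAX });
        }
        out
    }

    fn main() {
        let r = narrow16x8_s([300, -300, 7, 0, 0, 0, 0, 0], [0; 8]);
        assert_eq!(r[..3], [i8::MAX, i8::MIN, 7]);
    }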