From ea3c31190594889b622852e480e3ce3f21a9ac72 Mon Sep 17 00:00:00 2001
From: Alex Crichton <alex@alexcrichton.com>
Date: Thu, 12 Dec 2024 15:15:57 -0800
Subject: [PATCH] pulley: Get `simd_conversions.wast` test working

Lots of narrowing/extending/conversion-related opcodes are implemented
here. Note that these opcodes are all in the "extended" namespace as the
1-byte namespace has started to overflow.
---
 .../codegen/src/isa/pulley_shared/lower.isle  |  64 +++++
 .../runtests/simd-fcvt-from-sint.clif         |   4 +
 .../runtests/simd-fcvt-from-uint.clif         |   4 +
 .../filetests/runtests/simd-fvdemote.clif     |   4 +
 .../runtests/simd-fvpromote-low.clif          |   4 +
 .../filetests/runtests/simd-iadd.clif         |   4 +
 .../filetests/runtests/simd-imul-i8x16.clif   |   4 +
 .../filetests/runtests/simd-imul.clif         |   4 +
 .../filetests/runtests/simd-isub.clif         |   4 +
 .../filetests/runtests/simd-snarrow.clif      |   4 +
 .../filetests/runtests/simd-swidenhigh.clif   |   4 +
 .../filetests/runtests/simd-swidenlow.clif    |   4 +
 .../filetests/runtests/simd-unarrow.clif      |   4 +
 .../filetests/runtests/simd-uwidenhigh.clif   |   4 +
 .../filetests/runtests/simd-uwidenlow.clif    |   4 +
 crates/wasmtime/src/runtime/vm/interpreter.rs |   4 +
 crates/wast-util/src/lib.rs                   |   7 -
 pulley/src/interp.rs                          | 240 ++++++++++++++++++
 pulley/src/lib.rs                             |  82 ++++++
 19 files changed, 446 insertions(+), 7 deletions(-)

diff --git a/cranelift/codegen/src/isa/pulley_shared/lower.isle b/cranelift/codegen/src/isa/pulley_shared/lower.isle
index fd849c358f3b..5bdd79171d40 100644
--- a/cranelift/codegen/src/isa/pulley_shared/lower.isle
+++ b/cranelift/codegen/src/isa/pulley_shared/lower.isle
@@ -160,6 +160,11 @@
 (rule (lower (has_type $I64 (isub a b))) (pulley_xsub64 a b))
 
+(rule (lower (has_type $I8X16 (isub a b))) (pulley_vsubi8x16 a b))
+(rule (lower (has_type $I16X8 (isub a b))) (pulley_vsubi16x8 a b))
+(rule (lower (has_type $I32X4 (isub a b))) (pulley_vsubi32x4 a b))
+(rule (lower (has_type $I64X2 (isub a b))) (pulley_vsubi64x2 a b))
+
 ;;;; Rules for `imul` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (rule (lower (has_type $I8 (imul a b))) (pulley_xmul32 a b))
@@ -167,6 +172,11 @@
 (rule (lower (has_type $I32 (imul a b))) (pulley_xmul32 a b))
 (rule (lower (has_type $I64 (imul a b))) (pulley_xmul64 a b))
 
+(rule (lower (has_type $I8X16 (imul a b))) (pulley_vmuli8x16 a b))
+(rule (lower (has_type $I16X8 (imul a b))) (pulley_vmuli16x8 a b))
+(rule (lower (has_type $I32X4 (imul a b))) (pulley_vmuli32x4 a b))
+(rule (lower (has_type $I64X2 (imul a b))) (pulley_vmuli64x2 a b))
+
 ;;;; Rules for `umulhi` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (rule (lower (has_type $I8 (umulhi a b)))
@@ -703,6 +713,18 @@
 (rule (lower (has_type $F64 (fcvt_from_sint val @ (value_type $I64))))
   (pulley_f64_from_x64_s val))
 
+(rule (lower (has_type $F32X4 (fcvt_from_sint val @ (value_type $I32X4))))
+  (pulley_vf32x4_from_i32x4_s val))
+
+(rule (lower (has_type $F32X4 (fcvt_from_uint val @ (value_type $I32X4))))
+  (pulley_vf32x4_from_i32x4_u val))
+
+(rule (lower (has_type $F64X2 (fcvt_from_sint val @ (value_type $I64X2))))
+  (pulley_vf64x2_from_i64x2_s val))
+
+(rule (lower (has_type $F64X2 (fcvt_from_uint val @ (value_type $I64X2))))
+  (pulley_vf64x2_from_i64x2_u val))
+
 ;;;; Rules for `fcvt_to_{u,s}int_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (rule (lower (has_type $I32 (fcvt_to_uint_sat val @ (value_type $F32))))
@@ -872,3 +894,45 @@
 (rule (lower (vany_true a @ (value_type $I64X2))) (pulley_vanytrue64x2 a))
 (rule (lower (vany_true a @ (value_type $F32X4))) (pulley_vanytrue32x4 a))
 (rule (lower (vany_true a @ (value_type $F64X2))) (pulley_vanytrue64x2 a))
+
+;;;; Rules for `swiden_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(rule (lower (swiden_low a @ (value_type $I8X16))) (pulley_vwidenlow8x16_s a))
+(rule (lower (swiden_low a @ (value_type $I16X8))) (pulley_vwidenlow16x8_s a))
+(rule (lower (swiden_low a @ (value_type $I32X4))) (pulley_vwidenlow32x4_s a))
+
+;;;; Rules for `swiden_high` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(rule (lower (swiden_high a @ (value_type $I8X16))) (pulley_vwidenhigh8x16_s a))
+(rule (lower (swiden_high a @ (value_type $I16X8))) (pulley_vwidenhigh16x8_s a))
+(rule (lower (swiden_high a @ (value_type $I32X4))) (pulley_vwidenhigh32x4_s a))
+
+;;;; Rules for `uwiden_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(rule (lower (uwiden_low a @ (value_type $I8X16))) (pulley_vwidenlow8x16_u a))
+(rule (lower (uwiden_low a @ (value_type $I16X8))) (pulley_vwidenlow16x8_u a))
+(rule (lower (uwiden_low a @ (value_type $I32X4))) (pulley_vwidenlow32x4_u a))
+
+;;;; Rules for `uwiden_high` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(rule (lower (uwiden_high a @ (value_type $I8X16))) (pulley_vwidenhigh8x16_u a))
+(rule (lower (uwiden_high a @ (value_type $I16X8))) (pulley_vwidenhigh16x8_u a))
+(rule (lower (uwiden_high a @ (value_type $I32X4))) (pulley_vwidenhigh32x4_u a))
+
+;;;; Rules for `snarrow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(rule (lower (snarrow a @ (value_type $I16X8) b)) (pulley_vnarrow16x8_s a b))
+(rule (lower (snarrow a @ (value_type $I32X4) b)) (pulley_vnarrow32x4_s a b))
+
+;;;; Rules for `unarrow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(rule (lower (unarrow a @ (value_type $I16X8) b)) (pulley_vnarrow16x8_u a b))
+(rule (lower (unarrow a @ (value_type $I32X4) b)) (pulley_vnarrow32x4_u a b))
+
+;;;; Rules for `fvpromote_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(rule (lower (fvpromote_low a @ (value_type $F32X4))) (pulley_vfpromotelow a))
+
+;;;; Rules for `fvdemote` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(rule (lower (fvdemote a @ (value_type $F64X2))) (pulley_vfdemote a))
diff --git a/cranelift/filetests/filetests/runtests/simd-fcvt-from-sint.clif b/cranelift/filetests/filetests/runtests/simd-fcvt-from-sint.clif
index 5db608580d4b..3e84f4f4adf6 100644
--- a/cranelift/filetests/filetests/runtests/simd-fcvt-from-sint.clif
+++ b/cranelift/filetests/filetests/runtests/simd-fcvt-from-sint.clif
@@ -9,6 +9,10 @@ target x86_64 sse42 has_avx
 set enable_multi_ret_implicit_sret
 target riscv64 has_v
 target riscv64 has_v has_c has_zcb
+target pulley32
+target pulley32be
+target pulley64
+target pulley64be
 
 function %fcvt_from_sint32(i32x4) -> f32x4 {
 block0(v0: i32x4):
diff --git a/cranelift/filetests/filetests/runtests/simd-fcvt-from-uint.clif b/cranelift/filetests/filetests/runtests/simd-fcvt-from-uint.clif
index 30615b08044d..117bddbae491 100644
--- a/cranelift/filetests/filetests/runtests/simd-fcvt-from-uint.clif
+++ b/cranelift/filetests/filetests/runtests/simd-fcvt-from-uint.clif
@@ -10,6 +10,10 @@ target x86_64 sse42 has_avx has_avx512vl has_avx512f
 set enable_multi_ret_implicit_sret
 target riscv64 has_v
 target riscv64 has_v has_c has_zcb
+target pulley32
+target pulley32be
+target pulley64
+target pulley64be
 
 function %fcvt_from_uint32(i32x4) -> f32x4 {
 block0(v0: i32x4):
diff --git a/cranelift/filetests/filetests/runtests/simd-fvdemote.clif b/cranelift/filetests/filetests/runtests/simd-fvdemote.clif
index 2005c1e57aeb..1cfe9f7b1ae5 100644
--- a/cranelift/filetests/filetests/runtests/simd-fvdemote.clif
+++ b/cranelift/filetests/filetests/runtests/simd-fvdemote.clif
@@ -9,6 +9,10 @@ target x86_64 sse42 has_avx
 set enable_multi_ret_implicit_sret
 target riscv64 has_v
 target riscv64 has_v has_c has_zcb
+target pulley32
+target pulley32be
+target pulley64
+target pulley64be
 
 function %fvdemote(f64x2) -> f32x4 {
 block0(v0: f64x2):
diff --git a/cranelift/filetests/filetests/runtests/simd-fvpromote-low.clif b/cranelift/filetests/filetests/runtests/simd-fvpromote-low.clif
index 6e85037eee2a..4989c464d016 100644
--- a/cranelift/filetests/filetests/runtests/simd-fvpromote-low.clif
+++ b/cranelift/filetests/filetests/runtests/simd-fvpromote-low.clif
@@ -9,6 +9,10 @@ target x86_64 sse42 has_avx
 set enable_multi_ret_implicit_sret
 target riscv64 has_v
 target riscv64 has_v has_c has_zcb
+target pulley32
+target pulley32be
+target pulley64
+target pulley64be
 
 function %fvpromote_low(f32x4) -> f64x2 {
 block0(v0: f32x4):
diff --git a/cranelift/filetests/filetests/runtests/simd-iadd.clif b/cranelift/filetests/filetests/runtests/simd-iadd.clif
index 9f5d5527c337..4bdf0fbbb82f 100644
--- a/cranelift/filetests/filetests/runtests/simd-iadd.clif
+++ b/cranelift/filetests/filetests/runtests/simd-iadd.clif
@@ -7,6 +7,10 @@ target x86_64 skylake
 set enable_multi_ret_implicit_sret
 target riscv64 has_v
 target riscv64 has_v has_c has_zcb
+target pulley32
+target pulley32be
+target pulley64
+target pulley64be
 
 
 function %iadd_i8x16(i8x16, i8x16) -> i8x16 {
diff --git a/cranelift/filetests/filetests/runtests/simd-imul-i8x16.clif b/cranelift/filetests/filetests/runtests/simd-imul-i8x16.clif
index 98a4d2819b47..c8406bcaac03 100644
--- a/cranelift/filetests/filetests/runtests/simd-imul-i8x16.clif
+++ b/cranelift/filetests/filetests/runtests/simd-imul-i8x16.clif
@@ -5,6 +5,10 @@ target s390x
 set enable_multi_ret_implicit_sret
 target riscv64 has_v
 target riscv64 has_v has_c has_zcb
+target pulley32
+target pulley32be
+target pulley64
+target pulley64be
 
 
 function %imul_i8x16(i8x16, i8x16) -> i8x16 {
diff --git a/cranelift/filetests/filetests/runtests/simd-imul.clif b/cranelift/filetests/filetests/runtests/simd-imul.clif
index e00e8e1626ba..160e225d45de 100644
--- a/cranelift/filetests/filetests/runtests/simd-imul.clif
+++ b/cranelift/filetests/filetests/runtests/simd-imul.clif
@@ -7,6 +7,10 @@ target x86_64 skylake
 set enable_multi_ret_implicit_sret
 target riscv64 has_v
 target riscv64 has_v has_c has_zcb
+target pulley32
+target pulley32be
+target pulley64
+target pulley64be
 
 function %imul_i16x8(i16x8, i16x8) -> i16x8 {
 block0(v0:i16x8, v1:i16x8):
diff --git a/cranelift/filetests/filetests/runtests/simd-isub.clif b/cranelift/filetests/filetests/runtests/simd-isub.clif
index 0554a0c16781..2e20bdf8082b 100644
--- a/cranelift/filetests/filetests/runtests/simd-isub.clif
+++ b/cranelift/filetests/filetests/runtests/simd-isub.clif
@@ -7,6 +7,10 @@ target x86_64 skylake
 set enable_multi_ret_implicit_sret
 target riscv64 has_v
 target riscv64 has_v has_c has_zcb
+target pulley32
+target pulley32be
+target pulley64
+target pulley64be
 
 
 function %isub_i8x16(i8x16, i8x16) -> i8x16 {
diff --git a/cranelift/filetests/filetests/runtests/simd-snarrow.clif b/cranelift/filetests/filetests/runtests/simd-snarrow.clif
index d9cf4fccc5c1..07770261cbaa 100644
--- a/cranelift/filetests/filetests/runtests/simd-snarrow.clif
+++ b/cranelift/filetests/filetests/runtests/simd-snarrow.clif
@@ -7,6 +7,10 @@ target x86_64 has_sse3 has_ssse3 has_sse41 has_avx
 set enable_multi_ret_implicit_sret
 target riscv64 has_v
 target riscv64 has_v has_c has_zcb
+target pulley32
+target pulley32be
+target pulley64
+target pulley64be
 
 function %snarrow_i16x8(i16x8, i16x8) -> i8x16 {
 block0(v0: i16x8, v1: i16x8):
diff --git a/cranelift/filetests/filetests/runtests/simd-swidenhigh.clif b/cranelift/filetests/filetests/runtests/simd-swidenhigh.clif
index 04ffdd0d5e83..1a73ef34abd6 100644
--- a/cranelift/filetests/filetests/runtests/simd-swidenhigh.clif
+++ b/cranelift/filetests/filetests/runtests/simd-swidenhigh.clif
@@ -9,6 +9,10 @@ target x86_64 sse41 has_avx
 set enable_multi_ret_implicit_sret
 target riscv64 has_v
 target riscv64 has_v has_c has_zcb
+target pulley32
+target pulley32be
+target pulley64
+target pulley64be
 
 function %swidenhigh_i8x16(i8x16) -> i16x8 {
 block0(v0: i8x16):
diff --git a/cranelift/filetests/filetests/runtests/simd-swidenlow.clif b/cranelift/filetests/filetests/runtests/simd-swidenlow.clif
index e29e7d714aa0..3313f67457b3 100644
--- a/cranelift/filetests/filetests/runtests/simd-swidenlow.clif
+++ b/cranelift/filetests/filetests/runtests/simd-swidenlow.clif
@@ -8,6 +8,10 @@ target x86_64 sse41 has_avx
 set enable_multi_ret_implicit_sret
 target riscv64 has_v
 target riscv64 has_v has_c has_zcb
+target pulley32
+target pulley32be
+target pulley64
+target pulley64be
 
 function %swidenlow_i8x16(i8x16) -> i16x8 {
 block0(v0: i8x16):
diff --git a/cranelift/filetests/filetests/runtests/simd-unarrow.clif b/cranelift/filetests/filetests/runtests/simd-unarrow.clif
index 7ca214f7cc70..3e824e274529 100644
--- a/cranelift/filetests/filetests/runtests/simd-unarrow.clif
+++ b/cranelift/filetests/filetests/runtests/simd-unarrow.clif
@@ -9,6 +9,10 @@ target x86_64 sse42 has_avx
 set enable_multi_ret_implicit_sret
 target riscv64 has_v
 target riscv64 has_v has_c has_zcb
+target pulley32
+target pulley32be
+target pulley64
+target pulley64be
 
 function %unarrow_i16x8(i16x8, i16x8) -> i8x16 {
 block0(v0: i16x8, v1: i16x8):
diff --git a/cranelift/filetests/filetests/runtests/simd-uwidenhigh.clif b/cranelift/filetests/filetests/runtests/simd-uwidenhigh.clif
index 8118d04699f0..da2d55991150 100644
--- a/cranelift/filetests/filetests/runtests/simd-uwidenhigh.clif
+++ b/cranelift/filetests/filetests/runtests/simd-uwidenhigh.clif
@@ -8,6 +8,10 @@ target x86_64 sse41 has_avx
 set enable_multi_ret_implicit_sret
 target riscv64 has_v
 target riscv64 has_v has_c has_zcb
+target pulley32
+target pulley32be
+target pulley64
+target pulley64be
 
 function %uwidenhigh_i8x16(i8x16) -> i16x8 {
 block0(v0: i8x16):
diff --git a/cranelift/filetests/filetests/runtests/simd-uwidenlow.clif b/cranelift/filetests/filetests/runtests/simd-uwidenlow.clif
index 5d6d044666ff..6bc9f491dc2f 100644
--- a/cranelift/filetests/filetests/runtests/simd-uwidenlow.clif
+++ b/cranelift/filetests/filetests/runtests/simd-uwidenlow.clif
@@ -8,6 +8,10 @@ target x86_64 sse41 has_avx
 set enable_multi_ret_implicit_sret
 target riscv64 has_v
 target riscv64 has_v has_c has_zcb
+target pulley32
+target pulley32be
+target pulley64
+target pulley64be
 
 function %uwidenlow_i8x16(i8x16) -> i16x8 {
 block0(v0: i8x16):
diff --git a/crates/wasmtime/src/runtime/vm/interpreter.rs b/crates/wasmtime/src/runtime/vm/interpreter.rs
index 9836d0450742..fbe7a8cd3ea5 100644
--- a/crates/wasmtime/src/runtime/vm/interpreter.rs
+++ b/crates/wasmtime/src/runtime/vm/interpreter.rs
@@ -193,6 +193,10 @@ impl InterpreterRef<'_> {
         clippy::cast_sign_loss,
         reason = "macro-generated code"
     )]
+    #[cfg_attr(
+        not(feature = "component-model"),
+        expect(unused_macro_rules, reason = "macro-code")
+    )]
     unsafe fn call_indirect_host(&mut self, id: u8) {
         let id = u32::from(id);
         let fnptr = self.0[XReg::x0].get_ptr::();
diff --git a/crates/wast-util/src/lib.rs b/crates/wast-util/src/lib.rs
index 01be1a00c81a..07f2b8469257 100644
--- a/crates/wast-util/src/lib.rs
+++ b/crates/wast-util/src/lib.rs
@@ -402,9 +402,7 @@ impl WastTest {
         if config.compiler == Compiler::CraneliftPulley {
             let unsupported = [
                 "misc_testsuite/memory64/simd.wast",
-                "misc_testsuite/simd/almost-extmul.wast",
                 "misc_testsuite/simd/canonicalize-nan.wast",
-                "misc_testsuite/simd/cvt-from-uint.wast",
                 "misc_testsuite/simd/issue6725-no-egraph-panic.wast",
                 "misc_testsuite/simd/issue_3327_bnot_lowering.wast",
                 "misc_testsuite/simd/replace-lane-preserve.wast",
@@ -428,7 +426,6 @@
                 "spec_testsuite/proposals/memory64/i16x8_relaxed_q15mulr_s.wast",
                 "spec_testsuite/proposals/memory64/i32x4_relaxed_trunc.wast",
                 "spec_testsuite/proposals/memory64/i8x16_relaxed_swizzle.wast",
-                "spec_testsuite/simd_conversions.wast",
                 "spec_testsuite/simd_f32x4.wast",
                 "spec_testsuite/simd_f32x4_arith.wast",
                 "spec_testsuite/simd_f32x4_cmp.wast",
@@ -443,7 +440,6 @@
                 "spec_testsuite/simd_i16x8_arith2.wast",
                 "spec_testsuite/simd_i16x8_cmp.wast",
                 "spec_testsuite/simd_i16x8_extadd_pairwise_i8x16.wast",
-                "spec_testsuite/simd_i16x8_extmul_i8x16.wast",
                 "spec_testsuite/simd_i16x8_q15mulr_sat_s.wast",
                 "spec_testsuite/simd_i16x8_sat_arith.wast",
                 "spec_testsuite/simd_i32x4_arith.wast",
@@ -451,18 +447,15 @@
                 "spec_testsuite/simd_i32x4_cmp.wast",
                 "spec_testsuite/simd_i32x4_dot_i16x8.wast",
                 "spec_testsuite/simd_i32x4_extadd_pairwise_i16x8.wast",
-                "spec_testsuite/simd_i32x4_extmul_i16x8.wast",
                 "spec_testsuite/simd_i32x4_trunc_sat_f32x4.wast",
                 "spec_testsuite/simd_i32x4_trunc_sat_f64x2.wast",
                 "spec_testsuite/simd_i64x2_arith.wast",
                 "spec_testsuite/simd_i64x2_arith2.wast",
                 "spec_testsuite/simd_i64x2_cmp.wast",
-                "spec_testsuite/simd_i64x2_extmul_i32x4.wast",
                 "spec_testsuite/simd_i8x16_arith.wast",
                 "spec_testsuite/simd_i8x16_arith2.wast",
                 "spec_testsuite/simd_i8x16_cmp.wast",
                 "spec_testsuite/simd_i8x16_sat_arith.wast",
-                "spec_testsuite/simd_int_to_int_extend.wast",
                 "spec_testsuite/simd_lane.wast",
                 "spec_testsuite/simd_load.wast",
                 "spec_testsuite/simd_load16_lane.wast",
diff --git a/pulley/src/interp.rs b/pulley/src/interp.rs
index 694a47b65efe..34e6df6afd83 100644
--- a/pulley/src/interp.rs
+++ b/pulley/src/interp.rs
@@ -3057,4 +3057,244 @@ impl ExtendedOpVisitor for Interpreter<'_> {
         self.state[dst].set_i64(a.wrapping_abs());
         ControlFlow::Continue(())
     }
+
+    fn vf32x4_from_i32x4_s(&mut self, dst: VReg, src: VReg) -> ControlFlow<Done> {
+        let a = self.state[src].get_i32x4();
+        self.state[dst].set_f32x4(a.map(|i| i as f32));
+        ControlFlow::Continue(())
+    }
+
+    fn vf32x4_from_i32x4_u(&mut self, dst: VReg, src: VReg) -> ControlFlow<Done> {
+        let a = self.state[src].get_u32x4();
+        self.state[dst].set_f32x4(a.map(|i| i as f32));
+        ControlFlow::Continue(())
+    }
+
+    fn vf64x2_from_i64x2_s(&mut self, dst: VReg, src: VReg) -> ControlFlow<Done> {
+        let a = self.state[src].get_i64x2();
+        self.state[dst].set_f64x2(a.map(|i| i as f64));
+        ControlFlow::Continue(())
+    }
+
+    fn vf64x2_from_i64x2_u(&mut self, dst: VReg, src: VReg) -> ControlFlow<Done> {
+        let a = self.state[src].get_u64x2();
+        self.state[dst].set_f64x2(a.map(|i| i as f64));
+        ControlFlow::Continue(())
+    }
+
+    fn vwidenlow8x16_s(&mut self, dst: VReg, src: VReg) -> ControlFlow<Done> {
+        let a = *self.state[src].get_i8x16().first_chunk().unwrap();
+        self.state[dst].set_i16x8(a.map(|i| i.into()));
+        ControlFlow::Continue(())
+    }
+
+    fn vwidenlow8x16_u(&mut self, dst: VReg, src: VReg) -> ControlFlow<Done> {
+        let a = *self.state[src].get_u8x16().first_chunk().unwrap();
+        self.state[dst].set_u16x8(a.map(|i| i.into()));
+        ControlFlow::Continue(())
+    }
+
+    fn vwidenlow16x8_s(&mut self, dst: VReg, src: VReg) -> ControlFlow<Done> {
+        let a = *self.state[src].get_i16x8().first_chunk().unwrap();
+        self.state[dst].set_i32x4(a.map(|i| i.into()));
+        ControlFlow::Continue(())
+    }
+
+    fn vwidenlow16x8_u(&mut self, dst: VReg, src: VReg) -> ControlFlow<Done> {
+        let a = *self.state[src].get_u16x8().first_chunk().unwrap();
+        self.state[dst].set_u32x4(a.map(|i| i.into()));
+        ControlFlow::Continue(())
+    }
+
+    fn vwidenlow32x4_s(&mut self, dst: VReg, src: VReg) -> ControlFlow<Done> {
+        let a = *self.state[src].get_i32x4().first_chunk().unwrap();
+        self.state[dst].set_i64x2(a.map(|i| i.into()));
+        ControlFlow::Continue(())
+    }
+
+    fn vwidenlow32x4_u(&mut self, dst: VReg, src: VReg) -> ControlFlow<Done> {
+        let a = *self.state[src].get_u32x4().first_chunk().unwrap();
+        self.state[dst].set_u64x2(a.map(|i| i.into()));
+        ControlFlow::Continue(())
+    }
+
+    fn vwidenhigh8x16_s(&mut self, dst: VReg, src: VReg) -> ControlFlow<Done> {
+        let a = *self.state[src].get_i8x16().last_chunk().unwrap();
+        self.state[dst].set_i16x8(a.map(|i| i.into()));
+        ControlFlow::Continue(())
+    }
+
+    fn vwidenhigh8x16_u(&mut self, dst: VReg, src: VReg) -> ControlFlow<Done> {
+        let a = *self.state[src].get_u8x16().last_chunk().unwrap();
+        self.state[dst].set_u16x8(a.map(|i| i.into()));
+        ControlFlow::Continue(())
+    }
+
+    fn vwidenhigh16x8_s(&mut self, dst: VReg, src: VReg) -> ControlFlow<Done> {
+        let a = *self.state[src].get_i16x8().last_chunk().unwrap();
+        self.state[dst].set_i32x4(a.map(|i| i.into()));
+        ControlFlow::Continue(())
+    }
+
+    fn vwidenhigh16x8_u(&mut self, dst: VReg, src: VReg) -> ControlFlow<Done> {
+        let a = *self.state[src].get_u16x8().last_chunk().unwrap();
+        self.state[dst].set_u32x4(a.map(|i| i.into()));
+        ControlFlow::Continue(())
+    }
+
+    fn vwidenhigh32x4_s(&mut self, dst: VReg, src: VReg) -> ControlFlow<Done> {
+        let a = *self.state[src].get_i32x4().last_chunk().unwrap();
+        self.state[dst].set_i64x2(a.map(|i| i.into()));
+        ControlFlow::Continue(())
+    }
+
+    fn vwidenhigh32x4_u(&mut self, dst: VReg, src: VReg) -> ControlFlow<Done> {
+        let a = *self.state[src].get_u32x4().last_chunk().unwrap();
+        self.state[dst].set_u64x2(a.map(|i| i.into()));
+        ControlFlow::Continue(())
+    }
+
+    fn vnarrow16x8_s(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let a = self.state[operands.src1].get_i16x8();
+        let b = self.state[operands.src2].get_i16x8();
+        let mut result = [0; 16];
+        for (i, d) in a.iter().chain(&b).zip(&mut result) {
+            *d = (*i)
+                .try_into()
+                .unwrap_or(if *i < 0 { i8::MIN } else { i8::MAX });
+        }
+        self.state[operands.dst].set_i8x16(result);
+        ControlFlow::Continue(())
+    }
+
+    fn vnarrow16x8_u(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let a = self.state[operands.src1].get_i16x8();
+        let b = self.state[operands.src2].get_i16x8();
+        let mut result = [0; 16];
+        for (i, d) in a.iter().chain(&b).zip(&mut result) {
+            *d = (*i)
+                .try_into()
+                .unwrap_or(if *i < 0 { u8::MIN } else { u8::MAX });
+        }
+        self.state[operands.dst].set_u8x16(result);
+        ControlFlow::Continue(())
+    }
+
+    fn vnarrow32x4_s(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let a = self.state[operands.src1].get_i32x4();
+        let b = self.state[operands.src2].get_i32x4();
+        let mut result = [0; 8];
+        for (i, d) in a.iter().chain(&b).zip(&mut result) {
+            *d = (*i)
+                .try_into()
+                .unwrap_or(if *i < 0 { i16::MIN } else { i16::MAX });
+        }
+        self.state[operands.dst].set_i16x8(result);
+        ControlFlow::Continue(())
+    }
+
+    fn vnarrow32x4_u(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let a = self.state[operands.src1].get_i32x4();
+        let b = self.state[operands.src2].get_i32x4();
+        let mut result = [0; 8];
+        for (i, d) in a.iter().chain(&b).zip(&mut result) {
+            *d = (*i)
+                .try_into()
+                .unwrap_or(if *i < 0 { u16::MIN } else { u16::MAX });
+        }
+        self.state[operands.dst].set_u16x8(result);
+        ControlFlow::Continue(())
+    }
+
+    fn vfpromotelow(&mut self, dst: VReg, src: VReg) -> ControlFlow<Done> {
+        let a = self.state[src].get_f32x4();
+        self.state[dst].set_f64x2([a[0].into(), a[1].into()]);
+        ControlFlow::Continue(())
+    }
+
+    fn vfdemote(&mut self, dst: VReg, src: VReg) -> ControlFlow<Done> {
+        let a = self.state[src].get_f64x2();
+        self.state[dst].set_f32x4([a[0] as f32, a[1] as f32, 0.0, 0.0]);
+        ControlFlow::Continue(())
+    }
+
+    fn vsubi8x16(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let mut a = self.state[operands.src1].get_i8x16();
+        let b = self.state[operands.src2].get_i8x16();
+        for (a, b) in a.iter_mut().zip(b) {
+            *a = a.wrapping_sub(b);
+        }
+        self.state[operands.dst].set_i8x16(a);
+        ControlFlow::Continue(())
+    }
+
+    fn vsubi16x8(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let mut a = self.state[operands.src1].get_i16x8();
+        let b = self.state[operands.src2].get_i16x8();
+        for (a, b) in a.iter_mut().zip(b) {
+            *a = a.wrapping_sub(b);
+        }
+        self.state[operands.dst].set_i16x8(a);
+        ControlFlow::Continue(())
+    }
+
+    fn vsubi32x4(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let mut a = self.state[operands.src1].get_i32x4();
+        let b = self.state[operands.src2].get_i32x4();
+        for (a, b) in a.iter_mut().zip(b) {
+            *a = a.wrapping_sub(b);
+        }
+        self.state[operands.dst].set_i32x4(a);
+        ControlFlow::Continue(())
+    }
+
+    fn vsubi64x2(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let mut a = self.state[operands.src1].get_i64x2();
+        let b = self.state[operands.src2].get_i64x2();
+        for (a, b) in a.iter_mut().zip(b) {
+            *a = a.wrapping_sub(b);
+        }
+        self.state[operands.dst].set_i64x2(a);
+        ControlFlow::Continue(())
+    }
+
+    fn vmuli8x16(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let mut a = self.state[operands.src1].get_i8x16();
+        let b = self.state[operands.src2].get_i8x16();
+        for (a, b) in a.iter_mut().zip(b) {
+            *a = a.wrapping_mul(b);
+        }
+        self.state[operands.dst].set_i8x16(a);
+        ControlFlow::Continue(())
+    }
+
+    fn vmuli16x8(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let mut a = self.state[operands.src1].get_i16x8();
+        let b = self.state[operands.src2].get_i16x8();
+        for (a, b) in a.iter_mut().zip(b) {
+            *a = a.wrapping_mul(b);
+        }
+        self.state[operands.dst].set_i16x8(a);
+        ControlFlow::Continue(())
+    }
+
+    fn vmuli32x4(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let mut a = self.state[operands.src1].get_i32x4();
+        let b = self.state[operands.src2].get_i32x4();
+        for (a, b) in a.iter_mut().zip(b) {
+            *a = a.wrapping_mul(b);
+        }
+        self.state[operands.dst].set_i32x4(a);
+        ControlFlow::Continue(())
+    }
+
+    fn vmuli64x2(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let mut a = self.state[operands.src1].get_i64x2();
+        let b = self.state[operands.src2].get_i64x2();
+        for (a, b) in a.iter_mut().zip(b) {
+            *a = a.wrapping_mul(b);
+        }
+        self.state[operands.dst].set_i64x2(a);
+        ControlFlow::Continue(())
+    }
 }
diff --git a/pulley/src/lib.rs b/pulley/src/lib.rs
index baa4c813ca26..a911acd6cbdb 100644
--- a/pulley/src/lib.rs
+++ b/pulley/src/lib.rs
@@ -761,6 +761,88 @@ macro_rules! for_each_extended_op {
             xabs32 = XAbs32 { dst: XReg, src: XReg };
             /// `dst = |src|`
             xabs64 = XAbs64 { dst: XReg, src: XReg };
+
+            /// Converts the four signed `i32` lanes of the input to `f32`.
+            vf32x4_from_i32x4_s = VF32x4FromI32x4S { dst: VReg, src: VReg };
+            /// Converts the four unsigned `i32` lanes of the input to `f32`.
+            vf32x4_from_i32x4_u = VF32x4FromI32x4U { dst: VReg, src: VReg };
+            /// Converts the two signed `i64` lanes of the input to `f64`.
+            vf64x2_from_i64x2_s = VF64x2FromI64x2S { dst: VReg, src: VReg };
+            /// Converts the two unsigned `i64` lanes of the input to `f64`.
+            vf64x2_from_i64x2_u = VF64x2FromI64x2U { dst: VReg, src: VReg };
+
+            /// Widens the low lanes of the input vector, as signed, to twice
+            /// the width.
+            vwidenlow8x16_s = VWidenLow8x16S { dst: VReg, src: VReg };
+            /// Widens the low lanes of the input vector, as unsigned, to twice
+            /// the width.
+            vwidenlow8x16_u = VWidenLow8x16U { dst: VReg, src: VReg };
+            /// Widens the low lanes of the input vector, as signed, to twice
+            /// the width.
+            vwidenlow16x8_s = VWidenLow16x8S { dst: VReg, src: VReg };
+            /// Widens the low lanes of the input vector, as unsigned, to twice
+            /// the width.
+            vwidenlow16x8_u = VWidenLow16x8U { dst: VReg, src: VReg };
+            /// Widens the low lanes of the input vector, as signed, to twice
+            /// the width.
+            vwidenlow32x4_s = VWidenLow32x4S { dst: VReg, src: VReg };
+            /// Widens the low lanes of the input vector, as unsigned, to twice
+            /// the width.
+            vwidenlow32x4_u = VWidenLow32x4U { dst: VReg, src: VReg };
+            /// Widens the high lanes of the input vector, as signed, to twice
+            /// the width.
+            vwidenhigh8x16_s = VWidenHigh8x16S { dst: VReg, src: VReg };
+            /// Widens the high lanes of the input vector, as unsigned, to twice
+            /// the width.
+            vwidenhigh8x16_u = VWidenHigh8x16U { dst: VReg, src: VReg };
+            /// Widens the high lanes of the input vector, as signed, to twice
+            /// the width.
+            vwidenhigh16x8_s = VWidenHigh16x8S { dst: VReg, src: VReg };
+            /// Widens the high lanes of the input vector, as unsigned, to twice
+            /// the width.
+            vwidenhigh16x8_u = VWidenHigh16x8U { dst: VReg, src: VReg };
+            /// Widens the high lanes of the input vector, as signed, to twice
+            /// the width.
+            vwidenhigh32x4_s = VWidenHigh32x4S { dst: VReg, src: VReg };
+            /// Widens the high lanes of the input vector, as unsigned, to twice
+            /// the width.
+            vwidenhigh32x4_u = VWidenHigh32x4U { dst: VReg, src: VReg };
+
+            /// Narrows the two 16x8 vectors, assuming all input lanes are
+            /// signed, to half the width. Narrowing is signed and saturating.
+            vnarrow16x8_s = Vnarrow16x8S { operands: BinaryOperands<VReg> };
+            /// Narrows the two 16x8 vectors, assuming all input lanes are
+            /// signed, to half the width. Narrowing is unsigned and saturating.
+            vnarrow16x8_u = Vnarrow16x8U { operands: BinaryOperands<VReg> };
+            /// Narrows the two 32x4 vectors, assuming all input lanes are
+            /// signed, to half the width. Narrowing is signed and saturating.
+            vnarrow32x4_s = Vnarrow32x4S { operands: BinaryOperands<VReg> };
+            /// Narrows the two 32x4 vectors, assuming all input lanes are
+            /// signed, to half the width. Narrowing is unsigned and saturating.
+            vnarrow32x4_u = Vnarrow32x4U { operands: BinaryOperands<VReg> };
+            /// Promotes the low two lanes of the f32x4 input to f64x2.
+            vfpromotelow = VFpromoteLow { dst: VReg, src: VReg };
+            /// Demotes the two `f64` lanes of the input to `f32`, placing them
+            /// in the low lanes and zeroing the two high lanes.
+            vfdemote = VFdemote { dst: VReg, src: VReg };
+
+            /// `dst = src1 - src2`
+            vsubi8x16 = VSubI8x16 { operands: BinaryOperands<VReg> };
+            /// `dst = src1 - src2`
+            vsubi16x8 = VSubI16x8 { operands: BinaryOperands<VReg> };
+            /// `dst = src1 - src2`
+            vsubi32x4 = VSubI32x4 { operands: BinaryOperands<VReg> };
+            /// `dst = src1 - src2`
+            vsubi64x2 = VSubI64x2 { operands: BinaryOperands<VReg> };
+
+            /// `dst = src1 * src2`
+            vmuli8x16 = VMulI8x16 { operands: BinaryOperands<VReg> };
+            /// `dst = src1 * src2`
+            vmuli16x8 = VMulI16x8 { operands: BinaryOperands<VReg> };
+            /// `dst = src1 * src2`
+            vmuli32x4 = VMulI32x4 { operands: BinaryOperands<VReg> };
+            /// `dst = src1 * src2`
+            vmuli64x2 = VMulI64x2 { operands: BinaryOperands<VReg> };
         }
     };
 }
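
Note on the int-to-float conversion opcodes: the interpreter's lane conversions
use Rust's `as` casts, which round to nearest with ties to even, matching the
rounding WebAssembly requires of `f32x4.convert_i32x4_{s,u}`. A minimal
standalone check of that behavior (this snippet is illustrative only and not
part of the patch):

    // Lane conversion as done by `vf32x4_from_i32x4_u` above: `u32 as f32`
    // rounds to the nearest representable float, ties to even.
    fn main() {
        let lanes: [u32; 4] = [0, 1, u32::MAX, 16_777_217];
        let out = lanes.map(|i| i as f32);
        assert_eq!(out[2], 4_294_967_296.0); // u32::MAX rounds up to 2^32
        assert_eq!(out[3], 16_777_216.0); // 2^24 + 1 rounds down to 2^24
    }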
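Note on the widen opcodes: `first_chunk`/`last_chunk` (stable since Rust 1.77)
select the low or high half of the input lanes, and each lane is then extended
losslessly via `From`. A standalone sketch of the same selection, with
hypothetical free-function names that are not part of the patch:

    // Widen-low/high over plain arrays; sign extension via `i8 -> i16` `From`.
    fn widen_low8x16_s(src: [i8; 16]) -> [i16; 8] {
        let low: [i8; 8] = *src.first_chunk().unwrap();
        low.map(i16::from)
    }

    fn widen_high8x16_s(src: [i8; 16]) -> [i16; 8] {
        let high: [i8; 8] = *src.last_chunk().unwrap();
        high.map(i16::from)
    }

    fn main() {
        let mut v = [0i8; 16];
        v[0] = -1;
        v[15] = -128;
        assert_eq!(widen_low8x16_s(v)[0], -1i16);
        assert_eq!(widen_high8x16_s(v)[7], -128i16);
    }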
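Note on the narrow opcodes: lanes of `src1` fill the low half of the result and
lanes of `src2` the high half, with each lane saturated into the destination
range. A standalone sketch of the saturation logic in `vnarrow16x8_s`, again
with illustrative names that are not part of the patch:

    // Saturating narrow of two i16x8 inputs into one i8x16 result.
    fn narrow16x8_s(a: [i16; 8], b: [i16; 8]) -> [i8; 16] {
        let mut out = [0i8; 16];
        // `try_into` fails exactly when a lane is out of the i8 range, in
        // which case we saturate toward the sign of the input lane.
        for (lane, d) in a.iter().chain(&b).zip(&mut out) {
            *d = (*lane)
                .try_into()
                .unwrap_or(if *lane < 0 { i8::MIN } else { i8::MAX });
        }
        out
    }

    fn main() {
        let r = narrow16x8_s([300, -300, 7, 0, 0, 0, 0, 0], [0; 8]);
        assert_eq!(r[..3], [i8::MAX, i8::MIN, 7]);
    }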