diff --git a/cranelift/codegen/meta/src/pulley.rs b/cranelift/codegen/meta/src/pulley.rs
index fa22191d1bba..2c95b6e5366b 100644
--- a/cranelift/codegen/meta/src/pulley.rs
+++ b/cranelift/codegen/meta/src/pulley.rs
@@ -27,10 +27,23 @@ const OPS: &[Inst<'_>] = pulley_interpreter::for_each_op!(define);
 const EXTENDED_OPS: &[Inst<'_>] = pulley_interpreter::for_each_extended_op!(define);
 
 enum Operand<'a> {
-    Normal { name: &'a str, ty: &'a str },
-    Writable { name: &'a str, ty: &'a str },
-    TrapCode { name: &'a str, ty: &'a str },
-    Binop { reg: &'a str },
+    Normal {
+        name: &'a str,
+        ty: &'a str,
+    },
+    Writable {
+        name: &'a str,
+        ty: &'a str,
+    },
+    TrapCode {
+        name: &'a str,
+        ty: &'a str,
+    },
+    Binop {
+        dst: &'a str,
+        src1: &'a str,
+        src2: &'a str,
+    },
 }
 
 impl Inst<'_> {
@@ -38,8 +51,23 @@ impl Inst<'_> {
         self.fields
             .iter()
             .map(|(name, ty)| match (*name, *ty) {
-                ("operands", "BinaryOperands < XReg >") => Operand::Binop { reg: "XReg" },
-                ("operands", "BinaryOperands < FReg >") => Operand::Binop { reg: "FReg" },
+                ("operands", binop) => {
+                    // Parse `BinaryOperands < A >` as A/A/A
+                    // Parse `BinaryOperands < A, B >` as A/B/A
+                    // Parse `BinaryOperands < A, B, C >` as A/B/C
+                    let mut parts = binop
+                        .strip_prefix("BinaryOperands <")
+                        .unwrap()
+                        .strip_suffix(">")
+                        .unwrap()
+                        .trim()
+                        .split(',')
+                        .map(|x| x.trim());
+                    let dst = parts.next().unwrap();
+                    let src1 = parts.next().unwrap_or(dst);
+                    let src2 = parts.next().unwrap_or(dst);
+                    Operand::Binop { dst, src1, src2 }
+                }
                 ("dst", ty) => Operand::Writable { name, ty },
                 (name, ty) => Operand::Normal { name, ty },
             })
@@ -109,7 +137,7 @@ pub fn generate_rust(filename: &str, out_dir: &Path) -> Result<(), Error> {
                     pat.push_str(",");
                     format_string.push_str(&format!(" // trap={{{name}:?}}"));
                 }
-                Operand::Binop { reg: _ } => {
+                Operand::Binop { .. } => {
                     pat.push_str("dst, src1, src2,");
                     format_string.push_str(" {dst}, {src1}, {src2}");
                     locals.push_str(&format!("let dst = reg_name(*dst.to_reg());\n"));
@@ -161,7 +189,7 @@ pub fn generate_rust(filename: &str, out_dir: &Path) -> Result<(), Error> {
                     }
                 }
                 Operand::TrapCode { .. } => {}
-                Operand::Binop { reg: _ } => {
+                Operand::Binop { .. } => {
                     pat.push_str("dst, src1, src2,");
                     uses.push("src1");
                     uses.push("src2");
@@ -221,7 +249,7 @@ pub fn generate_rust(filename: &str, out_dir: &Path) -> Result<(), Error> {
                     pat.push_str(",");
                     trap.push_str(&format!("sink.add_trap({name});\n"));
                 }
-                Operand::Binop { reg: _ } => {
+                Operand::Binop { .. } => {
                     pat.push_str("dst, src1, src2,");
                     args.push_str(
                         "pulley_interpreter::regs::BinaryOperands::new(dst, src1, src2),",
@@ -265,10 +293,10 @@ pub fn generate_isle(filename: &str, out_dir: &Path) -> Result<(), Error> {
                 Operand::Writable { name, ty } => {
                     isle.push_str(&format!("\n    ({name} Writable{ty})"));
                 }
-                Operand::Binop { reg } => {
-                    isle.push_str(&format!("\n    (dst Writable{reg})"));
-                    isle.push_str(&format!("\n    (src1 {reg})"));
-                    isle.push_str(&format!("\n    (src2 {reg})"));
+                Operand::Binop { dst, src1, src2 } => {
+                    isle.push_str(&format!("\n    (dst Writable{dst})"));
+                    isle.push_str(&format!("\n    (src1 {src1})"));
+                    isle.push_str(&format!("\n    (src2 {src2})"));
                 }
             }
         }
@@ -303,13 +331,13 @@ pub fn generate_isle(filename: &str, out_dir: &Path) -> Result<(), Error> {
                     assert!(result.is_none(), "{} has >1 result", inst.snake_name);
                     result = Some(ty);
                 }
-                Operand::Binop { reg } => {
-                    isle.push_str(&format!("{reg} {reg}"));
+                Operand::Binop { dst, src1, src2 } => {
+                    isle.push_str(&format!("{src1} {src2}"));
                     rule.push_str("src1 src2");
                     ops.push("src1");
                     ops.push("src2");
                     assert!(result.is_none(), "{} has >1 result", inst.snake_name);
-                    result = Some(reg);
+                    result = Some(dst);
                 }
             }
             isle.push_str(" ");
diff --git a/cranelift/codegen/src/isa/pulley_shared/abi.rs b/cranelift/codegen/src/isa/pulley_shared/abi.rs
index e2c9317d1d93..292e8b680ac3 100644
--- a/cranelift/codegen/src/isa/pulley_shared/abi.rs
+++ b/cranelift/codegen/src/isa/pulley_shared/abi.rs
@@ -510,17 +510,18 @@ where
         _target_vector_bytes: u32,
         _isa_flags: &PulleyFlags,
     ) -> u32 {
+        // Spill slots are the size of a "word" or a pointer, but Pulley
+        // registers are 8-byte for integers/floats regardless of pointer size.
+        // Calculate the number of slots necessary to store 8 bytes.
+        let slots_for_8bytes = match P::pointer_width() {
+            PointerWidth::PointerWidth32 => 2,
+            PointerWidth::PointerWidth64 => 1,
+        };
         match rc {
-            // Spilling an integer or float register requires spilling 8 bytes,
-            // and spill slots are defined in terms of "word bytes" or the size
-            // of a pointer. That means on 32-bit pulley we need to take up two
-            // spill slots where on 64-bit pulley we need to only take up one
-            // spill slot for integers.
-            RegClass::Int | RegClass::Float => match P::pointer_width() {
-                PointerWidth::PointerWidth32 => 2,
-                PointerWidth::PointerWidth64 => 1,
-            },
-            RegClass::Vector => unreachable!(),
+            // Int/float registers are 8-bytes
+            RegClass::Int | RegClass::Float => slots_for_8bytes,
+            // Vector registers are 16 bytes
+            RegClass::Vector => 2 * slots_for_8bytes,
         }
     }
 
diff --git a/cranelift/codegen/src/isa/pulley_shared/inst.isle b/cranelift/codegen/src/isa/pulley_shared/inst.isle
index 015b547fb96f..384912269c71 100644
--- a/cranelift/codegen/src/isa/pulley_shared/inst.isle
+++ b/cranelift/codegen/src/isa/pulley_shared/inst.isle
@@ -414,6 +414,16 @@
 (rule (pulley_fstore amode src ty flags)
       (SideEffectNoResult.Inst (MInst.FStore amode src ty flags)))
 
+(decl pulley_vload (Amode Type MemFlags) VReg)
+(rule (pulley_vload amode ty flags)
+      (let ((dst WritableVReg (temp_writable_vreg))
+            (_ Unit (emit (MInst.VLoad dst amode ty flags))))
+        dst))
+
+(decl pulley_vstore (Amode VReg Type MemFlags) SideEffectNoResult)
+(rule (pulley_vstore amode src ty flags)
+      (SideEffectNoResult.Inst (MInst.VStore amode src ty flags)))
+
 (decl gen_br_table (XReg MachLabel BoxVecMachLabel) Unit)
 (rule (gen_br_table idx default labels)
       (emit (MInst.BrTable idx default labels)))
diff --git a/cranelift/codegen/src/isa/pulley_shared/inst/mod.rs b/cranelift/codegen/src/isa/pulley_shared/inst/mod.rs
index e2560639d1f0..11aac8e7c304 100644
--- a/cranelift/codegen/src/isa/pulley_shared/inst/mod.rs
+++ b/cranelift/codegen/src/isa/pulley_shared/inst/mod.rs
@@ -453,18 +453,8 @@ where
     }
 
     fn worst_case_size() -> CodeOffset {
-        // `BrIfXeq32 { a, b, taken, not_taken }` expands to `br_if_xeq32 a, b, taken; jump not_taken`.
-        //
-        // The first instruction is seven bytes long:
-        //   * 1 byte opcode
-        //   * 1 byte `a` register encoding
-        //   * 1 byte `b` register encoding
-        //   * 4 byte `taken` displacement
-        //
-        // And the second instruction is five bytes long:
-        //   * 1 byte opcode
-        //   * 4 byte `not_taken` displacement
-        12
+        // `Vconst128 { dst, imm }` is 18 bytes (opcode + dst + 16-byte imm)
+        18
     }
 
     fn ref_type_regclass(_settings: &settings::Flags) -> RegClass {
diff --git a/cranelift/codegen/src/isa/pulley_shared/lower.isle b/cranelift/codegen/src/isa/pulley_shared/lower.isle
index e1df6602706b..a97d26bc7589 100644
--- a/cranelift/codegen/src/isa/pulley_shared/lower.isle
+++ b/cranelift/codegen/src/isa/pulley_shared/lower.isle
@@ -143,6 +143,11 @@
 (rule (lower (has_type $I64 (iadd a b)))
       (pulley_xadd64 a b))
 
+(rule (lower (has_type $I8X16 (iadd a b))) (pulley_vaddi8x16 a b))
+(rule (lower (has_type $I16X8 (iadd a b))) (pulley_vaddi16x8 a b))
+(rule (lower (has_type $I32X4 (iadd a b))) (pulley_vaddi32x4 a b))
+(rule (lower (has_type $I64X2 (iadd a b))) (pulley_vaddi64x2 a b))
+
 ;;;; Rules for `isub` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (rule (lower (has_type $I8 (isub a b)))
@@ -192,6 +197,11 @@
 (rule (lower (has_type $I64 (ishl a b)))
   (pulley_xshl64 a b))
 
+(rule (lower (has_type $I8X16 (ishl a b))) (pulley_vshli8x16 a b))
+(rule (lower (has_type $I16X8 (ishl a b))) (pulley_vshli16x8 a b))
+(rule (lower (has_type $I32X4 (ishl a b))) (pulley_vshli32x4 a b))
+(rule (lower (has_type $I64X2 (ishl a b))) (pulley_vshli64x2 a b))
+
 ;;;; Rules for `ushr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (rule (lower (has_type $I32 (ushr a b)))
@@ -200,6 +210,11 @@
 (rule (lower (has_type $I64 (ushr a b)))
   (pulley_xshr64_u a b))
 
+(rule (lower (has_type $I8X16 (ushr a b))) (pulley_vshri8x16_u a b))
+(rule (lower (has_type $I16X8 (ushr a b))) (pulley_vshri16x8_u a b))
+(rule (lower (has_type $I32X4 (ushr a b))) (pulley_vshri32x4_u a b))
+(rule (lower (has_type $I64X2 (ushr a b))) (pulley_vshri64x2_u a b))
+
 ;;;; Rules for `sshr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (rule (lower (has_type $I32 (sshr a b)))
@@ -208,6 +223,11 @@
 (rule (lower (has_type $I64 (sshr a b)))
   (pulley_xshr64_s a b))
 
+(rule (lower (has_type $I8X16 (sshr a b))) (pulley_vshri8x16_s a b))
+(rule (lower (has_type $I16X8 (sshr a b))) (pulley_vshri16x8_s a b))
+(rule (lower (has_type $I32X4 (sshr a b))) (pulley_vshri32x4_s a b))
+(rule (lower (has_type $I64X2 (sshr a b))) (pulley_vshri64x2_s a b))
+
 ;;;; Rules for `band` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (rule 0 (lower (has_type (fits_in_32 _) (band a b)))
@@ -414,6 +434,9 @@
 (rule 1 (lower (has_type $I64 (sload32 flags addr offset)))
   (pulley_xload (amode addr offset) $I32 flags (ExtKind.Sign64)))
 
+(rule 2 (lower (has_type (ty_vec128 ty) (load flags addr offset)))
+  (pulley_vload (amode addr offset) ty flags))
+
 ;;;; Rules for `store` and friends ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (rule (lower (store flags src @ (value_type (ty_int ty)) addr offset))
@@ -431,6 +454,9 @@
 (rule (lower (istore32 flags src addr offset))
   (side_effect (pulley_xstore (amode addr offset) src $I32 flags)))
 
+(rule 2 (lower (store flags src @ (value_type (ty_vec128 ty)) addr offset))
+  (side_effect (pulley_vstore (amode addr offset) src ty flags)))
+
 ;;;; Rules for `stack_addr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (rule (lower (stack_addr stack_slot offset))
@@ -622,6 +648,8 @@
 
 (rule (lower (has_type $F32 (fadd a b))) (pulley_fadd32 a b))
 (rule (lower (has_type $F64 (fadd a b))) (pulley_fadd64 a b))
+(rule (lower (has_type $F32X4 (fadd a b))) (pulley_vaddf32x4 a b))
+(rule (lower (has_type $F64X2 (fadd a b))) (pulley_vaddf64x2 a b))
 
 ;;;; Rules for `fsub` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
@@ -687,3 +715,7 @@
 
 (rule (lower (has_type $F32 (fabs a))) (pulley_fabs32 a))
 (rule (lower (has_type $F64 (fabs a))) (pulley_fabs64 a))
+
+;;;; Rules for `vconst` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(rule (lower (has_type (ty_vec128 _) (vconst (u128_from_constant a)))) (pulley_vconst128 a))
diff --git a/crates/wast-util/src/lib.rs b/crates/wast-util/src/lib.rs
index 00551c46f29e..7c99c6eaaa83 100644
--- a/crates/wast-util/src/lib.rs
+++ b/crates/wast-util/src/lib.rs
@@ -403,10 +403,8 @@ impl WastTest {
                 "misc_testsuite/simd/almost-extmul.wast",
                 "misc_testsuite/simd/canonicalize-nan.wast",
                 "misc_testsuite/simd/cvt-from-uint.wast",
-                "misc_testsuite/simd/interesting-float-splat.wast",
                 "misc_testsuite/simd/issue4807.wast",
                 "misc_testsuite/simd/issue6725-no-egraph-panic.wast",
-                "misc_testsuite/simd/issue_3173_select_v128.wast",
                 "misc_testsuite/simd/issue_3327_bnot_lowering.wast",
                 "misc_testsuite/simd/load_splat_out_of_bounds.wast",
                 "misc_testsuite/simd/replace-lane-preserve.wast",
@@ -418,11 +416,6 @@ impl WastTest {
                 "misc_testsuite/threads/MP_wait.wast",
                 "misc_testsuite/threads/SB_atomic.wast",
                 "misc_testsuite/threads/load-store-alignment.wast",
-                "misc_testsuite/winch/_simd_address.wast",
-                "misc_testsuite/winch/_simd_const.wast",
-                "misc_testsuite/winch/_simd_load.wast",
-                "misc_testsuite/winch/_simd_multivalue.wast",
-                "misc_testsuite/winch/_simd_store.wast",
                 "spec_testsuite/proposals/annotations/simd_lane.wast",
                 "spec_testsuite/proposals/multi-memory/simd_memory-multi.wast",
                 "spec_testsuite/proposals/relaxed-simd/i16x8_relaxed_q15mulr_s.wast",
@@ -433,7 +426,6 @@ impl WastTest {
                 "spec_testsuite/proposals/relaxed-simd/relaxed_madd_nmadd.wast",
                 "spec_testsuite/proposals/relaxed-simd/relaxed_min_max.wast",
                 "spec_testsuite/proposals/threads/atomic.wast",
-                "spec_testsuite/simd_address.wast",
                 "spec_testsuite/simd_align.wast",
                 "spec_testsuite/simd_bit_shift.wast",
                 "spec_testsuite/simd_bitwise.wast",
@@ -484,7 +476,6 @@ impl WastTest {
                 "spec_testsuite/simd_load_splat.wast",
                 "spec_testsuite/simd_load_zero.wast",
                 "spec_testsuite/simd_splat.wast",
-                "spec_testsuite/simd_store.wast",
                 "spec_testsuite/simd_store16_lane.wast",
                 "spec_testsuite/simd_store32_lane.wast",
                 "spec_testsuite/simd_store64_lane.wast",
diff --git a/pulley/src/decode.rs b/pulley/src/decode.rs
index bcd57017283d..d11fbe482d85 100644
--- a/pulley/src/decode.rs
+++ b/pulley/src/decode.rs
@@ -303,6 +303,15 @@ impl Decode for u64 {
     }
 }
 
+impl Decode for u128 {
+    fn decode<T>(bytecode: &mut T) -> Result<Self, T::Error>
+    where
+        T: BytecodeStream,
+    {
+        Ok(u128::from_le_bytes(bytecode.read()?))
+    }
+}
+
 impl Decode for i8 {
     fn decode<T>(bytecode: &mut T) -> Result<Self, T::Error>
     where
@@ -339,6 +348,15 @@ impl Decode for i64 {
     }
 }
 
+impl Decode for i128 {
+    fn decode<T>(bytecode: &mut T) -> Result<Self, T::Error>
+    where
+        T: BytecodeStream,
+    {
+        Ok(i128::from_le_bytes(bytecode.read()?))
+    }
+}
+
 impl Decode for XReg {
     fn decode<T>(bytecode: &mut T) -> Result<Self, T::Error>
     where
@@ -404,7 +422,7 @@ impl Decode for ExtendedOpcode {
     }
 }
 
-impl<R: Reg> Decode for BinaryOperands<R> {
+impl<D: Reg, S1: Reg, S2: Reg> Decode for BinaryOperands<D, S1, S2> {
     fn decode<T>(bytecode: &mut T) -> Result<Self, T::Error>
     where
         T: BytecodeStream,
diff --git a/pulley/src/disas.rs b/pulley/src/disas.rs
index 301bf2c345d6..fedff6ea14be 100644
--- a/pulley/src/disas.rs
+++ b/pulley/src/disas.rs
@@ -149,6 +149,12 @@ impl Disas for i64 {
     }
 }
 
+impl Disas for i128 {
+    fn disas(&self, _position: usize, disas: &mut String) {
+        write!(disas, "{self}").unwrap();
+    }
+}
+
 impl Disas for u8 {
     fn disas(&self, _position: usize, disas: &mut String) {
         write!(disas, "{self}").unwrap();
@@ -173,6 +179,12 @@ impl Disas for u64 {
     }
 }
 
+impl Disas for u128 {
+    fn disas(&self, _position: usize, disas: &mut String) {
+        write!(disas, "{self}").unwrap();
+    }
+}
+
 impl Disas for PcRelOffset {
     fn disas(&self, position: usize, disas: &mut String) {
         let offset = isize::try_from(i32::from(*self)).unwrap();
@@ -192,9 +204,18 @@ fn disas_list<T: Disas>(position: usize, disas: &mut String, iter: impl IntoIter
     }
 }
 
-impl<R: Reg + Disas> Disas for BinaryOperands<R> {
+impl<D, S1, S2> Disas for BinaryOperands<D, S1, S2>
+where
+    D: Reg + Disas,
+    S1: Reg + Disas,
+    S2: Reg + Disas,
+{
     fn disas(&self, position: usize, disas: &mut String) {
-        disas_list(position, disas, [self.dst, self.src1, self.src2])
+        self.dst.disas(position, disas);
+        write!(disas, ", ").unwrap();
+        self.src1.disas(position, disas);
+        write!(disas, ", ").unwrap();
+        self.src2.disas(position, disas);
     }
 }
 
diff --git a/pulley/src/encode.rs b/pulley/src/encode.rs
index 1891b158a7af..c1d7d2dab610 100644
--- a/pulley/src/encode.rs
+++ b/pulley/src/encode.rs
@@ -59,6 +59,17 @@ impl Encode for u64 {
     }
 }
 
+impl Encode for u128 {
+    const WIDTH: u8 = 16;
+
+    fn encode<E>(&self, sink: &mut E)
+    where
+        E: Extend<u8>,
+    {
+        sink.extend(self.to_le_bytes());
+    }
+}
+
 impl Encode for i8 {
     const WIDTH: u8 = 1;
 
@@ -103,6 +114,17 @@ impl Encode for i64 {
     }
 }
 
+impl Encode for i128 {
+    const WIDTH: u8 = 16;
+
+    fn encode<E>(&self, sink: &mut E)
+    where
+        E: Extend<u8>,
+    {
+        sink.extend(self.to_le_bytes());
+    }
+}
+
 impl Encode for XReg {
     const WIDTH: u8 = 1;
 
@@ -147,7 +169,7 @@ impl Encode for PcRelOffset {
     }
 }
 
-impl<R: Reg> Encode for BinaryOperands<R> {
+impl<D: Reg, S1: Reg, S2: Reg> Encode for BinaryOperands<D, S1, S2> {
     const WIDTH: u8 = 2;
 
     fn encode<E>(&self, sink: &mut E)
diff --git a/pulley/src/interp.rs b/pulley/src/interp.rs
index 4800015ddfc2..2579790cda43 100644
--- a/pulley/src/interp.rs
+++ b/pulley/src/interp.rs
@@ -540,11 +540,30 @@ impl fmt::LowerHex for VRegVal {
     }
 }
 
+/// 128-bit vector registers.
+///
+/// This register is always stored in little-endian order and has different
+/// constraints than `XRegVal` and `FRegVal` above. All fields are 16 bytes
+/// wide, so all bits are always defined, and `repr(C)` pins each field to
+/// offset 0 so that bitcasts between different vector shapes work, which
+/// also requires little-endian. This union cannot be stored in big-endian.
 #[derive(Copy, Clone)]
+#[repr(C, align(16))]
 union VRegUnion {
-    // TODO: need to figure out how we are going to handle portability of lane
-    // ordering on top of each lane's endianness.
     u128: u128,
+    i8x16: [i8; 16],
+    i16x8: [i16; 8],
+    i32x4: [i32; 4],
+    i64x2: [i64; 2],
+    u8x16: [u8; 16],
+    u16x8: [u16; 8],
+    u32x4: [u32; 4],
+    u64x2: [u64; 2],
+    // Note that these are `u32` and `u64`, not f32/f64. That's only because
+    // f32/f64 don't have `.to_le()` and `::from_le()` so need to go through the
+    // bits anyway.
+    f32x4: [u32; 4],
+    f64x2: [u64; 2],
 }
 
 impl Default for VRegVal {
@@ -569,6 +588,96 @@ impl VRegVal {
     pub fn set_u128(&mut self, val: u128) {
         self.0.u128 = val.to_le();
     }
+
+    fn get_i8x16(&self) -> [i8; 16] {
+        // SAFETY: all `VRegUnion` fields are 16 bytes of integers, so any bits are valid.
+        unsafe { self.0.i8x16 }.map(i8::from_le)
+    }
+
+    fn set_i8x16(&mut self, val: [i8; 16]) {
+        self.0.i8x16 = val.map(|e| e.to_le());
+    }
+
+    fn get_u8x16(&self) -> [u8; 16] {
+        // SAFETY: all `VRegUnion` fields are 16 bytes of integers, so any bits are valid.
+        unsafe { self.0.u8x16 }.map(u8::from_le)
+    }
+
+    fn set_u8x16(&mut self, val: [u8; 16]) {
+        self.0.u8x16 = val.map(|e| e.to_le());
+    }
+
+    fn get_i16x8(&self) -> [i16; 8] {
+        // SAFETY: all `VRegUnion` fields are 16 bytes of integers, so any bits are valid.
+        unsafe { self.0.i16x8 }.map(i16::from_le)
+    }
+
+    fn set_i16x8(&mut self, val: [i16; 8]) {
+        self.0.i16x8 = val.map(|e| e.to_le());
+    }
+
+    fn get_u16x8(&self) -> [u16; 8] {
+        // SAFETY: all `VRegUnion` fields are 16 bytes of integers, so any bits are valid.
+        unsafe { self.0.u16x8 }.map(u16::from_le)
+    }
+
+    fn set_u16x8(&mut self, val: [u16; 8]) {
+        self.0.u16x8 = val.map(|e| e.to_le());
+    }
+
+    fn get_i32x4(&self) -> [i32; 4] {
+        // SAFETY: all `VRegUnion` fields are 16 bytes of integers, so any bits are valid.
+        unsafe { self.0.i32x4 }.map(i32::from_le)
+    }
+
+    fn set_i32x4(&mut self, val: [i32; 4]) {
+        self.0.i32x4 = val.map(|e| e.to_le());
+    }
+
+    fn get_u32x4(&self) -> [u32; 4] {
+        // SAFETY: all `VRegUnion` fields are 16 bytes of integers, so any bits are valid.
+        unsafe { self.0.u32x4 }.map(u32::from_le)
+    }
+
+    fn set_u32x4(&mut self, val: [u32; 4]) {
+        self.0.u32x4 = val.map(|e| e.to_le());
+    }
+
+    fn get_i64x2(&self) -> [i64; 2] {
+        // SAFETY: all `VRegUnion` fields are 16 bytes of integers, so any bits are valid.
+        unsafe { self.0.i64x2 }.map(i64::from_le)
+    }
+
+    fn set_i64x2(&mut self, val: [i64; 2]) {
+        self.0.i64x2 = val.map(|e| e.to_le());
+    }
+
+    fn get_u64x2(&self) -> [u64; 2] {
+        // SAFETY: all `VRegUnion` fields are 16 bytes of integers, so any bits are valid.
+        unsafe { self.0.u64x2 }.map(u64::from_le)
+    }
+
+    fn set_u64x2(&mut self, val: [u64; 2]) {
+        self.0.u64x2 = val.map(|e| e.to_le());
+    }
+
+    fn get_f64x2(&self) -> [f64; 2] {
+        // SAFETY: all `VRegUnion` fields are 16 bytes of integers, so any bits are valid.
+        unsafe { self.0.f64x2 }.map(|e| f64::from_bits(u64::from_le(e)))
+    }
+
+    fn set_f64x2(&mut self, val: [f64; 2]) {
+        self.0.f64x2 = val.map(|e| e.to_bits().to_le());
+    }
+
+    fn get_f32x4(&self) -> [f32; 4] {
+        // SAFETY: all `VRegUnion` fields are 16 bytes of integers, so any bits are valid.
+        unsafe { self.0.f32x4 }.map(|e| f32::from_bits(u32::from_le(e)))
+    }
+
+    fn set_f32x4(&mut self, val: [f32; 4]) {
+        self.0.f32x4 = val.map(|e| e.to_bits().to_le());
+    }
 }
 
 /// The machine state for a Pulley virtual machine: the various registers and
@@ -2417,6 +2526,155 @@ impl OpVisitor for Interpreter<'_> {
         self.state[dst].set_f64(a.wasm_abs());
         ControlFlow::Continue(())
     }
+
+    fn vaddi8x16(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let mut a = self.state[operands.src1].get_i8x16();
+        let b = self.state[operands.src2].get_i8x16();
+        // Integer lanes wrap on overflow; a plain `+=` would panic when overflow
+        // checks are enabled (e.g. debug builds).
+        a.iter_mut().zip(b).for_each(|(a, b)| *a = a.wrapping_add(b));
+        self.state[operands.dst].set_i8x16(a);
+        ControlFlow::Continue(())
+    }
+
+    fn vaddi16x8(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let mut a = self.state[operands.src1].get_i16x8();
+        let b = self.state[operands.src2].get_i16x8();
+        // Integer lanes wrap on overflow; a plain `+=` would panic when overflow
+        // checks are enabled (e.g. debug builds).
+        a.iter_mut().zip(b).for_each(|(a, b)| *a = a.wrapping_add(b));
+        self.state[operands.dst].set_i16x8(a);
+        ControlFlow::Continue(())
+    }
+
+    fn vaddi32x4(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let mut a = self.state[operands.src1].get_i32x4();
+        let b = self.state[operands.src2].get_i32x4();
+        // Integer lanes wrap on overflow; a plain `+=` would panic when overflow
+        // checks are enabled (e.g. debug builds).
+        a.iter_mut().zip(b).for_each(|(a, b)| *a = a.wrapping_add(b));
+        self.state[operands.dst].set_i32x4(a);
+        ControlFlow::Continue(())
+    }
+
+    fn vaddi64x2(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let mut a = self.state[operands.src1].get_i64x2();
+        let b = self.state[operands.src2].get_i64x2();
+        // Integer lanes wrap on overflow; a plain `+=` would panic when overflow
+        // checks are enabled (e.g. debug builds).
+        a.iter_mut().zip(b).for_each(|(a, b)| *a = a.wrapping_add(b));
+        self.state[operands.dst].set_i64x2(a);
+        ControlFlow::Continue(())
+    }
+
+    fn vaddf32x4(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let mut a = self.state[operands.src1].get_f32x4();
+        let b = self.state[operands.src2].get_f32x4();
+        for (a, b) in a.iter_mut().zip(b) {
+            *a += b;
+        }
+        self.state[operands.dst].set_f32x4(a);
+        ControlFlow::Continue(())
+    }
+
+    fn vaddf64x2(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let mut a = self.state[operands.src1].get_f64x2();
+        let b = self.state[operands.src2].get_f64x2();
+        for (a, b) in a.iter_mut().zip(b) {
+            *a += b;
+        }
+        self.state[operands.dst].set_f64x2(a);
+        ControlFlow::Continue(())
+    }
+
+    fn vshli8x16(&mut self, operands: BinaryOperands<VReg, VReg, XReg>) -> ControlFlow<Done> {
+        let a = self.state[operands.src1].get_i8x16();
+        let b = self.state[operands.src2].get_u32();
+        self.state[operands.dst].set_i8x16(a.map(|a| a.wrapping_shl(b)));
+        ControlFlow::Continue(())
+    }
+
+    fn vshli16x8(&mut self, operands: BinaryOperands<VReg, VReg, XReg>) -> ControlFlow<Done> {
+        let a = self.state[operands.src1].get_i16x8();
+        let b = self.state[operands.src2].get_u32();
+        self.state[operands.dst].set_i16x8(a.map(|a| a.wrapping_shl(b)));
+        ControlFlow::Continue(())
+    }
+
+    fn vshli32x4(&mut self, operands: BinaryOperands<VReg, VReg, XReg>) -> ControlFlow<Done> {
+        let a = self.state[operands.src1].get_i32x4();
+        let b = self.state[operands.src2].get_u32();
+        self.state[operands.dst].set_i32x4(a.map(|a| a.wrapping_shl(b)));
+        ControlFlow::Continue(())
+    }
+
+    fn vshli64x2(&mut self, operands: BinaryOperands<VReg, VReg, XReg>) -> ControlFlow<Done> {
+        let a = self.state[operands.src1].get_i64x2();
+        let b = self.state[operands.src2].get_u32();
+        self.state[operands.dst].set_i64x2(a.map(|a| a.wrapping_shl(b)));
+        ControlFlow::Continue(())
+    }
+
+    fn vshri8x16_s(&mut self, operands: BinaryOperands<VReg, VReg, XReg>) -> ControlFlow<Done> {
+        let a = self.state[operands.src1].get_i8x16();
+        let b = self.state[operands.src2].get_u32();
+        self.state[operands.dst].set_i8x16(a.map(|a| a.wrapping_shr(b)));
+        ControlFlow::Continue(())
+    }
+
+    fn vshri16x8_s(&mut self, operands: BinaryOperands<VReg, VReg, XReg>) -> ControlFlow<Done> {
+        let a = self.state[operands.src1].get_i16x8();
+        let b = self.state[operands.src2].get_u32();
+        self.state[operands.dst].set_i16x8(a.map(|a| a.wrapping_shr(b)));
+        ControlFlow::Continue(())
+    }
+
+    fn vshri32x4_s(&mut self, operands: BinaryOperands<VReg, VReg, XReg>) -> ControlFlow<Done> {
+        let a = self.state[operands.src1].get_i32x4();
+        let b = self.state[operands.src2].get_u32();
+        self.state[operands.dst].set_i32x4(a.map(|a| a.wrapping_shr(b)));
+        ControlFlow::Continue(())
+    }
+
+    fn vshri64x2_s(&mut self, operands: BinaryOperands<VReg, VReg, XReg>) -> ControlFlow<Done> {
+        let a = self.state[operands.src1].get_i64x2();
+        let b = self.state[operands.src2].get_u32();
+        self.state[operands.dst].set_i64x2(a.map(|a| a.wrapping_shr(b)));
+        ControlFlow::Continue(())
+    }
+
+    fn vshri8x16_u(&mut self, operands: BinaryOperands<VReg, VReg, XReg>) -> ControlFlow<Done> {
+        let a = self.state[operands.src1].get_u8x16();
+        let b = self.state[operands.src2].get_u32();
+        self.state[operands.dst].set_u8x16(a.map(|a| a.wrapping_shr(b)));
+        ControlFlow::Continue(())
+    }
+
+    fn vshri16x8_u(&mut self, operands: BinaryOperands<VReg, VReg, XReg>) -> ControlFlow<Done> {
+        let a = self.state[operands.src1].get_u16x8();
+        let b = self.state[operands.src2].get_u32();
+        self.state[operands.dst].set_u16x8(a.map(|a| a.wrapping_shr(b)));
+        ControlFlow::Continue(())
+    }
+
+    fn vshri32x4_u(&mut self, operands: BinaryOperands<VReg, VReg, XReg>) -> ControlFlow<Done> {
+        let a = self.state[operands.src1].get_u32x4();
+        let b = self.state[operands.src2].get_u32();
+        self.state[operands.dst].set_u32x4(a.map(|a| a.wrapping_shr(b)));
+        ControlFlow::Continue(())
+    }
+
+    fn vshri64x2_u(&mut self, operands: BinaryOperands<VReg, VReg, XReg>) -> ControlFlow<Done> {
+        let a = self.state[operands.src1].get_u64x2();
+        let b = self.state[operands.src2].get_u32();
+        self.state[operands.dst].set_u64x2(a.map(|a| a.wrapping_shr(b)));
+        ControlFlow::Continue(())
+    }
+
+    fn vconst128(&mut self, dst: VReg, val: u128) -> ControlFlow<Done> {
+        self.state[dst].set_u128(val);
+        ControlFlow::Continue(())
+    }
 }
 
 impl ExtendedOpVisitor for Interpreter<'_> {
diff --git a/pulley/src/lib.rs b/pulley/src/lib.rs
index bb8b83f7994a..563ee1df6755 100644
--- a/pulley/src/lib.rs
+++ b/pulley/src/lib.rs
@@ -563,6 +563,47 @@ macro_rules! for_each_op {
             fneg64 = Fneg64 { dst: FReg, src: FReg };
             /// `dst = |src|`
             fabs64 = Fabs64 { dst: FReg, src: FReg };
+
+            /// `dst = imm`
+            vconst128 = Vconst128 { dst: VReg, imm: u128 };
+
+            /// `dst = src1 + src2`
+            vaddi8x16 = VAddI8x16 { operands: BinaryOperands<VReg> };
+            /// `dst = src1 + src2`
+            vaddi16x8 = VAddI16x8 { operands: BinaryOperands<VReg> };
+            /// `dst = src1 + src2`
+            vaddi32x4 = VAddI32x4 { operands: BinaryOperands<VReg> };
+            /// `dst = src1 + src2`
+            vaddi64x2 = VAddI64x2 { operands: BinaryOperands<VReg> };
+            /// `dst = src1 + src2`
+            vaddf32x4 = VAddF32x4 { operands: BinaryOperands<VReg> };
+            /// `dst = src1 + src2`
+            vaddf64x2 = VAddF64x2 { operands: BinaryOperands<VReg> };
+
+            /// `dst = src1 << src2`
+            vshli8x16 = VShlI8x16 { operands: BinaryOperands<VReg, VReg, XReg> };
+            /// `dst = src1 << src2`
+            vshli16x8 = VShlI16x8 { operands: BinaryOperands<VReg, VReg, XReg> };
+            /// `dst = src1 << src2`
+            vshli32x4 = VShlI32x4 { operands: BinaryOperands<VReg, VReg, XReg> };
+            /// `dst = src1 << src2`
+            vshli64x2 = VShlI64x2 { operands: BinaryOperands<VReg, VReg, XReg> };
+            /// `dst = src1 >> src2` (signed)
+            vshri8x16_s = VShrI8x16S { operands: BinaryOperands<VReg, VReg, XReg> };
+            /// `dst = src1 >> src2` (signed)
+            vshri16x8_s = VShrI16x8S { operands: BinaryOperands<VReg, VReg, XReg> };
+            /// `dst = src1 >> src2` (signed)
+            vshri32x4_s = VShrI32x4S { operands: BinaryOperands<VReg, VReg, XReg> };
+            /// `dst = src1 >> src2` (signed)
+            vshri64x2_s = VShrI64x2S { operands: BinaryOperands<VReg, VReg, XReg> };
+            /// `dst = src1 >> src2` (unsigned)
+            vshri8x16_u = VShrI8x16U { operands: BinaryOperands<VReg, VReg, XReg> };
+            /// `dst = src1 >> src2` (unsigned)
+            vshri16x8_u = VShrI16x8U { operands: BinaryOperands<VReg, VReg, XReg> };
+            /// `dst = src1 >> src2` (unsigned)
+            vshri32x4_u = VShrI32x4U { operands: BinaryOperands<VReg, VReg, XReg> };
+            /// `dst = src1 >> src2` (unsigned)
+            vshri64x2_u = VShrI64x2U { operands: BinaryOperands<VReg, VReg, XReg> };
         }
     };
 }
diff --git a/pulley/src/regs.rs b/pulley/src/regs.rs
index deaa08deb19f..00262bf233ff 100644
--- a/pulley/src/regs.rs
+++ b/pulley/src/regs.rs
@@ -164,18 +164,18 @@ impl fmt::Debug for AnyReg {
 /// Operands to a binary operation, packed into a 16-bit word (5 bits per register).
 #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
 #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
-pub struct BinaryOperands<R> {
+pub struct BinaryOperands<D, S1 = D, S2 = D> {
     /// The destination register, packed in bits 0..5.
-    pub dst: R,
+    pub dst: D,
     /// The first source register, packed in bits 5..10.
-    pub src1: R,
+    pub src1: S1,
     /// The second source register, packed in bits 10..15.
-    pub src2: R,
+    pub src2: S2,
 }
 
-impl<R: Reg> BinaryOperands<R> {
+impl<D: Reg, S1: Reg, S2: Reg> BinaryOperands<D, S1, S2> {
     /// Convenience constructor for applying `Into`
-    pub fn new(dst: impl Into<R>, src1: impl Into<R>, src2: impl Into<R>) -> Self {
+    pub fn new(dst: impl Into<D>, src1: impl Into<S1>, src2: impl Into<S2>) -> Self {
         Self {
             dst: dst.into(),
             src1: src1.into(),
@@ -194,9 +194,9 @@ impl<R: Reg> BinaryOperands<R> {
     /// Convert from dense 16 bit encoding. The topmost bit is ignored.
     pub fn from_bits(bits: u16) -> Self {
         Self {
-            dst: R::new((bits & 0b11111) as u8).unwrap(),
-            src1: R::new(((bits >> 5) & 0b11111) as u8).unwrap(),
-            src2: R::new(((bits >> 10) & 0b11111) as u8).unwrap(),
+            dst: D::new((bits & 0b11111) as u8).unwrap(),
+            src1: S1::new(((bits >> 5) & 0b11111) as u8).unwrap(),
+            src2: S2::new(((bits >> 10) & 0b11111) as u8).unwrap(),
         }
     }
 }