diff --git a/cranelift/codegen/src/isa/pulley_shared/inst.isle b/cranelift/codegen/src/isa/pulley_shared/inst.isle index ab0d39a96a16..2f2539917a92 100644 --- a/cranelift/codegen/src/isa/pulley_shared/inst.isle +++ b/cranelift/codegen/src/isa/pulley_shared/inst.isle @@ -78,7 +78,7 @@ ;; control behavior such as endianness. (XLoad (dst WritableXReg) (mem Amode) (ty Type) (flags MemFlags) (ext ExtKind)) (FLoad (dst WritableFReg) (mem Amode) (ty Type) (flags MemFlags)) - (VLoad (dst WritableVReg) (mem Amode) (ty Type) (flags MemFlags)) + (VLoad (dst WritableVReg) (mem Amode) (ty Type) (flags MemFlags) (ext VExtKind)) ;; Stores. (XStore (mem Amode) (src XReg) (ty Type) (flags MemFlags)) @@ -148,6 +148,8 @@ (type ExtKind (enum None Sign32 Sign64 Zero32 Zero64)) +(type VExtKind (enum None S8x8 U8x8 S16x4 U16x4 S32x2 U32x2)) + ;;;; Newtypes for Different Register Classes ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (type XReg (primitive XReg)) @@ -422,10 +424,10 @@ (rule (pulley_fstore amode src ty flags) (SideEffectNoResult.Inst (MInst.FStore amode src ty flags))) -(decl pulley_vload (Amode Type MemFlags) VReg) -(rule (pulley_vload amode ty flags) +(decl pulley_vload (Amode Type MemFlags VExtKind) VReg) +(rule (pulley_vload amode ty flags ext) (let ((dst WritableVReg (temp_writable_vreg)) - (_ Unit (emit (MInst.VLoad dst amode ty flags)))) + (_ Unit (emit (MInst.VLoad dst amode ty flags ext)))) dst)) (decl pulley_vstore (Amode VReg Type MemFlags) SideEffectNoResult) diff --git a/cranelift/codegen/src/isa/pulley_shared/inst/emit.rs b/cranelift/codegen/src/isa/pulley_shared/inst/emit.rs index 5b0c435e83b7..642662eb8e43 100644 --- a/cranelift/codegen/src/isa/pulley_shared/inst/emit.rs +++ b/cranelift/codegen/src/isa/pulley_shared/inst/emit.rs @@ -393,13 +393,22 @@ fn pulley_emit

( mem, ty, flags, + ext, } => { let r = mem.get_base_register().unwrap(); let x = mem.get_offset_with_state(state); let endian = emit_info.endianness(*flags); assert_eq!(endian, Endianness::Little); assert_eq!(ty.bytes(), 16); - enc::vload128le_offset32(sink, dst, r, x); + match ext { + VExtKind::None => enc::vload128le_offset32(sink, dst, r, x), + VExtKind::S8x8 => enc::vload8x8_s_offset32(sink, dst, r, x), + VExtKind::U8x8 => enc::vload8x8_u_offset32(sink, dst, r, x), + VExtKind::S16x4 => enc::vload16x4le_s_offset32(sink, dst, r, x), + VExtKind::U16x4 => enc::vload16x4le_u_offset32(sink, dst, r, x), + VExtKind::S32x2 => enc::vload32x2le_s_offset32(sink, dst, r, x), + VExtKind::U32x2 => enc::vload32x2le_u_offset32(sink, dst, r, x), + } } Inst::XStore { diff --git a/cranelift/codegen/src/isa/pulley_shared/inst/mod.rs b/cranelift/codegen/src/isa/pulley_shared/inst/mod.rs index bc79c4b322b8..9805d58996a2 100644 --- a/cranelift/codegen/src/isa/pulley_shared/inst/mod.rs +++ b/cranelift/codegen/src/isa/pulley_shared/inst/mod.rs @@ -25,6 +25,7 @@ pub use self::emit::*; pub use crate::isa::pulley_shared::lower::isle::generated_code::MInst as Inst; pub use crate::isa::pulley_shared::lower::isle::generated_code::RawInst; +pub use crate::isa::pulley_shared::lower::isle::generated_code::VExtKind; impl From<RawInst> for Inst { fn from(raw: RawInst) -> Inst { @@ -65,6 +66,7 @@ impl Inst { mem, ty, flags, + ext: VExtKind::None, } } else if ty.is_int() { Inst::XLoad { @@ -242,6 +244,7 @@ fn pulley_get_operands(inst: &mut Inst, collector: &mut impl OperandVisitor) { mem, ty: _, flags: _, + ext: _, } => { collector.reg_def(dst); mem.get_operands(collector); @@ -687,11 +690,12 @@ impl Inst { mem, ty, flags, + ext, } => { let dst = format_reg(*dst.to_reg()); let ty = ty.bits(); let mem = mem.to_string(); - format!("{dst} = vload{ty} {mem} // flags ={flags}") + format!("{dst} = vload{ty}_{ext:?} {mem} // flags ={flags}") } Inst::VStore { diff --git 
a/cranelift/codegen/src/isa/pulley_shared/lower.isle b/cranelift/codegen/src/isa/pulley_shared/lower.isle index 0ae6935ef11c..84108d07c638 100644 --- a/cranelift/codegen/src/isa/pulley_shared/lower.isle +++ b/cranelift/codegen/src/isa/pulley_shared/lower.isle @@ -504,7 +504,25 @@ (pulley_xload (amode addr offset) $I32 flags (ExtKind.Sign64))) (rule 2 (lower (has_type (ty_vec128 ty) (load flags addr offset))) - (pulley_vload (amode addr offset) ty flags)) + (pulley_vload (amode addr offset) ty flags (VExtKind.None))) + +(rule (lower (has_type ty (sload8x8 flags addr offset))) + (pulley_vload (amode addr offset) ty flags (VExtKind.S8x8))) + +(rule (lower (has_type ty (uload8x8 flags addr offset))) + (pulley_vload (amode addr offset) ty flags (VExtKind.U8x8))) + +(rule (lower (has_type ty (sload16x4 flags addr offset))) + (pulley_vload (amode addr offset) ty flags (VExtKind.S16x4))) + +(rule (lower (has_type ty (uload16x4 flags addr offset))) + (pulley_vload (amode addr offset) ty flags (VExtKind.U16x4))) + +(rule (lower (has_type ty (sload32x2 flags addr offset))) + (pulley_vload (amode addr offset) ty flags (VExtKind.S32x2))) + +(rule (lower (has_type ty (uload32x2 flags addr offset))) + (pulley_vload (amode addr offset) ty flags (VExtKind.U32x2))) ;;;; Rules for `store` and friends ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/crates/wast-util/src/lib.rs b/crates/wast-util/src/lib.rs index 39fb051330cb..ce3647c92d09 100644 --- a/crates/wast-util/src/lib.rs +++ b/crates/wast-util/src/lib.rs @@ -431,7 +431,6 @@ impl WastTest { "spec_testsuite/proposals/memory64/i16x8_relaxed_q15mulr_s.wast", "spec_testsuite/proposals/memory64/i32x4_relaxed_trunc.wast", "spec_testsuite/proposals/memory64/i8x16_relaxed_swizzle.wast", - "spec_testsuite/simd_align.wast", "spec_testsuite/simd_bitwise.wast", "spec_testsuite/simd_boolean.wast", "spec_testsuite/simd_conversions.wast", diff --git a/pulley/src/interp.rs b/pulley/src/interp.rs index 54f8ada170d8..20ce218a9139 
100644 --- a/pulley/src/interp.rs +++ b/pulley/src/interp.rs @@ -2754,6 +2754,42 @@ impl OpVisitor for Interpreter<'_> { self.state[dst].set_f64x2([val; 2]); ControlFlow::Continue(()) } + + fn vload8x8_s_offset32(&mut self, dst: VReg, ptr: XReg, offset: i32) -> ControlFlow<Done> { + let val = unsafe { self.load::<[i8; 8]>(ptr, offset) }; + self.state[dst].set_i16x8(val.map(|i| i.into())); + ControlFlow::Continue(()) + } + + fn vload8x8_u_offset32(&mut self, dst: VReg, ptr: XReg, offset: i32) -> ControlFlow<Done> { + let val = unsafe { self.load::<[u8; 8]>(ptr, offset) }; + self.state[dst].set_u16x8(val.map(|i| i.into())); + ControlFlow::Continue(()) + } + + fn vload16x4le_s_offset32(&mut self, dst: VReg, ptr: XReg, offset: i32) -> ControlFlow<Done> { + let val = unsafe { self.load::<[i16; 4]>(ptr, offset) }; + self.state[dst].set_i32x4(val.map(|i| i16::from_le(i).into())); + ControlFlow::Continue(()) + } + + fn vload16x4le_u_offset32(&mut self, dst: VReg, ptr: XReg, offset: i32) -> ControlFlow<Done> { + let val = unsafe { self.load::<[u16; 4]>(ptr, offset) }; + self.state[dst].set_u32x4(val.map(|i| u16::from_le(i).into())); + ControlFlow::Continue(()) + } + + fn vload32x2le_s_offset32(&mut self, dst: VReg, ptr: XReg, offset: i32) -> ControlFlow<Done> { + let val = unsafe { self.load::<[i32; 2]>(ptr, offset) }; + self.state[dst].set_i64x2(val.map(|i| i32::from_le(i).into())); + ControlFlow::Continue(()) + } + + fn vload32x2le_u_offset32(&mut self, dst: VReg, ptr: XReg, offset: i32) -> ControlFlow<Done> { + let val = unsafe { self.load::<[u32; 2]>(ptr, offset) }; + self.state[dst].set_u64x2(val.map(|i| u32::from_le(i).into())); + ControlFlow::Continue(()) + } } impl ExtendedOpVisitor for Interpreter<'_> { diff --git a/pulley/src/lib.rs b/pulley/src/lib.rs index 415a6ea89de7..219dc99971ea 100644 --- a/pulley/src/lib.rs +++ b/pulley/src/lib.rs @@ -631,6 +631,19 @@ macro_rules! 
for_each_op { vsplatf32 = VSplatF32 { dst: VReg, src: FReg }; /// `dst = splat(src)` vsplatf64 = VSplatF64 { dst: VReg, src: FReg }; + + /// Load the 64-bit source as i8x8 and sign-extend each lane to i16x8. + vload8x8_s_offset32 = VLoad8x8SOffset32 { dst: VReg, ptr: XReg, offset: i32 }; + /// Load the 64-bit source as u8x8 and zero-extend each lane to i16x8. + vload8x8_u_offset32 = VLoad8x8UOffset32 { dst: VReg, ptr: XReg, offset: i32 }; + /// Load the 64-bit source as little-endian i16x4 and sign-extend each lane to i32x4. + vload16x4le_s_offset32 = VLoad16x4LeSOffset32 { dst: VReg, ptr: XReg, offset: i32 }; + /// Load the 64-bit source as little-endian u16x4 and zero-extend each lane to i32x4. + vload16x4le_u_offset32 = VLoad16x4LeUOffset32 { dst: VReg, ptr: XReg, offset: i32 }; + /// Load the 64-bit source as little-endian i32x2 and sign-extend each lane to i64x2. + vload32x2le_s_offset32 = VLoad32x2LeSOffset32 { dst: VReg, ptr: XReg, offset: i32 }; + /// Load the 64-bit source as little-endian u32x2 and zero-extend each lane to i64x2. + vload32x2le_u_offset32 = VLoad32x2LeUOffset32 { dst: VReg, ptr: XReg, offset: i32 }; } }; }