Skip to content

Commit

Permalink
pulley: Add macro CallN instructions
Browse files Browse the repository at this point in the history
This commit adds new macro instructions to assist with speeding up calls
between functions. Pulley's previous `Call` instruction was similar to
native call instructions where arguments/results are implicitly in the
right location according to the ABI, but movement between registers is
more expensive with Pulley than with native architectures. The `CallN`
instructions here enable listing a few arguments (only integer
registers) in the opcode itself. This removes the need for individual `xmov`
instructions into individual registers and instead it can all be done
within the opcode handlers.

This additionally enables an argument that is passed twice to a function
to reside in only one register. Finally, parallel copies between these
registers are supported, as the interpreter loads all registers and then
stores all registers.

These new instructions participate in register allocation differently
from before: the first few integer arguments are allowed to be in any
register and no longer use `reg_fixed_use`. All other arguments (including
all float arguments, for example) continue to use `reg_fixed_use`.

Locally, sightglass reports this change speeding up `pulldown-cmark` by
2-10%. On a `fib(N)` micro-benchmark it didn't help as much as I was
hoping it would.
  • Loading branch information
alexcrichton committed Dec 19, 2024
1 parent a179f95 commit 70dd41e
Show file tree
Hide file tree
Showing 15 changed files with 288 additions and 144 deletions.
3 changes: 2 additions & 1 deletion cranelift/codegen/meta/src/pulley.rs
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,8 @@ impl Inst<'_> {
match self.name {
// Skip instructions related to control-flow as those require
// special handling with `MachBuffer`.
"Jump" | "Call" | "CallIndirect" => true,
"Jump" => true,
n if n.starts_with("Call") => true,

// Skip special instructions not used in Cranelift.
"XPush32Many" | "XPush64Many" | "XPop32Many" | "XPop64Many" => true,
Expand Down
33 changes: 29 additions & 4 deletions cranelift/codegen/src/isa/pulley_shared/abi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -441,15 +441,40 @@ where
fn gen_call(
dest: &CallDest,
_tmp: Writable<Reg>,
info: CallInfo<()>,
mut info: CallInfo<()>,
) -> SmallVec<[Self::I; 2]> {
match dest {
// "near" calls are pulley->pulley calls so they use a normal "call"
// opcode
CallDest::ExtName(name, RelocDistance::Near) => smallvec![Inst::Call {
info: Box::new(info.map(|()| name.clone()))
CallDest::ExtName(name, RelocDistance::Near) => {
// The first four integer arguments to a call can be handled via
// special pulley call instructions. Assert here that
// `info.uses` is sorted in order and then take out x0-x3 if
// they're present and move them from `info.uses` to
// `info.dest.args` to be handled differently during register
// allocation.
let mut args = SmallVec::new();
assert!(info
.uses
.iter()
.filter_map(|arg| XReg::new(arg.preg))
.is_sorted());
info.uses.retain(|arg| {
if arg.preg != x0() && arg.preg != x1() && arg.preg != x2() && arg.preg != x3()
{
return true;
}
args.push(XReg::new(arg.vreg).unwrap());
false
});
smallvec![Inst::Call {
info: Box::new(info.map(|()| PulleyCall {
name: name.clone(),
args,
}))
}
.into()]
}
.into()],
// "far" calls are pulley->host calls so they use a different opcode
// which is lowered with a special relocation in the backend.
CallDest::ExtName(name, RelocDistance::Far) => smallvec![Inst::IndirectCallHost {
Expand Down
3 changes: 2 additions & 1 deletion cranelift/codegen/src/isa/pulley_shared/inst.isle
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@

;; An indirect call out to a host-defined function. The host function
;; pointer is the first "argument" of this function call.
(IndirectCallHost (info BoxCallInfo))
(IndirectCallHost (info BoxCallIndirectHostInfo))

;; Unconditional jumps.
(Jump (label MachLabel))
Expand Down Expand Up @@ -154,6 +154,7 @@
(type BoxReturnCallInfo (primitive BoxReturnCallInfo))
(type BoxReturnCallIndInfo (primitive BoxReturnCallIndInfo))
(type XRegSet (primitive XRegSet))
(type BoxCallIndirectHostInfo (primitive BoxCallIndirectHostInfo))

;;;; Address Modes ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

Expand Down
13 changes: 13 additions & 0 deletions cranelift/codegen/src/isa/pulley_shared/inst/args.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
//! Pulley instruction arguments.
use super::*;
use crate::ir::ExternalName;
use crate::machinst::abi::StackAMode;
use pulley_interpreter::encode;
use pulley_interpreter::regs::Reg as _;
Expand Down Expand Up @@ -565,3 +566,15 @@ impl fmt::Display for Cond {
}
}
}

/// Payload of `CallInfo` for direct call instructions (`Inst::Call`).
///
/// Bundles the call target with the leading integer arguments that the
/// call opcode itself can name, so that those arguments do not need to be
/// pinned to fixed registers during register allocation.
#[derive(Clone, Debug)]
pub struct PulleyCall {
/// The external name that's being called, or the Cranelift-generated
/// function that's being invoked.
pub name: ExternalName,
/// Arguments tracked in this call invocation which aren't assigned fixed
/// registers. These are the arguments destined for `x0` through `x3`, so
/// at most 4 registers are tracked here; all remaining arguments stay in
/// the `uses` field of `CallInfo<T>` with their fixed-register
/// constraints.
pub args: SmallVec<[XReg; 4]>,
}
32 changes: 26 additions & 6 deletions cranelift/codegen/src/isa/pulley_shared/inst/emit.rs
Original file line number Diff line number Diff line change
Expand Up @@ -172,16 +172,36 @@ fn pulley_emit<P>(
Inst::LoadExtName { .. } => todo!(),

Inst::Call { info } => {
sink.put1(pulley_interpreter::Opcode::Call as u8);
sink.add_reloc(
let offset = sink.cur_offset();

// If arguments happen to already be in the right register for the
// ABI then remove them from this list. Otherwise emit the
// appropriate `Call` instruction depending on how many arguments we
// have that aren't already in their correct register according to
// ABI conventions.
let mut args = &info.dest.args[..];
while !args.is_empty() && args.last().copied() == XReg::new(x_reg(args.len() - 1)) {
args = &args[..args.len() - 1];
}
match args {
[] => enc::call(sink, 0),
[x0] => enc::call1(sink, x0, 0),
[x0, x1] => enc::call2(sink, x0, x1, 0),
[x0, x1, x2] => enc::call3(sink, x0, x1, x2, 0),
[x0, x1, x2, x3] => enc::call4(sink, x0, x1, x2, x3, 0),
_ => unreachable!(),
}
let end = sink.cur_offset();
sink.add_reloc_at_offset(
end - 4,
// TODO: is it actually okay to reuse this reloc here?
Reloc::X86CallPCRel4,
&info.dest,
&info.dest.name,
// This addend adjusts for the difference between the start of
// the instruction and the beginning of the immediate field.
-1,
// the instruction and the beginning of the immediate offset
// field which is always the final 4 bytes of the instruction.
-i64::from(end - offset - 4),
);
sink.put4(0);
if let Some(s) = state.take_stack_map() {
let offset = sink.cur_offset();
sink.push_user_stack_map(state, offset, s);
Expand Down
24 changes: 23 additions & 1 deletion cranelift/codegen/src/isa/pulley_shared/inst/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,29 @@ fn pulley_get_operands(inst: &mut Inst, collector: &mut impl OperandVisitor) {
collector.reg_def(dst);
}

Inst::Call { info } | Inst::IndirectCallHost { info } => {
Inst::Call { info } => {
let CallInfo {
uses, defs, dest, ..
} = &mut **info;

// Pulley supports having the first few integer arguments in any
// register, so flag that with `reg_use` here.
let PulleyCall { args, .. } = dest;
for arg in args {
collector.reg_use(arg);
}

// Remaining arguments (and return values) are all in fixed
// registers according to Pulley's ABI, however.
for CallArgPair { vreg, preg } in uses {
collector.reg_fixed_use(vreg, *preg);
}
for CallRetPair { vreg, preg } in defs {
collector.reg_fixed_def(vreg, *preg);
}
collector.reg_clobbers(info.clobbers);
}
Inst::IndirectCallHost { info } => {
let CallInfo { uses, defs, .. } = &mut **info;
for CallArgPair { vreg, preg } in uses {
collector.reg_fixed_use(vreg, *preg);
Expand Down
6 changes: 4 additions & 2 deletions cranelift/codegen/src/isa/pulley_shared/lower/isle.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@ use crate::ir::{condcodes::*, immediates::*, types::*, *};
use crate::isa::pulley_shared::{
abi::*,
inst::{
FReg, OperandSize, ReturnCallInfo, VReg, WritableFReg, WritableVReg, WritableXReg, XReg,
FReg, OperandSize, PulleyCall, ReturnCallInfo, VReg, WritableFReg, WritableVReg,
WritableXReg, XReg,
},
lower::{regs, Cond},
*,
Expand All @@ -26,8 +27,9 @@ use regalloc2::PReg;
type Unit = ();
type VecArgPair = Vec<ArgPair>;
type VecRetPair = Vec<RetPair>;
type BoxCallInfo = Box<CallInfo<ExternalName>>;
type BoxCallInfo = Box<CallInfo<PulleyCall>>;
type BoxCallIndInfo = Box<CallInfo<XReg>>;
type BoxCallIndirectHostInfo = Box<CallInfo<ExternalName>>;
type BoxReturnCallInfo = Box<ReturnCallInfo<ExternalName>>;
type BoxReturnCallIndInfo = Box<ReturnCallInfo<XReg>>;
type BoxExternalName = Box<ExternalName>;
Expand Down
17 changes: 13 additions & 4 deletions cranelift/codegen/src/isa/s390x/inst/emit.rs
Original file line number Diff line number Diff line change
Expand Up @@ -220,7 +220,13 @@ pub fn mem_emit(
&MemArg::Symbol {
ref name, offset, ..
} => {
sink.add_reloc_at_offset(2, Reloc::S390xPCRel32Dbl, &**name, (offset + 2).into());
let reloc_offset = sink.cur_offset() + 2;
sink.add_reloc_at_offset(
reloc_offset,
Reloc::S390xPCRel32Dbl,
&**name,
(offset + 2).into(),
);
put(sink, &enc_ril_b(opcode_ril.unwrap(), rd, 0));
}
_ => unreachable!(),
Expand Down Expand Up @@ -3198,7 +3204,8 @@ impl Inst {
// Add relocation for target function. This has to be done *before*
// the S390xTlsGdCall relocation if any, to ensure linker relaxation
// works correctly.
sink.add_reloc_at_offset(2, Reloc::S390xPLTRel32Dbl, &info.dest, 2);
let offset = sink.cur_offset() + 2;
sink.add_reloc_at_offset(offset, Reloc::S390xPLTRel32Dbl, &info.dest, 2);

if let Some(s) = state.take_stack_map() {
let offset = sink.cur_offset() + 6;
Expand Down Expand Up @@ -3232,7 +3239,8 @@ impl Inst {
}

let opcode = 0xc04; // BCRL
sink.add_reloc_at_offset(2, Reloc::S390xPLTRel32Dbl, &info.dest, 2);
let offset = sink.cur_offset() + 2;
sink.add_reloc_at_offset(offset, Reloc::S390xPLTRel32Dbl, &info.dest, 2);
put(sink, &enc_ril_c(opcode, 15, 0));
sink.add_call_site();
}
Expand All @@ -3257,7 +3265,8 @@ impl Inst {
// *before* the S390xTlsGdCall, to ensure linker relaxation
// works correctly.
let dest = ExternalName::LibCall(LibCall::ElfTlsGetOffset);
sink.add_reloc_at_offset(2, Reloc::S390xPLTRel32Dbl, &dest, 2);
let offset = sink.cur_offset() + 2;
sink.add_reloc_at_offset(offset, Reloc::S390xPLTRel32Dbl, &dest, 2);
match &**symbol {
SymbolReloc::TlsGd { name } => sink.add_reloc(Reloc::S390xTlsGdCall, name, 0),
_ => unreachable!(),
Expand Down
6 changes: 3 additions & 3 deletions cranelift/codegen/src/machinst/buffer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1536,7 +1536,7 @@ impl<I: VCodeInst> MachBuffer<I> {
}
}

/// Add an external relocation at the given offset from current offset.
/// Add an external relocation at the given offset.
pub fn add_reloc_at_offset<T: Into<RelocTarget> + Clone>(
&mut self,
offset: CodeOffset,
Expand Down Expand Up @@ -1579,7 +1579,7 @@ impl<I: VCodeInst> MachBuffer<I> {
// when a relocation can't otherwise be resolved later, so it shouldn't
// actually result in any memory unsafety or anything like that.
self.relocs.push(MachReloc {
offset: self.data.len() as CodeOffset + offset,
offset,
kind,
target,
addend,
Expand All @@ -1593,7 +1593,7 @@ impl<I: VCodeInst> MachBuffer<I> {
target: &T,
addend: Addend,
) {
self.add_reloc_at_offset(0, kind, target, addend);
self.add_reloc_at_offset(self.data.len() as CodeOffset, kind, target, addend);
}

/// Add a trap record at the current offset.
Expand Down
Loading

0 comments on commit 70dd41e

Please sign in to comment.