x64: port atomic_rmw to ISLE (#4389)

* x64: port `atomic_rmw` to ISLE

This change ports `atomic_rmw` to ISLE for the x64 backend. It does not
change the lowering in any way, though it seems possible that the fixed
registers need not be as fixed and that there are opportunities for
single-instruction lowerings. It also renames `inst_common::AtomicRmwOp` to
`MachAtomicRmwOp` to disambiguate it from the IR enum of the same name.
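
The `.isle` rule itself is not among the hunks shown here, but in rough Rust
terms the ported lowering builds something like the sketch below. This is
illustrative only: `ctx`, `ir_op`, `amode`, and `operand` stand in for the
surrounding lowering context, and the exact ISLE-generated glue differs.

    // Illustrative sketch (not the generated ISLE glue): the sequence now
    // takes fresh virtual registers, and the one remaining fixed constraint
    // (dst_old lands in %rax, which CMPXCHG uses implicitly) is expressed
    // through the operand collector instead of being hardcoded at emit time.
    let temp = ctx.alloc_tmp(types::I64).only_reg().unwrap();
    let dst_old = ctx.alloc_tmp(types::I64).only_reg().unwrap();
    ctx.emit(Inst::AtomicRmwSeq {
        ty,                               // I8, I16, I32, or I64
        op: MachAtomicRmwOp::from(ir_op), // renamed from `inst_common::AtomicRmwOp`
        mem: amode.into(),                // `Amode` -> `SyntheticAmode`
        operand,
        temp,
        dst_old,
    });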

* x64: remove remaining hardcoded register constraints for `atomic_rmw`

* x64: use `SyntheticAmode` in `AtomicRmwSeq`

* review: add missing reg collector for amode

* review: collect memory registers in the 'late' phase (see the sketch after this list)
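
To restate why the 'late' phase matters (this is the collector logic from the
final hunk below, not new code): `temp` is written partway through the
multi-instruction sequence while the operand and address registers are still
read afterwards, so their live ranges must not be merged.

    // Uses collected in the "late" phase overlap with defs collected "early",
    // so regalloc cannot hand `temp` a register that still holds the operand
    // or one of the address registers mid-sequence.
    collector.reg_late_use(*operand); // re-read on every retry of the loop
    collector.reg_early_def(*temp);   // scratch, written before the last use above
    collector.reg_fixed_def(*dst_old, regs::rax()); // CMPXCHG implicitly uses %rax
    mem.get_operands_late(collector); // base/index stay live for the whole sequence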
Andrew Brown
2022-07-06 16:58:59 -07:00
committed by GitHub
parent f98076ae88
commit 8629cbc6a4
10 changed files with 196 additions and 172 deletions


@@ -306,7 +306,7 @@ impl Amode {
         }
     }

-    /// Add the regs mentioned by `self` to `collector`.
+    /// Add the registers mentioned by `self` to `collector`.
     pub(crate) fn get_operands<F: Fn(VReg) -> VReg>(
         &self,
         collector: &mut OperandCollector<'_, F>,
@@ -325,6 +325,25 @@ impl Amode {
         }
     }

+    /// Same as `get_operands`, but add the registers in the "late" phase.
+    pub(crate) fn get_operands_late<F: Fn(VReg) -> VReg>(
+        &self,
+        collector: &mut OperandCollector<'_, F>,
+    ) {
+        match self {
+            Amode::ImmReg { base, .. } => {
+                collector.reg_late_use(*base);
+            }
+            Amode::ImmRegRegShift { base, index, .. } => {
+                collector.reg_late_use(base.to_reg());
+                collector.reg_late_use(index.to_reg());
+            }
+            Amode::RipRelative { .. } => {
+                // RIP isn't involved in regalloc.
+            }
+        }
+    }
+
     pub(crate) fn get_flags(&self) -> MemFlags {
         match self {
             Amode::ImmReg { flags, .. } => *flags,
@@ -426,7 +445,7 @@ impl SyntheticAmode {
         SyntheticAmode::NominalSPOffset { simm32 }
     }

-    /// Add the regs mentioned by `self` to `collector`.
+    /// Add the registers mentioned by `self` to `collector`.
     pub(crate) fn get_operands<F: Fn(VReg) -> VReg>(
         &self,
         collector: &mut OperandCollector<'_, F>,
@@ -440,6 +459,20 @@ impl SyntheticAmode {
         }
     }

+    /// Same as `get_operands`, but add the registers in the "late" phase.
+    pub(crate) fn get_operands_late<F: Fn(VReg) -> VReg>(
+        &self,
+        collector: &mut OperandCollector<'_, F>,
+    ) {
+        match self {
+            SyntheticAmode::Real(addr) => addr.get_operands_late(collector),
+            SyntheticAmode::NominalSPOffset { .. } => {
+                // Nothing to do; the base is SP and isn't involved in regalloc.
+            }
+            SyntheticAmode::ConstantOffset(_) => {}
+        }
+    }
+
     pub(crate) fn finalize(&self, state: &mut EmitState, buffer: &MachBuffer<Inst>) -> Amode {
         match self {
             SyntheticAmode::Real(addr) => addr.clone(),

@@ -2613,118 +2613,116 @@ pub(crate) fn emit(
         Inst::AtomicRmwSeq {
             ty,
             op,
-            address,
+            mem,
             operand,
             temp,
             dst_old,
         } => {
-            // FIXME: use real vregs for this seq.
-            debug_assert_eq!(*address, regs::r9());
-            debug_assert_eq!(*operand, regs::r10());
-            debug_assert_eq!(temp.to_reg(), regs::r11());
+            let operand = allocs.next(*operand);
+            let temp = allocs.next_writable(*temp);
+            let dst_old = allocs.next_writable(*dst_old);
+            debug_assert_eq!(dst_old.to_reg(), regs::rax());
+            let mem = mem.finalize(state, sink).with_allocs(allocs);

             // Emit this:
             //
-            //   mov{zbq,zwq,zlq,q} (%r9), %rax // rax = old value
-            //  again:
-            //   movq %rax, %r11 // rax = old value, r11 = old value
-            //   `op`q %r10, %r11 // rax = old value, r11 = new value
-            //   lock cmpxchg{b,w,l,q} %r11, (%r9) // try to store new value
+            //   mov{zbq,zwq,zlq,q} (%r_address), %rax // rax = old value
+            //  again:
+            //   movq %rax, %r_temp // rax = old value, r_temp = old value
+            //   `op`q %r_operand, %r_temp // rax = old value, r_temp = new value
+            //   lock cmpxchg{b,w,l,q} %r_temp, (%r_address) // try to store new value
             //   jnz again // If this is taken, rax will have a "revised" old value
             //
-            // Operand conventions:
-            //   IN:  %r9 (addr), %r10 (2nd arg for `op`)
-            //   OUT: %rax (old value), %r11 (trashed), %rflags (trashed)
+            // Operand conventions:
+            //   IN:  %r_address, %r_operand
+            //   OUT: %rax (old value), %r_temp (trashed), %rflags (trashed)
             //
-            // In the case where the operation is 'xchg', the "`op`q" instruction is instead
-            //   movq %r10, %r11
-            // so that we simply write in the destination, the "2nd arg for `op`".
-            let rax = regs::rax();
-            let r9 = regs::r9();
-            let r10 = regs::r10();
-            let r11 = regs::r11();
-            let rax_w = Writable::from_reg(rax);
-            let r11_w = Writable::from_reg(r11);
-            let amode = Amode::imm_reg(0, r9);
+            // In the case where the operation is 'xchg', the "`op`q"
+            // instruction is instead `movq %r_operand, %r_temp`, so that we
+            // simply write in the destination, the "2nd arg for `op`".
+            //
+            // TODO: this sequence can be significantly improved (e.g., to `lock
+            // <op>`) when it is known that `dst_old` is not used later, see
+            // https://github.com/bytecodealliance/wasmtime/issues/2153.
             let again_label = sink.get_label();

-            // mov{zbq,zwq,zlq,q} (%r9), %rax
+            // mov{zbq,zwq,zlq,q} (%r_address), %rax
             // No need to call `add_trap` here, since the `i1` emit will do that.
-            let i1 = Inst::load(*ty, amode.clone(), rax_w, ExtKind::ZeroExtend);
+            let i1 = Inst::load(*ty, mem.clone(), dst_old, ExtKind::ZeroExtend);
             i1.emit(&[], sink, info, state);

             // again:
             sink.bind_label(again_label);

-            // movq %rax, %r11
-            let i2 = Inst::mov_r_r(OperandSize::Size64, rax, r11_w);
+            // movq %rax, %r_temp
+            let i2 = Inst::mov_r_r(OperandSize::Size64, dst_old.to_reg(), temp);
             i2.emit(&[], sink, info, state);

-            let r10_rmi = RegMemImm::reg(r10);
+            let operand_rmi = RegMemImm::reg(operand);
+            use inst_common::MachAtomicRmwOp as RmwOp;
             match op {
-                inst_common::AtomicRmwOp::Xchg => {
-                    // movq %r10, %r11
-                    let i3 = Inst::mov_r_r(OperandSize::Size64, r10, r11_w);
+                RmwOp::Xchg => {
+                    // movq %r_operand, %r_temp
+                    let i3 = Inst::mov_r_r(OperandSize::Size64, operand, temp);
                     i3.emit(&[], sink, info, state);
                 }
-                inst_common::AtomicRmwOp::Nand => {
-                    // andq %r10, %r11
+                RmwOp::Nand => {
+                    // andq %r_operand, %r_temp
                     let i3 =
-                        Inst::alu_rmi_r(OperandSize::Size64, AluRmiROpcode::And, r10_rmi, r11_w);
+                        Inst::alu_rmi_r(OperandSize::Size64, AluRmiROpcode::And, operand_rmi, temp);
                     i3.emit(&[], sink, info, state);

-                    // notq %r11
-                    let i4 = Inst::not(OperandSize::Size64, r11_w);
+                    // notq %r_temp
+                    let i4 = Inst::not(OperandSize::Size64, temp);
                     i4.emit(&[], sink, info, state);
                 }
-                inst_common::AtomicRmwOp::Umin
-                | inst_common::AtomicRmwOp::Umax
-                | inst_common::AtomicRmwOp::Smin
-                | inst_common::AtomicRmwOp::Smax => {
-                    // cmp %r11, %r10
-                    let i3 = Inst::cmp_rmi_r(OperandSize::from_ty(*ty), RegMemImm::reg(r11), r10);
+                RmwOp::Umin | RmwOp::Umax | RmwOp::Smin | RmwOp::Smax => {
+                    // cmp %r_temp, %r_operand
+                    let i3 = Inst::cmp_rmi_r(
+                        OperandSize::from_ty(*ty),
+                        RegMemImm::reg(temp.to_reg()),
+                        operand,
+                    );
                     i3.emit(&[], sink, info, state);

-                    // cmovcc %r10, %r11
+                    // cmovcc %r_operand, %r_temp
                     let cc = match op {
-                        inst_common::AtomicRmwOp::Umin => CC::BE,
-                        inst_common::AtomicRmwOp::Umax => CC::NB,
-                        inst_common::AtomicRmwOp::Smin => CC::LE,
-                        inst_common::AtomicRmwOp::Smax => CC::NL,
+                        RmwOp::Umin => CC::BE,
+                        RmwOp::Umax => CC::NB,
+                        RmwOp::Smin => CC::LE,
+                        RmwOp::Smax => CC::NL,
                         _ => unreachable!(),
                     };
-                    let i4 = Inst::cmove(OperandSize::Size64, cc, RegMem::reg(r10), r11_w);
+                    let i4 = Inst::cmove(OperandSize::Size64, cc, RegMem::reg(operand), temp);
                     i4.emit(&[], sink, info, state);
                 }
                 _ => {
-                    // opq %r10, %r11
+                    // opq %r_operand, %r_temp
                     let alu_op = match op {
-                        inst_common::AtomicRmwOp::Add => AluRmiROpcode::Add,
-                        inst_common::AtomicRmwOp::Sub => AluRmiROpcode::Sub,
-                        inst_common::AtomicRmwOp::And => AluRmiROpcode::And,
-                        inst_common::AtomicRmwOp::Or => AluRmiROpcode::Or,
-                        inst_common::AtomicRmwOp::Xor => AluRmiROpcode::Xor,
-                        inst_common::AtomicRmwOp::Xchg
-                        | inst_common::AtomicRmwOp::Nand
-                        | inst_common::AtomicRmwOp::Umin
-                        | inst_common::AtomicRmwOp::Umax
-                        | inst_common::AtomicRmwOp::Smin
-                        | inst_common::AtomicRmwOp::Smax => unreachable!(),
+                        RmwOp::Add => AluRmiROpcode::Add,
+                        RmwOp::Sub => AluRmiROpcode::Sub,
+                        RmwOp::And => AluRmiROpcode::And,
+                        RmwOp::Or => AluRmiROpcode::Or,
+                        RmwOp::Xor => AluRmiROpcode::Xor,
+                        RmwOp::Xchg
+                        | RmwOp::Nand
+                        | RmwOp::Umin
+                        | RmwOp::Umax
+                        | RmwOp::Smin
+                        | RmwOp::Smax => unreachable!(),
                     };
-                    let i3 = Inst::alu_rmi_r(OperandSize::Size64, alu_op, r10_rmi, r11_w);
+                    let i3 = Inst::alu_rmi_r(OperandSize::Size64, alu_op, operand_rmi, temp);
                     i3.emit(&[], sink, info, state);
                 }
             }

-            // lock cmpxchg{b,w,l,q} %r11, (%r9)
+            // lock cmpxchg{b,w,l,q} %r_temp, (%r_address)
             // No need to call `add_trap` here, since the `i4` emit will do that.
             let i4 = Inst::LockCmpxchg {
                 ty: *ty,
-                replacement: r11,
-                expected: regs::rax(),
-                mem: amode.into(),
-                dst_old: Writable::from_reg(regs::rax()),
+                replacement: temp.to_reg(),
+                expected: dst_old.to_reg(),
+                mem: mem.into(),
+                dst_old,
             };
             i4.emit(&[], sink, info, state);

@@ -4611,6 +4611,8 @@ fn test_x64_emit() {
         3,
     )
     .into();
+    // Use `r9` with a 0 offset.
+    let am3: SyntheticAmode = Amode::imm_reg(0, r9).into();

     // A general 8-bit case.
     insns.push((
@@ -4743,8 +4745,8 @@ fn test_x64_emit() {
     insns.push((
         Inst::AtomicRmwSeq {
             ty: types::I8,
-            op: inst_common::AtomicRmwOp::Or,
-            address: r9,
+            op: inst_common::MachAtomicRmwOp::Or,
+            mem: am3.clone(),
             operand: r10,
             temp: w_r11,
             dst_old: w_rax
@@ -4755,8 +4757,8 @@ fn test_x64_emit() {
     insns.push((
         Inst::AtomicRmwSeq {
             ty: types::I16,
-            op: inst_common::AtomicRmwOp::And,
-            address: r9,
+            op: inst_common::MachAtomicRmwOp::And,
+            mem: am3.clone(),
             operand: r10,
             temp: w_r11,
             dst_old: w_rax
@@ -4767,8 +4769,8 @@ fn test_x64_emit() {
     insns.push((
         Inst::AtomicRmwSeq {
             ty: types::I32,
-            op: inst_common::AtomicRmwOp::Xchg,
-            address: r9,
+            op: inst_common::MachAtomicRmwOp::Xchg,
+            mem: am3.clone(),
             operand: r10,
             temp: w_r11,
             dst_old: w_rax
@@ -4779,8 +4781,8 @@ fn test_x64_emit() {
     insns.push((
         Inst::AtomicRmwSeq {
             ty: types::I32,
-            op: inst_common::AtomicRmwOp::Umin,
-            address: r9,
+            op: inst_common::MachAtomicRmwOp::Umin,
+            mem: am3.clone(),
             operand: r10,
             temp: w_r11,
             dst_old: w_rax
@@ -4791,8 +4793,8 @@ fn test_x64_emit() {
     insns.push((
         Inst::AtomicRmwSeq {
             ty: types::I64,
-            op: inst_common::AtomicRmwOp::Add,
-            address: r9,
+            op: inst_common::MachAtomicRmwOp::Add,
+            mem: am3.clone(),
             operand: r10,
             temp: w_r11,
             dst_old: w_rax

@@ -2052,13 +2052,19 @@ fn x64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut OperandCol
             mem.get_operands(collector);
         }
-        Inst::AtomicRmwSeq { .. } => {
-            // FIXME: take vreg args, not fixed regs, and just use
-            // reg_fixed_use here.
-            collector.reg_use(regs::r9());
-            collector.reg_use(regs::r10());
-            collector.reg_def(Writable::from_reg(regs::r11()));
-            collector.reg_def(Writable::from_reg(regs::rax()));
+        Inst::AtomicRmwSeq {
+            operand,
+            temp,
+            dst_old,
+            mem,
+            ..
+        } => {
+            collector.reg_late_use(*operand);
+            collector.reg_early_def(*temp);
+            // This `fixed_def` is needed because `CMPXCHG` always uses this
+            // register implicitly.
+            collector.reg_fixed_def(*dst_old, regs::rax());
+            mem.get_operands_late(collector)
         }
         Inst::Ret { rets } => {