AArch64 LSE atomic_rmw support

Rename the existing AtomicRMW to AtomicRMWLoop, and lower atomic_rmw
operations directly — without a loop — when LSE support is available.

Copyright (c) 2021, Arm Limited
This commit is contained in:
Sam Parker
2021-09-10 09:32:56 +01:00
parent d20194fa4c
commit 80d596b055
5 changed files with 605 additions and 22 deletions

View File

@@ -504,6 +504,33 @@ fn enc_dmb_ish() -> u32 {
0xD5033BBF
}
/// Encode an LSE atomic read-modify-write instruction (`ld<op>al{,b,h}`):
/// atomically loads the value at `[rn]` into `rt` and stores the result of
/// `<op>` applied to the old value and `rs` back to memory, with
/// acquire-release semantics. `ty` selects the access size (I8/I16/I32/I64).
fn enc_ldal(ty: Type, op: AtomicRMWOp, rs: Reg, rt: Writable<Reg>, rn: Reg) -> u32 {
    // Rt == 31 would select the ST<op>L alias (store-only, no loaded result),
    // which is not what this encoder is for.
    assert!(machreg_to_gpr(rt.to_reg()) != 31);
    // size field, bits [31:30].
    let sz: u32 = match ty {
        I8 => 0b00,
        I16 => 0b01,
        I32 => 0b10,
        I64 => 0b11,
        _ => unreachable!(),
    };
    // opc field, bits [14:12], selecting the arithmetic/logical operation.
    let opc: u32 = match op {
        AtomicRMWOp::Add => 0b000,
        AtomicRMWOp::Clr => 0b001,
        AtomicRMWOp::Eor => 0b010,
        AtomicRMWOp::Set => 0b011,
        AtomicRMWOp::Smax => 0b100,
        AtomicRMWOp::Smin => 0b101,
        AtomicRMWOp::Umax => 0b110,
        AtomicRMWOp::Umin => 0b111,
    };
    // Base template: size=00, 111000, A=1 R=1 (acquire-release), bit 21 set,
    // Rs/o3/opc/Rn/Rt fields zeroed and filled in below.
    let template: u32 = 0b00_111_000_111_00000_0_000_00_00000_00000;
    template
        | (sz << 30)
        | (machreg_to_gpr(rs) << 16)
        | (opc << 12)
        | (machreg_to_gpr(rn) << 5)
        | machreg_to_gpr(rt.to_reg())
}
fn enc_ldar(ty: Type, rt: Writable<Reg>, rn: Reg) -> u32 {
let sz = match ty {
I64 => 0b11,
@@ -1318,7 +1345,10 @@ impl MachInstEmit for Inst {
} => {
sink.put4(enc_ccmp_imm(size, rn, imm, nzcv, cond));
}
&Inst::AtomicRMW { ty, op } => {
&Inst::AtomicRMW { ty, op, rs, rt, rn } => {
sink.put4(enc_ldal(ty, op, rs, rt, rn));
}
&Inst::AtomicRMWLoop { ty, op } => {
/* Emit this:
again:
ldaxr{,b,h} x/w27, [x25]
@@ -1340,7 +1370,7 @@ impl MachInstEmit for Inst {
so that we simply write in the destination, the "2nd arg for op".
*/
// TODO: We should not hardcode registers here, a better idea would be to
// pass some scratch registers in the AtomicRMW pseudo-instruction, and use those
// pass some scratch registers in the AtomicRMWLoop pseudo-instruction, and use those
let xzr = zero_reg();
let x24 = xreg(24);
let x25 = xreg(25);

View File

@@ -5887,7 +5887,7 @@ fn test_aarch64_binemit() {
));
insns.push((
Inst::AtomicRMW {
Inst::AtomicRMWLoop {
ty: I16,
op: inst_common::AtomicRmwOp::Xor,
},
@@ -5897,6 +5897,359 @@ fn test_aarch64_binemit() {
insns.push((
Inst::AtomicRMW {
ty: I8,
op: AtomicRMWOp::Add,
rs: xreg(1),
rt: writable_xreg(2),
rn: xreg(3),
},
"6200E138",
"ldaddalb w1, w2, [x3]",
));
insns.push((
Inst::AtomicRMW {
ty: I16,
op: AtomicRMWOp::Add,
rs: xreg(4),
rt: writable_xreg(5),
rn: xreg(6),
},
"C500E478",
"ldaddalh w4, w5, [x6]",
));
insns.push((
Inst::AtomicRMW {
ty: I32,
op: AtomicRMWOp::Add,
rs: xreg(7),
rt: writable_xreg(8),
rn: xreg(9),
},
"2801E7B8",
"ldaddal w7, w8, [x9]",
));
insns.push((
Inst::AtomicRMW {
ty: I64,
op: AtomicRMWOp::Add,
rs: xreg(10),
rt: writable_xreg(11),
rn: xreg(12),
},
"8B01EAF8",
"ldaddal x10, x11, [x12]",
));
insns.push((
Inst::AtomicRMW {
ty: I8,
op: AtomicRMWOp::Clr,
rs: xreg(13),
rt: writable_xreg(14),
rn: xreg(15),
},
"EE11ED38",
"ldclralb w13, w14, [x15]",
));
insns.push((
Inst::AtomicRMW {
ty: I16,
op: AtomicRMWOp::Clr,
rs: xreg(16),
rt: writable_xreg(17),
rn: xreg(18),
},
"5112F078",
"ldclralh w16, w17, [x18]",
));
insns.push((
Inst::AtomicRMW {
ty: I32,
op: AtomicRMWOp::Clr,
rs: xreg(19),
rt: writable_xreg(20),
rn: xreg(21),
},
"B412F3B8",
"ldclral w19, w20, [x21]",
));
insns.push((
Inst::AtomicRMW {
ty: I64,
op: AtomicRMWOp::Clr,
rs: xreg(22),
rt: writable_xreg(23),
rn: xreg(24),
},
"1713F6F8",
"ldclral x22, x23, [x24]",
));
insns.push((
Inst::AtomicRMW {
ty: I8,
op: AtomicRMWOp::Eor,
rs: xreg(25),
rt: writable_xreg(26),
rn: xreg(27),
},
"7A23F938",
"ldeoralb w25, w26, [x27]",
));
insns.push((
Inst::AtomicRMW {
ty: I16,
op: AtomicRMWOp::Eor,
rs: xreg(28),
rt: writable_xreg(29),
rn: xreg(30),
},
"DD23FC78",
"ldeoralh w28, fp, [lr]",
));
insns.push((
Inst::AtomicRMW {
ty: I32,
op: AtomicRMWOp::Eor,
rs: xreg(29),
rt: writable_xreg(28),
rn: xreg(27),
},
"7C23FDB8",
"ldeoral fp, w28, [x27]",
));
insns.push((
Inst::AtomicRMW {
ty: I64,
op: AtomicRMWOp::Eor,
rs: xreg(26),
rt: writable_xreg(25),
rn: xreg(24),
},
"1923FAF8",
"ldeoral x26, x25, [x24]",
));
insns.push((
Inst::AtomicRMW {
ty: I8,
op: AtomicRMWOp::Set,
rs: xreg(23),
rt: writable_xreg(22),
rn: xreg(21),
},
"B632F738",
"ldsetalb w23, w22, [x21]",
));
insns.push((
Inst::AtomicRMW {
ty: I16,
op: AtomicRMWOp::Set,
rs: xreg(20),
rt: writable_xreg(19),
rn: xreg(18),
},
"5332F478",
"ldsetalh w20, w19, [x18]",
));
insns.push((
Inst::AtomicRMW {
ty: I32,
op: AtomicRMWOp::Set,
rs: xreg(17),
rt: writable_xreg(16),
rn: xreg(15),
},
"F031F1B8",
"ldsetal w17, w16, [x15]",
));
insns.push((
Inst::AtomicRMW {
ty: I64,
op: AtomicRMWOp::Set,
rs: xreg(14),
rt: writable_xreg(13),
rn: xreg(12),
},
"8D31EEF8",
"ldsetal x14, x13, [x12]",
));
insns.push((
Inst::AtomicRMW {
ty: I8,
op: AtomicRMWOp::Smax,
rs: xreg(11),
rt: writable_xreg(10),
rn: xreg(9),
},
"2A41EB38",
"ldsmaxalb w11, w10, [x9]",
));
insns.push((
Inst::AtomicRMW {
ty: I16,
op: AtomicRMWOp::Smax,
rs: xreg(8),
rt: writable_xreg(7),
rn: xreg(6),
},
"C740E878",
"ldsmaxalh w8, w7, [x6]",
));
insns.push((
Inst::AtomicRMW {
ty: I32,
op: AtomicRMWOp::Smax,
rs: xreg(5),
rt: writable_xreg(4),
rn: xreg(3),
},
"6440E5B8",
"ldsmaxal w5, w4, [x3]",
));
insns.push((
Inst::AtomicRMW {
ty: I64,
op: AtomicRMWOp::Smax,
rs: xreg(2),
rt: writable_xreg(1),
rn: xreg(0),
},
"0140E2F8",
"ldsmaxal x2, x1, [x0]",
));
insns.push((
Inst::AtomicRMW {
ty: I8,
op: AtomicRMWOp::Smin,
rs: xreg(1),
rt: writable_xreg(2),
rn: xreg(3),
},
"6250E138",
"ldsminalb w1, w2, [x3]",
));
insns.push((
Inst::AtomicRMW {
ty: I16,
op: AtomicRMWOp::Smin,
rs: xreg(4),
rt: writable_xreg(5),
rn: xreg(6),
},
"C550E478",
"ldsminalh w4, w5, [x6]",
));
insns.push((
Inst::AtomicRMW {
ty: I32,
op: AtomicRMWOp::Smin,
rs: xreg(7),
rt: writable_xreg(8),
rn: xreg(9),
},
"2851E7B8",
"ldsminal w7, w8, [x9]",
));
insns.push((
Inst::AtomicRMW {
ty: I64,
op: AtomicRMWOp::Smin,
rs: xreg(10),
rt: writable_xreg(11),
rn: xreg(12),
},
"8B51EAF8",
"ldsminal x10, x11, [x12]",
));
insns.push((
Inst::AtomicRMW {
ty: I8,
op: AtomicRMWOp::Umax,
rs: xreg(13),
rt: writable_xreg(14),
rn: xreg(15),
},
"EE61ED38",
"ldumaxalb w13, w14, [x15]",
));
insns.push((
Inst::AtomicRMW {
ty: I16,
op: AtomicRMWOp::Umax,
rs: xreg(16),
rt: writable_xreg(17),
rn: xreg(18),
},
"5162F078",
"ldumaxalh w16, w17, [x18]",
));
insns.push((
Inst::AtomicRMW {
ty: I32,
op: AtomicRMWOp::Umax,
rs: xreg(19),
rt: writable_xreg(20),
rn: xreg(21),
},
"B462F3B8",
"ldumaxal w19, w20, [x21]",
));
insns.push((
Inst::AtomicRMW {
ty: I64,
op: AtomicRMWOp::Umax,
rs: xreg(22),
rt: writable_xreg(23),
rn: xreg(24),
},
"1763F6F8",
"ldumaxal x22, x23, [x24]",
));
insns.push((
Inst::AtomicRMW {
ty: I8,
op: AtomicRMWOp::Umin,
rs: xreg(16),
rt: writable_xreg(17),
rn: xreg(18),
},
"5172F038",
"lduminalb w16, w17, [x18]",
));
insns.push((
Inst::AtomicRMW {
ty: I16,
op: AtomicRMWOp::Umin,
rs: xreg(19),
rt: writable_xreg(20),
rn: xreg(21),
},
"B472F378",
"lduminalh w19, w20, [x21]",
));
insns.push((
Inst::AtomicRMW {
ty: I32,
op: AtomicRMWOp::Umin,
rs: xreg(22),
rt: writable_xreg(23),
rn: xreg(24),
},
"1773F6B8",
"lduminal w22, w23, [x24]",
));
insns.push((
Inst::AtomicRMW {
ty: I64,
op: AtomicRMWOp::Umin,
rs: xreg(25),
rt: writable_xreg(26),
rn: xreg(27),
},
"7A73F9F8",
"lduminal x25, x26, [x27]",
));
insns.push((
Inst::AtomicRMWLoop {
ty: I32,
op: inst_common::AtomicRmwOp::Xchg,
},

View File

@@ -451,6 +451,19 @@ pub enum VecShiftImmOp {
Sshr,
}
/// Atomic read-modify-write operations with acquire-release semantics
///
/// These correspond directly to the LSE `LD<OP>AL{,B,H}` instruction family
/// (see `enc_ldal` and the pretty-printer for the exact mnemonics).
#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
pub enum AtomicRMWOp {
/// Atomic add (`ldaddal`).
Add,
/// Atomic bit clear (`ldclral`). NOTE(review): per the Arm ISA this stores
/// `mem AND NOT(Rs)`, not `mem AND Rs` — lowering a plain `and` to this op
/// requires inverting the source operand first; confirm the lowering does so.
Clr,
/// Atomic exclusive-or (`ldeoral`).
Eor,
/// Atomic bit set, i.e. inclusive-or (`ldsetal`).
Set,
/// Atomic signed maximum (`ldsmaxal`).
Smax,
/// Atomic signed minimum (`ldsminal`).
Smin,
/// Atomic unsigned maximum (`ldumaxal`).
Umax,
/// Atomic unsigned minimum (`lduminal`).
Umin,
}
/// An operation on the bits of a register. This can be paired with several instruction formats
/// below (see `Inst`) in any combination.
#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
@@ -775,11 +788,22 @@ pub enum Inst {
/// x27 (wr) old value
/// x24 (wr) scratch reg; value afterwards has no meaning
/// x28 (wr) scratch reg; value afterwards has no meaning
AtomicRMW {
AtomicRMWLoop {
ty: Type, // I8, I16, I32 or I64
op: inst_common::AtomicRmwOp,
},
/// An atomic read-modify-write operation. These instructions require the
/// Large System Extension (LSE) ISA support. The instructions have acquire-release
/// semantics.
AtomicRMW {
op: AtomicRMWOp,
rs: Reg,
rt: Writable<Reg>,
rn: Reg,
ty: Type,
},
/// An atomic compare-and-swap operation. This instruction is sequentially consistent.
AtomicCAS {
rs: Writable<Reg>,
@@ -788,10 +812,10 @@ pub enum Inst {
ty: Type,
},
/// Similar to AtomicRMW, a compare-and-swap operation implemented using a load-linked
/// Similar to AtomicRMWLoop, a compare-and-swap operation implemented using a load-linked
/// store-conditional loop.
/// This instruction is sequentially consistent.
/// Note that the operand conventions, although very similar to AtomicRMW, are different:
/// Note that the operand conventions, although very similar to AtomicRMWLoop, are different:
///
/// x25 (rd) address
/// x26 (rd) expected value
@@ -1919,13 +1943,18 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
&Inst::CCmpImm { rn, .. } => {
collector.add_use(rn);
}
&Inst::AtomicRMW { .. } => {
&Inst::AtomicRMWLoop { .. } => {
collector.add_use(xreg(25));
collector.add_use(xreg(26));
collector.add_def(writable_xreg(24));
collector.add_def(writable_xreg(27));
collector.add_def(writable_xreg(28));
}
&Inst::AtomicRMW { rs, rt, rn, .. } => {
collector.add_use(rs);
collector.add_def(rt);
collector.add_use(rn);
}
&Inst::AtomicCAS { rs, rt, rn, .. } => {
collector.add_mod(rs);
collector.add_use(rt);
@@ -2561,9 +2590,19 @@ fn aarch64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
&mut Inst::CCmpImm { ref mut rn, .. } => {
map_use(mapper, rn);
}
&mut Inst::AtomicRMW { .. } => {
&mut Inst::AtomicRMWLoop { .. } => {
// There are no vregs to map in this insn.
}
&mut Inst::AtomicRMW {
ref mut rs,
ref mut rt,
ref mut rn,
..
} => {
map_use(mapper, rs);
map_def(mapper, rt);
map_use(mapper, rn);
}
&mut Inst::AtomicCAS {
ref mut rs,
ref mut rt,
@@ -3617,7 +3656,33 @@ impl Inst {
let cond = cond.show_rru(mb_rru);
format!("ccmp {}, {}, {}, {}", rn, imm, nzcv, cond)
}
&Inst::AtomicRMW { ty, op, .. } => {
&Inst::AtomicRMW {
rs, rt, rn, ty, op
} => {
let op = match op {
AtomicRMWOp::Add => "ldaddal",
AtomicRMWOp::Clr => "ldclral",
AtomicRMWOp::Eor => "ldeoral",
AtomicRMWOp::Set => "ldsetal",
AtomicRMWOp::Smax => "ldsmaxal",
AtomicRMWOp::Umax => "ldumaxal",
AtomicRMWOp::Smin => "ldsminal",
AtomicRMWOp::Umin => "lduminal",
};
let size = OperandSize::from_ty(ty);
let rs = show_ireg_sized(rs, mb_rru, size);
let rt = show_ireg_sized(rt.to_reg(), mb_rru, size);
let rn = rn.show_rru(mb_rru);
let ty_suffix = match ty {
I8 => "b",
I16 => "h",
_ => "",
};
format!("{}{} {}, {}, [{}]", op, ty_suffix, rs, rt, rn)
}
&Inst::AtomicRMWLoop { ty, op, .. } => {
format!(
"atomically {{ {}_bits_at_[x25]) {:?}= x26 ; x27 = old_value_at_[x25]; x24,x28 = trash }}",
ty.bits(), op)

View File

@@ -1529,6 +1529,28 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
let mut r_arg2 = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
let ty_access = ty.unwrap();
assert!(is_valid_atomic_transaction_ty(ty_access));
let op = inst_common::AtomicRmwOp::from(ctx.data(insn).atomic_rmw_op().unwrap());
let lse_op = match op {
AtomicRmwOp::Add => Some(AtomicRMWOp::Add),
AtomicRmwOp::And => Some(AtomicRMWOp::Clr),
AtomicRmwOp::Xor => Some(AtomicRMWOp::Eor),
AtomicRmwOp::Or => Some(AtomicRMWOp::Set),
AtomicRmwOp::Smax => Some(AtomicRMWOp::Smax),
AtomicRmwOp::Umax => Some(AtomicRMWOp::Umax),
AtomicRmwOp::Smin => Some(AtomicRMWOp::Smin),
AtomicRmwOp::Umin => Some(AtomicRMWOp::Umin),
_ => None
};
if isa_flags.use_lse() && lse_op.is_some() {
ctx.emit(Inst::AtomicRMW {
op: lse_op.unwrap(),
rs: r_arg2,
rt: r_dst,
rn: r_addr,
ty: ty_access,
});
} else {
// Make sure that both args are in virtual regs, since in effect
// we have to do a parallel copy to get them safely to the AtomicRMW input
// regs, and that's not guaranteed safe if either is in a real reg.
@@ -1537,13 +1559,12 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
// Move the args to the preordained AtomicRMW input regs
ctx.emit(Inst::gen_move(Writable::from_reg(xreg(25)), r_addr, I64));
ctx.emit(Inst::gen_move(Writable::from_reg(xreg(26)), r_arg2, I64));
// Now the AtomicRMW insn itself
let op = inst_common::AtomicRmwOp::from(ctx.data(insn).atomic_rmw_op().unwrap());
ctx.emit(Inst::AtomicRMW { ty: ty_access, op });
ctx.emit(Inst::AtomicRMWLoop { ty: ty_access, op });
// And finally, copy the preordained AtomicRMW output reg to its destination.
ctx.emit(Inst::gen_move(r_dst, xreg(27), I64));
// Also, x24 and x28 are trashed. `fn aarch64_get_regs` must mention that.
}
}
Opcode::AtomicCas => {
let r_dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();

View File

@@ -0,0 +1,114 @@
test compile
target aarch64 has_lse
function %atomic_rmw_add_i64(i64, i64) {
block0(v0: i64, v1: i64):
v2 = atomic_rmw.i64 add v0, v1
return
}
; check: ldaddal x1, x0, [x0]
function %atomic_rmw_add_i32(i32, i32) {
block0(v0: i32, v1: i32):
v2 = atomic_rmw.i32 add v0, v1
return
}
; check: ldaddal w1, w0, [x0]
function %atomic_rmw_and_i64(i64, i64) {
block0(v0: i64, v1: i64):
v2 = atomic_rmw.i64 and v0, v1
return
}
; check: ldclral x1, x0, [x0]
function %atomic_rmw_and_i32(i32, i32) {
block0(v0: i32, v1: i32):
v2 = atomic_rmw.i32 and v0, v1
return
}
; check: ldclral w1, w0, [x0]
function %atomic_rmw_or_i64(i64, i64) {
block0(v0: i64, v1: i64):
v2 = atomic_rmw.i64 or v0, v1
return
}
; check: ldsetal x1, x0, [x0]
function %atomic_rmw_or_i32(i32, i32) {
block0(v0: i32, v1: i32):
v2 = atomic_rmw.i32 or v0, v1
return
}
; check: ldsetal w1, w0, [x0]
function %atomic_rmw_xor_i64(i64, i64) {
block0(v0: i64, v1: i64):
v2 = atomic_rmw.i64 xor v0, v1
return
}
; check: ldeoral x1, x0, [x0]
function %atomic_rmw_xor_i32(i32, i32) {
block0(v0: i32, v1: i32):
v2 = atomic_rmw.i32 xor v0, v1
return
}
; check: ldeoral w1, w0, [x0]
function %atomic_rmw_smax_i64(i64, i64) {
block0(v0: i64, v1: i64):
v2 = atomic_rmw.i64 smax v0, v1
return
}
; check: ldsmaxal x1, x0, [x0]
function %atomic_rmw_smax_i32(i32, i32) {
block0(v0: i32, v1: i32):
v2 = atomic_rmw.i32 smax v0, v1
return
}
; check: ldsmaxal w1, w0, [x0]
function %atomic_rmw_umax_i64(i64, i64) {
block0(v0: i64, v1: i64):
v2 = atomic_rmw.i64 umax v0, v1
return
}
; check: ldumaxal x1, x0, [x0]
function %atomic_rmw_umax_i32(i32, i32) {
block0(v0: i32, v1: i32):
v2 = atomic_rmw.i32 umax v0, v1
return
}
; check: ldumaxal w1, w0, [x0]
function %atomic_rmw_smin_i64(i64, i64) {
block0(v0: i64, v1: i64):
v2 = atomic_rmw.i64 smin v0, v1
return
}
; check: ldsminal x1, x0, [x0]
function %atomic_rmw_smin_i32(i32, i32) {
block0(v0: i32, v1: i32):
v2 = atomic_rmw.i32 smin v0, v1
return
}
; check: ldsminal w1, w0, [x0]
function %atomic_rmw_umin_i64(i64, i64) {
block0(v0: i64, v1: i64):
v2 = atomic_rmw.i64 umin v0, v1
return
}
; check: lduminal x1, x0, [x0]
function %atomic_rmw_umin_i32(i32, i32) {
block0(v0: i32, v1: i32):
v2 = atomic_rmw.i32 umin v0, v1
return
}
; check: lduminal w1, w0, [x0]