diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
index 74ec299bed..198fa67e37 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
@@ -504,6 +504,35 @@ fn enc_dmb_ish() -> u32 {
     0xD5033BBF
 }
 
+/// Encode an LSE atomic read-modify-write: `LD<op>AL{,B,H} <Rs>, <Rt>, [<Rn>]`.
+fn enc_ldal(ty: Type, op: AtomicRMWOp, rs: Reg, rt: Writable<Reg>, rn: Reg) -> u32 {
+    // Rt == 31 would select the ST<op> (no-result) form, so reject it.
+    assert!(machreg_to_gpr(rt.to_reg()) != 31);
+    let sz = match ty {
+        I64 => 0b11,
+        I32 => 0b10,
+        I16 => 0b01,
+        I8 => 0b00,
+        _ => unreachable!(),
+    };
+    let op = match op {
+        AtomicRMWOp::Add => 0b000,
+        AtomicRMWOp::Clr => 0b001,
+        AtomicRMWOp::Eor => 0b010,
+        AtomicRMWOp::Set => 0b011,
+        AtomicRMWOp::Smax => 0b100,
+        AtomicRMWOp::Smin => 0b101,
+        AtomicRMWOp::Umax => 0b110,
+        AtomicRMWOp::Umin => 0b111,
+    };
+    0b00_111_000_111_00000_0_000_00_00000_00000
+        | (sz << 30)
+        | (machreg_to_gpr(rs) << 16)
+        | (op << 12)
+        | (machreg_to_gpr(rn) << 5)
+        | machreg_to_gpr(rt.to_reg())
+}
+
 fn enc_ldar(ty: Type, rt: Writable<Reg>, rn: Reg) -> u32 {
     let sz = match ty {
         I64 => 0b11,
@@ -1318,7 +1347,10 @@ impl MachInstEmit for Inst {
             } => {
                 sink.put4(enc_ccmp_imm(size, rn, imm, nzcv, cond));
             }
-            &Inst::AtomicRMW { ty, op } => {
+            &Inst::AtomicRMW { ty, op, rs, rt, rn } => {
+                sink.put4(enc_ldal(ty, op, rs, rt, rn));
+            }
+            &Inst::AtomicRMWLoop { ty, op } => {
                 /* Emit this:
                    again:
                     ldaxr{,b,h} x/w27, [x25]
@@ -1340,7 +1372,7 @@ impl MachInstEmit for Inst {
                    so that we simply write in the destination, the "2nd arg for op".
                 */
                 // TODO: We should not hardcode registers here, a better idea would be to
-                // pass some scratch registers in the AtomicRMW pseudo-instruction, and use those
+                // pass some scratch registers in the AtomicRMWLoop pseudo-instruction, and use those
                 let xzr = zero_reg();
                 let x24 = xreg(24);
                 let x25 = xreg(25);
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
index 9e45c6795c..bd4df557ec 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
@@ -5887,7 +5887,7 @@ fn test_aarch64_binemit() {
     ));
 
     insns.push((
-        Inst::AtomicRMW {
+        Inst::AtomicRMWLoop {
             ty: I16,
             op: inst_common::AtomicRmwOp::Xor,
         },
@@ -5897,6 +5897,359 @@ fn test_aarch64_binemit() {
     insns.push((
         Inst::AtomicRMW {
+            ty: I8,
+            op: AtomicRMWOp::Add,
+            rs: xreg(1),
+            rt: writable_xreg(2),
+            rn: xreg(3),
+        },
+        "6200E138",
+        "ldaddalb w1, w2, [x3]",
+    ));
+    insns.push((
+        Inst::AtomicRMW {
+            ty: I16,
+            op: AtomicRMWOp::Add,
+            rs: xreg(4),
+            rt: writable_xreg(5),
+            rn: xreg(6),
+        },
+        "C500E478",
+        "ldaddalh w4, w5, [x6]",
+    ));
+    insns.push((
+        Inst::AtomicRMW {
+            ty: I32,
+            op: AtomicRMWOp::Add,
+            rs: xreg(7),
+            rt: writable_xreg(8),
+            rn: xreg(9),
+        },
+        "2801E7B8",
+        "ldaddal w7, w8, [x9]",
+    ));
+    insns.push((
+        Inst::AtomicRMW {
+            ty: I64,
+            op: AtomicRMWOp::Add,
+            rs: xreg(10),
+            rt: writable_xreg(11),
+            rn: xreg(12),
+        },
+        "8B01EAF8",
+        "ldaddal x10, x11, [x12]",
+    ));
+    insns.push((
+        Inst::AtomicRMW {
+            ty: I8,
+            op: AtomicRMWOp::Clr,
+            rs: xreg(13),
+            rt: writable_xreg(14),
+            rn: xreg(15),
+        },
+        "EE11ED38",
+        "ldclralb w13, w14, [x15]",
+    ));
+    insns.push((
+        Inst::AtomicRMW {
+            ty: I16,
+            op: AtomicRMWOp::Clr,
+            rs: xreg(16),
+            rt: writable_xreg(17),
+            rn: xreg(18),
+        },
+        "5112F078",
+        "ldclralh w16, w17, [x18]",
+    ));
+    insns.push((
+        Inst::AtomicRMW {
+            ty: I32,
+            op: AtomicRMWOp::Clr,
+            rs: xreg(19),
+            rt: writable_xreg(20),
+            rn: xreg(21),
+        },
+        "B412F3B8",
+        "ldclral w19, w20, [x21]",
+    ));
+    insns.push((
+        Inst::AtomicRMW {
+            ty: I64,
+            op: AtomicRMWOp::Clr,
+            rs: xreg(22),
+            rt: writable_xreg(23),
+            rn: xreg(24),
+        },
+        "1713F6F8",
+        "ldclral x22, x23, [x24]",
+    ));
+    insns.push((
+        Inst::AtomicRMW {
+            ty: I8,
+            op: AtomicRMWOp::Eor,
+            rs: xreg(25),
+            rt: writable_xreg(26),
+            rn: xreg(27),
+        },
+        "7A23F938",
+        "ldeoralb w25, w26, [x27]",
+    ));
+    insns.push((
+        Inst::AtomicRMW {
+            ty: I16,
+            op: AtomicRMWOp::Eor,
+            rs: xreg(28),
+            rt: writable_xreg(29),
+            rn: xreg(30),
+        },
+        "DD23FC78",
+        "ldeoralh w28, fp, [lr]",
+    ));
+    insns.push((
+        Inst::AtomicRMW {
+            ty: I32,
+            op: AtomicRMWOp::Eor,
+            rs: xreg(29),
+            rt: writable_xreg(28),
+            rn: xreg(27),
+        },
+        "7C23FDB8",
+        "ldeoral fp, w28, [x27]",
+    ));
+    insns.push((
+        Inst::AtomicRMW {
+            ty: I64,
+            op: AtomicRMWOp::Eor,
+            rs: xreg(26),
+            rt: writable_xreg(25),
+            rn: xreg(24),
+        },
+        "1923FAF8",
+        "ldeoral x26, x25, [x24]",
+    ));
+    insns.push((
+        Inst::AtomicRMW {
+            ty: I8,
+            op: AtomicRMWOp::Set,
+            rs: xreg(23),
+            rt: writable_xreg(22),
+            rn: xreg(21),
+        },
+        "B632F738",
+        "ldsetalb w23, w22, [x21]",
+    ));
+    insns.push((
+        Inst::AtomicRMW {
+            ty: I16,
+            op: AtomicRMWOp::Set,
+            rs: xreg(20),
+            rt: writable_xreg(19),
+            rn: xreg(18),
+        },
+        "5332F478",
+        "ldsetalh w20, w19, [x18]",
+    ));
+    insns.push((
+        Inst::AtomicRMW {
+            ty: I32,
+            op: AtomicRMWOp::Set,
+            rs: xreg(17),
+            rt: writable_xreg(16),
+            rn: xreg(15),
+        },
+        "F031F1B8",
+        "ldsetal w17, w16, [x15]",
+    ));
+    insns.push((
+        Inst::AtomicRMW {
+            ty: I64,
+            op: AtomicRMWOp::Set,
+            rs: xreg(14),
+            rt: writable_xreg(13),
+            rn: xreg(12),
+        },
+        "8D31EEF8",
+        "ldsetal x14, x13, [x12]",
+    ));
+    insns.push((
+        Inst::AtomicRMW {
+            ty: I8,
+            op: AtomicRMWOp::Smax,
+            rs: xreg(11),
+            rt: writable_xreg(10),
+            rn: xreg(9),
+        },
+        "2A41EB38",
+        "ldsmaxalb w11, w10, [x9]",
+    ));
+    insns.push((
+        Inst::AtomicRMW {
+            ty: I16,
+            op: AtomicRMWOp::Smax,
+            rs: xreg(8),
+            rt: writable_xreg(7),
+            rn: xreg(6),
+        },
+        "C740E878",
+        "ldsmaxalh w8, w7, [x6]",
+    ));
+    insns.push((
+        Inst::AtomicRMW {
+            ty: I32,
+            op: AtomicRMWOp::Smax,
+            rs: xreg(5),
+            rt: writable_xreg(4),
+            rn: xreg(3),
+        },
+        "6440E5B8",
+        "ldsmaxal w5, w4, [x3]",
+    ));
+    insns.push((
+        Inst::AtomicRMW {
+            ty: I64,
+            op: AtomicRMWOp::Smax,
+            rs: xreg(2),
+            rt: writable_xreg(1),
+            rn: xreg(0),
+        },
+        "0140E2F8",
+        "ldsmaxal x2, x1, [x0]",
+    ));
+    insns.push((
+        Inst::AtomicRMW {
+            ty: I8,
+            op: AtomicRMWOp::Smin,
+            rs: xreg(1),
+            rt: writable_xreg(2),
+            rn: xreg(3),
+        },
+        "6250E138",
+        "ldsminalb w1, w2, [x3]",
+    ));
+    insns.push((
+        Inst::AtomicRMW {
+            ty: I16,
+            op: AtomicRMWOp::Smin,
+            rs: xreg(4),
+            rt: writable_xreg(5),
+            rn: xreg(6),
+        },
+        "C550E478",
+        "ldsminalh w4, w5, [x6]",
+    ));
+    insns.push((
+        Inst::AtomicRMW {
+            ty: I32,
+            op: AtomicRMWOp::Smin,
+            rs: xreg(7),
+            rt: writable_xreg(8),
+            rn: xreg(9),
+        },
+        "2851E7B8",
+        "ldsminal w7, w8, [x9]",
+    ));
+    insns.push((
+        Inst::AtomicRMW {
+            ty: I64,
+            op: AtomicRMWOp::Smin,
+            rs: xreg(10),
+            rt: writable_xreg(11),
+            rn: xreg(12),
+        },
+        "8B51EAF8",
+        "ldsminal x10, x11, [x12]",
+    ));
+    insns.push((
+        Inst::AtomicRMW {
+            ty: I8,
+            op: AtomicRMWOp::Umax,
+            rs: xreg(13),
+            rt: writable_xreg(14),
+            rn: xreg(15),
+        },
+        "EE61ED38",
+        "ldumaxalb w13, w14, [x15]",
+    ));
+    insns.push((
+        Inst::AtomicRMW {
+            ty: I16,
+            op: AtomicRMWOp::Umax,
+            rs: xreg(16),
+            rt: writable_xreg(17),
+            rn: xreg(18),
+        },
+        "5162F078",
+        "ldumaxalh w16, w17, [x18]",
+    ));
+    insns.push((
+        Inst::AtomicRMW {
+            ty: I32,
+            op: AtomicRMWOp::Umax,
+            rs: xreg(19),
+            rt: writable_xreg(20),
+            rn: xreg(21),
+        },
+        "B462F3B8",
+        "ldumaxal w19, w20, [x21]",
+    ));
+    insns.push((
+        Inst::AtomicRMW {
+            ty: I64,
+            op: AtomicRMWOp::Umax,
+            rs: xreg(22),
+            rt: writable_xreg(23),
+            rn: xreg(24),
+        },
+        "1763F6F8",
+        "ldumaxal x22, x23, [x24]",
+    ));
+    insns.push((
+        Inst::AtomicRMW {
+            ty: I8,
+            op: AtomicRMWOp::Umin,
+            rs: xreg(16),
+            rt: writable_xreg(17),
+            rn: xreg(18),
+        },
+        "5172F038",
+        "lduminalb w16, w17, [x18]",
+    ));
+    insns.push((
+        Inst::AtomicRMW {
+            ty: I16,
+            op: AtomicRMWOp::Umin,
+            rs: xreg(19),
+            rt: writable_xreg(20),
+            rn: xreg(21),
+        },
+        "B472F378",
+        "lduminalh w19, w20, [x21]",
+    ));
+    insns.push((
+        Inst::AtomicRMW {
+            ty: I32,
+            op: AtomicRMWOp::Umin,
+            rs: xreg(22),
+            rt: writable_xreg(23),
+            rn: xreg(24),
+        },
+        "1773F6B8",
+        "lduminal w22, w23, [x24]",
+    ));
+    insns.push((
+        Inst::AtomicRMW {
+            ty: I64,
+            op: AtomicRMWOp::Umin,
+            rs: xreg(25),
+            rt: writable_xreg(26),
+            rn: xreg(27),
+        },
+        "7A73F9F8",
+        "lduminal x25, x26, [x27]",
+    ));
+
+    insns.push((
+        Inst::AtomicRMWLoop {
             ty: I32,
             op: inst_common::AtomicRmwOp::Xchg,
         },
diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
index ac4d958bb1..f97cd75ef8 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
@@ -451,6 +451,19 @@ pub enum VecShiftImmOp {
     Sshr,
 }
 
+/// Atomic read-modify-write operations with acquire-release semantics.
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
+pub enum AtomicRMWOp {
+    Add,
+    Clr,
+    Eor,
+    Set,
+    Smax,
+    Smin,
+    Umax,
+    Umin,
+}
+
 /// An operation on the bits of a register. This can be paired with several instruction formats
 /// below (see `Inst`) in any combination.
 #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
@@ -775,11 +788,22 @@ pub enum Inst {
     /// x27 (wr) old value
     /// x24 (wr) scratch reg; value afterwards has no meaning
     /// x28 (wr) scratch reg; value afterwards has no meaning
-    AtomicRMW {
+    AtomicRMWLoop {
         ty: Type, // I8, I16, I32 or I64
         op: inst_common::AtomicRmwOp,
     },
 
+    /// An atomic read-modify-write operation. These instructions require the
+    /// Large System Extensions (LSE) and have acquire-release
+    /// semantics.
+    AtomicRMW {
+        op: AtomicRMWOp,
+        rs: Reg,
+        rt: Writable<Reg>,
+        rn: Reg,
+        ty: Type,
+    },
+
     /// An atomic compare-and-swap operation. This instruction is sequentially consistent.
     AtomicCAS {
         rs: Writable<Reg>,
         rt: Reg,
         rn: Reg,
         ty: Type,
     },
 
@@ -788,10 +812,10 @@ pub enum Inst {
-    /// Similar to AtomicRMW, a compare-and-swap operation implemented using a load-linked
+    /// Similar to AtomicRMWLoop, a compare-and-swap operation implemented using a load-linked
     /// store-conditional loop.
     /// This instruction is sequentially consistent.
-    /// Note that the operand conventions, although very similar to AtomicRMW, are different:
+    /// Note that the operand conventions, although very similar to AtomicRMWLoop, are different:
     ///
     /// x25 (rd) address
     /// x26 (rd) expected value
@@ -1919,13 +1943,18 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
         &Inst::CCmpImm { rn, .. } => {
             collector.add_use(rn);
         }
-        &Inst::AtomicRMW { .. } => {
+        &Inst::AtomicRMWLoop { .. } => {
             collector.add_use(xreg(25));
             collector.add_use(xreg(26));
             collector.add_def(writable_xreg(24));
             collector.add_def(writable_xreg(27));
             collector.add_def(writable_xreg(28));
         }
+        &Inst::AtomicRMW { rs, rt, rn, .. } => {
+            collector.add_use(rs);
+            collector.add_def(rt);
+            collector.add_use(rn);
+        }
         &Inst::AtomicCAS { rs, rt, rn, .. } => {
             collector.add_mod(rs);
             collector.add_use(rt);
@@ -2561,9 +2590,19 @@ fn aarch64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
         &mut Inst::CCmpImm { ref mut rn, .. } => {
             map_use(mapper, rn);
         }
-        &mut Inst::AtomicRMW { .. } => {
+        &mut Inst::AtomicRMWLoop { .. } => {
             // There are no vregs to map in this insn.
         }
+        &mut Inst::AtomicRMW {
+            ref mut rs,
+            ref mut rt,
+            ref mut rn,
+            ..
+        } => {
+            map_use(mapper, rs);
+            map_def(mapper, rt);
+            map_use(mapper, rn);
+        }
         &mut Inst::AtomicCAS {
             ref mut rs,
             ref mut rt,
@@ -3617,7 +3656,33 @@ impl Inst {
                 let cond = cond.show_rru(mb_rru);
                 format!("ccmp {}, {}, {}, {}", rn, imm, nzcv, cond)
             }
-            &Inst::AtomicRMW { ty, op, .. } => {
+            &Inst::AtomicRMW {
+                rs, rt, rn, ty, op,
+            } => {
+                let op = match op {
+                    AtomicRMWOp::Add => "ldaddal",
+                    AtomicRMWOp::Clr => "ldclral",
+                    AtomicRMWOp::Eor => "ldeoral",
+                    AtomicRMWOp::Set => "ldsetal",
+                    AtomicRMWOp::Smax => "ldsmaxal",
+                    AtomicRMWOp::Umax => "ldumaxal",
+                    AtomicRMWOp::Smin => "ldsminal",
+                    AtomicRMWOp::Umin => "lduminal",
+                };
+
+                let size = OperandSize::from_ty(ty);
+                let rs = show_ireg_sized(rs, mb_rru, size);
+                let rt = show_ireg_sized(rt.to_reg(), mb_rru, size);
+                let rn = rn.show_rru(mb_rru);
+
+                let ty_suffix = match ty {
+                    I8 => "b",
+                    I16 => "h",
+                    _ => "",
+                };
+                format!("{}{} {}, {}, [{}]", op, ty_suffix, rs, rt, rn)
+            }
+            &Inst::AtomicRMWLoop { ty, op, .. } => {
                 format!(
                     "atomically {{ {}_bits_at_[x25]) {:?}= x26 ; x27 = old_value_at_[x25]; x24,x28 = trash }}",
                     ty.bits(), op)
diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
index 10c6807555..c743f642a4 100644
--- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -1529,20 +1529,41 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             let mut r_arg2 = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
             let ty_access = ty.unwrap();
             assert!(is_valid_atomic_transaction_ty(ty_access));
-            // Make sure that both args are in virtual regs, since in effect
-            // we have to do a parallel copy to get them safely to the AtomicRMW input
-            // regs, and that's not guaranteed safe if either is in a real reg.
-            r_addr = ctx.ensure_in_vreg(r_addr, I64);
-            r_arg2 = ctx.ensure_in_vreg(r_arg2, I64);
-            // Move the args to the preordained AtomicRMW input regs
-            ctx.emit(Inst::gen_move(Writable::from_reg(xreg(25)), r_addr, I64));
-            ctx.emit(Inst::gen_move(Writable::from_reg(xreg(26)), r_arg2, I64));
-            // Now the AtomicRMW insn itself
+
             let op = inst_common::AtomicRmwOp::from(ctx.data(insn).atomic_rmw_op().unwrap());
-            ctx.emit(Inst::AtomicRMW { ty: ty_access, op });
-            // And finally, copy the preordained AtomicRMW output reg to its destination.
-            ctx.emit(Inst::gen_move(r_dst, xreg(27), I64));
-            // Also, x24 and x28 are trashed. `fn aarch64_get_regs` must mention that.
+            let lse_op = match op {
+                AtomicRmwOp::Add => Some(AtomicRMWOp::Add),
+                AtomicRmwOp::And => Some(AtomicRMWOp::Clr),
+                AtomicRmwOp::Xor => Some(AtomicRMWOp::Eor),
+                AtomicRmwOp::Or => Some(AtomicRMWOp::Set),
+                AtomicRmwOp::Smax => Some(AtomicRMWOp::Smax),
+                AtomicRmwOp::Umax => Some(AtomicRMWOp::Umax),
+                AtomicRmwOp::Smin => Some(AtomicRMWOp::Smin),
+                AtomicRmwOp::Umin => Some(AtomicRMWOp::Umin),
+                _ => None,
+            };
+            if isa_flags.use_lse() && lse_op.is_some() {
+                ctx.emit(Inst::AtomicRMW {
+                    op: lse_op.unwrap(),
+                    rs: r_arg2,
+                    rt: r_dst,
+                    rn: r_addr,
+                    ty: ty_access,
+                });
+            } else {
+                // Make sure that both args are in virtual regs, since in effect
+                // we have to do a parallel copy to get them safely to the AtomicRMWLoop input
+                // regs, and that's not guaranteed safe if either is in a real reg.
+                r_addr = ctx.ensure_in_vreg(r_addr, I64);
+                r_arg2 = ctx.ensure_in_vreg(r_arg2, I64);
+                // Move the args to the preordained AtomicRMWLoop input regs
+                ctx.emit(Inst::gen_move(Writable::from_reg(xreg(25)), r_addr, I64));
+                ctx.emit(Inst::gen_move(Writable::from_reg(xreg(26)), r_arg2, I64));
+                ctx.emit(Inst::AtomicRMWLoop { ty: ty_access, op });
+                // And finally, copy the preordained AtomicRMWLoop output reg to its destination.
+                ctx.emit(Inst::gen_move(r_dst, xreg(27), I64));
+                // Also, x24 and x28 are trashed. `fn aarch64_get_regs` must mention that.
+            }
         }
 
         Opcode::AtomicCas => {
diff --git a/cranelift/filetests/filetests/isa/aarch64/atomic-rmw-lse.clif b/cranelift/filetests/filetests/isa/aarch64/atomic-rmw-lse.clif
new file mode 100644
index 0000000000..9157c99977
--- /dev/null
+++ b/cranelift/filetests/filetests/isa/aarch64/atomic-rmw-lse.clif
@@ -0,0 +1,114 @@
+test compile
+target aarch64 has_lse
+
+function %atomic_rmw_add_i64(i64, i64) {
+block0(v0: i64, v1: i64):
+    v2 = atomic_rmw.i64 add v0, v1
+    return
+}
+; check: ldaddal x1, x0, [x0]
+
+function %atomic_rmw_add_i32(i32, i32) {
+block0(v0: i32, v1: i32):
+    v2 = atomic_rmw.i32 add v0, v1
+    return
+}
+; check: ldaddal w1, w0, [x0]
+
+function %atomic_rmw_and_i64(i64, i64) {
+block0(v0: i64, v1: i64):
+    v2 = atomic_rmw.i64 and v0, v1
+    return
+}
+; check: ldclral x1, x0, [x0]
+
+function %atomic_rmw_and_i32(i32, i32) {
+block0(v0: i32, v1: i32):
+    v2 = atomic_rmw.i32 and v0, v1
+    return
+}
+; check: ldclral w1, w0, [x0]
+
+function %atomic_rmw_or_i64(i64, i64) {
+block0(v0: i64, v1: i64):
+    v2 = atomic_rmw.i64 or v0, v1
+    return
+}
+; check: ldsetal x1, x0, [x0]
+
+function %atomic_rmw_or_i32(i32, i32) {
+block0(v0: i32, v1: i32):
+    v2 = atomic_rmw.i32 or v0, v1
+    return
+}
+; check: ldsetal w1, w0, [x0]
+
+function %atomic_rmw_xor_i64(i64, i64) {
+block0(v0: i64, v1: i64):
+    v2 = atomic_rmw.i64 xor v0, v1
+    return
+}
+; check: ldeoral x1, x0, [x0]
+
+function %atomic_rmw_xor_i32(i32, i32) {
+block0(v0: i32, v1: i32):
+    v2 = atomic_rmw.i32 xor v0, v1
+    return
+}
+; check: ldeoral w1, w0, [x0]
+
+function %atomic_rmw_smax_i64(i64, i64) {
+block0(v0: i64, v1: i64):
+    v2 = atomic_rmw.i64 smax v0, v1
+    return
+}
+; check: ldsmaxal x1, x0, [x0]
+
+function %atomic_rmw_smax_i32(i32, i32) {
+block0(v0: i32, v1: i32):
+    v2 = atomic_rmw.i32 smax v0, v1
+    return
+}
+; check: ldsmaxal w1, w0, [x0]
+
+function %atomic_rmw_umax_i64(i64, i64) {
+block0(v0: i64, v1: i64):
+    v2 = atomic_rmw.i64 umax v0, v1
+    return
+}
+; check: ldumaxal x1, x0, [x0]
+
+function %atomic_rmw_umax_i32(i32, i32) {
+block0(v0: i32, v1: i32):
+    v2 = atomic_rmw.i32 umax v0, v1
+    return
+}
+; check: ldumaxal w1, w0, [x0]
+
+function %atomic_rmw_smin_i64(i64, i64) {
+block0(v0: i64, v1: i64):
+    v2 = atomic_rmw.i64 smin v0, v1
+    return
+}
+; check: ldsminal x1, x0, [x0]
+
+function %atomic_rmw_smin_i32(i32, i32) {
+block0(v0: i32, v1: i32):
+    v2 = atomic_rmw.i32 smin v0, v1
+    return
+}
+; check: ldsminal w1, w0, [x0]
+
+function %atomic_rmw_umin_i64(i64, i64) {
+block0(v0: i64, v1: i64):
+    v2 = atomic_rmw.i64 umin v0, v1
+    return
+}
+; check: lduminal x1, x0, [x0]
+
+function %atomic_rmw_umin_i32(i32, i32) {
+block0(v0: i32, v1: i32):
+    v2 = atomic_rmw.i32 umin v0, v1
+    return
+}
+; check: lduminal w1, w0, [x0]
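The hex strings in the emit tests above are the instruction words in little-endian byte order. As a quick cross-check of the bit layout outside the Cranelift tree, here is a self-contained sketch of the same packing. It is not part of the patch: it uses plain `u32` register numbers in place of Cranelift's `Reg`/`Writable<Reg>` wrappers and takes the size field directly instead of matching on `Type`, so its names and signature are illustrative only.

```rust
/// The LSE read-modify-write opcodes, mirroring `AtomicRMWOp` in the patch.
#[allow(dead_code)]
#[derive(Copy, Clone)]
enum AtomicRMWOp {
    Add, Clr, Eor, Set, Smax, Smin, Umax, Umin,
}

/// Pack an `LD<op>AL{,B,H}` word. `size` is log2 of the access width in
/// bytes (0 = byte ... 3 = doubleword); `rs`, `rt`, `rn` are GPR numbers.
fn enc_ldal(size: u32, op: AtomicRMWOp, rs: u32, rt: u32, rn: u32) -> u32 {
    assert!(size <= 0b11 && rs < 32 && rt < 32 && rn < 32);
    // Rt == 31 selects the ST<op> (no-result) form, which the patch rejects.
    assert!(rt != 31);
    let opc = match op {
        AtomicRMWOp::Add => 0b000,
        AtomicRMWOp::Clr => 0b001,
        AtomicRMWOp::Eor => 0b010,
        AtomicRMWOp::Set => 0b011,
        AtomicRMWOp::Smax => 0b100,
        AtomicRMWOp::Smin => 0b101,
        AtomicRMWOp::Umax => 0b110,
        AtomicRMWOp::Umin => 0b111,
    };
    // size:2 | 111000 | A:1 R:1 | 1 | Rs:5 | o3:1 opc:3 | 00 | Rn:5 | Rt:5
    0b00_111_000_111_00000_0_000_00_00000_00000
        | (size << 30)
        | (rs << 16)
        | (opc << 12)
        | (rn << 5)
        | rt
}

fn main() {
    // `ldaddal x10, x11, [x12]` -- the emit test above expects the bytes
    // "8B01EAF8", i.e. the little-endian encoding of the word 0xF8EA018B.
    let insn = enc_ldal(0b11, AtomicRMWOp::Add, 10, 11, 12);
    assert_eq!(insn, 0xF8EA018B);
    assert_eq!(insn.to_le_bytes(), [0x8B, 0x01, 0xEA, 0xF8]);
    println!("{:08X}", insn);
}
```

Running it prints `F8EA018B`, matching the `ldaddal x10, x11, [x12]` test vector above.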