Merge pull request #3128 from sparker-arm/aarch64-atomics

Re-implement AArch64 atomic loads and stores
Chris Fallin, 2021-08-06 14:38:25 -07:00, committed by GitHub
7 changed files with 436 additions and 183 deletions
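In outline: sequentially consistent atomic loads and stores were previously lowered to plain memory accesses bracketed by `dmb ish` fences, and the AtomicRMW/AtomicCASLoop sequences wrapped `ldxr`/`stxr` loops in the same fences. They now use the acquire/release instructions instead (`ldar`/`stlr` for the plain accesses, `ldaxr`/`stlxr` inside the loops), which lets every explicit fence on these paths be dropped. As a sketch (register choices are illustrative; the real ones come from lowering):

    atomic_load.i64   before:  dmb ish ; ldr x0, [x0]      after:  ldar x0, [x0]
    atomic_store.i64  before:  str x0, [x1] ; dmb ish      after:  stlr x0, [x1]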


@@ -498,7 +498,7 @@ fn enc_dmb_ish() -> u32 {
     0xD5033BBF
 }
 
-fn enc_ldxr(ty: Type, rt: Writable<Reg>, rn: Reg) -> u32 {
+fn enc_ldar(ty: Type, rt: Writable<Reg>, rn: Reg) -> u32 {
     let sz = match ty {
         I64 => 0b11,
         I32 => 0b10,
@@ -506,13 +506,13 @@ fn enc_ldxr(ty: Type, rt: Writable<Reg>, rn: Reg) -> u32 {
         I8 => 0b00,
         _ => unreachable!(),
     };
-    0b00001000_01011111_01111100_00000000
+    0b00_001000_1_1_0_11111_1_11111_00000_00000
         | (sz << 30)
         | (machreg_to_gpr(rn) << 5)
         | machreg_to_gpr(rt.to_reg())
 }
 
-fn enc_stxr(ty: Type, rs: Writable<Reg>, rt: Reg, rn: Reg) -> u32 {
+fn enc_stlr(ty: Type, rt: Reg, rn: Reg) -> u32 {
     let sz = match ty {
         I64 => 0b11,
         I32 => 0b10,
@@ -520,7 +520,35 @@ fn enc_stxr(ty: Type, rs: Writable<Reg>, rt: Reg, rn: Reg) -> u32 {
         I8 => 0b00,
         _ => unreachable!(),
     };
-    0b00001000_00000000_01111100_00000000
+    0b00_001000_100_11111_1_11111_00000_00000
+        | (sz << 30)
+        | (machreg_to_gpr(rn) << 5)
+        | machreg_to_gpr(rt)
+}
+
+fn enc_ldaxr(ty: Type, rt: Writable<Reg>, rn: Reg) -> u32 {
+    let sz = match ty {
+        I64 => 0b11,
+        I32 => 0b10,
+        I16 => 0b01,
+        I8 => 0b00,
+        _ => unreachable!(),
+    };
+    0b00_001000_0_1_0_11111_1_11111_00000_00000
+        | (sz << 30)
+        | (machreg_to_gpr(rn) << 5)
+        | machreg_to_gpr(rt.to_reg())
+}
+
+fn enc_stlxr(ty: Type, rs: Writable<Reg>, rt: Reg, rn: Reg) -> u32 {
+    let sz = match ty {
+        I64 => 0b11,
+        I32 => 0b10,
+        I16 => 0b01,
+        I8 => 0b00,
+        _ => unreachable!(),
+    };
+    0b00_001000_000_00000_1_11111_00000_00000
         | (sz << 30)
         | (machreg_to_gpr(rs.to_reg()) << 16)
         | (machreg_to_gpr(rn) << 5)
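A standalone sketch of how the acquire-load encoding above composes, using plain u32 register numbers in place of machreg_to_gpr/Writable (an assumption of this sketch, not the backend's API). It reproduces two of the binemit test vectors further down ("FCFCDFC8" and "87FFDF08" are little-endian byte strings):

// The fixed bits are the ones in enc_ldar's literal; the access size sits in
// bits [31:30], Rn in [9:5] and Rt in [4:0].
fn enc_ldar_sketch(sz: u32, rt: u32, rn: u32) -> u32 {
    0b00_001000_1_1_0_11111_1_11111_00000_00000 | (sz << 30) | (rn << 5) | rt
}

fn main() {
    assert_eq!(enc_ldar_sketch(0b11, 28, 7), 0xC8DFFCFC); // ldar x28, [x7]
    assert_eq!(enc_ldar_sketch(0b00, 7, 28), 0x08DFFF87); // ldarb w7, [x28]
}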
@@ -1286,20 +1314,18 @@ impl MachInstEmit for Inst {
         }
         &Inst::AtomicRMW { ty, op } => {
             /* Emit this:
-                 dmb ish
                again:
-                 ldxr{,b,h}  x/w27, [x25]
+                 ldaxr{,b,h} x/w27, [x25]
                  op          x28, x27, x26 // op is add,sub,and,orr,eor
-                 stxr{,b,h}  w24, x/w28, [x25]
+                 stlxr{,b,h} w24, x/w28, [x25]
                  cbnz        x24, again
-                 dmb ish
 
               Operand conventions:
                  IN:  x25 (addr), x26 (2nd arg for op)
                  OUT: x27 (old value), x24 (trashed), x28 (trashed)
 
               It is unfortunate that, per the ARM documentation, x28 cannot be used for
-              both the store-data and success-flag operands of stxr. This causes the
+              both the store-data and success-flag operands of stlxr. This causes the
               instruction's behaviour to be "CONSTRAINED UNPREDICTABLE", so we use x24
               instead for the success-flag.
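The loop in the comment above has the shape of a weak compare-exchange retry loop. As an analogy only (Rust's own atomics, not the generated code), with a 64-bit add standing in for `op`; the spurious failure that `compare_exchange_weak` permits models a `stlxr` whose exclusive reservation was lost:

use std::sync::atomic::{AtomicU64, Ordering};

fn atomic_rmw_add_model(cell: &AtomicU64, arg: u64) -> u64 {
    loop {
        let old = cell.load(Ordering::Acquire);          // ldaxr x27, [x25]
        let new = old.wrapping_add(arg);                 // op x28, x27, x26
        match cell.compare_exchange_weak(old, new, Ordering::AcqRel, Ordering::Acquire) {
            Ok(_) => return old,                         // stlxr succeeded
            Err(_) => continue,                          // cbnz x24, again
        }
    }
}

fn main() {
    let cell = AtomicU64::new(5);
    assert_eq!(atomic_rmw_add_model(&cell, 3), 5); // returns the old value
    assert_eq!(cell.load(Ordering::Relaxed), 8);
}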
@@ -1320,15 +1346,13 @@ impl MachInstEmit for Inst {
             let x28wr = writable_xreg(28);
             let again_label = sink.get_label();
 
-            sink.put4(enc_dmb_ish()); // dmb ish
-
             // again:
             sink.bind_label(again_label);
             let srcloc = state.cur_srcloc();
             if srcloc != SourceLoc::default() {
                 sink.add_trap(srcloc, TrapCode::HeapOutOfBounds);
             }
-            sink.put4(enc_ldxr(ty, x27wr, x25)); // ldxr x27, [x25]
+            sink.put4(enc_ldaxr(ty, x27wr, x25)); // ldaxr x27, [x25]
 
             match op {
                 AtomicRmwOp::Xchg => {
@@ -1420,19 +1444,17 @@ impl MachInstEmit for Inst {
             if srcloc != SourceLoc::default() {
                 sink.add_trap(srcloc, TrapCode::HeapOutOfBounds);
             }
-            sink.put4(enc_stxr(ty, x24wr, x28, x25)); // stxr w24, x28, [x25]
+            sink.put4(enc_stlxr(ty, x24wr, x28, x25)); // stlxr w24, x28, [x25]
 
             // cbnz w24, again
             // Note, we're actually testing x24, and relying on the default zero-high-half
-            // rule in the assignment that `stxr` does.
+            // rule in the assignment that `stlxr` does.
             let br_offset = sink.cur_offset();
             sink.put4(enc_conditional_br(
                 BranchTarget::Label(again_label),
                 CondBrKind::NotZero(x24),
             ));
             sink.use_label_at_offset(br_offset, again_label, LabelUse::Branch19);
-
-            sink.put4(enc_dmb_ish()); // dmb ish
         }
         &Inst::AtomicCAS { rs, rt, rn, ty } => {
             let size = match ty {
@@ -1447,22 +1469,18 @@ impl MachInstEmit for Inst {
         }
         &Inst::AtomicCASLoop { ty } => {
             /* Emit this:
-                 dmb ish
                again:
-                 ldxr{,b,h}  x/w27, [x25]
-                 and         x24, x26, MASK (= 2^size_bits - 1)
-                 cmp         x27, x24
+                 ldaxr{,b,h} x/w27, [x25]
+                 cmp         x27, x/w26 uxt{b,h}
                  b.ne        out
-                 stxr{,b,h}  w24, x/w28, [x25]
+                 stlxr{,b,h} w24, x/w28, [x25]
                  cbnz        x24, again
                out:
-                 dmb ish
 
               Operand conventions:
                  IN:  x25 (addr), x26 (expected value), x28 (replacement value)
                  OUT: x27 (old value), x24 (trashed)
             */
-            let xzr = zero_reg();
             let x24 = xreg(24);
             let x25 = xreg(25);
             let x26 = xreg(26);
@@ -1474,37 +1492,25 @@ impl MachInstEmit for Inst {
             let again_label = sink.get_label();
             let out_label = sink.get_label();
 
-            sink.put4(enc_dmb_ish()); // dmb ish
-
             // again:
             sink.bind_label(again_label);
             let srcloc = state.cur_srcloc();
             if srcloc != SourceLoc::default() {
                 sink.add_trap(srcloc, TrapCode::HeapOutOfBounds);
             }
-            sink.put4(enc_ldxr(ty, x27wr, x25)); // ldxr x27, [x25]
+            // ldaxr x27, [x25]
+            sink.put4(enc_ldaxr(ty, x27wr, x25));
 
-            if ty == I64 {
-                // mov x24, x26
-                sink.put4(enc_arith_rrr(0b101_01010_00_0, 0b000000, x24wr, xzr, x26))
-            } else {
-                // and x24, x26, 0xFF/0xFFFF/0xFFFFFFFF
-                let (mask, s) = match ty {
-                    I8 => (0xFF, 7),
-                    I16 => (0xFFFF, 15),
-                    I32 => (0xFFFFFFFF, 31),
-                    _ => unreachable!(),
-                };
-                sink.put4(enc_arith_rr_imml(
-                    0b100_100100,
-                    ImmLogic::from_n_r_s(mask, true, 0, s, OperandSize::Size64).enc_bits(),
-                    x26,
-                    x24wr,
-                ))
-            }
-
-            // cmp x27, x24 (== subs xzr, x27, x24)
-            sink.put4(enc_arith_rrr(0b111_01011_00_0, 0b000000, xzrwr, x27, x24));
+            // The top 32-bits are zero-extended by the ldaxr so we don't
+            // have to use UXTW, just the x-form of the register.
+            let (bit21, extend_op) = match ty {
+                I8 => (0b1, 0b000000),
+                I16 => (0b1, 0b001000),
+                _ => (0b0, 0b000000),
+            };
+            let bits_31_21 = 0b111_01011_000 | bit21;
+            // cmp x27, x26 (== subs xzr, x27, x26)
+            sink.put4(enc_arith_rrr(bits_31_21, extend_op, xzrwr, x27, x26));
             // b.ne out
             let br_out_offset = sink.cur_offset();
@@ -1518,11 +1524,11 @@ impl MachInstEmit for Inst {
             if srcloc != SourceLoc::default() {
                 sink.add_trap(srcloc, TrapCode::HeapOutOfBounds);
             }
-            sink.put4(enc_stxr(ty, x24wr, x28, x25)); // stxr w24, x28, [x25]
+            sink.put4(enc_stlxr(ty, x24wr, x28, x25)); // stlxr w24, x28, [x25]
 
             // cbnz w24, again.
             // Note, we're actually testing x24, and relying on the default zero-high-half
-            // rule in the assignment that `stxr` does.
+            // rule in the assignment that `stlxr` does.
             let br_again_offset = sink.cur_offset();
             sink.put4(enc_conditional_br(
                 BranchTarget::Label(again_label),
@@ -1532,46 +1538,12 @@ impl MachInstEmit for Inst {
             // out:
             sink.bind_label(out_label);
-
-            sink.put4(enc_dmb_ish()); // dmb ish
         }
-        &Inst::AtomicLoad { ty, r_data, r_addr } => {
-            let op = match ty {
-                I8 => 0b0011100001,
-                I16 => 0b0111100001,
-                I32 => 0b1011100001,
-                I64 => 0b1111100001,
-                _ => unreachable!(),
-            };
-
-            sink.put4(enc_dmb_ish()); // dmb ish
-
-            let srcloc = state.cur_srcloc();
-            if srcloc != SourceLoc::default() {
-                sink.add_trap(srcloc, TrapCode::HeapOutOfBounds);
-            }
-            let uimm12scaled_zero = UImm12Scaled::zero(I8 /*irrelevant*/);
-            sink.put4(enc_ldst_uimm12(
-                op,
-                uimm12scaled_zero,
-                r_addr,
-                r_data.to_reg(),
-            ));
-        }
-        &Inst::AtomicStore { ty, r_data, r_addr } => {
-            let op = match ty {
-                I8 => 0b0011100000,
-                I16 => 0b0111100000,
-                I32 => 0b1011100000,
-                I64 => 0b1111100000,
-                _ => unreachable!(),
-            };
-
-            let srcloc = state.cur_srcloc();
-            if srcloc != SourceLoc::default() {
-                sink.add_trap(srcloc, TrapCode::HeapOutOfBounds);
-            }
-            let uimm12scaled_zero = UImm12Scaled::zero(I8 /*irrelevant*/);
-            sink.put4(enc_ldst_uimm12(op, uimm12scaled_zero, r_addr, r_data));
-
-            sink.put4(enc_dmb_ish()); // dmb ish
+        &Inst::LoadAcquire { access_ty, rt, rn } => {
+            sink.put4(enc_ldar(access_ty, rt, rn));
+        }
+        &Inst::StoreRelease { access_ty, rt, rn } => {
+            sink.put4(enc_stlr(access_ty, rt, rn));
         }
         &Inst::Fence {} => {
             sink.put4(enc_dmb_ish()); // dmb ish
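A sketch checking the compare word the CAS loop now emits, assuming (as the call above suggests) that enc_arith_rrr packs its first argument into bits [31:21], rm into [20:16], its second argument into [15:10], rn into [9:5] and rd into [4:0]. For I8, bit21 selects the extended-register form of subs and extend_op 0b000000 is UXTB, giving `cmp x27, w26, uxtb`: the second word of the AtomicCASLoop I8 test vector below ("...7F033AEB...", little-endian 0xEB3A037F).

fn arith_rrr_sketch(bits_31_21: u32, bits_15_10: u32, rd: u32, rn: u32, rm: u32) -> u32 {
    (bits_31_21 << 21) | (rm << 16) | (bits_15_10 << 10) | (rn << 5) | rd
}

fn main() {
    let bits_31_21 = 0b111_01011_000 | 0b1; // subs, extended-register form
    // rd = 31 (xzr), rn = x27, rm = w26
    assert_eq!(arith_rrr_sketch(bits_31_21, 0b000000, 31, 27, 26), 0xEB3A037F);
}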


@@ -5891,7 +5891,7 @@ fn test_aarch64_binemit() {
             ty: I16,
             op: inst_common::AtomicRmwOp::Xor,
         },
-        "BF3B03D53B7F5F487C031ACA3C7F1848B8FFFFB5BF3B03D5",
+        "3BFF5F487C031ACA3CFF1848B8FFFFB5",
         "atomically { 16_bits_at_[x25]) Xor= x26 ; x27 = old_value_at_[x25]; x24,x28 = trash }",
     ));
@@ -5900,7 +5900,7 @@ fn test_aarch64_binemit() {
             ty: I32,
             op: inst_common::AtomicRmwOp::Xchg,
         },
-        "BF3B03D53B7F5F88FC031AAA3C7F1888B8FFFFB5BF3B03D5",
+        "3BFF5F88FC031AAA3CFF1888B8FFFFB5",
         "atomically { 32_bits_at_[x25]) Xchg= x26 ; x27 = old_value_at_[x25]; x24,x28 = trash }",
     ));
     insns.push((
@@ -5947,56 +5947,112 @@ fn test_aarch64_binemit() {
         Inst::AtomicCASLoop {
             ty: I8,
         },
-        "BF3B03D53B7F5F08581F40927F0318EB610000543C7F180878FFFFB5BF3B03D5",
+        "3BFF5F087F033AEB610000543CFF180898FFFFB5",
         "atomically { compare-and-swap(8_bits_at_[x25], x26 -> x28), x27 = old_value_at_[x25]; x24 = trash }"
     ));
+    insns.push((
+        Inst::AtomicCASLoop {
+            ty: I16,
+        },
+        "3BFF5F487F233AEB610000543CFF184898FFFFB5",
+        "atomically { compare-and-swap(16_bits_at_[x25], x26 -> x28), x27 = old_value_at_[x25]; x24 = trash }"
+    ));
+    insns.push((
+        Inst::AtomicCASLoop {
+            ty: I32,
+        },
+        "3BFF5F887F031AEB610000543CFF188898FFFFB5",
+        "atomically { compare-and-swap(32_bits_at_[x25], x26 -> x28), x27 = old_value_at_[x25]; x24 = trash }"
+    ));
     insns.push((
         Inst::AtomicCASLoop {
             ty: I64,
         },
-        "BF3B03D53B7F5FC8F8031AAA7F0318EB610000543C7F18C878FFFFB5BF3B03D5",
+        "3BFF5FC87F031AEB610000543CFF18C898FFFFB5",
         "atomically { compare-and-swap(64_bits_at_[x25], x26 -> x28), x27 = old_value_at_[x25]; x24 = trash }"
     ));
     insns.push((
-        Inst::AtomicLoad {
-            ty: I8,
-            r_data: writable_xreg(7),
-            r_addr: xreg(28),
+        Inst::LoadAcquire {
+            access_ty: I8,
+            rt: writable_xreg(7),
+            rn: xreg(28),
         },
-        "BF3B03D587034039",
-        "atomically { x7 = zero_extend_8_bits_at[x28] }",
+        "87FFDF08",
+        "ldarb w7, [x28]",
     ));
     insns.push((
-        Inst::AtomicLoad {
-            ty: I64,
-            r_data: writable_xreg(28),
-            r_addr: xreg(7),
+        Inst::LoadAcquire {
+            access_ty: I16,
+            rt: writable_xreg(2),
+            rn: xreg(3),
         },
-        "BF3B03D5FC0040F9",
-        "atomically { x28 = zero_extend_64_bits_at[x7] }",
+        "62FCDF48",
+        "ldarh w2, [x3]",
     ));
     insns.push((
-        Inst::AtomicStore {
-            ty: I16,
-            r_data: xreg(17),
-            r_addr: xreg(8),
+        Inst::LoadAcquire {
+            access_ty: I32,
+            rt: writable_xreg(15),
+            rn: xreg(0),
         },
-        "11010079BF3B03D5",
-        "atomically { 16_bits_at[x8] = x17 }",
+        "0FFCDF88",
+        "ldar w15, [x0]",
     ));
     insns.push((
-        Inst::AtomicStore {
-            ty: I32,
-            r_data: xreg(18),
-            r_addr: xreg(7),
+        Inst::LoadAcquire {
+            access_ty: I64,
+            rt: writable_xreg(28),
+            rn: xreg(7),
         },
-        "F20000B9BF3B03D5",
-        "atomically { 32_bits_at[x7] = x18 }",
+        "FCFCDFC8",
+        "ldar x28, [x7]",
     ));
+    insns.push((
+        Inst::StoreRelease {
+            access_ty: I8,
+            rt: xreg(7),
+            rn: xreg(28),
+        },
+        "87FF9F08",
+        "stlrb w7, [x28]",
+    ));
+    insns.push((
+        Inst::StoreRelease {
+            access_ty: I16,
+            rt: xreg(2),
+            rn: xreg(3),
+        },
+        "62FC9F48",
+        "stlrh w2, [x3]",
+    ));
+    insns.push((
+        Inst::StoreRelease {
+            access_ty: I32,
+            rt: xreg(15),
+            rn: xreg(0),
+        },
+        "0FFC9F88",
+        "stlr w15, [x0]",
+    ));
+    insns.push((
+        Inst::StoreRelease {
+            access_ty: I64,
+            rt: xreg(28),
+            rn: xreg(7),
+        },
+        "FCFC9FC8",
+        "stlr x28, [x7]",
+    ));
insns.push((Inst::Fence {}, "BF3B03D5", "dmb ish")); insns.push((Inst::Fence {}, "BF3B03D5", "dmb ish"));


@@ -789,10 +789,9 @@ pub enum Inst {
     },
     /// Similar to AtomicRMW, a compare-and-swap operation implemented using a load-linked
-    /// store-conditional loop. The sequence is both preceded and followed by a fence which is
-    /// at least as comprehensive as that of the `Fence` instruction below. This instruction
-    /// is sequentially consistent. Note that the operand conventions, although very similar
-    /// to AtomicRMW, are different:
+    /// store-conditional loop.
+    /// This instruction is sequentially consistent.
+    /// Note that the operand conventions, although very similar to AtomicRMW, are different:
     ///
     /// x25 (rd) address
     /// x26 (rd) expected value
@@ -803,22 +802,21 @@ pub enum Inst {
         ty: Type, // I8, I16, I32 or I64
     },
-    /// Read `ty` bits from address `r_addr`, zero extend the loaded value to 64 bits and put it
-    /// in `r_data`. The load instruction is preceded by a fence at least as comprehensive as
-    /// that of the `Fence` instruction below. This instruction is sequentially consistent.
-    AtomicLoad {
-        ty: Type, // I8, I16, I32 or I64
-        r_data: Writable<Reg>,
-        r_addr: Reg,
+    /// Read `access_ty` bits from address `rn`, either 8, 16, 32 or 64 bits, and put
+    /// them in `rt`, zero-extending to fill a word or double word result.
+    /// This instruction is sequentially consistent.
+    LoadAcquire {
+        access_ty: Type, // I8, I16, I32 or I64
+        rt: Writable<Reg>,
+        rn: Reg,
     },
-    /// Write the lowest `ty` bits of `r_data` to address `r_addr`, with a memory fence
-    /// instruction following the store. The fence is at least as comprehensive as that of the
-    /// `Fence` instruction below. This instruction is sequentially consistent.
-    AtomicStore {
-        ty: Type, // I8, I16, I32 or I64
-        r_data: Reg,
-        r_addr: Reg,
+    /// Write the lowest `access_ty` bits of `rt` to address `rn`.
+    /// This instruction is sequentially consistent.
+    StoreRelease {
+        access_ty: Type, // I8, I16, I32 or I64
+        rt: Reg,
+        rn: Reg,
     },
     /// A memory fence. This must provide ordering to ensure that, at a minimum, neither loads
@@ -1940,13 +1938,13 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
             collector.add_def(writable_xreg(24));
             collector.add_def(writable_xreg(27));
         }
-        &Inst::AtomicLoad { r_data, r_addr, .. } => {
-            collector.add_use(r_addr);
-            collector.add_def(r_data);
+        &Inst::LoadAcquire { rt, rn, .. } => {
+            collector.add_use(rn);
+            collector.add_def(rt);
         }
-        &Inst::AtomicStore { r_data, r_addr, .. } => {
-            collector.add_use(r_addr);
-            collector.add_use(r_data);
+        &Inst::StoreRelease { rt, rn, .. } => {
+            collector.add_use(rn);
+            collector.add_use(rt);
         }
         &Inst::Fence {} => {}
         &Inst::FpuMove64 { rd, rn } => {
@@ -2579,21 +2577,21 @@ fn aarch64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
         &mut Inst::AtomicCASLoop { .. } => {
             // There are no vregs to map in this insn.
         }
-        &mut Inst::AtomicLoad {
-            ref mut r_data,
-            ref mut r_addr,
+        &mut Inst::LoadAcquire {
+            ref mut rt,
+            ref mut rn,
             ..
         } => {
-            map_def(mapper, r_data);
-            map_use(mapper, r_addr);
+            map_def(mapper, rt);
+            map_use(mapper, rn);
         }
-        &mut Inst::AtomicStore {
-            ref mut r_data,
-            ref mut r_addr,
+        &mut Inst::StoreRelease {
+            ref mut rt,
+            ref mut rn,
             ..
         } => {
-            map_use(mapper, r_data);
-            map_use(mapper, r_addr);
+            map_use(mapper, rt);
+            map_use(mapper, rn);
         }
         &mut Inst::Fence {} => {}
         &mut Inst::FpuMove64 {
@@ -3643,25 +3641,35 @@ impl Inst {
"atomically {{ compare-and-swap({}_bits_at_[x25], x26 -> x28), x27 = old_value_at_[x25]; x24 = trash }}", "atomically {{ compare-and-swap({}_bits_at_[x25], x26 -> x28), x27 = old_value_at_[x25]; x24 = trash }}",
ty.bits()) ty.bits())
} }
&Inst::AtomicLoad { &Inst::LoadAcquire {
ty, r_data, r_addr, .. access_ty, rt, rn, ..
} => { } => {
format!( let (op, ty) = match access_ty {
"atomically {{ {} = zero_extend_{}_bits_at[{}] }}", I8 => ("ldarb", I32),
r_data.show_rru(mb_rru), I16 => ("ldarh", I32),
ty.bits(), I32 => ("ldar", I32),
r_addr.show_rru(mb_rru) I64 => ("ldar", I64),
) _ => panic!("Unsupported type: {}", access_ty),
};
let size = OperandSize::from_ty(ty);
let rt = show_ireg_sized(rt.to_reg(), mb_rru, size);
let rn = rn.show_rru(mb_rru);
format!("{} {}, [{}]", op, rt, rn)
} }
&Inst::AtomicStore { &Inst::StoreRelease {
ty, r_data, r_addr, .. access_ty, rt, rn, ..
} => { } => {
format!( let (op, ty) = match access_ty {
"atomically {{ {}_bits_at[{}] = {} }}", I8 => ("stlrb", I32),
ty.bits(), I16 => ("stlrh", I32),
r_addr.show_rru(mb_rru), I32 => ("stlr", I32),
r_data.show_rru(mb_rru) I64 => ("stlr", I64),
) _ => panic!("Unsupported type: {}", access_ty),
};
let size = OperandSize::from_ty(ty);
let rt = show_ireg_sized(rt, mb_rru, size);
let rn = rn.show_rru(mb_rru);
format!("{} {}, [{}]", op, rt, rn)
} }
&Inst::Fence {} => { &Inst::Fence {} => {
format!("dmb ish") format!("dmb ish")


@@ -1740,6 +1740,22 @@ pub(crate) fn is_valid_atomic_transaction_ty(ty: Type) -> bool {
     }
 }
 
+pub(crate) fn emit_atomic_load<C: LowerCtx<I = Inst>>(
+    ctx: &mut C,
+    rt: Writable<Reg>,
+    insn: IRInst,
+) {
+    assert!(ctx.data(insn).opcode() == Opcode::AtomicLoad);
+    let inputs = insn_inputs(ctx, insn);
+    let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+    let access_ty = ctx.output_ty(insn, 0);
+    assert!(is_valid_atomic_transaction_ty(access_ty));
+    // We're ignoring the result type of the load because the LoadAcquire will
+    // explicitly zero extend to the nearest word, and also zero the high half
+    // of an X register.
+    ctx.emit(Inst::LoadAcquire { access_ty, rt, rn });
+}
+
 fn load_op_to_ty(op: Opcode) -> Option<Type> {
     match op {
         Opcode::Sload8 | Opcode::Uload8 | Opcode::Sload8Complex | Opcode::Uload8Complex => Some(I8),


@@ -521,6 +521,19 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
         }
         Opcode::Uextend | Opcode::Sextend => {
+            if op == Opcode::Uextend {
+                let inputs = ctx.get_input_as_source_or_const(inputs[0].insn, inputs[0].input);
+                if let Some((atomic_load, 0)) = inputs.inst {
+                    if ctx.data(atomic_load).opcode() == Opcode::AtomicLoad {
+                        let output_ty = ty.unwrap();
+                        assert!(output_ty == I32 || output_ty == I64);
+                        let rt = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
+                        emit_atomic_load(ctx, rt, atomic_load);
+                        ctx.sink_inst(atomic_load);
+                        return Ok(());
+                    }
+                }
+            }
             let output_ty = ty.unwrap();
             let input_ty = ctx.input_ty(insn, 0);
             let from_bits = ty_bits(input_ty) as u8;
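A tiny model of why the uextend can be sunk here: an acquiring byte, halfword or word load writes a W register, which zero-fills the rest of the W register and the high half of the enclosing X register, so the 64-bit result is already the zero-extension. Modelled on plain integers, with no backend types involved:

fn ldarb_model(mem_byte: u8) -> u64 {
    mem_byte as u32 as u64 // byte -> w-reg (zero-extend) -> x-reg (high half zero)
}

fn main() {
    assert_eq!(ldarb_model(0xAB), 0xAB);
}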
@@ -1523,27 +1536,16 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
         }
         Opcode::AtomicLoad => {
-            let r_data = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-            let r_addr = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
-            let ty_access = ty.unwrap();
-            assert!(is_valid_atomic_transaction_ty(ty_access));
-            ctx.emit(Inst::AtomicLoad {
-                ty: ty_access,
-                r_data,
-                r_addr,
-            });
+            let rt = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
+            emit_atomic_load(ctx, rt, insn);
         }
         Opcode::AtomicStore => {
-            let r_data = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
-            let r_addr = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
-            let ty_access = ctx.input_ty(insn, 0);
-            assert!(is_valid_atomic_transaction_ty(ty_access));
-            ctx.emit(Inst::AtomicStore {
-                ty: ty_access,
-                r_data,
-                r_addr,
-            });
+            let rt = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+            let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
+            let access_ty = ctx.input_ty(insn, 0);
+            assert!(is_valid_atomic_transaction_ty(access_ty));
+            ctx.emit(Inst::StoreRelease { access_ty, rt, rn });
         }
         Opcode::Fence => {


@@ -0,0 +1,97 @@
test compile
target aarch64
function %atomic_load_i64(i64) -> i64 {
block0(v0: i64):
v1 = atomic_load.i64 v0
return v1
}
; check: ldar x0, [x0]
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %atomic_load_i32(i64) -> i32 {
block0(v0: i64):
v1 = atomic_load.i32 v0
return v1
}
; check: ldar w0, [x0]
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %atomic_load_i16(i64) -> i16 {
block0(v0: i64):
v1 = atomic_load.i16 v0
return v1
}
; check: ldarh w0, [x0]
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %atomic_load_i8(i64) -> i8 {
block0(v0: i64):
v1 = atomic_load.i8 v0
return v1
}
; check: ldarb w0, [x0]
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %atomic_load_i32_i64(i64) -> i64 {
block0(v0: i64):
v1 = atomic_load.i32 v0
v2 = uextend.i64 v1
return v2
}
; check: ldar w0, [x0]
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %atomic_load_i16_i64(i64) -> i64 {
block0(v0: i64):
v1 = atomic_load.i16 v0
v2 = uextend.i64 v1
return v2
}
; check: ldarh w0, [x0]
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %atomic_load_i8_i64(i64) -> i64 {
block0(v0: i64):
v1 = atomic_load.i8 v0
v2 = uextend.i64 v1
return v2
}
; check: ldarb w0, [x0]
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %atomic_load_i16_i32(i64) -> i32 {
block0(v0: i64):
v1 = atomic_load.i16 v0
v2 = uextend.i32 v1
return v2
}
; check: ldarh w0, [x0]
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %atomic_load_i8_i32(i64) -> i32 {
block0(v0: i64):
v1 = atomic_load.i8 v0
v2 = uextend.i32 v1
return v2
}
; check: ldarb w0, [x0]
; nextln: ldp fp, lr, [sp], #16
; nextln: ret


@@ -0,0 +1,102 @@
test compile
target aarch64
function %atomic_store_i64(i64, i64) {
block0(v0: i64, v1: i64):
atomic_store.i64 v0, v1
return
}
; check: stlr x0, [x1]
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %atomic_store_i32(i32, i64) {
block0(v0: i32, v1: i64):
atomic_store.i32 v0, v1
return
}
; check: stlr w0, [x1]
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %atomic_store_i16(i16, i64) {
block0(v0: i16, v1: i64):
atomic_store.i16 v0, v1
return
}
; check: stlrh w0, [x1]
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %atomic_store_i8(i8, i64) {
block0(v0: i8, v1: i64):
atomic_store.i8 v0, v1
return
}
; check: stlrb w0, [x1]
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %atomic_store_i64_i32(i64, i64) {
block0(v0: i64, v1: i64):
v2 = ireduce.i32 v0
atomic_store.i32 v2, v1
return
}
; check-not: uxt
; check: stlr w0, [x1]
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %atomic_store_i64_i16(i64, i64) {
block0(v0: i64, v1: i64):
v2 = ireduce.i16 v0
atomic_store.i16 v2, v1
return
}
; check-not: uxt
; check: stlrh w0, [x1]
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %atomic_store_i64_i8(i64, i64) {
block0(v0: i64, v1: i64):
v2 = ireduce.i8 v0
atomic_store.i8 v2, v1
return
}
; check-not: uxt
; check: stlrb w0, [x1]
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %atomic_store_i32_i16(i32, i64) {
block0(v0: i32, v1: i64):
v2 = ireduce.i16 v0
atomic_store.i16 v2, v1
return
}
; check-not: uxt
; check: stlrh w0, [x1]
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %atomic_store_i32_i8(i32, i64) {
block0(v0: i32, v1: i64):
v2 = ireduce.i8 v0
atomic_store.i8 v2, v1
return
}
; check-not: uxt
; check: stlrb w0, [x1]
; nextln: ldp fp, lr, [sp], #16
; nextln: ret