Re-implement atomic loads and stores

The AArch64 support was somewhat broken: it used Armv7-style
barriers, which aren't required with Armv8 acquire-release
load/store instructions.

The AArch64 fallback CAS and RMW loops have also been updated to use
acquire-release exclusive instructions, which again removes the need
for barriers. The CAS loop has been further optimised by using the
extending form of the cmp instruction.

Copyright (c) 2021, Arm Limited.
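
For reference, a minimal, self-contained Rust sketch of the new acquire/release
encodings. The standalone size_bits/enc_ldar/enc_stlr functions below are
illustrative stand-ins for the crate's helpers (which take Type and Reg
arguments, see the diff); the expected words come from the binemit test
vectors further down, which list instructions as little-endian byte strings.

// Size field shared by LDAR/STLR: 00=byte, 01=half, 10=word, 11=doubleword.
fn size_bits(bits: u32) -> u32 {
    match bits {
        8 => 0b00,
        16 => 0b01,
        32 => 0b10,
        64 => 0b11,
        _ => unreachable!(),
    }
}

// LDAR{,B,H} Rt, [Rn]: the Rs and Rt2 fields are hard-wired to 0b11111, L = 1, o0 = 1.
fn enc_ldar(bits: u32, rt: u32, rn: u32) -> u32 {
    0b00_001000_1_1_0_11111_1_11111_00000_00000 | (size_bits(bits) << 30) | (rn << 5) | rt
}

// STLR{,B,H} Rt, [Rn]: identical layout except the L bit (bit 22) is 0.
fn enc_stlr(bits: u32, rt: u32, rn: u32) -> u32 {
    0b00_001000_1_0_0_11111_1_11111_00000_00000 | (size_bits(bits) << 30) | (rn << 5) | rt
}

fn main() {
    // "87FFDF08" in the tests -> 0x08DFFF87: ldarb w7, [x28]
    assert_eq!(enc_ldar(8, 7, 28), 0x08DFFF87);
    // "FCFCDFC8" -> 0xC8DFFCFC: ldar x28, [x7]
    assert_eq!(enc_ldar(64, 28, 7), 0xC8DFFCFC);
    // "FCFC9FC8" -> 0xC89FFCFC: stlr x28, [x7]
    assert_eq!(enc_stlr(64, 28, 7), 0xC89FFCFC);
}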
Sam Parker
2021-07-29 15:41:45 +01:00
parent 85f16f488d
commit cbb7229457
12 changed files with 564 additions and 220 deletions


@@ -498,7 +498,7 @@ fn enc_dmb_ish() -> u32 {
0xD5033BBF
}
fn enc_ldxr(ty: Type, rt: Writable<Reg>, rn: Reg) -> u32 {
fn enc_ldar(ty: Type, rt: Writable<Reg>, rn: Reg) -> u32 {
let sz = match ty {
I64 => 0b11,
I32 => 0b10,
@@ -506,13 +506,13 @@ fn enc_ldxr(ty: Type, rt: Writable<Reg>, rn: Reg) -> u32 {
I8 => 0b00,
_ => unreachable!(),
};
0b00001000_01011111_01111100_00000000
0b00_001000_1_1_0_11111_1_11111_00000_00000
| (sz << 30)
| (machreg_to_gpr(rn) << 5)
| machreg_to_gpr(rt.to_reg())
}
fn enc_stxr(ty: Type, rs: Writable<Reg>, rt: Reg, rn: Reg) -> u32 {
fn enc_stlr(ty: Type, rt: Reg, rn: Reg) -> u32 {
let sz = match ty {
I64 => 0b11,
I32 => 0b10,
@@ -520,7 +520,35 @@ fn enc_stxr(ty: Type, rs: Writable<Reg>, rt: Reg, rn: Reg) -> u32 {
I8 => 0b00,
_ => unreachable!(),
};
0b00001000_00000000_01111100_00000000
0b00_001000_100_11111_1_11111_00000_00000
| (sz << 30)
| (machreg_to_gpr(rn) << 5)
| machreg_to_gpr(rt)
}
fn enc_ldaxr(ty: Type, rt: Writable<Reg>, rn: Reg) -> u32 {
let sz = match ty {
I64 => 0b11,
I32 => 0b10,
I16 => 0b01,
I8 => 0b00,
_ => unreachable!(),
};
0b00_001000_0_1_0_11111_1_11111_00000_00000
| (sz << 30)
| (machreg_to_gpr(rn) << 5)
| machreg_to_gpr(rt.to_reg())
}
fn enc_stlxr(ty: Type, rs: Writable<Reg>, rt: Reg, rn: Reg) -> u32 {
let sz = match ty {
I64 => 0b11,
I32 => 0b10,
I16 => 0b01,
I8 => 0b00,
_ => unreachable!(),
};
0b00_001000_000_00000_1_11111_00000_00000
| (sz << 30)
| (machreg_to_gpr(rs.to_reg()) << 16)
| (machreg_to_gpr(rn) << 5)
@@ -1286,20 +1314,18 @@ impl MachInstEmit for Inst {
}
&Inst::AtomicRMW { ty, op } => {
/* Emit this:
dmb ish
again:
ldxr{,b,h} x/w27, [x25]
ldaxr{,b,h} x/w27, [x25]
op x28, x27, x26 // op is add,sub,and,orr,eor
stxr{,b,h} w24, x/w28, [x25]
stlxr{,b,h} w24, x/w28, [x25]
cbnz x24, again
dmb ish
Operand conventions:
IN: x25 (addr), x26 (2nd arg for op)
OUT: x27 (old value), x24 (trashed), x28 (trashed)
It is unfortunate that, per the ARM documentation, x28 cannot be used for
both the store-data and success-flag operands of stxr. This causes the
both the store-data and success-flag operands of stlxr. This causes the
instruction's behaviour to be "CONSTRAINED UNPREDICTABLE", so we use x24
instead for the success-flag.
@@ -1320,15 +1346,13 @@ impl MachInstEmit for Inst {
let x28wr = writable_xreg(28);
let again_label = sink.get_label();
sink.put4(enc_dmb_ish()); // dmb ish
// again:
sink.bind_label(again_label);
let srcloc = state.cur_srcloc();
if srcloc != SourceLoc::default() {
sink.add_trap(srcloc, TrapCode::HeapOutOfBounds);
}
sink.put4(enc_ldxr(ty, x27wr, x25)); // ldxr x27, [x25]
sink.put4(enc_ldaxr(ty, x27wr, x25)); // ldaxr x27, [x25]
match op {
AtomicRmwOp::Xchg => {
@@ -1420,19 +1444,17 @@ impl MachInstEmit for Inst {
if srcloc != SourceLoc::default() {
sink.add_trap(srcloc, TrapCode::HeapOutOfBounds);
}
sink.put4(enc_stxr(ty, x24wr, x28, x25)); // stxr w24, x28, [x25]
sink.put4(enc_stlxr(ty, x24wr, x28, x25)); // stlxr w24, x28, [x25]
// cbnz w24, again
// Note, we're actually testing x24, and relying on the default zero-high-half
// rule in the assignment that `stxr` does.
// rule in the assignment that `stlxr` does.
let br_offset = sink.cur_offset();
sink.put4(enc_conditional_br(
BranchTarget::Label(again_label),
CondBrKind::NotZero(x24),
));
sink.use_label_at_offset(br_offset, again_label, LabelUse::Branch19);
sink.put4(enc_dmb_ish()); // dmb ish
}
&Inst::AtomicCAS { rs, rt, rn, ty } => {
let size = match ty {
@@ -1447,22 +1469,18 @@ impl MachInstEmit for Inst {
}
&Inst::AtomicCASLoop { ty } => {
/* Emit this:
dmb ish
again:
ldxr{,b,h} x/w27, [x25]
and x24, x26, MASK (= 2^size_bits - 1)
cmp x27, x24
ldaxr{,b,h} x/w27, [x25]
cmp x27, x/w26 uxt{b,h}
b.ne out
stxr{,b,h} w24, x/w28, [x25]
stlxr{,b,h} w24, x/w28, [x25]
cbnz x24, again
out:
dmb ish
Operand conventions:
IN: x25 (addr), x26 (expected value), x28 (replacement value)
OUT: x27 (old value), x24 (trashed)
*/
let xzr = zero_reg();
let x24 = xreg(24);
let x25 = xreg(25);
let x26 = xreg(26);
@@ -1474,37 +1492,25 @@ impl MachInstEmit for Inst {
let again_label = sink.get_label();
let out_label = sink.get_label();
sink.put4(enc_dmb_ish()); // dmb ish
// again:
sink.bind_label(again_label);
let srcloc = state.cur_srcloc();
if srcloc != SourceLoc::default() {
sink.add_trap(srcloc, TrapCode::HeapOutOfBounds);
}
sink.put4(enc_ldxr(ty, x27wr, x25)); // ldxr x27, [x25]
// ldaxr x27, [x25]
sink.put4(enc_ldaxr(ty, x27wr, x25));
if ty == I64 {
// mov x24, x26
sink.put4(enc_arith_rrr(0b101_01010_00_0, 0b000000, x24wr, xzr, x26))
} else {
// and x24, x26, 0xFF/0xFFFF/0xFFFFFFFF
let (mask, s) = match ty {
I8 => (0xFF, 7),
I16 => (0xFFFF, 15),
I32 => (0xFFFFFFFF, 31),
_ => unreachable!(),
};
sink.put4(enc_arith_rr_imml(
0b100_100100,
ImmLogic::from_n_r_s(mask, true, 0, s, OperandSize::Size64).enc_bits(),
x26,
x24wr,
))
}
// cmp x27, x24 (== subs xzr, x27, x24)
sink.put4(enc_arith_rrr(0b111_01011_00_0, 0b000000, xzrwr, x27, x24));
// The top 32-bits are zero-extended by the ldaxr so we don't
// have to use UXTW, just the x-form of the register.
let (bit21, extend_op) = match ty {
I8 => (0b1, 0b000000),
I16 => (0b1, 0b001000),
_ => (0b0, 0b000000),
};
let bits_31_21 = 0b111_01011_000 | bit21;
// cmp x27, x26 (== subs xzr, x27, x26)
sink.put4(enc_arith_rrr(bits_31_21, extend_op, xzrwr, x27, x26));
// b.ne out
let br_out_offset = sink.cur_offset();
@@ -1518,11 +1524,11 @@ impl MachInstEmit for Inst {
if srcloc != SourceLoc::default() {
sink.add_trap(srcloc, TrapCode::HeapOutOfBounds);
}
sink.put4(enc_stxr(ty, x24wr, x28, x25)); // stxr w24, x28, [x25]
sink.put4(enc_stlxr(ty, x24wr, x28, x25)); // stlxr w24, x28, [x25]
// cbnz w24, again.
// Note, we're actually testing x24, and relying on the default zero-high-half
// rule in the assignment that `stxr` does.
// rule in the assignment that `stlxr` does.
let br_again_offset = sink.cur_offset();
sink.put4(enc_conditional_br(
BranchTarget::Label(again_label),
@@ -1532,46 +1538,12 @@ impl MachInstEmit for Inst {
// out:
sink.bind_label(out_label);
sink.put4(enc_dmb_ish()); // dmb ish
}
&Inst::AtomicLoad { ty, r_data, r_addr } => {
let op = match ty {
I8 => 0b0011100001,
I16 => 0b0111100001,
I32 => 0b1011100001,
I64 => 0b1111100001,
_ => unreachable!(),
};
sink.put4(enc_dmb_ish()); // dmb ish
let srcloc = state.cur_srcloc();
if srcloc != SourceLoc::default() {
sink.add_trap(srcloc, TrapCode::HeapOutOfBounds);
}
let uimm12scaled_zero = UImm12Scaled::zero(I8 /*irrelevant*/);
sink.put4(enc_ldst_uimm12(
op,
uimm12scaled_zero,
r_addr,
r_data.to_reg(),
));
&Inst::LoadAcquire { access_ty, rt, rn } => {
sink.put4(enc_ldar(access_ty, rt, rn));
}
&Inst::AtomicStore { ty, r_data, r_addr } => {
let op = match ty {
I8 => 0b0011100000,
I16 => 0b0111100000,
I32 => 0b1011100000,
I64 => 0b1111100000,
_ => unreachable!(),
};
let srcloc = state.cur_srcloc();
if srcloc != SourceLoc::default() {
sink.add_trap(srcloc, TrapCode::HeapOutOfBounds);
}
let uimm12scaled_zero = UImm12Scaled::zero(I8 /*irrelevant*/);
sink.put4(enc_ldst_uimm12(op, uimm12scaled_zero, r_addr, r_data));
sink.put4(enc_dmb_ish()); // dmb ish
&Inst::StoreRelease { access_ty, rt, rn } => {
sink.put4(enc_stlr(access_ty, rt, rn));
}
&Inst::Fence {} => {
sink.put4(enc_dmb_ish()); // dmb ish


@@ -5891,7 +5891,7 @@ fn test_aarch64_binemit() {
ty: I16,
op: inst_common::AtomicRmwOp::Xor,
},
"BF3B03D53B7F5F487C031ACA3C7F1848B8FFFFB5BF3B03D5",
"3BFF5F487C031ACA3CFF1848B8FFFFB5",
"atomically { 16_bits_at_[x25]) Xor= x26 ; x27 = old_value_at_[x25]; x24,x28 = trash }",
));
@@ -5900,7 +5900,7 @@ fn test_aarch64_binemit() {
ty: I32,
op: inst_common::AtomicRmwOp::Xchg,
},
"BF3B03D53B7F5F88FC031AAA3C7F1888B8FFFFB5BF3B03D5",
"3BFF5F88FC031AAA3CFF1888B8FFFFB5",
"atomically { 32_bits_at_[x25]) Xchg= x26 ; x27 = old_value_at_[x25]; x24,x28 = trash }",
));
insns.push((
@@ -5947,56 +5947,112 @@ fn test_aarch64_binemit() {
Inst::AtomicCASLoop {
ty: I8,
},
"BF3B03D53B7F5F08581F40927F0318EB610000543C7F180878FFFFB5BF3B03D5",
"3BFF5F087F033AEB610000543CFF180898FFFFB5",
"atomically { compare-and-swap(8_bits_at_[x25], x26 -> x28), x27 = old_value_at_[x25]; x24 = trash }"
));
insns.push((
Inst::AtomicCASLoop {
ty: I16,
},
"3BFF5F487F233AEB610000543CFF184898FFFFB5",
"atomically { compare-and-swap(16_bits_at_[x25], x26 -> x28), x27 = old_value_at_[x25]; x24 = trash }"
));
insns.push((
Inst::AtomicCASLoop {
ty: I32,
},
"3BFF5F887F031AEB610000543CFF188898FFFFB5",
"atomically { compare-and-swap(32_bits_at_[x25], x26 -> x28), x27 = old_value_at_[x25]; x24 = trash }"
));
insns.push((
Inst::AtomicCASLoop {
ty: I64,
},
"BF3B03D53B7F5FC8F8031AAA7F0318EB610000543C7F18C878FFFFB5BF3B03D5",
"3BFF5FC87F031AEB610000543CFF18C898FFFFB5",
"atomically { compare-and-swap(64_bits_at_[x25], x26 -> x28), x27 = old_value_at_[x25]; x24 = trash }"
));
insns.push((
Inst::AtomicLoad {
ty: I8,
r_data: writable_xreg(7),
r_addr: xreg(28),
Inst::LoadAcquire {
access_ty: I8,
rt: writable_xreg(7),
rn: xreg(28),
},
"BF3B03D587034039",
"atomically { x7 = zero_extend_8_bits_at[x28] }",
"87FFDF08",
"ldarb w7, [x28]",
));
insns.push((
Inst::AtomicLoad {
ty: I64,
r_data: writable_xreg(28),
r_addr: xreg(7),
Inst::LoadAcquire {
access_ty: I16,
rt: writable_xreg(2),
rn: xreg(3),
},
"BF3B03D5FC0040F9",
"atomically { x28 = zero_extend_64_bits_at[x7] }",
"62FCDF48",
"ldarh w2, [x3]",
));
insns.push((
Inst::AtomicStore {
ty: I16,
r_data: xreg(17),
r_addr: xreg(8),
Inst::LoadAcquire {
access_ty: I32,
rt: writable_xreg(15),
rn: xreg(0),
},
"11010079BF3B03D5",
"atomically { 16_bits_at[x8] = x17 }",
"0FFCDF88",
"ldar w15, [x0]",
));
insns.push((
Inst::AtomicStore {
ty: I32,
r_data: xreg(18),
r_addr: xreg(7),
Inst::LoadAcquire {
access_ty: I64,
rt: writable_xreg(28),
rn: xreg(7),
},
"F20000B9BF3B03D5",
"atomically { 32_bits_at[x7] = x18 }",
"FCFCDFC8",
"ldar x28, [x7]",
));
insns.push((
Inst::StoreRelease {
access_ty: I8,
rt: xreg(7),
rn: xreg(28),
},
"87FF9F08",
"stlrb w7, [x28]",
));
insns.push((
Inst::StoreRelease {
access_ty: I16,
rt: xreg(2),
rn: xreg(3),
},
"62FC9F48",
"stlrh w2, [x3]",
));
insns.push((
Inst::StoreRelease {
access_ty: I32,
rt: xreg(15),
rn: xreg(0),
},
"0FFC9F88",
"stlr w15, [x0]",
));
insns.push((
Inst::StoreRelease {
access_ty: I64,
rt: xreg(28),
rn: xreg(7),
},
"FCFC9FC8",
"stlr x28, [x7]",
));
insns.push((Inst::Fence {}, "BF3B03D5", "dmb ish"));


@@ -789,10 +789,9 @@ pub enum Inst {
},
/// Similar to AtomicRMW, a compare-and-swap operation implemented using a load-linked
/// store-conditional loop. The sequence is both preceded and followed by a fence which is
/// at least as comprehensive as that of the `Fence` instruction below. This instruction
/// is sequentially consistent. Note that the operand conventions, although very similar
/// to AtomicRMW, are different:
/// store-conditional loop.
/// This instruction is sequentially consistent.
/// Note that the operand conventions, although very similar to AtomicRMW, are different:
///
/// x25 (rd) address
/// x26 (rd) expected value
@@ -803,22 +802,21 @@ pub enum Inst {
ty: Type, // I8, I16, I32 or I64
},
/// Read `ty` bits from address `r_addr`, zero extend the loaded value to 64 bits and put it
/// in `r_data`. The load instruction is preceded by a fence at least as comprehensive as
/// that of the `Fence` instruction below. This instruction is sequentially consistent.
AtomicLoad {
ty: Type, // I8, I16, I32 or I64
r_data: Writable<Reg>,
r_addr: Reg,
/// Read `access_ty` bits from address `rn`, either 8, 16, 32 or 64 bits, and put
/// the result in `rt`, optionally zero-extending to fill a word or double word result.
/// This instruction is sequentially consistent.
LoadAcquire {
access_ty: Type, // I8, I16, I32 or I64
rt: Writable<Reg>,
rn: Reg,
},
/// Write the lowest `ty` bits of `r_data` to address `r_addr`, with a memory fence
/// instruction following the store. The fence is at least as comprehensive as that of the
/// `Fence` instruction below. This instruction is sequentially consistent.
AtomicStore {
ty: Type, // I8, I16, I32 or I64
r_data: Reg,
r_addr: Reg,
/// Write the lowest `access_ty` bits of `rt` to address `rn`.
/// This instruction is sequentially consistent.
StoreRelease {
access_ty: Type, // I8, I16, I32 or I64
rt: Reg,
rn: Reg,
},
/// A memory fence. This must provide ordering to ensure that, at a minimum, neither loads
@@ -1940,13 +1938,13 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
collector.add_def(writable_xreg(24));
collector.add_def(writable_xreg(27));
}
&Inst::AtomicLoad { r_data, r_addr, .. } => {
collector.add_use(r_addr);
collector.add_def(r_data);
&Inst::LoadAcquire { rt, rn, .. } => {
collector.add_use(rn);
collector.add_def(rt);
}
&Inst::AtomicStore { r_data, r_addr, .. } => {
collector.add_use(r_addr);
collector.add_use(r_data);
&Inst::StoreRelease { rt, rn, .. } => {
collector.add_use(rn);
collector.add_use(rt);
}
&Inst::Fence {} => {}
&Inst::FpuMove64 { rd, rn } => {
@@ -2579,21 +2577,21 @@ fn aarch64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
&mut Inst::AtomicCASLoop { .. } => {
// There are no vregs to map in this insn.
}
&mut Inst::AtomicLoad {
ref mut r_data,
ref mut r_addr,
&mut Inst::LoadAcquire {
ref mut rt,
ref mut rn,
..
} => {
map_def(mapper, r_data);
map_use(mapper, r_addr);
map_def(mapper, rt);
map_use(mapper, rn);
}
&mut Inst::AtomicStore {
ref mut r_data,
ref mut r_addr,
&mut Inst::StoreRelease {
ref mut rt,
ref mut rn,
..
} => {
map_use(mapper, r_data);
map_use(mapper, r_addr);
map_use(mapper, rt);
map_use(mapper, rn);
}
&mut Inst::Fence {} => {}
&mut Inst::FpuMove64 {
@@ -3643,25 +3641,35 @@ impl Inst {
"atomically {{ compare-and-swap({}_bits_at_[x25], x26 -> x28), x27 = old_value_at_[x25]; x24 = trash }}",
ty.bits())
}
&Inst::AtomicLoad {
ty, r_data, r_addr, ..
&Inst::LoadAcquire {
access_ty, rt, rn, ..
} => {
format!(
"atomically {{ {} = zero_extend_{}_bits_at[{}] }}",
r_data.show_rru(mb_rru),
ty.bits(),
r_addr.show_rru(mb_rru)
)
let (op, ty) = match access_ty {
I8 => ("ldarb", I32),
I16 => ("ldarh", I32),
I32 => ("ldar", I32),
I64 => ("ldar", I64),
_ => panic!("Unsupported type: {}", access_ty),
};
let size = OperandSize::from_ty(ty);
let rt = show_ireg_sized(rt.to_reg(), mb_rru, size);
let rn = rn.show_rru(mb_rru);
format!("{} {}, [{}]", op, rt, rn)
}
&Inst::AtomicStore {
ty, r_data, r_addr, ..
&Inst::StoreRelease {
access_ty, rt, rn, ..
} => {
format!(
"atomically {{ {}_bits_at[{}] = {} }}",
ty.bits(),
r_addr.show_rru(mb_rru),
r_data.show_rru(mb_rru)
)
let (op, ty) = match access_ty {
I8 => ("stlrb", I32),
I16 => ("stlrh", I32),
I32 => ("stlr", I32),
I64 => ("stlr", I64),
_ => panic!("Unsupported type: {}", access_ty),
};
let size = OperandSize::from_ty(ty);
let rt = show_ireg_sized(rt, mb_rru, size);
let rn = rn.show_rru(mb_rru);
format!("{} {}, [{}]", op, rt, rn)
}
&Inst::Fence {} => {
format!("dmb ish")


@@ -1522,28 +1522,40 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
}
}
Opcode::AtomicLoad => {
let r_data = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let r_addr = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
let ty_access = ty.unwrap();
assert!(is_valid_atomic_transaction_ty(ty_access));
ctx.emit(Inst::AtomicLoad {
ty: ty_access,
r_data,
r_addr,
});
Opcode::AtomicLoad
| Opcode::AtomicUload8
| Opcode::AtomicUload16
| Opcode::AtomicUload32 => {
let rt = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
let ty = ty.unwrap();
let access_ty = match op {
Opcode::AtomicLoad => ty,
Opcode::AtomicUload8 => I8,
Opcode::AtomicUload16 => I16,
Opcode::AtomicUload32 => I32,
_ => panic!(),
};
assert!(is_valid_atomic_transaction_ty(access_ty));
ctx.emit(Inst::LoadAcquire { access_ty, rt, rn });
}
Opcode::AtomicStore => {
let r_data = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
let r_addr = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
let ty_access = ctx.input_ty(insn, 0);
assert!(is_valid_atomic_transaction_ty(ty_access));
ctx.emit(Inst::AtomicStore {
ty: ty_access,
r_data,
r_addr,
});
Opcode::AtomicStore
| Opcode::AtomicStore32
| Opcode::AtomicStore16
| Opcode::AtomicStore8 => {
let rt = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
let ty = ctx.input_ty(insn, 0);
let access_ty = match op {
Opcode::AtomicStore => ty,
Opcode::AtomicStore32 => I32,
Opcode::AtomicStore16 => I16,
Opcode::AtomicStore8 => I8,
_ => unreachable!(),
};
assert!(is_valid_atomic_transaction_ty(access_ty));
ctx.emit(Inst::StoreRelease { access_ty, rt, rn });
}
Opcode::Fence => {


@@ -2734,37 +2734,61 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
ctx.emit(Inst::AtomicCas64 { rd, rn, mem });
}
}
Opcode::AtomicLoad => {
Opcode::AtomicLoad
| Opcode::AtomicUload8
| Opcode::AtomicUload16
| Opcode::AtomicUload32 => {
let flags = ctx.memflags(insn).unwrap();
let endianness = flags.endianness(Endianness::Big);
let ty = ty.unwrap();
assert!(is_valid_atomic_transaction_ty(ty));
let access_ty = match op {
Opcode::AtomicLoad => ty,
Opcode::AtomicUload8 => types::I8,
Opcode::AtomicUload16 => types::I16,
Opcode::AtomicUload32 => types::I32,
_ => unreachable!(),
};
assert!(is_valid_atomic_transaction_ty(access_ty));
let mem = lower_address(ctx, &inputs[..], 0, flags);
let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
if endianness == Endianness::Big {
ctx.emit(match ty_bits(ty) {
8 => Inst::Load32ZExt8 { rd, mem },
16 => Inst::Load32ZExt16 { rd, mem },
32 => Inst::Load32 { rd, mem },
64 => Inst::Load64 { rd, mem },
ctx.emit(match (ty_bits(access_ty), ty_bits(ty)) {
(8, 32) => Inst::Load32ZExt8 { rd, mem },
(8, 64) => Inst::Load64ZExt8 { rd, mem },
(16, 32) => Inst::Load32ZExt16 { rd, mem },
(16, 64) => Inst::Load64ZExt16 { rd, mem },
(32, 32) => Inst::Load32 { rd, mem },
(32, 64) => Inst::Load64ZExt32 { rd, mem },
(64, 64) => Inst::Load64 { rd, mem },
_ => panic!("Unsupported size in load"),
});
} else {
ctx.emit(match ty_bits(ty) {
8 => Inst::Load32ZExt8 { rd, mem },
16 => Inst::LoadRev16 { rd, mem },
32 => Inst::LoadRev32 { rd, mem },
64 => Inst::LoadRev64 { rd, mem },
ctx.emit(match (ty_bits(access_ty), ty_bits(ty)) {
(8, 32) => Inst::Load32ZExt8 { rd, mem },
(8, 64) => Inst::Load64ZExt8 { rd, mem },
(16, 32) => Inst::LoadRev16 { rd, mem },
(32, 32) => Inst::LoadRev32 { rd, mem },
(64, 64) => Inst::LoadRev64 { rd, mem },
_ => panic!("Unsupported size in load"),
});
}
}
Opcode::AtomicStore => {
Opcode::AtomicStore
| Opcode::AtomicStore32
| Opcode::AtomicStore16
| Opcode::AtomicStore8 => {
let flags = ctx.memflags(insn).unwrap();
let endianness = flags.endianness(Endianness::Big);
let ty = ctx.input_ty(insn, 0);
let data_ty = ctx.input_ty(insn, 0);
let ty = match op {
Opcode::AtomicStore => data_ty,
Opcode::AtomicStore32 => types::I32,
Opcode::AtomicStore16 => types::I16,
Opcode::AtomicStore8 => types::I8,
_ => unreachable!(),
};
assert!(is_valid_atomic_transaction_ty(ty));
let mem = lower_address(ctx, &inputs[1..], 0, flags);


@@ -5825,7 +5825,10 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
ctx.emit(Inst::gen_move(dst, regs::rax(), types::I64));
}
Opcode::AtomicLoad => {
Opcode::AtomicLoad
| Opcode::AtomicUload8
| Opcode::AtomicUload16
| Opcode::AtomicUload32 => {
// This is a normal load. The x86-TSO memory model provides sufficient sequencing
// to satisfy the CLIF synchronisation requirements for `AtomicLoad` without the
// need for any fence instructions.
@@ -5847,11 +5850,21 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
}
}
Opcode::AtomicStore => {
Opcode::AtomicStore
| Opcode::AtomicStore32
| Opcode::AtomicStore16
| Opcode::AtomicStore8 => {
// This is a normal store, followed by an `mfence` instruction.
let data = put_input_in_reg(ctx, inputs[0]);
let addr = lower_to_amode(ctx, inputs[1], 0);
let ty_access = ctx.input_ty(insn, 0);
let data_ty = ctx.input_ty(insn, 0);
let ty_access = match op {
Opcode::AtomicStore => data_ty,
Opcode::AtomicStore32 => types::I32,
Opcode::AtomicStore16 => types::I16,
Opcode::AtomicStore8 => types::I8,
_ => unreachable!(),
};
assert!(is_valid_atomic_transaction_ty(ty_access));
ctx.emit(Inst::store(ty_access, data, addr));