diff --git a/cranelift/codegen/meta/src/shared/instructions.rs b/cranelift/codegen/meta/src/shared/instructions.rs index ffded34f95..04cbc87065 100644 --- a/cranelift/codegen/meta/src/shared/instructions.rs +++ b/cranelift/codegen/meta/src/shared/instructions.rs @@ -4600,8 +4600,7 @@ pub(crate) fn define( r#" Atomically load from memory at `p`. - This is a polymorphic instruction that can load any value type which has a memory - representation. It should only be used for integer types with 8, 16, 32 or 64 bits. + It should only be used for integer types with 32 or 64 bits. This operation is sequentially consistent and creates happens-before edges that order normal (non-atomic) loads and stores. "#, @@ -4613,14 +4612,124 @@ pub(crate) fn define( .other_side_effects(true), ); + ig.push( + Inst::new( + "atomic_uload8", + r#" + Atomically load 8 bits from memory at `p` and zero-extend to either 32 or 64 bits. + + This is equivalent to ``load.i8`` followed by ``uextend``. + + This operation is sequentially consistent and creates happens-before edges that order + normal (non-atomic) loads and stores. + "#, + &formats.load_no_offset, + ) + .operands_in(vec![MemFlags, p]) + .operands_out(vec![a]) + .can_load(true) + .other_side_effects(true), + ); + + ig.push( + Inst::new( + "atomic_uload16", + r#" + Atomically load 16 bits from memory at `p` and zero-extend to either 32 or 64 bits. + + This is equivalent to ``load.i16`` followed by ``uextend``. + + This operation is sequentially consistent and creates + happens-before edges that order normal (non-atomic) loads and stores. + "#, + &formats.load_no_offset, + ) + .operands_in(vec![MemFlags, p]) + .operands_out(vec![a]) + .can_load(true) + .other_side_effects(true), + ); + + ig.push( + Inst::new( + "atomic_uload32", + r#" + Atomically load 32 bits from memory at `p` and zero-extend to 64 bits. + + This is equivalent to ``load.i32`` followed by ``uextend``. + + This operation is sequentially consistent and creates + happens-before edges that order normal (non-atomic) loads and stores. + "#, + &formats.load_no_offset, + ) + .operands_in(vec![MemFlags, p]) + .operands_out(vec![a]) + .can_load(true) + .other_side_effects(true), + ); + ig.push( Inst::new( "atomic_store", r#" Atomically store `x` to memory at `p`. - This is a polymorphic instruction that can store any value type with a memory - representation. It should only be used for integer types with 8, 16, 32 or 64 bits. + This is a polymorphic instruction that can store a 32 or 64-bit value. + This operation is sequentially consistent and creates happens-before edges that order + normal (non-atomic) loads and stores. + "#, + &formats.store_no_offset, + ) + .operands_in(vec![MemFlags, x, p]) + .can_store(true) + .other_side_effects(true), + ); + + ig.push( + Inst::new( + "atomic_store8", + r#" + Atomically store the low 8 bits of `x` to memory at `p`. + + This is equivalent to ``ireduce.i8`` followed by ``store.i8``. + + This operation is sequentially consistent and creates happens-before edges that order + normal (non-atomic) loads and stores. + "#, + &formats.store_no_offset, + ) + .operands_in(vec![MemFlags, x, p]) + .can_store(true) + .other_side_effects(true), + ); + + ig.push( + Inst::new( + "atomic_store16", + r#" + Atomically store the low 16 bits of `x` to memory at `p`. + + This is equivalent to ``ireduce.i16`` followed by ``store.i16``. + + This operation is sequentially consistent and creates happens-before edges that order + normal (non-atomic) loads and stores. 
+ "#, + &formats.store_no_offset, + ) + .operands_in(vec![MemFlags, x, p]) + .can_store(true) + .other_side_effects(true), + ); + + ig.push( + Inst::new( + "atomic_store32", + r#" + Atomically store the low 32 bits of `x` to memory at `p`. + + This is equivalent to ``ireduce.i32`` followed by ``store.i32``. + This operation is sequentially consistent and creates happens-before edges that order normal (non-atomic) loads and stores. "#, diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs index 5374de6bf8..ce669459e1 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs @@ -498,7 +498,7 @@ fn enc_dmb_ish() -> u32 { 0xD5033BBF } -fn enc_ldxr(ty: Type, rt: Writable, rn: Reg) -> u32 { +fn enc_ldar(ty: Type, rt: Writable, rn: Reg) -> u32 { let sz = match ty { I64 => 0b11, I32 => 0b10, @@ -506,13 +506,13 @@ fn enc_ldxr(ty: Type, rt: Writable, rn: Reg) -> u32 { I8 => 0b00, _ => unreachable!(), }; - 0b00001000_01011111_01111100_00000000 + 0b00_001000_1_1_0_11111_1_11111_00000_00000 | (sz << 30) | (machreg_to_gpr(rn) << 5) | machreg_to_gpr(rt.to_reg()) } -fn enc_stxr(ty: Type, rs: Writable, rt: Reg, rn: Reg) -> u32 { +fn enc_stlr(ty: Type, rt: Reg, rn: Reg) -> u32 { let sz = match ty { I64 => 0b11, I32 => 0b10, @@ -520,7 +520,35 @@ fn enc_stxr(ty: Type, rs: Writable, rt: Reg, rn: Reg) -> u32 { I8 => 0b00, _ => unreachable!(), }; - 0b00001000_00000000_01111100_00000000 + 0b00_001000_100_11111_1_11111_00000_00000 + | (sz << 30) + | (machreg_to_gpr(rn) << 5) + | machreg_to_gpr(rt) +} + +fn enc_ldaxr(ty: Type, rt: Writable, rn: Reg) -> u32 { + let sz = match ty { + I64 => 0b11, + I32 => 0b10, + I16 => 0b01, + I8 => 0b00, + _ => unreachable!(), + }; + 0b00_001000_0_1_0_11111_1_11111_00000_00000 + | (sz << 30) + | (machreg_to_gpr(rn) << 5) + | machreg_to_gpr(rt.to_reg()) +} + +fn enc_stlxr(ty: Type, rs: Writable, rt: Reg, rn: Reg) -> u32 { + let sz = match ty { + I64 => 0b11, + I32 => 0b10, + I16 => 0b01, + I8 => 0b00, + _ => unreachable!(), + }; + 0b00_001000_000_00000_1_11111_00000_00000 | (sz << 30) | (machreg_to_gpr(rs.to_reg()) << 16) | (machreg_to_gpr(rn) << 5) @@ -1286,20 +1314,18 @@ impl MachInstEmit for Inst { } &Inst::AtomicRMW { ty, op } => { /* Emit this: - dmb ish again: - ldxr{,b,h} x/w27, [x25] + ldaxr{,b,h} x/w27, [x25] op x28, x27, x26 // op is add,sub,and,orr,eor - stxr{,b,h} w24, x/w28, [x25] + stlxr{,b,h} w24, x/w28, [x25] cbnz x24, again - dmb ish Operand conventions: IN: x25 (addr), x26 (2nd arg for op) OUT: x27 (old value), x24 (trashed), x28 (trashed) It is unfortunate that, per the ARM documentation, x28 cannot be used for - both the store-data and success-flag operands of stxr. This causes the + both the store-data and success-flag operands of stlxr. This causes the instruction's behaviour to be "CONSTRAINED UNPREDICTABLE", so we use x24 instead for the success-flag. 
@@ -1320,15 +1346,13 @@ impl MachInstEmit for Inst { let x28wr = writable_xreg(28); let again_label = sink.get_label(); - sink.put4(enc_dmb_ish()); // dmb ish - // again: sink.bind_label(again_label); let srcloc = state.cur_srcloc(); if srcloc != SourceLoc::default() { sink.add_trap(srcloc, TrapCode::HeapOutOfBounds); } - sink.put4(enc_ldxr(ty, x27wr, x25)); // ldxr x27, [x25] + sink.put4(enc_ldaxr(ty, x27wr, x25)); // ldaxr x27, [x25] match op { AtomicRmwOp::Xchg => { @@ -1420,19 +1444,17 @@ impl MachInstEmit for Inst { if srcloc != SourceLoc::default() { sink.add_trap(srcloc, TrapCode::HeapOutOfBounds); } - sink.put4(enc_stxr(ty, x24wr, x28, x25)); // stxr w24, x28, [x25] + sink.put4(enc_stlxr(ty, x24wr, x28, x25)); // stlxr w24, x28, [x25] // cbnz w24, again // Note, we're actually testing x24, and relying on the default zero-high-half - // rule in the assignment that `stxr` does. + // rule in the assignment that `stlxr` does. let br_offset = sink.cur_offset(); sink.put4(enc_conditional_br( BranchTarget::Label(again_label), CondBrKind::NotZero(x24), )); sink.use_label_at_offset(br_offset, again_label, LabelUse::Branch19); - - sink.put4(enc_dmb_ish()); // dmb ish } &Inst::AtomicCAS { rs, rt, rn, ty } => { let size = match ty { @@ -1447,22 +1469,18 @@ impl MachInstEmit for Inst { } &Inst::AtomicCASLoop { ty } => { /* Emit this: - dmb ish again: - ldxr{,b,h} x/w27, [x25] - and x24, x26, MASK (= 2^size_bits - 1) - cmp x27, x24 + ldaxr{,b,h} x/w27, [x25] + cmp x27, x/w26 uxt{b,h} b.ne out - stxr{,b,h} w24, x/w28, [x25] + stlxr{,b,h} w24, x/w28, [x25] cbnz x24, again out: - dmb ish Operand conventions: IN: x25 (addr), x26 (expected value), x28 (replacement value) OUT: x27 (old value), x24 (trashed) */ - let xzr = zero_reg(); let x24 = xreg(24); let x25 = xreg(25); let x26 = xreg(26); @@ -1474,37 +1492,25 @@ impl MachInstEmit for Inst { let again_label = sink.get_label(); let out_label = sink.get_label(); - sink.put4(enc_dmb_ish()); // dmb ish - // again: sink.bind_label(again_label); let srcloc = state.cur_srcloc(); if srcloc != SourceLoc::default() { sink.add_trap(srcloc, TrapCode::HeapOutOfBounds); } - sink.put4(enc_ldxr(ty, x27wr, x25)); // ldxr x27, [x25] + // ldaxr x27, [x25] + sink.put4(enc_ldaxr(ty, x27wr, x25)); - if ty == I64 { - // mov x24, x26 - sink.put4(enc_arith_rrr(0b101_01010_00_0, 0b000000, x24wr, xzr, x26)) - } else { - // and x24, x26, 0xFF/0xFFFF/0xFFFFFFFF - let (mask, s) = match ty { - I8 => (0xFF, 7), - I16 => (0xFFFF, 15), - I32 => (0xFFFFFFFF, 31), - _ => unreachable!(), - }; - sink.put4(enc_arith_rr_imml( - 0b100_100100, - ImmLogic::from_n_r_s(mask, true, 0, s, OperandSize::Size64).enc_bits(), - x26, - x24wr, - )) - } - - // cmp x27, x24 (== subs xzr, x27, x24) - sink.put4(enc_arith_rrr(0b111_01011_00_0, 0b000000, xzrwr, x27, x24)); + // The top 32-bits are zero-extended by the ldaxr so we don't + // have to use UXTW, just the x-form of the register. + let (bit21, extend_op) = match ty { + I8 => (0b1, 0b000000), + I16 => (0b1, 0b001000), + _ => (0b0, 0b000000), + }; + let bits_31_21 = 0b111_01011_000 | bit21; + // cmp x27, x26 (== subs xzr, x27, x26) + sink.put4(enc_arith_rrr(bits_31_21, extend_op, xzrwr, x27, x26)); // b.ne out let br_out_offset = sink.cur_offset(); @@ -1518,11 +1524,11 @@ impl MachInstEmit for Inst { if srcloc != SourceLoc::default() { sink.add_trap(srcloc, TrapCode::HeapOutOfBounds); } - sink.put4(enc_stxr(ty, x24wr, x28, x25)); // stxr w24, x28, [x25] + sink.put4(enc_stlxr(ty, x24wr, x28, x25)); // stlxr w24, x28, [x25] // cbnz w24, again. 
// Note, we're actually testing x24, and relying on the default zero-high-half - // rule in the assignment that `stxr` does. + // rule in the assignment that `stlxr` does. let br_again_offset = sink.cur_offset(); sink.put4(enc_conditional_br( BranchTarget::Label(again_label), @@ -1532,46 +1538,12 @@ impl MachInstEmit for Inst { // out: sink.bind_label(out_label); - sink.put4(enc_dmb_ish()); // dmb ish } - &Inst::AtomicLoad { ty, r_data, r_addr } => { - let op = match ty { - I8 => 0b0011100001, - I16 => 0b0111100001, - I32 => 0b1011100001, - I64 => 0b1111100001, - _ => unreachable!(), - }; - sink.put4(enc_dmb_ish()); // dmb ish - - let srcloc = state.cur_srcloc(); - if srcloc != SourceLoc::default() { - sink.add_trap(srcloc, TrapCode::HeapOutOfBounds); - } - let uimm12scaled_zero = UImm12Scaled::zero(I8 /*irrelevant*/); - sink.put4(enc_ldst_uimm12( - op, - uimm12scaled_zero, - r_addr, - r_data.to_reg(), - )); + &Inst::LoadAcquire { access_ty, rt, rn } => { + sink.put4(enc_ldar(access_ty, rt, rn)); } - &Inst::AtomicStore { ty, r_data, r_addr } => { - let op = match ty { - I8 => 0b0011100000, - I16 => 0b0111100000, - I32 => 0b1011100000, - I64 => 0b1111100000, - _ => unreachable!(), - }; - - let srcloc = state.cur_srcloc(); - if srcloc != SourceLoc::default() { - sink.add_trap(srcloc, TrapCode::HeapOutOfBounds); - } - let uimm12scaled_zero = UImm12Scaled::zero(I8 /*irrelevant*/); - sink.put4(enc_ldst_uimm12(op, uimm12scaled_zero, r_addr, r_data)); - sink.put4(enc_dmb_ish()); // dmb ish + &Inst::StoreRelease { access_ty, rt, rn } => { + sink.put4(enc_stlr(access_ty, rt, rn)); } &Inst::Fence {} => { sink.put4(enc_dmb_ish()); // dmb ish diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs index b27d183a94..9e45c6795c 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs @@ -5891,7 +5891,7 @@ fn test_aarch64_binemit() { ty: I16, op: inst_common::AtomicRmwOp::Xor, }, - "BF3B03D53B7F5F487C031ACA3C7F1848B8FFFFB5BF3B03D5", + "3BFF5F487C031ACA3CFF1848B8FFFFB5", "atomically { 16_bits_at_[x25]) Xor= x26 ; x27 = old_value_at_[x25]; x24,x28 = trash }", )); @@ -5900,7 +5900,7 @@ fn test_aarch64_binemit() { ty: I32, op: inst_common::AtomicRmwOp::Xchg, }, - "BF3B03D53B7F5F88FC031AAA3C7F1888B8FFFFB5BF3B03D5", + "3BFF5F88FC031AAA3CFF1888B8FFFFB5", "atomically { 32_bits_at_[x25]) Xchg= x26 ; x27 = old_value_at_[x25]; x24,x28 = trash }", )); insns.push(( @@ -5947,56 +5947,112 @@ fn test_aarch64_binemit() { Inst::AtomicCASLoop { ty: I8, }, - "BF3B03D53B7F5F08581F40927F0318EB610000543C7F180878FFFFB5BF3B03D5", + "3BFF5F087F033AEB610000543CFF180898FFFFB5", "atomically { compare-and-swap(8_bits_at_[x25], x26 -> x28), x27 = old_value_at_[x25]; x24 = trash }" )); + insns.push(( + Inst::AtomicCASLoop { + ty: I16, + }, + "3BFF5F487F233AEB610000543CFF184898FFFFB5", + "atomically { compare-and-swap(16_bits_at_[x25], x26 -> x28), x27 = old_value_at_[x25]; x24 = trash }" + )); + + insns.push(( + Inst::AtomicCASLoop { + ty: I32, + }, + "3BFF5F887F031AEB610000543CFF188898FFFFB5", + "atomically { compare-and-swap(32_bits_at_[x25], x26 -> x28), x27 = old_value_at_[x25]; x24 = trash }" + )); + insns.push(( Inst::AtomicCASLoop { ty: I64, }, - "BF3B03D53B7F5FC8F8031AAA7F0318EB610000543C7F18C878FFFFB5BF3B03D5", + "3BFF5FC87F031AEB610000543CFF18C898FFFFB5", "atomically { compare-and-swap(64_bits_at_[x25], x26 -> x28), x27 = old_value_at_[x25]; x24 = trash }" )); insns.push(( - Inst::AtomicLoad { - 
-            ty: I8,
-            r_data: writable_xreg(7),
-            r_addr: xreg(28),
+        Inst::LoadAcquire {
+            access_ty: I8,
+            rt: writable_xreg(7),
+            rn: xreg(28),
         },
-        "BF3B03D587034039",
-        "atomically { x7 = zero_extend_8_bits_at[x28] }",
+        "87FFDF08",
+        "ldarb w7, [x28]",
     ));
 
     insns.push((
-        Inst::AtomicLoad {
-            ty: I64,
-            r_data: writable_xreg(28),
-            r_addr: xreg(7),
+        Inst::LoadAcquire {
+            access_ty: I16,
+            rt: writable_xreg(2),
+            rn: xreg(3),
         },
-        "BF3B03D5FC0040F9",
-        "atomically { x28 = zero_extend_64_bits_at[x7] }",
+        "62FCDF48",
+        "ldarh w2, [x3]",
     ));
 
     insns.push((
-        Inst::AtomicStore {
-            ty: I16,
-            r_data: xreg(17),
-            r_addr: xreg(8),
+        Inst::LoadAcquire {
+            access_ty: I32,
+            rt: writable_xreg(15),
+            rn: xreg(0),
         },
-        "11010079BF3B03D5",
-        "atomically { 16_bits_at[x8] = x17 }",
+        "0FFCDF88",
+        "ldar w15, [x0]",
     ));
 
     insns.push((
-        Inst::AtomicStore {
-            ty: I32,
-            r_data: xreg(18),
-            r_addr: xreg(7),
+        Inst::LoadAcquire {
+            access_ty: I64,
+            rt: writable_xreg(28),
+            rn: xreg(7),
         },
-        "F20000B9BF3B03D5",
-        "atomically { 32_bits_at[x7] = x18 }",
+        "FCFCDFC8",
+        "ldar x28, [x7]",
+    ));
+
+    insns.push((
+        Inst::StoreRelease {
+            access_ty: I8,
+            rt: xreg(7),
+            rn: xreg(28),
+        },
+        "87FF9F08",
+        "stlrb w7, [x28]",
+    ));
+
+    insns.push((
+        Inst::StoreRelease {
+            access_ty: I16,
+            rt: xreg(2),
+            rn: xreg(3),
+        },
+        "62FC9F48",
+        "stlrh w2, [x3]",
+    ));
+
+    insns.push((
+        Inst::StoreRelease {
+            access_ty: I32,
+            rt: xreg(15),
+            rn: xreg(0),
+        },
+        "0FFC9F88",
+        "stlr w15, [x0]",
+    ));
+
+    insns.push((
+        Inst::StoreRelease {
+            access_ty: I64,
+            rt: xreg(28),
+            rn: xreg(7),
+        },
+        "FCFC9FC8",
+        "stlr x28, [x7]",
     ));
 
     insns.push((Inst::Fence {}, "BF3B03D5", "dmb ish"));
diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
index d498bc9b85..ce1b520429 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
@@ -789,10 +789,9 @@ pub enum Inst {
     },
 
     /// Similar to AtomicRMW, a compare-and-swap operation implemented using a load-linked
-    /// store-conditional loop. The sequence is both preceded and followed by a fence which is
-    /// at least as comprehensive as that of the `Fence` instruction below. This instruction
-    /// is sequentially consistent. Note that the operand conventions, although very similar
-    /// to AtomicRMW, are different:
+    /// store-conditional loop.
+    /// This instruction is sequentially consistent.
+    /// Note that the operand conventions, although very similar to AtomicRMW, are different:
     ///
     /// x25 (rd) address
     /// x26 (rd) expected value
@@ -803,22 +802,21 @@ pub enum Inst {
         ty: Type, // I8, I16, I32 or I64
     },
 
-    /// Read `ty` bits from address `r_addr`, zero extend the loaded value to 64 bits and put it
-    /// in `r_data`. The load instruction is preceded by a fence at least as comprehensive as
-    /// that of the `Fence` instruction below. This instruction is sequentially consistent.
-    AtomicLoad {
-        ty: Type, // I8, I16, I32 or I64
-        r_data: Writable<Reg>,
-        r_addr: Reg,
+    /// Read `access_ty` bits from address `rn`, either 8, 16, 32 or 64-bits, and put
+    /// it in `rt`, optionally zero-extending to fill a word or double word result.
+    /// This instruction is sequentially consistent.
+    LoadAcquire {
+        access_ty: Type, // I8, I16, I32 or I64
+        rt: Writable<Reg>,
+        rn: Reg,
     },
 
-    /// Write the lowest `ty` bits of `r_data` to address `r_addr`, with a memory fence
-    /// instruction following the store. The fence is at least as comprehensive as that of the
-    /// `Fence` instruction below.
This instruction is sequentially consistent. - AtomicStore { - ty: Type, // I8, I16, I32 or I64 - r_data: Reg, - r_addr: Reg, + /// Write the lowest `ty` bits of `rt` to address `rn`. + /// This instruction is sequentially consistent. + StoreRelease { + access_ty: Type, // I8, I16, I32 or I64 + rt: Reg, + rn: Reg, }, /// A memory fence. This must provide ordering to ensure that, at a minimum, neither loads @@ -1940,13 +1938,13 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) { collector.add_def(writable_xreg(24)); collector.add_def(writable_xreg(27)); } - &Inst::AtomicLoad { r_data, r_addr, .. } => { - collector.add_use(r_addr); - collector.add_def(r_data); + &Inst::LoadAcquire { rt, rn, .. } => { + collector.add_use(rn); + collector.add_def(rt); } - &Inst::AtomicStore { r_data, r_addr, .. } => { - collector.add_use(r_addr); - collector.add_use(r_data); + &Inst::StoreRelease { rt, rn, .. } => { + collector.add_use(rn); + collector.add_use(rt); } &Inst::Fence {} => {} &Inst::FpuMove64 { rd, rn } => { @@ -2579,21 +2577,21 @@ fn aarch64_map_regs(inst: &mut Inst, mapper: &RUM) { &mut Inst::AtomicCASLoop { .. } => { // There are no vregs to map in this insn. } - &mut Inst::AtomicLoad { - ref mut r_data, - ref mut r_addr, + &mut Inst::LoadAcquire { + ref mut rt, + ref mut rn, .. } => { - map_def(mapper, r_data); - map_use(mapper, r_addr); + map_def(mapper, rt); + map_use(mapper, rn); } - &mut Inst::AtomicStore { - ref mut r_data, - ref mut r_addr, + &mut Inst::StoreRelease { + ref mut rt, + ref mut rn, .. } => { - map_use(mapper, r_data); - map_use(mapper, r_addr); + map_use(mapper, rt); + map_use(mapper, rn); } &mut Inst::Fence {} => {} &mut Inst::FpuMove64 { @@ -3643,25 +3641,35 @@ impl Inst { "atomically {{ compare-and-swap({}_bits_at_[x25], x26 -> x28), x27 = old_value_at_[x25]; x24 = trash }}", ty.bits()) } - &Inst::AtomicLoad { - ty, r_data, r_addr, .. + &Inst::LoadAcquire { + access_ty, rt, rn, .. } => { - format!( - "atomically {{ {} = zero_extend_{}_bits_at[{}] }}", - r_data.show_rru(mb_rru), - ty.bits(), - r_addr.show_rru(mb_rru) - ) + let (op, ty) = match access_ty { + I8 => ("ldarb", I32), + I16 => ("ldarh", I32), + I32 => ("ldar", I32), + I64 => ("ldar", I64), + _ => panic!("Unsupported type: {}", access_ty), + }; + let size = OperandSize::from_ty(ty); + let rt = show_ireg_sized(rt.to_reg(), mb_rru, size); + let rn = rn.show_rru(mb_rru); + format!("{} {}, [{}]", op, rt, rn) } - &Inst::AtomicStore { - ty, r_data, r_addr, .. + &Inst::StoreRelease { + access_ty, rt, rn, .. 
} => { - format!( - "atomically {{ {}_bits_at[{}] = {} }}", - ty.bits(), - r_addr.show_rru(mb_rru), - r_data.show_rru(mb_rru) - ) + let (op, ty) = match access_ty { + I8 => ("stlrb", I32), + I16 => ("stlrh", I32), + I32 => ("stlr", I32), + I64 => ("stlr", I64), + _ => panic!("Unsupported type: {}", access_ty), + }; + let size = OperandSize::from_ty(ty); + let rt = show_ireg_sized(rt, mb_rru, size); + let rn = rn.show_rru(mb_rru); + format!("{} {}, [{}]", op, rt, rn) } &Inst::Fence {} => { format!("dmb ish") diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs index f9440dbbb1..8a4df2026b 100644 --- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs +++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs @@ -1522,28 +1522,40 @@ pub(crate) fn lower_insn_to_regs>( } } - Opcode::AtomicLoad => { - let r_data = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); - let r_addr = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); - let ty_access = ty.unwrap(); - assert!(is_valid_atomic_transaction_ty(ty_access)); - ctx.emit(Inst::AtomicLoad { - ty: ty_access, - r_data, - r_addr, - }); + Opcode::AtomicLoad + | Opcode::AtomicUload8 + | Opcode::AtomicUload16 + | Opcode::AtomicUload32 => { + let rt = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); + let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + let ty = ty.unwrap(); + let access_ty = match op { + Opcode::AtomicLoad => ty, + Opcode::AtomicUload8 => I8, + Opcode::AtomicUload16 => I16, + Opcode::AtomicUload32 => I32, + _ => panic!(), + }; + assert!(is_valid_atomic_transaction_ty(access_ty)); + ctx.emit(Inst::LoadAcquire { access_ty, rt, rn }); } - Opcode::AtomicStore => { - let r_data = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); - let r_addr = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); - let ty_access = ctx.input_ty(insn, 0); - assert!(is_valid_atomic_transaction_ty(ty_access)); - ctx.emit(Inst::AtomicStore { - ty: ty_access, - r_data, - r_addr, - }); + Opcode::AtomicStore + | Opcode::AtomicStore32 + | Opcode::AtomicStore16 + | Opcode::AtomicStore8 => { + let rt = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); + let ty = ctx.input_ty(insn, 0); + let access_ty = match op { + Opcode::AtomicStore => ty, + Opcode::AtomicStore32 => I32, + Opcode::AtomicStore16 => I16, + Opcode::AtomicStore8 => I8, + _ => unreachable!(), + }; + assert!(is_valid_atomic_transaction_ty(access_ty)); + ctx.emit(Inst::StoreRelease { access_ty, rt, rn }); } Opcode::Fence => { diff --git a/cranelift/codegen/src/isa/s390x/lower.rs b/cranelift/codegen/src/isa/s390x/lower.rs index b13edc4bb2..8ff375788a 100644 --- a/cranelift/codegen/src/isa/s390x/lower.rs +++ b/cranelift/codegen/src/isa/s390x/lower.rs @@ -2734,37 +2734,61 @@ fn lower_insn_to_regs>( ctx.emit(Inst::AtomicCas64 { rd, rn, mem }); } } - Opcode::AtomicLoad => { + Opcode::AtomicLoad + | Opcode::AtomicUload8 + | Opcode::AtomicUload16 + | Opcode::AtomicUload32 => { let flags = ctx.memflags(insn).unwrap(); let endianness = flags.endianness(Endianness::Big); let ty = ty.unwrap(); - assert!(is_valid_atomic_transaction_ty(ty)); + let access_ty = match op { + Opcode::AtomicLoad => ty, + Opcode::AtomicUload8 => types::I8, + Opcode::AtomicUload16 => types::I16, + Opcode::AtomicUload32 => types::I32, + _ => unreachable!(), + }; + assert!(is_valid_atomic_transaction_ty(access_ty)); let mem = lower_address(ctx, &inputs[..], 0, flags); let rd = 
get_output_reg(ctx, outputs[0]).only_reg().unwrap(); if endianness == Endianness::Big { - ctx.emit(match ty_bits(ty) { - 8 => Inst::Load32ZExt8 { rd, mem }, - 16 => Inst::Load32ZExt16 { rd, mem }, - 32 => Inst::Load32 { rd, mem }, - 64 => Inst::Load64 { rd, mem }, + ctx.emit(match (ty_bits(access_ty), ty_bits(ty)) { + (8, 32) => Inst::Load32ZExt8 { rd, mem }, + (8, 64) => Inst::Load64ZExt8 { rd, mem }, + (16, 32) => Inst::Load32ZExt16 { rd, mem }, + (16, 64) => Inst::Load64ZExt16 { rd, mem }, + (32, 32) => Inst::Load32 { rd, mem }, + (32, 64) => Inst::Load64ZExt32 { rd, mem }, + (64, 64) => Inst::Load64 { rd, mem }, _ => panic!("Unsupported size in load"), }); } else { - ctx.emit(match ty_bits(ty) { - 8 => Inst::Load32ZExt8 { rd, mem }, - 16 => Inst::LoadRev16 { rd, mem }, - 32 => Inst::LoadRev32 { rd, mem }, - 64 => Inst::LoadRev64 { rd, mem }, + ctx.emit(match (ty_bits(access_ty), ty_bits(ty)) { + (8, 32) => Inst::Load32ZExt8 { rd, mem }, + (8, 64) => Inst::Load64ZExt8 { rd, mem }, + (16, 32) => Inst::LoadRev16 { rd, mem }, + (32, 32) => Inst::LoadRev32 { rd, mem }, + (64, 64) => Inst::LoadRev64 { rd, mem }, _ => panic!("Unsupported size in load"), }); } } - Opcode::AtomicStore => { + Opcode::AtomicStore + | Opcode::AtomicStore32 + | Opcode::AtomicStore16 + | Opcode::AtomicStore8 => { let flags = ctx.memflags(insn).unwrap(); let endianness = flags.endianness(Endianness::Big); - let ty = ctx.input_ty(insn, 0); + let data_ty = ctx.input_ty(insn, 0); + let ty = match op { + Opcode::AtomicStore => data_ty, + Opcode::AtomicStore32 => types::I32, + Opcode::AtomicStore16 => types::I16, + Opcode::AtomicStore8 => types::I8, + _ => unreachable!(), + }; assert!(is_valid_atomic_transaction_ty(ty)); let mem = lower_address(ctx, &inputs[1..], 0, flags); diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs index b4c05cee8f..19433dc71e 100644 --- a/cranelift/codegen/src/isa/x64/lower.rs +++ b/cranelift/codegen/src/isa/x64/lower.rs @@ -5825,7 +5825,10 @@ fn lower_insn_to_regs>( ctx.emit(Inst::gen_move(dst, regs::rax(), types::I64)); } - Opcode::AtomicLoad => { + Opcode::AtomicLoad + | Opcode::AtomicUload8 + | Opcode::AtomicUload16 + | Opcode::AtomicUload32 => { // This is a normal load. The x86-TSO memory model provides sufficient sequencing // to satisfy the CLIF synchronisation requirements for `AtomicLoad` without the // need for any fence instructions. @@ -5847,11 +5850,21 @@ fn lower_insn_to_regs>( } } - Opcode::AtomicStore => { + Opcode::AtomicStore + | Opcode::AtomicStore32 + | Opcode::AtomicStore16 + | Opcode::AtomicStore8 => { // This is a normal store, followed by an `mfence` instruction. 
let data = put_input_in_reg(ctx, inputs[0]); let addr = lower_to_amode(ctx, inputs[1], 0); - let ty_access = ctx.input_ty(insn, 0); + let data_ty = ctx.input_ty(insn, 0); + let ty_access = match op { + Opcode::AtomicStore => data_ty, + Opcode::AtomicStore32 => types::I32, + Opcode::AtomicStore16 => types::I16, + Opcode::AtomicStore8 => types::I8, + _ => unreachable!(), + }; assert!(is_valid_atomic_transaction_ty(ty_access)); ctx.emit(Inst::store(ty_access, data, addr)); diff --git a/cranelift/filetests/filetests/isa/aarch64/atomic_load.clif b/cranelift/filetests/filetests/isa/aarch64/atomic_load.clif new file mode 100644 index 0000000000..31af721015 --- /dev/null +++ b/cranelift/filetests/filetests/isa/aarch64/atomic_load.clif @@ -0,0 +1,72 @@ +test compile +target aarch64 + +function %atomic_load_i64(i64) -> i64 { +block0(v0: i64): + v1 = atomic_load.i64 v0 + return v1 +} + +; check: ldar x0, [x0] +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %atomic_load_i32(i64) -> i32 { +block0(v0: i64): + v1 = atomic_load.i32 v0 + return v1 +} + +; check: ldar w0, [x0] +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %atomic_uload_i32_i64(i64) -> i64 { +block0(v0: i64): + v1 = atomic_uload32.i64 v0 + return v1 +} + +; check: ldar w0, [x0] +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %atomic_uload_i16_i32(i64) -> i32 { +block0(v0: i64): + v1 = atomic_uload16.i32 v0 + return v1 +} + +; check: ldarh w0, [x0] +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %atomic_uload_i16_i64(i64) -> i64 { +block0(v0: i64): + v1 = atomic_uload16.i64 v0 + return v1 +} + +; check: ldarh w0, [x0] +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %atomic_uload_i8_i32(i64) -> i32 { +block0(v0: i64): + v1 = atomic_uload8.i32 v0 + return v1 +} + +; check: ldarb w0, [x0] +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %atomic_uload_i8_i64(i64) -> i64 { +block0(v0: i64): + v1 = atomic_uload8.i64 v0 + return v1 +} + +; check: ldarb w0, [x0] +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret diff --git a/cranelift/filetests/filetests/isa/aarch64/atomic_store.clif b/cranelift/filetests/filetests/isa/aarch64/atomic_store.clif new file mode 100644 index 0000000000..9c0cd529d7 --- /dev/null +++ b/cranelift/filetests/filetests/isa/aarch64/atomic_store.clif @@ -0,0 +1,72 @@ +test compile +target aarch64 + +function %atomic_store_i64(i64, i64) { +block0(v0: i64, v1: i64): + atomic_store.i64 v0, v1 + return +} + +; check: stlr x0, [x1] +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %atomic_store_i32(i32, i64) { +block0(v0: i32, v1: i64): + atomic_store.i32 v0, v1 + return +} + +; check: stlr w0, [x1] +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %atomic_ustore_i32_i64(i64, i64) { +block0(v0: i64, v1: i64): + atomic_store32.i64 v0, v1 + return +} + +; check: stlr w0, [x1] +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %atomic_ustore_i16_i32(i32, i64) { +block0(v0: i32, v1: i64): + atomic_store16.i32 v0, v1 + return +} + +; check: stlrh w0, [x1] +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %atomic_ustore_i16_i64(i64, i64) { +block0(v0: i64, v1: i64): + atomic_store16.i64 v0, v1 + return +} + +; check: stlrh w0, [x1] +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %atomic_ustore_i8_i32(i32, i64) { +block0(v0: i32, v1: i64): + atomic_store8.i32 v0, v1 + return +} + +; check: stlrb w0, [x1] +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %atomic_ustore_i8_i64(i64, i64) { +block0(v0: i64, v1: 
i64): + atomic_store8.i64 v0, v1 + return +} + +; check: stlrb w0, [x1] +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret diff --git a/cranelift/filetests/filetests/isa/s390x/atomic_load-little.clif b/cranelift/filetests/filetests/isa/s390x/atomic_load-little.clif index 5556176bbb..629c432370 100644 --- a/cranelift/filetests/filetests/isa/s390x/atomic_load-little.clif +++ b/cranelift/filetests/filetests/isa/s390x/atomic_load-little.clif @@ -41,29 +41,29 @@ block0: ; check: larl %r1, %sym + 0 ; lrv %r2, 0(%r1) ; nextln: br %r14 -function %atomic_load_i16(i64) -> i16 { +function %atomic_load_i16(i64) -> i32 { block0(v0: i64): - v1 = atomic_load.i16 little v0 + v1 = atomic_uload16.i32 little v0 return v1 } ; check: lrvh %r2, 0(%r2) ; nextln: br %r14 -function %atomic_load_i16_sym() -> i16 { +function %atomic_load_i16_sym() -> i32 { gv0 = symbol colocated %sym block0: v0 = symbol_value.i64 gv0 - v1 = atomic_load.i16 little v0 + v1 = atomic_uload16.i32 little v0 return v1 } ; check: larl %r1, %sym + 0 ; lrvh %r2, 0(%r1) ; nextln: br %r14 -function %atomic_load_i8(i64) -> i8 { +function %atomic_load_i8(i64) -> i32 { block0(v0: i64): - v1 = atomic_load.i8 little v0 + v1 = atomic_uload8.i32 little v0 return v1 } diff --git a/cranelift/filetests/filetests/isa/s390x/atomic_load.clif b/cranelift/filetests/filetests/isa/s390x/atomic_load.clif index b361aaa4c7..9a58de52d1 100644 --- a/cranelift/filetests/filetests/isa/s390x/atomic_load.clif +++ b/cranelift/filetests/filetests/isa/s390x/atomic_load.clif @@ -41,29 +41,29 @@ block0: ; check: lrl %r2, %sym + 0 ; nextln: br %r14 -function %atomic_load_i16(i64) -> i16 { +function %atomic_load_i16(i64) -> i32 { block0(v0: i64): - v1 = atomic_load.i16 v0 + v1 = atomic_uload16.i32 v0 return v1 } ; check: llh %r2, 0(%r2) ; nextln: br %r14 -function %atomic_load_i16_sym() -> i16 { +function %atomic_load_i16_sym() -> i32 { gv0 = symbol colocated %sym block0: v0 = symbol_value.i64 gv0 - v1 = atomic_load.i16 v0 + v1 = atomic_uload16.i32 v0 return v1 } ; check: llhrl %r2, %sym + 0 ; nextln: br %r14 -function %atomic_load_i8(i64) -> i8 { +function %atomic_load_i8(i64) -> i32 { block0(v0: i64): - v1 = atomic_load.i8 v0 + v1 = atomic_uload8.i32 v0 return v1 } diff --git a/cranelift/interpreter/src/step.rs b/cranelift/interpreter/src/step.rs index c9c0372980..2b892de57c 100644 --- a/cranelift/interpreter/src/step.rs +++ b/cranelift/interpreter/src/step.rs @@ -625,8 +625,14 @@ where Opcode::Iconcat => assign(Value::concat(arg(0)?, arg(1)?)?), Opcode::AtomicRmw => unimplemented!("AtomicRmw"), Opcode::AtomicCas => unimplemented!("AtomicCas"), - Opcode::AtomicLoad => unimplemented!("AtomicLoad"), - Opcode::AtomicStore => unimplemented!("AtomicStore"), + Opcode::AtomicLoad + | Opcode::AtomicUload8 + | Opcode::AtomicUload16 + | Opcode::AtomicUload32 => unimplemented!("AtomicLoad"), + Opcode::AtomicStore + | Opcode::AtomicStore8 + | Opcode::AtomicStore16 + | Opcode::AtomicStore32 => unimplemented!("AtomicStore"), Opcode::Fence => unimplemented!("Fence"), Opcode::WideningPairwiseDotProductS => unimplemented!("WideningPairwiseDotProductS"), Opcode::SqmulRoundSat => unimplemented!("SqmulRoundSat"),
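Usage sketch (illustrative only, not part of the patch; the function names are made up): with the new zero-extending loads and narrowing stores, a frontend can fold the old load-then-extend and reduce-then-store patterns into a single sequentially consistent access, which aarch64 now lowers to a plain ldar*/stlr* rather than the old fence-bracketed sequences, as the filetests above check.

; Old form: the narrow atomic load produced a narrow value that then had to be widened.
function %old_uload16(i64) -> i64 {
block0(v0: i64):
    v1 = atomic_load.i16 v0
    v2 = uextend.i64 v1
    return v2
}

; New form: the access width is carried by the opcode and the result stays i64,
; matching the "equivalent to ``load.i16`` followed by ``uextend``" wording above.
function %new_uload16(i64) -> i64 {
block0(v0: i64):
    v1 = atomic_uload16.i64 v0
    return v1
}

; Store direction: the low 16 bits of v0 are written to [v1], replacing an
; explicit ireduce followed by atomic_store.
function %new_store16(i64, i64) {
block0(v0: i64, v1: i64):
    atomic_store16.i64 v0, v1
    return
}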