diff --git a/cranelift/codegen/meta/src/isa/arm64/mod.rs b/cranelift/codegen/meta/src/isa/arm64/mod.rs index 5d8bc76fc4..cbc21347e9 100644 --- a/cranelift/codegen/meta/src/isa/arm64/mod.rs +++ b/cranelift/codegen/meta/src/isa/arm64/mod.rs @@ -8,7 +8,10 @@ use crate::cdsl::settings::{SettingGroup, SettingGroupBuilder}; use crate::shared::Definitions as SharedDefinitions; fn define_settings(_shared: &SettingGroup) -> SettingGroup { - let setting = SettingGroupBuilder::new("arm64"); + let mut setting = SettingGroupBuilder::new("arm64"); + let has_lse = setting.add_bool("has_lse", "Large System Extensions", false); + + setting.add_predicate("use_lse", predicate!(has_lse)); setting.build() } diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs index aaa76a659c..aa708a8524 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs @@ -462,6 +462,16 @@ fn enc_stxr(ty: Type, rs: Writable, rt: Reg, rn: Reg) -> u32 { | machreg_to_gpr(rt) } +fn enc_cas(size: u32, rs: Writable, rt: Reg, rn: Reg) -> u32 { + debug_assert_eq!(size & 0b11, size); + + 0b00_0010001_1_1_00000_1_11111_00000_00000 + | size << 30 + | machreg_to_gpr(rs.to_reg()) << 16 + | machreg_to_gpr(rn) << 5 + | machreg_to_gpr(rt) +} + fn enc_asimd_mod_imm(rd: Writable, q_op: u32, cmode: u32, imm: u8) -> u32 { let abc = (imm >> 5) as u32; let defgh = (imm & 0b11111) as u32; @@ -1164,7 +1174,18 @@ impl MachInstEmit for Inst { sink.put4(enc_dmb_ish()); // dmb ish } - &Inst::AtomicCAS { ty } => { + &Inst::AtomicCAS { rs, rt, rn, ty } => { + let size = match ty { + I8 => 0b00, + I16 => 0b01, + I32 => 0b10, + I64 => 0b11, + _ => panic!("Unsupported type: {}", ty), + }; + + sink.put4(enc_cas(size, rs, rt, rn)); + } + &Inst::AtomicCASLoop { ty } => { /* Emit this: dmb ish again: diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs index 63232d58a4..55e25de5d8 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs @@ -5235,9 +5235,48 @@ fn test_aarch64_binemit() { "BF3B03D53B7F5F88FC031AAA3C7F1888B8FFFFB5BF3B03D5", "atomically { 32_bits_at_[x25]) Xchg= x26 ; x27 = old_value_at_[x25]; x24,x28 = trash }", )); - insns.push(( Inst::AtomicCAS { + rs: writable_xreg(28), + rt: xreg(20), + rn: xreg(10), + ty: I8, + }, + "54FDFC08", + "casalb w28, w20, [x10]", + )); + insns.push(( + Inst::AtomicCAS { + rs: writable_xreg(2), + rt: xreg(19), + rn: xreg(23), + ty: I16, + }, + "F3FEE248", + "casalh w2, w19, [x23]", + )); + insns.push(( + Inst::AtomicCAS { + rs: writable_xreg(0), + rt: zero_reg(), + rn: stack_reg(), + ty: I32, + }, + "FFFFE088", + "casal w0, wzr, [sp]", + )); + insns.push(( + Inst::AtomicCAS { + rs: writable_xreg(7), + rt: xreg(15), + rn: xreg(27), + ty: I64, + }, + "6FFFE7C8", + "casal x7, x15, [x27]", + )); + insns.push(( + Inst::AtomicCASLoop { ty: I8, }, "BF3B03D53B7F5F08581F40927F0318EB610000543C7F180878FFFFB5BF3B03D5", @@ -5245,7 +5284,7 @@ fn test_aarch64_binemit() { )); insns.push(( - Inst::AtomicCAS { + Inst::AtomicCASLoop { ty: I64, }, "BF3B03D53B7F5FC8F8031AAA7F0318EB610000543C7F18C878FFFFB5BF3B03D5", diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs index 42dc7c203a..03e5c6f47b 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs @@ -696,19 +696,26 @@ pub enum Inst { op: 
inst_common::AtomicRmwOp, }, + /// An atomic compare-and-swap operation. This instruction is sequentially consistent. + AtomicCAS { + rs: Writable, + rt: Reg, + rn: Reg, + ty: Type, + }, + /// Similar to AtomicRMW, a compare-and-swap operation implemented using a load-linked - /// store-conditional loop. (Although we could possibly implement it more directly using - /// CAS insns that are available in some revisions of AArch64 above 8.0). The sequence is - /// both preceded and followed by a fence which is at least as comprehensive as that of the - /// `Fence` instruction below. This instruction is sequentially consistent. Note that the - /// operand conventions, although very similar to AtomicRMW, are different: + /// store-conditional loop. The sequence is both preceded and followed by a fence which is + /// at least as comprehensive as that of the `Fence` instruction below. This instruction + /// is sequentially consistent. Note that the operand conventions, although very similar + /// to AtomicRMW, are different: /// /// x25 (rd) address /// x26 (rd) expected value /// x28 (rd) replacement value /// x27 (wr) old value /// x24 (wr) scratch reg; value afterwards has no meaning - AtomicCAS { + AtomicCASLoop { ty: Type, // I8, I16, I32 or I64 }, @@ -1755,7 +1762,12 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) { collector.add_def(writable_xreg(27)); collector.add_def(writable_xreg(28)); } - &Inst::AtomicCAS { .. } => { + &Inst::AtomicCAS { rs, rt, rn, .. } => { + collector.add_mod(rs); + collector.add_use(rt); + collector.add_use(rn); + } + &Inst::AtomicCASLoop { .. } => { collector.add_use(xreg(25)); collector.add_use(xreg(26)); collector.add_use(xreg(28)); @@ -2330,7 +2342,17 @@ fn aarch64_map_regs(inst: &mut Inst, mapper: &RUM) { &mut Inst::AtomicRMW { .. } => { // There are no vregs to map in this insn. } - &mut Inst::AtomicCAS { .. } => { + &mut Inst::AtomicCAS { + ref mut rs, + ref mut rt, + ref mut rn, + .. + } => { + map_mod(mapper, rs); + map_use(mapper, rt); + map_use(mapper, rn); + } + &mut Inst::AtomicCASLoop { .. } => { // There are no vregs to map in this insn. } &mut Inst::AtomicLoad { @@ -3302,7 +3324,21 @@ impl Inst { "atomically {{ {}_bits_at_[x25]) {:?}= x26 ; x27 = old_value_at_[x25]; x24,x28 = trash }}", ty.bits(), op) } - &Inst::AtomicCAS { ty, .. 
} => { + &Inst::AtomicCAS { rs, rt, rn, ty } => { + let op = match ty { + I8 => "casalb", + I16 => "casalh", + I32 | I64 => "casal", + _ => panic!("Unsupported type: {}", ty), + }; + let size = OperandSize::from_ty(ty); + let rs = show_ireg_sized(rs.to_reg(), mb_rru, size); + let rt = show_ireg_sized(rt, mb_rru, size); + let rn = rn.show_rru(mb_rru); + + format!("{} {}, {}, [{}]", op, rs, rt, rn) + } + &Inst::AtomicCASLoop { ty } => { format!( "atomically {{ compare-and-swap({}_bits_at_[x25], x26 -> x28), x27 = old_value_at_[x25]; x24 = trash }}", ty.bits()) diff --git a/cranelift/codegen/src/isa/aarch64/lower.rs b/cranelift/codegen/src/isa/aarch64/lower.rs index 0f37bb6123..4f5893f54b 100644 --- a/cranelift/codegen/src/isa/aarch64/lower.rs +++ b/cranelift/codegen/src/isa/aarch64/lower.rs @@ -1231,7 +1231,7 @@ impl LowerBackend for AArch64Backend { type MInst = Inst; fn lower>(&self, ctx: &mut C, ir_inst: IRInst) -> CodegenResult<()> { - lower_inst::lower_insn_to_regs(ctx, ir_inst, &self.flags) + lower_inst::lower_insn_to_regs(ctx, ir_inst, &self.flags, &self.isa_flags) } fn lower_branch_group>( diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs index 93c2385098..9a6b711cb2 100644 --- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs +++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs @@ -5,6 +5,7 @@ use crate::ir::condcodes::FloatCC; use crate::ir::types::*; use crate::ir::Inst as IRInst; use crate::ir::{InstructionData, Opcode, TrapCode}; +use crate::isa::aarch64::settings as aarch64_settings; use crate::machinst::lower::*; use crate::machinst::*; use crate::settings::Flags; @@ -26,6 +27,7 @@ pub(crate) fn lower_insn_to_regs>( ctx: &mut C, insn: IRInst, flags: &Flags, + isa_flags: &aarch64_settings::Flags, ) -> CodegenResult<()> { let op = ctx.data(insn).opcode(); let inputs = insn_inputs(ctx, insn); @@ -1183,37 +1185,48 @@ pub(crate) fn lower_insn_to_regs>( } Opcode::AtomicCas => { - // This is very similar to, but not identical to, the AtomicRmw case. Note - // that the AtomicCAS sequence does its own masking, so we don't need to worry - // about zero-extending narrow (I8/I16/I32) values here. let r_dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let mut r_addr = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); let mut r_expected = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); let mut r_replacement = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None); let ty_access = ty.unwrap(); assert!(is_valid_atomic_transaction_ty(ty_access)); - // Make sure that all three args are in virtual regs. See corresponding comment - // for `Opcode::AtomicRmw` above. - r_addr = ctx.ensure_in_vreg(r_addr, I64); - r_expected = ctx.ensure_in_vreg(r_expected, I64); - r_replacement = ctx.ensure_in_vreg(r_replacement, I64); - // Move the args to the preordained AtomicCAS input regs - ctx.emit(Inst::gen_move(Writable::from_reg(xreg(25)), r_addr, I64)); - ctx.emit(Inst::gen_move( - Writable::from_reg(xreg(26)), - r_expected, - I64, - )); - ctx.emit(Inst::gen_move( - Writable::from_reg(xreg(28)), - r_replacement, - I64, - )); - // Now the AtomicCAS itself, implemented in the normal way, with an LL-SC loop - ctx.emit(Inst::AtomicCAS { ty: ty_access }); - // And finally, copy the preordained AtomicCAS output reg to its destination. - ctx.emit(Inst::gen_move(r_dst, xreg(27), I64)); - // Also, x24 and x28 are trashed. `fn aarch64_get_regs` must mention that. 
+ + if isa_flags.use_lse() { + ctx.emit(Inst::gen_move(r_dst, r_expected, ty_access)); + ctx.emit(Inst::AtomicCAS { + rs: r_dst, + rt: r_replacement, + rn: r_addr, + ty: ty_access, + }); + } else { + // This is very similar to, but not identical to, the AtomicRmw case. Note + // that the AtomicCASLoop sequence does its own masking, so we don't need to worry + // about zero-extending narrow (I8/I16/I32) values here. + // Make sure that all three args are in virtual regs. See corresponding comment + // for `Opcode::AtomicRmw` above. + r_addr = ctx.ensure_in_vreg(r_addr, I64); + r_expected = ctx.ensure_in_vreg(r_expected, I64); + r_replacement = ctx.ensure_in_vreg(r_replacement, I64); + // Move the args to the preordained AtomicCASLoop input regs + ctx.emit(Inst::gen_move(Writable::from_reg(xreg(25)), r_addr, I64)); + ctx.emit(Inst::gen_move( + Writable::from_reg(xreg(26)), + r_expected, + I64, + )); + ctx.emit(Inst::gen_move( + Writable::from_reg(xreg(28)), + r_replacement, + I64, + )); + // Now the AtomicCASLoop itself, implemented in the normal way, with an LL-SC loop + ctx.emit(Inst::AtomicCASLoop { ty: ty_access }); + // And finally, copy the preordained AtomicCASLoop output reg to its destination. + ctx.emit(Inst::gen_move(r_dst, xreg(27), I64)); + // Also, x24 and x28 are trashed. `fn aarch64_get_regs` must mention that. + } } Opcode::AtomicLoad => { diff --git a/cranelift/codegen/src/isa/aarch64/mod.rs b/cranelift/codegen/src/isa/aarch64/mod.rs index cf6ef1fde4..42b47b645e 100644 --- a/cranelift/codegen/src/isa/aarch64/mod.rs +++ b/cranelift/codegen/src/isa/aarch64/mod.rs @@ -2,10 +2,11 @@ use crate::ir::condcodes::IntCC; use crate::ir::Function; +use crate::isa::aarch64::settings as aarch64_settings; use crate::isa::Builder as IsaBuilder; use crate::machinst::{compile, MachBackend, MachCompileResult, TargetIsaAdapter, VCode}; use crate::result::CodegenResult; -use crate::settings; +use crate::settings as shared_settings; use alloc::boxed::Box; use core::hash::{Hash, Hasher}; @@ -18,6 +19,7 @@ mod abi; pub(crate) mod inst; mod lower; mod lower_inst; +mod settings; use inst::create_reg_universe; @@ -26,17 +28,23 @@ use self::inst::EmitInfo; /// An AArch64 backend. pub struct AArch64Backend { triple: Triple, - flags: settings::Flags, + flags: shared_settings::Flags, + isa_flags: aarch64_settings::Flags, reg_universe: RealRegUniverse, } impl AArch64Backend { /// Create a new AArch64 backend with the given (shared) flags. 
- pub fn new_with_flags(triple: Triple, flags: settings::Flags) -> AArch64Backend { + pub fn new_with_flags( + triple: Triple, + flags: shared_settings::Flags, + isa_flags: aarch64_settings::Flags, + ) -> AArch64Backend { let reg_universe = create_reg_universe(&flags); AArch64Backend { triple, flags, + isa_flags, reg_universe, } } @@ -46,7 +54,7 @@ impl AArch64Backend { fn compile_vcode( &self, func: &Function, - flags: settings::Flags, + flags: shared_settings::Flags, ) -> CodegenResult> { let emit_info = EmitInfo::new(flags.clone()); let abi = Box::new(abi::AArch64ABICallee::new(func, flags)?); @@ -92,12 +100,13 @@ impl MachBackend for AArch64Backend { self.triple.clone() } - fn flags(&self) -> &settings::Flags { + fn flags(&self) -> &shared_settings::Flags { &self.flags } fn hash_all_flags(&self, mut hasher: &mut dyn Hasher) { self.flags.hash(&mut hasher); + self.isa_flags.hash(&mut hasher); } fn reg_universe(&self) -> &RealRegUniverse { @@ -155,9 +164,10 @@ pub fn isa_builder(triple: Triple) -> IsaBuilder { assert!(triple.architecture == Architecture::Aarch64(Aarch64Architecture::Aarch64)); IsaBuilder { triple, - setup: settings::builder(), - constructor: |triple, shared_flags, _| { - let backend = AArch64Backend::new_with_flags(triple, shared_flags); + setup: aarch64_settings::builder(), + constructor: |triple, shared_flags, builder| { + let isa_flags = aarch64_settings::Flags::new(&shared_flags, builder); + let backend = AArch64Backend::new_with_flags(triple, shared_flags, isa_flags); Box::new(TargetIsaAdapter::new(backend)) }, } @@ -192,11 +202,14 @@ mod test { let v1 = pos.ins().iadd(arg0, v0); pos.ins().return_(&[v1]); - let mut shared_flags = settings::builder(); - shared_flags.set("opt_level", "none").unwrap(); + let mut shared_flags_builder = settings::builder(); + shared_flags_builder.set("opt_level", "none").unwrap(); + let shared_flags = settings::Flags::new(shared_flags_builder); + let isa_flags = aarch64_settings::Flags::new(&shared_flags, aarch64_settings::builder()); let backend = AArch64Backend::new_with_flags( Triple::from_str("aarch64").unwrap(), - settings::Flags::new(shared_flags), + shared_flags, + isa_flags, ); let buffer = backend.compile_function(&mut func, false).unwrap().buffer; let code = &buffer.data[..]; @@ -246,11 +259,14 @@ mod test { let v3 = pos.ins().isub(v1, v0); pos.ins().return_(&[v3]); - let mut shared_flags = settings::builder(); - shared_flags.set("opt_level", "none").unwrap(); + let mut shared_flags_builder = settings::builder(); + shared_flags_builder.set("opt_level", "none").unwrap(); + let shared_flags = settings::Flags::new(shared_flags_builder); + let isa_flags = aarch64_settings::Flags::new(&shared_flags, aarch64_settings::builder()); let backend = AArch64Backend::new_with_flags( Triple::from_str("aarch64").unwrap(), - settings::Flags::new(shared_flags), + shared_flags, + isa_flags, ); let result = backend .compile_function(&mut func, /* want_disasm = */ false) diff --git a/cranelift/codegen/src/isa/aarch64/settings.rs b/cranelift/codegen/src/isa/aarch64/settings.rs new file mode 100644 index 0000000000..a9849c121b --- /dev/null +++ b/cranelift/codegen/src/isa/aarch64/settings.rs @@ -0,0 +1,9 @@ +//! AArch64 Settings. + +use crate::settings::{self, detail, Builder}; +use core::fmt; + +// Include code generated by `cranelift-codegen/meta/src/gen_settings.rs:`. This file contains a +// public `Flags` struct with an impl for all of the settings defined in +// `cranelift-codegen/meta/src/isa/arm64/settings.rs`. 
+include!(concat!(env!("OUT_DIR"), "/settings-arm64.rs"));
diff --git a/cranelift/native/src/lib.rs b/cranelift/native/src/lib.rs
index 43938bd97e..3be04bc5f1 100644
--- a/cranelift/native/src/lib.rs
+++ b/cranelift/native/src/lib.rs
@@ -105,6 +105,20 @@ pub fn builder_with_options(
         }
     }
 
+    // `stdsimd` is necessary for std::is_aarch64_feature_detected!().
+    #[cfg(all(target_arch = "aarch64", feature = "stdsimd"))]
+    {
+        use cranelift_codegen::settings::Configurable;
+
+        if !infer_native_flags {
+            return Ok(isa_builder);
+        }
+
+        if std::is_aarch64_feature_detected!("lse") {
+            isa_builder.enable("has_lse").unwrap();
+        }
+    }
+
     // squelch warnings about unused mut/variables on some platforms.
     drop(&mut isa_builder);
     drop(infer_native_flags);
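
The enc_cas helper added in emit.rs packs a fixed CASAL opcode pattern with the size field in bits 31-30, Rs in bits 20-16, Rn in bits 9-5 and Rt in bits 4-0, with both the acquire (L) and release (o0) bits set so the instruction is sequentially consistent. The following standalone sketch reproduces only that bit layout and checks it against one of the new emit tests (casal x7, x15, [x27] -> 6FFFE7C8); encode_casal and main are illustrative names, not part of the patch.

    // Standalone sketch of the CASAL bit layout used by enc_cas in this patch.
    fn encode_casal(size: u32, rs: u32, rt: u32, rn: u32) -> u32 {
        debug_assert_eq!(size & 0b11, size);
        // Fixed CAS pattern with L = 1 (acquire) and o0 = 1 (release), i.e. CASAL/CASALB/CASALH.
        0b00_0010001_1_1_00000_1_11111_00000_00000
            | size << 30 // 00 = byte, 01 = halfword, 10 = word, 11 = doubleword
            | rs << 16   // Rs: expected value; also receives the old value from memory
            | rn << 5    // Rn: base address register
            | rt         // Rt: replacement value
    }

    fn main() {
        // Matches the new emit test: casal x7, x15, [x27] encodes to bytes 6F FF E7 C8.
        let insn = encode_casal(0b11, 7, 15, 27);
        assert_eq!(insn.to_le_bytes(), [0x6F, 0xFF, 0xE7, 0xC8]);
        println!("casal x7, x15, [x27] = {:08x}", insn);
    }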
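
Lowering picks the single casal form only when isa_flags.use_lse() holds, and cranelift-native (last hunk) turns has_lse on only when runtime detection finds the feature. For testing or cross-compilation the flag can also be set explicitly. A minimal sketch, assuming crate-internal module paths and mirroring the builder calls used by the updated tests in aarch64/mod.rs; build_lse_backend is an illustrative name, not part of the patch.

    use crate::isa::aarch64::{settings as aarch64_settings, AArch64Backend};
    use crate::settings::{self, Configurable};
    use std::str::FromStr;
    use target_lexicon::Triple;

    // Construct an AArch64 backend with LSE unconditionally enabled.
    // Module paths here are assumptions about crate-internal layout.
    fn build_lse_backend() -> AArch64Backend {
        let mut shared_flags_builder = settings::builder();
        shared_flags_builder.set("opt_level", "none").unwrap();
        let shared_flags = settings::Flags::new(shared_flags_builder);

        // Enable has_lse by hand instead of relying on cranelift-native's detection.
        let mut isa_flags_builder = aarch64_settings::builder();
        isa_flags_builder.enable("has_lse").unwrap();
        let isa_flags = aarch64_settings::Flags::new(&shared_flags, isa_flags_builder);

        AArch64Backend::new_with_flags(
            Triple::from_str("aarch64").unwrap(),
            shared_flags,
            isa_flags,
        )
    }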