Cranelift AArch64: Add initial support for the Armv8.1 atomics

This commit enables Cranelift's AArch64 backend to generate code
for instruction set extensions (previously only the base Armv8-A
architecture was supported). It also makes it possible to detect
the extensions supported by the host when JIT compiling. The new
functionality is applied to the IR instruction `AtomicCas`, which
can now be lowered to a single Armv8.1 CAS instruction when the
Large System Extensions (LSE) are available.

Copyright (c) 2021, Arm Limited.
Author: Anton Kirilov
Date:   2021-03-02 18:35:40 +00:00
parent df6812b855
commit 07c27039b1
9 changed files with 204 additions and 53 deletions
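
The host-side feature detection mentioned in the commit message is not part
of the excerpt below, which only covers the lowering of `AtomicCas`. As a
rough illustration of what that detection amounts to, the sketch below uses
the standard library's `is_aarch64_feature_detected!` macro to query the host
CPU for LSE support; the function name `host_has_lse` and the idea of feeding
the result into the backend's `use_lse` flag are illustrative assumptions,
not code from this commit.

    // Illustrative sketch only (not from this commit): check whether the host
    // CPU implements the Armv8.1 Large System Extensions, e.g. before enabling
    // the backend's LSE flag when JIT compiling.
    #[cfg(target_arch = "aarch64")]
    fn host_has_lse() -> bool {
        std::arch::is_aarch64_feature_detected!("lse")
    }

    #[cfg(not(target_arch = "aarch64"))]
    fn host_has_lse() -> bool {
        false // runtime detection of AArch64 features only makes sense on an AArch64 host
    }

    fn main() {
        println!("Host supports LSE atomics: {}", host_has_lse());
    }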

@@ -5,6 +5,7 @@ use crate::ir::condcodes::FloatCC;
 use crate::ir::types::*;
 use crate::ir::Inst as IRInst;
 use crate::ir::{InstructionData, Opcode, TrapCode};
+use crate::isa::aarch64::settings as aarch64_settings;
 use crate::machinst::lower::*;
 use crate::machinst::*;
 use crate::settings::Flags;
@@ -26,6 +27,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
     ctx: &mut C,
     insn: IRInst,
     flags: &Flags,
+    isa_flags: &aarch64_settings::Flags,
 ) -> CodegenResult<()> {
     let op = ctx.data(insn).opcode();
     let inputs = insn_inputs(ctx, insn);
@@ -1183,37 +1185,48 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
         }
         Opcode::AtomicCas => {
-            // This is very similar to, but not identical to, the AtomicRmw case. Note
-            // that the AtomicCAS sequence does its own masking, so we don't need to worry
-            // about zero-extending narrow (I8/I16/I32) values here.
             let r_dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
             let mut r_addr = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
             let mut r_expected = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
             let mut r_replacement = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None);
             let ty_access = ty.unwrap();
             assert!(is_valid_atomic_transaction_ty(ty_access));
-            // Make sure that all three args are in virtual regs. See corresponding comment
-            // for `Opcode::AtomicRmw` above.
-            r_addr = ctx.ensure_in_vreg(r_addr, I64);
-            r_expected = ctx.ensure_in_vreg(r_expected, I64);
-            r_replacement = ctx.ensure_in_vreg(r_replacement, I64);
-            // Move the args to the preordained AtomicCAS input regs
-            ctx.emit(Inst::gen_move(Writable::from_reg(xreg(25)), r_addr, I64));
-            ctx.emit(Inst::gen_move(
-                Writable::from_reg(xreg(26)),
-                r_expected,
-                I64,
-            ));
-            ctx.emit(Inst::gen_move(
-                Writable::from_reg(xreg(28)),
-                r_replacement,
-                I64,
-            ));
-            // Now the AtomicCAS itself, implemented in the normal way, with an LL-SC loop
-            ctx.emit(Inst::AtomicCAS { ty: ty_access });
-            // And finally, copy the preordained AtomicCAS output reg to its destination.
-            ctx.emit(Inst::gen_move(r_dst, xreg(27), I64));
-            // Also, x24 and x28 are trashed. `fn aarch64_get_regs` must mention that.
+
+            if isa_flags.use_lse() {
+                ctx.emit(Inst::gen_move(r_dst, r_expected, ty_access));
+                ctx.emit(Inst::AtomicCAS {
+                    rs: r_dst,
+                    rt: r_replacement,
+                    rn: r_addr,
+                    ty: ty_access,
+                });
+            } else {
+                // This is very similar to, but not identical to, the AtomicRmw case. Note
+                // that the AtomicCASLoop sequence does its own masking, so we don't need to worry
+                // about zero-extending narrow (I8/I16/I32) values here.
+                // Make sure that all three args are in virtual regs. See corresponding comment
+                // for `Opcode::AtomicRmw` above.
+                r_addr = ctx.ensure_in_vreg(r_addr, I64);
+                r_expected = ctx.ensure_in_vreg(r_expected, I64);
+                r_replacement = ctx.ensure_in_vreg(r_replacement, I64);
+                // Move the args to the preordained AtomicCASLoop input regs
+                ctx.emit(Inst::gen_move(Writable::from_reg(xreg(25)), r_addr, I64));
+                ctx.emit(Inst::gen_move(
+                    Writable::from_reg(xreg(26)),
+                    r_expected,
+                    I64,
+                ));
+                ctx.emit(Inst::gen_move(
+                    Writable::from_reg(xreg(28)),
+                    r_replacement,
+                    I64,
+                ));
+                // Now the AtomicCASLoop itself, implemented in the normal way, with an LL-SC loop
+                ctx.emit(Inst::AtomicCASLoop { ty: ty_access });
+                // And finally, copy the preordained AtomicCASLoop output reg to its destination.
+                ctx.emit(Inst::gen_move(r_dst, xreg(27), I64));
+                // Also, x24 and x28 are trashed. `fn aarch64_get_regs` must mention that.
+            }
         }
         Opcode::AtomicLoad => {
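
For reference, `AtomicCas` has ordinary compare-and-swap semantics: load the
value at the given address, compare it with the expected value, store the
replacement only if they match, and yield the loaded value either way. With
the Armv8.1 atomics the whole operation maps onto a single CAS instruction,
which is why the LSE path first copies `r_expected` into `r_dst`: the
architectural CAS uses one register both to supply the comparison value and
to receive the value read from memory. The fallback path instead expands into
a load-exclusive/store-exclusive loop via the `AtomicCASLoop` pseudo-
instruction. A minimal sketch of that contract in plain Rust (not Cranelift
code; the sequentially consistent ordering is chosen only for illustration):

    use std::sync::atomic::{AtomicU64, Ordering};

    // Compare-and-swap roughly as `AtomicCas` behaves at the IR level: return the
    // value that was actually loaded from memory, whether or not the swap happened.
    fn cas(cell: &AtomicU64, expected: u64, replacement: u64) -> u64 {
        match cell.compare_exchange(expected, replacement, Ordering::SeqCst, Ordering::SeqCst) {
            Ok(previous) => previous,  // values matched; `replacement` was stored
            Err(previous) => previous, // no match; memory is unchanged
        }
    }

    fn main() {
        let cell = AtomicU64::new(1);
        assert_eq!(cas(&cell, 1, 2), 1); // swap succeeds
        assert_eq!(cas(&cell, 1, 3), 2); // expected 1 but found 2, so no swap
        assert_eq!(cell.load(Ordering::SeqCst), 2);
    }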