diff --git a/cranelift/codegen/meta/src/isa/arm64/mod.rs b/cranelift/codegen/meta/src/isa/arm64/mod.rs index 5d8bc76fc4..cbc21347e9 100644 --- a/cranelift/codegen/meta/src/isa/arm64/mod.rs +++ b/cranelift/codegen/meta/src/isa/arm64/mod.rs @@ -8,7 +8,10 @@ use crate::cdsl::settings::{SettingGroup, SettingGroupBuilder}; use crate::shared::Definitions as SharedDefinitions; fn define_settings(_shared: &SettingGroup) -> SettingGroup { - let setting = SettingGroupBuilder::new("arm64"); + let mut setting = SettingGroupBuilder::new("arm64"); + let has_lse = setting.add_bool("has_lse", "Large System Extensions", false); + + setting.add_predicate("use_lse", predicate!(has_lse)); setting.build() } diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs index aaa76a659c..aa708a8524 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs @@ -462,6 +462,16 @@ fn enc_stxr(ty: Type, rs: Writable, rt: Reg, rn: Reg) -> u32 { | machreg_to_gpr(rt) } +fn enc_cas(size: u32, rs: Writable, rt: Reg, rn: Reg) -> u32 { + debug_assert_eq!(size & 0b11, size); + + 0b00_0010001_1_1_00000_1_11111_00000_00000 + | size << 30 + | machreg_to_gpr(rs.to_reg()) << 16 + | machreg_to_gpr(rn) << 5 + | machreg_to_gpr(rt) +} + fn enc_asimd_mod_imm(rd: Writable, q_op: u32, cmode: u32, imm: u8) -> u32 { let abc = (imm >> 5) as u32; let defgh = (imm & 0b11111) as u32; @@ -1164,7 +1174,18 @@ impl MachInstEmit for Inst { sink.put4(enc_dmb_ish()); // dmb ish } - &Inst::AtomicCAS { ty } => { + &Inst::AtomicCAS { rs, rt, rn, ty } => { + let size = match ty { + I8 => 0b00, + I16 => 0b01, + I32 => 0b10, + I64 => 0b11, + _ => panic!("Unsupported type: {}", ty), + }; + + sink.put4(enc_cas(size, rs, rt, rn)); + } + &Inst::AtomicCASLoop { ty } => { /* Emit this: dmb ish again: diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs index 63232d58a4..55e25de5d8 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs @@ -5235,9 +5235,48 @@ fn test_aarch64_binemit() { "BF3B03D53B7F5F88FC031AAA3C7F1888B8FFFFB5BF3B03D5", "atomically { 32_bits_at_[x25]) Xchg= x26 ; x27 = old_value_at_[x25]; x24,x28 = trash }", )); - insns.push(( Inst::AtomicCAS { + rs: writable_xreg(28), + rt: xreg(20), + rn: xreg(10), + ty: I8, + }, + "54FDFC08", + "casalb w28, w20, [x10]", + )); + insns.push(( + Inst::AtomicCAS { + rs: writable_xreg(2), + rt: xreg(19), + rn: xreg(23), + ty: I16, + }, + "F3FEE248", + "casalh w2, w19, [x23]", + )); + insns.push(( + Inst::AtomicCAS { + rs: writable_xreg(0), + rt: zero_reg(), + rn: stack_reg(), + ty: I32, + }, + "FFFFE088", + "casal w0, wzr, [sp]", + )); + insns.push(( + Inst::AtomicCAS { + rs: writable_xreg(7), + rt: xreg(15), + rn: xreg(27), + ty: I64, + }, + "6FFFE7C8", + "casal x7, x15, [x27]", + )); + insns.push(( + Inst::AtomicCASLoop { ty: I8, }, "BF3B03D53B7F5F08581F40927F0318EB610000543C7F180878FFFFB5BF3B03D5", @@ -5245,7 +5284,7 @@ fn test_aarch64_binemit() { )); insns.push(( - Inst::AtomicCAS { + Inst::AtomicCASLoop { ty: I64, }, "BF3B03D53B7F5FC8F8031AAA7F0318EB610000543C7F18C878FFFFB5BF3B03D5", diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs index 42dc7c203a..03e5c6f47b 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs @@ -696,19 +696,26 @@ pub enum Inst { op: 
inst_common::AtomicRmwOp, }, + /// An atomic compare-and-swap operation. This instruction is sequentially consistent. + AtomicCAS { + rs: Writable, + rt: Reg, + rn: Reg, + ty: Type, + }, + /// Similar to AtomicRMW, a compare-and-swap operation implemented using a load-linked - /// store-conditional loop. (Although we could possibly implement it more directly using - /// CAS insns that are available in some revisions of AArch64 above 8.0). The sequence is - /// both preceded and followed by a fence which is at least as comprehensive as that of the - /// `Fence` instruction below. This instruction is sequentially consistent. Note that the - /// operand conventions, although very similar to AtomicRMW, are different: + /// store-conditional loop. The sequence is both preceded and followed by a fence which is + /// at least as comprehensive as that of the `Fence` instruction below. This instruction + /// is sequentially consistent. Note that the operand conventions, although very similar + /// to AtomicRMW, are different: /// /// x25 (rd) address /// x26 (rd) expected value /// x28 (rd) replacement value /// x27 (wr) old value /// x24 (wr) scratch reg; value afterwards has no meaning - AtomicCAS { + AtomicCASLoop { ty: Type, // I8, I16, I32 or I64 }, @@ -1755,7 +1762,12 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) { collector.add_def(writable_xreg(27)); collector.add_def(writable_xreg(28)); } - &Inst::AtomicCAS { .. } => { + &Inst::AtomicCAS { rs, rt, rn, .. } => { + collector.add_mod(rs); + collector.add_use(rt); + collector.add_use(rn); + } + &Inst::AtomicCASLoop { .. } => { collector.add_use(xreg(25)); collector.add_use(xreg(26)); collector.add_use(xreg(28)); @@ -2330,7 +2342,17 @@ fn aarch64_map_regs(inst: &mut Inst, mapper: &RUM) { &mut Inst::AtomicRMW { .. } => { // There are no vregs to map in this insn. } - &mut Inst::AtomicCAS { .. } => { + &mut Inst::AtomicCAS { + ref mut rs, + ref mut rt, + ref mut rn, + .. + } => { + map_mod(mapper, rs); + map_use(mapper, rt); + map_use(mapper, rn); + } + &mut Inst::AtomicCASLoop { .. } => { // There are no vregs to map in this insn. } &mut Inst::AtomicLoad { @@ -3302,7 +3324,21 @@ impl Inst { "atomically {{ {}_bits_at_[x25]) {:?}= x26 ; x27 = old_value_at_[x25]; x24,x28 = trash }}", ty.bits(), op) } - &Inst::AtomicCAS { ty, .. 
} => { + &Inst::AtomicCAS { rs, rt, rn, ty } => { + let op = match ty { + I8 => "casalb", + I16 => "casalh", + I32 | I64 => "casal", + _ => panic!("Unsupported type: {}", ty), + }; + let size = OperandSize::from_ty(ty); + let rs = show_ireg_sized(rs.to_reg(), mb_rru, size); + let rt = show_ireg_sized(rt, mb_rru, size); + let rn = rn.show_rru(mb_rru); + + format!("{} {}, {}, [{}]", op, rs, rt, rn) + } + &Inst::AtomicCASLoop { ty } => { format!( "atomically {{ compare-and-swap({}_bits_at_[x25], x26 -> x28), x27 = old_value_at_[x25]; x24 = trash }}", ty.bits()) diff --git a/cranelift/codegen/src/isa/aarch64/lower.rs b/cranelift/codegen/src/isa/aarch64/lower.rs index 0f37bb6123..4f5893f54b 100644 --- a/cranelift/codegen/src/isa/aarch64/lower.rs +++ b/cranelift/codegen/src/isa/aarch64/lower.rs @@ -1231,7 +1231,7 @@ impl LowerBackend for AArch64Backend { type MInst = Inst; fn lower>(&self, ctx: &mut C, ir_inst: IRInst) -> CodegenResult<()> { - lower_inst::lower_insn_to_regs(ctx, ir_inst, &self.flags) + lower_inst::lower_insn_to_regs(ctx, ir_inst, &self.flags, &self.isa_flags) } fn lower_branch_group>( diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs index 93c2385098..9a6b711cb2 100644 --- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs +++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs @@ -5,6 +5,7 @@ use crate::ir::condcodes::FloatCC; use crate::ir::types::*; use crate::ir::Inst as IRInst; use crate::ir::{InstructionData, Opcode, TrapCode}; +use crate::isa::aarch64::settings as aarch64_settings; use crate::machinst::lower::*; use crate::machinst::*; use crate::settings::Flags; @@ -26,6 +27,7 @@ pub(crate) fn lower_insn_to_regs>( ctx: &mut C, insn: IRInst, flags: &Flags, + isa_flags: &aarch64_settings::Flags, ) -> CodegenResult<()> { let op = ctx.data(insn).opcode(); let inputs = insn_inputs(ctx, insn); @@ -1183,37 +1185,48 @@ pub(crate) fn lower_insn_to_regs>( } Opcode::AtomicCas => { - // This is very similar to, but not identical to, the AtomicRmw case. Note - // that the AtomicCAS sequence does its own masking, so we don't need to worry - // about zero-extending narrow (I8/I16/I32) values here. let r_dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let mut r_addr = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); let mut r_expected = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); let mut r_replacement = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None); let ty_access = ty.unwrap(); assert!(is_valid_atomic_transaction_ty(ty_access)); - // Make sure that all three args are in virtual regs. See corresponding comment - // for `Opcode::AtomicRmw` above. - r_addr = ctx.ensure_in_vreg(r_addr, I64); - r_expected = ctx.ensure_in_vreg(r_expected, I64); - r_replacement = ctx.ensure_in_vreg(r_replacement, I64); - // Move the args to the preordained AtomicCAS input regs - ctx.emit(Inst::gen_move(Writable::from_reg(xreg(25)), r_addr, I64)); - ctx.emit(Inst::gen_move( - Writable::from_reg(xreg(26)), - r_expected, - I64, - )); - ctx.emit(Inst::gen_move( - Writable::from_reg(xreg(28)), - r_replacement, - I64, - )); - // Now the AtomicCAS itself, implemented in the normal way, with an LL-SC loop - ctx.emit(Inst::AtomicCAS { ty: ty_access }); - // And finally, copy the preordained AtomicCAS output reg to its destination. - ctx.emit(Inst::gen_move(r_dst, xreg(27), I64)); - // Also, x24 and x28 are trashed. `fn aarch64_get_regs` must mention that. 
+ + if isa_flags.use_lse() { + ctx.emit(Inst::gen_move(r_dst, r_expected, ty_access)); + ctx.emit(Inst::AtomicCAS { + rs: r_dst, + rt: r_replacement, + rn: r_addr, + ty: ty_access, + }); + } else { + // This is very similar to, but not identical to, the AtomicRmw case. Note + // that the AtomicCASLoop sequence does its own masking, so we don't need to worry + // about zero-extending narrow (I8/I16/I32) values here. + // Make sure that all three args are in virtual regs. See corresponding comment + // for `Opcode::AtomicRmw` above. + r_addr = ctx.ensure_in_vreg(r_addr, I64); + r_expected = ctx.ensure_in_vreg(r_expected, I64); + r_replacement = ctx.ensure_in_vreg(r_replacement, I64); + // Move the args to the preordained AtomicCASLoop input regs + ctx.emit(Inst::gen_move(Writable::from_reg(xreg(25)), r_addr, I64)); + ctx.emit(Inst::gen_move( + Writable::from_reg(xreg(26)), + r_expected, + I64, + )); + ctx.emit(Inst::gen_move( + Writable::from_reg(xreg(28)), + r_replacement, + I64, + )); + // Now the AtomicCASLoop itself, implemented in the normal way, with an LL-SC loop + ctx.emit(Inst::AtomicCASLoop { ty: ty_access }); + // And finally, copy the preordained AtomicCASLoop output reg to its destination. + ctx.emit(Inst::gen_move(r_dst, xreg(27), I64)); + // Also, x24 and x28 are trashed. `fn aarch64_get_regs` must mention that. + } } Opcode::AtomicLoad => { diff --git a/cranelift/codegen/src/isa/aarch64/mod.rs b/cranelift/codegen/src/isa/aarch64/mod.rs index cf6ef1fde4..42b47b645e 100644 --- a/cranelift/codegen/src/isa/aarch64/mod.rs +++ b/cranelift/codegen/src/isa/aarch64/mod.rs @@ -2,10 +2,11 @@ use crate::ir::condcodes::IntCC; use crate::ir::Function; +use crate::isa::aarch64::settings as aarch64_settings; use crate::isa::Builder as IsaBuilder; use crate::machinst::{compile, MachBackend, MachCompileResult, TargetIsaAdapter, VCode}; use crate::result::CodegenResult; -use crate::settings; +use crate::settings as shared_settings; use alloc::boxed::Box; use core::hash::{Hash, Hasher}; @@ -18,6 +19,7 @@ mod abi; pub(crate) mod inst; mod lower; mod lower_inst; +mod settings; use inst::create_reg_universe; @@ -26,17 +28,23 @@ use self::inst::EmitInfo; /// An AArch64 backend. pub struct AArch64Backend { triple: Triple, - flags: settings::Flags, + flags: shared_settings::Flags, + isa_flags: aarch64_settings::Flags, reg_universe: RealRegUniverse, } impl AArch64Backend { /// Create a new AArch64 backend with the given (shared) flags. 
- pub fn new_with_flags(triple: Triple, flags: settings::Flags) -> AArch64Backend { + pub fn new_with_flags( + triple: Triple, + flags: shared_settings::Flags, + isa_flags: aarch64_settings::Flags, + ) -> AArch64Backend { let reg_universe = create_reg_universe(&flags); AArch64Backend { triple, flags, + isa_flags, reg_universe, } } @@ -46,7 +54,7 @@ impl AArch64Backend { fn compile_vcode( &self, func: &Function, - flags: settings::Flags, + flags: shared_settings::Flags, ) -> CodegenResult> { let emit_info = EmitInfo::new(flags.clone()); let abi = Box::new(abi::AArch64ABICallee::new(func, flags)?); @@ -92,12 +100,13 @@ impl MachBackend for AArch64Backend { self.triple.clone() } - fn flags(&self) -> &settings::Flags { + fn flags(&self) -> &shared_settings::Flags { &self.flags } fn hash_all_flags(&self, mut hasher: &mut dyn Hasher) { self.flags.hash(&mut hasher); + self.isa_flags.hash(&mut hasher); } fn reg_universe(&self) -> &RealRegUniverse { @@ -155,9 +164,10 @@ pub fn isa_builder(triple: Triple) -> IsaBuilder { assert!(triple.architecture == Architecture::Aarch64(Aarch64Architecture::Aarch64)); IsaBuilder { triple, - setup: settings::builder(), - constructor: |triple, shared_flags, _| { - let backend = AArch64Backend::new_with_flags(triple, shared_flags); + setup: aarch64_settings::builder(), + constructor: |triple, shared_flags, builder| { + let isa_flags = aarch64_settings::Flags::new(&shared_flags, builder); + let backend = AArch64Backend::new_with_flags(triple, shared_flags, isa_flags); Box::new(TargetIsaAdapter::new(backend)) }, } @@ -192,11 +202,14 @@ mod test { let v1 = pos.ins().iadd(arg0, v0); pos.ins().return_(&[v1]); - let mut shared_flags = settings::builder(); - shared_flags.set("opt_level", "none").unwrap(); + let mut shared_flags_builder = settings::builder(); + shared_flags_builder.set("opt_level", "none").unwrap(); + let shared_flags = settings::Flags::new(shared_flags_builder); + let isa_flags = aarch64_settings::Flags::new(&shared_flags, aarch64_settings::builder()); let backend = AArch64Backend::new_with_flags( Triple::from_str("aarch64").unwrap(), - settings::Flags::new(shared_flags), + shared_flags, + isa_flags, ); let buffer = backend.compile_function(&mut func, false).unwrap().buffer; let code = &buffer.data[..]; @@ -246,11 +259,14 @@ mod test { let v3 = pos.ins().isub(v1, v0); pos.ins().return_(&[v3]); - let mut shared_flags = settings::builder(); - shared_flags.set("opt_level", "none").unwrap(); + let mut shared_flags_builder = settings::builder(); + shared_flags_builder.set("opt_level", "none").unwrap(); + let shared_flags = settings::Flags::new(shared_flags_builder); + let isa_flags = aarch64_settings::Flags::new(&shared_flags, aarch64_settings::builder()); let backend = AArch64Backend::new_with_flags( Triple::from_str("aarch64").unwrap(), - settings::Flags::new(shared_flags), + shared_flags, + isa_flags, ); let result = backend .compile_function(&mut func, /* want_disasm = */ false) diff --git a/cranelift/codegen/src/isa/aarch64/settings.rs b/cranelift/codegen/src/isa/aarch64/settings.rs new file mode 100644 index 0000000000..a9849c121b --- /dev/null +++ b/cranelift/codegen/src/isa/aarch64/settings.rs @@ -0,0 +1,9 @@ +//! AArch64 Settings. + +use crate::settings::{self, detail, Builder}; +use core::fmt; + +// Include code generated by `cranelift-codegen/meta/src/gen_settings.rs:`. This file contains a +// public `Flags` struct with an impl for all of the settings defined in +// `cranelift-codegen/meta/src/isa/arm64/settings.rs`. 
+include!(concat!(env!("OUT_DIR"), "/settings-arm64.rs"));
diff --git a/cranelift/native/src/lib.rs b/cranelift/native/src/lib.rs
index 43938bd97e..3be04bc5f1 100644
--- a/cranelift/native/src/lib.rs
+++ b/cranelift/native/src/lib.rs
@@ -105,6 +105,20 @@ pub fn builder_with_options(
         }
     }
 
+    // `stdsimd` is necessary for std::is_aarch64_feature_detected!().
+    #[cfg(all(target_arch = "aarch64", feature = "stdsimd"))]
+    {
+        use cranelift_codegen::settings::Configurable;
+
+        if !infer_native_flags {
+            return Ok(isa_builder);
+        }
+
+        if std::is_aarch64_feature_detected!("lse") {
+            isa_builder.enable("has_lse").unwrap();
+        }
+    }
+
     // squelch warnings about unused mut/variables on some platforms.
     drop(&mut isa_builder);
     drop(infer_native_flags);
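
The enc_cas helper added in emit.rs packs a fixed CASAL opcode pattern with the size field in bits 31-30, Rs in bits 20-16, Rn in bits 9-5 and Rt in bits 4-0, with both the acquire (L) and release (o0) bits set so the instruction is sequentially consistent. The following standalone sketch reproduces only that bit layout and checks it against one of the new emit tests (casal x7, x15, [x27] -> 6FFFE7C8); encode_casal and main are illustrative names, not part of the patch.

    // Standalone sketch of the CASAL bit layout used by enc_cas in this patch.
    fn encode_casal(size: u32, rs: u32, rt: u32, rn: u32) -> u32 {
        debug_assert_eq!(size & 0b11, size);
        // Fixed CAS pattern with L = 1 (acquire) and o0 = 1 (release), i.e. CASAL/CASALB/CASALH.
        0b00_0010001_1_1_00000_1_11111_00000_00000
            | size << 30 // 00 = byte, 01 = halfword, 10 = word, 11 = doubleword
            | rs << 16   // Rs: expected value; also receives the old value from memory
            | rn << 5    // Rn: base address register
            | rt         // Rt: replacement value
    }

    fn main() {
        // Matches the new emit test: casal x7, x15, [x27] encodes to bytes 6F FF E7 C8.
        let insn = encode_casal(0b11, 7, 15, 27);
        assert_eq!(insn.to_le_bytes(), [0x6F, 0xFF, 0xE7, 0xC8]);
        println!("casal x7, x15, [x27] = {:08x}", insn);
    }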
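
Lowering picks the single casal form only when isa_flags.use_lse() holds, and cranelift-native (last hunk) turns has_lse on only when runtime detection finds the feature. For testing or cross-compilation the flag can also be set explicitly. A minimal sketch, assuming crate-internal module paths and mirroring the builder calls used by the updated tests in aarch64/mod.rs; build_lse_backend is an illustrative name, not part of the patch.

    use crate::isa::aarch64::{settings as aarch64_settings, AArch64Backend};
    use crate::settings::{self, Configurable};
    use std::str::FromStr;
    use target_lexicon::Triple;

    // Construct an AArch64 backend with LSE unconditionally enabled.
    // Module paths here are assumptions about crate-internal layout.
    fn build_lse_backend() -> AArch64Backend {
        let mut shared_flags_builder = settings::builder();
        shared_flags_builder.set("opt_level", "none").unwrap();
        let shared_flags = settings::Flags::new(shared_flags_builder);

        // Enable has_lse by hand instead of relying on cranelift-native's detection.
        let mut isa_flags_builder = aarch64_settings::builder();
        isa_flags_builder.enable("has_lse").unwrap();
        let isa_flags = aarch64_settings::Flags::new(&shared_flags, isa_flags_builder);

        AArch64Backend::new_with_flags(
            Triple::from_str("aarch64").unwrap(),
            shared_flags,
            isa_flags,
        )
    }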