From 70cbc4ca7c1437208664feb0b2263ac664be9306 Mon Sep 17 00:00:00 2001
From: Joey Gouly
Date: Fri, 13 Nov 2020 16:17:25 +0000
Subject: [PATCH] arm64: Refactor Inst::Extend handling

This refactors the handling of Inst::Extend and simplifies the lowering
of Bextend and Bmask, which allows the use of SBFX instructions for
extensions from 1-bit booleans. Other extensions use aliases of BFM,
and the code was changed to reflect that rather than hard-coding bit
patterns. Also, ImmLogic is now implemented, so another hard-coded
instruction can be removed.

As part of looking at boolean handling, `normalize_bool_result` was
changed to `materialize_bool_result`, such that it can use either CSET
or CSETM. Using CSETM saves an instruction (previously CSET + SUB) for
booleans wider than one bit.

Copyright (c) 2020, Arm Limited.
---
 .../codegen/src/isa/aarch64/inst/emit.rs      | 136 +++++++++---------
 .../src/isa/aarch64/inst/emit_tests.rs        |  64 ++++++++-
 cranelift/codegen/src/isa/aarch64/inst/mod.rs | 116 ++++++++-------
 cranelift/codegen/src/isa/aarch64/lower.rs    |  20 +--
 .../codegen/src/isa/aarch64/lower_inst.rs     |  65 +++------
 .../filetests/isa/aarch64/bitops.clif         |  34 ++++-
 .../filetests/isa/aarch64/saturating-ops.clif |   4 +-
 .../isa/aarch64/uextend-sextend.clif          |   4 +-
 8 files changed, 256 insertions(+), 187 deletions(-)

diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
index 597c9ac592..15a3542b57 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
@@ -258,10 +258,6 @@ fn enc_ldst_vec(q: u32, size: u32, rn: Reg, rt: Writable<Reg>) -> u32 {
         | machreg_to_vec(rt.to_reg())
 }
 
-fn enc_extend(top22: u32, rd: Writable<Reg>, rn: Reg) -> u32 {
-    (top22 << 10) | (machreg_to_gpr(rn) << 5) | machreg_to_gpr(rd.to_reg())
-}
-
 fn enc_vec_rrr(top11: u32, rm: Reg, bit15_10: u32, rn: Reg, rd: Writable<Reg>) -> u32 {
     (top11 << 21)
         | (machreg_to_vec(rm) << 16)
@@ -313,6 +309,12 @@ fn enc_cset(rd: Writable<Reg>, cond: Cond) -> u32 {
         | (cond.invert().bits() << 12)
 }
 
+fn enc_csetm(rd: Writable<Reg>, cond: Cond) -> u32 {
+    0b110_11010100_11111_0000_00_11111_00000
+        | machreg_to_gpr(rd.to_reg())
+        | (cond.invert().bits() << 12)
+}
+
 fn enc_ccmp_imm(size: OperandSize, rn: Reg, imm: UImm5, nzcv: NZCV, cond: Cond) -> u32 {
     0b0_1_1_11010010_00000_0000_10_00000_0_0000
         | size.sf_bit() << 31
@@ -322,6 +324,29 @@ fn enc_ccmp_imm(size: OperandSize, rn: Reg, imm: UImm5, nzcv: NZCV, cond: Cond)
         | nzcv.bits()
 }
 
+fn enc_bfm(opc: u8, size: OperandSize, rd: Writable<Reg>, rn: Reg, immr: u8, imms: u8) -> u32 {
+    match size {
+        OperandSize::Size64 => {
+            debug_assert!(immr <= 63);
+            debug_assert!(imms <= 63);
+        }
+        OperandSize::Size32 => {
+            debug_assert!(immr <= 31);
+            debug_assert!(imms <= 31);
+        }
+    }
+    debug_assert_eq!(opc & 0b11, opc);
+    let n_bit = size.sf_bit();
+    0b0_00_100110_0_000000_000000_00000_00000
+        | size.sf_bit() << 31
+        | u32::from(opc) << 29
+        | n_bit << 22
+        | u32::from(immr) << 16
+        | u32::from(imms) << 10
+        | machreg_to_gpr(rn) << 5
+        | machreg_to_gpr(rd.to_reg())
+}
+
 fn enc_vecmov(is_16b: bool, rd: Writable<Reg>, rn: Reg) -> u32 {
     0b00001110_101_00000_00011_1_00000_00000
         | ((is_16b as u32) << 30)
@@ -1020,6 +1045,9 @@ impl MachInstEmit for Inst {
             &Inst::CSet { rd, cond } => {
                 sink.put4(enc_cset(rd, cond));
             }
+            &Inst::CSetm { rd, cond } => {
+                sink.put4(enc_csetm(rd, cond));
+            }
             &Inst::CCmpImm {
                 size,
                 rn,
@@ -1958,75 +1986,47 @@ impl MachInstEmit for Inst {
             &Inst::Extend {
                 rd,
                 rn,
-                signed,
-                from_bits,
+                signed: false,
+                from_bits: 1,
to_bits, - } if from_bits >= 8 => { - let top22 = match (signed, from_bits, to_bits) { - (false, 8, 32) => 0b010_100110_0_000000_000111, // UXTB (32) - (false, 16, 32) => 0b010_100110_0_000000_001111, // UXTH (32) - (true, 8, 32) => 0b000_100110_0_000000_000111, // SXTB (32) - (true, 16, 32) => 0b000_100110_0_000000_001111, // SXTH (32) - // The 64-bit unsigned variants are the same as the 32-bit ones, - // because writes to Wn zero out the top 32 bits of Xn - (false, 8, 64) => 0b010_100110_0_000000_000111, // UXTB (64) - (false, 16, 64) => 0b010_100110_0_000000_001111, // UXTH (64) - (true, 8, 64) => 0b100_100110_1_000000_000111, // SXTB (64) - (true, 16, 64) => 0b100_100110_1_000000_001111, // SXTH (64) - // 32-to-64: the unsigned case is a 'mov' (special-cased below). - (false, 32, 64) => 0, // MOV - (true, 32, 64) => 0b100_100110_1_000000_011111, // SXTW (64) - _ => panic!( - "Unsupported extend combination: signed = {}, from_bits = {}, to_bits = {}", - signed, from_bits, to_bits - ), - }; - if top22 != 0 { - sink.put4(enc_extend(top22, rd, rn)); - } else { - let mov = Inst::Mov32 { rd, rm: rn }; - - mov.emit(sink, emit_info, state); - } - } - &Inst::Extend { - rd, - rn, - signed, - from_bits, - to_bits, - } if from_bits == 1 && signed => { - assert!(to_bits <= 64); - // Reduce sign-extend-from-1-bit to: - // - and rd, rn, #1 - // - sub rd, zr, rd - - // We don't have ImmLogic yet, so we just hardcode this. FIXME. - sink.put4(0x92400000 | (machreg_to_gpr(rn) << 5) | machreg_to_gpr(rd.to_reg())); - let sub_inst = Inst::AluRRR { - alu_op: ALUOp::Sub64, - rd, - rn: zero_reg(), - rm: rd.to_reg(), - }; - sub_inst.emit(sink, emit_info, state); - } - &Inst::Extend { - rd, - rn, - signed, - from_bits, - to_bits, - } if from_bits == 1 && !signed => { + } => { assert!(to_bits <= 64); // Reduce zero-extend-from-1-bit to: // - and rd, rn, #1 - - // We don't have ImmLogic yet, so we just hardcode this. FIXME. - sink.put4(0x92400000 | (machreg_to_gpr(rn) << 5) | machreg_to_gpr(rd.to_reg())); + // Note: This is special cased as UBFX may take more cycles + // than AND on smaller cores. + let imml = ImmLogic::maybe_from_u64(1, I32).unwrap(); + Inst::AluRRImmLogic { + alu_op: ALUOp::And32, + rd, + rn, + imml, + } + .emit(sink, emit_info, state); } - &Inst::Extend { .. 
} => {
-                panic!("Unsupported extend variant");
+            &Inst::Extend {
+                rd,
+                rn,
+                signed: false,
+                from_bits: 32,
+                to_bits: 64,
+            } => {
+                let mov = Inst::Mov32 { rd, rm: rn };
+                mov.emit(sink, emit_info, state);
+            }
+            &Inst::Extend {
+                rd,
+                rn,
+                signed,
+                from_bits,
+                to_bits,
+            } => {
+                let (opc, size) = if signed {
+                    (0b00, OperandSize::from_bits(to_bits))
+                } else {
+                    (0b10, OperandSize::Size32)
+                };
+                sink.put4(enc_bfm(opc, size, rd, rn, 0, from_bits - 1));
             }
             &Inst::Jump { ref dest } => {
                 let off = sink.cur_offset();
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
index 74aac428ef..6f79a5c7d1 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
@@ -1784,6 +1784,22 @@ fn test_aarch64_binemit() {
         "EFB79F9A",
         "cset x15, ge",
     ));
+    insns.push((
+        Inst::CSetm {
+            rd: writable_xreg(0),
+            cond: Cond::Eq,
+        },
+        "E0139FDA",
+        "csetm x0, eq",
+    ));
+    insns.push((
+        Inst::CSetm {
+            rd: writable_xreg(16),
+            cond: Cond::Vs,
+        },
+        "F0739FDA",
+        "csetm x16, vs",
+    ));
     insns.push((
         Inst::CCmpImm {
             size: OperandSize::Size64,
@@ -3890,6 +3906,50 @@ fn test_aarch64_binemit() {
         "vcsel v5.16b, v10.16b, v19.16b, gt (if-then-else diamond)",
     ));
 
+    insns.push((
+        Inst::Extend {
+            rd: writable_xreg(3),
+            rn: xreg(5),
+            signed: false,
+            from_bits: 1,
+            to_bits: 32,
+        },
+        "A3000012",
+        "and w3, w5, #1",
+    ));
+    insns.push((
+        Inst::Extend {
+            rd: writable_xreg(3),
+            rn: xreg(5),
+            signed: false,
+            from_bits: 1,
+            to_bits: 64,
+        },
+        "A3000012",
+        "and w3, w5, #1",
+    ));
+    insns.push((
+        Inst::Extend {
+            rd: writable_xreg(10),
+            rn: xreg(21),
+            signed: true,
+            from_bits: 1,
+            to_bits: 32,
+        },
+        "AA020013",
+        "sbfx w10, w21, #0, #1",
+    ));
+    insns.push((
+        Inst::Extend {
+            rd: writable_xreg(1),
+            rn: xreg(2),
+            signed: true,
+            from_bits: 1,
+            to_bits: 64,
+        },
+        "41004093",
+        "sbfx x1, x2, #0, #1",
+    ));
     insns.push((
         Inst::Extend {
             rd: writable_xreg(1),
@@ -3943,7 +4003,7 @@ fn test_aarch64_binemit() {
             to_bits: 64,
         },
         "411C0053",
-        "uxtb x1, w2",
+        "uxtb w1, w2",
     ));
     insns.push((
         Inst::Extend {
@@ -3965,7 +4025,7 @@ fn test_aarch64_binemit() {
             to_bits: 64,
         },
         "413C0053",
-        "uxth x1, w2",
+        "uxth w1, w2",
     ));
     insns.push((
         Inst::Extend {
diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
index d09637298c..c24344a303 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
@@ -648,6 +648,12 @@ pub enum Inst {
         cond: Cond,
     },
 
+    /// A conditional-set-mask operation.
+    CSetm {
+        rd: Writable<Reg>,
+        cond: Cond,
+    },
+
     /// A conditional comparison with an immediate.
     CCmpImm {
         size: OperandSize,
@@ -1596,7 +1602,7 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
             collector.add_use(rn);
             collector.add_use(rm);
         }
-        &Inst::CSet { rd, .. } => {
+        &Inst::CSet { rd, .. } | &Inst::CSetm { rd, .. } => {
             collector.add_def(rd);
         }
         &Inst::CCmpImm { rn, .. } => {
@@ -2162,7 +2168,7 @@ fn aarch64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
             map_use(mapper, rn);
             map_use(mapper, rm);
         }
-        &mut Inst::CSet { ref mut rd, .. } => {
+        &mut Inst::CSet { ref mut rd, .. } | &mut Inst::CSetm { ref mut rd, .. } => {
             map_def(mapper, rd);
         }
         &mut Inst::CCmpImm { ref mut rn, ..
} => {
@@ -3108,6 +3114,11 @@ impl Inst {
                 let cond = cond.show_rru(mb_rru);
                 format!("cset {}, {}", rd, cond)
             }
+            &Inst::CSetm { rd, cond } => {
+                let rd = rd.to_reg().show_rru(mb_rru);
+                let cond = cond.show_rru(mb_rru);
+                format!("csetm {}, {}", rd, cond)
+            }
             &Inst::CCmpImm {
                 size,
                 rn,
@@ -3628,63 +3639,60 @@ impl Inst {
             &Inst::Extend {
                 rd,
                 rn,
-                signed,
-                from_bits,
-                to_bits,
-            } if from_bits >= 8 => {
-                // Is the destination a 32-bit register? Corresponds to whether
-                // extend-to width is <= 32 bits, *unless* we have an unsigned
-                // 32-to-64-bit extension, which is implemented with a "mov" to a
-                // 32-bit (W-reg) dest, because this zeroes the top 32 bits.
-                let dest_size = if !signed && from_bits == 32 && to_bits == 64 {
-                    OperandSize::Size32
-                } else {
-                    OperandSize::from_bits(to_bits)
-                };
-                let rd = show_ireg_sized(rd.to_reg(), mb_rru, dest_size);
-                let rn = show_ireg_sized(rn, mb_rru, OperandSize::from_bits(from_bits));
-                let op = match (signed, from_bits, to_bits) {
-                    (false, 8, 32) => "uxtb",
-                    (true, 8, 32) => "sxtb",
-                    (false, 16, 32) => "uxth",
-                    (true, 16, 32) => "sxth",
-                    (false, 8, 64) => "uxtb",
-                    (true, 8, 64) => "sxtb",
-                    (false, 16, 64) => "uxth",
-                    (true, 16, 64) => "sxth",
-                    (false, 32, 64) => "mov", // special case (see above).
-                    (true, 32, 64) => "sxtw",
-                    _ => panic!("Unsupported Extend case: {:?}", self),
-                };
-                format!("{} {}, {}", op, rd, rn)
-            }
-            &Inst::Extend {
-                rd,
-                rn,
-                signed,
-                from_bits,
-                to_bits,
-            } if from_bits == 1 && signed => {
-                let dest_size = OperandSize::from_bits(to_bits);
-                let zr = if dest_size.is32() { "wzr" } else { "xzr" };
-                let rd32 = show_ireg_sized(rd.to_reg(), mb_rru, OperandSize::Size32);
-                let rd = show_ireg_sized(rd.to_reg(), mb_rru, dest_size);
-                let rn = show_ireg_sized(rn, mb_rru, OperandSize::Size32);
-                format!("and {}, {}, #1 ; sub {}, {}, {}", rd32, rn, rd, zr, rd)
-            }
-            &Inst::Extend {
-                rd,
-                rn,
-                signed,
-                from_bits,
+                signed: false,
+                from_bits: 1,
                 ..
-            } if from_bits == 1 && !signed => {
+            } => {
                 let rd = show_ireg_sized(rd.to_reg(), mb_rru, OperandSize::Size32);
                 let rn = show_ireg_sized(rn, mb_rru, OperandSize::Size32);
                 format!("and {}, {}, #1", rd, rn)
             }
-            &Inst::Extend { .. } => {
-                panic!("Unsupported Extend case");
+            &Inst::Extend {
+                rd,
+                rn,
+                signed: false,
+                from_bits: 32,
+                to_bits: 64,
+            } => {
+                // A zero extension from 32 to 64 bits is implemented
+                // with a "mov" to a 32-bit (W-reg) dest, because this
+                // zeroes the top 32 bits.
+                let rd = show_ireg_sized(rd.to_reg(), mb_rru, OperandSize::Size32);
+                let rn = show_ireg_sized(rn, mb_rru, OperandSize::Size32);
+                format!("mov {}, {}", rd, rn)
+            }
+            &Inst::Extend {
+                rd,
+                rn,
+                signed,
+                from_bits,
+                to_bits,
+            } => {
+                assert!(from_bits <= to_bits);
+                let op = match (signed, from_bits) {
+                    (false, 8) => "uxtb",
+                    (true, 8) => "sxtb",
+                    (false, 16) => "uxth",
+                    (true, 16) => "sxth",
+                    (true, 32) => "sxtw",
+                    (true, _) => "sbfx",
+                    (false, _) => "ubfx",
+                };
+                if op == "sbfx" || op == "ubfx" {
+                    let dest_size = OperandSize::from_bits(to_bits);
+                    let rd = show_ireg_sized(rd.to_reg(), mb_rru, dest_size);
+                    let rn = show_ireg_sized(rn, mb_rru, dest_size);
+                    format!("{} {}, {}, #0, #{}", op, rd, rn, from_bits)
+                } else {
+                    let dest_size = if signed {
+                        OperandSize::from_bits(to_bits)
+                    } else {
+                        OperandSize::Size32
+                    };
+                    let rd = show_ireg_sized(rd.to_reg(), mb_rru, dest_size);
+                    let rn = show_ireg_sized(rn, mb_rru, OperandSize::from_bits(from_bits));
+                    format!("{} {}, {}", op, rd, rn)
+                }
             }
             &Inst::Call { ..
} => format!("bl 0"),
             &Inst::CallInd { ref info, .. } => {
diff --git a/cranelift/codegen/src/isa/aarch64/lower.rs b/cranelift/codegen/src/isa/aarch64/lower.rs
index 8c94aad5fb..45b3b4b832 100644
--- a/cranelift/codegen/src/isa/aarch64/lower.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower.rs
@@ -1152,21 +1152,21 @@ pub(crate) fn lower_fcmp_or_ffcmp_to_flags<C: LowerCtx<I = Inst>>(ctx: &mut C, i
     }
 }
 
-/// Convert a 0 / 1 result, such as from a conditional-set instruction, into a 0
-/// / -1 (all-ones) result as expected for bool operations.
-pub(crate) fn normalize_bool_result<C: LowerCtx<I = Inst>>(
+/// Materialize a boolean value into a register from the flags
+/// (e.g. as set by a comparison), producing a 0 / -1 (all-ones)
+/// result as expected for bool operations.
+pub(crate) fn materialize_bool_result<C: LowerCtx<I = Inst>>(
     ctx: &mut C,
     insn: IRInst,
     rd: Writable<Reg>,
+    cond: Cond,
 ) {
-    // A boolean is 0 / -1; if output width is > 1, negate.
+    // A boolean is 0 / -1; if the output width is > 1, use `csetm`,
+    // otherwise use `cset`.
     if ty_bits(ctx.output_ty(insn, 0)) > 1 {
-        ctx.emit(Inst::AluRRR {
-            alu_op: ALUOp::Sub64,
-            rd,
-            rn: zero_reg(),
-            rm: rd.to_reg(),
-        });
+        ctx.emit(Inst::CSetm { rd, cond });
+    } else {
+        ctx.emit(Inst::CSet { rd, cond });
     }
 }
 
diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
index 8113221ab5..35bbdc7ee8 100644
--- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -1521,8 +1521,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             let ifcmp_insn = maybe_input_insn(ctx, inputs[0], Opcode::Ifcmp).unwrap();
             lower_icmp_or_ifcmp_to_flags(ctx, ifcmp_insn, is_signed);
             let rd = get_output_reg(ctx, outputs[0]);
-            ctx.emit(Inst::CSet { rd, cond });
-            normalize_bool_result(ctx, insn, rd);
+            materialize_bool_result(ctx, insn, rd, cond);
         }
 
         Opcode::Trueff => {
@@ -1531,8 +1530,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             let ffcmp_insn = maybe_input_insn(ctx, inputs[0], Opcode::Ffcmp).unwrap();
             lower_fcmp_or_ffcmp_to_flags(ctx, ffcmp_insn);
             let rd = get_output_reg(ctx, outputs[0]);
-            ctx.emit(Inst::CSet { rd, cond });
-            normalize_bool_result(ctx, insn, rd);
+            materialize_bool_result(ctx, insn, rd, cond);
         }
 
         Opcode::IsNull | Opcode::IsInvalid => {
@@ -1555,8 +1553,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             };
             let const_value = ResultRSEImm12::Imm12(Imm12::maybe_from_u64(const_value).unwrap());
             ctx.emit(alu_inst_imm12(alu_op, writable_zero_reg(), rn, const_value));
-            ctx.emit(Inst::CSet { rd, cond: Cond::Eq });
-            normalize_bool_result(ctx, insn, rd);
+            materialize_bool_result(ctx, insn, rd, Cond::Eq);
         }
 
         Opcode::Copy => {
@@ -1581,11 +1578,6 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             // sign-extend the -1 to a -1 in the wider width.
             // - Bmask, because the resulting integer mask value must be
             // all-ones (-1) if the argument is true.
-            //
-            // For a sign-extension from a 1-bit value (Case 1 below), we need
-            // to do things a bit specially, because the ISA does not have a
-            // 1-to-N-bit sign extension instruction. For 8-bit or wider
-            // sources (Case 2 below), we do a sign extension normally.
 
             let from_ty = ctx.input_ty(insn, 0);
             let to_ty = ctx.output_ty(insn, 0);
@@ -1600,41 +1592,23 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
 
             if from_bits == to_bits {
                 // Nothing.
-            } else if from_bits == 1 {
-                assert!(to_bits >= 8);
-                // Case 1: 1-bit to N-bit extension: AND the LSB of source into
-                // dest, generating a value of 0 or 1, then negate to get
-                // 0x000... or 0xfff...
+            } else {
                 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                 let rd = get_output_reg(ctx, outputs[0]);
-                // AND Rdest, Rsource, #1
-                ctx.emit(Inst::AluRRImmLogic {
-                    alu_op: ALUOp::And64,
-                    rd,
-                    rn,
-                    imml: ImmLogic::maybe_from_u64(1, I64).unwrap(),
-                });
-                // SUB Rdest, XZR, Rdest (i.e., NEG Rdest)
-                ctx.emit(Inst::AluRRR {
-                    alu_op: ALUOp::Sub64,
-                    rd,
-                    rn: zero_reg(),
-                    rm: rd.to_reg(),
-                });
-            } else {
-                // Case 2: 8-or-more-bit to N-bit extension: just sign-extend. A
-                // `true` (all ones, or `-1`) will be extended to -1 with the
-                // larger width.
-                assert!(from_bits >= 8);
-                let narrow_mode = if to_bits == 64 {
-                    NarrowValueMode::SignExtend64
+                let to_bits = if to_bits == 64 {
+                    64
                 } else {
                     assert!(to_bits <= 32);
-                    NarrowValueMode::SignExtend32
+                    32
                 };
-                let rn = put_input_in_reg(ctx, inputs[0], narrow_mode);
-                let rd = get_output_reg(ctx, outputs[0]);
-                ctx.emit(Inst::gen_move(rd, rn, to_ty));
+                let from_bits = from_bits as u8;
+                ctx.emit(Inst::Extend {
+                    rd,
+                    rn,
+                    signed: true,
+                    from_bits,
+                    to_bits,
+                });
             }
         }
 
@@ -1745,8 +1719,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                 let alu_op = choose_32_64(ty, ALUOp::SubS32, ALUOp::SubS64);
                 let rm = put_input_in_rse_imm12(ctx, inputs[1], narrow_mode);
                 ctx.emit(alu_inst_imm12(alu_op, writable_zero_reg(), rn, rm));
-                ctx.emit(Inst::CSet { cond, rd });
-                normalize_bool_result(ctx, insn, rd);
+                materialize_bool_result(ctx, insn, rd, cond);
             } else {
                 let rm = put_input_in_reg(ctx, inputs[1], narrow_mode);
                 lower_vector_compare(ctx, rd, rn, rm, ty, cond)?;
@@ -1771,8 +1744,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                     }
                     _ => panic!("Bad float size"),
                 }
-                ctx.emit(Inst::CSet { cond, rd });
-                normalize_bool_result(ctx, insn, rd);
+                materialize_bool_result(ctx, insn, rd, cond);
             } else {
                 lower_vector_compare(ctx, rd, rn, rm, ty, cond)?;
             }
@@ -2105,8 +2077,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                 imm12: Imm12::zero(),
             });
 
-            ctx.emit(Inst::CSet { rd, cond: Cond::Ne });
-            normalize_bool_result(ctx, insn, rd);
+            materialize_bool_result(ctx, insn, rd, Cond::Ne);
         }
 
         Opcode::VhighBits => {
diff --git a/cranelift/filetests/filetests/isa/aarch64/bitops.clif b/cranelift/filetests/filetests/isa/aarch64/bitops.clif
index c7ddd04608..ab1c113104 100644
--- a/cranelift/filetests/filetests/isa/aarch64/bitops.clif
+++ b/cranelift/filetests/filetests/isa/aarch64/bitops.clif
@@ -281,7 +281,7 @@ block0(v0: i16):
 
 ; check: stp fp, lr, [sp, #-16]!
 ; nextln: mov fp, sp
-; nextln: uxth x0, w0
+; nextln: uxth w0, w0
 ; nextln: lsr w1, w0, #1
 ; nextln: and x1, x1, #6148914691236517205
 ; nextln: sub x1, x0, x1
@@ -307,7 +307,7 @@ block0(v0: i8):
 
 ; check: stp fp, lr, [sp, #-16]!
 ; nextln: mov fp, sp
-; nextln: uxtb x0, w0
+; nextln: uxtb w0, w0
 ; nextln: lsr w1, w0, #1
 ; nextln: and x1, x1, #6148914691236517205
 ; nextln: sub x1, x0, x1
@@ -324,3 +324,33 @@ block0(v0: i8):
 ; nextln: mov sp, fp
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
+
+function %bextend_b8() -> b32 {
+block0:
+    v1 = bconst.b8 true
+    v2 = bextend.b32 v1
+    return v2
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: movz x0, #255
+; nextln: sxtb w0, w0
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %bextend_b1() -> b32 {
+block0:
+    v1 = bconst.b1 true
+    v2 = bextend.b32 v1
+    return v2
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp +; nextln: movz x0, #1 +; nextln: sbfx w0, w0, #0, #1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret diff --git a/cranelift/filetests/filetests/isa/aarch64/saturating-ops.clif b/cranelift/filetests/filetests/isa/aarch64/saturating-ops.clif index 7116205dd5..e0bf7c5b3f 100644 --- a/cranelift/filetests/filetests/isa/aarch64/saturating-ops.clif +++ b/cranelift/filetests/filetests/isa/aarch64/saturating-ops.clif @@ -25,8 +25,8 @@ block0(v0: i8, v1: i8): ; check: stp fp, lr, [sp, #-16]! ; nextln: mov fp, sp -; nextln: uxtb x0, w0 -; nextln: uxtb x1, w1 +; nextln: uxtb w0, w0 +; nextln: uxtb w1, w1 ; nextln: fmov d0, x0 ; nextln: fmov d1, x1 ; nextln: uqadd d0, d0, d1 diff --git a/cranelift/filetests/filetests/isa/aarch64/uextend-sextend.clif b/cranelift/filetests/filetests/isa/aarch64/uextend-sextend.clif index 803f144844..8823351207 100644 --- a/cranelift/filetests/filetests/isa/aarch64/uextend-sextend.clif +++ b/cranelift/filetests/filetests/isa/aarch64/uextend-sextend.clif @@ -9,7 +9,7 @@ block0(v0: i8): ; check: stp fp, lr, [sp, #-16]! ; nextln: mov fp, sp -; nextln: uxtb x0, w0 +; nextln: uxtb w0, w0 ; nextln: mov sp, fp ; nextln: ldp fp, lr, [sp], #16 ; nextln: ret @@ -87,7 +87,7 @@ block0(v0: i16): ; check: stp fp, lr, [sp, #-16]! ; nextln: mov fp, sp -; nextln: uxth x0, w0 +; nextln: uxth w0, w0 ; nextln: mov sp, fp ; nextln: ldp fp, lr, [sp], #16 ; nextln: ret
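
Note on the semantics (not part of the patch itself): the new lowering leans
on three scalar identities -- `sbfx rd, rn, #0, #1` replicates bit 0 of the
source across the destination, `and rd, rn, #1` keeps only bit 0, and `csetm`
materializes an all-ones mask in one instruction where the old code needed
CSET followed by SUB. The standalone Rust sketch below (hypothetical helper
names, not Cranelift code) models those identities so the filetest
expectations above can be sanity-checked:

    // Models `sbfx rd, rn, #0, #1`: arithmetic-shift bit 0 up to the sign
    // position and back down, replicating it across all 64 bits.
    fn sbfx_bit0(rn: u64) -> u64 {
        (((rn as i64) << 63) >> 63) as u64
    }

    // Models `and rd, rn, #1`: zero-extension from a 1-bit boolean.
    fn and_bit0(rn: u64) -> u64 {
        rn & 1
    }

    // Models `csetm rd, cond`: all-ones if the condition holds, else zero,
    // replacing the previous CSET + SUB pair.
    fn csetm(cond: bool) -> u64 {
        if cond { u64::MAX } else { 0 }
    }

    fn main() {
        assert_eq!(sbfx_bit0(1), u64::MAX); // a b1 "true" sign-extends to -1
        assert_eq!(sbfx_bit0(2), 0); // only bit 0 participates
        assert_eq!(and_bit0(3), 1); // zero-extension keeps just bit 0
        assert_eq!(csetm(true), u64::MAX);
        assert_eq!(csetm(false), 0);
    }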