diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
index aa6727b978..462628143f 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
@@ -85,12 +85,12 @@ pub fn u64_constant(bits: u64) -> ConstantData {
 // Instructions and subcomponents: emission
 
 fn machreg_to_gpr(m: Reg) -> u32 {
-    assert!(m.get_class() == RegClass::I64);
+    assert_eq!(m.get_class(), RegClass::I64);
     u32::try_from(m.to_real_reg().get_hw_encoding()).unwrap()
 }
 
 fn machreg_to_vec(m: Reg) -> u32 {
-    assert!(m.get_class() == RegClass::V128);
+    assert_eq!(m.get_class(), RegClass::V128);
     u32::try_from(m.to_real_reg().get_hw_encoding()).unwrap()
 }
 
@@ -948,6 +948,44 @@ impl MachInstEmit for Inst {
             };
             sink.put4(enc_fpurrr(top22, rd, rn, rm));
         }
+        &Inst::FpuRRI { fpu_op, rd, rn } => match fpu_op {
+            FPUOpRI::UShr32(imm) => {
+                debug_assert_eq!(32, imm.lane_size_in_bits);
+                sink.put4(
+                    0b0_0_1_011110_0000000_00_0_0_0_1_00000_00000
+                        | imm.enc() << 16
+                        | machreg_to_vec(rn) << 5
+                        | machreg_to_vec(rd.to_reg()),
+                )
+            }
+            FPUOpRI::UShr64(imm) => {
+                debug_assert_eq!(64, imm.lane_size_in_bits);
+                sink.put4(
+                    0b01_1_111110_0000000_00_0_0_0_1_00000_00000
+                        | imm.enc() << 16
+                        | machreg_to_vec(rn) << 5
+                        | machreg_to_vec(rd.to_reg()),
+                )
+            }
+            FPUOpRI::Sli64(imm) => {
+                debug_assert_eq!(64, imm.lane_size_in_bits);
+                sink.put4(
+                    0b01_1_111110_0000000_010101_00000_00000
+                        | imm.enc() << 16
+                        | machreg_to_vec(rn) << 5
+                        | machreg_to_vec(rd.to_reg()),
+                )
+            }
+            FPUOpRI::Sli32(imm) => {
+                debug_assert_eq!(32, imm.lane_size_in_bits);
+                sink.put4(
+                    0b0_0_1_011110_0000000_010101_00000_00000
+                        | imm.enc() << 16
+                        | machreg_to_vec(rn) << 5
+                        | machreg_to_vec(rd.to_reg()),
+                )
+            }
+        },
         &Inst::FpuRRRR {
             fpu_op,
             rd,
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
index 55977796ce..b948f4fd8c 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
@@ -2400,6 +2400,46 @@ fn test_aarch64_binemit() {
         "fmadd d15, d30, d31, d1",
     ));
 
+    insns.push((
+        Inst::FpuRRI {
+            fpu_op: FPUOpRI::UShr32(FPURightShiftImm::maybe_from_u8(32, 32).unwrap()),
+            rd: writable_vreg(2),
+            rn: vreg(5),
+        },
+        "A204202F",
+        "ushr v2.2s, v5.2s, #32",
+    ));
+
+    insns.push((
+        Inst::FpuRRI {
+            fpu_op: FPUOpRI::UShr64(FPURightShiftImm::maybe_from_u8(63, 64).unwrap()),
+            rd: writable_vreg(2),
+            rn: vreg(5),
+        },
+        "A204417F",
+        "ushr d2, d5, #63",
+    ));
+
+    insns.push((
+        Inst::FpuRRI {
+            fpu_op: FPUOpRI::Sli32(FPULeftShiftImm::maybe_from_u8(31, 32).unwrap()),
+            rd: writable_vreg(4),
+            rn: vreg(10),
+        },
+        "44553F2F",
+        "sli v4.2s, v10.2s, #31",
+    ));
+
+    insns.push((
+        Inst::FpuRRI {
+            fpu_op: FPUOpRI::Sli64(FPULeftShiftImm::maybe_from_u8(63, 64).unwrap()),
+            rd: writable_vreg(4),
+            rn: vreg(10),
+        },
+        "44557F7F",
+        "sli d4, d10, #63",
+    ));
+
     insns.push((
         Inst::FpuToInt {
             op: FpuToIntOp::F32ToU32,
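[Editor's note] The expected-encoding strings in these tests are the emitted bytes in memory order; AArch64 instructions are little-endian 32-bit words, so each string is the byte-reversal of the value passed to `sink.put4()` in emit.rs above. A standalone sketch (plain Rust, not the Cranelift API) reproducing the `"A204417F"` / `ushr d2, d5, #63` case:

```rust
fn main() {
    // Base opcode bits for the 64-bit USHR arm in emit.rs above.
    let base: u32 = 0b01_1_111110_0000000_00_0_0_0_1_00000_00000;
    // FPURightShiftImm::enc() for amount 63, 64-bit lanes: 2 * 64 - 63 = 65.
    let enc_imm: u32 = 2 * 64 - 63;
    let (rd, rn) = (2u32, 5u32); // hardware encodings of v2 and v5
    let word = base | enc_imm << 16 | rn << 5 | rd;
    assert_eq!(word, 0x7F4104A2);
    // Emitted little-endian, this is the "A204417F" string in the test.
    assert_eq!(word.to_le_bytes(), [0xA2, 0x04, 0x41, 0x7F]);
}
```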
diff --git a/cranelift/codegen/src/isa/aarch64/inst/imms.rs b/cranelift/codegen/src/isa/aarch64/inst/imms.rs
index b8e6bf65bf..7c473a83d2 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/imms.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/imms.rs
@@ -106,6 +106,85 @@ impl SImm7Scaled {
     }
 }
 
+#[derive(Clone, Copy, Debug)]
+pub struct FPULeftShiftImm {
+    pub amount: u8,
+    pub lane_size_in_bits: u8,
+}
+
+impl FPULeftShiftImm {
+    pub fn maybe_from_u8(amount: u8, lane_size_in_bits: u8) -> Option<Self> {
+        debug_assert!(lane_size_in_bits == 32 || lane_size_in_bits == 64);
+        if amount < lane_size_in_bits {
+            Some(Self {
+                amount,
+                lane_size_in_bits,
+            })
+        } else {
+            None
+        }
+    }
+
+    pub fn enc(&self) -> u32 {
+        debug_assert!(self.lane_size_in_bits.is_power_of_two());
+        debug_assert!(self.lane_size_in_bits > self.amount);
+        // The encoding of the immediate follows the table below,
+        // where xs encode the shift amount.
+        //
+        // | lane_size_in_bits | encoding |
+        // +------------------------------+
+        // | 8                 | 0001xxx  |
+        // | 16                | 001xxxx  |
+        // | 32                | 01xxxxx  |
+        // | 64                | 1xxxxxx  |
+        //
+        // The highest set bit is `lane_size_in_bits` itself. Since
+        // `lane_size_in_bits` is a power of two and `amount` is less
+        // than `lane_size_in_bits`, the two can be ORed
+        // together to produce the encoded value.
+        u32::from(self.lane_size_in_bits | self.amount)
+    }
+}
+
+#[derive(Clone, Copy, Debug)]
+pub struct FPURightShiftImm {
+    pub amount: u8,
+    pub lane_size_in_bits: u8,
+}
+
+impl FPURightShiftImm {
+    pub fn maybe_from_u8(amount: u8, lane_size_in_bits: u8) -> Option<Self> {
+        debug_assert!(lane_size_in_bits == 32 || lane_size_in_bits == 64);
+        if amount > 0 && amount <= lane_size_in_bits {
+            Some(Self {
+                amount,
+                lane_size_in_bits,
+            })
+        } else {
+            None
+        }
+    }
+
+    pub fn enc(&self) -> u32 {
+        debug_assert_ne!(0, self.amount);
+        // The encoding of the immediate follows the table below,
+        // where xs encodes the negated shift amount.
+        //
+        // | lane_size_in_bits | encoding |
+        // +------------------------------+
+        // | 8                 | 0001xxx  |
+        // | 16                | 001xxxx  |
+        // | 32                | 01xxxxx  |
+        // | 64                | 1xxxxxx  |
+        //
+        // The shift amount is negated: in the 64-bit case, a shift
+        // amount of 1 is encoded as 0b111111 and a shift amount of
+        // 64 as 0b000000, in the bottom 6 bits.
+        u32::from((self.lane_size_in_bits * 2) - self.amount)
+    }
+}
+
 /// a 9-bit signed offset.
 #[derive(Clone, Copy, Debug)]
 pub struct SImm9 {
@@ -576,6 +655,18 @@ impl ShowWithRRU for SImm7Scaled {
     }
 }
 
+impl ShowWithRRU for FPULeftShiftImm {
+    fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String {
+        format!("#{}", self.amount)
+    }
+}
+
+impl ShowWithRRU for FPURightShiftImm {
+    fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String {
+        format!("#{}", self.amount)
+    }
+}
+
 impl ShowWithRRU for SImm9 {
     fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String {
         format!("#{}", self.value)
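[Editor's note] A quick worked check of the two `enc()` formulas above (plain Rust, standalone, mirroring the immediates used in the emit tests). Left shifts encode `lane_size | amount` into immh:immb; right shifts encode the negated amount, `2 * lane_size - amount`:

```rust
fn main() {
    // sli d, d, #63: 64-bit lane marker (1xxxxxx) with xs = 63.
    assert_eq!(64u32 | 63, 0b111_1111);
    // ushr d, d, #63: bottom six bits hold 64 - 63 = 1.
    assert_eq!(2 * 64 - 63, 0b100_0001);
    // ushr v.2s, v.2s, #32: 32-bit lane marker (01xxxxx) with xs = 0.
    assert_eq!(2 * 32 - 32, 0b010_0000);
}
```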
diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
index fd910522c5..53953efc52 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
@@ -4,7 +4,7 @@
 #![allow(dead_code)]
 
 use crate::binemit::CodeOffset;
-use crate::ir::types::{B1, B16, B32, B64, B8, F32, F64, FFLAGS, I16, I32, I64, I8, IFLAGS};
+use crate::ir::types::{B1, B16, B32, B64, B8, F32, F32X2, F64, FFLAGS, I16, I32, I64, I8, IFLAGS};
 use crate::ir::{ExternalName, Opcode, SourceLoc, TrapCode, Type};
 use crate::machinst::*;
 use crate::{settings, CodegenError, CodegenResult};
@@ -124,6 +124,19 @@ pub enum FPUOp2 {
     Min64,
 }
 
+/// A floating-point unit (FPU) operation with two args, a register and an immediate.
+#[derive(Copy, Clone, Debug)]
+pub enum FPUOpRI {
+    /// Unsigned right shift. Rd = Rn >> #imm
+    UShr32(FPURightShiftImm),
+    /// Unsigned right shift. Rd = Rn >> #imm
+    UShr64(FPURightShiftImm),
+    /// Shift left and insert. Rd |= Rn << #imm
+    Sli32(FPULeftShiftImm),
+    /// Shift left and insert. Rd |= Rn << #imm
+    Sli64(FPULeftShiftImm),
+}
+
 /// A floating-point unit (FPU) operation with three args.
 #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
 pub enum FPUOp3 {
@@ -472,6 +485,12 @@ pub enum Inst {
         rm: Reg,
     },
 
+    FpuRRI {
+        fpu_op: FPUOpRI,
+        rd: Writable<Reg>,
+        rn: Reg,
+    },
+
     /// 3-op FPU instruction.
     FpuRRRR {
         fpu_op: FPUOp3,
@@ -1034,6 +1053,13 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
             collector.add_use(rn);
             collector.add_use(rm);
         }
+        &Inst::FpuRRI { fpu_op, rd, rn, .. } => {
+            match fpu_op {
+                FPUOpRI::UShr32(..) | FPUOpRI::UShr64(..) => collector.add_def(rd),
+                FPUOpRI::Sli32(..) | FPUOpRI::Sli64(..) => collector.add_mod(rd),
+            }
+            collector.add_use(rn);
+        }
         &Inst::FpuRRRR { rd, rn, rm, ra, .. } => {
             collector.add_def(rd);
             collector.add_use(rn);
@@ -1482,6 +1508,14 @@ fn aarch64_map_regs(inst: &mut Inst, mapper: &RegUsageMapper) {
             map_use(mapper, rn);
             map_use(mapper, rm);
         }
+        &mut Inst::FpuRRI {
+            ref mut rd,
+            ref mut rn,
+            ..
+        } => {
+            map_def(mapper, rd);
+            map_use(mapper, rn);
+        }
         &mut Inst::FpuRRRR {
            ref mut rd,
            ref mut rn,
@@ -2236,6 +2270,23 @@ impl ShowWithRRU for Inst {
                 let rm = show_freg_sized(rm, mb_rru, size);
                 format!("{} {}, {}, {}", op, rd, rn, rm)
             }
+            &Inst::FpuRRI { fpu_op, rd, rn } => {
+                let (op, imm, vector) = match fpu_op {
+                    FPUOpRI::UShr32(imm) => ("ushr", imm.show_rru(mb_rru), true),
+                    FPUOpRI::UShr64(imm) => ("ushr", imm.show_rru(mb_rru), false),
+                    FPUOpRI::Sli32(imm) => ("sli", imm.show_rru(mb_rru), true),
+                    FPUOpRI::Sli64(imm) => ("sli", imm.show_rru(mb_rru), false),
+                };
+
+                let show_vreg_fn: fn(Reg, Option<&RealRegUniverse>) -> String = if vector {
+                    |reg, mb_rru| show_vreg_vector(reg, mb_rru, F32X2)
+                } else {
+                    show_vreg_scalar
+                };
+                let rd = show_vreg_fn(rd.to_reg(), mb_rru);
+                let rn = show_vreg_fn(rn, mb_rru);
+                format!("{} {}, {}, {}", op, rd, rn, imm)
+            }
             &Inst::FpuRRRR {
                 fpu_op,
                 rd,
diff --git a/cranelift/codegen/src/isa/aarch64/inst/regs.rs b/cranelift/codegen/src/isa/aarch64/inst/regs.rs
index 099d816266..242fb66fc9 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/regs.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/regs.rs
@@ -1,5 +1,6 @@
 //! AArch64 ISA definitions: registers.
 
+use crate::ir::types::*;
 use crate::isa::aarch64::inst::InstSize;
 use crate::machinst::*;
 use crate::settings;
@@ -307,3 +308,16 @@ pub fn show_vreg_scalar(reg: Reg, mb_rru: Option<&RealRegUniverse>) -> String {
     }
     s
 }
+
+/// Show a vector register.
+pub fn show_vreg_vector(reg: Reg, mb_rru: Option<&RealRegUniverse>, ty: Type) -> String {
+    assert_eq!(RegClass::V128, reg.get_class());
+    let mut s = reg.show_rru(mb_rru);
+
+    match ty {
+        F32X2 => s.push_str(".2s"),
+        _ => unimplemented!(),
+    }
+
+    s
+}
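[Editor's note] The `add_def`/`add_mod` split in `aarch64_get_regs` above is the subtle point here: USHR fully overwrites rd, but SLI only overwrites the bits covered by the shifted source and keeps the rest of rd, so the register allocator must treat rd as read-modify-write. A bit-level sketch of that semantics (plain Rust; `sli64` is a hypothetical helper, not a Cranelift API):

```rust
/// What `sli d_rd, d_rn, #imm` computes on a 64-bit lane.
fn sli64(rd: u64, rn: u64, imm: u32) -> u64 {
    let written = !0u64 << imm; // bit positions written by the shifted rn
    (rd & !written) | (rn << imm)
}

fn main() {
    // The low `imm` bits of rd survive, so rd is a use as well as a def.
    assert_eq!(sli64(0x0000_0000_0000_00FF, 1, 63), 0x8000_0000_0000_00FF);
}
```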
diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
index a92dea7a7b..2946e16471 100644
--- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -1460,54 +1460,38 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
         Opcode::Fcopysign => {
             // Copy the sign bit from inputs[1] to inputs[0]. We use the following sequence:
             //
-            // (64 bits for example, 32-bit sequence is analogous):
+            // This is a scalar Fcopysign: it uses scalar NEON operations
+            // for 64 bits and vector (2S) operations for 32 bits.
             //
-            // MOV Xtmp1, Dinput0
-            // MOV Xtmp2, Dinput1
-            // AND Xtmp2, 0x8000_0000_0000_0000
-            // BIC Xtmp1, 0x8000_0000_0000_0000
-            // ORR Xtmp1, Xtmp1, Xtmp2
-            // MOV Doutput, Xtmp1
+            // mov vd, vn
+            // ushr vtmp, vm, #63 / #31
+            // sli vd, vtmp, #63 / #31
 
             let ty = ctx.output_ty(insn, 0);
-            let bits = ty_bits(ty);
+            let bits = ty_bits(ty) as u8;
             assert!(bits == 32 || bits == 64);
             let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None);
             let rm = input_to_reg(ctx, inputs[1], NarrowValueMode::None);
             let rd = output_to_reg(ctx, outputs[0]);
-            let tmp1 = ctx.alloc_tmp(RegClass::I64, I64);
-            let tmp2 = ctx.alloc_tmp(RegClass::I64, I64);
-            ctx.emit(Inst::MovFromVec64 { rd: tmp1, rn: rn });
-            ctx.emit(Inst::MovFromVec64 { rd: tmp2, rn: rm });
-            let imml = if bits == 32 {
-                ImmLogic::maybe_from_u64(0x8000_0000, I32).unwrap()
-            } else {
-                ImmLogic::maybe_from_u64(0x8000_0000_0000_0000, I64).unwrap()
-            };
-            let alu_op = choose_32_64(ty, ALUOp::And32, ALUOp::And64);
-            ctx.emit(Inst::AluRRImmLogic {
-                alu_op,
-                rd: tmp2,
-                rn: tmp2.to_reg(),
-                imml: imml.clone(),
+            let tmp = ctx.alloc_tmp(RegClass::V128, F64);
+
+            // Copy LHS to rd.
+            ctx.emit(Inst::FpuMove64 { rd, rn });
+
+            // Copy the sign bit to the lowest bit in tmp.
+            let imm = FPURightShiftImm::maybe_from_u8(bits - 1, bits).unwrap();
+            ctx.emit(Inst::FpuRRI {
+                fpu_op: choose_32_64(ty, FPUOpRI::UShr32(imm), FPUOpRI::UShr64(imm)),
+                rd: tmp,
+                rn: rm,
             });
-            let alu_op = choose_32_64(ty, ALUOp::AndNot32, ALUOp::AndNot64);
-            ctx.emit(Inst::AluRRImmLogic {
-                alu_op,
-                rd: tmp1,
-                rn: tmp1.to_reg(),
-                imml,
-            });
-            let alu_op = choose_32_64(ty, ALUOp::Orr32, ALUOp::Orr64);
-            ctx.emit(Inst::AluRRR {
-                alu_op,
-                rd: tmp1,
-                rn: tmp1.to_reg(),
-                rm: tmp2.to_reg(),
-            });
-            ctx.emit(Inst::MovToVec64 {
+
+            // Insert the bit from tmp into the sign bit of rd.
+            let imm = FPULeftShiftImm::maybe_from_u8(bits - 1, bits).unwrap();
+            ctx.emit(Inst::FpuRRI {
+                fpu_op: choose_32_64(ty, FPUOpRI::Sli32(imm), FPUOpRI::Sli64(imm)),
                 rd,
-                rn: tmp1.to_reg(),
+                rn: tmp.to_reg(),
             });
         }
diff --git a/cranelift/filetests/filetests/vcode/aarch64/floating-point.clif b/cranelift/filetests/filetests/vcode/aarch64/floating-point.clif
index b5d0c3768b..6b991026ae 100644
--- a/cranelift/filetests/filetests/vcode/aarch64/floating-point.clif
+++ b/cranelift/filetests/filetests/vcode/aarch64/floating-point.clif
@@ -397,12 +397,8 @@ block0(v0: f32, v1: f32):
 
 ; check: stp fp, lr, [sp, #-16]!
 ; nextln: mov fp, sp
-; nextln: mov x0, v0.d[0]
-; nextln: mov x1, v1.d[0]
-; nextln: and w1, w1, #2147483648
-; nextln: bic w0, w0, #2147483648
-; nextln: orr w0, w0, w1
-; nextln: mov v0.d[0], x0
+; nextln: ushr v1.2s, v1.2s, #31
+; nextln: sli v0.2s, v1.2s, #31
 ; nextln: mov sp, fp
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
@@ -415,12 +415,8 @@ block0(v0: f64, v1: f64):
 
 ; check: stp fp, lr, [sp, #-16]!
 ; nextln: mov fp, sp
-; nextln: mov x0, v0.d[0]
-; nextln: mov x1, v1.d[0]
-; nextln: and x1, x1, #9223372036854775808
-; nextln: bic x0, x0, #9223372036854775808
-; nextln: orr x0, x0, x1
-; nextln: mov v0.d[0], x0
+; nextln: ushr d1, d1, #63
+; nextln: sli d0, d1, #63
 ; nextln: mov sp, fp
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
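[Editor's note] At the bit level, the new three-instruction sequence (`mov` / `ushr` / `sli`) computes exactly copysign: the right shift moves the sign of the second operand into bit 0, and the shift-left-and-insert writes it back into the sign position of the result while preserving the magnitude bits. A standalone sketch of the f64 case (plain Rust; `fcopysign_f64` is a hypothetical helper):

```rust
/// Models: mov vd, vn; ushr vtmp, vm, #63; sli vd, vtmp, #63.
fn fcopysign_f64(vn: f64, vm: f64) -> f64 {
    let rd = vn.to_bits(); // mov vd, vn
    let tmp = vm.to_bits() >> 63; // ushr: sign bit of vm lands in bit 0
    let out = (rd & !(1u64 << 63)) | (tmp << 63); // sli: insert into bit 63
    f64::from_bits(out)
}

fn main() {
    assert_eq!(fcopysign_f64(1.5, -2.0), -1.5);
    assert_eq!(fcopysign_f64(-1.5, 2.0), 1.5);
}
```

The f32 lowering is the same shape with #31 on the .2s arrangement; only lane 0 of the vector carries the scalar result.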