arm64: Use FPU instructions for Fcopysign

2020-05-21 18:14:12 +01:00
parent 5c39b74eb8
commit 02c3f238f8
7 changed files with 264 additions and 54 deletions
--- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
@@ -85,12 +85,12 @@ pub fn u64_constant(bits: u64) -> ConstantData {
 // Instructions and subcomponents: emission
 /// Returns the hardware encoding of an integer-class register as a `u32`.
 ///
 /// Panics if `m` is not in `RegClass::I64`, or if the hardware encoding does
 /// not fit in a `u32`. `to_real_reg` is called on `m`, so the register is
 /// expected to already be a real (allocated) register.
 fn machreg_to_gpr(m: Reg) -> u32 {
-    assert!(m.get_class() == RegClass::I64);
+    assert_eq!(m.get_class(), RegClass::I64);
    u32::try_from(m.to_real_reg().get_hw_encoding()).unwrap()
 }
 /// Returns the hardware encoding of a vector-class register as a `u32`.
 ///
 /// Panics if `m` is not in `RegClass::V128`, or if the hardware encoding does
 /// not fit in a `u32`. `to_real_reg` is called on `m`, so the register is
 /// expected to already be a real (allocated) register.
 fn machreg_to_vec(m: Reg) -> u32 {
-    assert!(m.get_class() == RegClass::V128);
+    assert_eq!(m.get_class(), RegClass::V128);
    u32::try_from(m.to_real_reg().get_hw_encoding()).unwrap()
 }
@@ -948,6 +948,44 @@ impl MachInstEmit for Inst {
                };
                sink.put4(enc_fpurrr(top22, rd, rn, rm));
            }
            &Inst::FpuRRI { fpu_op, rd, rn } => {
                // Every variant shares the same field layout: fixed opcode bits,
                // the encoded shift immediate at bit 16, Rn at bit 5 and Rd at
                // bit 0. Only the opcode bits and the immediate differ, so pick
                // those per variant and assemble the instruction word once.
                let (opcode_bits, shift_bits): (u32, u32) = match fpu_op {
                    FPUOpRI::UShr32(imm) => {
                        debug_assert_eq!(32, imm.lane_size_in_bits);
                        (
                            0b0_0_1_011110_0000000_00_0_0_0_1_00000_00000,
                            imm.enc(),
                        )
                    }
                    FPUOpRI::UShr64(imm) => {
                        debug_assert_eq!(64, imm.lane_size_in_bits);
                        (
                            0b01_1_111110_0000000_00_0_0_0_1_00000_00000,
                            imm.enc(),
                        )
                    }
                    FPUOpRI::Sli64(imm) => {
                        debug_assert_eq!(64, imm.lane_size_in_bits);
                        (0b01_1_111110_0000000_010101_00000_00000, imm.enc())
                    }
                    FPUOpRI::Sli32(imm) => {
                        debug_assert_eq!(32, imm.lane_size_in_bits);
                        (0b0_0_1_011110_0000000_010101_00000_00000, imm.enc())
                    }
                };
                sink.put4(
                    opcode_bits
                        | shift_bits << 16
                        | machreg_to_vec(rn) << 5
                        | machreg_to_vec(rd.to_reg()),
                );
            }
            &Inst::FpuRRRR {
                fpu_op,
                rd,
--- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
@@ -2400,6 +2400,46 @@ fn test_aarch64_binemit() {
        "fmadd d15, d30, d31, d1",
    ));
    insns.push((
        Inst::FpuRRI {
            fpu_op: FPUOpRI::UShr32(FPURightShiftImm::maybe_from_u8(32, 32).unwrap()),
            rd: writable_vreg(2),
            rn: vreg(5),
        },
        "A204202F",
        "ushr v2.2s, v5.2s, #32",
    ));
    insns.push((
        Inst::FpuRRI {
            fpu_op: FPUOpRI::UShr64(FPURightShiftImm::maybe_from_u8(63, 64).unwrap()),
            rd: writable_vreg(2),
            rn: vreg(5),
        },
        "A204417F",
        "ushr d2, d5, #63",
    ));
    insns.push((
        Inst::FpuRRI {
            fpu_op: FPUOpRI::Sli32(FPULeftShiftImm::maybe_from_u8(31, 32).unwrap()),
            rd: writable_vreg(4),
            rn: vreg(10),
        },
        "44553F2F",
        "sli v4.2s, v10.2s, #31",
    ));
    insns.push((
        Inst::FpuRRI {
            fpu_op: FPUOpRI::Sli64(FPULeftShiftImm::maybe_from_u8(63, 64).unwrap()),
            rd: writable_vreg(4),
            rn: vreg(10),
        },
        "44557F7F",
        "sli d4, d10, #63",
    ));
    insns.push((
        Inst::FpuToInt {
            op: FpuToIntOp::F32ToU32,
--- a/cranelift/codegen/src/isa/aarch64/inst/imms.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/imms.rs
@@ -106,6 +106,85 @@ impl SImm7Scaled {
    }
 }
 /// Immediate operand of an FPU/vector left-shift instruction: a shift amount
 /// together with the lane width it applies to.
 #[derive(Clone, Copy, Debug)]
 pub struct FPULeftShiftImm {
    /// Number of bit positions to shift by; must be less than `lane_size_in_bits`.
    pub amount: u8,
    /// Width of one lane, in bits.
    pub lane_size_in_bits: u8,
 }
 impl FPULeftShiftImm {
    /// Builds the immediate, or returns `None` when `amount` is not strictly
    /// smaller than `lane_size_in_bits`.
    ///
    /// Only 32- and 64-bit lanes are expected (checked in debug builds).
    pub fn maybe_from_u8(amount: u8, lane_size_in_bits: u8) -> Option<Self> {
        debug_assert!([32, 64].contains(&lane_size_in_bits));
        if amount >= lane_size_in_bits {
            return None;
        }
        Some(Self {
            amount,
            lane_size_in_bits,
        })
    }
    /// Returns the 7-bit immediate field value for this shift.
    pub fn enc(&self) -> u32 {
        let Self {
            amount,
            lane_size_in_bits,
        } = *self;
        debug_assert!(lane_size_in_bits.is_power_of_two());
        debug_assert!(amount < lane_size_in_bits);
        // Layout of the immediate field (xs hold the shift amount):
        //
        // | lane_size_in_bits | encoding |
        // +------------------------------+
        // | 8                 | 0001xxx  |
        // | 16                | 001xxxx  |
        // | 32                | 01xxxxx  |
        // | 64                | 1xxxxxx  |
        //
        // The leading one bit is exactly `lane_size_in_bits`. Because that is a
        // power of two and `amount` is strictly smaller, the two values share
        // no set bits and can simply be ORed together to produce the encoding.
        u32::from(lane_size_in_bits | amount)
    }
 }
 /// Immediate operand of an FPU/vector right-shift instruction: a shift amount
 /// together with the lane width it applies to.
 #[derive(Clone, Copy, Debug)]
 pub struct FPURightShiftImm {
    /// Number of bit positions to shift by; valid values are
    /// `1..=lane_size_in_bits`.
    pub amount: u8,
    /// Width of one lane, in bits.
    pub lane_size_in_bits: u8,
 }
 impl FPURightShiftImm {
    /// Builds the immediate, or returns `None` when `amount` is zero or larger
    /// than `lane_size_in_bits`.
    ///
    /// Only 32- and 64-bit lanes are expected (checked in debug builds).
    pub fn maybe_from_u8(amount: u8, lane_size_in_bits: u8) -> Option<Self> {
        debug_assert!(lane_size_in_bits == 32 || lane_size_in_bits == 64);
        if amount > 0 && amount <= lane_size_in_bits {
            Some(Self {
                amount,
                lane_size_in_bits,
            })
        } else {
            None
        }
    }
    /// Returns the 7-bit immediate field value for this shift.
    pub fn enc(&self) -> u32 {
        // The fields are public, so re-check the invariants that
        // `maybe_from_u8` establishes (same checks as `FPULeftShiftImm::enc`,
        // adjusted for the 1..=lane_size range of right shifts).
        debug_assert!(self.lane_size_in_bits.is_power_of_two());
        debug_assert_ne!(0, self.amount);
        debug_assert!(self.amount <= self.lane_size_in_bits);
        // The encoding of the immediate follows the table below,
        // where xs encode `lane_size_in_bits - amount`.
        //
        // | lane_size_in_bits | encoding |
        // +------------------------------+
        // | 8                 | 0001xxx  |
        // | 16                | 001xxxx  |
        // | 32                | 01xxxxx  |
        // | 64                | 1xxxxxx  |
        //
        // The leading one bit is `lane_size_in_bits` itself, so the whole
        // field equals `2 * lane_size_in_bits - amount`. For 64-bit lanes a
        // shift amount of 1 is encoded as 0b111111 and a shift amount of 64
        // as 0b000000 in the bottom 6 bits.
        //
        // Widen to u32 before doing the arithmetic so that the doubling and
        // subtraction can never overflow the u8 field type.
        u32::from(self.lane_size_in_bits) * 2 - u32::from(self.amount)
    }
 }
 /// a 9-bit signed offset.
 #[derive(Clone, Copy, Debug)]
 pub struct SImm9 {
@@ -576,6 +655,18 @@ impl ShowWithRRU for SImm7Scaled {
    }
 }
 impl ShowWithRRU for FPULeftShiftImm {
    /// Formats the shift amount as an assembly immediate (`#<amount>`).
    /// The register universe is not consulted.
    fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String {
        let Self { amount, .. } = self;
        format!("#{}", amount)
    }
 }
 impl ShowWithRRU for FPURightShiftImm {
    /// Formats the shift amount as an assembly immediate (`#<amount>`).
    /// The register universe is not consulted.
    fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String {
        let shift = self.amount;
        format!("#{}", shift)
    }
 }
 impl ShowWithRRU for SImm9 {
    fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String {
        format!("#{}", self.value)
--- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
@@ -4,7 +4,7 @@
 #![allow(dead_code)]
 use crate::binemit::CodeOffset;
-use crate::ir::types::{B1, B16, B32, B64, B8, F32, F64, FFLAGS, I16, I32, I64, I8, IFLAGS};
+use crate::ir::types::{B1, B16, B32, B64, B8, F32, F32X2, F64, FFLAGS, I16, I32, I64, I8, IFLAGS};
 use crate::ir::{ExternalName, Opcode, SourceLoc, TrapCode, Type};
 use crate::machinst::*;
 use crate::{settings, CodegenError, CodegenResult};
@@ -124,6 +124,19 @@ pub enum FPUOp2 {
    Min64,
 }
 /// A floating-point unit (FPU) operation with two args, a register and an immediate.
 ///
 /// The 32-bit variants are printed with a `.2s` vector arrangement and the
 /// 64-bit variants as scalar registers.
 #[derive(Copy, Clone, Debug)]
 pub enum FPUOpRI {
    /// Unsigned right shift. Rd = Rn >> #imm
    UShr32(FPURightShiftImm),
    /// Unsigned right shift. Rd = Rn >> #imm
    UShr64(FPURightShiftImm),
    /// Shift left and insert. Rd = (Rn << #imm) | (Rd & ((1 << #imm) - 1)),
    /// i.e. the low #imm bits of Rd are preserved, so Rd is both read and written.
    Sli32(FPULeftShiftImm),
    /// Shift left and insert. Rd = (Rn << #imm) | (Rd & ((1 << #imm) - 1)),
    /// i.e. the low #imm bits of Rd are preserved, so Rd is both read and written.
    Sli64(FPULeftShiftImm),
 }
 /// A floating-point unit (FPU) operation with three args.
 #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
 pub enum FPUOp3 {
@@ -472,6 +485,12 @@ pub enum Inst {
        rm: Reg,
    },
    /// FPU instruction with one source register and an immediate; the
    /// immediate is carried inside `fpu_op`.
    FpuRRI {
        fpu_op: FPUOpRI,
        rd: Writable<Reg>,
        rn: Reg,
    },
    /// 3-op FPU instruction.
    FpuRRRR {
        fpu_op: FPUOp3,
@@ -1034,6 +1053,13 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
            collector.add_use(rn);
            collector.add_use(rm);
        }
        &Inst::FpuRRI { fpu_op, rd, rn, .. } => {
            match fpu_op {
                // A plain shift overwrites rd, so rd is a pure definition.
                FPUOpRI::UShr32(..) | FPUOpRI::UShr64(..) => collector.add_def(rd),
                // Shift-and-insert is registered as a modification of rd:
                // the destination is both read and written.
                FPUOpRI::Sli32(..) | FPUOpRI::Sli64(..) => collector.add_mod(rd),
            }
            collector.add_use(rn);
        }
        &Inst::FpuRRRR { rd, rn, rm, ra, .. } => {
            collector.add_def(rd);
            collector.add_use(rn);
@@ -1482,6 +1508,14 @@ fn aarch64_map_regs(inst: &mut Inst, mapper: &RegUsageMapper) {
            map_use(mapper, rn);
            map_use(mapper, rm);
        }
        &mut Inst::FpuRRI {
            fpu_op,
            ref mut rd,
            ref mut rn,
            ..
        } => {
            // The mapping must mirror what `aarch64_get_regs` reports for this
            // instruction: the plain shifts define rd, whereas shift-and-insert
            // modifies it (rd is read as well as written), so it has to go
            // through the "mod" mapping rather than the "def" mapping.
            match fpu_op {
                FPUOpRI::UShr32(..) | FPUOpRI::UShr64(..) => map_def(mapper, rd),
                FPUOpRI::Sli32(..) | FPUOpRI::Sli64(..) => map_mod(mapper, rd),
            }
            map_use(mapper, rn);
        }
        &mut Inst::FpuRRRR {
            ref mut rd,
            ref mut rn,
@@ -2236,6 +2270,23 @@ impl ShowWithRRU for Inst {
                let rm = show_freg_sized(rm, mb_rru, size);
                format!("{} {}, {}, {}", op, rd, rn, rm)
            }
            &Inst::FpuRRI { fpu_op, rd, rn } => {
                // Pick the mnemonic and the printed immediate, and note whether
                // the operands are shown as 2S vectors (32-bit forms) or as
                // scalar registers (64-bit forms).
                let (mnemonic, shift, is_vector) = match fpu_op {
                    FPUOpRI::UShr32(imm) => ("ushr", imm.show_rru(mb_rru), true),
                    FPUOpRI::UShr64(imm) => ("ushr", imm.show_rru(mb_rru), false),
                    FPUOpRI::Sli32(imm) => ("sli", imm.show_rru(mb_rru), true),
                    FPUOpRI::Sli64(imm) => ("sli", imm.show_rru(mb_rru), false),
                };
                let show_operand = |reg: Reg| -> String {
                    if is_vector {
                        show_vreg_vector(reg, mb_rru, F32X2)
                    } else {
                        show_vreg_scalar(reg, mb_rru)
                    }
                };
                let dst = show_operand(rd.to_reg());
                let src = show_operand(rn);
                format!("{} {}, {}, {}", mnemonic, dst, src, shift)
            }
            &Inst::FpuRRRR {
                fpu_op,
                rd,
--- a/cranelift/codegen/src/isa/aarch64/inst/regs.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/regs.rs
@@ -1,5 +1,6 @@
 //! AArch64 ISA definitions: registers.
 use crate::ir::types::*;
 use crate::isa::aarch64::inst::InstSize;
 use crate::machinst::*;
 use crate::settings;
@@ -307,3 +308,16 @@ pub fn show_vreg_scalar(reg: Reg, mb_rru: Option<&RealRegUniverse>) -> String {
    }
    s
 }
 /// Show a vector register, i.e. the register name followed by the
 /// arrangement suffix that corresponds to `ty`.
 ///
 /// Panics if `reg` is not in the `V128` class. Only `F32X2` (suffix `.2s`)
 /// is implemented so far; any other type hits `unimplemented!()`.
 pub fn show_vreg_vector(reg: Reg, mb_rru: Option<&RealRegUniverse>, ty: Type) -> String {
    assert_eq!(RegClass::V128, reg.get_class());
    let name = reg.show_rru(mb_rru);
    let arrangement = match ty {
        F32X2 => ".2s",
        _ => unimplemented!(),
    };
    format!("{}{}", name, arrangement)
 }
--- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -1460,54 +1460,38 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
        Opcode::Fcopysign => {
            // Copy the sign bit from inputs[1] to inputs[0]. We use the following sequence:
            //
-            // (64 bits for example, 32-bit sequence is analogous):
+            // This is a scalar Fcopysign.
            // This uses scalar NEON operations for 64-bit and vector operations (2S) for 32-bit.
            //
-            // MOV Xtmp1, Dinput0
+            //  mov vd, vn
-            // MOV Xtmp2, Dinput1
+            //  ushr vtmp, vm, #63 / #31
-            // AND Xtmp2, 0x8000_0000_0000_0000
+            //  sli vd, vtmp, #63 / #31
            // BIC Xtmp1, 0x8000_0000_0000_0000
            // ORR Xtmp1, Xtmp1, Xtmp2
            // MOV Doutput, Xtmp1
            let ty = ctx.output_ty(insn, 0);
-            let bits = ty_bits(ty);
+            let bits = ty_bits(ty) as u8;
            assert!(bits == 32 || bits == 64);
            let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None);
            let rm = input_to_reg(ctx, inputs[1], NarrowValueMode::None);
            let rd = output_to_reg(ctx, outputs[0]);
-            let tmp1 = ctx.alloc_tmp(RegClass::I64, I64);
+            let tmp = ctx.alloc_tmp(RegClass::V128, F64);
-            let tmp2 = ctx.alloc_tmp(RegClass::I64, I64);
+
-            ctx.emit(Inst::MovFromVec64 { rd: tmp1, rn: rn });
+            // Copy LHS to rd.
-            ctx.emit(Inst::MovFromVec64 { rd: tmp2, rn: rm });
+            ctx.emit(Inst::FpuMove64 { rd, rn });
-            let imml = if bits == 32 {
+
-                ImmLogic::maybe_from_u64(0x8000_0000, I32).unwrap()
+            // Copy the sign bit to the lowest bit in tmp.
-            } else {
+            let imm = FPURightShiftImm::maybe_from_u8(bits - 1, bits).unwrap();
-                ImmLogic::maybe_from_u64(0x8000_0000_0000_0000, I64).unwrap()
+            ctx.emit(Inst::FpuRRI {
-            };
+                fpu_op: choose_32_64(ty, FPUOpRI::UShr32(imm), FPUOpRI::UShr64(imm)),
-            let alu_op = choose_32_64(ty, ALUOp::And32, ALUOp::And64);
+                rd: tmp,
-            ctx.emit(Inst::AluRRImmLogic {
+                rn: rm,
                alu_op,
                rd: tmp2,
                rn: tmp2.to_reg(),
                imml: imml.clone(),
            });
-            let alu_op = choose_32_64(ty, ALUOp::AndNot32, ALUOp::AndNot64);
+
-            ctx.emit(Inst::AluRRImmLogic {
+            // Insert the bit from tmp into the sign bit of rd.
-                alu_op,
+            let imm = FPULeftShiftImm::maybe_from_u8(bits - 1, bits).unwrap();
-                rd: tmp1,
+            ctx.emit(Inst::FpuRRI {
-                rn: tmp1.to_reg(),
+                fpu_op: choose_32_64(ty, FPUOpRI::Sli32(imm), FPUOpRI::Sli64(imm)),
                imml,
            });
            let alu_op = choose_32_64(ty, ALUOp::Orr32, ALUOp::Orr64);
            ctx.emit(Inst::AluRRR {
                alu_op,
                rd: tmp1,
                rn: tmp1.to_reg(),
                rm: tmp2.to_reg(),
            });
            ctx.emit(Inst::MovToVec64 {
                rd,
-                rn: tmp1.to_reg(),
+                rn: tmp.to_reg(),
            });
        }
--- a/cranelift/filetests/filetests/vcode/aarch64/floating-point.clif
+++ b/cranelift/filetests/filetests/vcode/aarch64/floating-point.clif
@@ -397,12 +397,8 @@ block0(v0: f32, v1: f32):
 ; check:  stp fp, lr, [sp, #-16]!
 ; nextln:  mov fp, sp
-; nextln:  mov x0, v0.d[0]
+; nextln:  ushr v1.2s, v1.2s, #31
-; nextln:  mov x1, v1.d[0]
+; nextln:  sli v0.2s, v1.2s, #31
 ; nextln:  and w1, w1, #2147483648
 ; nextln:  bic w0, w0, #2147483648
 ; nextln:  orr w0, w0, w1
 ; nextln:  mov v0.d[0], x0
 ; nextln:  mov sp, fp
 ; nextln:  ldp fp, lr, [sp], #16
 ; nextln:  ret
@@ -415,12 +411,8 @@ block0(v0: f64, v1: f64):
 ; check:  stp fp, lr, [sp, #-16]!
 ; nextln:  mov fp, sp
-; nextln:  mov x0, v0.d[0]
+; nextln:  ushr d1, d1, #63
-; nextln:  mov x1, v1.d[0]
+; nextln:  sli d0, d1, #63
 ; nextln:  and x1, x1, #9223372036854775808
 ; nextln:  bic x0, x0, #9223372036854775808
 ; nextln:  orr x0, x0, x1
 ; nextln:  mov v0.d[0], x0
 ; nextln:  mov sp, fp
 ; nextln:  ldp fp, lr, [sp], #16
 ; nextln:  ret