diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
index aa6727b978..462628143f 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
@@ -85,12 +85,12 @@ pub fn u64_constant(bits: u64) -> ConstantData {
 // Instructions and subcomponents: emission
 
 fn machreg_to_gpr(m: Reg) -> u32 {
-    assert!(m.get_class() == RegClass::I64);
+    assert_eq!(m.get_class(), RegClass::I64);
     u32::try_from(m.to_real_reg().get_hw_encoding()).unwrap()
 }
 
 fn machreg_to_vec(m: Reg) -> u32 {
-    assert!(m.get_class() == RegClass::V128);
+    assert_eq!(m.get_class(), RegClass::V128);
     u32::try_from(m.to_real_reg().get_hw_encoding()).unwrap()
 }
 
@@ -948,6 +948,44 @@ impl MachInstEmit for Inst {
             };
             sink.put4(enc_fpurrr(top22, rd, rn, rm));
         }
+        &Inst::FpuRRI { fpu_op, rd, rn } => match fpu_op {
+            FPUOpRI::UShr32(imm) => {
+                debug_assert_eq!(32, imm.lane_size_in_bits);
+                sink.put4(
+                    0b0_0_1_011110_0000000_00_0_0_0_1_00000_00000
+                        | imm.enc() << 16
+                        | machreg_to_vec(rn) << 5
+                        | machreg_to_vec(rd.to_reg()),
+                )
+            }
+            FPUOpRI::UShr64(imm) => {
+                debug_assert_eq!(64, imm.lane_size_in_bits);
+                sink.put4(
+                    0b01_1_111110_0000000_00_0_0_0_1_00000_00000
+                        | imm.enc() << 16
+                        | machreg_to_vec(rn) << 5
+                        | machreg_to_vec(rd.to_reg()),
+                )
+            }
+            FPUOpRI::Sli64(imm) => {
+                debug_assert_eq!(64, imm.lane_size_in_bits);
+                sink.put4(
+                    0b01_1_111110_0000000_010101_00000_00000
+                        | imm.enc() << 16
+                        | machreg_to_vec(rn) << 5
+                        | machreg_to_vec(rd.to_reg()),
+                )
+            }
+            FPUOpRI::Sli32(imm) => {
+                debug_assert_eq!(32, imm.lane_size_in_bits);
+                sink.put4(
+                    0b0_0_1_011110_0000000_010101_00000_00000
+                        | imm.enc() << 16
+                        | machreg_to_vec(rn) << 5
+                        | machreg_to_vec(rd.to_reg()),
+                )
+            }
+        },
         &Inst::FpuRRRR {
             fpu_op,
             rd,
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
index 55977796ce..b948f4fd8c 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
@@ -2400,6 +2400,46 @@ fn test_aarch64_binemit() {
         "fmadd d15, d30, d31, d1",
     ));
 
+    insns.push((
+        Inst::FpuRRI {
+            fpu_op: FPUOpRI::UShr32(FPURightShiftImm::maybe_from_u8(32, 32).unwrap()),
+            rd: writable_vreg(2),
+            rn: vreg(5),
+        },
+        "A204202F",
+        "ushr v2.2s, v5.2s, #32",
+    ));
+
+    insns.push((
+        Inst::FpuRRI {
+            fpu_op: FPUOpRI::UShr64(FPURightShiftImm::maybe_from_u8(63, 64).unwrap()),
+            rd: writable_vreg(2),
+            rn: vreg(5),
+        },
+        "A204417F",
+        "ushr d2, d5, #63",
+    ));
+
+    insns.push((
+        Inst::FpuRRI {
+            fpu_op: FPUOpRI::Sli32(FPULeftShiftImm::maybe_from_u8(31, 32).unwrap()),
+            rd: writable_vreg(4),
+            rn: vreg(10),
+        },
+        "44553F2F",
+        "sli v4.2s, v10.2s, #31",
+    ));
+
+    insns.push((
+        Inst::FpuRRI {
+            fpu_op: FPUOpRI::Sli64(FPULeftShiftImm::maybe_from_u8(63, 64).unwrap()),
+            rd: writable_vreg(4),
+            rn: vreg(10),
+        },
+        "44557F7F",
+        "sli d4, d10, #63",
+    ));
+
     insns.push((
         Inst::FpuToInt {
             op: FpuToIntOp::F32ToU32,
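[Editor's note] The expected-encoding strings in these tests are the emitted bytes in memory order; AArch64 instructions are little-endian 32-bit words, so each string is the byte-reversal of the value passed to `sink.put4()` in emit.rs above. A standalone sketch (plain Rust, not the Cranelift API) reproducing the `"A204417F"` / `ushr d2, d5, #63` case:

```rust
fn main() {
    // Base opcode bits for the 64-bit USHR arm in emit.rs above.
    let base: u32 = 0b01_1_111110_0000000_00_0_0_0_1_00000_00000;
    // FPURightShiftImm::enc() for amount 63, 64-bit lanes: 2 * 64 - 63 = 65.
    let enc_imm: u32 = 2 * 64 - 63;
    let (rd, rn) = (2u32, 5u32); // hardware encodings of v2 and v5
    let word = base | enc_imm << 16 | rn << 5 | rd;
    assert_eq!(word, 0x7F4104A2);
    // Emitted little-endian, this is the "A204417F" string in the test.
    assert_eq!(word.to_le_bytes(), [0xA2, 0x04, 0x41, 0x7F]);
}
```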
diff --git a/cranelift/codegen/src/isa/aarch64/inst/imms.rs b/cranelift/codegen/src/isa/aarch64/inst/imms.rs
index b8e6bf65bf..7c473a83d2 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/imms.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/imms.rs
@@ -106,6 +106,85 @@ impl SImm7Scaled {
     }
 }
 
+#[derive(Clone, Copy, Debug)]
+pub struct FPULeftShiftImm {
+    pub amount: u8,
+    pub lane_size_in_bits: u8,
+}
+
+impl FPULeftShiftImm {
+    pub fn maybe_from_u8(amount: u8, lane_size_in_bits: u8) -> Option<Self> {
+        debug_assert!(lane_size_in_bits == 32 || lane_size_in_bits == 64);
+        if amount < lane_size_in_bits {
+            Some(Self {
+                amount,
+                lane_size_in_bits,
+            })
+        } else {
+            None
+        }
+    }
+
+    pub fn enc(&self) -> u32 {
+        debug_assert!(self.lane_size_in_bits.is_power_of_two());
+        debug_assert!(self.lane_size_in_bits > self.amount);
+        // The encoding of the immediate follows the table below,
+        // where xs encode the shift amount.
+        //
+        // | lane_size_in_bits | encoding |
+        // +------------------------------+
+        // | 8                 | 0001xxx  |
+        // | 16                | 001xxxx  |
+        // | 32                | 01xxxxx  |
+        // | 64                | 1xxxxxx  |
+        //
+        // The highest set bit is `lane_size_in_bits` itself. Since
+        // `lane_size_in_bits` is a power of two and `amount` is less
+        // than `lane_size_in_bits`, the two can be ORed
+        // together to produce the encoded value.
+        u32::from(self.lane_size_in_bits | self.amount)
+    }
+}
+
+#[derive(Clone, Copy, Debug)]
+pub struct FPURightShiftImm {
+    pub amount: u8,
+    pub lane_size_in_bits: u8,
+}
+
+impl FPURightShiftImm {
+    pub fn maybe_from_u8(amount: u8, lane_size_in_bits: u8) -> Option<Self> {
+        debug_assert!(lane_size_in_bits == 32 || lane_size_in_bits == 64);
+        if amount > 0 && amount <= lane_size_in_bits {
+            Some(Self {
+                amount,
+                lane_size_in_bits,
+            })
+        } else {
+            None
+        }
+    }
+
+    pub fn enc(&self) -> u32 {
+        debug_assert_ne!(0, self.amount);
+        // The encoding of the immediate follows the table below,
+        // where xs encodes the negated shift amount.
+        //
+        // | lane_size_in_bits | encoding |
+        // +------------------------------+
+        // | 8                 | 0001xxx  |
+        // | 16                | 001xxxx  |
+        // | 32                | 01xxxxx  |
+        // | 64                | 1xxxxxx  |
+        //
+        // The shift amount is negated: in the 64-bit case, a shift
+        // amount of 1 is encoded as 0b111111 and a shift amount of
+        // 64 as 0b000000, in the bottom 6 bits.
+        u32::from((self.lane_size_in_bits * 2) - self.amount)
+    }
+}
+
 /// a 9-bit signed offset.
 #[derive(Clone, Copy, Debug)]
 pub struct SImm9 {
@@ -576,6 +655,18 @@ impl ShowWithRRU for SImm7Scaled {
     }
 }
 
+impl ShowWithRRU for FPULeftShiftImm {
+    fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String {
+        format!("#{}", self.amount)
+    }
+}
+
+impl ShowWithRRU for FPURightShiftImm {
+    fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String {
+        format!("#{}", self.amount)
+    }
+}
+
 impl ShowWithRRU for SImm9 {
     fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String {
         format!("#{}", self.value)
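[Editor's note] A quick worked check of the two `enc()` formulas above (plain Rust, standalone, mirroring the immediates used in the emit tests). Left shifts encode `lane_size | amount` into immh:immb; right shifts encode the negated amount, `2 * lane_size - amount`:

```rust
fn main() {
    // sli d, d, #63: 64-bit lane marker (1xxxxxx) with xs = 63.
    assert_eq!(64u32 | 63, 0b111_1111);
    // ushr d, d, #63: bottom six bits hold 64 - 63 = 1.
    assert_eq!(2 * 64 - 63, 0b100_0001);
    // ushr v.2s, v.2s, #32: 32-bit lane marker (01xxxxx) with xs = 0.
    assert_eq!(2 * 32 - 32, 0b010_0000);
}
```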
diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
index fd910522c5..53953efc52 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
@@ -4,7 +4,7 @@
 #![allow(dead_code)]
 
 use crate::binemit::CodeOffset;
-use crate::ir::types::{B1, B16, B32, B64, B8, F32, F64, FFLAGS, I16, I32, I64, I8, IFLAGS};
+use crate::ir::types::{B1, B16, B32, B64, B8, F32, F32X2, F64, FFLAGS, I16, I32, I64, I8, IFLAGS};
 use crate::ir::{ExternalName, Opcode, SourceLoc, TrapCode, Type};
 use crate::machinst::*;
 use crate::{settings, CodegenError, CodegenResult};
@@ -124,6 +124,19 @@ pub enum FPUOp2 {
     Min64,
 }
 
+/// A floating-point unit (FPU) operation with two args, a register and an immediate.
+#[derive(Copy, Clone, Debug)]
+pub enum FPUOpRI {
+    /// Unsigned right shift. Rd = Rn >> #imm
+    UShr32(FPURightShiftImm),
+    /// Unsigned right shift. Rd = Rn >> #imm
+    UShr64(FPURightShiftImm),
+    /// Shift left and insert. Rd |= Rn << #imm
+    Sli32(FPULeftShiftImm),
+    /// Shift left and insert. Rd |= Rn << #imm
+    Sli64(FPULeftShiftImm),
+}
+
 /// A floating-point unit (FPU) operation with three args.
 #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
 pub enum FPUOp3 {
@@ -472,6 +485,12 @@ pub enum Inst {
         rm: Reg,
     },
 
+    FpuRRI {
+        fpu_op: FPUOpRI,
+        rd: Writable<Reg>,
+        rn: Reg,
+    },
+
     /// 3-op FPU instruction.
     FpuRRRR {
         fpu_op: FPUOp3,
@@ -1034,6 +1053,13 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
             collector.add_use(rn);
             collector.add_use(rm);
         }
+        &Inst::FpuRRI { fpu_op, rd, rn, .. } => {
+            match fpu_op {
+                FPUOpRI::UShr32(..) | FPUOpRI::UShr64(..) => collector.add_def(rd),
+                FPUOpRI::Sli32(..) | FPUOpRI::Sli64(..) => collector.add_mod(rd),
+            }
+            collector.add_use(rn);
+        }
         &Inst::FpuRRRR { rd, rn, rm, ra, .. } => {
             collector.add_def(rd);
             collector.add_use(rn);
@@ -1482,6 +1508,14 @@ fn aarch64_map_regs(inst: &mut Inst, mapper: &RegUsageMapper) {
             map_use(mapper, rn);
             map_use(mapper, rm);
         }
+        &mut Inst::FpuRRI {
+            ref mut rd,
+            ref mut rn,
+            ..
+        } => {
+            map_def(mapper, rd);
+            map_use(mapper, rn);
+        }
         &mut Inst::FpuRRRR {
            ref mut rd,
            ref mut rn,
@@ -2236,6 +2270,23 @@ impl ShowWithRRU for Inst {
                 let rm = show_freg_sized(rm, mb_rru, size);
                 format!("{} {}, {}, {}", op, rd, rn, rm)
             }
+            &Inst::FpuRRI { fpu_op, rd, rn } => {
+                let (op, imm, vector) = match fpu_op {
+                    FPUOpRI::UShr32(imm) => ("ushr", imm.show_rru(mb_rru), true),
+                    FPUOpRI::UShr64(imm) => ("ushr", imm.show_rru(mb_rru), false),
+                    FPUOpRI::Sli32(imm) => ("sli", imm.show_rru(mb_rru), true),
+                    FPUOpRI::Sli64(imm) => ("sli", imm.show_rru(mb_rru), false),
+                };
+
+                let show_vreg_fn: fn(Reg, Option<&RealRegUniverse>) -> String = if vector {
+                    |reg, mb_rru| show_vreg_vector(reg, mb_rru, F32X2)
+                } else {
+                    show_vreg_scalar
+                };
+                let rd = show_vreg_fn(rd.to_reg(), mb_rru);
+                let rn = show_vreg_fn(rn, mb_rru);
+                format!("{} {}, {}, {}", op, rd, rn, imm)
+            }
             &Inst::FpuRRRR {
                 fpu_op,
                 rd,
diff --git a/cranelift/codegen/src/isa/aarch64/inst/regs.rs b/cranelift/codegen/src/isa/aarch64/inst/regs.rs
index 099d816266..242fb66fc9 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/regs.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/regs.rs
@@ -1,5 +1,6 @@
 //! AArch64 ISA definitions: registers.
 
+use crate::ir::types::*;
 use crate::isa::aarch64::inst::InstSize;
 use crate::machinst::*;
 use crate::settings;
@@ -307,3 +308,16 @@ pub fn show_vreg_scalar(reg: Reg, mb_rru: Option<&RealRegUniverse>) -> String {
     }
     s
 }
+
+/// Show a vector register.
+pub fn show_vreg_vector(reg: Reg, mb_rru: Option<&RealRegUniverse>, ty: Type) -> String {
+    assert_eq!(RegClass::V128, reg.get_class());
+    let mut s = reg.show_rru(mb_rru);
+
+    match ty {
+        F32X2 => s.push_str(".2s"),
+        _ => unimplemented!(),
+    }
+
+    s
+}
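[Editor's note] The `add_def`/`add_mod` split in `aarch64_get_regs` above is the subtle point here: USHR fully overwrites rd, but SLI only overwrites the bits covered by the shifted source and keeps the rest of rd, so the register allocator must treat rd as read-modify-write. A bit-level sketch of that semantics (plain Rust; `sli64` is a hypothetical helper, not a Cranelift API):

```rust
/// What `sli d_rd, d_rn, #imm` computes on a 64-bit lane.
fn sli64(rd: u64, rn: u64, imm: u32) -> u64 {
    let written = !0u64 << imm; // bit positions written by the shifted rn
    (rd & !written) | (rn << imm)
}

fn main() {
    // The low `imm` bits of rd survive, so rd is a use as well as a def.
    assert_eq!(sli64(0x0000_0000_0000_00FF, 1, 63), 0x8000_0000_0000_00FF);
}
```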
diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
index a92dea7a7b..2946e16471 100644
--- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -1460,54 +1460,38 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
         Opcode::Fcopysign => {
             // Copy the sign bit from inputs[1] to inputs[0]. We use the following sequence:
             //
-            // (64 bits for example, 32-bit sequence is analogous):
+            // This is a scalar Fcopysign: it uses scalar NEON operations
+            // for 64 bits and vector (2S) operations for 32 bits.
             //
-            // MOV Xtmp1, Dinput0
-            // MOV Xtmp2, Dinput1
-            // AND Xtmp2, 0x8000_0000_0000_0000
-            // BIC Xtmp1, 0x8000_0000_0000_0000
-            // ORR Xtmp1, Xtmp1, Xtmp2
-            // MOV Doutput, Xtmp1
+            // mov vd, vn
+            // ushr vtmp, vm, #63 / #31
+            // sli vd, vtmp, #63 / #31
 
             let ty = ctx.output_ty(insn, 0);
-            let bits = ty_bits(ty);
+            let bits = ty_bits(ty) as u8;
             assert!(bits == 32 || bits == 64);
             let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None);
             let rm = input_to_reg(ctx, inputs[1], NarrowValueMode::None);
             let rd = output_to_reg(ctx, outputs[0]);
-            let tmp1 = ctx.alloc_tmp(RegClass::I64, I64);
-            let tmp2 = ctx.alloc_tmp(RegClass::I64, I64);
-            ctx.emit(Inst::MovFromVec64 { rd: tmp1, rn: rn });
-            ctx.emit(Inst::MovFromVec64 { rd: tmp2, rn: rm });
-            let imml = if bits == 32 {
-                ImmLogic::maybe_from_u64(0x8000_0000, I32).unwrap()
-            } else {
-                ImmLogic::maybe_from_u64(0x8000_0000_0000_0000, I64).unwrap()
-            };
-            let alu_op = choose_32_64(ty, ALUOp::And32, ALUOp::And64);
-            ctx.emit(Inst::AluRRImmLogic {
-                alu_op,
-                rd: tmp2,
-                rn: tmp2.to_reg(),
-                imml: imml.clone(),
+            let tmp = ctx.alloc_tmp(RegClass::V128, F64);
+
+            // Copy LHS to rd.
+            ctx.emit(Inst::FpuMove64 { rd, rn });
+
+            // Copy the sign bit to the lowest bit in tmp.
+            let imm = FPURightShiftImm::maybe_from_u8(bits - 1, bits).unwrap();
+            ctx.emit(Inst::FpuRRI {
+                fpu_op: choose_32_64(ty, FPUOpRI::UShr32(imm), FPUOpRI::UShr64(imm)),
+                rd: tmp,
+                rn: rm,
             });
-            let alu_op = choose_32_64(ty, ALUOp::AndNot32, ALUOp::AndNot64);
-            ctx.emit(Inst::AluRRImmLogic {
-                alu_op,
-                rd: tmp1,
-                rn: tmp1.to_reg(),
-                imml,
-            });
-            let alu_op = choose_32_64(ty, ALUOp::Orr32, ALUOp::Orr64);
-            ctx.emit(Inst::AluRRR {
-                alu_op,
-                rd: tmp1,
-                rn: tmp1.to_reg(),
-                rm: tmp2.to_reg(),
-            });
-            ctx.emit(Inst::MovToVec64 {
+
+            // Insert the bit from tmp into the sign bit of rd.
+            let imm = FPULeftShiftImm::maybe_from_u8(bits - 1, bits).unwrap();
+            ctx.emit(Inst::FpuRRI {
+                fpu_op: choose_32_64(ty, FPUOpRI::Sli32(imm), FPUOpRI::Sli64(imm)),
                 rd,
-                rn: tmp1.to_reg(),
+                rn: tmp.to_reg(),
             });
         }
diff --git a/cranelift/filetests/filetests/vcode/aarch64/floating-point.clif b/cranelift/filetests/filetests/vcode/aarch64/floating-point.clif
index b5d0c3768b..6b991026ae 100644
--- a/cranelift/filetests/filetests/vcode/aarch64/floating-point.clif
+++ b/cranelift/filetests/filetests/vcode/aarch64/floating-point.clif
@@ -397,12 +397,8 @@ block0(v0: f32, v1: f32):
 
 ; check: stp fp, lr, [sp, #-16]!
 ; nextln: mov fp, sp
-; nextln: mov x0, v0.d[0]
-; nextln: mov x1, v1.d[0]
-; nextln: and w1, w1, #2147483648
-; nextln: bic w0, w0, #2147483648
-; nextln: orr w0, w0, w1
-; nextln: mov v0.d[0], x0
+; nextln: ushr v1.2s, v1.2s, #31
+; nextln: sli v0.2s, v1.2s, #31
 ; nextln: mov sp, fp
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
@@ -415,12 +415,8 @@ block0(v0: f64, v1: f64):
 
 ; check: stp fp, lr, [sp, #-16]!
 ; nextln: mov fp, sp
-; nextln: mov x0, v0.d[0]
-; nextln: mov x1, v1.d[0]
-; nextln: and x1, x1, #9223372036854775808
-; nextln: bic x0, x0, #9223372036854775808
-; nextln: orr x0, x0, x1
-; nextln: mov v0.d[0], x0
+; nextln: ushr d1, d1, #63
+; nextln: sli d0, d1, #63
 ; nextln: mov sp, fp
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
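[Editor's note] At the bit level, the new three-instruction sequence (`mov` / `ushr` / `sli`) computes exactly copysign: the right shift moves the sign of the second operand into bit 0, and the shift-left-and-insert writes it back into the sign position of the result while preserving the magnitude bits. A standalone sketch of the f64 case (plain Rust; `fcopysign_f64` is a hypothetical helper):

```rust
/// Models: mov vd, vn; ushr vtmp, vm, #63; sli vd, vtmp, #63.
fn fcopysign_f64(vn: f64, vm: f64) -> f64 {
    let rd = vn.to_bits(); // mov vd, vn
    let tmp = vm.to_bits() >> 63; // ushr: sign bit of vm lands in bit 0
    let out = (rd & !(1u64 << 63)) | (tmp << 63); // sli: insert into bit 63
    f64::from_bits(out)
}

fn main() {
    assert_eq!(fcopysign_f64(1.5, -2.0), -1.5);
    assert_eq!(fcopysign_f64(-1.5, 2.0), 1.5);
}
```

The f32 lowering is the same shape with #31 on the .2s arrangement; only lane 0 of the vector carries the scalar result.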