arm64: Use FPU instructions for Fcopysign

Copyright (c) 2020, Arm Limited.
Joey Gouly
2020-05-21 18:14:12 +01:00
parent 5c39b74eb8
commit 02c3f238f8
7 changed files with 264 additions and 54 deletions

View File

@@ -85,12 +85,12 @@ pub fn u64_constant(bits: u64) -> ConstantData {
// Instructions and subcomponents: emission
fn machreg_to_gpr(m: Reg) -> u32 {
assert!(m.get_class() == RegClass::I64);
assert_eq!(m.get_class(), RegClass::I64);
u32::try_from(m.to_real_reg().get_hw_encoding()).unwrap()
}
fn machreg_to_vec(m: Reg) -> u32 {
assert!(m.get_class() == RegClass::V128);
assert_eq!(m.get_class(), RegClass::V128);
u32::try_from(m.to_real_reg().get_hw_encoding()).unwrap()
}
@@ -948,6 +948,44 @@ impl MachInstEmit for Inst {
};
sink.put4(enc_fpurrr(top22, rd, rn, rm));
}
&Inst::FpuRRI { fpu_op, rd, rn } => match fpu_op {
FPUOpRI::UShr32(imm) => {
debug_assert_eq!(32, imm.lane_size_in_bits);
sink.put4(
0b0_0_1_011110_0000000_00_0_0_0_1_00000_00000
| imm.enc() << 16
| machreg_to_vec(rn) << 5
| machreg_to_vec(rd.to_reg()),
)
}
FPUOpRI::UShr64(imm) => {
debug_assert_eq!(64, imm.lane_size_in_bits);
sink.put4(
0b01_1_111110_0000000_00_0_0_0_1_00000_00000
| imm.enc() << 16
| machreg_to_vec(rn) << 5
| machreg_to_vec(rd.to_reg()),
)
}
FPUOpRI::Sli64(imm) => {
debug_assert_eq!(64, imm.lane_size_in_bits);
sink.put4(
0b01_1_111110_0000000_010101_00000_00000
| imm.enc() << 16
| machreg_to_vec(rn) << 5
| machreg_to_vec(rd.to_reg()),
)
}
FPUOpRI::Sli32(imm) => {
debug_assert_eq!(32, imm.lane_size_in_bits);
sink.put4(
0b0_0_1_011110_0000000_010101_00000_00000
| imm.enc() << 16
| machreg_to_vec(rn) << 5
| machreg_to_vec(rd.to_reg()),
)
}
},
&Inst::FpuRRRR {
fpu_op,
rd,

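Each of the new FpuRRI emission arms above follows the same shape: a fixed 32-bit opcode pattern ORed with the encoded immediate shifted to bit 16, the source register number at bit 5, and the destination register number in the low bits. As a sanity check, here is a minimal standalone sketch (plain integers rather than Cranelift's Reg and FPURightShiftImm types; the function is illustrative only) that reassembles the word for ushr d2, d5, #63, which the binemit tests in the next file expect as the byte string "A204417F":

fn main() {
    // Fixed pattern for the 64-bit scalar USHR form, copied from the emitter above.
    let base: u32 = 0b01_1_111110_0000000_00_0_0_0_1_00000_00000;
    // FPURightShiftImm::enc() for amount 63 in a 64-bit lane: 64 * 2 - 63.
    let imm_enc: u32 = 64 * 2 - 63;
    let rn: u32 = 5; // d5
    let rd: u32 = 2; // d2
    let word = base | (imm_enc << 16) | (rn << 5) | rd;
    assert_eq!(word, 0x7F41_04A2);
    // Emitted little-endian, this is the "A204417F" byte string used by the tests.
    assert_eq!(word.to_le_bytes(), [0xA2, 0x04, 0x41, 0x7F]);
}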
View File

@@ -2400,6 +2400,46 @@ fn test_aarch64_binemit() {
"fmadd d15, d30, d31, d1",
));
insns.push((
Inst::FpuRRI {
fpu_op: FPUOpRI::UShr32(FPURightShiftImm::maybe_from_u8(32, 32).unwrap()),
rd: writable_vreg(2),
rn: vreg(5),
},
"A204202F",
"ushr v2.2s, v5.2s, #32",
));
insns.push((
Inst::FpuRRI {
fpu_op: FPUOpRI::UShr64(FPURightShiftImm::maybe_from_u8(63, 64).unwrap()),
rd: writable_vreg(2),
rn: vreg(5),
},
"A204417F",
"ushr d2, d5, #63",
));
insns.push((
Inst::FpuRRI {
fpu_op: FPUOpRI::Sli32(FPULeftShiftImm::maybe_from_u8(31, 32).unwrap()),
rd: writable_vreg(4),
rn: vreg(10),
},
"44553F2F",
"sli v4.2s, v10.2s, #31",
));
insns.push((
Inst::FpuRRI {
fpu_op: FPUOpRI::Sli64(FPULeftShiftImm::maybe_from_u8(63, 64).unwrap()),
rd: writable_vreg(4),
rn: vreg(10),
},
"44557F7F",
"sli d4, d10, #63",
));
insns.push((
Inst::FpuToInt {
op: FpuToIntOp::F32ToU32,

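Each test entry above pairs an Inst with the expected machine code (a little-endian hex byte string) and the expected disassembly text. Purely for illustration, here is a standalone sketch (not part of the test suite) that decodes one of the new entries back into its fields:

fn main() {
    // "44553F2F" is expected for: sli v4.2s, v10.2s, #31
    let word = u32::from_le_bytes([0x44, 0x55, 0x3F, 0x2F]);
    assert_eq!(word, 0x2F3F_5544);
    assert_eq!(word & 0x1F, 4);               // rd = v4
    assert_eq!((word >> 5) & 0x1F, 10);       // rn = v10
    assert_eq!((word >> 16) & 0x7F, 32 | 31); // FPULeftShiftImm::enc(): lane size | amount
}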
View File

@@ -106,6 +106,85 @@ impl SImm7Scaled {
}
}
#[derive(Clone, Copy, Debug)]
pub struct FPULeftShiftImm {
pub amount: u8,
pub lane_size_in_bits: u8,
}
impl FPULeftShiftImm {
pub fn maybe_from_u8(amount: u8, lane_size_in_bits: u8) -> Option<Self> {
debug_assert!(lane_size_in_bits == 32 || lane_size_in_bits == 64);
if amount < lane_size_in_bits {
Some(Self {
amount,
lane_size_in_bits,
})
} else {
None
}
}
pub fn enc(&self) -> u32 {
debug_assert!(self.lane_size_in_bits.is_power_of_two());
debug_assert!(self.lane_size_in_bits > self.amount);
// The encoding of the immediate follows the table below,
// where xs encode the shift amount.
//
// | lane_size_in_bits | encoding |
// +------------------------------+
// | 8 | 0001xxx |
// | 16 | 001xxxx |
// | 32 | 01xxxxx |
// | 64 | 1xxxxxx |
//
// The highest one bit is represented by `lane_size_in_bits`. Since
// `lane_size_in_bits` is a power of 2 and `amount` is less
// than `lane_size_in_bits`, they can be ORed
// together to produce the encoded value.
u32::from(self.lane_size_in_bits | self.amount)
}
}
#[derive(Clone, Copy, Debug)]
pub struct FPURightShiftImm {
pub amount: u8,
pub lane_size_in_bits: u8,
}
impl FPURightShiftImm {
pub fn maybe_from_u8(amount: u8, lane_size_in_bits: u8) -> Option<Self> {
debug_assert!(lane_size_in_bits == 32 || lane_size_in_bits == 64);
if amount > 0 && amount <= lane_size_in_bits {
Some(Self {
amount,
lane_size_in_bits,
})
} else {
None
}
}
pub fn enc(&self) -> u32 {
debug_assert_ne!(0, self.amount);
// The encoding of the immediate follows the table below,
// where xs encode the negated shift amount.
//
// | lane_size_in_bits | encoding |
// +------------------------------+
// | 8 | 0001xxx |
// | 16 | 001xxxx |
// | 32 | 01xxxxx |
// | 64 | 1xxxxxx |
//
// The shift amount is stored negated, so that (for 64-bit lanes)
// a shift amount of 1 is encoded as 0b111111 and a shift
// amount of 64 is encoded as 0b000000 in the bottom 6 bits.
u32::from((self.lane_size_in_bits * 2) - self.amount)
}
}
/// a 9-bit signed offset.
#[derive(Clone, Copy, Debug)]
pub struct SImm9 {
@@ -576,6 +655,18 @@ impl ShowWithRRU for SImm7Scaled {
}
}
impl ShowWithRRU for FPULeftShiftImm {
fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String {
format!("#{}", self.amount)
}
}
impl ShowWithRRU for FPURightShiftImm {
fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String {
format!("#{}", self.amount)
}
}
impl ShowWithRRU for SImm9 {
fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String {
format!("#{}", self.value)

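The two enc() methods above reduce to small arithmetic identities: the left-shift immediate ORs the lane size (the marker bit) with the shift amount, while the right-shift immediate stores the amount negated below the same marker bit. A standalone sketch with the values used by the binemit tests (free functions here; the real logic lives in FPULeftShiftImm::enc() and FPURightShiftImm::enc()):

fn left_shift_enc(lane_size_in_bits: u8, amount: u8) -> u32 {
    // Highest set bit marks the lane size; the shift amount fills the low bits.
    u32::from(lane_size_in_bits | amount)
}

fn right_shift_enc(lane_size_in_bits: u8, amount: u8) -> u32 {
    // Same lane-size marker bit, but the shift amount is stored negated.
    u32::from(lane_size_in_bits * 2 - amount)
}

fn main() {
    assert_eq!(left_shift_enc(32, 31), 0b011_1111);  // sli v4.2s, v10.2s, #31
    assert_eq!(left_shift_enc(64, 63), 0b111_1111);  // sli d4, d10, #63
    assert_eq!(right_shift_enc(32, 32), 0b010_0000); // ushr v2.2s, v5.2s, #32
    assert_eq!(right_shift_enc(64, 63), 0b100_0001); // ushr d2, d5, #63
}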
View File

@@ -4,7 +4,7 @@
#![allow(dead_code)]
use crate::binemit::CodeOffset;
use crate::ir::types::{B1, B16, B32, B64, B8, F32, F64, FFLAGS, I16, I32, I64, I8, IFLAGS};
use crate::ir::types::{B1, B16, B32, B64, B8, F32, F32X2, F64, FFLAGS, I16, I32, I64, I8, IFLAGS};
use crate::ir::{ExternalName, Opcode, SourceLoc, TrapCode, Type};
use crate::machinst::*;
use crate::{settings, CodegenError, CodegenResult};
@@ -124,6 +124,19 @@ pub enum FPUOp2 {
Min64,
}
/// A floating-point unit (FPU) operation with two args, a register and an immediate.
#[derive(Copy, Clone, Debug)]
pub enum FPUOpRI {
/// Unsigned right shift. Rd = Rn >> #imm
UShr32(FPURightShiftImm),
/// Unsigned right shift. Rd = Rn >> #imm
UShr64(FPURightShiftImm),
/// Shift left and insert. Inserts Rn << #imm into Rd, preserving the low #imm bits of Rd.
Sli32(FPULeftShiftImm),
/// Shift left and insert. Inserts Rn << #imm into Rd, preserving the low #imm bits of Rd.
Sli64(FPULeftShiftImm),
}
/// A floating-point unit (FPU) operation with three args.
#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
pub enum FPUOp3 {
@@ -472,6 +485,12 @@ pub enum Inst {
rm: Reg,
},
FpuRRI {
fpu_op: FPUOpRI,
rd: Writable<Reg>,
rn: Reg,
},
/// 3-op FPU instruction.
FpuRRRR {
fpu_op: FPUOp3,
@@ -1034,6 +1053,13 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
collector.add_use(rn);
collector.add_use(rm);
}
&Inst::FpuRRI { fpu_op, rd, rn, .. } => {
match fpu_op {
FPUOpRI::UShr32(..) | FPUOpRI::UShr64(..) => collector.add_def(rd),
FPUOpRI::Sli32(..) | FPUOpRI::Sli64(..) => collector.add_mod(rd),
}
collector.add_use(rn);
}
&Inst::FpuRRRR { rd, rn, rm, ra, .. } => {
collector.add_def(rd);
collector.add_use(rn);
@@ -1482,6 +1508,14 @@ fn aarch64_map_regs(inst: &mut Inst, mapper: &RegUsageMapper) {
map_use(mapper, rn);
map_use(mapper, rm);
}
&mut Inst::FpuRRI {
ref mut rd,
ref mut rn,
..
} => {
map_def(mapper, rd);
map_use(mapper, rn);
}
&mut Inst::FpuRRRR {
ref mut rd,
ref mut rn,
@@ -2236,6 +2270,23 @@ impl ShowWithRRU for Inst {
let rm = show_freg_sized(rm, mb_rru, size);
format!("{} {}, {}, {}", op, rd, rn, rm)
}
&Inst::FpuRRI { fpu_op, rd, rn } => {
let (op, imm, vector) = match fpu_op {
FPUOpRI::UShr32(imm) => ("ushr", imm.show_rru(mb_rru), true),
FPUOpRI::UShr64(imm) => ("ushr", imm.show_rru(mb_rru), false),
FPUOpRI::Sli32(imm) => ("sli", imm.show_rru(mb_rru), true),
FPUOpRI::Sli64(imm) => ("sli", imm.show_rru(mb_rru), false),
};
let show_vreg_fn: fn(Reg, Option<&RealRegUniverse>) -> String = if vector {
|reg, mb_rru| show_vreg_vector(reg, mb_rru, F32X2)
} else {
show_vreg_scalar
};
let rd = show_vreg_fn(rd.to_reg(), mb_rru);
let rn = show_vreg_fn(rn, mb_rru);
format!("{} {}, {}, {}", op, rd, rn, imm)
}
&Inst::FpuRRRR {
fpu_op,
rd,

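The register-collection arm above treats the two shapes differently: ushr produces a fresh value in rd (add_def), while sli merges new bits into rd's existing value, so rd must be treated as read-and-written (add_mod). A minimal sketch of that difference on plain 64-bit values (the helper names are illustrative, not Cranelift APIs):

fn ushr64(rn: u64, shift: u32) -> u64 {
    rn >> shift // the old rd value is irrelevant: a pure def
}

fn sli64(rd_old: u64, rn: u64, shift: u32) -> u64 {
    let keep_mask = (1u64 << shift) - 1;  // low `shift` bits survive from rd
    (rd_old & keep_mask) | (rn << shift)  // a modify: reads rd_old
}

fn main() {
    // With shift = 63, SLI replaces only the top bit of rd_old.
    assert_eq!(sli64(0x0123_4567_89AB_CDEF, 1, 63), 0x8123_4567_89AB_CDEF);
    assert_eq!(ushr64(0x8000_0000_0000_0000, 63), 1);
}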
View File

@@ -1,5 +1,6 @@
//! AArch64 ISA definitions: registers.
use crate::ir::types::*;
use crate::isa::aarch64::inst::InstSize;
use crate::machinst::*;
use crate::settings;
@@ -307,3 +308,16 @@ pub fn show_vreg_scalar(reg: Reg, mb_rru: Option<&RealRegUniverse>) -> String {
}
s
}
/// Show a vector register.
pub fn show_vreg_vector(reg: Reg, mb_rru: Option<&RealRegUniverse>, ty: Type) -> String {
assert_eq!(RegClass::V128, reg.get_class());
let mut s = reg.show_rru(mb_rru);
match ty {
F32X2 => s.push_str(".2s"),
_ => unimplemented!(),
}
s
}

View File

@@ -1460,54 +1460,38 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
Opcode::Fcopysign => {
// Copy the sign bit from inputs[1] to inputs[0]. We use the following sequence:
//
// (64 bits for example, 32-bit sequence is analogous):
// This is a scalar Fcopysign.
// This uses scalar NEON operations for 64-bit and vector operations (2S) for 32-bit.
//
// MOV Xtmp1, Dinput0
// MOV Xtmp2, Dinput1
// AND Xtmp2, 0x8000_0000_0000_0000
// BIC Xtmp1, 0x8000_0000_0000_0000
// ORR Xtmp1, Xtmp1, Xtmp2
// MOV Doutput, Xtmp1
// mov vd, vn
// ushr vtmp, vm, #63 / #31
// sli vd, vtmp, #63 / #31
let ty = ctx.output_ty(insn, 0);
let bits = ty_bits(ty);
let bits = ty_bits(ty) as u8;
assert!(bits == 32 || bits == 64);
let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None);
let rm = input_to_reg(ctx, inputs[1], NarrowValueMode::None);
let rd = output_to_reg(ctx, outputs[0]);
let tmp1 = ctx.alloc_tmp(RegClass::I64, I64);
let tmp2 = ctx.alloc_tmp(RegClass::I64, I64);
ctx.emit(Inst::MovFromVec64 { rd: tmp1, rn: rn });
ctx.emit(Inst::MovFromVec64 { rd: tmp2, rn: rm });
let imml = if bits == 32 {
ImmLogic::maybe_from_u64(0x8000_0000, I32).unwrap()
} else {
ImmLogic::maybe_from_u64(0x8000_0000_0000_0000, I64).unwrap()
};
let alu_op = choose_32_64(ty, ALUOp::And32, ALUOp::And64);
ctx.emit(Inst::AluRRImmLogic {
alu_op,
rd: tmp2,
rn: tmp2.to_reg(),
imml: imml.clone(),
let tmp = ctx.alloc_tmp(RegClass::V128, F64);
// Copy LHS to rd.
ctx.emit(Inst::FpuMove64 { rd, rn });
// Copy the sign bit to the lowest bit in tmp.
let imm = FPURightShiftImm::maybe_from_u8(bits - 1, bits).unwrap();
ctx.emit(Inst::FpuRRI {
fpu_op: choose_32_64(ty, FPUOpRI::UShr32(imm), FPUOpRI::UShr64(imm)),
rd: tmp,
rn: rm,
});
let alu_op = choose_32_64(ty, ALUOp::AndNot32, ALUOp::AndNot64);
ctx.emit(Inst::AluRRImmLogic {
alu_op,
rd: tmp1,
rn: tmp1.to_reg(),
imml,
});
let alu_op = choose_32_64(ty, ALUOp::Orr32, ALUOp::Orr64);
ctx.emit(Inst::AluRRR {
alu_op,
rd: tmp1,
rn: tmp1.to_reg(),
rm: tmp2.to_reg(),
});
ctx.emit(Inst::MovToVec64 {
// Insert the bit from tmp into the sign bit of rd.
let imm = FPULeftShiftImm::maybe_from_u8(bits - 1, bits).unwrap();
ctx.emit(Inst::FpuRRI {
fpu_op: choose_32_64(ty, FPUOpRI::Sli32(imm), FPUOpRI::Sli64(imm)),
rd,
rn: tmp1.to_reg(),
rn: tmp.to_reg(),
});
}

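The replaced integer-domain sequence (moves to GPRs, AND/BIC/ORR, move back) becomes a short bit insertion entirely in FPU registers: mov vd, vn; ushr vtmp, vm, #imm; sli vd, vtmp, #imm, with #imm = 63 for f64 and 31 for f32. In the filetests below the initial move does not appear, presumably because it becomes an identity move and is dropped. A standalone sketch of the 64-bit case modelled on plain bits (a model of the sequence, not the lowering code itself):

fn fcopysign_f64(n: f64, m: f64) -> f64 {
    let rn = n.to_bits();
    let rm = m.to_bits();
    let rd = rn;                                      // mov vd, vn
    let tmp = rm >> 63;                               // ushr vtmp, vm, #63: sign bit -> bit 0
    let rd = (rd & ((1u64 << 63) - 1)) | (tmp << 63); // sli vd, vtmp, #63: insert as sign bit
    f64::from_bits(rd)
}

fn main() {
    assert_eq!(fcopysign_f64(1.5, -0.0), -1.5);
    assert_eq!(fcopysign_f64(-2.0, 3.0), 2.0);
    assert_eq!(fcopysign_f64(0.0, -1.0).to_bits(), (-0.0_f64).to_bits());
}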
View File

@@ -397,12 +397,8 @@ block0(v0: f32, v1: f32):
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: mov x0, v0.d[0]
; nextln: mov x1, v1.d[0]
; nextln: and w1, w1, #2147483648
; nextln: bic w0, w0, #2147483648
; nextln: orr w0, w0, w1
; nextln: mov v0.d[0], x0
; nextln: ushr v1.2s, v1.2s, #31
; nextln: sli v0.2s, v1.2s, #31
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
@@ -415,12 +411,8 @@ block0(v0: f64, v1: f64):
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: mov x0, v0.d[0]
; nextln: mov x1, v1.d[0]
; nextln: and x1, x1, #9223372036854775808
; nextln: bic x0, x0, #9223372036854775808
; nextln: orr x0, x0, x1
; nextln: mov v0.d[0], x0
; nextln: ushr d1, d1, #63
; nextln: sli d0, d1, #63
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret