Add SSE4.1 lowering for rounding on x64

Johnnie Birch
2021-01-25 00:02:57 -08:00
parent d1c1cb6a25
commit cbd7a6a80e
3 changed files with 36 additions and 29 deletions


@@ -2094,7 +2094,9 @@ pub(crate) fn emit(
                 SseOpcode::Pextrd => (LegacyPrefixes::_66, 0x0F3A16, 3),
                 SseOpcode::Pshufd => (LegacyPrefixes::_66, 0x0F70, 2),
                 SseOpcode::Roundps => (LegacyPrefixes::_66, 0x0F3A08, 3),
+                SseOpcode::Roundss => (LegacyPrefixes::_66, 0x0F3A0A, 3),
                 SseOpcode::Roundpd => (LegacyPrefixes::_66, 0x0F3A09, 3),
+                SseOpcode::Roundsd => (LegacyPrefixes::_66, 0x0F3A0B, 3),
                 _ => unimplemented!("Opcode {:?} not implemented", op),
             };
             let rex = if *is64 {
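For reference, each tuple above gives the mandatory legacy prefix, the opcode bytes, and the opcode length in bytes: ROUNDSS (66 0F 3A 0A) and ROUNDSD (66 0F 3A 0B) sit in the 0F 3A map next to ROUNDPS/ROUNDPD and take a trailing imm8. A minimal standalone sketch of the byte sequence these entries describe (not Cranelift's actual emitter; it assumes register-direct operands and omits REX handling):

    // Encode `roundss xmm<dst>, xmm<src>, imm8` for xmm0..xmm7 only.
    fn encode_roundss(dst: u8, src: u8, imm: u8) -> Vec<u8> {
        assert!(dst < 8 && src < 8, "sketch omits REX, so xmm8..xmm15 are out of scope");
        let modrm = 0xC0 | (dst << 3) | src; // mod=11: register-direct operands
        vec![
            0x66, // mandatory prefix (LegacyPrefixes::_66 above)
            0x0F, 0x3A, 0x0A, // three opcode bytes, hence the length 3 in the table
            modrm,
            imm, // rounding-control immediate
        ]
    }

For example, encode_roundss(0, 1, 3) yields 66 0F 3A 0A C1 03, i.e. `roundss xmm0, xmm1, 3` (round toward zero).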


@@ -1823,7 +1823,7 @@ impl fmt::Debug for Inst {
 fn x64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
     // This is a bit subtle. If some register is in the modified set, then it may not be in either
     // the use or def sets. However, enforcing that directly is somewhat difficult. Instead,
-    // regalloc.rs will "fix" this for us by removing the the modified set from the use and def
+    // regalloc.rs will "fix" this for us by removing the modified set from the use and def
     // sets.
     match inst {
         Inst::AluRmiR { src, dst, .. } => {
@@ -1895,6 +1895,10 @@ fn x64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
                 || *op == SseOpcode::Pextrw
                 || *op == SseOpcode::Pextrd
                 || *op == SseOpcode::Pshufd
+                || *op == SseOpcode::Roundss
+                || *op == SseOpcode::Roundsd
+                || *op == SseOpcode::Roundps
+                || *op == SseOpcode::Roundpd
             {
                 src.get_regs_as_uses(collector);
                 collector.add_def(*dst);
@@ -2236,6 +2240,10 @@ fn x64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
                 || *op == SseOpcode::Pextrw
                 || *op == SseOpcode::Pextrd
                 || *op == SseOpcode::Pshufd
+                || *op == SseOpcode::Roundss
+                || *op == SseOpcode::Roundsd
+                || *op == SseOpcode::Roundps
+                || *op == SseOpcode::Roundpd
             {
                 src.map_uses(mapper);
                 map_def(mapper, dst);
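Some context on why the new opcodes join this list: most two-operand SSE instructions read and write their destination, so `dst` must be treated as modified, while the extract/shuffle/round forms handled here fully overwrite `dst`, which lets it be recorded as a pure def (and `src` as a plain use). A toy model of the distinction, with a hypothetical type standing in for regalloc.rs's collector (the real one exposes analogous add_use/add_def/add_mod entry points):

    #[derive(Default)]
    struct ToyCollector {
        uses: Vec<u8>, // registers read
        defs: Vec<u8>, // registers fully written
        mods: Vec<u8>, // registers read and written
    }

    impl ToyCollector {
        // e.g. `paddd dst, src`: dst is also an input, so it is a "mod".
        fn rmw_op(&mut self, src: u8, dst: u8) {
            self.uses.push(src);
            self.mods.push(dst);
        }
        // e.g. `roundss dst, src, imm`: dst is fully overwritten, so it is a
        // "def", which gives the register allocator more freedom.
        fn full_write_op(&mut self, src: u8, dst: u8) {
            self.uses.push(src);
            self.defs.push(dst);
        }
    }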


@@ -8,7 +8,7 @@ use crate::ir::{
 use crate::isa::x64::abi::*;
 use crate::isa::x64::inst::args::*;
 use crate::isa::x64::inst::*;
-use crate::isa::{x64::X64Backend, CallConv};
+use crate::isa::{x64::settings as x64_settings, x64::X64Backend, CallConv};
 use crate::machinst::lower::*;
 use crate::machinst::*;
 use crate::result::CodegenResult;
@@ -1330,6 +1330,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
     ctx: &mut C,
     insn: IRInst,
     flags: &Flags,
+    isa_flags: &x64_settings::Flags,
     triple: &Triple,
 ) -> CodegenResult<()> {
     let op = ctx.data(insn).opcode();
@@ -4211,11 +4212,29 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             }
             Opcode::Ceil | Opcode::Floor | Opcode::Nearest | Opcode::Trunc => {
-                // TODO use ROUNDSS/ROUNDSD after sse4.1.
-                // Lower to VM calls when there's no access to SSE4.1.
                 let ty = ty.unwrap();
-                if !ty.is_vector() {
+                if isa_flags.use_sse41() {
+                    let mode = match op {
+                        Opcode::Ceil => RoundImm::RoundUp,
+                        Opcode::Floor => RoundImm::RoundDown,
+                        Opcode::Nearest => RoundImm::RoundNearest,
+                        Opcode::Trunc => RoundImm::RoundZero,
+                        _ => panic!("unexpected opcode {:?} in Ceil/Floor/Nearest/Trunc", op),
+                    };
+                    let op = match ty {
+                        types::F32 => SseOpcode::Roundss,
+                        types::F64 => SseOpcode::Roundsd,
+                        types::F32X4 => SseOpcode::Roundps,
+                        types::F64X2 => SseOpcode::Roundpd,
+                        _ => panic!("unexpected type {:?} in Ceil/Floor/Nearest/Trunc", ty),
+                    };
+                    let src = input_to_reg_mem(ctx, inputs[0]);
+                    let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
+                    ctx.emit(Inst::xmm_rm_r_imm(op, src, dst, mode.encode(), false));
+                } else {
+                    // Lower to VM calls when there's no access to SSE4.1. Note:
+                    // for vector types on platforms without SSE4.1, execution
+                    // will panic here.
                     let libcall = match (op, ty) {
                         (Opcode::Ceil, types::F32) => LibCall::CeilF32,
                         (Opcode::Ceil, types::F64) => LibCall::CeilF64,
@@ -4231,28 +4250,6 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                     ),
                 };
                 emit_vm_call(ctx, flags, triple, libcall, insn, inputs, outputs)?;
-                } else {
-                    let (op, mode) = match (op, ty) {
-                        (Opcode::Ceil, types::F32X4) => (SseOpcode::Roundps, RoundImm::RoundUp),
-                        (Opcode::Ceil, types::F64X2) => (SseOpcode::Roundpd, RoundImm::RoundUp),
-                        (Opcode::Floor, types::F32X4) => (SseOpcode::Roundps, RoundImm::RoundDown),
-                        (Opcode::Floor, types::F64X2) => (SseOpcode::Roundpd, RoundImm::RoundDown),
-                        (Opcode::Trunc, types::F32X4) => (SseOpcode::Roundps, RoundImm::RoundZero),
-                        (Opcode::Trunc, types::F64X2) => (SseOpcode::Roundpd, RoundImm::RoundZero),
-                        (Opcode::Nearest, types::F32X4) => (SseOpcode::Roundps, RoundImm::RoundNearest),
-                        (Opcode::Nearest, types::F64X2) => (SseOpcode::Roundpd, RoundImm::RoundNearest),
-                        _ => panic!("Unknown op/ty combination (vector){:?}", ty),
-                    };
-                    let src = put_input_in_reg(ctx, inputs[0]);
-                    let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-                    ctx.emit(Inst::gen_move(dst, src, ty));
-                    ctx.emit(Inst::xmm_rm_r_imm(
-                        op,
-                        RegMem::from(dst),
-                        dst,
-                        mode.encode(),
-                        false,
-                    ));
                 }
             }
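The `mode.encode()` value above feeds straight into the instruction's imm8. `RoundImm` is defined elsewhere in the x64 backend; a plausible sketch of it, assuming the discriminants mirror the hardware rounding-control field (bit 2, if set, would defer to MXCSR.RC instead, and stays clear here):

    // Assumed definition of the rounding-control immediate; treat the exact
    // shape as an illustration, not the backend's verbatim code.
    #[derive(Clone, Copy, Debug)]
    enum RoundImm {
        RoundNearest = 0x00, // round to nearest, ties to even
        RoundDown = 0x01,    // toward -infinity (floor)
        RoundUp = 0x02,      // toward +infinity (ceil)
        RoundZero = 0x03,    // toward zero (trunc)
    }

    impl RoundImm {
        fn encode(self) -> u8 {
            self as u8
        }
    }

Note how the rewrite collapses the old scalar/vector split into one path: the SSE opcode now depends only on the type, and the immediate only on the CLIF opcode.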
@@ -5389,7 +5386,7 @@ impl LowerBackend for X64Backend {
     type MInst = Inst;

     fn lower<C: LowerCtx<I = Inst>>(&self, ctx: &mut C, ir_inst: IRInst) -> CodegenResult<()> {
-        lower_insn_to_regs(ctx, ir_inst, &self.flags, &self.triple)
+        lower_insn_to_regs(ctx, ir_inst, &self.flags, &self.x64_flags, &self.triple)
     }

     fn lower_branch_group<C: LowerCtx<I = Inst>>(