diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs
index 095256ab49..0a029301a6 100644
--- a/cranelift/codegen/src/isa/x64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit.rs
@@ -2094,7 +2094,9 @@ pub(crate) fn emit(
                 SseOpcode::Pextrd => (LegacyPrefixes::_66, 0x0F3A16, 3),
                 SseOpcode::Pshufd => (LegacyPrefixes::_66, 0x0F70, 2),
                 SseOpcode::Roundps => (LegacyPrefixes::_66, 0x0F3A08, 3),
+                SseOpcode::Roundss => (LegacyPrefixes::_66, 0x0F3A0A, 3),
                 SseOpcode::Roundpd => (LegacyPrefixes::_66, 0x0F3A09, 3),
+                SseOpcode::Roundsd => (LegacyPrefixes::_66, 0x0F3A0B, 3),
                 _ => unimplemented!("Opcode {:?} not implemented", op),
             };
             let rex = if *is64 {
diff --git a/cranelift/codegen/src/isa/x64/inst/mod.rs b/cranelift/codegen/src/isa/x64/inst/mod.rs
index bab28f2aa0..b6276c943d 100644
--- a/cranelift/codegen/src/isa/x64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/x64/inst/mod.rs
@@ -1823,7 +1823,7 @@ impl fmt::Debug for Inst {
 fn x64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
     // This is a bit subtle. If some register is in the modified set, then it may not be in either
     // the use or def sets. However, enforcing that directly is somewhat difficult. Instead,
-    // regalloc.rs will "fix" this for us by removing the the modified set from the use and def
+    // regalloc.rs will "fix" this for us by removing the modified set from the use and def
     // sets.
     match inst {
         Inst::AluRmiR { src, dst, .. } => {
@@ -1895,6 +1895,10 @@ fn x64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
                 || *op == SseOpcode::Pextrw
                 || *op == SseOpcode::Pextrd
                 || *op == SseOpcode::Pshufd
+                || *op == SseOpcode::Roundss
+                || *op == SseOpcode::Roundsd
+                || *op == SseOpcode::Roundps
+                || *op == SseOpcode::Roundpd
             {
                 src.get_regs_as_uses(collector);
                 collector.add_def(*dst);
@@ -2236,6 +2240,10 @@ fn x64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
                 || *op == SseOpcode::Pextrw
                 || *op == SseOpcode::Pextrd
                 || *op == SseOpcode::Pshufd
+                || *op == SseOpcode::Roundss
+                || *op == SseOpcode::Roundsd
+                || *op == SseOpcode::Roundps
+                || *op == SseOpcode::Roundpd
             {
                 src.map_uses(mapper);
                 map_def(mapper, dst);
diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs
index 45014fca17..6c002dd6cf 100644
--- a/cranelift/codegen/src/isa/x64/lower.rs
+++ b/cranelift/codegen/src/isa/x64/lower.rs
@@ -8,7 +8,7 @@ use crate::ir::{
 use crate::isa::x64::abi::*;
 use crate::isa::x64::inst::args::*;
 use crate::isa::x64::inst::*;
-use crate::isa::{x64::X64Backend, CallConv};
+use crate::isa::{x64::settings as x64_settings, x64::X64Backend, CallConv};
 use crate::machinst::lower::*;
 use crate::machinst::*;
 use crate::result::CodegenResult;
@@ -1330,6 +1330,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
     ctx: &mut C,
     insn: IRInst,
     flags: &Flags,
+    isa_flags: &x64_settings::Flags,
     triple: &Triple,
 ) -> CodegenResult<()> {
     let op = ctx.data(insn).opcode();
@@ -4211,11 +4212,29 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
         }
 
         Opcode::Ceil | Opcode::Floor | Opcode::Nearest | Opcode::Trunc => {
-            // TODO use ROUNDSS/ROUNDSD after sse4.1.
-
-            // Lower to VM calls when there's no access to SSE4.1.
             let ty = ty.unwrap();
-            if !ty.is_vector() {
+            if isa_flags.use_sse41() {
+                let mode = match op {
+                    Opcode::Ceil => RoundImm::RoundUp,
+                    Opcode::Floor => RoundImm::RoundDown,
+                    Opcode::Nearest => RoundImm::RoundNearest,
+                    Opcode::Trunc => RoundImm::RoundZero,
+                    _ => panic!("unexpected opcode {:?} in Ceil/Floor/Nearest/Trunc", op),
+                };
+                let op = match ty {
+                    types::F32 => SseOpcode::Roundss,
+                    types::F64 => SseOpcode::Roundsd,
+                    types::F32X4 => SseOpcode::Roundps,
+                    types::F64X2 => SseOpcode::Roundpd,
+                    _ => panic!("unexpected type {:?} in Ceil/Floor/Nearest/Trunc", ty),
+                };
+                let src = input_to_reg_mem(ctx, inputs[0]);
+                let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
+                ctx.emit(Inst::xmm_rm_r_imm(op, src, dst, mode.encode(), false));
+            } else {
+                // Lower to VM calls when there's no access to SSE4.1.
+                // Note: for vector types on platforms without SSE4.1 there is
+                // no libcall fallback, so lowering will panic here.
                 let libcall = match (op, ty) {
                     (Opcode::Ceil, types::F32) => LibCall::CeilF32,
                     (Opcode::Ceil, types::F64) => LibCall::CeilF64,
@@ -4231,28 +4250,6 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                     ),
                 };
                 emit_vm_call(ctx, flags, triple, libcall, insn, inputs, outputs)?;
-            } else {
-                let (op, mode) = match (op, ty) {
-                    (Opcode::Ceil, types::F32X4) => (SseOpcode::Roundps, RoundImm::RoundUp),
-                    (Opcode::Ceil, types::F64X2) => (SseOpcode::Roundpd, RoundImm::RoundUp),
-                    (Opcode::Floor, types::F32X4) => (SseOpcode::Roundps, RoundImm::RoundDown),
-                    (Opcode::Floor, types::F64X2) => (SseOpcode::Roundpd, RoundImm::RoundDown),
-                    (Opcode::Trunc, types::F32X4) => (SseOpcode::Roundps, RoundImm::RoundZero),
-                    (Opcode::Trunc, types::F64X2) => (SseOpcode::Roundpd, RoundImm::RoundZero),
-                    (Opcode::Nearest, types::F32X4) => (SseOpcode::Roundps, RoundImm::RoundNearest),
-                    (Opcode::Nearest, types::F64X2) => (SseOpcode::Roundpd, RoundImm::RoundNearest),
-                    _ => panic!("Unknown op/ty combination (vector){:?}", ty),
-                };
-                let src = put_input_in_reg(ctx, inputs[0]);
-                let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-                ctx.emit(Inst::gen_move(dst, src, ty));
-                ctx.emit(Inst::xmm_rm_r_imm(
-                    op,
-                    RegMem::from(dst),
-                    dst,
-                    mode.encode(),
-                    false,
-                ));
             }
         }
 
@@ -5389,7 +5386,7 @@ impl LowerBackend for X64Backend {
     type MInst = Inst;
 
     fn lower<C: LowerCtx<I = Inst>>(&self, ctx: &mut C, ir_inst: IRInst) -> CodegenResult<()> {
-        lower_insn_to_regs(ctx, ir_inst, &self.flags, &self.triple)
+        lower_insn_to_regs(ctx, ir_inst, &self.flags, &self.x64_flags, &self.triple)
     }
 
     fn lower_branch_group<C: LowerCtx<I = Inst>>(
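
Reviewer note: `mode.encode()` above produces the 8-bit immediate consumed by ROUNDSS/ROUNDSD/ROUNDPS/ROUNDPD, whose low two bits select the rounding mode (with bit 2 clear, the immediate rather than MXCSR controls rounding). The actual `RoundImm` definition lives in `inst/args.rs` and is not part of this diff; a minimal sketch of the mapping this lowering assumes:

```rust
// Sketch only -- the real definition is in cranelift/codegen/src/isa/x64/inst/args.rs.
// Bits 1:0 of the ROUND* imm8 form the rounding-control field:
#[derive(Clone, Copy, Debug)]
pub enum RoundImm {
    RoundNearest = 0b00, // round to nearest, ties to even
    RoundDown = 0b01,    // toward negative infinity (floor)
    RoundUp = 0b10,      // toward positive infinity (ceil)
    RoundZero = 0b11,    // toward zero (trunc)
}

impl RoundImm {
    pub(crate) fn encode(self) -> u8 {
        self as u8
    }
}
```

This is why `Opcode::Ceil` maps to `RoundImm::RoundUp`, `Opcode::Floor` to `RoundImm::RoundDown`, and so on in the new match.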
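The new path is gated on the derived predicate `isa_flags.use_sse41()`, so embedders opt in by setting the underlying x64 flag when constructing the ISA. A sketch of what that could look like; the flag name `has_sse41` and the exact builder API are assumptions here and vary across cranelift versions:

```rust
use cranelift_codegen::isa;
use cranelift_codegen::settings::{self, Configurable};
use target_lexicon::Triple;

fn build_x64_isa_with_sse41() -> Box<dyn isa::TargetIsa> {
    // Shared (target-independent) flags, left at their defaults.
    let shared = settings::Flags::new(settings::builder());
    // Look up the backend for the host triple (assumed to be x86_64 here).
    let mut isa_builder = isa::lookup(Triple::host()).expect("host ISA available");
    // Hypothetical flag name; check the x64 settings group for the real one.
    isa_builder.set("has_sse41", "true").expect("flag exists");
    isa_builder.finish(shared)
}
```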
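A change like this would normally be exercised by CLIF filetests gated on the flag. A hypothetical run test, with the header and flag syntax approximated rather than taken from this PR:

```
test run
target x86_64 has_sse41

function %ceil_f32(f32) -> f32 {
block0(v0: f32):
    v1 = ceil v0
    return v1
}
; run: %ceil_f32(0x1.4p0) == 0x1.0p1
```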