From aa103698d48c968b126a8c209bd5f9b4061630ac Mon Sep 17 00:00:00 2001 From: Benjamin Bouvier Date: Tue, 21 Jul 2020 18:54:50 +0200 Subject: [PATCH] machinst x64: extend Copysign to work for f64 inputs too; --- cranelift/codegen/src/isa/x64/inst/args.rs | 6 ++ cranelift/codegen/src/isa/x64/inst/emit.rs | 2 + cranelift/codegen/src/isa/x64/lower.rs | 91 ++++++++++++---------- 3 files changed, 59 insertions(+), 40 deletions(-) diff --git a/cranelift/codegen/src/isa/x64/inst/args.rs b/cranelift/codegen/src/isa/x64/inst/args.rs index 0b64f3bb4c..162ed8c2d1 100644 --- a/cranelift/codegen/src/isa/x64/inst/args.rs +++ b/cranelift/codegen/src/isa/x64/inst/args.rs @@ -339,6 +339,7 @@ pub enum SseOpcode { Andps, Andpd, Andnps, + Andnpd, Comiss, Comisd, Cmpss, @@ -359,6 +360,7 @@ pub enum SseOpcode { Minss, Minsd, Movaps, + Movapd, Movd, Movq, Movss, @@ -410,6 +412,7 @@ impl SseOpcode { SseOpcode::Addsd | SseOpcode::Andpd + | SseOpcode::Andnpd | SseOpcode::Cvtsd2ss | SseOpcode::Cvtsd2si | SseOpcode::Cvtsi2sd @@ -418,6 +421,7 @@ impl SseOpcode { | SseOpcode::Divsd | SseOpcode::Maxsd | SseOpcode::Minsd + | SseOpcode::Movapd | SseOpcode::Movd | SseOpcode::Movq | SseOpcode::Movsd @@ -451,6 +455,7 @@ impl fmt::Debug for SseOpcode { SseOpcode::Andpd => "andpd", SseOpcode::Andps => "andps", SseOpcode::Andnps => "andnps", + SseOpcode::Andnpd => "andnpd", SseOpcode::Comiss => "comiss", SseOpcode::Comisd => "comisd", SseOpcode::Cvtsd2ss => "cvtsd2ss", @@ -468,6 +473,7 @@ impl fmt::Debug for SseOpcode { SseOpcode::Minss => "minss", SseOpcode::Minsd => "minsd", SseOpcode::Movaps => "movaps", + SseOpcode::Movapd => "movapd", SseOpcode::Movd => "movd", SseOpcode::Movq => "movq", SseOpcode::Movss => "movss", diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs index 9e022bdf27..00d1a0777a 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit.rs @@ -1497,6 +1497,7 @@ pub(crate) fn emit( let (prefix, opcode) = match op { SseOpcode::Movaps => (LegacyPrefix::None, 0x0F28), + SseOpcode::Movapd => (LegacyPrefix::_66, 0x0F28), SseOpcode::Movsd => (LegacyPrefix::_F2, 0x0F10), SseOpcode::Movss => (LegacyPrefix::_F3, 0x0F10), SseOpcode::Sqrtss => (LegacyPrefix::_F3, 0x0F51), @@ -1533,6 +1534,7 @@ pub(crate) fn emit( SseOpcode::Andpd => (LegacyPrefix::_66, 0x0F54), SseOpcode::Andps => (LegacyPrefix::None, 0x0F54), SseOpcode::Andnps => (LegacyPrefix::None, 0x0F55), + SseOpcode::Andnpd => (LegacyPrefix::_66, 0x0F55), SseOpcode::Mulss => (LegacyPrefix::_F3, 0x0F59), SseOpcode::Mulsd => (LegacyPrefix::_F2, 0x0F59), SseOpcode::Orpd => (LegacyPrefix::_66, 0x0F56), diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs index 435f8974a2..3bfac07a4d 100644 --- a/cranelift/codegen/src/isa/x64/lower.rs +++ b/cranelift/codegen/src/isa/x64/lower.rs @@ -1228,50 +1228,61 @@ fn lower_insn_to_regs>( let dst = output_to_reg(ctx, outputs[0]); let lhs = input_to_reg(ctx, inputs[0]); let rhs = input_to_reg(ctx, inputs[1]); - if !flt_ty_is_64(ty.unwrap()) { - // movabs 0x8000_0000, tmp_gpr1 - // movd tmp_gpr1, tmp_xmm1 - // movaps tmp_xmm1, dst - // andnps src_1, dst - // movss src_2, tmp_xmm2 - // andps tmp_xmm1, tmp_xmm2 - // orps tmp_xmm2, dst - let tmp_gpr1 = ctx.alloc_tmp(RegClass::I64, I32); - let tmp_xmm1 = ctx.alloc_tmp(RegClass::V128, F32); - let tmp_xmm2 = ctx.alloc_tmp(RegClass::V128, F32); - ctx.emit(Inst::imm_r(true, 0x8000_0000, tmp_gpr1)); - ctx.emit(Inst::gpr_to_xmm( - SseOpcode::Movd, - RegMem::reg(tmp_gpr1.to_reg()), - OperandSize::Size32, - tmp_xmm1, - )); - ctx.emit(Inst::xmm_mov( + + let ty = ty.unwrap(); + + // We're going to generate the following sequence: + // + // movabs $INT_MIN, tmp_gpr1 + // mov{d,q} tmp_gpr1, tmp_xmm1 + // movap{s,d} tmp_xmm1, dst + // andnp{s,d} src_1, dst + // movap{s,d} src_2, tmp_xmm2 + // andp{s,d} tmp_xmm1, tmp_xmm2 + // orp{s,d} tmp_xmm2, dst + + let tmp_xmm1 = ctx.alloc_tmp(RegClass::V128, F32); + let tmp_xmm2 = ctx.alloc_tmp(RegClass::V128, F32); + + let (sign_bit_cst, mov_op, and_not_op, and_op, or_op) = match ty { + F32 => ( + 0x8000_0000, SseOpcode::Movaps, - RegMem::reg(tmp_xmm1.to_reg()), - dst, - None, - )); - ctx.emit(Inst::xmm_rm_r(SseOpcode::Andnps, RegMem::reg(lhs), dst)); - ctx.emit(Inst::xmm_mov( - SseOpcode::Movss, - RegMem::reg(rhs), - tmp_xmm2, - None, - )); - ctx.emit(Inst::xmm_rm_r( + SseOpcode::Andnps, SseOpcode::Andps, - RegMem::reg(tmp_xmm1.to_reg()), - tmp_xmm2, - )); - ctx.emit(Inst::xmm_rm_r( SseOpcode::Orps, - RegMem::reg(tmp_xmm2.to_reg()), - dst, - )); - } else { - unimplemented!("{:?} for non 32-bit destination is not supported", op); + ), + F64 => ( + 0x8000_0000_0000_0000, + SseOpcode::Movapd, + SseOpcode::Andnpd, + SseOpcode::Andpd, + SseOpcode::Orpd, + ), + _ => { + panic!("unexpected type {:?} for copysign", ty); + } + }; + + for inst in Inst::gen_constant(tmp_xmm1, sign_bit_cst, ty, |reg_class, ty| { + ctx.alloc_tmp(reg_class, ty) + }) { + ctx.emit(inst); } + ctx.emit(Inst::xmm_mov( + mov_op, + RegMem::reg(tmp_xmm1.to_reg()), + dst, + None, + )); + ctx.emit(Inst::xmm_rm_r(and_not_op, RegMem::reg(lhs), dst)); + ctx.emit(Inst::xmm_mov(mov_op, RegMem::reg(rhs), tmp_xmm2, None)); + ctx.emit(Inst::xmm_rm_r( + and_op, + RegMem::reg(tmp_xmm1.to_reg()), + tmp_xmm2, + )); + ctx.emit(Inst::xmm_rm_r(or_op, RegMem::reg(tmp_xmm2.to_reg()), dst)); } Opcode::Ceil | Opcode::Floor | Opcode::Nearest | Opcode::Trunc => {