From 80c77da3348d453e070bb919ff599971a0ccb6a2 Mon Sep 17 00:00:00 2001 From: Trevor Elliott Date: Thu, 18 Aug 2022 17:59:23 -0700 Subject: [PATCH] x64: Lower bitcast, fabs, and fneg in ISLE (#4729) * Add tests for bitcast * Migrate bitcast to ISLE * Add tests for fabs * Lower fabs in ISLE * Add tests for fneg * Lower fneg in ISLE --- cranelift/codegen/src/isa/x64/inst.isle | 25 +++ .../codegen/src/isa/x64/inst/emit_tests.rs | 11 ++ cranelift/codegen/src/isa/x64/inst/mod.rs | 11 -- cranelift/codegen/src/isa/x64/lower.isle | 38 +++++ cranelift/codegen/src/isa/x64/lower.rs | 144 +----------------- .../filetests/filetests/isa/x64/bitcast.clif | 59 +++++++ .../filetests/filetests/isa/x64/fabs.clif | 67 ++++++++ .../filetests/isa/x64/floating-point.clif | 16 +- .../filetests/filetests/isa/x64/fneg.clif | 67 ++++++++ 9 files changed, 279 insertions(+), 159 deletions(-) create mode 100644 cranelift/filetests/filetests/isa/x64/bitcast.clif create mode 100644 cranelift/filetests/filetests/isa/x64/fabs.clif create mode 100644 cranelift/filetests/filetests/isa/x64/fneg.clif diff --git a/cranelift/codegen/src/isa/x64/inst.isle b/cranelift/codegen/src/isa/x64/inst.isle index 2292a81064..be1b4c6ffd 100644 --- a/cranelift/codegen/src/isa/x64/inst.isle +++ b/cranelift/codegen/src/isa/x64/inst.isle @@ -1647,6 +1647,10 @@ (rule (x64_movupd from) (xmm_unary_rm_r (SseOpcode.Movupd) from)) +(decl x64_movd (Xmm) Gpr) +(rule (x64_movd from) + (xmm_to_gpr (SseOpcode.Movd) from (OperandSize.Size32))) + (decl x64_movdqu (XmmMem) Xmm) (rule (x64_movdqu from) (xmm_unary_rm_r (SseOpcode.Movdqu) from)) @@ -2763,6 +2767,13 @@ (operand_size_of_type_32_64 (lane_type ty)))))) dst)) +;; Helper for creating `MInst.XmmToGpr` instructions. +(decl xmm_to_gpr (SseOpcode Xmm OperandSize) Gpr) +(rule (xmm_to_gpr op src size) + (let ((dst WritableGpr (temp_writable_gpr)) + (_ Unit (emit (MInst.XmmToGpr op src dst size)))) + dst)) + ;; Helper for creating `MInst.GprToXmm` instructions. 
(decl gpr_to_xmm (SseOpcode GprMem OperandSize) Xmm) (rule (gpr_to_xmm op src size) @@ -3514,6 +3525,20 @@ (decl atomic_rmw_op_to_mach_atomic_rmw_op (AtomicRmwOp) MachAtomicRmwOp) (extern constructor atomic_rmw_op_to_mach_atomic_rmw_op atomic_rmw_op_to_mach_atomic_rmw_op) +;;;; Casting ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(decl bitcast_xmm_to_gpr (Type Xmm) Gpr) +(rule (bitcast_xmm_to_gpr $F32 src) + (xmm_to_gpr (SseOpcode.Movd) src (OperandSize.Size32))) +(rule (bitcast_xmm_to_gpr $F64 src) + (xmm_to_gpr (SseOpcode.Movq) src (OperandSize.Size64))) + +(decl bitcast_gpr_to_xmm (Type Gpr) Xmm) +(rule (bitcast_gpr_to_xmm $I32 src) + (gpr_to_xmm (SseOpcode.Movd) src (OperandSize.Size32))) +(rule (bitcast_gpr_to_xmm $I64 src) + (gpr_to_xmm (SseOpcode.Movq) src (OperandSize.Size64))) + ;;;; Automatic conversions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (convert Gpr InstOutput output_gpr) diff --git a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs index 61b7f33e36..b9a3a94ffb 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs @@ -38,6 +38,17 @@ impl Inst { dst: WritableXmm::from_writable_reg(dst).unwrap(), } } + + fn xmm_rmi_reg(opcode: SseOpcode, src: RegMemImm, dst: Writable<Reg>) -> Inst { + src.assert_regclass_is(RegClass::Float); + debug_assert!(dst.to_reg().class() == RegClass::Float); + Inst::XmmRmiReg { + opcode, + src1: Xmm::new(dst.to_reg()).unwrap(), + src2: XmmMemImm::new(src).unwrap(), + dst: WritableXmm::from_writable_reg(dst).unwrap(), + } + } } #[test] diff --git a/cranelift/codegen/src/isa/x64/inst/mod.rs b/cranelift/codegen/src/isa/x64/inst/mod.rs index 354be76df2..a7f221c026 100644 --- a/cranelift/codegen/src/isa/x64/inst/mod.rs +++ b/cranelift/codegen/src/isa/x64/inst/mod.rs @@ -455,17 +455,6 @@ impl Inst { Inst::MovzxRmR { ext_mode, src, dst } } - pub(crate) fn xmm_rmi_reg(opcode: SseOpcode, 
src: RegMemImm, dst: Writable<Reg>) -> Inst { - src.assert_regclass_is(RegClass::Float); - debug_assert!(dst.to_reg().class() == RegClass::Float); - Inst::XmmRmiReg { - opcode, - src1: Xmm::new(dst.to_reg()).unwrap(), - src2: XmmMemImm::new(src).unwrap(), - dst: WritableXmm::from_writable_reg(dst).unwrap(), - } - } - pub(crate) fn movsx_rm_r(ext_mode: ExtMode, src: RegMem, dst: Writable<Reg>) -> Inst { src.assert_regclass_is(RegClass::Int); debug_assert!(dst.to_reg().class() == RegClass::Int); diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle index 2429c5cd4b..58e32fc206 100644 --- a/cranelift/codegen/src/isa/x64/lower.isle +++ b/cranelift/codegen/src/isa/x64/lower.isle @@ -1194,6 +1194,12 @@ ;;;; Rules for `fabs` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +(rule (lower (has_type $F32 (fabs x))) + (x64_andps x (imm $F32 0x7fffffff))) + +(rule (lower (has_type $F64 (fabs x))) + (x64_andpd x (imm $F64 0x7fffffffffffffff))) + ;; Special case for `f32x4.abs`. (rule (lower (has_type $F32X4 (fabs x))) (x64_andps x @@ -1206,6 +1212,24 @@ (x64_psrlq (vector_all_ones) (RegMemImm.Imm 1)))) +;;;; Rules for `fneg` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (has_type $F32 (fneg x))) + (x64_xorps x (imm $F32 0x80000000))) + +(rule (lower (has_type $F64 (fneg x))) + (x64_xorpd x (imm $F64 0x8000000000000000))) + +(rule (lower (has_type $F32X4 (fneg x))) + (x64_xorps x + (x64_pslld (vector_all_ones) + (RegMemImm.Imm 31)))) + +(rule (lower (has_type $F64X2 (fneg x))) + (x64_xorpd x + (x64_psllq (vector_all_ones) + (RegMemImm.Imm 63)))) + ;;;; Rules for `bnot` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; `i64` and smaller. 
@@ -3281,3 +3305,17 @@ ;; We're missing a `unarrow` case for $I64X2 ;; https://github.com/bytecodealliance/wasmtime/issues/4734 + +;; Rules for `bitcast` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (has_type $I32 (bitcast src @ (value_type $F32)))) + (bitcast_xmm_to_gpr $F32 src)) + +(rule (lower (has_type $F32 (bitcast src @ (value_type $I32)))) + (bitcast_gpr_to_xmm $I32 src)) + +(rule (lower (has_type $I64 (bitcast src @ (value_type $F64)))) + (bitcast_xmm_to_gpr $F64 src)) + +(rule (lower (has_type $F64 (bitcast src @ (value_type $I64)))) + (bitcast_gpr_to_xmm $I64 src)) diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs index 552d0fcea7..3a7c53af8d 100644 --- a/cranelift/codegen/src/isa/x64/lower.rs +++ b/cranelift/codegen/src/isa/x64/lower.rs @@ -4,9 +4,7 @@ pub(super) mod isle; use crate::data_value::DataValue; -use crate::ir::{ - condcodes::FloatCC, types, ExternalName, Inst as IRInst, InstructionData, LibCall, Opcode, Type, -}; +use crate::ir::{types, ExternalName, Inst as IRInst, InstructionData, LibCall, Opcode, Type}; use crate::isa::x64::abi::*; use crate::isa::x64::inst::args::*; use crate::isa::x64::inst::*; @@ -568,145 +566,13 @@ fn lower_insn_to_regs( | Opcode::SwidenHigh | Opcode::SwidenLow | Opcode::Snarrow - | Opcode::Unarrow => { + | Opcode::Unarrow + | Opcode::Bitcast + | Opcode::Fabs + | Opcode::Fneg => { implemented_in_isle(ctx); } - Opcode::Bitcast => { - let input_ty = ctx.input_ty(insn, 0); - let output_ty = ctx.output_ty(insn, 0); - match (input_ty, output_ty) { - (types::F32, types::I32) => { - let src = put_input_in_reg(ctx, inputs[0]); - let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); - ctx.emit(Inst::xmm_to_gpr( - SseOpcode::Movd, - src, - dst, - OperandSize::Size32, - )); - } - (types::I32, types::F32) => { - let src = input_to_reg_mem(ctx, inputs[0]); - let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); - ctx.emit(Inst::gpr_to_xmm( - 
SseOpcode::Movd, - src, - OperandSize::Size32, - dst, - )); - } - (types::F64, types::I64) => { - let src = put_input_in_reg(ctx, inputs[0]); - let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); - ctx.emit(Inst::xmm_to_gpr( - SseOpcode::Movq, - src, - dst, - OperandSize::Size64, - )); - } - (types::I64, types::F64) => { - let src = input_to_reg_mem(ctx, inputs[0]); - let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); - ctx.emit(Inst::gpr_to_xmm( - SseOpcode::Movq, - src, - OperandSize::Size64, - dst, - )); - } - _ => unreachable!("invalid bitcast from {:?} to {:?}", input_ty, output_ty), - } - } - - Opcode::Fabs | Opcode::Fneg => { - let src = RegMem::reg(put_input_in_reg(ctx, inputs[0])); - let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); - - // In both cases, generate a constant and apply a single binary instruction: - // - to compute the absolute value, set all bits to 1 but the MSB to 0, and bit-AND the - // src with it. - // - to compute the negated value, set all bits to 0 but the MSB to 1, and bit-XOR the - // src with it. - let output_ty = ty.unwrap(); - if !output_ty.is_vector() { - let (val, opcode): (u64, _) = match output_ty { - types::F32 => match op { - Opcode::Fabs => (0x7fffffff, SseOpcode::Andps), - Opcode::Fneg => (0x80000000, SseOpcode::Xorps), - _ => unreachable!(), - }, - types::F64 => match op { - Opcode::Fabs => (0x7fffffffffffffff, SseOpcode::Andpd), - Opcode::Fneg => (0x8000000000000000, SseOpcode::Xorpd), - _ => unreachable!(), - }, - _ => panic!("unexpected type {:?} for Fabs", output_ty), - }; - - for inst in Inst::gen_constant(ValueRegs::one(dst), val as u128, output_ty, |ty| { - ctx.alloc_tmp(ty).only_reg().unwrap() - }) { - ctx.emit(inst); - } - - ctx.emit(Inst::xmm_rm_r(opcode, src, dst)); - } else { - // Eventually vector constants should be available in `gen_constant` and this block - // can be merged with the one above (TODO). 
- if output_ty.bits() == 128 { - // Move the `lhs` to the same register as `dst`; this may not emit an actual move - // but ensures that the registers are the same to match x86's read-write operand - // encoding. - let src = put_input_in_reg(ctx, inputs[0]); - ctx.emit(Inst::gen_move(dst, src, output_ty)); - - // Generate an all 1s constant in an XMM register. This uses CMPPS but could - // have used CMPPD with the same effect. Note, we zero the temp we allocate - // because if not, there is a chance that the register we use could be initialized - // with NaN .. in which case the CMPPS would fail since NaN != NaN. - let tmp = ctx.alloc_tmp(output_ty).only_reg().unwrap(); - ctx.emit(Inst::xmm_rm_r(SseOpcode::Xorps, RegMem::from(tmp), tmp)); - let cond = FcmpImm::from(FloatCC::Equal); - let cmpps = Inst::xmm_rm_r_imm( - SseOpcode::Cmpps, - RegMem::reg(tmp.to_reg()), - tmp, - cond.encode(), - OperandSize::Size32, - ); - ctx.emit(cmpps); - - // Shift the all 1s constant to generate the mask. - let lane_bits = output_ty.lane_bits(); - let (shift_opcode, opcode, shift_by) = match (op, lane_bits) { - (Opcode::Fabs, _) => { - unreachable!( - "implemented in ISLE: inst = `{}`, type = `{:?}`", - ctx.dfg().display_inst(insn), - ty - ); - } - (Opcode::Fneg, 32) => (SseOpcode::Pslld, SseOpcode::Xorps, 31), - (Opcode::Fneg, 64) => (SseOpcode::Psllq, SseOpcode::Xorpd, 63), - _ => unreachable!( - "unexpected opcode and lane size: {:?}, {} bits", - op, lane_bits - ), - }; - let shift = Inst::xmm_rmi_reg(shift_opcode, RegMemImm::imm(shift_by), tmp); - ctx.emit(shift); - - // Apply shifted mask (XOR or AND). 
- let mask = Inst::xmm_rm_r(opcode, RegMem::reg(tmp.to_reg()), dst); - ctx.emit(mask); - } else { - panic!("unexpected type {:?} for Fabs", output_ty); - } - } - } - Opcode::Fcopysign => { let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let lhs = put_input_in_reg(ctx, inputs[0]); diff --git a/cranelift/filetests/filetests/isa/x64/bitcast.clif b/cranelift/filetests/filetests/isa/x64/bitcast.clif new file mode 100644 index 0000000000..97418b56a6 --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/bitcast.clif @@ -0,0 +1,59 @@ +test compile precise-output +target x86_64 + +function %f1(f32) -> i32 { +block0(v0: f32): + v1 = bitcast.i32 v0 + return v1 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movd %xmm0, %eax +; movq %rbp, %rsp +; popq %rbp +; ret + +function %f2(i32) -> f32 { +block0(v0: i32): + v1 = bitcast.f32 v0 + return v1 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movd %edi, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret + +function %f3(f64) -> i64 { +block0(v0: f64): + v1 = bitcast.i64 v0 + return v1 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movq %xmm0, %rax +; movq %rbp, %rsp +; popq %rbp +; ret + +function %f4(i64) -> f64 { +block0(v0: i64): + v1 = bitcast.f64 v0 + return v1 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movq %rdi, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret + diff --git a/cranelift/filetests/filetests/isa/x64/fabs.clif b/cranelift/filetests/filetests/isa/x64/fabs.clif new file mode 100644 index 0000000000..89ab1d42b0 --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/fabs.clif @@ -0,0 +1,67 @@ +test compile precise-output +target x86_64 + +function %f1(f32) -> f32 { +block0(v0: f32): + v1 = fabs v0 + return v1 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movl $2147483647, %ecx +; movd %ecx, %xmm5 +; andps %xmm0, %xmm5, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret + +function %f2(f64) -> f64 { +block0(v0: f64): + v1 = fabs v0 + return v1 +} + +; pushq %rbp +; movq %rsp, %rbp 
+; block0: +; movabsq $9223372036854775807, %rcx +; movq %rcx, %xmm5 +; andpd %xmm0, %xmm5, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret + +function %f3(f32x4) -> f32x4 { +block0(v0: f32x4): + v1 = fabs v0 + return v1 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pcmpeqd %xmm4, %xmm4, %xmm4 +; psrld %xmm4, $1, %xmm4 +; andps %xmm0, %xmm4, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret + +function %f4(f64x2) -> f64x2 { +block0(v0: f64x2): + v1 = fabs v0 + return v1 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pcmpeqd %xmm4, %xmm4, %xmm4 +; psrlq %xmm4, $1, %xmm4 +; andpd %xmm0, %xmm4, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret + diff --git a/cranelift/filetests/filetests/isa/x64/floating-point.clif b/cranelift/filetests/filetests/isa/x64/floating-point.clif index 1c1dc03fdb..b5b25b5ab6 100644 --- a/cranelift/filetests/filetests/isa/x64/floating-point.clif +++ b/cranelift/filetests/filetests/isa/x64/floating-point.clif @@ -10,11 +10,9 @@ block0(v0: f64): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; movdqa %xmm0, %xmm5 -; movabsq $9223372036854775807, %rdx -; movq %rdx, %xmm0 -; movdqa %xmm5, %xmm7 -; andpd %xmm0, %xmm7, %xmm0 +; movabsq $9223372036854775807, %rcx +; movq %rcx, %xmm5 +; andpd %xmm0, %xmm5, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -29,10 +27,10 @@ block0(v0: i64): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; movsd 0(%rdi), %xmm5 -; movabsq $9223372036854775807, %r8 -; movq %r8, %xmm0 -; andpd %xmm0, %xmm5, %xmm0 +; movsd 0(%rdi), %xmm0 +; movabsq $9223372036854775807, %rdx +; movq %rdx, %xmm6 +; andpd %xmm0, %xmm6, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret diff --git a/cranelift/filetests/filetests/isa/x64/fneg.clif b/cranelift/filetests/filetests/isa/x64/fneg.clif new file mode 100644 index 0000000000..6b76b9d2ad --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/fneg.clif @@ -0,0 +1,67 @@ +test compile precise-output +target x86_64 + +function %f1(f32) -> f32 { +block0(v0: f32): + v1 = fneg v0 + return v1 +} + +; pushq %rbp +; 
movq %rsp, %rbp +; block0: +; movl $-2147483648, %ecx +; movd %ecx, %xmm5 +; xorps %xmm0, %xmm5, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret + +function %f2(f64) -> f64 { +block0(v0: f64): + v1 = fneg v0 + return v1 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movabsq $-9223372036854775808, %rcx +; movq %rcx, %xmm5 +; xorpd %xmm0, %xmm5, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret + +function %f3(f32x4) -> f32x4 { +block0(v0: f32x4): + v1 = fneg v0 + return v1 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pcmpeqd %xmm4, %xmm4, %xmm4 +; pslld %xmm4, $31, %xmm4 +; xorps %xmm0, %xmm4, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret + +function %f4(f64x2) -> f64x2 { +block0(v0: f64x2): + v1 = fneg v0 + return v1 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pcmpeqd %xmm4, %xmm4, %xmm4 +; psllq %xmm4, $63, %xmm4 +; xorpd %xmm0, %xmm4, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +