x64: Lower bitcast, fabs, and fneg in ISLE (#4729)

* Add tests for bitcast

* Migrate bitcast to ISLE

* Add tests for fabs

* Lower fabs in ISLE

* Add tests for fneg

* Lower fneg in ISLE

Authored by Trevor Elliott on 2022-08-18 17:59:23 -07:00; committed by GitHub.
parent 5ec92d59d2
commit 80c77da334
9 changed files with 279 additions and 159 deletions

@@ -1647,6 +1647,10 @@
 (rule (x64_movupd from)
       (xmm_unary_rm_r (SseOpcode.Movupd) from))
 
+(decl x64_movd (Xmm) Gpr)
+(rule (x64_movd from)
+      (xmm_to_gpr (SseOpcode.Movd) from (OperandSize.Size32)))
+
 (decl x64_movdqu (XmmMem) Xmm)
 (rule (x64_movdqu from)
       (xmm_unary_rm_r (SseOpcode.Movdqu) from))

@@ -2763,6 +2767,13 @@
            (operand_size_of_type_32_64 (lane_type ty))))))
     dst))
 
+;; Helper for creating `MInst.XmmToGpr` instructions.
+(decl xmm_to_gpr (SseOpcode Xmm OperandSize) Gpr)
+(rule (xmm_to_gpr op src size)
+      (let ((dst WritableGpr (temp_writable_gpr))
+            (_ Unit (emit (MInst.XmmToGpr op src dst size))))
+        dst))
+
 ;; Helper for creating `MInst.GprToXmm` instructions.
 (decl gpr_to_xmm (SseOpcode GprMem OperandSize) Xmm)
 (rule (gpr_to_xmm op src size)

@@ -3514,6 +3525,20 @@
 (decl atomic_rmw_op_to_mach_atomic_rmw_op (AtomicRmwOp) MachAtomicRmwOp)
 (extern constructor atomic_rmw_op_to_mach_atomic_rmw_op atomic_rmw_op_to_mach_atomic_rmw_op)
 
+;;;; Casting ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(decl bitcast_xmm_to_gpr (Type Xmm) Gpr)
+(rule (bitcast_xmm_to_gpr $F32 src)
+      (xmm_to_gpr (SseOpcode.Movd) src (OperandSize.Size32)))
+(rule (bitcast_xmm_to_gpr $F64 src)
+      (xmm_to_gpr (SseOpcode.Movq) src (OperandSize.Size64)))
+
+(decl bitcast_gpr_to_xmm (Type Gpr) Xmm)
+(rule (bitcast_gpr_to_xmm $I32 src)
+      (gpr_to_xmm (SseOpcode.Movd) src (OperandSize.Size32)))
+(rule (bitcast_gpr_to_xmm $I64 src)
+      (gpr_to_xmm (SseOpcode.Movq) src (OperandSize.Size64)))
+
 ;;;; Automatic conversions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (convert Gpr InstOutput output_gpr)
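
A scalar int-to-float (or float-to-int) bitcast is nothing more than a cross-register-class move: the bit pattern is untouched, only its home (XMM vs. general-purpose register) changes. As a sanity check, here is a minimal plain-Rust sketch of the invariant the `movd`/`movq` helpers above must preserve; it is illustrative only, not backend code:

    // Illustrative only: the invariant behind `bitcast_xmm_to_gpr` and
    // `bitcast_gpr_to_xmm`. A bitcast moves bits verbatim between register
    // classes, so round-tripping through the raw bits is lossless.
    fn main() {
        let f = -1.5f32;
        let bits: u32 = f.to_bits(); // what `movd %xmm, %r32` extracts
        assert_eq!(f32::from_bits(bits), f); // `movd %r32, %xmm` restores it

        let d = 2.5f64;
        let bits64: u64 = d.to_bits(); // what `movq %xmm, %r64` extracts
        assert_eq!(f64::from_bits(bits64), d);
    }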

@@ -38,6 +38,17 @@ impl Inst {
             dst: WritableXmm::from_writable_reg(dst).unwrap(),
         }
     }
+
+    fn xmm_rmi_reg(opcode: SseOpcode, src: RegMemImm, dst: Writable<Reg>) -> Inst {
+        src.assert_regclass_is(RegClass::Float);
+        debug_assert!(dst.to_reg().class() == RegClass::Float);
+        Inst::XmmRmiReg {
+            opcode,
+            src1: Xmm::new(dst.to_reg()).unwrap(),
+            src2: XmmMemImm::new(src).unwrap(),
+            dst: WritableXmm::from_writable_reg(dst).unwrap(),
+        }
+    }
 }
 
 #[test]

@@ -455,17 +455,6 @@ impl Inst {
         Inst::MovzxRmR { ext_mode, src, dst }
     }
 
-    pub(crate) fn xmm_rmi_reg(opcode: SseOpcode, src: RegMemImm, dst: Writable<Reg>) -> Inst {
-        src.assert_regclass_is(RegClass::Float);
-        debug_assert!(dst.to_reg().class() == RegClass::Float);
-        Inst::XmmRmiReg {
-            opcode,
-            src1: Xmm::new(dst.to_reg()).unwrap(),
-            src2: XmmMemImm::new(src).unwrap(),
-            dst: WritableXmm::from_writable_reg(dst).unwrap(),
-        }
-    }
-
     pub(crate) fn movsx_rm_r(ext_mode: ExtMode, src: RegMem, dst: Writable<Reg>) -> Inst {
         src.assert_regclass_is(RegClass::Int);
         debug_assert!(dst.to_reg().class() == RegClass::Int);

@@ -1194,6 +1194,12 @@
 ;;;; Rules for `fabs` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
+(rule (lower (has_type $F32 (fabs x)))
+      (x64_andps x (imm $F32 0x7fffffff)))
+
+(rule (lower (has_type $F64 (fabs x)))
+      (x64_andpd x (imm $F64 0x7fffffffffffffff)))
+
 ;; Special case for `f32x4.abs`.
 (rule (lower (has_type $F32X4 (fabs x)))
       (x64_andps x

@@ -1206,6 +1212,24 @@
                  (x64_psrlq (vector_all_ones)
                             (RegMemImm.Imm 1))))
 
+;;;; Rules for `fneg` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(rule (lower (has_type $F32 (fneg x)))
+      (x64_xorps x (imm $F32 0x80000000)))
+
+(rule (lower (has_type $F64 (fneg x)))
+      (x64_xorpd x (imm $F64 0x8000000000000000)))
+
+(rule (lower (has_type $F32X4 (fneg x)))
+      (x64_xorps x
+                 (x64_pslld (vector_all_ones)
+                            (RegMemImm.Imm 31))))
+
+(rule (lower (has_type $F64X2 (fneg x)))
+      (x64_xorpd x
+                 (x64_psllq (vector_all_ones)
+                            (RegMemImm.Imm 63))))
+
 ;;;; Rules for `bnot` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 ;; `i64` and smaller.
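
The scalar rules above encode the standard IEEE-754 sign-bit tricks, and the vector rules rebuild the same masks at run time by shifting an all-ones register. A small plain-Rust check of those identities (a sketch assuming IEEE-754 layout, not compiler code):

    // fabs clears the sign bit (AND with 0x7fffffff); fneg flips it
    // (XOR with 0x80000000). The vector rules derive these masks from an
    // all-ones register: a right shift clears only the sign bit, a left
    // shift leaves only the sign bit set.
    fn main() {
        let x = -3.25f32;
        assert_eq!(f32::from_bits(x.to_bits() & 0x7fff_ffff), 3.25); // andps
        assert_eq!(f32::from_bits(x.to_bits() ^ 0x8000_0000), 3.25); // xorps

        // Masks per 32-bit lane (f32x4):
        assert_eq!(!0u32 >> 1, 0x7fff_ffff);  // psrld $1  -> fabs mask
        assert_eq!(!0u32 << 31, 0x8000_0000); // pslld $31 -> fneg mask

        // Masks per 64-bit lane (f64x2):
        assert_eq!(!0u64 >> 1, 0x7fff_ffff_ffff_ffff);   // psrlq $1
        assert_eq!(!0u64 << 63, 0x8000_0000_0000_0000);  // psllq $63
    }
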
@@ -3281,3 +3305,17 @@
 ;; We're missing a `unarrow` case for $I64X2
 ;; https://github.com/bytecodealliance/wasmtime/issues/4734
 
+;; Rules for `bitcast` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(rule (lower (has_type $I32 (bitcast src @ (value_type $F32))))
+      (bitcast_xmm_to_gpr $F32 src))
+
+(rule (lower (has_type $F32 (bitcast src @ (value_type $I32))))
+      (bitcast_gpr_to_xmm $I32 src))
+
+(rule (lower (has_type $I64 (bitcast src @ (value_type $F64))))
+      (bitcast_xmm_to_gpr $F64 src))
+
+(rule (lower (has_type $F64 (bitcast src @ (value_type $I64))))
+      (bitcast_gpr_to_xmm $I64 src))
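
These four rules replace the `(input_ty, output_ty)` match removed from `lower.rs` below. A plain-Rust mirror of that dispatch, purely illustrative (the enum and function here are hypothetical, not Cranelift APIs):

    // Hypothetical mirror of the four rules: the (input, output) type pair
    // picks the move direction and width; anything else is invalid IR.
    #[derive(Clone, Copy, Debug)]
    enum Ty { I32, I64, F32, F64 }

    fn bitcast_move(input: Ty, output: Ty) -> &'static str {
        match (input, output) {
            (Ty::F32, Ty::I32) => "movd xmm -> gpr (32-bit)",
            (Ty::I32, Ty::F32) => "movd gpr -> xmm (32-bit)",
            (Ty::F64, Ty::I64) => "movq xmm -> gpr (64-bit)",
            (Ty::I64, Ty::F64) => "movq gpr -> xmm (64-bit)",
            _ => unreachable!("invalid bitcast from {:?} to {:?}", input, output),
        }
    }

    fn main() {
        assert_eq!(bitcast_move(Ty::F64, Ty::I64), "movq xmm -> gpr (64-bit)");
    }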

@@ -4,9 +4,7 @@
 pub(super) mod isle;
 
 use crate::data_value::DataValue;
-use crate::ir::{
-    condcodes::FloatCC, types, ExternalName, Inst as IRInst, InstructionData, LibCall, Opcode, Type,
-};
+use crate::ir::{types, ExternalName, Inst as IRInst, InstructionData, LibCall, Opcode, Type};
 use crate::isa::x64::abi::*;
 use crate::isa::x64::inst::args::*;
 use crate::isa::x64::inst::*;

@@ -568,145 +566,13 @@ fn lower_insn_to_regs(
         | Opcode::SwidenHigh
         | Opcode::SwidenLow
         | Opcode::Snarrow
-        | Opcode::Unarrow => {
+        | Opcode::Unarrow
+        | Opcode::Bitcast
+        | Opcode::Fabs
+        | Opcode::Fneg => {
             implemented_in_isle(ctx);
         }
 
-        Opcode::Bitcast => {
-            let input_ty = ctx.input_ty(insn, 0);
-            let output_ty = ctx.output_ty(insn, 0);
-            match (input_ty, output_ty) {
-                (types::F32, types::I32) => {
-                    let src = put_input_in_reg(ctx, inputs[0]);
-                    let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-                    ctx.emit(Inst::xmm_to_gpr(
-                        SseOpcode::Movd,
-                        src,
-                        dst,
-                        OperandSize::Size32,
-                    ));
-                }
-                (types::I32, types::F32) => {
-                    let src = input_to_reg_mem(ctx, inputs[0]);
-                    let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-                    ctx.emit(Inst::gpr_to_xmm(
-                        SseOpcode::Movd,
-                        src,
-                        OperandSize::Size32,
-                        dst,
-                    ));
-                }
-                (types::F64, types::I64) => {
-                    let src = put_input_in_reg(ctx, inputs[0]);
-                    let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-                    ctx.emit(Inst::xmm_to_gpr(
-                        SseOpcode::Movq,
-                        src,
-                        dst,
-                        OperandSize::Size64,
-                    ));
-                }
-                (types::I64, types::F64) => {
-                    let src = input_to_reg_mem(ctx, inputs[0]);
-                    let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-                    ctx.emit(Inst::gpr_to_xmm(
-                        SseOpcode::Movq,
-                        src,
-                        OperandSize::Size64,
-                        dst,
-                    ));
-                }
-                _ => unreachable!("invalid bitcast from {:?} to {:?}", input_ty, output_ty),
-            }
-        }
-
-        Opcode::Fabs | Opcode::Fneg => {
-            let src = RegMem::reg(put_input_in_reg(ctx, inputs[0]));
-            let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-
-            // In both cases, generate a constant and apply a single binary instruction:
-            // - to compute the absolute value, set all bits to 1 but the MSB to 0, and bit-AND the
-            //   src with it.
-            // - to compute the negated value, set all bits to 0 but the MSB to 1, and bit-XOR the
-            //   src with it.
-            let output_ty = ty.unwrap();
-            if !output_ty.is_vector() {
-                let (val, opcode): (u64, _) = match output_ty {
-                    types::F32 => match op {
-                        Opcode::Fabs => (0x7fffffff, SseOpcode::Andps),
-                        Opcode::Fneg => (0x80000000, SseOpcode::Xorps),
-                        _ => unreachable!(),
-                    },
-                    types::F64 => match op {
-                        Opcode::Fabs => (0x7fffffffffffffff, SseOpcode::Andpd),
-                        Opcode::Fneg => (0x8000000000000000, SseOpcode::Xorpd),
-                        _ => unreachable!(),
-                    },
-                    _ => panic!("unexpected type {:?} for Fabs", output_ty),
-                };
-
-                for inst in Inst::gen_constant(ValueRegs::one(dst), val as u128, output_ty, |ty| {
-                    ctx.alloc_tmp(ty).only_reg().unwrap()
-                }) {
-                    ctx.emit(inst);
-                }
-
-                ctx.emit(Inst::xmm_rm_r(opcode, src, dst));
-            } else {
-                // Eventually vector constants should be available in `gen_constant` and this block
-                // can be merged with the one above (TODO).
-                if output_ty.bits() == 128 {
-                    // Move the `lhs` to the same register as `dst`; this may not emit an actual move
-                    // but ensures that the registers are the same to match x86's read-write operand
-                    // encoding.
-                    let src = put_input_in_reg(ctx, inputs[0]);
-                    ctx.emit(Inst::gen_move(dst, src, output_ty));
-
-                    // Generate an all 1s constant in an XMM register. This uses CMPPS but could
-                    // have used CMPPD with the same effect. Note, we zero the temp we allocate
-                    // because if not, there is a chance that the register we use could be initialized
-                    // with NaN .. in which case the CMPPS would fail since NaN != NaN.
-                    let tmp = ctx.alloc_tmp(output_ty).only_reg().unwrap();
-                    ctx.emit(Inst::xmm_rm_r(SseOpcode::Xorps, RegMem::from(tmp), tmp));
-                    let cond = FcmpImm::from(FloatCC::Equal);
-                    let cmpps = Inst::xmm_rm_r_imm(
-                        SseOpcode::Cmpps,
-                        RegMem::reg(tmp.to_reg()),
-                        tmp,
-                        cond.encode(),
-                        OperandSize::Size32,
-                    );
-                    ctx.emit(cmpps);
-
-                    // Shift the all 1s constant to generate the mask.
-                    let lane_bits = output_ty.lane_bits();
-                    let (shift_opcode, opcode, shift_by) = match (op, lane_bits) {
-                        (Opcode::Fabs, _) => {
-                            unreachable!(
-                                "implemented in ISLE: inst = `{}`, type = `{:?}`",
-                                ctx.dfg().display_inst(insn),
-                                ty
-                            );
-                        }
-                        (Opcode::Fneg, 32) => (SseOpcode::Pslld, SseOpcode::Xorps, 31),
-                        (Opcode::Fneg, 64) => (SseOpcode::Psllq, SseOpcode::Xorpd, 63),
-                        _ => unreachable!(
-                            "unexpected opcode and lane size: {:?}, {} bits",
-                            op, lane_bits
-                        ),
-                    };
-                    let shift = Inst::xmm_rmi_reg(shift_opcode, RegMemImm::imm(shift_by), tmp);
-                    ctx.emit(shift);
-
-                    // Apply shifted mask (XOR or AND).
-                    let mask = Inst::xmm_rm_r(opcode, RegMem::reg(tmp.to_reg()), dst);
-                    ctx.emit(mask);
-                } else {
-                    panic!("unexpected type {:?} for Fabs", output_ty);
-                }
-            }
-        }
-
         Opcode::Fcopysign => {
             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
             let lhs = put_input_in_reg(ctx, inputs[0]);
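
Note what the migration retires here: the old lowering built an all-ones vector by zeroing a temp (XORPS) and comparing it with itself (CMPPS with the EQ predicate), zeroing first because a lane that happened to hold NaN compares unequal to itself. The ISLE `vector_all_ones` helper emits integer PCMPEQD instead (visible in the test expectations below), which has no NaN hazard. A plain-Rust sketch of the hazard, illustrative only:

    // Why the removed code zeroed its temp before CMPPS: per IEEE-754,
    // NaN != NaN, so an EQ compare of an uninitialized lane holding NaN
    // would yield 0 rather than the intended all-ones mask.
    fn cmpps_eq_lane(a: f32) -> u32 {
        if a == a { !0 } else { 0 } // one lane of CMPPS with predicate EQ
    }

    fn main() {
        assert_eq!(cmpps_eq_lane(0.0), 0xffff_ffff); // zeroed lane: all ones
        assert_eq!(cmpps_eq_lane(f32::NAN), 0);      // NaN lane: the hazard
    }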

@@ -0,0 +1,59 @@
test compile precise-output
target x86_64

function %f1(f32) -> i32 {
block0(v0: f32):
    v1 = bitcast.i32 v0
    return v1
}

; pushq %rbp
; movq %rsp, %rbp
; block0:
; movd %xmm0, %eax
; movq %rbp, %rsp
; popq %rbp
; ret

function %f2(i32) -> f32 {
block0(v0: i32):
    v1 = bitcast.f32 v0
    return v1
}

; pushq %rbp
; movq %rsp, %rbp
; block0:
; movd %edi, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret

function %f3(f64) -> i64 {
block0(v0: f64):
    v1 = bitcast.i64 v0
    return v1
}

; pushq %rbp
; movq %rsp, %rbp
; block0:
; movq %xmm0, %rax
; movq %rbp, %rsp
; popq %rbp
; ret

function %f4(i64) -> f64 {
block0(v0: i64):
    v1 = bitcast.f64 v0
    return v1
}

; pushq %rbp
; movq %rsp, %rbp
; block0:
; movq %rdi, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret

@@ -0,0 +1,67 @@
test compile precise-output
target x86_64

function %f1(f32) -> f32 {
block0(v0: f32):
    v1 = fabs v0
    return v1
}

; pushq %rbp
; movq %rsp, %rbp
; block0:
; movl $2147483647, %ecx
; movd %ecx, %xmm5
; andps %xmm0, %xmm5, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret

function %f2(f64) -> f64 {
block0(v0: f64):
    v1 = fabs v0
    return v1
}

; pushq %rbp
; movq %rsp, %rbp
; block0:
; movabsq $9223372036854775807, %rcx
; movq %rcx, %xmm5
; andpd %xmm0, %xmm5, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret

function %f3(f32x4) -> f32x4 {
block0(v0: f32x4):
    v1 = fabs v0
    return v1
}

; pushq %rbp
; movq %rsp, %rbp
; block0:
; pcmpeqd %xmm4, %xmm4, %xmm4
; psrld %xmm4, $1, %xmm4
; andps %xmm0, %xmm4, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret

function %f4(f64x2) -> f64x2 {
block0(v0: f64x2):
    v1 = fabs v0
    return v1
}

; pushq %rbp
; movq %rsp, %rbp
; block0:
; pcmpeqd %xmm4, %xmm4, %xmm4
; psrlq %xmm4, $1, %xmm4
; andpd %xmm0, %xmm4, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret

@@ -10,11 +10,9 @@ block0(v0: f64):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block0:
-; movdqa %xmm0, %xmm5
-; movabsq $9223372036854775807, %rdx
-; movq %rdx, %xmm0
-; movdqa %xmm5, %xmm7
-; andpd %xmm0, %xmm7, %xmm0
+; movabsq $9223372036854775807, %rcx
+; movq %rcx, %xmm5
+; andpd %xmm0, %xmm5, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret

@@ -29,10 +27,10 @@ block0(v0: i64):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block0:
-; movsd 0(%rdi), %xmm5
-; movabsq $9223372036854775807, %r8
-; movq %r8, %xmm0
-; andpd %xmm0, %xmm5, %xmm0
+; movsd 0(%rdi), %xmm0
+; movabsq $9223372036854775807, %rdx
+; movq %rdx, %xmm6
+; andpd %xmm0, %xmm6, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret

@@ -0,0 +1,67 @@
test compile precise-output
target x86_64

function %f1(f32) -> f32 {
block0(v0: f32):
    v1 = fneg v0
    return v1
}

; pushq %rbp
; movq %rsp, %rbp
; block0:
; movl $-2147483648, %ecx
; movd %ecx, %xmm5
; xorps %xmm0, %xmm5, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret

function %f2(f64) -> f64 {
block0(v0: f64):
    v1 = fneg v0
    return v1
}

; pushq %rbp
; movq %rsp, %rbp
; block0:
; movabsq $-9223372036854775808, %rcx
; movq %rcx, %xmm5
; xorpd %xmm0, %xmm5, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret

function %f3(f32x4) -> f32x4 {
block0(v0: f32x4):
    v1 = fneg v0
    return v1
}

; pushq %rbp
; movq %rsp, %rbp
; block0:
; pcmpeqd %xmm4, %xmm4, %xmm4
; pslld %xmm4, $31, %xmm4
; xorps %xmm0, %xmm4, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret

function %f4(f64x2) -> f64x2 {
block0(v0: f64x2):
    v1 = fneg v0
    return v1
}

; pushq %rbp
; movq %rsp, %rbp
; block0:
; pcmpeqd %xmm4, %xmm4, %xmm4
; psllq %xmm4, $63, %xmm4
; xorpd %xmm0, %xmm4, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret