x64: Migrate fabs and bnot vector operations to ISLE
This was my first attempt at transitioning code to ISLE to originally fix #3327 but that fix has since landed on `main`, so this is instead now just porting a few operations to ISLE. Closes #3336
This commit is contained in:
@@ -329,6 +329,19 @@
|
|||||||
Vpmullq
|
Vpmullq
|
||||||
Vpopcntb))
|
Vpopcntb))
|
||||||
|
|
||||||
|
(type FcmpImm extern
|
||||||
|
(enum Equal
|
||||||
|
LessThan
|
||||||
|
LessThanOrEqual
|
||||||
|
Unordered
|
||||||
|
NotEqual
|
||||||
|
UnorderedOrGreaterThanOrEqual
|
||||||
|
UnorderedOrGreaterThan
|
||||||
|
Ordered))
|
||||||
|
|
||||||
|
(decl encode_fcmp_imm (FcmpImm) u8)
|
||||||
|
(extern constructor encode_fcmp_imm encode_fcmp_imm)
|
||||||
|
|
||||||
;;;; Helpers for Querying Enabled ISA Extensions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;; Helpers for Querying Enabled ISA Extensions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
|
||||||
(decl avx512vl_enabled () Type)
|
(decl avx512vl_enabled () Type)
|
||||||
@@ -450,6 +463,49 @@
|
|||||||
(rule (extend (ExtendKind.Sign) ty mode src)
|
(rule (extend (ExtendKind.Sign) ty mode src)
|
||||||
(movsx ty mode src))
|
(movsx ty mode src))
|
||||||
|
|
||||||
|
;;;; Helpers for Working with SSE tidbits ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
|
||||||
|
;; Determine the appropriate operation for xor-ing vectors of the specified type
|
||||||
|
(decl sse_xor_op (Type) SseOpcode)
|
||||||
|
(rule (sse_xor_op $F32X4) (SseOpcode.Xorps))
|
||||||
|
(rule (sse_xor_op $F64X2) (SseOpcode.Xorpd))
|
||||||
|
(rule (sse_xor_op (multi_lane _bits _lanes)) (SseOpcode.Pxor))
|
||||||
|
|
||||||
|
;; Performs an xor operation of the two operands specified
|
||||||
|
(decl sse_xor (Type Reg RegMem) Reg)
|
||||||
|
(rule (sse_xor ty x y) (xmm_rm_r ty (sse_xor_op ty) x y))
|
||||||
|
|
||||||
|
;; Determine the appropriate operation to compare two vectors of the specified
|
||||||
|
;; type.
|
||||||
|
(decl sse_cmp_op (Type) SseOpcode)
|
||||||
|
(rule (sse_cmp_op (multi_lane 8 16)) (SseOpcode.Pcmpeqb))
|
||||||
|
(rule (sse_cmp_op (multi_lane 16 8)) (SseOpcode.Pcmpeqw))
|
||||||
|
(rule (sse_cmp_op (multi_lane 32 4)) (SseOpcode.Pcmpeqd))
|
||||||
|
(rule (sse_cmp_op (multi_lane 64 2)) (SseOpcode.Pcmpeqq))
|
||||||
|
(rule (sse_cmp_op $F32X4) (SseOpcode.Cmpps))
|
||||||
|
(rule (sse_cmp_op $F64X2) (SseOpcode.Cmppd))
|
||||||
|
|
||||||
|
;; Generates a register value which has an all-ones pattern of the specified
|
||||||
|
;; type.
|
||||||
|
;;
|
||||||
|
;; Note that this is accomplished by comparing a fresh register with itself,
|
||||||
|
;; which for integers is always true. Also note that the comparison is always
|
||||||
|
;; done for integers, it doesn't actually take the input `ty` into account. This
|
||||||
|
;; is because we're comparing a fresh register to itself and we don't know the
|
||||||
|
;; previous contents of the register. If a floating-point comparison is used
|
||||||
|
;; then it runs the risk of comparing NaN against NaN and not actually producing
|
||||||
|
;; an all-ones mask. By using integer comparison operations we're guaranteed
|
||||||
|
;; that everything is equal to itself.
|
||||||
|
(decl vector_all_ones (Type) Reg)
|
||||||
|
(rule (vector_all_ones ty)
|
||||||
|
(let ((wr WritableReg (temp_writable_reg ty))
|
||||||
|
(r Reg (writable_reg_to_reg wr))
|
||||||
|
(_ Unit (emit (MInst.XmmRmR (sse_cmp_op $I32X4)
|
||||||
|
r
|
||||||
|
(RegMem.Reg r)
|
||||||
|
wr))))
|
||||||
|
r))
|
||||||
|
|
||||||
;;;; Instruction Constructors ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;; Instruction Constructors ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
;;
|
;;
|
||||||
;; These constructors create SSA-style `MInst`s. It is their responsibility to
|
;; These constructors create SSA-style `MInst`s. It is their responsibility to
|
||||||
@@ -596,6 +652,17 @@
|
|||||||
wr))))
|
wr))))
|
||||||
r))
|
r))
|
||||||
|
|
||||||
|
;; Special case for zero immediates with vector types, they turn into an xor
|
||||||
|
;; specific to the vector type.
|
||||||
|
(rule (imm ty @ (multi_lane _bits _lanes) 0)
|
||||||
|
(let ((wr WritableReg (temp_writable_reg ty))
|
||||||
|
(r Reg (writable_reg_to_reg wr))
|
||||||
|
(_ Unit (emit (MInst.XmmRmR (sse_xor_op ty)
|
||||||
|
r
|
||||||
|
(RegMem.Reg r)
|
||||||
|
wr))))
|
||||||
|
r))
|
||||||
|
|
||||||
;; Helper for creating `MInst.ShifR` instructions.
|
;; Helper for creating `MInst.ShifR` instructions.
|
||||||
(decl shift_r (Type ShiftKind Reg Imm8Reg) Reg)
|
(decl shift_r (Type ShiftKind Reg Imm8Reg) Reg)
|
||||||
(rule (shift_r ty kind src1 src2)
|
(rule (shift_r ty kind src1 src2)
|
||||||
@@ -948,6 +1015,11 @@
|
|||||||
(rule (psllq src1 src2)
|
(rule (psllq src1 src2)
|
||||||
(xmm_rmi_reg (SseOpcode.Psllq) src1 src2))
|
(xmm_rmi_reg (SseOpcode.Psllq) src1 src2))
|
||||||
|
|
||||||
|
;; Helper for creating `psrld` instructions.
|
||||||
|
(decl psrld (Reg RegMemImm) Reg)
|
||||||
|
(rule (psrld src1 src2)
|
||||||
|
(xmm_rmi_reg (SseOpcode.Psrld) src1 src2))
|
||||||
|
|
||||||
;; Helper for creating `psrlq` instructions.
|
;; Helper for creating `psrlq` instructions.
|
||||||
(decl psrlq (Reg RegMemImm) Reg)
|
(decl psrlq (Reg RegMemImm) Reg)
|
||||||
(rule (psrlq src1 src2)
|
(rule (psrlq src1 src2)
|
||||||
@@ -975,3 +1047,25 @@
|
|||||||
(decl mulhi_u (Type Reg RegMem) ValueRegs)
|
(decl mulhi_u (Type Reg RegMem) ValueRegs)
|
||||||
(rule (mulhi_u ty src1 src2)
|
(rule (mulhi_u ty src1 src2)
|
||||||
(mul_hi ty $false src1 src2))
|
(mul_hi ty $false src1 src2))
|
||||||
|
|
||||||
|
;; Helper for creating `cmpps` instructions.
|
||||||
|
(decl cmpps (Reg RegMem FcmpImm) Reg)
|
||||||
|
(rule (cmpps src1 src2 imm)
|
||||||
|
(xmm_rm_r_imm (SseOpcode.Cmpps)
|
||||||
|
src1
|
||||||
|
src2
|
||||||
|
(encode_fcmp_imm imm)
|
||||||
|
(OperandSize.Size32)))
|
||||||
|
|
||||||
|
;; Helper for creating `cmppd` instructions.
|
||||||
|
;;
|
||||||
|
;; Note that `Size32` is intentional despite this being used for 64-bit
|
||||||
|
;; operations, since this presumably induces the correct encoding of the
|
||||||
|
;; instruction.
|
||||||
|
(decl cmppd (Reg RegMem FcmpImm) Reg)
|
||||||
|
(rule (cmppd src1 src2 imm)
|
||||||
|
(xmm_rm_r_imm (SseOpcode.Cmppd)
|
||||||
|
src1
|
||||||
|
src2
|
||||||
|
(encode_fcmp_imm imm)
|
||||||
|
(OperandSize.Size32)))
|
||||||
|
|||||||
@@ -1391,7 +1391,8 @@ impl fmt::Display for CC {
|
|||||||
/// Encode the ways that floats can be compared. This is used in float comparisons such as `cmpps`,
|
/// Encode the ways that floats can be compared. This is used in float comparisons such as `cmpps`,
|
||||||
/// e.g.; it is distinguished from other float comparisons (e.g. `ucomiss`) in that those use EFLAGS
|
/// e.g.; it is distinguished from other float comparisons (e.g. `ucomiss`) in that those use EFLAGS
|
||||||
/// whereas [FcmpImm] is used as an immediate.
|
/// whereas [FcmpImm] is used as an immediate.
|
||||||
pub(crate) enum FcmpImm {
|
#[derive(Clone, Copy)]
|
||||||
|
pub enum FcmpImm {
|
||||||
Equal = 0x00,
|
Equal = 0x00,
|
||||||
LessThan = 0x01,
|
LessThan = 0x01,
|
||||||
LessThanOrEqual = 0x02,
|
LessThanOrEqual = 0x02,
|
||||||
|
|||||||
@@ -1301,31 +1301,6 @@ impl Inst {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Choose which instruction to use for comparing two values for equality.
|
|
||||||
pub(crate) fn equals(ty: Type, from: RegMem, to: Writable<Reg>) -> Inst {
|
|
||||||
match ty {
|
|
||||||
types::I8X16 | types::B8X16 => Inst::xmm_rm_r(SseOpcode::Pcmpeqb, from, to),
|
|
||||||
types::I16X8 | types::B16X8 => Inst::xmm_rm_r(SseOpcode::Pcmpeqw, from, to),
|
|
||||||
types::I32X4 | types::B32X4 => Inst::xmm_rm_r(SseOpcode::Pcmpeqd, from, to),
|
|
||||||
types::I64X2 | types::B64X2 => Inst::xmm_rm_r(SseOpcode::Pcmpeqq, from, to),
|
|
||||||
types::F32X4 => Inst::xmm_rm_r_imm(
|
|
||||||
SseOpcode::Cmpps,
|
|
||||||
from,
|
|
||||||
to,
|
|
||||||
FcmpImm::Equal.encode(),
|
|
||||||
OperandSize::Size32,
|
|
||||||
),
|
|
||||||
types::F64X2 => Inst::xmm_rm_r_imm(
|
|
||||||
SseOpcode::Cmppd,
|
|
||||||
from,
|
|
||||||
to,
|
|
||||||
FcmpImm::Equal.encode(),
|
|
||||||
OperandSize::Size32,
|
|
||||||
),
|
|
||||||
_ => unimplemented!("unimplemented type for Inst::equals: {}", ty),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Choose which instruction to use for computing a bitwise AND on two values.
|
/// Choose which instruction to use for computing a bitwise AND on two values.
|
||||||
pub(crate) fn and(ty: Type, from: RegMem, to: Writable<Reg>) -> Inst {
|
pub(crate) fn and(ty: Type, from: RegMem, to: Writable<Reg>) -> Inst {
|
||||||
match ty {
|
match ty {
|
||||||
@@ -1356,16 +1331,6 @@ impl Inst {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Choose which instruction to use for computing a bitwise XOR on two values.
|
|
||||||
pub(crate) fn xor(ty: Type, from: RegMem, to: Writable<Reg>) -> Inst {
|
|
||||||
match ty {
|
|
||||||
types::F32X4 => Inst::xmm_rm_r(SseOpcode::Xorps, from, to),
|
|
||||||
types::F64X2 => Inst::xmm_rm_r(SseOpcode::Xorpd, from, to),
|
|
||||||
_ if ty.is_vector() && ty.bits() == 128 => Inst::xmm_rm_r(SseOpcode::Pxor, from, to),
|
|
||||||
_ => unimplemented!("unimplemented type for Inst::xor: {}", ty),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Translate three-operand instructions into a sequence of two-operand
|
/// Translate three-operand instructions into a sequence of two-operand
|
||||||
/// instructions.
|
/// instructions.
|
||||||
///
|
///
|
||||||
|
|||||||
@@ -484,18 +484,8 @@
|
|||||||
|
|
||||||
;; SSE.
|
;; SSE.
|
||||||
|
|
||||||
(rule (lower (has_type $F32X4 (bxor x y)))
|
(rule (lower (has_type ty @ (multi_lane _bits _lanes) (bxor x y)))
|
||||||
(value_reg (xorps (put_in_reg x)
|
(value_reg (sse_xor ty (put_in_reg x) (put_in_reg_mem y))))
|
||||||
(put_in_reg_mem y))))
|
|
||||||
|
|
||||||
(rule (lower (has_type $F64X2 (bxor x y)))
|
|
||||||
(value_reg (xorpd (put_in_reg x)
|
|
||||||
(put_in_reg_mem y))))
|
|
||||||
|
|
||||||
(rule (lower (has_type (multi_lane _bits _lanes)
|
|
||||||
(bxor x y)))
|
|
||||||
(value_reg (pxor (put_in_reg x)
|
|
||||||
(put_in_reg_mem y))))
|
|
||||||
|
|
||||||
;; `{i,b}128`.
|
;; `{i,b}128`.
|
||||||
|
|
||||||
@@ -945,3 +935,22 @@
|
|||||||
|
|
||||||
(rule (lower (has_type (multi_lane _bits _lanes) (band_not x y)))
|
(rule (lower (has_type (multi_lane _bits _lanes) (band_not x y)))
|
||||||
(value_reg (pandn (put_in_reg y) (put_in_reg_mem x))))
|
(value_reg (pandn (put_in_reg y) (put_in_reg_mem x))))
|
||||||
|
|
||||||
|
;;;; Rules for `fabs` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
|
||||||
|
;; Special case for `f32x4.abs`.
|
||||||
|
(rule (lower (has_type $F32X4 (fabs x)))
|
||||||
|
(value_reg (andps (put_in_reg x)
|
||||||
|
(RegMem.Reg (psrld (vector_all_ones $F32X4) (RegMemImm.Imm 1))))))
|
||||||
|
|
||||||
|
;; Special case for `f64x2.abs`.
|
||||||
|
(rule (lower (has_type $F64X2 (fabs x)))
|
||||||
|
(value_reg (andpd (put_in_reg x)
|
||||||
|
(RegMem.Reg (psrlq (vector_all_ones $F64X2) (RegMemImm.Imm 1))))))
|
||||||
|
|
||||||
|
;;;; Rules for `bnot` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
|
||||||
|
;; Special case for vector-types where bit-negation is an xor against an
|
||||||
|
;; all-one value
|
||||||
|
(rule (lower (has_type ty @ (multi_lane _bits _lanes) (bnot x)))
|
||||||
|
(value_reg (sse_xor ty (put_in_reg x) (RegMem.Reg (vector_all_ones ty)))))
|
||||||
|
|||||||
@@ -1615,14 +1615,11 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
let ty = ty.unwrap();
|
let ty = ty.unwrap();
|
||||||
|
|
||||||
if ty.is_vector() {
|
if ty.is_vector() {
|
||||||
let src = put_input_in_reg(ctx, inputs[0]);
|
unreachable!(
|
||||||
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
|
"implemented in ISLE: inst = `{}`, type = `{:?}`",
|
||||||
ctx.emit(Inst::gen_move(dst, src, ty));
|
ctx.dfg().display_inst(insn),
|
||||||
let tmp = ctx.alloc_tmp(ty).only_reg().unwrap();
|
ty
|
||||||
|
);
|
||||||
// Set tmp to all 1s before flipping the bits
|
|
||||||
ctx.emit(Inst::equals(types::I32X4, RegMem::from(tmp), tmp));
|
|
||||||
ctx.emit(Inst::xor(ty, RegMem::from(tmp), dst));
|
|
||||||
} else if ty == types::I128 || ty == types::B128 {
|
} else if ty == types::I128 || ty == types::B128 {
|
||||||
let src = put_input_in_regs(ctx, inputs[0]);
|
let src = put_input_in_regs(ctx, inputs[0]);
|
||||||
let dst = get_output_reg(ctx, outputs[0]);
|
let dst = get_output_reg(ctx, outputs[0]);
|
||||||
@@ -4669,8 +4666,13 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
// Shift the all 1s constant to generate the mask.
|
// Shift the all 1s constant to generate the mask.
|
||||||
let lane_bits = output_ty.lane_bits();
|
let lane_bits = output_ty.lane_bits();
|
||||||
let (shift_opcode, opcode, shift_by) = match (op, lane_bits) {
|
let (shift_opcode, opcode, shift_by) = match (op, lane_bits) {
|
||||||
(Opcode::Fabs, 32) => (SseOpcode::Psrld, SseOpcode::Andps, 1),
|
(Opcode::Fabs, _) => {
|
||||||
(Opcode::Fabs, 64) => (SseOpcode::Psrlq, SseOpcode::Andpd, 1),
|
unreachable!(
|
||||||
|
"implemented in ISLE: inst = `{}`, type = `{:?}`",
|
||||||
|
ctx.dfg().display_inst(insn),
|
||||||
|
ty
|
||||||
|
);
|
||||||
|
}
|
||||||
(Opcode::Fneg, 32) => (SseOpcode::Pslld, SseOpcode::Xorps, 31),
|
(Opcode::Fneg, 32) => (SseOpcode::Pslld, SseOpcode::Xorps, 31),
|
||||||
(Opcode::Fneg, 64) => (SseOpcode::Psllq, SseOpcode::Xorpd, 63),
|
(Opcode::Fneg, 64) => (SseOpcode::Psllq, SseOpcode::Xorpd, 63),
|
||||||
_ => unreachable!(
|
_ => unreachable!(
|
||||||
|
|||||||
@@ -13,7 +13,9 @@ use crate::isa::x64::settings as x64_settings;
|
|||||||
use crate::{
|
use crate::{
|
||||||
ir::{immediates::*, types::*, Inst, InstructionData, Opcode, Value, ValueList},
|
ir::{immediates::*, types::*, Inst, InstructionData, Opcode, Value, ValueList},
|
||||||
isa::x64::inst::{
|
isa::x64::inst::{
|
||||||
args::{Avx512Opcode, CmpOpcode, ExtMode, Imm8Reg, RegMem, ShiftKind, SseOpcode, CC},
|
args::{
|
||||||
|
Avx512Opcode, CmpOpcode, ExtMode, FcmpImm, Imm8Reg, RegMem, ShiftKind, SseOpcode, CC,
|
||||||
|
},
|
||||||
x64_map_regs, RegMapper,
|
x64_map_regs, RegMapper,
|
||||||
},
|
},
|
||||||
machinst::{get_output_reg, InsnInput, InsnOutput, LowerCtx},
|
machinst::{get_output_reg, InsnInput, InsnOutput, LowerCtx},
|
||||||
@@ -313,6 +315,11 @@ where
|
|||||||
RegMem::reg(self.put_in_reg(val))
|
RegMem::reg(self.put_in_reg(val))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn encode_fcmp_imm(&mut self, imm: &FcmpImm) -> u8 {
|
||||||
|
imm.encode()
|
||||||
|
}
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
fn avx512vl_enabled(&mut self, _: Type) -> Option<()> {
|
fn avx512vl_enabled(&mut self, _: Type) -> Option<()> {
|
||||||
if self.isa_flags.use_avx512vl_simd() {
|
if self.isa_flags.use_avx512vl_simd() {
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -7,4 +7,50 @@
|
|||||||
v128.not)
|
v128.not)
|
||||||
)
|
)
|
||||||
|
|
||||||
(assert_return (invoke "v128_not") (v128.const i32x4 -1 -1 -1 -1))
|
(assert_return (invoke "v128_not") (v128.const i32x4 -1 -1 -1 -1))
|
||||||
|
|
||||||
|
;; from #3327
|
||||||
|
(module
|
||||||
|
(func (result i32)
|
||||||
|
v128.const i32x4 0xffffffff 0x80bfffff 0x80bf0a0a 0x80bf0a0a
|
||||||
|
f64x2.promote_low_f32x4
|
||||||
|
v128.not
|
||||||
|
v128.not
|
||||||
|
v128.not
|
||||||
|
v128.not
|
||||||
|
v128.not
|
||||||
|
v128.not
|
||||||
|
v128.not
|
||||||
|
v128.const i32x4 0 0 0 0
|
||||||
|
f64x2.gt
|
||||||
|
v128.not
|
||||||
|
i64x2.bitmask)
|
||||||
|
(export "" (func 0)))
|
||||||
|
(assert_return (invoke "") (i32.const 0))
|
||||||
|
|
||||||
|
;; from #3327
|
||||||
|
(module
|
||||||
|
(type (func (param i32) (result i32)))
|
||||||
|
(func (type 0) (param i32) (result i32)
|
||||||
|
local.get 0
|
||||||
|
i32x4.splat
|
||||||
|
f64x2.abs
|
||||||
|
v128.not
|
||||||
|
i64x2.bitmask)
|
||||||
|
(export "1" (func 0)))
|
||||||
|
(assert_return (invoke "1" (i32.const 0)) (i32.const 3))
|
||||||
|
|
||||||
|
(module
|
||||||
|
(type (;0;) (func (result v128)))
|
||||||
|
(func (;0;) (type 0) (result v128)
|
||||||
|
v128.const i32x4 0x733c3e67 0x3c3e6776 0x3e677673 0x6776733c
|
||||||
|
i64x2.abs
|
||||||
|
i64x2.bitmask
|
||||||
|
i8x16.splat
|
||||||
|
v128.const i32x4 0x733c3e67 0x3c3e6776 0x3e677673 0x6776733c
|
||||||
|
i64x2.ge_s
|
||||||
|
f32x4.floor
|
||||||
|
v128.not
|
||||||
|
i16x8.extadd_pairwise_i8x16_u)
|
||||||
|
(export "x" (func 0)))
|
||||||
|
(assert_return (invoke "x") (v128.const i32x4 0x01fe01fe 0x01fe01fe 0x01fe01fe 0x01fe01fe))
|
||||||
|
|||||||
Reference in New Issue
Block a user