x64: Lower extractlane, scalar_to_vector, and splat in ISLE (#4780)
Lower `extractlane`, `scalar_to_vector`, and `splat` in ISLE. This PR also makes some changes to the `SinkableLoad` API:

* change the return type of `sink_load` to `RegMem`, since there are more functions available for dealing with `RegMem`
* add `reg_mem_to_reg_mem_imm` and register it as an automatic conversion
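As background on the second bullet: registering `(convert RegMem RegMemImm reg_mem_to_reg_mem_imm)` lets the ISLE compiler insert the constructor automatically wherever a `RegMem` value flows into a position expecting a `RegMemImm`. A minimal Rust sketch of the widening this performs, using simplified stand-in enums rather than the real Cranelift operand types:

    // Simplified stand-ins (hypothetical) for the x64 operand enums.
    #[allow(dead_code)]
    enum RegMem {
        Reg { reg: u8 },
        Mem { addr: u32 },
    }

    #[allow(dead_code)]
    enum RegMemImm {
        Reg { reg: u8 },
        Mem { addr: u32 },
        Imm { simm32: u32 },
    }

    // The conversion is total: every RegMem variant has a RegMemImm
    // counterpart, and the Imm case is simply never produced.
    fn reg_mem_to_reg_mem_imm(rm: RegMem) -> RegMemImm {
        match rm {
            RegMem::Reg { reg } => RegMemImm::Reg { reg },
            RegMem::Mem { addr } => RegMemImm::Mem { addr },
        }
    }

    fn main() {
        let rmi = reg_mem_to_reg_mem_imm(RegMem::Reg { reg: 3 });
        assert!(matches!(rmi, RegMemImm::Reg { reg: 3 }));
    }

Because the conversion is automatic, rules such as `(gpr_mem_imm_new (sink_load load))` below keep type-checking even though `sink_load` now returns the narrower `RegMem`.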
@@ -777,6 +777,13 @@
         (Reg (reg Reg))
         (Mem (addr SyntheticAmode))))

+;; Convert a RegMem to a RegMemImm.
+(decl reg_mem_to_reg_mem_imm (RegMem) RegMemImm)
+(rule (reg_mem_to_reg_mem_imm (RegMem.Reg reg))
+      (RegMemImm.Reg reg))
+(rule (reg_mem_to_reg_mem_imm (RegMem.Mem addr))
+      (RegMemImm.Mem addr))
+
 ;; Put the given clif value into a `RegMem` operand.
 ;;
 ;; Asserts that the value fits into a single register, and doesn't require
@@ -1456,13 +1463,17 @@
 ;; This is a side-effectful operation that notifies the context that the
 ;; instruction that produced the `SinkableImm` has been sunk into another
 ;; instruction, and no longer needs to be lowered.
-(decl sink_load (SinkableLoad) RegMemImm)
+(decl sink_load (SinkableLoad) RegMem)
 (extern constructor sink_load sink_load)

 (decl sink_load_to_gpr_mem_imm (SinkableLoad) GprMemImm)
 (rule (sink_load_to_gpr_mem_imm load)
       (gpr_mem_imm_new (sink_load load)))

+(decl sink_load_to_xmm_mem (SinkableLoad) XmmMem)
+(rule (sink_load_to_xmm_mem load)
+      (reg_mem_to_xmm_mem (sink_load load)))
+
 ;;;; Helpers for Sign/Zero Extending ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

 (type ExtKind extern
@@ -1534,6 +1545,13 @@
 (let ((r WritableXmm (temp_writable_xmm)))
   (x64_pcmpeqd r r)))

+;; Helper for creating XmmUninitializedValue instructions.
+(decl xmm_uninit_value () Xmm)
+(rule (xmm_uninit_value)
+      (let ((dst WritableXmm (temp_writable_xmm))
+            (_ Unit (emit (MInst.XmmUninitializedValue dst))))
+        dst))
+
 ;; Helper for creating an SSE register holding an `i64x2` from two `i64` values.
 (decl make_i64x2_from_lanes (GprMem GprMem) Xmm)
 (rule (make_i64x2_from_lanes lo hi)
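The pseudo-instruction wrapped here exists purely for the register allocator: multi-instruction sequences (such as the `splat` lowerings further down) build `dst` up lane by lane, and `XmmUninitializedValue` gives `dst` a definition point without emitting any machine code. A loose Rust analogy of that "define now, fill lanes later" pattern, offered only as an illustration:

    use std::mem::MaybeUninit;

    fn main() {
        // Like `uninit %xmm0`: a tracked definition with no initializing code.
        let mut lanes: [MaybeUninit<u32>; 4] = [MaybeUninit::uninit(); 4];
        // Like `pinsrd $0, ...`: write the one lane we rely on afterwards.
        lanes[0] = MaybeUninit::new(42);
        // Subsequent reads only touch lanes that were actually written.
        let lane0 = unsafe { lanes[0].assume_init() };
        assert_eq!(lane0, 42);
    }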
@@ -2828,6 +2846,30 @@
 (rule (x64_psrad src1 src2)
       (xmm_rmi_xmm (SseOpcode.Psrad) src1 src2))

+;; Helper for creating `pextrb` instructions.
+(decl x64_pextrb (Type Xmm u8) Gpr)
+(rule (x64_pextrb ty src lane)
+      (let ((dst WritableGpr (temp_writable_gpr))
+            (_ Unit (emit (MInst.XmmRmRImm (SseOpcode.Pextrb)
+                                           dst
+                                           src
+                                           dst
+                                           lane
+                                           (operand_size_of_type_32_64 (lane_type ty))))))
+        dst))
+
+;; Helper for creating `pextrw` instructions.
+(decl x64_pextrw (Type Xmm u8) Gpr)
+(rule (x64_pextrw ty src lane)
+      (let ((dst WritableGpr (temp_writable_gpr))
+            (_ Unit (emit (MInst.XmmRmRImm (SseOpcode.Pextrw)
+                                           dst
+                                           src
+                                           dst
+                                           lane
+                                           (operand_size_of_type_32_64 (lane_type ty))))))
+        dst))
+
 ;; Helper for creating `pextrd` instructions.
 (decl x64_pextrd (Type Xmm u8) Gpr)
 (rule (x64_pextrd ty src lane)
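These helpers (together with `x64_pextrd` below) wrap the x86 `pextr*` family, which copies the selected lane into a general-purpose register and zero-extends it. A scalar Rust model of that behavior, sketching the instruction semantics rather than the emitted code:

    // pextrb: copy byte lane `lane` of a 16-lane vector into a GPR,
    // zero-extending into the upper bits.
    fn pextrb(src: [u8; 16], lane: u8) -> u64 {
        src[lane as usize] as u64
    }

    // pextrw: the same for the 16-bit lanes of an 8-lane vector.
    fn pextrw(src: [u16; 8], lane: u8) -> u64 {
        src[lane as usize] as u64
    }

    fn main() {
        let mut v = [0u8; 16];
        v[3] = 0xFF;
        assert_eq!(pextrb(v, 3), 0xFF); // the GPR's upper bits are zero
        assert_eq!(pextrw([0x8000; 8], 0), 0x8000); // zero-, not sign-extended
    }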
@@ -3707,6 +3749,7 @@
 (convert WritableGpr Gpr writable_gpr_to_gpr)
 (convert RegMemImm GprMemImm gpr_mem_imm_new)
 (convert RegMem GprMem reg_mem_to_gpr_mem)
+(convert RegMem RegMemImm reg_mem_to_reg_mem_imm)
 (convert Reg GprMem reg_to_gpr_mem)
 (convert Reg GprMemImm reg_to_gpr_mem_imm)
 (convert WritableGpr WritableReg writable_gpr_to_reg)
@@ -78,6 +78,17 @@ impl Inst {
             dst: WritableXmm::from_writable_reg(dst).unwrap(),
         }
     }
+
+    // TODO Can be replaced by `Inst::move` (high-level) and `Inst::unary_rm_r` (low-level)
+    fn xmm_mov(op: SseOpcode, src: RegMem, dst: Writable<Reg>) -> Inst {
+        src.assert_regclass_is(RegClass::Float);
+        debug_assert!(dst.to_reg().class() == RegClass::Float);
+        Inst::XmmUnaryRmR {
+            op,
+            src: XmmMem::new(src).unwrap(),
+            dst: WritableXmm::from_writable_reg(dst).unwrap(),
+        }
+    }
 }

 #[test]
@@ -263,17 +263,6 @@ impl Inst {
         Inst::MovRR { size, src, dst }
     }

-    // TODO Can be replaced by `Inst::move` (high-level) and `Inst::unary_rm_r` (low-level)
-    pub(crate) fn xmm_mov(op: SseOpcode, src: RegMem, dst: Writable<Reg>) -> Inst {
-        src.assert_regclass_is(RegClass::Float);
-        debug_assert!(dst.to_reg().class() == RegClass::Float);
-        Inst::XmmUnaryRmR {
-            op,
-            src: XmmMem::new(src).unwrap(),
-            dst: WritableXmm::from_writable_reg(dst).unwrap(),
-        }
-    }
-
     pub(crate) fn xmm_load_const(src: VCodeConstant, dst: Writable<Reg>, ty: Type) -> Inst {
         debug_assert!(dst.to_reg().class() == RegClass::Float);
         debug_assert!(ty.is_vector() && ty.bits() == 128);
@@ -316,13 +305,6 @@ impl Inst {
         }
     }

-    pub(crate) fn xmm_uninit_value(dst: Writable<Reg>) -> Self {
-        debug_assert!(dst.to_reg().class() == RegClass::Float);
-        Inst::XmmUninitializedValue {
-            dst: WritableXmm::from_writable_reg(dst).unwrap(),
-        }
-    }
-
     pub(crate) fn xmm_mov_r_m(op: SseOpcode, src: Reg, dst: impl Into<SyntheticAmode>) -> Inst {
         debug_assert!(src.class() == RegClass::Float);
         Inst::XmmMovRM {
@@ -3547,3 +3547,99 @@
                        mask
                        (x64_xmm_load_const $I8X16 (swizzle_zero_mask)))))
     (x64_pshufb src mask)))
+
+;; Rules for `extractlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Remove the extractlane instruction, leaving the float where it is. The upper
+;; bits will remain unchanged; for correctness, this relies on Cranelift type
+;; checking to avoid using those bits.
+(rule (lower (has_type (ty_scalar_float _) (extractlane val (u8_from_uimm8 0))))
+      val)
+
+;; Cases 2-4 for an F32X4
+(rule (lower (has_type $F32 (extractlane val @ (value_type (ty_vec128 ty))
+                                          (u8_from_uimm8 lane))))
+      (x64_pshufd val lane (OperandSize.Size32)))
+
+;; This is the only remaining case for F64X2
+(rule (lower (has_type $F64 (extractlane val @ (value_type (ty_vec128 ty))
+                                          (u8_from_uimm8 1))))
+      ;; 0xee == 0b11_10_11_10
+      (x64_pshufd val 0xee (OperandSize.Size32)))
+
+(rule (lower (extractlane val @ (value_type ty @ (multi_lane 8 16)) (u8_from_uimm8 lane)))
+      (x64_pextrb ty val lane))
+
+(rule (lower (extractlane val @ (value_type ty @ (multi_lane 16 8)) (u8_from_uimm8 lane)))
+      (x64_pextrw ty val lane))
+
+(rule (lower (extractlane val @ (value_type ty @ (multi_lane 32 4)) (u8_from_uimm8 lane)))
+      (x64_pextrd ty val lane))
+
+(rule (lower (extractlane val @ (value_type ty @ (multi_lane 64 2)) (u8_from_uimm8 lane)))
+      (x64_pextrd ty val lane))
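The two float rules above lean on `pshufd`, whose 8-bit immediate packs four 2-bit lane selectors. A scalar Rust model makes the `0xee` constant concrete (a sketch of the shuffle semantics, not Cranelift code):

    // pshufd: dst lane i = src lane selected by bits [2i+1 : 2i] of `imm`.
    fn pshufd(src: [u32; 4], imm: u8) -> [u32; 4] {
        let sel = |i: usize| src[usize::from((imm >> (2 * i)) & 0b11)];
        [sel(0), sel(1), sel(2), sel(3)]
    }

    fn main() {
        // An f64x2 register viewed as 32-bit lanes: f64 lane 1 is lanes [2, 3].
        let v = [10, 11, 20, 21];
        // 0xee == 0b11_10_11_10 selects lanes [2, 3, 2, 3], so f64 lane 1
        // lands in the low 64 bits, as the F64 extractlane rule requires.
        assert_eq!(pshufd(v, 0xee), [20, 21, 20, 21]);
    }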
+
+;; Rules for `scalar_to_vector` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Case 1: when moving a scalar float, we simply move from one XMM register
+;; to another, expecting the register allocator to elide this. Here we
+;; assume that the upper bits of a scalar float have not been munged with
+;; (the same assumption the old backend makes).
+(rule (lower (scalar_to_vector src @ (value_type (ty_scalar_float _))))
+      src)
+
+;; Case 2: when moving a scalar value of any other type, use MOVD to zero
+;; the upper lanes.
+(rule (lower (scalar_to_vector src @ (value_type ty)))
+      (bitcast_gpr_to_xmm ty src))
+
+;; Case 3: when presented with `load + scalar_to_vector`, coalesce into a single
+;; MOVSS/MOVSD instruction.
+(rule (lower (scalar_to_vector (and (sinkable_load src) (value_type (ty_32 _)))))
+      (x64_movss_load (sink_load_to_xmm_mem src)))
+(rule (lower (scalar_to_vector (and (sinkable_load src) (value_type (ty_64 _)))))
+      (x64_movsd_load (sink_load_to_xmm_mem src)))
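Case 2 works because MOVD's destination semantics match the `scalar_to_vector` contract exactly: the scalar lands in lane 0 and every remaining bit of the XMM register is zeroed (the MOVSS/MOVSD load forms in case 3 also zero the upper lanes). A scalar Rust model of the resulting vector, as an illustration only:

    // The i32 case: scalar in lane 0, upper lanes zeroed.
    fn scalar_to_vector_i32(x: u32) -> [u32; 4] {
        [x, 0, 0, 0]
    }

    fn main() {
        assert_eq!(scalar_to_vector_i32(7), [7, 0, 0, 0]);
    }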
+
+;; Rules for `splat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(rule (lower (has_type (multi_lane 8 16) (splat src)))
+      (let ((vec Xmm (vec_insert_lane $I8X16 (xmm_uninit_value) src 0))
+            (zeros Xmm (x64_pxor vec vec)))
+        ;; Shuffle the lowest byte lane to all other lanes.
+        (x64_pshufb vec zeros)))
+
+(rule (lower (has_type (multi_lane 16 8) (splat src)))
+      (let (;; Force the input into a register so that we don't create a
+            ;; VCodeConstant.
+            (src RegMem (RegMem.Reg src))
+            (vec Xmm (vec_insert_lane $I16X8 (xmm_uninit_value) src 0))
+            (vec Xmm (vec_insert_lane $I16X8 vec src 1)))
+        ;; Shuffle the lowest two lanes to all other lanes.
+        (x64_pshufd vec 0 (OperandSize.Size32))))
+
+(rule (lower (has_type (multi_lane 32 4) (splat src @ (value_type (ty_scalar_float _)))))
+      (lower_splat_32x4 $F32X4 src))
+
+(rule (lower (has_type (multi_lane 32 4) (splat src)))
+      (lower_splat_32x4 $I32X4 src))
+
+(decl lower_splat_32x4 (Type Value) Xmm)
+(rule (lower_splat_32x4 ty src)
+      (let ((src RegMem src)
+            (vec Xmm (vec_insert_lane ty (xmm_uninit_value) src 0)))
+        ;; Shuffle the lowest lane to all other lanes.
+        (x64_pshufd vec 0 (OperandSize.Size32))))
+
+(rule (lower (has_type (multi_lane 64 2) (splat src @ (value_type (ty_scalar_float _)))))
+      (lower_splat_64x2 $F64X2 src))
+
+(rule (lower (has_type (multi_lane 64 2) (splat src)))
+      (lower_splat_64x2 $I64X2 src))
+
+(decl lower_splat_64x2 (Type Value) Xmm)
+(rule (lower_splat_64x2 ty src)
+      (let (;; Force the input into a register so that we don't create a
+            ;; VCodeConstant.
+            (src RegMem (RegMem.Reg src))
+            (vec Xmm (vec_insert_lane ty (xmm_uninit_value) src 0)))
+        (vec_insert_lane ty vec src 1)))
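All of these sequences implement the same contract, broadcasting the scalar into every lane; they differ only in which shuffle reaches the remaining lanes (pshufb with a zero mask for bytes, pshufd for 16/32-bit lanes, a second lane insert for 64-bit lanes). A scalar Rust model of the 32x4 path, insert into lane 0 followed by `pshufd $0`, sketching the semantics rather than the emitted code:

    fn splat_i32x4(x: u32) -> [u32; 4] {
        // Stands in for `xmm_uninit_value`: only lane 0 is meaningful.
        let mut v = [0u32; 4];
        v[0] = x; // vec_insert_lane ... 0  (pinsrd $0)
        // pshufd $0: all four 2-bit selectors in the immediate pick lane 0.
        let imm = 0u8;
        let sel = |i: usize| v[usize::from((imm >> (2 * i)) & 0b11)];
        [sel(0), sel(1), sel(2), sel(3)]
    }

    fn main() {
        assert_eq!(splat_i32x4(5), [5, 5, 5, 5]);
    }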
@@ -3,7 +3,7 @@
 // ISLE integration glue.
 pub(super) mod isle;

-use crate::ir::{types, ExternalName, Inst as IRInst, InstructionData, LibCall, Opcode, Type};
+use crate::ir::{types, ExternalName, Inst as IRInst, LibCall, Opcode, Type};
 use crate::isa::x64::abi::*;
 use crate::isa::x64::inst::args::*;
 use crate::isa::x64::inst::*;
@@ -160,100 +160,6 @@ fn input_to_imm(ctx: &mut Lower<Inst>, spec: InsnInput) -> Option<u64> {
         .constant
 }

-/// Emit an instruction to insert a value `src` into a lane of `dst`.
-fn emit_insert_lane(ctx: &mut Lower<Inst>, src: RegMem, dst: Writable<Reg>, lane: u8, ty: Type) {
-    if !ty.is_float() {
-        let (sse_op, size) = match ty.lane_bits() {
-            8 => (SseOpcode::Pinsrb, OperandSize::Size32),
-            16 => (SseOpcode::Pinsrw, OperandSize::Size32),
-            32 => (SseOpcode::Pinsrd, OperandSize::Size32),
-            64 => (SseOpcode::Pinsrd, OperandSize::Size64),
-            _ => panic!("Unable to insertlane for lane size: {}", ty.lane_bits()),
-        };
-        ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, size));
-    } else if ty == types::F32 {
-        let sse_op = SseOpcode::Insertps;
-        // Insert 32-bits from replacement (at index 00, bits 7:8) to vector (lane
-        // shifted into bits 5:6).
-        let lane = 0b00_00_00_00 | lane << 4;
-        ctx.emit(Inst::xmm_rm_r_imm(
-            sse_op,
-            src,
-            dst,
-            lane,
-            OperandSize::Size32,
-        ));
-    } else if ty == types::F64 {
-        let sse_op = match lane {
-            // Move the lowest quadword in replacement to vector without changing
-            // the upper bits.
-            0 => SseOpcode::Movsd,
-            // Move the low 64 bits of replacement vector to the high 64 bits of the
-            // vector.
-            1 => SseOpcode::Movlhps,
-            _ => unreachable!(),
-        };
-        // Here we use the `xmm_rm_r` encoding because it correctly tells the register
-        // allocator how we are using `dst`: we are using `dst` as a `mod` whereas other
-        // encoding formats like `xmm_unary_rm_r` treat it as a `def`.
-        ctx.emit(Inst::xmm_rm_r(sse_op, src, dst));
-    } else {
-        panic!("unable to emit insertlane for type: {}", ty)
-    }
-}
-
-/// Emit an instruction to extract a lane of `src` into `dst`.
-fn emit_extract_lane(ctx: &mut Lower<Inst>, src: Reg, dst: Writable<Reg>, lane: u8, ty: Type) {
-    if !ty.is_float() {
-        let (sse_op, size) = match ty.lane_bits() {
-            8 => (SseOpcode::Pextrb, OperandSize::Size32),
-            16 => (SseOpcode::Pextrw, OperandSize::Size32),
-            32 => (SseOpcode::Pextrd, OperandSize::Size32),
-            64 => (SseOpcode::Pextrd, OperandSize::Size64),
-            _ => panic!("Unable to extractlane for lane size: {}", ty.lane_bits()),
-        };
-        let src = RegMem::reg(src);
-        ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, size));
-    } else if ty == types::F32 || ty == types::F64 {
-        if lane == 0 {
-            // Remove the extractlane instruction, leaving the float where it is. The upper
-            // bits will remain unchanged; for correctness, this relies on Cranelift type
-            // checking to avoid using those bits.
-            ctx.emit(Inst::gen_move(dst, src, ty));
-        } else {
-            // Otherwise, shuffle the bits in `lane` to the lowest lane.
-            let sse_op = SseOpcode::Pshufd;
-            let mask = match ty {
-                // Move the value at `lane` to lane 0, copying existing value at lane 0 to
-                // other lanes. Again, this relies on Cranelift type checking to avoid
-                // using those bits.
-                types::F32 => {
-                    assert!(lane > 0 && lane < 4);
-                    0b00_00_00_00 | lane
-                }
-                // Move the value at `lane` 1 (we know it must be 1 because of the `if`
-                // statement above) to lane 0 and leave lane 1 unchanged. The Cranelift type
-                // checking assumption also applies here.
-                types::F64 => {
-                    assert!(lane == 1);
-                    0b11_10_11_10
-                }
-                _ => unreachable!(),
-            };
-            let src = RegMem::reg(src);
-            ctx.emit(Inst::xmm_rm_r_imm(
-                sse_op,
-                src,
-                dst,
-                mask,
-                OperandSize::Size32,
-            ));
-        }
-    } else {
-        panic!("unable to emit extractlane for type: {}", ty)
-    }
-}
-
 fn emit_vm_call(
     ctx: &mut Lower<Inst>,
     flags: &Flags,
@@ -586,132 +492,15 @@ fn lower_insn_to_regs(
         | Opcode::RawBitcast
        | Opcode::Insertlane
         | Opcode::Shuffle
-        | Opcode::Swizzle => {
+        | Opcode::Swizzle
+        | Opcode::Extractlane
+        | Opcode::ScalarToVector
+        | Opcode::Splat => {
             implemented_in_isle(ctx);
         }

         Opcode::DynamicStackAddr => unimplemented!("DynamicStackAddr"),

-        Opcode::Extractlane => {
-            // The instruction format maps to variables like: %dst = extractlane %src, %lane
-            let ty = ty.unwrap();
-            let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-            let src_ty = ctx.input_ty(insn, 0);
-            assert_eq!(src_ty.bits(), 128);
-            let src = put_input_in_reg(ctx, inputs[0]);
-            let lane = if let InstructionData::BinaryImm8 { imm, .. } = ctx.data(insn) {
-                *imm
-            } else {
-                unreachable!();
-            };
-            debug_assert!(lane < src_ty.lane_count() as u8);
-
-            emit_extract_lane(ctx, src, dst, lane, ty);
-        }
-
-        Opcode::ScalarToVector => {
-            // When moving a scalar value to a vector register, we must be handle several
-            // situations:
-            // 1. a scalar float is already in an XMM register, so we simply move it
-            // 2. a scalar of any other type resides in a GPR register: MOVD moves the bits to an
-            //    XMM register and zeroes the upper bits
-            // 3. a scalar (float or otherwise) that has previously been loaded from memory (e.g.
-            //    the default lowering of Wasm's `load[32|64]_zero`) can be lowered to a single
-            //    MOVSS/MOVSD instruction; to do this, we rely on `input_to_reg_mem` to sink the
-            //    unused load.
-            let src = input_to_reg_mem(ctx, inputs[0]);
-            let src_ty = ctx.input_ty(insn, 0);
-            let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-            let dst_ty = ty.unwrap();
-            assert!(src_ty == dst_ty.lane_type() && dst_ty.bits() == 128);
-            match src {
-                RegMem::Reg { reg } => {
-                    if src_ty.is_float() {
-                        // Case 1: when moving a scalar float, we simply move from one XMM register
-                        // to another, expecting the register allocator to elide this. Here we
-                        // assume that the upper bits of a scalar float have not been munged with
-                        // (the same assumption the old backend makes).
-                        ctx.emit(Inst::gen_move(dst, reg, dst_ty));
-                    } else {
-                        // Case 2: when moving a scalar value of any other type, use MOVD to zero
-                        // the upper lanes.
-                        let src_size = match src_ty.bits() {
-                            32 => OperandSize::Size32,
-                            64 => OperandSize::Size64,
-                            _ => unimplemented!("invalid source size for type: {}", src_ty),
-                        };
-                        ctx.emit(Inst::gpr_to_xmm(SseOpcode::Movd, src, src_size, dst));
-                    }
-                }
-                RegMem::Mem { .. } => {
-                    // Case 3: when presented with `load + scalar_to_vector`, coalesce into a single
-                    // MOVSS/MOVSD instruction.
-                    let opcode = match src_ty.bits() {
-                        32 => SseOpcode::Movss,
-                        64 => SseOpcode::Movsd,
-                        _ => unimplemented!("unable to move scalar to vector for type: {}", src_ty),
-                    };
-                    ctx.emit(Inst::xmm_mov(opcode, src, dst));
-                }
-            }
-        }
-
-        Opcode::Splat => {
-            let ty = ty.unwrap();
-            assert_eq!(ty.bits(), 128);
-            let src_ty = ctx.input_ty(insn, 0);
-            assert!(src_ty.bits() < 128);
-
-            let src = input_to_reg_mem(ctx, inputs[0]);
-            let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-
-            // We know that splat will overwrite all of the lanes of `dst` but it takes several
-            // instructions to do so. Because of the multiple instructions, there is no good way to
-            // declare `dst` a `def` except with the following pseudo-instruction.
-            ctx.emit(Inst::xmm_uninit_value(dst));
-
-            // TODO: eventually many of these sequences could be optimized with AVX's VBROADCAST*
-            // and VPBROADCAST*.
-            match ty.lane_bits() {
-                8 => {
-                    emit_insert_lane(ctx, src, dst, 0, ty.lane_type());
-                    // Initialize a register with all 0s.
-                    let tmp = ctx.alloc_tmp(ty).only_reg().unwrap();
-                    ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), tmp));
-                    // Shuffle the lowest byte lane to all other lanes.
-                    ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp), dst))
-                }
-                16 => {
-                    emit_insert_lane(ctx, src.clone(), dst, 0, ty.lane_type());
-                    emit_insert_lane(ctx, src, dst, 1, ty.lane_type());
-                    // Shuffle the lowest two lanes to all other lanes.
-                    ctx.emit(Inst::xmm_rm_r_imm(
-                        SseOpcode::Pshufd,
-                        RegMem::from(dst),
-                        dst,
-                        0,
-                        OperandSize::Size32,
-                    ))
-                }
-                32 => {
-                    emit_insert_lane(ctx, src, dst, 0, ty.lane_type());
-                    // Shuffle the lowest lane to all other lanes.
-                    ctx.emit(Inst::xmm_rm_r_imm(
-                        SseOpcode::Pshufd,
-                        RegMem::from(dst),
-                        dst,
-                        0,
-                        OperandSize::Size32,
-                    ))
-                }
-                64 => {
-                    emit_insert_lane(ctx, src.clone(), dst, 0, ty.lane_type());
-                    emit_insert_lane(ctx, src, dst, 1, ty.lane_type());
-                }
-                _ => panic!("Invalid type to splat: {}", ty),
-            }
-        }
-
         Opcode::VanyTrue => {
             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
             let src_ty = ctx.input_ty(insn, 0);
@@ -306,10 +306,10 @@ impl Context for IsleContext<'_, '_, MInst, Flags, IsaFlags, 6> {
         None
     }

-    fn sink_load(&mut self, load: &SinkableLoad) -> RegMemImm {
+    fn sink_load(&mut self, load: &SinkableLoad) -> RegMem {
         self.lower_ctx.sink_inst(load.inst);
         let addr = lower_to_amode(self.lower_ctx, load.addr_input, load.offset);
-        RegMemImm::Mem {
+        RegMem::Mem {
             addr: SyntheticAmode::Real(addr),
         }
    }
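The narrower return type is the heart of the API change: a sunk load is always a memory operand, so `RegMem` makes the immediate case unrepresentable, and callers widen the result (via the new automatic conversion) instead of narrowing it. A toy Rust sketch of the contrast, with stand-in types rather than the real ones:

    #[allow(dead_code)]
    enum RegMem { Reg(u8), Mem(u32) }
    #[allow(dead_code)]
    enum RegMemImm { Reg(u8), Mem(u32), Imm(u32) }

    // Under the old signature, a consumer that needed a plain register-or-
    // memory operand had to narrow fallibly, even though `sink_load` could
    // never actually return an immediate:
    fn narrow(rmi: RegMemImm) -> Option<RegMem> {
        match rmi {
            RegMemImm::Reg(r) => Some(RegMem::Reg(r)),
            RegMemImm::Mem(a) => Some(RegMem::Mem(a)),
            RegMemImm::Imm(_) => None, // dead case the type couldn't rule out
        }
    }

    fn main() {
        assert!(narrow(RegMemImm::Mem(16)).is_some());
    }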
@@ -298,6 +298,24 @@ macro_rules! isle_prelude_methods {
         }
     }

+    #[inline]
+    fn ty_32(&mut self, ty: Type) -> Option<Type> {
+        if ty.bits() == 32 {
+            Some(ty)
+        } else {
+            None
+        }
+    }
+
+    #[inline]
+    fn ty_64(&mut self, ty: Type) -> Option<Type> {
+        if ty.bits() == 64 {
+            Some(ty)
+        } else {
+            None
+        }
+    }
+
     #[inline]
     fn ty_32_or_64(&mut self, ty: Type) -> Option<Type> {
         if ty.bits() == 32 || ty.bits() == 64 {
@@ -328,6 +328,14 @@
 (decl fits_in_64 (Type) Type)
 (extern extractor fits_in_64 fits_in_64)

+;; An extractor that only matches types that fit in exactly 32 bits.
+(decl ty_32 (Type) Type)
+(extern extractor ty_32 ty_32)
+
+;; An extractor that only matches types that fit in exactly 64 bits.
+(decl ty_64 (Type) Type)
+(extern extractor ty_64 ty_64)
+
 ;; A pure constructor that only matches scalar booleans, integers, and
 ;; references that can fit in 64 bits.
 (decl pure ty_int_bool_ref_scalar_64 (Type) Type)
cranelift/filetests/filetests/isa/x64/extractlane.clif (new file, 87 lines)
@@ -0,0 +1,87 @@
+test compile precise-output
+target x86_64
+
+function %f1(i8x16) -> i8 {
+block0(v0: i8x16):
+  v1 = extractlane v0, 1
+  return v1
+}
+
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; pextrb $1, %xmm0, %rax
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+
+function %f2(i16x8) -> i16 {
+block0(v0: i16x8):
+  v1 = extractlane v0, 1
+  return v1
+}
+
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; pextrw $1, %xmm0, %rax
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+
+function %f3(i32x4) -> i32 {
+block0(v0: i32x4):
+  v1 = extractlane v0, 1
+  return v1
+}
+
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; pextrd $1, %xmm0, %rax
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+
+function %f4(i64x2) -> i64 {
+block0(v0: i64x2):
+  v1 = extractlane v0, 1
+  return v1
+}
+
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; pextrd.w $1, %xmm0, %rax
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+
+function %f5(f32x4) -> f32 {
+block0(v0: f32x4):
+  v1 = extractlane v0, 1
+  return v1
+}
+
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; pshufd $1, %xmm0, %xmm0
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+
+function %f6(f64x2) -> f64 {
+block0(v0: f64x2):
+  v1 = extractlane v0, 1
+  return v1
+}
+
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; pshufd $238, %xmm0, %xmm0
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+
@@ -74,8 +74,8 @@ block0(v0: i8):
 ; block0:
 ; uninit %xmm0
 ; pinsrb $0, %xmm0, %rdi, %xmm0
-; pxor %xmm6, %xmm6, %xmm6
-; pshufb %xmm0, %xmm6, %xmm0
+; pxor %xmm7, %xmm7, %xmm7
+; pshufb %xmm0, %xmm7, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -90,11 +90,11 @@ block0:
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block0:
-; movl $65535, %eax
-; uninit %xmm0
-; pinsrw $0, %xmm0, %rax, %xmm0
-; pinsrw $1, %xmm0, %rax, %xmm0
-; pshufd $0, %xmm0, %xmm0
+; movl $65535, %edi
+; uninit %xmm5
+; pinsrw $0, %xmm5, %rdi, %xmm5
+; pinsrw $1, %xmm5, %rdi, %xmm5
+; pshufd $0, %xmm5, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -108,9 +108,9 @@ block0(v0: i32):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block0:
-; uninit %xmm0
-; pinsrd $0, %xmm0, %rdi, %xmm0
-; pshufd $0, %xmm0, %xmm0
+; uninit %xmm4
+; pinsrd $0, %xmm4, %rdi, %xmm4
+; pshufd $0, %xmm4, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -124,11 +124,11 @@ block0(v0: f64):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block0:
-; movdqa %xmm0, %xmm4
+; movdqa %xmm0, %xmm6
 ; uninit %xmm0
-; movdqa %xmm4, %xmm5
-; movsd %xmm0, %xmm5, %xmm0
-; movlhps %xmm0, %xmm5, %xmm0
+; movdqa %xmm6, %xmm7
+; movsd %xmm0, %xmm7, %xmm0
+; movlhps %xmm0, %xmm7, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret