x64: Lower shuffle and swizzle in ISLE (#4772)
Lower `shuffle` and `swizzle` in ISLE.
This PR surfaced a bug with the lowering of `shuffle` when avx512vl and avx512vbmi are enabled: we use `vpermi2b` as the implementation, but panic if the immediate shuffle mask contains any out-of-bounds values. The behavior when the avx512 extensions are not present is that out-of-bounds values are turned into `0` in the result.
I've resolved this by detecting when the shuffle immediate has out-of-bounds indices in the avx512-enabled lowering, and generating an additional mask to zero out the lanes where those indices occur. This brings the avx512 case into line with the semantics of the `shuffle` op: 94bcbe8446/cranelift/codegen/meta/src/shared/instructions.rs (L1495-L1498)
This commit is contained in:
@@ -1400,6 +1400,9 @@
|
|||||||
(decl avx512bitalg_enabled () Type)
|
(decl avx512bitalg_enabled () Type)
|
||||||
(extern extractor avx512bitalg_enabled avx512bitalg_enabled)
|
(extern extractor avx512bitalg_enabled avx512bitalg_enabled)
|
||||||
|
|
||||||
|
(decl avx512vbmi_enabled () Type)
|
||||||
|
(extern extractor avx512vbmi_enabled avx512vbmi_enabled)
|
||||||
|
|
||||||
(decl use_lzcnt () Type)
|
(decl use_lzcnt () Type)
|
||||||
(extern extractor use_lzcnt use_lzcnt)
|
(extern extractor use_lzcnt use_lzcnt)
|
||||||
|
|
||||||
@@ -2740,6 +2743,19 @@
|
|||||||
src1
|
src1
|
||||||
src2))
|
src2))
|
||||||
|
|
||||||
|
;; Helper for creating `vpermi2b` instructions.
|
||||||
|
;;
|
||||||
|
;; Requires AVX-512 vl and vbmi extensions.
|
||||||
|
(decl x64_vpermi2b (Xmm Xmm Xmm) Xmm)
|
||||||
|
(rule (x64_vpermi2b src1 src2 src3)
|
||||||
|
(let ((dst WritableXmm (temp_writable_xmm))
|
||||||
|
(_ Unit (emit (gen_move $I8X16 dst src3)))
|
||||||
|
(_ Unit (emit (MInst.XmmRmREvex (Avx512Opcode.Vpermi2b)
|
||||||
|
src1
|
||||||
|
src2
|
||||||
|
dst))))
|
||||||
|
dst))
|
||||||
|
|
||||||
;; Helper for creating `MInst.MulHi` instructions.
|
;; Helper for creating `MInst.MulHi` instructions.
|
||||||
;;
|
;;
|
||||||
;; Returns the (lo, hi) register halves of the multiplication.
|
;; Returns the (lo, hi) register halves of the multiplication.
|
||||||
@@ -3634,6 +3650,47 @@
|
|||||||
(let ((dst WritableGpr (pinned_writable_gpr)))
|
(let ((dst WritableGpr (pinned_writable_gpr)))
|
||||||
(SideEffectNoResult.Inst (gen_move $I64 dst val))))
|
(SideEffectNoResult.Inst (gen_move $I64 dst val))))
|
||||||
|
|
||||||
|
;;;; Shuffle ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
|
||||||
|
;; Produce a mask suitable for use with `pshufb` for permuting the argument to
|
||||||
|
;; shuffle, when the arguments are the same (i.e. `shuffle a a mask`). This will
|
||||||
|
;; map all indices in the range 0..31 to the range 0..15.
|
||||||
|
(decl shuffle_0_31_mask (VecMask) VCodeConstant)
|
||||||
|
(extern constructor shuffle_0_31_mask shuffle_0_31_mask)
|
||||||
|
|
||||||
|
;; Produce a mask suitable for use with `pshufb` for permuting the lhs of a
|
||||||
|
;; `shuffle` operation (lanes 0-15).
|
||||||
|
(decl shuffle_0_15_mask (VecMask) VCodeConstant)
|
||||||
|
(extern constructor shuffle_0_15_mask shuffle_0_15_mask)
|
||||||
|
|
||||||
|
;; Produce a mask suitable for use with `pshufb` for permuting the rhs of a
|
||||||
|
;; `shuffle` operation (lanes 16-31).
|
||||||
|
(decl shuffle_16_31_mask (VecMask) VCodeConstant)
|
||||||
|
(extern constructor shuffle_16_31_mask shuffle_16_31_mask)
|
||||||
|
|
||||||
|
;; Produce a permutation suitable for use with `vpermi2b`, for permuting two
|
||||||
|
;; I8X16 vectors simultaneously.
|
||||||
|
;;
|
||||||
|
;; NOTE: `vpermi2b` will mask the indices in each lane to 5 bits when indexing
|
||||||
|
;; into vectors, so this constructor makes no effort to handle indices that are
|
||||||
|
;; larger than 31. If you are lowering a clif opcode like `shuffle` that has
|
||||||
|
;; special behavior for out of bounds indices (emitting a `0` in the resulting
|
||||||
|
;; vector in the case of `shuffle`) you'll need to handle that behavior
|
||||||
|
;; separately.
|
||||||
|
(decl perm_from_mask (VecMask) VCodeConstant)
|
||||||
|
(extern constructor perm_from_mask perm_from_mask)
|
||||||
|
|
||||||
|
;; If the mask that would be given to `shuffle` contains any out-of-bounds
|
||||||
|
;; indices, return a mask that will zero those.
|
||||||
|
(decl perm_from_mask_with_zeros (VCodeConstant VCodeConstant) VecMask)
|
||||||
|
(extern extractor perm_from_mask_with_zeros perm_from_mask_with_zeros)
|
||||||
|
|
||||||
|
;;;; Swizzle ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
|
||||||
|
;; Create a mask for zeroing out-of-bounds lanes of the swizzle mask.
|
||||||
|
(decl swizzle_zero_mask () VCodeConstant)
|
||||||
|
(extern constructor swizzle_zero_mask swizzle_zero_mask)
|
||||||
|
|
||||||
;;;; Automatic conversions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;; Automatic conversions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
|
||||||
(convert Gpr InstOutput output_gpr)
|
(convert Gpr InstOutput output_gpr)
|
||||||
|
|||||||
@@ -66,6 +66,18 @@ impl Inst {
|
|||||||
dst_hi: WritableGpr::from_reg(Gpr::new(regs::rdx()).unwrap()),
|
dst_hi: WritableGpr::from_reg(Gpr::new(regs::rdx()).unwrap()),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn xmm_rm_r_evex(op: Avx512Opcode, src1: RegMem, src2: Reg, dst: Writable<Reg>) -> Self {
|
||||||
|
src1.assert_regclass_is(RegClass::Float);
|
||||||
|
debug_assert!(src2.class() == RegClass::Float);
|
||||||
|
debug_assert!(dst.to_reg().class() == RegClass::Float);
|
||||||
|
Inst::XmmRmREvex {
|
||||||
|
op,
|
||||||
|
src1: XmmMem::new(src1).unwrap(),
|
||||||
|
src2: Xmm::new(src2).unwrap(),
|
||||||
|
dst: WritableXmm::from_writable_reg(dst).unwrap(),
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
|||||||
@@ -316,23 +316,6 @@ impl Inst {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn xmm_rm_r_evex(
|
|
||||||
op: Avx512Opcode,
|
|
||||||
src1: RegMem,
|
|
||||||
src2: Reg,
|
|
||||||
dst: Writable<Reg>,
|
|
||||||
) -> Self {
|
|
||||||
src1.assert_regclass_is(RegClass::Float);
|
|
||||||
debug_assert!(src2.class() == RegClass::Float);
|
|
||||||
debug_assert!(dst.to_reg().class() == RegClass::Float);
|
|
||||||
Inst::XmmRmREvex {
|
|
||||||
op,
|
|
||||||
src1: XmmMem::new(src1).unwrap(),
|
|
||||||
src2: Xmm::new(src2).unwrap(),
|
|
||||||
dst: WritableXmm::from_writable_reg(dst).unwrap(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) fn xmm_uninit_value(dst: Writable<Reg>) -> Self {
|
pub(crate) fn xmm_uninit_value(dst: Writable<Reg>) -> Self {
|
||||||
debug_assert!(dst.to_reg().class() == RegClass::Float);
|
debug_assert!(dst.to_reg().class() == RegClass::Float);
|
||||||
Inst::XmmUninitializedValue {
|
Inst::XmmUninitializedValue {
|
||||||
|
|||||||
@@ -3500,3 +3500,50 @@
|
|||||||
;; register allocator a definition for the output virtual register.
|
;; register allocator a definition for the output virtual register.
|
||||||
(rule (lower (raw_bitcast val))
|
(rule (lower (raw_bitcast val))
|
||||||
(put_in_regs val))
|
(put_in_regs val))
|
||||||
|
|
||||||
|
;; Rules for `shuffle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
|
||||||
|
;; If `lhs` and `rhs` are the same we can use a single PSHUFB to shuffle the XMM
|
||||||
|
;; register. We statically build `constructed_mask` to zero out any unknown lane
|
||||||
|
;; indices (may not be completely necessary: verification could fail incorrect
|
||||||
|
;; mask values) and fix the indexes to all point to the `dst` vector.
|
||||||
|
(rule (lower (shuffle a a (vec_mask_from_immediate mask)))
|
||||||
|
(x64_pshufb a (x64_xmm_load_const $I8X16 (shuffle_0_31_mask mask))))
|
||||||
|
|
||||||
|
;; For the case where the shuffle mask contains out-of-bounds values (values
|
||||||
|
;; greater than 31) we must mask off those resulting values in the result of
|
||||||
|
;; `vpermi2b`.
|
||||||
|
(rule (lower (has_type (and (avx512vl_enabled) (avx512vbmi_enabled))
|
||||||
|
(shuffle a b (vec_mask_from_immediate
|
||||||
|
(perm_from_mask_with_zeros mask zeros)))))
|
||||||
|
(x64_andps
|
||||||
|
(x64_xmm_load_const $I8X16 zeros)
|
||||||
|
(x64_vpermi2b b a (x64_xmm_load_const $I8X16 mask))))
|
||||||
|
|
||||||
|
;; However, if the shuffle mask contains no out-of-bounds values, we can use
|
||||||
|
;; `vpermi2b` without any masking.
|
||||||
|
(rule (lower (has_type (and (avx512vl_enabled) (avx512vbmi_enabled))
|
||||||
|
(shuffle a b (vec_mask_from_immediate mask))))
|
||||||
|
(x64_vpermi2b b a (x64_xmm_load_const $I8X16 (perm_from_mask mask))))
|
||||||
|
|
||||||
|
;; If `lhs` and `rhs` are different, we must shuffle each separately and then OR
|
||||||
|
;; them together. This is necessary due to PSHUFB semantics. As in the case
|
||||||
|
;; above, we build the `constructed_mask` for each case statically.
|
||||||
|
(rule (lower (shuffle a b (vec_mask_from_immediate mask)))
|
||||||
|
(x64_por
|
||||||
|
(x64_pshufb a (x64_xmm_load_const $I8X16 (shuffle_0_15_mask mask)))
|
||||||
|
(x64_pshufb b (x64_xmm_load_const $I8X16 (shuffle_16_31_mask mask)))))
|
||||||
|
|
||||||
|
;; Rules for `swizzle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
|
||||||
|
;; SIMD swizzle; the following inefficient implementation is due to the Wasm
|
||||||
|
;; SIMD spec requiring mask indexes greater than 15 to have the same semantics
|
||||||
|
;; as a 0 index. For the spec discussion, see
|
||||||
|
;; https://github.com/WebAssembly/simd/issues/93. The CLIF semantics match the
|
||||||
|
;; Wasm SIMD semantics for this instruction. The instruction format maps to
|
||||||
|
;; variables like: %dst = swizzle %src, %mask
|
||||||
|
(rule (lower (swizzle src mask))
|
||||||
|
(let ((mask Xmm (x64_paddusb
|
||||||
|
mask
|
||||||
|
(x64_xmm_load_const $I8X16 (swizzle_zero_mask)))))
|
||||||
|
(x64_pshufb src mask)))
|
||||||
|
|||||||
@@ -3,7 +3,6 @@
|
|||||||
// ISLE integration glue.
|
// ISLE integration glue.
|
||||||
pub(super) mod isle;
|
pub(super) mod isle;
|
||||||
|
|
||||||
use crate::data_value::DataValue;
|
|
||||||
use crate::ir::{types, ExternalName, Inst as IRInst, InstructionData, LibCall, Opcode, Type};
|
use crate::ir::{types, ExternalName, Inst as IRInst, InstructionData, LibCall, Opcode, Type};
|
||||||
use crate::isa::x64::abi::*;
|
use crate::isa::x64::abi::*;
|
||||||
use crate::isa::x64::inst::args::*;
|
use crate::isa::x64::inst::args::*;
|
||||||
@@ -585,139 +584,14 @@ fn lower_insn_to_regs(
|
|||||||
| Opcode::SetPinnedReg
|
| Opcode::SetPinnedReg
|
||||||
| Opcode::Vconst
|
| Opcode::Vconst
|
||||||
| Opcode::RawBitcast
|
| Opcode::RawBitcast
|
||||||
| Opcode::Insertlane => {
|
| Opcode::Insertlane
|
||||||
|
| Opcode::Shuffle
|
||||||
|
| Opcode::Swizzle => {
|
||||||
implemented_in_isle(ctx);
|
implemented_in_isle(ctx);
|
||||||
}
|
}
|
||||||
|
|
||||||
Opcode::DynamicStackAddr => unimplemented!("DynamicStackAddr"),
|
Opcode::DynamicStackAddr => unimplemented!("DynamicStackAddr"),
|
||||||
|
|
||||||
Opcode::Shuffle => {
|
|
||||||
let ty = ty.unwrap();
|
|
||||||
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
|
|
||||||
let lhs_ty = ctx.input_ty(insn, 0);
|
|
||||||
let lhs = put_input_in_reg(ctx, inputs[0]);
|
|
||||||
let rhs = put_input_in_reg(ctx, inputs[1]);
|
|
||||||
let mask = match ctx.get_immediate(insn) {
|
|
||||||
Some(DataValue::V128(bytes)) => bytes.to_vec(),
|
|
||||||
_ => unreachable!("shuffle should always have a 16-byte immediate"),
|
|
||||||
};
|
|
||||||
|
|
||||||
// A mask-building helper: in 128-bit SIMD, 0-15 indicate which lane to read from and a
|
|
||||||
// 1 in the most significant position zeroes the lane.
|
|
||||||
let zero_unknown_lane_index = |b: u8| if b > 15 { 0b10000000 } else { b };
|
|
||||||
|
|
||||||
ctx.emit(Inst::gen_move(dst, rhs, ty));
|
|
||||||
if rhs == lhs {
|
|
||||||
// If `lhs` and `rhs` are the same we can use a single PSHUFB to shuffle the XMM
|
|
||||||
// register. We statically build `constructed_mask` to zero out any unknown lane
|
|
||||||
// indices (may not be completely necessary: verification could fail incorrect mask
|
|
||||||
// values) and fix the indexes to all point to the `dst` vector.
|
|
||||||
let constructed_mask = mask
|
|
||||||
.iter()
|
|
||||||
// If the mask is greater than 15 it still may be referring to a lane in b.
|
|
||||||
.map(|&b| if b > 15 { b.wrapping_sub(16) } else { b })
|
|
||||||
.map(zero_unknown_lane_index)
|
|
||||||
.collect();
|
|
||||||
let constant = ctx.use_constant(VCodeConstantData::Generated(constructed_mask));
|
|
||||||
let tmp = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
|
|
||||||
ctx.emit(Inst::xmm_load_const(constant, tmp, ty));
|
|
||||||
// After loading the constructed mask in a temporary register, we use this to
|
|
||||||
// shuffle the `dst` register (remember that, in this case, it is the same as
|
|
||||||
// `src` so we disregard this register).
|
|
||||||
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp), dst));
|
|
||||||
} else {
|
|
||||||
if isa_flags.use_avx512vl_simd() && isa_flags.use_avx512vbmi_simd() {
|
|
||||||
assert!(
|
|
||||||
mask.iter().all(|b| *b < 32),
|
|
||||||
"shuffle mask values must be between 0 and 31"
|
|
||||||
);
|
|
||||||
|
|
||||||
// Load the mask into the destination register.
|
|
||||||
let constant = ctx.use_constant(VCodeConstantData::Generated(mask.into()));
|
|
||||||
ctx.emit(Inst::xmm_load_const(constant, dst, ty));
|
|
||||||
|
|
||||||
// VPERMI2B has the exact semantics of Wasm's shuffle:
|
|
||||||
// permute the bytes in `src1` and `src2` using byte indexes
|
|
||||||
// in `dst` and store the byte results in `dst`.
|
|
||||||
ctx.emit(Inst::xmm_rm_r_evex(
|
|
||||||
Avx512Opcode::Vpermi2b,
|
|
||||||
RegMem::reg(rhs),
|
|
||||||
lhs,
|
|
||||||
dst,
|
|
||||||
));
|
|
||||||
} else {
|
|
||||||
// If `lhs` and `rhs` are different, we must shuffle each separately and then OR
|
|
||||||
// them together. This is necessary due to PSHUFB semantics. As in the case above,
|
|
||||||
// we build the `constructed_mask` for each case statically.
|
|
||||||
|
|
||||||
// PSHUFB the `lhs` argument into `tmp0`, placing zeroes for unused lanes.
|
|
||||||
let tmp0 = ctx.alloc_tmp(lhs_ty).only_reg().unwrap();
|
|
||||||
ctx.emit(Inst::gen_move(tmp0, lhs, lhs_ty));
|
|
||||||
let constructed_mask =
|
|
||||||
mask.iter().cloned().map(zero_unknown_lane_index).collect();
|
|
||||||
let constant = ctx.use_constant(VCodeConstantData::Generated(constructed_mask));
|
|
||||||
let tmp1 = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
|
|
||||||
ctx.emit(Inst::xmm_load_const(constant, tmp1, ty));
|
|
||||||
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp1), tmp0));
|
|
||||||
|
|
||||||
// PSHUFB the second argument, placing zeroes for unused lanes.
|
|
||||||
let constructed_mask = mask
|
|
||||||
.iter()
|
|
||||||
.map(|b| b.wrapping_sub(16))
|
|
||||||
.map(zero_unknown_lane_index)
|
|
||||||
.collect();
|
|
||||||
let constant = ctx.use_constant(VCodeConstantData::Generated(constructed_mask));
|
|
||||||
let tmp2 = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
|
|
||||||
ctx.emit(Inst::xmm_load_const(constant, tmp2, ty));
|
|
||||||
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp2), dst));
|
|
||||||
|
|
||||||
// OR the shuffled registers (the mechanism and lane-size for OR-ing the registers
|
|
||||||
// is not important).
|
|
||||||
ctx.emit(Inst::xmm_rm_r(SseOpcode::Orps, RegMem::from(tmp0), dst));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Opcode::Swizzle => {
|
|
||||||
// SIMD swizzle; the following inefficient implementation is due to the Wasm SIMD spec
|
|
||||||
// requiring mask indexes greater than 15 to have the same semantics as a 0 index. For
|
|
||||||
// the spec discussion, see https://github.com/WebAssembly/simd/issues/93. The CLIF
|
|
||||||
// semantics match the Wasm SIMD semantics for this instruction.
|
|
||||||
// The instruction format maps to variables like: %dst = swizzle %src, %mask
|
|
||||||
let ty = ty.unwrap();
|
|
||||||
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
|
|
||||||
let src = put_input_in_reg(ctx, inputs[0]);
|
|
||||||
let swizzle_mask = put_input_in_reg(ctx, inputs[1]);
|
|
||||||
|
|
||||||
// Inform the register allocator that `src` and `dst` should be in the same register.
|
|
||||||
ctx.emit(Inst::gen_move(dst, src, ty));
|
|
||||||
|
|
||||||
// Create a mask for zeroing out-of-bounds lanes of the swizzle mask.
|
|
||||||
let zero_mask = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
|
|
||||||
static ZERO_MASK_VALUE: [u8; 16] = [
|
|
||||||
0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
|
|
||||||
0x70, 0x70,
|
|
||||||
];
|
|
||||||
let constant = ctx.use_constant(VCodeConstantData::WellKnown(&ZERO_MASK_VALUE));
|
|
||||||
ctx.emit(Inst::xmm_load_const(constant, zero_mask, ty));
|
|
||||||
|
|
||||||
// Use the `zero_mask` on a writable `swizzle_mask`.
|
|
||||||
let swizzle_mask_tmp = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
|
|
||||||
ctx.emit(Inst::gen_move(swizzle_mask_tmp, swizzle_mask, ty));
|
|
||||||
ctx.emit(Inst::xmm_rm_r(
|
|
||||||
SseOpcode::Paddusb,
|
|
||||||
RegMem::from(zero_mask),
|
|
||||||
swizzle_mask_tmp,
|
|
||||||
));
|
|
||||||
|
|
||||||
// Shuffle `dst` using the fixed-up `swizzle_mask`.
|
|
||||||
ctx.emit(Inst::xmm_rm_r(
|
|
||||||
SseOpcode::Pshufb,
|
|
||||||
RegMem::from(swizzle_mask_tmp),
|
|
||||||
dst,
|
|
||||||
));
|
|
||||||
}
|
|
||||||
|
|
||||||
Opcode::Extractlane => {
|
Opcode::Extractlane => {
|
||||||
// The instruction format maps to variables like: %dst = extractlane %src, %lane
|
// The instruction format maps to variables like: %dst = extractlane %src, %lane
|
||||||
let ty = ty.unwrap();
|
let ty = ty.unwrap();
|
||||||
|
|||||||
@@ -34,6 +34,7 @@ use crate::{
|
|||||||
VCodeConstantData,
|
VCodeConstantData,
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
|
use alloc::vec::Vec;
|
||||||
use regalloc2::PReg;
|
use regalloc2::PReg;
|
||||||
use smallvec::SmallVec;
|
use smallvec::SmallVec;
|
||||||
use std::boxed::Box;
|
use std::boxed::Box;
|
||||||
@@ -200,6 +201,15 @@ impl Context for IsleContext<'_, '_, MInst, Flags, IsaFlags, 6> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn avx512vbmi_enabled(&mut self, _: Type) -> Option<()> {
|
||||||
|
if self.isa_flags.use_avx512vbmi_simd() {
|
||||||
|
Some(())
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
fn use_lzcnt(&mut self, _: Type) -> Option<()> {
|
fn use_lzcnt(&mut self, _: Type) -> Option<()> {
|
||||||
if self.isa_flags.use_lzcnt() {
|
if self.isa_flags.use_lzcnt() {
|
||||||
@@ -839,6 +849,73 @@ impl Context for IsleContext<'_, '_, MInst, Flags, IsaFlags, 6> {
|
|||||||
Writable::from_reg(Gpr::new(regs::pinned_reg()).unwrap())
|
Writable::from_reg(Gpr::new(regs::pinned_reg()).unwrap())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn shuffle_0_31_mask(&mut self, mask: &VecMask) -> VCodeConstant {
|
||||||
|
let mask = mask
|
||||||
|
.iter()
|
||||||
|
.map(|&b| if b > 15 { b.wrapping_sub(15) } else { b })
|
||||||
|
.map(|b| if b > 15 { 0b10000000 } else { b })
|
||||||
|
.collect();
|
||||||
|
self.lower_ctx
|
||||||
|
.use_constant(VCodeConstantData::Generated(mask))
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn shuffle_0_15_mask(&mut self, mask: &VecMask) -> VCodeConstant {
|
||||||
|
let mask = mask
|
||||||
|
.iter()
|
||||||
|
.map(|&b| if b > 15 { 0b10000000 } else { b })
|
||||||
|
.collect();
|
||||||
|
self.lower_ctx
|
||||||
|
.use_constant(VCodeConstantData::Generated(mask))
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn shuffle_16_31_mask(&mut self, mask: &VecMask) -> VCodeConstant {
|
||||||
|
let mask = mask
|
||||||
|
.iter()
|
||||||
|
.map(|&b| b.wrapping_sub(16))
|
||||||
|
.map(|b| if b > 15 { 0b10000000 } else { b })
|
||||||
|
.collect();
|
||||||
|
self.lower_ctx
|
||||||
|
.use_constant(VCodeConstantData::Generated(mask))
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn perm_from_mask_with_zeros(
|
||||||
|
&mut self,
|
||||||
|
mask: &VecMask,
|
||||||
|
) -> Option<(VCodeConstant, VCodeConstant)> {
|
||||||
|
if !mask.iter().any(|&b| b > 31) {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
|
||||||
|
let zeros = mask
|
||||||
|
.iter()
|
||||||
|
.map(|&b| if b > 31 { 0x00 } else { 0xff })
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
Some((
|
||||||
|
self.perm_from_mask(mask),
|
||||||
|
self.lower_ctx
|
||||||
|
.use_constant(VCodeConstantData::Generated(zeros)),
|
||||||
|
))
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn perm_from_mask(&mut self, mask: &VecMask) -> VCodeConstant {
|
||||||
|
let mask = mask.iter().cloned().collect();
|
||||||
|
self.lower_ctx
|
||||||
|
.use_constant(VCodeConstantData::Generated(mask))
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn swizzle_zero_mask(&mut self) -> VCodeConstant {
|
||||||
|
static ZERO_MASK_VALUE: [u8; 16] = [0x70; 16];
|
||||||
|
self.lower_ctx
|
||||||
|
.use_constant(VCodeConstantData::WellKnown(&ZERO_MASK_VALUE))
|
||||||
|
}
|
||||||
|
|
||||||
fn emit_div_or_rem(
|
fn emit_div_or_rem(
|
||||||
&mut self,
|
&mut self,
|
||||||
kind: &DivOrRemKind,
|
kind: &DivOrRemKind,
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ use std::cell::Cell;
|
|||||||
use target_lexicon::Triple;
|
use target_lexicon::Triple;
|
||||||
|
|
||||||
pub use super::MachLabel;
|
pub use super::MachLabel;
|
||||||
|
pub use crate::data_value::DataValue;
|
||||||
pub use crate::ir::{
|
pub use crate::ir::{
|
||||||
ArgumentExtension, Constant, DynamicStackSlot, ExternalName, FuncRef, GlobalValue, Immediate,
|
ArgumentExtension, Constant, DynamicStackSlot, ExternalName, FuncRef, GlobalValue, Immediate,
|
||||||
SigRef, StackSlot,
|
SigRef, StackSlot,
|
||||||
@@ -24,6 +25,7 @@ pub type ValueArray2 = [Value; 2];
|
|||||||
pub type ValueArray3 = [Value; 3];
|
pub type ValueArray3 = [Value; 3];
|
||||||
pub type WritableReg = Writable<Reg>;
|
pub type WritableReg = Writable<Reg>;
|
||||||
pub type VecReg = Vec<Reg>;
|
pub type VecReg = Vec<Reg>;
|
||||||
|
pub type VecMask = Vec<u8>;
|
||||||
pub type ValueRegs = crate::machinst::ValueRegs<Reg>;
|
pub type ValueRegs = crate::machinst::ValueRegs<Reg>;
|
||||||
pub type WritableValueRegs = crate::machinst::ValueRegs<WritableReg>;
|
pub type WritableValueRegs = crate::machinst::ValueRegs<WritableReg>;
|
||||||
pub type InstOutput = SmallVec<[ValueRegs; 2]>;
|
pub type InstOutput = SmallVec<[ValueRegs; 2]>;
|
||||||
@@ -683,6 +685,16 @@ macro_rules! isle_prelude_methods {
|
|||||||
Some(u128::from_le_bytes(bytes.try_into().ok()?))
|
Some(u128::from_le_bytes(bytes.try_into().ok()?))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn vec_mask_from_immediate(&mut self, imm: Immediate) -> Option<VecMask> {
|
||||||
|
let data = self.lower_ctx.get_immediate_data(imm);
|
||||||
|
if data.len() == 16 {
|
||||||
|
Some(Vec::from(data.as_slice()))
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
fn u64_from_constant(&mut self, constant: Constant) -> Option<u64> {
|
fn u64_from_constant(&mut self, constant: Constant) -> Option<u64> {
|
||||||
let bytes = self.lower_ctx.get_constant_data(constant).as_slice();
|
let bytes = self.lower_ctx.get_constant_data(constant).as_slice();
|
||||||
|
|||||||
@@ -5,7 +5,6 @@
|
|||||||
// TODO: separate the IR-query core of `Lower` from the lowering logic built on
|
// TODO: separate the IR-query core of `Lower` from the lowering logic built on
|
||||||
// top of it, e.g. the side-effect/coloring analysis and the scan support.
|
// top of it, e.g. the side-effect/coloring analysis and the scan support.
|
||||||
|
|
||||||
use crate::data_value::DataValue;
|
|
||||||
use crate::entity::SecondaryMap;
|
use crate::entity::SecondaryMap;
|
||||||
use crate::fx::{FxHashMap, FxHashSet};
|
use crate::fx::{FxHashMap, FxHashSet};
|
||||||
use crate::inst_predicates::{has_lowering_side_effect, is_constant_64bit};
|
use crate::inst_predicates::{has_lowering_side_effect, is_constant_64bit};
|
||||||
@@ -23,7 +22,6 @@ use crate::machinst::{
|
|||||||
};
|
};
|
||||||
use crate::{trace, CodegenResult};
|
use crate::{trace, CodegenResult};
|
||||||
use alloc::vec::Vec;
|
use alloc::vec::Vec;
|
||||||
use core::convert::TryInto;
|
|
||||||
use regalloc2::VReg;
|
use regalloc2::VReg;
|
||||||
use smallvec::{smallvec, SmallVec};
|
use smallvec::{smallvec, SmallVec};
|
||||||
use std::fmt::Debug;
|
use std::fmt::Debug;
|
||||||
@@ -1381,35 +1379,6 @@ impl<'func, I: VCodeInst> Lower<'func, I> {
|
|||||||
self.vcode.constants().insert(constant)
|
self.vcode.constants().insert(constant)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Retrieve the value immediate from an instruction. This will perform necessary lookups on the
|
|
||||||
/// `DataFlowGraph` to retrieve even large immediates.
|
|
||||||
pub fn get_immediate(&self, ir_inst: Inst) -> Option<DataValue> {
|
|
||||||
let inst_data = self.data(ir_inst);
|
|
||||||
match inst_data {
|
|
||||||
InstructionData::Shuffle { imm, .. } => {
|
|
||||||
let mask = self.f.dfg.immediates.get(imm.clone()).unwrap().as_slice();
|
|
||||||
let value = match mask.len() {
|
|
||||||
16 => DataValue::V128(mask.try_into().expect("a 16-byte vector mask")),
|
|
||||||
8 => DataValue::V64(mask.try_into().expect("an 8-byte vector mask")),
|
|
||||||
length => panic!("unexpected Shuffle mask length {}", length),
|
|
||||||
};
|
|
||||||
Some(value)
|
|
||||||
}
|
|
||||||
InstructionData::UnaryConst {
|
|
||||||
constant_handle, ..
|
|
||||||
} => {
|
|
||||||
let buffer = self.f.dfg.constants.get(constant_handle.clone()).as_slice();
|
|
||||||
let value = match buffer.len() {
|
|
||||||
16 => DataValue::V128(buffer.try_into().expect("a 16-byte data buffer")),
|
|
||||||
8 => DataValue::V64(buffer.try_into().expect("an 8-byte data buffer")),
|
|
||||||
length => panic!("unexpected UnaryConst buffer length {}", length),
|
|
||||||
};
|
|
||||||
Some(value)
|
|
||||||
}
|
|
||||||
_ => inst_data.imm_value(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Cause the value in `reg` to be in a virtual reg, by copying it into a new virtual reg
|
/// Cause the value in `reg` to be in a virtual reg, by copying it into a new virtual reg
|
||||||
/// if `reg` is a real reg. `ty` describes the type of the value in `reg`.
|
/// if `reg` is a real reg. `ty` describes the type of the value in `reg`.
|
||||||
pub fn ensure_in_vreg(&mut self, reg: Reg, ty: Type) -> Reg {
|
pub fn ensure_in_vreg(&mut self, reg: Reg, ty: Type) -> Reg {
|
||||||
|
|||||||
@@ -35,6 +35,9 @@
|
|||||||
;; ISLE representation of `&[Value]`.
|
;; ISLE representation of `&[Value]`.
|
||||||
(type ValueSlice (primitive ValueSlice))
|
(type ValueSlice (primitive ValueSlice))
|
||||||
|
|
||||||
|
;; ISLE representation of `Vec<u8>`
|
||||||
|
(type VecMask extern (enum))
|
||||||
|
|
||||||
(type ValueList (primitive ValueList))
|
(type ValueList (primitive ValueList))
|
||||||
(type ValueRegs (primitive ValueRegs))
|
(type ValueRegs (primitive ValueRegs))
|
||||||
(type WritableValueRegs (primitive WritableValueRegs))
|
(type WritableValueRegs (primitive WritableValueRegs))
|
||||||
@@ -798,6 +801,11 @@
|
|||||||
(decl u128_from_immediate (u128) Immediate)
|
(decl u128_from_immediate (u128) Immediate)
|
||||||
(extern extractor u128_from_immediate u128_from_immediate)
|
(extern extractor u128_from_immediate u128_from_immediate)
|
||||||
|
|
||||||
|
;; Accessor for `Immediate` as a vector of u8 values.
|
||||||
|
|
||||||
|
(decl vec_mask_from_immediate (VecMask) Immediate)
|
||||||
|
(extern extractor vec_mask_from_immediate vec_mask_from_immediate)
|
||||||
|
|
||||||
;; Accessor for `Constant` as u128.
|
;; Accessor for `Constant` as u128.
|
||||||
|
|
||||||
(decl u128_from_constant (u128) Constant)
|
(decl u128_from_constant (u128) Constant)
|
||||||
|
|||||||
58
cranelift/filetests/filetests/isa/x64/shuffle-avx512.clif
Normal file
58
cranelift/filetests/filetests/isa/x64/shuffle-avx512.clif
Normal file
@@ -0,0 +1,58 @@
|
|||||||
|
test compile precise-output
|
||||||
|
set enable_simd
|
||||||
|
target x86_64 has_avx512vl has_avx512vbmi
|
||||||
|
|
||||||
|
function %shuffle_in_bounds(i8x16, i8x16) -> i8x16 {
|
||||||
|
block0(v0: i8x16, v1: i8x16):
|
||||||
|
;; pick the second lane of v1, the rest use the first lane of v0
|
||||||
|
v2 = shuffle v0, v1, 0x11000000000000000000000000000000
|
||||||
|
return v2
|
||||||
|
}
|
||||||
|
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block0:
|
||||||
|
; movdqa %xmm0, %xmm9
|
||||||
|
; load_const VCodeConstant(0), %xmm0
|
||||||
|
; vpermi2b %xmm1, %xmm0, %xmm9
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; ret
|
||||||
|
|
||||||
|
function %shuffle_out_of_bounds(i8x16, i8x16) -> i8x16 {
|
||||||
|
block0(v0: i8x16, v1: i8x16):
|
||||||
|
;; pick zero for the first lane, the rest use first lane of v0
|
||||||
|
;; This should introduce two constants, one for the permutation and one to
|
||||||
|
;; mask the non-zero values for lanes 1-15
|
||||||
|
v2 = shuffle v0, v1, 0x80000000000000000000000000000000
|
||||||
|
return v2
|
||||||
|
}
|
||||||
|
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block0:
|
||||||
|
; movdqa %xmm0, %xmm12
|
||||||
|
; load_const VCodeConstant(1), %xmm0
|
||||||
|
; load_const VCodeConstant(0), %xmm7
|
||||||
|
; vpermi2b %xmm1, %xmm7, %xmm12
|
||||||
|
; andps %xmm0, %xmm7, %xmm0
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; ret
|
||||||
|
|
||||||
|
function %f3(i8x16, i8x16) -> i8x16 {
|
||||||
|
block0(v0: i8x16, v1: i8x16):
|
||||||
|
v2 = shuffle v0, v1, [3 0 31 26 4 6 12 11 23 13 24 4 2 15 17 5]
|
||||||
|
return v2
|
||||||
|
}
|
||||||
|
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block0:
|
||||||
|
; movdqa %xmm0, %xmm9
|
||||||
|
; load_const VCodeConstant(0), %xmm0
|
||||||
|
; vpermi2b %xmm1, %xmm0, %xmm9
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; ret
|
||||||
|
|
||||||
@@ -15,13 +15,13 @@ block0:
|
|||||||
; pushq %rbp
|
; pushq %rbp
|
||||||
; movq %rsp, %rbp
|
; movq %rsp, %rbp
|
||||||
; block0:
|
; block0:
|
||||||
; load_const VCodeConstant(3), %xmm6
|
; load_const VCodeConstant(3), %xmm0
|
||||||
; load_const VCodeConstant(2), %xmm0
|
; load_const VCodeConstant(2), %xmm5
|
||||||
; load_const VCodeConstant(0), %xmm7
|
; load_const VCodeConstant(0), %xmm3
|
||||||
; pshufb %xmm6, %xmm7, %xmm6
|
; pshufb %xmm0, %xmm3, %xmm0
|
||||||
; load_const VCodeConstant(1), %xmm10
|
; load_const VCodeConstant(1), %xmm7
|
||||||
; pshufb %xmm0, %xmm10, %xmm0
|
; pshufb %xmm5, %xmm7, %xmm5
|
||||||
; orps %xmm0, %xmm6, %xmm0
|
; por %xmm0, %xmm5, %xmm0
|
||||||
; movq %rbp, %rsp
|
; movq %rbp, %rsp
|
||||||
; popq %rbp
|
; popq %rbp
|
||||||
; ret
|
; ret
|
||||||
@@ -37,8 +37,8 @@ block0:
|
|||||||
; movq %rsp, %rbp
|
; movq %rsp, %rbp
|
||||||
; block0:
|
; block0:
|
||||||
; load_const VCodeConstant(1), %xmm0
|
; load_const VCodeConstant(1), %xmm0
|
||||||
; load_const VCodeConstant(0), %xmm4
|
; load_const VCodeConstant(0), %xmm2
|
||||||
; pshufb %xmm0, %xmm4, %xmm0
|
; pshufb %xmm0, %xmm2, %xmm0
|
||||||
; movq %rbp, %rsp
|
; movq %rbp, %rsp
|
||||||
; popq %rbp
|
; popq %rbp
|
||||||
; ret
|
; ret
|
||||||
@@ -55,10 +55,10 @@ block0:
|
|||||||
; movq %rsp, %rbp
|
; movq %rsp, %rbp
|
||||||
; block0:
|
; block0:
|
||||||
; load_const VCodeConstant(1), %xmm0
|
; load_const VCodeConstant(1), %xmm0
|
||||||
; load_const VCodeConstant(1), %xmm5
|
; load_const VCodeConstant(1), %xmm3
|
||||||
; load_const VCodeConstant(0), %xmm6
|
; load_const VCodeConstant(0), %xmm4
|
||||||
; paddusb %xmm5, %xmm6, %xmm5
|
; paddusb %xmm3, %xmm4, %xmm3
|
||||||
; pshufb %xmm0, %xmm5, %xmm0
|
; pshufb %xmm0, %xmm3, %xmm0
|
||||||
; movq %rbp, %rsp
|
; movq %rbp, %rsp
|
||||||
; popq %rbp
|
; popq %rbp
|
||||||
; ret
|
; ret
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ target aarch64
|
|||||||
target s390x
|
target s390x
|
||||||
set enable_simd
|
set enable_simd
|
||||||
target x86_64 has_sse3 has_ssse3 has_sse41
|
target x86_64 has_sse3 has_ssse3 has_sse41
|
||||||
|
target x86_64 has_sse3 has_ssse3 has_sse41 has_avx512vl has_avx512vbmi
|
||||||
|
|
||||||
function %shuffle_i8x16(i8x16, i8x16) -> i8x16 {
|
function %shuffle_i8x16(i8x16, i8x16) -> i8x16 {
|
||||||
block0(v0: i8x16, v1: i8x16):
|
block0(v0: i8x16, v1: i8x16):
|
||||||
@@ -11,3 +12,10 @@ block0(v0: i8x16, v1: i8x16):
|
|||||||
return v2
|
return v2
|
||||||
}
|
}
|
||||||
; run: %shuffle_i8x16([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]) == [4 1 32 27 5 7 13 12 24 14 25 5 3 16 18 6]
|
; run: %shuffle_i8x16([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]) == [4 1 32 27 5 7 13 12 24 14 25 5 3 16 18 6]
|
||||||
|
|
||||||
|
function %shuffle_zeros(i8x16, i8x16) -> i8x16 {
|
||||||
|
block0(v0: i8x16, v1: i8x16):
|
||||||
|
v2 = shuffle v0, v1, [3 0 32 255 4 6 12 11 23 13 24 4 2 97 17 5]
|
||||||
|
return v2
|
||||||
|
}
|
||||||
|
; run: %shuffle_zeros([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]) == [4 1 0 0 5 7 13 12 24 14 25 5 3 0 18 6]
|
||||||
|
|||||||
Reference in New Issue
Block a user