diff --git a/cranelift/codegen/src/isa/x64/inst.isle b/cranelift/codegen/src/isa/x64/inst.isle index 621d99c551..fae522c746 100644 --- a/cranelift/codegen/src/isa/x64/inst.isle +++ b/cranelift/codegen/src/isa/x64/inst.isle @@ -1400,6 +1400,9 @@ (decl avx512bitalg_enabled () Type) (extern extractor avx512bitalg_enabled avx512bitalg_enabled) +(decl avx512vbmi_enabled () Type) +(extern extractor avx512vbmi_enabled avx512vbmi_enabled) + (decl use_lzcnt () Type) (extern extractor use_lzcnt use_lzcnt) @@ -2740,6 +2743,19 @@ src1 src2)) +;; Helper for creating `vpermi2b` instructions. +;; +;; Requires AVX-512 vl and vbmi extensions. +(decl x64_vpermi2b (Xmm Xmm Xmm) Xmm) +(rule (x64_vpermi2b src1 src2 src3) + (let ((dst WritableXmm (temp_writable_xmm)) + (_ Unit (emit (gen_move $I8X16 dst src3))) + (_ Unit (emit (MInst.XmmRmREvex (Avx512Opcode.Vpermi2b) + src1 + src2 + dst)))) + dst)) + ;; Helper for creating `MInst.MulHi` instructions. ;; ;; Returns the (lo, hi) register halves of the multiplication. @@ -3634,6 +3650,47 @@ (let ((dst WritableGpr (pinned_writable_gpr))) (SideEffectNoResult.Inst (gen_move $I64 dst val)))) +;;;; Shuffle ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Produce a mask suitable for use with `pshufb` for permuting the argument to +;; shuffle, when the arguments are the same (i.e. `shuffle a a mask`). This will +;; map all indices in the range 0..31 to the range 0..15. +(decl shuffle_0_31_mask (VecMask) VCodeConstant) +(extern constructor shuffle_0_31_mask shuffle_0_31_mask) + +;; Produce a mask suitable for use with `pshufb` for permuting the lhs of a +;; `shuffle` operation (lanes 0-15). +(decl shuffle_0_15_mask (VecMask) VCodeConstant) +(extern constructor shuffle_0_15_mask shuffle_0_15_mask) + +;; Produce a mask suitable for use with `pshufb` for permuting the rhs of a +;; `shuffle` operation (lanes 16-31). 
+(decl shuffle_16_31_mask (VecMask) VCodeConstant) +(extern constructor shuffle_16_31_mask shuffle_16_31_mask) + +;; Produce a permutation suitable for use with `vpermi2b`, for permuting two +;; I8X16 vectors simultaneously. +;; +;; NOTE: `vpermi2b` will mask the indices in each lane to 5 bits when indexing +;; into vectors, so this constructor makes no effort to handle indices that are +;; larger than 31. If you are lowering a clif opcode like `shuffle` that has +;; special behavior for out of bounds indices (emitting a `0` in the resulting +;; vector in the case of `shuffle`) you'll need to handle that behavior +;; separately. +(decl perm_from_mask (VecMask) VCodeConstant) +(extern constructor perm_from_mask perm_from_mask) + +;; If the mask that would be given to `shuffle` contains any out-of-bounds +;; indices, return a mask that will zero those. +(decl perm_from_mask_with_zeros (VCodeConstant VCodeConstant) VecMask) +(extern extractor perm_from_mask_with_zeros perm_from_mask_with_zeros) + +;;;; Swizzle ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Create a mask for zeroing out-of-bounds lanes of the swizzle mask. 
+(decl swizzle_zero_mask () VCodeConstant) +(extern constructor swizzle_zero_mask swizzle_zero_mask) + ;;;; Automatic conversions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (convert Gpr InstOutput output_gpr) diff --git a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs index ee01f41a13..f23547e8c5 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs @@ -66,6 +66,18 @@ impl Inst { dst_hi: WritableGpr::from_reg(Gpr::new(regs::rdx()).unwrap()), } } + + fn xmm_rm_r_evex(op: Avx512Opcode, src1: RegMem, src2: Reg, dst: Writable) -> Self { + src1.assert_regclass_is(RegClass::Float); + debug_assert!(src2.class() == RegClass::Float); + debug_assert!(dst.to_reg().class() == RegClass::Float); + Inst::XmmRmREvex { + op, + src1: XmmMem::new(src1).unwrap(), + src2: Xmm::new(src2).unwrap(), + dst: WritableXmm::from_writable_reg(dst).unwrap(), + } + } } #[test] diff --git a/cranelift/codegen/src/isa/x64/inst/mod.rs b/cranelift/codegen/src/isa/x64/inst/mod.rs index 83cca60a32..36a1ea9e72 100644 --- a/cranelift/codegen/src/isa/x64/inst/mod.rs +++ b/cranelift/codegen/src/isa/x64/inst/mod.rs @@ -316,23 +316,6 @@ impl Inst { } } - pub(crate) fn xmm_rm_r_evex( - op: Avx512Opcode, - src1: RegMem, - src2: Reg, - dst: Writable, - ) -> Self { - src1.assert_regclass_is(RegClass::Float); - debug_assert!(src2.class() == RegClass::Float); - debug_assert!(dst.to_reg().class() == RegClass::Float); - Inst::XmmRmREvex { - op, - src1: XmmMem::new(src1).unwrap(), - src2: Xmm::new(src2).unwrap(), - dst: WritableXmm::from_writable_reg(dst).unwrap(), - } - } - pub(crate) fn xmm_uninit_value(dst: Writable) -> Self { debug_assert!(dst.to_reg().class() == RegClass::Float); Inst::XmmUninitializedValue { diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle index f0646390e7..bbb93eb031 100644 --- a/cranelift/codegen/src/isa/x64/lower.isle 
+++ b/cranelift/codegen/src/isa/x64/lower.isle @@ -3500,3 +3500,50 @@ ;; register allocator a definition for the output virtual register. (rule (lower (raw_bitcast val)) (put_in_regs val)) + +;; Rules for `shuffle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; If `lhs` and `rhs` are the same we can use a single PSHUFB to shuffle the XMM +;; register. We statically build `constructed_mask` to zero out any unknown lane +;; indices (may not be completely necessary: verification could fail incorrect +;; mask values) and fix the indexes to all point to the `dst` vector. +(rule (lower (shuffle a a (vec_mask_from_immediate mask))) + (x64_pshufb a (x64_xmm_load_const $I8X16 (shuffle_0_31_mask mask)))) + +;; For the case where the shuffle mask contains out-of-bounds values (values +;; greater than 31) we must mask off those resulting values in the result of +;; `vpermi2b`. +(rule (lower (has_type (and (avx512vl_enabled) (avx512vbmi_enabled)) + (shuffle a b (vec_mask_from_immediate + (perm_from_mask_with_zeros mask zeros))))) + (x64_andps + (x64_xmm_load_const $I8X16 zeros) + (x64_vpermi2b b a (x64_xmm_load_const $I8X16 mask)))) + +;; However, if the shuffle mask contains no out-of-bounds values, we can use +;; `vpermi2b` without any masking. +(rule (lower (has_type (and (avx512vl_enabled) (avx512vbmi_enabled)) + (shuffle a b (vec_mask_from_immediate mask)))) + (x64_vpermi2b b a (x64_xmm_load_const $I8X16 (perm_from_mask mask)))) + +;; If `lhs` and `rhs` are different, we must shuffle each separately and then OR +;; them together. This is necessary due to PSHUFB semantics. As in the case +;; above, we build the `constructed_mask` for each case statically. 
+(rule (lower (shuffle a b (vec_mask_from_immediate mask))) + (x64_por + (x64_pshufb a (x64_xmm_load_const $I8X16 (shuffle_0_15_mask mask))) + (x64_pshufb b (x64_xmm_load_const $I8X16 (shuffle_16_31_mask mask))))) + +;; Rules for `swizzle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; SIMD swizzle; the following inefficient implementation is due to the Wasm +;; SIMD spec requiring mask indexes greater than 15 to have the same semantics +;; as a 0 index. For the spec discussion, see +;; https://github.com/WebAssembly/simd/issues/93. The CLIF semantics match the +;; Wasm SIMD semantics for this instruction. The instruction format maps to +;; variables like: %dst = swizzle %src, %mask +(rule (lower (swizzle src mask)) + (let ((mask Xmm (x64_paddusb + mask + (x64_xmm_load_const $I8X16 (swizzle_zero_mask))))) + (x64_pshufb src mask))) diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs index 248a1858e8..5a335da67a 100644 --- a/cranelift/codegen/src/isa/x64/lower.rs +++ b/cranelift/codegen/src/isa/x64/lower.rs @@ -3,7 +3,6 @@ // ISLE integration glue. 
pub(super) mod isle; -use crate::data_value::DataValue; use crate::ir::{types, ExternalName, Inst as IRInst, InstructionData, LibCall, Opcode, Type}; use crate::isa::x64::abi::*; use crate::isa::x64::inst::args::*; @@ -585,139 +584,14 @@ fn lower_insn_to_regs( | Opcode::SetPinnedReg | Opcode::Vconst | Opcode::RawBitcast - | Opcode::Insertlane => { + | Opcode::Insertlane + | Opcode::Shuffle + | Opcode::Swizzle => { implemented_in_isle(ctx); } Opcode::DynamicStackAddr => unimplemented!("DynamicStackAddr"), - Opcode::Shuffle => { - let ty = ty.unwrap(); - let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); - let lhs_ty = ctx.input_ty(insn, 0); - let lhs = put_input_in_reg(ctx, inputs[0]); - let rhs = put_input_in_reg(ctx, inputs[1]); - let mask = match ctx.get_immediate(insn) { - Some(DataValue::V128(bytes)) => bytes.to_vec(), - _ => unreachable!("shuffle should always have a 16-byte immediate"), - }; - - // A mask-building helper: in 128-bit SIMD, 0-15 indicate which lane to read from and a - // 1 in the most significant position zeroes the lane. - let zero_unknown_lane_index = |b: u8| if b > 15 { 0b10000000 } else { b }; - - ctx.emit(Inst::gen_move(dst, rhs, ty)); - if rhs == lhs { - // If `lhs` and `rhs` are the same we can use a single PSHUFB to shuffle the XMM - // register. We statically build `constructed_mask` to zero out any unknown lane - // indices (may not be completely necessary: verification could fail incorrect mask - // values) and fix the indexes to all point to the `dst` vector. - let constructed_mask = mask - .iter() - // If the mask is greater than 15 it still may be referring to a lane in b. 
- .map(|&b| if b > 15 { b.wrapping_sub(16) } else { b }) - .map(zero_unknown_lane_index) - .collect(); - let constant = ctx.use_constant(VCodeConstantData::Generated(constructed_mask)); - let tmp = ctx.alloc_tmp(types::I8X16).only_reg().unwrap(); - ctx.emit(Inst::xmm_load_const(constant, tmp, ty)); - // After loading the constructed mask in a temporary register, we use this to - // shuffle the `dst` register (remember that, in this case, it is the same as - // `src` so we disregard this register). - ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp), dst)); - } else { - if isa_flags.use_avx512vl_simd() && isa_flags.use_avx512vbmi_simd() { - assert!( - mask.iter().all(|b| *b < 32), - "shuffle mask values must be between 0 and 31" - ); - - // Load the mask into the destination register. - let constant = ctx.use_constant(VCodeConstantData::Generated(mask.into())); - ctx.emit(Inst::xmm_load_const(constant, dst, ty)); - - // VPERMI2B has the exact semantics of Wasm's shuffle: - // permute the bytes in `src1` and `src2` using byte indexes - // in `dst` and store the byte results in `dst`. - ctx.emit(Inst::xmm_rm_r_evex( - Avx512Opcode::Vpermi2b, - RegMem::reg(rhs), - lhs, - dst, - )); - } else { - // If `lhs` and `rhs` are different, we must shuffle each separately and then OR - // them together. This is necessary due to PSHUFB semantics. As in the case above, - // we build the `constructed_mask` for each case statically. - - // PSHUFB the `lhs` argument into `tmp0`, placing zeroes for unused lanes. 
- let tmp0 = ctx.alloc_tmp(lhs_ty).only_reg().unwrap(); - ctx.emit(Inst::gen_move(tmp0, lhs, lhs_ty)); - let constructed_mask = - mask.iter().cloned().map(zero_unknown_lane_index).collect(); - let constant = ctx.use_constant(VCodeConstantData::Generated(constructed_mask)); - let tmp1 = ctx.alloc_tmp(types::I8X16).only_reg().unwrap(); - ctx.emit(Inst::xmm_load_const(constant, tmp1, ty)); - ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp1), tmp0)); - - // PSHUFB the second argument, placing zeroes for unused lanes. - let constructed_mask = mask - .iter() - .map(|b| b.wrapping_sub(16)) - .map(zero_unknown_lane_index) - .collect(); - let constant = ctx.use_constant(VCodeConstantData::Generated(constructed_mask)); - let tmp2 = ctx.alloc_tmp(types::I8X16).only_reg().unwrap(); - ctx.emit(Inst::xmm_load_const(constant, tmp2, ty)); - ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp2), dst)); - - // OR the shuffled registers (the mechanism and lane-size for OR-ing the registers - // is not important). - ctx.emit(Inst::xmm_rm_r(SseOpcode::Orps, RegMem::from(tmp0), dst)); - } - } - } - - Opcode::Swizzle => { - // SIMD swizzle; the following inefficient implementation is due to the Wasm SIMD spec - // requiring mask indexes greater than 15 to have the same semantics as a 0 index. For - // the spec discussion, see https://github.com/WebAssembly/simd/issues/93. The CLIF - // semantics match the Wasm SIMD semantics for this instruction. - // The instruction format maps to variables like: %dst = swizzle %src, %mask - let ty = ty.unwrap(); - let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); - let src = put_input_in_reg(ctx, inputs[0]); - let swizzle_mask = put_input_in_reg(ctx, inputs[1]); - - // Inform the register allocator that `src` and `dst` should be in the same register. - ctx.emit(Inst::gen_move(dst, src, ty)); - - // Create a mask for zeroing out-of-bounds lanes of the swizzle mask. 
-            let zero_mask = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
-            static ZERO_MASK_VALUE: [u8; 16] = [
-                0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
-                0x70, 0x70,
-            ];
-            let constant = ctx.use_constant(VCodeConstantData::WellKnown(&ZERO_MASK_VALUE));
-            ctx.emit(Inst::xmm_load_const(constant, zero_mask, ty));
-
-            // Use the `zero_mask` on a writable `swizzle_mask`.
-            let swizzle_mask_tmp = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
-            ctx.emit(Inst::gen_move(swizzle_mask_tmp, swizzle_mask, ty));
-            ctx.emit(Inst::xmm_rm_r(
-                SseOpcode::Paddusb,
-                RegMem::from(zero_mask),
-                swizzle_mask_tmp,
-            ));
-
-            // Shuffle `dst` using the fixed-up `swizzle_mask`.
-            ctx.emit(Inst::xmm_rm_r(
-                SseOpcode::Pshufb,
-                RegMem::from(swizzle_mask_tmp),
-                dst,
-            ));
-        }
-
         Opcode::Extractlane => {
             // The instruction format maps to variables like: %dst = extractlane %src, %lane
             let ty = ty.unwrap();
diff --git a/cranelift/codegen/src/isa/x64/lower/isle.rs b/cranelift/codegen/src/isa/x64/lower/isle.rs
index cf86de35ac..36f9c1dda2 100644
--- a/cranelift/codegen/src/isa/x64/lower/isle.rs
+++ b/cranelift/codegen/src/isa/x64/lower/isle.rs
@@ -34,6 +34,7 @@ use crate::{
         VCodeConstantData,
     },
 };
+use alloc::vec::Vec;
 use regalloc2::PReg;
 use smallvec::SmallVec;
 use std::boxed::Box;
@@ -200,6 +201,15 @@ impl Context for IsleContext<'_, '_, MInst, Flags, IsaFlags, 6> {
         }
     }
 
+    #[inline]
+    fn avx512vbmi_enabled(&mut self, _: Type) -> Option<()> {
+        if self.isa_flags.use_avx512vbmi_simd() {
+            Some(())
+        } else {
+            None
+        }
+    }
+
     #[inline]
     fn use_lzcnt(&mut self, _: Type) -> Option<()> {
         if self.isa_flags.use_lzcnt() {
@@ -839,6 +849,77 @@ impl Context for IsleContext<'_, '_, MInst, Flags, IsaFlags, 6> {
         Writable::from_reg(Gpr::new(regs::pinned_reg()).unwrap())
     }
 
+    /// Builds the `pshufb` mask for `shuffle a a mask`. Both operands are the same
+    /// register, so an index in 16..=31 (a lane of the second operand) is rebased by
+    /// 16 onto the equivalent lane in 0..=15. Anything still above 15 is out of bounds
+    /// and is replaced by 0x80, whose set high bit makes `pshufb` zero that lane.
+    #[inline]
+    fn shuffle_0_31_mask(&mut self, mask: &VecMask) -> VCodeConstant {
+        let mask = mask
+            .iter()
+            .map(|&b| if b > 15 { b.wrapping_sub(16) } else { b })
+            .map(|b| if b > 15 { 0b10000000 } else { b })
+            .collect();
+        self.lower_ctx
+            .use_constant(VCodeConstantData::Generated(mask))
+    }
+
+    #[inline]
+    fn shuffle_0_15_mask(&mut self, mask: &VecMask) -> VCodeConstant {
+        let mask = mask
+            .iter()
+            .map(|&b| if b > 15 { 0b10000000 } else { b })
+            .collect();
+        self.lower_ctx
+            .use_constant(VCodeConstantData::Generated(mask))
+    }
+
+    #[inline]
+    fn shuffle_16_31_mask(&mut self, mask: &VecMask) -> VCodeConstant {
+        let mask = mask
+            .iter()
+            .map(|&b| b.wrapping_sub(16))
+            .map(|b| if b > 15 { 0b10000000 } else { b })
+            .collect();
+        self.lower_ctx
+            .use_constant(VCodeConstantData::Generated(mask))
+    }
+
+    #[inline]
+    fn perm_from_mask_with_zeros(
+        &mut self,
+        mask: &VecMask,
+    ) -> Option<(VCodeConstant, VCodeConstant)> {
+        if !mask.iter().any(|&b| b > 31) {
+            return None;
+        }
+
+        let zeros = mask
+            .iter()
+            .map(|&b| if b > 31 { 0x00 } else { 0xff })
+            .collect();
+
+        Some((
+            self.perm_from_mask(mask),
+            self.lower_ctx
+                .use_constant(VCodeConstantData::Generated(zeros)),
+        ))
+    }
+
+    #[inline]
+    fn perm_from_mask(&mut self, mask: &VecMask) -> VCodeConstant {
+        let mask = mask.iter().cloned().collect();
+        self.lower_ctx
+            .use_constant(VCodeConstantData::Generated(mask))
+    }
+
+    #[inline]
+    fn swizzle_zero_mask(&mut self) -> VCodeConstant {
+        static ZERO_MASK_VALUE: [u8; 16] = [0x70; 16];
+        self.lower_ctx
+            .use_constant(VCodeConstantData::WellKnown(&ZERO_MASK_VALUE))
+    }
+
     fn emit_div_or_rem(
         &mut self,
         kind: &DivOrRemKind,
diff --git a/cranelift/codegen/src/machinst/isle.rs b/cranelift/codegen/src/machinst/isle.rs
index efa8ae608d..4d9fe9fe20 100644
--- a/cranelift/codegen/src/machinst/isle.rs
+++ b/cranelift/codegen/src/machinst/isle.rs
@@ -7,6 +7,7 @@ use std::cell::Cell;
 use target_lexicon::Triple;
 
 pub use super::MachLabel;
+pub use crate::data_value::DataValue;
 pub use crate::ir::{
     ArgumentExtension, Constant, DynamicStackSlot, ExternalName, FuncRef, GlobalValue, Immediate,
     SigRef, StackSlot,
@@ -24,6 +25,7 @@ pub type
ValueArray2 = [Value; 2]; pub type ValueArray3 = [Value; 3]; pub type WritableReg = Writable; pub type VecReg = Vec; +pub type VecMask = Vec; pub type ValueRegs = crate::machinst::ValueRegs; pub type WritableValueRegs = crate::machinst::ValueRegs; pub type InstOutput = SmallVec<[ValueRegs; 2]>; @@ -683,6 +685,16 @@ macro_rules! isle_prelude_methods { Some(u128::from_le_bytes(bytes.try_into().ok()?)) } + #[inline] + fn vec_mask_from_immediate(&mut self, imm: Immediate) -> Option { + let data = self.lower_ctx.get_immediate_data(imm); + if data.len() == 16 { + Some(Vec::from(data.as_slice())) + } else { + None + } + } + #[inline] fn u64_from_constant(&mut self, constant: Constant) -> Option { let bytes = self.lower_ctx.get_constant_data(constant).as_slice(); diff --git a/cranelift/codegen/src/machinst/lower.rs b/cranelift/codegen/src/machinst/lower.rs index 96e5a3fe40..d03faae472 100644 --- a/cranelift/codegen/src/machinst/lower.rs +++ b/cranelift/codegen/src/machinst/lower.rs @@ -5,7 +5,6 @@ // TODO: separate the IR-query core of `Lower` from the lowering logic built on // top of it, e.g. the side-effect/coloring analysis and the scan support. -use crate::data_value::DataValue; use crate::entity::SecondaryMap; use crate::fx::{FxHashMap, FxHashSet}; use crate::inst_predicates::{has_lowering_side_effect, is_constant_64bit}; @@ -23,7 +22,6 @@ use crate::machinst::{ }; use crate::{trace, CodegenResult}; use alloc::vec::Vec; -use core::convert::TryInto; use regalloc2::VReg; use smallvec::{smallvec, SmallVec}; use std::fmt::Debug; @@ -1381,35 +1379,6 @@ impl<'func, I: VCodeInst> Lower<'func, I> { self.vcode.constants().insert(constant) } - /// Retrieve the value immediate from an instruction. This will perform necessary lookups on the - /// `DataFlowGraph` to retrieve even large immediates. - pub fn get_immediate(&self, ir_inst: Inst) -> Option { - let inst_data = self.data(ir_inst); - match inst_data { - InstructionData::Shuffle { imm, .. 
} => { - let mask = self.f.dfg.immediates.get(imm.clone()).unwrap().as_slice(); - let value = match mask.len() { - 16 => DataValue::V128(mask.try_into().expect("a 16-byte vector mask")), - 8 => DataValue::V64(mask.try_into().expect("an 8-byte vector mask")), - length => panic!("unexpected Shuffle mask length {}", length), - }; - Some(value) - } - InstructionData::UnaryConst { - constant_handle, .. - } => { - let buffer = self.f.dfg.constants.get(constant_handle.clone()).as_slice(); - let value = match buffer.len() { - 16 => DataValue::V128(buffer.try_into().expect("a 16-byte data buffer")), - 8 => DataValue::V64(buffer.try_into().expect("an 8-byte data buffer")), - length => panic!("unexpected UnaryConst buffer length {}", length), - }; - Some(value) - } - _ => inst_data.imm_value(), - } - } - /// Cause the value in `reg` to be in a virtual reg, by copying it into a new virtual reg /// if `reg` is a real reg. `ty` describes the type of the value in `reg`. pub fn ensure_in_vreg(&mut self, reg: Reg, ty: Type) -> Reg { diff --git a/cranelift/codegen/src/prelude.isle b/cranelift/codegen/src/prelude.isle index 890c597ef1..f5caeb94b7 100644 --- a/cranelift/codegen/src/prelude.isle +++ b/cranelift/codegen/src/prelude.isle @@ -35,6 +35,9 @@ ;; ISLE representation of `&[Value]`. (type ValueSlice (primitive ValueSlice)) +;; ISLE representation of `Vec` +(type VecMask extern (enum)) + (type ValueList (primitive ValueList)) (type ValueRegs (primitive ValueRegs)) (type WritableValueRegs (primitive WritableValueRegs)) @@ -798,6 +801,11 @@ (decl u128_from_immediate (u128) Immediate) (extern extractor u128_from_immediate u128_from_immediate) +;; Accessor for `Immediate` as a vector of u8 values. + +(decl vec_mask_from_immediate (VecMask) Immediate) +(extern extractor vec_mask_from_immediate vec_mask_from_immediate) + ;; Accessor for `Constant` as u128. 
(decl u128_from_constant (u128) Constant) diff --git a/cranelift/filetests/filetests/isa/x64/shuffle-avx512.clif b/cranelift/filetests/filetests/isa/x64/shuffle-avx512.clif new file mode 100644 index 0000000000..29221415ca --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/shuffle-avx512.clif @@ -0,0 +1,58 @@ +test compile precise-output +set enable_simd +target x86_64 has_avx512vl has_avx512vbmi + +function %shuffle_in_bounds(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + ;; pick the second lane of v1, the rest use the first lane of v0 + v2 = shuffle v0, v1, 0x11000000000000000000000000000000 + return v2 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movdqa %xmm0, %xmm9 +; load_const VCodeConstant(0), %xmm0 +; vpermi2b %xmm1, %xmm0, %xmm9 +; movq %rbp, %rsp +; popq %rbp +; ret + +function %shuffle_out_of_bounds(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + ;; pick zero for the first lane, the rest use first lane of v0 + ;; This should introduce two constants, one for the permutation and one to + ;; mask the non-zero values for lanes 1-15 + v2 = shuffle v0, v1, 0x80000000000000000000000000000000 + return v2 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movdqa %xmm0, %xmm12 +; load_const VCodeConstant(1), %xmm0 +; load_const VCodeConstant(0), %xmm7 +; vpermi2b %xmm1, %xmm7, %xmm12 +; andps %xmm0, %xmm7, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret + +function %f3(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle v0, v1, [3 0 31 26 4 6 12 11 23 13 24 4 2 15 17 5] + return v2 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movdqa %xmm0, %xmm9 +; load_const VCodeConstant(0), %xmm0 +; vpermi2b %xmm1, %xmm0, %xmm9 +; movq %rbp, %rsp +; popq %rbp +; ret + diff --git a/cranelift/filetests/filetests/isa/x64/simd-lane-access-compile.clif b/cranelift/filetests/filetests/isa/x64/simd-lane-access-compile.clif index 98a04dac05..1dd0dbc29a 100644 --- 
a/cranelift/filetests/filetests/isa/x64/simd-lane-access-compile.clif +++ b/cranelift/filetests/filetests/isa/x64/simd-lane-access-compile.clif @@ -15,13 +15,13 @@ block0: ; pushq %rbp ; movq %rsp, %rbp ; block0: -; load_const VCodeConstant(3), %xmm6 -; load_const VCodeConstant(2), %xmm0 -; load_const VCodeConstant(0), %xmm7 -; pshufb %xmm6, %xmm7, %xmm6 -; load_const VCodeConstant(1), %xmm10 -; pshufb %xmm0, %xmm10, %xmm0 -; orps %xmm0, %xmm6, %xmm0 +; load_const VCodeConstant(3), %xmm0 +; load_const VCodeConstant(2), %xmm5 +; load_const VCodeConstant(0), %xmm3 +; pshufb %xmm0, %xmm3, %xmm0 +; load_const VCodeConstant(1), %xmm7 +; pshufb %xmm5, %xmm7, %xmm5 +; por %xmm0, %xmm5, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -37,8 +37,8 @@ block0: ; movq %rsp, %rbp ; block0: ; load_const VCodeConstant(1), %xmm0 -; load_const VCodeConstant(0), %xmm4 -; pshufb %xmm0, %xmm4, %xmm0 +; load_const VCodeConstant(0), %xmm2 +; pshufb %xmm0, %xmm2, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -55,10 +55,10 @@ block0: ; movq %rsp, %rbp ; block0: ; load_const VCodeConstant(1), %xmm0 -; load_const VCodeConstant(1), %xmm5 -; load_const VCodeConstant(0), %xmm6 -; paddusb %xmm5, %xmm6, %xmm5 -; pshufb %xmm0, %xmm5, %xmm0 +; load_const VCodeConstant(1), %xmm3 +; load_const VCodeConstant(0), %xmm4 +; paddusb %xmm3, %xmm4, %xmm3 +; pshufb %xmm0, %xmm3, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret diff --git a/cranelift/filetests/filetests/runtests/simd-shuffle.clif b/cranelift/filetests/filetests/runtests/simd-shuffle.clif index eaabb23768..cbb8bef5ae 100644 --- a/cranelift/filetests/filetests/runtests/simd-shuffle.clif +++ b/cranelift/filetests/filetests/runtests/simd-shuffle.clif @@ -4,6 +4,7 @@ target aarch64 target s390x set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41 +target x86_64 has_sse3 has_ssse3 has_sse41 has_avx512vl has_avx512vbmi function %shuffle_i8x16(i8x16, i8x16) -> i8x16 { block0(v0: i8x16, v1: i8x16): @@ -11,3 +12,10 @@ block0(v0: i8x16, v1: i8x16): return v2 
} ; run: %shuffle_i8x16([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]) == [4 1 32 27 5 7 13 12 24 14 25 5 3 16 18 6] + +function %shuffle_zeros(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle v0, v1, [3 0 32 255 4 6 12 11 23 13 24 4 2 97 17 5] + return v2 +} +; run: %shuffle_zeros([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]) == [4 1 0 0 5 7 13 12 24 14 25 5 3 0 18 6]