Add x86 implementation of shuffle

This commit is contained in:
Andrew Brown
2019-08-26 14:50:05 -07:00
parent 9e088e4164
commit af1499ce99
18 changed files with 336 additions and 44 deletions

View File

@@ -1785,7 +1785,7 @@ pub(crate) fn define(
let allowed_simd_type = |t: &LaneType| t.lane_bits() >= 8 && t.lane_bits() < 128;
// PSHUFB, 8-bit shuffle using two XMM registers.
for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 8) {
for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
let instruction = x86_pshufb.bind_vector_from_lane(ty, sse_vector_size);
let template = rec_fa.nonrex().opcodes(vec![0x66, 0x0f, 0x38, 00]);
e.enc32_isap(instruction.clone(), template.clone(), use_ssse3_simd);
@@ -1804,7 +1804,7 @@ pub(crate) fn define(
// SIMD scalar_to_vector; this uses MOV to copy the scalar value to an XMM register; according
// to the Intel manual: "When the destination operand is an XMM register, the source operand is
// written to the low doubleword of the register and the regiser is zero-extended to 128 bits."
// written to the low doubleword of the register and the register is zero-extended to 128 bits."
for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
let instruction = scalar_to_vector.bind_vector_from_lane(ty, sse_vector_size);
if ty.is_float() {
@@ -1929,6 +1929,13 @@ pub(crate) fn define(
e.enc_32_64_maybe_isap(instruction, template, None); // from SSE
}
// SIMD bor using ORPS
for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
let instruction = bor.bind_vector_from_lane(ty, sse_vector_size);
let template = rec_fa.nonrex().opcodes(vec![0x0f, 0x56]);
e.enc_32_64_maybe_isap(instruction, template, None); // from SSE
}
// Reference type instructions
// Null references implemented as iconst 0.

View File

@@ -45,6 +45,7 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct
let selectif = insts.by_name("selectif");
let smulhi = insts.by_name("smulhi");
let splat = insts.by_name("splat");
let shuffle = insts.by_name("shuffle");
let srem = insts.by_name("srem");
let udiv = insts.by_name("udiv");
let umulhi = insts.by_name("umulhi");
@@ -380,6 +381,7 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct
);
}
narrow.custom_legalize(shuffle, "convert_shuffle");
narrow.custom_legalize(extractlane, "convert_extractlane");
narrow.custom_legalize(insertlane, "convert_insertlane");

View File

@@ -396,11 +396,11 @@ pub(crate) fn define<'shared>(
let f_trap = formats.by_name("Trap");
let f_unary = formats.by_name("Unary");
let f_unary_bool = formats.by_name("UnaryBool");
let f_unary_const = formats.by_name("UnaryConst");
let f_unary_global_value = formats.by_name("UnaryGlobalValue");
let f_unary_ieee32 = formats.by_name("UnaryIeee32");
let f_unary_ieee64 = formats.by_name("UnaryIeee64");
let f_unary_imm = formats.by_name("UnaryImm");
let f_unary_imm128 = formats.by_name("UnaryImm128");
// Predicate shorthands.
let use_sse41 = settings.predicate_by_name("use_sse41");
@@ -2437,14 +2437,14 @@ pub(crate) fn define<'shared>(
);
recipes.add_template_recipe(
EncodingRecipeBuilder::new("vconst", f_unary_imm128, 5)
EncodingRecipeBuilder::new("vconst", f_unary_const, 5)
.operands_out(vec![fpr])
.clobbers_flags(false)
.emit(
r#"
{{PUT_OP}}(bits, rex2(0, out_reg0), sink);
modrm_riprel(out_reg0, sink);
const_disp4(imm, func, sink);
const_disp4(constant_handle, func, sink);
"#,
),
);

View File

@@ -6,10 +6,10 @@ pub(crate) fn define(imm: &Immediates, entities: &EntityRefs) -> FormatRegistry
registry.insert(Builder::new("Unary").value());
registry.insert(Builder::new("UnaryImm").imm(&imm.imm64));
registry.insert(Builder::new("UnaryImm128").imm(&imm.uimm128));
registry.insert(Builder::new("UnaryIeee32").imm(&imm.ieee32));
registry.insert(Builder::new("UnaryIeee64").imm(&imm.ieee64));
registry.insert(Builder::new("UnaryBool").imm(&imm.boolean));
registry.insert(Builder::new("UnaryConst").imm(&imm.pool_constant));
registry.insert(Builder::new("UnaryGlobalValue").imm(&entities.global_value));
registry.insert(Builder::new("Binary").value().value());
@@ -43,6 +43,12 @@ pub(crate) fn define(imm: &Immediates, entities: &EntityRefs) -> FormatRegistry
.value()
.imm_with_name("lane", &imm.uimm8),
);
registry.insert(
Builder::new("Shuffle")
.value()
.value()
.imm_with_name("mask", &imm.uimm128),
);
registry.insert(Builder::new("IntCompare").imm(&imm.intcc).value().value());
registry.insert(

View File

@@ -23,6 +23,12 @@ pub(crate) struct Immediates {
/// const.
pub uimm128: OperandKind,
/// A constant stored in the constant pool.
///
/// This operand is used to pass constants to instructions like vconst while storing the
/// actual bytes in the constant pool.
pub pool_constant: OperandKind,
/// A 32-bit immediate signed offset.
///
/// This is used to represent an immediate address offset in load/store instructions.
@@ -84,6 +90,12 @@ impl Immediates {
uimm128: Builder::new_imm("uimm128")
.doc("A 128-bit immediate unsigned integer.")
.rust_type("ir::Immediate")
.build(),
pool_constant: Builder::new_imm("poolConstant")
.doc("A constant stored in the constant pool.")
.default_member("constant_handle")
.rust_type("ir::Constant")
.build(),

View File

@@ -1090,7 +1090,7 @@ pub(crate) fn define(
let N = &operand_doc(
"N",
&imm.uimm128,
&imm.pool_constant,
"The 16 immediate bytes of a 128-bit vector",
);
let a = &operand_doc("a", TxN, "A constant vector value");
@@ -1108,6 +1108,41 @@ pub(crate) fn define(
.operands_out(vec![a]),
);
let mask = &operand_doc(
"mask",
&imm.uimm128,
"The 16 immediate bytes used for selecting the elements to shuffle",
);
let Tx16 = &TypeVar::new(
"Tx16",
"A SIMD vector with exactly 16 lanes of 8-bit values; eventually this may support other \
lane counts and widths",
TypeSetBuilder::new()
.ints(8..8)
.bools(8..8)
.simd_lanes(16..16)
.includes_scalars(false)
.build(),
);
let a = &operand_doc("a", Tx16, "A vector value");
let b = &operand_doc("b", Tx16, "A vector value");
ig.push(
Inst::new(
"shuffle",
r#"
SIMD vector shuffle.
Shuffle two vectors using the given immediate bytes. For each of the 16 bytes of the
immediate, a value i of 0-15 selects the i-th element of the first vector and a value i of
16-31 selects the (i-16)th element of the second vector. Immediate values outside of the
0-31 range place a 0 in the resulting vector lane.
"#,
)
.operands_in(vec![a, b, mask])
.operands_out(vec![a]),
);
let a = &operand_doc("a", Ref, "A constant reference null value");
ig.push(

View File

@@ -5,7 +5,7 @@ use crate::ir;
use crate::ir::builder::ReplaceBuilder;
use crate::ir::extfunc::ExtFuncData;
use crate::ir::instructions::{BranchInfo, CallInfo, InstructionData};
use crate::ir::{types, ConstantPool};
use crate::ir::{types, ConstantPool, Immediate};
use crate::ir::{
Ebb, FuncRef, Inst, SigRef, Signature, Type, Value, ValueLabelAssignments, ValueList,
ValueListPool,
@@ -19,6 +19,7 @@ use core::mem;
use core::ops::{Index, IndexMut};
use core::u16;
use std::collections::HashMap;
use std::vec::Vec;
/// A data flow graph defines all instructions and extended basic blocks in a function as well as
/// the data flow dependencies between them. The DFG also tracks values which can be either
@@ -70,6 +71,9 @@ pub struct DataFlowGraph {
/// Constants used within the function
pub constants: ConstantPool,
/// Stores large immediates that otherwise will not fit on InstructionData
pub immediates: PrimaryMap<Immediate, Vec<u8>>,
}
impl DataFlowGraph {
@@ -85,6 +89,7 @@ impl DataFlowGraph {
ext_funcs: PrimaryMap::new(),
values_labels: None,
constants: ConstantPool::new(),
immediates: PrimaryMap::new(),
}
}
@@ -98,7 +103,8 @@ impl DataFlowGraph {
self.signatures.clear();
self.ext_funcs.clear();
self.values_labels = None;
self.constants.clear()
self.constants.clear();
self.immediates.clear();
}
/// Get the total number of instructions created in this function, whether they are currently

View File

@@ -181,6 +181,29 @@ impl Constant {
}
}
/// A lightweight handle naming an immediate that is stored outside of the instruction itself.
///
/// Large immediates (for example, SIMD shuffle masks) do not fit in the
/// [`InstructionData`](super::instructions::InstructionData) struct, so their bytes are kept in
/// [`DataFlowGraph::immediates`](super::dfg::DataFlowGraph) and instructions carry an
/// `Immediate` handle that refers to the entry there.
#[derive(Copy, Clone, PartialEq, Eq, Hash)]
pub struct Immediate(u32);
entity_impl!(Immediate, "imm");

impl Immediate {
    /// Build an immediate reference from a raw number.
    ///
    /// Returns `None` when `n` is `u32::MAX` and `Some` for every other number. This constructor
    /// is meant to be used by the parser.
    pub fn with_number(n: u32) -> Option<Self> {
        // Every number except the maximum `u32` is accepted.
        if n == u32::MAX {
            return None;
        }
        Some(Immediate(n))
    }
}
/// An opaque reference to a [jump table](https://en.wikipedia.org/wiki/Branch_table).
///
/// `JumpTable`s are used for indirect branching and are specialized for dense,

View File

@@ -31,7 +31,8 @@ pub use crate::ir::builder::{InsertBuilder, InstBuilder, InstBuilderBase, InstIn
pub use crate::ir::constant::{ConstantData, ConstantOffset, ConstantPool};
pub use crate::ir::dfg::{DataFlowGraph, ValueDef};
pub use crate::ir::entities::{
Constant, Ebb, FuncRef, GlobalValue, Heap, Inst, JumpTable, SigRef, StackSlot, Table, Value,
Constant, Ebb, FuncRef, GlobalValue, Heap, Immediate, Inst, JumpTable, SigRef, StackSlot,
Table, Value,
};
pub use crate::ir::extfunc::{
AbiParam, ArgumentExtension, ArgumentPurpose, ExtFuncData, Signature,

View File

@@ -899,6 +899,80 @@ fn expand_fcvt_to_uint_sat(
cfg.recompute_ebb(pos.func, done);
}
/// Legalize a `shuffle` instruction into a sequence built around `x86_pshufb`.
///
/// The mask bytes are looked up in `dfg.immediates`. If both operands resolve to the same value,
/// the instruction is replaced by a single `x86_pshufb` whose mask is materialized with `vconst`.
/// Otherwise each operand is shuffled with its own mask (lanes that select from the other operand
/// are zeroed) and the original instruction is replaced by a `bor` of the two partial results.
/// Instructions that are not in the `Shuffle` format are left untouched.
///
/// Panics if the mask of a `Shuffle` instruction has not been recorded in `dfg.immediates`.
fn convert_shuffle(
    inst: ir::Inst,
    func: &mut ir::Function,
    _cfg: &mut ControlFlowGraph,
    _isa: &dyn TargetIsa,
) {
    let mut pos = FuncCursor::new(func).at_inst(inst);
    pos.use_srcloc(inst);

    // Nothing to do unless this really is a shuffle.
    let (operands, mask_ref) = match pos.func.dfg[inst] {
        ir::InstructionData::Shuffle { args, mask, .. } => (args, mask),
        _ => return,
    };

    // Mask-byte helper: in 128-bit SIMD, 0-15 name the lane to read from, while a 1 in the most
    // significant bit position zeroes the lane.
    let lane_or_zero = |index: u8| if index > 15 { 0b10000000 } else { index };

    // Only aliasing has to be handled here because copies will be introduced later (in
    // regalloc).
    let first = pos.func.dfg.resolve_aliases(operands[0]);
    let second = pos.func.dfg.resolve_aliases(operands[1]);

    // Take an owned copy of the mask bytes so the function can be mutated freely below.
    let selectors = pos
        .func
        .dfg
        .immediates
        .get(mask_ref)
        .expect("The shuffle immediate should have been recorded before this point")
        .clone();

    if first == second {
        // Both operands are the same value, so one PSHUFB suffices. Indices above 15 may still
        // refer to a lane of the second operand; fold them back into the 0-15 range first.
        let folded_mask = selectors
            .iter()
            .map(|&index| {
                let folded = if index > 15 {
                    index.wrapping_sub(16)
                } else {
                    index
                };
                lane_or_zero(folded)
            })
            .collect();
        let handle = pos.func.dfg.constants.insert(folded_mask);

        // Materialize the mask in another XMM register.
        let vector_type = pos.func.dfg.value_type(first);
        let mask_value = pos.ins().vconst(vector_type, handle);

        // Shuffle the single incoming operand.
        pos.func.dfg.replace(inst).x86_pshufb(first, mask_value);
        return;
    }

    // PSHUFB the first operand, zeroing every lane that does not select from it.
    let first_mask = selectors.iter().map(|&index| lane_or_zero(index)).collect();
    let handle = pos.func.dfg.constants.insert(first_mask);
    let first_type = pos.func.dfg.value_type(first);
    let mask_value = pos.ins().vconst(first_type, handle);
    let shuffled_first = pos.ins().x86_pshufb(first, mask_value);

    // PSHUFB the second operand: rebase its indices by 16 and zero every other lane.
    let second_mask = selectors
        .iter()
        .map(|&index| lane_or_zero(index.wrapping_sub(16)))
        .collect();
    let handle = pos.func.dfg.constants.insert(second_mask);
    let second_type = pos.func.dfg.value_type(second);
    let mask_value = pos.ins().vconst(second_type, handle);
    let shuffled_second = pos.ins().x86_pshufb(second, mask_value);

    // OR the two partial results together to form the final shuffled value.
    pos.func
        .dfg
        .replace(inst)
        .bor(shuffled_first, shuffled_second);
    // TODO when AVX512 is enabled we should replace this sequence with a single VPERMB
}
/// Because floats already exist in XMM registers, we can keep them there when executing a CLIF
/// extractlane instruction
fn convert_extractlane(

View File

@@ -706,7 +706,6 @@ impl<'a> Verifier<'a> {
// Exhaustive list so we can't forget to add new formats
Unary { .. }
| UnaryImm { .. }
| UnaryImm128 { .. }
| UnaryIeee32 { .. }
| UnaryIeee64 { .. }
| UnaryBool { .. }
@@ -715,6 +714,8 @@ impl<'a> Verifier<'a> {
| Ternary { .. }
| InsertLane { .. }
| ExtractLane { .. }
| UnaryConst { .. }
| Shuffle { .. }
| IntCompare { .. }
| IntCompareImm { .. }
| IntCond { .. }

View File

@@ -488,11 +488,6 @@ pub fn write_operands(
match dfg[inst] {
Unary { arg, .. } => write!(w, " {}", arg),
UnaryImm { imm, .. } => write!(w, " {}", imm),
UnaryImm128 { imm, .. } => {
let data = dfg.constants.get(imm);
let uimm128 = Uimm128::from(&data[..]);
write!(w, " {}", uimm128)
}
UnaryIeee32 { imm, .. } => write!(w, " {}", imm),
UnaryIeee64 { imm, .. } => write!(w, " {}", imm),
UnaryBool { imm, .. } => write!(w, " {}", imm),
@@ -510,6 +505,20 @@ pub fn write_operands(
NullAry { .. } => write!(w, " "),
InsertLane { lane, args, .. } => write!(w, " {}, {}, {}", args[0], lane, args[1]),
ExtractLane { lane, arg, .. } => write!(w, " {}, {}", arg, lane),
UnaryConst {
constant_handle, ..
} => {
let data = dfg.constants.get(constant_handle);
let uimm128 = Uimm128::from(&data[..]);
write!(w, " {}", uimm128)
}
Shuffle { mask, args, .. } => {
let data = dfg.immediates.get(mask).expect(
"Expected the shuffle mask to already be inserted into the immediates table",
);
let uimm128 = Uimm128::from(&data[..]);
write!(w, " {}, {}, {}", args[0], args[1], uimm128)
}
IntCompare { cond, args, .. } => write!(w, " {} {}, {}", cond, args[0], args[1]),
IntCompareImm { cond, arg, imm, .. } => write!(w, " {} {}, {}", cond, arg, imm),
IntCond { cond, arg, .. } => write!(w, " {} {}", cond, arg),