Add x86 implementation of splat instruction

@@ -1,7 +1,8 @@
use crate::cdsl::ast::{var, ExprBuilder, Literal};
use crate::cdsl::instructions::InstructionGroup;
use crate::cdsl::types::ValueType;
use crate::cdsl::xform::TransformGroupBuilder;

use crate::shared::types::Float::F64;
use crate::shared::types::Int::{I32, I64};
use crate::shared::Definitions as SharedDefinitions;
@@ -19,9 +20,11 @@ pub fn define(shared: &mut SharedDefinitions, x86_instructions: &InstructionGrou
// List of instructions.
let insts = &shared.instructions;
let band = insts.by_name("band");
let bitcast = insts.by_name("bitcast");
let bor = insts.by_name("bor");
let clz = insts.by_name("clz");
let ctz = insts.by_name("ctz");
let f64const = insts.by_name("f64const");
let fcmp = insts.by_name("fcmp");
let fcvt_from_uint = insts.by_name("fcvt_from_uint");
let fcvt_to_sint = insts.by_name("fcvt_to_sint");
@@ -33,11 +36,15 @@ pub fn define(shared: &mut SharedDefinitions, x86_instructions: &InstructionGrou
let iadd = insts.by_name("iadd");
let iconst = insts.by_name("iconst");
let imul = insts.by_name("imul");
let insertlane = insts.by_name("insertlane");
let isub = insts.by_name("isub");
let popcnt = insts.by_name("popcnt");
let raw_bitcast = insts.by_name("raw_bitcast");
let scalar_to_vector = insts.by_name("scalar_to_vector");
let sdiv = insts.by_name("sdiv");
let selectif = insts.by_name("selectif");
let smulhi = insts.by_name("smulhi");
let splat = insts.by_name("splat");
let srem = insts.by_name("srem");
let udiv = insts.by_name("udiv");
let umulhi = insts.by_name("umulhi");
@@ -46,6 +53,8 @@ pub fn define(shared: &mut SharedDefinitions, x86_instructions: &InstructionGrou

let x86_bsf = x86_instructions.by_name("x86_bsf");
let x86_bsr = x86_instructions.by_name("x86_bsr");
let x86_pshufb = x86_instructions.by_name("x86_pshufb");
let x86_pshufd = x86_instructions.by_name("x86_pshufd");
let x86_umulx = x86_instructions.by_name("x86_umulx");
let x86_smulx = x86_instructions.by_name("x86_smulx");

@@ -53,6 +62,8 @@ pub fn define(shared: &mut SharedDefinitions, x86_instructions: &InstructionGrou
let floatcc = shared.operand_kinds.by_name("floatcc");
let imm64 = shared.operand_kinds.by_name("imm64");
let intcc = shared.operand_kinds.by_name("intcc");
let uimm8 = shared.operand_kinds.by_name("uimm8");
let ieee64 = shared.operand_kinds.by_name("ieee64");

// Division and remainder.
//
@@ -290,4 +301,84 @@ pub fn define(shared: &mut SharedDefinitions, x86_instructions: &InstructionGrou
);

group.build_and_add_to(&mut shared.transform_groups);

let mut narrow = TransformGroupBuilder::new(
"x86_narrow",
r#"
Legalize instructions by narrowing.

Use x86-specific instructions if needed."#,
)
.isa("x86")
.chain_with(shared.transform_groups.by_name("narrow").id);

// SIMD
let uimm8_zero = Literal::constant(uimm8, 0x00);
let uimm8_one = Literal::constant(uimm8, 0x01);
let ieee64_zero = Literal::constant(ieee64, 0x00);
let b = var("b");
let c = var("c");
let d = var("d");

// SIMD splat: 8-bits
for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 8) {
let splat_x8x16 = splat.bind_vector(ty, 128 / ty.lane_bits());
let bitcast_f64_to_any8x16 = bitcast.bind_vector(ty, 128 / ty.lane_bits()).bind(F64);
narrow.legalize(
def!(y = splat_x8x16(x)),
vec![
def!(a = scalar_to_vector(x)), // move into the lowest 8 bits of an XMM register
def!(b = f64const(ieee64_zero)), // zero out a different XMM register; the shuffle mask for moving the lowest byte to all other byte lanes is 0x0
def!(c = bitcast_f64_to_any8x16(b)), // no instruction emitted; informs the SSA that the 0 in b can be used as a vector of this type
def!(y = x86_pshufb(a, c)), // PSHUFB takes two XMM operands, one of which is the shuffle mask (i.e. c, the zero from b)
],
);
}
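
// Illustrative sketch (standalone reference model, not generated by this legalization;
// assumes little-endian lane order): PSHUFB uses each byte of its mask operand as an
// index into the source register, so an all-zero mask broadcasts lane 0 into every
// byte lane.
fn _splat_u8x16_model(x: u8) -> [u8; 16] {
    let mut v = [0u8; 16];
    v[0] = x; // scalar_to_vector: x in the lowest byte lane
    let mask = [0u8; 16]; // the f64const 0.0, viewed as an all-zero shuffle mask
    let mut out = [0u8; 16];
    for (i, &m) in mask.iter().enumerate() {
        out[i] = v[(m & 0x0f) as usize]; // x86_pshufb: out[i] = v[mask[i] & 0xf] when the mask's high bit is clear
    }
    out
}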

// SIMD splat: 16-bits
for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 16) {
let splat_x16x8 = splat.bind_vector(ty, 128 / ty.lane_bits());
let raw_bitcast_any16x8_to_i32x4 = raw_bitcast
.bind_vector(I32, 4)
.bind_vector(ty, 128 / ty.lane_bits());
let raw_bitcast_i32x4_to_any16x8 = raw_bitcast
.bind_vector(ty, 128 / ty.lane_bits())
.bind_vector(I32, 4);
narrow.legalize(
def!(y = splat_x16x8(x)),
vec![
def!(a = scalar_to_vector(x)), // move into the lowest 16 bits of an XMM register
def!(b = insertlane(a, uimm8_one, x)), // insert the value again, but in the next-lowest 16 bits
def!(c = raw_bitcast_any16x8_to_i32x4(b)), // no instruction emitted; pretend this is an I32x4 so we can use PSHUFD
def!(d = x86_pshufd(c, uimm8_zero)), // broadcast the lowest 32-bit lane to all lanes with PSHUFD
def!(y = raw_bitcast_i32x4_to_any16x8(d)), // no instruction emitted; pretend this is the 16-lane vector type again
],
);
}
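
// Illustrative sketch (standalone reference model, not the generated code; assumes
// little-endian lane order): the 16-bit strategy places the scalar in the two lowest
// 16-bit lanes so the lowest 32-bit lane holds it twice, then broadcasts that 32-bit
// lane PSHUFD-style.
fn _splat_u16x8_model(x: u16) -> [u16; 8] {
    // scalar_to_vector + insertlane 1: the low 32 bits now contain x twice.
    let low32 = u32::from(x) | (u32::from(x) << 16);
    // raw_bitcast to i32x4 + x86_pshufd with mask 0: broadcast the lowest 32-bit lane.
    let as_i32x4 = [low32; 4];
    // raw_bitcast back to eight 16-bit lanes.
    let mut out = [0u16; 8];
    for (i, &lane) in as_i32x4.iter().enumerate() {
        out[2 * i] = (lane & 0xffff) as u16;
        out[2 * i + 1] = (lane >> 16) as u16;
    }
    out
}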

// SIMD splat: 32-bits
for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 32) {
let splat_any32x4 = splat.bind_vector(ty, 128 / ty.lane_bits());
narrow.legalize(
def!(y = splat_any32x4(x)),
vec![
def!(a = scalar_to_vector(x)), // translate to an x86 MOV to get the value in an XMM register
def!(y = x86_pshufd(a, uimm8_zero)), // broadcast the lowest 32-bit lane to all lanes with PSHUFD
],
);
}
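
// Illustrative sketch (standalone reference model, not the generated code): with
// 32-bit lanes PSHUFD broadcasts directly, since each 2-bit field of its immediate
// selects one of the four source lanes; an immediate of 0 selects lane 0 everywhere.
fn _splat_u32x4_model(x: u32) -> [u32; 4] {
    let mut v = [0u32; 4];
    v[0] = x; // scalar_to_vector: x in the lowest 32-bit lane
    [v[0]; 4] // x86_pshufd with immediate 0: lane 0 copied to every lane
}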

// SIMD splat: 64-bits
for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 64) {
let splat_any64x2 = splat.bind_vector(ty, 128 / ty.lane_bits());
narrow.legalize(
def!(y = splat_any64x2(x)),
vec![
def!(a = scalar_to_vector(x)), // move into the lowest 64 bits of an XMM register
def!(y = insertlane(a, uimm8_one, x)), // move into the highest 64 bits of the same XMM register
],
);
}
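
// Illustrative sketch (standalone reference model, not the generated code): with
// only two 64-bit lanes there is nothing left to shuffle; the scalar is written
// into both halves of the register directly.
fn _splat_u64x2_model(x: u64) -> [u64; 2] {
    let mut v = [0u64; 2];
    v[0] = x; // scalar_to_vector: x in the low 64 bits
    v[1] = x; // insertlane 1: x in the high 64 bits
    v
}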

narrow.build_and_add_to(&mut shared.transform_groups);
}

@@ -30,6 +30,7 @@ pub fn define(shared_defs: &mut SharedDefinitions) -> TargetIsa {
let expand_flags = shared_defs.transform_groups.by_name("expand_flags");
let narrow = shared_defs.transform_groups.by_name("narrow");
let widen = shared_defs.transform_groups.by_name("widen");
let x86_narrow = shared_defs.transform_groups.by_name("x86_narrow");
let x86_expand = shared_defs.transform_groups.by_name("x86_expand");

x86_32.legalize_monomorphic(expand_flags);
@@ -42,7 +43,7 @@ pub fn define(shared_defs: &mut SharedDefinitions) -> TargetIsa {
x86_32.legalize_type(F64, x86_expand);

x86_64.legalize_monomorphic(expand_flags);
x86_64.legalize_default(narrow);
x86_64.legalize_default(x86_narrow);
x86_64.legalize_type(B1, expand_flags);
x86_64.legalize_type(I8, widen);
x86_64.legalize_type(I16, widen);

@@ -738,6 +738,12 @@ impl From<f64> for Ieee64 {
}
}

impl From<u64> for Ieee64 {
fn from(x: u64) -> Self {
Ieee64::with_float(f64::from_bits(x))
}
}
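
// Illustrative sketch (standalone example, not part of this hunk): the new
// conversion treats the u64 as a raw IEEE-754 bit pattern rather than a numeric
// value, which is what the legalizer above relies on when it materializes the
// all-zero f64 constant used as a shuffle mask.
fn _ieee64_bit_pattern_example() {
    let bits: u64 = 0x00;
    let value = f64::from_bits(bits); // the all-zero bit pattern is +0.0
    assert_eq!(value, 0.0);
    assert_eq!(value.to_bits(), bits); // the pattern round-trips exactly
}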

#[cfg(test)]
mod tests {
use super::*;

cranelift/filetests/filetests/isa/x86/legalize-splat.clif (new file, 73 lines)
@@ -0,0 +1,73 @@
test compile
set enable_simd=true
set probestack_enabled=false
target x86_64 haswell

; use baldrdash calling convention here for simplicity (avoids prologue, epilogue)
function %test_splat_i32() -> i32x4 baldrdash {
ebb0:
v0 = iconst.i32 42
v1 = splat.i32x4 v0
return v1
}

; sameln: function %test_splat_i32() -> i32x4 [%xmm0] baldrdash {
; nextln: ss0 = incoming_arg 0, offset 0
; nextln:
; nextln: ebb0:
; nextln: v0 = iconst.i32 42
; nextln: v2 = scalar_to_vector.i32x4 v0
; nextln: v1 = x86_pshufd v2, 0
; nextln: return v1
; nextln: }

function %test_splat_i64() -> i64x2 baldrdash {
ebb0:
v0 = iconst.i64 42
v1 = splat.i64x2 v0
return v1
}

; check: ebb0:
; nextln: v0 = iconst.i64 42
; nextln: v2 = scalar_to_vector.i64x2 v0
; nextln: v1 = insertlane v2, 1, v0
; nextln: return v1

function %test_splat_b16() -> b16x8 baldrdash {
ebb0:
v0 = bconst.b16 true
v1 = splat.b16x8 v0
return v1
}

; check: ebb0:
; nextln: v0 = bconst.b16 true
; nextln: v2 = scalar_to_vector.b16x8 v0
; nextln: v3 = insertlane v2, 1, v0
; nextln: v4 = raw_bitcast.i32x4 v3
; nextln: v5 = x86_pshufd v4, 0
; nextln: v1 = raw_bitcast.b16x8 v5
; nextln: return v1

function %test_splat_i8() -> i8x16 baldrdash {
ebb0:
v0 = iconst.i8 42
v1 = splat.i8x16 v0
return v1
}

; check: ebb0:
; nextln: v2 = iconst.i32 42
; nextln: v0 = ireduce.i8 v2
; nextln: v3 = scalar_to_vector.i8x16 v0
; nextln: v4 = f64const 0.0
; nextln: v5 = bitcast.i8x16 v4
; nextln: v1 = x86_pshufb v3, v5
; nextln: return v1