Add x86 SIMD ishl

Only the shifts with applicable SSE2 instructions (i.e. 16-64 bit width) are implemented here.
This commit is contained in:
Andrew Brown
2019-10-02 13:38:54 -07:00
parent 67733bd2fc
commit 6460fe705f
7 changed files with 152 additions and 1 deletions

View File

@@ -521,6 +521,7 @@ pub(crate) fn define<'defs>(
let x86_pinsr = x86.by_name("x86_pinsr"); let x86_pinsr = x86.by_name("x86_pinsr");
let x86_pshufd = x86.by_name("x86_pshufd"); let x86_pshufd = x86.by_name("x86_pshufd");
let x86_pshufb = x86.by_name("x86_pshufb"); let x86_pshufb = x86.by_name("x86_pshufb");
let x86_psll = x86.by_name("x86_psll");
let x86_push = x86.by_name("x86_push"); let x86_push = x86.by_name("x86_push");
let x86_sdivmodx = x86.by_name("x86_sdivmodx"); let x86_sdivmodx = x86.by_name("x86_sdivmodx");
let x86_smulx = x86.by_name("x86_smulx"); let x86_smulx = x86.by_name("x86_smulx");
@@ -1988,6 +1989,26 @@ pub(crate) fn define<'defs>(
e.enc_32_64(bxor, rec_fa.opcodes(&PXOR)); e.enc_32_64(bxor, rec_fa.opcodes(&PXOR));
} }
// SIMD bitcast from I32/I64 to the low bits of a vector (e.g. I64x2); this register movement
// allows SIMD shifts to be legalized more easily. TODO ideally this would be typed as an
// I128x1 but restrictions on the type builder prevent this; the general idea here is that
// the upper bits are all zeroed and do not form parts of any separate lane. See
// https://github.com/CraneStation/cranelift/issues/1146.
e.enc_both(
bitcast.bind(vector(I64, sse_vector_size)).bind(I32),
rec_frurm.opcodes(&MOVD_LOAD_XMM),
);
e.enc64(
bitcast.bind(vector(I64, sse_vector_size)).bind(I64),
rec_frurm.opcodes(&MOVD_LOAD_XMM).rex().w(),
);
// SIMD shift left
for (ty, opcodes) in &[(I16, &PSLLW), (I32, &PSLLD), (I64, &PSLLQ)] {
let x86_psll = x86_psll.bind(vector(*ty, sse_vector_size));
e.enc_32_64(x86_psll, rec_fa.opcodes(*opcodes));
}
// SIMD icmp using PCMPEQ* // SIMD icmp using PCMPEQ*
for ty in ValueType::all_lane_types().filter(|t| t.is_int() && allowed_simd_type(t)) { for ty in ValueType::all_lane_types().filter(|t| t.is_int() && allowed_simd_type(t)) {
let (opcodes, isa_predicate): (&[_], _) = match ty.lane_bits() { let (opcodes, isa_predicate): (&[_], _) = match ty.lane_bits() {

View File

@@ -387,5 +387,40 @@ pub(crate) fn define(
.operands_out(vec![a]), .operands_out(vec![a]),
); );
let IxN = &TypeVar::new(
"IxN",
"A SIMD vector type containing integers",
TypeSetBuilder::new()
.ints(Interval::All)
.simd_lanes(Interval::All)
.includes_scalars(false)
.build(),
);
let I64x2 = &TypeVar::new(
"I64x2",
"A SIMD vector type containing one large integer (the upper lane is concatenated with \
the lower lane to form the integer)",
TypeSetBuilder::new()
.ints(64..64)
.simd_lanes(2..2)
.includes_scalars(false)
.build(),
);
let x = &operand_doc("x", IxN, "Vector value to shift");
let y = &operand_doc("y", I64x2, "Number of bits to shift");
let a = &operand("a", IxN);
ig.push(
Inst::new(
"x86_psll",
r#"
Shift Packed Data Left Logical -- This implements the behavior of the shared instruction
``ishl`` but alters the shift operand to live in an XMM register as expected by the PSLL*
family of instructions.
"#,
)
.operands_in(vec![x, y])
.operands_out(vec![a]),
);
ig.build() ig.build()
} }

View File

@@ -3,7 +3,7 @@ use crate::cdsl::instructions::{vector, Bindable, InstructionGroup};
use crate::cdsl::types::{LaneType, ValueType}; use crate::cdsl::types::{LaneType, ValueType};
use crate::cdsl::xform::TransformGroupBuilder; use crate::cdsl::xform::TransformGroupBuilder;
use crate::shared::types::Float::F64; use crate::shared::types::Float::F64;
use crate::shared::types::Int::{I32, I64}; use crate::shared::types::Int::{I16, I32, I64};
use crate::shared::Definitions as SharedDefinitions; use crate::shared::Definitions as SharedDefinitions;
pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &InstructionGroup) { pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &InstructionGroup) {
@@ -20,6 +20,7 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct
// List of instructions. // List of instructions.
let insts = &shared.instructions; let insts = &shared.instructions;
let band = insts.by_name("band"); let band = insts.by_name("band");
let bitcast = insts.by_name("bitcast");
let bor = insts.by_name("bor"); let bor = insts.by_name("bor");
let bnot = insts.by_name("bnot"); let bnot = insts.by_name("bnot");
let bxor = insts.by_name("bxor"); let bxor = insts.by_name("bxor");
@@ -40,6 +41,7 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct
let imul = insts.by_name("imul"); let imul = insts.by_name("imul");
let ineg = insts.by_name("ineg"); let ineg = insts.by_name("ineg");
let insertlane = insts.by_name("insertlane"); let insertlane = insts.by_name("insertlane");
let ishl = insts.by_name("ishl");
let isub = insts.by_name("isub"); let isub = insts.by_name("isub");
let popcnt = insts.by_name("popcnt"); let popcnt = insts.by_name("popcnt");
let raw_bitcast = insts.by_name("raw_bitcast"); let raw_bitcast = insts.by_name("raw_bitcast");
@@ -60,6 +62,7 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct
let x86_bsr = x86_instructions.by_name("x86_bsr"); let x86_bsr = x86_instructions.by_name("x86_bsr");
let x86_pshufb = x86_instructions.by_name("x86_pshufb"); let x86_pshufb = x86_instructions.by_name("x86_pshufb");
let x86_pshufd = x86_instructions.by_name("x86_pshufd"); let x86_pshufd = x86_instructions.by_name("x86_pshufd");
let x86_psll = x86_instructions.by_name("x86_psll");
let x86_umulx = x86_instructions.by_name("x86_umulx"); let x86_umulx = x86_instructions.by_name("x86_umulx");
let x86_smulx = x86_instructions.by_name("x86_smulx"); let x86_smulx = x86_instructions.by_name("x86_smulx");
@@ -394,6 +397,16 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct
); );
} }
// SIMD shift left
for ty in &[I16, I32, I64] {
let ishl = ishl.bind(vector(*ty, sse_vector_size));
let bitcast = bitcast.bind(vector(I64, sse_vector_size));
narrow.legalize(
def!(a = ishl(x, y)),
vec![def!(b = bitcast(y)), def!(a = x86_psll(x, b))],
);
}
narrow.custom_legalize(shuffle, "convert_shuffle"); narrow.custom_legalize(shuffle, "convert_shuffle");
narrow.custom_legalize(extractlane, "convert_extractlane"); narrow.custom_legalize(extractlane, "convert_extractlane");
narrow.custom_legalize(insertlane, "convert_insertlane"); narrow.custom_legalize(insertlane, "convert_insertlane");

View File

@@ -320,6 +320,15 @@ pub static PSHUFB: [u8; 4] = [0x66, 0x0f, 0x38, 0x00];
/// store the result in xmm1 (SSE2). /// store the result in xmm1 (SSE2).
pub static PSHUFD: [u8; 3] = [0x66, 0x0f, 0x70]; pub static PSHUFD: [u8; 3] = [0x66, 0x0f, 0x70];
/// Opcode bytes for PSLLW: shift each 16-bit word in xmm1 left by the count held in xmm2/m128
/// while shifting in 0s (SSE2).
pub static PSLLW: [u8; 3] = [0x66, 0x0f, 0xf1];
/// Opcode bytes for PSLLD: shift each 32-bit doubleword in xmm1 left by the count held in
/// xmm2/m128 while shifting in 0s (SSE2).
pub static PSLLD: [u8; 3] = [0x66, 0x0f, 0xf2];
/// Opcode bytes for PSLLQ: shift each 64-bit quadword in xmm1 left by the count held in
/// xmm2/m128 while shifting in 0s (SSE2).
pub static PSLLQ: [u8; 3] = [0x66, 0x0f, 0xf3];
/// Subtract packed byte integers in xmm2/m128 from packed byte integers in xmm1 (SSE2). /// Subtract packed byte integers in xmm2/m128 from packed byte integers in xmm1 (SSE2).
pub static PSUBB: [u8; 3] = [0x66, 0x0f, 0xf8]; pub static PSUBB: [u8; 3] = [0x66, 0x0f, 0xf8];

View File

@@ -0,0 +1,21 @@
test binemit
set enable_simd
target x86_64 skylake
; Pins the encoding of `x86_psll` on i16x8. The bytes 66 0f f1 match the PSLLW opcode and d1 is
; the register-direct ModRM byte with reg = xmm2 (value being shifted) and r/m = xmm1 (shift
; amount).
function %ishl_i16x8(i16x8, i64x2) -> i16x8 {
ebb0(v0: i16x8 [%xmm2], v1: i64x2 [%xmm1]):
[-, %xmm2] v2 = x86_psll v0, v1 ; bin: 66 0f f1 d1
return v2
}
; Pins the encoding of `x86_psll` on i32x4. The bytes 66 0f f2 match the PSLLD opcode and e0 is
; the register-direct ModRM byte with reg = xmm4 (value being shifted) and r/m = xmm0 (shift
; amount).
function %ishl_i32x4(i32x4, i64x2) -> i32x4 {
ebb0(v0: i32x4 [%xmm4], v1: i64x2 [%xmm0]):
[-, %xmm4] v2 = x86_psll v0, v1 ; bin: 66 0f f2 e0
return v2
}
; Pins the encoding of `x86_psll` on i64x2. The bytes 66 0f f3 match the PSLLQ opcode and the
; trailing f3 is the register-direct ModRM byte with reg = xmm6 (value being shifted) and
; r/m = xmm3 (shift amount).
function %ishl_i64x2(i64x2, i64x2) -> i64x2 {
ebb0(v0: i64x2 [%xmm6], v1: i64x2 [%xmm3]):
[-, %xmm6] v2 = x86_psll v0, v1 ; bin: 66 0f f3 f3
return v2
}

View File

@@ -0,0 +1,13 @@
test legalizer
set enable_simd
target x86_64 skylake
; Legalizing `ishl` on an i32x4 with a scalar i32 shift amount is expected to rewrite it into a
; `bitcast.i64x2` of the amount followed by `x86_psll` on the original vector.
function %ishl_i32x4() -> i32x4 {
ebb0:
v0 = iconst.i32 1
v1 = vconst.i32x4 [1 2 4 8]
v2 = ishl v1, v0
; check: v3 = bitcast.i64x2 v0
; nextln: v2 = x86_psll v1, v3
return v2
}

View File

@@ -0,0 +1,39 @@
test run
set enable_simd
target x86_64 skylake
; TODO: once available, replace all lane extraction with `icmp + all_ones`
; Shifts every lane of [1 2 4 8] left by one bit and spot-tests the result. Lane 0 must equal 2
; and lane 3 must equal 16, and the function yields the conjunction of both comparisons.
function %ishl_i32x4() -> b1 {
ebb0:
v0 = iconst.i32 1
v1 = vconst.i32x4 [1 2 4 8]
v2 = ishl v1, v0
v3 = extractlane v2, 0
v4 = icmp_imm eq v3, 2
v5 = extractlane v2, 3
v6 = icmp_imm eq v5, 16
v7 = band v4, v6
return v7
}
; run
; Uses a shift amount of 17, which exceeds the 16-bit lane width, so every bit is expected to be
; shifted out. Lanes 0 and 3 must both equal 0, and the function yields the conjunction of both
; comparisons.
function %ishl_too_large_i16x8() -> b1 {
ebb0:
v0 = iconst.i32 17 ; note that this will shift off the end of each lane
v1 = vconst.i16x8 [1 2 4 8 16 32 64 128]
v2 = ishl v1, v0
v3 = extractlane v2, 0
v4 = icmp_imm eq v3, 0
v5 = extractlane v2, 3
v6 = icmp_imm eq v5, 0
v7 = band v4, v6
return v7
}
; run