Add x86 SIMD ishl
Only the shifts that have a corresponding SSE2 instruction (i.e. 16-, 32-, and 64-bit lane widths) are implemented here.
This commit is contained in:
@@ -521,6 +521,7 @@ pub(crate) fn define<'defs>(
|
|||||||
let x86_pinsr = x86.by_name("x86_pinsr");
|
let x86_pinsr = x86.by_name("x86_pinsr");
|
||||||
let x86_pshufd = x86.by_name("x86_pshufd");
|
let x86_pshufd = x86.by_name("x86_pshufd");
|
||||||
let x86_pshufb = x86.by_name("x86_pshufb");
|
let x86_pshufb = x86.by_name("x86_pshufb");
|
||||||
|
let x86_psll = x86.by_name("x86_psll");
|
||||||
let x86_push = x86.by_name("x86_push");
|
let x86_push = x86.by_name("x86_push");
|
||||||
let x86_sdivmodx = x86.by_name("x86_sdivmodx");
|
let x86_sdivmodx = x86.by_name("x86_sdivmodx");
|
||||||
let x86_smulx = x86.by_name("x86_smulx");
|
let x86_smulx = x86.by_name("x86_smulx");
|
||||||
@@ -1988,6 +1989,26 @@ pub(crate) fn define<'defs>(
|
|||||||
e.enc_32_64(bxor, rec_fa.opcodes(&PXOR));
|
e.enc_32_64(bxor, rec_fa.opcodes(&PXOR));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// SIMD bitcast from I32/I64 to the low bits of a vector (e.g. I64x2); this register movement
|
||||||
|
// allows SIMD shifts to be legalized more easily. TODO ideally this would be typed as an
|
||||||
|
// I128x1 but restrictions on the type builder prevent this; the general idea here is that
|
||||||
|
// the upper bits are all zeroed and do not form parts of any separate lane. See
|
||||||
|
// https://github.com/CraneStation/cranelift/issues/1146.
|
||||||
|
e.enc_both(
|
||||||
|
bitcast.bind(vector(I64, sse_vector_size)).bind(I32),
|
||||||
|
rec_frurm.opcodes(&MOVD_LOAD_XMM),
|
||||||
|
);
|
||||||
|
e.enc64(
|
||||||
|
bitcast.bind(vector(I64, sse_vector_size)).bind(I64),
|
||||||
|
rec_frurm.opcodes(&MOVD_LOAD_XMM).rex().w(),
|
||||||
|
);
|
||||||
|
|
||||||
|
// SIMD shift left
|
||||||
|
for (ty, opcodes) in &[(I16, &PSLLW), (I32, &PSLLD), (I64, &PSLLQ)] {
|
||||||
|
let x86_psll = x86_psll.bind(vector(*ty, sse_vector_size));
|
||||||
|
e.enc_32_64(x86_psll, rec_fa.opcodes(*opcodes));
|
||||||
|
}
|
||||||
|
|
||||||
// SIMD icmp using PCMPEQ*
|
// SIMD icmp using PCMPEQ*
|
||||||
for ty in ValueType::all_lane_types().filter(|t| t.is_int() && allowed_simd_type(t)) {
|
for ty in ValueType::all_lane_types().filter(|t| t.is_int() && allowed_simd_type(t)) {
|
||||||
let (opcodes, isa_predicate): (&[_], _) = match ty.lane_bits() {
|
let (opcodes, isa_predicate): (&[_], _) = match ty.lane_bits() {
|
||||||
|
|||||||
@@ -387,5 +387,40 @@ pub(crate) fn define(
|
|||||||
.operands_out(vec![a]),
|
.operands_out(vec![a]),
|
||||||
);
|
);
|
||||||
|
|
||||||
|
let IxN = &TypeVar::new(
|
||||||
|
"IxN",
|
||||||
|
"A SIMD vector type containing integers",
|
||||||
|
TypeSetBuilder::new()
|
||||||
|
.ints(Interval::All)
|
||||||
|
.simd_lanes(Interval::All)
|
||||||
|
.includes_scalars(false)
|
||||||
|
.build(),
|
||||||
|
);
|
||||||
|
let I64x2 = &TypeVar::new(
|
||||||
|
"I64x2",
|
||||||
|
"A SIMD vector type containing one large integer (the upper lane is concatenated with \
|
||||||
|
the lower lane to form the integer)",
|
||||||
|
TypeSetBuilder::new()
|
||||||
|
.ints(64..64)
|
||||||
|
.simd_lanes(2..2)
|
||||||
|
.includes_scalars(false)
|
||||||
|
.build(),
|
||||||
|
);
|
||||||
|
let x = &operand_doc("x", IxN, "Vector value to shift");
|
||||||
|
let y = &operand_doc("y", I64x2, "Number of bits to shift");
|
||||||
|
let a = &operand("a", IxN);
|
||||||
|
ig.push(
|
||||||
|
Inst::new(
|
||||||
|
"x86_psll",
|
||||||
|
r#"
|
||||||
|
Shift Packed Data Left Logical -- This implements the behavior of the shared instruction
|
||||||
|
``ishl`` but alters the shift operand to live in an XMM register as expected by the PSLL*
|
||||||
|
family of instructions.
|
||||||
|
"#,
|
||||||
|
)
|
||||||
|
.operands_in(vec![x, y])
|
||||||
|
.operands_out(vec![a]),
|
||||||
|
);
|
||||||
|
|
||||||
ig.build()
|
ig.build()
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -3,7 +3,7 @@ use crate::cdsl::instructions::{vector, Bindable, InstructionGroup};
|
|||||||
use crate::cdsl::types::{LaneType, ValueType};
|
use crate::cdsl::types::{LaneType, ValueType};
|
||||||
use crate::cdsl::xform::TransformGroupBuilder;
|
use crate::cdsl::xform::TransformGroupBuilder;
|
||||||
use crate::shared::types::Float::F64;
|
use crate::shared::types::Float::F64;
|
||||||
use crate::shared::types::Int::{I32, I64};
|
use crate::shared::types::Int::{I16, I32, I64};
|
||||||
use crate::shared::Definitions as SharedDefinitions;
|
use crate::shared::Definitions as SharedDefinitions;
|
||||||
|
|
||||||
pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &InstructionGroup) {
|
pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &InstructionGroup) {
|
||||||
@@ -20,6 +20,7 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct
|
|||||||
// List of instructions.
|
// List of instructions.
|
||||||
let insts = &shared.instructions;
|
let insts = &shared.instructions;
|
||||||
let band = insts.by_name("band");
|
let band = insts.by_name("band");
|
||||||
|
let bitcast = insts.by_name("bitcast");
|
||||||
let bor = insts.by_name("bor");
|
let bor = insts.by_name("bor");
|
||||||
let bnot = insts.by_name("bnot");
|
let bnot = insts.by_name("bnot");
|
||||||
let bxor = insts.by_name("bxor");
|
let bxor = insts.by_name("bxor");
|
||||||
@@ -40,6 +41,7 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct
|
|||||||
let imul = insts.by_name("imul");
|
let imul = insts.by_name("imul");
|
||||||
let ineg = insts.by_name("ineg");
|
let ineg = insts.by_name("ineg");
|
||||||
let insertlane = insts.by_name("insertlane");
|
let insertlane = insts.by_name("insertlane");
|
||||||
|
let ishl = insts.by_name("ishl");
|
||||||
let isub = insts.by_name("isub");
|
let isub = insts.by_name("isub");
|
||||||
let popcnt = insts.by_name("popcnt");
|
let popcnt = insts.by_name("popcnt");
|
||||||
let raw_bitcast = insts.by_name("raw_bitcast");
|
let raw_bitcast = insts.by_name("raw_bitcast");
|
||||||
@@ -60,6 +62,7 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct
|
|||||||
let x86_bsr = x86_instructions.by_name("x86_bsr");
|
let x86_bsr = x86_instructions.by_name("x86_bsr");
|
||||||
let x86_pshufb = x86_instructions.by_name("x86_pshufb");
|
let x86_pshufb = x86_instructions.by_name("x86_pshufb");
|
||||||
let x86_pshufd = x86_instructions.by_name("x86_pshufd");
|
let x86_pshufd = x86_instructions.by_name("x86_pshufd");
|
||||||
|
let x86_psll = x86_instructions.by_name("x86_psll");
|
||||||
let x86_umulx = x86_instructions.by_name("x86_umulx");
|
let x86_umulx = x86_instructions.by_name("x86_umulx");
|
||||||
let x86_smulx = x86_instructions.by_name("x86_smulx");
|
let x86_smulx = x86_instructions.by_name("x86_smulx");
|
||||||
|
|
||||||
@@ -394,6 +397,16 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// SIMD shift left
|
||||||
|
for ty in &[I16, I32, I64] {
|
||||||
|
let ishl = ishl.bind(vector(*ty, sse_vector_size));
|
||||||
|
let bitcast = bitcast.bind(vector(I64, sse_vector_size));
|
||||||
|
narrow.legalize(
|
||||||
|
def!(a = ishl(x, y)),
|
||||||
|
vec![def!(b = bitcast(y)), def!(a = x86_psll(x, b))],
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
narrow.custom_legalize(shuffle, "convert_shuffle");
|
narrow.custom_legalize(shuffle, "convert_shuffle");
|
||||||
narrow.custom_legalize(extractlane, "convert_extractlane");
|
narrow.custom_legalize(extractlane, "convert_extractlane");
|
||||||
narrow.custom_legalize(insertlane, "convert_insertlane");
|
narrow.custom_legalize(insertlane, "convert_insertlane");
|
||||||
|
|||||||
@@ -320,6 +320,15 @@ pub static PSHUFB: [u8; 4] = [0x66, 0x0f, 0x38, 0x00];
|
|||||||
/// store the result in xmm1 (SSE2).
|
/// store the result in xmm1 (SSE2).
|
||||||
pub static PSHUFD: [u8; 3] = [0x66, 0x0f, 0x70];
|
pub static PSHUFD: [u8; 3] = [0x66, 0x0f, 0x70];
|
||||||
|
|
||||||
|
/// Shift words in xmm1 left by xmm2/m128 while shifting in 0s (SSE2).
|
||||||
|
pub static PSLLW: [u8; 3] = [0x66, 0x0f, 0xf1];
|
||||||
|
|
||||||
|
/// Shift doublewords in xmm1 left by xmm2/m128 while shifting in 0s (SSE2).
|
||||||
|
pub static PSLLD: [u8; 3] = [0x66, 0x0f, 0xf2];
|
||||||
|
|
||||||
|
/// Shift quadwords in xmm1 left by xmm2/m128 while shifting in 0s (SSE2).
|
||||||
|
pub static PSLLQ: [u8; 3] = [0x66, 0x0f, 0xf3];
|
||||||
|
|
||||||
/// Subtract packed byte integers in xmm2/m128 from packed byte integers in xmm1 (SSE2).
|
/// Subtract packed byte integers in xmm2/m128 from packed byte integers in xmm1 (SSE2).
|
||||||
pub static PSUBB: [u8; 3] = [0x66, 0x0f, 0xf8];
|
pub static PSUBB: [u8; 3] = [0x66, 0x0f, 0xf8];
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,21 @@
|
|||||||
|
test binemit
|
||||||
|
set enable_simd
|
||||||
|
target x86_64 skylake
|
||||||
|
|
||||||
|
function %ishl_i16x8(i16x8, i64x2) -> i16x8 {
|
||||||
|
ebb0(v0: i16x8 [%xmm2], v1: i64x2 [%xmm1]):
|
||||||
|
[-, %xmm2] v2 = x86_psll v0, v1 ; bin: 66 0f f1 d1
|
||||||
|
return v2
|
||||||
|
}
|
||||||
|
|
||||||
|
function %ishl_i32x4(i32x4, i64x2) -> i32x4 {
|
||||||
|
ebb0(v0: i32x4 [%xmm4], v1: i64x2 [%xmm0]):
|
||||||
|
[-, %xmm4] v2 = x86_psll v0, v1 ; bin: 66 0f f2 e0
|
||||||
|
return v2
|
||||||
|
}
|
||||||
|
|
||||||
|
function %ishl_i64x2(i64x2, i64x2) -> i64x2 {
|
||||||
|
ebb0(v0: i64x2 [%xmm6], v1: i64x2 [%xmm3]):
|
||||||
|
[-, %xmm6] v2 = x86_psll v0, v1 ; bin: 66 0f f3 f3
|
||||||
|
return v2
|
||||||
|
}
|
||||||
@@ -0,0 +1,13 @@
|
|||||||
|
test legalizer
|
||||||
|
set enable_simd
|
||||||
|
target x86_64 skylake
|
||||||
|
|
||||||
|
function %ishl_i32x4() -> i32x4 {
|
||||||
|
ebb0:
|
||||||
|
v0 = iconst.i32 1
|
||||||
|
v1 = vconst.i32x4 [1 2 4 8]
|
||||||
|
v2 = ishl v1, v0
|
||||||
|
; check: v3 = bitcast.i64x2 v0
|
||||||
|
; nextln: v2 = x86_psll v1, v3
|
||||||
|
return v2
|
||||||
|
}
|
||||||
39
cranelift/filetests/filetests/isa/x86/simd-bitwise-run.clif
Normal file
39
cranelift/filetests/filetests/isa/x86/simd-bitwise-run.clif
Normal file
@@ -0,0 +1,39 @@
|
|||||||
|
test run
|
||||||
|
set enable_simd
|
||||||
|
target x86_64 skylake
|
||||||
|
|
||||||
|
; TODO: once available, replace all lane extraction with `icmp + all_ones`
|
||||||
|
|
||||||
|
function %ishl_i32x4() -> b1 {
|
||||||
|
ebb0:
|
||||||
|
v0 = iconst.i32 1
|
||||||
|
v1 = vconst.i32x4 [1 2 4 8]
|
||||||
|
v2 = ishl v1, v0
|
||||||
|
|
||||||
|
v3 = extractlane v2, 0
|
||||||
|
v4 = icmp_imm eq v3, 2
|
||||||
|
|
||||||
|
v5 = extractlane v2, 3
|
||||||
|
v6 = icmp_imm eq v5, 16
|
||||||
|
|
||||||
|
v7 = band v4, v6
|
||||||
|
return v7
|
||||||
|
}
|
||||||
|
; run
|
||||||
|
|
||||||
|
function %ishl_too_large_i16x8() -> b1 {
|
||||||
|
ebb0:
|
||||||
|
v0 = iconst.i32 17 ; note that this will shift off the end of each lane
|
||||||
|
v1 = vconst.i16x8 [1 2 4 8 16 32 64 128]
|
||||||
|
v2 = ishl v1, v0
|
||||||
|
|
||||||
|
v3 = extractlane v2, 0
|
||||||
|
v4 = icmp_imm eq v3, 0
|
||||||
|
|
||||||
|
v5 = extractlane v2, 3
|
||||||
|
v6 = icmp_imm eq v5, 0
|
||||||
|
|
||||||
|
v7 = band v4, v6
|
||||||
|
return v7
|
||||||
|
}
|
||||||
|
; run
|
||||||
Reference in New Issue
Block a user