Add x86 SIMD sshr and ushr

Only the shifts for which SSE2 has applicable instructions are implemented here: PSRL* (for ushr) only exists for 16-, 32-, and 64-bit lanes, and PSRA* (for sshr) only for 16- and 32-bit lanes.
Andrew Brown
2019-10-07 10:38:35 -07:00
parent 808885ce56
commit f1904bffea
7 changed files with 197 additions and 2 deletions
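
As a point of reference, not part of the commit itself, the per-lane behavior these two instruction families implement can be sketched in scalar Rust; the helper names below are made up for illustration, but the treatment of the shift count follows the SSE2 behavior exercised by the run tests at the end of this diff.

// Illustrative only: one 16-bit lane. PSRL* zero-fills from the left (ushr); PSRA* copies the sign bit (sshr).
fn ushr_lane(lane: u16, count: u32) -> u16 {
    // SSE2 does not mask the count: a count of 16 or more clears the lane.
    if count >= 16 { 0 } else { lane >> count }
}

fn sshr_lane(lane: i16, count: u32) -> i16 {
    // A count of 16 or more leaves only copies of the sign bit.
    lane >> count.min(15)
}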

View File

@@ -522,6 +522,8 @@ pub(crate) fn define<'defs>(
let x86_pshufd = x86.by_name("x86_pshufd");
let x86_pshufb = x86.by_name("x86_pshufb");
let x86_psll = x86.by_name("x86_psll");
let x86_psra = x86.by_name("x86_psra");
let x86_psrl = x86.by_name("x86_psrl");
let x86_push = x86.by_name("x86_push");
let x86_sdivmodx = x86.by_name("x86_sdivmodx");
let x86_smulx = x86.by_name("x86_smulx");
@@ -2009,6 +2011,18 @@ pub(crate) fn define<'defs>(
e.enc_32_64(x86_psll, rec_fa.opcodes(*opcodes));
}
// SIMD shift right (logical)
for (ty, opcodes) in &[(I16, &PSRLW), (I32, &PSRLD), (I64, &PSRLQ)] {
let x86_psrl = x86_psrl.bind(vector(*ty, sse_vector_size));
e.enc_32_64(x86_psrl, rec_fa.opcodes(*opcodes));
}
// SIMD shift right (arithmetic)
for (ty, opcodes) in &[(I16, &PSRAW), (I32, &PSRAD)] {
let x86_psra = x86_psra.bind(vector(*ty, sse_vector_size));
e.enc_32_64(x86_psra, rec_fa.opcodes(*opcodes));
}
// SIMD icmp using PCMPEQ*
for ty in ValueType::all_lane_types().filter(|t| t.is_int() && allowed_simd_type(t)) {
let (opcodes, isa_predicate): (&[_], _) = match ty.lane_bits() {

View File

@@ -414,13 +414,37 @@ pub(crate) fn define(
"x86_psll", "x86_psll",
r#" r#"
Shift Packed Data Left Logical -- This implements the behavior of the shared instruction Shift Packed Data Left Logical -- This implements the behavior of the shared instruction
``ishl`` but alters the shift operand to live in an XMM register as expected by the PSSL* ``ishl`` but alters the shift operand to live in an XMM register as expected by the PSLL*
family of instructions. family of instructions.
"#, "#,
) )
.operands_in(vec![x, y]) .operands_in(vec![x, y])
.operands_out(vec![a]), .operands_out(vec![a]),
); );
ig.push(
Inst::new(
"x86_psrl",
r#"
Shift Packed Data Right Logical -- This implements the behavior of the shared instruction
``ushr`` but alters the shift operand to live in an XMM register as expected by the PSRL*
family of instructions.
"#,
)
.operands_in(vec![x, y])
.operands_out(vec![a]),
);
ig.push(
Inst::new(
"x86_psra",
r#"
Shift Packed Data Right Arithmetic -- This implements the behavior of the shared
instruction ``sshr`` but alters the shift operand to live in an XMM register as expected by
the PSRA* family of instructions.
"#,
)
.operands_in(vec![x, y])
.operands_out(vec![a]),
);
ig.build()
}

View File

@@ -52,10 +52,12 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct
let splat = insts.by_name("splat");
let shuffle = insts.by_name("shuffle");
let srem = insts.by_name("srem");
let sshr = insts.by_name("sshr");
let udiv = insts.by_name("udiv");
let umulhi = insts.by_name("umulhi");
let ushr_imm = insts.by_name("ushr_imm");
let urem = insts.by_name("urem");
let ushr = insts.by_name("ushr");
let vconst = insts.by_name("vconst");
let x86_bsf = x86_instructions.by_name("x86_bsf");
@@ -63,6 +65,8 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct
let x86_pshufb = x86_instructions.by_name("x86_pshufb");
let x86_pshufd = x86_instructions.by_name("x86_pshufd");
let x86_psll = x86_instructions.by_name("x86_psll");
let x86_psra = x86_instructions.by_name("x86_psra");
let x86_psrl = x86_instructions.by_name("x86_psrl");
let x86_umulx = x86_instructions.by_name("x86_umulx");
let x86_smulx = x86_instructions.by_name("x86_smulx");
@@ -397,7 +401,7 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct
);
}
// SIMD shift left (logical)
for ty in &[I16, I32, I64] {
let ishl = ishl.bind(vector(*ty, sse_vector_size));
let bitcast = bitcast.bind(vector(I64, sse_vector_size));
@@ -407,6 +411,26 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct
);
}
// SIMD shift right (logical)
for ty in &[I16, I32, I64] {
let ushr = ushr.bind(vector(*ty, sse_vector_size));
let bitcast = bitcast.bind(vector(I64, sse_vector_size));
narrow.legalize(
def!(a = ushr(x, y)),
vec![def!(b = bitcast(y)), def!(a = x86_psrl(x, b))],
);
}
// SIMD shift right (arithmetic)
for ty in &[I16, I32, I64] {
let sshr = sshr.bind(vector(*ty, sse_vector_size));
let bitcast = bitcast.bind(vector(I64, sse_vector_size));
narrow.legalize(
def!(a = sshr(x, y)),
vec![def!(b = bitcast(y)), def!(a = x86_psra(x, b))],
);
}
narrow.custom_legalize(shuffle, "convert_shuffle");
narrow.custom_legalize(extractlane, "convert_extractlane");
narrow.custom_legalize(insertlane, "convert_insertlane");

View File

@@ -329,6 +329,21 @@ pub static PSLLD: [u8; 3] = [0x66, 0x0f, 0xf2];
/// Shift quadwords in xmm1 left by xmm2/m128 while shifting in 0s (SSE2).
pub static PSLLQ: [u8; 3] = [0x66, 0x0f, 0xf3];
/// Shift words in xmm1 right by xmm2/m128 while shifting in 0s (SSE2).
pub static PSRLW: [u8; 3] = [0x66, 0x0f, 0xd1];
/// Shift doublewords in xmm1 right by xmm2/m128 while shifting in 0s (SSE2).
pub static PSRLD: [u8; 3] = [0x66, 0x0f, 0xd2];
/// Shift quadwords in xmm1 right by xmm2/m128 while shifting in 0s (SSE2).
pub static PSRLQ: [u8; 3] = [0x66, 0x0f, 0xd3];
/// Shift words in xmm1 right by xmm2/m128 while shifting in sign bits (SSE2).
pub static PSRAW: [u8; 3] = [0x66, 0x0f, 0xe1];
/// Shift doublewords in xmm1 right by xmm2/m128 while shifting in sign bits (SSE2).
pub static PSRAD: [u8; 3] = [0x66, 0x0f, 0xe2];
/// Subtract packed byte integers in xmm2/m128 from packed byte integers in xmm1 (SSE2).
pub static PSUBB: [u8; 3] = [0x66, 0x0f, 0xf8];

View File

@@ -19,3 +19,33 @@ ebb0(v0: i64x2 [%xmm6], v1: i64x2 [%xmm3]):
[-, %xmm6] v2 = x86_psll v0, v1 ; bin: 66 0f f3 f3
return v2
}
function %ushr_i16x8(i16x8, i64x2) -> i16x8 {
ebb0(v0: i16x8 [%xmm2], v1: i64x2 [%xmm1]):
[-, %xmm2] v2 = x86_psrl v0, v1 ; bin: 66 0f d1 d1
return v2
}
function %ushr_i32x4(i32x4, i64x2) -> i32x4 {
ebb0(v0: i32x4 [%xmm4], v1: i64x2 [%xmm0]):
[-, %xmm4] v2 = x86_psrl v0, v1 ; bin: 66 0f d2 e0
return v2
}
function %ushr_i64x2(i64x2, i64x2) -> i64x2 {
ebb0(v0: i64x2 [%xmm6], v1: i64x2 [%xmm3]):
[-, %xmm6] v2 = x86_psrl v0, v1 ; bin: 66 0f d3 f3
return v2
}
function %sshr_i16x8(i16x8, i64x2) -> i16x8 {
ebb0(v0: i16x8 [%xmm2], v1: i64x2 [%xmm1]):
[-, %xmm2] v2 = x86_psra v0, v1 ; bin: 66 0f e1 d1
return v2
}
function %sshr_i32x4(i32x4, i64x2) -> i32x4 {
ebb0(v0: i32x4 [%xmm4], v1: i64x2 [%xmm0]):
[-, %xmm4] v2 = x86_psra v0, v1 ; bin: 66 0f e2 e0
return v2
}
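
In the `bin:` comments above, the four bytes are the mandatory 66 prefix, the two-byte opcode from the opcode table, and a ModRM byte that selects the two XMM registers (destination in the reg field, shift count in r/m). A small sketch of that last byte, not taken from the repository:

// Illustrative only: the register-direct ModRM byte seen in the tests above.
fn modrm_rr(reg: u8, rm: u8) -> u8 {
    0b11_000_000 | ((reg & 7) << 3) | (rm & 7)
}

fn main() {
    // %xmm2 shifted by %xmm1: 66 0f d1 (PSRLW) followed by ModRM 0xd1.
    assert_eq!(modrm_rr(2, 1), 0xd1);
    // %xmm4 shifted by %xmm0: 66 0f d2 (PSRLD) followed by ModRM 0xe0.
    assert_eq!(modrm_rr(4, 0), 0xe0);
}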

View File

@@ -11,3 +11,23 @@ ebb0:
; nextln: v2 = x86_psll v1, v3
return v2
}
function %ushr_i64x2() -> i64x2 {
ebb0:
v0 = iconst.i32 1
v1 = vconst.i64x2 [1 2]
v2 = ushr v1, v0
; check: v3 = bitcast.i64x2 v0
; nextln: v2 = x86_psrl v1, v3
return v2
}
function %sshr_i16x8() -> i16x8 {
ebb0:
v0 = iconst.i32 1
v1 = vconst.i16x8 [1 2 4 8 16 32 64 128]
v2 = sshr v1, v0
; check: v3 = bitcast.i64x2 v0
; nextln: v2 = x86_psra v1, v3
return v2
}

View File

@@ -37,3 +37,71 @@ ebb0:
return v7
}
; run
function %ushr_i64x2() -> b1 {
ebb0:
v0 = iconst.i32 1
v1 = vconst.i64x2 [1 2]
v2 = ushr v1, v0
v3 = extractlane v2, 0
v4 = icmp_imm eq v3, 0
v5 = extractlane v2, 1
v6 = icmp_imm eq v5, 1
v7 = band v4, v6
return v7
}
; run
function %ushr_too_large_i32x4() -> b1 {
ebb0:
v0 = iconst.i32 33 ; note that this will shift off the end of each lane
v1 = vconst.i32x4 [1 2 4 8]
v2 = ushr v1, v0
v3 = extractlane v2, 0
v4 = icmp_imm eq v3, 0
v5 = extractlane v2, 3
v6 = icmp_imm eq v5, 0
v7 = band v4, v6
return v7
}
; run
function %sshr_i16x8() -> b1 {
ebb0:
v0 = iconst.i32 1
v1 = vconst.i16x8 [-1 2 4 8 -16 32 64 128]
v2 = sshr v1, v0
v3 = extractlane v2, 0
v4 = icmp_imm eq v3, 0xffff ; because of the shifted-in sign-bit, this remains 0xffff == -1
v5 = extractlane v2, 4
v6 = icmp_imm eq v5, 0xfff8 ; -16 has been shifted to -8 == 0xfff8
v7 = band v4, v6
return v7
}
; run
function %sshr_too_large_i32x4() -> b1 {
ebb0:
v0 = iconst.i32 33 ; note that this will shift off the end of each lane
v1 = vconst.i32x4 [1 2 4 -8]
v2 = sshr v1, v0
v3 = extractlane v2, 0
v4 = icmp_imm eq v3, 0
v5 = extractlane v2, 3
v6 = icmp_imm eq v5, 0xffff_ffff ; shifting in the sign-bit repeatedly fills the result with 1s
v7 = band v4, v6
return v7
}
; run
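
The `too_large` cases rely on the SSE2 count semantics: unlike the scalar shifts, PSRL*/PSRA* do not mask the shift count, so a count of 33 clears a 32-bit lane entirely, or fills it with sign bits for the arithmetic shift. A standalone sketch of the expected lane values, assuming those semantics and using made-up helper names:

// Illustrative only: a single i32 lane of the run tests above.
fn psrld_lane(lane: u32, count: u64) -> u32 {
    // Logical right shift: counts of 32 or more clear the lane.
    if count >= 32 { 0 } else { lane >> count }
}

fn psrad_lane(lane: i32, count: u64) -> i32 {
    // Arithmetic right shift: counts of 32 or more leave only sign bits.
    lane >> count.min(31)
}

fn main() {
    assert_eq!(psrld_lane(1, 33), 0); // %ushr_too_large_i32x4, lane 0
    assert_eq!(psrad_lane(-8, 33), -1); // %sshr_too_large_i32x4, lane 3 == 0xffff_ffff
}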