Legalize i8x16.sshr using pack/unpack instructions

Because an arithmetic right shift must replicate the sign bit into the vacated positions, this legalization cannot easily reuse the masks used for i8x16.ushr or i8x16.ishl.
This commit is contained in:
Andrew Brown
2020-03-25 11:06:54 -07:00
parent fb6e8f784d
commit d24f23285b
3 changed files with 69 additions and 3 deletions

View File

@@ -326,6 +326,7 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro
let fcmp = insts.by_name("fcmp"); let fcmp = insts.by_name("fcmp");
let fabs = insts.by_name("fabs"); let fabs = insts.by_name("fabs");
let fneg = insts.by_name("fneg"); let fneg = insts.by_name("fneg");
let iadd_imm = insts.by_name("iadd_imm");
let icmp = insts.by_name("icmp"); let icmp = insts.by_name("icmp");
let imax = insts.by_name("imax"); let imax = insts.by_name("imax");
let imin = insts.by_name("imin"); let imin = insts.by_name("imin");
@@ -349,6 +350,7 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro
let vall_true = insts.by_name("vall_true"); let vall_true = insts.by_name("vall_true");
let vany_true = insts.by_name("vany_true"); let vany_true = insts.by_name("vany_true");
let x86_packss = x86_instructions.by_name("x86_packss");
let x86_pmaxs = x86_instructions.by_name("x86_pmaxs"); let x86_pmaxs = x86_instructions.by_name("x86_pmaxs");
let x86_pmaxu = x86_instructions.by_name("x86_pmaxu"); let x86_pmaxu = x86_instructions.by_name("x86_pmaxu");
let x86_pmins = x86_instructions.by_name("x86_pmins"); let x86_pmins = x86_instructions.by_name("x86_pmins");
@@ -357,6 +359,8 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro
let x86_pshufd = x86_instructions.by_name("x86_pshufd"); let x86_pshufd = x86_instructions.by_name("x86_pshufd");
let x86_psra = x86_instructions.by_name("x86_psra"); let x86_psra = x86_instructions.by_name("x86_psra");
let x86_ptest = x86_instructions.by_name("x86_ptest"); let x86_ptest = x86_instructions.by_name("x86_ptest");
let x86_punpckh = x86_instructions.by_name("x86_punpckh");
let x86_punpckl = x86_instructions.by_name("x86_punpckl");
let imm = &shared.imm; let imm = &shared.imm;
@@ -373,6 +377,7 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro
// Set up variables and immediates. // Set up variables and immediates.
let uimm8_zero = Literal::constant(&imm.uimm8, 0x00); let uimm8_zero = Literal::constant(&imm.uimm8, 0x00);
let uimm8_one = Literal::constant(&imm.uimm8, 0x01); let uimm8_one = Literal::constant(&imm.uimm8, 0x01);
let uimm8_eight = Literal::constant(&imm.uimm8, 8);
let u128_zeroes = constant(vec![0x00; 16]); let u128_zeroes = constant(vec![0x00; 16]);
let u128_ones = constant(vec![0xff; 16]); let u128_ones = constant(vec![0xff; 16]);
let u128_seventies = constant(vec![0x70; 16]); let u128_seventies = constant(vec![0x70; 16]);
@@ -381,8 +386,12 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro
let c = var("c"); let c = var("c");
let d = var("d"); let d = var("d");
let e = var("e"); let e = var("e");
let f = var("f");
let g = var("g");
let h = var("h");
let x = var("x"); let x = var("x");
let y = var("y"); let y = var("y");
let z = var("z");
// Limit the SIMD vector size: eventually multiple vector sizes may be supported // Limit the SIMD vector size: eventually multiple vector sizes may be supported
// but for now only SSE-sized vectors are available. // but for now only SSE-sized vectors are available.
@@ -484,13 +493,37 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro
); );
} }
// SIMD shift left (arithmetic) // SIMD shift right (arithmetic)
for ty in &[I16, I32, I64] { for ty in &[I16, I32, I64] {
let sshr = sshr.bind(vector(*ty, sse_vector_size)); let sshr = sshr.bind(vector(*ty, sse_vector_size));
let bitcast = bitcast.bind(vector(I64, sse_vector_size)); let bitcast_i64x2 = bitcast.bind(vector(I64, sse_vector_size));
narrow.legalize( narrow.legalize(
def!(a = sshr(x, y)), def!(a = sshr(x, y)),
vec![def!(b = bitcast(y)), def!(a = x86_psra(x, b))], vec![def!(b = bitcast_i64x2(y)), def!(a = x86_psra(x, b))],
);
}
// SIMD shift right (arithmetic) for i8x16: the pattern below rewrites `sshr` into a sequence
// that unpacks each half of `x` with itself, shifts the halves as i16x8 lanes with `x86_psra`,
// and packs the two shifted halves back into a single result with `x86_packss`.
{
let sshr = sshr.bind(vector(I8, sse_vector_size));
let bitcast_i64x2 = bitcast.bind(vector(I64, sse_vector_size));
let raw_bitcast_i16x8 = raw_bitcast.bind(vector(I16, sse_vector_size));
// Same binding as above, created a second time for the second use in the pattern.
// NOTE(review): presumably each use inside `def!` consumes the bound instruction -- confirm.
let raw_bitcast_i16x8_again = raw_bitcast.bind(vector(I16, sse_vector_size));
narrow.legalize(
def!(z = sshr(x, y)),
vec![
// Since we will use the high byte of each 16x8 lane, shift an extra 8 bits.
def!(a = iadd_imm(y, uimm8_eight)),
// The adjusted shift amount is bitcast to i64x2 before being passed to x86_psra,
// mirroring the I16/I32/I64 legalization.
def!(b = bitcast_i64x2(a)),
// Take the low 8 bytes of x, duplicate them in 16x8 lanes, then shift right.
def!(c = x86_punpckl(x, x)),
def!(d = raw_bitcast_i16x8(c)),
def!(e = x86_psra(d, b)),
// Take the high 8 bytes of x, duplicate them in 16x8 lanes, then shift right.
def!(f = x86_punpckh(x, x)),
def!(g = raw_bitcast_i16x8_again(f)),
def!(h = x86_psra(g, b)),
// Re-pack the vector.
// NOTE(review): x86_packss presumably narrows with signed saturation; after the extra
// 8-bit shift each 16-bit lane should already fit in a signed byte -- confirm.
def!(z = x86_packss(e, h)),
],
); );
} }

View File

@@ -18,6 +18,26 @@ block0:
return v2 return v2
} }
function %sshr_i8x16() -> i8x16 {
; Applies sshr to an i8x16 constant with a shift amount of 1. The directives after the
; sshr instruction list the expected expansion in order: the shift amount is increased
; by 8 and bitcast to i64x2, each half of v1 is unpacked with itself, reinterpreted as
; i16x8 and shifted with x86_psra, and x86_packss recombines the halves into v2.
block0:
v0 = iconst.i32 1
v1 = vconst.i8x16 [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
v2 = sshr v1, v0
; check: v3 = iadd_imm v0, 8
; nextln: v4 = bitcast.i64x2 v3
; nextln: v5 = x86_punpckl v1, v1
; nextln: v6 = raw_bitcast.i16x8 v5
; nextln: v7 = x86_psra v6, v4
; nextln: v8 = x86_punpckh v1, v1
; nextln: v9 = raw_bitcast.i16x8 v8
; nextln: v10 = x86_psra v9, v4
; nextln: v2 = x86_packss v7, v10
return v2
}
function %ishl_i8x16() -> i8x16 { function %ishl_i8x16() -> i8x16 {
block0: block0:
v0 = iconst.i32 1 v0 = iconst.i32 1

View File

@@ -51,6 +51,19 @@ block0:
} }
; run ; run
function %sshr_i8x16() -> b1 {
; Shifts every lane of v1 right by one bit with sshr and compares the result to v3.
; Lanes with the top bit set stay negative (e.g. 0xfd becomes 0xfe); the other lanes
; are halved (e.g. 2 becomes 1). The function yields true only when all lanes match.
block0:
v0 = iconst.i32 1
v1 = vconst.i8x16 [0 0xff 2 0xfd 4 0xfb 6 0xf9 8 0xf7 10 0xf5 12 0xf3 14 0xf1]
v2 = sshr v1, v0
v3 = vconst.i8x16 [0 0xff 1 0xfe 2 0xfd 3 0xfc 4 0xfb 5 0xfa 6 0xf9 7 0xf8]
v4 = icmp eq v2, v3
v5 = vall_true v4
return v5
}
; run
function %ishl_i8x16() -> b1 { function %ishl_i8x16() -> b1 {
block0: block0:
v0 = iconst.i32 1 v0 = iconst.i32 1