Legalize i8x16.sshr using pack/unpack instructions
Because an arithmetic right shift must replicate each lane's sign bit, this legalization cannot easily reuse the masks used for i8x16.ushr or i8x16.ishl.
This commit is contained in:
@@ -326,6 +326,7 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro
|
|||||||
let fcmp = insts.by_name("fcmp");
|
let fcmp = insts.by_name("fcmp");
|
||||||
let fabs = insts.by_name("fabs");
|
let fabs = insts.by_name("fabs");
|
||||||
let fneg = insts.by_name("fneg");
|
let fneg = insts.by_name("fneg");
|
||||||
|
let iadd_imm = insts.by_name("iadd_imm");
|
||||||
let icmp = insts.by_name("icmp");
|
let icmp = insts.by_name("icmp");
|
||||||
let imax = insts.by_name("imax");
|
let imax = insts.by_name("imax");
|
||||||
let imin = insts.by_name("imin");
|
let imin = insts.by_name("imin");
|
||||||
@@ -349,6 +350,7 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro
|
|||||||
let vall_true = insts.by_name("vall_true");
|
let vall_true = insts.by_name("vall_true");
|
||||||
let vany_true = insts.by_name("vany_true");
|
let vany_true = insts.by_name("vany_true");
|
||||||
|
|
||||||
|
let x86_packss = x86_instructions.by_name("x86_packss");
|
||||||
let x86_pmaxs = x86_instructions.by_name("x86_pmaxs");
|
let x86_pmaxs = x86_instructions.by_name("x86_pmaxs");
|
||||||
let x86_pmaxu = x86_instructions.by_name("x86_pmaxu");
|
let x86_pmaxu = x86_instructions.by_name("x86_pmaxu");
|
||||||
let x86_pmins = x86_instructions.by_name("x86_pmins");
|
let x86_pmins = x86_instructions.by_name("x86_pmins");
|
||||||
@@ -357,6 +359,8 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro
|
|||||||
let x86_pshufd = x86_instructions.by_name("x86_pshufd");
|
let x86_pshufd = x86_instructions.by_name("x86_pshufd");
|
||||||
let x86_psra = x86_instructions.by_name("x86_psra");
|
let x86_psra = x86_instructions.by_name("x86_psra");
|
||||||
let x86_ptest = x86_instructions.by_name("x86_ptest");
|
let x86_ptest = x86_instructions.by_name("x86_ptest");
|
||||||
|
let x86_punpckh = x86_instructions.by_name("x86_punpckh");
|
||||||
|
let x86_punpckl = x86_instructions.by_name("x86_punpckl");
|
||||||
|
|
||||||
let imm = &shared.imm;
|
let imm = &shared.imm;
|
||||||
|
|
||||||
@@ -373,6 +377,7 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro
|
|||||||
// Set up variables and immediates.
|
// Set up variables and immediates.
|
||||||
let uimm8_zero = Literal::constant(&imm.uimm8, 0x00);
|
let uimm8_zero = Literal::constant(&imm.uimm8, 0x00);
|
||||||
let uimm8_one = Literal::constant(&imm.uimm8, 0x01);
|
let uimm8_one = Literal::constant(&imm.uimm8, 0x01);
|
||||||
|
let uimm8_eight = Literal::constant(&imm.uimm8, 8);
|
||||||
let u128_zeroes = constant(vec![0x00; 16]);
|
let u128_zeroes = constant(vec![0x00; 16]);
|
||||||
let u128_ones = constant(vec![0xff; 16]);
|
let u128_ones = constant(vec![0xff; 16]);
|
||||||
let u128_seventies = constant(vec![0x70; 16]);
|
let u128_seventies = constant(vec![0x70; 16]);
|
||||||
@@ -381,8 +386,12 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro
|
|||||||
let c = var("c");
|
let c = var("c");
|
||||||
let d = var("d");
|
let d = var("d");
|
||||||
let e = var("e");
|
let e = var("e");
|
||||||
|
let f = var("f");
|
||||||
|
let g = var("g");
|
||||||
|
let h = var("h");
|
||||||
let x = var("x");
|
let x = var("x");
|
||||||
let y = var("y");
|
let y = var("y");
|
||||||
|
let z = var("z");
|
||||||
|
|
||||||
// Limit the SIMD vector size: eventually multiple vector sizes may be supported
|
// Limit the SIMD vector size: eventually multiple vector sizes may be supported
|
||||||
// but for now only SSE-sized vectors are available.
|
// but for now only SSE-sized vectors are available.
|
||||||
@@ -484,13 +493,37 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
// SIMD shift left (arithmetic)
|
// SIMD shift right (arithmetic)
|
||||||
for ty in &[I16, I32, I64] {
|
for ty in &[I16, I32, I64] {
|
||||||
let sshr = sshr.bind(vector(*ty, sse_vector_size));
|
let sshr = sshr.bind(vector(*ty, sse_vector_size));
|
||||||
let bitcast = bitcast.bind(vector(I64, sse_vector_size));
|
let bitcast_i64x2 = bitcast.bind(vector(I64, sse_vector_size));
|
||||||
narrow.legalize(
|
narrow.legalize(
|
||||||
def!(a = sshr(x, y)),
|
def!(a = sshr(x, y)),
|
||||||
vec![def!(b = bitcast(y)), def!(a = x86_psra(x, b))],
|
vec![def!(b = bitcast_i64x2(y)), def!(a = x86_psra(x, b))],
|
||||||
|
);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
let sshr = sshr.bind(vector(I8, sse_vector_size));
|
||||||
|
let bitcast_i64x2 = bitcast.bind(vector(I64, sse_vector_size));
|
||||||
|
let raw_bitcast_i16x8 = raw_bitcast.bind(vector(I16, sse_vector_size));
|
||||||
|
let raw_bitcast_i16x8_again = raw_bitcast.bind(vector(I16, sse_vector_size));
|
||||||
|
narrow.legalize(
|
||||||
|
def!(z = sshr(x, y)),
|
||||||
|
vec![
|
||||||
|
// Since we will use the high byte of each 16x8 lane, shift an extra 8 bits.
|
||||||
|
def!(a = iadd_imm(y, uimm8_eight)),
|
||||||
|
def!(b = bitcast_i64x2(a)),
|
||||||
|
// Take the low 8 bytes of x, duplicate them in 16x8 lanes, then shift right.
|
||||||
|
def!(c = x86_punpckl(x, x)),
|
||||||
|
def!(d = raw_bitcast_i16x8(c)),
|
||||||
|
def!(e = x86_psra(d, b)),
|
||||||
|
// Take the high 8 bytes of x, duplicate them in 16x8 lanes, then shift right.
|
||||||
|
def!(f = x86_punpckh(x, x)),
|
||||||
|
def!(g = raw_bitcast_i16x8_again(f)),
|
||||||
|
def!(h = x86_psra(g, b)),
|
||||||
|
// Re-pack the vector.
|
||||||
|
def!(z = x86_packss(e, h)),
|
||||||
|
],
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -18,6 +18,26 @@ block0:
|
|||||||
return v2
|
return v2
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function %sshr_i8x16() -> i8x16 {
|
||||||
|
block0:
|
||||||
|
v0 = iconst.i32 1
|
||||||
|
v1 = vconst.i8x16 [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
|
||||||
|
v2 = sshr v1, v0
|
||||||
|
; check: v3 = iadd_imm v0, 8
|
||||||
|
; nextln: v4 = bitcast.i64x2 v3
|
||||||
|
|
||||||
|
; nextln: v5 = x86_punpckl v1, v1
|
||||||
|
; nextln: v6 = raw_bitcast.i16x8 v5
|
||||||
|
; nextln: v7 = x86_psra v6, v4
|
||||||
|
|
||||||
|
; nextln: v8 = x86_punpckh v1, v1
|
||||||
|
; nextln: v9 = raw_bitcast.i16x8 v8
|
||||||
|
; nextln: v10 = x86_psra v9, v4
|
||||||
|
|
||||||
|
; nextln: v2 = x86_packss v7, v10
|
||||||
|
return v2
|
||||||
|
}
|
||||||
|
|
||||||
function %ishl_i8x16() -> i8x16 {
|
function %ishl_i8x16() -> i8x16 {
|
||||||
block0:
|
block0:
|
||||||
v0 = iconst.i32 1
|
v0 = iconst.i32 1
|
||||||
|
|||||||
@@ -51,6 +51,19 @@ block0:
|
|||||||
}
|
}
|
||||||
; run
|
; run
|
||||||
|
|
||||||
|
function %sshr_i8x16() -> b1 {
|
||||||
|
block0:
|
||||||
|
v0 = iconst.i32 1
|
||||||
|
v1 = vconst.i8x16 [0 0xff 2 0xfd 4 0xfb 6 0xf9 8 0xf7 10 0xf5 12 0xf3 14 0xf1]
|
||||||
|
v2 = sshr v1, v0
|
||||||
|
|
||||||
|
v3 = vconst.i8x16 [0 0xff 1 0xfe 2 0xfd 3 0xfc 4 0xfb 5 0xfa 6 0xf9 7 0xf8]
|
||||||
|
v4 = icmp eq v2, v3
|
||||||
|
v5 = vall_true v4
|
||||||
|
return v5
|
||||||
|
}
|
||||||
|
; run
|
||||||
|
|
||||||
function %ishl_i8x16() -> b1 {
|
function %ishl_i8x16() -> b1 {
|
||||||
block0:
|
block0:
|
||||||
v0 = iconst.i32 1
|
v0 = iconst.i32 1
|
||||||
|
|||||||
Reference in New Issue
Block a user