Legalize i8x16.sshr using pack/unpack instructions

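Because an arithmetic right shift must replicate each lane's sign bit, this legalization cannot simply reuse the mask-based approach used for i8x16.ushr and i8x16.ishl; instead, the bytes are unpacked into 16-bit lanes, shifted there, and packed back into bytes.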
2020-03-25 11:06:54 -07:00
parent fb6e8f784d
commit d24f23285b
3 changed files with 69 additions and 3 deletions
--- a/cranelift/codegen/meta/src/isa/x86/legalize.rs
+++ b/cranelift/codegen/meta/src/isa/x86/legalize.rs
@@ -326,6 +326,7 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro
    let fcmp = insts.by_name("fcmp");
    let fabs = insts.by_name("fabs");
    let fneg = insts.by_name("fneg");
+    let iadd_imm = insts.by_name("iadd_imm");
    let icmp = insts.by_name("icmp");
    let imax = insts.by_name("imax");
    let imin = insts.by_name("imin");
@@ -349,6 +350,7 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro
    let vall_true = insts.by_name("vall_true");
    let vany_true = insts.by_name("vany_true");

+    let x86_packss = x86_instructions.by_name("x86_packss");
    let x86_pmaxs = x86_instructions.by_name("x86_pmaxs");
    let x86_pmaxu = x86_instructions.by_name("x86_pmaxu");
    let x86_pmins = x86_instructions.by_name("x86_pmins");
@@ -357,6 +359,8 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro
    let x86_pshufd = x86_instructions.by_name("x86_pshufd");
    let x86_psra = x86_instructions.by_name("x86_psra");
    let x86_ptest = x86_instructions.by_name("x86_ptest");
+    let x86_punpckh = x86_instructions.by_name("x86_punpckh");
+    let x86_punpckl = x86_instructions.by_name("x86_punpckl");

    let imm = &shared.imm;

@@ -373,6 +377,7 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro
    // Set up variables and immediates.
    let uimm8_zero = Literal::constant(&imm.uimm8, 0x00);
    let uimm8_one = Literal::constant(&imm.uimm8, 0x01);
+    let uimm8_eight = Literal::constant(&imm.uimm8, 8);
    let u128_zeroes = constant(vec![0x00; 16]);
    let u128_ones = constant(vec![0xff; 16]);
    let u128_seventies = constant(vec![0x70; 16]);
@@ -381,8 +386,12 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro
    let c = var("c");
    let d = var("d");
    let e = var("e");
+    let f = var("f");
+    let g = var("g");
+    let h = var("h");
    let x = var("x");
    let y = var("y");
+    let z = var("z");

    // Limit the SIMD vector size: eventually multiple vector sizes may be supported
    // but for now only SSE-sized vectors are available.
@@ -484,13 +493,37 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro
        );
    }

-    // SIMD shift left (arithmetic)
+    // SIMD shift right (arithmetic)
    for ty in &[I16, I32, I64] {
        let sshr = sshr.bind(vector(*ty, sse_vector_size));
-        let bitcast = bitcast.bind(vector(I64, sse_vector_size));
+        let bitcast_i64x2 = bitcast.bind(vector(I64, sse_vector_size));
        narrow.legalize(
            def!(a = sshr(x, y)),
-            vec![def!(b = bitcast(y)), def!(a = x86_psra(x, b))],
+            vec![def!(b = bitcast_i64x2(y)), def!(a = x86_psra(x, b))],
+        );
+    }
+    {
+        let sshr = sshr.bind(vector(I8, sse_vector_size));
+        let bitcast_i64x2 = bitcast.bind(vector(I64, sse_vector_size));
+        let raw_bitcast_i16x8 = raw_bitcast.bind(vector(I16, sse_vector_size));
+        let raw_bitcast_i16x8_again = raw_bitcast.bind(vector(I16, sse_vector_size));
+        narrow.legalize(
+            def!(z = sshr(x, y)),
+            vec![
+                // Since we will use the high byte of each 16x8 lane, shift an extra 8 bits.
+                def!(a = iadd_imm(y, uimm8_eight)),
+                def!(b = bitcast_i64x2(a)),
+                // Take the low 8 bytes of x, duplicate them in 16x8 lanes, then shift right.
+                def!(c = x86_punpckl(x, x)),
+                def!(d = raw_bitcast_i16x8(c)),
+                def!(e = x86_psra(d, b)),
+                // Take the high 8 bytes of x, duplicate them in 16x8 lanes, then shift right.
+                def!(f = x86_punpckh(x, x)),
+                def!(g = raw_bitcast_i16x8_again(f)),
+                def!(h = x86_psra(g, b)),
+                // Re-pack the vector.
+                def!(z = x86_packss(e, h)),
+            ],
        );
    }

--- a/cranelift/filetests/filetests/isa/x86/simd-bitwise-legalize.clif
+++ b/cranelift/filetests/filetests/isa/x86/simd-bitwise-legalize.clif
@@ -18,6 +18,26 @@ block0:
    return v2
 }

+function %sshr_i8x16() -> i8x16 {
+block0:
+    v0 = iconst.i32 1
+    v1 = vconst.i8x16 [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
+    v2 = sshr v1, v0
+    ; check:  v3 = iadd_imm v0, 8
+    ; nextln: v4 = bitcast.i64x2 v3
+
+    ; nextln: v5 = x86_punpckl v1, v1
+    ; nextln: v6 = raw_bitcast.i16x8 v5
+    ; nextln: v7 = x86_psra v6, v4
+
+    ; nextln: v8 = x86_punpckh v1, v1
+    ; nextln: v9 = raw_bitcast.i16x8 v8
+    ; nextln: v10 = x86_psra v9, v4
+
+    ; nextln: v2 = x86_packss v7, v10
+    return v2
+}
+
 function %ishl_i8x16() -> i8x16 {
 block0:
    v0 = iconst.i32 1
--- a/cranelift/filetests/filetests/isa/x86/simd-bitwise-run.clif
+++ b/cranelift/filetests/filetests/isa/x86/simd-bitwise-run.clif
@@ -51,6 +51,19 @@ block0:
 }
 ; run

+function %sshr_i8x16() -> b1 {
+block0:
+    v0 = iconst.i32 1
+    v1 = vconst.i8x16 [0 0xff 2 0xfd 4 0xfb 6 0xf9 8 0xf7 10 0xf5 12 0xf3 14 0xf1]
+    v2 = sshr v1, v0
+
+    v3 = vconst.i8x16 [0 0xff 1 0xfe 2 0xfd 3 0xfc 4 0xfb 5 0xfa 6 0xf9 7 0xf8]
+    v4 = icmp eq v2, v3
+    v5 = vall_true v4
+    return v5
+}
+; run
+
 function %ishl_i8x16() -> b1 {
 block0:
    v0 = iconst.i32 1