Add x86 implementation of 8x16 ushr

This involves some large mask tables that may hurt code size but reduce the number of instructions. See https://github.com/WebAssembly/simd/issues/117 for a more in-depth discussion on this.
2020-03-20 18:59:20 -07:00
parent 39c0a28d77
commit 3f47291f2e
4 changed files with 98 additions and 12 deletions
--- a/cranelift/codegen/meta/src/isa/x86/legalize.rs
+++ b/cranelift/codegen/meta/src/isa/x86/legalize.rs
@@ -357,7 +357,6 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro
    let x86_pshufd = x86_instructions.by_name("x86_pshufd");
    let x86_psll = x86_instructions.by_name("x86_psll");
    let x86_psra = x86_instructions.by_name("x86_psra");
-    let x86_psrl = x86_instructions.by_name("x86_psrl");
    let x86_ptest = x86_instructions.by_name("x86_ptest");

    let imm = &shared.imm;
@@ -496,16 +495,6 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro
        );
    }

-    // SIMD shift right (logical)
-    for ty in &[I16, I32, I64] {
-        let ushr = ushr.bind(vector(*ty, sse_vector_size));
-        let bitcast = bitcast.bind(vector(I64, sse_vector_size));
-        narrow.legalize(
-            def!(a = ushr(x, y)),
-            vec![def!(b = bitcast(y)), def!(a = x86_psrl(x, b))],
-        );
-    }
-
    // SIMD shift left (arithmetic)
    for ty in &[I16, I32, I64] {
        let sshr = sshr.bind(vector(*ty, sse_vector_size));
@@ -695,6 +684,7 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro
    narrow.custom_legalize(extractlane, "convert_extractlane");
    narrow.custom_legalize(insertlane, "convert_insertlane");
    narrow.custom_legalize(ineg, "convert_ineg");
+    narrow.custom_legalize(ushr, "convert_ushr");

    narrow.build_and_add_to(&mut shared.transform_groups);
 }