Add x86 implementation of SIMD load_extend instructions

2020-02-18 15:05:49 -08:00
parent cf1cfdcace
commit 54398156ea
3 changed files with 85 additions and 0 deletions
--- a/cranelift/codegen/meta/src/isa/x86/encodings.rs
+++ b/cranelift/codegen/meta/src/isa/x86/encodings.rs
@@ -1600,6 +1600,9 @@ fn define_simd(
    let regspill = shared.by_name("regspill");
    let sadd_sat = shared.by_name("sadd_sat");
    let scalar_to_vector = shared.by_name("scalar_to_vector");
+    let sload8x8 = shared.by_name("sload8x8");
+    let sload16x4 = shared.by_name("sload16x4");
+    let sload32x2 = shared.by_name("sload32x2");
    let spill = shared.by_name("spill");
    let sqrt = shared.by_name("sqrt");
    let sshr_imm = shared.by_name("sshr_imm");
@@ -1607,6 +1610,9 @@ fn define_simd(
    let store = shared.by_name("store");
    let store_complex = shared.by_name("store_complex");
    let uadd_sat = shared.by_name("uadd_sat");
+    let uload8x8 = shared.by_name("uload8x8");
+    let uload16x4 = shared.by_name("uload16x4");
+    let uload32x2 = shared.by_name("uload32x2");
    let ushr_imm = shared.by_name("ushr_imm");
    let usub_sat = shared.by_name("usub_sat");
    let vconst = shared.by_name("vconst");
@@ -1926,6 +1932,31 @@ fn define_simd(
        e.enc_32_64_rec(bound_copy_nop, rec_stacknull, 0);
    }

+    // SIMD load extend
+    for (inst, opcodes) in &[
+        (uload8x8, &PMOVZXBW),
+        (uload16x4, &PMOVZXWD),
+        (uload32x2, &PMOVZXDQ),
+        (sload8x8, &PMOVSXBW),
+        (sload16x4, &PMOVSXWD),
+        (sload32x2, &PMOVSXDQ),
+    ] {
+        let isap = Some(use_sse41_simd);
+        for recipe in &[rec_fld, rec_fldDisp8, rec_fldDisp32] {
+            let inst = *inst;
+            let template = recipe.opcodes(*opcodes);
+            e.enc32_maybe_isap(inst.clone().bind(I32), template.clone(), isap);
+            // REX-less encoding must come after REX encoding so we don't use it by
+            // default. Otherwise reg-alloc would never use r8 and up.
+            e.enc64_maybe_isap(inst.clone().bind(I32), template.clone().rex(), isap);
+            e.enc64_maybe_isap(inst.clone().bind(I32), template.clone(), isap);
+            // Similar to above. TODO: some of this duplication can be cleaned up by
+            // infer_rex(), tracked in https://github.com/bytecodealliance/cranelift/issues/1090
+            e.enc64_maybe_isap(inst.clone().bind(I64), template.clone().rex(), isap);
+            e.enc64_maybe_isap(inst.bind(I64), template, isap);
+        }
+    }
+
    // SIMD integer addition
    for (ty, opcodes) in &[(I8, &PADDB), (I16, &PADDW), (I32, &PADDD), (I64, &PADDQ)] {
        let iadd = iadd.bind(vector(*ty, sse_vector_size));
--- a/cranelift/codegen/meta/src/isa/x86/opcodes.rs
+++ b/cranelift/codegen/meta/src/isa/x86/opcodes.rs
@@ -417,6 +417,30 @@ pub static PMINUD: [u8; 4] = [0x66, 0x0f, 0x38, 0x3b];
 /// xmm1 (SSE4.1).
 pub static PMINUW: [u8; 4] = [0x66, 0x0f, 0x38, 0x3a];

+/// Sign extend 8 packed 8-bit integers in the low 8 bytes of xmm2/m64 to 8 packed 16-bit
+/// integers in xmm1 (SSE4.1).
+pub static PMOVSXBW: [u8; 4] = [0x66, 0x0f, 0x38, 0x20];
+
+/// Sign extend 4 packed 16-bit integers in the low 8 bytes of xmm2/m64 to 4 packed 32-bit
+/// integers in xmm1 (SSE4.1).
+pub static PMOVSXWD: [u8; 4] = [0x66, 0x0f, 0x38, 0x23];
+
+/// Sign extend 2 packed 32-bit integers in the low 8 bytes of xmm2/m64 to 2 packed 64-bit
+/// integers in xmm1 (SSE4.1).
+pub static PMOVSXDQ: [u8; 4] = [0x66, 0x0f, 0x38, 0x25];
+
+/// Zero extend 8 packed 8-bit integers in the low 8 bytes of xmm2/m64 to 8 packed 16-bit
+/// integers in xmm1 (SSE4.1).
+pub static PMOVZXBW: [u8; 4] = [0x66, 0x0f, 0x38, 0x30];
+
+/// Zero extend 4 packed 16-bit integers in the low 8 bytes of xmm2/m64 to 4 packed 32-bit
+/// integers in xmm1 (SSE4.1).
+pub static PMOVZXWD: [u8; 4] = [0x66, 0x0f, 0x38, 0x33];
+
+/// Zero extend 2 packed 32-bit integers in the low 8 bytes of xmm2/m64 to 2 packed 64-bit
+/// integers in xmm1 (SSE4.1).
+pub static PMOVZXDQ: [u8; 4] = [0x66, 0x0f, 0x38, 0x35];
+
 /// Multiply the packed signed word integers in xmm1 and xmm2/m128, and store the low 16 bits of
 /// the results in xmm1 (SSE2).
 pub static PMULLW: [u8; 3] = [0x66, 0x0f, 0xd5];