x64: Improve memory support in {insert,extract}lane (#5982)

* x64: Improve memory support in `{insert,extract}lane`

This commit adds support to Cranelift for emitting `pextr{b,w,d,q}`
with a memory destination, merging a store-of-extract operation into one
instruction. Additionally, AVX support is added for the `pextr*`
instructions.
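
As a minimal sketch of the pattern being merged (function and value names
are illustrative, not taken from this commit), the store-of-extract shape
that can now lower to a single instruction looks like:

function %store_extracted_lane(i8x16, i64) {
block0(v0: i8x16, v1: i64):
    ; with SSE4.1 this extract/store pair can merge into one `pextrb`
    ; that writes the lane straight to memory; with `has_avx` the
    ; `vpextrb` encoding is used instead
    v2 = extractlane v0, 1
    store v2, v1
    return
}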

I've also tried to ensure that codegen tests and runtests exist for all
forms of these instructions.
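
The runtests cover both directions: `extractlane` results stored to the
stack, and `insertlane` operands loaded from it. A hedged sketch of the
load-feeding-insert shape those tests exercise (names illustrative):

function %insert_loaded_lane(i8x16, i64) -> i8x16 {
block0(v0: i8x16, v1: i64):
    ; a load feeding insertlane can use the memory form of `pinsrb`
    v2 = load.i8 v1
    v3 = insertlane v0, v2, 1
    return v3
}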

* Add missing commas

* Fix tests

Alex Crichton authored on 2023-03-13 14:30:44 -05:00; committed by GitHub
parent 5c95e6fbaf
commit 6ecdc2482e
17 changed files with 1066 additions and 54 deletions


@@ -4,6 +4,7 @@ target aarch64
target s390x
set enable_simd
target x86_64 has_sse3 has_ssse3 has_sse41
target x86_64 has_sse3 has_ssse3 has_sse41 has_avx
function %extractlane_4(i8x16) -> i8 {
block0(v0: i8x16):
@@ -33,3 +34,69 @@ block0(v0: i64x2):
return v1
}
; run: %extractlane_1([0 4294967297]) == 4294967297

function %extractlane_i8x16_through_stack(i8x16) -> i8 {
ss0 = explicit_slot 8
block0(v0: i8x16):
v2 = stack_addr.i64 ss0
v3 = extractlane v0, 1
store v3, v2
v4 = load.i8 v2
return v4
}
; run: %extractlane_i8x16_through_stack([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]) == 2

function %extractlane_i16x8_through_stack(i16x8) -> i16 {
ss0 = explicit_slot 8
block0(v0: i16x8):
v2 = stack_addr.i64 ss0
v3 = extractlane v0, 2
store v3, v2
v4 = load.i16 v2
return v4
}
; run: %extractlane_i16x8_through_stack([1 2 3 4 5 6 7 8]) == 3

function %extractlane_i32x4_through_stack(i32x4) -> i32 {
ss0 = explicit_slot 8
block0(v0: i32x4):
v2 = stack_addr.i64 ss0
v3 = extractlane v0, 3
store v3, v2
v4 = load.i32 v2
return v4
}
; run: %extractlane_i32x4_through_stack([1 2 3 4]) == 4

function %extractlane_i64x2_through_stack(i64x2) -> i64 {
ss0 = explicit_slot 8
block0(v0: i64x2):
v2 = stack_addr.i64 ss0
v3 = extractlane v0, 0
store v3, v2
v4 = load.i64 v2
return v4
}
; run: %extractlane_i64x2_through_stack([1 2]) == 1

function %extractlane_f32x4_through_stack(f32x4) -> f32 {
ss0 = explicit_slot 8
block0(v0: f32x4):
v2 = stack_addr.i64 ss0
v3 = extractlane v0, 3
store v3, v2
v4 = load.f32 v2
return v4
}
; run: %extractlane_f32x4_through_stack([0x1.0 0x2.0 0x3.0 0x4.0]) == 0x4.0

function %extractlane_f64x2_through_stack(f64x2) -> f64 {
ss0 = explicit_slot 8
block0(v0: f64x2):
v2 = stack_addr.i64 ss0
v3 = extractlane v0, 0
store v3, v2
v4 = load.f64 v2
return v4
}
; run: %extractlane_f64x2_through_stack([0x1.0 0x2.0]) == 0x1.0
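
Each `_through_stack` test above stores the extracted lane to a stack slot
before loading it back, so the store can exercise the memory-destination
lowering. A hedged sketch of the expected selection for the i8x16 case, in
Intel syntax (operands illustrative, not copied from the commit's codegen
tests):

    pextrb  byte ptr [rsp], xmm0, 1   ; SSE4.1: write lane 1 straight to memory
    vpextrb byte ptr [rsp], xmm0, 1   ; AVX form when `has_avx` is set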


@@ -47,3 +47,91 @@ block0(v0: f64x2, v1: f64):
return v2
}
; run: %insertlane_1_in_f64x2([0x1.0 0x2.0], 0x3.0) == [0x1.0 0x3.0]

function %insertlane_i8x16_through_stack(i8x16, i8) -> i8x16 {
ss0 = explicit_slot 8
block0(v0: i8x16, v1: i8):
v2 = stack_addr.i64 ss0
store v1, v2
v3 = load.i8 v2
v4 = insertlane v0, v3, 1
return v4
}
; run: %insertlane_i8x16_through_stack([1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1], 2) == [1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1]

function %insertlane_i16x8_through_stack(i16x8, i16) -> i16x8 {
ss0 = explicit_slot 8
block0(v0: i16x8, v1: i16):
v2 = stack_addr.i64 ss0
store v1, v2
v3 = load.i16 v2
v4 = insertlane v0, v3, 2
return v4
}
; run: %insertlane_i16x8_through_stack([1 1 1 1 1 1 1 1], 2) == [1 1 2 1 1 1 1 1]

function %insertlane_i32x4_through_stack(i32x4, i32) -> i32x4 {
ss0 = explicit_slot 8
block0(v0: i32x4, v1: i32):
v2 = stack_addr.i64 ss0
store v1, v2
v3 = load.i32 v2
v4 = insertlane v0, v3, 3
return v4
}
; run: %insertlane_i32x4_through_stack([1 1 1 1], 2) == [1 1 1 2]

function %insertlane_i64x2_through_stack(i64x2, i64) -> i64x2 {
ss0 = explicit_slot 8
block0(v0: i64x2, v1: i64):
v2 = stack_addr.i64 ss0
store v1, v2
v3 = load.i64 v2
v4 = insertlane v0, v3, 0
return v4
}
; run: %insertlane_i64x2_through_stack([1 1], 2) == [2 1]

function %insertlane_f32x4_through_stack(f32x4, f32) -> f32x4 {
ss0 = explicit_slot 8
block0(v0: f32x4, v1: f32):
v2 = stack_addr.i64 ss0
store v1, v2
v3 = load.f32 v2
v4 = insertlane v0, v3, 3
return v4
}
; run: %insertlane_f32x4_through_stack([0x1.0 0x1.0 0x1.0 0x1.0], 0x2.0) == [0x1.0 0x1.0 0x1.0 0x2.0]

function %insertlane_f32x4_through_stack2(f32x4, f32) -> f32x4 {
ss0 = explicit_slot 8
block0(v0: f32x4, v1: f32):
v2 = stack_addr.i64 ss0
store v1, v2
v3 = load.f32 v2
v4 = insertlane v0, v3, 0
return v4
}
; run: %insertlane_f32x4_through_stack2([0x1.0 0x1.0 0x1.0 0x1.0], 0x2.0) == [0x2.0 0x1.0 0x1.0 0x1.0]

function %insertlane_f64x2_through_stack(f64x2, f64) -> f64x2 {
ss0 = explicit_slot 8
block0(v0: f64x2, v1: f64):
v2 = stack_addr.i64 ss0
store v1, v2
v3 = load.f64 v2
v4 = insertlane v0, v3, 0
return v4
}
; run: %insertlane_f64x2_through_stack([0x1.0 0x1.0], 0x2.0) == [0x2.0 0x1.0]

function %insertlane_f64x2_through_stack2(f64x2, f64) -> f64x2 {
ss0 = explicit_slot 8
block0(v0: f64x2, v1: f64):
v2 = stack_addr.i64 ss0
store v1, v2
v3 = load.f64 v2
v4 = insertlane v0, v3, 1
return v4
}
; run: %insertlane_f64x2_through_stack2([0x1.0 0x1.0], 0x2.0) == [0x1.0 0x2.0]
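
These insertlane tests likewise route the scalar operand through a stack
slot so the load can merge into the insert. A hedged sketch for the i16x8
case, in Intel syntax (operands illustrative):

    pinsrw  xmm0, word ptr [rsp], 2        ; SSE2: insert a word from memory
    vpinsrw xmm0, xmm0, word ptr [rsp], 2  ; AVX three-operand form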