Avoid extra register movement when lowering an x86 insertlane to a float vector

This commit is contained in:
Andrew Brown
2019-08-23 11:38:29 -07:00
parent 3dfc68afb1
commit 295b2ef614
11 changed files with 334 additions and 55 deletions

View File

@@ -28,3 +28,41 @@ ebb0:
return v3
}
; run
function %test_extractlane_i32_with_vector_reuse() -> b1 {
ebb0:
v0 = iconst.i32 42
v1 = iconst.i32 99
v2 = splat.i32x4 v0
v3 = insertlane v2, 2, v1
v4 = extractlane v3, 3
v5 = icmp eq v4, v0
v6 = extractlane v3, 2
v7 = icmp eq v6, v1
v8 = band v5, v7
return v8
}
; run
function %test_extractlane_f32_with_vector_reuse() -> b1 {
ebb0:
v0 = f32const 0x42.42
v1 = f32const 0x99.99
v2 = splat.f32x4 v0
v3 = insertlane v2, 2, v1
v4 = extractlane v3, 3
v5 = fcmp eq v4, v0
v6 = extractlane v3, 2
v7 = fcmp eq v6, v1
v8 = band v5, v7
return v8
}
; run

View File

@@ -0,0 +1,42 @@
test binemit
set enable_simd
target x86_64 haswell
; for insertlane, floats are legalized differently than integers and booleans; integers and booleans use x86_pinsr
; which is manually placed in the IR so that it can be binemit-tested
function %test_insertlane_b8() {
ebb0:
[-, %rax] v0 = bconst.b8 true
[-, %rbx] v1 = bconst.b8 false
[-, %xmm0] v2 = splat.b8x16 v0
[-, %xmm0] v3 = x86_pinsr v2, 10, v1 ; bin: 66 0f 3a 20 c3 0a
return
}
function %test_insertlane_i16() {
ebb0:
[-, %rax] v0 = iconst.i16 4
[-, %rbx] v1 = iconst.i16 5
[-, %xmm1] v2 = splat.i16x8 v0
[-, %xmm1] v3 = x86_pinsr v2, 4, v1 ; bin: 66 0f c4 cb 04
return
}
function %test_insertlane_i32() {
ebb0:
[-, %rax] v0 = iconst.i32 42
[-, %rbx] v1 = iconst.i32 99
[-, %xmm4] v2 = splat.i32x4 v0
[-, %xmm4] v3 = x86_pinsr v2, 2, v1 ; bin: 66 0f 3a 22 e3 02
return
}
function %test_insertlane_b64() {
ebb0:
[-, %rax] v0 = bconst.b64 true
[-, %rbx] v1 = bconst.b64 false
[-, %xmm2] v2 = splat.b64x2 v0
[-, %xmm2] v3 = x86_pinsr v2, 1, v1 ; bin: 66 48 0f 3a 22 d3 01
return
}

View File

@@ -0,0 +1,48 @@
test run
set enable_simd
; TODO once SIMD vector comparison is implemented, remove use of extractlane below
function %test_insertlane_b8() -> b8 {
ebb0:
v1 = bconst.b8 true
v2 = vconst.b8x16 [false false false false false false false false false false false false false false false false]
v3 = insertlane v2, 10, v1
v4 = extractlane v3, 10
return v4
}
; run
function %test_insertlane_f32() -> b1 {
ebb0:
v0 = f32const 0x42.42
v1 = vconst.f32x4 0x00
v2 = insertlane v1, 1, v0
v3 = extractlane v2, 1
v4 = fcmp eq v3, v0
return v4
}
; run
function %test_insertlane_f64_lane1() -> b1 {
ebb0:
v0 = f64const 0x42.42
v1 = vconst.f64x2 0x00
v2 = insertlane v1, 1, v0
v3 = extractlane v2, 1
v4 = fcmp eq v3, v0
return v4
}
; run
function %test_insertlane_f64_lane0() -> b1 {
ebb0:
v0 = f64const 0x42.42
v1 = vconst.f64x2 0x00
v2 = insertlane v1, 0, v0
v3 = extractlane v2, 0
v4 = fcmp eq v3, v0
return v4
}
; run

View File

@@ -1,39 +0,0 @@
test binemit
set enable_simd
target x86_64 haswell
function %test_insertlane_b8() {
ebb0:
[-, %rax] v0 = bconst.b8 true
[-, %rbx] v1 = bconst.b8 false
[-, %xmm0] v2 = splat.b8x16 v0
[-, %xmm0] v3 = insertlane v2, 10, v1 ; bin: 66 0f 3a 20 c3 0a
return
}
function %test_insertlane_i16() {
ebb0:
[-, %rax] v0 = iconst.i16 4
[-, %rbx] v1 = iconst.i16 5
[-, %xmm1] v2 = splat.i16x8 v0
[-, %xmm1] v3 = insertlane v2, 4, v1 ; bin: 66 0f c4 cb 04
return
}
function %test_insertlane_i32() {
ebb0:
[-, %rax] v0 = iconst.i32 42
[-, %rbx] v1 = iconst.i32 99
[-, %xmm4] v2 = splat.i32x4 v0
[-, %xmm4] v3 = insertlane v2, 2, v1 ; bin: 66 0f 3a 22 e3 02
return
}
function %test_insertlane_f64() {
ebb0:
[-, %rax] v0 = f64const 0x0.0
[-, %rbx] v1 = f64const 0x4.2
[-, %xmm2] v2 = splat.f64x2 v0
[-, %xmm2] v3 = insertlane v2, 1, v1 ; bin: 66 48 0f 3a 22 d3 01
return
}

View File

@@ -33,7 +33,7 @@ ebb0:
; check: ebb0:
; nextln: v0 = iconst.i64 42
; nextln: v2 = scalar_to_vector.i64x2 v0
; nextln: v1 = insertlane v2, 1, v0
; nextln: v1 = x86_pinsr v2, 1, v0
; nextln: return v1
@@ -48,7 +48,7 @@ ebb0:
; check: ebb0:
; nextln: v0 = bconst.b16 true
; nextln: v2 = scalar_to_vector.b16x8 v0
; nextln: v3 = insertlane v2, 1, v0
; nextln: v3 = x86_pinsr v2, 1, v0
; nextln: v4 = raw_bitcast.i32x4 v3
; nextln: v5 = x86_pshufd v4, 0
; nextln: v1 = raw_bitcast.b16x8 v5