Remove the Cranelift vselect instruction (#5918)

* Remove the Cranelift `vselect` instruction

This instruction is documented as selecting lanes based on the "truthy"
value of each condition lane, but the current implementations disagree
on those semantics (a sketch contrasting the two behaviors follows this
list):

* x64 - uses the high bit of each lane for `f32x4` and `f64x2`, and
  otherwise uses the high bit of each byte, performing a byte-wise lane
  select rather than a select based on the controlling type.

* AArch64 - this is the same as `bitselect`, which is a bit-wise
  selection rather than a lane-wise selection.

* s390x - this is the same as AArch64, a bit-wise selection rather than
  lane-wise.

* interpreter - the interpreter implements the documented semantics of
  selecting based on "truthy" values.
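
For illustration, here's a minimal sketch in plain Rust (hypothetical
helper names, not Cranelift code) contrasting the two behaviors on four
32-bit lanes:

    fn vselect_lanewise(c: [u32; 4], x: [u32; 4], y: [u32; 4]) -> [u32; 4] {
        // Documented `vselect`: a lane comes wholly from `x` when the
        // condition lane is non-zero ("truthy"), otherwise from `y`.
        let mut out = [0u32; 4];
        for i in 0..4 {
            out[i] = if c[i] != 0 { x[i] } else { y[i] };
        }
        out
    }

    fn bitselect_bitwise(c: [u32; 4], x: [u32; 4], y: [u32; 4]) -> [u32; 4] {
        // `bitselect`: each result bit comes from `x` where the condition
        // bit is 1 and from `y` where it is 0.
        let mut out = [0u32; 4];
        for i in 0..4 {
            out[i] = (c[i] & x[i]) | (!c[i] & y[i]);
        }
        out
    }

    fn main() {
        // The two agree when every condition lane is all-ones or all-zeros
        // (e.g. a vector comparison result), but not for a mask like 1.
        let c = [1, 0, 0xffff_ffff, 0];
        let x = [0xaaaa_aaaa; 4];
        let y = [0x5555_5555; 4];
        assert_ne!(vselect_lanewise(c, x, y), bitselect_bitwise(c, x, y));
    }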

Coupled with this implementation divergence is the fact that the
instruction is not used by the WebAssembly SIMD translation today
either. The only use of `vselect` in Cranelift is the
nan-canonicalization pass, and there the condition is a comparison mask
whose lanes are all-ones or all-zeros, so bit-wise and lane-wise
selection coincide. Moving nan-canonicalization to `bitselect`, which
has the desired semantics, leaves no remaining need for `vselect`.
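
As a scalar model of why that works (a hedged sketch with an assumed
canonical-NaN pattern, not the actual pass), the NaN check yields an
all-ones or all-zeros mask, making a bit-wise select behave like a lane
select:

    // Quiet-NaN bit pattern used here for illustration.
    const CANON_NAN_F32: u32 = 0x7fc0_0000;

    fn canonicalize_nan(x: f32) -> f32 {
        // NaN is the only value for which `x != x`, so the mask is
        // either all ones or all zeros.
        let mask: u32 = if x != x { u32::MAX } else { 0 };
        // bitselect(mask, canonical, original) on the raw bits.
        f32::from_bits((mask & CANON_NAN_F32) | (!mask & x.to_bits()))
    }

    fn main() {
        assert_eq!(canonicalize_nan(f32::NAN).to_bits(), CANON_NAN_F32);
        assert_eq!(canonicalize_nan(1.5), 1.5);
    }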

Given this situation, this commit removes `vselect` and all usage of it
throughout Cranelift.

Closes #5917

* Review comments

* Bring back vselect opts as bitselect opts

* Clean up vselect usage in the interpreter

* Move bitcast in nan canonicalization

* Add a comment about float optimization
Author: Alex Crichton
Date: 2023-03-07 18:42:05 -06:00
Committed by: GitHub
Parent: fc45ccc125
Commit: 07518dfd36
14 changed files with 163 additions and 333 deletions


@@ -173,7 +173,7 @@ block0:
 function %vselect_i16x8(i16x8, i16x8, i16x8) -> i16x8 {
 block0(v0: i16x8, v1: i16x8, v2: i16x8):
-    v3 = vselect v0, v1, v2
+    v3 = bitselect v0, v1, v2
     return v3
 }
@@ -187,9 +187,9 @@ block0(v0: i16x8, v1: i16x8, v2: i16x8):
 ; bsl v0.16b, v1.16b, v2.16b
 ; ret
-function %vselect_f32x4(i32x4, f32x4, f32x4) -> f32x4 {
-block0(v0: i32x4, v1: f32x4, v2: f32x4):
-    v3 = vselect v0, v1, v2
+function %vselect_f32x4(f32x4, f32x4, f32x4) -> f32x4 {
+block0(v0: f32x4, v1: f32x4, v2: f32x4):
+    v3 = bitselect v0, v1, v2
     return v3
 }
@@ -203,9 +203,9 @@ block0(v0: i32x4, v1: f32x4, v2: f32x4):
 ; bsl v0.16b, v1.16b, v2.16b
 ; ret
-function %vselect_f64x2(i64x2, f64x2, f64x2) -> f64x2 {
-block0(v0: i64x2, v1: f64x2, v2: f64x2):
-    v3 = vselect v0, v1, v2
+function %vselect_f64x2(f64x2, f64x2, f64x2) -> f64x2 {
+block0(v0: f64x2, v1: f64x2, v2: f64x2):
+    v3 = bitselect v0, v1, v2
     return v3
 }


@@ -514,67 +514,3 @@ block0(v0: i8x16, v1: i8x16, v2: i8x16):
 ; vsel %v24, %v25, %v26, %v24
 ; br %r14
-function %vselect_i64x2(i64x2, i64x2, i64x2) -> i64x2 {
-block0(v0: i64x2, v1: i64x2, v2: i64x2):
-    v3 = vselect.i64x2 v0, v1, v2
-    return v3
-}
-; VCode:
-; block0:
-; vsel %v24, %v25, %v26, %v24
-; br %r14
-;
-; Disassembled:
-; block0: ; offset 0x0
-; vsel %v24, %v25, %v26, %v24
-; br %r14
-function %vselect_i32x4(i32x4, i32x4, i32x4) -> i32x4 {
-block0(v0: i32x4, v1: i32x4, v2: i32x4):
-    v3 = vselect.i32x4 v0, v1, v2
-    return v3
-}
-; VCode:
-; block0:
-; vsel %v24, %v25, %v26, %v24
-; br %r14
-;
-; Disassembled:
-; block0: ; offset 0x0
-; vsel %v24, %v25, %v26, %v24
-; br %r14
-function %vselect_i16x8(i16x8, i16x8, i16x8) -> i16x8 {
-block0(v0: i16x8, v1: i16x8, v2: i16x8):
-    v3 = vselect.i16x8 v0, v1, v2
-    return v3
-}
-; VCode:
-; block0:
-; vsel %v24, %v25, %v26, %v24
-; br %r14
-;
-; Disassembled:
-; block0: ; offset 0x0
-; vsel %v24, %v25, %v26, %v24
-; br %r14
-function %vselect_i8x16(i8x16, i8x16, i8x16) -> i8x16 {
-block0(v0: i8x16, v1: i8x16, v2: i8x16):
-    v3 = vselect.i8x16 v0, v1, v2
-    return v3
-}
-; VCode:
-; block0:
-; vsel %v24, %v25, %v26, %v24
-; br %r14
-;
-; Disassembled:
-; block0: ; offset 0x0
-; vsel %v24, %v25, %v26, %v24
-; br %r14


@@ -433,7 +433,7 @@ block0(v0: f64x2, v1: f64x2):
 function %i16x8_bitselect(i16x8, i16x8, i16x8) -> i16x8 {
 block0(v0: i16x8, v1: i16x8, v2: i16x8):
-    v3 = vselect v0, v1, v2
+    v3 = bitselect v0, v1, v2
     return v3
 }
@@ -441,7 +441,9 @@ block0(v0: i16x8, v1: i16x8, v2: i16x8):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block0:
-; vpblendvb %xmm0, %xmm1, %xmm0, %xmm2
+; vpand %xmm1, %xmm0, %xmm4
+; vpandn %xmm0, %xmm2, %xmm6
+; vpor %xmm6, %xmm4, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -451,14 +453,16 @@ block0(v0: i16x8, v1: i16x8, v2: i16x8):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block1: ; offset 0x4
-; vpblendvb %xmm0, %xmm1, %xmm2, %xmm0
+; vpand %xmm0, %xmm1, %xmm4
+; vpandn %xmm2, %xmm0, %xmm6
+; vpor %xmm4, %xmm6, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; retq
-function %i32x4_bitselect(i32x4, f32x4, f32x4) -> f32x4 {
-block0(v0: i32x4, v1: f32x4, v2: f32x4):
-    v3 = vselect v0, v1, v2
+function %f32x4_bitselect(f32x4, f32x4, f32x4) -> f32x4 {
+block0(v0: f32x4, v1: f32x4, v2: f32x4):
+    v3 = bitselect v0, v1, v2
     return v3
 }
@@ -466,7 +470,9 @@ block0(v0: i32x4, v1: f32x4, v2: f32x4):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block0:
-; vblendvps %xmm0, %xmm1, %xmm0, %xmm2
+; vandps %xmm1, %xmm0, %xmm4
+; vandnps %xmm0, %xmm2, %xmm6
+; vorps %xmm6, %xmm4, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -476,14 +482,16 @@ block0(v0: i32x4, v1: f32x4, v2: f32x4):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block1: ; offset 0x4
-; vblendvps %xmm0, %xmm1, %xmm2, %xmm0
+; vandps %xmm0, %xmm1, %xmm4
+; vandnps %xmm2, %xmm0, %xmm6
+; vorps %xmm4, %xmm6, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; retq
-function %i64x2_bitselect(i64x2, f64x2, f64x2) -> f64x2 {
-block0(v0: i64x2, v1: f64x2, v2: f64x2):
-    v3 = vselect v0, v1, v2
+function %f64x2_bitselect(f64x2, f64x2, f64x2) -> f64x2 {
+block0(v0: f64x2, v1: f64x2, v2: f64x2):
+    v3 = bitselect v0, v1, v2
     return v3
 }
@@ -491,7 +499,9 @@ block0(v0: i64x2, v1: f64x2, v2: f64x2):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block0:
-; vblendvpd %xmm0, %xmm1, %xmm0, %xmm2
+; vandpd %xmm1, %xmm0, %xmm4
+; vandnpd %xmm0, %xmm2, %xmm6
+; vorpd %xmm6, %xmm4, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -501,7 +511,9 @@ block0(v0: i64x2, v1: f64x2, v2: f64x2):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block1: ; offset 0x4
-; vblendvpd %xmm0, %xmm1, %xmm2, %xmm0
+; vandpd %xmm0, %xmm1, %xmm4
+; vandnpd %xmm2, %xmm0, %xmm6
+; vorpd %xmm4, %xmm6, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; retq


@@ -229,7 +229,7 @@ block0(v0: i32x4, v1: i32x4):
 function %vselect_i16x8(i16x8, i16x8, i16x8) -> i16x8 {
 block0(v0: i16x8, v1: i16x8, v2: i16x8):
-    v3 = vselect v0, v1, v2
+    v3 = bitselect v0, v1, v2
     return v3
 }
@@ -237,9 +237,10 @@ block0(v0: i16x8, v1: i16x8, v2: i16x8):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block0:
-; movdqa %xmm2, %xmm4
-; pblendvb %xmm4, %xmm1, %xmm4
-; movdqa %xmm4, %xmm0
+; movdqa %xmm1, %xmm4
+; pand %xmm4, %xmm0, %xmm4
+; pandn %xmm0, %xmm2, %xmm0
+; por %xmm0, %xmm4, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -249,16 +250,17 @@ block0(v0: i16x8, v1: i16x8, v2: i16x8):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block1: ; offset 0x4
-; movdqa %xmm2, %xmm4
-; pblendvb %xmm0, %xmm1, %xmm4
-; movdqa %xmm4, %xmm0
+; movdqa %xmm1, %xmm4
+; pand %xmm0, %xmm4
+; pandn %xmm2, %xmm0
+; por %xmm4, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; retq
-function %vselect_f32x4(i32x4, f32x4, f32x4) -> f32x4 {
-block0(v0: i32x4, v1: f32x4, v2: f32x4):
-    v3 = vselect v0, v1, v2
+function %vselect_f32x4(f32x4, f32x4, f32x4) -> f32x4 {
+block0(v0: f32x4, v1: f32x4, v2: f32x4):
+    v3 = bitselect v0, v1, v2
     return v3
 }
@@ -266,9 +268,10 @@ block0(v0: i32x4, v1: f32x4, v2: f32x4):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block0:
-; movdqa %xmm2, %xmm4
-; blendvps %xmm4, %xmm1, %xmm4
-; movdqa %xmm4, %xmm0
+; movdqa %xmm1, %xmm4
+; andps %xmm4, %xmm0, %xmm4
+; andnps %xmm0, %xmm2, %xmm0
+; orps %xmm0, %xmm4, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -278,16 +281,17 @@ block0(v0: i32x4, v1: f32x4, v2: f32x4):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block1: ; offset 0x4
-; movdqa %xmm2, %xmm4
-; blendvps %xmm0, %xmm1, %xmm4
-; movdqa %xmm4, %xmm0
+; movdqa %xmm1, %xmm4
+; andps %xmm0, %xmm4
+; andnps %xmm2, %xmm0
+; orps %xmm4, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; retq
-function %vselect_f64x2(i64x2, f64x2, f64x2) -> f64x2 {
-block0(v0: i64x2, v1: f64x2, v2: f64x2):
-    v3 = vselect v0, v1, v2
+function %vselect_f64x2(f64x2, f64x2, f64x2) -> f64x2 {
+block0(v0: f64x2, v1: f64x2, v2: f64x2):
+    v3 = bitselect v0, v1, v2
     return v3
 }
@@ -295,9 +299,10 @@ block0(v0: i64x2, v1: f64x2, v2: f64x2):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block0:
-; movdqa %xmm2, %xmm4
-; blendvpd %xmm4, %xmm1, %xmm4
-; movdqa %xmm4, %xmm0
+; movdqa %xmm1, %xmm4
+; andpd %xmm4, %xmm0, %xmm4
+; andnpd %xmm0, %xmm2, %xmm0
+; orpd %xmm0, %xmm4, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -307,9 +312,10 @@ block0(v0: i64x2, v1: f64x2, v2: f64x2):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block1: ; offset 0x4
-; movdqa %xmm2, %xmm4
-; blendvpd %xmm0, %xmm1, %xmm4
-; movdqa %xmm4, %xmm0
+; movdqa %xmm1, %xmm4
+; andpd %xmm0, %xmm4
+; andnpd %xmm2, %xmm0
+; orpd %xmm4, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; retq