Remove the Cranelift vselect instruction (#5918)

* Remove the Cranelift `vselect` instruction

This instruction is documented as selecting lanes based on the "truthy"
value of the condition lane, but the current status of the
implementation of this instruction is:

* x64 - uses the high bit for `f32x4` and `f64x2` and otherwise uses the
  high bit of each byte doing a byte-wise lane select rather than
  whatever the controlling type is.

* AArch64 - this is the same as `bitselect` which is a bit-wise
  selection rather than a lane-wise selection.

* s390x - this is the same as AArch64, a bit-wise selection rather than
  lane-wise.

* interpreter - the interpreter implements the documented semantics of
  selecting based on "truthy" values.

Coupled with the status of the implementation is the fact that this
instruction is not used by WebAssembly SIMD today either. The only use
of this instruction in Cranelift is the nan-canonicalization pass. By
moving nan-canonicalization to `bitselect`, since that has the desired
semantics, there's no longer any need for `vselect`.

Given this situation this commit subsequently removes `vselect` and all
usage of it throughout Cranelift.

Closes #5917

* Review comments

* Bring back vselect opts as bitselect opts

* Clean up vselect usage in the interpreter

* Move bitcast in nan canonicalization

* Add a comment about float optimization
This commit is contained in:
Alex Crichton
2023-03-07 18:42:05 -06:00
committed by GitHub
parent fc45ccc125
commit 07518dfd36
14 changed files with 163 additions and 333 deletions

View File

@@ -1409,8 +1409,8 @@ pub(crate) fn define(
r#" r#"
Conditional select. Conditional select.
This instruction selects whole values. Use `vselect` for This instruction selects whole values. Use `bitselect` to choose each
lane-wise selection. bit according to a mask.
"#, "#,
&formats.ternary, &formats.ternary,
) )
@@ -1458,7 +1458,7 @@ pub(crate) fn define(
For each bit in `c`, this instruction selects the corresponding bit from `x` if the bit For each bit in `c`, this instruction selects the corresponding bit from `x` if the bit
in `c` is 1 and the corresponding bit from `y` if the bit in `c` is 0. See also: in `c` is 1 and the corresponding bit from `y` if the bit in `c` is 0. See also:
`select`, `vselect`. `select`.
"#, "#,
&formats.ternary, &formats.ternary,
) )
@@ -1484,26 +1484,7 @@ pub(crate) fn define(
.operands_out(vec![a]), .operands_out(vec![a]),
); );
let c = &Operand::new("c", &TxN.as_bool()).with_doc("Controlling vector");
let x = &Operand::new("x", TxN).with_doc("Value to use where `c` is true");
let y = &Operand::new("y", TxN).with_doc("Value to use where `c` is false");
let a = &Operand::new("a", TxN); let a = &Operand::new("a", TxN);
ig.push(
Inst::new(
"vselect",
r#"
Vector lane select.
Select lanes from ``x`` or ``y`` controlled by the lanes of the truthy
vector ``c``.
"#,
&formats.ternary,
)
.operands_in(vec![c, x, y])
.operands_out(vec![a]),
);
let s = &Operand::new("s", i8); let s = &Operand::new("s", i8);
ig.push( ig.push(

View File

@@ -1659,11 +1659,6 @@
(rule 1 (lower (has_type (ty_vec128 ty) (bitselect c x y))) (rule 1 (lower (has_type (ty_vec128 ty) (bitselect c x y)))
(bsl ty c x y)) (bsl ty c x y))
;;;; Rules for `vselect` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (has_type (ty_vec128 ty) (vselect c x y)))
(bsl ty c x y))
;;;; Rules for `ireduce` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;; Rules for `ireduce` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; T -> I{64,32,16,8}: We can simply pass through the value: values ;; T -> I{64,32,16,8}: We can simply pass through the value: values

View File

@@ -1170,13 +1170,6 @@
(vec_select ty y z x)) (vec_select ty y z x))
;;;; Rules for `vselect` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Vector select.
(rule (lower (has_type (ty_vec128 ty) (vselect x y z)))
(vec_select ty y z x))
;;;; Rules for `bmask` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;; Rules for `bmask` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (has_type ty (bmask x))) (rule (lower (has_type ty (bmask x)))

View File

@@ -1193,7 +1193,7 @@
(sse_or ty b a))) (sse_or ty b a)))
;; If every byte of the condition is guaranteed to be all ones or all zeroes, ;; If every byte of the condition is guaranteed to be all ones or all zeroes,
;; we can use x86_blend like vselect does. ;; we can use x64_blend.
(rule 1 (lower (has_type ty @ (multi_lane _bits _lanes) (rule 1 (lower (has_type ty @ (multi_lane _bits _lanes)
(bitselect condition (bitselect condition
if_true if_true
@@ -1226,15 +1226,6 @@
(x86_blendv condition if_true if_false))) (x86_blendv condition if_true if_false)))
(x64_blendvpd if_false if_true condition)) (x64_blendvpd if_false if_true condition))
;;;; Rules for `vselect` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (has_type ty @ (multi_lane _bits _lanes)
(vselect condition if_true if_false)))
(x64_blend ty
condition
if_true
if_false))
;;;; Rules for `insertlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;; Rules for `insertlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (insertlane vec @ (value_type ty) val (u8_from_uimm8 idx))) (rule (lower (insertlane vec @ (value_type ty) val (u8_from_uimm8 idx)))

View File

@@ -7,6 +7,7 @@ use crate::ir::condcodes::FloatCC;
use crate::ir::immediates::{Ieee32, Ieee64}; use crate::ir::immediates::{Ieee32, Ieee64};
use crate::ir::types; use crate::ir::types;
use crate::ir::{Function, Inst, InstBuilder, InstructionData, Opcode, Value}; use crate::ir::{Function, Inst, InstBuilder, InstructionData, Opcode, Value};
use crate::opts::MemFlags;
use crate::timing; use crate::timing;
// Canonical 32-bit and 64-bit NaN values. // Canonical 32-bit and 64-bit NaN values.
@@ -70,9 +71,10 @@ fn add_nan_canon_seq(pos: &mut FuncCursor, inst: Inst) {
.select(is_nan, canon_nan, new_res); .select(is_nan, canon_nan, new_res);
}; };
let vector_select = |pos: &mut FuncCursor, canon_nan: Value| { let vector_select = |pos: &mut FuncCursor, canon_nan: Value| {
let is_nan = pos.ins().bitcast(val_type, MemFlags::new(), is_nan);
pos.ins() pos.ins()
.with_result(val) .with_result(val)
.vselect(is_nan, canon_nan, new_res); .bitselect(is_nan, canon_nan, new_res);
}; };
match val_type { match val_type {

View File

@@ -454,56 +454,56 @@
(select ty (icmp _ (IntCC.UnsignedGreaterThanOrEqual) x y) y x)) (select ty (icmp _ (IntCC.UnsignedGreaterThanOrEqual) x y) y x))
(umin ty x y)) (umin ty x y))
;; Transform vselect-of-icmp into {u,s}{min,max} instructions where possible. ;; Transform bitselect-of-icmp into {u,s}{min,max} instructions where possible.
(rule (simplify (rule (simplify
(vselect ty (icmp _ (IntCC.SignedGreaterThan) x y) x y)) (bitselect ty (icmp _ (IntCC.SignedGreaterThan) x y) x y))
(smax ty x y)) (smax ty x y))
(rule (simplify (rule (simplify
(vselect ty (icmp _ (IntCC.SignedGreaterThanOrEqual) x y) x y)) (bitselect ty (icmp _ (IntCC.SignedGreaterThanOrEqual) x y) x y))
(smax ty x y)) (smax ty x y))
(rule (simplify (rule (simplify
(vselect ty (icmp _ (IntCC.UnsignedGreaterThan) x y) x y)) (bitselect ty (icmp _ (IntCC.UnsignedGreaterThan) x y) x y))
(umax ty x y)) (umax ty x y))
(rule (simplify (rule (simplify
(vselect ty (icmp _ (IntCC.UnsignedGreaterThanOrEqual) x y) x y)) (bitselect ty (icmp _ (IntCC.UnsignedGreaterThanOrEqual) x y) x y))
(umax ty x y)) (umax ty x y))
(rule (simplify (rule (simplify
(vselect ty (icmp _ (IntCC.SignedLessThan) x y) x y)) (bitselect ty (icmp _ (IntCC.SignedLessThan) x y) x y))
(smin ty x y)) (smin ty x y))
(rule (simplify (rule (simplify
(vselect ty (icmp _ (IntCC.SignedLessThanOrEqual) x y) x y)) (bitselect ty (icmp _ (IntCC.SignedLessThanOrEqual) x y) x y))
(smin ty x y)) (smin ty x y))
(rule (simplify (rule (simplify
(vselect ty (icmp _ (IntCC.UnsignedLessThan) x y) x y)) (bitselect ty (icmp _ (IntCC.UnsignedLessThan) x y) x y))
(umin ty x y)) (umin ty x y))
(rule (simplify (rule (simplify
(vselect ty (icmp _ (IntCC.UnsignedLessThanOrEqual) x y) x y)) (bitselect ty (icmp _ (IntCC.UnsignedLessThanOrEqual) x y) x y))
(umin ty x y)) (umin ty x y))
;; These are the same rules as above, but when the operands for select are swapped ;; These are the same rules as above, but when the operands for select are swapped
(rule (simplify (rule (simplify
(vselect ty (icmp _ (IntCC.SignedLessThan) x y) y x)) (bitselect ty (icmp _ (IntCC.SignedLessThan) x y) y x))
(smax ty x y)) (smax ty x y))
(rule (simplify (rule (simplify
(vselect ty (icmp _ (IntCC.SignedLessThanOrEqual) x y) y x)) (bitselect ty (icmp _ (IntCC.SignedLessThanOrEqual) x y) y x))
(smax ty x y)) (smax ty x y))
(rule (simplify (rule (simplify
(vselect ty (icmp _ (IntCC.UnsignedLessThan) x y) y x)) (bitselect ty (icmp _ (IntCC.UnsignedLessThan) x y) y x))
(umax ty x y)) (umax ty x y))
(rule (simplify (rule (simplify
(vselect ty (icmp _ (IntCC.UnsignedLessThanOrEqual) x y) y x)) (bitselect ty (icmp _ (IntCC.UnsignedLessThanOrEqual) x y) y x))
(umax ty x y)) (umax ty x y))
(rule (simplify (rule (simplify
(vselect ty (icmp _ (IntCC.SignedGreaterThan) x y) y x)) (bitselect ty (icmp _ (IntCC.SignedGreaterThan) x y) y x))
(smin ty x y)) (smin ty x y))
(rule (simplify (rule (simplify
(vselect ty (icmp _ (IntCC.SignedGreaterThanOrEqual) x y) y x)) (bitselect ty (icmp _ (IntCC.SignedGreaterThanOrEqual) x y) y x))
(smin ty x y)) (smin ty x y))
(rule (simplify (rule (simplify
(vselect ty (icmp _ (IntCC.UnsignedGreaterThan) x y) y x)) (bitselect ty (icmp _ (IntCC.UnsignedGreaterThan) x y) y x))
(umin ty x y)) (umin ty x y))
(rule (simplify (rule (simplify
(vselect ty (icmp _ (IntCC.UnsignedGreaterThanOrEqual) x y) y x)) (bitselect ty (icmp _ (IntCC.UnsignedGreaterThanOrEqual) x y) y x))
(umin ty x y)) (umin ty x y))
;; For floats convert fcmp lt into pseudo_min and gt into pseudo_max ;; For floats convert fcmp lt into pseudo_min and gt into pseudo_max
@@ -520,13 +520,9 @@
(select ty (fcmp _ (FloatCC.GreaterThan) x y) x y)) (select ty (fcmp _ (FloatCC.GreaterThan) x y) x y))
(fmax_pseudo ty x y)) (fmax_pseudo ty x y))
;; Do the same for vectors ;; TODO: perform this same optimization to `f{min,max}_pseudo` for vectors
(rule (simplify ;; with the `bitselect` instruction, but the pattern is a bit more complicated
(vselect ty (fcmp _ (FloatCC.LessThan) x y) x y)) ;; due to most bitselects-over-floats having bitcasts.
(fmin_pseudo ty x y))
(rule (simplify
(vselect ty (fcmp _ (FloatCC.GreaterThan) x y) x y))
(fmax_pseudo ty x y))
;; If both of the multiplied arguments to an `fma` are negated then remove ;; If both of the multiplied arguments to an `fma` are negated then remove
;; both of them since they cancel out. ;; both of them since they cancel out.

View File

@@ -5,10 +5,10 @@ target x86_64
target aarch64 target aarch64
target s390x target s390x
function %vselect_sgt_to_smax(i32x4, i32x4) -> i32x4 { function %bitselect_sgt_to_smax(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4): block0(v0: i32x4, v1: i32x4):
v2 = icmp sgt v0, v1 v2 = icmp sgt v0, v1
v3 = vselect v2, v0, v1 v3 = bitselect v2, v0, v1
return v3 return v3
} }
@@ -17,11 +17,11 @@ block0(v0: i32x4, v1: i32x4):
; check: return v4 ; check: return v4
; This tests an inverted vselect, where the operands are swapped. ; This tests an inverted bitselect, where the operands are swapped.
function %vselect_sgt_to_smax(i32x4, i32x4) -> i32x4 { function %bitselect_sgt_to_smax(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4): block0(v0: i32x4, v1: i32x4):
v2 = icmp sgt v0, v1 v2 = icmp sgt v0, v1
v3 = vselect v2, v1, v0 v3 = bitselect v2, v1, v0
return v3 return v3
} }
@@ -31,10 +31,10 @@ block0(v0: i32x4, v1: i32x4):
function %vselect_sge_to_smax(i32x4, i32x4) -> i32x4 { function %bitselect_sge_to_smax(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4): block0(v0: i32x4, v1: i32x4):
v2 = icmp sge v0, v1 v2 = icmp sge v0, v1
v3 = vselect v2, v0, v1 v3 = bitselect v2, v0, v1
return v3 return v3
} }
@@ -43,10 +43,10 @@ block0(v0: i32x4, v1: i32x4):
; check: return v4 ; check: return v4
function %vselect_ugt_to_umax(i32x4, i32x4) -> i32x4 { function %bitselect_ugt_to_umax(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4): block0(v0: i32x4, v1: i32x4):
v2 = icmp ugt v0, v1 v2 = icmp ugt v0, v1
v3 = vselect v2, v0, v1 v3 = bitselect v2, v0, v1
return v3 return v3
} }
@@ -55,10 +55,10 @@ block0(v0: i32x4, v1: i32x4):
; check: return v4 ; check: return v4
function %vselect_uge_to_umax(i32x4, i32x4) -> i32x4 { function %bitselect_uge_to_umax(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4): block0(v0: i32x4, v1: i32x4):
v2 = icmp uge v0, v1 v2 = icmp uge v0, v1
v3 = vselect v2, v0, v1 v3 = bitselect v2, v0, v1
return v3 return v3
} }
@@ -68,10 +68,10 @@ block0(v0: i32x4, v1: i32x4):
function %vselect_slt_to_smin(i32x4, i32x4) -> i32x4 { function %bitselect_slt_to_smin(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4): block0(v0: i32x4, v1: i32x4):
v2 = icmp slt v0, v1 v2 = icmp slt v0, v1
v3 = vselect v2, v0, v1 v3 = bitselect v2, v0, v1
return v3 return v3
} }
@@ -80,10 +80,10 @@ block0(v0: i32x4, v1: i32x4):
; check: return v4 ; check: return v4
function %vselect_sle_to_smin(i32x4, i32x4) -> i32x4 { function %bitselect_sle_to_smin(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4): block0(v0: i32x4, v1: i32x4):
v2 = icmp sle v0, v1 v2 = icmp sle v0, v1
v3 = vselect v2, v0, v1 v3 = bitselect v2, v0, v1
return v3 return v3
} }
@@ -92,10 +92,10 @@ block0(v0: i32x4, v1: i32x4):
; check: return v4 ; check: return v4
function %vselect_ult_to_umin(i32x4, i32x4) -> i32x4 { function %bitselect_ult_to_umin(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4): block0(v0: i32x4, v1: i32x4):
v2 = icmp ult v0, v1 v2 = icmp ult v0, v1
v3 = vselect v2, v0, v1 v3 = bitselect v2, v0, v1
return v3 return v3
} }
@@ -104,10 +104,10 @@ block0(v0: i32x4, v1: i32x4):
; check: return v4 ; check: return v4
function %vselect_ule_to_umin(i32x4, i32x4) -> i32x4 { function %bitselect_ule_to_umin(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4): block0(v0: i32x4, v1: i32x4):
v2 = icmp ule v0, v1 v2 = icmp ule v0, v1
v3 = vselect v2, v0, v1 v3 = bitselect v2, v0, v1
return v3 return v3
} }
@@ -117,38 +117,14 @@ block0(v0: i32x4, v1: i32x4):
function %vselect_with_different_regs_does_not_optimize(i32x4, i32x4, i32x4, i32x4) -> i32x4 { function %bitselect_with_different_regs_does_not_optimize(i32x4, i32x4, i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4, v2: i32x4, v3: i32x4): block0(v0: i32x4, v1: i32x4, v2: i32x4, v3: i32x4):
v4 = icmp ule v0, v1 v4 = icmp ule v0, v1
v5 = vselect v4, v2, v3 v5 = bitselect v4, v2, v3
return v5 return v5
} }
; check: block0(v0: i32x4, v1: i32x4, v2: i32x4, v3: i32x4): ; check: block0(v0: i32x4, v1: i32x4, v2: i32x4, v3: i32x4):
; check: v4 = icmp ule v0, v1 ; check: v4 = icmp ule v0, v1
; check: v5 = vselect v4, v2, v3 ; check: v5 = bitselect v4, v2, v3
; check: return v5 ; check: return v5
function %vselect_fcmp_gt_to_fmax_pseudo(f32x4, f32x4) -> f32x4 {
block0(v0: f32x4, v1: f32x4):
v2 = fcmp gt v0, v1
v3 = vselect v2, v0, v1
return v3
}
; check: block0(v0: f32x4, v1: f32x4):
; check: v4 = fmax_pseudo v0, v1
; check: return v4
function %vselect_fcmp_lt_to_fmin_pseudo(f32x4, f32x4) -> f32x4 {
block0(v0: f32x4, v1: f32x4):
v2 = fcmp lt v0, v1
v3 = vselect v2, v0, v1
return v3
}
; check: block0(v0: f32x4, v1: f32x4):
; check: v4 = fmin_pseudo v0, v1
; check: return v4

View File

@@ -173,7 +173,7 @@ block0:
function %vselect_i16x8(i16x8, i16x8, i16x8) -> i16x8 { function %vselect_i16x8(i16x8, i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8, v2: i16x8): block0(v0: i16x8, v1: i16x8, v2: i16x8):
v3 = vselect v0, v1, v2 v3 = bitselect v0, v1, v2
return v3 return v3
} }
@@ -187,9 +187,9 @@ block0(v0: i16x8, v1: i16x8, v2: i16x8):
; bsl v0.16b, v1.16b, v2.16b ; bsl v0.16b, v1.16b, v2.16b
; ret ; ret
function %vselect_f32x4(i32x4, f32x4, f32x4) -> f32x4 { function %vselect_f32x4(f32x4, f32x4, f32x4) -> f32x4 {
block0(v0: i32x4, v1: f32x4, v2: f32x4): block0(v0: f32x4, v1: f32x4, v2: f32x4):
v3 = vselect v0, v1, v2 v3 = bitselect v0, v1, v2
return v3 return v3
} }
@@ -203,9 +203,9 @@ block0(v0: i32x4, v1: f32x4, v2: f32x4):
; bsl v0.16b, v1.16b, v2.16b ; bsl v0.16b, v1.16b, v2.16b
; ret ; ret
function %vselect_f64x2(i64x2, f64x2, f64x2) -> f64x2 { function %vselect_f64x2(f64x2, f64x2, f64x2) -> f64x2 {
block0(v0: i64x2, v1: f64x2, v2: f64x2): block0(v0: f64x2, v1: f64x2, v2: f64x2):
v3 = vselect v0, v1, v2 v3 = bitselect v0, v1, v2
return v3 return v3
} }

View File

@@ -514,67 +514,3 @@ block0(v0: i8x16, v1: i8x16, v2: i8x16):
; vsel %v24, %v25, %v26, %v24 ; vsel %v24, %v25, %v26, %v24
; br %r14 ; br %r14
function %vselect_i64x2(i64x2, i64x2, i64x2) -> i64x2 {
block0(v0: i64x2, v1: i64x2, v2: i64x2):
v3 = vselect.i64x2 v0, v1, v2
return v3
}
; VCode:
; block0:
; vsel %v24, %v25, %v26, %v24
; br %r14
;
; Disassembled:
; block0: ; offset 0x0
; vsel %v24, %v25, %v26, %v24
; br %r14
function %vselect_i32x4(i32x4, i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4, v2: i32x4):
v3 = vselect.i32x4 v0, v1, v2
return v3
}
; VCode:
; block0:
; vsel %v24, %v25, %v26, %v24
; br %r14
;
; Disassembled:
; block0: ; offset 0x0
; vsel %v24, %v25, %v26, %v24
; br %r14
function %vselect_i16x8(i16x8, i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8, v2: i16x8):
v3 = vselect.i16x8 v0, v1, v2
return v3
}
; VCode:
; block0:
; vsel %v24, %v25, %v26, %v24
; br %r14
;
; Disassembled:
; block0: ; offset 0x0
; vsel %v24, %v25, %v26, %v24
; br %r14
function %vselect_i8x16(i8x16, i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16, v2: i8x16):
v3 = vselect.i8x16 v0, v1, v2
return v3
}
; VCode:
; block0:
; vsel %v24, %v25, %v26, %v24
; br %r14
;
; Disassembled:
; block0: ; offset 0x0
; vsel %v24, %v25, %v26, %v24
; br %r14

View File

@@ -433,7 +433,7 @@ block0(v0: f64x2, v1: f64x2):
function %i16x8_bitselect(i16x8, i16x8, i16x8) -> i16x8 { function %i16x8_bitselect(i16x8, i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8, v2: i16x8): block0(v0: i16x8, v1: i16x8, v2: i16x8):
v3 = vselect v0, v1, v2 v3 = bitselect v0, v1, v2
return v3 return v3
} }
@@ -441,7 +441,9 @@ block0(v0: i16x8, v1: i16x8, v2: i16x8):
; pushq %rbp ; pushq %rbp
; movq %rsp, %rbp ; movq %rsp, %rbp
; block0: ; block0:
; vpblendvb %xmm0, %xmm1, %xmm0, %xmm2 ; vpand %xmm1, %xmm0, %xmm4
; vpandn %xmm0, %xmm2, %xmm6
; vpor %xmm6, %xmm4, %xmm0
; movq %rbp, %rsp ; movq %rbp, %rsp
; popq %rbp ; popq %rbp
; ret ; ret
@@ -451,14 +453,16 @@ block0(v0: i16x8, v1: i16x8, v2: i16x8):
; pushq %rbp ; pushq %rbp
; movq %rsp, %rbp ; movq %rsp, %rbp
; block1: ; offset 0x4 ; block1: ; offset 0x4
; vpblendvb %xmm0, %xmm1, %xmm2, %xmm0 ; vpand %xmm0, %xmm1, %xmm4
; vpandn %xmm2, %xmm0, %xmm6
; vpor %xmm4, %xmm6, %xmm0
; movq %rbp, %rsp ; movq %rbp, %rsp
; popq %rbp ; popq %rbp
; retq ; retq
function %i32x4_bitselect(i32x4, f32x4, f32x4) -> f32x4 { function %f32x4_bitselect(f32x4, f32x4, f32x4) -> f32x4 {
block0(v0: i32x4, v1: f32x4, v2: f32x4): block0(v0: f32x4, v1: f32x4, v2: f32x4):
v3 = vselect v0, v1, v2 v3 = bitselect v0, v1, v2
return v3 return v3
} }
@@ -466,7 +470,9 @@ block0(v0: i32x4, v1: f32x4, v2: f32x4):
; pushq %rbp ; pushq %rbp
; movq %rsp, %rbp ; movq %rsp, %rbp
; block0: ; block0:
; vblendvps %xmm0, %xmm1, %xmm0, %xmm2 ; vandps %xmm1, %xmm0, %xmm4
; vandnps %xmm0, %xmm2, %xmm6
; vorps %xmm6, %xmm4, %xmm0
; movq %rbp, %rsp ; movq %rbp, %rsp
; popq %rbp ; popq %rbp
; ret ; ret
@@ -476,14 +482,16 @@ block0(v0: i32x4, v1: f32x4, v2: f32x4):
; pushq %rbp ; pushq %rbp
; movq %rsp, %rbp ; movq %rsp, %rbp
; block1: ; offset 0x4 ; block1: ; offset 0x4
; vblendvps %xmm0, %xmm1, %xmm2, %xmm0 ; vandps %xmm0, %xmm1, %xmm4
; vandnps %xmm2, %xmm0, %xmm6
; vorps %xmm4, %xmm6, %xmm0
; movq %rbp, %rsp ; movq %rbp, %rsp
; popq %rbp ; popq %rbp
; retq ; retq
function %i64x2_bitselect(i64x2, f64x2, f64x2) -> f64x2 { function %f64x2_bitselect(f64x2, f64x2, f64x2) -> f64x2 {
block0(v0: i64x2, v1: f64x2, v2: f64x2): block0(v0: f64x2, v1: f64x2, v2: f64x2):
v3 = vselect v0, v1, v2 v3 = bitselect v0, v1, v2
return v3 return v3
} }
@@ -491,7 +499,9 @@ block0(v0: i64x2, v1: f64x2, v2: f64x2):
; pushq %rbp ; pushq %rbp
; movq %rsp, %rbp ; movq %rsp, %rbp
; block0: ; block0:
; vblendvpd %xmm0, %xmm1, %xmm0, %xmm2 ; vandpd %xmm1, %xmm0, %xmm4
; vandnpd %xmm0, %xmm2, %xmm6
; vorpd %xmm6, %xmm4, %xmm0
; movq %rbp, %rsp ; movq %rbp, %rsp
; popq %rbp ; popq %rbp
; ret ; ret
@@ -501,7 +511,9 @@ block0(v0: i64x2, v1: f64x2, v2: f64x2):
; pushq %rbp ; pushq %rbp
; movq %rsp, %rbp ; movq %rsp, %rbp
; block1: ; offset 0x4 ; block1: ; offset 0x4
; vblendvpd %xmm0, %xmm1, %xmm2, %xmm0 ; vandpd %xmm0, %xmm1, %xmm4
; vandnpd %xmm2, %xmm0, %xmm6
; vorpd %xmm4, %xmm6, %xmm0
; movq %rbp, %rsp ; movq %rbp, %rsp
; popq %rbp ; popq %rbp
; retq ; retq

View File

@@ -229,7 +229,7 @@ block0(v0: i32x4, v1: i32x4):
function %vselect_i16x8(i16x8, i16x8, i16x8) -> i16x8 { function %vselect_i16x8(i16x8, i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8, v2: i16x8): block0(v0: i16x8, v1: i16x8, v2: i16x8):
v3 = vselect v0, v1, v2 v3 = bitselect v0, v1, v2
return v3 return v3
} }
@@ -237,9 +237,10 @@ block0(v0: i16x8, v1: i16x8, v2: i16x8):
; pushq %rbp ; pushq %rbp
; movq %rsp, %rbp ; movq %rsp, %rbp
; block0: ; block0:
; movdqa %xmm2, %xmm4 ; movdqa %xmm1, %xmm4
; pblendvb %xmm4, %xmm1, %xmm4 ; pand %xmm4, %xmm0, %xmm4
; movdqa %xmm4, %xmm0 ; pandn %xmm0, %xmm2, %xmm0
; por %xmm0, %xmm4, %xmm0
; movq %rbp, %rsp ; movq %rbp, %rsp
; popq %rbp ; popq %rbp
; ret ; ret
@@ -249,16 +250,17 @@ block0(v0: i16x8, v1: i16x8, v2: i16x8):
; pushq %rbp ; pushq %rbp
; movq %rsp, %rbp ; movq %rsp, %rbp
; block1: ; offset 0x4 ; block1: ; offset 0x4
; movdqa %xmm2, %xmm4 ; movdqa %xmm1, %xmm4
; pblendvb %xmm0, %xmm1, %xmm4 ; pand %xmm0, %xmm4
; movdqa %xmm4, %xmm0 ; pandn %xmm2, %xmm0
; por %xmm4, %xmm0
; movq %rbp, %rsp ; movq %rbp, %rsp
; popq %rbp ; popq %rbp
; retq ; retq
function %vselect_f32x4(i32x4, f32x4, f32x4) -> f32x4 { function %vselect_f32x4(f32x4, f32x4, f32x4) -> f32x4 {
block0(v0: i32x4, v1: f32x4, v2: f32x4): block0(v0: f32x4, v1: f32x4, v2: f32x4):
v3 = vselect v0, v1, v2 v3 = bitselect v0, v1, v2
return v3 return v3
} }
@@ -266,9 +268,10 @@ block0(v0: i32x4, v1: f32x4, v2: f32x4):
; pushq %rbp ; pushq %rbp
; movq %rsp, %rbp ; movq %rsp, %rbp
; block0: ; block0:
; movdqa %xmm2, %xmm4 ; movdqa %xmm1, %xmm4
; blendvps %xmm4, %xmm1, %xmm4 ; andps %xmm4, %xmm0, %xmm4
; movdqa %xmm4, %xmm0 ; andnps %xmm0, %xmm2, %xmm0
; orps %xmm0, %xmm4, %xmm0
; movq %rbp, %rsp ; movq %rbp, %rsp
; popq %rbp ; popq %rbp
; ret ; ret
@@ -278,16 +281,17 @@ block0(v0: i32x4, v1: f32x4, v2: f32x4):
; pushq %rbp ; pushq %rbp
; movq %rsp, %rbp ; movq %rsp, %rbp
; block1: ; offset 0x4 ; block1: ; offset 0x4
; movdqa %xmm2, %xmm4 ; movdqa %xmm1, %xmm4
; blendvps %xmm0, %xmm1, %xmm4 ; andps %xmm0, %xmm4
; movdqa %xmm4, %xmm0 ; andnps %xmm2, %xmm0
; orps %xmm4, %xmm0
; movq %rbp, %rsp ; movq %rbp, %rsp
; popq %rbp ; popq %rbp
; retq ; retq
function %vselect_f64x2(i64x2, f64x2, f64x2) -> f64x2 { function %vselect_f64x2(f64x2, f64x2, f64x2) -> f64x2 {
block0(v0: i64x2, v1: f64x2, v2: f64x2): block0(v0: f64x2, v1: f64x2, v2: f64x2):
v3 = vselect v0, v1, v2 v3 = bitselect v0, v1, v2
return v3 return v3
} }
@@ -295,9 +299,10 @@ block0(v0: i64x2, v1: f64x2, v2: f64x2):
; pushq %rbp ; pushq %rbp
; movq %rsp, %rbp ; movq %rsp, %rbp
; block0: ; block0:
; movdqa %xmm2, %xmm4 ; movdqa %xmm1, %xmm4
; blendvpd %xmm4, %xmm1, %xmm4 ; andpd %xmm4, %xmm0, %xmm4
; movdqa %xmm4, %xmm0 ; andnpd %xmm0, %xmm2, %xmm0
; orpd %xmm0, %xmm4, %xmm0
; movq %rbp, %rsp ; movq %rbp, %rsp
; popq %rbp ; popq %rbp
; ret ; ret
@@ -307,9 +312,10 @@ block0(v0: i64x2, v1: f64x2, v2: f64x2):
; pushq %rbp ; pushq %rbp
; movq %rsp, %rbp ; movq %rsp, %rbp
; block1: ; offset 0x4 ; block1: ; offset 0x4
; movdqa %xmm2, %xmm4 ; movdqa %xmm1, %xmm4
; blendvpd %xmm0, %xmm1, %xmm4 ; andpd %xmm0, %xmm4
; movdqa %xmm4, %xmm0 ; andnpd %xmm2, %xmm0
; orpd %xmm4, %xmm0
; movq %rbp, %rsp ; movq %rbp, %rsp
; popq %rbp ; popq %rbp
; retq ; retq

View File

@@ -1,82 +0,0 @@
test interpret
test run
target s390x
target aarch64
set enable_simd
target x86_64 has_sse3 has_ssse3 has_sse41
function %vselect_i8x16() -> i8x16 {
block0:
v1 = vconst.i8x16 [0 -1 0 -1 0 -1 -1 -1 -1 -1 0 0 0 0 0 0]
v2 = vconst.i8x16 [100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115]
v3 = vconst.i8x16 [200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215]
v4 = vselect v1, v2, v3
return v4
}
; run: %vselect_i8x16() == [200 101 202 103 204 105 106 107 108 109 210 211 212 213 214 215]
function %vselect_i16x8() -> i16x8 {
block0:
v1 = vconst.i16x8 [0 -1 0 -1 0 -1 -1 -1]
v2 = vconst.i16x8 [100 101 102 103 104 105 106 107]
v3 = vconst.i16x8 [200 201 202 203 204 205 206 207]
v4 = vselect v1, v2, v3
return v4
}
; run: %vselect_i16x8() == [200 101 202 103 204 105 106 107]
function %vselect_i32x4_const() -> i32x4 {
block0:
v1 = vconst.i32x4 [0 -1 0 -1]
v2 = vconst.i32x4 [100 101 102 103]
v3 = vconst.i32x4 [200 201 202 203]
v4 = vselect v1, v2, v3
return v4
}
; run: %vselect_i32x4_const() == [200 101 202 103]
function %vselect_i32x4(i32x4, i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4, v2: i32x4):
v3 = vselect v0, v1, v2
return v3
}
; Remember that vselect accepts: 1) the selector vector, 2) the "if true" vector, and 3) the "if false" vector.
; run: %vselect_i32x4([-1 -1 0 0], [1 2 -1 -1], [-1 -1 3 4]) == [1 2 3 4]
function %vselect_i64x2() -> i64x2 {
block0:
v1 = vconst.i64x2 [0 -1]
v2 = vconst.i64x2 [100 101]
v3 = vconst.i64x2 [200 201]
v4 = vselect v1, v2, v3
return v4
}
; run: %vselect_i64x2() == [200 101]
function %vselect_p_i8x16(i8x16, i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16, v2: i8x16):
v3 = vselect v0, v1, v2
return v3
}
; run: %vselect_p_i8x16([-1 0 -1 -1 -1 0 0 0 -1 0 -1 -1 -1 0 0 0], [1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]) == [1 18 3 4 5 22 23 24 9 26 11 12 13 30 31 32]
function %vselect_p_i16x8(i16x8, i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8, v2: i16x8):
v3 = vselect v0, v1, v2
return v3
}
; run: %vselect_p_i16x8([-1 0 -1 -1 -1 0 0 0], [1 2 3 4 5 6 7 8], [17 18 19 20 21 22 23 24]) == [1 18 3 4 5 22 23 24]
function %vselect_p_i32x4(i32x4, i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4, v2: i32x4):
v3 = vselect v0, v1, v2
return v3
}
; run: %vselect_p_i32x4([-1 0 -1 -1], [1 2 3 4], [100000 200000 300000 400000]) == [1 200000 3 4]
function %vselect_p_i64x2(i64x2, i64x2, i64x2) -> i64x2 {
block0(v0: i64x2, v1: i64x2, v2: i64x2):
v3 = vselect v0, v1, v2
return v3
}
; run: %vselect_p_i64x2([-1 0], [1 2], [100000000000 200000000000]) == [1 200000000000]

View File

@@ -603,11 +603,7 @@ where
Opcode::Select | Opcode::SelectSpectreGuard => { Opcode::Select | Opcode::SelectSpectreGuard => {
choose(arg(0)?.into_bool()?, arg(1)?, arg(2)?) choose(arg(0)?.into_bool()?, arg(1)?, arg(2)?)
} }
Opcode::Bitselect => { Opcode::Bitselect => assign(bitselect(arg(0)?, arg(1)?, arg(2)?)?),
let mask_a = Value::and(arg(0)?, arg(1)?)?;
let mask_b = Value::and(Value::not(arg(0)?)?, arg(2)?)?;
assign(Value::or(mask_a, mask_b)?)
}
Opcode::Icmp => assign(icmp( Opcode::Icmp => assign(icmp(
ctrl_ty, ctrl_ty,
inst.cond_code().unwrap(), inst.cond_code().unwrap(),
@@ -623,7 +619,7 @@ where
Opcode::Smin => { Opcode::Smin => {
if ctrl_ty.is_vector() { if ctrl_ty.is_vector() {
let icmp = icmp(ctrl_ty, IntCC::SignedGreaterThan, &arg(1)?, &arg(0)?)?; let icmp = icmp(ctrl_ty, IntCC::SignedGreaterThan, &arg(1)?, &arg(0)?)?;
assign(vselect(&icmp, &arg(0)?, &arg(1)?, ctrl_ty)?) assign(bitselect(icmp, arg(0)?, arg(1)?)?)
} else { } else {
choose(Value::gt(&arg(1)?, &arg(0)?)?, arg(0)?, arg(1)?) choose(Value::gt(&arg(1)?, &arg(0)?)?, arg(0)?, arg(1)?)
} }
@@ -631,7 +627,7 @@ where
Opcode::Umin => { Opcode::Umin => {
if ctrl_ty.is_vector() { if ctrl_ty.is_vector() {
let icmp = icmp(ctrl_ty, IntCC::UnsignedGreaterThan, &arg(1)?, &arg(0)?)?; let icmp = icmp(ctrl_ty, IntCC::UnsignedGreaterThan, &arg(1)?, &arg(0)?)?;
assign(vselect(&icmp, &arg(0)?, &arg(1)?, ctrl_ty)?) assign(bitselect(icmp, arg(0)?, arg(1)?)?)
} else { } else {
choose( choose(
Value::gt( Value::gt(
@@ -646,7 +642,7 @@ where
Opcode::Smax => { Opcode::Smax => {
if ctrl_ty.is_vector() { if ctrl_ty.is_vector() {
let icmp = icmp(ctrl_ty, IntCC::SignedGreaterThan, &arg(0)?, &arg(1)?)?; let icmp = icmp(ctrl_ty, IntCC::SignedGreaterThan, &arg(0)?, &arg(1)?)?;
assign(vselect(&icmp, &arg(0)?, &arg(1)?, ctrl_ty)?) assign(bitselect(icmp, arg(0)?, arg(1)?)?)
} else { } else {
choose(Value::gt(&arg(0)?, &arg(1)?)?, arg(0)?, arg(1)?) choose(Value::gt(&arg(0)?, &arg(1)?)?, arg(0)?, arg(1)?)
} }
@@ -654,7 +650,7 @@ where
Opcode::Umax => { Opcode::Umax => {
if ctrl_ty.is_vector() { if ctrl_ty.is_vector() {
let icmp = icmp(ctrl_ty, IntCC::UnsignedGreaterThan, &arg(0)?, &arg(1)?)?; let icmp = icmp(ctrl_ty, IntCC::UnsignedGreaterThan, &arg(0)?, &arg(1)?)?;
assign(vselect(&icmp, &arg(0)?, &arg(1)?, ctrl_ty)?) assign(bitselect(icmp, arg(0)?, arg(1)?)?)
} else { } else {
choose( choose(
Value::gt( Value::gt(
@@ -1067,7 +1063,6 @@ where
} }
assign(Value::int(result, ctrl_ty)?) assign(Value::int(result, ctrl_ty)?)
} }
Opcode::Vselect => assign(vselect(&arg(0)?, &arg(1)?, &arg(2)?, ctrl_ty)?),
Opcode::VanyTrue => { Opcode::VanyTrue => {
let lane_ty = ctrl_ty.lane_type(); let lane_ty = ctrl_ty.lane_type();
let init = V::bool(false, true, lane_ty)?; let init = V::bool(false, true, lane_ty)?;
@@ -1641,20 +1636,11 @@ where
vectorizelanes(&result, vector_type) vectorizelanes(&result, vector_type)
} }
fn vselect<V>(c: &V, x: &V, y: &V, vector_type: types::Type) -> ValueResult<V> fn bitselect<V>(c: V, x: V, y: V) -> ValueResult<V>
where where
V: Value, V: Value,
{ {
let c = extractlanes(c, vector_type)?; let mask_x = Value::and(c.clone(), x)?;
let x = extractlanes(x, vector_type)?; let mask_y = Value::and(Value::not(c)?, y)?;
let y = extractlanes(y, vector_type)?; Value::or(mask_x, mask_y)
let mut new_vec = SimdVec::new();
for (c, (x, y)) in c.into_iter().zip(x.into_iter().zip(y.into_iter())) {
if Value::eq(&c, &Value::int(0, vector_type.lane_type())?)? {
new_vec.push(y);
} else {
new_vec.push(x);
}
}
vectorizelanes(&new_vec, vector_type)
} }

View File

@@ -218,6 +218,28 @@ macro_rules! binary_match {
}; };
} }
// Applies a binary operator token to two `DataValue`s of the same variant and
// yields `Ok(<result>)`.
//
// Invocation shape: `bitop!(<op>(<lhs>, <rhs>))`, e.g. `bitop!(&(a, b))`.
//
// * `$op`   - a single operator token spliced between the two payloads.
// * `$arg1` - left-hand operand expression.
// * `$arg2` - right-hand operand expression.
//
// The macro expands to an `Ok(..)` expression, so it is meant to be used as
// the tail of a function returning a `Result`-like type.
macro_rules! bitop {
( $op:tt($arg1:expr, $arg2:expr) ) => {
Ok(match ($arg1, $arg2) {
// Scalar variants: apply the operator directly to the payloads and
// re-wrap the result in the same variant.
(DataValue::I8(a), DataValue::I8(b)) => DataValue::I8(a $op b),
(DataValue::I16(a), DataValue::I16(b)) => DataValue::I16(a $op b),
(DataValue::I32(a), DataValue::I32(b)) => DataValue::I32(a $op b),
(DataValue::I64(a), DataValue::I64(b)) => DataValue::I64(a $op b),
(DataValue::I128(a), DataValue::I128(b)) => DataValue::I128(a $op b),
// NOTE(review): this requires the F32/F64 payload types to implement the
// operator themselves; presumably it acts on the raw bits - confirm.
(DataValue::F32(a), DataValue::F32(b)) => DataValue::F32(a $op b),
(DataValue::F64(a), DataValue::F64(b)) => DataValue::F64(a $op b),
// 128-bit vectors: work on a copy of the left operand and combine it
// element by element with the right operand (elements are presumably
// bytes - confirm against the `V128` payload type).
(DataValue::V128(a), DataValue::V128(b)) => {
let mut a2 = a.clone();
for (a, b) in a2.iter_mut().zip(b.iter()) {
*a = *a $op *b;
}
DataValue::V128(a2)
}
// Mismatched variants, or variants not listed above, panic via
// `unimplemented!()` rather than producing an `Err`.
_ => unimplemented!(),
})
};
}
impl Value for DataValue { impl Value for DataValue {
fn ty(&self) -> Type { fn ty(&self) -> Type {
self.ty() self.ty()
@@ -686,19 +708,35 @@ impl Value for DataValue {
} }
fn and(self, other: Self) -> ValueResult<Self> { fn and(self, other: Self) -> ValueResult<Self> {
binary_match!(&(self, other); [I8, I16, I32, I64, I128, F32, F64]) bitop!(&(self, other))
} }
fn or(self, other: Self) -> ValueResult<Self> { fn or(self, other: Self) -> ValueResult<Self> {
binary_match!(|(self, other); [I8, I16, I32, I64, I128, F32, F64]) bitop!(|(self, other))
} }
fn xor(self, other: Self) -> ValueResult<Self> { fn xor(self, other: Self) -> ValueResult<Self> {
binary_match!(^(self, other); [I8, I16, I32, I64, I128, F32, F64]) bitop!(^(self, other))
} }
fn not(self) -> ValueResult<Self> { fn not(self) -> ValueResult<Self> {
unary_match!(!(self); [I8, I16, I32, I64, I128, F32, F64]) Ok(match self {
DataValue::I8(a) => DataValue::I8(!a),
DataValue::I16(a) => DataValue::I16(!a),
DataValue::I32(a) => DataValue::I32(!a),
DataValue::I64(a) => DataValue::I64(!a),
DataValue::I128(a) => DataValue::I128(!a),
DataValue::F32(a) => DataValue::F32(!a),
DataValue::F64(a) => DataValue::F64(!a),
DataValue::V128(a) => {
let mut a2 = a.clone();
for a in a2.iter_mut() {
*a = !*a;
}
DataValue::V128(a2)
}
_ => unimplemented!(),
})
} }
fn count_ones(self) -> ValueResult<Self> { fn count_ones(self) -> ValueResult<Self> {