Remove the Cranelift vselect instruction (#5918)

* Remove the Cranelift `vselect` instruction This instruction is documented as selecting lanes based on the "truthy" value of the condition lane, but the current status of the implementation of this instruction is: * x64 - uses the high bit for `f32x4` and `f64x2` and otherwise uses the high bit of each byte doing a byte-wise lane select rather than whatever the controlling type is. * AArch64 - this is the same as `bitselect` which is a bit-wise selection rather than a lane-wise selection. * s390x - this is the same as AArch64, a bit-wise selection rather than lane-wise. * interpreter - the interpreter implements the documented semantics of selecting based on "truthy" values. Coupled with the status of the implementation is the fact that this instruction is not used by WebAssembly SIMD today either. The only use of this instruction in Cranelift is the nan-canonicalization pass. By moving nan-canonicalization to `bitselect`, since that has the desired semantics, there's no longer any need for `vselect`. Given this situation this commit subsequently removes `vselect` and all usage of it throughout Cranelift. Closes #5917 * Review comments * Bring back vselect opts as bitselect opts * Clean up vselect usage in the interpreter * Move bitcast in nan canonicalization * Add a comment about float optimization
2023-03-07 18:42:05 -06:00
parent fc45ccc125
commit 07518dfd36
14 changed files with 163 additions and 333 deletions
--- a/cranelift/codegen/meta/src/shared/instructions.rs
+++ b/cranelift/codegen/meta/src/shared/instructions.rs
@@ -1409,8 +1409,8 @@ pub(crate) fn define(
            r#"
        Conditional select.

-        This instruction selects whole values. Use `vselect` for
-        lane-wise selection.
+        This instruction selects whole values. Use `bitselect` to choose each
+        bit according to a mask.
        "#,
            &formats.ternary,
        )
@@ -1458,7 +1458,7 @@ pub(crate) fn define(

        For each bit in `c`, this instruction selects the corresponding bit from `x` if the bit
        in `c` is 1 and the corresponding bit from `y` if the bit in `c` is 0. See also:
-        `select`, `vselect`.
+        `select`.
        "#,
            &formats.ternary,
        )
@@ -1484,26 +1484,7 @@ pub(crate) fn define(
        .operands_out(vec![a]),
    );

-    let c = &Operand::new("c", &TxN.as_bool()).with_doc("Controlling vector");
-    let x = &Operand::new("x", TxN).with_doc("Value to use where `c` is true");
-    let y = &Operand::new("y", TxN).with_doc("Value to use where `c` is false");
    let a = &Operand::new("a", TxN);
-
-    ig.push(
-        Inst::new(
-            "vselect",
-            r#"
-        Vector lane select.
-
-        Select lanes from ``x`` or ``y`` controlled by the lanes of the truthy
-        vector ``c``.
-        "#,
-            &formats.ternary,
-        )
-        .operands_in(vec![c, x, y])
-        .operands_out(vec![a]),
-    );
-
    let s = &Operand::new("s", i8);

    ig.push(
--- a/cranelift/codegen/src/isa/aarch64/lower.isle
+++ b/cranelift/codegen/src/isa/aarch64/lower.isle
@@ -1659,11 +1659,6 @@
 (rule 1 (lower (has_type (ty_vec128 ty) (bitselect c x y)))
        (bsl ty c x y))

-;;;; Rules for `vselect` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-(rule (lower (has_type (ty_vec128 ty) (vselect c x y)))
-        (bsl ty c x y))
-
 ;;;; Rules for `ireduce` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

 ;; T -> I{64,32,16,8}: We can simply pass through the value: values
--- a/cranelift/codegen/src/isa/s390x/lower.isle
+++ b/cranelift/codegen/src/isa/s390x/lower.isle
@@ -1170,13 +1170,6 @@
      (vec_select ty y z x))


-;;;; Rules for `vselect` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-;; Vector select.
-(rule (lower (has_type (ty_vec128 ty) (vselect x y z)))
-      (vec_select ty y z x))
-
-
 ;;;; Rules for `bmask` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

 (rule (lower (has_type ty (bmask x)))
--- a/cranelift/codegen/src/isa/x64/lower.isle
+++ b/cranelift/codegen/src/isa/x64/lower.isle
@@ -1193,7 +1193,7 @@
        (sse_or ty b a)))

 ;; If every byte of the condition is guaranteed to be all ones or all zeroes,
-;; we can use x86_blend like vselect does.
+;; we can use x64_blend.
 (rule 1 (lower (has_type ty @ (multi_lane _bits _lanes)
                         (bitselect condition
                                    if_true
@@ -1226,15 +1226,6 @@
                       (x86_blendv condition if_true if_false)))
      (x64_blendvpd if_false if_true condition))

-;;;; Rules for `vselect` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-(rule (lower (has_type ty @ (multi_lane _bits _lanes)
-                       (vselect condition if_true if_false)))
-      (x64_blend ty
-                 condition
-                 if_true
-                 if_false))
-
 ;;;; Rules for `insertlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

 (rule (lower (insertlane vec @ (value_type ty) val (u8_from_uimm8 idx)))
--- a/cranelift/codegen/src/nan_canonicalization.rs
+++ b/cranelift/codegen/src/nan_canonicalization.rs
@@ -7,6 +7,7 @@ use crate::ir::condcodes::FloatCC;
 use crate::ir::immediates::{Ieee32, Ieee64};
 use crate::ir::types;
 use crate::ir::{Function, Inst, InstBuilder, InstructionData, Opcode, Value};
+use crate::opts::MemFlags;
 use crate::timing;

 // Canonical 32-bit and 64-bit NaN values.
@@ -70,9 +71,10 @@ fn add_nan_canon_seq(pos: &mut FuncCursor, inst: Inst) {
            .select(is_nan, canon_nan, new_res);
    };
    let vector_select = |pos: &mut FuncCursor, canon_nan: Value| {
+        let is_nan = pos.ins().bitcast(val_type, MemFlags::new(), is_nan);
        pos.ins()
            .with_result(val)
-            .vselect(is_nan, canon_nan, new_res);
+            .bitselect(is_nan, canon_nan, new_res);
    };

    match val_type {
--- a/cranelift/codegen/src/opts/algebraic.isle
+++ b/cranelift/codegen/src/opts/algebraic.isle
@@ -454,56 +454,56 @@
       (select ty (icmp _ (IntCC.UnsignedGreaterThanOrEqual) x y) y x))
      (umin ty x y))

-;; Transform vselect-of-icmp into {u,s}{min,max} instructions where possible.
+;; Transform bitselect-of-icmp into {u,s}{min,max} instructions where possible.
 (rule (simplify
-       (vselect ty (icmp _ (IntCC.SignedGreaterThan) x y) x y))
+       (bitselect ty (icmp _ (IntCC.SignedGreaterThan) x y) x y))
      (smax ty x y))
 (rule (simplify
-       (vselect ty (icmp _ (IntCC.SignedGreaterThanOrEqual) x y) x y))
+       (bitselect ty (icmp _ (IntCC.SignedGreaterThanOrEqual) x y) x y))
      (smax ty x y))
 (rule (simplify
-       (vselect ty (icmp _ (IntCC.UnsignedGreaterThan) x y) x y))
+       (bitselect ty (icmp _ (IntCC.UnsignedGreaterThan) x y) x y))
      (umax ty x y))
 (rule (simplify
-       (vselect ty (icmp _ (IntCC.UnsignedGreaterThanOrEqual) x y) x y))
+       (bitselect ty (icmp _ (IntCC.UnsignedGreaterThanOrEqual) x y) x y))
      (umax ty x y))
 (rule (simplify
-       (vselect ty (icmp _ (IntCC.SignedLessThan) x y) x y))
+       (bitselect ty (icmp _ (IntCC.SignedLessThan) x y) x y))
      (smin ty x y))
 (rule (simplify
-       (vselect ty (icmp _ (IntCC.SignedLessThanOrEqual) x y) x y))
+       (bitselect ty (icmp _ (IntCC.SignedLessThanOrEqual) x y) x y))
      (smin ty x y))
 (rule (simplify
-       (vselect ty (icmp _ (IntCC.UnsignedLessThan) x y) x y))
+       (bitselect ty (icmp _ (IntCC.UnsignedLessThan) x y) x y))
      (umin ty x y))
 (rule (simplify
-       (vselect ty (icmp _ (IntCC.UnsignedLessThanOrEqual) x y) x y))
+       (bitselect ty (icmp _ (IntCC.UnsignedLessThanOrEqual) x y) x y))
      (umin ty x y))

 ;; These are the same rules as above, but when the operands for select are swapped
 (rule (simplify
-       (vselect ty (icmp _ (IntCC.SignedLessThan) x y) y x))
+       (bitselect ty (icmp _ (IntCC.SignedLessThan) x y) y x))
      (smax ty x y))
 (rule (simplify
-       (vselect ty (icmp _ (IntCC.SignedLessThanOrEqual) x y) y x))
+       (bitselect ty (icmp _ (IntCC.SignedLessThanOrEqual) x y) y x))
      (smax ty x y))
 (rule (simplify
-       (vselect ty (icmp _ (IntCC.UnsignedLessThan) x y) y x))
+       (bitselect ty (icmp _ (IntCC.UnsignedLessThan) x y) y x))
      (umax ty x y))
 (rule (simplify
-       (vselect ty (icmp _ (IntCC.UnsignedLessThanOrEqual) x y) y x))
+       (bitselect ty (icmp _ (IntCC.UnsignedLessThanOrEqual) x y) y x))
      (umax ty x y))
 (rule (simplify
-       (vselect ty (icmp _ (IntCC.SignedGreaterThan) x y) y x))
+       (bitselect ty (icmp _ (IntCC.SignedGreaterThan) x y) y x))
      (smin ty x y))
 (rule (simplify
-       (vselect ty (icmp _ (IntCC.SignedGreaterThanOrEqual) x y) y x))
+       (bitselect ty (icmp _ (IntCC.SignedGreaterThanOrEqual) x y) y x))
      (smin ty x y))
 (rule (simplify
-       (vselect ty (icmp _ (IntCC.UnsignedGreaterThan) x y) y x))
+       (bitselect ty (icmp _ (IntCC.UnsignedGreaterThan) x y) y x))
      (umin ty x y))
 (rule (simplify
-       (vselect ty (icmp _ (IntCC.UnsignedGreaterThanOrEqual) x y) y x))
+       (bitselect ty (icmp _ (IntCC.UnsignedGreaterThanOrEqual) x y) y x))
      (umin ty x y))

 ;; For floats convert fcmp lt into pseudo_min and gt into pseudo_max
@@ -520,13 +520,9 @@
       (select ty (fcmp _ (FloatCC.GreaterThan) x y) x y))
      (fmax_pseudo ty x y))

-;; Do the same for vectors
-(rule (simplify
-       (vselect ty (fcmp _ (FloatCC.LessThan) x y) x y))
-      (fmin_pseudo ty x y))
-(rule (simplify
-       (vselect ty (fcmp _ (FloatCC.GreaterThan) x y) x y))
-      (fmax_pseudo ty x y))
+;; TODO: perform this same optimization to `f{min,max}_pseudo` for vectors
+;; with the `bitselect` instruction, but the pattern is a bit more complicated
+;; due to most bitselects-over-floats having bitcasts.

 ;; If both of the multiplied arguments to an `fma` are negated then remove
 ;; both of them since they cancel out.