From 137a8b710fcf095b214b15d6c323eb1596ddc87c Mon Sep 17 00:00:00 2001
From: Ulrich Weigand
Date: Thu, 3 Nov 2022 21:17:36 +0100
Subject: [PATCH] Move bitselect->vselect optimization to x64 back-end (#5191)

The simplifier was performing an optimization to replace bitselect with
vselect if all bytes of the condition mask could be shown to be all ones
or all zeros.

This optimization only ever made any difference in codegen on the x64
target. Therefore, move this optimization to the x64 back-end and
perform it in ISLE instead. Resulting codegen should be unchanged, with
slightly improved compile time.

This also eliminates a few endian-dependent bitcast operations.
---
 cranelift/codegen/src/isa/x64/lower.isle      |  20 +++
 cranelift/codegen/src/isa/x64/lower/isle.rs   |   9 ++
 cranelift/codegen/src/simple_preopt.rs        |  61 ---------
 .../filetests/isa/x64/simd-bitselect.clif     | 123 ++++++++++++++++++
 .../isa/x64/simd-bitwise-compile.clif         |  22 ----
 .../filetests/simple_preopt/bitselect.clif    |  52 --------
 6 files changed, 152 insertions(+), 135 deletions(-)
 create mode 100644 cranelift/filetests/filetests/isa/x64/simd-bitselect.clif
 delete mode 100644 cranelift/filetests/filetests/simple_preopt/bitselect.clif

diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle
index e6dceabff1..f08d6ad5a7 100644
--- a/cranelift/codegen/src/isa/x64/lower.isle
+++ b/cranelift/codegen/src/isa/x64/lower.isle
@@ -1255,6 +1255,26 @@
             (b Xmm (sse_and_not ty cond_xmm if_false)))
         (sse_or ty b a)))
 
+;; If every byte of the condition is guaranteed to be all ones or all zeroes,
+;; we can use x64_blend like vselect does.
+(rule 1 (lower (has_type ty @ (multi_lane _bits _lanes)
+                         (bitselect condition
+                                    if_true
+                                    if_false)))
+      (if (all_ones_or_all_zeros condition))
+      (x64_blend ty
+                 condition
+                 if_true
+                 if_false))
+
+(decl pure all_ones_or_all_zeros (Value) bool)
+(rule (all_ones_or_all_zeros (and (icmp _ _ _) (value_type (multi_lane _ _)))) $true)
+(rule (all_ones_or_all_zeros (and (fcmp _ _ _) (value_type (multi_lane _ _)))) $true)
+(rule (all_ones_or_all_zeros (vconst (vconst_all_ones_or_all_zeros))) $true)
+
+(decl pure vconst_all_ones_or_all_zeros () Constant)
+(extern extractor vconst_all_ones_or_all_zeros vconst_all_ones_or_all_zeros)
+
 ;;;; Rules for `vselect` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (rule (lower (has_type ty @ (multi_lane _bits _lanes)
diff --git a/cranelift/codegen/src/isa/x64/lower/isle.rs b/cranelift/codegen/src/isa/x64/lower/isle.rs
index 6258fb6d03..17776d289e 100644
--- a/cranelift/codegen/src/isa/x64/lower/isle.rs
+++ b/cranelift/codegen/src/isa/x64/lower/isle.rs
@@ -713,6 +713,15 @@ impl Context for IsleContext<'_, '_, MInst, Flags, IsaFlags, 6> {
         targets.len() as u32
     }
 
+    #[inline]
+    fn vconst_all_ones_or_all_zeros(&mut self, constant: Constant) -> Option<()> {
+        let const_data = self.lower_ctx.get_constant_data(constant);
+        if const_data.iter().all(|&b| b == 0 || b == 0xFF) {
+            return Some(());
+        }
+        None
+    }
+
     #[inline]
     fn fcvt_uint_mask_const(&mut self) -> VCodeConstant {
         self.lower_ctx
diff --git a/cranelift/codegen/src/simple_preopt.rs b/cranelift/codegen/src/simple_preopt.rs
index d107f1554c..08c3660213 100644
--- a/cranelift/codegen/src/simple_preopt.rs
+++ b/cranelift/codegen/src/simple_preopt.rs
@@ -826,67 +826,6 @@ mod simplify {
                 }
             }
 
-            InstructionData::Ternary {
-                opcode: Opcode::Bitselect,
-                args,
-            } => {
-                let old_cond_type = pos.func.dfg.value_type(args[0]);
-                if !old_cond_type.is_vector() {
-                    return;
-                }
-
-                // Replace bitselect with vselect if each lane of controlling mask is either
-                // all ones or all zeroes; on x86 bitselect is encoded using 3 instructions,
-                // while vselect can be encoded using single BLEND instruction.
-                if let ValueDef::Result(def_inst, _) = pos.func.dfg.value_def(args[0]) {
-                    let (cond_val, cond_type) = match pos.func.dfg[def_inst] {
-                        InstructionData::IntCompare { .. }
-                        | InstructionData::FloatCompare { .. } => {
-                            // If the controlled mask is from a comparison, the value will be all
-                            // zeros or ones in each output lane.
-                            let arg = args[0];
-                            let arg_type = pos.func.dfg.value_type(arg);
-                            if !arg_type.is_vector() {
-                                return;
-                            }
-                            (arg, arg_type)
-                        }
-                        InstructionData::UnaryConst {
-                            opcode: Opcode::Vconst,
-                            constant_handle,
-                        } => {
-                            // If each byte of controlling mask is 0x00 or 0xFF then
-                            // we will always bitcast our way to vselect(I8x16, I8x16).
-                            // Bitselect operates at bit level, so the lane types don't matter.
-                            let const_data = pos.func.dfg.constants.get(constant_handle);
-                            if !const_data.iter().all(|&b| b == 0 || b == 0xFF) {
-                                return;
-                            }
-                            let new_type = I8.by(old_cond_type.bytes()).unwrap();
-                            (pos.ins().bitcast(new_type, args[0]), new_type)
-                        }
-                        _ => return,
-                    };
-
-                    let lane_type = Type::int(cond_type.lane_bits() as u16).unwrap();
-                    let arg_type = lane_type.by(cond_type.lane_count()).unwrap();
-                    let old_arg_type = pos.func.dfg.value_type(args[1]);
-
-                    if arg_type != old_arg_type {
-                        // Operands types must match, we need to add bitcasts.
-                        let arg1 = pos.ins().bitcast(arg_type, args[1]);
-                        let arg2 = pos.ins().bitcast(arg_type, args[2]);
-                        let ret = pos.ins().vselect(cond_val, arg1, arg2);
-                        pos.func.dfg.replace(inst).bitcast(old_arg_type, ret);
-                    } else {
-                        pos.func
-                            .dfg
-                            .replace(inst)
-                            .vselect(cond_val, args[1], args[2]);
-                    }
-                }
-            }
-
             _ => {}
         }
     }
diff --git a/cranelift/filetests/filetests/isa/x64/simd-bitselect.clif b/cranelift/filetests/filetests/isa/x64/simd-bitselect.clif
new file mode 100644
index 0000000000..edcc1f2771
--- /dev/null
+++ b/cranelift/filetests/filetests/isa/x64/simd-bitselect.clif
@@ -0,0 +1,123 @@
+test compile precise-output
+set enable_simd
+target x86_64 skylake
+
+function %mask_from_icmp(i8x16, i8x16) -> i8x16 {
+block0(v0: i8x16, v1: i8x16):
+    v2 = icmp eq v0, v1
+    v3 = bitselect v2, v0, v1
+    return v3
+}
+
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; movdqa %xmm0, %xmm5
+; pcmpeqb %xmm5, %xmm1, %xmm5
+; movdqa %xmm0, %xmm8
+; movdqa %xmm5, %xmm0
+; movdqa %xmm1, %xmm6
+; pblendvb %xmm6, %xmm8, %xmm6
+; movdqa %xmm6, %xmm0
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+
+function %mask_from_fcmp(f32x4, f32x4, i32x4, i32x4) -> i32x4 {
+block0(v0: f32x4, v1: f32x4, v2: i32x4, v3: i32x4):
+    v4 = fcmp eq v0, v1
+    v5 = bitselect v4, v2, v3
+    return v5
+}
+
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; cmpps $0, %xmm0, %xmm1, %xmm0
+; movdqa %xmm3, %xmm8
+; pblendvb %xmm8, %xmm2, %xmm8
+; movdqa %xmm8, %xmm0
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+
+function %mask_casted(i8x16, i8x16, i32x4) -> i8x16 {
+block0(v0: i8x16, v1: i8x16, v2: i32x4):
+    v3 = bitcast.i8x16 v2
+    v4 = bitselect v3, v0, v1
+    return v4
+}
+
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; movdqa %xmm0, %xmm5
+; pand %xmm5, %xmm2, %xmm5
+; movdqa %xmm2, %xmm0
+; pandn %xmm0, %xmm1, %xmm0
+; por %xmm0, %xmm5, %xmm0
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+
+function %good_const_mask_i8x16(i8x16, i8x16) -> i8x16 {
+block0(v0: i8x16, v1: i8x16):
+    v3 = vconst.i8x16 [0 0 0xFF 0 0 0xFF 0 0 0 0 0xFF 0 0 0 0 0xFF]
+    v4 = bitselect v3, v0, v1
+    return v4
+}
+
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; movdqa %xmm0, %xmm6
+; movdqu const(0), %xmm0
+; movdqa %xmm6, %xmm8
+; movdqa %xmm1, %xmm6
+; pblendvb %xmm6, %xmm8, %xmm6
+; movdqa %xmm6, %xmm0
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+
+function %good_const_mask_i16x8(i16x8, i16x8) -> i16x8 {
+block0(v0: i16x8, v1: i16x8):
+    v3 = vconst.i16x8 [0x0000 0xFF00 0x0000 0x00FF 0x0000 0xFFFF 0x00FF 0xFFFF]
+    v4 = bitselect v3, v0, v1
+    return v4
+}
+
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; movdqa %xmm0, %xmm6
+; movdqu const(0), %xmm0
+; movdqa %xmm6, %xmm8
+; movdqa %xmm1, %xmm6
+; pblendvb %xmm6, %xmm8, %xmm6
+; movdqa %xmm6, %xmm0
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+
+function %bad_const_mask(i8x16, i8x16) -> i8x16 {
+block0(v0: i8x16, v1: i8x16):
+    v3 = vconst.i8x16 [0 0 0xF0 0 0 0xFF 0 0 0 0 0xFF 0 0 0 0 0xFF]
+    v4 = bitselect v3, v0, v1
+    return v4
+}
+
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; movdqu const(0), %xmm6
+; movdqa %xmm6, %xmm9
+; movdqa %xmm0, %xmm5
+; pand %xmm5, %xmm9, %xmm5
+; movdqa %xmm9, %xmm0
+; pandn %xmm0, %xmm1, %xmm0
+; por %xmm0, %xmm5, %xmm0
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+
diff --git a/cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif b/cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif
index 231c2fc9e4..c35942613a 100644
--- a/cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif
+++ b/cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif
@@ -128,28 +128,6 @@ block0(v0: i32x4, v1: i32x4):
 ; popq %rbp
 ; ret
 
-function %bitselect_i16x8() -> i16x8 {
-block0:
-    v0 = vconst.i16x8 [0 0 0 0 0 0 0 0]
-    v1 = vconst.i16x8 [0 0 0 0 0 0 0 0]
-    v2 = vconst.i16x8 [0 0 0 0 0 0 0 0]
-    v3 = bitselect v0, v1, v2
-    return v3
-}
-
-; pushq %rbp
-; movq %rsp, %rbp
-; block0:
-; movdqu const(0), %xmm0
-; movdqu const(0), %xmm2
-; movdqu const(0), %xmm6
-; pand %xmm2, %xmm0, %xmm2
-; pandn %xmm0, %xmm6, %xmm0
-; por %xmm0, %xmm2, %xmm0
-; movq %rbp, %rsp
-; popq %rbp
-; ret
-
 function %vselect_i16x8(i16x8, i16x8, i16x8) -> i16x8 {
 block0(v0: i16x8, v1: i16x8, v2: i16x8):
     v3 = vselect v0, v1, v2
diff --git a/cranelift/filetests/filetests/simple_preopt/bitselect.clif b/cranelift/filetests/filetests/simple_preopt/bitselect.clif
deleted file mode 100644
index b7ba46f5c0..0000000000
--- a/cranelift/filetests/filetests/simple_preopt/bitselect.clif
+++ /dev/null
@@ -1,52 +0,0 @@
-test simple_preopt
-target aarch64
-target x86_64
-
-;; Test replacement of bitselect with vselect for special masks
-
-function %mask_from_icmp(i8x16, i8x16) -> i8x16 {
-block0(v0: i8x16, v1: i8x16):
-    v2 = icmp eq v0, v1
-    v3 = bitselect v2, v0, v1
-    ; check: v3 = vselect v2, v0, v1
-    return v3
-}
-
-;; We can't guarantee that the i32x4 has all ones or zeros in each lane, so we
-;; can't remove the bitselect in this case.
-function %mask_casted(i8x16, i8x16, i32x4) -> i8x16 {
-block0(v0: i8x16, v1: i8x16, v2: i32x4):
-    v3 = bitcast.i8x16 v2
-    v4 = bitselect v3, v0, v1
-    ; check: v4 = bitselect v3, v0, v1
-    return v4
-}
-
-function %good_const_mask_i8x16(i8x16, i8x16) -> i8x16 {
-block0(v0: i8x16, v1: i8x16):
-    v3 = vconst.i8x16 [0 0 0xFF 0 0 0xFF 0 0 0 0 0xFF 0 0 0 0 0xFF]
-    v4 = bitselect v3, v0, v1
-    ; check: v5 = bitcast.i8x16 v3
-    ; nextln: v4 = vselect v5, v0, v1
-    return v4
-}
-
-function %good_const_mask_i16x8(i16x8, i16x8) -> i16x8 {
-block0(v0: i16x8, v1: i16x8):
-    v3 = vconst.i16x8 [0x0000 0xFF00 0x0000 0x00FF 0x0000 0xFFFF 0x00FF 0xFFFF]
-    v4 = bitselect v3, v0, v1
-    ; check: v5 = bitcast.i8x16 v3
-    ; nextln: v6 = bitcast.i8x16 v0
-    ; nextln: v7 = bitcast.i8x16 v1
-    ; nextln: v8 = vselect v5, v6, v7
-    ; nextln: v4 = bitcast.i16x8 v8
-    return v4
-}
-
-function %bad_const_mask(i8x16, i8x16) -> i8x16 {
-block0(v0: i8x16, v1: i8x16):
-    v3 = vconst.i8x16 [0 0 0xF0 0 0 0xFF 0 0 0 0 0xFF 0 0 0 0 0xFF]
-    v4 = bitselect v3, v0, v1
-    ; check: v4 = bitselect v3, v0, v1
-    return v4
-}
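-- 

In short: a bitselect whose condition is known to be a lane mask (a vector
icmp/fcmp result, or a vconst whose bytes are each 0x00 or 0xFF) now lowers
on x64 to a single pblendvb-style blend instead of the generic
pand/pandn/por sequence. A minimal CLIF sketch of the shape that triggers
the new rule, mirroring the simd-bitselect.clif test above:

    v2 = icmp eq v0, v1        ; every lane of v2 is all ones or all zeros
    v3 = bitselect v2, v0, v1  ; matched by the rule; lowers via x64_blend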