Improve bitselect codegen with knowledge of operand origin (#1783)

* Encode vselect using BLEND instructions on x86 * Legalize vselect to bitselect * Optimize bitselect to vselect for some operands * Add run tests for bitselect-vselect optimization * Address review feedback
2020-05-30 04:53:11 +02:00
parent 16afca4451
commit e430984ac4
11 changed files with 341 additions and 1 deletions
--- a/cranelift/codegen/src/isa/x86/enc_tables.rs
+++ b/cranelift/codegen/src/isa/x86/enc_tables.rs
@@ -246,6 +246,20 @@ fn size_with_inferred_rex_for_inreg0_inreg1(
    sizing.base_size + if needs_rex { 1 } else { 0 }
 }

+/// Infers whether a dynamic REX prefix will be emitted, based on second and third operand.
+fn size_with_inferred_rex_for_inreg1_inreg2(
+    sizing: &RecipeSizing,
+    _enc: Encoding,
+    inst: Inst,
+    divert: &RegDiversions,
+    func: &Function,
+) -> u8 {
+    // No need to check for REX.W in `needs_rex` because `infer_rex().w()` is not allowed.
+    let needs_rex = test_input(1, inst, divert, func, is_extended_reg)
+        || test_input(2, inst, divert, func, is_extended_reg);
+    sizing.base_size + if needs_rex { 1 } else { 0 }
+}
+
 /// Infers whether a dynamic REX prefix will be emitted, based on a single
 /// input register and a single output register.
 fn size_with_inferred_rex_for_inreg0_outreg0(
--- a/cranelift/codegen/src/simple_preopt.rs
+++ b/cranelift/codegen/src/simple_preopt.rs
@@ -656,7 +656,7 @@ mod simplify {
        dfg::ValueDef,
        immediates,
        instructions::{Opcode, ValueList},
-        types::{I16, I32, I8},
+        types::{B8, I16, I32, I8},
    };
    use std::marker::PhantomData;

@@ -935,6 +935,69 @@ mod simplify {
                }
            }

+            InstructionData::Ternary {
+                opcode: Opcode::Bitselect,
+                args,
+            } => {
+                let old_cond_type = pos.func.dfg.value_type(args[0]);
+                if !old_cond_type.is_vector() {
+                    return;
+                }
+
+                // Replace bitselect with vselect if each lane of controlling mask is either
+                // all ones or all zeroes; on x86 bitselect is encoded using 3 instructions,
+                // while vselect can be encoded using single BLEND instruction.
+                if let ValueDef::Result(def_inst, _) = pos.func.dfg.value_def(args[0]) {
+                    let (cond_val, cond_type) = match pos.func.dfg[def_inst] {
+                        InstructionData::Unary {
+                            opcode: Opcode::RawBitcast,
+                            arg,
+                        } => {
+                            // If controlling mask is raw-bitcasted boolean vector then
+                            // we know each lane is either all zeroes or ones,
+                            // so we can use vselect instruction instead.
+                            let arg_type = pos.func.dfg.value_type(arg);
+                            if !arg_type.is_vector() || !arg_type.lane_type().is_bool() {
+                                return;
+                            }
+                            (arg, arg_type)
+                        }
+                        InstructionData::UnaryConst {
+                            opcode: Opcode::Vconst,
+                            constant_handle,
+                        } => {
+                            // If each byte of controlling mask is 0x00 or 0xFF then
+                            // we will always bitcast our way to vselect(B8x16, I8x16, I8x16).
+                            // Bitselect operates at bit level, so the lane types don't matter.
+                            let const_data = pos.func.dfg.constants.get(constant_handle);
+                            if !const_data.iter().all(|&b| b == 0 || b == 0xFF) {
+                                return;
+                            }
+                            let new_type = B8.by(old_cond_type.bytes() as u16).unwrap();
+                            (pos.ins().raw_bitcast(new_type, args[0]), new_type)
+                        }
+                        _ => return,
+                    };
+
+                    let lane_type = Type::int(cond_type.lane_bits() as u16).unwrap();
+                    let arg_type = lane_type.by(cond_type.lane_count()).unwrap();
+                    let old_arg_type = pos.func.dfg.value_type(args[1]);
+
+                    if arg_type != old_arg_type {
+                        // Operands types must match, we need to add bitcasts.
+                        let arg1 = pos.ins().raw_bitcast(arg_type, args[1]);
+                        let arg2 = pos.ins().raw_bitcast(arg_type, args[2]);
+                        let ret = pos.ins().vselect(cond_val, arg1, arg2);
+                        pos.func.dfg.replace(inst).raw_bitcast(old_arg_type, ret);
+                    } else {
+                        pos.func
+                            .dfg
+                            .replace(inst)
+                            .vselect(cond_val, args[1], args[2]);
+                    }
+                }
+            }
+
            _ => {}
        }
    }