diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle index e6dceabff1..f08d6ad5a7 100644 --- a/cranelift/codegen/src/isa/x64/lower.isle +++ b/cranelift/codegen/src/isa/x64/lower.isle @@ -1255,6 +1255,26 @@ (b Xmm (sse_and_not ty cond_xmm if_false))) (sse_or ty b a))) +;; If every byte of the condition is guaranteed to be all ones or all zeroes, +;; we can use x64_blend like vselect does. +(rule 1 (lower (has_type ty @ (multi_lane _bits _lanes) + (bitselect condition + if_true + if_false))) + (if (all_ones_or_all_zeros condition)) + (x64_blend ty + condition + if_true + if_false)) + +(decl pure all_ones_or_all_zeros (Value) bool) +(rule (all_ones_or_all_zeros (and (icmp _ _ _) (value_type (multi_lane _ _)))) $true) +(rule (all_ones_or_all_zeros (and (fcmp _ _ _) (value_type (multi_lane _ _)))) $true) +(rule (all_ones_or_all_zeros (vconst (vconst_all_ones_or_all_zeros))) $true) + +(decl pure vconst_all_ones_or_all_zeros () Constant) +(extern extractor vconst_all_ones_or_all_zeros vconst_all_ones_or_all_zeros) + ;;;; Rules for `vselect` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (has_type ty @ (multi_lane _bits _lanes) diff --git a/cranelift/codegen/src/isa/x64/lower/isle.rs b/cranelift/codegen/src/isa/x64/lower/isle.rs index 6258fb6d03..17776d289e 100644 --- a/cranelift/codegen/src/isa/x64/lower/isle.rs +++ b/cranelift/codegen/src/isa/x64/lower/isle.rs @@ -713,6 +713,15 @@ impl Context for IsleContext<'_, '_, MInst, Flags, IsaFlags, 6> { targets.len() as u32 } + #[inline] + fn vconst_all_ones_or_all_zeros(&mut self, constant: Constant) -> Option<()> { + let const_data = self.lower_ctx.get_constant_data(constant); + if const_data.iter().all(|&b| b == 0 || b == 0xFF) { + return Some(()); + } + None + } + #[inline] fn fcvt_uint_mask_const(&mut self) -> VCodeConstant { self.lower_ctx diff --git a/cranelift/codegen/src/simple_preopt.rs b/cranelift/codegen/src/simple_preopt.rs index
d107f1554c..08c3660213 100644 --- a/cranelift/codegen/src/simple_preopt.rs +++ b/cranelift/codegen/src/simple_preopt.rs @@ -826,67 +826,6 @@ mod simplify { } } - InstructionData::Ternary { - opcode: Opcode::Bitselect, - args, - } => { - let old_cond_type = pos.func.dfg.value_type(args[0]); - if !old_cond_type.is_vector() { - return; - } - - // Replace bitselect with vselect if each lane of controlling mask is either - // all ones or all zeroes; on x86 bitselect is encoded using 3 instructions, - // while vselect can be encoded using single BLEND instruction. - if let ValueDef::Result(def_inst, _) = pos.func.dfg.value_def(args[0]) { - let (cond_val, cond_type) = match pos.func.dfg[def_inst] { - InstructionData::IntCompare { .. } - | InstructionData::FloatCompare { .. } => { - // If the controlled mask is from a comparison, the value will be all - // zeros or ones in each output lane. - let arg = args[0]; - let arg_type = pos.func.dfg.value_type(arg); - if !arg_type.is_vector() { - return; - } - (arg, arg_type) - } - InstructionData::UnaryConst { - opcode: Opcode::Vconst, - constant_handle, - } => { - // If each byte of controlling mask is 0x00 or 0xFF then - // we will always bitcast our way to vselect(I8x16, I8x16). - // Bitselect operates at bit level, so the lane types don't matter. - let const_data = pos.func.dfg.constants.get(constant_handle); - if !const_data.iter().all(|&b| b == 0 || b == 0xFF) { - return; - } - let new_type = I8.by(old_cond_type.bytes()).unwrap(); - (pos.ins().bitcast(new_type, args[0]), new_type) - } - _ => return, - }; - - let lane_type = Type::int(cond_type.lane_bits() as u16).unwrap(); - let arg_type = lane_type.by(cond_type.lane_count()).unwrap(); - let old_arg_type = pos.func.dfg.value_type(args[1]); - - if arg_type != old_arg_type { - // Operands types must match, we need to add bitcasts. 
- let arg1 = pos.ins().bitcast(arg_type, args[1]); - let arg2 = pos.ins().bitcast(arg_type, args[2]); - let ret = pos.ins().vselect(cond_val, arg1, arg2); - pos.func.dfg.replace(inst).bitcast(old_arg_type, ret); - } else { - pos.func - .dfg - .replace(inst) - .vselect(cond_val, args[1], args[2]); - } - } - } - _ => {} } } diff --git a/cranelift/filetests/filetests/isa/x64/simd-bitselect.clif b/cranelift/filetests/filetests/isa/x64/simd-bitselect.clif new file mode 100644 index 0000000000..edcc1f2771 --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/simd-bitselect.clif @@ -0,0 +1,123 @@ +test compile precise-output +set enable_simd +target x86_64 skylake + +function %mask_from_icmp(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = icmp eq v0, v1 + v3 = bitselect v2, v0, v1 + return v3 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movdqa %xmm0, %xmm5 +; pcmpeqb %xmm5, %xmm1, %xmm5 +; movdqa %xmm0, %xmm8 +; movdqa %xmm5, %xmm0 +; movdqa %xmm1, %xmm6 +; pblendvb %xmm6, %xmm8, %xmm6 +; movdqa %xmm6, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret + +function %mask_from_fcmp(f32x4, f32x4, i32x4, i32x4) -> i32x4 { +block0(v0: f32x4, v1: f32x4, v2: i32x4, v3: i32x4): + v4 = fcmp eq v0, v1 + v5 = bitselect v4, v2, v3 + return v5 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; cmpps $0, %xmm0, %xmm1, %xmm0 +; movdqa %xmm3, %xmm8 +; pblendvb %xmm8, %xmm2, %xmm8 +; movdqa %xmm8, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret + +function %mask_casted(i8x16, i8x16, i32x4) -> i8x16 { +block0(v0: i8x16, v1: i8x16, v2: i32x4): + v3 = bitcast.i8x16 v2 + v4 = bitselect v3, v0, v1 + return v4 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movdqa %xmm0, %xmm5 +; pand %xmm5, %xmm2, %xmm5 +; movdqa %xmm2, %xmm0 +; pandn %xmm0, %xmm1, %xmm0 +; por %xmm0, %xmm5, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret + +function %good_const_mask_i8x16(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v3 = vconst.i8x16 [0 0 0xFF 0 0 0xFF 0 0 0 0 0xFF 0 0 0 0 
0xFF] + v4 = bitselect v3, v0, v1 + return v4 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movdqa %xmm0, %xmm6 +; movdqu const(0), %xmm0 +; movdqa %xmm6, %xmm8 +; movdqa %xmm1, %xmm6 +; pblendvb %xmm6, %xmm8, %xmm6 +; movdqa %xmm6, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret + +function %good_const_mask_i16x8(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v3 = vconst.i16x8 [0x0000 0xFF00 0x0000 0x00FF 0x0000 0xFFFF 0x00FF 0xFFFF] + v4 = bitselect v3, v0, v1 + return v4 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movdqa %xmm0, %xmm6 +; movdqu const(0), %xmm0 +; movdqa %xmm6, %xmm8 +; movdqa %xmm1, %xmm6 +; pblendvb %xmm6, %xmm8, %xmm6 +; movdqa %xmm6, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret + +function %bad_const_mask(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v3 = vconst.i8x16 [0 0 0xF0 0 0 0xFF 0 0 0 0 0xFF 0 0 0 0 0xFF] + v4 = bitselect v3, v0, v1 + return v4 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movdqu const(0), %xmm6 +; movdqa %xmm6, %xmm9 +; movdqa %xmm0, %xmm5 +; pand %xmm5, %xmm9, %xmm5 +; movdqa %xmm9, %xmm0 +; pandn %xmm0, %xmm1, %xmm0 +; por %xmm0, %xmm5, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret + diff --git a/cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif b/cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif index 231c2fc9e4..c35942613a 100644 --- a/cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif +++ b/cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif @@ -128,28 +128,6 @@ block0(v0: i32x4, v1: i32x4): ; popq %rbp ; ret -function %bitselect_i16x8() -> i16x8 { -block0: - v0 = vconst.i16x8 [0 0 0 0 0 0 0 0] - v1 = vconst.i16x8 [0 0 0 0 0 0 0 0] - v2 = vconst.i16x8 [0 0 0 0 0 0 0 0] - v3 = bitselect v0, v1, v2 - return v3 -} - -; pushq %rbp -; movq %rsp, %rbp -; block0: -; movdqu const(0), %xmm0 -; movdqu const(0), %xmm2 -; movdqu const(0), %xmm6 -; pand %xmm2, %xmm0, %xmm2 -; pandn %xmm0, %xmm6, %xmm0 -; por %xmm0, %xmm2, %xmm0 -; movq 
%rbp, %rsp -; popq %rbp -; ret - function %vselect_i16x8(i16x8, i16x8, i16x8) -> i16x8 { block0(v0: i16x8, v1: i16x8, v2: i16x8): v3 = vselect v0, v1, v2 diff --git a/cranelift/filetests/filetests/simple_preopt/bitselect.clif b/cranelift/filetests/filetests/simple_preopt/bitselect.clif deleted file mode 100644 index b7ba46f5c0..0000000000 --- a/cranelift/filetests/filetests/simple_preopt/bitselect.clif +++ /dev/null @@ -1,52 +0,0 @@ -test simple_preopt -target aarch64 -target x86_64 - -;; Test replacement of bitselect with vselect for special masks - -function %mask_from_icmp(i8x16, i8x16) -> i8x16 { -block0(v0: i8x16, v1: i8x16): - v2 = icmp eq v0, v1 - v3 = bitselect v2, v0, v1 - ; check: v3 = vselect v2, v0, v1 - return v3 -} - -;; We can't guarantee that the i32x4 has all ones or zeros in each lane, so we -;; can't remove the bitselect in this case. -function %mask_casted(i8x16, i8x16, i32x4) -> i8x16 { -block0(v0: i8x16, v1: i8x16, v2: i32x4): - v3 = bitcast.i8x16 v2 - v4 = bitselect v3, v0, v1 - ; check: v4 = bitselect v3, v0, v1 - return v4 -} - -function %good_const_mask_i8x16(i8x16, i8x16) -> i8x16 { -block0(v0: i8x16, v1: i8x16): - v3 = vconst.i8x16 [0 0 0xFF 0 0 0xFF 0 0 0 0 0xFF 0 0 0 0 0xFF] - v4 = bitselect v3, v0, v1 - ; check: v5 = bitcast.i8x16 v3 - ; nextln: v4 = vselect v5, v0, v1 - return v4 -} - -function %good_const_mask_i16x8(i16x8, i16x8) -> i16x8 { -block0(v0: i16x8, v1: i16x8): - v3 = vconst.i16x8 [0x0000 0xFF00 0x0000 0x00FF 0x0000 0xFFFF 0x00FF 0xFFFF] - v4 = bitselect v3, v0, v1 - ; check: v5 = bitcast.i8x16 v3 - ; nextln: v6 = bitcast.i8x16 v0 - ; nextln: v7 = bitcast.i8x16 v1 - ; nextln: v8 = vselect v5, v6, v7 - ; nextln: v4 = bitcast.i16x8 v8 - return v4 -} - -function %bad_const_mask(i8x16, i8x16) -> i8x16 { -block0(v0: i8x16, v1: i8x16): - v3 = vconst.i8x16 [0 0 0xF0 0 0 0xFF 0 0 0 0 0xFF 0 0 0 0 0xFF] - v4 = bitselect v3, v0, v1 - ; check: v4 = bitselect v3, v0, v1 - return v4 -}