diff --git a/cranelift/codegen/meta/src/isa/x86/encodings.rs b/cranelift/codegen/meta/src/isa/x86/encodings.rs
index 541b22a1e2..65df907256 100644
--- a/cranelift/codegen/meta/src/isa/x86/encodings.rs
+++ b/cranelift/codegen/meta/src/isa/x86/encodings.rs
@@ -1634,6 +1634,7 @@ fn define_simd(
     let ushr_imm = shared.by_name("ushr_imm");
     let usub_sat = shared.by_name("usub_sat");
     let vconst = shared.by_name("vconst");
+    let vselect = shared.by_name("vselect");
     let x86_insertps = x86.by_name("x86_insertps");
     let x86_movlhps = x86.by_name("x86_movlhps");
     let x86_movsd = x86.by_name("x86_movsd");
@@ -1654,6 +1655,7 @@ fn define_simd(
     let x86_punpckl = x86.by_name("x86_punpckl");
 
     // Shorthands for recipes.
+    let rec_blend = r.template("blend");
     let rec_evex_reg_vvvv_rm_128 = r.template("evex_reg_vvvv_rm_128");
     let rec_f_ib = r.template("f_ib");
     let rec_fa = r.template("fa");
@@ -1723,6 +1725,20 @@ fn define_simd(
         e.enc_both_inferred(instruction, template);
     }
 
+    // SIMD vselect; the controlling value of vselect is a boolean vector, so each lane should be
+    // either all ones or all zeroes, which makes it possible to always use the 8-bit PBLENDVB;
+    // for 32/64-bit lanes we can also use BLENDVPS and BLENDVPD.
+    for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
+        let opcode = match ty.lane_bits() {
+            32 => &BLENDVPS,
+            64 => &BLENDVPD,
+            _ => &PBLENDVB,
+        };
+        let instruction = vselect.bind(vector(ty, sse_vector_size));
+        let template = rec_blend.opcodes(opcode);
+        e.enc_both_inferred_maybe_isap(instruction, template, Some(use_sse41_simd));
+    }
+
     // SIMD scalar_to_vector; this uses MOV to copy the scalar value to an XMM register; according
     // to the Intel manual: "When the destination operand is an XMM register, the source operand is
     // written to the low doubleword of the register and the register is zero-extended to 128 bits."
diff --git a/cranelift/codegen/meta/src/isa/x86/legalize.rs b/cranelift/codegen/meta/src/isa/x86/legalize.rs
index 3b073c1fa6..13da4a365a 100644
--- a/cranelift/codegen/meta/src/isa/x86/legalize.rs
+++ b/cranelift/codegen/meta/src/isa/x86/legalize.rs
@@ -378,6 +378,7 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro
     let vconst = insts.by_name("vconst");
     let vall_true = insts.by_name("vall_true");
     let vany_true = insts.by_name("vany_true");
+    let vselect = insts.by_name("vselect");
 
     let x86_packss = x86_instructions.by_name("x86_packss");
     let x86_pmaxs = x86_instructions.by_name("x86_pmaxs");
@@ -589,6 +590,17 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro
         );
     }
 
+    // SIMD vselect; replace with bitselect if BLEND* instructions are not available.
+    // This works because each lane of the boolean vector is filled with all zeroes or all ones.
+    for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
+        let vselect = vselect.bind(vector(ty, sse_vector_size));
+        let raw_bitcast = raw_bitcast.bind(vector(ty, sse_vector_size));
+        narrow.legalize(
+            def!(d = vselect(c, x, y)),
+            vec![def!(a = raw_bitcast(c)), def!(d = bitselect(a, x, y))],
+        );
+    }
+
     // SIMD vany_true
     let ne = Literal::enumerator_for(&imm.intcc, "ne");
     for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
diff --git a/cranelift/codegen/meta/src/isa/x86/opcodes.rs b/cranelift/codegen/meta/src/isa/x86/opcodes.rs
index d34761d246..74dff216e7 100644
--- a/cranelift/codegen/meta/src/isa/x86/opcodes.rs
+++ b/cranelift/codegen/meta/src/isa/x86/opcodes.rs
@@ -54,6 +54,14 @@ pub static BIT_SCAN_FORWARD: [u8; 2] = [0x0f, 0xbc];
 /// Bit scan reverse (stores index of first encountered 1 from the back).
 pub static BIT_SCAN_REVERSE: [u8; 2] = [0x0f, 0xbd];
 
+/// Select packed single-precision floating-point values from xmm1 and xmm2/m128
+/// from mask specified in XMM0 and store the values into xmm1 (SSE4.1).
+pub static BLENDVPS: [u8; 4] = [0x66, 0x0f, 0x38, 0x14];
+
+/// Select packed double-precision floating-point values from xmm1 and xmm2/m128
+/// from mask specified in XMM0 and store the values into xmm1 (SSE4.1).
+pub static BLENDVPD: [u8; 4] = [0x66, 0x0f, 0x38, 0x15];
+
 /// Call near, relative, displacement relative to next instruction (sign-extended).
 pub static CALL_RELATIVE: [u8; 1] = [0xe8];
 
@@ -335,6 +343,10 @@ pub static PAVGB: [u8; 3] = [0x66, 0x0f, 0xE0];
 /// Average packed unsigned word integers from xmm2/m128 and xmm1 with rounding (SSE2).
 pub static PAVGW: [u8; 3] = [0x66, 0x0f, 0xE3];
 
+/// Select byte values from xmm1 and xmm2/m128 from mask specified in the high bit of each byte
+/// in XMM0 and store the values into xmm1 (SSE4.1).
+pub static PBLENDVB: [u8; 4] = [0x66, 0x0f, 0x38, 0x10];
+
 /// Compare packed data for equal (SSE2).
 pub static PCMPEQB: [u8; 3] = [0x66, 0x0f, 0x74];
 
diff --git a/cranelift/codegen/meta/src/isa/x86/recipes.rs b/cranelift/codegen/meta/src/isa/x86/recipes.rs
index ef08242f32..42e45d0328 100644
--- a/cranelift/codegen/meta/src/isa/x86/recipes.rs
+++ b/cranelift/codegen/meta/src/isa/x86/recipes.rs
@@ -427,6 +427,7 @@ pub(crate) fn define<'shared>(
     let reg_rcx = Register::new(gpr, regs.regunit_by_name(gpr, "rcx"));
     let reg_rdx = Register::new(gpr, regs.regunit_by_name(gpr, "rdx"));
     let reg_r15 = Register::new(gpr, regs.regunit_by_name(gpr, "r15"));
+    let reg_xmm0 = Register::new(fpr, regs.regunit_by_name(fpr, "xmm0"));
 
     // Stack operand with a 32-bit signed displacement from either RBP or RSP.
     let stack_gpr32 = Stack::new(gpr);
@@ -904,6 +905,24 @@ pub(crate) fn define<'shared>(
             .inferred_rex_compute_size("size_with_inferred_rex_for_inreg1"),
     );
 
+    // XX /r for BLEND* instructions.
+    recipes.add_template_inferred(
+        EncodingRecipeBuilder::new("blend", &formats.ternary, 1)
+            .operands_in(vec![
+                OperandConstraint::FixedReg(reg_xmm0),
+                OperandConstraint::RegClass(fpr),
+                OperandConstraint::RegClass(fpr),
+            ])
+            .operands_out(vec![2])
+            .emit(
+                r#"
+                    {{PUT_OP}}(bits, rex2(in_reg1, in_reg2), sink);
+                    modrm_rr(in_reg1, in_reg2, sink);
+                "#,
+            ),
+        "size_with_inferred_rex_for_inreg1_inreg2",
+    );
+
     // XX /n ib with 8-bit immediate sign-extended.
     {
         recipes.add_template_inferred(
diff --git a/cranelift/codegen/src/isa/x86/enc_tables.rs b/cranelift/codegen/src/isa/x86/enc_tables.rs
index c00ca97357..1d071d643b 100644
--- a/cranelift/codegen/src/isa/x86/enc_tables.rs
+++ b/cranelift/codegen/src/isa/x86/enc_tables.rs
@@ -246,6 +246,20 @@ fn size_with_inferred_rex_for_inreg0_inreg1(
     sizing.base_size + if needs_rex { 1 } else { 0 }
 }
 
+/// Infers whether a dynamic REX prefix will be emitted, based on the second and third operands.
+fn size_with_inferred_rex_for_inreg1_inreg2(
+    sizing: &RecipeSizing,
+    _enc: Encoding,
+    inst: Inst,
+    divert: &RegDiversions,
+    func: &Function,
+) -> u8 {
+    // No need to check for REX.W in `needs_rex` because `infer_rex().w()` is not allowed.
+    let needs_rex = test_input(1, inst, divert, func, is_extended_reg)
+        || test_input(2, inst, divert, func, is_extended_reg);
+    sizing.base_size + if needs_rex { 1 } else { 0 }
+}
+
 /// Infers whether a dynamic REX prefix will be emitted, based on a single
 /// input register and a single output register.
 fn size_with_inferred_rex_for_inreg0_outreg0(
diff --git a/cranelift/codegen/src/simple_preopt.rs b/cranelift/codegen/src/simple_preopt.rs
index 2f47c7d91b..98ff7c3992 100644
--- a/cranelift/codegen/src/simple_preopt.rs
+++ b/cranelift/codegen/src/simple_preopt.rs
@@ -656,7 +656,7 @@ mod simplify {
         dfg::ValueDef,
         immediates,
         instructions::{Opcode, ValueList},
-        types::{I16, I32, I8},
+        types::{B8, I16, I32, I8},
     };
     use std::marker::PhantomData;
 
@@ -935,6 +935,69 @@ mod simplify {
             }
         }
 
+        InstructionData::Ternary {
+            opcode: Opcode::Bitselect,
+            args,
+        } => {
+            let old_cond_type = pos.func.dfg.value_type(args[0]);
+            if !old_cond_type.is_vector() {
+                return;
+            }
+
+            // Replace bitselect with vselect if each lane of the controlling mask is either
+            // all ones or all zeroes; on x86, bitselect is encoded using 3 instructions,
+            // while vselect can be encoded using a single BLEND instruction.
+            if let ValueDef::Result(def_inst, _) = pos.func.dfg.value_def(args[0]) {
+                let (cond_val, cond_type) = match pos.func.dfg[def_inst] {
+                    InstructionData::Unary {
+                        opcode: Opcode::RawBitcast,
+                        arg,
+                    } => {
+                        // If the controlling mask is a raw-bitcasted boolean vector, then
+                        // we know each lane is either all zeroes or all ones,
+                        // so we can use the vselect instruction instead.
+                        let arg_type = pos.func.dfg.value_type(arg);
+                        if !arg_type.is_vector() || !arg_type.lane_type().is_bool() {
+                            return;
+                        }
+                        (arg, arg_type)
+                    }
+                    InstructionData::UnaryConst {
+                        opcode: Opcode::Vconst,
+                        constant_handle,
+                    } => {
+                        // If each byte of the controlling mask is 0x00 or 0xFF, then
+                        // we can always bitcast our way to vselect(B8x16, I8x16, I8x16).
+                        // Bitselect operates at the bit level, so the lane types don't matter.
+                        let const_data = pos.func.dfg.constants.get(constant_handle);
+                        if !const_data.iter().all(|&b| b == 0 || b == 0xFF) {
+                            return;
+                        }
+                        let new_type = B8.by(old_cond_type.bytes() as u16).unwrap();
+                        (pos.ins().raw_bitcast(new_type, args[0]), new_type)
+                    }
+                    _ => return,
+                };
+
+                let lane_type = Type::int(cond_type.lane_bits() as u16).unwrap();
+                let arg_type = lane_type.by(cond_type.lane_count()).unwrap();
+                let old_arg_type = pos.func.dfg.value_type(args[1]);
+
+                if arg_type != old_arg_type {
+                    // Operand types must match; we need to add bitcasts.
+                    let arg1 = pos.ins().raw_bitcast(arg_type, args[1]);
+                    let arg2 = pos.ins().raw_bitcast(arg_type, args[2]);
+                    let ret = pos.ins().vselect(cond_val, arg1, arg2);
+                    pos.func.dfg.replace(inst).raw_bitcast(old_arg_type, ret);
+                } else {
+                    pos.func
+                        .dfg
+                        .replace(inst)
+                        .vselect(cond_val, args[1], args[2]);
+                }
+            }
+        }
+
         _ => {}
     }
 }
diff --git a/cranelift/filetests/filetests/isa/x86/simd-bitselect-to-vselect-run.clif b/cranelift/filetests/filetests/isa/x86/simd-bitselect-to-vselect-run.clif
new file mode 100644
index 0000000000..03cc645712
--- /dev/null
+++ b/cranelift/filetests/filetests/isa/x86/simd-bitselect-to-vselect-run.clif
@@ -0,0 +1,39 @@
+test run
+set opt_level=speed_and_size
+set enable_simd
+target x86_64 haswell
+
+;; Test that the bitselect->vselect optimization works properly.
+
+function %mask_from_icmp(i32x4, i32x4) -> i32x4 {
+block0(v0: i32x4, v1: i32x4):
+    v2 = icmp sge v0, v1
+    v3 = raw_bitcast.i32x4 v2
+    v4 = bitselect v3, v0, v1
+    return v4
+}
+; run: %mask_from_icmp([5 6 7 8], [1 10 20 7]) == [5 10 20 8]
+
+function %mask_casted(i64x2, i64x2, i32x4) -> i64x2 {
+block0(v0: i64x2, v1: i64x2, v2: i32x4):
+    v3 = raw_bitcast.i64x2 v2
+    v4 = bitselect v3, v0, v1
+    return v4
+}
+; run: %mask_casted([0 0], [0xFFFFFF 0xFFFF4F], [0xFFF1 0 0xF 0]) == [0xFF000E 0xFFFF40]
+
+function %good_const_mask(i32x4, i32x4) -> i32x4 {
+block0(v0: i32x4, v1: i32x4):
+    v2 = vconst.i32x4 [0x0000FF00 0x00FF00FF 0x00FF00FF 0xFF00FFFF]
+    v4 = bitselect v2, v0, v1
+    return v4
+}
+; run: %good_const_mask([0x1234 0x5678 0x1234 0x5678], [0xAAAA 0xAAAA 0xAAAA 0xAAAA]) == [0x12AA 0xAA78 0xAA34 0x5678]
+
+function %bad_const_mask(i32x4, i32x4) -> i32x4 {
+block0(v0: i32x4, v1: i32x4):
+    v2 = vconst.i32x4 [0x0000FF00 0x00FF00FF 0x00FF000F 0xFF00FFF0]
+    v4 = bitselect v2, v0, v1
+    return v4
+}
+; run: %bad_const_mask([0x1234 0x5678 0x1234 0x5678], [0xAAAA 0xAAAA 0xAAAA 0xAAAA]) == [0x12AA 0xAA78 0xAAA4 0x567A]
diff --git a/cranelift/filetests/filetests/isa/x86/simd-vselect-binemit.clif b/cranelift/filetests/filetests/isa/x86/simd-vselect-binemit.clif
new file mode 100644
index 0000000000..a575c58f64
--- /dev/null
+++ b/cranelift/filetests/filetests/isa/x86/simd-vselect-binemit.clif
@@ -0,0 +1,27 @@
+test binemit
+set enable_simd
+target x86_64 haswell
+
+function %vselect_i8x16(b8x16, i8x16, i8x16) {
+block0(v0: b8x16 [%xmm0], v1: i8x16 [%xmm3], v2: i8x16 [%xmm5]):
+[-, %xmm5] v3 = vselect v0, v1, v2 ; bin: 66 0f 38 10 eb
+    return
+}
+
+function %vselect_i16x8(b16x8, i16x8, i16x8) {
+block0(v0: b16x8 [%xmm0], v1: i16x8 [%xmm3], v2: i16x8 [%xmm5]):
+[-, %xmm5] v3 = vselect v0, v1, v2 ; bin: 66 0f 38 10 eb
+    return
+}
+
+function %vselect_i32x4(b32x4, i32x4, i32x4) {
+block0(v0: b32x4 [%xmm0], v1: i32x4 [%xmm3], v2: i32x4 [%xmm5]):
+[-, %xmm5] v3 = vselect v0, v1, v2 ; bin: 66 0f 38 14 eb
+    return
+}
+
+function %vselect_i64x2(b64x2, i64x2, i64x2) {
+block0(v0: b64x2 [%xmm0], v1: i64x2 [%xmm3], v2: i64x2 [%xmm5]):
+[-, %xmm5] v3 = vselect v0, v1, v2 ; bin: 66 0f 38 15 eb
+    return
+}
diff --git a/cranelift/filetests/filetests/isa/x86/simd-vselect-legalize-to-bitselect.clif b/cranelift/filetests/filetests/isa/x86/simd-vselect-legalize-to-bitselect.clif
new file mode 100644
index 0000000000..723539631d
--- /dev/null
+++ b/cranelift/filetests/filetests/isa/x86/simd-vselect-legalize-to-bitselect.clif
@@ -0,0 +1,45 @@
+test legalizer
+set enable_simd
+target x86_64
+
+;; Test that vselect is legalized to bitselect when BLEND* instructions are not available.
+
+function %vselect_i8x16(b8x16, i8x16, i8x16) -> i8x16 {
+block0(v0: b8x16, v1: i8x16, v2: i8x16):
+    v3 = vselect v0, v1, v2
+    ; check: v4 = raw_bitcast.i8x16 v0
+    ; nextln: v5 = band v1, v4
+    ; nextln: v6 = band_not v2, v4
+    ; nextln: v3 = bor v5, v6
+    return v3
+}
+
+function %vselect_i16x8(b16x8, i16x8, i16x8) -> i16x8 {
+block0(v0: b16x8, v1: i16x8, v2: i16x8):
+    v3 = vselect v0, v1, v2
+    ; check: v4 = raw_bitcast.i16x8 v0
+    ; nextln: v5 = band v1, v4
+    ; nextln: v6 = band_not v2, v4
+    ; nextln: v3 = bor v5, v6
+    return v3
+}
+
+function %vselect_i32x4(b32x4, i32x4, i32x4) -> i32x4 {
+block0(v0: b32x4, v1: i32x4, v2: i32x4):
+    v3 = vselect v0, v1, v2
+    ; check: v4 = raw_bitcast.i32x4 v0
+    ; nextln: v5 = band v1, v4
+    ; nextln: v6 = band_not v2, v4
+    ; nextln: v3 = bor v5, v6
+    return v3
+}
+
+function %vselect_i64x2(b64x2, i64x2, i64x2) -> i64x2 {
+block0(v0: b64x2, v1: i64x2, v2: i64x2):
+    v3 = vselect v0, v1, v2
+    ; check: v4 = raw_bitcast.i64x2 v0
+    ; nextln: v5 = band v1, v4
+    ; nextln: v6 = band_not v2, v4
+    ; nextln: v3 = bor v5, v6
+    return v3
+}
diff --git a/cranelift/filetests/filetests/isa/x86/simd-vselect-run.clif b/cranelift/filetests/filetests/isa/x86/simd-vselect-run.clif
new file mode 100644
index 0000000000..ac6feaa994
--- /dev/null
+++ b/cranelift/filetests/filetests/isa/x86/simd-vselect-run.clif
@@ -0,0 +1,43 @@
+test run
+set enable_simd
+target x86_64 haswell
+
+function %vselect_i8x16() -> i8x16 {
+block0:
+    v1 = vconst.b8x16 [false true false true false true true true true true false false false false false false]
+    v2 = vconst.i8x16 [100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115]
+    v3 = vconst.i8x16 [200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215]
+    v4 = vselect v1, v2, v3
+    return v4
+}
+; run: %vselect_i8x16() == [200 101 202 103 204 105 106 107 108 109 210 211 212 213 214 215]
+
+function %vselect_i16x8() -> i16x8 {
+block0:
+    v1 = vconst.b16x8 [false true false true false true true true]
+    v2 = vconst.i16x8 [100 101 102 103 104 105 106 107]
+    v3 = vconst.i16x8 [200 201 202 203 204 205 206 207]
+    v4 = vselect v1, v2, v3
+    return v4
+}
+; run: %vselect_i16x8() == [200 101 202 103 204 105 106 107]
+
+function %vselect_i32x4() -> i32x4 {
+block0:
+    v1 = vconst.b32x4 [false true false true]
+    v2 = vconst.i32x4 [100 101 102 103]
+    v3 = vconst.i32x4 [200 201 202 203]
+    v4 = vselect v1, v2, v3
+    return v4
+}
+; run: %vselect_i32x4() == [200 101 202 103]
+
+function %vselect_i64x2() -> i64x2 {
+block0:
+    v1 = vconst.b64x2 [false true]
+    v2 = vconst.i64x2 [100 101]
+    v3 = vconst.i64x2 [200 201]
+    v4 = vselect v1, v2, v3
+    return v4
+}
+; run: %vselect_i64x2() == [200 101]
diff --git a/cranelift/filetests/filetests/simple_preopt/bitselect.clif b/cranelift/filetests/filetests/simple_preopt/bitselect.clif
new file mode 100644
index 0000000000..684d91ee31
--- /dev/null
+++ b/cranelift/filetests/filetests/simple_preopt/bitselect.clif
@@ -0,0 +1,50 @@
+test simple_preopt
+target x86_64
+
+;; Test replacement of bitselect with vselect for special masks.
+
+function %mask_from_icmp(i8x16, i8x16) -> i8x16 {
+block0(v0: i8x16, v1: i8x16):
+    v2 = icmp eq v0, v1
+    v3 = raw_bitcast.i8x16 v2
+    v4 = bitselect v3, v0, v1
+    ; check: v4 = vselect v2, v0, v1
+    return v4
+}
+
+function %mask_casted(i8x16, i8x16, i32x4) -> i8x16 {
+block0(v0: i8x16, v1: i8x16, v2: i32x4):
+    v3 = raw_bitcast.i8x16 v2
+    v4 = bitselect v3, v0, v1
+    ; check: v4 = bitselect v3, v0, v1
+    return v4
+}
+
+function %good_const_mask_i8x16(i8x16, i8x16) -> i8x16 {
+block0(v0: i8x16, v1: i8x16):
+    v3 = vconst.i8x16 [0 0 0xFF 0 0 0xFF 0 0 0 0 0xFF 0 0 0 0 0xFF]
+    v4 = bitselect v3, v0, v1
+    ; check: v5 = raw_bitcast.b8x16 v3
+    ; nextln: v4 = vselect v5, v0, v1
+    return v4
+}
+
+function %good_const_mask_i16x8(i16x8, i16x8) -> i16x8 {
+block0(v0: i16x8, v1: i16x8):
+    v3 = vconst.i16x8 [0x0000 0xFF00 0x0000 0x00FF 0x0000 0xFFFF 0x00FF 0xFFFF]
+    v4 = bitselect v3, v0, v1
+    ; check: v5 = raw_bitcast.b8x16 v3
+    ; nextln: v6 = raw_bitcast.i8x16 v0
+    ; nextln: v7 = raw_bitcast.i8x16 v1
+    ; nextln: v8 = vselect v5, v6, v7
+    ; nextln: v4 = raw_bitcast.i16x8 v8
+    return v4
+}
+
+function %bad_const_mask(i8x16, i8x16) -> i8x16 {
+block0(v0: i8x16, v1: i8x16):
+    v3 = vconst.i8x16 [0 0 0xF0 0 0 0xFF 0 0 0 0 0xFF 0 0 0 0 0xFF]
+    v4 = bitselect v3, v0, v1
+    ; check: v4 = bitselect v3, v0, v1
+    return v4
+}
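
To see why the preopt rewrite in this patch is sound, here is a minimal standalone sketch (plain Rust; the function names and `main` are illustrative, not part of the patch) of bitselect's bit-level semantics next to the per-byte, high-bit semantics of PBLENDVB that vselect encodes to. Whenever every mask byte is 0x00 or 0xFF — exactly the precondition the `simple_preopt` pass checks before rewriting — the two selections agree.

```rust
/// Bit-level select, as in Cranelift's `bitselect`: each *bit* of the mask
/// picks the corresponding bit from `x` (mask bit set) or `y` (mask bit clear).
fn bitselect(mask: [u8; 16], x: [u8; 16], y: [u8; 16]) -> [u8; 16] {
    let mut out = [0u8; 16];
    for i in 0..16 {
        out[i] = (x[i] & mask[i]) | (y[i] & !mask[i]);
    }
    out
}

/// Byte-level select, as in PBLENDVB (mask implicitly in XMM0): the *high bit*
/// of each mask byte picks the whole byte from `x` or `y`.
fn pblendvb(mask: [u8; 16], x: [u8; 16], y: [u8; 16]) -> [u8; 16] {
    let mut out = [0u8; 16];
    for i in 0..16 {
        out[i] = if mask[i] & 0x80 != 0 { x[i] } else { y[i] };
    }
    out
}

fn main() {
    // Every mask byte is 0x00 or 0xFF -- the precondition checked before the
    // bitselect->vselect rewrite -- so both selects produce the same result.
    let mask: [u8; 16] = [
        0xFF, 0x00, 0xFF, 0x00, 0xFF, 0xFF, 0x00, 0x00,
        0xFF, 0x00, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF,
    ];
    let x: [u8; 16] = [0x12; 16];
    let y: [u8; 16] = [0xAB; 16];
    assert_eq!(bitselect(mask, x, y), pblendvb(mask, x, y));
}
```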