Improve bitselect codegen with knowledge of operand origin (#1783)
* Encode vselect using BLEND instructions on x86
* Legalize vselect to bitselect
* Optimize bitselect to vselect for some operands
* Add run tests for bitselect-vselect optimization
* Address review feedback
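In short: `bitselect` picks bits from two operands according to an arbitrary mask, which on x86 lowers to a three-instruction band/band_not/bor sequence. When the mask is known to have each lane entirely ones or entirely zeroes (it originates from a boolean vector, e.g. an `icmp` result, or from a suitable constant), the preopt pass can instead emit `vselect`, which the backend now encodes as a single SSE4.1 BLEND instruction. A minimal sketch of the rewrite, mirroring the `simple_preopt` filetests added below:

    function %mask_from_icmp(i8x16, i8x16) -> i8x16 {
    block0(v0: i8x16, v1: i8x16):
        v2 = icmp eq v0, v1           ; each mask lane is all ones or all zeroes
        v3 = raw_bitcast.i8x16 v2
        v4 = bitselect v3, v0, v1     ; preopt rewrites this to: v4 = vselect v2, v0, v1
        return v4
    }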
@@ -1634,6 +1634,7 @@ fn define_simd(
     let ushr_imm = shared.by_name("ushr_imm");
     let usub_sat = shared.by_name("usub_sat");
     let vconst = shared.by_name("vconst");
+    let vselect = shared.by_name("vselect");
     let x86_insertps = x86.by_name("x86_insertps");
     let x86_movlhps = x86.by_name("x86_movlhps");
     let x86_movsd = x86.by_name("x86_movsd");
@@ -1654,6 +1655,7 @@ fn define_simd(
     let x86_punpckl = x86.by_name("x86_punpckl");

     // Shorthands for recipes.
+    let rec_blend = r.template("blend");
     let rec_evex_reg_vvvv_rm_128 = r.template("evex_reg_vvvv_rm_128");
     let rec_f_ib = r.template("f_ib");
     let rec_fa = r.template("fa");
@@ -1723,6 +1725,20 @@ fn define_simd(
         e.enc_both_inferred(instruction, template);
     }

+    // SIMD vselect; the controlling value of vselect is a boolean vector, so each lane is
+    // either all ones or all zeroes, which makes it possible to always use the 8-bit PBLENDVB;
+    // for 32/64-bit lanes we can also use BLENDVPS and BLENDVPD.
+    for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
+        let opcode = match ty.lane_bits() {
+            32 => &BLENDVPS,
+            64 => &BLENDVPD,
+            _ => &PBLENDVB,
+        };
+        let instruction = vselect.bind(vector(ty, sse_vector_size));
+        let template = rec_blend.opcodes(opcode);
+        e.enc_both_inferred_maybe_isap(instruction, template, Some(use_sse41_simd));
+    }
+
     // SIMD scalar_to_vector; this uses MOV to copy the scalar value to an XMM register; according
     // to the Intel manual: "When the destination operand is an XMM register, the source operand is
     // written to the low doubleword of the register and the register is zero-extended to 128 bits."
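With these encodings, a boolean-controlled vselect becomes a single instruction; for example, the b8x16 binding assembles to PBLENDVB, as this line from the new binemit filetests below shows:

    [-, %xmm5] v3 = vselect v0, v1, v2 ; bin: 66 0f 38 10 eb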
@@ -378,6 +378,7 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro
     let vconst = insts.by_name("vconst");
     let vall_true = insts.by_name("vall_true");
     let vany_true = insts.by_name("vany_true");
+    let vselect = insts.by_name("vselect");

     let x86_packss = x86_instructions.by_name("x86_packss");
     let x86_pmaxs = x86_instructions.by_name("x86_pmaxs");
@@ -589,6 +590,17 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro
         );
     }

+    // SIMD vselect; replace with bitselect if BLEND* instructions are not available.
+    // This works because each lane of the boolean vector is filled with zeroes or ones.
+    for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
+        let vselect = vselect.bind(vector(ty, sse_vector_size));
+        let raw_bitcast = raw_bitcast.bind(vector(ty, sse_vector_size));
+        narrow.legalize(
+            def!(d = vselect(c, x, y)),
+            vec![def!(a = raw_bitcast(c)), def!(d = bitselect(a, x, y))],
+        );
+    }
+
    // SIMD vany_true
    let ne = Literal::enumerator_for(&imm.intcc, "ne");
    for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
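On targets without SSE4.1, this legalization replaces vselect with a raw_bitcast plus bitselect, which in turn lowers to three bitwise instructions (checked by the new legalizer filetests below), sketched here for the i8x16 case:

    v3 = vselect v0, v1, v2
    ; expands to:
    v4 = raw_bitcast.i8x16 v0
    v5 = band v1, v4
    v6 = band_not v2, v4
    v3 = bor v5, v6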
@@ -54,6 +54,14 @@ pub static BIT_SCAN_FORWARD: [u8; 2] = [0x0f, 0xbc];
 /// Bit scan reverse (stores index of first encountered 1 from the back).
 pub static BIT_SCAN_REVERSE: [u8; 2] = [0x0f, 0xbd];

+/// Select packed single-precision floating-point values from xmm1 and xmm2/m128
+/// from the mask specified in XMM0 and store the values into xmm1 (SSE4.1).
+pub static BLENDVPS: [u8; 4] = [0x66, 0x0f, 0x38, 0x14];
+
+/// Select packed double-precision floating-point values from xmm1 and xmm2/m128
+/// from the mask specified in XMM0 and store the values into xmm1 (SSE4.1).
+pub static BLENDVPD: [u8; 4] = [0x66, 0x0f, 0x38, 0x15];
+
 /// Call near, relative, displacement relative to next instruction (sign-extended).
 pub static CALL_RELATIVE: [u8; 1] = [0xe8];

@@ -335,6 +343,10 @@ pub static PAVGB: [u8; 3] = [0x66, 0x0f, 0xE0];
 /// Average packed unsigned word integers from xmm2/m128 and xmm1 with rounding (SSE2).
 pub static PAVGW: [u8; 3] = [0x66, 0x0f, 0xE3];

+/// Select byte values from xmm1 and xmm2/m128 from the mask specified in the high bit of each
+/// byte in XMM0 and store the values into xmm1 (SSE4.1).
+pub static PBLENDVB: [u8; 4] = [0x66, 0x0f, 0x38, 0x10];
+
 /// Compare packed data for equal (SSE2).
 pub static PCMPEQB: [u8; 3] = [0x66, 0x0f, 0x74];

@@ -427,6 +427,7 @@ pub(crate) fn define<'shared>(
     let reg_rcx = Register::new(gpr, regs.regunit_by_name(gpr, "rcx"));
     let reg_rdx = Register::new(gpr, regs.regunit_by_name(gpr, "rdx"));
     let reg_r15 = Register::new(gpr, regs.regunit_by_name(gpr, "r15"));
+    let reg_xmm0 = Register::new(fpr, regs.regunit_by_name(fpr, "xmm0"));

     // Stack operand with a 32-bit signed displacement from either RBP or RSP.
     let stack_gpr32 = Stack::new(gpr);

@@ -904,6 +905,24 @@ pub(crate) fn define<'shared>(
             .inferred_rex_compute_size("size_with_inferred_rex_for_inreg1"),
     );

+    // XX /r for BLEND* instructions
+    recipes.add_template_inferred(
+        EncodingRecipeBuilder::new("blend", &formats.ternary, 1)
+            .operands_in(vec![
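+                // Note: the legacy (non-VEX) BLENDVPS/BLENDVPD/PBLENDVB encodings read their
+                // selection mask implicitly from XMM0, hence the fixed-register constraint below.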
+                OperandConstraint::FixedReg(reg_xmm0),
+                OperandConstraint::RegClass(fpr),
+                OperandConstraint::RegClass(fpr),
+            ])
+            .operands_out(vec![2])
+            .emit(
+                r#"
+                    {{PUT_OP}}(bits, rex2(in_reg1, in_reg2), sink);
+                    modrm_rr(in_reg1, in_reg2, sink);
+                "#,
+            ),
+        "size_with_inferred_rex_for_inreg1_inreg2",
+    );
+
     // XX /n ib with 8-bit immediate sign-extended.
     {
         recipes.add_template_inferred(
@@ -246,6 +246,20 @@ fn size_with_inferred_rex_for_inreg0_inreg1(
     sizing.base_size + if needs_rex { 1 } else { 0 }
 }

+/// Infers whether a dynamic REX prefix will be emitted, based on the second and third operands.
+fn size_with_inferred_rex_for_inreg1_inreg2(
+    sizing: &RecipeSizing,
+    _enc: Encoding,
+    inst: Inst,
+    divert: &RegDiversions,
+    func: &Function,
+) -> u8 {
+    // No need to check for REX.W in `needs_rex` because `infer_rex().w()` is not allowed.
+    let needs_rex = test_input(1, inst, divert, func, is_extended_reg)
+        || test_input(2, inst, divert, func, is_extended_reg);
+    sizing.base_size + if needs_rex { 1 } else { 0 }
+}
+
 /// Infers whether a dynamic REX prefix will be emitted, based on a single
 /// input register and a single output register.
 fn size_with_inferred_rex_for_inreg0_outreg0(
@@ -656,7 +656,7 @@ mod simplify {
         dfg::ValueDef,
         immediates,
         instructions::{Opcode, ValueList},
-        types::{I16, I32, I8},
+        types::{B8, I16, I32, I8},
     };
     use std::marker::PhantomData;

@@ -935,6 +935,69 @@ mod simplify {
             }
         }

+        InstructionData::Ternary {
+            opcode: Opcode::Bitselect,
+            args,
+        } => {
+            let old_cond_type = pos.func.dfg.value_type(args[0]);
+            if !old_cond_type.is_vector() {
+                return;
+            }
+
+            // Replace bitselect with vselect if each lane of the controlling mask is either
+            // all ones or all zeroes; on x86, bitselect is encoded using three instructions,
+            // while vselect can be encoded using a single BLEND instruction.
+            if let ValueDef::Result(def_inst, _) = pos.func.dfg.value_def(args[0]) {
+                let (cond_val, cond_type) = match pos.func.dfg[def_inst] {
+                    InstructionData::Unary {
+                        opcode: Opcode::RawBitcast,
+                        arg,
+                    } => {
+                        // If the controlling mask is a raw-bitcasted boolean vector, then
+                        // we know each lane is either all zeroes or all ones,
+                        // so we can use the vselect instruction instead.
+                        let arg_type = pos.func.dfg.value_type(arg);
+                        if !arg_type.is_vector() || !arg_type.lane_type().is_bool() {
+                            return;
+                        }
+                        (arg, arg_type)
+                    }
+                    InstructionData::UnaryConst {
+                        opcode: Opcode::Vconst,
+                        constant_handle,
+                    } => {
+                        // If each byte of the controlling mask is 0x00 or 0xFF, then we can
+                        // always bitcast our way to vselect(B8x16, I8x16, I8x16); bitselect
+                        // operates at the bit level, so the lane types don't matter.
+                        let const_data = pos.func.dfg.constants.get(constant_handle);
+                        if !const_data.iter().all(|&b| b == 0 || b == 0xFF) {
+                            return;
+                        }
+                        let new_type = B8.by(old_cond_type.bytes() as u16).unwrap();
+                        (pos.ins().raw_bitcast(new_type, args[0]), new_type)
+                    }
+                    _ => return,
+                };
+
+                let lane_type = Type::int(cond_type.lane_bits() as u16).unwrap();
+                let arg_type = lane_type.by(cond_type.lane_count()).unwrap();
+                let old_arg_type = pos.func.dfg.value_type(args[1]);
+
+                if arg_type != old_arg_type {
+                    // Operand types must match, so we need to add bitcasts.
+                    let arg1 = pos.ins().raw_bitcast(arg_type, args[1]);
+                    let arg2 = pos.ins().raw_bitcast(arg_type, args[2]);
+                    let ret = pos.ins().vselect(cond_val, arg1, arg2);
+                    pos.func.dfg.replace(inst).raw_bitcast(old_arg_type, ret);
+                } else {
+                    pos.func
+                        .dfg
+                        .replace(inst)
+                        .vselect(cond_val, args[1], args[2]);
+                }
+            }
+        }
+
         _ => {}
     }
 }
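A worked example of the constant-mask path, for an i16x8 mask whose bytes are all 0x00/0xFF (this mirrors the `%good_const_mask_i16x8` filetest added below): the constant is reinterpreted as b8x16 and the data operands are bitcast to match, so the whole select runs at byte granularity:

    v3 = vconst.i16x8 [0x0000 0xFF00 0x0000 0x00FF 0x0000 0xFFFF 0x00FF 0xFFFF]
    v4 = bitselect v3, v0, v1
    ; becomes:
    v5 = raw_bitcast.b8x16 v3
    v6 = raw_bitcast.i8x16 v0
    v7 = raw_bitcast.i8x16 v1
    v8 = vselect v5, v6, v7
    v4 = raw_bitcast.i16x8 v8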
@@ -0,0 +1,39 @@
+test run
+set opt_level=speed_and_size
+set enable_simd
+target x86_64 haswell
+
+;; Test that the bitselect->vselect optimization works properly.
+
+function %mask_from_icmp(i32x4, i32x4) -> i32x4 {
+block0(v0: i32x4, v1: i32x4):
+    v2 = icmp sge v0, v1
+    v3 = raw_bitcast.i32x4 v2
+    v4 = bitselect v3, v0, v1
+    return v4
+}
+; run: %mask_from_icmp([5 6 7 8], [1 10 20 7]) == [5 10 20 8]
+
+function %mask_casted(i64x2, i64x2, i32x4) -> i64x2 {
+block0(v0: i64x2, v1: i64x2, v2: i32x4):
+    v3 = raw_bitcast.i64x2 v2
+    v4 = bitselect v3, v0, v1
+    return v4
+}
+; run: %mask_casted([0 0], [0xFFFFFF 0xFFFF4F], [0xFFF1 0 0xF 0]) == [0xFF000E 0xFFFF40]
+
+function %good_const_mask(i32x4, i32x4) -> i32x4 {
+block0(v0: i32x4, v1: i32x4):
+    v2 = vconst.i32x4 [0x0000FF00 0x00FF00FF 0x00FF00FF 0xFF00FFFF]
+    v4 = bitselect v2, v0, v1
+    return v4
+}
+; run: %good_const_mask([0x1234 0x5678 0x1234 0x5678], [0xAAAA 0xAAAA 0xAAAA 0xAAAA]) == [0x12AA 0xAA78 0xAA34 0x5678]
+
+function %bad_const_mask(i32x4, i32x4) -> i32x4 {
+block0(v0: i32x4, v1: i32x4):
+    v2 = vconst.i32x4 [0x0000FF00 0x00FF00FF 0x00FF000F 0xFF00FFF0]
+    v4 = bitselect v2, v0, v1
+    return v4
+}
+; run: %bad_const_mask([0x1234 0x5678 0x1234 0x5678], [0xAAAA 0xAAAA 0xAAAA 0xAAAA]) == [0x12AA 0xAA78 0xAAA4 0x567A]
@@ -0,0 +1,27 @@
+test binemit
+set enable_simd
+target x86_64 haswell
+
+function %vselect_i8x16(b8x16, i8x16, i8x16) {
+block0(v0: b8x16 [%xmm0], v1: i8x16 [%xmm3], v2: i8x16 [%xmm5]):
+[-, %xmm5] v3 = vselect v0, v1, v2 ; bin: 66 0f 38 10 eb
+    return
+}
+
+function %vselect_i16x8(b16x8, i16x8, i16x8) {
+block0(v0: b16x8 [%xmm0], v1: i16x8 [%xmm3], v2: i16x8 [%xmm5]):
+[-, %xmm5] v3 = vselect v0, v1, v2 ; bin: 66 0f 38 10 eb
+    return
+}
+
+function %vselect_i32x4(b32x4, i32x4, i32x4) {
+block0(v0: b32x4 [%xmm0], v1: i32x4 [%xmm3], v2: i32x4 [%xmm5]):
+[-, %xmm5] v3 = vselect v0, v1, v2 ; bin: 66 0f 38 14 eb
+    return
+}
+
+function %vselect_i64x2(b64x2, i64x2, i64x2) {
+block0(v0: b64x2 [%xmm0], v1: i64x2 [%xmm3], v2: i64x2 [%xmm5]):
+[-, %xmm5] v3 = vselect v0, v1, v2 ; bin: 66 0f 38 15 eb
+    return
+}
@@ -0,0 +1,45 @@
+test legalizer
+set enable_simd
+target x86_64
+
+;; Test that vselect is legalized when BLEND* instructions are not available.
+
+function %vselect_i8x16(b8x16, i8x16, i8x16) -> i8x16 {
+block0(v0: b8x16, v1: i8x16, v2: i8x16):
+    v3 = vselect v0, v1, v2
+    ; check: v4 = raw_bitcast.i8x16 v0
+    ; nextln: v5 = band v1, v4
+    ; nextln: v6 = band_not v2, v4
+    ; nextln: v3 = bor v5, v6
+    return v3
+}
+
+function %vselect_i16x8(b16x8, i16x8, i16x8) -> i16x8 {
+block0(v0: b16x8, v1: i16x8, v2: i16x8):
+    v3 = vselect v0, v1, v2
+    ; check: v4 = raw_bitcast.i16x8 v0
+    ; nextln: v5 = band v1, v4
+    ; nextln: v6 = band_not v2, v4
+    ; nextln: v3 = bor v5, v6
+    return v3
+}
+
+function %vselect_i32x4(b32x4, i32x4, i32x4) -> i32x4 {
+block0(v0: b32x4, v1: i32x4, v2: i32x4):
+    v3 = vselect v0, v1, v2
+    ; check: v4 = raw_bitcast.i32x4 v0
+    ; nextln: v5 = band v1, v4
+    ; nextln: v6 = band_not v2, v4
+    ; nextln: v3 = bor v5, v6
+    return v3
+}
+
+function %vselect_i64x2(b64x2, i64x2, i64x2) -> i64x2 {
+block0(v0: b64x2, v1: i64x2, v2: i64x2):
+    v3 = vselect v0, v1, v2
+    ; check: v4 = raw_bitcast.i64x2 v0
+    ; nextln: v5 = band v1, v4
+    ; nextln: v6 = band_not v2, v4
+    ; nextln: v3 = bor v5, v6
+    return v3
+}
cranelift/filetests/filetests/isa/x86/simd-vselect-run.clif (new file, 43 lines)
@@ -0,0 +1,43 @@
+test run
+set enable_simd
+target x86_64 haswell
+
+function %vselect_i8x16() -> i8x16 {
+block0:
+    v1 = vconst.b8x16 [false true false true false true true true true true false false false false false false]
+    v2 = vconst.i8x16 [100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115]
+    v3 = vconst.i8x16 [200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215]
+    v4 = vselect v1, v2, v3
+    return v4
+}
+; run: %vselect_i8x16() == [200 101 202 103 204 105 106 107 108 109 210 211 212 213 214 215]
+
+function %vselect_i16x8() -> i16x8 {
+block0:
+    v1 = vconst.b16x8 [false true false true false true true true]
+    v2 = vconst.i16x8 [100 101 102 103 104 105 106 107]
+    v3 = vconst.i16x8 [200 201 202 203 204 205 206 207]
+    v4 = vselect v1, v2, v3
+    return v4
+}
+; run: %vselect_i16x8() == [200 101 202 103 204 105 106 107]
+
+function %vselect_i32x4() -> i32x4 {
+block0:
+    v1 = vconst.b32x4 [false true false true]
+    v2 = vconst.i32x4 [100 101 102 103]
+    v3 = vconst.i32x4 [200 201 202 203]
+    v4 = vselect v1, v2, v3
+    return v4
+}
+; run: %vselect_i32x4() == [200 101 202 103]
+
+function %vselect_i64x2() -> i64x2 {
+block0:
+    v1 = vconst.b64x2 [false true]
+    v2 = vconst.i64x2 [100 101]
+    v3 = vconst.i64x2 [200 201]
+    v4 = vselect v1, v2, v3
+    return v4
+}
+; run: %vselect_i64x2() == [200 101]
cranelift/filetests/filetests/simple_preopt/bitselect.clif (new file, 50 lines)
@@ -0,0 +1,50 @@
+test simple_preopt
+target x86_64
+
+;; Test replacement of bitselect with vselect for special masks.
+
+function %mask_from_icmp(i8x16, i8x16) -> i8x16 {
+block0(v0: i8x16, v1: i8x16):
+    v2 = icmp eq v0, v1
+    v3 = raw_bitcast.i8x16 v2
+    v4 = bitselect v3, v0, v1
+    ; check: v4 = vselect v2, v0, v1
+    return v4
+}
+
+function %mask_casted(i8x16, i8x16, i32x4) -> i8x16 {
+block0(v0: i8x16, v1: i8x16, v2: i32x4):
+    v3 = raw_bitcast.i8x16 v2
+    v4 = bitselect v3, v0, v1
+    ; check: v4 = bitselect v3, v0, v1
+    return v4
+}
+
+function %good_const_mask_i8x16(i8x16, i8x16) -> i8x16 {
+block0(v0: i8x16, v1: i8x16):
+    v3 = vconst.i8x16 [0 0 0xFF 0 0 0xFF 0 0 0 0 0xFF 0 0 0 0 0xFF]
+    v4 = bitselect v3, v0, v1
+    ; check: v5 = raw_bitcast.b8x16 v3
+    ; nextln: v4 = vselect v5, v0, v1
+    return v4
+}
+
+function %good_const_mask_i16x8(i16x8, i16x8) -> i16x8 {
+block0(v0: i16x8, v1: i16x8):
+    v3 = vconst.i16x8 [0x0000 0xFF00 0x0000 0x00FF 0x0000 0xFFFF 0x00FF 0xFFFF]
+    v4 = bitselect v3, v0, v1
+    ; check: v5 = raw_bitcast.b8x16 v3
+    ; nextln: v6 = raw_bitcast.i8x16 v0
+    ; nextln: v7 = raw_bitcast.i8x16 v1
+    ; nextln: v8 = vselect v5, v6, v7
+    ; nextln: v4 = raw_bitcast.i16x8 v8
+    return v4
+}
+
+function %bad_const_mask(i8x16, i8x16) -> i8x16 {
+block0(v0: i8x16, v1: i8x16):
+    v3 = vconst.i8x16 [0 0 0xF0 0 0 0xFF 0 0 0 0 0xFF 0 0 0 0 0xFF]
+    v4 = bitselect v3, v0, v1
+    ; check: v4 = bitselect v3, v0, v1
+    return v4
+}