x64: implement vselect with variable blend instructions
This change implements `vselect` using SSE4.1's `BLENDVPS`, `BLENDVPD`,
and `PBLENDVB`. `vselect` is a lane-selecting instruction that is used
by
[simple_preopt.rs](fa1faf5d22/cranelift/codegen/src/simple_preopt.rs#L947-L999)
to lower `bitselect` to a single x86 instruction when the condition mask
is known to be boolean (each lane all 1s or all 0s, e.g., from a conversion).
This is better than the general `bitselect` lowering, which takes 4-5 instructions.
The old backend had the `vselect` lowering; this simply introduces it to
the new backend.
This commit is contained in:
@@ -478,6 +478,7 @@ pub enum SseOpcode {
|
|||||||
Andnps,
|
Andnps,
|
||||||
Andnpd,
|
Andnpd,
|
||||||
Blendvpd,
|
Blendvpd,
|
||||||
|
Blendvps,
|
||||||
Comiss,
|
Comiss,
|
||||||
Comisd,
|
Comisd,
|
||||||
Cmpps,
|
Cmpps,
|
||||||
@@ -547,6 +548,7 @@ pub enum SseOpcode {
|
|||||||
Pandn,
|
Pandn,
|
||||||
Pavgb,
|
Pavgb,
|
||||||
Pavgw,
|
Pavgw,
|
||||||
|
Pblendvb,
|
||||||
Pcmpeqb,
|
Pcmpeqb,
|
||||||
Pcmpeqw,
|
Pcmpeqw,
|
||||||
Pcmpeqd,
|
Pcmpeqd,
|
||||||
@@ -769,8 +771,10 @@ impl SseOpcode {
|
|||||||
| SseOpcode::Pshufb => SSSE3,
|
| SseOpcode::Pshufb => SSSE3,
|
||||||
|
|
||||||
SseOpcode::Blendvpd
|
SseOpcode::Blendvpd
|
||||||
|
| SseOpcode::Blendvps
|
||||||
| SseOpcode::Insertps
|
| SseOpcode::Insertps
|
||||||
| SseOpcode::Packusdw
|
| SseOpcode::Packusdw
|
||||||
|
| SseOpcode::Pblendvb
|
||||||
| SseOpcode::Pcmpeqq
|
| SseOpcode::Pcmpeqq
|
||||||
| SseOpcode::Pextrb
|
| SseOpcode::Pextrb
|
||||||
| SseOpcode::Pextrd
|
| SseOpcode::Pextrd
|
||||||
@@ -828,6 +832,7 @@ impl fmt::Debug for SseOpcode {
|
|||||||
SseOpcode::Andnps => "andnps",
|
SseOpcode::Andnps => "andnps",
|
||||||
SseOpcode::Andnpd => "andnpd",
|
SseOpcode::Andnpd => "andnpd",
|
||||||
SseOpcode::Blendvpd => "blendvpd",
|
SseOpcode::Blendvpd => "blendvpd",
|
||||||
|
SseOpcode::Blendvps => "blendvps",
|
||||||
SseOpcode::Cmpps => "cmpps",
|
SseOpcode::Cmpps => "cmpps",
|
||||||
SseOpcode::Cmppd => "cmppd",
|
SseOpcode::Cmppd => "cmppd",
|
||||||
SseOpcode::Cmpss => "cmpss",
|
SseOpcode::Cmpss => "cmpss",
|
||||||
@@ -897,6 +902,7 @@ impl fmt::Debug for SseOpcode {
|
|||||||
SseOpcode::Pandn => "pandn",
|
SseOpcode::Pandn => "pandn",
|
||||||
SseOpcode::Pavgb => "pavgb",
|
SseOpcode::Pavgb => "pavgb",
|
||||||
SseOpcode::Pavgw => "pavgw",
|
SseOpcode::Pavgw => "pavgw",
|
||||||
|
SseOpcode::Pblendvb => "pblendvb",
|
||||||
SseOpcode::Pcmpeqb => "pcmpeqb",
|
SseOpcode::Pcmpeqb => "pcmpeqb",
|
||||||
SseOpcode::Pcmpeqw => "pcmpeqw",
|
SseOpcode::Pcmpeqw => "pcmpeqw",
|
||||||
SseOpcode::Pcmpeqd => "pcmpeqd",
|
SseOpcode::Pcmpeqd => "pcmpeqd",
|
||||||
|
|||||||
@@ -1441,6 +1441,7 @@ pub(crate) fn emit(
|
|||||||
SseOpcode::Andpd => (LegacyPrefixes::_66, 0x0F54, 2),
|
SseOpcode::Andpd => (LegacyPrefixes::_66, 0x0F54, 2),
|
||||||
SseOpcode::Andnps => (LegacyPrefixes::None, 0x0F55, 2),
|
SseOpcode::Andnps => (LegacyPrefixes::None, 0x0F55, 2),
|
||||||
SseOpcode::Andnpd => (LegacyPrefixes::_66, 0x0F55, 2),
|
SseOpcode::Andnpd => (LegacyPrefixes::_66, 0x0F55, 2),
|
||||||
|
SseOpcode::Blendvps => (LegacyPrefixes::_66, 0x0F3814, 3),
|
||||||
SseOpcode::Blendvpd => (LegacyPrefixes::_66, 0x0F3815, 3),
|
SseOpcode::Blendvpd => (LegacyPrefixes::_66, 0x0F3815, 3),
|
||||||
SseOpcode::Cvttps2dq => (LegacyPrefixes::_F3, 0x0F5B, 2),
|
SseOpcode::Cvttps2dq => (LegacyPrefixes::_F3, 0x0F5B, 2),
|
||||||
SseOpcode::Cvtdq2ps => (LegacyPrefixes::None, 0x0F5B, 2),
|
SseOpcode::Cvtdq2ps => (LegacyPrefixes::None, 0x0F5B, 2),
|
||||||
@@ -1480,6 +1481,7 @@ pub(crate) fn emit(
|
|||||||
SseOpcode::Pandn => (LegacyPrefixes::_66, 0x0FDF, 2),
|
SseOpcode::Pandn => (LegacyPrefixes::_66, 0x0FDF, 2),
|
||||||
SseOpcode::Pavgb => (LegacyPrefixes::_66, 0x0FE0, 2),
|
SseOpcode::Pavgb => (LegacyPrefixes::_66, 0x0FE0, 2),
|
||||||
SseOpcode::Pavgw => (LegacyPrefixes::_66, 0x0FE3, 2),
|
SseOpcode::Pavgw => (LegacyPrefixes::_66, 0x0FE3, 2),
|
||||||
|
SseOpcode::Pblendvb => (LegacyPrefixes::_66, 0x0F3810, 3),
|
||||||
SseOpcode::Pcmpeqb => (LegacyPrefixes::_66, 0x0F74, 2),
|
SseOpcode::Pcmpeqb => (LegacyPrefixes::_66, 0x0F74, 2),
|
||||||
SseOpcode::Pcmpeqw => (LegacyPrefixes::_66, 0x0F75, 2),
|
SseOpcode::Pcmpeqw => (LegacyPrefixes::_66, 0x0F75, 2),
|
||||||
SseOpcode::Pcmpeqd => (LegacyPrefixes::_66, 0x0F76, 2),
|
SseOpcode::Pcmpeqd => (LegacyPrefixes::_66, 0x0F76, 2),
|
||||||
|
|||||||
@@ -3432,6 +3432,18 @@ fn test_x64_emit() {
|
|||||||
"blendvpd %xmm15, %xmm4",
|
"blendvpd %xmm15, %xmm4",
|
||||||
));
|
));
|
||||||
|
|
||||||
|
insns.push((
|
||||||
|
Inst::xmm_rm_r(SseOpcode::Blendvps, RegMem::reg(xmm2), w_xmm3),
|
||||||
|
"660F3814DA",
|
||||||
|
"blendvps %xmm2, %xmm3",
|
||||||
|
));
|
||||||
|
|
||||||
|
insns.push((
|
||||||
|
Inst::xmm_rm_r(SseOpcode::Pblendvb, RegMem::reg(xmm12), w_xmm13),
|
||||||
|
"66450F3810EC",
|
||||||
|
"pblendvb %xmm12, %xmm13",
|
||||||
|
));
|
||||||
|
|
||||||
// ========================================================
|
// ========================================================
|
||||||
// XMM_RM_R: Integer Packed
|
// XMM_RM_R: Integer Packed
|
||||||
|
|
||||||
|
|||||||
@@ -1927,13 +1927,20 @@ fn x64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
|
|||||||
src.get_regs_as_uses(collector);
|
src.get_regs_as_uses(collector);
|
||||||
collector.add_def(*dst);
|
collector.add_def(*dst);
|
||||||
}
|
}
|
||||||
Inst::XmmRmR { src, dst, .. } => {
|
Inst::XmmRmR { src, dst, op, .. } => {
|
||||||
if inst.produces_const() {
|
if inst.produces_const() {
|
||||||
// No need to account for src, since src == dst.
|
// No need to account for src, since src == dst.
|
||||||
collector.add_def(*dst);
|
collector.add_def(*dst);
|
||||||
} else {
|
} else {
|
||||||
src.get_regs_as_uses(collector);
|
src.get_regs_as_uses(collector);
|
||||||
collector.add_mod(*dst);
|
collector.add_mod(*dst);
|
||||||
|
// Some instructions have an implicit use of XMM0.
|
||||||
|
if *op == SseOpcode::Blendvpd
|
||||||
|
|| *op == SseOpcode::Blendvps
|
||||||
|
|| *op == SseOpcode::Pblendvb
|
||||||
|
{
|
||||||
|
collector.add_use(regs::xmm0());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Inst::XmmRmREvex {
|
Inst::XmmRmREvex {
|
||||||
|
|||||||
@@ -2029,7 +2029,50 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
ctx.emit(Inst::gen_move(dst, tmp2.to_reg(), ty));
|
ctx.emit(Inst::gen_move(dst, tmp2.to_reg(), ty));
|
||||||
ctx.emit(Inst::or(ty, RegMem::from(tmp1), dst));
|
ctx.emit(Inst::or(ty, RegMem::from(tmp1), dst));
|
||||||
} else {
|
} else {
|
||||||
unimplemented!("scalar bitselect")
|
unimplemented!("no lowering for scalar bitselect instruction")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Opcode::Vselect => {
|
||||||
|
let ty = ty.unwrap();
|
||||||
|
let condition = put_input_in_reg(ctx, inputs[0]);
|
||||||
|
let condition_ty = ctx.input_ty(insn, 0);
|
||||||
|
let if_true = input_to_reg_mem(ctx, inputs[1]);
|
||||||
|
let if_false = put_input_in_reg(ctx, inputs[2]);
|
||||||
|
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
|
||||||
|
|
||||||
|
if ty.is_vector() {
|
||||||
|
// `vselect` relies on the bit representation of the condition:
|
||||||
|
// vector boolean types are defined in Cranelift to be all 1s or
|
||||||
|
// all 0s. This lowering relies on that fact to use x86's
|
||||||
|
// variable blend instructions, which look at the _high_bit_ of
|
||||||
|
// the condition mask. All the bits of vector booleans will
|
||||||
|
// match (all 1s or all 0s), so we can just use the high bit.
|
||||||
|
assert!(condition_ty.lane_type().is_bool());
|
||||||
|
|
||||||
|
// Variable blend instructions expect the condition mask to be
|
||||||
|
// in XMM0.
|
||||||
|
let xmm0 = Writable::from_reg(regs::xmm0());
|
||||||
|
ctx.emit(Inst::gen_move(xmm0, condition, ty));
|
||||||
|
|
||||||
|
// Match up the source and destination registers for regalloc.
|
||||||
|
ctx.emit(Inst::gen_move(dst, if_false, ty));
|
||||||
|
|
||||||
|
// Technically PBLENDVB would work in all cases (since the bytes
|
||||||
|
// inside the mask will be all 1s or 0s we can blend
|
||||||
|
// byte-by-byte instead of word-by-word, e.g.) but
|
||||||
|
// type-specialized versions are included here for clarity when
|
||||||
|
// troubleshooting and due to slight improvements in
|
||||||
|
// latency/throughput on certain processor families.
|
||||||
|
let opcode = match condition_ty {
|
||||||
|
types::B64X2 => SseOpcode::Blendvpd,
|
||||||
|
types::B32X4 => SseOpcode::Blendvps,
|
||||||
|
types::B16X8 | types::B8X16 => SseOpcode::Pblendvb,
|
||||||
|
_ => unimplemented!("unable lower vselect for type: {}", condition_ty),
|
||||||
|
};
|
||||||
|
ctx.emit(Inst::xmm_rm_r(opcode, if_true, dst));
|
||||||
|
} else {
|
||||||
|
unimplemented!("no lowering for scalar vselect instruction")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -15,6 +15,16 @@ block0:
|
|||||||
; nextln: por %xmm1, %xmm0
|
; nextln: por %xmm1, %xmm0
|
||||||
; not: movdqa
|
; not: movdqa
|
||||||
|
|
||||||
|
function %vselect_i16x8() -> i16x8 {
|
||||||
|
block0:
|
||||||
|
v0 = vconst.b16x8 [false true false true false true false true]
|
||||||
|
v1 = vconst.i16x8 [0 0 0 0 0 0 0 0]
|
||||||
|
v2 = vconst.i16x8 [0 0 0 0 0 0 0 0]
|
||||||
|
v3 = vselect v0, v1, v2
|
||||||
|
return v3
|
||||||
|
}
|
||||||
|
; check: pblendvb %xmm1, %xmm2
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
; 8x16 shifts: these lower to complex sequences of instructions
|
; 8x16 shifts: these lower to complex sequences of instructions
|
||||||
|
|||||||
@@ -10,6 +10,17 @@ block0(v0: i8x16, v1: i8x16, v2: i8x16):
|
|||||||
; Remember that bitselect accepts: 1) the selector vector, 2) the "if true" vector, and 3) the "if false" vector.
|
; Remember that bitselect accepts: 1) the selector vector, 2) the "if true" vector, and 3) the "if false" vector.
|
||||||
; run: %bitselect_i8x16([0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 255], [127 0 0 0 0 0 0 0 0 0 0 0 0 0 0 42], [42 0 0 0 0 0 0 0 0 0 0 0 0 0 0 127]) == [42 0 0 0 0 0 0 0 0 0 0 0 0 0 0 42]
|
; run: %bitselect_i8x16([0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 255], [127 0 0 0 0 0 0 0 0 0 0 0 0 0 0 42], [42 0 0 0 0 0 0 0 0 0 0 0 0 0 0 127]) == [42 0 0 0 0 0 0 0 0 0 0 0 0 0 0 42]
|
||||||
|
|
||||||
|
function %vselect_i32x4(i32x4, i32x4) -> i32x4 {
|
||||||
|
block0(v1: i32x4, v2: i32x4):
|
||||||
|
; `make_trampoline` still does not know how to convert boolean vector types
|
||||||
|
; so we load the value directly here.
|
||||||
|
v0 = vconst.b32x4 [true true false false]
|
||||||
|
v3 = vselect v0, v1, v2
|
||||||
|
return v3
|
||||||
|
}
|
||||||
|
; Remember that vselect accepts: 1) the selector vector, 2) the "if true" vector, and 3) the "if false" vector.
|
||||||
|
; run: %vselect_i32x4([1 2 -1 -1], [-1 -1 3 4]) == [1 2 3 4]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
; shift left
|
; shift left
|
||||||
|
|||||||
Reference in New Issue
Block a user