Optimize vconst for x86 when immediate contains all zeroes or ones
Instead of using MOVUPS to load the bits from memory, which is expensive, this change uses a predicate to optimize vconst so that it needs no memory access:
- when all 128 bits of the immediate are zero, use PXOR to zero out an XMM register
- when all 128 bits of the immediate are one, use PCMPEQB to set an XMM register to all ones

This leaves the constant data in the constant pool, which may increase code size (TODO).
This commit is contained in:
@@ -614,6 +614,12 @@ pub enum FormatPredicateKind {
|
||||
/// Is the immediate format field member equal to zero? (float64 version)
|
||||
IsZero64BitFloat,
|
||||
|
||||
/// Is the immediate format field member equal to zero in all lanes?
|
||||
IsAllZeroes128Bit,
|
||||
|
||||
/// Does the immediate format field member have ones in all bits of all lanes?
|
||||
IsAllOnes128Bit,
|
||||
|
||||
/// Has the value list (in member_name) the size specified in parameter?
|
||||
LengthEquals(usize),
|
||||
|
||||
@@ -690,6 +696,14 @@ impl FormatPredicateNode {
|
||||
FormatPredicateKind::IsZero64BitFloat => {
|
||||
format!("predicates::is_zero_64_bit_float({})", self.member_name)
|
||||
}
|
||||
FormatPredicateKind::IsAllZeroes128Bit => format!(
|
||||
"predicates::is_all_zeroes_128_bit(func.dfg.constants.get({}))",
|
||||
self.member_name
|
||||
),
|
||||
FormatPredicateKind::IsAllOnes128Bit => format!(
|
||||
"predicates::is_all_ones_128_bit(func.dfg.constants.get({}))",
|
||||
self.member_name
|
||||
),
|
||||
FormatPredicateKind::LengthEquals(num) => format!(
|
||||
"predicates::has_length_of({}, {}, func)",
|
||||
self.member_name, num
|
||||
@@ -929,6 +943,28 @@ impl InstructionPredicate {
|
||||
))
|
||||
}
|
||||
|
||||
pub fn new_is_all_zeroes_128bit(
|
||||
format: &InstructionFormat,
|
||||
field_name: &'static str,
|
||||
) -> InstructionPredicateNode {
|
||||
InstructionPredicateNode::FormatPredicate(FormatPredicateNode::new(
|
||||
format,
|
||||
field_name,
|
||||
FormatPredicateKind::IsAllZeroes128Bit,
|
||||
))
|
||||
}
|
||||
|
||||
pub fn new_is_all_ones_128bit(
|
||||
format: &InstructionFormat,
|
||||
field_name: &'static str,
|
||||
) -> InstructionPredicateNode {
|
||||
InstructionPredicateNode::FormatPredicate(FormatPredicateNode::new(
|
||||
format,
|
||||
field_name,
|
||||
FormatPredicateKind::IsAllOnes128Bit,
|
||||
))
|
||||
}
|
||||
|
||||
pub fn new_length_equals(format: &InstructionFormat, size: usize) -> InstructionPredicateNode {
|
||||
assert!(
|
||||
format.has_value_list,
|
||||
|
||||
@@ -310,6 +310,20 @@ impl PerCpuModeEncodings {
|
||||
self.enc64_rec(inst, recipe, bits);
|
||||
}
|
||||
|
||||
/// Add the same encoding to both X86_32 and X86_64; assumes configuration (e.g. REX, operand binding) has already happened
|
||||
fn enc_32_64_func<T>(
|
||||
&mut self,
|
||||
inst: impl Clone + Into<InstSpec>,
|
||||
template: Template,
|
||||
builder_closure: T,
|
||||
) where
|
||||
T: FnOnce(EncodingBuilder) -> EncodingBuilder,
|
||||
{
|
||||
let encoding = self.make_encoding(inst.into(), template, builder_closure);
|
||||
self.enc32.push(encoding.clone());
|
||||
self.enc64.push(encoding);
|
||||
}
|
||||
|
||||
/// Add the same encoding to both X86_32 and X86_64; assumes configuration (e.g. REX, operand
|
||||
/// binding) has already happened.
|
||||
fn enc_32_64_maybe_isap(
|
||||
@@ -642,6 +656,7 @@ pub(crate) fn define(
|
||||
let rec_urm_noflags = r.template("urm_noflags");
|
||||
let rec_urm_noflags_abcd = r.template("urm_noflags_abcd");
|
||||
let rec_vconst = r.template("vconst");
|
||||
let rec_vconst_optimized = r.template("vconst_optimized");
|
||||
|
||||
// Predicates shorthands.
|
||||
let all_ones_funcaddrs_and_not_is_pic =
|
||||
@@ -1671,7 +1686,7 @@ pub(crate) fn define(
|
||||
);
|
||||
e.enc_x86_64_instp(
|
||||
f64const,
|
||||
rec_f64imm_z.opcodes(vec![0x66, 0x0f, 0x57]),
|
||||
rec_f64imm_z.opcodes(vec![0x66, 0x0f, 0x57]), // XORPD from SSE2
|
||||
is_zero_64_bit_float,
|
||||
);
|
||||
|
||||
@@ -1946,6 +1961,32 @@ pub(crate) fn define(
|
||||
}
|
||||
}
|
||||
|
||||
// SIMD vconst for special cases (all zeroes, all ones)
|
||||
// this must be encoded prior to the MOVUPS implementation (below) so the compiler sees this
|
||||
// encoding first
|
||||
for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
|
||||
let f_unary_const = formats.get(formats.by_name("UnaryConst"));
|
||||
let instruction = vconst.bind_vector_from_lane(ty, sse_vector_size);
|
||||
|
||||
let is_zero_128bit =
|
||||
InstructionPredicate::new_is_all_zeroes_128bit(f_unary_const, "constant_handle");
|
||||
let template = rec_vconst_optimized
|
||||
.nonrex()
|
||||
.opcodes(vec![0x66, 0x0f, 0xef]); // PXOR from SSE2
|
||||
e.enc_32_64_func(instruction.clone(), template, |builder| {
|
||||
builder.inst_predicate(is_zero_128bit)
|
||||
});
|
||||
|
||||
let is_ones_128bit =
|
||||
InstructionPredicate::new_is_all_ones_128bit(f_unary_const, "constant_handle");
|
||||
let template = rec_vconst_optimized
|
||||
.nonrex()
|
||||
.opcodes(vec![0x66, 0x0f, 0x74]); // PCMPEQB from SSE2
|
||||
e.enc_32_64_func(instruction, template, |builder| {
|
||||
builder.inst_predicate(is_ones_128bit)
|
||||
});
|
||||
}
|
||||
|
||||
// SIMD vconst using MOVUPS
|
||||
// TODO it would be ideal if eventually this became the more efficient MOVAPS but we would have
|
||||
// to guarantee that the constants are aligned when emitted and there is currently no mechanism
|
||||
|
||||
@@ -2449,6 +2449,18 @@ pub(crate) fn define<'shared>(
|
||||
),
|
||||
);
|
||||
|
||||
recipes.add_template_recipe(
|
||||
EncodingRecipeBuilder::new("vconst_optimized", f_unary_const, 1)
|
||||
.operands_out(vec![fpr])
|
||||
.clobbers_flags(false)
|
||||
.emit(
|
||||
r#"
|
||||
{{PUT_OP}}(bits, rex2(out_reg0, out_reg0), sink);
|
||||
modrm_rr(out_reg0, out_reg0, sink);
|
||||
"#,
|
||||
),
|
||||
);
|
||||
|
||||
recipes.add_template_recipe(
|
||||
EncodingRecipeBuilder::new("jt_base", f_branch_table_base, 5)
|
||||
.operands_out(vec![gpr])
|
||||
|
||||
@@ -31,6 +31,18 @@ pub fn is_zero_32_bit_float<T: Into<ir::immediates::Ieee32>>(x: T) -> bool {
|
||||
x32.bits() == 0
|
||||
}
|
||||
|
||||
/// Check that a 128-bit vector contains all zeroes.
///
/// `x` may be any value that can be compared against a reference to a 16-byte array (for
/// example `&[u8; 16]` or `Vec<u8>`); the result is `true` only when that comparison with
/// sixteen `0x00` bytes succeeds.
// NOTE(review): `dead_code` is presumably allowed because the only callers are generated
// predicate expressions rather than hand-written code -- confirm before removing.
#[allow(dead_code)]
pub fn is_all_zeroes_128_bit<'b, T: PartialEq<&'b [u8; 16]>>(x: T) -> bool {
    x == &[0u8; 16]
}
|
||||
|
||||
/// Check that a 128-bit vector contains all ones.
///
/// `x` may be any value that can be compared against a reference to a 16-byte array (for
/// example `&[u8; 16]` or `Vec<u8>`); the result is `true` only when that comparison with
/// sixteen `0xff` bytes succeeds.
// NOTE(review): `dead_code` is presumably allowed because the only callers are generated
// predicate expressions rather than hand-written code -- confirm before removing.
#[allow(dead_code)]
pub fn is_all_ones_128_bit<'b, T: PartialEq<&'b [u8; 16]>>(x: T) -> bool {
    x == &[0xffu8; 16]
}
|
||||
|
||||
/// Check that `x` is the same as `y`.
|
||||
#[allow(dead_code)]
|
||||
pub fn is_equal<T: Eq + Copy, O: Into<T> + Copy>(x: T, y: O) -> bool {
|
||||
@@ -109,4 +121,19 @@ mod tests {
|
||||
assert!(!is_signed_int(x1, 16, 4));
|
||||
assert!(!is_signed_int(x2, 16, 4));
|
||||
}
|
||||
|
||||
#[test]
fn is_all_zeroes() {
    // Both a borrowed array and an owned vector are accepted by the predicate.
    assert!(is_all_zeroes_128_bit(&[0; 16]));
    assert!(is_all_zeroes_128_bit(vec![
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    ]));
    assert!(!is_all_zeroes_128_bit(&[1; 16]));

    // A single set bit anywhere in the vector must be enough to reject it.
    let mut one_bit = [0u8; 16];
    one_bit[15] = 0x80;
    assert!(!is_all_zeroes_128_bit(&one_bit));
}
|
||||
|
||||
#[test]
fn is_all_ones() {
    assert!(!is_all_ones_128_bit(&[0; 16]));
    assert!(is_all_ones_128_bit(&[0xff; 16]));

    // A single cleared bit anywhere in the vector must be enough to reject it.
    let mut almost = [0xffu8; 16];
    almost[0] = 0xfe;
    assert!(!is_all_ones_128_bit(&almost));
}
|
||||
}
|
||||
|
||||
@@ -5,7 +5,7 @@ target x86_64
|
||||
|
||||
function %test_vconst_b8() {
|
||||
ebb0:
|
||||
[-, %xmm2] v0 = vconst.b8x16 0x00 ; bin: 0f 10 15 00000008 PCRelRodata4(15)
|
||||
[-, %xmm3] v1 = vconst.b8x16 0x01 ; bin: 0f 10 1d 00000011 PCRelRodata4(31)
|
||||
[-, %xmm2] v0 = vconst.b8x16 0x01 ; bin: 0f 10 15 00000008 PCRelRodata4(15)
|
||||
[-, %xmm3] v1 = vconst.b8x16 0x02 ; bin: 0f 10 1d 00000011 PCRelRodata4(31)
|
||||
return
|
||||
}
|
||||
|
||||
23
cranelift/filetests/filetests/isa/x86/vconst-opt-run.clif
Normal file
23
cranelift/filetests/filetests/isa/x86/vconst-opt-run.clif
Normal file
@@ -0,0 +1,23 @@
|
||||
test run
|
||||
set enable_simd
|
||||
target x86_64
|
||||
|
||||
; TODO move to vconst-run.clif
|
||||
|
||||
; Materializes an all-zero i8x16 constant, extracts lane 4 and returns whether that lane
; compares equal to 0.
function %test_vconst_zeroes() -> b1 {
|
||||
ebb0:
|
||||
v0 = vconst.i8x16 0x00
|
||||
v1 = extractlane v0, 4
|
||||
v2 = icmp_imm eq v1, 0
|
||||
return v2
|
||||
}
|
||||
; run
|
||||
|
||||
; Materializes an i8x16 constant with every bit set, extracts lane 2 and returns whether that
; lane compares equal to 0xff.
function %test_vconst_ones() -> b1 {
|
||||
ebb0:
|
||||
v0 = vconst.i8x16 0xffffffffffffffffffffffffffffffff
|
||||
v1 = extractlane v0, 2
|
||||
v2 = icmp_imm eq v1, 0xff
|
||||
return v2
|
||||
}
|
||||
; run
|
||||
12
cranelift/filetests/filetests/isa/x86/vconst-opt.clif
Normal file
12
cranelift/filetests/filetests/isa/x86/vconst-opt.clif
Normal file
@@ -0,0 +1,12 @@
|
||||
test binemit
|
||||
set enable_simd
|
||||
target x86_64
|
||||
|
||||
; TODO move to vconst-compile.clif or vconst-binemit.clif
|
||||
|
||||
; Pins the bytes emitted for the two special-cased constants: the all-zero vconst is expected
; to start with 66 0f ef (PXOR) and the all-ones vconst with 66 0f 74 (PCMPEQB); in both cases
; the annotated output register is named on each instruction.
function %test_vconst_optimizations() {
|
||||
ebb0:
|
||||
[-, %xmm4] v0 = vconst.b8x16 0x00 ; bin: 66 0f ef e4
|
||||
[-, %xmm7] v1 = vconst.b8x16 0xffffffffffffffffffffffffffffffff ; bin: 66 0f 74 ff
|
||||
return
|
||||
}
|
||||
Reference in New Issue
Block a user