Optimize vconst for x86 when immediate contains all zeroes or ones

Instead of using MOVUPS to expensively load bits from memory, this change uses a predicate to optimize vconst without a memory access:
 - when every bit of the 128-bit immediate is zero, use PXOR to zero out an XMM register
 - when every bit of the 128-bit immediate is one, use PCMPEQB to set an XMM register to all ones

This still leaves the constant data in the constant pool, which may increase code size (TODO).
This commit is contained in:
Andrew Brown
2019-08-28 15:29:40 -07:00
parent 694de912a5
commit 702155b19b
7 changed files with 154 additions and 3 deletions

View File

@@ -614,6 +614,12 @@ pub enum FormatPredicateKind {
/// Is the immediate format field member equal to zero? (float64 version) /// Is the immediate format field member equal to zero? (float64 version)
IsZero64BitFloat, IsZero64BitFloat,
/// Is the immediate format field member equal to zero in all lanes?
IsAllZeroes128Bit,
/// Does the immediate format field member have ones in all bits of all lanes?
IsAllOnes128Bit,
/// Has the value list (in member_name) the size specified in parameter? /// Has the value list (in member_name) the size specified in parameter?
LengthEquals(usize), LengthEquals(usize),
@@ -690,6 +696,14 @@ impl FormatPredicateNode {
FormatPredicateKind::IsZero64BitFloat => { FormatPredicateKind::IsZero64BitFloat => {
format!("predicates::is_zero_64_bit_float({})", self.member_name) format!("predicates::is_zero_64_bit_float({})", self.member_name)
} }
FormatPredicateKind::IsAllZeroes128Bit => format!(
"predicates::is_all_zeroes_128_bit(func.dfg.constants.get({}))",
self.member_name
),
FormatPredicateKind::IsAllOnes128Bit => format!(
"predicates::is_all_ones_128_bit(func.dfg.constants.get({}))",
self.member_name
),
FormatPredicateKind::LengthEquals(num) => format!( FormatPredicateKind::LengthEquals(num) => format!(
"predicates::has_length_of({}, {}, func)", "predicates::has_length_of({}, {}, func)",
self.member_name, num self.member_name, num
@@ -929,6 +943,28 @@ impl InstructionPredicate {
)) ))
} }
/// Create a format predicate of kind `IsAllZeroes128Bit` over the member `field_name` of
/// `format`, wrapped as an instruction predicate node.
pub fn new_is_all_zeroes_128bit(
    format: &InstructionFormat,
    field_name: &'static str,
) -> InstructionPredicateNode {
    let all_zeroes =
        FormatPredicateNode::new(format, field_name, FormatPredicateKind::IsAllZeroes128Bit);
    InstructionPredicateNode::FormatPredicate(all_zeroes)
}
/// Create a format predicate of kind `IsAllOnes128Bit` over the member `field_name` of
/// `format`, wrapped as an instruction predicate node.
pub fn new_is_all_ones_128bit(
    format: &InstructionFormat,
    field_name: &'static str,
) -> InstructionPredicateNode {
    let all_ones =
        FormatPredicateNode::new(format, field_name, FormatPredicateKind::IsAllOnes128Bit);
    InstructionPredicateNode::FormatPredicate(all_ones)
}
pub fn new_length_equals(format: &InstructionFormat, size: usize) -> InstructionPredicateNode { pub fn new_length_equals(format: &InstructionFormat, size: usize) -> InstructionPredicateNode {
assert!( assert!(
format.has_value_list, format.has_value_list,

View File

@@ -310,6 +310,20 @@ impl PerCpuModeEncodings {
self.enc64_rec(inst, recipe, bits); self.enc64_rec(inst, recipe, bits);
} }
/// Add the same encoding to both X86_32 and X86_64; assumes configuration (e.g. REX, operand
/// binding) has already happened.
///
/// A single encoding is produced by `make_encoding` from `inst`, `template` and
/// `builder_closure`; a clone of it is pushed onto `enc32` and the original onto `enc64`.
fn enc_32_64_func<T>(
    &mut self,
    inst: impl Clone + Into<InstSpec>,
    template: Template,
    builder_closure: T,
) where
    T: FnOnce(EncodingBuilder) -> EncodingBuilder,
{
    let shared = self.make_encoding(inst.into(), template, builder_closure);
    let for_x86_32 = shared.clone();
    self.enc32.push(for_x86_32);
    self.enc64.push(shared);
}
/// Add the same encoding to both X86_32 and X86_64; assumes configuration (e.g. REX, operand /// Add the same encoding to both X86_32 and X86_64; assumes configuration (e.g. REX, operand
/// binding) has already happened. /// binding) has already happened.
fn enc_32_64_maybe_isap( fn enc_32_64_maybe_isap(
@@ -642,6 +656,7 @@ pub(crate) fn define(
let rec_urm_noflags = r.template("urm_noflags"); let rec_urm_noflags = r.template("urm_noflags");
let rec_urm_noflags_abcd = r.template("urm_noflags_abcd"); let rec_urm_noflags_abcd = r.template("urm_noflags_abcd");
let rec_vconst = r.template("vconst"); let rec_vconst = r.template("vconst");
let rec_vconst_optimized = r.template("vconst_optimized");
// Predicates shorthands. // Predicates shorthands.
let all_ones_funcaddrs_and_not_is_pic = let all_ones_funcaddrs_and_not_is_pic =
@@ -1671,7 +1686,7 @@ pub(crate) fn define(
); );
e.enc_x86_64_instp( e.enc_x86_64_instp(
f64const, f64const,
rec_f64imm_z.opcodes(vec![0x66, 0x0f, 0x57]), rec_f64imm_z.opcodes(vec![0x66, 0x0f, 0x57]), // XORPD from SSE2
is_zero_64_bit_float, is_zero_64_bit_float,
); );
@@ -1946,6 +1961,32 @@ pub(crate) fn define(
} }
} }
// SIMD vconst for special cases (all zeroes, all ones)
// this must be encoded prior to the MOVUPS implementation (below) so the compiler sees this
// encoding first
for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
let f_unary_const = formats.get(formats.by_name("UnaryConst"));
let instruction = vconst.bind_vector_from_lane(ty, sse_vector_size);
let is_zero_128bit =
InstructionPredicate::new_is_all_zeroes_128bit(f_unary_const, "constant_handle");
let template = rec_vconst_optimized
.nonrex()
.opcodes(vec![0x66, 0x0f, 0xef]); // PXOR from SSE2
e.enc_32_64_func(instruction.clone(), template, |builder| {
builder.inst_predicate(is_zero_128bit)
});
let is_ones_128bit =
InstructionPredicate::new_is_all_ones_128bit(f_unary_const, "constant_handle");
let template = rec_vconst_optimized
.nonrex()
.opcodes(vec![0x66, 0x0f, 0x74]); // PCMPEQB from SSE2
e.enc_32_64_func(instruction, template, |builder| {
builder.inst_predicate(is_ones_128bit)
});
}
// SIMD vconst using MOVUPS // SIMD vconst using MOVUPS
// TODO it would be ideal if eventually this became the more efficient MOVAPS but we would have // TODO it would be ideal if eventually this became the more efficient MOVAPS but we would have
// to guarantee that the constants are aligned when emitted and there is currently no mechanism // to guarantee that the constants are aligned when emitted and there is currently no mechanism

View File

@@ -2449,6 +2449,18 @@ pub(crate) fn define<'shared>(
), ),
); );
recipes.add_template_recipe(
EncodingRecipeBuilder::new("vconst_optimized", f_unary_const, 1)
.operands_out(vec![fpr])
.clobbers_flags(false)
.emit(
r#"
{{PUT_OP}}(bits, rex2(out_reg0, out_reg0), sink);
modrm_rr(out_reg0, out_reg0, sink);
"#,
),
);
recipes.add_template_recipe( recipes.add_template_recipe(
EncodingRecipeBuilder::new("jt_base", f_branch_table_base, 5) EncodingRecipeBuilder::new("jt_base", f_branch_table_base, 5)
.operands_out(vec![gpr]) .operands_out(vec![gpr])

View File

@@ -31,6 +31,18 @@ pub fn is_zero_32_bit_float<T: Into<ir::immediates::Ieee32>>(x: T) -> bool {
x32.bits() == 0 x32.bits() == 0
} }
/// Return `true` when `x` compares equal to sixteen `0x00` bytes, i.e. a 128-bit vector whose
/// bits are all zero.
#[allow(dead_code)]
pub fn is_all_zeroes_128_bit<'b, T: PartialEq<&'b [u8; 16]>>(x: T) -> bool {
    const ALL_ZEROES: &[u8; 16] = &[0x00; 16];
    x == ALL_ZEROES
}
/// Return `true` when `x` compares equal to sixteen `0xff` bytes, i.e. a 128-bit vector whose
/// bits are all one.
#[allow(dead_code)]
pub fn is_all_ones_128_bit<'b, T: PartialEq<&'b [u8; 16]>>(x: T) -> bool {
    const ALL_ONES: &[u8; 16] = &[0xff; 16];
    x == ALL_ONES
}
/// Check that `x` is the same as `y`. /// Check that `x` is the same as `y`.
#[allow(dead_code)] #[allow(dead_code)]
pub fn is_equal<T: Eq + Copy, O: Into<T> + Copy>(x: T, y: O) -> bool { pub fn is_equal<T: Eq + Copy, O: Into<T> + Copy>(x: T, y: O) -> bool {
@@ -109,4 +121,19 @@ mod tests {
assert!(!is_signed_int(x1, 16, 4)); assert!(!is_signed_int(x1, 16, 4));
assert!(!is_signed_int(x2, 16, 4)); assert!(!is_signed_int(x2, 16, 4));
} }
#[test]
fn is_all_zeroes() {
    // Both a borrowed array and an owned `Vec` are accepted by the predicate.
    let from_array = is_all_zeroes_128_bit(&[0; 16]);
    let from_vec = is_all_zeroes_128_bit(vec![
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    ]);
    let from_non_zero = is_all_zeroes_128_bit(&[1; 16]);

    assert!(from_array);
    assert!(from_vec);
    assert!(!from_non_zero);
}
#[test]
fn is_all_ones() {
    // All-zero bytes must be rejected; all-0xff bytes must be accepted.
    let from_zeroes = is_all_ones_128_bit(&[0; 16]);
    let from_ones = is_all_ones_128_bit(&[0xff; 16]);

    assert!(!from_zeroes);
    assert!(from_ones);
}
} }

View File

@@ -5,7 +5,7 @@ target x86_64
function %test_vconst_b8() { function %test_vconst_b8() {
ebb0: ebb0:
[-, %xmm2] v0 = vconst.b8x16 0x00 ; bin: 0f 10 15 00000008 PCRelRodata4(15) [-, %xmm2] v0 = vconst.b8x16 0x01 ; bin: 0f 10 15 00000008 PCRelRodata4(15)
[-, %xmm3] v1 = vconst.b8x16 0x01 ; bin: 0f 10 1d 00000011 PCRelRodata4(31) [-, %xmm3] v1 = vconst.b8x16 0x02 ; bin: 0f 10 1d 00000011 PCRelRodata4(31)
return return
} }

View File

@@ -0,0 +1,23 @@
test run
set enable_simd
target x86_64
; TODO move to vconst-run.clif
function %test_vconst_zeroes() -> b1 {
ebb0:
v0 = vconst.i8x16 0x00
v1 = extractlane v0, 4
v2 = icmp_imm eq v1, 0
return v2
}
; run
function %test_vconst_ones() -> b1 {
ebb0:
v0 = vconst.i8x16 0xffffffffffffffffffffffffffffffff
v1 = extractlane v0, 2
v2 = icmp_imm eq v1, 0xff
return v2
}
; run

View File

@@ -0,0 +1,12 @@
test binemit
set enable_simd
target x86_64
; TODO move to vconst-compile.clif or vconst-binemit.clif
function %test_vconst_optimizations() {
ebb0:
[-, %xmm4] v0 = vconst.b8x16 0x00 ; bin: 66 0f ef e4
[-, %xmm7] v1 = vconst.b8x16 0xffffffffffffffffffffffffffffffff ; bin: 66 0f 74 ff
return
}