From 702155b19b796663e4e3e8c76cf3a35ef0963229 Mon Sep 17 00:00:00 2001
From: Andrew Brown
Date: Wed, 28 Aug 2019 15:29:40 -0700
Subject: [PATCH] Optimize vconst for x86 when immediate contains all zeroes or ones

Instead of using MOVUPS to load the constant's bits from memory (a relatively
expensive operation), this change uses instruction predicates to pick cheaper
encodings for vconst that avoid the memory access:
- when the 128-bit immediate is zero in all bits, use PXOR to zero out an XMM
  register
- when the 128-bit immediate has ones in all bits, use PCMPEQB to set an XMM
  register to all ones

This leaves the constant data in the constant pool, which may increase code
size (TODO)
---
 .../codegen/meta/src/cdsl/instructions.rs     | 36 ++++++++++++++++
 .../codegen/meta/src/isa/x86/encodings.rs     | 43 ++++++++++++++++++-
 cranelift/codegen/meta/src/isa/x86/recipes.rs | 12 ++++++
 cranelift/codegen/src/predicates.rs           | 27 ++++++++++++
 .../filetests/isa/x86/vconst-binemit.clif     |  4 +-
 .../filetests/isa/x86/vconst-opt-run.clif     | 23 ++++++++++
 .../filetests/isa/x86/vconst-opt.clif         | 12 ++++++
 7 files changed, 154 insertions(+), 3 deletions(-)
 create mode 100644 cranelift/filetests/filetests/isa/x86/vconst-opt-run.clif
 create mode 100644 cranelift/filetests/filetests/isa/x86/vconst-opt.clif

diff --git a/cranelift/codegen/meta/src/cdsl/instructions.rs b/cranelift/codegen/meta/src/cdsl/instructions.rs
index 2737099cc4..fdc5ad8bf8 100644
--- a/cranelift/codegen/meta/src/cdsl/instructions.rs
+++ b/cranelift/codegen/meta/src/cdsl/instructions.rs
@@ -614,6 +614,12 @@ pub enum FormatPredicateKind {
     /// Is the immediate format field member equal to zero? (float64 version)
     IsZero64BitFloat,
 
+    /// Is the immediate format field member equal to zero in all lanes?
+    IsAllZeroes128Bit,
+
+    /// Does the immediate format field member have ones in all bits of all lanes?
+    IsAllOnes128Bit,
+
     /// Has the value list (in member_name) the size specified in parameter?
     LengthEquals(usize),
 
@@ -690,6 +696,14 @@ impl FormatPredicateNode {
             FormatPredicateKind::IsZero64BitFloat => {
                 format!("predicates::is_zero_64_bit_float({})", self.member_name)
             }
+            FormatPredicateKind::IsAllZeroes128Bit => format!(
+                "predicates::is_all_zeroes_128_bit(func.dfg.constants.get({}))",
+                self.member_name
+            ),
+            FormatPredicateKind::IsAllOnes128Bit => format!(
+                "predicates::is_all_ones_128_bit(func.dfg.constants.get({}))",
+                self.member_name
+            ),
             FormatPredicateKind::LengthEquals(num) => format!(
                 "predicates::has_length_of({}, {}, func)",
                 self.member_name, num
@@ -929,6 +943,28 @@ impl InstructionPredicate {
         ))
     }
 
+    pub fn new_is_all_zeroes_128bit(
+        format: &InstructionFormat,
+        field_name: &'static str,
+    ) -> InstructionPredicateNode {
+        InstructionPredicateNode::FormatPredicate(FormatPredicateNode::new(
+            format,
+            field_name,
+            FormatPredicateKind::IsAllZeroes128Bit,
+        ))
+    }
+
+    pub fn new_is_all_ones_128bit(
+        format: &InstructionFormat,
+        field_name: &'static str,
+    ) -> InstructionPredicateNode {
+        InstructionPredicateNode::FormatPredicate(FormatPredicateNode::new(
+            format,
+            field_name,
+            FormatPredicateKind::IsAllOnes128Bit,
+        ))
+    }
+
     pub fn new_length_equals(format: &InstructionFormat, size: usize) -> InstructionPredicateNode {
         assert!(
             format.has_value_list,
diff --git a/cranelift/codegen/meta/src/isa/x86/encodings.rs b/cranelift/codegen/meta/src/isa/x86/encodings.rs
index d8b070b69e..6c578c458a 100644
--- a/cranelift/codegen/meta/src/isa/x86/encodings.rs
+++ b/cranelift/codegen/meta/src/isa/x86/encodings.rs
@@ -310,6 +310,20 @@ impl PerCpuModeEncodings {
         self.enc64_rec(inst, recipe, bits);
     }
 
+    /// Add the same encoding to both X86_32 and X86_64; assumes configuration (e.g. REX, operand binding) has already happened.
+    fn enc_32_64_func<T>(
+        &mut self,
+        inst: impl Clone + Into<InstSpec>,
+        template: Template,
+        builder_closure: T,
+    ) where
+        T: FnOnce(EncodingBuilder) -> EncodingBuilder,
+    {
+        let encoding = self.make_encoding(inst.into(), template, builder_closure);
+        self.enc32.push(encoding.clone());
+        self.enc64.push(encoding);
+    }
+
     /// Add the same encoding to both X86_32 and X86_64; assumes configuration (e.g. REX, operand
     /// binding) has already happened.
     fn enc_32_64_maybe_isap(
@@ -642,6 +656,7 @@ pub(crate) fn define(
     let rec_urm_noflags = r.template("urm_noflags");
     let rec_urm_noflags_abcd = r.template("urm_noflags_abcd");
     let rec_vconst = r.template("vconst");
+    let rec_vconst_optimized = r.template("vconst_optimized");
 
     // Predicates shorthands.
     let all_ones_funcaddrs_and_not_is_pic =
@@ -1671,7 +1686,7 @@
     );
     e.enc_x86_64_instp(
         f64const,
-        rec_f64imm_z.opcodes(vec![0x66, 0x0f, 0x57]),
+        rec_f64imm_z.opcodes(vec![0x66, 0x0f, 0x57]), // XORPD from SSE2
         is_zero_64_bit_float,
     );
 
@@ -1946,6 +1961,32 @@
         }
     }
 
+    // SIMD vconst for special cases (all zeroes, all ones); these must be encoded
+    // before the MOVUPS implementation (below) so that the compiler sees these
+    // encodings first.
+    for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
+        let f_unary_const = formats.get(formats.by_name("UnaryConst"));
+        let instruction = vconst.bind_vector_from_lane(ty, sse_vector_size);
+
+        let is_zero_128bit =
+            InstructionPredicate::new_is_all_zeroes_128bit(f_unary_const, "constant_handle");
+        let template = rec_vconst_optimized
+            .nonrex()
+            .opcodes(vec![0x66, 0x0f, 0xef]); // PXOR from SSE2
+        e.enc_32_64_func(instruction.clone(), template, |builder| {
+            builder.inst_predicate(is_zero_128bit)
+        });
+
+        let is_ones_128bit =
+            InstructionPredicate::new_is_all_ones_128bit(f_unary_const, "constant_handle");
+        let template = rec_vconst_optimized
+            .nonrex()
+            .opcodes(vec![0x66, 0x0f, 0x74]); // PCMPEQB from SSE2
+        e.enc_32_64_func(instruction, template, |builder| {
+            builder.inst_predicate(is_ones_128bit)
+        });
+    }
+
     // SIMD vconst using MOVUPS
     // TODO it would be ideal if eventually this became the more efficient MOVAPS but we would have
     // to guarantee that the constants are aligned when emitted and there is currently no mechanism
diff --git a/cranelift/codegen/meta/src/isa/x86/recipes.rs b/cranelift/codegen/meta/src/isa/x86/recipes.rs
index b30a35e3f5..b9a10c86f6 100644
--- a/cranelift/codegen/meta/src/isa/x86/recipes.rs
+++ b/cranelift/codegen/meta/src/isa/x86/recipes.rs
@@ -2449,6 +2449,18 @@ pub(crate) fn define<'shared>(
         ),
     );
 
+    recipes.add_template_recipe(
+        EncodingRecipeBuilder::new("vconst_optimized", f_unary_const, 1)
+            .operands_out(vec![fpr])
+            .clobbers_flags(false)
+            .emit(
+                r#"
+                    {{PUT_OP}}(bits, rex2(out_reg0, out_reg0), sink);
+                    modrm_rr(out_reg0, out_reg0, sink);
+                "#,
+            ),
+    );
+
     recipes.add_template_recipe(
         EncodingRecipeBuilder::new("jt_base", f_branch_table_base, 5)
             .operands_out(vec![gpr])
diff --git a/cranelift/codegen/src/predicates.rs b/cranelift/codegen/src/predicates.rs
index f900546111..16672b145b 100644
--- a/cranelift/codegen/src/predicates.rs
+++ b/cranelift/codegen/src/predicates.rs
@@ -31,6 +31,18 @@ pub fn is_zero_32_bit_float<T: Into<Ieee32>>(x: T) -> bool {
     x32.bits() == 0
 }
 
+/// Check that a 128-bit vector contains all zeroes.
+#[allow(dead_code)]
+pub fn is_all_zeroes_128_bit<'b, T: PartialEq<&'b [u8; 16]>>(x: T) -> bool {
+    x.eq(&&[0; 16])
+}
+
+/// Check that a 128-bit vector contains all ones.
+#[allow(dead_code)]
+pub fn is_all_ones_128_bit<'b, T: PartialEq<&'b [u8; 16]>>(x: T) -> bool {
+    x.eq(&&[0xff; 16])
+}
+
 /// Check that `x` is the same as `y`.
 #[allow(dead_code)]
 pub fn is_equal<T: Eq + Copy, O: Into<T> + Copy>(x: T, y: O) -> bool {
@@ -109,4 +121,19 @@ mod tests {
         assert!(!is_signed_int(x1, 16, 4));
         assert!(!is_signed_int(x2, 16, 4));
     }
+
+    #[test]
+    fn is_all_zeroes() {
+        assert!(is_all_zeroes_128_bit(&[0; 16]));
+        assert!(is_all_zeroes_128_bit(vec![
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+        ]));
+        assert!(!is_all_zeroes_128_bit(&[1; 16]));
+    }
+
+    #[test]
+    fn is_all_ones() {
+        assert!(!is_all_ones_128_bit(&[0; 16]));
+        assert!(is_all_ones_128_bit(&[0xff; 16]));
+    }
 }
diff --git a/cranelift/filetests/filetests/isa/x86/vconst-binemit.clif b/cranelift/filetests/filetests/isa/x86/vconst-binemit.clif
index b07dc0fd4e..2d6f862679 100644
--- a/cranelift/filetests/filetests/isa/x86/vconst-binemit.clif
+++ b/cranelift/filetests/filetests/isa/x86/vconst-binemit.clif
@@ -5,7 +5,7 @@ target x86_64
 
 function %test_vconst_b8() {
 ebb0:
-[-, %xmm2] v0 = vconst.b8x16 0x00 ; bin: 0f 10 15 00000008 PCRelRodata4(15)
-[-, %xmm3] v1 = vconst.b8x16 0x01 ; bin: 0f 10 1d 00000011 PCRelRodata4(31)
+[-, %xmm2] v0 = vconst.b8x16 0x01 ; bin: 0f 10 15 00000008 PCRelRodata4(15)
+[-, %xmm3] v1 = vconst.b8x16 0x02 ; bin: 0f 10 1d 00000011 PCRelRodata4(31)
     return
 }
diff --git a/cranelift/filetests/filetests/isa/x86/vconst-opt-run.clif b/cranelift/filetests/filetests/isa/x86/vconst-opt-run.clif
new file mode 100644
index 0000000000..487ff4f844
--- /dev/null
+++ b/cranelift/filetests/filetests/isa/x86/vconst-opt-run.clif
@@ -0,0 +1,23 @@
+test run
+set enable_simd
+target x86_64
+
+; TODO move to vconst-run.clif
+
+function %test_vconst_zeroes() -> b1 {
+ebb0:
+    v0 = vconst.i8x16 0x00
+    v1 = extractlane v0, 4
+    v2 = icmp_imm eq v1, 0
+    return v2
+}
+; run
+
+function %test_vconst_ones() -> b1 {
+ebb0:
+    v0 = vconst.i8x16 0xffffffffffffffffffffffffffffffff
+    v1 = extractlane v0, 2
+    v2 = icmp_imm eq v1, 0xff
+    return v2
+}
+; run
diff --git a/cranelift/filetests/filetests/isa/x86/vconst-opt.clif b/cranelift/filetests/filetests/isa/x86/vconst-opt.clif
new file mode 100644
index 0000000000..4daeed8abe
--- /dev/null
+++ b/cranelift/filetests/filetests/isa/x86/vconst-opt.clif
@@ -0,0 +1,12 @@
+test binemit
+set enable_simd
+target x86_64
+
+; TODO move to vconst-compile.clif or vconst-binemit.clif
+
+function %test_vconst_optimizations() {
+ebb0:
+[-, %xmm4] v0 = vconst.b8x16 0x00 ; bin: 66 0f ef e4
+[-, %xmm7] v1 = vconst.b8x16 0xffffffffffffffffffffffffffffffff ; bin: 66 0f 74 ff
+    return
+}
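
Why both new encodings can reuse the destination register for both operands (the
`vconst_optimized` recipe emits `modrm_rr(out_reg0, out_reg0, sink)`): PXOR of a
register with itself always yields zero (x ^ x == 0), and PCMPEQB of a register
with itself sets every byte lane to 0xff because each byte compares equal to
itself, so neither case has to read the constant from memory. The standalone
Rust sketch below is illustration only (it is not part of the patch; the helper
names are hypothetical) and models the byte-wise semantics of the two
instructions:

// Models PXOR: bitwise XOR of two 128-bit values, one byte lane at a time.
fn pxor(a: [u8; 16], b: [u8; 16]) -> [u8; 16] {
    let mut out = [0u8; 16];
    for i in 0..16 {
        out[i] = a[i] ^ b[i];
    }
    out
}

// Models PCMPEQB: per-byte equality compare; equal lanes become 0xff, unequal lanes 0x00.
fn pcmpeqb(a: [u8; 16], b: [u8; 16]) -> [u8; 16] {
    let mut out = [0u8; 16];
    for i in 0..16 {
        out[i] = if a[i] == b[i] { 0xff } else { 0x00 };
    }
    out
}

fn main() {
    let reg = [0xab; 16]; // arbitrary prior register contents
    assert_eq!(pxor(reg, reg), [0x00; 16]); // PXOR xmm, xmm -> all zeroes
    assert_eq!(pcmpeqb(reg, reg), [0xff; 16]); // PCMPEQB xmm, xmm -> all ones
}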