diff --git a/cranelift/codegen/meta/src/cdsl/instructions.rs b/cranelift/codegen/meta/src/cdsl/instructions.rs index 2737099cc4..fdc5ad8bf8 100644 --- a/cranelift/codegen/meta/src/cdsl/instructions.rs +++ b/cranelift/codegen/meta/src/cdsl/instructions.rs @@ -614,6 +614,12 @@ pub enum FormatPredicateKind { /// Is the immediate format field member equal to zero? (float64 version) IsZero64BitFloat, + /// Is the immediate format field member equal zero in all lanes? + IsAllZeroes128Bit, + + /// Does the immediate format field member have ones in all bits of all lanes? + IsAllOnes128Bit, + /// Has the value list (in member_name) the size specified in parameter? LengthEquals(usize), @@ -690,6 +696,14 @@ impl FormatPredicateNode { FormatPredicateKind::IsZero64BitFloat => { format!("predicates::is_zero_64_bit_float({})", self.member_name) } + FormatPredicateKind::IsAllZeroes128Bit => format!( + "predicates::is_all_zeroes_128_bit(func.dfg.constants.get({}))", + self.member_name + ), + FormatPredicateKind::IsAllOnes128Bit => format!( + "predicates::is_all_ones_128_bit(func.dfg.constants.get({}))", + self.member_name + ), FormatPredicateKind::LengthEquals(num) => format!( "predicates::has_length_of({}, {}, func)", self.member_name, num @@ -929,6 +943,28 @@ impl InstructionPredicate { )) } + pub fn new_is_all_zeroes_128bit( + format: &InstructionFormat, + field_name: &'static str, + ) -> InstructionPredicateNode { + InstructionPredicateNode::FormatPredicate(FormatPredicateNode::new( + format, + field_name, + FormatPredicateKind::IsAllZeroes128Bit, + )) + } + + pub fn new_is_all_ones_128bit( + format: &InstructionFormat, + field_name: &'static str, + ) -> InstructionPredicateNode { + InstructionPredicateNode::FormatPredicate(FormatPredicateNode::new( + format, + field_name, + FormatPredicateKind::IsAllOnes128Bit, + )) + } + pub fn new_length_equals(format: &InstructionFormat, size: usize) -> InstructionPredicateNode { assert!( format.has_value_list, diff --git a/cranelift/codegen/meta/src/isa/x86/encodings.rs b/cranelift/codegen/meta/src/isa/x86/encodings.rs index d8b070b69e..6c578c458a 100644 --- a/cranelift/codegen/meta/src/isa/x86/encodings.rs +++ b/cranelift/codegen/meta/src/isa/x86/encodings.rs @@ -310,6 +310,20 @@ impl PerCpuModeEncodings { self.enc64_rec(inst, recipe, bits); } + /// Add the same encoding to both X86_32 and X86_64; assumes configuration (e.g. REX, operand binding) has already happened + fn enc_32_64_func( + &mut self, + inst: impl Clone + Into, + template: Template, + builder_closure: T, + ) where + T: FnOnce(EncodingBuilder) -> EncodingBuilder, + { + let encoding = self.make_encoding(inst.into(), template, builder_closure); + self.enc32.push(encoding.clone()); + self.enc64.push(encoding); + } + /// Add the same encoding to both X86_32 and X86_64; assumes configuration (e.g. REX, operand /// binding) has already happened. fn enc_32_64_maybe_isap( @@ -642,6 +656,7 @@ pub(crate) fn define( let rec_urm_noflags = r.template("urm_noflags"); let rec_urm_noflags_abcd = r.template("urm_noflags_abcd"); let rec_vconst = r.template("vconst"); + let rec_vconst_optimized = r.template("vconst_optimized"); // Predicates shorthands. let all_ones_funcaddrs_and_not_is_pic = @@ -1671,7 +1686,7 @@ pub(crate) fn define( ); e.enc_x86_64_instp( f64const, - rec_f64imm_z.opcodes(vec![0x66, 0x0f, 0x57]), + rec_f64imm_z.opcodes(vec![0x66, 0x0f, 0x57]), // XORPD from SSE2 is_zero_64_bit_float, ); @@ -1946,6 +1961,32 @@ pub(crate) fn define( } } + // SIMD vconst for special cases (all zeroes, all ones) + // this must be encoded prior to the MOVUPS implementation (below) so the compiler sees this + // encoding first + for ty in ValueType::all_lane_types().filter(allowed_simd_type) { + let f_unary_const = formats.get(formats.by_name("UnaryConst")); + let instruction = vconst.bind_vector_from_lane(ty, sse_vector_size); + + let is_zero_128bit = + InstructionPredicate::new_is_all_zeroes_128bit(f_unary_const, "constant_handle"); + let template = rec_vconst_optimized + .nonrex() + .opcodes(vec![0x66, 0x0f, 0xef]); // PXOR from SSE2 + e.enc_32_64_func(instruction.clone(), template, |builder| { + builder.inst_predicate(is_zero_128bit) + }); + + let is_ones_128bit = + InstructionPredicate::new_is_all_ones_128bit(f_unary_const, "constant_handle"); + let template = rec_vconst_optimized + .nonrex() + .opcodes(vec![0x66, 0x0f, 0x74]); // PCMPEQB from SSE2 + e.enc_32_64_func(instruction, template, |builder| { + builder.inst_predicate(is_ones_128bit) + }); + } + // SIMD vconst using MOVUPS // TODO it would be ideal if eventually this became the more efficient MOVAPS but we would have // to guarantee that the constants are aligned when emitted and there is currently no mechanism diff --git a/cranelift/codegen/meta/src/isa/x86/recipes.rs b/cranelift/codegen/meta/src/isa/x86/recipes.rs index b30a35e3f5..b9a10c86f6 100644 --- a/cranelift/codegen/meta/src/isa/x86/recipes.rs +++ b/cranelift/codegen/meta/src/isa/x86/recipes.rs @@ -2449,6 +2449,18 @@ pub(crate) fn define<'shared>( ), ); + recipes.add_template_recipe( + EncodingRecipeBuilder::new("vconst_optimized", f_unary_const, 1) + .operands_out(vec![fpr]) + .clobbers_flags(false) + .emit( + r#" + {{PUT_OP}}(bits, rex2(out_reg0, out_reg0), sink); + modrm_rr(out_reg0, out_reg0, sink); + "#, + ), + ); + recipes.add_template_recipe( EncodingRecipeBuilder::new("jt_base", f_branch_table_base, 5) .operands_out(vec![gpr]) diff --git a/cranelift/codegen/src/predicates.rs b/cranelift/codegen/src/predicates.rs index f900546111..16672b145b 100644 --- a/cranelift/codegen/src/predicates.rs +++ b/cranelift/codegen/src/predicates.rs @@ -31,6 +31,18 @@ pub fn is_zero_32_bit_float>(x: T) -> bool { x32.bits() == 0 } +/// Check that a 128-bit vector contains all zeroes. +#[allow(dead_code)] +pub fn is_all_zeroes_128_bit<'b, T: PartialEq<&'b [u8; 16]>>(x: T) -> bool { + x.eq(&&[0; 16]) +} + +/// Check that a 128-bit vector contains all ones. +#[allow(dead_code)] +pub fn is_all_ones_128_bit<'b, T: PartialEq<&'b [u8; 16]>>(x: T) -> bool { + x.eq(&&[0xff; 16]) +} + /// Check that `x` is the same as `y`. #[allow(dead_code)] pub fn is_equal + Copy>(x: T, y: O) -> bool { @@ -109,4 +121,19 @@ mod tests { assert!(!is_signed_int(x1, 16, 4)); assert!(!is_signed_int(x2, 16, 4)); } + + #[test] + fn is_all_zeroes() { + assert!(is_all_zeroes_128_bit(&[0; 16])); + assert!(is_all_zeroes_128_bit(vec![ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + ])); + assert!(!is_all_zeroes_128_bit(&[1; 16])); + } + + #[test] + fn is_all_ones() { + assert!(!is_all_ones_128_bit(&[0; 16])); + assert!(is_all_ones_128_bit(&[0xff; 16])); + } } diff --git a/cranelift/filetests/filetests/isa/x86/vconst-binemit.clif b/cranelift/filetests/filetests/isa/x86/vconst-binemit.clif index b07dc0fd4e..2d6f862679 100644 --- a/cranelift/filetests/filetests/isa/x86/vconst-binemit.clif +++ b/cranelift/filetests/filetests/isa/x86/vconst-binemit.clif @@ -5,7 +5,7 @@ target x86_64 function %test_vconst_b8() { ebb0: -[-, %xmm2] v0 = vconst.b8x16 0x00 ; bin: 0f 10 15 00000008 PCRelRodata4(15) -[-, %xmm3] v1 = vconst.b8x16 0x01 ; bin: 0f 10 1d 00000011 PCRelRodata4(31) +[-, %xmm2] v0 = vconst.b8x16 0x01 ; bin: 0f 10 15 00000008 PCRelRodata4(15) +[-, %xmm3] v1 = vconst.b8x16 0x02 ; bin: 0f 10 1d 00000011 PCRelRodata4(31) return } diff --git a/cranelift/filetests/filetests/isa/x86/vconst-opt-run.clif b/cranelift/filetests/filetests/isa/x86/vconst-opt-run.clif new file mode 100644 index 0000000000..487ff4f844 --- /dev/null +++ b/cranelift/filetests/filetests/isa/x86/vconst-opt-run.clif @@ -0,0 +1,23 @@ +test run +set enable_simd +target x86_64 + +; TODO move to vconst-run.clif + +function %test_vconst_zeroes() -> b1 { +ebb0: + v0 = vconst.i8x16 0x00 + v1 = extractlane v0, 4 + v2 = icmp_imm eq v1, 0 + return v2 +} +; run + +function %test_vconst_ones() -> b1 { +ebb0: + v0 = vconst.i8x16 0xffffffffffffffffffffffffffffffff + v1 = extractlane v0, 2 + v2 = icmp_imm eq v1, 0xff + return v2 +} +; run diff --git a/cranelift/filetests/filetests/isa/x86/vconst-opt.clif b/cranelift/filetests/filetests/isa/x86/vconst-opt.clif new file mode 100644 index 0000000000..4daeed8abe --- /dev/null +++ b/cranelift/filetests/filetests/isa/x86/vconst-opt.clif @@ -0,0 +1,12 @@ +test binemit +set enable_simd +target x86_64 + +; TODO move to vconst-compile.clif or vconst-binemit.clif + +function %test_vconst_optimizations() { +ebb0: +[-, %xmm4] v0 = vconst.b8x16 0x00 ; bin: 66 0f ef e4 +[-, %xmm7] v1 = vconst.b8x16 0xffffffffffffffffffffffffffffffff ; bin: 66 0f 74 ff + return +}