Optimize vconst for x86 when immediate contains all zeroes or ones

Instead of using MOVUPS to expensively load bits from memory, this change uses a predicate to optimize vconst without a memory access: - when the 128-bit immediate is all zeroes in all bits, use PXOR to zero out an XMM register - when the 128-bit immediate is all ones in all bits, use PCMPEQB to set an XMM register to all ones This leaves the constant data in the constant pool, which may increase code size (TODO)
2019-08-28 15:29:40 -07:00
parent 694de912a5
commit 702155b19b
7 changed files with 154 additions and 3 deletions
--- a/cranelift/codegen/meta/src/cdsl/instructions.rs
+++ b/cranelift/codegen/meta/src/cdsl/instructions.rs
@@ -614,6 +614,12 @@ pub enum FormatPredicateKind {
    /// Is the immediate format field member equal to zero? (float64 version)
    IsZero64BitFloat,

+    /// Is the immediate format field member equal zero in all lanes?
+    IsAllZeroes128Bit,
+
+    /// Does the immediate format field member have ones in all bits of all lanes?
+    IsAllOnes128Bit,
+
    /// Has the value list (in member_name) the size specified in parameter?
    LengthEquals(usize),

@@ -690,6 +696,14 @@ impl FormatPredicateNode {
            FormatPredicateKind::IsZero64BitFloat => {
                format!("predicates::is_zero_64_bit_float({})", self.member_name)
            }
+            FormatPredicateKind::IsAllZeroes128Bit => format!(
+                "predicates::is_all_zeroes_128_bit(func.dfg.constants.get({}))",
+                self.member_name
+            ),
+            FormatPredicateKind::IsAllOnes128Bit => format!(
+                "predicates::is_all_ones_128_bit(func.dfg.constants.get({}))",
+                self.member_name
+            ),
            FormatPredicateKind::LengthEquals(num) => format!(
                "predicates::has_length_of({}, {}, func)",
                self.member_name, num
@@ -929,6 +943,28 @@ impl InstructionPredicate {
        ))
    }

+    pub fn new_is_all_zeroes_128bit(
+        format: &InstructionFormat,
+        field_name: &'static str,
+    ) -> InstructionPredicateNode {
+        InstructionPredicateNode::FormatPredicate(FormatPredicateNode::new(
+            format,
+            field_name,
+            FormatPredicateKind::IsAllZeroes128Bit,
+        ))
+    }
+
+    pub fn new_is_all_ones_128bit(
+        format: &InstructionFormat,
+        field_name: &'static str,
+    ) -> InstructionPredicateNode {
+        InstructionPredicateNode::FormatPredicate(FormatPredicateNode::new(
+            format,
+            field_name,
+            FormatPredicateKind::IsAllOnes128Bit,
+        ))
+    }
+
    pub fn new_length_equals(format: &InstructionFormat, size: usize) -> InstructionPredicateNode {
        assert!(
            format.has_value_list,
--- a/cranelift/codegen/meta/src/isa/x86/encodings.rs
+++ b/cranelift/codegen/meta/src/isa/x86/encodings.rs
@@ -310,6 +310,20 @@ impl PerCpuModeEncodings {
        self.enc64_rec(inst, recipe, bits);
    }

+    /// Add the same encoding to both X86_32 and X86_64; assumes configuration (e.g. REX, operand binding) has already happened
+    fn enc_32_64_func<T>(
+        &mut self,
+        inst: impl Clone + Into<InstSpec>,
+        template: Template,
+        builder_closure: T,
+    ) where
+        T: FnOnce(EncodingBuilder) -> EncodingBuilder,
+    {
+        let encoding = self.make_encoding(inst.into(), template, builder_closure);
+        self.enc32.push(encoding.clone());
+        self.enc64.push(encoding);
+    }
+
    /// Add the same encoding to both X86_32 and X86_64; assumes configuration (e.g. REX, operand
    /// binding) has already happened.
    fn enc_32_64_maybe_isap(
@@ -642,6 +656,7 @@ pub(crate) fn define(
    let rec_urm_noflags = r.template("urm_noflags");
    let rec_urm_noflags_abcd = r.template("urm_noflags_abcd");
    let rec_vconst = r.template("vconst");
+    let rec_vconst_optimized = r.template("vconst_optimized");

    // Predicates shorthands.
    let all_ones_funcaddrs_and_not_is_pic =
@@ -1671,7 +1686,7 @@ pub(crate) fn define(
    );
    e.enc_x86_64_instp(
        f64const,
-        rec_f64imm_z.opcodes(vec![0x66, 0x0f, 0x57]),
+        rec_f64imm_z.opcodes(vec![0x66, 0x0f, 0x57]), // XORPD from SSE2
        is_zero_64_bit_float,
    );

@@ -1946,6 +1961,32 @@ pub(crate) fn define(
        }
    }

+    // SIMD vconst for special cases (all zeroes, all ones)
+    // this must be encoded prior to the MOVUPS implementation (below) so the compiler sees this
+    // encoding first
+    for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
+        let f_unary_const = formats.get(formats.by_name("UnaryConst"));
+        let instruction = vconst.bind_vector_from_lane(ty, sse_vector_size);
+
+        let is_zero_128bit =
+            InstructionPredicate::new_is_all_zeroes_128bit(f_unary_const, "constant_handle");
+        let template = rec_vconst_optimized
+            .nonrex()
+            .opcodes(vec![0x66, 0x0f, 0xef]); // PXOR from SSE2
+        e.enc_32_64_func(instruction.clone(), template, |builder| {
+            builder.inst_predicate(is_zero_128bit)
+        });
+
+        let is_ones_128bit =
+            InstructionPredicate::new_is_all_ones_128bit(f_unary_const, "constant_handle");
+        let template = rec_vconst_optimized
+            .nonrex()
+            .opcodes(vec![0x66, 0x0f, 0x74]); // PCMPEQB from SSE2
+        e.enc_32_64_func(instruction, template, |builder| {
+            builder.inst_predicate(is_ones_128bit)
+        });
+    }
+
    // SIMD vconst using MOVUPS
    // TODO it would be ideal if eventually this became the more efficient MOVAPS but we would have
    // to guarantee that the constants are aligned when emitted and there is currently no mechanism
--- a/cranelift/codegen/meta/src/isa/x86/recipes.rs
+++ b/cranelift/codegen/meta/src/isa/x86/recipes.rs
@@ -2449,6 +2449,18 @@ pub(crate) fn define<'shared>(
            ),
    );

+    recipes.add_template_recipe(
+        EncodingRecipeBuilder::new("vconst_optimized", f_unary_const, 1)
+            .operands_out(vec![fpr])
+            .clobbers_flags(false)
+            .emit(
+                r#"
+                    {{PUT_OP}}(bits, rex2(out_reg0, out_reg0), sink);
+                    modrm_rr(out_reg0, out_reg0, sink);
+                "#,
+            ),
+    );
+
    recipes.add_template_recipe(
        EncodingRecipeBuilder::new("jt_base", f_branch_table_base, 5)
            .operands_out(vec![gpr])
--- a/cranelift/codegen/src/predicates.rs
+++ b/cranelift/codegen/src/predicates.rs
@@ -31,6 +31,18 @@ pub fn is_zero_32_bit_float<T: Into<ir::immediates::Ieee32>>(x: T) -> bool {
    x32.bits() == 0
 }

+/// Check that a 128-bit vector contains all zeroes.
+#[allow(dead_code)]
+pub fn is_all_zeroes_128_bit<'b, T: PartialEq<&'b [u8; 16]>>(x: T) -> bool {
+    x.eq(&&[0; 16])
+}
+
+/// Check that a 128-bit vector contains all ones.
+#[allow(dead_code)]
+pub fn is_all_ones_128_bit<'b, T: PartialEq<&'b [u8; 16]>>(x: T) -> bool {
+    x.eq(&&[0xff; 16])
+}
+
 /// Check that `x` is the same as `y`.
 #[allow(dead_code)]
 pub fn is_equal<T: Eq + Copy, O: Into<T> + Copy>(x: T, y: O) -> bool {
@@ -109,4 +121,19 @@ mod tests {
        assert!(!is_signed_int(x1, 16, 4));
        assert!(!is_signed_int(x2, 16, 4));
    }
+
+    #[test]
+    fn is_all_zeroes() {
+        assert!(is_all_zeroes_128_bit(&[0; 16]));
+        assert!(is_all_zeroes_128_bit(vec![
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+        ]));
+        assert!(!is_all_zeroes_128_bit(&[1; 16]));
+    }
+
+    #[test]
+    fn is_all_ones() {
+        assert!(!is_all_ones_128_bit(&[0; 16]));
+        assert!(is_all_ones_128_bit(&[0xff; 16]));
+    }
 }
--- a/cranelift/filetests/filetests/isa/x86/vconst-binemit.clif
+++ b/cranelift/filetests/filetests/isa/x86/vconst-binemit.clif
@@ -5,7 +5,7 @@ target x86_64

 function %test_vconst_b8() {
 ebb0:
-[-, %xmm2]  v0 = vconst.b8x16 0x00  ; bin: 0f 10 15 00000008 PCRelRodata4(15)
-[-, %xmm3]  v1 = vconst.b8x16 0x01  ; bin: 0f 10 1d 00000011 PCRelRodata4(31)
+[-, %xmm2]  v0 = vconst.b8x16 0x01  ; bin: 0f 10 15 00000008 PCRelRodata4(15)
+[-, %xmm3]  v1 = vconst.b8x16 0x02  ; bin: 0f 10 1d 00000011 PCRelRodata4(31)
            return
 }
--- a/cranelift/filetests/filetests/isa/x86/vconst-opt-run.clif
+++ b/cranelift/filetests/filetests/isa/x86/vconst-opt-run.clif
@@ -0,0 +1,23 @@
+test run
+set enable_simd
+target x86_64
+
+; TODO move to vconst-run.clif
+
+function %test_vconst_zeroes() -> b1 {
+ebb0:
+    v0 = vconst.i8x16 0x00
+    v1 = extractlane v0, 4
+    v2 = icmp_imm eq v1, 0
+    return v2
+}
+; run
+
+function %test_vconst_ones() -> b1 {
+ebb0:
+    v0 = vconst.i8x16 0xffffffffffffffffffffffffffffffff
+    v1 = extractlane v0, 2
+    v2 = icmp_imm eq v1, 0xff
+    return v2
+}
+; run
--- a/cranelift/filetests/filetests/isa/x86/vconst-opt.clif
+++ b/cranelift/filetests/filetests/isa/x86/vconst-opt.clif
@@ -0,0 +1,12 @@
+test binemit
+set enable_simd
+target x86_64
+
+; TODO move to vconst-compile.clif or vconst-binemit.clif
+
+function %test_vconst_optimizations() {
+ebb0:
+[-, %xmm4]  v0 = vconst.b8x16 0x00                                  ; bin: 66 0f ef e4
+[-, %xmm7]  v1 = vconst.b8x16 0xffffffffffffffffffffffffffffffff    ; bin: 66 0f 74 ff
+            return
+}