From 99380fad1af9c029ae2b77c083eb123bc2dc429b Mon Sep 17 00:00:00 2001
From: Andy Wortman <me@iximeow.net>
Date: Mon, 16 Sep 2019 14:35:55 +0000
Subject: [PATCH] Use 'xor r, r' to set registers to 0 instead of mov (#766)

---
 .../codegen/meta/src/cdsl/instructions.rs     | 16 +++++++
 .../codegen/meta/src/isa/x86/encodings.rs     | 30 +++++++++++++
 cranelift/codegen/meta/src/isa/x86/recipes.rs | 12 +++++
 cranelift/codegen/src/predicates.rs           |  6 +++
 .../floating-point-zero-constants-32bit.clif  | 17 +++++++
 .../x86/floating-point-zero-constants.clif    | 31 +++++++++++++
 .../filetests/isa/x86/isub_imm-i8.clif        |  8 ++--
 .../x86/optimized-zero-constants-32bit.clif   | 36 ++++++++++++++-
 .../isa/x86/optimized-zero-constants.clif     | 44 ++++++++++++++++++-
 9 files changed, 193 insertions(+), 7 deletions(-)
 create mode 100644 cranelift/filetests/filetests/isa/x86/floating-point-zero-constants-32bit.clif
 create mode 100644 cranelift/filetests/filetests/isa/x86/floating-point-zero-constants.clif

diff --git a/cranelift/codegen/meta/src/cdsl/instructions.rs b/cranelift/codegen/meta/src/cdsl/instructions.rs
index f061be94ed..2737099cc4 100644
--- a/cranelift/codegen/meta/src/cdsl/instructions.rs
+++ b/cranelift/codegen/meta/src/cdsl/instructions.rs
@@ -606,6 +606,8 @@ pub enum FormatPredicateKind {
     /// `2^scale`.
     IsUnsignedInt(usize, usize),
 
+    /// Is the immediate format field member an integer equal to zero?
+    IsZeroInt,
     /// Is the immediate format field member equal to zero? (float32 version)
     IsZero32BitFloat,
 
@@ -679,6 +681,9 @@ impl FormatPredicateNode {
                 "predicates::is_unsigned_int({}, {}, {})",
                 self.member_name, width, scale
             ),
+            FormatPredicateKind::IsZeroInt => {
+                format!("predicates::is_zero_int({})", self.member_name)
+            }
             FormatPredicateKind::IsZero32BitFloat => {
                 format!("predicates::is_zero_32_bit_float({})", self.member_name)
             }
@@ -891,6 +896,17 @@ impl InstructionPredicate {
         ))
     }
 
+    pub fn new_is_zero_int(
+        format: &InstructionFormat,
+        field_name: &'static str,
+    ) -> InstructionPredicateNode {
+        InstructionPredicateNode::FormatPredicate(FormatPredicateNode::new(
+            format,
+            field_name,
+            FormatPredicateKind::IsZeroInt,
+        ))
+    }
+
     pub fn new_is_zero_32bit_float(
         format: &InstructionFormat,
         field_name: &'static str,
diff --git a/cranelift/codegen/meta/src/isa/x86/encodings.rs b/cranelift/codegen/meta/src/isa/x86/encodings.rs
index f16ca93717..9bdb751b26 100644
--- a/cranelift/codegen/meta/src/isa/x86/encodings.rs
+++ b/cranelift/codegen/meta/src/isa/x86/encodings.rs
@@ -614,6 +614,7 @@ pub(crate) fn define(
     let rec_trapif = r.recipe("trapif");
     let rec_trapff = r.recipe("trapff");
     let rec_u_id = r.template("u_id");
+    let rec_u_id_z = r.template("u_id_z");
     let rec_umr = r.template("umr");
     let rec_umr_reg_to_ssa = r.template("umr_reg_to_ssa");
     let rec_ur = r.template("ur");
@@ -750,6 +751,35 @@ pub(crate) fn define(
     }
     e.enc64(bconst.bind(B64), rec_pu_id_bool.opcodes(vec![0xb8]).rex());
 
+    let is_zero_int = InstructionPredicate::new_is_zero_int(f_unary_imm, "imm");
+    e.enc_both_instp(
+        iconst.bind(I8),
+        rec_u_id_z.opcodes(vec![0x30]),
+        is_zero_int.clone(),
+    );
+    // You may expect that i16 encodings would have a 0x66 prefix on the opcode to indicate that
+    // the instruction operates on 16-bit operands (e.g. "xor %ax, %ax"). Cranelift currently does
+    // not know that it can drop the 0x66 prefix and clear the upper half of a 32-bit register in
+    // these scenarios, so we explicitly select a wider but permissible opcode.
+    //
+    // This effectively formalizes the i16->i32 widening that Cranelift performs when there isn't
+    // an appropriate i16 encoding available.
+    e.enc_both_instp(
+        iconst.bind(I16),
+        rec_u_id_z.opcodes(vec![0x31]),
+        is_zero_int.clone(),
+    );
+    e.enc_both_instp(
+        iconst.bind(I32),
+        rec_u_id_z.opcodes(vec![0x31]),
+        is_zero_int.clone(),
+    );
+    e.enc_x86_64_instp(
+        iconst.bind(I64),
+        rec_u_id_z.opcodes(vec![0x31]),
+        is_zero_int,
+    );
+
     // Shifts and rotates.
     // Note that the dynamic shift amount is only masked by 5 or 6 bits; the 8-bit
     // and 16-bit shifts would need explicit masking.
diff --git a/cranelift/codegen/meta/src/isa/x86/recipes.rs b/cranelift/codegen/meta/src/isa/x86/recipes.rs
index 8176effc42..11a9972d98 100644
--- a/cranelift/codegen/meta/src/isa/x86/recipes.rs
+++ b/cranelift/codegen/meta/src/isa/x86/recipes.rs
@@ -1023,6 +1023,18 @@ pub(crate) fn define<'shared>(
             ),
     );
 
+    // XX /r unary with zero immediate: the output register is used for both ModR/M operands.
+    recipes.add_template_recipe(
+        EncodingRecipeBuilder::new("u_id_z", f_unary_imm, 1)
+            .operands_out(vec![gpr])
+            .emit(
+                r#"
+                    {{PUT_OP}}(bits, rex2(out_reg0, out_reg0), sink);
+                    modrm_rr(out_reg0, out_reg0, sink);
+                "#,
+            ),
+    );
+
     // XX /n Unary with floating point 32-bit immediate equal to zero.
     {
         let format = formats.get(f_unary_ieee32);
diff --git a/cranelift/codegen/src/predicates.rs b/cranelift/codegen/src/predicates.rs
index 978e733c66..f900546111 100644
--- a/cranelift/codegen/src/predicates.rs
+++ b/cranelift/codegen/src/predicates.rs
@@ -11,6 +11,12 @@
 
 use crate::ir;
 
+/// Check that an integer value is zero.
+#[allow(dead_code)]
+pub fn is_zero_int<T: Into<i64>>(x: T) -> bool {
+    x.into() == 0
+}
+
 /// Check that a 64-bit floating point value is zero.
 #[allow(dead_code)]
 pub fn is_zero_64_bit_float<T: Into<ir::immediates::Ieee64>>(x: T) -> bool {
diff --git a/cranelift/filetests/filetests/isa/x86/floating-point-zero-constants-32bit.clif b/cranelift/filetests/filetests/isa/x86/floating-point-zero-constants-32bit.clif
new file mode 100644
index 0000000000..8021375558
--- /dev/null
+++ b/cranelift/filetests/filetests/isa/x86/floating-point-zero-constants-32bit.clif
@@ -0,0 +1,17 @@
+; Check that floating-point constants equal to zero are optimized correctly.
+test binemit
+target i686
+
+function %foo() -> f32 fast {
+ebb0:
+  ; asm: xorps %xmm0, %xmm0
+  [-,%xmm0]    v0 = f32const 0.0     ; bin: 0f 57 c0
+  return v0
+}
+
+function %bar() -> f64 fast {
+ebb0:
+  ; asm: xorpd %xmm0, %xmm0
+  [-,%xmm0]    v1 = f64const 0.0     ; bin: 66 0f 57 c0
+  return v1
+}
diff --git a/cranelift/filetests/filetests/isa/x86/floating-point-zero-constants.clif b/cranelift/filetests/filetests/isa/x86/floating-point-zero-constants.clif
new file mode 100644
index 0000000000..049320870e
--- /dev/null
+++ b/cranelift/filetests/filetests/isa/x86/floating-point-zero-constants.clif
@@ -0,0 +1,31 @@
+; Check that floating-point constants equal to zero are optimized correctly.
+test binemit
+target x86_64
+
+function %zero_const_32bit_no_rex() -> f32 fast {
+ebb0:
+  ; asm: xorps %xmm0, %xmm0
+  [-,%xmm0]    v0 = f32const 0.0     ; bin: 40 0f 57 c0
+  return v0
+}
+
+function %zero_const_32bit_rex() -> f32 fast {
+ebb0:
+  ; asm: xorps %xmm8, %xmm8
+  [-,%xmm8]    v1 = f32const 0.0     ; bin: 45 0f 57 c0
+  return v1
+}
+
+function %zero_const_64bit_no_rex() -> f64 fast {
+ebb0:
+  ; asm: xorpd %xmm0, %xmm0
+  [-,%xmm0]    v0 = f64const 0.0     ; bin: 66 40 0f 57 c0
+  return v0
+}
+
+function %zero_const_64bit_rex() -> f64 fast {
+ebb0:
+  ; asm: xorpd %xmm8, %xmm8
+  [-,%xmm8]    v1 = f64const 0.0     ; bin: 66 45 0f 57 c0
+  return v1
+}
diff --git a/cranelift/filetests/filetests/isa/x86/isub_imm-i8.clif b/cranelift/filetests/filetests/isa/x86/isub_imm-i8.clif
index 8958b1afa4..1ca70ebbbe 100644
--- a/cranelift/filetests/filetests/isa/x86/isub_imm-i8.clif
+++ b/cranelift/filetests/filetests/isa/x86/isub_imm-i8.clif
@@ -5,9 +5,9 @@ function u0:0(i8) -> i8 fast {
 ebb0(v0: i8):
     v1 = iconst.i8 0
     v2 = isub v1, v0
-    ; check: v4 = uextend.i32 v0
-    ; nextln: v6 = iconst.i32 0
-    ; nextln = isub v6, v4
-    ; nextln = ireduce.i8 v5
+    ; check: v3 = uextend.i32 v0
+    ; nextln: v5 = iconst.i32 0
+    ; nextln = isub v5, v3
+    ; nextln = ireduce.i8 v4
     return v2
 }
diff --git a/cranelift/filetests/filetests/isa/x86/optimized-zero-constants-32bit.clif b/cranelift/filetests/filetests/isa/x86/optimized-zero-constants-32bit.clif
index c90e455527..21f936c4b9 100644
--- a/cranelift/filetests/filetests/isa/x86/optimized-zero-constants-32bit.clif
+++ b/cranelift/filetests/filetests/isa/x86/optimized-zero-constants-32bit.clif
@@ -1,5 +1,6 @@
-; Check that floating-point constants equal to zero are optimized correctly.
+; Check that floating-point and integer constants equal to zero are optimized correctly.
 test binemit
+set opt_level=best
 target i686
 
 function %foo() -> f32 fast {
@@ -16,3 +17,36 @@ ebb0:
   return v1
 }
 
+function %zero_dword() -> i32 fast {
+ebb0:
+  ; asm: xor %eax, %eax
+  [-,%rax]     v0 = iconst.i32 0     ; bin: 31 c0
+  ; asm: xor %edi, %edi
+  [-,%rdi]     v1 = iconst.i32 0     ; bin: 31 ff
+  return v0
+}
+
+function %zero_word() -> i16 fast {
+ebb0:
+  ; while you may expect this to be encoded like 6631c0, aka
+  ; xor %ax, %ax, the upper 16 bits of the register used for
+  ; i16 are left undefined, so it's not wrong to clear them.
+  ;
+  ; discarding the 66 prefix is shorter, so this test expects
+  ; that we do so.
+  ;
+  ; asm: xor %eax, %eax
+  [-,%rax]     v0 = iconst.i16 0     ; bin: 31 c0
+  ; asm: xor %edi, %edi
+  [-,%rdi]     v1 = iconst.i16 0     ; bin: 31 ff
+  return v0
+}
+
+function %zero_byte() -> i8 fast {
+ebb0:
+  ; asm: xor %al, %al
+  [-,%rax]     v0 = iconst.i8 0     ; bin: 30 c0
+  ; asm: xor %bh, %bh  (8-bit register number 7 is %bh; %dh is number 6)
+  [-,%rdi]     v1 = iconst.i8 0     ; bin: 30 ff
+  return v0
+}
diff --git a/cranelift/filetests/filetests/isa/x86/optimized-zero-constants.clif b/cranelift/filetests/filetests/isa/x86/optimized-zero-constants.clif
index 44060e9b97..4a1ad00ff4 100644
--- a/cranelift/filetests/filetests/isa/x86/optimized-zero-constants.clif
+++ b/cranelift/filetests/filetests/isa/x86/optimized-zero-constants.clif
@@ -1,11 +1,12 @@
 ; Check that floating-point constants equal to zero are optimized correctly.
 test binemit
+set opt_level=best
 target x86_64
 
 function %zero_const_32bit_no_rex() -> f32 fast {
 ebb0:
   ; asm: xorps %xmm0, %xmm0
-  [-,%xmm0]    v0 = f32const 0.0     ; bin: 40 0f 57 c0
+  [-,%xmm0]    v0 = f32const 0.0     ; bin: 0f 57 c0
   return v0
 }
 
@@ -19,7 +20,7 @@ ebb0:
 function %zero_const_64bit_no_rex() -> f64 fast {
 ebb0:
   ; asm: xorpd %xmm0, %xmm0
-  [-,%xmm0]    v0 = f64const 0.0     ; bin: 66 40 0f 57 c0
+  [-,%xmm0]    v0 = f64const 0.0     ; bin: 66 0f 57 c0
   return v0
 }
 
@@ -30,3 +31,42 @@ ebb0:
   return v1
 }
 
+function %imm_zero_register() -> i64 fast {
+ebb0:
+  ; asm: xor %eax, %eax
+  [-,%rax]     v0 = iconst.i64 0     ; bin: 31 c0
+  ; asm: xor %edi, %edi
+  [-,%rdi]     v1 = iconst.i64 0     ; bin: 31 ff
+  ; asm: xor %r8d, %r8d
+  [-,%r8]      v2 = iconst.i64 0     ; bin: 45 31 c0
+  ; asm: xor %r15d, %r15d
+  [-,%r15]     v4 = iconst.i64 0     ; bin: 45 31 ff
+  return v0
+}
+
+function %zero_word() -> i16 fast {
+ebb0:
+  ; while you may expect this to be encoded like 6631c0, aka
+  ; xor %ax, %ax, the upper 16 bits of the register used for
+  ; i16 are left undefined, so it's not wrong to clear them.
+  ;
+  ; discarding the 66 prefix is shorter, so this test expects
+  ; that we do so.
+  ;
+  ; asm: xor %eax, %eax
+  [-,%rax]     v0 = iconst.i16 0     ; bin: 31 c0
+  ; asm: xor %edi, %edi
+  [-,%rdi]     v1 = iconst.i16 0     ; bin: 31 ff
+  return v0
+}
+
+function %zero_byte() -> i8 fast {
+ebb0:
+  ; asm: xor %r15b, %r15b
+  [-,%r15]     v0 = iconst.i8 0     ; bin: 45 30 ff
+  ; asm: xor %al, %al
+  [-,%rax]     v1 = iconst.i8 0     ; bin: 30 c0
+  ; asm: xor %bh, %bh  (without a REX prefix, 8-bit register number 7 is %bh, not %dil)
+  [-,%rdi]     v2 = iconst.i8 0     ; bin: 30 ff
+  return v0
+}