x64: Fix load sinking bugs in new lowerings (#4740)

Fixes #4736

Fix lowerings that used a value as both a Reg and a RegMem, which made it look like a load could be sunk even while its value was still being used from a register. Also add asserts checking that sunk loads are never used again.
Author: Trevor Elliott
Date: 2022-08-19 14:21:06 -07:00
Committed by: GitHub
Parent: fd98814b96
Commit: 754cf7156a
3 changed files with 57 additions and 16 deletions
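A note on the mechanism (a sketch for context; the explanatory comments are added here, not part of the commit): in these ISLE rules a `Value` is implicitly convertible both to a plain register (`Gpr`/`Xmm`) and to a register-or-memory operand such as `RegMemImm` (seen in the hunks below), and it is the register-or-memory conversion that allows a unique defining load to be sunk into the consuming instruction. Using the same value in both positions therefore lets the load be sunk into the memory slot while the register operand still reads a register that is never written. The first hunk below shows both forms:

;; Before: `val` is converted twice. The register-or-memory operand may
;; sink `val`'s defining load, while the Gpr operand still expects the
;; load's result in a register.
(rule (cmp_zero_int_bool_ref val @ (value_type ty))
      (let ((size OperandSize (raw_operand_size_of_type ty)))
        (x64_test size val val)))

;; After: the value is put in a register exactly once and the binding is
;; reused, so the load is always emitted and nothing is sunk.
(rule (cmp_zero_int_bool_ref val @ (value_type ty))
      (let ((size OperandSize (raw_operand_size_of_type ty))
            (src Gpr val))
        (x64_test size src src)))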


@@ -2934,8 +2934,9 @@
 (rule (cmp_zero_int_bool_ref val @ (value_type $B1))
       (x64_test (OperandSize.Size8) (RegMemImm.Imm 1) val))
 (rule (cmp_zero_int_bool_ref val @ (value_type ty))
-      (let ((size OperandSize (raw_operand_size_of_type ty)))
-        (x64_test size val val)))
+      (let ((size OperandSize (raw_operand_size_of_type ty))
+            (src Gpr val))
+        (x64_test size src src)))
 
 ;; Rules for `bricmp` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -3036,8 +3037,10 @@
 ;; -> Convert(Ah) // Convert .. with no loss of significant digits from previous shift
 ;; -> Ah = Ah + Ah // Double Ah to account for shift right before the conversion.
 ;; -> dst = Ah + Al // Add the two floats together
-(rule (lower (has_type $F32X4 (fcvt_from_uint a)))
-      (let (;; get the low 16 bits
+(rule (lower (has_type $F32X4 (fcvt_from_uint val)))
+      (let ((a Xmm val)
+
+            ;; get the low 16 bits
             (a_lo Xmm (x64_pslld a (RegMemImm.Imm 16)))
             (a_lo Xmm (x64_psrld a_lo (RegMemImm.Imm 16)))
@@ -3072,9 +3075,11 @@
 ;; The x64 backend currently only supports these two type combinations.
 (rule (lower (has_type $I32X4 (fcvt_to_sint_sat val @ (value_type $F32X4))))
-      (let (;; Sets tmp to zero if float is NaN
-            (tmp Xmm (x64_cmpps val val (FcmpImm.Equal)))
-            (dst Xmm (x64_andps val tmp))
+      (let ((src Xmm val)
+
+            ;; Sets tmp to zero if float is NaN
+            (tmp Xmm (x64_cmpps src src (FcmpImm.Equal)))
+            (dst Xmm (x64_andps src tmp))
 
             ;; Sets top bit of tmp if float is positive
             ;; Setting up to set top bit on negative float values
@@ -3141,10 +3146,12 @@
 ;; |                        Step 6                        |                 Step 7                 |
 ;; | (0-(INT_MAX+1))..(UINT_MAX-(INT_MAX+1))(w/overflow)  | ((INT_MAX+1)-(INT_MAX+1))..(INT_MAX+1) |
 (rule (lower (has_type $I32X4 (fcvt_to_uint_sat val @ (value_type $F32X4))))
-      (let (;; Converting to unsigned int so if float src is negative or NaN
+      (let ((src Xmm val)
+
+            ;; Converting to unsigned int so if float src is negative or NaN
             ;; will first set to zero.
-            (tmp2 Xmm (x64_pxor val val)) ;; make a zero
-            (dst Xmm (x64_maxps val tmp2))
+            (tmp2 Xmm (x64_pxor src src)) ;; make a zero
+            (dst Xmm (x64_maxps src tmp2))
 
             ;; Set tmp2 to INT_MAX+1. It is important to note here that after it looks
             ;; like we are only converting INT_MAX (0x7FFFFFFF) but in fact because
@@ -3233,10 +3240,12 @@
 ;; Rules for `swiden_high` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (rule (lower (has_type $I16X8 (swiden_high val @ (value_type $I8X16))))
-      (x64_pmovsxbw (x64_palignr val val 8 (OperandSize.Size32))))
+      (let ((x Xmm val))
+        (x64_pmovsxbw (x64_palignr x x 8 (OperandSize.Size32)))))
 
 (rule (lower (has_type $I32X4 (swiden_high val @ (value_type $I16X8))))
-      (x64_pmovsxwd (x64_palignr val val 8 (OperandSize.Size32))))
+      (let ((x Xmm val))
+        (x64_pmovsxwd (x64_palignr x x 8 (OperandSize.Size32)))))
 
 (rule (lower (has_type $I64X2 (swiden_high val @ (value_type $I32X4))))
       (x64_pmovsxdq (x64_pshufd val 0xEE (OperandSize.Size32))))
@@ -3255,10 +3264,12 @@
 ;; Rules for `uwiden_high` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (rule (lower (has_type $I16X8 (uwiden_high val @ (value_type $I8X16))))
-      (x64_pmovzxbw (x64_palignr val val 8 (OperandSize.Size32))))
+      (let ((x Xmm val))
+        (x64_pmovzxbw (x64_palignr x x 8 (OperandSize.Size32)))))
 
 (rule (lower (has_type $I32X4 (uwiden_high val @ (value_type $I16X8))))
-      (x64_pmovzxwd (x64_palignr val val 8 (OperandSize.Size32))))
+      (let ((x Xmm val))
+        (x64_pmovzxwd (x64_palignr x x 8 (OperandSize.Size32)))))
 
 (rule (lower (has_type $I64X2 (uwiden_high val @ (value_type $I32X4))))
       (x64_pmovzxdq (x64_pshufd val 0xEE (OperandSize.Size32))))
@@ -3277,9 +3288,11 @@
 ;; This rule is a special case for handling the translation of the wasm op
 ;; `i32x4.trunc_sat_f64x2_s_zero`. It can be removed once we have an
 ;; implementation of `snarrow` for `I64X2`.
-(rule (lower (has_type $I32X4 (snarrow (has_type $I64X2 (fcvt_to_sint_sat a))
+(rule (lower (has_type $I32X4 (snarrow (has_type $I64X2 (fcvt_to_sint_sat val))
                                        (vconst (u128_from_constant 0)))))
-      (let (;; y = i32x4.trunc_sat_f64x2_s_zero(x) is lowered to:
+      (let ((a Xmm val)
+
+            ;; y = i32x4.trunc_sat_f64x2_s_zero(x) is lowered to:
             ;; MOVE xmm_tmp, xmm_x
             ;; CMPEQPD xmm_tmp, xmm_x
             ;; MOVE xmm_y, xmm_x
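The `I64X2` variants in the two widening sections above were left unchanged, and the contrast is instructive: there `val` is consumed exactly once, as the register-or-memory operand of `pshufd`, so if its defining load is sunk no dangling register use remains. Quoting the untouched rule from the context lines above (comment added):

;; Safe single-use pattern: `val` appears only once, so sinking its
;; defining load into `pshufd`'s memory operand leaves no other use.
(rule (lower (has_type $I64X2 (uwiden_high val @ (value_type $I32X4))))
      (x64_pmovzxdq (x64_pshufd val 0xEE (OperandSize.Size32))))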


@@ -1266,6 +1266,10 @@ impl<'func, I: VCodeInst> Lower<'func, I> {
         let ty = self.f.dfg.value_type(val);
         assert!(ty != IFLAGS && ty != FFLAGS);
 
+        if let Some(inst) = self.f.dfg.value_def(val).inst() {
+            assert!(!self.inst_sunk.contains(&inst));
+        }
+
         // If the value is a constant, then (re)materialize it at each use. This
         // lowers register pressure.
         if let Some(c) = self
@@ -1347,6 +1351,10 @@ impl<'func, I: VCodeInst> Lower<'func, I> {
         assert!(has_lowering_side_effect(self.f, ir_inst));
         assert!(self.cur_scan_entry_color.is_some());
 
+        for result in self.dfg().inst_results(ir_inst) {
+            assert!(self.value_lowered_uses[*result] == 0);
+        }
+
         let sunk_inst_entry_color = self
             .side_effect_inst_entry_colors
             .get(&ir_inst)
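These two asserts turn this class of bug from a silent miscompile into a panic: the first runs where a value is put into registers for use and fires if the value's defining instruction has already been sunk; the second runs where a side-effecting instruction is sunk and fires if any of its results already have lowered register uses. The old `uwiden_high` rule from the ISLE hunks above is exactly the shape they catch when `val` is defined by a sinkable load (comment added):

;; Old (buggy) shape: `val` feeds `palignr` both as the plain Xmm
;; operand and as the register-or-memory operand, so a defining load
;; could be sunk into the memory slot while the register operand still
;; read the never-materialized load result.
(rule (lower (has_type $I16X8 (uwiden_high val @ (value_type $I8X16))))
      (x64_pmovzxbw (x64_palignr val val 8 (OperandSize.Size32))))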


@@ -0,0 +1,20 @@
+test compile precise-output
+target x86_64
+
+function u0:0(i64 vmctx, i8x16) -> i16x8 fast {
+block0(v0: i64, v2: i8x16):
+  v5 = load.i8x16 notrap aligned table v0+80
+  v6 = uwiden_high v5
+  return v6
+}
+
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; movdqu 80(%rdi), %xmm5
+; palignr $8, %xmm5, %xmm5, %xmm5
+; pmovzxbw %xmm5, %xmm0
+; movq %rbp, %rsp
+; popq %rbp
+; ret
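The expected assembly checks the fix end to end: the table load is materialized by a standalone movdqu, and palignr then operates only on %xmm5 rather than having the load sunk into one of its operands. The fixed rule being exercised, repeated from the diff above (comment added):

;; With the single `Xmm` binding, the load is lowered once into a
;; register; `palignr` never sees a sinkable memory operand.
(rule (lower (has_type $I16X8 (uwiden_high val @ (value_type $I8X16))))
      (let ((x Xmm val))
        (x64_pmovzxbw (x64_palignr x x 8 (OperandSize.Size32)))))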