From e7bced95125055b41e449d414e55ccf2f05c09cf Mon Sep 17 00:00:00 2001
From: Benjamin Bouvier
Date: Wed, 14 Apr 2021 18:08:52 +0200
Subject: [PATCH] cranelift: always spill i32 with i64 stores;

Fixes #2839.

See also the issue description and the comments in this commit for
details about the fix.
---
 cranelift/codegen/src/machinst/abi_impl.rs    |  13 ++
 .../isa/x64/store-stack-full-width-i32.clif   | 127 ++++++++++++++++++
 2 files changed, 140 insertions(+)
 create mode 100644 cranelift/filetests/filetests/isa/x64/store-stack-full-width-i32.clif

diff --git a/cranelift/codegen/src/machinst/abi_impl.rs b/cranelift/codegen/src/machinst/abi_impl.rs
index 85a6d6831b..5636b83f82 100644
--- a/cranelift/codegen/src/machinst/abi_impl.rs
+++ b/cranelift/codegen/src/machinst/abi_impl.rs
@@ -1213,6 +1213,19 @@ impl<M: ABIMachineSpec> ABICallee for ABICalleeImpl<M> {
         let spill_off = islot * M::word_bytes() as i64;
         let sp_off = self.stackslots_size as i64 + spill_off;
         trace!("store_spillslot: slot {:?} -> sp_off {}", slot, sp_off);
+
+        // When reloading from a spill slot, we might have lost information about the real
+        // integer type. For instance, on the x64 backend, a zero-extension can become spurious
+        // and be optimized into a move, causing vregs of types I32 and I64 to share the same
+        // coalescing equivalence class. As a result, such a value can be spilled as an I32 and
+        // later reloaded as an I64; to make sure the high bits are always defined, always do a
+        // word-sized store in this case.
+        let ty = if ty.is_int() && ty.bytes() < M::word_bytes() {
+            M::word_type()
+        } else {
+            ty
+        };
+
         gen_store_stack_multi::<M>(StackAMode::NominalSPOffset(sp_off, ty), from_regs, ty)
     }
 
diff --git a/cranelift/filetests/filetests/isa/x64/store-stack-full-width-i32.clif b/cranelift/filetests/filetests/isa/x64/store-stack-full-width-i32.clif
new file mode 100644
index 0000000000..10b3102c17
--- /dev/null
+++ b/cranelift/filetests/filetests/isa/x64/store-stack-full-width-i32.clif
@@ -0,0 +1,127 @@
+test compile
+target x86_64 machinst
+
+;; The goal of this test is to ensure that stack spills of an integer value
+;; whose width is less than the machine word's size store the full word, and
+;; not only the lower bits.
+
+;; Because unsigned extensions can be transformed into simple moves, the
+;; source vreg of the extension operation can be coalesced with its
+;; destination vreg; if that vreg happens to be spilled, the reload may use a
+;; different, larger size.
+
+function %f0(i32, i32, i32) -> i64 {
+    fn0 = %g(i32) -> i64
+
+; check: pushq %rbp
+; nextln: movq %rsp, %rbp
+; nextln: subq $$64, %rsp
+
+;; Stash all the callee-saved registers.
+
+; nextln: movq %r12, 16(%rsp)
+; nextln: movq %r13, 24(%rsp)
+; nextln: movq %r14, 32(%rsp)
+; nextln: movq %rbx, 40(%rsp)
+; nextln: movq %r15, 48(%rsp)
+
+block0(v0: i32, v1: i32, v2: i32):
+    ;; First, create enough virtual registers so that the call instructions
+    ;; cause at least one of them to be spilled onto the stack.
+
+    v3 = iadd.i32 v0, v1
+    v4 = iadd.i32 v1, v2
+    v5 = iadd.i32 v0, v2
+    v6 = iadd.i32 v3, v0
+    v7 = iadd.i32 v4, v0
+    v8 = iadd.i32 v5, v0
+
+; nextln: movq %rdi, %r12
+; nextln: addl %esi, %r12d
+; nextln: movq %rsi, %r13
+; nextln: addl %edx, %r13d
+; nextln: movq %rdi, %r14
+; nextln: addl %edx, %r14d
+; nextln: movq %r12, %rbx
+; nextln: addl %edi, %ebx
+; nextln: movq %r13, %r15
+; nextln: addl %edi, %r15d
+; nextln: movq %r14, %rsi
+
+;; This must be a movq below, not a movl.
+; nextln: movq %rsi, rsp(0 + virtual offset)
+
+; nextln: movslq rsp(0 + virtual offset), %rsi
+; nextln: addl %edi, %esi
+
+    ;; Put an effectful instruction here so that the live ranges of the adds
+    ;; and uextends are split, and to prevent the uextends from being emitted
+    ;; before the call. This effectively causes the above i32 to be spilled
+    ;; as an i32, and not as a full i64.
+
+    v300 = call fn0(v0)
+
+;; This must be a movq below, not a movl.
+; nextln: movq %rsi, rsp(0 + virtual offset)
+
+; nextln: load_ext_name %g+0, %rsi
+; nextln: call *%rsi
+
+    v31 = uextend.i64 v3
+    v41 = uextend.i64 v4
+    v51 = uextend.i64 v5
+    v61 = uextend.i64 v6
+    v71 = uextend.i64 v7
+    v81 = uextend.i64 v8
+
+    ;; None of the uextends are generated here yet.
+
+    ;; At this point, one would expect the second call below to be
+    ;; unnecessary, but if it is removed, the uextend is applied before the
+    ;; call and the i64 is spilled (then reloaded), making the bug disappear.
+    ;; So an additional call it is!
+
+    v100 = call fn0(v3)
+
+; nextln: movq %r12, %rsi
+; nextln: movq %rsi, rsp(8 + virtual offset)
+; nextln: nop len=0
+; nextln: movq %r12, %rdi
+; nextln: load_ext_name %g+0, %rsi
+; nextln: call *%rsi
+
+    ;; Cause reloads of all the values. Most are in registers, but one of them
+    ;; is on the stack. Make sure they're all used in the final computation.
+
+    v101 = iadd.i64 v100, v31
+    v102 = iadd.i64 v101, v41
+    v103 = iadd.i64 v102, v51
+    v104 = iadd.i64 v103, v61
+    v105 = iadd.i64 v104, v71
+    v200 = iadd.i64 v105, v81
+
+; nextln: movq %rax, %rsi
+; nextln: movq rsp(8 + virtual offset), %rdi
+; nextln: addq %rdi, %rsi
+; nextln: addq %r13, %rsi
+; nextln: addq %r14, %rsi
+; nextln: addq %rbx, %rsi
+; nextln: addq %r15, %rsi
+
+;; The reload operates on a full word, so it uses movq.
+; nextln: movq rsp(0 + virtual offset), %rdi
+
+; nextln: addq %rdi, %rsi
+; nextln: movq %rsi, %rax
+; nextln: movq 16(%rsp), %r12
+; nextln: movq 24(%rsp), %r13
+; nextln: movq 32(%rsp), %r14
+; nextln: movq 40(%rsp), %rbx
+; nextln: movq 48(%rsp), %r15
+; nextln: addq $$64, %rsp
+
+    return v200
+; nextln: movq %rbp, %rsp
+; nextln: popq %rbp
+; nextln: ret
+}
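For reference, the widening rule from the abi_impl.rs hunk above, shown in
isolation as a minimal standalone Rust sketch. The IntTy type, the word_bytes
parameter and the spill_store_type helper are illustrative stand-ins, not the
real cranelift Type / ABIMachineSpec APIs.

/// Illustrative stand-in for an integer type, identified by its width in bytes.
#[derive(Clone, Copy, Debug, PartialEq)]
struct IntTy {
    bytes: u32,
}

/// Pick the type used for the spill-slot store: any integer narrower than the
/// machine word is widened to a full word, so that the high bits are defined
/// even if the slot is later reloaded at word width.
fn spill_store_type(ty: IntTy, word_bytes: u32) -> IntTy {
    if ty.bytes < word_bytes {
        IntTy { bytes: word_bytes }
    } else {
        ty
    }
}

fn main() {
    // On x64, word_bytes is 8: an i32 spill is stored as a full i64.
    assert_eq!(spill_store_type(IntTy { bytes: 4 }, 8), IntTy { bytes: 8 });
    // An i64 spill is stored as-is.
    assert_eq!(spill_store_type(IntTy { bytes: 8 }, 8), IntTy { bytes: 8 });
}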