From 29dfcf5dfbf468e3ef3415e840b2666388c90324 Mon Sep 17 00:00:00 2001
From: Jakob Stoklund Olesen <jolesen@mozilla.com>
Date: Fri, 22 Sep 2017 15:35:11 -0700
Subject: [PATCH] Add spill/fill encodings for Intel ISAs.

To begin with, these are catch-all encodings with a SIB byte and a
32-bit displacement, so they can access any stack slot via either the
stack pointer or the frame pointer.

In the future, we will add encodings for 8-bit displacements as well as
EBP-relative references without a SIB byte.
---
 .../filetests/isa/intel/binary32-float.cton   | 34 ++++++++++++++
 cranelift/filetests/isa/intel/binary32.cton   | 17 +++++++
 .../filetests/isa/intel/binary64-float.cton   | 37 +++++++++++++++-
 cranelift/filetests/isa/intel/binary64.cton   | 44 +++++++++++++++++++
 lib/cretonne/meta/isa/intel/encodings.py      | 12 ++++-
 lib/cretonne/meta/isa/intel/recipes.py        | 42 +++++++++++++++++-
 lib/cretonne/meta/isa/intel/registers.py      |  8 +++-
 lib/cretonne/src/isa/intel/binemit.rs         | 29 +++++++++++-
 8 files changed, 217 insertions(+), 6 deletions(-)

diff --git a/cranelift/filetests/isa/intel/binary32-float.cton b/cranelift/filetests/isa/intel/binary32-float.cton
index 3756fbff1e..c5a1b08a0e 100644
--- a/cranelift/filetests/isa/intel/binary32-float.cton
+++ b/cranelift/filetests/isa/intel/binary32-float.cton
@@ -8,6 +8,11 @@ isa intel has_sse2
 ;
 
 function %F32() {
+    ss0 = incoming_arg 8, offset 0
+    ss1 = incoming_arg 1024, offset -1024
+    ss2 = incoming_arg 1024, offset -2048
+    ss3 = incoming_arg 8, offset -2056
+
 ebb0:
     [-,%rcx]            v0 = iconst.i32 1
     [-,%rsi]            v1 = iconst.i32 2
@@ -105,10 +110,27 @@ ebb0:
     ; asm: movd %xmm2, -10000(%esi)
     [-]                 store.f32 v101, v1-10000                ; bin: 66 0f 7e 96 ffffd8f0
 
+    ; Spill / Fill.
+
+    ; asm: movd %xmm5, 1032(%esp)
+    [-,ss1]             v200 = spill v100                       ; bin: 66 0f 7e ac 24 00000408
+    ; asm: movd %xmm2, 1032(%esp)
+    [-,ss1]             v201 = spill v101                       ; bin: 66 0f 7e 94 24 00000408
+
+    ; asm: movd 1032(%esp), %xmm5
+    [-,%xmm5]           v210 = fill v200                        ; bin: 66 0f 6e ac 24 00000408
+    ; asm: movd 1032(%esp), %xmm2
+    [-,%xmm2]           v211 = fill v201                        ; bin: 66 0f 6e 94 24 00000408
+
     return
 }
 
 function %F64() {
+    ss0 = incoming_arg 8, offset 0
+    ss1 = incoming_arg 1024, offset -1024
+    ss2 = incoming_arg 1024, offset -2048
+    ss3 = incoming_arg 8, offset -2056
+
 ebb0:
     [-,%rcx]            v0 = iconst.i32 1
     [-,%rsi]            v1 = iconst.i32 2
@@ -198,5 +220,17 @@ ebb0:
     ; asm: movq %xmm2, -10000(%esi)
     [-]                 store.f64 v101, v1-10000                ; bin: 66 0f d6 96 ffffd8f0
 
+    ; Spill / Fill.
+
+    ; asm: movq %xmm5, 1032(%esp)
+    [-,ss1]             v200 = spill v100                       ; bin: 66 0f d6 ac 24 00000408
+    ; asm: movq %xmm2, 1032(%esp)
+    [-,ss1]             v201 = spill v101                       ; bin: 66 0f d6 94 24 00000408
+
+    ; asm: movq 1032(%esp), %xmm5
+    [-,%xmm5]           v210 = fill v200                        ; bin: f3 0f 7e ac 24 00000408
+    ; asm: movq 1032(%esp), %xmm2
+    [-,%xmm2]           v211 = fill v201                        ; bin: f3 0f 7e 94 24 00000408
+
     return
 }
diff --git a/cranelift/filetests/isa/intel/binary32.cton b/cranelift/filetests/isa/intel/binary32.cton
index 16f8131cef..ee8c79e24f 100644
--- a/cranelift/filetests/isa/intel/binary32.cton
+++ b/cranelift/filetests/isa/intel/binary32.cton
@@ -11,6 +11,11 @@ function %I32() {
     fn0 = function %foo()
     sig0 = ()
 
+    ss0 = incoming_arg 8, offset 0
+    ss1 = incoming_arg 1024, offset -1024
+    ss2 = incoming_arg 1024, offset -2048
+    ss3 = incoming_arg 8, offset -2056
+
 ebb0:
     ; asm: movl $1, %ecx
     [-,%rcx]            v1 = iconst.i32 1        ; bin: b9 00000001
@@ -346,6 +351,18 @@ ebb0:
     ; asm: call *%esi
     call_indirect sig0, v401()                  ; bin: ff d6
 
+    ; Spill / Fill.
+
+    ; asm: movl %ecx, 1032(%esp)
+    [-,ss1]             v500 = spill v1         ; bin: 89 8c 24 00000408
+    ; asm: movl %esi, 1032(%esp)
+    [-,ss1]             v501 = spill v2         ; bin: 89 b4 24 00000408
+
+    ; asm: movl 1032(%esp), %ecx
+    [-,%rcx]            v510 = fill v500        ; bin: 8b 8c 24 00000408
+    ; asm: movl 1032(%esp), %esi
+    [-,%rsi]            v511 = fill v501        ; bin: 8b b4 24 00000408
+
     ; asm: testl %ecx, %ecx
     ; asm: je ebb1
     brz v1, ebb1                                ; bin: 85 c9 74 0e
diff --git a/cranelift/filetests/isa/intel/binary64-float.cton b/cranelift/filetests/isa/intel/binary64-float.cton
index 83aaf6d753..a5b4aaa163 100644
--- a/cranelift/filetests/isa/intel/binary64-float.cton
+++ b/cranelift/filetests/isa/intel/binary64-float.cton
@@ -1,6 +1,7 @@
 ; Binary emission of 64-bit floating point code.
 test binemit
 set is_64bit
+set is_compressed
 isa intel has_sse2
 
 ; The binary encodings can be verified with the command:
@@ -9,6 +10,11 @@ isa intel has_sse2
 ;
 
 function %F32() {
+    ss0 = incoming_arg 8, offset 0
+    ss1 = incoming_arg 1024, offset -1024
+    ss2 = incoming_arg 1024, offset -2048
+    ss3 = incoming_arg 8, offset -2056
+
 ebb0:
     [-,%r11]            v0 = iconst.i32 1
     [-,%rsi]            v1 = iconst.i32 2
@@ -36,7 +42,7 @@ ebb0:
     [-,%xmm10]          v17 = bitcast.f32 v1                    ; bin: 66 44 0f 6e d6
 
     ; asm: movd %xmm5, %ecx
-    [-,%rcx]            v18 = bitcast.i32 v10                   ; bin: 66 40 0f 7e e9
+    [-,%rcx]            v18 = bitcast.i32 v10                   ; bin: 66 0f 7e e9
     ; asm: movd %xmm10, %esi
     [-,%rsi]            v19 = bitcast.i32 v11                   ; bin: 66 44 0f 7e d6
 
@@ -113,10 +119,27 @@ ebb0:
     ; asm: movd %xmm10, -10000(%rax)
     [-]                 store.f32 v101, v2-10000                ; bin: 66 44 0f 7e 90 ffffd8f0
 
+    ; Spill / Fill.
+
+    ; asm: movd %xmm5, 1032(%rsp)
+    [-,ss1]             v200 = spill v100                       ; bin: 66 0f 7e ac 24 00000408
+    ; asm: movd %xmm10, 1032(%rsp)
+    [-,ss1]             v201 = spill v101                       ; bin: 66 44 0f 7e 94 24 00000408
+
+    ; asm: movd 1032(%rsp), %xmm5
+    [-,%xmm5]           v210 = fill v200                        ; bin: 66 0f 6e ac 24 00000408
+    ; asm: movd 1032(%rsp), %xmm10
+    [-,%xmm10]          v211 = fill v201                        ; bin: 66 44 0f 6e 94 24 00000408
+
     return
 }
 
 function %F64() {
+    ss0 = incoming_arg 8, offset 0
+    ss1 = incoming_arg 1024, offset -1024
+    ss2 = incoming_arg 1024, offset -2048
+    ss3 = incoming_arg 8, offset -2056
+
 ebb0:
     [-,%r11]            v0 = iconst.i32 1
     [-,%rsi]            v1 = iconst.i32 2
@@ -221,5 +244,17 @@ ebb0:
     ; asm: movq %xmm10, -10000(%rax)
     [-]                 store.f64 v101, v2-10000                ; bin: 66 44 0f d6 90 ffffd8f0
 
+    ; Spill / Fill.
+
+    ; asm: movq %xmm5, 1032(%rsp)
+    [-,ss1]             v200 = spill v100                       ; bin: 66 0f d6 ac 24 00000408
+    ; asm: movq %xmm10, 1032(%rsp)
+    [-,ss1]             v201 = spill v101                       ; bin: 66 44 0f d6 94 24 00000408
+
+    ; asm: movq 1032(%rsp), %xmm5
+    [-,%xmm5]           v210 = fill v200                        ; bin: f3 0f 7e ac 24 00000408
+    ; asm: movq 1032(%rsp), %xmm10
+    [-,%xmm10]          v211 = fill v201                        ; bin: f3 44 0f 7e 94 24 00000408
+
     return
 }
diff --git a/cranelift/filetests/isa/intel/binary64.cton b/cranelift/filetests/isa/intel/binary64.cton
index 8b22173fe4..c9febfdab0 100644
--- a/cranelift/filetests/isa/intel/binary64.cton
+++ b/cranelift/filetests/isa/intel/binary64.cton
@@ -14,6 +14,13 @@ function %I64() {
     fn0 = function %foo()
     sig0 = ()
 
+    ; Use incoming_arg stack slots because they won't be relocated by the frame
+    ; layout.
+    ss0 = incoming_arg 8, offset 0
+    ss1 = incoming_arg 1024, offset -1024
+    ss2 = incoming_arg 1024, offset -2048
+    ss3 = incoming_arg 8, offset -2056
+
 ebb0:
 
     ; Integer Constants.
@@ -436,6 +443,22 @@ ebb0:
     ; asm: call *%r10
     call_indirect sig0, v402()                  ; bin: 41 ff d2
 
+    ; Spill / Fill.
+
+    ; asm: movq %rcx, 1032(%rsp)
+    [-,ss1]             v500 = spill v1         ; bin: 48 89 8c 24 00000408
+    ; asm: movq %rsi, 1032(%rsp)
+    [-,ss1]             v501 = spill v2         ; bin: 48 89 b4 24 00000408
+    ; asm: movq %r10, 1032(%rsp)
+    [-,ss1]             v502 = spill v3         ; bin: 4c 89 94 24 00000408
+
+    ; asm: movq 1032(%rsp), %rcx
+    [-,%rcx]            v510 = fill v500        ; bin: 48 8b 8c 24 00000408
+    ; asm: movq 1032(%rsp), %rsi
+    [-,%rsi]            v511 = fill v501        ; bin: 48 8b b4 24 00000408
+    ; asm: movq 1032(%rsp), %r10
+    [-,%r10]            v512 = fill v502        ; bin: 4c 8b 94 24 00000408
+
     ; asm: testq %rcx, %rcx
     ; asm: je ebb1
     brz v1, ebb1                                ; bin: 48 85 c9 74 1b
@@ -477,6 +500,11 @@ function %I32() {
     fn0 = function %foo()
     sig0 = ()
 
+    ss0 = incoming_arg 8, offset 0
+    ss1 = incoming_arg 1024, offset -1024
+    ss2 = incoming_arg 1024, offset -2048
+    ss3 = incoming_arg 8, offset -2056
+
 ebb0:
 
     ; Integer Constants.
@@ -806,6 +834,22 @@ ebb0:
     ; asm: movzbl %dl, %esi
     [-,%rsi]             v351 = bint.i32 v301   ; bin: 0f b6 f2
 
+    ; Spill / Fill.
+
+    ; asm: movl %ecx, 1032(%rsp)
+    [-,ss1]             v500 = spill v1         ; bin: 89 8c 24 00000408
+    ; asm: movl %esi, 1032(%rsp)
+    [-,ss1]             v501 = spill v2         ; bin: 89 b4 24 00000408
+    ; asm: movl %r10d, 1032(%rsp)
+    [-,ss1]             v502 = spill v3         ; bin: 44 89 94 24 00000408
+
+    ; asm: movl 1032(%rsp), %ecx
+    [-,%rcx]            v510 = fill v500        ; bin: 8b 8c 24 00000408
+    ; asm: movl 1032(%rsp), %esi
+    [-,%rsi]            v511 = fill v501        ; bin: 8b b4 24 00000408
+    ; asm: movl 1032(%rsp), %r10d
+    [-,%r10]            v512 = fill v502        ; bin: 44 8b 94 24 00000408
+
     ; asm: testl %ecx, %ecx
     ; asm: je ebb1x
     brz v1, ebb1                                ; bin: 85 c9 74 18
diff --git a/lib/cretonne/meta/isa/intel/encodings.py b/lib/cretonne/meta/isa/intel/encodings.py
index 117940eb39..4b646658c1 100644
--- a/lib/cretonne/meta/isa/intel/encodings.py
+++ b/lib/cretonne/meta/isa/intel/encodings.py
@@ -65,7 +65,7 @@ def enc_i32_i64_ld_st(inst, w_bit, recipe, *args, **kwargs):
     Add encodings for `inst.i32` to I32.
     Add encodings for `inst.i32` to I64 with and without REX.
     Add encodings for `inst.i64` to I64 with a REX prefix, using the `w_bit`
-    argument to determine wheter or not to set the REX.W bit.
+    argument to determine whether or not to set the REX.W bit.
     """
     I32.enc(inst.i32.any, *recipe(*args, **kwargs))
 
@@ -181,6 +181,8 @@ enc_i32_i64_ld_st(base.store, True, r.st, 0x89)
 enc_i32_i64_ld_st(base.store, True, r.stDisp8, 0x89)
 enc_i32_i64_ld_st(base.store, True, r.stDisp32, 0x89)
 
+enc_i32_i64(base.spill, r.spSib32, 0x89)
+
 enc_i64(base.istore32.i64.any, r.st, 0x89)
 enc_i64(base.istore32.i64.any, r.stDisp8, 0x89)
 enc_i64(base.istore32.i64.any, r.stDisp32, 0x89)
@@ -208,6 +210,8 @@ enc_i32_i64_ld_st(base.load, True, r.ld, 0x8b)
 enc_i32_i64_ld_st(base.load, True, r.ldDisp8, 0x8b)
 enc_i32_i64_ld_st(base.load, True, r.ldDisp32, 0x8b)
 
+enc_i32_i64(base.fill, r.fiSib32, 0x8b)
+
 enc_i64(base.uload32.i64, r.ld, 0x8b)
 enc_i64(base.uload32.i64, r.ldDisp8, 0x8b)
 enc_i64(base.uload32.i64, r.ldDisp32, 0x8b)
@@ -252,6 +256,12 @@ enc_flt(base.store.f64.any, r.fst, 0x66, 0x0f, 0xd6)
 enc_flt(base.store.f64.any, r.fstDisp8, 0x66, 0x0f, 0xd6)
 enc_flt(base.store.f64.any, r.fstDisp32, 0x66, 0x0f, 0xd6)
 
+enc_flt(base.fill.f32, r.ffiSib32, 0x66, 0x0f, 0x6e)
+enc_flt(base.fill.f64, r.ffiSib32, 0xf3, 0x0f, 0x7e)
+
+enc_flt(base.spill.f32, r.fspSib32, 0x66, 0x0f, 0x7e)
+enc_flt(base.spill.f64, r.fspSib32, 0x66, 0x0f, 0xd6)
+
 #
 # Function addresses.
 #
diff --git a/lib/cretonne/meta/isa/intel/recipes.py b/lib/cretonne/meta/isa/intel/recipes.py
index 89556a4049..358f9cae72 100644
--- a/lib/cretonne/meta/isa/intel/recipes.py
+++ b/lib/cretonne/meta/isa/intel/recipes.py
@@ -8,7 +8,7 @@ from base.formats import Unary, UnaryImm, Binary, BinaryImm, MultiAry
 from base.formats import Trap, Call, IndirectCall, Store, Load
 from base.formats import IntCompare
 from base.formats import RegMove, Ternary, Jump, Branch, FuncAddr
-from .registers import GPR, ABCD, FPR, GPR8, FPR8
+from .registers import GPR, ABCD, FPR, GPR8, FPR8, StackGPR32, StackFPR32
 
 try:
     from typing import Tuple, Dict, Sequence  # noqa
@@ -474,6 +474,26 @@ fstDisp32 = TailRecipe(
         sink.put4(offset as u32);
         ''')
 
+# Unary spill with SIB and 32-bit displacement.
+spSib32 = TailRecipe(
+        'spSib32', Unary, size=6, ins=GPR, outs=StackGPR32,
+        emit='''
+        let base = stk_base(out_stk0.base);
+        PUT_OP(bits, rex2(base, in_reg0), sink);
+        modrm_sib_disp32(in_reg0, sink);
+        sib_noindex(base, sink);
+        sink.put4(out_stk0.offset as u32);
+        ''')
+fspSib32 = TailRecipe(
+        'fspSib32', Unary, size=6, ins=FPR, outs=StackFPR32,
+        emit='''
+        let base = stk_base(out_stk0.base);
+        PUT_OP(bits, rex2(base, in_reg0), sink);
+        modrm_sib_disp32(in_reg0, sink);
+        sib_noindex(base, sink);
+        sink.put4(out_stk0.offset as u32);
+        ''')
+
 #
 # Load recipes
 #
@@ -540,6 +560,26 @@ fldDisp32 = TailRecipe(
         sink.put4(offset as u32);
         ''')
 
+# Unary fill with SIB and 32-bit displacement.
+fiSib32 = TailRecipe(
+        'fiSib32', Unary, size=6, ins=StackGPR32, outs=GPR,
+        emit='''
+        let base = stk_base(in_stk0.base);
+        PUT_OP(bits, rex2(base, out_reg0), sink);
+        modrm_sib_disp32(out_reg0, sink);
+        sib_noindex(base, sink);
+        sink.put4(in_stk0.offset as u32);
+        ''')
+ffiSib32 = TailRecipe(
+        'ffiSib32', Unary, size=6, ins=StackFPR32, outs=FPR,
+        emit='''
+        let base = stk_base(in_stk0.base);
+        PUT_OP(bits, rex2(base, out_reg0), sink);
+        modrm_sib_disp32(out_reg0, sink);
+        sib_noindex(base, sink);
+        sink.put4(in_stk0.offset as u32);
+        ''')
+
 #
 # Call/return
 #
diff --git a/lib/cretonne/meta/isa/intel/registers.py b/lib/cretonne/meta/isa/intel/registers.py
index 62966aac3b..886812d6ce 100644
--- a/lib/cretonne/meta/isa/intel/registers.py
+++ b/lib/cretonne/meta/isa/intel/registers.py
@@ -23,7 +23,7 @@ data types, and the H-registers even less so. Rather than trying to model the
 H-registers accurately, we'll avoid using them in both I32 and I64 modes.
 """
 from __future__ import absolute_import
-from cdsl.registers import RegBank, RegClass
+from cdsl.registers import RegBank, RegClass, Stack
 from .defs import ISA
 
 
@@ -44,4 +44,10 @@ ABCD = GPR[0:4]
 FPR = RegClass(FloatRegs)
 FPR8 = FPR[0:8]
 
+# Constraints for stack operands.
+
+# Stack operand with a 32-bit signed displacement from either RBP or RSP.
+StackGPR32 = Stack(GPR)
+StackFPR32 = Stack(FPR)
+
 RegClass.extract_names(globals())
diff --git a/lib/cretonne/src/isa/intel/binemit.rs b/lib/cretonne/src/isa/intel/binemit.rs
index 6f75833be6..888128caed 100644
--- a/lib/cretonne/src/isa/intel/binemit.rs
+++ b/lib/cretonne/src/isa/intel/binemit.rs
@@ -2,8 +2,9 @@
 
 use binemit::{CodeSink, Reloc, bad_encoding};
 use ir::{Function, Inst, Ebb, InstructionData};
-use isa::RegUnit;
+use isa::{RegUnit, StackRef, StackBase, StackBaseMask};
 use regalloc::RegDiversions;
+use super::registers::RU;
 
 include!(concat!(env!("OUT_DIR"), "/binemit-intel.rs"));
 
@@ -27,6 +28,16 @@ impl Into<Reloc> for RelocKind {
     }
 }
 
+// Convert a stack base to its register unit (SP -> rsp, FP -> rbp); `Zone` is unimplemented.
+fn stk_base(base: StackBase) -> RegUnit {
+    let ru = match base {
+        StackBase::SP => RU::rsp,
+        StackBase::FP => RU::rbp,
+        StackBase::Zone => unimplemented!(),
+    };
+    ru as RegUnit
+}
+
 // Mandatory prefix bytes for Mp* opcodes.
 const PREFIX: [u8; 3] = [0x66, 0xf3, 0xf2];
 
@@ -43,7 +54,7 @@ fn rex1(reg_b: RegUnit) -> u8 {
 
 // Create a dual-register REX prefix, setting:
 //
-// REX.B = bit 3 of r/m register.
+// REX.B = bit 3 of r/m register, or of the SIB base register when a SIB byte is present.
 // REX.R = bit 3 of reg register.
 fn rex2(rm: RegUnit, reg: RegUnit) -> u8 {
     let b = ((rm >> 3) & 1) as u8;
@@ -185,6 +196,20 @@ fn modrm_disp32<CS: CodeSink + ?Sized>(rm: RegUnit, reg: RegUnit, sink: &mut CS)
     sink.put1(b);
 }
 
+/// Emit a mode 10 (disp32) ModR/M byte with r/m = 0b100, indicating that a SIB byte follows.
+fn modrm_sib_disp32<CS: CodeSink + ?Sized>(reg: RegUnit, sink: &mut CS) {
+    modrm_disp32(0b100, reg, sink);
+}
+
+/// Emit a SIB byte with a base register and no scale+index.
+fn sib_noindex<CS: CodeSink + ?Sized>(base: RegUnit, sink: &mut CS) {
+    let base = base as u8 & 7;
+    // SIB        SS_III_BBB.
+    let mut b = 0b00_100_000;
+    b |= base;
+    sink.put1(b);
+}
+
 /// Emit a single-byte branch displacement to `destination`.
 fn disp1<CS: CodeSink + ?Sized>(destination: Ebb, func: &Function, sink: &mut CS) {
     let delta = func.offsets[destination].wrapping_sub(sink.offset() + 1);