Rename intel to x86.

x86 is the more accurate name, as there are non-Intel x86 implementations.

Fixes #263.
This commit is contained in:
Dan Gohman
2018-04-12 15:23:39 -07:00
parent 9e17e62d68
commit 1c760ab179
87 changed files with 222 additions and 225 deletions

View File

@@ -0,0 +1,20 @@
test compile
set is_64bit=1
isa x86 haswell
function %foo(i64, i64, i64, i32) -> b1 system_v {
ebb3(v0: i64, v1: i64, v2: i64, v3: i32):
v5 = icmp ne v2, v2
v8 = iconst.i64 0
jump ebb2(v8, v3, v5)
ebb2(v10: i64, v30: i32, v37: b1):
v18 = load.i32 notrap aligned v2
v27 = iadd.i64 v10, v10
v31 = icmp eq v30, v30
brz v31, ebb2(v27, v30, v37)
jump ebb0(v37)
ebb0(v35: b1):
return v35
}

View File

@@ -0,0 +1,20 @@
; Test the legalization of function signatures.
test legalizer
isa x86
; regex: V=v\d+
function %f() {
sig0 = (i32) -> i32 system_v
; check: sig0 = (i32 [0]) -> i32 [%rax] system_v
sig1 = (i64) -> b1 system_v
; check: sig1 = (i32 [0], i32 [4]) -> b1 [%rax] system_v
sig2 = (f32, i64) -> f64 system_v
; check: sig2 = (f32 [0], i32 [4], i32 [8]) -> f64 [%xmm0] system_v
ebb0:
return
}

View File

@@ -0,0 +1,32 @@
; Test the legalization of function signatures.
test legalizer
set is_64bit
isa x86
; regex: V=v\d+
function %f() {
sig0 = (i32) -> i32 system_v
; check: sig0 = (i32 [%rdi]) -> i32 [%rax] system_v
sig1 = (i64) -> b1 system_v
; check: sig1 = (i64 [%rdi]) -> b1 [%rax] system_v
sig2 = (f32, i64) -> f64 system_v
; check: sig2 = (f32 [%xmm0], i64 [%rdi]) -> f64 [%xmm0] system_v
ebb0:
return
}
function %pass_stack_int64(i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 vmctx) spiderwasm {
sig0 = (i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 vmctx) spiderwasm
fn0 = sig0 u0:0
ebb0(v0: i64, v1: i64, v2: i64, v3: i64, v4: i64, v5: i64, v6: i64, v7: i64, v8: i64, v9: i64, v10: i64, v11: i64, v12: i64, v13: i64, v14: i64, v15: i64, v16: i64, v17: i64, v18: i64, v19: i64, v20: i64):
call fn0(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20)
jump ebb1
ebb1:
return
}

View File

@@ -0,0 +1,25 @@
; binary emission of 32-bit code.
test binemit
set is_compressed
set allones_funcaddrs
isa x86 haswell
; The binary encodings can be verified with the command:
;
; sed -ne 's/^ *; asm: *//p' filetests/isa/x86/allones_funcaddrs32.cton | llvm-mc -show-encoding -triple=i386
;
; Tests from binary32.cton affected by allones_funcaddrs.
function %I32() {
sig0 = ()
fn0 = function %foo()
ebb0:
; asm: movl $-1, %ecx
[-,%rcx] v400 = func_addr.i32 fn0 ; bin: b9 Abs4(%foo) ffffffff
; asm: movl $-1, %esi
[-,%rsi] v401 = func_addr.i32 fn0 ; bin: be Abs4(%foo) ffffffff
return ; bin: c3
}

View File

@@ -0,0 +1,28 @@
; binary emission of 64-bit code.
test binemit
set is_64bit
set is_compressed
set allones_funcaddrs
isa x86 haswell
; The binary encodings can be verified with the command:
;
; sed -ne 's/^ *; asm: *//p' filetests/isa/x86/allones_funcaddrs64.cton | llvm-mc -show-encoding -triple=x86_64
;
; Tests from binary64.cton affected by allones_funcaddrs.
function %I64() {
sig0 = ()
fn0 = function %foo()
ebb0:
; asm: movabsq $-1, %rcx
[-,%rcx] v400 = func_addr.i64 fn0 ; bin: 48 b9 Abs8(%foo) ffffffffffffffff
; asm: movabsq $-1, %rsi
[-,%rsi] v401 = func_addr.i64 fn0 ; bin: 48 be Abs8(%foo) ffffffffffffffff
; asm: movabsq $-1, %r10
[-,%r10] v402 = func_addr.i64 fn0 ; bin: 49 ba Abs8(%foo) ffffffffffffffff
return ; bin: c3
}

View File

@@ -0,0 +1,94 @@
test compile
set is_64bit
isa x86 baseline
; clz/ctz on 64 bit operands
function %i64_clz(i64) -> i64 {
ebb0(v10: i64):
v11 = clz v10
; check: x86_bsr
; check: selectif.i64
return v11
}
function %i64_ctz(i64) -> i64 {
ebb1(v20: i64):
v21 = ctz v20
; check: x86_bsf
; check: selectif.i64
return v21
}
; clz/ctz on 32 bit operands
function %i32_clz(i32) -> i32 {
ebb0(v10: i32):
v11 = clz v10
; check: x86_bsr
; check: selectif.i32
return v11
}
function %i32_ctz(i32) -> i32 {
ebb1(v20: i32):
v21 = ctz v20
; check: x86_bsf
; check: selectif.i32
return v21
}
; popcount on 64 bit operands
function %i64_popcount(i64) -> i64 {
ebb0(v30: i64):
v31 = popcnt v30;
; check: ushr_imm
; check: iconst.i64
; check: band
; check: isub
; check: ushr_imm
; check: band
; check: isub
; check: ushr_imm
; check: band
; check: isub
; check: ushr_imm
; check: iadd
; check: iconst.i64
; check: band
; check: iconst.i64
; check: imul
; check: ushr_imm
return v31;
}
; popcount on 32 bit operands
function %i32_popcount(i32) -> i32 {
ebb0(v40: i32):
v41 = popcnt v40;
; check: ushr_imm
; check: iconst.i32
; check: band
; check: isub
; check: ushr_imm
; check: band
; check: isub
; check: ushr_imm
; check: band
; check: isub
; check: ushr_imm
; check: iadd
; check: iconst.i32
; check: band
; check: iconst.i32
; check: imul
; check: ushr_imm
return v41;
}

View File

@@ -0,0 +1,89 @@
test binemit
set is_64bit
set is_compressed
isa x86 baseline
; The binary encodings can be verified with the command:
;
; sed -ne 's/^ *; asm: *//p' filetests/isa/x86/baseline_clz_ctz_popcount_encoding.cton | llvm-mc -show-encoding -triple=x86_64
;
function %Foo() {
ebb0:
; 64-bit wide bsf
[-,%r11] v10 = iconst.i64 0x1234
; asm: bsfq %r11, %rcx
[-,%rcx,%rflags] v11, v12 = x86_bsf v10 ; bin: 49 0f bc cb
[-,%rdx] v14 = iconst.i64 0x5678
; asm: bsfq %rdx, %r12
[-,%r12,%rflags] v15, v16 = x86_bsf v14 ; bin: 4c 0f bc e2
; asm: bsfq %rdx, %rdi
[-,%rdi,%rflags] v17, v18 = x86_bsf v14 ; bin: 48 0f bc fa
; 32-bit wide bsf
[-,%r11] v20 = iconst.i32 0x1234
; asm: bsfl %r11d, %ecx
[-,%rcx,%rflags] v21, v22 = x86_bsf v20 ; bin: 41 0f bc cb
[-,%rdx] v24 = iconst.i32 0x5678
; asm: bsfl %edx, %r12d
[-,%r12,%rflags] v25, v26 = x86_bsf v24 ; bin: 44 0f bc e2
; asm: bsfl %edx, %esi
[-,%rsi,%rflags] v27, v28 = x86_bsf v24 ; bin: 0f bc f2
; 64-bit wide bsr
[-,%r11] v30 = iconst.i64 0x1234
; asm: bsrq %r11, %rcx
[-,%rcx,%rflags] v31, v32 = x86_bsr v30 ; bin: 49 0f bd cb
[-,%rdx] v34 = iconst.i64 0x5678
; asm: bsrq %rdx, %r12
[-,%r12,%rflags] v35, v36 = x86_bsr v34 ; bin: 4c 0f bd e2
; asm: bsrq %rdx, %rdi
[-,%rdi,%rflags] v37, v38 = x86_bsr v34 ; bin: 48 0f bd fa
; 32-bit wide bsr
[-,%r11] v40 = iconst.i32 0x1234
; asm: bsrl %r11d, %ecx
[-,%rcx,%rflags] v41, v42 = x86_bsr v40 ; bin: 41 0f bd cb
[-,%rdx] v44 = iconst.i32 0x5678
; asm: bsrl %edx, %r12d
[-,%r12,%rflags] v45, v46 = x86_bsr v44 ; bin: 44 0f bd e2
; asm: bsrl %edx, %esi
[-,%rsi,%rflags] v47, v48 = x86_bsr v44 ; bin: 0f bd f2
; 64-bit wide cmov
; asm: cmoveq %r11, %rdx
[-,%rdx] v51 = selectif.i64 eq v48, v30, v34 ; bin: 49 0f 44 d3
; asm: cmoveq %rdi, %rdx
[-,%rdx] v52 = selectif.i64 eq v48, v37, v34 ; bin: 48 0f 44 d7
; 32-bit wide cmov
; asm: cmovnel %r11d, %edx
[-,%rdx] v60 = selectif.i32 ne v48, v40, v44 ; bin: 41 0f 45 d3
; asm: cmovlel %esi, %edx
[-,%rdx] v61 = selectif.i32 sle v48, v27, v44 ; bin: 0f 4e d6
trap user0
}

View File

@@ -0,0 +1,508 @@
; Binary emission of 32-bit floating point code.
test binemit
isa x86 haswell
; The binary encodings can be verified with the command:
;
; sed -ne 's/^ *; asm: *//p' filetests/isa/x86/binary32-float.cton | llvm-mc -show-encoding -triple=i386
;
function %F32() {
ss0 = incoming_arg 8, offset 0
ss1 = incoming_arg 1024, offset -1024
ss2 = incoming_arg 1024, offset -2048
ss3 = incoming_arg 8, offset -2056
ebb0:
[-,%rcx] v0 = iconst.i32 1
[-,%rsi] v1 = iconst.i32 2
; asm: cvtsi2ss %ecx, %xmm5
[-,%xmm5] v10 = fcvt_from_sint.f32 v0 ; bin: f3 0f 2a e9
; asm: cvtsi2ss %esi, %xmm2
[-,%xmm2] v11 = fcvt_from_sint.f32 v1 ; bin: f3 0f 2a d6
; asm: cvtss2sd %xmm2, %xmm5
[-,%xmm5] v12 = fpromote.f64 v11 ; bin: f3 0f 5a ea
; asm: cvtss2sd %xmm5, %xmm2
[-,%xmm2] v13 = fpromote.f64 v10 ; bin: f3 0f 5a d5
; asm: movd %ecx, %xmm5
[-,%xmm5] v14 = bitcast.f32 v0 ; bin: 66 0f 6e e9
; asm: movd %esi, %xmm2
[-,%xmm2] v15 = bitcast.f32 v1 ; bin: 66 0f 6e d6
; asm: movd %xmm5, %ecx
[-,%rcx] v16 = bitcast.i32 v10 ; bin: 66 0f 7e e9
; asm: movd %xmm2, %esi
[-,%rsi] v17 = bitcast.i32 v11 ; bin: 66 0f 7e d6
; asm: movaps %xmm2, %xmm5
[-,%xmm5] v18 = copy v11 ; bin: 0f 28 ea
; asm: movaps %xmm5, %xmm2
[-,%xmm2] v19 = copy v10 ; bin: 0f 28 d5
; asm: movaps %xmm2, %xmm5
regmove v19, %xmm2 -> %xmm5 ; bin: 0f 28 ea
; asm: movaps %xmm5, %xmm2
regmove v19, %xmm5 -> %xmm2 ; bin: 0f 28 d5
; Binary arithmetic.
; asm: addss %xmm2, %xmm5
[-,%xmm5] v20 = fadd v10, v11 ; bin: f3 0f 58 ea
; asm: addss %xmm5, %xmm2
[-,%xmm2] v21 = fadd v11, v10 ; bin: f3 0f 58 d5
; asm: subss %xmm2, %xmm5
[-,%xmm5] v22 = fsub v10, v11 ; bin: f3 0f 5c ea
; asm: subss %xmm5, %xmm2
[-,%xmm2] v23 = fsub v11, v10 ; bin: f3 0f 5c d5
; asm: mulss %xmm2, %xmm5
[-,%xmm5] v24 = fmul v10, v11 ; bin: f3 0f 59 ea
; asm: mulss %xmm5, %xmm2
[-,%xmm2] v25 = fmul v11, v10 ; bin: f3 0f 59 d5
; asm: divss %xmm2, %xmm5
[-,%xmm5] v26 = fdiv v10, v11 ; bin: f3 0f 5e ea
; asm: divss %xmm5, %xmm2
[-,%xmm2] v27 = fdiv v11, v10 ; bin: f3 0f 5e d5
; Bitwise ops.
; We use the *ps SSE instructions for everything because they are smaller.
; asm: andps %xmm2, %xmm5
[-,%xmm5] v30 = band v10, v11 ; bin: 0f 54 ea
; asm: andps %xmm5, %xmm2
[-,%xmm2] v31 = band v11, v10 ; bin: 0f 54 d5
; asm: andnps %xmm2, %xmm5
[-,%xmm5] v32 = band_not v11, v10 ; bin: 0f 55 ea
; asm: andnps %xmm5, %xmm2
[-,%xmm2] v33 = band_not v10, v11 ; bin: 0f 55 d5
; asm: orps %xmm2, %xmm5
[-,%xmm5] v34 = bor v10, v11 ; bin: 0f 56 ea
; asm: orps %xmm5, %xmm2
[-,%xmm2] v35 = bor v11, v10 ; bin: 0f 56 d5
; asm: xorps %xmm2, %xmm5
[-,%xmm5] v36 = bxor v10, v11 ; bin: 0f 57 ea
; asm: xorps %xmm5, %xmm2
[-,%xmm2] v37 = bxor v11, v10 ; bin: 0f 57 d5
; Convert float to int. (No i64 dest on i386).
; asm: cvttss2si %xmm5, %ecx
[-,%rcx] v40 = x86_cvtt2si.i32 v10 ; bin: f3 0f 2c cd
; asm: cvttss2si %xmm2, %esi
[-,%rsi] v41 = x86_cvtt2si.i32 v11 ; bin: f3 0f 2c f2
; Min/max.
; asm: minss %xmm2, %xmm5
[-,%xmm5] v42 = x86_fmin v10, v11 ; bin: f3 0f 5d ea
; asm: minss %xmm5, %xmm2
[-,%xmm2] v43 = x86_fmin v11, v10 ; bin: f3 0f 5d d5
; asm: maxss %xmm2, %xmm5
[-,%xmm5] v44 = x86_fmax v10, v11 ; bin: f3 0f 5f ea
; asm: maxss %xmm5, %xmm2
[-,%xmm2] v45 = x86_fmax v11, v10 ; bin: f3 0f 5f d5
; Unary arithmetic.
; asm: sqrtss %xmm5, %xmm2
[-,%xmm2] v50 = sqrt v10 ; bin: f3 0f 51 d5
; asm: sqrtss %xmm2, %xmm5
[-,%xmm5] v51 = sqrt v11 ; bin: f3 0f 51 ea
; asm: roundss $0, %xmm5, %xmm4
[-,%xmm4] v52 = nearest v10 ; bin: 66 0f 3a 0a e5 00
; asm: roundss $0, %xmm2, %xmm5
[-,%xmm5] v53 = nearest v11 ; bin: 66 0f 3a 0a ea 00
; asm: roundss $0, %xmm5, %xmm2
[-,%xmm2] v54 = nearest v10 ; bin: 66 0f 3a 0a d5 00
; asm: roundss $1, %xmm5, %xmm4
[-,%xmm4] v55 = floor v10 ; bin: 66 0f 3a 0a e5 01
; asm: roundss $1, %xmm2, %xmm5
[-,%xmm5] v56 = floor v11 ; bin: 66 0f 3a 0a ea 01
; asm: roundss $1, %xmm5, %xmm2
[-,%xmm2] v57 = floor v10 ; bin: 66 0f 3a 0a d5 01
; asm: roundss $2, %xmm5, %xmm4
[-,%xmm4] v58 = ceil v10 ; bin: 66 0f 3a 0a e5 02
; asm: roundss $2, %xmm2, %xmm5
[-,%xmm5] v59 = ceil v11 ; bin: 66 0f 3a 0a ea 02
; asm: roundss $2, %xmm5, %xmm2
[-,%xmm2] v60 = ceil v10 ; bin: 66 0f 3a 0a d5 02
; asm: roundss $3, %xmm5, %xmm4
[-,%xmm4] v61 = trunc v10 ; bin: 66 0f 3a 0a e5 03
; asm: roundss $3, %xmm2, %xmm5
[-,%xmm5] v62 = trunc v11 ; bin: 66 0f 3a 0a ea 03
; asm: roundss $3, %xmm5, %xmm2
[-,%xmm2] v63 = trunc v10 ; bin: 66 0f 3a 0a d5 03
; Load/Store
; asm: movss (%ecx), %xmm5
[-,%xmm5] v100 = load.f32 v0 ; bin: heap_oob f3 0f 10 29
; asm: movss (%esi), %xmm2
[-,%xmm2] v101 = load.f32 v1 ; bin: heap_oob f3 0f 10 16
; asm: movss 50(%ecx), %xmm5
[-,%xmm5] v110 = load.f32 v0+50 ; bin: heap_oob f3 0f 10 69 32
; asm: movss -50(%esi), %xmm2
[-,%xmm2] v111 = load.f32 v1-50 ; bin: heap_oob f3 0f 10 56 ce
; asm: movss 10000(%ecx), %xmm5
[-,%xmm5] v120 = load.f32 v0+10000 ; bin: heap_oob f3 0f 10 a9 00002710
; asm: movss -10000(%esi), %xmm2
[-,%xmm2] v121 = load.f32 v1-10000 ; bin: heap_oob f3 0f 10 96 ffffd8f0
; asm: movss %xmm5, (%ecx)
[-] store.f32 v100, v0 ; bin: heap_oob f3 0f 11 29
; asm: movss %xmm2, (%esi)
[-] store.f32 v101, v1 ; bin: heap_oob f3 0f 11 16
; asm: movss %xmm5, 50(%ecx)
[-] store.f32 v100, v0+50 ; bin: heap_oob f3 0f 11 69 32
; asm: movss %xmm2, -50(%esi)
[-] store.f32 v101, v1-50 ; bin: heap_oob f3 0f 11 56 ce
; asm: movss %xmm5, 10000(%ecx)
[-] store.f32 v100, v0+10000 ; bin: heap_oob f3 0f 11 a9 00002710
; asm: movss %xmm2, -10000(%esi)
[-] store.f32 v101, v1-10000 ; bin: heap_oob f3 0f 11 96 ffffd8f0
; Spill / Fill.
; asm: movss %xmm5, 1032(%esp)
[-,ss1] v200 = spill v100 ; bin: f3 0f 11 ac 24 00000408
; asm: movss %xmm2, 1032(%esp)
[-,ss1] v201 = spill v101 ; bin: f3 0f 11 94 24 00000408
; asm: movss 1032(%esp), %xmm5
[-,%xmm5] v210 = fill v200 ; bin: f3 0f 10 ac 24 00000408
; asm: movss 1032(%esp), %xmm2
[-,%xmm2] v211 = fill v201 ; bin: f3 0f 10 94 24 00000408
; asm: movss %xmm5, 1032(%rsp)
regspill v100, %xmm5 -> ss1 ; bin: f3 0f 11 ac 24 00000408
; asm: movss 1032(%rsp), %xmm5
regfill v100, ss1 -> %xmm5 ; bin: f3 0f 10 ac 24 00000408
; Comparisons.
;
; Only `supported_floatccs` are tested here. Others are handled by
; legalization patterns.
; asm: ucomiss %xmm2, %xmm5
; asm: setnp %bl
[-,%rbx] v300 = fcmp ord v10, v11 ; bin: 0f 2e ea 0f 9b c3
; asm: ucomiss %xmm5, %xmm2
; asm: setp %bl
[-,%rbx] v301 = fcmp uno v11, v10 ; bin: 0f 2e d5 0f 9a c3
; asm: ucomiss %xmm2, %xmm5
; asm: setne %dl
[-,%rdx] v302 = fcmp one v10, v11 ; bin: 0f 2e ea 0f 95 c2
; asm: ucomiss %xmm5, %xmm2
; asm: sete %dl
[-,%rdx] v303 = fcmp ueq v11, v10 ; bin: 0f 2e d5 0f 94 c2
; asm: ucomiss %xmm2, %xmm5
; asm: seta %bl
[-,%rbx] v304 = fcmp gt v10, v11 ; bin: 0f 2e ea 0f 97 c3
; asm: ucomiss %xmm5, %xmm2
; asm: setae %bl
[-,%rbx] v305 = fcmp ge v11, v10 ; bin: 0f 2e d5 0f 93 c3
; asm: ucomiss %xmm2, %xmm5
; asm: setb %dl
[-,%rdx] v306 = fcmp ult v10, v11 ; bin: 0f 2e ea 0f 92 c2
; asm: ucomiss %xmm5, %xmm2
; asm: setbe %dl
[-,%rdx] v307 = fcmp ule v11, v10 ; bin: 0f 2e d5 0f 96 c2
; asm: ucomiss %xmm2, %xmm5
[-,%rflags] v310 = ffcmp v10, v11 ; bin: 0f 2e ea
; asm: ucomiss %xmm2, %xmm5
[-,%rflags] v311 = ffcmp v11, v10 ; bin: 0f 2e d5
; asm: ucomiss %xmm5, %xmm5
[-,%rflags] v312 = ffcmp v10, v10 ; bin: 0f 2e ed
return
}
function %F64() {
ss0 = incoming_arg 8, offset 0
ss1 = incoming_arg 1024, offset -1024
ss2 = incoming_arg 1024, offset -2048
ss3 = incoming_arg 8, offset -2056
ebb0:
[-,%rcx] v0 = iconst.i32 1
[-,%rsi] v1 = iconst.i32 2
; asm: cvtsi2sd %ecx, %xmm5
[-,%xmm5] v10 = fcvt_from_sint.f64 v0 ; bin: f2 0f 2a e9
; asm: cvtsi2sd %esi, %xmm2
[-,%xmm2] v11 = fcvt_from_sint.f64 v1 ; bin: f2 0f 2a d6
; asm: cvtsd2ss %xmm2, %xmm5
[-,%xmm5] v12 = fdemote.f32 v11 ; bin: f2 0f 5a ea
; asm: cvtsd2ss %xmm5, %xmm2
[-,%xmm2] v13 = fdemote.f32 v10 ; bin: f2 0f 5a d5
; No i64 <-> f64 bitcasts in 32-bit mode.
; asm: movaps %xmm2, %xmm5
[-,%xmm5] v18 = copy v11 ; bin: 0f 28 ea
; asm: movaps %xmm5, %xmm2
[-,%xmm2] v19 = copy v10 ; bin: 0f 28 d5
; asm: movaps %xmm2, %xmm5
regmove v19, %xmm2 -> %xmm5 ; bin: 0f 28 ea
; asm: movaps %xmm5, %xmm2
regmove v19, %xmm5 -> %xmm2 ; bin: 0f 28 d5
; Binary arithmetic.
; asm: addsd %xmm2, %xmm5
[-,%xmm5] v20 = fadd v10, v11 ; bin: f2 0f 58 ea
; asm: addsd %xmm5, %xmm2
[-,%xmm2] v21 = fadd v11, v10 ; bin: f2 0f 58 d5
; asm: subsd %xmm2, %xmm5
[-,%xmm5] v22 = fsub v10, v11 ; bin: f2 0f 5c ea
; asm: subsd %xmm5, %xmm2
[-,%xmm2] v23 = fsub v11, v10 ; bin: f2 0f 5c d5
; asm: mulsd %xmm2, %xmm5
[-,%xmm5] v24 = fmul v10, v11 ; bin: f2 0f 59 ea
; asm: mulsd %xmm5, %xmm2
[-,%xmm2] v25 = fmul v11, v10 ; bin: f2 0f 59 d5
; asm: divsd %xmm2, %xmm5
[-,%xmm5] v26 = fdiv v10, v11 ; bin: f2 0f 5e ea
; asm: divsd %xmm5, %xmm2
[-,%xmm2] v27 = fdiv v11, v10 ; bin: f2 0f 5e d5
; Bitwise ops.
; We use the *ps SSE instructions for everything because they are smaller.
; asm: andps %xmm2, %xmm5
[-,%xmm5] v30 = band v10, v11 ; bin: 0f 54 ea
; asm: andps %xmm5, %xmm2
[-,%xmm2] v31 = band v11, v10 ; bin: 0f 54 d5
; asm: andnps %xmm2, %xmm5
[-,%xmm5] v32 = band_not v11, v10 ; bin: 0f 55 ea
; asm: andnps %xmm5, %xmm2
[-,%xmm2] v33 = band_not v10, v11 ; bin: 0f 55 d5
; asm: orps %xmm2, %xmm5
[-,%xmm5] v34 = bor v10, v11 ; bin: 0f 56 ea
; asm: orps %xmm5, %xmm2
[-,%xmm2] v35 = bor v11, v10 ; bin: 0f 56 d5
; asm: xorps %xmm2, %xmm5
[-,%xmm5] v36 = bxor v10, v11 ; bin: 0f 57 ea
; asm: xorps %xmm5, %xmm2
[-,%xmm2] v37 = bxor v11, v10 ; bin: 0f 57 d5
; Convert float to int. (No i64 dest on i386).
; asm: cvttsd2si %xmm5, %ecx
[-,%rcx] v40 = x86_cvtt2si.i32 v10 ; bin: f2 0f 2c cd
; asm: cvttsd2si %xmm2, %esi
[-,%rsi] v41 = x86_cvtt2si.i32 v11 ; bin: f2 0f 2c f2
; Min/max.
; asm: minsd %xmm2, %xmm5
[-,%xmm5] v42 = x86_fmin v10, v11 ; bin: f2 0f 5d ea
; asm: minsd %xmm5, %xmm2
[-,%xmm2] v43 = x86_fmin v11, v10 ; bin: f2 0f 5d d5
; asm: maxsd %xmm2, %xmm5
[-,%xmm5] v44 = x86_fmax v10, v11 ; bin: f2 0f 5f ea
; asm: maxsd %xmm5, %xmm2
[-,%xmm2] v45 = x86_fmax v11, v10 ; bin: f2 0f 5f d5
; Unary arithmetic.
; asm: sqrtsd %xmm5, %xmm2
[-,%xmm2] v50 = sqrt v10 ; bin: f2 0f 51 d5
; asm: sqrtsd %xmm2, %xmm5
[-,%xmm5] v51 = sqrt v11 ; bin: f2 0f 51 ea
; asm: roundsd $0, %xmm5, %xmm4
[-,%xmm4] v52 = nearest v10 ; bin: 66 0f 3a 0b e5 00
; asm: roundsd $0, %xmm2, %xmm5
[-,%xmm5] v53 = nearest v11 ; bin: 66 0f 3a 0b ea 00
; asm: roundsd $0, %xmm5, %xmm2
[-,%xmm2] v54 = nearest v10 ; bin: 66 0f 3a 0b d5 00
; asm: roundsd $1, %xmm5, %xmm4
[-,%xmm4] v55 = floor v10 ; bin: 66 0f 3a 0b e5 01
; asm: roundsd $1, %xmm2, %xmm5
[-,%xmm5] v56 = floor v11 ; bin: 66 0f 3a 0b ea 01
; asm: roundsd $1, %xmm5, %xmm2
[-,%xmm2] v57 = floor v10 ; bin: 66 0f 3a 0b d5 01
; asm: roundsd $2, %xmm5, %xmm4
[-,%xmm4] v58 = ceil v10 ; bin: 66 0f 3a 0b e5 02
; asm: roundsd $2, %xmm2, %xmm5
[-,%xmm5] v59 = ceil v11 ; bin: 66 0f 3a 0b ea 02
; asm: roundsd $2, %xmm5, %xmm2
[-,%xmm2] v60 = ceil v10 ; bin: 66 0f 3a 0b d5 02
; asm: roundsd $3, %xmm5, %xmm4
[-,%xmm4] v61 = trunc v10 ; bin: 66 0f 3a 0b e5 03
; asm: roundsd $3, %xmm2, %xmm5
[-,%xmm5] v62 = trunc v11 ; bin: 66 0f 3a 0b ea 03
; asm: roundsd $3, %xmm5, %xmm2
[-,%xmm2] v63 = trunc v10 ; bin: 66 0f 3a 0b d5 03
; Load/Store
; asm: movsd (%ecx), %xmm5
[-,%xmm5] v100 = load.f64 v0 ; bin: heap_oob f2 0f 10 29
; asm: movsd (%esi), %xmm2
[-,%xmm2] v101 = load.f64 v1 ; bin: heap_oob f2 0f 10 16
; asm: movsd 50(%ecx), %xmm5
[-,%xmm5] v110 = load.f64 v0+50 ; bin: heap_oob f2 0f 10 69 32
; asm: movsd -50(%esi), %xmm2
[-,%xmm2] v111 = load.f64 v1-50 ; bin: heap_oob f2 0f 10 56 ce
; asm: movsd 10000(%ecx), %xmm5
[-,%xmm5] v120 = load.f64 v0+10000 ; bin: heap_oob f2 0f 10 a9 00002710
; asm: movsd -10000(%esi), %xmm2
[-,%xmm2] v121 = load.f64 v1-10000 ; bin: heap_oob f2 0f 10 96 ffffd8f0
; asm: movsd %xmm5, (%ecx)
[-] store.f64 v100, v0 ; bin: heap_oob f2 0f 11 29
; asm: movsd %xmm2, (%esi)
[-] store.f64 v101, v1 ; bin: heap_oob f2 0f 11 16
; asm: movsd %xmm5, 50(%ecx)
[-] store.f64 v100, v0+50 ; bin: heap_oob f2 0f 11 69 32
; asm: movsd %xmm2, -50(%esi)
[-] store.f64 v101, v1-50 ; bin: heap_oob f2 0f 11 56 ce
; asm: movsd %xmm5, 10000(%ecx)
[-] store.f64 v100, v0+10000 ; bin: heap_oob f2 0f 11 a9 00002710
; asm: movsd %xmm2, -10000(%esi)
[-] store.f64 v101, v1-10000 ; bin: heap_oob f2 0f 11 96 ffffd8f0
; Spill / Fill.
; asm: movsd %xmm5, 1032(%esp)
[-,ss1] v200 = spill v100 ; bin: f2 0f 11 ac 24 00000408
; asm: movsd %xmm2, 1032(%esp)
[-,ss1] v201 = spill v101 ; bin: f2 0f 11 94 24 00000408
; asm: movsd 1032(%esp), %xmm5
[-,%xmm5] v210 = fill v200 ; bin: f2 0f 10 ac 24 00000408
; asm: movsd 1032(%esp), %xmm2
[-,%xmm2] v211 = fill v201 ; bin: f2 0f 10 94 24 00000408
; asm: movsd %xmm5, 1032(%rsp)
regspill v100, %xmm5 -> ss1 ; bin: f2 0f 11 ac 24 00000408
; asm: movsd 1032(%rsp), %xmm5
regfill v100, ss1 -> %xmm5 ; bin: f2 0f 10 ac 24 00000408
; Comparisons.
;
; Only `supported_floatccs` are tested here. Others are handled by
; legalization patterns.
; asm: ucomisd %xmm2, %xmm5
; asm: setnp %bl
[-,%rbx] v300 = fcmp ord v10, v11 ; bin: 66 0f 2e ea 0f 9b c3
; asm: ucomisd %xmm5, %xmm2
; asm: setp %bl
[-,%rbx] v301 = fcmp uno v11, v10 ; bin: 66 0f 2e d5 0f 9a c3
; asm: ucomisd %xmm2, %xmm5
; asm: setne %dl
[-,%rdx] v302 = fcmp one v10, v11 ; bin: 66 0f 2e ea 0f 95 c2
; asm: ucomisd %xmm5, %xmm2
; asm: sete %dl
[-,%rdx] v303 = fcmp ueq v11, v10 ; bin: 66 0f 2e d5 0f 94 c2
; asm: ucomisd %xmm2, %xmm5
; asm: seta %bl
[-,%rbx] v304 = fcmp gt v10, v11 ; bin: 66 0f 2e ea 0f 97 c3
; asm: ucomisd %xmm5, %xmm2
; asm: setae %bl
[-,%rbx] v305 = fcmp ge v11, v10 ; bin: 66 0f 2e d5 0f 93 c3
; asm: ucomisd %xmm2, %xmm5
; asm: setb %dl
[-,%rdx] v306 = fcmp ult v10, v11 ; bin: 66 0f 2e ea 0f 92 c2
; asm: ucomisd %xmm5, %xmm2
; asm: setbe %dl
[-,%rdx] v307 = fcmp ule v11, v10 ; bin: 66 0f 2e d5 0f 96 c2
; asm: ucomisd %xmm2, %xmm5
[-,%rflags] v310 = ffcmp v10, v11 ; bin: 66 0f 2e ea
; asm: ucomisd %xmm2, %xmm5
[-,%rflags] v311 = ffcmp v11, v10 ; bin: 66 0f 2e d5
; asm: ucomisd %xmm5, %xmm5
[-,%rflags] v312 = ffcmp v10, v10 ; bin: 66 0f 2e ed
return
}
function %cpuflags_float(f32 [%xmm0]) {
ebb0(v0: f32 [%xmm0]):
; asm: ucomiss %xmm0, %xmm0
[-,%rflags] v1 = ffcmp v0, v0 ; bin: 0f 2e c0
jump ebb1
ebb1:
; asm: jnp ebb1
brff ord v1, ebb1 ; bin: 7b fe
; asm: jp ebb1
brff uno v1, ebb1 ; bin: 7a fc
; asm: jne ebb1
brff one v1, ebb1 ; bin: 75 fa
; asm: je ebb1
brff ueq v1, ebb1 ; bin: 74 f8
; asm: ja ebb1
brff gt v1, ebb1 ; bin: 77 f6
; asm: jae ebb1
brff ge v1, ebb1 ; bin: 73 f4
; asm: jb ebb1
brff ult v1, ebb1 ; bin: 72 f2
; asm: jbe ebb1
brff ule v1, ebb1 ; bin: 76 f0
; asm: jp .+4; ud2
trapff ord v1, user0 ; bin: 7a 02 user0 0f 0b
; asm: jnp .+4; ud2
trapff uno v1, user0 ; bin: 7b 02 user0 0f 0b
; asm: je .+4; ud2
trapff one v1, user0 ; bin: 74 02 user0 0f 0b
; asm: jne .+4; ud2
trapff ueq v1, user0 ; bin: 75 02 user0 0f 0b
; asm: jna .+4; ud2
trapff gt v1, user0 ; bin: 76 02 user0 0f 0b
; asm: jnae .+4; ud2
trapff ge v1, user0 ; bin: 72 02 user0 0f 0b
; asm: jnb .+4; ud2
trapff ult v1, user0 ; bin: 73 02 user0 0f 0b
; asm: jnbe .+4; ud2
trapff ule v1, user0 ; bin: 77 02 user0 0f 0b
; asm: setnp %bl
[-,%rbx] v10 = trueff ord v1 ; bin: 0f 9b c3
; asm: setp %bl
[-,%rbx] v11 = trueff uno v1 ; bin: 0f 9a c3
; asm: setne %dl
[-,%rdx] v12 = trueff one v1 ; bin: 0f 95 c2
; asm: sete %dl
[-,%rdx] v13 = trueff ueq v1 ; bin: 0f 94 c2
; asm: seta %al
[-,%rax] v14 = trueff gt v1 ; bin: 0f 97 c0
; asm: setae %al
[-,%rax] v15 = trueff ge v1 ; bin: 0f 93 c0
; asm: setb %cl
[-,%rcx] v16 = trueff ult v1 ; bin: 0f 92 c1
; asm: setbe %cl
[-,%rcx] v17 = trueff ule v1 ; bin: 0f 96 c1
return
}

View File

@@ -0,0 +1,596 @@
; binary emission of x86-32 code.
test binemit
set is_compressed
isa x86 haswell
; The binary encodings can be verified with the command:
;
; sed -ne 's/^ *; asm: *//p' filetests/isa/x86/binary32.cton | llvm-mc -show-encoding -triple=i386
;
function %I32() {
sig0 = ()
fn0 = function %foo()
gv0 = globalsym %some_gv
ss0 = incoming_arg 8, offset 0
ss1 = incoming_arg 1024, offset -1024
ss2 = incoming_arg 1024, offset -2048
ss3 = incoming_arg 8, offset -2056
ebb0:
; asm: movl $1, %ecx
[-,%rcx] v1 = iconst.i32 1 ; bin: b9 00000001
; asm: movl $2, %esi
[-,%rsi] v2 = iconst.i32 2 ; bin: be 00000002
; asm: movb $1, %cl
[-,%rcx] v9007 = bconst.b1 true ; bin: b9 00000001
; Integer Register-Register Operations.
; asm: addl %esi, %ecx
[-,%rcx] v10 = iadd v1, v2 ; bin: 01 f1
; asm: addl %ecx, %esi
[-,%rsi] v11 = iadd v2, v1 ; bin: 01 ce
; asm: subl %esi, %ecx
[-,%rcx] v12 = isub v1, v2 ; bin: 29 f1
; asm: subl %ecx, %esi
[-,%rsi] v13 = isub v2, v1 ; bin: 29 ce
; asm: andl %esi, %ecx
[-,%rcx] v14 = band v1, v2 ; bin: 21 f1
; asm: andl %ecx, %esi
[-,%rsi] v15 = band v2, v1 ; bin: 21 ce
; asm: orl %esi, %ecx
[-,%rcx] v16 = bor v1, v2 ; bin: 09 f1
; asm: orl %ecx, %esi
[-,%rsi] v17 = bor v2, v1 ; bin: 09 ce
; asm: xorl %esi, %ecx
[-,%rcx] v18 = bxor v1, v2 ; bin: 31 f1
; asm: xorl %ecx, %esi
[-,%rsi] v19 = bxor v2, v1 ; bin: 31 ce
; Dynamic shifts take the shift amount in %rcx.
; asm: shll %cl, %esi
[-,%rsi] v20 = ishl v2, v1 ; bin: d3 e6
; asm: shll %cl, %ecx
[-,%rcx] v21 = ishl v1, v1 ; bin: d3 e1
; asm: shrl %cl, %esi
[-,%rsi] v22 = ushr v2, v1 ; bin: d3 ee
; asm: shrl %cl, %ecx
[-,%rcx] v23 = ushr v1, v1 ; bin: d3 e9
; asm: sarl %cl, %esi
[-,%rsi] v24 = sshr v2, v1 ; bin: d3 fe
; asm: sarl %cl, %ecx
[-,%rcx] v25 = sshr v1, v1 ; bin: d3 f9
; asm: roll %cl, %esi
[-,%rsi] v26 = rotl v2, v1 ; bin: d3 c6
; asm: roll %cl, %ecx
[-,%rcx] v27 = rotl v1, v1 ; bin: d3 c1
; asm: rorl %cl, %esi
[-,%rsi] v28 = rotr v2, v1 ; bin: d3 ce
; asm: rorl %cl, %ecx
[-,%rcx] v29 = rotr v1, v1 ; bin: d3 c9
; Integer Register - Immediate 8-bit operations.
; The 8-bit immediate is sign-extended.
; asm: addl $-128, %ecx
[-,%rcx] v30 = iadd_imm v1, -128 ; bin: 83 c1 80
; asm: addl $10, %esi
[-,%rsi] v31 = iadd_imm v2, 10 ; bin: 83 c6 0a
; asm: andl $-128, %ecx
[-,%rcx] v32 = band_imm v1, -128 ; bin: 83 e1 80
; asm: andl $10, %esi
[-,%rsi] v33 = band_imm v2, 10 ; bin: 83 e6 0a
; asm: orl $-128, %ecx
[-,%rcx] v34 = bor_imm v1, -128 ; bin: 83 c9 80
; asm: orl $10, %esi
[-,%rsi] v35 = bor_imm v2, 10 ; bin: 83 ce 0a
; asm: xorl $-128, %ecx
[-,%rcx] v36 = bxor_imm v1, -128 ; bin: 83 f1 80
; asm: xorl $10, %esi
[-,%rsi] v37 = bxor_imm v2, 10 ; bin: 83 f6 0a
; Integer Register - Immediate 32-bit operations.
; asm: addl $-128000, %ecx
[-,%rcx] v40 = iadd_imm v1, -128000 ; bin: 81 c1 fffe0c00
; asm: addl $1000000, %esi
[-,%rsi] v41 = iadd_imm v2, 1000000 ; bin: 81 c6 000f4240
; asm: andl $-128000, %ecx
[-,%rcx] v42 = band_imm v1, -128000 ; bin: 81 e1 fffe0c00
; asm: andl $1000000, %esi
[-,%rsi] v43 = band_imm v2, 1000000 ; bin: 81 e6 000f4240
; asm: orl $-128000, %ecx
[-,%rcx] v44 = bor_imm v1, -128000 ; bin: 81 c9 fffe0c00
; asm: orl $1000000, %esi
[-,%rsi] v45 = bor_imm v2, 1000000 ; bin: 81 ce 000f4240
; asm: xorl $-128000, %ecx
[-,%rcx] v46 = bxor_imm v1, -128000 ; bin: 81 f1 fffe0c00
; asm: xorl $1000000, %esi
[-,%rsi] v47 = bxor_imm v2, 1000000 ; bin: 81 f6 000f4240
; More arithmetic.
; asm: imull %esi, %ecx
[-,%rcx] v50 = imul v1, v2 ; bin: 0f af ce
; asm: imull %ecx, %esi
[-,%rsi] v51 = imul v2, v1 ; bin: 0f af f1
; asm: movl $1, %eax
[-,%rax] v52 = iconst.i32 1 ; bin: b8 00000001
; asm: movl $2, %edx
[-,%rdx] v53 = iconst.i32 2 ; bin: ba 00000002
; asm: idivl %ecx
[-,%rax,%rdx] v54, v55 = x86_sdivmodx v52, v53, v1 ; bin: int_divz f7 f9
; asm: idivl %esi
[-,%rax,%rdx] v56, v57 = x86_sdivmodx v52, v53, v2 ; bin: int_divz f7 fe
; asm: divl %ecx
[-,%rax,%rdx] v58, v59 = x86_udivmodx v52, v53, v1 ; bin: int_divz f7 f1
; asm: divl %esi
[-,%rax,%rdx] v60, v61 = x86_udivmodx v52, v53, v2 ; bin: int_divz f7 f6
; Register copies.
; asm: movl %esi, %ecx
[-,%rcx] v80 = copy v2 ; bin: 89 f1
; asm: movl %ecx, %esi
[-,%rsi] v81 = copy v1 ; bin: 89 ce
; Copy Special
; asm: movl %esp, %ebp
copy_special %rsp -> %rbp ; bin: 89 e5
; asm: movl %ebp, %esp
copy_special %rbp -> %rsp ; bin: 89 ec
; Load/Store instructions.
; Register indirect addressing with no displacement.
; asm: movl %ecx, (%esi)
store v1, v2 ; bin: heap_oob 89 0e
; asm: movl %esi, (%ecx)
store v2, v1 ; bin: heap_oob 89 31
; asm: movw %cx, (%esi)
istore16 v1, v2 ; bin: heap_oob 66 89 0e
; asm: movw %si, (%ecx)
istore16 v2, v1 ; bin: heap_oob 66 89 31
; asm: movb %cl, (%esi)
istore8 v1, v2 ; bin: heap_oob 88 0e
; Can't store %sil in 32-bit mode (needs REX prefix).
; asm: movl (%ecx), %edi
[-,%rdi] v100 = load.i32 v1 ; bin: heap_oob 8b 39
; asm: movl (%esi), %edx
[-,%rdx] v101 = load.i32 v2 ; bin: heap_oob 8b 16
; asm: movzwl (%ecx), %edi
[-,%rdi] v102 = uload16.i32 v1 ; bin: heap_oob 0f b7 39
; asm: movzwl (%esi), %edx
[-,%rdx] v103 = uload16.i32 v2 ; bin: heap_oob 0f b7 16
; asm: movswl (%ecx), %edi
[-,%rdi] v104 = sload16.i32 v1 ; bin: heap_oob 0f bf 39
; asm: movswl (%esi), %edx
[-,%rdx] v105 = sload16.i32 v2 ; bin: heap_oob 0f bf 16
; asm: movzbl (%ecx), %edi
[-,%rdi] v106 = uload8.i32 v1 ; bin: heap_oob 0f b6 39
; asm: movzbl (%esi), %edx
[-,%rdx] v107 = uload8.i32 v2 ; bin: heap_oob 0f b6 16
; asm: movsbl (%ecx), %edi
[-,%rdi] v108 = sload8.i32 v1 ; bin: heap_oob 0f be 39
; asm: movsbl (%esi), %edx
[-,%rdx] v109 = sload8.i32 v2 ; bin: heap_oob 0f be 16
; Register-indirect with 8-bit signed displacement.
; asm: movl %ecx, 100(%esi)
store v1, v2+100 ; bin: heap_oob 89 4e 64
; asm: movl %esi, -100(%ecx)
store v2, v1-100 ; bin: heap_oob 89 71 9c
; asm: movw %cx, 100(%esi)
istore16 v1, v2+100 ; bin: heap_oob 66 89 4e 64
; asm: movw %si, -100(%ecx)
istore16 v2, v1-100 ; bin: heap_oob 66 89 71 9c
; asm: movb %cl, 100(%esi)
istore8 v1, v2+100 ; bin: heap_oob 88 4e 64
; asm: movl 50(%ecx), %edi
[-,%rdi] v110 = load.i32 v1+50 ; bin: heap_oob 8b 79 32
; asm: movl -50(%esi), %edx
[-,%rdx] v111 = load.i32 v2-50 ; bin: heap_oob 8b 56 ce
; asm: movzwl 50(%ecx), %edi
[-,%rdi] v112 = uload16.i32 v1+50 ; bin: heap_oob 0f b7 79 32
; asm: movzwl -50(%esi), %edx
[-,%rdx] v113 = uload16.i32 v2-50 ; bin: heap_oob 0f b7 56 ce
; asm: movswl 50(%ecx), %edi
[-,%rdi] v114 = sload16.i32 v1+50 ; bin: heap_oob 0f bf 79 32
; asm: movswl -50(%esi), %edx
[-,%rdx] v115 = sload16.i32 v2-50 ; bin: heap_oob 0f bf 56 ce
; asm: movzbl 50(%ecx), %edi
[-,%rdi] v116 = uload8.i32 v1+50 ; bin: heap_oob 0f b6 79 32
; asm: movzbl -50(%esi), %edx
[-,%rdx] v117 = uload8.i32 v2-50 ; bin: heap_oob 0f b6 56 ce
; asm: movsbl 50(%ecx), %edi
[-,%rdi] v118 = sload8.i32 v1+50 ; bin: heap_oob 0f be 79 32
; asm: movsbl -50(%esi), %edx
[-,%rdx] v119 = sload8.i32 v2-50 ; bin: heap_oob 0f be 56 ce
; Register-indirect with 32-bit signed displacement.
; asm: movl %ecx, 10000(%esi)
store v1, v2+10000 ; bin: heap_oob 89 8e 00002710
; asm: movl %esi, -10000(%ecx)
store v2, v1-10000 ; bin: heap_oob 89 b1 ffffd8f0
; asm: movw %cx, 10000(%esi)
istore16 v1, v2+10000 ; bin: heap_oob 66 89 8e 00002710
; asm: movw %si, -10000(%ecx)
istore16 v2, v1-10000 ; bin: heap_oob 66 89 b1 ffffd8f0
; asm: movb %cl, 10000(%esi)
istore8 v1, v2+10000 ; bin: heap_oob 88 8e 00002710
; asm: movl 50000(%ecx), %edi
[-,%rdi] v120 = load.i32 v1+50000 ; bin: heap_oob 8b b9 0000c350
; asm: movl -50000(%esi), %edx
[-,%rdx] v121 = load.i32 v2-50000 ; bin: heap_oob 8b 96 ffff3cb0
; asm: movzwl 50000(%ecx), %edi
[-,%rdi] v122 = uload16.i32 v1+50000 ; bin: heap_oob 0f b7 b9 0000c350
; asm: movzwl -50000(%esi), %edx
[-,%rdx] v123 = uload16.i32 v2-50000 ; bin: heap_oob 0f b7 96 ffff3cb0
; asm: movswl 50000(%ecx), %edi
[-,%rdi] v124 = sload16.i32 v1+50000 ; bin: heap_oob 0f bf b9 0000c350
; asm: movswl -50000(%esi), %edx
[-,%rdx] v125 = sload16.i32 v2-50000 ; bin: heap_oob 0f bf 96 ffff3cb0
; asm: movzbl 50000(%ecx), %edi
[-,%rdi] v126 = uload8.i32 v1+50000 ; bin: heap_oob 0f b6 b9 0000c350
; asm: movzbl -50000(%esi), %edx
[-,%rdx] v127 = uload8.i32 v2-50000 ; bin: heap_oob 0f b6 96 ffff3cb0
; asm: movsbl 50000(%ecx), %edi
[-,%rdi] v128 = sload8.i32 v1+50000 ; bin: heap_oob 0f be b9 0000c350
; asm: movsbl -50000(%esi), %edx
[-,%rdx] v129 = sload8.i32 v2-50000 ; bin: heap_oob 0f be 96 ffff3cb0
; Bit-counting instructions.
; asm: popcntl %esi, %ecx
[-,%rcx] v200 = popcnt v2 ; bin: f3 0f b8 ce
; asm: popcntl %ecx, %esi
[-,%rsi] v201 = popcnt v1 ; bin: f3 0f b8 f1
; asm: lzcntl %esi, %ecx
[-,%rcx] v202 = clz v2 ; bin: f3 0f bd ce
; asm: lzcntl %ecx, %esi
[-,%rsi] v203 = clz v1 ; bin: f3 0f bd f1
; asm: tzcntl %esi, %ecx
[-,%rcx] v204 = ctz v2 ; bin: f3 0f bc ce
; asm: tzcntl %ecx, %esi
[-,%rsi] v205 = ctz v1 ; bin: f3 0f bc f1
; Integer comparisons.
; asm: cmpl %esi, %ecx
; asm: sete %bl
[-,%rbx] v300 = icmp eq v1, v2 ; bin: 39 f1 0f 94 c3
; asm: cmpl %ecx, %esi
; asm: sete %dl
[-,%rdx] v301 = icmp eq v2, v1 ; bin: 39 ce 0f 94 c2
; asm: cmpl %esi, %ecx
; asm: setne %bl
[-,%rbx] v302 = icmp ne v1, v2 ; bin: 39 f1 0f 95 c3
; asm: cmpl %ecx, %esi
; asm: setne %dl
[-,%rdx] v303 = icmp ne v2, v1 ; bin: 39 ce 0f 95 c2
; asm: cmpl %esi, %ecx
; asm: setl %bl
[-,%rbx] v304 = icmp slt v1, v2 ; bin: 39 f1 0f 9c c3
; asm: cmpl %ecx, %esi
; asm: setl %dl
[-,%rdx] v305 = icmp slt v2, v1 ; bin: 39 ce 0f 9c c2
; asm: cmpl %esi, %ecx
; asm: setge %bl
[-,%rbx] v306 = icmp sge v1, v2 ; bin: 39 f1 0f 9d c3
; asm: cmpl %ecx, %esi
; asm: setge %dl
[-,%rdx] v307 = icmp sge v2, v1 ; bin: 39 ce 0f 9d c2
; asm: cmpl %esi, %ecx
; asm: setg %bl
[-,%rbx] v308 = icmp sgt v1, v2 ; bin: 39 f1 0f 9f c3
; asm: cmpl %ecx, %esi
; asm: setg %dl
[-,%rdx] v309 = icmp sgt v2, v1 ; bin: 39 ce 0f 9f c2
; asm: cmpl %esi, %ecx
; asm: setle %bl
[-,%rbx] v310 = icmp sle v1, v2 ; bin: 39 f1 0f 9e c3
; asm: cmpl %ecx, %esi
; asm: setle %dl
[-,%rdx] v311 = icmp sle v2, v1 ; bin: 39 ce 0f 9e c2
; asm: cmpl %esi, %ecx
; asm: setb %bl
[-,%rbx] v312 = icmp ult v1, v2 ; bin: 39 f1 0f 92 c3
; asm: cmpl %ecx, %esi
; asm: setb %dl
[-,%rdx] v313 = icmp ult v2, v1 ; bin: 39 ce 0f 92 c2
; asm: cmpl %esi, %ecx
; asm: setae %bl
[-,%rbx] v314 = icmp uge v1, v2 ; bin: 39 f1 0f 93 c3
; asm: cmpl %ecx, %esi
; asm: setae %dl
[-,%rdx] v315 = icmp uge v2, v1 ; bin: 39 ce 0f 93 c2
; asm: cmpl %esi, %ecx
; asm: seta %bl
[-,%rbx] v316 = icmp ugt v1, v2 ; bin: 39 f1 0f 97 c3
; asm: cmpl %ecx, %esi
; asm: seta %dl
[-,%rdx] v317 = icmp ugt v2, v1 ; bin: 39 ce 0f 97 c2
; asm: cmpl %esi, %ecx
; asm: setbe %bl
[-,%rbx] v318 = icmp ule v1, v2 ; bin: 39 f1 0f 96 c3
; asm: cmpl %ecx, %esi
; asm: setbe %dl
[-,%rdx] v319 = icmp ule v2, v1 ; bin: 39 ce 0f 96 c2
; Bool-to-int conversions.
; asm: movzbl %bl, %ecx
[-,%rcx] v350 = bint.i32 v300 ; bin: 0f b6 cb
; asm: movzbl %dl, %esi
[-,%rsi] v351 = bint.i32 v301 ; bin: 0f b6 f2
; asm: call foo
call fn0() ; bin: e8 PCRel4(%foo-4) 00000000
; asm: movl $0, %ecx
[-,%rcx] v400 = func_addr.i32 fn0 ; bin: b9 Abs4(%foo) 00000000
; asm: movl $0, %esi
[-,%rsi] v401 = func_addr.i32 fn0 ; bin: be Abs4(%foo) 00000000
; asm: call *%ecx
call_indirect sig0, v400() ; bin: ff d1
; asm: call *%esi
call_indirect sig0, v401() ; bin: ff d6
; asm: movl $0, %ecx
[-,%rcx] v450 = globalsym_addr.i32 gv0 ; bin: b9 Abs4(%some_gv) 00000000
; asm: movl $0, %esi
[-,%rsi] v451 = globalsym_addr.i32 gv0 ; bin: be Abs4(%some_gv) 00000000
; Spill / Fill.
; asm: movl %ecx, 1032(%esp)
[-,ss1] v500 = spill v1 ; bin: 89 8c 24 00000408
; asm: movl %esi, 1032(%esp)
[-,ss1] v501 = spill v2 ; bin: 89 b4 24 00000408
; asm: movl 1032(%esp), %ecx
[-,%rcx] v510 = fill v500 ; bin: 8b 8c 24 00000408
; asm: movl 1032(%esp), %esi
[-,%rsi] v511 = fill v501 ; bin: 8b b4 24 00000408
; asm: movl %ecx, 1032(%esp)
regspill v1, %rcx -> ss1 ; bin: 89 8c 24 00000408
; asm: movl 1032(%esp), %ecx
regfill v1, ss1 -> %rcx ; bin: 8b 8c 24 00000408
; Push and Pop
; asm: pushl %ecx
x86_push v1 ; bin: 51
; asm: popl %ecx
[-,%rcx] v512 = x86_pop.i32 ; bin: 59
; Adjust Stack Pointer
; asm: addl $64, %esp
adjust_sp_imm 64 ; bin: 83 c4 40
; asm: addl $-64, %esp
adjust_sp_imm -64 ; bin: 83 c4 c0
; asm: addl $1024, %esp
adjust_sp_imm 1024 ; bin: 81 c4 00000400
; asm: addl $-1024, %esp
adjust_sp_imm -1024 ; bin: 81 c4 fffffc00
; asm: addl $2147483647, %esp
adjust_sp_imm 2147483647 ; bin: 81 c4 7fffffff
; asm: addl $-2147483648, %esp
adjust_sp_imm -2147483648 ; bin: 81 c4 80000000
; Shift immediates
; asm: shll $2, %esi
[-,%rsi] v513 = ishl_imm v2, 2 ; bin: c1 e6 02
; asm: sarl $5, %esi
[-,%rsi] v514 = sshr_imm v2, 5 ; bin: c1 fe 05
; asm: shrl $8, %esi
[-,%rsi] v515 = ushr_imm v2, 8 ; bin: c1 ee 08
; asm: testl %ecx, %ecx
; asm: je ebb1
brz v1, ebb1 ; bin: 85 c9 74 0e
; asm: testl %esi, %esi
; asm: je ebb1
brz v2, ebb1 ; bin: 85 f6 74 0a
; asm: testl %ecx, %ecx
; asm: jne ebb1
brnz v1, ebb1 ; bin: 85 c9 75 06
; asm: testl %esi, %esi
; asm: jne ebb1
brnz v2, ebb1 ; bin: 85 f6 75 02
; asm: jmp ebb2
jump ebb2 ; bin: eb 01
; asm: ebb1:
ebb1:
; asm: ret
return ; bin: c3
; asm: ebb2:
ebb2:
trap user0 ; bin: user0 0f 0b
}
; Special branch encodings only for I32 mode.
function %special_branches() {
ebb0:
[-,%rcx] v1 = iconst.i32 1
[-,%rsi] v2 = iconst.i32 2
[-,%rdi] v3 = icmp eq v1, v2
[-,%rbx] v4 = icmp ugt v1, v2
; asm: testl $0xff, %edi
; asm: je ebb1
brz v3, ebb1 ; bin: f7 c7 000000ff 0f 84 00000015
; asm: testb %bl, %bl
; asm: je ebb1
brz v4, ebb1 ; bin: 84 db 74 11
; asm: testl $0xff, %edi
; asm: jne ebb1
brnz v3, ebb1 ; bin: f7 c7 000000ff 0f 85 00000005
; asm: testb %bl, %bl
; asm: jne ebb1
brnz v4, ebb1 ; bin: 84 db 75 01
return
ebb1:
return
}
; CPU flag instructions.
function %cpu_flags() {
ebb0:
[-,%rcx] v1 = iconst.i32 1
[-,%rsi] v2 = iconst.i32 2
jump ebb1
ebb1:
; asm: cmpl %esi, %ecx
[-,%rflags] v10 = ifcmp v1, v2 ; bin: 39 f1
; asm: cmpl %ecx, %esi
[-,%rflags] v11 = ifcmp v2, v1 ; bin: 39 ce
; asm: je ebb1
brif eq v11, ebb1 ; bin: 74 fa
; asm: jne ebb1
brif ne v11, ebb1 ; bin: 75 f8
; asm: jl ebb1
brif slt v11, ebb1 ; bin: 7c f6
; asm: jge ebb1
brif sge v11, ebb1 ; bin: 7d f4
; asm: jg ebb1
brif sgt v11, ebb1 ; bin: 7f f2
; asm: jle ebb1
brif sle v11, ebb1 ; bin: 7e f0
; asm: jb ebb1
brif ult v11, ebb1 ; bin: 72 ee
; asm: jae ebb1
brif uge v11, ebb1 ; bin: 73 ec
; asm: ja ebb1
brif ugt v11, ebb1 ; bin: 77 ea
; asm: jbe ebb1
brif ule v11, ebb1 ; bin: 76 e8
; asm: sete %bl
[-,%rbx] v20 = trueif eq v11 ; bin: 0f 94 c3
; asm: setne %bl
[-,%rbx] v21 = trueif ne v11 ; bin: 0f 95 c3
; asm: setl %dl
[-,%rdx] v22 = trueif slt v11 ; bin: 0f 9c c2
; asm: setge %dl
[-,%rdx] v23 = trueif sge v11 ; bin: 0f 9d c2
; asm: setg %bl
[-,%rbx] v24 = trueif sgt v11 ; bin: 0f 9f c3
; asm: setle %bl
[-,%rbx] v25 = trueif sle v11 ; bin: 0f 9e c3
; asm: setb %dl
[-,%rdx] v26 = trueif ult v11 ; bin: 0f 92 c2
; asm: setae %dl
[-,%rdx] v27 = trueif uge v11 ; bin: 0f 93 c2
; asm: seta %bl
[-,%rbx] v28 = trueif ugt v11 ; bin: 0f 97 c3
; asm: setbe %bl
[-,%rbx] v29 = trueif ule v11 ; bin: 0f 96 c3
; The trapif instructions are encoded as macros: a conditional jump over a ud2.
; asm: jne .+4; ud2
trapif eq v11, user0 ; bin: 75 02 user0 0f 0b
; asm: je .+4; ud2
trapif ne v11, user0 ; bin: 74 02 user0 0f 0b
; asm: jnl .+4; ud2
trapif slt v11, user0 ; bin: 7d 02 user0 0f 0b
; asm: jnge .+4; ud2
trapif sge v11, user0 ; bin: 7c 02 user0 0f 0b
; asm: jng .+4; ud2
trapif sgt v11, user0 ; bin: 7e 02 user0 0f 0b
; asm: jnle .+4; ud2
trapif sle v11, user0 ; bin: 7f 02 user0 0f 0b
; asm: jnb .+4; ud2
trapif ult v11, user0 ; bin: 73 02 user0 0f 0b
; asm: jnae .+4; ud2
trapif uge v11, user0 ; bin: 72 02 user0 0f 0b
; asm: jna .+4; ud2
trapif ugt v11, user0 ; bin: 76 02 user0 0f 0b
; asm: jnbe .+4; ud2
trapif ule v11, user0 ; bin: 77 02 user0 0f 0b
; Stack check.
; asm: cmpl %esp, %ecx
[-,%rflags] v40 = ifcmp_sp v1 ; bin: 39 e1
; asm: cmpl %esp, %esi
[-,%rflags] v41 = ifcmp_sp v2 ; bin: 39 e6
; asm: cmpl $-100, %ecx
[-,%rflags] v42 = ifcmp_imm v1, -100 ; bin: 83 f9 9c
; asm: cmpl $100, %esi
[-,%rflags] v43 = ifcmp_imm v2, 100 ; bin: 83 fe 64
; asm: cmpl $-10000, %ecx
[-,%rflags] v44 = ifcmp_imm v1, -10000 ; bin: 81 f9 ffffd8f0
; asm: cmpl $10000, %esi
[-,%rflags] v45 = ifcmp_imm v2, 10000 ; bin: 81 fe 00002710
return
}
; Tests for i32/i8 conversion instructions.
function %I32_I8() {
ebb0:
[-,%rcx] v1 = iconst.i32 1
[-,%rcx] v11 = ireduce.i8 v1 ; bin:
; asm: movsbl %cl, %esi
[-,%rsi] v20 = sextend.i32 v11 ; bin: 0f be f1
; asm: movzbl %cl, %esi
[-,%rsi] v30 = uextend.i32 v11 ; bin: 0f b6 f1
trap user0 ; bin: user0 0f 0b
}
; Tests for i32/i16 conversion instructions.
function %I32_I16() {
ebb0:
[-,%rcx] v1 = iconst.i32 1
[-,%rcx] v11 = ireduce.i16 v1 ; bin:
; asm: movswl %cx, %esi
[-,%rsi] v20 = sextend.i32 v11 ; bin: 0f bf f1
; asm: movzwl %cx, %esi
[-,%rsi] v30 = uextend.i32 v11 ; bin: 0f b7 f1
trap user0 ; bin: user0 0f 0b
}

View File

@@ -0,0 +1,542 @@
; Binary emission of 64-bit floating point code.
test binemit
set is_64bit
set is_compressed
isa x86 haswell
; The binary encodings can be verified with the command:
;
; sed -ne 's/^ *; asm: *//p' filetests/isa/x86/binary64-float.cton | llvm-mc -show-encoding -triple=x86_64
;
function %F32() {
ss0 = incoming_arg 8, offset 0
ss1 = incoming_arg 1024, offset -1024
ss2 = incoming_arg 1024, offset -2048
ss3 = incoming_arg 8, offset -2056
ebb0:
[-,%r11] v0 = iconst.i32 1
[-,%rsi] v1 = iconst.i32 2
[-,%rax] v2 = iconst.i64 11
[-,%r14] v3 = iconst.i64 12
[-,%r13] v4 = iconst.i64 13
; asm: cvtsi2ssl %r11d, %xmm5
[-,%xmm5] v10 = fcvt_from_sint.f32 v0 ; bin: f3 41 0f 2a eb
; asm: cvtsi2ssl %esi, %xmm10
[-,%xmm10] v11 = fcvt_from_sint.f32 v1 ; bin: f3 44 0f 2a d6
; asm: cvtsi2ssq %rax, %xmm5
[-,%xmm5] v12 = fcvt_from_sint.f32 v2 ; bin: f3 48 0f 2a e8
; asm: cvtsi2ssq %r14, %xmm10
[-,%xmm10] v13 = fcvt_from_sint.f32 v3 ; bin: f3 4d 0f 2a d6
; asm: cvtss2sd %xmm10, %xmm5
[-,%xmm5] v14 = fpromote.f64 v11 ; bin: f3 41 0f 5a ea
; asm: cvtss2sd %xmm5, %xmm10
[-,%xmm10] v15 = fpromote.f64 v10 ; bin: f3 44 0f 5a d5
; asm: movd %r11d, %xmm5
[-,%xmm5] v16 = bitcast.f32 v0 ; bin: 66 41 0f 6e eb
; asm: movd %esi, %xmm10
[-,%xmm10] v17 = bitcast.f32 v1 ; bin: 66 44 0f 6e d6
; asm: movd %xmm5, %ecx
[-,%rcx] v18 = bitcast.i32 v10 ; bin: 66 0f 7e e9
; asm: movd %xmm10, %esi
[-,%rsi] v19 = bitcast.i32 v11 ; bin: 66 44 0f 7e d6
; Binary arithmetic.
; asm: addss %xmm10, %xmm5
[-,%xmm5] v20 = fadd v10, v11 ; bin: f3 41 0f 58 ea
; asm: addss %xmm5, %xmm10
[-,%xmm10] v21 = fadd v11, v10 ; bin: f3 44 0f 58 d5
; asm: subss %xmm10, %xmm5
[-,%xmm5] v22 = fsub v10, v11 ; bin: f3 41 0f 5c ea
; asm: subss %xmm5, %xmm10
[-,%xmm10] v23 = fsub v11, v10 ; bin: f3 44 0f 5c d5
; asm: mulss %xmm10, %xmm5
[-,%xmm5] v24 = fmul v10, v11 ; bin: f3 41 0f 59 ea
; asm: mulss %xmm5, %xmm10
[-,%xmm10] v25 = fmul v11, v10 ; bin: f3 44 0f 59 d5
; asm: divss %xmm10, %xmm5
[-,%xmm5] v26 = fdiv v10, v11 ; bin: f3 41 0f 5e ea
; asm: divss %xmm5, %xmm10
[-,%xmm10] v27 = fdiv v11, v10 ; bin: f3 44 0f 5e d5
; Bitwise ops.
; We use the *ps SSE instructions for everything because they are smaller.
; asm: andps %xmm10, %xmm5
[-,%xmm5] v30 = band v10, v11 ; bin: 41 0f 54 ea
; asm: andps %xmm5, %xmm10
[-,%xmm10] v31 = band v11, v10 ; bin: 44 0f 54 d5
; asm: andnps %xmm10, %xmm5
[-,%xmm5] v32 = band_not v11, v10 ; bin: 41 0f 55 ea
; asm: andnps %xmm5, %xmm10
[-,%xmm10] v33 = band_not v10, v11 ; bin: 44 0f 55 d5
; asm: orps %xmm10, %xmm5
[-,%xmm5] v34 = bor v10, v11 ; bin: 41 0f 56 ea
; asm: orps %xmm5, %xmm10
[-,%xmm10] v35 = bor v11, v10 ; bin: 44 0f 56 d5
; asm: xorps %xmm10, %xmm5
[-,%xmm5] v36 = bxor v10, v11 ; bin: 41 0f 57 ea
; asm: xorps %xmm5, %xmm10
[-,%xmm10] v37 = bxor v11, v10 ; bin: 44 0f 57 d5
; asm: movaps %xmm10, %xmm5
[-,%xmm5] v38 = copy v11 ; bin: 41 0f 28 ea
; asm: movaps %xmm5, %xmm10
[-,%xmm10] v39 = copy v10 ; bin: 44 0f 28 d5
; Convert float to int.
; asm: cvttss2si %xmm5, %ecx
[-,%rcx] v40 = x86_cvtt2si.i32 v10 ; bin: f3 0f 2c cd
; asm: cvttss2si %xmm10, %esi
[-,%rsi] v41 = x86_cvtt2si.i32 v11 ; bin: f3 41 0f 2c f2
; asm: cvttss2si %xmm5, %rcx
[-,%rcx] v42 = x86_cvtt2si.i64 v10 ; bin: f3 48 0f 2c cd
; asm: cvttss2si %xmm10, %rsi
[-,%rsi] v43 = x86_cvtt2si.i64 v11 ; bin: f3 49 0f 2c f2
; Min/max.
; asm: minss %xmm10, %xmm5
[-,%xmm5] v44 = x86_fmin v10, v11 ; bin: f3 41 0f 5d ea
; asm: minss %xmm5, %xmm10
[-,%xmm10] v45 = x86_fmin v11, v10 ; bin: f3 44 0f 5d d5
; asm: maxss %xmm10, %xmm5
[-,%xmm5] v46 = x86_fmax v10, v11 ; bin: f3 41 0f 5f ea
; asm: maxss %xmm5, %xmm10
[-,%xmm10] v47 = x86_fmax v11, v10 ; bin: f3 44 0f 5f d5
; Unary arithmetic.
; asm: sqrtss %xmm5, %xmm10
[-,%xmm10] v50 = sqrt v10 ; bin: f3 44 0f 51 d5
; asm: sqrtss %xmm10, %xmm5
[-,%xmm5] v51 = sqrt v11 ; bin: f3 41 0f 51 ea
; asm: roundss $0, %xmm5, %xmm10
[-,%xmm10] v52 = nearest v10 ; bin: 66 44 0f 3a 0a d5 00
; asm: roundss $0, %xmm10, %xmm5
[-,%xmm5] v53 = nearest v11 ; bin: 66 41 0f 3a 0a ea 00
; asm: roundss $0, %xmm5, %xmm2
[-,%xmm2] v54 = nearest v10 ; bin: 66 0f 3a 0a d5 00
; asm: roundss $1, %xmm5, %xmm10
[-,%xmm10] v55 = floor v10 ; bin: 66 44 0f 3a 0a d5 01
; asm: roundss $1, %xmm10, %xmm5
[-,%xmm5] v56 = floor v11 ; bin: 66 41 0f 3a 0a ea 01
; asm: roundss $1, %xmm5, %xmm2
[-,%xmm2] v57 = floor v10 ; bin: 66 0f 3a 0a d5 01
; asm: roundss $2, %xmm5, %xmm10
[-,%xmm10] v58 = ceil v10 ; bin: 66 44 0f 3a 0a d5 02
; asm: roundss $2, %xmm10, %xmm5
[-,%xmm5] v59 = ceil v11 ; bin: 66 41 0f 3a 0a ea 02
; asm: roundss $2, %xmm5, %xmm2
[-,%xmm2] v60 = ceil v10 ; bin: 66 0f 3a 0a d5 02
; asm: roundss $3, %xmm5, %xmm10
[-,%xmm10] v61 = trunc v10 ; bin: 66 44 0f 3a 0a d5 03
; asm: roundss $3, %xmm10, %xmm5
[-,%xmm5] v62 = trunc v11 ; bin: 66 41 0f 3a 0a ea 03
; asm: roundss $3, %xmm5, %xmm2
[-,%xmm2] v63 = trunc v10 ; bin: 66 0f 3a 0a d5 03
; Load/Store
; asm: movss (%r14), %xmm5
[-,%xmm5] v100 = load.f32 v3 ; bin: heap_oob f3 41 0f 10 2e
; asm: movss (%rax), %xmm10
[-,%xmm10] v101 = load.f32 v2 ; bin: heap_oob f3 44 0f 10 10
; asm: movss 50(%r14), %xmm5
[-,%xmm5] v110 = load.f32 v3+50 ; bin: heap_oob f3 41 0f 10 6e 32
; asm: movss -50(%rax), %xmm10
[-,%xmm10] v111 = load.f32 v2-50 ; bin: heap_oob f3 44 0f 10 50 ce
; asm: movss 10000(%r14), %xmm5
[-,%xmm5] v120 = load.f32 v3+10000 ; bin: heap_oob f3 41 0f 10 ae 00002710
; asm: movss -10000(%rax), %xmm10
[-,%xmm10] v121 = load.f32 v2-10000 ; bin: heap_oob f3 44 0f 10 90 ffffd8f0
; asm: movss %xmm5, (%r14)
[-] store.f32 v100, v3 ; bin: heap_oob f3 41 0f 11 2e
; asm: movss %xmm10, (%rax)
[-] store.f32 v101, v2 ; bin: heap_oob f3 44 0f 11 10
; asm: movss %xmm5, (%r13)
[-] store.f32 v100, v4 ; bin: heap_oob f3 41 0f 11 6d 00
; asm: movss %xmm10, (%r13)
[-] store.f32 v101, v4 ; bin: heap_oob f3 45 0f 11 55 00
; asm: movss %xmm5, 50(%r14)
[-] store.f32 v100, v3+50 ; bin: heap_oob f3 41 0f 11 6e 32
; asm: movss %xmm10, -50(%rax)
[-] store.f32 v101, v2-50 ; bin: heap_oob f3 44 0f 11 50 ce
; asm: movss %xmm5, 10000(%r14)
[-] store.f32 v100, v3+10000 ; bin: heap_oob f3 41 0f 11 ae 00002710
; asm: movss %xmm10, -10000(%rax)
[-] store.f32 v101, v2-10000 ; bin: heap_oob f3 44 0f 11 90 ffffd8f0
; Spill / Fill.
; asm: movss %xmm5, 1032(%rsp)
[-,ss1] v200 = spill v100 ; bin: f3 0f 11 ac 24 00000408
; asm: movss %xmm10, 1032(%rsp)
[-,ss1] v201 = spill v101 ; bin: f3 44 0f 11 94 24 00000408
; asm: movss 1032(%rsp), %xmm5
[-,%xmm5] v210 = fill v200 ; bin: f3 0f 10 ac 24 00000408
; asm: movss 1032(%rsp), %xmm10
[-,%xmm10] v211 = fill v201 ; bin: f3 44 0f 10 94 24 00000408
; asm: movss %xmm5, 1032(%rsp)
regspill v100, %xmm5 -> ss1 ; bin: f3 0f 11 ac 24 00000408
; asm: movss 1032(%rsp), %xmm5
regfill v100, ss1 -> %xmm5 ; bin: f3 0f 10 ac 24 00000408
; Comparisons.
;
; Only `supported_floatccs` are tested here. Others are handled by
; legalization paterns.
; asm: ucomiss %xmm10, %xmm5
; asm: setnp %bl
[-,%rbx] v300 = fcmp ord v10, v11 ; bin: 41 0f 2e ea 0f 9b c3
; asm: ucomiss %xmm5, %xmm10
; asm: setp %bl
[-,%rbx] v301 = fcmp uno v11, v10 ; bin: 44 0f 2e d5 0f 9a c3
; asm: ucomiss %xmm10, %xmm5
; asm: setne %dl
[-,%rdx] v302 = fcmp one v10, v11 ; bin: 41 0f 2e ea 0f 95 c2
; asm: ucomiss %xmm5, %xmm10
; asm: sete %dl
[-,%rdx] v303 = fcmp ueq v11, v10 ; bin: 44 0f 2e d5 0f 94 c2
; asm: ucomiss %xmm10, %xmm5
; asm: seta %bl
[-,%rbx] v304 = fcmp gt v10, v11 ; bin: 41 0f 2e ea 0f 97 c3
; asm: ucomiss %xmm5, %xmm10
; asm: setae %bl
[-,%rbx] v305 = fcmp ge v11, v10 ; bin: 44 0f 2e d5 0f 93 c3
; asm: ucomiss %xmm10, %xmm5
; asm: setb %dl
[-,%rdx] v306 = fcmp ult v10, v11 ; bin: 41 0f 2e ea 0f 92 c2
; asm: ucomiss %xmm5, %xmm10
; asm: setbe %dl
[-,%rdx] v307 = fcmp ule v11, v10 ; bin: 44 0f 2e d5 0f 96 c2
; asm: ucomiss %xmm10, %xmm5
[-,%rflags] v310 = ffcmp v10, v11 ; bin: 41 0f 2e ea
; asm: ucomiss %xmm10, %xmm5
[-,%rflags] v311 = ffcmp v11, v10 ; bin: 44 0f 2e d5
; asm: ucomiss %xmm5, %xmm5
[-,%rflags] v312 = ffcmp v10, v10 ; bin: 0f 2e ed
return
}
function %F64() {
ss0 = incoming_arg 8, offset 0
ss1 = incoming_arg 1024, offset -1024
ss2 = incoming_arg 1024, offset -2048
ss3 = incoming_arg 8, offset -2056
ebb0:
[-,%r11] v0 = iconst.i32 1
[-,%rsi] v1 = iconst.i32 2
[-,%rax] v2 = iconst.i64 11
[-,%r14] v3 = iconst.i64 12
[-,%r13] v4 = iconst.i64 13
; asm: cvtsi2sdl %r11d, %xmm5
[-,%xmm5] v10 = fcvt_from_sint.f64 v0 ; bin: f2 41 0f 2a eb
; asm: cvtsi2sdl %esi, %xmm10
[-,%xmm10] v11 = fcvt_from_sint.f64 v1 ; bin: f2 44 0f 2a d6
; asm: cvtsi2sdq %rax, %xmm5
[-,%xmm5] v12 = fcvt_from_sint.f64 v2 ; bin: f2 48 0f 2a e8
; asm: cvtsi2sdq %r14, %xmm10
[-,%xmm10] v13 = fcvt_from_sint.f64 v3 ; bin: f2 4d 0f 2a d6
; asm: cvtsd2ss %xmm10, %xmm5
[-,%xmm5] v14 = fdemote.f32 v11 ; bin: f2 41 0f 5a ea
; asm: cvtsd2ss %xmm5, %xmm10
[-,%xmm10] v15 = fdemote.f32 v10 ; bin: f2 44 0f 5a d5
; asm: movq %rax, %xmm5
[-,%xmm5] v16 = bitcast.f64 v2 ; bin: 66 48 0f 6e e8
; asm: movq %r14, %xmm10
[-,%xmm10] v17 = bitcast.f64 v3 ; bin: 66 4d 0f 6e d6
; asm: movq %xmm5, %rcx
[-,%rcx] v18 = bitcast.i64 v10 ; bin: 66 48 0f 7e e9
; asm: movq %xmm10, %rsi
[-,%rsi] v19 = bitcast.i64 v11 ; bin: 66 4c 0f 7e d6
; Binary arithmetic.
; asm: addsd %xmm10, %xmm5
[-,%xmm5] v20 = fadd v10, v11 ; bin: f2 41 0f 58 ea
; asm: addsd %xmm5, %xmm10
[-,%xmm10] v21 = fadd v11, v10 ; bin: f2 44 0f 58 d5
; asm: subsd %xmm10, %xmm5
[-,%xmm5] v22 = fsub v10, v11 ; bin: f2 41 0f 5c ea
; asm: subsd %xmm5, %xmm10
[-,%xmm10] v23 = fsub v11, v10 ; bin: f2 44 0f 5c d5
; asm: mulsd %xmm10, %xmm5
[-,%xmm5] v24 = fmul v10, v11 ; bin: f2 41 0f 59 ea
; asm: mulsd %xmm5, %xmm10
[-,%xmm10] v25 = fmul v11, v10 ; bin: f2 44 0f 59 d5
; asm: divsd %xmm10, %xmm5
[-,%xmm5] v26 = fdiv v10, v11 ; bin: f2 41 0f 5e ea
; asm: divsd %xmm5, %xmm10
[-,%xmm10] v27 = fdiv v11, v10 ; bin: f2 44 0f 5e d5
; Bitwise ops.
; We use the *ps SSE instructions for everything because they are smaller.
; asm: andps %xmm10, %xmm5
[-,%xmm5] v30 = band v10, v11 ; bin: 41 0f 54 ea
; asm: andps %xmm5, %xmm10
[-,%xmm10] v31 = band v11, v10 ; bin: 44 0f 54 d5
; asm: andnps %xmm10, %xmm5
[-,%xmm5] v32 = band_not v11, v10 ; bin: 41 0f 55 ea
; asm: andnps %xmm5, %xmm10
[-,%xmm10] v33 = band_not v10, v11 ; bin: 44 0f 55 d5
; asm: orps %xmm10, %xmm5
[-,%xmm5] v34 = bor v10, v11 ; bin: 41 0f 56 ea
; asm: orps %xmm5, %xmm10
[-,%xmm10] v35 = bor v11, v10 ; bin: 44 0f 56 d5
; asm: xorps %xmm10, %xmm5
[-,%xmm5] v36 = bxor v10, v11 ; bin: 41 0f 57 ea
; asm: xorps %xmm5, %xmm10
[-,%xmm10] v37 = bxor v11, v10 ; bin: 44 0f 57 d5
; asm: movaps %xmm10, %xmm5
[-,%xmm5] v38 = copy v11 ; bin: 41 0f 28 ea
; asm: movaps %xmm5, %xmm10
[-,%xmm10] v39 = copy v10 ; bin: 44 0f 28 d5
; Convert float to int.
; asm: cvttsd2si %xmm5, %ecx
[-,%rcx] v40 = x86_cvtt2si.i32 v10 ; bin: f2 0f 2c cd
; asm: cvttsd2si %xmm10, %esi
[-,%rsi] v41 = x86_cvtt2si.i32 v11 ; bin: f2 41 0f 2c f2
; asm: cvttsd2si %xmm5, %rcx
[-,%rcx] v42 = x86_cvtt2si.i64 v10 ; bin: f2 48 0f 2c cd
; asm: cvttsd2si %xmm10, %rsi
[-,%rsi] v43 = x86_cvtt2si.i64 v11 ; bin: f2 49 0f 2c f2
; Min/max.
; asm: minsd %xmm10, %xmm5
[-,%xmm5] v44 = x86_fmin v10, v11 ; bin: f2 41 0f 5d ea
; asm: minsd %xmm5, %xmm10
[-,%xmm10] v45 = x86_fmin v11, v10 ; bin: f2 44 0f 5d d5
; asm: maxsd %xmm10, %xmm5
[-,%xmm5] v46 = x86_fmax v10, v11 ; bin: f2 41 0f 5f ea
; asm: maxsd %xmm5, %xmm10
[-,%xmm10] v47 = x86_fmax v11, v10 ; bin: f2 44 0f 5f d5
; Unary arithmetic.
; asm: sqrtsd %xmm5, %xmm10
[-,%xmm10] v50 = sqrt v10 ; bin: f2 44 0f 51 d5
; asm: sqrtsd %xmm10, %xmm5
[-,%xmm5] v51 = sqrt v11 ; bin: f2 41 0f 51 ea
; asm: roundsd $0, %xmm5, %xmm10
[-,%xmm10] v52 = nearest v10 ; bin: 66 44 0f 3a 0b d5 00
; asm: roundsd $0, %xmm10, %xmm5
[-,%xmm5] v53 = nearest v11 ; bin: 66 41 0f 3a 0b ea 00
; asm: roundsd $0, %xmm5, %xmm2
[-,%xmm2] v54 = nearest v10 ; bin: 66 0f 3a 0b d5 00
; asm: roundsd $1, %xmm5, %xmm10
[-,%xmm10] v55 = floor v10 ; bin: 66 44 0f 3a 0b d5 01
; asm: roundsd $1, %xmm10, %xmm5
[-,%xmm5] v56 = floor v11 ; bin: 66 41 0f 3a 0b ea 01
; asm: roundsd $1, %xmm5, %xmm2
[-,%xmm2] v57 = floor v10 ; bin: 66 0f 3a 0b d5 01
; asm: roundsd $2, %xmm5, %xmm10
[-,%xmm10] v58 = ceil v10 ; bin: 66 44 0f 3a 0b d5 02
; asm: roundsd $2, %xmm10, %xmm5
[-,%xmm5] v59 = ceil v11 ; bin: 66 41 0f 3a 0b ea 02
; asm: roundsd $2, %xmm5, %xmm2
[-,%xmm2] v60 = ceil v10 ; bin: 66 0f 3a 0b d5 02
; asm: roundsd $3, %xmm5, %xmm10
[-,%xmm10] v61 = trunc v10 ; bin: 66 44 0f 3a 0b d5 03
; asm: roundsd $3, %xmm10, %xmm5
[-,%xmm5] v62 = trunc v11 ; bin: 66 41 0f 3a 0b ea 03
; asm: roundsd $3, %xmm5, %xmm2
[-,%xmm2] v63 = trunc v10 ; bin: 66 0f 3a 0b d5 03
; Load/Store
; asm: movsd (%r14), %xmm5
[-,%xmm5] v100 = load.f64 v3 ; bin: heap_oob f2 41 0f 10 2e
; asm: movsd (%rax), %xmm10
[-,%xmm10] v101 = load.f64 v2 ; bin: heap_oob f2 44 0f 10 10
; asm: movsd 50(%r14), %xmm5
[-,%xmm5] v110 = load.f64 v3+50 ; bin: heap_oob f2 41 0f 10 6e 32
; asm: movsd -50(%rax), %xmm10
[-,%xmm10] v111 = load.f64 v2-50 ; bin: heap_oob f2 44 0f 10 50 ce
; asm: movsd 10000(%r14), %xmm5
[-,%xmm5] v120 = load.f64 v3+10000 ; bin: heap_oob f2 41 0f 10 ae 00002710
; asm: movsd -10000(%rax), %xmm10
[-,%xmm10] v121 = load.f64 v2-10000 ; bin: heap_oob f2 44 0f 10 90 ffffd8f0
; asm: movsd %xmm5, (%r14)
[-] store.f64 v100, v3 ; bin: heap_oob f2 41 0f 11 2e
; asm: movsd %xmm10, (%rax)
[-] store.f64 v101, v2 ; bin: heap_oob f2 44 0f 11 10
; asm: movsd %xmm5, (%r13)
[-] store.f64 v100, v4 ; bin: heap_oob f2 41 0f 11 6d 00
; asm: movsd %xmm10, (%r13)
[-] store.f64 v101, v4 ; bin: heap_oob f2 45 0f 11 55 00
; asm: movsd %xmm5, 50(%r14)
[-] store.f64 v100, v3+50 ; bin: heap_oob f2 41 0f 11 6e 32
; asm: movsd %xmm10, -50(%rax)
[-] store.f64 v101, v2-50 ; bin: heap_oob f2 44 0f 11 50 ce
; asm: movsd %xmm5, 10000(%r14)
[-] store.f64 v100, v3+10000 ; bin: heap_oob f2 41 0f 11 ae 00002710
; asm: movsd %xmm10, -10000(%rax)
[-] store.f64 v101, v2-10000 ; bin: heap_oob f2 44 0f 11 90 ffffd8f0
; Spill / Fill.
; asm: movsd %xmm5, 1032(%rsp)
[-,ss1] v200 = spill v100 ; bin: f2 0f 11 ac 24 00000408
; asm: movsd %xmm10, 1032(%rsp)
[-,ss1] v201 = spill v101 ; bin: f2 44 0f 11 94 24 00000408
; asm: movsd 1032(%rsp), %xmm5
[-,%xmm5] v210 = fill v200 ; bin: f2 0f 10 ac 24 00000408
; asm: movsd 1032(%rsp), %xmm10
[-,%xmm10] v211 = fill v201 ; bin: f2 44 0f 10 94 24 00000408
; asm: movsd %xmm5, 1032(%rsp)
regspill v100, %xmm5 -> ss1 ; bin: f2 0f 11 ac 24 00000408
; asm: movsd 1032(%rsp), %xmm5
regfill v100, ss1 -> %xmm5 ; bin: f2 0f 10 ac 24 00000408
; Comparisons.
;
; Only `supported_floatccs` are tested here. Others are handled by
; legalization paterns.
; asm: ucomisd %xmm10, %xmm5
; asm: setnp %bl
[-,%rbx] v300 = fcmp ord v10, v11 ; bin: 66 41 0f 2e ea 0f 9b c3
; asm: ucomisd %xmm5, %xmm10
; asm: setp %bl
[-,%rbx] v301 = fcmp uno v11, v10 ; bin: 66 44 0f 2e d5 0f 9a c3
; asm: ucomisd %xmm10, %xmm5
; asm: setne %dl
[-,%rdx] v302 = fcmp one v10, v11 ; bin: 66 41 0f 2e ea 0f 95 c2
; asm: ucomisd %xmm5, %xmm10
; asm: sete %dl
[-,%rdx] v303 = fcmp ueq v11, v10 ; bin: 66 44 0f 2e d5 0f 94 c2
; asm: ucomisd %xmm10, %xmm5
; asm: seta %bl
[-,%rbx] v304 = fcmp gt v10, v11 ; bin: 66 41 0f 2e ea 0f 97 c3
; asm: ucomisd %xmm5, %xmm10
; asm: setae %bl
[-,%rbx] v305 = fcmp ge v11, v10 ; bin: 66 44 0f 2e d5 0f 93 c3
; asm: ucomisd %xmm10, %xmm5
; asm: setb %dl
[-,%rdx] v306 = fcmp ult v10, v11 ; bin: 66 41 0f 2e ea 0f 92 c2
; asm: ucomisd %xmm5, %xmm10
; asm: setbe %dl
[-,%rdx] v307 = fcmp ule v11, v10 ; bin: 66 44 0f 2e d5 0f 96 c2
; asm: ucomisd %xmm10, %xmm5
[-,%rflags] v310 = ffcmp v10, v11 ; bin: 66 41 0f 2e ea
; asm: ucomisd %xmm10, %xmm5
[-,%rflags] v311 = ffcmp v11, v10 ; bin: 66 44 0f 2e d5
; asm: ucomisd %xmm5, %xmm5
[-,%rflags] v312 = ffcmp v10, v10 ; bin: 66 0f 2e ed
return
}
function %cpuflags_float(f32 [%xmm0]) {
ebb0(v0: f32 [%xmm0]):
; asm: ucomiss %xmm0, %xmm0
[-,%rflags] v1 = ffcmp v0, v0 ; bin: 0f 2e c0
jump ebb1
ebb1:
; asm: jnp ebb1
brff ord v1, ebb1 ; bin: 7b fe
; asm: jp ebb1
brff uno v1, ebb1 ; bin: 7a fc
; asm: jne ebb1
brff one v1, ebb1 ; bin: 75 fa
; asm: je ebb1
brff ueq v1, ebb1 ; bin: 74 f8
; asm: ja ebb1
brff gt v1, ebb1 ; bin: 77 f6
; asm: jae ebb1
brff ge v1, ebb1 ; bin: 73 f4
; asm: jb ebb1
brff ult v1, ebb1 ; bin: 72 f2
; asm: jbe ebb1
brff ule v1, ebb1 ; bin: 76 f0
; asm: jp .+4; ud2
trapff ord v1, user0 ; bin: 7a 02 user0 0f 0b
; asm: jnp .+4; ud2
trapff uno v1, user0 ; bin: 7b 02 user0 0f 0b
; asm: je .+4; ud2
trapff one v1, user0 ; bin: 74 02 user0 0f 0b
; asm: jne .+4; ud2
trapff ueq v1, user0 ; bin: 75 02 user0 0f 0b
; asm: jna .+4; ud2
trapff gt v1, user0 ; bin: 76 02 user0 0f 0b
; asm: jnae .+4; ud2
trapff ge v1, user0 ; bin: 72 02 user0 0f 0b
; asm: jnb .+4; ud2
trapff ult v1, user0 ; bin: 73 02 user0 0f 0b
; asm: jnbe .+4; ud2
trapff ule v1, user0 ; bin: 77 02 user0 0f 0b
; asm: setnp %bl
[-,%rbx] v10 = trueff ord v1 ; bin: 0f 9b c3
; asm: setp %bl
[-,%rbx] v11 = trueff uno v1 ; bin: 0f 9a c3
; asm: setne %dl
[-,%rdx] v12 = trueff one v1 ; bin: 0f 95 c2
; asm: sete %dl
[-,%rdx] v13 = trueff ueq v1 ; bin: 0f 94 c2
; asm: seta %r10b
[-,%r10] v14 = trueff gt v1 ; bin: 41 0f 97 c2
; asm: setae %r10b
[-,%r10] v15 = trueff ge v1 ; bin: 41 0f 93 c2
; asm: setb %r14b
[-,%r14] v16 = trueff ult v1 ; bin: 41 0f 92 c6
; asm: setbe %r14b
[-,%r14] v17 = trueff ule v1 ; bin: 41 0f 96 c6
return
}

View File

@@ -0,0 +1,54 @@
; binary emission of 64-bit code.
test binemit
set is_64bit
set is_compressed
set is_pic
isa x86 haswell
; The binary encodings can be verified with the command:
;
; sed -ne 's/^ *; asm: *//p' filetests/isa/x86/binary64-pic.cton | llvm-mc -show-encoding -triple=x86_64
;
; Tests for i64 instructions.
function %I64() {
sig0 = ()
fn0 = function %foo()
gv0 = globalsym %some_gv
; Use incoming_arg stack slots because they won't be relocated by the frame
; layout.
ss0 = incoming_arg 8, offset 0
ss1 = incoming_arg 1024, offset -1024
ss2 = incoming_arg 1024, offset -2048
ss3 = incoming_arg 8, offset -2056
ebb0:
; asm: call foo@PLT
call fn0() ; bin: e8 PLTRel4(%foo-4) 00000000
; asm: mov 0x0(%rip), %rax
[-,%rax] v0 = func_addr.i64 fn0 ; bin: 48 8b 05 GOTPCRel4(%foo-4) 00000000
; asm: mov 0x0(%rip), %rsi
[-,%rsi] v1 = func_addr.i64 fn0 ; bin: 48 8b 35 GOTPCRel4(%foo-4) 00000000
; asm: mov 0x0(%rip), %r10
[-,%r10] v2 = func_addr.i64 fn0 ; bin: 4c 8b 15 GOTPCRel4(%foo-4) 00000000
; asm: call *%rax
call_indirect sig0, v0() ; bin: ff d0
; asm: call *%rsi
call_indirect sig0, v1() ; bin: ff d6
; asm: call *%r10
call_indirect sig0, v2() ; bin: 41 ff d2
; asm: mov 0x0(%rip), %rcx
[-,%rcx] v3 = globalsym_addr.i64 gv0 ; bin: 48 8b 0d GOTPCRel4(%some_gv-4) 00000000
; asm: mov 0x0(%rip), %rsi
[-,%rsi] v4 = globalsym_addr.i64 gv0 ; bin: 48 8b 35 GOTPCRel4(%some_gv-4) 00000000
; asm: mov 0x0(%rip), %r10
[-,%r10] v5 = globalsym_addr.i64 gv0 ; bin: 4c 8b 15 GOTPCRel4(%some_gv-4) 00000000
return
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,97 @@
; Test the custom legalizations.
test legalizer
isa x86
set is_64bit
isa x86
; regex: V=v\d+
; regex: EBB=ebb\d+
function %cond_trap(i32) {
ebb0(v1: i32):
trapz v1, user67
return
; check: ebb0(v1: i32
; nextln: $(f=$V) = ifcmp_imm v1, 0
; nextln: trapif eq $f, user67
; nextln: return
}
function %cond_trap2(i32) {
ebb0(v1: i32):
trapnz v1, int_ovf
return
; check: ebb0(v1: i32
; nextln: $(f=$V) = ifcmp_imm v1, 0
; nextln: trapif ne $f, int_ovf
; nextln: return
}
function %cond_trap_b1(i32) {
ebb0(v1: i32):
v2 = icmp_imm eq v1, 6
trapz v2, user7
return
; check: ebb0(v1: i32
; check: brnz v2, $(new=$EBB)
; nextln: trap user7
; check: $new:
; nextln: return
}
function %cond_trap2_b1(i32) {
ebb0(v1: i32):
v2 = icmp_imm eq v1, 6
trapnz v2, user9
return
; check: ebb0(v1: i32
; check: brz v2, $(new=$EBB)
; nextln: trap user9
; check: $new:
; nextln: return
}
function %f32const() -> f32 {
ebb0:
v1 = f32const 0x1.0p1
; check: $(tmp=$V) = iconst.i32
; check: v1 = bitcast.f32 $tmp
return v1
}
function %f64const() -> f64 {
ebb0:
v1 = f64const 0x1.0p1
; check: $(tmp=$V) = iconst.i64
; check: v1 = bitcast.f64 $tmp
return v1
}
function %select_f64(f64, f64, i32) -> f64 {
ebb0(v0: f64, v1: f64, v2: i32):
v3 = select v2, v0, v1
; check: brnz v2, $(new=$EBB)(v0)
; nextln: jump $new(v1)
; check: $new(v3: f64):
; nextln: return v3
return v3
}
function %f32_min(f32, f32) -> f32 {
ebb0(v0: f32, v1: f32):
v2 = fmin v0, v1
return v2
; check: $(vnat=$V) = x86_fmin v0, v1
; nextln: jump $(done=$EBB)($vnat)
; check: $(uno=$EBB):
; nextln: $(vuno=$V) = fadd.f32 v0, v1
; nextln: jump $(done=$EBB)($vuno)
; check: $(ueq=$EBB):
; check: $(veq=$V) = bor.f32 v0, v1
; nextln: jump $(done=$EBB)($veq)
; check: $done(v2: f32):
; nextln: return v2
}

View File

@@ -0,0 +1,72 @@
; Test the division legalizations.
test legalizer
set is_64bit
; See also legalize-div.cton.
set avoid_div_traps=1
isa x86
; regex: V=v\d+
; regex: EBB=ebb\d+
function %udiv(i64, i64) -> i64 {
ebb0(v0: i64, v1: i64):
; check: ebb0(
v2 = udiv v0, v1
; nextln: $(fz=$V) = ifcmp_imm v1, 0
; nextln: trapif eq $fz, int_divz
; nextln: $(hi=$V) = iconst.i64 0
; nextln: $(d=$V), $(r=$V) = x86_udivmodx v0, $hi, v1
return v2
; nextln: return $d
}
function %urem(i64, i64) -> i64 {
ebb0(v0: i64, v1: i64):
; check: ebb0(
v2 = urem v0, v1
; nextln: $(fz=$V) = ifcmp_imm v1, 0
; nextln: trapif eq $fz, int_divz
; nextln: $(hi=$V) = iconst.i64 0
; nextln: $(d=$V), $(r=$V) = x86_udivmodx v0, $hi, v1
return v2
; nextln: return $r
}
function %sdiv(i64, i64) -> i64 {
ebb0(v0: i64, v1: i64):
; check: ebb0(
v2 = sdiv v0, v1
; nextln: $(fm1=$V) = ifcmp_imm v1, -1
; nextln: brif eq $fm1, $(m1=$EBB)
; nextln: $(fz=$V) = ifcmp_imm v1, 0
; nextln: trapif eq $fz, int_divz
; check: $(hi=$V) = sshr_imm
; nextln: $(q=$V), $(r=$V) = x86_sdivmodx v0, $hi, v1
; nextln: jump $(done=$EBB)($q)
; check: $m1:
; nextln: $(imin=$V) = iconst.i64 0x8000_0000_0000_0000
; nextln: $(fm=$V) = ifcmp.i64 v0, $imin
; nextln: trapif eq $fm, int_ovf
; check: $done(v2: i64):
return v2
; nextln: return v2
}
; The srem expansion needs to special-case x % -1 since x86_sdivmodx traps on INT_MIN/-1.
; TODO: Add more explicit pattern matching once we've cleaned up the ifcmp+brif pattern.
function %srem(i64, i64) -> i64 {
ebb0(v0: i64, v1: i64):
; check: ebb0(
v2 = srem v0, v1
; nextln: $(fm1=$V) = ifcmp_imm v1, -1
; nextln: brif eq $fm1, $(m1=$EBB)
; check: $(hi=$V) = sshr_imm
; nextln: $(d=$V), $(r=$V) = x86_sdivmodx v0, $hi, v1
; nextln: jump $(done=$EBB)($r)
; check: $m1:
; nextln: $(zero=$V) = iconst.i64 0
; nextln: jump $(done=$EBB)($zero)
; check: $done(v2: i64):
return v2
; nextln: return v2
}

View File

@@ -0,0 +1,58 @@
; Test the division legalizations.
test legalizer
set is_64bit
; See also legalize-div-traps.cton.
set avoid_div_traps=0
isa x86
; regex: V=v\d+
; regex: EBB=ebb\d+
function %udiv(i64, i64) -> i64 {
ebb0(v0: i64, v1: i64):
; check: ebb0(
v2 = udiv v0, v1
; nextln: $(hi=$V) = iconst.i64 0
; nextln: $(d=$V), $(r=$V) = x86_udivmodx v0, $hi, v1
return v2
; nextln: return $d
}
function %urem(i64, i64) -> i64 {
ebb0(v0: i64, v1: i64):
; check: ebb0(
v2 = urem v0, v1
; nextln: $(hi=$V) = iconst.i64 0
; nextln: $(d=$V), $(r=$V) = x86_udivmodx v0, $hi, v1
return v2
; nextln: return $r
}
function %sdiv(i64, i64) -> i64 {
ebb0(v0: i64, v1: i64):
; check: ebb0(
v2 = sdiv v0, v1
; check: $(hi=$V) = sshr_imm
; nextln: $(d=$V), $(r=$V) = x86_sdivmodx v0, $hi, v1
return v2
; nextln: return $d
}
; The srem expansion needs to special-case x % -1 since x86_sdivmodx traps on INT_MIN/-1.
; TODO: Add more explicit pattern matching once we've cleaned up the ifcmp+brif pattern.
function %srem(i64, i64) -> i64 {
ebb0(v0: i64, v1: i64):
; check: ebb0(
v2 = srem v0, v1
; nextln: $(fm1=$V) = ifcmp_imm v1, -1
; nextln: brif eq $fm1, $(m1=$EBB)
; check: $(hi=$V) = sshr_imm
; nextln: $(d=$V), $(r=$V) = x86_sdivmodx v0, $hi, v1
; nextln: jump $(done=$EBB)($r)
; check: $m1:
; nextln: $(zero=$V) = iconst.i64 0
; nextln: jump $(done=$EBB)($zero)
; check: $done(v2: i64):
return v2
; nextln: return v2
}

View File

@@ -0,0 +1,16 @@
test legalizer
; Pre-SSE 4.1, we need to use runtime library calls for floating point rounding operations.
set is_64bit
set is_pic
isa x86
function %floor(f32) -> f32 {
ebb0(v0: f32):
v1 = floor v0
return v1
}
; check: function %floor(f32 [%xmm0]) -> f32 [%xmm0] system_v {
; check: sig0 = (f32) -> f32 system_v
; check: fn0 = sig0 %FloorF32
; check: v1 = call fn0(v0)

View File

@@ -0,0 +1,125 @@
; Test the legalization of memory objects.
test legalizer
set is_64bit
isa x86
; regex: V=v\d+
; regex: EBB=ebb\d+
function %vmctx(i64 vmctx) -> i64 {
gv1 = vmctx-16
ebb1(v1: i64):
v2 = global_addr.i64 gv1
; check: v2 = iadd_imm v1, -16
return v2
; check: return v2
}
function %deref(i64 vmctx) -> i64 {
gv1 = vmctx-16
gv2 = deref(gv1)+32
ebb1(v1: i64):
v2 = global_addr.i64 gv2
; check: $(a1=$V) = iadd_imm v1, -16
; check: $(p1=$V) = load.i64 notrap aligned $a1
; check: v2 = iadd_imm $p1, 32
return v2
; check: return v2
}
function %sym() -> i64 {
gv0 = globalsym %something
gv1 = globalsym u123:456
ebb1:
v0 = global_addr.i64 gv0
; check: v0 = globalsym_addr.i64 gv0
v1 = global_addr.i64 gv1
; check: v1 = globalsym_addr.i64 gv1
v2 = bxor v0, v1
return v2
}
; SpiderMonkey VM-style static 4+2 GB heap.
; This eliminates bounds checks completely for offsets < 2GB.
function %staticheap_sm64(i32, i64 vmctx) -> f32 spiderwasm {
gv0 = vmctx+64
heap0 = static gv0, min 0x1000, bound 0x1_0000_0000, guard 0x8000_0000
ebb0(v0: i32, v999: i64):
; check: ebb0(
v1 = heap_addr.i64 heap0, v0, 1
; Boundscheck should be eliminated.
; Checks here are assuming that no pipehole opts fold the load offsets.
; nextln: $(xoff=$V) = uextend.i64 v0
; nextln: $(haddr=$V) = iadd_imm v999, 64
; nextln: $(hbase=$V) = load.i64 notrap aligned $haddr
; nextln: v1 = iadd $hbase, $xoff
v2 = load.f32 v1+16
; nextln: v2 = load.f32 v1+16
v3 = load.f32 v1+20
; nextln: v3 = load.f32 v1+20
v4 = fadd v2, v3
return v4
}
function %staticheap_static_oob_sm64(i32, i64 vmctx) -> f32 spiderwasm {
gv0 = vmctx+64
heap0 = static gv0, min 0x1000, bound 0x1000_0000, guard 0x8000_0000
ebb0(v0: i32, v999: i64):
; Everything after the obviously OOB access should be eliminated, leaving
; the `trap heap_oob` instruction as the terminator of the Ebb and moving
; the remainder of the instructions into an inaccessible Ebb.
; check: ebb0(
; nextln: trap heap_oob
; check: ebb1:
; nextln: v1 = iconst.i64 0
; nextln: v2 = load.f32 v1+16
; nextln: return v2
; nextln: }
v1 = heap_addr.i64 heap0, v0, 0x1000_0001
v2 = load.f32 v1+16
return v2
}
; SpiderMonkey VM-style static 4+2 GB heap.
; Offsets >= 2 GB do require a boundscheck.
function %staticheap_sm64(i32, i64 vmctx) -> f32 spiderwasm {
gv0 = vmctx+64
heap0 = static gv0, min 0x1000, bound 0x1_0000_0000, guard 0x8000_0000
ebb0(v0: i32, v999: i64):
; check: ebb0(
v1 = heap_addr.i64 heap0, v0, 0x8000_0000
; Boundscheck code
; check: $(oob=$V) = icmp
; nextln: brz $oob, $(ok=$EBB)
; nextln: trap heap_oob
; check: $ok:
; Checks here are assuming that no pipehole opts fold the load offsets.
; nextln: $(xoff=$V) = uextend.i64 v0
; nextln: $(haddr=$V) = iadd_imm.i64 v999, 64
; nextln: $(hbase=$V) = load.i64 notrap aligned $haddr
; nextln: v1 = iadd $hbase, $xoff
v2 = load.f32 v1+0x7fff_ffff
; nextln: v2 = load.f32 v1+0x7fff_ffff
return v2
}
; Stack overflow check.
; The stack limit is stored in a pointer-sized global variable.
function %stkchk(i64 vmctx) spiderwasm {
gv0 = vmctx+64
ebb0(v0: i64):
; check: ebb0(
stack_check gv0
; check: $(limit=$V) = load.i64 notrap aligned
; check: $(flags=$V) = ifcmp_sp $limit
; check: trapif uge $flags, stk_ovf
return
}

View File

@@ -0,0 +1,45 @@
test compile
set is_64bit
isa x86 baseline
; umulhi/smulhi on 64 bit operands
function %i64_umulhi(i64, i64) -> i64 {
ebb0(v10: i64, v11: i64):
v12 = umulhi v10, v11
; check: %rdi -> %rax
; check: x86_umulx
; check: %rdx -> %rax
return v12
}
function %i64_smulhi(i64, i64) -> i64 {
ebb0(v20: i64, v21: i64):
v22 = smulhi v20, v21
; check: %rdi -> %rax
; check: x86_smulx
; check: %rdx -> %rax
return v22
}
; umulhi/smulhi on 32 bit operands
function %i32_umulhi(i32, i32) -> i32 {
ebb0(v30: i32, v31: i32):
v32 = umulhi v30, v31
; check: %rdi -> %rax
; check: x86_umulx
; check: %rdx -> %rax
return v32
}
function %i32_smulhi(i32, i32) -> i32 {
ebb0(v40: i32, v41: i32):
v42 = smulhi v40, v41
; check: %rdi -> %rax
; check: x86_smulx
; check: %rdx -> %rax
return v42
}

View File

@@ -0,0 +1,231 @@
test compile
set is_64bit
set is_compressed
set is_pic
isa x86 haswell
; An empty function.
function %empty() {
ebb0:
return
}
; Even an empty function gets a full prologue/epilogue: push the old
; frame pointer, set up %rbp, then pop it back before returning.
; check: function %empty(i64 fp [%rbp]) -> i64 fp [%rbp] system_v {
; nextln: ss0 = incoming_arg 16, offset -16
; nextln:
; nextln: ebb0(v0: i64 [%rbp]):
; nextln: x86_push v0
; nextln: copy_special %rsp -> %rbp
; nextln: v1 = x86_pop.i64
; nextln: return v1
; nextln: }
; A function with a single stack slot.
function %one_stack_slot() {
ss0 = explicit_slot 168
ebb0:
return
}
; The 168-byte slot is rounded up so the stack stays 16-byte aligned:
; the prologue adjusts %rsp by -176 and the epilogue undoes it.
; check: function %one_stack_slot(i64 fp [%rbp]) -> i64 fp [%rbp] system_v {
; nextln: ss0 = explicit_slot 168, offset -184
; nextln: ss1 = incoming_arg 16, offset -16
; nextln:
; nextln: ebb0(v0: i64 [%rbp]):
; nextln: x86_push v0
; nextln: copy_special %rsp -> %rbp
; nextln: adjust_sp_imm -176
; nextln: adjust_sp_imm 176
; nextln: v1 = x86_pop.i64
; nextln: return v1
; nextln: }
; A function performing a call.
function %call() {
fn0 = function %foo()
ebb0:
call fn0()
return
}
; A function that makes a call still only needs the basic frame-pointer
; prologue/epilogue; no extra stack adjustment is emitted here.
; check: function %call(i64 fp [%rbp]) -> i64 fp [%rbp] system_v {
; nextln: ss0 = incoming_arg 16, offset -16
; nextln: sig0 = () system_v
; nextln: fn0 = sig0 %foo
; nextln:
; nextln: ebb0(v0: i64 [%rbp]):
; nextln: x86_push v0
; nextln: copy_special %rsp -> %rbp
; nextln: call fn0()
; nextln: v1 = x86_pop.i64
; nextln: return v1
; nextln: }
; A function that uses a lot of registers but doesn't quite need to spill.
function %no_spill(i64, i64) {
ebb0(v0: i64, v1: i64):
; 13 loads all live at once: enough pressure to use every callee-saved
; register, but not enough to force a spill to the stack.
v2 = load.i32 v0+0
v3 = load.i32 v0+8
v4 = load.i32 v0+16
v5 = load.i32 v0+24
v6 = load.i32 v0+32
v7 = load.i32 v0+40
v8 = load.i32 v0+48
v9 = load.i32 v0+56
v10 = load.i32 v0+64
v11 = load.i32 v0+72
v12 = load.i32 v0+80
v13 = load.i32 v0+88
v14 = load.i32 v0+96
store.i32 v2, v1+0
store.i32 v3, v1+8
store.i32 v4, v1+16
store.i32 v5, v1+24
store.i32 v6, v1+32
store.i32 v7, v1+40
store.i32 v8, v1+48
store.i32 v9, v1+56
store.i32 v10, v1+64
store.i32 v11, v1+72
store.i32 v12, v1+80
store.i32 v13, v1+88
store.i32 v14, v1+96
return
}
; All five callee-saved registers (%rbx, %r12-%r15) are pushed in the
; prologue and popped in reverse order in the epilogue; no spill slots.
; check: function %no_spill(i64 [%rdi], i64 [%rsi], i64 fp [%rbp], i64 csr [%rbx], i64 csr [%r12], i64 csr [%r13], i64 csr [%r14], i64 csr [%r15]) -> i64 fp [%rbp], i64 csr [%rbx], i64 csr [%r12], i64 csr [%r13], i64 csr [%r14], i64 csr [%r15] system_v {
; nextln: ss0 = incoming_arg 56, offset -56
; nextln:
; nextln: ebb0(v0: i64 [%rdi], v1: i64 [%rsi], v15: i64 [%rbp], v16: i64 [%rbx], v17: i64 [%r12], v18: i64 [%r13], v19: i64 [%r14], v20: i64 [%r15]):
; nextln: x86_push v15
; nextln: copy_special %rsp -> %rbp
; nextln: x86_push v16
; nextln: x86_push v17
; nextln: x86_push v18
; nextln: x86_push v19
; nextln: x86_push v20
; nextln: adjust_sp_imm -8
; nextln: v2 = load.i32 v0
; nextln: v3 = load.i32 v0+8
; nextln: v4 = load.i32 v0+16
; nextln: v5 = load.i32 v0+24
; nextln: v6 = load.i32 v0+32
; nextln: v7 = load.i32 v0+40
; nextln: v8 = load.i32 v0+48
; nextln: v9 = load.i32 v0+56
; nextln: v10 = load.i32 v0+64
; nextln: v11 = load.i32 v0+72
; nextln: v12 = load.i32 v0+80
; nextln: v13 = load.i32 v0+88
; nextln: v14 = load.i32 v0+96
; nextln: store v2, v1
; nextln: store v3, v1+8
; nextln: store v4, v1+16
; nextln: store v5, v1+24
; nextln: store v6, v1+32
; nextln: store v7, v1+40
; nextln: store v8, v1+48
; nextln: store v9, v1+56
; nextln: store v10, v1+64
; nextln: store v11, v1+72
; nextln: store v12, v1+80
; nextln: store v13, v1+88
; nextln: store v14, v1+96
; nextln: adjust_sp_imm 8
; nextln: v26 = x86_pop.i64
; nextln: v25 = x86_pop.i64
; nextln: v24 = x86_pop.i64
; nextln: v23 = x86_pop.i64
; nextln: v22 = x86_pop.i64
; nextln: v21 = x86_pop.i64
; nextln: return v21, v22, v23, v24, v25, v26
; nextln: }
; This function requires too many registers and must spill.
function %yes_spill(i64, i64) {
ebb0(v0: i64, v1: i64):
; One more simultaneously-live value than %no_spill above: 14 loads
; exceed the available registers, forcing at least one spill/fill pair.
v2 = load.i32 v0+0
v3 = load.i32 v0+8
v4 = load.i32 v0+16
v5 = load.i32 v0+24
v6 = load.i32 v0+32
v7 = load.i32 v0+40
v8 = load.i32 v0+48
v9 = load.i32 v0+56
v10 = load.i32 v0+64
v11 = load.i32 v0+72
v12 = load.i32 v0+80
v13 = load.i32 v0+88
v14 = load.i32 v0+96
v15 = load.i32 v0+104
store.i32 v2, v1+0
store.i32 v3, v1+8
store.i32 v4, v1+16
store.i32 v5, v1+24
store.i32 v6, v1+32
store.i32 v7, v1+40
store.i32 v8, v1+48
store.i32 v9, v1+56
store.i32 v10, v1+64
store.i32 v11, v1+72
store.i32 v12, v1+80
store.i32 v13, v1+88
store.i32 v14, v1+96
store.i32 v15, v1+104
return
}
; Besides the callee-saved pushes/pops, the output must now contain a
; spill_slot plus matching spill and fill instructions.
; check: function %yes_spill(i64 [%rdi], i64 [%rsi], i64 fp [%rbp], i64 csr [%rbx], i64 csr [%r12], i64 csr [%r13], i64 csr [%r14], i64 csr [%r15]) -> i64 fp [%rbp], i64 csr [%rbx], i64 csr [%r12], i64 csr [%r13], i64 csr [%r14], i64 csr [%r15] system_v {
; check: ss0 = spill_slot
; check: ebb0(v16: i64 [%rdi], v17: i64 [%rsi], v48: i64 [%rbp], v49: i64 [%rbx], v50: i64 [%r12], v51: i64 [%r13], v52: i64 [%r14], v53: i64 [%r15]):
; nextln: x86_push v48
; nextln: copy_special %rsp -> %rbp
; nextln: x86_push v49
; nextln: x86_push v50
; nextln: x86_push v51
; nextln: x86_push v52
; nextln: x86_push v53
; nextln: adjust_sp_imm
; check: spill
; check: fill
; check: adjust_sp_imm
; nextln: v59 = x86_pop.i64
; nextln: v58 = x86_pop.i64
; nextln: v57 = x86_pop.i64
; nextln: v56 = x86_pop.i64
; nextln: v55 = x86_pop.i64
; nextln: v54 = x86_pop.i64
; nextln: return v54, v55, v56, v57, v58, v59
; nextln: }
; A function which uses diverted registers.
function %divert(i32) -> i32 system_v {
ebb0(v0: i32):
v2 = iconst.i32 0
v3 = iconst.i32 1
jump ebb3(v0, v3, v2)
; Counted loop: v4 counts down from v0 while v5/v6 accumulate a
; running sum; the loop-carried values are what force the register
; allocator to divert registers (see checks following this function).
ebb3(v4: i32, v5: i32, v6: i32):
brz v4, ebb4
v7 = iadd v5, v6
v8 = iadd_imm v4, -1
jump ebb3(v8, v7, v5)
ebb4:
return v5
}
; check: function %divert
; check: regmove v5, %rcx -> %rbx
; check: [RexOp1popq#58,%rbx] v15 = x86_pop.i64