Add Intel encodings for floating point load/store instructions.
Include wasm/*-memory64.cton tests too.
This commit is contained in:
@@ -77,6 +77,34 @@ ebb0:
|
||||
; asm: xorps %xmm5, %xmm2
|
||||
[-,%xmm2] v37 = bxor v11, v10 ; bin: 0f 57 d5
|
||||
|
||||
; Load/Store
|
||||
|
||||
; asm: movd (%ecx), %xmm5
|
||||
[-,%xmm5] v100 = load.f32 v0 ; bin: 66 0f 6e 29
|
||||
; asm: movd (%esi), %xmm2
|
||||
[-,%xmm2] v101 = load.f32 v1 ; bin: 66 0f 6e 16
|
||||
; asm: movd 50(%ecx), %xmm5
|
||||
[-,%xmm5] v110 = load.f32 v0+50 ; bin: 66 0f 6e 69 32
|
||||
; asm: movd -50(%esi), %xmm2
|
||||
[-,%xmm2] v111 = load.f32 v1-50 ; bin: 66 0f 6e 56 ce
|
||||
; asm: movd 10000(%ecx), %xmm5
|
||||
[-,%xmm5] v120 = load.f32 v0+10000 ; bin: 66 0f 6e a9 00002710
|
||||
; asm: movd -10000(%esi), %xmm2
|
||||
[-,%xmm2] v121 = load.f32 v1-10000 ; bin: 66 0f 6e 96 ffffd8f0
|
||||
|
||||
; asm: movd %xmm5, (%ecx)
|
||||
[-] store.f32 v100, v0 ; bin: 66 0f 7e 29
|
||||
; asm: movd %xmm2, (%esi)
|
||||
[-] store.f32 v101, v1 ; bin: 66 0f 7e 16
|
||||
; asm: movd %xmm5, 50(%ecx)
|
||||
[-] store.f32 v100, v0+50 ; bin: 66 0f 7e 69 32
|
||||
; asm: movd %xmm2, -50(%esi)
|
||||
[-] store.f32 v101, v1-50 ; bin: 66 0f 7e 56 ce
|
||||
; asm: movd %xmm5, 10000(%ecx)
|
||||
[-] store.f32 v100, v0+10000 ; bin: 66 0f 7e a9 00002710
|
||||
; asm: movd %xmm2, -10000(%esi)
|
||||
[-] store.f32 v101, v1-10000 ; bin: 66 0f 7e 96 ffffd8f0
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
@@ -142,5 +170,33 @@ ebb0:
|
||||
; asm: xorps %xmm5, %xmm2
|
||||
[-,%xmm2] v37 = bxor v11, v10 ; bin: 0f 57 d5
|
||||
|
||||
; Load/Store
|
||||
|
||||
; asm: movq (%ecx), %xmm5
|
||||
[-,%xmm5] v100 = load.f64 v0 ; bin: f3 0f 7e 29
|
||||
; asm: movq (%esi), %xmm2
|
||||
[-,%xmm2] v101 = load.f64 v1 ; bin: f3 0f 7e 16
|
||||
; asm: movq 50(%ecx), %xmm5
|
||||
[-,%xmm5] v110 = load.f64 v0+50 ; bin: f3 0f 7e 69 32
|
||||
; asm: movq -50(%esi), %xmm2
|
||||
[-,%xmm2] v111 = load.f64 v1-50 ; bin: f3 0f 7e 56 ce
|
||||
; asm: movq 10000(%ecx), %xmm5
|
||||
[-,%xmm5] v120 = load.f64 v0+10000 ; bin: f3 0f 7e a9 00002710
|
||||
; asm: movq -10000(%esi), %xmm2
|
||||
[-,%xmm2] v121 = load.f64 v1-10000 ; bin: f3 0f 7e 96 ffffd8f0
|
||||
|
||||
; asm: movq %xmm5, (%ecx)
|
||||
[-] store.f64 v100, v0 ; bin: 66 0f d6 29
|
||||
; asm: movq %xmm2, (%esi)
|
||||
[-] store.f64 v101, v1 ; bin: 66 0f d6 16
|
||||
; asm: movq %xmm5, 50(%ecx)
|
||||
[-] store.f64 v100, v0+50 ; bin: 66 0f d6 69 32
|
||||
; asm: movq %xmm2, -50(%esi)
|
||||
[-] store.f64 v101, v1-50 ; bin: 66 0f d6 56 ce
|
||||
; asm: movq %xmm5, 10000(%ecx)
|
||||
[-] store.f64 v100, v0+10000 ; bin: 66 0f d6 a9 00002710
|
||||
; asm: movq %xmm2, -10000(%esi)
|
||||
[-] store.f64 v101, v1-10000 ; bin: 66 0f d6 96 ffffd8f0
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
@@ -85,6 +85,34 @@ ebb0:
|
||||
; asm: xorps %xmm5, %xmm10
|
||||
[-,%xmm10] v37 = bxor v11, v10 ; bin: 44 0f 57 d5
|
||||
|
||||
; Load/Store
|
||||
|
||||
; asm: movd (%r14), %xmm5
|
||||
[-,%xmm5] v100 = load.f32 v3 ; bin: 66 41 0f 6e 2e
|
||||
; asm: movd (%rax), %xmm10
|
||||
[-,%xmm10] v101 = load.f32 v2 ; bin: 66 44 0f 6e 10
|
||||
; asm: movd 50(%r14), %xmm5
|
||||
[-,%xmm5] v110 = load.f32 v3+50 ; bin: 66 41 0f 6e 6e 32
|
||||
; asm: movd -50(%rax), %xmm10
|
||||
[-,%xmm10] v111 = load.f32 v2-50 ; bin: 66 44 0f 6e 50 ce
|
||||
; asm: movd 10000(%r14), %xmm5
|
||||
[-,%xmm5] v120 = load.f32 v3+10000 ; bin: 66 41 0f 6e ae 00002710
|
||||
; asm: movd -10000(%rax), %xmm10
|
||||
[-,%xmm10] v121 = load.f32 v2-10000 ; bin: 66 44 0f 6e 90 ffffd8f0
|
||||
|
||||
; asm: movd %xmm5, (%r14)
|
||||
[-] store.f32 v100, v3 ; bin: 66 41 0f 7e 2e
|
||||
; asm: movd %xmm10, (%rax)
|
||||
[-] store.f32 v101, v2 ; bin: 66 44 0f 7e 10
|
||||
; asm: movd %xmm5, 50(%r14)
|
||||
[-] store.f32 v100, v3+50 ; bin: 66 41 0f 7e 6e 32
|
||||
; asm: movd %xmm10, -50(%rax)
|
||||
[-] store.f32 v101, v2-50 ; bin: 66 44 0f 7e 50 ce
|
||||
; asm: movd %xmm5, 10000(%r14)
|
||||
[-] store.f32 v100, v3+10000 ; bin: 66 41 0f 7e ae 00002710
|
||||
; asm: movd %xmm10, -10000(%rax)
|
||||
[-] store.f32 v101, v2-10000 ; bin: 66 44 0f 7e 90 ffffd8f0
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
@@ -165,5 +193,33 @@ ebb0:
|
||||
; asm: xorps %xmm5, %xmm10
|
||||
[-,%xmm10] v37 = bxor v11, v10 ; bin: 44 0f 57 d5
|
||||
|
||||
; Load/Store
|
||||
|
||||
; asm: movq (%r14), %xmm5
|
||||
[-,%xmm5] v100 = load.f64 v3 ; bin: f3 41 0f 7e 2e
|
||||
; asm: movq (%rax), %xmm10
|
||||
[-,%xmm10] v101 = load.f64 v2 ; bin: f3 44 0f 7e 10
|
||||
; asm: movq 50(%r14), %xmm5
|
||||
[-,%xmm5] v110 = load.f64 v3+50 ; bin: f3 41 0f 7e 6e 32
|
||||
; asm: movq -50(%rax), %xmm10
|
||||
[-,%xmm10] v111 = load.f64 v2-50 ; bin: f3 44 0f 7e 50 ce
|
||||
; asm: movq 10000(%r14), %xmm5
|
||||
[-,%xmm5] v120 = load.f64 v3+10000 ; bin: f3 41 0f 7e ae 00002710
|
||||
; asm: movq -10000(%rax), %xmm10
|
||||
[-,%xmm10] v121 = load.f64 v2-10000 ; bin: f3 44 0f 7e 90 ffffd8f0
|
||||
|
||||
; asm: movq %xmm5, (%r14)
|
||||
[-] store.f64 v100, v3 ; bin: 66 41 0f d6 2e
|
||||
; asm: movq %xmm10, (%rax)
|
||||
[-] store.f64 v101, v2 ; bin: 66 44 0f d6 10
|
||||
; asm: movq %xmm5, 50(%r14)
|
||||
[-] store.f64 v100, v3+50 ; bin: 66 41 0f d6 6e 32
|
||||
; asm: movq %xmm10, -50(%rax)
|
||||
[-] store.f64 v101, v2-50 ; bin: 66 44 0f d6 50 ce
|
||||
; asm: movq %xmm5, 10000(%r14)
|
||||
[-] store.f64 v100, v3+10000 ; bin: 66 41 0f d6 ae 00002710
|
||||
; asm: movq %xmm10, -10000(%rax)
|
||||
[-] store.f64 v101, v2-10000 ; bin: 66 44 0f d6 90 ffffd8f0
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
27
cranelift/filetests/wasm/f32-memory64.cton
Normal file
27
cranelift/filetests/wasm/f32-memory64.cton
Normal file
@@ -0,0 +1,27 @@
|
||||
; Test basic code generation for f32 memory WebAssembly instructions.
|
||||
test compile
|
||||
|
||||
; We only test on 64-bit since the heap_addr instructions and vmctx parameters
|
||||
; explicitly mention the pointer width.
|
||||
set is_64bit=1
|
||||
isa intel haswell
|
||||
|
||||
function %f32_load(i32, i64 vmctx) -> f32 {
|
||||
gv0 = vmctx
|
||||
heap0 = static gv0, min 0x0001_0000, bound 0x0001_0000_0000, guard 0x8000_0000
|
||||
|
||||
ebb0(v0: i32, v1: i64):
|
||||
v2 = heap_addr.i64 heap0, v0, 1
|
||||
v3 = load.f32 v2
|
||||
return v3
|
||||
}
|
||||
|
||||
function %f32_store(f32, i32, i64 vmctx) {
|
||||
gv0 = vmctx
|
||||
heap0 = static gv0, min 0x0001_0000, bound 0x0001_0000_0000, guard 0x8000_0000
|
||||
|
||||
ebb0(v0: f32, v1: i32, v2: i64):
|
||||
v3 = heap_addr.i64 heap0, v1, 1
|
||||
store v0, v3
|
||||
return
|
||||
}
|
||||
27
cranelift/filetests/wasm/f64-memory64.cton
Normal file
27
cranelift/filetests/wasm/f64-memory64.cton
Normal file
@@ -0,0 +1,27 @@
|
||||
; Test basic code generation for f64 memory WebAssembly instructions.
|
||||
test compile
|
||||
|
||||
; We only test on 64-bit since the heap_addr instructions and vmctx parameters
|
||||
; explicitly mention the pointer width.
|
||||
set is_64bit=1
|
||||
isa intel haswell
|
||||
|
||||
function %f64_load(i32, i64 vmctx) -> f64 {
|
||||
gv0 = vmctx
|
||||
heap0 = static gv0, min 0x0001_0000, bound 0x0001_0000_0000, guard 0x8000_0000
|
||||
|
||||
ebb0(v0: i32, v1: i64):
|
||||
v2 = heap_addr.i64 heap0, v0, 1
|
||||
v3 = load.f64 v2
|
||||
return v3
|
||||
}
|
||||
|
||||
function %f64_store(f64, i32, i64 vmctx) {
|
||||
gv0 = vmctx
|
||||
heap0 = static gv0, min 0x0001_0000, bound 0x0001_0000_0000, guard 0x8000_0000
|
||||
|
||||
ebb0(v0: f64, v1: i32, v2: i64):
|
||||
v3 = heap_addr.i64 heap0, v1, 1
|
||||
store v0, v3
|
||||
return
|
||||
}
|
||||
88
cranelift/filetests/wasm/i32-memory64.cton
Normal file
88
cranelift/filetests/wasm/i32-memory64.cton
Normal file
@@ -0,0 +1,88 @@
|
||||
; Test basic code generation for i32 memory WebAssembly instructions.
|
||||
test compile
|
||||
|
||||
; We only test on 64-bit since the heap_addr instructions and vmctx parameters
|
||||
; explicitly mention the pointer width.
|
||||
set is_64bit=1
|
||||
isa intel haswell
|
||||
|
||||
function %i32_load(i32, i64 vmctx) -> i32 {
|
||||
gv0 = vmctx
|
||||
heap0 = static gv0, min 0x0001_0000, bound 0x0001_0000_0000, guard 0x8000_0000
|
||||
|
||||
ebb0(v0: i32, v1: i64):
|
||||
v2 = heap_addr.i64 heap0, v0, 1
|
||||
v3 = load.i32 v2
|
||||
return v3
|
||||
}
|
||||
|
||||
function %i32_store(i32, i32, i64 vmctx) {
|
||||
gv0 = vmctx
|
||||
heap0 = static gv0, min 0x0001_0000, bound 0x0001_0000_0000, guard 0x8000_0000
|
||||
|
||||
ebb0(v0: i32, v1: i32, v2: i64):
|
||||
v3 = heap_addr.i64 heap0, v1, 1
|
||||
store v0, v3
|
||||
return
|
||||
}
|
||||
|
||||
function %i32_load8_s(i32, i64 vmctx) -> i32 {
|
||||
gv0 = vmctx
|
||||
heap0 = static gv0, min 0x0001_0000, bound 0x0001_0000_0000, guard 0x8000_0000
|
||||
|
||||
ebb0(v0: i32, v1: i64):
|
||||
v2 = heap_addr.i64 heap0, v0, 1
|
||||
v3 = sload8.i32 v2
|
||||
return v3
|
||||
}
|
||||
|
||||
function %i32_load8_u(i32, i64 vmctx) -> i32 {
|
||||
gv0 = vmctx
|
||||
heap0 = static gv0, min 0x0001_0000, bound 0x0001_0000_0000, guard 0x8000_0000
|
||||
|
||||
ebb0(v0: i32, v1: i64):
|
||||
v2 = heap_addr.i64 heap0, v0, 1
|
||||
v3 = uload8.i32 v2
|
||||
return v3
|
||||
}
|
||||
|
||||
function %i32_store8(i32, i32, i64 vmctx) {
|
||||
gv0 = vmctx
|
||||
heap0 = static gv0, min 0x0001_0000, bound 0x0001_0000_0000, guard 0x8000_0000
|
||||
|
||||
ebb0(v0: i32, v1: i32, v2: i64):
|
||||
v3 = heap_addr.i64 heap0, v1, 1
|
||||
istore8 v0, v3
|
||||
return
|
||||
}
|
||||
|
||||
function %i32_load16_s(i32, i64 vmctx) -> i32 {
|
||||
gv0 = vmctx
|
||||
heap0 = static gv0, min 0x0001_0000, bound 0x0001_0000_0000, guard 0x8000_0000
|
||||
|
||||
ebb0(v0: i32, v1: i64):
|
||||
v2 = heap_addr.i64 heap0, v0, 1
|
||||
v3 = sload16.i32 v2
|
||||
return v3
|
||||
}
|
||||
|
||||
function %i32_load16_u(i32, i64 vmctx) -> i32 {
|
||||
gv0 = vmctx
|
||||
heap0 = static gv0, min 0x0001_0000, bound 0x0001_0000_0000, guard 0x8000_0000
|
||||
|
||||
ebb0(v0: i32, v1: i64):
|
||||
v2 = heap_addr.i64 heap0, v0, 1
|
||||
v3 = uload16.i32 v2
|
||||
return v3
|
||||
}
|
||||
|
||||
function %i32_store16(i32, i32, i64 vmctx) {
|
||||
gv0 = vmctx
|
||||
heap0 = static gv0, min 0x0001_0000, bound 0x0001_0000_0000, guard 0x8000_0000
|
||||
|
||||
ebb0(v0: i32, v1: i32, v2: i64):
|
||||
v3 = heap_addr.i64 heap0, v1, 1
|
||||
istore16 v0, v3
|
||||
return
|
||||
}
|
||||
|
||||
117
cranelift/filetests/wasm/i64-memory64.cton
Normal file
117
cranelift/filetests/wasm/i64-memory64.cton
Normal file
@@ -0,0 +1,117 @@
|
||||
; Test basic code generation for i64 memory WebAssembly instructions.
|
||||
test compile
|
||||
|
||||
; We only test on 64-bit since the heap_addr instructions and vmctx parameters
|
||||
; explicitly mention the pointer width.
|
||||
set is_64bit=1
|
||||
isa intel haswell
|
||||
|
||||
function %i64_load(i32, i64 vmctx) -> i64 {
|
||||
gv0 = vmctx
|
||||
heap0 = static gv0, min 0x0001_0000, bound 0x0001_0000_0000, guard 0x8000_0000
|
||||
|
||||
ebb0(v0: i32, v1: i64):
|
||||
v2 = heap_addr.i64 heap0, v0, 1
|
||||
v3 = load.i64 v2
|
||||
return v3
|
||||
}
|
||||
|
||||
function %i64_store(i64, i32, i64 vmctx) {
|
||||
gv0 = vmctx
|
||||
heap0 = static gv0, min 0x0001_0000, bound 0x0001_0000_0000, guard 0x8000_0000
|
||||
|
||||
ebb0(v0: i64, v1: i32, v2: i64):
|
||||
v3 = heap_addr.i64 heap0, v1, 1
|
||||
store v0, v3
|
||||
return
|
||||
}
|
||||
|
||||
function %i64_load8_s(i32, i64 vmctx) -> i64 {
|
||||
gv0 = vmctx
|
||||
heap0 = static gv0, min 0x0001_0000, bound 0x0001_0000_0000, guard 0x8000_0000
|
||||
|
||||
ebb0(v0: i32, v1: i64):
|
||||
v2 = heap_addr.i64 heap0, v0, 1
|
||||
v3 = sload8.i64 v2
|
||||
return v3
|
||||
}
|
||||
|
||||
function %i64_load8_u(i32, i64 vmctx) -> i64 {
|
||||
gv0 = vmctx
|
||||
heap0 = static gv0, min 0x0001_0000, bound 0x0001_0000_0000, guard 0x8000_0000
|
||||
|
||||
ebb0(v0: i32, v1: i64):
|
||||
v2 = heap_addr.i64 heap0, v0, 1
|
||||
v3 = uload8.i64 v2
|
||||
return v3
|
||||
}
|
||||
|
||||
function %i64_store8(i64, i32, i64 vmctx) {
|
||||
gv0 = vmctx
|
||||
heap0 = static gv0, min 0x0001_0000, bound 0x0001_0000_0000, guard 0x8000_0000
|
||||
|
||||
ebb0(v0: i64, v1: i32, v2: i64):
|
||||
v3 = heap_addr.i64 heap0, v1, 1
|
||||
istore8 v0, v3
|
||||
return
|
||||
}
|
||||
|
||||
function %i64_load16_s(i32, i64 vmctx) -> i64 {
|
||||
gv0 = vmctx
|
||||
heap0 = static gv0, min 0x0001_0000, bound 0x0001_0000_0000, guard 0x8000_0000
|
||||
|
||||
ebb0(v0: i32, v1: i64):
|
||||
v2 = heap_addr.i64 heap0, v0, 1
|
||||
v3 = sload16.i64 v2
|
||||
return v3
|
||||
}
|
||||
|
||||
function %i64_load16_u(i32, i64 vmctx) -> i64 {
|
||||
gv0 = vmctx
|
||||
heap0 = static gv0, min 0x0001_0000, bound 0x0001_0000_0000, guard 0x8000_0000
|
||||
|
||||
ebb0(v0: i32, v1: i64):
|
||||
v2 = heap_addr.i64 heap0, v0, 1
|
||||
v3 = uload16.i64 v2
|
||||
return v3
|
||||
}
|
||||
|
||||
function %i64_store16(i64, i32, i64 vmctx) {
|
||||
gv0 = vmctx
|
||||
heap0 = static gv0, min 0x0001_0000, bound 0x0001_0000_0000, guard 0x8000_0000
|
||||
|
||||
ebb0(v0: i64, v1: i32, v2: i64):
|
||||
v3 = heap_addr.i64 heap0, v1, 1
|
||||
istore16 v0, v3
|
||||
return
|
||||
}
|
||||
|
||||
function %i64_load32_s(i32, i64 vmctx) -> i64 {
|
||||
gv0 = vmctx
|
||||
heap0 = static gv0, min 0x0001_0000, bound 0x0001_0000_0000, guard 0x8000_0000
|
||||
|
||||
ebb0(v0: i32, v1: i64):
|
||||
v2 = heap_addr.i64 heap0, v0, 1
|
||||
v3 = sload32.i64 v2
|
||||
return v3
|
||||
}
|
||||
|
||||
function %i64_load32_u(i32, i64 vmctx) -> i64 {
|
||||
gv0 = vmctx
|
||||
heap0 = static gv0, min 0x0001_0000, bound 0x0001_0000_0000, guard 0x8000_0000
|
||||
|
||||
ebb0(v0: i32, v1: i64):
|
||||
v2 = heap_addr.i64 heap0, v0, 1
|
||||
v3 = uload32.i64 v2
|
||||
return v3
|
||||
}
|
||||
|
||||
function %i64_store32(i64, i32, i64 vmctx) {
|
||||
gv0 = vmctx
|
||||
heap0 = static gv0, min 0x0001_0000, bound 0x0001_0000_0000, guard 0x8000_0000
|
||||
|
||||
ebb0(v0: i64, v1: i32, v2: i64):
|
||||
v3 = heap_addr.i64 heap0, v1, 1
|
||||
istore32 v0, v3
|
||||
return
|
||||
}
|
||||
@@ -223,6 +223,26 @@ enc_i32_i64_ld_st(base.sload8, True, r.ld, 0x0f, 0xbe)
|
||||
enc_i32_i64_ld_st(base.sload8, True, r.ldDisp8, 0x0f, 0xbe)
|
||||
enc_i32_i64_ld_st(base.sload8, True, r.ldDisp32, 0x0f, 0xbe)
|
||||
|
||||
#
|
||||
# Float loads and stores.
|
||||
#
|
||||
|
||||
enc_flt(base.load.f32.any, r.fld, 0x66, 0x0f, 0x6e)
|
||||
enc_flt(base.load.f32.any, r.fldDisp8, 0x66, 0x0f, 0x6e)
|
||||
enc_flt(base.load.f32.any, r.fldDisp32, 0x66, 0x0f, 0x6e)
|
||||
|
||||
enc_flt(base.load.f64.any, r.fld, 0xf3, 0x0f, 0x7e)
|
||||
enc_flt(base.load.f64.any, r.fldDisp8, 0xf3, 0x0f, 0x7e)
|
||||
enc_flt(base.load.f64.any, r.fldDisp32, 0xf3, 0x0f, 0x7e)
|
||||
|
||||
enc_flt(base.store.f32.any, r.fst, 0x66, 0x0f, 0x7e)
|
||||
enc_flt(base.store.f32.any, r.fstDisp8, 0x66, 0x0f, 0x7e)
|
||||
enc_flt(base.store.f32.any, r.fstDisp32, 0x66, 0x0f, 0x7e)
|
||||
|
||||
enc_flt(base.store.f64.any, r.fst, 0x66, 0x0f, 0xd6)
|
||||
enc_flt(base.store.f64.any, r.fstDisp8, 0x66, 0x0f, 0xd6)
|
||||
enc_flt(base.store.f64.any, r.fstDisp32, 0x66, 0x0f, 0xd6)
|
||||
|
||||
#
|
||||
# Call/return
|
||||
#
|
||||
|
||||
@@ -374,6 +374,15 @@ st_abcd = TailRecipe(
|
||||
modrm_rm(in_reg1, in_reg0, sink);
|
||||
''')
|
||||
|
||||
# XX /r register-indirect store of FPR with no offset.
|
||||
fst = TailRecipe(
|
||||
'fst', Store, size=1, ins=(FPR, GPR), outs=(),
|
||||
instp=IsEqual(Store.offset, 0),
|
||||
emit='''
|
||||
PUT_OP(bits, rex2(in_reg1, in_reg0), sink);
|
||||
modrm_rm(in_reg1, in_reg0, sink);
|
||||
''')
|
||||
|
||||
# XX /r register-indirect store with 8-bit offset.
|
||||
stDisp8 = TailRecipe(
|
||||
'stDisp8', Store, size=2, ins=(GPR, GPR), outs=(),
|
||||
@@ -393,6 +402,15 @@ stDisp8_abcd = TailRecipe(
|
||||
let offset: i32 = offset.into();
|
||||
sink.put1(offset as u8);
|
||||
''')
|
||||
fstDisp8 = TailRecipe(
|
||||
'fstDisp8', Store, size=2, ins=(FPR, GPR), outs=(),
|
||||
instp=IsSignedInt(Store.offset, 8),
|
||||
emit='''
|
||||
PUT_OP(bits, rex2(in_reg1, in_reg0), sink);
|
||||
modrm_disp8(in_reg1, in_reg0, sink);
|
||||
let offset: i32 = offset.into();
|
||||
sink.put1(offset as u8);
|
||||
''')
|
||||
|
||||
# XX /r register-indirect store with 32-bit offset.
|
||||
stDisp32 = TailRecipe(
|
||||
@@ -411,6 +429,14 @@ stDisp32_abcd = TailRecipe(
|
||||
let offset: i32 = offset.into();
|
||||
sink.put4(offset as u32);
|
||||
''')
|
||||
fstDisp32 = TailRecipe(
|
||||
'fstDisp32', Store, size=5, ins=(FPR, GPR), outs=(),
|
||||
emit='''
|
||||
PUT_OP(bits, rex2(in_reg1, in_reg0), sink);
|
||||
modrm_disp32(in_reg1, in_reg0, sink);
|
||||
let offset: i32 = offset.into();
|
||||
sink.put4(offset as u32);
|
||||
''')
|
||||
|
||||
#
|
||||
# Load recipes
|
||||
@@ -425,6 +451,15 @@ ld = TailRecipe(
|
||||
modrm_rm(in_reg0, out_reg0, sink);
|
||||
''')
|
||||
|
||||
# XX /r float load with no offset.
|
||||
fld = TailRecipe(
|
||||
'fld', Load, size=1, ins=(GPR), outs=(FPR),
|
||||
instp=IsEqual(Load.offset, 0),
|
||||
emit='''
|
||||
PUT_OP(bits, rex2(in_reg0, out_reg0), sink);
|
||||
modrm_rm(in_reg0, out_reg0, sink);
|
||||
''')
|
||||
|
||||
# XX /r load with 8-bit offset.
|
||||
ldDisp8 = TailRecipe(
|
||||
'ldDisp8', Load, size=2, ins=(GPR), outs=(GPR),
|
||||
@@ -436,6 +471,17 @@ ldDisp8 = TailRecipe(
|
||||
sink.put1(offset as u8);
|
||||
''')
|
||||
|
||||
# XX /r float load with 8-bit offset.
|
||||
fldDisp8 = TailRecipe(
|
||||
'fldDisp8', Load, size=2, ins=(GPR), outs=(FPR),
|
||||
instp=IsSignedInt(Load.offset, 8),
|
||||
emit='''
|
||||
PUT_OP(bits, rex2(in_reg0, out_reg0), sink);
|
||||
modrm_disp8(in_reg0, out_reg0, sink);
|
||||
let offset: i32 = offset.into();
|
||||
sink.put1(offset as u8);
|
||||
''')
|
||||
|
||||
# XX /r load with 32-bit offset.
|
||||
ldDisp32 = TailRecipe(
|
||||
'ldDisp32', Load, size=5, ins=(GPR), outs=(GPR),
|
||||
@@ -447,6 +493,17 @@ ldDisp32 = TailRecipe(
|
||||
sink.put4(offset as u32);
|
||||
''')
|
||||
|
||||
# XX /r float load with 32-bit offset.
|
||||
fldDisp32 = TailRecipe(
|
||||
'fldDisp32', Load, size=5, ins=(GPR), outs=(FPR),
|
||||
instp=IsSignedInt(Load.offset, 32),
|
||||
emit='''
|
||||
PUT_OP(bits, rex2(in_reg0, out_reg0), sink);
|
||||
modrm_disp32(in_reg0, out_reg0, sink);
|
||||
let offset: i32 = offset.into();
|
||||
sink.put4(offset as u32);
|
||||
''')
|
||||
|
||||
#
|
||||
# Call/return
|
||||
#
|
||||
|
||||
Reference in New Issue
Block a user