load_complex and store_complex instructions (#309)

* Start adding the load_complex and store_complex instructions. N.b.: The text format is not correct yet. Requires changes to the lexer and parser. I'm not sure why I needed to change the RuntimeError to Exception yet. Will fix. * Get first few encodings of load_complex working. Still needs var args type checking. * Clean up ModRM helper functions in binemit. * Implement 32-bit displace for load_complex * Use encoding helpers instead of doing them all by hand * Initial implementation of store_complex * Parse value list for load/store_complex with + as delimiter. Looks nice. * Add sign/zero-extension and size variants for load_complex. * Add size variants of store_complex. * Add asm helper lines to load/store complex bin tests. * Example of length-checking the instruction ValueList for an encoding. Extremely questionable implementation. * Fix Python linting issues * First draft of postopt pass to fold adds and loads into load_complex. Just simple loads for now. * Optimization pass now works with all types of loads. * Add store+add -> store_complex to postopt pass * Put complex address optimization behind ISA flag. * Add load/store complex for f32 and f64 * Fixes changes to lexer that broke NaN parsing. Abstracts away the repeated checks for whether or not the characters following a + or - are going to be parsed as a number or not. * Fix formatting issues * Fix register restrictions for complex addresses. * Encoding tests for x86-32. * Add documentation for newly added instructions, recipes, and cdsl changes. * Fix python formatting again * Apply value-list length predicates to all LoadComplex and StoreComplex instructions. * Add predicate types to new encoding helpers for mypy. * Import FieldPredicate to satisfy mypy. * Add and fix some "asm" strings in the encoding tests. * Line-up 'bin' comments in x86/binary64 test * Test parsing of offset-less store_complex instruction. * 'sNaN' not 'sNan' * Bounds check the lookup for polymorphic typevar operand. 
* Fix encodings for istore16_complex.
2018-05-09 12:07:00 -07:00
parent 5aa84a744b
commit f636d795c5
25 changed files with 1127 additions and 21 deletions
--- a/cranelift/docs/langref.rst
+++ b/cranelift/docs/langref.rst
@@ -476,6 +476,11 @@ these instructions is undefined. If it is addressable but not
 There are also more restricted operations for accessing specific types of memory
 objects.

+Additionally, instructions are provided for handling multi-register addressing.
+
+.. autoinst:: load_complex
+.. autoinst:: store_complex
+
 Memory operation flags
 ----------------------

--- a/cranelift/filetests/isa/x86/binary32-float.cton
+++ b/cranelift/filetests/isa/x86/binary32-float.cton
@@ -227,6 +227,32 @@ ebb0:
    ; asm: ucomiss %xmm5, %xmm5
    [-,%rflags]         v312 = ffcmp v10, v10                   ; bin: 0f 2e ed

+    ; Load/Store Complex
+
+    [-,%rax]            v350 = iconst.i32 1
+    [-,%rbx]            v351 = iconst.i32 2
+
+    ; asm: movss (%rax,%rbx,1),%xmm5
+    [-,%xmm5]           v352 = load_complex.f32 v350+v351               ; bin: heap_oob f3 0f 10 2c 18
+    ; asm: movss 0x32(%rax,%rbx,1),%xmm5
+    [-,%xmm5]           v353 = load_complex.f32 v350+v351+50            ; bin: heap_oob f3 0f 10 6c 18 32
+    ; asm: movss -0x32(%rax,%rbx,1),%xmm5
+    [-,%xmm5]           v354 = load_complex.f32 v350+v351-50            ; bin: heap_oob f3 0f 10 6c 18 ce
+    ; asm: movss 0x2710(%rax,%rbx,1),%xmm5
+    [-,%xmm5]           v355 = load_complex.f32 v350+v351+10000         ; bin: heap_oob f3 0f 10 ac 18 00002710
+    ; asm: movss -0x2710(%rax,%rbx,1),%xmm5
+    [-,%xmm5]           v356 = load_complex.f32 v350+v351-10000         ; bin: heap_oob f3 0f 10 ac 18 ffffd8f0
+    ; asm: movss %xmm5,(%rax,%rbx,1)
+    [-]                 store_complex.f32 v100, v350+v351               ; bin: heap_oob f3 0f 11 2c 18
+    ; asm: movss %xmm5,0x32(%rax,%rbx,1)
+    [-]                 store_complex.f32 v100, v350+v351+50            ; bin: heap_oob f3 0f 11 6c 18 32
+    ; asm: movss %xmm2,-0x32(%rax,%rbx,1)
+    [-]                 store_complex.f32 v101, v350+v351-50            ; bin: heap_oob f3 0f 11 54 18 ce
+    ; asm: movss %xmm5,0x2710(%rax,%rbx,1)
+    [-]                 store_complex.f32 v100, v350+v351+10000         ; bin: heap_oob f3 0f 11 ac 18 00002710
+    ; asm: movss %xmm2,-0x2710(%rax,%rbx,1)
+    [-]                 store_complex.f32 v101, v350+v351-10000         ; bin: heap_oob f3 0f 11 94 18 ffffd8f0
+
    return
 }

--- a/cranelift/filetests/isa/x86/binary32.cton
+++ b/cranelift/filetests/isa/x86/binary32.cton
@@ -432,6 +432,37 @@ ebb0:
    ; asm: shrl $8, %esi
    [-,%rsi]             v515 = ushr_imm v2, 8    ; bin: c1 ee 08

+    ; Load Complex
+    [-,%rax]            v521 = iconst.i32 1
+    [-,%rbx]            v522 = iconst.i32 1
+    ; asm: movl (%eax,%ebx,1), %ecx
+    [-,%rcx]            v526 = load_complex.i32 v521+v522         ; bin: heap_oob 8b 0c 18
+    ; asm: movl 1(%eax,%ebx,1), %ecx
+    [-,%rcx]            v528 = load_complex.i32 v521+v522+1       ; bin: heap_oob 8b 4c 18 01
+    ; asm: mov    0x1000(%eax,%ebx,1),%ecx
+    [-,%rcx]            v530 = load_complex.i32 v521+v522+0x1000  ; bin: heap_oob 8b 8c 18 00001000
+    ; asm: movzbl (%eax,%ebx,1),%ecx
+    [-,%rcx]            v532 = uload8_complex.i32 v521+v522         ; bin: heap_oob 0f b6 0c 18
+    ; asm: movsbl (%eax,%ebx,1),%ecx
+    [-,%rcx]            v534 = sload8_complex.i32 v521+v522         ; bin: heap_oob 0f be 0c 18
+    ; asm: movzwl (%eax,%ebx,1),%ecx
+    [-,%rcx]            v536 = uload16_complex.i32 v521+v522         ; bin: heap_oob 0f b7 0c 18
+    ; asm: movswl (%eax,%ebx,1),%ecx
+    [-,%rcx]            v538 = sload16_complex.i32 v521+v522         ; bin: heap_oob 0f bf 0c 18
+
+    ; Store Complex
+    [-,%rcx]            v601 = iconst.i32 1
+    ; asm: mov    %ecx,(%eax,%ebx,1)
+    store_complex v601, v521+v522        ; bin: heap_oob 89 0c 18
+    ; asm: mov    %ecx,0x1(%eax,%ebx,1)
+    store_complex v601, v521+v522+1      ; bin: heap_oob 89 4c 18 01
+    ; asm: mov    %ecx,0x1000(%eax,%ebx,1)
+    store_complex v601, v521+v522+0x1000 ; bin: heap_oob 89 8c 18 00001000
+    ; asm: mov    %cx,(%eax,%ebx,1)
+    istore16_complex v601, v521+v522     ; bin: heap_oob 66 89 0c 18
+    ; asm: mov    %cl,(%eax,%ebx,1)
+    istore8_complex v601, v521+v522      ; bin: heap_oob 88 0c 18
+
    ; asm: testl %ecx, %ecx
    ; asm: je ebb1
    brz v1, ebb1                                ; bin: 85 c9 74 0e
--- a/cranelift/filetests/isa/x86/binary64-float.cton
+++ b/cranelift/filetests/isa/x86/binary64-float.cton
@@ -241,6 +241,34 @@ ebb0:
    ; asm: ucomiss %xmm5, %xmm5
    [-,%rflags]         v312 = ffcmp v10, v10                   ; bin: 0f 2e ed

+
+    ; Load/Store Complex
+
+    [-,%rax]            v350 = iconst.i64 1
+    [-,%rbx]            v351 = iconst.i64 2
+
+    ; asm: movss  (%rax,%rbx,1),%xmm5
+    [-,%xmm5]           v352 = load_complex.f32 v350+v351               ; bin: heap_oob f3 0f 10 2c 18
+    ; asm: movss 0x32(%rax,%rbx,1),%xmm5
+    [-,%xmm5]           v353 = load_complex.f32 v350+v351+50            ; bin: heap_oob f3 0f 10 6c 18 32
+    ; asm: movss -0x32(%rax,%rbx,1),%xmm10
+    [-,%xmm10]          v354 = load_complex.f32 v350+v351-50            ; bin: heap_oob f3 44 0f 10 54 18 ce
+    ; asm: movss 0x2710(%rax,%rbx,1),%xmm5
+    [-,%xmm5]           v355 = load_complex.f32 v350+v351+10000         ; bin: heap_oob f3 0f 10 ac 18 00002710
+    ; asm: movss -0x2710(%rax,%rbx,1),%xmm10
+    [-,%xmm10]          v356 = load_complex.f32 v350+v351-10000         ; bin: heap_oob f3 44 0f 10 94 18 ffffd8f0
+
+    ; asm: movss %xmm5, (%rax,%rbx,1)
+    [-]                 store_complex.f32 v100, v350+v351               ; bin: heap_oob f3 0f 11 2c 18
+    ; asm: movss %xmm5, 50(%rax,%rbx,1)
+    [-]                 store_complex.f32 v100, v350+v351+50            ; bin: heap_oob f3 0f 11 6c 18 32
+    ; asm: movss %xmm10, -50(%rax,%rbx,1)
+    [-]                 store_complex.f32 v101, v350+v351-50            ; bin: heap_oob f3 44 0f 11 54 18 ce
+    ; asm: movss %xmm5, 10000(%rax,%rbx,1)
+    [-]                 store_complex.f32 v100, v350+v351+10000         ; bin: heap_oob f3 0f 11 ac 18 00002710
+    ; asm: movss %xmm10, -10000(%rax,%rbx,1)
+    [-]                 store_complex.f32 v101, v350+v351-10000         ; bin: heap_oob f3 44 0f 11 94 18 ffffd8f0
+
    return
 }

@@ -476,6 +504,32 @@ ebb0:
    ; asm: ucomisd %xmm5, %xmm5
    [-,%rflags]         v312 = ffcmp v10, v10                   ; bin: 66 0f 2e ed

+    ; Load/Store Complex
+
+    [-,%rax]            v350 = iconst.i64 1
+    [-,%rbx]            v351 = iconst.i64 2
+    ; asm: movsd (%rax,%rbx,1),%xmm5
+    [-,%xmm5]           v352 = load_complex.f64 v350+v351               ; bin: heap_oob f2 0f 10 2c 18
+    ; asm: movsd 0x32(%rax,%rbx,1),%xmm5
+    [-,%xmm5]           v353 = load_complex.f64 v350+v351+50            ; bin: heap_oob f2 0f 10 6c 18 32
+    ; asm: movsd -0x32(%rax,%rbx,1),%xmm10
+    [-,%xmm10]          v354 = load_complex.f64 v350+v351-50            ; bin: heap_oob f2 44 0f 10 54 18 ce
+    ; asm: movsd 0x2710(%rax,%rbx,1),%xmm5
+    [-,%xmm5]           v355 = load_complex.f64 v350+v351+10000         ; bin: heap_oob f2 0f 10 ac 18 00002710
+    ; asm: movsd -0x2710(%rax,%rbx,1),%xmm10
+    [-,%xmm10]          v356 = load_complex.f64 v350+v351-10000         ; bin: heap_oob f2 44 0f 10 94 18 ffffd8f0
+
+    ; asm: movsd %xmm5, (%rax,%rbx,1)
+    [-]                 store_complex.f64 v100, v350+v351               ; bin: heap_oob f2 0f 11 2c 18
+    ; asm: movsd %xmm5, 50(%rax,%rbx,1)
+    [-]                 store_complex.f64 v100, v350+v351+50            ; bin: heap_oob f2 0f 11 6c 18 32
+    ; asm: movsd %xmm10, -50(%rax,%rbx,1)
+    [-]                 store_complex.f64 v101, v350+v351-50            ; bin: heap_oob f2 44 0f 11 54 18 ce
+    ; asm: movsd %xmm5, 10000(%rax,%rbx,1)
+    [-]                 store_complex.f64 v100, v350+v351+10000         ; bin: heap_oob f2 0f 11 ac 18 00002710
+    ; asm: movsd %xmm10, -10000(%rax,%rbx,1)
+    [-]                 store_complex.f64 v101, v350+v351-10000         ; bin: heap_oob f2 44 0f 11 94 18 ffffd8f0
+
    return
 }

--- a/cranelift/filetests/isa/x86/binary64.cton
+++ b/cranelift/filetests/isa/x86/binary64.cton
@@ -594,6 +594,80 @@ ebb0:
    [-,%r8]              v520 = ushr_imm v4, 63   ; bin: 49 c1 e8 3f


+    ; Load Complex
+    [-,%rax]            v521 = iconst.i64 1
+    [-,%rbx]            v522 = iconst.i64 1
+    [-,%rdi]            v523 = iconst.i32 1
+    [-,%rsi]            v524 = iconst.i32 1
+    ; asm: movq (%rax,%rbx,1), %rcx
+    [-,%rcx]            v525 = load_complex.i64 v521+v522               ; bin: heap_oob 48 8b 0c 18
+    ; asm: movl (%rax,%rbx,1), %ecx
+    [-,%rcx]            v526 = load_complex.i32 v521+v522               ; bin: heap_oob 8b 0c 18
+    ; asm: movq 1(%rax,%rbx,1), %rcx
+    [-,%rcx]            v527 = load_complex.i64 v521+v522+1             ; bin: heap_oob 48 8b 4c 18 01
+    ; asm: movl 1(%rax,%rbx,1), %ecx
+    [-,%rcx]            v528 = load_complex.i32 v521+v522+1             ; bin: heap_oob 8b 4c 18 01
+    ; asm: mov    0x1000(%rax,%rbx,1),%rcx
+    [-,%rcx]            v529 = load_complex.i64 v521+v522+0x1000        ; bin: heap_oob 48 8b 8c 18 00001000
+    ; asm: mov    0x1000(%rax,%rbx,1),%ecx
+    [-,%rcx]            v530 = load_complex.i32 v521+v522+0x1000        ; bin: heap_oob 8b 8c 18 00001000
+    ; asm: movzbq (%rax,%rbx,1),%rcx
+    [-,%rcx]            v531 = uload8_complex.i64 v521+v522             ; bin: heap_oob 48 0f b6 0c 18
+    ; asm: movzbl (%rax,%rbx,1),%ecx
+    [-,%rcx]            v532 = uload8_complex.i32 v521+v522             ; bin: heap_oob 0f b6 0c 18
+    ; asm: movsbq (%rax,%rbx,1),%rcx
+    [-,%rcx]            v533 = sload8_complex.i64 v521+v522             ; bin: heap_oob 48 0f be 0c 18
+    ; asm: movsbl (%rax,%rbx,1),%ecx
+    [-,%rcx]            v534 = sload8_complex.i32 v521+v522             ; bin: heap_oob 0f be 0c 18
+    ; asm: movzwq (%rax,%rbx,1),%rcx
+    [-,%rcx]            v535 = uload16_complex.i64 v521+v522            ; bin: heap_oob 48 0f b7 0c 18
+    ; asm: movzwl (%rax,%rbx,1),%ecx
+    [-,%rcx]            v536 = uload16_complex.i32 v521+v522            ; bin: heap_oob 0f b7 0c 18
+    ; asm: movswq (%rax,%rbx,1),%rcx
+    [-,%rcx]            v537 = sload16_complex.i64 v521+v522            ; bin: heap_oob 48 0f bf 0c 18
+    ; asm: movswl (%rax,%rbx,1),%ecx
+    [-,%rcx]            v538 = sload16_complex.i32 v521+v522            ; bin: heap_oob 0f bf 0c 18
+    ; asm: mov    (%rax,%rbx,1),%ecx
+    [-,%rcx]            v539 = uload32_complex v521+v522                ; bin: heap_oob 8b 0c 18
+    ; asm: movslq (%rax,%rbx,1),%rcx
+    [-,%rcx]            v540 = sload32_complex v521+v522                ; bin: heap_oob 48 63 0c 18
+    [-,%r13]            v550 = iconst.i64 1
+    [-,%r14]            v551 = iconst.i64 1
+    ; asm: mov 0x0(%r13,%r14,1),%r12d
+    [-,%r12]            v552 = load_complex.i32 v550+v551               ; bin: heap_oob 47 8b 64 35 00
+
+    ; Store Complex
+    [-,%rcx]            v600 = iconst.i64 1
+    [-,%rcx]            v601 = iconst.i32 1
+    [-,%r10]            v602 = iconst.i64 1
+    [-,%r11]            v603 = iconst.i32 1
+    ; asm: mov    %rcx,(%rax,%rbx,1)
+    store_complex v600, v521+v522               ; bin: heap_oob 48 89 0c 18
+    ; asm: mov    %rcx,0x1(%rax,%rbx,1)
+    store_complex v600, v521+v522+1             ; bin: heap_oob 48 89 4c 18 01
+    ; asm: mov    %rcx,0x1000(%rax,%rbx,1)
+    store_complex v600, v521+v522+0x1000        ; bin: heap_oob 48 89 8c 18 00001000
+    ; asm: mov    %ecx,(%rax,%rbx,1)
+    store_complex v601, v521+v522               ; bin: heap_oob 89 0c 18
+    ; asm: mov    %ecx,0x1(%rax,%rbx,1)
+    store_complex v601, v521+v522+1             ; bin: heap_oob 89 4c 18 01
+    ; asm: mov    %ecx,0x1000(%rax,%rbx,1)
+    store_complex v601, v521+v522+0x1000        ; bin: heap_oob 89 8c 18 00001000
+    ; asm: mov    %ecx,(%rax,%rbx,1)
+    istore32_complex v600, v521+v522            ; bin: heap_oob 89 0c 18
+    ; asm: mov    %cx,(%rax,%rbx,1)
+    istore16_complex v600, v521+v522            ; bin: heap_oob 66 89 0c 18
+    ; asm: mov    %cx,(%rax,%rbx,1)
+    istore16_complex v601, v521+v522            ; bin: heap_oob 66 89 0c 18
+    ; asm: mov    %r10w,(%rax,%rbx,1)
+    istore16_complex v602, v521+v522            ; bin: heap_oob 66 44 89 14 18
+    ; asm: mov    %r11w,(%rax,%rbx,1)
+    istore16_complex v603, v521+v522            ; bin: heap_oob 66 44 89 1c 18
+    ; asm: mov    %cl,(%rax,%rbx,1)
+    istore8_complex v600, v521+v522             ; bin: heap_oob 88 0c 18
+    ; asm: mov    %cl,(%rax,%rbx,1)
+    istore8_complex v601, v521+v522             ; bin: heap_oob 88 0c 18
+
    ; asm: testq %rcx, %rcx
    ; asm: je ebb1
    brz v1, ebb1                                ; bin: 48 85 c9 74 1b
--- a/cranelift/filetests/parser/tiny.cton
+++ b/cranelift/filetests/parser/tiny.cton
@@ -158,9 +158,13 @@ ebb0(v1: i32):
    v6 = load.i64 aligned notrap v1
    v7 = load.i64 v1-12
    v8 = load.i64 notrap v1+0x1_0000
+    v9 = load_complex.i64 v1+v2
+    v10 = load_complex.i64 v1+v2+0x1
    store v2, v1
    store aligned v3, v1+12
    store notrap aligned v3, v1-12
+    store_complex v3, v1+v2
+    store_complex v3, v1+v2+0x1
 }
 ; sameln: function %memory(i32) fast {
 ; nextln: ebb0(v1: i32):
@@ -171,9 +175,13 @@ ebb0(v1: i32):
 ; nextln:     v6 = load.i64 notrap aligned v1
 ; nextln:     v7 = load.i64 v1-12
 ; nextln:     v8 = load.i64 notrap v1+0x0001_0000
+; nextln:     v9 = load_complex.i64 v1+v2
+; nextln:     v10 = load_complex.i64 v1+v2+1
 ; nextln:     store v2, v1
 ; nextln:     store aligned v3, v1+12
 ; nextln:     store notrap aligned v3, v1-12
+; nextln:     store_complex v3, v1+v2
+; nextln:     store_complex v3, v1+v2+1

 ; Register diversions.
 ; This test file has no ISA, so we can only use register unit numbers.
--- a/cranelift/filetests/postopt/complex_memory_ops.cton
+++ b/cranelift/filetests/postopt/complex_memory_ops.cton
@@ -0,0 +1,95 @@
+test postopt
+set is_64bit
+isa x86
+
+function %dual_loads(i64, i64) -> i64 {
+ebb0(v0: i64, v1: i64):
+[RexOp1rr#8001]    v3 = iadd v0, v1
+                   v4 = load.i64 v3
+                   v5 = uload8.i64 v3
+                   v6 = sload8.i64 v3
+                   v7 = uload16.i64 v3
+                   v8 = sload16.i64 v3
+                   v9 = uload32.i64 v3
+                   v10 = sload32.i64 v3
+[Op1ret#c3]        return v10
+}
+
+; sameln: function %dual_loads
+; nextln: ebb0(v0: i64, v1: i64):
+; nextln:    v3 = iadd v0, v1
+; nextln:    v4 = load_complex.i64 v0+v1
+; nextln:    v5 = uload8_complex.i64 v0+v1
+; nextln:    v6 = sload8_complex.i64 v0+v1
+; nextln:    v7 = uload16_complex.i64 v0+v1
+; nextln:    v8 = sload16_complex.i64 v0+v1
+; nextln:    v9 = uload32_complex v0+v1
+; nextln:    v10 = sload32_complex v0+v1
+; nextln:    return v10
+; nextln: }
+
+function %dual_loads2(i64, i64) -> i64 {
+ebb0(v0: i64, v1: i64):
+[RexOp1rr#8001]    v3 = iadd v0, v1
+                   v4 = load.i64 v3+1
+                   v5 = uload8.i64 v3+1
+                   v6 = sload8.i64 v3+1
+                   v7 = uload16.i64 v3+1
+                   v8 = sload16.i64 v3+1
+                   v9 = uload32.i64 v3+1
+                   v10 = sload32.i64 v3+1
+[Op1ret#c3]        return v10
+}
+
+; sameln: function %dual_loads2
+; nextln: ebb0(v0: i64, v1: i64):
+; nextln:    v3 = iadd v0, v1
+; nextln:    v4 = load_complex.i64 v0+v1+1
+; nextln:    v5 = uload8_complex.i64 v0+v1+1
+; nextln:    v6 = sload8_complex.i64 v0+v1+1
+; nextln:    v7 = uload16_complex.i64 v0+v1+1
+; nextln:    v8 = sload16_complex.i64 v0+v1+1
+; nextln:    v9 = uload32_complex v0+v1+1
+; nextln:    v10 = sload32_complex v0+v1+1
+; nextln:    return v10
+; nextln: }
+
+function %dual_stores(i64, i64, i64) {
+ebb0(v0: i64, v1: i64, v2: i64):
+[RexOp1rr#8001]    v3 = iadd v0, v1
+[RexOp1st#8089]    store.i64 v2, v3
+[RexOp1st#88]      istore8.i64 v2, v3
+[RexMp1st#189]     istore16.i64 v2, v3
+[RexOp1st#89]      istore32.i64 v2, v3
+[Op1ret#c3]        return
+}
+
+; sameln: function %dual_stores
+; nextln: ebb0(v0: i64, v1: i64, v2: i64):
+; nextln:    v3 = iadd v0, v1
+; nextln:    store_complex v2, v0+v1
+; nextln:    istore8_complex v2, v0+v1
+; nextln:    istore16_complex v2, v0+v1
+; nextln:    istore32_complex v2, v0+v1
+; nextln:    return
+; nextln: }
+
+function %dual_stores2(i64, i64, i64) {
+ebb0(v0: i64, v1: i64, v2: i64):
+[RexOp1rr#8001]         v3 = iadd v0, v1
+[RexOp1stDisp8#8089]    store.i64 v2, v3+1
+[RexOp1stDisp8#88]      istore8.i64 v2, v3+1
+[RexMp1stDisp8#189]     istore16.i64 v2, v3+1
+[RexOp1stDisp8#89]      istore32.i64 v2, v3+1
+[Op1ret#c3]             return
+}
+
+; sameln: function %dual_stores2
+; nextln: ebb0(v0: i64, v1: i64, v2: i64):
+; nextln:    v3 = iadd v0, v1
+; nextln:    store_complex v2, v0+v1+1
+; nextln:    istore8_complex v2, v0+v1+1
+; nextln:    istore16_complex v2, v0+v1+1
+; nextln:    istore32_complex v2, v0+v1+1
+; nextln:    return
+; nextln: }
--- a/lib/codegen/meta/base/formats.py
+++ b/lib/codegen/meta/base/formats.py
@@ -57,7 +57,9 @@ CallIndirect = InstructionFormat(sig_ref, VALUE, VARIABLE_ARGS)
 FuncAddr = InstructionFormat(func_ref)

 Load = InstructionFormat(memflags, VALUE, offset32)
+LoadComplex = InstructionFormat(memflags, VARIABLE_ARGS, offset32)
 Store = InstructionFormat(memflags, VALUE, VALUE, offset32)
+StoreComplex = InstructionFormat(memflags, VALUE, VARIABLE_ARGS, offset32)

 StackLoad = InstructionFormat(stack_slot, offset32)
 StackStore = InstructionFormat(VALUE, stack_slot, offset32)
--- a/lib/codegen/meta/base/instructions.py
+++ b/lib/codegen/meta/base/instructions.py
@@ -246,6 +246,7 @@ x = Operand('x', Mem, doc='Value to be stored')
 a = Operand('a', Mem, doc='Value loaded')
 p = Operand('p', iAddr)
 Flags = Operand('Flags', memflags)
+args = Operand('args', VARIABLE_ARGS, doc='Address arguments')

 load = Instruction(
        'load', r"""
@@ -256,6 +257,15 @@ load = Instruction(
        """,
        ins=(Flags, p, Offset), outs=a, can_load=True)

+load_complex = Instruction(
+        'load_complex', r"""
+        Load from memory at ``sum(args) + Offset``.
+
+        This is a polymorphic instruction that can load any value type which
+        has a memory representation.
+        """,
+        ins=(Flags, args, Offset), outs=a, can_load=True)
+
 store = Instruction(
        'store', r"""
        Store ``x`` to memory at ``p + Offset``.
@@ -265,6 +275,16 @@ store = Instruction(
        """,
        ins=(Flags, x, p, Offset), can_store=True)

+store_complex = Instruction(
+        'store_complex', r"""
+        Store ``x`` to memory at ``sum(args) + Offset``.
+
+        This is a polymorphic instruction that can store any value type with a
+        memory representation.
+        """,
+        ins=(Flags, x, args, Offset), can_store=True)
+
+
 iExt8 = TypeVar(
        'iExt8', 'An integer type with more than 8 bits',
        ints=(16, 64))
@@ -279,6 +299,14 @@ uload8 = Instruction(
        """,
        ins=(Flags, p, Offset), outs=a, can_load=True)

+uload8_complex = Instruction(
+        'uload8_complex', r"""
+        Load 8 bits from memory at ``sum(args) + Offset`` and zero-extend.
+
+        This is equivalent to ``load.i8`` followed by ``uextend``.
+        """,
+        ins=(Flags, args, Offset), outs=a, can_load=True)
+
 sload8 = Instruction(
        'sload8', r"""
        Load 8 bits from memory at ``p + Offset`` and sign-extend.
@@ -287,6 +315,14 @@ sload8 = Instruction(
        """,
        ins=(Flags, p, Offset), outs=a, can_load=True)

+sload8_complex = Instruction(
+        'sload8_complex', r"""
+        Load 8 bits from memory at ``sum(args) + Offset`` and sign-extend.
+
+        This is equivalent to ``load.i8`` followed by ``sextend``.
+        """,
+        ins=(Flags, args, Offset), outs=a, can_load=True)
+
 istore8 = Instruction(
        'istore8', r"""
        Store the low 8 bits of ``x`` to memory at ``p + Offset``.
@@ -295,6 +331,14 @@ istore8 = Instruction(
        """,
        ins=(Flags, x, p, Offset), can_store=True)

+istore8_complex = Instruction(
+        'istore8_complex', r"""
+        Store the low 8 bits of ``x`` to memory at ``sum(args) + Offset``.
+
+        This is equivalent to ``ireduce.i8`` followed by ``store.i8``.
+        """,
+        ins=(Flags, x, args, Offset), can_store=True)
+
 iExt16 = TypeVar(
        'iExt16', 'An integer type with more than 16 bits',
        ints=(32, 64))
@@ -309,6 +353,14 @@ uload16 = Instruction(
        """,
        ins=(Flags, p, Offset), outs=a, can_load=True)

+uload16_complex = Instruction(
+        'uload16_complex', r"""
+        Load 16 bits from memory at ``sum(args) + Offset`` and zero-extend.
+
+        This is equivalent to ``load.i16`` followed by ``uextend``.
+        """,
+        ins=(Flags, args, Offset), outs=a, can_load=True)
+
 sload16 = Instruction(
        'sload16', r"""
        Load 16 bits from memory at ``p + Offset`` and sign-extend.
@@ -317,6 +369,14 @@ sload16 = Instruction(
        """,
        ins=(Flags, p, Offset), outs=a, can_load=True)

+sload16_complex = Instruction(
+        'sload16_complex', r"""
+        Load 16 bits from memory at ``sum(args) + Offset`` and sign-extend.
+
+        This is equivalent to ``load.i16`` followed by ``sextend``.
+        """,
+        ins=(Flags, args, Offset), outs=a, can_load=True)
+
 istore16 = Instruction(
        'istore16', r"""
        Store the low 16 bits of ``x`` to memory at ``p + Offset``.
@@ -325,6 +385,14 @@ istore16 = Instruction(
        """,
        ins=(Flags, x, p, Offset), can_store=True)

+istore16_complex = Instruction(
+        'istore16_complex', r"""
+        Store the low 16 bits of ``x`` to memory at ``sum(args) + Offset``.
+
+        This is equivalent to ``ireduce.i16`` followed by ``store.i16``.
+        """,
+        ins=(Flags, x, args, Offset), can_store=True)
+
 iExt32 = TypeVar(
        'iExt32', 'An integer type with more than 32 bits',
        ints=(64, 64))
@@ -339,6 +407,14 @@ uload32 = Instruction(
        """,
        ins=(Flags, p, Offset), outs=a, can_load=True)

+uload32_complex = Instruction(
+        'uload32_complex', r"""
+        Load 32 bits from memory at ``sum(args) + Offset`` and zero-extend.
+
+        This is equivalent to ``load.i32`` followed by ``uextend``.
+        """,
+        ins=(Flags, args, Offset), outs=a, can_load=True)
+
 sload32 = Instruction(
        'sload32', r"""
        Load 32 bits from memory at ``p + Offset`` and sign-extend.
@@ -347,6 +423,14 @@ sload32 = Instruction(
        """,
        ins=(Flags, p, Offset), outs=a, can_load=True)

+sload32_complex = Instruction(
+        'sload32_complex', r"""
+        Load 32 bits from memory at ``sum(args) + Offset`` and sign-extend.
+
+        This is equivalent to ``load.i32`` followed by ``sextend``.
+        """,
+        ins=(Flags, args, Offset), outs=a, can_load=True)
+
 istore32 = Instruction(
        'istore32', r"""
        Store the low 32 bits of ``x`` to memory at ``p + Offset``.
@@ -355,6 +439,14 @@ istore32 = Instruction(
        """,
        ins=(Flags, x, p, Offset), can_store=True)

+istore32_complex = Instruction(
+        'istore32_complex', r"""
+        Store the low 32 bits of ``x`` to memory at ``sum(args) + Offset``.
+
+        This is equivalent to ``ireduce.i32`` followed by ``store.i32``.
+        """,
+        ins=(Flags, x, args, Offset), can_store=True)
+
 x = Operand('x', Mem, doc='Value to be stored')
 a = Operand('a', Mem, doc='Value loaded')
 Offset = Operand('Offset', offset32, 'In-bounds offset into stack slot')
--- a/lib/codegen/meta/base/predicates.py
+++ b/lib/codegen/meta/base/predicates.py
@@ -2,12 +2,12 @@
 Cretonne predicates that consider `Function` fields.
 """
 from cdsl.predicates import FieldPredicate
-from .formats import UnaryGlobalVar
+from .formats import UnaryGlobalVar, InstructionFormat

 try:
    from typing import TYPE_CHECKING
    if TYPE_CHECKING:
-        from cdsl.formats import FormatField  # noqa
+        from cdsl.formats import InstructionFormat, FormatField  # noqa
 except ImportError:
    pass

@@ -33,3 +33,10 @@ class IsColocatedData(FieldPredicate):
        # type: () -> None
        super(IsColocatedData, self).__init__(
            UnaryGlobalVar.global_var, 'is_colocated_data', ('func',))
+
+
+class LengthEquals(FieldPredicate):
+    def __init__(self, iform, num):
+        # type: (InstructionFormat, int) -> None
+        super(LengthEquals, self).__init__(
+            iform.args(), 'has_length_of', (num, 'func'))
--- a/lib/codegen/meta/cdsl/formats.py
+++ b/lib/codegen/meta/cdsl/formats.py
@@ -103,6 +103,19 @@ class InstructionFormat(object):
        InstructionFormat._registry[sig] = self
        InstructionFormat.all_formats.append(self)

+    def args(self):
+        # type: () -> FormatField
+        """
+        Provides a ValueListField, which is derived from FormatField,
+        corresponding to the full ValueList of the instruction format. This
+        is useful for creating predicates for instructions which use variadic
+        arguments.
+        """
+
+        if self.has_value_list:
+            return ValueListField(self)
+        return None
+
    def _process_member_names(self, kinds):
        # type: (Sequence[Union[OperandKind, Tuple[str, OperandKind]]]) -> Iterable[FormatField]  # noqa
        """
@@ -210,7 +223,7 @@ class FormatField(object):
    This corresponds to a single member of a variant of the `InstructionData`
    data type.

-    :param iformat: Parent `InstructionFormat`.
+    :param iform: Parent `InstructionFormat`.
    :param immnum: Immediate operand number in parent.
    :param kind: Immediate Operand kind.
    :param member: Member name in `InstructionData` variant.
@@ -227,6 +240,29 @@ class FormatField(object):
        # type: () -> str
        return '{}.{}'.format(self.format.name, self.member)

+    def rust_destructuring_name(self):
+        # type: () -> str
+        return self.member
+
    def rust_name(self):
        # type: () -> str
        return self.member
+
+
+class ValueListField(FormatField):
+    """
+    The full value list field of an instruction format.
+
+    This corresponds to all Value-type members of a variant of the
+    `InstructionData` format, which contains a ValueList.
+
+    :param iform: Parent `InstructionFormat`.
+    """
+    def __init__(self, iform):
+        # type: (InstructionFormat) -> None
+        self.format = iform
+        self.member = "args"
+
+    def rust_destructuring_name(self):
+        # type: () -> str
+        return 'ref {}'.format(self.member)
--- a/lib/codegen/meta/cdsl/instructions.py
+++ b/lib/codegen/meta/cdsl/instructions.py
@@ -201,9 +201,10 @@ class Instruction(object):
        # Prefer to use the typevar_operand to infer the controlling typevar.
        self.use_typevar_operand = False
        typevar_error = None
-        if self.format.typevar_operand is not None:
+        tv_op = self.format.typevar_operand
+        if tv_op is not None and tv_op < len(self.value_opnums):
            try:
-                opnum = self.value_opnums[self.format.typevar_operand]
+                opnum = self.value_opnums[tv_op]
                tv = self.ins[opnum].typevar
                if tv is tv.free_typevar() or tv.singleton_type() is not None:
                    self.other_typevars = self._verify_ctrl_typevar(tv)
--- a/lib/codegen/meta/gen_binemit.py
+++ b/lib/codegen/meta/gen_binemit.py
@@ -27,7 +27,7 @@ def gen_recipe(recipe, fmt):
    nvops = iform.num_value_operands
    want_args = any(isinstance(i, RegClass) or isinstance(i, Stack)
                    for i in recipe.ins)
-    assert not want_args or nvops > 0
+    assert not want_args or nvops > 0 or iform.has_value_list
    want_outs = any(isinstance(o, RegClass) or isinstance(o, Stack)
                    for o in recipe.outs)

--- a/lib/codegen/meta/gen_encoding.py
+++ b/lib/codegen/meta/gen_encoding.py
@@ -103,7 +103,7 @@ def emit_instp(instp, fmt, has_func=False):
    fnames = set()  # type: Set[str]
    for p in leafs:
        if isinstance(p, FieldPredicate):
-            fnames.add(p.field.rust_name())
+            fnames.add(p.field.rust_destructuring_name())
        else:
            assert isinstance(p, TypePredicate)
            has_type_check = True
--- a/lib/codegen/meta/isa/x86/encodings.py
+++ b/lib/codegen/meta/isa/x86/encodings.py
@@ -3,9 +3,9 @@ x86 Encodings.
 """
 from __future__ import absolute_import
 from cdsl.predicates import IsUnsignedInt, Not, And
-from base.predicates import IsColocatedFunc, IsColocatedData
+from base.predicates import IsColocatedFunc, IsColocatedData, LengthEquals
 from base import instructions as base
-from base.formats import UnaryImm, FuncAddr, Call
+from base.formats import UnaryImm, FuncAddr, Call, LoadComplex, StoreComplex
 from .defs import X86_64, X86_32
 from . import recipes as r
 from . import settings as cfg
@@ -19,6 +19,7 @@ try:
    from typing import TYPE_CHECKING, Any  # noqa
    if TYPE_CHECKING:
        from cdsl.instructions import MaybeBoundInst  # noqa
+        from cdsl.predicates import FieldPredicate # noqa
 except ImportError:
    pass

@@ -54,6 +55,15 @@ def enc_x86_64(inst, recipe, *args, **kwargs):
    X86_64.enc(inst, *recipe(*args, **kwargs))


+def enc_x86_64_instp(inst, recipe, instp, *args, **kwargs):
+    # type: (MaybeBoundInst, r.TailRecipe, FieldPredicate, *int, **int) -> None
+    """
+    Add encodings for `inst` to X86_64 with and without a REX prefix.
+    """
+    X86_64.enc(inst, *recipe.rex(*args, **kwargs), instp=instp)
+    X86_64.enc(inst, *recipe(*args, **kwargs), instp=instp)
+
+
 def enc_both(inst, recipe, *args, **kwargs):
    # type: (MaybeBoundInst, r.TailRecipe, *int, **Any) -> None
    """
@@ -63,6 +73,15 @@ def enc_both(inst, recipe, *args, **kwargs):
    enc_x86_64(inst, recipe, *args, **kwargs)


+def enc_both_instp(inst, recipe, instp, *args, **kwargs):
+    # type: (MaybeBoundInst, r.TailRecipe, FieldPredicate, *int, **Any) -> None
+    """
+    Add encodings for `inst` to both X86_32 and X86_64, applying `instp`.
+    """
+    X86_32.enc(inst, *recipe(*args, **kwargs), instp=instp)
+    enc_x86_64_instp(inst, recipe, instp, *args, **kwargs)
+
+
 def enc_i32_i64(inst, recipe, *args, **kwargs):
    # type: (MaybeBoundInst, r.TailRecipe, *int, **int) -> None
    """
@@ -80,6 +99,25 @@ def enc_i32_i64(inst, recipe, *args, **kwargs):
    X86_64.enc(inst.i64, *recipe.rex(*args, w=1, **kwargs))


+def enc_i32_i64_instp(inst, recipe, instp, *args, **kwargs):
+    # type: (MaybeBoundInst, r.TailRecipe, FieldPredicate, *int, **int) -> None
+    """
+    Add encodings for `inst.i32` to X86_32.
+    Add encodings for `inst.i32` to X86_64 with and without REX.
+    Add encodings for `inst.i64` to X86_64 with a REX.W prefix.
+
+    Similar to `enc_i32_i64` but applies `instp` to each encoding.
+    """
+    X86_32.enc(inst.i32, *recipe(*args, **kwargs), instp=instp)
+
+    # REX-less encoding must come after REX encoding so we don't use it by
+    # default. Otherwise reg-alloc would never use r8 and up.
+    X86_64.enc(inst.i32, *recipe.rex(*args, **kwargs), instp=instp)
+    X86_64.enc(inst.i32, *recipe(*args, **kwargs), instp=instp)
+
+    X86_64.enc(inst.i64, *recipe.rex(*args, w=1, **kwargs), instp=instp)
+
+
 def enc_i32_i64_ld_st(inst, w_bit, recipe, *args, **kwargs):
    # type: (MaybeBoundInst, bool, r.TailRecipe, *int, **int) -> None
    """
@@ -212,6 +250,31 @@ X86_64.enc(base.ctz.i32, *r.urm(0xf3, 0x0f, 0xbc), isap=cfg.use_bmi1)
 #
 # Loads and stores.
 #
+
+ldcomplexp = LengthEquals(LoadComplex, 2)
+for recipe in [r.ldWithIndex, r.ldWithIndexDisp8, r.ldWithIndexDisp32]:
+    enc_i32_i64_instp(base.load_complex, recipe, ldcomplexp, 0x8b)
+    enc_x86_64_instp(base.uload32_complex, recipe, ldcomplexp, 0x8b)
+    X86_64.enc(base.sload32_complex, *recipe.rex(0x63, w=1),
+               instp=ldcomplexp)
+    enc_i32_i64_instp(base.uload16_complex, recipe, ldcomplexp, 0x0f, 0xb7)
+    enc_i32_i64_instp(base.sload16_complex, recipe, ldcomplexp, 0x0f, 0xbf)
+    enc_i32_i64_instp(base.uload8_complex, recipe, ldcomplexp, 0x0f, 0xb6)
+    enc_i32_i64_instp(base.sload8_complex, recipe, ldcomplexp, 0x0f, 0xbe)
+
+stcomplexp = LengthEquals(StoreComplex, 3)
+for recipe in [r.stWithIndex, r.stWithIndexDisp8, r.stWithIndexDisp32]:
+    enc_i32_i64_instp(base.store_complex, recipe, stcomplexp, 0x89)
+    enc_x86_64_instp(base.istore32_complex, recipe, stcomplexp, 0x89)
+    enc_both_instp(base.istore16_complex.i32, recipe, stcomplexp, 0x66, 0x89)
+    enc_x86_64_instp(base.istore16_complex.i64, recipe, stcomplexp, 0x66, 0x89)
+
+for recipe in [r.stWithIndex_abcd,
+               r.stWithIndexDisp8_abcd,
+               r.stWithIndexDisp32_abcd]:
+    enc_both_instp(base.istore8_complex.i32, recipe, stcomplexp, 0x88)
+    enc_x86_64_instp(base.istore8_complex.i64, recipe, stcomplexp, 0x88)
+
 for recipe in [r.st, r.stDisp8, r.stDisp32]:
    enc_i32_i64_ld_st(base.store, True, recipe, 0x89)
    enc_x86_64(base.istore32.i64.any, recipe, 0x89)
@@ -286,18 +349,34 @@ enc_both(base.load.f32.any, r.fld, 0xf3, 0x0f, 0x10)
 enc_both(base.load.f32.any, r.fldDisp8, 0xf3, 0x0f, 0x10)
 enc_both(base.load.f32.any, r.fldDisp32, 0xf3, 0x0f, 0x10)

+enc_both(base.load_complex.f32, r.fldWithIndex, 0xf3, 0x0f, 0x10)
+enc_both(base.load_complex.f32, r.fldWithIndexDisp8, 0xf3, 0x0f, 0x10)
+enc_both(base.load_complex.f32, r.fldWithIndexDisp32, 0xf3, 0x0f, 0x10)
+
 enc_both(base.load.f64.any, r.fld, 0xf2, 0x0f, 0x10)
 enc_both(base.load.f64.any, r.fldDisp8, 0xf2, 0x0f, 0x10)
 enc_both(base.load.f64.any, r.fldDisp32, 0xf2, 0x0f, 0x10)

+enc_both(base.load_complex.f64, r.fldWithIndex, 0xf2, 0x0f, 0x10)
+enc_both(base.load_complex.f64, r.fldWithIndexDisp8, 0xf2, 0x0f, 0x10)
+enc_both(base.load_complex.f64, r.fldWithIndexDisp32, 0xf2, 0x0f, 0x10)
+
 enc_both(base.store.f32.any, r.fst, 0xf3, 0x0f, 0x11)
 enc_both(base.store.f32.any, r.fstDisp8, 0xf3, 0x0f, 0x11)
 enc_both(base.store.f32.any, r.fstDisp32, 0xf3, 0x0f, 0x11)

+enc_both(base.store_complex.f32, r.fstWithIndex, 0xf3, 0x0f, 0x11)
+enc_both(base.store_complex.f32, r.fstWithIndexDisp8, 0xf3, 0x0f, 0x11)
+enc_both(base.store_complex.f32, r.fstWithIndexDisp32, 0xf3, 0x0f, 0x11)
+
 enc_both(base.store.f64.any, r.fst, 0xf2, 0x0f, 0x11)
 enc_both(base.store.f64.any, r.fstDisp8, 0xf2, 0x0f, 0x11)
 enc_both(base.store.f64.any, r.fstDisp32, 0xf2, 0x0f, 0x11)

+enc_both(base.store_complex.f64, r.fstWithIndex, 0xf2, 0x0f, 0x11)
+enc_both(base.store_complex.f64, r.fstWithIndexDisp8, 0xf2, 0x0f, 0x11)
+enc_both(base.store_complex.f64, r.fstWithIndexDisp32, 0xf2, 0x0f, 0x11)
+
 enc_both(base.fill.f32, r.ffillSib32, 0xf3, 0x0f, 0x10)
 enc_both(base.regfill.f32, r.fregfill32, 0xf3, 0x0f, 0x10)
 enc_both(base.fill.f64, r.ffillSib32, 0xf2, 0x0f, 0x10)
--- a/lib/codegen/meta/isa/x86/recipes.py
+++ b/lib/codegen/meta/isa/x86/recipes.py
@@ -14,6 +14,7 @@ from base.formats import IntSelect, IntCondTrap, FloatCondTrap
 from base.formats import Jump, Branch, BranchInt, BranchFloat
 from base.formats import Ternary, FuncAddr, UnaryGlobalVar
 from base.formats import RegMove, RegSpill, RegFill, CopySpecial
+from base.formats import LoadComplex, StoreComplex
 from .registers import GPR, ABCD, FPR, GPR_DEREF_SAFE, GPR_ZERO_DEREF_SAFE
 from .registers import GPR8, FPR8, GPR8_DEREF_SAFE, GPR8_ZERO_DEREF_SAFE, FLAG
 from .registers import StackGPR32, StackFPR32
@@ -739,6 +740,22 @@ st = TailRecipe(
        modrm_rm(in_reg1, in_reg0, sink);
        ''')

+# XX /r register-indirect store with index and no offset.
+stWithIndex = TailRecipe(
+    'stWithIndex', StoreComplex, size=2,
+    ins=(GPR, GPR_ZERO_DEREF_SAFE, GPR_DEREF_SAFE),
+    outs=(),
+    instp=IsEqual(StoreComplex.offset, 0),
+    clobbers_flags=False,
+    emit='''
+    if !flags.notrap() {
+        sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]);
+    }
+    PUT_OP(bits, rex3(in_reg1, in_reg0, in_reg2), sink);
+    modrm_sib(in_reg0, sink);
+    sib(0, in_reg2, in_reg1, sink);
+    ''')
+
 # XX /r register-indirect store with no offset.
 # Only ABCD allowed for stored value. This is for byte stores with no REX.
 st_abcd = TailRecipe(
@@ -754,6 +771,23 @@ st_abcd = TailRecipe(
        modrm_rm(in_reg1, in_reg0, sink);
        ''')

+# XX /r register-indirect store with index and no offset.
+# Only ABCD allowed for stored value. This is for byte stores with no REX.
+stWithIndex_abcd = TailRecipe(
+    'stWithIndex_abcd', StoreComplex, size=2,
+    ins=(ABCD, GPR_ZERO_DEREF_SAFE, GPR_DEREF_SAFE),
+    outs=(),
+    instp=IsEqual(StoreComplex.offset, 0),
+    clobbers_flags=False,
+    emit='''
+    if !flags.notrap() {
+        sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]);
+    }
+    PUT_OP(bits, rex3(in_reg1, in_reg0, in_reg2), sink);
+    modrm_sib(in_reg0, sink);
+    sib(0, in_reg2, in_reg1, sink);
+    ''')
+
 # XX /r register-indirect store of FPR with no offset.
 fst = TailRecipe(
        'fst', Store, size=1, ins=(FPR, GPR_ZERO_DEREF_SAFE), outs=(),
@@ -766,6 +800,20 @@ fst = TailRecipe(
        PUT_OP(bits, rex2(in_reg1, in_reg0), sink);
        modrm_rm(in_reg1, in_reg0, sink);
        ''')
+# XX /r register-indirect store with index and no offset of FPR.
+fstWithIndex = TailRecipe(
+        'fstWithIndex', StoreComplex, size=2,
+        ins=(FPR, GPR_ZERO_DEREF_SAFE, GPR_DEREF_SAFE), outs=(),
+        instp=IsEqual(StoreComplex.offset, 0),
+        clobbers_flags=False,
+        emit='''
+        if !flags.notrap() {
+            sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]);
+        }
+        PUT_OP(bits, rex3(in_reg1, in_reg0, in_reg2), sink);
+        modrm_sib(in_reg0, sink);
+        sib(0, in_reg2, in_reg1, sink);
+        ''')

 # XX /r register-indirect store with 8-bit offset.
 stDisp8 = TailRecipe(
@@ -781,6 +829,27 @@ stDisp8 = TailRecipe(
        let offset: i32 = offset.into();
        sink.put1(offset as u8);
        ''')
+
+# XX /r register-indirect store with index and 8-bit offset.
+stWithIndexDisp8 = TailRecipe(
+    'stWithIndexDisp8', StoreComplex, size=3,
+    ins=(GPR, GPR, GPR_DEREF_SAFE),
+    outs=(),
+    instp=IsSignedInt(StoreComplex.offset, 8),
+    clobbers_flags=False,
+    emit='''
+    if !flags.notrap() {
+        sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]);
+    }
+    PUT_OP(bits, rex3(in_reg1, in_reg0, in_reg2), sink);
+    modrm_sib_disp8(in_reg0, sink);
+    sib(0, in_reg2, in_reg1, sink);
+    let offset: i32 = offset.into();
+    sink.put1(offset as u8);
+    ''')
+
+# XX /r register-indirect store with 8-bit offset.
+# Only ABCD allowed for stored value. This is for byte stores with no REX.
 stDisp8_abcd = TailRecipe(
        'stDisp8_abcd', Store, size=2, ins=(ABCD, GPR), outs=(),
        instp=IsSignedInt(Store.offset, 8),
@@ -795,6 +864,27 @@ stDisp8_abcd = TailRecipe(
        let offset: i32 = offset.into();
        sink.put1(offset as u8);
        ''')
+
+# XX /r register-indirect store with index and 8-bit offset.
+# Only ABCD allowed for stored value. This is for byte stores with no REX.
+stWithIndexDisp8_abcd = TailRecipe(
+    'stWithIndexDisp8_abcd', StoreComplex, size=3,
+    ins=(ABCD, GPR, GPR_DEREF_SAFE),
+    outs=(),
+    instp=IsSignedInt(StoreComplex.offset, 8),
+    clobbers_flags=False,
+    emit='''
+    if !flags.notrap() {
+        sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]);
+    }
+    PUT_OP(bits, rex3(in_reg1, in_reg0, in_reg2), sink);
+    modrm_sib_disp8(in_reg0, sink);
+    sib(0, in_reg2, in_reg1, sink);
+    let offset: i32 = offset.into();
+    sink.put1(offset as u8);
+    ''')
+
+# XX /r register-indirect store with 8-bit offset of FPR.
 fstDisp8 = TailRecipe(
        'fstDisp8', Store, size=2, ins=(FPR, GPR_DEREF_SAFE), outs=(),
        instp=IsSignedInt(Store.offset, 8),
@@ -809,6 +899,24 @@ fstDisp8 = TailRecipe(
        sink.put1(offset as u8);
        ''')

+# XX /r register-indirect store with index and 8-bit offset of FPR.
+fstWithIndexDisp8 = TailRecipe(
+    'fstWithIndexDisp8', StoreComplex, size=3,
+    ins=(FPR, GPR, GPR_DEREF_SAFE),
+    outs=(),
+    instp=IsSignedInt(StoreComplex.offset, 8),
+    clobbers_flags=False,
+    emit='''
+    if !flags.notrap() {
+        sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]);
+    }
+    PUT_OP(bits, rex3(in_reg1, in_reg0, in_reg2), sink);
+    modrm_sib_disp8(in_reg0, sink);
+    sib(0, in_reg2, in_reg1, sink);
+    let offset: i32 = offset.into();
+    sink.put1(offset as u8);
+    ''')
+
 # XX /r register-indirect store with 32-bit offset.
 stDisp32 = TailRecipe(
        'stDisp32', Store, size=5, ins=(GPR, GPR_DEREF_SAFE), outs=(),
@@ -822,6 +930,27 @@ stDisp32 = TailRecipe(
        let offset: i32 = offset.into();
        sink.put4(offset as u32);
        ''')
+
+# XX /r register-indirect store with index and 32-bit offset.
+stWithIndexDisp32 = TailRecipe(
+    'stWithIndexDisp32', StoreComplex, size=6,
+    ins=(GPR, GPR, GPR_DEREF_SAFE),
+    outs=(),
+    instp=IsSignedInt(StoreComplex.offset, 32),
+    clobbers_flags=False,
+    emit='''
+    if !flags.notrap() {
+        sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]);
+    }
+    PUT_OP(bits, rex3(in_reg1, in_reg0, in_reg2), sink);
+    modrm_sib_disp32(in_reg0, sink);
+    sib(0, in_reg2, in_reg1, sink);
+    let offset: i32 = offset.into();
+    sink.put4(offset as u32);
+    ''')
+
+# XX /r register-indirect store with 32-bit offset.
+# Only ABCD allowed for stored value. This is for byte stores with no REX.
 stDisp32_abcd = TailRecipe(
        'stDisp32_abcd', Store, size=5, ins=(ABCD, GPR), outs=(),
        when_prefixed=stDisp32,
@@ -835,6 +964,27 @@ stDisp32_abcd = TailRecipe(
        let offset: i32 = offset.into();
        sink.put4(offset as u32);
        ''')
+
+# XX /r register-indirect store with index and 32-bit offset.
+# Only ABCD allowed for stored value. This is for byte stores with no REX.
+stWithIndexDisp32_abcd = TailRecipe(
+    'stWithIndexDisp32_abcd', StoreComplex, size=6,
+    ins=(ABCD, GPR, GPR_DEREF_SAFE),
+    outs=(),
+    instp=IsSignedInt(StoreComplex.offset, 32),
+    clobbers_flags=False,
+    emit='''
+    if !flags.notrap() {
+        sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]);
+    }
+    PUT_OP(bits, rex3(in_reg1, in_reg0, in_reg2), sink);
+    modrm_sib_disp32(in_reg0, sink);
+    sib(0, in_reg2, in_reg1, sink);
+    let offset: i32 = offset.into();
+    sink.put4(offset as u32);
+    ''')
+
+# XX /r register-indirect store with 32-bit offset of FPR.
 fstDisp32 = TailRecipe(
        'fstDisp32', Store, size=5, ins=(FPR, GPR_DEREF_SAFE), outs=(),
        clobbers_flags=False,
@@ -848,6 +998,24 @@ fstDisp32 = TailRecipe(
        sink.put4(offset as u32);
        ''')

+# XX /r register-indirect store with index and 32-bit offset of FPR.
+fstWithIndexDisp32 = TailRecipe(
+    'fstWithIndexDisp32', StoreComplex, size=6,
+    ins=(FPR, GPR, GPR_DEREF_SAFE),
+    outs=(),
+    instp=IsSignedInt(StoreComplex.offset, 32),
+    clobbers_flags=False,
+    emit='''
+    if !flags.notrap() {
+        sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]);
+    }
+    PUT_OP(bits, rex3(in_reg1, in_reg0, in_reg2), sink);
+    modrm_sib_disp32(in_reg0, sink);
+    sib(0, in_reg2, in_reg1, sink);
+    let offset: i32 = offset.into();
+    sink.put4(offset as u32);
+    ''')
+
 # Unary spill with SIB and 32-bit displacement.
 spillSib32 = TailRecipe(
        'spillSib32', Unary, size=6, ins=GPR, outs=StackGPR32,
@@ -919,6 +1087,22 @@ ld = TailRecipe(
        modrm_rm(in_reg0, out_reg0, sink);
        ''')

+# XX /r load with index and no offset.
+ldWithIndex = TailRecipe(
+    'ldWithIndex', LoadComplex, size=2,
+    ins=(GPR_ZERO_DEREF_SAFE, GPR_DEREF_SAFE),
+    outs=(GPR),
+    instp=IsEqual(LoadComplex.offset, 0),
+    clobbers_flags=False,
+    emit='''
+    if !flags.notrap() {
+        sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]);
+    }
+    PUT_OP(bits, rex3(in_reg0, out_reg0, in_reg1), sink);
+    modrm_sib(out_reg0, sink);
+    sib(0, in_reg1, in_reg0, sink);
+    ''')
+
 # XX /r float load with no offset.
 fld = TailRecipe(
        'fld', Load, size=1, ins=(GPR_ZERO_DEREF_SAFE), outs=(FPR),
@@ -932,6 +1116,22 @@ fld = TailRecipe(
        modrm_rm(in_reg0, out_reg0, sink);
        ''')

+# XX /r float load with index and no offset.
+fldWithIndex = TailRecipe(
+    'fldWithIndex', LoadComplex, size=2,
+    ins=(GPR_ZERO_DEREF_SAFE, GPR_DEREF_SAFE),
+    outs=(FPR),
+    instp=IsEqual(LoadComplex.offset, 0),
+    clobbers_flags=False,
+    emit='''
+    if !flags.notrap() {
+        sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]);
+    }
+    PUT_OP(bits, rex3(in_reg0, out_reg0, in_reg1), sink);
+    modrm_sib(out_reg0, sink);
+    sib(0, in_reg1, in_reg0, sink);
+    ''')
+
 # XX /r load with 8-bit offset.
 ldDisp8 = TailRecipe(
        'ldDisp8', Load, size=2, ins=(GPR_DEREF_SAFE), outs=(GPR),
@@ -947,6 +1147,24 @@ ldDisp8 = TailRecipe(
        sink.put1(offset as u8);
        ''')

+# XX /r load with index and 8-bit offset.
+ldWithIndexDisp8 = TailRecipe(
+    'ldWithIndexDisp8', LoadComplex, size=3,
+    ins=(GPR, GPR_DEREF_SAFE),
+    outs=(GPR),
+    instp=IsSignedInt(LoadComplex.offset, 8),
+    clobbers_flags=False,
+    emit='''
+    if !flags.notrap() {
+        sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]);
+    }
+    PUT_OP(bits, rex3(in_reg0, out_reg0, in_reg1), sink);
+    modrm_sib_disp8(out_reg0, sink);
+    sib(0, in_reg1, in_reg0, sink);
+    let offset: i32 = offset.into();
+    sink.put1(offset as u8);
+    ''')
+
 # XX /r float load with 8-bit offset.
 fldDisp8 = TailRecipe(
        'fldDisp8', Load, size=2, ins=(GPR_DEREF_SAFE), outs=(FPR),
@@ -962,6 +1180,24 @@ fldDisp8 = TailRecipe(
        sink.put1(offset as u8);
        ''')

+# XX /r float load with index and 8-bit offset.
+fldWithIndexDisp8 = TailRecipe(
+    'fldWithIndexDisp8', LoadComplex, size=3,
+    ins=(GPR, GPR_DEREF_SAFE),
+    outs=(FPR),
+    instp=IsSignedInt(LoadComplex.offset, 8),
+    clobbers_flags=False,
+    emit='''
+    if !flags.notrap() {
+        sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]);
+    }
+    PUT_OP(bits, rex3(in_reg0, out_reg0, in_reg1), sink);
+    modrm_sib_disp8(out_reg0, sink);
+    sib(0, in_reg1, in_reg0, sink);
+    let offset: i32 = offset.into();
+    sink.put1(offset as u8);
+    ''')
+
 # XX /r load with 32-bit offset.
 ldDisp32 = TailRecipe(
        'ldDisp32', Load, size=5, ins=(GPR_DEREF_SAFE), outs=(GPR),
@@ -977,6 +1213,24 @@ ldDisp32 = TailRecipe(
        sink.put4(offset as u32);
        ''')

+# XX /r load with index and 32-bit offset.
+ldWithIndexDisp32 = TailRecipe(
+    'ldWithIndexDisp32', LoadComplex, size=6,
+    ins=(GPR, GPR_DEREF_SAFE),
+    outs=(GPR),
+    instp=IsSignedInt(LoadComplex.offset, 32),
+    clobbers_flags=False,
+    emit='''
+    if !flags.notrap() {
+        sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]);
+    }
+    PUT_OP(bits, rex3(in_reg0, out_reg0, in_reg1), sink);
+    modrm_sib_disp32(out_reg0, sink);
+    sib(0, in_reg1, in_reg0, sink);
+    let offset: i32 = offset.into();
+    sink.put4(offset as u32);
+    ''')
+
 # XX /r float load with 32-bit offset.
 fldDisp32 = TailRecipe(
        'fldDisp32', Load, size=5, ins=(GPR_DEREF_SAFE), outs=(FPR),
@@ -992,6 +1246,24 @@ fldDisp32 = TailRecipe(
        sink.put4(offset as u32);
        ''')

+# XX /r float load with index and 32-bit offset.
+fldWithIndexDisp32 = TailRecipe(
+    'fldWithIndexDisp32', LoadComplex, size=6,
+    ins=(GPR, GPR_DEREF_SAFE),
+    outs=(FPR),
+    instp=IsSignedInt(LoadComplex.offset, 32),
+    clobbers_flags=False,
+    emit='''
+    if !flags.notrap() {
+        sink.trap(TrapCode::HeapOutOfBounds, func.srclocs[inst]);
+    }
+    PUT_OP(bits, rex3(in_reg0, out_reg0, in_reg1), sink);
+    modrm_sib_disp32(out_reg0, sink);
+    sib(0, in_reg1, in_reg0, sink);
+    let offset: i32 = offset.into();
+    sink.put4(offset as u32);
+    ''')
+
 # Unary fill with SIB and 32-bit displacement.
 fillSib32 = TailRecipe(
        'fillSib32', Unary, size=6, ins=StackGPR32, outs=GPR,
--- a/lib/codegen/src/isa/mod.rs
+++ b/lib/codegen/src/isa/mod.rs
@@ -162,6 +162,11 @@ pub trait TargetIsa: fmt::Display {
        false
    }

+    /// Does the CPU implement multi-register addressing?
+    fn uses_complex_addresses(&self) -> bool {
+        false
+    }
+
    /// Get a data structure describing the registers in this ISA.
    fn register_info(&self) -> RegInfo;

--- a/lib/codegen/src/isa/x86/binemit.rs
+++ b/lib/codegen/src/isa/x86/binemit.rs
@@ -46,6 +46,18 @@ fn rex2(rm: RegUnit, reg: RegUnit) -> u8 {
    BASE_REX | b | (r << 2)
 }

+// Create a three-register REX prefix, setting:
+//
+// REX.B = bit 3 of r/m register, or SIB base register when a SIB byte is present.
+// REX.R = bit 3 of reg register.
+// REX.X = bit 3 of SIB index register.
+fn rex3(rm: RegUnit, reg: RegUnit, index: RegUnit) -> u8 {
+    let b = ((rm >> 3) & 1) as u8;
+    let r = ((reg >> 3) & 1) as u8;
+    let x = ((index >> 3) & 1) as u8;
+    BASE_REX | b | (x << 1) | (r << 2)
+}
+
 // Emit a REX prefix.
 //
 // The R, X, and B bits are computed from registers using the functions above. The W bit is
@@ -211,7 +223,19 @@ fn modrm_disp32<CS: CodeSink + ?Sized>(rm: RegUnit, reg: RegUnit, sink: &mut CS)
    sink.put1(b);
 }

-/// Emit a mode 10 ModR/M byte indicating that a SIB byte is present.
+/// Emit a mode 00 ModR/M with a 100 RM indicating a SIB byte is present.
+fn modrm_sib<CS: CodeSink + ?Sized>(reg: RegUnit, sink: &mut CS) {
+    modrm_rm(0b100, reg, sink);
+}
+
+/// Emit a mode 01 ModR/M with a 100 RM indicating a SIB byte and 8-bit
+/// displacement are present.
+fn modrm_sib_disp8<CS: CodeSink + ?Sized>(reg: RegUnit, sink: &mut CS) {
+    modrm_disp8(0b100, reg, sink);
+}
+
+/// Emit a mode 10 ModR/M with a 100 RM indicating a SIB byte and 32-bit
+/// displacement are present.
 fn modrm_sib_disp32<CS: CodeSink + ?Sized>(reg: RegUnit, sink: &mut CS) {
    modrm_disp32(0b100, reg, sink);
 }
@@ -225,6 +249,16 @@ fn sib_noindex<CS: CodeSink + ?Sized>(base: RegUnit, sink: &mut CS) {
    sink.put1(b);
 }

+fn sib<CS: CodeSink + ?Sized>(scale: u8, index: RegUnit, base: RegUnit, sink: &mut CS) {
+    // SIB        SS_III_BBB.
+    debug_assert_eq!(scale & !0x03, 0, "Scale out of range");
+    let scale = scale & 3;
+    let index = index as u8 & 7;
+    let base = base as u8 & 7;
+    let b: u8 = (scale << 6) | (index << 3) | base;
+    sink.put1(b);
+}
+
 /// Get the low 4 bits of an opcode for an integer condition code.
 ///
 /// Add this offset to a base opcode for:
--- a/lib/codegen/src/isa/x86/mod.rs
+++ b/lib/codegen/src/isa/x86/mod.rs
@@ -62,6 +62,10 @@ impl TargetIsa for Isa {
        true
    }

+    fn uses_complex_addresses(&self) -> bool {
+        true
+    }
+
    fn register_info(&self) -> RegInfo {
        registers::INFO.clone()
    }
--- a/lib/codegen/src/postopt.rs
+++ b/lib/codegen/src/postopt.rs
@@ -5,9 +5,9 @@
 use cursor::{Cursor, EncCursor};
 use ir::condcodes::{CondCode, FloatCC, IntCC};
 use ir::dfg::ValueDef;
-use ir::immediates::Imm64;
+use ir::immediates::{Imm64, Offset32};
 use ir::instructions::{Opcode, ValueList};
-use ir::{Ebb, Function, Inst, InstBuilder, InstructionData, Value};
+use ir::{Ebb, Function, Inst, InstBuilder, InstructionData, Value, Type, MemFlags};
 use isa::TargetIsa;
 use timing;

@@ -173,6 +173,158 @@ fn optimize_cpu_flags(
    pos.func.update_encoding(info.br_inst, isa).is_ok();
 }

+
+struct MemOpInfo {
+    opcode: Opcode,
+    inst: Inst,
+    itype: Type,
+    arg: Value,
+    st_arg: Option<Value>,
+    flags: MemFlags,
+    offset: Offset32,
+    add_args: Option<[Value; 2]>,
+}
+
+fn optimize_complex_addresses(pos: &mut EncCursor, inst: Inst, isa: &TargetIsa) {
+    let mut info = match pos.func.dfg[inst] {
+        InstructionData::Load {
+            opcode,
+            arg,
+            flags,
+            offset,
+        } => MemOpInfo {
+            opcode: opcode,
+            inst: inst,
+            itype: pos.func.dfg.ctrl_typevar(inst),
+            arg: arg,
+            st_arg: None,
+            flags: flags,
+            offset: offset,
+            add_args: None,
+        },
+        InstructionData::Store {
+            opcode,
+            args,
+            flags,
+            offset,
+        } => MemOpInfo {
+            opcode: opcode,
+            inst: inst,
+            itype: pos.func.dfg.ctrl_typevar(inst),
+            arg: args[1],
+            st_arg: Some(args[0]),
+            flags: flags,
+            offset: offset,
+            add_args: None,
+        },
+        _ => return,
+    };
+
+    if let ValueDef::Result(result_inst, _) = pos.func.dfg.value_def(info.arg) {
+        match pos.func.dfg[result_inst] {
+            InstructionData::Binary { opcode, args } if opcode == Opcode::Iadd => {
+                info.add_args = Some(args.clone());
+            }
+            _ => return,
+        }
+    } else {
+        return;
+    }
+
+    match info.opcode {
+        Opcode::Load => {
+            pos.func.dfg.replace(info.inst).load_complex(
+                info.itype,
+                info.flags,
+                &info.add_args.unwrap(),
+                info.offset,
+            );
+        }
+        Opcode::Uload8 => {
+            pos.func.dfg.replace(info.inst).uload8_complex(
+                info.itype,
+                info.flags,
+                &info.add_args.unwrap(),
+                info.offset,
+            );
+        }
+        Opcode::Sload8 => {
+            pos.func.dfg.replace(info.inst).sload8_complex(
+                info.itype,
+                info.flags,
+                &info.add_args.unwrap(),
+                info.offset,
+            );
+        }
+        Opcode::Uload16 => {
+            pos.func.dfg.replace(info.inst).uload16_complex(
+                info.itype,
+                info.flags,
+                &info.add_args.unwrap(),
+                info.offset,
+            );
+        }
+        Opcode::Sload16 => {
+            pos.func.dfg.replace(info.inst).sload16_complex(
+                info.itype,
+                info.flags,
+                &info.add_args.unwrap(),
+                info.offset,
+            );
+        }
+        Opcode::Uload32 => {
+            pos.func.dfg.replace(info.inst).uload32_complex(
+                info.flags,
+                &info.add_args.unwrap(),
+                info.offset,
+            );
+        }
+        Opcode::Sload32 => {
+            pos.func.dfg.replace(info.inst).sload32_complex(
+                info.flags,
+                &info.add_args.unwrap(),
+                info.offset,
+            );
+        }
+        Opcode::Store => {
+            pos.func.dfg.replace(info.inst).store_complex(
+                info.flags,
+                info.st_arg.unwrap(),
+                &info.add_args.unwrap(),
+                info.offset,
+            );
+        }
+        Opcode::Istore8 => {
+            pos.func.dfg.replace(info.inst).istore8_complex(
+                info.flags,
+                info.st_arg.unwrap(),
+                &info.add_args.unwrap(),
+                info.offset,
+            );
+        }
+        Opcode::Istore16 => {
+            pos.func.dfg.replace(info.inst).istore16_complex(
+                info.flags,
+                info.st_arg.unwrap(),
+                &info.add_args.unwrap(),
+                info.offset,
+            );
+        }
+        Opcode::Istore32 => {
+            pos.func.dfg.replace(info.inst).istore32_complex(
+                info.flags,
+                info.st_arg.unwrap(),
+                &info.add_args.unwrap(),
+                info.offset,
+            );
+        }
+        _ => return,
+    }
+    pos.func.update_encoding(info.inst, isa).is_ok();
+}
+
+
+
 //----------------------------------------------------------------------
 //
 // The main post-opt pass.
@@ -198,6 +350,10 @@ pub fn do_postopt(func: &mut Function, isa: &TargetIsa) {
                    }
                }
            }
+
+            if isa.uses_complex_addresses() {
+                optimize_complex_addresses(&mut pos, inst, isa);
+            }
        }
    }
 }
--- a/lib/codegen/src/predicates.rs
+++ b/lib/codegen/src/predicates.rs
@@ -46,6 +46,11 @@ pub fn is_colocated_data(global_var: ir::GlobalVar, func: &ir::Function) -> bool
    }
 }

+#[allow(dead_code)]
+pub fn has_length_of(value_list: &ir::ValueList, num: usize, func: &ir::Function) -> bool {
+    value_list.len(&func.dfg.value_lists) == num
+}
+
 #[cfg(test)]
 mod tests {
    use super::*;
--- a/lib/codegen/src/verifier/mod.rs
+++ b/lib/codegen/src/verifier/mod.rs
@@ -335,6 +335,12 @@ impl<'a> Verifier<'a> {
            RegFill { src, .. } => {
                self.verify_stack_slot(inst, src)?;
            }
+            LoadComplex { ref args, .. } => {
+                self.verify_value_list(inst, args)?;
+            }
+            StoreComplex { ref args, .. } => {
+                self.verify_value_list(inst, args)?;
+            }

            // Exhaustive list so we can't forget to add new formats
            Unary { .. } |
@@ -1149,8 +1155,8 @@ impl<'a> Verifier<'a> {
 mod tests {
    use super::{Error, Verifier};
    use entity::EntityList;
-    use ir::Function;
    use ir::instructions::{InstructionData, Opcode};
+    use ir::Function;
    use settings;

    macro_rules! assert_err_with_msg {
--- a/lib/codegen/src/write.rs
+++ b/lib/codegen/src/write.rs
@@ -369,12 +369,44 @@ pub fn write_operands(
        } => write!(w, " {}, {}{}", arg, stack_slot, offset),
        HeapAddr { heap, arg, imm, .. } => write!(w, " {}, {}, {}", heap, arg, imm),
        Load { flags, arg, offset, .. } => write!(w, "{} {}{}", flags, arg, offset),
+        LoadComplex {
+            flags,
+            ref args,
+            offset,
+            ..
+        } => {
+            let args = args.as_slice(pool);
+            write!(
+                w,
+                "{} {}{}",
+                flags,
+                DisplayValuesWithDelimiter(&args, '+'),
+                offset
+            )
+
+        }
        Store {
            flags,
            args,
            offset,
            ..
        } => write!(w, "{} {}, {}{}", flags, args[0], args[1], offset),
+        StoreComplex {
+            flags,
+            ref args,
+            offset,
+            ..
+        } => {
+            let args = args.as_slice(pool);
+            write!(
+                w,
+                "{} {}, {}{}",
+                flags,
+                args[0],
+                DisplayValuesWithDelimiter(&args[1..], '+'),
+                offset
+            )
+        }
        RegMove { arg, src, dst, .. } => {
            if let Some(isa) = isa {
                let regs = isa.register_info();
@@ -450,6 +482,21 @@ impl<'a> fmt::Display for DisplayValues<'a> {
    }
 }

+struct DisplayValuesWithDelimiter<'a>(&'a [Value], char);
+
+impl<'a> fmt::Display for DisplayValuesWithDelimiter<'a> {
+    fn fmt(&self, f: &mut fmt::Formatter) -> Result {
+        for (i, val) in self.0.iter().enumerate() {
+            if i == 0 {
+                write!(f, "{}", val)?;
+            } else {
+                write!(f, "{}{}", self.1, val)?;
+            }
+        }
+        Ok(())
+    }
+}
+
 #[cfg(test)]
 mod tests {
    use ir::types;
--- a/lib/reader/src/lexer.rs
+++ b/lib/reader/src/lexer.rs
@@ -22,6 +22,7 @@ pub enum Token<'a> {
    LBracket, // '['
    RBracket, // ']'
    Minus, // '-'
+    Plus, // '+'
    Comma, // ','
    Dot, // '.'
    Colon, // ':'
@@ -169,6 +170,25 @@ impl<'a> Lexer<'a> {
        self.source[self.pos..].starts_with(prefix)
    }

+    // Starting from `lookahead`, are we looking at a number?
+    fn looking_at_numeric(&self) -> bool {
+        if let Some(c) = self.lookahead {
+            if c.is_digit(10) {
+                return true;
+            }
+            match c {
+                '-' => return true,
+                '+' => return true,
+                '.' => return true,
+                _ => {}
+            }
+            if self.looking_at("NaN") || self.looking_at("Inf") || self.looking_at("sNaN") {
+                return true;
+            }
+        }
+        false
+    }
+
    // Scan a single-char token.
    fn scan_char(&mut self, tok: Token<'a>) -> Result<LocatedToken<'a>, LocatedError> {
        assert_ne!(self.lookahead, None);
@@ -234,16 +254,17 @@ impl<'a> Lexer<'a> {
        match self.lookahead {
            Some('-') => {
                self.next_ch();
-
-                if let Some(c) = self.lookahead {
-                    // If the next character won't parse as a number, we return Token::Minus
-                    if !c.is_alphanumeric() && c != '.' {
+                if !self.looking_at_numeric() {
+                    // If the next characters won't parse as a number, we return Token::Minus
                    return token(Token::Minus, loc);
                }
            }
-            }
            Some('+') => {
                self.next_ch();
+                if !self.looking_at_numeric() {
+                    // If the next characters won't parse as a number, we return Token::Plus
+                    return token(Token::Plus, loc);
+                }
            }
            _ => {}
        }
--- a/lib/reader/src/parser.rs
+++ b/lib/reader/src/parser.rs
@@ -13,8 +13,8 @@ use cretonne_codegen::ir::{AbiParam, ArgumentExtension, ArgumentLoc, Ebb, ExtFun
                           Type, Value, ValueLoc};
 use cretonne_codegen::isa::{self, Encoding, RegUnit, TargetIsa};
 use cretonne_codegen::packed_option::ReservedValue;
-use cretonne_codegen::{settings, timing};
 use cretonne_codegen::settings::CallConv;
+use cretonne_codegen::{settings, timing};
 use error::{Error, Location, Result};
 use isaspec;
 use lexer::{self, Lexer, Token};
@@ -1872,6 +1872,24 @@ impl<'a> Parser<'a> {
        Ok(args)
    }

+    // Parse a possibly empty sequence of values separated by '+' tokens.
+    //
+    // Yields an empty list, without calling `consume`, when the current token is not a value.
+    // After the first value, each '+' is followed by `match_value`; its error is propagated.
+    fn parse_value_sequence(&mut self) -> Result<VariableArgs> {
+        let mut values = VariableArgs::new();
+        match self.token() {
+            Some(Token::Value(first)) => values.push(first),
+            _ => return Ok(values),
+        }
+        self.consume();
+
+        while self.optional(Token::Plus) {
+            values.push(self.match_value("expected value in argument list")?);
+        }
+        Ok(values)
+    }
+
    // Parse an optional value list enclosed in parantheses.
    fn parse_opt_value_list(&mut self) -> Result<VariableArgs> {
        if !self.optional(Token::LPar) {
@@ -2267,6 +2285,17 @@ impl<'a> Parser<'a> {
                    offset,
                }
            }
+            InstructionFormat::LoadComplex => {
+                let flags = self.optional_memflags();
+                let args = self.parse_value_sequence()?;
+                let offset = self.optional_offset32()?;
+                InstructionData::LoadComplex {
+                    opcode,
+                    flags,
+                    args: args.into_value_list(&[], &mut ctx.function.dfg.value_lists),
+                    offset,
+                }
+            }
            InstructionFormat::Store => {
                let flags = self.optional_memflags();
                let arg = self.match_value("expected SSA value operand")?;
@@ -2283,6 +2312,23 @@ impl<'a> Parser<'a> {
                    offset,
                }
            }
+
+            InstructionFormat::StoreComplex => {
+                let flags = self.optional_memflags();
+                let src = self.match_value("expected SSA value operand")?;
+                self.match_token(
+                    Token::Comma,
+                    "expected ',' between operands",
+                )?;
+                let args = self.parse_value_sequence()?;
+                let offset = self.optional_offset32()?;
+                InstructionData::StoreComplex {
+                    opcode,
+                    flags,
+                    args: args.into_value_list(&[src], &mut ctx.function.dfg.value_lists),
+                    offset,
+                }
+            }
            InstructionFormat::RegMove => {
                let arg = self.match_value("expected SSA value operand")?;
                self.match_token(
@@ -2402,9 +2448,9 @@ impl<'a> Parser<'a> {
 #[cfg(test)]
 mod tests {
    use super::*;
-    use cretonne_codegen::ir::StackSlotKind;
    use cretonne_codegen::ir::entities::AnyEntity;
    use cretonne_codegen::ir::types;
+    use cretonne_codegen::ir::StackSlotKind;
    use cretonne_codegen::ir::{ArgumentExtension, ArgumentPurpose};
    use cretonne_codegen::settings::CallConv;
    use error::Error;