x64: Enable load-coalescing for SSE/AVX instructions (#5841)

* x64: Enable load-coalescing for SSE/AVX instructions

This commit unlocks the ability to fold loads into operands of SSE and
AVX instructions. When this happens it is beneficial for function size
in addition to reducing register pressure. Previously this was not done
because most SSE instructions require their memory operands to be
aligned. AVX instructions, however, have no alignment requirements.

The solution implemented here is one recommended by Chris: add a new
`XmmMemAligned` newtype wrapper around `XmmMem`. All SSE instructions
are now annotated as requiring an `XmmMemAligned` operand, except for a
few new instruction styles used specifically for instructions that
don't require alignment (e.g. `movdqu` and the `*sd`/`*ss`
instructions). All existing instruction helpers continue to take
`XmmMem`, however. This way, if an AVX lowering is chosen the operand
can be used as-is. If an SSE lowering is chosen instead, an automatic
conversion from `XmmMem` to `XmmMemAligned` kicks in. This conversion
only fails for unaligned addresses, in which case a load instruction is
emitted and the operand becomes a temporary register instead. A number
of prior `Xmm` arguments have been converted to `XmmMem` as well.
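
As an illustration, the sketch below models that conversion in plain
Rust. All names and shapes here (`XmmMem`, `XmmMemAligned`, the
`known_aligned` flag, `to_aligned`, `Emitted`) are simplified stand-ins
invented for this example rather than the actual Cranelift definitions;
they only mirror the behavior described above.

```rust
/// Register-or-memory operand with no alignment guarantee (fine for AVX).
enum XmmMem {
    Reg(u8),                                   // xmm register number
    Mem { addr: String, known_aligned: bool }, // symbolic address + alignment fact
}

/// The same operand, but any memory form is known to be 16-byte aligned,
/// which is what most SSE instructions require.
struct XmmMemAligned(XmmMem);

/// What, if anything, had to be emitted to perform the conversion.
enum Emitted {
    Nothing,
    LoadIntoTemp { from: String, temp: u8 }, // e.g. a `movdqu` into a fresh xmm
}

/// Automatic conversion used when an SSE lowering is picked: registers and
/// known-aligned memory operands pass through unchanged; anything else is
/// loaded into a temporary register first.
fn to_aligned(op: XmmMem, fresh_temp: u8) -> (XmmMemAligned, Emitted) {
    match op {
        XmmMem::Reg(_) => (XmmMemAligned(op), Emitted::Nothing),
        XmmMem::Mem { known_aligned: true, .. } => (XmmMemAligned(op), Emitted::Nothing),
        XmmMem::Mem { addr, .. } => (
            XmmMemAligned(XmmMem::Reg(fresh_temp)),
            Emitted::LoadIntoTemp { from: addr, temp: fresh_temp },
        ),
    }
}

fn main() {
    // An unaligned memory operand: the conversion falls back to a load.
    let (op, emitted) = to_aligned(
        XmmMem::Mem { addr: "8(%rcx)".to_string(), known_aligned: false },
        /* fresh_temp = */ 15,
    );
    match (op, emitted) {
        (XmmMemAligned(XmmMem::Reg(r)), Emitted::LoadIntoTemp { from, temp }) => {
            assert_eq!(r, temp);
            println!("movdqu {from}, %xmm{temp}");
        }
        _ => unreachable!("unaligned memory must be loaded into a register"),
    }
}
```

The key property is that callers keep handing out `XmmMem`; the
alignment obligation is only discharged at the point where an SSE
encoding is actually selected.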

One change from this commit is that loading an unaligned operand for an
SSE instruction previously used the "correct" type of load, e.g.
`movups` for f32x4 or `movupd` for f64x2, but now the load happens in a
context without type information, so a `movdqu` instruction is
generated instead. According to [this Stack Overflow question][question]
it looks like modern processors won't penalize this "wrong" choice of
type when the operand is then used by f32- or f64-oriented instructions.

Finally this commit improves reuse of logic in the `put_in_*_mem*`
helpers so that they share code with `sinkable_load` and avoid
duplication. With this in place various ISLE rules have been updated as
well.
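
For flavor, a hedged sketch of that sharing is shown below. The
`LowerCtx` trait, `SinkableLoad`, and the method names are illustrative
stand-ins, not the real Cranelift lowering API; the point is only that
the sinkable-load check lives in one place and the `put_in_*_mem*`-style
helpers build on it.

```rust
// Illustrative stand-ins only; not the real Cranelift lowering context or API.

struct Value(u32);
struct Amode(String);               // an addressing mode, e.g. "16(%rdi)"
struct SinkableLoad { addr: Amode } // a load with exactly one use

enum XmmMem {
    Reg(u8),
    Mem(Amode),
}

trait LowerCtx {
    /// Is this value produced by a load that can be sunk into its only use?
    fn sinkable_load(&mut self, val: &Value) -> Option<SinkableLoad>;
    /// Commit to sinking the load so it is not emitted on its own.
    fn sink_load(&mut self, load: SinkableLoad) -> Amode;
    /// Fallback: materialize the value in an xmm register.
    fn put_in_xmm_reg(&mut self, val: &Value) -> u8;
}

/// One shared helper: fold the value as a memory operand when possible,
/// otherwise fall back to a register. Both the alignment-agnostic and the
/// aligned entry points can be layered on top of this instead of each
/// re-implementing the sinkable-load check.
fn put_in_xmm_mem(ctx: &mut impl LowerCtx, val: &Value) -> XmmMem {
    match ctx.sinkable_load(val) {
        Some(load) => XmmMem::Mem(ctx.sink_load(load)),
        None => XmmMem::Reg(ctx.put_in_xmm_reg(val)),
    }
}
```

An aligned variant can then wrap this helper and apply the `XmmMem` to
`XmmMemAligned` conversion described earlier to its result.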

In the tests it can be seen that AVX instructions are now automatically
load-coalesced and use memory operands in a few cases.

[question]: https://stackoverflow.com/questions/40854819/is-there-any-situation-where-using-movdqu-and-movupd-is-better-than-movups

* Fix tests

* Fix move-and-extend to be unaligned

Like `movdqu` and friends, these instructions don't have the alignment
requirements that other xmm instructions do. Additionally some ISA
tests are added to ensure that their output is tested.

* Review comments

Author: Alex Crichton (committed by GitHub)
Date: 2023-02-21 13:10:19 -06:00
Parent: c65de1f1b1
Commit: d82ebcc102
11 changed files with 644 additions and 323 deletions


@@ -333,44 +333,42 @@ block0(v0: i64):
; unwind PushFrameRegs { offset_upward_to_caller_sp: 16 }
; movq %rsp, %rbp
; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 160 }
; subq %rsp, $256, %rsp
; movdqu %xmm6, 96(%rsp)
; subq %rsp, $224, %rsp
; movdqu %xmm6, 64(%rsp)
; unwind SaveReg { clobber_offset: 0, reg: p6f }
; movdqu %xmm7, 112(%rsp)
; movdqu %xmm7, 80(%rsp)
; unwind SaveReg { clobber_offset: 16, reg: p7f }
; movdqu %xmm8, 128(%rsp)
; movdqu %xmm8, 96(%rsp)
; unwind SaveReg { clobber_offset: 32, reg: p8f }
; movdqu %xmm9, 144(%rsp)
; movdqu %xmm9, 112(%rsp)
; unwind SaveReg { clobber_offset: 48, reg: p9f }
; movdqu %xmm10, 160(%rsp)
; movdqu %xmm10, 128(%rsp)
; unwind SaveReg { clobber_offset: 64, reg: p10f }
; movdqu %xmm11, 176(%rsp)
; movdqu %xmm11, 144(%rsp)
; unwind SaveReg { clobber_offset: 80, reg: p11f }
; movdqu %xmm12, 192(%rsp)
; movdqu %xmm12, 160(%rsp)
; unwind SaveReg { clobber_offset: 96, reg: p12f }
; movdqu %xmm13, 208(%rsp)
; movdqu %xmm13, 176(%rsp)
; unwind SaveReg { clobber_offset: 112, reg: p13f }
; movdqu %xmm14, 224(%rsp)
; movdqu %xmm14, 192(%rsp)
; unwind SaveReg { clobber_offset: 128, reg: p14f }
; movdqu %xmm15, 240(%rsp)
; movdqu %xmm15, 208(%rsp)
; unwind SaveReg { clobber_offset: 144, reg: p15f }
; block0:
; movsd 0(%rcx), %xmm0
; movsd 8(%rcx), %xmm10
; movdqu %xmm10, rsp(80 + virtual offset)
; movsd 16(%rcx), %xmm2
; movdqu %xmm2, rsp(0 + virtual offset)
; movdqu %xmm10, rsp(48 + virtual offset)
; movsd 16(%rcx), %xmm5
; movsd 24(%rcx), %xmm14
; movdqu %xmm14, rsp(64 + virtual offset)
; movdqu %xmm14, rsp(32 + virtual offset)
; movsd 32(%rcx), %xmm13
; movsd 40(%rcx), %xmm15
; movdqu %xmm15, rsp(48 + virtual offset)
; movdqu %xmm15, rsp(16 + virtual offset)
; movsd 48(%rcx), %xmm7
; movsd 56(%rcx), %xmm5
; movdqu %xmm5, rsp(32 + virtual offset)
; movsd 56(%rcx), %xmm8
; movdqu %xmm8, rsp(0 + virtual offset)
; movsd 64(%rcx), %xmm12
; movsd 72(%rcx), %xmm4
; movdqu %xmm4, rsp(16 + virtual offset)
; movsd 72(%rcx), %xmm2
; movsd 80(%rcx), %xmm9
; movsd 88(%rcx), %xmm4
; movsd 96(%rcx), %xmm3
@@ -380,24 +378,21 @@ block0(v0: i64):
; movsd 128(%rcx), %xmm6
; movsd 136(%rcx), %xmm14
; movsd 144(%rcx), %xmm1
; movsd 152(%rcx), %xmm15
; movdqu rsp(80 + virtual offset), %xmm2
; addsd %xmm0, %xmm2, %xmm0
; movdqu rsp(0 + virtual offset), %xmm2
; movdqu rsp(64 + virtual offset), %xmm5
; addsd %xmm2, %xmm5, %xmm2
; movdqu rsp(48 + virtual offset), %xmm5
; addsd %xmm13, %xmm5, %xmm13
; movdqu rsp(32 + virtual offset), %xmm5
; addsd %xmm7, %xmm5, %xmm7
; movdqu rsp(16 + virtual offset), %xmm5
; addsd %xmm12, %xmm5, %xmm12
; movdqu rsp(48 + virtual offset), %xmm15
; addsd %xmm0, %xmm15, %xmm0
; movdqu rsp(32 + virtual offset), %xmm15
; addsd %xmm5, %xmm15, %xmm5
; movdqu rsp(16 + virtual offset), %xmm15
; addsd %xmm13, %xmm15, %xmm13
; movdqu rsp(0 + virtual offset), %xmm15
; addsd %xmm7, %xmm15, %xmm7
; addsd %xmm12, %xmm2, %xmm12
; addsd %xmm9, %xmm4, %xmm9
; addsd %xmm3, %xmm8, %xmm3
; addsd %xmm11, %xmm10, %xmm11
; addsd %xmm6, %xmm14, %xmm6
; addsd %xmm1, %xmm15, %xmm1
; addsd %xmm0, %xmm2, %xmm0
; addsd %xmm1, 152(%rcx), %xmm1
; addsd %xmm0, %xmm5, %xmm0
; addsd %xmm13, %xmm7, %xmm13
; addsd %xmm12, %xmm9, %xmm12
; addsd %xmm3, %xmm11, %xmm3
@@ -406,17 +401,17 @@ block0(v0: i64):
; addsd %xmm12, %xmm3, %xmm12
; addsd %xmm0, %xmm12, %xmm0
; addsd %xmm0, %xmm6, %xmm0
; movdqu 96(%rsp), %xmm6
; movdqu 112(%rsp), %xmm7
; movdqu 128(%rsp), %xmm8
; movdqu 144(%rsp), %xmm9
; movdqu 160(%rsp), %xmm10
; movdqu 176(%rsp), %xmm11
; movdqu 192(%rsp), %xmm12
; movdqu 208(%rsp), %xmm13
; movdqu 224(%rsp), %xmm14
; movdqu 240(%rsp), %xmm15
; addq %rsp, $256, %rsp
; movdqu 64(%rsp), %xmm6
; movdqu 80(%rsp), %xmm7
; movdqu 96(%rsp), %xmm8
; movdqu 112(%rsp), %xmm9
; movdqu 128(%rsp), %xmm10
; movdqu 144(%rsp), %xmm11
; movdqu 160(%rsp), %xmm12
; movdqu 176(%rsp), %xmm13
; movdqu 192(%rsp), %xmm14
; movdqu 208(%rsp), %xmm15
; addq %rsp, $224, %rsp
; movq %rbp, %rsp
; popq %rbp
; ret
@@ -425,34 +420,32 @@ block0(v0: i64):
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; subq $0x100, %rsp
; movdqu %xmm6, 0x60(%rsp)
; movdqu %xmm7, 0x70(%rsp)
; movdqu %xmm8, 0x80(%rsp)
; movdqu %xmm9, 0x90(%rsp)
; movdqu %xmm10, 0xa0(%rsp)
; movdqu %xmm11, 0xb0(%rsp)
; movdqu %xmm12, 0xc0(%rsp)
; movdqu %xmm13, 0xd0(%rsp)
; movdqu %xmm14, 0xe0(%rsp)
; movdqu %xmm15, 0xf0(%rsp)
; block1: ; offset 0x67
; subq $0xe0, %rsp
; movdqu %xmm6, 0x40(%rsp)
; movdqu %xmm7, 0x50(%rsp)
; movdqu %xmm8, 0x60(%rsp)
; movdqu %xmm9, 0x70(%rsp)
; movdqu %xmm10, 0x80(%rsp)
; movdqu %xmm11, 0x90(%rsp)
; movdqu %xmm12, 0xa0(%rsp)
; movdqu %xmm13, 0xb0(%rsp)
; movdqu %xmm14, 0xc0(%rsp)
; movdqu %xmm15, 0xd0(%rsp)
; block1: ; offset 0x61
; movsd (%rcx), %xmm0 ; trap: heap_oob
; movsd 8(%rcx), %xmm10 ; trap: heap_oob
; movdqu %xmm10, 0x50(%rsp)
; movsd 0x10(%rcx), %xmm2 ; trap: heap_oob
; movdqu %xmm2, (%rsp)
; movdqu %xmm10, 0x30(%rsp)
; movsd 0x10(%rcx), %xmm5 ; trap: heap_oob
; movsd 0x18(%rcx), %xmm14 ; trap: heap_oob
; movdqu %xmm14, 0x40(%rsp)
; movdqu %xmm14, 0x20(%rsp)
; movsd 0x20(%rcx), %xmm13 ; trap: heap_oob
; movsd 0x28(%rcx), %xmm15 ; trap: heap_oob
; movdqu %xmm15, 0x30(%rsp)
; movdqu %xmm15, 0x10(%rsp)
; movsd 0x30(%rcx), %xmm7 ; trap: heap_oob
; movsd 0x38(%rcx), %xmm5 ; trap: heap_oob
; movdqu %xmm5, 0x20(%rsp)
; movsd 0x38(%rcx), %xmm8 ; trap: heap_oob
; movdqu %xmm8, (%rsp)
; movsd 0x40(%rcx), %xmm12 ; trap: heap_oob
; movsd 0x48(%rcx), %xmm4 ; trap: heap_oob
; movdqu %xmm4, 0x10(%rsp)
; movsd 0x48(%rcx), %xmm2 ; trap: heap_oob
; movsd 0x50(%rcx), %xmm9 ; trap: heap_oob
; movsd 0x58(%rcx), %xmm4 ; trap: heap_oob
; movsd 0x60(%rcx), %xmm3 ; trap: heap_oob
@@ -462,24 +455,21 @@ block0(v0: i64):
; movsd 0x80(%rcx), %xmm6 ; trap: heap_oob
; movsd 0x88(%rcx), %xmm14 ; trap: heap_oob
; movsd 0x90(%rcx), %xmm1 ; trap: heap_oob
; movsd 0x98(%rcx), %xmm15 ; trap: heap_oob
; movdqu 0x50(%rsp), %xmm2
; addsd %xmm2, %xmm0
; movdqu (%rsp), %xmm2
; movdqu 0x40(%rsp), %xmm5
; addsd %xmm5, %xmm2
; movdqu 0x30(%rsp), %xmm5
; addsd %xmm5, %xmm13
; movdqu 0x20(%rsp), %xmm5
; addsd %xmm5, %xmm7
; movdqu 0x10(%rsp), %xmm5
; addsd %xmm5, %xmm12
; movdqu 0x30(%rsp), %xmm15
; addsd %xmm15, %xmm0
; movdqu 0x20(%rsp), %xmm15
; addsd %xmm15, %xmm5
; movdqu 0x10(%rsp), %xmm15
; addsd %xmm15, %xmm13
; movdqu (%rsp), %xmm15
; addsd %xmm15, %xmm7
; addsd %xmm2, %xmm12
; addsd %xmm4, %xmm9
; addsd %xmm8, %xmm3
; addsd %xmm10, %xmm11
; addsd %xmm14, %xmm6
; addsd %xmm15, %xmm1
; addsd %xmm2, %xmm0
; addsd 0x98(%rcx), %xmm1 ; trap: heap_oob
; addsd %xmm5, %xmm0
; addsd %xmm7, %xmm13
; addsd %xmm9, %xmm12
; addsd %xmm11, %xmm3
@@ -488,17 +478,17 @@ block0(v0: i64):
; addsd %xmm3, %xmm12
; addsd %xmm12, %xmm0
; addsd %xmm6, %xmm0
; movdqu 0x60(%rsp), %xmm6
; movdqu 0x70(%rsp), %xmm7
; movdqu 0x80(%rsp), %xmm8
; movdqu 0x90(%rsp), %xmm9
; movdqu 0xa0(%rsp), %xmm10
; movdqu 0xb0(%rsp), %xmm11
; movdqu 0xc0(%rsp), %xmm12
; movdqu 0xd0(%rsp), %xmm13
; movdqu 0xe0(%rsp), %xmm14
; movdqu 0xf0(%rsp), %xmm15
; addq $0x100, %rsp
; movdqu 0x40(%rsp), %xmm6
; movdqu 0x50(%rsp), %xmm7
; movdqu 0x60(%rsp), %xmm8
; movdqu 0x70(%rsp), %xmm9
; movdqu 0x80(%rsp), %xmm10
; movdqu 0x90(%rsp), %xmm11
; movdqu 0xa0(%rsp), %xmm12
; movdqu 0xb0(%rsp), %xmm13
; movdqu 0xc0(%rsp), %xmm14
; movdqu 0xd0(%rsp), %xmm15
; addq $0xe0, %rsp
; movq %rbp, %rsp
; popq %rbp
; retq


@@ -13,8 +13,7 @@ block0(v0: f32x4, v1: i64):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movups 0(%rdi), %xmm4
; vorps %xmm0, %xmm4, %xmm0
; vorps %xmm0, 0(%rdi), %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
@@ -24,8 +23,7 @@ block0(v0: f32x4, v1: i64):
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movups (%rdi), %xmm4
; vorps %xmm4, %xmm0, %xmm0
; vorps (%rdi), %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
@@ -42,12 +40,11 @@ block0(v0: i64):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movss 0(%rdi), %xmm7
; movl $-2147483648, %ecx
; movd %ecx, %xmm5
; vandnps %xmm5, const(0), %xmm8
; vandps %xmm5, %xmm7, %xmm9
; vorps %xmm8, %xmm9, %xmm0
; movl $-2147483648, %eax
; movd %eax, %xmm4
; vandnps %xmm4, const(0), %xmm6
; vandps %xmm4, 0(%rdi), %xmm8
; vorps %xmm6, %xmm8, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
@@ -57,12 +54,11 @@ block0(v0: i64):
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movss (%rdi), %xmm7
; movl $0x80000000, %ecx
; movd %ecx, %xmm5
; vandnps 0x17(%rip), %xmm5, %xmm8
; vandps %xmm7, %xmm5, %xmm9
; vorps %xmm9, %xmm8, %xmm0
; movl $0x80000000, %eax
; movd %eax, %xmm4
; vandnps 0x1b(%rip), %xmm4, %xmm6
; vandps (%rdi), %xmm4, %xmm8
; vorps %xmm8, %xmm6, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
@@ -78,6 +74,8 @@ block0(v0: i64):
; addb %al, (%rax)
; addb %al, (%rax)
; addb %al, (%rax)
; addb %al, (%rax)
; addb %al, (%rax)
function %bor_f32x4(f32x4, f32x4) -> f32x4 {
block0(v0: f32x4, v1: f32x4):


@@ -0,0 +1,154 @@
test compile precise-output
set enable_simd
target x86_64
function %uload8x8(i64) -> i16x8 {
block0(v0: i64):
v1 = uload8x8 v0
return v1
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; pmovzxbw 0(%rdi), %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; pmovzxbw (%rdi), %xmm0 ; trap: heap_oob
; movq %rbp, %rsp
; popq %rbp
; retq
function %sload8x8(i64) -> i16x8 {
block0(v0: i64):
v1 = sload8x8 v0
return v1
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; pmovsxbw 0(%rdi), %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; pmovsxbw (%rdi), %xmm0 ; trap: heap_oob
; movq %rbp, %rsp
; popq %rbp
; retq
function %uload16x4(i64) -> i32x4 {
block0(v0: i64):
v1 = uload16x4 v0
return v1
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; pmovzxwd 0(%rdi), %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; pmovzxwd (%rdi), %xmm0 ; trap: heap_oob
; movq %rbp, %rsp
; popq %rbp
; retq
function %sload16x4(i64) -> i32x4 {
block0(v0: i64):
v1 = sload16x4 v0
return v1
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; pmovsxwd 0(%rdi), %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; pmovsxwd (%rdi), %xmm0 ; trap: heap_oob
; movq %rbp, %rsp
; popq %rbp
; retq
function %uload32x2(i64) -> i64x2 {
block0(v0: i64):
v1 = uload32x2 v0
return v1
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; pmovzxdq 0(%rdi), %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; pmovzxdq (%rdi), %xmm0 ; trap: heap_oob
; movq %rbp, %rsp
; popq %rbp
; retq
function %sload32x2(i64) -> i64x2 {
block0(v0: i64):
v1 = sload32x2 v0
return v1
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; pmovsxdq 0(%rdi), %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; pmovsxdq (%rdi), %xmm0 ; trap: heap_oob
; movq %rbp, %rsp
; popq %rbp
; retq