From 52b4c48a1b27b80247ccf31c9fe3f64d4f56b423 Mon Sep 17 00:00:00 2001
From: Alex Crichton
Date: Wed, 1 Mar 2023 23:43:42 -0600
Subject: [PATCH] x64: Improve codegen for i8x16.shr_u (#5906)

This catches a case that wasn't handled by #5880, allowing a constant
load to be folded into the instruction that uses it rather than being
loaded into a temporary register first.
---
 cranelift/codegen/src/isa/x64/lower.isle   |  6 ++--
 .../filetests/isa/x64/simd-arith-avx.clif  | 17 ++++++-----
 .../isa/x64/simd-bitwise-compile.clif      | 30 +++++++------------
 3 files changed, 22 insertions(+), 31 deletions(-)

diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle
index 9f83ccbfb9..ec21968b38 100644
--- a/cranelift/codegen/src/isa/x64/lower.isle
+++ b/cranelift/codegen/src/isa/x64/lower.isle
@@ -522,12 +522,10 @@
            ;; Shift `src` using 16x8. Unfortunately, a 16x8 shift will only be
            ;; correct for half of the lanes; the others must be fixed up with
            ;; the mask below.
-           (unmasked Xmm (x64_psrlw src (mov_rmi_to_xmm masked_amt)))
-           (mask_addr SyntheticAmode (ushr_i8x16_mask masked_amt))
-           (mask Reg (x64_load $I8X16 mask_addr (ExtKind.None))))
+           (unmasked Xmm (x64_psrlw src (mov_rmi_to_xmm masked_amt))))
        (sse_and $I8X16
                 unmasked
-                (RegMem.Reg mask))))
+                (ushr_i8x16_mask masked_amt))))
 
 ;; Get the address of the mask to use when fixing up the lanes that weren't
 ;; correctly generated by the 16x8 shift.
diff --git a/cranelift/filetests/filetests/isa/x64/simd-arith-avx.clif b/cranelift/filetests/filetests/isa/x64/simd-arith-avx.clif
index 3e622bd5f9..fdd58032ea 100644
--- a/cranelift/filetests/filetests/isa/x64/simd-arith-avx.clif
+++ b/cranelift/filetests/filetests/isa/x64/simd-arith-avx.clif
@@ -1635,8 +1635,7 @@ block0(v0: i8x16, v1: i32):
 ; vpsrlw %xmm0, %xmm5, %xmm7
 ; lea const(0), %rsi
 ; shlq $4, %r10, %r10
-; movdqu 0(%rsi,%r10,1), %xmm13
-; vpand %xmm7, %xmm13, %xmm0
+; vpand %xmm7, 0(%rsi,%r10,1), %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -1652,11 +1651,13 @@ block0(v0: i8x16, v1: i32):
 ; vpsrlw %xmm5, %xmm0, %xmm7
 ; leaq 0x15(%rip), %rsi
 ; shlq $4, %r10
-; movdqu (%rsi, %r10), %xmm13
-; vpand %xmm13, %xmm7, %xmm0
+; vpand (%rsi, %r10), %xmm7, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; retq
+; addb %al, (%rax)
+; addb %al, (%rax)
+; addb %bh, %bh
 
 function %i8x16_ushr_imm(i8x16) -> i8x16 {
 block0(v0: i8x16):
@@ -1670,8 +1671,7 @@ block0(v0: i8x16):
 ; movq %rsp, %rbp
 ; block0:
 ; vpsrlw %xmm0, $1, %xmm2
-; movdqu const(0), %xmm4
-; vpand %xmm2, %xmm4, %xmm0
+; vpand %xmm2, const(0), %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -1682,14 +1682,15 @@ block0(v0: i8x16):
 ; movq %rsp, %rbp
 ; block1: ; offset 0x4
 ; vpsrlw $1, %xmm0, %xmm2
-; movdqu 0xf(%rip), %xmm4
-; vpand %xmm4, %xmm2, %xmm0
+; vpand 0xf(%rip), %xmm2, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; retq
 ; addb %al, (%rax)
 ; addb %al, (%rax)
 ; addb %al, (%rax)
+; addb %al, (%rax)
+; addb %al, (%rax)
 ; jg 0xa1
 ; jg 0xa3
 ; jg 0xa5
diff --git a/cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif b/cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif
index 1ecdf31ed0..f63cc22313 100644
--- a/cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif
+++ b/cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif
@@ -485,8 +485,7 @@ block0:
 ; block0:
 ; movdqu const(1), %xmm0
 ; psrlw %xmm0, $1, %xmm0
-; movdqu const(0), %xmm3
-; pand %xmm0, %xmm3, %xmm0
+; pand %xmm0, const(0), %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -496,28 +495,21 @@ block0:
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block1: ; offset 0x4
-; movdqu 0x34(%rip), %xmm0
+; movdqu 0x24(%rip), %xmm0
 ; psrlw $1, %xmm0
-; movdqu 0x17(%rip), %xmm3
-; pand %xmm3, %xmm0
+; pand 7(%rip), %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; retq
 ; addb %al, (%rax)
-; addb %al, (%rax)
-; addb %al, (%rax)
-; addb %al, (%rax)
-; addb %al, (%rax)
-; addb %al, (%rax)
-; addb %al, (%rax)
-; jg 0xb1
-; jg 0xb3
-; jg 0xb5
-; jg 0xb7
-; jg 0xb9
-; jg 0xbb
-; jg 0xbd
-; jg 0xbf
+; jg 0xa1
+; jg 0xa3
+; jg 0xa5
+; jg 0xa7
+; jg 0xa9
+; jg 0xab
+; jg 0xad
+; jg 0xaf
 ; addb %al, (%rcx)
 ; addb (%rbx), %al
 ; addb $5, %al
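
Note on the lowering above: x64 has no packed 8-bit shift, so `i8x16.shr_u` is lowered as a 16-bit `psrlw` followed by an AND with the constant that `ushr_i8x16_mask` returns, which clears the bits that leak into each byte from its neighboring byte. The sketch below is a plain-Rust scalar model of that equivalence; it is illustrative only and not part of this patch, and the function name is made up rather than a Cranelift API.

// Scalar model of the i8x16.shr_u lowering: shift 16-bit lanes, then AND
// each byte with 0xFF >> amt to clear the bits that leaked in from the
// neighboring byte. (Illustrative only; not Cranelift code.)
fn i8x16_ushr_scalar_model(lanes: [u8; 16], amt: u32) -> [u8; 16] {
    let amt = amt & 7; // shift amounts are masked to the lane width
    let mask = 0xFFu8 >> amt; // per-byte fixup mask
    let mut out = [0u8; 16];
    for (i, pair) in lanes.chunks_exact(2).enumerate() {
        // Model the 16x8 (psrlw-style) shift on two adjacent bytes.
        let wide = u16::from_le_bytes([pair[0], pair[1]]) >> amt;
        let [lo, hi] = wide.to_le_bytes();
        out[2 * i] = lo & mask;
        out[2 * i + 1] = hi & mask;
    }
    out
}

fn main() {
    let v: [u8; 16] = [0x80, 0xFF, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14];
    let shifted = i8x16_ushr_scalar_model(v, 1);
    // Every byte matches a plain per-byte unsigned shift.
    assert!(shifted.iter().zip(v.iter()).all(|(&s, &b)| s == b >> 1));
    println!("{:?}", shifted);
}

With the mask folded directly into `pand`/`vpand` as a memory operand, the separate `movdqu` into a temporary register disappears, which is exactly the change visible in the filetest expectations above.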