x64: Improve codegen for i8x16.shr_u (#5906)

This catches a case that wasn't handled by #5880, allowing a constant load to be folded directly into the instruction that uses it rather than forcing the constant into a temporary register first.
Author: Alex Crichton
Date: 2023-03-01 23:43:42 -06:00
Committed by: GitHub
Parent: 7b8854f803
Commit: 52b4c48a1b

3 changed files with 22 additions and 31 deletions
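
Background for the change below (editor's note, not part of the commit): the x64 lowering emulates an unsigned i8x16 shift with a 16-bit-lane shift (psrlw) followed by an AND against a constant mask that clears the bits leaking across from the neighboring byte of each 16-bit lane. The Rust sketch here is a hypothetical scalar model of that trick; the commit itself only changes where the mask comes from, letting it be used directly as a memory operand of pand/vpand instead of being loaded into a scratch register first.

// Scalar sketch (hypothetical, not Cranelift code) of how i8x16.shr_u is
// emulated: shift each 16-bit pair with a single logical shift, then AND
// with a mask that clears the bits that crossed over from the adjacent byte.
fn ushr_i8x16_model(lanes: [u8; 16], amt: u32) -> [u8; 16] {
    let amt = amt % 8;
    // Bits of an 8-bit lane that survive a logical right shift by `amt`.
    let mask = 0xffu8 >> amt;
    let mut out = [0u8; 16];
    for (i, pair) in lanes.chunks_exact(2).enumerate() {
        // Models psrlw: one shift over a whole 16-bit lane.
        let wide = u16::from_le_bytes([pair[0], pair[1]]) >> amt;
        let [lo, hi] = wide.to_le_bytes();
        // Models pand with the constant mask: only the low byte of each
        // 16-bit lane actually picks up stray bits, but masking both is fine.
        out[2 * i] = lo & mask;
        out[2 * i + 1] = hi & mask;
    }
    out
}

fn main() {
    // 0x81 >> 1 == 0x40 in every lane once the leaked bit is masked off.
    assert_eq!(ushr_i8x16_model([0x81; 16], 1), [0x40; 16]);
}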


@@ -522,12 +522,10 @@
             ;; Shift `src` using 16x8. Unfortunately, a 16x8 shift will only be
             ;; correct for half of the lanes; the others must be fixed up with
             ;; the mask below.
-            (unmasked Xmm (x64_psrlw src (mov_rmi_to_xmm masked_amt)))
-            (mask_addr SyntheticAmode (ushr_i8x16_mask masked_amt))
-            (mask Reg (x64_load $I8X16 mask_addr (ExtKind.None))))
+            (unmasked Xmm (x64_psrlw src (mov_rmi_to_xmm masked_amt))))
         (sse_and $I8X16
                  unmasked
-                 (RegMem.Reg mask))))
+                 (ushr_i8x16_mask masked_amt))))

 ;; Get the address of the mask to use when fixing up the lanes that weren't
 ;; correctly generated by the 16x8 shift.
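
Editor's note on the rule above: ushr_i8x16_mask yields the address of a row in a small constant table indexed by the masked shift amount, which is why the generated code below scales the amount by 16 (shlq $4) before adding it to the table's base address. The following Rust sketch is a hedged reconstruction of what such a table plausibly looks like, assuming the layout implied by that addressing (row amt = sixteen copies of 0xff >> amt), not a quote of the actual Cranelift constant.

// Hypothetical reconstruction of the mask table addressed by
// ushr_i8x16_mask: eight 16-byte rows, one per shift amount 0..=7, where
// row `amt` repeats `0xff >> amt` across all lanes. The generated code
// computes base + (amt << 4) to select a row and feeds that address
// straight into pand/vpand as a memory operand.
fn ushr_i8x16_mask_table() -> [[u8; 16]; 8] {
    let mut table = [[0u8; 16]; 8];
    for amt in 0..8 {
        table[amt] = [0xffu8 >> amt; 16];
    }
    table
}

fn main() {
    let table = ushr_i8x16_mask_table();
    assert_eq!(table[1], [0x7f; 16]);
    assert_eq!(table[7], [0x01; 16]);
}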


@@ -1635,8 +1635,7 @@ block0(v0: i8x16, v1: i32):
 ; vpsrlw %xmm0, %xmm5, %xmm7
 ; lea const(0), %rsi
 ; shlq $4, %r10, %r10
-; movdqu 0(%rsi,%r10,1), %xmm13
-; vpand %xmm7, %xmm13, %xmm0
+; vpand %xmm7, 0(%rsi,%r10,1), %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -1652,11 +1651,13 @@ block0(v0: i8x16, v1: i32):
 ; vpsrlw %xmm5, %xmm0, %xmm7
 ; leaq 0x15(%rip), %rsi
 ; shlq $4, %r10
-; movdqu (%rsi, %r10), %xmm13
-; vpand %xmm13, %xmm7, %xmm0
+; vpand (%rsi, %r10), %xmm7, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; retq
+; addb %al, (%rax)
+; addb %al, (%rax)
+; addb %bh, %bh

 function %i8x16_ushr_imm(i8x16) -> i8x16 {
 block0(v0: i8x16):
@@ -1670,8 +1671,7 @@ block0(v0: i8x16):
 ; movq %rsp, %rbp
 ; block0:
 ; vpsrlw %xmm0, $1, %xmm2
-; movdqu const(0), %xmm4
-; vpand %xmm2, %xmm4, %xmm0
+; vpand %xmm2, const(0), %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -1682,14 +1682,15 @@ block0(v0: i8x16):
 ; movq %rsp, %rbp
 ; block1: ; offset 0x4
 ; vpsrlw $1, %xmm0, %xmm2
-; movdqu 0xf(%rip), %xmm4
-; vpand %xmm4, %xmm2, %xmm0
+; vpand 0xf(%rip), %xmm2, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; retq
 ; addb %al, (%rax)
 ; addb %al, (%rax)
 ; addb %al, (%rax)
+; addb %al, (%rax)
+; addb %al, (%rax)
 ; jg 0xa1
 ; jg 0xa3
 ; jg 0xa5


@@ -485,8 +485,7 @@ block0:
 ; block0:
 ; movdqu const(1), %xmm0
 ; psrlw %xmm0, $1, %xmm0
-; movdqu const(0), %xmm3
-; pand %xmm0, %xmm3, %xmm0
+; pand %xmm0, const(0), %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -496,28 +495,21 @@ block0:
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block1: ; offset 0x4
-; movdqu 0x34(%rip), %xmm0
+; movdqu 0x24(%rip), %xmm0
 ; psrlw $1, %xmm0
-; movdqu 0x17(%rip), %xmm3
-; pand %xmm3, %xmm0
+; pand 7(%rip), %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; retq
 ; addb %al, (%rax)
-; addb %al, (%rax)
-; addb %al, (%rax)
-; addb %al, (%rax)
-; addb %al, (%rax)
-; addb %al, (%rax)
-; addb %al, (%rax)
-; jg 0xb1
-; jg 0xb3
-; jg 0xb5
-; jg 0xb7
-; jg 0xb9
-; jg 0xbb
-; jg 0xbd
-; jg 0xbf
+; jg 0xa1
+; jg 0xa3
+; jg 0xa5
+; jg 0xa7
+; jg 0xa9
+; jg 0xab
+; jg 0xad
+; jg 0xaf
 ; addb %al, (%rcx)
 ; addb (%rbx), %al
 ; addb $5, %al