x64: Improve codegen for i8x16.shr_u (#5906)
This catches a case that wasn't previously handled by #5880, allowing a constant load to be folded into the instruction that uses it rather than forcing it to be loaded into a temporary register first.
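In terms of the generated code (taken from the filetests updated below), the mask constant that fixes up the incorrectly shifted lanes is now referenced directly as the memory operand of the `pand`/`vpand`, instead of first being loaded into a temporary register with `movdqu`. A representative before/after from the AVX test:

Before:
    movdqu 0xf(%rip), %xmm4
    vpand  %xmm4, %xmm2, %xmm0

After:
    vpand  0xf(%rip), %xmm2, %xmm0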
@@ -522,12 +522,10 @@
             ;; Shift `src` using 16x8. Unfortunately, a 16x8 shift will only be
             ;; correct for half of the lanes; the others must be fixed up with
             ;; the mask below.
-            (unmasked Xmm (x64_psrlw src (mov_rmi_to_xmm masked_amt)))
-            (mask_addr SyntheticAmode (ushr_i8x16_mask masked_amt))
-            (mask Reg (x64_load $I8X16 mask_addr (ExtKind.None))))
+            (unmasked Xmm (x64_psrlw src (mov_rmi_to_xmm masked_amt))))
         (sse_and $I8X16
                  unmasked
-                 (RegMem.Reg mask))))
+                 (ushr_i8x16_mask masked_amt))))
 
 ;; Get the address of the mask to use when fixing up the lanes that weren't
 ;; correctly generated by the 16x8 shift.
@@ -1635,8 +1635,7 @@ block0(v0: i8x16, v1: i32):
 ; vpsrlw %xmm0, %xmm5, %xmm7
 ; lea const(0), %rsi
 ; shlq $4, %r10, %r10
-; movdqu 0(%rsi,%r10,1), %xmm13
-; vpand %xmm7, %xmm13, %xmm0
+; vpand %xmm7, 0(%rsi,%r10,1), %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -1652,11 +1651,13 @@ block0(v0: i8x16, v1: i32):
 ; vpsrlw %xmm5, %xmm0, %xmm7
 ; leaq 0x15(%rip), %rsi
 ; shlq $4, %r10
-; movdqu (%rsi, %r10), %xmm13
-; vpand %xmm13, %xmm7, %xmm0
+; vpand (%rsi, %r10), %xmm7, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; retq
+; addb %al, (%rax)
+; addb %al, (%rax)
+; addb %bh, %bh
 
 function %i8x16_ushr_imm(i8x16) -> i8x16 {
 block0(v0: i8x16):
@@ -1670,8 +1671,7 @@ block0(v0: i8x16):
 ; movq %rsp, %rbp
 ; block0:
 ; vpsrlw %xmm0, $1, %xmm2
-; movdqu const(0), %xmm4
-; vpand %xmm2, %xmm4, %xmm0
+; vpand %xmm2, const(0), %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -1682,14 +1682,15 @@ block0(v0: i8x16):
 ; movq %rsp, %rbp
 ; block1: ; offset 0x4
 ; vpsrlw $1, %xmm0, %xmm2
-; movdqu 0xf(%rip), %xmm4
-; vpand %xmm4, %xmm2, %xmm0
+; vpand 0xf(%rip), %xmm2, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; retq
 ; addb %al, (%rax)
 ; addb %al, (%rax)
 ; addb %al, (%rax)
+; addb %al, (%rax)
+; addb %al, (%rax)
 ; jg 0xa1
 ; jg 0xa3
 ; jg 0xa5
@@ -485,8 +485,7 @@ block0:
 ; block0:
 ; movdqu const(1), %xmm0
 ; psrlw %xmm0, $1, %xmm0
-; movdqu const(0), %xmm3
-; pand %xmm0, %xmm3, %xmm0
+; pand %xmm0, const(0), %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -496,28 +495,21 @@ block0:
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block1: ; offset 0x4
-; movdqu 0x34(%rip), %xmm0
+; movdqu 0x24(%rip), %xmm0
 ; psrlw $1, %xmm0
-; movdqu 0x17(%rip), %xmm3
-; pand %xmm3, %xmm0
+; pand 7(%rip), %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; retq
 ; addb %al, (%rax)
-; addb %al, (%rax)
-; addb %al, (%rax)
-; addb %al, (%rax)
-; addb %al, (%rax)
-; addb %al, (%rax)
-; addb %al, (%rax)
-; jg 0xb1
-; jg 0xb3
-; jg 0xb5
-; jg 0xb7
-; jg 0xb9
-; jg 0xbb
-; jg 0xbd
-; jg 0xbf
+; jg 0xa1
+; jg 0xa3
+; jg 0xa5
+; jg 0xa7
+; jg 0xa9
+; jg 0xab
+; jg 0xad
+; jg 0xaf
 ; addb %al, (%rcx)
 ; addb (%rbx), %al
 ; addb $5, %al