x64: Improve codegen for i8x16.shr_u (#5906)
This catches a case that #5880 didn't previously handle, allowing a constant load to be folded into the instruction that uses it rather than forcing the constant to be loaded into a temporary register first.
@@ -522,12 +522,10 @@
             ;; Shift `src` using 16x8. Unfortunately, a 16x8 shift will only be
             ;; correct for half of the lanes; the others must be fixed up with
             ;; the mask below.
-            (unmasked Xmm (x64_psrlw src (mov_rmi_to_xmm masked_amt)))
-            (mask_addr SyntheticAmode (ushr_i8x16_mask masked_amt))
-            (mask Reg (x64_load $I8X16 mask_addr (ExtKind.None))))
+            (unmasked Xmm (x64_psrlw src (mov_rmi_to_xmm masked_amt))))
         (sse_and $I8X16
                  unmasked
-                 (RegMem.Reg mask))))
+                 (ushr_i8x16_mask masked_amt))))
 
 ;; Get the address of the mask to use when fixing up the lanes that weren't
 ;; correctly generated by the 16x8 shift.
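To illustrate the effect, here is a rough sketch of the x64 this change avoids versus what it permits. The register choices and the constant-pool label `const_mask` are hypothetical, not taken from actual Cranelift output.

Before (mask loaded into a temporary register):

    movdqu  xmm1, [rip + const_mask]  ; load the fix-up mask into a temporary
    pand    xmm0, xmm1                ; apply the mask to the shifted lanes

After (constant load folded into the `and` as a memory operand):

    pand    xmm0, [rip + const_mask]  ; mask applied directly from memory

Folding the load saves an instruction and frees a temporary XMM register; this works because SSE `pand` accepts an xmm/m128 source operand, which is what passing the mask's address straight to `sse_and` exposes.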