x64: Elide more uextend with extractlane (#6045)
* x64: Elide more uextend with extractlane
I've confirmed locally now that `pextr{b,w,d}` all zero the upper bits
of the full 64-bit register size which means that the `extractlane`
operation with a zero-extend can be elided for more cases, including
8-to-64-bit casts as well as 32-to-64.
This helps elide a few extra `mov`s in a loop I was looking at and had a
modest corresponding increase in performance (my guess was due to the
slightly decreased code size mostly as opposed to the removed `mov`s).
* Remove stray file
This commit is contained in:
@@ -1715,14 +1715,11 @@
|
||||
(if-let $true (value32_zeros_upper32 src))
|
||||
src)
|
||||
|
||||
;; The `extractlane` instruction, extended to `$I32`, means that either an
|
||||
;; i8x16 or an i16x8 is being extracted. These are implemented with
|
||||
;; the `pextr{b,w}` instruction which automatically zero the upper bits of the
|
||||
;; destination register so the `uextend` in these cases can be elided.
|
||||
;;
|
||||
;; TODO: the documentation for `pextr{b,w}` seems to indicate it zero extends
|
||||
;; to not only 32-bits but probably the whole 64-bit register. If that's the
|
||||
;; case then this should match a zero-extend to any size instead of just `$I32`.
|
||||
;; The `extractlane` instruction, when paired with a zero-extension, means
|
||||
;; that one of `i{8x16,16x8,32x4}` is being extracted and extend. These
|
||||
;; extractions are implemented with the `pextr{b,w,d}` instructions which
|
||||
;; automatically zero the upper bits of the destination register so the
|
||||
;; `uextend` in these cases can be elided.
|
||||
;;
|
||||
;; TODO: the interaction here between this rule and the "it's written far away"
|
||||
;; rule to lower `extractlane` isn't great. Ideally this rule (and the other
|
||||
@@ -1730,7 +1727,7 @@
|
||||
;; be connected to the extractlane rules. There's some discussion of this on
|
||||
;; #6022 but the gist is that there's not a lot of great options at this time,
|
||||
;; so this doc block is what's here for now.
|
||||
(rule 1 (extend_to_gpr src @ (extractlane _ _) $I32 (ExtendKind.Zero))
|
||||
(rule 1 (extend_to_gpr src @ (extractlane _ _) _ (ExtendKind.Zero))
|
||||
src)
|
||||
|
||||
(rule (extend_to_gpr (and val (value_type from_ty))
|
||||
|
||||
@@ -3808,6 +3808,8 @@
|
||||
(rule 0 (lower (extractlane val @ (value_type ty @ (multi_lane 16 8)) (u8_from_uimm8 lane)))
|
||||
(x64_pextrw val lane))
|
||||
|
||||
;; See the note in the 8x16 case above for how this rule is connected to
|
||||
;; `extend_to_gpr`.
|
||||
(rule 0 (lower (extractlane val @ (value_type ty @ (multi_lane 32 4)) (u8_from_uimm8 lane)))
|
||||
(x64_pextrd val lane))
|
||||
|
||||
|
||||
@@ -79,6 +79,32 @@ block0(v0: i8x16):
|
||||
; popq %rbp
|
||||
; retq
|
||||
|
||||
function %extractlane_i8x16_i64(i8x16) -> i64 {
|
||||
block0(v0: i8x16):
|
||||
v1 = extractlane v0, 1
|
||||
v2 = uextend.i64 v1
|
||||
return v2
|
||||
}
|
||||
|
||||
; VCode:
|
||||
; pushq %rbp
|
||||
; movq %rsp, %rbp
|
||||
; block0:
|
||||
; pextrb $1, %xmm0, %rax
|
||||
; movq %rbp, %rsp
|
||||
; popq %rbp
|
||||
; ret
|
||||
;
|
||||
; Disassembled:
|
||||
; block0: ; offset 0x0
|
||||
; pushq %rbp
|
||||
; movq %rsp, %rbp
|
||||
; block1: ; offset 0x4
|
||||
; pextrb $1, %xmm0, %eax
|
||||
; movq %rbp, %rsp
|
||||
; popq %rbp
|
||||
; retq
|
||||
|
||||
function %extractlane_i16x8_i32(i16x8) -> i32 {
|
||||
block0(v0: i16x8):
|
||||
v1 = extractlane v0, 1
|
||||
@@ -105,3 +131,55 @@ block0(v0: i16x8):
|
||||
; popq %rbp
|
||||
; retq
|
||||
|
||||
function %extractlane_i16x8_i64(i16x8) -> i64 {
|
||||
block0(v0: i16x8):
|
||||
v1 = extractlane v0, 1
|
||||
v2 = uextend.i64 v1
|
||||
return v2
|
||||
}
|
||||
|
||||
; VCode:
|
||||
; pushq %rbp
|
||||
; movq %rsp, %rbp
|
||||
; block0:
|
||||
; pextrw $1, %xmm0, %rax
|
||||
; movq %rbp, %rsp
|
||||
; popq %rbp
|
||||
; ret
|
||||
;
|
||||
; Disassembled:
|
||||
; block0: ; offset 0x0
|
||||
; pushq %rbp
|
||||
; movq %rsp, %rbp
|
||||
; block1: ; offset 0x4
|
||||
; pextrw $1, %xmm0, %eax
|
||||
; movq %rbp, %rsp
|
||||
; popq %rbp
|
||||
; retq
|
||||
|
||||
function %extractlane_i32x4_i64(i32x4) -> i64 {
|
||||
block0(v0: i32x4):
|
||||
v1 = extractlane v0, 1
|
||||
v2 = uextend.i64 v1
|
||||
return v2
|
||||
}
|
||||
|
||||
; VCode:
|
||||
; pushq %rbp
|
||||
; movq %rsp, %rbp
|
||||
; block0:
|
||||
; pextrd $1, %xmm0, %rax
|
||||
; movq %rbp, %rsp
|
||||
; popq %rbp
|
||||
; ret
|
||||
;
|
||||
; Disassembled:
|
||||
; block0: ; offset 0x0
|
||||
; pushq %rbp
|
||||
; movq %rsp, %rbp
|
||||
; block1: ; offset 0x4
|
||||
; pextrd $1, %xmm0, %eax
|
||||
; movq %rbp, %rsp
|
||||
; popq %rbp
|
||||
; retq
|
||||
|
||||
|
||||
Reference in New Issue
Block a user