x64: Elide more uextend with extractlane (#6045)
* x64: Elide more uextend with extractlane
I've confirmed locally now that `pextr{b,w,d}` all zero the upper bits
of the full 64-bit destination register, which means that the zero-extend
of an `extractlane` result can be elided in more cases, including
8-to-64-bit casts as well as 32-to-64.
This helps elide a few extra `mov`s in a loop I was looking at and gave a
modest corresponding increase in performance (my guess is that this is mostly
due to the slightly decreased code size rather than the removed `mov`s themselves).
* Remove stray file
This commit is contained in:
@@ -1715,14 +1715,11 @@
|
|||||||
(if-let $true (value32_zeros_upper32 src))
|
(if-let $true (value32_zeros_upper32 src))
|
||||||
src)
|
src)
|
||||||
|
|
||||||
;; The `extractlane` instruction, extended to `$I32`, means that either an
|
;; The `extractlane` instruction, when paired with a zero-extension, means
|
||||||
;; i8x16 or an i16x8 is being extracted. These are implemented with
|
;; that one of `i{8x16,16x8,32x4}` is being extracted and extended. These
|
||||||
;; the `pextr{b,w}` instruction which automatically zero the upper bits of the
|
;; extractions are implemented with the `pextr{b,w,d}` instructions which
|
||||||
;; destination register so the `uextend` in these cases can be elided.
|
;; automatically zero the upper bits of the destination register so the
|
||||||
;;
|
;; `uextend` in these cases can be elided.
|
||||||
;; TODO: the documentation for `pextr{b,w}` seems to indicate it zero extends
|
|
||||||
;; to not only 32-bits but probably the whole 64-bit register. If that's the
|
|
||||||
;; case then this should match a zero-extend to any size instead of just `$I32`.
|
|
||||||
;;
|
;;
|
||||||
;; TODO: the interaction here between this rule and the "it's written far away"
|
;; TODO: the interaction here between this rule and the "it's written far away"
|
||||||
;; rule to lower `extractlane` isn't great. Ideally this rule (and the other
|
;; rule to lower `extractlane` isn't great. Ideally this rule (and the other
|
||||||
@@ -1730,7 +1727,7 @@
|
|||||||
;; be connected to the extractlane rules. There's some discussion of this on
|
;; be connected to the extractlane rules. There's some discussion of this on
|
||||||
;; #6022 but the gist is that there's not a lot of great options at this time,
|
;; #6022 but the gist is that there's not a lot of great options at this time,
|
||||||
;; so this doc block is what's here for now.
|
;; so this doc block is what's here for now.
|
||||||
(rule 1 (extend_to_gpr src @ (extractlane _ _) $I32 (ExtendKind.Zero))
|
(rule 1 (extend_to_gpr src @ (extractlane _ _) _ (ExtendKind.Zero))
|
||||||
src)
|
src)
|
||||||
|
|
||||||
(rule (extend_to_gpr (and val (value_type from_ty))
|
(rule (extend_to_gpr (and val (value_type from_ty))
|
||||||
|
|||||||
@@ -3808,6 +3808,8 @@
|
|||||||
(rule 0 (lower (extractlane val @ (value_type ty @ (multi_lane 16 8)) (u8_from_uimm8 lane)))
|
(rule 0 (lower (extractlane val @ (value_type ty @ (multi_lane 16 8)) (u8_from_uimm8 lane)))
|
||||||
(x64_pextrw val lane))
|
(x64_pextrw val lane))
|
||||||
|
|
||||||
|
;; See the note in the 8x16 case above for how this rule is connected to
|
||||||
|
;; `extend_to_gpr`.
|
||||||
(rule 0 (lower (extractlane val @ (value_type ty @ (multi_lane 32 4)) (u8_from_uimm8 lane)))
|
(rule 0 (lower (extractlane val @ (value_type ty @ (multi_lane 32 4)) (u8_from_uimm8 lane)))
|
||||||
(x64_pextrd val lane))
|
(x64_pextrd val lane))
|
||||||
|
|
||||||
|
|||||||
@@ -79,6 +79,32 @@ block0(v0: i8x16):
|
|||||||
; popq %rbp
|
; popq %rbp
|
||||||
; retq
|
; retq
|
||||||
|
|
||||||
|
function %extractlane_i8x16_i64(i8x16) -> i64 {
|
||||||
|
block0(v0: i8x16):
|
||||||
|
v1 = extractlane v0, 1
|
||||||
|
v2 = uextend.i64 v1
|
||||||
|
return v2
|
||||||
|
}
|
||||||
|
|
||||||
|
; VCode:
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block0:
|
||||||
|
; pextrb $1, %xmm0, %rax
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; ret
|
||||||
|
;
|
||||||
|
; Disassembled:
|
||||||
|
; block0: ; offset 0x0
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block1: ; offset 0x4
|
||||||
|
; pextrb $1, %xmm0, %eax
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; retq
|
||||||
|
|
||||||
function %extractlane_i16x8_i32(i16x8) -> i32 {
|
function %extractlane_i16x8_i32(i16x8) -> i32 {
|
||||||
block0(v0: i16x8):
|
block0(v0: i16x8):
|
||||||
v1 = extractlane v0, 1
|
v1 = extractlane v0, 1
|
||||||
@@ -105,3 +131,55 @@ block0(v0: i16x8):
|
|||||||
; popq %rbp
|
; popq %rbp
|
||||||
; retq
|
; retq
|
||||||
|
|
||||||
|
function %extractlane_i16x8_i64(i16x8) -> i64 {
|
||||||
|
block0(v0: i16x8):
|
||||||
|
v1 = extractlane v0, 1
|
||||||
|
v2 = uextend.i64 v1
|
||||||
|
return v2
|
||||||
|
}
|
||||||
|
|
||||||
|
; VCode:
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block0:
|
||||||
|
; pextrw $1, %xmm0, %rax
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; ret
|
||||||
|
;
|
||||||
|
; Disassembled:
|
||||||
|
; block0: ; offset 0x0
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block1: ; offset 0x4
|
||||||
|
; pextrw $1, %xmm0, %eax
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; retq
|
||||||
|
|
||||||
|
function %extractlane_i32x4_i64(i32x4) -> i64 {
|
||||||
|
block0(v0: i32x4):
|
||||||
|
v1 = extractlane v0, 1
|
||||||
|
v2 = uextend.i64 v1
|
||||||
|
return v2
|
||||||
|
}
|
||||||
|
|
||||||
|
; VCode:
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block0:
|
||||||
|
; pextrd $1, %xmm0, %rax
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; ret
|
||||||
|
;
|
||||||
|
; Disassembled:
|
||||||
|
; block0: ; offset 0x0
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block1: ; offset 0x4
|
||||||
|
; pextrd $1, %xmm0, %eax
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; retq
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user