x64: Elide more uextend with extractlane (#6045)
* x64: Elide more uextend with extractlane
I've confirmed locally now that `pextr{b,w,d}` all zero the upper bits
of the full 64-bit register size which means that the `extractlane`
operation with a zero-extend can be elided for more cases, including
8-to-64-bit casts as well as 32-to-64.
This helps elide a few extra `mov`s in a loop I was looking at and had a
modest corresponding increase in performance (my guess was due to the
slightly decreased code size mostly as opposed to the removed `mov`s).
* Remove stray file
This commit is contained in:
@@ -79,6 +79,32 @@ block0(v0: i8x16):
|
||||
; popq %rbp
|
||||
; retq
|
||||
|
||||
function %extractlane_i8x16_i64(i8x16) -> i64 {
|
||||
block0(v0: i8x16):
|
||||
v1 = extractlane v0, 1
|
||||
v2 = uextend.i64 v1
|
||||
return v2
|
||||
}
|
||||
|
||||
; VCode:
|
||||
; pushq %rbp
|
||||
; movq %rsp, %rbp
|
||||
; block0:
|
||||
; pextrb $1, %xmm0, %rax
|
||||
; movq %rbp, %rsp
|
||||
; popq %rbp
|
||||
; ret
|
||||
;
|
||||
; Disassembled:
|
||||
; block0: ; offset 0x0
|
||||
; pushq %rbp
|
||||
; movq %rsp, %rbp
|
||||
; block1: ; offset 0x4
|
||||
; pextrb $1, %xmm0, %eax
|
||||
; movq %rbp, %rsp
|
||||
; popq %rbp
|
||||
; retq
|
||||
|
||||
function %extractlane_i16x8_i32(i16x8) -> i32 {
|
||||
block0(v0: i16x8):
|
||||
v1 = extractlane v0, 1
|
||||
@@ -105,3 +131,55 @@ block0(v0: i16x8):
|
||||
; popq %rbp
|
||||
; retq
|
||||
|
||||
function %extractlane_i16x8_i64(i16x8) -> i64 {
|
||||
block0(v0: i16x8):
|
||||
v1 = extractlane v0, 1
|
||||
v2 = uextend.i64 v1
|
||||
return v2
|
||||
}
|
||||
|
||||
; VCode:
|
||||
; pushq %rbp
|
||||
; movq %rsp, %rbp
|
||||
; block0:
|
||||
; pextrw $1, %xmm0, %rax
|
||||
; movq %rbp, %rsp
|
||||
; popq %rbp
|
||||
; ret
|
||||
;
|
||||
; Disassembled:
|
||||
; block0: ; offset 0x0
|
||||
; pushq %rbp
|
||||
; movq %rsp, %rbp
|
||||
; block1: ; offset 0x4
|
||||
; pextrw $1, %xmm0, %eax
|
||||
; movq %rbp, %rsp
|
||||
; popq %rbp
|
||||
; retq
|
||||
|
||||
function %extractlane_i32x4_i64(i32x4) -> i64 {
|
||||
block0(v0: i32x4):
|
||||
v1 = extractlane v0, 1
|
||||
v2 = uextend.i64 v1
|
||||
return v2
|
||||
}
|
||||
|
||||
; VCode:
|
||||
; pushq %rbp
|
||||
; movq %rsp, %rbp
|
||||
; block0:
|
||||
; pextrd $1, %xmm0, %rax
|
||||
; movq %rbp, %rsp
|
||||
; popq %rbp
|
||||
; ret
|
||||
;
|
||||
; Disassembled:
|
||||
; block0: ; offset 0x0
|
||||
; pushq %rbp
|
||||
; movq %rsp, %rbp
|
||||
; block1: ; offset 0x4
|
||||
; pextrd $1, %xmm0, %eax
|
||||
; movq %rbp, %rsp
|
||||
; popq %rbp
|
||||
; retq
|
||||
|
||||
|
||||
Reference in New Issue
Block a user