x64: Optimize store-of-extract-lane-0 (#5924)

* x64: Optimize store-of-extract-lane-0

The `movss` and `movsd` instructions can store the 0th lane of a `t32x4`
or a `t64x2` vector (i.e. `i32x4`/`f32x4` or `i64x2`/`f64x2`) directly
into memory, allowing a `store` of an `extractlane` result to be fused
into a single instruction.

* Fix merge conflict with `main`
This commit is contained in:
Alex Crichton
2023-03-09 19:06:38 -06:00
committed by GitHub
parent 83f21e784a
commit 0ec7b872fa
2 changed files with 122 additions and 0 deletions

View File

@@ -151,3 +151,107 @@ block0(v0: f64x2):
; popq %rbp
; retq
;; Stores lane 0 of the `i32x4` argument (v0) to the address in v1.
;; The expected output that follows shows the `extractlane` + `store` pair
;; lowered to a single `movss %xmm0, 0(%rdi)`, with no separate
;; lane-extraction instruction.
;; NOTE(review): `;;` is used so these notes are not read as expected
;; output — confirm against the filetest runner's precise-output handling.
function %extract_i32x4_lane0_to_memory(i32x4, i64) {
block0(v0: i32x4, v1: i64):
v2 = extractlane v0, 0
store v2, v1
return
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movss %xmm0, 0(%rdi)
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movss %xmm0, (%rdi) ; trap: heap_oob
; movq %rbp, %rsp
; popq %rbp
; retq
;; Stores lane 0 of the `f32x4` argument (v0) to the address in v1.
;; The expected output that follows shows the `extractlane` + `store` pair
;; lowered to a single `movss %xmm0, 0(%rdi)`, the same instruction the
;; `i32x4` variant of this test expects.
;; NOTE(review): `;;` is used so these notes are not read as expected
;; output — confirm against the filetest runner's precise-output handling.
function %extract_f32x4_lane0_to_memory(f32x4, i64) {
block0(v0: f32x4, v1: i64):
v2 = extractlane v0, 0
store v2, v1
return
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movss %xmm0, 0(%rdi)
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movss %xmm0, (%rdi) ; trap: heap_oob
; movq %rbp, %rsp
; popq %rbp
; retq
;; Stores lane 0 of the `i64x2` argument (v0) to the address in v1.
;; The expected output that follows shows the `extractlane` + `store` pair
;; lowered to a single `movsd %xmm0, 0(%rdi)`, with no separate
;; lane-extraction instruction.
;; NOTE(review): `;;` is used so these notes are not read as expected
;; output — confirm against the filetest runner's precise-output handling.
function %extract_i64x2_lane0_to_memory(i64x2, i64) {
block0(v0: i64x2, v1: i64):
v2 = extractlane v0, 0
store v2, v1
return
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movsd %xmm0, 0(%rdi)
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movsd %xmm0, (%rdi) ; trap: heap_oob
; movq %rbp, %rsp
; popq %rbp
; retq
;; Stores lane 0 of the `f64x2` argument (v0) to the address in v1.
;; The expected output that follows shows the `extractlane` + `store` pair
;; lowered to a single `movsd %xmm0, 0(%rdi)`, the same instruction the
;; `i64x2` variant of this test expects.
;; NOTE(review): `;;` is used so these notes are not read as expected
;; output — confirm against the filetest runner's precise-output handling.
function %extract_f64x2_lane0_to_memory(f64x2, i64) {
block0(v0: f64x2, v1: i64):
v2 = extractlane v0, 0
store v2, v1
return
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movsd %xmm0, 0(%rdi)
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movsd %xmm0, (%rdi) ; trap: heap_oob
; movq %rbp, %rsp
; popq %rbp
; retq