;; x64: Optimize store-of-extract-lane-0
;;
;; The `movss` and `movsd` instructions can be used to store the 0th lane of a
;; `t32x4` or a `t64x2` vector into memory, enabling fusing a `store` and an
;; `extractlane` instruction.
;;
;; * Fix merge conflict with `main`
;;
;; Listing metadata: 258 lines, 4.1 KiB, plaintext.
;; Test directives for this file: each function below is compiled for x86_64
;; and is followed by `; VCode:` / `; Disassembled:` comments recording the
;; expected output.
;; NOTE(review): documentation comments should use `;;`; single-`;` comments
;; after a function are presumably read as expected output by the
;; precise-output harness -- confirm against the filetest docs.
test compile precise-output
target x86_64

;; %f1: return lane 1 of an `i8x16` vector as a scalar `i8`.
;; The expected output recorded after this function shows the extract lowering
;; to a single `pextrb $1` between the frame prologue and epilogue.
function %f1(i8x16) -> i8 {
block0(v0: i8x16):
    v1 = extractlane v0, 1
    return v1
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; pextrb $1, %xmm0, %rax
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; pextrb $1, %xmm0, %eax
; movq %rbp, %rsp
; popq %rbp
; retq

;; %f2: return lane 1 of an `i16x8` vector as a scalar `i16`.
;; The expected output recorded after this function shows the extract lowering
;; to a single `pextrw $1`.
function %f2(i16x8) -> i16 {
block0(v0: i16x8):
    v1 = extractlane v0, 1
    return v1
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; pextrw $1, %xmm0, %rax
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; pextrw $1, %xmm0, %eax
; movq %rbp, %rsp
; popq %rbp
; retq

;; %f3: return lane 1 of an `i32x4` vector as a scalar `i32`.
;; The expected output recorded after this function shows the extract lowering
;; to a single `pextrd $1`.
function %f3(i32x4) -> i32 {
block0(v0: i32x4):
    v1 = extractlane v0, 1
    return v1
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; pextrd $1, %xmm0, %rax
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; pextrd $1, %xmm0, %eax
; movq %rbp, %rsp
; popq %rbp
; retq

;; %f4: return lane 1 (the upper lane) of an `i64x2` vector as a scalar `i64`.
;; The expected output recorded after this function shows the extract lowering
;; to a single `pextrq $1`.
function %f4(i64x2) -> i64 {
block0(v0: i64x2):
    v1 = extractlane v0, 1
    return v1
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; pextrq $1, %xmm0, %rax
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; pextrq $1, %xmm0, %rax
; movq %rbp, %rsp
; popq %rbp
; retq

;; %f5: return lane 1 of an `f32x4` vector as a scalar `f32`.
;; The expected output recorded after this function shows a `pshufd $1` from
;; %xmm0 into %xmm0 rather than a move to a general-purpose register.
function %f5(f32x4) -> f32 {
block0(v0: f32x4):
    v1 = extractlane v0, 1
    return v1
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; pshufd $1, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; pshufd $1, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq

;; %f6: return lane 1 (the upper lane) of an `f64x2` vector as a scalar `f64`.
;; The expected output recorded after this function shows a `pshufd $238`
;; (0xee) from %xmm0 into %xmm0.
function %f6(f64x2) -> f64 {
block0(v0: f64x2):
    v1 = extractlane v0, 1
    return v1
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; pshufd $238, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; pshufd $0xee, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq

;; %extract_i32x4_lane0_to_memory: extract lane 0 of an `i32x4` vector (v0)
;; and store it at the address in v1; nothing is returned.
;; The expected output recorded after this function shows the extract and the
;; store emitted as a single `movss %xmm0, 0(%rdi)`.
function %extract_i32x4_lane0_to_memory(i32x4, i64) {
block0(v0: i32x4, v1: i64):
    v2 = extractlane v0, 0
    store v2, v1
    return
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movss %xmm0, 0(%rdi)
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movss %xmm0, (%rdi) ; trap: heap_oob
; movq %rbp, %rsp
; popq %rbp
; retq

;; %extract_f32x4_lane0_to_memory: extract lane 0 of an `f32x4` vector (v0)
;; and store it at the address in v1; nothing is returned.
;; The expected output recorded after this function shows the extract and the
;; store emitted as a single `movss %xmm0, 0(%rdi)`.
function %extract_f32x4_lane0_to_memory(f32x4, i64) {
block0(v0: f32x4, v1: i64):
    v2 = extractlane v0, 0
    store v2, v1
    return
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movss %xmm0, 0(%rdi)
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movss %xmm0, (%rdi) ; trap: heap_oob
; movq %rbp, %rsp
; popq %rbp
; retq

;; %extract_i64x2_lane0_to_memory: extract lane 0 of an `i64x2` vector (v0)
;; and store it at the address in v1; nothing is returned.
;; The expected output recorded after this function shows the extract and the
;; store emitted as a single `movsd %xmm0, 0(%rdi)`.
function %extract_i64x2_lane0_to_memory(i64x2, i64) {
block0(v0: i64x2, v1: i64):
    v2 = extractlane v0, 0
    store v2, v1
    return
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movsd %xmm0, 0(%rdi)
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movsd %xmm0, (%rdi) ; trap: heap_oob
; movq %rbp, %rsp
; popq %rbp
; retq

;; %extract_f64x2_lane0_to_memory: extract lane 0 of an `f64x2` vector (v0)
;; and store it at the address in v1; nothing is returned.
;; The expected output recorded after this function shows the extract and the
;; store emitted as a single `movsd %xmm0, 0(%rdi)`.
function %extract_f64x2_lane0_to_memory(f64x2, i64) {
block0(v0: f64x2, v1: i64):
    v2 = extractlane v0, 0
    store v2, v1
    return
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movsd %xmm0, 0(%rdi)
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movsd %xmm0, (%rdi) ; trap: heap_oob
; movq %rbp, %rsp
; popq %rbp
; retq
