* x64: Add non-SSE4.1 lowerings of `pmov{s,z}x*`

  This commit adds lowerings for a suite of sign/zero-extension
  instructions which don't require SSE4.1. As before, these lowerings
  are based on LLVM's output (the general shape of the fallback pattern
  is sketched below this list).

  This commit also deletes the special cases for `i16x8.extmul_{low,high}_*`,
  since each special case produced the same output as the default lowering
  of the component instructions it was built from.
* Remove SSE4.1 specialization of `uwiden_high`

  LLVM prefers the `punpckh*`-based lowerings, and at least according to
  `llvm-mca` they are slightly better cycle-wise too (see the second
  sketch below).
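
For context, here is a minimal sketch of the SSE2-only widening pattern the new fallbacks rely on, shown for the i8x16-to-i16x8 case. Register choices and exact instruction selection are illustrative, not the literal ISLE rules; note that the precise-output test below runs with SSE4.1 available, so its `swiden_low`/`uwiden_low` cases still select `pmovsx*`/`pmovzx*`.

```asm
; Sketch: `swiden_low` on i8x16 without SSE4.1.
punpcklbw %xmm0, %xmm0   ; duplicate each low byte into both halves of a word
psraw     $8, %xmm0      ; arithmetic shift right sign-extends each byte

; Sketch: `uwiden_low` on i8x16 without SSE4.1.
pxor      %xmm1, %xmm1   ; zeroed scratch register
punpcklbw %xmm1, %xmm0   ; interleaving with zeros zero-extends each byte
```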
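Concretely, the retained `punpckh*` lowering for `uwiden_high` needs only a zeroed scratch register (this is what `%f10` in the test below produces), while the removed SSE4.1 specialization had to shuffle the high half down before `pmovzx*` could widen it. The second pair of instructions is an illustrative reconstruction, not a quote of the deleted rule:

```asm
; Retained: `uwiden_high` on i8x16 via punpckhbw (matches %f10 below).
pxor      %xmm3, %xmm3      ; zeroed scratch register
punpckhbw %xmm3, %xmm0      ; interleave the high bytes with zeros

; Removed SSE4.1 specialization, roughly:
palignr   $8, %xmm0, %xmm0  ; rotate the high eight bytes into the low half
pmovzxbw  %xmm0, %xmm0      ; zero-extend the now-low bytes
```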
test compile precise-output
target x86_64

function %f1(i8x16) -> i16x8 {
block0(v0: i8x16):
    v1 = swiden_low v0
    return v1
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; pmovsxbw %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; pmovsxbw %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq

function %f2(i16x8) -> i32x4 {
block0(v0: i16x8):
    v1 = swiden_low v0
    return v1
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; pmovsxwd %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; pmovsxwd %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq

function %f3(i32x4) -> i64x2 {
block0(v0: i32x4):
    v1 = swiden_low v0
    return v1
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; pmovsxdq %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; pmovsxdq %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq

function %f4(i8x16) -> i16x8 {
block0(v0: i8x16):
    v1 = swiden_high v0
    return v1
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movdqa %xmm0, %xmm2
; palignr $8, %xmm2, %xmm0, %xmm2
; pmovsxbw %xmm2, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movdqa %xmm0, %xmm2
; palignr $8, %xmm0, %xmm2
; pmovsxbw %xmm2, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq

function %f5(i16x8) -> i32x4 {
block0(v0: i16x8):
    v1 = swiden_high v0
    return v1
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movdqa %xmm0, %xmm2
; palignr $8, %xmm2, %xmm0, %xmm2
; pmovsxwd %xmm2, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movdqa %xmm0, %xmm2
; palignr $8, %xmm0, %xmm2
; pmovsxwd %xmm2, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq

function %f6(i32x4) -> i64x2 {
block0(v0: i32x4):
    v1 = swiden_high v0
    return v1
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; pshufd $238, %xmm0, %xmm2
; pmovsxdq %xmm2, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; pshufd $0xee, %xmm0, %xmm2
; pmovsxdq %xmm2, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq

function %f7(i8x16) -> i16x8 {
block0(v0: i8x16):
    v1 = uwiden_low v0
    return v1
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; pmovzxbw %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; pmovzxbw %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq

function %f8(i16x8) -> i32x4 {
block0(v0: i16x8):
    v1 = uwiden_low v0
    return v1
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; pmovzxwd %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; pmovzxwd %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq

function %f9(i32x4) -> i64x2 {
block0(v0: i32x4):
    v1 = uwiden_low v0
    return v1
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; pmovzxdq %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; pmovzxdq %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq

function %f10(i8x16) -> i16x8 {
block0(v0: i8x16):
    v1 = uwiden_high v0
    return v1
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; uninit %xmm3
; pxor %xmm3, %xmm3, %xmm3
; punpckhbw %xmm0, %xmm3, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; pxor %xmm3, %xmm3
; punpckhbw %xmm3, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq

function %f11(i16x8) -> i32x4 {
block0(v0: i16x8):
    v1 = uwiden_high v0
    return v1
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; uninit %xmm3
; pxor %xmm3, %xmm3, %xmm3
; punpckhwd %xmm0, %xmm3, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; pxor %xmm3, %xmm3
; punpckhwd %xmm3, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq

function %f12(i32x4) -> i64x2 {
block0(v0: i32x4):
    v1 = uwiden_high v0
    return v1
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; uninit %xmm3
; xorps %xmm3, %xmm3, %xmm3
; unpckhps %xmm0, %xmm3, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; xorps %xmm3, %xmm3
; unpckhps %xmm3, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq