x64: Add non-SSE4.1 lowerings of pmov{s,z}x* (#6279)
* x64: Add non-SSE4.1 lowerings of `pmov{s,z}x*`
This commit adds lowerings for a suite of sign/zero-extension instructions
that don't require SSE4.1. As before, these lowerings are based on LLVM's
output.
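A rough sketch of the idea in Rust intrinsics (illustrative only, not the
actual ISLE rules): the zero-extension form below mirrors the `pxor` +
`punpckhbw` sequences in the updated test expectations, while the
sign-extension form is the textbook unpack-and-shift trick and is only an
assumption about what gets emitted.

```rust
// Illustrative sketch, not the actual ISLE lowering rules. Function names are
// made up for this example.
use core::arch::x86_64::*;

// uwiden_high on i8x16 without SSE4.1: interleave the high bytes with zeros.
#[target_feature(enable = "sse2")]
unsafe fn uwiden_high_i8x16(v: __m128i) -> __m128i {
    _mm_unpackhi_epi8(v, _mm_setzero_si128()) // punpckhbw v, zero
}

// swiden_high on i8x16 without SSE4.1 (assumed, textbook trick): duplicate
// each high byte into a 16-bit lane, then arithmetic-shift right to
// sign-extend.
#[target_feature(enable = "sse2")]
unsafe fn swiden_high_i8x16(v: __m128i) -> __m128i {
    _mm_srai_epi16::<8>(_mm_unpackhi_epi8(v, v)) // punpckhbw v, v; psraw $8
}

// The SSE4.1 path: move the high half down, then pmovzxbw.
#[target_feature(enable = "sse4.1")]
unsafe fn uwiden_high_i8x16_sse41(v: __m128i) -> __m128i {
    _mm_cvtepu8_epi16(_mm_srli_si128::<8>(v)) // palignr/psrldq $8; pmovzxbw
}
```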
This commit also deletes the special cases for `i16x8.extmul_{low,high}_*`,
since their output is the same as the default lowering of the component
instructions they are built from.
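For example, `i16x8.extmul_high_i8x16_s` now goes through the generic
widen-both-operands-then-multiply path. A hedged sketch of what that
composition computes on an SSE4.1 target, in Rust intrinsics (the function
name is illustrative, not a Cranelift API):

```rust
// Hypothetical sketch of the composed lowering of i16x8.extmul_high_i8x16_s:
// sign-extend the high halves of both operands to i16 lanes, then do a
// lane-wise 16-bit multiply, mirroring the palignr/pmovsxbw/pmullw sequences
// in the updated test expectations.
use core::arch::x86_64::*;

#[target_feature(enable = "sse4.1")]
unsafe fn extmul_high_i8x16_s(a: __m128i, b: __m128i) -> __m128i {
    let a_hi = _mm_cvtepi8_epi16(_mm_srli_si128::<8>(a)); // swiden_high(a)
    let b_hi = _mm_cvtepi8_epi16(_mm_srli_si128::<8>(b)); // swiden_high(b)
    _mm_mullo_epi16(a_hi, b_hi)                           // imul on i16x8
}
```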
* Remove SSE4.1 specialization of `uwiden_high`
LLVM prefers the `punpckh*`-based lowerings, and, at least according to
`llvm-mca`, they are also slightly better cycle-wise.
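Both lowerings compute the same result either way; here is a small
illustrative check (not part of the test suite) using the vector from the
`uwidenhigh_i8x16` run test, assuming an SSE4.1-capable host:

```rust
// Compare the punpckhbw-based and pmovzxbw-based lowerings of uwiden_high on
// the same input the uwidenhigh_i8x16 run test uses. Assumes the host CPU
// supports SSE4.1.
use core::arch::x86_64::*;

fn main() {
    unsafe {
        let input: [u8; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 0x80, 10, 0xff, 11, 0x92, 12, 0x70];
        let v = _mm_loadu_si128(input.as_ptr() as *const __m128i);

        // punpckh-based lowering (plain SSE2).
        let via_unpack = _mm_unpackhi_epi8(v, _mm_setzero_si128());
        // pmovzx-based lowering (SSE4.1).
        let via_pmovzx = _mm_cvtepu8_epi16(_mm_srli_si128::<8>(v));

        let (mut a, mut b) = ([0u16; 8], [0u16; 8]);
        _mm_storeu_si128(a.as_mut_ptr() as *mut __m128i, via_unpack);
        _mm_storeu_si128(b.as_mut_ptr() as *mut __m128i, via_pmovzx);
        assert_eq!(a, b);
        assert_eq!(a, [9, 0x80, 10, 0xff, 11, 0x92, 12, 0x70]);
    }
}
```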
@@ -1204,8 +1204,9 @@ block0(v0: i8x16):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vpalignr $8, %xmm0, %xmm0, %xmm2
; vpmovzxbw %xmm2, %xmm0
; uninit %xmm2
; vpxor %xmm2, %xmm2, %xmm4
; vpunpckhbw %xmm0, %xmm4, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
@@ -1215,8 +1216,8 @@ block0(v0: i8x16):
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vpalignr $8, %xmm0, %xmm0, %xmm2
; vpmovzxbw %xmm2, %xmm0
; vpxor %xmm2, %xmm2, %xmm4
; vpunpckhbw %xmm4, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq

@@ -16,13 +16,13 @@ block0(v0: i8x16, v1: i8x16):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movdqa %xmm0, %xmm3
; palignr $8, %xmm3, %xmm0, %xmm3
; pmovsxbw %xmm3, %xmm0
; movdqa %xmm1, %xmm7
; palignr $8, %xmm7, %xmm1, %xmm7
; pmovsxbw %xmm7, %xmm9
; pmullw %xmm0, %xmm9, %xmm0
; movdqa %xmm0, %xmm6
; palignr $8, %xmm6, %xmm0, %xmm6
; pmovsxbw %xmm6, %xmm0
; movdqa %xmm1, %xmm6
; palignr $8, %xmm6, %xmm1, %xmm6
; pmovsxbw %xmm6, %xmm8
; pmullw %xmm0, %xmm8, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
@@ -32,13 +32,13 @@ block0(v0: i8x16, v1: i8x16):
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movdqa %xmm0, %xmm3
; palignr $8, %xmm0, %xmm3
; pmovsxbw %xmm3, %xmm0
; movdqa %xmm1, %xmm7
; palignr $8, %xmm1, %xmm7
; pmovsxbw %xmm7, %xmm9
; pmullw %xmm9, %xmm0
; movdqa %xmm0, %xmm6
; palignr $8, %xmm0, %xmm6
; pmovsxbw %xmm6, %xmm0
; movdqa %xmm1, %xmm6
; palignr $8, %xmm1, %xmm6
; pmovsxbw %xmm6, %xmm8
; pmullw %xmm8, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
@@ -226,13 +226,14 @@ block0(v0: i8x16, v1: i8x16):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movdqa %xmm0, %xmm3
; palignr $8, %xmm3, %xmm0, %xmm3
; pmovzxbw %xmm3, %xmm0
; movdqa %xmm1, %xmm7
; palignr $8, %xmm7, %xmm1, %xmm7
; pmovzxbw %xmm7, %xmm9
; pmullw %xmm0, %xmm9, %xmm0
; uninit %xmm8
; pxor %xmm8, %xmm8, %xmm8
; punpckhbw %xmm0, %xmm8, %xmm0
; uninit %xmm8
; pxor %xmm8, %xmm8, %xmm8
; movdqa %xmm1, %xmm11
; punpckhbw %xmm11, %xmm8, %xmm11
; pmullw %xmm0, %xmm11, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
@@ -242,13 +243,12 @@ block0(v0: i8x16, v1: i8x16):
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movdqa %xmm0, %xmm3
; palignr $8, %xmm0, %xmm3
; pmovzxbw %xmm3, %xmm0
; movdqa %xmm1, %xmm7
; palignr $8, %xmm1, %xmm7
; pmovzxbw %xmm7, %xmm9
; pmullw %xmm9, %xmm0
; pxor %xmm8, %xmm8
; punpckhbw %xmm8, %xmm0
; pxor %xmm8, %xmm8
; movdqa %xmm1, %xmm11
; punpckhbw %xmm8, %xmm11
; pmullw %xmm11, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq

@@ -12,9 +12,10 @@ block0(v0: i64, v2: i8x16):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movdqu 80(%rdi), %xmm3
; palignr $8, %xmm3, %xmm3, %xmm3
; pmovzxbw %xmm3, %xmm0
; movdqu 80(%rdi), %xmm0
; uninit %xmm5
; pxor %xmm5, %xmm5, %xmm5
; punpckhbw %xmm0, %xmm5, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
@@ -24,9 +25,9 @@ block0(v0: i64, v2: i8x16):
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movdqu 0x50(%rdi), %xmm3
; palignr $8, %xmm3, %xmm3
; pmovzxbw %xmm3, %xmm0
; movdqu 0x50(%rdi), %xmm0
; pxor %xmm5, %xmm5
; punpckhbw %xmm5, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq

@@ -246,9 +246,9 @@ block0(v0: i8x16):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movdqa %xmm0, %xmm2
; palignr $8, %xmm2, %xmm0, %xmm2
; pmovzxbw %xmm2, %xmm0
; uninit %xmm3
; pxor %xmm3, %xmm3, %xmm3
; punpckhbw %xmm0, %xmm3, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
@@ -258,9 +258,8 @@ block0(v0: i8x16):
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movdqa %xmm0, %xmm2
; palignr $8, %xmm0, %xmm2
; pmovzxbw %xmm2, %xmm0
; pxor %xmm3, %xmm3
; punpckhbw %xmm3, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
@@ -275,9 +274,9 @@ block0(v0: i16x8):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movdqa %xmm0, %xmm2
; palignr $8, %xmm2, %xmm0, %xmm2
; pmovzxwd %xmm2, %xmm0
; uninit %xmm3
; pxor %xmm3, %xmm3, %xmm3
; punpckhwd %xmm0, %xmm3, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
@@ -287,9 +286,8 @@ block0(v0: i16x8):
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movdqa %xmm0, %xmm2
; palignr $8, %xmm0, %xmm2
; pmovzxwd %xmm2, %xmm0
; pxor %xmm3, %xmm3
; punpckhwd %xmm3, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
@@ -304,8 +302,9 @@ block0(v0: i32x4):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; pshufd $238, %xmm0, %xmm2
; pmovzxdq %xmm2, %xmm0
; uninit %xmm3
; xorps %xmm3, %xmm3, %xmm3
; unpckhps %xmm0, %xmm3, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
@@ -315,8 +314,8 @@ block0(v0: i32x4):
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; pshufd $0xee, %xmm0, %xmm2
; pmovzxdq %xmm2, %xmm0
; xorps %xmm3, %xmm3
; unpckhps %xmm3, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq

@@ -2,9 +2,11 @@ test interpret
test run
target aarch64
target s390x
target x86_64 ssse3 has_sse41=false
set enable_simd
target x86_64 has_sse3 has_ssse3 has_sse41
target x86_64 has_sse3 has_ssse3 has_sse41 has_avx
target x86_64
target x86_64 sse41
target x86_64 sse41 has_avx

function %swidenhigh_i8x16(i8x16) -> i16x8 {
block0(v0: i8x16):

@@ -2,9 +2,11 @@ test interpret
test run
target aarch64
target s390x
target x86_64 ssse3 has_sse41=false
set enable_simd
target x86_64 has_sse3 has_ssse3 has_sse41
target x86_64 has_sse3 has_ssse3 has_sse41 has_avx
target x86_64
target x86_64 sse41
target x86_64 sse41 has_avx

function %swidenlow_i8x16(i8x16) -> i16x8 {
block0(v0: i8x16):

@@ -2,9 +2,11 @@ test interpret
test run
target aarch64
target s390x
target x86_64 ssse3 has_sse41=false
set enable_simd
target x86_64 has_sse3 has_ssse3 has_sse41
target x86_64 has_sse3 has_ssse3 has_sse41 has_avx
target x86_64
target x86_64 sse41
target x86_64 sse41 has_avx

function %uwidenhigh_i8x16(i8x16) -> i16x8 {
block0(v0: i8x16):
@@ -12,6 +14,7 @@ block0(v0: i8x16):
return v1
}
; run: %uwidenhigh_i8x16([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]) == [9 10 11 12 13 14 15 16]
; run: %uwidenhigh_i8x16([1 2 3 4 5 6 7 8 9 0x80 10 0xff 11 0x92 12 0x70]) == [9 0x80 10 0xff 11 0x92 12 0x70]

function %uwidenhigh_i16x8(i16x8) -> i32x4 {
block0(v0: i16x8):
@@ -19,6 +22,7 @@ block0(v0: i16x8):
return v1
}
; run: %uwidenhigh_i16x8([1 2 3 4 5 6 7 8]) == [5 6 7 8]
; run: %uwidenhigh_i16x8([9 10 11 12 13 14 -1 -2]) == [13 14 0xffff 0xfffe]

function %uwidenhigh_i32x4(i32x4) -> i64x2 {
block0(v0: i32x4):
@@ -26,3 +30,4 @@ block0(v0: i32x4):
return v1
}
; run: %uwidenhigh_i32x4([1 2 3 4]) == [3 4]
; run: %uwidenhigh_i32x4([4 5 6 -1]) == [6 0xffffffff]

@@ -2,9 +2,11 @@ test interpret
test run
target aarch64
target s390x
target x86_64 ssse3 has_sse41=false
set enable_simd
target x86_64 has_sse3 has_ssse3 has_sse41
target x86_64 has_sse3 has_ssse3 has_sse41 has_avx
target x86_64
target x86_64 sse41
target x86_64 sse41 has_avx

function %uwidenlow_i8x16(i8x16) -> i16x8 {
block0(v0: i8x16):

@@ -125,15 +125,15 @@
;; movq %rsp, %rbp
;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 }
;; block0:
;; vpmovsxbw %xmm0, %xmm10
;; vpmovsxbw %xmm1, %xmm12
;; vpmullw %xmm10, %xmm12, %xmm14
;; vpalignr $8, %xmm0, %xmm0, %xmm8
;; vpmovsxbw %xmm8, %xmm10
;; vpalignr $8, %xmm1, %xmm1, %xmm12
;; vpmovsxbw %xmm12, %xmm15
;; vpmullw %xmm10, %xmm15, %xmm0
;; vphaddw %xmm14, %xmm0, %xmm0
;; vpmovsxbw %xmm0, %xmm12
;; vpmovsxbw %xmm1, %xmm13
;; vpmullw %xmm12, %xmm13, %xmm12
;; vpalignr $8, %xmm0, %xmm0, %xmm11
;; vpmovsxbw %xmm11, %xmm13
;; vpalignr $8, %xmm1, %xmm1, %xmm11
;; vpmovsxbw %xmm11, %xmm14
;; vpmullw %xmm13, %xmm14, %xmm13
;; vphaddw %xmm12, %xmm13, %xmm0
;; jmp label1
;; block1:
;; movq %rbp, %rsp
@@ -146,15 +146,15 @@
;; movq %rsp, %rbp
;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 }
;; block0:
;; vpmovsxbw %xmm0, %xmm13
;; vpmovsxbw %xmm1, %xmm15
;; vpmullw %xmm13, %xmm15, %xmm3
;; vpalignr $8, %xmm0, %xmm0, %xmm11
;; vpmovsxbw %xmm11, %xmm13
;; vpalignr $8, %xmm1, %xmm1, %xmm15
;; vpmovsxbw %xmm15, %xmm1
;; vpmullw %xmm13, %xmm1, %xmm4
;; vphaddw %xmm3, %xmm4, %xmm15
;; vpmovsxbw %xmm0, %xmm15
;; vpmovsxbw %xmm1, %xmm3
;; vpmullw %xmm15, %xmm3, %xmm15
;; vpalignr $8, %xmm0, %xmm0, %xmm14
;; vpmovsxbw %xmm14, %xmm0
;; vpalignr $8, %xmm1, %xmm1, %xmm14
;; vpmovsxbw %xmm14, %xmm1
;; vpmullw %xmm0, %xmm1, %xmm0
;; vphaddw %xmm15, %xmm0, %xmm15
;; vpmaddwd %xmm15, const(0), %xmm15
;; vpaddd %xmm15, %xmm2, %xmm0
;; jmp label1