x64: Add non-SSE4.1 lowerings of pmov{s,z}x* (#6279)
* x64: Add non-SSE4.1 lowerings of `pmov{s,z}x*`
This commit adds lowerings for a suite of sign/zero-extension
instructions which don't require SSE4.1. As before, these lowerings are
based on LLVM's output.

This commit also deletes the special cases for `i16x8.extmul_{low,high}_*`,
since the output of each special case is the same as the default lowering
of the component instructions it is built from.
* Remove SSE4.1 specialization of `uwiden_high`

LLVM prefers the `punpckh*`-based lowerings, and, at least according to
`llvm-mca`, they are also slightly better cycle-wise.
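As background (not part of the commit itself), the non-SSE4.1 lowerings come down to classic SSE2 interleave idioms. A minimal Rust sketch using `core::arch` intrinsics; the function names are illustrative only and are not Cranelift APIs:

```rust
// Sketch of the SSE2-era widening idioms the new lowerings rely on.
// Function names are illustrative, not Cranelift APIs.
use core::arch::x86_64::*;

// swiden_low on i8x16 without SSE4.1's pmovsxbw: pair each low byte with
// itself (punpcklbw), then arithmetic-shift each 16-bit lane right by 8.
unsafe fn swiden_low_i8x16(v: __m128i) -> __m128i {
    _mm_srai_epi16::<8>(_mm_unpacklo_epi8(v, v))
}

// uwiden_low on i8x16 without pmovzxbw: interleave the low bytes with zeros
// so each byte lands in the low half of a 16-bit lane.
unsafe fn uwiden_low_i8x16(v: __m128i) -> __m128i {
    _mm_unpacklo_epi8(v, _mm_setzero_si128())
}

// uwiden_high on i8x16: the punpckhbw-based form this commit now prefers
// even when SSE4.1 is available, pairing the high bytes with zeros instead.
unsafe fn uwiden_high_i8x16(v: __m128i) -> __m128i {
    _mm_unpackhi_epi8(v, _mm_setzero_si128())
}
```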
@@ -908,6 +908,7 @@
Ucomiss
Ucomisd
Unpcklps
Unpckhps
Xorps
Xorpd
Phaddw
@@ -1183,6 +1184,7 @@
Vpunpckhwd
Vpunpcklwd
Vunpcklps
Vunpckhps
Vandnps
Vandnpd
Vpandn
@@ -2901,6 +2903,14 @@
(if-let $true (use_avx_simd))
(xmm_rmir_vex (AvxOpcode.Vunpcklps) src1 src2))

;; Helper for creating `unpckhps` instructions.
(decl x64_unpckhps (Xmm XmmMem) Xmm)
(rule 0 (x64_unpckhps src1 src2)
(xmm_rm_r (SseOpcode.Unpckhps) src1 src2))
(rule 1 (x64_unpckhps src1 src2)
(if-let $true (use_avx_simd))
(xmm_rmir_vex (AvxOpcode.Vunpckhps) src1 src2))

;; Helper for creating `andnps` instructions.
(decl x64_andnps (Xmm XmmMem) Xmm)
(rule 0 (x64_andnps src1 src2)
@@ -4908,6 +4918,7 @@
(convert Xmm XmmMemAligned xmm_to_xmm_mem_aligned)
(convert XmmMem XmmMemImm xmm_mem_to_xmm_mem_imm)
(convert XmmMem RegMem xmm_mem_to_reg_mem)
(convert RegMemImm XmmMemImm xmm_mem_imm_new)
(convert WritableXmm Xmm writable_xmm_to_xmm)
(convert WritableXmm WritableReg writable_xmm_to_reg)
(convert WritableXmm Reg writable_xmm_to_r_reg)

@@ -1116,6 +1116,7 @@ pub enum SseOpcode {
Ucomiss,
Ucomisd,
Unpcklps,
Unpckhps,
Xorps,
Xorpd,
Phaddw,
@@ -1168,6 +1169,7 @@ impl SseOpcode {
| SseOpcode::Subss
| SseOpcode::Ucomiss
| SseOpcode::Unpcklps
| SseOpcode::Unpckhps
| SseOpcode::Xorps => SSE,

SseOpcode::Addpd
@@ -1516,6 +1518,7 @@ impl fmt::Debug for SseOpcode {
SseOpcode::Ucomiss => "ucomiss",
SseOpcode::Ucomisd => "ucomisd",
SseOpcode::Unpcklps => "unpcklps",
SseOpcode::Unpckhps => "unpckhps",
SseOpcode::Xorps => "xorps",
SseOpcode::Xorpd => "xorpd",
SseOpcode::Phaddw => "phaddw",
@@ -1611,6 +1614,7 @@ impl AvxOpcode {
| AvxOpcode::Vpunpckhwd
| AvxOpcode::Vpunpcklwd
| AvxOpcode::Vunpcklps
| AvxOpcode::Vunpckhps
| AvxOpcode::Vaddps
| AvxOpcode::Vaddpd
| AvxOpcode::Vsubps

@@ -2060,6 +2060,7 @@ pub(crate) fn emit(
SseOpcode::Subss => (LegacyPrefixes::_F3, 0x0F5C, 2),
SseOpcode::Subsd => (LegacyPrefixes::_F2, 0x0F5C, 2),
SseOpcode::Unpcklps => (LegacyPrefixes::None, 0x0F14, 2),
SseOpcode::Unpckhps => (LegacyPrefixes::None, 0x0F15, 2),
SseOpcode::Xorps => (LegacyPrefixes::None, 0x0F57, 2),
SseOpcode::Xorpd => (LegacyPrefixes::_66, 0x0F57, 2),
SseOpcode::Phaddw => (LegacyPrefixes::_66, 0x0F3801, 3),
@@ -2206,6 +2207,7 @@ pub(crate) fn emit(
AvxOpcode::Vpunpckhwd => (LP::_66, OM::_0F, 0x69),
AvxOpcode::Vpunpcklwd => (LP::_66, OM::_0F, 0x61),
AvxOpcode::Vunpcklps => (LP::None, OM::_0F, 0x14),
AvxOpcode::Vunpckhps => (LP::None, OM::_0F, 0x15),
AvxOpcode::Vaddps => (LP::None, OM::_0F, 0x58),
AvxOpcode::Vaddpd => (LP::_66, OM::_0F, 0x58),
AvxOpcode::Vsubps => (LP::None, OM::_0F, 0x5C),

@@ -982,20 +982,6 @@
;; al_bl + aa_bb_shifted
(x64_paddq al_bl aa_bb_shifted)))

;; Special case for `i16x8.extmul_high_i8x16_s`.
(rule 1 (lower (has_type (multi_lane 16 8)
(imul (swiden_high (and (value_type (multi_lane 8 16))
x))
(swiden_high (and (value_type (multi_lane 8 16))
y)))))
(let ((x1 Xmm x)
(x2 Xmm (x64_palignr x1 x1 8))
(x3 Xmm (x64_pmovsxbw x2))
(y1 Xmm y)
(y2 Xmm (x64_palignr y1 y1 8))
(y3 Xmm (x64_pmovsxbw y2)))
(x64_pmullw x3 y3)))

;; Special case for `i32x4.extmul_high_i16x8_s`.
(rule 1 (lower (has_type (multi_lane 32 4)
(imul (swiden_high (and (value_type (multi_lane 16 8))
@@ -1019,16 +1005,6 @@
(y2 Xmm (x64_pshufd y 0xFA)))
(x64_pmuldq x2 y2)))

;; Special case for `i16x8.extmul_low_i8x16_s`.
(rule 1 (lower (has_type (multi_lane 16 8)
(imul (swiden_low (and (value_type (multi_lane 8 16))
x))
(swiden_low (and (value_type (multi_lane 8 16))
y)))))
(let ((x2 Xmm (x64_pmovsxbw x))
(y2 Xmm (x64_pmovsxbw y)))
(x64_pmullw x2 y2)))

;; Special case for `i32x4.extmul_low_i16x8_s`.
(rule 1 (lower (has_type (multi_lane 32 4)
(imul (swiden_low (and (value_type (multi_lane 16 8))
@@ -1052,20 +1028,6 @@
(y2 Xmm (x64_pshufd y 0x50)))
(x64_pmuldq x2 y2)))

;; Special case for `i16x8.extmul_high_i8x16_u`.
(rule 1 (lower (has_type (multi_lane 16 8)
(imul (uwiden_high (and (value_type (multi_lane 8 16))
x))
(uwiden_high (and (value_type (multi_lane 8 16))
y)))))
(let ((x1 Xmm x)
(x2 Xmm (x64_palignr x1 x1 8))
(x3 Xmm (x64_pmovzxbw x2))
(y1 Xmm y)
(y2 Xmm (x64_palignr y1 y1 8))
(y3 Xmm (x64_pmovzxbw y2)))
(x64_pmullw x3 y3)))

;; Special case for `i32x4.extmul_high_i16x8_u`.
(rule 1 (lower (has_type (multi_lane 32 4)
(imul (uwiden_high (and (value_type (multi_lane 16 8))
@@ -1088,16 +1050,6 @@
(y2 Xmm (x64_pshufd y 0xFA)))
(x64_pmuludq x2 y2)))

;; Special case for `i16x8.extmul_low_i8x16_u`.
(rule 1 (lower (has_type (multi_lane 16 8)
(imul (uwiden_low (and (value_type (multi_lane 8 16))
x))
(uwiden_low (and (value_type (multi_lane 8 16))
y)))))
(let ((x2 Xmm (x64_pmovzxbw x))
(y2 Xmm (x64_pmovzxbw y)))
(x64_pmullw x2 y2)))

;; Special case for `i32x4.extmul_low_i16x8_u`.
(rule 1 (lower (has_type (multi_lane 32 4)
(imul (uwiden_low (and (value_type (multi_lane 16 8))
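To see why the deleted special cases are redundant, here is a hedged sketch (illustrative name, SSE4.1 intrinsics) of what `i16x8.extmul_high_i8x16_s` turns into either way: the default `swiden_high` + `imul` lowering emits the same `palignr`/`pmovsxbw`/`pmullw` sequence that the special case spelled out by hand.

```rust
use core::arch::x86_64::*;

// What the deleted special case produced, and what composing the component
// lowerings still produces: move the high 8 bytes down with palignr,
// sign-extend with pmovsxbw, multiply with pmullw.
unsafe fn extmul_high_i8x16_s(x: __m128i, y: __m128i) -> __m128i {
    let xh = _mm_cvtepi8_epi16(_mm_alignr_epi8::<8>(x, x));
    let yh = _mm_cvtepi8_epi16(_mm_alignr_epi8::<8>(y, y));
    _mm_mullo_epi16(xh, yh)
}
```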
@@ -2559,18 +2511,37 @@

;; We also include widening vector loads; these sign- or zero-extend each lane
;; to the next wider width (e.g., 16x4 -> 32x4).
(rule 1 (lower (has_type $I16X8 (sload8x8 flags address offset)))
(if-let $true (use_sse41))
(x64_pmovsxbw (to_amode flags address offset)))
(rule 1 (lower (has_type $I16X8 (uload8x8 flags address offset)))
(if-let $true (use_sse41))
(x64_pmovzxbw (to_amode flags address offset)))
(rule 1 (lower (has_type $I32X4 (sload16x4 flags address offset)))
(if-let $true (use_sse41))
(x64_pmovsxwd (to_amode flags address offset)))
(rule 1 (lower (has_type $I32X4 (uload16x4 flags address offset)))
(if-let $true (use_sse41))
(x64_pmovzxwd (to_amode flags address offset)))
(rule 1 (lower (has_type $I64X2 (sload32x2 flags address offset)))
(if-let $true (use_sse41))
(x64_pmovsxdq (to_amode flags address offset)))
(rule 1 (lower (has_type $I64X2 (uload32x2 flags address offset)))
(if-let $true (use_sse41))
(x64_pmovzxdq (to_amode flags address offset)))

(rule (lower (has_type $I16X8 (sload8x8 flags address offset)))
(x64_pmovsxbw (to_amode flags address offset)))
(lower_swiden_low $I16X8 (x64_movq_to_xmm (to_amode flags address offset))))
(rule (lower (has_type $I16X8 (uload8x8 flags address offset)))
(x64_pmovzxbw (to_amode flags address offset)))
(lower_uwiden_low $I16X8 (x64_movq_to_xmm (to_amode flags address offset))))
(rule (lower (has_type $I32X4 (sload16x4 flags address offset)))
(x64_pmovsxwd (to_amode flags address offset)))
(lower_swiden_low $I32X4 (x64_movq_to_xmm (to_amode flags address offset))))
(rule (lower (has_type $I32X4 (uload16x4 flags address offset)))
(x64_pmovzxwd (to_amode flags address offset)))
(lower_uwiden_low $I32X4 (x64_movq_to_xmm (to_amode flags address offset))))
(rule (lower (has_type $I64X2 (sload32x2 flags address offset)))
(x64_pmovsxdq (to_amode flags address offset)))
(lower_swiden_low $I64X2 (x64_movq_to_xmm (to_amode flags address offset))))
(rule (lower (has_type $I64X2 (uload32x2 flags address offset)))
(x64_pmovzxdq (to_amode flags address offset)))
(lower_uwiden_low $I64X2 (x64_movq_to_xmm (to_amode flags address offset))))

;; Rules for `store*` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
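For the non-SSE4.1 path above, the new rules do a plain 64-bit `movq` load and then reuse the `lower_*widen_low` helpers. A rough Rust equivalent of the signed byte case, assuming an in-bounds pointer (illustrative function name, not the generated code):

```rust
use core::arch::x86_64::*;

// sload8x8 without pmovsxbw: movq the 8 bytes into the low half of an XMM
// register, then widen the low lanes with punpcklbw + psraw.
unsafe fn sload8x8(ptr: *const i8) -> __m128i {
    let v = _mm_loadl_epi64(ptr as *const __m128i); // movq: loads 64 bits, zeroes the rest
    _mm_srai_epi16::<8>(_mm_unpacklo_epi8(v, v))
}
```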
@@ -3266,51 +3237,101 @@

;; Rules for `swiden_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type $I16X8 (swiden_low val @ (value_type $I8X16))))
(x64_pmovsxbw val))
;; With SSE4.1 use the `pmovsx*` instructions for this
(rule 1 (lower (has_type $I16X8 (swiden_low val @ (value_type $I8X16))))
(if-let $true (use_sse41))
(x64_pmovsxbw val))
(rule 1 (lower (has_type $I32X4 (swiden_low val @ (value_type $I16X8))))
(if-let $true (use_sse41))
(x64_pmovsxwd val))
(rule 1 (lower (has_type $I64X2 (swiden_low val @ (value_type $I32X4))))
(if-let $true (use_sse41))
(x64_pmovsxdq val))

(rule (lower (has_type $I32X4 (swiden_low val @ (value_type $I16X8))))
(x64_pmovsxwd val))
(rule (lower (has_type ty (swiden_low val))) (lower_swiden_low ty val))

(rule (lower (has_type $I64X2 (swiden_low val @ (value_type $I32X4))))
(x64_pmovsxdq val))
(decl lower_swiden_low (Type Xmm) Xmm)

;; Duplicate the low lanes next to each other, then perform a wider shift-right
;; by the low lane width to move the upper of each pair back into the lower lane
;; of each pair, achieving the widening of the lower lanes.
(rule (lower_swiden_low $I16X8 val)
(x64_psraw (x64_punpcklbw val val) (xmi_imm 8)))
(rule (lower_swiden_low $I32X4 val)
(x64_psrad (x64_punpcklwd val val) (xmi_imm 16)))

;; Generate the sign-extended halves with a `val < 0` comparison (expressed
;; reversed here), then interleave the low 32-bit halves to create the full
;; 64-bit results.
(rule (lower_swiden_low $I64X2 val)
(let ((tmp Xmm (x64_pcmpgtd (xmm_zero $I32X4) val)))
(x64_punpckldq val tmp)))

;; Rules for `swiden_high` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Similar to `swiden_low` with SSE4.1 except that the upper lanes are moved
;; to the lower lanes first.
(rule 1 (lower (has_type $I16X8 (swiden_high val @ (value_type $I8X16))))
(if-let $true (use_sse41))
(let ((x Xmm val))
(x64_pmovsxbw (x64_palignr x x 8))))
(rule 1 (lower (has_type $I32X4 (swiden_high val @ (value_type $I16X8))))
(if-let $true (use_sse41))
(let ((x Xmm val))
(x64_pmovsxwd (x64_palignr x x 8))))
(rule 1 (lower (has_type $I64X2 (swiden_high val @ (value_type $I32X4))))
(if-let $true (use_sse41))
(x64_pmovsxdq (x64_pshufd val 0b11_10_11_10)))

;; Similar to `swiden_low` versions but using `punpckh*` instructions to
;; pair the high lanes next to each other.
(rule (lower (has_type $I16X8 (swiden_high val @ (value_type $I8X16))))
(let ((x Xmm val))
(x64_pmovsxbw (x64_palignr x x 8))))

(let ((val Xmm val))
(x64_psraw (x64_punpckhbw val val) (xmi_imm 8))))
(rule (lower (has_type $I32X4 (swiden_high val @ (value_type $I16X8))))
(let ((x Xmm val))
(x64_pmovsxwd (x64_palignr x x 8))))
(let ((val Xmm val))
(x64_psrad (x64_punpckhwd val val) (xmi_imm 16))))

;; Same as `swiden_low`, but `val` has its high lanes moved down.
(rule (lower (has_type $I64X2 (swiden_high val @ (value_type $I32X4))))
(x64_pmovsxdq (x64_pshufd val 0xEE)))
(let ((val Xmm (x64_pshufd val 0b00_00_11_10))
(tmp Xmm (x64_pcmpgtd (xmm_zero $I32X4) val)))
(x64_punpckldq val tmp)))

;; Rules for `uwiden_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type $I16X8 (uwiden_low val @ (value_type $I8X16))))
(x64_pmovzxbw val))
;; With SSE4.1 use the `pmovzx*` instructions for this
(rule 1 (lower (has_type $I16X8 (uwiden_low val @ (value_type $I8X16))))
(if-let $true (use_sse41))
(x64_pmovzxbw val))
(rule 1 (lower (has_type $I32X4 (uwiden_low val @ (value_type $I16X8))))
(if-let $true (use_sse41))
(x64_pmovzxwd val))
(rule 1 (lower (has_type $I64X2 (uwiden_low val @ (value_type $I32X4))))
(if-let $true (use_sse41))
(x64_pmovzxdq val))

(rule (lower (has_type $I32X4 (uwiden_low val @ (value_type $I16X8))))
(x64_pmovzxwd val))
(rule (lower (has_type ty (uwiden_low val))) (lower_uwiden_low ty val))

(rule (lower (has_type $I64X2 (uwiden_low val @ (value_type $I32X4))))
(x64_pmovzxdq val))
;; Interleave an all-zero register with the low lanes to produce zero-extended
;; results.
(decl lower_uwiden_low (Type Xmm) Xmm)
(rule (lower_uwiden_low $I16X8 val) (x64_punpcklbw val (xmm_zero $I8X16)))
(rule (lower_uwiden_low $I32X4 val) (x64_punpcklwd val (xmm_zero $I8X16)))
(rule (lower_uwiden_low $I64X2 val) (x64_unpcklps val (xmm_zero $F32X4)))

;; Rules for `uwiden_high` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Same as the `uwiden_low` rules above, but interleaving the high lanes instead.
;;
;; Note that, at least according to `llvm-mca`, these instructions are faster
;; than using `pmovzx*` in terms of cycles, even when SSE4.1 is available.
(rule (lower (has_type $I16X8 (uwiden_high val @ (value_type $I8X16))))
(let ((x Xmm val))
(x64_pmovzxbw (x64_palignr x x 8))))

(x64_punpckhbw val (xmm_zero $I8X16)))
(rule (lower (has_type $I32X4 (uwiden_high val @ (value_type $I16X8))))
(let ((x Xmm val))
(x64_pmovzxwd (x64_palignr x x 8))))

(x64_punpckhwd val (xmm_zero $I8X16)))
(rule (lower (has_type $I64X2 (uwiden_high val @ (value_type $I32X4))))
(x64_pmovzxdq (x64_pshufd val 0xEE)))
(x64_unpckhps val (xmm_zero $F32X4)))

;; Rules for `snarrow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

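The one case above with no cheap single-instruction SSE2 replacement is the 32x4 to 64x2 sign extension; a hedged intrinsics sketch of the `pcmpgtd` + `punpckldq` trick used by `lower_swiden_low $I64X2` (illustrative function name, not Cranelift's API):

```rust
use core::arch::x86_64::*;

// swiden_low on i32x4 without pmovsxdq: compute each lane's sign mask with a
// reversed `0 > v` comparison, then interleave the low lanes with that mask so
// each 32-bit value picks up 32 bits of sign extension.
unsafe fn swiden_low_i32x4(v: __m128i) -> __m128i {
    let sign = _mm_cmpgt_epi32(_mm_setzero_si128(), v); // all-ones where v < 0
    _mm_unpacklo_epi32(v, sign)
}
```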
@@ -1204,8 +1204,9 @@ block0(v0: i8x16):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vpalignr $8, %xmm0, %xmm0, %xmm2
; vpmovzxbw %xmm2, %xmm0
; uninit %xmm2
; vpxor %xmm2, %xmm2, %xmm4
; vpunpckhbw %xmm0, %xmm4, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
@@ -1215,8 +1216,8 @@ block0(v0: i8x16):
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vpalignr $8, %xmm0, %xmm0, %xmm2
; vpmovzxbw %xmm2, %xmm0
; vpxor %xmm2, %xmm2, %xmm4
; vpunpckhbw %xmm4, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq

@@ -16,13 +16,13 @@ block0(v0: i8x16, v1: i8x16):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movdqa %xmm0, %xmm3
; palignr $8, %xmm3, %xmm0, %xmm3
; pmovsxbw %xmm3, %xmm0
; movdqa %xmm1, %xmm7
; palignr $8, %xmm7, %xmm1, %xmm7
; pmovsxbw %xmm7, %xmm9
; pmullw %xmm0, %xmm9, %xmm0
; movdqa %xmm0, %xmm6
; palignr $8, %xmm6, %xmm0, %xmm6
; pmovsxbw %xmm6, %xmm0
; movdqa %xmm1, %xmm6
; palignr $8, %xmm6, %xmm1, %xmm6
; pmovsxbw %xmm6, %xmm8
; pmullw %xmm0, %xmm8, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
@@ -32,13 +32,13 @@ block0(v0: i8x16, v1: i8x16):
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movdqa %xmm0, %xmm3
; palignr $8, %xmm0, %xmm3
; pmovsxbw %xmm3, %xmm0
; movdqa %xmm1, %xmm7
; palignr $8, %xmm1, %xmm7
; pmovsxbw %xmm7, %xmm9
; pmullw %xmm9, %xmm0
; movdqa %xmm0, %xmm6
; palignr $8, %xmm0, %xmm6
; pmovsxbw %xmm6, %xmm0
; movdqa %xmm1, %xmm6
; palignr $8, %xmm1, %xmm6
; pmovsxbw %xmm6, %xmm8
; pmullw %xmm8, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
@@ -226,13 +226,14 @@ block0(v0: i8x16, v1: i8x16):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movdqa %xmm0, %xmm3
; palignr $8, %xmm3, %xmm0, %xmm3
; pmovzxbw %xmm3, %xmm0
; movdqa %xmm1, %xmm7
; palignr $8, %xmm7, %xmm1, %xmm7
; pmovzxbw %xmm7, %xmm9
; pmullw %xmm0, %xmm9, %xmm0
; uninit %xmm8
; pxor %xmm8, %xmm8, %xmm8
; punpckhbw %xmm0, %xmm8, %xmm0
; uninit %xmm8
; pxor %xmm8, %xmm8, %xmm8
; movdqa %xmm1, %xmm11
; punpckhbw %xmm11, %xmm8, %xmm11
; pmullw %xmm0, %xmm11, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
@@ -242,13 +243,12 @@ block0(v0: i8x16, v1: i8x16):
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movdqa %xmm0, %xmm3
; palignr $8, %xmm0, %xmm3
; pmovzxbw %xmm3, %xmm0
; movdqa %xmm1, %xmm7
; palignr $8, %xmm1, %xmm7
; pmovzxbw %xmm7, %xmm9
; pmullw %xmm9, %xmm0
; pxor %xmm8, %xmm8
; punpckhbw %xmm8, %xmm0
; pxor %xmm8, %xmm8
; movdqa %xmm1, %xmm11
; punpckhbw %xmm8, %xmm11
; pmullw %xmm11, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq

@@ -12,9 +12,10 @@ block0(v0: i64, v2: i8x16):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movdqu 80(%rdi), %xmm3
; palignr $8, %xmm3, %xmm3, %xmm3
; pmovzxbw %xmm3, %xmm0
; movdqu 80(%rdi), %xmm0
; uninit %xmm5
; pxor %xmm5, %xmm5, %xmm5
; punpckhbw %xmm0, %xmm5, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
@@ -24,9 +25,9 @@ block0(v0: i64, v2: i8x16):
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movdqu 0x50(%rdi), %xmm3
; palignr $8, %xmm3, %xmm3
; pmovzxbw %xmm3, %xmm0
; movdqu 0x50(%rdi), %xmm0
; pxor %xmm5, %xmm5
; punpckhbw %xmm5, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq

@@ -246,9 +246,9 @@ block0(v0: i8x16):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movdqa %xmm0, %xmm2
; palignr $8, %xmm2, %xmm0, %xmm2
; pmovzxbw %xmm2, %xmm0
; uninit %xmm3
; pxor %xmm3, %xmm3, %xmm3
; punpckhbw %xmm0, %xmm3, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
@@ -258,9 +258,8 @@ block0(v0: i8x16):
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movdqa %xmm0, %xmm2
; palignr $8, %xmm0, %xmm2
; pmovzxbw %xmm2, %xmm0
; pxor %xmm3, %xmm3
; punpckhbw %xmm3, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
@@ -275,9 +274,9 @@ block0(v0: i16x8):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movdqa %xmm0, %xmm2
; palignr $8, %xmm2, %xmm0, %xmm2
; pmovzxwd %xmm2, %xmm0
; uninit %xmm3
; pxor %xmm3, %xmm3, %xmm3
; punpckhwd %xmm0, %xmm3, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
@@ -287,9 +286,8 @@ block0(v0: i16x8):
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movdqa %xmm0, %xmm2
; palignr $8, %xmm0, %xmm2
; pmovzxwd %xmm2, %xmm0
; pxor %xmm3, %xmm3
; punpckhwd %xmm3, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
@@ -304,8 +302,9 @@ block0(v0: i32x4):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; pshufd $238, %xmm0, %xmm2
; pmovzxdq %xmm2, %xmm0
; uninit %xmm3
; xorps %xmm3, %xmm3, %xmm3
; unpckhps %xmm0, %xmm3, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
@@ -315,8 +314,8 @@ block0(v0: i32x4):
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; pshufd $0xee, %xmm0, %xmm2
; pmovzxdq %xmm2, %xmm0
; xorps %xmm3, %xmm3
; unpckhps %xmm3, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq

@@ -2,9 +2,11 @@ test interpret
test run
target aarch64
target s390x
target x86_64 ssse3 has_sse41=false
set enable_simd
target x86_64 has_sse3 has_ssse3 has_sse41
target x86_64 has_sse3 has_ssse3 has_sse41 has_avx
target x86_64
target x86_64 sse41
target x86_64 sse41 has_avx

function %swidenhigh_i8x16(i8x16) -> i16x8 {
block0(v0: i8x16):

@@ -2,9 +2,11 @@ test interpret
test run
target aarch64
target s390x
target x86_64 ssse3 has_sse41=false
set enable_simd
target x86_64 has_sse3 has_ssse3 has_sse41
target x86_64 has_sse3 has_ssse3 has_sse41 has_avx
target x86_64
target x86_64 sse41
target x86_64 sse41 has_avx

function %swidenlow_i8x16(i8x16) -> i16x8 {
block0(v0: i8x16):

@@ -2,9 +2,11 @@ test interpret
test run
target aarch64
target s390x
target x86_64 ssse3 has_sse41=false
set enable_simd
target x86_64 has_sse3 has_ssse3 has_sse41
target x86_64 has_sse3 has_ssse3 has_sse41 has_avx
target x86_64
target x86_64 sse41
target x86_64 sse41 has_avx

function %uwidenhigh_i8x16(i8x16) -> i16x8 {
block0(v0: i8x16):
@@ -12,6 +14,7 @@ block0(v0: i8x16):
return v1
}
; run: %uwidenhigh_i8x16([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]) == [9 10 11 12 13 14 15 16]
; run: %uwidenhigh_i8x16([1 2 3 4 5 6 7 8 9 0x80 10 0xff 11 0x92 12 0x70]) == [9 0x80 10 0xff 11 0x92 12 0x70]

function %uwidenhigh_i16x8(i16x8) -> i32x4 {
block0(v0: i16x8):
@@ -19,6 +22,7 @@ block0(v0: i16x8):
return v1
}
; run: %uwidenhigh_i16x8([1 2 3 4 5 6 7 8]) == [5 6 7 8]
; run: %uwidenhigh_i16x8([9 10 11 12 13 14 -1 -2]) == [13 14 0xffff 0xfffe]

function %uwidenhigh_i32x4(i32x4) -> i64x2 {
block0(v0: i32x4):
@@ -26,3 +30,4 @@ block0(v0: i32x4):
return v1
}
; run: %uwidenhigh_i32x4([1 2 3 4]) == [3 4]
; run: %uwidenhigh_i32x4([4 5 6 -1]) == [6 0xffffffff]

@@ -2,9 +2,11 @@ test interpret
test run
target aarch64
target s390x
target x86_64 ssse3 has_sse41=false
set enable_simd
target x86_64 has_sse3 has_ssse3 has_sse41
target x86_64 has_sse3 has_ssse3 has_sse41 has_avx
target x86_64
target x86_64 sse41
target x86_64 sse41 has_avx

function %uwidenlow_i8x16(i8x16) -> i16x8 {
block0(v0: i8x16):

@@ -125,15 +125,15 @@
;; movq %rsp, %rbp
;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 }
;; block0:
;; vpmovsxbw %xmm0, %xmm10
;; vpmovsxbw %xmm1, %xmm12
;; vpmullw %xmm10, %xmm12, %xmm14
;; vpalignr $8, %xmm0, %xmm0, %xmm8
;; vpmovsxbw %xmm8, %xmm10
;; vpalignr $8, %xmm1, %xmm1, %xmm12
;; vpmovsxbw %xmm12, %xmm15
;; vpmullw %xmm10, %xmm15, %xmm0
;; vphaddw %xmm14, %xmm0, %xmm0
;; vpmovsxbw %xmm0, %xmm12
;; vpmovsxbw %xmm1, %xmm13
;; vpmullw %xmm12, %xmm13, %xmm12
;; vpalignr $8, %xmm0, %xmm0, %xmm11
;; vpmovsxbw %xmm11, %xmm13
;; vpalignr $8, %xmm1, %xmm1, %xmm11
;; vpmovsxbw %xmm11, %xmm14
;; vpmullw %xmm13, %xmm14, %xmm13
;; vphaddw %xmm12, %xmm13, %xmm0
;; jmp label1
;; block1:
;; movq %rbp, %rsp
@@ -146,15 +146,15 @@
;; movq %rsp, %rbp
;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 }
;; block0:
;; vpmovsxbw %xmm0, %xmm13
;; vpmovsxbw %xmm1, %xmm15
;; vpmullw %xmm13, %xmm15, %xmm3
;; vpalignr $8, %xmm0, %xmm0, %xmm11
;; vpmovsxbw %xmm11, %xmm13
;; vpalignr $8, %xmm1, %xmm1, %xmm15
;; vpmovsxbw %xmm15, %xmm1
;; vpmullw %xmm13, %xmm1, %xmm4
;; vphaddw %xmm3, %xmm4, %xmm15
;; vpmovsxbw %xmm0, %xmm15
;; vpmovsxbw %xmm1, %xmm3
;; vpmullw %xmm15, %xmm3, %xmm15
;; vpalignr $8, %xmm0, %xmm0, %xmm14
;; vpmovsxbw %xmm14, %xmm0
;; vpalignr $8, %xmm1, %xmm1, %xmm14
;; vpmovsxbw %xmm14, %xmm1
;; vpmullw %xmm0, %xmm1, %xmm0
;; vphaddw %xmm15, %xmm0, %xmm15
;; vpmaddwd %xmm15, const(0), %xmm15
;; vpaddd %xmm15, %xmm2, %xmm0
;; jmp label1