x64: Sink constant loads into xmm instructions (#5880)

A number of places in the x64 backend use 128-bit constants for various
wasm SIMD-related instructions, and most of them currently go through the
`x64_xmm_load_const` helper to load the constant into a register. Almost
all xmm instructions, however, accept a memory operand, which means these
loads can be folded into the consuming instruction to help reduce register
pressure. An automatic conversion from a `VCodeConstant` to an `XmmMem`
value was added, and the explicit loads were then removed in favor of
forwarding the `XmmMem` value directly to the underlying instruction. Note
that some instances of `x64_xmm_load_const` remain since they're used in
contexts where load sinking won't work (e.g. the constant is the first
operand rather than the second of a non-commutative instruction).
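
As a rough sketch of the shape of the change (all names are taken from the
ISLE diffs below, nothing here is new): a `VCodeConstant` becomes implicitly
convertible to `XmmMem` via a RIP-relative `SyntheticAmode`, so a lowering
rule can hand the constant straight to the instruction constructor and the
separate register load disappears.

    ;; New implicit conversion: a VCodeConstant can be used wherever an
    ;; XmmMem operand is expected, by routing it through a RIP-relative
    ;; synthetic amode.
    (decl const_to_xmm_mem (VCodeConstant) XmmMem)
    (rule (const_to_xmm_mem c) (const_to_synthetic_amode c))
    (convert VCodeConstant XmmMem const_to_xmm_mem)

    ;; Before: load the constant into a register first.
    ;;   (let ((mul_const Xmm (x64_xmm_load_const $I16X8 (iadd_pairwise_mul_const_32))))
    ;;     (x64_pmaddwd val mul_const))
    ;; After: bind it as XmmMem and let pmaddwd take a memory operand.
    ;;   (let ((mul_const XmmMem (iadd_pairwise_mul_const_32)))
    ;;     (x64_pmaddwd val mul_const))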
Author: Alex Crichton (committed by GitHub)
Date:   2023-02-27 16:02:42 -06:00
parent 9b86a0b9b1
commit f2dce812c3
11 changed files with 147 additions and 182 deletions

View File

@@ -4489,6 +4489,8 @@
 (convert SyntheticAmode XmmMem synthetic_amode_to_xmm_mem)
 (convert Amode XmmMemAligned amode_to_xmm_mem_aligned)
 (convert SyntheticAmode XmmMemAligned synthetic_amode_to_xmm_mem_aligned)
+(convert VCodeConstant SyntheticAmode const_to_synthetic_amode)
+(convert VCodeConstant XmmMem const_to_xmm_mem)
 (convert IntCC CC intcc_to_cc)
 (convert AtomicRmwOp MachAtomicRmwOp atomic_rmw_op_to_mach_atomic_rmw_op)
@@ -4537,6 +4539,8 @@
 (synthetic_amode_to_reg_mem amode))
 (decl const_to_synthetic_amode (VCodeConstant) SyntheticAmode)
 (extern constructor const_to_synthetic_amode const_to_synthetic_amode)
+(decl const_to_xmm_mem (VCodeConstant) XmmMem)
+(rule (const_to_xmm_mem c) (const_to_synthetic_amode c))
 (decl xmm_to_xmm_mem_aligned (Xmm) XmmMemAligned)
 (rule (xmm_to_xmm_mem_aligned reg) (xmm_mem_to_xmm_mem_aligned reg))

View File

@@ -1908,7 +1908,7 @@
 (rule (lower (has_type $I8X16
 (popcnt src)))
 (let ((nibble_table_const VCodeConstant (popcount_4bit_table))
-(low_mask Xmm (x64_xmm_load_const $I8X16 (popcount_low_mask)))
+(low_mask XmmMem (popcount_low_mask))
 (low_nibbles Xmm (sse_and $I8X16 src low_mask))
 ;; Note that this is a 16x8 shift, but that's OK; we mask
 ;; off anything that traverses from one byte to the next
@@ -2984,9 +2984,9 @@
 ;; every value of the mantissa represents a corresponding uint32 number.
 ;; When we subtract 0x1.0p52 we are left with double(src).
 (rule 1 (lower (has_type $F64X2 (fcvt_from_uint (uwiden_low val @ (value_type $I32X4)))))
-(let ((uint_mask Xmm (x64_xmm_load_const $I32X4 (fcvt_uint_mask_const)))
+(let ((uint_mask XmmMem (fcvt_uint_mask_const))
 (res Xmm (x64_unpcklps val uint_mask))
-(uint_mask_high Xmm (x64_xmm_load_const $I32X4 (fcvt_uint_mask_high_const))))
+(uint_mask_high XmmMem (fcvt_uint_mask_high_const)))
 (x64_subpd res uint_mask_high)))
 ;; When AVX512VL and AVX512F are available,
@@ -3186,7 +3186,7 @@
 (has_type $I32X4 (iadd_pairwise
 (swiden_low val @ (value_type $I16X8))
 (swiden_high val))))
-(let ((mul_const Xmm (x64_xmm_load_const $I16X8 (iadd_pairwise_mul_const_32))))
+(let ((mul_const XmmMem (iadd_pairwise_mul_const_32)))
 (x64_pmaddwd val mul_const)))
 ;; special case for the `i16x8.extadd_pairwise_i8x16_u` wasm instruction
@@ -3194,7 +3194,7 @@
 (has_type $I16X8 (iadd_pairwise
 (uwiden_low val @ (value_type $I8X16))
 (uwiden_high val))))
-(let ((mul_const Xmm (x64_xmm_load_const $I8X16 (iadd_pairwise_mul_const_16))))
+(let ((mul_const XmmMem (iadd_pairwise_mul_const_16)))
 (x64_pmaddubsw val mul_const)))
 ;; special case for the `i32x4.extadd_pairwise_i16x8_u` wasm instruction
@@ -3202,13 +3202,13 @@
 (has_type $I32X4 (iadd_pairwise
 (uwiden_low val @ (value_type $I16X8))
 (uwiden_high val))))
-(let ((xor_const Xmm (x64_xmm_load_const $I16X8 (iadd_pairwise_xor_const_32)))
+(let ((xor_const XmmMem (iadd_pairwise_xor_const_32))
 (dst Xmm (x64_pxor val xor_const))
-(madd_const Xmm (x64_xmm_load_const $I16X8 (iadd_pairwise_mul_const_32)))
+(madd_const XmmMem (iadd_pairwise_mul_const_32))
 (dst Xmm (x64_pmaddwd dst madd_const))
-(addd_const Xmm (x64_xmm_load_const $I16X8 (iadd_pairwise_addd_const_32))))
+(addd_const XmmMem (iadd_pairwise_addd_const_32)))
 (x64_paddd dst addd_const)))
 ;; special case for the `i32x4.dot_i16x8_s` wasm instruction
@@ -3293,7 +3293,7 @@
 ;; CVTTPD2DQ xmm_y, xmm_y
 (tmp1 Xmm (x64_cmppd a a (FcmpImm.Equal)))
-(umax_mask Xmm (x64_xmm_load_const $F64X2 (snarrow_umax_mask)))
+(umax_mask XmmMem (snarrow_umax_mask))
 ;; ANDPD xmm_y, [wasm_f64x2_splat(2147483647.0)]
 (tmp1 Xmm (x64_andps tmp1 umax_mask))
@@ -3509,7 +3509,7 @@
 ;; indices (may not be completely necessary: verification could fail incorrect
 ;; mask values) and fix the indexes to all point to the `dst` vector.
 (rule 3 (lower (shuffle a a (vec_mask_from_immediate mask)))
-(x64_pshufb a (x64_xmm_load_const $I8X16 (shuffle_0_31_mask mask))))
+(x64_pshufb a (shuffle_0_31_mask mask))))
 ;; For the case where the shuffle mask contains out-of-bounds values (values
 ;; greater than 31) we must mask off those resulting values in the result of
@@ -3517,9 +3517,7 @@
 (rule 2 (lower (has_type (and (avx512vl_enabled $true) (avx512vbmi_enabled $true))
 (shuffle a b (vec_mask_from_immediate
 (perm_from_mask_with_zeros mask zeros)))))
-(x64_andps
-(x64_xmm_load_const $I8X16 zeros)
-(x64_vpermi2b b a (x64_xmm_load_const $I8X16 mask))))
+(x64_andps (x64_vpermi2b b a (x64_xmm_load_const $I8X16 mask)) zeros))
 ;; However, if the shuffle mask contains no out-of-bounds values, we can use
 ;; `vpermi2b` without any masking.
@@ -3532,8 +3530,8 @@
 ;; above, we build the `constructed_mask` for each case statically.
 (rule (lower (shuffle a b (vec_mask_from_immediate mask)))
 (x64_por
-(x64_pshufb a (x64_xmm_load_const $I8X16 (shuffle_0_15_mask mask)))
-(x64_pshufb b (x64_xmm_load_const $I8X16 (shuffle_16_31_mask mask)))))
+(x64_pshufb a (shuffle_0_15_mask mask))
+(x64_pshufb b (shuffle_16_31_mask mask))))
 ;; Rules for `swizzle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -3544,9 +3542,7 @@
 ;; Wasm SIMD semantics for this instruction. The instruction format maps to
 ;; variables like: %dst = swizzle %src, %mask
 (rule (lower (swizzle src mask))
-(let ((mask Xmm (x64_paddusb
-mask
-(x64_xmm_load_const $I8X16 (swizzle_zero_mask)))))
+(let ((mask Xmm (x64_paddusb mask (swizzle_zero_mask))))
 (x64_pshufb src mask)))
 ;; Rules for `extractlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -3721,9 +3717,9 @@
 (let ((src1 Xmm qx)
 (src2 Xmm qy)
-(mask Xmm (x64_xmm_load_const $I16X8 (sqmul_round_sat_mask)))
+(mask XmmMem (sqmul_round_sat_mask))
 (dst Xmm (x64_pmulhrsw src1 src2))
-(cmp Xmm (x64_pcmpeqw mask dst)))
+(cmp Xmm (x64_pcmpeqw dst mask)))
 (x64_pxor dst cmp)))
 ;; Rules for `sqmul_round_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -3749,7 +3745,7 @@
 (zeros Xmm (xmm_zero $F64X2))
 (dst Xmm (x64_maxpd src zeros))
-(umax_mask Xmm (x64_xmm_load_const $F64X2 (uunarrow_umax_mask)))
+(umax_mask XmmMem (uunarrow_umax_mask))
 ;; MINPD xmm_y, [wasm_f64x2_splat(4294967295.0)]
 (dst Xmm (x64_minpd dst umax_mask))
@@ -3758,7 +3754,7 @@
 (dst Xmm (x64_roundpd dst (RoundImm.RoundZero)))
 ;; ADDPD xmm_y, [wasm_f64x2_splat(0x1.0p+52)]
-(uint_mask Xmm (x64_xmm_load_const $F64X2 (uunarrow_uint_mask)))
+(uint_mask XmmMem (uunarrow_uint_mask))
 (dst Xmm (x64_addpd dst uint_mask)))
 ;; SHUFPS xmm_y, xmm_xmp, 0x88

View File

@@ -304,10 +304,8 @@ block0(v0: i32x4):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block0:
-; movdqu const(0), %xmm2
-; unpcklps %xmm0, %xmm2, %xmm0
-; movdqu const(1), %xmm6
-; subpd %xmm0, %xmm6, %xmm0
+; unpcklps %xmm0, const(0), %xmm0
+; subpd %xmm0, const(1), %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -317,14 +315,16 @@ block0(v0: i32x4):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block1: ; offset 0x4
-; movdqu 0x14(%rip), %xmm2
-; unpcklps %xmm2, %xmm0
-; movdqu 0x19(%rip), %xmm6
-; subpd %xmm6, %xmm0
+; unpcklps 0x15(%rip), %xmm0
+; subpd 0x1d(%rip), %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; retq
 ; addb %al, (%rax)
+; addb %al, (%rax)
+; addb %al, (%rax)
+; addb %al, (%rax)
+; addb %al, (%rax)
 ; xorb %al, (%rbx)
 ; addb %dh, (%rax)
 ; addb %al, (%r8)

View File

@@ -566,10 +566,9 @@ block0(v0: f64x2):
 ; movq %rsp, %rbp
 ; block0:
 ; vcmppd $0 %xmm0, %xmm0, %xmm2
-; movupd const(0), %xmm4
-; vandps %xmm2, %xmm4, %xmm6
-; vminpd %xmm0, %xmm6, %xmm8
-; vcvttpd2dq %xmm8, %xmm0
+; vandps %xmm2, const(0), %xmm4
+; vminpd %xmm0, %xmm4, %xmm6
+; vcvttpd2dq %xmm6, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -580,19 +579,13 @@ block0(v0: f64x2):
 ; movq %rsp, %rbp
 ; block1: ; offset 0x4
 ; vcmpeqpd %xmm0, %xmm0, %xmm2
-; movupd 0x1f(%rip), %xmm4
-; vandps %xmm4, %xmm2, %xmm6
-; vminpd %xmm6, %xmm0, %xmm8
-; vcvttpd2dq %xmm8, %xmm0
+; vandps 0xf(%rip), %xmm2, %xmm4
+; vminpd %xmm4, %xmm0, %xmm6
+; vcvttpd2dq %xmm6, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; retq
 ; addb %al, (%rax)
 ; addb %al, (%rax)
-; addb %al, (%rax)
-; addb %al, (%rax)
-; addb %al, (%rax)
-; addb %al, (%rax)
-; addb %al, (%rax)
-; addb %al, %al
+; sarb $0xff, %bh

View File

@@ -63,13 +63,12 @@ block0(v0: f64x2):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block0:
-; movdqa %xmm0, %xmm4
-; cmppd $0, %xmm4, %xmm0, %xmm4
-; movupd const(0), %xmm5
-; andps %xmm4, %xmm5, %xmm4
-; movdqa %xmm0, %xmm8
-; minpd %xmm8, %xmm4, %xmm8
-; cvttpd2dq %xmm8, %xmm0
+; movdqa %xmm0, %xmm3
+; cmppd $0, %xmm3, %xmm0, %xmm3
+; andps %xmm3, const(0), %xmm3
+; movdqa %xmm0, %xmm6
+; minpd %xmm6, %xmm3, %xmm6
+; cvttpd2dq %xmm6, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -79,20 +78,22 @@ block0(v0: f64x2):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block1: ; offset 0x4
-; movdqa %xmm0, %xmm4
-; cmpeqpd %xmm0, %xmm4
-; movupd 0x1b(%rip), %xmm5
-; andps %xmm5, %xmm4
-; movdqa %xmm0, %xmm8
-; minpd %xmm4, %xmm8
-; cvttpd2dq %xmm8, %xmm0
+; movdqa %xmm0, %xmm3
+; cmpeqpd %xmm0, %xmm3
+; andps 0x1c(%rip), %xmm3
+; movdqa %xmm0, %xmm6
+; minpd %xmm3, %xmm6
+; cvttpd2dq %xmm6, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; retq
 ; addb %al, (%rax)
 ; addb %al, (%rax)
 ; addb %al, (%rax)
-; sarb $0xff, %bh
+; addb %al, (%rax)
+; addb %al, (%rax)
+; addb %al, (%rax)
+; addb %al, %al
 
 function %f4(i16x8, i16x8) -> i8x16 {
 block0(v0: i16x8, v1: i16x8):

View File

@@ -55,12 +55,11 @@ block0(v0: i8x16, v1: i8x16):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block0:
-; movdqa %xmm0, %xmm7
-; movdqu const(1), %xmm0
-; movdqu const(0), %xmm6
-; movdqa %xmm7, %xmm9
-; vpermi2b %xmm1, %xmm9, %xmm6, %xmm6
-; andps %xmm0, %xmm6, %xmm0
+; movdqa %xmm0, %xmm6
+; movdqu const(0), %xmm0
+; movdqa %xmm6, %xmm7
+; vpermi2b %xmm1, %xmm7, %xmm0, %xmm0
+; andps %xmm0, const(1), %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -70,12 +69,11 @@ block0(v0: i8x16, v1: i8x16):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block1: ; offset 0x4
-; movdqa %xmm0, %xmm7
-; movdqu 0x30(%rip), %xmm0
-; movdqu 0x18(%rip), %xmm6
-; movdqa %xmm7, %xmm9
-; vpermi2b %xmm1, %xmm9, %xmm6
-; andps %xmm6, %xmm0
+; movdqa %xmm0, %xmm6
+; movdqu 0x20(%rip), %xmm0
+; movdqa %xmm6, %xmm7
+; vpermi2b %xmm1, %xmm7, %xmm0
+; andps 0x1f(%rip), %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; retq
@@ -89,7 +87,9 @@ block0(v0: i8x16, v1: i8x16):
 ; addb %al, (%rax)
 ; addb %al, (%rax)
 ; addb %al, (%rax)
-; cmpb $0xff, %bh
+; addb %al, (%rax)
+; addb %al, (%rax)
+; addb %al, -1(%rax)
 
 function %f3(i8x16, i8x16) -> i8x16 {
 block0(v0: i8x16, v1: i8x16):

View File

@@ -574,10 +574,9 @@ block0(v0: i16x8, v1: i16x8):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block0:
-; movdqu const(0), %xmm3
-; vpmulhrsw %xmm0, %xmm1, %xmm5
-; vpcmpeqw %xmm3, %xmm5, %xmm7
-; vpxor %xmm5, %xmm7, %xmm0
+; vpmulhrsw %xmm0, %xmm1, %xmm3
+; vpcmpeqw %xmm3, const(0), %xmm5
+; vpxor %xmm3, %xmm5, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -587,14 +586,15 @@ block0(v0: i16x8, v1: i16x8):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block1: ; offset 0x4
-; movdqu 0x14(%rip), %xmm3
-; vpmulhrsw %xmm1, %xmm0, %xmm5
-; vpcmpeqw %xmm5, %xmm3, %xmm7
-; vpxor %xmm7, %xmm5, %xmm0
+; vpmulhrsw %xmm1, %xmm0, %xmm3
+; vpcmpeqw 0xf(%rip), %xmm3, %xmm5
+; vpxor %xmm5, %xmm3, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; retq
 ; addb %al, (%rax)
+; addb %al, (%rax)
+; addb %al, (%rax)
 ; addb %al, -0x7fff8000(%rax)
 ; addb %al, -0x7fff8000(%rax)
@@ -671,10 +671,8 @@ block0(v0: i32x4):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block0:
-; movdqu const(0), %xmm2
-; vunpcklps %xmm0, %xmm2, %xmm4
-; movdqu const(1), %xmm6
-; vsubpd %xmm4, %xmm6, %xmm0
+; vunpcklps %xmm0, const(0), %xmm2
+; vsubpd %xmm2, const(1), %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -684,10 +682,8 @@ block0(v0: i32x4):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block1: ; offset 0x4
-; movdqu 0x24(%rip), %xmm2
-; vunpcklps %xmm2, %xmm0, %xmm4
-; movdqu 0x28(%rip), %xmm6
-; vsubpd %xmm6, %xmm4, %xmm0
+; vunpcklps 0x14(%rip), %xmm0, %xmm2
+; vsubpd 0x1c(%rip), %xmm2, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; retq
@@ -695,10 +691,6 @@ block0(v0: i32x4):
 ; addb %al, (%rax)
 ; addb %al, (%rax)
 ; addb %al, (%rax)
-; addb %al, (%rax)
-; addb %al, (%rax)
-; addb %al, (%rax)
-; addb %al, (%rax)
 ; addb %dh, (%rax)
 ; addb %al, (%r8)
 ; xorb %al, (%rbx)
@@ -1283,8 +1275,7 @@ block0(v0: i16x8):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block0:
-; movdqu const(0), %xmm2
-; vpmaddwd %xmm0, %xmm2, %xmm0
+; vpmaddwd %xmm0, const(0), %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -1294,8 +1285,7 @@ block0(v0: i16x8):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block1: ; offset 0x4
-; movdqu 0x14(%rip), %xmm2
-; vpmaddwd %xmm2, %xmm0, %xmm0
+; vpmaddwd 0x14(%rip), %xmm0, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; retq
@@ -1304,6 +1294,8 @@ block0(v0: i16x8):
 ; addb %al, (%rax)
 ; addb %al, (%rax)
 ; addb %al, (%rax)
+; addb %al, (%rax)
+; addb %al, (%rax)
 ; addb %al, (%rcx)
 ; addb %al, (%rcx)
 ; addb %al, (%rcx)
@@ -1357,12 +1349,10 @@ block0(v0: f64x2):
 ; block0:
 ; xorpd %xmm2, %xmm2, %xmm2
 ; vmaxpd %xmm0, %xmm2, %xmm4
-; movupd const(0), %xmm6
-; vminpd %xmm4, %xmm6, %xmm8
-; vroundpd $3, %xmm8, %xmm10
-; movupd const(1), %xmm12
-; vaddpd %xmm10, %xmm12, %xmm14
-; vshufps $136 %xmm14, %xmm2, %xmm0
+; vminpd %xmm4, const(0), %xmm6
+; vroundpd $3, %xmm6, %xmm8
+; vaddpd %xmm8, const(1), %xmm10
+; vshufps $136 %xmm10, %xmm2, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -1374,22 +1364,17 @@ block0(v0: f64x2):
 ; block1: ; offset 0x4
 ; xorpd %xmm2, %xmm2
 ; vmaxpd %xmm2, %xmm0, %xmm4
-; movupd 0x2c(%rip), %xmm6
-; vminpd %xmm6, %xmm4, %xmm8
-; vroundpd $3, %xmm8, %xmm10
-; movupd 0x29(%rip), %xmm12
-; vaddpd %xmm12, %xmm10, %xmm14
-; vshufps $0x88, %xmm2, %xmm14, %xmm0
+; vminpd 0x1c(%rip), %xmm4, %xmm6
+; vroundpd $3, %xmm6, %xmm8
+; vaddpd 0x1e(%rip), %xmm8, %xmm10
+; vshufps $0x88, %xmm2, %xmm10, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; retq
 ; addb %al, (%rax)
 ; addb %al, (%rax)
 ; addb %al, (%rax)
-; addb %al, (%rax)
-; addb %al, (%rax)
-; addb %al, (%rax)
-; loopne 0x43
+; loopne 0x33
 
 function %i8x16_shl(i8x16, i32) -> i8x16 {
 block0(v0: i8x16, v1: i32):

View File

@@ -17,12 +17,10 @@ block0:
 ; movq %rsp, %rbp
 ; block0:
 ; movdqu const(3), %xmm0
-; movdqu const(2), %xmm4
-; movdqu const(0), %xmm2
-; pshufb %xmm0, %xmm2, %xmm0
-; movdqu const(1), %xmm6
-; pshufb %xmm4, %xmm6, %xmm4
-; por %xmm0, %xmm4, %xmm0
+; movdqu const(2), %xmm2
+; pshufb %xmm0, const(0), %xmm0
+; pshufb %xmm2, const(1), %xmm2
+; por %xmm0, %xmm2, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -32,13 +30,11 @@ block0:
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block1: ; offset 0x4
-; movdqu 0x64(%rip), %xmm0
-; movdqu 0x4c(%rip), %xmm4
-; movdqu 0x24(%rip), %xmm2
-; pshufb %xmm2, %xmm0
-; movdqu 0x27(%rip), %xmm6
-; pshufb %xmm6, %xmm4
-; por %xmm4, %xmm0
+; movdqu 0x54(%rip), %xmm0
+; movdqu 0x3c(%rip), %xmm2
+; pshufb 0x13(%rip), %xmm0
+; pshufb 0x1a(%rip), %xmm2
+; por %xmm2, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; retq
@@ -50,10 +46,6 @@ block0:
 ; addb %al, (%rax)
 ; addb %al, (%rax)
 ; addb %al, (%rax)
-; addb %al, (%rax)
-; addb %al, (%rax)
-; addb %al, (%rax)
-; addb %al, (%rax)
 ; addb $0x80, -0x7f7f7f80(%rax)
 ; addb $0x80, -0x7f7f7f80(%rax)
 ; addb $0, 0x101(%rax)
@@ -84,8 +76,7 @@ block0:
 ; movq %rsp, %rbp
 ; block0:
 ; movdqu const(1), %xmm0
-; movdqu const(0), %xmm1
-; pshufb %xmm0, %xmm1, %xmm0
+; pshufb %xmm0, const(0), %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -96,8 +87,7 @@ block0:
 ; movq %rsp, %rbp
 ; block1: ; offset 0x4
 ; movdqu 0x24(%rip), %xmm0
-; movdqu 0xc(%rip), %xmm1
-; pshufb %xmm1, %xmm0
+; pshufb 0xb(%rip), %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; retq
@@ -109,6 +99,8 @@ block0:
 ; addb %al, (%rax)
 ; addb %al, (%rax)
 ; addb %al, (%rax)
+; addb %al, (%rax)
+; addb %al, (%rax)
 ; addb %al, (%rcx, %rax)
 ; addb %al, (%rax)
 ; addb %al, (%rax)
@@ -131,10 +123,9 @@ block0:
 ; movq %rsp, %rbp
 ; block0:
 ; movdqu const(1), %xmm0
-; movdqu const(1), %xmm2
-; movdqu const(0), %xmm3
-; paddusb %xmm2, %xmm3, %xmm2
-; pshufb %xmm0, %xmm2, %xmm0
+; movdqu const(1), %xmm1
+; paddusb %xmm1, const(0), %xmm1
+; pshufb %xmm0, %xmm1, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -145,16 +136,17 @@ block0:
 ; movq %rsp, %rbp
 ; block1: ; offset 0x4
 ; movdqu 0x34(%rip), %xmm0
-; movdqu 0x2c(%rip), %xmm2
-; movdqu 0x14(%rip), %xmm3
-; paddusb %xmm3, %xmm2
-; pshufb %xmm2, %xmm0
+; movdqu 0x2c(%rip), %xmm1
+; paddusb 0x14(%rip), %xmm1
+; pshufb %xmm1, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; retq
 ; addb %al, (%rax)
 ; addb %al, (%rax)
 ; addb %al, (%rax)
+; addb %al, (%rax)
+; addb %al, (%rax)
 ; jo 0xa2
 ; jo 0xa4
 ; jo 0xa6

View File

@@ -55,8 +55,7 @@ block0(v0: i16x8):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block0:
-; movdqu const(0), %xmm2
-; pmaddwd %xmm0, %xmm2, %xmm0
+; pmaddwd %xmm0, const(0), %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -66,8 +65,7 @@ block0(v0: i16x8):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block1: ; offset 0x4
-; movdqu 0x14(%rip), %xmm2
-; pmaddwd %xmm2, %xmm0
+; pmaddwd 0x14(%rip), %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; retq
@@ -76,6 +74,8 @@ block0(v0: i16x8):
 ; addb %al, (%rax)
 ; addb %al, (%rax)
 ; addb %al, (%rax)
+; addb %al, (%rax)
+; addb %al, (%rax)
 ; addb %al, (%rcx)
 ; addb %al, (%rcx)
 ; addb %al, (%rcx)
@@ -97,8 +97,7 @@ block0(v0: i8x16):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block0:
-; movdqu const(0), %xmm2
-; pmaddubsw %xmm0, %xmm2, %xmm0
+; pmaddubsw %xmm0, const(0), %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -108,8 +107,7 @@ block0(v0: i8x16):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block1: ; offset 0x4
-; movdqu 0x14(%rip), %xmm2
-; pmaddubsw %xmm2, %xmm0
+; pmaddubsw 0x13(%rip), %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; retq
@@ -118,6 +116,8 @@ block0(v0: i8x16):
 ; addb %al, (%rax)
 ; addb %al, (%rax)
 ; addb %al, (%rax)
+; addb %al, (%rax)
+; addb %al, (%rax)
 ; addl %eax, (%rcx)
 ; addl %eax, (%rcx)
 ; addl %eax, (%rcx)
@@ -139,12 +139,9 @@ block0(v0: i16x8):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block0:
-; movdqu const(0), %xmm2
-; pxor %xmm0, %xmm2, %xmm0
-; movdqu const(1), %xmm6
-; pmaddwd %xmm0, %xmm6, %xmm0
-; movdqu const(2), %xmm10
-; paddd %xmm0, %xmm10, %xmm0
+; pxor %xmm0, const(0), %xmm0
+; pmaddwd %xmm0, const(1), %xmm0
+; paddd %xmm0, const(2), %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -154,16 +151,20 @@ block0(v0: i16x8):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block1: ; offset 0x4
-; movdqu 0x24(%rip), %xmm2
-; pxor %xmm2, %xmm0
-; movdqu 0x28(%rip), %xmm6
-; pmaddwd %xmm6, %xmm0
-; movdqu 0x2b(%rip), %xmm10
-; paddd %xmm10, %xmm0
+; pxor 0x24(%rip), %xmm0
+; pmaddwd 0x2c(%rip), %xmm0
+; paddd 0x34(%rip), %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; retq
 ; addb %al, (%rax)
+; addb %al, (%rax)
+; addb %al, (%rax)
+; addb %al, (%rax)
+; addb %al, (%rax)
+; addb %al, (%rax)
+; addb %al, (%rax)
+; addb %al, (%rax)
 ; addb $0x80, (%rax)
 ; addb %al, -0x7fff8000(%rax)
 ; addb %al, -0x7fff8000(%rax)

View File

@@ -11,9 +11,9 @@ block0(v0: i16x8, v1: i16x8):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block0:
-; movdqu const(0), %xmm5
 ; pmulhrsw %xmm0, %xmm1, %xmm0
-; pcmpeqw %xmm5, %xmm0, %xmm5
+; movdqa %xmm0, %xmm5
+; pcmpeqw %xmm5, const(0), %xmm5
 ; pxor %xmm0, %xmm5, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
@@ -24,9 +24,9 @@ block0(v0: i16x8, v1: i16x8):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block1: ; offset 0x4
-; movdqu 0x14(%rip), %xmm5
 ; pmulhrsw %xmm1, %xmm0
-; pcmpeqw %xmm0, %xmm5
+; movdqa %xmm0, %xmm5
+; pcmpeqw 0xb(%rip), %xmm5
 ; pxor %xmm5, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp

View File

@@ -14,13 +14,11 @@ block0(v0: f64x2):
 ; movq %rsp, %rbp
 ; block0:
 ; xorpd %xmm2, %xmm2, %xmm2
-; movdqa %xmm0, %xmm6
-; maxpd %xmm6, %xmm2, %xmm6
-; movupd const(0), %xmm7
-; minpd %xmm6, %xmm7, %xmm6
-; roundpd $3, %xmm6, %xmm0
-; movupd const(1), %xmm12
-; addpd %xmm0, %xmm12, %xmm0
+; movdqa %xmm0, %xmm5
+; maxpd %xmm5, %xmm2, %xmm5
+; minpd %xmm5, const(0), %xmm5
+; roundpd $3, %xmm5, %xmm0
+; addpd %xmm0, const(1), %xmm0
 ; shufps $136, %xmm0, %xmm2, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
@@ -32,20 +30,15 @@ block0(v0: f64x2):
 ; movq %rsp, %rbp
 ; block1: ; offset 0x4
 ; xorpd %xmm2, %xmm2
-; movdqa %xmm0, %xmm6
-; maxpd %xmm2, %xmm6
-; movupd 0x28(%rip), %xmm7
-; minpd %xmm7, %xmm6
-; roundpd $3, %xmm6, %xmm0
-; movupd 0x25(%rip), %xmm12
-; addpd %xmm12, %xmm0
+; movdqa %xmm0, %xmm5
+; maxpd %xmm2, %xmm5
+; minpd 0x18(%rip), %xmm5
+; roundpd $3, %xmm5, %xmm0
+; addpd 0x1a(%rip), %xmm0
 ; shufps $0x88, %xmm2, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; retq
 ; addb %al, (%rax)
-; addb %al, (%rax)
-; addb %al, (%rax)
-; addb %al, (%rax)
 ; addb %ah, %al