From f2dce812c3ee5d7fae416e7acd0cfefce6708a97 Mon Sep 17 00:00:00 2001 From: Alex Crichton Date: Mon, 27 Feb 2023 16:02:42 -0600 Subject: [PATCH] x64: Sink constant loads into xmm instructions (#5880) A number of places in the x64 backend make use of 128-bit constants for various wasm SIMD-related instructions although most of them currently use the `x64_xmm_load_const` helper to load the constant into a register. Almost all xmm instructions, however, enable using a memory operand which means that these loads can be folded into instructions to help reduce register pressure. Automatic conversions were added for a `VCodeConstant` into an `XmmMem` value and then explicit loads were all removed in favor of forwarding the `XmmMem` value directly to the underlying instruction. Note that some instances of `x64_xmm_load_const` remain since they're used in contexts where load sinking won't work (e.g. they're the first operand, not the second for non-commutative instructions). --- cranelift/codegen/src/isa/x64/inst.isle | 4 ++ cranelift/codegen/src/isa/x64/lower.isle | 40 +++++------- .../filetests/filetests/isa/x64/fcvt.clif | 16 ++--- .../filetests/isa/x64/float-avx.clif | 21 ++---- .../filetests/isa/x64/narrowing.clif | 31 ++++----- .../filetests/isa/x64/shuffle-avx512.clif | 26 ++++---- .../filetests/isa/x64/simd-arith-avx.clif | 65 +++++++------------ .../isa/x64/simd-lane-access-compile.clif | 50 ++++++-------- .../filetests/isa/x64/simd-pairwise-add.clif | 41 ++++++------ .../filetests/isa/x64/sqmul_round_sat.clif | 8 +-- .../filetests/filetests/isa/x64/uunarrow.clif | 27 +++----- 11 files changed, 147 insertions(+), 182 deletions(-) diff --git a/cranelift/codegen/src/isa/x64/inst.isle b/cranelift/codegen/src/isa/x64/inst.isle index 34e3145c5f..d25cc7efc2 100644 --- a/cranelift/codegen/src/isa/x64/inst.isle +++ b/cranelift/codegen/src/isa/x64/inst.isle @@ -4489,6 +4489,8 @@ (convert SyntheticAmode XmmMem synthetic_amode_to_xmm_mem) (convert Amode XmmMemAligned amode_to_xmm_mem_aligned) (convert SyntheticAmode XmmMemAligned synthetic_amode_to_xmm_mem_aligned) +(convert VCodeConstant SyntheticAmode const_to_synthetic_amode) +(convert VCodeConstant XmmMem const_to_xmm_mem) (convert IntCC CC intcc_to_cc) (convert AtomicRmwOp MachAtomicRmwOp atomic_rmw_op_to_mach_atomic_rmw_op) @@ -4537,6 +4539,8 @@ (synthetic_amode_to_reg_mem amode)) (decl const_to_synthetic_amode (VCodeConstant) SyntheticAmode) (extern constructor const_to_synthetic_amode const_to_synthetic_amode) +(decl const_to_xmm_mem (VCodeConstant) XmmMem) +(rule (const_to_xmm_mem c) (const_to_synthetic_amode c)) (decl xmm_to_xmm_mem_aligned (Xmm) XmmMemAligned) (rule (xmm_to_xmm_mem_aligned reg) (xmm_mem_to_xmm_mem_aligned reg)) diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle index 84f9f3ef80..40b40f9b9f 100644 --- a/cranelift/codegen/src/isa/x64/lower.isle +++ b/cranelift/codegen/src/isa/x64/lower.isle @@ -1908,7 +1908,7 @@ (rule (lower (has_type $I8X16 (popcnt src))) (let ((nibble_table_const VCodeConstant (popcount_4bit_table)) - (low_mask Xmm (x64_xmm_load_const $I8X16 (popcount_low_mask))) + (low_mask XmmMem (popcount_low_mask)) (low_nibbles Xmm (sse_and $I8X16 src low_mask)) ;; Note that this is a 16x8 shift, but that's OK; we mask ;; off anything that traverses from one byte to the next @@ -2984,9 +2984,9 @@ ;; every value of the mantissa represents a corresponding uint32 number. ;; When we subtract 0x1.0p52 we are left with double(src). 
(rule 1 (lower (has_type $F64X2 (fcvt_from_uint (uwiden_low val @ (value_type $I32X4))))) - (let ((uint_mask Xmm (x64_xmm_load_const $I32X4 (fcvt_uint_mask_const))) + (let ((uint_mask XmmMem (fcvt_uint_mask_const)) (res Xmm (x64_unpcklps val uint_mask)) - (uint_mask_high Xmm (x64_xmm_load_const $I32X4 (fcvt_uint_mask_high_const)))) + (uint_mask_high XmmMem (fcvt_uint_mask_high_const))) (x64_subpd res uint_mask_high))) ;; When AVX512VL and AVX512F are available, @@ -3186,7 +3186,7 @@ (has_type $I32X4 (iadd_pairwise (swiden_low val @ (value_type $I16X8)) (swiden_high val)))) - (let ((mul_const Xmm (x64_xmm_load_const $I16X8 (iadd_pairwise_mul_const_32)))) + (let ((mul_const XmmMem (iadd_pairwise_mul_const_32))) (x64_pmaddwd val mul_const))) ;; special case for the `i16x8.extadd_pairwise_i8x16_u` wasm instruction @@ -3194,7 +3194,7 @@ (has_type $I16X8 (iadd_pairwise (uwiden_low val @ (value_type $I8X16)) (uwiden_high val)))) - (let ((mul_const Xmm (x64_xmm_load_const $I8X16 (iadd_pairwise_mul_const_16)))) + (let ((mul_const XmmMem (iadd_pairwise_mul_const_16))) (x64_pmaddubsw val mul_const))) ;; special case for the `i32x4.extadd_pairwise_i16x8_u` wasm instruction @@ -3202,13 +3202,13 @@ (has_type $I32X4 (iadd_pairwise (uwiden_low val @ (value_type $I16X8)) (uwiden_high val)))) - (let ((xor_const Xmm (x64_xmm_load_const $I16X8 (iadd_pairwise_xor_const_32))) + (let ((xor_const XmmMem (iadd_pairwise_xor_const_32)) (dst Xmm (x64_pxor val xor_const)) - (madd_const Xmm (x64_xmm_load_const $I16X8 (iadd_pairwise_mul_const_32))) + (madd_const XmmMem (iadd_pairwise_mul_const_32)) (dst Xmm (x64_pmaddwd dst madd_const)) - (addd_const Xmm (x64_xmm_load_const $I16X8 (iadd_pairwise_addd_const_32)))) + (addd_const XmmMem (iadd_pairwise_addd_const_32))) (x64_paddd dst addd_const))) ;; special case for the `i32x4.dot_i16x8_s` wasm instruction @@ -3293,7 +3293,7 @@ ;; CVTTPD2DQ xmm_y, xmm_y (tmp1 Xmm (x64_cmppd a a (FcmpImm.Equal))) - (umax_mask Xmm (x64_xmm_load_const $F64X2 (snarrow_umax_mask))) + (umax_mask XmmMem (snarrow_umax_mask)) ;; ANDPD xmm_y, [wasm_f64x2_splat(2147483647.0)] (tmp1 Xmm (x64_andps tmp1 umax_mask)) @@ -3509,7 +3509,7 @@ ;; indices (may not be completely necessary: verification could fail incorrect ;; mask values) and fix the indexes to all point to the `dst` vector. (rule 3 (lower (shuffle a a (vec_mask_from_immediate mask))) - (x64_pshufb a (x64_xmm_load_const $I8X16 (shuffle_0_31_mask mask)))) + (x64_pshufb a (shuffle_0_31_mask mask))) ;; For the case where the shuffle mask contains out-of-bounds values (values ;; greater than 31) we must mask off those resulting values in the result of @@ -3517,9 +3517,7 @@ (rule 2 (lower (has_type (and (avx512vl_enabled $true) (avx512vbmi_enabled $true)) (shuffle a b (vec_mask_from_immediate (perm_from_mask_with_zeros mask zeros))))) - (x64_andps - (x64_xmm_load_const $I8X16 zeros) - (x64_vpermi2b b a (x64_xmm_load_const $I8X16 mask)))) + (x64_andps (x64_vpermi2b b a (x64_xmm_load_const $I8X16 mask)) zeros)) ;; However, if the shuffle mask contains no out-of-bounds values, we can use ;; `vpermi2b` without any masking. @@ -3532,8 +3530,8 @@ ;; above, we build the `constructed_mask` for each case statically. 
(rule (lower (shuffle a b (vec_mask_from_immediate mask))) (x64_por - (x64_pshufb a (x64_xmm_load_const $I8X16 (shuffle_0_15_mask mask))) - (x64_pshufb b (x64_xmm_load_const $I8X16 (shuffle_16_31_mask mask))))) + (x64_pshufb a (shuffle_0_15_mask mask)) + (x64_pshufb b (shuffle_16_31_mask mask)))) ;; Rules for `swizzle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -3544,9 +3542,7 @@ ;; Wasm SIMD semantics for this instruction. The instruction format maps to ;; variables like: %dst = swizzle %src, %mask (rule (lower (swizzle src mask)) - (let ((mask Xmm (x64_paddusb - mask - (x64_xmm_load_const $I8X16 (swizzle_zero_mask))))) + (let ((mask Xmm (x64_paddusb mask (swizzle_zero_mask)))) (x64_pshufb src mask))) ;; Rules for `extractlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -3721,9 +3717,9 @@ (let ((src1 Xmm qx) (src2 Xmm qy) - (mask Xmm (x64_xmm_load_const $I16X8 (sqmul_round_sat_mask))) + (mask XmmMem (sqmul_round_sat_mask)) (dst Xmm (x64_pmulhrsw src1 src2)) - (cmp Xmm (x64_pcmpeqw mask dst))) + (cmp Xmm (x64_pcmpeqw dst mask))) (x64_pxor dst cmp))) ;; Rules for `sqmul_round_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -3749,7 +3745,7 @@ (zeros Xmm (xmm_zero $F64X2)) (dst Xmm (x64_maxpd src zeros)) - (umax_mask Xmm (x64_xmm_load_const $F64X2 (uunarrow_umax_mask))) + (umax_mask XmmMem (uunarrow_umax_mask)) ;; MINPD xmm_y, [wasm_f64x2_splat(4294967295.0)] (dst Xmm (x64_minpd dst umax_mask)) @@ -3758,7 +3754,7 @@ (dst Xmm (x64_roundpd dst (RoundImm.RoundZero))) ;; ADDPD xmm_y, [wasm_f64x2_splat(0x1.0p+52)] - (uint_mask Xmm (x64_xmm_load_const $F64X2 (uunarrow_uint_mask))) + (uint_mask XmmMem (uunarrow_uint_mask)) (dst Xmm (x64_addpd dst uint_mask))) ;; SHUFPS xmm_y, xmm_xmp, 0x88 diff --git a/cranelift/filetests/filetests/isa/x64/fcvt.clif b/cranelift/filetests/filetests/isa/x64/fcvt.clif index f5d366095f..aee96700a8 100644 --- a/cranelift/filetests/filetests/isa/x64/fcvt.clif +++ b/cranelift/filetests/filetests/isa/x64/fcvt.clif @@ -304,10 +304,8 @@ block0(v0: i32x4): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; movdqu const(0), %xmm2 -; unpcklps %xmm0, %xmm2, %xmm0 -; movdqu const(1), %xmm6 -; subpd %xmm0, %xmm6, %xmm0 +; unpcklps %xmm0, const(0), %xmm0 +; subpd %xmm0, const(1), %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -317,14 +315,16 @@ block0(v0: i32x4): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; movdqu 0x14(%rip), %xmm2 -; unpcklps %xmm2, %xmm0 -; movdqu 0x19(%rip), %xmm6 -; subpd %xmm6, %xmm0 +; unpcklps 0x15(%rip), %xmm0 +; subpd 0x1d(%rip), %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq ; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) ; xorb %al, (%rbx) ; addb %dh, (%rax) ; addb %al, (%r8) diff --git a/cranelift/filetests/filetests/isa/x64/float-avx.clif b/cranelift/filetests/filetests/isa/x64/float-avx.clif index 8626b34757..6776bb529a 100644 --- a/cranelift/filetests/filetests/isa/x64/float-avx.clif +++ b/cranelift/filetests/filetests/isa/x64/float-avx.clif @@ -566,10 +566,9 @@ block0(v0: f64x2): ; movq %rsp, %rbp ; block0: ; vcmppd $0 %xmm0, %xmm0, %xmm2 -; movupd const(0), %xmm4 -; vandps %xmm2, %xmm4, %xmm6 -; vminpd %xmm0, %xmm6, %xmm8 -; vcvttpd2dq %xmm8, %xmm0 +; vandps %xmm2, const(0), %xmm4 +; vminpd %xmm0, %xmm4, %xmm6 +; vcvttpd2dq %xmm6, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -580,19 +579,13 @@ block0(v0: f64x2): ; movq %rsp, %rbp ; block1: ; offset 0x4 ; vcmpeqpd %xmm0, %xmm0, %xmm2 -; movupd 0x1f(%rip), %xmm4 -; vandps %xmm4, %xmm2, %xmm6 -; vminpd %xmm6, 
%xmm0, %xmm8 -; vcvttpd2dq %xmm8, %xmm0 +; vandps 0xf(%rip), %xmm2, %xmm4 +; vminpd %xmm4, %xmm0, %xmm6 +; vcvttpd2dq %xmm6, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq ; addb %al, (%rax) ; addb %al, (%rax) -; addb %al, (%rax) -; addb %al, (%rax) -; addb %al, (%rax) -; addb %al, (%rax) -; addb %al, (%rax) -; addb %al, %al +; sarb $0xff, %bh diff --git a/cranelift/filetests/filetests/isa/x64/narrowing.clif b/cranelift/filetests/filetests/isa/x64/narrowing.clif index 2e9025d5e7..5eba163264 100644 --- a/cranelift/filetests/filetests/isa/x64/narrowing.clif +++ b/cranelift/filetests/filetests/isa/x64/narrowing.clif @@ -63,13 +63,12 @@ block0(v0: f64x2): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; movdqa %xmm0, %xmm4 -; cmppd $0, %xmm4, %xmm0, %xmm4 -; movupd const(0), %xmm5 -; andps %xmm4, %xmm5, %xmm4 -; movdqa %xmm0, %xmm8 -; minpd %xmm8, %xmm4, %xmm8 -; cvttpd2dq %xmm8, %xmm0 +; movdqa %xmm0, %xmm3 +; cmppd $0, %xmm3, %xmm0, %xmm3 +; andps %xmm3, const(0), %xmm3 +; movdqa %xmm0, %xmm6 +; minpd %xmm6, %xmm3, %xmm6 +; cvttpd2dq %xmm6, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -79,20 +78,22 @@ block0(v0: f64x2): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; movdqa %xmm0, %xmm4 -; cmpeqpd %xmm0, %xmm4 -; movupd 0x1b(%rip), %xmm5 -; andps %xmm5, %xmm4 -; movdqa %xmm0, %xmm8 -; minpd %xmm4, %xmm8 -; cvttpd2dq %xmm8, %xmm0 +; movdqa %xmm0, %xmm3 +; cmpeqpd %xmm0, %xmm3 +; andps 0x1c(%rip), %xmm3 +; movdqa %xmm0, %xmm6 +; minpd %xmm3, %xmm6 +; cvttpd2dq %xmm6, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq ; addb %al, (%rax) ; addb %al, (%rax) ; addb %al, (%rax) -; sarb $0xff, %bh +; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, %al function %f4(i16x8, i16x8) -> i8x16 { block0(v0: i16x8, v1: i16x8): diff --git a/cranelift/filetests/filetests/isa/x64/shuffle-avx512.clif b/cranelift/filetests/filetests/isa/x64/shuffle-avx512.clif index c7bbf96e44..6f63010491 100644 --- a/cranelift/filetests/filetests/isa/x64/shuffle-avx512.clif +++ b/cranelift/filetests/filetests/isa/x64/shuffle-avx512.clif @@ -55,12 +55,11 @@ block0(v0: i8x16, v1: i8x16): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; movdqa %xmm0, %xmm7 -; movdqu const(1), %xmm0 -; movdqu const(0), %xmm6 -; movdqa %xmm7, %xmm9 -; vpermi2b %xmm1, %xmm9, %xmm6, %xmm6 -; andps %xmm0, %xmm6, %xmm0 +; movdqa %xmm0, %xmm6 +; movdqu const(0), %xmm0 +; movdqa %xmm6, %xmm7 +; vpermi2b %xmm1, %xmm7, %xmm0, %xmm0 +; andps %xmm0, const(1), %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -70,12 +69,11 @@ block0(v0: i8x16, v1: i8x16): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; movdqa %xmm0, %xmm7 -; movdqu 0x30(%rip), %xmm0 -; movdqu 0x18(%rip), %xmm6 -; movdqa %xmm7, %xmm9 -; vpermi2b %xmm1, %xmm9, %xmm6 -; andps %xmm6, %xmm0 +; movdqa %xmm0, %xmm6 +; movdqu 0x20(%rip), %xmm0 +; movdqa %xmm6, %xmm7 +; vpermi2b %xmm1, %xmm7, %xmm0 +; andps 0x1f(%rip), %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq @@ -89,7 +87,9 @@ block0(v0: i8x16, v1: i8x16): ; addb %al, (%rax) ; addb %al, (%rax) ; addb %al, (%rax) -; cmpb $0xff, %bh +; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, -1(%rax) function %f3(i8x16, i8x16) -> i8x16 { block0(v0: i8x16, v1: i8x16): diff --git a/cranelift/filetests/filetests/isa/x64/simd-arith-avx.clif b/cranelift/filetests/filetests/isa/x64/simd-arith-avx.clif index 3232802602..3e622bd5f9 100644 --- a/cranelift/filetests/filetests/isa/x64/simd-arith-avx.clif +++ b/cranelift/filetests/filetests/isa/x64/simd-arith-avx.clif @@ -574,10 +574,9 @@ block0(v0: i16x8, v1: i16x8): ; pushq %rbp ; movq %rsp, %rbp ; block0: 
-; movdqu const(0), %xmm3 -; vpmulhrsw %xmm0, %xmm1, %xmm5 -; vpcmpeqw %xmm3, %xmm5, %xmm7 -; vpxor %xmm5, %xmm7, %xmm0 +; vpmulhrsw %xmm0, %xmm1, %xmm3 +; vpcmpeqw %xmm3, const(0), %xmm5 +; vpxor %xmm3, %xmm5, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -587,14 +586,15 @@ block0(v0: i16x8, v1: i16x8): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; movdqu 0x14(%rip), %xmm3 -; vpmulhrsw %xmm1, %xmm0, %xmm5 -; vpcmpeqw %xmm5, %xmm3, %xmm7 -; vpxor %xmm7, %xmm5, %xmm0 +; vpmulhrsw %xmm1, %xmm0, %xmm3 +; vpcmpeqw 0xf(%rip), %xmm3, %xmm5 +; vpxor %xmm5, %xmm3, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq ; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) ; addb %al, -0x7fff8000(%rax) ; addb %al, -0x7fff8000(%rax) @@ -671,10 +671,8 @@ block0(v0: i32x4): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; movdqu const(0), %xmm2 -; vunpcklps %xmm0, %xmm2, %xmm4 -; movdqu const(1), %xmm6 -; vsubpd %xmm4, %xmm6, %xmm0 +; vunpcklps %xmm0, const(0), %xmm2 +; vsubpd %xmm2, const(1), %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -684,10 +682,8 @@ block0(v0: i32x4): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; movdqu 0x24(%rip), %xmm2 -; vunpcklps %xmm2, %xmm0, %xmm4 -; movdqu 0x28(%rip), %xmm6 -; vsubpd %xmm6, %xmm4, %xmm0 +; vunpcklps 0x14(%rip), %xmm0, %xmm2 +; vsubpd 0x1c(%rip), %xmm2, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq @@ -695,10 +691,6 @@ block0(v0: i32x4): ; addb %al, (%rax) ; addb %al, (%rax) ; addb %al, (%rax) -; addb %al, (%rax) -; addb %al, (%rax) -; addb %al, (%rax) -; addb %al, (%rax) ; addb %dh, (%rax) ; addb %al, (%r8) ; xorb %al, (%rbx) @@ -1283,8 +1275,7 @@ block0(v0: i16x8): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; movdqu const(0), %xmm2 -; vpmaddwd %xmm0, %xmm2, %xmm0 +; vpmaddwd %xmm0, const(0), %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -1294,8 +1285,7 @@ block0(v0: i16x8): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; movdqu 0x14(%rip), %xmm2 -; vpmaddwd %xmm2, %xmm0, %xmm0 +; vpmaddwd 0x14(%rip), %xmm0, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq @@ -1304,6 +1294,8 @@ block0(v0: i16x8): ; addb %al, (%rax) ; addb %al, (%rax) ; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) ; addb %al, (%rcx) ; addb %al, (%rcx) ; addb %al, (%rcx) @@ -1357,12 +1349,10 @@ block0(v0: f64x2): ; block0: ; xorpd %xmm2, %xmm2, %xmm2 ; vmaxpd %xmm0, %xmm2, %xmm4 -; movupd const(0), %xmm6 -; vminpd %xmm4, %xmm6, %xmm8 -; vroundpd $3, %xmm8, %xmm10 -; movupd const(1), %xmm12 -; vaddpd %xmm10, %xmm12, %xmm14 -; vshufps $136 %xmm14, %xmm2, %xmm0 +; vminpd %xmm4, const(0), %xmm6 +; vroundpd $3, %xmm6, %xmm8 +; vaddpd %xmm8, const(1), %xmm10 +; vshufps $136 %xmm10, %xmm2, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -1374,22 +1364,17 @@ block0(v0: f64x2): ; block1: ; offset 0x4 ; xorpd %xmm2, %xmm2 ; vmaxpd %xmm2, %xmm0, %xmm4 -; movupd 0x2c(%rip), %xmm6 -; vminpd %xmm6, %xmm4, %xmm8 -; vroundpd $3, %xmm8, %xmm10 -; movupd 0x29(%rip), %xmm12 -; vaddpd %xmm12, %xmm10, %xmm14 -; vshufps $0x88, %xmm2, %xmm14, %xmm0 +; vminpd 0x1c(%rip), %xmm4, %xmm6 +; vroundpd $3, %xmm6, %xmm8 +; vaddpd 0x1e(%rip), %xmm8, %xmm10 +; vshufps $0x88, %xmm2, %xmm10, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq ; addb %al, (%rax) ; addb %al, (%rax) ; addb %al, (%rax) -; addb %al, (%rax) -; addb %al, (%rax) -; addb %al, (%rax) -; loopne 0x43 +; loopne 0x33 function %i8x16_shl(i8x16, i32) -> i8x16 { block0(v0: i8x16, v1: i32): diff --git a/cranelift/filetests/filetests/isa/x64/simd-lane-access-compile.clif b/cranelift/filetests/filetests/isa/x64/simd-lane-access-compile.clif index 
790fde063f..b6f75d7792 100644 --- a/cranelift/filetests/filetests/isa/x64/simd-lane-access-compile.clif +++ b/cranelift/filetests/filetests/isa/x64/simd-lane-access-compile.clif @@ -17,12 +17,10 @@ block0: ; movq %rsp, %rbp ; block0: ; movdqu const(3), %xmm0 -; movdqu const(2), %xmm4 -; movdqu const(0), %xmm2 -; pshufb %xmm0, %xmm2, %xmm0 -; movdqu const(1), %xmm6 -; pshufb %xmm4, %xmm6, %xmm4 -; por %xmm0, %xmm4, %xmm0 +; movdqu const(2), %xmm2 +; pshufb %xmm0, const(0), %xmm0 +; pshufb %xmm2, const(1), %xmm2 +; por %xmm0, %xmm2, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -32,13 +30,11 @@ block0: ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; movdqu 0x64(%rip), %xmm0 -; movdqu 0x4c(%rip), %xmm4 -; movdqu 0x24(%rip), %xmm2 -; pshufb %xmm2, %xmm0 -; movdqu 0x27(%rip), %xmm6 -; pshufb %xmm6, %xmm4 -; por %xmm4, %xmm0 +; movdqu 0x54(%rip), %xmm0 +; movdqu 0x3c(%rip), %xmm2 +; pshufb 0x13(%rip), %xmm0 +; pshufb 0x1a(%rip), %xmm2 +; por %xmm2, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq @@ -50,10 +46,6 @@ block0: ; addb %al, (%rax) ; addb %al, (%rax) ; addb %al, (%rax) -; addb %al, (%rax) -; addb %al, (%rax) -; addb %al, (%rax) -; addb %al, (%rax) ; addb $0x80, -0x7f7f7f80(%rax) ; addb $0x80, -0x7f7f7f80(%rax) ; addb $0, 0x101(%rax) @@ -84,8 +76,7 @@ block0: ; movq %rsp, %rbp ; block0: ; movdqu const(1), %xmm0 -; movdqu const(0), %xmm1 -; pshufb %xmm0, %xmm1, %xmm0 +; pshufb %xmm0, const(0), %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -96,8 +87,7 @@ block0: ; movq %rsp, %rbp ; block1: ; offset 0x4 ; movdqu 0x24(%rip), %xmm0 -; movdqu 0xc(%rip), %xmm1 -; pshufb %xmm1, %xmm0 +; pshufb 0xb(%rip), %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq @@ -109,6 +99,8 @@ block0: ; addb %al, (%rax) ; addb %al, (%rax) ; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) ; addb %al, (%rcx, %rax) ; addb %al, (%rax) ; addb %al, (%rax) @@ -131,10 +123,9 @@ block0: ; movq %rsp, %rbp ; block0: ; movdqu const(1), %xmm0 -; movdqu const(1), %xmm2 -; movdqu const(0), %xmm3 -; paddusb %xmm2, %xmm3, %xmm2 -; pshufb %xmm0, %xmm2, %xmm0 +; movdqu const(1), %xmm1 +; paddusb %xmm1, const(0), %xmm1 +; pshufb %xmm0, %xmm1, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -145,16 +136,17 @@ block0: ; movq %rsp, %rbp ; block1: ; offset 0x4 ; movdqu 0x34(%rip), %xmm0 -; movdqu 0x2c(%rip), %xmm2 -; movdqu 0x14(%rip), %xmm3 -; paddusb %xmm3, %xmm2 -; pshufb %xmm2, %xmm0 +; movdqu 0x2c(%rip), %xmm1 +; paddusb 0x14(%rip), %xmm1 +; pshufb %xmm1, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq ; addb %al, (%rax) ; addb %al, (%rax) ; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) ; jo 0xa2 ; jo 0xa4 ; jo 0xa6 diff --git a/cranelift/filetests/filetests/isa/x64/simd-pairwise-add.clif b/cranelift/filetests/filetests/isa/x64/simd-pairwise-add.clif index 6f3698c61b..1838ddacdd 100644 --- a/cranelift/filetests/filetests/isa/x64/simd-pairwise-add.clif +++ b/cranelift/filetests/filetests/isa/x64/simd-pairwise-add.clif @@ -55,8 +55,7 @@ block0(v0: i16x8): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; movdqu const(0), %xmm2 -; pmaddwd %xmm0, %xmm2, %xmm0 +; pmaddwd %xmm0, const(0), %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -66,8 +65,7 @@ block0(v0: i16x8): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; movdqu 0x14(%rip), %xmm2 -; pmaddwd %xmm2, %xmm0 +; pmaddwd 0x14(%rip), %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq @@ -76,6 +74,8 @@ block0(v0: i16x8): ; addb %al, (%rax) ; addb %al, (%rax) ; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) ; addb %al, (%rcx) ; addb %al, (%rcx) ; addb %al, (%rcx) @@ -97,8 +97,7 @@ 
block0(v0: i8x16): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; movdqu const(0), %xmm2 -; pmaddubsw %xmm0, %xmm2, %xmm0 +; pmaddubsw %xmm0, const(0), %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -108,8 +107,7 @@ block0(v0: i8x16): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; movdqu 0x14(%rip), %xmm2 -; pmaddubsw %xmm2, %xmm0 +; pmaddubsw 0x13(%rip), %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq @@ -118,6 +116,8 @@ block0(v0: i8x16): ; addb %al, (%rax) ; addb %al, (%rax) ; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) ; addl %eax, (%rcx) ; addl %eax, (%rcx) ; addl %eax, (%rcx) @@ -139,12 +139,9 @@ block0(v0: i16x8): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; movdqu const(0), %xmm2 -; pxor %xmm0, %xmm2, %xmm0 -; movdqu const(1), %xmm6 -; pmaddwd %xmm0, %xmm6, %xmm0 -; movdqu const(2), %xmm10 -; paddd %xmm0, %xmm10, %xmm0 +; pxor %xmm0, const(0), %xmm0 +; pmaddwd %xmm0, const(1), %xmm0 +; paddd %xmm0, const(2), %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -154,16 +151,20 @@ block0(v0: i16x8): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; movdqu 0x24(%rip), %xmm2 -; pxor %xmm2, %xmm0 -; movdqu 0x28(%rip), %xmm6 -; pmaddwd %xmm6, %xmm0 -; movdqu 0x2b(%rip), %xmm10 -; paddd %xmm10, %xmm0 +; pxor 0x24(%rip), %xmm0 +; pmaddwd 0x2c(%rip), %xmm0 +; paddd 0x34(%rip), %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq ; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) ; addb $0x80, (%rax) ; addb %al, -0x7fff8000(%rax) ; addb %al, -0x7fff8000(%rax) diff --git a/cranelift/filetests/filetests/isa/x64/sqmul_round_sat.clif b/cranelift/filetests/filetests/isa/x64/sqmul_round_sat.clif index d9241b6047..fdf7cee8f9 100644 --- a/cranelift/filetests/filetests/isa/x64/sqmul_round_sat.clif +++ b/cranelift/filetests/filetests/isa/x64/sqmul_round_sat.clif @@ -11,9 +11,9 @@ block0(v0: i16x8, v1: i16x8): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; movdqu const(0), %xmm5 ; pmulhrsw %xmm0, %xmm1, %xmm0 -; pcmpeqw %xmm5, %xmm0, %xmm5 +; movdqa %xmm0, %xmm5 +; pcmpeqw %xmm5, const(0), %xmm5 ; pxor %xmm0, %xmm5, %xmm0 ; movq %rbp, %rsp ; popq %rbp @@ -24,9 +24,9 @@ block0(v0: i16x8, v1: i16x8): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; movdqu 0x14(%rip), %xmm5 ; pmulhrsw %xmm1, %xmm0 -; pcmpeqw %xmm0, %xmm5 +; movdqa %xmm0, %xmm5 +; pcmpeqw 0xb(%rip), %xmm5 ; pxor %xmm5, %xmm0 ; movq %rbp, %rsp ; popq %rbp diff --git a/cranelift/filetests/filetests/isa/x64/uunarrow.clif b/cranelift/filetests/filetests/isa/x64/uunarrow.clif index b65eb2643a..643b32f8e7 100644 --- a/cranelift/filetests/filetests/isa/x64/uunarrow.clif +++ b/cranelift/filetests/filetests/isa/x64/uunarrow.clif @@ -14,13 +14,11 @@ block0(v0: f64x2): ; movq %rsp, %rbp ; block0: ; xorpd %xmm2, %xmm2, %xmm2 -; movdqa %xmm0, %xmm6 -; maxpd %xmm6, %xmm2, %xmm6 -; movupd const(0), %xmm7 -; minpd %xmm6, %xmm7, %xmm6 -; roundpd $3, %xmm6, %xmm0 -; movupd const(1), %xmm12 -; addpd %xmm0, %xmm12, %xmm0 +; movdqa %xmm0, %xmm5 +; maxpd %xmm5, %xmm2, %xmm5 +; minpd %xmm5, const(0), %xmm5 +; roundpd $3, %xmm5, %xmm0 +; addpd %xmm0, const(1), %xmm0 ; shufps $136, %xmm0, %xmm2, %xmm0 ; movq %rbp, %rsp ; popq %rbp @@ -32,20 +30,15 @@ block0(v0: f64x2): ; movq %rsp, %rbp ; block1: ; offset 0x4 ; xorpd %xmm2, %xmm2 -; movdqa %xmm0, %xmm6 -; maxpd %xmm2, %xmm6 -; movupd 0x28(%rip), %xmm7 -; minpd %xmm7, %xmm6 -; roundpd $3, %xmm6, %xmm0 -; movupd 0x25(%rip), %xmm12 -; addpd %xmm12, %xmm0 +; movdqa %xmm0, %xmm5 +; maxpd %xmm2, %xmm5 +; 
minpd 0x18(%rip), %xmm5 +; roundpd $3, %xmm5, %xmm0 +; addpd 0x1a(%rip), %xmm0 ; shufps $0x88, %xmm2, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq ; addb %al, (%rax) -; addb %al, (%rax) -; addb %al, (%rax) -; addb %al, (%rax) ; addb %ah, %al
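
Illustrative sketch (not part of the patch itself): a minimal before/after view of the load-sinking pattern, using the `i32x4.extadd_pairwise_i16x8_s` lowering. Every name and instruction below is taken from the hunks above; with the `VCodeConstant` -> `XmmMem` converters in place, the constant binds directly as an `XmmMem` operand and the standalone `movdqu` disappears from the generated code.

    ;; before: constant explicitly loaded into a register
    (let ((mul_const Xmm (x64_xmm_load_const $I16X8 (iadd_pairwise_mul_const_32))))
      (x64_pmaddwd val mul_const))
    ;; disassembles as: movdqu 0x14(%rip), %xmm2
    ;;                  pmaddwd %xmm2, %xmm0

    ;; after: constant passed as an XmmMem, so the load is sunk into pmaddwd
    (let ((mul_const XmmMem (iadd_pairwise_mul_const_32)))
      (x64_pmaddwd val mul_const))
    ;; disassembles as: pmaddwd 0x14(%rip), %xmm0

The same shape repeats across the other rewritten rules (`x64_pxor`, `x64_paddd`, `x64_pshufb`, `x64_minpd`, etc.): wherever the constant is the second, memory-capable operand, the explicit register load is no longer needed.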