From 05089321740a07757dff0a285176b2651a49aae2 Mon Sep 17 00:00:00 2001 From: Afonso Bordado Date: Wed, 27 Jul 2022 18:54:00 +0100 Subject: [PATCH] cranelift: Align Scalar and SIMD shift semantics (#4520) * cranelift: Reorganize test suite Group some SIMD operations by instruction. * cranelift: Deduplicate some shift tests Also, new tests with the mod behaviour * aarch64: Lower shifts with mod behaviour * x64: Lower shifts with mod behaviour * wasmtime: Don't mask SIMD shifts --- cranelift/codegen/src/isa/aarch64/lower.isle | 9 +- .../codegen/src/isa/aarch64/lower/isle.rs | 4 +- cranelift/codegen/src/isa/x64/inst.isle | 9 + cranelift/codegen/src/isa/x64/lower.isle | 66 +++--- cranelift/codegen/src/isa/x64/lower/isle.rs | 10 + .../filetests/isa/aarch64/arithmetic.clif | 8 +- .../isa/x64/simd-bitwise-compile.clif | 61 ++--- .../filetests/runtests/simd-bitselect.clif | 30 +++ .../filetests/runtests/simd-bitwise-run.clif | 216 ------------------ .../filetests/runtests/simd-bitwise.clif | 132 ----------- .../filetests/runtests/simd-ishl.clif | 46 ++++ .../filetests/runtests/simd-sshr.clif | 58 +++++ .../filetests/runtests/simd-ushr.clif | 52 +++++ .../filetests/runtests/simd-vselect.clif | 12 + cranelift/wasm/src/code_translator.rs | 24 +- 15 files changed, 314 insertions(+), 423 deletions(-) delete mode 100644 cranelift/filetests/filetests/runtests/simd-bitwise-run.clif delete mode 100644 cranelift/filetests/filetests/runtests/simd-bitwise.clif create mode 100644 cranelift/filetests/filetests/runtests/simd-ishl.clif create mode 100644 cranelift/filetests/filetests/runtests/simd-sshr.clif create mode 100644 cranelift/filetests/filetests/runtests/simd-ushr.clif diff --git a/cranelift/codegen/src/isa/aarch64/lower.isle b/cranelift/codegen/src/isa/aarch64/lower.isle index 0874d90254..34c758227a 100644 --- a/cranelift/codegen/src/isa/aarch64/lower.isle +++ b/cranelift/codegen/src/isa/aarch64/lower.isle @@ -927,7 +927,8 @@ ;; Shift for vector types. (rule (lower (has_type (ty_vec128 ty) (ishl x y))) (let ((size VectorSize (vector_size ty)) - (shift Reg (vec_dup y size))) + (masked_shift_amt Reg (and_imm $I32 y (shift_mask ty))) + (shift Reg (vec_dup masked_shift_amt size))) (sshl x shift size))) ;; Helper function to emit a shift operation with the opcode specified and @@ -986,7 +987,8 @@ ;; Vector shifts. (rule (lower (has_type (ty_vec128 ty) (ushr x y))) (let ((size VectorSize (vector_size ty)) - (shift Reg (vec_dup (sub $I32 (zero_reg) y) size))) + (masked_shift_amt Reg (and_imm $I32 y (shift_mask ty))) + (shift Reg (vec_dup (sub $I64 (zero_reg) masked_shift_amt) size))) (ushl x shift size))) ;; lsr lo_rshift, src_lo, amt @@ -1035,7 +1037,8 @@ ;; Note that right shifts are implemented with a negative left shift. 
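The aarch64 rules above (and the vector `sshr` rule that follows) mask the incoming shift amount with `shift_mask`, i.e. `lane_bits - 1`, before splatting it, so an out-of-range amount wraps around the lane width instead of saturating. A minimal reference sketch of that behaviour in Rust, for an i8x16 left shift (the function name `ref_ishl_i8x16` is illustrative and not part of the patch):

    // Reference model only: reduce the shift amount modulo the lane width,
    // then shift every lane by the reduced amount.
    fn ref_ishl_i8x16(lanes: [u8; 16], amt: u32) -> [u8; 16] {
        let masked = amt & 7; // shift_mask for 8-bit lanes is lane_bits - 1 = 7
        lanes.map(|lane| lane << masked)
    }

For the right shifts the lowering additionally negates the masked amount, since `ushl`/`sshl` only shift left and a negative per-lane amount shifts right.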
(rule (lower (has_type (ty_vec128 ty) (sshr x y))) (let ((size VectorSize (vector_size ty)) - (shift Reg (vec_dup (sub $I32 (zero_reg) y) size))) + (masked_shift_amt Reg (and_imm $I32 y (shift_mask ty))) + (shift Reg (vec_dup (sub $I64 (zero_reg) masked_shift_amt) size))) (sshl x shift size))) ;; lsr lo_rshift, src_lo, amt diff --git a/cranelift/codegen/src/isa/aarch64/lower/isle.rs b/cranelift/codegen/src/isa/aarch64/lower/isle.rs index 9c0f7a5738..d13e4123ff 100644 --- a/cranelift/codegen/src/isa/aarch64/lower/isle.rs +++ b/cranelift/codegen/src/isa/aarch64/lower/isle.rs @@ -335,7 +335,9 @@ where } fn shift_mask(&mut self, ty: Type) -> ImmLogic { - let mask = (ty.bits() - 1) as u64; + debug_assert!(ty.lane_bits().is_power_of_two()); + + let mask = (ty.lane_bits() - 1) as u64; ImmLogic::maybe_from_u64(mask, I32).unwrap() } diff --git a/cranelift/codegen/src/isa/x64/inst.isle b/cranelift/codegen/src/isa/x64/inst.isle index 52c0685960..e95b0dc081 100644 --- a/cranelift/codegen/src/isa/x64/inst.isle +++ b/cranelift/codegen/src/isa/x64/inst.isle @@ -1147,6 +1147,10 @@ (decl reg_mem_to_xmm_mem (RegMem) XmmMem) (extern constructor reg_mem_to_xmm_mem reg_mem_to_xmm_mem) +;; Construct a new `RegMemImm` from the given `Reg`. +(decl reg_to_reg_mem_imm (Reg) RegMemImm) +(extern constructor reg_to_reg_mem_imm reg_to_reg_mem_imm) + ;; Construct a new `GprMemImm` from the given `RegMemImm`. ;; ;; Asserts that the `RegMemImm`'s register, if any, is an GPR register. @@ -1354,6 +1358,10 @@ (decl const_to_type_masked_imm8 (u64 Type) Imm8Gpr) (extern constructor const_to_type_masked_imm8 const_to_type_masked_imm8) +;; Generate a mask for the bit-width of the given type +(decl shift_mask (Type) u32) +(extern constructor shift_mask shift_mask) + ;; Extract a constant `GprMemImm.Imm` from a value operand. (decl simm32_from_value (GprMemImm) Value) (extern extractor simm32_from_value simm32_from_value) @@ -3043,6 +3051,7 @@ (convert Xmm RegMem xmm_to_reg_mem) (convert Reg Xmm xmm_new) (convert Reg XmmMem reg_to_xmm_mem) +(convert Reg RegMemImm reg_to_reg_mem_imm) (convert RegMem XmmMem reg_mem_to_xmm_mem) (convert RegMemImm XmmMemImm mov_rmi_to_xmm) (convert Xmm XmmMem xmm_to_xmm_mem) diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle index 876b836504..4359c77fce 100644 --- a/cranelift/codegen/src/isa/x64/lower.isle +++ b/cranelift/codegen/src/isa/x64/lower.isle @@ -531,13 +531,15 @@ ;; in higher feature sets like AVX), we lower the `ishl.i8x16` to a sequence of ;; instructions. The basic idea, whether the amount to shift by is an immediate ;; or not, is to use a 16x8 shift and then mask off the incorrect bits to 0s. -(rule (lower (has_type $I8X16 (ishl src amt))) +(rule (lower (has_type ty @ $I8X16 (ishl src amt))) (let ( + ;; Mask the amount to ensure wrapping behaviour + (masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty)))) ;; Shift `src` using 16x8. Unfortunately, a 16x8 shift will only be ;; correct for half of the lanes; the others must be fixed up with ;; the mask below. 
- (unmasked Xmm (x64_psllw src (mov_rmi_to_xmm amt))) - (mask_addr SyntheticAmode (ishl_i8x16_mask amt)) + (unmasked Xmm (x64_psllw src (mov_rmi_to_xmm masked_amt))) + (mask_addr SyntheticAmode (ishl_i8x16_mask masked_amt)) (mask Reg (x64_load $I8X16 mask_addr (ExtKind.None)))) (sse_and $I8X16 unmasked (RegMem.Reg mask)))) @@ -571,16 +573,19 @@ (rule (ishl_i8x16_mask (RegMemImm.Mem amt)) (ishl_i8x16_mask (RegMemImm.Reg (x64_load $I64 amt (ExtKind.None))))) -;; 16x8, 32x4, and 64x2 shifts can each use a single instruction. +;; 16x8, 32x4, and 64x2 shifts can each use a single instruction, once the shift amount is masked. -(rule (lower (has_type $I16X8 (ishl src amt))) - (x64_psllw src (mov_rmi_to_xmm amt))) +(rule (lower (has_type ty @ $I16X8 (ishl src amt))) + (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty))))) + (x64_psllw src (mov_rmi_to_xmm masked_amt)))) -(rule (lower (has_type $I32X4 (ishl src amt))) - (x64_pslld src (mov_rmi_to_xmm amt))) +(rule (lower (has_type ty @ $I32X4 (ishl src amt))) + (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty))))) + (x64_pslld src (mov_rmi_to_xmm masked_amt)))) -(rule (lower (has_type $I64X2 (ishl src amt))) - (x64_psllq src (mov_rmi_to_xmm amt))) +(rule (lower (has_type ty @ $I64X2 (ishl src amt))) + (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty))))) + (x64_psllq src (mov_rmi_to_xmm masked_amt)))) ;;;; Rules for `ushr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -630,13 +635,15 @@ ;; There are no 8x16 shifts in x64. Do the same 16x8-shift-and-mask thing we do ;; with 8x16 `ishl`. -(rule (lower (has_type $I8X16 (ushr src amt))) +(rule (lower (has_type ty @ $I8X16 (ushr src amt))) (let ( + ;; Mask the amount to ensure wrapping behaviour + (masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty)))) ;; Shift `src` using 16x8. Unfortunately, a 16x8 shift will only be ;; correct for half of the lanes; the others must be fixed up with ;; the mask below. - (unmasked Xmm (x64_psrlw src (mov_rmi_to_xmm amt))) - (mask_addr SyntheticAmode (ushr_i8x16_mask amt)) + (unmasked Xmm (x64_psrlw src (mov_rmi_to_xmm masked_amt))) + (mask_addr SyntheticAmode (ushr_i8x16_mask masked_amt)) (mask Reg (x64_load $I8X16 mask_addr (ExtKind.None)))) (sse_and $I8X16 unmasked @@ -673,16 +680,19 @@ (rule (ushr_i8x16_mask (RegMemImm.Mem amt)) (ushr_i8x16_mask (RegMemImm.Reg (x64_load $I64 amt (ExtKind.None))))) -;; 16x8, 32x4, and 64x2 shifts can each use a single instruction. +;; 16x8, 32x4, and 64x2 shifts can each use a single instruction, once the shift amount is masked. 
-(rule (lower (has_type $I16X8 (ushr src amt))) - (x64_psrlw src (mov_rmi_to_xmm amt))) +(rule (lower (has_type ty @ $I16X8 (ushr src amt))) + (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty))))) + (x64_psrlw src (mov_rmi_to_xmm masked_amt)))) -(rule (lower (has_type $I32X4 (ushr src amt))) - (x64_psrld src (mov_rmi_to_xmm amt))) +(rule (lower (has_type ty @ $I32X4 (ushr src amt))) + (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty))))) + (x64_psrld src (mov_rmi_to_xmm masked_amt)))) -(rule (lower (has_type $I64X2 (ushr src amt))) - (x64_psrlq src (mov_rmi_to_xmm amt))) +(rule (lower (has_type ty @ $I64X2 (ushr src amt))) + (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty))))) + (x64_psrlq src (mov_rmi_to_xmm masked_amt)))) ;;;; Rules for `sshr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -746,14 +756,16 @@ ;; hi.i16x8 = [(s8, s8), (s9, s9), ..., (s15, s15)] ;; shifted_hi.i16x8 = shift each lane of `high` ;; result = [s0'', s1'', ..., s15''] -(rule (lower (has_type $I8X16 (sshr src amt @ (value_type amt_ty)))) +(rule (lower (has_type ty @ $I8X16 (sshr src amt @ (value_type amt_ty)))) (let ((src_ Xmm (put_in_xmm src)) + ;; Mask the amount to ensure wrapping behaviour + (masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty)))) ;; In order for `packsswb` later to only use the high byte of each ;; 16x8 lane, we shift right an extra 8 bits, relying on `psraw` to ;; fill in the upper bits appropriately. (lo Xmm (x64_punpcklbw src_ src_)) (hi Xmm (x64_punpckhbw src_ src_)) - (amt_ XmmMemImm (sshr_i8x16_bigger_shift amt_ty amt)) + (amt_ XmmMemImm (sshr_i8x16_bigger_shift amt_ty masked_amt)) (shifted_lo Xmm (x64_psraw lo amt_)) (shifted_hi Xmm (x64_psraw hi amt_))) (x64_packsswb shifted_lo shifted_hi))) @@ -773,11 +785,13 @@ ;; `sshr.{i16x8,i32x4}` can be a simple `psra{w,d}`, we just have to make sure ;; that if the shift amount is in a register, it is in an XMM register. -(rule (lower (has_type $I16X8 (sshr src amt))) - (x64_psraw src (mov_rmi_to_xmm amt))) +(rule (lower (has_type ty @ $I16X8 (sshr src amt))) + (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty))))) + (x64_psraw src (mov_rmi_to_xmm masked_amt)))) -(rule (lower (has_type $I32X4 (sshr src amt))) - (x64_psrad src (mov_rmi_to_xmm amt))) +(rule (lower (has_type ty @ $I32X4 (sshr src amt))) + (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty))))) + (x64_psrad src (mov_rmi_to_xmm masked_amt)))) ;; The `sshr.i64x2` CLIF instruction has no single x86 instruction in the older ;; feature sets. 
Newer ones like AVX512VL + AVX512F include `vpsraq`, a 128-bit diff --git a/cranelift/codegen/src/isa/x64/lower/isle.rs b/cranelift/codegen/src/isa/x64/lower/isle.rs index 70de66846b..6a81d09425 100644 --- a/cranelift/codegen/src/isa/x64/lower/isle.rs +++ b/cranelift/codegen/src/isa/x64/lower/isle.rs @@ -229,6 +229,11 @@ where .unwrap() } + #[inline] + fn shift_mask(&mut self, ty: Type) -> u32 { + ty.lane_bits() - 1 + } + #[inline] fn simm32_from_value(&mut self, val: Value) -> Option { let inst = self.lower_ctx.dfg().value_def(val).inst()?; @@ -415,6 +420,11 @@ where Writable::from_reg(Xmm::new(self.temp_writable_reg(I8X16).to_reg()).unwrap()) } + #[inline] + fn reg_to_reg_mem_imm(&mut self, reg: Reg) -> RegMemImm { + RegMemImm::Reg { reg } + } + #[inline] fn reg_mem_to_xmm_mem(&mut self, rm: &RegMem) -> XmmMem { XmmMem::new(rm.clone()).unwrap() diff --git a/cranelift/filetests/filetests/isa/aarch64/arithmetic.clif b/cranelift/filetests/filetests/isa/aarch64/arithmetic.clif index b50b482b25..9492acad62 100644 --- a/cranelift/filetests/filetests/isa/aarch64/arithmetic.clif +++ b/cranelift/filetests/filetests/isa/aarch64/arithmetic.clif @@ -344,9 +344,10 @@ block0(v0: i8x16): ; block0: ; movz x3, #1 -; sub w5, wzr, w3 -; dup v7.16b, w5 -; ushl v0.16b, v0.16b, v7.16b +; and w5, w3, #7 +; sub x7, xzr, x5 +; dup v17.16b, w7 +; ushl v0.16b, v0.16b, v17.16b ; ret function %add_i128(i128, i128) -> i128 { @@ -492,4 +493,3 @@ block0(v0: i64): ; b.vc 8 ; udf ; sdiv x0, x0, x3 ; ret - diff --git a/cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif b/cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif index b14699ef99..7433faab5a 100644 --- a/cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif +++ b/cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif @@ -206,12 +206,13 @@ block0(v0: i32): ; movq %rsp, %rbp ; block0: ; load_const VCodeConstant(1), %xmm0 -; movd %edi, %xmm5 -; psllw %xmm0, %xmm5, %xmm0 -; lea const(VCodeConstant(0)), %rsi +; andq %rdi, $7, %rdi +; movd %edi, %xmm7 +; psllw %xmm0, %xmm7, %xmm0 +; lea const(VCodeConstant(0)), %rax ; shlq $4, %rdi, %rdi -; movdqu 0(%rsi,%rdi,1), %xmm13 -; pand %xmm0, %xmm13, %xmm0 +; movdqu 0(%rax,%rdi,1), %xmm15 +; pand %xmm0, %xmm15, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -228,9 +229,14 @@ block0: ; movq %rsp, %rbp ; block0: ; load_const VCodeConstant(1), %xmm0 -; psrlw %xmm0, $1, %xmm0 -; movdqu const(VCodeConstant(0)), %xmm5 -; pand %xmm0, %xmm5, %xmm0 +; movl $1, %r11d +; andq %r11, $7, %r11 +; movd %r11d, %xmm7 +; psrlw %xmm0, %xmm7, %xmm0 +; lea const(VCodeConstant(0)), %rax +; shlq $4, %r11, %r11 +; movdqu 0(%rax,%r11,1), %xmm15 +; pand %xmm0, %xmm15, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -245,15 +251,16 @@ block0(v0: i32): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; load_const VCodeConstant(0), %xmm9 -; movdqa %xmm9, %xmm0 -; punpcklbw %xmm0, %xmm9, %xmm0 -; punpckhbw %xmm9, %xmm9, %xmm9 +; load_const VCodeConstant(0), %xmm10 +; andq %rdi, $7, %rdi +; movdqa %xmm10, %xmm0 +; punpcklbw %xmm0, %xmm10, %xmm0 +; punpckhbw %xmm10, %xmm10, %xmm10 ; addl %edi, $8, %edi -; movd %edi, %xmm11 -; psraw %xmm0, %xmm11, %xmm0 -; psraw %xmm9, %xmm11, %xmm9 -; packsswb %xmm0, %xmm9, %xmm0 +; movd %edi, %xmm13 +; psraw %xmm0, %xmm13, %xmm0 +; psraw %xmm10, %xmm13, %xmm10 +; packsswb %xmm0, %xmm10, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -267,17 +274,19 @@ block0(v0: i8x16, v1: i32): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; movdqa %xmm0, %xmm9 -; punpcklbw %xmm9, %xmm0, %xmm9 +; movl $3, %esi +; 
andq %rsi, $7, %rsi +; movdqa %xmm0, %xmm15 +; punpcklbw %xmm15, %xmm0, %xmm15 +; movdqa %xmm15, %xmm13 ; punpckhbw %xmm0, %xmm0, %xmm0 -; movdqa %xmm9, %xmm12 -; psraw %xmm12, $11, %xmm12 -; movdqa %xmm12, %xmm9 -; psraw %xmm0, $11, %xmm0 -; movdqa %xmm9, %xmm1 -; packsswb %xmm1, %xmm0, %xmm1 -; movdqa %xmm1, %xmm9 -; movdqa %xmm9, %xmm0 +; movdqa %xmm0, %xmm7 +; addl %esi, $8, %esi +; movd %esi, %xmm15 +; movdqa %xmm13, %xmm0 +; psraw %xmm0, %xmm15, %xmm0 +; psraw %xmm7, %xmm15, %xmm7 +; packsswb %xmm0, %xmm7, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret diff --git a/cranelift/filetests/filetests/runtests/simd-bitselect.clif b/cranelift/filetests/filetests/runtests/simd-bitselect.clif index 18027373f8..3ab2187f60 100644 --- a/cranelift/filetests/filetests/runtests/simd-bitselect.clif +++ b/cranelift/filetests/filetests/runtests/simd-bitselect.clif @@ -13,3 +13,33 @@ block0(v0: i32x4, v1: i32x4, v2: i32x4): ; run: %bitselect_i32x4(0x11111111111111111111111111111111, 0x11111111111111111111111111111111, 0x00000000000000000000000000000000) == 0x11111111111111111111111111111111 ; run: %bitselect_i32x4(0x01010011000011110000000011111111, 0x11111111111111111111111111111111, 0x00000000000000000000000000000000) == 0x01010011000011110000000011111111 ; run: %bitselect_i32x4(0x00000000000000001111111111111111, 0x00000000000000000000000000000000, 0x11111111111111111111111111111111) == 0x11111111111111110000000000000000 + +function %bitselect_i8x16(i8x16, i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16, v2: i8x16): + v3 = bitselect v0, v1, v2 + return v3 +} +; Remember that bitselect accepts: 1) the selector vector, 2) the "if true" vector, and 3) the "if false" vector. +; run: %bitselect_i8x16([0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 255], [127 0 0 0 0 0 0 0 0 0 0 0 0 0 0 42], [42 0 0 0 0 0 0 0 0 0 0 0 0 0 0 127]) == [42 0 0 0 0 0 0 0 0 0 0 0 0 0 0 42] + +function %bitselect_i8x16() -> b1 { +block0: + v0 = vconst.i8x16 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 255] ; the selector vector + v1 = vconst.i8x16 [127 0 0 0 0 0 0 0 0 0 0 0 0 0 0 42] ; for each 1-bit in v0 the bit of v1 is selected + v2 = vconst.i8x16 [42 0 0 0 0 0 0 0 0 0 0 0 0 0 0 127] ; for each 0-bit in v0 the bit of v2 is selected + v3 = bitselect v0, v1, v2 + + v4 = extractlane v3, 0 + v5 = icmp_imm eq v4, 42 + + v6 = extractlane v3, 1 + v7 = icmp_imm eq v6, 0 + + v8 = extractlane v3, 15 + v9 = icmp_imm eq v8, 42 + + v10 = band v5, v7 + v11 = band v10, v9 + return v11 +} +; run diff --git a/cranelift/filetests/filetests/runtests/simd-bitwise-run.clif b/cranelift/filetests/filetests/runtests/simd-bitwise-run.clif deleted file mode 100644 index ce3ffa5321..0000000000 --- a/cranelift/filetests/filetests/runtests/simd-bitwise-run.clif +++ /dev/null @@ -1,216 +0,0 @@ -test run -set enable_simd -target aarch64 -; target s390x FIXME: s390x implements modulo semantics for shift counts -target x86_64 skylake - -; TODO: once available, replace all lane extraction with `icmp + all_ones` - -function %ishl_i32x4() -> b1 { -block0: - v0 = iconst.i32 1 - v1 = vconst.i32x4 [1 2 4 8] - v2 = ishl v1, v0 - - v3 = extractlane v2, 0 - v4 = icmp_imm eq v3, 2 - - v5 = extractlane v2, 3 - v6 = icmp_imm eq v5, 16 - - v7 = band v4, v6 - return v7 -} -; run - -function %ishl_too_large_i16x8() -> b1 { -block0: - v0 = iconst.i32 17 ; note that this will shift off the end of each lane - v1 = vconst.i16x8 [1 2 4 8 16 32 64 128] - v2 = ishl v1, v0 - - v3 = extractlane v2, 0 - v4 = icmp_imm eq v3, 0 - - v5 = extractlane v2, 3 - v6 = icmp_imm eq v5, 0 - - v7 = band v4, v6 - return v7 -} 
-; run - -function %ushr_i8x16() -> b1 { -block0: - v0 = iconst.i32 1 - v1 = vconst.i8x16 [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15] - v2 = ushr v1, v0 - - v3 = vconst.i8x16 [0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7] - v4 = icmp eq v2, v3 - v5 = vall_true v4 - return v5 -} -; run - -function %sshr_i8x16() -> b1 { -block0: - v0 = iconst.i32 1 - v1 = vconst.i8x16 [0 0xff 2 0xfd 4 0xfb 6 0xf9 8 0xf7 10 0xf5 12 0xf3 14 0xf1] - v2 = sshr v1, v0 - - v3 = vconst.i8x16 [0 0xff 1 0xfe 2 0xfd 3 0xfc 4 0xfb 5 0xfa 6 0xf9 7 0xf8] - v4 = icmp eq v2, v3 - v5 = vall_true v4 - return v5 -} -; run - -function %ishl_i8x16() -> b1 { -block0: - v0 = iconst.i32 1 - v1 = vconst.i8x16 [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15] - v2 = ishl v1, v0 - - v3 = vconst.i8x16 [0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30] - v4 = icmp eq v2, v3 - v5 = vall_true v4 - return v5 -} -; run - -function %ushr_i64x2() -> b1 { -block0: - v0 = iconst.i32 1 - v1 = vconst.i64x2 [1 2] - v2 = ushr v1, v0 - - v3 = extractlane v2, 0 - v4 = icmp_imm eq v3, 0 - - v5 = extractlane v2, 1 - v6 = icmp_imm eq v5, 1 - - v7 = band v4, v6 - return v7 -} -; run - -function %ushr_too_large_i32x4() -> b1 { -block0: - v0 = iconst.i32 33 ; note that this will shift off the end of each lane - v1 = vconst.i32x4 [1 2 4 8] - v2 = ushr v1, v0 - - v3 = extractlane v2, 0 - v4 = icmp_imm eq v3, 0 - - v5 = extractlane v2, 3 - v6 = icmp_imm eq v5, 0 - - v7 = band v4, v6 - return v7 -} -; run - -function %sshr_i16x8() -> b1 { -block0: - v0 = iconst.i32 1 - v1 = vconst.i16x8 [-1 2 4 8 -16 32 64 128] - v2 = sshr v1, v0 - - v3 = extractlane v2, 0 - v4 = icmp_imm eq v3, 0xffff ; because of the shifted-in sign-bit, this remains 0xffff == -1 - - v5 = extractlane v2, 4 - v6 = icmp_imm eq v5, 0xfff8 ; -16 has been shifted to -8 == 0xfff8 - - v7 = band v4, v6 - return v7 -} -; run - -function %sshr_too_large_i32x4() -> b1 { -block0: - v0 = iconst.i32 33 ; note that this will shift off the end of each lane - v1 = vconst.i32x4 [1 2 4 -8] - v2 = sshr v1, v0 - - v3 = extractlane v2, 0 - v4 = icmp_imm eq v3, 0 - - v5 = extractlane v2, 3 - v6 = icmp_imm eq v5, 0xffff_ffff ; shifting in the sign-bit repeatedly fills the result with 1s - - v7 = band v4, v6 - return v7 -} -; run - -function %sshr_i64x2(i64x2, i32) -> i64x2 { -block0(v0:i64x2, v1:i32): - v2 = sshr v0, v1 - return v2 -} -; run: %sshr_i64x2([1 -1], 0) == [1 -1] -; run: %sshr_i64x2([1 -1], 1) == [0 -1] ; note the -1 shift result -; run: %sshr_i64x2([2 -2], 1) == [1 -1] -; run: %sshr_i64x2([0x80000000_00000000 0x7FFFFFFF_FFFFFFFF], 63) == [0xFFFFFFFF_FFFFFFFF 0] - -function %bitselect_i8x16() -> b1 { -block0: - v0 = vconst.i8x16 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 255] ; the selector vector - v1 = vconst.i8x16 [127 0 0 0 0 0 0 0 0 0 0 0 0 0 0 42] ; for each 1-bit in v0 the bit of v1 is selected - v2 = vconst.i8x16 [42 0 0 0 0 0 0 0 0 0 0 0 0 0 0 127] ; for each 0-bit in v0 the bit of v2 is selected - v3 = bitselect v0, v1, v2 - - v4 = extractlane v3, 0 - v5 = icmp_imm eq v4, 42 - - v6 = extractlane v3, 1 - v7 = icmp_imm eq v6, 0 - - v8 = extractlane v3, 15 - v9 = icmp_imm eq v8, 42 - - v10 = band v5, v7 - v11 = band v10, v9 - return v11 -} -; run - -function %sshr_imm_i32x4() -> b1 { -block0: - v1 = vconst.i32x4 [1 2 4 -8] - v2 = sshr_imm v1, 1 - - v3 = vconst.i32x4 [0 1 2 -4] - v4 = icmp eq v2, v3 - v5 = vall_true v4 - return v5 -} -; run - -function %sshr_imm_i16x8() -> b1 { -block0: - v1 = vconst.i16x8 [1 2 4 -8 0 0 0 0] - v2 = ushr_imm v1, 1 - - v3 = vconst.i16x8 [0 1 2 32764 0 0 0 0] ; -4 with MSB unset == 32764 - v4 = icmp eq v2, v3 
- v5 = vall_true v4 - return v5 -} -; run - -function %ishl_imm_i64x2() -> b1 { -block0: - v1 = vconst.i64x2 [1 0] - v2 = ishl_imm v1, 1 - - v3 = vconst.i64x2 [2 0] - v4 = icmp eq v2, v3 - v5 = vall_true v4 - return v5 -} -; run diff --git a/cranelift/filetests/filetests/runtests/simd-bitwise.clif b/cranelift/filetests/filetests/runtests/simd-bitwise.clif deleted file mode 100644 index 251f9516c1..0000000000 --- a/cranelift/filetests/filetests/runtests/simd-bitwise.clif +++ /dev/null @@ -1,132 +0,0 @@ -test run -target aarch64 -; target s390x FIXME: s390x implements modulo semantics for shift counts -set enable_simd -target x86_64 skylake - -function %bitselect_i8x16(i8x16, i8x16, i8x16) -> i8x16 { -block0(v0: i8x16, v1: i8x16, v2: i8x16): - v3 = bitselect v0, v1, v2 - return v3 -} -; Remember that bitselect accepts: 1) the selector vector, 2) the "if true" vector, and 3) the "if false" vector. -; run: %bitselect_i8x16([0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 255], [127 0 0 0 0 0 0 0 0 0 0 0 0 0 0 42], [42 0 0 0 0 0 0 0 0 0 0 0 0 0 0 127]) == [42 0 0 0 0 0 0 0 0 0 0 0 0 0 0 42] - -function %vselect_i32x4(i32x4, i32x4) -> i32x4 { -block0(v1: i32x4, v2: i32x4): - ; `make_trampoline` still does not know how to convert boolean vector types - ; so we load the value directly here. - v0 = vconst.b32x4 [true true false false] - v3 = vselect v0, v1, v2 - return v3 -} -; Remember that vselect accepts: 1) the selector vector, 2) the "if true" vector, and 3) the "if false" vector. -; run: %vselect_i8x16([1 2 -1 -1], [-1 -1 3 4]) == [1 2 3 4] - - - -; shift left - -function %ishl_i8x16(i8x16, i32) -> i8x16 { -block0(v0: i8x16, v1: i32): - v2 = ishl v0, v1 - return v2 -} -; run: %ishl_i8x16([0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15], 4) == [0x00 0x10 0x20 0x30 0x40 0x50 0x60 0x70 0x80 0x90 0xa0 0xb0 0xc0 0xd0 0xe0 0xf0] - -function %ishl_i16x8(i16x8, i32) -> i16x8 { -block0(v0: i16x8, v1: i32): - v2 = ishl v0, v1 - return v2 -} -; run: %ishl_i16x8([1 2 4 8 16 32 64 128], 17) == [0 0 0 0 0 0 0 0] - -function %ishl_i32x4(i32x4, i32) -> i32x4 { -block0(v0: i32x4, v1: i32): - v2 = ishl v0, v1 - return v2 -} -; run: %ishl_i32x4([1 2 4 8], 1) == [2 4 8 16] - -function %ishl_imm_i64x2(i64x2) -> i64x2 { -block0(v0: i64x2): - v2 = ishl_imm v0, 1 - return v2 -} -; run: %ishl_imm_i64x2([1 0]) == [2 0] - - - -; shift right (logical) - -function %ushr_i8x16(i8x16, i32) -> i8x16 { -block0(v0: i8x16, v1: i32): - v2 = ushr v0, v1 - return v2 -} -; run: %ushr_i8x16([0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15], 1) == [0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7] - -function %ushr_i32x4(i32x4, i32) -> i32x4 { -block0(v0: i32x4, v1: i32): - v2 = ushr v0, v1 - return v2 -} -; run: %ushr_i32x4([1 2 4 8], 33) == [0 0 0 0] - -function %ushr_i64x2(i64x2, i32) -> i64x2 { -block0(v0: i64x2, v1: i32): - v2 = ushr v0, v1 - return v2 -} -; run: %ushr_i64x2([1 2], 1) == [0 1] - - - -; shift right (arithmetic) - -function %sshr_i8x16(i8x16, i32) -> i8x16 { -block0(v0: i8x16, v1: i32): - v2 = sshr v0, v1 - return v2 -} -; run: %sshr_i8x16([0 0xff 2 0xfd 4 0xfb 6 0xf9 8 0xf7 10 0xf5 12 0xf3 14 0xf1], 1) == [0 0xff 1 0xfe 2 0xfd 3 0xfc 4 0xfb 5 0xfa 6 0xf9 7 0xf8] - -function %sshr_i16x8(i16x8, i32) -> i16x8 { -block0(v0: i16x8, v1: i32): - v2 = sshr v0, v1 - return v2 -} -; note: because of the shifted-in sign-bit, lane 0 remains -1 == 0xffff, whereas lane 4 has been shifted to -8 == 0xfff8 -; run: %ushr_i16x8([-1 2 4 8 -16 32 64 128], 1) == [-1 1 2 4 -8 16 32 64] - -function %sshr_i32x4(i32x4, i32) -> i32x4 { -block0(v0: i32x4, v1: i32): - v2 = sshr v0, v1 - return 
v2 -} -; note: shifting in the sign-bit repeatedly in lane 3 fills the result with 1s (-1 == 0xffff_ffff) -; run: %ushr_i32x4([1 2 4 -8], 33) == [0 0 0 0xffff_ffff] - -function %sshr_i64x2(i64x2, i32) -> i64x2 { -block0(v0:i64x2, v1:i32): - v2 = sshr v0, v1 - return v2 -} -; run: %sshr_i64x2([1 -1], 0) == [1 -1] -; run: %sshr_i64x2([1 -1], 1) == [0 -1] ; note the -1 shift result -; run: %sshr_i64x2([2 -2], 1) == [1 -1] -; run: %sshr_i64x2([0x80000000_00000000 0x7FFFFFFF_FFFFFFFF], 63) == [0xFFFFFFFF_FFFFFFFF 0] - -function %sshr_imm_i32x4(i32x4) -> i32x4 { -block0(v0: i32x4): - v1 = sshr_imm v0, 1 - return v1 -} -; run: %sshr_imm_i32x4([1 2 4 -8]) == [0 1 2 -4] - -function %sshr_imm_i16x8(i16x8) -> i16x8 { -block0(v0: i16x8): - v1 = sshr_imm v0, 1 - return v1 -} -; run: %sshr_imm_i16x8([1 2 4 -8 0 0 0 0]) == [0 1 2 -4 0 0 0 0] diff --git a/cranelift/filetests/filetests/runtests/simd-ishl.clif b/cranelift/filetests/filetests/runtests/simd-ishl.clif new file mode 100644 index 0000000000..105df1e856 --- /dev/null +++ b/cranelift/filetests/filetests/runtests/simd-ishl.clif @@ -0,0 +1,46 @@ +test run +set enable_simd +target aarch64 +target s390x +target x86_64 skylake + + +function %ishl_i8x16(i8x16, i32) -> i8x16 { +block0(v0: i8x16, v1: i32): + v2 = ishl v0, v1 + return v2 +} +; run: %ishl_i8x16([0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15], 4) == [0x00 0x10 0x20 0x30 0x40 0x50 0x60 0x70 0x80 0x90 0xa0 0xb0 0xc0 0xd0 0xe0 0xf0] +; run: %ishl_i8x16([0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15], 12) == [0x00 0x10 0x20 0x30 0x40 0x50 0x60 0x70 0x80 0x90 0xa0 0xb0 0xc0 0xd0 0xe0 0xf0] + +function %ishl_i16x8(i16x8, i32) -> i16x8 { +block0(v0: i16x8, v1: i32): + v2 = ishl v0, v1 + return v2 +} +; run: %ishl_i16x8([1 2 4 8 16 32 64 128], 1) == [2 4 8 16 32 64 128 256] +; run: %ishl_i16x8([1 2 4 8 16 32 64 128], 17) == [2 4 8 16 32 64 128 256] + +function %ishl_i32x4(i32x4, i32) -> i32x4 { +block0(v0: i32x4, v1: i32): + v2 = ishl v0, v1 + return v2 +} +; run: %ishl_i32x4([1 2 4 8], 1) == [2 4 8 16] +; run: %ishl_i32x4([1 2 4 8], 33) == [2 4 8 16] + +function %ishl_i64x2(i64x2, i32) -> i64x2 { +block0(v0: i64x2, v1: i32): + v2 = ishl v0, v1 + return v2 +} +; run: %ishl_i64x2([1 2], 1) == [2 4] +; run: %ishl_i64x2([1 2], 65) == [2 4] + + +function %ishl_imm_i64x2(i64x2) -> i64x2 { +block0(v0: i64x2): + v2 = ishl_imm v0, 1 + return v2 +} +; run: %ishl_imm_i64x2([1 0]) == [2 0] diff --git a/cranelift/filetests/filetests/runtests/simd-sshr.clif b/cranelift/filetests/filetests/runtests/simd-sshr.clif new file mode 100644 index 0000000000..ccb8bd8d18 --- /dev/null +++ b/cranelift/filetests/filetests/runtests/simd-sshr.clif @@ -0,0 +1,58 @@ +test run +set enable_simd +target aarch64 +target s390x +target x86_64 skylake + + +function %sshr_i8x16(i8x16, i32) -> i8x16 { +block0(v0: i8x16, v1: i32): + v2 = sshr v0, v1 + return v2 +} +; run: %sshr_i8x16([0 0xff 2 0xfd 4 0xfb 6 0xf9 8 0xf7 10 0xf5 12 0xf3 14 0xf1], 1) == [0 0xff 1 0xfe 2 0xfd 3 0xfc 4 0xfb 5 0xfa 6 0xf9 7 0xf8] +; run: %sshr_i8x16([0 0xff 2 0xfd 4 0xfb 6 0xf9 8 0xf7 10 0xf5 12 0xf3 14 0xf1], 9) == [0 0xff 1 0xfe 2 0xfd 3 0xfc 4 0xfb 5 0xfa 6 0xf9 7 0xf8] + +function %sshr_i16x8(i16x8, i32) -> i16x8 { +block0(v0: i16x8, v1: i32): + v2 = sshr v0, v1 + return v2 +} +; note: because of the shifted-in sign-bit, lane 0 remains -1 == 0xffff, whereas lane 4 has been shifted to -8 == 0xfff8 +; run: %sshr_i16x8([-1 2 4 8 -16 32 64 128], 1) == [-1 1 2 4 -8 16 32 64] +; run: %sshr_i16x8([-1 2 4 8 -16 32 64 128], 17) == [-1 1 2 4 -8 16 32 64] + +function 
%sshr_i32x4(i32x4, i32) -> i32x4 { +block0(v0: i32x4, v1: i32): + v2 = sshr v0, v1 + return v2 +} +; run: %sshr_i32x4([1 2 4 -8], 1) == [0 1 2 -4] +; run: %sshr_i32x4([1 2 4 -8], 33) == [0 1 2 -4] + +function %sshr_i64x2(i64x2, i32) -> i64x2 { +block0(v0:i64x2, v1:i32): + v2 = sshr v0, v1 + return v2 +} +; run: %sshr_i64x2([1 -1], 0) == [1 -1] +; run: %sshr_i64x2([1 -1], 1) == [0 -1] ; note the -1 shift result +; run: %sshr_i64x2([2 -2], 1) == [1 -1] +; run: %sshr_i64x2([0x80000000_00000000 0x7FFFFFFF_FFFFFFFF], 63) == [0xFFFFFFFF_FFFFFFFF 0] +; run: %sshr_i64x2([2 -2], 65) == [1 -1] + + + +function %sshr_imm_i32x4(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = sshr_imm v0, 1 + return v1 +} +; run: %sshr_imm_i32x4([1 2 4 -8]) == [0 1 2 -4] + +function %sshr_imm_i16x8(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = sshr_imm v0, 1 + return v1 +} +; run: %sshr_imm_i16x8([1 2 4 -8 0 0 0 0]) == [0 1 2 -4 0 0 0 0] diff --git a/cranelift/filetests/filetests/runtests/simd-ushr.clif b/cranelift/filetests/filetests/runtests/simd-ushr.clif new file mode 100644 index 0000000000..b77aedad58 --- /dev/null +++ b/cranelift/filetests/filetests/runtests/simd-ushr.clif @@ -0,0 +1,52 @@ +test run +set enable_simd +target aarch64 +target s390x +target x86_64 skylake + + +function %ushr_i8x16(i8x16, i32) -> i8x16 { +block0(v0: i8x16, v1: i32): + v2 = ushr v0, v1 + return v2 +} +; run: %ushr_i8x16([0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15], 1) == [0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7] +; run: %ushr_i8x16([0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15], 9) == [0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7] + +function %ushr_i16x8(i16x8, i32) -> i16x8 { +block0(v0: i16x8, v1: i32): + v2 = ushr v0, v1 + return v2 +} +; run: %ushr_i16x8([0 1 2 3 4 5 6 7], 1) == [0 0 1 1 2 2 3 3] +; run: %ushr_i16x8([0 1 2 3 4 5 6 7], 17) == [0 0 1 1 2 2 3 3] +; run: %ushr_i16x8([1 2 4 -8 0 0 0 0], 1) == [0 1 2 32764 0 0 0 0] + +function %ushr_i32x4(i32x4, i32) -> i32x4 { +block0(v0: i32x4, v1: i32): + v2 = ushr v0, v1 + return v2 +} +; run: %ushr_i32x4([1 2 4 8], 1) == [0 1 2 4] +; run: %ushr_i32x4([1 2 4 8], 33) == [0 1 2 4] + +function %ushr_i64x2(i64x2, i32) -> i64x2 { +block0(v0: i64x2, v1: i32): + v2 = ushr v0, v1 + return v2 +} +; run: %ushr_i64x2([1 2], 1) == [0 1] +; run: %ushr_i64x2([1 2], 65) == [0 1] + + +function %sshr_imm_i16x8() -> b1 { +block0: + v1 = vconst.i16x8 [1 2 4 -8 0 0 0 0] + v2 = ushr_imm v1, 1 + + v3 = vconst.i16x8 [0 1 2 32764 0 0 0 0] ; -4 with MSB unset == 32764 + v4 = icmp eq v2, v3 + v5 = vall_true v4 + return v5 +} +; run diff --git a/cranelift/filetests/filetests/runtests/simd-vselect.clif b/cranelift/filetests/filetests/runtests/simd-vselect.clif index db5f918043..b4a1c70913 100644 --- a/cranelift/filetests/filetests/runtests/simd-vselect.clif +++ b/cranelift/filetests/filetests/runtests/simd-vselect.clif @@ -72,3 +72,15 @@ block0(v0: b64x2, v1: i64x2, v2: i64x2): return v3 } ; run: %vselect_p_i64x2([true false], [1 2], [100000000000 200000000000]) == [1 200000000000] + + +function %vselect_i32x4(i32x4, i32x4) -> i32x4 { +block0(v1: i32x4, v2: i32x4): + ; `make_trampoline` still does not know how to convert boolean vector types + ; so we load the value directly here. + v0 = vconst.b32x4 [true true false false] + v3 = vselect v0, v1, v2 + return v3 +} +; Remember that vselect accepts: 1) the selector vector, 2) the "if true" vector, and 3) the "if false" vector. 
+; run: %vselect_i32x4([1 2 -1 -1], [-1 -1 3 4]) == [1 2 3 4] diff --git a/cranelift/wasm/src/code_translator.rs b/cranelift/wasm/src/code_translator.rs index 2edc80d22d..6a808c4d45 100644 --- a/cranelift/wasm/src/code_translator.rs +++ b/cranelift/wasm/src/code_translator.rs @@ -1630,29 +1630,23 @@ pub fn translate_operator( Operator::I8x16Shl | Operator::I16x8Shl | Operator::I32x4Shl | Operator::I64x2Shl => { let (a, b) = state.pop2(); let bitcast_a = optionally_bitcast_vector(a, type_of(op), builder); - let bitwidth = i64::from(type_of(op).lane_bits()); - // The spec expects to shift with `b mod lanewidth`; so, e.g., for 16 bit lane-width - // we do `b AND 15`; this means fewer instructions than `iconst + urem`. - let b_mod_bitwidth = builder.ins().band_imm(b, bitwidth - 1); - state.push1(builder.ins().ishl(bitcast_a, b_mod_bitwidth)) + // The spec expects to shift with `b mod lanewidth`; This is directly compatible + // with cranelift's instruction. + state.push1(builder.ins().ishl(bitcast_a, b)) } Operator::I8x16ShrU | Operator::I16x8ShrU | Operator::I32x4ShrU | Operator::I64x2ShrU => { let (a, b) = state.pop2(); let bitcast_a = optionally_bitcast_vector(a, type_of(op), builder); - let bitwidth = i64::from(type_of(op).lane_bits()); - // The spec expects to shift with `b mod lanewidth`; so, e.g., for 16 bit lane-width - // we do `b AND 15`; this means fewer instructions than `iconst + urem`. - let b_mod_bitwidth = builder.ins().band_imm(b, bitwidth - 1); - state.push1(builder.ins().ushr(bitcast_a, b_mod_bitwidth)) + // The spec expects to shift with `b mod lanewidth`; This is directly compatible + // with cranelift's instruction. + state.push1(builder.ins().ushr(bitcast_a, b)) } Operator::I8x16ShrS | Operator::I16x8ShrS | Operator::I32x4ShrS | Operator::I64x2ShrS => { let (a, b) = state.pop2(); let bitcast_a = optionally_bitcast_vector(a, type_of(op), builder); - let bitwidth = i64::from(type_of(op).lane_bits()); - // The spec expects to shift with `b mod lanewidth`; so, e.g., for 16 bit lane-width - // we do `b AND 15`; this means fewer instructions than `iconst + urem`. - let b_mod_bitwidth = builder.ins().band_imm(b, bitwidth - 1); - state.push1(builder.ins().sshr(bitcast_a, b_mod_bitwidth)) + // The spec expects to shift with `b mod lanewidth`; This is directly compatible + // with cranelift's instruction. + state.push1(builder.ins().sshr(bitcast_a, b)) } Operator::V128Bitselect => { let (a, b, c) = state.pop3();
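With the lowerings above taking care of masking, the translator now passes the raw shift amount straight through to `ishl`, `ushr`, and `sshr`, whose vector forms take the amount modulo the lane width. A small sketch of the resulting per-lane semantics, assuming 16-bit lanes (the helper `ref_sshr_i16` is illustrative only):

    // The shift amount wraps at the lane width, so shifting by 17 behaves
    // like shifting by 1; the sign bit is replicated for sshr.
    fn ref_sshr_i16(lane: i16, amt: u32) -> i16 {
        lane >> (amt % 16)
    }

    fn main() {
        // Mirrors the `%sshr_i16x8([-1 2 4 8 -16 32 64 128], 17)` run test above.
        assert_eq!(ref_sshr_i16(-16, 17), -8);
        assert_eq!(ref_sshr_i16(-1, 17), -1);
    }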