cranelift: Align Scalar and SIMD shift semantics (#4520)
* cranelift: Reorganize test suite. Group some SIMD operations by instruction.
* cranelift: Deduplicate some shift tests. Also add new tests with the mod behaviour.
* aarch64: Lower shifts with mod behaviour.
* x64: Lower shifts with mod behaviour.
* wasmtime: Don't mask SIMD shifts.
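The net effect, sketched below in Rust (illustrative only, not code from this commit): SIMD shifts now treat the shift amount modulo the lane width, matching the scalar shift instructions.

```rust
// A sketch of the wrapping rule this PR standardizes on. Lane widths are
// powers of two, so `amt % lane_bits` can be computed as `amt & (lane_bits - 1)`.
fn effective_amount(amt: u32, lane_bits: u32) -> u32 {
    debug_assert!(lane_bits.is_power_of_two());
    amt & (lane_bits - 1)
}

fn main() {
    // Shifting an i16x8 lane by 17 now behaves like shifting by 1, not like zeroing it.
    assert_eq!(effective_amount(17, 16), 1);
    assert_eq!(8u16 << effective_amount(17, 16), 16);
}
```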
@@ -927,7 +927,8 @@
 ;; Shift for vector types.
 (rule (lower (has_type (ty_vec128 ty) (ishl x y)))
   (let ((size VectorSize (vector_size ty))
-        (shift Reg (vec_dup y size)))
+        (masked_shift_amt Reg (and_imm $I32 y (shift_mask ty)))
+        (shift Reg (vec_dup masked_shift_amt size)))
     (sshl x shift size)))

 ;; Helper function to emit a shift operation with the opcode specified and
@@ -986,7 +987,8 @@
 ;; Vector shifts.
 (rule (lower (has_type (ty_vec128 ty) (ushr x y)))
   (let ((size VectorSize (vector_size ty))
-        (shift Reg (vec_dup (sub $I32 (zero_reg) y) size)))
+        (masked_shift_amt Reg (and_imm $I32 y (shift_mask ty)))
+        (shift Reg (vec_dup (sub $I64 (zero_reg) masked_shift_amt) size)))
     (ushl x shift size)))

 ;; lsr lo_rshift, src_lo, amt
@@ -1035,7 +1037,8 @@
 ;; Note that right shifts are implemented with a negative left shift.
 (rule (lower (has_type (ty_vec128 ty) (sshr x y)))
   (let ((size VectorSize (vector_size ty))
-        (shift Reg (vec_dup (sub $I32 (zero_reg) y) size)))
+        (masked_shift_amt Reg (and_imm $I32 y (shift_mask ty)))
+        (shift Reg (vec_dup (sub $I64 (zero_reg) masked_shift_amt) size)))
     (sshl x shift size)))

 ;; lsr lo_rshift, src_lo, amt
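A scalar Rust model of what the aarch64 rules above compute per lane (hedged, illustrative only): the amount is masked with `shift_mask`, negated, splatted with `vec_dup`, and `ushl`/`sshl` with a negative per-lane count performs the right shift.

```rust
// Illustrative per-lane model of the ushr lowering above for i8x16 lanes.
// `shift_mask($I8X16)` is 7, so the amount wraps modulo the lane width.
fn ushr_lane_i8(lane: u8, amt: u32) -> u8 {
    let masked = amt & 7;          // and_imm $I32 y (shift_mask ty)
    let count = -(masked as i64);  // sub xzr, masked_shift_amt
    // ushl with a negative count is a right shift by its magnitude.
    lane >> count.unsigned_abs() as u32
}

fn main() {
    assert_eq!(ushr_lane_i8(0x80, 1), 0x40);
    assert_eq!(ushr_lane_i8(0x80, 9), 0x40); // 9 wraps to 1
}
```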
@@ -335,7 +335,9 @@ where
     }

     fn shift_mask(&mut self, ty: Type) -> ImmLogic {
-        let mask = (ty.bits() - 1) as u64;
+        debug_assert!(ty.lane_bits().is_power_of_two());
+
+        let mask = (ty.lane_bits() - 1) as u64;
         ImmLogic::maybe_from_u64(mask, I32).unwrap()
     }
@@ -1147,6 +1147,10 @@
 (decl reg_mem_to_xmm_mem (RegMem) XmmMem)
 (extern constructor reg_mem_to_xmm_mem reg_mem_to_xmm_mem)

+;; Construct a new `RegMemImm` from the given `Reg`.
+(decl reg_to_reg_mem_imm (Reg) RegMemImm)
+(extern constructor reg_to_reg_mem_imm reg_to_reg_mem_imm)
+
 ;; Construct a new `GprMemImm` from the given `RegMemImm`.
 ;;
 ;; Asserts that the `RegMemImm`'s register, if any, is an GPR register.
@@ -1354,6 +1358,10 @@
 (decl const_to_type_masked_imm8 (u64 Type) Imm8Gpr)
 (extern constructor const_to_type_masked_imm8 const_to_type_masked_imm8)

+;; Generate a mask for the bit-width of the given type
+(decl shift_mask (Type) u32)
+(extern constructor shift_mask shift_mask)
+
 ;; Extract a constant `GprMemImm.Imm` from a value operand.
 (decl simm32_from_value (GprMemImm) Value)
 (extern extractor simm32_from_value simm32_from_value)
@@ -3043,6 +3051,7 @@
 (convert Xmm RegMem xmm_to_reg_mem)
 (convert Reg Xmm xmm_new)
 (convert Reg XmmMem reg_to_xmm_mem)
+(convert Reg RegMemImm reg_to_reg_mem_imm)
 (convert RegMem XmmMem reg_mem_to_xmm_mem)
 (convert RegMemImm XmmMemImm mov_rmi_to_xmm)
 (convert Xmm XmmMem xmm_to_xmm_mem)
@@ -531,13 +531,15 @@
 ;; in higher feature sets like AVX), we lower the `ishl.i8x16` to a sequence of
 ;; instructions. The basic idea, whether the amount to shift by is an immediate
 ;; or not, is to use a 16x8 shift and then mask off the incorrect bits to 0s.
-(rule (lower (has_type $I8X16 (ishl src amt)))
+(rule (lower (has_type ty @ $I8X16 (ishl src amt)))
   (let (
+        ;; Mask the amount to ensure wrapping behaviour
+        (masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty))))
         ;; Shift `src` using 16x8. Unfortunately, a 16x8 shift will only be
         ;; correct for half of the lanes; the others must be fixed up with
         ;; the mask below.
-        (unmasked Xmm (x64_psllw src (mov_rmi_to_xmm amt)))
-        (mask_addr SyntheticAmode (ishl_i8x16_mask amt))
+        (unmasked Xmm (x64_psllw src (mov_rmi_to_xmm masked_amt)))
+        (mask_addr SyntheticAmode (ishl_i8x16_mask masked_amt))
         (mask Reg (x64_load $I8X16 mask_addr (ExtKind.None))))
     (sse_and $I8X16 unmasked (RegMem.Reg mask))))

@@ -571,16 +573,19 @@
 (rule (ishl_i8x16_mask (RegMemImm.Mem amt))
       (ishl_i8x16_mask (RegMemImm.Reg (x64_load $I64 amt (ExtKind.None)))))

-;; 16x8, 32x4, and 64x2 shifts can each use a single instruction.
+;; 16x8, 32x4, and 64x2 shifts can each use a single instruction, once the shift amount is masked.

-(rule (lower (has_type $I16X8 (ishl src amt)))
-      (x64_psllw src (mov_rmi_to_xmm amt)))
+(rule (lower (has_type ty @ $I16X8 (ishl src amt)))
+      (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty)))))
+        (x64_psllw src (mov_rmi_to_xmm masked_amt))))

-(rule (lower (has_type $I32X4 (ishl src amt)))
-      (x64_pslld src (mov_rmi_to_xmm amt)))
+(rule (lower (has_type ty @ $I32X4 (ishl src amt)))
+      (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty)))))
+        (x64_pslld src (mov_rmi_to_xmm masked_amt))))

-(rule (lower (has_type $I64X2 (ishl src amt)))
-      (x64_psllq src (mov_rmi_to_xmm amt)))
+(rule (lower (has_type ty @ $I64X2 (ishl src amt)))
+      (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty)))))
+        (x64_psllq src (mov_rmi_to_xmm masked_amt))))

 ;;;; Rules for `ushr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

@@ -630,13 +635,15 @@

 ;; There are no 8x16 shifts in x64. Do the same 16x8-shift-and-mask thing we do
 ;; with 8x16 `ishl`.
-(rule (lower (has_type $I8X16 (ushr src amt)))
+(rule (lower (has_type ty @ $I8X16 (ushr src amt)))
   (let (
+        ;; Mask the amount to ensure wrapping behaviour
+        (masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty))))
         ;; Shift `src` using 16x8. Unfortunately, a 16x8 shift will only be
         ;; correct for half of the lanes; the others must be fixed up with
         ;; the mask below.
-        (unmasked Xmm (x64_psrlw src (mov_rmi_to_xmm amt)))
-        (mask_addr SyntheticAmode (ushr_i8x16_mask amt))
+        (unmasked Xmm (x64_psrlw src (mov_rmi_to_xmm masked_amt)))
+        (mask_addr SyntheticAmode (ushr_i8x16_mask masked_amt))
         (mask Reg (x64_load $I8X16 mask_addr (ExtKind.None))))
     (sse_and $I8X16
              unmasked
@@ -673,16 +680,19 @@
 (rule (ushr_i8x16_mask (RegMemImm.Mem amt))
       (ushr_i8x16_mask (RegMemImm.Reg (x64_load $I64 amt (ExtKind.None)))))

-;; 16x8, 32x4, and 64x2 shifts can each use a single instruction.
+;; 16x8, 32x4, and 64x2 shifts can each use a single instruction, once the shift amount is masked.

-(rule (lower (has_type $I16X8 (ushr src amt)))
-      (x64_psrlw src (mov_rmi_to_xmm amt)))
+(rule (lower (has_type ty @ $I16X8 (ushr src amt)))
+      (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty)))))
+        (x64_psrlw src (mov_rmi_to_xmm masked_amt))))

-(rule (lower (has_type $I32X4 (ushr src amt)))
-      (x64_psrld src (mov_rmi_to_xmm amt)))
+(rule (lower (has_type ty @ $I32X4 (ushr src amt)))
+      (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty)))))
+        (x64_psrld src (mov_rmi_to_xmm masked_amt))))

-(rule (lower (has_type $I64X2 (ushr src amt)))
-      (x64_psrlq src (mov_rmi_to_xmm amt)))
+(rule (lower (has_type ty @ $I64X2 (ushr src amt)))
+      (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty)))))
+        (x64_psrlq src (mov_rmi_to_xmm masked_amt))))

 ;;;; Rules for `sshr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

@@ -746,14 +756,16 @@
 ;; hi.i16x8 = [(s8, s8), (s9, s9), ..., (s15, s15)]
 ;; shifted_hi.i16x8 = shift each lane of `high`
 ;; result = [s0'', s1'', ..., s15'']
-(rule (lower (has_type $I8X16 (sshr src amt @ (value_type amt_ty))))
+(rule (lower (has_type ty @ $I8X16 (sshr src amt @ (value_type amt_ty))))
   (let ((src_ Xmm (put_in_xmm src))
+        ;; Mask the amount to ensure wrapping behaviour
+        (masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty))))
         ;; In order for `packsswb` later to only use the high byte of each
         ;; 16x8 lane, we shift right an extra 8 bits, relying on `psraw` to
         ;; fill in the upper bits appropriately.
         (lo Xmm (x64_punpcklbw src_ src_))
         (hi Xmm (x64_punpckhbw src_ src_))
-        (amt_ XmmMemImm (sshr_i8x16_bigger_shift amt_ty amt))
+        (amt_ XmmMemImm (sshr_i8x16_bigger_shift amt_ty masked_amt))
         (shifted_lo Xmm (x64_psraw lo amt_))
         (shifted_hi Xmm (x64_psraw hi amt_)))
     (x64_packsswb shifted_lo shifted_hi)))
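The comments in the rule above describe a widen-shift-narrow trick; here is a hedged scalar Rust model of what it computes for one byte lane (illustrative only, not code from the patch):

```rust
// Each byte is duplicated into both halves of a 16-bit lane (punpcklbw b, b),
// arithmetically shifted right by `amt + 8` (psraw), and narrowed back
// (packsswb); the result equals an 8-bit arithmetic right shift by `amt`.
fn sshr_i8_via_i16(b: i8, amt: u32) -> i8 {
    let widened = ((b as u8 as u16) << 8) | (b as u8 as u16); // punpcklbw
    let shifted = (widened as i16) >> (amt + 8);              // psraw by amt + 8
    shifted as i8                                             // packsswb (no saturation needed here)
}

fn main() {
    for amt in 0..8u32 {
        for b in i8::MIN..=i8::MAX {
            assert_eq!(sshr_i8_via_i16(b, amt), b >> amt);
        }
    }
}
```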
@@ -773,11 +785,13 @@
 ;; `sshr.{i16x8,i32x4}` can be a simple `psra{w,d}`, we just have to make sure
 ;; that if the shift amount is in a register, it is in an XMM register.

-(rule (lower (has_type $I16X8 (sshr src amt)))
-      (x64_psraw src (mov_rmi_to_xmm amt)))
+(rule (lower (has_type ty @ $I16X8 (sshr src amt)))
+      (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty)))))
+        (x64_psraw src (mov_rmi_to_xmm masked_amt))))

-(rule (lower (has_type $I32X4 (sshr src amt)))
-      (x64_psrad src (mov_rmi_to_xmm amt)))
+(rule (lower (has_type ty @ $I32X4 (sshr src amt)))
+      (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty)))))
+        (x64_psrad src (mov_rmi_to_xmm masked_amt))))

 ;; The `sshr.i64x2` CLIF instruction has no single x86 instruction in the older
 ;; feature sets. Newer ones like AVX512VL + AVX512F include `vpsraq`, a 128-bit
@@ -229,6 +229,11 @@ where
             .unwrap()
     }

+    #[inline]
+    fn shift_mask(&mut self, ty: Type) -> u32 {
+        ty.lane_bits() - 1
+    }
+
     #[inline]
     fn simm32_from_value(&mut self, val: Value) -> Option<GprMemImm> {
         let inst = self.lower_ctx.dfg().value_def(val).inst()?;
@@ -415,6 +420,11 @@ where
         Writable::from_reg(Xmm::new(self.temp_writable_reg(I8X16).to_reg()).unwrap())
     }

+    #[inline]
+    fn reg_to_reg_mem_imm(&mut self, reg: Reg) -> RegMemImm {
+        RegMemImm::Reg { reg }
+    }
+
     #[inline]
     fn reg_mem_to_xmm_mem(&mut self, rm: &RegMem) -> XmmMem {
         XmmMem::new(rm.clone()).unwrap()
@@ -344,9 +344,10 @@ block0(v0: i8x16):

 ; block0:
 ; movz x3, #1
-; sub w5, wzr, w3
-; dup v7.16b, w5
-; ushl v0.16b, v0.16b, v7.16b
+; and w5, w3, #7
+; sub x7, xzr, x5
+; dup v17.16b, w7
+; ushl v0.16b, v0.16b, v17.16b
 ; ret

 function %add_i128(i128, i128) -> i128 {
@@ -492,4 +493,3 @@ block0(v0: i64):
 ; b.vc 8 ; udf
 ; sdiv x0, x0, x3
 ; ret
-
@@ -206,12 +206,13 @@ block0(v0: i32):
 ; movq %rsp, %rbp
 ; block0:
 ; load_const VCodeConstant(1), %xmm0
-; movd %edi, %xmm5
-; psllw %xmm0, %xmm5, %xmm0
-; lea const(VCodeConstant(0)), %rsi
+; andq %rdi, $7, %rdi
+; movd %edi, %xmm7
+; psllw %xmm0, %xmm7, %xmm0
+; lea const(VCodeConstant(0)), %rax
 ; shlq $4, %rdi, %rdi
-; movdqu 0(%rsi,%rdi,1), %xmm13
-; pand %xmm0, %xmm13, %xmm0
+; movdqu 0(%rax,%rdi,1), %xmm15
+; pand %xmm0, %xmm15, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -228,9 +229,14 @@ block0:
 ; movq %rsp, %rbp
 ; block0:
 ; load_const VCodeConstant(1), %xmm0
-; psrlw %xmm0, $1, %xmm0
-; movdqu const(VCodeConstant(0)), %xmm5
-; pand %xmm0, %xmm5, %xmm0
+; movl $1, %r11d
+; andq %r11, $7, %r11
+; movd %r11d, %xmm7
+; psrlw %xmm0, %xmm7, %xmm0
+; lea const(VCodeConstant(0)), %rax
+; shlq $4, %r11, %r11
+; movdqu 0(%rax,%r11,1), %xmm15
+; pand %xmm0, %xmm15, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -245,15 +251,16 @@ block0(v0: i32):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block0:
-; load_const VCodeConstant(0), %xmm9
-; movdqa %xmm9, %xmm0
-; punpcklbw %xmm0, %xmm9, %xmm0
-; punpckhbw %xmm9, %xmm9, %xmm9
+; load_const VCodeConstant(0), %xmm10
+; andq %rdi, $7, %rdi
+; movdqa %xmm10, %xmm0
+; punpcklbw %xmm0, %xmm10, %xmm0
+; punpckhbw %xmm10, %xmm10, %xmm10
 ; addl %edi, $8, %edi
-; movd %edi, %xmm11
-; psraw %xmm0, %xmm11, %xmm0
-; psraw %xmm9, %xmm11, %xmm9
-; packsswb %xmm0, %xmm9, %xmm0
+; movd %edi, %xmm13
+; psraw %xmm0, %xmm13, %xmm0
+; psraw %xmm10, %xmm13, %xmm10
+; packsswb %xmm0, %xmm10, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -267,17 +274,19 @@ block0(v0: i8x16, v1: i32):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block0:
-; movdqa %xmm0, %xmm9
-; punpcklbw %xmm9, %xmm0, %xmm9
+; movl $3, %esi
+; andq %rsi, $7, %rsi
+; movdqa %xmm0, %xmm15
+; punpcklbw %xmm15, %xmm0, %xmm15
+; movdqa %xmm15, %xmm13
 ; punpckhbw %xmm0, %xmm0, %xmm0
-; movdqa %xmm9, %xmm12
-; psraw %xmm12, $11, %xmm12
-; movdqa %xmm12, %xmm9
-; psraw %xmm0, $11, %xmm0
-; movdqa %xmm9, %xmm1
-; packsswb %xmm1, %xmm0, %xmm1
-; movdqa %xmm1, %xmm9
-; movdqa %xmm9, %xmm0
+; movdqa %xmm0, %xmm7
+; addl %esi, $8, %esi
+; movd %esi, %xmm15
+; movdqa %xmm13, %xmm0
+; psraw %xmm0, %xmm15, %xmm0
+; psraw %xmm7, %xmm15, %xmm7
+; packsswb %xmm0, %xmm7, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -13,3 +13,33 @@ block0(v0: i32x4, v1: i32x4, v2: i32x4):
 ; run: %bitselect_i32x4(0x11111111111111111111111111111111, 0x11111111111111111111111111111111, 0x00000000000000000000000000000000) == 0x11111111111111111111111111111111
 ; run: %bitselect_i32x4(0x01010011000011110000000011111111, 0x11111111111111111111111111111111, 0x00000000000000000000000000000000) == 0x01010011000011110000000011111111
 ; run: %bitselect_i32x4(0x00000000000000001111111111111111, 0x00000000000000000000000000000000, 0x11111111111111111111111111111111) == 0x11111111111111110000000000000000
+
+function %bitselect_i8x16(i8x16, i8x16, i8x16) -> i8x16 {
+block0(v0: i8x16, v1: i8x16, v2: i8x16):
+    v3 = bitselect v0, v1, v2
+    return v3
+}
+; Remember that bitselect accepts: 1) the selector vector, 2) the "if true" vector, and 3) the "if false" vector.
+; run: %bitselect_i8x16([0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 255], [127 0 0 0 0 0 0 0 0 0 0 0 0 0 0 42], [42 0 0 0 0 0 0 0 0 0 0 0 0 0 0 127]) == [42 0 0 0 0 0 0 0 0 0 0 0 0 0 0 42]
+
+function %bitselect_i8x16() -> b1 {
+block0:
+    v0 = vconst.i8x16 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 255] ; the selector vector
+    v1 = vconst.i8x16 [127 0 0 0 0 0 0 0 0 0 0 0 0 0 0 42] ; for each 1-bit in v0 the bit of v1 is selected
+    v2 = vconst.i8x16 [42 0 0 0 0 0 0 0 0 0 0 0 0 0 0 127] ; for each 0-bit in v0 the bit of v2 is selected
+    v3 = bitselect v0, v1, v2
+
+    v4 = extractlane v3, 0
+    v5 = icmp_imm eq v4, 42
+
+    v6 = extractlane v3, 1
+    v7 = icmp_imm eq v6, 0
+
+    v8 = extractlane v3, 15
+    v9 = icmp_imm eq v8, 42
+
+    v10 = band v5, v7
+    v11 = band v10, v9
+    return v11
+}
+; run
@@ -1,216 +0,0 @@ (deleted file)
test run
set enable_simd
target aarch64
; target s390x FIXME: s390x implements modulo semantics for shift counts
target x86_64 skylake

; TODO: once available, replace all lane extraction with `icmp + all_ones`

function %ishl_i32x4() -> b1 {
block0:
    v0 = iconst.i32 1
    v1 = vconst.i32x4 [1 2 4 8]
    v2 = ishl v1, v0

    v3 = extractlane v2, 0
    v4 = icmp_imm eq v3, 2

    v5 = extractlane v2, 3
    v6 = icmp_imm eq v5, 16

    v7 = band v4, v6
    return v7
}
; run

function %ishl_too_large_i16x8() -> b1 {
block0:
    v0 = iconst.i32 17 ; note that this will shift off the end of each lane
    v1 = vconst.i16x8 [1 2 4 8 16 32 64 128]
    v2 = ishl v1, v0

    v3 = extractlane v2, 0
    v4 = icmp_imm eq v3, 0

    v5 = extractlane v2, 3
    v6 = icmp_imm eq v5, 0

    v7 = band v4, v6
    return v7
}
; run

function %ushr_i8x16() -> b1 {
block0:
    v0 = iconst.i32 1
    v1 = vconst.i8x16 [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
    v2 = ushr v1, v0

    v3 = vconst.i8x16 [0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7]
    v4 = icmp eq v2, v3
    v5 = vall_true v4
    return v5
}
; run

function %sshr_i8x16() -> b1 {
block0:
    v0 = iconst.i32 1
    v1 = vconst.i8x16 [0 0xff 2 0xfd 4 0xfb 6 0xf9 8 0xf7 10 0xf5 12 0xf3 14 0xf1]
    v2 = sshr v1, v0

    v3 = vconst.i8x16 [0 0xff 1 0xfe 2 0xfd 3 0xfc 4 0xfb 5 0xfa 6 0xf9 7 0xf8]
    v4 = icmp eq v2, v3
    v5 = vall_true v4
    return v5
}
; run

function %ishl_i8x16() -> b1 {
block0:
    v0 = iconst.i32 1
    v1 = vconst.i8x16 [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
    v2 = ishl v1, v0

    v3 = vconst.i8x16 [0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30]
    v4 = icmp eq v2, v3
    v5 = vall_true v4
    return v5
}
; run

function %ushr_i64x2() -> b1 {
block0:
    v0 = iconst.i32 1
    v1 = vconst.i64x2 [1 2]
    v2 = ushr v1, v0

    v3 = extractlane v2, 0
    v4 = icmp_imm eq v3, 0

    v5 = extractlane v2, 1
    v6 = icmp_imm eq v5, 1

    v7 = band v4, v6
    return v7
}
; run

function %ushr_too_large_i32x4() -> b1 {
block0:
    v0 = iconst.i32 33 ; note that this will shift off the end of each lane
    v1 = vconst.i32x4 [1 2 4 8]
    v2 = ushr v1, v0

    v3 = extractlane v2, 0
    v4 = icmp_imm eq v3, 0

    v5 = extractlane v2, 3
    v6 = icmp_imm eq v5, 0

    v7 = band v4, v6
    return v7
}
; run

function %sshr_i16x8() -> b1 {
block0:
    v0 = iconst.i32 1
    v1 = vconst.i16x8 [-1 2 4 8 -16 32 64 128]
    v2 = sshr v1, v0

    v3 = extractlane v2, 0
    v4 = icmp_imm eq v3, 0xffff ; because of the shifted-in sign-bit, this remains 0xffff == -1

    v5 = extractlane v2, 4
    v6 = icmp_imm eq v5, 0xfff8 ; -16 has been shifted to -8 == 0xfff8

    v7 = band v4, v6
    return v7
}
; run

function %sshr_too_large_i32x4() -> b1 {
block0:
    v0 = iconst.i32 33 ; note that this will shift off the end of each lane
    v1 = vconst.i32x4 [1 2 4 -8]
    v2 = sshr v1, v0

    v3 = extractlane v2, 0
    v4 = icmp_imm eq v3, 0

    v5 = extractlane v2, 3
    v6 = icmp_imm eq v5, 0xffff_ffff ; shifting in the sign-bit repeatedly fills the result with 1s

    v7 = band v4, v6
    return v7
}
; run

function %sshr_i64x2(i64x2, i32) -> i64x2 {
block0(v0:i64x2, v1:i32):
    v2 = sshr v0, v1
    return v2
}
; run: %sshr_i64x2([1 -1], 0) == [1 -1]
; run: %sshr_i64x2([1 -1], 1) == [0 -1] ; note the -1 shift result
; run: %sshr_i64x2([2 -2], 1) == [1 -1]
; run: %sshr_i64x2([0x80000000_00000000 0x7FFFFFFF_FFFFFFFF], 63) == [0xFFFFFFFF_FFFFFFFF 0]

function %bitselect_i8x16() -> b1 {
block0:
    v0 = vconst.i8x16 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 255] ; the selector vector
    v1 = vconst.i8x16 [127 0 0 0 0 0 0 0 0 0 0 0 0 0 0 42] ; for each 1-bit in v0 the bit of v1 is selected
    v2 = vconst.i8x16 [42 0 0 0 0 0 0 0 0 0 0 0 0 0 0 127] ; for each 0-bit in v0 the bit of v2 is selected
    v3 = bitselect v0, v1, v2

    v4 = extractlane v3, 0
    v5 = icmp_imm eq v4, 42

    v6 = extractlane v3, 1
    v7 = icmp_imm eq v6, 0

    v8 = extractlane v3, 15
    v9 = icmp_imm eq v8, 42

    v10 = band v5, v7
    v11 = band v10, v9
    return v11
}
; run

function %sshr_imm_i32x4() -> b1 {
block0:
    v1 = vconst.i32x4 [1 2 4 -8]
    v2 = sshr_imm v1, 1

    v3 = vconst.i32x4 [0 1 2 -4]
    v4 = icmp eq v2, v3
    v5 = vall_true v4
    return v5
}
; run

function %sshr_imm_i16x8() -> b1 {
block0:
    v1 = vconst.i16x8 [1 2 4 -8 0 0 0 0]
    v2 = ushr_imm v1, 1

    v3 = vconst.i16x8 [0 1 2 32764 0 0 0 0] ; -4 with MSB unset == 32764
    v4 = icmp eq v2, v3
    v5 = vall_true v4
    return v5
}
; run

function %ishl_imm_i64x2() -> b1 {
block0:
    v1 = vconst.i64x2 [1 0]
    v2 = ishl_imm v1, 1

    v3 = vconst.i64x2 [2 0]
    v4 = icmp eq v2, v3
    v5 = vall_true v4
    return v5
}
; run
@@ -1,132 +0,0 @@ (deleted file)
test run
target aarch64
; target s390x FIXME: s390x implements modulo semantics for shift counts
set enable_simd
target x86_64 skylake

function %bitselect_i8x16(i8x16, i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16, v2: i8x16):
    v3 = bitselect v0, v1, v2
    return v3
}
; Remember that bitselect accepts: 1) the selector vector, 2) the "if true" vector, and 3) the "if false" vector.
; run: %bitselect_i8x16([0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 255], [127 0 0 0 0 0 0 0 0 0 0 0 0 0 0 42], [42 0 0 0 0 0 0 0 0 0 0 0 0 0 0 127]) == [42 0 0 0 0 0 0 0 0 0 0 0 0 0 0 42]

function %vselect_i32x4(i32x4, i32x4) -> i32x4 {
block0(v1: i32x4, v2: i32x4):
    ; `make_trampoline` still does not know how to convert boolean vector types
    ; so we load the value directly here.
    v0 = vconst.b32x4 [true true false false]
    v3 = vselect v0, v1, v2
    return v3
}
; Remember that vselect accepts: 1) the selector vector, 2) the "if true" vector, and 3) the "if false" vector.
; run: %vselect_i8x16([1 2 -1 -1], [-1 -1 3 4]) == [1 2 3 4]


; shift left

function %ishl_i8x16(i8x16, i32) -> i8x16 {
block0(v0: i8x16, v1: i32):
    v2 = ishl v0, v1
    return v2
}
; run: %ishl_i8x16([0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15], 4) == [0x00 0x10 0x20 0x30 0x40 0x50 0x60 0x70 0x80 0x90 0xa0 0xb0 0xc0 0xd0 0xe0 0xf0]

function %ishl_i16x8(i16x8, i32) -> i16x8 {
block0(v0: i16x8, v1: i32):
    v2 = ishl v0, v1
    return v2
}
; run: %ishl_i16x8([1 2 4 8 16 32 64 128], 17) == [0 0 0 0 0 0 0 0]

function %ishl_i32x4(i32x4, i32) -> i32x4 {
block0(v0: i32x4, v1: i32):
    v2 = ishl v0, v1
    return v2
}
; run: %ishl_i32x4([1 2 4 8], 1) == [2 4 8 16]

function %ishl_imm_i64x2(i64x2) -> i64x2 {
block0(v0: i64x2):
    v2 = ishl_imm v0, 1
    return v2
}
; run: %ishl_imm_i64x2([1 0]) == [2 0]


; shift right (logical)

function %ushr_i8x16(i8x16, i32) -> i8x16 {
block0(v0: i8x16, v1: i32):
    v2 = ushr v0, v1
    return v2
}
; run: %ushr_i8x16([0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15], 1) == [0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7]

function %ushr_i32x4(i32x4, i32) -> i32x4 {
block0(v0: i32x4, v1: i32):
    v2 = ushr v0, v1
    return v2
}
; run: %ushr_i32x4([1 2 4 8], 33) == [0 0 0 0]

function %ushr_i64x2(i64x2, i32) -> i64x2 {
block0(v0: i64x2, v1: i32):
    v2 = ushr v0, v1
    return v2
}
; run: %ushr_i64x2([1 2], 1) == [0 1]


; shift right (arithmetic)

function %sshr_i8x16(i8x16, i32) -> i8x16 {
block0(v0: i8x16, v1: i32):
    v2 = sshr v0, v1
    return v2
}
; run: %sshr_i8x16([0 0xff 2 0xfd 4 0xfb 6 0xf9 8 0xf7 10 0xf5 12 0xf3 14 0xf1], 1) == [0 0xff 1 0xfe 2 0xfd 3 0xfc 4 0xfb 5 0xfa 6 0xf9 7 0xf8]

function %sshr_i16x8(i16x8, i32) -> i16x8 {
block0(v0: i16x8, v1: i32):
    v2 = sshr v0, v1
    return v2
}
; note: because of the shifted-in sign-bit, lane 0 remains -1 == 0xffff, whereas lane 4 has been shifted to -8 == 0xfff8
; run: %ushr_i16x8([-1 2 4 8 -16 32 64 128], 1) == [-1 1 2 4 -8 16 32 64]

function %sshr_i32x4(i32x4, i32) -> i32x4 {
block0(v0: i32x4, v1: i32):
    v2 = sshr v0, v1
    return v2
}
; note: shifting in the sign-bit repeatedly in lane 3 fills the result with 1s (-1 == 0xffff_ffff)
; run: %ushr_i32x4([1 2 4 -8], 33) == [0 0 0 0xffff_ffff]

function %sshr_i64x2(i64x2, i32) -> i64x2 {
block0(v0:i64x2, v1:i32):
    v2 = sshr v0, v1
    return v2
}
; run: %sshr_i64x2([1 -1], 0) == [1 -1]
; run: %sshr_i64x2([1 -1], 1) == [0 -1] ; note the -1 shift result
; run: %sshr_i64x2([2 -2], 1) == [1 -1]
; run: %sshr_i64x2([0x80000000_00000000 0x7FFFFFFF_FFFFFFFF], 63) == [0xFFFFFFFF_FFFFFFFF 0]

function %sshr_imm_i32x4(i32x4) -> i32x4 {
block0(v0: i32x4):
    v1 = sshr_imm v0, 1
    return v1
}
; run: %sshr_imm_i32x4([1 2 4 -8]) == [0 1 2 -4]

function %sshr_imm_i16x8(i16x8) -> i16x8 {
block0(v0: i16x8):
    v1 = sshr_imm v0, 1
    return v1
}
; run: %sshr_imm_i16x8([1 2 4 -8 0 0 0 0]) == [0 1 2 -4 0 0 0 0]
cranelift/filetests/filetests/runtests/simd-ishl.clif (new file, 46 lines)
@@ -0,0 +1,46 @@
test run
set enable_simd
target aarch64
target s390x
target x86_64 skylake


function %ishl_i8x16(i8x16, i32) -> i8x16 {
block0(v0: i8x16, v1: i32):
    v2 = ishl v0, v1
    return v2
}
; run: %ishl_i8x16([0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15], 4) == [0x00 0x10 0x20 0x30 0x40 0x50 0x60 0x70 0x80 0x90 0xa0 0xb0 0xc0 0xd0 0xe0 0xf0]
; run: %ishl_i8x16([0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15], 12) == [0x00 0x10 0x20 0x30 0x40 0x50 0x60 0x70 0x80 0x90 0xa0 0xb0 0xc0 0xd0 0xe0 0xf0]

function %ishl_i16x8(i16x8, i32) -> i16x8 {
block0(v0: i16x8, v1: i32):
    v2 = ishl v0, v1
    return v2
}
; run: %ishl_i16x8([1 2 4 8 16 32 64 128], 1) == [2 4 8 16 32 64 128 256]
; run: %ishl_i16x8([1 2 4 8 16 32 64 128], 17) == [2 4 8 16 32 64 128 256]

function %ishl_i32x4(i32x4, i32) -> i32x4 {
block0(v0: i32x4, v1: i32):
    v2 = ishl v0, v1
    return v2
}
; run: %ishl_i32x4([1 2 4 8], 1) == [2 4 8 16]
; run: %ishl_i32x4([1 2 4 8], 33) == [2 4 8 16]

function %ishl_i64x2(i64x2, i32) -> i64x2 {
block0(v0: i64x2, v1: i32):
    v2 = ishl v0, v1
    return v2
}
; run: %ishl_i64x2([1 2], 1) == [2 4]
; run: %ishl_i64x2([1 2], 65) == [2 4]


function %ishl_imm_i64x2(i64x2) -> i64x2 {
block0(v0: i64x2):
    v2 = ishl_imm v0, 1
    return v2
}
; run: %ishl_imm_i64x2([1 0]) == [2 0]
cranelift/filetests/filetests/runtests/simd-sshr.clif (new file, 58 lines)
@@ -0,0 +1,58 @@
test run
set enable_simd
target aarch64
target s390x
target x86_64 skylake


function %sshr_i8x16(i8x16, i32) -> i8x16 {
block0(v0: i8x16, v1: i32):
    v2 = sshr v0, v1
    return v2
}
; run: %sshr_i8x16([0 0xff 2 0xfd 4 0xfb 6 0xf9 8 0xf7 10 0xf5 12 0xf3 14 0xf1], 1) == [0 0xff 1 0xfe 2 0xfd 3 0xfc 4 0xfb 5 0xfa 6 0xf9 7 0xf8]
; run: %sshr_i8x16([0 0xff 2 0xfd 4 0xfb 6 0xf9 8 0xf7 10 0xf5 12 0xf3 14 0xf1], 9) == [0 0xff 1 0xfe 2 0xfd 3 0xfc 4 0xfb 5 0xfa 6 0xf9 7 0xf8]

function %sshr_i16x8(i16x8, i32) -> i16x8 {
block0(v0: i16x8, v1: i32):
    v2 = sshr v0, v1
    return v2
}
; note: because of the shifted-in sign-bit, lane 0 remains -1 == 0xffff, whereas lane 4 has been shifted to -8 == 0xfff8
; run: %sshr_i16x8([-1 2 4 8 -16 32 64 128], 1) == [-1 1 2 4 -8 16 32 64]
; run: %sshr_i16x8([-1 2 4 8 -16 32 64 128], 17) == [-1 1 2 4 -8 16 32 64]

function %sshr_i32x4(i32x4, i32) -> i32x4 {
block0(v0: i32x4, v1: i32):
    v2 = sshr v0, v1
    return v2
}
; run: %sshr_i32x4([1 2 4 -8], 1) == [0 1 2 -4]
; run: %sshr_i32x4([1 2 4 -8], 33) == [0 1 2 -4]

function %sshr_i64x2(i64x2, i32) -> i64x2 {
block0(v0:i64x2, v1:i32):
    v2 = sshr v0, v1
    return v2
}
; run: %sshr_i64x2([1 -1], 0) == [1 -1]
; run: %sshr_i64x2([1 -1], 1) == [0 -1] ; note the -1 shift result
; run: %sshr_i64x2([2 -2], 1) == [1 -1]
; run: %sshr_i64x2([0x80000000_00000000 0x7FFFFFFF_FFFFFFFF], 63) == [0xFFFFFFFF_FFFFFFFF 0]
; run: %sshr_i64x2([2 -2], 65) == [1 -1]


function %sshr_imm_i32x4(i32x4) -> i32x4 {
block0(v0: i32x4):
    v1 = sshr_imm v0, 1
    return v1
}
; run: %sshr_imm_i32x4([1 2 4 -8]) == [0 1 2 -4]

function %sshr_imm_i16x8(i16x8) -> i16x8 {
block0(v0: i16x8):
    v1 = sshr_imm v0, 1
    return v1
}
; run: %sshr_imm_i16x8([1 2 4 -8 0 0 0 0]) == [0 1 2 -4 0 0 0 0]
cranelift/filetests/filetests/runtests/simd-ushr.clif (new file, 52 lines)
@@ -0,0 +1,52 @@
test run
set enable_simd
target aarch64
target s390x
target x86_64 skylake


function %ushr_i8x16(i8x16, i32) -> i8x16 {
block0(v0: i8x16, v1: i32):
    v2 = ushr v0, v1
    return v2
}
; run: %ushr_i8x16([0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15], 1) == [0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7]
; run: %ushr_i8x16([0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15], 9) == [0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7]

function %ushr_i16x8(i16x8, i32) -> i16x8 {
block0(v0: i16x8, v1: i32):
    v2 = ushr v0, v1
    return v2
}
; run: %ushr_i16x8([0 1 2 3 4 5 6 7], 1) == [0 0 1 1 2 2 3 3]
; run: %ushr_i16x8([0 1 2 3 4 5 6 7], 17) == [0 0 1 1 2 2 3 3]
; run: %ushr_i16x8([1 2 4 -8 0 0 0 0], 1) == [0 1 2 32764 0 0 0 0]

function %ushr_i32x4(i32x4, i32) -> i32x4 {
block0(v0: i32x4, v1: i32):
    v2 = ushr v0, v1
    return v2
}
; run: %ushr_i32x4([1 2 4 8], 1) == [0 1 2 4]
; run: %ushr_i32x4([1 2 4 8], 33) == [0 1 2 4]

function %ushr_i64x2(i64x2, i32) -> i64x2 {
block0(v0: i64x2, v1: i32):
    v2 = ushr v0, v1
    return v2
}
; run: %ushr_i64x2([1 2], 1) == [0 1]
; run: %ushr_i64x2([1 2], 65) == [0 1]


function %sshr_imm_i16x8() -> b1 {
block0:
    v1 = vconst.i16x8 [1 2 4 -8 0 0 0 0]
    v2 = ushr_imm v1, 1

    v3 = vconst.i16x8 [0 1 2 32764 0 0 0 0] ; -4 with MSB unset == 32764
    v4 = icmp eq v2, v3
    v5 = vall_true v4
    return v5
}
; run
@@ -72,3 +72,15 @@ block0(v0: b64x2, v1: i64x2, v2: i64x2):
     return v3
 }
 ; run: %vselect_p_i64x2([true false], [1 2], [100000000000 200000000000]) == [1 200000000000]
+
+
+function %vselect_i32x4(i32x4, i32x4) -> i32x4 {
+block0(v1: i32x4, v2: i32x4):
+    ; `make_trampoline` still does not know how to convert boolean vector types
+    ; so we load the value directly here.
+    v0 = vconst.b32x4 [true true false false]
+    v3 = vselect v0, v1, v2
+    return v3
+}
+; Remember that vselect accepts: 1) the selector vector, 2) the "if true" vector, and 3) the "if false" vector.
+; run: %vselect_i32x4([1 2 -1 -1], [-1 -1 3 4]) == [1 2 3 4]
@@ -1630,29 +1630,23 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
         Operator::I8x16Shl | Operator::I16x8Shl | Operator::I32x4Shl | Operator::I64x2Shl => {
             let (a, b) = state.pop2();
            let bitcast_a = optionally_bitcast_vector(a, type_of(op), builder);
-            let bitwidth = i64::from(type_of(op).lane_bits());
-            // The spec expects to shift with `b mod lanewidth`; so, e.g., for 16 bit lane-width
-            // we do `b AND 15`; this means fewer instructions than `iconst + urem`.
-            let b_mod_bitwidth = builder.ins().band_imm(b, bitwidth - 1);
-            state.push1(builder.ins().ishl(bitcast_a, b_mod_bitwidth))
+            // The spec expects to shift with `b mod lanewidth`; This is directly compatible
+            // with cranelift's instruction.
+            state.push1(builder.ins().ishl(bitcast_a, b))
         }
         Operator::I8x16ShrU | Operator::I16x8ShrU | Operator::I32x4ShrU | Operator::I64x2ShrU => {
             let (a, b) = state.pop2();
             let bitcast_a = optionally_bitcast_vector(a, type_of(op), builder);
-            let bitwidth = i64::from(type_of(op).lane_bits());
-            // The spec expects to shift with `b mod lanewidth`; so, e.g., for 16 bit lane-width
-            // we do `b AND 15`; this means fewer instructions than `iconst + urem`.
-            let b_mod_bitwidth = builder.ins().band_imm(b, bitwidth - 1);
-            state.push1(builder.ins().ushr(bitcast_a, b_mod_bitwidth))
+            // The spec expects to shift with `b mod lanewidth`; This is directly compatible
+            // with cranelift's instruction.
+            state.push1(builder.ins().ushr(bitcast_a, b))
        }
         Operator::I8x16ShrS | Operator::I16x8ShrS | Operator::I32x4ShrS | Operator::I64x2ShrS => {
             let (a, b) = state.pop2();
             let bitcast_a = optionally_bitcast_vector(a, type_of(op), builder);
-            let bitwidth = i64::from(type_of(op).lane_bits());
-            // The spec expects to shift with `b mod lanewidth`; so, e.g., for 16 bit lane-width
-            // we do `b AND 15`; this means fewer instructions than `iconst + urem`.
-            let b_mod_bitwidth = builder.ins().band_imm(b, bitwidth - 1);
-            state.push1(builder.ins().sshr(bitcast_a, b_mod_bitwidth))
+            // The spec expects to shift with `b mod lanewidth`; This is directly compatible
+            // with cranelift's instruction.
+            state.push1(builder.ins().sshr(bitcast_a, b))
         }
         Operator::V128Bitselect => {
             let (a, b, c) = state.pop3();
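With the backends now guaranteeing the wrapping rule, the explicit wasm-level mask removed above became redundant. A hedged property sketch (illustrative only, not code from the patch):

```rust
// Per-lane behaviour the frontend can now rely on for `ishl.i16x8`:
// the shift amount is taken modulo the lane width by Cranelift itself.
fn ishl_i16_lane(lane: u16, amt: u32) -> u16 {
    lane.wrapping_shl(amt % 16)
}

fn main() {
    let amt = 17u32;
    // What the removed `band_imm(b, lane_bits - 1)` used to enforce is now implied.
    assert_eq!(ishl_i16_lane(8, amt), 8u16.wrapping_shl(amt & 15));
}
```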