cranelift: Align Scalar and SIMD shift semantics (#4520)
* cranelift: Reorganize test suite: group some SIMD operations by instruction.
* cranelift: Deduplicate some shift tests; also add new tests for the mod behaviour.
* aarch64: Lower shifts with mod behaviour.
* x64: Lower shifts with mod behaviour.
* wasmtime: Don't mask SIMD shifts.
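In CLIF, scalar shifts already take the shift amount modulo the bit width of the type; this commit gives SIMD shifts the same wrapping ("mod") behaviour by masking the amount with `lane_bits - 1` in each backend, so wasmtime no longer has to mask it in the translator. A minimal reference model of the agreed semantics (plain Rust, not Cranelift code; the function name is illustrative):

// Reference model of the shift semantics this commit standardizes on:
// the amount is taken modulo the lane width, matching scalar
// `ishl`/`ushr`/`sshr` and the Wasm SIMD spec.
fn ishl_i8x16(lanes: [u8; 16], amt: u32) -> [u8; 16] {
    // `lane_bits - 1` works as a modulo mask because lane widths are
    // powers of two: amt & 7 == amt % 8 here.
    let masked = amt & (u8::BITS - 1);
    lanes.map(|lane| lane << masked)
}

fn main() {
    // Shifting by 9 is equivalent to shifting by 9 % 8 == 1.
    assert_eq!(ishl_i8x16([1; 16], 9), [2; 16]);
}

The backend diffs below implement this masking in the aarch64 and x64 lowering rules.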
@@ -927,7 +927,8 @@
 ;; Shift for vector types.
 (rule (lower (has_type (ty_vec128 ty) (ishl x y)))
       (let ((size VectorSize (vector_size ty))
-            (shift Reg (vec_dup y size)))
+            (masked_shift_amt Reg (and_imm $I32 y (shift_mask ty)))
+            (shift Reg (vec_dup masked_shift_amt size)))
         (sshl x shift size)))

 ;; Helper function to emit a shift operation with the opcode specified and
@@ -986,7 +987,8 @@
 ;; Vector shifts.
 (rule (lower (has_type (ty_vec128 ty) (ushr x y)))
       (let ((size VectorSize (vector_size ty))
-            (shift Reg (vec_dup (sub $I32 (zero_reg) y) size)))
+            (masked_shift_amt Reg (and_imm $I32 y (shift_mask ty)))
+            (shift Reg (vec_dup (sub $I64 (zero_reg) masked_shift_amt) size)))
         (ushl x shift size)))

 ;; lsr lo_rshift, src_lo, amt
@@ -1035,7 +1037,8 @@
 ;; Note that right shifts are implemented with a negative left shift.
 (rule (lower (has_type (ty_vec128 ty) (sshr x y)))
       (let ((size VectorSize (vector_size ty))
-            (shift Reg (vec_dup (sub $I32 (zero_reg) y) size)))
+            (masked_shift_amt Reg (and_imm $I32 y (shift_mask ty)))
+            (shift Reg (vec_dup (sub $I64 (zero_reg) masked_shift_amt) size)))
         (sshl x shift size)))

 ;; lsr lo_rshift, src_lo, amt
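Both right-shift lowerings rely on the trick the comment names: AArch64's `sshl`/`ushl` vector instructions interpret a negative per-lane shift amount as a right shift, so the backend negates the masked amount and reuses the left-shift instruction. A per-lane sketch of that behaviour (illustrative Rust, a signed `i8` lane assumed; not the actual instruction semantics in full):

// Model of one SSHL lane: positive amounts shift left, negative shift right.
fn sshl_lane(lane: i8, amt: i8) -> i8 {
    if amt >= 0 {
        lane << amt
    } else {
        lane >> -amt // arithmetic shift, since the lane is signed
    }
}

fn main() {
    // sshr x, 2 is lowered as sshl x, -2.
    assert_eq!(sshl_lane(-64, -2), -16);
}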
@@ -335,7 +335,9 @@ where
     }

     fn shift_mask(&mut self, ty: Type) -> ImmLogic {
-        let mask = (ty.bits() - 1) as u64;
+        debug_assert!(ty.lane_bits().is_power_of_two());
+
+        let mask = (ty.lane_bits() - 1) as u64;
         ImmLogic::maybe_from_u64(mask, I32).unwrap()
     }
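This hunk fixes the mask itself: `ty.bits()` is the width of the whole vector (128 for `i8x16`), so the old code produced a mask of 127 instead of 7. Illustrative values for the corrected computation (a hypothetical standalone port of the helper, not the Cranelift method):

fn shift_mask(lane_bits: u32) -> u32 {
    debug_assert!(lane_bits.is_power_of_two());
    lane_bits - 1
}

fn main() {
    assert_eq!(shift_mask(8), 0x7);   // i8x16: amounts wrap mod 8
    assert_eq!(shift_mask(64), 0x3F); // i64x2: amounts wrap mod 64
    // With ty.bits() on a 128-bit vector the mask would have been 127,
    // which fails to wrap amounts in 8..=127 for narrow lanes.
}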
@@ -1147,6 +1147,10 @@
 (decl reg_mem_to_xmm_mem (RegMem) XmmMem)
 (extern constructor reg_mem_to_xmm_mem reg_mem_to_xmm_mem)

+;; Construct a new `RegMemImm` from the given `Reg`.
+(decl reg_to_reg_mem_imm (Reg) RegMemImm)
+(extern constructor reg_to_reg_mem_imm reg_to_reg_mem_imm)
+
 ;; Construct a new `GprMemImm` from the given `RegMemImm`.
 ;;
 ;; Asserts that the `RegMemImm`'s register, if any, is a GPR register.
@@ -1354,6 +1358,10 @@
 (decl const_to_type_masked_imm8 (u64 Type) Imm8Gpr)
 (extern constructor const_to_type_masked_imm8 const_to_type_masked_imm8)

+;; Generate a mask for the bit-width of the given type.
+(decl shift_mask (Type) u32)
+(extern constructor shift_mask shift_mask)
+
 ;; Extract a constant `GprMemImm.Imm` from a value operand.
 (decl simm32_from_value (GprMemImm) Value)
 (extern extractor simm32_from_value simm32_from_value)
@@ -3043,6 +3051,7 @@
 (convert Xmm RegMem xmm_to_reg_mem)
 (convert Reg Xmm xmm_new)
 (convert Reg XmmMem reg_to_xmm_mem)
+(convert Reg RegMemImm reg_to_reg_mem_imm)
 (convert RegMem XmmMem reg_mem_to_xmm_mem)
 (convert RegMemImm XmmMemImm mov_rmi_to_xmm)
 (convert Xmm XmmMem xmm_to_xmm_mem)
@@ -531,13 +531,15 @@
 ;; in higher feature sets like AVX), we lower the `ishl.i8x16` to a sequence of
 ;; instructions. The basic idea, whether the amount to shift by is an immediate
 ;; or not, is to use a 16x8 shift and then mask off the incorrect bits to 0s.
-(rule (lower (has_type $I8X16 (ishl src amt)))
+(rule (lower (has_type ty @ $I8X16 (ishl src amt)))
   (let (
+        ;; Mask the amount to ensure wrapping behaviour
+        (masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty))))
         ;; Shift `src` using 16x8. Unfortunately, a 16x8 shift will only be
         ;; correct for half of the lanes; the others must be fixed up with
         ;; the mask below.
-        (unmasked Xmm (x64_psllw src (mov_rmi_to_xmm amt)))
-        (mask_addr SyntheticAmode (ishl_i8x16_mask amt))
+        (unmasked Xmm (x64_psllw src (mov_rmi_to_xmm masked_amt)))
+        (mask_addr SyntheticAmode (ishl_i8x16_mask masked_amt))
         (mask Reg (x64_load $I8X16 mask_addr (ExtKind.None))))
     (sse_and $I8X16 unmasked (RegMem.Reg mask))))
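The rule now masks `amt` before both the 16x8 shift and the mask-table lookup. A scalar model of the whole 16x8-shift-plus-mask trick (illustrative Rust, not the ISLE code; little-endian byte pairs stand in for `psllw` lanes):

// x64 has no per-byte shift (psllb), so we shift 16-bit lanes and then
// clear the bits that leaked across each byte boundary with a per-shift
// mask of 0xFF << amt.
fn ishl_i8x16_via_16x8(lanes: [u8; 16], amt: u32) -> [u8; 16] {
    let amt = amt & 7; // the masking this hunk adds
    let mut out = [0u8; 16];
    for i in (0..16).step_by(2) {
        // psllw: shift each 16-bit lane (a little-endian byte pair).
        let word = u16::from_le_bytes([lanes[i], lanes[i + 1]]) << amt;
        let [lo, hi] = word.to_le_bytes();
        // The mask load (`ishl_i8x16_mask`) clears the bits shifted in
        // from the neighbouring byte.
        out[i] = lo & (0xFFu8 << amt);
        out[i + 1] = hi & (0xFFu8 << amt);
    }
    out
}

fn main() {
    // Per byte: 0x81 << 1 == 0x02 once the carried-out bit is dropped.
    assert_eq!(ishl_i8x16_via_16x8([0x81; 16], 1), [0x02; 16]);
}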
@@ -571,16 +573,19 @@
 (rule (ishl_i8x16_mask (RegMemImm.Mem amt))
       (ishl_i8x16_mask (RegMemImm.Reg (x64_load $I64 amt (ExtKind.None)))))

-;; 16x8, 32x4, and 64x2 shifts can each use a single instruction.
-(rule (lower (has_type $I16X8 (ishl src amt)))
-      (x64_psllw src (mov_rmi_to_xmm amt)))
+;; 16x8, 32x4, and 64x2 shifts can each use a single instruction, once the
+;; shift amount is masked.
+(rule (lower (has_type ty @ $I16X8 (ishl src amt)))
+      (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty)))))
+        (x64_psllw src (mov_rmi_to_xmm masked_amt))))

-(rule (lower (has_type $I32X4 (ishl src amt)))
-      (x64_pslld src (mov_rmi_to_xmm amt)))
+(rule (lower (has_type ty @ $I32X4 (ishl src amt)))
+      (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty)))))
+        (x64_pslld src (mov_rmi_to_xmm masked_amt))))

-(rule (lower (has_type $I64X2 (ishl src amt)))
-      (x64_psllq src (mov_rmi_to_xmm amt)))
+(rule (lower (has_type ty @ $I64X2 (ishl src amt)))
+      (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty)))))
+        (x64_psllq src (mov_rmi_to_xmm masked_amt))))

 ;;;; Rules for `ushr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
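The extra `x64_and` here is not optional tidiness: x86's vector shifts consume the full 64-bit count and zero every lane once the count reaches the lane width, whereas CLIF now requires the amount to wrap. A behavioural sketch of the difference for one `psllw` lane (illustrative Rust, not actual codegen):

// psllw/pslld/psllq zero the lanes when the count >= lane width.
fn psllw_lane_model(lane: u16, amt: u64) -> u16 {
    if amt >= 16 { 0 } else { lane << amt }
}

fn main() {
    // Without the new x64_and, shifting i16x8 by 17 would zero the lanes;
    // with the mask, 17 & 15 == 1, so each lane shifts by one bit.
    assert_eq!(psllw_lane_model(0x0001, 17), 0);
    assert_eq!(psllw_lane_model(0x0001, 17 & 15), 0x0002);
}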
@@ -630,13 +635,15 @@

 ;; There are no 8x16 shifts in x64. Do the same 16x8-shift-and-mask thing we do
 ;; with 8x16 `ishl`.
-(rule (lower (has_type $I8X16 (ushr src amt)))
+(rule (lower (has_type ty @ $I8X16 (ushr src amt)))
   (let (
+        ;; Mask the amount to ensure wrapping behaviour
+        (masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty))))
         ;; Shift `src` using 16x8. Unfortunately, a 16x8 shift will only be
         ;; correct for half of the lanes; the others must be fixed up with
         ;; the mask below.
-        (unmasked Xmm (x64_psrlw src (mov_rmi_to_xmm amt)))
-        (mask_addr SyntheticAmode (ushr_i8x16_mask amt))
+        (unmasked Xmm (x64_psrlw src (mov_rmi_to_xmm masked_amt)))
+        (mask_addr SyntheticAmode (ushr_i8x16_mask masked_amt))
         (mask Reg (x64_load $I8X16 mask_addr (ExtKind.None))))
     (sse_and $I8X16
              unmasked
@@ -673,16 +680,19 @@
 (rule (ushr_i8x16_mask (RegMemImm.Mem amt))
       (ushr_i8x16_mask (RegMemImm.Reg (x64_load $I64 amt (ExtKind.None)))))

-;; 16x8, 32x4, and 64x2 shifts can each use a single instruction.
-(rule (lower (has_type $I16X8 (ushr src amt)))
-      (x64_psrlw src (mov_rmi_to_xmm amt)))
+;; 16x8, 32x4, and 64x2 shifts can each use a single instruction, once the
+;; shift amount is masked.
+(rule (lower (has_type ty @ $I16X8 (ushr src amt)))
+      (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty)))))
+        (x64_psrlw src (mov_rmi_to_xmm masked_amt))))

-(rule (lower (has_type $I32X4 (ushr src amt)))
-      (x64_psrld src (mov_rmi_to_xmm amt)))
+(rule (lower (has_type ty @ $I32X4 (ushr src amt)))
+      (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty)))))
+        (x64_psrld src (mov_rmi_to_xmm masked_amt))))

-(rule (lower (has_type $I64X2 (ushr src amt)))
-      (x64_psrlq src (mov_rmi_to_xmm amt)))
+(rule (lower (has_type ty @ $I64X2 (ushr src amt)))
+      (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty)))))
+        (x64_psrlq src (mov_rmi_to_xmm masked_amt))))

 ;;;; Rules for `sshr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -746,14 +756,16 @@
 ;; hi.i16x8 = [(s8, s8), (s9, s9), ..., (s15, s15)]
 ;; shifted_hi.i16x8 = shift each lane of `high`
 ;; result = [s0'', s1'', ..., s15'']
-(rule (lower (has_type $I8X16 (sshr src amt @ (value_type amt_ty))))
+(rule (lower (has_type ty @ $I8X16 (sshr src amt @ (value_type amt_ty))))
   (let ((src_ Xmm (put_in_xmm src))
+        ;; Mask the amount to ensure wrapping behaviour
+        (masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty))))
         ;; In order for `packsswb` later to only use the high byte of each
         ;; 16x8 lane, we shift right an extra 8 bits, relying on `psraw` to
         ;; fill in the upper bits appropriately.
         (lo Xmm (x64_punpcklbw src_ src_))
         (hi Xmm (x64_punpckhbw src_ src_))
-        (amt_ XmmMemImm (sshr_i8x16_bigger_shift amt_ty amt))
+        (amt_ XmmMemImm (sshr_i8x16_bigger_shift amt_ty masked_amt))
        (shifted_lo Xmm (x64_psraw lo amt_))
        (shifted_hi Xmm (x64_psraw hi amt_)))
    (x64_packsswb shifted_lo shifted_hi)))
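A scalar model of the interleave-shift-pack strategy described above (illustrative Rust: `punpcklbw`/`punpckhbw` duplicate each byte into a 16-bit lane, `psraw` shifts by the "bigger" amount `amt + 8`, and `packsswb` narrows the high bytes back down):

fn sshr_i8x16_model(lanes: [i8; 16], amt: u32) -> [i8; 16] {
    let amt = amt & 7; // the masking this hunk adds
    lanes.map(|b| {
        // punpck{l,h}bw with the source as both operands pairs each byte
        // with itself, forming one 16-bit lane per byte.
        let widened = i16::from_le_bytes([b as u8, b as u8]);
        // psraw with amt + 8 leaves the result in the high byte and lets
        // the sign bits fill in from above.
        let shifted = widened >> (amt + 8);
        // packsswb narrows with signed saturation; the value is always in
        // i8 range here, so a plain cast models it.
        shifted as i8
    })
}

fn main() {
    // Arithmetic shift preserves the sign: -128 >> 1 == -64.
    assert_eq!(sshr_i8x16_model([-128; 16], 1)[0], -64);
}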
@@ -773,11 +785,13 @@
 ;; `sshr.{i16x8,i32x4}` can be a simple `psra{w,d}`; we just have to make sure
 ;; that if the shift amount is in a register, it is in an XMM register.

-(rule (lower (has_type $I16X8 (sshr src amt)))
-      (x64_psraw src (mov_rmi_to_xmm amt)))
+(rule (lower (has_type ty @ $I16X8 (sshr src amt)))
+      (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty)))))
+        (x64_psraw src (mov_rmi_to_xmm masked_amt))))

-(rule (lower (has_type $I32X4 (sshr src amt)))
-      (x64_psrad src (mov_rmi_to_xmm amt)))
+(rule (lower (has_type ty @ $I32X4 (sshr src amt)))
+      (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty)))))
+        (x64_psrad src (mov_rmi_to_xmm masked_amt))))

 ;; The `sshr.i64x2` CLIF instruction has no single x86 instruction in the older
 ;; feature sets. Newer ones like AVX512VL + AVX512F include `vpsraq`, a 128-bit
@@ -229,6 +229,11 @@ where
             .unwrap()
     }

+    #[inline]
+    fn shift_mask(&mut self, ty: Type) -> u32 {
+        ty.lane_bits() - 1
+    }
+
     #[inline]
     fn simm32_from_value(&mut self, val: Value) -> Option<GprMemImm> {
         let inst = self.lower_ctx.dfg().value_def(val).inst()?;
@@ -415,6 +420,11 @@ where
         Writable::from_reg(Xmm::new(self.temp_writable_reg(I8X16).to_reg()).unwrap())
     }

+    #[inline]
+    fn reg_to_reg_mem_imm(&mut self, reg: Reg) -> RegMemImm {
+        RegMemImm::Reg { reg }
+    }
+
     #[inline]
     fn reg_mem_to_xmm_mem(&mut self, rm: &RegMem) -> XmmMem {
         XmmMem::new(rm.clone()).unwrap()