cranelift: port sshr to ISLE on x64 (#3681)

Nick Fitzgerald
2022-01-12 07:13:58 -08:00
committed by GitHub
parent 1ef0abb12c
commit 7454f1f3af
13 changed files with 1003 additions and 563 deletions

View File

@@ -1,4 +1,4 @@
src/clif.isle f176ef3bba99365
src/prelude.isle babc931e5dc5b4cf
src/prelude.isle d95510fad2e2473c
src/isa/aarch64/inst.isle 5fa80451697b084f
src/isa/aarch64/lower.isle 2d2e1e076a0c8a23

View File

@@ -20,6 +20,8 @@ pub trait Context {
fn pack_value_array_2(&mut self, arg0: Value, arg1: Value) -> ValueArray2;
fn unpack_value_array_3(&mut self, arg0: &ValueArray3) -> (Value, Value, Value);
fn pack_value_array_3(&mut self, arg0: Value, arg1: Value, arg2: Value) -> ValueArray3;
fn u32_add(&mut self, arg0: u32, arg1: u32) -> u32;
fn u8_and(&mut self, arg0: u8, arg1: u8) -> u8;
fn value_reg(&mut self, arg0: Reg) -> ValueRegs;
fn value_regs(&mut self, arg0: Reg, arg1: Reg) -> ValueRegs;
fn temp_writable_reg(&mut self, arg0: Type) -> WritableReg;
@@ -32,6 +34,7 @@ pub trait Context {
fn u32_as_u64(&mut self, arg0: u32) -> u64;
fn ty_bits(&mut self, arg0: Type) -> u8;
fn ty_bits_u16(&mut self, arg0: Type) -> u16;
fn lane_type(&mut self, arg0: Type) -> Type;
fn fits_in_16(&mut self, arg0: Type) -> Option<Type>;
fn fits_in_32(&mut self, arg0: Type) -> Option<Type>;
fn fits_in_64(&mut self, arg0: Type) -> Option<Type>;
@@ -52,7 +55,6 @@ pub trait Context {
fn first_result(&mut self, arg0: Inst) -> Option<Value>;
fn inst_data(&mut self, arg0: Inst) -> InstructionData;
fn value_type(&mut self, arg0: Value) -> Type;
fn ty_bits_mask(&mut self, arg0: Type) -> u64;
fn multi_lane(&mut self, arg0: Type) -> Option<(u8, u16)>;
fn def_inst(&mut self, arg0: Value) -> Option<Inst>;
fn trap_code_division_by_zero(&mut self) -> TrapCode;
@@ -89,13 +91,13 @@ pub trait Context {
fn rotr_opposite_amount(&mut self, arg0: Type, arg1: ImmShift) -> ImmShift;
}
/// Internal type ProducesFlags: defined at src/prelude.isle line 263.
/// Internal type ProducesFlags: defined at src/prelude.isle line 273.
#[derive(Clone, Debug)]
pub enum ProducesFlags {
ProducesFlags { inst: MInst, result: Reg },
}
/// Internal type ConsumesFlags: defined at src/prelude.isle line 266.
/// Internal type ConsumesFlags: defined at src/prelude.isle line 276.
#[derive(Clone, Debug)]
pub enum ConsumesFlags {
ConsumesFlags { inst: MInst, result: Reg },
@@ -975,7 +977,7 @@ pub enum AtomicRMWOp {
// Generated as internal constructor for term temp_reg.
pub fn constructor_temp_reg<C: Context>(ctx: &mut C, arg0: Type) -> Option<Reg> {
let pattern0_0 = arg0;
// Rule at src/prelude.isle line 60.
// Rule at src/prelude.isle line 66.
let expr0_0 = C::temp_writable_reg(ctx, pattern0_0);
let expr1_0 = C::writable_reg_to_reg(ctx, expr0_0);
return Some(expr1_0);
@@ -984,7 +986,7 @@ pub fn constructor_temp_reg<C: Context>(ctx: &mut C, arg0: Type) -> Option<Reg>
// Generated as internal constructor for term lo_reg.
pub fn constructor_lo_reg<C: Context>(ctx: &mut C, arg0: Value) -> Option<Reg> {
let pattern0_0 = arg0;
// Rule at src/prelude.isle line 95.
// Rule at src/prelude.isle line 101.
let expr0_0 = C::put_in_regs(ctx, pattern0_0);
let expr1_0: usize = 0;
let expr2_0 = C::value_regs_get(ctx, expr0_0, expr1_0);
@@ -1009,7 +1011,7 @@ pub fn constructor_with_flags<C: Context>(
result: pattern3_1,
} = pattern2_0
{
// Rule at src/prelude.isle line 276.
// Rule at src/prelude.isle line 286.
let expr0_0 = C::emit(ctx, &pattern1_0);
let expr1_0 = C::emit(ctx, &pattern3_0);
let expr2_0 = C::value_regs(ctx, pattern1_1, pattern3_1);
@@ -1037,7 +1039,7 @@ pub fn constructor_with_flags_1<C: Context>(
result: pattern3_1,
} = pattern2_0
{
// Rule at src/prelude.isle line 284.
// Rule at src/prelude.isle line 294.
let expr0_0 = C::emit(ctx, &pattern1_0);
let expr1_0 = C::emit(ctx, &pattern3_0);
return Some(pattern3_1);
@@ -1071,7 +1073,7 @@ pub fn constructor_with_flags_2<C: Context>(
result: pattern5_1,
} = pattern4_0
{
// Rule at src/prelude.isle line 294.
// Rule at src/prelude.isle line 304.
let expr0_0 = C::emit(ctx, &pattern1_0);
let expr1_0 = C::emit(ctx, &pattern5_0);
let expr2_0 = C::emit(ctx, &pattern3_0);

View File

@@ -35,6 +35,7 @@
(dst WritableReg)
(imm u8)
(size OperandSize))
(XmmUninitializedValue (dst WritableReg))
(CmpRmiR (size OperandSize)
(opcode CmpOpcode)
(src RegMemImm)
@@ -292,6 +293,15 @@
(Mem (addr SyntheticAmode))
(Imm (simm32 u32))))
;; Put the given clif value into a `RegMemImm` operand.
;;
;; Asserts that the value fits into a single register, and doesn't require
;; multiple registers for its representation (like `i128` for example).
;;
;; As a side effect, this marks the value as used.
(decl put_in_reg_mem_imm (Value) RegMemImm)
(extern constructor put_in_reg_mem_imm put_in_reg_mem_imm)
(type RegMem extern
(enum
(Reg (reg Reg))
@@ -319,6 +329,18 @@
(enum (Imm8 (imm u8))
(Reg (reg Reg))))
;; Put the given clif value into an `Imm8Reg` operand, masked to the bit width of
;; the given type.
;;
;; Asserts that the value fits into a single register, and doesn't require
;; multiple registers for its representation (like `i128` for example).
;;
;; As a side effect, this marks the value as used.
;;
;; This is used when lowering various shifts and rotates.
(decl put_masked_in_imm8_reg (Value Type) Imm8Reg)
(extern constructor put_masked_in_imm8_reg put_masked_in_imm8_reg)
(type CC extern
(enum O
NO
@@ -383,9 +405,12 @@
(decl imm8_from_value (Imm8Reg) Value)
(extern extractor imm8_from_value imm8_from_value)
;; Mask an `Imm8Reg.Imm8`.
(decl mask_imm8_const (Imm8Reg u64) Imm8Reg)
(extern constructor mask_imm8_const mask_imm8_const)
;; Mask a constant to the bit-width of the given type and package it into an
;; `Imm8Reg.Imm8`. This is used for shifts and rotates, so that we don't try to
;; shift/rotate more bits than the type has available, per Cranelift's
;; semantics.
(decl const_to_type_masked_imm8 (u64 Type) Imm8Reg)
(extern constructor const_to_type_masked_imm8 const_to_type_masked_imm8)
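Both `put_masked_in_imm8_reg` and `const_to_type_masked_imm8` rely on the same masking computation, shown in the Rust implementations later in this diff. A minimal standalone sketch (the free-function form and the name `masked_imm8` are illustrative, not part of the backend):

```rust
/// Mirrors the masking done by `const_to_type_masked_imm8`: build an
/// all-ones mask as wide as the type, apply it to the constant, then
/// truncate to the 8-bit immediate field of `Imm8Reg.Imm8`.
fn masked_imm8(c: u64, ty_bits: u32) -> u8 {
    // `checked_shl` returns `None` when `ty_bits` is 64, in which case the
    // mask is all ones.
    let mask = 1_u64.checked_shl(ty_bits).map_or(u64::MAX, |x| x - 1);
    (c & mask) as u8
}

// For example, masked_imm8(0x1_23, 8) == 0x23.
```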
;; Extract a constant `RegMemImm.Imm` from a value operand.
(decl simm32_from_value (RegMemImm) Value)
@@ -494,6 +519,37 @@
wr))))
r))
;; Helper for creating an SSE register holding an `i64x2` from two `i64` values.
(decl make_i64x2_from_lanes (RegMem RegMem) Reg)
(rule (make_i64x2_from_lanes lo hi)
(let ((dst_w WritableReg (temp_writable_reg $I64X2))
(dst_r Reg (writable_reg_to_reg dst_w))
(_0 Unit (emit (MInst.XmmUninitializedValue dst_w)))
(_1 Unit (emit (MInst.XmmRmRImm (SseOpcode.Pinsrd)
dst_r
lo
dst_w
0
(OperandSize.Size64))))
(_2 Unit (emit (MInst.XmmRmRImm (SseOpcode.Pinsrd)
dst_r
hi
dst_w
1
(OperandSize.Size64)))))
dst_r))
;; Move a `RegMemImm.Reg` operand to an XMM register, if necessary.
(decl reg_mem_imm_to_xmm (RegMemImm) RegMemImm)
(rule (reg_mem_imm_to_xmm rmi @ (RegMemImm.Mem _)) rmi)
(rule (reg_mem_imm_to_xmm rmi @ (RegMemImm.Imm _)) rmi)
(rule (reg_mem_imm_to_xmm (RegMemImm.Reg r))
(RegMemImm.Reg (gpr_to_xmm $I8X16
(SseOpcode.Movd)
(RegMem.Reg r)
(OperandSize.Size32))))
;;;; Instruction Constructors ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;
;; These constructors create SSA-style `MInst`s. It is their responsibility to
@@ -1058,6 +1114,21 @@
(rule (pminud src1 src2)
(xmm_rm_r $I8X16 (SseOpcode.Pminud) src1 src2))
;; Helper for creating `punpcklbw` instructions.
(decl punpcklbw (Reg RegMem) Reg)
(rule (punpcklbw src1 src2)
(xmm_rm_r $I8X16 (SseOpcode.Punpcklbw) src1 src2))
;; Helper for creating `punpckhbw` instructions.
(decl punpckhbw (Reg RegMem) Reg)
(rule (punpckhbw src1 src2)
(xmm_rm_r $I8X16 (SseOpcode.Punpckhbw) src1 src2))
;; Helper for creating `packsswb` instructions.
(decl packsswb (Reg RegMem) Reg)
(rule (packsswb src1 src2)
(xmm_rm_r $I8X16 (SseOpcode.Packsswb) src1 src2))
;; Helper for creating `MInst.XmmRmRImm` instructions.
(decl xmm_rm_r_imm (SseOpcode Reg RegMem u8 OperandSize) Reg)
(rule (xmm_rm_r_imm op src1 src2 imm size)
@@ -1180,6 +1251,16 @@
(rule (psrlq src1 src2)
(xmm_rmi_reg (SseOpcode.Psrlq) src1 src2))
;; Helper for creating `psraw` instructions.
(decl psraw (Reg RegMemImm) Reg)
(rule (psraw src1 src2)
(xmm_rmi_reg (SseOpcode.Psraw) src1 src2))
;; Helper for creating `psrad` instructions.
(decl psrad (Reg RegMemImm) Reg)
(rule (psrad src1 src2)
(xmm_rmi_reg (SseOpcode.Psrad) src1 src2))
;; Helper for creating `MInst.MulHi` instructions.
;;
;; Returns the (lo, hi) register halves of the multiplication.
@@ -1252,6 +1333,19 @@
(rule (insertps src1 src2 lane)
(xmm_rm_r_imm (SseOpcode.Insertps) src1 src2 lane (OperandSize.Size32)))
;; Helper for creating `pextrd` instructions.
(decl pextrd (Type Reg u8) Reg)
(rule (pextrd ty src lane)
(let ((w_dst WritableReg (temp_writable_reg ty))
(r_dst Reg (writable_reg_to_reg w_dst))
(_ Unit (emit (MInst.XmmRmRImm (SseOpcode.Pextrd)
r_dst
(RegMem.Reg src)
w_dst
lane
(operand_size_of_type_32_64 (lane_type ty))))))
r_dst))
;; Helper for creating `not` instructions.
(decl not (Type Reg) Reg)
(rule (not ty src)

View File

@@ -537,13 +537,7 @@
;; `i64` and smaller.
(rule (lower (has_type (fits_in_64 ty) (ishl src amt)))
;; NB: Only the low bits of `amt` matter since we logically mask the shift
;; amount to the value's bit width.
(let ((amt_ Reg (lo_reg amt)))
(value_reg (shl ty (put_in_reg src) (Imm8Reg.Reg amt_)))))
(rule (lower (has_type (fits_in_64 ty) (ishl src (imm8_from_value amt))))
(value_reg (shl ty (put_in_reg src) amt)))
(value_reg (shl ty (put_in_reg src) (put_masked_in_imm8_reg amt ty))))
;; `i128`.
@@ -582,15 +576,8 @@
;; `i64` and smaller.
(rule (lower (has_type (fits_in_64 ty) (ushr src amt)))
(let ((src_ Reg (extend_to_reg src ty (ExtendKind.Zero)))
;; NB: Only the low bits of `amt` matter since we logically mask the
;; shift amount to the value's bit width.
(amt_ Reg (lo_reg amt)))
(value_reg (shr ty src_ (Imm8Reg.Reg amt_)))))
(rule (lower (has_type (fits_in_64 ty) (ushr src (imm8_from_value amt))))
(let ((src_ Reg (extend_to_reg src ty (ExtendKind.Zero))))
(value_reg (shr ty src_ amt))))
(value_reg (shr ty src_ (put_masked_in_imm8_reg amt ty)))))
;; `i128`.
@@ -623,6 +610,109 @@
(let ((amt_ Reg (lo_reg amt)))
(shr_i128 (put_in_regs src) amt_)))
;;;; Rules for `sshr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; `i64` and smaller.
(rule (lower (has_type (fits_in_64 ty) (sshr src amt)))
(let ((src_ Reg (extend_to_reg src ty (ExtendKind.Sign))))
(value_reg (sar ty src_ (put_masked_in_imm8_reg amt ty)))))
;; `i128`.
(decl sar_i128 (ValueRegs Reg) ValueRegs)
(rule (sar_i128 src amt)
;; Unpack the low/high halves of `src`.
(let ((src_lo Reg (value_regs_get src 0))
(src_hi Reg (value_regs_get src 1))
;; Do a shift of each half. NB: the low half uses an unsigned shift
;; because its MSB is not a sign bit.
(lo_shifted Reg (shr $I64 src_lo (Imm8Reg.Reg amt)))
(hi_shifted Reg (sar $I64 src_hi (Imm8Reg.Reg amt)))
;; `src_hi << (64 - amt)` are the bits to carry over from the low
;; half to the high half.
(carry Reg (shl $I64 src_hi (Imm8Reg.Reg (sub $I64 (imm $I64 64) (RegMemImm.Reg amt)))))
;; Nullify the carry if we are shifting by a multiple of 128.
(carry_ Reg (with_flags_1 (test (OperandSize.Size64) (RegMemImm.Imm 127) amt)
(cmove $I64 (CC.Z) (RegMem.Reg (imm $I64 0)) carry)))
;; Add the carry into the low half.
(lo_shifted_ Reg (or $I64 lo_shifted (RegMemImm.Reg carry_)))
;; Get all sign bits.
(sign_bits Reg (sar $I64 src_hi (Imm8Reg.Imm8 63))))
;; Combine the two shifted halves. However, if we are shifting by >= 64
;; (modulo 128), then the hi bits are all sign bits and the lo bits are
;; what would otherwise be our hi bits.
(with_flags_2 (test (OperandSize.Size64) (RegMemImm.Imm 64) amt)
(cmove $I64 (CC.Z) (RegMem.Reg lo_shifted_) hi_shifted)
(cmove $I64 (CC.Z) (RegMem.Reg hi_shifted) sign_bits))))
(rule (lower (has_type $I128 (sshr src amt)))
;; NB: Only the low bits of `amt` matter since we logically mask the shift
;; amount to the value's bit width.
(let ((amt_ Reg (lo_reg amt)))
(sar_i128 (put_in_regs src) amt_)))
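To make the carry and select logic above easier to follow, here is a scalar Rust model of the `sar_i128` helper (the name `sar_i128_model` is illustrative only; `& 63` reproduces x86's masking of 64-bit shift counts, and the final branch plays the role of the `test`/`cmove` pair):

```rust
fn sar_i128_model(src: i128, amt: u64) -> i128 {
    let lo = src as u64;
    let hi = (src >> 64) as u64;
    // shrq: logical shift of the low half, whose MSB is not a sign bit.
    let lo_shifted = lo >> (amt & 63);
    // sarq: arithmetic shift of the high half.
    let hi_shifted = ((hi as i64) >> (amt & 63)) as u64;
    // Bits carried from the high half into the low half: `hi << (64 - amt)`.
    let mut carry = hi << (64u64.wrapping_sub(amt) & 63);
    if (amt & 127) == 0 {
        // Shifting by a multiple of 128 carries nothing (test $127 / cmovz).
        carry = 0;
    }
    let lo_shifted = lo_shifted | carry;
    // All sign bits of the original high half.
    let sign_bits = ((hi as i64) >> 63) as u64;
    // Shifts of 64..=127 move the shifted high half into the low half and
    // fill the high half with sign bits (test $64 / cmovz pair).
    let (res_lo, res_hi) = if (amt & 64) == 0 {
        (lo_shifted, hi_shifted)
    } else {
        (hi_shifted, sign_bits)
    };
    (((res_hi as u128) << 64) | res_lo as u128) as i128
}
```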
;; SSE.
;; Since the x86 instruction set does not have an 8x16 shift instruction and the
;; approach used for `ishl` and `ushr` cannot be easily used (the masks do not
;; preserve the sign), we use a different approach here: separate the low and
;; high lanes, shift them separately, and merge them into the final result.
;;
;; Visually, this looks like the following, where `src.i8x16 = [s0, s1, ...,
;; s15]`:
;;
;; lo.i16x8 = [(s0, s0), (s1, s1), ..., (s7, s7)]
;; shifted_lo.i16x8 = shift each lane of `lo`
;; hi.i16x8 = [(s8, s8), (s9, s9), ..., (s15, s15)]
;; shifted_hi.i16x8 = shift each lane of `hi`
;; result = [s0'', s1'', ..., s15'']
(rule (lower (has_type $I8X16 (sshr src amt @ (value_type amt_ty))))
(let ((src_ Reg (put_in_reg src))
;; In order for `packsswb` later to only use the high byte of each
;; 16x8 lane, we shift right an extra 8 bits, relying on `psraw` to
;; fill in the upper bits appropriately.
(lo Reg (punpcklbw src_ (RegMem.Reg src_)))
(hi Reg (punpckhbw src_ (RegMem.Reg src_)))
(amt_ RegMemImm (sshr_i8x16_bigger_shift amt_ty (put_in_reg_mem_imm amt)))
(shifted_lo Reg (psraw lo amt_))
(shifted_hi Reg (psraw hi amt_)))
(value_reg (packsswb shifted_lo (RegMem.Reg shifted_hi)))))
(decl sshr_i8x16_bigger_shift (Type RegMemImm) RegMemImm)
(rule (sshr_i8x16_bigger_shift _ty (RegMemImm.Imm i))
(RegMemImm.Imm (u32_add i 8)))
(rule (sshr_i8x16_bigger_shift ty (RegMemImm.Reg r))
(reg_mem_imm_to_xmm (RegMemImm.Reg (add ty r (RegMemImm.Imm 8)))))
(rule (sshr_i8x16_bigger_shift ty rmi @ (RegMemImm.Mem _m))
(reg_mem_imm_to_xmm (RegMemImm.Reg (add ty (imm ty 8) rmi))))
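A scalar Rust model of this lane-splitting scheme (illustrative only; the name `sshr_i8x16_model` is hypothetical, the shift amount is assumed to already be reduced to 0..=7, and `punpck{l,h}bw`/`psraw`/`packsswb` are modeled one lane at a time):

```rust
fn sshr_i8x16_model(src: [i8; 16], amt: u32) -> [i8; 16] {
    let mut result = [0i8; 16];
    for (i, &lane) in src.iter().enumerate() {
        // punpck{l,h}bw duplicates each byte into both halves of a 16-bit
        // lane, so the high byte already carries the sign information.
        let widened = ((lane as u8 as u16) << 8) | (lane as u8 as u16);
        // psraw by `amt + 8` leaves the arithmetically shifted byte in the
        // low 8 bits of the lane.
        let shifted = (widened as i16) >> (amt + 8);
        // packsswb saturates to i8, but the value already fits, so this is
        // exactly the per-lane arithmetic shift.
        result[i] = shifted as i8;
    }
    result
}
```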
;; `sshr.{i16x8,i32x4}` can be a simple `psra{w,d}`; we just have to make sure
;; that if the shift amount is in a register, it is in an XMM register.
(rule (lower (has_type $I16X8 (sshr src amt)))
(value_reg (psraw (put_in_reg src)
(reg_mem_imm_to_xmm (put_in_reg_mem_imm amt)))))
(rule (lower (has_type $I32X4 (sshr src amt)))
(value_reg (psrad (put_in_reg src)
(reg_mem_imm_to_xmm (put_in_reg_mem_imm amt)))))
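For comparison, a per-lane model of the `i16x8` case (illustrative; `psraw` applies one count to every lane, and counts of 16 or more fill each lane with its sign bit, which `min(15)` reproduces):

```rust
fn sshr_i16x8_model(src: [i16; 8], amt: u32) -> [i16; 8] {
    let amt = amt.min(15);
    src.map(|lane| lane >> amt)
}
```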
;; The `sshr.i64x2` CLIF instruction has no single-instruction x86 lowering in older
;; feature sets. Newer ones like AVX512VL + AVX512F include `vpsraq`, a 128-bit
;; instruction that would fit here, but this backend does not currently have
;; support for EVEX encodings. To remedy this, we extract each 64-bit lane to a
;; GPR, shift each using a scalar instruction, and insert the shifted values
;; back in the `dst` XMM register.
;;
;; (TODO: when EVEX support is available, add an alternate lowering here).
(rule (lower (has_type $I64X2 (sshr src amt)))
(let ((src_ Reg (put_in_reg src))
(lo Reg (pextrd $I64 src_ 0))
(hi Reg (pextrd $I64 src_ 1))
(amt_ Imm8Reg (put_masked_in_imm8_reg amt $I64))
(shifted_lo Reg (sar $I64 lo amt_))
(shifted_hi Reg (sar $I64 hi amt_)))
(value_reg (make_i64x2_from_lanes (RegMem.Reg shifted_lo)
(RegMem.Reg shifted_hi)))))
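A scalar model of the `i64x2` lowering above (illustrative; each lane becomes an ordinary 64-bit arithmetic shift, and the scalar `sar` masks the count to six bits, so the shift is effectively modulo 64):

```rust
fn sshr_i64x2_model(src: [i64; 2], amt: u32) -> [i64; 2] {
    let amt = amt & 63;
    src.map(|lane| lane >> amt)
}
```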
;;;; Rules for `rotl` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; `i16` and `i8`: we need to extend the shift amount, or mask the
@@ -632,8 +722,11 @@
(let ((amt_ Reg (extend_to_reg amt $I32 (ExtendKind.Zero))))
(value_reg (m_rotl ty (put_in_reg src) (Imm8Reg.Reg amt_)))))
(rule (lower (has_type (ty_8_or_16 ty) (rotl src (imm8_from_value amt))))
(value_reg (m_rotl ty (put_in_reg src) (mask_imm8_const amt (ty_bits_mask ty)))))
(rule (lower (has_type (ty_8_or_16 ty)
(rotl src (u64_from_iconst amt))))
(value_reg (m_rotl ty
(put_in_reg src)
(const_to_type_masked_imm8 amt ty))))
;; `i64` and `i32`: we can rely on x86's rotate-amount masking since
;; we operate on the whole register.
@@ -644,8 +737,11 @@
(let ((amt_ Reg (lo_reg amt)))
(value_reg (m_rotl ty (put_in_reg src) (Imm8Reg.Reg amt_)))))
(rule (lower (has_type (ty_32_or_64 ty) (rotl src (imm8_from_value amt))))
(value_reg (m_rotl ty (put_in_reg src) amt)))
(rule (lower (has_type (ty_32_or_64 ty)
(rotl src (u64_from_iconst amt))))
(value_reg (m_rotl ty
(put_in_reg src)
(const_to_type_masked_imm8 amt ty))))
;; `i128`.

View File

@@ -1538,13 +1538,18 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
| Opcode::Umin
| Opcode::Bnot
| Opcode::Bitselect
| Opcode::Vselect => implemented_in_isle(ctx),
| Opcode::Vselect
| Opcode::Sshr => implemented_in_isle(ctx),
Opcode::Ishl | Opcode::Ushr | Opcode::Sshr | Opcode::Rotl | Opcode::Rotr => {
Opcode::Ishl | Opcode::Ushr | Opcode::Rotl | Opcode::Rotr => {
let dst_ty = ctx.output_ty(insn, 0);
debug_assert_eq!(ctx.input_ty(insn, 0), dst_ty);
if !dst_ty.is_vector() && dst_ty.bits() <= 64 {
if op != Opcode::Rotr {
implemented_in_isle(ctx);
}
// Scalar shifts on x86 have various encodings:
// - shift by one bit, e.g. `SAL r/m8, 1` (not used here)
// - shift by an immediate amount, e.g. `SAL r/m8, imm8`
@@ -1557,10 +1562,6 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
OperandSize::Size32,
extend_input_to_reg(ctx, inputs[0], ExtSpec::ZeroExtendTo32),
),
Opcode::Sshr => (
OperandSize::Size32,
extend_input_to_reg(ctx, inputs[0], ExtSpec::SignExtendTo32),
),
Opcode::Rotl | Opcode::Rotr => (
OperandSize::from_ty(dst_ty),
put_input_in_reg(ctx, inputs[0]),
@@ -1590,7 +1591,6 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
let shift_kind = match op {
Opcode::Ishl => ShiftKind::ShiftLeft,
Opcode::Ushr => ShiftKind::ShiftRightLogical,
Opcode::Sshr => ShiftKind::ShiftRightArithmetic,
Opcode::Rotl => ShiftKind::RotateLeft,
Opcode::Rotr => ShiftKind::RotateRight,
_ => unreachable!(),
@@ -1608,50 +1608,8 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
let dst = get_output_reg(ctx, outputs[0]);
match op {
Opcode::Ishl => {
emit_shl_i128(ctx, src, dst, amt_src);
}
Opcode::Ushr => {
emit_shr_i128(ctx, src, dst, amt_src, /* is_signed = */ false);
}
Opcode::Sshr => {
emit_shr_i128(ctx, src, dst, amt_src, /* is_signed = */ true);
}
Opcode::Rotl => {
// (mov tmp, src)
// (shl.i128 tmp, amt)
// (mov dst, src)
// (ushr.i128 dst, 128-amt)
// (or dst, tmp)
let tmp = ctx.alloc_tmp(types::I128);
emit_shl_i128(ctx, src, tmp, amt_src);
let inv_amt = ctx.alloc_tmp(types::I64).only_reg().unwrap();
ctx.emit(Inst::imm(OperandSize::Size64, 128, inv_amt));
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::Sub,
RegMemImm::reg(amt_src),
inv_amt,
));
emit_shr_i128(
ctx,
src,
dst,
inv_amt.to_reg(),
/* is_signed = */ false,
);
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::Or,
RegMemImm::reg(tmp.regs()[0].to_reg()),
dst.regs()[0],
));
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::Or,
RegMemImm::reg(tmp.regs()[1].to_reg()),
dst.regs()[1],
));
Opcode::Ishl | Opcode::Ushr | Opcode::Rotl => {
implemented_in_isle(ctx);
}
Opcode::Rotr => {
// (mov tmp, src)
@@ -1808,127 +1766,6 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
_ => SseOpcode::Pand,
};
ctx.emit(Inst::xmm_rm_r(sse_op, RegMem::from(mask_value), dst));
} else if dst_ty == types::I8X16 && op == Opcode::Sshr {
// Since the x86 instruction set does not have an 8x16 shift instruction and the approach used for
// `ishl` and `ushr` cannot be easily used (the masks do not preserve the sign), we use a different
// approach here: separate the low and high lanes, shift them separately, and merge them into the final
// result. Visually, this looks like the following, where `src.i8x16 = [s0, s1, ..., s15]:
// low.i16x8 = [(s0, s0), (s1, s1), ..., (s7, s7)]
// shifted_low.i16x8 = shift each lane of `low`
// high.i16x8 = [(s8, s8), (s9, s9), ..., (s15, s15)]
// shifted_high.i16x8 = shift each lane of `high`
// dst.i8x16 = [s0'', s1'', ..., s15'']
let src = put_input_in_reg(ctx, inputs[0]);
let shift_by = input_to_reg_mem_imm(ctx, inputs[1]);
let shift_by_ty = ctx.input_ty(insn, 1);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
// In order for PACKSSWB later to only use the high byte of each 16x8 lane, we shift right an extra 8
// bits, relying on PSRAW to fill in the upper bits appropriately.
let bigger_shift_by = match shift_by {
// When we know the shift amount at compile time, we add the extra shift amount statically.
RegMemImm::Imm { simm32 } => RegMemImm::imm(simm32 + 8),
// Otherwise we add instructions to add the extra shift amount and move the value into an XMM
// register.
RegMemImm::Reg { reg } => {
let bigger_shift_by_gpr = ctx.alloc_tmp(shift_by_ty).only_reg().unwrap();
ctx.emit(Inst::mov_r_r(OperandSize::Size64, reg, bigger_shift_by_gpr));
let size = if shift_by_ty == types::I64 {
OperandSize::Size64
} else {
OperandSize::Size32
};
let imm = RegMemImm::imm(8);
ctx.emit(Inst::alu_rmi_r(
size,
AluRmiROpcode::Add,
imm,
bigger_shift_by_gpr,
));
let bigger_shift_by_xmm = ctx.alloc_tmp(dst_ty).only_reg().unwrap();
ctx.emit(Inst::gpr_to_xmm(
SseOpcode::Movd,
RegMem::from(bigger_shift_by_gpr),
OperandSize::Size32,
bigger_shift_by_xmm,
));
RegMemImm::reg(bigger_shift_by_xmm.to_reg())
}
RegMemImm::Mem { .. } => unimplemented!("load shift amount to XMM register"),
};
// Unpack and shift the lower lanes of `src` into the `dst` register.
ctx.emit(Inst::gen_move(dst, src, dst_ty));
ctx.emit(Inst::xmm_rm_r(SseOpcode::Punpcklbw, RegMem::from(dst), dst));
ctx.emit(Inst::xmm_rmi_reg(
SseOpcode::Psraw,
bigger_shift_by.clone(),
dst,
));
// Unpack and shift the upper lanes of `src` into a temporary register, `upper_lanes`.
let upper_lanes = ctx.alloc_tmp(dst_ty).only_reg().unwrap();
ctx.emit(Inst::gen_move(upper_lanes, src, dst_ty));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Punpckhbw,
RegMem::from(upper_lanes),
upper_lanes,
));
ctx.emit(Inst::xmm_rmi_reg(
SseOpcode::Psraw,
bigger_shift_by,
upper_lanes,
));
// Merge the upper and lower shifted lanes into `dst`.
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Packsswb,
RegMem::from(upper_lanes),
dst,
));
} else if dst_ty == types::I64X2 && op == Opcode::Sshr {
// The `sshr.i8x16` CLIF instruction has no single x86 instruction in the older feature sets; newer ones
// like AVX512VL + AVX512F include VPSRAQ, a 128-bit instruction that would fit here, but this backend
// does not currently have support for EVEX encodings (TODO when EVEX support is available, add an
// alternate lowering here). To remedy this, we extract each 64-bit lane to a GPR, shift each using a
// scalar instruction, and insert the shifted values back in the `dst` XMM register.
let src = put_input_in_reg(ctx, inputs[0]);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
ctx.emit(Inst::gen_move(dst, src, dst_ty));
// Extract the upper and lower lanes into temporary GPRs.
let lower_lane = ctx.alloc_tmp(types::I64).only_reg().unwrap();
emit_extract_lane(ctx, src, lower_lane, 0, types::I64);
let upper_lane = ctx.alloc_tmp(types::I64).only_reg().unwrap();
emit_extract_lane(ctx, src, upper_lane, 1, types::I64);
// Shift each value.
let mut shift = |reg: Writable<Reg>| {
let kind = ShiftKind::ShiftRightArithmetic;
if let Some(shift_by) = ctx.get_input_as_source_or_const(insn, 1).constant {
// Mask the shift amount according to Cranelift's semantics.
let shift_by = (shift_by as u8) & (types::I64.bits() as u8 - 1);
ctx.emit(Inst::shift_r(
OperandSize::Size64,
kind,
Some(shift_by),
reg,
));
} else {
let dynamic_shift_by = put_input_in_reg(ctx, inputs[1]);
let w_rcx = Writable::from_reg(regs::rcx());
ctx.emit(Inst::mov_r_r(OperandSize::Size64, dynamic_shift_by, w_rcx));
ctx.emit(Inst::shift_r(OperandSize::Size64, kind, None, reg));
};
};
shift(lower_lane);
shift(upper_lane);
// Insert the scalar values back into the `dst` vector.
emit_insert_lane(ctx, RegMem::from(lower_lane), dst, 0, types::I64);
emit_insert_lane(ctx, RegMem::from(upper_lane), dst, 1, types::I64);
} else {
// For the remaining packed shifts not covered above, x86 has implementations that can either:
// - shift using an immediate
@@ -1940,13 +1777,11 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
types::I16X8 => match op {
Opcode::Ishl => SseOpcode::Psllw,
Opcode::Ushr => SseOpcode::Psrlw,
Opcode::Sshr => SseOpcode::Psraw,
_ => unimplemented!("{} is not implemented for type {}", op, dst_ty),
},
types::I32X4 => match op {
Opcode::Ishl => SseOpcode::Pslld,
Opcode::Ushr => SseOpcode::Psrld,
Opcode::Sshr => SseOpcode::Psrad,
_ => unimplemented!("{} is not implemented for type {}", op, dst_ty),
},
types::I64X2 => match op {

View File

@@ -69,6 +69,31 @@ where
OperandSize::from_ty(ty)
}
fn put_in_reg_mem_imm(&mut self, val: Value) -> RegMemImm {
let inputs = self.lower_ctx.get_value_as_source_or_const(val);
if let Some(c) = inputs.constant {
if let Some(imm) = to_simm32(c as i64) {
return imm;
}
// Generate constants fresh at each use to minimize long-range
// register pressure.
let ty = self.value_type(val);
return RegMemImm::reg(generated_code::constructor_imm(self, ty, c).unwrap());
}
if let Some((src_insn, 0)) = inputs.inst {
if let Some((addr_input, offset)) = is_mergeable_load(self.lower_ctx, src_insn) {
self.lower_ctx.sink_inst(src_insn);
let amode = lower_to_amode(self.lower_ctx, addr_input, offset);
return RegMemImm::mem(amode);
}
}
RegMemImm::reg(self.put_in_reg(val))
}
fn put_in_reg_mem(&mut self, val: Value) -> RegMem {
let inputs = self.lower_ctx.get_value_as_source_or_const(val);
@@ -90,6 +115,23 @@ where
RegMem::reg(self.put_in_reg(val))
}
fn put_masked_in_imm8_reg(&mut self, val: Value, ty: Type) -> Imm8Reg {
let inputs = self.lower_ctx.get_value_as_source_or_const(val);
if let Some(c) = inputs.constant {
let mask = 1_u64
.checked_shl(ty.bits() as u32)
.map_or(u64::MAX, |x| x - 1);
return Imm8Reg::Imm8 {
imm: (c & mask) as u8,
};
}
Imm8Reg::Reg {
reg: self.put_in_regs(val).regs()[0],
}
}
#[inline]
fn encode_fcmp_imm(&mut self, imm: &FcmpImm) -> u8 {
imm.encode()
@@ -131,12 +173,12 @@ where
}
#[inline]
fn mask_imm8_const(&mut self, imm8: &Imm8Reg, mask: u64) -> Imm8Reg {
match imm8 {
&Imm8Reg::Reg { reg } => Imm8Reg::Reg { reg },
&Imm8Reg::Imm8 { imm } => Imm8Reg::Imm8 {
imm: imm & (mask as u8),
},
fn const_to_type_masked_imm8(&mut self, c: u64, ty: Type) -> Imm8Reg {
let mask = 1_u64
.checked_shl(ty.bits() as u32)
.map_or(u64::MAX, |x| x - 1);
Imm8Reg::Imm8 {
imm: (c & mask) as u8,
}
}

View File

@@ -1,4 +1,4 @@
src/clif.isle f176ef3bba99365
src/prelude.isle babc931e5dc5b4cf
src/isa/x64/inst.isle bc5fc626492752c8
src/isa/x64/lower.isle 33e94300f4c08455
src/prelude.isle d95510fad2e2473c
src/isa/x64/inst.isle c16462cc359dd466
src/isa/x64/lower.isle 9f761598e3949e8e

File diff suppressed because it is too large

View File

@@ -95,11 +95,6 @@ macro_rules! isle_prelude_methods {
ty.bits().try_into().unwrap()
}
#[inline]
fn ty_bits_mask(&mut self, ty: Type) -> u64 {
(1 << (self.ty_bits(ty) as u64)) - 1
}
#[inline]
fn ty_bits_u16(&mut self, ty: Type) -> u16 {
ty.bits()
@@ -260,6 +255,21 @@ macro_rules! isle_prelude_methods {
n => Some(n as u64),
}
}
#[inline]
fn u32_add(&mut self, a: u32, b: u32) -> u32 {
a.wrapping_add(b)
}
#[inline]
fn u8_and(&mut self, a: u8, b: u8) -> u8 {
a & b
}
#[inline]
fn lane_type(&mut self, ty: Type) -> Type {
ty.lane_type()
}
};
}

View File

@@ -38,6 +38,12 @@
(type ValueList (primitive ValueList))
(type ValueRegs (primitive ValueRegs))
(decl u32_add (u32 u32) u32)
(extern constructor u32_add u32_add)
(decl u8_and (u8 u8) u8)
(extern constructor u8_and u8_and)
;;;; Registers ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(type Reg (primitive Reg))
@@ -146,6 +152,10 @@
(decl ty_bits_u16 (Type) u16)
(extern constructor ty_bits_u16 ty_bits_u16)
;; Get the type of each lane in the given type.
(decl lane_type (Type) Type)
(extern constructor lane_type lane_type)
;;;; Helper Clif Extractors ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; An extractor that only matches types that can fit in 16 bits.
@@ -242,11 +252,6 @@
(and (result_type ty)
inst))
;; Return a bitmask that will mask off a count to be within `ty`'s
;; bit-width. Used for shifts/rotates.
(decl ty_bits_mask (Type) u64)
(extern constructor ty_bits_mask ty_bits_mask)
;; Match a multi-lane type, extracting (# bits per lane, # lanes) from the given
;; type. Will only match when there is more than one lane.
(decl multi_lane (u8 u16) Type)
@@ -256,6 +261,11 @@
(decl def_inst (Inst) Value)
(extern extractor def_inst def_inst)
;; Extract a constant `u64` from a value defined by an `iconst`.
(decl u64_from_iconst (u64) Value)
(extractor (u64_from_iconst x)
(def_inst (iconst (u64_from_imm64 x))))
;;;; Helpers for Working with Flags ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Newtype wrapper around `MInst` for instructions that are used for their

View File

@@ -1173,35 +1173,32 @@ block0(v0: i128, v1: i128):
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 28)
; (instruction range: 0 .. 25)
; Inst 0: pushq %rbp
; Inst 1: movq %rsp, %rbp
; Inst 2: movq %rdi, %r8
; Inst 3: movq %rsi, %rdi
; Inst 4: movq %rdi, %rsi
; Inst 5: movq %rdx, %rcx
; Inst 6: sarq %cl, %rsi
; Inst 7: movq %rdx, %rcx
; Inst 8: shrq %cl, %r8
; Inst 9: movl $64, %ecx
; Inst 10: subq %rdx, %rcx
; Inst 11: movq %rdi, %rax
; Inst 12: shlq %cl, %rax
; Inst 13: xorq %rcx, %rcx
; Inst 14: testq $127, %rdx
; Inst 15: cmovzq %rcx, %rax
; Inst 16: orq %r8, %rax
; Inst 17: sarq $63, %rdi
; Inst 18: xorq %rcx, %rcx
; Inst 19: andq $64, %rdx
; Inst 20: cmovzq %rsi, %rdi
; Inst 21: cmovzq %rax, %rcx
; Inst 22: cmovnzq %rsi, %rcx
; Inst 23: movq %rcx, %rax
; Inst 24: movq %rdi, %rdx
; Inst 25: movq %rbp, %rsp
; Inst 26: popq %rbp
; Inst 27: ret
; Inst 2: movq %rdi, %rax
; Inst 3: movq %rdx, %rcx
; Inst 4: shrq %cl, %rax
; Inst 5: movq %rsi, %rdi
; Inst 6: movq %rdx, %rcx
; Inst 7: sarq %cl, %rdi
; Inst 8: movl $64, %ecx
; Inst 9: subq %rdx, %rcx
; Inst 10: movq %rsi, %r8
; Inst 11: shlq %cl, %r8
; Inst 12: xorq %rcx, %rcx
; Inst 13: testq $127, %rdx
; Inst 14: cmovzq %rcx, %r8
; Inst 15: orq %r8, %rax
; Inst 16: sarq $63, %rsi
; Inst 17: testq $64, %rdx
; Inst 18: cmovzq %rdi, %rsi
; Inst 19: cmovzq %rax, %rdi
; Inst 20: movq %rdi, %rax
; Inst 21: movq %rsi, %rdx
; Inst 22: movq %rbp, %rsp
; Inst 23: popq %rbp
; Inst 24: ret
; }}
function %f33(i128, i128) -> i128 {

View File

@@ -324,16 +324,16 @@ block0(v0: i32):
; (instruction range: 0 .. 15)
; Inst 0: pushq %rbp
; Inst 1: movq %rsp, %rbp
; Inst 2: load_const VCodeConstant(0), %xmm0
; Inst 3: addl $8, %edi
; Inst 4: movd %edi, %xmm2
; Inst 5: movdqa %xmm0, %xmm1
; Inst 6: punpcklbw %xmm1, %xmm1
; Inst 7: psraw %xmm2, %xmm1
; Inst 8: punpckhbw %xmm0, %xmm0
; Inst 2: load_const VCodeConstant(0), %xmm2
; Inst 3: movdqa %xmm2, %xmm0
; Inst 4: punpcklbw %xmm2, %xmm0
; Inst 5: movdqa %xmm2, %xmm1
; Inst 6: punpckhbw %xmm2, %xmm1
; Inst 7: addl $8, %edi
; Inst 8: movd %edi, %xmm2
; Inst 9: psraw %xmm2, %xmm0
; Inst 10: packsswb %xmm0, %xmm1
; Inst 11: movdqa %xmm1, %xmm0
; Inst 10: psraw %xmm2, %xmm1
; Inst 11: packsswb %xmm1, %xmm0
; Inst 12: movq %rbp, %rsp
; Inst 13: popq %rbp
; Inst 14: ret
@@ -349,19 +349,20 @@ block0(v0: i8x16, v1: i32):
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 12)
; (instruction range: 0 .. 13)
; Inst 0: pushq %rbp
; Inst 1: movq %rsp, %rbp
; Inst 2: movdqa %xmm0, %xmm1
; Inst 3: movdqa %xmm1, %xmm0
; Inst 4: punpcklbw %xmm0, %xmm0
; Inst 5: psraw $11, %xmm0
; Inst 6: punpckhbw %xmm1, %xmm1
; Inst 7: psraw $11, %xmm1
; Inst 8: packsswb %xmm1, %xmm0
; Inst 9: movq %rbp, %rsp
; Inst 10: popq %rbp
; Inst 11: ret
; Inst 3: punpcklbw %xmm0, %xmm1
; Inst 4: movdqa %xmm0, %xmm2
; Inst 5: punpckhbw %xmm0, %xmm2
; Inst 6: psraw $11, %xmm1
; Inst 7: psraw $11, %xmm2
; Inst 8: packsswb %xmm2, %xmm1
; Inst 9: movdqa %xmm1, %xmm0
; Inst 10: movq %rbp, %rsp
; Inst 11: popq %rbp
; Inst 12: ret
; }}
function %sshr_i64x2(i64x2, i32) -> i64x2 {
@@ -374,21 +375,20 @@ block0(v0: i64x2, v1: i32):
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 15)
; (instruction range: 0 .. 14)
; Inst 0: pushq %rbp
; Inst 1: movq %rsp, %rbp
; Inst 2: movdqa %xmm0, %xmm1
; Inst 3: pextrd.w $0, %xmm0, %rsi
; Inst 4: pextrd.w $1, %xmm0, %rax
; Inst 5: movq %rdi, %rcx
; Inst 6: sarq %cl, %rsi
; Inst 7: movq %rdi, %rcx
; Inst 8: sarq %cl, %rax
; Inst 9: pinsrd.w $0, %rsi, %xmm1
; Inst 10: pinsrd.w $1, %rax, %xmm1
; Inst 11: movdqa %xmm1, %xmm0
; Inst 12: movq %rbp, %rsp
; Inst 13: popq %rbp
; Inst 14: ret
; Inst 2: pextrd.w $0, %xmm0, %rsi
; Inst 3: pextrd.w $1, %xmm0, %rax
; Inst 4: movq %rdi, %rcx
; Inst 5: sarq %cl, %rsi
; Inst 6: movq %rdi, %rcx
; Inst 7: sarq %cl, %rax
; Inst 8: uninit %xmm0
; Inst 9: pinsrd.w $0, %rsi, %xmm0
; Inst 10: pinsrd.w $1, %rax, %xmm0
; Inst 11: movq %rbp, %rsp
; Inst 12: popq %rbp
; Inst 13: ret
; }}

View File

@@ -13,7 +13,7 @@ use std::path::{Path, PathBuf};
use std::time;
/// Timeout in seconds when we're not making progress.
const TIMEOUT_PANIC: usize = 10;
const TIMEOUT_PANIC: usize = 60;
/// Timeout for reporting slow tests without panicking.
const TIMEOUT_SLOW: usize = 3;