cranelift: port sshr to ISLE on x64 (#3681)
@@ -1,4 +1,4 @@
 src/clif.isle f176ef3bba99365
-src/prelude.isle babc931e5dc5b4cf
+src/prelude.isle d95510fad2e2473c
 src/isa/aarch64/inst.isle 5fa80451697b084f
 src/isa/aarch64/lower.isle 2d2e1e076a0c8a23
@@ -20,6 +20,8 @@ pub trait Context {
     fn pack_value_array_2(&mut self, arg0: Value, arg1: Value) -> ValueArray2;
     fn unpack_value_array_3(&mut self, arg0: &ValueArray3) -> (Value, Value, Value);
     fn pack_value_array_3(&mut self, arg0: Value, arg1: Value, arg2: Value) -> ValueArray3;
+    fn u32_add(&mut self, arg0: u32, arg1: u32) -> u32;
+    fn u8_and(&mut self, arg0: u8, arg1: u8) -> u8;
     fn value_reg(&mut self, arg0: Reg) -> ValueRegs;
     fn value_regs(&mut self, arg0: Reg, arg1: Reg) -> ValueRegs;
     fn temp_writable_reg(&mut self, arg0: Type) -> WritableReg;
@@ -32,6 +34,7 @@ pub trait Context {
     fn u32_as_u64(&mut self, arg0: u32) -> u64;
     fn ty_bits(&mut self, arg0: Type) -> u8;
     fn ty_bits_u16(&mut self, arg0: Type) -> u16;
+    fn lane_type(&mut self, arg0: Type) -> Type;
     fn fits_in_16(&mut self, arg0: Type) -> Option<Type>;
     fn fits_in_32(&mut self, arg0: Type) -> Option<Type>;
     fn fits_in_64(&mut self, arg0: Type) -> Option<Type>;
@@ -52,7 +55,6 @@ pub trait Context {
     fn first_result(&mut self, arg0: Inst) -> Option<Value>;
     fn inst_data(&mut self, arg0: Inst) -> InstructionData;
     fn value_type(&mut self, arg0: Value) -> Type;
-    fn ty_bits_mask(&mut self, arg0: Type) -> u64;
     fn multi_lane(&mut self, arg0: Type) -> Option<(u8, u16)>;
     fn def_inst(&mut self, arg0: Value) -> Option<Inst>;
    fn trap_code_division_by_zero(&mut self) -> TrapCode;
@@ -89,13 +91,13 @@ pub trait Context {
     fn rotr_opposite_amount(&mut self, arg0: Type, arg1: ImmShift) -> ImmShift;
 }
 
-/// Internal type ProducesFlags: defined at src/prelude.isle line 263.
+/// Internal type ProducesFlags: defined at src/prelude.isle line 273.
 #[derive(Clone, Debug)]
 pub enum ProducesFlags {
     ProducesFlags { inst: MInst, result: Reg },
 }
 
-/// Internal type ConsumesFlags: defined at src/prelude.isle line 266.
+/// Internal type ConsumesFlags: defined at src/prelude.isle line 276.
 #[derive(Clone, Debug)]
 pub enum ConsumesFlags {
     ConsumesFlags { inst: MInst, result: Reg },
@@ -975,7 +977,7 @@ pub enum AtomicRMWOp {
 // Generated as internal constructor for term temp_reg.
 pub fn constructor_temp_reg<C: Context>(ctx: &mut C, arg0: Type) -> Option<Reg> {
     let pattern0_0 = arg0;
-    // Rule at src/prelude.isle line 60.
+    // Rule at src/prelude.isle line 66.
     let expr0_0 = C::temp_writable_reg(ctx, pattern0_0);
     let expr1_0 = C::writable_reg_to_reg(ctx, expr0_0);
     return Some(expr1_0);
@@ -984,7 +986,7 @@ pub fn constructor_temp_reg<C: Context>(ctx: &mut C, arg0: Type) -> Option<Reg>
 // Generated as internal constructor for term lo_reg.
 pub fn constructor_lo_reg<C: Context>(ctx: &mut C, arg0: Value) -> Option<Reg> {
     let pattern0_0 = arg0;
-    // Rule at src/prelude.isle line 95.
+    // Rule at src/prelude.isle line 101.
     let expr0_0 = C::put_in_regs(ctx, pattern0_0);
     let expr1_0: usize = 0;
     let expr2_0 = C::value_regs_get(ctx, expr0_0, expr1_0);
@@ -1009,7 +1011,7 @@ pub fn constructor_with_flags<C: Context>(
             result: pattern3_1,
         } = pattern2_0
     {
-        // Rule at src/prelude.isle line 276.
+        // Rule at src/prelude.isle line 286.
         let expr0_0 = C::emit(ctx, &pattern1_0);
         let expr1_0 = C::emit(ctx, &pattern3_0);
         let expr2_0 = C::value_regs(ctx, pattern1_1, pattern3_1);
@@ -1037,7 +1039,7 @@ pub fn constructor_with_flags_1<C: Context>(
             result: pattern3_1,
         } = pattern2_0
     {
-        // Rule at src/prelude.isle line 284.
+        // Rule at src/prelude.isle line 294.
         let expr0_0 = C::emit(ctx, &pattern1_0);
         let expr1_0 = C::emit(ctx, &pattern3_0);
         return Some(pattern3_1);
@@ -1071,7 +1073,7 @@ pub fn constructor_with_flags_2<C: Context>(
             result: pattern5_1,
         } = pattern4_0
     {
-        // Rule at src/prelude.isle line 294.
+        // Rule at src/prelude.isle line 304.
         let expr0_0 = C::emit(ctx, &pattern1_0);
         let expr1_0 = C::emit(ctx, &pattern5_0);
         let expr2_0 = C::emit(ctx, &pattern3_0);
@@ -35,6 +35,7 @@
       (dst WritableReg)
       (imm u8)
       (size OperandSize))
+    (XmmUninitializedValue (dst WritableReg))
     (CmpRmiR (size OperandSize)
              (opcode CmpOpcode)
              (src RegMemImm)
@@ -292,6 +293,15 @@
       (Mem (addr SyntheticAmode))
       (Imm (simm32 u32))))
 
+;; Put the given clif value into a `RegMemImm` operand.
+;;
+;; Asserts that the value fits into a single register, and doesn't require
+;; multiple registers for its representation (like `i128` for example).
+;;
+;; As a side effect, this marks the value as used.
+(decl put_in_reg_mem_imm (Value) RegMemImm)
+(extern constructor put_in_reg_mem_imm put_in_reg_mem_imm)
+
 (type RegMem extern
   (enum
     (Reg (reg Reg))
@@ -319,6 +329,18 @@
   (enum (Imm8 (imm u8))
        (Reg (reg Reg))))
 
+;; Put the given clif value into a `Imm8Reg` operand, masked to the bit width of
+;; the given type.
+;;
+;; Asserts that the value fits into a single register, and doesn't require
+;; multiple registers for its representation (like `i128` for example).
+;;
+;; As a side effect, this marks the value as used.
+;;
+;; This is used when lowering various shifts and rotates.
+(decl put_masked_in_imm8_reg (Value Type) Imm8Reg)
+(extern constructor put_masked_in_imm8_reg put_masked_in_imm8_reg)
+
 (type CC extern
   (enum O
         NO
@@ -383,9 +405,12 @@
 (decl imm8_from_value (Imm8Reg) Value)
 (extern extractor imm8_from_value imm8_from_value)
 
-;; Mask an `Imm8Reg.Imm8`.
-(decl mask_imm8_const (Imm8Reg u64) Imm8Reg)
-(extern constructor mask_imm8_const mask_imm8_const)
+;; Mask a constant to the bit-width of the given type and package it into an
+;; `Imm8Reg.Imm8`. This is used for shifts and rotates, so that we don't try and
+;; shift/rotate more bits than the type has available, per Cranelift's
+;; semantics.
+(decl const_to_type_masked_imm8 (u64 Type) Imm8Reg)
+(extern constructor const_to_type_masked_imm8 const_to_type_masked_imm8)
 
 ;; Extract a constant `RegMemImm.Imm` from a value operand.
 (decl simm32_from_value (RegMemImm) Value)
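A sketch of what the new `const_to_type_masked_imm8` term computes, useful when reading the shift and rotate rules below. This is not the backend's code; `Imm8` here is a hypothetical stand-in for `Imm8Reg.Imm8`:

```rust
/// Hypothetical stand-in for the backend's `Imm8Reg.Imm8` case.
#[derive(Debug, PartialEq)]
struct Imm8(u8);

/// Mask a constant to the bit width of the type, per Cranelift's shift and
/// rotate semantics: shifting an iN by `c` behaves like shifting by `c % N`.
fn const_to_type_masked_imm8(c: u64, ty_bits: u32) -> Imm8 {
    let mask = 1u64.checked_shl(ty_bits).map_or(u64::MAX, |x| x - 1);
    Imm8((c & mask) as u8)
}

fn main() {
    // Shifting an i16 by 18 is the same as shifting it by 2.
    assert_eq!(const_to_type_masked_imm8(18, 16), Imm8(2));
    // For a 64-bit type, 1 << 64 overflows, so the mask falls back to
    // u64::MAX; the hardware then reduces the 8-bit count mod 64 itself.
    assert_eq!(const_to_type_masked_imm8(200, 64), Imm8(200));
}
```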
@@ -494,6 +519,37 @@
         wr))))
     r))
 
+;; Helper for creating an SSE register holding an `i64x2` from two `i64` values.
+(decl make_i64x2_from_lanes (RegMem RegMem) Reg)
+(rule (make_i64x2_from_lanes lo hi)
+      (let ((dst_w WritableReg (temp_writable_reg $I64X2))
+            (dst_r Reg (writable_reg_to_reg dst_w))
+            (_0 Unit (emit (MInst.XmmUninitializedValue dst_w)))
+            (_1 Unit (emit (MInst.XmmRmRImm (SseOpcode.Pinsrd)
+                                            dst_r
+                                            lo
+                                            dst_w
+                                            0
+                                            (OperandSize.Size64))))
+            (_2 Unit (emit (MInst.XmmRmRImm (SseOpcode.Pinsrd)
+                                            dst_r
+                                            hi
+                                            dst_w
+                                            1
+                                            (OperandSize.Size64)))))
+        dst_r))
+
+;; Move a `RegMemImm.Reg` operand to an XMM register, if necessary.
+(decl reg_mem_imm_to_xmm (RegMemImm) RegMemImm)
+(rule (reg_mem_imm_to_xmm rmi @ (RegMemImm.Mem _)) rmi)
+(rule (reg_mem_imm_to_xmm rmi @ (RegMemImm.Imm _)) rmi)
+(rule (reg_mem_imm_to_xmm (RegMemImm.Reg r))
+      (RegMemImm.Reg (gpr_to_xmm $I8X16
+                                 (SseOpcode.Movd)
+                                 (RegMem.Reg r)
+                                 (OperandSize.Size32))))
+
+
 ;;;; Instruction Constructors ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;
 ;; These constructors create SSA-style `MInst`s. It is their responsibility to
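For intuition about `make_i64x2_from_lanes`: with `OperandSize.Size64`, `pinsrd` is effectively `pinsrq`, and each emission overwrites one 64-bit lane of the destination. A scalar Rust model of that lane insertion (the names here are illustrative, not the backend's):

```rust
/// Model an XMM register as two 64-bit lanes.
type V128 = [u64; 2];

/// Scalar model of `pinsrq dst, src, lane`: replace one 64-bit lane and
/// leave the other untouched.
fn pinsrq(mut dst: V128, src: u64, lane: usize) -> V128 {
    dst[lane] = src;
    dst
}

fn main() {
    // Both lanes get written, which is presumably why the helper can start
    // from `XmmUninitializedValue` rather than zeroing or copying a source.
    let uninit: V128 = [0xdead_beef, 0xdead_beef];
    assert_eq!(pinsrq(pinsrq(uninit, 1, 0), 2, 1), [1, 2]);
}
```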
@@ -1058,6 +1114,21 @@
 (rule (pminud src1 src2)
       (xmm_rm_r $I8X16 (SseOpcode.Pminud) src1 src2))
 
+;; Helper for creating `punpcklbw` instructions.
+(decl punpcklbw (Reg RegMem) Reg)
+(rule (punpcklbw src1 src2)
+      (xmm_rm_r $I8X16 (SseOpcode.Punpcklbw) src1 src2))
+
+;; Helper for creating `punpckhbw` instructions.
+(decl punpckhbw (Reg RegMem) Reg)
+(rule (punpckhbw src1 src2)
+      (xmm_rm_r $I8X16 (SseOpcode.Punpckhbw) src1 src2))
+
+;; Helper for creating `packsswb` instructions.
+(decl packsswb (Reg RegMem) Reg)
+(rule (packsswb src1 src2)
+      (xmm_rm_r $I8X16 (SseOpcode.Packsswb) src1 src2))
+
 ;; Helper for creating `MInst.XmmRmRImm` instructions.
 (decl xmm_rm_r_imm (SseOpcode Reg RegMem u8 OperandSize) Reg)
 (rule (xmm_rm_r_imm op src1 src2 imm size)
@@ -1180,6 +1251,16 @@
 (rule (psrlq src1 src2)
       (xmm_rmi_reg (SseOpcode.Psrlq) src1 src2))
 
+;; Helper for creating `psraw` instructions.
+(decl psraw (Reg RegMemImm) Reg)
+(rule (psraw src1 src2)
+      (xmm_rmi_reg (SseOpcode.Psraw) src1 src2))
+
+;; Helper for creating `psrad` instructions.
+(decl psrad (Reg RegMemImm) Reg)
+(rule (psrad src1 src2)
+      (xmm_rmi_reg (SseOpcode.Psrad) src1 src2))
+
 ;; Helper for creating `MInst.MulHi` instructions.
 ;;
 ;; Returns the (lo, hi) register halves of the multiplication.
@@ -1252,6 +1333,19 @@
 (rule (insertps src1 src2 lane)
       (xmm_rm_r_imm (SseOpcode.Insertps) src1 src2 lane (OperandSize.Size32)))
 
+;; Helper for creating `pextrd` instructions.
+(decl pextrd (Type Reg u8) Reg)
+(rule (pextrd ty src lane)
+      (let ((w_dst WritableReg (temp_writable_reg ty))
+            (r_dst Reg (writable_reg_to_reg w_dst))
+            (_ Unit (emit (MInst.XmmRmRImm (SseOpcode.Pextrd)
+                                           r_dst
+                                           (RegMem.Reg src)
+                                           w_dst
+                                           lane
+                                           (operand_size_of_type_32_64 (lane_type ty))))))
+        r_dst))
+
 ;; Helper for creating `not` instructions.
 (decl not (Type Reg) Reg)
 (rule (not ty src)
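One behavior of `psraw`/`psrad` worth keeping in mind for the `sshr` rules that use these helpers: unlike scalar `sar`, the vector shift count is not reduced modulo the lane width; any count at or above it fills the lane with sign bits. A one-lane Rust model of my reading of the SSE2 semantics:

```rust
/// Scalar model of one lane of `psraw`: an arithmetic right shift whose
/// count saturates instead of wrapping, leaving only sign bits.
fn psraw_lane(lane: i16, count: u64) -> i16 {
    if count >= 16 {
        if lane < 0 { -1 } else { 0 }
    } else {
        lane >> count
    }
}

fn main() {
    assert_eq!(psraw_lane(-32768, 3), -4096);
    // The count is not taken modulo 16:
    assert_eq!(psraw_lane(-32768, 100), -1);
    assert_eq!(psraw_lane(12345, 100), 0);
}
```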
@@ -537,13 +537,7 @@
 ;; `i64` and smaller.
 
 (rule (lower (has_type (fits_in_64 ty) (ishl src amt)))
-      ;; NB: Only the low bits of `amt` matter since we logically mask the shift
-      ;; amount to the value's bit width.
-      (let ((amt_ Reg (lo_reg amt)))
-        (value_reg (shl ty (put_in_reg src) (Imm8Reg.Reg amt_)))))
-
-(rule (lower (has_type (fits_in_64 ty) (ishl src (imm8_from_value amt))))
-      (value_reg (shl ty (put_in_reg src) amt)))
+      (value_reg (shl ty (put_in_reg src) (put_masked_in_imm8_reg amt ty))))
 
 ;; `i128`.
 
@@ -582,15 +576,8 @@
 ;; `i64` and smaller.
 
 (rule (lower (has_type (fits_in_64 ty) (ushr src amt)))
-      (let ((src_ Reg (extend_to_reg src ty (ExtendKind.Zero)))
-            ;; NB: Only the low bits of `amt` matter since we logically mask the
-            ;; shift amount to the value's bit width.
-            (amt_ Reg (lo_reg amt)))
-        (value_reg (shr ty src_ (Imm8Reg.Reg amt_)))))
-
-(rule (lower (has_type (fits_in_64 ty) (ushr src (imm8_from_value amt))))
       (let ((src_ Reg (extend_to_reg src ty (ExtendKind.Zero))))
-        (value_reg (shr ty src_ amt))))
+        (value_reg (shr ty src_ (put_masked_in_imm8_reg amt ty)))))
 
 ;; `i128`.
 
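Both right-shift rules extend the source before shifting because the shift itself happens in a register wider than `i8`/`i16`: the bits above the type's width must already match the type's interpretation. A small Rust illustration of the two extensions:

```rust
fn main() {
    let x: i8 = -2; // 0xFE
    // Logical shift: the value must be zero-extended first, otherwise the
    // high bits of the wider register would leak into the result.
    let ushr = ((x as u8 as u32) >> 1) as u8;
    assert_eq!(ushr, 0x7F);
    // Arithmetic shift: the value must be sign-extended so that copies of
    // the sign bit shift in from above.
    let sshr = ((x as i32) >> 1) as i8;
    assert_eq!(sshr, -1);
}
```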
@@ -623,6 +610,109 @@
       (let ((amt_ Reg (lo_reg amt)))
         (shr_i128 (put_in_regs src) amt_)))
 
+;;;; Rules for `sshr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; `i64` and smaller.
+
+(rule (lower (has_type (fits_in_64 ty) (sshr src amt)))
+      (let ((src_ Reg (extend_to_reg src ty (ExtendKind.Sign))))
+        (value_reg (sar ty src_ (put_masked_in_imm8_reg amt ty)))))
+
+;; `i128`.
+
+(decl sar_i128 (ValueRegs Reg) ValueRegs)
+(rule (sar_i128 src amt)
+      ;; Unpack the low/high halves of `src`.
+      (let ((src_lo Reg (value_regs_get src 0))
+            (src_hi Reg (value_regs_get src 1))
+            ;; Do a shift of each half. NB: the low half uses an unsigned shift
+            ;; because its MSB is not a sign bit.
+            (lo_shifted Reg (shr $I64 src_lo (Imm8Reg.Reg amt)))
+            (hi_shifted Reg (sar $I64 src_hi (Imm8Reg.Reg amt)))
+            ;; `src_hi << (64 - amt)` are the bits to carry over from the high
+            ;; half to the low half.
+            (carry Reg (shl $I64 src_hi (Imm8Reg.Reg (sub $I64 (imm $I64 64) (RegMemImm.Reg amt)))))
+            ;; Nullify the carry if we are shifting by a multiple of 128.
+            (carry_ Reg (with_flags_1 (test (OperandSize.Size64) (RegMemImm.Imm 127) amt)
+                                      (cmove $I64 (CC.Z) (RegMem.Reg (imm $I64 0)) carry)))
+            ;; Add the carry into the low half.
+            (lo_shifted_ Reg (or $I64 lo_shifted (RegMemImm.Reg carry_)))
+            ;; Get all sign bits.
+            (sign_bits Reg (sar $I64 src_hi (Imm8Reg.Imm8 63))))
+        ;; Combine the two shifted halves. However, if we are shifting by >= 64
+        ;; (modulo 128), then the hi bits are all sign bits and the lo bits are
+        ;; what would otherwise be our hi bits.
+        (with_flags_2 (test (OperandSize.Size64) (RegMemImm.Imm 64) amt)
+                      (cmove $I64 (CC.Z) (RegMem.Reg lo_shifted_) hi_shifted)
+                      (cmove $I64 (CC.Z) (RegMem.Reg hi_shifted) sign_bits))))
+
+(rule (lower (has_type $I128 (sshr src amt)))
+      ;; NB: Only the low bits of `amt` matter since we logically mask the shift
+      ;; amount to the value's bit width.
+      (let ((amt_ Reg (lo_reg amt)))
+        (sar_i128 (put_in_regs src) amt_)))
+
+;; SSE.
+
+;; Since the x86 instruction set does not have an 8x16 shift instruction and the
+;; approach used for `ishl` and `ushr` cannot be easily used (the masks do not
+;; preserve the sign), we use a different approach here: separate the low and
+;; high lanes, shift them separately, and merge them into the final result.
+;;
+;; Visually, this looks like the following, where `src.i8x16 = [s0, s1, ...,
+;; s15]:
+;;
+;; lo.i16x8 = [(s0, s0), (s1, s1), ..., (s7, s7)]
+;; shifted_lo.i16x8 = shift each lane of `low`
+;; hi.i16x8 = [(s8, s8), (s9, s9), ..., (s15, s15)]
+;; shifted_hi.i16x8 = shift each lane of `high`
+;; result = [s0'', s1'', ..., s15'']
+(rule (lower (has_type $I8X16 (sshr src amt @ (value_type amt_ty))))
+      (let ((src_ Reg (put_in_reg src))
+            ;; In order for `packsswb` later to only use the high byte of each
+            ;; 16x8 lane, we shift right an extra 8 bits, relying on `psraw` to
+            ;; fill in the upper bits appropriately.
+            (lo Reg (punpcklbw src_ (RegMem.Reg src_)))
+            (hi Reg (punpckhbw src_ (RegMem.Reg src_)))
+            (amt_ RegMemImm (sshr_i8x16_bigger_shift amt_ty (put_in_reg_mem_imm amt)))
+            (shifted_lo Reg (psraw lo amt_))
+            (shifted_hi Reg (psraw hi amt_)))
+        (value_reg (packsswb shifted_lo (RegMem.Reg shifted_hi)))))
+
+(decl sshr_i8x16_bigger_shift (Type RegMemImm) RegMemImm)
+(rule (sshr_i8x16_bigger_shift _ty (RegMemImm.Imm i))
+      (RegMemImm.Imm (u32_add i 8)))
+(rule (sshr_i8x16_bigger_shift ty (RegMemImm.Reg r))
+      (reg_mem_imm_to_xmm (RegMemImm.Reg (add ty r (RegMemImm.Imm 8)))))
+(rule (sshr_i8x16_bigger_shift ty rmi @ (RegMemImm.Mem _m))
+      (reg_mem_imm_to_xmm (RegMemImm.Reg (add ty (imm ty 8) rmi))))
+
+;; `sshr.{i16x8,i32x4}` can be a simple `psra{w,d}`, we just have to make sure
+;; that if the shift amount is in a register, it is in an XMM register.
+(rule (lower (has_type $I16X8 (sshr src amt)))
+      (value_reg (psraw (put_in_reg src)
+                        (reg_mem_imm_to_xmm (put_in_reg_mem_imm amt)))))
+(rule (lower (has_type $I32X4 (sshr src amt)))
+      (value_reg (psrad (put_in_reg src)
+                        (reg_mem_imm_to_xmm (put_in_reg_mem_imm amt)))))
+
+;; The `sshr.i64x2` CLIF instruction has no single x86 instruction in the older
+;; feature sets. Newer ones like AVX512VL + AVX512F include `vpsraq`, a 128-bit
+;; instruction that would fit here, but this backend does not currently have
+;; support for EVEX encodings. To remedy this, we extract each 64-bit lane to a
+;; GPR, shift each using a scalar instruction, and insert the shifted values
+;; back in the `dst` XMM register.
+;;
+;; (TODO: when EVEX support is available, add an alternate lowering here).
+(rule (lower (has_type $I64X2 (sshr src amt)))
+      (let ((src_ Reg (put_in_reg src))
+            (lo Reg (pextrd $I64 src_ 0))
+            (hi Reg (pextrd $I64 src_ 1))
+            (amt_ Imm8Reg (put_masked_in_imm8_reg amt $I64))
+            (shifted_lo Reg (sar $I64 lo amt_))
+            (shifted_hi Reg (sar $I64 hi amt_)))
+        (value_reg (make_i64x2_from_lanes (RegMem.Reg shifted_lo)
+                                          (RegMem.Reg shifted_hi)))))
+
 ;;;; Rules for `rotl` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 ;; `i16` and `i8`: we need to extend the shift amount, or mask the
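A plain-Rust model of the `sar_i128` decomposition above can make the carry and `cmov` logic easier to check. This is a sketch of the same algorithm, not the lowering itself; `amt` is assumed already reduced to 0..=127, matching the `lo_reg` plus `test $127` handling:

```rust
/// Portable model of the `sar_i128` lowering: arithmetic right shift of a
/// 128-bit value held as two 64-bit halves.
fn sar_i128(lo: u64, hi: u64, amt: u32) -> (u64, u64) {
    // x86 `shr/sar/shl %cl` reduce the count mod 64 in hardware.
    let lo_shifted = lo >> (amt & 63);
    let hi_shifted = ((hi as i64) >> (amt & 63)) as u64;
    // Bits that cross from the high half into the low half. For amt == 0
    // the hardware count (64 - 0) & 63 == 0 would leak all of `hi`, so the
    // lowering nullifies the carry (the `test $127` + `cmovz` pair).
    let carry = if amt & 127 == 0 {
        0
    } else {
        hi << ((64 - (amt & 63)) & 63)
    };
    let lo_shifted_ = lo_shifted | carry;
    let sign_bits = ((hi as i64) >> 63) as u64;
    // Shifts of 64..=127 move the sign-extended high half into the low
    // half: the `test $64` + `cmovz` pair.
    if amt & 64 == 0 {
        (lo_shifted_, hi_shifted)
    } else {
        (hi_shifted, sign_bits)
    }
}

fn main() {
    let x: i128 = -0x1234_5678_9abc_def0_i128 << 32;
    for amt in [0u32, 1, 31, 63, 64, 65, 127] {
        let expect = x >> amt;
        let (lo, hi) = sar_i128(x as u64, (x >> 64) as u64, amt);
        assert_eq!((((hi as u128) << 64) | lo as u128) as i128, expect);
    }
    println!("model matches i128 >> for all tested amounts");
}
```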
@@ -632,8 +722,11 @@
       (let ((amt_ Reg (extend_to_reg amt $I32 (ExtendKind.Zero))))
         (value_reg (m_rotl ty (put_in_reg src) (Imm8Reg.Reg amt_)))))
 
-(rule (lower (has_type (ty_8_or_16 ty) (rotl src (imm8_from_value amt))))
-      (value_reg (m_rotl ty (put_in_reg src) (mask_imm8_const amt (ty_bits_mask ty)))))
+(rule (lower (has_type (ty_8_or_16 ty)
+                       (rotl src (u64_from_iconst amt))))
+      (value_reg (m_rotl ty
+                         (put_in_reg src)
+                         (const_to_type_masked_imm8 amt ty))))
 
 ;; `i64` and `i32`: we can rely on x86's rotate-amount masking since
 ;; we operate on the whole register.
@@ -644,8 +737,11 @@
       (let ((amt_ Reg (lo_reg amt)))
         (value_reg (m_rotl ty (put_in_reg src) (Imm8Reg.Reg amt_)))))
 
-(rule (lower (has_type (ty_32_or_64 ty) (rotl src (imm8_from_value amt))))
-      (value_reg (m_rotl ty (put_in_reg src) amt)))
+(rule (lower (has_type (ty_32_or_64 ty)
+                       (rotl src (u64_from_iconst amt))))
+      (value_reg (m_rotl ty
+                         (put_in_reg src)
+                         (const_to_type_masked_imm8 amt ty))))
 
 ;; `i128`.
 
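On the two constant `rotl` rules above: the amount extracted by `u64_from_iconst` may be wider than the rotated type, so it is reduced to the type's bit width first. In Rust terms the intended semantics are just `rotate_left` with a masked amount:

```rust
fn main() {
    // Cranelift masks the rotate amount to the type's bit width, so an i8
    // rotate by 11 behaves like a rotate by 3.
    let x: u8 = 0b1000_0001;
    let amt: u64 = 11; // constant from `u64_from_iconst`, wider than i8
    let masked = (amt & 0x7) as u32; // what const_to_type_masked_imm8 computes
    assert_eq!(x.rotate_left(masked), x.rotate_left(11 % 8));
    assert_eq!(x.rotate_left(masked), 0b0000_1100);
}
```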
@@ -1538,13 +1538,18 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
         | Opcode::Umin
         | Opcode::Bnot
         | Opcode::Bitselect
-        | Opcode::Vselect => implemented_in_isle(ctx),
+        | Opcode::Vselect
+        | Opcode::Sshr => implemented_in_isle(ctx),
 
-        Opcode::Ishl | Opcode::Ushr | Opcode::Sshr | Opcode::Rotl | Opcode::Rotr => {
+        Opcode::Ishl | Opcode::Ushr | Opcode::Rotl | Opcode::Rotr => {
             let dst_ty = ctx.output_ty(insn, 0);
             debug_assert_eq!(ctx.input_ty(insn, 0), dst_ty);
 
             if !dst_ty.is_vector() && dst_ty.bits() <= 64 {
+                if op != Opcode::Rotr {
+                    implemented_in_isle(ctx);
+                }
+
                 // Scalar shifts on x86 have various encodings:
                 // - shift by one bit, e.g. `SAL r/m8, 1` (not used here)
                 // - shift by an immediate amount, e.g. `SAL r/m8, imm8`
@@ -1557,10 +1562,6 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                         OperandSize::Size32,
                         extend_input_to_reg(ctx, inputs[0], ExtSpec::ZeroExtendTo32),
                     ),
-                    Opcode::Sshr => (
-                        OperandSize::Size32,
-                        extend_input_to_reg(ctx, inputs[0], ExtSpec::SignExtendTo32),
-                    ),
                     Opcode::Rotl | Opcode::Rotr => (
                         OperandSize::from_ty(dst_ty),
                         put_input_in_reg(ctx, inputs[0]),
@@ -1590,7 +1591,6 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                 let shift_kind = match op {
                     Opcode::Ishl => ShiftKind::ShiftLeft,
                     Opcode::Ushr => ShiftKind::ShiftRightLogical,
-                    Opcode::Sshr => ShiftKind::ShiftRightArithmetic,
                     Opcode::Rotl => ShiftKind::RotateLeft,
                     Opcode::Rotr => ShiftKind::RotateRight,
                     _ => unreachable!(),
@@ -1608,50 +1608,8 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                 let dst = get_output_reg(ctx, outputs[0]);
 
                 match op {
-                    Opcode::Ishl => {
-                        emit_shl_i128(ctx, src, dst, amt_src);
-                    }
-                    Opcode::Ushr => {
-                        emit_shr_i128(ctx, src, dst, amt_src, /* is_signed = */ false);
-                    }
-                    Opcode::Sshr => {
-                        emit_shr_i128(ctx, src, dst, amt_src, /* is_signed = */ true);
-                    }
-                    Opcode::Rotl => {
-                        // (mov tmp, src)
-                        // (shl.i128 tmp, amt)
-                        // (mov dst, src)
-                        // (ushr.i128 dst, 128-amt)
-                        // (or dst, tmp)
-                        let tmp = ctx.alloc_tmp(types::I128);
-                        emit_shl_i128(ctx, src, tmp, amt_src);
-                        let inv_amt = ctx.alloc_tmp(types::I64).only_reg().unwrap();
-                        ctx.emit(Inst::imm(OperandSize::Size64, 128, inv_amt));
-                        ctx.emit(Inst::alu_rmi_r(
-                            OperandSize::Size64,
-                            AluRmiROpcode::Sub,
-                            RegMemImm::reg(amt_src),
-                            inv_amt,
-                        ));
-                        emit_shr_i128(
-                            ctx,
-                            src,
-                            dst,
-                            inv_amt.to_reg(),
-                            /* is_signed = */ false,
-                        );
-                        ctx.emit(Inst::alu_rmi_r(
-                            OperandSize::Size64,
-                            AluRmiROpcode::Or,
-                            RegMemImm::reg(tmp.regs()[0].to_reg()),
-                            dst.regs()[0],
-                        ));
-                        ctx.emit(Inst::alu_rmi_r(
-                            OperandSize::Size64,
-                            AluRmiROpcode::Or,
-                            RegMemImm::reg(tmp.regs()[1].to_reg()),
-                            dst.regs()[1],
-                        ));
-                    }
+                    Opcode::Ishl | Opcode::Ushr | Opcode::Rotl => {
+                        implemented_in_isle(ctx);
+                    }
                     Opcode::Rotr => {
                         // (mov tmp, src)
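For reference, the deleted `Rotl` arm built the rotate out of shifts using the identity `rotl(x, n) = (x << n) | (x >> (128 - n))`. A scalar sketch of the same identity, with the `n % 128 == 0` corner handled explicitly (the emitted machine code handled it through count masking and the carry-nullifying `cmov` inside the shift helpers):

```rust
/// rotl(x, n) == (x << n) | (x >> (128 - n)), the identity the removed
/// code implemented with emit_shl_i128, emit_shr_i128, and `or`.
fn rotl128(x: u128, n: u32) -> u128 {
    let n = n & 127;
    if n == 0 {
        x
    } else {
        (x << n) | (x >> (128 - n))
    }
}

fn main() {
    let x = 0x0123_4567_89ab_cdef_0f1e_2d3c_4b5a_6978_u128;
    for n in [0u32, 1, 64, 127, 200] {
        assert_eq!(rotl128(x, n), x.rotate_left(n));
    }
}
```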
@@ -1808,127 +1766,6 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                     _ => SseOpcode::Pand,
                 };
                 ctx.emit(Inst::xmm_rm_r(sse_op, RegMem::from(mask_value), dst));
-            } else if dst_ty == types::I8X16 && op == Opcode::Sshr {
-                // Since the x86 instruction set does not have an 8x16 shift instruction and the approach used for
-                // `ishl` and `ushr` cannot be easily used (the masks do not preserve the sign), we use a different
-                // approach here: separate the low and high lanes, shift them separately, and merge them into the final
-                // result. Visually, this looks like the following, where `src.i8x16 = [s0, s1, ..., s15]:
-                // low.i16x8 = [(s0, s0), (s1, s1), ..., (s7, s7)]
-                // shifted_low.i16x8 = shift each lane of `low`
-                // high.i16x8 = [(s8, s8), (s9, s9), ..., (s15, s15)]
-                // shifted_high.i16x8 = shift each lane of `high`
-                // dst.i8x16 = [s0'', s1'', ..., s15'']
-                let src = put_input_in_reg(ctx, inputs[0]);
-                let shift_by = input_to_reg_mem_imm(ctx, inputs[1]);
-                let shift_by_ty = ctx.input_ty(insn, 1);
-                let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-
-                // In order for PACKSSWB later to only use the high byte of each 16x8 lane, we shift right an extra 8
-                // bits, relying on PSRAW to fill in the upper bits appropriately.
-                let bigger_shift_by = match shift_by {
-                    // When we know the shift amount at compile time, we add the extra shift amount statically.
-                    RegMemImm::Imm { simm32 } => RegMemImm::imm(simm32 + 8),
-                    // Otherwise we add instructions to add the extra shift amount and move the value into an XMM
-                    // register.
-                    RegMemImm::Reg { reg } => {
-                        let bigger_shift_by_gpr = ctx.alloc_tmp(shift_by_ty).only_reg().unwrap();
-                        ctx.emit(Inst::mov_r_r(OperandSize::Size64, reg, bigger_shift_by_gpr));
-
-                        let size = if shift_by_ty == types::I64 {
-                            OperandSize::Size64
-                        } else {
-                            OperandSize::Size32
-                        };
-                        let imm = RegMemImm::imm(8);
-                        ctx.emit(Inst::alu_rmi_r(
-                            size,
-                            AluRmiROpcode::Add,
-                            imm,
-                            bigger_shift_by_gpr,
-                        ));
-
-                        let bigger_shift_by_xmm = ctx.alloc_tmp(dst_ty).only_reg().unwrap();
-                        ctx.emit(Inst::gpr_to_xmm(
-                            SseOpcode::Movd,
-                            RegMem::from(bigger_shift_by_gpr),
-                            OperandSize::Size32,
-                            bigger_shift_by_xmm,
-                        ));
-                        RegMemImm::reg(bigger_shift_by_xmm.to_reg())
-                    }
-                    RegMemImm::Mem { .. } => unimplemented!("load shift amount to XMM register"),
-                };
-
-                // Unpack and shift the lower lanes of `src` into the `dst` register.
-                ctx.emit(Inst::gen_move(dst, src, dst_ty));
-                ctx.emit(Inst::xmm_rm_r(SseOpcode::Punpcklbw, RegMem::from(dst), dst));
-                ctx.emit(Inst::xmm_rmi_reg(
-                    SseOpcode::Psraw,
-                    bigger_shift_by.clone(),
-                    dst,
-                ));
-
-                // Unpack and shift the upper lanes of `src` into a temporary register, `upper_lanes`.
-                let upper_lanes = ctx.alloc_tmp(dst_ty).only_reg().unwrap();
-                ctx.emit(Inst::gen_move(upper_lanes, src, dst_ty));
-                ctx.emit(Inst::xmm_rm_r(
-                    SseOpcode::Punpckhbw,
-                    RegMem::from(upper_lanes),
-                    upper_lanes,
-                ));
-                ctx.emit(Inst::xmm_rmi_reg(
-                    SseOpcode::Psraw,
-                    bigger_shift_by,
-                    upper_lanes,
-                ));
-
-                // Merge the upper and lower shifted lanes into `dst`.
-                ctx.emit(Inst::xmm_rm_r(
-                    SseOpcode::Packsswb,
-                    RegMem::from(upper_lanes),
-                    dst,
-                ));
-            } else if dst_ty == types::I64X2 && op == Opcode::Sshr {
-                // The `sshr.i8x16` CLIF instruction has no single x86 instruction in the older feature sets; newer ones
-                // like AVX512VL + AVX512F include VPSRAQ, a 128-bit instruction that would fit here, but this backend
-                // does not currently have support for EVEX encodings (TODO when EVEX support is available, add an
-                // alternate lowering here). To remedy this, we extract each 64-bit lane to a GPR, shift each using a
-                // scalar instruction, and insert the shifted values back in the `dst` XMM register.
-                let src = put_input_in_reg(ctx, inputs[0]);
-                let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-                ctx.emit(Inst::gen_move(dst, src, dst_ty));
-
-                // Extract the upper and lower lanes into temporary GPRs.
-                let lower_lane = ctx.alloc_tmp(types::I64).only_reg().unwrap();
-                emit_extract_lane(ctx, src, lower_lane, 0, types::I64);
-                let upper_lane = ctx.alloc_tmp(types::I64).only_reg().unwrap();
-                emit_extract_lane(ctx, src, upper_lane, 1, types::I64);
-
-                // Shift each value.
-                let mut shift = |reg: Writable<Reg>| {
-                    let kind = ShiftKind::ShiftRightArithmetic;
-                    if let Some(shift_by) = ctx.get_input_as_source_or_const(insn, 1).constant {
-                        // Mask the shift amount according to Cranelift's semantics.
-                        let shift_by = (shift_by as u8) & (types::I64.bits() as u8 - 1);
-                        ctx.emit(Inst::shift_r(
-                            OperandSize::Size64,
-                            kind,
-                            Some(shift_by),
-                            reg,
-                        ));
-                    } else {
-                        let dynamic_shift_by = put_input_in_reg(ctx, inputs[1]);
-                        let w_rcx = Writable::from_reg(regs::rcx());
-                        ctx.emit(Inst::mov_r_r(OperandSize::Size64, dynamic_shift_by, w_rcx));
-                        ctx.emit(Inst::shift_r(OperandSize::Size64, kind, None, reg));
-                    };
-                };
-                shift(lower_lane);
-                shift(upper_lane);
-
-                // Insert the scalar values back into the `dst` vector.
-                emit_insert_lane(ctx, RegMem::from(lower_lane), dst, 0, types::I64);
-                emit_insert_lane(ctx, RegMem::from(upper_lane), dst, 1, types::I64);
             } else {
                 // For the remaining packed shifts not covered above, x86 has implementations that can either:
                 // - shift using an immediate
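The block deleted above and the new ISLE rules implement the same `i8x16` trick. A scalar model of a single byte shows why shifting by `amt + 8` works: `punpcklbw x, x` duplicates each byte into a 16-bit lane, the extra 8 bits of arithmetic shift move the sign-correct result down into the lane's low byte, and the value then always fits in `i8`, so `packsswb`'s saturation is exact:

```rust
/// Scalar model of one byte lane going through
/// punpcklbw/punpckhbw -> psraw (amt + 8) -> packsswb.
fn sshr_i8_via_i16(s: i8, amt: u32) -> i8 {
    // punpcklbw s, s: the byte is duplicated, so the 16-bit lane has `s`
    // in both its low and high byte (the high byte supplies the sign).
    let lane = (((s as u8 as u16) << 8) | (s as u8 as u16)) as i16;
    // psraw by amt + 8: the wanted, sign-correct result lands in the low
    // byte of the lane (psraw fills from the sign).
    let shifted = lane >> (amt + 8);
    // packsswb: saturating narrow; the value already fits in i8, so the
    // saturation never changes it here.
    shifted.clamp(i8::MIN as i16, i8::MAX as i16) as i8
}

fn main() {
    for s in i8::MIN..=i8::MAX {
        for amt in 0..8u32 {
            assert_eq!(sshr_i8_via_i16(s, amt), s >> amt);
        }
    }
    println!("i8x16 sshr trick matches scalar `>>` for all inputs");
}
```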
@@ -1940,13 +1777,11 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                     types::I16X8 => match op {
                         Opcode::Ishl => SseOpcode::Psllw,
                         Opcode::Ushr => SseOpcode::Psrlw,
-                        Opcode::Sshr => SseOpcode::Psraw,
                         _ => unimplemented!("{} is not implemented for type {}", op, dst_ty),
                     },
                     types::I32X4 => match op {
                         Opcode::Ishl => SseOpcode::Pslld,
                         Opcode::Ushr => SseOpcode::Psrld,
-                        Opcode::Sshr => SseOpcode::Psrad,
                         _ => unimplemented!("{} is not implemented for type {}", op, dst_ty),
                     },
                     types::I64X2 => match op {
@@ -69,6 +69,31 @@ where
         OperandSize::from_ty(ty)
     }
 
+    fn put_in_reg_mem_imm(&mut self, val: Value) -> RegMemImm {
+        let inputs = self.lower_ctx.get_value_as_source_or_const(val);
+
+        if let Some(c) = inputs.constant {
+            if let Some(imm) = to_simm32(c as i64) {
+                return imm;
+            }
+
+            // Generate constants fresh at each use to minimize long-range
+            // register pressure.
+            let ty = self.value_type(val);
+            return RegMemImm::reg(generated_code::constructor_imm(self, ty, c).unwrap());
+        }
+
+        if let Some((src_insn, 0)) = inputs.inst {
+            if let Some((addr_input, offset)) = is_mergeable_load(self.lower_ctx, src_insn) {
+                self.lower_ctx.sink_inst(src_insn);
+                let amode = lower_to_amode(self.lower_ctx, addr_input, offset);
+                return RegMemImm::mem(amode);
+            }
+        }
+
+        RegMemImm::reg(self.put_in_reg(val))
+    }
+
     fn put_in_reg_mem(&mut self, val: Value) -> RegMem {
         let inputs = self.lower_ctx.get_value_as_source_or_const(val);
 
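`put_in_reg_mem_imm` folds a constant into the instruction only when `to_simm32` accepts it. I am assuming the usual x64 rule here, that a 64-bit constant is encodable as an imm32 only if it round-trips through sign extension; a sketch of that check (`fits_in_simm32` is my name for it, not the helper's):

```rust
/// Assumed semantics of the `to_simm32` check: a 64-bit constant can be an
/// x64 32-bit immediate only if sign-extending its low 32 bits reproduces
/// the full value.
fn fits_in_simm32(v: i64) -> bool {
    v == (v as i32) as i64
}

fn main() {
    assert!(fits_in_simm32(-1));
    assert!(fits_in_simm32(0x7fff_ffff));
    assert!(!fits_in_simm32(0x8000_0000)); // would sign-extend to a negative
    assert!(!fits_in_simm32(0x1_0000_0000));
}
```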
@@ -90,6 +115,23 @@ where
         RegMem::reg(self.put_in_reg(val))
     }
 
+    fn put_masked_in_imm8_reg(&mut self, val: Value, ty: Type) -> Imm8Reg {
+        let inputs = self.lower_ctx.get_value_as_source_or_const(val);
+
+        if let Some(c) = inputs.constant {
+            let mask = 1_u64
+                .checked_shl(ty.bits() as u32)
+                .map_or(u64::MAX, |x| x - 1);
+            return Imm8Reg::Imm8 {
+                imm: (c & mask) as u8,
+            };
+        }
+
+        Imm8Reg::Reg {
+            reg: self.put_in_regs(val).regs()[0],
+        }
+    }
+
     #[inline]
     fn encode_fcmp_imm(&mut self, imm: &FcmpImm) -> u8 {
         imm.encode()
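Note the register fallback in `put_masked_in_imm8_reg`: for a multi-register (up to `i128`) amount it takes only `regs()[0]`, the low 64 bits. That is sound because every consumer masks the amount to a bit width of at most 128, and reducing mod any power of two up to 128 only ever inspects low bits; a quick Rust check:

```rust
fn main() {
    // A shift amount held in an i128: garbage in the high half, 9 below.
    let amt: u128 = (5u128 << 64) | 9;
    // Masking to any bit width <= 128 depends only on low bits, so the
    // low 64-bit register alone determines the effective amount.
    for bits in [8u32, 16, 32, 64, 128] {
        assert_eq!((amt % bits as u128) as u32, (amt as u64 % bits as u64) as u32);
    }
}
```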
@@ -131,12 +173,12 @@ where
     }
 
     #[inline]
-    fn mask_imm8_const(&mut self, imm8: &Imm8Reg, mask: u64) -> Imm8Reg {
-        match imm8 {
-            &Imm8Reg::Reg { reg } => Imm8Reg::Reg { reg },
-            &Imm8Reg::Imm8 { imm } => Imm8Reg::Imm8 {
-                imm: imm & (mask as u8),
-            },
-        }
-    }
+    fn const_to_type_masked_imm8(&mut self, c: u64, ty: Type) -> Imm8Reg {
+        let mask = 1_u64
+            .checked_shl(ty.bits() as u32)
+            .map_or(u64::MAX, |x| x - 1);
+        Imm8Reg::Imm8 {
+            imm: (c & mask) as u8,
+        }
+    }
 
@@ -1,4 +1,4 @@
 src/clif.isle f176ef3bba99365
-src/prelude.isle babc931e5dc5b4cf
-src/isa/x64/inst.isle bc5fc626492752c8
-src/isa/x64/lower.isle 33e94300f4c08455
+src/prelude.isle d95510fad2e2473c
+src/isa/x64/inst.isle c16462cc359dd466
+src/isa/x64/lower.isle 9f761598e3949e8e
File diff suppressed because it is too large
@@ -95,11 +95,6 @@ macro_rules! isle_prelude_methods {
             ty.bits().try_into().unwrap()
         }
 
-        #[inline]
-        fn ty_bits_mask(&mut self, ty: Type) -> u64 {
-            (1 << (self.ty_bits(ty) as u64)) - 1
-        }
-
         #[inline]
         fn ty_bits_u16(&mut self, ty: Type) -> u16 {
             ty.bits()
@@ -260,6 +255,21 @@ macro_rules! isle_prelude_methods {
                 n => Some(n as u64),
             }
         }
+
+        #[inline]
+        fn u32_add(&mut self, a: u32, b: u32) -> u32 {
+            a.wrapping_add(b)
+        }
+
+        #[inline]
+        fn u8_and(&mut self, a: u8, b: u8) -> u8 {
+            a & b
+        }
+
+        #[inline]
+        fn lane_type(&mut self, ty: Type) -> Type {
+            ty.lane_type()
+        }
     };
 }
 
@@ -38,6 +38,12 @@
 (type ValueList (primitive ValueList))
 (type ValueRegs (primitive ValueRegs))
 
+(decl u32_add (u32 u32) u32)
+(extern constructor u32_add u32_add)
+
+(decl u8_and (u8 u8) u8)
+(extern constructor u8_and u8_and)
+
 ;;;; Registers ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (type Reg (primitive Reg))
@@ -146,6 +152,10 @@
 (decl ty_bits_u16 (Type) u16)
 (extern constructor ty_bits_u16 ty_bits_u16)
 
+;; Get the type of each lane in the given type.
+(decl lane_type (Type) Type)
+(extern constructor lane_type lane_type)
+
 ;;;; Helper Clif Extractors ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 ;; An extractor that only matches types that can fit in 16 bits.
@@ -242,11 +252,6 @@
   (and (result_type ty)
        inst))
 
-;; Return a bitmask that will mask off a count to be within `ty`'s
-;; bit-width. Used for shifts/rotates.
-(decl ty_bits_mask (Type) u64)
-(extern constructor ty_bits_mask ty_bits_mask)
-
 ;; Match a multi-lane type, extracting (# bits per lane, # lanes) from the given
 ;; type. Will only match when there is more than one lane.
 (decl multi_lane (u8 u16) Type)
@@ -256,6 +261,11 @@
 (decl def_inst (Inst) Value)
 (extern extractor def_inst def_inst)
 
+;; Extract a constant `u64` from a value defined by an `iconst`.
+(decl u64_from_iconst (u64) Value)
+(extractor (u64_from_iconst x)
+           (def_inst (iconst (u64_from_imm64 x))))
+
 ;;;; Helpers for Working with Flags ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 ;; Newtype wrapper around `MInst` for instructions that are used for their
@@ -1173,35 +1173,32 @@ block0(v0: i128, v1: i128):
 ; Entry block: 0
 ; Block 0:
 ; (original IR block: block0)
-; (instruction range: 0 .. 28)
+; (instruction range: 0 .. 25)
 ; Inst 0: pushq %rbp
 ; Inst 1: movq %rsp, %rbp
-; Inst 2: movq %rdi, %r8
-; Inst 3: movq %rsi, %rdi
-; Inst 4: movq %rdi, %rsi
-; Inst 5: movq %rdx, %rcx
-; Inst 6: sarq %cl, %rsi
-; Inst 7: movq %rdx, %rcx
-; Inst 8: shrq %cl, %r8
-; Inst 9: movl $64, %ecx
-; Inst 10: subq %rdx, %rcx
-; Inst 11: movq %rdi, %rax
-; Inst 12: shlq %cl, %rax
-; Inst 13: xorq %rcx, %rcx
-; Inst 14: testq $127, %rdx
-; Inst 15: cmovzq %rcx, %rax
-; Inst 16: orq %r8, %rax
-; Inst 17: sarq $63, %rdi
-; Inst 18: xorq %rcx, %rcx
-; Inst 19: andq $64, %rdx
-; Inst 20: cmovzq %rsi, %rdi
-; Inst 21: cmovzq %rax, %rcx
-; Inst 22: cmovnzq %rsi, %rcx
-; Inst 23: movq %rcx, %rax
-; Inst 24: movq %rdi, %rdx
-; Inst 25: movq %rbp, %rsp
-; Inst 26: popq %rbp
-; Inst 27: ret
+; Inst 2: movq %rdi, %rax
+; Inst 3: movq %rdx, %rcx
+; Inst 4: shrq %cl, %rax
+; Inst 5: movq %rsi, %rdi
+; Inst 6: movq %rdx, %rcx
+; Inst 7: sarq %cl, %rdi
+; Inst 8: movl $64, %ecx
+; Inst 9: subq %rdx, %rcx
+; Inst 10: movq %rsi, %r8
+; Inst 11: shlq %cl, %r8
+; Inst 12: xorq %rcx, %rcx
+; Inst 13: testq $127, %rdx
+; Inst 14: cmovzq %rcx, %r8
+; Inst 15: orq %r8, %rax
+; Inst 16: sarq $63, %rsi
+; Inst 17: testq $64, %rdx
+; Inst 18: cmovzq %rdi, %rsi
+; Inst 19: cmovzq %rax, %rdi
+; Inst 20: movq %rdi, %rax
+; Inst 21: movq %rsi, %rdx
+; Inst 22: movq %rbp, %rsp
+; Inst 23: popq %rbp
+; Inst 24: ret
 ; }}
 
 function %f33(i128, i128) -> i128 {
@@ -324,16 +324,16 @@ block0(v0: i32):
 ; (instruction range: 0 .. 15)
 ; Inst 0: pushq %rbp
 ; Inst 1: movq %rsp, %rbp
-; Inst 2: load_const VCodeConstant(0), %xmm0
-; Inst 3: addl $8, %edi
-; Inst 4: movd %edi, %xmm2
-; Inst 5: movdqa %xmm0, %xmm1
-; Inst 6: punpcklbw %xmm1, %xmm1
-; Inst 7: psraw %xmm2, %xmm1
-; Inst 8: punpckhbw %xmm0, %xmm0
+; Inst 2: load_const VCodeConstant(0), %xmm2
+; Inst 3: movdqa %xmm2, %xmm0
+; Inst 4: punpcklbw %xmm2, %xmm0
+; Inst 5: movdqa %xmm2, %xmm1
+; Inst 6: punpckhbw %xmm2, %xmm1
+; Inst 7: addl $8, %edi
+; Inst 8: movd %edi, %xmm2
 ; Inst 9: psraw %xmm2, %xmm0
-; Inst 10: packsswb %xmm0, %xmm1
-; Inst 11: movdqa %xmm1, %xmm0
+; Inst 10: psraw %xmm2, %xmm1
+; Inst 11: packsswb %xmm1, %xmm0
 ; Inst 12: movq %rbp, %rsp
 ; Inst 13: popq %rbp
 ; Inst 14: ret
@@ -349,19 +349,20 @@ block0(v0: i8x16, v1: i32):
 ; Entry block: 0
 ; Block 0:
 ; (original IR block: block0)
-; (instruction range: 0 .. 12)
+; (instruction range: 0 .. 13)
 ; Inst 0: pushq %rbp
 ; Inst 1: movq %rsp, %rbp
 ; Inst 2: movdqa %xmm0, %xmm1
-; Inst 3: movdqa %xmm1, %xmm0
-; Inst 4: punpcklbw %xmm0, %xmm0
-; Inst 5: psraw $11, %xmm0
-; Inst 6: punpckhbw %xmm1, %xmm1
-; Inst 7: psraw $11, %xmm1
-; Inst 8: packsswb %xmm1, %xmm0
-; Inst 9: movq %rbp, %rsp
-; Inst 10: popq %rbp
-; Inst 11: ret
+; Inst 3: punpcklbw %xmm0, %xmm1
+; Inst 4: movdqa %xmm0, %xmm2
+; Inst 5: punpckhbw %xmm0, %xmm2
+; Inst 6: psraw $11, %xmm1
+; Inst 7: psraw $11, %xmm2
+; Inst 8: packsswb %xmm2, %xmm1
+; Inst 9: movdqa %xmm1, %xmm0
+; Inst 10: movq %rbp, %rsp
+; Inst 11: popq %rbp
+; Inst 12: ret
 ; }}
 
 function %sshr_i64x2(i64x2, i32) -> i64x2 {
@@ -374,21 +375,20 @@ block0(v0: i64x2, v1: i32):
 ; Entry block: 0
 ; Block 0:
 ; (original IR block: block0)
-; (instruction range: 0 .. 15)
+; (instruction range: 0 .. 14)
 ; Inst 0: pushq %rbp
 ; Inst 1: movq %rsp, %rbp
-; Inst 2: movdqa %xmm0, %xmm1
-; Inst 3: pextrd.w $0, %xmm0, %rsi
-; Inst 4: pextrd.w $1, %xmm0, %rax
-; Inst 5: movq %rdi, %rcx
-; Inst 6: sarq %cl, %rsi
-; Inst 7: movq %rdi, %rcx
-; Inst 8: sarq %cl, %rax
-; Inst 9: pinsrd.w $0, %rsi, %xmm1
-; Inst 10: pinsrd.w $1, %rax, %xmm1
-; Inst 11: movdqa %xmm1, %xmm0
-; Inst 12: movq %rbp, %rsp
-; Inst 13: popq %rbp
-; Inst 14: ret
+; Inst 2: pextrd.w $0, %xmm0, %rsi
+; Inst 3: pextrd.w $1, %xmm0, %rax
+; Inst 4: movq %rdi, %rcx
+; Inst 5: sarq %cl, %rsi
+; Inst 6: movq %rdi, %rcx
+; Inst 7: sarq %cl, %rax
+; Inst 8: uninit %xmm0
+; Inst 9: pinsrd.w $0, %rsi, %xmm0
+; Inst 10: pinsrd.w $1, %rax, %xmm0
+; Inst 11: movq %rbp, %rsp
+; Inst 12: popq %rbp
+; Inst 13: ret
 ; }}
 
@@ -13,7 +13,7 @@ use std::path::{Path, PathBuf};
 use std::time;
 
 /// Timeout in seconds when we're not making progress.
-const TIMEOUT_PANIC: usize = 10;
+const TIMEOUT_PANIC: usize = 60;
 
 /// Timeout for reporting slow tests without panicking.
 const TIMEOUT_SLOW: usize = 3;