cranelift: Port ishl SIMD lowerings to ISLE (#3686)
This commit is contained in:
@@ -1,4 +1,4 @@
|
|||||||
src/clif.isle f176ef3bba99365
|
src/clif.isle f176ef3bba99365
|
||||||
src/prelude.isle d95510fad2e2473c
|
src/prelude.isle 7b911d3b894ae17
|
||||||
src/isa/aarch64/inst.isle 5fa80451697b084f
|
src/isa/aarch64/inst.isle 5fa80451697b084f
|
||||||
src/isa/aarch64/lower.isle 2d2e1e076a0c8a23
|
src/isa/aarch64/lower.isle 2d2e1e076a0c8a23
|
||||||
|
|||||||
@@ -34,6 +34,7 @@ pub trait Context {
|
|||||||
fn u32_as_u64(&mut self, arg0: u32) -> u64;
|
fn u32_as_u64(&mut self, arg0: u32) -> u64;
|
||||||
fn ty_bits(&mut self, arg0: Type) -> u8;
|
fn ty_bits(&mut self, arg0: Type) -> u8;
|
||||||
fn ty_bits_u16(&mut self, arg0: Type) -> u16;
|
fn ty_bits_u16(&mut self, arg0: Type) -> u16;
|
||||||
|
fn ty_bytes(&mut self, arg0: Type) -> u16;
|
||||||
fn lane_type(&mut self, arg0: Type) -> Type;
|
fn lane_type(&mut self, arg0: Type) -> Type;
|
||||||
fn fits_in_16(&mut self, arg0: Type) -> Option<Type>;
|
fn fits_in_16(&mut self, arg0: Type) -> Option<Type>;
|
||||||
fn fits_in_32(&mut self, arg0: Type) -> Option<Type>;
|
fn fits_in_32(&mut self, arg0: Type) -> Option<Type>;
|
||||||
@@ -91,13 +92,13 @@ pub trait Context {
|
|||||||
fn rotr_opposite_amount(&mut self, arg0: Type, arg1: ImmShift) -> ImmShift;
|
fn rotr_opposite_amount(&mut self, arg0: Type, arg1: ImmShift) -> ImmShift;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Internal type ProducesFlags: defined at src/prelude.isle line 273.
|
/// Internal type ProducesFlags: defined at src/prelude.isle line 277.
|
||||||
#[derive(Clone, Debug)]
|
#[derive(Clone, Debug)]
|
||||||
pub enum ProducesFlags {
|
pub enum ProducesFlags {
|
||||||
ProducesFlags { inst: MInst, result: Reg },
|
ProducesFlags { inst: MInst, result: Reg },
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Internal type ConsumesFlags: defined at src/prelude.isle line 276.
|
/// Internal type ConsumesFlags: defined at src/prelude.isle line 280.
|
||||||
#[derive(Clone, Debug)]
|
#[derive(Clone, Debug)]
|
||||||
pub enum ConsumesFlags {
|
pub enum ConsumesFlags {
|
||||||
ConsumesFlags { inst: MInst, result: Reg },
|
ConsumesFlags { inst: MInst, result: Reg },
|
||||||
@@ -1011,7 +1012,7 @@ pub fn constructor_with_flags<C: Context>(
|
|||||||
result: pattern3_1,
|
result: pattern3_1,
|
||||||
} = pattern2_0
|
} = pattern2_0
|
||||||
{
|
{
|
||||||
// Rule at src/prelude.isle line 286.
|
// Rule at src/prelude.isle line 290.
|
||||||
let expr0_0 = C::emit(ctx, &pattern1_0);
|
let expr0_0 = C::emit(ctx, &pattern1_0);
|
||||||
let expr1_0 = C::emit(ctx, &pattern3_0);
|
let expr1_0 = C::emit(ctx, &pattern3_0);
|
||||||
let expr2_0 = C::value_regs(ctx, pattern1_1, pattern3_1);
|
let expr2_0 = C::value_regs(ctx, pattern1_1, pattern3_1);
|
||||||
@@ -1039,7 +1040,7 @@ pub fn constructor_with_flags_1<C: Context>(
|
|||||||
result: pattern3_1,
|
result: pattern3_1,
|
||||||
} = pattern2_0
|
} = pattern2_0
|
||||||
{
|
{
|
||||||
// Rule at src/prelude.isle line 294.
|
// Rule at src/prelude.isle line 298.
|
||||||
let expr0_0 = C::emit(ctx, &pattern1_0);
|
let expr0_0 = C::emit(ctx, &pattern1_0);
|
||||||
let expr1_0 = C::emit(ctx, &pattern3_0);
|
let expr1_0 = C::emit(ctx, &pattern3_0);
|
||||||
return Some(pattern3_1);
|
return Some(pattern3_1);
|
||||||
@@ -1073,7 +1074,7 @@ pub fn constructor_with_flags_2<C: Context>(
|
|||||||
result: pattern5_1,
|
result: pattern5_1,
|
||||||
} = pattern4_0
|
} = pattern4_0
|
||||||
{
|
{
|
||||||
// Rule at src/prelude.isle line 304.
|
// Rule at src/prelude.isle line 308.
|
||||||
let expr0_0 = C::emit(ctx, &pattern1_0);
|
let expr0_0 = C::emit(ctx, &pattern1_0);
|
||||||
let expr1_0 = C::emit(ctx, &pattern5_0);
|
let expr1_0 = C::emit(ctx, &pattern5_0);
|
||||||
let expr2_0 = C::emit(ctx, &pattern3_0);
|
let expr2_0 = C::emit(ctx, &pattern3_0);
|
||||||
|
|||||||
@@ -54,6 +54,8 @@
|
|||||||
(MovsxRmR (ext_mode ExtMode)
|
(MovsxRmR (ext_mode ExtMode)
|
||||||
(src RegMem)
|
(src RegMem)
|
||||||
(dst WritableReg))
|
(dst WritableReg))
|
||||||
|
(Mov64MR (src SyntheticAmode)
|
||||||
|
(dst WritableReg))
|
||||||
(Cmove (size OperandSize)
|
(Cmove (size OperandSize)
|
||||||
(cc CC)
|
(cc CC)
|
||||||
(consequent RegMem)
|
(consequent RegMem)
|
||||||
@@ -70,6 +72,8 @@
|
|||||||
(Not (size OperandSize)
|
(Not (size OperandSize)
|
||||||
(src Reg)
|
(src Reg)
|
||||||
(dst WritableReg))
|
(dst WritableReg))
|
||||||
|
(LoadEffectiveAddress (addr SyntheticAmode)
|
||||||
|
(dst WritableReg))
|
||||||
))
|
))
|
||||||
|
|
||||||
(type OperandSize extern
|
(type OperandSize extern
|
||||||
@@ -318,6 +322,17 @@
|
|||||||
|
|
||||||
(type SyntheticAmode extern (enum))
|
(type SyntheticAmode extern (enum))
|
||||||
|
|
||||||
|
(decl synthetic_amode_to_reg_mem (SyntheticAmode) RegMem)
|
||||||
|
(extern constructor synthetic_amode_to_reg_mem synthetic_amode_to_reg_mem)
|
||||||
|
|
||||||
|
(type Amode extern (enum))
|
||||||
|
|
||||||
|
(decl amode_imm_reg_reg_shift (u32 Reg Reg u8) Amode)
|
||||||
|
(extern constructor amode_imm_reg_reg_shift amode_imm_reg_reg_shift)
|
||||||
|
|
||||||
|
(decl amode_to_synthetic_amode (Amode) SyntheticAmode)
|
||||||
|
(extern constructor amode_to_synthetic_amode amode_to_synthetic_amode)
|
||||||
|
|
||||||
(type ShiftKind extern
|
(type ShiftKind extern
|
||||||
(enum ShiftLeft
|
(enum ShiftLeft
|
||||||
ShiftRightLogical
|
ShiftRightLogical
|
||||||
@@ -438,6 +453,11 @@
|
|||||||
|
|
||||||
;;;; Helpers for Sign/Zero Extending ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;; Helpers for Sign/Zero Extending ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
|
||||||
|
(type ExtKind extern
|
||||||
|
(enum None
|
||||||
|
SignExtend
|
||||||
|
ZeroExtend))
|
||||||
|
|
||||||
(type ExtendKind (enum Sign Zero))
|
(type ExtendKind (enum Sign Zero))
|
||||||
|
|
||||||
(type ExtMode extern (enum BL BQ WL WQ LQ))
|
(type ExtMode extern (enum BL BQ WL WQ LQ))
|
||||||
@@ -549,6 +569,40 @@
|
|||||||
(RegMem.Reg r)
|
(RegMem.Reg r)
|
||||||
(OperandSize.Size32))))
|
(OperandSize.Size32))))
|
||||||
|
|
||||||
|
;;;; Helpers for Emitting Loads ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
|
||||||
|
;; Load a value into a register.
|
||||||
|
(decl x64_load (Type SyntheticAmode ExtKind) Reg)
|
||||||
|
|
||||||
|
(rule (x64_load (fits_in_32 ty) addr (ExtKind.SignExtend))
|
||||||
|
(movsx ty
|
||||||
|
(ext_mode (ty_bytes ty) 8)
|
||||||
|
(synthetic_amode_to_reg_mem addr)))
|
||||||
|
|
||||||
|
(rule (x64_load $I64 addr _ext_kind)
|
||||||
|
(let ((dst WritableReg (temp_writable_reg $I64))
|
||||||
|
(_ Unit (emit (MInst.Mov64MR addr dst))))
|
||||||
|
(writable_reg_to_reg dst)))
|
||||||
|
|
||||||
|
(rule (x64_load $F32 addr _ext_kind)
|
||||||
|
(xmm_unary_rm_r (SseOpcode.Movss)
|
||||||
|
(synthetic_amode_to_reg_mem addr)))
|
||||||
|
|
||||||
|
(rule (x64_load $F64 addr _ext_kind)
|
||||||
|
(xmm_unary_rm_r (SseOpcode.Movsd)
|
||||||
|
(synthetic_amode_to_reg_mem addr)))
|
||||||
|
|
||||||
|
(rule (x64_load $F32X4 addr _ext_kind)
|
||||||
|
(xmm_unary_rm_r (SseOpcode.Movups)
|
||||||
|
(synthetic_amode_to_reg_mem addr)))
|
||||||
|
|
||||||
|
(rule (x64_load $F64X2 addr _ext_kind)
|
||||||
|
(xmm_unary_rm_r (SseOpcode.Movupd)
|
||||||
|
(synthetic_amode_to_reg_mem addr)))
|
||||||
|
|
||||||
|
(rule (x64_load (multi_lane _bits _lanes) addr _ext_kind)
|
||||||
|
(xmm_unary_rm_r (SseOpcode.Movdqu)
|
||||||
|
(synthetic_amode_to_reg_mem addr)))
|
||||||
|
|
||||||
;;;; Instruction Constructors ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;; Instruction Constructors ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
;;
|
;;
|
||||||
@@ -1236,6 +1290,16 @@
|
|||||||
dst))))
|
dst))))
|
||||||
(writable_reg_to_reg dst)))
|
(writable_reg_to_reg dst)))
|
||||||
|
|
||||||
|
;; Helper for creating `psllw` instructions.
|
||||||
|
(decl psllw (Reg RegMemImm) Reg)
|
||||||
|
(rule (psllw src1 src2)
|
||||||
|
(xmm_rmi_reg (SseOpcode.Psllw) src1 src2))
|
||||||
|
|
||||||
|
;; Helper for creating `pslld` instructions.
|
||||||
|
(decl pslld (Reg RegMemImm) Reg)
|
||||||
|
(rule (pslld src1 src2)
|
||||||
|
(xmm_rmi_reg (SseOpcode.Pslld) src1 src2))
|
||||||
|
|
||||||
;; Helper for creating `psllq` instructions.
|
;; Helper for creating `psllq` instructions.
|
||||||
(decl psllq (Reg RegMemImm) Reg)
|
(decl psllq (Reg RegMemImm) Reg)
|
||||||
(rule (psllq src1 src2)
|
(rule (psllq src1 src2)
|
||||||
@@ -1353,3 +1417,9 @@
|
|||||||
(size OperandSize (operand_size_of_type_32_64 ty))
|
(size OperandSize (operand_size_of_type_32_64 ty))
|
||||||
(_ Unit (emit (MInst.Not size src dst))))
|
(_ Unit (emit (MInst.Not size src dst))))
|
||||||
(writable_reg_to_reg dst)))
|
(writable_reg_to_reg dst)))
|
||||||
|
|
||||||
|
(decl lea (SyntheticAmode) Reg)
|
||||||
|
(rule (lea addr)
|
||||||
|
(let ((dst WritableReg (temp_writable_reg $I64))
|
||||||
|
(_ Unit (emit (MInst.LoadEffectiveAddress addr dst))))
|
||||||
|
(writable_reg_to_reg dst)))
|
||||||
|
|||||||
@@ -3053,6 +3053,12 @@ impl MachInst for Inst {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn gen_move(dst_reg: Writable<Reg>, src_reg: Reg, ty: Type) -> Inst {
|
fn gen_move(dst_reg: Writable<Reg>, src_reg: Reg, ty: Type) -> Inst {
|
||||||
|
log::trace!(
|
||||||
|
"Inst::gen_move {:?} -> {:?} (type: {:?})",
|
||||||
|
src_reg,
|
||||||
|
dst_reg.to_reg(),
|
||||||
|
ty
|
||||||
|
);
|
||||||
let rc_dst = dst_reg.to_reg().get_class();
|
let rc_dst = dst_reg.to_reg().get_class();
|
||||||
let rc_src = src_reg.get_class();
|
let rc_src = src_reg.get_class();
|
||||||
// If this isn't true, we have gone way off the rails.
|
// If this isn't true, we have gone way off the rails.
|
||||||
|
|||||||
@@ -571,6 +571,67 @@
|
|||||||
(let ((amt_ Reg (lo_reg amt)))
|
(let ((amt_ Reg (lo_reg amt)))
|
||||||
(shl_i128 (put_in_regs src) amt_)))
|
(shl_i128 (put_in_regs src) amt_)))
|
||||||
|
|
||||||
|
;; SSE.
|
||||||
|
|
||||||
|
;; Since the x86 instruction set does not have any 8x16 shift instructions (even
|
||||||
|
;; in higher feature sets like AVX), we lower the `ishl.i8x16` to a sequence of
|
||||||
|
;; instructions. The basic idea, whether the amount to shift by is an immediate
|
||||||
|
;; or not, is to use a 16x8 shift and then mask off the incorrect bits to 0s.
|
||||||
|
(rule (lower (has_type $I8X16 (ishl src amt)))
|
||||||
|
(let ((src_ Reg (put_in_reg src))
|
||||||
|
(amt_gpr RegMemImm (put_in_reg_mem_imm amt))
|
||||||
|
(amt_xmm RegMemImm (reg_mem_imm_to_xmm amt_gpr))
|
||||||
|
;; Shift `src` using 16x8. Unfortunately, a 16x8 shift will only be
|
||||||
|
;; correct for half of the lanes; the others must be fixed up with
|
||||||
|
;; the mask below.
|
||||||
|
(unmasked Reg (psllw src_ amt_xmm))
|
||||||
|
(mask_addr SyntheticAmode (ishl_i8x16_mask amt_gpr))
|
||||||
|
(mask Reg (x64_load $I8X16 mask_addr (ExtKind.None))))
|
||||||
|
(value_reg (sse_and $I8X16 unmasked (RegMem.Reg mask)))))
|
||||||
|
|
||||||
|
;; Get the address of the mask to use when fixing up the lanes that weren't
|
||||||
|
;; correctly generated by the 16x8 shift.
|
||||||
|
(decl ishl_i8x16_mask (RegMemImm) SyntheticAmode)
|
||||||
|
|
||||||
|
;; When the shift amount is known, we can statically (i.e. at compile time)
|
||||||
|
;; determine the mask to use and only emit that.
|
||||||
|
(rule (ishl_i8x16_mask (RegMemImm.Imm amt))
|
||||||
|
(ishl_i8x16_mask_for_const amt))
|
||||||
|
|
||||||
|
;; Otherwise, we must emit the entire mask table and dynamically (i.e. at run
|
||||||
|
;; time) find the correct mask offset in the table. We do this by using `lea` to find
|
||||||
|
;; the base address of the mask table and then complex addressing to offset to
|
||||||
|
;; the right mask: `base_address + amt << 4`
|
||||||
|
(rule (ishl_i8x16_mask (RegMemImm.Reg amt))
|
||||||
|
(let ((mask_table SyntheticAmode (ishl_i8x16_mask_table))
|
||||||
|
(base_mask_addr Reg (lea mask_table))
|
||||||
|
(mask_offset Reg (shl $I64 amt (Imm8Reg.Imm8 4))))
|
||||||
|
(amode_to_synthetic_amode (amode_imm_reg_reg_shift 0
|
||||||
|
base_mask_addr
|
||||||
|
mask_offset
|
||||||
|
0))))
|
||||||
|
(rule (ishl_i8x16_mask (RegMemImm.Mem amt))
|
||||||
|
(ishl_i8x16_mask (RegMemImm.Reg (x64_load $I64 amt (ExtKind.None)))))
|
||||||
|
|
||||||
|
;; Get the address of the mask for a constant 8x16 shift amount.
|
||||||
|
(decl ishl_i8x16_mask_for_const (u32) SyntheticAmode)
|
||||||
|
(extern constructor ishl_i8x16_mask_for_const ishl_i8x16_mask_for_const)
|
||||||
|
|
||||||
|
;; Get the address of the mask table for a dynamic 8x16 shift amount.
|
||||||
|
(decl ishl_i8x16_mask_table () SyntheticAmode)
|
||||||
|
(extern constructor ishl_i8x16_mask_table ishl_i8x16_mask_table)
|
||||||
|
|
||||||
|
;; 16x8, 32x4, and 64x2 shifts can each use a single instruction.
|
||||||
|
(rule (lower (has_type $I16X8 (ishl src amt)))
|
||||||
|
(value_reg (psllw (put_in_reg src)
|
||||||
|
(reg_mem_imm_to_xmm (put_in_reg_mem_imm amt)))))
|
||||||
|
(rule (lower (has_type $I32X4 (ishl src amt)))
|
||||||
|
(value_reg (pslld (put_in_reg src)
|
||||||
|
(reg_mem_imm_to_xmm (put_in_reg_mem_imm amt)))))
|
||||||
|
(rule (lower (has_type $I64X2 (ishl src amt)))
|
||||||
|
(value_reg (psllq (put_in_reg src)
|
||||||
|
(reg_mem_imm_to_xmm (put_in_reg_mem_imm amt)))))
|
||||||
|
|
||||||
;;;; Rules for `ushr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;; Rules for `ushr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
|
||||||
;; `i64` and smaller.
|
;; `i64` and smaller.
|
||||||
|
|||||||
@@ -1539,9 +1539,10 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
| Opcode::Bnot
|
| Opcode::Bnot
|
||||||
| Opcode::Bitselect
|
| Opcode::Bitselect
|
||||||
| Opcode::Vselect
|
| Opcode::Vselect
|
||||||
| Opcode::Sshr => implemented_in_isle(ctx),
|
| Opcode::Sshr
|
||||||
|
| Opcode::Ishl => implemented_in_isle(ctx),
|
||||||
|
|
||||||
Opcode::Ishl | Opcode::Ushr | Opcode::Rotl | Opcode::Rotr => {
|
Opcode::Ushr | Opcode::Rotl | Opcode::Rotr => {
|
||||||
let dst_ty = ctx.output_ty(insn, 0);
|
let dst_ty = ctx.output_ty(insn, 0);
|
||||||
debug_assert_eq!(ctx.input_ty(insn, 0), dst_ty);
|
debug_assert_eq!(ctx.input_ty(insn, 0), dst_ty);
|
||||||
|
|
||||||
@@ -1557,7 +1558,6 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
// This implementation uses the last two encoding methods.
|
// This implementation uses the last two encoding methods.
|
||||||
let (size, lhs) = match dst_ty {
|
let (size, lhs) = match dst_ty {
|
||||||
types::I8 | types::I16 => match op {
|
types::I8 | types::I16 => match op {
|
||||||
Opcode::Ishl => (OperandSize::Size32, put_input_in_reg(ctx, inputs[0])),
|
|
||||||
Opcode::Ushr => (
|
Opcode::Ushr => (
|
||||||
OperandSize::Size32,
|
OperandSize::Size32,
|
||||||
extend_input_to_reg(ctx, inputs[0], ExtSpec::ZeroExtendTo32),
|
extend_input_to_reg(ctx, inputs[0], ExtSpec::ZeroExtendTo32),
|
||||||
@@ -1589,7 +1589,6 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
|
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
|
||||||
|
|
||||||
let shift_kind = match op {
|
let shift_kind = match op {
|
||||||
Opcode::Ishl => ShiftKind::ShiftLeft,
|
|
||||||
Opcode::Ushr => ShiftKind::ShiftRightLogical,
|
Opcode::Ushr => ShiftKind::ShiftRightLogical,
|
||||||
Opcode::Rotl => ShiftKind::RotateLeft,
|
Opcode::Rotl => ShiftKind::RotateLeft,
|
||||||
Opcode::Rotr => ShiftKind::RotateRight,
|
Opcode::Rotr => ShiftKind::RotateRight,
|
||||||
@@ -1608,7 +1607,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
let dst = get_output_reg(ctx, outputs[0]);
|
let dst = get_output_reg(ctx, outputs[0]);
|
||||||
|
|
||||||
match op {
|
match op {
|
||||||
Opcode::Ishl | Opcode::Ushr | Opcode::Rotl => {
|
Opcode::Ushr | Opcode::Rotl => {
|
||||||
implemented_in_isle(ctx);
|
implemented_in_isle(ctx);
|
||||||
}
|
}
|
||||||
Opcode::Rotr => {
|
Opcode::Rotr => {
|
||||||
@@ -1643,7 +1642,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
}
|
}
|
||||||
_ => unreachable!(),
|
_ => unreachable!(),
|
||||||
}
|
}
|
||||||
} else if dst_ty == types::I8X16 && (op == Opcode::Ishl || op == Opcode::Ushr) {
|
} else if dst_ty == types::I8X16 && op == Opcode::Ushr {
|
||||||
// Since the x86 instruction set does not have any 8x16 shift instructions (even in higher feature sets
|
// Since the x86 instruction set does not have any 8x16 shift instructions (even in higher feature sets
|
||||||
// like AVX), we lower the `ishl.i8x16` and `ushr.i8x16` to a sequence of instructions. The basic idea,
|
// like AVX), we lower the `ishl.i8x16` and `ushr.i8x16` to a sequence of instructions. The basic idea,
|
||||||
// whether the `shift_by` amount is an immediate or not, is to use a 16x8 shift and then mask off the
|
// whether the `shift_by` amount is an immediate or not, is to use a 16x8 shift and then mask off the
|
||||||
@@ -1671,7 +1670,6 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
// Shift `src` using 16x8. Unfortunately, a 16x8 shift will only be correct for half of the lanes;
|
// Shift `src` using 16x8. Unfortunately, a 16x8 shift will only be correct for half of the lanes;
|
||||||
// the others must be fixed up with the mask below.
|
// the others must be fixed up with the mask below.
|
||||||
let shift_opcode = match op {
|
let shift_opcode = match op {
|
||||||
Opcode::Ishl => SseOpcode::Psllw,
|
|
||||||
Opcode::Ushr => SseOpcode::Psrlw,
|
Opcode::Ushr => SseOpcode::Psrlw,
|
||||||
_ => unimplemented!("{} is not implemented for type {}", op, dst_ty),
|
_ => unimplemented!("{} is not implemented for type {}", op, dst_ty),
|
||||||
};
|
};
|
||||||
@@ -1695,20 +1693,8 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x01, 0x01, 0x01, 0x01, 0x01,
|
0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x01, 0x01, 0x01, 0x01, 0x01,
|
||||||
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
|
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
|
||||||
];
|
];
|
||||||
const SHL_MASKS: [u8; 128] = [
|
|
||||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
|
||||||
0xff, 0xff, 0xff, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe,
|
|
||||||
0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc,
|
|
||||||
0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xf8, 0xf8, 0xf8, 0xf8,
|
|
||||||
0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf0,
|
|
||||||
0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
|
|
||||||
0xf0, 0xf0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0,
|
|
||||||
0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0,
|
|
||||||
0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
||||||
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
|
||||||
];
|
|
||||||
let mask = match op {
|
let mask = match op {
|
||||||
Opcode::Ishl => &SHL_MASKS,
|
|
||||||
Opcode::Ushr => &USHR_MASKS,
|
Opcode::Ushr => &USHR_MASKS,
|
||||||
_ => unimplemented!("{} is not implemented for type {}", op, dst_ty),
|
_ => unimplemented!("{} is not implemented for type {}", op, dst_ty),
|
||||||
};
|
};
|
||||||
@@ -1775,17 +1761,14 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
|
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
|
||||||
let sse_op = match dst_ty {
|
let sse_op = match dst_ty {
|
||||||
types::I16X8 => match op {
|
types::I16X8 => match op {
|
||||||
Opcode::Ishl => SseOpcode::Psllw,
|
|
||||||
Opcode::Ushr => SseOpcode::Psrlw,
|
Opcode::Ushr => SseOpcode::Psrlw,
|
||||||
_ => unimplemented!("{} is not implemented for type {}", op, dst_ty),
|
_ => unimplemented!("{} is not implemented for type {}", op, dst_ty),
|
||||||
},
|
},
|
||||||
types::I32X4 => match op {
|
types::I32X4 => match op {
|
||||||
Opcode::Ishl => SseOpcode::Pslld,
|
|
||||||
Opcode::Ushr => SseOpcode::Psrld,
|
Opcode::Ushr => SseOpcode::Psrld,
|
||||||
_ => unimplemented!("{} is not implemented for type {}", op, dst_ty),
|
_ => unimplemented!("{} is not implemented for type {}", op, dst_ty),
|
||||||
},
|
},
|
||||||
types::I64X2 => match op {
|
types::I64X2 => match op {
|
||||||
Opcode::Ishl => SseOpcode::Psllq,
|
|
||||||
Opcode::Ushr => SseOpcode::Psrlq,
|
Opcode::Ushr => SseOpcode::Psrlq,
|
||||||
_ => unimplemented!("{} is not implemented for type {}", op, dst_ty),
|
_ => unimplemented!("{} is not implemented for type {}", op, dst_ty),
|
||||||
},
|
},
|
||||||
|
|||||||
@@ -7,19 +7,19 @@ mod generated_code;
|
|||||||
use super::{
|
use super::{
|
||||||
is_mergeable_load, lower_to_amode, AluRmiROpcode, Inst as MInst, OperandSize, Reg, RegMemImm,
|
is_mergeable_load, lower_to_amode, AluRmiROpcode, Inst as MInst, OperandSize, Reg, RegMemImm,
|
||||||
};
|
};
|
||||||
use crate::isa::x64::inst::args::SyntheticAmode;
|
|
||||||
use crate::isa::x64::inst::regs;
|
|
||||||
use crate::isa::x64::settings::Flags;
|
|
||||||
use crate::machinst::isle::*;
|
|
||||||
use crate::{
|
use crate::{
|
||||||
ir::{immediates::*, types::*, Inst, InstructionData, Opcode, TrapCode, Value, ValueList},
|
ir::{immediates::*, types::*, Inst, InstructionData, Opcode, TrapCode, Value, ValueList},
|
||||||
isa::x64::inst::{
|
isa::x64::{
|
||||||
|
inst::{
|
||||||
args::{
|
args::{
|
||||||
Avx512Opcode, CmpOpcode, ExtMode, FcmpImm, Imm8Reg, RegMem, ShiftKind, SseOpcode, CC,
|
Amode, Avx512Opcode, CmpOpcode, ExtKind, ExtMode, FcmpImm, Imm8Reg, RegMem,
|
||||||
|
ShiftKind, SseOpcode, SyntheticAmode, CC,
|
||||||
},
|
},
|
||||||
x64_map_regs,
|
regs, x64_map_regs,
|
||||||
},
|
},
|
||||||
machinst::{InsnInput, InsnOutput, LowerCtx},
|
settings::Flags,
|
||||||
|
},
|
||||||
|
machinst::{isle::*, InsnInput, InsnOutput, LowerCtx, VCodeConstantData},
|
||||||
};
|
};
|
||||||
use std::convert::TryFrom;
|
use std::convert::TryFrom;
|
||||||
|
|
||||||
@@ -248,8 +248,59 @@ where
|
|||||||
fn xmm0(&mut self) -> WritableReg {
|
fn xmm0(&mut self) -> WritableReg {
|
||||||
WritableReg::from_reg(regs::xmm0())
|
WritableReg::from_reg(regs::xmm0())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn synthetic_amode_to_reg_mem(&mut self, addr: &SyntheticAmode) -> RegMem {
|
||||||
|
RegMem::mem(addr.clone())
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn amode_imm_reg_reg_shift(&mut self, simm32: u32, base: Reg, index: Reg, shift: u8) -> Amode {
|
||||||
|
Amode::imm_reg_reg_shift(simm32, base, index, shift)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn amode_to_synthetic_amode(&mut self, amode: &Amode) -> SyntheticAmode {
|
||||||
|
amode.clone().into()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn ishl_i8x16_mask_for_const(&mut self, amt: u32) -> SyntheticAmode {
|
||||||
|
// When the shift amount is known, we can statically (i.e. at compile
|
||||||
|
// time) determine the mask to use and only emit that.
|
||||||
|
debug_assert!(amt < 8);
|
||||||
|
let mask_offset = amt as usize * 16;
|
||||||
|
let mask_constant = self.lower_ctx.use_constant(VCodeConstantData::WellKnown(
|
||||||
|
&I8X16_SHL_MASKS[mask_offset..mask_offset + 16],
|
||||||
|
));
|
||||||
|
SyntheticAmode::ConstantOffset(mask_constant)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn ishl_i8x16_mask_table(&mut self) -> SyntheticAmode {
|
||||||
|
let mask_table = self
|
||||||
|
.lower_ctx
|
||||||
|
.use_constant(VCodeConstantData::WellKnown(&I8X16_SHL_MASKS));
|
||||||
|
SyntheticAmode::ConstantOffset(mask_table)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Since x64 doesn't have 8x16 shifts and we must use a 16x8 shift instead, we
|
||||||
|
// need to fix up the bits that migrate from one half of the lane to the
|
||||||
|
// other. Each 16-byte mask is indexed by the shift amount: e.g. if we shift
|
||||||
|
left by 0 (no movement), we want to retain all the bits so we mask with
|
||||||
|
// `0xff`; if we shift left by 1, we want to retain all bits except the LSB so
|
||||||
|
// we mask with `0xfe`; etc.
|
||||||
|
#[rustfmt::skip] // Preserve 16 bytes (i.e. one mask) per row.
|
||||||
|
const I8X16_SHL_MASKS: [u8; 128] = [
|
||||||
|
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||||
|
0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe,
|
||||||
|
0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc,
|
||||||
|
0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8,
|
||||||
|
0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
|
||||||
|
0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0,
|
||||||
|
0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0,
|
||||||
|
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
|
||||||
|
];
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
fn to_simm32(constant: i64) -> Option<RegMemImm> {
|
fn to_simm32(constant: i64) -> Option<RegMemImm> {
|
||||||
if constant == ((constant << 32) >> 32) {
|
if constant == ((constant << 32) >> 32) {
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
src/clif.isle f176ef3bba99365
|
src/clif.isle f176ef3bba99365
|
||||||
src/prelude.isle d95510fad2e2473c
|
src/prelude.isle 7b911d3b894ae17
|
||||||
src/isa/x64/inst.isle c16462cc359dd466
|
src/isa/x64/inst.isle dbfa857f7f2c5d9f
|
||||||
src/isa/x64/lower.isle 9f761598e3949e8e
|
src/isa/x64/lower.isle 5a737854091e1189
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -100,6 +100,11 @@ macro_rules! isle_prelude_methods {
|
|||||||
ty.bits()
|
ty.bits()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn ty_bytes(&mut self, ty: Type) -> u16 {
|
||||||
|
u16::try_from(ty.bytes()).unwrap()
|
||||||
|
}
|
||||||
|
|
||||||
fn fits_in_16(&mut self, ty: Type) -> Option<Type> {
|
fn fits_in_16(&mut self, ty: Type) -> Option<Type> {
|
||||||
if ty.bits() <= 16 {
|
if ty.bits() <= 16 {
|
||||||
Some(ty)
|
Some(ty)
|
||||||
|
|||||||
@@ -152,6 +152,10 @@
|
|||||||
(decl ty_bits_u16 (Type) u16)
|
(decl ty_bits_u16 (Type) u16)
|
||||||
(extern constructor ty_bits_u16 ty_bits_u16)
|
(extern constructor ty_bits_u16 ty_bits_u16)
|
||||||
|
|
||||||
|
;; Get the byte width of a given type.
|
||||||
|
(decl ty_bytes (Type) u16)
|
||||||
|
(extern constructor ty_bytes ty_bytes)
|
||||||
|
|
||||||
;; Get the type of each lane in the given type.
|
;; Get the type of each lane in the given type.
|
||||||
(decl lane_type (Type) Type)
|
(decl lane_type (Type) Type)
|
||||||
(extern constructor lane_type lane_type)
|
(extern constructor lane_type lane_type)
|
||||||
|
|||||||
Reference in New Issue
Block a user