cranelift: Port ishl SIMD lowerings to ISLE (#3686)

This commit is contained in:
Nick Fitzgerald
2022-01-13 07:34:37 -08:00
committed by GitHub
parent 13f17db297
commit a7dba81c1d
11 changed files with 698 additions and 296 deletions

View File

@@ -1,4 +1,4 @@
src/clif.isle f176ef3bba99365 src/clif.isle f176ef3bba99365
src/prelude.isle d95510fad2e2473c src/prelude.isle 7b911d3b894ae17
src/isa/aarch64/inst.isle 5fa80451697b084f src/isa/aarch64/inst.isle 5fa80451697b084f
src/isa/aarch64/lower.isle 2d2e1e076a0c8a23 src/isa/aarch64/lower.isle 2d2e1e076a0c8a23

View File

@@ -34,6 +34,7 @@ pub trait Context {
fn u32_as_u64(&mut self, arg0: u32) -> u64; fn u32_as_u64(&mut self, arg0: u32) -> u64;
fn ty_bits(&mut self, arg0: Type) -> u8; fn ty_bits(&mut self, arg0: Type) -> u8;
fn ty_bits_u16(&mut self, arg0: Type) -> u16; fn ty_bits_u16(&mut self, arg0: Type) -> u16;
fn ty_bytes(&mut self, arg0: Type) -> u16;
fn lane_type(&mut self, arg0: Type) -> Type; fn lane_type(&mut self, arg0: Type) -> Type;
fn fits_in_16(&mut self, arg0: Type) -> Option<Type>; fn fits_in_16(&mut self, arg0: Type) -> Option<Type>;
fn fits_in_32(&mut self, arg0: Type) -> Option<Type>; fn fits_in_32(&mut self, arg0: Type) -> Option<Type>;
@@ -91,13 +92,13 @@ pub trait Context {
fn rotr_opposite_amount(&mut self, arg0: Type, arg1: ImmShift) -> ImmShift; fn rotr_opposite_amount(&mut self, arg0: Type, arg1: ImmShift) -> ImmShift;
} }
/// Internal type ProducesFlags: defined at src/prelude.isle line 273. /// Internal type ProducesFlags: defined at src/prelude.isle line 277.
#[derive(Clone, Debug)] #[derive(Clone, Debug)]
pub enum ProducesFlags { pub enum ProducesFlags {
ProducesFlags { inst: MInst, result: Reg }, ProducesFlags { inst: MInst, result: Reg },
} }
/// Internal type ConsumesFlags: defined at src/prelude.isle line 276. /// Internal type ConsumesFlags: defined at src/prelude.isle line 280.
#[derive(Clone, Debug)] #[derive(Clone, Debug)]
pub enum ConsumesFlags { pub enum ConsumesFlags {
ConsumesFlags { inst: MInst, result: Reg }, ConsumesFlags { inst: MInst, result: Reg },
@@ -1011,7 +1012,7 @@ pub fn constructor_with_flags<C: Context>(
result: pattern3_1, result: pattern3_1,
} = pattern2_0 } = pattern2_0
{ {
// Rule at src/prelude.isle line 286. // Rule at src/prelude.isle line 290.
let expr0_0 = C::emit(ctx, &pattern1_0); let expr0_0 = C::emit(ctx, &pattern1_0);
let expr1_0 = C::emit(ctx, &pattern3_0); let expr1_0 = C::emit(ctx, &pattern3_0);
let expr2_0 = C::value_regs(ctx, pattern1_1, pattern3_1); let expr2_0 = C::value_regs(ctx, pattern1_1, pattern3_1);
@@ -1039,7 +1040,7 @@ pub fn constructor_with_flags_1<C: Context>(
result: pattern3_1, result: pattern3_1,
} = pattern2_0 } = pattern2_0
{ {
// Rule at src/prelude.isle line 294. // Rule at src/prelude.isle line 298.
let expr0_0 = C::emit(ctx, &pattern1_0); let expr0_0 = C::emit(ctx, &pattern1_0);
let expr1_0 = C::emit(ctx, &pattern3_0); let expr1_0 = C::emit(ctx, &pattern3_0);
return Some(pattern3_1); return Some(pattern3_1);
@@ -1073,7 +1074,7 @@ pub fn constructor_with_flags_2<C: Context>(
result: pattern5_1, result: pattern5_1,
} = pattern4_0 } = pattern4_0
{ {
// Rule at src/prelude.isle line 304. // Rule at src/prelude.isle line 308.
let expr0_0 = C::emit(ctx, &pattern1_0); let expr0_0 = C::emit(ctx, &pattern1_0);
let expr1_0 = C::emit(ctx, &pattern5_0); let expr1_0 = C::emit(ctx, &pattern5_0);
let expr2_0 = C::emit(ctx, &pattern3_0); let expr2_0 = C::emit(ctx, &pattern3_0);

View File

@@ -54,6 +54,8 @@
(MovsxRmR (ext_mode ExtMode) (MovsxRmR (ext_mode ExtMode)
(src RegMem) (src RegMem)
(dst WritableReg)) (dst WritableReg))
(Mov64MR (src SyntheticAmode)
(dst WritableReg))
(Cmove (size OperandSize) (Cmove (size OperandSize)
(cc CC) (cc CC)
(consequent RegMem) (consequent RegMem)
@@ -70,6 +72,8 @@
(Not (size OperandSize) (Not (size OperandSize)
(src Reg) (src Reg)
(dst WritableReg)) (dst WritableReg))
(LoadEffectiveAddress (addr SyntheticAmode)
(dst WritableReg))
)) ))
(type OperandSize extern (type OperandSize extern
@@ -318,6 +322,17 @@
(type SyntheticAmode extern (enum)) (type SyntheticAmode extern (enum))
(decl synthetic_amode_to_reg_mem (SyntheticAmode) RegMem)
(extern constructor synthetic_amode_to_reg_mem synthetic_amode_to_reg_mem)
(type Amode extern (enum))
(decl amode_imm_reg_reg_shift (u32 Reg Reg u8) Amode)
(extern constructor amode_imm_reg_reg_shift amode_imm_reg_reg_shift)
(decl amode_to_synthetic_amode (Amode) SyntheticAmode)
(extern constructor amode_to_synthetic_amode amode_to_synthetic_amode)
(type ShiftKind extern (type ShiftKind extern
(enum ShiftLeft (enum ShiftLeft
ShiftRightLogical ShiftRightLogical
@@ -438,6 +453,11 @@
;;;; Helpers for Sign/Zero Extending ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;; Helpers for Sign/Zero Extending ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(type ExtKind extern
(enum None
SignExtend
ZeroExtend))
(type ExtendKind (enum Sign Zero)) (type ExtendKind (enum Sign Zero))
(type ExtMode extern (enum BL BQ WL WQ LQ)) (type ExtMode extern (enum BL BQ WL WQ LQ))
@@ -549,6 +569,40 @@
(RegMem.Reg r) (RegMem.Reg r)
(OperandSize.Size32)))) (OperandSize.Size32))))
;;;; Helpers for Emitting Loads ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Load a value into a register.
(decl x64_load (Type SyntheticAmode ExtKind) Reg)
(rule (x64_load (fits_in_32 ty) addr (ExtKind.SignExtend))
(movsx ty
(ext_mode (ty_bytes ty) 8)
(synthetic_amode_to_reg_mem addr)))
(rule (x64_load $I64 addr _ext_kind)
(let ((dst WritableReg (temp_writable_reg $I64))
(_ Unit (emit (MInst.Mov64MR addr dst))))
(writable_reg_to_reg dst)))
(rule (x64_load $F32 addr _ext_kind)
(xmm_unary_rm_r (SseOpcode.Movss)
(synthetic_amode_to_reg_mem addr)))
(rule (x64_load $F64 addr _ext_kind)
(xmm_unary_rm_r (SseOpcode.Movsd)
(synthetic_amode_to_reg_mem addr)))
(rule (x64_load $F32X4 addr _ext_kind)
(xmm_unary_rm_r (SseOpcode.Movups)
(synthetic_amode_to_reg_mem addr)))
(rule (x64_load $F64X2 addr _ext_kind)
(xmm_unary_rm_r (SseOpcode.Movupd)
(synthetic_amode_to_reg_mem addr)))
(rule (x64_load (multi_lane _bits _lanes) addr _ext_kind)
(xmm_unary_rm_r (SseOpcode.Movdqu)
(synthetic_amode_to_reg_mem addr)))
;;;; Instruction Constructors ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;; Instruction Constructors ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; ;;
@@ -1236,6 +1290,16 @@
dst)))) dst))))
(writable_reg_to_reg dst))) (writable_reg_to_reg dst)))
;; Helper for creating `psllw` instructions.
(decl psllw (Reg RegMemImm) Reg)
(rule (psllw src1 src2)
(xmm_rmi_reg (SseOpcode.Psllw) src1 src2))
;; Helper for creating `pslld` instructions.
(decl pslld (Reg RegMemImm) Reg)
(rule (pslld src1 src2)
(xmm_rmi_reg (SseOpcode.Pslld) src1 src2))
;; Helper for creating `psllq` instructions. ;; Helper for creating `psllq` instructions.
(decl psllq (Reg RegMemImm) Reg) (decl psllq (Reg RegMemImm) Reg)
(rule (psllq src1 src2) (rule (psllq src1 src2)
@@ -1353,3 +1417,9 @@
(size OperandSize (operand_size_of_type_32_64 ty)) (size OperandSize (operand_size_of_type_32_64 ty))
(_ Unit (emit (MInst.Not size src dst)))) (_ Unit (emit (MInst.Not size src dst))))
(writable_reg_to_reg dst))) (writable_reg_to_reg dst)))
(decl lea (SyntheticAmode) Reg)
(rule (lea addr)
(let ((dst WritableReg (temp_writable_reg $I64))
(_ Unit (emit (MInst.LoadEffectiveAddress addr dst))))
(writable_reg_to_reg dst)))

View File

@@ -3053,6 +3053,12 @@ impl MachInst for Inst {
} }
fn gen_move(dst_reg: Writable<Reg>, src_reg: Reg, ty: Type) -> Inst { fn gen_move(dst_reg: Writable<Reg>, src_reg: Reg, ty: Type) -> Inst {
log::trace!(
"Inst::gen_move {:?} -> {:?} (type: {:?})",
src_reg,
dst_reg.to_reg(),
ty
);
let rc_dst = dst_reg.to_reg().get_class(); let rc_dst = dst_reg.to_reg().get_class();
let rc_src = src_reg.get_class(); let rc_src = src_reg.get_class();
// If this isn't true, we have gone way off the rails. // If this isn't true, we have gone way off the rails.

View File

@@ -571,6 +571,67 @@
(let ((amt_ Reg (lo_reg amt))) (let ((amt_ Reg (lo_reg amt)))
(shl_i128 (put_in_regs src) amt_))) (shl_i128 (put_in_regs src) amt_)))
;; SSE.
;; Since the x86 instruction set does not have any 8x16 shift instructions (even
;; in higher feature sets like AVX), we lower the `ishl.i8x16` to a sequence of
;; instructions. The basic idea, whether the amount to shift by is an immediate
;; or not, is to use a 16x8 shift and then mask off the incorrect bits to 0s.
(rule (lower (has_type $I8X16 (ishl src amt)))
(let ((src_ Reg (put_in_reg src))
(amt_gpr RegMemImm (put_in_reg_mem_imm amt))
(amt_xmm RegMemImm (reg_mem_imm_to_xmm amt_gpr))
;; Shift `src` using 16x8. Unfortunately, a 16x8 shift will only be
;; correct for half of the lanes; the others must be fixed up with
;; the mask below.
(unmasked Reg (psllw src_ amt_xmm))
(mask_addr SyntheticAmode (ishl_i8x16_mask amt_gpr))
(mask Reg (x64_load $I8X16 mask_addr (ExtKind.None))))
(value_reg (sse_and $I8X16 unmasked (RegMem.Reg mask)))))
;; Get the address of the mask to use when fixing up the lanes that weren't
;; correctly generated by the 16x8 shift.
(decl ishl_i8x16_mask (RegMemImm) SyntheticAmode)
;; When the shift amount is known, we can statically (i.e. at compile time)
;; determine the mask to use and only emit that.
(rule (ishl_i8x16_mask (RegMemImm.Imm amt))
(ishl_i8x16_mask_for_const amt))
;; Otherwise, we must emit the entire mask table and dynamically (i.e. at run
;; time) find the correct mask offset in the table. We do this use `lea` to find
;; the base address of the mask table and then complex addressing to offset to
;; the right mask: `base_address + amt << 4`
(rule (ishl_i8x16_mask (RegMemImm.Reg amt))
(let ((mask_table SyntheticAmode (ishl_i8x16_mask_table))
(base_mask_addr Reg (lea mask_table))
(mask_offset Reg (shl $I64 amt (Imm8Reg.Imm8 4))))
(amode_to_synthetic_amode (amode_imm_reg_reg_shift 0
base_mask_addr
mask_offset
0))))
(rule (ishl_i8x16_mask (RegMemImm.Mem amt))
(ishl_i8x16_mask (RegMemImm.Reg (x64_load $I64 amt (ExtKind.None)))))
;; Get the address of the mask for a constant 8x16 shift amount.
(decl ishl_i8x16_mask_for_const (u32) SyntheticAmode)
(extern constructor ishl_i8x16_mask_for_const ishl_i8x16_mask_for_const)
;; Get the address of the mask table for a dynamic 8x16 shift amount.
(decl ishl_i8x16_mask_table () SyntheticAmode)
(extern constructor ishl_i8x16_mask_table ishl_i8x16_mask_table)
;; 16x8, 32x4, and 64x2 shifts can each use a single instruction.
(rule (lower (has_type $I16X8 (ishl src amt)))
(value_reg (psllw (put_in_reg src)
(reg_mem_imm_to_xmm (put_in_reg_mem_imm amt)))))
(rule (lower (has_type $I32X4 (ishl src amt)))
(value_reg (pslld (put_in_reg src)
(reg_mem_imm_to_xmm (put_in_reg_mem_imm amt)))))
(rule (lower (has_type $I64X2 (ishl src amt)))
(value_reg (psllq (put_in_reg src)
(reg_mem_imm_to_xmm (put_in_reg_mem_imm amt)))))
;;;; Rules for `ushr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;; Rules for `ushr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; `i64` and smaller. ;; `i64` and smaller.

View File

@@ -1539,9 +1539,10 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
| Opcode::Bnot | Opcode::Bnot
| Opcode::Bitselect | Opcode::Bitselect
| Opcode::Vselect | Opcode::Vselect
| Opcode::Sshr => implemented_in_isle(ctx), | Opcode::Sshr
| Opcode::Ishl => implemented_in_isle(ctx),
Opcode::Ishl | Opcode::Ushr | Opcode::Rotl | Opcode::Rotr => { Opcode::Ushr | Opcode::Rotl | Opcode::Rotr => {
let dst_ty = ctx.output_ty(insn, 0); let dst_ty = ctx.output_ty(insn, 0);
debug_assert_eq!(ctx.input_ty(insn, 0), dst_ty); debug_assert_eq!(ctx.input_ty(insn, 0), dst_ty);
@@ -1557,7 +1558,6 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
// This implementation uses the last two encoding methods. // This implementation uses the last two encoding methods.
let (size, lhs) = match dst_ty { let (size, lhs) = match dst_ty {
types::I8 | types::I16 => match op { types::I8 | types::I16 => match op {
Opcode::Ishl => (OperandSize::Size32, put_input_in_reg(ctx, inputs[0])),
Opcode::Ushr => ( Opcode::Ushr => (
OperandSize::Size32, OperandSize::Size32,
extend_input_to_reg(ctx, inputs[0], ExtSpec::ZeroExtendTo32), extend_input_to_reg(ctx, inputs[0], ExtSpec::ZeroExtendTo32),
@@ -1589,7 +1589,6 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let shift_kind = match op { let shift_kind = match op {
Opcode::Ishl => ShiftKind::ShiftLeft,
Opcode::Ushr => ShiftKind::ShiftRightLogical, Opcode::Ushr => ShiftKind::ShiftRightLogical,
Opcode::Rotl => ShiftKind::RotateLeft, Opcode::Rotl => ShiftKind::RotateLeft,
Opcode::Rotr => ShiftKind::RotateRight, Opcode::Rotr => ShiftKind::RotateRight,
@@ -1608,7 +1607,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
let dst = get_output_reg(ctx, outputs[0]); let dst = get_output_reg(ctx, outputs[0]);
match op { match op {
Opcode::Ishl | Opcode::Ushr | Opcode::Rotl => { Opcode::Ushr | Opcode::Rotl => {
implemented_in_isle(ctx); implemented_in_isle(ctx);
} }
Opcode::Rotr => { Opcode::Rotr => {
@@ -1643,7 +1642,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
} }
_ => unreachable!(), _ => unreachable!(),
} }
} else if dst_ty == types::I8X16 && (op == Opcode::Ishl || op == Opcode::Ushr) { } else if dst_ty == types::I8X16 && op == Opcode::Ushr {
// Since the x86 instruction set does not have any 8x16 shift instructions (even in higher feature sets // Since the x86 instruction set does not have any 8x16 shift instructions (even in higher feature sets
// like AVX), we lower the `ishl.i8x16` and `ushr.i8x16` to a sequence of instructions. The basic idea, // like AVX), we lower the `ishl.i8x16` and `ushr.i8x16` to a sequence of instructions. The basic idea,
// whether the `shift_by` amount is an immediate or not, is to use a 16x8 shift and then mask off the // whether the `shift_by` amount is an immediate or not, is to use a 16x8 shift and then mask off the
@@ -1671,7 +1670,6 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
// Shift `src` using 16x8. Unfortunately, a 16x8 shift will only be correct for half of the lanes; // Shift `src` using 16x8. Unfortunately, a 16x8 shift will only be correct for half of the lanes;
// the others must be fixed up with the mask below. // the others must be fixed up with the mask below.
let shift_opcode = match op { let shift_opcode = match op {
Opcode::Ishl => SseOpcode::Psllw,
Opcode::Ushr => SseOpcode::Psrlw, Opcode::Ushr => SseOpcode::Psrlw,
_ => unimplemented!("{} is not implemented for type {}", op, dst_ty), _ => unimplemented!("{} is not implemented for type {}", op, dst_ty),
}; };
@@ -1695,20 +1693,8 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x01, 0x01, 0x01, 0x01, 0x01, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x01, 0x01, 0x01, 0x01, 0x01,
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
]; ];
const SHL_MASKS: [u8; 128] = [
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe,
0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc,
0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xf8, 0xf8, 0xf8, 0xf8,
0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf0,
0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
0xf0, 0xf0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0,
0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0,
0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
];
let mask = match op { let mask = match op {
Opcode::Ishl => &SHL_MASKS,
Opcode::Ushr => &USHR_MASKS, Opcode::Ushr => &USHR_MASKS,
_ => unimplemented!("{} is not implemented for type {}", op, dst_ty), _ => unimplemented!("{} is not implemented for type {}", op, dst_ty),
}; };
@@ -1775,17 +1761,14 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let sse_op = match dst_ty { let sse_op = match dst_ty {
types::I16X8 => match op { types::I16X8 => match op {
Opcode::Ishl => SseOpcode::Psllw,
Opcode::Ushr => SseOpcode::Psrlw, Opcode::Ushr => SseOpcode::Psrlw,
_ => unimplemented!("{} is not implemented for type {}", op, dst_ty), _ => unimplemented!("{} is not implemented for type {}", op, dst_ty),
}, },
types::I32X4 => match op { types::I32X4 => match op {
Opcode::Ishl => SseOpcode::Pslld,
Opcode::Ushr => SseOpcode::Psrld, Opcode::Ushr => SseOpcode::Psrld,
_ => unimplemented!("{} is not implemented for type {}", op, dst_ty), _ => unimplemented!("{} is not implemented for type {}", op, dst_ty),
}, },
types::I64X2 => match op { types::I64X2 => match op {
Opcode::Ishl => SseOpcode::Psllq,
Opcode::Ushr => SseOpcode::Psrlq, Opcode::Ushr => SseOpcode::Psrlq,
_ => unimplemented!("{} is not implemented for type {}", op, dst_ty), _ => unimplemented!("{} is not implemented for type {}", op, dst_ty),
}, },

View File

@@ -7,19 +7,19 @@ mod generated_code;
use super::{ use super::{
is_mergeable_load, lower_to_amode, AluRmiROpcode, Inst as MInst, OperandSize, Reg, RegMemImm, is_mergeable_load, lower_to_amode, AluRmiROpcode, Inst as MInst, OperandSize, Reg, RegMemImm,
}; };
use crate::isa::x64::inst::args::SyntheticAmode;
use crate::isa::x64::inst::regs;
use crate::isa::x64::settings::Flags;
use crate::machinst::isle::*;
use crate::{ use crate::{
ir::{immediates::*, types::*, Inst, InstructionData, Opcode, TrapCode, Value, ValueList}, ir::{immediates::*, types::*, Inst, InstructionData, Opcode, TrapCode, Value, ValueList},
isa::x64::inst::{ isa::x64::{
inst::{
args::{ args::{
Avx512Opcode, CmpOpcode, ExtMode, FcmpImm, Imm8Reg, RegMem, ShiftKind, SseOpcode, CC, Amode, Avx512Opcode, CmpOpcode, ExtKind, ExtMode, FcmpImm, Imm8Reg, RegMem,
ShiftKind, SseOpcode, SyntheticAmode, CC,
}, },
x64_map_regs, regs, x64_map_regs,
}, },
machinst::{InsnInput, InsnOutput, LowerCtx}, settings::Flags,
},
machinst::{isle::*, InsnInput, InsnOutput, LowerCtx, VCodeConstantData},
}; };
use std::convert::TryFrom; use std::convert::TryFrom;
@@ -248,8 +248,59 @@ where
fn xmm0(&mut self) -> WritableReg { fn xmm0(&mut self) -> WritableReg {
WritableReg::from_reg(regs::xmm0()) WritableReg::from_reg(regs::xmm0())
} }
#[inline]
fn synthetic_amode_to_reg_mem(&mut self, addr: &SyntheticAmode) -> RegMem {
RegMem::mem(addr.clone())
} }
#[inline]
fn amode_imm_reg_reg_shift(&mut self, simm32: u32, base: Reg, index: Reg, shift: u8) -> Amode {
Amode::imm_reg_reg_shift(simm32, base, index, shift)
}
#[inline]
fn amode_to_synthetic_amode(&mut self, amode: &Amode) -> SyntheticAmode {
amode.clone().into()
}
fn ishl_i8x16_mask_for_const(&mut self, amt: u32) -> SyntheticAmode {
// When the shift amount is known, we can statically (i.e. at compile
// time) determine the mask to use and only emit that.
debug_assert!(amt < 8);
let mask_offset = amt as usize * 16;
let mask_constant = self.lower_ctx.use_constant(VCodeConstantData::WellKnown(
&I8X16_SHL_MASKS[mask_offset..mask_offset + 16],
));
SyntheticAmode::ConstantOffset(mask_constant)
}
fn ishl_i8x16_mask_table(&mut self) -> SyntheticAmode {
let mask_table = self
.lower_ctx
.use_constant(VCodeConstantData::WellKnown(&I8X16_SHL_MASKS));
SyntheticAmode::ConstantOffset(mask_table)
}
}
// Since x64 doesn't have 8x16 shifts and we must use a 16x8 shift instead, we
// need to fix up the bits that migrate from one half of the lane to the
// other. Each 16-byte mask is indexed by the shift amount: e.g. if we shift
// right by 0 (no movement), we want to retain all the bits so we mask with
// `0xff`; if we shift right by 1, we want to retain all bits except the MSB so
// we mask with `0x7f`; etc.
#[rustfmt::skip] // Preserve 16 bytes (i.e. one mask) per row.
const I8X16_SHL_MASKS: [u8; 128] = [
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe,
0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc,
0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8,
0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0,
0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
];
#[inline] #[inline]
fn to_simm32(constant: i64) -> Option<RegMemImm> { fn to_simm32(constant: i64) -> Option<RegMemImm> {
if constant == ((constant << 32) >> 32) { if constant == ((constant << 32) >> 32) {

View File

@@ -1,4 +1,4 @@
src/clif.isle f176ef3bba99365 src/clif.isle f176ef3bba99365
src/prelude.isle d95510fad2e2473c src/prelude.isle 7b911d3b894ae17
src/isa/x64/inst.isle c16462cc359dd466 src/isa/x64/inst.isle dbfa857f7f2c5d9f
src/isa/x64/lower.isle 9f761598e3949e8e src/isa/x64/lower.isle 5a737854091e1189

File diff suppressed because it is too large Load Diff

View File

@@ -100,6 +100,11 @@ macro_rules! isle_prelude_methods {
ty.bits() ty.bits()
} }
#[inline]
fn ty_bytes(&mut self, ty: Type) -> u16 {
u16::try_from(ty.bytes()).unwrap()
}
fn fits_in_16(&mut self, ty: Type) -> Option<Type> { fn fits_in_16(&mut self, ty: Type) -> Option<Type> {
if ty.bits() <= 16 { if ty.bits() <= 16 {
Some(ty) Some(ty)

View File

@@ -152,6 +152,10 @@
(decl ty_bits_u16 (Type) u16) (decl ty_bits_u16 (Type) u16)
(extern constructor ty_bits_u16 ty_bits_u16) (extern constructor ty_bits_u16 ty_bits_u16)
;; Get the byte width of a given type.
(decl ty_bytes (Type) u16)
(extern constructor ty_bytes ty_bytes)
;; Get the type of each lane in the given type. ;; Get the type of each lane in the given type.
(decl lane_type (Type) Type) (decl lane_type (Type) Type)
(extern constructor lane_type lane_type) (extern constructor lane_type lane_type)