x64: lower iabs.i64x2 using a single AVX512 instruction when possible (#2819)
* x64: add EVEX encoding mechanism Also, includes an empty stub module for the VEX encoding. * x64: lower abs.i64x2 to VPABSQ when available * x64: refactor EVEX encodings to use `EvexInstruction` This change replaces the `encode_evex` function with a builder-style struct, `EvexInstruction`. This approach clarifies the code, adds documentation, and results in slight speedups when benchmarked. * x64: rename encoding CodeSink to ByteSink
This commit is contained in:
@@ -460,9 +460,7 @@ pub(crate) enum InstructionSet {
|
|||||||
BMI1,
|
BMI1,
|
||||||
#[allow(dead_code)] // never constructed (yet).
|
#[allow(dead_code)] // never constructed (yet).
|
||||||
BMI2,
|
BMI2,
|
||||||
#[allow(dead_code)]
|
|
||||||
AVX512F,
|
AVX512F,
|
||||||
#[allow(dead_code)]
|
|
||||||
AVX512VL,
|
AVX512VL,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -995,13 +993,11 @@ impl fmt::Display for SseOpcode {
|
|||||||
|
|
||||||
#[derive(Clone)]
|
#[derive(Clone)]
|
||||||
pub enum Avx512Opcode {
|
pub enum Avx512Opcode {
|
||||||
#[allow(dead_code)]
|
|
||||||
Vpabsq,
|
Vpabsq,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Avx512Opcode {
|
impl Avx512Opcode {
|
||||||
/// Which `InstructionSet`s support the opcode?
|
/// Which `InstructionSet`s support the opcode?
|
||||||
#[allow(dead_code)]
|
|
||||||
pub(crate) fn available_from(&self) -> SmallVec<[InstructionSet; 2]> {
|
pub(crate) fn available_from(&self) -> SmallVec<[InstructionSet; 2]> {
|
||||||
match self {
|
match self {
|
||||||
Avx512Opcode::Vpabsq => smallvec![InstructionSet::AVX512F, InstructionSet::AVX512VL],
|
Avx512Opcode::Vpabsq => smallvec![InstructionSet::AVX512F, InstructionSet::AVX512VL],
|
||||||
|
|||||||
@@ -6,9 +6,11 @@ use crate::isa::x64::inst::args::*;
|
|||||||
use crate::isa::x64::inst::*;
|
use crate::isa::x64::inst::*;
|
||||||
use crate::machinst::{inst_common, MachBuffer, MachInstEmit, MachLabel};
|
use crate::machinst::{inst_common, MachBuffer, MachInstEmit, MachLabel};
|
||||||
use core::convert::TryInto;
|
use core::convert::TryInto;
|
||||||
|
use encoding::evex::{EvexInstruction, EvexVectorLength};
|
||||||
use encoding::rex::{
|
use encoding::rex::{
|
||||||
emit_simm, emit_std_enc_enc, emit_std_enc_mem, emit_std_reg_mem, emit_std_reg_reg, int_reg_enc,
|
emit_simm, emit_std_enc_enc, emit_std_enc_mem, emit_std_reg_mem, emit_std_reg_reg, int_reg_enc,
|
||||||
low8_will_sign_extend_to_32, low8_will_sign_extend_to_64, reg_enc, LegacyPrefixes, RexFlags,
|
low8_will_sign_extend_to_32, low8_will_sign_extend_to_64, reg_enc, LegacyPrefixes, OpcodeMap,
|
||||||
|
RexFlags,
|
||||||
};
|
};
|
||||||
use log::debug;
|
use log::debug;
|
||||||
use regalloc::{Reg, Writable};
|
use regalloc::{Reg, Writable};
|
||||||
@@ -1404,6 +1406,24 @@ pub(crate) fn emit(
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Inst::XmmUnaryRmREvex { op, src, dst } => {
|
||||||
|
let opcode = match op {
|
||||||
|
Avx512Opcode::Vpabsq => 0x1f,
|
||||||
|
};
|
||||||
|
match src {
|
||||||
|
RegMem::Reg { reg: src } => EvexInstruction::new()
|
||||||
|
.length(EvexVectorLength::V128)
|
||||||
|
.prefix(LegacyPrefixes::_66)
|
||||||
|
.map(OpcodeMap::_0F38)
|
||||||
|
.w(true)
|
||||||
|
.opcode(opcode)
|
||||||
|
.reg(dst.to_reg().get_hw_encoding())
|
||||||
|
.rm(src.get_hw_encoding())
|
||||||
|
.encode(sink),
|
||||||
|
_ => todo!(),
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
Inst::XmmRmR {
|
Inst::XmmRmR {
|
||||||
op,
|
op,
|
||||||
src: src_e,
|
src: src_e,
|
||||||
|
|||||||
@@ -3865,6 +3865,12 @@ fn test_x64_emit() {
|
|||||||
"cvtdq2pd %xmm2, %xmm8",
|
"cvtdq2pd %xmm2, %xmm8",
|
||||||
));
|
));
|
||||||
|
|
||||||
|
insns.push((
|
||||||
|
Inst::xmm_unary_rm_r_evex(Avx512Opcode::Vpabsq, RegMem::reg(xmm2), w_xmm8),
|
||||||
|
"6272FD081FC2",
|
||||||
|
"vpabsq %xmm2, %xmm8",
|
||||||
|
));
|
||||||
|
|
||||||
// Xmm to int conversions, and conversely.
|
// Xmm to int conversions, and conversely.
|
||||||
|
|
||||||
insns.push((
|
insns.push((
|
||||||
@@ -4276,6 +4282,7 @@ fn test_x64_emit() {
|
|||||||
let mut isa_flag_builder = x64::settings::builder();
|
let mut isa_flag_builder = x64::settings::builder();
|
||||||
isa_flag_builder.enable("has_ssse3").unwrap();
|
isa_flag_builder.enable("has_ssse3").unwrap();
|
||||||
isa_flag_builder.enable("has_sse41").unwrap();
|
isa_flag_builder.enable("has_sse41").unwrap();
|
||||||
|
isa_flag_builder.enable("has_avx512f").unwrap();
|
||||||
let isa_flags = x64::settings::Flags::new(&flags, isa_flag_builder);
|
let isa_flags = x64::settings::Flags::new(&flags, isa_flag_builder);
|
||||||
|
|
||||||
let rru = regs::create_reg_universe_systemv(&flags);
|
let rru = regs::create_reg_universe_systemv(&flags);
|
||||||
|
|||||||
396
cranelift/codegen/src/isa/x64/inst/encoding/evex.rs
Normal file
396
cranelift/codegen/src/isa/x64/inst/encoding/evex.rs
Normal file
@@ -0,0 +1,396 @@
|
|||||||
|
//! Encodes EVEX instructions. These instructions are those added by the AVX-512 extensions. The
|
||||||
|
//! EVEX encoding requires a 4-byte prefix:
|
||||||
|
//!
|
||||||
|
//! Byte 0: 0x62
|
||||||
|
//! ┌───┬───┬───┬───┬───┬───┬───┬───┐
|
||||||
|
//! Byte 1: │ R │ X │ B │ R'│ 0 │ 0 │ m │ m │
|
||||||
|
//! ├───┼───┼───┼───┼───┼───┼───┼───┤
|
||||||
|
//! Byte 2: │ W │ v │ v │ v │ v │ 1 │ p │ p │
|
||||||
|
//! ├───┼───┼───┼───┼───┼───┼───┼───┤
|
||||||
|
//! Byte 3: │ z │ L'│ L │ b │ V'│ a │ a │ a │
|
||||||
|
//! └───┴───┴───┴───┴───┴───┴───┴───┘
|
||||||
|
//!
|
||||||
|
//! The prefix is then followed by the opcode byte, the ModR/M byte, and other optional suffixes
|
||||||
|
//! (e.g. SIB byte, displacements, immediates) based on the instruction (see section 2.6, Intel
|
||||||
|
//! Software Development Manual, volume 2A).
|
||||||
|
use super::rex::{encode_modrm, LegacyPrefixes, OpcodeMap};
|
||||||
|
use super::ByteSink;
|
||||||
|
use core::ops::RangeInclusive;
|
||||||
|
|
||||||
|
/// Constructs an EVEX-encoded instruction using a builder pattern. This approach makes it visually
|
||||||
|
/// easier to transform something in the manual's syntax, `EVEX.256.66.0F38.W1 1F /r`, to code:
|
||||||
|
/// `EvexInstruction::new().length(...).prefix(...).map(...).w(true).opcode(0x1F).reg(...).rm(...)`.
|
||||||
|
pub struct EvexInstruction {
|
||||||
|
// The 4-byte EVEX prefix packed into one word; `encode` emits it with a single `put4`.
bits: u32,
|
||||||
|
// The opcode byte, emitted right after the prefix.
opcode: u8,
|
||||||
|
// Register whose low three bits fill the ModR/M `reg` field.
reg: Register,
|
||||||
|
// Register whose low three bits fill the ModR/M `rm` field.
rm: Register,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Because some of the bit flags in the EVEX prefix are reversed and users of `EvexInstruction` may
|
||||||
|
/// choose to skip setting fields, here we set some sane defaults. Note that:
|
||||||
|
/// - the first byte is always `0x62` but you will notice it at the end of the default `bits` value
|
||||||
|
/// implemented--remember the little-endian order
|
||||||
|
/// - some bits are always set to certain values: bits 10-11 to 0, bit 18 to 1
|
||||||
|
/// - the other bits set correspond to reversed bits: R, X, B, R' (byte 1), vvvv (byte 2), V' (byte
|
||||||
|
/// 3).
|
||||||
|
///
|
||||||
|
/// See the `default_emission` test for what these defaults are equivalent to (e.g. using RAX,
|
||||||
|
/// unsetting the W bit, etc.)
|
||||||
|
impl Default for EvexInstruction {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self {
|
||||||
|
bits: 0x08_7C_F0_62,
|
||||||
|
opcode: 0,
|
||||||
|
reg: Register::default(),
|
||||||
|
rm: Register::default(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[allow(non_upper_case_globals)] // This makes it easier to match the bit range names to the manual's names.
|
||||||
|
impl EvexInstruction {
|
||||||
|
/// Construct a default EVEX instruction.
|
||||||
|
pub fn new() -> Self {
|
||||||
|
Self::default()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Set the length of the instruction . Note that there are sets of instructions (i.e. rounding,
|
||||||
|
/// memory broadcast) that modify the same underlying bits--at some point (TODO) we can add a
|
||||||
|
/// way to set those context bits and verify that both are not used (e.g. rounding AND length).
|
||||||
|
/// For now, this method is very convenient.
|
||||||
|
#[inline(always)]
|
||||||
|
pub fn length(mut self, length: EvexVectorLength) -> Self {
|
||||||
|
self.write(Self::LL, EvexContext::Other { length }.bits() as u32);
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Set the legacy prefix byte of the instruction: None | 66 | F0 | F2 | F3. EVEX instructions
|
||||||
|
/// pack these into the prefix, not as separate bytes.
|
||||||
|
#[inline(always)]
|
||||||
|
pub fn prefix(mut self, prefix: LegacyPrefixes) -> Self {
|
||||||
|
self.write(Self::pp, prefix.bits() as u32);
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Set the opcode map byte of the instruction: None | 0F | 0F38 | 0F3A. EVEX instructions pack
|
||||||
|
/// these into the prefix, not as separate bytes.
|
||||||
|
#[inline(always)]
|
||||||
|
pub fn map(mut self, map: OpcodeMap) -> Self {
|
||||||
|
self.write(Self::mm, map.bits() as u32);
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Set the W bit, typically used to indicate an instruction using 64 bits of an operand (e.g.
|
||||||
|
/// 64 bit lanes). EVEX packs this bit in the EVEX prefix; previous encodings used the REX
|
||||||
|
/// prefix.
|
||||||
|
#[inline(always)]
|
||||||
|
pub fn w(mut self, w: bool) -> Self {
|
||||||
|
self.write(Self::W, w as u32);
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Set the instruction opcode byte.
|
||||||
|
#[inline(always)]
|
||||||
|
pub fn opcode(mut self, opcode: u8) -> Self {
|
||||||
|
self.opcode = opcode;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Set the register to use for the `reg` bits; many instructions use this as the write operand.
|
||||||
|
/// Setting this affects both the ModRM byte (`reg` section) and the EVEX prefix (the extension
|
||||||
|
/// bits for register encodings > 8).
|
||||||
|
#[inline(always)]
|
||||||
|
pub fn reg(mut self, reg: impl Into<Register>) -> Self {
|
||||||
|
self.reg = reg.into();
|
||||||
|
let r = !(self.reg.0 >> 3) & 1;
|
||||||
|
let r_ = !(self.reg.0 >> 4) & 1;
|
||||||
|
self.write(Self::R, r as u32);
|
||||||
|
self.write(Self::R_, r_ as u32);
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Set the mask to use. See section 2.6 in the Intel Software Developer's Manual, volume 2A for
|
||||||
|
/// more details.
|
||||||
|
#[allow(dead_code)]
|
||||||
|
#[inline(always)]
|
||||||
|
pub fn mask(mut self, mask: EvexMasking) -> Self {
|
||||||
|
self.write(Self::aaa, mask.aaa_bits() as u32);
|
||||||
|
self.write(Self::z, mask.z_bit() as u32);
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Set the `vvvvv` register; some instructions allow using this as a second, non-destructive
|
||||||
|
/// source register in 3-operand instructions (e.g. 2 read, 1 write).
|
||||||
|
#[allow(dead_code)]
|
||||||
|
#[inline(always)]
|
||||||
|
pub fn vvvvv(mut self, reg: impl Into<Register>) -> Self {
|
||||||
|
let reg = reg.into();
|
||||||
|
self.write(Self::vvvv, !(reg.0 as u32) & 0b1111);
|
||||||
|
self.write(Self::V_, !(reg.0 as u32 >> 4) & 0b1);
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Set the register to use for the `rm` bits; many instructions use this as the "read from
|
||||||
|
/// register/memory" operand. Currently this does not support memory addressing (TODO).Setting
|
||||||
|
/// this affects both the ModRM byte (`rm` section) and the EVEX prefix (the extension bits for
|
||||||
|
/// register encodings > 8).
|
||||||
|
#[inline(always)]
|
||||||
|
pub fn rm(mut self, reg: impl Into<Register>) -> Self {
|
||||||
|
self.rm = reg.into();
|
||||||
|
let b = !(self.rm.0 >> 3) & 1;
|
||||||
|
let x = !(self.rm.0 >> 4) & 1;
|
||||||
|
self.write(Self::X, x as u32);
|
||||||
|
self.write(Self::B, b as u32);
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Emit the EVEX-encoded instruction to the code sink:
|
||||||
|
/// - first, the 4-byte EVEX prefix;
|
||||||
|
/// - then, the opcode byte;
|
||||||
|
/// - finally, the ModR/M byte.
|
||||||
|
///
|
||||||
|
/// Eventually this method should support encodings of more than just the reg-reg addressing mode (TODO).
|
||||||
|
pub fn encode<CS: ByteSink + ?Sized>(&self, sink: &mut CS) {
|
||||||
|
sink.put4(self.bits);
|
||||||
|
sink.put1(self.opcode);
|
||||||
|
sink.put1(encode_modrm(3, self.reg.0 & 7, self.rm.0 & 7));
|
||||||
|
}
|
||||||
|
|
||||||
|
// In order to simplify the encoding of the various bit ranges in the prefix, we specify those
|
||||||
|
// ranges according to the table below (extracted from the Intel Software Development Manual,
|
||||||
|
// volume 2A). Remember that, because we pack the 4-byte prefix into a little-endian `u32`, this
|
||||||
|
// chart should be read from right-to-left, top-to-bottom. Note also that we start ranges at bit
|
||||||
|
// 8, leaving bits 0-7 for the mandatory `0x62`.
|
||||||
|
// ┌───┬───┬───┬───┬───┬───┬───┬───┐
|
||||||
|
// Byte 1: │ R │ X │ B │ R'│ 0 │ 0 │ m │ m │
|
||||||
|
// ├───┼───┼───┼───┼───┼───┼───┼───┤
|
||||||
|
// Byte 2: │ W │ v │ v │ v │ v │ 1 │ p │ p │
|
||||||
|
// ├───┼───┼───┼───┼───┼───┼───┼───┤
|
||||||
|
// Byte 3: │ z │ L'│ L │ b │ V'│ a │ a │ a │
|
||||||
|
// └───┴───┴───┴───┴───┴───┴───┴───┘
|
||||||
|
|
||||||
|
// Byte 1:
|
||||||
|
const mm: RangeInclusive<u8> = 8..=9;
|
||||||
|
const R_: RangeInclusive<u8> = 12..=12;
|
||||||
|
const B: RangeInclusive<u8> = 13..=13;
|
||||||
|
const X: RangeInclusive<u8> = 14..=14;
|
||||||
|
const R: RangeInclusive<u8> = 15..=15;
|
||||||
|
|
||||||
|
// Byte 2:
|
||||||
|
const pp: RangeInclusive<u8> = 16..=17;
|
||||||
|
const vvvv: RangeInclusive<u8> = 19..=22;
|
||||||
|
const W: RangeInclusive<u8> = 23..=23;
|
||||||
|
|
||||||
|
// Byte 3:
|
||||||
|
const aaa: RangeInclusive<u8> = 24..=26;
|
||||||
|
const V_: RangeInclusive<u8> = 27..=27;
|
||||||
|
#[allow(dead_code)] // Will be used once broadcast and rounding controls are exposed.
|
||||||
|
const b: RangeInclusive<u8> = 28..=28;
|
||||||
|
const LL: RangeInclusive<u8> = 29..=30;
|
||||||
|
const z: RangeInclusive<u8> = 31..=31;
|
||||||
|
|
||||||
|
// A convenience method for writing the `value` bits to the given range in `self.bits`.
|
||||||
|
#[inline]
|
||||||
|
fn write(&mut self, range: RangeInclusive<u8>, value: u32) {
|
||||||
|
assert!(ExactSizeIterator::len(&range) > 0);
|
||||||
|
let size = range.end() - range.start() + 1; // Calculate the number of bits in the range.
|
||||||
|
let mask: u32 = (1 << size) - 1; // Generate a bit mask.
|
||||||
|
debug_assert!(
|
||||||
|
value <= mask,
|
||||||
|
"The written value should have fewer than {} bits.",
|
||||||
|
size
|
||||||
|
);
|
||||||
|
let mask_complement = !(mask << *range.start()); // Create the bitwise complement for the clear mask.
|
||||||
|
self.bits &= mask_complement; // Clear the bits in `range`; otherwise the OR below may allow previously-set bits to slip through.
|
||||||
|
let value = value << *range.start(); // Place the value in the correct location (assumes `value <= mask`).
|
||||||
|
self.bits |= value; // Modify the bits in `range`.
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The hardware encoding number of a register operand, as consumed by `EvexInstruction`.
///
/// The default value is register number 0.
#[derive(Copy, Clone, Debug, Default, PartialEq, Eq)]
pub struct Register(u8);

impl From<u8> for Register {
    /// Wrap a raw hardware encoding.
    ///
    /// In debug builds this panics if `reg` is 16 or larger.
    // NOTE(review): the builder methods also compute extension bits from bit 4 of the register
    // number, which this assertion never lets through in debug builds -- confirm whether the
    // limit of 16 is intentional.
    fn from(reg: u8) -> Self {
        debug_assert!(reg < 16);
        Self(reg)
    }
}
|
||||||
|
|
||||||
|
/// Defines the EVEX context for the `L'`, `L`, and `b` bits (bits 6:4 of EVEX P2 byte). Table 2-36 in
|
||||||
|
/// section 2.6.10 (Intel Software Development Manual, volume 2A) describes how these bits can be
|
||||||
|
/// used together for certain classes of instructions; i.e., special care should be taken to ensure
|
||||||
|
/// that instructions use an applicable correct `EvexContext`. Table 2-39 contains cases where
|
||||||
|
/// opcodes can result in an #UD.
|
||||||
|
#[allow(dead_code)] // Rounding and broadcast modes are not yet used.
|
||||||
|
pub enum EvexContext {
|
||||||
|
RoundingRegToRegFP {
|
||||||
|
rc: EvexRoundingControl,
|
||||||
|
},
|
||||||
|
NoRoundingFP {
|
||||||
|
sae: bool,
|
||||||
|
length: EvexVectorLength,
|
||||||
|
},
|
||||||
|
MemoryOp {
|
||||||
|
broadcast: bool,
|
||||||
|
length: EvexVectorLength,
|
||||||
|
},
|
||||||
|
Other {
|
||||||
|
length: EvexVectorLength,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for EvexContext {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self::Other {
|
||||||
|
length: EvexVectorLength::default(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl EvexContext {
|
||||||
|
/// Encode the `L'`, `L`, and `b` bits (bits 6:4 of EVEX P2 byte) for merging with the P2 byte.
|
||||||
|
fn bits(&self) -> u8 {
|
||||||
|
match self {
|
||||||
|
Self::RoundingRegToRegFP { rc } => 0b001 | rc.bits() << 1,
|
||||||
|
Self::NoRoundingFP { sae, length } => (*sae as u8) | length.bits() << 1,
|
||||||
|
Self::MemoryOp { broadcast, length } => (*broadcast as u8) | length.bits() << 1,
|
||||||
|
Self::Other { length } => length.bits() << 1,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The EVEX format allows choosing a vector length in the `L'` and `L` bits; see `EvexContext`.
///
/// The default length is `V128`.
#[allow(dead_code)] // Wider-length vectors are not yet used.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum EvexVectorLength {
    V128,
    V256,
    V512,
}

impl EvexVectorLength {
    /// Encode the `L'` and `L` bits for merging with the P2 byte.
    fn bits(&self) -> u8 {
        match self {
            Self::V128 => 0b00,
            Self::V256 => 0b01,
            Self::V512 => 0b10,
            // 0b11 is reserved (#UD).
        }
    }
}

impl Default for EvexVectorLength {
    fn default() -> Self {
        Self::V128
    }
}
|
||||||
|
|
||||||
|
/// The EVEX format allows defining rounding control in the `L'` and `L` bits; see `EvexContext`.
#[allow(dead_code)] // Rounding controls are not yet used.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum EvexRoundingControl {
    RNE,
    RD,
    RU,
    RZ,
}

impl EvexRoundingControl {
    /// Encode the `L'` and `L` bits for merging with the P2 byte.
    fn bits(&self) -> u8 {
        match self {
            Self::RNE => 0b00,
            Self::RD => 0b01,
            Self::RU => 0b10,
            Self::RZ => 0b11,
        }
    }
}
|
||||||
|
|
||||||
|
/// Defines the EVEX masking behavior; masking support is described in section 2.6.4 of the Intel
/// Software Development Manual, volume 2A.
///
/// The default is `None`. For `Merging` and `Zeroing`, `k` is the value placed in the `aaa` bits
/// and must be at most 7.
#[allow(dead_code)] // Masking is not yet used.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum EvexMasking {
    None,
    Merging { k: u8 },
    Zeroing { k: u8 },
}

impl Default for EvexMasking {
    fn default() -> Self {
        EvexMasking::None
    }
}

impl EvexMasking {
    /// Encode the `z` bit for merging with the P2 byte.
    fn z_bit(&self) -> u8 {
        match self {
            Self::None | Self::Merging { .. } => 0,
            Self::Zeroing { .. } => 1,
        }
    }

    /// Encode the `aaa` bits for merging with the P2 byte.
    ///
    /// In debug builds this panics if `k` is larger than 7.
    fn aaa_bits(&self) -> u8 {
        match self {
            Self::None => 0b000,
            Self::Merging { k } | Self::Zeroing { k } => {
                debug_assert!(*k <= 7);
                *k
            }
        }
    }
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;
    use crate::isa::x64::inst::regs;
    use std::vec::Vec;

    // Sanity check: the bytes produced here must match what
    // `xed-asmparse-main 'vpabsq xmm0{k0}, xmm1'` prints.
    #[test]
    fn vpabsq() {
        let mut encoded = Vec::new();
        let (dst, src) = (regs::xmm0(), regs::xmm1());

        let inst = EvexInstruction::new()
            .prefix(LegacyPrefixes::_66)
            .map(OpcodeMap::_0F38)
            .w(true)
            .opcode(0x1F)
            .reg(dst.get_hw_encoding())
            .rm(src.get_hw_encoding())
            .length(EvexVectorLength::V128);
        inst.encode(&mut encoded);

        assert_eq!(encoded, vec![0x62, 0xf2, 0xfd, 0x08, 0x1f, 0xc1]);
    }

    /// Check that an untouched builder emits the same bytes as one on which every parameter is
    /// spelled out: opcode `0x00`, the "0" register (i.e. `rax`) for both operands, and the
    /// neutral choice for everything else. This is less trivial than it looks because several
    /// fields are stored with flipped bits (e.g. `vvvvv`), so all-zero defaults would be wrong.
    #[test]
    fn default_emission() {
        let mut implicit = Vec::new();
        EvexInstruction::new().encode(&mut implicit);

        let mut explicit = Vec::new();
        let zero_reg = regs::rax().get_hw_encoding();
        EvexInstruction::new()
            .length(EvexVectorLength::V128)
            .prefix(LegacyPrefixes::None)
            .map(OpcodeMap::None)
            .w(false)
            .opcode(0x00)
            .reg(zero_reg)
            .rm(zero_reg)
            .mask(EvexMasking::None)
            .encode(&mut explicit);

        assert_eq!(implicit, explicit);
    }
}
|
||||||
@@ -1 +1,57 @@
|
|||||||
|
use crate::{isa::x64, machinst::MachBuffer};
|
||||||
|
use std::vec::Vec;
|
||||||
|
|
||||||
|
pub mod evex;
|
||||||
pub mod rex;
|
pub mod rex;
|
||||||
|
pub mod vex;
|
||||||
|
|
||||||
|
/// A destination for encoded instruction bytes, used by the encoders in this module.
// NOTE(review): the `Vec<u8>` implementation appends multi-byte values in little-endian order;
// presumably every implementor is expected to do the same -- confirm.
pub trait ByteSink {
|
||||||
|
/// Add 1 byte to the code section.
|
||||||
|
fn put1(&mut self, _: u8);
|
||||||
|
|
||||||
|
/// Add 2 bytes to the code section.
|
||||||
|
fn put2(&mut self, _: u16);
|
||||||
|
|
||||||
|
/// Add 4 bytes to the code section.
|
||||||
|
fn put4(&mut self, _: u32);
|
||||||
|
|
||||||
|
/// Add 8 bytes to the code section.
|
||||||
|
fn put8(&mut self, _: u64);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Lets the x64 `MachBuffer` be used as a `ByteSink` by forwarding each call to the buffer's
/// method of the same name.
// NOTE(review): each body calls a method with the same name as the trait method it implements.
// This only terminates if `MachBuffer` has its own inherent `put1`/`put2`/`put4`/`put8`, which
// take priority over the trait methods; otherwise the calls would recurse -- confirm.
impl ByteSink for MachBuffer<x64::inst::Inst> {
|
||||||
|
fn put1(&mut self, value: u8) {
|
||||||
|
self.put1(value)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn put2(&mut self, value: u16) {
|
||||||
|
self.put2(value)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn put4(&mut self, value: u32) {
|
||||||
|
self.put4(value)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn put8(&mut self, value: u64) {
|
||||||
|
self.put8(value)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Provide a convenient implementation for testing.
|
||||||
|
impl ByteSink for Vec<u8> {
|
||||||
|
fn put1(&mut self, v: u8) {
|
||||||
|
self.extend_from_slice(&[v])
|
||||||
|
}
|
||||||
|
|
||||||
|
fn put2(&mut self, v: u16) {
|
||||||
|
self.extend_from_slice(&v.to_le_bytes())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn put4(&mut self, v: u32) {
|
||||||
|
self.extend_from_slice(&v.to_le_bytes())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn put8(&mut self, v: u64) {
|
||||||
|
self.extend_from_slice(&v.to_le_bytes())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -153,9 +153,37 @@ impl From<(OperandSize, Reg)> for RexFlags {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Allows using the same opcode byte in different "opcode maps" to allow for more instruction
/// encodings. See appendix A in the Intel Software Developer's Manual, volume 2A, for more details.
///
/// The default is `None`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum OpcodeMap {
    None,
    _0F,
    _0F38,
    _0F3A,
}

impl OpcodeMap {
    /// Normally the opcode map is specified as bytes in the instruction, but some x64 encoding
    /// formats pack this information as bits in a prefix (e.g. EVEX). This returns that two-bit
    /// form.
    pub(crate) fn bits(&self) -> u8 {
        match self {
            OpcodeMap::None => 0b00,
            OpcodeMap::_0F => 0b01,
            OpcodeMap::_0F38 => 0b10,
            OpcodeMap::_0F3A => 0b11,
        }
    }
}

impl Default for OpcodeMap {
    fn default() -> Self {
        Self::None
    }
}
|
||||||
|
|
||||||
/// We may need to include one or more legacy prefix bytes before the REX prefix. This enum
|
/// We may need to include one or more legacy prefix bytes before the REX prefix. This enum
|
||||||
/// covers only the small set of possibilities that we actually need.
|
/// covers only the small set of possibilities that we actually need.
|
||||||
pub(crate) enum LegacyPrefixes {
|
pub enum LegacyPrefixes {
|
||||||
/// No prefix bytes.
|
/// No prefix bytes.
|
||||||
None,
|
None,
|
||||||
/// Operand Size Override -- here, denoting "16-bit operation".
|
/// Operand Size Override -- here, denoting "16-bit operation".
|
||||||
@@ -173,26 +201,47 @@ pub(crate) enum LegacyPrefixes {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl LegacyPrefixes {
|
impl LegacyPrefixes {
|
||||||
|
/// Emit the legacy prefix as bytes (e.g. in REX instructions).
|
||||||
#[inline(always)]
|
#[inline(always)]
|
||||||
pub(crate) fn emit(&self, sink: &mut MachBuffer<Inst>) {
|
pub(crate) fn emit(&self, sink: &mut MachBuffer<Inst>) {
|
||||||
match self {
|
match self {
|
||||||
LegacyPrefixes::_66 => sink.put1(0x66),
|
Self::_66 => sink.put1(0x66),
|
||||||
LegacyPrefixes::_F0 => sink.put1(0xF0),
|
Self::_F0 => sink.put1(0xF0),
|
||||||
LegacyPrefixes::_66F0 => {
|
Self::_66F0 => {
|
||||||
// I don't think the order matters, but in any case, this is the same order that
|
// I don't think the order matters, but in any case, this is the same order that
|
||||||
// the GNU assembler uses.
|
// the GNU assembler uses.
|
||||||
sink.put1(0x66);
|
sink.put1(0x66);
|
||||||
sink.put1(0xF0);
|
sink.put1(0xF0);
|
||||||
}
|
}
|
||||||
LegacyPrefixes::_F2 => sink.put1(0xF2),
|
Self::_F2 => sink.put1(0xF2),
|
||||||
LegacyPrefixes::_F3 => sink.put1(0xF3),
|
Self::_F3 => sink.put1(0xF3),
|
||||||
LegacyPrefixes::_66F3 => {
|
Self::_66F3 => {
|
||||||
sink.put1(0x66);
|
sink.put1(0x66);
|
||||||
sink.put1(0xF3);
|
sink.put1(0xF3);
|
||||||
}
|
}
|
||||||
LegacyPrefixes::None => (),
|
Self::None => (),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Emit the legacy prefix as bits (e.g. for EVEX instructions).
|
||||||
|
#[inline(always)]
|
||||||
|
pub(crate) fn bits(&self) -> u8 {
|
||||||
|
match self {
|
||||||
|
Self::None => 0b00,
|
||||||
|
Self::_66 => 0b01,
|
||||||
|
Self::_F3 => 0b10,
|
||||||
|
Self::_F2 => 0b11,
|
||||||
|
_ => panic!(
|
||||||
|
"VEX and EVEX bits can only be extracted from single prefixes: None, 66, F3, F2"
|
||||||
|
),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for LegacyPrefixes {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self::None
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// This is the core 'emit' function for instructions that reference memory.
|
/// This is the core 'emit' function for instructions that reference memory.
|
||||||
|
|||||||
2
cranelift/codegen/src/isa/x64/inst/encoding/vex.rs
Normal file
2
cranelift/codegen/src/isa/x64/inst/encoding/vex.rs
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
//! Encodes VEX instructions. These instructions are those added by the Advanced Vector Extensions
|
||||||
|
//! (AVX).
|
||||||
@@ -225,6 +225,12 @@ pub enum Inst {
|
|||||||
dst: Writable<Reg>,
|
dst: Writable<Reg>,
|
||||||
},
|
},
|
||||||
|
|
||||||
|
XmmUnaryRmREvex {
|
||||||
|
op: Avx512Opcode,
|
||||||
|
src: RegMem,
|
||||||
|
dst: Writable<Reg>,
|
||||||
|
},
|
||||||
|
|
||||||
/// XMM (scalar or vector) unary op (from xmm to reg/mem): stores, movd, movq
|
/// XMM (scalar or vector) unary op (from xmm to reg/mem): stores, movd, movq
|
||||||
XmmMovRM {
|
XmmMovRM {
|
||||||
op: SseOpcode,
|
op: SseOpcode,
|
||||||
@@ -571,6 +577,8 @@ impl Inst {
|
|||||||
| Inst::XmmRmRImm { op, .. }
|
| Inst::XmmRmRImm { op, .. }
|
||||||
| Inst::XmmToGpr { op, .. }
|
| Inst::XmmToGpr { op, .. }
|
||||||
| Inst::XmmUnaryRmR { op, .. } => smallvec![op.available_from()],
|
| Inst::XmmUnaryRmR { op, .. } => smallvec![op.available_from()],
|
||||||
|
|
||||||
|
Inst::XmmUnaryRmREvex { op, .. } => op.available_from(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -705,6 +713,12 @@ impl Inst {
|
|||||||
Inst::XmmUnaryRmR { op, src, dst }
|
Inst::XmmUnaryRmR { op, src, dst }
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub(crate) fn xmm_unary_rm_r_evex(op: Avx512Opcode, src: RegMem, dst: Writable<Reg>) -> Inst {
|
||||||
|
src.assert_regclass_is(RegClass::V128);
|
||||||
|
debug_assert!(dst.to_reg().get_class() == RegClass::V128);
|
||||||
|
Inst::XmmUnaryRmREvex { op, src, dst }
|
||||||
|
}
|
||||||
|
|
||||||
pub(crate) fn xmm_rm_r(op: SseOpcode, src: RegMem, dst: Writable<Reg>) -> Self {
|
pub(crate) fn xmm_rm_r(op: SseOpcode, src: RegMem, dst: Writable<Reg>) -> Self {
|
||||||
src.assert_regclass_is(RegClass::V128);
|
src.assert_regclass_is(RegClass::V128);
|
||||||
debug_assert!(dst.to_reg().get_class() == RegClass::V128);
|
debug_assert!(dst.to_reg().get_class() == RegClass::V128);
|
||||||
@@ -1391,6 +1405,13 @@ impl PrettyPrint for Inst {
|
|||||||
show_ireg_sized(dst.to_reg(), mb_rru, 8),
|
show_ireg_sized(dst.to_reg(), mb_rru, 8),
|
||||||
),
|
),
|
||||||
|
|
||||||
|
Inst::XmmUnaryRmREvex { op, src, dst, .. } => format!(
|
||||||
|
"{} {}, {}",
|
||||||
|
ljustify(op.to_string()),
|
||||||
|
src.show_rru_sized(mb_rru, 8),
|
||||||
|
show_ireg_sized(dst.to_reg(), mb_rru, 8),
|
||||||
|
),
|
||||||
|
|
||||||
Inst::XmmMovRM { op, src, dst, .. } => format!(
|
Inst::XmmMovRM { op, src, dst, .. } => format!(
|
||||||
"{} {}, {}",
|
"{} {}, {}",
|
||||||
ljustify(op.to_string()),
|
ljustify(op.to_string()),
|
||||||
@@ -1863,7 +1884,9 @@ fn x64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
|
|||||||
collector.add_def(Writable::from_reg(regs::rdx()));
|
collector.add_def(Writable::from_reg(regs::rdx()));
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
Inst::UnaryRmR { src, dst, .. } | Inst::XmmUnaryRmR { src, dst, .. } => {
|
Inst::UnaryRmR { src, dst, .. }
|
||||||
|
| Inst::XmmUnaryRmR { src, dst, .. }
|
||||||
|
| Inst::XmmUnaryRmREvex { src, dst, .. } => {
|
||||||
src.get_regs_as_uses(collector);
|
src.get_regs_as_uses(collector);
|
||||||
collector.add_def(*dst);
|
collector.add_def(*dst);
|
||||||
}
|
}
|
||||||
@@ -2210,6 +2233,11 @@ fn x64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
|
|||||||
ref mut dst,
|
ref mut dst,
|
||||||
..
|
..
|
||||||
}
|
}
|
||||||
|
| Inst::XmmUnaryRmREvex {
|
||||||
|
ref mut src,
|
||||||
|
ref mut dst,
|
||||||
|
..
|
||||||
|
}
|
||||||
| Inst::UnaryRmR {
|
| Inst::UnaryRmR {
|
||||||
ref mut src,
|
ref mut src,
|
||||||
ref mut dst,
|
ref mut dst,
|
||||||
|
|||||||
@@ -1855,10 +1855,13 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
|
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
|
||||||
let ty = ty.unwrap();
|
let ty = ty.unwrap();
|
||||||
if ty == types::I64X2 {
|
if ty == types::I64X2 {
|
||||||
// This lowering could be a single instruction with AVX512F/VL's VPABSQ instruction.
|
if isa_flags.use_avx512f_simd() || isa_flags.use_avx512vl_simd() {
|
||||||
// Instead, we use a separate register, `tmp`, to contain the results of `0 - src`
|
ctx.emit(Inst::xmm_unary_rm_r_evex(Avx512Opcode::Vpabsq, src, dst));
|
||||||
// and then blend in those results with `BLENDVPD` if the MSB of `tmp` was set to 1
|
} else {
|
||||||
// (i.e. if `tmp` was negative or, conversely, if `src` was originally positive).
|
// If `VPABSQ` from AVX512 is unavailable, we use a separate register, `tmp`, to
|
||||||
|
// contain the results of `0 - src` and then blend in those results with
|
||||||
|
// `BLENDVPD` if the MSB of `tmp` was set to 1 (i.e. if `tmp` was negative or,
|
||||||
|
// conversely, if `src` was originally positive).
|
||||||
|
|
||||||
// Emit all 0s into the `tmp` register.
|
// Emit all 0s into the `tmp` register.
|
||||||
let tmp = ctx.alloc_tmp(ty).only_reg().unwrap();
|
let tmp = ctx.alloc_tmp(ty).only_reg().unwrap();
|
||||||
@@ -1874,6 +1877,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
ty,
|
ty,
|
||||||
));
|
));
|
||||||
ctx.emit(Inst::xmm_rm_r(SseOpcode::Blendvpd, src, dst));
|
ctx.emit(Inst::xmm_rm_r(SseOpcode::Blendvpd, src, dst));
|
||||||
|
}
|
||||||
} else if ty.is_vector() {
|
} else if ty.is_vector() {
|
||||||
let opcode = match ty {
|
let opcode = match ty {
|
||||||
types::I8X16 => SseOpcode::Pabsb,
|
types::I8X16 => SseOpcode::Pabsb,
|
||||||
|
|||||||
Reference in New Issue
Block a user