Remove the old x86 backend

bjorn3
2021-06-18 17:28:55 +02:00
parent e989caf337
commit 9e34df33b9
246 changed files with 76 additions and 28804 deletions

File diff suppressed because it is too large.


@@ -1,723 +0,0 @@
#![allow(non_snake_case)]
use crate::cdsl::instructions::{
AllInstructions, InstructionBuilder as Inst, InstructionGroup, InstructionGroupBuilder,
};
use crate::cdsl::operands::Operand;
use crate::cdsl::types::ValueType;
use crate::cdsl::typevar::{Interval, TypeSetBuilder, TypeVar};
use crate::shared::entities::EntityRefs;
use crate::shared::formats::Formats;
use crate::shared::immediates::Immediates;
use crate::shared::types;
#[allow(clippy::many_single_char_names)]
pub(crate) fn define(
mut all_instructions: &mut AllInstructions,
formats: &Formats,
immediates: &Immediates,
entities: &EntityRefs,
) -> InstructionGroup {
let mut ig = InstructionGroupBuilder::new(&mut all_instructions);
let iflags: &TypeVar = &ValueType::Special(types::Flag::IFlags.into()).into();
let iWord = &TypeVar::new(
"iWord",
"A scalar integer machine word",
TypeSetBuilder::new().ints(32..64).build(),
);
let nlo = &Operand::new("nlo", iWord).with_doc("Low part of numerator");
let nhi = &Operand::new("nhi", iWord).with_doc("High part of numerator");
let d = &Operand::new("d", iWord).with_doc("Denominator");
let q = &Operand::new("q", iWord).with_doc("Quotient");
let r = &Operand::new("r", iWord).with_doc("Remainder");
ig.push(
Inst::new(
"x86_udivmodx",
r#"
Extended unsigned division.
Concatenate the bits in `nhi` and `nlo` to form the numerator.
Interpret the bits as an unsigned number and divide by the unsigned
denominator `d`. Trap when `d` is zero or if the quotient is larger
than the range of the output.
Return both quotient and remainder.
"#,
&formats.ternary,
)
.operands_in(vec![nlo, nhi, d])
.operands_out(vec![q, r])
.can_trap(true),
);
ig.push(
Inst::new(
"x86_sdivmodx",
r#"
Extended signed division.
Concatenate the bits in `nhi` and `nlo` to form the numerator.
Interpret the bits as a signed number and divide by the signed
denominator `d`. Trap when `d` is zero or if the quotient is outside
the range of the output.
Return both quotient and remainder.
"#,
&formats.ternary,
)
.operands_in(vec![nlo, nhi, d])
.operands_out(vec![q, r])
.can_trap(true),
);
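// A minimal scalar sketch (not part of the original file) of the udivmodx
// semantics above, modelling the 64-bit case with u128 arithmetic: the
// numerator is (nhi << 64) | nlo, and `None` stands in for a trap.
fn udivmodx_model(nlo: u64, nhi: u64, d: u64) -> Option<(u64, u64)> {
    if d == 0 {
        return None; // division by zero traps
    }
    let n = ((nhi as u128) << 64) | nlo as u128;
    let q = n / d as u128;
    if q > u64::MAX as u128 {
        return None; // quotient exceeds the output range, traps
    }
    Some((q as u64, (n % d as u128) as u64))
}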
let argL = &Operand::new("argL", iWord);
let argR = &Operand::new("argR", iWord);
let resLo = &Operand::new("resLo", iWord);
let resHi = &Operand::new("resHi", iWord);
ig.push(
Inst::new(
"x86_umulx",
r#"
Unsigned integer multiplication, producing a double-length result.
Polymorphic over all scalar integer types, but does not support vector
types.
"#,
&formats.binary,
)
.operands_in(vec![argL, argR])
.operands_out(vec![resLo, resHi]),
);
ig.push(
Inst::new(
"x86_smulx",
r#"
Signed integer multiplication, producing a double-length result.
Polymorphic over all scalar integer types, but does not support vector
types.
"#,
&formats.binary,
)
.operands_in(vec![argL, argR])
.operands_out(vec![resLo, resHi]),
);
let Float = &TypeVar::new(
"Float",
"A scalar or vector floating point number",
TypeSetBuilder::new()
.floats(Interval::All)
.simd_lanes(Interval::All)
.build(),
);
let IntTo = &TypeVar::new(
"IntTo",
"An integer type with the same number of lanes",
TypeSetBuilder::new()
.ints(32..64)
.simd_lanes(Interval::All)
.build(),
);
let x = &Operand::new("x", Float);
let a = &Operand::new("a", IntTo);
ig.push(
Inst::new(
"x86_cvtt2si",
r#"
Convert with truncation floating point to signed integer.
The source floating point operand is converted to a signed integer by
rounding towards zero. If the result can't be represented in the output
type, returns the smallest signed value the output type can represent.
This instruction does not trap.
"#,
&formats.unary,
)
.operands_in(vec![x])
.operands_out(vec![a]),
);
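// Illustrative model (an assumption, not original code) of the truncating
// conversion above for the f64-to-i32 case: round toward zero, and fold
// every unrepresentable input (NaN, overflow) to i32::MIN, x86's "integer
// indefinite" value.
fn cvtt2si_model(x: f64) -> i32 {
    let t = x.trunc();
    if t.is_nan() || t < i32::MIN as f64 || t > i32::MAX as f64 {
        i32::MIN
    } else {
        t as i32
    }
}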
let f32x4 = &TypeVar::new(
"f32x4",
"A floating point number",
TypeSetBuilder::new()
.floats(32..32)
.simd_lanes(4..4)
.build(),
);
let i32x4 = &TypeVar::new(
"i32x4",
"An integer type with the same number of lanes",
TypeSetBuilder::new().ints(32..32).simd_lanes(4..4).build(),
);
let x = &Operand::new("x", i32x4);
let a = &Operand::new("a", f32x4);
ig.push(
Inst::new(
"x86_vcvtudq2ps",
r#"
Convert unsigned integer to floating point.
Convert packed doubleword unsigned integers to packed single-precision floating-point
values. This instruction does not trap.
"#,
&formats.unary,
)
.operands_in(vec![x])
.operands_out(vec![a]),
);
let x = &Operand::new("x", Float);
let a = &Operand::new("a", Float);
let y = &Operand::new("y", Float);
ig.push(
Inst::new(
"x86_fmin",
r#"
Floating point minimum with x86 semantics.
This is equivalent to the C ternary operator `x < y ? x : y` which
differs from `fmin` when either operand is NaN or when comparing
+0.0 to -0.0.
When the two operands don't compare as LT, `y` is returned unchanged,
even if it is a signalling NaN.
"#,
&formats.binary,
)
.operands_in(vec![x, y])
.operands_out(vec![a]),
);
ig.push(
Inst::new(
"x86_fmax",
r#"
Floating point maximum with x86 semantics.
This is equivalent to the C ternary operator `x > y ? x : y` which
differs from `fmax` when either operand is NaN or when comparing
+0.0 to -0.0.
When the two operands don't compare as GT, `y` is returned unchanged,
even if it is a signalling NaN.
"#,
&formats.binary,
)
.operands_in(vec![x, y])
.operands_out(vec![a]),
);
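// Scalar model of the x86 min semantics just described; `x86_fmin_model` is
// an illustrative name, not original code. Note the asymmetry:
// x86_fmin_model(1.0, f64::NAN) is NaN, while f64::min(1.0, f64::NAN) is 1.0.
fn x86_fmin_model(x: f64, y: f64) -> f64 {
    // When the operands do not compare as LT (NaN involved, or -0.0 vs +0.0),
    // `y` is returned unchanged.
    if x < y { x } else { y }
}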
let x = &Operand::new("x", iWord);
ig.push(
Inst::new(
"x86_push",
r#"
Pushes a value onto the stack.
Decrements the stack pointer and stores the specified value onto the top.
This is polymorphic in i32 and i64. However, it is only implemented for i64
in 64-bit mode, and only for i32 in 32-bit mode.
"#,
&formats.unary,
)
.operands_in(vec![x])
.other_side_effects(true)
.can_store(true),
);
ig.push(
Inst::new(
"x86_pop",
r#"
Pops a value from the stack.
Loads a value from the top of the stack and then increments the stack
pointer.
This is polymorphic in i32 and i64. However, it is only implemented for i64
in 64-bit mode, and only for i32 in 32-bit mode.
"#,
&formats.nullary,
)
.operands_out(vec![x])
.other_side_effects(true)
.can_load(true),
);
let y = &Operand::new("y", iWord);
let rflags = &Operand::new("rflags", iflags);
ig.push(
Inst::new(
"x86_bsr",
r#"
Bit Scan Reverse -- returns the bit-index of the most significant 1
in the word. Result is undefined if the argument is zero. However, it
sets the Z flag depending on the argument, so it is at least easy to
detect and handle that case.
This is polymorphic in i32 and i64. It is implemented for both i64 and
i32 in 64-bit mode, and only for i32 in 32-bit mode.
"#,
&formats.unary,
)
.operands_in(vec![x])
.operands_out(vec![y, rflags]),
);
ig.push(
Inst::new(
"x86_bsf",
r#"
Bit Scan Forwards -- returns the bit-index of the least significant 1
in the word. Is otherwise identical to 'bsr', just above.
"#,
&formats.unary,
)
.operands_in(vec![x])
.operands_out(vec![y, rflags]),
);
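// Scalar models (illustrative, not original code) of the two instructions:
// both are undefined at zero, which the flags output is used to detect.
fn bsr_model(x: u64) -> Option<u32> {
    if x == 0 { None } else { Some(63 - x.leading_zeros()) }
}
fn bsf_model(x: u64) -> Option<u32> {
    if x == 0 { None } else { Some(x.trailing_zeros()) }
}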
let uimm8 = &immediates.uimm8;
let TxN = &TypeVar::new(
"TxN",
"A SIMD vector type",
TypeSetBuilder::new()
.ints(Interval::All)
.floats(Interval::All)
.bools(Interval::All)
.simd_lanes(Interval::All)
.includes_scalars(false)
.build(),
);
let a = &Operand::new("a", TxN).with_doc("A vector value (i.e. held in an XMM register)");
let b = &Operand::new("b", TxN).with_doc("A vector value (i.e. held in an XMM register)");
let i = &Operand::new("i", uimm8).with_doc("An ordering operand controlling the copying of data from the source to the destination; see PSHUFD in Intel manual for details");
ig.push(
Inst::new(
"x86_pshufd",
r#"
Packed Shuffle Doublewords -- copies data from either memory or lanes in an extended
register and re-orders the data according to the passed immediate byte.
"#,
&formats.binary_imm8,
)
.operands_in(vec![a, i]) // TODO allow copying from memory here (need more permissive type than TxN)
.operands_out(vec![a]),
);
ig.push(
Inst::new(
"x86_pshufb",
r#"
Packed Shuffle Bytes -- re-orders data in an extended register using a shuffle
mask from either memory or another extended register
"#,
&formats.binary,
)
.operands_in(vec![a, b]) // TODO allow re-ordering from memory here (need more permissive type than TxN)
.operands_out(vec![a]),
);
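// Byte-level model (illustrative, not original code) of the PSHUFB shuffle
// used above: a set high bit in a mask byte zeroes that lane, otherwise the
// low four bits select the source byte.
fn pshufb_model(a: [u8; 16], mask: [u8; 16]) -> [u8; 16] {
    let mut out = [0u8; 16];
    for i in 0..16 {
        if mask[i] & 0x80 == 0 {
            out[i] = a[(mask[i] & 0x0f) as usize];
        }
    }
    out
}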
let mask = &Operand::new("mask", uimm8).with_doc("mask to select lanes from b");
ig.push(
Inst::new(
"x86_pblendw",
r#"
Blend packed words using an immediate mask. Each bit of the 8-bit immediate corresponds to a
lane in ``b``: if the bit is set, the lane is copied into ``a``.
"#,
&formats.ternary_imm8,
)
.operands_in(vec![a, b, mask])
.operands_out(vec![a]),
);
let Idx = &Operand::new("Idx", uimm8).with_doc("Lane index");
let x = &Operand::new("x", TxN);
let a = &Operand::new("a", &TxN.lane_of());
ig.push(
Inst::new(
"x86_pextr",
r#"
Extract lane ``Idx`` from ``x``.
The lane index, ``Idx``, is an immediate value, not an SSA value. It
must indicate a valid lane index for the type of ``x``.
"#,
&formats.binary_imm8,
)
.operands_in(vec![x, Idx])
.operands_out(vec![a]),
);
let IBxN = &TypeVar::new(
"IBxN",
"A SIMD vector type containing only booleans and integers",
TypeSetBuilder::new()
.ints(Interval::All)
.bools(Interval::All)
.simd_lanes(Interval::All)
.includes_scalars(false)
.build(),
);
let x = &Operand::new("x", IBxN);
let y = &Operand::new("y", &IBxN.lane_of()).with_doc("New lane value");
let a = &Operand::new("a", IBxN);
ig.push(
Inst::new(
"x86_pinsr",
r#"
Insert ``y`` into ``x`` at lane ``Idx``.
The lane index, ``Idx``, is an immediate value, not an SSA value. It
must indicate a valid lane index for the type of ``x``.
"#,
&formats.ternary_imm8,
)
.operands_in(vec![x, y, Idx])
.operands_out(vec![a]),
);
let FxN = &TypeVar::new(
"FxN",
"A SIMD vector type containing floats",
TypeSetBuilder::new()
.floats(Interval::All)
.simd_lanes(Interval::All)
.includes_scalars(false)
.build(),
);
let x = &Operand::new("x", FxN);
let y = &Operand::new("y", &FxN.lane_of()).with_doc("New lane value");
let a = &Operand::new("a", FxN);
ig.push(
Inst::new(
"x86_insertps",
r#"
Insert a lane of ``y`` into ``x``, using ``Idx`` to encode both which lane the value is
extracted from and which it is inserted to. This is similar to x86_pinsr but inserts
floats, which are already stored in an XMM register.
"#,
&formats.ternary_imm8,
)
.operands_in(vec![x, y, Idx])
.operands_out(vec![a]),
);
let x = &Operand::new("x", TxN);
let y = &Operand::new("y", TxN);
let a = &Operand::new("a", TxN);
ig.push(
Inst::new(
"x86_punpckh",
r#"
Unpack the high-order lanes of ``x`` and ``y`` and interleave into ``a``. With notional
i8x4 vectors, where ``x = [x3, x2, x1, x0]`` and ``y = [y3, y2, y1, y0]``, this operation
would result in ``a = [y3, x3, y2, x2]`` (using the Intel manual's right-to-left lane
ordering).
"#,
&formats.binary,
)
.operands_in(vec![x, y])
.operands_out(vec![a]),
);
ig.push(
Inst::new(
"x86_punpckl",
r#"
Unpack the low-order lanes of ``x`` and ``y`` and interleave into ``a``. With notional
i8x4 vectors, where ``x = [x3, x2, x1, x0]`` and ``y = [y3, y2, y1, y0]``, this operation
would result in ``a = [y1, x1, y0, x0]`` (using the Intel manual's right-to-left lane
ordering).
"#,
&formats.binary,
)
.operands_in(vec![x, y])
.operands_out(vec![a]),
);
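// Model of the notional i8x4 example in the docs above (illustrative, not
// original code); arrays are indexed lane 0 first, i.e. the rightmost lane
// in the Intel manual's ordering.
fn punpckl_model(x: [u8; 4], y: [u8; 4]) -> [u8; 4] {
    [x[0], y[0], x[1], y[1]] // == [y1, x1, y0, x0] right-to-left
}
fn punpckh_model(x: [u8; 4], y: [u8; 4]) -> [u8; 4] {
    [x[2], y[2], x[3], y[3]] // == [y3, x3, y2, x2] right-to-left
}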
let x = &Operand::new("x", FxN);
let y = &Operand::new("y", FxN);
let a = &Operand::new("a", FxN);
ig.push(
Inst::new(
"x86_movsd",
r#"
Move the low 64 bits of the float vector ``y`` to the low 64 bits of float vector ``x``
"#,
&formats.binary,
)
.operands_in(vec![x, y])
.operands_out(vec![a]),
);
ig.push(
Inst::new(
"x86_movlhps",
r#"
Move the low 64 bits of the float vector ``y`` to the high 64 bits of float vector ``x``
"#,
&formats.binary,
)
.operands_in(vec![x, y])
.operands_out(vec![a]),
);
let IxN = &TypeVar::new(
"IxN",
"A SIMD vector type containing integers",
TypeSetBuilder::new()
.ints(Interval::All)
.simd_lanes(Interval::All)
.includes_scalars(false)
.build(),
);
let I128 = &TypeVar::new(
"I128",
"A SIMD vector type containing one large integer (due to Cranelift type constraints, \
this uses the Cranelift I64X2 type but should be understood as one large value, i.e., the \
upper lane is concatenated with the lower lane to form the integer)",
TypeSetBuilder::new()
.ints(64..64)
.simd_lanes(2..2)
.includes_scalars(false)
.build(),
);
let x = &Operand::new("x", IxN).with_doc("Vector value to shift");
let y = &Operand::new("y", I128).with_doc("Number of bits to shift");
let a = &Operand::new("a", IxN);
ig.push(
Inst::new(
"x86_psll",
r#"
Shift Packed Data Left Logical -- This implements the behavior of the shared instruction
``ishl`` but alters the shift operand to live in an XMM register as expected by the PSLL*
family of instructions.
"#,
&formats.binary,
)
.operands_in(vec![x, y])
.operands_out(vec![a]),
);
ig.push(
Inst::new(
"x86_psrl",
r#"
Shift Packed Data Right Logical -- This implements the behavior of the shared instruction
``ushr`` but alters the shift operand to live in an XMM register as expected by the PSRL*
family of instructions.
"#,
&formats.binary,
)
.operands_in(vec![x, y])
.operands_out(vec![a]),
);
ig.push(
Inst::new(
"x86_psra",
r#"
Shift Packed Data Right Arithmetic -- This implements the behavior of the shared
instruction ``sshr`` but alters the shift operand to live in an XMM register as expected by
the PSRA* family of instructions.
"#,
&formats.binary,
)
.operands_in(vec![x, y])
.operands_out(vec![a]),
);
let I64x2 = &TypeVar::new(
"I64x2",
"A SIMD vector type containing two 64-bit integers",
TypeSetBuilder::new()
.ints(64..64)
.simd_lanes(2..2)
.includes_scalars(false)
.build(),
);
let x = &Operand::new("x", I64x2);
let y = &Operand::new("y", I64x2);
let a = &Operand::new("a", I64x2);
ig.push(
Inst::new(
"x86_pmullq",
r#"
Multiply Packed Integers -- Multiply two 64x2 integers and receive a 64x2 result with
lane-wise wrapping if the result overflows. This instruction is necessary to add distinct
encodings for CPUs with newer vector features.
"#,
&formats.binary,
)
.operands_in(vec![x, y])
.operands_out(vec![a]),
);
ig.push(
Inst::new(
"x86_pmuludq",
r#"
Multiply Packed Integers -- Using only the bottom 32 bits in each lane, multiply two 64x2
unsigned integers and receive a 64x2 result. This instruction avoids the need for handling
overflow as in `x86_pmullq`.
"#,
&formats.binary,
)
.operands_in(vec![x, y])
.operands_out(vec![a]),
);
let x = &Operand::new("x", TxN);
let y = &Operand::new("y", TxN);
let f = &Operand::new("f", iflags);
ig.push(
Inst::new(
"x86_ptest",
r#"
Logical Compare -- PTEST sets the ZF flag if the bitwise AND of the first
operand and the second source operand is all zeros, and sets the CF flag if
the bitwise AND of the second source operand and the logical NOT of the
first operand is all zeros.
"#,
&formats.binary,
)
.operands_in(vec![x, y])
.operands_out(vec![f]),
);
let x = &Operand::new("x", IxN);
let y = &Operand::new("y", IxN);
let a = &Operand::new("a", IxN);
ig.push(
Inst::new(
"x86_pmaxs",
r#"
Maximum of Packed Signed Integers -- Compare signed integers in the first and second
operand and return the maximum values.
"#,
&formats.binary,
)
.operands_in(vec![x, y])
.operands_out(vec![a]),
);
ig.push(
Inst::new(
"x86_pmaxu",
r#"
Maximum of Packed Unsigned Integers -- Compare unsigned integers in the first and second
operand and return the maximum values.
"#,
&formats.binary,
)
.operands_in(vec![x, y])
.operands_out(vec![a]),
);
ig.push(
Inst::new(
"x86_pmins",
r#"
Minimum of Packed Signed Integers -- Compare signed integers in the first and second
operand and return the minimum values.
"#,
&formats.binary,
)
.operands_in(vec![x, y])
.operands_out(vec![a]),
);
ig.push(
Inst::new(
"x86_pminu",
r#"
Minimum of Packed Unsigned Integers -- Compare unsigned integers in the first and second
operand and return the minimum values.
"#,
&formats.binary,
)
.operands_in(vec![x, y])
.operands_out(vec![a]),
);
let c = &Operand::new("c", uimm8)
.with_doc("The number of bytes to shift right; see PALIGNR in Intel manual for details");
ig.push(
Inst::new(
"x86_palignr",
r#"
Concatenate destination and source operands, extracting a byte-aligned result shifted to
the right by `c`.
"#,
&formats.ternary_imm8,
)
.operands_in(vec![x, y, c])
.operands_out(vec![a]),
);
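// Byte-level model of PALIGNR (illustrative, not original code): the two
// operands are concatenated with `x` in the high half, shifted right by `c`
// bytes, and the low 16 bytes are kept.
fn palignr_model(x: [u8; 16], y: [u8; 16], c: usize) -> [u8; 16] {
    let mut concat = [0u8; 32];
    concat[..16].copy_from_slice(&y);
    concat[16..].copy_from_slice(&x);
    let mut out = [0u8; 16];
    for i in 0..16 {
        if c + i < 32 {
            out[i] = concat[c + i];
        }
    }
    out
}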
let i64_t = &TypeVar::new(
"i64_t",
"A scalar 64bit integer",
TypeSetBuilder::new().ints(64..64).build(),
);
let GV = &Operand::new("GV", &entities.global_value);
let addr = &Operand::new("addr", i64_t);
ig.push(
Inst::new(
"x86_elf_tls_get_addr",
r#"
Elf tls get addr -- This implements the GD TLS model for ELF. The clobber output should
not be used.
"#,
&formats.unary_global_value,
)
// This is a bit overly broad to mark as clobbering *all* the registers, because it should
// only preserve caller-saved registers. There's no way to indicate this to register
// allocation yet, though, so mark as clobbering all registers instead.
.clobbers_all_regs(true)
.operands_in(vec![GV])
.operands_out(vec![addr]),
);
ig.push(
Inst::new(
"x86_macho_tls_get_addr",
r#"
Mach-O tls get addr -- This implements TLS access for Mach-O. The clobber output should
not be used.
"#,
&formats.unary_global_value,
)
// See above comment for x86_elf_tls_get_addr.
.clobbers_all_regs(true)
.operands_in(vec![GV])
.operands_out(vec![addr]),
);
ig.build()
}


@@ -1,827 +0,0 @@
use crate::cdsl::ast::{constant, var, ExprBuilder, Literal};
use crate::cdsl::instructions::{vector, Bindable, InstructionGroup};
use crate::cdsl::types::{LaneType, ValueType};
use crate::cdsl::xform::TransformGroupBuilder;
use crate::shared::types::Float::{F32, F64};
use crate::shared::types::Int::{I16, I32, I64, I8};
use crate::shared::Definitions as SharedDefinitions;
#[allow(clippy::many_single_char_names)]
pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &InstructionGroup) {
let mut expand = TransformGroupBuilder::new(
"x86_expand",
r#"
Legalize instructions by expansion.
Use x86-specific instructions if needed."#,
)
.isa("x86")
.chain_with(shared.transform_groups.by_name("expand_flags").id);
let mut narrow = TransformGroupBuilder::new(
"x86_narrow",
r#"
Legalize instructions by narrowing.
Use x86-specific instructions if needed."#,
)
.isa("x86")
.chain_with(shared.transform_groups.by_name("narrow_flags").id);
let mut narrow_avx = TransformGroupBuilder::new(
"x86_narrow_avx",
r#"
Legalize instructions by narrowing with CPU feature checks.
This special case converts using x86 AVX instructions where available."#,
)
.isa("x86");
// We cannot chain with the x86_narrow group until this group is built, see bottom of this
// function for where this is chained.
let mut widen = TransformGroupBuilder::new(
"x86_widen",
r#"
Legalize instructions by widening.
Use x86-specific instructions if needed."#,
)
.isa("x86")
.chain_with(shared.transform_groups.by_name("widen").id);
// List of instructions.
let insts = &shared.instructions;
let band = insts.by_name("band");
let bor = insts.by_name("bor");
let clz = insts.by_name("clz");
let ctz = insts.by_name("ctz");
let fcmp = insts.by_name("fcmp");
let fcvt_from_uint = insts.by_name("fcvt_from_uint");
let fcvt_to_sint = insts.by_name("fcvt_to_sint");
let fcvt_to_uint = insts.by_name("fcvt_to_uint");
let fcvt_to_sint_sat = insts.by_name("fcvt_to_sint_sat");
let fcvt_to_uint_sat = insts.by_name("fcvt_to_uint_sat");
let fmax = insts.by_name("fmax");
let fmin = insts.by_name("fmin");
let iadd = insts.by_name("iadd");
let iconst = insts.by_name("iconst");
let imul = insts.by_name("imul");
let ineg = insts.by_name("ineg");
let isub = insts.by_name("isub");
let ishl = insts.by_name("ishl");
let ireduce = insts.by_name("ireduce");
let popcnt = insts.by_name("popcnt");
let sdiv = insts.by_name("sdiv");
let selectif = insts.by_name("selectif");
let smulhi = insts.by_name("smulhi");
let srem = insts.by_name("srem");
let tls_value = insts.by_name("tls_value");
let udiv = insts.by_name("udiv");
let umulhi = insts.by_name("umulhi");
let ushr = insts.by_name("ushr");
let ushr_imm = insts.by_name("ushr_imm");
let urem = insts.by_name("urem");
let x86_bsf = x86_instructions.by_name("x86_bsf");
let x86_bsr = x86_instructions.by_name("x86_bsr");
let x86_umulx = x86_instructions.by_name("x86_umulx");
let x86_smulx = x86_instructions.by_name("x86_smulx");
let imm = &shared.imm;
// Shift by a 64-bit amount is equivalent to a shift by that amount mod 32, so we can reduce
// the size of the shift amount. This is useful for x86_32, where an I64 shift amount is
// not encodable.
let a = var("a");
let x = var("x");
let y = var("y");
let z = var("z");
for &ty in &[I8, I16, I32] {
let ishl_by_i64 = ishl.bind(ty).bind(I64);
let ireduce = ireduce.bind(I32);
expand.legalize(
def!(a = ishl_by_i64(x, y)),
vec![def!(z = ireduce(y)), def!(a = ishl(x, z))],
);
}
for &ty in &[I8, I16, I32] {
let ushr_by_i64 = ushr.bind(ty).bind(I64);
let ireduce = ireduce.bind(I32);
expand.legalize(
def!(a = ushr_by_i64(x, y)),
vec![def!(z = ireduce(y)), def!(a = ushr(x, z))],
);
}
// Division and remainder.
//
// The srem expansion requires custom code because srem INT_MIN, -1 is not
// allowed to trap. The other ops need to check avoid_div_traps.
expand.custom_legalize(sdiv, "expand_sdivrem");
expand.custom_legalize(srem, "expand_sdivrem");
expand.custom_legalize(udiv, "expand_udivrem");
expand.custom_legalize(urem, "expand_udivrem");
// Double length (widening) multiplication.
let a = var("a");
let x = var("x");
let y = var("y");
let a1 = var("a1");
let a2 = var("a2");
let res_lo = var("res_lo");
let res_hi = var("res_hi");
expand.legalize(
def!(res_hi = umulhi(x, y)),
vec![def!((res_lo, res_hi) = x86_umulx(x, y))],
);
expand.legalize(
def!(res_hi = smulhi(x, y)),
vec![def!((res_lo, res_hi) = x86_smulx(x, y))],
);
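// Scalar sketch of what these rewrites rely on (illustrative, not original
// code): the widening multiply yields the double-length product in two
// halves, and umulhi/smulhi are just the high half.
fn umulx_model(x: u64, y: u64) -> (u64, u64) {
    let wide = (x as u128) * (y as u128);
    (wide as u64, (wide >> 64) as u64) // (res_lo, res_hi)
}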
// Floating point condition codes.
//
// The 8 condition codes in `supported_floatccs` are directly supported by a
// `ucomiss` or `ucomisd` instruction. The remaining codes need legalization
// patterns.
let floatcc_eq = Literal::enumerator_for(&imm.floatcc, "eq");
let floatcc_ord = Literal::enumerator_for(&imm.floatcc, "ord");
let floatcc_ueq = Literal::enumerator_for(&imm.floatcc, "ueq");
let floatcc_ne = Literal::enumerator_for(&imm.floatcc, "ne");
let floatcc_uno = Literal::enumerator_for(&imm.floatcc, "uno");
let floatcc_one = Literal::enumerator_for(&imm.floatcc, "one");
// Equality needs an explicit `ord` test which checks the parity bit.
expand.legalize(
def!(a = fcmp(floatcc_eq, x, y)),
vec![
def!(a1 = fcmp(floatcc_ord, x, y)),
def!(a2 = fcmp(floatcc_ueq, x, y)),
def!(a = band(a1, a2)),
],
);
expand.legalize(
def!(a = fcmp(floatcc_ne, x, y)),
vec![
def!(a1 = fcmp(floatcc_uno, x, y)),
def!(a2 = fcmp(floatcc_one, x, y)),
def!(a = bor(a1, a2)),
],
);
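// Scalar model of the two decompositions above (illustrative, not original
// code): eq == ord && ueq, ne == uno || one.
fn fcmp_eq_model(x: f64, y: f64) -> bool {
    let ord = x.partial_cmp(&y).is_some(); // neither operand is NaN
    let ueq = !(x < y) && !(x > y); // unordered or equal
    ord && ueq
}
fn fcmp_ne_model(x: f64, y: f64) -> bool {
    let uno = x.partial_cmp(&y).is_none();
    let one = x < y || x > y; // ordered and not equal
    uno || one
}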
let floatcc_lt = &Literal::enumerator_for(&imm.floatcc, "lt");
let floatcc_gt = &Literal::enumerator_for(&imm.floatcc, "gt");
let floatcc_le = &Literal::enumerator_for(&imm.floatcc, "le");
let floatcc_ge = &Literal::enumerator_for(&imm.floatcc, "ge");
let floatcc_ugt = &Literal::enumerator_for(&imm.floatcc, "ugt");
let floatcc_ult = &Literal::enumerator_for(&imm.floatcc, "ult");
let floatcc_uge = &Literal::enumerator_for(&imm.floatcc, "uge");
let floatcc_ule = &Literal::enumerator_for(&imm.floatcc, "ule");
// Inequalities that need to be reversed.
for &(cc, rev_cc) in &[
(floatcc_lt, floatcc_gt),
(floatcc_le, floatcc_ge),
(floatcc_ugt, floatcc_ult),
(floatcc_uge, floatcc_ule),
] {
expand.legalize(def!(a = fcmp(cc, x, y)), vec![def!(a = fcmp(rev_cc, y, x))]);
}
// We need to modify the CFG for min/max legalization.
expand.custom_legalize(fmin, "expand_minmax");
expand.custom_legalize(fmax, "expand_minmax");
// Conversions from unsigned need special handling.
expand.custom_legalize(fcvt_from_uint, "expand_fcvt_from_uint");
// Conversions from float to int can trap and modify the control flow graph.
expand.custom_legalize(fcvt_to_sint, "expand_fcvt_to_sint");
expand.custom_legalize(fcvt_to_uint, "expand_fcvt_to_uint");
expand.custom_legalize(fcvt_to_sint_sat, "expand_fcvt_to_sint_sat");
expand.custom_legalize(fcvt_to_uint_sat, "expand_fcvt_to_uint_sat");
// Count leading and trailing zeroes, for baseline x86_64
let c_minus_one = var("c_minus_one");
let c_thirty_one = var("c_thirty_one");
let c_thirty_two = var("c_thirty_two");
let c_sixty_three = var("c_sixty_three");
let c_sixty_four = var("c_sixty_four");
let index1 = var("index1");
let r2flags = var("r2flags");
let index2 = var("index2");
let intcc_eq = Literal::enumerator_for(&imm.intcc, "eq");
let imm64_minus_one = Literal::constant(&imm.imm64, -1);
let imm64_63 = Literal::constant(&imm.imm64, 63);
expand.legalize(
def!(a = clz.I64(x)),
vec![
def!(c_minus_one = iconst(imm64_minus_one)),
def!(c_sixty_three = iconst(imm64_63)),
def!((index1, r2flags) = x86_bsr(x)),
def!(index2 = selectif(intcc_eq, r2flags, c_minus_one, index1)),
def!(a = isub(c_sixty_three, index2)),
],
);
let imm64_31 = Literal::constant(&imm.imm64, 31);
expand.legalize(
def!(a = clz.I32(x)),
vec![
def!(c_minus_one = iconst(imm64_minus_one)),
def!(c_thirty_one = iconst(imm64_31)),
def!((index1, r2flags) = x86_bsr(x)),
def!(index2 = selectif(intcc_eq, r2flags, c_minus_one, index1)),
def!(a = isub(c_thirty_one, index2)),
],
);
let imm64_64 = Literal::constant(&imm.imm64, 64);
expand.legalize(
def!(a = ctz.I64(x)),
vec![
def!(c_sixty_four = iconst(imm64_64)),
def!((index1, r2flags) = x86_bsf(x)),
def!(a = selectif(intcc_eq, r2flags, c_sixty_four, index1)),
],
);
let imm64_32 = Literal::constant(&imm.imm64, 32);
expand.legalize(
def!(a = ctz.I32(x)),
vec![
def!(c_thirty_two = iconst(imm64_32)),
def!((index1, r2flags) = x86_bsf(x)),
def!(a = selectif(intcc_eq, r2flags, c_thirty_two, index1)),
],
);
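// Scalar model of the clz/ctz rewrites above (illustrative, not original
// code): BSR/BSF leave their result undefined at zero but set ZF, so the
// selectif substitutes a constant that makes the arithmetic come out right.
fn clz64_model(x: u64) -> u64 {
    let index = if x == 0 { -1 } else { i64::from(63 - x.leading_zeros()) };
    (63 - index) as u64 // yields 64 when x == 0
}
fn ctz64_model(x: u64) -> u64 {
    if x == 0 { 64 } else { u64::from(x.trailing_zeros()) }
}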
// Population count for baseline x86_64
let x = var("x");
let r = var("r");
let qv3 = var("qv3");
let qv4 = var("qv4");
let qv5 = var("qv5");
let qv6 = var("qv6");
let qv7 = var("qv7");
let qv8 = var("qv8");
let qv9 = var("qv9");
let qv10 = var("qv10");
let qv11 = var("qv11");
let qv12 = var("qv12");
let qv13 = var("qv13");
let qv14 = var("qv14");
let qv15 = var("qv15");
let qc77 = var("qc77");
#[allow(non_snake_case)]
let qc0F = var("qc0F");
let qc01 = var("qc01");
let imm64_1 = Literal::constant(&imm.imm64, 1);
let imm64_4 = Literal::constant(&imm.imm64, 4);
expand.legalize(
def!(r = popcnt.I64(x)),
vec![
def!(qv3 = ushr_imm(x, imm64_1)),
def!(qc77 = iconst(Literal::constant(&imm.imm64, 0x7777_7777_7777_7777))),
def!(qv4 = band(qv3, qc77)),
def!(qv5 = isub(x, qv4)),
def!(qv6 = ushr_imm(qv4, imm64_1)),
def!(qv7 = band(qv6, qc77)),
def!(qv8 = isub(qv5, qv7)),
def!(qv9 = ushr_imm(qv7, imm64_1)),
def!(qv10 = band(qv9, qc77)),
def!(qv11 = isub(qv8, qv10)),
def!(qv12 = ushr_imm(qv11, imm64_4)),
def!(qv13 = iadd(qv11, qv12)),
def!(qc0F = iconst(Literal::constant(&imm.imm64, 0x0F0F_0F0F_0F0F_0F0F))),
def!(qv14 = band(qv13, qc0F)),
def!(qc01 = iconst(Literal::constant(&imm.imm64, 0x0101_0101_0101_0101))),
def!(qv15 = imul(qv14, qc01)),
def!(r = ushr_imm(qv15, Literal::constant(&imm.imm64, 56))),
],
);
let lv3 = var("lv3");
let lv4 = var("lv4");
let lv5 = var("lv5");
let lv6 = var("lv6");
let lv7 = var("lv7");
let lv8 = var("lv8");
let lv9 = var("lv9");
let lv10 = var("lv10");
let lv11 = var("lv11");
let lv12 = var("lv12");
let lv13 = var("lv13");
let lv14 = var("lv14");
let lv15 = var("lv15");
let lc77 = var("lc77");
#[allow(non_snake_case)]
let lc0F = var("lc0F");
let lc01 = var("lc01");
expand.legalize(
def!(r = popcnt.I32(x)),
vec![
def!(lv3 = ushr_imm(x, imm64_1)),
def!(lc77 = iconst(Literal::constant(&imm.imm64, 0x7777_7777))),
def!(lv4 = band(lv3, lc77)),
def!(lv5 = isub(x, lv4)),
def!(lv6 = ushr_imm(lv4, imm64_1)),
def!(lv7 = band(lv6, lc77)),
def!(lv8 = isub(lv5, lv7)),
def!(lv9 = ushr_imm(lv7, imm64_1)),
def!(lv10 = band(lv9, lc77)),
def!(lv11 = isub(lv8, lv10)),
def!(lv12 = ushr_imm(lv11, imm64_4)),
def!(lv13 = iadd(lv11, lv12)),
def!(lc0F = iconst(Literal::constant(&imm.imm64, 0x0F0F_0F0F))),
def!(lv14 = band(lv13, lc0F)),
def!(lc01 = iconst(Literal::constant(&imm.imm64, 0x0101_0101))),
def!(lv15 = imul(lv14, lc01)),
def!(r = ushr_imm(lv15, Literal::constant(&imm.imm64, 24))),
],
);
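// Standalone model of the expansion above (illustrative, not original code).
// The 0x77.. masks stop each step from borrowing across nibble boundaries,
// so every nibble computes n - n/2 - n/4 - n/8 == popcount(n); the nibbles
// are then folded into bytes and summed horizontally by the multiply.
fn popcnt_model(x: u64) -> u64 {
    let m7 = 0x7777_7777_7777_7777u64;
    let v4 = (x >> 1) & m7;
    let v5 = x - v4;
    let v7 = (v4 >> 1) & m7;
    let v8 = v5 - v7;
    let v10 = (v7 >> 1) & m7;
    let v11 = v8 - v10; // per-nibble popcount
    let v13 = v11 + (v11 >> 4);
    let v14 = v13 & 0x0f0f_0f0f_0f0f_0f0f; // per-byte popcount
    v14.wrapping_mul(0x0101_0101_0101_0101) >> 56 // horizontal byte sum
}
// For any x, popcnt_model(x) == u64::from(x.count_ones()).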
expand.custom_legalize(ineg, "convert_ineg");
expand.custom_legalize(tls_value, "expand_tls_value");
widen.custom_legalize(ineg, "convert_ineg");
// To reduce compilation times, separate out large blocks of legalizations by theme.
define_simd(shared, x86_instructions, &mut narrow, &mut narrow_avx);
expand.build_and_add_to(&mut shared.transform_groups);
let narrow_id = narrow.build_and_add_to(&mut shared.transform_groups);
narrow_avx
.chain_with(narrow_id)
.build_and_add_to(&mut shared.transform_groups);
widen.build_and_add_to(&mut shared.transform_groups);
}
fn define_simd(
shared: &mut SharedDefinitions,
x86_instructions: &InstructionGroup,
narrow: &mut TransformGroupBuilder,
narrow_avx: &mut TransformGroupBuilder,
) {
let insts = &shared.instructions;
let band = insts.by_name("band");
let band_not = insts.by_name("band_not");
let bitcast = insts.by_name("bitcast");
let bitselect = insts.by_name("bitselect");
let bor = insts.by_name("bor");
let bnot = insts.by_name("bnot");
let bxor = insts.by_name("bxor");
let extractlane = insts.by_name("extractlane");
let fabs = insts.by_name("fabs");
let fcmp = insts.by_name("fcmp");
let fcvt_from_uint = insts.by_name("fcvt_from_uint");
let fcvt_to_sint_sat = insts.by_name("fcvt_to_sint_sat");
let fcvt_to_uint_sat = insts.by_name("fcvt_to_uint_sat");
let fmax = insts.by_name("fmax");
let fmin = insts.by_name("fmin");
let fneg = insts.by_name("fneg");
let iadd_imm = insts.by_name("iadd_imm");
let icmp = insts.by_name("icmp");
let imax = insts.by_name("imax");
let imin = insts.by_name("imin");
let imul = insts.by_name("imul");
let ineg = insts.by_name("ineg");
let insertlane = insts.by_name("insertlane");
let ishl = insts.by_name("ishl");
let ishl_imm = insts.by_name("ishl_imm");
let raw_bitcast = insts.by_name("raw_bitcast");
let scalar_to_vector = insts.by_name("scalar_to_vector");
let splat = insts.by_name("splat");
let shuffle = insts.by_name("shuffle");
let sshr = insts.by_name("sshr");
let swizzle = insts.by_name("swizzle");
let trueif = insts.by_name("trueif");
let uadd_sat = insts.by_name("uadd_sat");
let umax = insts.by_name("umax");
let umin = insts.by_name("umin");
let snarrow = insts.by_name("snarrow");
let swiden_high = insts.by_name("swiden_high");
let swiden_low = insts.by_name("swiden_low");
let ushr_imm = insts.by_name("ushr_imm");
let ushr = insts.by_name("ushr");
let uwiden_high = insts.by_name("uwiden_high");
let uwiden_low = insts.by_name("uwiden_low");
let vconst = insts.by_name("vconst");
let vall_true = insts.by_name("vall_true");
let vany_true = insts.by_name("vany_true");
let vselect = insts.by_name("vselect");
let x86_palignr = x86_instructions.by_name("x86_palignr");
let x86_pmaxs = x86_instructions.by_name("x86_pmaxs");
let x86_pmaxu = x86_instructions.by_name("x86_pmaxu");
let x86_pmins = x86_instructions.by_name("x86_pmins");
let x86_pminu = x86_instructions.by_name("x86_pminu");
let x86_pshufb = x86_instructions.by_name("x86_pshufb");
let x86_pshufd = x86_instructions.by_name("x86_pshufd");
let x86_psra = x86_instructions.by_name("x86_psra");
let x86_ptest = x86_instructions.by_name("x86_ptest");
let x86_punpckh = x86_instructions.by_name("x86_punpckh");
let x86_punpckl = x86_instructions.by_name("x86_punpckl");
let imm = &shared.imm;
// Set up variables and immediates.
let uimm8_zero = Literal::constant(&imm.uimm8, 0x00);
let uimm8_one = Literal::constant(&imm.uimm8, 0x01);
let uimm8_eight = Literal::constant(&imm.uimm8, 8);
let u128_zeroes = constant(vec![0x00; 16]);
let u128_ones = constant(vec![0xff; 16]);
let u128_seventies = constant(vec![0x70; 16]);
let a = var("a");
let b = var("b");
let c = var("c");
let d = var("d");
let e = var("e");
let f = var("f");
let g = var("g");
let h = var("h");
let x = var("x");
let y = var("y");
let z = var("z");
// Limit the SIMD vector size: eventually multiple vector sizes may be supported
// but for now only SSE-sized vectors are available.
let sse_vector_size: u64 = 128;
let allowed_simd_type = |t: &LaneType| t.lane_bits() >= 8 && t.lane_bits() < 128;
// SIMD splat: 8-bits
for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 8) {
let splat_any8x16 = splat.bind(vector(ty, sse_vector_size));
narrow.legalize(
def!(y = splat_any8x16(x)),
vec![
// Move into the lowest 8 bits of an XMM register.
def!(a = scalar_to_vector(x)),
// Zero out a different XMM register; the shuffle mask for moving the lowest byte
// to all other byte lanes is 0x0.
def!(b = vconst(u128_zeroes)),
// PSHUFB takes two XMM operands, one of which is a shuffle mask (i.e. b).
def!(y = x86_pshufb(a, b)),
],
);
}
// SIMD splat: 16-bits
for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 16) {
let splat_x16x8 = splat.bind(vector(ty, sse_vector_size));
let raw_bitcast_any16x8_to_i32x4 = raw_bitcast
.bind(vector(I32, sse_vector_size))
.bind(vector(ty, sse_vector_size));
let raw_bitcast_i32x4_to_any16x8 = raw_bitcast
.bind(vector(ty, sse_vector_size))
.bind(vector(I32, sse_vector_size));
narrow.legalize(
def!(y = splat_x16x8(x)),
vec![
// Move into the lowest 16 bits of an XMM register.
def!(a = scalar_to_vector(x)),
// Insert the value again but in the next lowest 16 bits.
def!(b = insertlane(a, x, uimm8_one)),
// No instruction emitted; pretend this is an I32x4 so we can use PSHUFD.
def!(c = raw_bitcast_any16x8_to_i32x4(b)),
// Broadcast the bytes in the XMM register with PSHUFD.
def!(d = x86_pshufd(c, uimm8_zero)),
// No instruction emitted; pretend this is an X16x8 again.
def!(y = raw_bitcast_i32x4_to_any16x8(d)),
],
);
}
// SIMD splat: 32-bits
for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 32) {
let splat_any32x4 = splat.bind(vector(ty, sse_vector_size));
narrow.legalize(
def!(y = splat_any32x4(x)),
vec![
// Translate to an x86 MOV to get the value in an XMM register.
def!(a = scalar_to_vector(x)),
// Broadcast the bytes in the XMM register with PSHUFD.
def!(y = x86_pshufd(a, uimm8_zero)),
],
);
}
// SIMD splat: 64-bits
for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 64) {
let splat_any64x2 = splat.bind(vector(ty, sse_vector_size));
narrow.legalize(
def!(y = splat_any64x2(x)),
vec![
// Move into the lowest 64 bits of an XMM register.
def!(a = scalar_to_vector(x)),
// Move into the highest 64 bits of the same XMM register.
def!(y = insertlane(a, x, uimm8_one)),
],
);
}
// SIMD swizzle; the following inefficient implementation is due to the Wasm SIMD spec requiring
// mask indexes greater than 15 to have the same semantics as a 0 index. For the spec discussion,
// see https://github.com/WebAssembly/simd/issues/93.
{
let swizzle = swizzle.bind(vector(I8, sse_vector_size));
narrow.legalize(
def!(a = swizzle(x, y)),
vec![
def!(b = vconst(u128_seventies)),
def!(c = uadd_sat(y, b)),
def!(a = x86_pshufb(x, c)),
],
);
}
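// Why the 0x70 trick works (illustrative, not original code): a valid lane
// index (< 16) stays below 0x80 after the saturating add, while any index
// >= 16 lands in 0x80..=0xff, so PSHUFB's high-bit rule forces those lanes
// to zero, matching the Wasm spec's handling of out-of-range indices.
fn swizzle_mask_byte(i: u8) -> u8 {
    i.saturating_add(0x70)
}
// swizzle_mask_byte(15) == 0x7f (still selects lane 15);
// swizzle_mask_byte(16) == 0x80 (high bit set, lane becomes zero).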
// SIMD bnot
for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
let bnot = bnot.bind(vector(ty, sse_vector_size));
narrow.legalize(
def!(y = bnot(x)),
vec![def!(a = vconst(u128_ones)), def!(y = bxor(a, x))],
);
}
// SIMD shift right (arithmetic, i16x8 and i32x4)
for ty in &[I16, I32] {
let sshr = sshr.bind(vector(*ty, sse_vector_size));
let bitcast_i64x2 = bitcast.bind(vector(I64, sse_vector_size));
narrow.legalize(
def!(a = sshr(x, y)),
vec![def!(b = bitcast_i64x2(y)), def!(a = x86_psra(x, b))],
);
}
// SIMD shift right (arithmetic, i8x16)
{
let sshr = sshr.bind(vector(I8, sse_vector_size));
let bitcast_i64x2 = bitcast.bind(vector(I64, sse_vector_size));
let raw_bitcast_i16x8 = raw_bitcast.bind(vector(I16, sse_vector_size));
let raw_bitcast_i16x8_again = raw_bitcast.bind(vector(I16, sse_vector_size));
narrow.legalize(
def!(z = sshr(x, y)),
vec![
// Since we will use the high byte of each 16x8 lane, shift an extra 8 bits.
def!(a = iadd_imm(y, uimm8_eight)),
def!(b = bitcast_i64x2(a)),
// Take the low 8 bytes of x, duplicate them in 16x8 lanes, then shift right.
def!(c = x86_punpckl(x, x)),
def!(d = raw_bitcast_i16x8(c)),
def!(e = x86_psra(d, b)),
// Take the high 8 bytes of x, duplicate them in 16x8 lanes, then shift right.
def!(f = x86_punpckh(x, x)),
def!(g = raw_bitcast_i16x8_again(f)),
def!(h = x86_psra(g, b)),
// Re-pack the vector.
def!(z = snarrow(e, h)),
],
);
}
// SIMD shift right (arithmetic, i64x2)
{
let sshr_vector = sshr.bind(vector(I64, sse_vector_size));
let sshr_scalar_lane0 = sshr.bind(I64);
let sshr_scalar_lane1 = sshr.bind(I64);
narrow.legalize(
def!(z = sshr_vector(x, y)),
vec![
// Use scalar operations to shift the first lane.
def!(a = extractlane(x, uimm8_zero)),
def!(b = sshr_scalar_lane0(a, y)),
def!(c = insertlane(x, b, uimm8_zero)),
// Do the same for the second lane.
def!(d = extractlane(x, uimm8_one)),
def!(e = sshr_scalar_lane1(d, y)),
def!(z = insertlane(c, e, uimm8_one)),
],
);
}
// SIMD select
for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
let bitselect = bitselect.bind(vector(ty, sse_vector_size)); // must bind both x/y and c
narrow.legalize(
def!(d = bitselect(c, x, y)),
vec![
def!(a = band(x, c)),
def!(b = band_not(y, c)),
def!(d = bor(a, b)),
],
);
}
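// Lane-independent model of the rewrite (illustrative, not original code):
// take bits of x where the mask c is 1 and bits of y where it is 0.
fn bitselect_model(c: u64, x: u64, y: u64) -> u64 {
    (x & c) | (y & !c)
}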
// SIMD vselect; replace with bitselect if BLEND* instructions are not available.
// This works because each lane of a boolean vector is filled with all zeroes or all ones.
for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
let vselect = vselect.bind(vector(ty, sse_vector_size));
let raw_bitcast = raw_bitcast.bind(vector(ty, sse_vector_size));
narrow.legalize(
def!(d = vselect(c, x, y)),
vec![def!(a = raw_bitcast(c)), def!(d = bitselect(a, x, y))],
);
}
// SIMD vany_true
let ne = Literal::enumerator_for(&imm.intcc, "ne");
for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
let vany_true = vany_true.bind(vector(ty, sse_vector_size));
narrow.legalize(
def!(y = vany_true(x)),
vec![def!(a = x86_ptest(x, x)), def!(y = trueif(ne, a))],
);
}
// SIMD vall_true
let eq = Literal::enumerator_for(&imm.intcc, "eq");
for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
let vall_true = vall_true.bind(vector(ty, sse_vector_size));
if ty.is_int() {
// In the common case (Wasm's integer-only all_true), we do not require a
// bitcast.
narrow.legalize(
def!(y = vall_true(x)),
vec![
def!(a = vconst(u128_zeroes)),
def!(c = icmp(eq, x, a)),
def!(d = x86_ptest(c, c)),
def!(y = trueif(eq, d)),
],
);
} else {
// However, to support other types we must bitcast them to an integer vector to
// use icmp.
let lane_type_as_int = LaneType::int_from_bits(ty.lane_bits() as u16);
let raw_bitcast_to_int = raw_bitcast.bind(vector(lane_type_as_int, sse_vector_size));
narrow.legalize(
def!(y = vall_true(x)),
vec![
def!(a = vconst(u128_zeroes)),
def!(b = raw_bitcast_to_int(x)),
def!(c = icmp(eq, b, a)),
def!(d = x86_ptest(c, c)),
def!(y = trueif(eq, d)),
],
);
}
}
// SIMD icmp ne
let ne = Literal::enumerator_for(&imm.intcc, "ne");
for ty in ValueType::all_lane_types().filter(|ty| allowed_simd_type(ty) && ty.is_int()) {
let icmp_ = icmp.bind(vector(ty, sse_vector_size));
narrow.legalize(
def!(c = icmp_(ne, a, b)),
vec![def!(x = icmp(eq, a, b)), def!(c = bnot(x))],
);
}
// SIMD icmp greater-/less-than
let sgt = Literal::enumerator_for(&imm.intcc, "sgt");
let ugt = Literal::enumerator_for(&imm.intcc, "ugt");
let sge = Literal::enumerator_for(&imm.intcc, "sge");
let uge = Literal::enumerator_for(&imm.intcc, "uge");
let slt = Literal::enumerator_for(&imm.intcc, "slt");
let ult = Literal::enumerator_for(&imm.intcc, "ult");
let sle = Literal::enumerator_for(&imm.intcc, "sle");
let ule = Literal::enumerator_for(&imm.intcc, "ule");
for ty in &[I8, I16, I32] {
// greater-than
let icmp_ = icmp.bind(vector(*ty, sse_vector_size));
narrow.legalize(
def!(c = icmp_(ugt, a, b)),
vec![
def!(x = x86_pmaxu(a, b)),
def!(y = icmp(eq, x, b)),
def!(c = bnot(y)),
],
);
let icmp_ = icmp.bind(vector(*ty, sse_vector_size));
narrow.legalize(
def!(c = icmp_(sge, a, b)),
vec![def!(x = x86_pmins(a, b)), def!(c = icmp(eq, x, b))],
);
let icmp_ = icmp.bind(vector(*ty, sse_vector_size));
narrow.legalize(
def!(c = icmp_(uge, a, b)),
vec![def!(x = x86_pminu(a, b)), def!(c = icmp(eq, x, b))],
);
// less-than
let icmp_ = icmp.bind(vector(*ty, sse_vector_size));
narrow.legalize(def!(c = icmp_(slt, a, b)), vec![def!(c = icmp(sgt, b, a))]);
let icmp_ = icmp.bind(vector(*ty, sse_vector_size));
narrow.legalize(def!(c = icmp_(ult, a, b)), vec![def!(c = icmp(ugt, b, a))]);
let icmp_ = icmp.bind(vector(*ty, sse_vector_size));
narrow.legalize(def!(c = icmp_(sle, a, b)), vec![def!(c = icmp(sge, b, a))]);
let icmp_ = icmp.bind(vector(*ty, sse_vector_size));
narrow.legalize(def!(c = icmp_(ule, a, b)), vec![def!(c = icmp(uge, b, a))]);
}
// SIMD integer min/max
for ty in &[I8, I16, I32] {
let imin = imin.bind(vector(*ty, sse_vector_size));
narrow.legalize(def!(c = imin(a, b)), vec![def!(c = x86_pmins(a, b))]);
let umin = umin.bind(vector(*ty, sse_vector_size));
narrow.legalize(def!(c = umin(a, b)), vec![def!(c = x86_pminu(a, b))]);
let imax = imax.bind(vector(*ty, sse_vector_size));
narrow.legalize(def!(c = imax(a, b)), vec![def!(c = x86_pmaxs(a, b))]);
let umax = umax.bind(vector(*ty, sse_vector_size));
narrow.legalize(def!(c = umax(a, b)), vec![def!(c = x86_pmaxu(a, b))]);
}
// SIMD fcmp greater-/less-than
let gt = Literal::enumerator_for(&imm.floatcc, "gt");
let lt = Literal::enumerator_for(&imm.floatcc, "lt");
let ge = Literal::enumerator_for(&imm.floatcc, "ge");
let le = Literal::enumerator_for(&imm.floatcc, "le");
let ugt = Literal::enumerator_for(&imm.floatcc, "ugt");
let ult = Literal::enumerator_for(&imm.floatcc, "ult");
let uge = Literal::enumerator_for(&imm.floatcc, "uge");
let ule = Literal::enumerator_for(&imm.floatcc, "ule");
for ty in &[F32, F64] {
let fcmp_ = fcmp.bind(vector(*ty, sse_vector_size));
narrow.legalize(def!(c = fcmp_(gt, a, b)), vec![def!(c = fcmp(lt, b, a))]);
let fcmp_ = fcmp.bind(vector(*ty, sse_vector_size));
narrow.legalize(def!(c = fcmp_(ge, a, b)), vec![def!(c = fcmp(le, b, a))]);
let fcmp_ = fcmp.bind(vector(*ty, sse_vector_size));
narrow.legalize(def!(c = fcmp_(ult, a, b)), vec![def!(c = fcmp(ugt, b, a))]);
let fcmp_ = fcmp.bind(vector(*ty, sse_vector_size));
narrow.legalize(def!(c = fcmp_(ule, a, b)), vec![def!(c = fcmp(uge, b, a))]);
}
for ty in &[F32, F64] {
let fneg = fneg.bind(vector(*ty, sse_vector_size));
let lane_type_as_int = LaneType::int_from_bits(LaneType::from(*ty).lane_bits() as u16);
let uimm8_shift = Literal::constant(&imm.uimm8, lane_type_as_int.lane_bits() as i64 - 1);
let vconst = vconst.bind(vector(lane_type_as_int, sse_vector_size));
let bitcast_to_float = raw_bitcast.bind(vector(*ty, sse_vector_size));
narrow.legalize(
def!(b = fneg(a)),
vec![
def!(c = vconst(u128_ones)),
def!(d = ishl_imm(c, uimm8_shift)), // Create a mask of all 0s except the MSB.
def!(e = bitcast_to_float(d)), // Cast mask to the floating-point type.
def!(b = bxor(a, e)), // Flip the MSB.
],
);
}
// SIMD fabs
for ty in &[F32, F64] {
let fabs = fabs.bind(vector(*ty, sse_vector_size));
let lane_type_as_int = LaneType::int_from_bits(LaneType::from(*ty).lane_bits() as u16);
let vconst = vconst.bind(vector(lane_type_as_int, sse_vector_size));
let bitcast_to_float = raw_bitcast.bind(vector(*ty, sse_vector_size));
narrow.legalize(
def!(b = fabs(a)),
vec![
def!(c = vconst(u128_ones)),
def!(d = ushr_imm(c, uimm8_one)), // Create a mask of all 1s except the MSB.
def!(e = bitcast_to_float(d)), // Cast mask to the floating-point type.
def!(b = band(a, e)), // Unset the MSB.
],
);
}
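// Scalar bit-level model of the two mask-based rewrites above (illustrative,
// not original code): all-ones shifted left by 31 isolates the sign bit,
// all-ones shifted right by 1 keeps everything but the sign bit.
fn fneg_model(a: f32) -> f32 {
    f32::from_bits(a.to_bits() ^ 0x8000_0000) // flip the sign bit
}
fn fabs_model(a: f32) -> f32 {
    f32::from_bits(a.to_bits() & 0x7fff_ffff) // clear the sign bit
}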
// SIMD widen
for ty in &[I8, I16] {
let swiden_high = swiden_high.bind(vector(*ty, sse_vector_size));
narrow.legalize(
def!(b = swiden_high(a)),
vec![
def!(c = x86_palignr(a, a, uimm8_eight)),
def!(b = swiden_low(c)),
],
);
let uwiden_high = uwiden_high.bind(vector(*ty, sse_vector_size));
narrow.legalize(
def!(b = uwiden_high(a)),
vec![
def!(c = x86_palignr(a, a, uimm8_eight)),
def!(b = uwiden_low(c)),
],
);
}
narrow.custom_legalize(shuffle, "convert_shuffle");
narrow.custom_legalize(extractlane, "convert_extractlane");
narrow.custom_legalize(insertlane, "convert_insertlane");
narrow.custom_legalize(ineg, "convert_ineg");
narrow.custom_legalize(ushr, "convert_ushr");
narrow.custom_legalize(ishl, "convert_ishl");
narrow.custom_legalize(fcvt_to_sint_sat, "expand_fcvt_to_sint_sat_vector");
narrow.custom_legalize(fmin, "expand_minmax_vector");
narrow.custom_legalize(fmax, "expand_minmax_vector");
narrow_avx.custom_legalize(imul, "convert_i64x2_imul");
narrow_avx.custom_legalize(fcvt_from_uint, "expand_fcvt_from_uint_vector");
narrow_avx.custom_legalize(fcvt_to_uint_sat, "expand_fcvt_to_uint_sat_vector");
}


@@ -1,87 +1,25 @@
use crate::cdsl::cpu_modes::CpuMode;
use crate::cdsl::instructions::{InstructionGroupBuilder, InstructionPredicateMap};
use crate::cdsl::isa::TargetIsa;
use crate::cdsl::types::{ReferenceType, VectorType};
use crate::cdsl::recipes::Recipes;
use crate::cdsl::regs::IsaRegsBuilder;
use crate::shared::types::Bool::B1;
use crate::shared::types::Float::{F32, F64};
use crate::shared::types::Int::{I16, I32, I64, I8};
use crate::shared::types::Reference::{R32, R64};
use crate::shared::Definitions as SharedDefinitions;
mod encodings;
mod instructions;
mod legalize;
mod opcodes;
mod recipes;
mod registers;
pub(crate) mod settings;
pub(crate) fn define(shared_defs: &mut SharedDefinitions) -> TargetIsa {
let settings = settings::define(&shared_defs.settings);
let regs = registers::define();
let inst_group = instructions::define(
&mut shared_defs.all_instructions,
&shared_defs.formats,
&shared_defs.imm,
&shared_defs.entities,
);
legalize::define(shared_defs, &inst_group);
let inst_group = InstructionGroupBuilder::new(&mut shared_defs.all_instructions).build();
// CPU modes for 32-bit and 64-bit operations.
let mut x86_64 = CpuMode::new("I64");
let mut x86_32 = CpuMode::new("I32");
let expand_flags = shared_defs.transform_groups.by_name("expand_flags");
let x86_widen = shared_defs.transform_groups.by_name("x86_widen");
let x86_narrow = shared_defs.transform_groups.by_name("x86_narrow");
let x86_narrow_avx = shared_defs.transform_groups.by_name("x86_narrow_avx");
let x86_expand = shared_defs.transform_groups.by_name("x86_expand");
x86_32.legalize_monomorphic(expand_flags);
x86_32.legalize_default(x86_narrow);
x86_32.legalize_type(B1, expand_flags);
x86_32.legalize_type(I8, x86_widen);
x86_32.legalize_type(I16, x86_widen);
x86_32.legalize_type(I32, x86_expand);
x86_32.legalize_value_type(ReferenceType(R32), x86_expand);
x86_32.legalize_type(F32, x86_expand);
x86_32.legalize_type(F64, x86_expand);
x86_32.legalize_value_type(VectorType::new(I32.into(), 4), x86_narrow_avx);
x86_32.legalize_value_type(VectorType::new(I64.into(), 2), x86_narrow_avx);
x86_32.legalize_value_type(VectorType::new(F32.into(), 4), x86_narrow_avx);
x86_64.legalize_monomorphic(expand_flags);
x86_64.legalize_default(x86_narrow);
x86_64.legalize_type(B1, expand_flags);
x86_64.legalize_type(I8, x86_widen);
x86_64.legalize_type(I16, x86_widen);
x86_64.legalize_type(I32, x86_expand);
x86_64.legalize_type(I64, x86_expand);
x86_64.legalize_value_type(ReferenceType(R64), x86_expand);
x86_64.legalize_type(F32, x86_expand);
x86_64.legalize_type(F64, x86_expand);
x86_64.legalize_value_type(VectorType::new(I32.into(), 4), x86_narrow_avx);
x86_64.legalize_value_type(VectorType::new(I64.into(), 2), x86_narrow_avx);
x86_64.legalize_value_type(VectorType::new(F32.into(), 4), x86_narrow_avx);
let recipes = recipes::define(shared_defs, &settings, &regs);
let encodings = encodings::define(shared_defs, &settings, &inst_group, &recipes);
x86_32.set_encodings(encodings.enc32);
x86_64.set_encodings(encodings.enc64);
let encodings_predicates = encodings.inst_pred_reg.extract();
let recipes = encodings.recipes;
-let cpu_modes = vec![x86_64, x86_32];
+let cpu_modes = vec![];
TargetIsa::new(
"x86",
settings,
-regs,
-recipes,
+IsaRegsBuilder::new().build(),
+Recipes::new(),
cpu_modes,
-encodings_predicates,
+InstructionPredicateMap::new(),
)
}


@@ -1,721 +0,0 @@
//! Static, named definitions of instruction opcodes.
/// Empty opcode for use as a default.
pub static EMPTY: [u8; 0] = [];
/// Add with carry flag r{16,32,64} to r/m of the same size.
pub static ADC: [u8; 1] = [0x11];
/// Add r{16,32,64} to r/m of the same size.
pub static ADD: [u8; 1] = [0x01];
/// Add imm{16,32} to r/m{16,32,64}, possibly sign-extended.
pub static ADD_IMM: [u8; 1] = [0x81];
/// Add sign-extended imm8 to r/m{16,32,64}.
pub static ADD_IMM8_SIGN_EXTEND: [u8; 1] = [0x83];
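// Illustrative sketch (not part of the original file) of how a one-byte
// opcode such as ADD combines with a REX.W prefix and a register-direct
// ModRM byte; `encode_add_r64` and its register numbering are assumptions.
// E.g. `add rax, rcx` (dst = 0, src = 1) encodes as 48 01 c8.
fn encode_add_r64(dst: u8, src: u8) -> [u8; 3] {
    debug_assert!(dst < 8 && src < 8); // extended registers need REX.R/B bits
    let modrm = 0b1100_0000 | (src << 3) | dst; // mod = 11: register-direct
    [0x48, ADD[0], modrm]
}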
/// Add packed double-precision floating-point values from xmm2/mem to xmm1 and store result in
/// xmm1 (SSE2).
pub static ADDPD: [u8; 3] = [0x66, 0x0f, 0x58];
/// Add packed single-precision floating-point values from xmm2/mem to xmm1 and store result in
/// xmm1 (SSE).
pub static ADDPS: [u8; 2] = [0x0f, 0x58];
/// Add the low double-precision floating-point value from xmm2/mem to xmm1
/// and store the result in xmm1.
pub static ADDSD: [u8; 3] = [0xf2, 0x0f, 0x58];
/// Add the low single-precision floating-point value from xmm2/mem to xmm1
/// and store the result in xmm1.
pub static ADDSS: [u8; 3] = [0xf3, 0x0f, 0x58];
/// r/m{16,32,64} AND register of the same size (Intel docs have a typo).
pub static AND: [u8; 1] = [0x21];
/// imm{16,32} AND r/m{16,32,64}, possibly sign-extended.
pub static AND_IMM: [u8; 1] = [0x81];
/// r/m{16,32,64} AND sign-extended imm8.
pub static AND_IMM8_SIGN_EXTEND: [u8; 1] = [0x83];
/// Return the bitwise logical AND NOT of packed single-precision floating-point
/// values in xmm1 and xmm2/mem.
pub static ANDNPS: [u8; 2] = [0x0f, 0x55];
/// Return the bitwise logical AND of packed single-precision floating-point values
/// in xmm1 and xmm2/mem.
pub static ANDPS: [u8; 2] = [0x0f, 0x54];
/// Bit scan forward (stores index of first encountered 1 from the front).
pub static BIT_SCAN_FORWARD: [u8; 2] = [0x0f, 0xbc];
/// Bit scan reverse (stores index of first encountered 1 from the back).
pub static BIT_SCAN_REVERSE: [u8; 2] = [0x0f, 0xbd];
/// Select packed single-precision floating-point values from xmm1 and xmm2/m128
/// from mask specified in XMM0 and store the values into xmm1 (SSE4.1).
pub static BLENDVPS: [u8; 4] = [0x66, 0x0f, 0x38, 0x14];
/// Select packed double-precision floating-point values from xmm1 and xmm2/m128
/// from mask specified in XMM0 and store the values into xmm1 (SSE4.1).
pub static BLENDVPD: [u8; 4] = [0x66, 0x0f, 0x38, 0x15];
/// Call near, relative, displacement relative to next instruction (sign-extended).
pub static CALL_RELATIVE: [u8; 1] = [0xe8];
/// Move r/m{16,32,64} if overflow (OF=1).
pub static CMOV_OVERFLOW: [u8; 2] = [0x0f, 0x40];
/// Compare imm{16,32} with r/m{16,32,64} (sign-extended if 64).
pub static CMP_IMM: [u8; 1] = [0x81];
/// Compare imm8 with r/m{16,32,64}.
pub static CMP_IMM8: [u8; 1] = [0x83];
/// Compare r{16,32,64} with r/m of the same size.
pub static CMP_REG: [u8; 1] = [0x39];
/// Compare packed double-precision floating-point value in xmm2/m32 and xmm1 using bits 2:0 of
/// imm8 as comparison predicate (SSE2).
pub static CMPPD: [u8; 3] = [0x66, 0x0f, 0xc2];
/// Compare packed single-precision floating-point value in xmm2/m32 and xmm1 using bits 2:0 of
/// imm8 as comparison predicate (SSE).
pub static CMPPS: [u8; 2] = [0x0f, 0xc2];
/// Convert four packed signed doubleword integers from xmm2/mem to four packed single-precision
/// floating-point values in xmm1 (SSE2).
pub static CVTDQ2PS: [u8; 2] = [0x0f, 0x5b];
/// Convert scalar double-precision floating-point value to scalar single-precision
/// floating-point value.
pub static CVTSD2SS: [u8; 3] = [0xf2, 0x0f, 0x5a];
/// Convert doubleword integer to scalar double-precision floating-point value.
pub static CVTSI2SD: [u8; 3] = [0xf2, 0x0f, 0x2a];
/// Convert doubleword integer to scalar single-precision floating-point value.
pub static CVTSI2SS: [u8; 3] = [0xf3, 0x0f, 0x2a];
/// Convert scalar single-precision floating-point value to scalar double-precision
/// float-point value.
pub static CVTSS2SD: [u8; 3] = [0xf3, 0x0f, 0x5a];
/// Convert four packed single-precision floating-point values from xmm2/mem to four packed signed
/// doubleword values in xmm1 using truncation (SSE2).
pub static CVTTPS2DQ: [u8; 3] = [0xf3, 0x0f, 0x5b];
/// Convert with truncation scalar double-precision floating-point value to signed
/// integer.
pub static CVTTSD2SI: [u8; 3] = [0xf2, 0x0f, 0x2c];
/// Convert with truncation scalar single-precision floating-point value to integer.
pub static CVTTSS2SI: [u8; 3] = [0xf3, 0x0f, 0x2c];
/// Unsigned divide for {16,32,64}-bit.
pub static DIV: [u8; 1] = [0xf7];
/// Divide packed double-precision floating-point values in xmm1 by packed double-precision
/// floating-point values in xmm2/mem (SSE2).
pub static DIVPD: [u8; 3] = [0x66, 0x0f, 0x5e];
/// Divide packed single-precision floating-point values in xmm1 by packed single-precision
/// floating-point values in xmm2/mem (SSE).
pub static DIVPS: [u8; 2] = [0x0f, 0x5e];
/// Divide low double-precision floating-point value in xmm1 by low double-precision
/// floating-point value in xmm2/m64.
pub static DIVSD: [u8; 3] = [0xf2, 0x0f, 0x5e];
/// Divide low single-precision floating-point value in xmm1 by low single-precision
/// floating-point value in xmm2/m32.
pub static DIVSS: [u8; 3] = [0xf3, 0x0f, 0x5e];
/// Signed divide for {16,32,64}-bit.
pub static IDIV: [u8; 1] = [0xf7];
/// Signed multiply for {16,32,64}-bit, generic registers.
pub static IMUL: [u8; 2] = [0x0f, 0xaf];
/// Signed multiply for {16,32,64}-bit, storing into RDX:RAX.
pub static IMUL_RDX_RAX: [u8; 1] = [0xf7];
/// Insert scalar single-precision floating-point value.
pub static INSERTPS: [u8; 4] = [0x66, 0x0f, 0x3a, 0x21];
/// Either:
/// 1. Jump near, absolute indirect, RIP = 64-bit offset from register or memory.
/// 2. Jump far, absolute indirect, address given in m16:64.
pub static JUMP_ABSOLUTE: [u8; 1] = [0xff];
/// Jump near, relative, RIP = RIP + 32-bit displacement sign extended to 64 bits.
pub static JUMP_NEAR_RELATIVE: [u8; 1] = [0xe9];
/// Jump near (rel32) if overflow (OF=1).
pub static JUMP_NEAR_IF_OVERFLOW: [u8; 2] = [0x0f, 0x80];
/// Jump short, relative, RIP = RIP + 8-bit displacement sign extended to 64 bits.
pub static JUMP_SHORT: [u8; 1] = [0xeb];
/// Jump short (rel8) if equal (ZF=1).
pub static JUMP_SHORT_IF_EQUAL: [u8; 1] = [0x74];
/// Jump short (rel8) if not equal (ZF=0).
pub static JUMP_SHORT_IF_NOT_EQUAL: [u8; 1] = [0x75];
/// Jump short (rel8) if overflow (OF=1).
pub static JUMP_SHORT_IF_OVERFLOW: [u8; 1] = [0x70];
/// Store effective address for m in register r{16,32,64}.
pub static LEA: [u8; 1] = [0x8d];
/// Count the number of leading zero bits.
pub static LZCNT: [u8; 3] = [0xf3, 0x0f, 0xbd];
/// Return the maximum packed double-precision floating-point values between xmm1 and xmm2/m128
/// (SSE2).
pub static MAXPD: [u8; 3] = [0x66, 0x0f, 0x5f];
/// Return the maximum packed single-precision floating-point values between xmm1 and xmm2/m128
/// (SSE).
pub static MAXPS: [u8; 2] = [0x0f, 0x5f];
/// Return the maximum scalar double-precision floating-point value between
/// xmm2/m64 and xmm1.
pub static MAXSD: [u8; 3] = [0xf2, 0x0f, 0x5f];
/// Return the maximum scalar single-precision floating-point value between
/// xmm2/m32 and xmm1.
pub static MAXSS: [u8; 3] = [0xf3, 0x0f, 0x5f];
/// Return the minimum packed double-precision floating-point values between xmm1 and xmm2/m128
/// (SSE2).
pub static MINPD: [u8; 3] = [0x66, 0x0f, 0x5d];
/// Return the minimum packed single-precision floating-point values between xmm1 and xmm2/m128
/// (SSE).
pub static MINPS: [u8; 2] = [0x0f, 0x5d];
/// Return the minimum scalar double-precision floating-point value between
/// xmm2/m64 and xmm1.
pub static MINSD: [u8; 3] = [0xf2, 0x0f, 0x5d];
/// Return the minimum scalar single-precision floating-point value between
/// xmm2/m32 and xmm1.
pub static MINSS: [u8; 3] = [0xf3, 0x0f, 0x5d];
/// Move r8 to r/m8.
pub static MOV_BYTE_STORE: [u8; 1] = [0x88];
/// Move imm{16,32,64} to same-sized register.
pub static MOV_IMM: [u8; 1] = [0xb8];
/// Move imm{16,32} to r{16,32,64}, sign-extended if 64-bit target.
pub static MOV_IMM_SIGNEXTEND: [u8; 1] = [0xc7];
/// Move {r/m16, r/m32, r/m64} to same-sized register.
pub static MOV_LOAD: [u8; 1] = [0x8b];
/// Move r16 to r/m16.
pub static MOV_STORE_16: [u8; 2] = [0x66, 0x89];
/// Move {r16, r32, r64} to same-sized register or memory.
pub static MOV_STORE: [u8; 1] = [0x89];
/// Move aligned packed single-precision floating-point values from x/m to xmm (SSE).
pub static MOVAPS_LOAD: [u8; 2] = [0x0f, 0x28];
/// Move doubleword from r/m32 to xmm (SSE2). Quadword with REX prefix.
pub static MOVD_LOAD_XMM: [u8; 3] = [0x66, 0x0f, 0x6e];
/// Move doubleword from xmm to r/m32 (SSE2). Quadword with REX prefix.
pub static MOVD_STORE_XMM: [u8; 3] = [0x66, 0x0f, 0x7e];
/// Move packed single-precision floating-point values low to high (SSE).
pub static MOVLHPS: [u8; 2] = [0x0f, 0x16];
/// Move scalar double-precision floating-point value (from reg/mem to reg).
pub static MOVSD_LOAD: [u8; 3] = [0xf2, 0x0f, 0x10];
/// Move scalar double-precision floating-point value (from reg to reg/mem).
pub static MOVSD_STORE: [u8; 3] = [0xf2, 0x0f, 0x11];
/// Move scalar single-precision floating-point value (from reg to reg/mem).
pub static MOVSS_STORE: [u8; 3] = [0xf3, 0x0f, 0x11];
/// Move scalar single-precision floating-point value (from reg/mem to reg).
pub static MOVSS_LOAD: [u8; 3] = [0xf3, 0x0f, 0x10];
/// Move byte to register with sign-extension.
pub static MOVSX_BYTE: [u8; 2] = [0x0f, 0xbe];
/// Move word to register with sign-extension.
pub static MOVSX_WORD: [u8; 2] = [0x0f, 0xbf];
/// Move doubleword to register with sign-extension.
pub static MOVSXD: [u8; 1] = [0x63];
/// Move unaligned packed single-precision floating-point from x/m to xmm (SSE).
pub static MOVUPS_LOAD: [u8; 2] = [0x0f, 0x10];
/// Move unaligned packed single-precision floating-point value from xmm to x/m (SSE).
pub static MOVUPS_STORE: [u8; 2] = [0x0f, 0x11];
/// Move byte to register with zero-extension.
pub static MOVZX_BYTE: [u8; 2] = [0x0f, 0xb6];
/// Move word to register with zero-extension.
pub static MOVZX_WORD: [u8; 2] = [0x0f, 0xb7];
/// Unsigned multiply for {16,32,64}-bit, storing into RDX:RAX.
pub static MUL: [u8; 1] = [0xf7];
/// Multiply packed double-precision floating-point values from xmm2/mem to xmm1 and store result
/// in xmm1 (SSE2).
pub static MULPD: [u8; 3] = [0x66, 0x0f, 0x59];
/// Multiply packed single-precision floating-point values from xmm2/mem to xmm1 and store result
/// in xmm1 (SSE).
pub static MULPS: [u8; 2] = [0x0f, 0x59];
/// Multiply the low double-precision floating-point value in xmm2/m64 by the
/// low double-precision floating-point value in xmm1.
pub static MULSD: [u8; 3] = [0xf2, 0x0f, 0x59];
/// Multiply the low single-precision floating-point value in xmm2/m32 by the
/// low single-precision floating-point value in xmm1.
pub static MULSS: [u8; 3] = [0xf3, 0x0f, 0x59];
/// Reverse each bit of r/m{16,32,64}.
pub static NOT: [u8; 1] = [0xf7];
/// r/m{16,32,64} OR register of the same size.
pub static OR: [u8; 1] = [0x09];
/// imm{16,32} OR r/m{16,32,64}, possibly sign-extended.
pub static OR_IMM: [u8; 1] = [0x81];
/// r/m{16,32,64} OR sign-extended imm8.
pub static OR_IMM8_SIGN_EXTEND: [u8; 1] = [0x83];
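// Illustrative sketch, not part of the original file: 0x81 and 0x83 head
// another opcode group (OR is digit /1); an encoder would normally prefer
// the shorter 0x83 form whenever the immediate round-trips through sign
// extension from 8 bits:
fn or_imm_form(imm: i32) -> (u8, usize) {
    if i8::try_from(imm).is_ok() {
        (0x83, 1) // one imm8 byte, sign-extended by the CPU
    } else {
        (0x81, 4) // full imm32
    }
}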
/// Return the bitwise logical OR of packed single-precision values in xmm and x/m (SSE).
pub static ORPS: [u8; 2] = [0x0f, 0x56];
/// Compute the absolute value of bytes in xmm2/m128 and store the unsigned result in xmm1 (SSSE3).
pub static PABSB: [u8; 4] = [0x66, 0x0f, 0x38, 0x1c];
/// Compute the absolute value of 32-bit integers in xmm2/m128 and store the unsigned result in
/// xmm1 (SSSE3).
pub static PABSD: [u8; 4] = [0x66, 0x0f, 0x38, 0x1e];
/// Compute the absolute value of 16-bit integers in xmm2/m128 and store the unsigned result in
/// xmm1 (SSSE3).
pub static PABSW: [u8; 4] = [0x66, 0x0f, 0x38, 0x1d];
/// Converts 8 packed signed word integers from xmm1 and from xmm2/m128 into 16 packed signed byte
/// integers in xmm1 using signed saturation (SSE2).
pub static PACKSSWB: [u8; 3] = [0x66, 0x0f, 0x63];
/// Converts 4 packed signed doubleword integers from xmm1 and from xmm2/m128 into 8 packed signed
/// word integers in xmm1 using signed saturation (SSE2).
pub static PACKSSDW: [u8; 3] = [0x66, 0x0f, 0x6b];
/// Converts 8 packed signed word integers from xmm1 and from xmm2/m128 into 16 packed unsigned byte
/// integers in xmm1 using unsigned saturation (SSE2).
pub static PACKUSWB: [u8; 3] = [0x66, 0x0f, 0x67];
/// Converts 4 packed signed doubleword integers from xmm1 and from xmm2/m128 into 8 packed unsigned
/// word integers in xmm1 using unsigned saturation (SSE4.1).
pub static PACKUSDW: [u8; 4] = [0x66, 0x0f, 0x38, 0x2b];
/// Add packed byte integers from xmm2/m128 and xmm1 (SSE2).
pub static PADDB: [u8; 3] = [0x66, 0x0f, 0xfc];
/// Add packed doubleword integers from xmm2/m128 and xmm1 (SSE2).
pub static PADDD: [u8; 3] = [0x66, 0x0f, 0xfe];
/// Add packed quadword integers from xmm2/m128 and xmm1 (SSE2).
pub static PADDQ: [u8; 3] = [0x66, 0x0f, 0xd4];
/// Add packed word integers from xmm2/m128 and xmm1 (SSE2).
pub static PADDW: [u8; 3] = [0x66, 0x0f, 0xfd];
/// Add packed signed byte integers from xmm2/m128 and xmm1 and saturate the results (SSE2).
pub static PADDSB: [u8; 3] = [0x66, 0x0f, 0xec];
/// Add packed signed word integers from xmm2/m128 and xmm1 and saturate the results (SSE2).
pub static PADDSW: [u8; 3] = [0x66, 0x0f, 0xed];
/// Add packed unsigned byte integers from xmm2/m128 and xmm1 and saturate the results (SSE2).
pub static PADDUSB: [u8; 3] = [0x66, 0x0f, 0xdc];
/// Add packed unsigned word integers from xmm2/m128 and xmm1 and saturate the results (SSE2).
pub static PADDUSW: [u8; 3] = [0x66, 0x0f, 0xdd];
/// Concatenate destination and source operands, extract a byte-aligned result into xmm1 that is
/// shifted to the right by the constant number of bytes in imm8 (SSSE3).
pub static PALIGNR: [u8; 4] = [0x66, 0x0f, 0x3a, 0x0f];
/// Bitwise AND of xmm2/m128 and xmm1 (SSE2).
pub static PAND: [u8; 3] = [0x66, 0x0f, 0xdb];
/// Bitwise AND NOT of xmm2/m128 and xmm1 (SSE2).
pub static PANDN: [u8; 3] = [0x66, 0x0f, 0xdf];
/// Average packed unsigned byte integers from xmm2/m128 and xmm1 with rounding (SSE2).
pub static PAVGB: [u8; 3] = [0x66, 0x0f, 0xe0];
/// Average packed unsigned word integers from xmm2/m128 and xmm1 with rounding (SSE2).
pub static PAVGW: [u8; 3] = [0x66, 0x0f, 0xe3];
/// Select byte values from xmm1 and xmm2/m128 from mask specified in the high bit of each byte
/// in XMM0 and store the values into xmm1 (SSE4.1).
pub static PBLENDVB: [u8; 4] = [0x66, 0x0f, 0x38, 0x10];
/// Select words from xmm1 and xmm2/m128 from mask specified in imm8 and store the values into xmm1
/// (SSE4.1).
pub static PBLENDW: [u8; 4] = [0x66, 0x0f, 0x3a, 0x0e];
/// Compare packed data for equal (SSE2).
pub static PCMPEQB: [u8; 3] = [0x66, 0x0f, 0x74];
/// Compare packed data for equal (SSE2).
pub static PCMPEQD: [u8; 3] = [0x66, 0x0f, 0x76];
/// Compare packed data for equal (SSE4.1).
pub static PCMPEQQ: [u8; 4] = [0x66, 0x0f, 0x38, 0x29];
/// Compare packed data for equal (SSE2).
pub static PCMPEQW: [u8; 3] = [0x66, 0x0f, 0x75];
/// Compare packed signed byte integers for greater than (SSE2).
pub static PCMPGTB: [u8; 3] = [0x66, 0x0f, 0x64];
/// Compare packed signed doubleword integers for greater than (SSE2).
pub static PCMPGTD: [u8; 3] = [0x66, 0x0f, 0x66];
/// Compare packed signed quadword integers for greater than (SSE4.2).
pub static PCMPGTQ: [u8; 4] = [0x66, 0x0f, 0x38, 0x37];
/// Compare packed signed word integers for greater than (SSE2).
pub static PCMPGTW: [u8; 3] = [0x66, 0x0f, 0x65];
/// Extract doubleword or quadword, depending on REX.W (SSE4.1).
pub static PEXTR: [u8; 4] = [0x66, 0x0f, 0x3a, 0x16];
/// Extract byte (SSE4.1).
pub static PEXTRB: [u8; 4] = [0x66, 0x0f, 0x3a, 0x14];
/// Extract word (SSE4.1). A 3-byte SSE2 variant (0f c5) also exists, but it can only extract to a
/// register, not to m16.
pub static PEXTRW: [u8; 4] = [0x66, 0x0f, 0x3a, 0x15];
/// Insert doubleword or quadword, depending on REX.W (SSE4.1).
pub static PINSR: [u8; 4] = [0x66, 0x0f, 0x3a, 0x22];
/// Insert byte (SSE4.1).
pub static PINSRB: [u8; 4] = [0x66, 0x0f, 0x3a, 0x20];
/// Insert word (SSE2).
pub static PINSRW: [u8; 3] = [0x66, 0x0f, 0xc4];
/// Compare packed signed byte integers in xmm1 and xmm2/m128 and store packed maximum values in
/// xmm1 (SSE4.1).
pub static PMAXSB: [u8; 4] = [0x66, 0x0f, 0x38, 0x3c];
/// Compare packed signed doubleword integers in xmm1 and xmm2/m128 and store packed maximum
/// values in xmm1 (SSE4.1).
pub static PMAXSD: [u8; 4] = [0x66, 0x0f, 0x38, 0x3d];
/// Compare packed signed word integers in xmm1 and xmm2/m128 and store packed maximum values in
/// xmm1 (SSE2).
pub static PMAXSW: [u8; 3] = [0x66, 0x0f, 0xee];
/// Compare packed unsigned byte integers in xmm1 and xmm2/m128 and store packed maximum values in
/// xmm1 (SSE2).
pub static PMAXUB: [u8; 3] = [0x66, 0x0f, 0xde];
/// Compare packed unsigned doubleword integers in xmm1 and xmm2/m128 and store packed maximum
/// values in xmm1 (SSE4.1).
pub static PMAXUD: [u8; 4] = [0x66, 0x0f, 0x38, 0x3f];
/// Compare packed unsigned word integers in xmm1 and xmm2/m128 and store packed maximum values in
/// xmm1 (SSE4.1).
pub static PMAXUW: [u8; 4] = [0x66, 0x0f, 0x38, 0x3e];
/// Compare packed signed byte integers in xmm1 and xmm2/m128 and store packed minimum values in
/// xmm1 (SSE4.1).
pub static PMINSB: [u8; 4] = [0x66, 0x0f, 0x38, 0x38];
/// Compare packed signed doubleword integers in xmm1 and xmm2/m128 and store packed minimum
/// values in xmm1 (SSE4.1).
pub static PMINSD: [u8; 4] = [0x66, 0x0f, 0x38, 0x39];
/// Compare packed signed word integers in xmm1 and xmm2/m128 and store packed minimum values in
/// xmm1 (SSE2).
pub static PMINSW: [u8; 3] = [0x66, 0x0f, 0xea];
/// Compare packed unsigned byte integers in xmm1 and xmm2/m128 and store packed minimum values in
/// xmm1 (SSE2).
pub static PMINUB: [u8; 3] = [0x66, 0x0f, 0xda];
/// Compare packed unsigned doubleword integers in xmm1 and xmm2/m128 and store packed minimum
/// values in xmm1 (SSE4.1).
pub static PMINUD: [u8; 4] = [0x66, 0x0f, 0x38, 0x3b];
/// Compare packed unsigned word integers in xmm1 and xmm2/m128 and store packed minimum values in
/// xmm1 (SSE4.1).
pub static PMINUW: [u8; 4] = [0x66, 0x0f, 0x38, 0x3a];
/// Sign extend 8 packed 8-bit integers in the low 8 bytes of xmm2/m64 to 8 packed 16-bit
/// integers in xmm1 (SSE4.1).
pub static PMOVSXBW: [u8; 4] = [0x66, 0x0f, 0x38, 0x20];
/// Sign extend 4 packed 16-bit integers in the low 8 bytes of xmm2/m64 to 4 packed 32-bit
/// integers in xmm1 (SSE4.1).
pub static PMOVSXWD: [u8; 4] = [0x66, 0x0f, 0x38, 0x23];
/// Sign extend 2 packed 32-bit integers in the low 8 bytes of xmm2/m64 to 2 packed 64-bit
/// integers in xmm1 (SSE4.1).
pub static PMOVSXDQ: [u8; 4] = [0x66, 0x0f, 0x38, 0x25];
/// Zero extend 8 packed 8-bit integers in the low 8 bytes of xmm2/m64 to 8 packed 16-bit
/// integers in xmm1 (SSE4.1).
pub static PMOVZXBW: [u8; 4] = [0x66, 0x0f, 0x38, 0x30];
/// Zero extend 4 packed 16-bit integers in the low 8 bytes of xmm2/m64 to 4 packed 32-bit
/// integers in xmm1 (SSE4.1).
pub static PMOVZXWD: [u8; 4] = [0x66, 0x0f, 0x38, 0x33];
/// Zero extend 2 packed 32-bit integers in the low 8 bytes of xmm2/m64 to 2 packed 64-bit
/// integers in xmm1 (SSE4.1).
pub static PMOVZXDQ: [u8; 4] = [0x66, 0x0f, 0x38, 0x35];
/// Multiply the packed signed word integers in xmm1 and xmm2/m128, and store the low 16 bits of
/// the results in xmm1 (SSE2).
pub static PMULLW: [u8; 3] = [0x66, 0x0f, 0xd5];
/// Multiply the packed doubleword signed integers in xmm1 and xmm2/m128 and store the low 32
/// bits of each product in xmm1 (SSE4.1).
pub static PMULLD: [u8; 4] = [0x66, 0x0f, 0x38, 0x40];
/// Multiply the packed quadword signed integers in xmm2 and xmm3/m128 and store the low 64
/// bits of each product in xmm1 (AVX512VL/DQ). Requires an EVEX encoding.
pub static VPMULLQ: [u8; 4] = [0x66, 0x0f, 0x38, 0x40];
/// Multiply packed unsigned doubleword integers in xmm1 by packed unsigned doubleword integers
/// in xmm2/m128, and store the quadword results in xmm1 (SSE2).
pub static PMULUDQ: [u8; 3] = [0x66, 0x0f, 0xf4];
/// Multiply the packed signed word integers in xmm1 and xmm2/m128, and add the adjacent
/// doubleword results (SSE2).
pub static PMADDWD: [u8; 3] = [0x66, 0x0f, 0xf5];
/// Pop top of stack into r{16,32,64}; increment stack pointer.
pub static POP_REG: [u8; 1] = [0x58];
/// Return the count of bits set to 1.
pub static POPCNT: [u8; 3] = [0xf3, 0x0f, 0xb8];
/// Bitwise OR of xmm2/m128 and xmm1 (SSE2).
pub static POR: [u8; 3] = [0x66, 0x0f, 0xeb];
/// Shuffle bytes in xmm1 according to contents of xmm2/m128 (SSSE3).
pub static PSHUFB: [u8; 4] = [0x66, 0x0f, 0x38, 0x00];
/// Shuffle the doublewords in xmm2/m128 based on the encoding in imm8 and
/// store the result in xmm1 (SSE2).
pub static PSHUFD: [u8; 3] = [0x66, 0x0f, 0x70];
/// Shift words in xmm1 by imm8; the direction and sign-bit behavior are controlled by the RRR
/// digit used in the ModR/M byte (SSE2).
pub static PS_W_IMM: [u8; 3] = [0x66, 0x0f, 0x71];
/// Shift doublewords in xmm1 by imm8; the direction and sign-bit behavior are controlled by the RRR
/// digit used in the ModR/M byte (SSE2).
pub static PS_D_IMM: [u8; 3] = [0x66, 0x0f, 0x72];
/// Shift quadwords in xmm1 by imm8; the direction and sign-bit behavior are controlled by the RRR
/// digit used in the ModR/M byte (SSE2).
pub static PS_Q_IMM: [u8; 3] = [0x66, 0x0f, 0x73];
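// Illustrative sketch, not part of the original file: for the three
// immediate-shift groups above, the ModR/M reg digit selects the operation:
// /2 = logical right (PSRL*), /4 = arithmetic right (PSRA*), /6 = logical
// left (PSLL*); 0x73 also offers /3 (PSRLDQ) and /7 (PSLLDQ) for
// whole-register byte shifts:
enum SimdShift {
    LeftLogical,
    RightLogical,
    RightArithmetic,
}
fn simd_shift_digit(kind: SimdShift) -> u8 {
    match kind {
        SimdShift::RightLogical => 2,
        SimdShift::RightArithmetic => 4,
        SimdShift::LeftLogical => 6,
    }
}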
/// Shift words in xmm1 left by xmm2/m128 while shifting in 0s (SSE2).
pub static PSLLW: [u8; 3] = [0x66, 0x0f, 0xf1];
/// Shift doublewords in xmm1 left by xmm2/m128 while shifting in 0s (SSE2).
pub static PSLLD: [u8; 3] = [0x66, 0x0f, 0xf2];
/// Shift quadwords in xmm1 left by xmm2/m128 while shifting in 0s (SSE2).
pub static PSLLQ: [u8; 3] = [0x66, 0x0f, 0xf3];
/// Shift words in xmm1 right by xmm2/m128 while shifting in 0s (SSE2).
pub static PSRLW: [u8; 3] = [0x66, 0x0f, 0xd1];
/// Shift doublewords in xmm1 right by xmm2/m128 while shifting in 0s (SSE2).
pub static PSRLD: [u8; 3] = [0x66, 0x0f, 0xd2];
/// Shift quadwords in xmm1 right by xmm2/m128 while shifting in 0s (SSE2).
pub static PSRLQ: [u8; 3] = [0x66, 0x0f, 0xd3];
/// Shift words in xmm1 right by xmm2/m128 while shifting in sign bits (SSE2).
pub static PSRAW: [u8; 3] = [0x66, 0x0f, 0xe1];
/// Shift doublewords in xmm1 right by xmm2/m128 while shifting in sign bits (SSE2).
pub static PSRAD: [u8; 3] = [0x66, 0x0f, 0xe2];
/// Subtract packed byte integers in xmm2/m128 from packed byte integers in xmm1 (SSE2).
pub static PSUBB: [u8; 3] = [0x66, 0x0f, 0xf8];
/// Subtract packed word integers in xmm2/m128 from packed word integers in xmm1 (SSE2).
pub static PSUBW: [u8; 3] = [0x66, 0x0f, 0xf9];
/// Subtract packed doubleword integers in xmm2/m128 from packed doubleword integers in xmm1 (SSE2).
pub static PSUBD: [u8; 3] = [0x66, 0x0f, 0xfa];
/// Subtract packed quadword integers in xmm2/m128 from xmm1 (SSE2).
pub static PSUBQ: [u8; 3] = [0x66, 0x0f, 0xfb];
/// Subtract packed signed byte integers in xmm2/m128 from packed signed byte integers in xmm1
/// and saturate results (SSE2).
pub static PSUBSB: [u8; 3] = [0x66, 0x0f, 0xe8];
/// Subtract packed signed word integers in xmm2/m128 from packed signed word integers in xmm1
/// and saturate results (SSE2).
pub static PSUBSW: [u8; 3] = [0x66, 0x0f, 0xe9];
/// Subtract packed unsigned byte integers in xmm2/m128 from packed unsigned byte integers in xmm1
/// and saturate results (SSE2).
pub static PSUBUSB: [u8; 3] = [0x66, 0x0f, 0xd8];
/// Subtract packed unsigned word integers in xmm2/m128 from packed unsigned word integers in xmm1
/// and saturate results (SSE2).
pub static PSUBUSW: [u8; 3] = [0x66, 0x0f, 0xd9];
/// Set ZF if xmm2/m128 AND xmm1 result is all 0s; set CF if xmm2/m128 AND NOT xmm1 result is all
/// 0s (SSE4.1).
pub static PTEST: [u8; 4] = [0x66, 0x0f, 0x38, 0x17];
/// Unpack and interleave high-order bytes from xmm1 and xmm2/m128 into xmm1 (SSE2).
pub static PUNPCKHBW: [u8; 3] = [0x66, 0x0f, 0x68];
/// Unpack and interleave high-order words from xmm1 and xmm2/m128 into xmm1 (SSE2).
pub static PUNPCKHWD: [u8; 3] = [0x66, 0x0f, 0x69];
/// Unpack and interleave high-order doublewords from xmm1 and xmm2/m128 into xmm1 (SSE2).
pub static PUNPCKHDQ: [u8; 3] = [0x66, 0x0f, 0x6a];
/// Unpack and interleave high-order quadwords from xmm1 and xmm2/m128 into xmm1 (SSE2).
pub static PUNPCKHQDQ: [u8; 3] = [0x66, 0x0f, 0x6d];
/// Unpack and interleave low-order bytes from xmm1 and xmm2/m128 into xmm1 (SSE2).
pub static PUNPCKLBW: [u8; 3] = [0x66, 0x0f, 0x60];
/// Unpack and interleave low-order words from xmm1 and xmm2/m128 into xmm1 (SSE2).
pub static PUNPCKLWD: [u8; 3] = [0x66, 0x0f, 0x61];
/// Unpack and interleave low-order doublewords from xmm1 and xmm2/m128 into xmm1 (SSE2).
pub static PUNPCKLDQ: [u8; 3] = [0x66, 0x0f, 0x62];
/// Unpack and interleave low-order quadwords from xmm1 and xmm2/m128 into xmm1 (SSE2).
pub static PUNPCKLQDQ: [u8; 3] = [0x66, 0x0f, 0x6c];
/// Push r{16,32,64}.
pub static PUSH_REG: [u8; 1] = [0x50];
/// Bitwise logical exclusive OR of xmm2/m128 and xmm1 (SSE2).
pub static PXOR: [u8; 3] = [0x66, 0x0f, 0xef];
/// Near return to calling procedure.
pub static RET_NEAR: [u8; 1] = [0xc3];
/// General rotate group opcode, with the count in CL; the kind of rotation is selected by the
/// ModR/M reg digit.
pub static ROTATE_CL: [u8; 1] = [0xd3];
/// General rotate group opcode, with the count in imm8; the kind of rotation is selected by the
/// ModR/M reg digit.
pub static ROTATE_IMM8: [u8; 1] = [0xc1];
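// Illustrative sketch, not part of the original file: 0xd3 (count in CL) and
// 0xc1 (count in imm8) head the "group 2" rotate/shift family; the ModR/M
// reg digit picks the operation: /0 = ROL, /1 = ROR, /2 = RCL, /3 = RCR
// (the same group also encodes SHL /4, SHR /5, and SAR /7):
fn rotate_digit(right: bool, through_carry: bool) -> u8 {
    match (through_carry, right) {
        (false, false) => 0, // ROL
        (false, true) => 1,  // ROR
        (true, false) => 2,  // RCL
        (true, true) => 3,   // RCR
    }
}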
/// Round scalar double-precision floating-point values.
pub static ROUNDSD: [u8; 4] = [0x66, 0x0f, 0x3a, 0x0b];
/// Round scalar single-precision floating-point values.
pub static ROUNDSS: [u8; 4] = [0x66, 0x0f, 0x3a, 0x0a];
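// Illustrative sketch, not part of the original file: the ROUNDS{S,D} imm8
// controls the operation: bits 1:0 select the rounding mode (00 = nearest,
// 01 = floor, 10 = ceil, 11 = truncate), setting bit 2 defers to MXCSR.RC
// instead, and bit 3 suppresses precision exceptions:
enum RoundMode {
    Nearest,
    Floor,
    Ceil,
    Trunc,
}
fn round_imm8(mode: RoundMode) -> u8 {
    match mode {
        RoundMode::Nearest => 0b00,
        RoundMode::Floor => 0b01,
        RoundMode::Ceil => 0b10,
        RoundMode::Trunc => 0b11,
    }
}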
/// Subtract with borrow r{16,32,64} from r/m of the same size.
pub static SBB: [u8; 1] = [0x19];
/// Set byte if overflow (OF=1).
pub static SET_BYTE_IF_OVERFLOW: [u8; 2] = [0x0f, 0x90];
/// Compute the square root of the packed double-precision floating-point values and store the
/// result in xmm1 (SSE2).
pub static SQRTPD: [u8; 3] = [0x66, 0x0f, 0x51];
/// Compute the square root of the packed single-precision floating-point values and store the
/// result in xmm1 (SSE).
pub static SQRTPS: [u8; 2] = [0x0f, 0x51];
/// Compute square root of scalar double-precision floating-point value.
pub static SQRTSD: [u8; 3] = [0xf2, 0x0f, 0x51];
/// Compute square root of scalar single-precision floating-point value.
pub static SQRTSS: [u8; 3] = [0xf3, 0x0f, 0x51];
/// Subtract r{16,32,64} from r/m of same size.
pub static SUB: [u8; 1] = [0x29];
/// Subtract packed double-precision floating-point values in xmm2/mem from xmm1 and store result
/// in xmm1 (SSE2).
pub static SUBPD: [u8; 3] = [0x66, 0x0f, 0x5c];
/// Subtract packed single-precision floating-point values in xmm2/mem from xmm1 and store result
/// in xmm1 (SSE).
pub static SUBPS: [u8; 2] = [0x0f, 0x5c];
/// Subtract the low double-precision floating-point value in xmm2/m64 from xmm1
/// and store the result in xmm1.
pub static SUBSD: [u8; 3] = [0xf2, 0x0f, 0x5c];
/// Subtract the low single-precision floating-point value in xmm2/m32 from xmm1
/// and store the result in xmm1.
pub static SUBSS: [u8; 3] = [0xf3, 0x0f, 0x5c];
/// AND r8 with r/m8; set SF, ZF, PF according to result.
pub static TEST_BYTE_REG: [u8; 1] = [0x84];
/// AND {r16, r32, r64} with r/m of the same size; set SF, ZF, PF according to result.
pub static TEST_REG: [u8; 1] = [0x85];
/// Count the number of trailing zero bits.
pub static TZCNT: [u8; 3] = [0xf3, 0x0f, 0xbc];
/// Compare low double-precision floating-point values in xmm1 and xmm2/mem64
/// and set the EFLAGS flags accordingly.
pub static UCOMISD: [u8; 3] = [0x66, 0x0f, 0x2e];
/// Compare low single-precision floating-point values in xmm1 and xmm2/mem32
/// and set the EFLAGS flags accordingly.
pub static UCOMISS: [u8; 2] = [0x0f, 0x2e];
/// Raise an invalid-opcode exception (UD2).
pub static UNDEFINED2: [u8; 2] = [0x0f, 0x0b];
/// Convert four packed unsigned doubleword integers from xmm2/m128/m32bcst to packed
/// single-precision floating-point values in xmm1 with writemask k1. Rounding behavior
/// is controlled by MXCSR but can be overridden by EVEX.L'L in static rounding mode
/// (AVX512VL, AVX512F).
pub static VCVTUDQ2PS: [u8; 3] = [0xf2, 0x0f, 0x7a];
/// imm{16,32} XOR r/m{16,32,64}, possibly sign-extended.
pub static XOR_IMM: [u8; 1] = [0x81];
/// r/m{16,32,64} XOR sign-extended imm8.
pub static XOR_IMM8_SIGN_EXTEND: [u8; 1] = [0x83];
/// r/m{16,32,64} XOR register of the same size.
pub static XOR: [u8; 1] = [0x31];
/// Bitwise logical XOR of packed double-precision floating-point values (SSE2).
pub static XORPD: [u8; 3] = [0x66, 0x0f, 0x57];
/// Bitwise logical XOR of packed single-precision floating-point values (SSE).
pub static XORPS: [u8; 2] = [0x0f, 0x57];
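// Illustrative sketch, not part of the original file: XORPS of a register
// with itself is the idiomatic way to zero an XMM register and needs no
// prefix bytes; e.g. `xorps xmm0, xmm0` assembles to just 0f 57 c0:
fn encode_xorps_self(reg: u8) -> [u8; 3] {
    let rr = reg & 0b111;
    [0x0f, 0x57, 0b1100_0000 | (rr << 3) | rr]
}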

File diff suppressed because it is too large Load Diff

View File

@@ -1,43 +0,0 @@
use crate::cdsl::regs::{IsaRegs, IsaRegsBuilder, RegBankBuilder, RegClassBuilder};

pub(crate) fn define() -> IsaRegs {
    let mut regs = IsaRegsBuilder::new();

    // The 16 SSE registers, xmm0-xmm15.
    let builder = RegBankBuilder::new("FloatRegs", "xmm")
        .units(16)
        .track_pressure(true);
    let float_regs = regs.add_bank(builder);

    // The 16 general-purpose registers. Only the low eight have architectural
    // names; r8-r15 are referred to by index. r15 is the pinned register.
    let builder = RegBankBuilder::new("IntRegs", "r")
        .units(16)
        .names(vec!["rax", "rcx", "rdx", "rbx", "rsp", "rbp", "rsi", "rdi"])
        .track_pressure(true)
        .pinned_reg(15);
    let int_regs = regs.add_bank(builder);

    // A single-unit bank for the flags register; it is exempt from register
    // pressure tracking.
    let builder = RegBankBuilder::new("FlagRegs", "")
        .units(1)
        .names(vec!["rflags"])
        .track_pressure(false);
    let flag_reg = regs.add_bank(builder);

    let builder = RegClassBuilder::new_toplevel("GPR", int_regs);
    let gpr = regs.add_class(builder);

    let builder = RegClassBuilder::new_toplevel("FPR", float_regs);
    let fpr = regs.add_class(builder);

    let builder = RegClassBuilder::new_toplevel("FLAG", flag_reg);
    regs.add_class(builder);

    // Subclasses restricted to the first eight registers of each bank, for
    // encodings that cannot address all 16; ABCD further narrows GPR8 to
    // rax, rcx, rdx, rbx.
    let builder = RegClassBuilder::subclass_of("GPR8", gpr, 0, 8);
    let gpr8 = regs.add_class(builder);

    let builder = RegClassBuilder::subclass_of("ABCD", gpr8, 0, 4);
    regs.add_class(builder);

    let builder = RegClassBuilder::subclass_of("FPR8", fpr, 0, 8);
    regs.add_class(builder);

    regs.build()
}