Previously, in #2128, we factored out a common "vanilla 64-bit ABI" implementation from the AArch64 ABI code, with the idea that this should be largely compatible with x64. This PR alters the new x64 backend to make use of the shared infrastructure, removing the duplication that existed previously. The generated code is nearly (not exactly) the same; the only difference relates to how the clobber-save region is padded in the prologue. This also changes some register allocations in the aarch64 code because call support in the shared ABI infra now passes a temp vreg in, rather than requiring use of a fixed, non-allocable temp; tests have been updated, and the runtime behavior is unchanged.
//! Lowering rules for X64.

#![allow(non_snake_case)]

use crate::ir;
use crate::ir::{
    condcodes::FloatCC, condcodes::IntCC, types, AbiParam, ArgumentPurpose, ExternalName,
    Inst as IRInst, InstructionData, LibCall, Opcode, Signature, TrapCode, Type,
};
use crate::isa::x64::abi::*;
use crate::isa::x64::inst::args::*;
use crate::isa::x64::inst::*;
use crate::isa::{x64::X64Backend, CallConv};
use crate::machinst::lower::*;
use crate::machinst::*;
use crate::result::CodegenResult;
use crate::settings::Flags;
use alloc::boxed::Box;
use alloc::vec::Vec;
use cranelift_codegen_shared::condcodes::CondCode;
use log::trace;
use regalloc::{Reg, RegClass, Writable};
use smallvec::SmallVec;
use std::convert::TryFrom;
use target_lexicon::Triple;

/// Context passed to all lowering functions.
type Ctx<'a> = &'a mut dyn LowerCtx<I = Inst>;

//=============================================================================
// Helpers for instruction lowering.

fn is_int_ty(ty: Type) -> bool {
    match ty {
        types::I8 | types::I16 | types::I32 | types::I64 | types::R64 => true,
        types::R32 => panic!("shouldn't have 32-bit refs on x64"),
        _ => false,
    }
}
fn is_bool_ty(ty: Type) -> bool {
    match ty {
        types::B1 | types::B8 | types::B16 | types::B32 | types::B64 => true,
        types::R32 => panic!("shouldn't have 32-bit refs on x64"),
        _ => false,
    }
}

/// Returns whether the given type is valid for an atomic transaction. This is
/// target-word-size dependent, and excludes booleans and reftypes.
fn is_valid_atomic_transaction_ty(ty: Type) -> bool {
    match ty {
        types::I8 | types::I16 | types::I32 | types::I64 => true,
        _ => false,
    }
}
fn iri_to_u64_imm(ctx: Ctx, inst: IRInst) -> Option<u64> {
    ctx.get_constant(inst)
}

fn inst_trapcode(data: &InstructionData) -> Option<TrapCode> {
    match data {
        &InstructionData::Trap { code, .. }
        | &InstructionData::CondTrap { code, .. }
        | &InstructionData::IntCondTrap { code, .. }
        | &InstructionData::FloatCondTrap { code, .. } => Some(code),
        _ => None,
    }
}

fn inst_condcode(data: &InstructionData) -> IntCC {
    match data {
        &InstructionData::IntCond { cond, .. }
        | &InstructionData::BranchIcmp { cond, .. }
        | &InstructionData::IntCompare { cond, .. }
        | &InstructionData::IntCondTrap { cond, .. }
        | &InstructionData::BranchInt { cond, .. }
        | &InstructionData::IntSelect { cond, .. }
        | &InstructionData::IntCompareImm { cond, .. } => cond,
        _ => panic!("inst_condcode(x64): unhandled: {:?}", data),
    }
}

fn inst_fp_condcode(data: &InstructionData) -> FloatCC {
    match data {
        &InstructionData::BranchFloat { cond, .. }
        | &InstructionData::FloatCompare { cond, .. }
        | &InstructionData::FloatCond { cond, .. }
        | &InstructionData::FloatCondTrap { cond, .. } => cond,
        _ => panic!("inst_fp_condcode(x64): unhandled: {:?}", data),
    }
}

fn inst_atomic_rmw_op(data: &InstructionData) -> Option<ir::AtomicRmwOp> {
    match data {
        &InstructionData::AtomicRmw { op, .. } => Some(op),
        _ => None,
    }
}

fn ldst_offset(data: &InstructionData) -> Option<i32> {
    match data {
        &InstructionData::Load { offset, .. }
        | &InstructionData::StackLoad { offset, .. }
        | &InstructionData::LoadComplex { offset, .. }
        | &InstructionData::Store { offset, .. }
        | &InstructionData::StackStore { offset, .. }
        | &InstructionData::StoreComplex { offset, .. } => Some(offset.into()),
        _ => None,
    }
}

/// Identifier for a particular input of an instruction.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
struct InsnInput {
    insn: IRInst,
    input: usize,
}

/// Identifier for a particular output of an instruction.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
struct InsnOutput {
    insn: IRInst,
    output: usize,
}

/// Returns the instruction that produces the given `input`, if that instruction has Opcode `op`.
// TODO investigate failures with checking against the result index.
fn matches_input<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    input: InsnInput,
    op: Opcode,
) -> Option<IRInst> {
    let inputs = ctx.get_input(input.insn, input.input);
    inputs.inst.and_then(|(src_inst, _)| {
        let data = ctx.data(src_inst);
        if data.opcode() == op {
            return Some(src_inst);
        }
        None
    })
}

fn lowerinput_to_reg(ctx: Ctx, input: LowerInput) -> Reg {
    ctx.use_input_reg(input);
    input.reg
}

/// Put the given input into a register, and mark it as used (side-effect).
fn input_to_reg(ctx: Ctx, spec: InsnInput) -> Reg {
    let input = ctx.get_input(spec.insn, spec.input);
    lowerinput_to_reg(ctx, input)
}

/// An extension specification for `extend_input_to_reg`.
#[derive(Clone, Copy)]
enum ExtSpec {
    ZeroExtendTo32,
    ZeroExtendTo64,
    SignExtendTo32,
    SignExtendTo64,
}

/// Put the given input into a register, marking it as used, and do a zero- or sign-extension if
/// required. (This obviously causes side-effects.)
fn extend_input_to_reg(ctx: Ctx, spec: InsnInput, ext_spec: ExtSpec) -> Reg {
    let requested_size = match ext_spec {
        ExtSpec::ZeroExtendTo32 | ExtSpec::SignExtendTo32 => 32,
        ExtSpec::ZeroExtendTo64 | ExtSpec::SignExtendTo64 => 64,
    };
    let input_size = ctx.input_ty(spec.insn, spec.input).bits();

    let requested_ty = if requested_size == 32 {
        types::I32
    } else {
        types::I64
    };

    let ext_mode = match (input_size, requested_size) {
        (a, b) if a == b => return input_to_reg(ctx, spec),
        (1, 8) => return input_to_reg(ctx, spec),
        (a, 16) | (a, 32) if a == 1 || a == 8 => ExtMode::BL,
        (a, 64) if a == 1 || a == 8 => ExtMode::BQ,
        (16, 32) => ExtMode::WL,
        (16, 64) => ExtMode::WQ,
        (32, 64) => ExtMode::LQ,
        _ => unreachable!("extend {} -> {}", input_size, requested_size),
    };

    let src = input_to_reg_mem(ctx, spec);
    let dst = ctx.alloc_tmp(RegClass::I64, requested_ty);
    match ext_spec {
        ExtSpec::ZeroExtendTo32 | ExtSpec::ZeroExtendTo64 => {
            ctx.emit(Inst::movzx_rm_r(
                ext_mode, src, dst, /* infallible */ None,
            ))
        }
        ExtSpec::SignExtendTo32 | ExtSpec::SignExtendTo64 => {
            ctx.emit(Inst::movsx_rm_r(
                ext_mode, src, dst, /* infallible */ None,
            ))
        }
    }
    dst.to_reg()
}

fn lowerinput_to_reg_mem(ctx: Ctx, input: LowerInput) -> RegMem {
    // TODO handle memory.
    RegMem::reg(lowerinput_to_reg(ctx, input))
}

/// Put the given input into a register or a memory operand.
/// Effectful: may mark the given input as used, when returning the register form.
fn input_to_reg_mem(ctx: Ctx, spec: InsnInput) -> RegMem {
    let input = ctx.get_input(spec.insn, spec.input);
    lowerinput_to_reg_mem(ctx, input)
}

/// Returns the given input as an immediate, if it is one that can be properly sign-extended to
/// the instruction's width, without any possible side-effect.
fn lowerinput_to_sext_imm(input: LowerInput, input_ty: Type) -> Option<u32> {
    input.constant.and_then(|x| {
        // For i64 instructions (prefixed with REX.W), require that the immediate will sign-extend
        // to 64 bits. For other sizes, it doesn't matter and we can just use the plain
        // constant.
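        // (For example, an imm32 of 0x8000_0000 becomes 0xffff_ffff_8000_0000 under REX.W, so
        // it can only be used when that is the intended 64-bit value.)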
        if input_ty.bytes() != 8 || low32_will_sign_extend_to_64(x) {
            Some(x as u32)
        } else {
            None
        }
    })
}

fn input_to_sext_imm(ctx: Ctx, spec: InsnInput) -> Option<u32> {
    let input = ctx.get_input(spec.insn, spec.input);
    let input_ty = ctx.input_ty(spec.insn, spec.input);
    lowerinput_to_sext_imm(input, input_ty)
}

fn input_to_imm(ctx: Ctx, spec: InsnInput) -> Option<u64> {
    ctx.get_input(spec.insn, spec.input).constant
}

/// Put the given input into an immediate, a register or a memory operand.
/// Effectful: may mark the given input as used, when returning the register form.
fn input_to_reg_mem_imm(ctx: Ctx, spec: InsnInput) -> RegMemImm {
    let input = ctx.get_input(spec.insn, spec.input);
    let input_ty = ctx.input_ty(spec.insn, spec.input);
    match lowerinput_to_sext_imm(input, input_ty) {
        Some(x) => RegMemImm::imm(x),
        None => match lowerinput_to_reg_mem(ctx, input) {
            RegMem::Reg { reg } => RegMemImm::reg(reg),
            RegMem::Mem { addr } => RegMemImm::mem(addr),
        },
    }
}

fn output_to_reg(ctx: Ctx, spec: InsnOutput) -> Writable<Reg> {
    ctx.get_output(spec.insn, spec.output)
}

fn emit_cmp(ctx: Ctx, insn: IRInst) {
    let ty = ctx.input_ty(insn, 0);

    let inputs = [InsnInput { insn, input: 0 }, InsnInput { insn, input: 1 }];

    // TODO Try to commute the operands (and invert the condition) if one is an immediate.
    let lhs = input_to_reg(ctx, inputs[0]);
    let rhs = input_to_reg_mem_imm(ctx, inputs[1]);

    // Cranelift's icmp semantics want to compare lhs - rhs, while Intel gives
    // us dst - src at the machine instruction level, so invert operands.
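    // (Passing `rhs` as the machine instruction's source operand and `lhs` as its
    // destination register therefore computes lhs - rhs, as desired.)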
    ctx.emit(Inst::cmp_rmi_r(ty.bytes() as u8, rhs, lhs));
}

/// A specification for a fcmp emission.
enum FcmpSpec {
    /// Normal flow.
    Normal,

    /// Avoid emitting Equal at all costs by inverting it to NotEqual, and indicate when that
    /// happens with `InvertedEqualOrConditions`.
    ///
    /// This is useful in contexts where it is hard/inefficient to produce a single instruction (or
    /// sequence of instructions) that checks for an "AND" combination of condition codes; see for
    /// instance the lowering of Select.
    InvertEqual,
}

/// This explains how to interpret the results of an fcmp instruction.
enum FcmpCondResult {
    /// The given condition code must be set.
    Condition(CC),

    /// Both condition codes must be set.
    AndConditions(CC, CC),

    /// Either of the condition codes must be set.
    OrConditions(CC, CC),

    /// The associated spec was set to `FcmpSpec::InvertEqual` and Equal has been inverted. Either
    /// of the condition codes must be set, and the user must invert the meaning of the condition
    /// code results. When the spec is set to `FcmpSpec::Normal`, this case can't be reached.
    InvertedEqualOrConditions(CC, CC),
}

fn emit_fcmp(ctx: Ctx, insn: IRInst, mut cond_code: FloatCC, spec: FcmpSpec) -> FcmpCondResult {
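    // UCOMIS* reports "unordered" by setting CF (along with ZF and PF), so a "less than"-style
    // condition (which tests CF) cannot be checked directly without also accepting NaN inputs.
    // Flipping the operands lets us test the "greater than"-style conditions instead, whose
    // flag patterns are disjoint from the unordered case.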
    let (flip_operands, inverted_equal) = match cond_code {
        FloatCC::LessThan
        | FloatCC::LessThanOrEqual
        | FloatCC::UnorderedOrGreaterThan
        | FloatCC::UnorderedOrGreaterThanOrEqual => {
            cond_code = cond_code.reverse();
            (true, false)
        }
        FloatCC::Equal => {
            let inverted_equal = match spec {
                FcmpSpec::Normal => false,
                FcmpSpec::InvertEqual => {
                    cond_code = FloatCC::NotEqual; // same as .inverse()
                    true
                }
            };
            (false, inverted_equal)
        }
        _ => (false, false),
    };

    // The only valid CC constructed with `from_floatcc` can be put in the flag
    // register with a direct float comparison; do this here.
    let op = match ctx.input_ty(insn, 0) {
        types::F32 => SseOpcode::Ucomiss,
        types::F64 => SseOpcode::Ucomisd,
        _ => panic!("Bad input type to Fcmp"),
    };

    let inputs = &[InsnInput { insn, input: 0 }, InsnInput { insn, input: 1 }];
    let (lhs_input, rhs_input) = if flip_operands {
        (inputs[1], inputs[0])
    } else {
        (inputs[0], inputs[1])
    };
    let lhs = input_to_reg(ctx, lhs_input);
    let rhs = input_to_reg_mem(ctx, rhs_input);
    ctx.emit(Inst::xmm_cmp_rm_r(op, rhs, lhs));

    let cond_result = match cond_code {
        FloatCC::Equal => FcmpCondResult::AndConditions(CC::NP, CC::Z),
        FloatCC::NotEqual if inverted_equal => {
            FcmpCondResult::InvertedEqualOrConditions(CC::P, CC::NZ)
        }
        FloatCC::NotEqual if !inverted_equal => FcmpCondResult::OrConditions(CC::P, CC::NZ),
        _ => FcmpCondResult::Condition(CC::from_floatcc(cond_code)),
    };

    cond_result
}

fn make_libcall_sig(ctx: Ctx, insn: IRInst, call_conv: CallConv, ptr_ty: Type) -> Signature {
    let mut sig = Signature::new(call_conv);
    for i in 0..ctx.num_inputs(insn) {
        sig.params.push(AbiParam::new(ctx.input_ty(insn, i)));
    }
    for i in 0..ctx.num_outputs(insn) {
        sig.returns.push(AbiParam::new(ctx.output_ty(insn, i)));
    }
    if call_conv.extends_baldrdash() {
        // Adds the special VMContext parameter to the signature.
        sig.params
            .push(AbiParam::special(ptr_ty, ArgumentPurpose::VMContext));
    }
    sig
}

fn emit_vm_call<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    flags: &Flags,
    triple: &Triple,
    libcall: LibCall,
    insn: IRInst,
    inputs: SmallVec<[InsnInput; 4]>,
    outputs: SmallVec<[InsnOutput; 2]>,
) -> CodegenResult<()> {
    let extname = ExternalName::LibCall(libcall);

    let dist = if flags.use_colocated_libcalls() {
        RelocDistance::Near
    } else {
        RelocDistance::Far
    };

    // TODO avoid recreating signatures for every single Libcall function.
    let call_conv = CallConv::for_libcall(flags, CallConv::triple_default(triple));
    let sig = make_libcall_sig(ctx, insn, call_conv, types::I64);

    let loc = ctx.srcloc(insn);
    let mut abi = X64ABICaller::from_func(&sig, &extname, dist, loc)?;

    abi.emit_stack_pre_adjust(ctx);

    let vm_context = if call_conv.extends_baldrdash() { 1 } else { 0 };
    assert_eq!(inputs.len() + vm_context, abi.num_args());

    for (i, input) in inputs.iter().enumerate() {
        let arg_reg = input_to_reg(ctx, *input);
        abi.emit_copy_reg_to_arg(ctx, i, arg_reg);
    }
    if call_conv.extends_baldrdash() {
        let vm_context_vreg = ctx
            .get_vm_context()
            .expect("should have a VMContext to pass to libcall funcs");
        abi.emit_copy_reg_to_arg(ctx, inputs.len(), vm_context_vreg);
    }

    abi.emit_call(ctx);
    for (i, output) in outputs.iter().enumerate() {
        let retval_reg = output_to_reg(ctx, *output);
        abi.emit_copy_retval_to_reg(ctx, i, retval_reg);
    }
    abi.emit_stack_post_adjust(ctx);

    Ok(())
}

/// Returns whether the given input is a shift by a constant value less than or equal to 3.
/// The goal is to embed it within an address mode.
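/// (x86 addressing modes support an index scale of 1, 2, 4 or 8, i.e. a left shift of 0..=3.)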
fn matches_small_constant_shift<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    spec: InsnInput,
) -> Option<(InsnInput, u8)> {
    matches_input(ctx, spec, Opcode::Ishl).and_then(|shift| {
        match input_to_imm(
            ctx,
            InsnInput {
                insn: shift,
                input: 1,
            },
        ) {
            Some(shift_amt) if shift_amt <= 3 => Some((
                InsnInput {
                    insn: shift,
                    input: 0,
                },
                shift_amt as u8,
            )),
            _ => None,
        }
    })
}

fn lower_to_amode<C: LowerCtx<I = Inst>>(ctx: &mut C, spec: InsnInput, offset: u32) -> Amode {
    // We now either have an add that we must materialize, or some other input; as well as the
    // final offset.
    if let Some(add) = matches_input(ctx, spec, Opcode::Iadd) {
        let add_inputs = &[
            InsnInput {
                insn: add,
                input: 0,
            },
            InsnInput {
                insn: add,
                input: 1,
            },
        ];

        // TODO heap_addr legalization generates a uext64 *after* the shift, so these optimizations
        // aren't happening in the wasm case. We could do better, given some range analysis.
        let (base, index, shift) = if let Some((shift_input, shift_amt)) =
            matches_small_constant_shift(ctx, add_inputs[0])
        {
            (
                input_to_reg(ctx, add_inputs[1]),
                input_to_reg(ctx, shift_input),
                shift_amt,
            )
        } else if let Some((shift_input, shift_amt)) =
            matches_small_constant_shift(ctx, add_inputs[1])
        {
            (
                input_to_reg(ctx, add_inputs[0]),
                input_to_reg(ctx, shift_input),
                shift_amt,
            )
        } else {
            (
                input_to_reg(ctx, add_inputs[0]),
                input_to_reg(ctx, add_inputs[1]),
                0,
            )
        };

        return Amode::imm_reg_reg_shift(offset, base, index, shift);
    }

    let input = input_to_reg(ctx, spec);
    Amode::imm_reg(offset, input)
}
//=============================================================================
// Top-level instruction lowering entry point, for one instruction.

/// Actually codegen an instruction's results into registers.
fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    insn: IRInst,
    flags: &Flags,
    triple: &Triple,
) -> CodegenResult<()> {
    let op = ctx.data(insn).opcode();

    let inputs: SmallVec<[InsnInput; 4]> = (0..ctx.num_inputs(insn))
        .map(|i| InsnInput { insn, input: i })
        .collect();
    let outputs: SmallVec<[InsnOutput; 2]> = (0..ctx.num_outputs(insn))
        .map(|i| InsnOutput { insn, output: i })
        .collect();

    let ty = if outputs.len() > 0 {
        Some(ctx.output_ty(insn, 0))
    } else {
        None
    };

    match op {
        Opcode::Iconst | Opcode::Bconst | Opcode::Null => {
            if let Some(w64) = iri_to_u64_imm(ctx, insn) {
                let dst_is_64 = w64 > 0x7fffffff;
                let dst = output_to_reg(ctx, outputs[0]);
                ctx.emit(Inst::imm_r(dst_is_64, w64, dst));
            } else {
                unimplemented!();
            }
        }
        Opcode::Iadd
        | Opcode::IaddIfcout
        | Opcode::Isub
        | Opcode::Imul
        | Opcode::Band
        | Opcode::Bor
        | Opcode::Bxor => {
            let ty = ty.unwrap();
            if ty.lane_count() > 1 {
                let sse_op = match op {
                    Opcode::Iadd => match ty {
                        types::I8X16 => SseOpcode::Paddb,
                        types::I16X8 => SseOpcode::Paddw,
                        types::I32X4 => SseOpcode::Paddd,
                        types::I64X2 => SseOpcode::Paddq,
                        _ => panic!("Unsupported type for packed Iadd instruction"),
                    },
                    Opcode::Isub => match ty {
                        types::I8X16 => SseOpcode::Psubb,
                        types::I16X8 => SseOpcode::Psubw,
                        types::I32X4 => SseOpcode::Psubd,
                        types::I64X2 => SseOpcode::Psubq,
                        _ => panic!("Unsupported type for packed Isub instruction"),
                    },
                    Opcode::Imul => match ty {
                        types::I16X8 => SseOpcode::Pmullw,
                        types::I32X4 => SseOpcode::Pmulld,
                        _ => panic!("Unsupported type for packed Imul instruction"),
                    },
                    _ => panic!("Unsupported packed instruction"),
                };
                let lhs = input_to_reg(ctx, inputs[0]);
                let rhs = input_to_reg_mem(ctx, inputs[1]);
                let dst = output_to_reg(ctx, outputs[0]);

                // Move the `lhs` to the same register as `dst`.
                ctx.emit(Inst::gen_move(dst, lhs, ty));
                ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst));
            } else {
                let is_64 = ty == types::I64;
                let alu_op = match op {
                    Opcode::Iadd | Opcode::IaddIfcout => AluRmiROpcode::Add,
                    Opcode::Isub => AluRmiROpcode::Sub,
                    Opcode::Imul => AluRmiROpcode::Mul,
                    Opcode::Band => AluRmiROpcode::And,
                    Opcode::Bor => AluRmiROpcode::Or,
                    Opcode::Bxor => AluRmiROpcode::Xor,
                    _ => unreachable!(),
                };

                let (lhs, rhs) = match op {
                    Opcode::Iadd
                    | Opcode::IaddIfcout
                    | Opcode::Imul
                    | Opcode::Band
                    | Opcode::Bor
                    | Opcode::Bxor => {
                        // For commutative operations, try to commute operands if one is an
                        // immediate.
                        if let Some(imm) = input_to_sext_imm(ctx, inputs[0]) {
                            (input_to_reg(ctx, inputs[1]), RegMemImm::imm(imm))
                        } else {
                            (
                                input_to_reg(ctx, inputs[0]),
                                input_to_reg_mem_imm(ctx, inputs[1]),
                            )
                        }
                    }
                    Opcode::Isub => (
                        input_to_reg(ctx, inputs[0]),
                        input_to_reg_mem_imm(ctx, inputs[1]),
                    ),
                    _ => unreachable!(),
                };

                let dst = output_to_reg(ctx, outputs[0]);
                ctx.emit(Inst::mov_r_r(true, lhs, dst));
                ctx.emit(Inst::alu_rmi_r(is_64, alu_op, rhs, dst));
            }
        }

        Opcode::Bnot => {
            let ty = ty.unwrap();
            if ty.is_vector() {
                unimplemented!("vector bnot");
            } else if ty.is_bool() {
                unimplemented!("bool bnot")
            } else {
                let size = ty.bytes() as u8;
                let src = input_to_reg(ctx, inputs[0]);
                let dst = output_to_reg(ctx, outputs[0]);
                ctx.emit(Inst::gen_move(dst, src, ty));
                ctx.emit(Inst::not(size, dst));
            }
        }
        Opcode::Ishl | Opcode::Ushr | Opcode::Sshr | Opcode::Rotl | Opcode::Rotr => {
            let dst_ty = ctx.output_ty(insn, 0);
            debug_assert_eq!(ctx.input_ty(insn, 0), dst_ty);

            let (size, lhs) = match dst_ty {
                types::I8 | types::I16 => match op {
                    Opcode::Ishl => (4, input_to_reg(ctx, inputs[0])),
                    Opcode::Ushr => (
                        4,
                        extend_input_to_reg(ctx, inputs[0], ExtSpec::ZeroExtendTo32),
                    ),
                    Opcode::Sshr => (
                        4,
                        extend_input_to_reg(ctx, inputs[0], ExtSpec::SignExtendTo32),
                    ),
                    Opcode::Rotl | Opcode::Rotr => {
                        (dst_ty.bytes() as u8, input_to_reg(ctx, inputs[0]))
                    }
                    _ => unreachable!(),
                },
                types::I32 | types::I64 => (dst_ty.bytes() as u8, input_to_reg(ctx, inputs[0])),
                _ => unreachable!("unhandled output type for shift/rotates: {}", dst_ty),
            };

            let (count, rhs) = if let Some(cst) = ctx.get_input(insn, 1).constant {
                let cst = if op == Opcode::Rotl || op == Opcode::Rotr {
                    // Mask rotation count, according to Cranelift's semantics.
                    (cst as u8) & (dst_ty.bits() as u8 - 1)
                } else {
                    cst as u8
                };
                (Some(cst), None)
            } else {
                (None, Some(input_to_reg(ctx, inputs[1])))
            };

            let dst = output_to_reg(ctx, outputs[0]);

            let shift_kind = match op {
                Opcode::Ishl => ShiftKind::ShiftLeft,
                Opcode::Ushr => ShiftKind::ShiftRightLogical,
                Opcode::Sshr => ShiftKind::ShiftRightArithmetic,
                Opcode::Rotl => ShiftKind::RotateLeft,
                Opcode::Rotr => ShiftKind::RotateRight,
                _ => unreachable!(),
            };

            let w_rcx = Writable::from_reg(regs::rcx());
            ctx.emit(Inst::mov_r_r(true, lhs, dst));
            if count.is_none() {
                ctx.emit(Inst::mov_r_r(true, rhs.unwrap(), w_rcx));
            }
            ctx.emit(Inst::shift_r(size, shift_kind, count, dst));
        }
        Opcode::Ineg => {
            let dst = output_to_reg(ctx, outputs[0]);
            let ty = ty.unwrap();

            if ty.is_vector() {
                // Zeroes out a register and then does a packed subtraction
                // of the input from the register.

                let src = input_to_reg_mem(ctx, inputs[0]);
                let tmp = ctx.alloc_tmp(RegClass::V128, types::I32X4);

                let subtract_opcode = match ty {
                    types::I8X16 => SseOpcode::Psubb,
                    types::I16X8 => SseOpcode::Psubw,
                    types::I32X4 => SseOpcode::Psubd,
                    types::I64X2 => SseOpcode::Psubq,
                    _ => panic!("Unsupported type for Ineg instruction, found {}", ty),
                };

                // Note we must zero out a tmp instead of using the destination register since
                // the destination could be an alias for the source input register.
                ctx.emit(Inst::xmm_rm_r(
                    SseOpcode::Pxor,
                    RegMem::reg(tmp.to_reg()),
                    tmp,
                ));
                ctx.emit(Inst::xmm_rm_r(subtract_opcode, src, tmp));
                ctx.emit(Inst::xmm_unary_rm_r(
                    SseOpcode::Movapd,
                    RegMem::reg(tmp.to_reg()),
                    dst,
                ));
            } else {
                let size = ty.bytes() as u8;
                let src = input_to_reg(ctx, inputs[0]);
                ctx.emit(Inst::gen_move(dst, src, ty));
                ctx.emit(Inst::neg(size, dst));
            }
        }
        Opcode::Clz => {
            // TODO when the x86 flags have use_lzcnt, we can use LZCNT.

            // General formula using bit-scan reverse (BSR):
            // mov -1, %dst
            // bsr %src, %tmp
            // cmovz %dst, %tmp
            // mov $(size_bits - 1), %dst
            // sub %tmp, %dst
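            //
            // In other words, clz(x) = (size_bits - 1) - bsr(x). BSR sets ZF and leaves its
            // destination undefined for a zero input, so the cmovz substitutes -1 for bsr(0),
            // making the final subtraction yield size_bits, as Cranelift requires.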
            let (ext_spec, ty) = match ctx.input_ty(insn, 0) {
                types::I8 | types::I16 => (Some(ExtSpec::ZeroExtendTo32), types::I32),
                a if a == types::I32 || a == types::I64 => (None, a),
                _ => unreachable!(),
            };

            let src = if let Some(ext_spec) = ext_spec {
                RegMem::reg(extend_input_to_reg(ctx, inputs[0], ext_spec))
            } else {
                input_to_reg_mem(ctx, inputs[0])
            };
            let dst = output_to_reg(ctx, outputs[0]);

            let tmp = ctx.alloc_tmp(RegClass::I64, ty);
            ctx.emit(Inst::imm_r(ty == types::I64, u64::max_value(), dst));

            ctx.emit(Inst::unary_rm_r(
                ty.bytes() as u8,
                UnaryRmROpcode::Bsr,
                src,
                tmp,
            ));

            ctx.emit(Inst::cmove(
                ty.bytes() as u8,
                CC::Z,
                RegMem::reg(dst.to_reg()),
                tmp,
            ));

            ctx.emit(Inst::imm_r(ty == types::I64, ty.bits() as u64 - 1, dst));

            ctx.emit(Inst::alu_rmi_r(
                ty == types::I64,
                AluRmiROpcode::Sub,
                RegMemImm::reg(tmp.to_reg()),
                dst,
            ));
        }

        Opcode::Ctz => {
            // TODO when the x86 flags have use_bmi1, we can use TZCNT.

            // General formula using bit-scan forward (BSF):
            // bsf %src, %dst
            // mov $(size_bits), %tmp
            // cmovz %tmp, %dst
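            //
            // That is, ctz(x) = bsf(x); BSF sets ZF and leaves its destination undefined for a
            // zero input, so the cmovz substitutes size_bits in that case, as Cranelift requires.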
            let ty = ctx.input_ty(insn, 0);
            let ty = if ty.bits() < 32 { types::I32 } else { ty };
            debug_assert!(ty == types::I32 || ty == types::I64);

            let src = input_to_reg_mem(ctx, inputs[0]);
            let dst = output_to_reg(ctx, outputs[0]);

            let tmp = ctx.alloc_tmp(RegClass::I64, ty);
            ctx.emit(Inst::imm_r(false /* is_64 */, ty.bits() as u64, tmp));

            ctx.emit(Inst::unary_rm_r(
                ty.bytes() as u8,
                UnaryRmROpcode::Bsf,
                src,
                dst,
            ));

            ctx.emit(Inst::cmove(
                ty.bytes() as u8,
                CC::Z,
                RegMem::reg(tmp.to_reg()),
                dst,
            ));
        }

        Opcode::Popcnt => {
            // TODO when the x86 flags have use_popcnt, we can use the popcnt instruction.
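            //
            // Lacking POPCNT, we use the classic SWAR (SIMD-within-a-register) bit count:
            //   tmp2 = x - ((x >> 1) & m) - ((x >> 2) & m) - ((x >> 3) & m),  m = 0x7777...
            // leaves each nibble of tmp2 holding the popcount of the corresponding input
            // nibble. Adding tmp2 to (tmp2 >> 4) and masking with 0x0F0F... folds these into
            // per-byte counts, and multiplying by 0x0101... accumulates all byte counts into
            // the top byte, which the final shift extracts.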

            let (ext_spec, ty) = match ctx.input_ty(insn, 0) {
                types::I8 | types::I16 => (Some(ExtSpec::ZeroExtendTo32), types::I32),
                a if a == types::I32 || a == types::I64 => (None, a),
                _ => unreachable!(),
            };

            let src = if let Some(ext_spec) = ext_spec {
                RegMem::reg(extend_input_to_reg(ctx, inputs[0], ext_spec))
            } else {
                input_to_reg_mem(ctx, inputs[0])
            };
            let dst = output_to_reg(ctx, outputs[0]);

            if ty == types::I64 {
                let is_64 = true;

                let tmp1 = ctx.alloc_tmp(RegClass::I64, types::I64);
                let tmp2 = ctx.alloc_tmp(RegClass::I64, types::I64);
                let cst = ctx.alloc_tmp(RegClass::I64, types::I64);

                // mov src, tmp1
                ctx.emit(Inst::mov64_rm_r(src.clone(), tmp1, None));

                // shr $1, tmp1
                ctx.emit(Inst::shift_r(
                    8,
                    ShiftKind::ShiftRightLogical,
                    Some(1),
                    tmp1,
                ));

                // mov 0x7777_7777_7777_7777, cst
                ctx.emit(Inst::imm_r(is_64, 0x7777777777777777, cst));

                // andq cst, tmp1
                ctx.emit(Inst::alu_rmi_r(
                    is_64,
                    AluRmiROpcode::And,
                    RegMemImm::reg(cst.to_reg()),
                    tmp1,
                ));

                // mov src, tmp2
                ctx.emit(Inst::mov64_rm_r(src, tmp2, None));

                // sub tmp1, tmp2
                ctx.emit(Inst::alu_rmi_r(
                    is_64,
                    AluRmiROpcode::Sub,
                    RegMemImm::reg(tmp1.to_reg()),
                    tmp2,
                ));

                // shr $1, tmp1
                ctx.emit(Inst::shift_r(
                    8,
                    ShiftKind::ShiftRightLogical,
                    Some(1),
                    tmp1,
                ));

                // and cst, tmp1
                ctx.emit(Inst::alu_rmi_r(
                    is_64,
                    AluRmiROpcode::And,
                    RegMemImm::reg(cst.to_reg()),
                    tmp1,
                ));

                // sub tmp1, tmp2
                ctx.emit(Inst::alu_rmi_r(
                    is_64,
                    AluRmiROpcode::Sub,
                    RegMemImm::reg(tmp1.to_reg()),
                    tmp2,
                ));

                // shr $1, tmp1
                ctx.emit(Inst::shift_r(
                    8,
                    ShiftKind::ShiftRightLogical,
                    Some(1),
                    tmp1,
                ));

                // and cst, tmp1
                ctx.emit(Inst::alu_rmi_r(
                    is_64,
                    AluRmiROpcode::And,
                    RegMemImm::reg(cst.to_reg()),
                    tmp1,
                ));

                // sub tmp1, tmp2
                ctx.emit(Inst::alu_rmi_r(
                    is_64,
                    AluRmiROpcode::Sub,
                    RegMemImm::reg(tmp1.to_reg()),
                    tmp2,
                ));

                // mov tmp2, dst
                ctx.emit(Inst::mov64_rm_r(RegMem::reg(tmp2.to_reg()), dst, None));

                // shr $4, dst
                ctx.emit(Inst::shift_r(8, ShiftKind::ShiftRightLogical, Some(4), dst));

                // add tmp2, dst
                ctx.emit(Inst::alu_rmi_r(
                    is_64,
                    AluRmiROpcode::Add,
                    RegMemImm::reg(tmp2.to_reg()),
                    dst,
                ));

                // mov $0x0F0F_0F0F_0F0F_0F0F, cst
                ctx.emit(Inst::imm_r(is_64, 0x0F0F0F0F0F0F0F0F, cst));

                // and cst, dst
                ctx.emit(Inst::alu_rmi_r(
                    is_64,
                    AluRmiROpcode::And,
                    RegMemImm::reg(cst.to_reg()),
                    dst,
                ));

                // mov $0x0101_0101_0101_0101, cst
                ctx.emit(Inst::imm_r(is_64, 0x0101010101010101, cst));

                // mul cst, dst
                ctx.emit(Inst::alu_rmi_r(
                    is_64,
                    AluRmiROpcode::Mul,
                    RegMemImm::reg(cst.to_reg()),
                    dst,
                ));

                // shr $56, dst
                ctx.emit(Inst::shift_r(
                    8,
                    ShiftKind::ShiftRightLogical,
                    Some(56),
                    dst,
                ));
            } else {
                assert_eq!(ty, types::I32);
                let is_64 = false;

                let tmp1 = ctx.alloc_tmp(RegClass::I64, types::I64);
                let tmp2 = ctx.alloc_tmp(RegClass::I64, types::I64);

                // mov src, tmp1
                ctx.emit(Inst::mov64_rm_r(src.clone(), tmp1, None));

                // shr $1, tmp1
                ctx.emit(Inst::shift_r(
                    4,
                    ShiftKind::ShiftRightLogical,
                    Some(1),
                    tmp1,
                ));

                // andq $0x7777_7777, tmp1
                ctx.emit(Inst::alu_rmi_r(
                    is_64,
                    AluRmiROpcode::And,
                    RegMemImm::imm(0x77777777),
                    tmp1,
                ));

                // mov src, tmp2
                ctx.emit(Inst::mov64_rm_r(src, tmp2, None));

                // sub tmp1, tmp2
                ctx.emit(Inst::alu_rmi_r(
                    is_64,
                    AluRmiROpcode::Sub,
                    RegMemImm::reg(tmp1.to_reg()),
                    tmp2,
                ));

                // shr $1, tmp1
                ctx.emit(Inst::shift_r(
                    4,
                    ShiftKind::ShiftRightLogical,
                    Some(1),
                    tmp1,
                ));

                // and $0x7777_7777, tmp1
                ctx.emit(Inst::alu_rmi_r(
                    is_64,
                    AluRmiROpcode::And,
                    RegMemImm::imm(0x77777777),
                    tmp1,
                ));

                // sub tmp1, tmp2
                ctx.emit(Inst::alu_rmi_r(
                    is_64,
                    AluRmiROpcode::Sub,
                    RegMemImm::reg(tmp1.to_reg()),
                    tmp2,
                ));

                // shr $1, tmp1
                ctx.emit(Inst::shift_r(
                    4,
                    ShiftKind::ShiftRightLogical,
                    Some(1),
                    tmp1,
                ));

                // and $0x7777_7777, tmp1
                ctx.emit(Inst::alu_rmi_r(
                    is_64,
                    AluRmiROpcode::And,
                    RegMemImm::imm(0x77777777),
                    tmp1,
                ));

                // sub tmp1, tmp2
                ctx.emit(Inst::alu_rmi_r(
                    is_64,
                    AluRmiROpcode::Sub,
                    RegMemImm::reg(tmp1.to_reg()),
                    tmp2,
                ));

                // mov tmp2, dst
                ctx.emit(Inst::mov64_rm_r(RegMem::reg(tmp2.to_reg()), dst, None));

                // shr $4, dst
                ctx.emit(Inst::shift_r(4, ShiftKind::ShiftRightLogical, Some(4), dst));

                // add tmp2, dst
                ctx.emit(Inst::alu_rmi_r(
                    is_64,
                    AluRmiROpcode::Add,
                    RegMemImm::reg(tmp2.to_reg()),
                    dst,
                ));

                // and $0x0F0F_0F0F, dst
                ctx.emit(Inst::alu_rmi_r(
                    is_64,
                    AluRmiROpcode::And,
                    RegMemImm::imm(0x0F0F0F0F),
                    dst,
                ));

                // mul $0x0101_0101, dst
                ctx.emit(Inst::alu_rmi_r(
                    is_64,
                    AluRmiROpcode::Mul,
                    RegMemImm::imm(0x01010101),
                    dst,
                ));

                // shr $24, dst
                ctx.emit(Inst::shift_r(
                    4,
                    ShiftKind::ShiftRightLogical,
                    Some(24),
                    dst,
                ));
            }
        }
        Opcode::IsNull | Opcode::IsInvalid => {
            // Null references are represented by the constant value 0; invalid references are
            // represented by the constant value -1. See `define_reftypes()` in
            // `meta/src/isa/x86/encodings.rs` to confirm.
            let src = input_to_reg(ctx, inputs[0]);
            let dst = output_to_reg(ctx, outputs[0]);
            let ty = ctx.input_ty(insn, 0);
            let imm = match op {
                Opcode::IsNull => {
                    // TODO could use tst src, src for IsNull
                    0
                }
                Opcode::IsInvalid => {
                    // We can do a 32-bit comparison even in 64-bit mode, as the constant is then
                    // sign-extended.
                    0xffffffff
                }
                _ => unreachable!(),
            };
            ctx.emit(Inst::cmp_rmi_r(ty.bytes() as u8, RegMemImm::imm(imm), src));
            ctx.emit(Inst::setcc(CC::Z, dst));
        }

        Opcode::Uextend
        | Opcode::Sextend
        | Opcode::Bint
        | Opcode::Breduce
        | Opcode::Bextend
        | Opcode::Ireduce => {
            let src_ty = ctx.input_ty(insn, 0);
            let dst_ty = ctx.output_ty(insn, 0);

            let src = input_to_reg_mem(ctx, inputs[0]);
            let dst = output_to_reg(ctx, outputs[0]);

            let ext_mode = match (src_ty.bits(), dst_ty.bits()) {
                (1, 8) | (1, 16) | (1, 32) | (8, 16) | (8, 32) => Some(ExtMode::BL),
                (1, 64) | (8, 64) => Some(ExtMode::BQ),
                (16, 32) => Some(ExtMode::WL),
                (16, 64) => Some(ExtMode::WQ),
                (32, 64) => Some(ExtMode::LQ),
                (x, y) if x >= y => None,
                _ => unreachable!(
                    "unexpected extension kind from {:?} to {:?}",
                    src_ty, dst_ty
                ),
            };

            // All of these other opcodes are simply a move from a zero-extended source. Here
            // is why this works, in each case:
            //
            // - Bint: Bool-to-int. We always represent a bool as a 0 or 1, so we
            //   merely need to zero-extend here.
            //
            // - Breduce, Bextend: changing width of a boolean. We represent a
            //   bool as a 0 or 1, so again, this is a zero-extend / no-op.
            //
            // - Ireduce: changing width of an integer. Smaller ints are stored
            //   with undefined high-order bits, so we can simply do a copy.

            if let Some(ext_mode) = ext_mode {
                if op == Opcode::Sextend {
                    ctx.emit(Inst::movsx_rm_r(
                        ext_mode, src, dst, /* infallible */ None,
                    ));
                } else {
                    ctx.emit(Inst::movzx_rm_r(
                        ext_mode, src, dst, /* infallible */ None,
                    ));
                }
            } else {
                ctx.emit(Inst::mov64_rm_r(src, dst, /* infallible */ None));
            }
        }
        Opcode::Icmp => {
            emit_cmp(ctx, insn);

            let condcode = inst_condcode(ctx.data(insn));
            let cc = CC::from_intcc(condcode);
            let dst = output_to_reg(ctx, outputs[0]);
            ctx.emit(Inst::setcc(cc, dst));
        }

        Opcode::Fcmp => {
            let cond_code = inst_fp_condcode(ctx.data(insn));
            let input_ty = ctx.input_ty(insn, 0);
            if !input_ty.is_vector() {
                // Unordered is returned by setting ZF, PF, CF <- 111
                // Greater than by ZF, PF, CF <- 000
                // Less than by ZF, PF, CF <- 001
                // Equal by ZF, PF, CF <- 100
                //
                // Checking the result of comiss is somewhat annoying because you don't have setcc
                // instructions that explicitly check simultaneously for the condition (i.e. eq,
                // le, gt, etc) *and* orderedness.
                //
                // So that might mean we need more than one setcc check and then a logical "and"
                // or "or" to determine both, in some cases. However, knowing that if the parity
                // bit is set then the result was unordered, and that in that case the ZF and CF
                // flag bits must also be set, we can get away with using one setcc for most
                // condition codes.
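                //
                // (For example, "greater than" uses CC::NBE, i.e. CF = 0 and ZF = 0, which is
                // automatically false for unordered inputs since those set both CF and ZF.)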

                let dst = output_to_reg(ctx, outputs[0]);

                match emit_fcmp(ctx, insn, cond_code, FcmpSpec::Normal) {
                    FcmpCondResult::Condition(cc) => {
                        ctx.emit(Inst::setcc(cc, dst));
                    }
                    FcmpCondResult::AndConditions(cc1, cc2) => {
                        let tmp = ctx.alloc_tmp(RegClass::I64, types::I32);
                        ctx.emit(Inst::setcc(cc1, tmp));
                        ctx.emit(Inst::setcc(cc2, dst));
                        ctx.emit(Inst::alu_rmi_r(
                            false,
                            AluRmiROpcode::And,
                            RegMemImm::reg(tmp.to_reg()),
                            dst,
                        ));
                    }
                    FcmpCondResult::OrConditions(cc1, cc2) => {
                        let tmp = ctx.alloc_tmp(RegClass::I64, types::I32);
                        ctx.emit(Inst::setcc(cc1, tmp));
                        ctx.emit(Inst::setcc(cc2, dst));
                        ctx.emit(Inst::alu_rmi_r(
                            false,
                            AluRmiROpcode::Or,
                            RegMemImm::reg(tmp.to_reg()),
                            dst,
                        ));
                    }
                    FcmpCondResult::InvertedEqualOrConditions(_, _) => unreachable!(),
                }
            } else {
                let op = match input_ty {
                    types::F32X4 => SseOpcode::Cmpps,
                    types::F64X2 => SseOpcode::Cmppd,
                    _ => panic!("Bad input type to fcmp: {}", input_ty),
                };

                // Since some packed comparisons are not available, some of the condition codes
                // must be inverted, with a corresponding `flip` of the operands.
                let (imm, flip) = match cond_code {
                    FloatCC::GreaterThan => (FcmpImm::LessThan, true),
                    FloatCC::GreaterThanOrEqual => (FcmpImm::LessThanOrEqual, true),
                    FloatCC::UnorderedOrLessThan => (FcmpImm::UnorderedOrGreaterThan, true),
                    FloatCC::UnorderedOrLessThanOrEqual => {
                        (FcmpImm::UnorderedOrGreaterThanOrEqual, true)
                    }
                    FloatCC::OrderedNotEqual | FloatCC::UnorderedOrEqual => {
                        panic!("unsupported float condition code: {}", cond_code)
                    }
                    _ => (FcmpImm::from(cond_code), false),
                };

                // Determine the operands of the comparison, possibly by flipping them.
                let (lhs, rhs) = if flip {
                    (
                        input_to_reg(ctx, inputs[1]),
                        input_to_reg_mem(ctx, inputs[0]),
                    )
                } else {
                    (
                        input_to_reg(ctx, inputs[0]),
                        input_to_reg_mem(ctx, inputs[1]),
                    )
                };

                // Move the `lhs` to the same register as `dst`; this may not emit an actual move
                // but ensures that the registers are the same to match x86's read-write operand
                // encoding.
                let dst = output_to_reg(ctx, outputs[0]);
                ctx.emit(Inst::gen_move(dst, lhs, input_ty));

                // Emit the comparison.
                ctx.emit(Inst::xmm_rm_r_imm(op, rhs, dst, imm.encode()));
            }
        }

        Opcode::FallthroughReturn | Opcode::Return => {
            for i in 0..ctx.num_inputs(insn) {
                let src_reg = input_to_reg(ctx, inputs[i]);
                let retval_reg = ctx.retval(i);
                let ty = ctx.input_ty(insn, i);
                ctx.emit(Inst::gen_move(retval_reg, src_reg, ty));
            }
            // N.B.: the Ret itself is generated by the ABI.
        }
        Opcode::Call | Opcode::CallIndirect => {
            let loc = ctx.srcloc(insn);
            let (mut abi, inputs) = match op {
                Opcode::Call => {
                    let (extname, dist) = ctx.call_target(insn).unwrap();
                    let sig = ctx.call_sig(insn).unwrap();
                    assert_eq!(inputs.len(), sig.params.len());
                    assert_eq!(outputs.len(), sig.returns.len());
                    (
                        X64ABICaller::from_func(sig, &extname, dist, loc)?,
                        &inputs[..],
                    )
                }

                Opcode::CallIndirect => {
                    let ptr = input_to_reg(ctx, inputs[0]);
                    let sig = ctx.call_sig(insn).unwrap();
                    assert_eq!(inputs.len() - 1, sig.params.len());
                    assert_eq!(outputs.len(), sig.returns.len());
                    (X64ABICaller::from_ptr(sig, ptr, loc, op)?, &inputs[1..])
                }

                _ => unreachable!(),
            };

            abi.emit_stack_pre_adjust(ctx);
            assert_eq!(inputs.len(), abi.num_args());
            for (i, input) in inputs.iter().enumerate() {
                let arg_reg = input_to_reg(ctx, *input);
                abi.emit_copy_reg_to_arg(ctx, i, arg_reg);
            }
            abi.emit_call(ctx);
            for (i, output) in outputs.iter().enumerate() {
                let retval_reg = output_to_reg(ctx, *output);
                abi.emit_copy_retval_to_reg(ctx, i, retval_reg);
            }
            abi.emit_stack_post_adjust(ctx);
        }

        Opcode::Debugtrap => {
            ctx.emit(Inst::Hlt);
        }

        Opcode::Trap | Opcode::ResumableTrap => {
            let trap_info = (ctx.srcloc(insn), inst_trapcode(ctx.data(insn)).unwrap());
            ctx.emit_safepoint(Inst::Ud2 { trap_info });
        }

        Opcode::Trapif | Opcode::Trapff => {
            let srcloc = ctx.srcloc(insn);
            let trap_code = inst_trapcode(ctx.data(insn)).unwrap();

            if matches_input(ctx, inputs[0], Opcode::IaddIfcout).is_some() {
                let cond_code = inst_condcode(ctx.data(insn));
                // The flags must not have been clobbered by any other instruction between the
                // iadd_ifcout and this instruction, as verified by the CLIF validator; so we can
                // simply use the flags here.
                let cc = CC::from_intcc(cond_code);

                ctx.emit_safepoint(Inst::TrapIf {
                    trap_code,
                    srcloc,
                    cc,
                });
            } else if op == Opcode::Trapif {
                let cond_code = inst_condcode(ctx.data(insn));
                let cc = CC::from_intcc(cond_code);

                // Verification ensures that the input is always a single-def ifcmp.
                let ifcmp = matches_input(ctx, inputs[0], Opcode::Ifcmp).unwrap();
                emit_cmp(ctx, ifcmp);

                ctx.emit_safepoint(Inst::TrapIf {
                    trap_code,
                    srcloc,
                    cc,
                });
            } else {
                let cond_code = inst_fp_condcode(ctx.data(insn));

                // Verification ensures that the input is always a single-def ffcmp.
                let ffcmp = matches_input(ctx, inputs[0], Opcode::Ffcmp).unwrap();

                match emit_fcmp(ctx, ffcmp, cond_code, FcmpSpec::Normal) {
                    FcmpCondResult::Condition(cc) => ctx.emit_safepoint(Inst::TrapIf {
                        trap_code,
                        srcloc,
                        cc,
                    }),
                    FcmpCondResult::AndConditions(cc1, cc2) => {
                        // A bit unfortunate, but materialize the flags in their own register, and
                        // check against this.
                        let tmp = ctx.alloc_tmp(RegClass::I64, types::I32);
                        let tmp2 = ctx.alloc_tmp(RegClass::I64, types::I32);
                        ctx.emit(Inst::setcc(cc1, tmp));
                        ctx.emit(Inst::setcc(cc2, tmp2));
                        ctx.emit(Inst::alu_rmi_r(
                            false, /* is_64 */
                            AluRmiROpcode::And,
                            RegMemImm::reg(tmp.to_reg()),
                            tmp2,
                        ));
                        ctx.emit_safepoint(Inst::TrapIf {
                            trap_code,
                            srcloc,
                            cc: CC::NZ,
                        });
                    }
                    FcmpCondResult::OrConditions(cc1, cc2) => {
                        ctx.emit_safepoint(Inst::TrapIf {
                            trap_code,
                            srcloc,
                            cc: cc1,
                        });
                        ctx.emit_safepoint(Inst::TrapIf {
                            trap_code,
                            srcloc,
                            cc: cc2,
                        });
                    }
                    FcmpCondResult::InvertedEqualOrConditions(_, _) => unreachable!(),
                };
            };
        }
        Opcode::F64const => {
            // TODO use cmpeqpd for all 1s.
            let value = ctx.get_constant(insn).unwrap();
            let dst = output_to_reg(ctx, outputs[0]);
            for inst in Inst::gen_constant(dst, value, types::F64, |reg_class, ty| {
                ctx.alloc_tmp(reg_class, ty)
            }) {
                ctx.emit(inst);
            }
        }

        Opcode::F32const => {
            // TODO use cmpeqps for all 1s.
            let value = ctx.get_constant(insn).unwrap();
            let dst = output_to_reg(ctx, outputs[0]);
            for inst in Inst::gen_constant(dst, value, types::F32, |reg_class, ty| {
                ctx.alloc_tmp(reg_class, ty)
            }) {
                ctx.emit(inst);
            }
        }

        Opcode::Fadd | Opcode::Fsub | Opcode::Fmul | Opcode::Fdiv => {
            let lhs = input_to_reg(ctx, inputs[0]);
            let rhs = input_to_reg_mem(ctx, inputs[1]);
            let dst = output_to_reg(ctx, outputs[0]);
            let ty = ty.unwrap();

            // Move the `lhs` to the same register as `dst`; this may not emit an actual move
            // but ensures that the registers are the same to match x86's read-write operand
            // encoding.
            ctx.emit(Inst::gen_move(dst, lhs, ty));

            // Note: min and max can't be handled here, because of the way Cranelift defines them:
            // if any operand is a NaN, they must return the NaN operand, while the x86 machine
            // instruction will return the second operand if either operand is a NaN.
            let sse_op = match ty {
                types::F32 => match op {
                    Opcode::Fadd => SseOpcode::Addss,
                    Opcode::Fsub => SseOpcode::Subss,
                    Opcode::Fmul => SseOpcode::Mulss,
                    Opcode::Fdiv => SseOpcode::Divss,
                    _ => unreachable!(),
                },
                types::F64 => match op {
                    Opcode::Fadd => SseOpcode::Addsd,
                    Opcode::Fsub => SseOpcode::Subsd,
                    Opcode::Fmul => SseOpcode::Mulsd,
                    Opcode::Fdiv => SseOpcode::Divsd,
                    _ => unreachable!(),
                },
                types::F32X4 => match op {
                    Opcode::Fadd => SseOpcode::Addps,
                    Opcode::Fsub => SseOpcode::Subps,
                    Opcode::Fmul => SseOpcode::Mulps,
                    Opcode::Fdiv => SseOpcode::Divps,
                    _ => unreachable!(),
                },
                types::F64X2 => match op {
                    Opcode::Fadd => SseOpcode::Addpd,
                    Opcode::Fsub => SseOpcode::Subpd,
                    Opcode::Fmul => SseOpcode::Mulpd,
                    Opcode::Fdiv => SseOpcode::Divpd,
                    _ => unreachable!(),
                },
                _ => panic!(
                    "invalid type: expected one of [F32, F64, F32X4, F64X2], found {}",
                    ty
                ),
            };
            ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst));
        }

        Opcode::Fmin | Opcode::Fmax => {
            let lhs = input_to_reg(ctx, inputs[0]);
            let rhs = input_to_reg(ctx, inputs[1]);
            let dst = output_to_reg(ctx, outputs[0]);
            let is_min = op == Opcode::Fmin;
            let output_ty = ty.unwrap();
            ctx.emit(Inst::gen_move(dst, rhs, output_ty));
            let op_size = match output_ty {
                types::F32 => OperandSize::Size32,
                types::F64 => OperandSize::Size64,
                _ => panic!("unexpected type {:?} for fmin/fmax", output_ty),
            };
            ctx.emit(Inst::xmm_min_max_seq(op_size, is_min, lhs, dst));
        }

        Opcode::Sqrt => {
            let src = input_to_reg_mem(ctx, inputs[0]);
            let dst = output_to_reg(ctx, outputs[0]);
            let ty = ty.unwrap();

            let sse_op = match ty {
                types::F32 => SseOpcode::Sqrtss,
                types::F64 => SseOpcode::Sqrtsd,
                types::F32X4 => SseOpcode::Sqrtps,
                types::F64X2 => SseOpcode::Sqrtpd,
                _ => panic!(
                    "invalid type: expected one of [F32, F64, F32X4, F64X2], found {}",
                    ty
                ),
            };

            ctx.emit(Inst::xmm_unary_rm_r(sse_op, src, dst));
        }

        Opcode::Fpromote => {
            let src = input_to_reg_mem(ctx, inputs[0]);
            let dst = output_to_reg(ctx, outputs[0]);
            ctx.emit(Inst::xmm_unary_rm_r(SseOpcode::Cvtss2sd, src, dst));
        }

        Opcode::Fdemote => {
            let src = input_to_reg_mem(ctx, inputs[0]);
            let dst = output_to_reg(ctx, outputs[0]);
            ctx.emit(Inst::xmm_unary_rm_r(SseOpcode::Cvtsd2ss, src, dst));
        }

        Opcode::FcvtFromSint => {
            let (ext_spec, src_size) = match ctx.input_ty(insn, 0) {
                types::I8 | types::I16 => (Some(ExtSpec::SignExtendTo32), OperandSize::Size32),
                types::I32 => (None, OperandSize::Size32),
                types::I64 => (None, OperandSize::Size64),
                _ => unreachable!(),
            };

            let src = match ext_spec {
                Some(ext_spec) => RegMem::reg(extend_input_to_reg(ctx, inputs[0], ext_spec)),
                None => input_to_reg_mem(ctx, inputs[0]),
            };

            let output_ty = ty.unwrap();
            let opcode = if output_ty == types::F32 {
                SseOpcode::Cvtsi2ss
            } else {
                assert_eq!(output_ty, types::F64);
                SseOpcode::Cvtsi2sd
            };

            let dst = output_to_reg(ctx, outputs[0]);
            ctx.emit(Inst::gpr_to_xmm(opcode, src, src_size, dst));
        }

        Opcode::FcvtFromUint => {
            let dst = output_to_reg(ctx, outputs[0]);
            let ty = ty.unwrap();

            let input_ty = ctx.input_ty(insn, 0);
            match input_ty {
                types::I8 | types::I16 | types::I32 => {
                    // Conversion from an unsigned int smaller than 64 bits is easy: zero-extend +
                    // do a signed conversion (which won't overflow).
                    let opcode = if ty == types::F32 {
                        SseOpcode::Cvtsi2ss
                    } else {
                        assert_eq!(ty, types::F64);
                        SseOpcode::Cvtsi2sd
                    };

                    let src =
                        RegMem::reg(extend_input_to_reg(ctx, inputs[0], ExtSpec::ZeroExtendTo64));
                    ctx.emit(Inst::gpr_to_xmm(opcode, src, OperandSize::Size64, dst));
                }

                types::I64 => {
                    let src = input_to_reg(ctx, inputs[0]);

                    let src_copy = ctx.alloc_tmp(RegClass::I64, types::I64);
                    ctx.emit(Inst::gen_move(src_copy, src, types::I64));

                    let tmp_gpr1 = ctx.alloc_tmp(RegClass::I64, types::I64);
                    let tmp_gpr2 = ctx.alloc_tmp(RegClass::I64, types::I64);
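                    // cvt_u64_to_float_seq presumably expands to the usual u64-to-float
                    // sequence: inputs with the top bit clear take a plain signed convert,
                    // while larger inputs are halved with the low bit folded in for rounding,
                    // converted, then doubled. See the pseudo-instruction's emission code for
                    // the authoritative sequence.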
                    ctx.emit(Inst::cvt_u64_to_float_seq(
                        ty == types::F64,
                        src_copy,
                        tmp_gpr1,
                        tmp_gpr2,
                        dst,
                    ));
                }

                _ => panic!("unexpected input type for FcvtFromUint: {:?}", input_ty),
            };
        }

        Opcode::FcvtToUint | Opcode::FcvtToUintSat | Opcode::FcvtToSint | Opcode::FcvtToSintSat => {
            let src = input_to_reg(ctx, inputs[0]);
            let dst = output_to_reg(ctx, outputs[0]);

            let input_ty = ctx.input_ty(insn, 0);
            let src_size = if input_ty == types::F32 {
                OperandSize::Size32
            } else {
                assert_eq!(input_ty, types::F64);
                OperandSize::Size64
            };

            let output_ty = ty.unwrap();
            let dst_size = if output_ty == types::I32 {
                OperandSize::Size32
            } else {
                assert_eq!(output_ty, types::I64);
                OperandSize::Size64
            };

            let to_signed = op == Opcode::FcvtToSint || op == Opcode::FcvtToSintSat;
            let is_sat = op == Opcode::FcvtToUintSat || op == Opcode::FcvtToSintSat;

            let src_copy = ctx.alloc_tmp(RegClass::V128, input_ty);
            ctx.emit(Inst::gen_move(src_copy, src, input_ty));

            let tmp_xmm = ctx.alloc_tmp(RegClass::V128, input_ty);
            let tmp_gpr = ctx.alloc_tmp(RegClass::I64, output_ty);

            let srcloc = ctx.srcloc(insn);
            if to_signed {
                ctx.emit(Inst::cvt_float_to_sint_seq(
                    src_size, dst_size, is_sat, src_copy, dst, tmp_gpr, tmp_xmm, srcloc,
                ));
            } else {
                ctx.emit(Inst::cvt_float_to_uint_seq(
                    src_size, dst_size, is_sat, src_copy, dst, tmp_gpr, tmp_xmm, srcloc,
                ));
            }
        }

        Opcode::Bitcast => {
            let input_ty = ctx.input_ty(insn, 0);
            let output_ty = ctx.output_ty(insn, 0);
            match (input_ty, output_ty) {
                (types::F32, types::I32) => {
                    let src = input_to_reg(ctx, inputs[0]);
                    let dst = output_to_reg(ctx, outputs[0]);
                    ctx.emit(Inst::xmm_to_gpr(
                        SseOpcode::Movd,
                        src,
                        dst,
                        OperandSize::Size32,
                    ));
                }
                (types::I32, types::F32) => {
                    let src = input_to_reg_mem(ctx, inputs[0]);
                    let dst = output_to_reg(ctx, outputs[0]);
                    ctx.emit(Inst::gpr_to_xmm(
                        SseOpcode::Movd,
                        src,
                        OperandSize::Size32,
                        dst,
                    ));
                }
                (types::F64, types::I64) => {
                    let src = input_to_reg(ctx, inputs[0]);
                    let dst = output_to_reg(ctx, outputs[0]);
                    ctx.emit(Inst::xmm_to_gpr(
                        SseOpcode::Movq,
                        src,
                        dst,
                        OperandSize::Size64,
                    ));
                }
                (types::I64, types::F64) => {
                    let src = input_to_reg_mem(ctx, inputs[0]);
                    let dst = output_to_reg(ctx, outputs[0]);
                    ctx.emit(Inst::gpr_to_xmm(
                        SseOpcode::Movq,
                        src,
                        OperandSize::Size64,
                        dst,
                    ));
                }
                _ => unreachable!("invalid bitcast from {:?} to {:?}", input_ty, output_ty),
            }
        }

        Opcode::Fabs | Opcode::Fneg => {
            let src = input_to_reg_mem(ctx, inputs[0]);
            let dst = output_to_reg(ctx, outputs[0]);

            // In both cases, generate a constant and apply a single binary instruction:
            // - to compute the absolute value, set all bits to 1 but the MSB to 0, and bit-AND
            //   the src with it.
            // - to compute the negated value, set all bits to 0 but the MSB to 1, and bit-XOR
            //   the src with it.
            let output_ty = ty.unwrap();
            if !output_ty.is_vector() {
                let (val, opcode) = match output_ty {
                    types::F32 => match op {
                        Opcode::Fabs => (0x7fffffff, SseOpcode::Andps),
                        Opcode::Fneg => (0x80000000, SseOpcode::Xorps),
                        _ => unreachable!(),
                    },
                    types::F64 => match op {
                        Opcode::Fabs => (0x7fffffffffffffff, SseOpcode::Andpd),
                        Opcode::Fneg => (0x8000000000000000, SseOpcode::Xorpd),
                        _ => unreachable!(),
                    },
                    _ => panic!("unexpected type {:?} for Fabs/Fneg", output_ty),
                };

                for inst in Inst::gen_constant(dst, val, output_ty, |reg_class, ty| {
                    ctx.alloc_tmp(reg_class, ty)
                }) {
                    ctx.emit(inst);
                }

                ctx.emit(Inst::xmm_rm_r(opcode, src, dst));
            } else {
                // Eventually vector constants should be available in `gen_constant` and this block
                // can be merged with the one above (TODO).
                if output_ty.bits() == 128 {
                    // Move the `src` to the same register as `dst`; this may not emit an actual
                    // move but ensures that the registers are the same to match x86's read-write
                    // operand encoding.
                    let src = input_to_reg(ctx, inputs[0]);
                    ctx.emit(Inst::gen_move(dst, src, output_ty));

                    // Generate an all 1s constant in an XMM register. This uses CMPPS but could
                    // have used CMPPD with the same effect.
                    let tmp = ctx.alloc_tmp(RegClass::V128, output_ty);
                    let cond = FcmpImm::from(FloatCC::Equal);
                    let cmpps = Inst::xmm_rm_r_imm(
                        SseOpcode::Cmpps,
                        RegMem::reg(tmp.to_reg()),
                        tmp,
                        cond.encode(),
                    );
                    ctx.emit(cmpps);

                    // Shift the all 1s constant to generate the mask.
                    let lane_bits = output_ty.lane_bits();
                    let (shift_opcode, opcode, shift_by) = match (op, lane_bits) {
                        (Opcode::Fabs, 32) => (SseOpcode::Psrld, SseOpcode::Andps, 1),
                        (Opcode::Fabs, 64) => (SseOpcode::Psrlq, SseOpcode::Andpd, 1),
                        (Opcode::Fneg, 32) => (SseOpcode::Pslld, SseOpcode::Xorps, 31),
                        (Opcode::Fneg, 64) => (SseOpcode::Psllq, SseOpcode::Xorpd, 63),
                        _ => unreachable!(
                            "unexpected opcode and lane size: {:?}, {} bits",
                            op, lane_bits
                        ),
                    };
                    let shift = Inst::xmm_rmi_reg(shift_opcode, RegMemImm::imm(shift_by), tmp);
                    ctx.emit(shift);

                    // Apply shifted mask (XOR or AND).
                    let mask = Inst::xmm_rm_r(opcode, RegMem::reg(tmp.to_reg()), dst);
                    ctx.emit(mask);
                } else {
                    panic!("unexpected type {:?} for Fabs/Fneg", output_ty);
                }
            }
        }

        Opcode::Fcopysign => {
            let dst = output_to_reg(ctx, outputs[0]);
            let lhs = input_to_reg(ctx, inputs[0]);
            let rhs = input_to_reg(ctx, inputs[1]);

            let ty = ty.unwrap();

            // We're going to generate the following sequence:
            //
            // movabs $INT_MIN, tmp_gpr1
            // mov{d,q} tmp_gpr1, tmp_xmm1
            // movap{s,d} tmp_xmm1, dst
            // andnp{s,d} src_1, dst
            // movap{s,d} src_2, tmp_xmm2
            // andp{s,d} tmp_xmm1, tmp_xmm2
            // orp{s,d} tmp_xmm2, dst
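            //
            // i.e. dst = (src_1 & ~SIGN_MASK) | (src_2 & SIGN_MASK): the magnitude of the
            // first operand combined with the sign bit of the second.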
|
|
|
|
let tmp_xmm1 = ctx.alloc_tmp(RegClass::V128, types::F32);
|
|
let tmp_xmm2 = ctx.alloc_tmp(RegClass::V128, types::F32);
|
|
|
|
let (sign_bit_cst, mov_op, and_not_op, and_op, or_op) = match ty {
|
|
types::F32 => (
|
|
0x8000_0000,
|
|
SseOpcode::Movaps,
|
|
SseOpcode::Andnps,
|
|
SseOpcode::Andps,
|
|
SseOpcode::Orps,
|
|
),
|
|
types::F64 => (
|
|
0x8000_0000_0000_0000,
|
|
SseOpcode::Movapd,
|
|
SseOpcode::Andnpd,
|
|
SseOpcode::Andpd,
|
|
SseOpcode::Orpd,
|
|
),
|
|
_ => {
|
|
panic!("unexpected type {:?} for copysign", ty);
|
|
}
|
|
};
|
|
|
|
for inst in Inst::gen_constant(tmp_xmm1, sign_bit_cst, ty, |reg_class, ty| {
|
|
ctx.alloc_tmp(reg_class, ty)
|
|
}) {
|
|
ctx.emit(inst);
|
|
}
|
|
ctx.emit(Inst::xmm_mov(
|
|
mov_op,
|
|
RegMem::reg(tmp_xmm1.to_reg()),
|
|
dst,
|
|
None,
|
|
));
|
|
ctx.emit(Inst::xmm_rm_r(and_not_op, RegMem::reg(lhs), dst));
|
|
ctx.emit(Inst::xmm_mov(mov_op, RegMem::reg(rhs), tmp_xmm2, None));
|
|
ctx.emit(Inst::xmm_rm_r(
|
|
and_op,
|
|
RegMem::reg(tmp_xmm1.to_reg()),
|
|
tmp_xmm2,
|
|
));
|
|
ctx.emit(Inst::xmm_rm_r(or_op, RegMem::reg(tmp_xmm2.to_reg()), dst));
|
|
}
|
|
|
|
Opcode::Ceil | Opcode::Floor | Opcode::Nearest | Opcode::Trunc => {
|
|
// TODO use ROUNDSS/ROUNDSD after sse4.1.
|
|
|
|
// Lower to VM calls when there's no access to SSE4.1.
|
|
let ty = ty.unwrap();
|
|
let libcall = match (ty, op) {
|
|
(types::F32, Opcode::Ceil) => LibCall::CeilF32,
|
|
(types::F64, Opcode::Ceil) => LibCall::CeilF64,
|
|
(types::F32, Opcode::Floor) => LibCall::FloorF32,
|
|
(types::F64, Opcode::Floor) => LibCall::FloorF64,
|
|
(types::F32, Opcode::Nearest) => LibCall::NearestF32,
|
|
(types::F64, Opcode::Nearest) => LibCall::NearestF64,
|
|
(types::F32, Opcode::Trunc) => LibCall::TruncF32,
|
|
(types::F64, Opcode::Trunc) => LibCall::TruncF64,
|
|
_ => panic!(
|
|
"unexpected type/opcode {:?}/{:?} in Ceil/Floor/Nearest/Trunc",
|
|
ty, op
|
|
),
|
|
};
|
|
|
|
emit_vm_call(ctx, flags, triple, libcall, insn, inputs, outputs)?;
|
|
}
|
|
|
|
Opcode::Load
|
|
| Opcode::Uload8
|
|
| Opcode::Sload8
|
|
| Opcode::Uload16
|
|
| Opcode::Sload16
|
|
| Opcode::Uload32
|
|
| Opcode::Sload32
|
|
| Opcode::LoadComplex
|
|
| Opcode::Uload8Complex
|
|
| Opcode::Sload8Complex
|
|
| Opcode::Uload16Complex
|
|
| Opcode::Sload16Complex
|
|
| Opcode::Uload32Complex
|
|
| Opcode::Sload32Complex => {
|
|
let offset = ldst_offset(ctx.data(insn)).unwrap();
|
|
|
|
let elem_ty = match op {
|
|
Opcode::Sload8 | Opcode::Uload8 | Opcode::Sload8Complex | Opcode::Uload8Complex => {
|
|
types::I8
|
|
}
|
|
Opcode::Sload16
|
|
| Opcode::Uload16
|
|
| Opcode::Sload16Complex
|
|
| Opcode::Uload16Complex => types::I16,
|
|
Opcode::Sload32
|
|
| Opcode::Uload32
|
|
| Opcode::Sload32Complex
|
|
| Opcode::Uload32Complex => types::I32,
|
|
Opcode::Load | Opcode::LoadComplex => ctx.output_ty(insn, 0),
|
|
_ => unimplemented!(),
|
|
};
|
|
|
|
let ext_mode = match elem_ty.bytes() {
|
|
1 => Some(ExtMode::BQ),
|
|
2 => Some(ExtMode::WQ),
|
|
4 => Some(ExtMode::LQ),
|
|
_ => None,
|
|
};

            let sign_extend = match op {
                Opcode::Sload8
                | Opcode::Sload8Complex
                | Opcode::Sload16
                | Opcode::Sload16Complex
                | Opcode::Sload32
                | Opcode::Sload32Complex => true,
                _ => false,
            };

            let amode = match op {
                Opcode::Load
                | Opcode::Uload8
                | Opcode::Sload8
                | Opcode::Uload16
                | Opcode::Sload16
                | Opcode::Uload32
                | Opcode::Sload32 => {
                    assert_eq!(inputs.len(), 1, "expected exactly one input for a simple load");
                    lower_to_amode(ctx, inputs[0], offset as u32)
                }

                Opcode::LoadComplex
                | Opcode::Uload8Complex
                | Opcode::Sload8Complex
                | Opcode::Uload16Complex
                | Opcode::Sload16Complex
                | Opcode::Uload32Complex
                | Opcode::Sload32Complex => {
                    assert_eq!(
                        inputs.len(),
                        2,
                        "expected exactly two inputs (base and index) for a complex load"
                    );
                    let base = input_to_reg(ctx, inputs[0]);
                    let index = input_to_reg(ctx, inputs[1]);
                    let shift = 0;
                    Amode::imm_reg_reg_shift(offset as u32, base, index, shift)
                }

                _ => unreachable!(),
            };

            let srcloc = Some(ctx.srcloc(insn));

            let dst = output_to_reg(ctx, outputs[0]);
            let is_xmm = elem_ty.is_float() || elem_ty.is_vector();
            match (sign_extend, is_xmm) {
                (true, false) => {
                    // The load is sign-extended only when the output size is smaller than
                    // 64 bits, so an ext-mode is always defined in this case.
                    ctx.emit(Inst::movsx_rm_r(
                        ext_mode.unwrap(),
                        RegMem::mem(amode),
                        dst,
                        srcloc,
                    ));
                }
                (false, false) => {
                    if elem_ty.bytes() == 8 {
                        // Use a plain load.
                        ctx.emit(Inst::mov64_m_r(amode, dst, srcloc))
                    } else {
                        // Use a zero-extended load.
                        ctx.emit(Inst::movzx_rm_r(
                            ext_mode.unwrap(),
                            RegMem::mem(amode),
                            dst,
                            srcloc,
                        ))
                    }
                }
                (_, true) => {
                    ctx.emit(match elem_ty {
                        types::F32 => {
                            Inst::xmm_mov(SseOpcode::Movss, RegMem::mem(amode), dst, srcloc)
                        }
                        types::F64 => {
                            Inst::xmm_mov(SseOpcode::Movsd, RegMem::mem(amode), dst, srcloc)
                        }
                        _ if elem_ty.is_vector() && elem_ty.bits() == 128 => {
                            Inst::xmm_mov(SseOpcode::Movups, RegMem::mem(amode), dst, srcloc)
                        } // TODO: specialize for different types: MOVUPD, MOVDQU
                        _ => unreachable!("unexpected type for load: {:?}", elem_ty),
                    });
                }
            }
        }

        Opcode::Store
        | Opcode::Istore8
        | Opcode::Istore16
        | Opcode::Istore32
        | Opcode::StoreComplex
        | Opcode::Istore8Complex
        | Opcode::Istore16Complex
        | Opcode::Istore32Complex => {
            let offset = ldst_offset(ctx.data(insn)).unwrap();

            let elem_ty = match op {
                Opcode::Istore8 | Opcode::Istore8Complex => types::I8,
                Opcode::Istore16 | Opcode::Istore16Complex => types::I16,
                Opcode::Istore32 | Opcode::Istore32Complex => types::I32,
                Opcode::Store | Opcode::StoreComplex => ctx.input_ty(insn, 0),
                _ => unreachable!(),
            };

            let addr = match op {
                Opcode::Store | Opcode::Istore8 | Opcode::Istore16 | Opcode::Istore32 => {
                    assert_eq!(
                        inputs.len(),
                        2,
                        "expected exactly two inputs (value and address) for a store"
                    );
                    lower_to_amode(ctx, inputs[1], offset as u32)
                }

                Opcode::StoreComplex
                | Opcode::Istore8Complex
                | Opcode::Istore16Complex
                | Opcode::Istore32Complex => {
                    assert_eq!(
                        inputs.len(),
                        3,
                        "expected exactly three inputs (value, base and index) for a complex store"
                    );
                    let base = input_to_reg(ctx, inputs[1]);
                    let index = input_to_reg(ctx, inputs[2]);
                    let shift = 0;
                    Amode::imm_reg_reg_shift(offset as u32, base, index, shift)
                }

                _ => unreachable!(),
            };

            let src = input_to_reg(ctx, inputs[0]);

            let srcloc = Some(ctx.srcloc(insn));

            ctx.emit(match elem_ty {
                types::F32 => Inst::xmm_mov_r_m(SseOpcode::Movss, src, addr, srcloc),
                types::F64 => Inst::xmm_mov_r_m(SseOpcode::Movsd, src, addr, srcloc),
                _ if elem_ty.is_vector() && elem_ty.bits() == 128 => {
                    // TODO: specialize for different types: MOVUPD, MOVDQU, etc.
                    Inst::xmm_mov_r_m(SseOpcode::Movups, src, addr, srcloc)
                }
                _ => Inst::mov_r_m(elem_ty.bytes() as u8, src, addr, srcloc),
            });
        }

        Opcode::AtomicRmw => {
            // This is a simple, general-case atomic update, based on a loop involving
            // `cmpxchg`. Note that we could do much better than this in the case where the old
            // value at the location (that is to say, the SSA `Value` computed by this CLIF
            // instruction) is not required. In that case, we could instead implement this
            // using a single `lock`-prefixed x64 read-modify-write instruction. Also, even in
            // the case where the old value is required, for the `add` and `sub` cases, we can
            // use the single instruction `lock xadd`. However, those improvements have been
            // left for another day.
            // TODO: filed as https://github.com/bytecodealliance/wasmtime/issues/2153
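            //
            // Schematically, the `AtomicRmwSeq` pseudo-instruction emitted below
            // expands to a retry loop of roughly this shape:
            //
            //   mov (%r9), %rax           ; load the old value
            // again:
            //   mov %rax, %tmp
            //   <op> %r10, %tmp           ; compute the updated value
            //   lock cmpxchg %tmp, (%r9)  ; store it iff (%r9) still holds %rax
            //   jnz again                 ; otherwise retry with the freshly-loaded value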
            let dst = output_to_reg(ctx, outputs[0]);
            let mut addr = input_to_reg(ctx, inputs[0]);
            let mut arg2 = input_to_reg(ctx, inputs[1]);
            let ty_access = ty.unwrap();
            assert!(is_valid_atomic_transaction_ty(ty_access));
            let memflags = ctx.memflags(insn).expect("memory flags");
            let srcloc = if !memflags.notrap() {
                Some(ctx.srcloc(insn))
            } else {
                None
            };

            // Make sure that both args are in virtual regs, since in effect we have to do a
            // parallel copy to get them safely to the AtomicRmwSeq input regs, and that's not
            // guaranteed safe if either is in a real reg.
            addr = ctx.ensure_in_vreg(addr, types::I64);
            arg2 = ctx.ensure_in_vreg(arg2, types::I64);

            // Move the args to the preordained AtomicRMW input regs. Note that `AtomicRmwSeq`
            // operates at whatever width is specified by `ty`, so there's no need to
            // zero-extend `arg2` in the case of `ty` being I8/I16/I32.
            ctx.emit(Inst::gen_move(
                Writable::from_reg(regs::r9()),
                addr,
                types::I64,
            ));
            ctx.emit(Inst::gen_move(
                Writable::from_reg(regs::r10()),
                arg2,
                types::I64,
            ));

            // Now the AtomicRmwSeq (pseudo-) instruction itself.
            let op = inst_common::AtomicRmwOp::from(inst_atomic_rmw_op(ctx.data(insn)).unwrap());
            ctx.emit(Inst::AtomicRmwSeq {
                ty: ty_access,
                op,
                srcloc,
            });

            // And finally, copy the preordained AtomicRmwSeq output reg to its destination.
            ctx.emit(Inst::gen_move(dst, regs::rax(), types::I64));
        }

        Opcode::AtomicCas => {
            // This is very similar to, but not identical to, the `AtomicRmw` case. As with
            // `AtomicRmw`, there's no need to zero-extend narrow values here.
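            //
            // Recall the `lock cmpxchg` semantics: the memory operand is compared with
            // %rax; if they are equal, the source register is stored to memory, otherwise
            // the memory value is loaded into %rax. Either way, %rax ends up holding the
            // old value at the location, which is exactly what `AtomicCas` must return.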
            let dst = output_to_reg(ctx, outputs[0]);
            let addr = lower_to_amode(ctx, inputs[0], 0);
            let expected = input_to_reg(ctx, inputs[1]);
            let replacement = input_to_reg(ctx, inputs[2]);
            let ty_access = ty.unwrap();
            assert!(is_valid_atomic_transaction_ty(ty_access));
            let memflags = ctx.memflags(insn).expect("memory flags");
            let srcloc = if !memflags.notrap() {
                Some(ctx.srcloc(insn))
            } else {
                None
            };

            // Move the expected value into %rax. Because there's only one fixed register on
            // the input side, we don't have to use `ensure_in_vreg`, as is necessary in the
            // `AtomicRmw` case.
            ctx.emit(Inst::gen_move(
                Writable::from_reg(regs::rax()),
                expected,
                types::I64,
            ));
            ctx.emit(Inst::LockCmpxchg {
                ty: ty_access,
                src: replacement,
                dst: addr.into(),
                srcloc,
            });
            // And finally, copy the old value at the location to its destination reg.
            ctx.emit(Inst::gen_move(dst, regs::rax(), types::I64));
        }

        Opcode::AtomicLoad => {
            // This is a normal load. The x86-TSO memory model provides sufficient sequencing
            // to satisfy the CLIF synchronisation requirements for `AtomicLoad` without the
            // need for any fence instructions.
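            // (Plain naturally-aligned loads of up to 8 bytes are architecturally atomic
            // on x64, so no `lock` prefix is needed either.)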
            let data = output_to_reg(ctx, outputs[0]);
            let addr = lower_to_amode(ctx, inputs[0], 0);
            let ty_access = ty.unwrap();
            assert!(is_valid_atomic_transaction_ty(ty_access));
            let memflags = ctx.memflags(insn).expect("memory flags");
            let srcloc = if !memflags.notrap() {
                Some(ctx.srcloc(insn))
            } else {
                None
            };

            let rm = RegMem::mem(addr);
            if ty_access == types::I64 {
                ctx.emit(Inst::mov64_rm_r(rm, data, srcloc));
            } else {
                let ext_mode = match ty_access {
                    types::I8 => ExtMode::BQ,
                    types::I16 => ExtMode::WQ,
                    types::I32 => ExtMode::LQ,
                    _ => panic!("lowering AtomicLoad: invalid type"),
                };
                ctx.emit(Inst::movzx_rm_r(ext_mode, rm, data, srcloc));
            }
        }

        Opcode::AtomicStore => {
            // This is a normal store, followed by an `mfence` instruction.
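            // The fence is needed because x86-TSO permits exactly one reordering,
            // namely a later load moving ahead of an earlier store, and that would
            // violate the sequential consistency required of `AtomicStore`.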
            let data = input_to_reg(ctx, inputs[0]);
            let addr = lower_to_amode(ctx, inputs[1], 0);
            let ty_access = ctx.input_ty(insn, 0);
            assert!(is_valid_atomic_transaction_ty(ty_access));
            let memflags = ctx.memflags(insn).expect("memory flags");
            let srcloc = if !memflags.notrap() {
                Some(ctx.srcloc(insn))
            } else {
                None
            };

            ctx.emit(Inst::mov_r_m(ty_access.bytes() as u8, data, addr, srcloc));
            ctx.emit(Inst::Fence {
                kind: FenceKind::MFence,
            });
        }

        Opcode::Fence => {
            ctx.emit(Inst::Fence {
                kind: FenceKind::MFence,
            });
        }

        Opcode::FuncAddr => {
            let dst = output_to_reg(ctx, outputs[0]);
            let (extname, _) = ctx.call_target(insn).unwrap();
            let extname = extname.clone();
            let loc = ctx.srcloc(insn);
            ctx.emit(Inst::LoadExtName {
                dst,
                name: Box::new(extname),
                srcloc: loc,
                offset: 0,
            });
        }

        Opcode::SymbolValue => {
            let dst = output_to_reg(ctx, outputs[0]);
            let (extname, _, offset) = ctx.symbol_value(insn).unwrap();
            let extname = extname.clone();
            let loc = ctx.srcloc(insn);
            ctx.emit(Inst::LoadExtName {
                dst,
                name: Box::new(extname),
                srcloc: loc,
                offset,
            });
        }

        Opcode::StackAddr => {
            let (stack_slot, offset) = match *ctx.data(insn) {
                InstructionData::StackLoad {
                    opcode: Opcode::StackAddr,
                    stack_slot,
                    offset,
                } => (stack_slot, offset),
                _ => unreachable!(),
            };
            let dst = output_to_reg(ctx, outputs[0]);
            let offset: i32 = offset.into();
            let inst = ctx
                .abi()
                .stackslot_addr(stack_slot, u32::try_from(offset).unwrap(), dst);
            ctx.emit(inst);
        }

        Opcode::Select => {
            let flag_input = inputs[0];
            if let Some(fcmp) = matches_input(ctx, flag_input, Opcode::Fcmp) {
                let cond_code = inst_fp_condcode(ctx.data(fcmp));

                // We request inversion of Equal to NotEqual here: taking the LHS if equal
                // would mean taking it if both CC::NP and CC::Z are set, a conjunction that
                // can't be modeled with a single cmov instruction. Instead, we'll swap LHS
                // and RHS in the select operation, and invert the equal to a not-equal here.
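                //
                // (Background: `ucomis{s,d}` reports "equal" as ZF=1 with PF=0; PF=1 marks
                // an unordered comparison, i.e. a NaN operand, so a correct equality test
                // needs both flags.)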
                let fcmp_results = emit_fcmp(ctx, fcmp, cond_code, FcmpSpec::InvertEqual);

                let (lhs_input, rhs_input) = match fcmp_results {
                    FcmpCondResult::InvertedEqualOrConditions(_, _) => (inputs[2], inputs[1]),
                    FcmpCondResult::Condition(_)
                    | FcmpCondResult::AndConditions(_, _)
                    | FcmpCondResult::OrConditions(_, _) => (inputs[1], inputs[2]),
                };

                let ty = ctx.output_ty(insn, 0);
                let rhs = input_to_reg(ctx, rhs_input);
                let dst = output_to_reg(ctx, outputs[0]);
                let lhs = if is_int_ty(ty) && ty.bytes() < 4 {
                    // Special case: since the higher bits are undefined per CLIF semantics,
                    // we can just apply a 32-bit cmove here. Force the input into a register,
                    // though: a 32-bit cmove from a narrower in-memory operand could read
                    // out of bounds.
                    RegMem::reg(input_to_reg(ctx, lhs_input))
                } else {
                    input_to_reg_mem(ctx, lhs_input)
                };

                ctx.emit(Inst::gen_move(dst, rhs, ty));

                match fcmp_results {
                    FcmpCondResult::Condition(cc) => {
                        if is_int_ty(ty) {
                            let size = u8::max(ty.bytes() as u8, 4);
                            ctx.emit(Inst::cmove(size, cc, lhs, dst));
                        } else {
                            ctx.emit(Inst::xmm_cmove(ty == types::F64, cc, lhs, dst));
                        }
                    }
                    FcmpCondResult::AndConditions(_, _) => {
                        unreachable!(
                            "can't AND with select; see above comment about inverting equal"
                        );
                    }
                    FcmpCondResult::InvertedEqualOrConditions(cc1, cc2)
                    | FcmpCondResult::OrConditions(cc1, cc2) => {
                        if is_int_ty(ty) {
                            let size = u8::max(ty.bytes() as u8, 4);
                            ctx.emit(Inst::cmove(size, cc1, lhs.clone(), dst));
                            ctx.emit(Inst::cmove(size, cc2, lhs, dst));
                        } else {
                            ctx.emit(Inst::xmm_cmove(ty == types::F64, cc1, lhs.clone(), dst));
                            ctx.emit(Inst::xmm_cmove(ty == types::F64, cc2, lhs, dst));
                        }
                    }
                }
            } else {
                let cc = if let Some(icmp) = matches_input(ctx, flag_input, Opcode::Icmp) {
                    emit_cmp(ctx, icmp);
                    let cond_code = inst_condcode(ctx.data(icmp));
                    CC::from_intcc(cond_code)
                } else {
                    // The input is a boolean value; compare it against zero.
                    let size = ctx.input_ty(insn, 0).bytes() as u8;
                    let test = input_to_reg(ctx, inputs[0]);
                    ctx.emit(Inst::cmp_rmi_r(size, RegMemImm::imm(0), test));
                    CC::NZ
                };

                let rhs = input_to_reg(ctx, inputs[2]);
                let dst = output_to_reg(ctx, outputs[0]);
                let ty = ctx.output_ty(insn, 0);

                ctx.emit(Inst::gen_move(dst, rhs, ty));

                if is_int_ty(ty) {
                    let mut size = ty.bytes() as u8;
                    let lhs = if size < 4 {
                        // Special case: since the higher bits are undefined per CLIF
                        // semantics, we can just apply a 32-bit cmove here. Force the input
                        // into a register, though: a 32-bit cmove from a narrower in-memory
                        // operand could read out of bounds.
                        size = 4;
                        RegMem::reg(input_to_reg(ctx, inputs[1]))
                    } else {
                        input_to_reg_mem(ctx, inputs[1])
                    };
                    ctx.emit(Inst::cmove(size, cc, lhs, dst));
                } else {
                    debug_assert!(ty == types::F32 || ty == types::F64);
                    let lhs = input_to_reg_mem(ctx, inputs[1]);
                    ctx.emit(Inst::xmm_cmove(ty == types::F64, cc, lhs, dst));
                }
            }
        }

        Opcode::Selectif | Opcode::SelectifSpectreGuard => {
            // Verification ensures that the input is always a single-def ifcmp.
            let cmp_insn = ctx
                .get_input(inputs[0].insn, inputs[0].input)
                .inst
                .unwrap()
                .0;
            debug_assert_eq!(ctx.data(cmp_insn).opcode(), Opcode::Ifcmp);
            emit_cmp(ctx, cmp_insn);

            let cc = CC::from_intcc(inst_condcode(ctx.data(insn)));

            let lhs = input_to_reg_mem(ctx, inputs[1]);
            let rhs = input_to_reg(ctx, inputs[2]);
            let dst = output_to_reg(ctx, outputs[0]);

            let ty = ctx.output_ty(insn, 0);

            if is_int_ty(ty) {
                let size = ty.bytes() as u8;
                if size == 1 {
                    // Sign-extend operands to 32 bits, then do a cmove of size 4.
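                    // (There is no 8-bit form of CMOVcc, so the operands must be widened
                    // before the conditional move.)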
                    let lhs_se = ctx.alloc_tmp(RegClass::I64, types::I32);
                    ctx.emit(Inst::movsx_rm_r(ExtMode::BL, lhs, lhs_se, None));
                    ctx.emit(Inst::movsx_rm_r(ExtMode::BL, RegMem::reg(rhs), dst, None));
                    ctx.emit(Inst::cmove(4, cc, RegMem::reg(lhs_se.to_reg()), dst));
                } else {
                    ctx.emit(Inst::gen_move(dst, rhs, ty));
                    ctx.emit(Inst::cmove(size, cc, lhs, dst));
                }
            } else {
                debug_assert!(ty == types::F32 || ty == types::F64);
                ctx.emit(Inst::gen_move(dst, rhs, ty));
                ctx.emit(Inst::xmm_cmove(ty == types::F64, cc, lhs, dst));
            }
        }

        Opcode::Udiv | Opcode::Urem | Opcode::Sdiv | Opcode::Srem => {
            let kind = match op {
                Opcode::Udiv => DivOrRemKind::UnsignedDiv,
                Opcode::Sdiv => DivOrRemKind::SignedDiv,
                Opcode::Urem => DivOrRemKind::UnsignedRem,
                Opcode::Srem => DivOrRemKind::SignedRem,
                _ => unreachable!(),
            };
            let is_div = kind.is_div();

            let input_ty = ctx.input_ty(insn, 0);
            let size = input_ty.bytes() as u8;

            let dividend = input_to_reg(ctx, inputs[0]);
            let dst = output_to_reg(ctx, outputs[0]);

            let srcloc = ctx.srcloc(insn);
            ctx.emit(Inst::gen_move(
                Writable::from_reg(regs::rax()),
                dividend,
                input_ty,
            ));

            if flags.avoid_div_traps() {
                // A vcode meta-instruction is used to lower the inline checks, since they
                // embed pc-relative offsets that must not change, thus requiring regalloc
                // not to interfere by introducing spills and reloads.
                //
                // Note that it keeps the result in $rax (for divide) or $rdx (for rem), so
                // that regalloc is aware of the coalescing opportunity between rax/rdx and
                // the destination register.
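                //
                // Roughly, the checked sequence: traps on a zero divisor; for signed
                // division, special-cases INT_MIN / -1, whose quotient overflows (sdiv
                // traps, srem yields 0); and otherwise falls through to the plain
                // div/idiv.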
                let divisor = input_to_reg(ctx, inputs[1]);

                let divisor_copy = ctx.alloc_tmp(RegClass::I64, types::I64);
                ctx.emit(Inst::gen_move(divisor_copy, divisor, types::I64));

                let tmp = if op == Opcode::Sdiv && size == 8 {
                    Some(ctx.alloc_tmp(RegClass::I64, types::I64))
                } else {
                    None
                };
                ctx.emit(Inst::imm_r(true, 0, Writable::from_reg(regs::rdx())));
                ctx.emit(Inst::checked_div_or_rem_seq(
                    kind,
                    size,
                    divisor_copy,
                    tmp,
                    srcloc,
                ));
            } else {
                let divisor = input_to_reg_mem(ctx, inputs[1]);

                // Fill in the high parts:
                if input_ty == types::I8 {
                    if kind.is_signed() {
                        // Sign-extend the sign-bit of %al into %ah, for signed opcodes.
                        ctx.emit(Inst::sign_extend_data(1));
                    } else {
                        ctx.emit(Inst::movzx_rm_r(
                            ExtMode::BL,
                            RegMem::reg(regs::rax()),
                            Writable::from_reg(regs::rax()),
                            /* infallible */ None,
                        ));
                    }
                } else {
                    if kind.is_signed() {
                        // Sign-extend the sign-bit of %rax into %rdx, for signed opcodes.
                        ctx.emit(Inst::sign_extend_data(size));
                    } else {
                        // Zero %rdx, for unsigned opcodes.
                        ctx.emit(Inst::imm_r(
                            true, /* is_64 */
                            0,
                            Writable::from_reg(regs::rdx()),
                        ));
                    }
                }

                // Emit the actual idiv.
                ctx.emit(Inst::div(size, kind.is_signed(), divisor, ctx.srcloc(insn)));
            }

            // Move the result back into the destination reg.
            if is_div {
                // The quotient is in %rax.
                ctx.emit(Inst::gen_move(dst, regs::rax(), input_ty));
            } else {
                // The remainder is in %rdx.
                ctx.emit(Inst::gen_move(dst, regs::rdx(), input_ty));
            }
        }

        Opcode::Umulhi | Opcode::Smulhi => {
            let input_ty = ctx.input_ty(insn, 0);
            let size = input_ty.bytes() as u8;

            let lhs = input_to_reg(ctx, inputs[0]);
            let rhs = input_to_reg_mem(ctx, inputs[1]);
            let dst = output_to_reg(ctx, outputs[0]);

            // Move lhs into %rax.
            ctx.emit(Inst::gen_move(
                Writable::from_reg(regs::rax()),
                lhs,
                input_ty,
            ));

            // Emit the actual mul or imul.
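            // The one-operand forms of MUL/IMUL compute a double-width product into
            // %rdx:%rax; the high half we want ends up in %rdx.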
            let signed = op == Opcode::Smulhi;
            ctx.emit(Inst::mul_hi(size, signed, rhs));

            // Read the result from the high part (stored in %rdx).
            ctx.emit(Inst::gen_move(dst, regs::rdx(), input_ty));
        }

        Opcode::GetPinnedReg => {
            let dst = output_to_reg(ctx, outputs[0]);
            ctx.emit(Inst::gen_move(dst, regs::pinned_reg(), types::I64));
        }

        Opcode::SetPinnedReg => {
            let src = input_to_reg(ctx, inputs[0]);
            ctx.emit(Inst::gen_move(
                Writable::from_reg(regs::pinned_reg()),
                src,
                types::I64,
            ));
        }

        Opcode::Vconst => {
            let val = if let &InstructionData::UnaryConst {
                constant_handle, ..
            } = ctx.data(insn)
            {
                ctx.get_constant_data(constant_handle).clone().into_vec()
            } else {
                unreachable!("vconst should always have unary_const format")
            };
            let dst = output_to_reg(ctx, outputs[0]);
            let ty = ty.unwrap();
            ctx.emit(Inst::xmm_load_const_seq(val, dst, ty));
        }

        Opcode::RawBitcast => {
            // A raw_bitcast is just a mechanism for correcting the type of V128 values (see
            // https://github.com/bytecodealliance/wasmtime/issues/1147). As such, this IR
            // instruction should emit no machine code, but a move is necessary to give the
            // register allocator a definition for the output virtual register.
            let src = input_to_reg(ctx, inputs[0]);
            let dst = output_to_reg(ctx, outputs[0]);
            let ty = ty.unwrap();
            ctx.emit(Inst::gen_move(dst, src, ty));
        }

        Opcode::IaddImm
        | Opcode::ImulImm
        | Opcode::UdivImm
        | Opcode::SdivImm
        | Opcode::UremImm
        | Opcode::SremImm
        | Opcode::IrsubImm
        | Opcode::IaddCin
        | Opcode::IaddIfcin
        | Opcode::IaddCout
        | Opcode::IaddCarry
        | Opcode::IaddIfcarry
        | Opcode::IsubBin
        | Opcode::IsubIfbin
        | Opcode::IsubBout
        | Opcode::IsubIfbout
        | Opcode::IsubBorrow
        | Opcode::IsubIfborrow
        | Opcode::BandImm
        | Opcode::BorImm
        | Opcode::BxorImm
        | Opcode::RotlImm
        | Opcode::RotrImm
        | Opcode::IshlImm
        | Opcode::UshrImm
        | Opcode::SshrImm => {
            panic!("ALU+imm and ALU+carry ops should not appear here!");
        }

        _ => unimplemented!("unimplemented lowering for opcode {:?}", op),
    }

    Ok(())
}

//=============================================================================
// Lowering-backend trait implementation.

impl LowerBackend for X64Backend {
    type MInst = Inst;

    fn lower<C: LowerCtx<I = Inst>>(&self, ctx: &mut C, ir_inst: IRInst) -> CodegenResult<()> {
        lower_insn_to_regs(ctx, ir_inst, &self.flags, &self.triple)
    }

    fn lower_branch_group<C: LowerCtx<I = Inst>>(
        &self,
        ctx: &mut C,
        branches: &[IRInst],
        targets: &[MachLabel],
        fallthrough: Option<MachLabel>,
    ) -> CodegenResult<()> {
        // A block should end with at most two branches. The first may be a
        // conditional branch; a conditional branch can be followed only by an
        // unconditional branch or fallthrough. Otherwise, if there is only one
        // branch, it may be an unconditional branch, a fallthrough, a return,
        // or a trap. These conditions are verified by `is_ebb_basic()` during
        // the verifier pass.
        assert!(branches.len() <= 2);

        if branches.len() == 2 {
            // Must be a conditional branch followed by an unconditional branch.
            let op0 = ctx.data(branches[0]).opcode();
            let op1 = ctx.data(branches[1]).opcode();

            trace!(
                "lowering two-branch group: opcodes are {:?} and {:?}",
                op0,
                op1
            );
            assert!(op1 == Opcode::Jump || op1 == Opcode::Fallthrough);

            let taken = BranchTarget::Label(targets[0]);
            let not_taken = match op1 {
                Opcode::Jump => BranchTarget::Label(targets[1]),
                Opcode::Fallthrough => BranchTarget::Label(fallthrough.unwrap()),
                _ => unreachable!(), // Guarded by the assert above.
            };

            match op0 {
                Opcode::Brz | Opcode::Brnz => {
                    let flag_input = InsnInput {
                        insn: branches[0],
                        input: 0,
                    };

                    let src_ty = ctx.input_ty(branches[0], 0);

                    if let Some(icmp) = matches_input(ctx, flag_input, Opcode::Icmp) {
                        emit_cmp(ctx, icmp);

                        let cond_code = inst_condcode(ctx.data(icmp));
                        let cond_code = if op0 == Opcode::Brz {
                            cond_code.inverse()
                        } else {
                            cond_code
                        };

                        let cc = CC::from_intcc(cond_code);
                        ctx.emit(Inst::jmp_cond(cc, taken, not_taken));
                    } else if let Some(fcmp) = matches_input(ctx, flag_input, Opcode::Fcmp) {
                        let cond_code = inst_fp_condcode(ctx.data(fcmp));
                        let cond_code = if op0 == Opcode::Brz {
                            cond_code.inverse()
                        } else {
                            cond_code
                        };
                        match emit_fcmp(ctx, fcmp, cond_code, FcmpSpec::Normal) {
                            FcmpCondResult::Condition(cc) => {
                                ctx.emit(Inst::jmp_cond(cc, taken, not_taken));
                            }
                            FcmpCondResult::AndConditions(cc1, cc2) => {
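                                // Both conditions must hold for the branch to be taken,
                                // so bail out to `not_taken` as soon as either inverted
                                // condition fires.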
                                ctx.emit(Inst::jmp_if(cc1.invert(), not_taken));
                                ctx.emit(Inst::jmp_cond(cc2.invert(), not_taken, taken));
                            }
                            FcmpCondResult::OrConditions(cc1, cc2) => {
                                ctx.emit(Inst::jmp_if(cc1, taken));
                                ctx.emit(Inst::jmp_cond(cc2, taken, not_taken));
                            }
                            FcmpCondResult::InvertedEqualOrConditions(_, _) => unreachable!(),
                        }
                    } else if is_int_ty(src_ty) || is_bool_ty(src_ty) {
                        let src = input_to_reg(
                            ctx,
                            InsnInput {
                                insn: branches[0],
                                input: 0,
                            },
                        );
                        let cc = match op0 {
                            Opcode::Brz => CC::Z,
                            Opcode::Brnz => CC::NZ,
                            _ => unreachable!(),
                        };
                        let size_bytes = src_ty.bytes() as u8;
                        ctx.emit(Inst::cmp_rmi_r(size_bytes, RegMemImm::imm(0), src));
                        ctx.emit(Inst::jmp_cond(cc, taken, not_taken));
                    } else {
                        unimplemented!("brz/brnz with non-int type {:?}", src_ty);
                    }
                }

                Opcode::BrIcmp => {
                    let src_ty = ctx.input_ty(branches[0], 0);
                    if is_int_ty(src_ty) || is_bool_ty(src_ty) {
                        let lhs = input_to_reg(
                            ctx,
                            InsnInput {
                                insn: branches[0],
                                input: 0,
                            },
                        );
                        let rhs = input_to_reg_mem_imm(
                            ctx,
                            InsnInput {
                                insn: branches[0],
                                input: 1,
                            },
                        );
                        let cc = CC::from_intcc(inst_condcode(ctx.data(branches[0])));
                        let byte_size = src_ty.bytes() as u8;
                        // Cranelift's icmp semantics want to compare lhs - rhs, while Intel
                        // gives us dst - src at the machine instruction level, so invert
                        // operands.
                        ctx.emit(Inst::cmp_rmi_r(byte_size, rhs, lhs));
                        ctx.emit(Inst::jmp_cond(cc, taken, not_taken));
                    } else {
                        unimplemented!("bricmp with non-int type {:?}", src_ty);
                    }
                }

                _ => panic!("unexpected branch opcode: {:?}", op0),
            }
        } else {
            assert_eq!(branches.len(), 1);

            // Must be an unconditional branch or trap.
            let op = ctx.data(branches[0]).opcode();
            match op {
                Opcode::Jump | Opcode::Fallthrough => {
                    ctx.emit(Inst::jmp_known(BranchTarget::Label(targets[0])));
                }

                Opcode::BrTable => {
                    let jt_size = targets.len() - 1;
                    assert!(jt_size <= u32::max_value() as usize);
                    let jt_size = jt_size as u32;

                    let idx = extend_input_to_reg(
                        ctx,
                        InsnInput {
                            insn: branches[0],
                            input: 0,
                        },
                        ExtSpec::ZeroExtendTo32,
                    );

                    // Bounds-check (compute flags from idx - jt_size) and branch to default.
                    ctx.emit(Inst::cmp_rmi_r(4, RegMemImm::imm(jt_size), idx));

                    // Emit the compound instruction that does:
                    //
                    // lea $jt, %rA
                    // movsbl [%rA, %rIndex, 2], %rB
                    // add %rB, %rA
                    // j *%rA
                    // [jt entries]
                    //
                    // This must be *one* instruction in the vcode because we cannot allow
                    // regalloc to insert any spills/fills in the middle of the sequence;
                    // otherwise, the lea PC-rel offset to the jumptable would be incorrect.
                    // (The alternative is to introduce a relocation pass for inlined
                    // jumptables, which is much worse.)

                    // This temporary is used as a signed integer of 64 bits (to hold
                    // addresses).
                    let tmp1 = ctx.alloc_tmp(RegClass::I64, types::I64);
                    // This temporary is used as a signed integer of 32 bits (for the
                    // wasm-table index) and then 64 bits (address addend). The small lie
                    // about the I64 type is benign, since the temporary is dead after this
                    // instruction (and its Cranelift type is thus unused).
                    let tmp2 = ctx.alloc_tmp(RegClass::I64, types::I64);

                    let targets_for_term: Vec<MachLabel> = targets.to_vec();
                    let default_target = BranchTarget::Label(targets[0]);

                    let jt_targets: Vec<BranchTarget> = targets
                        .iter()
                        .skip(1)
                        .map(|bix| BranchTarget::Label(*bix))
                        .collect();

                    ctx.emit(Inst::JmpTableSeq {
                        idx,
                        tmp1,
                        tmp2,
                        default_target,
                        targets: jt_targets,
                        targets_for_term,
                    });
                }

                _ => panic!("Unknown branch type {:?}", op),
            }
        }

        Ok(())
    }

    fn maybe_pinned_reg(&self) -> Option<Reg> {
        Some(regs::pinned_reg())
    }
}