* x64: Enable load-coalescing for SSE/AVX instructions

  This commit unlocks the ability to fold loads into operands of SSE and AVX
  instructions. When it happens this shrinks function size and can also
  reduce register pressure. Previously this was not done because most SSE
  instructions require their memory operands to be aligned; AVX
  instructions, however, have no alignment requirements.

  The solution implemented here is one recommended by Chris: add a new
  `XmmMemAligned` newtype wrapper around `XmmMem` (a rough sketch of the
  newtype idea follows these notes). All SSE instructions are now annotated
  as requiring an `XmmMemAligned` operand except for a few new instruction
  styles used specifically for instructions that don't require alignment
  (e.g. `movdqu` and the `*sd` and `*ss` instructions). All existing
  instruction helpers continue to take `XmmMem`, however. This way, if an
  AVX lowering is chosen it can be used as-is. If an SSE lowering is chosen,
  an automatic conversion from `XmmMem` to `XmmMemAligned` kicks in. This
  automatic conversion only fails for unaligned addresses, in which case a
  load instruction is emitted and the operand becomes a temporary register
  instead. A number of prior `Xmm` arguments have now been converted to
  `XmmMem` as well.

  One change from this commit is that loading an unaligned operand for an
  SSE instruction previously used the "correct type" of load, e.g. `movups`
  for f32x4 or `movupd` for f64x2, but the load now happens in a context
  without type information, so a `movdqu` instruction is generated instead.
  According to [this Stack Overflow question][question] it looks like modern
  processors won't penalize this "wrong" choice of type when the operand is
  then used by f32- or f64-oriented instructions.

  Finally, this commit improves reuse of logic in the `put_in_*_mem*` helper
  to share code with `sinkable_load` and avoid duplication. With this in
  place, various ISLE rules have been updated as well. The tests show that
  AVX instructions are now automatically load-coalesced and use memory
  operands in a few cases.

  [question]: https://stackoverflow.com/questions/40854819/is-there-any-situation-where-using-movdqu-and-movupd-is-better-than-movups

* Fix tests

* Fix move-and-extend to be unaligned

  These, too, do not have the alignment requirements of other xmm
  instructions. Additionally, add some ISA tests to ensure that their output
  is tested.

* Review comments
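A rough, self-contained sketch of the newtype idea from the first commit
above (the stand-in types, fields, and helper below are simplified for
illustration and are not Cranelift's actual `XmmMem`/`XmmMemAligned`
definitions or ISLE glue):

    // Simplified stand-ins; the real operands wrap registers and `Amode`s.
    enum XmmMem {
        Reg(u8),
        Mem { known_16_byte_aligned: bool },
    }

    struct XmmMemAligned(XmmMem);

    impl XmmMemAligned {
        // SSE lowerings go through this conversion. It succeeds for
        // registers and memory operands known to be 16-byte aligned; when
        // it fails, the lowering emits an unaligned load (`movdqu`) into a
        // temporary register and uses that register as the operand. AVX
        // lowerings keep using `XmmMem` directly and skip all of this.
        fn try_from_op(op: XmmMem) -> Option<XmmMemAligned> {
            match op {
                XmmMem::Reg(_) => Some(XmmMemAligned(op)),
                XmmMem::Mem { known_16_byte_aligned: true } => Some(XmmMemAligned(op)),
                XmmMem::Mem { .. } => None,
            }
        }
    }
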
//! Lowering rules for X64.

// ISLE integration glue.
pub(super) mod isle;

use crate::ir::{types, ExternalName, Inst as IRInst, LibCall, Opcode, Type};
use crate::isa::x64::abi::*;
use crate::isa::x64::inst::args::*;
use crate::isa::x64::inst::*;
use crate::isa::{x64::X64Backend, CallConv};
use crate::machinst::abi::SmallInstVec;
use crate::machinst::lower::*;
use crate::machinst::*;
use crate::result::CodegenResult;
use crate::settings::Flags;
use smallvec::smallvec;
use target_lexicon::Triple;

//=============================================================================
// Helpers for instruction lowering.

fn is_int_or_ref_ty(ty: Type) -> bool {
    match ty {
        types::I8 | types::I16 | types::I32 | types::I64 | types::R64 => true,
        types::R32 => panic!("shouldn't have 32-bit refs on x64"),
        _ => false,
    }
}

/// Returns the instruction that produces the given `input`, if that
/// instruction's opcode is `op`.
// TODO investigate failures with checking against the result index.
fn matches_input(ctx: &mut Lower<Inst>, input: InsnInput, op: Opcode) -> Option<IRInst> {
    let inputs = ctx.get_input_as_source_or_const(input.insn, input.input);
    inputs.inst.as_inst().and_then(|(src_inst, _)| {
        let data = ctx.data(src_inst);
        if data.opcode() == op {
            return Some(src_inst);
        }
        None
    })
}

/// Put the given input into possibly multiple registers, and mark it as used (side-effect).
fn put_input_in_regs(ctx: &mut Lower<Inst>, spec: InsnInput) -> ValueRegs<Reg> {
    let ty = ctx.input_ty(spec.insn, spec.input);
    let input = ctx.get_input_as_source_or_const(spec.insn, spec.input);

    if let Some(c) = input.constant {
        // Generate constants fresh at each use to minimize long-range register pressure.
        let from_bits = ty_bits(ty);
        let (size, c) = if from_bits < 64 {
            (OperandSize::Size32, c & ((1u64 << from_bits) - 1))
        } else {
            (OperandSize::Size64, c)
        };
        assert!(is_int_or_ref_ty(ty)); // Only used for addresses.
        let cst_copy = ctx.alloc_tmp(ty);
        ctx.emit(Inst::imm(size, c, cst_copy.only_reg().unwrap()));
        non_writable_value_regs(cst_copy)
    } else {
        ctx.put_input_in_regs(spec.insn, spec.input)
    }
}

/// Put the given input into a register, and mark it as used (side-effect).
fn put_input_in_reg(ctx: &mut Lower<Inst>, spec: InsnInput) -> Reg {
    put_input_in_regs(ctx, spec)
        .only_reg()
        .expect("Multi-register value not expected")
}

/// Determines whether a load operation (indicated by `src_insn`) can be merged
/// into the current lowering point. If so, returns the address-base source (as
/// an `InsnInput`) and an offset from that address from which to perform the
/// load.
fn is_mergeable_load(ctx: &mut Lower<Inst>, src_insn: IRInst) -> Option<(InsnInput, i32)> {
    let insn_data = ctx.data(src_insn);
    let inputs = ctx.num_inputs(src_insn);
    if inputs != 1 {
        return None;
    }

    let load_ty = ctx.output_ty(src_insn, 0);
    if ty_bits(load_ty) < 32 {
        // Narrower values are handled by ALU insts that are at least 32 bits
        // wide, which is normally OK as we ignore upper bits; but, if we
        // generate, e.g., a direct-from-memory 32-bit add for a byte value and
        // the byte is the last byte in a page, the extra data that we load is
        // incorrectly accessed. So we only allow loads to merge for
        // 32-bit-and-above widths.
        return None;
    }

    // Just testing the opcode is enough, because the width will always match if
    // the type does (and the type should match if the CLIF is properly
    // constructed).
    if insn_data.opcode() == Opcode::Load {
        let offset = insn_data
            .load_store_offset()
            .expect("load should have offset");
        Some((
            InsnInput {
                insn: src_insn,
                input: 0,
            },
            offset,
        ))
    } else {
        None
    }
}

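// Illustrative only: a hypothetical helper (not part of the original file)
// showing how `is_mergeable_load` composes with `lower_to_amode` below when a
// load is folded into another instruction's memory operand. A real lowering
// must additionally sink the merged load so it is not also emitted on its own;
// that bookkeeping is omitted here.
#[allow(dead_code)]
fn example_amode_for_mergeable_load(ctx: &mut Lower<Inst>, load: IRInst) -> Option<Amode> {
    let (addr_input, offset) = is_mergeable_load(ctx, load)?;
    Some(lower_to_amode(ctx, addr_input, offset))
}
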
/// Returns the constant value of the given input, if it is a constant.
fn input_to_imm(ctx: &mut Lower<Inst>, spec: InsnInput) -> Option<u64> {
    ctx.get_input_as_source_or_const(spec.insn, spec.input)
        .constant
}

/// Emits a call to the runtime library function `libcall`, passing the given
/// `inputs` and writing results into `outputs`.
fn emit_vm_call(
    ctx: &mut Lower<Inst>,
    flags: &Flags,
    triple: &Triple,
    libcall: LibCall,
    inputs: &[Reg],
    outputs: &[Writable<Reg>],
) -> CodegenResult<()> {
    let extname = ExternalName::LibCall(libcall);

    let dist = if flags.use_colocated_libcalls() {
        RelocDistance::Near
    } else {
        RelocDistance::Far
    };

    // TODO avoid recreating signatures for every single Libcall function.
    let call_conv = CallConv::for_libcall(flags, CallConv::triple_default(triple));
    let sig = libcall.signature(call_conv);
    let caller_conv = ctx.abi().call_conv(ctx.sigs());

    if !ctx.sigs().have_abi_sig_for_signature(&sig) {
        ctx.sigs_mut()
            .make_abi_sig_from_ir_signature::<X64ABIMachineSpec>(sig.clone(), flags)?;
    }

    let mut abi =
        X64Caller::from_libcall(ctx.sigs(), &sig, &extname, dist, caller_conv, flags.clone())?;

    abi.emit_stack_pre_adjust(ctx);

    assert_eq!(inputs.len(), abi.num_args(ctx.sigs()));

    for (i, input) in inputs.iter().enumerate() {
        for inst in abi.gen_arg(ctx, i, ValueRegs::one(*input)) {
            ctx.emit(inst);
        }
    }

    let mut retval_insts: SmallInstVec<_> = smallvec![];
    for (i, output) in outputs.iter().enumerate() {
        retval_insts.extend(abi.gen_retval(ctx, i, ValueRegs::one(*output)).into_iter());
    }
    abi.emit_call(ctx);
    for inst in retval_insts {
        ctx.emit(inst);
    }
    abi.emit_stack_post_adjust(ctx);

    Ok(())
}

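// Illustrative only: a typical caller of `emit_vm_call` is a lowering that has
// no suitable native instruction available and routes the operation to a
// runtime library function instead. A sketch of such a call site (the
// particular libcall and the surrounding setup, including `src`, are
// hypothetical):
//
//     let dst = ctx.alloc_tmp(types::F64).only_reg().unwrap();
//     emit_vm_call(ctx, flags, triple, LibCall::CeilF64, &[src], &[dst])?;
//     // `dst` now holds the result returned by the libcall.
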
/// Returns the shifted input and the shift amount if the given input is a
/// shift by a constant value less than or equal to 3. The goal is to embed it
/// within an address mode.
fn matches_small_constant_shift(ctx: &mut Lower<Inst>, spec: InsnInput) -> Option<(InsnInput, u8)> {
    matches_input(ctx, spec, Opcode::Ishl).and_then(|shift| {
        match input_to_imm(
            ctx,
            InsnInput {
                insn: shift,
                input: 1,
            },
        ) {
            Some(shift_amt) if shift_amt <= 3 => Some((
                InsnInput {
                    insn: shift,
                    input: 0,
                },
                shift_amt as u8,
            )),
            _ => None,
        }
    })
}

/// Lowers an instruction to one of the x86 addressing modes.
///
/// Note: the 32-bit offset in Cranelift has to be sign-extended, which matches x86's behavior.
fn lower_to_amode(ctx: &mut Lower<Inst>, spec: InsnInput, offset: i32) -> Amode {
    let flags = ctx
        .memflags(spec.insn)
        .expect("Instruction with amode should have memflags");

    // We now either have an add that we must materialize, or some other input; as well as the
    // final offset.
    if let Some(add) = matches_input(ctx, spec, Opcode::Iadd) {
        debug_assert_eq!(ctx.output_ty(add, 0), types::I64);
        let add_inputs = &[
            InsnInput {
                insn: add,
                input: 0,
            },
            InsnInput {
                insn: add,
                input: 1,
            },
        ];

        // TODO heap_addr legalization generates a uext64 *after* the shift, so these optimizations
        // aren't happening in the wasm case. We could do better, given some range analysis.
        let (base, index, shift) = if let Some((shift_input, shift_amt)) =
            matches_small_constant_shift(ctx, add_inputs[0])
        {
            (
                put_input_in_reg(ctx, add_inputs[1]),
                put_input_in_reg(ctx, shift_input),
                shift_amt,
            )
        } else if let Some((shift_input, shift_amt)) =
            matches_small_constant_shift(ctx, add_inputs[1])
        {
            (
                put_input_in_reg(ctx, add_inputs[0]),
                put_input_in_reg(ctx, shift_input),
                shift_amt,
            )
        } else {
            for i in 0..=1 {
                // Try to pierce through uextend.
                if let Some(uextend) = matches_input(
                    ctx,
                    InsnInput {
                        insn: add,
                        input: i,
                    },
                    Opcode::Uextend,
                ) {
                    if let Some(cst) = ctx.get_input_as_source_or_const(uextend, 0).constant {
                        // Zero the upper bits.
                        let input_size = ctx.input_ty(uextend, 0).bits() as u64;
                        let shift: u64 = 64 - input_size;
                        let uext_cst: u64 = (cst << shift) >> shift;

                        let final_offset = (offset as i64).wrapping_add(uext_cst as i64);
                        if low32_will_sign_extend_to_64(final_offset as u64) {
                            let base = put_input_in_reg(ctx, add_inputs[1 - i]);
                            return Amode::imm_reg(final_offset as u32, base).with_flags(flags);
                        }
                    }
                }

                // If it's a constant, add it directly!
                if let Some(cst) = ctx.get_input_as_source_or_const(add, i).constant {
                    let final_offset = (offset as i64).wrapping_add(cst as i64);
                    if low32_will_sign_extend_to_64(final_offset as u64) {
                        let base = put_input_in_reg(ctx, add_inputs[1 - i]);
                        return Amode::imm_reg(final_offset as u32, base).with_flags(flags);
                    }
                }
            }

            (
                put_input_in_reg(ctx, add_inputs[0]),
                put_input_in_reg(ctx, add_inputs[1]),
                0,
            )
        };

        return Amode::imm_reg_reg_shift(
            offset as u32,
            Gpr::new(base).unwrap(),
            Gpr::new(index).unwrap(),
            shift,
        )
        .with_flags(flags);
    }

    let input = put_input_in_reg(ctx, spec);
    Amode::imm_reg(offset as u32, input).with_flags(flags)
}

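// Worked example (illustrative, based on the cases handled above): for CLIF
// such as
//
//     v2 = iconst.i64 3
//     v3 = ishl v1, v2
//     v4 = iadd v0, v3
//     v5 = load.i64 v4+16
//
// the `iadd` is matched, `matches_small_constant_shift` recognizes the shift
// by 3, and the load's address lowers to the single addressing mode
// `Amode::imm_reg_reg_shift(16, v0, v1, 3)`, i.e. `[v0 + v1*8 + 16]`, rather
// than materializing the add and shift as separate instructions. (The value
// names here stand for whatever registers those values are assigned.)
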
//=============================================================================
// Lowering-backend trait implementation.

impl LowerBackend for X64Backend {
    type MInst = Inst;

    fn lower(&self, ctx: &mut Lower<Inst>, ir_inst: IRInst) -> Option<InstOutput> {
        isle::lower(ctx, self, ir_inst)
    }

    fn lower_branch(
        &self,
        ctx: &mut Lower<Inst>,
        ir_inst: IRInst,
        targets: &[MachLabel],
    ) -> Option<()> {
        isle::lower_branch(ctx, self, ir_inst, targets)
    }

    fn maybe_pinned_reg(&self) -> Option<Reg> {
        Some(regs::pinned_reg())
    }
}