Merge pull request #2389 from cfallin/x64-load-op
x64 backend: merge loads into ALU ops when appropriate.
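In brief: the x64 lowering previously always forced ALU operands into registers (the old `input_to_reg_mem` below carried a TODO to that effect). With this change, `input_to_reg_mem` recognizes a 32-bit-or-wider load whose only use is the current instruction, sinks it (`ctx.sink_inst`), and returns a memory operand instead, so e.g. `v2 = load.i32 v0; v3 = iadd.i32 v2, v1` lowers to a single `addl 0(%rdi), %r12d` rather than a separate load followed by an add (see the new `load-op.clif` tests at the end). A companion change in `lower_insn_to_regs` commutes the operands of commutative ALU ops so a load can merge from either side.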
cranelift/codegen/src/isa/x64/lower.rs
@@ -22,9 +22,6 @@ use smallvec::SmallVec;
 use std::convert::TryFrom;
 use target_lexicon::Triple;
 
-/// Context passed to all lowering functions.
-type Ctx<'a> = &'a mut dyn LowerCtx<I = Inst>;
-
 //=============================================================================
 // Helpers for instruction lowering.
 
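The removal above is mechanical: every helper in this file moves from the `Ctx` trait-object alias to a generic `C: LowerCtx<I = Inst>` parameter (visible throughout the hunks below), leaving the alias unused. A minimal sketch of the shape of that change, with hypothetical helpers and assuming this file's imports:

    // Before: dynamic dispatch through a `&mut dyn` trait object.
    fn num_inputs_dyn(ctx: &mut dyn LowerCtx<I = Inst>, insn: IRInst) -> usize {
        ctx.num_inputs(insn)
    }

    // After: static dispatch via a generic parameter, consistent with helpers
    // like `matches_input_any` that were already generic.
    fn num_inputs_generic<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRInst) -> usize {
        ctx.num_inputs(insn)
    }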
@@ -89,34 +86,106 @@ fn matches_input_any<C: LowerCtx<I = Inst>>(
     })
 }
 
+/// Emits instruction(s) to generate the given 64-bit constant value into a newly-allocated
+/// temporary register, returning that register.
+fn generate_constant<C: LowerCtx<I = Inst>>(ctx: &mut C, ty: Type, c: u64) -> Reg {
+    let from_bits = ty_bits(ty);
+    let masked = if from_bits < 64 {
+        c & ((1u64 << from_bits) - 1)
+    } else {
+        c
+    };
+
+    let cst_copy = ctx.alloc_tmp(Inst::rc_for_type(ty).unwrap(), ty);
+    for inst in Inst::gen_constant(cst_copy, masked, ty, |reg_class, ty| {
+        ctx.alloc_tmp(reg_class, ty)
+    })
+    .into_iter()
+    {
+        ctx.emit(inst);
+    }
+    cst_copy.to_reg()
+}
+
 /// Put the given input into a register, and mark it as used (side-effect).
-fn put_input_in_reg(ctx: Ctx, spec: InsnInput) -> Reg {
+fn put_input_in_reg<C: LowerCtx<I = Inst>>(ctx: &mut C, spec: InsnInput) -> Reg {
+    let ty = ctx.input_ty(spec.insn, spec.input);
     let input = ctx.get_input_as_source_or_const(spec.insn, spec.input);
 
     if let Some(c) = input.constant {
         // Generate constants fresh at each use to minimize long-range register pressure.
-        let ty = ctx.input_ty(spec.insn, spec.input);
-        let from_bits = ty_bits(ty);
-        let masked = if from_bits < 64 {
-            c & ((1u64 << from_bits) - 1)
-        } else {
-            c
-        };
-
-        let cst_copy = ctx.alloc_tmp(Inst::rc_for_type(ty).unwrap(), ty);
-        for inst in Inst::gen_constant(cst_copy, masked, ty, |reg_class, ty| {
-            ctx.alloc_tmp(reg_class, ty)
-        })
-        .into_iter()
-        {
-            ctx.emit(inst);
-        }
-        cst_copy.to_reg()
+        generate_constant(ctx, ty, c)
     } else {
         ctx.put_input_in_reg(spec.insn, spec.input)
     }
 }
 
+/// Determines whether a load operation (indicated by `src_insn`) can be merged
+/// into the current lowering point. If so, returns the address-base source (as
+/// an `InsnInput`) and an offset from that address from which to perform the
+/// load.
+fn is_mergeable_load<C: LowerCtx<I = Inst>>(
+    ctx: &mut C,
+    src_insn: IRInst,
+) -> Option<(InsnInput, i32)> {
+    let insn_data = ctx.data(src_insn);
+    let inputs = ctx.num_inputs(src_insn);
+    if inputs != 1 {
+        return None;
+    }
+
+    let load_ty = ctx.output_ty(src_insn, 0);
+    if ty_bits(load_ty) < 32 {
+        // Narrower values are handled by ALU insts that are at least 32 bits
+        // wide, which is normally OK as we ignore upper bits; but, if we
+        // generate, e.g., a direct-from-memory 32-bit add for a byte value and
+        // the byte is the last byte in a page, the extra bytes that we load
+        // would be accessed incorrectly. So we only allow loads to merge for
+        // 32-bit-and-above widths.
+        return None;
+    }
+
+    // Just testing the opcode is enough, because the width will always match if
+    // the type does (and the type should match if the CLIF is properly
+    // constructed).
+    if insn_data.opcode() == Opcode::Load {
+        let offset = insn_data
+            .load_store_offset()
+            .expect("load should have offset");
+        Some((
+            InsnInput {
+                insn: src_insn,
+                input: 0,
+            },
+            offset,
+        ))
+    } else {
+        None
+    }
+}
+
+/// Put the given input into a register or a memory operand.
+/// Effectful: may mark the given input as used, when returning the register form.
+fn input_to_reg_mem<C: LowerCtx<I = Inst>>(ctx: &mut C, spec: InsnInput) -> RegMem {
+    let inputs = ctx.get_input_as_source_or_const(spec.insn, spec.input);
+
+    if let Some(c) = inputs.constant {
+        // Generate constants fresh at each use to minimize long-range register pressure.
+        let ty = ctx.input_ty(spec.insn, spec.input);
+        return RegMem::reg(generate_constant(ctx, ty, c));
+    }
+
+    if let Some((src_insn, 0)) = inputs.inst {
+        if let Some((addr_input, offset)) = is_mergeable_load(ctx, src_insn) {
+            ctx.sink_inst(src_insn);
+            let amode = lower_to_amode(ctx, addr_input, offset);
+            return RegMem::mem(amode);
+        }
+    }
+
+    RegMem::reg(ctx.put_input_in_reg(spec.insn, spec.input))
+}
+
 /// An extension specification for `extend_input_to_reg`.
 #[derive(Clone, Copy)]
 enum ExtSpec {
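For orientation (a sketch under assumptions, not code from this patch): lowerings do not call `is_mergeable_load` directly; they call `input_to_reg_mem` or `input_to_reg_mem_imm`, and the load-sinking happens transparently inside. Note that `sink_inst` is only legal because `inputs.inst` reports a source instruction only when merging is safe (in particular, a single use with no intervening side effects). Assuming this file's `get_output_reg`, `Inst::gen_move`, and `Inst::alu_rmi_r` constructors, and `inputs`/`outputs` slices as in `lower_insn_to_regs`, a 64-bit add consumer looks roughly like:

    // Sketch: lowering `v3 = iadd.i64 v1, v2` where `v2` is a single-use load.
    let lhs = put_input_in_reg(ctx, inputs[0]);
    // If inputs[1] is a mergeable load, this sinks it and returns a Mem operand;
    // otherwise a register or a sign-extended immediate.
    let rhs = input_to_reg_mem_imm(ctx, inputs[1]);
    let dst = get_output_reg(ctx, outputs[0]);
    // x64 ALU ops are two-address: copy LHS into dst, then `add dst, rhs`.
    ctx.emit(Inst::gen_move(dst, lhs, types::I64));
    ctx.emit(Inst::alu_rmi_r(true /* is_64 */, AluRmiROpcode::Add, rhs, dst));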
@@ -128,7 +197,11 @@ enum ExtSpec {
 
 /// Put the given input into a register, marking it as used, and do a zero- or signed- extension if
 /// required. (This obviously causes side-effects.)
-fn extend_input_to_reg(ctx: Ctx, spec: InsnInput, ext_spec: ExtSpec) -> Reg {
+fn extend_input_to_reg<C: LowerCtx<I = Inst>>(
+    ctx: &mut C,
+    spec: InsnInput,
+    ext_spec: ExtSpec,
+) -> Reg {
     let requested_size = match ext_spec {
         ExtSpec::ZeroExtendTo32 | ExtSpec::SignExtendTo32 => 32,
         ExtSpec::ZeroExtendTo64 | ExtSpec::SignExtendTo64 => 64,
@@ -160,13 +233,6 @@ fn extend_input_to_reg(ctx: Ctx, spec: InsnInput, ext_spec: ExtSpec) -> Reg {
     dst.to_reg()
 }
 
-/// Put the given input into a register or a memory operand.
-/// Effectful: may mark the given input as used, when returning the register form.
-fn input_to_reg_mem(ctx: Ctx, spec: InsnInput) -> RegMem {
-    // TODO handle memory; merge a load directly, if possible.
-    RegMem::reg(ctx.put_input_in_reg(spec.insn, spec.input))
-}
-
 /// Returns whether the given input is an immediate that can be properly sign-extended, without any
 /// possible side-effect.
 fn non_reg_input_to_sext_imm(input: NonRegInput, input_ty: Type) -> Option<u32> {
@@ -182,20 +248,20 @@ fn non_reg_input_to_sext_imm(input: NonRegInput, input_ty: Type) -> Option<u32>
     })
 }
 
-fn input_to_sext_imm(ctx: Ctx, spec: InsnInput) -> Option<u32> {
+fn input_to_sext_imm<C: LowerCtx<I = Inst>>(ctx: &mut C, spec: InsnInput) -> Option<u32> {
     let input = ctx.get_input_as_source_or_const(spec.insn, spec.input);
     let input_ty = ctx.input_ty(spec.insn, spec.input);
     non_reg_input_to_sext_imm(input, input_ty)
 }
 
-fn input_to_imm(ctx: Ctx, spec: InsnInput) -> Option<u64> {
+fn input_to_imm<C: LowerCtx<I = Inst>>(ctx: &mut C, spec: InsnInput) -> Option<u64> {
     ctx.get_input_as_source_or_const(spec.insn, spec.input)
         .constant
 }
 
 /// Put the given input into an immediate, a register or a memory operand.
 /// Effectful: may mark the given input as used, when returning the register form.
-fn input_to_reg_mem_imm(ctx: Ctx, spec: InsnInput) -> RegMemImm {
+fn input_to_reg_mem_imm<C: LowerCtx<I = Inst>>(ctx: &mut C, spec: InsnInput) -> RegMemImm {
     let input = ctx.get_input_as_source_or_const(spec.insn, spec.input);
     let input_ty = ctx.input_ty(spec.insn, spec.input);
     match non_reg_input_to_sext_imm(input, input_ty) {
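(The match arms are cut off by the hunk, but the order of preference stands: a constant that sign-extends becomes an immediate via `non_reg_input_to_sext_imm`; otherwise the operand presumably falls through to the new `input_to_reg_mem`, so a mergeable load still wins over a plain register.)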
@@ -305,7 +371,7 @@ fn emit_extract_lane<C: LowerCtx<I = Inst>>(
 ///
 /// Note: make sure that there are no instructions modifying the flags between a call to this
 /// function and the use of the flags!
-fn emit_cmp(ctx: Ctx, insn: IRInst) {
+fn emit_cmp<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRInst) {
     let ty = ctx.input_ty(insn, 0);
 
     let inputs = [InsnInput { insn, input: 0 }, InsnInput { insn, input: 1 }];
@@ -355,7 +421,12 @@ enum FcmpCondResult {
 ///
 /// Note: make sure that there are no instructions modifying the flags between a call to this
 /// function and the use of the flags!
-fn emit_fcmp(ctx: Ctx, insn: IRInst, mut cond_code: FloatCC, spec: FcmpSpec) -> FcmpCondResult {
+fn emit_fcmp<C: LowerCtx<I = Inst>>(
+    ctx: &mut C,
+    insn: IRInst,
+    mut cond_code: FloatCC,
+    spec: FcmpSpec,
+) -> FcmpCondResult {
     let (flip_operands, inverted_equal) = match cond_code {
         FloatCC::LessThan
         | FloatCC::LessThanOrEqual
@@ -407,7 +478,12 @@ fn emit_fcmp(ctx: Ctx, insn: IRInst, mut cond_code: FloatCC, spec: FcmpSpec) ->
     cond_result
 }
 
-fn make_libcall_sig(ctx: Ctx, insn: IRInst, call_conv: CallConv, ptr_ty: Type) -> Signature {
+fn make_libcall_sig<C: LowerCtx<I = Inst>>(
+    ctx: &mut C,
+    insn: IRInst,
+    call_conv: CallConv,
+    ptr_ty: Type,
+) -> Signature {
     let mut sig = Signature::new(call_conv);
     for i in 0..ctx.num_inputs(insn) {
         sig.params.push(AbiParam::new(ctx.input_ty(insn, i)));
@@ -827,14 +903,16 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
         | Opcode::Bor
        | Opcode::Bxor => {
             // For commutative operations, try to commute operands if one is an
-            // immediate.
-            if let Some(imm) = input_to_sext_imm(ctx, inputs[0]) {
-                (put_input_in_reg(ctx, inputs[1]), RegMemImm::imm(imm))
+            // immediate or direct memory reference. Do so by converting LHS to RMI:
+            // if it stays a reg, convert RHS to RMI as well; otherwise, use LHS as
+            // the RMI operand and put RHS in a reg.
+            let lhs = input_to_reg_mem_imm(ctx, inputs[0]);
+            if let RegMemImm::Reg { reg: lhs_reg } = lhs {
+                let rhs = input_to_reg_mem_imm(ctx, inputs[1]);
+                (lhs_reg, rhs)
             } else {
-                (
-                    put_input_in_reg(ctx, inputs[0]),
-                    input_to_reg_mem_imm(ctx, inputs[1]),
-                )
+                let rhs_reg = put_input_in_reg(ctx, inputs[1]);
+                (rhs_reg, lhs)
             }
         }
         Opcode::Isub => (
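Worked through against the tests below: for `v3 = iadd.i32 v2, v1` with `v2` a single-use load, LHS converts to an RMI memory operand, so the `else` arm commutes and yields `(v1's register, mem)`; for `v3 = iadd.i32 v1, v2`, LHS stays a register and RHS converts to the same memory operand. Either way the result is one `addl 0(%rdi), %r12d`. `Isub`, immediately below, is not commutative and keeps the fixed register/RMI order.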

cranelift/filetests/filetests/isa/x64/load-op.clif (new file, 46 lines)
@@ -0,0 +1,46 @@
+test compile
+target x86_64
+feature "experimental_x64"
+
+function %add_from_mem_u32_1(i64, i32) -> i32 {
+block0(v0: i64, v1: i32):
+  v2 = load.i32 v0
+  v3 = iadd.i32 v2, v1
+  ; check: addl 0(%rdi), %r12d
+  return v3
+}
+
+function %add_from_mem_u32_2(i64, i32) -> i32 {
+block0(v0: i64, v1: i32):
+  v2 = load.i32 v0
+  v3 = iadd.i32 v1, v2
+  ; check: addl 0(%rdi), %r12d
+  return v3
+}
+
+function %add_from_mem_u64_1(i64, i64) -> i64 {
+block0(v0: i64, v1: i64):
+  v2 = load.i64 v0
+  v3 = iadd.i64 v2, v1
+  ; check: addq 0(%rdi), %r12
+  return v3
+}
+
+function %add_from_mem_u64_2(i64, i64) -> i64 {
+block0(v0: i64, v1: i64):
+  v2 = load.i64 v0
+  v3 = iadd.i64 v1, v2
+  ; check: addq 0(%rdi), %r12
+  return v3
+}
+
+; test narrow loads: 8-bit load should not merge because the `addl` is 32 bits
+; and would load 32 bits from memory, which may go beyond the end of the heap.
+function %add_from_mem_not_narrow(i64, i8) -> i8 {
+block0(v0: i64, v1: i8):
+  v2 = load.i8 v0
+  v3 = iadd.i8 v2, v1
+  ; check: movzbq 0(%rdi), %r12
+  ; nextln: addl %esi, %r12d
+  return v3
+}
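The last test pins down the width restriction in `is_mergeable_load`: merging the `load.i8` into the 32-bit `addl` would turn a 1-byte read into a 4-byte read. As a concrete illustration (not part of the test), if the byte sits at the last address of a mapped 4 KiB page, say page offset 0xfff, a 4-byte read covers offsets 0xfff through 0x1002 and crosses into the next, possibly unmapped, page. Hence the byte is loaded and extended on its own (`movzbq`) and only a register-register `addl` follows.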