Merge pull request #2389 from cfallin/x64-load-op
x64 backend: merge loads into ALU ops when appropriate.
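In brief: the x64 lowering previously always forced ALU operands into registers (the old `input_to_reg_mem` below carried a TODO to that effect). With this change, `input_to_reg_mem` recognizes a 32-bit-or-wider load whose only use is the current instruction, sinks it (`ctx.sink_inst`), and returns a memory operand instead, so e.g. `v2 = load.i32 v0; v3 = iadd.i32 v2, v1` lowers to a single `addl 0(%rdi), %r12d` rather than a separate load followed by an add (see the new `load-op.clif` tests at the end). A companion change in `lower_insn_to_regs` commutes the operands of commutative ALU ops so a load can merge from either side.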
cranelift/codegen/src/isa/x64/lower.rs
@@ -22,9 +22,6 @@ use smallvec::SmallVec;
 use std::convert::TryFrom;
 use target_lexicon::Triple;
 
-/// Context passed to all lowering functions.
-type Ctx<'a> = &'a mut dyn LowerCtx<I = Inst>;
-
 //=============================================================================
 // Helpers for instruction lowering.
 
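The removal above is mechanical: every helper in this file moves from the `Ctx` trait-object alias to a generic `C: LowerCtx<I = Inst>` parameter (visible throughout the hunks below), leaving the alias unused. A minimal sketch of the shape of that change, with hypothetical helpers and assuming this file's imports:

    // Before: dynamic dispatch through a `&mut dyn` trait object.
    fn num_inputs_dyn(ctx: &mut dyn LowerCtx<I = Inst>, insn: IRInst) -> usize {
        ctx.num_inputs(insn)
    }

    // After: static dispatch via a generic parameter, consistent with helpers
    // like `matches_input_any` that were already generic.
    fn num_inputs_generic<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRInst) -> usize {
        ctx.num_inputs(insn)
    }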
@@ -89,34 +86,106 @@ fn matches_input_any<C: LowerCtx<I = Inst>>(
     })
 }
 
+/// Emits instruction(s) to generate the given 64-bit constant value into a newly-allocated
+/// temporary register, returning that register.
+fn generate_constant<C: LowerCtx<I = Inst>>(ctx: &mut C, ty: Type, c: u64) -> Reg {
+    let from_bits = ty_bits(ty);
+    let masked = if from_bits < 64 {
+        c & ((1u64 << from_bits) - 1)
+    } else {
+        c
+    };
+
+    let cst_copy = ctx.alloc_tmp(Inst::rc_for_type(ty).unwrap(), ty);
+    for inst in Inst::gen_constant(cst_copy, masked, ty, |reg_class, ty| {
+        ctx.alloc_tmp(reg_class, ty)
+    })
+    .into_iter()
+    {
+        ctx.emit(inst);
+    }
+    cst_copy.to_reg()
+}
+
 /// Put the given input into a register, and mark it as used (side-effect).
-fn put_input_in_reg(ctx: Ctx, spec: InsnInput) -> Reg {
+fn put_input_in_reg<C: LowerCtx<I = Inst>>(ctx: &mut C, spec: InsnInput) -> Reg {
+    let ty = ctx.input_ty(spec.insn, spec.input);
     let input = ctx.get_input_as_source_or_const(spec.insn, spec.input);
 
     if let Some(c) = input.constant {
         // Generate constants fresh at each use to minimize long-range register pressure.
-        let ty = ctx.input_ty(spec.insn, spec.input);
-        let from_bits = ty_bits(ty);
-        let masked = if from_bits < 64 {
-            c & ((1u64 << from_bits) - 1)
-        } else {
-            c
-        };
-
-        let cst_copy = ctx.alloc_tmp(Inst::rc_for_type(ty).unwrap(), ty);
-        for inst in Inst::gen_constant(cst_copy, masked, ty, |reg_class, ty| {
-            ctx.alloc_tmp(reg_class, ty)
-        })
-        .into_iter()
-        {
-            ctx.emit(inst);
-        }
-        cst_copy.to_reg()
+        generate_constant(ctx, ty, c)
     } else {
         ctx.put_input_in_reg(spec.insn, spec.input)
     }
 }
 
+/// Determines whether a load operation (indicated by `src_insn`) can be merged
+/// into the current lowering point. If so, returns the address-base source (as
+/// an `InsnInput`) and an offset from that address from which to perform the
+/// load.
+fn is_mergeable_load<C: LowerCtx<I = Inst>>(
+    ctx: &mut C,
+    src_insn: IRInst,
+) -> Option<(InsnInput, i32)> {
+    let insn_data = ctx.data(src_insn);
+    let inputs = ctx.num_inputs(src_insn);
+    if inputs != 1 {
+        return None;
+    }
+
+    let load_ty = ctx.output_ty(src_insn, 0);
+    if ty_bits(load_ty) < 32 {
+        // Narrower values are handled by ALU insts that are at least 32 bits
+        // wide, which is normally OK as we ignore upper bits; but, if we
+        // generate, e.g., a direct-from-memory 32-bit add for a byte value and
+        // the byte is the last byte in a page, the extra bytes that we load
+        // would be accessed incorrectly. So we only allow loads to merge for
+        // 32-bit-and-above widths.
+        return None;
+    }
+
+    // Just testing the opcode is enough, because the width will always match if
+    // the type does (and the type should match if the CLIF is properly
+    // constructed).
+    if insn_data.opcode() == Opcode::Load {
+        let offset = insn_data
+            .load_store_offset()
+            .expect("load should have offset");
+        Some((
+            InsnInput {
+                insn: src_insn,
+                input: 0,
+            },
+            offset,
+        ))
+    } else {
+        None
+    }
+}
+
+/// Put the given input into a register or a memory operand.
+/// Effectful: may mark the given input as used, when returning the register form.
+fn input_to_reg_mem<C: LowerCtx<I = Inst>>(ctx: &mut C, spec: InsnInput) -> RegMem {
+    let inputs = ctx.get_input_as_source_or_const(spec.insn, spec.input);
+
+    if let Some(c) = inputs.constant {
+        // Generate constants fresh at each use to minimize long-range register pressure.
+        let ty = ctx.input_ty(spec.insn, spec.input);
+        return RegMem::reg(generate_constant(ctx, ty, c));
+    }
+
+    if let Some((src_insn, 0)) = inputs.inst {
+        if let Some((addr_input, offset)) = is_mergeable_load(ctx, src_insn) {
+            ctx.sink_inst(src_insn);
+            let amode = lower_to_amode(ctx, addr_input, offset);
+            return RegMem::mem(amode);
+        }
+    }
+
+    RegMem::reg(ctx.put_input_in_reg(spec.insn, spec.input))
+}
+
 /// An extension specification for `extend_input_to_reg`.
 #[derive(Clone, Copy)]
 enum ExtSpec {
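For orientation (a sketch under assumptions, not code from this patch): lowerings do not call `is_mergeable_load` directly; they call `input_to_reg_mem` or `input_to_reg_mem_imm`, and the load-sinking happens transparently inside. Note that `sink_inst` is only legal because `inputs.inst` reports a source instruction only when merging is safe (in particular, a single use with no intervening side effects). Assuming this file's `get_output_reg`, `Inst::gen_move`, and `Inst::alu_rmi_r` constructors, and `inputs`/`outputs` slices as in `lower_insn_to_regs`, a 64-bit add consumer looks roughly like:

    // Sketch: lowering `v3 = iadd.i64 v1, v2` where `v2` is a single-use load.
    let lhs = put_input_in_reg(ctx, inputs[0]);
    // If inputs[1] is a mergeable load, this sinks it and returns a Mem operand;
    // otherwise a register or a sign-extended immediate.
    let rhs = input_to_reg_mem_imm(ctx, inputs[1]);
    let dst = get_output_reg(ctx, outputs[0]);
    // x64 ALU ops are two-address: copy LHS into dst, then `add dst, rhs`.
    ctx.emit(Inst::gen_move(dst, lhs, types::I64));
    ctx.emit(Inst::alu_rmi_r(true /* is_64 */, AluRmiROpcode::Add, rhs, dst));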
@@ -128,7 +197,11 @@ enum ExtSpec {
 
 /// Put the given input into a register, marking it as used, and do a zero- or signed- extension if
 /// required. (This obviously causes side-effects.)
-fn extend_input_to_reg(ctx: Ctx, spec: InsnInput, ext_spec: ExtSpec) -> Reg {
+fn extend_input_to_reg<C: LowerCtx<I = Inst>>(
+    ctx: &mut C,
+    spec: InsnInput,
+    ext_spec: ExtSpec,
+) -> Reg {
     let requested_size = match ext_spec {
         ExtSpec::ZeroExtendTo32 | ExtSpec::SignExtendTo32 => 32,
         ExtSpec::ZeroExtendTo64 | ExtSpec::SignExtendTo64 => 64,
@@ -160,13 +233,6 @@ fn extend_input_to_reg(ctx: Ctx, spec: InsnInput, ext_spec: ExtSpec) -> Reg {
     dst.to_reg()
 }
 
-/// Put the given input into a register or a memory operand.
-/// Effectful: may mark the given input as used, when returning the register form.
-fn input_to_reg_mem(ctx: Ctx, spec: InsnInput) -> RegMem {
-    // TODO handle memory; merge a load directly, if possible.
-    RegMem::reg(ctx.put_input_in_reg(spec.insn, spec.input))
-}
-
 /// Returns whether the given input is an immediate that can be properly sign-extended, without any
 /// possible side-effect.
 fn non_reg_input_to_sext_imm(input: NonRegInput, input_ty: Type) -> Option<u32> {
@@ -182,20 +248,20 @@ fn non_reg_input_to_sext_imm(input: NonRegInput, input_ty: Type) -> Option<u32>
     })
 }
 
-fn input_to_sext_imm(ctx: Ctx, spec: InsnInput) -> Option<u32> {
+fn input_to_sext_imm<C: LowerCtx<I = Inst>>(ctx: &mut C, spec: InsnInput) -> Option<u32> {
     let input = ctx.get_input_as_source_or_const(spec.insn, spec.input);
     let input_ty = ctx.input_ty(spec.insn, spec.input);
     non_reg_input_to_sext_imm(input, input_ty)
 }
 
-fn input_to_imm(ctx: Ctx, spec: InsnInput) -> Option<u64> {
+fn input_to_imm<C: LowerCtx<I = Inst>>(ctx: &mut C, spec: InsnInput) -> Option<u64> {
     ctx.get_input_as_source_or_const(spec.insn, spec.input)
         .constant
 }
 
 /// Put the given input into an immediate, a register or a memory operand.
 /// Effectful: may mark the given input as used, when returning the register form.
-fn input_to_reg_mem_imm(ctx: Ctx, spec: InsnInput) -> RegMemImm {
+fn input_to_reg_mem_imm<C: LowerCtx<I = Inst>>(ctx: &mut C, spec: InsnInput) -> RegMemImm {
     let input = ctx.get_input_as_source_or_const(spec.insn, spec.input);
     let input_ty = ctx.input_ty(spec.insn, spec.input);
     match non_reg_input_to_sext_imm(input, input_ty) {
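(The match arms are cut off by the hunk, but the order of preference stands: a constant that sign-extends becomes an immediate via `non_reg_input_to_sext_imm`; otherwise the operand presumably falls through to the new `input_to_reg_mem`, so a mergeable load still wins over a plain register.)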
@@ -305,7 +371,7 @@ fn emit_extract_lane<C: LowerCtx<I = Inst>>(
 ///
 /// Note: make sure that there are no instructions modifying the flags between a call to this
 /// function and the use of the flags!
-fn emit_cmp(ctx: Ctx, insn: IRInst) {
+fn emit_cmp<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRInst) {
     let ty = ctx.input_ty(insn, 0);
 
     let inputs = [InsnInput { insn, input: 0 }, InsnInput { insn, input: 1 }];
@@ -355,7 +421,12 @@ enum FcmpCondResult {
 ///
 /// Note: make sure that there are no instructions modifying the flags between a call to this
 /// function and the use of the flags!
-fn emit_fcmp(ctx: Ctx, insn: IRInst, mut cond_code: FloatCC, spec: FcmpSpec) -> FcmpCondResult {
+fn emit_fcmp<C: LowerCtx<I = Inst>>(
+    ctx: &mut C,
+    insn: IRInst,
+    mut cond_code: FloatCC,
+    spec: FcmpSpec,
+) -> FcmpCondResult {
     let (flip_operands, inverted_equal) = match cond_code {
         FloatCC::LessThan
         | FloatCC::LessThanOrEqual
@@ -407,7 +478,12 @@ fn emit_fcmp(ctx: Ctx, insn: IRInst, mut cond_code: FloatCC, spec: FcmpSpec) ->
     cond_result
 }
 
-fn make_libcall_sig(ctx: Ctx, insn: IRInst, call_conv: CallConv, ptr_ty: Type) -> Signature {
+fn make_libcall_sig<C: LowerCtx<I = Inst>>(
+    ctx: &mut C,
+    insn: IRInst,
+    call_conv: CallConv,
+    ptr_ty: Type,
+) -> Signature {
     let mut sig = Signature::new(call_conv);
     for i in 0..ctx.num_inputs(insn) {
         sig.params.push(AbiParam::new(ctx.input_ty(insn, i)));
@@ -827,14 +903,16 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
         | Opcode::Bor
        | Opcode::Bxor => {
             // For commutative operations, try to commute operands if one is an
-            // immediate.
-            if let Some(imm) = input_to_sext_imm(ctx, inputs[0]) {
-                (put_input_in_reg(ctx, inputs[1]), RegMemImm::imm(imm))
+            // immediate or direct memory reference. Do so by converting LHS to RMI:
+            // if it stays a reg, convert RHS to RMI as well; otherwise, use LHS as
+            // the RMI operand and put RHS in a reg.
+            let lhs = input_to_reg_mem_imm(ctx, inputs[0]);
+            if let RegMemImm::Reg { reg: lhs_reg } = lhs {
+                let rhs = input_to_reg_mem_imm(ctx, inputs[1]);
+                (lhs_reg, rhs)
             } else {
-                (
-                    put_input_in_reg(ctx, inputs[0]),
-                    input_to_reg_mem_imm(ctx, inputs[1]),
-                )
+                let rhs_reg = put_input_in_reg(ctx, inputs[1]);
+                (rhs_reg, lhs)
             }
         }
         Opcode::Isub => (
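Worked through against the tests below: for `v3 = iadd.i32 v2, v1` with `v2` a single-use load, LHS converts to an RMI memory operand, so the `else` arm commutes and yields `(v1's register, mem)`; for `v3 = iadd.i32 v1, v2`, LHS stays a register and RHS converts to the same memory operand. Either way the result is one `addl 0(%rdi), %r12d`. `Isub`, immediately below, is not commutative and keeps the fixed register/RMI order.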

cranelift/filetests/filetests/isa/x64/load-op.clif (new file, 46 lines)
@@ -0,0 +1,46 @@
+test compile
+target x86_64
+feature "experimental_x64"
+
+function %add_from_mem_u32_1(i64, i32) -> i32 {
+block0(v0: i64, v1: i32):
+  v2 = load.i32 v0
+  v3 = iadd.i32 v2, v1
+  ; check: addl 0(%rdi), %r12d
+  return v3
+}
+
+function %add_from_mem_u32_2(i64, i32) -> i32 {
+block0(v0: i64, v1: i32):
+  v2 = load.i32 v0
+  v3 = iadd.i32 v1, v2
+  ; check: addl 0(%rdi), %r12d
+  return v3
+}
+
+function %add_from_mem_u64_1(i64, i64) -> i64 {
+block0(v0: i64, v1: i64):
+  v2 = load.i64 v0
+  v3 = iadd.i64 v2, v1
+  ; check: addq 0(%rdi), %r12
+  return v3
+}
+
+function %add_from_mem_u64_2(i64, i64) -> i64 {
+block0(v0: i64, v1: i64):
+  v2 = load.i64 v0
+  v3 = iadd.i64 v1, v2
+  ; check: addq 0(%rdi), %r12
+  return v3
+}
+
+; test narrow loads: 8-bit load should not merge because the `addl` is 32 bits
+; and would load 32 bits from memory, which may go beyond the end of the heap.
+function %add_from_mem_not_narrow(i64, i8) -> i8 {
+block0(v0: i64, v1: i8):
+  v2 = load.i8 v0
+  v3 = iadd.i8 v2, v1
+  ; check: movzbq 0(%rdi), %r12
+  ; nextln: addl %esi, %r12d
+  return v3
+}
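The last test pins down the width restriction in `is_mergeable_load`: merging the `load.i8` into the 32-bit `addl` would turn a 1-byte read into a 4-byte read. As a concrete illustration (not part of the test), if the byte sits at the last address of a mapped 4 KiB page, say page offset 0xfff, a 4-byte read covers offsets 0xfff through 0x1002 and crosses into the next, possibly unmapped, page. Hence the byte is loaded and extended on its own (`movzbq`) and only a register-register `addl` follows.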