Address review comments.

Chris Fallin
2020-05-18 15:40:17 -07:00
parent 687aca00fe
commit bdd2873c8c
7 changed files with 313 additions and 89 deletions

View File

@@ -352,6 +352,13 @@ impl MachInstEmit for Inst {
type State = EmitState;
fn emit(&self, sink: &mut MachBuffer<Inst>, flags: &settings::Flags, state: &mut EmitState) {
+// N.B.: we *must* not exceed the "worst-case size" used to compute
+// where to insert islands, except when islands are explicitly triggered
+// (with an `EmitIsland`). We check this in debug builds. This is `mut`
+// to allow disabling the check for `JTSequence`, which is always
+// emitted following an `EmitIsland`.
+let mut start_off = sink.cur_offset();
match self {
&Inst::AluRRR { alu_op, rd, rn, rm } => {
let top11 = match alu_op {
@@ -1307,6 +1314,10 @@ impl MachInstEmit for Inst {
LabelUse::PCRel32,
);
}
+// Lowering produces an EmitIsland before using a JTSequence, so we can safely
+// disable the worst-case-size check in this case.
+start_off = sink.cur_offset();
}
&Inst::LoadConst64 { rd, const_data } => {
let inst = Inst::ULoad64 {
@@ -1418,5 +1429,8 @@ impl MachInstEmit for Inst {
}
}
}
+let end_off = sink.cur_offset();
+debug_assert!((end_off - start_off) <= Inst::worst_case_size());
}
}
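In sketch form, the check added above amounts to the following pattern; this is a self-contained illustration with a plain byte buffer standing in for `MachBuffer`, not the real emission path.

// Sketch only: `worst_case_size` is whatever upper bound the instruction
// advertises; `emit` appends the instruction's bytes.
fn emit_with_size_check(buf: &mut Vec<u8>, worst_case_size: usize, emit: impl FnOnce(&mut Vec<u8>)) {
    let start_off = buf.len();
    emit(buf);
    let end_off = buf.len();
    // Mirrors the debug_assert above: an instruction must not grow past the
    // bound that was used when deciding where islands may be needed.
    debug_assert!(end_off - start_off <= worst_case_size);
}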

View File

@@ -657,6 +657,15 @@ pub enum Inst {
/// A one-way conditional branch, invisible to the CFG processing; used *only* as part of
/// straight-line sequences in code to be emitted.
+///
+/// In more detail:
+/// - This branch is lowered to a branch at the machine-code level, but does not end a basic
+///   block, and does not create edges in the CFG seen by regalloc.
+/// - Thus, it is *only* valid to use as part of a single-in, single-out sequence that is
+///   lowered from a single CLIF instruction. For example, an arithmetic operation may use
+///   such a branch to handle conditions such as overflow by branching around a trap.
+///
+/// See, e.g., the lowering of `trapif` (conditional trap) for an example.
OneWayCondBr {
target: BranchTarget,
kind: CondBrKind,
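A shape-only sketch of such a straight-line sequence (the target, condition, and trap code below are placeholders, not taken from the actual `trapif` lowering): the guard branch jumps just past the trap when the trapping condition does not hold, and no CFG edge is created for regalloc.

ctx.emit(Inst::OneWayCondBr {
    // Placeholders: `skip` points just past the `udf` below, and `inverted_cond`
    // is a CondBrKind testing the inverse of the trapping condition.
    target: skip,
    kind: inverted_cond,
});
ctx.emit(Inst::Udf {
    trap_info: (ctx.srcloc(insn), TrapCode::IntegerOverflow),
});
// Straight-line code continues here; no basic-block boundary was introduced.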
@@ -678,7 +687,7 @@ pub enum Inst {
trap_info: (SourceLoc, TrapCode),
},
-/// Load the address (using a PC-relative offset) of a memory location, using the `ADR`
+/// Compute the address (using a PC-relative offset) of a memory location, using the `ADR`
/// instruction. Note that we take a simple offset, not a `MemLabel`, here, because `Adr` is
/// only used for now in fixed lowering sequences with hardcoded offsets. In the future we may
/// need full `MemLabel` support.
@@ -734,9 +743,26 @@ pub enum Inst {
offset: i64,
},
-/// Meta-insn, no-op in generated code: emit constant/branch veneer island at this point (with
-/// a guard jump around it) if less than the needed space is available before the next branch
-/// deadline.
+/// Meta-insn, no-op in generated code: emit constant/branch veneer island
+/// at this point (with a guard jump around it) if less than the needed
+/// space is available before the next branch deadline. See the `MachBuffer`
+/// implementation in `machinst/buffer.rs` for the overall algorithm. In
+/// brief, we retain a set of "pending/unresolved label references" from
+/// branches as we scan forward through instructions to emit machine code;
+/// if we notice we're about to go out of range on an unresolved reference,
+/// we stop, emit a bunch of "veneers" (branches in a form that has a longer
+/// range, e.g. a 26-bit-offset unconditional jump), and point the original
+/// label references to those. This is an "island" because it comes in the
+/// middle of the code.
+///
+/// This meta-instruction is a necessary part of the logic that determines
+/// where to place islands. Ordinarily, we want to place them between basic
+/// blocks, so we compute the worst-case size of each block, and emit the
+/// island before starting a block if we would exceed a deadline before the
+/// end of the block. However, some sequences (such as an inline jumptable)
+/// are variable-length and not accounted for by this logic; so these
+/// lowered sequences include an `EmitIsland` to trigger island generation
+/// where necessary.
EmitIsland {
/// The needed space before the next deadline.
needed_space: CodeOffset,
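In sketch form, the deadline decision described above looks like the following (illustrative names only, not the actual `MachBuffer` code):

// Decide whether to emit a veneer island before emitting a block: if the
// block's worst-case size could carry us past the deadline of some pending
// short-range label reference, stop and emit veneers now.
fn needs_island(cur_offset: u32, block_worst_case_size: u32, nearest_deadline: u32) -> bool {
    cur_offset + block_worst_case_size > nearest_deadline
}

fn example() {
    // A pending 19-bit conditional branch can reach roughly 1 MiB ahead; a
    // 2 KiB worst-case block fits comfortably until we get close to that.
    let deadline = 0x10_0000;
    assert!(!needs_island(0x08_0000, 0x800, deadline));
    assert!(needs_island(0x0F_FC00, 0x800, deadline));
}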
@@ -1770,6 +1796,18 @@ impl MachInst for Inst {
));
ret
} else {
+// Must be an integer type.
+debug_assert!(
+    ty == B1
+        || ty == I8
+        || ty == B8
+        || ty == I16
+        || ty == B16
+        || ty == I32
+        || ty == B32
+        || ty == I64
+        || ty == B64
+);
Inst::load_constant(to_reg, value)
}
}
@@ -2601,7 +2639,8 @@ pub enum LabelUse {
/// 21-bit offset for ADR (get address of label). PC-rel, offset is not shifted. Immediate is
/// 21 signed bits, with high 19 bits in bits 23:5 and low 2 bits in bits 30:29.
Adr21,
-/// 32-bit PC relative constant offset (from address of constant itself). Used in jump tables.
+/// 32-bit PC relative constant offset (from address of constant itself),
+/// signed. Used in jump tables.
PCRel32,
}
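As a worked illustration of the `Adr21` layout described above (a sketch only, not the actual fixup code in `MachBuffer`), patching a 21-bit signed byte offset into an ADR-class instruction word splits it across the two immediate fields:

fn patch_adr21(insn_word: u32, offset: i32) -> u32 {
    // The offset must fit in 21 signed bits.
    debug_assert!(offset >= -(1 << 20) && offset < (1 << 20));
    let imm = (offset as u32) & 0x1f_ffff; // 21 significant bits
    let immlo = imm & 0b11; // low 2 bits -> instruction bits 30:29
    let immhi = imm >> 2; // high 19 bits -> instruction bits 23:5
    let mask = (0b11 << 29) | (0x7ffff << 5);
    (insn_word & !mask) | (immlo << 29) | (immhi << 5)
}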

View File

@@ -188,7 +188,7 @@ pub(crate) fn input_to_reg<C: LowerCtx<I = Inst>>(
let inputs = ctx.get_input(input.insn, input.input);
let in_reg = if let Some(c) = inputs.constant {
// Generate constants fresh at each use to minimize long-range register pressure.
-let to_reg = ctx.tmp(Inst::rc_for_type(ty).unwrap(), ty);
+let to_reg = ctx.alloc_tmp(Inst::rc_for_type(ty).unwrap(), ty);
for inst in Inst::gen_constant(to_reg, c, ty).into_iter() {
ctx.emit(inst);
}
@@ -201,7 +201,7 @@ pub(crate) fn input_to_reg<C: LowerCtx<I = Inst>>(
match (narrow_mode, from_bits) {
(NarrowValueMode::None, _) => in_reg,
(NarrowValueMode::ZeroExtend32, n) if n < 32 => {
-let tmp = ctx.tmp(RegClass::I64, I32);
+let tmp = ctx.alloc_tmp(RegClass::I64, I32);
ctx.emit(Inst::Extend {
rd: tmp,
rn: in_reg,
@@ -212,7 +212,7 @@ pub(crate) fn input_to_reg<C: LowerCtx<I = Inst>>(
tmp.to_reg()
}
(NarrowValueMode::SignExtend32, n) if n < 32 => {
-let tmp = ctx.tmp(RegClass::I64, I32);
+let tmp = ctx.alloc_tmp(RegClass::I64, I32);
ctx.emit(Inst::Extend {
rd: tmp,
rn: in_reg,
@@ -229,7 +229,7 @@ pub(crate) fn input_to_reg<C: LowerCtx<I = Inst>>(
// Constants are zero-extended to full 64-bit width on load already.
in_reg
} else {
-let tmp = ctx.tmp(RegClass::I64, I32);
+let tmp = ctx.alloc_tmp(RegClass::I64, I32);
ctx.emit(Inst::Extend {
rd: tmp,
rn: in_reg,
@@ -241,7 +241,7 @@ pub(crate) fn input_to_reg<C: LowerCtx<I = Inst>>(
}
}
(NarrowValueMode::SignExtend64, n) if n < 64 => {
-let tmp = ctx.tmp(RegClass::I64, I32);
+let tmp = ctx.alloc_tmp(RegClass::I64, I32);
ctx.emit(Inst::Extend {
rd: tmp,
rn: in_reg,
@@ -529,7 +529,7 @@ pub(crate) fn lower_address<C: LowerCtx<I = Inst>>(
}
// Otherwise, generate add instructions.
-let addr = ctx.tmp(RegClass::I64, I64);
+let addr = ctx.alloc_tmp(RegClass::I64, I64);
// Get the const into a reg.
lower_constant_u64(ctx, addr.clone(), offset as u64);
@@ -541,7 +541,7 @@ pub(crate) fn lower_address<C: LowerCtx<I = Inst>>(
// In an addition, the stack register is the zero register, so divert it to another
// register just before doing the actual add.
let reg = if reg == stack_reg() {
-let tmp = ctx.tmp(RegClass::I64, I64);
+let tmp = ctx.alloc_tmp(RegClass::I64, I64);
ctx.emit(Inst::Mov {
rd: tmp,
rm: stack_reg(),

View File

@@ -84,8 +84,8 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
} else {
VecALUOp::UQAddScalar
};
-let va = ctx.tmp(RegClass::V128, I128);
-let vb = ctx.tmp(RegClass::V128, I128);
+let va = ctx.alloc_tmp(RegClass::V128, I128);
+let vb = ctx.alloc_tmp(RegClass::V128, I128);
let ra = input_to_reg(ctx, inputs[0], narrow_mode);
let rb = input_to_reg(ctx, inputs[1], narrow_mode);
let rd = output_to_reg(ctx, outputs[0]);
@@ -115,8 +115,8 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
} else {
VecALUOp::UQSubScalar
};
-let va = ctx.tmp(RegClass::V128, I128);
-let vb = ctx.tmp(RegClass::V128, I128);
+let va = ctx.alloc_tmp(RegClass::V128, I128);
+let vb = ctx.alloc_tmp(RegClass::V128, I128);
let ra = input_to_reg(ctx, inputs[0], narrow_mode);
let rb = input_to_reg(ctx, inputs[1], narrow_mode);
let rd = output_to_reg(ctx, outputs[0]);
@@ -498,7 +498,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
// ignored (because of the implicit masking done by the instruction),
// so this is equivalent to negating the input.
let alu_op = choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64);
-let tmp = ctx.tmp(RegClass::I64, ty);
+let tmp = ctx.alloc_tmp(RegClass::I64, ty);
ctx.emit(Inst::AluRRR {
alu_op,
rd: tmp,
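The equivalence relied on in the comment above can be checked in isolation: because the hardware masks the rotate amount to its low bits, negating the amount gives the same result as subtracting it from the bit width. A self-contained check:

fn rotl32_via_ror(x: u32, shamt: u32) -> u32 {
    // (0 - shamt) and (32 - shamt) agree in their low five bits, so a plain
    // negate (a subtract from zero) is all the lowering needs.
    x.rotate_right(0u32.wrapping_sub(shamt) & 31)
}

#[test]
fn rotl_via_ror_matches() {
    for s in 0..32 {
        assert_eq!(rotl32_via_ror(0x1234_5678, s), 0x1234_5678u32.rotate_left(s));
    }
}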
@@ -521,7 +521,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
// Really ty_bits_size - rn, but the upper bits of the result are
// ignored (because of the implicit masking done by the instruction),
// so this is equivalent to negating the input.
-let tmp = ctx.tmp(RegClass::I64, I32);
+let tmp = ctx.alloc_tmp(RegClass::I64, I32);
ctx.emit(Inst::AluRRR {
alu_op: ALUOp::Sub32,
rd: tmp,
@@ -534,7 +534,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
};
// Explicitly mask the rotation count.
-let tmp_masked_rm = ctx.tmp(RegClass::I64, I32);
+let tmp_masked_rm = ctx.alloc_tmp(RegClass::I64, I32);
ctx.emit(Inst::AluRRImmLogic {
alu_op: ALUOp::And32,
rd: tmp_masked_rm,
@@ -543,8 +543,8 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
});
let tmp_masked_rm = tmp_masked_rm.to_reg();
-let tmp1 = ctx.tmp(RegClass::I64, I32);
-let tmp2 = ctx.tmp(RegClass::I64, I32);
+let tmp1 = ctx.alloc_tmp(RegClass::I64, I32);
+let tmp2 = ctx.alloc_tmp(RegClass::I64, I32);
ctx.emit(Inst::AluRRImm12 {
alu_op: ALUOp::Sub32,
rd: tmp1,
@@ -583,7 +583,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
}
immshift.imm &= ty_bits_size - 1;
-let tmp1 = ctx.tmp(RegClass::I64, I32);
+let tmp1 = ctx.alloc_tmp(RegClass::I64, I32);
ctx.emit(Inst::AluRRImmShift {
alu_op: ALUOp::Lsr32,
rd: tmp1,
@@ -688,7 +688,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
// and fix the sequence below to work properly for this.
let narrow_mode = NarrowValueMode::ZeroExtend64;
let rn = input_to_reg(ctx, inputs[0], narrow_mode);
-let tmp = ctx.tmp(RegClass::I64, I64);
+let tmp = ctx.alloc_tmp(RegClass::I64, I64);
// If this is a 32-bit Popcnt, use Lsr32 to clear the top 32 bits of the register, then
// the rest of the code is identical to the 64-bit version.
@@ -997,7 +997,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
}
Opcode::Bitselect => {
-let tmp = ctx.tmp(RegClass::I64, I64);
+let tmp = ctx.alloc_tmp(RegClass::I64, I64);
let rd = output_to_reg(ctx, outputs[0]);
let rcond = input_to_reg(ctx, inputs[0], NarrowValueMode::None);
let rn = input_to_reg(ctx, inputs[1], NarrowValueMode::None);
@@ -1475,8 +1475,8 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None);
let rm = input_to_reg(ctx, inputs[1], NarrowValueMode::None);
let rd = output_to_reg(ctx, outputs[0]);
-let tmp1 = ctx.tmp(RegClass::I64, I64);
-let tmp2 = ctx.tmp(RegClass::I64, I64);
+let tmp1 = ctx.alloc_tmp(RegClass::I64, I64);
+let tmp2 = ctx.alloc_tmp(RegClass::I64, I64);
ctx.emit(Inst::MovFromVec64 { rd: tmp1, rn: rn });
ctx.emit(Inst::MovFromVec64 { rd: tmp2, rn: rm });
let imml = if bits == 32 {
@@ -1546,7 +1546,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
let trap_info = (ctx.srcloc(insn), TrapCode::BadConversionToInteger);
ctx.emit(Inst::Udf { trap_info });
-let tmp = ctx.tmp(RegClass::V128, I128);
+let tmp = ctx.alloc_tmp(RegClass::V128, I128);
// Check that the input is in range, with "truncate towards zero" semantics. This means
// we allow values that are slightly out of range:
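Concretely, taking f64-to-i32 as one illustrative case (the lowering below performs this check with FP comparisons against loaded boundary constants): any input that truncates back into the i32 range is accepted, even though it may lie slightly beyond i32::MIN or i32::MAX themselves.

// Sketch of the accepted range for an f64 -> i32 truncating conversion: NaN
// and anything outside the open interval (-2147483649.0, 2147483648.0) traps.
fn fits_in_i32_after_trunc(x: f64) -> bool {
    x > -2147483649.0 && x < 2147483648.0
}

fn example() {
    assert!(fits_in_i32_after_trunc(2147483647.9)); // truncates to i32::MAX
    assert!(fits_in_i32_after_trunc(-2147483648.9)); // truncates to i32::MIN
    assert!(!fits_in_i32_after_trunc(2147483648.0)); // out of range: must trap
}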
@@ -1712,8 +1712,8 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
_ => unreachable!(),
};
-let rtmp1 = ctx.tmp(RegClass::V128, in_ty);
-let rtmp2 = ctx.tmp(RegClass::V128, in_ty);
+let rtmp1 = ctx.alloc_tmp(RegClass::V128, in_ty);
+let rtmp2 = ctx.alloc_tmp(RegClass::V128, in_ty);
if in_bits == 32 {
ctx.emit(Inst::LoadFpuConst32 {
@@ -2072,7 +2072,9 @@ pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
Opcode::BrTable => {
// Expand `br_table index, default, JT` to:
//
-// (emit island with guard jump if needed)
+//   emit_island          // this forces an island at this point
+//                        // if the jumptable would push us past
+//                        // the deadline
// subs idx, #jt_size
// b.hs default
// adr vTmp1, PC+16
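Each jump-table entry emitted by this sequence is a `LabelUse::PCRel32` value: a signed 32-bit offset relative to the entry's own address. Resolving a target from a loaded entry is, in sketch form (not the emitted code itself):

fn jump_table_target(entry_addr: u64, entry_value: i32) -> u64 {
    // Sign-extend the 32-bit entry and add it to the address it was loaded from.
    entry_addr.wrapping_add(entry_value as i64 as u64)
}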
@@ -2096,8 +2098,8 @@ pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
NarrowValueMode::ZeroExtend32,
);
-let rtmp1 = ctx.tmp(RegClass::I64, I32);
-let rtmp2 = ctx.tmp(RegClass::I64, I32);
+let rtmp1 = ctx.alloc_tmp(RegClass::I64, I32);
+let rtmp2 = ctx.alloc_tmp(RegClass::I64, I32);
// Bounds-check and branch to default.
if let Some(imm12) = Imm12::maybe_from_u64(jt_size as u64) {