diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs index 2d5ecd406d..aa6727b978 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs @@ -352,6 +352,13 @@ impl MachInstEmit for Inst { type State = EmitState; fn emit(&self, sink: &mut MachBuffer, flags: &settings::Flags, state: &mut EmitState) { + // N.B.: we *must* not exceed the "worst-case size" used to compute + // where to insert islands, except when islands are explicitly triggered + // (with an `EmitIsland`). We check this in debug builds. This is `mut` + // to allow disabling the check for `JTSequence`, which is always + // emitted following an `EmitIsland`. + let mut start_off = sink.cur_offset(); + match self { &Inst::AluRRR { alu_op, rd, rn, rm } => { let top11 = match alu_op { @@ -1307,6 +1314,10 @@ impl MachInstEmit for Inst { LabelUse::PCRel32, ); } + + // Lowering produces an EmitIsland before using a JTSequence, so we can safely + // disable the worst-case-size check in this case. + start_off = sink.cur_offset(); } &Inst::LoadConst64 { rd, const_data } => { let inst = Inst::ULoad64 { @@ -1418,5 +1429,8 @@ impl MachInstEmit for Inst { } } } + + let end_off = sink.cur_offset(); + debug_assert!((end_off - start_off) <= Inst::worst_case_size()); } } diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs index 714ba1eb4d..fd910522c5 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs @@ -657,6 +657,15 @@ pub enum Inst { /// A one-way conditional branch, invisible to the CFG processing; used *only* as part of /// straight-line sequences in code to be emitted. + /// + /// In more detail: + /// - This branch is lowered to a branch at the machine-code level, but does not end a basic + /// block, and does not create edges in the CFG seen by regalloc. + /// - Thus, it is *only* valid to use as part of a single-in, single-out sequence that is + /// lowered from a single CLIF instruction. For example, certain arithmetic operations may + /// use these branches to handle certain conditions, such as overflows, traps, etc. + /// + /// See, e.g., the lowering of `trapif` (conditional trap) for an example. OneWayCondBr { target: BranchTarget, kind: CondBrKind, @@ -678,7 +687,7 @@ pub enum Inst { trap_info: (SourceLoc, TrapCode), }, - /// Load the address (using a PC-relative offset) of a memory location, using the `ADR` + /// Compute the address (using a PC-relative offset) of a memory location, using the `ADR` /// instruction. Note that we take a simple offset, not a `MemLabel`, here, because `Adr` is /// only used for now in fixed lowering sequences with hardcoded offsets. In the future we may /// need full `MemLabel` support. @@ -734,9 +743,26 @@ pub enum Inst { offset: i64, }, - /// Meta-insn, no-op in generated code: emit constant/branch veneer island at this point (with - /// a guard jump around it) if less than the needed space is available before the next branch - /// deadline. + /// Meta-insn, no-op in generated code: emit constant/branch veneer island + /// at this point (with a guard jump around it) if less than the needed + /// space is available before the next branch deadline. See the `MachBuffer` + /// implementation in `machinst/buffer.rs` for the overall algorithm. 
In + /// brief, we retain a set of "pending/unresolved label references" from + /// branches as we scan forward through instructions to emit machine code; + /// if we notice we're about to go out of range on an unresolved reference, + /// we stop, emit a bunch of "veneers" (branches in a form that has a longer + /// range, e.g. a 26-bit-offset unconditional jump), and point the original + /// label references to those. This is an "island" because it comes in the + /// middle of the code. + /// + /// This meta-instruction is a necessary part of the logic that determines + /// where to place islands. Ordinarily, we want to place them between basic + /// blocks, so we compute the worst-case size of each block, and emit the + /// island before starting a block if we would exceed a deadline before the + /// end of the block. However, some sequences (such as an inline jumptable) + /// are variable-length and not accounted for by this logic; so these + /// lowered sequences include an `EmitIsland` to trigger island generation + /// where necessary. EmitIsland { /// The needed space before the next deadline. needed_space: CodeOffset, @@ -1770,6 +1796,18 @@ impl MachInst for Inst { )); ret } else { + // Must be an integer type. + debug_assert!( + ty == B1 + || ty == I8 + || ty == B8 + || ty == I16 + || ty == B16 + || ty == I32 + || ty == B32 + || ty == I64 + || ty == B64 + ); Inst::load_constant(to_reg, value) } } @@ -2601,7 +2639,8 @@ pub enum LabelUse { /// 21-bit offset for ADR (get address of label). PC-rel, offset is not shifted. Immediate is /// 21 signed bits, with high 19 bits in bits 23:5 and low 2 bits in bits 30:29. Adr21, - /// 32-bit PC relative constant offset (from address of constant itself). Used in jump tables. + /// 32-bit PC relative constant offset (from address of constant itself), + /// signed. Used in jump tables. PCRel32, } diff --git a/cranelift/codegen/src/isa/aarch64/lower.rs b/cranelift/codegen/src/isa/aarch64/lower.rs index d1368a3d97..eb4aafd551 100644 --- a/cranelift/codegen/src/isa/aarch64/lower.rs +++ b/cranelift/codegen/src/isa/aarch64/lower.rs @@ -188,7 +188,7 @@ pub(crate) fn input_to_reg>( let inputs = ctx.get_input(input.insn, input.input); let in_reg = if let Some(c) = inputs.constant { // Generate constants fresh at each use to minimize long-range register pressure. - let to_reg = ctx.tmp(Inst::rc_for_type(ty).unwrap(), ty); + let to_reg = ctx.alloc_tmp(Inst::rc_for_type(ty).unwrap(), ty); for inst in Inst::gen_constant(to_reg, c, ty).into_iter() { ctx.emit(inst); } @@ -201,7 +201,7 @@ pub(crate) fn input_to_reg>( match (narrow_mode, from_bits) { (NarrowValueMode::None, _) => in_reg, (NarrowValueMode::ZeroExtend32, n) if n < 32 => { - let tmp = ctx.tmp(RegClass::I64, I32); + let tmp = ctx.alloc_tmp(RegClass::I64, I32); ctx.emit(Inst::Extend { rd: tmp, rn: in_reg, @@ -212,7 +212,7 @@ pub(crate) fn input_to_reg>( tmp.to_reg() } (NarrowValueMode::SignExtend32, n) if n < 32 => { - let tmp = ctx.tmp(RegClass::I64, I32); + let tmp = ctx.alloc_tmp(RegClass::I64, I32); ctx.emit(Inst::Extend { rd: tmp, rn: in_reg, @@ -229,7 +229,7 @@ pub(crate) fn input_to_reg>( // Constants are zero-extended to full 64-bit width on load already. 
in_reg } else { - let tmp = ctx.tmp(RegClass::I64, I32); + let tmp = ctx.alloc_tmp(RegClass::I64, I32); ctx.emit(Inst::Extend { rd: tmp, rn: in_reg, @@ -241,7 +241,7 @@ pub(crate) fn input_to_reg>( } } (NarrowValueMode::SignExtend64, n) if n < 64 => { - let tmp = ctx.tmp(RegClass::I64, I32); + let tmp = ctx.alloc_tmp(RegClass::I64, I32); ctx.emit(Inst::Extend { rd: tmp, rn: in_reg, @@ -529,7 +529,7 @@ pub(crate) fn lower_address>( } // Otherwise, generate add instructions. - let addr = ctx.tmp(RegClass::I64, I64); + let addr = ctx.alloc_tmp(RegClass::I64, I64); // Get the const into a reg. lower_constant_u64(ctx, addr.clone(), offset as u64); @@ -541,7 +541,7 @@ pub(crate) fn lower_address>( // In an addition, the stack register is the zero register, so divert it to another // register just before doing the actual add. let reg = if reg == stack_reg() { - let tmp = ctx.tmp(RegClass::I64, I64); + let tmp = ctx.alloc_tmp(RegClass::I64, I64); ctx.emit(Inst::Mov { rd: tmp, rm: stack_reg(), diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs index 8692d853de..a92dea7a7b 100644 --- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs +++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs @@ -84,8 +84,8 @@ pub(crate) fn lower_insn_to_regs>( } else { VecALUOp::UQAddScalar }; - let va = ctx.tmp(RegClass::V128, I128); - let vb = ctx.tmp(RegClass::V128, I128); + let va = ctx.alloc_tmp(RegClass::V128, I128); + let vb = ctx.alloc_tmp(RegClass::V128, I128); let ra = input_to_reg(ctx, inputs[0], narrow_mode); let rb = input_to_reg(ctx, inputs[1], narrow_mode); let rd = output_to_reg(ctx, outputs[0]); @@ -115,8 +115,8 @@ pub(crate) fn lower_insn_to_regs>( } else { VecALUOp::UQSubScalar }; - let va = ctx.tmp(RegClass::V128, I128); - let vb = ctx.tmp(RegClass::V128, I128); + let va = ctx.alloc_tmp(RegClass::V128, I128); + let vb = ctx.alloc_tmp(RegClass::V128, I128); let ra = input_to_reg(ctx, inputs[0], narrow_mode); let rb = input_to_reg(ctx, inputs[1], narrow_mode); let rd = output_to_reg(ctx, outputs[0]); @@ -498,7 +498,7 @@ pub(crate) fn lower_insn_to_regs>( // ignored (because of the implicit masking done by the instruction), // so this is equivalent to negating the input. let alu_op = choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64); - let tmp = ctx.tmp(RegClass::I64, ty); + let tmp = ctx.alloc_tmp(RegClass::I64, ty); ctx.emit(Inst::AluRRR { alu_op, rd: tmp, @@ -521,7 +521,7 @@ pub(crate) fn lower_insn_to_regs>( // Really ty_bits_size - rn, but the upper bits of the result are // ignored (because of the implicit masking done by the instruction), // so this is equivalent to negating the input. - let tmp = ctx.tmp(RegClass::I64, I32); + let tmp = ctx.alloc_tmp(RegClass::I64, I32); ctx.emit(Inst::AluRRR { alu_op: ALUOp::Sub32, rd: tmp, @@ -534,7 +534,7 @@ pub(crate) fn lower_insn_to_regs>( }; // Explicitly mask the rotation count. 
- let tmp_masked_rm = ctx.tmp(RegClass::I64, I32); + let tmp_masked_rm = ctx.alloc_tmp(RegClass::I64, I32); ctx.emit(Inst::AluRRImmLogic { alu_op: ALUOp::And32, rd: tmp_masked_rm, @@ -543,8 +543,8 @@ pub(crate) fn lower_insn_to_regs>( }); let tmp_masked_rm = tmp_masked_rm.to_reg(); - let tmp1 = ctx.tmp(RegClass::I64, I32); - let tmp2 = ctx.tmp(RegClass::I64, I32); + let tmp1 = ctx.alloc_tmp(RegClass::I64, I32); + let tmp2 = ctx.alloc_tmp(RegClass::I64, I32); ctx.emit(Inst::AluRRImm12 { alu_op: ALUOp::Sub32, rd: tmp1, @@ -583,7 +583,7 @@ pub(crate) fn lower_insn_to_regs>( } immshift.imm &= ty_bits_size - 1; - let tmp1 = ctx.tmp(RegClass::I64, I32); + let tmp1 = ctx.alloc_tmp(RegClass::I64, I32); ctx.emit(Inst::AluRRImmShift { alu_op: ALUOp::Lsr32, rd: tmp1, @@ -688,7 +688,7 @@ pub(crate) fn lower_insn_to_regs>( // and fix the sequence below to work properly for this. let narrow_mode = NarrowValueMode::ZeroExtend64; let rn = input_to_reg(ctx, inputs[0], narrow_mode); - let tmp = ctx.tmp(RegClass::I64, I64); + let tmp = ctx.alloc_tmp(RegClass::I64, I64); // If this is a 32-bit Popcnt, use Lsr32 to clear the top 32 bits of the register, then // the rest of the code is identical to the 64-bit version. @@ -997,7 +997,7 @@ pub(crate) fn lower_insn_to_regs>( } Opcode::Bitselect => { - let tmp = ctx.tmp(RegClass::I64, I64); + let tmp = ctx.alloc_tmp(RegClass::I64, I64); let rd = output_to_reg(ctx, outputs[0]); let rcond = input_to_reg(ctx, inputs[0], NarrowValueMode::None); let rn = input_to_reg(ctx, inputs[1], NarrowValueMode::None); @@ -1475,8 +1475,8 @@ pub(crate) fn lower_insn_to_regs>( let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None); let rm = input_to_reg(ctx, inputs[1], NarrowValueMode::None); let rd = output_to_reg(ctx, outputs[0]); - let tmp1 = ctx.tmp(RegClass::I64, I64); - let tmp2 = ctx.tmp(RegClass::I64, I64); + let tmp1 = ctx.alloc_tmp(RegClass::I64, I64); + let tmp2 = ctx.alloc_tmp(RegClass::I64, I64); ctx.emit(Inst::MovFromVec64 { rd: tmp1, rn: rn }); ctx.emit(Inst::MovFromVec64 { rd: tmp2, rn: rm }); let imml = if bits == 32 { @@ -1546,7 +1546,7 @@ pub(crate) fn lower_insn_to_regs>( let trap_info = (ctx.srcloc(insn), TrapCode::BadConversionToInteger); ctx.emit(Inst::Udf { trap_info }); - let tmp = ctx.tmp(RegClass::V128, I128); + let tmp = ctx.alloc_tmp(RegClass::V128, I128); // Check that the input is in range, with "truncate towards zero" semantics. This means // we allow values that are slightly out of range: @@ -1712,8 +1712,8 @@ pub(crate) fn lower_insn_to_regs>( _ => unreachable!(), }; - let rtmp1 = ctx.tmp(RegClass::V128, in_ty); - let rtmp2 = ctx.tmp(RegClass::V128, in_ty); + let rtmp1 = ctx.alloc_tmp(RegClass::V128, in_ty); + let rtmp2 = ctx.alloc_tmp(RegClass::V128, in_ty); if in_bits == 32 { ctx.emit(Inst::LoadFpuConst32 { @@ -2072,7 +2072,9 @@ pub(crate) fn lower_branch>( Opcode::BrTable => { // Expand `br_table index, default, JT` to: // - // (emit island with guard jump if needed) + // emit_island // this forces an island at this point + // // if the jumptable would push us past + // // the deadline // subs idx, #jt_size // b.hs default // adr vTmp1, PC+16 @@ -2096,8 +2098,8 @@ pub(crate) fn lower_branch>( NarrowValueMode::ZeroExtend32, ); - let rtmp1 = ctx.tmp(RegClass::I64, I32); - let rtmp2 = ctx.tmp(RegClass::I64, I32); + let rtmp1 = ctx.alloc_tmp(RegClass::I64, I32); + let rtmp2 = ctx.alloc_tmp(RegClass::I64, I32); // Bounds-check and branch to default. 
if let Some(imm12) = Imm12::maybe_from_u64(jt_size as u64) { diff --git a/cranelift/codegen/src/machinst/blockorder.rs b/cranelift/codegen/src/machinst/blockorder.rs index 104b2f8c15..1052d83858 100644 --- a/cranelift/codegen/src/machinst/blockorder.rs +++ b/cranelift/codegen/src/machinst/blockorder.rs @@ -3,12 +3,54 @@ //! This module handles the translation from CLIF BBs to VCode BBs. //! //! The basic idea is that we compute a sequence of "lowered blocks" that -//! correspond to subgraphs of the CLIF CFG plus an implicit block on *every* -//! edge (not just critical edges). Conceptually, the lowering pipeline wants to -//! insert moves for phi-nodes on every block-to-block transfer; these blocks -//! always conceptually exist, but may be merged with an "original" CLIF block -//! (and hence not actually exist; this is equivalent to inserting the blocks -//! only on critical edges). +//! correspond to one or more blocks in the graph: (CLIF CFG) `union` (implicit +//! block on *every* edge). Conceptually, the lowering pipeline wants to insert +//! moves for phi-nodes on every block-to-block transfer; these blocks always +//! conceptually exist, but may be merged with an "original" CLIF block (and +//! hence not actually exist; this is equivalent to inserting the blocks only on +//! critical edges). +//! +//! In other words, starting from a CFG like this (where each "CLIF block" and +//! "(edge N->M)" is a separate basic block): +//! +//! ```plain +//! +//! CLIF block 0 +//! / \ +//! (edge 0->1) (edge 0->2) +//! | | +//! CLIF block 1 CLIF block 2 +//! \ / +//! (edge 1->3) (edge 2->3) +//! \ / +//! CLIF block 3 +//! ``` +//! +//! We can produce a CFG of lowered blocks like so: +//! +//! ```plain +//! +--------------+ +//! | CLIF block 0 | +//! +--------------+ +//! / \ +//! +--------------+ +--------------+ +//! | (edge 0->1) | |(edge 0->2) | +//! | CLIF block 1 | | CLIF block 2 | +//! +--------------+ +--------------+ +//! \ / +//! +-----------+ +-----------+ +//! |(edge 1->3)| |(edge 2->3)| +//! +-----------+ +-----------+ +//! \ / +//! +------------+ +//! |CLIF block 3| +//! +------------+ +//! ``` +//! +//! (note that the edges into CLIF blocks 1 and 2 could be merged with those +//! blocks' original bodies, but the out-edges could not because for simplicity +//! in the successor-function definition, we only ever merge an edge onto one +//! side of an original CLIF block.) //! //! Each `LoweredBlock` names just an original CLIF block, an original CLIF //! block prepended or appended with an edge block (never both, though), or just @@ -23,6 +65,9 @@ //! have content, because this computation happens as part of lowering *before* //! regalloc, and regalloc may or may not insert moves/spills/reloads on any //! particular edge. But it works relatively well and is conceptually simple. +//! Furthermore, the [MachBuffer] machine-code sink performs final peephole-like +//! branch editing that in practice elides empty blocks and simplifies some of +//! the other redundancies that this scheme produces. use crate::entity::SecondaryMap; use crate::fx::{FxHashMap, FxHashSet}; diff --git a/cranelift/codegen/src/machinst/buffer.rs b/cranelift/codegen/src/machinst/buffer.rs index b9e3bb3c1e..cb7564f258 100644 --- a/cranelift/codegen/src/machinst/buffer.rs +++ b/cranelift/codegen/src/machinst/buffer.rs @@ -1,12 +1,116 @@ //! In-memory representation of compiled machine code, with labels and fixups to //! refer to those labels. Handles constant-pool island insertion and also //! 
veneer insertion for out-of-range jumps. +//! +//! This code exists to solve three problems: +//! +//! - Branch targets for forward branches are not known until later, when we +//! emit code in a single pass through the instruction structs. +//! +//! - On many architectures, address references or offsets have limited range. +//! For example, on AArch64, conditional branches can only target code +/- 1MB +//! from the branch itself. +//! +//! - The lowering of control flow from the CFG-with-edges produced by +//! [BlockLoweringOrder], combined with many empty edge blocks when the register +//! allocator does not need to insert any spills/reloads/moves in edge blocks, +//! results in many suboptimal branch patterns. The lowering also pays no +//! attention to block order, and so two-target conditional forms (cond-br +//! followed by uncond-br) can often be avoided because one of the targets is +//! the fallthrough. There are several cases here where we can simplify to use +//! fewer branches. +//! +//! This "buffer" implements a single-pass code emission strategy (with a later +//! "fixup" pass, but only through recorded fixups, not all instructions). The +//! basic idea is: +//! +//! - Emit branches as they are, including two-target (cond/uncond) compound +//! forms, but with zero offsets and optimistically assuming the target will be +//! in range. Record the "fixup" for later. Targets are denoted instead by +//! symbolic "labels" that are then bound to certain offsets in the buffer as +//! we emit code. (Nominally, there is a label at the start of every basic +//! block.) +//! +//! - As we do this, track the offset in the buffer at which the first label +//! reference "goes out of range". We call this the "deadline". If we reach the +//! deadline and we still have not bound the label to which an unresolved branch +//! refers, we have a problem! +//! +//! - To solve this problem, we emit "islands" full of "veneers". An island is +//! simply a chunk of code inserted in the middle of the code actually produced +//! by the emitter (e.g., vcode iterating over instruction structs). The emitter +//! has some awareness of this: it either asks for an island between blocks, so +//! it is not accidentally executed, or else it emits a branch around the island +//! when all other options fail (see [Inst::EmitIsland] meta-instruction). +//! +//! - A "veneer" is an instruction (or sequence of instructions) in an "island" +//! that implements a longer-range reference to a label. The idea is that, for +//! example, a branch with a limited range can branch to a "veneer" instead, +//! which is simply a branch in a form that can use a longer-range reference. On +//! AArch64, for example, conditionals have a +/- 1 MB range, but a conditional +//! can branch to an unconditional branch which has a +/- 128 MB range. Hence, a +//! conditional branch's label reference can be fixed up with a "veneer" to +//! achieve a longer range. +//! +//! - To implement all of this, we require the backend to provide a `LabelUse` +//! type that implements a trait. This is nominally an enum that records one of +//! several kinds of references to an offset in code -- basically, a relocation +//! type -- and will usually correspond to different instruction formats. The +//! `LabelUse` implementation specifies the maximum range, how to patch in the +//! actual label location when known, and how to generate a veneer to extend the +//! range. +//! +//! That satisfies label references, but we still may have suboptimal branch +//!
patterns. To clean up the branches, we do a simple "peephole"-style +//! optimization on the fly. To do so, the emitter (e.g., `Inst::emit()`) +//! informs the buffer of branches in the code and, in the case of conditionals, +//! the code that would have been emitted to invert this branch's condition. We +//! track the "latest branches": these are branches that are contiguous up to +//! the current offset. (If any code is emitted after a branch, that branch or +//! run of contiguous branches is no longer "latest".) The latest branches are +//! those that we can edit by simply truncating the buffer and doing something +//! else instead. +//! +//! To optimize branches, we implement several simple rules, and try to apply +//! them to the "latest branches" when possible: +//! +//! - A branch with a label target, when that label is bound to the ending +//! offset of the branch (the fallthrough location), can be removed altogether, +//! because the branch would have no effect. +//! +//! - An unconditional branch that starts at a label location, and branches to +//! another label, results in a "label alias": all references to the label bound +//! *to* this branch instruction are instead resolved to the *target* of the +//! branch instruction. This effectively removes empty blocks that just +//! unconditionally branch to the next block. We call this "branch threading". +//! +//! - A conditional followed by an unconditional, when the conditional branches +//! to the unconditional's fallthrough, results in (i) the truncation of the +//! unconditional, (ii) the inversion of the conditional's condition, and (iii) +//! replacement of the conditional's target (using the original target of the +//! unconditional). This is a fancy way of saying "we can flip a two-target +//! conditional branch's taken/not-taken targets if it works better with our +//! fallthrough". To make this work, the emitter actually gives the buffer +//! *both* forms of every conditional branch: the true form is emitted into the +//! buffer, and the "inverted" machine-code bytes are provided as part of the +//! branch-fixup metadata. +//! +//! - An unconditional B preceded by another unconditional P, when B's label(s) have +//! been redirected to target(B), can be removed entirely. This is an extension +//! of the branch-threading optimization, and is valid because if we know there +//! will be no fallthrough into this branch instruction (the prior instruction +//! is an unconditional jump), and if we know we have successfully redirected +//! all labels, then this branch instruction is unreachable. Note that this +//! works because the redirection happens before the label is ever resolved +//! (fixups happen at island emission time, at which point latest-branches are +//! cleared, or at the end of emission), so we are sure to catch and redirect +//! all possible paths to this instruction. use crate::binemit::{Addend, CodeOffset, CodeSink, Reloc}; use crate::ir::{ExternalName, Opcode, SourceLoc, TrapCode}; use crate::machinst::{BlockIndex, MachInstLabelUse, VCodeInst}; -use log::debug; +use log::trace; use smallvec::SmallVec; use std::mem; @@ -35,10 +139,11 @@ pub struct MachBuffer { cur_srcloc: Option<(CodeOffset, SourceLoc)>, /// Known label offsets; `UNKNOWN_LABEL_OFFSET` if unknown. label_offsets: SmallVec<[CodeOffset; 16]>, - /// Label aliases: one label points to an unconditional jump to another - /// label, so references to the first should be resolved as references - /// to the second.
(We don't chase arbitrarily deep to avoid problems - /// with cycles.) + /// Label aliases: when one label points to an unconditional jump, and that + /// jump points to another label, we can redirect references to the first + /// label immediately to the second. (We don't chase arbitrarily deep to + /// avoid problems with cycles, but rather only one level, i.e. through one + /// jump.) label_aliases: SmallVec<[MachLabel; 16]>, /// Constants that must be emitted at some point. pending_constants: SmallVec<[MachLabelConstant; 16]>, @@ -129,13 +234,13 @@ impl MachBuffer { /// Add a byte. pub fn put1(&mut self, value: u8) { - debug!("MachBuffer: put byte @ {}: {:x}", self.cur_offset(), value); + trace!("MachBuffer: put byte @ {}: {:x}", self.cur_offset(), value); self.data.push(value); } /// Add 2 bytes. pub fn put2(&mut self, value: u16) { - debug!( + trace!( "MachBuffer: put 16-bit word @ {}: {:x}", self.cur_offset(), value @@ -146,7 +251,7 @@ impl MachBuffer { /// Add 4 bytes. pub fn put4(&mut self, value: u32) { - debug!( + trace!( "MachBuffer: put 32-bit word @ {}: {:x}", self.cur_offset(), value @@ -157,7 +262,7 @@ impl MachBuffer { /// Add 8 bytes. pub fn put8(&mut self, value: u64) { - debug!( + trace!( "MachBuffer: put 64-bit word @ {}: {:x}", self.cur_offset(), value @@ -168,7 +273,7 @@ impl MachBuffer { /// Add a slice of bytes. pub fn put_data(&mut self, data: &[u8]) { - debug!( + trace!( "MachBuffer: put data @ {}: len {}", self.cur_offset(), data.len() @@ -178,7 +283,7 @@ impl MachBuffer { /// Reserve appended space and return a mutable slice referring to it. pub fn get_appended_space(&mut self, len: usize) -> &mut [u8] { - debug!("MachBuffer: put data @ {}: len {}", self.cur_offset(), len); + trace!("MachBuffer: put data @ {}: len {}", self.cur_offset(), len); let off = self.data.len(); let new_len = self.data.len() + len; self.data.resize(new_len, 0); @@ -187,7 +292,7 @@ impl MachBuffer { /// Align up to the given alignment. pub fn align_to(&mut self, align_to: CodeOffset) { - debug!("MachBuffer: align to {}", align_to); + trace!("MachBuffer: align to {}", align_to); assert!(align_to.is_power_of_two()); while self.cur_offset() & (align_to - 1) != 0 { self.put1(0); @@ -200,13 +305,13 @@ impl MachBuffer { let l = self.label_offsets.len() as u32; self.label_offsets.push(UNKNOWN_LABEL_OFFSET); self.label_aliases.push(UNKNOWN_LABEL); - debug!("MachBuffer: new label -> {:?}", MachLabel(l)); + trace!("MachBuffer: new label -> {:?}", MachLabel(l)); MachLabel(l) } /// Reserve the first N MachLabels for blocks. pub fn reserve_labels_for_blocks(&mut self, blocks: BlockIndex) { - debug!("MachBuffer: first {} labels are for blocks", blocks); + trace!("MachBuffer: first {} labels are for blocks", blocks); debug_assert!(self.label_offsets.is_empty()); self.label_offsets .resize(blocks as usize, UNKNOWN_LABEL_OFFSET); @@ -215,7 +320,7 @@ impl MachBuffer { /// Bind a label to the current offset. pub fn bind_label(&mut self, label: MachLabel) { - debug!( + trace!( "MachBuffer: bind label {:?} at offset {}", label, self.cur_offset() @@ -244,9 +349,11 @@ impl MachBuffer { /// happen immediately, the buffer must already contain bytes at `offset` up /// to `offset + kind.patch_size()`. 
pub fn use_label_at_offset(&mut self, offset: CodeOffset, label: MachLabel, kind: I::LabelUse) { - debug!( + trace!( "MachBuffer: use_label_at_offset: offset {} label {:?} kind {:?}", - offset, label, kind + offset, + label, + kind ); debug_assert!(offset + kind.patch_size() <= self.cur_offset()); @@ -310,14 +417,15 @@ impl MachBuffer { self.data.truncate(b.start as usize); self.fixup_records.truncate(b.fixup); let cur_off = self.cur_offset(); - debug!( + trace!( "truncate_last_branch: truncated {:?}; off now {}", - b, cur_off + b, + cur_off ); for &mut (l, ref mut off) in self.labels_by_offset.iter_mut().rev() { if *off > cur_off { *off = cur_off; - debug!(" -> label {:?} reassigned to {}", l, cur_off); + trace!(" -> label {:?} reassigned to {}", l, cur_off); self.label_offsets[l.0 as usize] = cur_off; } else { break; @@ -326,13 +434,15 @@ impl MachBuffer { } fn optimize_branches(&mut self) { - debug!( + trace!( "enter optimize_branches:\n b = {:?}\n l = {:?}\n f = {:?}", - self.latest_branches, self.labels_by_offset, self.fixup_records + self.latest_branches, + self.labels_by_offset, + self.fixup_records ); while let Some(b) = self.latest_branches.last() { let cur_off = self.cur_offset(); - debug!("optimize_branches: last branch {:?} at off {}", b, cur_off); + trace!("optimize_branches: last branch {:?} at off {}", b, cur_off); // If there has been any code emission since the end of the last branch or // label definition, then there's nothing we can edit (because we // don't move code once placed, only back up and overwrite), so @@ -359,11 +469,11 @@ impl MachBuffer { // Set any label equal to current branch's start as an alias of // the branch's target. for &(l, off) in self.labels_by_offset.iter().rev() { - debug!(" -> uncond: latest label {:?} at off {}", l, off); + trace!(" -> uncond: latest label {:?} at off {}", l, off); if off > b.start { continue; } else if off == b.start { - debug!(" -> setting alias to {:?}", b.target); + trace!(" -> setting alias to {:?}", b.target); self.label_aliases[l.0 as usize] = b.target; } else { break; @@ -375,12 +485,12 @@ impl MachBuffer { // Examine any immediately preceding branch. if self.latest_branches.len() > 1 { let prev_b = &self.latest_branches[self.latest_branches.len() - 2]; - debug!(" -> more than one branch; prev_b = {:?}", prev_b); + trace!(" -> more than one branch; prev_b = {:?}", prev_b); // This uncond is immediately after another uncond; we've // already redirected labels to this uncond away; so we can // truncate this uncond. 
if prev_b.is_uncond() && prev_b.end == b.start { - debug!(" -> uncond follows another uncond; truncating"); + trace!(" -> uncond follows another uncond; truncating"); self.truncate_last_branch(); continue; } @@ -395,7 +505,7 @@ impl MachBuffer { && prev_b.end == b.start && self.resolve_label_offset(prev_b.target) == cur_off { - debug!(" -> uncond follows a conditional, and conditional's target resolves to current offset"); + trace!(" -> uncond follows a conditional, and conditional's target resolves to current offset"); let target = b.target; let data = prev_b.inverted.clone().unwrap(); self.truncate_last_branch(); @@ -407,7 +517,7 @@ impl MachBuffer { self.data.extend_from_slice(&data[..]); prev_b.inverted = Some(not_inverted); self.fixup_records[prev_b.fixup].label = target; - debug!(" -> reassigning target of condbr to {:?}", target); + trace!(" -> reassigning target of condbr to {:?}", target); prev_b.target = target; continue; } @@ -420,7 +530,7 @@ impl MachBuffer { // the current offset (end of branch) to the truncated // end-of-code. if self.resolve_label_offset(b.target) == cur_off { - debug!("branch with target == cur off; truncating"); + trace!("branch with target == cur off; truncating"); self.truncate_last_branch(); } @@ -430,9 +540,11 @@ impl MachBuffer { self.purge_latest_branches(); - debug!( + trace!( "leave optimize_branches:\n b = {:?}\n l = {:?}\n f = {:?}", - self.latest_branches, self.labels_by_offset, self.fixup_records + self.latest_branches, + self.labels_by_offset, + self.fixup_records ); } @@ -440,7 +552,7 @@ impl MachBuffer { let cur_off = self.cur_offset(); if let Some(l) = self.latest_branches.last() { if l.end < cur_off { - debug!("purge_latest_branches: removing branch {:?}", l); + trace!("purge_latest_branches: removing branch {:?}", l); self.latest_branches.clear(); } } @@ -498,9 +610,11 @@ impl MachBuffer { kind, } in fixup_records.into_iter() { - debug!( + trace!( "emit_island: fixup for label {:?} at offset {} kind {:?}", - label, offset, kind + label, + offset, + kind ); // We eagerly perform fixups whose label targets are known, if not out // of range, to avoid unnecessary veneers. @@ -516,7 +630,7 @@ impl MachBuffer { false }; - debug!( + trace!( " -> label_offset = {}, known = {}, in_range = {} (pos {} neg {})", label_offset, known, @@ -530,7 +644,7 @@ impl MachBuffer { if in_range { debug_assert!(known); // implied by in_range. let slice = &mut self.data[start..end]; - debug!("patching in-range!"); + trace!("patching in-range!"); kind.patch(slice, offset, label_offset); } else if !known && !kind.supports_veneer() { // Nothing for now. Keep it for next round. @@ -543,21 +657,23 @@ impl MachBuffer { // Allocate space for a veneer in the island. self.align_to(I::LabelUse::ALIGN); let veneer_offset = self.cur_offset(); - debug!("making a veneer at {}", veneer_offset); + trace!("making a veneer at {}", veneer_offset); let slice = &mut self.data[start..end]; // Patch the original label use to refer to teh veneer. - debug!( + trace!( "patching original at offset {} to veneer offset {}", - offset, veneer_offset + offset, + veneer_offset ); kind.patch(slice, offset, veneer_offset); // Generate the veneer. 
let veneer_slice = self.get_appended_space(kind.veneer_size() as usize); let (veneer_fixup_off, veneer_label_use) = kind.generate_veneer(veneer_slice, veneer_offset); - debug!( + trace!( "generated veneer; fixup offset {}, label_use {:?}", - veneer_fixup_off, veneer_label_use + veneer_fixup_off, + veneer_label_use ); // If the label is known (but was just out of range), do the // veneer label-use fixup now too; otherwise, save it for later. @@ -565,7 +681,7 @@ impl MachBuffer { let start = veneer_fixup_off as usize; let end = (veneer_fixup_off + veneer_label_use.patch_size()) as usize; let veneer_slice = &mut self.data[start..end]; - debug!("doing veneer fixup right away too"); + trace!("doing veneer fixup right away too"); veneer_label_use.patch(veneer_slice, veneer_fixup_off, label_offset); } else { new_fixups.push(MachLabelFixup { diff --git a/cranelift/codegen/src/machinst/lower.rs b/cranelift/codegen/src/machinst/lower.rs index fcbf3d2810..76663450ba 100644 --- a/cranelift/codegen/src/machinst/lower.rs +++ b/cranelift/codegen/src/machinst/lower.rs @@ -23,9 +23,9 @@ use alloc::vec::Vec; use log::debug; use smallvec::SmallVec; -/// An "instruction color" partitions instructions by side-effecting ops. All -/// instructions with the same "color" are guaranteed not to be separated by any -/// side-effecting op (for this purpose, loads are also considered +/// An "instruction color" partitions CLIF instructions by side-effecting ops. +/// All instructions with the same "color" are guaranteed not to be separated by +/// any side-effecting op (for this purpose, loads are also considered /// side-effecting, to avoid subtle questions w.r.t. the memory model), and /// furthermore, it is guaranteed that for any two instructions A and B such /// that color(A) == color(B), either A dominates B and B postdominates A, or @@ -33,7 +33,8 @@ use smallvec::SmallVec; /// have the same color, trivially providing the second condition.) Intuitively, /// this means that the ops of the same color must always execute "together", as /// part of one atomic contiguous section of the dynamic execution trace, and -/// they can be freely permuted without affecting program behavior. +/// they can be freely permuted (modulo true dataflow dependencies) without +/// affecting program behavior. #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] pub struct InstColor(u32); impl InstColor { @@ -122,7 +123,11 @@ pub trait LowerCtx { /// If the backend uses the register, rather than one of the other /// forms (constant or merging of the producing op), it must call /// `use_input_reg()` to ensure the producing inst is actually lowered - /// as well. + /// as well. Failing to do so may result in the instruction that generates + /// this value never being generated, thus resulting in incorrect execution. + /// For correctness, backends should thus wrap `get_input()` and + /// `use_input_reg()` with helpers that return a register only after + /// ensuring it is marked as used. fn get_input(&self, ir_inst: Inst, idx: usize) -> LowerInput; /// Get the `idx`th output register of the given IR instruction. When /// `backend.lower_inst_to_regs(ctx, inst)` is called, it is expected that @@ -133,7 +138,7 @@ pub trait LowerCtx { // ask for an input to be gen'd into a register. /// Get a new temp. - fn tmp(&mut self, rc: RegClass, ty: Type) -> Writable; + fn alloc_tmp(&mut self, rc: RegClass, ty: Type) -> Writable; /// Emit a machine instruction.
fn emit(&mut self, mach_inst: Self::I); /// Indicate that the given input uses the register returned by @@ -477,7 +482,7 @@ impl<'func, I: VCodeInst> Lower<'func, I> { // There's some overlap, so play safe and copy via temps. let mut tmp_regs: SmallVec<[Writable; 16]> = SmallVec::new(); for &ty in &phi_classes { - tmp_regs.push(self.tmp(I::rc_for_type(ty)?, ty)); + tmp_regs.push(self.alloc_tmp(I::rc_for_type(ty)?, ty)); } debug!("phi_temps = {:?}", tmp_regs); @@ -721,6 +726,9 @@ impl<'func, I: VCodeInst> Lower<'func, I> { Ok(vcode) } + /// Get the actual inputs for a value. This is the implementation for + /// `get_input()` but starting from the SSA value, which is not exposed to + /// the backend. fn get_input_for_val(&self, at_inst: Inst, val: Value) -> LowerInput { debug!("get_input_for_val: val {} at inst {}", val, at_inst); let mut reg = self.value_regs[val]; @@ -889,7 +897,7 @@ impl<'func, I: VCodeInst> LowerCtx for Lower<'func, I> { Writable::from_reg(self.value_regs[val]) } - fn tmp(&mut self, rc: RegClass, ty: Type) -> Writable { + fn alloc_tmp(&mut self, rc: RegClass, ty: Type) -> Writable { let v = self.next_vreg; self.next_vreg += 1; let vreg = Reg::new_virtual(rc, v);
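The module comment added to `machinst/buffer.rs` above centers on a "deadline" that drives island placement. The standalone sketch below is an illustration of that idea only, not Cranelift's `MachBuffer` API; `ToyBuffer`, `Fixup`, and `island_needed` are made-up names for this example. The check reduces to: every unresolved short-range label reference expires at `use_offset + max_range`, and an island (veneers plus a guard jump) must be emitted before the write cursor plus the next block's worst-case size would cross the earliest such expiry.

```rust
// Illustrative sketch only -- not Cranelift's MachBuffer. It models the
// "deadline" idea from the module comment: every unresolved short-range label
// reference must be resolved, or redirected to a longer-range veneer, before
// the write cursor passes `use_offset + max_range`.

#[derive(Clone, Copy, Debug)]
struct Fixup {
    use_offset: u32, // where the short-range reference was emitted
    label: usize,    // symbolic target, bound to an offset later
    max_range: u32,  // e.g. +/- 1 MB for an AArch64 conditional branch
}

struct ToyBuffer {
    code: Vec<u8>,
    label_offsets: Vec<Option<u32>>,
    fixups: Vec<Fixup>,
}

impl ToyBuffer {
    fn cur_offset(&self) -> u32 {
        self.code.len() as u32
    }

    // Earliest offset by which some still-unbound label reference goes out of range.
    fn deadline(&self) -> Option<u32> {
        self.fixups
            .iter()
            .filter(|f| self.label_offsets[f.label].is_none())
            .map(|f| f.use_offset + f.max_range)
            .min()
    }

    // Should an island be emitted before another `worst_case_size` bytes of
    // straight-line code?
    fn island_needed(&self, worst_case_size: u32) -> bool {
        match self.deadline() {
            Some(d) => self.cur_offset() + worst_case_size > d,
            None => false,
        }
    }
}

fn main() {
    // One pending conditional-branch fixup at offset 0 with a 1 MiB range.
    let buf = ToyBuffer {
        code: vec![0; 900_000],
        label_offsets: vec![None],
        fixups: vec![Fixup { use_offset: 0, label: 0, max_range: 1 << 20 }],
    };
    // Emitting another 200 KiB would cross the deadline, so an island with a
    // veneer must be emitted first.
    assert!(buf.island_needed(200_000));
}
```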
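Likewise, the "branch threading" rule described above (an unconditional branch that starts at a label becomes a label alias, chased only one level) can be pictured with a toy resolver. `Labels`, `resolve_offset`, and `NO_ALIAS` are illustrative stand-ins, not the buffer's real fields or methods.

```rust
// Illustrative sketch of one-level label aliasing ("branch threading"): when
// an unconditional branch starts exactly at a label and jumps to another
// label, references to the first label are redirected to the second. Aliases
// are chased only one level, so cycles cannot cause unbounded chasing.

const NO_ALIAS: usize = usize::MAX;

struct Labels {
    offsets: Vec<u32>,   // resolved code offset for each label
    aliases: Vec<usize>, // alias target for each label, or NO_ALIAS
}

impl Labels {
    // Chase at most one alias hop, mirroring the comment in the patch:
    // "We don't chase arbitrarily deep ... only one level, i.e. through one jump."
    fn resolve_offset(&self, label: usize) -> u32 {
        let target = if self.aliases[label] != NO_ALIAS {
            self.aliases[label]
        } else {
            label
        };
        self.offsets[target]
    }
}

fn main() {
    // Label 0 was bound at the start of an empty block whose only content was
    // an unconditional jump to label 1; the buffer records the alias 0 -> 1
    // and can then truncate the jump itself.
    let labels = Labels {
        offsets: vec![0x10, 0x40],
        aliases: vec![1, NO_ALIAS],
    };
    assert_eq!(labels.resolve_offset(0), 0x40);
}
```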