diff --git a/cranelift/codegen/src/context.rs b/cranelift/codegen/src/context.rs index 2c4ce6b492..e5d11c6963 100644 --- a/cranelift/codegen/src/context.rs +++ b/cranelift/codegen/src/context.rs @@ -227,7 +227,7 @@ impl Context { let _tt = timing::binemit(); let mut sink = MemoryCodeSink::new(mem, relocs, traps, stackmaps); if let Some(ref result) = &self.mach_compile_result { - result.sections.emit(&mut sink); + result.buffer.emit(&mut sink); } else { isa.emit_function_to_memory(&self.func, &mut sink); } diff --git a/cranelift/codegen/src/inst_predicates.rs b/cranelift/codegen/src/inst_predicates.rs index 9cefbc38f9..f0d6fdf6b5 100644 --- a/cranelift/codegen/src/inst_predicates.rs +++ b/cranelift/codegen/src/inst_predicates.rs @@ -40,3 +40,24 @@ pub fn has_side_effect(func: &Function, inst: Inst) -> bool { let opcode = data.opcode(); trivially_has_side_effects(opcode) || is_load_with_defined_trapping(opcode, data) } + +/// Does the given instruction have any side-effect as per [has_side_effect], or else is a load? +pub fn has_side_effect_or_load(func: &Function, inst: Inst) -> bool { + has_side_effect(func, inst) || func.dfg[inst].opcode().can_load() +} + +/// Is the given instruction a constant value (`iconst`, `fconst`, `bconst`) that can be +/// represented in 64 bits? +pub fn is_constant_64bit(func: &Function, inst: Inst) -> Option { + let data = &func.dfg[inst]; + if data.opcode() == Opcode::Null { + return Some(0); + } + match data { + &InstructionData::UnaryImm { imm, .. } => Some(imm.bits() as u64), + &InstructionData::UnaryIeee32 { imm, .. } => Some(imm.bits() as u64), + &InstructionData::UnaryIeee64 { imm, .. } => Some(imm.bits()), + &InstructionData::UnaryBool { imm, .. } => Some(if imm { 1 } else { 0 }), + _ => None, + } +} diff --git a/cranelift/codegen/src/isa/aarch64/abi.rs b/cranelift/codegen/src/isa/aarch64/abi.rs index d90c23421e..8f388665b5 100644 --- a/cranelift/codegen/src/isa/aarch64/abi.rs +++ b/cranelift/codegen/src/isa/aarch64/abi.rs @@ -504,7 +504,7 @@ impl AArch64ABIBody { rn: stack_reg(), rm: stack_limit, }); - insts.push(Inst::CondBrLowered { + insts.push(Inst::OneWayCondBr { target: BranchTarget::ResolvedOffset(8), // Here `Hs` == "higher or same" when interpreting the two // operands as unsigned integers. diff --git a/cranelift/codegen/src/isa/aarch64/inst/args.rs b/cranelift/codegen/src/isa/aarch64/inst/args.rs index 8eb3b9b02a..4b8142fbe5 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/args.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/args.rs @@ -3,14 +3,14 @@ // Some variants are never constructed, but we still want them as options in the future. #![allow(dead_code)] -use crate::binemit::CodeOffset; use crate::ir::Type; use crate::isa::aarch64::inst::*; use crate::isa::aarch64::lower::ty_bits; +use crate::machinst::MachLabel; use regalloc::{RealRegUniverse, Reg, Writable}; -use core::convert::{Into, TryFrom}; +use core::convert::Into; use std::string::String; /// A shift operator for a register or immediate. @@ -303,78 +303,44 @@ impl CondBrKind { /// A branch target. Either unresolved (basic-block index) or resolved (offset /// from end of current instruction). -#[derive(Clone, Copy, Debug)] +#[derive(Clone, Copy, Debug, PartialEq, Eq)] pub enum BranchTarget { - /// An unresolved reference to a BlockIndex, as passed into + /// An unresolved reference to a Label, as passed into /// `lower_branch_group()`. - Block(BlockIndex), - /// A resolved reference to another instruction, after - /// `Inst::with_block_offsets()`. 
+ Label(MachLabel), + /// A fixed PC offset. ResolvedOffset(isize), } impl BranchTarget { - /// Lower the branch target given offsets of each block. - pub fn lower(&mut self, targets: &[CodeOffset], my_offset: CodeOffset) { + /// Return the target's label, if it is a label-based target. + pub fn as_label(self) -> Option { match self { - &mut BranchTarget::Block(bix) => { - let bix = usize::try_from(bix).unwrap(); - assert!(bix < targets.len()); - let block_offset_in_func = targets[bix]; - let branch_offset = (block_offset_in_func as isize) - (my_offset as isize); - *self = BranchTarget::ResolvedOffset(branch_offset); - } - &mut BranchTarget::ResolvedOffset(..) => {} - } - } - - /// Get the block index. - pub fn as_block_index(&self) -> Option { - match self { - &BranchTarget::Block(bix) => Some(bix), + BranchTarget::Label(l) => Some(l), _ => None, } } - /// Get the offset as 4-byte words. Returns `0` if not - /// yet resolved (in that case, we're only computing - /// size and the offset doesn't matter). - pub fn as_offset_words(&self) -> isize { - match self { - &BranchTarget::ResolvedOffset(off) => off >> 2, + /// Return the target's offset, if specified, or zero if label-based. + pub fn as_offset19_or_zero(self) -> u32 { + let off = match self { + BranchTarget::ResolvedOffset(off) => off >> 2, _ => 0, - } + }; + assert!(off <= 0x3ffff); + assert!(off >= -0x40000); + (off as u32) & 0x7ffff } - /// Get the offset as a 26-bit offset suitable for a 26-bit jump, or `None` if overflow. - pub fn as_off26(&self) -> Option { - let off = self.as_offset_words(); - if (off < (1 << 25)) && (off >= -(1 << 25)) { - Some((off as u32) & ((1 << 26) - 1)) - } else { - None - } - } - - /// Get the offset as a 19-bit offset, or `None` if overflow. - pub fn as_off19(&self) -> Option { - let off = self.as_offset_words(); - if (off < (1 << 18)) && (off >= -(1 << 18)) { - Some((off as u32) & ((1 << 19) - 1)) - } else { - None - } - } - - /// Map the block index given a transform map. - pub fn map(&mut self, block_index_map: &[BlockIndex]) { - match self { - &mut BranchTarget::Block(ref mut bix) => { - let n = block_index_map[usize::try_from(*bix).unwrap()]; - *bix = n; - } - &mut BranchTarget::ResolvedOffset(_) => {} - } + /// Return the target's offset, if specified, or zero if label-based. 
+ pub fn as_offset26_or_zero(self) -> u32 { + let off = match self { + BranchTarget::ResolvedOffset(off) => off >> 2, + _ => 0, + }; + assert!(off <= 0x1ffffff); + assert!(off >= -0x2000000); + (off as u32) & 0x3ffffff } } @@ -507,7 +473,7 @@ impl ShowWithRRU for Cond { impl ShowWithRRU for BranchTarget { fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String { match self { - &BranchTarget::Block(block) => format!("block{}", block), + &BranchTarget::Label(label) => format!("label{:?}", label.get()), &BranchTarget::ResolvedOffset(off) => format!("{}", off), } } diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs index 5a9f9fef59..2d5ecd406d 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs @@ -4,7 +4,7 @@ use crate::binemit::{CodeOffset, Reloc}; use crate::ir::constant::ConstantData; use crate::ir::types::*; use crate::ir::TrapCode; -use crate::isa::aarch64::{inst::regs::PINNED_REG, inst::*}; +use crate::isa::aarch64::inst::*; use regalloc::{Reg, RegClass, Writable}; @@ -149,6 +149,14 @@ fn enc_cbr(op_31_24: u32, off_18_0: u32, op_4: u32, cond: u32) -> u32 { (op_31_24 << 24) | (off_18_0 << 5) | (op_4 << 4) | cond } +fn enc_conditional_br(taken: BranchTarget, kind: CondBrKind) -> u32 { + match kind { + CondBrKind::Zero(reg) => enc_cmpbr(0b1_011010_0, taken.as_offset19_or_zero(), reg), + CondBrKind::NotZero(reg) => enc_cmpbr(0b1_011010_1, taken.as_offset19_or_zero(), reg), + CondBrKind::Cond(c) => enc_cbr(0b01010100, taken.as_offset19_or_zero(), 0b0, c.bits()), + } +} + const MOVE_WIDE_FIXED: u32 = 0x92800000; #[repr(u32)] @@ -340,10 +348,10 @@ pub struct EmitState { virtual_sp_offset: i64, } -impl MachInstEmit for Inst { +impl MachInstEmit for Inst { type State = EmitState; - fn emit(&self, sink: &mut O, flags: &settings::Flags, state: &mut EmitState) { + fn emit(&self, sink: &mut MachBuffer, flags: &settings::Flags, state: &mut EmitState) { match self { &Inst::AluRRR { alu_op, rd, rn, rm } => { let top11 = match alu_op { @@ -616,7 +624,7 @@ impl MachInstEmit for Inst { ref mem, srcloc, } => { - let (mem_insts, mem) = mem_finalize(sink.cur_offset_from_start(), mem, state); + let (mem_insts, mem) = mem_finalize(sink.cur_offset(), mem, state); for inst in mem_insts.into_iter() { inst.emit(sink, flags, state); @@ -759,7 +767,7 @@ impl MachInstEmit for Inst { ref mem, srcloc, } => { - let (mem_insts, mem) = mem_finalize(sink.cur_offset_from_start(), mem, state); + let (mem_insts, mem) = mem_finalize(sink.cur_offset(), mem, state); for inst in mem_insts.into_iter() { inst.emit(sink, flags, state); @@ -1147,10 +1155,18 @@ impl MachInstEmit for Inst { panic!("Unsupported extend variant"); } &Inst::Jump { ref dest } => { - // TODO: differentiate between as_off26() returning `None` for - // out-of-range vs. not-yet-finalized. The latter happens when we - // do early (fake) emission for size computation. - sink.put4(enc_jump26(0b000101, dest.as_off26().unwrap())); + let off = sink.cur_offset(); + // Emit the jump itself. + sink.put4(enc_jump26(0b000101, dest.as_offset26_or_zero())); + // After the jump has been emitted, indicate that it uses a + // label, if so, so that a fixup can occur later. This happens + // after we emit the bytes because the fixup might occur right + // away (so the bytes must actually exist now). 
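// Illustrative aside -- a toy sketch, not the real MachBuffer API (the names
// `ToyBuffer`, `label_offsets`, etc. are invented for exposition): why the
// branch word has to be emitted *before* `use_label_at_offset` is called. If
// the label is already bound, the fixup is applied right away and must be
// able to rewrite the bytes that were just put into the buffer.
struct ToyBuffer {
    data: Vec<u8>,
    // Bound code offset for each label index, if known yet.
    label_offsets: Vec<Option<u32>>,
}

impl ToyBuffer {
    fn use_label_at_offset_branch26(&mut self, off: u32, label: usize) {
        if let Some(target) = self.label_offsets[label] {
            // The label is already bound, so the fixup runs immediately and
            // rewrites the 26-bit word-offset field of the `b` at `off` --
            // which is why those four bytes must already be in `data`.
            let rel_words = (((target as i64 - off as i64) >> 2) as u32) & 0x03ff_ffff;
            let i = off as usize;
            let w = u32::from_le_bytes([
                self.data[i],
                self.data[i + 1],
                self.data[i + 2],
                self.data[i + 3],
            ]);
            let patched = (w & !0x03ff_ffff) | rel_words;
            self.data[i..i + 4].copy_from_slice(&patched.to_le_bytes());
        }
        // Otherwise a fixup record is kept and applied when the label is
        // bound (or redirected through a veneer if it ends up out of range).
    }
}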
+ if let Some(l) = dest.as_label() { + sink.use_label_at_offset(off, l, LabelUse::Branch26); + let cur_off = sink.cur_offset(); + sink.add_uncond_branch(off, cur_off, l); + } } &Inst::Ret => { sink.put4(0xd65f03c0); @@ -1178,51 +1194,35 @@ impl MachInstEmit for Inst { sink.add_call_site(loc, opcode); } } - &Inst::CondBr { .. } => panic!("Unlowered CondBr during binemit!"), - &Inst::CondBrLowered { target, kind } => match kind { - // TODO: handle >2^19 case by emitting a compound sequence with - // an unconditional (26-bit) branch. We need branch-relaxation - // adjustment machinery to enable this (because we don't want to - // always emit the long form). - CondBrKind::Zero(reg) => { - sink.put4(enc_cmpbr(0b1_011010_0, target.as_off19().unwrap(), reg)); - } - CondBrKind::NotZero(reg) => { - sink.put4(enc_cmpbr(0b1_011010_1, target.as_off19().unwrap(), reg)); - } - CondBrKind::Cond(c) => { - sink.put4(enc_cbr( - 0b01010100, - target.as_off19().unwrap_or(0), - 0b0, - c.bits(), - )); - } - }, - &Inst::CondBrLoweredCompound { + &Inst::CondBr { taken, not_taken, kind, } => { // Conditional part first. - match kind { - CondBrKind::Zero(reg) => { - sink.put4(enc_cmpbr(0b1_011010_0, taken.as_off19().unwrap(), reg)); - } - CondBrKind::NotZero(reg) => { - sink.put4(enc_cmpbr(0b1_011010_1, taken.as_off19().unwrap(), reg)); - } - CondBrKind::Cond(c) => { - sink.put4(enc_cbr( - 0b01010100, - taken.as_off19().unwrap_or(0), - 0b0, - c.bits(), - )); - } + let cond_off = sink.cur_offset(); + sink.put4(enc_conditional_br(taken, kind)); + if let Some(l) = taken.as_label() { + sink.use_label_at_offset(cond_off, l, LabelUse::Branch19); + let cur_off = sink.cur_offset(); + let inverted = enc_conditional_br(taken, kind.invert()).to_le_bytes(); + sink.add_cond_branch(cond_off, cur_off, l, &inverted[..]); } // Unconditional part. - sink.put4(enc_jump26(0b000101, not_taken.as_off26().unwrap_or(0))); + let uncond_off = sink.cur_offset(); + sink.put4(enc_jump26(0b000101, not_taken.as_offset26_or_zero())); + if let Some(l) = not_taken.as_label() { + sink.use_label_at_offset(uncond_off, l, LabelUse::Branch26); + let cur_off = sink.cur_offset(); + sink.add_uncond_branch(uncond_off, cur_off, l); + } + } + &Inst::OneWayCondBr { target, kind } => { + let off = sink.cur_offset(); + sink.put4(enc_conditional_br(target, kind)); + if let Some(l) = target.as_label() { + sink.use_label_at_offset(off, l, LabelUse::Branch19); + } } &Inst::IndirectBr { rn, .. } => { sink.put4(enc_br(rn)); @@ -1239,8 +1239,7 @@ impl MachInstEmit for Inst { sink.add_trap(srcloc, code); sink.put4(0xd4a00000); } - &Inst::Adr { rd, ref label } => { - let off = memlabel_finalize(sink.cur_offset_from_start(), label); + &Inst::Adr { rd, off } => { assert!(off > -(1 << 20)); assert!(off < (1 << 20)); sink.put4(enc_adr(off, rd)); @@ -1261,19 +1260,13 @@ impl MachInstEmit for Inst { // This sequence is *one* instruction in the vcode, and is expanded only here at // emission time, because we cannot allow the regalloc to insert spills/reloads in // the middle; we depend on hardcoded PC-rel addressing below. - // - // N.B.: if PC-rel addressing on ADR below is changed, also update - // `Inst::with_block_offsets()` in aarch64/inst/mod.rs. // Save index in a tmp (the live range of ridx only goes to start of this // sequence; rtmp1 or rtmp2 may overwrite it). 
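// Illustrative arithmetic only (the helper below is hypothetical, not part of
// the patch): how each jump-table word ends up holding `target - jt_base`.
// The emission loop further below writes `entry_addr - jt_base` into the
// word, and the `LabelUse::PCRel32` fixup defined later in this patch *adds*
// `target - entry_addr` to it, so the finished word is `target - jt_base`.
// At run time, ADR materializes the table base and the sign-extended word is
// added back to it to form the branch target.
fn jump_table_entry(jt_base: u32, entry_addr: u32, target: u32) -> i32 {
    let stored_at_emit = entry_addr.wrapping_sub(jt_base) as i32;
    let pcrel32_fixup = target.wrapping_sub(entry_addr) as i32;
    // == target.wrapping_sub(jt_base) as i32
    stored_at_emit.wrapping_add(pcrel32_fixup)
}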
let inst = Inst::gen_move(rtmp2, ridx, I64); inst.emit(sink, flags, state); // Load address of jump table - let inst = Inst::Adr { - rd: rtmp1, - label: MemLabel::PCRel(16), - }; + let inst = Inst::Adr { rd: rtmp1, off: 16 }; inst.emit(sink, flags, state); // Load value out of jump table let inst = Inst::SLoad32 { @@ -1303,12 +1296,16 @@ impl MachInstEmit for Inst { }; inst.emit(sink, flags, state); // Emit jump table (table of 32-bit offsets). - for target in targets.iter() { - let off = target.as_offset_words() * 4; - let off = i32::try_from(off).unwrap(); - // cast i32 to u32 (two's-complement) - let off = off as u32; - sink.put4(off); + let jt_off = sink.cur_offset(); + for &target in targets.iter() { + let word_off = sink.cur_offset(); + let off_into_table = word_off - jt_off; + sink.put4(off_into_table); + sink.use_label_at_offset( + word_off, + target.as_label().unwrap(), + LabelUse::PCRel32, + ); } } &Inst::LoadConst64 { rd, const_data } => { @@ -1348,7 +1345,7 @@ impl MachInstEmit for Inst { } } &Inst::LoadAddr { rd, ref mem } => { - let (mem_insts, mem) = mem_finalize(sink.cur_offset_from_start(), mem, state); + let (mem_insts, mem) = mem_finalize(sink.cur_offset(), mem, state); for inst in mem_insts.into_iter() { inst.emit(sink, flags, state); } @@ -1401,20 +1398,6 @@ impl MachInstEmit for Inst { add.emit(sink, flags, state); } } - &Inst::GetPinnedReg { rd } => { - let inst = Inst::Mov { - rd, - rm: xreg(PINNED_REG), - }; - inst.emit(sink, flags, state); - } - &Inst::SetPinnedReg { rm } => { - let inst = Inst::Mov { - rd: Writable::from_reg(xreg(PINNED_REG)), - rm, - }; - inst.emit(sink, flags, state); - } &Inst::VirtualSPOffsetAdj { offset } => { debug!( "virtual sp offset adjusted by {} -> {}", @@ -1423,6 +1406,17 @@ impl MachInstEmit for Inst { ); state.virtual_sp_offset += offset; } + &Inst::EmitIsland { needed_space } => { + if sink.island_needed(needed_space + 4) { + let jump_around_label = sink.get_label(); + let jmp = Inst::Jump { + dest: BranchTarget::Label(jump_around_label), + }; + jmp.emit(sink, flags, state); + sink.emit_island(); + sink.bind_label(jump_around_label); + } + } } } } diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs index f98d3c6b00..55977796ce 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs @@ -1956,7 +1956,7 @@ fn test_aarch64_binemit() { )); insns.push(( - Inst::CondBrLowered { + Inst::OneWayCondBr { target: BranchTarget::ResolvedOffset(64), kind: CondBrKind::Zero(xreg(8)), }, @@ -1964,7 +1964,7 @@ fn test_aarch64_binemit() { "cbz x8, 64", )); insns.push(( - Inst::CondBrLowered { + Inst::OneWayCondBr { target: BranchTarget::ResolvedOffset(64), kind: CondBrKind::NotZero(xreg(8)), }, @@ -1972,7 +1972,7 @@ fn test_aarch64_binemit() { "cbnz x8, 64", )); insns.push(( - Inst::CondBrLowered { + Inst::OneWayCondBr { target: BranchTarget::ResolvedOffset(64), kind: CondBrKind::Cond(Cond::Eq), }, @@ -1980,7 +1980,7 @@ fn test_aarch64_binemit() { "b.eq 64", )); insns.push(( - Inst::CondBrLowered { + Inst::OneWayCondBr { target: BranchTarget::ResolvedOffset(64), kind: CondBrKind::Cond(Cond::Ne), }, @@ -1989,7 +1989,7 @@ fn test_aarch64_binemit() { )); insns.push(( - Inst::CondBrLowered { + Inst::OneWayCondBr { target: BranchTarget::ResolvedOffset(64), kind: CondBrKind::Cond(Cond::Hs), }, @@ -1997,7 +1997,7 @@ fn test_aarch64_binemit() { "b.hs 64", )); insns.push(( - Inst::CondBrLowered { + Inst::OneWayCondBr { 
target: BranchTarget::ResolvedOffset(64), kind: CondBrKind::Cond(Cond::Lo), }, @@ -2005,7 +2005,7 @@ fn test_aarch64_binemit() { "b.lo 64", )); insns.push(( - Inst::CondBrLowered { + Inst::OneWayCondBr { target: BranchTarget::ResolvedOffset(64), kind: CondBrKind::Cond(Cond::Mi), }, @@ -2013,7 +2013,7 @@ fn test_aarch64_binemit() { "b.mi 64", )); insns.push(( - Inst::CondBrLowered { + Inst::OneWayCondBr { target: BranchTarget::ResolvedOffset(64), kind: CondBrKind::Cond(Cond::Pl), }, @@ -2021,7 +2021,7 @@ fn test_aarch64_binemit() { "b.pl 64", )); insns.push(( - Inst::CondBrLowered { + Inst::OneWayCondBr { target: BranchTarget::ResolvedOffset(64), kind: CondBrKind::Cond(Cond::Vs), }, @@ -2029,7 +2029,7 @@ fn test_aarch64_binemit() { "b.vs 64", )); insns.push(( - Inst::CondBrLowered { + Inst::OneWayCondBr { target: BranchTarget::ResolvedOffset(64), kind: CondBrKind::Cond(Cond::Vc), }, @@ -2037,7 +2037,7 @@ fn test_aarch64_binemit() { "b.vc 64", )); insns.push(( - Inst::CondBrLowered { + Inst::OneWayCondBr { target: BranchTarget::ResolvedOffset(64), kind: CondBrKind::Cond(Cond::Hi), }, @@ -2045,7 +2045,7 @@ fn test_aarch64_binemit() { "b.hi 64", )); insns.push(( - Inst::CondBrLowered { + Inst::OneWayCondBr { target: BranchTarget::ResolvedOffset(64), kind: CondBrKind::Cond(Cond::Ls), }, @@ -2053,7 +2053,7 @@ fn test_aarch64_binemit() { "b.ls 64", )); insns.push(( - Inst::CondBrLowered { + Inst::OneWayCondBr { target: BranchTarget::ResolvedOffset(64), kind: CondBrKind::Cond(Cond::Ge), }, @@ -2061,7 +2061,7 @@ fn test_aarch64_binemit() { "b.ge 64", )); insns.push(( - Inst::CondBrLowered { + Inst::OneWayCondBr { target: BranchTarget::ResolvedOffset(64), kind: CondBrKind::Cond(Cond::Lt), }, @@ -2069,7 +2069,7 @@ fn test_aarch64_binemit() { "b.lt 64", )); insns.push(( - Inst::CondBrLowered { + Inst::OneWayCondBr { target: BranchTarget::ResolvedOffset(64), kind: CondBrKind::Cond(Cond::Gt), }, @@ -2077,7 +2077,7 @@ fn test_aarch64_binemit() { "b.gt 64", )); insns.push(( - Inst::CondBrLowered { + Inst::OneWayCondBr { target: BranchTarget::ResolvedOffset(64), kind: CondBrKind::Cond(Cond::Le), }, @@ -2085,7 +2085,7 @@ fn test_aarch64_binemit() { "b.le 64", )); insns.push(( - Inst::CondBrLowered { + Inst::OneWayCondBr { target: BranchTarget::ResolvedOffset(64), kind: CondBrKind::Cond(Cond::Al), }, @@ -2093,7 +2093,7 @@ fn test_aarch64_binemit() { "b.al 64", )); insns.push(( - Inst::CondBrLowered { + Inst::OneWayCondBr { target: BranchTarget::ResolvedOffset(64), kind: CondBrKind::Cond(Cond::Nv), }, @@ -2102,7 +2102,7 @@ fn test_aarch64_binemit() { )); insns.push(( - Inst::CondBrLoweredCompound { + Inst::CondBr { taken: BranchTarget::ResolvedOffset(64), not_taken: BranchTarget::ResolvedOffset(128), kind: CondBrKind::Cond(Cond::Le), @@ -2138,7 +2138,7 @@ fn test_aarch64_binemit() { insns.push(( Inst::IndirectBr { rn: xreg(3), - targets: vec![1, 2, 3], + targets: vec![], }, "60001FD6", "br x3", @@ -2149,7 +2149,7 @@ fn test_aarch64_binemit() { insns.push(( Inst::Adr { rd: writable_xreg(15), - label: MemLabel::PCRel((1 << 20) - 4), + off: (1 << 20) - 4, }, "EFFF7F10", "adr x15, pc+1048572", @@ -2792,19 +2792,11 @@ fn test_aarch64_binemit() { let actual_printing = insn.show_rru(Some(&rru)); assert_eq!(expected_printing, actual_printing); - // Check the encoding is as expected. 
- let text_size = { - let mut code_sec = MachSectionSize::new(0); - insn.emit(&mut code_sec, &flags, &mut Default::default()); - code_sec.size() - }; - let mut sink = test_utils::TestCodeSink::new(); - let mut sections = MachSections::new(); - let code_idx = sections.add_section(0, text_size); - let code_sec = sections.get_section(code_idx); - insn.emit(code_sec, &flags, &mut Default::default()); - sections.emit(&mut sink); + let mut buffer = MachBuffer::new(); + insn.emit(&mut buffer, &flags, &mut Default::default()); + let buffer = buffer.finish(); + buffer.emit(&mut sink); let actual_encoding = &sink.stringify(); assert_eq!(expected_encoding, actual_encoding); } diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs index 3e4247ac14..714ba1eb4d 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs @@ -645,35 +645,28 @@ pub enum Inst { dest: BranchTarget, }, - /// A conditional branch. + /// A conditional branch. Contains two targets; at emission time, both are emitted, but + /// the MachBuffer knows to truncate the trailing branch if fallthrough. We optimize the + /// choice of taken/not_taken (inverting the branch polarity as needed) based on the + /// fallthrough at the time of lowering. CondBr { taken: BranchTarget, not_taken: BranchTarget, kind: CondBrKind, }, - /// Lowered conditional branch: contains the original branch kind (or the - /// inverse), but only one BranchTarget is retained. The other is - /// implicitly the next instruction, given the final basic-block layout. - CondBrLowered { + /// A one-way conditional branch, invisible to the CFG processing; used *only* as part of + /// straight-line sequences in code to be emitted. + OneWayCondBr { target: BranchTarget, kind: CondBrKind, }, - /// As for `CondBrLowered`, but represents a condbr/uncond-br sequence (two - /// actual machine instructions). Needed when the final block layout implies - /// that neither arm of a conditional branch targets the fallthrough block. - CondBrLoweredCompound { - taken: BranchTarget, - not_taken: BranchTarget, - kind: CondBrKind, - }, - /// An indirect branch through a register, augmented with set of all /// possible successors. IndirectBr { rn: Reg, - targets: Vec, + targets: Vec, }, /// A "break" instruction, used for e.g. traps and debug breakpoints. @@ -685,11 +678,14 @@ pub enum Inst { trap_info: (SourceLoc, TrapCode), }, - /// Load the address (using a PC-relative offset) of a MemLabel, using the - /// `ADR` instruction. + /// Load the address (using a PC-relative offset) of a memory location, using the `ADR` + /// instruction. Note that we take a simple offset, not a `MemLabel`, here, because `Adr` is + /// only used for now in fixed lowering sequences with hardcoded offsets. In the future we may + /// need full `MemLabel` support. Adr { rd: Writable, - label: MemLabel, + /// Offset in range -2^20 .. 2^20. + off: i32, }, /// Raw 32-bit word, used for inline constants and jump-table entries. @@ -706,7 +702,7 @@ pub enum Inst { /// for rationale). JTSequence { targets: Box<[BranchTarget]>, - targets_for_term: Box<[BlockIndex]>, // needed for MachTerminator. + targets_for_term: Box<[MachLabel]>, // needed for MachTerminator. ridx: Reg, rtmp1: Writable, rtmp2: Writable, @@ -732,21 +728,19 @@ pub enum Inst { mem: MemArg, }, - /// Sets the value of the pinned register to the given register target. 
- GetPinnedReg { - rd: Writable, - }, - - /// Writes the value of the given source register to the pinned register. - SetPinnedReg { - rm: Reg, - }, - /// Marker, no-op in generated code: SP "virtual offset" is adjusted. This /// controls MemArg::NominalSPOffset args are lowered. VirtualSPOffsetAdj { offset: i64, }, + + /// Meta-insn, no-op in generated code: emit constant/branch veneer island at this point (with + /// a guard jump around it) if less than the needed space is available before the next branch + /// deadline. + EmitIsland { + /// The needed space before the next deadline. + needed_space: CodeOffset, + }, } fn count_zero_half_words(mut value: u64) -> usize { @@ -1111,9 +1105,7 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) { collector.add_defs(&*defs); collector.add_use(rn); } - &Inst::CondBr { ref kind, .. } - | &Inst::CondBrLowered { ref kind, .. } - | &Inst::CondBrLoweredCompound { ref kind, .. } => match kind { + &Inst::CondBr { ref kind, .. } | &Inst::OneWayCondBr { ref kind, .. } => match kind { CondBrKind::Zero(rt) | CondBrKind::NotZero(rt) => { collector.add_use(*rt); } @@ -1142,13 +1134,8 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) { &Inst::LoadAddr { rd, mem: _ } => { collector.add_def(rd); } - &Inst::GetPinnedReg { rd } => { - collector.add_def(rd); - } - &Inst::SetPinnedReg { rm } => { - collector.add_use(rm); - } &Inst::VirtualSPOffsetAdj { .. } => {} + &Inst::EmitIsland { .. } => {} } } @@ -1676,13 +1663,7 @@ fn aarch64_map_regs(inst: &mut Inst, mapper: &RegUsageMapper) { *defs = Box::new(new_defs); map_use(mapper, rn); } - &mut Inst::CondBr { ref mut kind, .. } => { - map_br(mapper, kind); - } - &mut Inst::CondBrLowered { ref mut kind, .. } => { - map_br(mapper, kind); - } - &mut Inst::CondBrLoweredCompound { ref mut kind, .. } => { + &mut Inst::CondBr { ref mut kind, .. } | &mut Inst::OneWayCondBr { ref mut kind, .. } => { map_br(mapper, kind); } &mut Inst::IndirectBr { ref mut rn, .. } => { @@ -1716,13 +1697,8 @@ fn aarch64_map_regs(inst: &mut Inst, mapper: &RegUsageMapper) { map_def(mapper, rd); map_mem(mapper, mem); } - &mut Inst::GetPinnedReg { ref mut rd } => { - map_def(mapper, rd); - } - &mut Inst::SetPinnedReg { ref mut rm } => { - map_use(mapper, rm); - } &mut Inst::VirtualSPOffsetAdj { .. } => {} + &mut Inst::EmitIsland { .. } => {} } } @@ -1730,6 +1706,8 @@ fn aarch64_map_regs(inst: &mut Inst, mapper: &RegUsageMapper) { // Instructions: misc functions and external interface impl MachInst for Inst { + type LabelUse = LabelUse; + fn get_regs(&self, collector: &mut RegUsageCollector) { aarch64_get_regs(self, collector) } @@ -1757,24 +1735,14 @@ impl MachInst for Inst { fn is_term<'a>(&'a self) -> MachTerminator<'a> { match self { &Inst::Ret | &Inst::EpiloguePlaceholder => MachTerminator::Ret, - &Inst::Jump { dest } => MachTerminator::Uncond(dest.as_block_index().unwrap()), + &Inst::Jump { dest } => MachTerminator::Uncond(dest.as_label().unwrap()), &Inst::CondBr { taken, not_taken, .. - } => MachTerminator::Cond( - taken.as_block_index().unwrap(), - not_taken.as_block_index().unwrap(), - ), - &Inst::CondBrLowered { .. } => { - // When this is used prior to branch finalization for branches - // within an open-coded sequence, i.e. with ResolvedOffsets, - // do not consider it a terminator. From the point of view of CFG analysis, - // it is part of a black-box single-in single-out region, hence is not - // denoted a terminator. 
+ } => MachTerminator::Cond(taken.as_label().unwrap(), not_taken.as_label().unwrap()), + &Inst::OneWayCondBr { .. } => { + // Explicitly invisible to CFG processing. MachTerminator::None } - &Inst::CondBrLoweredCompound { .. } => { - panic!("is_term() called after lowering branches"); - } &Inst::IndirectBr { ref targets, .. } => MachTerminator::Indirect(&targets[..]), &Inst::JTSequence { ref targets_for_term, @@ -1789,6 +1757,23 @@ impl MachInst for Inst { Inst::mov(to_reg, from_reg) } + fn gen_constant(to_reg: Writable, value: u64, ty: Type) -> SmallVec<[Inst; 4]> { + if ty == F64 { + let mut ret = SmallVec::new(); + ret.push(Inst::load_fp_constant64(to_reg, f64::from_bits(value))); + ret + } else if ty == F32 { + let mut ret = SmallVec::new(); + ret.push(Inst::load_fp_constant32( + to_reg, + f32::from_bits(value as u32), + )); + ret + } else { + Inst::load_constant(to_reg, value) + } + } + fn gen_zero_len_nop() -> Inst { Inst::Nop0 } @@ -1815,101 +1800,25 @@ impl MachInst for Inst { } } - fn gen_jump(blockindex: BlockIndex) -> Inst { + fn gen_jump(target: MachLabel) -> Inst { Inst::Jump { - dest: BranchTarget::Block(blockindex), + dest: BranchTarget::Label(target), } } - fn with_block_rewrites(&mut self, block_target_map: &[BlockIndex]) { - match self { - &mut Inst::Jump { ref mut dest } => { - dest.map(block_target_map); - } - &mut Inst::CondBr { - ref mut taken, - ref mut not_taken, - .. - } => { - taken.map(block_target_map); - not_taken.map(block_target_map); - } - &mut Inst::CondBrLowered { .. } => { - // See note in `is_term()`: this is used in open-coded sequences - // within blocks and should be left alone. - } - &mut Inst::CondBrLoweredCompound { .. } => { - panic!("with_block_rewrites called after branch lowering!"); - } - _ => {} - } + fn reg_universe(flags: &settings::Flags) -> RealRegUniverse { + create_reg_universe(flags) } - fn with_fallthrough_block(&mut self, fallthrough: Option) { - match self { - &mut Inst::CondBr { - taken, - not_taken, - kind, - } => { - if taken.as_block_index() == fallthrough - && not_taken.as_block_index() == fallthrough - { - *self = Inst::Nop0; - } else if taken.as_block_index() == fallthrough { - *self = Inst::CondBrLowered { - target: not_taken, - kind: kind.invert(), - }; - } else if not_taken.as_block_index() == fallthrough { - *self = Inst::CondBrLowered { - target: taken, - kind, - }; - } else { - // We need a compound sequence (condbr / uncond-br). - *self = Inst::CondBrLoweredCompound { - taken, - not_taken, - kind, - }; - } - } - &mut Inst::Jump { dest } => { - if dest.as_block_index() == fallthrough { - *self = Inst::Nop0; - } - } - _ => {} - } - } - - fn with_block_offsets(&mut self, my_offset: CodeOffset, targets: &[CodeOffset]) { - match self { - &mut Inst::CondBrLowered { ref mut target, .. } => { - target.lower(targets, my_offset); - } - &mut Inst::CondBrLoweredCompound { - ref mut taken, - ref mut not_taken, - .. - } => { - taken.lower(targets, my_offset); - not_taken.lower(targets, my_offset + 4); - } - &mut Inst::Jump { ref mut dest } => { - dest.lower(targets, my_offset); - } - &mut Inst::JTSequence { - targets: ref mut t, .. - } => { - for target in t.iter_mut() { - // offset+20: jumptable is 20 bytes into compound sequence. - target.lower(targets, my_offset + 20); - } - } - _ => {} - } + fn worst_case_size() -> CodeOffset { + // The maximum size, in bytes, of any `Inst`'s emitted code. 
We have at least one case of + // an 8-instruction sequence (saturating int-to-float conversions) with three embedded + // 64-bit f64 constants. + // + // Note that inline jump-tables handle island/pool insertion separately, so we do not need + // to account for them here (otherwise the worst case would be 2^31 * 4, clearly not + // feasible for other reasons). + 44 } } @@ -2550,12 +2459,12 @@ impl ShowWithRRU for Inst { } } } - &Inst::CondBrLowered { + &Inst::OneWayCondBr { ref target, ref kind, } => { let target = target.show_rru(mb_rru); - match &kind { + match kind { &CondBrKind::Zero(reg) => { let reg = reg.show_rru(mb_rru); format!("cbz {}, {}", reg, target) @@ -2570,30 +2479,15 @@ impl ShowWithRRU for Inst { } } } - &Inst::CondBrLoweredCompound { - ref taken, - ref not_taken, - ref kind, - } => { - let first = Inst::CondBrLowered { - target: taken.clone(), - kind: kind.clone(), - }; - let second = Inst::Jump { - dest: not_taken.clone(), - }; - first.show_rru(mb_rru) + " ; " + &second.show_rru(mb_rru) - } &Inst::IndirectBr { rn, .. } => { let rn = rn.show_rru(mb_rru); format!("br {}", rn) } &Inst::Brk => "brk #0".to_string(), &Inst::Udf { .. } => "udf".to_string(), - &Inst::Adr { rd, ref label } => { + &Inst::Adr { rd, off } => { let rd = rd.show_rru(mb_rru); - let label = label.show_rru(mb_rru); - format!("adr {}, {}", rd, label) + format!("adr {}, pc+{}", rd, off) } &Inst::Word4 { data } => format!("data.i32 {}", data), &Inst::Word8 { data } => format!("data.i64 {}", data), @@ -2683,15 +2577,134 @@ impl ShowWithRRU for Inst { } ret } - &Inst::GetPinnedReg { rd } => { - let rd = rd.show_rru(mb_rru); - format!("get_pinned_reg {}", rd) - } - &Inst::SetPinnedReg { rm } => { - let rm = rm.show_rru(mb_rru); - format!("set_pinned_reg {}", rm) - } &Inst::VirtualSPOffsetAdj { offset } => format!("virtual_sp_offset_adjust {}", offset), + &Inst::EmitIsland { needed_space } => format!("emit_island {}", needed_space), + } + } +} + +//============================================================================= +// Label fixups and jump veneers. + +/// Different forms of label references for different instruction formats. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum LabelUse { + /// 19-bit branch offset (conditional branches). PC-rel, offset is imm << 2. Immediate is 19 + /// signed bits, in bits 23:5. Used by cbz, cbnz, b.cond. + Branch19, + /// 26-bit branch offset (unconditional branches). PC-rel, offset is imm << 2. Immediate is 26 + /// signed bits, in bits 25:0. Used by b, bl. + Branch26, + /// 19-bit offset for LDR (load literal). PC-rel, offset is imm << 2. Immediate is 19 signed bits, + /// in bits 23:5. + Ldr19, + /// 21-bit offset for ADR (get address of label). PC-rel, offset is not shifted. Immediate is + /// 21 signed bits, with high 19 bits in bits 23:5 and low 2 bits in bits 30:29. + Adr21, + /// 32-bit PC relative constant offset (from address of constant itself). Used in jump tables. + PCRel32, +} + +impl MachInstLabelUse for LabelUse { + /// Alignment for veneer code. Every AArch64 instruction must be 4-byte-aligned. + const ALIGN: CodeOffset = 4; + + /// Maximum PC-relative range (positive), inclusive. + fn max_pos_range(self) -> CodeOffset { + match self { + // 19-bit immediate, left-shifted by 2, for 21 bits of total range. Signed, so +2^20 + // from zero. Likewise for two other shifted cases below. 
+ LabelUse::Branch19 => (1 << 20) - 1, + LabelUse::Branch26 => (1 << 27) - 1, + LabelUse::Ldr19 => (1 << 20) - 1, + // Adr does not shift its immediate, so the 21-bit immediate gives 21 bits of total + // range. + LabelUse::Adr21 => (1 << 20) - 1, + LabelUse::PCRel32 => 0x7fffffff, + } + } + + /// Maximum PC-relative range (negative). + fn max_neg_range(self) -> CodeOffset { + // All forms are twos-complement signed offsets, so negative limit is one more than + // positive limit. + self.max_pos_range() + 1 + } + + /// Size of window into code needed to do the patch. + fn patch_size(self) -> CodeOffset { + // Patch is on one instruction only for all of these label reference types. + 4 + } + + /// Perform the patch. + fn patch(self, buffer: &mut [u8], use_offset: CodeOffset, label_offset: CodeOffset) { + let pc_rel = (label_offset as i64) - (use_offset as i64); + debug_assert!(pc_rel <= self.max_pos_range() as i64); + debug_assert!(pc_rel >= -(self.max_neg_range() as i64)); + let pc_rel = pc_rel as u32; + let insn_word = u32::from_le_bytes([buffer[0], buffer[1], buffer[2], buffer[3]]); + let mask = match self { + LabelUse::Branch19 => 0x00ffffe0, // bits 23..5 inclusive + LabelUse::Branch26 => 0x03ffffff, // bits 25..0 inclusive + LabelUse::Ldr19 => 0x00ffffe0, // bits 23..5 inclusive + LabelUse::Adr21 => 0x60ffffe0, // bits 30..29, 25..5 inclusive + LabelUse::PCRel32 => 0xffffffff, + }; + let pc_rel_shifted = match self { + LabelUse::Adr21 | LabelUse::PCRel32 => pc_rel, + _ => { + debug_assert!(pc_rel & 3 == 0); + pc_rel >> 2 + } + }; + let pc_rel_inserted = match self { + LabelUse::Branch19 | LabelUse::Ldr19 => (pc_rel_shifted & 0x7ffff) << 5, + LabelUse::Branch26 => pc_rel_shifted & 0x3ffffff, + LabelUse::Adr21 => (pc_rel_shifted & 0x7ffff) << 5 | (pc_rel_shifted & 0x180000) << 10, + LabelUse::PCRel32 => pc_rel_shifted, + }; + let is_add = match self { + LabelUse::PCRel32 => true, + _ => false, + }; + let insn_word = if is_add { + insn_word.wrapping_add(pc_rel_inserted) + } else { + (insn_word & !mask) | pc_rel_inserted + }; + buffer[0..4].clone_from_slice(&u32::to_le_bytes(insn_word)); + } + + /// Is a veneer supported for this label reference type? + fn supports_veneer(self) -> bool { + match self { + LabelUse::Branch19 => true, // veneer is a Branch26 + _ => false, + } + } + + /// How large is the veneer, if supported? + fn veneer_size(self) -> CodeOffset { + 4 + } + + /// Generate a veneer into the buffer, given that this veneer is at `veneer_offset`, and return + /// an offset and label-use for the veneer's use of the original label. + fn generate_veneer( + self, + buffer: &mut [u8], + veneer_offset: CodeOffset, + ) -> (CodeOffset, LabelUse) { + match self { + LabelUse::Branch19 => { + // veneer is a Branch26 (unconditional branch). Just encode directly here -- don't + // bother with constructing an Inst. 
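// Illustrative aside (the constants and helper are for exposition only): why
// a `Branch26` veneer rescues an out-of-range `Branch19`. A 19-bit word
// offset reaches roughly +/-1 MiB, while an unconditional `b` has a 26-bit
// word offset and reaches roughly +/-128 MiB; the conditional branch only has
// to reach the nearby veneer, which then covers the full distance.
const BRANCH19_REACH_BYTES: i64 = 1 << 20; // (2^18 words) * 4
const BRANCH26_REACH_BYTES: i64 = 1 << 27; // (2^25 words) * 4
fn branch19_in_range(from: i64, to: i64) -> bool {
    let d = to - from;
    d < BRANCH19_REACH_BYTES && d >= -BRANCH19_REACH_BYTES
}
// A conditional branch for which this returns false gets a veneer placed
// within reach; the `b` encoded just below is that veneer.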
+ let insn_word = 0b000101 << 26; + buffer[0..4].clone_from_slice(&u32::to_le_bytes(insn_word)); + (veneer_offset, LabelUse::Branch26) + } + _ => panic!("Unsupported label-reference type for veneer generation!"), } } } diff --git a/cranelift/codegen/src/isa/aarch64/lower.rs b/cranelift/codegen/src/isa/aarch64/lower.rs index f281c05af6..d1368a3d97 100644 --- a/cranelift/codegen/src/isa/aarch64/lower.rs +++ b/cranelift/codegen/src/isa/aarch64/lower.rs @@ -14,12 +14,14 @@ use crate::ir::Inst as IRInst; use crate::ir::{InstructionData, Opcode, TrapCode, Type}; use crate::machinst::lower::*; use crate::machinst::*; +use crate::CodegenResult; use crate::isa::aarch64::inst::*; use crate::isa::aarch64::AArch64Backend; use super::lower_inst; +use log::debug; use regalloc::{Reg, RegClass, Writable}; //============================================================================ @@ -104,18 +106,11 @@ pub(crate) enum ResultRegImmShift { } //============================================================================ -// Instruction input and output "slots". +// Instruction input "slots". // // We use these types to refer to operand numbers, and result numbers, together // with the associated instruction, in a type-safe way. -/// Identifier for a particular output of an instruction. -#[derive(Clone, Copy, Debug, PartialEq, Eq)] -pub(crate) struct InsnOutput { - pub(crate) insn: IRInst, - pub(crate) output: usize, -} - /// Identifier for a particular input of an instruction. #[derive(Clone, Copy, Debug, PartialEq, Eq)] pub(crate) struct InsnInput { @@ -123,93 +118,28 @@ pub(crate) struct InsnInput { pub(crate) input: usize, } -/// Producer of a value: either a previous instruction's output, or a register that will be -/// codegen'd separately. +/// Identifier for a particular output of an instruction. #[derive(Clone, Copy, Debug, PartialEq, Eq)] -pub(crate) enum InsnInputSource { - Output(InsnOutput), - Reg(Reg), -} - -impl InsnInputSource { - fn as_output(self) -> Option { - match self { - InsnInputSource::Output(o) => Some(o), - _ => None, - } - } -} - -fn get_input>(ctx: &mut C, output: InsnOutput, num: usize) -> InsnInput { - assert!(num <= ctx.num_inputs(output.insn)); - InsnInput { - insn: output.insn, - input: num, - } -} - -/// Convert an instruction input to a producing instruction's output if possible (in same BB), or a -/// register otherwise. -fn input_source>(ctx: &mut C, input: InsnInput) -> InsnInputSource { - if let Some((input_inst, result_num)) = ctx.input_inst(input.insn, input.input) { - let out = InsnOutput { - insn: input_inst, - output: result_num, - }; - InsnInputSource::Output(out) - } else { - let reg = ctx.input(input.insn, input.input); - InsnInputSource::Reg(reg) - } +pub(crate) struct InsnOutput { + pub(crate) insn: IRInst, + pub(crate) output: usize, } //============================================================================ -// Lowering: convert instruction outputs to result types. +// Lowering: convert instruction inputs to forms that we can use. -/// Lower an instruction output to a 64-bit constant, if possible. -pub(crate) fn output_to_const>(ctx: &mut C, out: InsnOutput) -> Option { - if out.output > 0 { - None - } else { - let inst_data = ctx.data(out.insn); - if inst_data.opcode() == Opcode::Null { - Some(0) - } else { - match inst_data { - &InstructionData::UnaryImm { opcode: _, imm } => { - // Only has Into for i64; we use u64 elsewhere, so we cast. 
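// Illustrative only (the helpers are hypothetical): the "constant as a u64
// bit pattern" convention shared by `is_constant_64bit`/`get_constant` and
// consumed by `gen_constant`. Integers keep their bits, booleans become 0/1,
// and floats are stored as raw bits, with f32 bits zero-extended to 64 bits
// and recovered via `from_bits`.
fn f32_to_const_bits(x: f32) -> u64 {
    x.to_bits() as u64
}
fn const_bits_to_f32(bits: u64) -> f32 {
    f32::from_bits(bits as u32)
}
// e.g. const_bits_to_f32(f32_to_const_bits(1.5)) == 1.5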
- let imm: i64 = imm.into(); - Some(imm as u64) - } - &InstructionData::UnaryBool { opcode: _, imm } => Some(u64::from(imm)), - &InstructionData::UnaryIeee32 { opcode: _, imm } => Some(u64::from(imm.bits())), - &InstructionData::UnaryIeee64 { opcode: _, imm } => Some(imm.bits()), - _ => None, - } - } - } +/// Lower an instruction input to a 64-bit constant, if possible. +pub(crate) fn input_to_const>(ctx: &mut C, input: InsnInput) -> Option { + let input = ctx.get_input(input.insn, input.input); + input.constant } -pub(crate) fn output_to_const_f32>( +/// Lower an instruction input to a constant register-shift amount, if possible. +pub(crate) fn input_to_shiftimm>( ctx: &mut C, - out: InsnOutput, -) -> Option { - output_to_const(ctx, out).map(|value| f32::from_bits(value as u32)) -} - -pub(crate) fn output_to_const_f64>( - ctx: &mut C, - out: InsnOutput, -) -> Option { - output_to_const(ctx, out).map(|value| f64::from_bits(value)) -} - -/// Lower an instruction output to a constant register-shift amount, if possible. -pub(crate) fn output_to_shiftimm>( - ctx: &mut C, - out: InsnOutput, + input: InsnInput, ) -> Option { - output_to_const(ctx, out).and_then(ShiftOpShiftImm::maybe_from_shift) + input_to_const(ctx, input).and_then(ShiftOpShiftImm::maybe_from_shift) } /// How to handle narrow values loaded into registers; see note on `narrow_mode` @@ -237,9 +167,9 @@ impl NarrowValueMode { } } -/// Lower an instruction output to a reg. +/// Allocate a register for an instruction output and return it. pub(crate) fn output_to_reg>(ctx: &mut C, out: InsnOutput) -> Writable { - ctx.output(out.insn, out.output) + ctx.get_output(out.insn, out.output) } /// Lower an instruction input to a reg. @@ -252,9 +182,22 @@ pub(crate) fn input_to_reg>( input: InsnInput, narrow_mode: NarrowValueMode, ) -> Reg { + debug!("input_to_reg: input {:?}", input); let ty = ctx.input_ty(input.insn, input.input); let from_bits = ty_bits(ty) as u8; - let in_reg = ctx.input(input.insn, input.input); + let inputs = ctx.get_input(input.insn, input.input); + let in_reg = if let Some(c) = inputs.constant { + // Generate constants fresh at each use to minimize long-range register pressure. + let to_reg = ctx.tmp(Inst::rc_for_type(ty).unwrap(), ty); + for inst in Inst::gen_constant(to_reg, c, ty).into_iter() { + ctx.emit(inst); + } + to_reg.to_reg() + } else { + ctx.use_input_reg(inputs); + inputs.reg + }; + match (narrow_mode, from_bits) { (NarrowValueMode::None, _) => in_reg, (NarrowValueMode::ZeroExtend32, n) if n < 32 => { @@ -282,15 +225,20 @@ pub(crate) fn input_to_reg>( (NarrowValueMode::ZeroExtend32, 32) | (NarrowValueMode::SignExtend32, 32) => in_reg, (NarrowValueMode::ZeroExtend64, n) if n < 64 => { - let tmp = ctx.tmp(RegClass::I64, I32); - ctx.emit(Inst::Extend { - rd: tmp, - rn: in_reg, - signed: false, - from_bits, - to_bits: 64, - }); - tmp.to_reg() + if inputs.constant.is_some() { + // Constants are zero-extended to full 64-bit width on load already. + in_reg + } else { + let tmp = ctx.tmp(RegClass::I64, I32); + ctx.emit(Inst::Extend { + rd: tmp, + rn: in_reg, + signed: false, + from_bits, + to_bits: 64, + }); + tmp.to_reg() + } } (NarrowValueMode::SignExtend64, n) if n < 64 => { let tmp = ctx.tmp(RegClass::I64, I32); @@ -313,8 +261,6 @@ pub(crate) fn input_to_reg>( } /// Lower an instruction input to a reg or reg/shift, or reg/extend operand. -/// This does not actually codegen the source instruction; it just uses the -/// vreg into which the source instruction will generate its value. 
/// /// The `narrow_mode` flag indicates whether the consumer of this value needs /// the high bits clear. For many operations, such as an add/sub/mul or any @@ -330,23 +276,18 @@ fn input_to_rs>( input: InsnInput, narrow_mode: NarrowValueMode, ) -> ResultRS { - if let InsnInputSource::Output(out) = input_source(ctx, input) { - let insn = out.insn; - assert!(out.output <= ctx.num_outputs(insn)); + let inputs = ctx.get_input(input.insn, input.input); + if let Some((insn, 0)) = inputs.inst { let op = ctx.data(insn).opcode(); if op == Opcode::Ishl { - let shiftee = get_input(ctx, out, 0); - let shift_amt = get_input(ctx, out, 1); + let shiftee = InsnInput { insn, input: 0 }; + let shift_amt = InsnInput { insn, input: 1 }; // Can we get the shift amount as an immediate? - if let Some(shift_amt_out) = input_source(ctx, shift_amt).as_output() { - if let Some(shiftimm) = output_to_shiftimm(ctx, shift_amt_out) { - let reg = input_to_reg(ctx, shiftee, narrow_mode); - ctx.merged(insn); - ctx.merged(shift_amt_out.insn); - return ResultRS::RegShift(reg, ShiftOpAndAmt::new(ShiftOp::LSL, shiftimm)); - } + if let Some(shiftimm) = input_to_shiftimm(ctx, shift_amt) { + let reg = input_to_reg(ctx, shiftee, narrow_mode); + return ResultRS::RegShift(reg, ShiftOpAndAmt::new(ShiftOp::LSL, shiftimm)); } } } @@ -364,11 +305,10 @@ fn input_to_rse>( input: InsnInput, narrow_mode: NarrowValueMode, ) -> ResultRSE { - if let InsnInputSource::Output(out) = input_source(ctx, input) { - let insn = out.insn; - assert!(out.output <= ctx.num_outputs(insn)); + let inputs = ctx.get_input(input.insn, input.input); + if let Some((insn, 0)) = inputs.inst { let op = ctx.data(insn).opcode(); - let out_ty = ctx.output_ty(insn, out.output); + let out_ty = ctx.output_ty(insn, 0); let out_bits = ty_bits(out_ty); // If `out_ty` is smaller than 32 bits and we need to zero- or sign-extend, @@ -378,7 +318,7 @@ fn input_to_rse>( && ((narrow_mode.is_32bit() && out_bits < 32) || (!narrow_mode.is_32bit() && out_bits < 64)) { - let reg = output_to_reg(ctx, out); + let reg = input_to_reg(ctx, InsnInput { insn, input: 0 }, NarrowValueMode::None); let extendop = match (narrow_mode, out_bits) { (NarrowValueMode::SignExtend32, 1) | (NarrowValueMode::SignExtend64, 1) => { ExtendOp::SXTB @@ -402,15 +342,14 @@ fn input_to_rse>( (NarrowValueMode::ZeroExtend64, 32) => ExtendOp::UXTW, _ => unreachable!(), }; - return ResultRSE::RegExtend(reg.to_reg(), extendop); + return ResultRSE::RegExtend(reg, extendop); } // Is this a zero-extend or sign-extend and can we handle that with a register-mode operator? 
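// Illustrative only (free functions for exposition, not part of the
// lowering): the value semantics behind the ExtendOp choices in this
// function, written as plain Rust casts. UXTB/UXTH/UXTW zero-extend the low
// 8/16/32 bits; SXTB/SXTH/SXTW sign-extend them.
fn uxtb(x: u64) -> u64 { x as u8 as u64 }
fn sxtb(x: u64) -> u64 { x as u8 as i8 as i64 as u64 }
fn uxtw(x: u64) -> u64 { x as u32 as u64 }
fn sxtw(x: u64) -> u64 { x as u32 as i32 as i64 as u64 }
// e.g. uxtb(0xff) == 0xff, while sxtb(0xff) == u64::MAX (i.e. -1 as u64).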
if op == Opcode::Uextend || op == Opcode::Sextend { assert!(out_bits == 32 || out_bits == 64); let sign_extend = op == Opcode::Sextend; - let extendee = get_input(ctx, out, 0); - let inner_ty = ctx.input_ty(extendee.insn, extendee.input); + let inner_ty = ctx.input_ty(insn, 0); let inner_bits = ty_bits(inner_ty); assert!(inner_bits < out_bits); let extendop = match (sign_extend, inner_bits) { @@ -424,8 +363,7 @@ fn input_to_rse>( (false, 32) => ExtendOp::UXTW, _ => unreachable!(), }; - let reg = input_to_reg(ctx, extendee, NarrowValueMode::None); - ctx.merged(insn); + let reg = input_to_reg(ctx, InsnInput { insn, input: 0 }, NarrowValueMode::None); return ResultRSE::RegExtend(reg, extendop); } } @@ -438,12 +376,9 @@ pub(crate) fn input_to_rse_imm12>( input: InsnInput, narrow_mode: NarrowValueMode, ) -> ResultRSEImm12 { - if let InsnInputSource::Output(out) = input_source(ctx, input) { - if let Some(imm_value) = output_to_const(ctx, out) { - if let Some(i) = Imm12::maybe_from_u64(imm_value) { - ctx.merged(out.insn); - return ResultRSEImm12::Imm12(i); - } + if let Some(imm_value) = input_to_const(ctx, input) { + if let Some(i) = Imm12::maybe_from_u64(imm_value) { + return ResultRSEImm12::Imm12(i); } } @@ -455,14 +390,11 @@ pub(crate) fn input_to_rs_immlogic>( input: InsnInput, narrow_mode: NarrowValueMode, ) -> ResultRSImmLogic { - if let InsnInputSource::Output(out) = input_source(ctx, input) { - if let Some(imm_value) = output_to_const(ctx, out) { - let ty = ctx.output_ty(out.insn, out.output); - let ty = if ty_bits(ty) < 32 { I32 } else { ty }; - if let Some(i) = ImmLogic::maybe_from_u64(imm_value, ty) { - ctx.merged(out.insn); - return ResultRSImmLogic::ImmLogic(i); - } + if let Some(imm_value) = input_to_const(ctx, input) { + let ty = ctx.input_ty(input.insn, input.input); + let ty = if ty_bits(ty) < 32 { I32 } else { ty }; + if let Some(i) = ImmLogic::maybe_from_u64(imm_value, ty) { + return ResultRSImmLogic::ImmLogic(i); } } @@ -473,12 +405,9 @@ pub(crate) fn input_to_reg_immshift>( ctx: &mut C, input: InsnInput, ) -> ResultRegImmShift { - if let InsnInputSource::Output(out) = input_source(ctx, input) { - if let Some(imm_value) = output_to_const(ctx, out) { - if let Some(immshift) = ImmShift::maybe_from_u64(imm_value) { - ctx.merged(out.insn); - return ResultRegImmShift::ImmShift(immshift); - } + if let Some(imm_value) = input_to_const(ctx, input) { + if let Some(immshift) = ImmShift::maybe_from_u64(imm_value) { + return ResultRegImmShift::ImmShift(immshift); } } @@ -823,24 +752,29 @@ pub(crate) fn inst_trapcode(data: &InstructionData) -> Option { } } -/// Checks for an instance of `op` feeding the given input. Marks as merged (decrementing refcount) if so. +/// Checks for an instance of `op` feeding the given input. pub(crate) fn maybe_input_insn>( c: &mut C, input: InsnInput, op: Opcode, ) -> Option { - if let InsnInputSource::Output(out) = input_source(c, input) { - let data = c.data(out.insn); + let inputs = c.get_input(input.insn, input.input); + debug!( + "maybe_input_insn: input {:?} has options {:?}; looking for op {:?}", + input, inputs, op + ); + if let Some((src_inst, _)) = inputs.inst { + let data = c.data(src_inst); + debug!(" -> input inst {:?}", data); if data.opcode() == op { - c.merged(out.insn); - return Some(out.insn); + return Some(src_inst); } } None } /// Checks for an instance of `op` feeding the given input, possibly via a conversion `conv` (e.g., -/// Bint or a bitcast). Marks one or both as merged if so, as appropriate. +/// Bint or a bitcast). 
/// /// FIXME cfallin 2020-03-30: this is really ugly. Factor out tree-matching stuff and make it /// a bit more generic. @@ -850,21 +784,19 @@ pub(crate) fn maybe_input_insn_via_conv>( op: Opcode, conv: Opcode, ) -> Option { - if let Some(ret) = maybe_input_insn(c, input, op) { - return Some(ret); - } - - if let InsnInputSource::Output(out) = input_source(c, input) { - let data = c.data(out.insn); + let inputs = c.get_input(input.insn, input.input); + if let Some((src_inst, _)) = inputs.inst { + let data = c.data(src_inst); + if data.opcode() == op { + return Some(src_inst); + } if data.opcode() == conv { - let conv_insn = out.insn; - let conv_input = InsnInput { - insn: conv_insn, - input: 0, - }; - if let Some(inner) = maybe_input_insn(c, conv_input, op) { - c.merged(conv_insn); - return Some(inner); + let inputs = c.get_input(src_inst, 0); + if let Some((src_inst, _)) = inputs.inst { + let data = c.data(src_inst); + if data.opcode() == op { + return Some(src_inst); + } } } } @@ -876,6 +808,7 @@ pub(crate) fn lower_icmp_or_ifcmp_to_flags>( insn: IRInst, is_signed: bool, ) { + debug!("lower_icmp_or_ifcmp_to_flags: insn {}", insn); let ty = ctx.input_ty(insn, 0); let bits = ty_bits(ty); let narrow_mode = match (bits <= 32, is_signed) { @@ -897,6 +830,7 @@ pub(crate) fn lower_icmp_or_ifcmp_to_flags>( let ty = ctx.input_ty(insn, 0); let rn = input_to_reg(ctx, inputs[0], narrow_mode); let rm = input_to_rse_imm12(ctx, inputs[1], narrow_mode); + debug!("lower_icmp_or_ifcmp_to_flags: rn = {:?} rm = {:?}", rn, rm); let alu_op = choose_32_64(ty, ALUOp::SubS32, ALUOp::SubS64); let rd = writable_zero_reg(); ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm)); @@ -934,17 +868,21 @@ pub(crate) fn lower_fcmp_or_ffcmp_to_flags>(ctx: &mut C, i impl LowerBackend for AArch64Backend { type MInst = Inst; - fn lower>(&self, ctx: &mut C, ir_inst: IRInst) { - lower_inst::lower_insn_to_regs(ctx, ir_inst); + fn lower>(&self, ctx: &mut C, ir_inst: IRInst) -> CodegenResult<()> { + lower_inst::lower_insn_to_regs(ctx, ir_inst) } fn lower_branch_group>( &self, ctx: &mut C, branches: &[IRInst], - targets: &[BlockIndex], - fallthrough: Option, - ) { + targets: &[MachLabel], + fallthrough: Option, + ) -> CodegenResult<()> { lower_inst::lower_branch(ctx, branches, targets, fallthrough) } + + fn maybe_pinned_reg(&self) -> Option { + Some(xreg(PINNED_REG)) + } } diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs index ffa9e11012..8692d853de 100644 --- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs +++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs @@ -1,11 +1,13 @@ //! Lower a single Cranelift instruction into vcode. +use crate::binemit::CodeOffset; use crate::ir::condcodes::FloatCC; use crate::ir::types::*; use crate::ir::Inst as IRInst; use crate::ir::{InstructionData, Opcode, TrapCode}; use crate::machinst::lower::*; use crate::machinst::*; +use crate::CodegenResult; use crate::isa::aarch64::abi::*; use crate::isa::aarch64::inst::*; @@ -19,7 +21,10 @@ use smallvec::SmallVec; use super::lower::*; /// Actually codegen an instruction's results into registers. 
-pub(crate) fn lower_insn_to_regs>(ctx: &mut C, insn: IRInst) { +pub(crate) fn lower_insn_to_regs>( + ctx: &mut C, + insn: IRInst, +) -> CodegenResult<()> { let op = ctx.data(insn).opcode(); let inputs: SmallVec<[InsnInput; 4]> = (0..ctx.num_inputs(insn)) .map(|i| InsnInput { insn, input: i }) @@ -35,17 +40,17 @@ pub(crate) fn lower_insn_to_regs>(ctx: &mut C, insn: IRIns match op { Opcode::Iconst | Opcode::Bconst | Opcode::Null => { - let value = output_to_const(ctx, outputs[0]).unwrap(); + let value = ctx.get_constant(insn).unwrap(); let rd = output_to_reg(ctx, outputs[0]); lower_constant_u64(ctx, rd, value); } Opcode::F32const => { - let value = output_to_const_f32(ctx, outputs[0]).unwrap(); + let value = f32::from_bits(ctx.get_constant(insn).unwrap() as u32); let rd = output_to_reg(ctx, outputs[0]); lower_constant_f32(ctx, rd, value); } Opcode::F64const => { - let value = output_to_const_f64(ctx, outputs[0]).unwrap(); + let value = f64::from_bits(ctx.get_constant(insn).unwrap()); let rd = output_to_reg(ctx, outputs[0]); lower_constant_f64(ctx, rd, value); } @@ -271,7 +276,7 @@ pub(crate) fn lower_insn_to_regs>(ctx: &mut C, insn: IRIns // Check for divide by 0. let branch_size = 8; - ctx.emit(Inst::CondBrLowered { + ctx.emit(Inst::OneWayCondBr { target: BranchTarget::ResolvedOffset(branch_size), kind: CondBrKind::NotZero(rm), }); @@ -297,7 +302,7 @@ pub(crate) fn lower_insn_to_regs>(ctx: &mut C, insn: IRIns // Check for divide by 0. let branch_size = 20; - ctx.emit(Inst::CondBrLowered { + ctx.emit(Inst::OneWayCondBr { target: BranchTarget::ResolvedOffset(branch_size), kind: CondBrKind::Zero(rm), }); @@ -324,7 +329,7 @@ pub(crate) fn lower_insn_to_regs>(ctx: &mut C, insn: IRIns nzcv: NZCV::new(false, false, false, false), cond: Cond::Eq, }); - ctx.emit(Inst::CondBrLowered { + ctx.emit(Inst::OneWayCondBr { target: BranchTarget::ResolvedOffset(12), kind: CondBrKind::Cond(Cond::Vc), }); @@ -337,7 +342,7 @@ pub(crate) fn lower_insn_to_regs>(ctx: &mut C, insn: IRIns // Check for divide by 0. let branch_size = 8; - ctx.emit(Inst::CondBrLowered { + ctx.emit(Inst::OneWayCondBr { target: BranchTarget::ResolvedOffset(branch_size), kind: CondBrKind::NotZero(rm), }); @@ -1211,7 +1216,7 @@ pub(crate) fn lower_insn_to_regs>(ctx: &mut C, insn: IRIns // Branch around the break instruction with inverted cond. Go straight to lowered // one-target form; this is logically part of a single-in single-out template lowering. 
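// Illustrative only (constant and helper are for exposition): how the fixed
// ResolvedOffset values in these open-coded sequences are chosen. Every
// AArch64 instruction is 4 bytes and the branch offset is relative to the
// branch itself, so skipping N following instructions needs an offset of
// (N + 1) * 4 -- e.g. ResolvedOffset(8) just below skips exactly the one
// break/trap instruction.
const A64_INSN_BYTES: isize = 4;
fn offset_to_skip(n_insns: isize) -> isize {
    (n_insns + 1) * A64_INSN_BYTES
}
// offset_to_skip(1) == 8; offset_to_skip(4) == 20 (cf. `branch_size = 20`
// in the divide-by-zero check above).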
let cond = cond.invert(); - ctx.emit(Inst::CondBrLowered { + ctx.emit(Inst::OneWayCondBr { target: BranchTarget::ResolvedOffset(8), kind: CondBrKind::Cond(cond), }); @@ -1301,11 +1306,12 @@ pub(crate) fn lower_insn_to_regs>(ctx: &mut C, insn: IRIns Opcode::GetPinnedReg => { let rd = output_to_reg(ctx, outputs[0]); - ctx.emit(Inst::GetPinnedReg { rd }); + ctx.emit(Inst::mov(rd, xreg(PINNED_REG))); } + Opcode::SetPinnedReg => { let rm = input_to_reg(ctx, inputs[0], NarrowValueMode::None); - ctx.emit(Inst::SetPinnedReg { rm }); + ctx.emit(Inst::mov(writable_xreg(PINNED_REG), rm)); } Opcode::Spill @@ -1533,7 +1539,7 @@ pub(crate) fn lower_insn_to_regs>(ctx: &mut C, insn: IRIns } else { ctx.emit(Inst::FpuCmp64 { rn, rm: rn }); } - ctx.emit(Inst::CondBrLowered { + ctx.emit(Inst::OneWayCondBr { target: BranchTarget::ResolvedOffset(8), kind: CondBrKind::Cond(lower_fp_condcode(FloatCC::Ordered)), }); @@ -1574,7 +1580,7 @@ pub(crate) fn lower_insn_to_regs>(ctx: &mut C, insn: IRIns rn, rm: tmp.to_reg(), }); - ctx.emit(Inst::CondBrLowered { + ctx.emit(Inst::OneWayCondBr { target: BranchTarget::ResolvedOffset(8), kind: CondBrKind::Cond(lower_fp_condcode(low_cond)), }); @@ -1587,7 +1593,7 @@ pub(crate) fn lower_insn_to_regs>(ctx: &mut C, insn: IRIns rn, rm: tmp.to_reg(), }); - ctx.emit(Inst::CondBrLowered { + ctx.emit(Inst::OneWayCondBr { target: BranchTarget::ResolvedOffset(8), kind: CondBrKind::Cond(lower_fp_condcode(FloatCC::LessThan)), }); @@ -1617,7 +1623,7 @@ pub(crate) fn lower_insn_to_regs>(ctx: &mut C, insn: IRIns rn, rm: tmp.to_reg(), }); - ctx.emit(Inst::CondBrLowered { + ctx.emit(Inst::OneWayCondBr { target: BranchTarget::ResolvedOffset(8), kind: CondBrKind::Cond(lower_fp_condcode(low_cond)), }); @@ -1630,7 +1636,7 @@ pub(crate) fn lower_insn_to_regs>(ctx: &mut C, insn: IRIns rn, rm: tmp.to_reg(), }); - ctx.emit(Inst::CondBrLowered { + ctx.emit(Inst::OneWayCondBr { target: BranchTarget::ResolvedOffset(8), kind: CondBrKind::Cond(lower_fp_condcode(FloatCC::LessThan)), }); @@ -1862,14 +1868,16 @@ pub(crate) fn lower_insn_to_regs>(ctx: &mut C, insn: IRIns Opcode::AvgRound => unimplemented!(), Opcode::TlsValue => unimplemented!(), } + + Ok(()) } pub(crate) fn lower_branch>( ctx: &mut C, branches: &[IRInst], - targets: &[BlockIndex], - fallthrough: Option, -) { + targets: &[MachLabel], + fallthrough: Option, +) -> CodegenResult<()> { // A block should end with at most two branches. The first may be a // conditional branch; a conditional branch can be followed only by an // unconditional branch or fallthrough. Otherwise, if only one branch, @@ -1883,18 +1891,14 @@ pub(crate) fn lower_branch>( let op0 = ctx.data(branches[0]).opcode(); let op1 = ctx.data(branches[1]).opcode(); - //println!( - // "lowering two-branch group: opcodes are {:?} and {:?}", - // op0, op1 - //); - assert!(op1 == Opcode::Jump || op1 == Opcode::Fallthrough); - let taken = BranchTarget::Block(targets[0]); + let taken = BranchTarget::Label(targets[0]); let not_taken = match op1 { - Opcode::Jump => BranchTarget::Block(targets[1]), - Opcode::Fallthrough => BranchTarget::Block(fallthrough.unwrap()), + Opcode::Jump => BranchTarget::Label(targets[1]), + Opcode::Fallthrough => BranchTarget::Label(fallthrough.unwrap()), _ => unreachable!(), // assert above. 
}; + match op0 { Opcode::Brz | Opcode::Brnz => { let flag_input = InsnInput { @@ -1954,6 +1958,8 @@ pub(crate) fn lower_branch>( Opcode::BrIcmp => { let condcode = inst_condcode(ctx.data(branches[0])).unwrap(); let cond = lower_condcode(condcode); + let kind = CondBrKind::Cond(cond); + let is_signed = condcode_is_signed(condcode); let ty = ctx.input_ty(branches[0], 0); let bits = ty_bits(ty); @@ -1986,13 +1992,15 @@ pub(crate) fn lower_branch>( ctx.emit(Inst::CondBr { taken, not_taken, - kind: CondBrKind::Cond(cond), + kind, }); } Opcode::Brif => { let condcode = inst_condcode(ctx.data(branches[0])).unwrap(); let cond = lower_condcode(condcode); + let kind = CondBrKind::Cond(cond); + let is_signed = condcode_is_signed(condcode); let flag_input = InsnInput { insn: branches[0], @@ -2003,7 +2011,7 @@ pub(crate) fn lower_branch>( ctx.emit(Inst::CondBr { taken, not_taken, - kind: CondBrKind::Cond(cond), + kind, }); } else { // If the ifcmp result is actually placed in a @@ -2013,7 +2021,7 @@ pub(crate) fn lower_branch>( ctx.emit(Inst::CondBr { taken, not_taken, - kind: CondBrKind::Cond(cond), + kind, }); } } @@ -2021,6 +2029,7 @@ pub(crate) fn lower_branch>( Opcode::Brff => { let condcode = inst_fp_condcode(ctx.data(branches[0])).unwrap(); let cond = lower_fp_condcode(condcode); + let kind = CondBrKind::Cond(cond); let flag_input = InsnInput { insn: branches[0], input: 0, @@ -2030,7 +2039,7 @@ pub(crate) fn lower_branch>( ctx.emit(Inst::CondBr { taken, not_taken, - kind: CondBrKind::Cond(cond), + kind, }); } else { // If the ffcmp result is actually placed in a @@ -2040,7 +2049,7 @@ pub(crate) fn lower_branch>( ctx.emit(Inst::CondBr { taken, not_taken, - kind: CondBrKind::Cond(cond), + kind, }); } } @@ -2057,12 +2066,13 @@ pub(crate) fn lower_branch>( // fills in `targets[0]` with our fallthrough block, so this // is valid for both Jump and Fallthrough. 
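// Editorial note (not part of this patch): emitting an explicit `Jump` here even for
// `Opcode::Fallthrough` is fine because `MachBuffer::optimize_branches` (added in
// buffer.rs later in this patch) deletes any unconditional branch whose target label
// resolves to the offset immediately after it, so a genuine fallthrough still costs
// zero bytes in the final code (see `test_elide_jump_to_next`).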
ctx.emit(Inst::Jump { - dest: BranchTarget::Block(targets[0]), + dest: BranchTarget::Label(targets[0]), }); } Opcode::BrTable => { // Expand `br_table index, default, JT` to: // + // (emit island with guard jump if needed) // subs idx, #jt_size // b.hs default // adr vTmp1, PC+16 @@ -2072,6 +2082,11 @@ pub(crate) fn lower_branch>( // [jumptable offsets relative to JT base] let jt_size = targets.len() - 1; assert!(jt_size <= std::u32::MAX as usize); + + ctx.emit(Inst::EmitIsland { + needed_space: 4 * (6 + jt_size) as CodeOffset, + }); + let ridx = input_to_reg( ctx, InsnInput { @@ -2101,10 +2116,10 @@ pub(crate) fn lower_branch>( rm: rtmp1.to_reg(), }); } - let default_target = BranchTarget::Block(targets[0]); - ctx.emit(Inst::CondBrLowered { - kind: CondBrKind::Cond(Cond::Hs), // unsigned >= + let default_target = BranchTarget::Label(targets[0]); + ctx.emit(Inst::OneWayCondBr { target: default_target.clone(), + kind: CondBrKind::Cond(Cond::Hs), // unsigned >= }); // Emit the compound instruction that does: @@ -2125,9 +2140,9 @@ pub(crate) fn lower_branch>( let jt_targets: Vec = targets .iter() .skip(1) - .map(|bix| BranchTarget::Block(*bix)) + .map(|bix| BranchTarget::Label(*bix)) .collect(); - let targets_for_term: Vec = targets.to_vec(); + let targets_for_term: Vec = targets.to_vec(); ctx.emit(Inst::JTSequence { ridx, rtmp1, @@ -2140,4 +2155,6 @@ pub(crate) fn lower_branch>( _ => panic!("Unknown branch type!"), } } + + Ok(()) } diff --git a/cranelift/codegen/src/isa/aarch64/mod.rs b/cranelift/codegen/src/isa/aarch64/mod.rs index d377d998c9..3aa8c779aa 100644 --- a/cranelift/codegen/src/isa/aarch64/mod.rs +++ b/cranelift/codegen/src/isa/aarch64/mod.rs @@ -15,7 +15,7 @@ use target_lexicon::{Aarch64Architecture, Architecture, Triple}; // New backend: mod abi; -mod inst; +pub(crate) mod inst; mod lower; mod lower_inst; @@ -59,7 +59,7 @@ impl MachBackend for AArch64Backend { ) -> CodegenResult { let flags = self.flags(); let vcode = self.compile_vcode(func, flags.clone())?; - let sections = vcode.emit(); + let buffer = vcode.emit(); let frame_size = vcode.frame_size(); let disasm = if want_disasm { @@ -68,8 +68,10 @@ impl MachBackend for AArch64Backend { None }; + let buffer = buffer.finish(); + Ok(MachCompileResult { - sections, + buffer, frame_size, disasm, }) @@ -140,8 +142,8 @@ mod test { Triple::from_str("aarch64").unwrap(), settings::Flags::new(shared_flags), ); - let sections = backend.compile_function(&mut func, false).unwrap().sections; - let code = §ions.sections[0].data; + let buffer = backend.compile_function(&mut func, false).unwrap().buffer; + let code = &buffer.data[..]; // stp x29, x30, [sp, #-16]! // mov x29, sp @@ -155,7 +157,7 @@ mod test { 0x01, 0x0b, 0xbf, 0x03, 0x00, 0x91, 0xfd, 0x7b, 0xc1, 0xa8, 0xc0, 0x03, 0x5f, 0xd6, ]; - assert_eq!(code, &golden); + assert_eq!(code, &golden[..]); } #[test] @@ -198,34 +200,32 @@ mod test { let result = backend .compile_function(&mut func, /* want_disasm = */ false) .unwrap(); - let code = &result.sections.sections[0].data; + let code = &result.buffer.data[..]; // stp x29, x30, [sp, #-16]! // mov x29, sp - // mov x1, x0 - // mov x0, #0x1234 - // add w1, w1, w0 - // mov w2, w1 - // cbz x2, ... - // mov w2, w1 - // cbz x2, ... 
- // sub w0, w1, w0 + // mov x1, #0x1234 // #4660 + // add w0, w0, w1 + // mov w1, w0 + // cbnz x1, 0x28 + // mov x1, #0x1234 // #4660 + // add w1, w0, w1 + // mov w1, w1 + // cbnz x1, 0x18 + // mov w1, w0 + // cbnz x1, 0x18 + // mov x1, #0x1234 // #4660 + // sub w0, w0, w1 // mov sp, x29 // ldp x29, x30, [sp], #16 // ret - // add w2, w1, w0 - // mov w2, w2 - // cbnz x2, ... <---- compound branch (cond / uncond) - // b ... <---- - let golden = vec![ - 0xfd, 0x7b, 0xbf, 0xa9, 0xfd, 0x03, 0x00, 0x91, 0xe1, 0x03, 0x00, 0xaa, 0x80, 0x46, - 0x82, 0xd2, 0x21, 0x00, 0x00, 0x0b, 0xe2, 0x03, 0x01, 0x2a, 0xe2, 0x00, 0x00, 0xb4, - 0xe2, 0x03, 0x01, 0x2a, 0xa2, 0x00, 0x00, 0xb5, 0x20, 0x00, 0x00, 0x4b, 0xbf, 0x03, - 0x00, 0x91, 0xfd, 0x7b, 0xc1, 0xa8, 0xc0, 0x03, 0x5f, 0xd6, 0x22, 0x00, 0x00, 0x0b, - 0xe2, 0x03, 0x02, 0x2a, 0xc2, 0xff, 0xff, 0xb5, 0xf7, 0xff, 0xff, 0x17, + 253, 123, 191, 169, 253, 3, 0, 145, 129, 70, 130, 210, 0, 0, 1, 11, 225, 3, 0, 42, 161, + 0, 0, 181, 129, 70, 130, 210, 1, 0, 1, 11, 225, 3, 1, 42, 161, 255, 255, 181, 225, 3, + 0, 42, 97, 255, 255, 181, 129, 70, 130, 210, 0, 0, 1, 75, 191, 3, 0, 145, 253, 123, + 193, 168, 192, 3, 95, 214, ]; - assert_eq!(code, &golden); + assert_eq!(code, &golden[..]); } } diff --git a/cranelift/codegen/src/isa/mod.rs b/cranelift/codegen/src/isa/mod.rs index 6c9a904f03..d193936a91 100644 --- a/cranelift/codegen/src/isa/mod.rs +++ b/cranelift/codegen/src/isa/mod.rs @@ -77,14 +77,14 @@ mod riscv; #[cfg(feature = "x86")] mod x86; -#[cfg(feature = "x64")] -mod x64; +//#[cfg(feature = "x64")] +//mod x64; #[cfg(feature = "arm32")] mod arm32; #[cfg(feature = "arm64")] -mod aarch64; +pub(crate) mod aarch64; #[cfg(feature = "unwind")] pub mod unwind; diff --git a/cranelift/codegen/src/isa/x86/mod.rs b/cranelift/codegen/src/isa/x86/mod.rs index 9386e60310..0cd825b161 100644 --- a/cranelift/codegen/src/isa/x86/mod.rs +++ b/cranelift/codegen/src/isa/x86/mod.rs @@ -57,11 +57,11 @@ fn isa_constructor( let isa_flags = settings::Flags::new(&shared_flags, builder); if isa_flags.use_new_backend() { - #[cfg(not(feature = "x64"))] + //#[cfg(not(feature = "x64"))] panic!("new backend x86 support not included by cargo features!"); - #[cfg(feature = "x64")] - super::x64::isa_builder(triple).finish(shared_flags) + //#[cfg(feature = "x64")] + //super::x64::isa_builder(triple).finish(shared_flags) } else { Box::new(Isa { triple, diff --git a/cranelift/codegen/src/lib.rs b/cranelift/codegen/src/lib.rs index 3483219fea..dd871924ab 100644 --- a/cranelift/codegen/src/lib.rs +++ b/cranelift/codegen/src/lib.rs @@ -99,7 +99,6 @@ mod iterators; mod legalizer; mod licm; mod nan_canonicalization; -mod num_uses; mod partition_slice; mod postopt; mod predicates; diff --git a/cranelift/codegen/src/machinst/blockorder.rs b/cranelift/codegen/src/machinst/blockorder.rs index dd826809c4..104b2f8c15 100644 --- a/cranelift/codegen/src/machinst/blockorder.rs +++ b/cranelift/codegen/src/machinst/blockorder.rs @@ -1,49 +1,579 @@ //! Computation of basic block order in emitted code. +//! +//! This module handles the translation from CLIF BBs to VCode BBs. +//! +//! The basic idea is that we compute a sequence of "lowered blocks" that +//! correspond to subgraphs of the CLIF CFG plus an implicit block on *every* +//! edge (not just critical edges). Conceptually, the lowering pipeline wants to +//! insert moves for phi-nodes on every block-to-block transfer; these blocks +//! always conceptually exist, but may be merged with an "original" CLIF block +//! 
(and hence not actually exist; this is equivalent to inserting the blocks +//! only on critical edges). +//! +//! Each `LoweredBlock` names just an original CLIF block, an original CLIF +//! block prepended or appended with an edge block (never both, though), or just +//! an edge block. +//! +//! To compute this lowering, we do a DFS over the CLIF-plus-edge-block graph +//! (never actually materialized, just defined by a "successors" function), and +//! compute the reverse postorder. +//! +//! This algorithm isn't perfect w.r.t. generated code quality: we don't, for +//! example, consider any information about whether edge blocks will actually +//! have content, because this computation happens as part of lowering *before* +//! regalloc, and regalloc may or may not insert moves/spills/reloads on any +//! particular edge. But it works relatively well and is conceptually simple. +use crate::entity::SecondaryMap; +use crate::fx::{FxHashMap, FxHashSet}; +use crate::ir::{Block, Function, Inst, Opcode}; +use crate::machinst::lower::visit_block_succs; use crate::machinst::*; -/// Simple reverse postorder-based block order emission. -/// -/// TODO: use a proper algorithm, such as the bottom-up straight-line-section -/// construction algorithm. -struct BlockRPO { - visited: Vec, - postorder: Vec, +use log::debug; +use smallvec::SmallVec; + +/// Mapping from CLIF BBs to VCode BBs. +#[derive(Debug)] +pub struct BlockLoweringOrder { + /// Lowered blocks, in BlockIndex order. Each block is some combination of + /// (i) a CLIF block, and (ii) inserted crit-edge blocks before or after; + /// see [LoweredBlock] for details. + lowered_order: Vec, + /// Successors for all lowered blocks, in one serialized vector. Indexed by + /// the ranges in `lowered_succ_ranges`. + lowered_succs: Vec<(Inst, LoweredBlock)>, + /// BlockIndex values for successors for all lowered blocks, in the same + /// order as `lowered_succs`. + lowered_succ_indices: Vec<(Inst, BlockIndex)>, + /// Ranges in `lowered_succs` giving the successor lists for each lowered + /// block. Indexed by lowering-order index (`BlockIndex`). + lowered_succ_ranges: Vec<(usize, usize)>, + /// Mapping from CLIF BB to BlockIndex (index in lowered order). Note that + /// some CLIF BBs may not be lowered; in particular, we skip unreachable + /// blocks. + orig_map: SecondaryMap>, } -impl BlockRPO { - fn new(vcode: &VCode) -> BlockRPO { - BlockRPO { - visited: vec![false; vcode.num_blocks()], - postorder: Vec::with_capacity(vcode.num_blocks()), +/// The origin of a block in the lowered block-order: either an original CLIF +/// block, or an inserted edge-block, or a combination of the two if an edge is +/// non-critical. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +pub enum LoweredBlock { + /// Block in original CLIF, with no merged edge-blocks. + Orig { + /// Original CLIF block. + block: Block, + }, + /// Block in the original CLIF, plus edge-block to one succ (which is the + /// one successor of the original block). + OrigAndEdge { + /// The original CLIF block contained in this lowered block. + block: Block, + /// The edge (jump) instruction transitioning from this block + /// to the next, i.e., corresponding to the included edge-block. This + /// will be an instruction in `block`. + edge_inst: Inst, + /// The successor CLIF block. + succ: Block, + }, + /// Block in the original CLIF, preceded by edge-block from one pred (which + /// is the one pred of the original block). 
+ EdgeAndOrig { + /// The previous CLIF block, i.e., the edge block's predecessor. + pred: Block, + /// The edge (jump) instruction corresponding to the included + /// edge-block. This will be an instruction in `pred`. + edge_inst: Inst, + /// The original CLIF block included in this lowered block. + block: Block, + }, + /// Split critical edge between two CLIF blocks. This lowered block does not + /// correspond to any original CLIF blocks; it only serves as an insertion + /// point for work to happen on the transition from `pred` to `succ`. + Edge { + /// The predecessor CLIF block. + pred: Block, + /// The edge (jump) instruction corresponding to this edge's transition. + /// This will be an instruction in `pred`. + edge_inst: Inst, + /// The successor CLIF block. + succ: Block, + }, +} + +impl LoweredBlock { + /// The associated original (CLIF) block included in this lowered block, if + /// any. + pub fn orig_block(self) -> Option { + match self { + LoweredBlock::Orig { block, .. } + | LoweredBlock::OrigAndEdge { block, .. } + | LoweredBlock::EdgeAndOrig { block, .. } => Some(block), + LoweredBlock::Edge { .. } => None, } } - fn visit(&mut self, vcode: &VCode, block: BlockIndex) { - self.visited[block as usize] = true; - for succ in vcode.succs(block) { - if !self.visited[succ.get() as usize] { - self.visit(vcode, succ.get()); + /// The associated in-edge, if any. + pub fn in_edge(self) -> Option<(Block, Inst, Block)> { + match self { + LoweredBlock::EdgeAndOrig { + pred, + edge_inst, + block, + } => Some((pred, edge_inst, block)), + _ => None, + } + } + + /// the associated out-edge, if any. Also includes edge-only blocks. + pub fn out_edge(self) -> Option<(Block, Inst, Block)> { + match self { + LoweredBlock::OrigAndEdge { + block, + edge_inst, + succ, + } => Some((block, edge_inst, succ)), + LoweredBlock::Edge { + pred, + edge_inst, + succ, + } => Some((pred, edge_inst, succ)), + _ => None, + } + } +} + +impl BlockLoweringOrder { + /// Compute and return a lowered block order for `f`. + pub fn new(f: &Function) -> BlockLoweringOrder { + debug!("BlockLoweringOrder: function body {:?}", f); + + // Step 1: compute the in-edge and out-edge count of every block. + let mut block_in_count = SecondaryMap::with_default(0); + let mut block_out_count = SecondaryMap::with_default(0); + + // Cache the block successors to avoid re-examining branches below. + let mut block_succs: SmallVec<[(Inst, Block); 128]> = SmallVec::new(); + let mut block_succ_range = SecondaryMap::with_default((0, 0)); + let mut fallthrough_return_block = None; + for block in f.layout.blocks() { + let block_succ_start = block_succs.len(); + visit_block_succs(f, block, |inst, succ| { + block_out_count[block] += 1; + block_in_count[succ] += 1; + block_succs.push((inst, succ)); + }); + let block_succ_end = block_succs.len(); + block_succ_range[block] = (block_succ_start, block_succ_end); + + for inst in f.layout.block_likely_branches(block) { + if f.dfg[inst].opcode() == Opcode::Return { + // Implicit output edge for any return. + block_out_count[block] += 1; + } + if f.dfg[inst].opcode() == Opcode::FallthroughReturn { + // Fallthrough return block must come last. + debug_assert!(fallthrough_return_block == None); + fallthrough_return_block = Some(block); + } } } - if Some(block) != vcode.fallthrough_return_block { - self.postorder.push(block); + // Implicit input edge for entry block. + if let Some(entry) = f.layout.entry_block() { + block_in_count[entry] += 1; } + + // Here we define the implicit CLIF-plus-edges graph. 
There are + // conceptually two such graphs: the original, with every edge explicit, + // and the merged one, with blocks (represented by `LoweredBlock` + // values) that contain original CLIF blocks, edges, or both. This + // function returns a lowered block's successors as per the latter, with + // consideration to edge-block merging. + // + // Note that there is a property of the block-merging rules below + // that is very important to ensure we don't miss any lowered blocks: + // any block in the implicit CLIF-plus-edges graph will *only* be + // included in one block in the merged graph. + // + // This, combined with the property that every edge block is reachable + // only from one predecessor (and hence cannot be reached by a DFS + // backedge), means that it is sufficient in our DFS below to track + // visited-bits per original CLIF block only, not per edge. This greatly + // simplifies the data structures (no need to keep a sparse hash-set of + // (block, block) tuples). + let compute_lowered_succs = |ret: &mut Vec<(Inst, LoweredBlock)>, block: LoweredBlock| { + let start_idx = ret.len(); + match block { + LoweredBlock::Orig { block } | LoweredBlock::EdgeAndOrig { block, .. } => { + // At an orig block; successors are always edge blocks, + // possibly with orig blocks following. + let range = block_succ_range[block]; + for &(edge_inst, succ) in &block_succs[range.0..range.1] { + if block_in_count[succ] == 1 { + ret.push(( + edge_inst, + LoweredBlock::EdgeAndOrig { + pred: block, + edge_inst, + block: succ, + }, + )); + } else { + ret.push(( + edge_inst, + LoweredBlock::Edge { + pred: block, + edge_inst, + succ, + }, + )); + } + } + } + LoweredBlock::Edge { + succ, edge_inst, .. + } + | LoweredBlock::OrigAndEdge { + succ, edge_inst, .. + } => { + // At an edge block; successors are always orig blocks, + // possibly with edge blocks following. + if block_out_count[succ] == 1 { + let range = block_succ_range[succ]; + // check if the one succ is a real CFG edge (vs. + // implicit return succ). + if range.1 - range.0 > 0 { + debug_assert!(range.1 - range.0 == 1); + let (succ_edge_inst, succ_succ) = block_succs[range.0]; + ret.push(( + edge_inst, + LoweredBlock::OrigAndEdge { + block: succ, + edge_inst: succ_edge_inst, + succ: succ_succ, + }, + )); + } else { + ret.push((edge_inst, LoweredBlock::Orig { block: succ })); + } + } else { + ret.push((edge_inst, LoweredBlock::Orig { block: succ })); + } + } + } + let end_idx = ret.len(); + (start_idx, end_idx) + }; + + // Build the explicit LoweredBlock-to-LoweredBlock successors list. + let mut lowered_succs = vec![]; + let mut lowered_succ_indices = vec![]; + + // Step 2: Compute RPO traversal of the implicit CLIF-plus-edge-block graph. Use an + // explicit stack so we don't overflow the real stack with a deep DFS. + #[derive(Debug)] + struct StackEntry { + this: LoweredBlock, + succs: (usize, usize), // range in lowered_succs + cur_succ: usize, // index in lowered_succs + } + + let mut stack: SmallVec<[StackEntry; 16]> = SmallVec::new(); + let mut visited = FxHashSet::default(); + let mut postorder = vec![]; + if let Some(entry) = f.layout.entry_block() { + // FIXME(cfallin): we might be able to use OrigAndEdge. Find a way + // to not special-case the entry block here. 
+ let block = LoweredBlock::Orig { block: entry }; + visited.insert(block); + let range = compute_lowered_succs(&mut lowered_succs, block); + lowered_succ_indices.resize(lowered_succs.len(), 0); + stack.push(StackEntry { + this: block, + succs: range, + cur_succ: range.1, + }); + } + + let mut deferred_last = None; + while !stack.is_empty() { + let stack_entry = stack.last_mut().unwrap(); + let range = stack_entry.succs; + if stack_entry.cur_succ == range.0 { + let orig_block = stack_entry.this.orig_block(); + if orig_block.is_some() && orig_block == fallthrough_return_block { + deferred_last = Some((stack_entry.this, range)); + } else { + postorder.push((stack_entry.this, range)); + } + stack.pop(); + } else { + // Heuristic: chase the children in reverse. This puts the first + // successor block first in RPO, all other things being equal, + // which tends to prioritize loop backedges over out-edges, + // putting the edge-block closer to the loop body and minimizing + // live-ranges in linear instruction space. + let next = lowered_succs[stack_entry.cur_succ - 1].1; + stack_entry.cur_succ -= 1; + if visited.contains(&next) { + continue; + } + visited.insert(next); + let range = compute_lowered_succs(&mut lowered_succs, next); + lowered_succ_indices.resize(lowered_succs.len(), 0); + stack.push(StackEntry { + this: next, + succs: range, + cur_succ: range.1, + }); + } + } + + postorder.reverse(); + let mut rpo = postorder; + if let Some(d) = deferred_last { + rpo.push(d); + } + + // Step 3: now that we have RPO, build the BlockIndex/BB fwd/rev maps. + let mut lowered_order = vec![]; + let mut lowered_succ_ranges = vec![]; + let mut lb_to_bindex = FxHashMap::default(); + for (block, succ_range) in rpo.into_iter() { + lb_to_bindex.insert(block, lowered_order.len() as BlockIndex); + lowered_order.push(block); + lowered_succ_ranges.push(succ_range); + } + + let lowered_succ_indices = lowered_succs + .iter() + .map(|&(inst, succ)| (inst, lb_to_bindex.get(&succ).cloned().unwrap())) + .collect(); + + let mut orig_map = SecondaryMap::with_default(None); + for (i, lb) in lowered_order.iter().enumerate() { + let i = i as BlockIndex; + if let Some(b) = lb.orig_block() { + orig_map[b] = Some(i); + } + } + + let result = BlockLoweringOrder { + lowered_order, + lowered_succs, + lowered_succ_indices, + lowered_succ_ranges, + orig_map, + }; + debug!("BlockLoweringOrder: {:?}", result); + result } - fn rpo(self, vcode: &VCode) -> Vec { - let mut rpo = self.postorder; - rpo.reverse(); - if let Some(block) = vcode.fallthrough_return_block { - rpo.push(block); - } - rpo + /// Get the lowered order of blocks. + pub fn lowered_order(&self) -> &[LoweredBlock] { + &self.lowered_order[..] + } + + /// Get the successors for a lowered block, by index in `lowered_order()`'s + /// returned slice. Each successsor is paired with the edge-instruction + /// (branch) corresponding to this edge. + pub fn succs(&self, block: BlockIndex) -> &[(Inst, LoweredBlock)] { + let range = self.lowered_succ_ranges[block as usize]; + &self.lowered_succs[range.0..range.1] + } + + /// Get the successor indices for a lowered block. + pub fn succ_indices(&self, block: BlockIndex) -> &[(Inst, BlockIndex)] { + let range = self.lowered_succ_ranges[block as usize]; + &self.lowered_succ_indices[range.0..range.1] + } + + /// Get the lowered block index containing a CLIF block, if any. (May not be + /// present if the original CLIF block was unreachable.) 
+ pub fn lowered_block_for_bb(&self, bb: Block) -> Option { + self.orig_map[bb] } } -/// Compute the final block order. -pub fn compute_final_block_order(vcode: &VCode) -> Vec { - let mut rpo = BlockRPO::new(vcode); - rpo.visit(vcode, vcode.entry()); - rpo.rpo(vcode) +#[cfg(test)] +mod test { + use super::*; + use crate::cursor::{Cursor, FuncCursor}; + use crate::ir::types::*; + use crate::ir::{AbiParam, ExternalName, Function, InstBuilder, Signature}; + use crate::isa::CallConv; + + fn build_test_func(n_blocks: usize, edges: &[(usize, usize)]) -> Function { + assert!(n_blocks > 0); + + let name = ExternalName::testcase("test0"); + let mut sig = Signature::new(CallConv::SystemV); + sig.params.push(AbiParam::new(I32)); + let mut func = Function::with_name_signature(name, sig); + let blocks = (0..n_blocks) + .map(|i| { + let bb = func.dfg.make_block(); + assert!(bb.as_u32() == i as u32); + bb + }) + .collect::>(); + + let arg0 = func.dfg.append_block_param(blocks[0], I32); + + let mut pos = FuncCursor::new(&mut func); + + let mut edge = 0; + for i in 0..n_blocks { + pos.insert_block(blocks[i]); + let mut succs = vec![]; + while edge < edges.len() && edges[edge].0 == i { + succs.push(edges[edge].1); + edge += 1; + } + if succs.len() == 0 { + pos.ins().return_(&[arg0]); + } else if succs.len() == 1 { + pos.ins().jump(blocks[succs[0]], &[]); + } else if succs.len() == 2 { + pos.ins().brnz(arg0, blocks[succs[0]], &[]); + pos.ins().jump(blocks[succs[1]], &[]); + } else { + panic!("Too many successors"); + } + } + + func + } + + #[test] + fn test_blockorder_diamond() { + let func = build_test_func(4, &[(0, 1), (0, 2), (1, 3), (2, 3)]); + let order = BlockLoweringOrder::new(&func); + + assert_eq!(order.lowered_order.len(), 6); + + assert!(order.lowered_order[0].orig_block().unwrap().as_u32() == 0); + assert!(order.lowered_order[0].in_edge().is_none()); + assert!(order.lowered_order[0].out_edge().is_none()); + + assert!(order.lowered_order[1].orig_block().unwrap().as_u32() == 1); + assert!(order.lowered_order[1].in_edge().unwrap().0.as_u32() == 0); + assert!(order.lowered_order[1].in_edge().unwrap().2.as_u32() == 1); + + assert!(order.lowered_order[2].orig_block().is_none()); + assert!(order.lowered_order[2].in_edge().is_none()); + assert!(order.lowered_order[2].out_edge().unwrap().0.as_u32() == 1); + assert!(order.lowered_order[2].out_edge().unwrap().2.as_u32() == 3); + + assert!(order.lowered_order[3].orig_block().unwrap().as_u32() == 2); + assert!(order.lowered_order[3].in_edge().unwrap().0.as_u32() == 0); + assert!(order.lowered_order[3].in_edge().unwrap().2.as_u32() == 2); + assert!(order.lowered_order[3].out_edge().is_none()); + + assert!(order.lowered_order[4].orig_block().is_none()); + assert!(order.lowered_order[4].in_edge().is_none()); + assert!(order.lowered_order[4].out_edge().unwrap().0.as_u32() == 2); + assert!(order.lowered_order[4].out_edge().unwrap().2.as_u32() == 3); + + assert!(order.lowered_order[5].orig_block().unwrap().as_u32() == 3); + assert!(order.lowered_order[5].in_edge().is_none()); + assert!(order.lowered_order[5].out_edge().is_none()); + } + + #[test] + fn test_blockorder_critedge() { + // 0 + // / \ + // 1 2 + // / \ \ + // 3 4 | + // |\ _|____| + // | \/ | + // | /\ | + // 5 6 + // + // (3 -> 5, 3 -> 6, 4 -> 6 are critical edges and must be split) + // + let func = build_test_func( + 7, + &[ + (0, 1), + (0, 2), + (1, 3), + (1, 4), + (2, 5), + (3, 5), + (3, 6), + (4, 6), + ], + ); + let order = BlockLoweringOrder::new(&func); + + assert_eq!(order.lowered_order.len(), 
11); + println!("ordered = {:?}", order.lowered_order); + + // block 0 + assert!(order.lowered_order[0].orig_block().unwrap().as_u32() == 0); + assert!(order.lowered_order[0].in_edge().is_none()); + assert!(order.lowered_order[0].out_edge().is_none()); + + // edge 0->1 + block 1 + assert!(order.lowered_order[1].orig_block().unwrap().as_u32() == 1); + assert!(order.lowered_order[1].in_edge().unwrap().0.as_u32() == 0); + assert!(order.lowered_order[1].in_edge().unwrap().2.as_u32() == 1); + assert!(order.lowered_order[1].out_edge().is_none()); + + // edge 1->3 + block 3 + assert!(order.lowered_order[2].orig_block().unwrap().as_u32() == 3); + assert!(order.lowered_order[2].in_edge().unwrap().0.as_u32() == 1); + assert!(order.lowered_order[2].in_edge().unwrap().2.as_u32() == 3); + assert!(order.lowered_order[2].out_edge().is_none()); + + // edge 3->5 + assert!(order.lowered_order[3].orig_block().is_none()); + assert!(order.lowered_order[3].in_edge().is_none()); + assert!(order.lowered_order[3].out_edge().unwrap().0.as_u32() == 3); + assert!(order.lowered_order[3].out_edge().unwrap().2.as_u32() == 5); + + // edge 3->6 + assert!(order.lowered_order[4].orig_block().is_none()); + assert!(order.lowered_order[4].in_edge().is_none()); + assert!(order.lowered_order[4].out_edge().unwrap().0.as_u32() == 3); + assert!(order.lowered_order[4].out_edge().unwrap().2.as_u32() == 6); + + // edge 1->4 + block 4 + assert!(order.lowered_order[5].orig_block().unwrap().as_u32() == 4); + assert!(order.lowered_order[5].in_edge().unwrap().0.as_u32() == 1); + assert!(order.lowered_order[5].in_edge().unwrap().2.as_u32() == 4); + assert!(order.lowered_order[5].out_edge().is_none()); + + // edge 4->6 + assert!(order.lowered_order[6].orig_block().is_none()); + assert!(order.lowered_order[6].in_edge().is_none()); + assert!(order.lowered_order[6].out_edge().unwrap().0.as_u32() == 4); + assert!(order.lowered_order[6].out_edge().unwrap().2.as_u32() == 6); + + // block 6 + assert!(order.lowered_order[7].orig_block().unwrap().as_u32() == 6); + assert!(order.lowered_order[7].in_edge().is_none()); + assert!(order.lowered_order[7].out_edge().is_none()); + + // edge 0->2 + block 2 + assert!(order.lowered_order[8].orig_block().unwrap().as_u32() == 2); + assert!(order.lowered_order[8].in_edge().unwrap().0.as_u32() == 0); + assert!(order.lowered_order[8].in_edge().unwrap().2.as_u32() == 2); + assert!(order.lowered_order[8].out_edge().is_none()); + + // edge 2->5 + assert!(order.lowered_order[9].orig_block().is_none()); + assert!(order.lowered_order[9].in_edge().is_none()); + assert!(order.lowered_order[9].out_edge().unwrap().0.as_u32() == 2); + assert!(order.lowered_order[9].out_edge().unwrap().2.as_u32() == 5); + + // block 5 + assert!(order.lowered_order[10].orig_block().unwrap().as_u32() == 5); + assert!(order.lowered_order[10].in_edge().is_none()); + assert!(order.lowered_order[10].out_edge().is_none()); + } } diff --git a/cranelift/codegen/src/machinst/buffer.rs b/cranelift/codegen/src/machinst/buffer.rs new file mode 100644 index 0000000000..b9e3bb3c1e --- /dev/null +++ b/cranelift/codegen/src/machinst/buffer.rs @@ -0,0 +1,1035 @@ +//! In-memory representation of compiled machine code, with labels and fixups to +//! refer to those labels. Handles constant-pool island insertion and also +//! veneer insertion for out-of-range jumps. 
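// Editorial sketch (not part of this patch): the emission pattern the island machinery
// below is designed for, mirroring `test_island` further down. `worst_case_inst_size`
// is a hypothetical per-backend bound, not an API defined in this file.
//
//     // Before emitting each instruction, check whether pending constants/veneers
//     // would drift out of range if emission continued, and if so flush them here.
//     if buf.island_needed(worst_case_inst_size) {
//         buf.emit_island();
//     }
//     inst.emit(&mut buf, &flags, &mut state); // ordinary per-instruction emission
//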
+ +use crate::binemit::{Addend, CodeOffset, CodeSink, Reloc}; +use crate::ir::{ExternalName, Opcode, SourceLoc, TrapCode}; +use crate::machinst::{BlockIndex, MachInstLabelUse, VCodeInst}; + +use log::debug; +use smallvec::SmallVec; +use std::mem; + +/// A buffer of output to be produced, fixed up, and then emitted to a CodeSink +/// in bulk. +/// +/// This struct uses `SmallVec`s to support small-ish function bodies without +/// any heap allocation. As such, it will be several kilobytes large. This is +/// likely fine as long as it is stack-allocated for function emission then +/// thrown away; but beware if many buffer objects are retained persistently. +pub struct MachBuffer { + /// The buffer contents, as raw bytes. + data: SmallVec<[u8; 1024]>, + /// Any relocations referring to this code. Note that only *external* + /// relocations are tracked here; references to labels within the buffer are + /// resolved before emission. + relocs: SmallVec<[MachReloc; 16]>, + /// Any trap records referring to this code. + traps: SmallVec<[MachTrap; 16]>, + /// Any call site records referring to this code. + call_sites: SmallVec<[MachCallSite; 16]>, + /// Any source location mappings referring to this code. + srclocs: SmallVec<[MachSrcLoc; 64]>, + /// The current source location in progress (after `start_srcloc()` and + /// before `end_srcloc()`). This is a (start_offset, src_loc) tuple. + cur_srcloc: Option<(CodeOffset, SourceLoc)>, + /// Known label offsets; `UNKNOWN_LABEL_OFFSET` if unknown. + label_offsets: SmallVec<[CodeOffset; 16]>, + /// Label aliases: one label points to an unconditional jump to another + /// label, so references to the first should be resolved as references + /// to the second. (We don't chase arbitrarily deep to avoid problems + /// with cycles.) + label_aliases: SmallVec<[MachLabel; 16]>, + /// Constants that must be emitted at some point. + pending_constants: SmallVec<[MachLabelConstant; 16]>, + /// Fixups that must be performed after all code is emitted. + fixup_records: SmallVec<[MachLabelFixup; 16]>, + /// Current deadline at which all constants are flushed and all code labels + /// are extended by emitting long-range jumps in an island. This flush + /// should be rare (e.g., on AArch64, the shortest-range PC-rel references + /// are +/- 1MB for conditional jumps and load-literal instructions), so + /// it's acceptable to track a minimum and flush-all rather than doing more + /// detailed "current minimum" / sort-by-deadline trickery. + island_deadline: CodeOffset, + /// How many bytes are needed in the worst case for an island, given all + /// pending constants and fixups. + island_worst_case_size: CodeOffset, + /// Latest branches, to facilitate in-place editing for better fallthrough + /// behavior and empty-block removal. + latest_branches: SmallVec<[MachBranch; 4]>, + /// All labels, in offset order. + labels_by_offset: SmallVec<[(MachLabel, CodeOffset); 16]>, +} + +/// A `MachBuffer` once emission is completed: holds generated code and records, +/// without fixups. This allows the type to be independent of the backend. +pub struct MachBufferFinalized { + /// The buffer contents, as raw bytes. + pub data: SmallVec<[u8; 1024]>, + /// Any relocations referring to this code. Note that only *external* + /// relocations are tracked here; references to labels within the buffer are + /// resolved before emission. + relocs: SmallVec<[MachReloc; 16]>, + /// Any trap records referring to this code. 
+ traps: SmallVec<[MachTrap; 16]>, + /// Any call site records referring to this code. + call_sites: SmallVec<[MachCallSite; 16]>, + /// Any source location mappings referring to this code. + srclocs: SmallVec<[MachSrcLoc; 64]>, +} + +static UNKNOWN_LABEL_OFFSET: CodeOffset = 0xffff_ffff; +static UNKNOWN_LABEL: MachLabel = MachLabel(0xffff_ffff); + +/// A label refers to some offset in a `MachBuffer`. It may not be resolved at +/// the point at which it is used by emitted code; the buffer records "fixups" +/// for references to the label, and will come back and patch the code +/// appropriately when the label's location is eventually known. +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct MachLabel(u32); + +impl MachLabel { + /// Get a label for a block. (The first N MachLabels are always reseved for + /// the N blocks in the vcode.) + pub fn from_block(bindex: BlockIndex) -> MachLabel { + MachLabel(bindex) + } + + /// Get the numeric label index. + pub fn get(self) -> u32 { + self.0 + } +} + +impl MachBuffer { + /// Create a new section, known to start at `start_offset` and with a size limited to `length_limit`. + pub fn new() -> MachBuffer { + MachBuffer { + data: SmallVec::new(), + relocs: SmallVec::new(), + traps: SmallVec::new(), + call_sites: SmallVec::new(), + srclocs: SmallVec::new(), + cur_srcloc: None, + label_offsets: SmallVec::new(), + label_aliases: SmallVec::new(), + pending_constants: SmallVec::new(), + fixup_records: SmallVec::new(), + island_deadline: UNKNOWN_LABEL_OFFSET, + island_worst_case_size: 0, + latest_branches: SmallVec::new(), + labels_by_offset: SmallVec::new(), + } + } + + /// Current offset from start of buffer. + pub fn cur_offset(&self) -> CodeOffset { + self.data.len() as CodeOffset + } + + /// Add a byte. + pub fn put1(&mut self, value: u8) { + debug!("MachBuffer: put byte @ {}: {:x}", self.cur_offset(), value); + self.data.push(value); + } + + /// Add 2 bytes. + pub fn put2(&mut self, value: u16) { + debug!( + "MachBuffer: put 16-bit word @ {}: {:x}", + self.cur_offset(), + value + ); + let bytes = value.to_le_bytes(); + self.data.extend_from_slice(&bytes[..]); + } + + /// Add 4 bytes. + pub fn put4(&mut self, value: u32) { + debug!( + "MachBuffer: put 32-bit word @ {}: {:x}", + self.cur_offset(), + value + ); + let bytes = value.to_le_bytes(); + self.data.extend_from_slice(&bytes[..]); + } + + /// Add 8 bytes. + pub fn put8(&mut self, value: u64) { + debug!( + "MachBuffer: put 64-bit word @ {}: {:x}", + self.cur_offset(), + value + ); + let bytes = value.to_le_bytes(); + self.data.extend_from_slice(&bytes[..]); + } + + /// Add a slice of bytes. + pub fn put_data(&mut self, data: &[u8]) { + debug!( + "MachBuffer: put data @ {}: len {}", + self.cur_offset(), + data.len() + ); + self.data.extend_from_slice(data); + } + + /// Reserve appended space and return a mutable slice referring to it. + pub fn get_appended_space(&mut self, len: usize) -> &mut [u8] { + debug!("MachBuffer: put data @ {}: len {}", self.cur_offset(), len); + let off = self.data.len(); + let new_len = self.data.len() + len; + self.data.resize(new_len, 0); + &mut self.data[off..] + } + + /// Align up to the given alignment. + pub fn align_to(&mut self, align_to: CodeOffset) { + debug!("MachBuffer: align to {}", align_to); + assert!(align_to.is_power_of_two()); + while self.cur_offset() & (align_to - 1) != 0 { + self.put1(0); + } + } + + /// Allocate a `Label` to refer to some offset. May not be bound to a fixed + /// offset yet. 
+ pub fn get_label(&mut self) -> MachLabel { + let l = self.label_offsets.len() as u32; + self.label_offsets.push(UNKNOWN_LABEL_OFFSET); + self.label_aliases.push(UNKNOWN_LABEL); + debug!("MachBuffer: new label -> {:?}", MachLabel(l)); + MachLabel(l) + } + + /// Reserve the first N MachLabels for blocks. + pub fn reserve_labels_for_blocks(&mut self, blocks: BlockIndex) { + debug!("MachBuffer: first {} labels are for blocks", blocks); + debug_assert!(self.label_offsets.is_empty()); + self.label_offsets + .resize(blocks as usize, UNKNOWN_LABEL_OFFSET); + self.label_aliases.resize(blocks as usize, UNKNOWN_LABEL); + } + + /// Bind a label to the current offset. + pub fn bind_label(&mut self, label: MachLabel) { + debug!( + "MachBuffer: bind label {:?} at offset {}", + label, + self.cur_offset() + ); + let offset = self.cur_offset(); + self.label_offsets[label.0 as usize] = offset; + self.labels_by_offset.push((label, offset)); + self.optimize_branches(); + } + + /// Resolve a label to an offset, if known. May return `UNKNOWN_LABEL_OFFSET`. + fn resolve_label_offset(&self, label: MachLabel) -> CodeOffset { + let alias = self.label_aliases[label.0 as usize]; + if alias != UNKNOWN_LABEL { + self.label_offsets[alias.0 as usize] + } else { + self.label_offsets[label.0 as usize] + } + } + + /// Emit a reference to the given label with the given reference type (i.e., + /// branch-instruction format) at the current offset. This is like a + /// relocation, but handled internally. + /// + /// Because the offset of the label may already be known and the patch may + /// happen immediately, the buffer must already contain bytes at `offset` up + /// to `offset + kind.patch_size()`. + pub fn use_label_at_offset(&mut self, offset: CodeOffset, label: MachLabel, kind: I::LabelUse) { + debug!( + "MachBuffer: use_label_at_offset: offset {} label {:?} kind {:?}", + offset, label, kind + ); + debug_assert!(offset + kind.patch_size() <= self.cur_offset()); + + // Add the fixup, and update the worst-case island size based on a + // veneer for this label use. + self.fixup_records.push(MachLabelFixup { + label, + offset, + kind, + }); + if kind.supports_veneer() { + self.island_worst_case_size += kind.veneer_size(); + self.island_worst_case_size &= !(I::LabelUse::ALIGN - 1); + } + let deadline = offset + kind.max_pos_range(); + if deadline < self.island_deadline { + self.island_deadline = deadline; + } + } + + /// Inform the buffer of an unconditional branch at the given offset, + /// targetting the given label. May be used to optimize branches. + /// The last added label-use must correspond to this branch. + pub fn add_uncond_branch(&mut self, start: CodeOffset, end: CodeOffset, target: MachLabel) { + assert!(!self.fixup_records.is_empty()); + let fixup = self.fixup_records.len() - 1; + self.latest_branches.push(MachBranch { + start, + end, + target, + fixup, + inverted: None, + }); + } + + /// Inform the buffer of a conditional branch at the given offset, + /// targetting the given label. May be used to optimize branches. + /// The last added label-use must correspond to this branch. 
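// Editorial sketch (not part of this patch): what "the last added label-use must
// correspond to this branch" means for a backend's emitter. `enc` and `enc_inverted`
// are hypothetical 32-bit encodings of the branch and its condition-inverted form, and
// `kind` stands in for the backend's `LabelUse` flavor for this branch format.
//
//     let start = buf.cur_offset();
//     buf.put4(enc);                                 // emit the branch bytes
//     buf.use_label_at_offset(start, target, kind);  // fixup recorded last...
//     let end = buf.cur_offset();
//     buf.add_cond_branch(start, end, target, &enc_inverted.to_le_bytes());
//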
+ pub fn add_cond_branch( + &mut self, + start: CodeOffset, + end: CodeOffset, + target: MachLabel, + inverted: &[u8], + ) { + assert!(!self.fixup_records.is_empty()); + let fixup = self.fixup_records.len() - 1; + let inverted = Some(SmallVec::from(inverted)); + self.latest_branches.push(MachBranch { + start, + end, + target, + fixup, + inverted, + }); + } + + fn truncate_last_branch(&mut self) { + let b = self.latest_branches.pop().unwrap(); + assert!(b.end == self.cur_offset()); + self.data.truncate(b.start as usize); + self.fixup_records.truncate(b.fixup); + let cur_off = self.cur_offset(); + debug!( + "truncate_last_branch: truncated {:?}; off now {}", + b, cur_off + ); + for &mut (l, ref mut off) in self.labels_by_offset.iter_mut().rev() { + if *off > cur_off { + *off = cur_off; + debug!(" -> label {:?} reassigned to {}", l, cur_off); + self.label_offsets[l.0 as usize] = cur_off; + } else { + break; + } + } + } + + fn optimize_branches(&mut self) { + debug!( + "enter optimize_branches:\n b = {:?}\n l = {:?}\n f = {:?}", + self.latest_branches, self.labels_by_offset, self.fixup_records + ); + while let Some(b) = self.latest_branches.last() { + let cur_off = self.cur_offset(); + debug!("optimize_branches: last branch {:?} at off {}", b, cur_off); + // If there has been any code emission since the end of the last branch or + // label definition, then there's nothing we can edit (because we + // don't move code once placed, only back up and overwrite), so + // clear the records and finish. + if b.end < cur_off { + break; + } + + // If latest is an unconditional branch: + // - For each label at this point, make the label an alias of + // the branch target. We can now assume below that the + // unconditional branch is reachable only via fallthrough, and we + // are free to remove it in an optimization. + // - If there is a prior unconditional branch that ends just before + // this one begins, then we can truncate this branch, because it is + // entirely unreachable (due to above). Trim the end of the + // `labels_by_offset` array and continue around the loop. + // - If there is a prior conditional branch whose target label + // resolves to the current offset (branches around the + // unconditional branch), then remove the unconditional branch, + // and make the target of the unconditional the target of the + // conditional instead. + if b.is_uncond() { + // Set any label equal to current branch's start as an alias of + // the branch's target. + for &(l, off) in self.labels_by_offset.iter().rev() { + debug!(" -> uncond: latest label {:?} at off {}", l, off); + if off > b.start { + continue; + } else if off == b.start { + debug!(" -> setting alias to {:?}", b.target); + self.label_aliases[l.0 as usize] = b.target; + } else { + break; + } + } + + // If the branch target is the next offset, + + // Examine any immediately preceding branch. + if self.latest_branches.len() > 1 { + let prev_b = &self.latest_branches[self.latest_branches.len() - 2]; + debug!(" -> more than one branch; prev_b = {:?}", prev_b); + // This uncond is immediately after another uncond; we've + // already redirected labels to this uncond away; so we can + // truncate this uncond. 
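// Editorial example (not part of this patch): the shape handled by the next check.
//
//     b  L_a     <- prev_b (unconditional)
//     b  L_b     <- b (latest branch): reachable only by falling through prev_b, which
//                   never falls through, so its bytes are dead. Labels bound at b's
//                   start were already aliased to L_b just above, so truncating b
//                   loses nothing.
//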
+ if prev_b.is_uncond() && prev_b.end == b.start { + debug!(" -> uncond follows another uncond; truncating"); + self.truncate_last_branch(); + continue; + } + + // This uncond is immediately after a conditional, and the + // conditional's target is the end of this uncond, and we've + // already redirected labels to this uncond away; so we can + // truncate this uncond, flip the sense of the conditional, and + // set the conditional's target (in `latest_branches` and in + // `fixup_records`) to the uncond's target. + if prev_b.is_cond() + && prev_b.end == b.start + && self.resolve_label_offset(prev_b.target) == cur_off + { + debug!(" -> uncond follows a conditional, and conditional's target resolves to current offset"); + let target = b.target; + let data = prev_b.inverted.clone().unwrap(); + self.truncate_last_branch(); + let prev_b = self.latest_branches.last_mut().unwrap(); + let not_inverted = SmallVec::from( + &self.data[(prev_b.start as usize)..(prev_b.end as usize)], + ); + self.data.truncate(prev_b.start as usize); + self.data.extend_from_slice(&data[..]); + prev_b.inverted = Some(not_inverted); + self.fixup_records[prev_b.fixup].label = target; + debug!(" -> reassigning target of condbr to {:?}", target); + prev_b.target = target; + continue; + } + } + } + + // For any branch, conditional or unconditional: + // - If the target is a label at the current offset, then remove + // the conditional branch, and reset all labels that targetted + // the current offset (end of branch) to the truncated + // end-of-code. + if self.resolve_label_offset(b.target) == cur_off { + debug!("branch with target == cur off; truncating"); + self.truncate_last_branch(); + } + + // If we couldn't do anything with the last branch, then break. + break; + } + + self.purge_latest_branches(); + + debug!( + "leave optimize_branches:\n b = {:?}\n l = {:?}\n f = {:?}", + self.latest_branches, self.labels_by_offset, self.fixup_records + ); + } + + fn purge_latest_branches(&mut self) { + let cur_off = self.cur_offset(); + if let Some(l) = self.latest_branches.last() { + if l.end < cur_off { + debug!("purge_latest_branches: removing branch {:?}", l); + self.latest_branches.clear(); + } + } + } + + /// Emit a constant at some point in the future, binding the given label to + /// its offset. The constant will be placed at most `max_distance` from the + /// current offset. + pub fn defer_constant( + &mut self, + label: MachLabel, + align: CodeOffset, + data: &[u8], + max_distance: CodeOffset, + ) { + let deadline = self.cur_offset() + max_distance; + self.island_worst_case_size += data.len() as CodeOffset; + self.island_worst_case_size &= !(I::LabelUse::ALIGN - 1); + self.pending_constants.push(MachLabelConstant { + label, + align, + data: SmallVec::from(data), + }); + if deadline < self.island_deadline { + self.island_deadline = deadline; + } + } + + /// Is an island needed within the next N bytes? + pub fn island_needed(&self, distance: CodeOffset) -> bool { + let worst_case_end_of_island = self.cur_offset() + distance + self.island_worst_case_size; + worst_case_end_of_island > self.island_deadline + } + + /// Emit all pending constants and veneers. Should only be called if + /// `island_needed()` returns true, i.e., if we actually reach a deadline: + /// otherwise, unnecessary veneers may be inserted. + pub fn emit_island(&mut self) { + // We're going to purge fixups, so no latest-branch editing can happen + // anymore. 
+ self.latest_branches.clear(); + + let pending_constants = mem::replace(&mut self.pending_constants, SmallVec::new()); + for MachLabelConstant { label, align, data } in pending_constants.into_iter() { + self.align_to(align); + self.bind_label(label); + self.put_data(&data[..]); + } + + let fixup_records = mem::replace(&mut self.fixup_records, SmallVec::new()); + let mut new_fixups = SmallVec::new(); + for MachLabelFixup { + label, + offset, + kind, + } in fixup_records.into_iter() + { + debug!( + "emit_island: fixup for label {:?} at offset {} kind {:?}", + label, offset, kind + ); + // We eagerly perform fixups whose label targets are known, if not out + // of range, to avoid unnecessary veneers. + let label_offset = self.resolve_label_offset(label); + let known = label_offset != UNKNOWN_LABEL_OFFSET; + let in_range = if known { + if label_offset >= offset { + (label_offset - offset) <= kind.max_pos_range() + } else { + (offset - label_offset) <= kind.max_neg_range() + } + } else { + false + }; + + debug!( + " -> label_offset = {}, known = {}, in_range = {} (pos {} neg {})", + label_offset, + known, + in_range, + kind.max_pos_range(), + kind.max_neg_range() + ); + + let start = offset as usize; + let end = (offset + kind.patch_size()) as usize; + if in_range { + debug_assert!(known); // implied by in_range. + let slice = &mut self.data[start..end]; + debug!("patching in-range!"); + kind.patch(slice, offset, label_offset); + } else if !known && !kind.supports_veneer() { + // Nothing for now. Keep it for next round. + new_fixups.push(MachLabelFixup { + label, + offset, + kind, + }); + } else if !in_range && kind.supports_veneer() { + // Allocate space for a veneer in the island. + self.align_to(I::LabelUse::ALIGN); + let veneer_offset = self.cur_offset(); + debug!("making a veneer at {}", veneer_offset); + let slice = &mut self.data[start..end]; + // Patch the original label use to refer to teh veneer. + debug!( + "patching original at offset {} to veneer offset {}", + offset, veneer_offset + ); + kind.patch(slice, offset, veneer_offset); + // Generate the veneer. + let veneer_slice = self.get_appended_space(kind.veneer_size() as usize); + let (veneer_fixup_off, veneer_label_use) = + kind.generate_veneer(veneer_slice, veneer_offset); + debug!( + "generated veneer; fixup offset {}, label_use {:?}", + veneer_fixup_off, veneer_label_use + ); + // If the label is known (but was just out of range), do the + // veneer label-use fixup now too; otherwise, save it for later. + if known { + let start = veneer_fixup_off as usize; + let end = (veneer_fixup_off + veneer_label_use.patch_size()) as usize; + let veneer_slice = &mut self.data[start..end]; + debug!("doing veneer fixup right away too"); + veneer_label_use.patch(veneer_slice, veneer_fixup_off, label_offset); + } else { + new_fixups.push(MachLabelFixup { + label, + offset: veneer_fixup_off, + kind: veneer_label_use, + }); + } + } else { + panic!( + "Cannot support label-use {:?} (known = {}, in-range = {})", + kind, known, in_range + ); + } + } + + self.fixup_records = new_fixups; + self.island_deadline = UNKNOWN_LABEL_OFFSET; + } + + /// Finish any deferred emissions and/or fixups. + pub fn finish(mut self) -> MachBufferFinalized { + // Ensure that all labels are defined. This is a full (release-mode) + // assert because we must avoid looping indefinitely below; an + // unresolved label will prevent the fixup_records vec from emptying. 
+ assert!(self + .label_offsets + .iter() + .all(|&off| off != UNKNOWN_LABEL_OFFSET)); + + while !self.pending_constants.is_empty() || !self.fixup_records.is_empty() { + // `emit_island()` will emit any pending veneers and constants, and + // as a side-effect, will also take care of any fixups with resolved + // labels eagerly. + self.emit_island(); + } + + MachBufferFinalized { + data: self.data, + relocs: self.relocs, + traps: self.traps, + call_sites: self.call_sites, + srclocs: self.srclocs, + } + } + + /// Add an external relocation at the current offset. + pub fn add_reloc( + &mut self, + srcloc: SourceLoc, + kind: Reloc, + name: &ExternalName, + addend: Addend, + ) { + let name = name.clone(); + self.relocs.push(MachReloc { + offset: self.data.len() as CodeOffset, + srcloc, + kind, + name, + addend, + }); + } + + /// Add a trap record at the current offset. + pub fn add_trap(&mut self, srcloc: SourceLoc, code: TrapCode) { + self.traps.push(MachTrap { + offset: self.data.len() as CodeOffset, + srcloc, + code, + }); + } + + /// Add a call-site record at the current offset. + pub fn add_call_site(&mut self, srcloc: SourceLoc, opcode: Opcode) { + self.call_sites.push(MachCallSite { + ret_addr: self.data.len() as CodeOffset, + srcloc, + opcode, + }); + } + + /// Set the `SourceLoc` for code from this offset until the offset at the + /// next call to `end_srcloc()`. + pub fn start_srcloc(&mut self, loc: SourceLoc) { + self.cur_srcloc = Some((self.cur_offset(), loc)); + } + + /// Mark the end of the `SourceLoc` segment started at the last + /// `start_srcloc()` call. + pub fn end_srcloc(&mut self) { + let (start, loc) = self + .cur_srcloc + .take() + .expect("end_srcloc() called without start_srcloc()"); + let end = self.cur_offset(); + // Skip zero-length extends. + debug_assert!(end >= start); + if end > start { + self.srclocs.push(MachSrcLoc { start, end, loc }); + } + } +} + +impl MachBufferFinalized { + /// Get a list of source location mapping tuples in sorted-by-start-offset order. + pub fn get_srclocs_sorted(&self) -> &[MachSrcLoc] { + &self.srclocs[..] + } + + /// Get the total required size for the code. + pub fn total_size(&self) -> CodeOffset { + self.data.len() as CodeOffset + } + + /// Emit this buffer to the given CodeSink. + pub fn emit(&self, sink: &mut CS) { + // N.B.: we emit every section into the .text section as far as + // the `CodeSink` is concerned; we do not bother to segregate + // the contents into the actual program text, the jumptable and the + // rodata (constant pool). This allows us to generate code assuming + // that these will not be relocated relative to each other, and avoids + // having to designate each section as belonging in one of the three + // fixed categories defined by `CodeSink`. If this becomes a problem + // later (e.g. because of memory permissions or similar), we can + // add this designation and segregate the output; take care, however, + // to add the appropriate relocations in this case. 
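// Editorial note (not part of this patch): the loop below is a merge of four already
// sorted streams: the raw bytes plus the reloc, trap, and call-site records. The
// record vectors are in offset order by construction, since `add_reloc`, `add_trap`,
// and `add_call_site` always record the current, monotonically growing, buffer offset.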
+ + let mut next_reloc = 0; + let mut next_trap = 0; + let mut next_call_site = 0; + for (idx, byte) in self.data.iter().enumerate() { + if next_reloc < self.relocs.len() { + let reloc = &self.relocs[next_reloc]; + if reloc.offset == idx as CodeOffset { + sink.reloc_external(reloc.srcloc, reloc.kind, &reloc.name, reloc.addend); + next_reloc += 1; + } + } + if next_trap < self.traps.len() { + let trap = &self.traps[next_trap]; + if trap.offset == idx as CodeOffset { + sink.trap(trap.code, trap.srcloc); + next_trap += 1; + } + } + if next_call_site < self.call_sites.len() { + let call_site = &self.call_sites[next_call_site]; + if call_site.ret_addr == idx as CodeOffset { + sink.add_call_site(call_site.opcode, call_site.srcloc); + next_call_site += 1; + } + } + sink.put1(*byte); + } + + sink.begin_jumptables(); + sink.begin_rodata(); + sink.end_codegen(); + } +} + +/// A constant that is deferred to the next constant-pool opportunity. +struct MachLabelConstant { + /// This label will refer to the constant's offset. + label: MachLabel, + /// Required alignment. + align: CodeOffset, + /// This data will be emitted when able. + data: SmallVec<[u8; 16]>, +} + +/// A fixup to perform on the buffer once code is emitted. Fixups always refer +/// to labels and patch the code based on label offsets. Hence, they are like +/// relocations, but internal to one buffer. +#[derive(Debug)] +struct MachLabelFixup { + /// The label whose offset controls this fixup. + label: MachLabel, + /// The offset to fix up / patch to refer to this label. + offset: CodeOffset, + /// The kind of fixup. This is architecture-specific; each architecture may have, + /// e.g., several types of branch instructions, each with differently-sized + /// offset fields and different places within the instruction to place the + /// bits. + kind: I::LabelUse, +} + +/// A relocation resulting from a compilation. +struct MachReloc { + /// The offset at which the relocation applies, *relative to the + /// containing section*. + offset: CodeOffset, + /// The original source location. + srcloc: SourceLoc, + /// The kind of relocation. + kind: Reloc, + /// The external symbol / name to which this relocation refers. + name: ExternalName, + /// The addend to add to the symbol value. + addend: i64, +} + +/// A trap record resulting from a compilation. +struct MachTrap { + /// The offset at which the trap instruction occurs, *relative to the + /// containing section*. + offset: CodeOffset, + /// The original source location. + srcloc: SourceLoc, + /// The trap code. + code: TrapCode, +} + +/// A call site record resulting from a compilation. +struct MachCallSite { + /// The offset of the call's return address, *relative to the containing section*. + ret_addr: CodeOffset, + /// The original source location. + srcloc: SourceLoc, + /// The call's opcode. + opcode: Opcode, +} + +/// A source-location mapping resulting from a compilation. +#[derive(Clone, Debug)] +pub struct MachSrcLoc { + /// The start of the region of code corresponding to a source location. + /// This is relative to the start of the function, not to the start of the + /// section. + pub start: CodeOffset, + /// The end of the region of code corresponding to a source location. + /// This is relative to the start of the section, not to the start of the + /// section. + pub end: CodeOffset, + /// The source location. + pub loc: SourceLoc, +} + +/// Record of branch instruction in the buffer, to facilitate editing. 
+#[derive(Clone, Debug)] +struct MachBranch { + start: CodeOffset, + end: CodeOffset, + target: MachLabel, + fixup: usize, + inverted: Option>, +} + +impl MachBranch { + fn is_cond(&self) -> bool { + self.inverted.is_some() + } + fn is_uncond(&self) -> bool { + self.inverted.is_none() + } +} + +// We use an actual instruction definition to do tests, so we depend on the `arm64` feature here. +#[cfg(all(test, feature = "arm64"))] +mod test { + use super::*; + use crate::isa::aarch64::inst::xreg; + use crate::isa::aarch64::inst::{BranchTarget, CondBrKind, Inst}; + use crate::machinst::MachInstEmit; + use crate::settings; + use std::default::Default; + + fn label(n: u32) -> MachLabel { + MachLabel::from_block(n) + } + fn target(n: u32) -> BranchTarget { + BranchTarget::Label(label(n)) + } + + #[test] + fn test_elide_jump_to_next() { + let flags = settings::Flags::new(settings::builder()); + let mut buf = MachBuffer::new(); + let mut state = Default::default(); + + buf.reserve_labels_for_blocks(2); + buf.bind_label(label(0)); + let inst = Inst::Jump { dest: target(1) }; + inst.emit(&mut buf, &flags, &mut state); + buf.bind_label(label(1)); + let buf = buf.finish(); + assert_eq!(0, buf.total_size()); + } + + #[test] + fn test_elide_trivial_jump_blocks() { + let flags = settings::Flags::new(settings::builder()); + let mut buf = MachBuffer::new(); + let mut state = Default::default(); + + buf.reserve_labels_for_blocks(4); + + buf.bind_label(label(0)); + let inst = Inst::CondBr { + kind: CondBrKind::NotZero(xreg(0)), + taken: target(1), + not_taken: target(2), + }; + inst.emit(&mut buf, &flags, &mut state); + + buf.bind_label(label(1)); + let inst = Inst::Jump { dest: target(3) }; + inst.emit(&mut buf, &flags, &mut state); + + buf.bind_label(label(2)); + let inst = Inst::Jump { dest: target(3) }; + inst.emit(&mut buf, &flags, &mut state); + + buf.bind_label(label(3)); + + let buf = buf.finish(); + assert_eq!(0, buf.total_size()); + } + + #[test] + fn test_flip_cond() { + let flags = settings::Flags::new(settings::builder()); + let mut buf = MachBuffer::new(); + let mut state = Default::default(); + + buf.reserve_labels_for_blocks(4); + + buf.bind_label(label(0)); + let inst = Inst::CondBr { + kind: CondBrKind::NotZero(xreg(0)), + taken: target(1), + not_taken: target(2), + }; + inst.emit(&mut buf, &flags, &mut state); + + buf.bind_label(label(1)); + let inst = Inst::Nop4; + inst.emit(&mut buf, &flags, &mut state); + + buf.bind_label(label(2)); + let inst = Inst::Nop4; + inst.emit(&mut buf, &flags, &mut state); + + buf.bind_label(label(3)); + + let buf = buf.finish(); + + let mut buf2 = MachBuffer::new(); + let mut state = Default::default(); + let inst = Inst::OneWayCondBr { + kind: CondBrKind::Zero(xreg(0)), + target: BranchTarget::ResolvedOffset(8), + }; + inst.emit(&mut buf2, &flags, &mut state); + let inst = Inst::Nop4; + inst.emit(&mut buf2, &flags, &mut state); + inst.emit(&mut buf2, &flags, &mut state); + + let buf2 = buf2.finish(); + + assert_eq!(buf.data, buf2.data); + } + + #[test] + fn test_island() { + let flags = settings::Flags::new(settings::builder()); + let mut buf = MachBuffer::new(); + let mut state = Default::default(); + + buf.reserve_labels_for_blocks(4); + + buf.bind_label(label(0)); + let inst = Inst::CondBr { + kind: CondBrKind::NotZero(xreg(0)), + taken: target(2), + not_taken: target(3), + }; + inst.emit(&mut buf, &flags, &mut state); + + buf.bind_label(label(1)); + while buf.cur_offset() < 2000000 { + if buf.island_needed(0) { + buf.emit_island(); + } + let inst = 
Inst::Nop4; + inst.emit(&mut buf, &flags, &mut state); + } + + buf.bind_label(label(2)); + let inst = Inst::Nop4; + inst.emit(&mut buf, &flags, &mut state); + + buf.bind_label(label(3)); + let inst = Inst::Nop4; + inst.emit(&mut buf, &flags, &mut state); + + let buf = buf.finish(); + + assert_eq!(2000000 + 8, buf.total_size()); + + let mut buf2 = MachBuffer::new(); + let mut state = Default::default(); + let inst = Inst::CondBr { + kind: CondBrKind::NotZero(xreg(0)), + taken: BranchTarget::ResolvedOffset(1048576 - 4), + not_taken: BranchTarget::ResolvedOffset(2000000 + 4 - 4), + }; + inst.emit(&mut buf2, &flags, &mut state); + + let buf2 = buf2.finish(); + + assert_eq!(&buf.data[0..8], &buf2.data[..]); + } + + #[test] + fn test_island_backward() { + let flags = settings::Flags::new(settings::builder()); + let mut buf = MachBuffer::new(); + let mut state = Default::default(); + + buf.reserve_labels_for_blocks(4); + + buf.bind_label(label(0)); + let inst = Inst::Nop4; + inst.emit(&mut buf, &flags, &mut state); + + buf.bind_label(label(1)); + let inst = Inst::Nop4; + inst.emit(&mut buf, &flags, &mut state); + + buf.bind_label(label(2)); + while buf.cur_offset() < 2000000 { + let inst = Inst::Nop4; + inst.emit(&mut buf, &flags, &mut state); + } + + buf.bind_label(label(3)); + let inst = Inst::CondBr { + kind: CondBrKind::NotZero(xreg(0)), + taken: target(0), + not_taken: target(1), + }; + inst.emit(&mut buf, &flags, &mut state); + + let buf = buf.finish(); + + assert_eq!(2000000 + 12, buf.total_size()); + + let mut buf2 = MachBuffer::new(); + let mut state = Default::default(); + let inst = Inst::CondBr { + kind: CondBrKind::NotZero(xreg(0)), + taken: BranchTarget::ResolvedOffset(8), + not_taken: BranchTarget::ResolvedOffset(4 - (2000000 + 4)), + }; + inst.emit(&mut buf2, &flags, &mut state); + let inst = Inst::Jump { + dest: BranchTarget::ResolvedOffset(-(2000000 + 8)), + }; + inst.emit(&mut buf2, &flags, &mut state); + + let buf2 = buf2.finish(); + + assert_eq!(&buf.data[2000000..], &buf2.data[..]); + } +} diff --git a/cranelift/codegen/src/machinst/compile.rs b/cranelift/codegen/src/machinst/compile.rs index 8f81320fd3..508e242cd7 100644 --- a/cranelift/codegen/src/machinst/compile.rs +++ b/cranelift/codegen/src/machinst/compile.rs @@ -18,8 +18,12 @@ pub fn compile( where B::MInst: ShowWithRRU, { - // This lowers the CL IR. - let mut vcode = Lower::new(f, abi)?.lower(b)?; + // Compute lowered block order. + let block_order = BlockLoweringOrder::new(f); + // Build the lowering context. + let lower = Lower::new(f, abi, block_order)?; + // Lower the IR. + let mut vcode = lower.lower(b)?; debug!( "vcode from lowering: \n{}", @@ -65,11 +69,6 @@ where // all at once. This also inserts prologues/epilogues. vcode.replace_insns_from_regalloc(result); - vcode.remove_redundant_branches(); - - // Do final passes over code to finalize branches. - vcode.finalize_branches(); - debug!( "vcode after regalloc: final version:\n{}", vcode.show_rru(Some(b.reg_universe())) diff --git a/cranelift/codegen/src/machinst/lower.rs b/cranelift/codegen/src/machinst/lower.rs index 47384f462e..fcbf3d2810 100644 --- a/cranelift/codegen/src/machinst/lower.rs +++ b/cranelift/codegen/src/machinst/lower.rs @@ -3,54 +3,97 @@ //! machine code, except for register allocation. 
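The island tests in the buffer tests above exercise deadline tracking: every pending label fixup can only reach a bounded distance forward (see the MachInstLabelUse range methods later in this diff), so before any fixup would go out of range the buffer must emit an island holding veneers and any deferred constants. A hedged standalone sketch of the bookkeeping involved; the field names, the 1 MiB figure for a 19-bit word-scaled branch field, and the exact island_needed signature are assumptions for illustration, not the actual MachBuffer internals:

    /// One pending fixup: where the to-be-patched field lives and how far
    /// forward its encoding can reach.
    struct PendingFixup {
        offset: u32,
        max_pos_range: u32, // e.g. roughly 1 MiB for a 19-bit word-scaled branch field
    }

    /// Earliest offset by which every pending fixup must be resolved or veneered.
    fn deadline(fixups: &[PendingFixup]) -> u32 {
        fixups
            .iter()
            .map(|f| f.offset + f.max_pos_range)
            .min()
            .unwrap_or(u32::MAX)
    }

    /// Is an island needed before emitting `distance` more bytes, leaving room
    /// for one worst-case-sized instruction?
    fn island_needed(cur_offset: u32, distance: u32, worst_case: u32, fixups: &[PendingFixup]) -> bool {
        cur_offset + distance + worst_case > deadline(fixups)
    }

    fn main() {
        let fixups = [PendingFixup { offset: 0, max_pos_range: 1 << 20 }];
        assert!(!island_needed(4, 0, 4, &fixups));
        assert!(island_needed((1 << 20) - 4, 0, 8, &fixups));
    }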
use crate::entity::SecondaryMap; -use crate::inst_predicates::has_side_effect; +use crate::fx::{FxHashMap, FxHashSet}; +use crate::inst_predicates::{has_side_effect_or_load, is_constant_64bit}; use crate::ir::instructions::BranchInfo; use crate::ir::{ ArgumentExtension, Block, ExternalName, Function, GlobalValueData, Inst, InstructionData, MemFlags, Opcode, Signature, SourceLoc, Type, Value, ValueDef, }; -use crate::machinst::{ABIBody, BlockIndex, VCode, VCodeBuilder, VCodeInst}; -use crate::{num_uses::NumUses, CodegenResult}; +use crate::machinst::{ + ABIBody, BlockIndex, BlockLoweringOrder, LoweredBlock, MachLabel, VCode, VCodeBuilder, + VCodeInst, +}; +use crate::CodegenResult; -use regalloc::{Reg, RegClass, Set, VirtualReg, Writable}; +use regalloc::{Reg, RegClass, VirtualReg, Writable}; use alloc::boxed::Box; use alloc::vec::Vec; use log::debug; use smallvec::SmallVec; -use std::collections::VecDeque; -/// A context that machine-specific lowering code can use to emit lowered instructions. This is the -/// view of the machine-independent per-function lowering context that is seen by the machine -/// backend. +/// An "instruction color" partitions instructions by side-effecting ops. All +/// instructions with the same "color" are guaranteed not to be separated by any +/// side-effecting op (for this purpose, loads are also considered +/// side-effecting, to avoid subtle questions w.r.t. the memory model), and +/// furthermore, it is guaranteed that for any two instructions A and B such +/// that color(A) == color(B), either A dominates B and B postdominates A, or +/// vice-versa. (For now, in practice, only ops in the same basic block can ever +/// have the same color, trivially providing the second condition.) Intuitively, +/// this means that the ops of the same color must always execute "together", as +/// part of one atomic contiguous section of the dynamic execution trace, and +/// they can be freely permuted without affecting program behavior. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +pub struct InstColor(u32); +impl InstColor { + fn new(n: u32) -> InstColor { + InstColor(n) + } + + /// Get an arbitrary index representing this color. The index is unique + /// *within a single function compilation*, but indices may be reused across + /// functions. + pub fn get(self) -> u32 { + self.0 + } +} + +/// A context that machine-specific lowering code can use to emit lowered +/// instructions. This is the view of the machine-independent per-function +/// lowering context that is seen by the machine backend. pub trait LowerCtx { /// The instruction type for which this lowering framework is instantiated. - type I; + type I: VCodeInst; + + // Function-level queries: + + /// Get the `ABIBody`. + fn abi(&mut self) -> &dyn ABIBody; + /// Get the (virtual) register that receives the return value. A return + /// instruction should lower into a sequence that fills this register. (Why + /// not allow the backend to specify its own result register for the return? + /// Because there may be multiple return points.) + fn retval(&self, idx: usize) -> Writable; + + // General instruction queries: /// Get the instdata for a given IR instruction. fn data(&self, ir_inst: Inst) -> &InstructionData; /// Get the controlling type for a polymorphic IR instruction. fn ty(&self, ir_inst: Inst) -> Type; - /// Get the `ABIBody`. - fn abi(&mut self) -> &dyn ABIBody; - /// Emit a machine instruction. 
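A standalone illustration of the coloring rule documented on InstColor above (purely illustrative; the real pass is the loop added to Lower::new further down in this file): every instruction receives the current color, and the color is bumped immediately after any side-effecting instruction, so instructions sharing a color are never separated by a side effect and may be merged freely.

    #[derive(Clone, Copy, Debug, PartialEq, Eq)]
    struct Color(u32);

    /// Assign a color to each op; bump the color *after* each side-effecting op.
    fn color_ops(side_effecting: &[bool]) -> Vec<Color> {
        let mut cur = 0;
        let mut out = Vec::with_capacity(side_effecting.len());
        for &se in side_effecting {
            out.push(Color(cur));
            if se {
                cur += 1;
            }
        }
        out
    }

    fn main() {
        // pure, load, pure, pure, store, pure
        let colors = color_ops(&[false, true, false, false, true, false]);
        assert_eq!(
            colors,
            vec![Color(0), Color(0), Color(1), Color(1), Color(1), Color(2)]
        );
        // The two pure ops between the load and the store share color 1, so either
        // can be sunk into a consumer in that region without crossing a side effect.
    }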
- fn emit(&mut self, mach_inst: Self::I); - /// Indicate that an IR instruction has been merged, and so one of its - /// uses is gone (replaced by uses of the instruction's inputs). This - /// helps the lowering algorithm to perform on-the-fly DCE, skipping over - /// unused instructions (such as immediates incorporated directly). - fn merged(&mut self, from_inst: Inst); - /// Get the producing instruction, if any, and output number, for the `idx`th input to the - /// given IR instruction - fn input_inst(&self, ir_inst: Inst, idx: usize) -> Option<(Inst, usize)>; - /// Map a Value to its associated writable (probably virtual) Reg. - fn value_to_writable_reg(&self, val: Value) -> Writable; - /// Map a Value to its associated (probably virtual) Reg. - fn value_to_reg(&self, val: Value) -> Reg; - /// Get the `idx`th input to the given IR instruction as a virtual register. - fn input(&self, ir_inst: Inst, idx: usize) -> Reg; - /// Get the `idx`th output of the given IR instruction as a virtual register. - fn output(&self, ir_inst: Inst, idx: usize) -> Writable; + /// Get the target for a call instruction, as an `ExternalName`. Returns a tuple + /// providing this name and the "relocation distance", i.e., whether the backend + /// can assume the target will be "nearby" (within some small offset) or an + /// arbitrary address. (This comes from the `colocated` bit in the CLIF.) + fn call_target<'b>(&'b self, ir_inst: Inst) -> Option<(&'b ExternalName, RelocDistance)>; + /// Get the signature for a call or call-indirect instruction. + fn call_sig<'b>(&'b self, ir_inst: Inst) -> Option<&'b Signature>; + /// Get the symbol name, relocation distance estimate, and offset for a + /// symbol_value instruction. + fn symbol_value<'b>(&'b self, ir_inst: Inst) -> Option<(&'b ExternalName, RelocDistance, i64)>; + /// Returns the memory flags of a given memory access. + fn memflags(&self, ir_inst: Inst) -> Option; + /// Get the source location for a given instruction. + fn srcloc(&self, ir_inst: Inst) -> SourceLoc; + /// Get the side-effect color of the given instruction (specifically, at the + /// program point just prior to the instruction). The "color" changes at + /// every side-effecting op; the backend should not try to merge across + /// side-effect colors unless the op being merged is known to be pure. + fn inst_color(&self, ir_inst: Inst) -> InstColor; + + // Instruction input/output queries: + /// Get the number of inputs to the given IR instruction. fn num_inputs(&self, ir_inst: Inst) -> usize; /// Get the number of outputs to the given IR instruction. @@ -59,27 +102,60 @@ pub trait LowerCtx { fn input_ty(&self, ir_inst: Inst, idx: usize) -> Type; /// Get the type for an instruction's output. fn output_ty(&self, ir_inst: Inst, idx: usize) -> Type; + /// Get the value of a constant instruction (`iconst`, etc.) as a 64-bit + /// value, if possible. + fn get_constant(&self, ir_inst: Inst) -> Option; + /// Get the input in any combination of three forms: + /// + /// - An instruction, if the same color as this instruction or if the + /// producing instruction has no side effects (thus in both cases + /// mergeable); + /// - A constant, if the value is a constant; + /// - A register. + /// + /// The instruction input may be available in some or all of these + /// forms. More than one is possible: e.g., it may be produced by an + /// instruction in the same block, but may also have been forced into a + /// register already by an earlier op. It will *always* be available + /// in a register, at least. 
+ /// + /// If the backend uses the register, rather than one of the other + /// forms (constant or merging of the producing op), it must call + /// `use_input_reg()` to ensure the producing inst is actually lowered + /// as well. + fn get_input(&self, ir_inst: Inst, idx: usize) -> LowerInput; + /// Get the `idx`th output register of the given IR instruction. When + /// `backend.lower_inst_to_regs(ctx, inst)` is called, it is expected that + /// the backend will write results to these output register(s). + fn get_output(&mut self, ir_inst: Inst, idx: usize) -> Writable; + + // Codegen primitives: allocate temps, emit instructions, set result registers, + // ask for an input to be gen'd into a register. + /// Get a new temp. fn tmp(&mut self, rc: RegClass, ty: Type) -> Writable; - /// Get the number of block params. - fn num_bb_params(&self, bb: Block) -> usize; - /// Get the register for a block param. - fn bb_param(&self, bb: Block, idx: usize) -> Reg; - /// Get the register for a return value. - fn retval(&self, idx: usize) -> Writable; - /// Get the target for a call instruction, as an `ExternalName`. Returns a tuple - /// providing this name and the "relocation distance", i.e., whether the backend - /// can assume the target will be "nearby" (within some small offset) or an - /// arbitrary address. (This comes from the `colocated` bit in the CLIF.) - fn call_target<'b>(&'b self, ir_inst: Inst) -> Option<(&'b ExternalName, RelocDistance)>; - /// Get the signature for a call or call-indirect instruction. - fn call_sig<'b>(&'b self, ir_inst: Inst) -> Option<&'b Signature>; - /// Get the symbol name, relocation distance estimate, and offset for a symbol_value instruction. - fn symbol_value<'b>(&'b self, ir_inst: Inst) -> Option<(&'b ExternalName, RelocDistance, i64)>; - /// Returns the memory flags of a given memory access. - fn memflags(&self, ir_inst: Inst) -> Option; - /// Get the source location for a given instruction. - fn srcloc(&self, ir_inst: Inst) -> SourceLoc; + /// Emit a machine instruction. + fn emit(&mut self, mach_inst: Self::I); + /// Indicate that the given input uses the register returned by + /// `get_input()`. Codegen may not happen otherwise for the producing + /// instruction if it has no side effects and no uses. + fn use_input_reg(&mut self, input: LowerInput); +} + +/// A representation of all of the ways in which an instruction input is +/// available: as a producing instruction (in the same color-partition), as a +/// constant, and/or in an existing register. See [LowerCtx::get_input] for more +/// details. +#[derive(Clone, Copy, Debug)] +pub struct LowerInput { + /// The value is live in a register. This option is always available. Call + /// [LowerCtx::use_input_reg()] if the register is used. + pub reg: Reg, + /// An instruction produces this value; the instruction's result index that + /// produces this value is given. + pub inst: Option<(Inst, usize)>, + /// The value is a known constant. + pub constant: Option, } /// A machine backend. @@ -87,20 +163,31 @@ pub trait LowerBackend { /// The machine instruction type. type MInst: VCodeInst; - /// Lower a single instruction. Instructions are lowered in reverse order. - /// This function need not handle branches; those are always passed to - /// `lower_branch_group` below. - fn lower>(&self, ctx: &mut C, inst: Inst); + /// Lower a single instruction. + /// + /// For a branch, this function should not generate the actual branch + /// instruction. 
However, it must force any values it needs for the branch + /// edge (block-param actuals) into registers, because the actual branch + /// generation (`lower_branch_group()`) happens *after* any possible merged + /// out-edge. + fn lower>(&self, ctx: &mut C, inst: Inst) -> CodegenResult<()>; - /// Lower a block-terminating group of branches (which together can be seen as one - /// N-way branch), given a vcode BlockIndex for each target. + /// Lower a block-terminating group of branches (which together can be seen + /// as one N-way branch), given a vcode MachLabel for each target. fn lower_branch_group>( &self, ctx: &mut C, insts: &[Inst], - targets: &[BlockIndex], - fallthrough: Option, - ); + targets: &[MachLabel], + fallthrough: Option, + ) -> CodegenResult<()>; + + /// A bit of a hack: give a fixed register that always holds the result of a + /// `get_pinned_reg` instruction, if known. This allows elision of moves + /// into the associated vreg, instead using the real reg directly. + fn maybe_pinned_reg(&self) -> Option { + None + } } /// Machine-independent lowering driver / machine-instruction container. Maintains a correspondence @@ -112,17 +199,42 @@ pub struct Lower<'func, I: VCodeInst> { /// Lowered machine instructions. vcode: VCodeBuilder, - /// Number of active uses (minus `dec_use()` calls by backend) of each instruction. - num_uses: SecondaryMap, - /// Mapping from `Value` (SSA value in IR) to virtual register. value_regs: SecondaryMap, /// Return-value vregs. retval_regs: Vec<(Reg, ArgumentExtension)>, + /// Instruction colors. + inst_colors: SecondaryMap, + + /// Instruction constant values, if known. + inst_constants: FxHashMap, + + /// Instruction has a side-effect and must be codegen'd. + inst_needed: SecondaryMap, + + /// Value (vreg) is needed and producer must be codegen'd. + vreg_needed: Vec, + /// Next virtual register number to allocate. next_vreg: u32, + + /// Insts in reverse block order, before final copy to vcode. + block_insts: Vec<(SourceLoc, I)>, + + /// Ranges in `block_insts` constituting BBs. + block_ranges: Vec<(usize, usize)>, + + /// Instructions collected for the BB in progress, in reverse order, with + /// source-locs attached. + bb_insts: Vec<(SourceLoc, I)>, + + /// Instructions collected for the CLIF inst in progress, in forward order. + ir_insts: Vec, + + /// The register to use for GetPinnedReg, if any, on this architecture. + pinned_reg: Option, } /// Notion of "relocation distance". This gives an estimate of how far away a symbol will be from a @@ -143,7 +255,7 @@ fn alloc_vreg( value: Value, next_vreg: &mut u32, ) -> VirtualReg { - if value_regs[value].get_index() == 0 { + if value_regs[value].is_invalid() { // default value in map. let v = *next_vreg; *next_vreg += 1; @@ -159,41 +271,35 @@ enum GenerateReturn { impl<'func, I: VCodeInst> Lower<'func, I> { /// Prepare a new lowering context for the given IR function. - pub fn new(f: &'func Function, abi: Box>) -> CodegenResult> { - let mut vcode = VCodeBuilder::new(abi); + pub fn new( + f: &'func Function, + abi: Box>, + block_order: BlockLoweringOrder, + ) -> CodegenResult> { + let mut vcode = VCodeBuilder::new(abi, block_order); - let num_uses = NumUses::compute(f).take_uses(); + let mut next_vreg: u32 = 0; - let mut next_vreg: u32 = 1; + let mut value_regs = SecondaryMap::with_default(Reg::invalid()); - // Default register should never be seen, but the `value_regs` map needs a default and we - // don't want to push `Option` everywhere. 
All values will be assigned registers by the - // loops over block parameters and instruction results below. - // - // We do not use vreg 0 so that we can detect any unassigned register that leaks through. - let default_register = Reg::new_virtual(RegClass::I32, 0); - let mut value_regs = SecondaryMap::with_default(default_register); - - // Assign a vreg to each value. + // Assign a vreg to each block param and each inst result. for bb in f.layout.blocks() { - for param in f.dfg.block_params(bb) { - let vreg = alloc_vreg( - &mut value_regs, - I::rc_for_type(f.dfg.value_type(*param))?, - *param, - &mut next_vreg, - ); - vcode.set_vreg_type(vreg, f.dfg.value_type(*param)); + for ¶m in f.dfg.block_params(bb) { + let ty = f.dfg.value_type(param); + let vreg = alloc_vreg(&mut value_regs, I::rc_for_type(ty)?, param, &mut next_vreg); + vcode.set_vreg_type(vreg, ty); + debug!("bb {} param {}: vreg {:?}", bb, param, vreg); } for inst in f.layout.block_insts(bb) { - for result in f.dfg.inst_results(inst) { - let vreg = alloc_vreg( - &mut value_regs, - I::rc_for_type(f.dfg.value_type(*result))?, - *result, - &mut next_vreg, + for &result in f.dfg.inst_results(inst) { + let ty = f.dfg.value_type(result); + let vreg = + alloc_vreg(&mut value_regs, I::rc_for_type(ty)?, result, &mut next_vreg); + vcode.set_vreg_type(vreg, ty); + debug!( + "bb {} inst {} ({:?}): result vreg {:?}", + bb, inst, f.dfg[inst], vreg ); - vcode.set_vreg_type(vreg, f.dfg.value_type(*result)); } } } @@ -209,13 +315,51 @@ impl<'func, I: VCodeInst> Lower<'func, I> { vcode.set_vreg_type(vreg.as_virtual_reg().unwrap(), ret.value_type); } + // Compute instruction colors, find constant instructions, and find instructions with + // side-effects, in one combined pass. + let mut cur_color = 0; + let mut inst_colors = SecondaryMap::with_default(InstColor::new(0)); + let mut inst_constants = FxHashMap::default(); + let mut inst_needed = SecondaryMap::with_default(false); + for bb in f.layout.blocks() { + cur_color += 1; + for inst in f.layout.block_insts(bb) { + let side_effect = has_side_effect_or_load(f, inst); + + // Assign colors. A new color is chosen *after* any side-effecting instruction. + inst_colors[inst] = InstColor::new(cur_color); + debug!("bb {} inst {} has color {}", bb, inst, cur_color); + if side_effect { + debug!(" -> side-effecting"); + inst_needed[inst] = true; + cur_color += 1; + } + + // Determine if this is a constant; if so, add to the table. 
+ if let Some(c) = is_constant_64bit(f, inst) { + debug!(" -> constant: {}", c); + inst_constants.insert(inst, c); + } + } + } + + let vreg_needed = std::iter::repeat(false).take(next_vreg as usize).collect(); + Ok(Lower { f, vcode, - num_uses, value_regs, retval_regs, + inst_colors, + inst_constants, + inst_needed, + vreg_needed, next_vreg, + block_insts: vec![], + block_ranges: vec![], + bb_insts: vec![], + ir_insts: vec![], + pinned_reg: None, }) } @@ -229,452 +373,427 @@ impl<'func, I: VCodeInst> Lower<'func, I> { for (i, param) in self.f.dfg.block_params(entry_bb).iter().enumerate() { let reg = Writable::from_reg(self.value_regs[*param]); let insn = self.vcode.abi().gen_copy_arg_to_reg(i, reg); - self.vcode.push(insn); + self.emit(insn); } } } fn gen_retval_setup(&mut self, gen_ret_inst: GenerateReturn) { - for (i, (reg, ext)) in self.retval_regs.iter().enumerate() { - let reg = Writable::from_reg(*reg); - let insns = self.vcode.abi().gen_copy_reg_to_retval(i, reg, *ext); + let retval_regs = self.retval_regs.clone(); + for (i, (reg, ext)) in retval_regs.into_iter().enumerate() { + let reg = Writable::from_reg(reg); + let insns = self.vcode.abi().gen_copy_reg_to_retval(i, reg, ext); for insn in insns { - self.vcode.push(insn); + self.emit(insn); } } let inst = match gen_ret_inst { GenerateReturn::Yes => self.vcode.abi().gen_ret(), GenerateReturn::No => self.vcode.abi().gen_epilogue_placeholder(), }; - self.vcode.push(inst); + self.emit(inst); } - fn find_reachable_bbs(&self) -> SmallVec<[Block; 16]> { - if let Some(entry) = self.f.layout.entry_block() { - let mut ret = SmallVec::new(); - let mut queue = VecDeque::new(); - let mut visited = SecondaryMap::with_default(false); - queue.push_back(entry); - visited[entry] = true; - while !queue.is_empty() { - let b = queue.pop_front().unwrap(); - ret.push(b); - let mut succs: SmallVec<[Block; 16]> = SmallVec::new(); - for inst in self.f.layout.block_likely_branches(b) { - if self.f.dfg[inst].opcode().is_branch() { - visit_branch_targets(self.f, b, inst, |succ| { - succs.push(succ); - }); - } + fn lower_edge(&mut self, pred: Block, inst: Inst, succ: Block) -> CodegenResult<()> { + debug!("lower_edge: pred {} succ {}", pred, succ); + + let mut src_regs: SmallVec<[Option; 16]> = SmallVec::new(); + let mut src_consts: SmallVec<[Option; 16]> = SmallVec::new(); + let mut dst_regs: SmallVec<[Writable; 16]> = SmallVec::new(); + + fn overlap(a: &[Option], b: &[Writable]) -> bool { + let mut set = FxHashSet::default(); + for &maybe_reg in a { + if let Some(r) = maybe_reg { + set.insert(r); } - for succ in succs.into_iter() { - if !visited[succ] { - queue.push_back(succ); - visited[succ] = true; + } + for ® in b { + if set.contains(®.to_reg()) { + return true; + } + } + false + } + + // Create a temporary for each block parameter. + let phi_classes: SmallVec<[Type; 16]> = self + .f + .dfg + .block_params(succ) + .iter() + .map(|p| self.f.dfg.value_type(*p)) + .collect(); + + // Create all of the phi uses (reads) from jump args to temps. 
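The surrounding lower_edge body implements a parallel copy for block-parameter moves: conceptually all jump-argument sources are read at once, so if any source register is also a destination, copying in place could clobber a source before it is read. The code checks for overlap and, only when needed, routes every value through a temporary. A standalone sketch of that decision, with plain names standing in for registers and the constant case omitted (illustrative only):

    use std::collections::{HashMap, HashSet};

    /// Does any source register also appear as a destination?
    fn overlap(srcs: &[&str], dsts: &[&str]) -> bool {
        let set: HashSet<&str> = srcs.iter().copied().collect();
        dsts.iter().any(|d| set.contains(d))
    }

    /// Parallel copy dst[i] <- src[i]: copy directly when safe, otherwise read
    /// every source into a temporary first (the same two-pass strategy as here).
    fn parallel_copy(regs: &mut HashMap<String, i64>, srcs: &[&str], dsts: &[&str]) {
        if !overlap(srcs, dsts) {
            for (s, d) in srcs.iter().zip(dsts) {
                let v = regs[*s];
                regs.insert(d.to_string(), v);
            }
        } else {
            let tmps: Vec<i64> = srcs.iter().map(|s| regs[*s]).collect();
            for (v, d) in tmps.into_iter().zip(dsts) {
                regs.insert(d.to_string(), v);
            }
        }
    }

    fn main() {
        let mut regs: HashMap<String, i64> =
            [("v0", 1), ("v1", 2)].iter().map(|&(k, v)| (k.to_string(), v)).collect();
        // A swap-like edge (v0, v1) -> (v1, v0): a direct copy would clobber v1.
        parallel_copy(&mut regs, &["v0", "v1"], &["v1", "v0"]);
        assert_eq!((regs["v0"], regs["v1"]), (2, 1));
    }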
+ // Round up all the source and destination regs + for (i, arg) in self.f.dfg.inst_variable_args(inst).iter().enumerate() { + let arg = self.f.dfg.resolve_aliases(*arg); + let input = self.get_input_for_val(inst, arg); + debug!("jump arg {} is {}, reg {:?}", i, arg, input.reg); + if let Some(c) = input.constant { + src_consts.push(Some(c)); + src_regs.push(None); + } else { + self.use_input_reg(input); + src_regs.push(Some(input.reg)); + src_consts.push(None); + } + } + for (i, param) in self.f.dfg.block_params(succ).iter().enumerate() { + debug!("bb arg {} is {}", i, param); + dst_regs.push(Writable::from_reg(self.value_regs[*param])); + } + debug_assert!(src_regs.len() == dst_regs.len()); + debug_assert!(src_consts.len() == dst_regs.len()); + debug_assert!(phi_classes.len() == dst_regs.len()); + debug!( + "src_regs = {:?} src_consts = {:?} dst_regs = {:?}", + src_regs, src_consts, dst_regs + ); + + // If, as is mostly the case, the source and destination register + // sets are non overlapping, then we can copy directly, so as to + // save the register allocator work. + if !overlap(&src_regs[..], &dst_regs[..]) { + for i in 0..dst_regs.len() { + let src_reg = src_regs[i]; + let src_const = src_consts[i]; + let dst_reg = dst_regs[i]; + let ty = phi_classes[i]; + if let Some(r) = src_reg { + self.emit(I::gen_move(dst_reg, r, ty)); + } else { + // Generate constants fresh in phi-edge to avoid long + // live-ranges. Note that these are also excluded from the + // overlap check, which increases the chance that we don't + // have to do a two-stage copy. + for inst in I::gen_constant(dst_reg, src_const.unwrap(), ty).into_iter() { + self.emit(inst); } } } - - ret } else { - SmallVec::new() + // There's some overlap, so play safe and copy via temps. + let mut tmp_regs: SmallVec<[Writable; 16]> = SmallVec::new(); + for &ty in &phi_classes { + tmp_regs.push(self.tmp(I::rc_for_type(ty)?, ty)); + } + + debug!("phi_temps = {:?}", tmp_regs); + debug_assert!(tmp_regs.len() == src_regs.len()); + + for i in 0..dst_regs.len() { + let src_reg = src_regs[i]; + let tmp_reg = tmp_regs[i]; + let ty = phi_classes[i]; + let src_const = src_consts[i]; + if let Some(src_reg) = src_reg { + self.emit(I::gen_move(tmp_reg, src_reg, ty)); + } else { + for inst in I::gen_constant(tmp_reg, src_const.unwrap(), ty).into_iter() { + self.emit(inst); + } + } + } + for i in 0..dst_regs.len() { + let tmp_reg = tmp_regs[i].to_reg(); + let dst_reg = dst_regs[i]; + let ty = phi_classes[i]; + self.emit(I::gen_move(dst_reg, tmp_reg, ty)); + } + } + Ok(()) + } + + fn lower_clif_block>( + &mut self, + backend: &B, + block: Block, + ) -> CodegenResult<()> { + // Lowering loop: + // - For each non-branch instruction, in reverse order: + // - If side-effecting (load, store, branch/call/return, possible trap), or if + // used outside of this block, or if demanded by another inst, then lower. + // + // That's it! Lowering of side-effecting ops will force all *needed* + // (live) non-side-effecting ops to be lowered at the right places, via + // the `use_input_reg()` callback on the `LowerCtx` (that's us). That's + // because `use_input_reg()` sets the eager/demand bit for any insts + // whose result registers are used. + // + // We build up the BB in reverse instruction order in `bb_insts`. + // Because the machine backend calls `ctx.emit()` in forward order, we + // collect per-IR-inst lowered instructions in `ir_insts`, then reverse + // these and append to `bb_insts` as we go backward through the block. 
+ // `bb_insts` are then reversed again and appended to the VCode at the + // end of the BB (in the toplevel driver `lower()`). + for inst in self.f.layout.block_insts(block).rev() { + let data = &self.f.dfg[inst]; + let value_needed = self + .f + .dfg + .inst_results(inst) + .iter() + .any(|&result| self.vreg_needed[self.value_regs[result].get_index()]); + debug!( + "lower_clif_block: block {} inst {} ({:?}) is_branch {} inst_needed {} value_needed {}", + block, + inst, + data, + data.opcode().is_branch(), + self.inst_needed[inst], + value_needed, + ); + if self.f.dfg[inst].opcode().is_branch() { + continue; + } + // Normal instruction: codegen if eager bit is set. (Other instructions may also be + // codegened if not eager when they are used by another instruction.) + if self.inst_needed[inst] || value_needed { + debug!("lowering: inst {}: {:?}", inst, self.f.dfg[inst]); + backend.lower(self, inst)?; + } + if data.opcode().is_return() { + // Return: handle specially, using ABI-appropriate sequence. + let gen_ret = if data.opcode() == Opcode::Return { + GenerateReturn::Yes + } else { + debug_assert!(data.opcode() == Opcode::FallthroughReturn); + GenerateReturn::No + }; + self.gen_retval_setup(gen_ret); + } + + let loc = self.srcloc(inst); + self.finish_ir_inst(loc); + } + Ok(()) + } + + fn finish_ir_inst(&mut self, loc: SourceLoc) { + for inst in self.ir_insts.drain(..).rev() { + self.bb_insts.push((loc, inst)); + } + } + + fn finish_bb(&mut self) { + let start = self.block_insts.len(); + for pair in self.bb_insts.drain(..).rev() { + self.block_insts.push(pair); + } + let end = self.block_insts.len(); + self.block_ranges.push((start, end)); + } + + fn copy_bbs_to_vcode(&mut self) { + for &(start, end) in self.block_ranges.iter().rev() { + for &(loc, ref inst) in &self.block_insts[start..end] { + self.vcode.set_srcloc(loc); + self.vcode.push(inst.clone()); + } + self.vcode.end_bb(); + } + } + + fn lower_clif_branches>( + &mut self, + backend: &B, + block: Block, + branches: &SmallVec<[Inst; 2]>, + targets: &SmallVec<[MachLabel; 2]>, + maybe_fallthrough: Option, + ) -> CodegenResult<()> { + debug!( + "lower_clif_branches: block {} branches {:?} targets {:?} maybe_fallthrough {:?}", + block, branches, targets, maybe_fallthrough + ); + backend.lower_branch_group(self, branches, targets, maybe_fallthrough)?; + let loc = self.srcloc(branches[0]); + self.finish_ir_inst(loc); + Ok(()) + } + + fn collect_branches_and_targets( + &self, + bindex: BlockIndex, + _bb: Block, + branches: &mut SmallVec<[Inst; 2]>, + targets: &mut SmallVec<[MachLabel; 2]>, + ) { + branches.clear(); + targets.clear(); + let mut last_inst = None; + for &(inst, succ) in self.vcode.block_order().succ_indices(bindex) { + // Avoid duplicates: this ensures a br_table is only inserted once. + if last_inst != Some(inst) { + branches.push(inst); + } else { + debug_assert!(self.f.dfg[inst].opcode() == Opcode::BrTable); + debug_assert!(branches.len() == 1); + } + last_inst = Some(inst); + targets.push(MachLabel::from_block(succ)); } } /// Lower the function. pub fn lower>(mut self, backend: &B) -> CodegenResult> { - // Find all reachable blocks. - let bbs = self.find_reachable_bbs(); - - // This records a Block-to-BlockIndex map so that branch targets can be resolved. - let mut next_bindex = self.vcode.init_bb_map(&bbs[..]); - - // Allocate a separate BlockIndex for each control-flow instruction so that we can create - // the edge blocks later. 
Each entry for a control-flow inst is the edge block; the list - // has (control flow inst, edge block, orig block) tuples. - // - // In general, a given inst may have only one target, except for jump tables which have - // more. But SmallVec may store inline more than the spec'd number, so ask for slightly - // more. - let mut edge_blocks_by_inst: SecondaryMap> = - SecondaryMap::with_default(SmallVec::new()); - - // Each basic block may at most have two edge blocks, since it may have a most two branch - // instructions. If we omit jump tables, we can model that 50% of branches are direct jumps - // (1 successor), and 50% are tests (2 successors). A distribution of edge_blocks per block - // matches this rough estimate that there are 1.5 edge block per block. - let mut edge_blocks: Vec<(Inst, BlockIndex, Block)> = Vec::with_capacity(bbs.len() * 3 / 2); - debug!("about to lower function: {:?}", self.f); - debug!("bb map: {:?}", self.vcode.blocks_by_bb()); - // Work backward (reverse block order, reverse through each block), skipping insns with zero - // uses. - for bb in bbs.iter().rev() { - for inst in self.f.layout.block_likely_branches(*bb) { - if self.f.dfg[inst].opcode().is_branch() { - // Find the original target. - let mut add_succ = |next_bb| { - let edge_block = next_bindex; - next_bindex += 1; - edge_blocks_by_inst[inst].push(edge_block); - edge_blocks.push((inst, edge_block, next_bb)); - }; - visit_branch_targets(self.f, *bb, inst, |succ| { - add_succ(succ); - }); - } - } - } + // Get the pinned reg here (we only parameterize this function on `B`, + // not the whole `Lower` impl). + self.pinned_reg = backend.maybe_pinned_reg(); - // Temporary vectors whose memory is reused in the loop below. + self.vcode.set_entry(0); + + // Reused vectors for branch lowering. let mut branches: SmallVec<[Inst; 2]> = SmallVec::new(); - let mut targets: SmallVec<[BlockIndex; 2]> = SmallVec::new(); + let mut targets: SmallVec<[MachLabel; 2]> = SmallVec::new(); - for bb in bbs.iter() { - debug!("lowering bb: {}", bb); + // get a copy of the lowered order; we hold this separately because we + // need a mut ref to the vcode to mutate it below. + let lowered_order: SmallVec<[LoweredBlock; 64]> = self + .vcode + .block_order() + .lowered_order() + .iter() + .cloned() + .collect(); - // If this is a return block, produce the return value setup. N.B.: this comes - // *before* the below because it must occur *after* any other instructions, and - // instructions are lowered in reverse order. - let last_insn = self.f.layout.block_insts(*bb).last().unwrap(); - let last_insn_opcode = self.f.dfg[last_insn].opcode(); - if last_insn_opcode.is_return() { - let gen_ret = if last_insn_opcode == Opcode::Return { - GenerateReturn::Yes - } else { - debug_assert!(last_insn_opcode == Opcode::FallthroughReturn); - self.vcode.set_fallthrough_return_block(*bb); - GenerateReturn::No - }; - self.gen_retval_setup(gen_ret); - self.vcode.end_ir_inst(); - } + // Main lowering loop over lowered blocks. + for (bindex, lb) in lowered_order.iter().enumerate().rev() { + let bindex = bindex as BlockIndex; - // Find the branches at the end first, and process those, if any. - for inst in self.f.layout.block_insts(*bb).rev() { - debug!("lower: inst {}", inst); - if edge_blocks_by_inst[inst].len() > 0 { - branches.push(inst); - for target in edge_blocks_by_inst[inst].iter().rev().cloned() { - targets.push(target); - } - } else { - // We've reached the end of the branches -- process all as a group, first. 
- if branches.len() > 0 { - let fallthrough = self.f.layout.next_block(*bb); - let fallthrough = fallthrough.map(|bb| self.vcode.bb_to_bindex(bb)); - branches.reverse(); - targets.reverse(); - debug!( - "lower_branch_group: targets = {:?} branches = {:?}", - targets, branches - ); - self.vcode.set_srcloc(self.srcloc(branches[0])); - backend.lower_branch_group( - &mut self, - &branches[..], - &targets[..], - fallthrough, - ); - self.vcode.end_ir_inst(); - branches.clear(); - targets.clear(); - } + // Lower the block body in reverse order (see comment in + // `lower_clif_block()` for rationale). - // Only codegen an instruction if it either has a side effect, or has at least - // one use of one of its results. - if self.num_uses[inst] > 0 || has_side_effect(self.f, inst) { - self.vcode.set_srcloc(self.srcloc(inst)); - backend.lower(&mut self, inst); - self.vcode.end_ir_inst(); + // End branches. + if let Some(bb) = lb.orig_block() { + self.collect_branches_and_targets(bindex, bb, &mut branches, &mut targets); + if branches.len() > 0 { + let maybe_fallthrough = if (bindex + 1) < (lowered_order.len() as BlockIndex) { + Some(MachLabel::from_block(bindex + 1)) } else { - // If we're skipping the instruction, we need to dec-ref its arguments. - for arg in self.f.dfg.inst_args(inst) { - let val = self.f.dfg.resolve_aliases(*arg); - match self.f.dfg.value_def(val) { - ValueDef::Result(src_inst, _) => { - self.dec_use(src_inst); - } - _ => {} - } - } - } - } - } - - // There are possibly some branches left if the block contained only branches. - if branches.len() > 0 { - let fallthrough = self.f.layout.next_block(*bb); - let fallthrough = fallthrough.map(|bb| self.vcode.bb_to_bindex(bb)); - branches.reverse(); - targets.reverse(); - debug!( - "lower_branch_group: targets = {:?} branches = {:?}", - targets, branches - ); - self.vcode.set_srcloc(self.srcloc(branches[0])); - backend.lower_branch_group(&mut self, &branches[..], &targets[..], fallthrough); - self.vcode.end_ir_inst(); - branches.clear(); - targets.clear(); - } - - // If this is the entry block, produce the argument setup. - if Some(*bb) == self.f.layout.entry_block() { - self.gen_arg_setup(); - self.vcode.end_ir_inst(); - } - - let vcode_bb = self.vcode.end_bb(); - debug!("finished building bb: BlockIndex {}", vcode_bb); - debug!("bb_to_bindex map says: {}", self.vcode.bb_to_bindex(*bb)); - assert!(vcode_bb == self.vcode.bb_to_bindex(*bb)); - if Some(*bb) == self.f.layout.entry_block() { - self.vcode.set_entry(vcode_bb); - } - } - - // Temporary vectors whose memory is reused in the loop below. - // TODO accomodate changes in regalloc.rs to use small vecs here? - let mut src_regs = Vec::new(); - let mut dst_regs = Vec::new(); - - // Now create the edge blocks, with phi lowering (block parameter copies). - for (inst, edge_block, orig_block) in edge_blocks.into_iter() { - debug!( - "creating edge block: inst {}, edge_block {}, orig_block {}", - inst, edge_block, orig_block - ); - - // Create a temporary for each block parameter. - let phi_classes: Vec = self - .f - .dfg - .block_params(orig_block) - .iter() - .map(|p| self.f.dfg.value_type(*p)) - .collect(); - - // Create all of the phi uses (reads) from jump args to temps. 
- // Round up all the source and destination regs - src_regs.clear(); - dst_regs.clear(); - for (i, arg) in self.f.dfg.inst_variable_args(inst).iter().enumerate() { - let arg = self.f.dfg.resolve_aliases(*arg); - debug!("jump arg {} is {}", i, arg); - src_regs.push(self.value_regs[arg]); - } - for (i, param) in self.f.dfg.block_params(orig_block).iter().enumerate() { - debug!("bb arg {} is {}", i, param); - dst_regs.push(Writable::from_reg(self.value_regs[*param])); - } - debug_assert!(src_regs.len() == dst_regs.len()); - debug_assert!(phi_classes.len() == dst_regs.len()); - - // If, as is mostly the case, the source and destination register - // sets are non overlapping, then we can copy directly, so as to - // save the register allocator work. - if !Set::::from_vec(src_regs.clone()).intersects(&Set::::from_vec( - dst_regs.iter().map(|r| r.to_reg()).collect(), - )) { - for (dst_reg, (src_reg, ty)) in - dst_regs.iter().zip(src_regs.iter().zip(phi_classes)) - { - self.vcode.push(I::gen_move(*dst_reg, *src_reg, ty)); + None + }; + self.lower_clif_branches(backend, bb, &branches, &targets, maybe_fallthrough)?; + self.finish_ir_inst(self.srcloc(branches[0])); } } else { - // There's some overlap, so play safe and copy via temps. - let mut tmp_regs = Vec::with_capacity(phi_classes.len()); - for &ty in &phi_classes { - tmp_regs.push(self.tmp(I::rc_for_type(ty)?, ty)); - } - - debug!("phi_temps = {:?}", tmp_regs); - debug_assert!(tmp_regs.len() == src_regs.len()); - - for (tmp_reg, (src_reg, &ty)) in - tmp_regs.iter().zip(src_regs.iter().zip(phi_classes.iter())) - { - self.vcode.push(I::gen_move(*tmp_reg, *src_reg, ty)); - } - for (dst_reg, (tmp_reg, &ty)) in - dst_regs.iter().zip(tmp_regs.iter().zip(phi_classes.iter())) - { - self.vcode.push(I::gen_move(*dst_reg, tmp_reg.to_reg(), ty)); - } + // If no orig block, this must be a pure edge block; get the successor and + // emit a jump. + let (_, succ) = self.vcode.block_order().succ_indices(bindex)[0]; + self.emit(I::gen_jump(MachLabel::from_block(succ))); + self.finish_ir_inst(SourceLoc::default()); } - // Create the unconditional jump to the original target block. - self.vcode - .push(I::gen_jump(self.vcode.bb_to_bindex(orig_block))); + // Out-edge phi moves. + if let Some((pred, inst, succ)) = lb.out_edge() { + self.lower_edge(pred, inst, succ)?; + self.finish_ir_inst(SourceLoc::default()); + } + // Original block body. + if let Some(bb) = lb.orig_block() { + self.lower_clif_block(backend, bb)?; + } + // In-edge phi moves. + if let Some((pred, inst, succ)) = lb.in_edge() { + self.lower_edge(pred, inst, succ)?; + self.finish_ir_inst(SourceLoc::default()); + } - // End the IR inst and block. (We lower this as if it were one IR instruction so that - // we can emit machine instructions in forward order.) - self.vcode.end_ir_inst(); - let blocknum = self.vcode.end_bb(); - assert!(blocknum == edge_block); + if bindex == 0 { + // Set up the function with arg vreg inits. + self.gen_arg_setup(); + self.finish_ir_inst(SourceLoc::default()); + } + + self.finish_bb(); } + self.copy_bbs_to_vcode(); + // Now that we've emitted all instructions into the VCodeBuilder, let's build the VCode. - Ok(self.vcode.build()) + let vcode = self.vcode.build(); + debug!("built vcode: {:?}", vcode); + + Ok(vcode) } - /// Reduce the use-count of an IR instruction. Use this when, e.g., isel incorporates the - /// computation of an input instruction directly, so that input instruction has one - /// fewer use. 
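The removed dec_use/inc_use reference counting below is superseded by the demand bits added above (inst_needed and vreg_needed): walking each block backward, an instruction is lowered only if it is side-effecting or one of its result vregs has been demanded, and the use_input_reg callback is what sets the demand bit for a producer whose register the backend actually uses. A standalone sketch of that backward, demand-driven selection on a toy IR (in this sketch every input of a lowered op is demanded; the real scheme only demands inputs the backend does not merge or fold):

    /// A toy instruction: the values it reads, the value it defines (if any),
    /// and whether it must execute for its side effect.
    struct Op {
        uses: Vec<usize>,
        def: Option<usize>,
        side_effect: bool,
    }

    /// Reverse walk with demand bits, the analogue of `inst_needed` / `vreg_needed`.
    /// Returns, in forward order, the ops that actually get lowered.
    fn select_for_lowering(ops: &[Op], num_values: usize) -> Vec<usize> {
        let mut value_needed = vec![false; num_values];
        let mut lowered = Vec::new();
        for (idx, op) in ops.iter().enumerate().rev() {
            let demanded = op.def.map_or(false, |d| value_needed[d]);
            if op.side_effect || demanded {
                lowered.push(idx);
                for &u in &op.uses {
                    // The analogue of `use_input_reg`: force the producer to be emitted.
                    value_needed[u] = true;
                }
            }
        }
        lowered.reverse();
        lowered
    }

    fn main() {
        // v0 = iconst (pure), v1 = iconst (pure, dead), v2 = iadd v0, v0 (pure), store v2
        let ops = vec![
            Op { uses: vec![], def: Some(0), side_effect: false },
            Op { uses: vec![], def: Some(1), side_effect: false },
            Op { uses: vec![0, 0], def: Some(2), side_effect: false },
            Op { uses: vec![2], def: None, side_effect: true },
        ];
        // The dead iconst (op 1) is skipped; everything feeding the store is kept.
        assert_eq!(select_for_lowering(&ops, 3), vec![0, 2, 3]);
    }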
- fn dec_use(&mut self, ir_inst: Inst) { - assert!(self.num_uses[ir_inst] > 0); - self.num_uses[ir_inst] -= 1; - debug!( - "incref: ir_inst {} now has {} uses", - ir_inst, self.num_uses[ir_inst] - ); - } + fn get_input_for_val(&self, at_inst: Inst, val: Value) -> LowerInput { + debug!("get_input_for_val: val {} at inst {}", val, at_inst); + let mut reg = self.value_regs[val]; + debug!(" -> reg {:?}", reg); + assert!(reg.is_valid()); + let mut inst = match self.f.dfg.value_def(val) { + // OK to merge source instruction if (i) we have a source + // instruction, and either (ii-a) it has no side effects, or (ii-b) + // it has the same color as this instruction. + ValueDef::Result(src_inst, result_idx) => { + debug!(" -> src inst {}", src_inst); + debug!( + " -> has side effect: {}", + has_side_effect_or_load(self.f, src_inst) + ); + debug!( + " -> our color is {:?}, src inst is {:?}", + self.inst_color(at_inst), + self.inst_color(src_inst) + ); + if !has_side_effect_or_load(self.f, src_inst) + || self.inst_color(at_inst) == self.inst_color(src_inst) + { + Some((src_inst, result_idx)) + } else { + None + } + } + _ => None, + }; + let constant = inst.and_then(|(inst, _)| self.get_constant(inst)); - /// Increase the use-count of an IR instruction. Use this when, e.g., isel incorporates - /// the computation of an input instruction directly, so that input instruction's - /// inputs are now used directly by the merged instruction. - fn inc_use(&mut self, ir_inst: Inst) { - self.num_uses[ir_inst] += 1; - debug!( - "decref: ir_inst {} now has {} uses", - ir_inst, self.num_uses[ir_inst] - ); + // Pinned-reg hack: if backend specifies a fixed pinned register, use it + // directly when we encounter a GetPinnedReg op, rather than lowering + // the actual op, and do not return the source inst to the caller; the + // value comes "out of the ether" and we will not force generation of + // the superfluous move. + if let Some((i, _)) = inst { + if self.f.dfg[i].opcode() == Opcode::GetPinnedReg { + if let Some(pr) = self.pinned_reg { + reg = pr; + } + inst = None; + } + } + + LowerInput { + reg, + inst, + constant, + } } } impl<'func, I: VCodeInst> LowerCtx for Lower<'func, I> { type I = I; - /// Get the instdata for a given IR instruction. - fn data(&self, ir_inst: Inst) -> &InstructionData { - &self.f.dfg[ir_inst] - } - - /// Get the controlling type for a polymorphic IR instruction. - fn ty(&self, ir_inst: Inst) -> Type { - self.f.dfg.ctrl_typevar(ir_inst) - } - fn abi(&mut self) -> &dyn ABIBody { self.vcode.abi() } - /// Emit a machine instruction. - fn emit(&mut self, mach_inst: I) { - self.vcode.push(mach_inst); - } - - /// Indicate that a merge has occurred. - fn merged(&mut self, from_inst: Inst) { - debug!("merged: inst {}", from_inst); - // First, inc-ref all inputs of `from_inst`, because they are now used - // directly by `into_inst`. - for arg in self.f.dfg.inst_args(from_inst) { - let arg = self.f.dfg.resolve_aliases(*arg); - match self.f.dfg.value_def(arg) { - ValueDef::Result(src_inst, _) => { - debug!(" -> inc-reffing src inst {}", src_inst); - self.inc_use(src_inst); - } - _ => {} - } - } - // Then, dec-ref the merged instruction itself. It still retains references - // to its arguments (inc-ref'd above). If its refcount has reached zero, - // it will be skipped during emission and its args will be dec-ref'd at that - // time. - self.dec_use(from_inst); - } - - /// Get the producing instruction, if any, and output number, for the `idx`th input to the - /// given IR instruction. 
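On the backend side, the LowerInput handed out by get_input() (and built by get_input_for_val above) is typically consumed in priority order: fold the constant into an immediate when the encoding allows, otherwise merge the producing instruction if one is offered, otherwise fall back to the register and call use_input_reg() so the producer really gets emitted. A hedged sketch of that pattern; the Input and Operand types and fits_in_imm12 are illustrative stand-ins, not backend code from this patch:

    /// Mirrors the shape of `LowerInput` (plain ids stand in for `Reg` / `Inst`).
    #[derive(Clone, Copy)]
    struct Input {
        reg: u32,
        inst: Option<(u32, usize)>,
        constant: Option<u64>,
    }

    /// What a backend decided to do with one input operand.
    #[derive(Debug, PartialEq)]
    enum Operand {
        /// Fold the value into an immediate field.
        Imm(u64),
        /// Merge the producing instruction (e.g. a load or extend) into this one.
        Merge(u32),
        /// Use the register; the caller must then also mark it used,
        /// the analogue of `use_input_reg`.
        Reg(u32),
    }

    /// Illustrative immediate-range check, standing in for an ISA-specific predicate.
    fn fits_in_imm12(c: u64) -> bool {
        c < (1 << 12)
    }

    fn choose_operand(input: Input) -> Operand {
        if let Some(c) = input.constant {
            if fits_in_imm12(c) {
                return Operand::Imm(c);
            }
        }
        if let Some((src_inst, _result_idx)) = input.inst {
            // A real backend would only merge producers it has patterns for.
            return Operand::Merge(src_inst);
        }
        Operand::Reg(input.reg)
    }

    fn main() {
        assert_eq!(choose_operand(Input { reg: 1, inst: None, constant: Some(42) }), Operand::Imm(42));
        assert_eq!(choose_operand(Input { reg: 2, inst: Some((7, 0)), constant: None }), Operand::Merge(7));
        assert_eq!(choose_operand(Input { reg: 3, inst: None, constant: None }), Operand::Reg(3));
    }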
- fn input_inst(&self, ir_inst: Inst, idx: usize) -> Option<(Inst, usize)> { - let val = self.f.dfg.inst_args(ir_inst)[idx]; - let val = self.f.dfg.resolve_aliases(val); - match self.f.dfg.value_def(val) { - ValueDef::Result(src_inst, result_idx) => Some((src_inst, result_idx)), - _ => None, - } - } - - /// Map a Value to its associated writable (probably virtual) Reg. - fn value_to_writable_reg(&self, val: Value) -> Writable { - let val = self.f.dfg.resolve_aliases(val); - Writable::from_reg(self.value_regs[val]) - } - - /// Map a Value to its associated (probably virtual) Reg. - fn value_to_reg(&self, val: Value) -> Reg { - let val = self.f.dfg.resolve_aliases(val); - self.value_regs[val] - } - - /// Get the `idx`th input to the given IR instruction as a virtual register. - fn input(&self, ir_inst: Inst, idx: usize) -> Reg { - let val = self.f.dfg.inst_args(ir_inst)[idx]; - let val = self.f.dfg.resolve_aliases(val); - self.value_to_reg(val) - } - - /// Get the `idx`th output of the given IR instruction as a virtual register. - fn output(&self, ir_inst: Inst, idx: usize) -> Writable { - let val = self.f.dfg.inst_results(ir_inst)[idx]; - self.value_to_writable_reg(val) - } - - /// Get a new temp. - fn tmp(&mut self, rc: RegClass, ty: Type) -> Writable { - let v = self.next_vreg; - self.next_vreg += 1; - let vreg = Reg::new_virtual(rc, v); - self.vcode.set_vreg_type(vreg.as_virtual_reg().unwrap(), ty); - Writable::from_reg(vreg) - } - - /// Get the number of inputs for the given IR instruction. - fn num_inputs(&self, ir_inst: Inst) -> usize { - self.f.dfg.inst_args(ir_inst).len() - } - - /// Get the number of outputs for the given IR instruction. - fn num_outputs(&self, ir_inst: Inst) -> usize { - self.f.dfg.inst_results(ir_inst).len() - } - - /// Get the type for an instruction's input. - fn input_ty(&self, ir_inst: Inst, idx: usize) -> Type { - let val = self.f.dfg.inst_args(ir_inst)[idx]; - let val = self.f.dfg.resolve_aliases(val); - self.f.dfg.value_type(val) - } - - /// Get the type for an instruction's output. - fn output_ty(&self, ir_inst: Inst, idx: usize) -> Type { - self.f.dfg.value_type(self.f.dfg.inst_results(ir_inst)[idx]) - } - - /// Get the number of block params. - fn num_bb_params(&self, bb: Block) -> usize { - self.f.dfg.block_params(bb).len() - } - - /// Get the register for a block param. - fn bb_param(&self, bb: Block, idx: usize) -> Reg { - let val = self.f.dfg.block_params(bb)[idx]; - self.value_regs[val] - } - - /// Get the register for a return value. fn retval(&self, idx: usize) -> Writable { Writable::from_reg(self.retval_regs[idx].0) } - /// Get the target for a call instruction, as an `ExternalName`. Returns a tuple - /// providing this name and the "relocation distance", i.e., whether the backend - /// can assume the target will be "nearby" (within some small offset) or an - /// arbitrary address. (This comes from the `colocated` bit in the CLIF.) + fn data(&self, ir_inst: Inst) -> &InstructionData { + &self.f.dfg[ir_inst] + } + + fn ty(&self, ir_inst: Inst) -> Type { + self.f.dfg.ctrl_typevar(ir_inst) + } + fn call_target<'b>(&'b self, ir_inst: Inst) -> Option<(&'b ExternalName, RelocDistance)> { match &self.f.dfg[ir_inst] { &InstructionData::Call { func_ref, .. } @@ -686,7 +805,7 @@ impl<'func, I: VCodeInst> LowerCtx for Lower<'func, I> { _ => None, } } - /// Get the signature for a call or call-indirect instruction. + fn call_sig<'b>(&'b self, ir_inst: Inst) -> Option<&'b Signature> { match &self.f.dfg[ir_inst] { &InstructionData::Call { func_ref, .. 
} => { @@ -698,7 +817,6 @@ impl<'func, I: VCodeInst> LowerCtx for Lower<'func, I> { } } - /// Get the symbol name, relocation distance estimate, and offset for a symbol_value instruction. fn symbol_value<'b>(&'b self, ir_inst: Inst) -> Option<(&'b ExternalName, RelocDistance, i64)> { match &self.f.dfg[ir_inst] { &InstructionData::UnaryGlobalValue { global_value, .. } => { @@ -720,7 +838,6 @@ impl<'func, I: VCodeInst> LowerCtx for Lower<'func, I> { } } - /// Returns the memory flags of a given memory access. fn memflags(&self, ir_inst: Inst) -> Option { match &self.f.dfg[ir_inst] { &InstructionData::Load { flags, .. } @@ -731,27 +848,94 @@ impl<'func, I: VCodeInst> LowerCtx for Lower<'func, I> { } } - /// Get the source location for a given instruction. fn srcloc(&self, ir_inst: Inst) -> SourceLoc { self.f.srclocs[ir_inst] } + + fn inst_color(&self, ir_inst: Inst) -> InstColor { + self.inst_colors[ir_inst] + } + + fn num_inputs(&self, ir_inst: Inst) -> usize { + self.f.dfg.inst_args(ir_inst).len() + } + + fn num_outputs(&self, ir_inst: Inst) -> usize { + self.f.dfg.inst_results(ir_inst).len() + } + + fn input_ty(&self, ir_inst: Inst, idx: usize) -> Type { + let val = self.f.dfg.inst_args(ir_inst)[idx]; + let val = self.f.dfg.resolve_aliases(val); + self.f.dfg.value_type(val) + } + + fn output_ty(&self, ir_inst: Inst, idx: usize) -> Type { + self.f.dfg.value_type(self.f.dfg.inst_results(ir_inst)[idx]) + } + + fn get_constant(&self, ir_inst: Inst) -> Option { + self.inst_constants.get(&ir_inst).cloned() + } + + fn get_input(&self, ir_inst: Inst, idx: usize) -> LowerInput { + let val = self.f.dfg.inst_args(ir_inst)[idx]; + let val = self.f.dfg.resolve_aliases(val); + self.get_input_for_val(ir_inst, val) + } + + fn get_output(&mut self, ir_inst: Inst, idx: usize) -> Writable { + let val = self.f.dfg.inst_results(ir_inst)[idx]; + Writable::from_reg(self.value_regs[val]) + } + + fn tmp(&mut self, rc: RegClass, ty: Type) -> Writable { + let v = self.next_vreg; + self.next_vreg += 1; + let vreg = Reg::new_virtual(rc, v); + self.vcode.set_vreg_type(vreg.as_virtual_reg().unwrap(), ty); + Writable::from_reg(vreg) + } + + fn emit(&mut self, mach_inst: I) { + self.ir_insts.push(mach_inst); + } + + fn use_input_reg(&mut self, input: LowerInput) { + debug!("use_input_reg: vreg {:?} is needed", input.reg); + self.vreg_needed[input.reg.get_index()] = true; + } } -fn visit_branch_targets(f: &Function, block: Block, inst: Inst, mut visit: F) { +/// Visit all successors of a block with a given visitor closure. 
+pub(crate) fn visit_block_succs(f: &Function, block: Block, mut visit: F) { + for inst in f.layout.block_likely_branches(block) { + if f.dfg[inst].opcode().is_branch() { + visit_branch_targets(f, block, inst, &mut visit); + } + } +} + +fn visit_branch_targets( + f: &Function, + block: Block, + inst: Inst, + visit: &mut F, +) { if f.dfg[inst].opcode() == Opcode::Fallthrough { - visit(f.layout.next_block(block).unwrap()); + visit(inst, f.layout.next_block(block).unwrap()); } else { match f.dfg[inst].analyze_branch(&f.dfg.value_lists) { BranchInfo::NotABranch => {} BranchInfo::SingleDest(dest, _) => { - visit(dest); + visit(inst, dest); } BranchInfo::Table(table, maybe_dest) => { if let Some(dest) = maybe_dest { - visit(dest); + visit(inst, dest); } for &dest in f.jump_tables[table].as_slice() { - visit(dest); + visit(inst, dest); } } } diff --git a/cranelift/codegen/src/machinst/mod.rs b/cranelift/codegen/src/machinst/mod.rs index cc92982a84..517c3ac81c 100644 --- a/cranelift/codegen/src/machinst/mod.rs +++ b/cranelift/codegen/src/machinst/mod.rs @@ -109,6 +109,7 @@ use regalloc::RegUsageCollector; use regalloc::{ RealReg, RealRegUniverse, Reg, RegClass, RegUsageMapper, SpillSlot, VirtualReg, Writable, }; +use smallvec::SmallVec; use std::string::String; use target_lexicon::Triple; @@ -124,8 +125,8 @@ pub mod abi; pub use abi::*; pub mod pretty_print; pub use pretty_print::*; -pub mod sections; -pub use sections::*; +pub mod buffer; +pub use buffer::*; pub mod adapter; pub use adapter::*; @@ -152,6 +153,9 @@ pub trait MachInst: Clone + Debug { /// Generate a move. fn gen_move(to_reg: Writable, from_reg: Reg, ty: Type) -> Self; + /// Generate a constant into a reg. + fn gen_constant(to_reg: Writable, value: u64, ty: Type) -> SmallVec<[Self; 4]>; + /// Generate a zero-length no-op. fn gen_zero_len_nop() -> Self; @@ -166,7 +170,7 @@ pub trait MachInst: Clone + Debug { /// Generate a jump to another target. Used during lowering of /// control flow. - fn gen_jump(target: BlockIndex) -> Self; + fn gen_jump(target: MachLabel) -> Self; /// Generate a NOP. The `preferred_size` parameter allows the caller to /// request a NOP of that size, or as close to it as possible. The machine @@ -175,22 +179,62 @@ pub trait MachInst: Clone + Debug { /// the instruction must have a nonzero size. fn gen_nop(preferred_size: usize) -> Self; - /// Rewrite block targets using the block-target map. - fn with_block_rewrites(&mut self, block_target_map: &[BlockIndex]); - - /// Finalize branches once the block order (fallthrough) is known. - fn with_fallthrough_block(&mut self, fallthrough_block: Option); - - /// Update instruction once block offsets are known. These offsets are - /// relative to the beginning of the function. `targets` is indexed by - /// BlockIndex. - fn with_block_offsets(&mut self, my_offset: CodeOffset, targets: &[CodeOffset]); + /// Get the register universe for this backend. + fn reg_universe(flags: &Flags) -> RealRegUniverse; /// Align a basic block offset (from start of function). By default, no /// alignment occurs. fn align_basic_block(offset: CodeOffset) -> CodeOffset { offset } + + /// What is the worst-case instruction size emitted by this instruction type? + fn worst_case_size() -> CodeOffset; + + /// A label-use kind: a type that describes the types of label references that + /// can occur in an instruction. + type LabelUse: MachInstLabelUse; +} + +/// A descriptor of a label reference (use) in an instruction set. 
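As a concrete illustration of the label-use descriptor trait declared immediately below: a toy kind describing a 32-bit little-endian PC-relative field, showing how the patch contract rewrites buffer bytes once both the use offset and the label offset are known. This is illustrative only and does not implement the real trait; actual kinds, such as the 26-bit ARM jump form mentioned in the veneer documentation below, pack a word-scaled delta into specific instruction bits instead.

    /// A toy label-use kind: a 4-byte little-endian PC-relative word at `use_offset`.
    /// Mirrors the shape of the `MachInstLabelUse` methods.
    #[derive(Clone, Copy, Debug, PartialEq, Eq)]
    struct PcRel32;

    impl PcRel32 {
        fn max_pos_range(self) -> u32 {
            0x7fff_ffff
        }
        fn patch_size(self) -> u32 {
            4
        }
        /// Write `label_offset - use_offset` into the 4 bytes at `use_offset`.
        fn patch(self, buffer: &mut [u8], use_offset: u32, label_offset: u32) {
            let delta = label_offset.wrapping_sub(use_offset) as i32;
            let start = use_offset as usize;
            buffer[start..start + 4].copy_from_slice(&delta.to_le_bytes());
        }
    }

    fn main() {
        let mut code = vec![0u8; 12];
        // A label bound at offset 12, referenced from a fixup at offset 4.
        assert!(12 - 4 <= PcRel32.max_pos_range());
        assert_eq!(PcRel32.patch_size(), 4);
        PcRel32.patch(&mut code, 4, 12);
        assert_eq!(&code[4..8], &[8u8, 0, 0, 0]);
    }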
+pub trait MachInstLabelUse: Clone + Copy + Debug + Eq { + /// Required alignment for any veneer. Usually the required instruction + /// alignment (e.g., 4 for a RISC with 32-bit instructions, or 1 for x86). + const ALIGN: CodeOffset; + + /// What is the maximum PC-relative range (positive)? E.g., if `1024`, a + /// label-reference fixup at offset `x` is valid if the label resolves to `x + /// + 1024`. + fn max_pos_range(self) -> CodeOffset; + /// What is the maximum PC-relative range (negative)? This is the absolute + /// value; i.e., if `1024`, then a label-reference fixup at offset `x` is + /// valid if the label resolves to `x - 1024`. + fn max_neg_range(self) -> CodeOffset; + /// What is the size of code-buffer slice this label-use needs to patch in + /// the label's value? + fn patch_size(self) -> CodeOffset; + /// Perform a code-patch, given the offset into the buffer of this label use + /// and the offset into the buffer of the label's definition. + /// It is guaranteed that, given `delta = offset - label_offset`, we will + /// have `offset >= -self.max_neg_range()` and `offset <= + /// self.max_pos_range()`. + fn patch(self, buffer: &mut [u8], use_offset: CodeOffset, label_offset: CodeOffset); + /// Can the label-use be patched to a veneer that supports a longer range? + /// Usually valid for jumps (a short-range jump can jump to a longer-range + /// jump), but not for e.g. constant pool references, because the constant + /// load would require different code (one more level of indirection). + fn supports_veneer(self) -> bool; + /// How many bytes are needed for a veneer? + fn veneer_size(self) -> CodeOffset; + /// Generate a veneer. The given code-buffer slice is `self.veneer_size()` + /// bytes long at offset `veneer_offset` in the buffer. The original + /// label-use will be patched to refer to this veneer's offset. A new + /// (offset, LabelUse) is returned that allows the veneer to use the actual + /// label. For veneers to work properly, it is expected that the new veneer + /// has a larger range; on most platforms this probably means either a + /// "long-range jump" (e.g., on ARM, the 26-bit form), or if already at that + /// stage, a jump that supports a full 32-bit range, for example. + fn generate_veneer(self, buffer: &mut [u8], veneer_offset: CodeOffset) -> (CodeOffset, Self); } /// Describes a block terminator (not call) in the vcode, when its branches @@ -202,26 +246,26 @@ pub enum MachTerminator<'a> { /// A return instruction. Ret, /// An unconditional branch to another block. - Uncond(BlockIndex), + Uncond(MachLabel), /// A conditional branch to one of two other blocks. - Cond(BlockIndex, BlockIndex), + Cond(MachLabel, MachLabel), /// An indirect branch with known possible targets. - Indirect(&'a [BlockIndex]), + Indirect(&'a [MachLabel]), } /// A trait describing the ability to encode a MachInst into binary machine code. -pub trait MachInstEmit { +pub trait MachInstEmit: MachInst { /// Persistent state carried across `emit` invocations. type State: Default + Clone + Debug; /// Emit the instruction. - fn emit(&self, code: &mut O, flags: &Flags, state: &mut Self::State); + fn emit(&self, code: &mut MachBuffer, flags: &Flags, state: &mut Self::State); } /// The result of a `MachBackend::compile_function()` call. Contains machine /// code (as bytes) and a disassembly, if requested. pub struct MachCompileResult { /// Machine code. - pub sections: MachSections, + pub buffer: MachBufferFinalized, /// Size of stack frame, in bytes. 
pub frame_size: u32, /// Disassembly, if requested. @@ -231,7 +275,7 @@ pub struct MachCompileResult { impl MachCompileResult { /// Get a `CodeInfo` describing section sizes from this compilation result. pub fn code_info(&self) -> CodeInfo { - let code_size = self.sections.total_size(); + let code_size = self.buffer.total_size(); CodeInfo { code_size, jumptables_size: 0, diff --git a/cranelift/codegen/src/machinst/sections.rs b/cranelift/codegen/src/machinst/sections.rs deleted file mode 100644 index 0bd97dcdb6..0000000000 --- a/cranelift/codegen/src/machinst/sections.rs +++ /dev/null @@ -1,460 +0,0 @@ -//! In-memory representation of compiled machine code, in multiple sections -//! (text, constant pool / rodata, etc). Emission occurs into multiple sections -//! simultaneously, so we buffer the result in memory and hand off to the -//! caller at the end of compilation. - -use crate::binemit::{Addend, CodeOffset, CodeSink, Reloc}; -use crate::ir::{ExternalName, Opcode, SourceLoc, TrapCode}; - -use alloc::vec::Vec; - -/// A collection of sections with defined start-offsets. -pub struct MachSections { - /// Sections, in offset order. - pub sections: Vec, -} - -impl MachSections { - /// New, empty set of sections. - pub fn new() -> MachSections { - MachSections { sections: vec![] } - } - - /// Add a section with a known offset and size. Returns the index. - pub fn add_section(&mut self, start: CodeOffset, length: CodeOffset) -> usize { - let idx = self.sections.len(); - self.sections.push(MachSection::new(start, length)); - idx - } - - /// Mutably borrow the given section by index. - pub fn get_section<'a>(&'a mut self, idx: usize) -> &'a mut MachSection { - &mut self.sections[idx] - } - - /// Get mutable borrows of two sections simultaneously. Used during - /// instruction emission to provide references to the .text and .rodata - /// (constant pool) sections. - pub fn two_sections<'a>( - &'a mut self, - idx1: usize, - idx2: usize, - ) -> (&'a mut MachSection, &'a mut MachSection) { - assert!(idx1 < idx2); - assert!(idx1 < self.sections.len()); - assert!(idx2 < self.sections.len()); - let (first, rest) = self.sections.split_at_mut(idx2); - (&mut first[idx1], &mut rest[0]) - } - - /// Emit this set of sections to a set of sinks for the code, - /// relocations, traps, and stackmap. - pub fn emit(&self, sink: &mut CS) { - // N.B.: we emit every section into the .text section as far as - // the `CodeSink` is concerned; we do not bother to segregate - // the contents into the actual program text, the jumptable and the - // rodata (constant pool). This allows us to generate code assuming - // that these will not be relocated relative to each other, and avoids - // having to designate each section as belonging in one of the three - // fixed categories defined by `CodeSink`. If this becomes a problem - // later (e.g. because of memory permissions or similar), we can - // add this designation and segregate the output; take care, however, - // to add the appropriate relocations in this case. - - for section in &self.sections { - if section.data.len() > 0 { - while sink.offset() < section.start_offset { - sink.put1(0); - } - section.emit(sink); - } - } - sink.begin_jumptables(); - sink.begin_rodata(); - sink.end_codegen(); - } - - /// Get a list of source location mapping tuples in sorted-by-start-offset order. - pub fn get_srclocs_sorted<'a>(&'a self) -> MachSectionsSrcLocs<'a> { - MachSectionsSrcLocs::new(&self.sections) - } - - /// Get the total required size for these sections. 
- pub fn total_size(&self) -> CodeOffset { - if self.sections.len() == 0 { - 0 - } else { - // Find the last non-empty section. - self.sections - .iter() - .rev() - .find(|s| s.data.len() > 0) - .map(|s| s.cur_offset_from_start()) - .unwrap_or(0) - } - } -} - -/// An iterator over the srclocs in each section. -/// Returns MachSrcLocs in an order sorted by start location. -pub struct MachSectionsSrcLocs<'a> { - sections: &'a [MachSection], - cur_section: usize, - cur_srcloc: usize, - // For validation: - last_offset: CodeOffset, -} - -impl<'a> MachSectionsSrcLocs<'a> { - fn new(sections: &'a [MachSection]) -> MachSectionsSrcLocs<'a> { - MachSectionsSrcLocs { - sections, - cur_section: 0, - cur_srcloc: 0, - last_offset: 0, - } - } -} - -impl<'a> Iterator for MachSectionsSrcLocs<'a> { - type Item = &'a MachSrcLoc; - - fn next(&mut self) -> Option<&'a MachSrcLoc> { - // We simply iterate through sections and srcloc records in order. This produces a - // sorted order naturally because sections are in starting-offset-order, and srclocs - // are produced as a section is emitted into, so are in order as well. - - // If we're out of sections, we're done. - if self.cur_section >= self.sections.len() { - return None; - } - - // Otherwise, make sure we have a srcloc in the current section left to return, and - // advance to the next section if not. Done if we run out of sections. - while self.cur_srcloc >= self.sections[self.cur_section].srclocs.len() { - self.cur_srcloc = 0; - self.cur_section += 1; - if self.cur_section >= self.sections.len() { - return None; - } - } - - let loc = &self.sections[self.cur_section].srclocs[self.cur_srcloc]; - self.cur_srcloc += 1; - debug_assert!(loc.start >= self.last_offset); - self.last_offset = loc.start; - Some(loc) - } -} - -/// An abstraction over MachSection and MachSectionSize: some -/// receiver of section data. -pub trait MachSectionOutput { - /// Get the current offset from the start of all sections. - fn cur_offset_from_start(&self) -> CodeOffset; - - /// Get the start offset of this section. - fn start_offset(&self) -> CodeOffset; - - /// Add 1 byte to the section. - fn put1(&mut self, _: u8); - - /// Add 2 bytes to the section. - fn put2(&mut self, value: u16) { - let [b0, b1] = value.to_le_bytes(); - self.put1(b0); - self.put1(b1); - } - - /// Add 4 bytes to the section. - fn put4(&mut self, value: u32) { - let [b0, b1, b2, b3] = value.to_le_bytes(); - self.put1(b0); - self.put1(b1); - self.put1(b2); - self.put1(b3); - } - - /// Add 8 bytes to the section. - fn put8(&mut self, value: u64) { - let [b0, b1, b2, b3, b4, b5, b6, b7] = value.to_le_bytes(); - self.put1(b0); - self.put1(b1); - self.put1(b2); - self.put1(b3); - self.put1(b4); - self.put1(b5); - self.put1(b6); - self.put1(b7); - } - - /// Add a slice of bytes to the section. - fn put_data(&mut self, data: &[u8]); - - /// Add a relocation at the current offset. - fn add_reloc(&mut self, loc: SourceLoc, kind: Reloc, name: &ExternalName, addend: Addend); - - /// Add a trap record at the current offset. - fn add_trap(&mut self, loc: SourceLoc, code: TrapCode); - - /// Add a call return address record at the current offset. - fn add_call_site(&mut self, loc: SourceLoc, opcode: Opcode); - - /// Start the output for the given source-location at the current offset. - fn start_srcloc(&mut self, loc: SourceLoc); - - /// End the output for the previously-given source-location at the current offset. - fn end_srcloc(&mut self); - - /// Align up to the given alignment. 
- fn align_to(&mut self, align_to: CodeOffset) { - assert!(align_to.is_power_of_two()); - while self.cur_offset_from_start() & (align_to - 1) != 0 { - self.put1(0); - } - } -} - -/// A section of output to be emitted to a CodeSink / RelocSink in bulk. -/// Multiple sections may be created with known start offsets in advance; the -/// usual use-case is to create the .text (code) and .rodata (constant pool) at -/// once, after computing the length of the code, so that constant references -/// can use known offsets as instructions are emitted. -pub struct MachSection { - /// The starting offset of this section. - pub start_offset: CodeOffset, - /// The limit of this section, defined by the start of the next section. - pub length_limit: CodeOffset, - /// The section contents, as raw bytes. - pub data: Vec, - /// Any relocations referring to this section. - pub relocs: Vec, - /// Any trap records referring to this section. - pub traps: Vec, - /// Any call site records referring to this section. - pub call_sites: Vec, - /// Any source location mappings referring to this section. - pub srclocs: Vec, - /// The current source location in progress (after `start_srcloc()` and before `end_srcloc()`). - /// This is a (start_offset, src_loc) tuple. - pub cur_srcloc: Option<(CodeOffset, SourceLoc)>, -} - -impl MachSection { - /// Create a new section, known to start at `start_offset` and with a size limited to `length_limit`. - pub fn new(start_offset: CodeOffset, length_limit: CodeOffset) -> MachSection { - MachSection { - start_offset, - length_limit, - data: vec![], - relocs: vec![], - traps: vec![], - call_sites: vec![], - srclocs: vec![], - cur_srcloc: None, - } - } - - /// Emit this section to the CodeSink and other associated sinks. The - /// current offset of the CodeSink must match the starting offset of this - /// section. 
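The `align_to` helper above pads byte-by-byte until the offset reaches a power-of-two boundary. The closed-form equivalent is the usual round-up bit trick, shown here as a small self-contained sketch (the alignment value 4 is just an example):

    fn align_up(offset: u32, align: u32) -> u32 {
        assert!(align.is_power_of_two());
        (offset + align - 1) & !(align - 1)
    }

    fn main() {
        assert_eq!(align_up(13, 4), 16);
        assert_eq!(align_up(16, 4), 16); // already aligned: no padding
        // Number of zero bytes the byte-at-a-time loop would emit:
        assert_eq!(align_up(13, 4) - 13, 3);
    }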
- pub fn emit(&self, sink: &mut CS) { - assert!(sink.offset() == self.start_offset); - - let mut next_reloc = 0; - let mut next_trap = 0; - let mut next_call_site = 0; - for (idx, byte) in self.data.iter().enumerate() { - if next_reloc < self.relocs.len() { - let reloc = &self.relocs[next_reloc]; - if reloc.offset == idx as CodeOffset { - sink.reloc_external(reloc.srcloc, reloc.kind, &reloc.name, reloc.addend); - next_reloc += 1; - } - } - if next_trap < self.traps.len() { - let trap = &self.traps[next_trap]; - if trap.offset == idx as CodeOffset { - sink.trap(trap.code, trap.srcloc); - next_trap += 1; - } - } - if next_call_site < self.call_sites.len() { - let call_site = &self.call_sites[next_call_site]; - if call_site.ret_addr == idx as CodeOffset { - sink.add_call_site(call_site.opcode, call_site.srcloc); - next_call_site += 1; - } - } - sink.put1(*byte); - } - } -} - -impl MachSectionOutput for MachSection { - fn cur_offset_from_start(&self) -> CodeOffset { - self.start_offset + self.data.len() as CodeOffset - } - - fn start_offset(&self) -> CodeOffset { - self.start_offset - } - - fn put1(&mut self, value: u8) { - assert!(((self.data.len() + 1) as CodeOffset) <= self.length_limit); - self.data.push(value); - } - - fn put_data(&mut self, data: &[u8]) { - assert!(((self.data.len() + data.len()) as CodeOffset) <= self.length_limit); - self.data.extend_from_slice(data); - } - - fn add_reloc(&mut self, srcloc: SourceLoc, kind: Reloc, name: &ExternalName, addend: Addend) { - let name = name.clone(); - self.relocs.push(MachReloc { - offset: self.data.len() as CodeOffset, - srcloc, - kind, - name, - addend, - }); - } - - fn add_trap(&mut self, srcloc: SourceLoc, code: TrapCode) { - self.traps.push(MachTrap { - offset: self.data.len() as CodeOffset, - srcloc, - code, - }); - } - - fn add_call_site(&mut self, srcloc: SourceLoc, opcode: Opcode) { - self.call_sites.push(MachCallSite { - ret_addr: self.data.len() as CodeOffset, - srcloc, - opcode, - }); - } - - fn start_srcloc(&mut self, loc: SourceLoc) { - self.cur_srcloc = Some((self.cur_offset_from_start(), loc)); - } - - fn end_srcloc(&mut self) { - let (start, loc) = self - .cur_srcloc - .take() - .expect("end_srcloc() called without start_srcloc()"); - let end = self.cur_offset_from_start(); - // Skip zero-length extends. - debug_assert!(end >= start); - if end > start { - self.srclocs.push(MachSrcLoc { start, end, loc }); - } - } -} - -/// A MachSectionOutput implementation that records only size. -pub struct MachSectionSize { - /// The starting offset of this section. - pub start_offset: CodeOffset, - /// The current offset of this section. - pub offset: CodeOffset, -} - -impl MachSectionSize { - /// Create a new size-counting dummy section. - pub fn new(start_offset: CodeOffset) -> MachSectionSize { - MachSectionSize { - start_offset, - offset: start_offset, - } - } - - /// Return the size this section would take if emitted with a real sink. - pub fn size(&self) -> CodeOffset { - self.offset - self.start_offset - } -} - -impl MachSectionOutput for MachSectionSize { - fn cur_offset_from_start(&self) -> CodeOffset { - // All size-counting sections conceptually start at offset 0; this doesn't - // matter when counting code size. 
- self.offset - } - - fn start_offset(&self) -> CodeOffset { - self.start_offset - } - - fn put1(&mut self, _: u8) { - self.offset += 1; - } - - fn put_data(&mut self, data: &[u8]) { - self.offset += data.len() as CodeOffset; - } - - fn add_reloc(&mut self, _: SourceLoc, _: Reloc, _: &ExternalName, _: Addend) {} - - fn add_trap(&mut self, _: SourceLoc, _: TrapCode) {} - - fn add_call_site(&mut self, _: SourceLoc, _: Opcode) {} - - fn start_srcloc(&mut self, _: SourceLoc) {} - - fn end_srcloc(&mut self) {} -} - -/// A relocation resulting from a compilation. -pub struct MachReloc { - /// The offset at which the relocation applies, *relative to the - /// containing section*. - pub offset: CodeOffset, - /// The original source location. - pub srcloc: SourceLoc, - /// The kind of relocation. - pub kind: Reloc, - /// The external symbol / name to which this relocation refers. - pub name: ExternalName, - /// The addend to add to the symbol value. - pub addend: i64, -} - -/// A trap record resulting from a compilation. -pub struct MachTrap { - /// The offset at which the trap instruction occurs, *relative to the - /// containing section*. - pub offset: CodeOffset, - /// The original source location. - pub srcloc: SourceLoc, - /// The trap code. - pub code: TrapCode, -} - -/// A call site record resulting from a compilation. -pub struct MachCallSite { - /// The offset of the call's return address, *relative to the containing section*. - pub ret_addr: CodeOffset, - /// The original source location. - pub srcloc: SourceLoc, - /// The call's opcode. - pub opcode: Opcode, -} - -/// A source-location mapping resulting from a compilation. -#[derive(Clone, Debug)] -pub struct MachSrcLoc { - /// The start of the region of code corresponding to a source location. - /// This is relative to the start of the function, not to the start of the - /// section. - pub start: CodeOffset, - /// The end of the region of code corresponding to a source location. - /// This is relative to the start of the section, not to the start of the - /// section. - pub end: CodeOffset, - /// The source location. - pub loc: SourceLoc, -} diff --git a/cranelift/codegen/src/machinst/vcode.rs b/cranelift/codegen/src/machinst/vcode.rs index ff9961aefa..d4c13bff0c 100644 --- a/cranelift/codegen/src/machinst/vcode.rs +++ b/cranelift/codegen/src/machinst/vcode.rs @@ -17,8 +17,7 @@ //! See the main module comment in `mod.rs` for more details on the VCode-based //! backend pipeline. -use crate::entity::SecondaryMap; -use crate::ir::{self, Block, SourceLoc}; +use crate::ir::{self, SourceLoc}; use crate::machinst::*; use crate::settings; @@ -30,8 +29,6 @@ use regalloc::{ use alloc::boxed::Box; use alloc::{borrow::Cow, vec::Vec}; -use log::debug; -use smallvec::SmallVec; use std::fmt; use std::iter; use std::string::String; @@ -43,8 +40,8 @@ pub type BlockIndex = u32; /// VCodeInst wraps all requirements for a MachInst to be in VCode: it must be /// a `MachInst` and it must be able to emit itself at least to a `SizeCodeSink`. -pub trait VCodeInst: MachInst + MachInstEmit + MachInstEmit {} -impl + MachInstEmit> VCodeInst for I {} +pub trait VCodeInst: MachInst + MachInstEmit {} +impl VCodeInst for I {} /// A function in "VCode" (virtualized-register code) form, after lowering. /// This is essentially a standard CFG of basic blocks, where each basic block @@ -80,29 +77,11 @@ pub struct VCode { /// correspond to each basic block's successors. block_succs: Vec, - /// Block indices by IR block. 
- block_by_bb: SecondaryMap, - - /// IR block for each VCode Block. The length of this Vec will likely be - /// less than the total number of Blocks, because new Blocks (for edge - /// splits, for example) are appended during lowering. - bb_by_block: Vec, - - /// Order of block IDs in final generated code. - final_block_order: Vec, - - /// Final block offsets. Computed during branch finalization and used - /// during emission. - final_block_offsets: Vec, - - /// Size of code, accounting for block layout / alignment. - code_size: CodeOffset, + /// Block-order information. + block_order: BlockLoweringOrder, /// ABI object. abi: Box>, - - /// The block targeted by fallthrough_returns, if there's one. - pub fallthrough_return_block: Option, } /// A builder for a VCode function body. This builder is designed for the @@ -123,12 +102,8 @@ pub struct VCodeBuilder { /// In-progress VCode. vcode: VCode, - /// Current basic block instructions, in reverse order (because blocks are - /// built bottom-to-top). - bb_insns: SmallVec<[(I, SourceLoc); 32]>, - - /// Current IR-inst instructions, in forward order. - ir_inst_insns: SmallVec<[(I, SourceLoc); 4]>, + /// Index of the last block-start in the vcode. + block_start: InsnIndex, /// Start of succs for the current block in the concatenated succs list. succ_start: usize, @@ -139,12 +114,11 @@ pub struct VCodeBuilder { impl VCodeBuilder { /// Create a new VCodeBuilder. - pub fn new(abi: Box>) -> VCodeBuilder { - let vcode = VCode::new(abi); + pub fn new(abi: Box>, block_order: BlockLoweringOrder) -> VCodeBuilder { + let vcode = VCode::new(abi, block_order); VCodeBuilder { vcode, - bb_insns: SmallVec::new(), - ir_inst_insns: SmallVec::new(), + block_start: 0, succ_start: 0, cur_srcloc: SourceLoc::default(), } @@ -155,14 +129,9 @@ impl VCodeBuilder { &mut *self.vcode.abi } - /// Set the fallthrough_return target block for this function. There must be at most once per - /// function. - pub fn set_fallthrough_return_block(&mut self, bb: Block) { - debug_assert!( - self.vcode.fallthrough_return_block.is_none(), - "a function must have at most one fallthrough-return instruction" - ); - self.vcode.fallthrough_return_block = Some(self.bb_to_bindex(bb)); + /// Access to the BlockLoweringOrder object. + pub fn block_order(&self) -> &BlockLoweringOrder { + &self.vcode.block_order } /// Set the type of a VReg. @@ -173,53 +142,17 @@ impl VCodeBuilder { self.vcode.vreg_types[vreg.get_index()] = ty; } - /// Return the underlying bb-to-BlockIndex map. - pub fn blocks_by_bb(&self) -> &SecondaryMap { - &self.vcode.block_by_bb - } - - /// Initialize the bb-to-BlockIndex map. Returns the first free - /// BlockIndex. - pub fn init_bb_map(&mut self, blocks: &[ir::Block]) -> BlockIndex { - let mut bindex: BlockIndex = 0; - for bb in blocks.iter() { - self.vcode.block_by_bb[*bb] = bindex; - self.vcode.bb_by_block.push(*bb); - bindex += 1; - } - bindex - } - - /// Get the BlockIndex for an IR block. - pub fn bb_to_bindex(&self, bb: ir::Block) -> BlockIndex { - self.vcode.block_by_bb[bb] - } - /// Set the current block as the entry block. pub fn set_entry(&mut self, block: BlockIndex) { self.vcode.entry = block; } - /// End the current IR instruction. Must be called after pushing any - /// instructions and prior to ending the basic block. - pub fn end_ir_inst(&mut self) { - while let Some(pair) = self.ir_inst_insns.pop() { - self.bb_insns.push(pair); - } - } - /// End the current basic block. 
Must be called after emitting vcode insts /// for IR insts and prior to ending the function (building the VCode). - pub fn end_bb(&mut self) -> BlockIndex { - assert!(self.ir_inst_insns.is_empty()); - let block_num = self.vcode.block_ranges.len() as BlockIndex; - // Push the instructions. - let start_idx = self.vcode.insts.len() as InsnIndex; - while let Some((i, loc)) = self.bb_insns.pop() { - self.vcode.insts.push(i); - self.vcode.srclocs.push(loc); - } + pub fn end_bb(&mut self) { + let start_idx = self.block_start; let end_idx = self.vcode.insts.len() as InsnIndex; + self.block_start = end_idx; // Add the instruction index range to the list of blocks. self.vcode.block_ranges.push((start_idx, end_idx)); // End the successors list. @@ -228,8 +161,6 @@ impl VCodeBuilder { .block_succ_range .push((self.succ_start, succ_end)); self.succ_start = succ_end; - - block_num } /// Push an instruction for the current BB and current IR inst within the BB. @@ -237,19 +168,27 @@ impl VCodeBuilder { match insn.is_term() { MachTerminator::None | MachTerminator::Ret => {} MachTerminator::Uncond(target) => { - self.vcode.block_succs.push(BlockIx::new(target)); + self.vcode.block_succs.push(BlockIx::new(target.get())); } MachTerminator::Cond(true_branch, false_branch) => { - self.vcode.block_succs.push(BlockIx::new(true_branch)); - self.vcode.block_succs.push(BlockIx::new(false_branch)); + self.vcode.block_succs.push(BlockIx::new(true_branch.get())); + self.vcode + .block_succs + .push(BlockIx::new(false_branch.get())); } MachTerminator::Indirect(targets) => { for target in targets { - self.vcode.block_succs.push(BlockIx::new(*target)); + self.vcode.block_succs.push(BlockIx::new(target.get())); } } } - self.ir_inst_insns.push((insn, self.cur_srcloc)); + self.vcode.insts.push(insn); + self.vcode.srclocs.push(self.cur_srcloc); + } + + /// Get the current source location. + pub fn get_srcloc(&self) -> SourceLoc { + self.cur_srcloc } /// Set the current source location. @@ -259,8 +198,6 @@ impl VCodeBuilder { /// Build the final VCode. pub fn build(self) -> VCode { - assert!(self.ir_inst_insns.is_empty()); - assert!(self.bb_insns.is_empty()); self.vcode } } @@ -282,35 +219,9 @@ fn is_redundant_move(insn: &I) -> bool { } } -fn is_trivial_jump_block(vcode: &VCode, block: BlockIndex) -> Option { - let range = vcode.block_insns(BlockIx::new(block)); - - debug!( - "is_trivial_jump_block: block {} has len {}", - block, - range.len() - ); - - if range.len() != 1 { - return None; - } - let insn = range.first(); - - debug!( - " -> only insn is: {:?} with terminator {:?}", - vcode.get_insn(insn), - vcode.get_insn(insn).is_term() - ); - - match vcode.get_insn(insn).is_term() { - MachTerminator::Uncond(target) => Some(target), - _ => None, - } -} - impl VCode { /// New empty VCode. - fn new(abi: Box>) -> VCode { + fn new(abi: Box>, block_order: BlockLoweringOrder) -> VCode { VCode { liveins: abi.liveins(), liveouts: abi.liveouts(), @@ -321,13 +232,8 @@ impl VCode { block_ranges: vec![], block_succ_range: vec![], block_succs: vec![], - block_by_bb: SecondaryMap::with_default(0), - bb_by_block: vec![], - final_block_order: vec![], - final_block_offsets: vec![], - code_size: 0, + block_order, abi, - fallthrough_return_block: None, } } @@ -367,8 +273,6 @@ impl VCode { /// instructions including spliced fill/reload/move instructions, and replace /// the VCode with them. 
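The new `end_bb` above no longer buffers instructions per block: instructions go straight into one flat vector, and each block is just a half-open index range over it, so the builder only has to remember where the current block started. A tiny stand-alone sketch of that bookkeeping (simplified types, not the real `VCodeBuilder`):

    struct Builder {
        insts: Vec<&'static str>,
        block_start: usize,
        block_ranges: Vec<(usize, usize)>,
    }

    impl Builder {
        fn push(&mut self, inst: &'static str) {
            self.insts.push(inst);
        }
        fn end_bb(&mut self) {
            // The current block covers everything pushed since the last end_bb().
            let start = self.block_start;
            let end = self.insts.len();
            self.block_start = end;
            self.block_ranges.push((start, end));
        }
    }

    fn main() {
        let mut b = Builder { insts: vec![], block_start: 0, block_ranges: vec![] };
        b.push("add");
        b.push("ret");
        b.end_bb();
        b.push("brz");
        b.end_bb();
        assert_eq!(b.block_ranges, vec![(0, 2), (2, 3)]);
    }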
pub fn replace_insns_from_regalloc(&mut self, result: RegAllocResult) { - self.final_block_order = compute_final_block_order(self); - // Record the spillslot count and clobbered registers for the ABI/stack // setup code. self.abi.set_num_spillslots(result.num_spill_slots as usize); @@ -383,11 +287,12 @@ impl VCode { let mut final_block_ranges = vec![(0, 0); self.num_blocks()]; let mut final_srclocs = vec![]; - for block in &self.final_block_order { - let (start, end) = block_ranges[*block as usize]; + for block in 0..self.num_blocks() { + let block = block as BlockIndex; + let (start, end) = block_ranges[block as usize]; let final_start = final_insns.len() as InsnIndex; - if *block == self.entry { + if block == self.entry { // Start with the prologue. let prologue = self.abi.gen_prologue(); let len = prologue.len(); @@ -429,7 +334,7 @@ impl VCode { } let final_end = final_insns.len() as InsnIndex; - final_block_ranges[*block as usize] = (final_start, final_end); + final_block_ranges[block as usize] = (final_start, final_end); } debug_assert!(final_insns.len() == final_srclocs.len()); @@ -439,175 +344,68 @@ impl VCode { self.block_ranges = final_block_ranges; } - /// Removes redundant branches, rewriting targets to point directly to the - /// ultimate block at the end of a chain of trivial one-target jumps. - pub fn remove_redundant_branches(&mut self) { - // For each block, compute the actual target block, looking through up to one - // block with single-target jumps (this will remove empty edge blocks inserted - // by phi-lowering). - let block_rewrites: Vec = (0..self.num_blocks() as u32) - .map(|bix| is_trivial_jump_block(self, bix).unwrap_or(bix)) - .collect(); - let mut refcounts: Vec = vec![0; self.num_blocks()]; - - debug!( - "remove_redundant_branches: block_rewrites = {:?}", - block_rewrites - ); - - refcounts[self.entry as usize] = 1; - - for block in 0..self.num_blocks() as u32 { - for insn in self.block_insns(BlockIx::new(block)) { - self.get_insn_mut(insn) - .with_block_rewrites(&block_rewrites[..]); - match self.get_insn(insn).is_term() { - MachTerminator::Uncond(bix) => { - refcounts[bix as usize] += 1; - } - MachTerminator::Cond(bix1, bix2) => { - refcounts[bix1 as usize] += 1; - refcounts[bix2 as usize] += 1; - } - MachTerminator::Indirect(blocks) => { - for block in blocks { - refcounts[*block as usize] += 1; - } - } - _ => {} - } - } - } - - let deleted: Vec = refcounts.iter().map(|r| *r == 0).collect(); - - let block_order = std::mem::replace(&mut self.final_block_order, vec![]); - self.final_block_order = block_order - .into_iter() - .filter(|b| !deleted[*b as usize]) - .collect(); - - // Rewrite successor information based on the block-rewrite map. - for succ in &mut self.block_succs { - let new_succ = block_rewrites[succ.get() as usize]; - *succ = BlockIx::new(new_succ); - } - } - - /// Mutate branch instructions to (i) lower two-way condbrs to one-way, - /// depending on fallthrough; and (ii) use concrete offsets. - pub fn finalize_branches(&mut self) + /// Emit the instructions to a `MachBuffer`, containing fixed-up code and external + /// reloc/trap/etc. records ready for use. + pub fn emit(&self) -> MachBuffer where - I: MachInstEmit, + I: MachInstEmit, { - // Compute fallthrough block, indexed by block. 
- let num_final_blocks = self.final_block_order.len(); - let mut block_fallthrough: Vec> = vec![None; self.num_blocks()]; - for i in 0..(num_final_blocks - 1) { - let from = self.final_block_order[i]; - let to = self.final_block_order[i + 1]; - block_fallthrough[from as usize] = Some(to); - } - - // Pass over VCode instructions and finalize two-way branches into - // one-way branches with fallthrough. - for block in 0..self.num_blocks() { - let next_block = block_fallthrough[block]; - let (start, end) = self.block_ranges[block]; - - for iix in start..end { - let insn = &mut self.insts[iix as usize]; - insn.with_fallthrough_block(next_block); - } - } - - let flags = self.abi.flags(); - - // Compute block offsets. - let mut code_section = MachSectionSize::new(0); - let mut block_offsets = vec![0; self.num_blocks()]; + let mut buffer = MachBuffer::new(); let mut state = Default::default(); - for &block in &self.final_block_order { - code_section.offset = I::align_basic_block(code_section.offset); - block_offsets[block as usize] = code_section.offset; - let (start, end) = self.block_ranges[block as usize]; - for iix in start..end { - self.insts[iix as usize].emit(&mut code_section, flags, &mut state); - } - } - // We now have the section layout. - self.final_block_offsets = block_offsets; - self.code_size = code_section.size(); - - // Update branches with known block offsets. This looks like the - // traversal above, but (i) does not update block_offsets, rather uses - // it (so forward references are now possible), and (ii) mutates the - // instructions. - let mut code_section = MachSectionSize::new(0); - let mut state = Default::default(); - for &block in &self.final_block_order { - code_section.offset = I::align_basic_block(code_section.offset); - let (start, end) = self.block_ranges[block as usize]; - for iix in start..end { - self.insts[iix as usize] - .with_block_offsets(code_section.offset, &self.final_block_offsets[..]); - self.insts[iix as usize].emit(&mut code_section, flags, &mut state); - } - } - } - - /// Emit the instructions to a list of sections. - pub fn emit(&self) -> MachSections - where - I: MachInstEmit, - { - let mut sections = MachSections::new(); - let code_idx = sections.add_section(0, self.code_size); - let code_section = sections.get_section(code_idx); - let mut state = Default::default(); + buffer.reserve_labels_for_blocks(self.num_blocks() as BlockIndex); // first N MachLabels are simply block indices. let flags = self.abi.flags(); let mut cur_srcloc = None; - for &block in &self.final_block_order { - let new_offset = I::align_basic_block(code_section.cur_offset_from_start()); - while new_offset > code_section.cur_offset_from_start() { + for block in 0..self.num_blocks() { + let block = block as BlockIndex; + let new_offset = I::align_basic_block(buffer.cur_offset()); + while new_offset > buffer.cur_offset() { // Pad with NOPs up to the aligned block offset. 
- let nop = I::gen_nop((new_offset - code_section.cur_offset_from_start()) as usize); - nop.emit(code_section, flags, &mut Default::default()); + let nop = I::gen_nop((new_offset - buffer.cur_offset()) as usize); + nop.emit(&mut buffer, flags, &mut Default::default()); } - assert_eq!(code_section.cur_offset_from_start(), new_offset); + assert_eq!(buffer.cur_offset(), new_offset); let (start, end) = self.block_ranges[block as usize]; + buffer.bind_label(MachLabel::from_block(block)); for iix in start..end { let srcloc = self.srclocs[iix as usize]; if cur_srcloc != Some(srcloc) { if cur_srcloc.is_some() { - code_section.end_srcloc(); + buffer.end_srcloc(); } - code_section.start_srcloc(srcloc); + buffer.start_srcloc(srcloc); cur_srcloc = Some(srcloc); } - self.insts[iix as usize].emit(code_section, flags, &mut state); + self.insts[iix as usize].emit(&mut buffer, flags, &mut state); } if cur_srcloc.is_some() { - code_section.end_srcloc(); + buffer.end_srcloc(); cur_srcloc = None; } + + // Do we need an island? Get the worst-case size of the next BB and see if, having + // emitted that many bytes, we will be beyond the deadline. + if block < (self.num_blocks() - 1) as BlockIndex { + let next_block = block + 1; + let next_block_range = self.block_ranges[next_block as usize]; + let next_block_size = next_block_range.1 - next_block_range.0; + let worst_case_next_bb = I::worst_case_size() * next_block_size; + if buffer.island_needed(worst_case_next_bb) { + buffer.emit_island(); + } + } } - sections + buffer } /// Get the IR block for a BlockIndex, if one exists. pub fn bindex_to_bb(&self, block: BlockIndex) -> Option { - if (block as usize) < self.bb_by_block.len() { - Some(self.bb_by_block[block as usize]) - } else { - None - } + self.block_order.lowered_order()[block as usize].orig_block() } } @@ -712,7 +510,6 @@ impl fmt::Debug for VCode { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { writeln!(f, "VCode_Debug {{")?; writeln!(f, " Entry block: {}", self.entry)?; - writeln!(f, " Final block order: {:?}", self.final_block_order)?; for block in 0..self.num_blocks() { writeln!(f, "Block {}:", block,)?; @@ -736,52 +533,21 @@ impl ShowWithRRU for VCode { fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String { use std::fmt::Write; - // Calculate an order in which to display the blocks. This is the same - // as final_block_order, but also includes blocks which are in the - // representation but not in final_block_order. - let mut display_order = Vec::::new(); - // First display blocks in `final_block_order` - for bix in &self.final_block_order { - assert!((*bix as usize) < self.num_blocks()); - display_order.push(*bix as usize); - } - // Now also take care of those not listed in `final_block_order`. - // This is quadratic, but it's also debug-only code. 
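The island check in the new `emit` loop above is a deadline comparison: take a conservative bound on the next block's size (worst-case bytes per instruction times instruction count) and emit an island now if that many more bytes could strand a pending short-range fixup. A self-contained sketch of the idea; the `island_deadline` field and the constants are illustrative, not the real `MachBuffer` internals:

    struct Buffer {
        cur_offset: u32,
        // Offset by which every pending label fixup must be resolved or veneered.
        island_deadline: u32,
    }

    impl Buffer {
        fn island_needed(&self, distance: u32) -> bool {
            self.cur_offset + distance > self.island_deadline
        }
    }

    fn main() {
        const WORST_CASE_INST_SIZE: u32 = 44; // illustrative bound, not the real value
        let buf = Buffer { cur_offset: 1_000_000, island_deadline: 1_048_000 };
        let next_block_insts = 2_000;
        let worst_case = WORST_CASE_INST_SIZE * next_block_insts;
        assert!(buf.island_needed(worst_case)); // 1_000_000 + 88_000 > 1_048_000
    }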
- for bix in 0..self.num_blocks() { - if display_order.contains(&bix) { - continue; - } - display_order.push(bix); - } - let mut s = String::new(); write!(&mut s, "VCode_ShowWithRRU {{{{\n").unwrap(); write!(&mut s, " Entry block: {}\n", self.entry).unwrap(); - write!( - &mut s, - " Final block order: {:?}\n", - self.final_block_order - ) - .unwrap(); for i in 0..self.num_blocks() { - let block = display_order[i]; + let block = i as BlockIndex; - let omitted = if !self.final_block_order.is_empty() && i >= self.final_block_order.len() - { - "** OMITTED **" - } else { - "" - }; - - write!(&mut s, "Block {}: {}\n", block, omitted).unwrap(); - if let Some(bb) = self.bindex_to_bb(block as BlockIndex) { + write!(&mut s, "Block {}:\n", block).unwrap(); + if let Some(bb) = self.bindex_to_bb(block) { write!(&mut s, " (original IR block: {})\n", bb).unwrap(); } - for succ in self.succs(block as BlockIndex) { + for succ in self.succs(block) { write!(&mut s, " (successor: Block {})\n", succ.get()).unwrap(); } - let (start, end) = self.block_ranges[block]; + let (start, end) = self.block_ranges[block as usize]; write!(&mut s, " (instruction range: {} .. {})\n", start, end).unwrap(); for inst in start..end { write!( diff --git a/cranelift/codegen/src/num_uses.rs b/cranelift/codegen/src/num_uses.rs deleted file mode 100644 index fd6eee8ec1..0000000000 --- a/cranelift/codegen/src/num_uses.rs +++ /dev/null @@ -1,52 +0,0 @@ -//! A pass that computes the number of uses of any given instruction. - -use crate::entity::SecondaryMap; -use crate::ir::dfg::ValueDef; -use crate::ir::Value; -use crate::ir::{DataFlowGraph, Function, Inst}; - -/// Auxiliary data structure that counts the number of uses of any given -/// instruction in a Function. This is used during instruction selection -/// to essentially do incremental DCE: when an instruction is no longer -/// needed because its computation has been isel'd into another machine -/// instruction at every use site, we can skip it. -#[derive(Clone, Debug)] -pub struct NumUses { - uses: SecondaryMap, -} - -impl NumUses { - fn new() -> NumUses { - NumUses { - uses: SecondaryMap::with_default(0), - } - } - - /// Compute the NumUses analysis result for a function. - pub fn compute(func: &Function) -> NumUses { - let mut uses = NumUses::new(); - for bb in func.layout.blocks() { - for inst in func.layout.block_insts(bb) { - for arg in func.dfg.inst_args(inst) { - let v = func.dfg.resolve_aliases(*arg); - uses.add_value(&func.dfg, v); - } - } - } - uses - } - - fn add_value(&mut self, dfg: &DataFlowGraph, v: Value) { - match dfg.value_def(v) { - ValueDef::Result(inst, _) => { - self.uses[inst] += 1; - } - _ => {} - } - } - - /// Take the complete uses map, consuming this analysis result. 
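For context on the pass being deleted above: `NumUses` walked every instruction's arguments and bumped a counter on the defining instruction, so lowering could skip instructions whose only uses had already been merged into other machine instructions. A simplified, self-contained sketch of that counting, using plain maps in place of the real `DataFlowGraph` and `SecondaryMap` types:

    use std::collections::HashMap;

    // `def_of` maps a value to the instruction that defines it, if any.
    fn count_uses(args_per_inst: &[(u32, Vec<u32>)], def_of: &HashMap<u32, u32>) -> HashMap<u32, u32> {
        let mut uses: HashMap<u32, u32> = HashMap::new();
        for (_inst, args) in args_per_inst {
            for arg in args {
                if let Some(def_inst) = def_of.get(arg) {
                    *uses.entry(*def_inst).or_insert(0) += 1;
                }
            }
        }
        uses
    }

    fn main() {
        // inst 0 defines v0; inst 1 defines v1 = f(v0); inst 2 uses v0 and v1.
        let def_of: HashMap<u32, u32> = HashMap::from([(0, 0), (1, 1)]);
        let insts = vec![(0, vec![]), (1, vec![0]), (2, vec![0, 1])];
        let uses = count_uses(&insts, &def_of);
        assert_eq!(uses[&0], 2); // v0 is used by inst 1 and inst 2
        assert_eq!(uses[&1], 1);
    }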
- pub fn take_uses(self) -> SecondaryMap { - self.uses - } -} diff --git a/cranelift/filetests/filetests/vcode/aarch64/arithmetic.clif b/cranelift/filetests/filetests/vcode/aarch64/arithmetic.clif index 08ecb31d35..9a95c52c64 100644 --- a/cranelift/filetests/filetests/vcode/aarch64/arithmetic.clif +++ b/cranelift/filetests/filetests/vcode/aarch64/arithmetic.clif @@ -1,7 +1,7 @@ test vcode target aarch64 -function %f(i64, i64) -> i64 { +function %f1(i64, i64) -> i64 { block0(v0: i64, v1: i64): v2 = iadd.i64 v0, v1 return v2 @@ -15,7 +15,7 @@ block0(v0: i64, v1: i64): ; nextln: ret -function %f(i64, i64) -> i64 { +function %f2(i64, i64) -> i64 { block0(v0: i64, v1: i64): v2 = isub.i64 v0, v1 return v2 @@ -28,7 +28,7 @@ block0(v0: i64, v1: i64): ; nextln: ldp fp, lr, [sp], #16 ; nextln: ret -function %f(i64, i64) -> i64 { +function %f3(i64, i64) -> i64 { block0(v0: i64, v1: i64): v2 = imul.i64 v0, v1 return v2 @@ -41,7 +41,7 @@ block0(v0: i64, v1: i64): ; nextln: ldp fp, lr, [sp], #16 ; nextln: ret -function %f(i64, i64) -> i64 { +function %f4(i64, i64) -> i64 { block0(v0: i64, v1: i64): v2 = umulhi.i64 v0, v1 return v2 @@ -54,7 +54,7 @@ block0(v0: i64, v1: i64): ; nextln: ldp fp, lr, [sp], #16 ; nextln: ret -function %f(i64, i64) -> i64 { +function %f5(i64, i64) -> i64 { block0(v0: i64, v1: i64): v2 = smulhi.i64 v0, v1 return v2 @@ -67,7 +67,7 @@ block0(v0: i64, v1: i64): ; nextln: ldp fp, lr, [sp], #16 ; nextln: ret -function %f(i64, i64) -> i64 { +function %f6(i64, i64) -> i64 { block0(v0: i64, v1: i64): v2 = sdiv.i64 v0, v1 return v2 @@ -87,7 +87,7 @@ block0(v0: i64, v1: i64): ; nextln: ldp fp, lr, [sp], #16 ; nextln: ret -function %f(i64) -> i64 { +function %f7(i64) -> i64 { block0(v0: i64): v1 = iconst.i64 2 v2 = sdiv.i64 v0, v1 @@ -109,7 +109,7 @@ block0(v0: i64): ; nextln: ldp fp, lr, [sp], #16 ; nextln: ret -function %f(i64, i64) -> i64 { +function %f8(i64, i64) -> i64 { block0(v0: i64, v1: i64): v2 = udiv.i64 v0, v1 return v2 @@ -124,7 +124,7 @@ block0(v0: i64, v1: i64): ; nextln: ldp fp, lr, [sp], #16 ; nextln: ret -function %f(i64) -> i64 { +function %f9(i64) -> i64 { block0(v0: i64): v1 = iconst.i64 2 v2 = udiv.i64 v0, v1 @@ -141,7 +141,7 @@ block0(v0: i64): ; nextln: ldp fp, lr, [sp], #16 ; nextln: ret -function %f(i64, i64) -> i64 { +function %f10(i64, i64) -> i64 { block0(v0: i64, v1: i64): v2 = srem.i64 v0, v1 return v2 @@ -157,7 +157,7 @@ block0(v0: i64, v1: i64): ; nextln: ldp fp, lr, [sp], #16 ; nextln: ret -function %f(i64, i64) -> i64 { +function %f11(i64, i64) -> i64 { block0(v0: i64, v1: i64): v2 = urem.i64 v0, v1 return v2 @@ -174,7 +174,7 @@ block0(v0: i64, v1: i64): ; nextln: ret -function %f(i32, i32) -> i32 { +function %f12(i32, i32) -> i32 { block0(v0: i32, v1: i32): v2 = sdiv.i32 v0, v1 return v2 @@ -195,48 +195,48 @@ block0(v0: i32, v1: i32): ; nextln: ldp fp, lr, [sp], #16 ; nextln: ret -function %f(i32) -> i32 { +function %f13(i32) -> i32 { block0(v0: i32): v1 = iconst.i32 2 v2 = sdiv.i32 v0, v1 return v2 } -; check: stp fp, lr, [sp, #-16]! -; nextln: mov fp, sp -; nextln: mov x1, x0 -; nextln: movz x0, #2 -; nextln: sxtw x1, w1 -; nextln: sxtw x2, w0 -; nextln: sdiv x0, x1, x2 -; nextln: cbz x2, 20 -; nextln: adds wzr, w2, #1 -; nextln: ccmp w1, #1, #nzcv, eq -; nextln: b.vc 12 -; nextln: udf -; nextln: udf -; nextln: mov sp, fp -; nextln: ldp fp, lr, [sp], #16 -; nextln: ret +; check: stp fp, lr, [sp, #-16]! 
+; nextln: mov fp, sp +; nextln: sxtw x1, w0 +; nextln: movz x0, #2 +; nextln: sxtw x2, w0 +; nextln: sdiv x0, x1, x2 +; nextln: cbz x2, 20 +; nextln: adds wzr, w2, #1 +; nextln: ccmp w1, #1, #nzcv, eq +; nextln: b.vc 12 +; nextln: udf +; nextln: udf +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret -function %f(i32, i32) -> i32 { +function %f14(i32, i32) -> i32 { block0(v0: i32, v1: i32): v2 = udiv.i32 v0, v1 return v2 } -; check: stp fp, lr, [sp, #-16]! -; nextln: mov fp, sp -; nextln: mov w0, w0 -; nextln: mov w1, w1 -; nextln: udiv x0, x0, x1 -; nextln: cbnz x1, 8 -; nextln: udf -; nextln: mov sp, fp -; nextln: ldp fp, lr, [sp], #16 -; nextln: ret +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: mov w0, w0 +; nextln: mov w1, w1 +; nextln: udiv x0, x0, x1 +; nextln: cbnz x1, 8 +; nextln: udf +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret -function %f(i32) -> i32 { + +function %f15(i32) -> i32 { block0(v0: i32): v1 = iconst.i32 2 v2 = udiv.i32 v0, v1 @@ -245,9 +245,8 @@ block0(v0: i32): ; check: stp fp, lr, [sp, #-16]! ; nextln: mov fp, sp -; nextln: movz x1, #2 ; nextln: mov w0, w0 -; nextln: mov w1, w1 +; nextln: movz x1, #2 ; nextln: udiv x0, x0, x1 ; nextln: cbnz x1, 8 ; nextln: udf @@ -255,7 +254,7 @@ block0(v0: i32): ; nextln: ldp fp, lr, [sp], #16 ; nextln: ret -function %f(i32, i32) -> i32 { +function %f16(i32, i32) -> i32 { block0(v0: i32, v1: i32): v2 = srem.i32 v0, v1 return v2 @@ -273,7 +272,7 @@ block0(v0: i32, v1: i32): ; nextln: ldp fp, lr, [sp], #16 ; nextln: ret -function %f(i32, i32) -> i32 { +function %f17(i32, i32) -> i32 { block0(v0: i32, v1: i32): v2 = urem.i32 v0, v1 return v2 @@ -291,7 +290,7 @@ block0(v0: i32, v1: i32): ; nextln: ldp fp, lr, [sp], #16 ; nextln: ret -function %f(i64, i64) -> i64 { +function %f18(i64, i64) -> i64 { block0(v0: i64, v1: i64): v2 = band.i64 v0, v1 return v2 @@ -304,7 +303,7 @@ block0(v0: i64, v1: i64): ; nextln: ldp fp, lr, [sp], #16 ; nextln: ret -function %f(i64, i64) -> i64 { +function %f19(i64, i64) -> i64 { block0(v0: i64, v1: i64): v2 = bor.i64 v0, v1 return v2 @@ -317,7 +316,7 @@ block0(v0: i64, v1: i64): ; nextln: ldp fp, lr, [sp], #16 ; nextln: ret -function %f(i64, i64) -> i64 { +function %f20(i64, i64) -> i64 { block0(v0: i64, v1: i64): v2 = bxor.i64 v0, v1 return v2 @@ -330,7 +329,7 @@ block0(v0: i64, v1: i64): ; nextln: ldp fp, lr, [sp], #16 ; nextln: ret -function %f(i64, i64) -> i64 { +function %f21(i64, i64) -> i64 { block0(v0: i64, v1: i64): v2 = band_not.i64 v0, v1 return v2 @@ -343,7 +342,7 @@ block0(v0: i64, v1: i64): ; nextln: ldp fp, lr, [sp], #16 ; nextln: ret -function %f(i64, i64) -> i64 { +function %f22(i64, i64) -> i64 { block0(v0: i64, v1: i64): v2 = bor_not.i64 v0, v1 return v2 @@ -356,7 +355,7 @@ block0(v0: i64, v1: i64): ; nextln: ldp fp, lr, [sp], #16 ; nextln: ret -function %f(i64, i64) -> i64 { +function %f23(i64, i64) -> i64 { block0(v0: i64, v1: i64): v2 = bxor_not.i64 v0, v1 return v2 @@ -369,7 +368,7 @@ block0(v0: i64, v1: i64): ; nextln: ldp fp, lr, [sp], #16 ; nextln: ret -function %f(i64, i64) -> i64 { +function %f24(i64, i64) -> i64 { block0(v0: i64, v1: i64): v2 = bnot.i64 v0 return v2 diff --git a/cranelift/filetests/filetests/vcode/aarch64/condbr.clif b/cranelift/filetests/filetests/vcode/aarch64/condbr.clif index 596557d8e0..3f0c0766d7 100644 --- a/cranelift/filetests/filetests/vcode/aarch64/condbr.clif +++ b/cranelift/filetests/filetests/vcode/aarch64/condbr.clif @@ -30,17 +30,18 @@ block2: return v5 } +; check: Block 0: ; 
check: stp fp, lr, [sp, #-16]! ; nextln: mov fp, sp ; nextln: subs xzr, x0, x1 -; nextln: b.eq 20 -; check: Block 2: -; check: movz x0, #2 +; nextln: b.eq label1 ; b label2 +; check: Block 1: +; check: movz x0, #1 ; nextln: mov sp, fp ; nextln: ldp fp, lr, [sp], #16 ; nextln: ret -; check: Block 1: -; check: movz x0, #1 +; check: Block 2: +; check: movz x0, #2 ; nextln: mov sp, fp ; nextln: ldp fp, lr, [sp], #16 ; nextln: ret diff --git a/cranelift/filetests/filetests/vcode/aarch64/jumptable.clif b/cranelift/filetests/filetests/vcode/aarch64/jumptable.clif index 0789173acb..f7c94c50b6 100644 --- a/cranelift/filetests/filetests/vcode/aarch64/jumptable.clif +++ b/cranelift/filetests/filetests/vcode/aarch64/jumptable.clif @@ -30,15 +30,15 @@ block5(v5: i64): ; check: subs wzr, w0, #3 ; nextln: b.hs -; nextln: adr x2, pc+16 ; ldrsw x1, [x2, x0, LSL 2] ; add x2, x2, x1 ; br x2 ; jt_entries +; nextln: adr x1, pc+16 ; ldrsw x2, [x1, x0, LSL 2] ; add x1, x1, x2 ; br x1 ; jt_entries -; check: movz x1, #3 +; check: movz x1, #1 ; nextln: b ; check: movz x1, #2 ; nextln: b -; check: movz x1, #1 +; check: movz x1, #3 ; check: add x0, x0, x1 diff --git a/cranelift/filetests/filetests/vcode/aarch64/saturating-ops.clif b/cranelift/filetests/filetests/vcode/aarch64/saturating-ops.clif index 60b45cc07a..dcb76e0f26 100644 --- a/cranelift/filetests/filetests/vcode/aarch64/saturating-ops.clif +++ b/cranelift/filetests/filetests/vcode/aarch64/saturating-ops.clif @@ -25,10 +25,10 @@ block0(v0: i8, v1: i8): ; check: stp fp, lr, [sp, #-16]! ; nextln: mov fp, sp -; nextln: uxtb x0, w0 -; nextln: uxtb x1, w1 -; nextln: mov v0.d[0], x0 -; nextln: mov v1.d[0], x1 +; nextln: uxtb x2, w0 +; nextln: uxtb x0, w1 +; nextln: mov v0.d[0], x2 +; nextln: mov v1.d[0], x0 ; nextln: uqadd d0, d0, d1 ; nextln: mov x0, v0.d[0] ; nextln: mov sp, fp diff --git a/cranelift/filetests/filetests/vcode/aarch64/shift-rotate.clif b/cranelift/filetests/filetests/vcode/aarch64/shift-rotate.clif index 86bdb2ea34..26a6922b39 100644 --- a/cranelift/filetests/filetests/vcode/aarch64/shift-rotate.clif +++ b/cranelift/filetests/filetests/vcode/aarch64/shift-rotate.clif @@ -366,15 +366,15 @@ block0(v0: i16): return v2 } -; check: stp fp, lr, [sp, #-16]! -; nextln: mov fp, sp -; nextln: uxth w0, w0 -; nextln: lsr w1, w0, #6 -; nextln: lsl w0, w0, #10 -; nextln: orr w0, w0, w1 -; nextln: mov sp, fp -; nextln: ldp fp, lr, [sp], #16 -; nextln: ret +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: uxth w1, w0 +; nextln: lsr w0, w1, #6 +; nextln: lsl w1, w1, #10 +; nextln: orr w0, w1, w0 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret function %f24(i8) -> i8 { block0(v0: i8): @@ -385,10 +385,10 @@ block0(v0: i8): ; check: stp fp, lr, [sp, #-16]! 
; nextln: mov fp, sp -; nextln: uxtb w0, w0 -; nextln: lsr w1, w0, #5 -; nextln: lsl w0, w0, #3 -; nextln: orr w0, w0, w1 +; nextln: uxtb w1, w0 +; nextln: lsr w0, w1, #5 +; nextln: lsl w1, w1, #3 +; nextln: orr w0, w1, w0 ; nextln: mov sp, fp ; nextln: ldp fp, lr, [sp], #16 ; nextln: ret diff --git a/crates/environ/src/cranelift.rs b/crates/environ/src/cranelift.rs index 409a6cdad6..e8e05c2438 100644 --- a/crates/environ/src/cranelift.rs +++ b/crates/environ/src/cranelift.rs @@ -93,7 +93,7 @@ use crate::compilation::{ use crate::func_environ::{get_func_name, FuncEnvironment}; use crate::{CacheConfig, FunctionBodyData, ModuleLocal, ModuleTranslation, Tunables}; use cranelift_codegen::ir::{self, ExternalName}; -use cranelift_codegen::machinst::sections::MachSrcLoc; +use cranelift_codegen::machinst::buffer::MachSrcLoc; use cranelift_codegen::print_errors::pretty_error; use cranelift_codegen::{binemit, isa, Context}; use cranelift_entity::PrimaryMap; @@ -215,7 +215,7 @@ fn get_function_address_map<'data>( if let Some(ref mcr) = &context.mach_compile_result { // New-style backend: we have a `MachCompileResult` that will give us `MachSrcLoc` mapping // tuples. - for &MachSrcLoc { start, end, loc } in mcr.sections.get_srclocs_sorted() { + for &MachSrcLoc { start, end, loc } in mcr.buffer.get_srclocs_sorted() { instructions.push(InstructionAddressMap { srcloc: loc, code_offset: start as usize,
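The `get_srclocs_sorted` consumer above relies on each `MachSrcLoc` record mapping a `[start, end)` byte range of machine code back to the source location that produced it, so building an address map is a single in-order pass. A sketch of that consumption, with stand-in types (`SrcLoc`, `AddressMapEntry`, and this local `MachSrcLoc` are illustrative, not the crate's):

    #[derive(Clone, Copy, Debug, PartialEq)]
    struct SrcLoc(u32);

    struct MachSrcLoc { start: u32, end: u32, loc: SrcLoc }

    struct AddressMapEntry { code_offset: usize, code_len: usize, loc: SrcLoc }

    fn address_map(srclocs: &[MachSrcLoc]) -> Vec<AddressMapEntry> {
        srclocs
            .iter()
            .map(|s| AddressMapEntry {
                code_offset: s.start as usize,
                code_len: (s.end - s.start) as usize,
                loc: s.loc,
            })
            .collect()
    }

    fn main() {
        let map = address_map(&[
            MachSrcLoc { start: 0, end: 8, loc: SrcLoc(1) },
            MachSrcLoc { start: 8, end: 20, loc: SrcLoc(2) },
        ]);
        assert_eq!(map.len(), 2);
        assert_eq!(map[1].code_offset, 8);
        assert_eq!(map[1].code_len, 12);
    }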