diff --git a/cranelift/codegen/src/isa/aarch64/lower.rs b/cranelift/codegen/src/isa/aarch64/lower.rs
index dedba775ae..4a0037549a 100644
--- a/cranelift/codegen/src/isa/aarch64/lower.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower.rs
@@ -2,9 +2,8 @@
 //!
 //! TODO: opportunities for better code generation:
 //!
-//! - Smarter use of addressing modes. Recognize a+SCALE*b patterns; recognize
-//!   and incorporate sign/zero extension on indices. Recognize pre/post-index
-//!   opportunities.
+//! - Smarter use of addressing modes. Recognize a+SCALE*b patterns. Recognize
+//!   pre/post-index opportunities.
 //!
 //! - Floating-point immediates (FIMM instruction).
 
@@ -21,8 +20,9 @@ use crate::isa::aarch64::AArch64Backend;
 
 use super::lower_inst;
 
-use log::debug;
+use log::{debug, trace};
 use regalloc::{Reg, RegClass, Writable};
+use smallvec::SmallVec;
 
 //============================================================================
 // Result enum types.
@@ -544,105 +544,251 @@ pub(crate) fn alu_inst_immshift<C: LowerCtx<I = Inst>>(
 // Lowering: addressing mode support. Takes instruction directly, rather
 // than an `InsnInput`, to do more introspection.
 
+/// 32-bit addends that make up an address: an input, and an extension mode on
+/// that input.
+type AddressAddend32List = SmallVec<[(Reg, ExtendOp); 4]>;
+/// 64-bit addends that make up an address: just an input.
+type AddressAddend64List = SmallVec<[Reg; 4]>;
+
+/// Collect all addends that feed into an address computation, with extend-modes
+/// on each. Note that a load/store may have multiple address components (and
+/// the CLIF semantics are that these components are added to form the final
+/// address), but sometimes the CLIF that we receive still has arguments that
+/// refer to `iadd` instructions. We also want to handle uextend/sextend below
+/// the add(s).
+///
+/// We match any 64-bit add (and descend into its inputs), and we match any
+/// 32-to-64-bit sign or zero extension. The returned addend lists indicate how
+/// to extend each input:
+///
+/// - An entry in the 64-bit list: the associated input is 64 bits wide; no
+///   extension is needed.
+/// - An entry in the 32-bit list with ExtendOp::SXTW: the associated input is
+///   32 bits wide; sign-extend it.
+/// - An entry in the 32-bit list with ExtendOp::UXTW: the associated input is
+///   32 bits wide; zero-extend it.
+///
+/// We do not descend further into the inputs of extensions, because supporting
+/// (e.g.) a 32-bit add that is later extended would require additional masking
+/// of high-order bits, which is too complex. So, in essence, we descend through
+/// any number of adds from the roots, collecting all 64-bit address addends,
+/// and support extensions only at those leaves.
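+///
+/// For example, for an address computed as `(x + 32) + sextend(y)`, where `x`
+/// is a 64-bit value and `y` is a 32-bit value, we return one 64-bit addend
+/// (`x`), one 32-bit addend (`y` with ExtendOp::SXTW), and a constant offset
+/// of 32.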
+fn collect_address_addends<C: LowerCtx<I = Inst>>(
+    ctx: &mut C,
+    roots: &[InsnInput],
+) -> (AddressAddend64List, AddressAddend32List, i64) {
+    let mut result32: AddressAddend32List = SmallVec::new();
+    let mut result64: AddressAddend64List = SmallVec::new();
+    let mut offset: i64 = 0;
+
+    let mut workqueue: SmallVec<[InsnInput; 4]> = roots.iter().cloned().collect();
+
+    while let Some(input) = workqueue.pop() {
+        debug_assert!(ty_bits(ctx.input_ty(input.insn, input.input)) == 64);
+        if let Some((op, insn)) = maybe_input_insn_multi(
+            ctx,
+            input,
+            &[
+                Opcode::Uextend,
+                Opcode::Sextend,
+                Opcode::Iadd,
+                Opcode::Iconst,
+            ],
+        ) {
+            match op {
+                Opcode::Uextend | Opcode::Sextend if ty_bits(ctx.input_ty(insn, 0)) == 32 => {
+                    let extendop = if op == Opcode::Uextend {
+                        ExtendOp::UXTW
+                    } else {
+                        ExtendOp::SXTW
+                    };
+                    let extendee_input = InsnInput { insn, input: 0 };
+                    let reg = put_input_in_reg(ctx, extendee_input, NarrowValueMode::None);
+                    result32.push((reg, extendop));
+                }
+                Opcode::Uextend | Opcode::Sextend => {
+                    let reg = put_input_in_reg(ctx, input, NarrowValueMode::None);
+                    result64.push(reg);
+                }
+                Opcode::Iadd => {
+                    for input in 0..ctx.num_inputs(insn) {
+                        let addend = InsnInput { insn, input };
+                        workqueue.push(addend);
+                    }
+                }
+                Opcode::Iconst => {
+                    let value: i64 = ctx.get_constant(insn).unwrap() as i64;
+                    offset += value;
+                }
+                _ => panic!("Unexpected opcode from maybe_input_insn_multi"),
+            }
+        } else {
+            let reg = put_input_in_reg(ctx, input, NarrowValueMode::ZeroExtend64);
+            result64.push(reg);
+        }
+    }
+
+    (result64, result32, offset)
+}
+
 /// Lower the address of a load or store.
 pub(crate) fn lower_address<C: LowerCtx<I = Inst>>(
     ctx: &mut C,
     elem_ty: Type,
-    addends: &[InsnInput],
+    roots: &[InsnInput],
     offset: i32,
 ) -> MemArg {
     // TODO: support base_reg + scale * index_reg. For this, we would need to pattern-match shl or
     // mul instructions (Load/StoreComplex don't include scale factors).
 
-    // Handle one reg and offset.
-    if addends.len() == 1 {
-        let reg = put_input_in_reg(ctx, addends[0], NarrowValueMode::ZeroExtend64);
-        return MemArg::RegOffset(reg, offset as i64, elem_ty);
-    }
+    // Collect addends through an arbitrary tree of 32-to-64-bit sign/zero
+    // extends and addition ops. We update these as we consume address
+    // components, so they represent the remaining addends not yet handled.
+    let (mut addends64, mut addends32, args_offset) = collect_address_addends(ctx, roots);
+    let mut offset = args_offset + (offset as i64);
 
-    // Handle two regs and a zero offset with built-in extend, if possible.
-    if addends.len() == 2 && offset == 0 {
-        // r1, r2 (to be extended), r2_bits, is_signed
-        let mut parts: Option<(Reg, Reg, usize, bool)> = None;
-        // Handle extension of either first or second addend.
-        for i in 0..2 {
-            if let Some((op, ext_insn)) =
-                maybe_input_insn_multi(ctx, addends[i], &[Opcode::Uextend, Opcode::Sextend])
-            {
-                // Non-extended addend.
-                let r1 = put_input_in_reg(ctx, addends[1 - i], NarrowValueMode::ZeroExtend64);
-                // Extended addend.
-                let r2 = put_input_in_reg(
-                    ctx,
-                    InsnInput {
-                        insn: ext_insn,
-                        input: 0,
-                    },
-                    NarrowValueMode::None,
-                );
-                let r2_bits = ty_bits(ctx.input_ty(ext_insn, 0));
-                parts = Some((
-                    r1,
-                    r2,
-                    r2_bits,
-                    /* is_signed = */ op == Opcode::Sextend,
-                ));
-                break;
-            }
+    trace!(
+        "lower_address: addends64 {:?}, addends32 {:?}, offset {}",
+        addends64,
+        addends32,
+        offset
+    );
+
+    // First, decide what the `MemArg` will be: a 64-bit reg extended by a
+    // 32-bit reg where possible, or a reg plus a small offset, or two 64-bit
+    // regs, or some other combination as appropriate.
+    let memarg = if addends64.len() > 0 {
+        if addends32.len() > 0 {
+            let (reg32, extendop) = addends32.pop().unwrap();
+            let reg64 = addends64.pop().unwrap();
+            MemArg::RegExtended(reg64, reg32, extendop)
+        } else if offset > 0 && offset < 0x1000 {
+            let reg64 = addends64.pop().unwrap();
+            let off = offset;
+            offset = 0;
+            MemArg::RegOffset(reg64, off, elem_ty)
+        } else if addends64.len() >= 2 {
+            let reg1 = addends64.pop().unwrap();
+            let reg2 = addends64.pop().unwrap();
+            MemArg::RegReg(reg1, reg2)
+        } else {
+            let reg1 = addends64.pop().unwrap();
+            MemArg::reg(reg1)
         }
-
-        if let Some((r1, r2, r2_bits, is_signed)) = parts {
-            match (r2_bits, is_signed) {
-                (32, false) => {
-                    return MemArg::RegExtended(r1, r2, ExtendOp::UXTW);
-                }
-                (32, true) => {
-                    return MemArg::RegExtended(r1, r2, ExtendOp::SXTW);
-                }
-                _ => {}
+    } else
+    /* addends64.len() == 0 */
+    {
+        if addends32.len() > 0 {
+            let tmp = ctx.alloc_tmp(RegClass::I64, I64);
+            let (reg1, extendop) = addends32.pop().unwrap();
+            let signed = match extendop {
+                ExtendOp::SXTW => true,
+                ExtendOp::UXTW => false,
+                _ => unreachable!(),
+            };
+            ctx.emit(Inst::Extend {
+                rd: tmp,
+                rn: reg1,
+                signed,
+                from_bits: 32,
+                to_bits: 64,
+            });
+            if let Some((reg2, extendop)) = addends32.pop() {
+                MemArg::RegExtended(tmp.to_reg(), reg2, extendop)
+            } else {
+                MemArg::reg(tmp.to_reg())
             }
+        } else
+        /* addends32.len() == 0 */
+        {
+            let off_reg = ctx.alloc_tmp(RegClass::I64, I64);
+            lower_constant_u64(ctx, off_reg, offset as u64);
+            offset = 0;
+            MemArg::reg(off_reg.to_reg())
         }
+    };
+
+    // At this point, if we have any remaining components, we need to allocate a
+    // temp, replace one of the registers in the MemArg with the temp, and emit
+    // instructions to add together the remaining components. Return immediately
+    // if this is *not* the case.
+    if offset == 0 && addends32.len() == 0 && addends64.len() == 0 {
+        return memarg;
     }
 
-    // Handle two regs and a zero offset in the general case, if possible.
-    if addends.len() == 2 && offset == 0 {
-        let ra = put_input_in_reg(ctx, addends[0], NarrowValueMode::ZeroExtend64);
-        let rb = put_input_in_reg(ctx, addends[1], NarrowValueMode::ZeroExtend64);
-        return MemArg::reg_plus_reg(ra, rb);
-    }
-
-    // Otherwise, generate add instructions.
+    // Allocate the temp and shoehorn it into the MemArg.
     let addr = ctx.alloc_tmp(RegClass::I64, I64);
+    let (reg, memarg) = match memarg {
+        MemArg::RegExtended(r1, r2, extendop) => {
+            (r1, MemArg::RegExtended(addr.to_reg(), r2, extendop))
+        }
+        MemArg::RegOffset(r, off, ty) => (r, MemArg::RegOffset(addr.to_reg(), off, ty)),
+        MemArg::RegReg(r1, r2) => (r2, MemArg::RegReg(addr.to_reg(), r1)),
+        MemArg::UnsignedOffset(r, imm) => (r, MemArg::UnsignedOffset(addr.to_reg(), imm)),
+        _ => unreachable!(),
+    };
 
-    // Get the const into a reg.
-    lower_constant_u64(ctx, addr.clone(), offset as u64);
+    // If there is any offset, combine it with the `reg` that we kicked out of
+    // the `MemArg`, leaving the sum in `addr`; otherwise, just move that reg
+    // into `addr`.
+    if offset != 0 {
+        // If we can fit offset or -offset in an imm12, combine the reg and the
+        // offset with a single add- or sub-immediate. Otherwise, load the
+        // constant first, then add.
+        if let Some(imm12) = Imm12::maybe_from_u64(offset as u64) {
+            ctx.emit(Inst::AluRRImm12 {
+                alu_op: ALUOp::Add64,
+                rd: addr,
+                rn: reg,
+                imm12,
+            });
+        } else if let Some(imm12) = Imm12::maybe_from_u64(offset.wrapping_neg() as u64) {
+            ctx.emit(Inst::AluRRImm12 {
+                alu_op: ALUOp::Sub64,
+                rd: addr,
+                rn: reg,
+                imm12,
+            });
+        } else {
+            lower_constant_u64(ctx, addr, offset as u64);
+            ctx.emit(Inst::AluRRR {
+                alu_op: ALUOp::Add64,
+                rd: addr,
+                rn: addr.to_reg(),
+                rm: reg,
+            });
+        }
+    } else {
+        ctx.emit(Inst::gen_move(addr, reg, I64));
+    }
 
-    // Add each addend to the address.
-    for addend in addends {
-        let reg = put_input_in_reg(ctx, *addend, NarrowValueMode::ZeroExtend64);
-
-        // In an addition, the stack register is the zero register, so divert it to another
-        // register just before doing the actual add.
+    // Now handle reg64 and reg32-extended components.
+    for reg in addends64 {
+        // If the register is the stack reg, we must move it to another reg
+        // before adding it.
         let reg = if reg == stack_reg() {
             let tmp = ctx.alloc_tmp(RegClass::I64, I64);
-            ctx.emit(Inst::Mov {
-                rd: tmp,
-                rm: stack_reg(),
-            });
+            ctx.emit(Inst::gen_move(tmp, stack_reg(), I64));
             tmp.to_reg()
         } else {
             reg
         };
-
         ctx.emit(Inst::AluRRR {
             alu_op: ALUOp::Add64,
-            rd: addr.clone(),
+            rd: addr,
             rn: addr.to_reg(),
-            rm: reg.clone(),
+            rm: reg,
+        });
+    }
+    for (reg, extendop) in addends32 {
+        assert!(reg != stack_reg());
+        ctx.emit(Inst::AluRRRExtend {
+            alu_op: ALUOp::Add64,
+            rd: addr,
+            rn: addr.to_reg(),
+            rm: reg,
+            extendop,
         });
     }
 
-    MemArg::reg(addr.to_reg())
+    memarg
 }
 
 pub(crate) fn lower_constant_u64<C: LowerCtx<I = Inst>>(
diff --git a/cranelift/filetests/filetests/vcode/aarch64/amodes.clif b/cranelift/filetests/filetests/vcode/aarch64/amodes.clif
index 96855d00b6..aaaffd0286 100644
--- a/cranelift/filetests/filetests/vcode/aarch64/amodes.clif
+++ b/cranelift/filetests/filetests/vcode/aarch64/amodes.clif
@@ -15,7 +15,7 @@ block0(v0: i64, v1: i32):
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
 
-function %f1(i64, i32) -> i32 {
+function %f2(i64, i32) -> i32 {
 block0(v0: i64, v1: i32):
     v2 = uextend.i64 v1
     v3 = load_complex.i32 v2+v0
@@ -29,7 +29,7 @@ block0(v0: i64, v1: i32):
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
 
-function %f1(i64, i32) -> i32 {
+function %f3(i64, i32) -> i32 {
 block0(v0: i64, v1: i32):
     v2 = sextend.i64 v1
     v3 = load_complex.i32 v0+v2
@@ -43,7 +43,7 @@ block0(v0: i64, v1: i32):
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
 
-function %f1(i64, i32) -> i32 {
+function %f4(i64, i32) -> i32 {
 block0(v0: i64, v1: i32):
     v2 = sextend.i64 v1
     v3 = load_complex.i32 v2+v0
@@ -56,3 +56,216 @@ block0(v0: i64, v1: i32):
 ; nextln: mov sp, fp
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
+
+function %f5(i64, i32) -> i32 {
+block0(v0: i64, v1: i32):
+    v2 = sextend.i64 v1
+    v3 = iadd.i64 v0, v2
+    v4 = load.i32 v3
+    return v4
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: ldr w0, [x0, w1, SXTW]
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %f6(i64, i32) -> i32 {
+block0(v0: i64, v1: i32):
+    v2 = sextend.i64 v1
+    v3 = iadd.i64 v2, v0
+    v4 = load.i32 v3
+    return v4
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: ldr w0, [x0, w1, SXTW]
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %f7(i32, i32) -> i32 {
+block0(v0: i32, v1: i32):
+    v2 = uextend.i64 v0
+    v3 = uextend.i64 v1
+    v4 = iadd.i64 v2, v3
+    v5 = load.i32 v4
+    return v5
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: mov w0, w0
+; nextln: ldr w0, [x0, w1, UXTW]
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %f8(i64, i32) -> i32 {
+block0(v0: i64, v1: i32):
+    v2 = sextend.i64 v1
+    v3 = iconst.i64 32
+    v4 = iadd.i64 v2, v3
+    v5 = iadd.i64 v4, v0
+    v6 = iadd.i64 v5, v5
+    v7 = load.i32 v6+4
+    return v7
+}
+
+; v6+4 = 2*v5 + 4 = 2*v4 + 2*v0 + 4 = 2*v2 + 2*v3 + 2*v0 + 4
+;      = 2*sextend($x1) + 2*$x0 + 68
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: add x2, x0, #68
+; nextln: add x0, x2, x0
+; nextln: add x0, x0, x1, SXTW
+; nextln: ldr w0, [x0, w1, SXTW]
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %f9(i64, i64, i64) -> i32 {
+block0(v0: i64, v1: i64, v2: i64):
+    v3 = iconst.i64 48
+    v4 = iadd.i64 v0, v1
+    v5 = iadd.i64 v4, v2
+    v6 = iadd.i64 v5, v3
+    v7 = load.i32 v6
+    return v7
+}
+
+; v6 = $x0 + $x1 + $x2 + 48
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: add x0, x0, x2
+; nextln: add x0, x0, x1
+; nextln: ldur w0, [x0, #48]
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %f10(i64, i64, i64) -> i32 {
+block0(v0: i64, v1: i64, v2: i64):
+    v3 = iconst.i64 4100
+    v4 = iadd.i64 v0, v1
+    v5 = iadd.i64 v4, v2
+    v6 = iadd.i64 v5, v3
+    v7 = load.i32 v6
+    return v7
+}
+
+; v6 = $x0 + $x1 + $x2 + 4100
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: movz x3, #4100
+; nextln: add x1, x3, x1
+; nextln: add x1, x1, x2
+; nextln: ldr w0, [x1, x0]
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %f10() -> i32 {
+block0:
+    v1 = iconst.i64 1234
+    v2 = load.i32 v1
+    return v2
+}
+
+; v2 = load from the constant address 1234
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: movz x0, #1234
+; nextln: ldr w0, [x0]
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %f11(i64) -> i32 {
+block0(v0: i64):
+    v1 = iconst.i64 8388608 ; Imm12: 0x800 << 12
+    v2 = iadd.i64 v0, v1
+    v3 = load.i32 v2
+    return v3
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: add x0, x0, #8388608
+; nextln: ldr w0, [x0]
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %f12(i64) -> i32 {
+block0(v0: i64):
+    v1 = iconst.i64 -4
+    v2 = iadd.i64 v0, v1
+    v3 = load.i32 v2
+    return v3
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: sub x0, x0, #4
+; nextln: ldr w0, [x0]
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %f13(i64) -> i32 {
+block0(v0: i64):
+    v1 = iconst.i64 1000000000
+    v2 = iadd.i64 v0, v1
+    v3 = load.i32 v2
+    return v3
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: movz x1, #51712
+; nextln: movk x1, #15258, LSL #16
+; nextln: add x0, x1, x0
+; nextln: ldr w0, [x0]
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %f14(i32) -> i32 {
+block0(v0: i32):
+    v1 = sextend.i64 v0
+    v2 = load.i32 v1
+    return v2
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: sxtw x0, w0
+; nextln: ldr w0, [x0]
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %f15(i32, i32) -> i32 {
+block0(v0: i32, v1: i32):
+    v2 = sextend.i64 v0
+    v3 = sextend.i64 v1
+    v4 = iadd.i64 v2, v3
+    v5 = load.i32 v4
+    return v5
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: sxtw x0, w0
+; nextln: ldr w0, [x0, w1, SXTW]
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
diff --git a/cranelift/filetests/filetests/vcode/aarch64/heap_addr.clif b/cranelift/filetests/filetests/vcode/aarch64/heap_addr.clif
index 6ceba929e9..e4ff1471be 100644
--- a/cranelift/filetests/filetests/vcode/aarch64/heap_addr.clif
+++ b/cranelift/filetests/filetests/vcode/aarch64/heap_addr.clif
@@ -15,7 +15,7 @@ block0(v0: i64, v1: i32):
 ; check: Block 0:
 ; check: stp fp, lr, [sp, #-16]!
 ; nextln: mov fp, sp
-; nextln: ldur w2, [x0]
+; nextln: ldr w2, [x0]
 ; nextln: add w2, w2, #0
 ; nextln: subs wzr, w1, w2
 ; nextln: b.ls label1 ; b label2
diff --git a/cranelift/filetests/filetests/vcode/aarch64/reftypes.clif b/cranelift/filetests/filetests/vcode/aarch64/reftypes.clif
index 81ace4f3ca..2458516cfc 100644
--- a/cranelift/filetests/filetests/vcode/aarch64/reftypes.clif
+++ b/cranelift/filetests/filetests/vcode/aarch64/reftypes.clif
@@ -92,7 +92,7 @@ block3(v7: r64, v8: r64):
 ; nextln: ldur x19, [sp, #32]
 ; nextln: ldur x20, [sp, #40]
 ; nextln: add x1, sp, #16
-; nextln: stur x19, [x1]
+; nextln: str x19, [x1]
 ; nextln: and w0, w0, #1
 ; nextln: cbz x0, label1 ; b label3
 ; check: Block 1:
@@ -108,7 +108,7 @@ block3(v7: r64, v8: r64):
 ; nextln: b label5
 ; check: Block 5:
 ; check: add x1, sp, #16
-; nextln: ldur x1, [x1]
+; nextln: ldr x1, [x1]
 ; nextln: mov x2, x1
 ; nextln: mov x1, x19
 ; nextln: ldp x19, x20, [sp], #16
diff --git a/cranelift/filetests/filetests/vcode/aarch64/stack.clif b/cranelift/filetests/filetests/vcode/aarch64/stack.clif
index 4ebdc672a0..3d2ae5cf0a 100644
--- a/cranelift/filetests/filetests/vcode/aarch64/stack.clif
+++ b/cranelift/filetests/filetests/vcode/aarch64/stack.clif
@@ -51,7 +51,7 @@ block0:
 ; nextln: mov fp, sp
 ; nextln: sub sp, sp, #16
 ; nextln: mov x0, sp
-; nextln: ldur x0, [x0]
+; nextln: ldr x0, [x0]
 ; nextln: mov sp, fp
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
@@ -71,7 +71,7 @@ block0:
 ; nextln: ldr x16, 8 ; b 12 ; data 100016
 ; nextln: sub sp, sp, x16, UXTX
 ; nextln: mov x0, sp
-; nextln: ldur x0, [x0]
+; nextln: ldr x0, [x0]
 ; nextln: mov sp, fp
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
@@ -89,7 +89,7 @@ block0(v0: i64):
 ; nextln: mov fp, sp
 ; nextln: sub sp, sp, #16
 ; nextln: mov x1, sp
-; nextln: stur x0, [x1]
+; nextln: str x0, [x1]
 ; nextln: mov sp, fp
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
@@ -109,7 +109,7 @@ block0(v0: i64):
 ; nextln: ldr x16, 8 ; b 12 ; data 100016
 ; nextln: sub sp, sp, x16, UXTX
 ; nextln: mov x1, sp
-; nextln: stur x0, [x1]
+; nextln: str x0, [x1]
 ; nextln: mov sp, fp
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
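The addend-collection step in the patch is easiest to see in isolation. The following standalone Rust sketch is not Cranelift code: the `Expr` type, the `collect_addends` function, and the register names are invented for illustration. It mirrors the worklist traversal that `collect_address_addends` performs: descend through adds, fold constants into a single offset, and stop at 32-to-64-bit extends so they can later be folded into a register-extended addressing mode (such as `[x0, w1, SXTW]`) or an extended add.

```rust
// Illustrative sketch only; the real implementation walks CLIF instructions
// through `LowerCtx` rather than a toy expression tree.

#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum ExtendOp {
    Uxtw, // zero-extend a 32-bit value to 64 bits
    Sxtw, // sign-extend a 32-bit value to 64 bits
}

#[derive(Clone, Debug)]
enum Expr {
    Reg64(&'static str),              // a 64-bit register-resident value
    Const(i64),                       // an integer constant
    Add(Box<Expr>, Box<Expr>),        // a 64-bit add
    Extend32(&'static str, ExtendOp), // a 32-bit value extended to 64 bits
}

/// Returns (64-bit addends, 32-bit addends with extend ops, constant offset).
fn collect_addends(root: &Expr) -> (Vec<&'static str>, Vec<(&'static str, ExtendOp)>, i64) {
    let mut addends64 = Vec::new();
    let mut addends32 = Vec::new();
    let mut offset = 0i64;

    // Worklist of sub-expressions still to be examined.
    let mut workqueue: Vec<&Expr> = vec![root];
    while let Some(e) = workqueue.pop() {
        match e {
            // Descend into both inputs of an add; all of their addends
            // contribute to the final address.
            Expr::Add(a, b) => {
                workqueue.push(&**a);
                workqueue.push(&**b);
            }
            // Fold constants into a single running offset.
            Expr::Const(c) => offset += *c,
            // Stop at a 32->64-bit extend: remember the extend op so it can be
            // folded into a register-extended amode or an extended add later.
            Expr::Extend32(name, op) => addends32.push((*name, *op)),
            // Any other leaf is simply a 64-bit addend.
            Expr::Reg64(name) => addends64.push(*name),
        }
    }
    (addends64, addends32, offset)
}

fn main() {
    // (x0 + uextend(w2)) + (sextend(w1) + 32)
    let e = Expr::Add(
        Box::new(Expr::Add(
            Box::new(Expr::Reg64("x0")),
            Box::new(Expr::Extend32("w2", ExtendOp::Uxtw)),
        )),
        Box::new(Expr::Add(
            Box::new(Expr::Extend32("w1", ExtendOp::Sxtw)),
            Box::new(Expr::Const(32)),
        )),
    );
    let (a64, a32, off) = collect_addends(&e);
    assert_eq!(off, 32);
    assert_eq!(a64, vec!["x0"]);
    // Both 32-bit inputs are recorded with their extend ops.
    assert_eq!(a32.len(), 2);
    println!("addends64 = {:?}, addends32 = {:?}, offset = {}", a64, a32, off);
}
```

In the real backend, the analogous remaining-addend lists then drive the `MemArg` selection and the leftover add instructions in `lower_address`, which is what the `amodes.clif` tests above exercise.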