diff --git a/cranelift/codegen/src/isa/aarch64/abi.rs b/cranelift/codegen/src/isa/aarch64/abi.rs
index 6a264f1604..f9e0de1498 100644
--- a/cranelift/codegen/src/isa/aarch64/abi.rs
+++ b/cranelift/codegen/src/isa/aarch64/abi.rs
@@ -183,6 +183,7 @@ impl ABIMachineSpec for AArch64MachineDeps {
         args_or_rets: ArgsOrRets,
         add_ret_area_ptr: bool,
     ) -> CodegenResult<(Vec<ABIArg>, i64, Option<usize>)> {
+        let is_apple_cc = call_conv == isa::CallConv::AppleAarch64;
         let is_baldrdash = call_conv.extends_baldrdash();
         let has_baldrdash_tls = call_conv == isa::CallConv::Baldrdash2020;
 
@@ -194,8 +195,6 @@ impl ABIMachineSpec for AArch64MachineDeps {
         // following ways:
         // - sign- and zero- extensions of data types less than 32 bits are not
         //   implemented yet.
-        // - i128 arguments passing isn't implemented yet in the standard (non
-        //   MacOS) aarch64 ABI.
         // - we align the arguments stack space to a 16-bytes boundary, while
         //   the MacOS allows aligning only on 8 bytes. In practice it means we're
         //   slightly overallocating when calling, which is fine, and doesn't
@@ -265,20 +264,16 @@ impl ABIMachineSpec for AArch64MachineDeps {
                 "Invalid type for AArch64: {:?}",
                 param.value_type
             );
 
-            let (rcs, _) = Inst::rc_for_type(param.value_type).unwrap();
-            assert!(rcs.len() == 1, "Multi-reg values not supported yet");
-            let rc = rcs[0];
-            let next_reg = match rc {
-                RegClass::I64 => &mut next_xreg,
-                RegClass::V128 => &mut next_vreg,
-                _ => panic!("Invalid register class: {:?}", rc),
-            };
+            let (rcs, _) = Inst::rc_for_type(param.value_type)?;
 
             if let Some(param) = try_fill_baldrdash_reg(call_conv, param) {
-                assert!(rc == RegClass::I64);
+                assert!(rcs[0] == RegClass::I64);
                 ret.push(param);
-            } else if let ir::ArgumentPurpose::StructArgument(size) = param.purpose {
+                continue;
+            }
+
+            if let ir::ArgumentPurpose::StructArgument(size) = param.purpose {
                 let offset = next_stack as i64;
                 let size = size as u64;
                 assert!(size % 8 == 0, "StructArgument size is not properly aligned");
@@ -288,50 +283,130 @@ impl ABIMachineSpec for AArch64MachineDeps {
                     size,
                     purpose: param.purpose,
                 });
-            } else if *next_reg < max_per_class_reg_vals && remaining_reg_vals > 0 {
-                let reg = match rc {
-                    RegClass::I64 => xreg(*next_reg),
-                    RegClass::V128 => vreg(*next_reg),
-                    _ => unreachable!(),
-                };
-                ret.push(ABIArg::reg(
-                    reg.to_real_reg(),
-                    param.value_type,
-                    param.extension,
-                    param.purpose,
-                ));
-                *next_reg += 1;
-                remaining_reg_vals -= 1;
-            } else {
-                // Compute the stack slot's size.
-                let size = (ty_bits(param.value_type) / 8) as u64;
-
-                let size = if call_conv == isa::CallConv::AppleAarch64
-                    || (call_conv.extends_wasmtime() && args_or_rets == ArgsOrRets::Rets)
-                {
-                    // MacOS aarch64 and Wasmtime allow stack slots with
-                    // sizes less than 8 bytes. They still need to be
-                    // properly aligned on their natural data alignment,
-                    // though.
-                    size
-                } else {
-                    // Every arg takes a minimum slot of 8 bytes. (16-byte stack
-                    // alignment happens separately after all args.)
-                    std::cmp::max(size, 8)
-                };
-
-                // Align the stack slot.
-                debug_assert!(size.is_power_of_two());
-                next_stack = align_to(next_stack, size);
-
-                ret.push(ABIArg::stack(
-                    next_stack as i64,
-                    param.value_type,
-                    param.extension,
-                    param.purpose,
-                ));
-                next_stack += size;
+                continue;
             }
+
+            // Handle multi-register params.
+            //
+            // See the AArch64 ABI (https://c9x.me/compile/bib/abi-arm64.pdf), Section 5.4, Stage C.
+            //
+            // For arguments with an alignment of 16 we round the register number up
+            // to the next even value. So we can never allocate, for example, an i128
+            // to X1 and X2; we have to skip one register and use X2 and X3 instead
+            // (Stage C.8).
+            // Note: the Apple ABI deviates a bit here: it doesn't respect Stage C.8
+            // and will happily allocate an i128 to X1 and X2.
+            //
+            // For integer types with an alignment of 16 we also have the additional
+            // restriction of passing the lower half in Xn and the upper half in Xn+1
+            // (Stage C.9).
+            //
+            // For an example of how LLVM handles this, see: https://godbolt.org/z/bhd3vvEfh
+            let is_multi_reg = rcs.len() >= 2;
+            if is_multi_reg {
+                assert!(
+                    rcs.len() == 2,
+                    "Unable to handle multi reg params with more than 2 regs"
+                );
+                assert!(
+                    rcs == &[RegClass::I64, RegClass::I64],
+                    "Unable to handle non i64 regs"
+                );
+
+                let reg_class_space = max_per_class_reg_vals - next_xreg;
+                let reg_space = remaining_reg_vals;
+
+                if reg_space >= 2 && reg_class_space >= 2 {
+                    // The AArch64 ABI does not allow us to start a split argument
+                    // at an odd-numbered register, so we need to skip one register.
+                    //
+                    // TODO: the Fast ABI should probably not skip the register.
+                    if !is_apple_cc && next_xreg % 2 != 0 {
+                        next_xreg += 1;
+                    }
+
+                    let lower_reg = xreg(next_xreg);
+                    let upper_reg = xreg(next_xreg + 1);
+
+                    ret.push(ABIArg::Slots {
+                        slots: vec![
+                            ABIArgSlot::Reg {
+                                reg: lower_reg.to_real_reg(),
+                                ty: param.value_type,
+                                extension: param.extension,
+                            },
+                            ABIArgSlot::Reg {
+                                reg: upper_reg.to_real_reg(),
+                                ty: param.value_type,
+                                extension: param.extension,
+                            },
+                        ],
+                        purpose: param.purpose,
+                    });
+
+                    next_xreg += 2;
+                    remaining_reg_vals -= 2;
+                    continue;
+                }
+            }
+
+            // Single-register parameters
+            if !is_multi_reg {
+                let rc = rcs[0];
+                let next_reg = match rc {
+                    RegClass::I64 => &mut next_xreg,
+                    RegClass::V128 => &mut next_vreg,
+                    _ => panic!("Invalid register class: {:?}", rc),
+                };
+
+                if *next_reg < max_per_class_reg_vals && remaining_reg_vals > 0 {
+                    let reg = match rc {
+                        RegClass::I64 => xreg(*next_reg),
+                        RegClass::V128 => vreg(*next_reg),
+                        _ => unreachable!(),
+                    };
+                    ret.push(ABIArg::reg(
+                        reg.to_real_reg(),
+                        param.value_type,
+                        param.extension,
+                        param.purpose,
+                    ));
+                    *next_reg += 1;
+                    remaining_reg_vals -= 1;
+                    continue;
+                }
+            }
+
+            // Spill to the stack.
+
+            // Compute the stack slot's size.
+            let size = (ty_bits(param.value_type) / 8) as u64;
+
+            let size = if is_apple_cc
+                || (call_conv.extends_wasmtime() && args_or_rets == ArgsOrRets::Rets)
+            {
+                // MacOS aarch64 and Wasmtime allow stack slots with
+                // sizes less than 8 bytes. They still need to be
+                // properly aligned on their natural data alignment,
+                // though.
+                size
+            } else {
+                // Every arg takes a minimum slot of 8 bytes. (16-byte stack
+                // alignment happens separately after all args.)
+                std::cmp::max(size, 8)
+            };
+
+            // Align the stack slot.
+            debug_assert!(size.is_power_of_two());
+            next_stack = align_to(next_stack, size);
+
+            ret.push(ABIArg::stack(
+                next_stack as i64,
+                param.value_type,
+                param.extension,
+                param.purpose,
+            ));
+            next_stack += size;
         }
 
         if args_or_rets == ArgsOrRets::Rets && is_baldrdash {
diff --git a/cranelift/codegen/src/isa/aarch64/lower.rs b/cranelift/codegen/src/isa/aarch64/lower.rs
index 4f5893f54b..7c1cd5f055 100644
--- a/cranelift/codegen/src/isa/aarch64/lower.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower.rs
@@ -158,42 +158,37 @@ impl NarrowValueMode {
     }
 }
 
-/// Lower an instruction input to a reg.
-///
-/// The given register will be extended appropriately, according to
-/// `narrow_mode` and the input's type. If extended, the value is
-/// always extended to 64 bits, for simplicity.
-pub(crate) fn put_input_in_reg<C: LowerCtx<I = Inst>>(
-    ctx: &mut C,
-    input: InsnInput,
-    narrow_mode: NarrowValueMode,
-) -> Reg {
-    debug!("put_input_in_reg: input {:?}", input);
-    let ty = ctx.input_ty(input.insn, input.input);
-    let from_bits = ty_bits(ty) as u8;
-    let inputs = ctx.get_input_as_source_or_const(input.insn, input.input);
-    let in_reg = if let Some(c) = inputs.constant {
-        // Generate constants fresh at each use to minimize long-range register pressure.
-        let masked = if from_bits < 64 {
-            c & ((1u64 << from_bits) - 1)
-        } else {
-            c
-        };
-        let to_reg = ctx.alloc_tmp(ty).only_reg().unwrap();
-        for inst in Inst::gen_constant(ValueRegs::one(to_reg), masked as u128, ty, |ty| {
-            ctx.alloc_tmp(ty).only_reg().unwrap()
-        })
-        .into_iter()
-        {
-            ctx.emit(inst);
-        }
-        to_reg.to_reg()
+/// Emits instruction(s) to generate the given 64-bit constant value into newly-allocated
+/// temporary register(s), returning them.
+fn generate_constant<C: LowerCtx<I = Inst>>(ctx: &mut C, ty: Type, c: u64) -> ValueRegs<Reg> {
+    let from_bits = ty_bits(ty);
+    let masked = if from_bits < 64 {
+        c & ((1u64 << from_bits) - 1)
     } else {
-        ctx.put_input_in_regs(input.insn, input.input)
-            .only_reg()
-            .unwrap()
+        c
     };
+    let cst_copy = ctx.alloc_tmp(ty);
+    for inst in Inst::gen_constant(cst_copy, masked as u128, ty, |ty| {
+        ctx.alloc_tmp(ty).only_reg().unwrap()
+    })
+    .into_iter()
+    {
+        ctx.emit(inst);
+    }
+    non_writable_value_regs(cst_copy)
+}
+
+/// Extends a register according to `narrow_mode`.
+/// If extended, the value is always extended to 64 bits, for simplicity.
+fn narrow_reg<C: LowerCtx<I = Inst>>(
+    ctx: &mut C,
+    ty: Type,
+    in_reg: Reg,
+    is_const: bool,
+    narrow_mode: NarrowValueMode,
+) -> Reg {
+    let from_bits = ty_bits(ty) as u8;
     match (narrow_mode, from_bits) {
         (NarrowValueMode::None, _) => in_reg,
         (NarrowValueMode::ZeroExtend32, n) if n < 32 => {
@@ -221,7 +216,7 @@ pub(crate) fn put_input_in_reg<C: LowerCtx<I = Inst>>(
         (NarrowValueMode::ZeroExtend32, 32) | (NarrowValueMode::SignExtend32, 32) => in_reg,
 
         (NarrowValueMode::ZeroExtend64, n) if n < 64 => {
-            if inputs.constant.is_some() {
+            if is_const {
                 // Constants are zero-extended to full 64-bit width on load already.
                 in_reg
             } else {
@@ -257,6 +252,48 @@ pub(crate) fn put_input_in_reg<C: LowerCtx<I = Inst>>(
     }
 }
 
+/// Lower an instruction input to a register.
+///
+/// The given register will be extended appropriately, according to
+/// `narrow_mode` and the input's type. If extended, the value is
+/// always extended to 64 bits, for simplicity.
+pub(crate) fn put_input_in_reg<C: LowerCtx<I = Inst>>(
+    ctx: &mut C,
+    input: InsnInput,
+    narrow_mode: NarrowValueMode,
+) -> Reg {
+    let reg = put_input_in_regs(ctx, input)
+        .only_reg()
+        .expect("Multi-register value not expected");
+
+    let is_const = ctx
+        .get_input_as_source_or_const(input.insn, input.input)
+        .constant
+        .is_some();
+
+    let ty = ctx.input_ty(input.insn, input.input);
+    narrow_reg(ctx, ty, reg, is_const, narrow_mode)
+}
+
+/// Lower an instruction input to one or more registers.
+pub(crate) fn put_input_in_regs<C: LowerCtx<I = Inst>>(
+    ctx: &mut C,
+    input: InsnInput,
+) -> ValueRegs<Reg> {
+    debug!("put_input_in_regs: input {:?}", input);
+    let ty = ctx.input_ty(input.insn, input.input);
+    let inputs = ctx.get_input_as_source_or_const(input.insn, input.input);
+
+    let in_regs = if let Some(c) = inputs.constant {
+        // Generate constants fresh at each use to minimize long-range register pressure.
+        generate_constant(ctx, ty, c)
+    } else {
+        ctx.put_input_in_regs(input.insn, input.input)
+    };
+
+    in_regs
+}
+
 /// Lower an instruction input to a reg or reg/shift, or reg/extend operand.
 ///
 /// The `narrow_mode` flag indicates whether the consumer of this value needs
diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
index ede66295e9..5e33c54034 100644
--- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -1572,10 +1572,21 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                     // N.B.: according to the AArch64 ABI, the top bits of a register
                     // (above the bits for the value's type) are undefined, so we
                     // need not extend the return values.
-                    let reg = put_input_in_reg(ctx, *input, NarrowValueMode::None);
-                    let retval_reg = ctx.retval(i).only_reg().unwrap();
+                    let src_regs = put_input_in_regs(ctx, *input);
+                    let retval_regs = ctx.retval(i);
+
+                    assert_eq!(src_regs.len(), retval_regs.len());
                     let ty = ctx.input_ty(insn, i);
-                    ctx.emit(Inst::gen_move(retval_reg, reg, ty));
+                    let (_, tys) = Inst::rc_for_type(ty)?;
+
+                    src_regs
+                        .regs()
+                        .iter()
+                        .zip(retval_regs.regs().iter())
+                        .zip(tys.iter())
+                        .for_each(|((&src, &dst), &ty)| {
+                            ctx.emit(Inst::gen_move(dst, src, ty));
+                        });
                 }
                 // N.B.: the Ret itself is generated by the ABI.
             }
@@ -1753,13 +1764,13 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             assert!(inputs.len() == abi.num_args());
             for i in abi.get_copy_to_arg_order() {
                 let input = inputs[i];
-                let arg_reg = put_input_in_reg(ctx, input, NarrowValueMode::None);
-                abi.emit_copy_regs_to_arg(ctx, i, ValueRegs::one(arg_reg));
+                let arg_regs = put_input_in_regs(ctx, input);
+                abi.emit_copy_regs_to_arg(ctx, i, arg_regs);
             }
             abi.emit_call(ctx);
             for (i, output) in outputs.iter().enumerate() {
-                let retval_reg = get_output_reg(ctx, *output).only_reg().unwrap();
-                abi.emit_copy_retval_to_regs(ctx, i, ValueRegs::one(retval_reg));
+                let retval_regs = get_output_reg(ctx, *output);
+                abi.emit_copy_retval_to_regs(ctx, i, retval_regs);
             }
             abi.emit_stack_post_adjust(ctx);
         }
@@ -2306,7 +2317,39 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             panic!("Vector ops not implemented.");
         }
 
-        Opcode::Isplit | Opcode::Iconcat => panic!("Vector ops not supported."),
+        Opcode::Isplit => {
+            assert_eq!(
+                ctx.input_ty(insn, 0),
+                I128,
+                "Isplit only implemented for i128's"
+            );
+            assert_eq!(ctx.output_ty(insn, 0), I64);
+            assert_eq!(ctx.output_ty(insn, 1), I64);
+
+            let src_regs = put_input_in_regs(ctx, inputs[0]);
+            let dst_lo = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
+            let dst_hi = get_output_reg(ctx, outputs[1]).only_reg().unwrap();
+
+            ctx.emit(Inst::gen_move(dst_lo, src_regs.regs()[0], I64));
+            ctx.emit(Inst::gen_move(dst_hi, src_regs.regs()[1], I64));
+        }
+
+        Opcode::Iconcat => {
+            assert_eq!(
+                ctx.output_ty(insn, 0),
+                I128,
+                "Iconcat only implemented for i128's"
+            );
+            assert_eq!(ctx.input_ty(insn, 0), I64);
+            assert_eq!(ctx.input_ty(insn, 1), I64);
+
+            let src_lo = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+            let src_hi = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
+            let dst = get_output_reg(ctx, outputs[0]);
+
+            ctx.emit(Inst::gen_move(dst.regs()[0], src_lo, I64));
+            ctx.emit(Inst::gen_move(dst.regs()[1], src_hi, I64));
+        }
 
         Opcode::Imax | Opcode::Umax | Opcode::Umin | Opcode::Imin => {
             let alu_op = match op {
diff --git a/cranelift/filetests/filetests/isa/aarch64/call.clif b/cranelift/filetests/filetests/isa/aarch64/call.clif
index a59c59a5b2..090c220167 100644
--- a/cranelift/filetests/filetests/isa/aarch64/call.clif
+++ b/cranelift/filetests/filetests/isa/aarch64/call.clif
@@ -250,3 +250,113 @@ block0:
 ; nextln: add sp, sp, #32
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
+
+
+; i128 tests
+function %f11(i128, i64) -> i64 {
+block0(v0: i128, v1: i64):
+    v2, v3 = isplit v0
+    return v3
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: mov x0, x1
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+
+function %f11_call(i64) -> i64 {
+    fn0 = %f11(i128, i64) -> i64
+
+block0(v0: i64):
+    v1 = iconst.i64 42
+    v2 = iconcat v1, v0
+    v3 = call fn0(v2, v1)
+    return v3
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: mov x1, x0
+; nextln: movz x0, #42
+; nextln: movz x2, #42
+; nextln: ldr x3, 8 ; b 12 ; data
+; nextln: blr x3
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+
+; The AArch64 ABI requires the i128 argument to start at an even-numbered register,
+; so it is passed in x2 and x3
+function %f12(i64, i128) -> i64 {
+block0(v0: i64, v1: i128):
+    v2, v3 = isplit v1
+    return v2
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: mov x0, x2
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+
+
+function %f12_call(i64) -> i64 {
+    fn0 = %f12(i64, i128) -> i64
+
+block0(v0: i64):
+    v1 = iconst.i64 42
+    v2 = iconcat v0, v1
+    v3 = call fn0(v1, v2)
+    return v3
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: movz x3, #42
+; nextln: mov x2, x0
+; nextln: movz x0, #42
+; nextln: ldr x1, 8 ; b 12 ; data
+; nextln: blr x1
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+
+
+; The Apple AArch64 ABI does not require the i128 argument to start at an even register,
+; so it is passed in x1 and x2
+function %f13(i64, i128) -> i64 apple_aarch64 {
+block0(v0: i64, v1: i128):
+    v2, v3 = isplit v1
+    return v2
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: mov x0, x1
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+
+function %f13_call(i64) -> i64 apple_aarch64 {
+    fn0 = %f13(i64, i128) -> i64 apple_aarch64
+
+block0(v0: i64):
+    v1 = iconst.i64 42
+    v2 = iconcat v0, v1
+    v3 = call fn0(v1, v2)
+    return v3
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: movz x2, #42
+; nextln: mov x1, x0
+; nextln: movz x0, #42
+; nextln: ldr x3, 8 ; b 12 ; data
+; nextln: blr x3
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
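As a quick cross-check of the Stage C.8 rule implemented in abi.rs above, here is a minimal, standalone Rust sketch (not part of the patch; `i128_reg_pair` and its signature are illustrative only, not Cranelift APIs) that reproduces the register pairs the %f12/%f13 tests expect.

// Standalone sketch: Stage C.8 register-pair selection for an i128 argument.

/// Returns the X-register pair used for an i128 argument, given the index of
/// the next free X register. The standard AAPCS64 rounds up to an even
/// register first (Stage C.8); the Apple ABI does not.
fn i128_reg_pair(mut next_xreg: usize, is_apple_cc: bool) -> (usize, usize) {
    if !is_apple_cc && next_xreg % 2 != 0 {
        next_xreg += 1; // skip the odd register, e.g. x1
    }
    (next_xreg, next_xreg + 1)
}

fn main() {
    // %f12(i64, i128): x0 holds the i64, so the i128 lands in (x2, x3).
    assert_eq!(i128_reg_pair(1, false), (2, 3));
    // %f13(i64, i128) apple_aarch64: no rounding, so the i128 lands in (x1, x2).
    assert_eq!(i128_reg_pair(1, true), (1, 2));
}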