diff --git a/cranelift/codegen/src/isa/aarch64/lower.rs b/cranelift/codegen/src/isa/aarch64/lower.rs
index e7853ca98e..039f3b3cd3 100644
--- a/cranelift/codegen/src/isa/aarch64/lower.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower.rs
@@ -701,7 +701,7 @@ pub(crate) fn lower_pair_address<C: LowerCtx<I = Inst>>(
     // Collect addends through an arbitrary tree of 32-to-64-bit sign/zero
     // extends and addition ops. We update these as we consume address
     // components, so they represent the remaining addends not yet handled.
-    let (addends64, addends32, args_offset) = collect_address_addends(ctx, roots);
+    let (mut addends64, mut addends32, args_offset) = collect_address_addends(ctx, roots);
     let offset = args_offset + (offset as i64);
 
     trace!(
@@ -713,41 +713,40 @@ pub(crate) fn lower_pair_address<C: LowerCtx<I = Inst>>(
     // Pairs basically only have reg + imm formats so we only have to worry about those
-    let imm7_offset = SImm7Scaled::maybe_from_i64(offset, I64);
-    match (&addends64[..], &addends32[..], imm7_offset) {
-        (&[add64], &[], Some(offset)) => PairAMode::SignedOffset(add64, offset),
-        (&[], &[add32], Some(offset)) => {
-            let tmp = ctx.alloc_tmp(I64).only_reg().unwrap();
-            let (reg, extendop) = add32;
-            let signed = match extendop {
-                ExtendOp::SXTW => true,
-                ExtendOp::UXTW => false,
-                _ => unreachable!(),
-            };
-            ctx.emit(Inst::Extend {
-                rd: tmp,
-                rn: reg,
-                signed,
-                from_bits: 32,
-                to_bits: 64,
-            });
-            PairAMode::SignedOffset(tmp.to_reg(), offset)
-        }
-        (&[], &[], Some(offset)) => PairAMode::SignedOffset(zero_reg(), offset),
+    let base_reg = if let Some(reg64) = addends64.pop() {
+        reg64
+    } else if let Some((reg32, extendop)) = addends32.pop() {
+        let tmp = ctx.alloc_tmp(I64).only_reg().unwrap();
+        let signed = match extendop {
+            ExtendOp::SXTW => true,
+            ExtendOp::UXTW => false,
+            _ => unreachable!(),
+        };
+        ctx.emit(Inst::Extend {
+            rd: tmp,
+            rn: reg32,
+            signed,
+            from_bits: 32,
+            to_bits: 64,
+        });
+        tmp.to_reg()
+    } else {
+        zero_reg()
+    };
 
-        (_, _, _) => {
-            // This is the general case, we just grab all addends and sum them into a register
-            let addr = ctx.alloc_tmp(I64).only_reg().unwrap();
-            lower_add_addends(ctx, addr, addends64, addends32);
+    let addr = ctx.alloc_tmp(I64).only_reg().unwrap();
+    ctx.emit(Inst::gen_move(addr, base_reg, I64));
 
-            let imm7 = imm7_offset.unwrap_or_else(|| {
-                lower_add_immediate(ctx, addr, addr.to_reg(), offset);
-                SImm7Scaled::maybe_from_i64(0, I64).unwrap()
-            });
+    // We have the base register; if we have any other addends, add them in
+    lower_add_addends(ctx, addr, addends64, addends32);
 
-            PairAMode::SignedOffset(addr.to_reg(), imm7)
-        }
-    }
+    // Figure out what offset we should emit
+    let imm7 = SImm7Scaled::maybe_from_i64(offset, I64).unwrap_or_else(|| {
+        lower_add_immediate(ctx, addr, addr.to_reg(), offset);
+        SImm7Scaled::maybe_from_i64(0, I64).unwrap()
+    });
+
+    PairAMode::SignedOffset(addr.to_reg(), imm7)
 }
 
 /// Lower the address of a load or store.
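The rewrite above drops the special-cased match over addend patterns: every path now copies a base register into a fresh temporary, sums any remaining addends into it, and only then decides whether the offset fits the pair addressing mode. For I64 pairs, the SImm7Scaled immediate of ldp/stp is a signed 7-bit field scaled by the 8-byte access size, so only multiples of 8 in [-512, 504] encode directly; anything else is folded into the base register via lower_add_immediate and the ldp/stp then uses an offset of #0. A minimal sketch of that range check, using a hypothetical standalone helper rather than the real SImm7Scaled type:

    // Hypothetical mirror of what SImm7Scaled::maybe_from_i64(offset, I64)
    // accepts: a multiple of the 8-byte scale whose scaled value fits in a
    // signed 7-bit field, i.e. in [-64, 63].
    fn fits_simm7_scaled_i64(offset: i64) -> bool {
        const SCALE: i64 = 8; // size of one I64 register
        offset % SCALE == 0 && (-64..=63).contains(&(offset / SCALE))
    }

    fn main() {
        assert!(fits_simm7_scaled_i64(504)); // 63 * 8: largest directly encodable offset
        assert!(fits_simm7_scaled_i64(-512)); // -64 * 8: smallest directly encodable offset
        assert!(!fits_simm7_scaled_i64(512)); // out of range: takes the add fallback
    }

The amodes tests below pick offsets of exactly 504 and -512 because they sit on the edges of this window.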
diff --git a/cranelift/filetests/filetests/isa/aarch64/amodes.clif b/cranelift/filetests/filetests/isa/aarch64/amodes.clif
index 837ba1815f..fbab91d7f7 100644
--- a/cranelift/filetests/filetests/isa/aarch64/amodes.clif
+++ b/cranelift/filetests/filetests/isa/aarch64/amodes.clif
@@ -386,3 +386,130 @@ block0(v0: i64, v1: i64, v2: i64):
 ; nextln: ldrsh x0, [x0]
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
+
+
+function %i128(i64) -> i128 {
+block0(v0: i64):
+    v1 = load.i128 v0
+    store.i128 v1, v0
+    return v1
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: mov x1, x0
+; nextln: ldp x2, x1, [x1]
+; nextln: stp x2, x1, [x0]
+; nextln: mov x0, x2
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+
+function %i128_imm_offset(i64) -> i128 {
+block0(v0: i64):
+    v1 = load.i128 v0+16
+    store.i128 v1, v0+16
+    return v1
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: mov x1, x0
+; nextln: ldp x2, x1, [x1, #16]
+; nextln: stp x2, x1, [x0, #16]
+; nextln: mov x0, x2
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %i128_imm_offset_large(i64) -> i128 {
+block0(v0: i64):
+    v1 = load.i128 v0+504
+    store.i128 v1, v0+504
+    return v1
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: mov x1, x0
+; nextln: ldp x2, x1, [x1, #504]
+; nextln: stp x2, x1, [x0, #504]
+; nextln: mov x0, x2
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %i128_imm_offset_negative_large(i64) -> i128 {
+block0(v0: i64):
+    v1 = load.i128 v0-512
+    store.i128 v1, v0-512
+    return v1
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: mov x1, x0
+; nextln: ldp x2, x1, [x1, #-512]
+; nextln: stp x2, x1, [x0, #-512]
+; nextln: mov x0, x2
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+
+function %i128_add_offset(i64) -> i128 {
+block0(v0: i64):
+    v1 = iadd_imm v0, 32
+    v2 = load.i128 v1
+    store.i128 v2, v1
+    return v2
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: mov x1, x0
+; nextln: ldp x2, x1, [x1, #32]
+; nextln: stp x2, x1, [x0, #32]
+; nextln: mov x0, x2
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+
+function %i128_32bit_sextend_simple(i32) -> i128 {
+block0(v0: i32):
+    v1 = sextend.i64 v0
+    v2 = load.i128 v1
+    store.i128 v2, v1
+    return v2
+}
+
+; TODO: We should be able to deduplicate the sxtw instruction
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: sxtw x1, w0
+; nextln: ldp x2, x1, [x1]
+; nextln: sxtw x0, w0
+; nextln: stp x2, x1, [x0]
+; nextln: mov x0, x2
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+
+function %i128_32bit_sextend(i64, i32) -> i128 {
+block0(v0: i64, v1: i32):
+    v2 = sextend.i64 v1
+    v3 = iadd.i64 v0, v2
+    v4 = iadd_imm.i64 v3, 24
+    v5 = load.i128 v4
+    store.i128 v5, v4
+    return v5
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: mov x2, x0
+; nextln: add x2, x2, x1, SXTW
+; nextln: ldp x3, x2, [x2, #24]
+; nextln: add x0, x0, x1, SXTW
+; nextln: stp x3, x2, [x0, #24]
+; nextln: mov x0, x3
+; nextln: mov x1, x2
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
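Two things are worth noting in the expected code above. The mov x1, x0 copies and the duplicated sxtw both come from the lowering now unconditionally moving the base into a fresh temporary (Inst::gen_move) before adding any remaining addends; the TODO in %i128_32bit_sextend_simple records the redundant extend as a known cleanup, not a correctness problem. Also, none of these tests use an offset outside the SImm7Scaled window, so none exercise the fallback in which lower_add_immediate folds the offset into the scratch register and the ldp/stp is emitted with #0.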
diff --git a/cranelift/filetests/filetests/runtests/i128-load-store.clif b/cranelift/filetests/filetests/runtests/i128-load-store.clif
index b02e8ec26b..41046e8717 100644
--- a/cranelift/filetests/filetests/runtests/i128-load-store.clif
+++ b/cranelift/filetests/filetests/runtests/i128-load-store.clif
@@ -89,3 +89,48 @@ block0(v0: i64,v1: i64):
 ; run: %i128_stack_store_load_big_offset(0x01234567_89ABCDEF, 0xFEDCBA98_76543210) == true
 ; run: %i128_stack_store_load_big_offset(0x06060606_06060606, 0xA00A00A0_0A00A00A) == true
 ; run: %i128_stack_store_load_big_offset(0xC0FFEEEE_DECAFFFF, 0xDECAFFFF_C0FFEEEE) == true
+
+
+
+function %i128_store_load(i64, i64) -> b1 {
+    ss0 = explicit_slot 16
+
+block0(v0: i64,v1: i64):
+    v2 = iconcat v0, v1
+
+    v3 = stack_addr.i64 ss0
+    store.i128 v2, v3
+    v4 = load.i128 v3
+
+    v5 = icmp.i128 eq v2, v4
+    return v5
+}
+; run: %i128_store_load(0, 0) == true
+; run: %i128_store_load(-1, -1) == true
+; run: %i128_store_load(-1, 0) == true
+; run: %i128_store_load(0, -1) == true
+; run: %i128_store_load(0x01234567_89ABCDEF, 0xFEDCBA98_76543210) == true
+; run: %i128_store_load(0x06060606_06060606, 0xA00A00A0_0A00A00A) == true
+; run: %i128_store_load(0xC0FFEEEE_DECAFFFF, 0xDECAFFFF_C0FFEEEE) == true
+
+
+function %i128_store_load_offset(i64, i64) -> b1 {
+    ss0 = explicit_slot 32
+
+block0(v0: i64,v1: i64):
+    v2 = iconcat v0, v1
+
+    v3 = stack_addr.i64 ss0
+    store.i128 v2, v3+16
+    v4 = load.i128 v3+16
+
+    v5 = icmp.i128 eq v2, v4
+    return v5
+}
+; run: %i128_store_load_offset(0, 0) == true
+; run: %i128_store_load_offset(-1, -1) == true
+; run: %i128_store_load_offset(-1, 0) == true
+; run: %i128_store_load_offset(0, -1) == true
+; run: %i128_store_load_offset(0x01234567_89ABCDEF, 0xFEDCBA98_76543210) == true
+; run: %i128_store_load_offset(0x06060606_06060606, 0xA00A00A0_0A00A00A) == true
+; run: %i128_store_load_offset(0xC0FFEEEE_DECAFFFF, 0xDECAFFFF_C0FFEEEE) == true
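The new run-tests check end to end that an i128 stored through the new pair amode reads back unchanged, both at offset 0 and at an offset that still fits the immediate. The property they assert, restated as plain Rust for illustration (not code from the patch; iconcat takes the low half as its first operand):

    fn main() {
        // A subset of the (low, high) pairs fed to iconcat by the run lines above.
        let patterns: [(u64, u64); 4] = [
            (0, 0),
            (u64::MAX, u64::MAX),
            (0x01234567_89ABCDEF, 0xFEDCBA98_76543210),
            (0xC0FFEEEE_DECAFFFF, 0xDECAFFFF_C0FFEEEE),
        ];
        for (lo, hi) in patterns {
            let v = (lo as u128) | ((hi as u128) << 64); // v2 = iconcat v0, v1
            let mut slot = [0u8; 16]; // stands in for the 16-byte stack slot
            slot.copy_from_slice(&v.to_ne_bytes()); // store.i128 v2, v3
            assert_eq!(u128::from_ne_bytes(slot), v); // v4 = load.i128 v3; icmp eq
        }
    }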