aarch64: Implement I128 Loads and Stores
@@ -692,6 +692,64 @@ fn collect_address_addends<C: LowerCtx<I = Inst>>(
     (result64, result32, offset)
 }
 
+/// Lower the address of a pair load or store.
+pub(crate) fn lower_pair_address<C: LowerCtx<I = Inst>>(
+    ctx: &mut C,
+    roots: &[InsnInput],
+    offset: i32,
+) -> PairAMode {
+    // Collect addends through an arbitrary tree of 32-to-64-bit sign/zero
+    // extends and addition ops. We update these as we consume address
+    // components, so they represent the remaining addends not yet handled.
+    let (addends64, addends32, args_offset) = collect_address_addends(ctx, roots);
+    let offset = args_offset + (offset as i64);
+
+    trace!(
+        "lower_pair_address: addends64 {:?}, addends32 {:?}, offset {}",
+        addends64,
+        addends32,
+        offset
+    );
+
+    // Pairs basically only have reg + imm formats so we only have to worry about those
+
+    let imm7_offset = SImm7Scaled::maybe_from_i64(offset, I64);
+    match (&addends64[..], &addends32[..], imm7_offset) {
+        (&[add64], &[], Some(offset)) => PairAMode::SignedOffset(add64, offset),
+        (&[], &[add32], Some(offset)) => {
+            let tmp = ctx.alloc_tmp(I64).only_reg().unwrap();
+            let (reg, extendop) = add32;
+            let signed = match extendop {
+                ExtendOp::SXTW => true,
+                ExtendOp::UXTW => false,
+                _ => unreachable!(),
+            };
+            ctx.emit(Inst::Extend {
+                rd: tmp,
+                rn: reg,
+                signed,
+                from_bits: 32,
+                to_bits: 64,
+            });
+            PairAMode::SignedOffset(tmp.to_reg(), offset)
+        }
+        (&[], &[], Some(offset)) => PairAMode::SignedOffset(zero_reg(), offset),
+
+        (_, _, _) => {
+            // This is the general case, we just grab all addends and sum them into a register
+            let addr = ctx.alloc_tmp(I64).only_reg().unwrap();
+            lower_add_addends(ctx, addr, addends64, addends32);
+
+            let imm7 = imm7_offset.unwrap_or_else(|| {
+                lower_add_immediate(ctx, addr, addr.to_reg(), offset);
+                SImm7Scaled::maybe_from_i64(0, I64).unwrap()
+            });
+
+            PairAMode::SignedOffset(addr.to_reg(), imm7)
+        }
+    }
+}
+
 /// Lower the address of a load or store.
 pub(crate) fn lower_address<C: LowerCtx<I = Inst>>(
     ctx: &mut C,
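The reason lower_pair_address needs a fallback path at all is the narrow pair-addressing immediate: ldp/stp of 64-bit registers only take a signed 7-bit offset scaled by the access size, so just multiples of 8 in [-512, 504] can be encoded directly. A minimal sketch of that fit check, as a hypothetical stand-in for SImm7Scaled::maybe_from_i64 (the name and shape below are illustrative, not the backend's API):

    /// Hypothetical check: can `offset` be encoded as a scaled imm7 for a
    /// 64-bit ldp/stp? It must be a multiple of 8 and, once divided by 8,
    /// fit in a signed 7-bit field (-64..=63), i.e. -512..=504 unscaled.
    fn fits_simm7_scaled_i64(offset: i64) -> bool {
        offset % 8 == 0 && (-512..=504).contains(&offset)
    }

    fn main() {
        assert!(fits_simm7_scaled_i64(0));
        assert!(fits_simm7_scaled_i64(504));
        assert!(!fits_simm7_scaled_i64(505)); // not 8-aligned
        assert!(!fits_simm7_scaled_i64(512)); // 8-aligned but out of range
    }

Offsets that fail this check fall through to the general case above, which sums the addends (and the leftover offset) into a temporary register and uses a zero pair offset.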
@@ -792,36 +850,23 @@ pub(crate) fn lower_address<C: LowerCtx<I = Inst>>(
     // If there is any offset, load that first into `addr`, and add the `reg`
     // that we kicked out of the `AMode`; otherwise, start with that reg.
     if offset != 0 {
-        // If we can fit offset or -offset in an imm12, use an add-imm
-        // to combine the reg and offset. Otherwise, load value first then add.
-        if let Some(imm12) = Imm12::maybe_from_u64(offset as u64) {
-            ctx.emit(Inst::AluRRImm12 {
-                alu_op: ALUOp::Add64,
-                rd: addr,
-                rn: reg,
-                imm12,
-            });
-        } else if let Some(imm12) = Imm12::maybe_from_u64(offset.wrapping_neg() as u64) {
-            ctx.emit(Inst::AluRRImm12 {
-                alu_op: ALUOp::Sub64,
-                rd: addr,
-                rn: reg,
-                imm12,
-            });
-        } else {
-            lower_constant_u64(ctx, addr, offset as u64);
-            ctx.emit(Inst::AluRRR {
-                alu_op: ALUOp::Add64,
-                rd: addr,
-                rn: addr.to_reg(),
-                rm: reg,
-            });
-        }
+        lower_add_immediate(ctx, addr, reg, offset)
     } else {
         ctx.emit(Inst::gen_move(addr, reg, I64));
     }
 
     // Now handle reg64 and reg32-extended components.
+    lower_add_addends(ctx, addr, addends64, addends32);
+
+    memarg
+}
+
+fn lower_add_addends<C: LowerCtx<I = Inst>>(
+    ctx: &mut C,
+    rd: Writable<Reg>,
+    addends64: AddressAddend64List,
+    addends32: AddressAddend32List,
+) {
     for reg in addends64 {
         // If the register is the stack reg, we must move it to another reg
         // before adding it.
@@ -834,8 +879,8 @@ pub(crate) fn lower_address<C: LowerCtx<I = Inst>>(
         };
         ctx.emit(Inst::AluRRR {
             alu_op: ALUOp::Add64,
-            rd: addr,
-            rn: addr.to_reg(),
+            rd,
+            rn: rd.to_reg(),
             rm: reg,
         });
     }
@@ -843,14 +888,42 @@ pub(crate) fn lower_address<C: LowerCtx<I = Inst>>(
         assert!(reg != stack_reg());
         ctx.emit(Inst::AluRRRExtend {
            alu_op: ALUOp::Add64,
-           rd: addr,
-           rn: addr.to_reg(),
+           rd,
+           rn: rd.to_reg(),
            rm: reg,
            extendop,
        });
    }
+}
 
-    memarg
+/// Adds into `rd` a signed imm pattern matching the best instruction for it.
+// TODO: This function is duplicated in ctx.gen_add_imm
+fn lower_add_immediate<C: LowerCtx<I = Inst>>(ctx: &mut C, dst: Writable<Reg>, src: Reg, imm: i64) {
+    // If we can fit offset or -offset in an imm12, use an add-imm
+    // Otherwise, lower the constant first then add.
+    if let Some(imm12) = Imm12::maybe_from_u64(imm as u64) {
+        ctx.emit(Inst::AluRRImm12 {
+            alu_op: ALUOp::Add64,
+            rd: dst,
+            rn: src,
+            imm12,
+        });
+    } else if let Some(imm12) = Imm12::maybe_from_u64(imm.wrapping_neg() as u64) {
+        ctx.emit(Inst::AluRRImm12 {
+            alu_op: ALUOp::Sub64,
+            rd: dst,
+            rn: src,
+            imm12,
+        });
+    } else {
+        lower_constant_u64(ctx, dst, imm as u64);
+        ctx.emit(Inst::AluRRR {
+            alu_op: ALUOp::Add64,
+            rd: dst,
+            rn: dst.to_reg(),
+            rm: src,
+        });
+    }
+}
 }
 
 pub(crate) fn lower_constant_u64<C: LowerCtx<I = Inst>>(
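A note on why lower_add_immediate tries three shapes: AArch64 add/sub immediates are 12-bit unsigned values, optionally shifted left by 12 bits, so an offset that fits neither as `imm` nor as `-imm` has to be materialized into a register first. A rough sketch of that decision, under the assumption that Imm12::maybe_from_u64 accepts exactly the directly encodable values (the helpers below are illustrative, not the backend's code):

    /// Illustrative imm12 test: a value encodes directly if it fits in 12 bits,
    /// or in 12 bits shifted left by 12.
    fn fits_imm12(value: u64) -> bool {
        value < 0x1000 || ((value & 0xfff) == 0 && value < 0x100_0000)
    }

    /// Mirrors the three branches of lower_add_immediate.
    fn add_imm_strategy(imm: i64) -> &'static str {
        if fits_imm12(imm as u64) {
            "add rd, rn, #imm"
        } else if fits_imm12(imm.wrapping_neg() as u64) {
            "sub rd, rn, #(-imm)"
        } else {
            "materialize the constant, then add rd, rn, rm"
        }
    }

    fn main() {
        assert_eq!(add_imm_strategy(16), "add rd, rn, #imm");
        assert_eq!(add_imm_strategy(-32), "sub rd, rn, #(-imm)");
        assert_eq!(add_imm_strategy(100_016), "materialize the constant, then add rd, rn, rm");
    }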
@@ -1248,7 +1321,10 @@ fn load_op_to_ty(op: Opcode) -> Option<Type> {
 
 /// Helper to lower a load instruction; this is used in several places, because
 /// a load can sometimes be merged into another operation.
-pub(crate) fn lower_load<C: LowerCtx<I = Inst>, F: FnMut(&mut C, Writable<Reg>, Type, AMode)>(
+pub(crate) fn lower_load<
+    C: LowerCtx<I = Inst>,
+    F: FnMut(&mut C, ValueRegs<Writable<Reg>>, Type, AMode),
+>(
     ctx: &mut C,
     ir_inst: IRInst,
     inputs: &[InsnInput],
@@ -1261,7 +1337,7 @@ pub(crate) fn lower_load<C: LowerCtx<I = Inst>, F: FnMut(&mut C, Writable<Reg>,
 
     let off = ctx.data(ir_inst).load_store_offset().unwrap();
     let mem = lower_address(ctx, elem_ty, &inputs[..], off);
-    let rd = get_output_reg(ctx, output).only_reg().unwrap();
+    let rd = get_output_reg(ctx, output);
 
     f(ctx, rd, elem_ty, mem);
 }
 
@@ -1180,12 +1180,25 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                 .memflags(insn)
                 .expect("Load instruction should have memflags");
 
+            let out_ty = ctx.output_ty(insn, 0);
+            if out_ty == I128 {
+                let off = ctx.data(insn).load_store_offset().unwrap();
+                let mem = lower_pair_address(ctx, &inputs[..], off);
+                let dst = get_output_reg(ctx, outputs[0]);
+                ctx.emit(Inst::LoadP64 {
+                    rt: dst.regs()[0],
+                    rt2: dst.regs()[1],
+                    mem,
+                    flags,
+                });
+            } else {
                 lower_load(
                     ctx,
                     insn,
                     &inputs[..],
                     outputs[0],
-                    |ctx, rd, elem_ty, mem| {
+                    |ctx, dst, elem_ty, mem| {
+                        let rd = dst.only_reg().unwrap();
                         let is_float = ty_has_float_or_vec_representation(elem_ty);
                         ctx.emit(match (ty_bits(elem_ty), sign_extend, is_float) {
                             (1, _, _) => Inst::ULoad8 { rd, mem, flags },
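For the I128 paths, the value occupies two 64-bit registers, and the lowering leans on the pair instructions: LoadP64/StoreP64 become ldp/stp, which transfer rt at the base address and rt2 at base+8. Assuming the first register of the ValueRegs pair (dst.regs()[0]) carries the low 64 bits, that matches a little-endian i128 in memory. A small self-contained check of that layout, in plain Rust, just to illustrate the byte placement:

    fn main() {
        let value: u128 = 0x0123_4567_89ab_cdef_fedc_ba98_7654_3210;
        let lo = value as u64;          // what ldp/stp moves through rt  (offset 0)
        let hi = (value >> 64) as u64;  // what ldp/stp moves through rt2 (offset 8)

        let mut mem = [0u8; 16];
        mem[..8].copy_from_slice(&lo.to_le_bytes());
        mem[8..].copy_from_slice(&hi.to_le_bytes());

        // Reassembling the 16 bytes little-endian gives back the original i128.
        assert_eq!(u128::from_le_bytes(mem), value);
    }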
@@ -1200,7 +1213,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                             // Note that we treat some of the vector loads as scalar floating-point loads,
                             // which is correct in a little endian environment.
                             (64, _, true) => Inst::FpuLoad64 { rd, mem, flags },
-                            (128, _, _) => Inst::FpuLoad128 { rd, mem, flags },
+                            (128, _, true) => Inst::FpuLoad128 { rd, mem, flags },
                             _ => panic!("Unsupported size in load"),
                         });
 
@@ -1221,6 +1234,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                         };
 
                         if let Some(t) = vec_extend {
+                            let rd = dst.only_reg().unwrap();
                             ctx.emit(Inst::VecExtend {
                                 t,
                                 rd,
@@ -1231,6 +1245,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                     },
                 );
             }
+            }
 
             Opcode::Store
             | Opcode::Istore8
@@ -1253,9 +1268,19 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                 .memflags(insn)
                 .expect("Store instruction should have memflags");
 
-            let mem = lower_address(ctx, elem_ty, &inputs[1..], off);
-            let rd = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+            let dst = put_input_in_regs(ctx, inputs[0]);
 
+            if elem_ty == I128 {
+                let mem = lower_pair_address(ctx, &inputs[1..], off);
+                ctx.emit(Inst::StoreP64 {
+                    rt: dst.regs()[0],
+                    rt2: dst.regs()[1],
+                    mem,
+                    flags,
+                });
+            } else {
+                let rd = dst.only_reg().unwrap();
+                let mem = lower_address(ctx, elem_ty, &inputs[1..], off);
                 ctx.emit(match (ty_bits(elem_ty), is_float) {
                     (1, _) | (8, _) => Inst::Store8 { rd, mem, flags },
                     (16, _) => Inst::Store16 { rd, mem, flags },
@@ -1267,6 +1292,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                     _ => panic!("Unsupported size in store"),
                 });
             }
+            }
 
         Opcode::StackAddr => {
             let (stack_slot, offset) = match *ctx.data(insn) {
@@ -276,3 +276,167 @@ block0(v0: b1):
 
     return v0, v137
 }
+
+
+function %i128_stack_store(i128) {
+    ss0 = explicit_slot 16
+
+block0(v0: i128):
+    stack_store.i128 v0, ss0
+    return
+}
+; TODO: Codegen improvement opportunities: This should be just a stp x0, x1, [sp, #-16]
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: sub sp, sp, #16
+; nextln: mov x2, sp
+; nextln: stp x0, x1, [x2]
+; nextln: add sp, sp, #16
+; nextln: ldp fp, lr, [sp], #16
+
+
+function %i128_stack_store_slot_offset(i128) {
+    ss0 = explicit_slot 16, offset 16
+
+block0(v0: i128):
+    stack_store.i128 v0, ss0
+    return
+}
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: sub sp, sp, #16
+; nextln: mov x2, sp
+; nextln: stp x0, x1, [x2]
+; nextln: add sp, sp, #16
+; nextln: ldp fp, lr, [sp], #16
+
+
+function %i128_stack_store_inst_offset(i128) {
+    ss0 = explicit_slot 16
+    ss1 = explicit_slot 16
+
+block0(v0: i128):
+    stack_store.i128 v0, ss1+16
+    return
+}
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: sub sp, sp, #32
+; nextln: add x2, sp, #32
+; nextln: stp x0, x1, [x2]
+; nextln: add sp, sp, #32
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+
+function %i128_stack_store_big(i128) {
+    ss0 = explicit_slot 100000
+    ss1 = explicit_slot 8
+
+block0(v0: i128):
+    stack_store.i128 v0, ss0
+    return
+}
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: movz w16, #34480
+; nextln: movk w16, #1, LSL #16
+; nextln: sub sp, sp, x16, UXTX
+; nextln: mov x2, sp
+; nextln: stp x0, x1, [x2]
+; nextln: movz w16, #34480
+; nextln: movk w16, #1, LSL #16
+; nextln: add sp, sp, x16, UXTX
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+
+function %i128_stack_load() -> i128 {
+    ss0 = explicit_slot 16
+
+block0:
+    v0 = stack_load.i128 ss0
+    return v0
+}
+; TODO: Codegen improvement opportunities: This should be just a ldp x0, x1, [sp, #-16]
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: sub sp, sp, #16
+; nextln: mov x0, sp
+; nextln: ldp x1, x0, [x0]
+; nextln: mov x2, x0
+; nextln: mov x0, x1
+; nextln: mov x1, x2
+; nextln: add sp, sp, #16
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+
+function %i128_stack_load_slot_offset() -> i128 {
+    ss0 = explicit_slot 16, offset 16
+
+block0:
+    v0 = stack_load.i128 ss0
+    return v0
+}
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: sub sp, sp, #16
+; nextln: mov x0, sp
+; nextln: ldp x1, x0, [x0]
+; nextln: mov x2, x0
+; nextln: mov x0, x1
+; nextln: mov x1, x2
+; nextln: add sp, sp, #16
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+
+function %i128_stack_load_inst_offset() -> i128 {
+    ss0 = explicit_slot 16
+    ss1 = explicit_slot 16
+
+block0:
+    v0 = stack_load.i128 ss1+16
+    return v0
+}
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: sub sp, sp, #32
+; nextln: add x0, sp, #32
+; nextln: ldp x1, x0, [x0]
+; nextln: mov x2, x0
+; nextln: mov x0, x1
+; nextln: mov x1, x2
+; nextln: add sp, sp, #32
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+
+function %i128_stack_load_big() -> i128 {
+    ss0 = explicit_slot 100000
+    ss1 = explicit_slot 8
+
+block0:
+    v0 = stack_load.i128 ss0
+    return v0
+}
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: movz w16, #34480
+; nextln: movk w16, #1, LSL #16
+; nextln: sub sp, sp, x16, UXTX
+; nextln: mov x0, sp
+; nextln: ldp x1, x0, [x0]
+; nextln: mov x2, x0
+; nextln: mov x0, x1
+; nextln: mov x1, x2
+; nextln: movz w16, #34480
+; nextln: movk w16, #1, LSL #16
+; nextln: add sp, sp, x16, UXTX
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
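One detail worth decoding in the `_big` cases: the frame is too large for an add/sub immediate, so the prologue materializes the size with movz/movk into w16 and subtracts it from sp with an extended-register sub. Assuming the usual 16-byte stack alignment, the constant works out to the two explicit slots rounded up (a quick arithmetic check, not taken from the commit):

    fn main() {
        // movz w16, #34480 ; movk w16, #1, LSL #16  materializes:
        let materialized = 34480u64 + (1u64 << 16);
        // 100000-byte slot + 8-byte slot, rounded up to a 16-byte boundary
        let frame = (100_000u64 + 8 + 15) & !15;
        assert_eq!(materialized, 100_016);
        assert_eq!(frame, 100_016);
    }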
cranelift/filetests/filetests/runtests/i128-load-store.clif (new file, 91 lines)
@@ -0,0 +1,91 @@
+test run
+target x86_64 machinst
+target aarch64
+
+function %i128_stack_store_load(i64, i64) -> b1 {
+    ss0 = explicit_slot 16
+
+block0(v0: i64,v1: i64):
+    v2 = iconcat v0, v1
+
+    stack_store.i128 v2, ss0
+    v3 = stack_load.i128 ss0
+
+    v4 = icmp.i128 eq v2, v3
+    return v4
+}
+; run: %i128_stack_store_load(0, 0) == true
+; run: %i128_stack_store_load(-1, -1) == true
+; run: %i128_stack_store_load(-1, 0) == true
+; run: %i128_stack_store_load(0, -1) == true
+; run: %i128_stack_store_load(0x01234567_89ABCDEF, 0xFEDCBA98_76543210) == true
+; run: %i128_stack_store_load(0x06060606_06060606, 0xA00A00A0_0A00A00A) == true
+; run: %i128_stack_store_load(0xC0FFEEEE_DECAFFFF, 0xDECAFFFF_C0FFEEEE) == true
+
+
+function %i128_stack_store_load_offset(i64, i64) -> b1 {
+    ss0 = explicit_slot 16, offset 16
+
+block0(v0: i64,v1: i64):
+    v2 = iconcat v0, v1
+
+    stack_store.i128 v2, ss0
+    v3 = stack_load.i128 ss0
+
+    v4 = icmp.i128 eq v2, v3
+    return v4
+}
+; run: %i128_stack_store_load_offset(0, 0) == true
+; run: %i128_stack_store_load_offset(-1, -1) == true
+; run: %i128_stack_store_load_offset(-1, 0) == true
+; run: %i128_stack_store_load_offset(0, -1) == true
+; run: %i128_stack_store_load_offset(0x01234567_89ABCDEF, 0xFEDCBA98_76543210) == true
+; run: %i128_stack_store_load_offset(0x06060606_06060606, 0xA00A00A0_0A00A00A) == true
+; run: %i128_stack_store_load_offset(0xC0FFEEEE_DECAFFFF, 0xDECAFFFF_C0FFEEEE) == true
+
+
+function %i128_stack_store_load_inst_offset(i64, i64) -> b1 {
+    ss0 = explicit_slot 16
+    ss1 = explicit_slot 16
+    ss2 = explicit_slot 16
+
+block0(v0: i64,v1: i64):
+    v2 = iconcat v0, v1
+
+    stack_store.i128 v2, ss1+16
+    v3 = stack_load.i128 ss1+16
+
+    v4 = icmp.i128 eq v2, v3
+    return v4
+}
+; run: %i128_stack_store_load_inst_offset(0, 0) == true
+; run: %i128_stack_store_load_inst_offset(-1, -1) == true
+; run: %i128_stack_store_load_inst_offset(-1, 0) == true
+; run: %i128_stack_store_load_inst_offset(0, -1) == true
+; run: %i128_stack_store_load_inst_offset(0x01234567_89ABCDEF, 0xFEDCBA98_76543210) == true
+; run: %i128_stack_store_load_inst_offset(0x06060606_06060606, 0xA00A00A0_0A00A00A) == true
+; run: %i128_stack_store_load_inst_offset(0xC0FFEEEE_DECAFFFF, 0xDECAFFFF_C0FFEEEE) == true
+
+
+; Some arches (aarch64) try to encode the offset into the load/store instructions
+; test that we spill if the offset is too large and doesn't fit in the instruction
+function %i128_stack_store_load_big_offset(i64, i64) -> b1 {
+    ss0 = explicit_slot 100000
+    ss1 = explicit_slot 8
+
+block0(v0: i64,v1: i64):
+    v2 = iconcat v0, v1
+
+    stack_store.i128 v2, ss0
+    v3 = stack_load.i128 ss0
+
+    v4 = icmp.i128 eq v2, v3
+    return v4
+}
+; run: %i128_stack_store_load_big_offset(0, 0) == true
+; run: %i128_stack_store_load_big_offset(-1, -1) == true
+; run: %i128_stack_store_load_big_offset(-1, 0) == true
+; run: %i128_stack_store_load_big_offset(0, -1) == true
+; run: %i128_stack_store_load_big_offset(0x01234567_89ABCDEF, 0xFEDCBA98_76543210) == true
+; run: %i128_stack_store_load_big_offset(0x06060606_06060606, 0xA00A00A0_0A00A00A) == true
+; run: %i128_stack_store_load_big_offset(0xC0FFEEEE_DECAFFFF, 0xDECAFFFF_C0FFEEEE) == true
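To read the run lines: each call passes the two i64 halves that `iconcat` joins into the i128 under test, with the first operand becoming the low 64 bits per CLIF's iconcat. For instance, the 0x01234567_89ABCDEF / 0xFEDCBA98_76543210 case stores and reloads the value shown below (a quick Rust rendering of that pairing, just for illustration):

    fn main() {
        let lo: u64 = 0x01234567_89ABCDEF; // first iconcat operand
        let hi: u64 = 0xFEDCBA98_76543210; // second iconcat operand
        let v2: u128 = (lo as u128) | ((hi as u128) << 64);
        assert_eq!(v2, 0xFEDCBA98_76543210_01234567_89ABCDEF);
    }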