Merge pull request #2985 from afonso360/aarch64-i128-load-store
aarch64: Implement I128 Loads and Stores
This commit is contained in:
@@ -692,6 +692,63 @@ fn collect_address_addends<C: LowerCtx<I = Inst>>(
|
||||
(result64, result32, offset)
|
||||
}
|
||||
|
||||
/// Lower the address of a pair load or store.
|
||||
pub(crate) fn lower_pair_address<C: LowerCtx<I = Inst>>(
|
||||
ctx: &mut C,
|
||||
roots: &[InsnInput],
|
||||
offset: i32,
|
||||
) -> PairAMode {
|
||||
// Collect addends through an arbitrary tree of 32-to-64-bit sign/zero
|
||||
// extends and addition ops. We update these as we consume address
|
||||
// components, so they represent the remaining addends not yet handled.
|
||||
let (mut addends64, mut addends32, args_offset) = collect_address_addends(ctx, roots);
|
||||
let offset = args_offset + (offset as i64);
|
||||
|
||||
trace!(
|
||||
"lower_pair_address: addends64 {:?}, addends32 {:?}, offset {}",
|
||||
addends64,
|
||||
addends32,
|
||||
offset
|
||||
);
|
||||
|
||||
// Pairs basically only have reg + imm formats so we only have to worry about those
|
||||
|
||||
let base_reg = if let Some(reg64) = addends64.pop() {
|
||||
reg64
|
||||
} else if let Some((reg32, extendop)) = addends32.pop() {
|
||||
let tmp = ctx.alloc_tmp(I64).only_reg().unwrap();
|
||||
let signed = match extendop {
|
||||
ExtendOp::SXTW => true,
|
||||
ExtendOp::UXTW => false,
|
||||
_ => unreachable!(),
|
||||
};
|
||||
ctx.emit(Inst::Extend {
|
||||
rd: tmp,
|
||||
rn: reg32,
|
||||
signed,
|
||||
from_bits: 32,
|
||||
to_bits: 64,
|
||||
});
|
||||
tmp.to_reg()
|
||||
} else {
|
||||
zero_reg()
|
||||
};
|
||||
|
||||
let addr = ctx.alloc_tmp(I64).only_reg().unwrap();
|
||||
ctx.emit(Inst::gen_move(addr, base_reg, I64));
|
||||
|
||||
// We have the base register, if we have any others, we need to add them
|
||||
lower_add_addends(ctx, addr, addends64, addends32);
|
||||
|
||||
// Figure out what offset we should emit
|
||||
let imm7 = SImm7Scaled::maybe_from_i64(offset, I64).unwrap_or_else(|| {
|
||||
lower_add_immediate(ctx, addr, addr.to_reg(), offset);
|
||||
SImm7Scaled::maybe_from_i64(0, I64).unwrap()
|
||||
});
|
||||
|
||||
PairAMode::SignedOffset(addr.to_reg(), imm7)
|
||||
}
|
||||
|
||||
/// Lower the address of a load or store.
|
||||
pub(crate) fn lower_address<C: LowerCtx<I = Inst>>(
|
||||
ctx: &mut C,
|
||||
@@ -792,36 +849,23 @@ pub(crate) fn lower_address<C: LowerCtx<I = Inst>>(
|
||||
// If there is any offset, load that first into `addr`, and add the `reg`
|
||||
// that we kicked out of the `AMode`; otherwise, start with that reg.
|
||||
if offset != 0 {
|
||||
// If we can fit offset or -offset in an imm12, use an add-imm
|
||||
// to combine the reg and offset. Otherwise, load value first then add.
|
||||
if let Some(imm12) = Imm12::maybe_from_u64(offset as u64) {
|
||||
ctx.emit(Inst::AluRRImm12 {
|
||||
alu_op: ALUOp::Add64,
|
||||
rd: addr,
|
||||
rn: reg,
|
||||
imm12,
|
||||
});
|
||||
} else if let Some(imm12) = Imm12::maybe_from_u64(offset.wrapping_neg() as u64) {
|
||||
ctx.emit(Inst::AluRRImm12 {
|
||||
alu_op: ALUOp::Sub64,
|
||||
rd: addr,
|
||||
rn: reg,
|
||||
imm12,
|
||||
});
|
||||
} else {
|
||||
lower_constant_u64(ctx, addr, offset as u64);
|
||||
ctx.emit(Inst::AluRRR {
|
||||
alu_op: ALUOp::Add64,
|
||||
rd: addr,
|
||||
rn: addr.to_reg(),
|
||||
rm: reg,
|
||||
});
|
||||
}
|
||||
lower_add_immediate(ctx, addr, reg, offset)
|
||||
} else {
|
||||
ctx.emit(Inst::gen_move(addr, reg, I64));
|
||||
}
|
||||
|
||||
// Now handle reg64 and reg32-extended components.
|
||||
lower_add_addends(ctx, addr, addends64, addends32);
|
||||
|
||||
memarg
|
||||
}
|
||||
|
||||
fn lower_add_addends<C: LowerCtx<I = Inst>>(
|
||||
ctx: &mut C,
|
||||
rd: Writable<Reg>,
|
||||
addends64: AddressAddend64List,
|
||||
addends32: AddressAddend32List,
|
||||
) {
|
||||
for reg in addends64 {
|
||||
// If the register is the stack reg, we must move it to another reg
|
||||
// before adding it.
|
||||
@@ -834,8 +878,8 @@ pub(crate) fn lower_address<C: LowerCtx<I = Inst>>(
|
||||
};
|
||||
ctx.emit(Inst::AluRRR {
|
||||
alu_op: ALUOp::Add64,
|
||||
rd: addr,
|
||||
rn: addr.to_reg(),
|
||||
rd,
|
||||
rn: rd.to_reg(),
|
||||
rm: reg,
|
||||
});
|
||||
}
|
||||
@@ -843,14 +887,42 @@ pub(crate) fn lower_address<C: LowerCtx<I = Inst>>(
|
||||
assert!(reg != stack_reg());
|
||||
ctx.emit(Inst::AluRRRExtend {
|
||||
alu_op: ALUOp::Add64,
|
||||
rd: addr,
|
||||
rn: addr.to_reg(),
|
||||
rd,
|
||||
rn: rd.to_reg(),
|
||||
rm: reg,
|
||||
extendop,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
memarg
|
||||
/// Adds into `rd` a signed imm pattern matching the best instruction for it.
|
||||
// TODO: This function is duplicated in ctx.gen_add_imm
|
||||
fn lower_add_immediate<C: LowerCtx<I = Inst>>(ctx: &mut C, dst: Writable<Reg>, src: Reg, imm: i64) {
|
||||
// If we can fit offset or -offset in an imm12, use an add-imm
|
||||
// Otherwise, lower the constant first then add.
|
||||
if let Some(imm12) = Imm12::maybe_from_u64(imm as u64) {
|
||||
ctx.emit(Inst::AluRRImm12 {
|
||||
alu_op: ALUOp::Add64,
|
||||
rd: dst,
|
||||
rn: src,
|
||||
imm12,
|
||||
});
|
||||
} else if let Some(imm12) = Imm12::maybe_from_u64(imm.wrapping_neg() as u64) {
|
||||
ctx.emit(Inst::AluRRImm12 {
|
||||
alu_op: ALUOp::Sub64,
|
||||
rd: dst,
|
||||
rn: src,
|
||||
imm12,
|
||||
});
|
||||
} else {
|
||||
lower_constant_u64(ctx, dst, imm as u64);
|
||||
ctx.emit(Inst::AluRRR {
|
||||
alu_op: ALUOp::Add64,
|
||||
rd: dst,
|
||||
rn: dst.to_reg(),
|
||||
rm: src,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn lower_constant_u64<C: LowerCtx<I = Inst>>(
|
||||
@@ -1248,7 +1320,10 @@ fn load_op_to_ty(op: Opcode) -> Option<Type> {
|
||||
|
||||
/// Helper to lower a load instruction; this is used in several places, because
|
||||
/// a load can sometimes be merged into another operation.
|
||||
pub(crate) fn lower_load<C: LowerCtx<I = Inst>, F: FnMut(&mut C, Writable<Reg>, Type, AMode)>(
|
||||
pub(crate) fn lower_load<
|
||||
C: LowerCtx<I = Inst>,
|
||||
F: FnMut(&mut C, ValueRegs<Writable<Reg>>, Type, AMode),
|
||||
>(
|
||||
ctx: &mut C,
|
||||
ir_inst: IRInst,
|
||||
inputs: &[InsnInput],
|
||||
@@ -1261,7 +1336,7 @@ pub(crate) fn lower_load<C: LowerCtx<I = Inst>, F: FnMut(&mut C, Writable<Reg>,
|
||||
|
||||
let off = ctx.data(ir_inst).load_store_offset().unwrap();
|
||||
let mem = lower_address(ctx, elem_ty, &inputs[..], off);
|
||||
let rd = get_output_reg(ctx, output).only_reg().unwrap();
|
||||
let rd = get_output_reg(ctx, output);
|
||||
|
||||
f(ctx, rd, elem_ty, mem);
|
||||
}
|
||||
|
||||
@@ -1303,56 +1303,71 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
||||
.memflags(insn)
|
||||
.expect("Load instruction should have memflags");
|
||||
|
||||
lower_load(
|
||||
ctx,
|
||||
insn,
|
||||
&inputs[..],
|
||||
outputs[0],
|
||||
|ctx, rd, elem_ty, mem| {
|
||||
let is_float = ty_has_float_or_vec_representation(elem_ty);
|
||||
ctx.emit(match (ty_bits(elem_ty), sign_extend, is_float) {
|
||||
(1, _, _) => Inst::ULoad8 { rd, mem, flags },
|
||||
(8, false, _) => Inst::ULoad8 { rd, mem, flags },
|
||||
(8, true, _) => Inst::SLoad8 { rd, mem, flags },
|
||||
(16, false, _) => Inst::ULoad16 { rd, mem, flags },
|
||||
(16, true, _) => Inst::SLoad16 { rd, mem, flags },
|
||||
(32, false, false) => Inst::ULoad32 { rd, mem, flags },
|
||||
(32, true, false) => Inst::SLoad32 { rd, mem, flags },
|
||||
(32, _, true) => Inst::FpuLoad32 { rd, mem, flags },
|
||||
(64, _, false) => Inst::ULoad64 { rd, mem, flags },
|
||||
// Note that we treat some of the vector loads as scalar floating-point loads,
|
||||
// which is correct in a little endian environment.
|
||||
(64, _, true) => Inst::FpuLoad64 { rd, mem, flags },
|
||||
(128, _, _) => Inst::FpuLoad128 { rd, mem, flags },
|
||||
_ => panic!("Unsupported size in load"),
|
||||
});
|
||||
|
||||
let vec_extend = match op {
|
||||
Opcode::Sload8x8 => Some(VecExtendOp::Sxtl8),
|
||||
Opcode::Sload8x8Complex => Some(VecExtendOp::Sxtl8),
|
||||
Opcode::Uload8x8 => Some(VecExtendOp::Uxtl8),
|
||||
Opcode::Uload8x8Complex => Some(VecExtendOp::Uxtl8),
|
||||
Opcode::Sload16x4 => Some(VecExtendOp::Sxtl16),
|
||||
Opcode::Sload16x4Complex => Some(VecExtendOp::Sxtl16),
|
||||
Opcode::Uload16x4 => Some(VecExtendOp::Uxtl16),
|
||||
Opcode::Uload16x4Complex => Some(VecExtendOp::Uxtl16),
|
||||
Opcode::Sload32x2 => Some(VecExtendOp::Sxtl32),
|
||||
Opcode::Sload32x2Complex => Some(VecExtendOp::Sxtl32),
|
||||
Opcode::Uload32x2 => Some(VecExtendOp::Uxtl32),
|
||||
Opcode::Uload32x2Complex => Some(VecExtendOp::Uxtl32),
|
||||
_ => None,
|
||||
};
|
||||
|
||||
if let Some(t) = vec_extend {
|
||||
ctx.emit(Inst::VecExtend {
|
||||
t,
|
||||
rd,
|
||||
rn: rd.to_reg(),
|
||||
high_half: false,
|
||||
let out_ty = ctx.output_ty(insn, 0);
|
||||
if out_ty == I128 {
|
||||
let off = ctx.data(insn).load_store_offset().unwrap();
|
||||
let mem = lower_pair_address(ctx, &inputs[..], off);
|
||||
let dst = get_output_reg(ctx, outputs[0]);
|
||||
ctx.emit(Inst::LoadP64 {
|
||||
rt: dst.regs()[0],
|
||||
rt2: dst.regs()[1],
|
||||
mem,
|
||||
flags,
|
||||
});
|
||||
} else {
|
||||
lower_load(
|
||||
ctx,
|
||||
insn,
|
||||
&inputs[..],
|
||||
outputs[0],
|
||||
|ctx, dst, elem_ty, mem| {
|
||||
let rd = dst.only_reg().unwrap();
|
||||
let is_float = ty_has_float_or_vec_representation(elem_ty);
|
||||
ctx.emit(match (ty_bits(elem_ty), sign_extend, is_float) {
|
||||
(1, _, _) => Inst::ULoad8 { rd, mem, flags },
|
||||
(8, false, _) => Inst::ULoad8 { rd, mem, flags },
|
||||
(8, true, _) => Inst::SLoad8 { rd, mem, flags },
|
||||
(16, false, _) => Inst::ULoad16 { rd, mem, flags },
|
||||
(16, true, _) => Inst::SLoad16 { rd, mem, flags },
|
||||
(32, false, false) => Inst::ULoad32 { rd, mem, flags },
|
||||
(32, true, false) => Inst::SLoad32 { rd, mem, flags },
|
||||
(32, _, true) => Inst::FpuLoad32 { rd, mem, flags },
|
||||
(64, _, false) => Inst::ULoad64 { rd, mem, flags },
|
||||
// Note that we treat some of the vector loads as scalar floating-point loads,
|
||||
// which is correct in a little endian environment.
|
||||
(64, _, true) => Inst::FpuLoad64 { rd, mem, flags },
|
||||
(128, _, true) => Inst::FpuLoad128 { rd, mem, flags },
|
||||
_ => panic!("Unsupported size in load"),
|
||||
});
|
||||
}
|
||||
},
|
||||
);
|
||||
|
||||
let vec_extend = match op {
|
||||
Opcode::Sload8x8 => Some(VecExtendOp::Sxtl8),
|
||||
Opcode::Sload8x8Complex => Some(VecExtendOp::Sxtl8),
|
||||
Opcode::Uload8x8 => Some(VecExtendOp::Uxtl8),
|
||||
Opcode::Uload8x8Complex => Some(VecExtendOp::Uxtl8),
|
||||
Opcode::Sload16x4 => Some(VecExtendOp::Sxtl16),
|
||||
Opcode::Sload16x4Complex => Some(VecExtendOp::Sxtl16),
|
||||
Opcode::Uload16x4 => Some(VecExtendOp::Uxtl16),
|
||||
Opcode::Uload16x4Complex => Some(VecExtendOp::Uxtl16),
|
||||
Opcode::Sload32x2 => Some(VecExtendOp::Sxtl32),
|
||||
Opcode::Sload32x2Complex => Some(VecExtendOp::Sxtl32),
|
||||
Opcode::Uload32x2 => Some(VecExtendOp::Uxtl32),
|
||||
Opcode::Uload32x2Complex => Some(VecExtendOp::Uxtl32),
|
||||
_ => None,
|
||||
};
|
||||
|
||||
if let Some(t) = vec_extend {
|
||||
let rd = dst.only_reg().unwrap();
|
||||
ctx.emit(Inst::VecExtend {
|
||||
t,
|
||||
rd,
|
||||
rn: rd.to_reg(),
|
||||
high_half: false,
|
||||
});
|
||||
}
|
||||
},
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
Opcode::Store
|
||||
@@ -1376,19 +1391,30 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
||||
.memflags(insn)
|
||||
.expect("Store instruction should have memflags");
|
||||
|
||||
let mem = lower_address(ctx, elem_ty, &inputs[1..], off);
|
||||
let rd = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
|
||||
let dst = put_input_in_regs(ctx, inputs[0]);
|
||||
|
||||
ctx.emit(match (ty_bits(elem_ty), is_float) {
|
||||
(1, _) | (8, _) => Inst::Store8 { rd, mem, flags },
|
||||
(16, _) => Inst::Store16 { rd, mem, flags },
|
||||
(32, false) => Inst::Store32 { rd, mem, flags },
|
||||
(32, true) => Inst::FpuStore32 { rd, mem, flags },
|
||||
(64, false) => Inst::Store64 { rd, mem, flags },
|
||||
(64, true) => Inst::FpuStore64 { rd, mem, flags },
|
||||
(128, _) => Inst::FpuStore128 { rd, mem, flags },
|
||||
_ => panic!("Unsupported size in store"),
|
||||
});
|
||||
if elem_ty == I128 {
|
||||
let mem = lower_pair_address(ctx, &inputs[1..], off);
|
||||
ctx.emit(Inst::StoreP64 {
|
||||
rt: dst.regs()[0],
|
||||
rt2: dst.regs()[1],
|
||||
mem,
|
||||
flags,
|
||||
});
|
||||
} else {
|
||||
let rd = dst.only_reg().unwrap();
|
||||
let mem = lower_address(ctx, elem_ty, &inputs[1..], off);
|
||||
ctx.emit(match (ty_bits(elem_ty), is_float) {
|
||||
(1, _) | (8, _) => Inst::Store8 { rd, mem, flags },
|
||||
(16, _) => Inst::Store16 { rd, mem, flags },
|
||||
(32, false) => Inst::Store32 { rd, mem, flags },
|
||||
(32, true) => Inst::FpuStore32 { rd, mem, flags },
|
||||
(64, false) => Inst::Store64 { rd, mem, flags },
|
||||
(64, true) => Inst::FpuStore64 { rd, mem, flags },
|
||||
(128, _) => Inst::FpuStore128 { rd, mem, flags },
|
||||
_ => panic!("Unsupported size in store"),
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
Opcode::StackAddr => {
|
||||
|
||||
@@ -386,3 +386,130 @@ block0(v0: i64, v1: i64, v2: i64):
|
||||
; nextln: ldrsh x0, [x0]
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
|
||||
|
||||
function %i128(i64) -> i128 {
|
||||
block0(v0: i64):
|
||||
v1 = load.i128 v0
|
||||
store.i128 v1, v0
|
||||
return v1
|
||||
}
|
||||
|
||||
; check: stp fp, lr, [sp, #-16]!
|
||||
; nextln: mov fp, sp
|
||||
; nextln: mov x1, x0
|
||||
; nextln: ldp x2, x1, [x1]
|
||||
; nextln: stp x2, x1, [x0]
|
||||
; nextln: mov x0, x2
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
|
||||
|
||||
function %i128_imm_offset(i64) -> i128 {
|
||||
block0(v0: i64):
|
||||
v1 = load.i128 v0+16
|
||||
store.i128 v1, v0+16
|
||||
return v1
|
||||
}
|
||||
|
||||
; check: stp fp, lr, [sp, #-16]!
|
||||
; nextln: mov fp, sp
|
||||
; nextln: mov x1, x0
|
||||
; nextln: ldp x2, x1, [x1, #16]
|
||||
; nextln: stp x2, x1, [x0, #16]
|
||||
; nextln: mov x0, x2
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
|
||||
function %i128_imm_offset_large(i64) -> i128 {
|
||||
block0(v0: i64):
|
||||
v1 = load.i128 v0+504
|
||||
store.i128 v1, v0+504
|
||||
return v1
|
||||
}
|
||||
|
||||
; check: stp fp, lr, [sp, #-16]!
|
||||
; nextln: mov fp, sp
|
||||
; nextln: mov x1, x0
|
||||
; nextln: ldp x2, x1, [x1, #504]
|
||||
; nextln: stp x2, x1, [x0, #504]
|
||||
; nextln: mov x0, x2
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
|
||||
function %i128_imm_offset_negative_large(i64) -> i128 {
|
||||
block0(v0: i64):
|
||||
v1 = load.i128 v0-512
|
||||
store.i128 v1, v0-512
|
||||
return v1
|
||||
}
|
||||
|
||||
; check: stp fp, lr, [sp, #-16]!
|
||||
; nextln: mov fp, sp
|
||||
; nextln: mov x1, x0
|
||||
; nextln: ldp x2, x1, [x1, #-512]
|
||||
; nextln: stp x2, x1, [x0, #-512]
|
||||
; nextln: mov x0, x2
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
|
||||
|
||||
function %i128_add_offset(i64) -> i128 {
|
||||
block0(v0: i64):
|
||||
v1 = iadd_imm v0, 32
|
||||
v2 = load.i128 v1
|
||||
store.i128 v2, v1
|
||||
return v2
|
||||
}
|
||||
|
||||
; check: stp fp, lr, [sp, #-16]!
|
||||
; nextln: mov fp, sp
|
||||
; nextln: mov x1, x0
|
||||
; nextln: ldp x2, x1, [x1, #32]
|
||||
; nextln: stp x2, x1, [x0, #32]
|
||||
; nextln: mov x0, x2
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
|
||||
|
||||
function %i128_32bit_sextend_simple(i32) -> i128 {
|
||||
block0(v0: i32):
|
||||
v1 = sextend.i64 v0
|
||||
v2 = load.i128 v1
|
||||
store.i128 v2, v1
|
||||
return v2
|
||||
}
|
||||
|
||||
; TODO: We should be able to deduplicate the sxtw instruction
|
||||
; check: stp fp, lr, [sp, #-16]!
|
||||
; nextln: mov fp, sp
|
||||
; nextln: sxtw x1, w0
|
||||
; nextln: ldp x2, x1, [x1]
|
||||
; nextln: sxtw x0, w0
|
||||
; nextln: stp x2, x1, [x0]
|
||||
; nextln: mov x0, x2
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
|
||||
|
||||
function %i128_32bit_sextend(i64, i32) -> i128 {
|
||||
block0(v0: i64, v1: i32):
|
||||
v2 = sextend.i64 v1
|
||||
v3 = iadd.i64 v0, v2
|
||||
v4 = iadd_imm.i64 v3, 24
|
||||
v5 = load.i128 v4
|
||||
store.i128 v5, v4
|
||||
return v5
|
||||
}
|
||||
|
||||
; check: stp fp, lr, [sp, #-16]!
|
||||
; nextln: mov fp, sp
|
||||
; nextln: mov x2, x0
|
||||
; nextln: add x2, x2, x1, SXTW
|
||||
; nextln: ldp x3, x2, [x2, #24]
|
||||
; nextln: add x0, x0, x1, SXTW
|
||||
; nextln: stp x3, x2, [x0, #24]
|
||||
; nextln: mov x0, x3
|
||||
; nextln: mov x1, x2
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
|
||||
@@ -276,3 +276,167 @@ block0(v0: b1):
|
||||
|
||||
return v0, v137
|
||||
}
|
||||
|
||||
|
||||
|
||||
function %i128_stack_store(i128) {
|
||||
ss0 = explicit_slot 16
|
||||
|
||||
block0(v0: i128):
|
||||
stack_store.i128 v0, ss0
|
||||
return
|
||||
}
|
||||
; TODO: Codegen improvement opportunities: This should be just a stp x0, x1, [sp, #-16]
|
||||
; check: stp fp, lr, [sp, #-16]!
|
||||
; nextln: mov fp, sp
|
||||
; nextln: sub sp, sp, #16
|
||||
; nextln: mov x2, sp
|
||||
; nextln: stp x0, x1, [x2]
|
||||
; nextln: add sp, sp, #16
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
|
||||
|
||||
function %i128_stack_store_slot_offset(i128) {
|
||||
ss0 = explicit_slot 16, offset 16
|
||||
|
||||
block0(v0: i128):
|
||||
stack_store.i128 v0, ss0
|
||||
return
|
||||
}
|
||||
; check: stp fp, lr, [sp, #-16]!
|
||||
; nextln: mov fp, sp
|
||||
; nextln: sub sp, sp, #16
|
||||
; nextln: mov x2, sp
|
||||
; nextln: stp x0, x1, [x2]
|
||||
; nextln: add sp, sp, #16
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
|
||||
|
||||
function %i128_stack_store_inst_offset(i128) {
|
||||
ss0 = explicit_slot 16
|
||||
ss1 = explicit_slot 16
|
||||
|
||||
block0(v0: i128):
|
||||
stack_store.i128 v0, ss1+16
|
||||
return
|
||||
}
|
||||
; check: stp fp, lr, [sp, #-16]!
|
||||
; nextln: mov fp, sp
|
||||
; nextln: sub sp, sp, #32
|
||||
; nextln: add x2, sp, #32
|
||||
; nextln: stp x0, x1, [x2]
|
||||
; nextln: add sp, sp, #32
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
|
||||
|
||||
function %i128_stack_store_big(i128) {
|
||||
ss0 = explicit_slot 100000
|
||||
ss1 = explicit_slot 8
|
||||
|
||||
block0(v0: i128):
|
||||
stack_store.i128 v0, ss0
|
||||
return
|
||||
}
|
||||
; check: stp fp, lr, [sp, #-16]!
|
||||
; nextln: mov fp, sp
|
||||
; nextln: movz w16, #34480
|
||||
; nextln: movk w16, #1, LSL #16
|
||||
; nextln: sub sp, sp, x16, UXTX
|
||||
; nextln: mov x2, sp
|
||||
; nextln: stp x0, x1, [x2]
|
||||
; nextln: movz w16, #34480
|
||||
; nextln: movk w16, #1, LSL #16
|
||||
; nextln: add sp, sp, x16, UXTX
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
|
||||
|
||||
|
||||
function %i128_stack_load() -> i128 {
|
||||
ss0 = explicit_slot 16
|
||||
|
||||
block0:
|
||||
v0 = stack_load.i128 ss0
|
||||
return v0
|
||||
}
|
||||
; TODO: Codegen improvement opportunities: This should be just a ldp x0, x1, [sp, #-16]
|
||||
; check: stp fp, lr, [sp, #-16]!
|
||||
; nextln: mov fp, sp
|
||||
; nextln: sub sp, sp, #16
|
||||
; nextln: mov x0, sp
|
||||
; nextln: ldp x1, x0, [x0]
|
||||
; nextln: mov x2, x0
|
||||
; nextln: mov x0, x1
|
||||
; nextln: mov x1, x2
|
||||
; nextln: add sp, sp, #16
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
|
||||
|
||||
function %i128_stack_load_slot_offset() -> i128 {
|
||||
ss0 = explicit_slot 16, offset 16
|
||||
|
||||
block0:
|
||||
v0 = stack_load.i128 ss0
|
||||
return v0
|
||||
}
|
||||
; check: stp fp, lr, [sp, #-16]!
|
||||
; nextln: mov fp, sp
|
||||
; nextln: sub sp, sp, #16
|
||||
; nextln: mov x0, sp
|
||||
; nextln: ldp x1, x0, [x0]
|
||||
; nextln: mov x2, x0
|
||||
; nextln: mov x0, x1
|
||||
; nextln: mov x1, x2
|
||||
; nextln: add sp, sp, #16
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
|
||||
|
||||
|
||||
function %i128_stack_load_inst_offset() -> i128 {
|
||||
ss0 = explicit_slot 16
|
||||
ss1 = explicit_slot 16
|
||||
|
||||
block0:
|
||||
v0 = stack_load.i128 ss1+16
|
||||
return v0
|
||||
}
|
||||
; check: stp fp, lr, [sp, #-16]!
|
||||
; nextln: mov fp, sp
|
||||
; nextln: sub sp, sp, #32
|
||||
; nextln: add x0, sp, #32
|
||||
; nextln: ldp x1, x0, [x0]
|
||||
; nextln: mov x2, x0
|
||||
; nextln: mov x0, x1
|
||||
; nextln: mov x1, x2
|
||||
; nextln: add sp, sp, #32
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
|
||||
|
||||
|
||||
function %i128_stack_load_big() -> i128 {
|
||||
ss0 = explicit_slot 100000
|
||||
ss1 = explicit_slot 8
|
||||
|
||||
block0:
|
||||
v0 = stack_load.i128 ss0
|
||||
return v0
|
||||
}
|
||||
; check: stp fp, lr, [sp, #-16]!
|
||||
; nextln: mov fp, sp
|
||||
; nextln: movz w16, #34480
|
||||
; nextln: movk w16, #1, LSL #16
|
||||
; nextln: sub sp, sp, x16, UXTX
|
||||
; nextln: mov x0, sp
|
||||
; nextln: ldp x1, x0, [x0]
|
||||
; nextln: mov x2, x0
|
||||
; nextln: mov x0, x1
|
||||
; nextln: mov x1, x2
|
||||
; nextln: movz w16, #34480
|
||||
; nextln: movk w16, #1, LSL #16
|
||||
; nextln: add sp, sp, x16, UXTX
|
||||
; nextln: ldp fp, lr, [sp], #16
|
||||
; nextln: ret
|
||||
|
||||
136
cranelift/filetests/filetests/runtests/i128-load-store.clif
Normal file
136
cranelift/filetests/filetests/runtests/i128-load-store.clif
Normal file
@@ -0,0 +1,136 @@
|
||||
test run
|
||||
target x86_64 machinst
|
||||
target aarch64
|
||||
|
||||
function %i128_stack_store_load(i64, i64) -> b1 {
|
||||
ss0 = explicit_slot 16
|
||||
|
||||
block0(v0: i64,v1: i64):
|
||||
v2 = iconcat v0, v1
|
||||
|
||||
stack_store.i128 v2, ss0
|
||||
v3 = stack_load.i128 ss0
|
||||
|
||||
v4 = icmp.i128 eq v2, v3
|
||||
return v4
|
||||
}
|
||||
; run: %i128_stack_store_load(0, 0) == true
|
||||
; run: %i128_stack_store_load(-1, -1) == true
|
||||
; run: %i128_stack_store_load(-1, 0) == true
|
||||
; run: %i128_stack_store_load(0, -1) == true
|
||||
; run: %i128_stack_store_load(0x01234567_89ABCDEF, 0xFEDCBA98_76543210) == true
|
||||
; run: %i128_stack_store_load(0x06060606_06060606, 0xA00A00A0_0A00A00A) == true
|
||||
; run: %i128_stack_store_load(0xC0FFEEEE_DECAFFFF, 0xDECAFFFF_C0FFEEEE) == true
|
||||
|
||||
|
||||
function %i128_stack_store_load_offset(i64, i64) -> b1 {
|
||||
ss0 = explicit_slot 16, offset 16
|
||||
|
||||
block0(v0: i64,v1: i64):
|
||||
v2 = iconcat v0, v1
|
||||
|
||||
stack_store.i128 v2, ss0
|
||||
v3 = stack_load.i128 ss0
|
||||
|
||||
v4 = icmp.i128 eq v2, v3
|
||||
return v4
|
||||
}
|
||||
; run: %i128_stack_store_load_offset(0, 0) == true
|
||||
; run: %i128_stack_store_load_offset(-1, -1) == true
|
||||
; run: %i128_stack_store_load_offset(-1, 0) == true
|
||||
; run: %i128_stack_store_load_offset(0, -1) == true
|
||||
; run: %i128_stack_store_load_offset(0x01234567_89ABCDEF, 0xFEDCBA98_76543210) == true
|
||||
; run: %i128_stack_store_load_offset(0x06060606_06060606, 0xA00A00A0_0A00A00A) == true
|
||||
; run: %i128_stack_store_load_offset(0xC0FFEEEE_DECAFFFF, 0xDECAFFFF_C0FFEEEE) == true
|
||||
|
||||
|
||||
function %i128_stack_store_load_inst_offset(i64, i64) -> b1 {
|
||||
ss0 = explicit_slot 16
|
||||
ss1 = explicit_slot 16
|
||||
ss2 = explicit_slot 16
|
||||
|
||||
block0(v0: i64,v1: i64):
|
||||
v2 = iconcat v0, v1
|
||||
|
||||
stack_store.i128 v2, ss1+16
|
||||
v3 = stack_load.i128 ss1+16
|
||||
|
||||
v4 = icmp.i128 eq v2, v3
|
||||
return v4
|
||||
}
|
||||
; run: %i128_stack_store_load_inst_offset(0, 0) == true
|
||||
; run: %i128_stack_store_load_inst_offset(-1, -1) == true
|
||||
; run: %i128_stack_store_load_inst_offset(-1, 0) == true
|
||||
; run: %i128_stack_store_load_inst_offset(0, -1) == true
|
||||
; run: %i128_stack_store_load_inst_offset(0x01234567_89ABCDEF, 0xFEDCBA98_76543210) == true
|
||||
; run: %i128_stack_store_load_inst_offset(0x06060606_06060606, 0xA00A00A0_0A00A00A) == true
|
||||
; run: %i128_stack_store_load_inst_offset(0xC0FFEEEE_DECAFFFF, 0xDECAFFFF_C0FFEEEE) == true
|
||||
|
||||
|
||||
; Some arches (aarch64) try to encode the offset into the load/store instructions
|
||||
; test that we spill if the offset is too large and doesn't fit in the instruction
|
||||
function %i128_stack_store_load_big_offset(i64, i64) -> b1 {
|
||||
ss0 = explicit_slot 100000
|
||||
ss1 = explicit_slot 8
|
||||
|
||||
block0(v0: i64,v1: i64):
|
||||
v2 = iconcat v0, v1
|
||||
|
||||
stack_store.i128 v2, ss0
|
||||
v3 = stack_load.i128 ss0
|
||||
|
||||
v4 = icmp.i128 eq v2, v3
|
||||
return v4
|
||||
}
|
||||
; run: %i128_stack_store_load_big_offset(0, 0) == true
|
||||
; run: %i128_stack_store_load_big_offset(-1, -1) == true
|
||||
; run: %i128_stack_store_load_big_offset(-1, 0) == true
|
||||
; run: %i128_stack_store_load_big_offset(0, -1) == true
|
||||
; run: %i128_stack_store_load_big_offset(0x01234567_89ABCDEF, 0xFEDCBA98_76543210) == true
|
||||
; run: %i128_stack_store_load_big_offset(0x06060606_06060606, 0xA00A00A0_0A00A00A) == true
|
||||
; run: %i128_stack_store_load_big_offset(0xC0FFEEEE_DECAFFFF, 0xDECAFFFF_C0FFEEEE) == true
|
||||
|
||||
|
||||
|
||||
function %i128_store_load(i64, i64) -> b1 {
|
||||
ss0 = explicit_slot 16
|
||||
|
||||
block0(v0: i64,v1: i64):
|
||||
v2 = iconcat v0, v1
|
||||
|
||||
v3 = stack_addr.i64 ss0
|
||||
store.i128 v2, v3
|
||||
v4 = load.i128 v3
|
||||
|
||||
v5 = icmp.i128 eq v2, v4
|
||||
return v5
|
||||
}
|
||||
; run: %i128_store_load(0, 0) == true
|
||||
; run: %i128_store_load(-1, -1) == true
|
||||
; run: %i128_store_load(-1, 0) == true
|
||||
; run: %i128_store_load(0, -1) == true
|
||||
; run: %i128_store_load(0x01234567_89ABCDEF, 0xFEDCBA98_76543210) == true
|
||||
; run: %i128_store_load(0x06060606_06060606, 0xA00A00A0_0A00A00A) == true
|
||||
; run: %i128_store_load(0xC0FFEEEE_DECAFFFF, 0xDECAFFFF_C0FFEEEE) == true
|
||||
|
||||
|
||||
function %i128_store_load_offset(i64, i64) -> b1 {
|
||||
ss0 = explicit_slot 32
|
||||
|
||||
block0(v0: i64,v1: i64):
|
||||
v2 = iconcat v0, v1
|
||||
|
||||
v3 = stack_addr.i64 ss0
|
||||
store.i128 v2, v3+16
|
||||
v4 = load.i128 v3+16
|
||||
|
||||
v5 = icmp.i128 eq v2, v4
|
||||
return v5
|
||||
}
|
||||
; run: %i128_store_load_offset(0, 0) == true
|
||||
; run: %i128_store_load_offset(-1, -1) == true
|
||||
; run: %i128_store_load_offset(-1, 0) == true
|
||||
; run: %i128_store_load_offset(0, -1) == true
|
||||
; run: %i128_store_load_offset(0x01234567_89ABCDEF, 0xFEDCBA98_76543210) == true
|
||||
; run: %i128_store_load_offset(0x06060606_06060606, 0xA00A00A0_0A00A00A) == true
|
||||
; run: %i128_store_load_offset(0xC0FFEEEE_DECAFFFF, 0xDECAFFFF_C0FFEEEE) == true
|
||||
Reference in New Issue
Block a user