We have three different approaches depending on the type of comparison requested:

* For eq/ne we compare the high bits and the low bits and check that both are equal.
* For overflow checks, we perform an i128 add and check the resulting overflow flag.
* For the remaining comparisons (gt/lt/sgt/etc.), we compare both the low bits and the high bits; if the high bits are equal, we return the result of the unsigned comparison on the low bits.

As with other i128 ops, we are still missing immlogic support.
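Roughly, the three strategies emit the following sequences (a sketch; register names are illustrative, and `cc` stands for the requested condition):

```asm
;; eq/ne: fold a 128-bit equality check into one flags-setting add
eor  tmp1, lhs_lo, rhs_lo
eor  tmp2, lhs_hi, rhs_hi
adds xzr, tmp1, tmp2
cset dst, eq              ; or ne

;; of/nof: perform the 128-bit add, keep only the overflow flag
adds xzr, lhs_lo, rhs_lo
adcs xzr, lhs_hi, rhs_hi
cset dst, vs              ; or vc

;; gt/lt/...: the high halves decide, unless they are equal
cmp  lhs_lo, rhs_lo
cset tmp1, <unsigned cc>
cmp  lhs_hi, rhs_hi
cset tmp2, <cc>
csel dst, tmp1, tmp2, eq
```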
//! Lower a single Cranelift instruction into vcode.

use crate::binemit::CodeOffset;
use crate::ir::condcodes::{FloatCC, IntCC};
use crate::ir::types::*;
use crate::ir::Inst as IRInst;
use crate::ir::{InstructionData, Opcode, TrapCode};
use crate::isa::aarch64::settings as aarch64_settings;
use crate::machinst::lower::*;
use crate::machinst::*;
use crate::settings::Flags;
use crate::{CodegenError, CodegenResult};

use crate::isa::aarch64::abi::*;
use crate::isa::aarch64::inst::*;

use regalloc::Writable;

use alloc::boxed::Box;
use alloc::vec::Vec;
use core::convert::TryFrom;

use super::lower::*;

/// Actually codegen an instruction's results into registers.
pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    insn: IRInst,
    flags: &Flags,
    isa_flags: &aarch64_settings::Flags,
) -> CodegenResult<()> {
    let op = ctx.data(insn).opcode();
    let inputs = insn_inputs(ctx, insn);
    let outputs = insn_outputs(ctx, insn);
    let ty = if outputs.len() > 0 {
        Some(ctx.output_ty(insn, 0))
    } else {
        None
    };

    match op {
        Opcode::Iconst | Opcode::Bconst | Opcode::Null => {
            let value = ctx.get_constant(insn).unwrap();
            // Sign extend constant if necessary
            let value = match ty.unwrap() {
                I8 => (((value as i64) << 56) >> 56) as u64,
                I16 => (((value as i64) << 48) >> 48) as u64,
                I32 => (((value as i64) << 32) >> 32) as u64,
                I64 | R64 => value,
                ty if ty.is_bool() => value,
                ty => unreachable!("Unknown type for const: {}", ty),
            };
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            lower_constant_u64(ctx, rd, value);
        }
        Opcode::F32const => {
            let value = f32::from_bits(ctx.get_constant(insn).unwrap() as u32);
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            lower_constant_f32(ctx, rd, value);
        }
        Opcode::F64const => {
            let value = f64::from_bits(ctx.get_constant(insn).unwrap());
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            lower_constant_f64(ctx, rd, value);
        }
        Opcode::Iadd => {
            match ty.unwrap() {
                ty if ty.is_vector() => {
                    let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
                    let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
                    let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                    ctx.emit(Inst::VecRRR {
                        rd,
                        rn,
                        rm,
                        alu_op: VecALUOp::Add,
                        size: VectorSize::from_ty(ty),
                    });
                }
                I128 => {
                    let lhs = put_input_in_regs(ctx, inputs[0]);
                    let rhs = put_input_in_regs(ctx, inputs[1]);
                    let dst = get_output_reg(ctx, outputs[0]);
                    assert_eq!(lhs.len(), 2);
                    assert_eq!(rhs.len(), 2);
                    assert_eq!(dst.len(), 2);

                    // adds x0, x0, x2
                    // adc x1, x1, x3
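                    // The low-half `adds` sets the carry flag that the
                    // high-half `adc` then consumes.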

                    ctx.emit(Inst::AluRRR {
                        alu_op: ALUOp::AddS64,
                        rd: dst.regs()[0],
                        rn: lhs.regs()[0],
                        rm: rhs.regs()[0],
                    });
                    ctx.emit(Inst::AluRRR {
                        alu_op: ALUOp::Adc64,
                        rd: dst.regs()[1],
                        rn: lhs.regs()[1],
                        rm: rhs.regs()[1],
                    });
                }
                ty => {
                    let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
                    let mul_insn = if let Some(mul_insn) =
                        maybe_input_insn(ctx, inputs[1], Opcode::Imul)
                    {
                        Some((mul_insn, 0))
                    } else if let Some(mul_insn) = maybe_input_insn(ctx, inputs[0], Opcode::Imul) {
                        Some((mul_insn, 1))
                    } else {
                        None
                    };
                    // If possible combine mul + add into madd.
                    if let Some((insn, addend_idx)) = mul_insn {
                        let alu_op = choose_32_64(ty, ALUOp3::MAdd32, ALUOp3::MAdd64);
                        let rn_input = InsnInput { insn, input: 0 };
                        let rm_input = InsnInput { insn, input: 1 };

                        let rn = put_input_in_reg(ctx, rn_input, NarrowValueMode::None);
                        let rm = put_input_in_reg(ctx, rm_input, NarrowValueMode::None);
                        let ra = put_input_in_reg(ctx, inputs[addend_idx], NarrowValueMode::None);

                        ctx.emit(Inst::AluRRRR {
                            alu_op,
                            rd,
                            rn,
                            rm,
                            ra,
                        });
                    } else {
                        let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                        let (rm, negated) = put_input_in_rse_imm12_maybe_negated(
                            ctx,
                            inputs[1],
                            ty_bits(ty),
                            NarrowValueMode::None,
                        );
                        let alu_op = if !negated {
                            choose_32_64(ty, ALUOp::Add32, ALUOp::Add64)
                        } else {
                            choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64)
                        };
                        ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm));
                    }
                }
            }
        }
        Opcode::Isub => {
            let ty = ty.unwrap();
            if ty == I128 {
                let lhs = put_input_in_regs(ctx, inputs[0]);
                let rhs = put_input_in_regs(ctx, inputs[1]);
                let dst = get_output_reg(ctx, outputs[0]);
                assert_eq!(lhs.len(), 2);
                assert_eq!(rhs.len(), 2);
                assert_eq!(dst.len(), 2);

                // subs x0, x0, x2
                // sbc x1, x1, x3
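                // The low-half `subs` sets the carry (inverted-borrow) flag
                // that the high-half `sbc` then consumes.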

                ctx.emit(Inst::AluRRR {
                    alu_op: ALUOp::SubS64,
                    rd: dst.regs()[0],
                    rn: lhs.regs()[0],
                    rm: rhs.regs()[0],
                });
                ctx.emit(Inst::AluRRR {
                    alu_op: ALUOp::Sbc64,
                    rd: dst.regs()[1],
                    rn: lhs.regs()[1],
                    rm: rhs.regs()[1],
                });
            } else {
                let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
                let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                if !ty.is_vector() {
                    let (rm, negated) = put_input_in_rse_imm12_maybe_negated(
                        ctx,
                        inputs[1],
                        ty_bits(ty),
                        NarrowValueMode::None,
                    );
                    let alu_op = if !negated {
                        choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64)
                    } else {
                        choose_32_64(ty, ALUOp::Add32, ALUOp::Add64)
                    };
                    ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm));
                } else {
                    let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
                    ctx.emit(Inst::VecRRR {
                        rd,
                        rn,
                        rm,
                        alu_op: VecALUOp::Sub,
                        size: VectorSize::from_ty(ty),
                    });
                }
            }
        }
        Opcode::UaddSat | Opcode::SaddSat | Opcode::UsubSat | Opcode::SsubSat => {
            let ty = ty.unwrap();
            assert!(ty.is_vector());
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);

            let alu_op = match op {
                Opcode::UaddSat => VecALUOp::Uqadd,
                Opcode::SaddSat => VecALUOp::Sqadd,
                Opcode::UsubSat => VecALUOp::Uqsub,
                Opcode::SsubSat => VecALUOp::Sqsub,
                _ => unreachable!(),
            };

            ctx.emit(Inst::VecRRR {
                rd,
                rn,
                rm,
                alu_op,
                size: VectorSize::from_ty(ty),
            });
        }

        Opcode::Ineg => {
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let ty = ty.unwrap();
            if !ty.is_vector() {
                let rn = zero_reg();
                let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                let alu_op = choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64);
                ctx.emit(Inst::AluRRR { alu_op, rd, rn, rm });
            } else {
                let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                ctx.emit(Inst::VecMisc {
                    op: VecMisc2::Neg,
                    rd,
                    rn,
                    size: VectorSize::from_ty(ty),
                });
            }
        }

        Opcode::Imul => {
            let lhs = put_input_in_regs(ctx, inputs[0]);
            let rhs = put_input_in_regs(ctx, inputs[1]);
            let dst = get_output_reg(ctx, outputs[0]);

            let rd = dst.regs()[0];
            let rn = lhs.regs()[0];
            let rm = rhs.regs()[0];

            let ty = ty.unwrap();
            match ty {
                I128 => {
                    assert_eq!(lhs.len(), 2);
                    assert_eq!(rhs.len(), 2);
                    assert_eq!(dst.len(), 2);

                    // 128bit mul formula:
                    // dst_lo = lhs_lo * rhs_lo
                    // dst_hi = umulhi(lhs_lo, rhs_lo) + (lhs_lo * rhs_hi) + (lhs_hi * rhs_lo)
                    //
                    // We can convert the above formula into the following
                    // umulh dst_hi, lhs_lo, rhs_lo
                    // madd dst_hi, lhs_lo, rhs_hi, dst_hi
                    // madd dst_hi, lhs_hi, rhs_lo, dst_hi
                    // mul dst_lo, lhs_lo, rhs_lo

                    ctx.emit(Inst::AluRRR {
                        alu_op: ALUOp::UMulH,
                        rd: dst.regs()[1],
                        rn: lhs.regs()[0],
                        rm: rhs.regs()[0],
                    });
                    ctx.emit(Inst::AluRRRR {
                        alu_op: ALUOp3::MAdd64,
                        rd: dst.regs()[1],
                        rn: lhs.regs()[0],
                        rm: rhs.regs()[1],
                        ra: dst.regs()[1].to_reg(),
                    });
                    ctx.emit(Inst::AluRRRR {
                        alu_op: ALUOp3::MAdd64,
                        rd: dst.regs()[1],
                        rn: lhs.regs()[1],
                        rm: rhs.regs()[0],
                        ra: dst.regs()[1].to_reg(),
                    });
                    ctx.emit(Inst::AluRRRR {
                        alu_op: ALUOp3::MAdd64,
                        rd: dst.regs()[0],
                        rn: lhs.regs()[0],
                        rm: rhs.regs()[0],
                        ra: zero_reg(),
                    });
                }
                ty if !ty.is_vector() => {
                    let alu_op = choose_32_64(ty, ALUOp3::MAdd32, ALUOp3::MAdd64);
                    ctx.emit(Inst::AluRRRR {
                        alu_op,
                        rd,
                        rn,
                        rm,
                        ra: zero_reg(),
                    });
                }
                I64X2 => {
                    let tmp1 = ctx.alloc_tmp(I64X2).only_reg().unwrap();
                    let tmp2 = ctx.alloc_tmp(I64X2).only_reg().unwrap();

                    // This I64X2 multiplication is performed with several 32-bit
                    // operations.

                    // 64-bit numbers x and y, can be represented as:
                    // x = a + 2^32(b)
                    // y = c + 2^32(d)

                    // A 64-bit multiplication is:
                    // x * y = ac + 2^32(ad + bc) + 2^64(bd)
                    // note: `2^64(bd)` can be ignored, the value is too large to fit in
                    // 64 bits.

                    // This sequence implements a I64X2 multiply, where the registers
                    // `rn` and `rm` are split up into 32-bit components:
                    // rn = |d|c|b|a|
                    // rm = |h|g|f|e|
                    //
                    // rn * rm = |cg + 2^32(ch + dg)|ae + 2^32(af + be)|
                    //
                    // The sequence is:
                    // rev64 rd.4s, rm.4s
                    // mul rd.4s, rd.4s, rn.4s
                    // xtn tmp1.2s, rn.2d
                    // addp rd.4s, rd.4s, rd.4s
                    // xtn tmp2.2s, rm.2d
                    // shll rd.2d, rd.2s, #32
                    // umlal rd.2d, tmp2.2s, tmp1.2s

                    // Reverse the 32-bit elements in the 64-bit words.
                    // rd = |g|h|e|f|
                    ctx.emit(Inst::VecMisc {
                        op: VecMisc2::Rev64,
                        rd,
                        rn: rm,
                        size: VectorSize::Size32x4,
                    });

                    // Calculate the high half components.
                    // rd = |dg|ch|be|af|
                    //
                    // Note that this 32-bit multiply of the high half
                    // discards the bits that would overflow, same as
                    // if 64-bit operations were used. Also the Shll
                    // below would shift out the overflow bits anyway.
                    ctx.emit(Inst::VecRRR {
                        alu_op: VecALUOp::Mul,
                        rd,
                        rn: rd.to_reg(),
                        rm: rn,
                        size: VectorSize::Size32x4,
                    });

                    // Extract the low half components of rn.
                    // tmp1 = |c|a|
                    ctx.emit(Inst::VecMiscNarrow {
                        op: VecMiscNarrowOp::Xtn,
                        rd: tmp1,
                        rn,
                        size: VectorSize::Size32x2,
                        high_half: false,
                    });

                    // Sum the respective high half components.
                    // rd = |dg+ch|be+af||dg+ch|be+af|
                    ctx.emit(Inst::VecRRR {
                        alu_op: VecALUOp::Addp,
                        rd: rd,
                        rn: rd.to_reg(),
                        rm: rd.to_reg(),
                        size: VectorSize::Size32x4,
                    });

                    // Extract the low half components of rm.
                    // tmp2 = |g|e|
                    ctx.emit(Inst::VecMiscNarrow {
                        op: VecMiscNarrowOp::Xtn,
                        rd: tmp2,
                        rn: rm,
                        size: VectorSize::Size32x2,
                        high_half: false,
                    });

                    // Shift the high half components, into the high half.
                    // rd = |dg+ch << 32|be+af << 32|
                    ctx.emit(Inst::VecMisc {
                        op: VecMisc2::Shll,
                        rd,
                        rn: rd.to_reg(),
                        size: VectorSize::Size32x2,
                    });

                    // Multiply the low components together, and accumulate with the high
                    // half.
                    // rd = |rd[1] + cg|rd[0] + ae|
                    ctx.emit(Inst::VecRRR {
                        alu_op: VecALUOp::Umlal,
                        rd,
                        rn: tmp2.to_reg(),
                        rm: tmp1.to_reg(),
                        size: VectorSize::Size32x2,
                    });
                }
                ty if ty.is_vector() => {
                    ctx.emit(Inst::VecRRR {
                        alu_op: VecALUOp::Mul,
                        rd,
                        rn,
                        rm,
                        size: VectorSize::from_ty(ty),
                    });
                }
                _ => panic!("Unable to emit mul for {}", ty),
            }
        }

        Opcode::Umulhi | Opcode::Smulhi => {
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let is_signed = op == Opcode::Smulhi;
            let input_ty = ctx.input_ty(insn, 0);
            assert!(ctx.input_ty(insn, 1) == input_ty);
            assert!(ctx.output_ty(insn, 0) == input_ty);

            match input_ty {
                I64 => {
                    let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                    let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
                    let alu_op = if is_signed {
                        ALUOp::SMulH
                    } else {
                        ALUOp::UMulH
                    };
                    ctx.emit(Inst::AluRRR { alu_op, rd, rn, rm });
                }
                I32 | I16 | I8 => {
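                    // There is no narrow multiply-high instruction: extend both
                    // operands to 64 bits, take the full 64-bit product, then
                    // shift it right by the operand width to leave the high half.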
                    let narrow_mode = if is_signed {
                        NarrowValueMode::SignExtend64
                    } else {
                        NarrowValueMode::ZeroExtend64
                    };
                    let rn = put_input_in_reg(ctx, inputs[0], narrow_mode);
                    let rm = put_input_in_reg(ctx, inputs[1], narrow_mode);
                    let ra = zero_reg();
                    ctx.emit(Inst::AluRRRR {
                        alu_op: ALUOp3::MAdd64,
                        rd,
                        rn,
                        rm,
                        ra,
                    });
                    let shift_op = if is_signed {
                        ALUOp::Asr64
                    } else {
                        ALUOp::Lsr64
                    };
                    let shift_amt = match input_ty {
                        I32 => 32,
                        I16 => 16,
                        I8 => 8,
                        _ => unreachable!(),
                    };
                    ctx.emit(Inst::AluRRImmShift {
                        alu_op: shift_op,
                        rd,
                        rn: rd.to_reg(),
                        immshift: ImmShift::maybe_from_u64(shift_amt).unwrap(),
                    });
                }
                _ => {
                    panic!("Unsupported argument type for umulhi/smulhi: {}", input_ty);
                }
            }
        }

        Opcode::Udiv | Opcode::Sdiv | Opcode::Urem | Opcode::Srem => {
            let is_signed = match op {
                Opcode::Udiv | Opcode::Urem => false,
                Opcode::Sdiv | Opcode::Srem => true,
                _ => unreachable!(),
            };
            let is_rem = match op {
                Opcode::Udiv | Opcode::Sdiv => false,
                Opcode::Urem | Opcode::Srem => true,
                _ => unreachable!(),
            };
            let narrow_mode = if is_signed {
                NarrowValueMode::SignExtend64
            } else {
                NarrowValueMode::ZeroExtend64
            };
            // TODO: Add SDiv32 to implement 32-bit directly, rather
            // than extending the input.
            let div_op = if is_signed {
                ALUOp::SDiv64
            } else {
                ALUOp::UDiv64
            };

            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let rn = put_input_in_reg(ctx, inputs[0], narrow_mode);
            let rm = put_input_in_reg(ctx, inputs[1], narrow_mode);
            // The div instruction does not trap on divide by zero or signed overflow
            // so checks are inserted below.
            //
            // div rd, rn, rm
            ctx.emit(Inst::AluRRR {
                alu_op: div_op,
                rd,
                rn,
                rm,
            });

            if is_rem {
                // Remainder (rn % rm) is implemented as:
                //
                // tmp = rn / rm
                // rd = rn - (tmp*rm)
                //
                // use 'rd' for tmp and you have:
                //
                // div rd, rn, rm ; rd = rn / rm
                // cbnz rm, #8 ; branch over trap
                // udf ; divide by zero
                // msub rd, rd, rm, rn ; rd = rn - rd * rm

                // Check for divide by 0.
                let trap_code = TrapCode::IntegerDivisionByZero;
                ctx.emit(Inst::TrapIf {
                    trap_code,
                    kind: CondBrKind::Zero(rm),
                });

                ctx.emit(Inst::AluRRRR {
                    alu_op: ALUOp3::MSub64,
                    rd: rd,
                    rn: rd.to_reg(),
                    rm: rm,
                    ra: rn,
                });
            } else {
                if div_op == ALUOp::SDiv64 {
                    // cbnz rm, #8
                    // udf ; divide by zero
                    // cmn rm, 1
                    // ccmp rn, 1, #nzcv, eq
                    // b.vc #8
                    // udf ; signed overflow

                    // Check for divide by 0.
                    let trap_code = TrapCode::IntegerDivisionByZero;
                    ctx.emit(Inst::TrapIf {
                        trap_code,
                        kind: CondBrKind::Zero(rm),
                    });

                    // Check for signed overflow. The only case is min_value / -1.
                    let ty = ty.unwrap();
                    // The following checks must be done in 32-bit or 64-bit, depending
                    // on the input type. Even though the initial div instruction is
                    // always done in 64-bit currently.
                    let size = OperandSize::from_ty(ty);
                    // Check RHS is -1.
                    ctx.emit(Inst::AluRRImm12 {
                        alu_op: choose_32_64(ty, ALUOp::AddS32, ALUOp::AddS64),
                        rd: writable_zero_reg(),
                        rn: rm,
                        imm12: Imm12::maybe_from_u64(1).unwrap(),
                    });
                    // Check LHS is min_value, by subtracting 1 and branching if
                    // there is overflow.
                    ctx.emit(Inst::CCmpImm {
                        size,
                        rn,
                        imm: UImm5::maybe_from_u8(1).unwrap(),
                        nzcv: NZCV::new(false, false, false, false),
                        cond: Cond::Eq,
                    });
                    let trap_code = TrapCode::IntegerOverflow;
                    ctx.emit(Inst::TrapIf {
                        trap_code,
                        kind: CondBrKind::Cond(Cond::Vs),
                    });
                } else {
                    // cbnz rm, #8
                    // udf ; divide by zero

                    // Check for divide by 0.
                    let trap_code = TrapCode::IntegerDivisionByZero;
                    ctx.emit(Inst::TrapIf {
                        trap_code,
                        kind: CondBrKind::Zero(rm),
                    });
                }
            }
        }

        Opcode::Uextend | Opcode::Sextend => {
            let output_ty = ty.unwrap();
            let input_ty = ctx.input_ty(insn, 0);
            let from_bits = ty_bits(input_ty) as u8;
            let to_bits = ty_bits(output_ty) as u8;
            let to_bits = std::cmp::max(32, to_bits);
            assert!(from_bits <= to_bits);
            if from_bits < to_bits {
                let signed = op == Opcode::Sextend;
                let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();

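                // If the input is a lane extraction, fold the extend into the
                // lane move itself (a single smov/umov).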
                if let Some(extract_insn) = maybe_input_insn(ctx, inputs[0], Opcode::Extractlane) {
                    let idx =
                        if let InstructionData::BinaryImm8 { imm, .. } = ctx.data(extract_insn) {
                            *imm
                        } else {
                            unreachable!();
                        };
                    let input = InsnInput {
                        insn: extract_insn,
                        input: 0,
                    };
                    let rn = put_input_in_reg(ctx, input, NarrowValueMode::None);
                    let size = VectorSize::from_ty(ctx.input_ty(extract_insn, 0));

                    if signed {
                        let scalar_size = OperandSize::from_ty(output_ty);

                        ctx.emit(Inst::MovFromVecSigned {
                            rd,
                            rn,
                            idx,
                            size,
                            scalar_size,
                        });
                    } else {
                        ctx.emit(Inst::MovFromVec { rd, rn, idx, size });
                    }
                } else {
                    // If we reach this point, we weren't able to incorporate the extend as
                    // a register-mode on another instruction, so we have a 'None'
                    // narrow-value/extend mode here, and we emit the explicit instruction.
                    let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                    ctx.emit(Inst::Extend {
                        rd,
                        rn,
                        signed,
                        from_bits,
                        to_bits,
                    });
                }
            }
        }

        Opcode::Bnot => {
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let ty = ty.unwrap();
            if !ty.is_vector() {
                let rm = put_input_in_rs_immlogic(ctx, inputs[0], NarrowValueMode::None);
                let alu_op = choose_32_64(ty, ALUOp::OrrNot32, ALUOp::OrrNot64);
                // NOT rd, rm ==> ORR_NOT rd, zero, rm
                ctx.emit(alu_inst_immlogic(alu_op, rd, zero_reg(), rm));
            } else {
                let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                ctx.emit(Inst::VecMisc {
                    op: VecMisc2::Not,
                    rd,
                    rn: rm,
                    size: VectorSize::from_ty(ty),
                });
            }
        }

        Opcode::Band
        | Opcode::Bor
        | Opcode::Bxor
        | Opcode::BandNot
        | Opcode::BorNot
        | Opcode::BxorNot => {
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let ty = ty.unwrap();
            if !ty.is_vector() {
                let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                let rm = put_input_in_rs_immlogic(ctx, inputs[1], NarrowValueMode::None);
                let alu_op = match op {
                    Opcode::Band => choose_32_64(ty, ALUOp::And32, ALUOp::And64),
                    Opcode::Bor => choose_32_64(ty, ALUOp::Orr32, ALUOp::Orr64),
                    Opcode::Bxor => choose_32_64(ty, ALUOp::Eor32, ALUOp::Eor64),
                    Opcode::BandNot => choose_32_64(ty, ALUOp::AndNot32, ALUOp::AndNot64),
                    Opcode::BorNot => choose_32_64(ty, ALUOp::OrrNot32, ALUOp::OrrNot64),
                    Opcode::BxorNot => choose_32_64(ty, ALUOp::EorNot32, ALUOp::EorNot64),
                    _ => unreachable!(),
                };
                ctx.emit(alu_inst_immlogic(alu_op, rd, rn, rm));
            } else {
                let alu_op = match op {
                    Opcode::Band => VecALUOp::And,
                    Opcode::BandNot => VecALUOp::Bic,
                    Opcode::Bor => VecALUOp::Orr,
                    Opcode::Bxor => VecALUOp::Eor,
                    _ => unreachable!(),
                };

                let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
                let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();

                ctx.emit(Inst::VecRRR {
                    alu_op,
                    rd,
                    rn,
                    rm,
                    size: VectorSize::from_ty(ty),
                });
            }
        }

        Opcode::Ishl | Opcode::Ushr | Opcode::Sshr => {
            let ty = ty.unwrap();
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            if !ty.is_vector() {
                let size = OperandSize::from_bits(ty_bits(ty));
                let narrow_mode = match (op, size) {
                    (Opcode::Ishl, _) => NarrowValueMode::None,
                    (Opcode::Ushr, OperandSize::Size64) => NarrowValueMode::ZeroExtend64,
                    (Opcode::Ushr, OperandSize::Size32) => NarrowValueMode::ZeroExtend32,
                    (Opcode::Sshr, OperandSize::Size64) => NarrowValueMode::SignExtend64,
                    (Opcode::Sshr, OperandSize::Size32) => NarrowValueMode::SignExtend32,
                    _ => unreachable!(),
                };
                let rn = put_input_in_reg(ctx, inputs[0], narrow_mode);
                let rm = put_input_in_reg_immshift(ctx, inputs[1], ty_bits(ty));
                let alu_op = match op {
                    Opcode::Ishl => choose_32_64(ty, ALUOp::Lsl32, ALUOp::Lsl64),
                    Opcode::Ushr => choose_32_64(ty, ALUOp::Lsr32, ALUOp::Lsr64),
                    Opcode::Sshr => choose_32_64(ty, ALUOp::Asr32, ALUOp::Asr64),
                    _ => unreachable!(),
                };
                ctx.emit(alu_inst_immshift(alu_op, rd, rn, rm));
            } else {
                let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                let size = VectorSize::from_ty(ty);
                let (alu_op, is_right_shift) = match op {
                    Opcode::Ishl => (VecALUOp::Sshl, false),
                    Opcode::Ushr => (VecALUOp::Ushl, true),
                    Opcode::Sshr => (VecALUOp::Sshl, true),
                    _ => unreachable!(),
                };

                let rm = if is_right_shift {
                    // Right shifts are implemented with a negative left shift.
                    let tmp = ctx.alloc_tmp(I32).only_reg().unwrap();
                    let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
                    let rn = zero_reg();
                    ctx.emit(Inst::AluRRR {
                        alu_op: ALUOp::Sub32,
                        rd: tmp,
                        rn,
                        rm,
                    });
                    tmp.to_reg()
                } else {
                    put_input_in_reg(ctx, inputs[1], NarrowValueMode::None)
                };

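                // The vector shift instructions shift each lane by the amount in
                // the corresponding lane of the shift register, so the scalar
                // (possibly negated) amount is first broadcast to every lane.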
                ctx.emit(Inst::VecDup { rd, rn: rm, size });

                ctx.emit(Inst::VecRRR {
                    alu_op,
                    rd,
                    rn,
                    rm: rd.to_reg(),
                    size,
                });
            }
        }

        Opcode::Rotr | Opcode::Rotl => {
            // aarch64 doesn't have a left-rotate instruction, but a left rotation of K places is
            // effectively a right rotation of N - K places, if N is the integer's bit size. We
            // implement left rotations with this trick.
            //
            // For a 32-bit or 64-bit rotate-right, we can use the ROR instruction directly.
            //
            // For a < 32-bit rotate-right, we synthesize this as:
            //
            //    rotr rd, rn, rm
            //
            //       =>
            //
            //    zero-extend rn, <32-or-64>
            //    and tmp_masked_rm, rm, <bitwidth - 1>
            //    sub tmp1, tmp_masked_rm, <bitwidth>
            //    sub tmp1, zero, tmp1 ; neg
            //    lsr tmp2, rn, tmp_masked_rm
            //    lsl rd, rn, tmp1
            //    orr rd, rd, tmp2
            //
            // For a constant amount, we can instead do:
            //
            //    zero-extend rn, <32-or-64>
            //    lsr tmp2, rn, #<shiftimm>
            //    lsl rd, rn, <bitwidth - shiftimm>
            //    orr rd, rd, tmp2

            let is_rotl = op == Opcode::Rotl;

            let ty = ty.unwrap();
            let ty_bits_size = ty_bits(ty) as u8;

            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let rn = put_input_in_reg(
                ctx,
                inputs[0],
                if ty_bits_size <= 32 {
                    NarrowValueMode::ZeroExtend32
                } else {
                    NarrowValueMode::ZeroExtend64
                },
            );
            let rm = put_input_in_reg_immshift(ctx, inputs[1], ty_bits(ty));

            if ty_bits_size == 32 || ty_bits_size == 64 {
                let alu_op = choose_32_64(ty, ALUOp::RotR32, ALUOp::RotR64);
                match rm {
                    ResultRegImmShift::ImmShift(mut immshift) => {
                        if is_rotl {
                            immshift.imm = ty_bits_size.wrapping_sub(immshift.value());
                        }
                        immshift.imm &= ty_bits_size - 1;
                        ctx.emit(Inst::AluRRImmShift {
                            alu_op,
                            rd,
                            rn,
                            immshift,
                        });
                    }

                    ResultRegImmShift::Reg(rm) => {
                        let rm = if is_rotl {
                            // Really ty_bits_size - rn, but the upper bits of the result are
                            // ignored (because of the implicit masking done by the instruction),
                            // so this is equivalent to negating the input.
                            let alu_op = choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64);
                            let tmp = ctx.alloc_tmp(ty).only_reg().unwrap();
                            ctx.emit(Inst::AluRRR {
                                alu_op,
                                rd: tmp,
                                rn: zero_reg(),
                                rm,
                            });
                            tmp.to_reg()
                        } else {
                            rm
                        };
                        ctx.emit(Inst::AluRRR { alu_op, rd, rn, rm });
                    }
                }
            } else {
                debug_assert!(ty_bits_size < 32);

                match rm {
                    ResultRegImmShift::Reg(reg) => {
                        let reg = if is_rotl {
                            // Really ty_bits_size - rn, but the upper bits of the result are
                            // ignored (because of the implicit masking done by the instruction),
                            // so this is equivalent to negating the input.
                            let tmp = ctx.alloc_tmp(I32).only_reg().unwrap();
                            ctx.emit(Inst::AluRRR {
                                alu_op: ALUOp::Sub32,
                                rd: tmp,
                                rn: zero_reg(),
                                rm: reg,
                            });
                            tmp.to_reg()
                        } else {
                            reg
                        };

                        // Explicitly mask the rotation count.
                        let tmp_masked_rm = ctx.alloc_tmp(I32).only_reg().unwrap();
                        ctx.emit(Inst::AluRRImmLogic {
                            alu_op: ALUOp::And32,
                            rd: tmp_masked_rm,
                            rn: reg,
                            imml: ImmLogic::maybe_from_u64((ty_bits_size - 1) as u64, I32).unwrap(),
                        });
                        let tmp_masked_rm = tmp_masked_rm.to_reg();

                        let tmp1 = ctx.alloc_tmp(I32).only_reg().unwrap();
                        let tmp2 = ctx.alloc_tmp(I32).only_reg().unwrap();
                        ctx.emit(Inst::AluRRImm12 {
                            alu_op: ALUOp::Sub32,
                            rd: tmp1,
                            rn: tmp_masked_rm,
                            imm12: Imm12::maybe_from_u64(ty_bits_size as u64).unwrap(),
                        });
                        ctx.emit(Inst::AluRRR {
                            alu_op: ALUOp::Sub32,
                            rd: tmp1,
                            rn: zero_reg(),
                            rm: tmp1.to_reg(),
                        });
                        ctx.emit(Inst::AluRRR {
                            alu_op: ALUOp::Lsr32,
                            rd: tmp2,
                            rn,
                            rm: tmp_masked_rm,
                        });
                        ctx.emit(Inst::AluRRR {
                            alu_op: ALUOp::Lsl32,
                            rd,
                            rn,
                            rm: tmp1.to_reg(),
                        });
                        ctx.emit(Inst::AluRRR {
                            alu_op: ALUOp::Orr32,
                            rd,
                            rn: rd.to_reg(),
                            rm: tmp2.to_reg(),
                        });
                    }

                    ResultRegImmShift::ImmShift(mut immshift) => {
                        if is_rotl {
                            immshift.imm = ty_bits_size.wrapping_sub(immshift.value());
                        }
                        immshift.imm &= ty_bits_size - 1;

                        let tmp1 = ctx.alloc_tmp(I32).only_reg().unwrap();
                        ctx.emit(Inst::AluRRImmShift {
                            alu_op: ALUOp::Lsr32,
                            rd: tmp1,
                            rn,
                            immshift: immshift.clone(),
                        });

                        let amount = immshift.value() & (ty_bits_size - 1);
                        let opp_shift =
                            ImmShift::maybe_from_u64(ty_bits_size as u64 - amount as u64).unwrap();
                        ctx.emit(Inst::AluRRImmShift {
                            alu_op: ALUOp::Lsl32,
                            rd,
                            rn,
                            immshift: opp_shift,
                        });

                        ctx.emit(Inst::AluRRR {
                            alu_op: ALUOp::Orr32,
                            rd,
                            rn: rd.to_reg(),
                            rm: tmp1.to_reg(),
                        });
                    }
                }
            }
        }

        Opcode::Bitrev | Opcode::Clz | Opcode::Cls | Opcode::Ctz => {
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let needs_zext = match op {
                Opcode::Bitrev | Opcode::Ctz => false,
                Opcode::Clz | Opcode::Cls => true,
                _ => unreachable!(),
            };
            let ty = ty.unwrap();
            let narrow_mode = if needs_zext && ty_bits(ty) == 64 {
                NarrowValueMode::ZeroExtend64
            } else if needs_zext {
                NarrowValueMode::ZeroExtend32
            } else {
                NarrowValueMode::None
            };
            let rn = put_input_in_reg(ctx, inputs[0], narrow_mode);
            let op_ty = match ty {
                I8 | I16 | I32 => I32,
                I64 => I64,
                _ => panic!("Unsupported type for Bitrev/Clz/Cls"),
            };
            let bitop = match op {
                Opcode::Clz | Opcode::Cls | Opcode::Bitrev => BitOp::from((op, op_ty)),
                Opcode::Ctz => BitOp::from((Opcode::Bitrev, op_ty)),
                _ => unreachable!(),
            };
            ctx.emit(Inst::BitRR { rd, rn, op: bitop });

            // Both bitrev and ctz use a bit-reverse (rbit) instruction; ctz to reduce the problem
            // to a clz, and bitrev as the main operation.
            if op == Opcode::Bitrev || op == Opcode::Ctz {
                // Reversing an n-bit value (n < 32) with a 32-bit bitrev instruction will place
                // the reversed result in the highest n bits, so we need to shift them down into
                // place.
                let right_shift = match ty {
                    I8 => Some(24),
                    I16 => Some(16),
                    I32 => None,
                    I64 => None,
                    _ => panic!("Unsupported type for Bitrev"),
                };
                if let Some(s) = right_shift {
                    ctx.emit(Inst::AluRRImmShift {
                        alu_op: ALUOp::Lsr32,
                        rd,
                        rn: rd.to_reg(),
                        immshift: ImmShift::maybe_from_u64(s).unwrap(),
                    });
                }
            }

            if op == Opcode::Ctz {
                ctx.emit(Inst::BitRR {
                    op: BitOp::from((Opcode::Clz, op_ty)),
                    rd,
                    rn: rd.to_reg(),
                });
            }
        }

        Opcode::Popcnt => {
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let ty = ty.unwrap();
            let size = ScalarSize::from_operand_size(OperandSize::from_ty(ty));
            let tmp = ctx.alloc_tmp(I8X16).only_reg().unwrap();
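            // There is no scalar popcount instruction, so the value is moved to
            // the vector unit, counted per byte with CNT, and the byte counts
            // are then summed.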

            // fmov tmp, rn
            // cnt tmp.8b, tmp.8b
            // addp tmp.8b, tmp.8b, tmp.8b / addv tmp, tmp.8b / (no instruction for 8-bit inputs)
            // umov rd, tmp.b[0]

            ctx.emit(Inst::MovToFpu {
                rd: tmp,
                rn: rn,
                size,
            });
            ctx.emit(Inst::VecMisc {
                op: VecMisc2::Cnt,
                rd: tmp,
                rn: tmp.to_reg(),
                size: VectorSize::Size8x8,
            });

            match ScalarSize::from_ty(ty) {
                ScalarSize::Size8 => {}
                ScalarSize::Size16 => {
                    // ADDP is usually cheaper than ADDV.
                    ctx.emit(Inst::VecRRR {
                        alu_op: VecALUOp::Addp,
                        rd: tmp,
                        rn: tmp.to_reg(),
                        rm: tmp.to_reg(),
                        size: VectorSize::Size8x8,
                    });
                }
                ScalarSize::Size32 | ScalarSize::Size64 => {
                    ctx.emit(Inst::VecLanes {
                        op: VecLanesOp::Addv,
                        rd: tmp,
                        rn: tmp.to_reg(),
                        size: VectorSize::Size8x8,
                    });
                }
                sz => panic!("Unexpected scalar FP operand size: {:?}", sz),
            }

            ctx.emit(Inst::MovFromVec {
                rd,
                rn: tmp.to_reg(),
                idx: 0,
                size: VectorSize::Size8x16,
            });
        }

        Opcode::Load
        | Opcode::Uload8
        | Opcode::Sload8
        | Opcode::Uload16
        | Opcode::Sload16
        | Opcode::Uload32
        | Opcode::Sload32
        | Opcode::LoadComplex
        | Opcode::Uload8Complex
        | Opcode::Sload8Complex
        | Opcode::Uload16Complex
        | Opcode::Sload16Complex
        | Opcode::Uload32Complex
        | Opcode::Sload32Complex
        | Opcode::Sload8x8
        | Opcode::Uload8x8
        | Opcode::Sload16x4
        | Opcode::Uload16x4
        | Opcode::Sload32x2
        | Opcode::Uload32x2
        | Opcode::Uload8x8Complex
        | Opcode::Sload8x8Complex
        | Opcode::Uload16x4Complex
        | Opcode::Sload16x4Complex
        | Opcode::Uload32x2Complex
        | Opcode::Sload32x2Complex => {
            let sign_extend = match op {
                Opcode::Sload8
                | Opcode::Sload8Complex
                | Opcode::Sload16
                | Opcode::Sload16Complex
                | Opcode::Sload32
                | Opcode::Sload32Complex => true,
                _ => false,
            };
            let flags = ctx
                .memflags(insn)
                .expect("Load instruction should have memflags");

            lower_load(
                ctx,
                insn,
                &inputs[..],
                outputs[0],
                |ctx, rd, elem_ty, mem| {
                    let is_float = ty_has_float_or_vec_representation(elem_ty);
                    ctx.emit(match (ty_bits(elem_ty), sign_extend, is_float) {
                        (1, _, _) => Inst::ULoad8 { rd, mem, flags },
                        (8, false, _) => Inst::ULoad8 { rd, mem, flags },
                        (8, true, _) => Inst::SLoad8 { rd, mem, flags },
                        (16, false, _) => Inst::ULoad16 { rd, mem, flags },
                        (16, true, _) => Inst::SLoad16 { rd, mem, flags },
                        (32, false, false) => Inst::ULoad32 { rd, mem, flags },
                        (32, true, false) => Inst::SLoad32 { rd, mem, flags },
                        (32, _, true) => Inst::FpuLoad32 { rd, mem, flags },
                        (64, _, false) => Inst::ULoad64 { rd, mem, flags },
                        // Note that we treat some of the vector loads as scalar floating-point loads,
                        // which is correct in a little endian environment.
                        (64, _, true) => Inst::FpuLoad64 { rd, mem, flags },
                        (128, _, _) => Inst::FpuLoad128 { rd, mem, flags },
                        _ => panic!("Unsupported size in load"),
                    });

                    let vec_extend = match op {
                        Opcode::Sload8x8 => Some(VecExtendOp::Sxtl8),
                        Opcode::Sload8x8Complex => Some(VecExtendOp::Sxtl8),
                        Opcode::Uload8x8 => Some(VecExtendOp::Uxtl8),
                        Opcode::Uload8x8Complex => Some(VecExtendOp::Uxtl8),
                        Opcode::Sload16x4 => Some(VecExtendOp::Sxtl16),
                        Opcode::Sload16x4Complex => Some(VecExtendOp::Sxtl16),
                        Opcode::Uload16x4 => Some(VecExtendOp::Uxtl16),
                        Opcode::Uload16x4Complex => Some(VecExtendOp::Uxtl16),
                        Opcode::Sload32x2 => Some(VecExtendOp::Sxtl32),
                        Opcode::Sload32x2Complex => Some(VecExtendOp::Sxtl32),
                        Opcode::Uload32x2 => Some(VecExtendOp::Uxtl32),
                        Opcode::Uload32x2Complex => Some(VecExtendOp::Uxtl32),
                        _ => None,
                    };

                    if let Some(t) = vec_extend {
                        ctx.emit(Inst::VecExtend {
                            t,
                            rd,
                            rn: rd.to_reg(),
                            high_half: false,
                        });
                    }
                },
            );
        }

        Opcode::Store
        | Opcode::Istore8
        | Opcode::Istore16
        | Opcode::Istore32
        | Opcode::StoreComplex
        | Opcode::Istore8Complex
        | Opcode::Istore16Complex
        | Opcode::Istore32Complex => {
            let off = ctx.data(insn).load_store_offset().unwrap();
            let elem_ty = match op {
                Opcode::Istore8 | Opcode::Istore8Complex => I8,
                Opcode::Istore16 | Opcode::Istore16Complex => I16,
                Opcode::Istore32 | Opcode::Istore32Complex => I32,
                Opcode::Store | Opcode::StoreComplex => ctx.input_ty(insn, 0),
                _ => unreachable!(),
            };
            let is_float = ty_has_float_or_vec_representation(elem_ty);
            let flags = ctx
                .memflags(insn)
                .expect("Store instruction should have memflags");

            let mem = lower_address(ctx, elem_ty, &inputs[1..], off);
            let rd = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);

            ctx.emit(match (ty_bits(elem_ty), is_float) {
                (1, _) | (8, _) => Inst::Store8 { rd, mem, flags },
                (16, _) => Inst::Store16 { rd, mem, flags },
                (32, false) => Inst::Store32 { rd, mem, flags },
                (32, true) => Inst::FpuStore32 { rd, mem, flags },
                (64, false) => Inst::Store64 { rd, mem, flags },
                (64, true) => Inst::FpuStore64 { rd, mem, flags },
                (128, _) => Inst::FpuStore128 { rd, mem, flags },
                _ => panic!("Unsupported size in store"),
            });
        }

        Opcode::StackAddr => {
            let (stack_slot, offset) = match *ctx.data(insn) {
                InstructionData::StackLoad {
                    opcode: Opcode::StackAddr,
                    stack_slot,
                    offset,
                } => (stack_slot, offset),
                _ => unreachable!(),
            };
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let offset: i32 = offset.into();
            let inst = ctx
                .abi()
                .stackslot_addr(stack_slot, u32::try_from(offset).unwrap(), rd);
            ctx.emit(inst);
        }

        Opcode::AtomicRmw => {
            let r_dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let mut r_addr = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let mut r_arg2 = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
            let ty_access = ty.unwrap();
            assert!(is_valid_atomic_transaction_ty(ty_access));
            // Make sure that both args are in virtual regs, since in effect
            // we have to do a parallel copy to get them safely to the AtomicRMW input
            // regs, and that's not guaranteed safe if either is in a real reg.
            r_addr = ctx.ensure_in_vreg(r_addr, I64);
            r_arg2 = ctx.ensure_in_vreg(r_arg2, I64);
            // Move the args to the preordained AtomicRMW input regs
            ctx.emit(Inst::gen_move(Writable::from_reg(xreg(25)), r_addr, I64));
            ctx.emit(Inst::gen_move(Writable::from_reg(xreg(26)), r_arg2, I64));
            // Now the AtomicRMW insn itself
            let op = inst_common::AtomicRmwOp::from(ctx.data(insn).atomic_rmw_op().unwrap());
            ctx.emit(Inst::AtomicRMW { ty: ty_access, op });
            // And finally, copy the preordained AtomicRMW output reg to its destination.
            ctx.emit(Inst::gen_move(r_dst, xreg(27), I64));
            // Also, x24 and x28 are trashed. `fn aarch64_get_regs` must mention that.
        }

        Opcode::AtomicCas => {
            let r_dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let mut r_addr = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let mut r_expected = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
            let mut r_replacement = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None);
            let ty_access = ty.unwrap();
            assert!(is_valid_atomic_transaction_ty(ty_access));

            if isa_flags.use_lse() {
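                // With LSE we can use a single CAS instruction. CAS compares
                // against its first operand and overwrites it with the loaded
                // value, so the expected value is moved into the destination
                // register first.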
                ctx.emit(Inst::gen_move(r_dst, r_expected, ty_access));
                ctx.emit(Inst::AtomicCAS {
                    rs: r_dst,
                    rt: r_replacement,
                    rn: r_addr,
                    ty: ty_access,
                });
            } else {
                // This is very similar to, but not identical to, the AtomicRmw case. Note
                // that the AtomicCASLoop sequence does its own masking, so we don't need to worry
                // about zero-extending narrow (I8/I16/I32) values here.
                // Make sure that all three args are in virtual regs. See corresponding comment
                // for `Opcode::AtomicRmw` above.
                r_addr = ctx.ensure_in_vreg(r_addr, I64);
                r_expected = ctx.ensure_in_vreg(r_expected, I64);
                r_replacement = ctx.ensure_in_vreg(r_replacement, I64);
                // Move the args to the preordained AtomicCASLoop input regs
                ctx.emit(Inst::gen_move(Writable::from_reg(xreg(25)), r_addr, I64));
                ctx.emit(Inst::gen_move(
                    Writable::from_reg(xreg(26)),
                    r_expected,
                    I64,
                ));
                ctx.emit(Inst::gen_move(
                    Writable::from_reg(xreg(28)),
                    r_replacement,
                    I64,
                ));
                // Now the AtomicCASLoop itself, implemented in the normal way, with an LL-SC loop
                ctx.emit(Inst::AtomicCASLoop { ty: ty_access });
                // And finally, copy the preordained AtomicCASLoop output reg to its destination.
                ctx.emit(Inst::gen_move(r_dst, xreg(27), I64));
                // Also, x24 and x28 are trashed. `fn aarch64_get_regs` must mention that.
            }
        }

        Opcode::AtomicLoad => {
            let r_data = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let r_addr = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let ty_access = ty.unwrap();
            assert!(is_valid_atomic_transaction_ty(ty_access));
            ctx.emit(Inst::AtomicLoad {
                ty: ty_access,
                r_data,
                r_addr,
            });
        }

        Opcode::AtomicStore => {
            let r_data = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let r_addr = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
            let ty_access = ctx.input_ty(insn, 0);
            assert!(is_valid_atomic_transaction_ty(ty_access));
            ctx.emit(Inst::AtomicStore {
                ty: ty_access,
                r_data,
                r_addr,
            });
        }

        Opcode::Fence => {
            ctx.emit(Inst::Fence {});
        }

        Opcode::StackLoad | Opcode::StackStore => {
            panic!("Direct stack memory access not supported; should not be used by Wasm");
        }

        Opcode::HeapAddr => {
            panic!("heap_addr should have been removed by legalization!");
        }

        Opcode::TableAddr => {
            panic!("table_addr should have been removed by legalization!");
        }

        Opcode::ConstAddr => unimplemented!(),

        Opcode::Nop => {
            // Nothing.
        }

        Opcode::Select => {
            let flag_input = inputs[0];
            let cond = if let Some(icmp_insn) =
                maybe_input_insn_via_conv(ctx, flag_input, Opcode::Icmp, Opcode::Bint)
            {
                let condcode = ctx.data(icmp_insn).cond_code().unwrap();
                let cond = lower_condcode(condcode);
                let is_signed = condcode_is_signed(condcode);
                lower_icmp_or_ifcmp_to_flags(ctx, icmp_insn, is_signed);
                cond
            } else if let Some(fcmp_insn) =
                maybe_input_insn_via_conv(ctx, flag_input, Opcode::Fcmp, Opcode::Bint)
            {
                let condcode = ctx.data(fcmp_insn).fp_cond_code().unwrap();
                let cond = lower_fp_condcode(condcode);
                lower_fcmp_or_ffcmp_to_flags(ctx, fcmp_insn);
                cond
            } else {
                let (cmp_op, narrow_mode) = if ty_bits(ctx.input_ty(insn, 0)) > 32 {
                    (ALUOp::SubS64, NarrowValueMode::ZeroExtend64)
                } else {
                    (ALUOp::SubS32, NarrowValueMode::ZeroExtend32)
                };

                let rcond = put_input_in_reg(ctx, inputs[0], narrow_mode);
                // cmp rcond, #0
                ctx.emit(Inst::AluRRR {
                    alu_op: cmp_op,
                    rd: writable_zero_reg(),
                    rn: rcond,
                    rm: zero_reg(),
                });
                Cond::Ne
            };

            // csel.cond rd, rn, rm
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
            let rm = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None);
            let ty = ctx.output_ty(insn, 0);
            let bits = ty_bits(ty);
            let is_float = ty_has_float_or_vec_representation(ty);
            if is_float && bits == 32 {
                ctx.emit(Inst::FpuCSel32 { cond, rd, rn, rm });
            } else if is_float && bits == 64 {
                ctx.emit(Inst::FpuCSel64 { cond, rd, rn, rm });
            } else if is_float && bits == 128 {
                ctx.emit(Inst::VecCSel { cond, rd, rn, rm });
            } else {
                ctx.emit(Inst::CSel { cond, rd, rn, rm });
            }
        }

        Opcode::Selectif | Opcode::SelectifSpectreGuard => {
            let condcode = ctx.data(insn).cond_code().unwrap();
            let cond = lower_condcode(condcode);
            let is_signed = condcode_is_signed(condcode);
            // Verification ensures that the input is always a
            // single-def ifcmp.
            let ifcmp_insn = maybe_input_insn(ctx, inputs[0], Opcode::Ifcmp).unwrap();
            lower_icmp_or_ifcmp_to_flags(ctx, ifcmp_insn, is_signed);

            // csel.COND rd, rn, rm
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
            let rm = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None);
            let ty = ctx.output_ty(insn, 0);
            let bits = ty_bits(ty);
            let is_float = ty_has_float_or_vec_representation(ty);
            if is_float && bits == 32 {
                ctx.emit(Inst::FpuCSel32 { cond, rd, rn, rm });
            } else if is_float && bits == 64 {
                ctx.emit(Inst::FpuCSel64 { cond, rd, rn, rm });
            } else {
                ctx.emit(Inst::CSel { cond, rd, rn, rm });
            }
        }

        Opcode::Bitselect | Opcode::Vselect => {
            let ty = ty.unwrap();
            if !ty.is_vector() {
                debug_assert_ne!(Opcode::Vselect, op);
                let tmp = ctx.alloc_tmp(I64).only_reg().unwrap();
                let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
                let rcond = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
                let rm = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None);
                // AND rTmp, rn, rcond
                ctx.emit(Inst::AluRRR {
                    alu_op: ALUOp::And64,
                    rd: tmp,
                    rn,
                    rm: rcond,
                });
                // BIC rd, rm, rcond
                ctx.emit(Inst::AluRRR {
                    alu_op: ALUOp::AndNot64,
                    rd,
                    rn: rm,
                    rm: rcond,
                });
                // ORR rd, rd, rTmp
                ctx.emit(Inst::AluRRR {
                    alu_op: ALUOp::Orr64,
                    rd,
                    rn: rd.to_reg(),
                    rm: tmp.to_reg(),
                });
            } else {
                let rcond = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
                let rm = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None);
                let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
                ctx.emit(Inst::gen_move(rd, rcond, ty));

                ctx.emit(Inst::VecRRR {
                    alu_op: VecALUOp::Bsl,
                    rd,
                    rn,
                    rm,
                    size: VectorSize::from_ty(ty),
                });
            }
        }

        Opcode::Trueif => {
            let condcode = ctx.data(insn).cond_code().unwrap();
            let cond = lower_condcode(condcode);
            let is_signed = condcode_is_signed(condcode);
            // Verification ensures that the input is always a
            // single-def ifcmp.
            let ifcmp_insn = maybe_input_insn(ctx, inputs[0], Opcode::Ifcmp).unwrap();
            lower_icmp_or_ifcmp_to_flags(ctx, ifcmp_insn, is_signed);
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            materialize_bool_result(ctx, insn, rd, cond);
        }

        Opcode::Trueff => {
            let condcode = ctx.data(insn).fp_cond_code().unwrap();
            let cond = lower_fp_condcode(condcode);
            let ffcmp_insn = maybe_input_insn(ctx, inputs[0], Opcode::Ffcmp).unwrap();
            lower_fcmp_or_ffcmp_to_flags(ctx, ffcmp_insn);
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            materialize_bool_result(ctx, insn, rd, cond);
        }

        Opcode::IsNull | Opcode::IsInvalid => {
            // Null references are represented by the constant value 0; invalid references are
            // represented by the constant value -1. See `define_reftypes()` in
            // `meta/src/isa/x86/encodings.rs` to confirm.
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let ty = ctx.input_ty(insn, 0);
            let (alu_op, const_value) = match op {
                Opcode::IsNull => {
                    // cmp rn, #0
                    (choose_32_64(ty, ALUOp::SubS32, ALUOp::SubS64), 0)
                }
                Opcode::IsInvalid => {
                    // cmn rn, #1
                    (choose_32_64(ty, ALUOp::AddS32, ALUOp::AddS64), 1)
                }
                _ => unreachable!(),
            };
            let const_value = ResultRSEImm12::Imm12(Imm12::maybe_from_u64(const_value).unwrap());
            ctx.emit(alu_inst_imm12(alu_op, writable_zero_reg(), rn, const_value));
            materialize_bool_result(ctx, insn, rd, Cond::Eq);
        }

        Opcode::Copy => {
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let ty = ctx.input_ty(insn, 0);
            ctx.emit(Inst::gen_move(rd, rn, ty));
        }

        Opcode::Breduce | Opcode::Ireduce => {
            // Smaller integers/booleans are stored with high-order bits
            // undefined, so we can simply do a copy.
            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let ty = ctx.input_ty(insn, 0);
            ctx.emit(Inst::gen_move(rd, rn, ty));
        }

        Opcode::Bextend | Opcode::Bmask => {
            // Bextend and Bmask both simply sign-extend. This works for:
            // - Bextend, because booleans are stored as 0 / -1, so we
            //   sign-extend the -1 to a -1 in the wider width.
            // - Bmask, because the resulting integer mask value must be
            //   all-ones (-1) if the argument is true.

            let from_ty = ctx.input_ty(insn, 0);
            let to_ty = ctx.output_ty(insn, 0);
            let from_bits = ty_bits(from_ty);
            let to_bits = ty_bits(to_ty);

            assert!(
                from_bits <= 64 && to_bits <= 64,
                "Vector Bextend not supported yet"
            );
            assert!(from_bits <= to_bits);

            if from_bits == to_bits {
                // Nothing.
            } else {
                let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
                let to_bits = if to_bits == 64 {
                    64
                } else {
                    assert!(to_bits <= 32);
                    32
                };
                let from_bits = from_bits as u8;
                ctx.emit(Inst::Extend {
                    rd,
                    rn,
                    signed: true,
                    from_bits,
                    to_bits,
                });
            }
        }

        Opcode::Bint => {
            // Booleans are stored as all-zeroes (0) or all-ones (-1). We AND
            // out the LSB to give a 0 / 1-valued integer result.
            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let output_bits = ty_bits(ctx.output_ty(insn, 0));

            let (imm_ty, alu_op) = if output_bits > 32 {
                (I64, ALUOp::And64)
            } else {
                (I32, ALUOp::And32)
            };
            ctx.emit(Inst::AluRRImmLogic {
                alu_op,
                rd,
                rn,
                imml: ImmLogic::maybe_from_u64(1, imm_ty).unwrap(),
            });
        }

        Opcode::Bitcast => {
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let ity = ctx.input_ty(insn, 0);
            let oty = ctx.output_ty(insn, 0);
            let ity_bits = ty_bits(ity);
            let ity_vec_reg = ty_has_float_or_vec_representation(ity);
            let oty_bits = ty_bits(oty);
            let oty_vec_reg = ty_has_float_or_vec_representation(oty);

            debug_assert_eq!(ity_bits, oty_bits);

            match (ity_vec_reg, oty_vec_reg) {
                (true, true) => {
                    let narrow_mode = if ity_bits <= 32 {
                        NarrowValueMode::ZeroExtend32
                    } else {
                        NarrowValueMode::ZeroExtend64
                    };
                    let rm = put_input_in_reg(ctx, inputs[0], narrow_mode);
                    ctx.emit(Inst::gen_move(rd, rm, oty));
                }
                (false, false) => {
                    let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                    ctx.emit(Inst::gen_move(rd, rm, oty));
                }
                (false, true) => {
                    let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::ZeroExtend64);
                    ctx.emit(Inst::MovToFpu {
                        rd,
                        rn,
                        size: ScalarSize::Size64,
                    });
                }
                (true, false) => {
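                    // FP/vector to integer: read lane 0 of the source register.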
                    let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                    let size = VectorSize::from_lane_size(ScalarSize::from_bits(oty_bits), true);

                    ctx.emit(Inst::MovFromVec {
                        rd,
                        rn,
                        idx: 0,
                        size,
                    });
                }
            }
        }

        Opcode::FallthroughReturn | Opcode::Return => {
            for (i, input) in inputs.iter().enumerate() {
                // N.B.: according to the AArch64 ABI, the top bits of a register
                // (above the bits for the value's type) are undefined, so we
                // need not extend the return values.
                let src_regs = put_input_in_regs(ctx, *input);
                let retval_regs = ctx.retval(i);

                assert_eq!(src_regs.len(), retval_regs.len());
                let ty = ctx.input_ty(insn, i);
                let (_, tys) = Inst::rc_for_type(ty)?;

                src_regs
                    .regs()
                    .iter()
                    .zip(retval_regs.regs().iter())
                    .zip(tys.iter())
                    .for_each(|((&src, &dst), &ty)| {
                        ctx.emit(Inst::gen_move(dst, src, ty));
                    });
            }
            // N.B.: the Ret itself is generated by the ABI.
        }

        Opcode::Ifcmp | Opcode::Ffcmp => {
            // An Ifcmp/Ffcmp must always be seen as a use of a brif/brff or trueif/trueff
            // instruction. This will always be the case as long as the IR uses an Ifcmp/Ffcmp from
            // the same block, or a dominating block. In other words, it cannot pass through a BB
            // param (phi). The flags pass of the verifier will ensure this.
            panic!("Should never reach ifcmp as isel root!");
        }

        Opcode::Icmp => {
            let condcode = ctx.data(insn).cond_code().unwrap();
            let cond = lower_condcode(condcode);
            let is_signed = condcode_is_signed(condcode);
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let ty = ctx.input_ty(insn, 0);
            let bits = ty_bits(ty);
            let narrow_mode = match (bits <= 32, is_signed) {
                (true, true) => NarrowValueMode::SignExtend32,
                (true, false) => NarrowValueMode::ZeroExtend32,
                (false, true) => NarrowValueMode::SignExtend64,
                (false, false) => NarrowValueMode::ZeroExtend64,
            };

            if ty == I128 {
                let lhs = put_input_in_regs(ctx, inputs[0]);
                let rhs = put_input_in_regs(ctx, inputs[1]);

                let tmp1 = ctx.alloc_tmp(I64).only_reg().unwrap();
                let tmp2 = ctx.alloc_tmp(I64).only_reg().unwrap();

                match condcode {
                    IntCC::Equal | IntCC::NotEqual => {
                        // eor tmp1, lhs_lo, rhs_lo
                        // eor tmp2, lhs_hi, rhs_hi
                        // adds xzr, tmp1, tmp2
                        // cset dst, {eq, ne}

                        ctx.emit(Inst::AluRRR {
                            alu_op: ALUOp::Eor64,
                            rd: tmp1,
                            rn: lhs.regs()[0],
                            rm: rhs.regs()[0],
                        });
                        ctx.emit(Inst::AluRRR {
                            alu_op: ALUOp::Eor64,
                            rd: tmp2,
                            rn: lhs.regs()[1],
                            rm: rhs.regs()[1],
                        });
                        ctx.emit(Inst::AluRRR {
                            alu_op: ALUOp::AddS64,
                            rd: writable_zero_reg(),
                            rn: tmp1.to_reg(),
                            rm: tmp2.to_reg(),
                        });
                        materialize_bool_result(ctx, insn, rd, cond);
                    }
                    IntCC::Overflow | IntCC::NotOverflow => {
                        // We can do a 128-bit add while throwing away the results
                        // and check the overflow flag at the end.
                        //
                        // adds xzr, lhs_lo, rhs_lo
                        // adcs xzr, lhs_hi, rhs_hi
                        // cset dst, {vs, vc}

                        ctx.emit(Inst::AluRRR {
                            alu_op: ALUOp::AddS64,
                            rd: writable_zero_reg(),
                            rn: lhs.regs()[0],
                            rm: rhs.regs()[0],
                        });
                        ctx.emit(Inst::AluRRR {
                            alu_op: ALUOp::AdcS64,
                            rd: writable_zero_reg(),
                            rn: lhs.regs()[1],
                            rm: rhs.regs()[1],
                        });
                        materialize_bool_result(ctx, insn, rd, cond);
                    }
                    _ => {
                        // cmp lhs_lo, rhs_lo
                        // cset tmp1, low_cc
                        // cmp lhs_hi, rhs_hi
                        // cset tmp2, cond
                        // csel dst, tmp1, tmp2, eq

                        let low_cc = match condcode {
                            IntCC::SignedGreaterThanOrEqual | IntCC::UnsignedGreaterThanOrEqual => {
                                Cond::Hs
                            }
                            IntCC::SignedGreaterThan | IntCC::UnsignedGreaterThan => Cond::Hi,
                            IntCC::SignedLessThanOrEqual | IntCC::UnsignedLessThanOrEqual => {
                                Cond::Ls
                            }
                            IntCC::SignedLessThan | IntCC::UnsignedLessThan => Cond::Lo,
                            _ => unreachable!(),
                        };
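                        // Note that the low halves are always compared unsigned:
                        // they carry magnitude only, so signedness matters solely
                        // for the high-half comparison.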
|
|
|
|
ctx.emit(Inst::AluRRR {
|
|
alu_op: ALUOp::SubS64,
|
|
rd: writable_zero_reg(),
|
|
rn: lhs.regs()[0],
|
|
rm: rhs.regs()[0],
|
|
});
|
|
materialize_bool_result(ctx, insn, tmp1, low_cc);
|
|
ctx.emit(Inst::AluRRR {
|
|
alu_op: ALUOp::SubS64,
|
|
rd: writable_zero_reg(),
|
|
rn: lhs.regs()[1],
|
|
rm: rhs.regs()[1],
|
|
});
|
|
materialize_bool_result(ctx, insn, tmp2, cond);
|
|
ctx.emit(Inst::CSel {
|
|
cond: Cond::Eq,
|
|
rd,
|
|
rn: tmp1.to_reg(),
|
|
rm: tmp2.to_reg(),
|
|
});
|
|
}
|
|
}
|
|
} else if !ty.is_vector() {
|
|
let alu_op = choose_32_64(ty, ALUOp::SubS32, ALUOp::SubS64);
|
|
let rn = put_input_in_reg(ctx, inputs[0], narrow_mode);
|
|
let rm = put_input_in_rse_imm12(ctx, inputs[1], narrow_mode);
|
|
ctx.emit(alu_inst_imm12(alu_op, writable_zero_reg(), rn, rm));
|
|
materialize_bool_result(ctx, insn, rd, cond);
|
|
} else {
|
|
let rn = put_input_in_reg(ctx, inputs[0], narrow_mode);
|
|
let rm = put_input_in_reg(ctx, inputs[1], narrow_mode);
|
|
lower_vector_compare(ctx, rd, rn, rm, ty, cond)?;
|
|
}
|
|
}
|
|
|
|
        Opcode::Fcmp => {
            let condcode = ctx.data(insn).fp_cond_code().unwrap();
            let cond = lower_fp_condcode(condcode);
            let ty = ctx.input_ty(insn, 0);
            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();

            if !ty.is_vector() {
                match ty_bits(ty) {
                    32 => {
                        ctx.emit(Inst::FpuCmp32 { rn, rm });
                    }
                    64 => {
                        ctx.emit(Inst::FpuCmp64 { rn, rm });
                    }
                    _ => panic!("Bad float size"),
                }
                materialize_bool_result(ctx, insn, rd, cond);
            } else {
                lower_vector_compare(ctx, rd, rn, rm, ty, cond)?;
            }
        }

        Opcode::JumpTableEntry | Opcode::JumpTableBase => {
            panic!("Should not appear: we handle BrTable directly");
        }

        Opcode::Debugtrap => {
            ctx.emit(Inst::Brk);
        }

        Opcode::Trap | Opcode::ResumableTrap => {
            let trap_code = ctx.data(insn).trap_code().unwrap();
            ctx.emit_safepoint(Inst::Udf { trap_code });
        }

        Opcode::Trapif | Opcode::Trapff => {
            let trap_code = ctx.data(insn).trap_code().unwrap();

            let cond = if maybe_input_insn(ctx, inputs[0], Opcode::IaddIfcout).is_some() {
                let condcode = ctx.data(insn).cond_code().unwrap();
                let cond = lower_condcode(condcode);
                // The flags must not have been clobbered by any other
                // instruction between the iadd_ifcout and this instruction, as
                // verified by the CLIF validator; so we can simply use the
                // flags here.
                cond
            } else if op == Opcode::Trapif {
                let condcode = ctx.data(insn).cond_code().unwrap();
                let cond = lower_condcode(condcode);
                let is_signed = condcode_is_signed(condcode);

                // Verification ensures that the input is always a single-def ifcmp.
                let ifcmp_insn = maybe_input_insn(ctx, inputs[0], Opcode::Ifcmp).unwrap();
                lower_icmp_or_ifcmp_to_flags(ctx, ifcmp_insn, is_signed);
                cond
            } else {
                let condcode = ctx.data(insn).fp_cond_code().unwrap();
                let cond = lower_fp_condcode(condcode);

                // Verification ensures that the input is always a
                // single-def ffcmp.
                let ffcmp_insn = maybe_input_insn(ctx, inputs[0], Opcode::Ffcmp).unwrap();
                lower_fcmp_or_ffcmp_to_flags(ctx, ffcmp_insn);
                cond
            };

            ctx.emit_safepoint(Inst::TrapIf {
                trap_code,
                kind: CondBrKind::Cond(cond),
            });
        }

        Opcode::Safepoint => {
            panic!("safepoint instructions not used by new backend's safepoints!");
        }

        Opcode::Trapz | Opcode::Trapnz | Opcode::ResumableTrapnz => {
            panic!("trapz / trapnz / resumable_trapnz should have been removed by legalization!");
        }

        Opcode::FuncAddr => {
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let (extname, _) = ctx.call_target(insn).unwrap();
            let extname = extname.clone();
            ctx.emit(Inst::LoadExtName {
                rd,
                name: Box::new(extname),
                offset: 0,
            });
        }

        Opcode::GlobalValue => {
            panic!("global_value should have been removed by legalization!");
        }

        Opcode::SymbolValue => {
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let (extname, _, offset) = ctx.symbol_value(insn).unwrap();
            let extname = extname.clone();
            ctx.emit(Inst::LoadExtName {
                rd,
                name: Box::new(extname),
                offset,
            });
        }

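        // A call is emitted as a fixed sequence: pre-adjust SP for any stack
        // arguments, copy each argument into its ABI-assigned register or
        // stack slot, emit the call itself, copy the return values out, and
        // finally undo the SP adjustment.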
        Opcode::Call | Opcode::CallIndirect => {
            let caller_conv = ctx.abi().call_conv();
            let (mut abi, inputs) = match op {
                Opcode::Call => {
                    let (extname, dist) = ctx.call_target(insn).unwrap();
                    let extname = extname.clone();
                    let sig = ctx.call_sig(insn).unwrap();
                    assert!(inputs.len() == sig.params.len());
                    assert!(outputs.len() == sig.returns.len());
                    (
                        AArch64ABICaller::from_func(sig, &extname, dist, caller_conv, flags)?,
                        &inputs[..],
                    )
                }
                Opcode::CallIndirect => {
                    let ptr = put_input_in_reg(ctx, inputs[0], NarrowValueMode::ZeroExtend64);
                    let sig = ctx.call_sig(insn).unwrap();
                    assert!(inputs.len() - 1 == sig.params.len());
                    assert!(outputs.len() == sig.returns.len());
                    (
                        AArch64ABICaller::from_ptr(sig, ptr, op, caller_conv, flags)?,
                        &inputs[1..],
                    )
                }
                _ => unreachable!(),
            };

            abi.emit_stack_pre_adjust(ctx);
            assert!(inputs.len() == abi.num_args());
            for i in abi.get_copy_to_arg_order() {
                let input = inputs[i];
                let arg_regs = put_input_in_regs(ctx, input);
                abi.emit_copy_regs_to_arg(ctx, i, arg_regs);
            }
            abi.emit_call(ctx);
            for (i, output) in outputs.iter().enumerate() {
                let retval_regs = get_output_reg(ctx, *output);
                abi.emit_copy_retval_to_regs(ctx, i, retval_regs);
            }
            abi.emit_stack_post_adjust(ctx);
        }

        Opcode::GetPinnedReg => {
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            ctx.emit(Inst::gen_move(rd, xreg(PINNED_REG), I64));
        }

        Opcode::SetPinnedReg => {
            let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            ctx.emit(Inst::gen_move(writable_xreg(PINNED_REG), rm, I64));
        }

        Opcode::Spill
        | Opcode::Fill
        | Opcode::FillNop
        | Opcode::Regmove
        | Opcode::CopySpecial
        | Opcode::CopyToSsa
        | Opcode::CopyNop
        | Opcode::AdjustSpDown
        | Opcode::AdjustSpUpImm
        | Opcode::AdjustSpDownImm
        | Opcode::IfcmpSp
        | Opcode::Regspill
        | Opcode::Regfill => {
            panic!("Unused opcode should not be encountered.");
        }

        Opcode::Jump
        | Opcode::Fallthrough
        | Opcode::Brz
        | Opcode::Brnz
        | Opcode::BrIcmp
        | Opcode::Brif
        | Opcode::Brff
        | Opcode::IndirectJumpTableBr
        | Opcode::BrTable => {
            panic!("Branch opcode reached non-branch lowering logic!");
        }

        Opcode::Vconst => {
            let value = const_param_to_u128(ctx, insn).expect("Invalid immediate bytes");
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            lower_constant_f128(ctx, rd, value);
        }

        Opcode::RawBitcast => {
            let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let ty = ctx.input_ty(insn, 0);
            ctx.emit(Inst::gen_move(rd, rm, ty));
        }

        Opcode::Extractlane => {
            if let InstructionData::BinaryImm8 { imm, .. } = ctx.data(insn) {
                let idx = *imm;
                let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
                let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                let size = VectorSize::from_ty(ctx.input_ty(insn, 0));
                let ty = ty.unwrap();

                if ty_has_int_representation(ty) {
                    ctx.emit(Inst::MovFromVec { rd, rn, idx, size });
                // Plain moves are faster on some processors.
                } else if idx == 0 {
                    ctx.emit(Inst::gen_move(rd, rn, ty));
                } else {
                    ctx.emit(Inst::FpuMoveFromVec { rd, rn, idx, size });
                }
            } else {
                unreachable!();
            }
        }

        Opcode::Insertlane => {
            let idx = if let InstructionData::TernaryImm8 { imm, .. } = ctx.data(insn) {
                *imm
            } else {
                unreachable!();
            };
            let input_ty = ctx.input_ty(insn, 1);
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
            let ty = ty.unwrap();
            let size = VectorSize::from_ty(ty);

            ctx.emit(Inst::gen_move(rd, rm, ty));

            if ty_has_int_representation(input_ty) {
                ctx.emit(Inst::MovToVec { rd, rn, idx, size });
            } else {
                ctx.emit(Inst::VecMovElement {
                    rd,
                    rn,
                    dest_idx: idx,
                    src_idx: 0,
                    size,
                });
            }
        }

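        // Splat tries to match its input: a constant feeding the splat is
        // lowered directly via `lower_splat_const`, a load feeding the splat
        // is sunk into a single load-replicate (`VecLoadReplicate`), and
        // otherwise we fall back to a `dup` from a register.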
        Opcode::Splat => {
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let size = VectorSize::from_ty(ty.unwrap());

            if let Some((_, insn)) = maybe_input_insn_multi(
                ctx,
                inputs[0],
                &[
                    Opcode::Bconst,
                    Opcode::F32const,
                    Opcode::F64const,
                    Opcode::Iconst,
                ],
            ) {
                lower_splat_const(ctx, rd, ctx.get_constant(insn).unwrap(), size);
            } else if let Some(insn) =
                maybe_input_insn_via_conv(ctx, inputs[0], Opcode::Iconst, Opcode::Ireduce)
            {
                lower_splat_const(ctx, rd, ctx.get_constant(insn).unwrap(), size);
            } else if let Some(insn) =
                maybe_input_insn_via_conv(ctx, inputs[0], Opcode::Bconst, Opcode::Breduce)
            {
                lower_splat_const(ctx, rd, ctx.get_constant(insn).unwrap(), size);
            } else if let Some((_, insn)) = maybe_input_insn_multi(
                ctx,
                inputs[0],
                &[
                    Opcode::Uload8,
                    Opcode::Sload8,
                    Opcode::Uload16,
                    Opcode::Sload16,
                    Opcode::Uload32,
                    Opcode::Sload32,
                    Opcode::Load,
                ],
            ) {
                ctx.sink_inst(insn);
                let load_inputs = insn_inputs(ctx, insn);
                let load_outputs = insn_outputs(ctx, insn);
                lower_load(
                    ctx,
                    insn,
                    &load_inputs[..],
                    load_outputs[0],
                    |ctx, _rd, _elem_ty, mem| {
                        let tmp = ctx.alloc_tmp(I64).only_reg().unwrap();
                        let (addr, addr_inst) = Inst::gen_load_addr(tmp, mem);
                        if let Some(addr_inst) = addr_inst {
                            ctx.emit(addr_inst);
                        }
                        ctx.emit(Inst::VecLoadReplicate { rd, rn: addr, size });
                    },
                );
            } else {
                let input_ty = ctx.input_ty(insn, 0);
                let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                let inst = if ty_has_int_representation(input_ty) {
                    Inst::VecDup { rd, rn, size }
                } else {
                    Inst::VecDupFromFpu { rd, rn, size }
                };

                ctx.emit(inst);
            }
        }

        Opcode::ScalarToVector => {
            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let input_ty = ctx.input_ty(insn, 0);
            if (input_ty == I32 && ty.unwrap() == I32X4)
                || (input_ty == I64 && ty.unwrap() == I64X2)
            {
                ctx.emit(Inst::MovToFpu {
                    rd,
                    rn,
                    size: ScalarSize::from_ty(input_ty),
                });
            } else {
                return Err(CodegenError::Unsupported(format!(
                    "ScalarToVector: unsupported types {:?} -> {:?}",
                    input_ty, ty
                )));
            }
        }

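        // `vall_true` on I64X2 needs a special case: `uminv` has no 64-bit
        // lane form, so we instead compare the lanes against zero and fold
        // them with a pairwise add plus a NaN self-comparison trick, as
        // described below.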
        Opcode::VallTrue if ctx.input_ty(insn, 0) == I64X2 => {
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let tmp = ctx.alloc_tmp(I64X2).only_reg().unwrap();

            // cmeq vtmp.2d, vm.2d, #0
            // addp dtmp, vtmp.2d
            // fcmp dtmp, dtmp
            // cset xd, eq
            //
            // Note that after the ADDP the value of the temporary register will
            // be either 0 when all input elements are true, i.e. non-zero, or a
            // NaN otherwise (either -1 or -2 when represented as an integer);
            // NaNs are the only floating-point numbers that compare unequal to
            // themselves.

            ctx.emit(Inst::VecMisc {
                op: VecMisc2::Cmeq0,
                rd: tmp,
                rn: rm,
                size: VectorSize::Size64x2,
            });
            ctx.emit(Inst::VecRRPair {
                op: VecPairOp::Addp,
                rd: tmp,
                rn: tmp.to_reg(),
            });
            ctx.emit(Inst::FpuCmp64 {
                rn: tmp.to_reg(),
                rm: tmp.to_reg(),
            });
            materialize_bool_result(ctx, insn, rd, Cond::Eq);
        }

        Opcode::VanyTrue | Opcode::VallTrue => {
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let src_ty = ctx.input_ty(insn, 0);
            let tmp = ctx.alloc_tmp(src_ty).only_reg().unwrap();

            // This operation is implemented by using umaxp or uminv to
            // create a scalar value, which is then compared against zero.
            //
            // umaxp vn.16b, vm.16b, vm.16b / uminv bn, vm.16b
            // mov xm, vn.d[0]
            // cmp xm, #0
            // cset xm, ne

            let size = VectorSize::from_ty(ctx.input_ty(insn, 0));

            if op == Opcode::VanyTrue {
                ctx.emit(Inst::VecRRR {
                    alu_op: VecALUOp::Umaxp,
                    rd: tmp,
                    rn: rm,
                    rm: rm,
                    size,
                });
            } else {
                ctx.emit(Inst::VecLanes {
                    op: VecLanesOp::Uminv,
                    rd: tmp,
                    rn: rm,
                    size,
                });
            };

            ctx.emit(Inst::MovFromVec {
                rd,
                rn: tmp.to_reg(),
                idx: 0,
                size: VectorSize::Size64x2,
            });

            ctx.emit(Inst::AluRRImm12 {
                alu_op: ALUOp::SubS64,
                rd: writable_zero_reg(),
                rn: rd.to_reg(),
                imm12: Imm12::zero(),
            });

            materialize_bool_result(ctx, insn, rd, Cond::Ne);
        }

        Opcode::VhighBits => {
            let dst_r = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let src_v = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let ty = ctx.input_ty(insn, 0);
            // All three sequences use one integer temporary and two vector temporaries. The
            // shift is done early so as to give the register allocator the possibility of using
            // the same reg for `tmp_v1` and `src_v` in the case that this is the last use of
            // `src_v`. See https://github.com/WebAssembly/simd/pull/201 for the background and
            // derivation of these sequences. Alternative sequences are discussed in
            // https://github.com/bytecodealliance/wasmtime/issues/2296, although they are not
            // used here.
            let tmp_r0 = ctx.alloc_tmp(I64).only_reg().unwrap();
            let tmp_v0 = ctx.alloc_tmp(I8X16).only_reg().unwrap();
            let tmp_v1 = ctx.alloc_tmp(I8X16).only_reg().unwrap();
            match ty {
                I8X16 => {
                    // sshr tmp_v1.16b, src_v.16b, #7
                    // mov tmp_r0, #0x0201
                    // movk tmp_r0, #0x0804, lsl 16
                    // movk tmp_r0, #0x2010, lsl 32
                    // movk tmp_r0, #0x8040, lsl 48
                    // dup tmp_v0.2d, tmp_r0
                    // and tmp_v1.16b, tmp_v1.16b, tmp_v0.16b
                    // ext tmp_v0.16b, tmp_v1.16b, tmp_v1.16b, #8
                    // zip1 tmp_v0.16b, tmp_v1.16b, tmp_v0.16b
                    // addv tmp_v0h, tmp_v0.8h
                    // mov dst_r, tmp_v0.h[0]
                    ctx.emit(Inst::VecShiftImm {
                        op: VecShiftImmOp::Sshr,
                        rd: tmp_v1,
                        rn: src_v,
                        size: VectorSize::Size8x16,
                        imm: 7,
                    });
                    lower_splat_const(ctx, tmp_v0, 0x8040201008040201u64, VectorSize::Size64x2);
                    ctx.emit(Inst::VecRRR {
                        alu_op: VecALUOp::And,
                        rd: tmp_v1,
                        rn: tmp_v1.to_reg(),
                        rm: tmp_v0.to_reg(),
                        size: VectorSize::Size8x16,
                    });
                    ctx.emit(Inst::VecExtract {
                        rd: tmp_v0,
                        rn: tmp_v1.to_reg(),
                        rm: tmp_v1.to_reg(),
                        imm4: 8,
                    });
                    ctx.emit(Inst::VecRRR {
                        alu_op: VecALUOp::Zip1,
                        rd: tmp_v0,
                        rn: tmp_v1.to_reg(),
                        rm: tmp_v0.to_reg(),
                        size: VectorSize::Size8x16,
                    });
                    ctx.emit(Inst::VecLanes {
                        op: VecLanesOp::Addv,
                        rd: tmp_v0,
                        rn: tmp_v0.to_reg(),
                        size: VectorSize::Size16x8,
                    });
                    ctx.emit(Inst::MovFromVec {
                        rd: dst_r,
                        rn: tmp_v0.to_reg(),
                        idx: 0,
                        size: VectorSize::Size16x8,
                    });
                }
                I16X8 => {
                    // sshr tmp_v1.8h, src_v.8h, #15
                    // mov tmp_r0, #0x1
                    // movk tmp_r0, #0x2, lsl 16
                    // movk tmp_r0, #0x4, lsl 32
                    // movk tmp_r0, #0x8, lsl 48
                    // dup tmp_v0.2d, tmp_r0
                    // shl tmp_r0, tmp_r0, #4
                    // mov tmp_v0.d[1], tmp_r0
                    // and tmp_v0.16b, tmp_v1.16b, tmp_v0.16b
                    // addv tmp_v0h, tmp_v0.8h
                    // mov dst_r, tmp_v0.h[0]
                    ctx.emit(Inst::VecShiftImm {
                        op: VecShiftImmOp::Sshr,
                        rd: tmp_v1,
                        rn: src_v,
                        size: VectorSize::Size16x8,
                        imm: 15,
                    });
                    lower_constant_u64(ctx, tmp_r0, 0x0008000400020001u64);
                    ctx.emit(Inst::VecDup {
                        rd: tmp_v0,
                        rn: tmp_r0.to_reg(),
                        size: VectorSize::Size64x2,
                    });
                    ctx.emit(Inst::AluRRImmShift {
                        alu_op: ALUOp::Lsl64,
                        rd: tmp_r0,
                        rn: tmp_r0.to_reg(),
                        immshift: ImmShift { imm: 4 },
                    });
                    ctx.emit(Inst::MovToVec {
                        rd: tmp_v0,
                        rn: tmp_r0.to_reg(),
                        idx: 1,
                        size: VectorSize::Size64x2,
                    });
                    ctx.emit(Inst::VecRRR {
                        alu_op: VecALUOp::And,
                        rd: tmp_v0,
                        rn: tmp_v1.to_reg(),
                        rm: tmp_v0.to_reg(),
                        size: VectorSize::Size8x16,
                    });
                    ctx.emit(Inst::VecLanes {
                        op: VecLanesOp::Addv,
                        rd: tmp_v0,
                        rn: tmp_v0.to_reg(),
                        size: VectorSize::Size16x8,
                    });
                    ctx.emit(Inst::MovFromVec {
                        rd: dst_r,
                        rn: tmp_v0.to_reg(),
                        idx: 0,
                        size: VectorSize::Size16x8,
                    });
                }
                I32X4 => {
                    // sshr tmp_v1.4s, src_v.4s, #31
                    // mov tmp_r0, #0x1
                    // movk tmp_r0, #0x2, lsl 32
                    // dup tmp_v0.2d, tmp_r0
                    // shl tmp_r0, tmp_r0, #2
                    // mov tmp_v0.d[1], tmp_r0
                    // and tmp_v0.16b, tmp_v1.16b, tmp_v0.16b
                    // addv tmp_v0s, tmp_v0.4s
                    // mov dst_r, tmp_v0.s[0]
                    ctx.emit(Inst::VecShiftImm {
                        op: VecShiftImmOp::Sshr,
                        rd: tmp_v1,
                        rn: src_v,
                        size: VectorSize::Size32x4,
                        imm: 31,
                    });
                    lower_constant_u64(ctx, tmp_r0, 0x0000000200000001u64);
                    ctx.emit(Inst::VecDup {
                        rd: tmp_v0,
                        rn: tmp_r0.to_reg(),
                        size: VectorSize::Size64x2,
                    });
                    ctx.emit(Inst::AluRRImmShift {
                        alu_op: ALUOp::Lsl64,
                        rd: tmp_r0,
                        rn: tmp_r0.to_reg(),
                        immshift: ImmShift { imm: 2 },
                    });
                    ctx.emit(Inst::MovToVec {
                        rd: tmp_v0,
                        rn: tmp_r0.to_reg(),
                        idx: 1,
                        size: VectorSize::Size64x2,
                    });
                    ctx.emit(Inst::VecRRR {
                        alu_op: VecALUOp::And,
                        rd: tmp_v0,
                        rn: tmp_v1.to_reg(),
                        rm: tmp_v0.to_reg(),
                        size: VectorSize::Size8x16,
                    });
                    ctx.emit(Inst::VecLanes {
                        op: VecLanesOp::Addv,
                        rd: tmp_v0,
                        rn: tmp_v0.to_reg(),
                        size: VectorSize::Size32x4,
                    });
                    ctx.emit(Inst::MovFromVec {
                        rd: dst_r,
                        rn: tmp_v0.to_reg(),
                        idx: 0,
                        size: VectorSize::Size32x4,
                    });
                }
                I64X2 => {
                    // mov dst_r, src_v.d[0]
                    // mov tmp_r0, src_v.d[1]
                    // lsr dst_r, dst_r, #63
                    // lsr tmp_r0, tmp_r0, #63
                    // add dst_r, dst_r, tmp_r0, lsl #1
                    ctx.emit(Inst::MovFromVec {
                        rd: dst_r,
                        rn: src_v,
                        idx: 0,
                        size: VectorSize::Size64x2,
                    });
                    ctx.emit(Inst::MovFromVec {
                        rd: tmp_r0,
                        rn: src_v,
                        idx: 1,
                        size: VectorSize::Size64x2,
                    });
                    ctx.emit(Inst::AluRRImmShift {
                        alu_op: ALUOp::Lsr64,
                        rd: dst_r,
                        rn: dst_r.to_reg(),
                        immshift: ImmShift::maybe_from_u64(63).unwrap(),
                    });
                    ctx.emit(Inst::AluRRImmShift {
                        alu_op: ALUOp::Lsr64,
                        rd: tmp_r0,
                        rn: tmp_r0.to_reg(),
                        immshift: ImmShift::maybe_from_u64(63).unwrap(),
                    });
                    ctx.emit(Inst::AluRRRShift {
                        alu_op: ALUOp::Add32,
                        rd: dst_r,
                        rn: dst_r.to_reg(),
                        rm: tmp_r0.to_reg(),
                        shiftop: ShiftOpAndAmt::new(
                            ShiftOp::LSL,
                            ShiftOpShiftImm::maybe_from_shift(1).unwrap(),
                        ),
                    });
                }
                _ => panic!("arm64 isel: VhighBits unhandled, ty = {:?}", ty),
            }
        }

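        // A shuffle is lowered to a two-register table lookup (`tbl`): the
        // 16-byte immediate mask, materialized into the destination register,
        // selects bytes from the concatenation of the two input vectors.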
        Opcode::Shuffle => {
            let mask = const_param_to_u128(ctx, insn).expect("Invalid immediate mask bytes");
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let rn2 = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
            // 2 register table vector lookups require consecutive table registers;
            // we satisfy this constraint by hardcoding the usage of v29 and v30.
            let temp = writable_vreg(29);
            let temp2 = writable_vreg(30);
            let input_ty = ctx.input_ty(insn, 0);
            assert_eq!(input_ty, ctx.input_ty(insn, 1));
            // Make sure that both inputs are in virtual registers, since it is
            // not guaranteed that we can get them safely to the temporaries if
            // either is in a real register.
            let rn = ctx.ensure_in_vreg(rn, input_ty);
            let rn2 = ctx.ensure_in_vreg(rn2, input_ty);

            lower_constant_f128(ctx, rd, mask);
            ctx.emit(Inst::gen_move(temp, rn, input_ty));
            ctx.emit(Inst::gen_move(temp2, rn2, input_ty));
            ctx.emit(Inst::VecTbl2 {
                rd,
                rn: temp.to_reg(),
                rn2: temp2.to_reg(),
                rm: rd.to_reg(),
                is_extension: false,
            });
        }

        Opcode::Swizzle => {
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);

            ctx.emit(Inst::VecTbl {
                rd,
                rn,
                rm,
                is_extension: false,
            });
        }

        Opcode::Vsplit | Opcode::Vconcat => {
            // TODO
            panic!("Vector ops not implemented.");
        }

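        // An I128 value lives in a pair of 64-bit registers (lo, hi), so
        // isplit and iconcat reduce to plain register moves.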
        Opcode::Isplit => {
            assert_eq!(
                ctx.input_ty(insn, 0),
                I128,
                "Isplit only implemented for i128's"
            );
            assert_eq!(ctx.output_ty(insn, 0), I64);
            assert_eq!(ctx.output_ty(insn, 1), I64);

            let src_regs = put_input_in_regs(ctx, inputs[0]);
            let dst_lo = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let dst_hi = get_output_reg(ctx, outputs[1]).only_reg().unwrap();

            ctx.emit(Inst::gen_move(dst_lo, src_regs.regs()[0], I64));
            ctx.emit(Inst::gen_move(dst_hi, src_regs.regs()[1], I64));
        }

        Opcode::Iconcat => {
            assert_eq!(
                ctx.output_ty(insn, 0),
                I128,
                "Iconcat only implemented for i128's"
            );
            assert_eq!(ctx.input_ty(insn, 0), I64);
            assert_eq!(ctx.input_ty(insn, 1), I64);

            let src_lo = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let src_hi = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
            let dst = get_output_reg(ctx, outputs[0]);

            ctx.emit(Inst::gen_move(dst.regs()[0], src_lo, I64));
            ctx.emit(Inst::gen_move(dst.regs()[1], src_hi, I64));
        }

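        // Integer min/max are lowered as vector ops only here; a scalar type
        // reaching this arm would be rejected by `VectorSize::from_ty`.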
        Opcode::Imax | Opcode::Umax | Opcode::Umin | Opcode::Imin => {
            let alu_op = match op {
                Opcode::Umin => VecALUOp::Umin,
                Opcode::Imin => VecALUOp::Smin,
                Opcode::Umax => VecALUOp::Umax,
                Opcode::Imax => VecALUOp::Smax,
                _ => unreachable!(),
            };
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
            let ty = ty.unwrap();
            ctx.emit(Inst::VecRRR {
                alu_op,
                rd,
                rn,
                rm,
                size: VectorSize::from_ty(ty),
            });
        }

        Opcode::WideningPairwiseDotProductS => {
            let r_y = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let r_a = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let r_b = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
            let ty = ty.unwrap();
            if ty == I32X4 {
                let tmp = ctx.alloc_tmp(I8X16).only_reg().unwrap();
                // The args have type I16X8.
                // "y = i32x4.dot_i16x8_s(a, b)"
                // => smull tmp, a, b
                //    smull2 y, a, b
                //    addp y, tmp, y
                ctx.emit(Inst::VecRRR {
                    alu_op: VecALUOp::Smull,
                    rd: tmp,
                    rn: r_a,
                    rm: r_b,
                    size: VectorSize::Size16x8,
                });
                ctx.emit(Inst::VecRRR {
                    alu_op: VecALUOp::Smull2,
                    rd: r_y,
                    rn: r_a,
                    rm: r_b,
                    size: VectorSize::Size16x8,
                });
                ctx.emit(Inst::VecRRR {
                    alu_op: VecALUOp::Addp,
                    rd: r_y,
                    rn: tmp.to_reg(),
                    rm: r_y.to_reg(),
                    size: VectorSize::Size32x4,
                });
            } else {
                return Err(CodegenError::Unsupported(format!(
                    "Opcode::WideningPairwiseDotProductS: unsupported laneage: {:?}",
                    ty
                )));
            }
        }

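        // Scalar FP binary ops map 1:1 onto `FpuRRR`; vector ones map onto
        // `VecRRR`, with the lane arrangement taken from the result type.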
        Opcode::Fadd | Opcode::Fsub | Opcode::Fmul | Opcode::Fdiv | Opcode::Fmin | Opcode::Fmax => {
            let ty = ty.unwrap();
            let bits = ty_bits(ty);
            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            if !ty.is_vector() {
                let fpu_op = match (op, bits) {
                    (Opcode::Fadd, 32) => FPUOp2::Add32,
                    (Opcode::Fadd, 64) => FPUOp2::Add64,
                    (Opcode::Fsub, 32) => FPUOp2::Sub32,
                    (Opcode::Fsub, 64) => FPUOp2::Sub64,
                    (Opcode::Fmul, 32) => FPUOp2::Mul32,
                    (Opcode::Fmul, 64) => FPUOp2::Mul64,
                    (Opcode::Fdiv, 32) => FPUOp2::Div32,
                    (Opcode::Fdiv, 64) => FPUOp2::Div64,
                    (Opcode::Fmin, 32) => FPUOp2::Min32,
                    (Opcode::Fmin, 64) => FPUOp2::Min64,
                    (Opcode::Fmax, 32) => FPUOp2::Max32,
                    (Opcode::Fmax, 64) => FPUOp2::Max64,
                    _ => panic!("Unknown op/bits combination"),
                };
                ctx.emit(Inst::FpuRRR { fpu_op, rd, rn, rm });
            } else {
                let alu_op = match op {
                    Opcode::Fadd => VecALUOp::Fadd,
                    Opcode::Fsub => VecALUOp::Fsub,
                    Opcode::Fdiv => VecALUOp::Fdiv,
                    Opcode::Fmax => VecALUOp::Fmax,
                    Opcode::Fmin => VecALUOp::Fmin,
                    Opcode::Fmul => VecALUOp::Fmul,
                    _ => unreachable!(),
                };

                ctx.emit(Inst::VecRRR {
                    rd,
                    rn,
                    rm,
                    alu_op,
                    size: VectorSize::from_ty(ty),
                });
            }
        }

        Opcode::FminPseudo | Opcode::FmaxPseudo => {
            let ty = ctx.input_ty(insn, 0);
            if ty == F32X4 || ty == F64X2 {
                // pmin(a,b) => bitsel(b, a, cmpgt(a, b))
                // pmax(a,b) => bitsel(b, a, cmpgt(b, a))
                let r_dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
                let r_a = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                let r_b = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
                // Since we're going to write the output register `r_dst` anyway, we might as
                // well first use it to hold the comparison result. This has the slightly unusual
                // effect that we modify the output register in the first instruction (`fcmgt`)
                // but read both the inputs again in the second instruction (`bsl`), which means
                // that the output register can't be either of the input registers. Regalloc
                // should handle this correctly, nevertheless.
                ctx.emit(Inst::VecRRR {
                    alu_op: VecALUOp::Fcmgt,
                    rd: r_dst,
                    rn: if op == Opcode::FminPseudo { r_a } else { r_b },
                    rm: if op == Opcode::FminPseudo { r_b } else { r_a },
                    size: if ty == F32X4 {
                        VectorSize::Size32x4
                    } else {
                        VectorSize::Size64x2
                    },
                });
                ctx.emit(Inst::VecRRR {
                    alu_op: VecALUOp::Bsl,
                    rd: r_dst,
                    rn: r_b,
                    rm: r_a,
                    size: VectorSize::Size8x16,
                });
            } else {
                panic!("Opcode::FminPseudo | Opcode::FmaxPseudo: unhandled type");
            }
        }

        Opcode::Sqrt | Opcode::Fneg | Opcode::Fabs | Opcode::Fpromote | Opcode::Fdemote => {
            let ty = ty.unwrap();
            let bits = ty_bits(ty);
            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            if !ty.is_vector() {
                let fpu_op = match (op, bits) {
                    (Opcode::Sqrt, 32) => FPUOp1::Sqrt32,
                    (Opcode::Sqrt, 64) => FPUOp1::Sqrt64,
                    (Opcode::Fneg, 32) => FPUOp1::Neg32,
                    (Opcode::Fneg, 64) => FPUOp1::Neg64,
                    (Opcode::Fabs, 32) => FPUOp1::Abs32,
                    (Opcode::Fabs, 64) => FPUOp1::Abs64,
                    (Opcode::Fpromote, 32) => panic!("Cannot promote to 32 bits"),
                    (Opcode::Fpromote, 64) => FPUOp1::Cvt32To64,
                    (Opcode::Fdemote, 32) => FPUOp1::Cvt64To32,
                    (Opcode::Fdemote, 64) => panic!("Cannot demote to 64 bits"),
                    _ => panic!("Unknown op/bits combination"),
                };
                ctx.emit(Inst::FpuRR { fpu_op, rd, rn });
            } else {
                let op = match op {
                    Opcode::Fabs => VecMisc2::Fabs,
                    Opcode::Fneg => VecMisc2::Fneg,
                    Opcode::Sqrt => VecMisc2::Fsqrt,
                    _ => unimplemented!(),
                };

                ctx.emit(Inst::VecMisc {
                    op,
                    rd,
                    rn,
                    size: VectorSize::from_ty(ty),
                });
            }
        }

        Opcode::Ceil | Opcode::Floor | Opcode::Trunc | Opcode::Nearest => {
            let ty = ctx.output_ty(insn, 0);
            if !ty.is_vector() {
                let bits = ty_bits(ty);
                let op = match (op, bits) {
                    (Opcode::Ceil, 32) => FpuRoundMode::Plus32,
                    (Opcode::Ceil, 64) => FpuRoundMode::Plus64,
                    (Opcode::Floor, 32) => FpuRoundMode::Minus32,
                    (Opcode::Floor, 64) => FpuRoundMode::Minus64,
                    (Opcode::Trunc, 32) => FpuRoundMode::Zero32,
                    (Opcode::Trunc, 64) => FpuRoundMode::Zero64,
                    (Opcode::Nearest, 32) => FpuRoundMode::Nearest32,
                    (Opcode::Nearest, 64) => FpuRoundMode::Nearest64,
                    _ => panic!("Unknown op/bits combination (scalar)"),
                };
                let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
                ctx.emit(Inst::FpuRound { op, rd, rn });
            } else {
                let (op, size) = match (op, ty) {
                    (Opcode::Ceil, F32X4) => (VecMisc2::Frintp, VectorSize::Size32x4),
                    (Opcode::Ceil, F64X2) => (VecMisc2::Frintp, VectorSize::Size64x2),
                    (Opcode::Floor, F32X4) => (VecMisc2::Frintm, VectorSize::Size32x4),
                    (Opcode::Floor, F64X2) => (VecMisc2::Frintm, VectorSize::Size64x2),
                    (Opcode::Trunc, F32X4) => (VecMisc2::Frintz, VectorSize::Size32x4),
                    (Opcode::Trunc, F64X2) => (VecMisc2::Frintz, VectorSize::Size64x2),
                    (Opcode::Nearest, F32X4) => (VecMisc2::Frintn, VectorSize::Size32x4),
                    (Opcode::Nearest, F64X2) => (VecMisc2::Frintn, VectorSize::Size64x2),
                    _ => panic!("Unknown op/ty combination (vector){:?}", ty),
                };
                let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
                let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
                ctx.emit(Inst::VecMisc { op, rd, rn, size });
            }
        }

        Opcode::Fma => {
            let bits = ty_bits(ctx.output_ty(insn, 0));
            let fpu_op = match bits {
                32 => FPUOp3::MAdd32,
                64 => FPUOp3::MAdd64,
                _ => panic!("Unknown op size"),
            };
            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
            let ra = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None);
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            ctx.emit(Inst::FpuRRRR {
                fpu_op,
                rn,
                rm,
                ra,
                rd,
            });
        }

        Opcode::Fcopysign => {
            // Copy the sign bit from inputs[1] to inputs[0]. This is a scalar
            // fcopysign: it uses scalar NEON operations for 64 bits and vector
            // operations (2S) for 32 bits; in the latter case it still sets
            // all bits except the lowest 32 to 0. The sequence is:
            //
            // mov vd, vn
            // ushr vtmp, vm, #63 / #31
            // sli vd, vtmp, #63 / #31

            let ty = ctx.output_ty(insn, 0);
            let bits = ty_bits(ty) as u8;
            assert!(bits == 32 || bits == 64);
            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let tmp = ctx.alloc_tmp(F64).only_reg().unwrap();

            // Copy LHS to rd.
            ctx.emit(Inst::gen_move(rd, rn, ty));

            // Copy the sign bit to the lowest bit in tmp.
            let imm = FPURightShiftImm::maybe_from_u8(bits - 1, bits).unwrap();
            ctx.emit(Inst::FpuRRI {
                fpu_op: choose_32_64(ty, FPUOpRI::UShr32(imm), FPUOpRI::UShr64(imm)),
                rd: tmp,
                rn: rm,
            });

            // Insert the bit from tmp into the sign bit of rd.
            let imm = FPULeftShiftImm::maybe_from_u8(bits - 1, bits).unwrap();
            ctx.emit(Inst::FpuRRI {
                fpu_op: choose_32_64(ty, FPUOpRI::Sli32(imm), FPUOpRI::Sli64(imm)),
                rd,
                rn: tmp.to_reg(),
            });
        }

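        // Trapping float-to-int conversion: check for NaN first
        // (BadConversionToInteger), then range-check the input against the
        // nearest representable bounds (IntegerOverflow), and only then
        // perform the actual conversion.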
        Opcode::FcvtToUint | Opcode::FcvtToSint => {
            let in_bits = ty_bits(ctx.input_ty(insn, 0));
            let out_bits = ty_bits(ctx.output_ty(insn, 0));
            let signed = op == Opcode::FcvtToSint;
            let op = match (signed, in_bits, out_bits) {
                (false, 32, 8) | (false, 32, 16) | (false, 32, 32) => FpuToIntOp::F32ToU32,
                (true, 32, 8) | (true, 32, 16) | (true, 32, 32) => FpuToIntOp::F32ToI32,
                (false, 32, 64) => FpuToIntOp::F32ToU64,
                (true, 32, 64) => FpuToIntOp::F32ToI64,
                (false, 64, 8) | (false, 64, 16) | (false, 64, 32) => FpuToIntOp::F64ToU32,
                (true, 64, 8) | (true, 64, 16) | (true, 64, 32) => FpuToIntOp::F64ToI32,
                (false, 64, 64) => FpuToIntOp::F64ToU64,
                (true, 64, 64) => FpuToIntOp::F64ToI64,
                _ => panic!("Unknown input/output-bits combination"),
            };

            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();

            // First, check the input: it's important to perform the NaN check
            // before the in-bounds check, per wasm semantics.

            // Check that the input is not a NaN.
            if in_bits == 32 {
                ctx.emit(Inst::FpuCmp32 { rn, rm: rn });
            } else {
                ctx.emit(Inst::FpuCmp64 { rn, rm: rn });
            }
            let trap_code = TrapCode::BadConversionToInteger;
            ctx.emit(Inst::TrapIf {
                trap_code,
                kind: CondBrKind::Cond(lower_fp_condcode(FloatCC::Unordered)),
            });

            let tmp = ctx.alloc_tmp(I8X16).only_reg().unwrap();

            // Check that the input is in range, with "truncate towards zero" semantics. This means
            // we allow values that are slightly out of range:
            // - for signed conversions, we allow values strictly greater than INT_MIN-1 (when this
            //   can be represented), and strictly less than INT_MAX+1 (when this can be
            //   represented).
            // - for unsigned conversions, we allow values strictly greater than -1, and strictly
            //   less than UINT_MAX+1 (when this can be represented).

            if in_bits == 32 {
                // From float32.
                let (low_bound, low_cond, high_bound) = match (signed, out_bits) {
                    (true, 8) => (
                        i8::min_value() as f32 - 1.,
                        FloatCC::GreaterThan,
                        i8::max_value() as f32 + 1.,
                    ),
                    (true, 16) => (
                        i16::min_value() as f32 - 1.,
                        FloatCC::GreaterThan,
                        i16::max_value() as f32 + 1.,
                    ),
                    (true, 32) => (
                        i32::min_value() as f32, // I32_MIN - 1 isn't precisely representable as an f32.
                        FloatCC::GreaterThanOrEqual,
                        i32::max_value() as f32 + 1.,
                    ),
                    (true, 64) => (
                        i64::min_value() as f32, // I64_MIN - 1 isn't precisely representable as an f32.
                        FloatCC::GreaterThanOrEqual,
                        i64::max_value() as f32 + 1.,
                    ),
                    (false, 8) => (-1., FloatCC::GreaterThan, u8::max_value() as f32 + 1.),
                    (false, 16) => (-1., FloatCC::GreaterThan, u16::max_value() as f32 + 1.),
                    (false, 32) => (-1., FloatCC::GreaterThan, u32::max_value() as f32 + 1.),
                    (false, 64) => (-1., FloatCC::GreaterThan, u64::max_value() as f32 + 1.),
                    _ => panic!("Unknown input/output-bits combination"),
                };

                // >= low_bound
                lower_constant_f32(ctx, tmp, low_bound);
                ctx.emit(Inst::FpuCmp32 {
                    rn,
                    rm: tmp.to_reg(),
                });
                let trap_code = TrapCode::IntegerOverflow;
                ctx.emit(Inst::TrapIf {
                    trap_code,
                    kind: CondBrKind::Cond(lower_fp_condcode(low_cond).invert()),
                });

                // <= high_bound
                lower_constant_f32(ctx, tmp, high_bound);
                ctx.emit(Inst::FpuCmp32 {
                    rn,
                    rm: tmp.to_reg(),
                });
                let trap_code = TrapCode::IntegerOverflow;
                ctx.emit(Inst::TrapIf {
                    trap_code,
                    kind: CondBrKind::Cond(lower_fp_condcode(FloatCC::LessThan).invert()),
                });
            } else {
                // From float64.
                let (low_bound, low_cond, high_bound) = match (signed, out_bits) {
                    (true, 8) => (
                        i8::min_value() as f64 - 1.,
                        FloatCC::GreaterThan,
                        i8::max_value() as f64 + 1.,
                    ),
                    (true, 16) => (
                        i16::min_value() as f64 - 1.,
                        FloatCC::GreaterThan,
                        i16::max_value() as f64 + 1.,
                    ),
                    (true, 32) => (
                        i32::min_value() as f64 - 1.,
                        FloatCC::GreaterThan,
                        i32::max_value() as f64 + 1.,
                    ),
                    (true, 64) => (
                        i64::min_value() as f64, // I64_MIN - 1 isn't precisely representable as an f64.
                        FloatCC::GreaterThanOrEqual,
                        i64::max_value() as f64 + 1.,
                    ),
                    (false, 8) => (-1., FloatCC::GreaterThan, u8::max_value() as f64 + 1.),
                    (false, 16) => (-1., FloatCC::GreaterThan, u16::max_value() as f64 + 1.),
                    (false, 32) => (-1., FloatCC::GreaterThan, u32::max_value() as f64 + 1.),
                    (false, 64) => (-1., FloatCC::GreaterThan, u64::max_value() as f64 + 1.),
                    _ => panic!("Unknown input/output-bits combination"),
                };

                // >= low_bound
                lower_constant_f64(ctx, tmp, low_bound);
                ctx.emit(Inst::FpuCmp64 {
                    rn,
                    rm: tmp.to_reg(),
                });
                let trap_code = TrapCode::IntegerOverflow;
                ctx.emit(Inst::TrapIf {
                    trap_code,
                    kind: CondBrKind::Cond(lower_fp_condcode(low_cond).invert()),
                });

                // <= high_bound
                lower_constant_f64(ctx, tmp, high_bound);
                ctx.emit(Inst::FpuCmp64 {
                    rn,
                    rm: tmp.to_reg(),
                });
                let trap_code = TrapCode::IntegerOverflow;
                ctx.emit(Inst::TrapIf {
                    trap_code,
                    kind: CondBrKind::Cond(lower_fp_condcode(FloatCC::LessThan).invert()),
                });
            };

            // Do the conversion.
            ctx.emit(Inst::FpuToInt { op, rd, rn });
        }

        Opcode::FcvtFromUint | Opcode::FcvtFromSint => {
            let ty = ty.unwrap();
            let signed = op == Opcode::FcvtFromSint;
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();

            if ty.is_vector() {
                let op = if signed {
                    VecMisc2::Scvtf
                } else {
                    VecMisc2::Ucvtf
                };
                let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);

                ctx.emit(Inst::VecMisc {
                    op,
                    rd,
                    rn,
                    size: VectorSize::from_ty(ty),
                });
            } else {
                let in_bits = ty_bits(ctx.input_ty(insn, 0));
                let out_bits = ty_bits(ty);
                let op = match (signed, in_bits, out_bits) {
                    (false, 8, 32) | (false, 16, 32) | (false, 32, 32) => IntToFpuOp::U32ToF32,
                    (true, 8, 32) | (true, 16, 32) | (true, 32, 32) => IntToFpuOp::I32ToF32,
                    (false, 8, 64) | (false, 16, 64) | (false, 32, 64) => IntToFpuOp::U32ToF64,
                    (true, 8, 64) | (true, 16, 64) | (true, 32, 64) => IntToFpuOp::I32ToF64,
                    (false, 64, 32) => IntToFpuOp::U64ToF32,
                    (true, 64, 32) => IntToFpuOp::I64ToF32,
                    (false, 64, 64) => IntToFpuOp::U64ToF64,
                    (true, 64, 64) => IntToFpuOp::I64ToF64,
                    _ => panic!("Unknown input/output-bits combination"),
                };
                let narrow_mode = match (signed, in_bits) {
                    (false, 8) | (false, 16) | (false, 32) => NarrowValueMode::ZeroExtend32,
                    (true, 8) | (true, 16) | (true, 32) => NarrowValueMode::SignExtend32,
                    (false, 64) => NarrowValueMode::ZeroExtend64,
                    (true, 64) => NarrowValueMode::SignExtend64,
                    _ => panic!("Unknown input size"),
                };
                let rn = put_input_in_reg(ctx, inputs[0], narrow_mode);
                ctx.emit(Inst::IntToFpu { op, rd, rn });
            }
        }

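        // Saturating float-to-int (scalar case): clamp the input into the
        // target range with fmin/fmax, turn a NaN input into zero with
        // fcmp + fcsel, then convert. Unlike the trapping variant above,
        // this sequence never traps.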
        Opcode::FcvtToUintSat | Opcode::FcvtToSintSat => {
            let ty = ty.unwrap();
            let out_signed = op == Opcode::FcvtToSintSat;
            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();

            if ty.is_vector() {
                let op = if out_signed {
                    VecMisc2::Fcvtzs
                } else {
                    VecMisc2::Fcvtzu
                };

                ctx.emit(Inst::VecMisc {
                    op,
                    rd,
                    rn,
                    size: VectorSize::from_ty(ty),
                });
            } else {
                let in_ty = ctx.input_ty(insn, 0);
                let in_bits = ty_bits(in_ty);
                let out_bits = ty_bits(ty);
                // FIMM Vtmp1, u32::MAX or u64::MAX or i32::MAX or i64::MAX
                // FMIN Vtmp2, Vin, Vtmp1
                // FIMM Vtmp1, 0 or 0 or i32::MIN or i64::MIN
                // FMAX Vtmp2, Vtmp2, Vtmp1
                // (if signed) FIMM Vtmp1, 0
                // FCMP Vin, Vin
                // FCSEL Vtmp2, Vtmp1, Vtmp2, NE // on NaN, select 0
                // convert Rout, Vtmp2

                assert!(in_bits == 32 || in_bits == 64);
                assert!(out_bits == 32 || out_bits == 64);

                let min: f64 = match (out_bits, out_signed) {
                    (32, true) => std::i32::MIN as f64,
                    (32, false) => 0.0,
                    (64, true) => std::i64::MIN as f64,
                    (64, false) => 0.0,
                    _ => unreachable!(),
                };

                let max = match (out_bits, out_signed) {
                    (32, true) => std::i32::MAX as f64,
                    (32, false) => std::u32::MAX as f64,
                    (64, true) => std::i64::MAX as f64,
                    (64, false) => std::u64::MAX as f64,
                    _ => unreachable!(),
                };

                let rtmp1 = ctx.alloc_tmp(in_ty).only_reg().unwrap();
                let rtmp2 = ctx.alloc_tmp(in_ty).only_reg().unwrap();

                if in_bits == 32 {
                    lower_constant_f32(ctx, rtmp1, max as f32);
                } else {
                    lower_constant_f64(ctx, rtmp1, max);
                }
                ctx.emit(Inst::FpuRRR {
                    fpu_op: choose_32_64(in_ty, FPUOp2::Min32, FPUOp2::Min64),
                    rd: rtmp2,
                    rn: rn,
                    rm: rtmp1.to_reg(),
                });
                if in_bits == 32 {
                    lower_constant_f32(ctx, rtmp1, min as f32);
                } else {
                    lower_constant_f64(ctx, rtmp1, min);
                }
                ctx.emit(Inst::FpuRRR {
                    fpu_op: choose_32_64(in_ty, FPUOp2::Max32, FPUOp2::Max64),
                    rd: rtmp2,
                    rn: rtmp2.to_reg(),
                    rm: rtmp1.to_reg(),
                });
                if out_signed {
                    if in_bits == 32 {
                        lower_constant_f32(ctx, rtmp1, 0.0);
                    } else {
                        lower_constant_f64(ctx, rtmp1, 0.0);
                    }
                }
                if in_bits == 32 {
                    ctx.emit(Inst::FpuCmp32 { rn: rn, rm: rn });
                    ctx.emit(Inst::FpuCSel32 {
                        rd: rtmp2,
                        rn: rtmp1.to_reg(),
                        rm: rtmp2.to_reg(),
                        cond: Cond::Ne,
                    });
                } else {
                    ctx.emit(Inst::FpuCmp64 { rn: rn, rm: rn });
                    ctx.emit(Inst::FpuCSel64 {
                        rd: rtmp2,
                        rn: rtmp1.to_reg(),
                        rm: rtmp2.to_reg(),
                        cond: Cond::Ne,
                    });
                }

                let cvt = match (in_bits, out_bits, out_signed) {
                    (32, 32, false) => FpuToIntOp::F32ToU32,
                    (32, 32, true) => FpuToIntOp::F32ToI32,
                    (32, 64, false) => FpuToIntOp::F32ToU64,
                    (32, 64, true) => FpuToIntOp::F32ToI64,
                    (64, 32, false) => FpuToIntOp::F64ToU32,
                    (64, 32, true) => FpuToIntOp::F64ToI32,
                    (64, 64, false) => FpuToIntOp::F64ToU64,
                    (64, 64, true) => FpuToIntOp::F64ToI64,
                    _ => unreachable!(),
                };
                ctx.emit(Inst::FpuToInt {
                    op: cvt,
                    rd,
                    rn: rtmp2.to_reg(),
                });
            }
        }

        Opcode::IaddIfcout => {
            // This is a two-output instruction that is needed for the
            // legalizer's explicit heap-check sequence, among possible other
            // uses. Its second output is a flags output only ever meant to
            // check for overflow using the
            // `backend.unsigned_add_overflow_condition()` condition.
            //
            // Note that the CLIF validation will ensure that no flag-setting
            // operation comes between this IaddIfcout and its use (e.g., a
            // Trapif). Thus, we can rely on implicit communication through the
            // processor flags rather than explicitly generating flags into a
            // register. We simply use the variant of the add instruction that
            // sets flags (`adds`) here.

            // Note that the second output (the flags) need not be generated,
            // because flags are never materialized into a register; the only
            // instructions that can use a value of type `iflags` or `fflags`
            // will look directly for the flags-producing instruction (which can
            // always be found, by construction) and merge it.

            // Now handle the iadd as above, except use an AddS opcode that sets
            // flags.
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let rm = put_input_in_rse_imm12(ctx, inputs[1], NarrowValueMode::None);
            let ty = ty.unwrap();
            let alu_op = choose_32_64(ty, ALUOp::AddS32, ALUOp::AddS64);
            ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm));
        }

        Opcode::IaddImm
        | Opcode::ImulImm
        | Opcode::UdivImm
        | Opcode::SdivImm
        | Opcode::UremImm
        | Opcode::SremImm
        | Opcode::IrsubImm
        | Opcode::IaddCin
        | Opcode::IaddIfcin
        | Opcode::IaddCout
        | Opcode::IaddCarry
        | Opcode::IaddIfcarry
        | Opcode::IsubBin
        | Opcode::IsubIfbin
        | Opcode::IsubBout
        | Opcode::IsubIfbout
        | Opcode::IsubBorrow
        | Opcode::IsubIfborrow
        | Opcode::BandImm
        | Opcode::BorImm
        | Opcode::BxorImm
        | Opcode::RotlImm
        | Opcode::RotrImm
        | Opcode::IshlImm
        | Opcode::UshrImm
        | Opcode::SshrImm
        | Opcode::IcmpImm
        | Opcode::IfcmpImm => {
            panic!("ALU+imm and ALU+carry ops should not appear here!");
        }

        #[cfg(feature = "x86")]
        Opcode::X86Udivmodx
        | Opcode::X86Sdivmodx
        | Opcode::X86Umulx
        | Opcode::X86Smulx
        | Opcode::X86Cvtt2si
        | Opcode::X86Fmin
        | Opcode::X86Fmax
        | Opcode::X86Push
        | Opcode::X86Pop
        | Opcode::X86Bsr
        | Opcode::X86Bsf
        | Opcode::X86Pblendw
        | Opcode::X86Pshufd
        | Opcode::X86Pshufb
        | Opcode::X86Pextr
        | Opcode::X86Pinsr
        | Opcode::X86Insertps
        | Opcode::X86Movsd
        | Opcode::X86Movlhps
        | Opcode::X86Palignr
        | Opcode::X86Psll
        | Opcode::X86Psrl
        | Opcode::X86Psra
        | Opcode::X86Ptest
        | Opcode::X86Pmaxs
        | Opcode::X86Pmaxu
        | Opcode::X86Pmins
        | Opcode::X86Pminu
        | Opcode::X86Pmullq
        | Opcode::X86Pmuludq
        | Opcode::X86Punpckh
        | Opcode::X86Punpckl
        | Opcode::X86Vcvtudq2ps
        | Opcode::X86ElfTlsGetAddr
        | Opcode::X86MachoTlsGetAddr => {
            panic!("x86-specific opcode in supposedly arch-neutral IR!");
        }

        Opcode::DummySargT => unreachable!(),

        Opcode::Iabs => {
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let ty = ty.unwrap();
            ctx.emit(Inst::VecMisc {
                op: VecMisc2::Abs,
                rd,
                rn,
                size: VectorSize::from_ty(ty),
            });
        }
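        // `avg_round` maps to `urhadd`, which computes (a + b + 1) >> 1 in
        // each unsigned lane, matching the wasm rounding-average semantics.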
        Opcode::AvgRound => {
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
            let ty = ty.unwrap();
            ctx.emit(Inst::VecRRR {
                alu_op: VecALUOp::Urhadd,
                rd,
                rn,
                rm,
                size: VectorSize::from_ty(ty),
            });
        }

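        // snarrow/unarrow saturating-narrow each input vector into one half
        // of the destination: the `high_half: true` form of sqxtn/sqxtun
        // writes the upper half without disturbing the lower one.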
        Opcode::Snarrow | Opcode::Unarrow => {
            let op = if op == Opcode::Snarrow {
                VecMiscNarrowOp::Sqxtn
            } else {
                VecMiscNarrowOp::Sqxtun
            };
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let rn2 = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
            let ty = ty.unwrap();

            ctx.emit(Inst::VecMiscNarrow {
                op,
                rd,
                rn,
                size: VectorSize::from_ty(ty),
                high_half: false,
            });
            ctx.emit(Inst::VecMiscNarrow {
                op,
                rd,
                rn: rn2,
                size: VectorSize::from_ty(ty),
                high_half: true,
            });
        }

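        // The widen ops map onto sxtl/uxtl (low half) and sxtl2/uxtl2 (high
        // half); note that the table below is keyed by the *destination*
        // lane type.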
        Opcode::SwidenLow | Opcode::SwidenHigh | Opcode::UwidenLow | Opcode::UwidenHigh => {
            let lane_type = ty.unwrap().lane_type();
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let (t, high_half) = match (lane_type, op) {
                (I16, Opcode::SwidenLow) => (VecExtendOp::Sxtl8, false),
                (I16, Opcode::SwidenHigh) => (VecExtendOp::Sxtl8, true),
                (I16, Opcode::UwidenLow) => (VecExtendOp::Uxtl8, false),
                (I16, Opcode::UwidenHigh) => (VecExtendOp::Uxtl8, true),
                (I32, Opcode::SwidenLow) => (VecExtendOp::Sxtl16, false),
                (I32, Opcode::SwidenHigh) => (VecExtendOp::Sxtl16, true),
                (I32, Opcode::UwidenLow) => (VecExtendOp::Uxtl16, false),
                (I32, Opcode::UwidenHigh) => (VecExtendOp::Uxtl16, true),
                (I64, Opcode::SwidenLow) => (VecExtendOp::Sxtl32, false),
                (I64, Opcode::SwidenHigh) => (VecExtendOp::Sxtl32, true),
                (I64, Opcode::UwidenLow) => (VecExtendOp::Uxtl32, false),
                (I64, Opcode::UwidenHigh) => (VecExtendOp::Uxtl32, true),
                _ => {
                    return Err(CodegenError::Unsupported(format!(
                        "Unsupported SIMD vector lane type: {:?}",
                        lane_type
                    )));
                }
            };

            ctx.emit(Inst::VecExtend {
                t,
                rd,
                rn,
                high_half,
            });
        }

        Opcode::TlsValue => unimplemented!("tls_value"),
        Opcode::FcvtLowFromSint => unimplemented!("FcvtLowFromSint"),
        Opcode::FvpromoteLow => unimplemented!("FvpromoteLow"),
        Opcode::Fvdemote => unimplemented!("Fvdemote"),
    }

    Ok(())
}

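/// Lower the branch group at the end of a block. `branches` holds the one or
/// two CLIF branch instructions terminating the block; `targets` holds their
/// resolved `MachLabel`s (for `br_table`, the default target first, followed
/// by the jump-table entries).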
pub(crate) fn lower_branch<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    branches: &[IRInst],
    targets: &[MachLabel],
) -> CodegenResult<()> {
    // A block should end with at most two branches. The first may be a
    // conditional branch; a conditional branch can be followed only by an
    // unconditional branch or fallthrough. Otherwise, if only one branch,
    // it may be an unconditional branch, a fallthrough, a return, or a
    // trap. These conditions are verified by `is_ebb_basic()` during the
    // verifier pass.
    assert!(branches.len() <= 2);

    if branches.len() == 2 {
        // Must be a conditional branch followed by an unconditional branch.
        let op0 = ctx.data(branches[0]).opcode();
        let op1 = ctx.data(branches[1]).opcode();

        assert!(op1 == Opcode::Jump || op1 == Opcode::Fallthrough);
        let taken = BranchTarget::Label(targets[0]);
        // not_taken target is the target of the second branch, even if it is a Fallthrough
        // instruction: because we reorder blocks while we lower, the fallthrough in the new
        // order is not (necessarily) the same as the fallthrough in CLIF. So we use the
        // explicitly-provided target.
        let not_taken = BranchTarget::Label(targets[1]);

        match op0 {
            Opcode::Brz | Opcode::Brnz => {
                let flag_input = InsnInput {
                    insn: branches[0],
                    input: 0,
                };
                if let Some(icmp_insn) =
                    maybe_input_insn_via_conv(ctx, flag_input, Opcode::Icmp, Opcode::Bint)
                {
                    let condcode = ctx.data(icmp_insn).cond_code().unwrap();
                    let cond = lower_condcode(condcode);
                    let is_signed = condcode_is_signed(condcode);
                    let negated = op0 == Opcode::Brz;
                    let cond = if negated { cond.invert() } else { cond };

                    lower_icmp_or_ifcmp_to_flags(ctx, icmp_insn, is_signed);
                    ctx.emit(Inst::CondBr {
                        taken,
                        not_taken,
                        kind: CondBrKind::Cond(cond),
                    });
                } else if let Some(fcmp_insn) =
                    maybe_input_insn_via_conv(ctx, flag_input, Opcode::Fcmp, Opcode::Bint)
                {
                    let condcode = ctx.data(fcmp_insn).fp_cond_code().unwrap();
                    let cond = lower_fp_condcode(condcode);
                    let negated = op0 == Opcode::Brz;
                    let cond = if negated { cond.invert() } else { cond };

                    lower_fcmp_or_ffcmp_to_flags(ctx, fcmp_insn);
                    ctx.emit(Inst::CondBr {
                        taken,
                        not_taken,
                        kind: CondBrKind::Cond(cond),
                    });
                } else {
                    let rt = put_input_in_reg(
                        ctx,
                        InsnInput {
                            insn: branches[0],
                            input: 0,
                        },
                        NarrowValueMode::ZeroExtend64,
                    );
                    let kind = match op0 {
                        Opcode::Brz => CondBrKind::Zero(rt),
                        Opcode::Brnz => CondBrKind::NotZero(rt),
                        _ => unreachable!(),
                    };
                    ctx.emit(Inst::CondBr {
                        taken,
                        not_taken,
                        kind,
                    });
                }
            }
            Opcode::BrIcmp => {
                let condcode = ctx.data(branches[0]).cond_code().unwrap();
                let cond = lower_condcode(condcode);
                let kind = CondBrKind::Cond(cond);

                let is_signed = condcode_is_signed(condcode);
                let ty = ctx.input_ty(branches[0], 0);
                let bits = ty_bits(ty);
                let narrow_mode = match (bits <= 32, is_signed) {
                    (true, true) => NarrowValueMode::SignExtend32,
                    (true, false) => NarrowValueMode::ZeroExtend32,
                    (false, true) => NarrowValueMode::SignExtend64,
                    (false, false) => NarrowValueMode::ZeroExtend64,
                };
                let rn = put_input_in_reg(
                    ctx,
                    InsnInput {
                        insn: branches[0],
                        input: 0,
                    },
                    narrow_mode,
                );
                let rm = put_input_in_rse_imm12(
                    ctx,
                    InsnInput {
                        insn: branches[0],
                        input: 1,
                    },
                    narrow_mode,
                );

                let alu_op = choose_32_64(ty, ALUOp::SubS32, ALUOp::SubS64);
                let rd = writable_zero_reg();
                ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm));
                ctx.emit(Inst::CondBr {
                    taken,
                    not_taken,
                    kind,
                });
            }

            Opcode::Brif => {
                let condcode = ctx.data(branches[0]).cond_code().unwrap();
                let cond = lower_condcode(condcode);
                let kind = CondBrKind::Cond(cond);

                let is_signed = condcode_is_signed(condcode);
                let flag_input = InsnInput {
                    insn: branches[0],
                    input: 0,
                };
                if let Some(ifcmp_insn) = maybe_input_insn(ctx, flag_input, Opcode::Ifcmp) {
                    lower_icmp_or_ifcmp_to_flags(ctx, ifcmp_insn, is_signed);
                    ctx.emit(Inst::CondBr {
                        taken,
                        not_taken,
                        kind,
                    });
                } else {
                    // If the ifcmp result is actually placed in a
                    // register, we need to move it back into the flags.
                    let rn = put_input_in_reg(ctx, flag_input, NarrowValueMode::None);
                    ctx.emit(Inst::MovToNZCV { rn });
                    ctx.emit(Inst::CondBr {
                        taken,
                        not_taken,
                        kind,
                    });
                }
            }

            Opcode::Brff => {
                let condcode = ctx.data(branches[0]).fp_cond_code().unwrap();
                let cond = lower_fp_condcode(condcode);
                let kind = CondBrKind::Cond(cond);
                let flag_input = InsnInput {
                    insn: branches[0],
                    input: 0,
                };
                if let Some(ffcmp_insn) = maybe_input_insn(ctx, flag_input, Opcode::Ffcmp) {
                    lower_fcmp_or_ffcmp_to_flags(ctx, ffcmp_insn);
                    ctx.emit(Inst::CondBr {
                        taken,
                        not_taken,
                        kind,
                    });
                } else {
                    // If the ffcmp result is actually placed in a
                    // register, we need to move it back into the flags.
                    let rn = put_input_in_reg(ctx, flag_input, NarrowValueMode::None);
                    ctx.emit(Inst::MovToNZCV { rn });
                    ctx.emit(Inst::CondBr {
                        taken,
                        not_taken,
                        kind,
                    });
                }
            }

            _ => unimplemented!(),
        }
    } else {
        // Must be an unconditional branch or an indirect branch.
        let op = ctx.data(branches[0]).opcode();
        match op {
            Opcode::Jump | Opcode::Fallthrough => {
                assert!(branches.len() == 1);
                // In the Fallthrough case, the machine-independent driver
                // fills in `targets[0]` with our fallthrough block, so this
                // is valid for both Jump and Fallthrough.
                ctx.emit(Inst::Jump {
                    dest: BranchTarget::Label(targets[0]),
                });
            }

            Opcode::BrTable => {
                // Expand `br_table index, default, JT` to:
                //
                // emit_island  // this forces an island at this point
                //              // if the jumptable would push us past
                //              // the deadline
                // subs idx, #jt_size
                // b.hs default
                // adr vTmp1, PC+16
                // ldr vTmp2, [vTmp1, idx, lsl #2]
                // add vTmp2, vTmp2, vTmp1
                // br vTmp2
                // [jumptable offsets relative to JT base]
                let jt_size = targets.len() - 1;
                assert!(jt_size <= std::u32::MAX as usize);

                ctx.emit(Inst::EmitIsland {
                    needed_space: 4 * (6 + jt_size) as CodeOffset,
                });

                let ridx = put_input_in_reg(
                    ctx,
                    InsnInput {
                        insn: branches[0],
                        input: 0,
                    },
                    NarrowValueMode::ZeroExtend32,
                );

                let rtmp1 = ctx.alloc_tmp(I32).only_reg().unwrap();
                let rtmp2 = ctx.alloc_tmp(I32).only_reg().unwrap();

                // Bounds-check, leaving condition codes for JTSequence's
                // branch to default target below.
                if let Some(imm12) = Imm12::maybe_from_u64(jt_size as u64) {
                    ctx.emit(Inst::AluRRImm12 {
                        alu_op: ALUOp::SubS32,
                        rd: writable_zero_reg(),
                        rn: ridx,
                        imm12,
                    });
                } else {
                    lower_constant_u64(ctx, rtmp1, jt_size as u64);
                    ctx.emit(Inst::AluRRR {
                        alu_op: ALUOp::SubS32,
                        rd: writable_zero_reg(),
                        rn: ridx,
                        rm: rtmp1.to_reg(),
                    });
                }

                // Emit the compound instruction that does:
                //
                // b.hs default
                // adr rA, jt
                // ldrsw rB, [rA, rIndex, UXTW 2]
                // add rA, rA, rB
                // br rA
                // [jt entries]
                //
                // This must be *one* instruction in the vcode because
                // we cannot allow regalloc to insert any spills/fills
                // in the middle of the sequence; otherwise, the ADR's
                // PC-rel offset to the jumptable would be incorrect.
                // (The alternative is to introduce a relocation pass
                // for inlined jumptables, which is much worse, IMHO.)

                let jt_targets: Vec<BranchTarget> = targets
                    .iter()
                    .skip(1)
                    .map(|bix| BranchTarget::Label(*bix))
                    .collect();
                let default_target = BranchTarget::Label(targets[0]);
                let targets_for_term: Vec<MachLabel> = targets.to_vec();
                ctx.emit(Inst::JTSequence {
                    ridx,
                    rtmp1,
                    rtmp2,
                    info: Box::new(JTSequenceInfo {
                        targets: jt_targets,
                        default_target,
                        targets_for_term,
                    }),
                });
            }

            _ => panic!("Unknown branch type!"),
        }
    }

    Ok(())
}