machinst x64: add more FP support

This commit is contained in:
Benjamin Bouvier
2020-07-09 18:40:42 +02:00
parent 5c5a30f76c
commit ead8a835c4
10 changed files with 636 additions and 144 deletions

View File

@@ -792,29 +792,111 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
ctx.emit(Inst::Ud2 { trap_info })
}
Opcode::F64const => {
// TODO use xorpd for 0
let value = ctx.get_constant(insn).unwrap();
let dst = output_to_reg(ctx, outputs[0]);
for inst in Inst::gen_constant(dst, value, F64, |reg_class, ty| {
ctx.alloc_tmp(reg_class, ty)
}) {
ctx.emit(inst);
}
}
Opcode::F32const => {
// TODO use xorps for 0.
let value = ctx.get_constant(insn).unwrap();
let dst = output_to_reg(ctx, outputs[0]);
for inst in Inst::gen_constant(dst, value, F32, |reg_class, ty| {
ctx.alloc_tmp(reg_class, ty)
}) {
ctx.emit(inst);
}
}
Opcode::Fadd | Opcode::Fsub | Opcode::Fmul | Opcode::Fdiv => {
let lhs = input_to_reg(ctx, inputs[0]);
let lhs = input_to_reg_mem(ctx, inputs[0]);
let rhs = input_to_reg(ctx, inputs[1]);
let dst = output_to_reg(ctx, outputs[0]);
// Note: min and max can't be handled here, because of the way Cranelift defines them:
// if any operand is a NaN, they must return the NaN operand, while the x86 machine
// instruction will return the other operand.
let (f32_op, f64_op) = match op {
Opcode::Fadd => (SseOpcode::Addss, SseOpcode::Addsd),
Opcode::Fsub => (SseOpcode::Subss, SseOpcode::Subsd),
Opcode::Fmul => (SseOpcode::Mulss, SseOpcode::Mulsd),
Opcode::Fdiv => (SseOpcode::Divss, SseOpcode::Divsd),
_ => unreachable!(),
};
let is_64 = flt_ty_is_64(ty.unwrap());
if !is_64 {
let sse_op = match op {
Opcode::Fadd => SseOpcode::Addss,
Opcode::Fsub => SseOpcode::Subss,
Opcode::Fmul => SseOpcode::Mulss,
Opcode::Fdiv => SseOpcode::Divss,
// TODO Fmax, Fmin.
_ => unimplemented!(),
};
ctx.emit(Inst::xmm_mov_rm_r(
SseOpcode::Movss,
RegMem::reg(lhs),
dst,
None,
));
ctx.emit(Inst::xmm_rm_r(sse_op, RegMem::reg(rhs), dst));
let mov_op = if is_64 {
SseOpcode::Movsd
} else {
unimplemented!("unimplemented lowering for opcode {:?}", op);
SseOpcode::Movss
};
ctx.emit(Inst::xmm_mov(mov_op, lhs, dst, None));
let sse_op = if is_64 { f64_op } else { f32_op };
ctx.emit(Inst::xmm_rm_r(sse_op, RegMem::reg(rhs), dst));
}
Opcode::Sqrt => {
let src = input_to_reg_mem(ctx, inputs[0]);
let dst = output_to_reg(ctx, outputs[0]);
let (f32_op, f64_op) = match op {
Opcode::Sqrt => (SseOpcode::Sqrtss, SseOpcode::Sqrtsd),
_ => unreachable!(),
};
let sse_op = if flt_ty_is_64(ty.unwrap()) {
f64_op
} else {
f32_op
};
ctx.emit(Inst::xmm_unary_rm_r(sse_op, src, dst));
}
Opcode::Fpromote => {
let src = input_to_reg_mem(ctx, inputs[0]);
let dst = output_to_reg(ctx, outputs[0]);
ctx.emit(Inst::xmm_unary_rm_r(SseOpcode::Cvtss2sd, src, dst));
}
Opcode::Fdemote => {
let src = input_to_reg_mem(ctx, inputs[0]);
let dst = output_to_reg(ctx, outputs[0]);
ctx.emit(Inst::xmm_unary_rm_r(SseOpcode::Cvtsd2ss, src, dst));
}
Opcode::Bitcast => {
let input_ty = ctx.input_ty(insn, 0);
let output_ty = ctx.output_ty(insn, 0);
match (input_ty, output_ty) {
(F32, I32) => {
let src = input_to_reg(ctx, inputs[0]);
let dst = output_to_reg(ctx, outputs[0]);
ctx.emit(Inst::xmm_to_gpr(SseOpcode::Movd, src, dst));
}
(I32, F32) => {
let src = input_to_reg_mem(ctx, inputs[0]);
let dst = output_to_reg(ctx, outputs[0]);
ctx.emit(Inst::gpr_to_xmm(SseOpcode::Movd, src, dst));
}
(F64, I64) => {
let src = input_to_reg(ctx, inputs[0]);
let dst = output_to_reg(ctx, outputs[0]);
ctx.emit(Inst::xmm_to_gpr(SseOpcode::Movq, src, dst));
}
(I64, F64) => {
let src = input_to_reg_mem(ctx, inputs[0]);
let dst = output_to_reg(ctx, outputs[0]);
ctx.emit(Inst::gpr_to_xmm(SseOpcode::Movq, src, dst));
}
_ => unreachable!("invalid bitcast from {:?} to {:?}", input_ty, output_ty),
}
}
@@ -834,20 +916,19 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
let tmp_xmm1 = ctx.alloc_tmp(RegClass::V128, F32);
let tmp_xmm2 = ctx.alloc_tmp(RegClass::V128, F32);
ctx.emit(Inst::imm_r(true, 0x8000_0000, tmp_gpr1));
ctx.emit(Inst::xmm_mov_rm_r(
ctx.emit(Inst::gpr_to_xmm(
SseOpcode::Movd,
RegMem::reg(tmp_gpr1.to_reg()),
tmp_xmm1,
None,
));
ctx.emit(Inst::xmm_mov_rm_r(
ctx.emit(Inst::xmm_mov(
SseOpcode::Movaps,
RegMem::reg(tmp_xmm1.to_reg()),
dst,
None,
));
ctx.emit(Inst::xmm_rm_r(SseOpcode::Andnps, RegMem::reg(lhs), dst));
ctx.emit(Inst::xmm_mov_rm_r(
ctx.emit(Inst::xmm_mov(
SseOpcode::Movss,
RegMem::reg(rhs),
tmp_xmm2,
@@ -982,8 +1063,9 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
}
(_, true) => {
ctx.emit(match elem_ty {
F32 => Inst::xmm_mov_rm_r(SseOpcode::Movss, RegMem::mem(addr), dst, srcloc),
_ => unimplemented!("FP load not 32-bit"),
F32 => Inst::xmm_mov(SseOpcode::Movss, RegMem::mem(addr), dst, srcloc),
F64 => Inst::xmm_mov(SseOpcode::Movsd, RegMem::mem(addr), dst, srcloc),
_ => unreachable!("unexpected type for load: {:?}", elem_ty),
});
}
}
@@ -1025,7 +1107,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
| Opcode::Istore32Complex => {
assert!(
inputs.len() == 3,
"can't handle more than two inputs in complex load"
"can't handle more than two inputs in complex store"
);
let base = input_to_reg(ctx, inputs[1]);
let index = input_to_reg(ctx, inputs[2]);
@@ -1043,7 +1125,8 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
if is_float {
ctx.emit(match elem_ty {
F32 => Inst::xmm_mov_r_m(SseOpcode::Movss, src, addr, srcloc),
_ => unimplemented!("FP store not 32-bit"),
F64 => Inst::xmm_mov_r_m(SseOpcode::Movsd, src, addr, srcloc),
_ => panic!("unexpected type for store {:?}", elem_ty),
});
} else {
ctx.emit(Inst::mov_r_m(elem_ty.bytes() as u8, src, addr, srcloc));
@@ -1119,18 +1202,23 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
let dst = output_to_reg(ctx, outputs[0]);
let ty = ctx.output_ty(insn, 0);
assert!(is_int_ty(ty), "float cmov NYI");
let size = ty.bytes() as u8;
if size == 1 {
// Sign-extend operands to 32, then do a cmove of size 4.
let lhs_se = ctx.alloc_tmp(RegClass::I64, I32);
ctx.emit(Inst::movsx_rm_r(ExtMode::BL, lhs, lhs_se, None));
ctx.emit(Inst::movsx_rm_r(ExtMode::BL, RegMem::reg(rhs), dst, None));
ctx.emit(Inst::cmove(4, cc, RegMem::reg(lhs_se.to_reg()), dst));
if ty.is_int() {
let size = ty.bytes() as u8;
if size == 1 {
// Sign-extend operands to 32, then do a cmove of size 4.
let lhs_se = ctx.alloc_tmp(RegClass::I64, I32);
ctx.emit(Inst::movsx_rm_r(ExtMode::BL, lhs, lhs_se, None));
ctx.emit(Inst::movsx_rm_r(ExtMode::BL, RegMem::reg(rhs), dst, None));
ctx.emit(Inst::cmove(4, cc, RegMem::reg(lhs_se.to_reg()), dst));
} else {
ctx.emit(Inst::gen_move(dst, rhs, ty));
ctx.emit(Inst::cmove(size, cc, lhs, dst));
}
} else {
debug_assert!(ty == F32 || ty == F64);
ctx.emit(Inst::gen_move(dst, rhs, ty));
ctx.emit(Inst::cmove(size, cc, lhs, dst));
ctx.emit(Inst::xmm_cmove(ty == F64, cc, lhs, dst));
}
}