machinst x64: revamp integer immediate emission;

In particular:

- try to optimize integer immediate emission into a 32-bit emission when
the high bits are all zero, and stop relying on the caller of `imm_r` to
ensure this.
- rename `Inst::imm_r`/`Inst::Imm_R` to `Inst::imm`/`Inst::Imm`.
- generate a sign-extending mov of a 32-bit immediate into a 64-bit
register whenever possible.
- fix a few places where the previous commit introduced the generation
of zero constants with xor when calling `put_input_in_reg`, thus
clobbering the flags before they were read.
This commit is contained in:
Benjamin Bouvier
2020-09-08 18:23:11 +02:00
parent d9052d0a9c
commit 3849dc18b1
5 changed files with 229 additions and 172 deletions

View File

@@ -29,7 +29,7 @@ type Ctx<'a> = &'a mut dyn LowerCtx<I = Inst>;
//=============================================================================
// Helpers for instruction lowering.
fn is_int_ty(ty: Type) -> bool {
fn is_int_or_ref_ty(ty: Type) -> bool {
match ty {
types::I8 | types::I16 | types::I32 | types::I64 | types::R64 => true,
types::R32 => panic!("shouldn't have 32-bits refs on x64"),
@@ -207,6 +207,10 @@ fn input_to_reg_mem_imm(ctx: Ctx, spec: InsnInput) -> RegMemImm {
}
}
/// Emits an int comparison instruction.
///
/// Note: make sure that there are no instructions modifying the flags between a call to this
/// function and the use of the flags!
fn emit_cmp(ctx: Ctx, insn: IRInst) {
let ty = ctx.input_ty(insn, 0);
@@ -253,6 +257,10 @@ enum FcmpCondResult {
InvertedEqualOrConditions(CC, CC),
}
/// Emits a float comparison instruction.
///
/// Note: make sure that there are no instructions modifying the flags between a call to this
/// function and the use of the flags!
fn emit_fcmp(ctx: Ctx, insn: IRInst, mut cond_code: FloatCC, spec: FcmpSpec) -> FcmpCondResult {
let (flip_operands, inverted_equal) = match cond_code {
FloatCC::LessThan
@@ -485,12 +493,15 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
match op {
Opcode::Iconst | Opcode::Bconst | Opcode::Null => {
let w64 = ctx
let value = ctx
.get_constant(insn)
.expect("constant value for iconst et al");
let dst_is_64 = w64 > 0x7fffffff;
let dst = get_output_reg(ctx, outputs[0]);
ctx.emit(Inst::imm_r(dst_is_64, w64, dst));
for inst in Inst::gen_constant(dst, value, ty.unwrap(), |reg_class, ty| {
ctx.alloc_tmp(reg_class, ty)
}) {
ctx.emit(inst);
}
}
Opcode::Iadd
@@ -704,7 +715,11 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
let dst = get_output_reg(ctx, outputs[0]);
let tmp = ctx.alloc_tmp(RegClass::I64, ty);
ctx.emit(Inst::imm_r(ty == types::I64, u64::max_value(), dst));
ctx.emit(Inst::imm(
OperandSize::from_bytes(ty.bytes()),
u64::max_value(),
dst,
));
ctx.emit(Inst::unary_rm_r(
ty.bytes() as u8,
@@ -720,7 +735,11 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
tmp,
));
ctx.emit(Inst::imm_r(ty == types::I64, ty.bits() as u64 - 1, dst));
ctx.emit(Inst::imm(
OperandSize::from_bytes(ty.bytes()),
ty.bits() as u64 - 1,
dst,
));
ctx.emit(Inst::alu_rmi_r(
ty == types::I64,
@@ -745,7 +764,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
let dst = get_output_reg(ctx, outputs[0]);
let tmp = ctx.alloc_tmp(RegClass::I64, ty);
ctx.emit(Inst::imm_r(false /* 64 bits */, ty.bits() as u64, tmp));
ctx.emit(Inst::imm(OperandSize::Size32, ty.bits() as u64, tmp));
ctx.emit(Inst::unary_rm_r(
ty.bytes() as u8,
@@ -797,7 +816,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
));
// mov 0x7777_7777_7777_7777, cst
ctx.emit(Inst::imm_r(is_64, 0x7777777777777777, cst));
ctx.emit(Inst::imm(OperandSize::Size64, 0x7777777777777777, cst));
// andq cst, tmp1
ctx.emit(Inst::alu_rmi_r(
@@ -881,7 +900,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
));
// mov $0x0F0F_0F0F_0F0F_0F0F, cst
ctx.emit(Inst::imm_r(is_64, 0x0F0F0F0F0F0F0F0F, cst));
ctx.emit(Inst::imm(OperandSize::Size64, 0x0F0F0F0F0F0F0F0F, cst));
// and cst, dst
ctx.emit(Inst::alu_rmi_r(
@@ -892,7 +911,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
));
// mov $0x0101_0101_0101_0101, cst
ctx.emit(Inst::imm_r(is_64, 0x0101010101010101, cst));
ctx.emit(Inst::imm(OperandSize::Size64, 0x0101010101010101, cst));
// mul cst, dst
ctx.emit(Inst::alu_rmi_r(
@@ -2160,23 +2179,17 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
if let Some(fcmp) = matches_input(ctx, flag_input, Opcode::Fcmp) {
let cond_code = ctx.data(fcmp).fp_cond_code().unwrap();
// we request inversion of Equal to NotEqual here: taking LHS if equal would mean
// take it if both CC::NP and CC::Z are set, the conjunction of which can't be
// modeled with a single cmov instruction. Instead, we'll swap LHS and RHS in the
// select operation, and invert the equal to a not-equal here.
let fcmp_results = emit_fcmp(ctx, fcmp, cond_code, FcmpSpec::InvertEqual);
let (lhs_input, rhs_input) = match fcmp_results {
FcmpCondResult::InvertedEqualOrConditions(_, _) => (inputs[2], inputs[1]),
FcmpCondResult::Condition(_)
| FcmpCondResult::AndConditions(_, _)
| FcmpCondResult::OrConditions(_, _) => (inputs[1], inputs[2]),
// For equal, we flip the operands, because we can't test a conjunction of
// CPU flags with a single cmove; see InvertedEqualOrConditions doc comment.
let (lhs_input, rhs_input) = match cond_code {
FloatCC::Equal => (inputs[2], inputs[1]),
_ => (inputs[1], inputs[2]),
};
let ty = ctx.output_ty(insn, 0);
let rhs = put_input_in_reg(ctx, rhs_input);
let dst = get_output_reg(ctx, outputs[0]);
let lhs = if is_int_ty(ty) && ty.bytes() < 4 {
let lhs = if is_int_or_ref_ty(ty) && ty.bytes() < 4 {
// Special case: since the higher bits are undefined per CLIF semantics, we
// can just apply a 32-bit cmove here. Force inputs into registers, to
// avoid partial spilling out-of-bounds with memory accesses, though.
@@ -2186,11 +2199,22 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
input_to_reg_mem(ctx, lhs_input)
};
// We request inversion of Equal to NotEqual here: taking LHS if equal would mean
// take it if both CC::NP and CC::Z are set, the conjunction of which can't be
// modeled with a single cmov instruction. Instead, we'll swap LHS and RHS in the
// select operation, and invert the equal to a not-equal here.
let fcmp_results = emit_fcmp(ctx, fcmp, cond_code, FcmpSpec::InvertEqual);
if let FcmpCondResult::InvertedEqualOrConditions(_, _) = &fcmp_results {
// Keep this sync'd with the lowering of the select inputs above.
assert_eq!(cond_code, FloatCC::Equal);
}
ctx.emit(Inst::gen_move(dst, rhs, ty));
match fcmp_results {
FcmpCondResult::Condition(cc) => {
if is_int_ty(ty) {
if is_int_or_ref_ty(ty) {
let size = u8::max(ty.bytes() as u8, 4);
ctx.emit(Inst::cmove(size, cc, lhs, dst));
} else {
@@ -2204,7 +2228,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
}
FcmpCondResult::InvertedEqualOrConditions(cc1, cc2)
| FcmpCondResult::OrConditions(cc1, cc2) => {
if is_int_ty(ty) {
if is_int_or_ref_ty(ty) {
let size = u8::max(ty.bytes() as u8, 4);
ctx.emit(Inst::cmove(size, cc1, lhs.clone(), dst));
ctx.emit(Inst::cmove(size, cc2, lhs, dst));
@@ -2215,27 +2239,11 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
}
}
} else {
let cc = if let Some(icmp) = matches_input(ctx, flag_input, Opcode::Icmp) {
emit_cmp(ctx, icmp);
let cond_code = ctx.data(icmp).cond_code().unwrap();
CC::from_intcc(cond_code)
} else {
// The input is a boolean value, compare it against zero.
let size = ctx.input_ty(insn, 0).bytes() as u8;
let test = put_input_in_reg(ctx, inputs[0]);
ctx.emit(Inst::cmp_rmi_r(size, RegMemImm::imm(0), test));
CC::NZ
};
let ty = ty.unwrap();
let rhs = put_input_in_reg(ctx, inputs[2]);
let dst = get_output_reg(ctx, outputs[0]);
let ty = ctx.output_ty(insn, 0);
ctx.emit(Inst::gen_move(dst, rhs, ty));
if is_int_ty(ty) {
let mut size = ty.bytes() as u8;
let lhs = if size < 4 {
let mut size = ty.bytes() as u8;
let lhs = if is_int_or_ref_ty(ty) {
if size < 4 {
// Special case: since the higher bits are undefined per CLIF semantics, we
// can just apply a 32-bit cmove here. Force inputs into registers, to
// avoid partial spilling out-of-bounds with memory accesses, though.
@@ -2243,17 +2251,44 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
RegMem::reg(put_input_in_reg(ctx, inputs[1]))
} else {
input_to_reg_mem(ctx, inputs[1])
};
}
} else {
input_to_reg_mem(ctx, inputs[1])
};
let rhs = put_input_in_reg(ctx, inputs[2]);
let dst = get_output_reg(ctx, outputs[0]);
let cc = if let Some(icmp) = matches_input(ctx, flag_input, Opcode::Icmp) {
emit_cmp(ctx, icmp);
let cond_code = ctx.data(icmp).cond_code().unwrap();
CC::from_intcc(cond_code)
} else {
// The input is a boolean value, compare it against zero.
let size = ctx.input_ty(insn, 0).bytes() as u8;
let test = put_input_in_reg(ctx, flag_input);
ctx.emit(Inst::cmp_rmi_r(size, RegMemImm::imm(0), test));
CC::NZ
};
// This doesn't affect the flags.
ctx.emit(Inst::gen_move(dst, rhs, ty));
if is_int_or_ref_ty(ty) {
ctx.emit(Inst::cmove(size, cc, lhs, dst));
} else {
debug_assert!(ty == types::F32 || ty == types::F64);
let lhs = input_to_reg_mem(ctx, inputs[1]);
ctx.emit(Inst::xmm_cmove(ty == types::F64, cc, lhs, dst));
}
}
}
Opcode::Selectif | Opcode::SelectifSpectreGuard => {
let lhs = input_to_reg_mem(ctx, inputs[1]);
let rhs = put_input_in_reg(ctx, inputs[2]);
let dst = get_output_reg(ctx, outputs[0]);
let ty = ctx.output_ty(insn, 0);
// Verification ensures that the input is always a single-def ifcmp.
let cmp_insn = ctx
.get_input(inputs[0].insn, inputs[0].input)
@@ -2265,13 +2300,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
let cc = CC::from_intcc(ctx.data(insn).cond_code().unwrap());
let lhs = input_to_reg_mem(ctx, inputs[1]);
let rhs = put_input_in_reg(ctx, inputs[2]);
let dst = get_output_reg(ctx, outputs[0]);
let ty = ctx.output_ty(insn, 0);
if is_int_ty(ty) {
if is_int_or_ref_ty(ty) {
let size = ty.bytes() as u8;
if size == 1 {
// Sign-extend operands to 32, then do a cmove of size 4.
@@ -2331,7 +2360,12 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
} else {
None
};
ctx.emit(Inst::imm_r(true, 0, Writable::from_reg(regs::rdx())));
// TODO use xor
ctx.emit(Inst::imm(
OperandSize::Size32,
0,
Writable::from_reg(regs::rdx()),
));
ctx.emit(Inst::checked_div_or_rem_seq(
kind,
size,
@@ -2343,30 +2377,24 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
let divisor = input_to_reg_mem(ctx, inputs[1]);
// Fill in the high parts:
if input_ty == types::I8 {
if kind.is_signed() {
// sign-extend the sign-bit of al into ah, for signed opcodes.
ctx.emit(Inst::sign_extend_data(1));
} else {
ctx.emit(Inst::movzx_rm_r(
ExtMode::BL,
RegMem::reg(regs::rax()),
Writable::from_reg(regs::rax()),
/* infallible */ None,
));
}
if kind.is_signed() {
// sign-extend the sign-bit of al into ah for size 1, or rax into rdx, for
// signed opcodes.
ctx.emit(Inst::sign_extend_data(size));
} else if input_ty == types::I8 {
ctx.emit(Inst::movzx_rm_r(
ExtMode::BL,
RegMem::reg(regs::rax()),
Writable::from_reg(regs::rax()),
/* infallible */ None,
));
} else {
if kind.is_signed() {
// sign-extend the sign-bit of rax into rdx, for signed opcodes.
ctx.emit(Inst::sign_extend_data(size));
} else {
// zero for unsigned opcodes.
ctx.emit(Inst::imm_r(
true, /* is_64 */
0,
Writable::from_reg(regs::rdx()),
));
}
// zero for unsigned opcodes.
ctx.emit(Inst::imm(
OperandSize::Size64,
0,
Writable::from_reg(regs::rdx()),
));
}
// Emit the actual idiv.
@@ -2565,7 +2593,7 @@ impl LowerBackend for X64Backend {
}
FcmpCondResult::InvertedEqualOrConditions(_, _) => unreachable!(),
}
} else if is_int_ty(src_ty) || is_bool_ty(src_ty) {
} else if is_int_or_ref_ty(src_ty) || is_bool_ty(src_ty) {
let src = put_input_in_reg(
ctx,
InsnInput {
@@ -2588,7 +2616,7 @@ impl LowerBackend for X64Backend {
Opcode::BrIcmp => {
let src_ty = ctx.input_ty(branches[0], 0);
if is_int_ty(src_ty) || is_bool_ty(src_ty) {
if is_int_or_ref_ty(src_ty) || is_bool_ty(src_ty) {
let lhs = put_input_in_reg(
ctx,
InsnInput {