As explained in the added documentation and #1342, if we prevent `infer_rex()` and `w()` from being used together then we don't need to check whether the W bit is set when calculating the size of a recipe. This should improve compile time for x86 very slightly since all `infer_rex()` instructions will no longer need this check.
1357 lines
48 KiB
Rust
1357 lines
48 KiB
Rust
//! Encoding tables for x86 ISAs.
|
|
|
|
use super::registers::*;
|
|
use crate::bitset::BitSet;
|
|
use crate::cursor::{Cursor, FuncCursor};
|
|
use crate::flowgraph::ControlFlowGraph;
|
|
use crate::ir::condcodes::{FloatCC, IntCC};
|
|
use crate::ir::types::*;
|
|
use crate::ir::{self, Function, Inst, InstBuilder};
|
|
use crate::isa::constraints::*;
|
|
use crate::isa::enc_tables::*;
|
|
use crate::isa::encoding::base_size;
|
|
use crate::isa::encoding::{Encoding, RecipeSizing};
|
|
use crate::isa::RegUnit;
|
|
use crate::isa::{self, TargetIsa};
|
|
use crate::predicates;
|
|
use crate::regalloc::RegDiversions;
|
|
|
|
include!(concat!(env!("OUT_DIR"), "/encoding-x86.rs"));
|
|
include!(concat!(env!("OUT_DIR"), "/legalize-x86.rs"));
|
|
|
|
/// Whether the REX prefix is needed for encoding extended registers (via REX.RXB).
|
|
///
|
|
/// Normal x86 instructions have only 3 bits for encoding a register.
|
|
/// The REX prefix adds REX.R, REX,X, and REX.B bits, interpreted as fourth bits.
|
|
pub fn is_extended_reg(reg: RegUnit) -> bool {
|
|
// Extended registers have the fourth bit set.
|
|
reg as u8 & 0b1000 != 0
|
|
}
|
|
|
|
pub fn needs_sib_byte(reg: RegUnit) -> bool {
|
|
reg == RU::r12 as RegUnit || reg == RU::rsp as RegUnit
|
|
}
|
|
pub fn needs_offset(reg: RegUnit) -> bool {
|
|
reg == RU::r13 as RegUnit || reg == RU::rbp as RegUnit
|
|
}
|
|
pub fn needs_sib_byte_or_offset(reg: RegUnit) -> bool {
|
|
needs_sib_byte(reg) || needs_offset(reg)
|
|
}
|
|
|
|
fn test_input(
|
|
op_index: usize,
|
|
inst: Inst,
|
|
divert: &RegDiversions,
|
|
func: &Function,
|
|
condition_func: fn(RegUnit) -> bool,
|
|
) -> bool {
|
|
let in_reg = divert.reg(func.dfg.inst_args(inst)[op_index], &func.locations);
|
|
condition_func(in_reg)
|
|
}
|
|
|
|
fn test_result(
|
|
result_index: usize,
|
|
inst: Inst,
|
|
divert: &RegDiversions,
|
|
func: &Function,
|
|
condition_func: fn(RegUnit) -> bool,
|
|
) -> bool {
|
|
let out_reg = divert.reg(func.dfg.inst_results(inst)[result_index], &func.locations);
|
|
condition_func(out_reg)
|
|
}
|
|
|
|
fn size_plus_maybe_offset_for_inreg_0(
|
|
sizing: &RecipeSizing,
|
|
_enc: Encoding,
|
|
inst: Inst,
|
|
divert: &RegDiversions,
|
|
func: &Function,
|
|
) -> u8 {
|
|
let needs_offset = test_input(0, inst, divert, func, needs_offset);
|
|
sizing.base_size + if needs_offset { 1 } else { 0 }
|
|
}
|
|
fn size_plus_maybe_offset_for_inreg_1(
|
|
sizing: &RecipeSizing,
|
|
_enc: Encoding,
|
|
inst: Inst,
|
|
divert: &RegDiversions,
|
|
func: &Function,
|
|
) -> u8 {
|
|
let needs_offset = test_input(1, inst, divert, func, needs_offset);
|
|
sizing.base_size + if needs_offset { 1 } else { 0 }
|
|
}
|
|
fn size_plus_maybe_sib_for_inreg_0(
|
|
sizing: &RecipeSizing,
|
|
_enc: Encoding,
|
|
inst: Inst,
|
|
divert: &RegDiversions,
|
|
func: &Function,
|
|
) -> u8 {
|
|
let needs_sib = test_input(0, inst, divert, func, needs_sib_byte);
|
|
sizing.base_size + if needs_sib { 1 } else { 0 }
|
|
}
|
|
fn size_plus_maybe_sib_for_inreg_1(
|
|
sizing: &RecipeSizing,
|
|
_enc: Encoding,
|
|
inst: Inst,
|
|
divert: &RegDiversions,
|
|
func: &Function,
|
|
) -> u8 {
|
|
let needs_sib = test_input(1, inst, divert, func, needs_sib_byte);
|
|
sizing.base_size + if needs_sib { 1 } else { 0 }
|
|
}
|
|
fn size_plus_maybe_sib_or_offset_for_inreg_0(
|
|
sizing: &RecipeSizing,
|
|
_enc: Encoding,
|
|
inst: Inst,
|
|
divert: &RegDiversions,
|
|
func: &Function,
|
|
) -> u8 {
|
|
let needs_sib_or_offset = test_input(0, inst, divert, func, needs_sib_byte_or_offset);
|
|
sizing.base_size + if needs_sib_or_offset { 1 } else { 0 }
|
|
}
|
|
fn size_plus_maybe_sib_or_offset_for_inreg_1(
|
|
sizing: &RecipeSizing,
|
|
_enc: Encoding,
|
|
inst: Inst,
|
|
divert: &RegDiversions,
|
|
func: &Function,
|
|
) -> u8 {
|
|
let needs_sib_or_offset = test_input(1, inst, divert, func, needs_sib_byte_or_offset);
|
|
sizing.base_size + if needs_sib_or_offset { 1 } else { 0 }
|
|
}
|
|
|
|
/// Calculates the size while inferring if the first and second input registers (inreg0, inreg1)
|
|
/// require a dynamic REX prefix and if the second input register (inreg1) requires a SIB or offset.
|
|
fn size_plus_maybe_sib_or_offset_inreg1_plus_rex_prefix_for_inreg0_inreg1(
|
|
sizing: &RecipeSizing,
|
|
enc: Encoding,
|
|
inst: Inst,
|
|
divert: &RegDiversions,
|
|
func: &Function,
|
|
) -> u8 {
|
|
// No need to check for REX.W in `needs_rex` because `infer_rex().w()` is not allowed.
|
|
let needs_rex = test_input(0, inst, divert, func, is_extended_reg)
|
|
|| test_input(1, inst, divert, func, is_extended_reg);
|
|
size_plus_maybe_sib_or_offset_for_inreg_1(sizing, enc, inst, divert, func)
|
|
+ if needs_rex { 1 } else { 0 }
|
|
}
|
|
|
|
/// Calculates the size while inferring if the first and second input registers (inreg0, inreg1)
|
|
/// require a dynamic REX prefix and if the second input register (inreg1) requires a SIB.
|
|
fn size_plus_maybe_sib_inreg1_plus_rex_prefix_for_inreg0_inreg1(
|
|
sizing: &RecipeSizing,
|
|
enc: Encoding,
|
|
inst: Inst,
|
|
divert: &RegDiversions,
|
|
func: &Function,
|
|
) -> u8 {
|
|
// No need to check for REX.W in `needs_rex` because `infer_rex().w()` is not allowed.
|
|
let needs_rex = test_input(0, inst, divert, func, is_extended_reg)
|
|
|| test_input(1, inst, divert, func, is_extended_reg);
|
|
size_plus_maybe_sib_for_inreg_1(sizing, enc, inst, divert, func) + if needs_rex { 1 } else { 0 }
|
|
}
|
|
|
|
/// Calculates the size while inferring if the first input register (inreg0) and first output
|
|
/// register (outreg0) require a dynamic REX and if the first input register (inreg0) requires a
|
|
/// SIB or offset.
|
|
fn size_plus_maybe_sib_or_offset_for_inreg_0_plus_rex_prefix_for_inreg0_outreg0(
|
|
sizing: &RecipeSizing,
|
|
enc: Encoding,
|
|
inst: Inst,
|
|
divert: &RegDiversions,
|
|
func: &Function,
|
|
) -> u8 {
|
|
// No need to check for REX.W in `needs_rex` because `infer_rex().w()` is not allowed.
|
|
let needs_rex = test_input(0, inst, divert, func, is_extended_reg)
|
|
|| test_result(0, inst, divert, func, is_extended_reg);
|
|
size_plus_maybe_sib_or_offset_for_inreg_0(sizing, enc, inst, divert, func)
|
|
+ if needs_rex { 1 } else { 0 }
|
|
}
|
|
|
|
/// Calculates the size while inferring if the first input register (inreg0) and first output
|
|
/// register (outreg0) require a dynamic REX and if the first input register (inreg0) requires a
|
|
/// SIB.
|
|
fn size_plus_maybe_sib_for_inreg_0_plus_rex_prefix_for_inreg0_outreg0(
|
|
sizing: &RecipeSizing,
|
|
enc: Encoding,
|
|
inst: Inst,
|
|
divert: &RegDiversions,
|
|
func: &Function,
|
|
) -> u8 {
|
|
// No need to check for REX.W in `needs_rex` because `infer_rex().w()` is not allowed.
|
|
let needs_rex = test_input(0, inst, divert, func, is_extended_reg)
|
|
|| test_result(0, inst, divert, func, is_extended_reg);
|
|
size_plus_maybe_sib_for_inreg_0(sizing, enc, inst, divert, func) + if needs_rex { 1 } else { 0 }
|
|
}
|
|
|
|
/// Infers whether a dynamic REX prefix will be emitted, for use with one input reg.
|
|
///
|
|
/// A REX prefix is known to be emitted if either:
|
|
/// 1. The EncodingBits specify that REX.W is to be set.
|
|
/// 2. Registers are used that require REX.R or REX.B bits for encoding.
|
|
fn size_with_inferred_rex_for_inreg0(
|
|
sizing: &RecipeSizing,
|
|
_enc: Encoding,
|
|
inst: Inst,
|
|
divert: &RegDiversions,
|
|
func: &Function,
|
|
) -> u8 {
|
|
// No need to check for REX.W in `needs_rex` because `infer_rex().w()` is not allowed.
|
|
let needs_rex = test_input(0, inst, divert, func, is_extended_reg);
|
|
sizing.base_size + if needs_rex { 1 } else { 0 }
|
|
}
|
|
|
|
/// Infers whether a dynamic REX prefix will be emitted, based on the second operand.
|
|
fn size_with_inferred_rex_for_inreg1(
|
|
sizing: &RecipeSizing,
|
|
_enc: Encoding,
|
|
inst: Inst,
|
|
divert: &RegDiversions,
|
|
func: &Function,
|
|
) -> u8 {
|
|
// No need to check for REX.W in `needs_rex` because `infer_rex().w()` is not allowed.
|
|
let needs_rex = test_input(1, inst, divert, func, is_extended_reg);
|
|
sizing.base_size + if needs_rex { 1 } else { 0 }
|
|
}
|
|
|
|
/// Infers whether a dynamic REX prefix will be emitted, based on the third operand.
|
|
fn size_with_inferred_rex_for_inreg2(
|
|
sizing: &RecipeSizing,
|
|
_: Encoding,
|
|
inst: Inst,
|
|
divert: &RegDiversions,
|
|
func: &Function,
|
|
) -> u8 {
|
|
// No need to check for REX.W in `needs_rex` because `infer_rex().w()` is not allowed.
|
|
let needs_rex = test_input(2, inst, divert, func, is_extended_reg);
|
|
sizing.base_size + if needs_rex { 1 } else { 0 }
|
|
}
|
|
|
|
/// Infers whether a dynamic REX prefix will be emitted, for use with two input registers.
|
|
///
|
|
/// A REX prefix is known to be emitted if either:
|
|
/// 1. The EncodingBits specify that REX.W is to be set.
|
|
/// 2. Registers are used that require REX.R or REX.B bits for encoding.
|
|
fn size_with_inferred_rex_for_inreg0_inreg1(
|
|
sizing: &RecipeSizing,
|
|
_enc: Encoding,
|
|
inst: Inst,
|
|
divert: &RegDiversions,
|
|
func: &Function,
|
|
) -> u8 {
|
|
// No need to check for REX.W in `needs_rex` because `infer_rex().w()` is not allowed.
|
|
let needs_rex = test_input(0, inst, divert, func, is_extended_reg)
|
|
|| test_input(1, inst, divert, func, is_extended_reg);
|
|
sizing.base_size + if needs_rex { 1 } else { 0 }
|
|
}
|
|
|
|
/// Infers whether a dynamic REX prefix will be emitted, based on a single
|
|
/// input register and a single output register.
|
|
fn size_with_inferred_rex_for_inreg0_outreg0(
|
|
sizing: &RecipeSizing,
|
|
_enc: Encoding,
|
|
inst: Inst,
|
|
divert: &RegDiversions,
|
|
func: &Function,
|
|
) -> u8 {
|
|
// No need to check for REX.W in `needs_rex` because `infer_rex().w()` is not allowed.
|
|
let needs_rex = test_input(0, inst, divert, func, is_extended_reg)
|
|
|| test_result(0, inst, divert, func, is_extended_reg);
|
|
sizing.base_size + if needs_rex { 1 } else { 0 }
|
|
}
|
|
|
|
/// Infers whether a dynamic REX prefix will be emitted, based on a single output register.
|
|
fn size_with_inferred_rex_for_outreg0(
|
|
sizing: &RecipeSizing,
|
|
_enc: Encoding,
|
|
inst: Inst,
|
|
divert: &RegDiversions,
|
|
func: &Function,
|
|
) -> u8 {
|
|
// No need to check for REX.W in `needs_rex` because `infer_rex().w()` is not allowed.
|
|
let needs_rex = test_result(0, inst, divert, func, is_extended_reg);
|
|
sizing.base_size + if needs_rex { 1 } else { 0 }
|
|
}
|
|
|
|
/// Infers whether a dynamic REX prefix will be emitted, for use with CMOV.
|
|
///
|
|
/// CMOV uses 3 inputs, with the REX is inferred from reg1 and reg2.
|
|
fn size_with_inferred_rex_for_cmov(
|
|
sizing: &RecipeSizing,
|
|
_enc: Encoding,
|
|
inst: Inst,
|
|
divert: &RegDiversions,
|
|
func: &Function,
|
|
) -> u8 {
|
|
// No need to check for REX.W in `needs_rex` because `infer_rex().w()` is not allowed.
|
|
let needs_rex = test_input(1, inst, divert, func, is_extended_reg)
|
|
|| test_input(2, inst, divert, func, is_extended_reg);
|
|
sizing.base_size + if needs_rex { 1 } else { 0 }
|
|
}
|
|
|
|
/// If the value's definition is a constant immediate, returns its unpacked value, or None
|
|
/// otherwise.
|
|
fn maybe_iconst_imm(pos: &FuncCursor, value: ir::Value) -> Option<i64> {
|
|
if let ir::ValueDef::Result(inst, _) = &pos.func.dfg.value_def(value) {
|
|
if let ir::InstructionData::UnaryImm {
|
|
opcode: ir::Opcode::Iconst,
|
|
imm,
|
|
} = &pos.func.dfg[*inst]
|
|
{
|
|
let value: i64 = (*imm).into();
|
|
Some(value)
|
|
} else {
|
|
None
|
|
}
|
|
} else {
|
|
None
|
|
}
|
|
}
|
|
|
|
/// Expand the `sdiv` and `srem` instructions using `x86_sdivmodx`.
|
|
fn expand_sdivrem(
|
|
inst: ir::Inst,
|
|
func: &mut ir::Function,
|
|
cfg: &mut ControlFlowGraph,
|
|
isa: &dyn TargetIsa,
|
|
) {
|
|
let (x, y, is_srem) = match func.dfg[inst] {
|
|
ir::InstructionData::Binary {
|
|
opcode: ir::Opcode::Sdiv,
|
|
args,
|
|
} => (args[0], args[1], false),
|
|
ir::InstructionData::Binary {
|
|
opcode: ir::Opcode::Srem,
|
|
args,
|
|
} => (args[0], args[1], true),
|
|
_ => panic!("Need sdiv/srem: {}", func.dfg.display_inst(inst, None)),
|
|
};
|
|
|
|
let old_block = func.layout.pp_block(inst);
|
|
let result = func.dfg.first_result(inst);
|
|
let ty = func.dfg.value_type(result);
|
|
|
|
let mut pos = FuncCursor::new(func).at_inst(inst);
|
|
pos.use_srcloc(inst);
|
|
pos.func.dfg.clear_results(inst);
|
|
|
|
let avoid_div_traps = isa.flags().avoid_div_traps();
|
|
|
|
// If we can tolerate native division traps, sdiv doesn't need branching.
|
|
if !avoid_div_traps && !is_srem {
|
|
let xhi = pos.ins().sshr_imm(x, i64::from(ty.lane_bits()) - 1);
|
|
pos.ins().with_result(result).x86_sdivmodx(x, xhi, y);
|
|
pos.remove_inst();
|
|
return;
|
|
}
|
|
|
|
// Try to remove checks if the input value is an immediate other than 0 or -1. For these two
|
|
// immediates, we'd ideally replace conditional traps by traps, but this requires more
|
|
// manipulation of the dfg/cfg, which is out of scope here.
|
|
let (could_be_zero, could_be_minus_one) = if let Some(imm) = maybe_iconst_imm(&pos, y) {
|
|
(imm == 0, imm == -1)
|
|
} else {
|
|
(true, true)
|
|
};
|
|
|
|
// Put in an explicit division-by-zero trap if the environment requires it.
|
|
if avoid_div_traps && could_be_zero {
|
|
pos.ins().trapz(y, ir::TrapCode::IntegerDivisionByZero);
|
|
}
|
|
|
|
if !could_be_minus_one {
|
|
let xhi = pos.ins().sshr_imm(x, i64::from(ty.lane_bits()) - 1);
|
|
let reuse = if is_srem {
|
|
[None, Some(result)]
|
|
} else {
|
|
[Some(result), None]
|
|
};
|
|
pos.ins().with_results(reuse).x86_sdivmodx(x, xhi, y);
|
|
pos.remove_inst();
|
|
return;
|
|
}
|
|
|
|
// block handling the nominal case.
|
|
let nominal = pos.func.dfg.make_block();
|
|
|
|
// block handling the -1 divisor case.
|
|
let minus_one = pos.func.dfg.make_block();
|
|
|
|
// Final block with one argument representing the final result value.
|
|
let done = pos.func.dfg.make_block();
|
|
|
|
// Move the `inst` result value onto the `done` block.
|
|
pos.func.dfg.attach_block_param(done, result);
|
|
|
|
// Start by checking for a -1 divisor which needs to be handled specially.
|
|
let is_m1 = pos.ins().ifcmp_imm(y, -1);
|
|
pos.ins().brif(IntCC::Equal, is_m1, minus_one, &[]);
|
|
pos.ins().jump(nominal, &[]);
|
|
|
|
// Now it is safe to execute the `x86_sdivmodx` instruction which will still trap on division
|
|
// by zero.
|
|
pos.insert_block(nominal);
|
|
let xhi = pos.ins().sshr_imm(x, i64::from(ty.lane_bits()) - 1);
|
|
let (quot, rem) = pos.ins().x86_sdivmodx(x, xhi, y);
|
|
let divres = if is_srem { rem } else { quot };
|
|
pos.ins().jump(done, &[divres]);
|
|
|
|
// Now deal with the -1 divisor case.
|
|
pos.insert_block(minus_one);
|
|
let m1_result = if is_srem {
|
|
// x % -1 = 0.
|
|
pos.ins().iconst(ty, 0)
|
|
} else {
|
|
// Explicitly check for overflow: Trap when x == INT_MIN.
|
|
debug_assert!(avoid_div_traps, "Native trapping divide handled above");
|
|
let f = pos.ins().ifcmp_imm(x, -1 << (ty.lane_bits() - 1));
|
|
pos.ins()
|
|
.trapif(IntCC::Equal, f, ir::TrapCode::IntegerOverflow);
|
|
// x / -1 = -x.
|
|
pos.ins().irsub_imm(x, 0)
|
|
};
|
|
|
|
// Recycle the original instruction as a jump.
|
|
pos.func.dfg.replace(inst).jump(done, &[m1_result]);
|
|
|
|
// Finally insert a label for the completion.
|
|
pos.next_inst();
|
|
pos.insert_block(done);
|
|
|
|
cfg.recompute_block(pos.func, old_block);
|
|
cfg.recompute_block(pos.func, nominal);
|
|
cfg.recompute_block(pos.func, minus_one);
|
|
cfg.recompute_block(pos.func, done);
|
|
}
|
|
|
|
/// Expand the `udiv` and `urem` instructions using `x86_udivmodx`.
|
|
fn expand_udivrem(
|
|
inst: ir::Inst,
|
|
func: &mut ir::Function,
|
|
_cfg: &mut ControlFlowGraph,
|
|
isa: &dyn TargetIsa,
|
|
) {
|
|
let (x, y, is_urem) = match func.dfg[inst] {
|
|
ir::InstructionData::Binary {
|
|
opcode: ir::Opcode::Udiv,
|
|
args,
|
|
} => (args[0], args[1], false),
|
|
ir::InstructionData::Binary {
|
|
opcode: ir::Opcode::Urem,
|
|
args,
|
|
} => (args[0], args[1], true),
|
|
_ => panic!("Need udiv/urem: {}", func.dfg.display_inst(inst, None)),
|
|
};
|
|
let avoid_div_traps = isa.flags().avoid_div_traps();
|
|
let result = func.dfg.first_result(inst);
|
|
let ty = func.dfg.value_type(result);
|
|
|
|
let mut pos = FuncCursor::new(func).at_inst(inst);
|
|
pos.use_srcloc(inst);
|
|
pos.func.dfg.clear_results(inst);
|
|
|
|
// Put in an explicit division-by-zero trap if the environment requires it.
|
|
if avoid_div_traps {
|
|
let zero_check = if let Some(imm) = maybe_iconst_imm(&pos, y) {
|
|
// Ideally, we'd just replace the conditional trap with a trap when the immediate is
|
|
// zero, but this requires more manipulation of the dfg/cfg, which is out of scope
|
|
// here.
|
|
imm == 0
|
|
} else {
|
|
true
|
|
};
|
|
if zero_check {
|
|
pos.ins().trapz(y, ir::TrapCode::IntegerDivisionByZero);
|
|
}
|
|
}
|
|
|
|
// Now it is safe to execute the `x86_udivmodx` instruction.
|
|
let xhi = pos.ins().iconst(ty, 0);
|
|
let reuse = if is_urem {
|
|
[None, Some(result)]
|
|
} else {
|
|
[Some(result), None]
|
|
};
|
|
pos.ins().with_results(reuse).x86_udivmodx(x, xhi, y);
|
|
pos.remove_inst();
|
|
}
|
|
|
|
/// Expand the `fmin` and `fmax` instructions using the x86 `x86_fmin` and `x86_fmax`
|
|
/// instructions.
|
|
fn expand_minmax(
|
|
inst: ir::Inst,
|
|
func: &mut ir::Function,
|
|
cfg: &mut ControlFlowGraph,
|
|
_isa: &dyn TargetIsa,
|
|
) {
|
|
let (x, y, x86_opc, bitwise_opc) = match func.dfg[inst] {
|
|
ir::InstructionData::Binary {
|
|
opcode: ir::Opcode::Fmin,
|
|
args,
|
|
} => (args[0], args[1], ir::Opcode::X86Fmin, ir::Opcode::Bor),
|
|
ir::InstructionData::Binary {
|
|
opcode: ir::Opcode::Fmax,
|
|
args,
|
|
} => (args[0], args[1], ir::Opcode::X86Fmax, ir::Opcode::Band),
|
|
_ => panic!("Expected fmin/fmax: {}", func.dfg.display_inst(inst, None)),
|
|
};
|
|
let old_block = func.layout.pp_block(inst);
|
|
|
|
// We need to handle the following conditions, depending on how x and y compare:
|
|
//
|
|
// 1. LT or GT: The native `x86_opc` min/max instruction does what we need.
|
|
// 2. EQ: We need to use `bitwise_opc` to make sure that
|
|
// fmin(0.0, -0.0) -> -0.0 and fmax(0.0, -0.0) -> 0.0.
|
|
// 3. UN: We need to produce a quiet NaN that is canonical if the inputs are canonical.
|
|
|
|
// block handling case 1) where operands are ordered but not equal.
|
|
let one_block = func.dfg.make_block();
|
|
|
|
// block handling case 3) where one operand is NaN.
|
|
let uno_block = func.dfg.make_block();
|
|
|
|
// block that handles the unordered or equal cases 2) and 3).
|
|
let ueq_block = func.dfg.make_block();
|
|
|
|
// block handling case 2) where operands are ordered and equal.
|
|
let eq_block = func.dfg.make_block();
|
|
|
|
// Final block with one argument representing the final result value.
|
|
let done = func.dfg.make_block();
|
|
|
|
// The basic blocks are laid out to minimize branching for the common cases:
|
|
//
|
|
// 1) One branch not taken, one jump.
|
|
// 2) One branch taken.
|
|
// 3) Two branches taken, one jump.
|
|
|
|
// Move the `inst` result value onto the `done` block.
|
|
let result = func.dfg.first_result(inst);
|
|
let ty = func.dfg.value_type(result);
|
|
func.dfg.clear_results(inst);
|
|
func.dfg.attach_block_param(done, result);
|
|
|
|
// Test for case 1) ordered and not equal.
|
|
let mut pos = FuncCursor::new(func).at_inst(inst);
|
|
pos.use_srcloc(inst);
|
|
let cmp_ueq = pos.ins().fcmp(FloatCC::UnorderedOrEqual, x, y);
|
|
pos.ins().brnz(cmp_ueq, ueq_block, &[]);
|
|
pos.ins().jump(one_block, &[]);
|
|
|
|
// Handle the common ordered, not equal (LT|GT) case.
|
|
pos.insert_block(one_block);
|
|
let one_inst = pos.ins().Binary(x86_opc, ty, x, y).0;
|
|
let one_result = pos.func.dfg.first_result(one_inst);
|
|
pos.ins().jump(done, &[one_result]);
|
|
|
|
// Case 3) Unordered.
|
|
// We know that at least one operand is a NaN that needs to be propagated. We simply use an
|
|
// `fadd` instruction which has the same NaN propagation semantics.
|
|
pos.insert_block(uno_block);
|
|
let uno_result = pos.ins().fadd(x, y);
|
|
pos.ins().jump(done, &[uno_result]);
|
|
|
|
// Case 2) or 3).
|
|
pos.insert_block(ueq_block);
|
|
// Test for case 3) (UN) one value is NaN.
|
|
// TODO: When we get support for flag values, we can reuse the above comparison.
|
|
let cmp_uno = pos.ins().fcmp(FloatCC::Unordered, x, y);
|
|
pos.ins().brnz(cmp_uno, uno_block, &[]);
|
|
pos.ins().jump(eq_block, &[]);
|
|
|
|
// We are now in case 2) where x and y compare EQ.
|
|
// We need a bitwise operation to get the sign right.
|
|
pos.insert_block(eq_block);
|
|
let bw_inst = pos.ins().Binary(bitwise_opc, ty, x, y).0;
|
|
let bw_result = pos.func.dfg.first_result(bw_inst);
|
|
// This should become a fall-through for this second most common case.
|
|
// Recycle the original instruction as a jump.
|
|
pos.func.dfg.replace(inst).jump(done, &[bw_result]);
|
|
|
|
// Finally insert a label for the completion.
|
|
pos.next_inst();
|
|
pos.insert_block(done);
|
|
|
|
cfg.recompute_block(pos.func, old_block);
|
|
cfg.recompute_block(pos.func, one_block);
|
|
cfg.recompute_block(pos.func, uno_block);
|
|
cfg.recompute_block(pos.func, ueq_block);
|
|
cfg.recompute_block(pos.func, eq_block);
|
|
cfg.recompute_block(pos.func, done);
|
|
}
|
|
|
|
/// x86 has no unsigned-to-float conversions. We handle the easy case of zero-extending i32 to
|
|
/// i64 with a pattern, the rest needs more code.
|
|
fn expand_fcvt_from_uint(
|
|
inst: ir::Inst,
|
|
func: &mut ir::Function,
|
|
cfg: &mut ControlFlowGraph,
|
|
_isa: &dyn TargetIsa,
|
|
) {
|
|
let x;
|
|
match func.dfg[inst] {
|
|
ir::InstructionData::Unary {
|
|
opcode: ir::Opcode::FcvtFromUint,
|
|
arg,
|
|
} => x = arg,
|
|
_ => panic!("Need fcvt_from_uint: {}", func.dfg.display_inst(inst, None)),
|
|
}
|
|
let xty = func.dfg.value_type(x);
|
|
let result = func.dfg.first_result(inst);
|
|
let ty = func.dfg.value_type(result);
|
|
let mut pos = FuncCursor::new(func).at_inst(inst);
|
|
pos.use_srcloc(inst);
|
|
|
|
// Conversion from an unsigned int smaller than 64bit is easy on x86-64.
|
|
match xty {
|
|
ir::types::I8 | ir::types::I16 | ir::types::I32 => {
|
|
// TODO: This should be guarded by an ISA check.
|
|
let wide = pos.ins().uextend(ir::types::I64, x);
|
|
pos.func.dfg.replace(inst).fcvt_from_sint(ty, wide);
|
|
return;
|
|
}
|
|
ir::types::I64 => {}
|
|
_ => unimplemented!(),
|
|
}
|
|
|
|
let old_block = pos.func.layout.pp_block(inst);
|
|
|
|
// block handling the case where x >= 0.
|
|
let poszero_block = pos.func.dfg.make_block();
|
|
|
|
// block handling the case where x < 0.
|
|
let neg_block = pos.func.dfg.make_block();
|
|
|
|
// Final block with one argument representing the final result value.
|
|
let done = pos.func.dfg.make_block();
|
|
|
|
// Move the `inst` result value onto the `done` block.
|
|
pos.func.dfg.clear_results(inst);
|
|
pos.func.dfg.attach_block_param(done, result);
|
|
|
|
// If x as a signed int is not negative, we can use the existing `fcvt_from_sint` instruction.
|
|
let is_neg = pos.ins().icmp_imm(IntCC::SignedLessThan, x, 0);
|
|
pos.ins().brnz(is_neg, neg_block, &[]);
|
|
pos.ins().jump(poszero_block, &[]);
|
|
|
|
// Easy case: just use a signed conversion.
|
|
pos.insert_block(poszero_block);
|
|
let posres = pos.ins().fcvt_from_sint(ty, x);
|
|
pos.ins().jump(done, &[posres]);
|
|
|
|
// Now handle the negative case.
|
|
pos.insert_block(neg_block);
|
|
|
|
// Divide x by two to get it in range for the signed conversion, keep the LSB, and scale it
|
|
// back up on the FP side.
|
|
let ihalf = pos.ins().ushr_imm(x, 1);
|
|
let lsb = pos.ins().band_imm(x, 1);
|
|
let ifinal = pos.ins().bor(ihalf, lsb);
|
|
let fhalf = pos.ins().fcvt_from_sint(ty, ifinal);
|
|
let negres = pos.ins().fadd(fhalf, fhalf);
|
|
|
|
// Recycle the original instruction as a jump.
|
|
pos.func.dfg.replace(inst).jump(done, &[negres]);
|
|
|
|
// Finally insert a label for the completion.
|
|
pos.next_inst();
|
|
pos.insert_block(done);
|
|
|
|
cfg.recompute_block(pos.func, old_block);
|
|
cfg.recompute_block(pos.func, poszero_block);
|
|
cfg.recompute_block(pos.func, neg_block);
|
|
cfg.recompute_block(pos.func, done);
|
|
}
|
|
|
|
fn expand_fcvt_to_sint(
|
|
inst: ir::Inst,
|
|
func: &mut ir::Function,
|
|
cfg: &mut ControlFlowGraph,
|
|
_isa: &dyn TargetIsa,
|
|
) {
|
|
use crate::ir::immediates::{Ieee32, Ieee64};
|
|
|
|
let x = match func.dfg[inst] {
|
|
ir::InstructionData::Unary {
|
|
opcode: ir::Opcode::FcvtToSint,
|
|
arg,
|
|
} => arg,
|
|
_ => panic!("Need fcvt_to_sint: {}", func.dfg.display_inst(inst, None)),
|
|
};
|
|
let old_block = func.layout.pp_block(inst);
|
|
let xty = func.dfg.value_type(x);
|
|
let result = func.dfg.first_result(inst);
|
|
let ty = func.dfg.value_type(result);
|
|
|
|
// Final block after the bad value checks.
|
|
let done = func.dfg.make_block();
|
|
|
|
// block for checking failure cases.
|
|
let maybe_trap_block = func.dfg.make_block();
|
|
|
|
// The `x86_cvtt2si` performs the desired conversion, but it doesn't trap on NaN or overflow.
|
|
// It produces an INT_MIN result instead.
|
|
func.dfg.replace(inst).x86_cvtt2si(ty, x);
|
|
|
|
let mut pos = FuncCursor::new(func).after_inst(inst);
|
|
pos.use_srcloc(inst);
|
|
|
|
let is_done = pos
|
|
.ins()
|
|
.icmp_imm(IntCC::NotEqual, result, 1 << (ty.lane_bits() - 1));
|
|
pos.ins().brnz(is_done, done, &[]);
|
|
pos.ins().jump(maybe_trap_block, &[]);
|
|
|
|
// We now have the following possibilities:
|
|
//
|
|
// 1. INT_MIN was actually the correct conversion result.
|
|
// 2. The input was NaN -> trap bad_toint
|
|
// 3. The input was out of range -> trap int_ovf
|
|
//
|
|
pos.insert_block(maybe_trap_block);
|
|
|
|
// Check for NaN.
|
|
let is_nan = pos.ins().fcmp(FloatCC::Unordered, x, x);
|
|
pos.ins()
|
|
.trapnz(is_nan, ir::TrapCode::BadConversionToInteger);
|
|
|
|
// Check for case 1: INT_MIN is the correct result.
|
|
// Determine the smallest floating point number that would convert to INT_MIN.
|
|
let mut overflow_cc = FloatCC::LessThan;
|
|
let output_bits = ty.lane_bits();
|
|
let flimit = match xty {
|
|
ir::types::F32 =>
|
|
// An f32 can represent `i16::min_value() - 1` exactly with precision to spare, so
|
|
// there are values less than -2^(N-1) that convert correctly to INT_MIN.
|
|
{
|
|
pos.ins().f32const(if output_bits < 32 {
|
|
overflow_cc = FloatCC::LessThanOrEqual;
|
|
Ieee32::fcvt_to_sint_negative_overflow(output_bits)
|
|
} else {
|
|
Ieee32::pow2(output_bits - 1).neg()
|
|
})
|
|
}
|
|
ir::types::F64 =>
|
|
// An f64 can represent `i32::min_value() - 1` exactly with precision to spare, so
|
|
// there are values less than -2^(N-1) that convert correctly to INT_MIN.
|
|
{
|
|
pos.ins().f64const(if output_bits < 64 {
|
|
overflow_cc = FloatCC::LessThanOrEqual;
|
|
Ieee64::fcvt_to_sint_negative_overflow(output_bits)
|
|
} else {
|
|
Ieee64::pow2(output_bits - 1).neg()
|
|
})
|
|
}
|
|
_ => panic!("Can't convert {}", xty),
|
|
};
|
|
let overflow = pos.ins().fcmp(overflow_cc, x, flimit);
|
|
pos.ins().trapnz(overflow, ir::TrapCode::IntegerOverflow);
|
|
|
|
// Finally, we could have a positive value that is too large.
|
|
let fzero = match xty {
|
|
ir::types::F32 => pos.ins().f32const(Ieee32::with_bits(0)),
|
|
ir::types::F64 => pos.ins().f64const(Ieee64::with_bits(0)),
|
|
_ => panic!("Can't convert {}", xty),
|
|
};
|
|
let overflow = pos.ins().fcmp(FloatCC::GreaterThanOrEqual, x, fzero);
|
|
pos.ins().trapnz(overflow, ir::TrapCode::IntegerOverflow);
|
|
|
|
pos.ins().jump(done, &[]);
|
|
pos.insert_block(done);
|
|
|
|
cfg.recompute_block(pos.func, old_block);
|
|
cfg.recompute_block(pos.func, maybe_trap_block);
|
|
cfg.recompute_block(pos.func, done);
|
|
}
|
|
|
|
fn expand_fcvt_to_sint_sat(
|
|
inst: ir::Inst,
|
|
func: &mut ir::Function,
|
|
cfg: &mut ControlFlowGraph,
|
|
_isa: &dyn TargetIsa,
|
|
) {
|
|
use crate::ir::immediates::{Ieee32, Ieee64};
|
|
|
|
let x = match func.dfg[inst] {
|
|
ir::InstructionData::Unary {
|
|
opcode: ir::Opcode::FcvtToSintSat,
|
|
arg,
|
|
} => arg,
|
|
_ => panic!(
|
|
"Need fcvt_to_sint_sat: {}",
|
|
func.dfg.display_inst(inst, None)
|
|
),
|
|
};
|
|
|
|
let old_block = func.layout.pp_block(inst);
|
|
let xty = func.dfg.value_type(x);
|
|
let result = func.dfg.first_result(inst);
|
|
let ty = func.dfg.value_type(result);
|
|
|
|
// Final block after the bad value checks.
|
|
let done_block = func.dfg.make_block();
|
|
let intmin_block = func.dfg.make_block();
|
|
let minsat_block = func.dfg.make_block();
|
|
let maxsat_block = func.dfg.make_block();
|
|
func.dfg.clear_results(inst);
|
|
func.dfg.attach_block_param(done_block, result);
|
|
|
|
let mut pos = FuncCursor::new(func).at_inst(inst);
|
|
pos.use_srcloc(inst);
|
|
|
|
// The `x86_cvtt2si` performs the desired conversion, but it doesn't trap on NaN or
|
|
// overflow. It produces an INT_MIN result instead.
|
|
let cvtt2si = pos.ins().x86_cvtt2si(ty, x);
|
|
|
|
let is_done = pos
|
|
.ins()
|
|
.icmp_imm(IntCC::NotEqual, cvtt2si, 1 << (ty.lane_bits() - 1));
|
|
pos.ins().brnz(is_done, done_block, &[cvtt2si]);
|
|
pos.ins().jump(intmin_block, &[]);
|
|
|
|
// We now have the following possibilities:
|
|
//
|
|
// 1. INT_MIN was actually the correct conversion result.
|
|
// 2. The input was NaN -> replace the result value with 0.
|
|
// 3. The input was out of range -> saturate the result to the min/max value.
|
|
pos.insert_block(intmin_block);
|
|
|
|
// Check for NaN, which is truncated to 0.
|
|
let zero = pos.ins().iconst(ty, 0);
|
|
let is_nan = pos.ins().fcmp(FloatCC::Unordered, x, x);
|
|
pos.ins().brnz(is_nan, done_block, &[zero]);
|
|
pos.ins().jump(minsat_block, &[]);
|
|
|
|
// Check for case 1: INT_MIN is the correct result.
|
|
// Determine the smallest floating point number that would convert to INT_MIN.
|
|
pos.insert_block(minsat_block);
|
|
let mut overflow_cc = FloatCC::LessThan;
|
|
let output_bits = ty.lane_bits();
|
|
let flimit = match xty {
|
|
ir::types::F32 =>
|
|
// An f32 can represent `i16::min_value() - 1` exactly with precision to spare, so
|
|
// there are values less than -2^(N-1) that convert correctly to INT_MIN.
|
|
{
|
|
pos.ins().f32const(if output_bits < 32 {
|
|
overflow_cc = FloatCC::LessThanOrEqual;
|
|
Ieee32::fcvt_to_sint_negative_overflow(output_bits)
|
|
} else {
|
|
Ieee32::pow2(output_bits - 1).neg()
|
|
})
|
|
}
|
|
ir::types::F64 =>
|
|
// An f64 can represent `i32::min_value() - 1` exactly with precision to spare, so
|
|
// there are values less than -2^(N-1) that convert correctly to INT_MIN.
|
|
{
|
|
pos.ins().f64const(if output_bits < 64 {
|
|
overflow_cc = FloatCC::LessThanOrEqual;
|
|
Ieee64::fcvt_to_sint_negative_overflow(output_bits)
|
|
} else {
|
|
Ieee64::pow2(output_bits - 1).neg()
|
|
})
|
|
}
|
|
_ => panic!("Can't convert {}", xty),
|
|
};
|
|
|
|
let overflow = pos.ins().fcmp(overflow_cc, x, flimit);
|
|
let min_imm = match ty {
|
|
ir::types::I32 => i32::min_value() as i64,
|
|
ir::types::I64 => i64::min_value(),
|
|
_ => panic!("Don't know the min value for {}", ty),
|
|
};
|
|
let min_value = pos.ins().iconst(ty, min_imm);
|
|
pos.ins().brnz(overflow, done_block, &[min_value]);
|
|
pos.ins().jump(maxsat_block, &[]);
|
|
|
|
// Finally, we could have a positive value that is too large.
|
|
pos.insert_block(maxsat_block);
|
|
let fzero = match xty {
|
|
ir::types::F32 => pos.ins().f32const(Ieee32::with_bits(0)),
|
|
ir::types::F64 => pos.ins().f64const(Ieee64::with_bits(0)),
|
|
_ => panic!("Can't convert {}", xty),
|
|
};
|
|
|
|
let max_imm = match ty {
|
|
ir::types::I32 => i32::max_value() as i64,
|
|
ir::types::I64 => i64::max_value(),
|
|
_ => panic!("Don't know the max value for {}", ty),
|
|
};
|
|
let max_value = pos.ins().iconst(ty, max_imm);
|
|
|
|
let overflow = pos.ins().fcmp(FloatCC::GreaterThanOrEqual, x, fzero);
|
|
pos.ins().brnz(overflow, done_block, &[max_value]);
|
|
|
|
// Recycle the original instruction.
|
|
pos.func.dfg.replace(inst).jump(done_block, &[cvtt2si]);
|
|
|
|
// Finally insert a label for the completion.
|
|
pos.next_inst();
|
|
pos.insert_block(done_block);
|
|
|
|
cfg.recompute_block(pos.func, old_block);
|
|
cfg.recompute_block(pos.func, intmin_block);
|
|
cfg.recompute_block(pos.func, minsat_block);
|
|
cfg.recompute_block(pos.func, maxsat_block);
|
|
cfg.recompute_block(pos.func, done_block);
|
|
}
|
|
|
|
fn expand_fcvt_to_uint(
|
|
inst: ir::Inst,
|
|
func: &mut ir::Function,
|
|
cfg: &mut ControlFlowGraph,
|
|
_isa: &dyn TargetIsa,
|
|
) {
|
|
use crate::ir::immediates::{Ieee32, Ieee64};
|
|
|
|
let x = match func.dfg[inst] {
|
|
ir::InstructionData::Unary {
|
|
opcode: ir::Opcode::FcvtToUint,
|
|
arg,
|
|
} => arg,
|
|
_ => panic!("Need fcvt_to_uint: {}", func.dfg.display_inst(inst, None)),
|
|
};
|
|
|
|
let old_block = func.layout.pp_block(inst);
|
|
let xty = func.dfg.value_type(x);
|
|
let result = func.dfg.first_result(inst);
|
|
let ty = func.dfg.value_type(result);
|
|
|
|
// block handle numbers < 2^(N-1).
|
|
let below_uint_max_block = func.dfg.make_block();
|
|
|
|
// block handle numbers < 0.
|
|
let below_zero_block = func.dfg.make_block();
|
|
|
|
// block handling numbers >= 2^(N-1).
|
|
let large = func.dfg.make_block();
|
|
|
|
// Final block after the bad value checks.
|
|
let done = func.dfg.make_block();
|
|
|
|
// Move the `inst` result value onto the `done` block.
|
|
func.dfg.clear_results(inst);
|
|
func.dfg.attach_block_param(done, result);
|
|
|
|
let mut pos = FuncCursor::new(func).at_inst(inst);
|
|
pos.use_srcloc(inst);
|
|
|
|
// Start by materializing the floating point constant 2^(N-1) where N is the number of bits in
|
|
// the destination integer type.
|
|
let pow2nm1 = match xty {
|
|
ir::types::F32 => pos.ins().f32const(Ieee32::pow2(ty.lane_bits() - 1)),
|
|
ir::types::F64 => pos.ins().f64const(Ieee64::pow2(ty.lane_bits() - 1)),
|
|
_ => panic!("Can't convert {}", xty),
|
|
};
|
|
let is_large = pos.ins().ffcmp(x, pow2nm1);
|
|
pos.ins()
|
|
.brff(FloatCC::GreaterThanOrEqual, is_large, large, &[]);
|
|
pos.ins().jump(below_uint_max_block, &[]);
|
|
|
|
// We need to generate a specific trap code when `x` is NaN, so reuse the flags from the
|
|
// previous comparison.
|
|
pos.insert_block(below_uint_max_block);
|
|
pos.ins().trapff(
|
|
FloatCC::Unordered,
|
|
is_large,
|
|
ir::TrapCode::BadConversionToInteger,
|
|
);
|
|
|
|
// Now we know that x < 2^(N-1) and not NaN.
|
|
let sres = pos.ins().x86_cvtt2si(ty, x);
|
|
let is_neg = pos.ins().ifcmp_imm(sres, 0);
|
|
pos.ins()
|
|
.brif(IntCC::SignedGreaterThanOrEqual, is_neg, done, &[sres]);
|
|
pos.ins().jump(below_zero_block, &[]);
|
|
|
|
pos.insert_block(below_zero_block);
|
|
pos.ins().trap(ir::TrapCode::IntegerOverflow);
|
|
|
|
// Handle the case where x >= 2^(N-1) and not NaN.
|
|
pos.insert_block(large);
|
|
let adjx = pos.ins().fsub(x, pow2nm1);
|
|
let lres = pos.ins().x86_cvtt2si(ty, adjx);
|
|
let is_neg = pos.ins().ifcmp_imm(lres, 0);
|
|
pos.ins()
|
|
.trapif(IntCC::SignedLessThan, is_neg, ir::TrapCode::IntegerOverflow);
|
|
let lfinal = pos.ins().iadd_imm(lres, 1 << (ty.lane_bits() - 1));
|
|
|
|
// Recycle the original instruction as a jump.
|
|
pos.func.dfg.replace(inst).jump(done, &[lfinal]);
|
|
|
|
// Finally insert a label for the completion.
|
|
pos.next_inst();
|
|
pos.insert_block(done);
|
|
|
|
cfg.recompute_block(pos.func, old_block);
|
|
cfg.recompute_block(pos.func, below_uint_max_block);
|
|
cfg.recompute_block(pos.func, below_zero_block);
|
|
cfg.recompute_block(pos.func, large);
|
|
cfg.recompute_block(pos.func, done);
|
|
}
|
|
|
|
fn expand_fcvt_to_uint_sat(
|
|
inst: ir::Inst,
|
|
func: &mut ir::Function,
|
|
cfg: &mut ControlFlowGraph,
|
|
_isa: &dyn TargetIsa,
|
|
) {
|
|
use crate::ir::immediates::{Ieee32, Ieee64};
|
|
|
|
let x = match func.dfg[inst] {
|
|
ir::InstructionData::Unary {
|
|
opcode: ir::Opcode::FcvtToUintSat,
|
|
arg,
|
|
} => arg,
|
|
_ => panic!(
|
|
"Need fcvt_to_uint_sat: {}",
|
|
func.dfg.display_inst(inst, None)
|
|
),
|
|
};
|
|
|
|
let old_block = func.layout.pp_block(inst);
|
|
let xty = func.dfg.value_type(x);
|
|
let result = func.dfg.first_result(inst);
|
|
let ty = func.dfg.value_type(result);
|
|
|
|
// block handle numbers < 2^(N-1).
|
|
let below_pow2nm1_or_nan_block = func.dfg.make_block();
|
|
let below_pow2nm1_block = func.dfg.make_block();
|
|
|
|
// block handling numbers >= 2^(N-1).
|
|
let large = func.dfg.make_block();
|
|
|
|
// block handling numbers < 2^N.
|
|
let uint_large_block = func.dfg.make_block();
|
|
|
|
// Final block after the bad value checks.
|
|
let done = func.dfg.make_block();
|
|
|
|
// Move the `inst` result value onto the `done` block.
|
|
func.dfg.clear_results(inst);
|
|
func.dfg.attach_block_param(done, result);
|
|
|
|
let mut pos = FuncCursor::new(func).at_inst(inst);
|
|
pos.use_srcloc(inst);
|
|
|
|
// Start by materializing the floating point constant 2^(N-1) where N is the number of bits in
|
|
// the destination integer type.
|
|
let pow2nm1 = match xty {
|
|
ir::types::F32 => pos.ins().f32const(Ieee32::pow2(ty.lane_bits() - 1)),
|
|
ir::types::F64 => pos.ins().f64const(Ieee64::pow2(ty.lane_bits() - 1)),
|
|
_ => panic!("Can't convert {}", xty),
|
|
};
|
|
let zero = pos.ins().iconst(ty, 0);
|
|
let is_large = pos.ins().ffcmp(x, pow2nm1);
|
|
pos.ins()
|
|
.brff(FloatCC::GreaterThanOrEqual, is_large, large, &[]);
|
|
pos.ins().jump(below_pow2nm1_or_nan_block, &[]);
|
|
|
|
// We need to generate zero when `x` is NaN, so reuse the flags from the previous comparison.
|
|
pos.insert_block(below_pow2nm1_or_nan_block);
|
|
pos.ins().brff(FloatCC::Unordered, is_large, done, &[zero]);
|
|
pos.ins().jump(below_pow2nm1_block, &[]);
|
|
|
|
// Now we know that x < 2^(N-1) and not NaN. If the result of the cvtt2si is positive, we're
|
|
// done; otherwise saturate to the minimum unsigned value, that is 0.
|
|
pos.insert_block(below_pow2nm1_block);
|
|
let sres = pos.ins().x86_cvtt2si(ty, x);
|
|
let is_neg = pos.ins().ifcmp_imm(sres, 0);
|
|
pos.ins()
|
|
.brif(IntCC::SignedGreaterThanOrEqual, is_neg, done, &[sres]);
|
|
pos.ins().jump(done, &[zero]);
|
|
|
|
// Handle the case where x >= 2^(N-1) and not NaN.
|
|
pos.insert_block(large);
|
|
let adjx = pos.ins().fsub(x, pow2nm1);
|
|
let lres = pos.ins().x86_cvtt2si(ty, adjx);
|
|
let max_value = pos.ins().iconst(
|
|
ty,
|
|
match ty {
|
|
ir::types::I32 => u32::max_value() as i64,
|
|
ir::types::I64 => u64::max_value() as i64,
|
|
_ => panic!("Can't convert {}", ty),
|
|
},
|
|
);
|
|
let is_neg = pos.ins().ifcmp_imm(lres, 0);
|
|
pos.ins()
|
|
.brif(IntCC::SignedLessThan, is_neg, done, &[max_value]);
|
|
pos.ins().jump(uint_large_block, &[]);
|
|
|
|
pos.insert_block(uint_large_block);
|
|
let lfinal = pos.ins().iadd_imm(lres, 1 << (ty.lane_bits() - 1));
|
|
|
|
// Recycle the original instruction as a jump.
|
|
pos.func.dfg.replace(inst).jump(done, &[lfinal]);
|
|
|
|
// Finally insert a label for the completion.
|
|
pos.next_inst();
|
|
pos.insert_block(done);
|
|
|
|
cfg.recompute_block(pos.func, old_block);
|
|
cfg.recompute_block(pos.func, below_pow2nm1_or_nan_block);
|
|
cfg.recompute_block(pos.func, below_pow2nm1_block);
|
|
cfg.recompute_block(pos.func, large);
|
|
cfg.recompute_block(pos.func, uint_large_block);
|
|
cfg.recompute_block(pos.func, done);
|
|
}
|
|
|
|
/// Convert shuffle instructions.
|
|
fn convert_shuffle(
|
|
inst: ir::Inst,
|
|
func: &mut ir::Function,
|
|
_cfg: &mut ControlFlowGraph,
|
|
_isa: &dyn TargetIsa,
|
|
) {
|
|
let mut pos = FuncCursor::new(func).at_inst(inst);
|
|
pos.use_srcloc(inst);
|
|
|
|
if let ir::InstructionData::Shuffle { args, mask, .. } = pos.func.dfg[inst] {
|
|
// A mask-building helper: in 128-bit SIMD, 0-15 indicate which lane to read from and a 1
|
|
// in the most significant position zeroes the lane.
|
|
let zero_unknown_lane_index = |b: u8| if b > 15 { 0b10000000 } else { b };
|
|
|
|
// We only have to worry about aliasing here because copies will be introduced later (in
|
|
// regalloc).
|
|
let a = pos.func.dfg.resolve_aliases(args[0]);
|
|
let b = pos.func.dfg.resolve_aliases(args[1]);
|
|
let mask = pos
|
|
.func
|
|
.dfg
|
|
.immediates
|
|
.get(mask)
|
|
.expect("The shuffle immediate should have been recorded before this point")
|
|
.clone();
|
|
if a == b {
|
|
// PSHUFB the first argument (since it is the same as the second).
|
|
let constructed_mask = mask
|
|
.iter()
|
|
// If the mask is greater than 15 it still may be referring to a lane in b.
|
|
.map(|&b| if b > 15 { b.wrapping_sub(16) } else { b })
|
|
.map(zero_unknown_lane_index)
|
|
.collect();
|
|
let handle = pos.func.dfg.constants.insert(constructed_mask);
|
|
// Move the built mask into another XMM register.
|
|
let a_type = pos.func.dfg.value_type(a);
|
|
let mask_value = pos.ins().vconst(a_type, handle);
|
|
// Shuffle the single incoming argument.
|
|
pos.func.dfg.replace(inst).x86_pshufb(a, mask_value);
|
|
} else {
|
|
// PSHUFB the first argument, placing zeroes for unused lanes.
|
|
let constructed_mask = mask.iter().cloned().map(zero_unknown_lane_index).collect();
|
|
let handle = pos.func.dfg.constants.insert(constructed_mask);
|
|
// Move the built mask into another XMM register.
|
|
let a_type = pos.func.dfg.value_type(a);
|
|
let mask_value = pos.ins().vconst(a_type, handle);
|
|
// Shuffle the first argument.
|
|
let shuffled_first_arg = pos.ins().x86_pshufb(a, mask_value);
|
|
|
|
// PSHUFB the second argument, placing zeroes for unused lanes.
|
|
let constructed_mask = mask
|
|
.iter()
|
|
.map(|b| b.wrapping_sub(16))
|
|
.map(zero_unknown_lane_index)
|
|
.collect();
|
|
let handle = pos.func.dfg.constants.insert(constructed_mask);
|
|
// Move the built mask into another XMM register.
|
|
let b_type = pos.func.dfg.value_type(b);
|
|
let mask_value = pos.ins().vconst(b_type, handle);
|
|
// Shuffle the second argument.
|
|
let shuffled_second_arg = pos.ins().x86_pshufb(b, mask_value);
|
|
|
|
// OR the vectors together to form the final shuffled value.
|
|
pos.func
|
|
.dfg
|
|
.replace(inst)
|
|
.bor(shuffled_first_arg, shuffled_second_arg);
|
|
|
|
// TODO when AVX512 is enabled we should replace this sequence with a single VPERMB
|
|
};
|
|
}
|
|
}
|
|
|
|
/// Because floats already exist in XMM registers, we can keep them there when executing a CLIF
|
|
/// extractlane instruction
|
|
fn convert_extractlane(
|
|
inst: ir::Inst,
|
|
func: &mut ir::Function,
|
|
_cfg: &mut ControlFlowGraph,
|
|
_isa: &dyn TargetIsa,
|
|
) {
|
|
let mut pos = FuncCursor::new(func).at_inst(inst);
|
|
pos.use_srcloc(inst);
|
|
|
|
if let ir::InstructionData::ExtractLane {
|
|
opcode: ir::Opcode::Extractlane,
|
|
arg,
|
|
lane,
|
|
} = pos.func.dfg[inst]
|
|
{
|
|
// NOTE: the following legalization assumes that the upper bits of the XMM register do
|
|
// not need to be zeroed during extractlane.
|
|
let value_type = pos.func.dfg.value_type(arg);
|
|
if value_type.lane_type().is_float() {
|
|
// Floats are already in XMM registers and can stay there.
|
|
let shuffled = if lane != 0 {
|
|
// Replace the extractlane with a PSHUFD to get the float in the right place.
|
|
match value_type {
|
|
F32X4 => {
|
|
// Move the selected lane to the 0 lane.
|
|
let shuffle_mask: u8 = 0b00_00_00_00 | lane;
|
|
pos.ins().x86_pshufd(arg, shuffle_mask)
|
|
}
|
|
F64X2 => {
|
|
assert_eq!(lane, 1);
|
|
// Because we know the lane == 1, we move the upper 64 bits to the lower
|
|
// 64 bits, leaving the top 64 bits as-is.
|
|
let shuffle_mask = 0b11_10_11_10;
|
|
let bitcast = pos.ins().raw_bitcast(F32X4, arg);
|
|
pos.ins().x86_pshufd(bitcast, shuffle_mask)
|
|
}
|
|
_ => unreachable!(),
|
|
}
|
|
} else {
|
|
// Remove the extractlane instruction, leaving the float where it is.
|
|
arg
|
|
};
|
|
// Then we must bitcast to the right type.
|
|
pos.func
|
|
.dfg
|
|
.replace(inst)
|
|
.raw_bitcast(value_type.lane_type(), shuffled);
|
|
} else {
|
|
// For non-floats, lower with the usual PEXTR* instruction.
|
|
pos.func.dfg.replace(inst).x86_pextr(arg, lane);
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Because floats exist in XMM registers, we can keep them there when executing a CLIF
|
|
/// insertlane instruction
|
|
fn convert_insertlane(
|
|
inst: ir::Inst,
|
|
func: &mut ir::Function,
|
|
_cfg: &mut ControlFlowGraph,
|
|
_isa: &dyn TargetIsa,
|
|
) {
|
|
let mut pos = FuncCursor::new(func).at_inst(inst);
|
|
pos.use_srcloc(inst);
|
|
|
|
if let ir::InstructionData::InsertLane {
|
|
opcode: ir::Opcode::Insertlane,
|
|
args: [vector, replacement],
|
|
lane,
|
|
} = pos.func.dfg[inst]
|
|
{
|
|
let value_type = pos.func.dfg.value_type(vector);
|
|
if value_type.lane_type().is_float() {
|
|
// Floats are already in XMM registers and can stay there.
|
|
match value_type {
|
|
F32X4 => {
|
|
assert!(lane <= 3);
|
|
let immediate = 0b00_00_00_00 | lane << 4;
|
|
// Insert 32-bits from replacement (at index 00, bits 7:8) to vector (lane
|
|
// shifted into bits 5:6).
|
|
pos.func
|
|
.dfg
|
|
.replace(inst)
|
|
.x86_insertps(vector, immediate, replacement)
|
|
}
|
|
F64X2 => {
|
|
let replacement_as_vector = pos.ins().raw_bitcast(F64X2, replacement); // only necessary due to SSA types
|
|
if lane == 0 {
|
|
// Move the lowest quadword in replacement to vector without changing
|
|
// the upper bits.
|
|
pos.func
|
|
.dfg
|
|
.replace(inst)
|
|
.x86_movsd(vector, replacement_as_vector)
|
|
} else {
|
|
assert_eq!(lane, 1);
|
|
// Move the low 64 bits of replacement vector to the high 64 bits of the
|
|
// vector.
|
|
pos.func
|
|
.dfg
|
|
.replace(inst)
|
|
.x86_movlhps(vector, replacement_as_vector)
|
|
}
|
|
}
|
|
_ => unreachable!(),
|
|
};
|
|
} else {
|
|
// For non-floats, lower with the usual PINSR* instruction.
|
|
pos.func
|
|
.dfg
|
|
.replace(inst)
|
|
.x86_pinsr(vector, lane, replacement);
|
|
}
|
|
}
|
|
}
|
|
|
|
/// For SIMD or scalar integer negation, convert `ineg` to `vconst + isub` or `iconst + isub`.
|
|
fn convert_ineg(
|
|
inst: ir::Inst,
|
|
func: &mut ir::Function,
|
|
_cfg: &mut ControlFlowGraph,
|
|
_isa: &dyn TargetIsa,
|
|
) {
|
|
let mut pos = FuncCursor::new(func).at_inst(inst);
|
|
pos.use_srcloc(inst);
|
|
|
|
if let ir::InstructionData::Unary {
|
|
opcode: ir::Opcode::Ineg,
|
|
arg,
|
|
} = pos.func.dfg[inst]
|
|
{
|
|
let value_type = pos.func.dfg.value_type(arg);
|
|
let zero_value = if value_type.is_vector() && value_type.lane_type().is_int() {
|
|
let zero_immediate = pos.func.dfg.constants.insert(vec![0; 16].into());
|
|
pos.ins().vconst(value_type, zero_immediate) // this should be legalized to a PXOR
|
|
} else if value_type.is_int() {
|
|
pos.ins().iconst(value_type, 0)
|
|
} else {
|
|
panic!("Can't convert ineg of type {}", value_type)
|
|
};
|
|
pos.func.dfg.replace(inst).isub(zero_value, arg);
|
|
} else {
|
|
unreachable!()
|
|
}
|
|
}
|
|
|
|
fn expand_tls_value(
|
|
inst: ir::Inst,
|
|
func: &mut ir::Function,
|
|
_cfg: &mut ControlFlowGraph,
|
|
isa: &dyn TargetIsa,
|
|
) {
|
|
use crate::settings::TlsModel;
|
|
|
|
assert!(
|
|
isa.triple().architecture == target_lexicon::Architecture::X86_64,
|
|
"Not yet implemented for {:?}",
|
|
isa.triple(),
|
|
);
|
|
|
|
if let ir::InstructionData::UnaryGlobalValue {
|
|
opcode: ir::Opcode::TlsValue,
|
|
global_value,
|
|
} = func.dfg[inst]
|
|
{
|
|
let ctrl_typevar = func.dfg.ctrl_typevar(inst);
|
|
assert_eq!(ctrl_typevar, ir::types::I64);
|
|
|
|
match isa.flags().tls_model() {
|
|
TlsModel::None => panic!("tls_model flag is not set."),
|
|
TlsModel::ElfGd => {
|
|
func.dfg.replace(inst).x86_elf_tls_get_addr(global_value);
|
|
}
|
|
TlsModel::Macho => {
|
|
func.dfg.replace(inst).x86_macho_tls_get_addr(global_value);
|
|
}
|
|
model => unimplemented!("tls_value for tls model {:?}", model),
|
|
}
|
|
} else {
|
|
unreachable!();
|
|
}
|
|
}
|