Add x86 legalization for fcvt_to_uint_sat.i32x4
This converts an `f32x4` into an `i32x4` (unsigned), truncating toward zero and saturating out-of-range lanes, by using a long sequence of SSE4.1-compatible instructions.
This commit is contained in:
@@ -383,6 +383,7 @@ fn define_simd(
|
||||
let fcmp = insts.by_name("fcmp");
|
||||
let fcvt_from_uint = insts.by_name("fcvt_from_uint");
|
||||
let fcvt_to_sint_sat = insts.by_name("fcvt_to_sint_sat");
|
||||
let fcvt_to_uint_sat = insts.by_name("fcvt_to_uint_sat");
|
||||
let fmax = insts.by_name("fmax");
|
||||
let fmin = insts.by_name("fmin");
|
||||
let fneg = insts.by_name("fneg");
|
||||
@@ -797,4 +798,5 @@ fn define_simd(
|
||||
|
||||
narrow_avx.custom_legalize(imul, "convert_i64x2_imul");
|
||||
narrow_avx.custom_legalize(fcvt_from_uint, "expand_fcvt_from_uint_vector");
|
||||
narrow_avx.custom_legalize(fcvt_to_uint_sat, "expand_fcvt_to_uint_sat_vector");
|
||||
}
|
||||
|
||||
@@ -47,6 +47,7 @@ pub(crate) fn define(shared_defs: &mut SharedDefinitions) -> TargetIsa {
|
||||
x86_32.legalize_value_type(ReferenceType(R32), x86_expand);
|
||||
x86_32.legalize_type(F32, x86_expand);
|
||||
x86_32.legalize_type(F64, x86_expand);
|
||||
x86_32.legalize_value_type(VectorType::new(I32.into(), 4), x86_narrow_avx);
|
||||
x86_32.legalize_value_type(VectorType::new(I64.into(), 2), x86_narrow_avx);
|
||||
x86_32.legalize_value_type(VectorType::new(F32.into(), 4), x86_narrow_avx);
|
||||
|
||||
@@ -60,6 +61,7 @@ pub(crate) fn define(shared_defs: &mut SharedDefinitions) -> TargetIsa {
|
||||
x86_64.legalize_value_type(ReferenceType(R64), x86_expand);
|
||||
x86_64.legalize_type(F32, x86_expand);
|
||||
x86_64.legalize_type(F64, x86_expand);
|
||||
x86_64.legalize_value_type(VectorType::new(I32.into(), 4), x86_narrow_avx);
|
||||
x86_64.legalize_value_type(VectorType::new(I64.into(), 2), x86_narrow_avx);
|
||||
x86_64.legalize_value_type(VectorType::new(F32.into(), 4), x86_narrow_avx);
|
||||
|
||||
|
||||
@@ -1313,6 +1313,79 @@ fn expand_fcvt_to_uint_sat(
|
||||
cfg.recompute_block(pos.func, done);
|
||||
}
|
||||
|
||||
// Lanes of an I32x4 filled with the max signed integer values converted to an F32x4. Each of the
// four lanes is stored least-significant byte first and has the bit pattern 0x4f000000, i.e. 2^31
// as an IEEE-754 single: `i32::MAX` (2^31 - 1) is not exactly representable as an f32 and rounds
// up to 2^31.
|
||||
static MAX_SIGNED_I32X4S_AS_F32X4S: [u8; 16] = [
|
||||
0x00, 0x00, 0x00, 0x4f, 0x00, 0x00, 0x00, 0x4f, 0x00, 0x00, 0x00, 0x4f, 0x00, 0x00, 0x00, 0x4f,
|
||||
];
|
||||
|
||||
/// This legalization converts a vector of 32-bit floating point lanes to unsigned integer lanes
|
||||
/// using a long sequence of clamping and truncation instructions: the helper instructions are
/// built at a cursor positioned on `inst`, and `inst` itself is replaced by the final `iadd`.
/// `_cfg` and `_isa` are unused. An instruction that is not a unary `FcvtToUintSat` is left
/// untouched; a controlling type other than `I32X4` hits `unreachable!`. This logic is separate from
|
||||
/// [expand_fcvt_to_uint_sat] above (the scalar version), only due to how the transform groups are
|
||||
/// set up; TODO if we change the SIMD legalization groups, then this logic could be merged into
|
||||
/// [expand_fcvt_to_uint_sat] (see https://github.com/bytecodealliance/wasmtime/issues/1745).
|
||||
fn expand_fcvt_to_uint_sat_vector(
|
||||
inst: ir::Inst,
|
||||
func: &mut ir::Function,
|
||||
_cfg: &mut ControlFlowGraph,
|
||||
_isa: &dyn TargetIsa,
|
||||
) {
|
||||
let mut pos = FuncCursor::new(func).at_inst(inst);
|
||||
pos.use_srcloc(inst);
|
||||
|
||||
if let ir::InstructionData::Unary {
|
||||
opcode: ir::Opcode::FcvtToUintSat,
|
||||
arg,
|
||||
} = pos.func.dfg[inst]
|
||||
{
|
||||
let controlling_type = pos.func.dfg.ctrl_typevar(inst);
|
||||
if controlling_type == I32X4 {
|
||||
debug_assert_eq!(pos.func.dfg.value_type(arg), F32X4);
|
||||
// We must both quiet any NaNs--setting that lane to 0--and saturate any
|
||||
// lanes that might overflow during conversion to the highest/lowest integer
|
||||
// allowed in that lane (0xFFFFFFFF and 0 for these unsigned lanes).
|
||||
let zeroes_constant = pos.func.dfg.constants.insert(vec![0x00; 16].into());
|
||||
let max_signed_constant = pos
|
||||
.func
|
||||
.dfg
|
||||
.constants
|
||||
.insert(MAX_SIGNED_I32X4S_AS_F32X4S.as_ref().into());
|
||||
let zeroes = pos.ins().vconst(F32X4, zeroes_constant);
|
||||
let max_signed = pos.ins().vconst(F32X4, max_signed_constant);
|
||||
// Clamp the input to 0 for negative floating point numbers. TODO we need to
|
||||
// convert NaNs to 0 but this doesn't do that?
// NOTE(review): `x86_fmax` presumably follows MAXPS and returns its second operand
// (`zeroes`) when a lane is NaN, which would already map NaNs to 0 -- confirm against
// the instruction's documentation and drop the TODO if so.
|
||||
let ge_zero = pos.ins().x86_fmax(arg, zeroes);
|
||||
// Find lanes that exceed the max signed value that CVTTPS2DQ knows how to convert.
|
||||
// For floating point numbers above this, CVTTPS2DQ returns the undefined value
|
||||
// 0x80000000. The comparison below is true where `ge_zero - 2^31 >= 2^31`.
|
||||
let minus_max_signed = pos.ins().fsub(ge_zero, max_signed);
|
||||
let le_max_signed =
|
||||
pos.ins()
|
||||
.fcmp(FloatCC::LessThanOrEqual, max_signed, minus_max_signed);
|
||||
// Build a per-lane correction. The truncated `minus_max_signed` is negative (MSB set
// to 1) where it was < 0 and, per the note above, 0x80000000 where it was >= max_signed.
|
||||
// Assuming the comparison yields an all-ones lane mask, the XOR turns those overflowed
// lanes into 0x7FFFFFFF and leaves every other lane unchanged. We are trying to calculate a
|
||||
// valid, in-range addend.
|
||||
let minus_max_signed_as_int = pos.ins().x86_cvtt2si(I32X4, minus_max_signed);
|
||||
let le_max_signed_as_int = pos.ins().raw_bitcast(I32X4, le_max_signed);
|
||||
let difference = pos
|
||||
.ins()
|
||||
.bxor(minus_max_signed_as_int, le_max_signed_as_int);
|
||||
// Calculate amount to add above 0x7FFFFFFF, zeroing out any lanes that are still
|
||||
// negative (MSB set to 1) by taking the signed maximum with 0.
|
||||
let zeroes_as_int = pos.ins().raw_bitcast(I32X4, zeroes);
|
||||
let addend = pos.ins().x86_pmaxs(difference, zeroes_as_int);
|
||||
// Convert the original clamped number to an integer and add back in the addend
|
||||
// (the part of the value above 0x7FFFFFFF, since CVTTPS2DQ overflows with these).
|
||||
let converted = pos.ins().x86_cvtt2si(I32X4, ge_zero);
|
||||
pos.func.dfg.replace(inst).iadd(converted, addend);
|
||||
} else {
|
||||
unreachable!(
|
||||
"{} should not be legalized in expand_fcvt_to_uint_sat_vector",
|
||||
pos.func.dfg.display_inst(inst, None)
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Convert shuffle instructions.
|
||||
fn convert_shuffle(
|
||||
inst: ir::Inst,
|
||||
|
||||
@@ -32,3 +32,23 @@ block0(v0:f32x4):
|
||||
; nextln: v1 = bxor v7, v9
|
||||
return v1
|
||||
}
|
||||
|
||||
; Legalization check for fcvt_to_uint_sat.i32x4: the `check`/`nextln` directives below pin the two
; constants and the exact instruction sequence that must replace the conversion.
function %fcvt_to_uint_sat(f32x4) -> i32x4 {
|
||||
; check: const0 = 0x00000000000000000000000000000000
|
||||
; nextln: const1 = 0x4f0000004f0000004f0000004f000000
|
||||
block0(v0:f32x4):
|
||||
v1 = fcvt_to_uint_sat.i32x4 v0
|
||||
; check: v2 = vconst.f32x4 const0
|
||||
; nextln: v3 = vconst.f32x4 const1
|
||||
; nextln: v4 = x86_fmax v0, v2
|
||||
; nextln: v5 = fsub v4, v3
|
||||
; nextln: v6 = fcmp le v3, v5
|
||||
; nextln: v7 = x86_cvtt2si.i32x4 v5
|
||||
; nextln: v8 = raw_bitcast.i32x4 v6
|
||||
; nextln: v9 = bxor v7, v8
|
||||
; nextln: v10 = raw_bitcast.i32x4 v2
|
||||
; nextln: v11 = x86_pmaxs v9, v10
|
||||
; nextln: v12 = x86_cvtt2si.i32x4 v4
|
||||
; nextln: v1 = iadd v12, v11
|
||||
return v1
|
||||
}
|
||||
|
||||
@@ -28,3 +28,12 @@ block0(v0:f32x4):
|
||||
}
|
||||
; run: %fcvt_to_sint_sat([0x0.0 -0x1.0 0x1.0 0x1.0p100]) == [0 -1 1 0x7FFFFFFF]
|
||||
; run: %fcvt_to_sint_sat([-0x8.1 0x0.0 0x0.0 -0x1.0p100]) == [-8 0 0 0x80000000]
|
||||
|
||||
; Converts an f32x4 to unsigned 32-bit lanes with saturation; the `run` directives that follow
; this function compare its result against expected lane values.
function %fcvt_to_uint_sat(f32x4) -> i32x4 {
|
||||
block0(v0:f32x4):
|
||||
v1 = fcvt_to_uint_sat.i32x4 v0
|
||||
return v1
|
||||
}
|
||||
; run: %fcvt_to_uint_sat([0x1.0 0x4.2 0x4.6 0x1.0p100]) == [1 4 4 0xFFFFFFFF]
|
||||
; run: %fcvt_to_uint_sat([-0x8.1 -0x0.0 0x0.0 -0x1.0p100]) == [0 0 0 0]
|
||||
; run: %fcvt_to_uint_sat([0xB2D05E00.0 0.0 0.0 0.0]) == [3000000000 0 0 0]
|
||||
|
||||
Reference in New Issue
Block a user