Legalize fcvt_to_sint_sat.i32x4 on x86
Use a lengthy sequence involving CVTTPS2DQ to quiet NaNs and saturate overflow.
This commit is contained in:
@@ -381,6 +381,7 @@ fn define_simd(
|
||||
let extractlane = insts.by_name("extractlane");
|
||||
let fcmp = insts.by_name("fcmp");
|
||||
let fcvt_from_uint = insts.by_name("fcvt_from_uint");
|
||||
let fcvt_to_sint_sat = insts.by_name("fcvt_to_sint_sat");
|
||||
let fabs = insts.by_name("fabs");
|
||||
let fneg = insts.by_name("fneg");
|
||||
let iadd_imm = insts.by_name("iadd_imm");
|
||||
@@ -788,6 +789,7 @@ fn define_simd(
|
||||
narrow.custom_legalize(ineg, "convert_ineg");
|
||||
narrow.custom_legalize(ushr, "convert_ushr");
|
||||
narrow.custom_legalize(ishl, "convert_ishl");
|
||||
narrow.custom_legalize(fcvt_to_sint_sat, "expand_fcvt_to_sint_sat_vector");
|
||||
|
||||
narrow_avx.custom_legalize(imul, "convert_i64x2_imul");
|
||||
narrow_avx.custom_legalize(fcvt_from_uint, "expand_fcvt_from_uint_vector");
|
||||
|
||||
@@ -964,6 +964,61 @@ fn expand_fcvt_to_sint_sat(
|
||||
cfg.recompute_block(pos.func, done_block);
|
||||
}
|
||||
|
||||
/// This legalization converts a vector of 32-bit floating point lanes to signed integer lanes
|
||||
/// using CVTTPS2DQ (see encoding of `x86_cvtt2si`). This logic is separate from [expand_fcvt_to_sint_sat]
|
||||
/// above (the scalar version), only due to how the transform groups are set up; TODO if we change
|
||||
/// the SIMD legalization groups, then this logic could be merged into [expand_fcvt_to_sint_sat]
|
||||
/// (see https://github.com/bytecodealliance/wasmtime/issues/1745).
|
||||
fn expand_fcvt_to_sint_sat_vector(
|
||||
inst: ir::Inst,
|
||||
func: &mut ir::Function,
|
||||
_cfg: &mut ControlFlowGraph,
|
||||
_isa: &dyn TargetIsa,
|
||||
) {
|
||||
let mut pos = FuncCursor::new(func).at_inst(inst);
|
||||
pos.use_srcloc(inst);
|
||||
|
||||
if let ir::InstructionData::Unary {
|
||||
opcode: ir::Opcode::FcvtToSintSat,
|
||||
arg,
|
||||
} = pos.func.dfg[inst]
|
||||
{
|
||||
let controlling_type = pos.func.dfg.ctrl_typevar(inst);
|
||||
if controlling_type == I32X4 {
|
||||
debug_assert_eq!(pos.func.dfg.value_type(arg), F32X4);
|
||||
// We must both quiet any NaNs--setting that lane to 0--and saturate any
|
||||
// lanes that might overflow during conversion to the highest/lowest signed integer
|
||||
// allowed in that lane.
|
||||
|
||||
// Saturate NaNs: `fcmp eq` will not match if a lane contains a NaN. We use ANDPS to
|
||||
// avoid doing the comparison twice (we need the zeroed lanes to find differences).
|
||||
let zeroed_nans = pos.ins().fcmp(FloatCC::Equal, arg, arg);
|
||||
let zeroed_nans_bitcast = pos.ins().raw_bitcast(F32X4, zeroed_nans);
|
||||
let zeroed_nans_copy = pos.ins().band(arg, zeroed_nans_bitcast);
|
||||
|
||||
// Find differences with the zeroed lanes (we will only use the MSB: 1 if positive or
|
||||
// NaN, 0 otherwise).
|
||||
let differences = pos.ins().bxor(zeroed_nans_bitcast, arg);
|
||||
let differences_bitcast = pos.ins().raw_bitcast(I32X4, differences);
|
||||
|
||||
// Convert the numeric lanes. CVTTPS2DQ will mark overflows with 0x80000000 (MSB set).
|
||||
let converted = pos.ins().x86_cvtt2si(I32X4, zeroed_nans_copy);
|
||||
|
||||
// Create a mask of all 1s only on positive overflow, 0s otherwise. This uses the MSB
|
||||
// of `differences` (1 when positive or NaN) and the MSB of `converted` (1 on positive
|
||||
// overflow).
|
||||
let tmp = pos.ins().band(differences_bitcast, converted);
|
||||
let mask = pos.ins().sshr_imm(tmp, 31);
|
||||
|
||||
// Apply the mask to create 0x7FFFFFFF for positive overflow. XOR of all 0s (all other
|
||||
// cases) has no effect.
|
||||
pos.func.dfg.replace(inst).bxor(converted, mask);
|
||||
} else {
|
||||
unimplemented!("cannot legalize {}", pos.func.dfg.display_inst(inst, None))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn expand_fcvt_to_uint(
|
||||
inst: ir::Inst,
|
||||
func: &mut ir::Function,
|
||||
|
||||
@@ -17,3 +17,18 @@ block0(v0:i32x4):
|
||||
; nextln: v1 = fadd v10, v7
|
||||
return v1
|
||||
}
|
||||
|
||||
; Legalization test: `fcvt_to_sint_sat.i32x4` on an f32x4 argument is expected to expand into the
; exact instruction sequence pinned by the directives inside the body -- a self-comparison with
; `fcmp eq`, masking with `band`, a `bxor` whose bits are reinterpreted as i32x4, the
; `x86_cvtt2si` conversion, and a final `band` / `sshr_imm 31` / `bxor` that rewrites v1.
; The directives are order-sensitive: they name consecutive values v2 through v9.
function %fcvt_to_sint_sat(f32x4) -> i32x4 {
|
||||
block0(v0:f32x4):
|
||||
v1 = fcvt_to_sint_sat.i32x4 v0
|
||||
; check: v2 = fcmp eq v0, v0
|
||||
; nextln: v3 = raw_bitcast.f32x4 v2
|
||||
; nextln: v4 = band v0, v3
|
||||
; nextln: v5 = bxor v3, v0
|
||||
; nextln: v6 = raw_bitcast.i32x4 v5
|
||||
; nextln: v7 = x86_cvtt2si.i32x4 v4
|
||||
; nextln: v8 = band v6, v7
|
||||
; nextln: v9 = sshr_imm v8, 31
|
||||
; nextln: v1 = bxor v7, v9
|
||||
|
||||
return v1
|
||||
}
|
||||
|
||||
@@ -20,3 +20,11 @@ block0(v0:i32x4):
|
||||
return v1
|
||||
}
|
||||
; run: %fcvt_from_uint([0 0 0 0]) == [0x0.0 0x0.0 0x0.0 0x0.0]
|
||||
|
||||
; Execution test for the vector saturating float-to-signed-integer conversion. The directives
; that follow the function cover zero and small in-range lanes, a fractional negative lane
; (-0x8.1, expected to truncate to -8), and lanes of magnitude 2^100 that are expected to clamp
; to 0x7FFFFFFF (positive) and 0x80000000 (negative).
; NOTE(review): no NaN lane is exercised by these directives -- consider adding one.
function %fcvt_to_sint_sat(f32x4) -> i32x4 {
|
||||
block0(v0:f32x4):
|
||||
v1 = fcvt_to_sint_sat.i32x4 v0
|
||||
return v1
|
||||
}
|
||||
; run: %fcvt_to_sint_sat([0x0.0 -0x1.0 0x1.0 0x1.0p100]) == [0 -1 1 0x7FFFFFFF]
|
||||
; run: %fcvt_to_sint_sat([-0x8.1 0x0.0 0x0.0 -0x1.0p100]) == [-8 0 0 0x80000000]
|
||||
|
||||
Reference in New Issue
Block a user