diff --git a/cranelift/codegen/meta/src/isa/x86/legalize.rs b/cranelift/codegen/meta/src/isa/x86/legalize.rs
index 940ffe6d01..130205fee0 100644
--- a/cranelift/codegen/meta/src/isa/x86/legalize.rs
+++ b/cranelift/codegen/meta/src/isa/x86/legalize.rs
@@ -381,6 +381,7 @@ fn define_simd(
     let extractlane = insts.by_name("extractlane");
     let fcmp = insts.by_name("fcmp");
     let fcvt_from_uint = insts.by_name("fcvt_from_uint");
+    let fcvt_to_sint_sat = insts.by_name("fcvt_to_sint_sat");
     let fabs = insts.by_name("fabs");
     let fneg = insts.by_name("fneg");
     let iadd_imm = insts.by_name("iadd_imm");
@@ -788,6 +789,7 @@ fn define_simd(
     narrow.custom_legalize(ineg, "convert_ineg");
     narrow.custom_legalize(ushr, "convert_ushr");
     narrow.custom_legalize(ishl, "convert_ishl");
+    narrow.custom_legalize(fcvt_to_sint_sat, "expand_fcvt_to_sint_sat_vector");
 
     narrow_avx.custom_legalize(imul, "convert_i64x2_imul");
     narrow_avx.custom_legalize(fcvt_from_uint, "expand_fcvt_from_uint_vector");
diff --git a/cranelift/codegen/src/isa/x86/enc_tables.rs b/cranelift/codegen/src/isa/x86/enc_tables.rs
index 0786d37578..42e5064263 100644
--- a/cranelift/codegen/src/isa/x86/enc_tables.rs
+++ b/cranelift/codegen/src/isa/x86/enc_tables.rs
@@ -964,6 +964,70 @@ fn expand_fcvt_to_sint_sat(
     cfg.recompute_block(pos.func, done_block);
 }
 
+/// Custom legalization for the SIMD form of `fcvt_to_sint_sat`: lowers an `F32X4` -> `I32X4`
+/// saturating conversion to a short sequence built around CVTTPS2DQ (see the encoding of
+/// `x86_cvtt2si`).
+///
+/// The emitted sequence zeroes NaN lanes before converting and afterwards rewrites lanes that
+/// overflowed on the positive side to `0x7FFFFFFF`. Any other controlling type is not yet
+/// supported and hits `unimplemented!`; an instruction that is not a unary `fcvt_to_sint_sat`
+/// is left untouched.
+///
+/// This is kept apart from the scalar [expand_fcvt_to_sint_sat] above only because of how the
+/// transform groups are set up. TODO: if the SIMD legalization groups change, merge this logic
+/// into [expand_fcvt_to_sint_sat] (see https://github.com/bytecodealliance/wasmtime/issues/1745).
+fn expand_fcvt_to_sint_sat_vector(
+    inst: ir::Inst,
+    func: &mut ir::Function,
+    _cfg: &mut ControlFlowGraph,
+    _isa: &dyn TargetIsa,
+) {
+    let mut pos = FuncCursor::new(func).at_inst(inst);
+    pos.use_srcloc(inst);
+
+    // Only unary `fcvt_to_sint_sat` instructions are rewritten; anything else is left as-is.
+    let src = match pos.func.dfg[inst] {
+        ir::InstructionData::Unary {
+            opcode: ir::Opcode::FcvtToSintSat,
+            arg,
+        } => arg,
+        _ => return,
+    };
+
+    if pos.func.dfg.ctrl_typevar(inst) != I32X4 {
+        unimplemented!("cannot legalize {}", pos.func.dfg.display_inst(inst, None))
+    }
+    debug_assert_eq!(pos.func.dfg.value_type(src), F32X4);
+
+    // Two fix-ups surround the raw conversion: NaN lanes must become 0, and lanes that overflow
+    // must saturate to the highest/lowest signed integer representable in the lane.
+
+    // `fcmp eq` of a value with itself fails only for NaN lanes, so `not_nan` is all 1s in
+    // numeric lanes and all 0s in NaN lanes. ANDing it with the input zeroes the NaN lanes, and
+    // the same mask is reused below so the comparison is emitted only once.
+    let not_nan = pos.ins().fcmp(FloatCC::Equal, src, src);
+    let not_nan_bits = pos.ins().raw_bitcast(F32X4, not_nan);
+    let numeric = pos.ins().band(src, not_nan_bits);
+
+    // XOR the mask with the input; only the MSB of each lane is used later. In numeric lanes it
+    // is the inverted sign bit, i.e. 1 when the input is positive.
+    let sign_flipped = pos.ins().bxor(not_nan_bits, src);
+    let sign_flipped_bits = pos.ins().raw_bitcast(I32X4, sign_flipped);
+
+    // Convert the numeric lanes. CVTTPS2DQ marks overflowing lanes with 0x80000000 (MSB set).
+    let converted = pos.ins().x86_cvtt2si(I32X4, numeric);
+
+    // Build a lane mask that is all 1s only on positive overflow: the MSB must be set both in
+    // `sign_flipped_bits` (positive input) and in `converted` (overflow marker); NaN lanes
+    // convert to 0 and never qualify. The arithmetic shift copies that MSB across the lane.
+    let overflow_msb = pos.ins().band(sign_flipped_bits, converted);
+    let positive_overflow = pos.ins().sshr_imm(overflow_msb, 31);
+
+    // XOR turns 0x80000000 into 0x7FFFFFFF in positively-overflowed lanes; XOR with all 0s
+    // leaves every other lane unchanged.
+    pos.func.dfg.replace(inst).bxor(converted, positive_overflow);
+}
+
 fn expand_fcvt_to_uint(
     inst: ir::Inst,
     func: &mut ir::Function,
diff --git a/cranelift/filetests/filetests/isa/x86/simd-conversion-legalize.clif b/cranelift/filetests/filetests/isa/x86/simd-conversion-legalize.clif
index 7db52967e4..912c34d0fc 100644
--- a/cranelift/filetests/filetests/isa/x86/simd-conversion-legalize.clif
+++ b/cranelift/filetests/filetests/isa/x86/simd-conversion-legalize.clif
@@ -17,3 +17,18 @@ block0(v0:i32x4):
     ; nextln: v1 = fadd v10, v7
     return v1
 }
+
+function %fcvt_to_sint_sat(f32x4) -> i32x4 {
+block0(v0:f32x4):
+    v1 = fcvt_to_sint_sat.i32x4 v0
+    ; check: v2 = fcmp eq v0, v0
+    ; nextln: v3 = raw_bitcast.f32x4 v2
+    ; nextln: v4 = band v0, v3
+    ; nextln: v5 = bxor v3, v0
+    ; nextln: v6 = raw_bitcast.i32x4 v5
+    ; nextln: v7 = x86_cvtt2si.i32x4 v4
+    ; nextln: v8 = band v6, v7
+    ; nextln: v9 = sshr_imm v8, 31
+    ; nextln: v1 = bxor v7, v9
+    return v1
+}
diff --git a/cranelift/filetests/filetests/isa/x86/simd-conversion-run.clif b/cranelift/filetests/filetests/isa/x86/simd-conversion-run.clif
index 2a97474adc..8764bceabc 100644
--- a/cranelift/filetests/filetests/isa/x86/simd-conversion-run.clif
+++ b/cranelift/filetests/filetests/isa/x86/simd-conversion-run.clif
@@ -20,3 +20,11 @@ block0(v0:i32x4):
     return v1
 }
 ; run: %fcvt_from_uint([0 0 0 0]) == [0x0.0 0x0.0 0x0.0 0x0.0]
+
+function %fcvt_to_sint_sat(f32x4) -> i32x4 {
+block0(v0:f32x4):
+    v1 = fcvt_to_sint_sat.i32x4 v0
+    return v1
+}
+; run: %fcvt_to_sint_sat([0x0.0 -0x1.0 0x1.0 0x1.0p100]) == [0 -1 1 0x7FFFFFFF]
+; run: %fcvt_to_sint_sat([-0x8.1 0x0.0 0x0.0 -0x1.0p100]) == [-8 0 0 0x80000000]