Add x86 legalization for fcvt_from_uint.f32x4
This converts an `i32x4` into an `f32x4` with some rounding either by using an AVX512VL/F instruction--VCVTUDQ2PS--or a long sequence of SSE4.1 compatible instructions.
This commit is contained in:
@@ -380,6 +380,7 @@ fn define_simd(
|
|||||||
let bxor = insts.by_name("bxor");
|
let bxor = insts.by_name("bxor");
|
||||||
let extractlane = insts.by_name("extractlane");
|
let extractlane = insts.by_name("extractlane");
|
||||||
let fcmp = insts.by_name("fcmp");
|
let fcmp = insts.by_name("fcmp");
|
||||||
|
let fcvt_from_uint = insts.by_name("fcvt_from_uint");
|
||||||
let fabs = insts.by_name("fabs");
|
let fabs = insts.by_name("fabs");
|
||||||
let fneg = insts.by_name("fneg");
|
let fneg = insts.by_name("fneg");
|
||||||
let iadd_imm = insts.by_name("iadd_imm");
|
let iadd_imm = insts.by_name("iadd_imm");
|
||||||
@@ -788,6 +789,6 @@ fn define_simd(
|
|||||||
narrow.custom_legalize(ushr, "convert_ushr");
|
narrow.custom_legalize(ushr, "convert_ushr");
|
||||||
narrow.custom_legalize(ishl, "convert_ishl");
|
narrow.custom_legalize(ishl, "convert_ishl");
|
||||||
|
|
||||||
// This lives in the expand group to avoid conflicting with, e.g., i128 legalizations.
|
|
||||||
narrow_avx.custom_legalize(imul, "convert_i64x2_imul");
|
narrow_avx.custom_legalize(imul, "convert_i64x2_imul");
|
||||||
|
narrow_avx.custom_legalize(fcvt_from_uint, "expand_fcvt_from_uint_vector");
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -48,6 +48,7 @@ pub(crate) fn define(shared_defs: &mut SharedDefinitions) -> TargetIsa {
|
|||||||
x86_32.legalize_type(F32, x86_expand);
|
x86_32.legalize_type(F32, x86_expand);
|
||||||
x86_32.legalize_type(F64, x86_expand);
|
x86_32.legalize_type(F64, x86_expand);
|
||||||
x86_32.legalize_value_type(VectorType::new(I64.into(), 2), x86_narrow_avx);
|
x86_32.legalize_value_type(VectorType::new(I64.into(), 2), x86_narrow_avx);
|
||||||
|
x86_32.legalize_value_type(VectorType::new(F32.into(), 4), x86_narrow_avx);
|
||||||
|
|
||||||
x86_64.legalize_monomorphic(expand_flags);
|
x86_64.legalize_monomorphic(expand_flags);
|
||||||
x86_64.legalize_default(x86_narrow);
|
x86_64.legalize_default(x86_narrow);
|
||||||
@@ -60,6 +61,7 @@ pub(crate) fn define(shared_defs: &mut SharedDefinitions) -> TargetIsa {
|
|||||||
x86_64.legalize_type(F32, x86_expand);
|
x86_64.legalize_type(F32, x86_expand);
|
||||||
x86_64.legalize_type(F64, x86_expand);
|
x86_64.legalize_type(F64, x86_expand);
|
||||||
x86_64.legalize_value_type(VectorType::new(I64.into(), 2), x86_narrow_avx);
|
x86_64.legalize_value_type(VectorType::new(I64.into(), 2), x86_narrow_avx);
|
||||||
|
x86_64.legalize_value_type(VectorType::new(F32.into(), 4), x86_narrow_avx);
|
||||||
|
|
||||||
let recipes = recipes::define(shared_defs, &settings, &regs);
|
let recipes = recipes::define(shared_defs, &settings, &regs);
|
||||||
|
|
||||||
|
|||||||
@@ -598,6 +598,9 @@ fn expand_minmax(
|
|||||||
|
|
||||||
/// x86 has no unsigned-to-float conversions. We handle the easy case of zero-extending i32 to
|
/// x86 has no unsigned-to-float conversions. We handle the easy case of zero-extending i32 to
|
||||||
/// i64 with a pattern, the rest needs more code.
|
/// i64 with a pattern, the rest needs more code.
|
||||||
|
///
|
||||||
|
/// Note that this is the scalar implementation; for the vector implementation see
|
||||||
|
/// [expand_fcvt_from_uint_vector].
|
||||||
fn expand_fcvt_from_uint(
|
fn expand_fcvt_from_uint(
|
||||||
inst: ir::Inst,
|
inst: ir::Inst,
|
||||||
func: &mut ir::Function,
|
func: &mut ir::Function,
|
||||||
@@ -679,6 +682,56 @@ fn expand_fcvt_from_uint(
|
|||||||
cfg.recompute_block(pos.func, done);
|
cfg.recompute_block(pos.func, done);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// To convert packed unsigned integers to their float equivalents, we must legalize to a special
|
||||||
|
/// AVX512 instruction (using MCSR rounding) or use a long sequence of instructions. This logic is
|
||||||
|
/// separate from [expand_fcvt_from_uint] above (the scalar version), only due to how the transform
|
||||||
|
/// groups are set up; TODO if we change the SIMD legalization groups, then this logic could be
|
||||||
|
/// merged into [expand_fcvt_from_uint] (see https://github.com/bytecodealliance/wasmtime/issues/1745).
|
||||||
|
fn expand_fcvt_from_uint_vector(
|
||||||
|
inst: ir::Inst,
|
||||||
|
func: &mut ir::Function,
|
||||||
|
_cfg: &mut ControlFlowGraph,
|
||||||
|
isa: &dyn TargetIsa,
|
||||||
|
) {
|
||||||
|
let mut pos = FuncCursor::new(func).at_inst(inst);
|
||||||
|
pos.use_srcloc(inst);
|
||||||
|
|
||||||
|
if let ir::InstructionData::Unary {
|
||||||
|
opcode: ir::Opcode::FcvtFromUint,
|
||||||
|
arg,
|
||||||
|
} = pos.func.dfg[inst]
|
||||||
|
{
|
||||||
|
let controlling_type = pos.func.dfg.ctrl_typevar(inst);
|
||||||
|
if controlling_type == F32X4 {
|
||||||
|
debug_assert_eq!(pos.func.dfg.value_type(arg), I32X4);
|
||||||
|
let x86_isa = isa
|
||||||
|
.as_any()
|
||||||
|
.downcast_ref::<isa::x86::Isa>()
|
||||||
|
.expect("the target ISA must be x86 at this point");
|
||||||
|
if x86_isa.isa_flags.use_avx512vl_simd() || x86_isa.isa_flags.use_avx512f_simd() {
|
||||||
|
// If we have certain AVX512 features, we can lower this instruction simply.
|
||||||
|
pos.func.dfg.replace(inst).x86_vcvtudq2ps(arg);
|
||||||
|
} else {
|
||||||
|
// Otherwise, we default to a very lengthy SSE4.1-compatible sequence: PXOR,
|
||||||
|
// PBLENDW, PSUB, CVTDQ2PS, PSRLD, CVTDQ2PS, ADDPS, ADDPS
|
||||||
|
let bitcast_arg = pos.ins().raw_bitcast(I16X8, arg);
|
||||||
|
let zero_constant = pos.func.dfg.constants.insert(vec![0; 16].into());
|
||||||
|
let zero = pos.ins().vconst(I16X8, zero_constant);
|
||||||
|
let low = pos.ins().x86_pblendw(zero, bitcast_arg, 0x55);
|
||||||
|
let bitcast_low = pos.ins().raw_bitcast(I32X4, low);
|
||||||
|
let high = pos.ins().isub(arg, bitcast_low);
|
||||||
|
let convert_low = pos.ins().fcvt_from_sint(F32X4, bitcast_low);
|
||||||
|
let shift_high = pos.ins().ushr_imm(high, 1);
|
||||||
|
let convert_high = pos.ins().fcvt_from_sint(F32X4, shift_high);
|
||||||
|
let double_high = pos.ins().fadd(convert_high, convert_high);
|
||||||
|
pos.func.dfg.replace(inst).fadd(double_high, convert_low);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
unimplemented!("cannot legalize {}", pos.func.dfg.display_inst(inst, None))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
fn expand_fcvt_to_sint(
|
fn expand_fcvt_to_sint(
|
||||||
inst: ir::Inst,
|
inst: ir::Inst,
|
||||||
func: &mut ir::Function,
|
func: &mut ir::Function,
|
||||||
|
|||||||
@@ -0,0 +1,10 @@
|
|||||||
|
test legalizer
|
||||||
|
set enable_simd
|
||||||
|
target x86_64 skylake has_avx512f=true
|
||||||
|
|
||||||
|
function %fcvt_from_uint(i32x4) -> f32x4 {
|
||||||
|
block0(v0:i32x4):
|
||||||
|
v1 = fcvt_from_uint.f32x4 v0
|
||||||
|
; check: v1 = x86_vcvtudq2ps v0
|
||||||
|
return v1
|
||||||
|
}
|
||||||
@@ -0,0 +1,19 @@
|
|||||||
|
test legalizer
|
||||||
|
set enable_simd
|
||||||
|
target x86_64 skylake
|
||||||
|
|
||||||
|
function %fcvt_from_uint(i32x4) -> f32x4 {
|
||||||
|
block0(v0:i32x4):
|
||||||
|
v1 = fcvt_from_uint.f32x4 v0
|
||||||
|
; check: v2 = raw_bitcast.i16x8 v0
|
||||||
|
; nextln: v3 = vconst.i16x8 const0
|
||||||
|
; nextln: v4 = x86_pblendw v3, v2, 85
|
||||||
|
; nextln: v5 = raw_bitcast.i32x4 v4
|
||||||
|
; nextln: v6 = isub v0, v5
|
||||||
|
; nextln: v7 = fcvt_from_sint.f32x4 v5
|
||||||
|
; nextln: v8 = ushr_imm v6, 1
|
||||||
|
; nextln: v9 = fcvt_from_sint.f32x4 v8
|
||||||
|
; nextln: v10 = fadd v9, v9
|
||||||
|
; nextln: v1 = fadd v10, v7
|
||||||
|
return v1
|
||||||
|
}
|
||||||
@@ -13,3 +13,10 @@ block0:
|
|||||||
return v4
|
return v4
|
||||||
}
|
}
|
||||||
; run
|
; run
|
||||||
|
|
||||||
|
function %fcvt_from_uint(i32x4) -> f32x4 {
|
||||||
|
block0(v0:i32x4):
|
||||||
|
v1 = fcvt_from_uint.f32x4 v0
|
||||||
|
return v1
|
||||||
|
}
|
||||||
|
; run: %fcvt_from_uint([0 0 0 0]) == [0x0.0 0x0.0 0x0.0 0x0.0]
|
||||||
|
|||||||
Reference in New Issue
Block a user