From 01d34e71b9bd7b9744a1d87ffb36c969518bf433 Mon Sep 17 00:00:00 2001 From: Andrew Brown Date: Tue, 26 May 2020 15:20:30 -0700 Subject: [PATCH] Add x86 legalization for fcvt_from_uint.f32x4 This converts an `i32x4` into an `f32x4` with some rounding either by using an AVX512VL/F instruction--VCVTUDQ2PS--or a long sequence of SSE4.1 compatible instructions. --- .../codegen/meta/src/isa/x86/legalize.rs | 3 +- cranelift/codegen/meta/src/isa/x86/mod.rs | 2 + cranelift/codegen/src/isa/x86/enc_tables.rs | 53 +++++++++++++++++++ .../x86/simd-avx512-conversion-legalize.clif | 10 ++++ .../isa/x86/simd-conversion-legalize.clif | 19 +++++++ .../isa/x86/simd-conversion-run.clif | 7 +++ 6 files changed, 93 insertions(+), 1 deletion(-) create mode 100644 cranelift/filetests/filetests/isa/x86/simd-avx512-conversion-legalize.clif create mode 100644 cranelift/filetests/filetests/isa/x86/simd-conversion-legalize.clif diff --git a/cranelift/codegen/meta/src/isa/x86/legalize.rs b/cranelift/codegen/meta/src/isa/x86/legalize.rs index 6e5c791b79..940ffe6d01 100644 --- a/cranelift/codegen/meta/src/isa/x86/legalize.rs +++ b/cranelift/codegen/meta/src/isa/x86/legalize.rs @@ -380,6 +380,7 @@ fn define_simd( let bxor = insts.by_name("bxor"); let extractlane = insts.by_name("extractlane"); let fcmp = insts.by_name("fcmp"); + let fcvt_from_uint = insts.by_name("fcvt_from_uint"); let fabs = insts.by_name("fabs"); let fneg = insts.by_name("fneg"); let iadd_imm = insts.by_name("iadd_imm"); @@ -788,6 +789,6 @@ fn define_simd( narrow.custom_legalize(ushr, "convert_ushr"); narrow.custom_legalize(ishl, "convert_ishl"); - // This lives in the expand group to avoid conflicting with, e.g., i128 legalizations. 
narrow_avx.custom_legalize(imul, "convert_i64x2_imul"); + narrow_avx.custom_legalize(fcvt_from_uint, "expand_fcvt_from_uint_vector"); } diff --git a/cranelift/codegen/meta/src/isa/x86/mod.rs b/cranelift/codegen/meta/src/isa/x86/mod.rs index 2e9305e9f7..8d2e33be73 100644 --- a/cranelift/codegen/meta/src/isa/x86/mod.rs +++ b/cranelift/codegen/meta/src/isa/x86/mod.rs @@ -48,6 +48,7 @@ pub(crate) fn define(shared_defs: &mut SharedDefinitions) -> TargetIsa { x86_32.legalize_type(F32, x86_expand); x86_32.legalize_type(F64, x86_expand); x86_32.legalize_value_type(VectorType::new(I64.into(), 2), x86_narrow_avx); + x86_32.legalize_value_type(VectorType::new(F32.into(), 4), x86_narrow_avx); x86_64.legalize_monomorphic(expand_flags); x86_64.legalize_default(x86_narrow); @@ -60,6 +61,7 @@ pub(crate) fn define(shared_defs: &mut SharedDefinitions) -> TargetIsa { x86_64.legalize_type(F32, x86_expand); x86_64.legalize_type(F64, x86_expand); x86_64.legalize_value_type(VectorType::new(I64.into(), 2), x86_narrow_avx); + x86_64.legalize_value_type(VectorType::new(F32.into(), 4), x86_narrow_avx); let recipes = recipes::define(shared_defs, &settings, &regs); diff --git a/cranelift/codegen/src/isa/x86/enc_tables.rs b/cranelift/codegen/src/isa/x86/enc_tables.rs index a751ea3138..0786d37578 100644 --- a/cranelift/codegen/src/isa/x86/enc_tables.rs +++ b/cranelift/codegen/src/isa/x86/enc_tables.rs @@ -598,6 +598,9 @@ fn expand_minmax( /// x86 has no unsigned-to-float conversions. We handle the easy case of zero-extending i32 to /// i64 with a pattern, the rest needs more code. +/// +/// Note that this is the scalar implementation; for the vector implementation see +/// [expand_fcvt_from_uint_vector]. 
fn expand_fcvt_from_uint( inst: ir::Inst, func: &mut ir::Function, @@ -679,6 +682,56 @@ fn expand_fcvt_from_uint( cfg.recompute_block(pos.func, done); } +/// To convert packed unsigned integers to their float equivalents, we must legalize to a special +/// AVX512 instruction (using MXCSR rounding) or use a long sequence of instructions. This logic is +/// separate from [expand_fcvt_from_uint] above (the scalar version), only due to how the transform +/// groups are set up; TODO if we change the SIMD legalization groups, then this logic could be +/// merged into [expand_fcvt_from_uint] (see https://github.com/bytecodealliance/wasmtime/issues/1745). +fn expand_fcvt_from_uint_vector( + inst: ir::Inst, + func: &mut ir::Function, + _cfg: &mut ControlFlowGraph, + isa: &dyn TargetIsa, +) { + let mut pos = FuncCursor::new(func).at_inst(inst); + pos.use_srcloc(inst); + + if let ir::InstructionData::Unary { + opcode: ir::Opcode::FcvtFromUint, + arg, + } = pos.func.dfg[inst] + { + let controlling_type = pos.func.dfg.ctrl_typevar(inst); + if controlling_type == F32X4 { + debug_assert_eq!(pos.func.dfg.value_type(arg), I32X4); + let x86_isa = isa + .as_any() + .downcast_ref::<isa::x86::Isa>() + .expect("the target ISA must be x86 at this point"); + if x86_isa.isa_flags.use_avx512vl_simd() || x86_isa.isa_flags.use_avx512f_simd() { + // If we have certain AVX512 features, we can lower this instruction simply. 
+ pos.func.dfg.replace(inst).x86_vcvtudq2ps(arg); + } else { + // Otherwise, we default to a very lengthy SSE4.1-compatible sequence: PXOR, + // PBLENDW, PSUBD, CVTDQ2PS, PSRLD, CVTDQ2PS, ADDPS, ADDPS + let bitcast_arg = pos.ins().raw_bitcast(I16X8, arg); + let zero_constant = pos.func.dfg.constants.insert(vec![0; 16].into()); + let zero = pos.ins().vconst(I16X8, zero_constant); + let low = pos.ins().x86_pblendw(zero, bitcast_arg, 0x55); + let bitcast_low = pos.ins().raw_bitcast(I32X4, low); + let high = pos.ins().isub(arg, bitcast_low); + let convert_low = pos.ins().fcvt_from_sint(F32X4, bitcast_low); + let shift_high = pos.ins().ushr_imm(high, 1); + let convert_high = pos.ins().fcvt_from_sint(F32X4, shift_high); + let double_high = pos.ins().fadd(convert_high, convert_high); + pos.func.dfg.replace(inst).fadd(double_high, convert_low); + } + } else { + unimplemented!("cannot legalize {}", pos.func.dfg.display_inst(inst, None)) + } + } +} + fn expand_fcvt_to_sint( inst: ir::Inst, func: &mut ir::Function, diff --git a/cranelift/filetests/filetests/isa/x86/simd-avx512-conversion-legalize.clif b/cranelift/filetests/filetests/isa/x86/simd-avx512-conversion-legalize.clif new file mode 100644 index 0000000000..78dc1cf220 --- /dev/null +++ b/cranelift/filetests/filetests/isa/x86/simd-avx512-conversion-legalize.clif @@ -0,0 +1,10 @@ +test legalizer +set enable_simd +target x86_64 skylake has_avx512f=true + +function %fcvt_from_uint(i32x4) -> f32x4 { +block0(v0:i32x4): + v1 = fcvt_from_uint.f32x4 v0 + ; check: v1 = x86_vcvtudq2ps v0 + return v1 +} diff --git a/cranelift/filetests/filetests/isa/x86/simd-conversion-legalize.clif b/cranelift/filetests/filetests/isa/x86/simd-conversion-legalize.clif new file mode 100644 index 0000000000..7db52967e4 --- /dev/null +++ b/cranelift/filetests/filetests/isa/x86/simd-conversion-legalize.clif @@ -0,0 +1,19 @@ +test legalizer +set enable_simd +target x86_64 skylake + +function %fcvt_from_uint(i32x4) -> f32x4 { +block0(v0:i32x4): + v1 = 
fcvt_from_uint.f32x4 v0 + ; check: v2 = raw_bitcast.i16x8 v0 + ; nextln: v3 = vconst.i16x8 const0 + ; nextln: v4 = x86_pblendw v3, v2, 85 + ; nextln: v5 = raw_bitcast.i32x4 v4 + ; nextln: v6 = isub v0, v5 + ; nextln: v7 = fcvt_from_sint.f32x4 v5 + ; nextln: v8 = ushr_imm v6, 1 + ; nextln: v9 = fcvt_from_sint.f32x4 v8 + ; nextln: v10 = fadd v9, v9 + ; nextln: v1 = fadd v10, v7 + return v1 +} diff --git a/cranelift/filetests/filetests/isa/x86/simd-conversion-run.clif b/cranelift/filetests/filetests/isa/x86/simd-conversion-run.clif index 3484818aa3..2a97474adc 100644 --- a/cranelift/filetests/filetests/isa/x86/simd-conversion-run.clif +++ b/cranelift/filetests/filetests/isa/x86/simd-conversion-run.clif @@ -13,3 +13,10 @@ block0: return v4 } ; run + +function %fcvt_from_uint(i32x4) -> f32x4 { +block0(v0:i32x4): + v1 = fcvt_from_uint.f32x4 v0 + return v1 +} +; run: %fcvt_from_uint([0 0 0 0]) == [0x0.0 0x0.0 0x0.0 0x0.0]