From 54b45d28a33ae85867a1b21ad702c09c0674946a Mon Sep 17 00:00:00 2001 From: Andrew Brown Date: Wed, 19 May 2021 12:20:11 -0700 Subject: [PATCH] x64: lower fcvt_from_uint to VCVTUDQ2PS when possible When AVX512VL and AVX512F are available, use a single instruction (`VCVTUDQ2PS`) instead of a length 9-instruction sequence. This optimization is a port from the legacy x86 backend. --- cranelift/codegen/src/isa/x64/inst/args.rs | 5 + cranelift/codegen/src/isa/x64/inst/emit.rs | 11 +- .../codegen/src/isa/x64/inst/emit_tests.rs | 6 + cranelift/codegen/src/isa/x64/lower.rs | 112 ++++++++++-------- .../isa/x64/simd-conversion-run.clif | 22 ++-- 5 files changed, 93 insertions(+), 63 deletions(-) diff --git a/cranelift/codegen/src/isa/x64/inst/args.rs b/cranelift/codegen/src/isa/x64/inst/args.rs index 77b173b42d..005df224dc 100644 --- a/cranelift/codegen/src/isa/x64/inst/args.rs +++ b/cranelift/codegen/src/isa/x64/inst/args.rs @@ -1000,6 +1000,7 @@ impl fmt::Display for SseOpcode { #[derive(Clone)] pub enum Avx512Opcode { + Vcvtudq2ps, Vpabsq, Vpmullq, } @@ -1008,6 +1009,9 @@ impl Avx512Opcode { /// Which `InstructionSet`s support the opcode? pub(crate) fn available_from(&self) -> SmallVec<[InstructionSet; 2]> { match self { + Avx512Opcode::Vcvtudq2ps => { + smallvec![InstructionSet::AVX512F, InstructionSet::AVX512VL] + } Avx512Opcode::Vpabsq => smallvec![InstructionSet::AVX512F, InstructionSet::AVX512VL], Avx512Opcode::Vpmullq => smallvec![InstructionSet::AVX512VL, InstructionSet::AVX512DQ], } @@ -1017,6 +1021,7 @@ impl Avx512Opcode { impl fmt::Debug for Avx512Opcode { fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { let name = match self { + Avx512Opcode::Vcvtudq2ps => "vcvtudq2ps", Avx512Opcode::Vpabsq => "vpabsq", Avx512Opcode::Vpmullq => "vpmullq", }; diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs index f57212a3be..5e94395797 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit.rs @@ -1408,16 +1408,17 @@ pub(crate) fn emit( } Inst::XmmUnaryRmREvex { op, src, dst } => { - let opcode = match op { - Avx512Opcode::Vpabsq => 0x1f, + let (prefix, map, w, opcode) = match op { + Avx512Opcode::Vpabsq => (LegacyPrefixes::_66, OpcodeMap::_0F38, true, 0x1f), + Avx512Opcode::Vcvtudq2ps => (LegacyPrefixes::_F2, OpcodeMap::_0F, false, 0x7a), _ => unimplemented!("Opcode {:?} not implemented", op), }; match src { RegMem::Reg { reg: src } => EvexInstruction::new() .length(EvexVectorLength::V128) - .prefix(LegacyPrefixes::_66) - .map(OpcodeMap::_0F38) - .w(true) + .prefix(prefix) + .map(map) + .w(w) .opcode(opcode) .reg(dst.to_reg().get_hw_encoding()) .rm(src.get_hw_encoding()) diff --git a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs index a288327fd5..efe62b4cea 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs @@ -3889,6 +3889,12 @@ fn test_x64_emit() { "vpabsq %xmm2, %xmm8", )); + insns.push(( + Inst::xmm_unary_rm_r_evex(Avx512Opcode::Vcvtudq2ps, RegMem::reg(xmm2), w_xmm8), + "62717F087AC2", + "vcvtudq2ps %xmm2, %xmm8", + )); + // Xmm to int conversions, and conversely. insns.push(( diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs index a7bd6ad372..2aaa8f69b5 100644 --- a/cranelift/codegen/src/isa/x64/lower.rs +++ b/cranelift/codegen/src/isa/x64/lower.rs @@ -4069,64 +4069,78 @@ fn lower_insn_to_regs>( _ => panic!("unexpected input type for FcvtFromUint: {:?}", input_ty), }; } else { - // Converting packed unsigned integers to packed floats requires a few steps. - // There is no single instruction lowering for converting unsigned floats but there - // is for converting packed signed integers to float (cvtdq2ps). In the steps below - // we isolate the upper half (16 bits) and lower half (16 bits) of each lane and - // then we convert each half separately using cvtdq2ps meant for signed integers. - // In order for this to work for the upper half bits we must shift right by 1 - // (divide by 2) these bits in order to ensure the most significant bit is 0 not - // signed, and then after the conversion we double the value. Finally we add the - // converted values where addition will correctly round. - // - // Sequence: - // -> A = 0xffffffff - // -> Ah = 0xffff0000 - // -> Al = 0x0000ffff - // -> Convert(Al) // Convert int to float - // -> Ah = Ah >> 1 // Shift right 1 to assure Ah conversion isn't treated as signed - // -> Convert(Ah) // Convert .. with no loss of significant digits from previous shift - // -> Ah = Ah + Ah // Double Ah to account for shift right before the conversion. - // -> dst = Ah + Al // Add the two floats together - assert_eq!(ctx.input_ty(insn, 0), types::I32X4); let src = put_input_in_reg(ctx, inputs[0]); let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); - // Create a temporary register - let tmp = ctx.alloc_tmp(types::I32X4).only_reg().unwrap(); - ctx.emit(Inst::xmm_unary_rm_r( - SseOpcode::Movapd, - RegMem::reg(src), - tmp, - )); - ctx.emit(Inst::gen_move(dst, src, ty)); + if isa_flags.use_avx512f_simd() || isa_flags.use_avx512vl_simd() { + // When either AVX512VL or AVX512F are available, + // `fcvt_from_uint` can be lowered to a single instruction. + ctx.emit(Inst::xmm_unary_rm_r_evex( + Avx512Opcode::Vcvtudq2ps, + RegMem::reg(src), + dst, + )); + } else { + // Converting packed unsigned integers to packed floats + // requires a few steps. There is no single instruction + // lowering for converting unsigned floats but there is for + // converting packed signed integers to float (cvtdq2ps). In + // the steps below we isolate the upper half (16 bits) and + // lower half (16 bits) of each lane and then we convert + // each half separately using cvtdq2ps meant for signed + // integers. In order for this to work for the upper half + // bits we must shift right by 1 (divide by 2) these bits in + // order to ensure the most significant bit is 0 not signed, + // and then after the conversion we double the value. + // Finally we add the converted values where addition will + // correctly round. + // + // Sequence: + // -> A = 0xffffffff + // -> Ah = 0xffff0000 + // -> Al = 0x0000ffff + // -> Convert(Al) // Convert int to float + // -> Ah = Ah >> 1 // Shift right 1 to assure Ah conversion isn't treated as signed + // -> Convert(Ah) // Convert .. with no loss of significant digits from previous shift + // -> Ah = Ah + Ah // Double Ah to account for shift right before the conversion. + // -> dst = Ah + Al // Add the two floats together - // Get the low 16 bits - ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Pslld, RegMemImm::imm(16), tmp)); - ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Psrld, RegMemImm::imm(16), tmp)); + // Create a temporary register + let tmp = ctx.alloc_tmp(types::I32X4).only_reg().unwrap(); + ctx.emit(Inst::xmm_unary_rm_r( + SseOpcode::Movapd, + RegMem::reg(src), + tmp, + )); + ctx.emit(Inst::gen_move(dst, src, ty)); - // Get the high 16 bits - ctx.emit(Inst::xmm_rm_r(SseOpcode::Psubd, RegMem::from(tmp), dst)); + // Get the low 16 bits + ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Pslld, RegMemImm::imm(16), tmp)); + ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Psrld, RegMemImm::imm(16), tmp)); - // Convert the low 16 bits - ctx.emit(Inst::xmm_rm_r(SseOpcode::Cvtdq2ps, RegMem::from(tmp), tmp)); + // Get the high 16 bits + ctx.emit(Inst::xmm_rm_r(SseOpcode::Psubd, RegMem::from(tmp), dst)); - // Shift the high bits by 1, convert, and double to get the correct value. - ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Psrld, RegMemImm::imm(1), dst)); - ctx.emit(Inst::xmm_rm_r(SseOpcode::Cvtdq2ps, RegMem::from(dst), dst)); - ctx.emit(Inst::xmm_rm_r( - SseOpcode::Addps, - RegMem::reg(dst.to_reg()), - dst, - )); + // Convert the low 16 bits + ctx.emit(Inst::xmm_rm_r(SseOpcode::Cvtdq2ps, RegMem::from(tmp), tmp)); - // Add together the two converted values. - ctx.emit(Inst::xmm_rm_r( - SseOpcode::Addps, - RegMem::reg(tmp.to_reg()), - dst, - )); + // Shift the high bits by 1, convert, and double to get the correct value. + ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Psrld, RegMemImm::imm(1), dst)); + ctx.emit(Inst::xmm_rm_r(SseOpcode::Cvtdq2ps, RegMem::from(dst), dst)); + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Addps, + RegMem::reg(dst.to_reg()), + dst, + )); + + // Add together the two converted values. + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Addps, + RegMem::reg(tmp.to_reg()), + dst, + )); + } } } diff --git a/cranelift/filetests/filetests/isa/x64/simd-conversion-run.clif b/cranelift/filetests/filetests/isa/x64/simd-conversion-run.clif index d6f353eee2..6d40ad9f37 100644 --- a/cranelift/filetests/filetests/isa/x64/simd-conversion-run.clif +++ b/cranelift/filetests/filetests/isa/x64/simd-conversion-run.clif @@ -2,17 +2,21 @@ test run set enable_simd target x86_64 machinst -function %fcvt_from_sint() -> b1 { -block0: - v0 = vconst.i32x4 [-1 0 1 123456789] +function %fcvt_from_sint(i32x4) -> f32x4 { +block0(v0: i32x4): v1 = fcvt_from_sint.f32x4 v0 - - v2 = vconst.f32x4 [-0x1.0 0.0 0x1.0 0x75bcd18.0] ; 123456789 rounds to 123456792.0, an error of 3 - v3 = fcmp eq v1, v2 - v4 = vall_true v3 - return v4 + return v1 } -; run +; run: %fcvt_from_sint([-1 0 1 123456789]) == [-0x1.0 0.0 0x1.0 0x75bcd18.0] +; Note that 123456789 rounds to 123456792.0, an error of 3 + +function %fcvt_from_uint(i32x4) -> f32x4 { +block0(v0: i32x4): + v1 = fcvt_from_uint.f32x4 v0 + return v1 +} +; run: %fcvt_from_uint([0xFFFFFFFF 0 1 123456789]) == [0x100000000.0 0.0 0x1.0 0x75bcd18.0] +; Note that 0xFFFFFFFF is decimal 4,294,967,295 and is rounded up 1 to 4,294,967,296 in f32x4. function %fcvt_to_sint_sat(f32x4) -> i32x4 { block0(v0:f32x4):