From 508f8fa5a921240ff2d25fd2eb859366c87834a3 Mon Sep 17 00:00:00 2001 From: Andrew Brown Date: Tue, 2 Mar 2021 09:54:19 -0800 Subject: [PATCH] [x64] Add i64x2.abs This instruction has a single instruction lowering in AVX512F/VL and a three instruction lowering in AVX but neither is currently supported in the x64 backend. To implement this, we instead subtract the vector from 0 and use a blending instruction to pick the lanes containing the absolute value. --- build.rs | 5 +++-- cranelift/codegen/src/isa/x64/inst/args.rs | 5 ++++- cranelift/codegen/src/isa/x64/inst/emit.rs | 1 + .../codegen/src/isa/x64/inst/emit_tests.rs | 6 +++++ cranelift/codegen/src/isa/x64/lower.rs | 22 ++++++++++++++++++- cranelift/wasm/src/code_translator.rs | 3 +-- 6 files changed, 36 insertions(+), 6 deletions(-) diff --git a/build.rs b/build.rs index 5d7d836d83..1c55f0ef58 100644 --- a/build.rs +++ b/build.rs @@ -183,7 +183,6 @@ fn experimental_x64_should_panic(testsuite: &str, testname: &str, strategy: &str match (testsuite, testname) { ("simd", "simd_i8x16_arith2") => return true, // Unsupported feature: proposed simd operator I8x16Popcnt - ("simd", "simd_i64x2_arith2") => return true, // Unsupported feature: proposed simd operator I64x2Abs ("simd", "simd_conversions") => return true, // unknown operator or unexpected token: tests/spec_testsuite/proposals/simd/simd_conversions.wast:724:6 ("simd", "simd_i16x8_extadd_pairwise_i8x16") => return true, ("simd", "simd_i16x8_extmul_i8x16") => return true, @@ -231,7 +230,6 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool { // These are new instructions that are not really implemented in any backend. 
("simd", "simd_i8x16_arith2") - | ("simd", "simd_i64x2_arith2") | ("simd", "simd_conversions") | ("simd", "simd_i16x8_extadd_pairwise_i8x16") | ("simd", "simd_i16x8_extmul_i8x16") @@ -250,6 +248,9 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool { | ("simd", "simd_store64_lane") | ("simd", "simd_store8_lane") => return true, + // These are only implemented on x64. + ("simd", "simd_i64x2_arith2") => return !cfg!(feature = "experimental_x64"), + // These are only implemented on aarch64 and x64. ("simd", "simd_i64x2_cmp") | ("simd", "simd_f32x4_pmin_pmax") diff --git a/cranelift/codegen/src/isa/x64/inst/args.rs b/cranelift/codegen/src/isa/x64/inst/args.rs index 930839459b..f68c50449f 100644 --- a/cranelift/codegen/src/isa/x64/inst/args.rs +++ b/cranelift/codegen/src/isa/x64/inst/args.rs @@ -470,6 +470,7 @@ pub enum SseOpcode { Andpd, Andnps, Andnpd, + Blendvpd, Comiss, Comisd, Cmpps, @@ -758,7 +759,8 @@ impl SseOpcode { | SseOpcode::Palignr | SseOpcode::Pshufb => SSSE3, - SseOpcode::Insertps + SseOpcode::Blendvpd + | SseOpcode::Insertps | SseOpcode::Packusdw | SseOpcode::Pcmpeqq | SseOpcode::Pextrb @@ -816,6 +818,7 @@ impl fmt::Debug for SseOpcode { SseOpcode::Andps => "andps", SseOpcode::Andnps => "andnps", SseOpcode::Andnpd => "andnpd", + SseOpcode::Blendvpd => "blendvpd", SseOpcode::Cmpps => "cmpps", SseOpcode::Cmppd => "cmppd", SseOpcode::Cmpss => "cmpss", diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs index eeb81d19df..7ef5304635 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit.rs @@ -1845,6 +1845,7 @@ pub(crate) fn emit( SseOpcode::Andpd => (LegacyPrefixes::_66, 0x0F54, 2), SseOpcode::Andnps => (LegacyPrefixes::None, 0x0F55, 2), SseOpcode::Andnpd => (LegacyPrefixes::_66, 0x0F55, 2), + SseOpcode::Blendvpd => (LegacyPrefixes::_66, 0x0F3815, 3), SseOpcode::Cvttps2dq => (LegacyPrefixes::_F3, 0x0F5B, 2), SseOpcode::Cvtdq2ps => 
(LegacyPrefixes::None, 0x0F5B, 2), SseOpcode::Divps => (LegacyPrefixes::None, 0x0F5E, 2), diff --git a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs index f5d7535da4..e407910c2d 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs @@ -3426,6 +3426,12 @@ fn test_x64_emit() { "orps %xmm5, %xmm4", )); + insns.push(( + Inst::xmm_rm_r(SseOpcode::Blendvpd, RegMem::reg(xmm15), w_xmm4), + "66410F3815E7", + "blendvpd %xmm15, %xmm4", + )); + // ======================================================== // XMM_RM_R: Integer Packed diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs index 827edc69ce..190462caaf 100644 --- a/cranelift/codegen/src/isa/x64/lower.rs +++ b/cranelift/codegen/src/isa/x64/lower.rs @@ -1853,7 +1853,27 @@ fn lower_insn_to_regs>( let src = input_to_reg_mem(ctx, inputs[0]); let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let ty = ty.unwrap(); - if ty.is_vector() { + if ty == types::I64X2 { + // This lowering could be a single instruction with AVX512F/VL's VPABSQ instruction. + // Instead, we use a separate register, `tmp`, to contain the results of `0 - src` + // and then use `BLENDVPD` to replace those results with the original `src` lanes + // wherever the MSB of `tmp` is 1 (i.e. the negated lane is negative, so `src` already holds the absolute value). + + // Emit all 0s into the `tmp` register. + let tmp = ctx.alloc_tmp(ty).only_reg().unwrap(); + ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), tmp)); + // Subtract the lanes from 0 and set up `dst`. + ctx.emit(Inst::xmm_rm_r(SseOpcode::Psubq, src.clone(), tmp)); + ctx.emit(Inst::gen_move(dst, tmp.to_reg(), ty)); + // Choose the original `src` lanes when `tmp` has an MSB of 1. BLENDVPD's semantics + // require the "choice" mask to be in XMM0. 
+ ctx.emit(Inst::gen_move( + Writable::from_reg(regs::xmm0()), + tmp.to_reg(), + ty, + )); + ctx.emit(Inst::xmm_rm_r(SseOpcode::Blendvpd, src, dst)); + } else if ty.is_vector() { let opcode = match ty { types::I8X16 => SseOpcode::Pabsb, types::I16X8 => SseOpcode::Pabsw, diff --git a/cranelift/wasm/src/code_translator.rs b/cranelift/wasm/src/code_translator.rs index adeb7bda2d..ba933d7b53 100644 --- a/cranelift/wasm/src/code_translator.rs +++ b/cranelift/wasm/src/code_translator.rs @@ -1564,7 +1564,7 @@ pub fn translate_operator( let a = pop1_with_bitcast(state, type_of(op), builder); state.push1(builder.ins().ineg(a)) } - Operator::I8x16Abs | Operator::I16x8Abs | Operator::I32x4Abs => { + Operator::I8x16Abs | Operator::I16x8Abs | Operator::I32x4Abs | Operator::I64x2Abs => { let a = pop1_with_bitcast(state, type_of(op), builder); state.push1(builder.ins().iabs(a)) } @@ -1852,7 +1852,6 @@ pub fn translate_operator( | Operator::I64x2ExtMulHighI32x4S | Operator::I64x2ExtMulLowI32x4U | Operator::I64x2ExtMulHighI32x4U - | Operator::I64x2Abs | Operator::I64x2AllTrue | Operator::I16x8ExtAddPairwiseI8x16S | Operator::I16x8ExtAddPairwiseI8x16U