diff --git a/build.rs b/build.rs index 49f9f81db7..9864d44570 100644 --- a/build.rs +++ b/build.rs @@ -182,6 +182,7 @@ fn experimental_x64_should_panic(testsuite: &str, testname: &str, strategy: &str match (testsuite, testname) { ("simd", "simd_address") => return false, ("simd", "simd_bitwise") => return false, + ("simd", "simd_boolean") => return false, ("simd", "simd_const") => return false, ("simd", "simd_i8x16_arith") => return false, ("simd", "simd_i8x16_arith2") => return false, @@ -229,9 +230,14 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool { return env::var("CARGO_CFG_TARGET_ARCH").unwrap() != "x86_64"; } + // These are only implemented on aarch64 and x64. + ("simd", "simd_boolean") => { + return !(cfg!(feature = "experimental_x64") + || env::var("CARGO_CFG_TARGET_ARCH").unwrap() == "aarch64") + } + // These are only implemented on aarch64. - ("simd", "simd_boolean") - | ("simd", "simd_f32x4_pmin_pmax") + ("simd", "simd_f32x4_pmin_pmax") | ("simd", "simd_f32x4_rounding") | ("simd", "simd_f64x2_pmin_pmax") | ("simd", "simd_f64x2_rounding") => { diff --git a/cranelift/codegen/src/isa/x64/inst/args.rs b/cranelift/codegen/src/isa/x64/inst/args.rs index 48bd822d4d..708e52f36b 100644 --- a/cranelift/codegen/src/isa/x64/inst/args.rs +++ b/cranelift/codegen/src/isa/x64/inst/args.rs @@ -393,6 +393,8 @@ pub enum SseOpcode { Movdqa, Movdqu, Movlhps, + Movmskps, + Movmskpd, Movq, Movss, Movsd, @@ -407,6 +409,7 @@ pub enum SseOpcode { Pabsb, Pabsw, Pabsd, + Packsswb, Paddb, Paddd, Paddq, @@ -445,6 +448,7 @@ pub enum SseOpcode { Pminub, Pminuw, Pminud, + Pmovmskb, Pmulld, Pmullw, Pmuludq, @@ -510,6 +514,7 @@ impl SseOpcode { | SseOpcode::Minss | SseOpcode::Movaps | SseOpcode::Movlhps + | SseOpcode::Movmskps | SseOpcode::Movss | SseOpcode::Movups | SseOpcode::Mulps @@ -546,6 +551,7 @@ impl SseOpcode { | SseOpcode::Minsd | SseOpcode::Movapd | SseOpcode::Movd + | SseOpcode::Movmskpd | SseOpcode::Movq | SseOpcode::Movsd | SseOpcode::Movupd @@ -554,6 +560,7 @@ impl SseOpcode { | SseOpcode::Mulpd | SseOpcode::Mulsd | SseOpcode::Orpd + | SseOpcode::Packsswb | SseOpcode::Paddb | SseOpcode::Paddd | SseOpcode::Paddq @@ -578,6 +585,7 @@ impl SseOpcode { | SseOpcode::Pmaxub | SseOpcode::Pminsw | SseOpcode::Pminub + | SseOpcode::Pmovmskb | SseOpcode::Pmullw | SseOpcode::Pmuludq | SseOpcode::Por @@ -686,6 +694,8 @@ impl fmt::Debug for SseOpcode { SseOpcode::Movdqa => "movdqa", SseOpcode::Movdqu => "movdqu", SseOpcode::Movlhps => "movlhps", + SseOpcode::Movmskps => "movmskps", + SseOpcode::Movmskpd => "movmskpd", SseOpcode::Movq => "movq", SseOpcode::Movss => "movss", SseOpcode::Movsd => "movsd", @@ -700,6 +710,7 @@ impl fmt::Debug for SseOpcode { SseOpcode::Pabsb => "pabsb", SseOpcode::Pabsw => "pabsw", SseOpcode::Pabsd => "pabsd", + SseOpcode::Packsswb => "packsswb", SseOpcode::Paddb => "paddb", SseOpcode::Paddd => "paddd", SseOpcode::Paddq => "paddq", @@ -738,6 +749,7 @@ impl fmt::Debug for SseOpcode { SseOpcode::Pminub => "pminub", SseOpcode::Pminuw => "pminuw", SseOpcode::Pminud => "pminud", + SseOpcode::Pmovmskb => "pmovmskb", SseOpcode::Pmulld => "pmulld", SseOpcode::Pmullw => "pmullw", SseOpcode::Pmuludq => "pmuludq", diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs index 2ad4c4d723..eeb1d3dacb 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit.rs @@ -1762,6 +1762,7 @@ pub(crate) fn emit( SseOpcode::Mulsd => (LegacyPrefixes::_F2, 0x0F59, 2), SseOpcode::Orpd => (LegacyPrefixes::_66, 0x0F56, 2), SseOpcode::Orps => (LegacyPrefixes::None, 0x0F56, 2), + SseOpcode::Packsswb => (LegacyPrefixes::_66, 0x0F63, 2), SseOpcode::Paddb => (LegacyPrefixes::_66, 0x0FFC, 2), SseOpcode::Paddd => (LegacyPrefixes::_66, 0x0FFE, 2), SseOpcode::Paddq => (LegacyPrefixes::_66, 0x0FD4, 2), @@ -2040,11 +2041,14 @@ pub(crate) fn emit( dst_size, } => { let (prefix, opcode, dst_first) = match op { + SseOpcode::Cvttss2si => (LegacyPrefixes::_F3, 0x0F2C, true), + SseOpcode::Cvttsd2si => (LegacyPrefixes::_F2, 0x0F2C, true), // Movd and movq use the same opcode; the presence of the REX prefix (set below) // actually determines which is used. SseOpcode::Movd | SseOpcode::Movq => (LegacyPrefixes::_66, 0x0F7E, false), - SseOpcode::Cvttss2si => (LegacyPrefixes::_F3, 0x0F2C, true), - SseOpcode::Cvttsd2si => (LegacyPrefixes::_F2, 0x0F2C, true), + SseOpcode::Movmskps => (LegacyPrefixes::None, 0x0F50, true), + SseOpcode::Movmskpd => (LegacyPrefixes::_66, 0x0F50, true), + SseOpcode::Pmovmskb => (LegacyPrefixes::_66, 0x0FD7, true), _ => panic!("unexpected opcode {:?}", op), }; let rex = match dst_size { diff --git a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs index 728bc09c97..0e6ad1d118 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs @@ -3292,6 +3292,12 @@ fn test_x64_emit() { "pshufb %xmm11, %xmm2", )); + insns.push(( + Inst::xmm_rm_r(SseOpcode::Packsswb, RegMem::reg(xmm11), w_xmm2, None), + "66410F63D3", + "packsswb %xmm11, %xmm2", + )); + // ======================================================== // XMM_RM_R: Integer Conversion insns.push(( @@ -3422,6 +3428,22 @@ fn test_x64_emit() { "cvttsd2si %xmm0, %r15", )); + insns.push(( + Inst::xmm_to_gpr(SseOpcode::Pmovmskb, xmm10, w_rax, OperandSize::Size32), + "66410FD7C2", + "pmovmskb %xmm10, %eax", + )); + insns.push(( + Inst::xmm_to_gpr(SseOpcode::Movmskps, xmm2, w_rax, OperandSize::Size32), + "0F50C2", + "movmskps %xmm2, %eax", + )); + insns.push(( + Inst::xmm_to_gpr(SseOpcode::Movmskpd, xmm0, w_rcx, OperandSize::Size32), + "660F50C8", + "movmskpd %xmm0, %ecx", + )); + insns.push(( Inst::gpr_to_xmm( SseOpcode::Movd, diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs index 8e887b0e90..f63a157d8c 100644 --- a/cranelift/codegen/src/isa/x64/lower.rs +++ b/cranelift/codegen/src/isa/x64/lower.rs @@ -3657,6 +3657,58 @@ fn lower_insn_to_regs>( ctx.emit(Inst::setcc(CC::Z, dst)); } + Opcode::VhighBits => { + let src = put_input_in_reg(ctx, inputs[0]); + let src_ty = ctx.input_ty(insn, 0); + debug_assert!(src_ty.is_vector() && src_ty.bits() == 128); + let dst = get_output_reg(ctx, outputs[0]); + debug_assert!(dst.to_reg().get_class() == RegClass::I64); + + // The Intel specification allows using both 32-bit and 64-bit GPRs as destination for + // the "move mask" instructions. This is controlled by the REX.R bit: "In 64-bit mode, + // the instruction can access additional registers when used with a REX.R prefix. The + // default operand size is 64-bit in 64-bit mode" (PMOVMSKB in IA Software Development + // Manual, vol. 2). This being the case, we will always clear REX.W since its use is + // unnecessary (`OperandSize` is used for setting/clearing REX.W). + let size = OperandSize::Size32; + + match src_ty { + types::I8X16 | types::B8X16 => { + ctx.emit(Inst::xmm_to_gpr(SseOpcode::Pmovmskb, src, dst, size)) + } + types::I32X4 | types::B32X4 | types::F32X4 => { + ctx.emit(Inst::xmm_to_gpr(SseOpcode::Movmskps, src, dst, size)) + } + types::I64X2 | types::B64X2 | types::F64X2 => { + ctx.emit(Inst::xmm_to_gpr(SseOpcode::Movmskpd, src, dst, size)) + } + types::I16X8 | types::B16X8 => { + // There is no x86 instruction for extracting the high bit of 16-bit lanes so + // here we: + // - duplicate the 16-bit lanes of `src` into 8-bit lanes: + // PACKSSWB([x1, x2, ...], [x1, x2, ...]) = [x1', x2', ..., x1', x2', ...] + // - use PMOVMSKB to gather the high bits; now we have duplicates, though + // - shift away the bottom 8 high bits to remove the duplicates. + let tmp = ctx.alloc_tmp(RegClass::V128, src_ty); + ctx.emit(Inst::gen_move(tmp, src, src_ty)); + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Packsswb, + RegMem::reg(src), + tmp, + None, + )); + ctx.emit(Inst::xmm_to_gpr( + SseOpcode::Pmovmskb, + tmp.to_reg(), + dst, + size, + )); + ctx.emit(Inst::shift_r(8, ShiftKind::ShiftRightLogical, Some(8), dst)); + } + _ => unimplemented!("unknown input type {} for {}", src_ty, op), + } + } + Opcode::IaddImm | Opcode::ImulImm | Opcode::UdivImm