diff --git a/cranelift/codegen/src/isa/x64/inst/args.rs b/cranelift/codegen/src/isa/x64/inst/args.rs index 5372d390cd..16acafc940 100644 --- a/cranelift/codegen/src/isa/x64/inst/args.rs +++ b/cranelift/codegen/src/isa/x64/inst/args.rs @@ -344,6 +344,7 @@ pub(crate) enum InstructionSet { SSE2, SSSE3, SSE41, + SSE42, } /// Some SSE operations requiring 2 operands r/m and r. @@ -414,6 +415,14 @@ pub enum SseOpcode { Paddusw, Pavgb, Pavgw, + Pcmpeqb, + Pcmpeqw, + Pcmpeqd, + Pcmpeqq, + Pcmpgtb, + Pcmpgtw, + Pcmpgtd, + Pcmpgtq, Pextrb, Pextrw, Pextrd, @@ -543,6 +552,12 @@ impl SseOpcode { | SseOpcode::Paddusw | SseOpcode::Pavgb | SseOpcode::Pavgw + | SseOpcode::Pcmpeqb + | SseOpcode::Pcmpeqw + | SseOpcode::Pcmpeqd + | SseOpcode::Pcmpgtb + | SseOpcode::Pcmpgtw + | SseOpcode::Pcmpgtd | SseOpcode::Pextrw | SseOpcode::Pinsrw | SseOpcode::Pmaxsw @@ -575,6 +590,7 @@ impl SseOpcode { SseOpcode::Pabsb | SseOpcode::Pabsw | SseOpcode::Pabsd | SseOpcode::Pshufb => SSSE3, SseOpcode::Insertps + | SseOpcode::Pcmpeqq | SseOpcode::Pextrb | SseOpcode::Pextrd | SseOpcode::Pinsrb @@ -590,6 +606,8 @@ impl SseOpcode { | SseOpcode::Pmulld | SseOpcode::Roundss | SseOpcode::Roundsd => SSE41, + + SseOpcode::Pcmpgtq => SSE42, } } @@ -670,6 +688,14 @@ impl fmt::Debug for SseOpcode { SseOpcode::Paddusw => "paddusw", SseOpcode::Pavgb => "pavgb", SseOpcode::Pavgw => "pavgw", + SseOpcode::Pcmpeqb => "pcmpeqb", + SseOpcode::Pcmpeqw => "pcmpeqw", + SseOpcode::Pcmpeqd => "pcmpeqd", + SseOpcode::Pcmpeqq => "pcmpeqq", + SseOpcode::Pcmpgtb => "pcmpgtb", + SseOpcode::Pcmpgtw => "pcmpgtw", + SseOpcode::Pcmpgtd => "pcmpgtd", + SseOpcode::Pcmpgtq => "pcmpgtq", SseOpcode::Pextrb => "pextrb", SseOpcode::Pextrw => "pextrw", SseOpcode::Pextrd => "pextrd", diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs index 2a472a4439..b4524eb5d3 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit.rs @@ -1786,6 +1786,14 @@ pub(crate) fn emit( SseOpcode::Paddusw => (LegacyPrefixes::_66, 0x0FDD, 2), SseOpcode::Pavgb => (LegacyPrefixes::_66, 0x0FE0, 2), SseOpcode::Pavgw => (LegacyPrefixes::_66, 0x0FE3, 2), + SseOpcode::Pcmpeqb => (LegacyPrefixes::_66, 0x0F74, 2), + SseOpcode::Pcmpeqw => (LegacyPrefixes::_66, 0x0F75, 2), + SseOpcode::Pcmpeqd => (LegacyPrefixes::_66, 0x0F76, 2), + SseOpcode::Pcmpeqq => (LegacyPrefixes::_66, 0x0F3829, 3), + SseOpcode::Pcmpgtb => (LegacyPrefixes::_66, 0x0F64, 2), + SseOpcode::Pcmpgtw => (LegacyPrefixes::_66, 0x0F65, 2), + SseOpcode::Pcmpgtd => (LegacyPrefixes::_66, 0x0F66, 2), + SseOpcode::Pcmpgtq => (LegacyPrefixes::_66, 0x0F3837, 3), SseOpcode::Pmaxsb => (LegacyPrefixes::_66, 0x0F383C, 3), SseOpcode::Pmaxsw => (LegacyPrefixes::_66, 0x0FEE, 2), SseOpcode::Pmaxsd => (LegacyPrefixes::_66, 0x0F383D, 3), diff --git a/cranelift/codegen/src/isa/x64/inst/mod.rs b/cranelift/codegen/src/isa/x64/inst/mod.rs index 577a2ce73b..929afa2938 100644 --- a/cranelift/codegen/src/isa/x64/inst/mod.rs +++ b/cranelift/codegen/src/isa/x64/inst/mod.rs @@ -1125,7 +1125,11 @@ impl Inst { src.to_reg() == Some(dst.to_reg()) && (*op == SseOpcode::Xorps || *op == SseOpcode::Xorpd - || *op == SseOpcode::Pxor) + || *op == SseOpcode::Pxor + || *op == SseOpcode::Pcmpeqb + || *op == SseOpcode::Pcmpeqw + || *op == SseOpcode::Pcmpeqd + || *op == SseOpcode::Pcmpeqq) } Self::XmmRmRImm { diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs index 4e753a8bea..59b81327e5 100644 --- a/cranelift/codegen/src/isa/x64/lower.rs +++ b/cranelift/codegen/src/isa/x64/lower.rs @@ -3,8 +3,8 @@ #![allow(non_snake_case)] use crate::ir::{ - condcodes::FloatCC, types, AbiParam, ArgumentPurpose, ExternalName, Inst as IRInst, - InstructionData, LibCall, Opcode, Signature, Type, + condcodes::FloatCC, condcodes::IntCC, types, AbiParam, ArgumentPurpose, ExternalName, + Inst as IRInst, InstructionData, LibCall, Opcode, Signature, Type, }; use crate::isa::x64::abi::*; use crate::isa::x64::inst::args::*; @@ -1297,12 +1297,118 @@ fn lower_insn_to_regs>( } Opcode::Icmp => { - emit_cmp(ctx, insn); - let condcode = ctx.data(insn).cond_code().unwrap(); - let cc = CC::from_intcc(condcode); let dst = get_output_reg(ctx, outputs[0]); - ctx.emit(Inst::setcc(cc, dst)); + let ty = ctx.input_ty(insn, 0); + if !ty.is_vector() { + emit_cmp(ctx, insn); + let cc = CC::from_intcc(condcode); + ctx.emit(Inst::setcc(cc, dst)); + } else { + assert_eq!(ty.bits(), 128); + let eq = |ty| match ty { + types::I8X16 => SseOpcode::Pcmpeqb, + types::I16X8 => SseOpcode::Pcmpeqw, + types::I32X4 => SseOpcode::Pcmpeqd, + types::I64X2 => SseOpcode::Pcmpeqq, + _ => panic!( + "Unable to find an instruction for {} for type: {}", + condcode, ty + ), + }; + let gt = |ty| match ty { + types::I8X16 => SseOpcode::Pcmpgtb, + types::I16X8 => SseOpcode::Pcmpgtw, + types::I32X4 => SseOpcode::Pcmpgtd, + types::I64X2 => SseOpcode::Pcmpgtq, + _ => panic!( + "Unable to find an instruction for {} for type: {}", + condcode, ty + ), + }; + let maxu = |ty| match ty { + types::I8X16 => SseOpcode::Pmaxub, + types::I16X8 => SseOpcode::Pmaxuw, + types::I32X4 => SseOpcode::Pmaxud, + _ => panic!( + "Unable to find an instruction for {} for type: {}", + condcode, ty + ), + }; + let mins = |ty| match ty { + types::I8X16 => SseOpcode::Pminsb, + types::I16X8 => SseOpcode::Pminsw, + types::I32X4 => SseOpcode::Pminsd, + _ => panic!( + "Unable to find an instruction for {} for type: {}", + condcode, ty + ), + }; + let minu = |ty| match ty { + types::I8X16 => SseOpcode::Pminub, + types::I16X8 => SseOpcode::Pminuw, + types::I32X4 => SseOpcode::Pminud, + _ => panic!( + "Unable to find an instruction for {} for type: {}", + condcode, ty + ), + }; + + // Here we decide which operand to use as the read/write `dst` (ModRM reg field) + // and which to use as the read `input` (ModRM r/m field). In the normal case we + // use Cranelift's first operand, the `lhs`, as `dst` but we flip the operands for + // the less-than cases so that we can reuse the greater-than implementation. + let input = match condcode { + IntCC::SignedLessThan + | IntCC::SignedLessThanOrEqual + | IntCC::UnsignedLessThan + | IntCC::UnsignedLessThanOrEqual => { + let lhs = input_to_reg_mem(ctx, inputs[0]); + let rhs = put_input_in_reg(ctx, inputs[1]); + ctx.emit(Inst::gen_move(dst, rhs, ty)); + lhs + } + _ => { + let lhs = put_input_in_reg(ctx, inputs[0]); + let rhs = input_to_reg_mem(ctx, inputs[1]); + ctx.emit(Inst::gen_move(dst, lhs, ty)); + rhs + } + }; + + match condcode { + IntCC::Equal => ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst)), + IntCC::NotEqual => { + ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst)); + // Emit all 1s into the `tmp` register. + let tmp = ctx.alloc_tmp(RegClass::V128, ty); + ctx.emit(Inst::xmm_rm_r(eq(ty), RegMem::from(tmp), tmp)); + // Invert the result of the `PCMPEQ*`. + ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), dst)); + } + IntCC::SignedGreaterThan | IntCC::SignedLessThan => { + ctx.emit(Inst::xmm_rm_r(gt(ty), input, dst)) + } + IntCC::SignedGreaterThanOrEqual | IntCC::SignedLessThanOrEqual => { + ctx.emit(Inst::xmm_rm_r(mins(ty), input.clone(), dst)); + ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst)) + } + IntCC::UnsignedGreaterThan | IntCC::UnsignedLessThan => { + ctx.emit(Inst::xmm_rm_r(maxu(ty), input.clone(), dst)); + ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst)); + // Emit all 1s into the `tmp` register. + let tmp = ctx.alloc_tmp(RegClass::V128, ty); + ctx.emit(Inst::xmm_rm_r(eq(ty), RegMem::from(tmp), tmp)); + // Invert the result of the `PCMPEQ*`. + ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), dst)); + } + IntCC::UnsignedGreaterThanOrEqual | IntCC::UnsignedLessThanOrEqual => { + ctx.emit(Inst::xmm_rm_r(minu(ty), input.clone(), dst)); + ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst)) + } + _ => unimplemented!("Unimplemented comparison code for icmp: {}", condcode), + } + } } Opcode::Fcmp => {