diff --git a/cranelift/codegen/src/isa/x64/inst/args.rs b/cranelift/codegen/src/isa/x64/inst/args.rs index 7cb64898e6..8b31ce9944 100644 --- a/cranelift/codegen/src/isa/x64/inst/args.rs +++ b/cranelift/codegen/src/isa/x64/inst/args.rs @@ -383,6 +383,7 @@ pub enum SseOpcode { Movd, Movdqa, Movdqu, + Movlhps, Movq, Movss, Movsd, @@ -403,6 +404,9 @@ pub enum SseOpcode { Paddw, Pavgb, Pavgw, + Pinsrb, + Pinsrw, + Pinsrd, Pmaxsb, Pmaxsw, Pmaxsd, @@ -471,6 +475,7 @@ impl SseOpcode { | SseOpcode::Minps | SseOpcode::Minss | SseOpcode::Movaps + | SseOpcode::Movlhps | SseOpcode::Movss | SseOpcode::Movups | SseOpcode::Mulps @@ -519,6 +524,7 @@ impl SseOpcode { | SseOpcode::Paddw | SseOpcode::Pavgb | SseOpcode::Pavgw + | SseOpcode::Pinsrw | SseOpcode::Pmaxsw | SseOpcode::Pmaxub | SseOpcode::Pminsw @@ -548,6 +554,8 @@ impl SseOpcode { SseOpcode::Pabsb | SseOpcode::Pabsw | SseOpcode::Pabsd => SSSE3, SseOpcode::Insertps + | SseOpcode::Pinsrb + | SseOpcode::Pinsrd | SseOpcode::Pmaxsb | SseOpcode::Pmaxsd | SseOpcode::Pmaxuw @@ -614,6 +622,7 @@ impl fmt::Debug for SseOpcode { SseOpcode::Movd => "movd", SseOpcode::Movdqa => "movdqa", SseOpcode::Movdqu => "movdqu", + SseOpcode::Movlhps => "movlhps", SseOpcode::Movq => "movq", SseOpcode::Movss => "movss", SseOpcode::Movsd => "movsd", @@ -634,6 +643,9 @@ impl fmt::Debug for SseOpcode { SseOpcode::Paddw => "paddw", SseOpcode::Pavgb => "pavgb", SseOpcode::Pavgw => "pavgw", + SseOpcode::Pinsrb => "pinsrb", + SseOpcode::Pinsrw => "pinsrw", + SseOpcode::Pinsrd => "pinsrd", SseOpcode::Pmaxsb => "pmaxsb", SseOpcode::Pmaxsw => "pmaxsw", SseOpcode::Pmaxsd => "pmaxsd", diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs index 341133fc85..e42996ab67 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit.rs @@ -1760,14 +1760,16 @@ pub(crate) fn emit( SseOpcode::Divpd => (LegacyPrefixes::_66, 0x0F5E, 2), SseOpcode::Divss => (LegacyPrefixes::_F3, 0x0F5E, 
2), SseOpcode::Divsd => (LegacyPrefixes::_F2, 0x0F5E, 2), - SseOpcode::Minps => (LegacyPrefixes::None, 0x0F5D, 2), - SseOpcode::Minpd => (LegacyPrefixes::_66, 0x0F5D, 2), - SseOpcode::Minss => (LegacyPrefixes::_F3, 0x0F5D, 2), - SseOpcode::Minsd => (LegacyPrefixes::_F2, 0x0F5D, 2), SseOpcode::Maxps => (LegacyPrefixes::None, 0x0F5F, 2), SseOpcode::Maxpd => (LegacyPrefixes::_66, 0x0F5F, 2), SseOpcode::Maxss => (LegacyPrefixes::_F3, 0x0F5F, 2), SseOpcode::Maxsd => (LegacyPrefixes::_F2, 0x0F5F, 2), + SseOpcode::Minps => (LegacyPrefixes::None, 0x0F5D, 2), + SseOpcode::Minpd => (LegacyPrefixes::_66, 0x0F5D, 2), + SseOpcode::Minss => (LegacyPrefixes::_F3, 0x0F5D, 2), + SseOpcode::Minsd => (LegacyPrefixes::_F2, 0x0F5D, 2), + SseOpcode::Movlhps => (LegacyPrefixes::None, 0x0F16, 2), + SseOpcode::Movsd => (LegacyPrefixes::_F2, 0x0F10, 2), SseOpcode::Mulps => (LegacyPrefixes::None, 0x0F59, 2), SseOpcode::Mulpd => (LegacyPrefixes::_66, 0x0F59, 2), SseOpcode::Mulss => (LegacyPrefixes::_F3, 0x0F59, 2), @@ -1906,23 +1908,36 @@ pub(crate) fn emit( sink.bind_label(done); } - Inst::XmmRmRImm { op, src, dst, imm } => { - let prefix = match op { - SseOpcode::Cmpps => LegacyPrefixes::None, - SseOpcode::Cmppd => LegacyPrefixes::_66, - SseOpcode::Cmpss => LegacyPrefixes::_F3, - SseOpcode::Cmpsd => LegacyPrefixes::_F2, + Inst::XmmRmRImm { + op, + src, + dst, + imm, + is64: w, + } => { + let (prefix, opcode, num_opcodes) = match op { + SseOpcode::Cmpps => (LegacyPrefixes::None, 0x0FC2, 2), + SseOpcode::Cmppd => (LegacyPrefixes::_66, 0x0FC2, 2), + SseOpcode::Cmpss => (LegacyPrefixes::_F3, 0x0FC2, 2), + SseOpcode::Cmpsd => (LegacyPrefixes::_F2, 0x0FC2, 2), + SseOpcode::Insertps => (LegacyPrefixes::_66, 0x0F3A21, 3), + SseOpcode::Pinsrb => (LegacyPrefixes::_66, 0x0F3A20, 3), + SseOpcode::Pinsrw => (LegacyPrefixes::_66, 0x0FC4, 2), + SseOpcode::Pinsrd => (LegacyPrefixes::_66, 0x0F3A22, 3), _ => unimplemented!("Opcode {:?} not implemented", op), }; - let opcode = 0x0FC2; - let rex = 
RexFlags::clear_w(); + let rex = if *w { + RexFlags::set_w() + } else { + RexFlags::clear_w() + }; match src { RegMem::Reg { reg } => { - emit_std_reg_reg(sink, prefix, opcode, 2, dst.to_reg(), *reg, rex); + emit_std_reg_reg(sink, prefix, opcode, num_opcodes, dst.to_reg(), *reg, rex); } RegMem::Mem { addr } => { let addr = &addr.finalize(state); - emit_std_reg_mem(sink, prefix, opcode, 2, dst.to_reg(), addr, rex); + emit_std_reg_mem(sink, prefix, opcode, num_opcodes, dst.to_reg(), addr, rex); } } sink.put1(*imm) diff --git a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs index 05e645cf3c..8945435f6c 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs @@ -3441,12 +3441,12 @@ fn test_x64_emit() { // ======================================================== // XmmRmRImm insns.push(( - Inst::xmm_rm_r_imm(SseOpcode::Cmppd, RegMem::reg(xmm5), w_xmm1, 2), + Inst::xmm_rm_r_imm(SseOpcode::Cmppd, RegMem::reg(xmm5), w_xmm1, 2, false), "660FC2CD02", "cmppd $2, %xmm5, %xmm1", )); insns.push(( - Inst::xmm_rm_r_imm(SseOpcode::Cmpps, RegMem::reg(xmm15), w_xmm7, 0), + Inst::xmm_rm_r_imm(SseOpcode::Cmpps, RegMem::reg(xmm15), w_xmm7, 0, false), "410FC2FF00", "cmpps $0, %xmm15, %xmm7", )); diff --git a/cranelift/codegen/src/isa/x64/inst/mod.rs b/cranelift/codegen/src/isa/x64/inst/mod.rs index b9bf226e76..1812155b33 100644 --- a/cranelift/codegen/src/isa/x64/inst/mod.rs +++ b/cranelift/codegen/src/isa/x64/inst/mod.rs @@ -333,12 +333,13 @@ pub enum Inst { dst: Reg, }, - /// A binary XMM instruction with an 8-bit immediate: cmp (ps pd) imm (reg addr) reg + /// A binary XMM instruction with an 8-bit immediate: e.g. 
cmp (ps pd) imm (reg addr) reg XmmRmRImm { op: SseOpcode, src: RegMem, dst: Writable, imm: u8, + is64: bool, }, // ===================================== @@ -780,11 +781,22 @@ impl Inst { } } - pub(crate) fn xmm_rm_r_imm(op: SseOpcode, src: RegMem, dst: Writable, imm: u8) -> Inst { - src.assert_regclass_is(RegClass::V128); + pub(crate) fn xmm_rm_r_imm( + op: SseOpcode, + src: RegMem, + dst: Writable, + imm: u8, + w: bool, + ) -> Inst { debug_assert!(dst.to_reg().get_class() == RegClass::V128); debug_assert!(imm < 8); - Inst::XmmRmRImm { op, src, dst, imm } + Inst::XmmRmRImm { + op, + src, + dst, + imm, + is64: w, + } } pub(crate) fn movzx_rm_r( @@ -1118,7 +1130,9 @@ impl Inst { || *op == SseOpcode::Pxor) } - Self::XmmRmRImm { op, src, dst, imm } => { + Self::XmmRmRImm { + op, src, dst, imm, .. + } => { src.to_reg() == Some(dst.to_reg()) && (*op == SseOpcode::Cmppd || *op == SseOpcode::Cmpps) && *imm == FcmpImm::Equal.encode() @@ -1300,9 +1314,9 @@ impl ShowWithRRU for Inst { show_ireg_sized(rhs_dst.to_reg(), mb_rru, 8), ), - Inst::XmmRmRImm { op, src, dst, imm } => format!( + Inst::XmmRmRImm { op, src, dst, imm, is64 } => format!( "{} ${}, {}, {}", - ljustify(op.to_string()), + ljustify(format!("{}{}", op.to_string(), if *is64 { ".w" } else { "" })), imm, src.show_rru(mb_rru), dst.show_rru(mb_rru), diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs index 2bde1c31f7..35a5657f58 100644 --- a/cranelift/codegen/src/isa/x64/lower.rs +++ b/cranelift/codegen/src/isa/x64/lower.rs @@ -1394,7 +1394,7 @@ fn lower_insn_to_regs>( ctx.emit(Inst::gen_move(dst, lhs, input_ty)); // Emit the comparison. 
- ctx.emit(Inst::xmm_rm_r_imm(op, rhs, dst, imm.encode())); + ctx.emit(Inst::xmm_rm_r_imm(op, rhs, dst, imm.encode(), false)); } } @@ -1859,6 +1859,7 @@ fn lower_insn_to_regs>( RegMem::reg(tmp.to_reg()), tmp, cond.encode(), + false, ); ctx.emit(cmpps); @@ -2639,6 +2640,56 @@ fn lower_insn_to_regs>( ctx.emit(Inst::gen_move(dst, src, ty)); } + Opcode::Insertlane => { + // The instruction format maps to variables like: %dst = insertlane %in_vec, %src, %lane + let ty = ty.unwrap(); + let dst = get_output_reg(ctx, outputs[0]); + let in_vec = put_input_in_reg(ctx, inputs[0]); + let src_ty = ctx.input_ty(insn, 1); + debug_assert!(!src_ty.is_vector()); + let src = input_to_reg_mem(ctx, inputs[1]); + let lane = if let InstructionData::TernaryImm8 { imm, .. } = ctx.data(insn) { + *imm + } else { + unreachable!(); + }; + debug_assert!(lane < ty.lane_count() as u8); + + ctx.emit(Inst::gen_move(dst, in_vec, ty)); + if !src_ty.is_float() { + let (sse_op, w_bit) = match ty.lane_bits() { + 8 => (SseOpcode::Pinsrb, false), + 16 => (SseOpcode::Pinsrw, false), + 32 => (SseOpcode::Pinsrd, false), + 64 => (SseOpcode::Pinsrd, true), + _ => panic!("Unable to insertlane for lane size: {}", ty.lane_bits()), + }; + ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, w_bit)); + } else if src_ty == types::F32 { + let sse_op = SseOpcode::Insertps; + // Insert 32-bits from replacement (at index 00, imm8 bits 7:6) to vector (lane + // shifted into imm8 bits 5:4). + let lane = 0b00_00_00_00 | lane << 4; + ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, false)); + } else if src_ty == types::F64 { + let sse_op = match lane { + // Move the lowest quadword in replacement to vector without changing + // the upper bits. + 0 => SseOpcode::Movsd, + // Move the low 64 bits of replacement vector to the high 64 bits of the + // vector.
+ 1 => SseOpcode::Movlhps, + _ => unreachable!(), + }; + // Here we use the `xmm_rm_r` encoding because it correctly tells the register + // allocator how we are using `dst`: we are using `dst` as a `mod` whereas other + // encoding formats like `xmm_unary_rm_r` treat it as a `def`. + ctx.emit(Inst::xmm_rm_r(sse_op, src, dst)); + } else { + panic!("Unable to insertlane for type: {}", ty); + } + } + Opcode::IaddImm | Opcode::ImulImm | Opcode::UdivImm