diff --git a/cranelift/codegen/src/isa/x64/inst/args.rs b/cranelift/codegen/src/isa/x64/inst/args.rs index 8b31ce9944..f763945766 100644 --- a/cranelift/codegen/src/isa/x64/inst/args.rs +++ b/cranelift/codegen/src/isa/x64/inst/args.rs @@ -404,6 +404,9 @@ pub enum SseOpcode { Paddw, Pavgb, Pavgw, + Pextrb, + Pextrw, + Pextrd, Pinsrb, Pinsrw, Pinsrd, @@ -422,6 +425,7 @@ pub enum SseOpcode { Pmulld, Pmullw, Pmuludq, + Pshufd, Psllw, Pslld, Psllq, @@ -524,6 +528,7 @@ impl SseOpcode { | SseOpcode::Paddw | SseOpcode::Pavgb | SseOpcode::Pavgw + | SseOpcode::Pextrw | SseOpcode::Pinsrw | SseOpcode::Pmaxsw | SseOpcode::Pmaxub @@ -531,6 +536,7 @@ impl SseOpcode { | SseOpcode::Pminub | SseOpcode::Pmullw | SseOpcode::Pmuludq + | SseOpcode::Pshufd | SseOpcode::Psllw | SseOpcode::Pslld | SseOpcode::Psllq @@ -554,6 +560,8 @@ impl SseOpcode { SseOpcode::Pabsb | SseOpcode::Pabsw | SseOpcode::Pabsd => SSSE3, SseOpcode::Insertps + | SseOpcode::Pextrb + | SseOpcode::Pextrd | SseOpcode::Pinsrb | SseOpcode::Pinsrd | SseOpcode::Pmaxsb @@ -643,6 +651,9 @@ impl fmt::Debug for SseOpcode { SseOpcode::Paddw => "paddw", SseOpcode::Pavgb => "pavgb", SseOpcode::Pavgw => "pavgw", + SseOpcode::Pextrb => "pextrb", + SseOpcode::Pextrw => "pextrw", + SseOpcode::Pextrd => "pextrd", SseOpcode::Pinsrb => "pinsrb", SseOpcode::Pinsrw => "pinsrw", SseOpcode::Pinsrd => "pinsrd", @@ -661,6 +672,7 @@ impl fmt::Debug for SseOpcode { SseOpcode::Pmulld => "pmulld", SseOpcode::Pmullw => "pmullw", SseOpcode::Pmuludq => "pmuludq", + SseOpcode::Pshufd => "pshufd", SseOpcode::Psllw => "psllw", SseOpcode::Pslld => "pslld", SseOpcode::Psllq => "psllq", diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs index e42996ab67..acfc6f27aa 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit.rs @@ -1915,7 +1915,7 @@ pub(crate) fn emit( imm, is64: w, } => { - let (prefix, opcode, num_opcodes) = match op { + let (prefix, opcode, len) = match op { SseOpcode::Cmpps => (LegacyPrefixes::None, 0x0FC2, 2), SseOpcode::Cmppd => (LegacyPrefixes::_66, 0x0FC2, 2), SseOpcode::Cmpss => (LegacyPrefixes::_F3, 0x0FC2, 2), @@ -1924,6 +1924,10 @@ pub(crate) fn emit( SseOpcode::Pinsrb => (LegacyPrefixes::_66, 0x0F3A20, 3), SseOpcode::Pinsrw => (LegacyPrefixes::_66, 0x0FC4, 2), SseOpcode::Pinsrd => (LegacyPrefixes::_66, 0x0F3A22, 3), + SseOpcode::Pextrb => (LegacyPrefixes::_66, 0x0F3A14, 3), + SseOpcode::Pextrw => (LegacyPrefixes::_66, 0x0FC5, 2), + SseOpcode::Pextrd => (LegacyPrefixes::_66, 0x0F3A16, 3), + SseOpcode::Pshufd => (LegacyPrefixes::_66, 0x0F70, 2), _ => unimplemented!("Opcode {:?} not implemented", op), }; let rex = if *w { @@ -1931,13 +1935,29 @@ pub(crate) fn emit( } else { RexFlags::clear_w() }; + let regs_swapped = match *op { + // These opcodes (and not the SSE2 version of PEXTRW) flip the operand + // encoding: `dst` in ModRM's r/m, `src` in ModRM's reg field. + SseOpcode::Pextrb | SseOpcode::Pextrd => true, + // The rest of the opcodes have the customary encoding: `dst` in ModRM's reg, + // `src` in ModRM's r/m field. + _ => false, + }; match src { RegMem::Reg { reg } => { - emit_std_reg_reg(sink, prefix, opcode, num_opcodes, dst.to_reg(), *reg, rex); + if regs_swapped { + emit_std_reg_reg(sink, prefix, opcode, len, *reg, dst.to_reg(), rex); + } else { + emit_std_reg_reg(sink, prefix, opcode, len, dst.to_reg(), *reg, rex); + } } RegMem::Mem { addr } => { let addr = &addr.finalize(state); - emit_std_reg_mem(sink, prefix, opcode, num_opcodes, dst.to_reg(), addr, rex); + assert!( + !regs_swapped, + "No existing way to encode a mem argument in the ModRM r/m field." + ); + emit_std_reg_mem(sink, prefix, opcode, len, dst.to_reg(), addr, rex); } } sink.put1(*imm) diff --git a/cranelift/codegen/src/isa/x64/inst/mod.rs b/cranelift/codegen/src/isa/x64/inst/mod.rs index 1812155b33..577a2ce73b 100644 --- a/cranelift/codegen/src/isa/x64/inst/mod.rs +++ b/cranelift/codegen/src/isa/x64/inst/mod.rs @@ -788,8 +788,6 @@ impl Inst { imm: u8, w: bool, ) -> Inst { - debug_assert!(dst.to_reg().get_class() == RegClass::V128); - debug_assert!(imm < 8); Inst::XmmRmRImm { op, src, @@ -1736,10 +1734,17 @@ fn x64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) { collector.add_mod(*dst); } } - Inst::XmmRmRImm { src, dst, .. } => { + Inst::XmmRmRImm { op, src, dst, .. } => { if inst.produces_const() { // No need to account for src, since src == dst. collector.add_def(*dst); + } else if *op == SseOpcode::Pextrb + || *op == SseOpcode::Pextrw + || *op == SseOpcode::Pextrd + || *op == SseOpcode::Pshufd + { + src.get_regs_as_uses(collector); + collector.add_def(*dst); } else { src.get_regs_as_uses(collector); collector.add_mod(*dst); @@ -2038,6 +2043,7 @@ fn x64_map_regs(inst: &mut Inst, mapper: &RUM) { map_def(mapper, dst); } Inst::XmmRmRImm { + ref op, ref mut src, ref mut dst, .. @@ -2045,6 +2051,13 @@ fn x64_map_regs(inst: &mut Inst, mapper: &RUM) { if produces_const { src.map_as_def(mapper); map_def(mapper, dst); + } else if *op == SseOpcode::Pextrb + || *op == SseOpcode::Pextrw + || *op == SseOpcode::Pextrd + || *op == SseOpcode::Pshufd + { + src.map_uses(mapper); + map_def(mapper, dst); } else { src.map_uses(mapper); map_mod(mapper, dst); diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs index 35a5657f58..2da82f40f7 100644 --- a/cranelift/codegen/src/isa/x64/lower.rs +++ b/cranelift/codegen/src/isa/x64/lower.rs @@ -2690,6 +2690,55 @@ fn lower_insn_to_regs>( } } + Opcode::Extractlane => { + // The instruction format maps to variables like: %dst = extractlane %src, %lane + let ty = ty.unwrap(); + let dst = get_output_reg(ctx, outputs[0]); + let src_ty = ctx.input_ty(insn, 0); + assert_eq!(src_ty.bits(), 128); + let src = put_input_in_reg(ctx, inputs[0]); + let lane = if let InstructionData::BinaryImm8 { imm, .. } = ctx.data(insn) { + *imm + } else { + unreachable!(); + }; + + if !ty.is_float() { + let (sse_op, w_bit) = match ty.lane_bits() { + 8 => (SseOpcode::Pextrb, false), + 16 => (SseOpcode::Pextrw, false), + 32 => (SseOpcode::Pextrd, false), + 64 => (SseOpcode::Pextrd, true), + _ => panic!("Unable to extractlane for lane size: {}", ty.lane_bits()), + }; + let src = RegMem::reg(src); + ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, w_bit)); + } else { + if lane == 0 { + // Remove the extractlane instruction, leaving the float where it is. The upper + // bits will remain unchanged; for correctness, this relies on Cranelift type + // checking to avoid using those bits. + ctx.emit(Inst::gen_move(dst, src, ty)); + } else { + // Otherwise, shuffle the bits in `lane` to the lowest lane. + let sse_op = SseOpcode::Pshufd; + let mask = match src_ty { + // Move the value at `lane` to lane 0, copying existing value at lane 0 to + // other lanes. Again, this relies on Cranelift type checking to avoid + // using those bits. + types::F32X4 => 0b00_00_00_00 | lane, + // Move the value at `lane` 1 (we know it must be 1 because of the `if` + // statement above) to lane 0 and leave lane 1 unchanged. The Cranelift type + // checking assumption also applies here. + types::F64X2 => 0b11_10_11_10, + _ => unreachable!(), + }; + let src = RegMem::reg(src); + ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, mask, false)); + } + } + } + Opcode::IaddImm | Opcode::ImulImm | Opcode::UdivImm