diff --git a/cranelift/codegen/src/isa/x64/inst.isle b/cranelift/codegen/src/isa/x64/inst.isle index 5a6bc40897..792e25d948 100644 --- a/cranelift/codegen/src/isa/x64/inst.isle +++ b/cranelift/codegen/src/isa/x64/inst.isle @@ -202,6 +202,17 @@ (src2 XmmMem) (dst WritableXmm)) + ;; XMM (scalar or vector) blend op. The mask is used to blend between + ;; src1 and src2. This differs from a use of `XmmRmR` as the mask is + ;; implicitly in register xmm0; this special case exists to allow us to + ;; communicate the constraint on the `mask` register to regalloc2. + (XmmRmRBlend + (op SseOpcode) + (src1 Xmm) + (src2 XmmMem) + (mask Xmm) + (dst WritableXmm)) + ;; XMM (scalar or vector) binary op that relies on the VEX prefix. (XmmRmRVex (op AvxOpcode) (src1 Xmm) @@ -1353,15 +1364,6 @@ (decl intcc_without_eq (IntCC) IntCC) (extern constructor intcc_without_eq intcc_without_eq) -;;;; Helpers for Getting Particular Physical Registers ;;;;;;;;;;;;;;;;;;;;;;;;; -;; -;; These should only be used for legalization purposes, when we can't otherwise -;; rely on something like `Inst::mov_mitosis` to put an operand into the -;; appropriate physical register for whatever reason. - -(decl xmm0 () WritableXmm) -(extern constructor xmm0 xmm0) - ;;;; Helpers for determining the register class of a value type ;;;;;;;;;;;;;;;; (type RegisterClass @@ -2432,33 +2434,21 @@ ;; Priority 0 because multi_lane overlaps with the previous two type patterns. (rule 0 (sse_mov_op (multi_lane _bits _lanes)) (SseOpcode.Movdqa)) +(decl xmm_rm_r_blend (SseOpcode Xmm XmmMem Xmm) Xmm) +(rule (xmm_rm_r_blend op src1 src2 mask) + (let ((dst WritableXmm (temp_writable_xmm)) + (_ Unit (emit (MInst.XmmRmRBlend op src1 src2 mask dst)))) + dst)) + ;; Helper for creating `blendvp{d,s}` and `pblendvb` instructions. -(decl x64_blend (Type XmmMem XmmMem Xmm) Xmm) +(decl x64_blend (Type Xmm XmmMem Xmm) Xmm) (rule (x64_blend ty mask src1 src2) - ;; Move the mask into `xmm0`, as blend instructions implicitly operate on - ;; that register. (This kind of thing would normally happen inside of - ;; `Inst::mov_mitosis`, but has to happen here, where we still have the - ;; mask register, because the mask is implicit and doesn't appear in the - ;; `Inst` itself.) - (let ((mask2 WritableXmm (xmm0)) - (_ Unit (emit (MInst.XmmUnaryRmR (sse_mov_op ty) - mask - mask2)))) - (xmm_rm_r ty (sse_blend_op ty) src2 src1))) + (xmm_rm_r_blend (sse_blend_op ty) src2 src1 mask)) ;; Helper for creating `blendvpd` instructions. (decl x64_blendvpd (Xmm XmmMem Xmm) Xmm) (rule (x64_blendvpd src1 src2 mask) - ;; Move the mask into `xmm0`, as `blendvpd` implicitly operates on that - ;; register. (This kind of thing would normally happen inside of - ;; `Inst::mov_mitosis`, but has to happen here, where we still have the - ;; mask register, because the mask is implicit and doesn't appear in the - ;; `Inst` itself.) - (let ((mask2 WritableXmm (xmm0)) - (_ Unit (emit (MInst.XmmUnaryRmR (SseOpcode.Movapd) - mask - mask2)))) - (xmm_rm_r $F64X2 (SseOpcode.Blendvpd) src1 src2))) + (xmm_rm_r_blend (SseOpcode.Blendvpd) src1 src2 mask)) ;; Helper for creating `movsd` instructions. (decl x64_movsd_regmove (Xmm XmmMem) Xmm) diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs index 7d75a0ddec..32f284a8c0 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit.rs @@ -1820,8 +1820,6 @@ pub(crate) fn emit( SseOpcode::Andpd => (LegacyPrefixes::_66, 0x0F54, 2), SseOpcode::Andnps => (LegacyPrefixes::None, 0x0F55, 2), SseOpcode::Andnpd => (LegacyPrefixes::_66, 0x0F55, 2), - SseOpcode::Blendvps => (LegacyPrefixes::_66, 0x0F3814, 3), - SseOpcode::Blendvpd => (LegacyPrefixes::_66, 0x0F3815, 3), SseOpcode::Divps => (LegacyPrefixes::None, 0x0F5E, 2), SseOpcode::Divpd => (LegacyPrefixes::_66, 0x0F5E, 2), SseOpcode::Divss => (LegacyPrefixes::_F3, 0x0F5E, 2), @@ -1859,7 +1857,6 @@ pub(crate) fn emit( SseOpcode::Pandn => (LegacyPrefixes::_66, 0x0FDF, 2), SseOpcode::Pavgb => (LegacyPrefixes::_66, 0x0FE0, 2), SseOpcode::Pavgw => (LegacyPrefixes::_66, 0x0FE3, 2), - SseOpcode::Pblendvb => (LegacyPrefixes::_66, 0x0F3810, 3), SseOpcode::Pcmpeqb => (LegacyPrefixes::_66, 0x0F74, 2), SseOpcode::Pcmpeqw => (LegacyPrefixes::_66, 0x0F75, 2), SseOpcode::Pcmpeqd => (LegacyPrefixes::_66, 0x0F76, 2), @@ -1924,6 +1921,39 @@ pub(crate) fn emit( } } + Inst::XmmRmRBlend { + op, + src1, + src2, + dst, + mask, + } => { + let src1 = allocs.next(src1.to_reg()); + let mask = allocs.next(mask.to_reg()); + debug_assert_eq!(mask, regs::xmm0()); + let reg_g = allocs.next(dst.to_reg().to_reg()); + debug_assert_eq!(src1, reg_g); + let src_e = src2.clone().to_reg_mem().with_allocs(allocs); + + let rex = RexFlags::clear_w(); + let (prefix, opcode, length) = match op { + SseOpcode::Blendvps => (LegacyPrefixes::_66, 0x0F3814, 3), + SseOpcode::Blendvpd => (LegacyPrefixes::_66, 0x0F3815, 3), + SseOpcode::Pblendvb => (LegacyPrefixes::_66, 0x0F3810, 3), + _ => unimplemented!("Opcode {:?} not implemented", op), + }; + + match src_e { + RegMem::Reg { reg: reg_e } => { + emit_std_reg_reg(sink, prefix, opcode, length, reg_g, reg_e, rex); + } + RegMem::Mem { addr } => { + let addr = &addr.finalize(state, sink); + emit_std_reg_mem(sink, info, prefix, opcode, length, reg_g, addr, rex, 0); + } + } + } + Inst::XmmRmRVex { op, src1, diff --git a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs index 6b3b31406d..a56be39f44 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs @@ -131,6 +131,16 @@ impl Inst { size, } } + + fn xmm_rm_r_blend(op: SseOpcode, src2: RegMem, dst: Writable) -> Inst { + Inst::XmmRmRBlend { + op, + src1: Xmm::new(dst.to_reg()).unwrap(), + src2: XmmMem::new(src2).unwrap(), + mask: Xmm::new(regs::xmm0()).unwrap(), + dst: WritableXmm::from_writable_reg(dst).unwrap(), + } + } } #[test] @@ -3961,19 +3971,19 @@ fn test_x64_emit() { )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Blendvpd, RegMem::reg(xmm15), w_xmm4), + Inst::xmm_rm_r_blend(SseOpcode::Blendvpd, RegMem::reg(xmm15), w_xmm4), "66410F3815E7", "blendvpd %xmm4, %xmm15, %xmm4", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Blendvps, RegMem::reg(xmm2), w_xmm3), + Inst::xmm_rm_r_blend(SseOpcode::Blendvps, RegMem::reg(xmm2), w_xmm3), "660F3814DA", "blendvps %xmm3, %xmm2, %xmm3", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Pblendvb, RegMem::reg(xmm12), w_xmm13), + Inst::xmm_rm_r_blend(SseOpcode::Pblendvb, RegMem::reg(xmm12), w_xmm13), "66450F3810EC", "pblendvb %xmm13, %xmm12, %xmm13", )); diff --git a/cranelift/codegen/src/isa/x64/inst/mod.rs b/cranelift/codegen/src/isa/x64/inst/mod.rs index 7ea05103e7..6ee5e7a26a 100644 --- a/cranelift/codegen/src/isa/x64/inst/mod.rs +++ b/cranelift/codegen/src/isa/x64/inst/mod.rs @@ -3,7 +3,7 @@ use crate::binemit::{Addend, CodeOffset, Reloc, StackMap}; use crate::ir::{types, ExternalName, LibCall, Opcode, RelSourceLoc, TrapCode, Type}; use crate::isa::x64::abi::X64ABIMachineSpec; -use crate::isa::x64::inst::regs::pretty_print_reg; +use crate::isa::x64::inst::regs::{pretty_print_reg, show_ireg_sized}; use crate::isa::x64::settings as x64_settings; use crate::isa::CallConv; use crate::{machinst::*, trace}; @@ -130,6 +130,7 @@ impl Inst { | Inst::XmmMovRM { op, .. } | Inst::XmmRmiReg { opcode: op, .. } | Inst::XmmRmR { op, .. } + | Inst::XmmRmRBlend { op, .. } | Inst::XmmRmRImm { op, .. } | Inst::XmmToGpr { op, .. } | Inst::XmmUnaryRmRImm { op, .. } @@ -938,6 +939,33 @@ impl PrettyPrint for Inst { format!("{} {}, {}, {}", ljustify(op.to_string()), src1, src2, dst) } + Inst::XmmRmRBlend { + op, + src1, + src2, + mask, + dst, + } => { + let src1 = pretty_print_reg(src1.to_reg(), 8, allocs); + let mask = allocs.next(mask.to_reg()); + let mask = if mask.is_virtual() { + format!(" <{}>", show_ireg_sized(mask, 8)) + } else { + debug_assert_eq!(mask, regs::xmm0()); + String::new() + }; + let dst = pretty_print_reg(dst.to_reg().to_reg(), 8, allocs); + let src2 = src2.pretty_print(8, allocs); + format!( + "{} {}, {}, {}{}", + ljustify(op.to_string()), + src1, + src2, + dst, + mask + ) + } + Inst::XmmRmRVex { op, src1, @@ -1765,11 +1793,7 @@ fn x64_get_operands VReg>(inst: &Inst, collector: &mut OperandCol src.get_operands(collector); } Inst::XmmRmR { - src1, - src2, - dst, - op, - .. + src1, src2, dst, .. } => { if inst.produces_const() { collector.reg_def(dst.to_writable_reg()); @@ -1777,15 +1801,24 @@ fn x64_get_operands VReg>(inst: &Inst, collector: &mut OperandCol collector.reg_use(src1.to_reg()); collector.reg_reuse_def(dst.to_writable_reg(), 0); src2.get_operands(collector); - - // Some instructions have an implicit use of XMM0. - if *op == SseOpcode::Blendvpd + } + } + Inst::XmmRmRBlend { + src1, + src2, + mask, + dst, + op, + } => { + assert!( + *op == SseOpcode::Blendvpd || *op == SseOpcode::Blendvps || *op == SseOpcode::Pblendvb - { - collector.reg_use(regs::xmm0()); - } - } + ); + collector.reg_use(src1.to_reg()); + collector.reg_fixed_use(mask.to_reg(), regs::xmm0()); + collector.reg_reuse_def(dst.to_writable_reg(), 0); + src2.get_operands(collector); } Inst::XmmRmRVex { op, diff --git a/cranelift/codegen/src/isa/x64/lower/isle.rs b/cranelift/codegen/src/isa/x64/lower/isle.rs index 71c519a148..53a7de61a5 100644 --- a/cranelift/codegen/src/isa/x64/lower/isle.rs +++ b/cranelift/codegen/src/isa/x64/lower/isle.rs @@ -336,11 +336,6 @@ impl Context for IsleContext<'_, '_, MInst, Flags, IsaFlags, 6> { 0b00_00_00_00 | lane << 4 } - #[inline] - fn xmm0(&mut self) -> WritableXmm { - WritableXmm::from_reg(Xmm::new(regs::xmm0()).unwrap()) - } - #[inline] fn synthetic_amode_to_reg_mem(&mut self, addr: &SyntheticAmode) -> RegMem { RegMem::mem(addr.clone()) diff --git a/cranelift/filetests/filetests/isa/x64/simd-bitselect.clif b/cranelift/filetests/filetests/isa/x64/simd-bitselect.clif index 3a56d97a2f..25f77c2189 100644 --- a/cranelift/filetests/filetests/isa/x64/simd-bitselect.clif +++ b/cranelift/filetests/filetests/isa/x64/simd-bitselect.clif @@ -16,9 +16,9 @@ block0(v0: i8x16, v1: i8x16): ; pcmpeqb %xmm4, %xmm1, %xmm4 ; movdqa %xmm0, %xmm7 ; movdqa %xmm4, %xmm0 -; movdqa %xmm1, %xmm5 -; pblendvb %xmm5, %xmm7, %xmm5 -; movdqa %xmm5, %xmm0 +; movdqa %xmm1, %xmm4 +; pblendvb %xmm4, %xmm7, %xmm4 +; movdqa %xmm4, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -34,9 +34,9 @@ block0(v0: f32x4, v1: f32x4, v2: i32x4, v3: i32x4): ; movq %rsp, %rbp ; block0: ; cmpps $0, %xmm0, %xmm1, %xmm0 -; movdqa %xmm3, %xmm7 -; pblendvb %xmm7, %xmm2, %xmm7 -; movdqa %xmm7, %xmm0 +; movdqa %xmm3, %xmm6 +; pblendvb %xmm6, %xmm2, %xmm6 +; movdqa %xmm6, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -72,10 +72,10 @@ block0(v0: i8x16, v1: i8x16): ; block0: ; movdqa %xmm0, %xmm5 ; movdqu const(0), %xmm0 -; movdqa %xmm5, %xmm7 -; movdqa %xmm1, %xmm5 -; pblendvb %xmm5, %xmm7, %xmm5 -; movdqa %xmm5, %xmm0 +; movdqa %xmm5, %xmm6 +; movdqa %xmm1, %xmm4 +; pblendvb %xmm4, %xmm6, %xmm4 +; movdqa %xmm4, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -92,10 +92,10 @@ block0(v0: i16x8, v1: i16x8): ; block0: ; movdqa %xmm0, %xmm5 ; movdqu const(0), %xmm0 -; movdqa %xmm5, %xmm7 -; movdqa %xmm1, %xmm5 -; pblendvb %xmm5, %xmm7, %xmm5 -; movdqa %xmm5, %xmm0 +; movdqa %xmm5, %xmm6 +; movdqa %xmm1, %xmm4 +; pblendvb %xmm4, %xmm6, %xmm4 +; movdqa %xmm4, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret diff --git a/cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif b/cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif index d4e613198c..5554a5069f 100644 --- a/cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif +++ b/cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif @@ -137,9 +137,9 @@ block0(v0: i16x8, v1: i16x8, v2: i16x8): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; movdqa %xmm2, %xmm5 -; pblendvb %xmm5, %xmm1, %xmm5 -; movdqa %xmm5, %xmm0 +; movdqa %xmm2, %xmm4 +; pblendvb %xmm4, %xmm1, %xmm4 +; movdqa %xmm4, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -153,9 +153,9 @@ block0(v0: i32x4, v1: f32x4, v2: f32x4): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; movdqa %xmm2, %xmm5 -; blendvps %xmm5, %xmm1, %xmm5 -; movdqa %xmm5, %xmm0 +; movdqa %xmm2, %xmm4 +; blendvps %xmm4, %xmm1, %xmm4 +; movdqa %xmm4, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -169,9 +169,9 @@ block0(v0: i64x2, v1: f64x2, v2: f64x2): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; movdqa %xmm2, %xmm5 -; blendvpd %xmm5, %xmm1, %xmm5 -; movdqa %xmm5, %xmm0 +; movdqa %xmm2, %xmm4 +; blendvpd %xmm4, %xmm1, %xmm4 +; movdqa %xmm4, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret