diff --git a/cranelift/codegen/src/isa/x64/inst.isle b/cranelift/codegen/src/isa/x64/inst.isle index d5d8c572a9..46e37bea5b 100644 --- a/cranelift/codegen/src/isa/x64/inst.isle +++ b/cranelift/codegen/src/isa/x64/inst.isle @@ -331,6 +331,12 @@ (dst WritableGpr) (dst_size OperandSize)) + ;; XMM (scalar) unary op (from xmm to integer reg): pextr{w,b,d,q} + (XmmToGprImm (op SseOpcode) + (src Xmm) + (dst WritableGpr) + (imm u8)) + ;; XMM (scalar) unary op (from integer to float reg): movd, movq, ;; cvtsi2s{s,d} (GprToXmm (op SseOpcode) @@ -749,6 +755,7 @@ Pextrb Pextrw Pextrd + Pextrq Pinsrb Pinsrw Pinsrd @@ -3110,16 +3117,9 @@ (xmm_rmr_imm_vex (AvxOpcode.Vinsertps) src1 src2 lane)) ;; Helper for creating `pshufd` instructions. -(decl x64_pshufd (XmmMem u8 OperandSize) Xmm) -(rule (x64_pshufd src imm size) - (let ((dst WritableXmm (temp_writable_xmm)) - (_ Unit (emit (MInst.XmmRmRImm (SseOpcode.Pshufd) - dst - src - dst - imm - size)))) - dst)) +(decl x64_pshufd (XmmMem u8) Xmm) +(rule (x64_pshufd src imm) + (xmm_unary_rm_r_imm (SseOpcode.Pshufd) src imm)) ;; Helper for creating `pshufb` instructions. (decl x64_pshufb (Xmm XmmMem) Xmm) @@ -3314,40 +3314,24 @@ (xmm_rmir_vex (AvxOpcode.Vpsrad) src1 src2)) ;; Helper for creating `pextrb` instructions. -(decl x64_pextrb (Type Xmm u8) Gpr) -(rule (x64_pextrb ty src lane) - (let ((dst WritableGpr (temp_writable_gpr)) - (_ Unit (emit (MInst.XmmRmRImm (SseOpcode.Pextrb) - dst - src - dst - lane - (operand_size_of_type_32_64 (lane_type ty)))))) - dst)) +(decl x64_pextrb (Xmm u8) Gpr) +(rule (x64_pextrb src lane) + (xmm_to_gpr_imm (SseOpcode.Pextrb) src lane)) ;; Helper for creating `pextrw` instructions. 
-(decl x64_pextrw (Type Xmm u8) Gpr) -(rule (x64_pextrw ty src lane) - (let ((dst WritableGpr (temp_writable_gpr)) - (_ Unit (emit (MInst.XmmRmRImm (SseOpcode.Pextrw) - dst - src - dst - lane - (operand_size_of_type_32_64 (lane_type ty)))))) - dst)) +(decl x64_pextrw (Xmm u8) Gpr) +(rule (x64_pextrw src lane) + (xmm_to_gpr_imm (SseOpcode.Pextrw) src lane)) ;; Helper for creating `pextrd` instructions. -(decl x64_pextrd (Type Xmm u8) Gpr) -(rule (x64_pextrd ty src lane) - (let ((dst WritableGpr (temp_writable_gpr)) - (_ Unit (emit (MInst.XmmRmRImm (SseOpcode.Pextrd) - dst - src - dst - lane - (operand_size_of_type_32_64 (lane_type ty)))))) - dst)) +(decl x64_pextrd (Xmm u8) Gpr) +(rule (x64_pextrd src lane) + (xmm_to_gpr_imm (SseOpcode.Pextrd) src lane)) + +;; Helper for creating `pextrq` instructions. +(decl x64_pextrq (Xmm u8) Gpr) +(rule (x64_pextrq src lane) + (xmm_to_gpr_imm (SseOpcode.Pextrq) src lane)) ;; Helper for creating `MInst.XmmToGpr` instructions. (decl xmm_to_gpr (SseOpcode Xmm OperandSize) Gpr) @@ -3356,6 +3340,13 @@ (_ Unit (emit (MInst.XmmToGpr op src dst size)))) dst)) +;; Helper for creating `MInst.XmmToGprImm` instructions. +(decl xmm_to_gpr_imm (SseOpcode Xmm u8) Gpr) +(rule (xmm_to_gpr_imm op src imm) + (let ((dst WritableGpr (temp_writable_gpr)) + (_ Unit (emit (MInst.XmmToGprImm op src dst imm)))) + dst)) + ;; Helper for creating `pmovmskb` instructions. 
(decl x64_pmovmskb (OperandSize Xmm) Gpr) (rule (x64_pmovmskb size src) diff --git a/cranelift/codegen/src/isa/x64/inst/args.rs b/cranelift/codegen/src/isa/x64/inst/args.rs index 57f644c8e9..9eaded1210 100644 --- a/cranelift/codegen/src/isa/x64/inst/args.rs +++ b/cranelift/codegen/src/isa/x64/inst/args.rs @@ -999,6 +999,7 @@ pub enum SseOpcode { Pextrb, Pextrw, Pextrd, + Pextrq, Pinsrb, Pinsrw, Pinsrd, @@ -1237,6 +1238,7 @@ impl SseOpcode { | SseOpcode::Pcmpeqq | SseOpcode::Pextrb | SseOpcode::Pextrd + | SseOpcode::Pextrq | SseOpcode::Pinsrb | SseOpcode::Pinsrd | SseOpcode::Pmaxsb @@ -1278,22 +1280,6 @@ impl SseOpcode { _ => 8, } } - - /// Does an XmmRmmRImm with this opcode use src1? FIXME: split - /// into separate instructions. - pub(crate) fn uses_src1(&self) -> bool { - match self { - SseOpcode::Pextrb => false, - SseOpcode::Pextrw => false, - SseOpcode::Pextrd => false, - SseOpcode::Pshufd => false, - SseOpcode::Roundss => false, - SseOpcode::Roundsd => false, - SseOpcode::Roundps => false, - SseOpcode::Roundpd => false, - _ => true, - } - } } impl fmt::Debug for SseOpcode { @@ -1393,6 +1379,7 @@ impl fmt::Debug for SseOpcode { SseOpcode::Pextrb => "pextrb", SseOpcode::Pextrw => "pextrw", SseOpcode::Pextrd => "pextrd", + SseOpcode::Pextrq => "pextrq", SseOpcode::Pinsrb => "pinsrb", SseOpcode::Pinsrw => "pinsrw", SseOpcode::Pinsrd => "pinsrd", diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs index ebbbf16b26..1cb6b34f2b 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit.rs @@ -1792,8 +1792,6 @@ pub(crate) fn emit( } Inst::XmmUnaryRmRImm { op, src, dst, imm } => { - debug_assert!(!op.uses_src1()); - let dst = allocs.next(dst.to_reg().to_reg()); let src = src.clone().to_reg_mem().with_allocs(allocs); let rex = RexFlags::clear_w(); @@ -1803,6 +1801,7 @@ pub(crate) fn emit( SseOpcode::Roundss => (LegacyPrefixes::_66, 0x0F3A0A, 3), SseOpcode::Roundpd => 
(LegacyPrefixes::_66, 0x0F3A09, 3), SseOpcode::Roundsd => (LegacyPrefixes::_66, 0x0F3A0B, 3), + SseOpcode::Pshufd => (LegacyPrefixes::_66, 0x0F70, 2), _ => unimplemented!("Opcode {:?} not implemented", op), }; match src { @@ -2458,17 +2457,10 @@ pub(crate) fn emit( imm, size, } => { - let (src2, dst) = if !op.uses_src1() { - let dst = allocs.next(dst.to_reg()); - let src2 = src2.with_allocs(allocs); - (src2, dst) - } else { - let src1 = allocs.next(*src1); - let dst = allocs.next(dst.to_reg()); - let src2 = src2.with_allocs(allocs); - debug_assert_eq!(src1, dst); - (src2, dst) - }; + let src1 = allocs.next(*src1); + let dst = allocs.next(dst.to_reg()); + let src2 = src2.with_allocs(allocs); + debug_assert_eq!(src1, dst); let (prefix, opcode, len) = match op { SseOpcode::Cmpps => (LegacyPrefixes::None, 0x0FC2, 2), @@ -2480,10 +2472,6 @@ pub(crate) fn emit( SseOpcode::Pinsrb => (LegacyPrefixes::_66, 0x0F3A20, 3), SseOpcode::Pinsrw => (LegacyPrefixes::_66, 0x0FC4, 2), SseOpcode::Pinsrd => (LegacyPrefixes::_66, 0x0F3A22, 3), - SseOpcode::Pextrb => (LegacyPrefixes::_66, 0x0F3A14, 3), - SseOpcode::Pextrw => (LegacyPrefixes::_66, 0x0FC5, 2), - SseOpcode::Pextrd => (LegacyPrefixes::_66, 0x0F3A16, 3), - SseOpcode::Pshufd => (LegacyPrefixes::_66, 0x0F70, 2), SseOpcode::Shufps => (LegacyPrefixes::None, 0x0FC6, 2), _ => unimplemented!("Opcode {:?} not implemented", op), }; @@ -2566,6 +2554,26 @@ pub(crate) fn emit( emit_std_reg_reg(sink, prefix, opcode, 2, src, dst, rex); } + Inst::XmmToGprImm { op, src, dst, imm } => { + use OperandSize as OS; + + let src = allocs.next(src.to_reg()); + let dst = allocs.next(dst.to_reg().to_reg()); + + let (prefix, opcode, opcode_bytes, dst_size, dst_first) = match op { + SseOpcode::Pextrb => (LegacyPrefixes::_66, 0x0F3A14, 3, OS::Size32, false), + SseOpcode::Pextrw => (LegacyPrefixes::_66, 0x0FC5, 2, OS::Size32, true), + SseOpcode::Pextrd => (LegacyPrefixes::_66, 0x0F3A16, 3, OS::Size32, false), + SseOpcode::Pextrq => (LegacyPrefixes::_66, 
0x0F3A16, 3, OS::Size64, false), + _ => panic!("unexpected opcode {:?}", op), + }; + let rex = RexFlags::from(dst_size); + let (src, dst) = if dst_first { (dst, src) } else { (src, dst) }; + + emit_std_reg_reg(sink, prefix, opcode, opcode_bytes, src, dst, rex); + sink.put1(*imm); + } + Inst::GprToXmm { op, src: src_e, diff --git a/cranelift/codegen/src/isa/x64/inst/mod.rs b/cranelift/codegen/src/isa/x64/inst/mod.rs index 217b4b4db6..a717a64c6c 100644 --- a/cranelift/codegen/src/isa/x64/inst/mod.rs +++ b/cranelift/codegen/src/isa/x64/inst/mod.rs @@ -136,6 +136,7 @@ impl Inst { | Inst::XmmRmRBlend { op, .. } | Inst::XmmRmRImm { op, .. } | Inst::XmmToGpr { op, .. } + | Inst::XmmToGprImm { op, .. } | Inst::XmmUnaryRmRImm { op, .. } | Inst::XmmUnaryRmR { op, .. } | Inst::XmmConstOp { op, .. } => smallvec![op.available_from()], @@ -1111,15 +1112,11 @@ impl PrettyPrint for Inst { size, .. } => { - let src1 = if op.uses_src1() { - pretty_print_reg(*src1, 8, allocs) + ", " - } else { - "".into() - }; + let src1 = pretty_print_reg(*src1, 8, allocs); let dst = pretty_print_reg(dst.to_reg(), 8, allocs); let src2 = src2.pretty_print(8, allocs); format!( - "{} ${}, {}{}, {}", + "{} ${imm}, {src1}, {src2}, {dst}", ljustify(format!( "{}{}", op.to_string(), @@ -1129,10 +1126,6 @@ impl PrettyPrint for Inst { "" } )), - imm, - src1, - src2, - dst, ) } @@ -1153,6 +1146,12 @@ impl PrettyPrint for Inst { format!("{} {}, {}", ljustify(op.to_string()), src, dst) } + Inst::XmmToGprImm { op, src, dst, imm } => { + let src = pretty_print_reg(src.to_reg(), 8, allocs); + let dst = pretty_print_reg(dst.to_reg().to_reg(), 8, allocs); + format!("{} ${imm}, {}, {}", ljustify(op.to_string()), src, dst) + } + Inst::GprToXmm { op, src, @@ -1976,23 +1975,11 @@ fn x64_get_operands VReg>(inst: &Inst, collector: &mut OperandCol src1.get_operands(collector); } Inst::XmmRmRImm { - op, - src1, - src2, - dst, - .. + src1, src2, dst, .. 
} => { - if !op.uses_src1() { - // FIXME: split this instruction into two, so we don't - // need this awkward src1-is-only-sometimes-an-arg - // behavior. - collector.reg_def(*dst); - src2.get_operands(collector); - } else { - collector.reg_use(*src1); - collector.reg_reuse_def(*dst, 0); - src2.get_operands(collector); - } + collector.reg_use(*src1); + collector.reg_reuse_def(*dst, 0); + src2.get_operands(collector); } Inst::XmmConstOp { dst, .. } => { collector.reg_def(dst.to_writable_reg()); @@ -2035,7 +2022,7 @@ fn x64_get_operands VReg>(inst: &Inst, collector: &mut OperandCol collector.reg_use(src.to_reg()); collector.reg_fixed_nonallocatable(*dst); } - Inst::XmmToGpr { src, dst, .. } => { + Inst::XmmToGpr { src, dst, .. } | Inst::XmmToGprImm { src, dst, .. } => { collector.reg_use(src.to_reg()); collector.reg_def(dst.to_writable_reg()); } diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle index 2dd9fc1bfe..2260fc3975 100644 --- a/cranelift/codegen/src/isa/x64/lower.isle +++ b/cranelift/codegen/src/isa/x64/lower.isle @@ -684,8 +684,8 @@ ;; (TODO: when EVEX support is available, add an alternate lowering here). (rule (lower (has_type $I64X2 (sshr src amt))) (let ((src_ Xmm (put_in_xmm src)) - (lo Gpr (x64_pextrd $I64 src_ 0)) - (hi Gpr (x64_pextrd $I64 src_ 1)) + (lo Gpr (x64_pextrq src_ 0)) + (hi Gpr (x64_pextrq src_ 1)) (amt_ Imm8Gpr (put_masked_in_imm8_gpr amt $I64)) (shifted_lo Gpr (x64_sar $I64 lo amt_)) (shifted_hi Gpr (x64_sar $I64 hi amt_))) @@ -921,12 +921,8 @@ x)) (swiden_high (and (value_type (multi_lane 32 4)) y))))) - (let ((x2 Xmm (x64_pshufd x - 0xFA - (OperandSize.Size32))) - (y2 Xmm (x64_pshufd y - 0xFA - (OperandSize.Size32)))) + (let ((x2 Xmm (x64_pshufd x 0xFA)) + (y2 Xmm (x64_pshufd y 0xFA))) (x64_pmuldq x2 y2))) ;; Special case for `i16x8.extmul_low_i8x16_s`. 
@@ -957,12 +953,8 @@ x)) (swiden_low (and (value_type (multi_lane 32 4)) y))))) - (let ((x2 Xmm (x64_pshufd x - 0x50 - (OperandSize.Size32))) - (y2 Xmm (x64_pshufd y - 0x50 - (OperandSize.Size32)))) + (let ((x2 Xmm (x64_pshufd x 0x50)) + (y2 Xmm (x64_pshufd y 0x50))) (x64_pmuldq x2 y2))) ;; Special case for `i16x8.extmul_high_i8x16_u`. @@ -997,12 +989,8 @@ x)) (uwiden_high (and (value_type (multi_lane 32 4)) y))))) - (let ((x2 Xmm (x64_pshufd x - 0xFA - (OperandSize.Size32))) - (y2 Xmm (x64_pshufd y - 0xFA - (OperandSize.Size32)))) + (let ((x2 Xmm (x64_pshufd x 0xFA)) + (y2 Xmm (x64_pshufd y 0xFA))) (x64_pmuludq x2 y2))) ;; Special case for `i16x8.extmul_low_i8x16_u`. @@ -1033,12 +1021,8 @@ x)) (uwiden_low (and (value_type (multi_lane 32 4)) y))))) - (let ((x2 Xmm (x64_pshufd x - 0x50 - (OperandSize.Size32))) - (y2 Xmm (x64_pshufd y - 0x50 - (OperandSize.Size32)))) + (let ((x2 Xmm (x64_pshufd x 0x50)) + (y2 Xmm (x64_pshufd y 0x50))) (x64_pmuludq x2 y2))) ;;;; Rules for `iabs` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -3161,7 +3145,7 @@ (x64_pmovsxwd (x64_palignr x x 8 (OperandSize.Size32))))) (rule (lower (has_type $I64X2 (swiden_high val @ (value_type $I32X4)))) - (x64_pmovsxdq (x64_pshufd val 0xEE (OperandSize.Size32)))) + (x64_pmovsxdq (x64_pshufd val 0xEE))) ;; Rules for `uwiden_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -3185,7 +3169,7 @@ (x64_pmovzxwd (x64_palignr x x 8 (OperandSize.Size32))))) (rule (lower (has_type $I64X2 (uwiden_high val @ (value_type $I32X4)))) - (x64_pmovzxdq (x64_pshufd val 0xEE (OperandSize.Size32)))) + (x64_pmovzxdq (x64_pshufd val 0xEE))) ;; Rules for `snarrow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -3481,25 +3465,25 @@ ;; Cases 2-4 for an F32X4 (rule 1 (lower (has_type $F32 (extractlane val @ (value_type (ty_vec128 ty)) (u8_from_uimm8 lane)))) - (x64_pshufd val lane (OperandSize.Size32))) + (x64_pshufd val lane)) ;; This is the only remaining case for F64X2 (rule 1 (lower 
(has_type $F64 (extractlane val @ (value_type (ty_vec128 ty)) (u8_from_uimm8 1)))) ;; 0xee == 0b11_10_11_10 - (x64_pshufd val 0xee (OperandSize.Size32))) + (x64_pshufd val 0xee)) (rule 0 (lower (extractlane val @ (value_type ty @ (multi_lane 8 16)) (u8_from_uimm8 lane))) - (x64_pextrb ty val lane)) + (x64_pextrb val lane)) (rule 0 (lower (extractlane val @ (value_type ty @ (multi_lane 16 8)) (u8_from_uimm8 lane))) - (x64_pextrw ty val lane)) + (x64_pextrw val lane)) (rule 0 (lower (extractlane val @ (value_type ty @ (multi_lane 32 4)) (u8_from_uimm8 lane))) - (x64_pextrd ty val lane)) + (x64_pextrd val lane)) (rule 0 (lower (extractlane val @ (value_type ty @ (multi_lane 64 2)) (u8_from_uimm8 lane))) - (x64_pextrd ty val lane)) + (x64_pextrq val lane)) ;; Rules for `scalar_to_vector` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -3537,7 +3521,7 @@ (vec Xmm (vec_insert_lane $I16X8 (xmm_uninit_value) src 0)) (vec Xmm (vec_insert_lane $I16X8 vec src 1))) ;; Shuffle the lowest two lanes to all other lanes. - (x64_pshufd vec 0 (OperandSize.Size32)))) + (x64_pshufd vec 0))) (rule 1 (lower (has_type (multi_lane 32 4) (splat src @ (value_type (ty_scalar_float _))))) (lower_splat_32x4 $F32X4 src)) @@ -3550,7 +3534,7 @@ (let ((src RegMem src) (vec Xmm (vec_insert_lane ty (xmm_uninit_value) src 0))) ;; Shuffle the lowest lane to all other lanes. 
- (x64_pshufd vec 0 (OperandSize.Size32)))) + (x64_pshufd vec 0))) (rule 1 (lower (has_type (multi_lane 64 2) (splat src @ (value_type (ty_scalar_float _))))) (lower_splat_64x2 $F64X2 src)) diff --git a/cranelift/filetests/filetests/isa/x64/extractlane.clif b/cranelift/filetests/filetests/isa/x64/extractlane.clif index abe9882d36..1cbdfbf7d7 100644 --- a/cranelift/filetests/filetests/isa/x64/extractlane.clif +++ b/cranelift/filetests/filetests/isa/x64/extractlane.clif @@ -86,7 +86,7 @@ block0(v0: i64x2): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; pextrd.w $1, %xmm0, %rax +; pextrq $1, %xmm0, %rax ; movq %rbp, %rsp ; popq %rbp ; ret diff --git a/cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif b/cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif index 24950d7732..1ecdf31ed0 100644 --- a/cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif +++ b/cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif @@ -753,8 +753,8 @@ block0(v0: i64x2): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; pextrd.w $0, %xmm0, %rdx -; pextrd.w $1, %xmm0, %r9 +; pextrq $0, %xmm0, %rdx +; pextrq $1, %xmm0, %r9 ; sarq $36, %rdx, %rdx ; sarq $36, %r9, %r9 ; uninit %xmm0 @@ -789,8 +789,8 @@ block0(v0: i64x2, v1: i32): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; pextrd.w $0, %xmm0, %r8 -; pextrd.w $1, %xmm0, %r10 +; pextrq $0, %xmm0, %r8 +; pextrq $1, %xmm0, %r10 ; movq %rdi, %rcx ; sarq %cl, %r8, %r8 ; sarq %cl, %r10, %r10