diff --git a/cranelift/codegen/src/isa/x64/inst.isle b/cranelift/codegen/src/isa/x64/inst.isle index 46e37bea5b..324c86900e 100644 --- a/cranelift/codegen/src/isa/x64/inst.isle +++ b/cranelift/codegen/src/isa/x64/inst.isle @@ -162,7 +162,7 @@ ;; Arithmetic SIMD shifts. (XmmRmiReg (opcode SseOpcode) (src1 Xmm) - (src2 XmmMemImm) + (src2 XmmMemAlignedImm) (dst WritableXmm)) ;; Integer comparisons/tests: cmp or test (b w l q) (reg addr imm) reg. @@ -193,7 +193,7 @@ ;; XMM conditional move; overwrites the destination register. (XmmCmove (ty Type) (cc CC) - (consequent XmmMem) + (consequent XmmMemAligned) (alternative Xmm) (dst WritableXmm)) @@ -218,9 +218,15 @@ ;; (32 64) (reg addr) reg (XmmRmR (op SseOpcode) (src1 Xmm) - (src2 XmmMem) + (src2 XmmMemAligned) (dst WritableXmm)) + ;; Same as `XmmRmR` except the memory operand can be unaligned + (XmmRmRUnaligned (op SseOpcode) + (src1 Xmm) + (src2 XmmMem) + (dst WritableXmm)) + ;; XMM (scalar or vector) production of a constant value by operating ;; on a register with itself. ;; @@ -235,7 +241,7 @@ (XmmRmRBlend (op SseOpcode) (src1 Xmm) - (src2 XmmMem) + (src2 XmmMemAligned) (mask Xmm) (dst WritableXmm)) @@ -300,16 +306,22 @@ ;; not have to be a previously valid value. This is characteristic of mov ;; instructions. (XmmUnaryRmR (op SseOpcode) - (src XmmMem) + (src XmmMemAligned) (dst WritableXmm)) + ;; Same as `XmmUnaryRmR` but used for opcodes where the memory address + ;; can be unaligned. + (XmmUnaryRmRUnaligned (op SseOpcode) + (src XmmMem) + (dst WritableXmm)) + ;; XMM (scalar or vector) unary op with immediate: roundss, roundsd, etc. ;; ;; This differs from XMM_RM_R_IMM in that the dst register of ;; XmmUnaryRmRImm is not used in the computation of the instruction dst ;; value and so does not have to be a previously valid value. (XmmUnaryRmRImm (op SseOpcode) - (src XmmMem) + (src XmmMemAligned) (imm u8) (dst WritableXmm)) @@ -380,7 +392,7 @@ ;; Float comparisons/tests: cmp (b w l q) (reg addr imm) reg. 
(XmmCmpRmR (op SseOpcode) - (src XmmMem) + (src XmmMemAligned) (dst Xmm)) ;; A binary XMM instruction with an 8-bit immediate: e.g. cmp (ps pd) imm @@ -1334,7 +1346,9 @@ (type WritableXmm (primitive WritableXmm)) (type OptionWritableXmm (primitive OptionWritableXmm)) (type XmmMem extern (enum)) +(type XmmMemAligned extern (enum)) (type XmmMemImm extern (enum)) +(type XmmMemAlignedImm extern (enum)) ;; Convert an `Imm8Reg` into an `Imm8Gpr`. (decl imm8_reg_to_imm8_gpr (Imm8Reg) Imm8Gpr) @@ -1384,6 +1398,25 @@ (decl xmm_mem_to_xmm_mem_imm (XmmMem) XmmMemImm) (extern constructor xmm_mem_to_xmm_mem_imm xmm_mem_to_xmm_mem_imm) +;; Convert an `XmmMem` into an `XmmMemAligned`. +;; +;; Note that this is an infallible conversion, not a fallible one. If the +;; original `XmmMem` source is a register, then it's passed through directly. +;; If it's `Mem` and refers to aligned memory, it's also passed through +;; directly. Otherwise, though, it's a memory source which is not aligned to +;; 16 bytes so a load is performed and the temporary register which is the +;; result of the load is passed through. The end-result is that the return value +;; here is guaranteed to be a register or an aligned memory location. +(decl xmm_mem_to_xmm_mem_aligned (XmmMem) XmmMemAligned) +(extern constructor xmm_mem_to_xmm_mem_aligned xmm_mem_to_xmm_mem_aligned) + +;; Convert an `XmmMemImm` into an `XmmMemAlignedImm`. +;; +;; Note that this is the same as `xmm_mem_to_xmm_mem_aligned` except it handles +;; an immediate case as well. +(decl xmm_mem_imm_to_xmm_mem_aligned_imm (XmmMemImm) XmmMemAlignedImm) +(extern constructor xmm_mem_imm_to_xmm_mem_aligned_imm xmm_mem_imm_to_xmm_mem_aligned_imm) + ;; Allocate a new temporary GPR register. 
(decl temp_writable_gpr () WritableGpr) (extern constructor temp_writable_gpr temp_writable_gpr) @@ -1801,23 +1834,19 @@ dst)) (rule 2 (x64_load $F32 addr _ext_kind) - (xmm_unary_rm_r (SseOpcode.Movss) - addr)) + (xmm_unary_rm_r_unaligned (SseOpcode.Movss) addr)) (rule 2 (x64_load $F64 addr _ext_kind) - (xmm_unary_rm_r (SseOpcode.Movsd) - addr)) + (xmm_unary_rm_r_unaligned (SseOpcode.Movsd) addr)) (rule 2 (x64_load $F32X4 addr _ext_kind) - (xmm_unary_rm_r (SseOpcode.Movups) - addr)) + (xmm_unary_rm_r_unaligned (SseOpcode.Movups) addr)) (rule 2 (x64_load $F64X2 addr _ext_kind) - (xmm_unary_rm_r (SseOpcode.Movupd) - addr)) + (xmm_unary_rm_r_unaligned (SseOpcode.Movupd) addr)) (rule 0 (x64_load (multi_lane _bits _lanes) addr _ext_kind) - (xmm_unary_rm_r (SseOpcode.Movdqu) addr)) + (xmm_unary_rm_r_unaligned (SseOpcode.Movdqu) addr)) (decl x64_mov (Amode) Reg) (rule (x64_mov addr) @@ -1839,19 +1868,19 @@ (decl x64_movss_load (XmmMem) Xmm) (rule (x64_movss_load from) - (xmm_unary_rm_r (SseOpcode.Movss) from)) + (xmm_unary_rm_r_unaligned (SseOpcode.Movss) from)) (decl x64_movsd_load (XmmMem) Xmm) (rule (x64_movsd_load from) - (xmm_unary_rm_r (SseOpcode.Movsd) from)) + (xmm_unary_rm_r_unaligned (SseOpcode.Movsd) from)) (decl x64_movups (XmmMem) Xmm) (rule (x64_movups from) - (xmm_unary_rm_r (SseOpcode.Movups) from)) + (xmm_unary_rm_r_unaligned (SseOpcode.Movups) from)) (decl x64_movupd (XmmMem) Xmm) (rule (x64_movupd from) - (xmm_unary_rm_r (SseOpcode.Movupd) from)) + (xmm_unary_rm_r_unaligned (SseOpcode.Movupd) from)) (decl x64_movd (Xmm) Gpr) (rule (x64_movd from) @@ -1859,7 +1888,7 @@ (decl x64_movdqu (XmmMem) Xmm) (rule (x64_movdqu from) - (xmm_unary_rm_r (SseOpcode.Movdqu) from)) + (xmm_unary_rm_r_unaligned (SseOpcode.Movdqu) from)) (decl x64_movapd (XmmMem) Xmm) (rule (x64_movapd src) @@ -1867,27 +1896,27 @@ (decl x64_pmovsxbw (XmmMem) Xmm) (rule (x64_pmovsxbw from) - (xmm_unary_rm_r (SseOpcode.Pmovsxbw) from)) + (xmm_unary_rm_r_unaligned (SseOpcode.Pmovsxbw) from)) 
(decl x64_pmovzxbw (XmmMem) Xmm) (rule (x64_pmovzxbw from) - (xmm_unary_rm_r (SseOpcode.Pmovzxbw) from)) + (xmm_unary_rm_r_unaligned (SseOpcode.Pmovzxbw) from)) (decl x64_pmovsxwd (XmmMem) Xmm) (rule (x64_pmovsxwd from) - (xmm_unary_rm_r (SseOpcode.Pmovsxwd) from)) + (xmm_unary_rm_r_unaligned (SseOpcode.Pmovsxwd) from)) (decl x64_pmovzxwd (XmmMem) Xmm) (rule (x64_pmovzxwd from) - (xmm_unary_rm_r (SseOpcode.Pmovzxwd) from)) + (xmm_unary_rm_r_unaligned (SseOpcode.Pmovzxwd) from)) (decl x64_pmovsxdq (XmmMem) Xmm) (rule (x64_pmovsxdq from) - (xmm_unary_rm_r (SseOpcode.Pmovsxdq) from)) + (xmm_unary_rm_r_unaligned (SseOpcode.Pmovsxdq) from)) (decl x64_pmovzxdq (XmmMem) Xmm) (rule (x64_pmovzxdq from) - (xmm_unary_rm_r (SseOpcode.Pmovzxdq) from)) + (xmm_unary_rm_r_unaligned (SseOpcode.Pmovzxdq) from)) (decl x64_movrm (Type SyntheticAmode Gpr) SideEffectNoResult) (rule (x64_movrm ty addr data) @@ -2178,7 +2207,7 @@ (cmp_rmi_r size (CmpOpcode.Cmp) (RegMemImm.Imm src1) src2)) ;; Helper for creating `MInst.XmmCmpRmR` instructions. -(decl xmm_cmp_rm_r (SseOpcode XmmMem Xmm) ProducesFlags) +(decl xmm_cmp_rm_r (SseOpcode XmmMemAligned Xmm) ProducesFlags) (rule (xmm_cmp_rm_r opcode src1 src2) (ProducesFlags.ProducesFlagsSideEffect (MInst.XmmCmpRmR opcode src1 src2))) @@ -2213,7 +2242,7 @@ (MInst.Cmove size cc consequent alternative dst) dst))) -(decl cmove_xmm (Type CC XmmMem Xmm) ConsumesFlags) +(decl cmove_xmm (Type CC XmmMemAligned Xmm) ConsumesFlags) (rule (cmove_xmm ty cc consequent alternative) (let ((dst WritableXmm (temp_writable_xmm))) (ConsumesFlags.ConsumesFlagsReturnsReg @@ -2266,7 +2295,7 @@ cmove2 dst))) -(decl cmove_or_xmm (Type CC CC XmmMem Xmm) ConsumesFlags) +(decl cmove_or_xmm (Type CC CC XmmMemAligned Xmm) ConsumesFlags) (rule (cmove_or_xmm ty cc1 cc2 consequent alternative) (let ((dst WritableXmm (temp_writable_xmm)) (tmp WritableXmm (temp_writable_xmm)) @@ -2324,12 +2353,19 @@ dst))) ;; Helper for creating `MInst.XmmRmR` instructions. 
-(decl xmm_rm_r (SseOpcode Xmm XmmMem) Xmm) +(decl xmm_rm_r (SseOpcode Xmm XmmMemAligned) Xmm) (rule (xmm_rm_r op src1 src2) (let ((dst WritableXmm (temp_writable_xmm)) (_ Unit (emit (MInst.XmmRmR op src1 src2 dst)))) dst)) +;; Helper for creating `MInst.XmmRmRUnaligned` instructions. +(decl xmm_rm_r_unaligned (SseOpcode Xmm XmmMem) Xmm) +(rule (xmm_rm_r_unaligned op src1 src2) + (let ((dst WritableXmm (temp_writable_xmm)) + (_ Unit (emit (MInst.XmmRmRUnaligned op src1 src2 dst)))) + dst)) + ;; Helper for creating `paddb` instructions. (decl x64_paddb (Xmm XmmMem) Xmm) (rule 0 (x64_paddb src1 src2) @@ -2653,12 +2689,12 @@ ;; Helper for creating `addss` instructions. (decl x64_addss (Xmm XmmMem) Xmm) (rule (x64_addss src1 src2) - (xmm_rm_r (SseOpcode.Addss) src1 src2)) + (xmm_rm_r_unaligned (SseOpcode.Addss) src1 src2)) ;; Helper for creating `addsd` instructions. (decl x64_addsd (Xmm XmmMem) Xmm) (rule (x64_addsd src1 src2) - (xmm_rm_r (SseOpcode.Addsd) src1 src2)) + (xmm_rm_r_unaligned (SseOpcode.Addsd) src1 src2)) ;; Helper for creating `addps` instructions. (decl x64_addps (Xmm XmmMem) Xmm) @@ -2679,12 +2715,12 @@ ;; Helper for creating `subss` instructions. (decl x64_subss (Xmm XmmMem) Xmm) (rule (x64_subss src1 src2) - (xmm_rm_r (SseOpcode.Subss) src1 src2)) + (xmm_rm_r_unaligned (SseOpcode.Subss) src1 src2)) ;; Helper for creating `subsd` instructions. (decl x64_subsd (Xmm XmmMem) Xmm) (rule (x64_subsd src1 src2) - (xmm_rm_r (SseOpcode.Subsd) src1 src2)) + (xmm_rm_r_unaligned (SseOpcode.Subsd) src1 src2)) ;; Helper for creating `subps` instructions. (decl x64_subps (Xmm XmmMem) Xmm) @@ -2705,12 +2741,12 @@ ;; Helper for creating `mulss` instructions. (decl x64_mulss (Xmm XmmMem) Xmm) (rule (x64_mulss src1 src2) - (xmm_rm_r (SseOpcode.Mulss) src1 src2)) + (xmm_rm_r_unaligned (SseOpcode.Mulss) src1 src2)) ;; Helper for creating `mulsd` instructions. 
(decl x64_mulsd (Xmm XmmMem) Xmm) (rule (x64_mulsd src1 src2) - (xmm_rm_r (SseOpcode.Mulsd) src1 src2)) + (xmm_rm_r_unaligned (SseOpcode.Mulsd) src1 src2)) ;; Helper for creating `mulps` instructions. (decl x64_mulps (Xmm XmmMem) Xmm) @@ -2731,12 +2767,12 @@ ;; Helper for creating `divss` instructions. (decl x64_divss (Xmm XmmMem) Xmm) (rule (x64_divss src1 src2) - (xmm_rm_r (SseOpcode.Divss) src1 src2)) + (xmm_rm_r_unaligned (SseOpcode.Divss) src1 src2)) ;; Helper for creating `divsd` instructions. (decl x64_divsd (Xmm XmmMem) Xmm) (rule (x64_divsd src1 src2) - (xmm_rm_r (SseOpcode.Divsd) src1 src2)) + (xmm_rm_r_unaligned (SseOpcode.Divsd) src1 src2)) ;; Helper for creating `divps` instructions. (decl x64_divps (Xmm XmmMem) Xmm) @@ -2755,7 +2791,7 @@ (xmm_rmir_vex (AvxOpcode.Vdivpd) src1 src2)) ;; Helper for creating `XmmRmRBlend` instructions -(decl xmm_rm_r_blend (SseOpcode Xmm XmmMem Xmm) Xmm) +(decl xmm_rm_r_blend (SseOpcode Xmm XmmMemAligned Xmm) Xmm) (rule (xmm_rm_r_blend op src1 src2 mask) (let ((dst WritableXmm (temp_writable_xmm)) (_ Unit (emit (MInst.XmmRmRBlend op src1 src2 mask dst)))) @@ -2801,7 +2837,7 @@ ;; Helper for creating `movsd` instructions. (decl x64_movsd_regmove (Xmm XmmMem) Xmm) (rule (x64_movsd_regmove src1 src2) - (xmm_rm_r (SseOpcode.Movsd) src1 src2)) + (xmm_rm_r_unaligned (SseOpcode.Movsd) src1 src2)) ;; Helper for creating `movlhps` instructions. (decl x64_movlhps (Xmm XmmMem) Xmm) @@ -3063,7 +3099,7 @@ dst)) ;; Helper for constructing `XmmUnaryRmRImm` instructions. -(decl xmm_unary_rm_r_imm (SseOpcode XmmMem u8) Xmm) +(decl xmm_unary_rm_r_imm (SseOpcode XmmMemAligned u8) Xmm) (rule (xmm_unary_rm_r_imm op src1 imm) (let ((dst WritableXmm (temp_writable_xmm)) (_ Unit (emit (MInst.XmmUnaryRmRImm op src1 imm dst)))) @@ -3142,12 +3178,19 @@ (xmm_rmr_imm_vex (AvxOpcode.Vshufps) src1 src2 byte)) ;; Helper for creating `MInst.XmmUnaryRmR` instructions. 
-(decl xmm_unary_rm_r (SseOpcode XmmMem) Xmm) +(decl xmm_unary_rm_r (SseOpcode XmmMemAligned) Xmm) (rule (xmm_unary_rm_r op src) (let ((dst WritableXmm (temp_writable_xmm)) (_ Unit (emit (MInst.XmmUnaryRmR op src dst)))) dst)) +;; Helper for creating `MInst.XmmUnaryRmRUnaligned` instructions. +(decl xmm_unary_rm_r_unaligned (SseOpcode XmmMem) Xmm) +(rule (xmm_unary_rm_r_unaligned op src) + (let ((dst WritableXmm (temp_writable_xmm)) + (_ Unit (emit (MInst.XmmUnaryRmRUnaligned op src dst)))) + dst)) + ;; Helper for creating `pabsb` instructions. (decl x64_pabsb (XmmMem) Xmm) (rule (x64_pabsb src) @@ -3240,7 +3283,7 @@ (mul_hi ty $false src1 src2)) ;; Helper for creating `MInst.XmmRmiXmm` instructions. -(decl xmm_rmi_xmm (SseOpcode Xmm XmmMemImm) Xmm) +(decl xmm_rmi_xmm (SseOpcode Xmm XmmMemAlignedImm) Xmm) (rule (xmm_rmi_xmm op src1 src2) (let ((dst WritableXmm (temp_writable_xmm)) (_ Unit (emit (MInst.XmmRmiReg op @@ -3482,22 +3525,17 @@ dst)) ;; Helper for creating `minss` instructions. -(decl x64_minss (Xmm Xmm) Xmm) +(decl x64_minss (Xmm XmmMem) Xmm) (rule (x64_minss x y) - (let ((dst WritableXmm (temp_writable_xmm)) - (_ Unit (emit (MInst.XmmRmR (SseOpcode.Minss) x y dst)))) - dst)) + (xmm_rm_r_unaligned (SseOpcode.Minss) x y)) ;; Helper for creating `minsd` instructions. -(decl x64_minsd (Xmm Xmm) Xmm) +(decl x64_minsd (Xmm XmmMem) Xmm) (rule (x64_minsd x y) - (let ((dst WritableXmm (temp_writable_xmm)) - (_ Unit (emit (MInst.XmmRmR (SseOpcode.Minsd) x y dst)))) - dst)) - + (xmm_rm_r_unaligned (SseOpcode.Minsd) x y)) ;; Helper for creating `minps` instructions. -(decl x64_minps (Xmm Xmm) Xmm) +(decl x64_minps (Xmm XmmMem) Xmm) (rule 0 (x64_minps x y) (xmm_rm_r (SseOpcode.Minps) x y)) (rule 1 (x64_minps x y) @@ -3505,7 +3543,7 @@ (xmm_rmir_vex (AvxOpcode.Vminps) x y)) ;; Helper for creating `minpd` instructions. 
-(decl x64_minpd (Xmm Xmm) Xmm) +(decl x64_minpd (Xmm XmmMem) Xmm) (rule 0 (x64_minpd x y) (xmm_rm_r (SseOpcode.Minpd) x y)) (rule 1 (x64_minpd x y) @@ -3513,17 +3551,17 @@ (xmm_rmir_vex (AvxOpcode.Vminpd) x y)) ;; Helper for creating `maxss` instructions. -(decl x64_maxss (Xmm Xmm) Xmm) +(decl x64_maxss (Xmm XmmMem) Xmm) (rule (x64_maxss x y) - (xmm_rm_r (SseOpcode.Maxss) x y)) + (xmm_rm_r_unaligned (SseOpcode.Maxss) x y)) ;; Helper for creating `maxsd` instructions. -(decl x64_maxsd (Xmm Xmm) Xmm) +(decl x64_maxsd (Xmm XmmMem) Xmm) (rule (x64_maxsd x y) - (xmm_rm_r (SseOpcode.Maxsd) x y)) + (xmm_rm_r_unaligned (SseOpcode.Maxsd) x y)) ;; Helper for creating `maxps` instructions. -(decl x64_maxps (Xmm Xmm) Xmm) +(decl x64_maxps (Xmm XmmMem) Xmm) (rule 0 (x64_maxps x y) (xmm_rm_r (SseOpcode.Maxps) x y)) (rule 1 (x64_maxps x y) @@ -3531,7 +3569,7 @@ (xmm_rmir_vex (AvxOpcode.Vmaxps) x y)) ;; Helper for creating `maxpd` instructions. -(decl x64_maxpd (Xmm Xmm) Xmm) +(decl x64_maxpd (Xmm XmmMem) Xmm) (rule 0 (x64_maxpd x y) (xmm_rm_r (SseOpcode.Maxpd) x y)) (rule 1 (x64_maxpd x y) @@ -3590,19 +3628,19 @@ ;; Helper for creating `sqrtss` instructions. -(decl x64_sqrtss (Xmm) Xmm) -(rule (x64_sqrtss x) (xmm_unary_rm_r (SseOpcode.Sqrtss) x)) +(decl x64_sqrtss (XmmMem) Xmm) +(rule (x64_sqrtss x) (xmm_unary_rm_r_unaligned (SseOpcode.Sqrtss) x)) ;; Helper for creating `sqrtsd` instructions. -(decl x64_sqrtsd (Xmm) Xmm) -(rule (x64_sqrtsd x) (xmm_unary_rm_r (SseOpcode.Sqrtsd) x)) +(decl x64_sqrtsd (XmmMem) Xmm) +(rule (x64_sqrtsd x) (xmm_unary_rm_r_unaligned (SseOpcode.Sqrtsd) x)) ;; Helper for creating `sqrtps` instructions. -(decl x64_sqrtps (Xmm) Xmm) +(decl x64_sqrtps (XmmMem) Xmm) (rule (x64_sqrtps x) (xmm_unary_rm_r (SseOpcode.Sqrtps) x)) ;; Helper for creating `sqrtpd` instructions. -(decl x64_sqrtpd (Xmm) Xmm) +(decl x64_sqrtpd (XmmMem) Xmm) (rule (x64_sqrtpd x) (xmm_unary_rm_r (SseOpcode.Sqrtpd) x)) ;; Helper for creating `cvtss2sd` instructions. 
@@ -3614,19 +3652,19 @@ (rule (x64_cvtsd2ss x) (xmm_unary_rm_r (SseOpcode.Cvtsd2ss) x)) ;; Helper for creating `cvtdq2ps` instructions. -(decl x64_cvtdq2ps (Xmm) Xmm) +(decl x64_cvtdq2ps (XmmMem) Xmm) (rule (x64_cvtdq2ps x) (xmm_unary_rm_r (SseOpcode.Cvtdq2ps) x)) ;; Helper for creating `cvtps2pd` instructions. -(decl x64_cvtps2pd (Xmm) Xmm) +(decl x64_cvtps2pd (XmmMem) Xmm) (rule (x64_cvtps2pd x) (xmm_unary_rm_r (SseOpcode.Cvtps2pd) x)) ;; Helper for creating `cvtpd2ps` instructions. -(decl x64_cvtpd2ps (Xmm) Xmm) +(decl x64_cvtpd2ps (XmmMem) Xmm) (rule (x64_cvtpd2ps x) (xmm_unary_rm_r (SseOpcode.Cvtpd2ps) x)) ;; Helper for creating `cvtdq2pd` instructions. -(decl x64_cvtdq2pd (Xmm) Xmm) +(decl x64_cvtdq2pd (XmmMem) Xmm) (rule (x64_cvtdq2pd x) (xmm_unary_rm_r (SseOpcode.Cvtdq2pd) x)) ;; Helper for creating `cvtsi2ss` instructions. @@ -3640,8 +3678,8 @@ (gpr_to_xmm (SseOpcode.Cvtsi2sd) x (raw_operand_size_of_type ty))) ;; Helper for creating `cvttps2dq` instructions. -(decl x64_cvttps2dq (Type XmmMem) Xmm) -(rule (x64_cvttps2dq ty x) +(decl x64_cvttps2dq (XmmMem) Xmm) +(rule (x64_cvttps2dq x) (xmm_unary_rm_r (SseOpcode.Cvttps2dq) x)) ;; Helper for creating `cvttpd2dq` instructions. 
@@ -4262,6 +4300,7 @@ (convert Xmm InstOutput output_xmm) (convert Value Xmm put_in_xmm) (convert Value XmmMem put_in_xmm_mem) +(convert Value XmmMemAligned put_in_xmm_mem_aligned) (convert Value XmmMemImm put_in_xmm_mem_imm) (convert Xmm Reg xmm_to_reg) (convert Xmm RegMem xmm_to_reg_mem) @@ -4272,6 +4311,7 @@ (convert RegMemImm XmmMemImm mov_rmi_to_xmm) (convert Xmm XmmMem xmm_to_xmm_mem) (convert Xmm XmmMemImm xmm_to_xmm_mem_imm) +(convert Xmm XmmMemAligned xmm_to_xmm_mem_aligned) (convert XmmMem XmmMemImm xmm_mem_to_xmm_mem_imm) (convert XmmMem RegMem xmm_mem_to_reg_mem) (convert WritableXmm Xmm writable_xmm_to_xmm) @@ -4280,6 +4320,15 @@ (convert WritableXmm XmmMem writable_xmm_to_xmm_mem) (convert WritableXmm ValueRegs writable_xmm_to_value_regs) +;; Note that these conversions will introduce a `movupd` instruction if +;; the memory location is not aligned to a 16-byte boundary. This is primarily +;; used to convert `XmmMem` inputs, which themselves were typically created +;; via the `put_in_xmm_mem` constructor, into operands of SSE instructions. +;; Most pre-AVX instructions working with 16-bytes of data (e.g. full xmm +;; registers) require 16-byte alignment. 
+(convert XmmMem XmmMemAligned xmm_mem_to_xmm_mem_aligned) +(convert XmmMemImm XmmMemAlignedImm xmm_mem_imm_to_xmm_mem_aligned_imm) + (convert Gpr Imm8Gpr gpr_to_imm8_gpr) (convert Imm8Reg Imm8Gpr imm8_reg_to_imm8_gpr) @@ -4288,6 +4337,8 @@ (convert SyntheticAmode GprMem synthetic_amode_to_gpr_mem) (convert Amode XmmMem amode_to_xmm_mem) (convert SyntheticAmode XmmMem synthetic_amode_to_xmm_mem) +(convert Amode XmmMemAligned amode_to_xmm_mem_aligned) +(convert SyntheticAmode XmmMemAligned synthetic_amode_to_xmm_mem_aligned) (convert IntCC CC intcc_to_cc) (convert AtomicRmwOp MachAtomicRmwOp atomic_rmw_op_to_mach_atomic_rmw_op) @@ -4337,6 +4388,15 @@ (decl const_to_synthetic_amode (VCodeConstant) SyntheticAmode) (extern constructor const_to_synthetic_amode const_to_synthetic_amode) +(decl xmm_to_xmm_mem_aligned (Xmm) XmmMemAligned) +(rule (xmm_to_xmm_mem_aligned reg) (xmm_mem_to_xmm_mem_aligned reg)) +(decl amode_to_xmm_mem_aligned (Amode) XmmMemAligned) +(rule (amode_to_xmm_mem_aligned mode) (amode_to_xmm_mem mode)) +(decl synthetic_amode_to_xmm_mem_aligned (SyntheticAmode) XmmMemAligned) +(rule (synthetic_amode_to_xmm_mem_aligned mode) (synthetic_amode_to_xmm_mem mode)) +(decl put_in_xmm_mem_aligned (Value) XmmMemAligned) +(rule (put_in_xmm_mem_aligned val) (put_in_xmm_mem val)) + ;; Helper for creating `MovFromPReg` instructions. (decl mov_from_preg (PReg) Reg) (rule (mov_from_preg preg) diff --git a/cranelift/codegen/src/isa/x64/inst/args.rs b/cranelift/codegen/src/isa/x64/inst/args.rs index 9eaded1210..59b6f6da24 100644 --- a/cranelift/codegen/src/isa/x64/inst/args.rs +++ b/cranelift/codegen/src/isa/x64/inst/args.rs @@ -31,8 +31,8 @@ macro_rules! 
newtype_of_reg { $newtype_reg:ident, $newtype_writable_reg:ident, $newtype_option_writable_reg:ident, - $newtype_reg_mem:ident, - $newtype_reg_mem_imm:ident, + reg_mem: ($($newtype_reg_mem:ident $(aligned:$aligned:ident)?),*), + reg_mem_imm: ($($newtype_reg_mem_imm:ident $(aligned:$aligned_imm:ident)?),*), $newtype_imm8_reg:ident, |$check_reg:ident| $check:expr ) => { @@ -102,108 +102,130 @@ macro_rules! newtype_of_reg { } } - /// A newtype wrapper around `RegMem` for general-purpose registers. - #[derive(Clone, Debug)] - pub struct $newtype_reg_mem(RegMem); + $( + /// A newtype wrapper around `RegMem` for general-purpose registers. + #[derive(Clone, Debug)] + pub struct $newtype_reg_mem(RegMem); - impl From<$newtype_reg_mem> for RegMem { - fn from(rm: $newtype_reg_mem) -> Self { - rm.0 - } - } - - impl From<$newtype_reg> for $newtype_reg_mem { - fn from(r: $newtype_reg) -> Self { - $newtype_reg_mem(RegMem::reg(r.into())) - } - } - - impl $newtype_reg_mem { - /// Construct a `RegMem` newtype from the given `RegMem`, or return - /// `None` if the `RegMem` is not a valid instance of this `RegMem` - /// newtype. - pub fn new(rm: RegMem) -> Option { - match rm { - RegMem::Mem { addr: _ } => Some(Self(rm)), - RegMem::Reg { reg: $check_reg } if $check => Some(Self(rm)), - RegMem::Reg { reg: _ } => None, + impl From<$newtype_reg_mem> for RegMem { + fn from(rm: $newtype_reg_mem) -> Self { + rm.0 } } - /// Convert this newtype into its underlying `RegMem`. - pub fn to_reg_mem(self) -> RegMem { - self.0 - } - - #[allow(dead_code)] // Used by some newtypes and not others. - pub(crate) fn get_operands VReg>( - &self, - collector: &mut OperandCollector<'_, F>, - ) { - self.0.get_operands(collector); - } - } - impl PrettyPrint for $newtype_reg_mem { - fn pretty_print(&self, size: u8, allocs: &mut AllocationConsumer<'_>) -> String { - self.0.pretty_print(size, allocs) - } - } - - /// A newtype wrapper around `RegMemImm`. 
- #[derive(Clone, Debug)] - pub struct $newtype_reg_mem_imm(RegMemImm); - - impl From<$newtype_reg_mem_imm> for RegMemImm { - fn from(rmi: $newtype_reg_mem_imm) -> RegMemImm { - rmi.0 - } - } - - impl From<$newtype_reg> for $newtype_reg_mem_imm { - fn from(r: $newtype_reg) -> Self { - $newtype_reg_mem_imm(RegMemImm::reg(r.into())) - } - } - - impl From<$newtype_reg_mem> for $newtype_reg_mem_imm { - fn from(r: $newtype_reg_mem) -> Self { - $newtype_reg_mem_imm(r.0.into()) - } - } - - impl $newtype_reg_mem_imm { - /// Construct this newtype from the given `RegMemImm`, or return - /// `None` if the `RegMemImm` is not a valid instance of this - /// newtype. - pub fn new(rmi: RegMemImm) -> Option { - match rmi { - RegMemImm::Imm { .. } => Some(Self(rmi)), - RegMemImm::Mem { addr: _ } => Some(Self(rmi)), - RegMemImm::Reg { reg: $check_reg } if $check => Some(Self(rmi)), - RegMemImm::Reg { reg: _ } => None, + impl From<$newtype_reg> for $newtype_reg_mem { + fn from(r: $newtype_reg) -> Self { + $newtype_reg_mem(RegMem::reg(r.into())) } } - /// Convert this newtype into its underlying `RegMemImm`. - #[allow(dead_code)] // Used by some newtypes and not others. - pub fn to_reg_mem_imm(self) -> RegMemImm { - self.0 + impl $newtype_reg_mem { + /// Construct a `RegMem` newtype from the given `RegMem`, or return + /// `None` if the `RegMem` is not a valid instance of this `RegMem` + /// newtype. + pub fn new(rm: RegMem) -> Option { + match rm { + RegMem::Mem { addr } => { + let mut _allow = true; + $( + if $aligned { + _allow = addr.aligned(); + } + )? + if _allow { + Some(Self(RegMem::Mem { addr })) + } else { + None + } + } + RegMem::Reg { reg: $check_reg } if $check => Some(Self(rm)), + RegMem::Reg { reg: _ } => None, + } + } + + /// Convert this newtype into its underlying `RegMem`. + pub fn to_reg_mem(self) -> RegMem { + self.0 + } + + #[allow(dead_code)] // Used by some newtypes and not others. 
+ pub(crate) fn get_operands VReg>( + &self, + collector: &mut OperandCollector<'_, F>, + ) { + self.0.get_operands(collector); + } + } + impl PrettyPrint for $newtype_reg_mem { + fn pretty_print(&self, size: u8, allocs: &mut AllocationConsumer<'_>) -> String { + self.0.pretty_print(size, allocs) + } + } + )* + + $( + /// A newtype wrapper around `RegMemImm`. + #[derive(Clone, Debug)] + pub struct $newtype_reg_mem_imm(RegMemImm); + + impl From<$newtype_reg_mem_imm> for RegMemImm { + fn from(rmi: $newtype_reg_mem_imm) -> RegMemImm { + rmi.0 + } } - #[allow(dead_code)] // Used by some newtypes and not others. - pub(crate) fn get_operands VReg>( - &self, - collector: &mut OperandCollector<'_, F>, - ) { - self.0.get_operands(collector); + impl From<$newtype_reg> for $newtype_reg_mem_imm { + fn from(r: $newtype_reg) -> Self { + $newtype_reg_mem_imm(RegMemImm::reg(r.into())) + } } - } - impl PrettyPrint for $newtype_reg_mem_imm { - fn pretty_print(&self, size: u8, allocs: &mut AllocationConsumer<'_>) -> String { - self.0.pretty_print(size, allocs) + impl $newtype_reg_mem_imm { + /// Construct this newtype from the given `RegMemImm`, or return + /// `None` if the `RegMemImm` is not a valid instance of this + /// newtype. + pub fn new(rmi: RegMemImm) -> Option { + match rmi { + RegMemImm::Imm { .. } => Some(Self(rmi)), + RegMemImm::Mem { addr } => { + let mut _allow = true; + $( + if $aligned_imm { + _allow = addr.aligned(); + } + )? + if _allow { + Some(Self(RegMemImm::Mem { addr })) + } else { + None + } + } + RegMemImm::Reg { reg: $check_reg } if $check => Some(Self(rmi)), + RegMemImm::Reg { reg: _ } => None, + } + } + + /// Convert this newtype into its underlying `RegMemImm`. + #[allow(dead_code)] // Used by some newtypes and not others. + pub fn to_reg_mem_imm(self) -> RegMemImm { + self.0 + } + + #[allow(dead_code)] // Used by some newtypes and not others. 
+ pub(crate) fn get_operands VReg>( + &self, + collector: &mut OperandCollector<'_, F>, + ) { + self.0.get_operands(collector); + } } - } + + impl PrettyPrint for $newtype_reg_mem_imm { + fn pretty_print(&self, size: u8, allocs: &mut AllocationConsumer<'_>) -> String { + self.0.pretty_print(size, allocs) + } + } + )* /// A newtype wrapper around `Imm8Reg`. #[derive(Clone, Debug)] @@ -242,8 +264,8 @@ newtype_of_reg!( Gpr, WritableGpr, OptionWritableGpr, - GprMem, - GprMemImm, + reg_mem: (GprMem), + reg_mem_imm: (GprMemImm), Imm8Gpr, |reg| reg.class() == RegClass::Int ); @@ -253,8 +275,8 @@ newtype_of_reg!( Xmm, WritableXmm, OptionWritableXmm, - XmmMem, - XmmMemImm, + reg_mem: (XmmMem, XmmMemAligned aligned:true), + reg_mem_imm: (XmmMemImm, XmmMemAlignedImm aligned:true), Imm8Xmm, |reg| reg.class() == RegClass::Float ); @@ -420,6 +442,10 @@ impl Amode { } ret } + + pub(crate) fn aligned(&self) -> bool { + self.get_flags().aligned() + } } impl PrettyPrint for Amode { @@ -531,6 +557,13 @@ impl SyntheticAmode { } } } + + pub(crate) fn aligned(&self) -> bool { + match self { + SyntheticAmode::Real(addr) => addr.aligned(), + SyntheticAmode::NominalSPOffset { .. } | SyntheticAmode::ConstantOffset { .. 
} => true, + } + } } impl Into for Amode { diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs index 1cb6b34f2b..e632833bb1 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit.rs @@ -1731,7 +1731,21 @@ pub(crate) fn emit( sink.bind_label(else_label); } - Inst::XmmUnaryRmR { + Inst::XmmUnaryRmR { op, src, dst } => { + emit( + &Inst::XmmUnaryRmRUnaligned { + op: *op, + src: XmmMem::new(src.clone().into()).unwrap(), + dst: *dst, + }, + allocs, + sink, + info, + state, + ); + } + + Inst::XmmUnaryRmRUnaligned { op, src: src_e, dst: reg_g, @@ -1842,6 +1856,24 @@ pub(crate) fn emit( } Inst::XmmRmR { + op, + src1, + src2, + dst, + } => emit( + &Inst::XmmRmRUnaligned { + op: *op, + dst: *dst, + src1: *src1, + src2: XmmMem::new(src2.clone().to_reg_mem()).unwrap(), + }, + allocs, + sink, + info, + state, + ), + + Inst::XmmRmRUnaligned { op, src1, src2: src_e, diff --git a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs index 1c19fd1820..1ccaf6c7de 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs @@ -34,7 +34,7 @@ impl Inst { debug_assert!(dst.to_reg().class() == RegClass::Float); Inst::XmmUnaryRmRImm { op, - src: XmmMem::new(src).unwrap(), + src: XmmMemAligned::new(src).unwrap(), imm, dst: WritableXmm::from_writable_reg(dst).unwrap(), } @@ -56,7 +56,7 @@ impl Inst { Inst::XmmRmiReg { opcode, src1: Xmm::new(dst.to_reg()).unwrap(), - src2: XmmMemImm::new(src).unwrap(), + src2: XmmMemAlignedImm::new(src).unwrap(), dst: WritableXmm::from_writable_reg(dst).unwrap(), } } @@ -96,7 +96,7 @@ impl Inst { debug_assert!(dst.to_reg().class() == RegClass::Float); Inst::XmmUnaryRmR { op, - src: XmmMem::new(src).unwrap(), + src: XmmMemAligned::new(src).unwrap(), dst: WritableXmm::from_writable_reg(dst).unwrap(), } } @@ -136,7 +136,7 @@ impl Inst { Inst::XmmRmRBlend { op, src1: 
Xmm::new(dst.to_reg()).unwrap(), - src2: XmmMem::new(src2).unwrap(), + src2: XmmMemAligned::new(src2).unwrap(), mask: Xmm::new(regs::xmm0()).unwrap(), dst: WritableXmm::from_writable_reg(dst).unwrap(), } diff --git a/cranelift/codegen/src/isa/x64/inst/mod.rs b/cranelift/codegen/src/isa/x64/inst/mod.rs index a717a64c6c..0bfe391e20 100644 --- a/cranelift/codegen/src/isa/x64/inst/mod.rs +++ b/cranelift/codegen/src/isa/x64/inst/mod.rs @@ -133,11 +133,13 @@ impl Inst { | Inst::XmmMovRM { op, .. } | Inst::XmmRmiReg { opcode: op, .. } | Inst::XmmRmR { op, .. } + | Inst::XmmRmRUnaligned { op, .. } | Inst::XmmRmRBlend { op, .. } | Inst::XmmRmRImm { op, .. } | Inst::XmmToGpr { op, .. } | Inst::XmmToGprImm { op, .. } | Inst::XmmUnaryRmRImm { op, .. } + | Inst::XmmUnaryRmRUnaligned { op, .. } | Inst::XmmUnaryRmR { op, .. } | Inst::XmmConstOp { op, .. } => smallvec![op.available_from()], @@ -293,7 +295,7 @@ impl Inst { debug_assert!(dst.to_reg().class() == RegClass::Float); Inst::XmmUnaryRmR { op, - src: XmmMem::new(src).unwrap(), + src: XmmMemAligned::new(src).unwrap(), dst: WritableXmm::from_writable_reg(dst).unwrap(), } } @@ -304,7 +306,7 @@ impl Inst { Inst::XmmRmR { op, src1: Xmm::new(dst.to_reg()).unwrap(), - src2: XmmMem::new(src).unwrap(), + src2: XmmMemAligned::new(src).unwrap(), dst: WritableXmm::from_writable_reg(dst).unwrap(), } } @@ -369,7 +371,7 @@ impl Inst { pub(crate) fn xmm_cmp_rm_r(op: SseOpcode, src: RegMem, dst: Reg) -> Inst { src.assert_regclass_is(RegClass::Float); debug_assert!(dst.class() == RegClass::Float); - let src = XmmMem::new(src).unwrap(); + let src = XmmMemAligned::new(src).unwrap(); let dst = Xmm::new(dst).unwrap(); Inst::XmmCmpRmR { op, src, dst } } @@ -894,6 +896,12 @@ impl PrettyPrint for Inst { format!("{} {}, {}", ljustify(op.to_string()), src, dst) } + Inst::XmmUnaryRmRUnaligned { op, src, dst, .. 
} => { + let dst = pretty_print_reg(dst.to_reg().to_reg(), op.src_size(), allocs); + let src = src.pretty_print(op.src_size(), allocs); + format!("{} {}, {}", ljustify(op.to_string()), src, dst) + } + Inst::XmmUnaryRmRImm { op, src, dst, imm, .. } => { @@ -927,6 +935,19 @@ impl PrettyPrint for Inst { format!("{} {}, {}, {}", ljustify(op.to_string()), src1, src2, dst) } + Inst::XmmRmRUnaligned { + op, + src1, + src2, + dst, + .. + } => { + let src1 = pretty_print_reg(src1.to_reg(), 8, allocs); + let dst = pretty_print_reg(dst.to_reg().to_reg(), 8, allocs); + let src2 = src2.pretty_print(8, allocs); + format!("{} {}, {}, {}", ljustify(op.to_string()), src1, src2, dst) + } + Inst::XmmConstOp { op, dst } => { let dst = pretty_print_reg(dst.to_reg().to_reg(), 8, allocs); format!("{} {dst}, {dst}, {dst}", ljustify(op.to_string())) @@ -1862,9 +1883,11 @@ fn x64_get_operands VReg>(inst: &Inst, collector: &mut OperandCol collector.reg_def(dst.to_writable_reg()); src.get_operands(collector); } - Inst::XmmUnaryRmR { src, dst, .. } - | Inst::XmmUnaryRmREvex { src, dst, .. } - | Inst::XmmUnaryRmRImm { src, dst, .. } => { + Inst::XmmUnaryRmR { src, dst, .. } | Inst::XmmUnaryRmRImm { src, dst, .. } => { + collector.reg_def(dst.to_writable_reg()); + src.get_operands(collector); + } + Inst::XmmUnaryRmREvex { src, dst, .. } | Inst::XmmUnaryRmRUnaligned { src, dst, .. } => { collector.reg_def(dst.to_writable_reg()); src.get_operands(collector); } @@ -1875,6 +1898,13 @@ fn x64_get_operands VReg>(inst: &Inst, collector: &mut OperandCol collector.reg_reuse_def(dst.to_writable_reg(), 0); src2.get_operands(collector); } + Inst::XmmRmRUnaligned { + src1, src2, dst, .. 
+ } => { + collector.reg_use(src1.to_reg()); + collector.reg_reuse_def(dst.to_writable_reg(), 0); + src2.get_operands(collector); + } Inst::XmmRmRBlend { src1, src2, diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle index 2260fc3975..db78850e6d 100644 --- a/cranelift/codegen/src/isa/x64/lower.isle +++ b/cranelift/codegen/src/isa/x64/lower.isle @@ -2155,10 +2155,6 @@ ;; Rules for `fadd` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; N.B.: there are no load-op merging rules here. We can't guarantee -;; the RHS (if a load) is 128-bit aligned, so we must avoid merging a -;; load. Likewise for other ops below. - (rule (lower (has_type $F32 (fadd x y))) (x64_addss x y)) (rule (lower (has_type $F64 (fadd x y))) @@ -2168,6 +2164,17 @@ (rule (lower (has_type $F64X2 (fadd x y))) (x64_addpd x y)) +;; The above rules automatically sink loads for rhs operands, so additionally +;; add rules for sinking loads with lhs operands. +(rule 1 (lower (has_type $F32 (fadd (sinkable_load x) y))) + (x64_addss y (sink_load x))) +(rule 1 (lower (has_type $F64 (fadd (sinkable_load x) y))) + (x64_addsd y (sink_load x))) +(rule 1 (lower (has_type $F32X4 (fadd (sinkable_load x) y))) + (x64_addps y (sink_load x))) +(rule 1 (lower (has_type $F64X2 (fadd (sinkable_load x) y))) + (x64_addpd y (sink_load x))) + ;; Rules for `fsub` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (has_type $F32 (fsub x y))) @@ -2190,6 +2197,17 @@ (rule (lower (has_type $F64X2 (fmul x y))) (x64_mulpd x y)) +;; The above rules automatically sink loads for rhs operands, so additionally +;; add rules for sinking loads with lhs operands. 
+(rule 1 (lower (has_type $F32 (fmul (sinkable_load x) y))) + (x64_mulss y (sink_load x))) +(rule 1 (lower (has_type $F64 (fmul (sinkable_load x) y))) + (x64_mulsd y (sink_load x))) +(rule 1 (lower (has_type $F32X4 (fmul (sinkable_load x) y))) + (x64_mulps y (sink_load x))) +(rule 1 (lower (has_type $F64X2 (fmul (sinkable_load x) y))) + (x64_mulpd y (sink_load x))) + ;; Rules for `fdiv` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (has_type $F32 (fdiv x y))) @@ -2983,7 +3001,7 @@ (tmp Xmm (x64_pxor tmp dst)) ;; Convert the packed float to packed doubleword. - (dst Xmm (x64_cvttps2dq $F32X4 dst)) + (dst Xmm (x64_cvttps2dq dst)) ;; Set top bit only if < 0 (tmp Xmm (x64_pand dst tmp)) @@ -3064,7 +3082,7 @@ ;; Overflow lanes greater than the maximum allowed signed value will ;; set to 0x80000000. Negative and NaN lanes will be 0x0 (tmp1 Xmm dst) - (dst Xmm (x64_cvttps2dq $F32X4 dst)) + (dst Xmm (x64_cvttps2dq dst)) ;; Set lanes to src - max_signed_int (tmp1 Xmm (x64_subps tmp1 tmp2)) @@ -3074,7 +3092,7 @@ (tmp2 Xmm (x64_cmpps tmp2 tmp1 (FcmpImm.LessThanOrEqual))) ;; Convert those set of lanes that have the max_signed_int factored out. - (tmp1 Xmm (x64_cvttps2dq $F32X4 tmp1)) + (tmp1 Xmm (x64_cvttps2dq tmp1)) ;; Prepare converted lanes by zeroing negative lanes and prepping lanes ;; that have positive overflow (based on the mask) by setting these lanes diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs index c294fad0b3..868d0860eb 100644 --- a/cranelift/codegen/src/isa/x64/lower.rs +++ b/cranelift/codegen/src/isa/x64/lower.rs @@ -92,12 +92,6 @@ fn is_mergeable_load(ctx: &mut Lower, src_insn: IRInst) -> Option<(InsnInp return None; } - // SIMD instructions can only be load-coalesced when the loaded value comes - // from an aligned address. 
- if load_ty.is_vector() && !insn_data.memflags().map_or(false, |f| f.aligned()) { - return None; - } - // Just testing the opcode is enough, because the width will always match if // the type does (and the type should match if the CLIF is properly // constructed). diff --git a/cranelift/codegen/src/isa/x64/lower/isle.rs b/cranelift/codegen/src/isa/x64/lower/isle.rs index 0e050cece8..9d684cb879 100644 --- a/cranelift/codegen/src/isa/x64/lower/isle.rs +++ b/cranelift/codegen/src/isa/x64/lower/isle.rs @@ -100,23 +100,9 @@ impl Context for IsleContext<'_, '_, MInst, X64Backend> { if let Some(imm) = to_simm32(c as i64) { return imm.to_reg_mem_imm(); } - - // A load from the constant pool is better than a - // rematerialization into a register, because it reduces - // register pressure. - let vcode_constant = self.emit_u64_le_const(c); - return RegMemImm::mem(SyntheticAmode::ConstantOffset(vcode_constant)); } - if let InputSourceInst::UniqueUse(src_insn, 0) = inputs.inst { - if let Some((addr_input, offset)) = is_mergeable_load(self.lower_ctx, src_insn) { - self.lower_ctx.sink_inst(src_insn); - let amode = lower_to_amode(self.lower_ctx, addr_input, offset); - return RegMemImm::mem(amode); - } - } - - RegMemImm::reg(self.put_in_reg(val)) + self.put_in_reg_mem(val).into() } fn put_in_xmm_mem_imm(&mut self, val: Value) -> XmmMemImm { @@ -150,7 +136,7 @@ impl Context for IsleContext<'_, '_, MInst, X64Backend> { .unwrap(); } - XmmMem::new(RegMem::reg(self.put_in_reg(val))).unwrap() + XmmMem::new(self.put_in_reg_mem(val)).unwrap() } fn put_in_reg_mem(&mut self, val: Value) -> RegMem { @@ -164,12 +150,8 @@ impl Context for IsleContext<'_, '_, MInst, X64Backend> { return RegMem::mem(SyntheticAmode::ConstantOffset(vcode_constant)); } - if let InputSourceInst::UniqueUse(src_insn, 0) = inputs.inst { - if let Some((addr_input, offset)) = is_mergeable_load(self.lower_ctx, src_insn) { - self.lower_ctx.sink_inst(src_insn); - let amode = lower_to_amode(self.lower_ctx, addr_input, 
offset); - return RegMem::mem(amode); - } + if let Some(load) = self.sinkable_load(val) { + return self.sink_load(&load); } RegMem::reg(self.put_in_reg(val)) @@ -446,7 +428,7 @@ impl Context for IsleContext<'_, '_, MInst, X64Backend> { #[inline] fn xmm_mem_to_xmm_mem_imm(&mut self, r: &XmmMem) -> XmmMemImm { - r.clone().into() + XmmMemImm::new(r.clone().to_reg_mem().into()).unwrap() } #[inline] @@ -997,10 +979,40 @@ impl Context for IsleContext<'_, '_, MInst, X64Backend> { } } } + + fn xmm_mem_to_xmm_mem_aligned(&mut self, arg: &XmmMem) -> XmmMemAligned { + match XmmMemAligned::new(arg.clone().into()) { + Some(aligned) => aligned, + None => match arg.clone().into() { + RegMem::Mem { addr } => self.load_xmm_unaligned(addr).into(), + _ => unreachable!(), + }, + } + } + + fn xmm_mem_imm_to_xmm_mem_aligned_imm(&mut self, arg: &XmmMemImm) -> XmmMemAlignedImm { + match XmmMemAlignedImm::new(arg.clone().into()) { + Some(aligned) => aligned, + None => match arg.clone().into() { + RegMemImm::Mem { addr } => self.load_xmm_unaligned(addr).into(), + _ => unreachable!(), + }, + } + } } impl IsleContext<'_, '_, MInst, X64Backend> { isle_prelude_method_helpers!(X64Caller); + + fn load_xmm_unaligned(&mut self, addr: SyntheticAmode) -> Xmm { + let tmp = self.lower_ctx.alloc_tmp(types::F32X4).only_reg().unwrap(); + self.lower_ctx.emit(MInst::XmmUnaryRmRUnaligned { + op: SseOpcode::Movdqu, + src: XmmMem::new(RegMem::mem(addr)).unwrap(), + dst: Writable::from_reg(Xmm::new(tmp.to_reg()).unwrap()), + }); + Xmm::new(tmp.to_reg()).unwrap() + } } // Since x64 doesn't have 8x16 shifts and we must use a 16x8 shift instead, we diff --git a/cranelift/filetests/filetests/isa/x64/fastcall.clif b/cranelift/filetests/filetests/isa/x64/fastcall.clif index cdb178f133..11eab5008a 100644 --- a/cranelift/filetests/filetests/isa/x64/fastcall.clif +++ b/cranelift/filetests/filetests/isa/x64/fastcall.clif @@ -333,44 +333,42 @@ block0(v0: i64): ; unwind PushFrameRegs { offset_upward_to_caller_sp: 16 } ; 
movq %rsp, %rbp ; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 160 } -; subq %rsp, $256, %rsp -; movdqu %xmm6, 96(%rsp) +; subq %rsp, $224, %rsp +; movdqu %xmm6, 64(%rsp) ; unwind SaveReg { clobber_offset: 0, reg: p6f } -; movdqu %xmm7, 112(%rsp) +; movdqu %xmm7, 80(%rsp) ; unwind SaveReg { clobber_offset: 16, reg: p7f } -; movdqu %xmm8, 128(%rsp) +; movdqu %xmm8, 96(%rsp) ; unwind SaveReg { clobber_offset: 32, reg: p8f } -; movdqu %xmm9, 144(%rsp) +; movdqu %xmm9, 112(%rsp) ; unwind SaveReg { clobber_offset: 48, reg: p9f } -; movdqu %xmm10, 160(%rsp) +; movdqu %xmm10, 128(%rsp) ; unwind SaveReg { clobber_offset: 64, reg: p10f } -; movdqu %xmm11, 176(%rsp) +; movdqu %xmm11, 144(%rsp) ; unwind SaveReg { clobber_offset: 80, reg: p11f } -; movdqu %xmm12, 192(%rsp) +; movdqu %xmm12, 160(%rsp) ; unwind SaveReg { clobber_offset: 96, reg: p12f } -; movdqu %xmm13, 208(%rsp) +; movdqu %xmm13, 176(%rsp) ; unwind SaveReg { clobber_offset: 112, reg: p13f } -; movdqu %xmm14, 224(%rsp) +; movdqu %xmm14, 192(%rsp) ; unwind SaveReg { clobber_offset: 128, reg: p14f } -; movdqu %xmm15, 240(%rsp) +; movdqu %xmm15, 208(%rsp) ; unwind SaveReg { clobber_offset: 144, reg: p15f } ; block0: ; movsd 0(%rcx), %xmm0 ; movsd 8(%rcx), %xmm10 -; movdqu %xmm10, rsp(80 + virtual offset) -; movsd 16(%rcx), %xmm2 -; movdqu %xmm2, rsp(0 + virtual offset) +; movdqu %xmm10, rsp(48 + virtual offset) +; movsd 16(%rcx), %xmm5 ; movsd 24(%rcx), %xmm14 -; movdqu %xmm14, rsp(64 + virtual offset) +; movdqu %xmm14, rsp(32 + virtual offset) ; movsd 32(%rcx), %xmm13 ; movsd 40(%rcx), %xmm15 -; movdqu %xmm15, rsp(48 + virtual offset) +; movdqu %xmm15, rsp(16 + virtual offset) ; movsd 48(%rcx), %xmm7 -; movsd 56(%rcx), %xmm5 -; movdqu %xmm5, rsp(32 + virtual offset) +; movsd 56(%rcx), %xmm8 +; movdqu %xmm8, rsp(0 + virtual offset) ; movsd 64(%rcx), %xmm12 -; movsd 72(%rcx), %xmm4 -; movdqu %xmm4, rsp(16 + virtual offset) +; movsd 72(%rcx), %xmm2 ; movsd 80(%rcx), %xmm9 ; 
movsd 88(%rcx), %xmm4 ; movsd 96(%rcx), %xmm3 @@ -380,24 +378,21 @@ block0(v0: i64): ; movsd 128(%rcx), %xmm6 ; movsd 136(%rcx), %xmm14 ; movsd 144(%rcx), %xmm1 -; movsd 152(%rcx), %xmm15 -; movdqu rsp(80 + virtual offset), %xmm2 -; addsd %xmm0, %xmm2, %xmm0 -; movdqu rsp(0 + virtual offset), %xmm2 -; movdqu rsp(64 + virtual offset), %xmm5 -; addsd %xmm2, %xmm5, %xmm2 -; movdqu rsp(48 + virtual offset), %xmm5 -; addsd %xmm13, %xmm5, %xmm13 -; movdqu rsp(32 + virtual offset), %xmm5 -; addsd %xmm7, %xmm5, %xmm7 -; movdqu rsp(16 + virtual offset), %xmm5 -; addsd %xmm12, %xmm5, %xmm12 +; movdqu rsp(48 + virtual offset), %xmm15 +; addsd %xmm0, %xmm15, %xmm0 +; movdqu rsp(32 + virtual offset), %xmm15 +; addsd %xmm5, %xmm15, %xmm5 +; movdqu rsp(16 + virtual offset), %xmm15 +; addsd %xmm13, %xmm15, %xmm13 +; movdqu rsp(0 + virtual offset), %xmm15 +; addsd %xmm7, %xmm15, %xmm7 +; addsd %xmm12, %xmm2, %xmm12 ; addsd %xmm9, %xmm4, %xmm9 ; addsd %xmm3, %xmm8, %xmm3 ; addsd %xmm11, %xmm10, %xmm11 ; addsd %xmm6, %xmm14, %xmm6 -; addsd %xmm1, %xmm15, %xmm1 -; addsd %xmm0, %xmm2, %xmm0 +; addsd %xmm1, 152(%rcx), %xmm1 +; addsd %xmm0, %xmm5, %xmm0 ; addsd %xmm13, %xmm7, %xmm13 ; addsd %xmm12, %xmm9, %xmm12 ; addsd %xmm3, %xmm11, %xmm3 @@ -406,17 +401,17 @@ block0(v0: i64): ; addsd %xmm12, %xmm3, %xmm12 ; addsd %xmm0, %xmm12, %xmm0 ; addsd %xmm0, %xmm6, %xmm0 -; movdqu 96(%rsp), %xmm6 -; movdqu 112(%rsp), %xmm7 -; movdqu 128(%rsp), %xmm8 -; movdqu 144(%rsp), %xmm9 -; movdqu 160(%rsp), %xmm10 -; movdqu 176(%rsp), %xmm11 -; movdqu 192(%rsp), %xmm12 -; movdqu 208(%rsp), %xmm13 -; movdqu 224(%rsp), %xmm14 -; movdqu 240(%rsp), %xmm15 -; addq %rsp, $256, %rsp +; movdqu 64(%rsp), %xmm6 +; movdqu 80(%rsp), %xmm7 +; movdqu 96(%rsp), %xmm8 +; movdqu 112(%rsp), %xmm9 +; movdqu 128(%rsp), %xmm10 +; movdqu 144(%rsp), %xmm11 +; movdqu 160(%rsp), %xmm12 +; movdqu 176(%rsp), %xmm13 +; movdqu 192(%rsp), %xmm14 +; movdqu 208(%rsp), %xmm15 +; addq %rsp, $224, %rsp ; movq %rbp, %rsp ; popq %rbp ; ret 
@@ -425,34 +420,32 @@ block0(v0: i64): ; block0: ; offset 0x0 ; pushq %rbp ; movq %rsp, %rbp -; subq $0x100, %rsp -; movdqu %xmm6, 0x60(%rsp) -; movdqu %xmm7, 0x70(%rsp) -; movdqu %xmm8, 0x80(%rsp) -; movdqu %xmm9, 0x90(%rsp) -; movdqu %xmm10, 0xa0(%rsp) -; movdqu %xmm11, 0xb0(%rsp) -; movdqu %xmm12, 0xc0(%rsp) -; movdqu %xmm13, 0xd0(%rsp) -; movdqu %xmm14, 0xe0(%rsp) -; movdqu %xmm15, 0xf0(%rsp) -; block1: ; offset 0x67 +; subq $0xe0, %rsp +; movdqu %xmm6, 0x40(%rsp) +; movdqu %xmm7, 0x50(%rsp) +; movdqu %xmm8, 0x60(%rsp) +; movdqu %xmm9, 0x70(%rsp) +; movdqu %xmm10, 0x80(%rsp) +; movdqu %xmm11, 0x90(%rsp) +; movdqu %xmm12, 0xa0(%rsp) +; movdqu %xmm13, 0xb0(%rsp) +; movdqu %xmm14, 0xc0(%rsp) +; movdqu %xmm15, 0xd0(%rsp) +; block1: ; offset 0x61 ; movsd (%rcx), %xmm0 ; trap: heap_oob ; movsd 8(%rcx), %xmm10 ; trap: heap_oob -; movdqu %xmm10, 0x50(%rsp) -; movsd 0x10(%rcx), %xmm2 ; trap: heap_oob -; movdqu %xmm2, (%rsp) +; movdqu %xmm10, 0x30(%rsp) +; movsd 0x10(%rcx), %xmm5 ; trap: heap_oob ; movsd 0x18(%rcx), %xmm14 ; trap: heap_oob -; movdqu %xmm14, 0x40(%rsp) +; movdqu %xmm14, 0x20(%rsp) ; movsd 0x20(%rcx), %xmm13 ; trap: heap_oob ; movsd 0x28(%rcx), %xmm15 ; trap: heap_oob -; movdqu %xmm15, 0x30(%rsp) +; movdqu %xmm15, 0x10(%rsp) ; movsd 0x30(%rcx), %xmm7 ; trap: heap_oob -; movsd 0x38(%rcx), %xmm5 ; trap: heap_oob -; movdqu %xmm5, 0x20(%rsp) +; movsd 0x38(%rcx), %xmm8 ; trap: heap_oob +; movdqu %xmm8, (%rsp) ; movsd 0x40(%rcx), %xmm12 ; trap: heap_oob -; movsd 0x48(%rcx), %xmm4 ; trap: heap_oob -; movdqu %xmm4, 0x10(%rsp) +; movsd 0x48(%rcx), %xmm2 ; trap: heap_oob ; movsd 0x50(%rcx), %xmm9 ; trap: heap_oob ; movsd 0x58(%rcx), %xmm4 ; trap: heap_oob ; movsd 0x60(%rcx), %xmm3 ; trap: heap_oob @@ -462,24 +455,21 @@ block0(v0: i64): ; movsd 0x80(%rcx), %xmm6 ; trap: heap_oob ; movsd 0x88(%rcx), %xmm14 ; trap: heap_oob ; movsd 0x90(%rcx), %xmm1 ; trap: heap_oob -; movsd 0x98(%rcx), %xmm15 ; trap: heap_oob -; movdqu 0x50(%rsp), %xmm2 -; addsd %xmm2, %xmm0 -; movdqu 
(%rsp), %xmm2 -; movdqu 0x40(%rsp), %xmm5 -; addsd %xmm5, %xmm2 -; movdqu 0x30(%rsp), %xmm5 -; addsd %xmm5, %xmm13 -; movdqu 0x20(%rsp), %xmm5 -; addsd %xmm5, %xmm7 -; movdqu 0x10(%rsp), %xmm5 -; addsd %xmm5, %xmm12 +; movdqu 0x30(%rsp), %xmm15 +; addsd %xmm15, %xmm0 +; movdqu 0x20(%rsp), %xmm15 +; addsd %xmm15, %xmm5 +; movdqu 0x10(%rsp), %xmm15 +; addsd %xmm15, %xmm13 +; movdqu (%rsp), %xmm15 +; addsd %xmm15, %xmm7 +; addsd %xmm2, %xmm12 ; addsd %xmm4, %xmm9 ; addsd %xmm8, %xmm3 ; addsd %xmm10, %xmm11 ; addsd %xmm14, %xmm6 -; addsd %xmm15, %xmm1 -; addsd %xmm2, %xmm0 +; addsd 0x98(%rcx), %xmm1 ; trap: heap_oob +; addsd %xmm5, %xmm0 ; addsd %xmm7, %xmm13 ; addsd %xmm9, %xmm12 ; addsd %xmm11, %xmm3 @@ -488,17 +478,17 @@ block0(v0: i64): ; addsd %xmm3, %xmm12 ; addsd %xmm12, %xmm0 ; addsd %xmm6, %xmm0 -; movdqu 0x60(%rsp), %xmm6 -; movdqu 0x70(%rsp), %xmm7 -; movdqu 0x80(%rsp), %xmm8 -; movdqu 0x90(%rsp), %xmm9 -; movdqu 0xa0(%rsp), %xmm10 -; movdqu 0xb0(%rsp), %xmm11 -; movdqu 0xc0(%rsp), %xmm12 -; movdqu 0xd0(%rsp), %xmm13 -; movdqu 0xe0(%rsp), %xmm14 -; movdqu 0xf0(%rsp), %xmm15 -; addq $0x100, %rsp +; movdqu 0x40(%rsp), %xmm6 +; movdqu 0x50(%rsp), %xmm7 +; movdqu 0x60(%rsp), %xmm8 +; movdqu 0x70(%rsp), %xmm9 +; movdqu 0x80(%rsp), %xmm10 +; movdqu 0x90(%rsp), %xmm11 +; movdqu 0xa0(%rsp), %xmm12 +; movdqu 0xb0(%rsp), %xmm13 +; movdqu 0xc0(%rsp), %xmm14 +; movdqu 0xd0(%rsp), %xmm15 +; addq $0xe0, %rsp ; movq %rbp, %rsp ; popq %rbp ; retq diff --git a/cranelift/filetests/filetests/isa/x64/simd-bitwise-avx.clif b/cranelift/filetests/filetests/isa/x64/simd-bitwise-avx.clif index 88225f5a88..479844fe63 100644 --- a/cranelift/filetests/filetests/isa/x64/simd-bitwise-avx.clif +++ b/cranelift/filetests/filetests/isa/x64/simd-bitwise-avx.clif @@ -13,8 +13,7 @@ block0(v0: f32x4, v1: i64): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; movups 0(%rdi), %xmm4 -; vorps %xmm0, %xmm4, %xmm0 +; vorps %xmm0, 0(%rdi), %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -24,8 +23,7 @@ 
block0(v0: f32x4, v1: i64): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; movups (%rdi), %xmm4 -; vorps %xmm4, %xmm0, %xmm0 +; vorps (%rdi), %xmm0, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq @@ -42,12 +40,11 @@ block0(v0: i64): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; movss 0(%rdi), %xmm7 -; movl $-2147483648, %ecx -; movd %ecx, %xmm5 -; vandnps %xmm5, const(0), %xmm8 -; vandps %xmm5, %xmm7, %xmm9 -; vorps %xmm8, %xmm9, %xmm0 +; movl $-2147483648, %eax +; movd %eax, %xmm4 +; vandnps %xmm4, const(0), %xmm6 +; vandps %xmm4, 0(%rdi), %xmm8 +; vorps %xmm6, %xmm8, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -57,12 +54,11 @@ block0(v0: i64): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; movss (%rdi), %xmm7 -; movl $0x80000000, %ecx -; movd %ecx, %xmm5 -; vandnps 0x17(%rip), %xmm5, %xmm8 -; vandps %xmm7, %xmm5, %xmm9 -; vorps %xmm9, %xmm8, %xmm0 +; movl $0x80000000, %eax +; movd %eax, %xmm4 +; vandnps 0x1b(%rip), %xmm4, %xmm6 +; vandps (%rdi), %xmm4, %xmm8 +; vorps %xmm8, %xmm6, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq @@ -78,6 +74,8 @@ block0(v0: i64): ; addb %al, (%rax) ; addb %al, (%rax) ; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) function %bor_f32x4(f32x4, f32x4) -> f32x4 { block0(v0: f32x4, v1: f32x4): diff --git a/cranelift/filetests/filetests/isa/x64/simd-load-extend.clif b/cranelift/filetests/filetests/isa/x64/simd-load-extend.clif new file mode 100644 index 0000000000..3bca887ff9 --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/simd-load-extend.clif @@ -0,0 +1,154 @@ +test compile precise-output +set enable_simd +target x86_64 + +function %uload8x8(i64) -> i16x8 { +block0(v0: i64): + v1 = uload8x8 v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pmovzxbw 0(%rdi), %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pmovzxbw (%rdi), %xmm0 ; trap: heap_oob +; movq %rbp, %rsp +; popq %rbp +; 
retq + +function %sload8x8(i64) -> i16x8 { +block0(v0: i64): + v1 = sload8x8 v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pmovsxbw 0(%rdi), %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pmovsxbw (%rdi), %xmm0 ; trap: heap_oob +; movq %rbp, %rsp +; popq %rbp +; retq + +function %uload16x4(i64) -> i32x4 { +block0(v0: i64): + v1 = uload16x4 v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pmovzxwd 0(%rdi), %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pmovzxwd (%rdi), %xmm0 ; trap: heap_oob +; movq %rbp, %rsp +; popq %rbp +; retq + +function %sload16x4(i64) -> i32x4 { +block0(v0: i64): + v1 = sload16x4 v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pmovsxwd 0(%rdi), %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pmovsxwd (%rdi), %xmm0 ; trap: heap_oob +; movq %rbp, %rsp +; popq %rbp +; retq + +function %uload32x2(i64) -> i64x2 { +block0(v0: i64): + v1 = uload32x2 v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pmovzxdq 0(%rdi), %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pmovzxdq (%rdi), %xmm0 ; trap: heap_oob +; movq %rbp, %rsp +; popq %rbp +; retq + +function %sload32x2(i64) -> i64x2 { +block0(v0: i64): + v1 = sload32x2 v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pmovsxdq 0(%rdi), %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pmovsxdq (%rdi), %xmm0 ; trap: heap_oob +; movq %rbp, %rsp +; popq %rbp +; retq +