diff --git a/cranelift/codegen/src/isa/x64/inst.isle b/cranelift/codegen/src/isa/x64/inst.isle
index a03e380839..99cfeda06e 100644
--- a/cranelift/codegen/src/isa/x64/inst.isle
+++ b/cranelift/codegen/src/isa/x64/inst.isle
@@ -227,13 +227,6 @@
       (src2 XmmMem)
       (dst WritableXmm))

-    ;; XMM (scalar or vector) production of a constant value by operating
-    ;; on a register with itself.
-    ;;
-    ;; Used to produce all zeros with xor or all ones with a comparison.
-    (XmmConstOp (op SseOpcode)
-                (dst WritableXmm))
-
     ;; XMM (scalar or vector) blend op. The mask is used to blend between
     ;; src1 and src2. This differs from a use of `XmmRmR` as the mask is
     ;; implicitly in register xmm0; this special case exists to allow us to
@@ -294,6 +287,12 @@
       (dst WritableXmm)
       (imm u8))

+    ;; XMM (scalar or vector) unary op (from xmm to reg/mem) using the
+    ;; VEX prefix
+    (XmmMovRMVex (op AvxOpcode)
+                 (src Reg)
+                 (dst SyntheticAmode))
+
     ;; XMM (scalar or vector) binary op that relies on the EVEX
     ;; prefix. Takes two inputs.
     (XmmRmREvex (op Avx512Opcode)
@@ -1359,6 +1358,12 @@
                 Vpunpcklqdq
                 Vpshuflw
                 Vpshufhw
+                Vpshufd
+                Vmovss
+                Vmovsd
+                Vmovups
+                Vmovupd
+                Vmovdqu
 ))

 (type Avx512Opcode extern
@@ -1726,21 +1731,27 @@
 (decl sinkable_load (SinkableLoad) Value)
 (extern extractor sinkable_load sinkable_load)

-;; Sink a `SinkableLoad` into a `RegMemImm.Mem`.
+;; Sink a `SinkableLoad` into a `SyntheticAmode`.
 ;;
 ;; This is a side-effectful operation that notifies the context that the
 ;; instruction that produced the `SinkableLoad` has been sunk into another
 ;; instruction, and no longer needs to be lowered.
-(decl sink_load (SinkableLoad) RegMem)
+(decl sink_load (SinkableLoad) SyntheticAmode)
 (extern constructor sink_load sink_load)

 (decl sink_load_to_gpr_mem_imm (SinkableLoad) GprMemImm)
 (rule (sink_load_to_gpr_mem_imm load)
-      (gpr_mem_imm_new (sink_load load)))
+      (gpr_mem_imm_new load))

 (decl sink_load_to_xmm_mem (SinkableLoad) XmmMem)
 (rule (sink_load_to_xmm_mem load)
-      (reg_mem_to_xmm_mem (sink_load load)))
+      (reg_mem_to_xmm_mem load))
+
+(decl sink_load_to_reg_mem (SinkableLoad) RegMem)
+(rule (sink_load_to_reg_mem load) (RegMem.Mem load))
+
+(decl sink_load_to_reg_mem_imm (SinkableLoad) RegMemImm)
+(rule (sink_load_to_reg_mem_imm load) (RegMemImm.Mem load))

 ;;;; Helpers for Sign/Zero Extending ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

@@ -1799,28 +1810,13 @@
 (rule (vec_int_type (multi_lane 32 4)) $I32X4)
 (rule (vec_int_type (multi_lane 64 2)) $I64X2)

-;; Determine the appropriate operation for xor-ing vectors of the specified type
-(decl sse_xor_op (Type) SseOpcode)
-(rule 1 (sse_xor_op $F32X4) (SseOpcode.Xorps))
-(rule 1 (sse_xor_op $F64X2) (SseOpcode.Xorpd))
-(rule 1 (sse_xor_op $F32) (SseOpcode.Xorps))
-(rule 1 (sse_xor_op $F64) (SseOpcode.Xorpd))
-
-;; Priority 0 because multi_lane overlaps with the previous two explicit type
-;; patterns.
-(rule 0 (sse_xor_op (multi_lane _bits _lanes)) (SseOpcode.Pxor))
-
-(decl avx_xor_op (Type) AvxOpcode)
-(rule 1 (avx_xor_op $F32X4) (AvxOpcode.Vxorps))
-(rule 1 (avx_xor_op $F64X2) (AvxOpcode.Vxorpd))
-(rule 0 (avx_xor_op (multi_lane _bits _lanes)) (AvxOpcode.Vpxor))
-
 ;; Performs an xor operation of the two operands specified.
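+;;
+;; The opcode is chosen per type by the rules below: `xorps`/`xorpd` for
+;; float types and `pxor` for integer vectors, presumably to keep values in
+;; their preferred execution domain. For example `(x64_xor_vector $F32X4 a b)`
+;; lowers to `xorps` while an `$I32X4` xor lowers to `pxor`.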
-(decl sse_xor (Type Xmm XmmMem) Xmm)
-(rule 0 (sse_xor ty x y) (xmm_rm_r (sse_xor_op ty) x y))
-(rule 1 (sse_xor ty @ (multi_lane _ _) x y)
-      (if-let $true (has_avx))
-      (xmm_rmir_vex (avx_xor_op ty) x y))
+(decl x64_xor_vector (Type Xmm XmmMem) Xmm)
+(rule 1 (x64_xor_vector $F32 x y) (x64_xorps x y))
+(rule 1 (x64_xor_vector $F64 x y) (x64_xorpd x y))
+(rule 1 (x64_xor_vector $F32X4 x y) (x64_xorps x y))
+(rule 1 (x64_xor_vector $F64X2 x y) (x64_xorpd x y))
+(rule 0 (x64_xor_vector (multi_lane _ _) x y) (x64_pxor x y))

 ;; Generates a register value which has an all-ones pattern.
 ;;
@@ -1833,9 +1829,8 @@
 ;; we're guaranteed that everything is equal to itself.
 (decl vector_all_ones () Xmm)
 (rule (vector_all_ones)
-      (let ((r WritableXmm (temp_writable_xmm))
-            (_ Unit (emit (MInst.XmmConstOp (SseOpcode.Pcmpeqd) r))))
-        r))
+      (let ((tmp Xmm (xmm_uninit_value)))
+        (x64_pcmpeqd tmp tmp)))

 ;; Helper for creating XmmUninitializedValue instructions.
 (decl xmm_uninit_value () Xmm)
@@ -1891,19 +1886,19 @@
       dst))

 (rule 2 (x64_load $F32 addr _ext_kind)
-        (xmm_unary_rm_r_unaligned (SseOpcode.Movss) addr))
+        (x64_movss_load addr))

 (rule 2 (x64_load $F64 addr _ext_kind)
-        (xmm_unary_rm_r_unaligned (SseOpcode.Movsd) addr))
+        (x64_movsd_load addr))

 (rule 2 (x64_load $F32X4 addr _ext_kind)
-        (xmm_unary_rm_r_unaligned (SseOpcode.Movups) addr))
+        (x64_movups_load addr))

 (rule 2 (x64_load $F64X2 addr _ext_kind)
-        (xmm_unary_rm_r_unaligned (SseOpcode.Movupd) addr))
+        (x64_movupd_load addr))

 (rule 0 (x64_load (multi_lane _bits _lanes) addr _ext_kind)
-        (xmm_unary_rm_r_unaligned (SseOpcode.Movdqu) addr))
+        (x64_movdqu_load addr))

 (decl x64_mov (Amode) Reg)
 (rule (x64_mov addr)
@@ -1923,29 +1918,79 @@
         (_ Unit (emit (MInst.MovsxRmR mode src dst))))
     dst))

-(decl x64_movss_load (XmmMem) Xmm)
+(decl x64_movss_load (SyntheticAmode) Xmm)
 (rule (x64_movss_load from)
       (xmm_unary_rm_r_unaligned (SseOpcode.Movss) from))
+(rule 1 (x64_movss_load from)
+      (if-let $true (has_avx))
+      (xmm_unary_rm_r_vex (AvxOpcode.Vmovss) from))

-(decl x64_movsd_load (XmmMem) Xmm)
+(decl x64_movss_store (SyntheticAmode Xmm) SideEffectNoResult)
+(rule (x64_movss_store addr data)
+      (xmm_movrm (SseOpcode.Movss) addr data))
+(rule 1 (x64_movss_store addr data)
+      (if-let $true (has_avx))
+      (xmm_movrm_vex (AvxOpcode.Vmovss) addr data))
+
+(decl x64_movsd_load (SyntheticAmode) Xmm)
 (rule (x64_movsd_load from)
       (xmm_unary_rm_r_unaligned (SseOpcode.Movsd) from))
+(rule 1 (x64_movsd_load from)
+      (if-let $true (has_avx))
+      (xmm_unary_rm_r_vex (AvxOpcode.Vmovsd) from))

-(decl x64_movups (XmmMem) Xmm)
-(rule (x64_movups from)
+(decl x64_movsd_store (SyntheticAmode Xmm) SideEffectNoResult)
+(rule (x64_movsd_store addr data)
+      (xmm_movrm (SseOpcode.Movsd) addr data))
+(rule 1 (x64_movsd_store addr data)
+      (if-let $true (has_avx))
+      (xmm_movrm_vex (AvxOpcode.Vmovsd) addr data))
+
+(decl x64_movups_load (SyntheticAmode) Xmm)
+(rule (x64_movups_load from)
       (xmm_unary_rm_r_unaligned (SseOpcode.Movups) from))
+(rule 1 (x64_movups_load from)
+      (if-let $true (has_avx))
+      (xmm_unary_rm_r_vex (AvxOpcode.Vmovups) from))

-(decl x64_movupd (XmmMem) Xmm)
-(rule (x64_movupd from)
+(decl x64_movups_store (SyntheticAmode Xmm) SideEffectNoResult)
+(rule (x64_movups_store addr data)
+      (xmm_movrm (SseOpcode.Movups) addr data))
+(rule 1 (x64_movups_store addr data)
+      (if-let $true (has_avx))
+      (xmm_movrm_vex (AvxOpcode.Vmovups) addr data))
+
+(decl x64_movupd_load (SyntheticAmode) Xmm)
+(rule (x64_movupd_load from)
       (xmm_unary_rm_r_unaligned (SseOpcode.Movupd) from))
+(rule 1 
(x64_movupd_load from) + (if-let $true (has_avx)) + (xmm_unary_rm_r_vex (AvxOpcode.Vmovupd) from)) + +(decl x64_movupd_store (SyntheticAmode Xmm) SideEffectNoResult) +(rule (x64_movupd_store addr data) + (xmm_movrm (SseOpcode.Movupd) addr data)) +(rule 1 (x64_movupd_store addr data) + (if-let $true (has_avx)) + (xmm_movrm_vex (AvxOpcode.Vmovupd) addr data)) (decl x64_movd (Xmm) Gpr) (rule (x64_movd from) (xmm_to_gpr (SseOpcode.Movd) from (OperandSize.Size32))) -(decl x64_movdqu (XmmMem) Xmm) -(rule (x64_movdqu from) +(decl x64_movdqu_load (XmmMem) Xmm) +(rule (x64_movdqu_load from) (xmm_unary_rm_r_unaligned (SseOpcode.Movdqu) from)) +(rule 1 (x64_movdqu_load from) + (if-let $true (has_avx)) + (xmm_unary_rm_r_vex (AvxOpcode.Vmovdqu) from)) + +(decl x64_movdqu_store (SyntheticAmode Xmm) SideEffectNoResult) +(rule (x64_movdqu_store addr data) + (xmm_movrm (SseOpcode.Movdqu) addr data)) +(rule 1 (x64_movdqu_store addr data) + (if-let $true (has_avx)) + (xmm_movrm_vex (AvxOpcode.Vmovdqu) addr data)) (decl x64_pmovsxbw (XmmMem) Xmm) (rule (x64_pmovsxbw from) @@ -1994,10 +2039,14 @@ (let ((size OperandSize (raw_operand_size_of_type ty))) (SideEffectNoResult.Inst (MInst.MovRM size data addr)))) -(decl x64_xmm_movrm (SseOpcode SyntheticAmode Xmm) SideEffectNoResult) -(rule (x64_xmm_movrm op addr data) +(decl xmm_movrm (SseOpcode SyntheticAmode Xmm) SideEffectNoResult) +(rule (xmm_movrm op addr data) (SideEffectNoResult.Inst (MInst.XmmMovRM op data addr))) +(decl xmm_movrm_vex (AvxOpcode SyntheticAmode Xmm) SideEffectNoResult) +(rule (xmm_movrm_vex op addr data) + (SideEffectNoResult.Inst (MInst.XmmMovRMVex op data addr))) + ;; Load a constant into an XMM register. (decl x64_xmm_load_const (Type VCodeConstant) Xmm) (rule (x64_xmm_load_const ty const) @@ -2192,26 +2241,19 @@ (xmm_to_reg (xmm_zero ty))) ;; Special case for `f32` zero immediates -(rule 2 (imm ty @ $F32 (u64_zero)) - (let ((wr WritableXmm (temp_writable_xmm)) - (_ Unit (emit (MInst.XmmConstOp (SseOpcode.Xorps) wr)))) - (xmm_to_reg wr))) +(rule 2 (imm ty @ $F32 (u64_zero)) (xmm_zero ty)) ;; TODO: use cmpeqps for all 1s ;; Special case for `f64` zero immediates to use `xorpd`. -(rule 2 (imm ty @ $F64 (u64_zero)) - (let ((wr WritableXmm (temp_writable_xmm)) - (_ Unit (emit (MInst.XmmConstOp (SseOpcode.Xorpd) wr)))) - (xmm_to_reg wr))) +(rule 2 (imm ty @ $F64 (u64_zero)) (xmm_zero ty)) ;; TODO: use cmpeqpd for all 1s (decl xmm_zero (Type) Xmm) (rule (xmm_zero ty) - (let ((wr WritableXmm (temp_writable_xmm)) - (_ Unit (emit (MInst.XmmConstOp (sse_xor_op ty) wr)))) - wr)) + (let ((tmp Xmm (xmm_uninit_value))) + (x64_xor_vector ty tmp tmp))) ;; Helper for creating `MInst.ShiftR` instructions. (decl shift_r (Type ShiftKind Gpr Imm8Gpr) Gpr) @@ -2991,10 +3033,20 @@ (if-let $true (has_avx)) (xmm_rmr_blend_vex (AvxOpcode.Vpblendvb) src1 src2 mask)) -;; Helper for creating `movsd` instructions. -(decl x64_movsd_regmove (Xmm XmmMem) Xmm) +;; Helper for creating a `movsd` instruction which creates a new vector +;; register where the upper 64-bits are from the first operand and the low +;; 64-bits are from the second operand. +;; +;; Note that the second argument here is specifically `Xmm` instead of `XmmMem` +;; because there is no encoding of a 3-operand form of `movsd` and otherwise +;; when used as a load instruction it wipes out the entire destination register +;; which defeats the purpose of this being a 2-operand instruction. 
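+;;
+;; For example, merging a loaded scalar into lane 0 of `vec` goes through a
+;; register first:
+;;
+;;   (x64_movsd_regmove vec (x64_movsd_load addr))
+;;
+;; since the memory form of `movsd` zeroes the upper lane rather than
+;; preserving it.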
+(decl x64_movsd_regmove (Xmm Xmm) Xmm) (rule (x64_movsd_regmove src1 src2) (xmm_rm_r_unaligned (SseOpcode.Movsd) src1 src2)) +(rule 1 (x64_movsd_regmove src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vmovsd) src1 src2)) ;; Helper for creating `movlhps` instructions. (decl x64_movlhps (Xmm XmmMem) Xmm) @@ -3319,6 +3371,9 @@ (decl x64_pshufd (XmmMem u8) Xmm) (rule (x64_pshufd src imm) (xmm_unary_rm_r_imm (SseOpcode.Pshufd) src imm)) +(rule 1 (x64_pshufd src imm) + (if-let $true (has_avx)) + (xmm_unary_rm_r_imm_vex (AvxOpcode.Vpshufd) src imm)) ;; Helper for creating `pshufb` instructions. (decl x64_pshufb (Xmm XmmMem) Xmm) @@ -4562,9 +4617,11 @@ (convert IntCC CC intcc_to_cc) (convert AtomicRmwOp MachAtomicRmwOp atomic_rmw_op_to_mach_atomic_rmw_op) -(convert SinkableLoad RegMem sink_load) +(convert SinkableLoad RegMem sink_load_to_reg_mem) +(convert SinkableLoad RegMemImm sink_load_to_reg_mem_imm) (convert SinkableLoad GprMemImm sink_load_to_gpr_mem_imm) (convert SinkableLoad XmmMem sink_load_to_xmm_mem) +(convert SinkableLoad SyntheticAmode sink_load) (decl reg_to_xmm_mem (Reg) XmmMem) (rule (reg_to_xmm_mem r) diff --git a/cranelift/codegen/src/isa/x64/inst/args.rs b/cranelift/codegen/src/isa/x64/inst/args.rs index 01ee044ab3..6ae024c010 100644 --- a/cranelift/codegen/src/isa/x64/inst/args.rs +++ b/cranelift/codegen/src/isa/x64/inst/args.rs @@ -1693,7 +1693,13 @@ impl AvxOpcode { | AvxOpcode::Vpunpcklqdq | AvxOpcode::Vpunpckhqdq | AvxOpcode::Vpshuflw - | AvxOpcode::Vpshufhw => { + | AvxOpcode::Vpshufhw + | AvxOpcode::Vpshufd + | AvxOpcode::Vmovss + | AvxOpcode::Vmovsd + | AvxOpcode::Vmovups + | AvxOpcode::Vmovupd + | AvxOpcode::Vmovdqu => { smallvec![InstructionSet::AVX] } } diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs index 6d86bffd05..9fb2c15994 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit.rs @@ -1976,22 +1976,6 @@ pub(crate) fn emit( } } - Inst::XmmConstOp { op, dst } => { - let dst = allocs.next(dst.to_reg().to_reg()); - emit( - &Inst::XmmRmR { - op: *op, - dst: Writable::from_reg(Xmm::new(dst).unwrap()), - src1: Xmm::new(dst).unwrap(), - src2: Xmm::new(dst).unwrap().into(), - }, - allocs, - sink, - info, - state, - ); - } - Inst::XmmRmRBlend { op, src1, @@ -2181,6 +2165,7 @@ pub(crate) fn emit( AvxOpcode::Vpunpckhdq => (LP::_66, OM::_0F, 0x6A), AvxOpcode::Vpunpcklqdq => (LP::_66, OM::_0F, 0x6C), AvxOpcode::Vpunpckhqdq => (LP::_66, OM::_0F, 0x6D), + AvxOpcode::Vmovsd => (LP::_F2, OM::_0F, 0x10), _ => panic!("unexpected rmir vex opcode {op:?}"), }; VexInstruction::new() @@ -2385,6 +2370,23 @@ pub(crate) fn emit( AvxOpcode::Vcvtps2pd => (LegacyPrefixes::None, OpcodeMap::_0F, 0x5A), AvxOpcode::Vcvttpd2dq => (LegacyPrefixes::_66, OpcodeMap::_0F, 0xE6), AvxOpcode::Vcvttps2dq => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0x5B), + AvxOpcode::Vmovdqu => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0x6F), + AvxOpcode::Vmovups => (LegacyPrefixes::None, OpcodeMap::_0F, 0x10), + AvxOpcode::Vmovupd => (LegacyPrefixes::_66, OpcodeMap::_0F, 0x10), + + // Note that for `vmov{s,d}` the `inst.isle` rules should + // statically ensure that only `Amode` operands are used here. + // Otherwise the other encodings of `vmovss` are more like + // 2-operand instructions which this unary encoding does not + // have. 
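+                // For example, the memory form `vmovss (%rdi), %xmm3` fully
+                // defines %xmm3 (its upper bits are zeroed), whereas the
+                // register-to-register form is a three-operand merge (low
+                // bits from one source, upper bits from another) and is
+                // encoded separately.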
+ AvxOpcode::Vmovss => match &src { + RegisterOrAmode::Amode(_) => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0x10), + _ => unreachable!(), + }, + AvxOpcode::Vmovsd => match &src { + RegisterOrAmode::Amode(_) => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x10), + _ => unreachable!(), + }, _ => panic!("unexpected rmr_imm_vex opcode {op:?}"), }; @@ -2412,6 +2414,7 @@ pub(crate) fn emit( AvxOpcode::Vroundpd => (LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x09), AvxOpcode::Vpshuflw => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x70), AvxOpcode::Vpshufhw => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0x70), + AvxOpcode::Vpshufd => (LegacyPrefixes::_66, OpcodeMap::_0F, 0x70), _ => panic!("unexpected rmr_imm_vex opcode {op:?}"), }; @@ -2426,6 +2429,28 @@ pub(crate) fn emit( .encode(sink); } + Inst::XmmMovRMVex { op, src, dst } => { + let src = allocs.next(*src); + let dst = dst.with_allocs(allocs).finalize(state, sink); + + let (prefix, map, opcode) = match op { + AvxOpcode::Vmovdqu => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0x7F), + AvxOpcode::Vmovss => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0x11), + AvxOpcode::Vmovsd => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x11), + AvxOpcode::Vmovups => (LegacyPrefixes::None, OpcodeMap::_0F, 0x11), + AvxOpcode::Vmovupd => (LegacyPrefixes::_66, OpcodeMap::_0F, 0x11), + _ => unimplemented!("Opcode {:?} not implemented", op), + }; + VexInstruction::new() + .length(VexVectorLength::V128) + .prefix(prefix) + .map(map) + .opcode(opcode) + .rm(dst) + .reg(src.to_real_reg().unwrap().hw_enc()) + .encode(sink); + } + Inst::XmmRmREvex { op, src1, diff --git a/cranelift/codegen/src/isa/x64/inst/mod.rs b/cranelift/codegen/src/isa/x64/inst/mod.rs index b0002c59f5..396f9fd4a2 100644 --- a/cranelift/codegen/src/isa/x64/inst/mod.rs +++ b/cranelift/codegen/src/isa/x64/inst/mod.rs @@ -140,8 +140,7 @@ impl Inst { | Inst::XmmToGprImm { op, .. } | Inst::XmmUnaryRmRImm { op, .. } | Inst::XmmUnaryRmRUnaligned { op, .. } - | Inst::XmmUnaryRmR { op, .. } - | Inst::XmmConstOp { op, .. } => smallvec![op.available_from()], + | Inst::XmmUnaryRmR { op, .. } => smallvec![op.available_from()], Inst::XmmUnaryRmREvex { op, .. } | Inst::XmmRmREvex { op, .. } @@ -153,7 +152,8 @@ impl Inst { | Inst::XmmRmRBlendVex { op, .. } | Inst::XmmVexPinsr { op, .. } | Inst::XmmUnaryRmRVex { op, .. } - | Inst::XmmUnaryRmRImmVex { op, .. } => op.available_from(), + | Inst::XmmUnaryRmRImmVex { op, .. } + | Inst::XmmMovRMVex { op, .. } => op.available_from(), } } } @@ -938,6 +938,12 @@ impl PrettyPrint for Inst { format!("{} {}, {}", ljustify(op.to_string()), src, dst) } + Inst::XmmMovRMVex { op, src, dst, .. } => { + let src = pretty_print_reg(*src, 8, allocs); + let dst = dst.pretty_print(8, allocs); + format!("{} {}, {}", ljustify(op.to_string()), src, dst) + } + Inst::XmmRmR { op, src1, @@ -964,11 +970,6 @@ impl PrettyPrint for Inst { format!("{} {}, {}, {}", ljustify(op.to_string()), src1, src2, dst) } - Inst::XmmConstOp { op, dst } => { - let dst = pretty_print_reg(dst.to_reg().to_reg(), 8, allocs); - format!("{} {dst}, {dst}, {dst}", ljustify(op.to_string())) - } - Inst::XmmRmRBlend { op, src1, @@ -2019,9 +2020,6 @@ fn x64_get_operands VReg>(inst: &Inst, collector: &mut OperandCol collector.reg_reuse_def(*dst, 0); src2.get_operands(collector); } - Inst::XmmConstOp { dst, .. } => { - collector.reg_def(dst.to_writable_reg()); - } Inst::XmmUninitializedValue { dst } => collector.reg_def(dst.to_writable_reg()), Inst::XmmMinMaxSeq { lhs, rhs, dst, .. 
} => {
            collector.reg_use(rhs.to_reg());
@@ -2035,7 +2033,7 @@ fn x64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut OperandCol
            collector.reg_reuse_def(dst.to_writable_reg(), 0); // Reuse RHS.
            src2.get_operands(collector);
        }
-        Inst::XmmMovRM { src, dst, .. } => {
+        Inst::XmmMovRM { src, dst, .. } | Inst::XmmMovRMVex { src, dst, .. } => {
            collector.reg_use(*src);
            dst.get_operands(collector);
        }
diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle
index e33c5ee784..0f13b95002 100644
--- a/cranelift/codegen/src/isa/x64/lower.isle
+++ b/cranelift/codegen/src/isa/x64/lower.isle
@@ -337,12 +337,12 @@

 ;; f32 and f64

 (rule 5 (lower (has_type (ty_scalar_float ty) (bxor x y)))
-      (sse_xor ty x y))
+      (x64_xor_vector ty x y))

 ;; SSE.

 (rule 6 (lower (has_type ty @ (multi_lane _bits _lanes)
                          (bxor x y)))
-      (sse_xor ty x y))
+      (x64_xor_vector ty x y))

 ;; `{i,b}128`.

@@ -1171,12 +1171,12 @@

 ;; f32 and f64

 (rule -3 (lower (has_type (ty_scalar_float ty) (bnot x)))
-      (sse_xor ty x (vector_all_ones)))
+      (x64_xor_vector ty x (vector_all_ones)))

 ;; Special case for vector-types where bit-negation is an xor against an
 ;; all-one value

 (rule -1 (lower (has_type ty @ (multi_lane _bits _lanes) (bnot x)))
-      (sse_xor ty x (vector_all_ones)))
+      (x64_xor_vector ty x (vector_all_ones)))

 ;;;; Rules for `bitselect` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

@@ -1267,20 +1267,10 @@
 ;; Here the `movsd` instruction is used specifically to specialize moving
 ;; into the first lane where unlike above cases we're not using the lane
 ;; immediate as an immediate to the instruction itself.
-;;
-;; Note, though, the `movsd` has different behavior with respect to the second
-;; lane of the f64x2 depending on whether the RegMem operand is a register or
-;; memory. When loading from a register `movsd` preserves the upper bits, but
-;; when loading from memory it zeros the upper bits. We specifically want to
-;; preserve the upper bits so if a `RegMem.Mem` is passed in we need to emit
-;; two `movsd` instructions. The first `movsd` (used as `xmm_unary_rm_r`) will
-;; load from memory into a temp register and then the second `movsd` (modeled
-;; internally as `xmm_rm_r`) will merge the temp register into our `vec`
-;; register.
-(rule 1 (vec_insert_lane $F64X2 vec (RegMem.Reg val) 0)
+(rule (vec_insert_lane $F64X2 vec (RegMem.Reg val) 0)
       (x64_movsd_regmove vec val))
-(rule (vec_insert_lane $F64X2 vec mem 0)
-      (x64_movsd_regmove vec (x64_movsd_load mem)))
+(rule (vec_insert_lane $F64X2 vec (RegMem.Mem val) 0)
+      (x64_movsd_regmove vec (x64_movsd_load val)))

 ;; f64x2.replace_lane 1
 ;;
 ;; Here the `movlhps` instruction is used specifically to specialize moving
 ;; into the second lane where unlike above cases we're not using the lane
 ;; immediate as an immediate to the instruction itself.
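 ;;
 ;; For example, with `vec = [x0, x1]` and replacement scalar `v`, `movlhps`
 ;; keeps the low 64 bits of `vec` and copies the low 64 bits of `v` into the
 ;; upper lane, producing `[x0, v]`.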
(rule (vec_insert_lane $F64X2 vec val 1) - (x64_movlhps vec (reg_mem_to_xmm_mem val))) + (x64_movlhps vec val)) ;;;; Rules for `smin`, `smax`, `umin`, `umax` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -2557,11 +2547,11 @@ (rule (lower (has_type $F64 (load flags address offset))) (x64_movsd_load (to_amode flags address offset))) (rule (lower (has_type $F32X4 (load flags address offset))) - (x64_movups (to_amode flags address offset))) + (x64_movups_load (to_amode flags address offset))) (rule (lower (has_type $F64X2 (load flags address offset))) - (x64_movupd (to_amode flags address offset))) + (x64_movupd_load (to_amode flags address offset))) (rule -2 (lower (has_type (ty_vec128 ty) (load flags address offset))) - (x64_movdqu (to_amode flags address offset))) + (x64_movdqu_load (to_amode flags address offset))) ;; We can load an I128 by doing two 64-bit loads. (rule -3 (lower (has_type $I128 @@ -2614,7 +2604,7 @@ address offset)) (side_effect - (x64_xmm_movrm (SseOpcode.Movss) (to_amode flags address offset) value))) + (x64_movss_store (to_amode flags address offset) value))) ;; F64 stores of values in XMM registers. (rule 1 (lower (store flags @@ -2622,7 +2612,7 @@ address offset)) (side_effect - (x64_xmm_movrm (SseOpcode.Movsd) (to_amode flags address offset) value))) + (x64_movsd_store (to_amode flags address offset) value))) ;; Stores of F32X4 vectors. (rule 1 (lower (store flags @@ -2630,7 +2620,7 @@ address offset)) (side_effect - (x64_xmm_movrm (SseOpcode.Movups) (to_amode flags address offset) value))) + (x64_movups_store (to_amode flags address offset) value))) ;; Stores of F64X2 vectors. (rule 1 (lower (store flags @@ -2638,7 +2628,7 @@ address offset)) (side_effect - (x64_xmm_movrm (SseOpcode.Movupd) (to_amode flags address offset) value))) + (x64_movupd_store (to_amode flags address offset) value))) ;; Stores of all other 128-bit vector types with integer lanes. (rule -1 (lower (store flags @@ -2646,7 +2636,7 @@ address offset)) (side_effect - (x64_xmm_movrm (SseOpcode.Movdqu) (to_amode flags address offset) value))) + (x64_movdqu_store (to_amode flags address offset) value))) ;; Stores of I128 values: store the two 64-bit halves separately. 
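 ;;
 ;; Roughly: an `i128` store at address `a` becomes a 64-bit store of the low
 ;; half to `a` and a 64-bit store of the high half to `a+8`.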
(rule 0 (lower (store flags @@ -2675,7 +2665,7 @@ src2)) addr offset)) - (let ((_ RegMemImm (sink_load sink))) + (let ((_ RegMemImm sink)) (side_effect (x64_add_mem ty (to_amode flags addr offset) src2)))) @@ -2689,7 +2679,7 @@ (load flags addr offset)))) addr offset)) - (let ((_ RegMemImm (sink_load sink))) + (let ((_ RegMemImm sink)) (side_effect (x64_add_mem ty (to_amode flags addr offset) src2)))) @@ -2703,7 +2693,7 @@ src2)) addr offset)) - (let ((_ RegMemImm (sink_load sink))) + (let ((_ RegMemImm sink)) (side_effect (x64_sub_mem ty (to_amode flags addr offset) src2)))) @@ -2717,7 +2707,7 @@ src2)) addr offset)) - (let ((_ RegMemImm (sink_load sink))) + (let ((_ RegMemImm sink)) (side_effect (x64_and_mem ty (to_amode flags addr offset) src2)))) @@ -2731,7 +2721,7 @@ (load flags addr offset)))) addr offset)) - (let ((_ RegMemImm (sink_load sink))) + (let ((_ RegMemImm sink)) (side_effect (x64_and_mem ty (to_amode flags addr offset) src2)))) @@ -2745,7 +2735,7 @@ src2)) addr offset)) - (let ((_ RegMemImm (sink_load sink))) + (let ((_ RegMemImm sink)) (side_effect (x64_or_mem ty (to_amode flags addr offset) src2)))) @@ -2759,7 +2749,7 @@ (load flags addr offset)))) addr offset)) - (let ((_ RegMemImm (sink_load sink))) + (let ((_ RegMemImm sink)) (side_effect (x64_or_mem ty (to_amode flags addr offset) src2)))) @@ -2773,7 +2763,7 @@ src2)) addr offset)) - (let ((_ RegMemImm (sink_load sink))) + (let ((_ RegMemImm sink)) (side_effect (x64_xor_mem ty (to_amode flags addr offset) src2)))) @@ -2787,7 +2777,7 @@ (load flags addr offset)))) addr offset)) - (let ((_ RegMemImm (sink_load sink))) + (let ((_ RegMemImm sink)) (side_effect (x64_xor_mem ty (to_amode flags addr offset) src2)))) diff --git a/cranelift/codegen/src/isa/x64/lower/isle.rs b/cranelift/codegen/src/isa/x64/lower/isle.rs index a5549e0fd5..815e40e351 100644 --- a/cranelift/codegen/src/isa/x64/lower/isle.rs +++ b/cranelift/codegen/src/isa/x64/lower/isle.rs @@ -151,7 +151,9 @@ impl Context for IsleContext<'_, '_, MInst, X64Backend> { } if let Some(load) = self.sinkable_load(val) { - return self.sink_load(&load); + return RegMem::Mem { + addr: self.sink_load(&load), + }; } RegMem::reg(self.put_in_reg(val)) @@ -277,12 +279,10 @@ impl Context for IsleContext<'_, '_, MInst, X64Backend> { None } - fn sink_load(&mut self, load: &SinkableLoad) -> RegMem { + fn sink_load(&mut self, load: &SinkableLoad) -> SyntheticAmode { self.lower_ctx.sink_inst(load.inst); let addr = lower_to_amode(self.lower_ctx, load.addr_input, load.offset); - RegMem::Mem { - addr: SyntheticAmode::Real(addr), - } + SyntheticAmode::Real(addr) } #[inline] diff --git a/cranelift/filetests/filetests/isa/x64/fabs.clif b/cranelift/filetests/filetests/isa/x64/fabs.clif index 574d34cc12..c915d9c591 100644 --- a/cranelift/filetests/filetests/isa/x64/fabs.clif +++ b/cranelift/filetests/filetests/isa/x64/fabs.clif @@ -69,9 +69,10 @@ block0(v0: f32x4): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; pcmpeqd %xmm3, %xmm3, %xmm3 -; psrld %xmm3, $1, %xmm3 -; andps %xmm0, %xmm3, %xmm0 +; uninit %xmm4 +; pcmpeqd %xmm4, %xmm4, %xmm4 +; psrld %xmm4, $1, %xmm4 +; andps %xmm0, %xmm4, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -81,9 +82,9 @@ block0(v0: f32x4): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; pcmpeqd %xmm3, %xmm3 -; psrld $1, %xmm3 -; andps %xmm3, %xmm0 +; pcmpeqd %xmm4, %xmm4 +; psrld $1, %xmm4 +; andps %xmm4, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq @@ -98,9 +99,10 @@ block0(v0: f64x2): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; pcmpeqd %xmm3, %xmm3, %xmm3 -; 
psrlq %xmm3, $1, %xmm3 -; andpd %xmm0, %xmm3, %xmm0 +; uninit %xmm4 +; pcmpeqd %xmm4, %xmm4, %xmm4 +; psrlq %xmm4, $1, %xmm4 +; andpd %xmm0, %xmm4, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -110,9 +112,9 @@ block0(v0: f64x2): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; pcmpeqd %xmm3, %xmm3 -; psrlq $1, %xmm3 -; andpd %xmm3, %xmm0 +; pcmpeqd %xmm4, %xmm4 +; psrlq $1, %xmm4 +; andpd %xmm4, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq diff --git a/cranelift/filetests/filetests/isa/x64/fcvt.clif b/cranelift/filetests/filetests/isa/x64/fcvt.clif index aee96700a8..1a4ab0cc15 100644 --- a/cranelift/filetests/filetests/isa/x64/fcvt.clif +++ b/cranelift/filetests/filetests/isa/x64/fcvt.clif @@ -1032,20 +1032,22 @@ block0(v0: f32x4): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; xorps %xmm5, %xmm5, %xmm5 -; movdqa %xmm0, %xmm9 -; maxps %xmm9, %xmm5, %xmm9 -; pcmpeqd %xmm5, %xmm5, %xmm5 -; psrld %xmm5, $1, %xmm5 -; cvtdq2ps %xmm5, %xmm13 -; cvttps2dq %xmm9, %xmm12 -; subps %xmm9, %xmm13, %xmm9 -; cmpps $2, %xmm13, %xmm9, %xmm13 -; cvttps2dq %xmm9, %xmm0 -; pxor %xmm0, %xmm13, %xmm0 -; pxor %xmm6, %xmm6, %xmm6 -; pmaxsd %xmm0, %xmm6, %xmm0 -; paddd %xmm0, %xmm12, %xmm0 +; uninit %xmm6 +; xorps %xmm6, %xmm6, %xmm6 +; movdqa %xmm0, %xmm11 +; maxps %xmm11, %xmm6, %xmm11 +; pcmpeqd %xmm6, %xmm6, %xmm6 +; psrld %xmm6, $1, %xmm6 +; cvtdq2ps %xmm6, %xmm15 +; cvttps2dq %xmm11, %xmm14 +; subps %xmm11, %xmm15, %xmm11 +; cmpps $2, %xmm15, %xmm11, %xmm15 +; cvttps2dq %xmm11, %xmm0 +; pxor %xmm0, %xmm15, %xmm0 +; uninit %xmm9 +; pxor %xmm9, %xmm9, %xmm9 +; pmaxsd %xmm0, %xmm9, %xmm0 +; paddd %xmm0, %xmm14, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -1055,20 +1057,20 @@ block0(v0: f32x4): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; xorps %xmm5, %xmm5 -; movdqa %xmm0, %xmm9 -; maxps %xmm5, %xmm9 -; pcmpeqd %xmm5, %xmm5 -; psrld $1, %xmm5 -; cvtdq2ps %xmm5, %xmm13 -; cvttps2dq %xmm9, %xmm12 -; subps %xmm13, %xmm9 -; cmpleps %xmm9, %xmm13 -; cvttps2dq %xmm9, %xmm0 -; pxor %xmm13, %xmm0 -; pxor %xmm6, %xmm6 -; pmaxsd %xmm6, %xmm0 -; paddd %xmm12, %xmm0 +; xorps %xmm6, %xmm6 +; movdqa %xmm0, %xmm11 +; maxps %xmm6, %xmm11 +; pcmpeqd %xmm6, %xmm6 +; psrld $1, %xmm6 +; cvtdq2ps %xmm6, %xmm15 +; cvttps2dq %xmm11, %xmm14 +; subps %xmm15, %xmm11 +; cmpleps %xmm11, %xmm15 +; cvttps2dq %xmm11, %xmm0 +; pxor %xmm15, %xmm0 +; pxor %xmm9, %xmm9 +; pmaxsd %xmm9, %xmm0 +; paddd %xmm14, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq diff --git a/cranelift/filetests/filetests/isa/x64/float-avx.clif b/cranelift/filetests/filetests/isa/x64/float-avx.clif index 6776bb529a..4e29340a3b 100644 --- a/cranelift/filetests/filetests/isa/x64/float-avx.clif +++ b/cranelift/filetests/filetests/isa/x64/float-avx.clif @@ -589,3 +589,59 @@ block0(v0: f64x2): ; addb %al, (%rax) ; sarb $0xff, %bh +function %load_and_store_f32(i64, i64) { +block0(v0: i64, v1: i64): + v2 = load.f32 v0 + store v2, v1 + return +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vmovss 0(%rdi), %xmm3 +; vmovss %xmm3, 0(%rsi) +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vmovss (%rdi), %xmm3 ; trap: heap_oob +; vmovss %xmm3, (%rsi) ; trap: heap_oob +; movq %rbp, %rsp +; popq %rbp +; retq + +function %load_and_store_f64(i64, i64) { +block0(v0: i64, v1: i64): + v2 = load.f64 v0 + store v2, v1 + return +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vmovsd 0(%rdi), %xmm3 +; vmovsd %xmm3, 0(%rsi) +; movq %rbp, %rsp +; popq %rbp +; ret +; 
+; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vmovsd (%rdi), %xmm3 ; trap: heap_oob +; vmovsd %xmm3, (%rsi) ; trap: heap_oob +; movq %rbp, %rsp +; popq %rbp +; retq + diff --git a/cranelift/filetests/filetests/isa/x64/fneg.clif b/cranelift/filetests/filetests/isa/x64/fneg.clif index d990c31c14..4d0b2535ec 100644 --- a/cranelift/filetests/filetests/isa/x64/fneg.clif +++ b/cranelift/filetests/filetests/isa/x64/fneg.clif @@ -69,9 +69,10 @@ block0(v0: f32x4): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; pcmpeqd %xmm3, %xmm3, %xmm3 -; pslld %xmm3, $31, %xmm3 -; xorps %xmm0, %xmm3, %xmm0 +; uninit %xmm4 +; pcmpeqd %xmm4, %xmm4, %xmm4 +; pslld %xmm4, $31, %xmm4 +; xorps %xmm0, %xmm4, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -81,9 +82,9 @@ block0(v0: f32x4): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; pcmpeqd %xmm3, %xmm3 -; pslld $0x1f, %xmm3 -; xorps %xmm3, %xmm0 +; pcmpeqd %xmm4, %xmm4 +; pslld $0x1f, %xmm4 +; xorps %xmm4, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq @@ -98,9 +99,10 @@ block0(v0: f64x2): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; pcmpeqd %xmm3, %xmm3, %xmm3 -; psllq %xmm3, $63, %xmm3 -; xorpd %xmm0, %xmm3, %xmm0 +; uninit %xmm4 +; pcmpeqd %xmm4, %xmm4, %xmm4 +; psllq %xmm4, $63, %xmm4 +; xorpd %xmm0, %xmm4, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -110,9 +112,9 @@ block0(v0: f64x2): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; pcmpeqd %xmm3, %xmm3 -; psllq $0x3f, %xmm3 -; xorpd %xmm3, %xmm0 +; pcmpeqd %xmm4, %xmm4 +; psllq $0x3f, %xmm4 +; xorpd %xmm4, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq diff --git a/cranelift/filetests/filetests/isa/x64/insertlane.clif b/cranelift/filetests/filetests/isa/x64/insertlane.clif new file mode 100644 index 0000000000..7f2569d028 --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/insertlane.clif @@ -0,0 +1,82 @@ +test compile precise-output +set enable_simd +target x86_64 has_avx + +function %insertlane_f64x2_zero(f64x2, f64) -> f64x2 { +block0(v0: f64x2, v1: f64): + v2 = insertlane v0, v1, 0 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vmovsd %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vmovsd %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %insertlane_f64x2_one(f64x2, f64) -> f64x2 { +block0(v0: f64x2, v1: f64): + v2 = insertlane v0, v1, 1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vmovlhps %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vmovlhps %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %insertlane_f64x2_zero_with_load(f64x2, i64) -> f64x2 { +block0(v0: f64x2, v1: i64): + v2 = load.f64 v1 + v3 = insertlane v0, v2, 0 + return v3 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vmovsd 0(%rdi), %xmm3 +; vmovsd %xmm0, %xmm3, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vmovsd (%rdi), %xmm3 ; trap: heap_oob +; vmovsd %xmm3, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + diff --git a/cranelift/filetests/filetests/isa/x64/shuffle.clif b/cranelift/filetests/filetests/isa/x64/shuffle.clif index b056d9f168..ce1d1b4842 100644 --- a/cranelift/filetests/filetests/isa/x64/shuffle.clif +++ 
b/cranelift/filetests/filetests/isa/x64/shuffle.clif @@ -626,8 +626,9 @@ block0(v0: i8x16, v1: i8x16): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; pxor %xmm3, %xmm3, %xmm3 -; pshufb %xmm0, %xmm3, %xmm0 +; uninit %xmm4 +; pxor %xmm4, %xmm4, %xmm4 +; pshufb %xmm0, %xmm4, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -637,8 +638,8 @@ block0(v0: i8x16, v1: i8x16): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; pxor %xmm3, %xmm3 -; pshufb %xmm3, %xmm0 +; pxor %xmm4, %xmm4 +; pshufb %xmm4, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq diff --git a/cranelift/filetests/filetests/isa/x64/simd-arith-avx.clif b/cranelift/filetests/filetests/isa/x64/simd-arith-avx.clif index fdd58032ea..87c1b77101 100644 --- a/cranelift/filetests/filetests/isa/x64/simd-arith-avx.clif +++ b/cranelift/filetests/filetests/isa/x64/simd-arith-avx.clif @@ -610,8 +610,8 @@ block0(v0: i32x4, v1: i32x4): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; pshufd $250, %xmm0, %xmm3 -; pshufd $250, %xmm1, %xmm5 +; vpshufd $250, %xmm0, %xmm3 +; vpshufd $250, %xmm1, %xmm5 ; vpmuldq %xmm3, %xmm5, %xmm0 ; movq %rbp, %rsp ; popq %rbp @@ -622,8 +622,8 @@ block0(v0: i32x4, v1: i32x4): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; pshufd $0xfa, %xmm0, %xmm3 -; pshufd $0xfa, %xmm1, %xmm5 +; vpshufd $0xfa, %xmm0, %xmm3 +; vpshufd $0xfa, %xmm1, %xmm5 ; vpmuldq %xmm5, %xmm3, %xmm0 ; movq %rbp, %rsp ; popq %rbp @@ -641,8 +641,8 @@ block0(v0: i32x4, v1: i32x4): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; pshufd $80, %xmm0, %xmm3 -; pshufd $80, %xmm1, %xmm5 +; vpshufd $80, %xmm0, %xmm3 +; vpshufd $80, %xmm1, %xmm5 ; vpmuludq %xmm3, %xmm5, %xmm0 ; movq %rbp, %rsp ; popq %rbp @@ -653,8 +653,8 @@ block0(v0: i32x4, v1: i32x4): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; pshufd $0x50, %xmm0, %xmm3 -; pshufd $0x50, %xmm1, %xmm5 +; vpshufd $0x50, %xmm0, %xmm3 +; vpshufd $0x50, %xmm1, %xmm5 ; vpmuludq %xmm5, %xmm3, %xmm0 ; movq %rbp, %rsp ; popq %rbp @@ -1233,7 +1233,7 @@ block0(v0: i8x16): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; movdqu const(0), %xmm2 +; vmovdqu const(0), %xmm2 ; vpmaddubsw %xmm2, %xmm0, %xmm0 ; movq %rbp, %rsp ; popq %rbp @@ -1244,7 +1244,7 @@ block0(v0: i8x16): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; movdqu 0x14(%rip), %xmm2 +; vmovdqu 0x14(%rip), %xmm2 ; vpmaddubsw %xmm0, %xmm2, %xmm0 ; movq %rbp, %rsp ; popq %rbp @@ -1317,8 +1317,9 @@ block0(v0: i8): ; block0: ; uninit %xmm2 ; vpinsrb $0 %xmm2, %rdi, %xmm4 -; pxor %xmm6, %xmm6, %xmm6 -; vpshufb %xmm4, %xmm6, %xmm0 +; uninit %xmm6 +; vpxor %xmm6, %xmm6, %xmm8 +; vpshufb %xmm4, %xmm8, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -1329,8 +1330,8 @@ block0(v0: i8): ; movq %rsp, %rbp ; block1: ; offset 0x4 ; vpinsrb $0, %edi, %xmm2, %xmm4 -; pxor %xmm6, %xmm6 -; vpshufb %xmm6, %xmm4, %xmm0 +; vpxor %xmm6, %xmm6, %xmm8 +; vpshufb %xmm8, %xmm4, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq @@ -1347,12 +1348,13 @@ block0(v0: f64x2): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; xorpd %xmm2, %xmm2, %xmm2 -; vmaxpd %xmm0, %xmm2, %xmm4 -; vminpd %xmm4, const(0), %xmm6 -; vroundpd $3, %xmm6, %xmm8 -; vaddpd %xmm8, const(1), %xmm10 -; vshufps $136 %xmm10, %xmm2, %xmm0 +; uninit %xmm2 +; vxorpd %xmm2, %xmm2, %xmm4 +; vmaxpd %xmm0, %xmm4, %xmm6 +; vminpd %xmm6, const(0), %xmm8 +; vroundpd $3, %xmm8, %xmm10 +; vaddpd %xmm10, const(1), %xmm12 +; vshufps $136 %xmm12, %xmm4, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -1362,12 +1364,12 @@ block0(v0: f64x2): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; xorpd %xmm2, %xmm2 -; vmaxpd %xmm2, %xmm0, 
%xmm4 -; vminpd 0x1c(%rip), %xmm4, %xmm6 -; vroundpd $3, %xmm6, %xmm8 -; vaddpd 0x1e(%rip), %xmm8, %xmm10 -; vshufps $0x88, %xmm2, %xmm10, %xmm0 +; vxorpd %xmm2, %xmm2, %xmm4 +; vmaxpd %xmm4, %xmm0, %xmm6 +; vminpd 0x1c(%rip), %xmm6, %xmm8 +; vroundpd $3, %xmm8, %xmm10 +; vaddpd 0x1e(%rip), %xmm10, %xmm12 +; vshufps $0x88, %xmm4, %xmm12, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq @@ -1392,7 +1394,7 @@ block0(v0: i8x16, v1: i32): ; vpsllw %xmm0, %xmm5, %xmm7 ; lea const(0), %rsi ; shlq $4, %r10, %r10 -; movdqu 0(%rsi,%r10,1), %xmm13 +; vmovdqu 0(%rsi,%r10,1), %xmm13 ; vpand %xmm7, %xmm13, %xmm0 ; movq %rbp, %rsp ; popq %rbp @@ -1409,7 +1411,7 @@ block0(v0: i8x16, v1: i32): ; vpsllw %xmm5, %xmm0, %xmm7 ; leaq 0x15(%rip), %rsi ; shlq $4, %r10 -; movdqu (%rsi, %r10), %xmm13 +; vmovdqu (%rsi, %r10), %xmm13 ; vpand %xmm13, %xmm7, %xmm0 ; movq %rbp, %rsp ; popq %rbp @@ -1427,7 +1429,7 @@ block0(v0: i8x16): ; movq %rsp, %rbp ; block0: ; vpsllw %xmm0, $1, %xmm2 -; movdqu const(0), %xmm4 +; vmovdqu const(0), %xmm4 ; vpand %xmm2, %xmm4, %xmm0 ; movq %rbp, %rsp ; popq %rbp @@ -1439,7 +1441,7 @@ block0(v0: i8x16): ; movq %rsp, %rbp ; block1: ; offset 0x4 ; vpsllw $1, %xmm0, %xmm2 -; movdqu 0xf(%rip), %xmm4 +; vmovdqu 0xf(%rip), %xmm4 ; vpand %xmm4, %xmm2, %xmm0 ; movq %rbp, %rsp ; popq %rbp diff --git a/cranelift/filetests/filetests/isa/x64/simd-bitwise-avx.clif b/cranelift/filetests/filetests/isa/x64/simd-bitwise-avx.clif index 4cf4352956..7da79a0cc9 100644 --- a/cranelift/filetests/filetests/isa/x64/simd-bitwise-avx.clif +++ b/cranelift/filetests/filetests/isa/x64/simd-bitwise-avx.clif @@ -187,9 +187,10 @@ block0(v0: f32x4): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; pcmpeqd %xmm2, %xmm2, %xmm2 -; vpsrld %xmm2, $1, %xmm4 -; vandps %xmm0, %xmm4, %xmm0 +; uninit %xmm2 +; vpcmpeqd %xmm2, %xmm2, %xmm4 +; vpsrld %xmm4, $1, %xmm6 +; vandps %xmm0, %xmm6, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -199,9 +200,9 @@ block0(v0: f32x4): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; pcmpeqd %xmm2, %xmm2 -; vpsrld $1, %xmm2, %xmm4 -; vandps %xmm4, %xmm0, %xmm0 +; vpcmpeqd %xmm2, %xmm2, %xmm4 +; vpsrld $1, %xmm4, %xmm6 +; vandps %xmm6, %xmm0, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq diff --git a/cranelift/filetests/filetests/isa/x64/simd-comparison-legalize.clif b/cranelift/filetests/filetests/isa/x64/simd-comparison-legalize.clif index 80e7fa248a..9dd8be2b13 100644 --- a/cranelift/filetests/filetests/isa/x64/simd-comparison-legalize.clif +++ b/cranelift/filetests/filetests/isa/x64/simd-comparison-legalize.clif @@ -13,8 +13,9 @@ block0(v0: i32x4, v1: i32x4): ; movq %rsp, %rbp ; block0: ; pcmpeqd %xmm0, %xmm1, %xmm0 -; pcmpeqd %xmm5, %xmm5, %xmm5 -; pxor %xmm0, %xmm5, %xmm0 +; uninit %xmm6 +; pcmpeqd %xmm6, %xmm6, %xmm6 +; pxor %xmm0, %xmm6, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -25,8 +26,8 @@ block0(v0: i32x4, v1: i32x4): ; movq %rsp, %rbp ; block1: ; offset 0x4 ; pcmpeqd %xmm1, %xmm0 -; pcmpeqd %xmm5, %xmm5 -; pxor %xmm5, %xmm0 +; pcmpeqd %xmm6, %xmm6 +; pxor %xmm6, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq @@ -43,8 +44,9 @@ block0(v0: i32x4, v1: i32x4): ; block0: ; pmaxud %xmm0, %xmm1, %xmm0 ; pcmpeqd %xmm0, %xmm1, %xmm0 -; pcmpeqd %xmm7, %xmm7, %xmm7 -; pxor %xmm0, %xmm7, %xmm0 +; uninit %xmm8 +; pcmpeqd %xmm8, %xmm8, %xmm8 +; pxor %xmm0, %xmm8, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -56,8 +58,8 @@ block0(v0: i32x4, v1: i32x4): ; block1: ; offset 0x4 ; pmaxud %xmm1, %xmm0 ; pcmpeqd %xmm1, %xmm0 -; pcmpeqd %xmm7, %xmm7 -; pxor %xmm7, %xmm0 +; pcmpeqd %xmm8, %xmm8 +; pxor %xmm8, %xmm0 
; movq %rbp, %rsp ; popq %rbp ; retq diff --git a/cranelift/filetests/filetests/isa/x64/simd-lane-access-compile.clif b/cranelift/filetests/filetests/isa/x64/simd-lane-access-compile.clif index f414054edb..f2ce65fec4 100644 --- a/cranelift/filetests/filetests/isa/x64/simd-lane-access-compile.clif +++ b/cranelift/filetests/filetests/isa/x64/simd-lane-access-compile.clif @@ -172,8 +172,9 @@ block0(v0: i8): ; block0: ; uninit %xmm0 ; pinsrb $0, %xmm0, %rdi, %xmm0 -; pxor %xmm6, %xmm6, %xmm6 -; pshufb %xmm0, %xmm6, %xmm0 +; uninit %xmm7 +; pxor %xmm7, %xmm7, %xmm7 +; pshufb %xmm0, %xmm7, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -184,8 +185,8 @@ block0(v0: i8): ; movq %rsp, %rbp ; block1: ; offset 0x4 ; pinsrb $0, %edi, %xmm0 -; pxor %xmm6, %xmm6 -; pshufb %xmm6, %xmm0 +; pxor %xmm7, %xmm7 +; pshufb %xmm7, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq diff --git a/cranelift/filetests/filetests/isa/x64/simd-load-avx.clif b/cranelift/filetests/filetests/isa/x64/simd-load-avx.clif index df4f25a996..6f9856e662 100644 --- a/cranelift/filetests/filetests/isa/x64/simd-load-avx.clif +++ b/cranelift/filetests/filetests/isa/x64/simd-load-avx.clif @@ -152,3 +152,87 @@ block0(v0: i64): ; popq %rbp ; retq +function %load_store_i8x16(i64, i64) { +block0(v0: i64, v1: i64): + v2 = load.i8x16 v0 + store v2, v1 + return +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vmovdqu 0(%rdi), %xmm3 +; vmovdqu %xmm3, 0(%rsi) +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vmovdqu (%rdi), %xmm3 ; trap: heap_oob +; vmovdqu %xmm3, (%rsi) ; trap: heap_oob +; movq %rbp, %rsp +; popq %rbp +; retq + +function %load_store_f32x4(i64, i64) { +block0(v0: i64, v1: i64): + v2 = load.f32x4 v0 + store v2, v1 + return +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vmovups 0(%rdi), %xmm3 +; vmovups %xmm3, 0(%rsi) +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vmovups (%rdi), %xmm3 ; trap: heap_oob +; vmovups %xmm3, (%rsi) ; trap: heap_oob +; movq %rbp, %rsp +; popq %rbp +; retq + +function %load_store_f64x2(i64, i64) { +block0(v0: i64, v1: i64): + v2 = load.f64x2 v0 + store v2, v1 + return +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vmovupd 0(%rdi), %xmm3 +; vmovupd %xmm3, 0(%rsi) +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vmovupd (%rdi), %xmm3 ; trap: heap_oob +; vmovupd %xmm3, (%rsi) ; trap: heap_oob +; movq %rbp, %rsp +; popq %rbp +; retq + diff --git a/cranelift/filetests/filetests/isa/x64/simd-logical-compile.clif b/cranelift/filetests/filetests/isa/x64/simd-logical-compile.clif index a4098176d0..60ad9a419f 100644 --- a/cranelift/filetests/filetests/isa/x64/simd-logical-compile.clif +++ b/cranelift/filetests/filetests/isa/x64/simd-logical-compile.clif @@ -12,8 +12,9 @@ block0(v0: i32x4): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; pcmpeqd %xmm2, %xmm2, %xmm2 -; pxor %xmm0, %xmm2, %xmm0 +; uninit %xmm3 +; pcmpeqd %xmm3, %xmm3, %xmm3 +; pxor %xmm0, %xmm3, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -23,8 +24,8 @@ block0(v0: i32x4): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; pcmpeqd %xmm2, %xmm2 -; pxor %xmm2, %xmm0 +; pcmpeqd %xmm3, %xmm3 +; pxor %xmm3, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq @@ -66,10 +67,11 @@ block0(v0: i64x2): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; pxor %xmm2, 
%xmm2, %xmm2 -; movdqa %xmm0, %xmm4 -; pcmpeqq %xmm4, %xmm2, %xmm4 -; ptest %xmm4, %xmm4 +; uninit %xmm3 +; pxor %xmm3, %xmm3, %xmm3 +; movdqa %xmm0, %xmm6 +; pcmpeqq %xmm6, %xmm3, %xmm6 +; ptest %xmm6, %xmm6 ; setz %al ; movq %rbp, %rsp ; popq %rbp @@ -80,10 +82,10 @@ block0(v0: i64x2): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; pxor %xmm2, %xmm2 -; movdqa %xmm0, %xmm4 -; pcmpeqq %xmm2, %xmm4 -; ptest %xmm4, %xmm4 +; pxor %xmm3, %xmm3 +; movdqa %xmm0, %xmm6 +; pcmpeqq %xmm3, %xmm6 +; ptest %xmm6, %xmm6 ; sete %al ; movq %rbp, %rsp ; popq %rbp diff --git a/cranelift/filetests/filetests/isa/x64/uunarrow.clif b/cranelift/filetests/filetests/isa/x64/uunarrow.clif index 643b32f8e7..7bae44a658 100644 --- a/cranelift/filetests/filetests/isa/x64/uunarrow.clif +++ b/cranelift/filetests/filetests/isa/x64/uunarrow.clif @@ -13,13 +13,14 @@ block0(v0: f64x2): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; xorpd %xmm2, %xmm2, %xmm2 -; movdqa %xmm0, %xmm5 -; maxpd %xmm5, %xmm2, %xmm5 -; minpd %xmm5, const(0), %xmm5 -; roundpd $3, %xmm5, %xmm0 +; uninit %xmm3 +; xorpd %xmm3, %xmm3, %xmm3 +; movdqa %xmm0, %xmm7 +; maxpd %xmm7, %xmm3, %xmm7 +; minpd %xmm7, const(0), %xmm7 +; roundpd $3, %xmm7, %xmm0 ; addpd %xmm0, const(1), %xmm0 -; shufps $136, %xmm0, %xmm2, %xmm0 +; shufps $136, %xmm0, %xmm3, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -29,13 +30,13 @@ block0(v0: f64x2): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; xorpd %xmm2, %xmm2 -; movdqa %xmm0, %xmm5 -; maxpd %xmm2, %xmm5 -; minpd 0x18(%rip), %xmm5 -; roundpd $3, %xmm5, %xmm0 +; xorpd %xmm3, %xmm3 +; movdqa %xmm0, %xmm7 +; maxpd %xmm3, %xmm7 +; minpd 0x18(%rip), %xmm7 +; roundpd $3, %xmm7, %xmm0 ; addpd 0x1a(%rip), %xmm0 -; shufps $0x88, %xmm2, %xmm0 +; shufps $0x88, %xmm3, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq diff --git a/cranelift/filetests/filetests/runtests/simd-insertlane.clif b/cranelift/filetests/filetests/runtests/simd-insertlane.clif index e0965d8324..6e0182b1cb 100644 --- a/cranelift/filetests/filetests/runtests/simd-insertlane.clif +++ b/cranelift/filetests/filetests/runtests/simd-insertlane.clif @@ -4,6 +4,7 @@ target aarch64 target s390x set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41 +target x86_64 has_sse3 has_ssse3 has_sse41 has_avx function %insertlane_15(i8x16, i8) -> i8x16 { block0(v0: i8x16, v1: i8): @@ -32,3 +33,17 @@ block0(v0: i64x2, v1: i64): return v2 } ; run: %insertlane_0([1 1], 5000000000) == [5000000000 1] + +function %insertlane_0_in_f64x2(f64x2, f64) -> f64x2 { +block0(v0: f64x2, v1: f64): + v2 = insertlane v0, v1, 0 + return v2 +} +; run: %insertlane_0_in_f64x2([0x1.0 0x2.0], 0x3.0) == [0x3.0 0x2.0] + +function %insertlane_1_in_f64x2(f64x2, f64) -> f64x2 { +block0(v0: f64x2, v1: f64): + v2 = insertlane v0, v1, 1 + return v2 +} +; run: %insertlane_1_in_f64x2([0x1.0 0x2.0], 0x3.0) == [0x1.0 0x3.0] diff --git a/cranelift/filetests/filetests/wasm/x64-relaxed-simd-deterministic.wat b/cranelift/filetests/filetests/wasm/x64-relaxed-simd-deterministic.wat index f3ae7c7358..8d7eca8c0d 100644 --- a/cranelift/filetests/filetests/wasm/x64-relaxed-simd-deterministic.wat +++ b/cranelift/filetests/filetests/wasm/x64-relaxed-simd-deterministic.wat @@ -63,19 +63,21 @@ ;; movq %rsp, %rbp ;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 } ;; block0: -;; xorps %xmm3, %xmm3, %xmm3 -;; vmaxps %xmm0, %xmm3, %xmm5 -;; vpcmpeqd %xmm3, %xmm3, %xmm7 -;; vpsrld %xmm7, $1, %xmm9 -;; vcvtdq2ps %xmm9, %xmm11 -;; vcvttps2dq %xmm5, %xmm13 -;; vsubps %xmm5, %xmm11, 
%xmm15 -;; vcmpps $2 %xmm11, %xmm15, %xmm1 -;; vcvttps2dq %xmm15, %xmm3 -;; vpxor %xmm3, %xmm1, %xmm5 -;; pxor %xmm7, %xmm7, %xmm7 -;; vpmaxsd %xmm5, %xmm7, %xmm9 -;; vpaddd %xmm9, %xmm13, %xmm0 +;; uninit %xmm3 +;; vxorps %xmm3, %xmm3, %xmm5 +;; vmaxps %xmm0, %xmm5, %xmm7 +;; vpcmpeqd %xmm5, %xmm5, %xmm9 +;; vpsrld %xmm9, $1, %xmm11 +;; vcvtdq2ps %xmm11, %xmm13 +;; vcvttps2dq %xmm7, %xmm15 +;; vsubps %xmm7, %xmm13, %xmm1 +;; vcmpps $2 %xmm13, %xmm1, %xmm3 +;; vcvttps2dq %xmm1, %xmm5 +;; vpxor %xmm5, %xmm3, %xmm7 +;; uninit %xmm9 +;; vpxor %xmm9, %xmm9, %xmm11 +;; vpmaxsd %xmm7, %xmm11, %xmm13 +;; vpaddd %xmm13, %xmm15, %xmm0 ;; jmp label1 ;; block1: ;; movq %rbp, %rsp @@ -104,12 +106,13 @@ ;; movq %rsp, %rbp ;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 } ;; block0: -;; xorpd %xmm3, %xmm3, %xmm3 -;; vmaxpd %xmm0, %xmm3, %xmm5 -;; vminpd %xmm5, const(0), %xmm7 -;; vroundpd $3, %xmm7, %xmm9 -;; vaddpd %xmm9, const(1), %xmm11 -;; vshufps $136 %xmm11, %xmm3, %xmm0 +;; uninit %xmm3 +;; vxorpd %xmm3, %xmm3, %xmm5 +;; vmaxpd %xmm0, %xmm5, %xmm7 +;; vminpd %xmm7, const(0), %xmm9 +;; vroundpd $3, %xmm9, %xmm11 +;; vaddpd %xmm11, const(1), %xmm13 +;; vshufps $136 %xmm13, %xmm5, %xmm0 ;; jmp label1 ;; block1: ;; movq %rbp, %rsp diff --git a/cranelift/filetests/filetests/wasm/x64-relaxed-simd.wat b/cranelift/filetests/filetests/wasm/x64-relaxed-simd.wat index 43586fcb2c..cef3910e4d 100644 --- a/cranelift/filetests/filetests/wasm/x64-relaxed-simd.wat +++ b/cranelift/filetests/filetests/wasm/x64-relaxed-simd.wat @@ -55,20 +55,22 @@ ;; movq %rsp, %rbp ;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 } ;; block0: -;; xorps %xmm6, %xmm6, %xmm6 -;; movdqa %xmm0, %xmm10 -;; maxps %xmm10, %xmm6, %xmm10 -;; pcmpeqd %xmm6, %xmm6, %xmm6 -;; psrld %xmm6, $1, %xmm6 -;; cvtdq2ps %xmm6, %xmm14 -;; cvttps2dq %xmm10, %xmm13 -;; subps %xmm10, %xmm14, %xmm10 -;; cmpps $2, %xmm14, %xmm10, %xmm14 -;; cvttps2dq %xmm10, %xmm0 -;; pxor %xmm0, %xmm14, %xmm0 -;; pxor %xmm7, %xmm7, %xmm7 -;; pmaxsd %xmm0, %xmm7, %xmm0 -;; paddd %xmm0, %xmm13, %xmm0 +;; uninit %xmm7 +;; xorps %xmm7, %xmm7, %xmm7 +;; movdqa %xmm0, %xmm12 +;; maxps %xmm12, %xmm7, %xmm12 +;; pcmpeqd %xmm7, %xmm7, %xmm7 +;; psrld %xmm7, $1, %xmm7 +;; cvtdq2ps %xmm7, %xmm1 +;; cvttps2dq %xmm12, %xmm15 +;; subps %xmm12, %xmm1, %xmm12 +;; cmpps $2, %xmm1, %xmm12, %xmm1 +;; cvttps2dq %xmm12, %xmm0 +;; pxor %xmm0, %xmm1, %xmm0 +;; uninit %xmm10 +;; pxor %xmm10, %xmm10, %xmm10 +;; pmaxsd %xmm0, %xmm10, %xmm0 +;; paddd %xmm0, %xmm15, %xmm0 ;; jmp label1 ;; block1: ;; movq %rbp, %rsp @@ -94,13 +96,14 @@ ;; movq %rsp, %rbp ;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 } ;; block0: -;; xorpd %xmm3, %xmm3, %xmm3 -;; movdqa %xmm0, %xmm6 -;; maxpd %xmm6, %xmm3, %xmm6 -;; minpd %xmm6, const(0), %xmm6 -;; roundpd $3, %xmm6, %xmm0 +;; uninit %xmm4 +;; xorpd %xmm4, %xmm4, %xmm4 +;; movdqa %xmm0, %xmm8 +;; maxpd %xmm8, %xmm4, %xmm8 +;; minpd %xmm8, const(0), %xmm8 +;; roundpd $3, %xmm8, %xmm0 ;; addpd %xmm0, const(1), %xmm0 -;; shufps $136, %xmm0, %xmm3, %xmm0 +;; shufps $136, %xmm0, %xmm4, %xmm0 ;; jmp label1 ;; block1: ;; movq %rbp, %rsp