diff --git a/cranelift/codegen/src/isa/x64/inst.isle b/cranelift/codegen/src/isa/x64/inst.isle
index b74b9c38c0..1fdc6b25c0 100644
--- a/cranelift/codegen/src/isa/x64/inst.isle
+++ b/cranelift/codegen/src/isa/x64/inst.isle
@@ -320,6 +320,21 @@
     (dst WritableGpr)
     (imm u8))
 
+  ;; XMM (scalar) unary op (from integer to float reg): vmovd, vmovq,
+  ;; vcvtsi2s{s,d}
+  (GprToXmmVex (op AvxOpcode)
+               (src GprMem)
+               (dst WritableXmm)
+               (src_size OperandSize))
+
+  ;; XMM (scalar) unary op (from xmm to integer reg): vmovd, vmovq,
+  ;; vcvtts{s,d}2si
+  (XmmToGprVex (op AvxOpcode)
+               (src Xmm)
+               (dst WritableGpr)
+               (dst_size OperandSize))
+
+
   ;; XMM (scalar or vector) binary op that relies on the EVEX
   ;; prefix. Takes two inputs.
   (XmmRmREvex (op Avx512Opcode)
@@ -1277,6 +1292,13 @@
   Vpbroadcastw
   Vpbroadcastd
   Vbroadcastss
+  Vmovd
+  Vmovq
+  Vmovmskps
+  Vmovmskpd
+  Vpmovmskb
+  Vcvtsi2ss
+  Vcvtsi2sd
 ))
 
 (type Avx512Opcode extern
@@ -1539,6 +1561,10 @@
 (decl lo_gpr (Value) Gpr)
 (rule (lo_gpr regs) (gpr_new (lo_reg regs)))
 
+;; Construct a new `XmmMemImm` from a 32-bit immediate.
+(decl xmi_imm (u32) XmmMemImm)
+(extern constructor xmi_imm xmi_imm)
+
 ;;;; Helpers for Working With Integer Comparison Codes ;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;
@@ -1818,10 +1844,7 @@
 (decl mov_rmi_to_xmm (RegMemImm) XmmMemImm)
 (rule (mov_rmi_to_xmm rmi @ (RegMemImm.Mem _)) (xmm_mem_imm_new rmi))
 (rule (mov_rmi_to_xmm rmi @ (RegMemImm.Imm _)) (xmm_mem_imm_new rmi))
-(rule (mov_rmi_to_xmm (RegMemImm.Reg r))
-      (gpr_to_xmm (SseOpcode.Movd)
-                  r
-                  (OperandSize.Size32)))
+(rule (mov_rmi_to_xmm (RegMemImm.Reg r)) (x64_movd_to_xmm r))
 
 ;;;; Helpers for Emitting Calls ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -1941,9 +1964,37 @@
       (if-let $true (use_avx_simd))
       (xmm_movrm_vex (AvxOpcode.Vmovupd) addr data))
 
-(decl x64_movd (Xmm) Gpr)
-(rule (x64_movd from)
+;; Helper for creating `movd` instructions (xmm to gpr).
+(decl x64_movd_to_gpr (Xmm) Gpr)
+(rule (x64_movd_to_gpr from)
       (xmm_to_gpr (SseOpcode.Movd) from (OperandSize.Size32)))
+(rule 1 (x64_movd_to_gpr from)
+      (if-let $true (use_avx_simd))
+      (xmm_to_gpr_vex (AvxOpcode.Vmovd) from (OperandSize.Size32)))
+
+;; Helper for creating `movd` instructions (gpr to xmm).
+(decl x64_movd_to_xmm (GprMem) Xmm)
+(rule (x64_movd_to_xmm from)
+      (gpr_to_xmm (SseOpcode.Movd) from (OperandSize.Size32)))
+(rule 1 (x64_movd_to_xmm from)
+      (if-let $true (use_avx_simd))
+      (gpr_to_xmm_vex (AvxOpcode.Vmovd) from (OperandSize.Size32)))
+
+;; Helper for creating `movq` instructions (gpr to xmm).
+(decl x64_movq_to_xmm (GprMem) Xmm)
+(rule (x64_movq_to_xmm src)
+      (gpr_to_xmm (SseOpcode.Movq) src (OperandSize.Size64)))
+(rule 1 (x64_movq_to_xmm src)
+      (if-let $true (use_avx_simd))
+      (gpr_to_xmm_vex (AvxOpcode.Vmovq) src (OperandSize.Size64)))
+
+;; Helper for creating `movq` instructions (xmm to gpr).
+(decl x64_movq_to_gpr (Xmm) Gpr)
+(rule (x64_movq_to_gpr src)
+      (xmm_to_gpr (SseOpcode.Movq) src (OperandSize.Size64)))
+(rule 1 (x64_movq_to_gpr src)
+      (if-let $true (use_avx_simd))
+      (xmm_to_gpr_vex (AvxOpcode.Vmovq) src (OperandSize.Size64)))
 
 (decl x64_movdqu_load (XmmMem) Xmm)
 (rule (x64_movdqu_load from)
@@ -2186,15 +2237,11 @@
 
 ;; `f32` immediates.
 (rule 2 (imm $F32 (u64_nonzero bits))
-      (gpr_to_xmm (SseOpcode.Movd)
-                  (imm $I32 bits)
-                  (OperandSize.Size32)))
+      (x64_movd_to_xmm (imm $I32 bits)))
 
 ;; `f64` immediates.
 (rule 2 (imm $F64 (u64_nonzero bits))
-      (gpr_to_xmm (SseOpcode.Movq)
-                  (imm $I64 bits)
-                  (OperandSize.Size64)))
+      (x64_movq_to_xmm (imm $I64 bits)))
 
 ;; Special case for when a 64-bit immediate fits into 32-bits.
We can use a ;; 32-bit move that zero-extends the value, which has a smaller encoding. @@ -3663,20 +3710,44 @@ (_ Unit (emit (MInst.XmmToGprImmVex op src dst imm)))) dst)) +;; Helper for creating `MInst.XmmToGprVex` instructions. +(decl xmm_to_gpr_vex (AvxOpcode Xmm OperandSize) Gpr) +(rule (xmm_to_gpr_vex op src size) + (let ((dst WritableGpr (temp_writable_gpr)) + (_ Unit (emit (MInst.XmmToGprVex op src dst size)))) + dst)) + +;; Helper for creating `MInst.GprToXmmVex` instructions. +(decl gpr_to_xmm_vex (AvxOpcode GprMem OperandSize) Xmm) +(rule (gpr_to_xmm_vex op src size) + (let ((dst WritableXmm (temp_writable_xmm)) + (_ Unit (emit (MInst.GprToXmmVex op src dst size)))) + dst)) + + ;; Helper for creating `pmovmskb` instructions. (decl x64_pmovmskb (OperandSize Xmm) Gpr) (rule (x64_pmovmskb size src) (xmm_to_gpr (SseOpcode.Pmovmskb) src size)) +(rule 1 (x64_pmovmskb size src) + (if-let $true (use_avx_simd)) + (xmm_to_gpr_vex (AvxOpcode.Vpmovmskb) src size)) ;; Helper for creating `movmskps` instructions. (decl x64_movmskps (OperandSize Xmm) Gpr) (rule (x64_movmskps size src) (xmm_to_gpr (SseOpcode.Movmskps) src size)) +(rule 1 (x64_movmskps size src) + (if-let $true (use_avx_simd)) + (xmm_to_gpr_vex (AvxOpcode.Vmovmskps) src size)) ;; Helper for creating `movmskpd` instructions. (decl x64_movmskpd (OperandSize Xmm) Gpr) (rule (x64_movmskpd size src) (xmm_to_gpr (SseOpcode.Movmskpd) src size)) +(rule 1 (x64_movmskpd size src) + (if-let $true (use_avx_simd)) + (xmm_to_gpr_vex (AvxOpcode.Vmovmskpd) src size)) ;; Helper for creating `MInst.GprToXmm` instructions. (decl gpr_to_xmm (SseOpcode GprMem OperandSize) Xmm) @@ -3973,11 +4044,17 @@ (decl x64_cvtsi2ss (Type GprMem) Xmm) (rule (x64_cvtsi2ss ty x) (gpr_to_xmm (SseOpcode.Cvtsi2ss) x (raw_operand_size_of_type ty))) +(rule 1 (x64_cvtsi2ss ty x) + (if-let $true (use_avx_simd)) + (gpr_to_xmm_vex (AvxOpcode.Vcvtsi2ss) x (raw_operand_size_of_type ty))) ;; Helper for creating `cvtsi2sd` instructions. (decl x64_cvtsi2sd (Type GprMem) Xmm) (rule (x64_cvtsi2sd ty x) (gpr_to_xmm (SseOpcode.Cvtsi2sd) x (raw_operand_size_of_type ty))) +(rule 1 (x64_cvtsi2sd ty x) + (if-let $true (use_avx_simd)) + (gpr_to_xmm_vex (AvxOpcode.Vcvtsi2sd) x (raw_operand_size_of_type ty))) ;; Helper for creating `cvttps2dq` instructions. 
(decl x64_cvttps2dq (XmmMem) Xmm)
@@ -4486,15 +4563,15 @@
 
 (decl bitcast_xmm_to_gpr (Type Xmm) Gpr)
 (rule (bitcast_xmm_to_gpr $F32 src)
-      (xmm_to_gpr (SseOpcode.Movd) src (OperandSize.Size32)))
+      (x64_movd_to_gpr src))
 (rule (bitcast_xmm_to_gpr $F64 src)
-      (xmm_to_gpr (SseOpcode.Movq) src (OperandSize.Size64)))
+      (x64_movq_to_gpr src))
 
 (decl bitcast_gpr_to_xmm (Type Gpr) Xmm)
 (rule (bitcast_gpr_to_xmm $I32 src)
-      (gpr_to_xmm (SseOpcode.Movd) src (OperandSize.Size32)))
+      (x64_movd_to_xmm src))
 (rule (bitcast_gpr_to_xmm $I64 src)
-      (gpr_to_xmm (SseOpcode.Movq) src (OperandSize.Size64)))
+      (x64_movq_to_xmm src))
 
 ;;;; Stack Addresses ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -4678,7 +4755,6 @@
 (convert Reg XmmMem reg_to_xmm_mem)
 (convert Reg RegMemImm reg_to_reg_mem_imm)
 (convert RegMem XmmMem reg_mem_to_xmm_mem)
-(convert RegMemImm XmmMemImm mov_rmi_to_xmm)
 (convert Xmm XmmMem xmm_to_xmm_mem)
 (convert Xmm XmmMemImm xmm_to_xmm_mem_imm)
 (convert Xmm XmmMemAligned xmm_to_xmm_mem_aligned)
diff --git a/cranelift/codegen/src/isa/x64/inst/args.rs b/cranelift/codegen/src/isa/x64/inst/args.rs
index d7a851e66d..a135fc5af1 100644
--- a/cranelift/codegen/src/isa/x64/inst/args.rs
+++ b/cranelift/codegen/src/isa/x64/inst/args.rs
@@ -1715,7 +1715,14 @@ impl AvxOpcode {
             | AvxOpcode::Vpextrq
             | AvxOpcode::Vpblendw
             | AvxOpcode::Vmovddup
-            | AvxOpcode::Vbroadcastss => {
+            | AvxOpcode::Vbroadcastss
+            | AvxOpcode::Vmovd
+            | AvxOpcode::Vmovq
+            | AvxOpcode::Vmovmskps
+            | AvxOpcode::Vmovmskpd
+            | AvxOpcode::Vpmovmskb
+            | AvxOpcode::Vcvtsi2ss
+            | AvxOpcode::Vcvtsi2sd => {
                 smallvec![InstructionSet::AVX]
             }
diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs
index 6c1dfc9fea..2b0f3af084 100644
--- a/cranelift/codegen/src/isa/x64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit.rs
@@ -2515,6 +2515,89 @@ pub(crate) fn emit(
                 .encode(sink);
         }
 
+        Inst::XmmToGprVex {
+            op,
+            src,
+            dst,
+            dst_size,
+        } => {
+            let src = allocs.next(src.to_reg());
+            let dst = allocs.next(dst.to_reg().to_reg());
+
+            let (prefix, map, opcode) = match op {
+                // vmovd/vmovq are differentiated by `w`
+                AvxOpcode::Vmovd | AvxOpcode::Vmovq => (LegacyPrefixes::_66, OpcodeMap::_0F, 0x7E),
+                AvxOpcode::Vmovmskps => (LegacyPrefixes::None, OpcodeMap::_0F, 0x50),
+                AvxOpcode::Vmovmskpd => (LegacyPrefixes::_66, OpcodeMap::_0F, 0x50),
+                AvxOpcode::Vpmovmskb => (LegacyPrefixes::_66, OpcodeMap::_0F, 0xD7),
+                _ => unimplemented!("Opcode {:?} not implemented", op),
+            };
+            let w = match dst_size {
+                OperandSize::Size64 => true,
+                _ => false,
+            };
+            let mut vex = VexInstruction::new()
+                .length(VexVectorLength::V128)
+                .w(w)
+                .prefix(prefix)
+                .map(map)
+                .opcode(opcode);
+            vex = match op {
+                // The `vmovd`/`vmovq` opcodes reverse the order of destination
+                // and source relative to other opcodes using this shape of instruction.
+                AvxOpcode::Vmovd | AvxOpcode::Vmovq => vex
+                    .rm(dst.to_real_reg().unwrap().hw_enc())
+                    .reg(src.to_real_reg().unwrap().hw_enc()),
+                _ => vex
+                    .rm(src.to_real_reg().unwrap().hw_enc())
+                    .reg(dst.to_real_reg().unwrap().hw_enc()),
+            };
+            vex.encode(sink);
+        }
+
+        Inst::GprToXmmVex {
+            op,
+            src,
+            dst,
+            src_size,
+        } => {
+            let dst = allocs.next(dst.to_reg().to_reg());
+            let src = match src.clone().to_reg_mem().with_allocs(allocs) {
+                RegMem::Reg { reg } => {
+                    RegisterOrAmode::Register(reg.to_real_reg().unwrap().hw_enc().into())
+                }
+                RegMem::Mem { addr } => RegisterOrAmode::Amode(addr.finalize(state, sink)),
+            };
+
+            let (prefix, map, opcode) = match op {
+                // vmovd/vmovq are differentiated by `w`
+                AvxOpcode::Vmovd | AvxOpcode::Vmovq => (LegacyPrefixes::_66, OpcodeMap::_0F, 0x6E),
+                AvxOpcode::Vcvtsi2ss => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0x2A),
+                AvxOpcode::Vcvtsi2sd => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x2A),
+                _ => unimplemented!("Opcode {:?} not implemented", op),
+            };
+            let w = match src_size {
+                OperandSize::Size64 => true,
+                _ => false,
+            };
+            let mut insn = VexInstruction::new()
+                .length(VexVectorLength::V128)
+                .w(w)
+                .prefix(prefix)
+                .map(map)
+                .opcode(opcode)
+                .rm(src)
+                .reg(dst.to_real_reg().unwrap().hw_enc());
+            // These opcodes technically take a second operand, which holds the
+            // upper bits to preserve during the float conversion. We don't
+            // actually use that in this backend right now, so we reuse the
+            // destination register. This at least matches what LLVM does.
+            if let AvxOpcode::Vcvtsi2ss | AvxOpcode::Vcvtsi2sd = op {
+                insn = insn.vvvv(dst.to_real_reg().unwrap().hw_enc());
+            }
+            insn.encode(sink);
+        }
+
         Inst::XmmRmREvex {
             op,
             src1,
diff --git a/cranelift/codegen/src/isa/x64/inst/mod.rs b/cranelift/codegen/src/isa/x64/inst/mod.rs
index 179368430c..f9dd071dca 100644
--- a/cranelift/codegen/src/isa/x64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/x64/inst/mod.rs
@@ -158,7 +158,9 @@ impl Inst {
             | Inst::XmmUnaryRmRImmVex { op, .. }
             | Inst::XmmMovRMVex { op, .. }
             | Inst::XmmMovRMImmVex { op, .. }
-            | Inst::XmmToGprImmVex { op, .. } => op.available_from(),
+            | Inst::XmmToGprImmVex { op, .. }
+            | Inst::XmmToGprVex { op, .. }
+            | Inst::GprToXmmVex { op, .. } => op.available_from(),
         }
     }
 }
@@ -1202,6 +1204,18 @@ impl PrettyPrint for Inst {
                 format!("{} {}, {}", ljustify(op.to_string()), src, dst)
             }
 
+            Inst::XmmToGprVex {
+                op,
+                src,
+                dst,
+                dst_size,
+            } => {
+                let dst_size = dst_size.to_bytes();
+                let src = pretty_print_reg(src.to_reg(), 8, allocs);
+                let dst = pretty_print_reg(dst.to_reg().to_reg(), dst_size, allocs);
+                format!("{} {src}, {dst}", ljustify(op.to_string()))
+            }
+
             Inst::XmmToGprImm { op, src, dst, imm } => {
                 let src = pretty_print_reg(src.to_reg(), 8, allocs);
                 let dst = pretty_print_reg(dst.to_reg().to_reg(), 8, allocs);
@@ -1225,6 +1239,17 @@ impl PrettyPrint for Inst {
                 format!("{} {}, {}", ljustify(op.to_string()), src, dst)
             }
 
+            Inst::GprToXmmVex {
+                op,
+                src,
+                src_size,
+                dst,
+            } => {
+                let dst = pretty_print_reg(dst.to_reg().to_reg(), 8, allocs);
+                let src = src.pretty_print(src_size.to_bytes(), allocs);
+                format!("{} {src}, {dst}", ljustify(op.to_string()))
+            }
+
             Inst::XmmCmpRmR { op, src, dst } => {
                 let dst = pretty_print_reg(dst.to_reg(), 8, allocs);
                 let src = src.pretty_print(8, allocs);
@@ -2082,12 +2107,13 @@ fn x64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut OperandCol
             collector.reg_fixed_nonallocatable(*dst);
         }
         Inst::XmmToGpr { src, dst, .. }
+        | Inst::XmmToGprVex { src, dst, .. }
         | Inst::XmmToGprImm { src, dst, ..
} | Inst::XmmToGprImmVex { src, dst, .. } => { collector.reg_use(src.to_reg()); collector.reg_def(dst.to_writable_reg()); } - Inst::GprToXmm { src, dst, .. } => { + Inst::GprToXmm { src, dst, .. } | Inst::GprToXmmVex { src, dst, .. } => { collector.reg_def(dst.to_writable_reg()); src.get_operands(collector); } diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle index 763fc338e0..ee7ca827bd 100644 --- a/cranelift/codegen/src/isa/x64/lower.isle +++ b/cranelift/codegen/src/isa/x64/lower.isle @@ -883,17 +883,17 @@ (let ((a0 Xmm a) (b0 Xmm b) ;; a_hi = A >> 32 - (a_hi Xmm (x64_psrlq a0 (RegMemImm.Imm 32))) + (a_hi Xmm (x64_psrlq a0 (xmi_imm 32))) ;; ah_bl = Ah * Bl (ah_bl Xmm (x64_pmuludq a_hi b0)) ;; b_hi = B >> 32 - (b_hi Xmm (x64_psrlq b0 (RegMemImm.Imm 32))) + (b_hi Xmm (x64_psrlq b0 (xmi_imm 32))) ;; al_bh = Al * Bh (al_bh Xmm (x64_pmuludq a0 b_hi)) ;; aa_bb = ah_bl + al_bh (aa_bb Xmm (x64_paddq ah_bl al_bh)) ;; aa_bb_shifted = aa_bb << 32 - (aa_bb_shifted Xmm (x64_psllq aa_bb (RegMemImm.Imm 32))) + (aa_bb_shifted Xmm (x64_psllq aa_bb (xmi_imm 32))) ;; al_bl = Al * Bl (al_bl Xmm (x64_pmuludq a0 b0))) ;; al_bl + aa_bb_shifted @@ -1087,14 +1087,12 @@ ;; Special case for `f32x4.abs`. (rule (lower (has_type $F32X4 (fabs x))) (x64_andps x - (x64_psrld (vector_all_ones) - (RegMemImm.Imm 1)))) + (x64_psrld (vector_all_ones) (xmi_imm 1)))) ;; Special case for `f64x2.abs`. (rule (lower (has_type $F64X2 (fabs x))) (x64_andpd x - (x64_psrlq (vector_all_ones) - (RegMemImm.Imm 1)))) + (x64_psrlq (vector_all_ones) (xmi_imm 1)))) ;;;; Rules for `fneg` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -1106,13 +1104,11 @@ (rule (lower (has_type $F32X4 (fneg x))) (x64_xorps x - (x64_pslld (vector_all_ones) - (RegMemImm.Imm 31)))) + (x64_pslld (vector_all_ones) (xmi_imm 31)))) (rule (lower (has_type $F64X2 (fneg x))) (x64_xorpd x - (x64_psllq (vector_all_ones) - (RegMemImm.Imm 63)))) + (x64_psllq (vector_all_ones) (xmi_imm 63)))) ;;;; Rules for `bmask` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -1918,7 +1914,7 @@ ;; Note that this is a 16x8 shift, but that's OK; we mask ;; off anything that traverses from one byte to the next ;; with the low_mask below. - (shifted_src Xmm (x64_psrlw src (RegMemImm.Imm 4))) + (shifted_src Xmm (x64_psrlw src (xmi_imm 4))) (high_nibbles Xmm (sse_and $I8X16 shifted_src low_mask)) (lookup Xmm (x64_xmm_load_const $I8X16 (popcount_4bit_table))) (bit_counts_low Xmm (x64_pshufb lookup low_nibbles)) @@ -2237,7 +2233,7 @@ ;; All-ones for NaN, shifted down to leave 10 top bits (1 ;; sign, 8 exponent, 1 QNaN bit that must remain set) ;; cleared. - (nan_fraction_mask Xmm (x64_psrld is_nan_mask (RegMemImm.Imm 10))) + (nan_fraction_mask Xmm (x64_psrld is_nan_mask (xmi_imm 10))) ;; Do a NAND, so that we retain every bit not set in ;; `nan_fraction_mask`. This mask will be all zeroes (so ;; we retain every bit) in non-NaN cases, and will have @@ -2254,7 +2250,7 @@ (min_or Xmm (x64_orpd min1 min2)) (is_nan_mask Xmm (x64_cmppd min1 min2 (FcmpImm.Unordered))) (min_or_2 Xmm (x64_orpd min_or is_nan_mask)) - (nan_fraction_mask Xmm (x64_psrlq is_nan_mask (RegMemImm.Imm 13))) + (nan_fraction_mask Xmm (x64_psrlq is_nan_mask (xmi_imm 13))) (final Xmm (x64_andnpd nan_fraction_mask min_or_2))) final)) @@ -2302,7 +2298,7 @@ ;; All-ones for NaN, shifted down to leave 10 top bits (1 ;; sign, 8 exponent, 1 QNaN bit that must remain set) ;; cleared. 
- (nan_fraction_mask Xmm (x64_psrld is_nan_mask (RegMemImm.Imm 10))) + (nan_fraction_mask Xmm (x64_psrld is_nan_mask (xmi_imm 10))) ;; Do a NAND, so that we retain every bit not set in ;; `nan_fraction_mask`. This mask will be all zeroes (so ;; we retain every bit) in non-NaN cases, and will have @@ -2346,7 +2342,7 @@ ;; All-ones for NaN, shifted down to leave 13 top bits (1 ;; sign, 11 exponent, 1 QNaN bit that must remain set) ;; cleared. - (nan_fraction_mask Xmm (x64_psrlq is_nan_mask (RegMemImm.Imm 13))) + (nan_fraction_mask Xmm (x64_psrlq is_nan_mask (xmi_imm 13))) ;; Do a NAND, so that we retain every bit not set in ;; `nan_fraction_mask`. This mask will be all zeroes (so ;; we retain every bit) in non-NaN cases, and will have @@ -3011,8 +3007,8 @@ (let ((a Xmm val) ;; get the low 16 bits - (a_lo Xmm (x64_pslld a (RegMemImm.Imm 16))) - (a_lo Xmm (x64_psrld a_lo (RegMemImm.Imm 16))) + (a_lo Xmm (x64_pslld a (xmi_imm 16))) + (a_lo Xmm (x64_psrld a_lo (xmi_imm 16))) ;; get the high 16 bits (a_hi Xmm (x64_psubd a a_lo)) @@ -3022,7 +3018,7 @@ ;; shift the high bits by 1, convert, and double to get the correct ;; value - (a_hi Xmm (x64_psrld a_hi (RegMemImm.Imm 1))) + (a_hi Xmm (x64_psrld a_hi (xmi_imm 1))) (a_hi Xmm (x64_cvtdq2ps a_hi)) (a_hi Xmm (x64_addps a_hi a_hi))) @@ -3060,7 +3056,7 @@ ;; Set top bit only if < 0 (tmp Xmm (x64_pand dst tmp)) - (tmp Xmm (x64_psrad tmp (RegMemImm.Imm 31)))) + (tmp Xmm (x64_psrad tmp (xmi_imm 31)))) ;; On overflow 0x80000000 is returned to a lane. ;; Below sets positive overflow lanes to 0x7FFFFFFF @@ -3130,7 +3126,7 @@ ;; integer that it can represent. In the case of INT_MAX, this value gets ;; represented as 0x4f000000 which is the integer value (INT_MAX+1). (tmp2 Xmm (x64_pcmpeqd tmp2 tmp2)) - (tmp2 Xmm (x64_psrld tmp2 (RegMemImm.Imm 1))) + (tmp2 Xmm (x64_psrld tmp2 (xmi_imm 1))) (tmp2 Xmm (x64_cvtdq2ps tmp2)) ;; Make a copy of these lanes and then do the first conversion. 
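[Reviewer note, not part of the patch] The `i64x2` multiply lowering earlier in this file (the hunk that switches its shift immediates to `xmi_imm`) decomposes a 64x64-bit multiply into three 32x32->64 `pmuludq` multiplies. A minimal scalar sketch of that identity in standalone Rust; the function names are illustrative and the bindings mirror the ISLE names, this is not backend code:

    // With a = ah*2^32 + al and b = bh*2^32 + bl, the low 64 bits of a*b are
    // ((ah*bl + al*bh) << 32) + al*bl, since the ah*bh*2^64 term vanishes mod 2^64.
    fn i64x2_mul_lane(a: u64, b: u64) -> u64 {
        let (a_hi, a_lo) = (a >> 32, a & 0xffff_ffff); // x64_psrlq a0, 32
        let (b_hi, b_lo) = (b >> 32, b & 0xffff_ffff); // x64_psrlq b0, 32
        let ah_bl = a_hi.wrapping_mul(b_lo); // x64_pmuludq: 32x32 -> 64 per lane
        let al_bh = a_lo.wrapping_mul(b_hi); // x64_pmuludq
        let al_bl = a_lo.wrapping_mul(b_lo); // x64_pmuludq
        let aa_bb = ah_bl.wrapping_add(al_bh); // x64_paddq
        let aa_bb_shifted = aa_bb << 32; // x64_psllq
        al_bl.wrapping_add(aa_bb_shifted) // x64_paddq
    }

    fn main() {
        for (a, b) in [(u64::MAX, 3), (0xdead_beef_0123_4567, 0x89ab_cdef_fedc_ba98)] {
            assert_eq!(i64x2_mul_lane(a, b), a.wrapping_mul(b));
        }
    }
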
diff --git a/cranelift/codegen/src/isa/x64/lower/isle.rs b/cranelift/codegen/src/isa/x64/lower/isle.rs index 3a397a0eb9..1ca927ec31 100644 --- a/cranelift/codegen/src/isa/x64/lower/isle.rs +++ b/cranelift/codegen/src/isa/x64/lower/isle.rs @@ -1038,6 +1038,10 @@ impl Context for IsleContext<'_, '_, MInst, X64Backend> { | bit(h, 7)?, ) } + + fn xmi_imm(&mut self, imm: u32) -> XmmMemImm { + XmmMemImm::new(RegMemImm::imm(imm)).unwrap() + } } impl IsleContext<'_, '_, MInst, X64Backend> { diff --git a/cranelift/filetests/filetests/isa/x64/fcvt-avx.clif b/cranelift/filetests/filetests/isa/x64/fcvt-avx.clif new file mode 100644 index 0000000000..dbdd6cd18c --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/fcvt-avx.clif @@ -0,0 +1,104 @@ +test compile precise-output +set enable_simd +target x86_64 has_avx + +function %f3(i32) -> f32 { +block0(v0: i32): + v1 = fcvt_from_sint.f32 v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vcvtsi2ss %edi, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vcvtsi2ssl %edi, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %f4(i64) -> f32 { +block0(v0: i64): + v1 = fcvt_from_sint.f32 v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vcvtsi2ss %rdi, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vcvtsi2ssq %rdi, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %f7(i32) -> f64 { +block0(v0: i32): + v1 = fcvt_from_sint.f64 v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vcvtsi2sd %edi, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vcvtsi2sdl %edi, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %f8(i64) -> f64 { +block0(v0: i64): + v1 = fcvt_from_sint.f64 v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vcvtsi2sd %rdi, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vcvtsi2sdq %rdi, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + diff --git a/cranelift/filetests/filetests/isa/x64/float-bitcast-avx.clif b/cranelift/filetests/filetests/isa/x64/float-bitcast-avx.clif new file mode 100644 index 0000000000..4a08cb345c --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/float-bitcast-avx.clif @@ -0,0 +1,104 @@ +test compile precise-output +set enable_simd +target x86_64 has_avx + +function %i32_to_f32(i32) -> f32 { +block0(v0: i32): + v1 = bitcast.f32 v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vmovd %edi, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vmovd %edi, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i64_to_f64(i64) -> f64 { +block0(v0: i64): + v1 = bitcast.f64 v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vmovq %rdi, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vmovq %rdi, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %f32_to_i32(f32) -> i32 { +block0(v0: f32): + v1 = bitcast.i32 v0 + return v1 +} + 
+; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vmovd %xmm0, %eax +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vmovd %xmm0, %eax +; movq %rbp, %rsp +; popq %rbp +; retq + +function %f64_to_i64(f64) -> i64 { +block0(v0: f64): + v1 = bitcast.i64 v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vmovq %xmm0, %rax +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vmovq %xmm0, %rax +; movq %rbp, %rsp +; popq %rbp +; retq + diff --git a/cranelift/filetests/filetests/isa/x64/float-bitcast.clif b/cranelift/filetests/filetests/isa/x64/float-bitcast.clif new file mode 100644 index 0000000000..ec29ebc05a --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/float-bitcast.clif @@ -0,0 +1,104 @@ +test compile precise-output +set enable_simd +target x86_64 + +function %i32_to_f32(i32) -> f32 { +block0(v0: i32): + v1 = bitcast.f32 v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movd %edi, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movd %edi, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i64_to_f64(i64) -> f64 { +block0(v0: i64): + v1 = bitcast.f64 v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movq %rdi, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movq %rdi, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %f32_to_i32(f32) -> i32 { +block0(v0: f32): + v1 = bitcast.i32 v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movd %xmm0, %eax +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movd %xmm0, %eax +; movq %rbp, %rsp +; popq %rbp +; retq + +function %f64_to_i64(f64) -> i64 { +block0(v0: f64): + v1 = bitcast.i64 v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movq %xmm0, %rax +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movq %xmm0, %rax +; movq %rbp, %rsp +; popq %rbp +; retq + diff --git a/cranelift/filetests/filetests/isa/x64/simd-arith-avx.clif b/cranelift/filetests/filetests/isa/x64/simd-arith-avx.clif index 15f5c84d15..547cbaf39f 100644 --- a/cranelift/filetests/filetests/isa/x64/simd-arith-avx.clif +++ b/cranelift/filetests/filetests/isa/x64/simd-arith-avx.clif @@ -920,7 +920,7 @@ block0(v0: i8x16, v1: i32): ; vpunpcklbw %xmm0, %xmm0, %xmm5 ; vpunpckhbw %xmm0, %xmm0, %xmm7 ; addl %r9d, $8, %r9d -; movd %r9d, %xmm11 +; vmovd %r9d, %xmm11 ; vpsraw %xmm5, %xmm11, %xmm13 ; vpsraw %xmm7, %xmm11, %xmm15 ; vpacksswb %xmm13, %xmm15, %xmm0 @@ -938,7 +938,7 @@ block0(v0: i8x16, v1: i32): ; vpunpcklbw %xmm0, %xmm0, %xmm5 ; vpunpckhbw %xmm0, %xmm0, %xmm7 ; addl $8, %r9d -; movd %r9d, %xmm11 +; vmovd %r9d, %xmm11 ; vpsraw %xmm11, %xmm5, %xmm13 ; vpsraw %xmm11, %xmm7, %xmm15 ; vpacksswb %xmm15, %xmm13, %xmm0 @@ -992,7 +992,7 @@ block0(v0: i16x8, v1: i32): ; block0: ; movq %rdi, %rcx ; andq %rcx, $15, %rcx -; movd %ecx, %xmm5 +; vmovd %ecx, %xmm5 ; vpsraw %xmm0, %xmm5, %xmm0 ; movq %rbp, %rsp ; popq %rbp @@ -1005,7 +1005,7 @@ block0(v0: i16x8, v1: i32): ; 
block1: ; offset 0x4 ; movq %rdi, %rcx ; andq $0xf, %rcx -; movd %ecx, %xmm5 +; vmovd %ecx, %xmm5 ; vpsraw %xmm5, %xmm0, %xmm0 ; movq %rbp, %rsp ; popq %rbp @@ -1049,7 +1049,7 @@ block0(v0: i32x4, v1: i32): ; block0: ; movq %rdi, %rcx ; andq %rcx, $31, %rcx -; movd %ecx, %xmm5 +; vmovd %ecx, %xmm5 ; vpsrad %xmm0, %xmm5, %xmm0 ; movq %rbp, %rsp ; popq %rbp @@ -1062,7 +1062,7 @@ block0(v0: i32x4, v1: i32): ; block1: ; offset 0x4 ; movq %rdi, %rcx ; andq $0x1f, %rcx -; movd %ecx, %xmm5 +; vmovd %ecx, %xmm5 ; vpsrad %xmm5, %xmm0, %xmm0 ; movq %rbp, %rsp ; popq %rbp @@ -1315,7 +1315,7 @@ block0(v0: i8): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; movd %edi, %xmm2 +; vmovd %edi, %xmm2 ; uninit %xmm4 ; vpxor %xmm4, %xmm4, %xmm6 ; vpshufb %xmm2, %xmm6, %xmm0 @@ -1328,7 +1328,7 @@ block0(v0: i8): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; movd %edi, %xmm2 +; vmovd %edi, %xmm2 ; vpxor %xmm4, %xmm4, %xmm6 ; vpshufb %xmm6, %xmm2, %xmm0 ; movq %rbp, %rsp @@ -1389,7 +1389,7 @@ block0(v0: i8x16, v1: i32): ; block0: ; movq %rdi, %r10 ; andq %r10, $7, %r10 -; movd %r10d, %xmm5 +; vmovd %r10d, %xmm5 ; vpsllw %xmm0, %xmm5, %xmm7 ; lea const(0), %rsi ; shlq $4, %r10, %r10 @@ -1406,7 +1406,7 @@ block0(v0: i8x16, v1: i32): ; block1: ; offset 0x4 ; movq %rdi, %r10 ; andq $7, %r10 -; movd %r10d, %xmm5 +; vmovd %r10d, %xmm5 ; vpsllw %xmm5, %xmm0, %xmm7 ; leaq 0x15(%rip), %rsi ; shlq $4, %r10 @@ -1461,7 +1461,7 @@ block0(v0: i16x8, v1: i32): ; block0: ; movq %rdi, %rcx ; andq %rcx, $15, %rcx -; movd %ecx, %xmm5 +; vmovd %ecx, %xmm5 ; vpsllw %xmm0, %xmm5, %xmm0 ; movq %rbp, %rsp ; popq %rbp @@ -1474,7 +1474,7 @@ block0(v0: i16x8, v1: i32): ; block1: ; offset 0x4 ; movq %rdi, %rcx ; andq $0xf, %rcx -; movd %ecx, %xmm5 +; vmovd %ecx, %xmm5 ; vpsllw %xmm5, %xmm0, %xmm0 ; movq %rbp, %rsp ; popq %rbp @@ -1518,7 +1518,7 @@ block0(v0: i32x4, v1: i32): ; block0: ; movq %rdi, %rcx ; andq %rcx, $31, %rcx -; movd %ecx, %xmm5 +; vmovd %ecx, %xmm5 ; vpslld %xmm0, %xmm5, %xmm0 ; movq %rbp, %rsp ; popq %rbp @@ -1531,7 +1531,7 @@ block0(v0: i32x4, v1: i32): ; block1: ; offset 0x4 ; movq %rdi, %rcx ; andq $0x1f, %rcx -; movd %ecx, %xmm5 +; vmovd %ecx, %xmm5 ; vpslld %xmm5, %xmm0, %xmm0 ; movq %rbp, %rsp ; popq %rbp @@ -1575,7 +1575,7 @@ block0(v0: i64x2, v1: i32): ; block0: ; movq %rdi, %rcx ; andq %rcx, $63, %rcx -; movd %ecx, %xmm5 +; vmovd %ecx, %xmm5 ; vpsllq %xmm0, %xmm5, %xmm0 ; movq %rbp, %rsp ; popq %rbp @@ -1588,7 +1588,7 @@ block0(v0: i64x2, v1: i32): ; block1: ; offset 0x4 ; movq %rdi, %rcx ; andq $0x3f, %rcx -; movd %ecx, %xmm5 +; vmovd %ecx, %xmm5 ; vpsllq %xmm5, %xmm0, %xmm0 ; movq %rbp, %rsp ; popq %rbp @@ -1632,7 +1632,7 @@ block0(v0: i8x16, v1: i32): ; block0: ; movq %rdi, %r10 ; andq %r10, $7, %r10 -; movd %r10d, %xmm5 +; vmovd %r10d, %xmm5 ; vpsrlw %xmm0, %xmm5, %xmm7 ; lea const(0), %rsi ; shlq $4, %r10, %r10 @@ -1648,7 +1648,7 @@ block0(v0: i8x16, v1: i32): ; block1: ; offset 0x4 ; movq %rdi, %r10 ; andq $7, %r10 -; movd %r10d, %xmm5 +; vmovd %r10d, %xmm5 ; vpsrlw %xmm5, %xmm0, %xmm7 ; leaq 0x15(%rip), %rsi ; shlq $4, %r10 @@ -1713,7 +1713,7 @@ block0(v0: i16x8, v1: i32): ; block0: ; movq %rdi, %rcx ; andq %rcx, $15, %rcx -; movd %ecx, %xmm5 +; vmovd %ecx, %xmm5 ; vpsrlw %xmm0, %xmm5, %xmm0 ; movq %rbp, %rsp ; popq %rbp @@ -1726,7 +1726,7 @@ block0(v0: i16x8, v1: i32): ; block1: ; offset 0x4 ; movq %rdi, %rcx ; andq $0xf, %rcx -; movd %ecx, %xmm5 +; vmovd %ecx, %xmm5 ; vpsrlw %xmm5, %xmm0, %xmm0 ; movq %rbp, %rsp ; popq %rbp @@ -1770,7 +1770,7 @@ block0(v0: i32x4, v1: i32): ; block0: ; movq %rdi, 
%rcx ; andq %rcx, $31, %rcx -; movd %ecx, %xmm5 +; vmovd %ecx, %xmm5 ; vpsrld %xmm0, %xmm5, %xmm0 ; movq %rbp, %rsp ; popq %rbp @@ -1783,7 +1783,7 @@ block0(v0: i32x4, v1: i32): ; block1: ; offset 0x4 ; movq %rdi, %rcx ; andq $0x1f, %rcx -; movd %ecx, %xmm5 +; vmovd %ecx, %xmm5 ; vpsrld %xmm5, %xmm0, %xmm0 ; movq %rbp, %rsp ; popq %rbp @@ -1827,7 +1827,7 @@ block0(v0: i64x2, v1: i32): ; block0: ; movq %rdi, %rcx ; andq %rcx, $63, %rcx -; movd %ecx, %xmm5 +; vmovd %ecx, %xmm5 ; vpsrlq %xmm0, %xmm5, %xmm0 ; movq %rbp, %rsp ; popq %rbp @@ -1840,7 +1840,7 @@ block0(v0: i64x2, v1: i32): ; block1: ; offset 0x4 ; movq %rdi, %rcx ; andq $0x3f, %rcx -; movd %ecx, %xmm5 +; vmovd %ecx, %xmm5 ; vpsrlq %xmm5, %xmm0, %xmm0 ; movq %rbp, %rsp ; popq %rbp diff --git a/cranelift/filetests/filetests/isa/x64/simd-bitwise-avx.clif b/cranelift/filetests/filetests/isa/x64/simd-bitwise-avx.clif index 023d52d81e..a6e32237e9 100644 --- a/cranelift/filetests/filetests/isa/x64/simd-bitwise-avx.clif +++ b/cranelift/filetests/filetests/isa/x64/simd-bitwise-avx.clif @@ -41,7 +41,7 @@ block0(v0: i64): ; movq %rsp, %rbp ; block0: ; movl $-2147483648, %eax -; movd %eax, %xmm4 +; vmovd %eax, %xmm4 ; vandnps %xmm4, const(0), %xmm6 ; vandps %xmm4, 0(%rdi), %xmm8 ; vorps %xmm6, %xmm8, %xmm0 @@ -55,7 +55,7 @@ block0(v0: i64): ; movq %rsp, %rbp ; block1: ; offset 0x4 ; movl $0x80000000, %eax -; movd %eax, %xmm4 +; vmovd %eax, %xmm4 ; vandnps 0x1b(%rip), %xmm4, %xmm6 ; vandps (%rdi), %xmm4, %xmm8 ; vorps %xmm8, %xmm6, %xmm0 diff --git a/cranelift/filetests/filetests/isa/x64/simd-splat-avx.clif b/cranelift/filetests/filetests/isa/x64/simd-splat-avx.clif index 8848b6694a..34ce5c50ce 100644 --- a/cranelift/filetests/filetests/isa/x64/simd-splat-avx.clif +++ b/cranelift/filetests/filetests/isa/x64/simd-splat-avx.clif @@ -12,7 +12,7 @@ block0(v0: i8): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; movd %edi, %xmm2 +; vmovd %edi, %xmm2 ; uninit %xmm4 ; vpxor %xmm4, %xmm4, %xmm6 ; vpshufb %xmm2, %xmm6, %xmm0 @@ -25,7 +25,7 @@ block0(v0: i8): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; movd %edi, %xmm2 +; vmovd %edi, %xmm2 ; vpxor %xmm4, %xmm4, %xmm6 ; vpshufb %xmm6, %xmm2, %xmm0 ; movq %rbp, %rsp @@ -42,7 +42,7 @@ block0(v0: i16): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; movd %edi, %xmm2 +; vmovd %edi, %xmm2 ; vpshuflw $0, %xmm2, %xmm4 ; vpshufd $0, %xmm4, %xmm0 ; movq %rbp, %rsp @@ -54,7 +54,7 @@ block0(v0: i16): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; movd %edi, %xmm2 +; vmovd %edi, %xmm2 ; vpshuflw $0, %xmm2, %xmm4 ; vpshufd $0, %xmm4, %xmm0 ; movq %rbp, %rsp @@ -71,7 +71,7 @@ block0(v0: i32): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; movd %edi, %xmm2 +; vmovd %edi, %xmm2 ; vpshufd $0, %xmm2, %xmm0 ; movq %rbp, %rsp ; popq %rbp @@ -82,7 +82,7 @@ block0(v0: i32): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; movd %edi, %xmm2 +; vmovd %edi, %xmm2 ; vpshufd $0, %xmm2, %xmm0 ; movq %rbp, %rsp ; popq %rbp @@ -98,7 +98,7 @@ block0(v0: i64): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; movq %rdi, %xmm2 +; vmovq %rdi, %xmm2 ; vmovddup %xmm2, %xmm0 ; movq %rbp, %rsp ; popq %rbp @@ -109,7 +109,7 @@ block0(v0: i64): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; movq %rdi, %xmm2 +; vmovq %rdi, %xmm2 ; vmovddup %xmm2, %xmm0 ; movq %rbp, %rsp ; popq %rbp diff --git a/cranelift/filetests/filetests/isa/x64/simd-splat-avx2.clif b/cranelift/filetests/filetests/isa/x64/simd-splat-avx2.clif index 84cdfd3730..a7829d9456 100644 --- a/cranelift/filetests/filetests/isa/x64/simd-splat-avx2.clif +++ 
b/cranelift/filetests/filetests/isa/x64/simd-splat-avx2.clif @@ -12,7 +12,7 @@ block0(v0: i8): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; movd %edi, %xmm2 +; vmovd %edi, %xmm2 ; vpbroadcastb %xmm2, %xmm0 ; movq %rbp, %rsp ; popq %rbp @@ -23,7 +23,7 @@ block0(v0: i8): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; movd %edi, %xmm2 +; vmovd %edi, %xmm2 ; vpbroadcastb %xmm2, %xmm0 ; movq %rbp, %rsp ; popq %rbp @@ -39,7 +39,7 @@ block0(v0: i16): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; movd %edi, %xmm2 +; vmovd %edi, %xmm2 ; vpbroadcastw %xmm2, %xmm0 ; movq %rbp, %rsp ; popq %rbp @@ -50,7 +50,7 @@ block0(v0: i16): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; movd %edi, %xmm2 +; vmovd %edi, %xmm2 ; vpbroadcastw %xmm2, %xmm0 ; movq %rbp, %rsp ; popq %rbp @@ -66,7 +66,7 @@ block0(v0: i32): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; movd %edi, %xmm2 +; vmovd %edi, %xmm2 ; vpbroadcastd %xmm2, %xmm0 ; movq %rbp, %rsp ; popq %rbp @@ -77,7 +77,7 @@ block0(v0: i32): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; movd %edi, %xmm2 +; vmovd %edi, %xmm2 ; vpbroadcastd %xmm2, %xmm0 ; movq %rbp, %rsp ; popq %rbp @@ -93,7 +93,7 @@ block0(v0: i64): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; movq %rdi, %xmm2 +; vmovq %rdi, %xmm2 ; vmovddup %xmm2, %xmm0 ; movq %rbp, %rsp ; popq %rbp @@ -104,7 +104,7 @@ block0(v0: i64): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; movq %rdi, %xmm2 +; vmovq %rdi, %xmm2 ; vmovddup %xmm2, %xmm0 ; movq %rbp, %rsp ; popq %rbp diff --git a/cranelift/filetests/filetests/isa/x64/vhigh_bits-avx.clif b/cranelift/filetests/filetests/isa/x64/vhigh_bits-avx.clif new file mode 100644 index 0000000000..7579839f80 --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/vhigh_bits-avx.clif @@ -0,0 +1,108 @@ +test compile precise-output +set enable_simd +target x86_64 has_avx + +function %f1(i8x16) -> i8 { +block0(v0: i8x16): + v1 = vhigh_bits.i8 v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpmovmskb %xmm0, %eax +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpmovmskb %xmm0, %eax +; movq %rbp, %rsp +; popq %rbp +; retq + +function %f3(i16x8) -> i8 { +block0(v0: i16x8): + v1 = vhigh_bits.i8 v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpacksswb %xmm0, %xmm0, %xmm2 +; vpmovmskb %xmm2, %eax +; shrq $8, %rax, %rax +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpacksswb %xmm0, %xmm0, %xmm2 +; vpmovmskb %xmm2, %eax +; shrq $8, %rax +; movq %rbp, %rsp +; popq %rbp +; retq + +function %f4(i32x4) -> i8 { +block0(v0: i32x4): + v1 = vhigh_bits.i8 v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vmovmskps %xmm0, %eax +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vmovmskps %xmm0, %eax +; movq %rbp, %rsp +; popq %rbp +; retq + +function %f5(i64x2) -> i8 { +block0(v0: i64x2): + v1 = vhigh_bits.i8 v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vmovmskpd %xmm0, %eax +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vmovmskpd %xmm0, %eax +; movq %rbp, %rsp +; popq %rbp +; retq +
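[Reviewer note, not part of the patch] The W=0 register-to-register forms added above (for example `vmovd`, which the float-bitcast-avx test exercises in both directions) fit the two-byte VEX form. A standalone sketch of that encoding, assuming the standard VEX byte layout; `vex2_rr` is an illustrative name, not the backend's `VexInstruction` builder:

    // Two-byte VEX: 0xC5, then {~R, ~vvvv, L, pp}, opcode, ModRM. Only valid
    // for map 0F with W=0 and no X/B extension bits, which covers the vmovd
    // encodings checked below. pp=0b01 selects the 0x66 legacy prefix.
    fn vex2_rr(pp: u8, opcode: u8, reg: u8, rm: u8) -> [u8; 4] {
        let r_bar = if reg & 8 == 0 { 0x80 } else { 0x00 }; // inverted REX.R bit
        let vvvv_bar = 0b1111 << 3; // no second source: vvvv is all-ones (inverted)
        let l = 0b000; // VEX.L = 0: 128-bit
        let modrm = 0xC0 | ((reg & 7) << 3) | (rm & 7); // mod=11: register form
        [0xC5, r_bar | vvvv_bar | l | pp, opcode, modrm]
    }

    fn main() {
        // GprToXmmVex: vmovd %edi, %xmm0 (66 0F 6E; xmm in ModRM.reg, gpr in rm).
        assert_eq!(vex2_rr(0b01, 0x6E, 0, 7), [0xC5, 0xF9, 0x6E, 0xC7]);
        // XmmToGprVex: vmovd %xmm0, %eax (66 0F 7E; per the emit.rs comment the
        // operands swap, so the xmm stays in ModRM.reg and the gpr goes in rm).
        assert_eq!(vex2_rr(0b01, 0x7E, 0, 0), [0xC5, 0xF9, 0x7E, 0xC0]);
    }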