diff --git a/cranelift/codegen/src/isa/x64/inst.isle b/cranelift/codegen/src/isa/x64/inst.isle index 99cfeda06e..bcc63bfdbd 100644 --- a/cranelift/codegen/src/isa/x64/inst.isle +++ b/cranelift/codegen/src/isa/x64/inst.isle @@ -290,8 +290,18 @@ ;; XMM (scalar or vector) unary op (from xmm to reg/mem) using the ;; VEX prefix (XmmMovRMVex (op AvxOpcode) - (src Reg) + (src Xmm) (dst SyntheticAmode)) + (XmmMovRMImmVex (op AvxOpcode) + (src Xmm) + (dst SyntheticAmode) + (imm u8)) + + ;; XMM (scalar) unary op (from xmm to integer reg): vpextr{w,b,d,q} + (XmmToGprImmVex (op AvxOpcode) + (src Xmm) + (dst WritableGpr) + (imm u8)) ;; XMM (scalar or vector) binary op that relies on the EVEX ;; prefix. Takes two inputs. @@ -343,8 +353,12 @@ ;; XMM (scalar or vector) unary op (from xmm to reg/mem): stores, movd, ;; movq (XmmMovRM (op SseOpcode) - (src Reg) + (src Xmm) (dst SyntheticAmode)) + (XmmMovRMImm (op SseOpcode) + (src Xmm) + (dst SyntheticAmode) + (imm u8)) ;; XMM (scalar) unary op (from xmm to integer reg): movd, movq, ;; cvtts{s,d}2si @@ -1364,6 +1378,10 @@ Vmovups Vmovupd Vmovdqu + Vpextrb + Vpextrw + Vpextrd + Vpextrq )) (type Avx512Opcode extern @@ -2043,10 +2061,18 @@ (rule (xmm_movrm op addr data) (SideEffectNoResult.Inst (MInst.XmmMovRM op data addr))) +(decl xmm_movrm_imm (SseOpcode SyntheticAmode Xmm u8) SideEffectNoResult) +(rule (xmm_movrm_imm op addr data imm) + (SideEffectNoResult.Inst (MInst.XmmMovRMImm op data addr imm))) + (decl xmm_movrm_vex (AvxOpcode SyntheticAmode Xmm) SideEffectNoResult) (rule (xmm_movrm_vex op addr data) (SideEffectNoResult.Inst (MInst.XmmMovRMVex op data addr))) +(decl xmm_movrm_imm_vex (AvxOpcode SyntheticAmode Xmm u8) SideEffectNoResult) +(rule (xmm_movrm_imm_vex op addr data imm) + (SideEffectNoResult.Inst (MInst.XmmMovRMImmVex op data addr imm))) + ;; Load a constant into an XMM register. (decl x64_xmm_load_const (Type VCodeConstant) Xmm) (rule (x64_xmm_load_const ty const) @@ -3603,21 +3629,61 @@ (decl x64_pextrb (Xmm u8) Gpr) (rule (x64_pextrb src lane) (xmm_to_gpr_imm (SseOpcode.Pextrb) src lane)) +(rule 1 (x64_pextrb src lane) + (if-let $true (has_avx)) + (xmm_to_gpr_imm_vex (AvxOpcode.Vpextrb) src lane)) + +(decl x64_pextrb_store (SyntheticAmode Xmm u8) SideEffectNoResult) +(rule (x64_pextrb_store addr src lane) + (xmm_movrm_imm (SseOpcode.Pextrb) addr src lane)) +(rule 1 (x64_pextrb_store addr src lane) + (if-let $true (has_avx)) + (xmm_movrm_imm_vex (AvxOpcode.Vpextrb) addr src lane)) ;; Helper for creating `pextrw` instructions. (decl x64_pextrw (Xmm u8) Gpr) (rule (x64_pextrw src lane) (xmm_to_gpr_imm (SseOpcode.Pextrw) src lane)) +(rule 1 (x64_pextrw src lane) + (if-let $true (has_avx)) + (xmm_to_gpr_imm_vex (AvxOpcode.Vpextrw) src lane)) + +(decl x64_pextrw_store (SyntheticAmode Xmm u8) SideEffectNoResult) +(rule (x64_pextrw_store addr src lane) + (xmm_movrm_imm (SseOpcode.Pextrw) addr src lane)) +(rule 1 (x64_pextrw_store addr src lane) + (if-let $true (has_avx)) + (xmm_movrm_imm_vex (AvxOpcode.Vpextrw) addr src lane)) ;; Helper for creating `pextrd` instructions. 
(decl x64_pextrd (Xmm u8) Gpr) (rule (x64_pextrd src lane) (xmm_to_gpr_imm (SseOpcode.Pextrd) src lane)) +(rule 1 (x64_pextrd src lane) + (if-let $true (has_avx)) + (xmm_to_gpr_imm_vex (AvxOpcode.Vpextrd) src lane)) + +(decl x64_pextrd_store (SyntheticAmode Xmm u8) SideEffectNoResult) +(rule (x64_pextrd_store addr src lane) + (xmm_movrm_imm (SseOpcode.Pextrd) addr src lane)) +(rule 1 (x64_pextrd_store addr src lane) + (if-let $true (has_avx)) + (xmm_movrm_imm_vex (AvxOpcode.Vpextrd) addr src lane)) ;; Helper for creating `pextrq` instructions. (decl x64_pextrq (Xmm u8) Gpr) (rule (x64_pextrq src lane) (xmm_to_gpr_imm (SseOpcode.Pextrq) src lane)) +(rule 1 (x64_pextrq src lane) + (if-let $true (has_avx)) + (xmm_to_gpr_imm_vex (AvxOpcode.Vpextrq) src lane)) + +(decl x64_pextrq_store (SyntheticAmode Xmm u8) SideEffectNoResult) +(rule (x64_pextrq_store addr src lane) + (xmm_movrm_imm (SseOpcode.Pextrq) addr src lane)) +(rule 1 (x64_pextrq_store addr src lane) + (if-let $true (has_avx)) + (xmm_movrm_imm_vex (AvxOpcode.Vpextrq) addr src lane)) ;; Helper for creating `MInst.XmmToGpr` instructions. (decl xmm_to_gpr (SseOpcode Xmm OperandSize) Gpr) @@ -3626,13 +3692,20 @@ (_ Unit (emit (MInst.XmmToGpr op src dst size)))) dst)) -;; Helper for creating `MInst.XmmToGpr` instructions. +;; Helper for creating `MInst.XmmToGprImm` instructions. (decl xmm_to_gpr_imm (SseOpcode Xmm u8) Gpr) (rule (xmm_to_gpr_imm op src imm) (let ((dst WritableGpr (temp_writable_gpr)) (_ Unit (emit (MInst.XmmToGprImm op src dst imm)))) dst)) +;; Helper for creating `MInst.XmmToGprImmVex` instructions. +(decl xmm_to_gpr_imm_vex (AvxOpcode Xmm u8) Gpr) +(rule (xmm_to_gpr_imm_vex op src imm) + (let ((dst WritableGpr (temp_writable_gpr)) + (_ Unit (emit (MInst.XmmToGprImmVex op src dst imm)))) + dst)) + ;; Helper for creating `pmovmskb` instructions. 
(decl x64_pmovmskb (OperandSize Xmm) Gpr) (rule (x64_pmovmskb size src) diff --git a/cranelift/codegen/src/isa/x64/inst/args.rs b/cranelift/codegen/src/isa/x64/inst/args.rs index 6ae024c010..069c38f316 100644 --- a/cranelift/codegen/src/isa/x64/inst/args.rs +++ b/cranelift/codegen/src/isa/x64/inst/args.rs @@ -1699,7 +1699,11 @@ impl AvxOpcode { | AvxOpcode::Vmovsd | AvxOpcode::Vmovups | AvxOpcode::Vmovupd - | AvxOpcode::Vmovdqu => { + | AvxOpcode::Vmovdqu + | AvxOpcode::Vpextrb + | AvxOpcode::Vpextrw + | AvxOpcode::Vpextrd + | AvxOpcode::Vpextrq => { smallvec![InstructionSet::AVX] } } diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs index 9fb2c15994..a6da7d867c 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit.rs @@ -2430,7 +2430,7 @@ pub(crate) fn emit( } Inst::XmmMovRMVex { op, src, dst } => { - let src = allocs.next(*src); + let src = allocs.next(src.to_reg()); let dst = dst.with_allocs(allocs).finalize(state, sink); let (prefix, map, opcode) = match op { @@ -2451,6 +2451,52 @@ pub(crate) fn emit( .encode(sink); } + Inst::XmmMovRMImmVex { op, src, dst, imm } => { + let src = allocs.next(src.to_reg()); + let dst = dst.with_allocs(allocs).finalize(state, sink); + + let (w, prefix, map, opcode) = match op { + AvxOpcode::Vpextrb => (false, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x14), + AvxOpcode::Vpextrw => (false, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x15), + AvxOpcode::Vpextrd => (false, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x16), + AvxOpcode::Vpextrq => (true, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x16), + _ => unimplemented!("Opcode {:?} not implemented", op), + }; + VexInstruction::new() + .length(VexVectorLength::V128) + .w(w) + .prefix(prefix) + .map(map) + .opcode(opcode) + .rm(dst) + .reg(src.to_real_reg().unwrap().hw_enc()) + .imm(*imm) + .encode(sink); + } + + Inst::XmmToGprImmVex { op, src, dst, imm } => { + let src = allocs.next(src.to_reg()); + let dst = allocs.next(dst.to_reg().to_reg()); + + let (w, prefix, map, opcode) = match op { + AvxOpcode::Vpextrb => (false, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x14), + AvxOpcode::Vpextrw => (false, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x15), + AvxOpcode::Vpextrd => (false, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x16), + AvxOpcode::Vpextrq => (true, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x16), + _ => unimplemented!("Opcode {:?} not implemented", op), + }; + VexInstruction::new() + .length(VexVectorLength::V128) + .w(w) + .prefix(prefix) + .map(map) + .opcode(opcode) + .rm(dst.to_real_reg().unwrap().hw_enc()) + .reg(src.to_real_reg().unwrap().hw_enc()) + .imm(*imm) + .encode(sink); + } + Inst::XmmRmREvex { op, src1, @@ -2649,7 +2695,7 @@ pub(crate) fn emit( } Inst::XmmMovRM { op, src, dst } => { - let src = allocs.next(*src); + let src = allocs.next(src.to_reg()); let dst = dst.with_allocs(allocs); let (prefix, opcode) = match op { @@ -2666,6 +2712,27 @@ pub(crate) fn emit( emit_std_reg_mem(sink, prefix, opcode, 2, src, dst, RexFlags::clear_w(), 0); } + Inst::XmmMovRMImm { op, src, dst, imm } => { + let src = allocs.next(src.to_reg()); + let dst = dst.with_allocs(allocs); + + let (w, prefix, opcode) = match op { + SseOpcode::Pextrb => (false, LegacyPrefixes::_66, 0x0F3A14), + SseOpcode::Pextrw => (false, LegacyPrefixes::_66, 0x0F3A15), + SseOpcode::Pextrd => (false, LegacyPrefixes::_66, 0x0F3A16), + SseOpcode::Pextrq => (true, LegacyPrefixes::_66, 0x0F3A16), + _ => unimplemented!("Opcode {:?} not implemented", op), + }; + 
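+                // Only `pextrq` needs REX.W set to select the 64-bit form of the
+                // store; the byte/word/dword variants leave it clear.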
let rex = if w { + RexFlags::set_w() + } else { + RexFlags::clear_w() + }; + let dst = &dst.finalize(state, sink); + emit_std_reg_mem(sink, prefix, opcode, 3, src, dst, rex, 1); + sink.put1(*imm); + } + Inst::XmmToGpr { op, src, diff --git a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs index 717e0fc391..d94828c557 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs @@ -4871,7 +4871,7 @@ fn test_x64_emit() { imm: 2, }, "C4430920EF02", - "vpinsrb $2 %xmm14, %r15, %xmm13", + "vpinsrb $2, %xmm14, %r15, %xmm13", )); // ======================================================== diff --git a/cranelift/codegen/src/isa/x64/inst/mod.rs b/cranelift/codegen/src/isa/x64/inst/mod.rs index c1c71690da..95e39293dd 100644 --- a/cranelift/codegen/src/isa/x64/inst/mod.rs +++ b/cranelift/codegen/src/isa/x64/inst/mod.rs @@ -131,6 +131,7 @@ impl Inst { // These use dynamic SSE opcodes. Inst::GprToXmm { op, .. } | Inst::XmmMovRM { op, .. } + | Inst::XmmMovRMImm { op, .. } | Inst::XmmRmiReg { opcode: op, .. } | Inst::XmmRmR { op, .. } | Inst::XmmRmRUnaligned { op, .. } @@ -153,7 +154,9 @@ impl Inst { | Inst::XmmVexPinsr { op, .. } | Inst::XmmUnaryRmRVex { op, .. } | Inst::XmmUnaryRmRImmVex { op, .. } - | Inst::XmmMovRMVex { op, .. } => op.available_from(), + | Inst::XmmMovRMVex { op, .. } + | Inst::XmmMovRMImmVex { op, .. } + | Inst::XmmToGprImmVex { op, .. } => op.available_from(), } } } @@ -331,7 +334,7 @@ impl Inst { debug_assert!(src.class() == RegClass::Float); Inst::XmmMovRM { op, - src, + src: Xmm::new(src).unwrap(), dst: dst.into(), } } @@ -933,17 +936,33 @@ impl PrettyPrint for Inst { } Inst::XmmMovRM { op, src, dst, .. } => { - let src = pretty_print_reg(*src, 8, allocs); + let src = pretty_print_reg(src.to_reg(), 8, allocs); let dst = dst.pretty_print(8, allocs); format!("{} {}, {}", ljustify(op.to_string()), src, dst) } Inst::XmmMovRMVex { op, src, dst, .. } => { - let src = pretty_print_reg(*src, 8, allocs); + let src = pretty_print_reg(src.to_reg(), 8, allocs); let dst = dst.pretty_print(8, allocs); format!("{} {}, {}", ljustify(op.to_string()), src, dst) } + Inst::XmmMovRMImm { + op, src, dst, imm, .. + } => { + let src = pretty_print_reg(src.to_reg(), 8, allocs); + let dst = dst.pretty_print(8, allocs); + format!("{} ${imm}, {}, {}", ljustify(op.to_string()), src, dst) + } + + Inst::XmmMovRMImmVex { + op, src, dst, imm, .. 
+ } => { + let src = pretty_print_reg(src.to_reg(), 8, allocs); + let dst = dst.pretty_print(8, allocs); + format!("{} ${imm}, {}, {}", ljustify(op.to_string()), src, dst) + } + Inst::XmmRmR { op, src1, @@ -1023,7 +1042,7 @@ impl PrettyPrint for Inst { let src1 = pretty_print_reg(src1.to_reg(), 8, allocs); let src2 = src2.pretty_print(8, allocs); - format!("{} ${imm} {src1}, {src2}, {dst}", ljustify(op.to_string())) + format!("{} ${imm}, {src1}, {src2}, {dst}", ljustify(op.to_string())) } Inst::XmmVexPinsr { @@ -1038,7 +1057,7 @@ impl PrettyPrint for Inst { let src1 = pretty_print_reg(src1.to_reg(), 8, allocs); let src2 = src2.pretty_print(8, allocs); - format!("{} ${imm} {src1}, {src2}, {dst}", ljustify(op.to_string())) + format!("{} ${imm}, {src1}, {src2}, {dst}", ljustify(op.to_string())) } Inst::XmmRmRVex3 { @@ -1190,6 +1209,12 @@ impl PrettyPrint for Inst { format!("{} ${imm}, {}, {}", ljustify(op.to_string()), src, dst) } + Inst::XmmToGprImmVex { op, src, dst, imm } => { + let src = pretty_print_reg(src.to_reg(), 8, allocs); + let dst = pretty_print_reg(dst.to_reg().to_reg(), 8, allocs); + format!("{} ${imm}, {}, {}", ljustify(op.to_string()), src, dst) + } + Inst::GprToXmm { op, src, @@ -2033,8 +2058,11 @@ fn x64_get_operands VReg>(inst: &Inst, collector: &mut OperandCol collector.reg_reuse_def(dst.to_writable_reg(), 0); // Reuse RHS. src2.get_operands(collector); } - Inst::XmmMovRM { src, dst, .. } | Inst::XmmMovRMVex { src, dst, .. } => { - collector.reg_use(*src); + Inst::XmmMovRM { src, dst, .. } + | Inst::XmmMovRMVex { src, dst, .. } + | Inst::XmmMovRMImm { src, dst, .. } + | Inst::XmmMovRMImmVex { src, dst, .. } => { + collector.reg_use(src.to_reg()); dst.get_operands(collector); } Inst::XmmCmpRmR { src, dst, .. } => { @@ -2058,7 +2086,9 @@ fn x64_get_operands VReg>(inst: &Inst, collector: &mut OperandCol collector.reg_use(src.to_reg()); collector.reg_fixed_nonallocatable(*dst); } - Inst::XmmToGpr { src, dst, .. } | Inst::XmmToGprImm { src, dst, .. } => { + Inst::XmmToGpr { src, dst, .. } + | Inst::XmmToGprImm { src, dst, .. } + | Inst::XmmToGprImmVex { src, dst, .. } => { collector.reg_use(src.to_reg()); collector.reg_def(dst.to_writable_reg()); } diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle index 56d2dc0f28..a19b6717fa 100644 --- a/cranelift/codegen/src/isa/x64/lower.isle +++ b/cranelift/codegen/src/isa/x64/lower.isle @@ -2659,17 +2659,41 @@ ;; f32 or f64 despite the source perhaps being an integer vector since the ;; result of the instruction is the same. 
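+;;
+;; The `movss`/`movsd` fast path below applies only to `$F32`/`$F64` results;
+;; extracted integer lanes are instead stored with the SSE4.1
+;; `pextr{b,w,d,q}` instructions (or their AVX `vpextr*` forms when
+;; available), which write the selected lane directly to memory rather than
+;; routing it through a general-purpose register.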
(rule 2 (lower (store flags - (has_type (ty_32 _) (extractlane value (u8_from_uimm8 0))) + (has_type $F32 (extractlane value (u8_from_uimm8 0))) address offset)) (side_effect (x64_movss_store (to_amode flags address offset) value))) -(rule 3 (lower (store flags - (has_type (ty_64 _) (extractlane value (u8_from_uimm8 0))) +(rule 2 (lower (store flags + (has_type $F64 (extractlane value (u8_from_uimm8 0))) address offset)) (side_effect (x64_movsd_store (to_amode flags address offset) value))) +(rule 2 (lower (store flags + (has_type $I8 (extractlane value (u8_from_uimm8 n))) + address + offset)) + (side_effect + (x64_pextrb_store (to_amode flags address offset) value n))) +(rule 2 (lower (store flags + (has_type $I16 (extractlane value (u8_from_uimm8 n))) + address + offset)) + (side_effect + (x64_pextrw_store (to_amode flags address offset) value n))) +(rule 2 (lower (store flags + (has_type $I32 (extractlane value (u8_from_uimm8 n))) + address + offset)) + (side_effect + (x64_pextrd_store (to_amode flags address offset) value n))) +(rule 2 (lower (store flags + (has_type $I64 (extractlane value (u8_from_uimm8 n))) + address + offset)) + (side_effect + (x64_pextrq_store (to_amode flags address offset) value n))) ;; Rules for `load*` + ALU op + `store*` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/cranelift/filetests/filetests/isa/x64/extractlane-avx.clif b/cranelift/filetests/filetests/isa/x64/extractlane-avx.clif new file mode 100644 index 0000000000..cfefa99e89 --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/extractlane-avx.clif @@ -0,0 +1,309 @@ +test compile precise-output +target x86_64 has_avx + +function %f1(i8x16) -> i8 { +block0(v0: i8x16): + v1 = extractlane v0, 1 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpextrb $1, %xmm0, %rax +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpextrb $1, %xmm0, %eax +; movq %rbp, %rsp +; popq %rbp +; retq + +function %f2(i16x8) -> i16 { +block0(v0: i16x8): + v1 = extractlane v0, 1 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpextrw $1, %xmm0, %rax +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpextrw $1, %xmm0, %eax +; movq %rbp, %rsp +; popq %rbp +; retq + +function %f3(i32x4) -> i32 { +block0(v0: i32x4): + v1 = extractlane v0, 1 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpextrd $1, %xmm0, %rax +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpextrd $1, %xmm0, %eax +; movq %rbp, %rsp +; popq %rbp +; retq + +function %f4(i64x2) -> i64 { +block0(v0: i64x2): + v1 = extractlane v0, 1 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpextrq $1, %xmm0, %rax +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpextrq $1, %xmm0, %rax +; movq %rbp, %rsp +; popq %rbp +; retq + +function %f5(f32x4) -> f32 { +block0(v0: f32x4): + v1 = extractlane v0, 1 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpshufd $1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpshufd $1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq 
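+
+; Note: extracting a float lane leaves the result in an XMM register, so it is
+; lowered as a shuffle (`vpshufd`) rather than a `vpextr*`; only integer lanes
+; use the new extract-to-GPR/memory forms.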
+ +function %f6(f64x2) -> f64 { +block0(v0: f64x2): + v1 = extractlane v0, 1 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpshufd $238, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpshufd $0xee, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %extract_i8x16_lane0_to_memory(i8x16, i64) { +block0(v0: i8x16, v1: i64): + v2 = extractlane v0, 0 + store v2, v1 + return +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpextrb $0, %xmm0, 0(%rdi) +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpextrb $0, %xmm0, (%rdi) ; trap: heap_oob +; movq %rbp, %rsp +; popq %rbp +; retq + +function %extract_i16x8_lane0_to_memory(i16x8, i64) { +block0(v0: i16x8, v1: i64): + v2 = extractlane v0, 0 + store v2, v1 + return +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpextrw $0, %xmm0, 0(%rdi) +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpextrw $0, %xmm0, (%rdi) ; trap: heap_oob +; movq %rbp, %rsp +; popq %rbp +; retq + +function %extract_i32x4_lane0_to_memory(i32x4, i64) { +block0(v0: i32x4, v1: i64): + v2 = extractlane v0, 0 + store v2, v1 + return +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpextrd $0, %xmm0, 0(%rdi) +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpextrd $0, %xmm0, (%rdi) ; trap: heap_oob +; movq %rbp, %rsp +; popq %rbp +; retq + +function %extract_f32x4_lane0_to_memory(f32x4, i64) { +block0(v0: f32x4, v1: i64): + v2 = extractlane v0, 0 + store v2, v1 + return +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vmovss %xmm0, 0(%rdi) +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vmovss %xmm0, (%rdi) ; trap: heap_oob +; movq %rbp, %rsp +; popq %rbp +; retq + +function %extract_i64x2_lane0_to_memory(i64x2, i64) { +block0(v0: i64x2, v1: i64): + v2 = extractlane v0, 0 + store v2, v1 + return +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpextrq $0, %xmm0, 0(%rdi) +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpextrq $0, %xmm0, (%rdi) ; trap: heap_oob +; movq %rbp, %rsp +; popq %rbp +; retq + +function %extract_f64x2_lane0_to_memory(f64x2, i64) { +block0(v0: f64x2, v1: i64): + v2 = extractlane v0, 0 + store v2, v1 + return +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vmovsd %xmm0, 0(%rdi) +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vmovsd %xmm0, (%rdi) ; trap: heap_oob +; movq %rbp, %rsp +; popq %rbp +; retq + diff --git a/cranelift/filetests/filetests/isa/x64/extractlane.clif b/cranelift/filetests/filetests/isa/x64/extractlane.clif index bf6ce4cc97..b4ff83b12b 100644 --- a/cranelift/filetests/filetests/isa/x64/extractlane.clif +++ b/cranelift/filetests/filetests/isa/x64/extractlane.clif @@ -151,6 +151,58 @@ block0(v0: f64x2): ; popq %rbp ; retq +function %extract_i8x16_lane0_to_memory(i8x16, i64) { +block0(v0: i8x16, v1: i64): + v2 = extractlane v0, 0 + store v2, v1 + return +} + +; 
VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pextrb $0, %xmm0, 0(%rdi) +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pextrb $0, %xmm0, (%rdi) ; trap: heap_oob +; movq %rbp, %rsp +; popq %rbp +; retq + +function %extract_i16x8_lane0_to_memory(i16x8, i64) { +block0(v0: i16x8, v1: i64): + v2 = extractlane v0, 0 + store v2, v1 + return +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pextrw $0, %xmm0, 0(%rdi) +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pextrw $0, %xmm0, (%rdi) ; trap: heap_oob +; movq %rbp, %rsp +; popq %rbp +; retq + function %extract_i32x4_lane0_to_memory(i32x4, i64) { block0(v0: i32x4, v1: i64): v2 = extractlane v0, 0 @@ -162,7 +214,7 @@ block0(v0: i32x4, v1: i64): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; movss %xmm0, 0(%rdi) +; pextrd $0, %xmm0, 0(%rdi) ; movq %rbp, %rsp ; popq %rbp ; ret @@ -172,7 +224,7 @@ block0(v0: i32x4, v1: i64): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; movss %xmm0, (%rdi) ; trap: heap_oob +; pextrd $0, %xmm0, (%rdi) ; trap: heap_oob ; movq %rbp, %rsp ; popq %rbp ; retq @@ -214,7 +266,7 @@ block0(v0: i64x2, v1: i64): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; movsd %xmm0, 0(%rdi) +; pextrq $0, %xmm0, 0(%rdi) ; movq %rbp, %rsp ; popq %rbp ; ret @@ -224,7 +276,7 @@ block0(v0: i64x2, v1: i64): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; movsd %xmm0, (%rdi) ; trap: heap_oob +; pextrq $0, %xmm0, (%rdi) ; trap: heap_oob ; movq %rbp, %rsp ; popq %rbp ; retq diff --git a/cranelift/filetests/filetests/isa/x64/float-avx.clif b/cranelift/filetests/filetests/isa/x64/float-avx.clif index 4e29340a3b..178a0ee05a 100644 --- a/cranelift/filetests/filetests/isa/x64/float-avx.clif +++ b/cranelift/filetests/filetests/isa/x64/float-avx.clif @@ -526,7 +526,7 @@ block0(v0: f32x4): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; vcmpps $0 %xmm0, %xmm0, %xmm2 +; vcmpps $0, %xmm0, %xmm0, %xmm2 ; vandps %xmm0, %xmm2, %xmm4 ; vpxor %xmm2, %xmm4, %xmm6 ; vcvttps2dq %xmm4, %xmm8 @@ -565,7 +565,7 @@ block0(v0: f64x2): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; vcmppd $0 %xmm0, %xmm0, %xmm2 +; vcmppd $0, %xmm0, %xmm0, %xmm2 ; vandps %xmm2, const(0), %xmm4 ; vminpd %xmm0, %xmm4, %xmm6 ; vcvttpd2dq %xmm6, %xmm0 diff --git a/cranelift/filetests/filetests/isa/x64/insertlane-avx.clif b/cranelift/filetests/filetests/isa/x64/insertlane-avx.clif new file mode 100644 index 0000000000..784c790be1 --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/insertlane-avx.clif @@ -0,0 +1,190 @@ +test compile precise-output +set enable_simd +target x86_64 has_avx + +function %insertlane_f64x2_zero(f64x2, f64) -> f64x2 { +block0(v0: f64x2, v1: f64): + v2 = insertlane v0, v1, 0 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vmovsd %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vmovsd %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %insertlane_f64x2_one(f64x2, f64) -> f64x2 { +block0(v0: f64x2, v1: f64): + v2 = insertlane v0, v1, 1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vmovlhps %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vmovlhps %xmm1, 
%xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %insertlane_f64x2_zero_with_load(f64x2, i64) -> f64x2 { +block0(v0: f64x2, v1: i64): + v2 = load.f64 v1 + v3 = insertlane v0, v2, 0 + return v3 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vmovsd 0(%rdi), %xmm3 +; vmovsd %xmm0, %xmm3, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vmovsd (%rdi), %xmm3 ; trap: heap_oob +; vmovsd %xmm3, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %insertlane_i8x16_one_load(i8x16, i64) -> i8x16 { +block0(v0: i8x16, v1: i64): + v2 = load.i8 v1 + v3 = insertlane v0, v2, 1 + return v3 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movzbq 0(%rdi), %rdx +; vpinsrb $1, %xmm0, %rdx, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movzbq (%rdi), %rdx ; trap: heap_oob +; vpinsrb $1, %edx, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %insertlane_i16x8_one_load(i16x8, i64) -> i16x8 { +block0(v0: i16x8, v1: i64): + v2 = load.i16 v1 + v3 = insertlane v0, v2, 1 + return v3 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movzwq 0(%rdi), %rdx +; vpinsrw $1, %xmm0, %rdx, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movzwq (%rdi), %rdx ; trap: heap_oob +; vpinsrw $1, %edx, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %insertlane_i32x4_one_load(i32x4, i64) -> i32x4 { +block0(v0: i32x4, v1: i64): + v2 = load.i32 v1 + v3 = insertlane v0, v2, 1 + return v3 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpinsrd $1, %xmm0, 0(%rdi), %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpinsrd $1, (%rdi), %xmm0, %xmm0 ; trap: heap_oob +; movq %rbp, %rsp +; popq %rbp +; retq + +function %insertlane_i64x2_one_load(i64x2, i64) -> i64x2 { +block0(v0: i64x2, v1: i64): + v2 = load.i64 v1 + v3 = insertlane v0, v2, 1 + return v3 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpinsrq $1, %xmm0, 0(%rdi), %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpinsrq $1, (%rdi), %xmm0, %xmm0 ; trap: heap_oob +; movq %rbp, %rsp +; popq %rbp +; retq + diff --git a/cranelift/filetests/filetests/isa/x64/insertlane.clif b/cranelift/filetests/filetests/isa/x64/insertlane.clif index 7f2569d028..f20210c33f 100644 --- a/cranelift/filetests/filetests/isa/x64/insertlane.clif +++ b/cranelift/filetests/filetests/isa/x64/insertlane.clif @@ -1,6 +1,6 @@ test compile precise-output set enable_simd -target x86_64 has_avx +target x86_64 function %insertlane_f64x2_zero(f64x2, f64) -> f64x2 { block0(v0: f64x2, v1: f64): @@ -12,7 +12,7 @@ block0(v0: f64x2, v1: f64): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; vmovsd %xmm0, %xmm1, %xmm0 +; movsd %xmm0, %xmm1, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -22,7 +22,7 @@ block0(v0: f64x2, v1: f64): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; vmovsd %xmm1, %xmm0, %xmm0 +; movsd %xmm1, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq @@ -37,7 +37,7 @@ block0(v0: f64x2, v1: f64): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; vmovlhps %xmm0, %xmm1, %xmm0 +; movlhps %xmm0, %xmm1, 
%xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -47,7 +47,7 @@ block0(v0: f64x2, v1: f64): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; vmovlhps %xmm1, %xmm0, %xmm0 +; movlhps %xmm1, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq @@ -63,8 +63,8 @@ block0(v0: f64x2, v1: i64): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; vmovsd 0(%rdi), %xmm3 -; vmovsd %xmm0, %xmm3, %xmm0 +; movsd 0(%rdi), %xmm3 +; movsd %xmm0, %xmm3, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -74,8 +74,116 @@ block0(v0: f64x2, v1: i64): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; vmovsd (%rdi), %xmm3 ; trap: heap_oob -; vmovsd %xmm3, %xmm0, %xmm0 +; movsd (%rdi), %xmm3 ; trap: heap_oob +; movsd %xmm3, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %insertlane_i8x16_one_load(i8x16, i64) -> i8x16 { +block0(v0: i8x16, v1: i64): + v2 = load.i8 v1 + v3 = insertlane v0, v2, 1 + return v3 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movzbq 0(%rdi), %rdx +; pinsrb $1, %xmm0, %rdx, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movzbq (%rdi), %rdx ; trap: heap_oob +; pinsrb $1, %edx, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %insertlane_i16x8_one_load(i16x8, i64) -> i16x8 { +block0(v0: i16x8, v1: i64): + v2 = load.i16 v1 + v3 = insertlane v0, v2, 1 + return v3 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movzwq 0(%rdi), %rdx +; pinsrw $1, %xmm0, %rdx, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movzwq (%rdi), %rdx ; trap: heap_oob +; pinsrw $1, %edx, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %insertlane_i32x4_one_load(i32x4, i64) -> i32x4 { +block0(v0: i32x4, v1: i64): + v2 = load.i32 v1 + v3 = insertlane v0, v2, 1 + return v3 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pinsrd $1, %xmm0, 0(%rdi), %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pinsrd $1, (%rdi), %xmm0 ; trap: heap_oob +; movq %rbp, %rsp +; popq %rbp +; retq + +function %insertlane_i64x2_one_load(i64x2, i64) -> i64x2 { +block0(v0: i64x2, v1: i64): + v2 = load.i64 v1 + v3 = insertlane v0, v2, 1 + return v3 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pinsrd.w $1, %xmm0, 0(%rdi), %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pinsrq $1, (%rdi), %xmm0 ; trap: heap_oob ; movq %rbp, %rsp ; popq %rbp ; retq diff --git a/cranelift/filetests/filetests/isa/x64/simd-arith-avx.clif b/cranelift/filetests/filetests/isa/x64/simd-arith-avx.clif index 87c1b77101..ac04216046 100644 --- a/cranelift/filetests/filetests/isa/x64/simd-arith-avx.clif +++ b/cranelift/filetests/filetests/isa/x64/simd-arith-avx.clif @@ -1204,7 +1204,7 @@ block0(v0: i8x16): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; vpalignr $8 %xmm0, %xmm0, %xmm2 +; vpalignr $8, %xmm0, %xmm0, %xmm2 ; vpmovzxbw %xmm2, %xmm0 ; movq %rbp, %rsp ; popq %rbp @@ -1316,7 +1316,7 @@ block0(v0: i8): ; movq %rsp, %rbp ; block0: ; uninit %xmm2 -; vpinsrb $0 %xmm2, %rdi, %xmm4 +; vpinsrb $0, %xmm2, %rdi, %xmm4 ; uninit %xmm6 ; vpxor %xmm6, %xmm6, %xmm8 ; vpshufb %xmm4, %xmm8, %xmm0 @@ -1354,7 +1354,7 @@ block0(v0: f64x2): ; vminpd %xmm6, const(0), %xmm8 ; vroundpd $3, %xmm8, %xmm10 ; vaddpd %xmm10, 
const(1), %xmm12 -; vshufps $136 %xmm12, %xmm4, %xmm0 +; vshufps $136, %xmm12, %xmm4, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret diff --git a/cranelift/filetests/filetests/isa/x64/simd-bitwise-avx.clif b/cranelift/filetests/filetests/isa/x64/simd-bitwise-avx.clif index 7da79a0cc9..023d52d81e 100644 --- a/cranelift/filetests/filetests/isa/x64/simd-bitwise-avx.clif +++ b/cranelift/filetests/filetests/isa/x64/simd-bitwise-avx.clif @@ -529,7 +529,7 @@ block0(v0: f32x4, v1: f32): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; vinsertps $16 %xmm0, %xmm1, %xmm0 +; vinsertps $16, %xmm0, %xmm1, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -579,7 +579,7 @@ block0(v0: i8x16, v1: i8): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; vpinsrb $1 %xmm0, %rdi, %xmm0 +; vpinsrb $1, %xmm0, %rdi, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -604,7 +604,7 @@ block0(v0: i16x8, v1: i16): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; vpinsrw $1 %xmm0, %rdi, %xmm0 +; vpinsrw $1, %xmm0, %rdi, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -629,7 +629,7 @@ block0(v0: i32x4, v1: i32): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; vpinsrd $1 %xmm0, %rdi, %xmm0 +; vpinsrd $1, %xmm0, %rdi, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -654,7 +654,7 @@ block0(v0: i64x2, v1: i64): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; vpinsrq $1 %xmm0, %rdi, %xmm0 +; vpinsrq $1, %xmm0, %rdi, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret diff --git a/cranelift/filetests/filetests/isa/x64/simd-cmp-avx.clif b/cranelift/filetests/filetests/isa/x64/simd-cmp-avx.clif index b46b4e722e..9c69ac0196 100644 --- a/cranelift/filetests/filetests/isa/x64/simd-cmp-avx.clif +++ b/cranelift/filetests/filetests/isa/x64/simd-cmp-avx.clif @@ -215,7 +215,7 @@ block0(v0: f32x4, v1: f32x4): ; vminps %xmm0, %xmm1, %xmm3 ; vminps %xmm1, %xmm0, %xmm5 ; vorps %xmm3, %xmm5, %xmm7 -; vcmpps $3 %xmm7, %xmm5, %xmm9 +; vcmpps $3, %xmm7, %xmm5, %xmm9 ; vorps %xmm7, %xmm9, %xmm11 ; vpsrld %xmm9, $10, %xmm13 ; vandnps %xmm13, %xmm11, %xmm0 @@ -252,7 +252,7 @@ block0(v0: f64x2, v1: f64x2): ; vminpd %xmm0, %xmm1, %xmm3 ; vminpd %xmm1, %xmm0, %xmm5 ; vorpd %xmm3, %xmm5, %xmm7 -; vcmppd $3 %xmm3, %xmm5, %xmm9 +; vcmppd $3, %xmm3, %xmm5, %xmm9 ; vorpd %xmm7, %xmm9, %xmm11 ; vpsrlq %xmm9, $13, %xmm13 ; vandnpd %xmm13, %xmm11, %xmm0 @@ -291,7 +291,7 @@ block0(v0: f32x4, v1: f32x4): ; vxorps %xmm3, %xmm5, %xmm7 ; vorps %xmm3, %xmm7, %xmm9 ; vsubps %xmm9, %xmm7, %xmm11 -; vcmpps $3 %xmm9, %xmm9, %xmm13 +; vcmpps $3, %xmm9, %xmm9, %xmm13 ; vpsrld %xmm13, $10, %xmm15 ; vandnps %xmm15, %xmm11, %xmm0 ; movq %rbp, %rsp @@ -330,7 +330,7 @@ block0(v0: f64x2, v1: f64x2): ; vxorpd %xmm3, %xmm5, %xmm7 ; vorpd %xmm3, %xmm7, %xmm9 ; vsubpd %xmm9, %xmm7, %xmm11 -; vcmppd $3 %xmm9, %xmm9, %xmm13 +; vcmppd $3, %xmm9, %xmm9, %xmm13 ; vpsrlq %xmm13, $13, %xmm15 ; vandnpd %xmm15, %xmm11, %xmm0 ; movq %rbp, %rsp diff --git a/cranelift/filetests/filetests/runtests/simd-extractlane.clif b/cranelift/filetests/filetests/runtests/simd-extractlane.clif index 471130f8c2..2216d68c84 100644 --- a/cranelift/filetests/filetests/runtests/simd-extractlane.clif +++ b/cranelift/filetests/filetests/runtests/simd-extractlane.clif @@ -4,6 +4,7 @@ target aarch64 target s390x set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41 +target x86_64 has_sse3 has_ssse3 has_sse41 has_avx function %extractlane_4(i8x16) -> i8 { block0(v0: i8x16): @@ -33,3 +34,69 @@ block0(v0: i64x2): return v1 } ; run: %extractlane_1([0 4294967297]) == 4294967297 + +function %extractlane_i8x16_through_stack(i8x16) -> i8 { + ss0 = explicit_slot 8 
+block0(v0: i8x16): + v2 = stack_addr.i64 ss0 + v3 = extractlane v0, 1 + store v3, v2 + v4 = load.i8 v2 + return v4 +} +; run: %extractlane_i8x16_through_stack([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]) == 2 + +function %extractlane_i16x8_through_stack(i16x8) -> i16 { + ss0 = explicit_slot 8 +block0(v0: i16x8): + v2 = stack_addr.i64 ss0 + v3 = extractlane v0, 2 + store v3, v2 + v4 = load.i16 v2 + return v4 +} +; run: %extractlane_i16x8_through_stack([1 2 3 4 5 6 7 8]) == 3 + +function %extractlane_i32x4_through_stack(i32x4) -> i32 { + ss0 = explicit_slot 8 +block0(v0: i32x4): + v2 = stack_addr.i64 ss0 + v3 = extractlane v0, 3 + store v3, v2 + v4 = load.i32 v2 + return v4 +} +; run: %extractlane_i32x4_through_stack([1 2 3 4]) == 4 + +function %extractlane_i64x2_through_stack(i64x2) -> i64 { + ss0 = explicit_slot 8 +block0(v0: i64x2): + v2 = stack_addr.i64 ss0 + v3 = extractlane v0, 0 + store v3, v2 + v4 = load.i64 v2 + return v4 +} +; run: %extractlane_i64x2_through_stack([1 2]) == 1 + +function %extractlane_f32x4_through_stack(f32x4) -> f32 { + ss0 = explicit_slot 8 +block0(v0: f32x4): + v2 = stack_addr.i64 ss0 + v3 = extractlane v0, 3 + store v3, v2 + v4 = load.f32 v2 + return v4 +} +; run: %extractlane_f32x4_through_stack([0x1.0 0x2.0 0x3.0 0x4.0]) == 0x4.0 + +function %extractlane_f64x2_through_stack(f64x2) -> f64 { + ss0 = explicit_slot 8 +block0(v0: f64x2): + v2 = stack_addr.i64 ss0 + v3 = extractlane v0, 0 + store v3, v2 + v4 = load.f64 v2 + return v4 +} +; run: %extractlane_f64x2_through_stack([0x1.0 0x2.0]) == 0x1.0 diff --git a/cranelift/filetests/filetests/runtests/simd-insertlane.clif b/cranelift/filetests/filetests/runtests/simd-insertlane.clif index 6e0182b1cb..e9a944ab49 100644 --- a/cranelift/filetests/filetests/runtests/simd-insertlane.clif +++ b/cranelift/filetests/filetests/runtests/simd-insertlane.clif @@ -47,3 +47,91 @@ block0(v0: f64x2, v1: f64): return v2 } ; run: %insertlane_1_in_f64x2([0x1.0 0x2.0], 0x3.0) == [0x1.0 0x3.0] + +function %insertlane_i8x16_through_stack(i8x16, i8) -> i8x16 { + ss0 = explicit_slot 8 +block0(v0: i8x16, v1: i8): + v2 = stack_addr.i64 ss0 + store v1, v2 + v3 = load.i8 v2 + v4 = insertlane v0, v3, 1 + return v4 +} +; run: %insertlane_i8x16_through_stack([1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1], 2) == [1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1] + +function %insertlane_i16x8_through_stack(i16x8, i16) -> i16x8 { + ss0 = explicit_slot 8 +block0(v0: i16x8, v1: i16): + v2 = stack_addr.i64 ss0 + store v1, v2 + v3 = load.i16 v2 + v4 = insertlane v0, v3, 2 + return v4 +} +; run: %insertlane_i16x8_through_stack([1 1 1 1 1 1 1 1], 2) == [1 1 2 1 1 1 1 1] + +function %insertlane_i32x4_through_stack(i32x4, i32) -> i32x4 { + ss0 = explicit_slot 8 +block0(v0: i32x4, v1: i32): + v2 = stack_addr.i64 ss0 + store v1, v2 + v3 = load.i32 v2 + v4 = insertlane v0, v3, 3 + return v4 +} +; run: %insertlane_i32x4_through_stack([1 1 1 1], 2) == [1 1 1 2] + +function %insertlane_i64x2_through_stack(i64x2, i64) -> i64x2 { + ss0 = explicit_slot 8 +block0(v0: i64x2, v1: i64): + v2 = stack_addr.i64 ss0 + store v1, v2 + v3 = load.i64 v2 + v4 = insertlane v0, v3, 0 + return v4 +} +; run: %insertlane_i64x2_through_stack([1 1], 2) == [2 1] + +function %insertlane_f32x4_through_stack(f32x4, f32) -> f32x4 { + ss0 = explicit_slot 8 +block0(v0: f32x4, v1: f32): + v2 = stack_addr.i64 ss0 + store v1, v2 + v3 = load.f32 v2 + v4 = insertlane v0, v3, 3 + return v4 +} +; run: %insertlane_f32x4_through_stack([0x1.0 0x1.0 0x1.0 0x1.0], 0x2.0) == [0x1.0 0x1.0 0x1.0 0x2.0] + +function 
%insertlane_f32x4_through_stack2(f32x4, f32) -> f32x4 { + ss0 = explicit_slot 8 +block0(v0: f32x4, v1: f32): + v2 = stack_addr.i64 ss0 + store v1, v2 + v3 = load.f32 v2 + v4 = insertlane v0, v3, 0 + return v4 +} +; run: %insertlane_f32x4_through_stack2([0x1.0 0x1.0 0x1.0 0x1.0], 0x2.0) == [0x2.0 0x1.0 0x1.0 0x1.0] + +function %insertlane_f64x2_through_stack(f64x2, f64) -> f64x2 { + ss0 = explicit_slot 8 +block0(v0: f64x2, v1: f64): + v2 = stack_addr.i64 ss0 + store v1, v2 + v3 = load.f64 v2 + v4 = insertlane v0, v3, 0 + return v4 +} +; run: %insertlane_f64x2_through_stack([0x1.0 0x1.0], 0x2.0) == [0x2.0 0x1.0] + +function %insertlane_f64x2_through_stack2(f64x2, f64) -> f64x2 { + ss0 = explicit_slot 8 +block0(v0: f64x2, v1: f64): + v2 = stack_addr.i64 ss0 + store v1, v2 + v3 = load.f64 v2 + v4 = insertlane v0, v3, 1 + return v4 +} +; run: %insertlane_f64x2_through_stack2([0x1.0 0x1.0], 0x2.0) == [0x1.0 0x2.0] diff --git a/cranelift/filetests/filetests/wasm/x64-relaxed-simd-deterministic.wat b/cranelift/filetests/filetests/wasm/x64-relaxed-simd-deterministic.wat index 8d7eca8c0d..8d80f59dab 100644 --- a/cranelift/filetests/filetests/wasm/x64-relaxed-simd-deterministic.wat +++ b/cranelift/filetests/filetests/wasm/x64-relaxed-simd-deterministic.wat @@ -44,7 +44,7 @@ ;; movq %rsp, %rbp ;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 } ;; block0: -;; vcmpps $0 %xmm0, %xmm0, %xmm3 +;; vcmpps $0, %xmm0, %xmm0, %xmm3 ;; vandps %xmm0, %xmm3, %xmm5 ;; vpxor %xmm3, %xmm5, %xmm7 ;; vcvttps2dq %xmm5, %xmm9 @@ -71,7 +71,7 @@ ;; vcvtdq2ps %xmm11, %xmm13 ;; vcvttps2dq %xmm7, %xmm15 ;; vsubps %xmm7, %xmm13, %xmm1 -;; vcmpps $2 %xmm13, %xmm1, %xmm3 +;; vcmpps $2, %xmm13, %xmm1, %xmm3 ;; vcvttps2dq %xmm1, %xmm5 ;; vpxor %xmm5, %xmm3, %xmm7 ;; uninit %xmm9 @@ -90,7 +90,7 @@ ;; movq %rsp, %rbp ;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 } ;; block0: -;; vcmppd $0 %xmm0, %xmm0, %xmm3 +;; vcmppd $0, %xmm0, %xmm0, %xmm3 ;; vandps %xmm3, const(0), %xmm5 ;; vminpd %xmm0, %xmm5, %xmm7 ;; vcvttpd2dq %xmm7, %xmm0 @@ -112,7 +112,7 @@ ;; vminpd %xmm7, const(0), %xmm9 ;; vroundpd $3, %xmm9, %xmm11 ;; vaddpd %xmm11, const(1), %xmm13 -;; vshufps $136 %xmm13, %xmm5, %xmm0 +;; vshufps $136, %xmm13, %xmm5, %xmm0 ;; jmp label1 ;; block1: ;; movq %rbp, %rsp @@ -128,9 +128,9 @@ ;; vpmovsxbw %xmm0, %xmm10 ;; vpmovsxbw %xmm1, %xmm12 ;; vpmullw %xmm10, %xmm12, %xmm14 -;; vpalignr $8 %xmm0, %xmm0, %xmm8 +;; vpalignr $8, %xmm0, %xmm0, %xmm8 ;; vpmovsxbw %xmm8, %xmm10 -;; vpalignr $8 %xmm1, %xmm1, %xmm12 +;; vpalignr $8, %xmm1, %xmm1, %xmm12 ;; vpmovsxbw %xmm12, %xmm15 ;; vpmullw %xmm10, %xmm15, %xmm0 ;; vphaddw %xmm14, %xmm0, %xmm0 @@ -149,9 +149,9 @@ ;; vpmovsxbw %xmm0, %xmm13 ;; vpmovsxbw %xmm1, %xmm15 ;; vpmullw %xmm13, %xmm15, %xmm3 -;; vpalignr $8 %xmm0, %xmm0, %xmm11 +;; vpalignr $8, %xmm0, %xmm0, %xmm11 ;; vpmovsxbw %xmm11, %xmm13 -;; vpalignr $8 %xmm1, %xmm1, %xmm15 +;; vpalignr $8, %xmm1, %xmm1, %xmm15 ;; vpmovsxbw %xmm15, %xmm1 ;; vpmullw %xmm13, %xmm1, %xmm4 ;; vphaddw %xmm3, %xmm4, %xmm15
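
As a side note on the new `XmmToGprImmVex`/`XmmMovRMImmVex` arms in emit.rs: they rely on the standard three-byte VEX form (map 0F3A, mandatory 66 prefix, W set only for `vpextrq`). The standalone Rust sketch below reconstructs that layout by hand for `vpextrq $1, %xmm0, %rax`; the bytes are derived from the VEX encoding rules rather than captured from Cranelift's emit tests, so treat them as an unverified illustration, not an expected-encoding string.

// Standalone illustration of the three-byte VEX prefix used by vpextr*:
// C4 | ~R ~X ~B . mmmmm | W . ~vvvv . L . pp, followed by opcode, ModRM, imm8.
fn vex3_prefix(r: bool, x: bool, b: bool, mmmmm: u8, w: bool, vvvv: u8, l: bool, pp: u8) -> [u8; 3] {
    // R/X/B and vvvv are stored inverted in the encoding.
    let byte1 = ((!r as u8) << 7) | ((!x as u8) << 6) | ((!b as u8) << 5) | (mmmmm & 0x1f);
    let byte2 = ((w as u8) << 7) | ((!vvvv & 0xf) << 3) | ((l as u8) << 2) | (pp & 0x3);
    [0xc4, byte1, byte2]
}

fn main() {
    // vpextrq $1, %xmm0, %rax: map 0F3A (mmmmm = 0b00011), 66 prefix (pp = 0b01),
    // W = 1 for the 64-bit form, L = 0 (128-bit), vvvv unused (logical 0).
    let mut bytes = vex3_prefix(false, false, false, 0b00011, true, 0, false, 0b01).to_vec();
    // Opcode 0x16, ModRM 0xC0 (mod = 11, reg = xmm0, rm = rax), then the immediate.
    bytes.extend_from_slice(&[0x16, 0xc0, 0x01]);
    assert_eq!(bytes, [0xc4, 0xe3, 0xf9, 0x16, 0xc0, 0x01]);
    println!("{:02x?}", bytes);
}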