From c26a65a854733735207f641bb7ff23ba574a1f6f Mon Sep 17 00:00:00 2001
From: Alex Crichton
Date: Mon, 20 Feb 2023 09:11:52 -0600
Subject: [PATCH] x64: Add most remaining AVX lowerings (#5819)

* x64: Add most remaining AVX lowerings

This commit goes through `inst.isle` and adds a corresponding AVX
lowering for most SSE lowerings. I opted to skip instructions where the
SSE lowering didn't read/modify a register, such as `roundps`. I think
that AVX will benefit these instructions when there's load-merging,
since AVX doesn't require alignment, but I've deferred that work to a
future PR. Otherwise, though, I think this PR covers all (or almost
all) of the 3-operand AVX forms of the SSE instructions we already
lower. This should improve codegen slightly by reducing register
pressure and removing the need for `movdqa` moves between registers.
The two-rule shape each helper follows is sketched at the end of this
message.

I've attempted to ensure that there's at least one codegen test for
each of the new instructions. As a side note, the recent capstone
integration into `precise-output` tests helped me catch a number of
encoding bugs much earlier than otherwise, so I've found it incredibly
useful in these tests!

* Move `vpinsr*` instructions to their own variant

Use true `XmmMem` and `GprMem` types in the instruction as well to get
more type-level safety for what goes where.

* Remove `Inst::produces_const` accessor

Instead of conditionally defining regalloc and various other
operations, add dedicated `MInst` variants for operations that are
intended to produce a constant, giving them clearer interactions with
regalloc, printing, and the like.

* Fix tests

* Register traps in `MachBuffer` for load-folding ops

This adds a missing `add_trap` to the encoding of VEX instructions with
memory operands, ensuring that if one of them segfaults there is
appropriate metadata for Wasmtime to understand that the instruction
can in fact trap. This fixes a fuzz test case found locally where v8
trapped but Wasmtime didn't catch the signal and crashed the fuzzer.
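The two-rule pattern referenced above, using the `x64_paddd` helper
from this patch as a representative sketch (the other helpers follow
the same shape):

    ;; Vector add of 32-bit lanes: SSE `paddd` or, when available,
    ;; AVX `vpaddd`.
    (decl x64_paddd (Xmm XmmMem) Xmm)

    ;; Priority 0: baseline SSE form. `xmm_rm_r` emits the two-operand
    ;; `paddd`, so regalloc has to tie `src1` to the destination.
    (rule 0 (x64_paddd src1 src2)
          (xmm_rm_r (SseOpcode.Paddd) src1 src2))

    ;; Priority 1: preferred when AVX is enabled. `xmm_rmir_vex` emits
    ;; the three-operand VEX form `vpaddd dst, src1, src2`, so no
    ;; `movdqa` is needed to keep `src1` live.
    (rule 1 (x64_paddd src1 src2)
          (if-let $true (has_avx))
          (xmm_rmir_vex (AvxOpcode.Vpaddd) src1 src2))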
--- cranelift/codegen/src/isa/x64/encoding/vex.rs | 7 + cranelift/codegen/src/isa/x64/inst.isle | 817 +++++-- cranelift/codegen/src/isa/x64/inst/args.rs | 115 +- cranelift/codegen/src/isa/x64/inst/emit.rs | 306 ++- .../codegen/src/isa/x64/inst/emit_tests.rs | 15 + cranelift/codegen/src/isa/x64/inst/mod.rs | 180 +- cranelift/codegen/src/isa/x64/lower.isle | 18 +- cranelift/codegen/src/isa/x64/lower/isle.rs | 11 +- .../filetests/filetests/isa/x64/fcvt.clif | 20 +- .../filetests/isa/x64/simd-arith-avx.clif | 1886 +++++++++++++++++ .../filetests/isa/x64/simd-bitwise-avx.clif | 548 ++++- .../filetests/isa/x64/simd-cmp-avx.clif | 656 ++++++ .../isa/x64/simd-comparison-legalize.clif | 2 +- .../isa/x64/simd-logical-compile.clif | 2 +- .../filetests/runtests/simd-lane-access.clif | 1 + .../simd/load_splat_out_of_bounds.wast | 27 + 16 files changed, 4145 insertions(+), 466 deletions(-) create mode 100644 cranelift/filetests/filetests/isa/x64/simd-arith-avx.clif create mode 100644 cranelift/filetests/filetests/isa/x64/simd-cmp-avx.clif create mode 100644 tests/misc_testsuite/simd/load_splat_out_of_bounds.wast diff --git a/cranelift/codegen/src/isa/x64/encoding/vex.rs b/cranelift/codegen/src/isa/x64/encoding/vex.rs index df2c921697..760c7199d1 100644 --- a/cranelift/codegen/src/isa/x64/encoding/vex.rs +++ b/cranelift/codegen/src/isa/x64/encoding/vex.rs @@ -4,6 +4,7 @@ use super::evex::Register; use super::rex::{LegacyPrefixes, OpcodeMap}; use super::ByteSink; +use crate::ir::TrapCode; use crate::isa::x64::args::Amode; use crate::isa::x64::encoding::rex; use crate::isa::x64::inst::Inst; @@ -267,6 +268,12 @@ impl VexInstruction { /// Emit the VEX-encoded instruction to the provided buffer. pub fn encode(&self, sink: &mut MachBuffer) { + if let RegisterOrAmode::Amode(amode) = &self.rm { + if amode.can_trap() { + sink.add_trap(TrapCode::HeapOutOfBounds); + } + } + // 2/3 byte prefix if self.use_2byte_prefix() { self.encode_2byte_prefix(sink); diff --git a/cranelift/codegen/src/isa/x64/inst.isle b/cranelift/codegen/src/isa/x64/inst.isle index f0f3c70d06..3da3aa66c5 100644 --- a/cranelift/codegen/src/isa/x64/inst.isle +++ b/cranelift/codegen/src/isa/x64/inst.isle @@ -35,6 +35,11 @@ (src2 Gpr) (dst WritableGpr)) + ;; Production of a zero value into a register of the specified size. + (AluConstOp (op AluRmiROpcode) + (size OperandSize) + (dst WritableGpr)) + ;; Instructions on general-purpose registers that only read src and ;; defines dst (dst is not modified). `bsr`, etc. (UnaryRmR (size OperandSize) ;; 2, 4, or 8 @@ -216,6 +221,13 @@ (src2 XmmMem) (dst WritableXmm)) + ;; XMM (scalar or vector) production of a constant value by operating + ;; on a register with itself. + ;; + ;; Used to produce all zeros with xor or all one with a comparison. + (XmmConstOp (op SseOpcode) + (dst WritableXmm)) + ;; XMM (scalar or vector) blend op. The mask is used to blend between ;; src1 and src2. This differs from a use of `XmmRmR` as the mask is ;; implicitly in register xmm0; this special case exists to allow us to @@ -242,6 +254,14 @@ (dst WritableXmm) (imm u8)) + ;; XMM instruction for `vpinsr{b,w,d,q}` which is separte from + ;; `XmmRmRImmVex` because `src2` is a gpr, not xmm register. + (XmmVexPinsr (op AvxOpcode) + (src1 Xmm) + (src2 GprMem) + (dst WritableXmm) + (imm u8)) + ;; XMM (scalar or vector) ternary op that relies on the VEX prefix and ;; has three dynamic inputs. (XmmRmRVex3 (op AvxOpcode) @@ -250,6 +270,13 @@ (src3 XmmMem) (dst WritableXmm)) + ;; XMM blend operation using the VEX encoding. 
+ (XmmRmRBlendVex (op AvxOpcode) + (src1 Xmm) + (src2 XmmMem) + (mask Xmm) + (dst WritableXmm)) + ;; XMM (scalar or vector) binary op that relies on the EVEX ;; prefix. Takes two inputs. (XmmRmREvex (op Avx512Opcode) @@ -1153,11 +1180,110 @@ Vfmadd213sd Vfmadd213ps Vfmadd213pd - Vminps - Vorps - Vandnps Vcmpps - Vpsrld)) + Vcmppd + Vpsrlw + Vpsrld + Vpsrlq + Vpaddb + Vpaddw + Vpaddd + Vpaddq + Vpaddsb + Vpaddsw + Vpaddusb + Vpaddusw + Vpsubb + Vpsubw + Vpsubd + Vpsubq + Vpsubsb + Vpsubsw + Vpsubusb + Vpsubusw + Vpavgb + Vpavgw + Vpand + Vandps + Vandpd + Vpor + Vorps + Vorpd + Vpxor + Vxorps + Vxorpd + Vpmullw + Vpmulld + Vpmulhw + Vpmulhd + Vpmulhrsw + Vpmulhuw + Vpmuldq + Vpmuludq + Vpunpckhwd + Vpunpcklwd + Vunpcklps + Vandnps + Vandnpd + Vpandn + Vaddps + Vaddpd + Vsubps + Vsubpd + Vmulps + Vmulpd + Vdivps + Vdivpd + Vpcmpeqb + Vpcmpeqw + Vpcmpeqd + Vpcmpeqq + Vpcmpgtb + Vpcmpgtw + Vpcmpgtd + Vpcmpgtq + Vminps + Vminpd + Vmaxps + Vmaxpd + Vblendvpd + Vblendvps + Vpblendvb + Vmovlhps + Vpmaxsb + Vpmaxsw + Vpmaxsd + Vpminsb + Vpminsw + Vpminsd + Vpmaxub + Vpmaxuw + Vpmaxud + Vpminub + Vpminuw + Vpminud + Vpunpcklbw + Vpunpckhbw + Vpacksswb + Vpackssdw + Vpackuswb + Vpackusdw + Vpalignr + Vpinsrb + Vpinsrw + Vpinsrd + Vpinsrq + Vpmaddwd + Vpmaddubsw + Vinsertps + Vpshufb + Vshufps + Vpsllw + Vpslld + Vpsllq + Vpsraw + Vpsrad + )) (type Avx512Opcode extern (enum Vcvtudq2ps @@ -1587,9 +1713,17 @@ ;; patterns. (rule 0 (sse_xor_op (multi_lane _bits _lanes)) (SseOpcode.Pxor)) +(decl avx_xor_op (Type) AvxOpcode) +(rule 1 (avx_xor_op $F32X4) (AvxOpcode.Vxorps)) +(rule 1 (avx_xor_op $F64X2) (AvxOpcode.Vxorpd)) +(rule 0 (avx_xor_op (multi_lane _bits _lanes)) (AvxOpcode.Vpxor)) + ;; Performs an xor operation of the two operands specified. (decl sse_xor (Type Xmm XmmMem) Xmm) -(rule (sse_xor ty x y) (xmm_rm_r (sse_xor_op ty) x y)) +(rule 0 (sse_xor ty x y) (xmm_rm_r (sse_xor_op ty) x y)) +(rule 1 (sse_xor ty @ (multi_lane _ _) x y) + (if-let $true (has_avx)) + (xmm_rmir_vex (avx_xor_op ty) x y)) ;; Generates a register value which has an all-ones pattern. ;; @@ -1602,8 +1736,9 @@ ;; we're guaranteeed that everything is equal to itself. (decl vector_all_ones () Xmm) (rule (vector_all_ones) - (let ((r WritableXmm (temp_writable_xmm))) - (x64_pcmpeqd r r))) + (let ((r WritableXmm (temp_writable_xmm)) + (_ Unit (emit (MInst.XmmConstOp (SseOpcode.Pcmpeqd) r)))) + r)) ;; Helper for creating XmmUninitializedValue instructions. (decl xmm_uninit_value () Xmm) @@ -1616,8 +1751,8 @@ (decl make_i64x2_from_lanes (GprMem GprMem) Xmm) (rule (make_i64x2_from_lanes lo hi) (let ((dst Xmm (xmm_uninit_value)) - (dst Xmm (x64_pinsrd dst lo 0 (OperandSize.Size64))) - (dst Xmm (x64_pinsrd dst hi 1 (OperandSize.Size64)))) + (dst Xmm (x64_pinsrq dst lo 0)) + (dst Xmm (x64_pinsrq dst hi 1))) dst)) ;; Move a `RegMemImm.Reg` operand to an XMM register, if necessary. @@ -1940,50 +2075,37 @@ ;; Special case for integer zero immediates: turn them into an `xor r, r`. (rule 1 (imm (fits_in_64 ty) (u64_zero)) (let ((wgpr WritableGpr (temp_writable_gpr)) - (g Gpr wgpr) (size OperandSize (operand_size_of_type_32_64 ty)) - (_ Unit (emit (MInst.AluRmiR size - (AluRmiROpcode.Xor) - g - g - wgpr)))) - (gpr_to_reg g))) + (_ Unit (emit (MInst.AluConstOp (AluRmiROpcode.Xor) size wgpr)))) + (gpr_to_reg wgpr))) ;; Special case for zero immediates with vector types, they turn into an xor ;; specific to the vector type. 
(rule 0 (imm ty @ (multi_lane _bits _lanes) 0) - (let ((wr WritableXmm (temp_writable_xmm)) - (r Xmm wr) - (_ Unit (emit (MInst.XmmRmR (sse_xor_op ty) - r - r - wr)))) - (xmm_to_reg r))) + (xmm_to_reg (xmm_zero ty))) -;; Special case for `f32` zero immediates to use `xorps`. -(rule 2 (imm $F32 (u64_zero)) +;; Special case for `f32` zero immediates +(rule 2 (imm ty @ $F32 (u64_zero)) (let ((wr WritableXmm (temp_writable_xmm)) - (r Xmm wr) - (_ Unit (emit (MInst.XmmRmR (SseOpcode.Xorps) - r - r - wr)))) - (xmm_to_reg r))) + (_ Unit (emit (MInst.XmmConstOp (SseOpcode.Xorps) wr)))) + (xmm_to_reg wr))) ;; TODO: use cmpeqps for all 1s ;; Special case for `f64` zero immediates to use `xorpd`. -(rule 2 (imm $F64 (u64_zero)) +(rule 2 (imm ty @ $F64 (u64_zero)) (let ((wr WritableXmm (temp_writable_xmm)) - (r Xmm wr) - (_ Unit (emit (MInst.XmmRmR (SseOpcode.Xorpd) - r - r - wr)))) - (xmm_to_reg r))) + (_ Unit (emit (MInst.XmmConstOp (SseOpcode.Xorpd) wr)))) + (xmm_to_reg wr))) ;; TODO: use cmpeqpd for all 1s +(decl xmm_zero (Type) Xmm) +(rule (xmm_zero ty) + (let ((wr WritableXmm (temp_writable_xmm)) + (_ Unit (emit (MInst.XmmConstOp (sse_xor_op ty) wr)))) + wr)) + ;; Helper for creating `MInst.ShiftR` instructions. (decl shift_r (Type ShiftKind Gpr Imm8Gpr) Gpr) (rule (shift_r ty kind src1 src2) @@ -2203,113 +2325,179 @@ ;; Helper for creating `paddb` instructions. (decl x64_paddb (Xmm XmmMem) Xmm) -(rule (x64_paddb src1 src2) +(rule 0 (x64_paddb src1 src2) (xmm_rm_r (SseOpcode.Paddb) src1 src2)) +(rule 1 (x64_paddb src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpaddb) src1 src2)) ;; Helper for creating `paddw` instructions. (decl x64_paddw (Xmm XmmMem) Xmm) -(rule (x64_paddw src1 src2) +(rule 0 (x64_paddw src1 src2) (xmm_rm_r (SseOpcode.Paddw) src1 src2)) +(rule 1 (x64_paddw src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpaddw) src1 src2)) ;; Helper for creating `paddd` instructions. (decl x64_paddd (Xmm XmmMem) Xmm) -(rule (x64_paddd src1 src2) +(rule 0 (x64_paddd src1 src2) (xmm_rm_r (SseOpcode.Paddd) src1 src2)) +(rule 1 (x64_paddd src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpaddd) src1 src2)) ;; Helper for creating `paddq` instructions. (decl x64_paddq (Xmm XmmMem) Xmm) -(rule (x64_paddq src1 src2) +(rule 0 (x64_paddq src1 src2) (xmm_rm_r (SseOpcode.Paddq) src1 src2)) +(rule 1 (x64_paddq src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpaddq) src1 src2)) ;; Helper for creating `paddsb` instructions. (decl x64_paddsb (Xmm XmmMem) Xmm) -(rule (x64_paddsb src1 src2) +(rule 0 (x64_paddsb src1 src2) (xmm_rm_r (SseOpcode.Paddsb) src1 src2)) +(rule 1 (x64_paddsb src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpaddsb) src1 src2)) ;; Helper for creating `paddsw` instructions. (decl x64_paddsw (Xmm XmmMem) Xmm) -(rule (x64_paddsw src1 src2) +(rule 0 (x64_paddsw src1 src2) (xmm_rm_r (SseOpcode.Paddsw) src1 src2)) +(rule 1 (x64_paddsw src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpaddsw) src1 src2)) ;; Helper for creating `paddusb` instructions. (decl x64_paddusb (Xmm XmmMem) Xmm) -(rule (x64_paddusb src1 src2) +(rule 0 (x64_paddusb src1 src2) (xmm_rm_r (SseOpcode.Paddusb) src1 src2)) +(rule 1 (x64_paddusb src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpaddusb) src1 src2)) ;; Helper for creating `paddusw` instructions. 
(decl x64_paddusw (Xmm XmmMem) Xmm) -(rule (x64_paddusw src1 src2) +(rule 0 (x64_paddusw src1 src2) (xmm_rm_r (SseOpcode.Paddusw) src1 src2)) +(rule 1 (x64_paddusw src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpaddusw) src1 src2)) ;; Helper for creating `psubb` instructions. (decl x64_psubb (Xmm XmmMem) Xmm) -(rule (x64_psubb src1 src2) +(rule 0 (x64_psubb src1 src2) (xmm_rm_r (SseOpcode.Psubb) src1 src2)) +(rule 1 (x64_psubb src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpsubb) src1 src2)) ;; Helper for creating `psubw` instructions. (decl x64_psubw (Xmm XmmMem) Xmm) -(rule (x64_psubw src1 src2) +(rule 0 (x64_psubw src1 src2) (xmm_rm_r (SseOpcode.Psubw) src1 src2)) +(rule 1 (x64_psubw src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpsubw) src1 src2)) ;; Helper for creating `psubd` instructions. (decl x64_psubd (Xmm XmmMem) Xmm) -(rule (x64_psubd src1 src2) +(rule 0 (x64_psubd src1 src2) (xmm_rm_r (SseOpcode.Psubd) src1 src2)) +(rule 1 (x64_psubd src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpsubd) src1 src2)) ;; Helper for creating `psubq` instructions. (decl x64_psubq (Xmm XmmMem) Xmm) -(rule (x64_psubq src1 src2) +(rule 0 (x64_psubq src1 src2) (xmm_rm_r (SseOpcode.Psubq) src1 src2)) +(rule 1 (x64_psubq src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpsubq) src1 src2)) ;; Helper for creating `psubsb` instructions. (decl x64_psubsb (Xmm XmmMem) Xmm) -(rule (x64_psubsb src1 src2) +(rule 0 (x64_psubsb src1 src2) (xmm_rm_r (SseOpcode.Psubsb) src1 src2)) +(rule 1 (x64_psubsb src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpsubsb) src1 src2)) ;; Helper for creating `psubsw` instructions. (decl x64_psubsw (Xmm XmmMem) Xmm) -(rule (x64_psubsw src1 src2) +(rule 0 (x64_psubsw src1 src2) (xmm_rm_r (SseOpcode.Psubsw) src1 src2)) +(rule 1 (x64_psubsw src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpsubsw) src1 src2)) ;; Helper for creating `psubusb` instructions. (decl x64_psubusb (Xmm XmmMem) Xmm) -(rule (x64_psubusb src1 src2) +(rule 0 (x64_psubusb src1 src2) (xmm_rm_r (SseOpcode.Psubusb) src1 src2)) +(rule 1 (x64_psubusb src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpsubusb) src1 src2)) ;; Helper for creating `psubusw` instructions. (decl x64_psubusw (Xmm XmmMem) Xmm) -(rule (x64_psubusw src1 src2) +(rule 0 (x64_psubusw src1 src2) (xmm_rm_r (SseOpcode.Psubusw) src1 src2)) +(rule 1 (x64_psubusw src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpsubusw) src1 src2)) ;; Helper for creating `pavgb` instructions. (decl x64_pavgb (Xmm XmmMem) Xmm) -(rule (x64_pavgb src1 src2) +(rule 0 (x64_pavgb src1 src2) (xmm_rm_r (SseOpcode.Pavgb) src1 src2)) +(rule 1 (x64_pavgb src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpavgb) src1 src2)) ;; Helper for creating `pavgw` instructions. (decl x64_pavgw (Xmm XmmMem) Xmm) -(rule (x64_pavgw src1 src2) +(rule 0 (x64_pavgw src1 src2) (xmm_rm_r (SseOpcode.Pavgw) src1 src2)) +(rule 1 (x64_pavgw src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpavgw) src1 src2)) ;; Helper for creating `pand` instructions. (decl x64_pand (Xmm XmmMem) Xmm) -(rule (x64_pand src1 src2) +(rule 0 (x64_pand src1 src2) (xmm_rm_r (SseOpcode.Pand) src1 src2)) +(rule 1 (x64_pand src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpand) src1 src2)) ;; Helper for creating `andps` instructions. 
(decl x64_andps (Xmm XmmMem) Xmm) -(rule (x64_andps src1 src2) +(rule 0 (x64_andps src1 src2) (xmm_rm_r (SseOpcode.Andps) src1 src2)) +(rule 1 (x64_andps src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vandps) src1 src2)) ;; Helper for creating `andpd` instructions. (decl x64_andpd (Xmm XmmMem) Xmm) -(rule (x64_andpd src1 src2) +(rule 0 (x64_andpd src1 src2) (xmm_rm_r (SseOpcode.Andpd) src1 src2)) +(rule 1 (x64_andpd src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vandpd) src1 src2)) ;; Helper for creating `por` instructions. (decl x64_por (Xmm XmmMem) Xmm) -(rule (x64_por src1 src2) +(rule 0 (x64_por src1 src2) (xmm_rm_r (SseOpcode.Por) src1 src2)) +(rule 1 (x64_por src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpor) src1 src2)) ;; Helper for creating `orps` instructions. (decl x64_orps (Xmm XmmMem) Xmm) @@ -2321,73 +2509,115 @@ ;; Helper for creating `orpd` instructions. (decl x64_orpd (Xmm XmmMem) Xmm) -(rule (x64_orpd src1 src2) +(rule 0 (x64_orpd src1 src2) (xmm_rm_r (SseOpcode.Orpd) src1 src2)) +(rule 1 (x64_orpd src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vorpd) src1 src2)) -;; Helper for creating `pxor` instructions. +;; Helper fxor creating `pxor` instructions. (decl x64_pxor (Xmm XmmMem) Xmm) -(rule (x64_pxor src1 src2) +(rule 0 (x64_pxor src1 src2) (xmm_rm_r (SseOpcode.Pxor) src1 src2)) +(rule 1 (x64_pxor src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpxor) src1 src2)) -;; Helper for creating `xorps` instructions. +;; Helper fxor creating `xorps` instructions. (decl x64_xorps (Xmm XmmMem) Xmm) -(rule (x64_xorps src1 src2) +(rule 0 (x64_xorps src1 src2) (xmm_rm_r (SseOpcode.Xorps) src1 src2)) +(rule 1 (x64_xorps src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vxorps) src1 src2)) -;; Helper for creating `xorpd` instructions. +;; Helper fxor creating `xorpd` instructions. (decl x64_xorpd (Xmm XmmMem) Xmm) -(rule (x64_xorpd src1 src2) +(rule 0 (x64_xorpd src1 src2) (xmm_rm_r (SseOpcode.Xorpd) src1 src2)) +(rule 1 (x64_xorpd src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vxorpd) src1 src2)) ;; Helper for creating `pmullw` instructions. (decl x64_pmullw (Xmm XmmMem) Xmm) -(rule (x64_pmullw src1 src2) +(rule 0 (x64_pmullw src1 src2) (xmm_rm_r (SseOpcode.Pmullw) src1 src2)) +(rule 1 (x64_pmullw src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpmullw) src1 src2)) ;; Helper for creating `pmulld` instructions. (decl x64_pmulld (Xmm XmmMem) Xmm) -(rule (x64_pmulld src1 src2) +(rule 0 (x64_pmulld src1 src2) (xmm_rm_r (SseOpcode.Pmulld) src1 src2)) +(rule 1 (x64_pmulld src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpmulld) src1 src2)) ;; Helper for creating `pmulhw` instructions. (decl x64_pmulhw (Xmm XmmMem) Xmm) -(rule (x64_pmulhw src1 src2) +(rule 0 (x64_pmulhw src1 src2) (xmm_rm_r (SseOpcode.Pmulhw) src1 src2)) +(rule 1 (x64_pmulhw src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpmulhw) src1 src2)) ;; Helper for creating `pmulhrsw` instructions. (decl x64_pmulhrsw (Xmm XmmMem) Xmm) -(rule (x64_pmulhrsw src1 src2) +(rule 0 (x64_pmulhrsw src1 src2) (xmm_rm_r (SseOpcode.Pmulhrsw) src1 src2)) +(rule 1 (x64_pmulhrsw src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpmulhrsw) src1 src2)) ;; Helper for creating `pmulhuw` instructions. 
(decl x64_pmulhuw (Xmm XmmMem) Xmm) -(rule (x64_pmulhuw src1 src2) +(rule 0 (x64_pmulhuw src1 src2) (xmm_rm_r (SseOpcode.Pmulhuw) src1 src2)) +(rule 1 (x64_pmulhuw src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpmulhuw) src1 src2)) ;; Helper for creating `pmuldq` instructions. (decl x64_pmuldq (Xmm XmmMem) Xmm) -(rule (x64_pmuldq src1 src2) +(rule 0 (x64_pmuldq src1 src2) (xmm_rm_r (SseOpcode.Pmuldq) src1 src2)) +(rule 1 (x64_pmuldq src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpmuldq) src1 src2)) ;; Helper for creating `pmuludq` instructions. (decl x64_pmuludq (Xmm XmmMem) Xmm) -(rule (x64_pmuludq src1 src2) +(rule 0 (x64_pmuludq src1 src2) (xmm_rm_r (SseOpcode.Pmuludq) src1 src2)) +(rule 1 (x64_pmuludq src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpmuludq) src1 src2)) ;; Helper for creating `punpckhwd` instructions. (decl x64_punpckhwd (Xmm XmmMem) Xmm) -(rule (x64_punpckhwd src1 src2) +(rule 0 (x64_punpckhwd src1 src2) (xmm_rm_r (SseOpcode.Punpckhwd) src1 src2)) +(rule 1 (x64_punpckhwd src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpunpckhwd) src1 src2)) ;; Helper for creating `punpcklwd` instructions. (decl x64_punpcklwd (Xmm XmmMem) Xmm) -(rule (x64_punpcklwd src1 src2) +(rule 0 (x64_punpcklwd src1 src2) (xmm_rm_r (SseOpcode.Punpcklwd) src1 src2)) +(rule 1 (x64_punpcklwd src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpunpcklwd) src1 src2)) ;; Helper for creating `unpcklps` instructions. (decl x64_unpcklps (Xmm XmmMem) Xmm) -(rule (x64_unpcklps src1 src2) +(rule 0 (x64_unpcklps src1 src2) (xmm_rm_r (SseOpcode.Unpcklps) src1 src2)) +(rule 1 (x64_unpcklps src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vunpcklps) src1 src2)) ;; Helper for creating `andnps` instructions. (decl x64_andnps (Xmm XmmMem) Xmm) @@ -2399,13 +2629,19 @@ ;; Helper for creating `andnpd` instructions. (decl x64_andnpd (Xmm XmmMem) Xmm) -(rule (x64_andnpd src1 src2) +(rule 0 (x64_andnpd src1 src2) (xmm_rm_r (SseOpcode.Andnpd) src1 src2)) +(rule 1 (x64_andnpd src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vandnpd) src1 src2)) ;; Helper for creating `pandn` instructions. (decl x64_pandn (Xmm XmmMem) Xmm) -(rule (x64_pandn src1 src2) +(rule 0 (x64_pandn src1 src2) (xmm_rm_r (SseOpcode.Pandn) src1 src2)) +(rule 1 (x64_pandn src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpandn) src1 src2)) ;; Helper for creating `addss` instructions. (decl x64_addss (Xmm XmmMem) Xmm) @@ -2419,13 +2655,19 @@ ;; Helper for creating `addps` instructions. (decl x64_addps (Xmm XmmMem) Xmm) -(rule (x64_addps src1 src2) +(rule 0 (x64_addps src1 src2) (xmm_rm_r (SseOpcode.Addps) src1 src2)) +(rule 1 (x64_addps src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vaddps) src1 src2)) ;; Helper for creating `addpd` instructions. (decl x64_addpd (Xmm XmmMem) Xmm) -(rule (x64_addpd src1 src2) +(rule 0 (x64_addpd src1 src2) (xmm_rm_r (SseOpcode.Addpd) src1 src2)) +(rule 1 (x64_addpd src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vaddpd) src1 src2)) ;; Helper for creating `subss` instructions. (decl x64_subss (Xmm XmmMem) Xmm) @@ -2439,13 +2681,19 @@ ;; Helper for creating `subps` instructions. (decl x64_subps (Xmm XmmMem) Xmm) -(rule (x64_subps src1 src2) +(rule 0 (x64_subps src1 src2) (xmm_rm_r (SseOpcode.Subps) src1 src2)) +(rule 1 (x64_subps src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vsubps) src1 src2)) ;; Helper for creating `subpd` instructions. 
(decl x64_subpd (Xmm XmmMem) Xmm) -(rule (x64_subpd src1 src2) +(rule 0 (x64_subpd src1 src2) (xmm_rm_r (SseOpcode.Subpd) src1 src2)) +(rule 1 (x64_subpd src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vsubpd) src1 src2)) ;; Helper for creating `mulss` instructions. (decl x64_mulss (Xmm XmmMem) Xmm) @@ -2459,13 +2707,19 @@ ;; Helper for creating `mulps` instructions. (decl x64_mulps (Xmm XmmMem) Xmm) -(rule (x64_mulps src1 src2) +(rule 0 (x64_mulps src1 src2) (xmm_rm_r (SseOpcode.Mulps) src1 src2)) +(rule 1 (x64_mulps src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vmulps) src1 src2)) ;; Helper for creating `mulpd` instructions. (decl x64_mulpd (Xmm XmmMem) Xmm) (rule (x64_mulpd src1 src2) (xmm_rm_r (SseOpcode.Mulpd) src1 src2)) +(rule 1 (x64_mulpd src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vmulpd) src1 src2)) ;; Helper for creating `divss` instructions. (decl x64_divss (Xmm XmmMem) Xmm) @@ -2479,43 +2733,63 @@ ;; Helper for creating `divps` instructions. (decl x64_divps (Xmm XmmMem) Xmm) -(rule (x64_divps src1 src2) +(rule 0 (x64_divps src1 src2) (xmm_rm_r (SseOpcode.Divps) src1 src2)) +(rule 1 (x64_divps src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vdivps) src1 src2)) ;; Helper for creating `divpd` instructions. (decl x64_divpd (Xmm XmmMem) Xmm) -(rule (x64_divpd src1 src2) +(rule 0 (x64_divpd src1 src2) (xmm_rm_r (SseOpcode.Divpd) src1 src2)) +(rule 1 (x64_divpd src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vdivpd) src1 src2)) -(decl sse_blend_op (Type) SseOpcode) -(rule 1 (sse_blend_op $F32X4) (SseOpcode.Blendvps)) -(rule 1 (sse_blend_op $F64X2) (SseOpcode.Blendvpd)) - -;; Priority 0 because multi_lane overlaps with the previous two type patterns. -(rule 0 (sse_blend_op (multi_lane _bits _lanes)) (SseOpcode.Pblendvb)) - -(decl sse_mov_op (Type) SseOpcode) -(rule 1 (sse_mov_op $F32X4) (SseOpcode.Movaps)) -(rule 1 (sse_mov_op $F64X2) (SseOpcode.Movapd)) - -;; Priority 0 because multi_lane overlaps with the previous two type patterns. -(rule 0 (sse_mov_op (multi_lane _bits _lanes)) (SseOpcode.Movdqa)) - +;; Helper for creating `XmmRmRBlend` instructions (decl xmm_rm_r_blend (SseOpcode Xmm XmmMem Xmm) Xmm) (rule (xmm_rm_r_blend op src1 src2 mask) (let ((dst WritableXmm (temp_writable_xmm)) (_ Unit (emit (MInst.XmmRmRBlend op src1 src2 mask dst)))) dst)) +;; Helper for creating `XmmRmRBlendVex` instructions +(decl xmm_rmr_blend_vex (AvxOpcode Xmm XmmMem Xmm) Xmm) +(rule (xmm_rmr_blend_vex op src1 src2 mask) + (let ((dst WritableXmm (temp_writable_xmm)) + (_ Unit (emit (MInst.XmmRmRBlendVex op src1 src2 mask dst)))) + dst)) + ;; Helper for creating `blendvp{d,s}` and `pblendvb` instructions. (decl x64_blend (Type Xmm XmmMem Xmm) Xmm) -(rule (x64_blend ty mask src1 src2) - (xmm_rm_r_blend (sse_blend_op ty) src2 src1 mask)) +(rule 1 (x64_blend $F32X4 mask src1 src2) (x64_blendvps src2 src1 mask)) +(rule 1 (x64_blend $F64X2 mask src1 src2) (x64_blendvpd src2 src1 mask)) +(rule 0 (x64_blend (multi_lane _ _) mask src1 src2) (x64_pblendvb src2 src1 mask)) ;; Helper for creating `blendvpd` instructions. (decl x64_blendvpd (Xmm XmmMem Xmm) Xmm) -(rule (x64_blendvpd src1 src2 mask) +(rule 0 (x64_blendvpd src1 src2 mask) (xmm_rm_r_blend (SseOpcode.Blendvpd) src1 src2 mask)) +(rule 1 (x64_blendvpd src1 src2 mask) + (if-let $true (has_avx)) + (xmm_rmr_blend_vex (AvxOpcode.Vblendvpd) src1 src2 mask)) + +;; Helper for creating `blendvps` instructions. 
+(decl x64_blendvps (Xmm XmmMem Xmm) Xmm) +(rule 0 (x64_blendvps src1 src2 mask) + (xmm_rm_r_blend (SseOpcode.Blendvps) src1 src2 mask)) +(rule 1 (x64_blendvps src1 src2 mask) + (if-let $true (has_avx)) + (xmm_rmr_blend_vex (AvxOpcode.Vblendvps) src1 src2 mask)) + +;; Helper for creating `pblendvb` instructions. +(decl x64_pblendvb (Xmm XmmMem Xmm) Xmm) +(rule 0 (x64_pblendvb src1 src2 mask) + (xmm_rm_r_blend (SseOpcode.Pblendvb) src1 src2 mask)) +(rule 1 (x64_pblendvb src1 src2 mask) + (if-let $true (has_avx)) + (xmm_rmr_blend_vex (AvxOpcode.Vpblendvb) src1 src2 mask)) ;; Helper for creating `movsd` instructions. (decl x64_movsd_regmove (Xmm XmmMem) Xmm) @@ -2524,8 +2798,11 @@ ;; Helper for creating `movlhps` instructions. (decl x64_movlhps (Xmm XmmMem) Xmm) -(rule (x64_movlhps src1 src2) +(rule 0 (x64_movlhps src1 src2) (xmm_rm_r (SseOpcode.Movlhps) src1 src2)) +(rule 1 (x64_movlhps src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vmovlhps) src1 src2)) ;; Helpers for creating `pmaxs*` instructions. (decl x64_pmaxs (Type Xmm XmmMem) Xmm) @@ -2534,11 +2811,20 @@ (rule (x64_pmaxs $I32X4 x y) (x64_pmaxsd x y)) ;; No $I64X2 version (PMAXSQ) in SSE4.1. (decl x64_pmaxsb (Xmm XmmMem) Xmm) -(rule (x64_pmaxsb src1 src2) (xmm_rm_r (SseOpcode.Pmaxsb) src1 src2)) +(rule 0 (x64_pmaxsb src1 src2) (xmm_rm_r (SseOpcode.Pmaxsb) src1 src2)) +(rule 1 (x64_pmaxsb src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpmaxsb) src1 src2)) (decl x64_pmaxsw (Xmm XmmMem) Xmm) -(rule (x64_pmaxsw src1 src2) (xmm_rm_r (SseOpcode.Pmaxsw) src1 src2)) +(rule 0 (x64_pmaxsw src1 src2) (xmm_rm_r (SseOpcode.Pmaxsw) src1 src2)) +(rule 1 (x64_pmaxsw src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpmaxsw) src1 src2)) (decl x64_pmaxsd (Xmm XmmMem) Xmm) -(rule (x64_pmaxsd src1 src2) (xmm_rm_r (SseOpcode.Pmaxsd) src1 src2)) +(rule 0 (x64_pmaxsd src1 src2) (xmm_rm_r (SseOpcode.Pmaxsd) src1 src2)) +(rule 1 (x64_pmaxsd src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpmaxsd) src1 src2)) ;; Helpers for creating `pmins*` instructions. (decl x64_pmins (Type Xmm XmmMem) Xmm) @@ -2547,11 +2833,20 @@ (rule (x64_pmins $I32X4 x y) (x64_pminsd x y)) ;; No $I64X2 version (PMINSQ) in SSE4.1. (decl x64_pminsb (Xmm XmmMem) Xmm) -(rule (x64_pminsb src1 src2) (xmm_rm_r (SseOpcode.Pminsb) src1 src2)) +(rule 0 (x64_pminsb src1 src2) (xmm_rm_r (SseOpcode.Pminsb) src1 src2)) +(rule 1 (x64_pminsb src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpminsb) src1 src2)) (decl x64_pminsw (Xmm XmmMem) Xmm) -(rule (x64_pminsw src1 src2) (xmm_rm_r (SseOpcode.Pminsw) src1 src2)) +(rule 0 (x64_pminsw src1 src2) (xmm_rm_r (SseOpcode.Pminsw) src1 src2)) +(rule 1 (x64_pminsw src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpminsw) src1 src2)) (decl x64_pminsd (Xmm XmmMem) Xmm) -(rule (x64_pminsd src1 src2) (xmm_rm_r (SseOpcode.Pminsd) src1 src2)) +(rule 0 (x64_pminsd src1 src2) (xmm_rm_r (SseOpcode.Pminsd) src1 src2)) +(rule 1 (x64_pminsd src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpminsd) src1 src2)) ;; Helpers for creating `pmaxu*` instructions. (decl x64_pmaxu (Type Xmm XmmMem) Xmm) @@ -2560,11 +2855,20 @@ (rule (x64_pmaxu $I32X4 x y) (x64_pmaxud x y)) ;; No $I64X2 version (PMAXUQ) in SSE4.1. 
(decl x64_pmaxub (Xmm XmmMem) Xmm) -(rule (x64_pmaxub src1 src2) (xmm_rm_r (SseOpcode.Pmaxub) src1 src2)) +(rule 0 (x64_pmaxub src1 src2) (xmm_rm_r (SseOpcode.Pmaxub) src1 src2)) +(rule 1 (x64_pmaxub src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpmaxub) src1 src2)) (decl x64_pmaxuw (Xmm XmmMem) Xmm) -(rule (x64_pmaxuw src1 src2) (xmm_rm_r (SseOpcode.Pmaxuw) src1 src2)) +(rule 0 (x64_pmaxuw src1 src2) (xmm_rm_r (SseOpcode.Pmaxuw) src1 src2)) +(rule 1 (x64_pmaxuw src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpmaxuw) src1 src2)) (decl x64_pmaxud (Xmm XmmMem) Xmm) -(rule (x64_pmaxud src1 src2) (xmm_rm_r (SseOpcode.Pmaxud) src1 src2)) +(rule 0 (x64_pmaxud src1 src2) (xmm_rm_r (SseOpcode.Pmaxud) src1 src2)) +(rule 1 (x64_pmaxud src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpmaxud) src1 src2)) ;; Helper for creating `pminu*` instructions. (decl x64_pminu (Type Xmm XmmMem) Xmm) @@ -2573,41 +2877,68 @@ (rule (x64_pminu $I32X4 x y) (x64_pminud x y)) ;; No $I64X2 version (PMINUQ) in SSE4.1. (decl x64_pminub (Xmm XmmMem) Xmm) -(rule (x64_pminub src1 src2) (xmm_rm_r (SseOpcode.Pminub) src1 src2)) +(rule 0 (x64_pminub src1 src2) (xmm_rm_r (SseOpcode.Pminub) src1 src2)) +(rule 1 (x64_pminub src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpminub) src1 src2)) (decl x64_pminuw (Xmm XmmMem) Xmm) -(rule (x64_pminuw src1 src2) (xmm_rm_r (SseOpcode.Pminuw) src1 src2)) +(rule 0 (x64_pminuw src1 src2) (xmm_rm_r (SseOpcode.Pminuw) src1 src2)) +(rule 1 (x64_pminuw src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpminuw) src1 src2)) (decl x64_pminud (Xmm XmmMem) Xmm) -(rule (x64_pminud src1 src2) (xmm_rm_r (SseOpcode.Pminud) src1 src2)) +(rule 0 (x64_pminud src1 src2) (xmm_rm_r (SseOpcode.Pminud) src1 src2)) +(rule 1 (x64_pminud src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpminud) src1 src2)) ;; Helper for creating `punpcklbw` instructions. (decl x64_punpcklbw (Xmm XmmMem) Xmm) -(rule (x64_punpcklbw src1 src2) +(rule 0 (x64_punpcklbw src1 src2) (xmm_rm_r (SseOpcode.Punpcklbw) src1 src2)) +(rule 1 (x64_punpcklbw src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpunpcklbw) src1 src2)) ;; Helper for creating `punpckhbw` instructions. (decl x64_punpckhbw (Xmm XmmMem) Xmm) -(rule (x64_punpckhbw src1 src2) +(rule 0 (x64_punpckhbw src1 src2) (xmm_rm_r (SseOpcode.Punpckhbw) src1 src2)) +(rule 1 (x64_punpckhbw src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpunpckhbw) src1 src2)) ;; Helper for creating `packsswb` instructions. (decl x64_packsswb (Xmm XmmMem) Xmm) -(rule (x64_packsswb src1 src2) +(rule 0 (x64_packsswb src1 src2) (xmm_rm_r (SseOpcode.Packsswb) src1 src2)) +(rule 1 (x64_packsswb src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpacksswb) src1 src2)) ;; Helper for creating `packssdw` instructions. (decl x64_packssdw (Xmm XmmMem) Xmm) -(rule (x64_packssdw src1 src2) +(rule 0 (x64_packssdw src1 src2) (xmm_rm_r (SseOpcode.Packssdw) src1 src2)) +(rule 1 (x64_packssdw src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpackssdw) src1 src2)) ;; Helper for creating `packuswb` instructions. (decl x64_packuswb (Xmm XmmMem) Xmm) -(rule (x64_packuswb src1 src2) +(rule 0 (x64_packuswb src1 src2) (xmm_rm_r (SseOpcode.Packuswb) src1 src2)) +(rule 1 (x64_packuswb src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpackuswb) src1 src2)) ;; Helper for creating `packusdw` instructions. 
(decl x64_packusdw (Xmm XmmMem) Xmm) -(rule (x64_packusdw src1 src2) +(rule 0 (x64_packusdw src1 src2) (xmm_rm_r (SseOpcode.Packusdw) src1 src2)) +(rule 1 (x64_packusdw src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpackusdw) src1 src2)) ;; Helper for creating `MInst.XmmRmRImm` instructions. (decl xmm_rm_r_imm (SseOpcode Reg RegMem u8 OperandSize) Xmm) @@ -2623,12 +2954,15 @@ ;; Helper for creating `palignr` instructions. (decl x64_palignr (Xmm XmmMem u8 OperandSize) Xmm) -(rule (x64_palignr src1 src2 imm size) +(rule 0 (x64_palignr src1 src2 imm size) (xmm_rm_r_imm (SseOpcode.Palignr) src1 src2 imm size)) +(rule 1 (x64_palignr src1 src2 imm size) + (if-let $true (has_avx)) + (xmm_rmr_imm_vex (AvxOpcode.Vpalignr) src1 src2 imm)) ;; Helpers for creating `cmpp*` instructions. (decl x64_cmpp (Type Xmm XmmMem FcmpImm) Xmm) @@ -2653,39 +2987,73 @@ ;; operations, since this presumably induces the correct encoding of the ;; instruction. (decl x64_cmppd (Xmm XmmMem FcmpImm) Xmm) -(rule (x64_cmppd src1 src2 imm) +(rule 0 (x64_cmppd src1 src2 imm) (xmm_rm_r_imm (SseOpcode.Cmppd) src1 src2 (encode_fcmp_imm imm) (OperandSize.Size32))) +(rule 1 (x64_cmppd src1 src2 imm) + (if-let $true (has_avx)) + (xmm_rmr_imm_vex (AvxOpcode.Vcmppd) + src1 + src2 + (encode_fcmp_imm imm))) ;; Helper for creating `pinsrb` instructions. (decl x64_pinsrb (Xmm GprMem u8) Xmm) -(rule (x64_pinsrb src1 src2 lane) +(rule 0 (x64_pinsrb src1 src2 lane) (xmm_rm_r_imm (SseOpcode.Pinsrb) src1 src2 lane (OperandSize.Size32))) +(rule 1 (x64_pinsrb src1 src2 lane) + (if-let $true (has_avx)) + (xmm_vex_pinsr (AvxOpcode.Vpinsrb) src1 src2 lane)) ;; Helper for creating `pinsrw` instructions. (decl x64_pinsrw (Xmm GprMem u8) Xmm) -(rule (x64_pinsrw src1 src2 lane) +(rule 0 (x64_pinsrw src1 src2 lane) (xmm_rm_r_imm (SseOpcode.Pinsrw) src1 src2 lane (OperandSize.Size32))) +(rule 1 (x64_pinsrw src1 src2 lane) + (if-let $true (has_avx)) + (xmm_vex_pinsr (AvxOpcode.Vpinsrw) src1 src2 lane)) ;; Helper for creating `pinsrd` instructions. -(decl x64_pinsrd (Xmm GprMem u8 OperandSize) Xmm) -(rule (x64_pinsrd src1 src2 lane size) +(decl x64_pinsrd (Xmm GprMem u8) Xmm) +(rule 0 (x64_pinsrd src1 src2 lane) (xmm_rm_r_imm (SseOpcode.Pinsrd) src1 src2 lane - size)) + (OperandSize.Size32))) +(rule 1 (x64_pinsrd src1 src2 lane) + (if-let $true (has_avx)) + (xmm_vex_pinsr (AvxOpcode.Vpinsrd) src1 src2 lane)) + +;; Helper for creating `pinsrq` instructions. +(decl x64_pinsrq (Xmm GprMem u8) Xmm) +(rule (x64_pinsrq src1 src2 lane) + (xmm_rm_r_imm (SseOpcode.Pinsrd) + src1 + src2 + lane + (OperandSize.Size64))) +(rule 1 (x64_pinsrq src1 src2 lane) + (if-let $true (has_avx)) + (xmm_vex_pinsr (AvxOpcode.Vpinsrq) src1 src2 lane)) + +;; Helper for constructing `XmmVexPinsr` instructions. +(decl xmm_vex_pinsr (AvxOpcode Xmm GprMem u8) Xmm) +(rule (xmm_vex_pinsr op src1 src2 imm) + (let ((dst WritableXmm (temp_writable_xmm)) + (_ Unit (emit (MInst.XmmVexPinsr op src1 src2 dst imm)))) + dst)) ;; Helper for constructing `XmmUnaryRmRImm` instructions. (decl xmm_unary_rm_r_imm (SseOpcode XmmMem u8) Xmm) @@ -2716,26 +3084,30 @@ ;; Helper for creating `pmaddwd` instructions. 
(decl x64_pmaddwd (Xmm XmmMem) Xmm) -(rule (x64_pmaddwd src1 src2) - (let ((dst WritableXmm (temp_writable_xmm)) - (_ Unit (emit (MInst.XmmRmR (SseOpcode.Pmaddwd) - src1 - src2 - dst)))) - dst)) +(rule 0 (x64_pmaddwd src1 src2) + (xmm_rm_r (SseOpcode.Pmaddwd) src1 src2)) +(rule 1 (x64_pmaddwd src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpmaddwd) src1 src2)) (decl x64_pmaddubsw (Xmm XmmMem) Xmm) -(rule (x64_pmaddubsw src1 src2) +(rule 0 (x64_pmaddubsw src1 src2) (xmm_rm_r (SseOpcode.Pmaddubsw) src1 src2)) +(rule 1 (x64_pmaddubsw src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpmaddubsw) src1 src2)) ;; Helper for creating `insertps` instructions. (decl x64_insertps (Xmm XmmMem u8) Xmm) -(rule (x64_insertps src1 src2 lane) +(rule 0 (x64_insertps src1 src2 lane) (xmm_rm_r_imm (SseOpcode.Insertps) src1 src2 lane (OperandSize.Size32))) +(rule 1 (x64_insertps src1 src2 lane) + (if-let $true (has_avx)) + (xmm_rmr_imm_vex (AvxOpcode.Vinsertps) src1 src2 lane)) ;; Helper for creating `pshufd` instructions. (decl x64_pshufd (XmmMem u8 OperandSize) Xmm) @@ -2751,22 +3123,23 @@ ;; Helper for creating `pshufb` instructions. (decl x64_pshufb (Xmm XmmMem) Xmm) -(rule (x64_pshufb src1 src2) - (let ((dst WritableXmm (temp_writable_xmm)) - (_ Unit (emit (MInst.XmmRmR (SseOpcode.Pshufb) - src1 - src2 - dst)))) - dst)) +(rule 0 (x64_pshufb src1 src2) + (xmm_rm_r (SseOpcode.Pshufb) src1 src2)) +(rule 1 (x64_pshufb src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpshufb) src1 src2)) ;; Helper for creating `shufps` instructions. (decl x64_shufps (Xmm XmmMem u8) Xmm) -(rule (x64_shufps src1 src2 byte) +(rule 0 (x64_shufps src1 src2 byte) (xmm_rm_r_imm (SseOpcode.Shufps) src1 src2 byte (OperandSize.Size32))) +(rule 1 (x64_shufps src1 src2 byte) + (if-let $true (has_avx)) + (xmm_rmr_imm_vex (AvxOpcode.Vshufps) src1 src2 byte)) ;; Helper for creating `MInst.XmmUnaryRmR` instructions. (decl xmm_unary_rm_r (SseOpcode XmmMem) Xmm) @@ -2878,23 +3251,35 @@ ;; Helper for creating `psllw` instructions. (decl x64_psllw (Xmm XmmMemImm) Xmm) -(rule (x64_psllw src1 src2) +(rule 0 (x64_psllw src1 src2) (xmm_rmi_xmm (SseOpcode.Psllw) src1 src2)) +(rule 1 (x64_psllw src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpsllw) src1 src2)) ;; Helper for creating `pslld` instructions. (decl x64_pslld (Xmm XmmMemImm) Xmm) -(rule (x64_pslld src1 src2) +(rule 0 (x64_pslld src1 src2) (xmm_rmi_xmm (SseOpcode.Pslld) src1 src2)) +(rule 1 (x64_pslld src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpslld) src1 src2)) ;; Helper for creating `psllq` instructions. (decl x64_psllq (Xmm XmmMemImm) Xmm) -(rule (x64_psllq src1 src2) +(rule 0 (x64_psllq src1 src2) (xmm_rmi_xmm (SseOpcode.Psllq) src1 src2)) +(rule 1 (x64_psllq src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpsllq) src1 src2)) ;; Helper for creating `psrlw` instructions. (decl x64_psrlw (Xmm XmmMemImm) Xmm) -(rule (x64_psrlw src1 src2) +(rule 0 (x64_psrlw src1 src2) (xmm_rmi_xmm (SseOpcode.Psrlw) src1 src2)) +(rule 1 (x64_psrlw src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpsrlw) src1 src2)) ;; Helper for creating `psrld` instructions. (decl x64_psrld (Xmm XmmMemImm) Xmm) @@ -2906,18 +3291,27 @@ ;; Helper for creating `psrlq` instructions. 
(decl x64_psrlq (Xmm XmmMemImm) Xmm) -(rule (x64_psrlq src1 src2) +(rule 0 (x64_psrlq src1 src2) (xmm_rmi_xmm (SseOpcode.Psrlq) src1 src2)) +(rule 1 (x64_psrlq src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpsrlq) src1 src2)) ;; Helper for creating `psraw` instructions. (decl x64_psraw (Xmm XmmMemImm) Xmm) -(rule (x64_psraw src1 src2) +(rule 0 (x64_psraw src1 src2) (xmm_rmi_xmm (SseOpcode.Psraw) src1 src2)) +(rule 1 (x64_psraw src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpsraw) src1 src2)) ;; Helper for creating `psrad` instructions. (decl x64_psrad (Xmm XmmMemImm) Xmm) -(rule (x64_psrad src1 src2) +(rule 0 (x64_psrad src1 src2) (xmm_rmi_xmm (SseOpcode.Psrad) src1 src2)) +(rule 1 (x64_psrad src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpsrad) src1 src2)) ;; Helper for creating `pextrb` instructions. (decl x64_pextrb (Type Xmm u8) Gpr) @@ -3121,8 +3515,11 @@ ;; Helper for creating `minpd` instructions. (decl x64_minpd (Xmm Xmm) Xmm) -(rule (x64_minpd x y) +(rule 0 (x64_minpd x y) (xmm_rm_r (SseOpcode.Minpd) x y)) +(rule 1 (x64_minpd x y) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vminpd) x y)) ;; Helper for creating `maxss` instructions. (decl x64_maxss (Xmm Xmm) Xmm) @@ -3136,13 +3533,19 @@ ;; Helper for creating `maxps` instructions. (decl x64_maxps (Xmm Xmm) Xmm) -(rule (x64_maxps x y) +(rule 0 (x64_maxps x y) (xmm_rm_r (SseOpcode.Maxps) x y)) +(rule 1 (x64_maxps x y) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vmaxps) x y)) ;; Helper for creating `maxpd` instructions. (decl x64_maxpd (Xmm Xmm) Xmm) -(rule (x64_maxpd x y) +(rule 0 (x64_maxpd x y) (xmm_rm_r (SseOpcode.Maxpd) x y)) +(rule 1 (x64_maxpd x y) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vmaxpd) x y)) ;; Helper for creating `MInst.XmmRmiRVex` instructions. @@ -3197,89 +3600,53 @@ ;; Helper for creating `sqrtss` instructions. (decl x64_sqrtss (Xmm) Xmm) -(rule (x64_sqrtss x) - (let ((dst WritableXmm (temp_writable_xmm)) - (_ Unit (emit (MInst.XmmUnaryRmR (SseOpcode.Sqrtss) x dst)))) - dst)) +(rule (x64_sqrtss x) (xmm_unary_rm_r (SseOpcode.Sqrtss) x)) ;; Helper for creating `sqrtsd` instructions. (decl x64_sqrtsd (Xmm) Xmm) -(rule (x64_sqrtsd x) - (let ((dst WritableXmm (temp_writable_xmm)) - (_ Unit (emit (MInst.XmmUnaryRmR (SseOpcode.Sqrtsd) x dst)))) - dst)) +(rule (x64_sqrtsd x) (xmm_unary_rm_r (SseOpcode.Sqrtsd) x)) ;; Helper for creating `sqrtps` instructions. (decl x64_sqrtps (Xmm) Xmm) -(rule (x64_sqrtps x) - (let ((dst WritableXmm (temp_writable_xmm)) - (_ Unit (emit (MInst.XmmUnaryRmR (SseOpcode.Sqrtps) x dst)))) - dst)) +(rule (x64_sqrtps x) (xmm_unary_rm_r (SseOpcode.Sqrtps) x)) ;; Helper for creating `sqrtpd` instructions. (decl x64_sqrtpd (Xmm) Xmm) -(rule (x64_sqrtpd x) - (let ((dst WritableXmm (temp_writable_xmm)) - (_ Unit (emit (MInst.XmmUnaryRmR (SseOpcode.Sqrtpd) x dst)))) - dst)) +(rule (x64_sqrtpd x) (xmm_unary_rm_r (SseOpcode.Sqrtpd) x)) ;; Helper for creating `cvtss2sd` instructions. (decl x64_cvtss2sd (Xmm) Xmm) -(rule (x64_cvtss2sd x) - (let ((dst WritableXmm (temp_writable_xmm)) - (_ Unit (emit (MInst.XmmUnaryRmR (SseOpcode.Cvtss2sd) x dst)))) - dst)) +(rule (x64_cvtss2sd x) (xmm_unary_rm_r (SseOpcode.Cvtss2sd) x)) ;; Helper for creating `cvtsd2ss` instructions. 
(decl x64_cvtsd2ss (Xmm) Xmm) -(rule (x64_cvtsd2ss x) - (let ((dst WritableXmm (temp_writable_xmm)) - (_ Unit (emit (MInst.XmmUnaryRmR (SseOpcode.Cvtsd2ss) x dst)))) - dst)) +(rule (x64_cvtsd2ss x) (xmm_unary_rm_r (SseOpcode.Cvtsd2ss) x)) ;; Helper for creating `cvtdq2ps` instructions. (decl x64_cvtdq2ps (Xmm) Xmm) -(rule (x64_cvtdq2ps x) - (let ((dst WritableXmm (temp_writable_xmm)) - (_ Unit (emit (MInst.XmmUnaryRmR (SseOpcode.Cvtdq2ps) x dst)))) - dst)) +(rule (x64_cvtdq2ps x) (xmm_unary_rm_r (SseOpcode.Cvtdq2ps) x)) ;; Helper for creating `cvtps2pd` instructions. (decl x64_cvtps2pd (Xmm) Xmm) -(rule (x64_cvtps2pd x) - (let ((dst WritableXmm (temp_writable_xmm)) - (_ Unit (emit (MInst.XmmUnaryRmR (SseOpcode.Cvtps2pd) x dst)))) - dst)) +(rule (x64_cvtps2pd x) (xmm_unary_rm_r (SseOpcode.Cvtps2pd) x)) ;; Helper for creating `cvtpd2ps` instructions. (decl x64_cvtpd2ps (Xmm) Xmm) -(rule (x64_cvtpd2ps x) - (let ((dst WritableXmm (temp_writable_xmm)) - (_ Unit (emit (MInst.XmmUnaryRmR (SseOpcode.Cvtpd2ps) x dst)))) - dst)) +(rule (x64_cvtpd2ps x) (xmm_unary_rm_r (SseOpcode.Cvtpd2ps) x)) ;; Helper for creating `cvtdq2pd` instructions. -(decl x64_cvtdq2pd (Type Xmm) Xmm) -(rule (x64_cvtdq2pd ty x) - (let ((dst WritableXmm (temp_writable_xmm)) - (_ Unit (emit (MInst.XmmUnaryRmR (SseOpcode.Cvtdq2pd) x dst)))) - dst)) +(decl x64_cvtdq2pd (Xmm) Xmm) +(rule (x64_cvtdq2pd x) (xmm_unary_rm_r (SseOpcode.Cvtdq2pd) x)) ;; Helper for creating `cvtsi2ss` instructions. (decl x64_cvtsi2ss (Type GprMem) Xmm) (rule (x64_cvtsi2ss ty x) - (let ((dst WritableXmm (temp_writable_xmm)) - (size OperandSize (raw_operand_size_of_type ty)) - (_ Unit (emit (MInst.GprToXmm (SseOpcode.Cvtsi2ss) x dst size)))) - dst)) + (gpr_to_xmm (SseOpcode.Cvtsi2ss) x (raw_operand_size_of_type ty))) ;; Helper for creating `cvtsi2sd` instructions. (decl x64_cvtsi2sd (Type GprMem) Xmm) (rule (x64_cvtsi2sd ty x) - (let ((dst WritableXmm (temp_writable_xmm)) - (size OperandSize (raw_operand_size_of_type ty)) - (_ Unit (emit (MInst.GprToXmm (SseOpcode.Cvtsi2sd) x dst size)))) - dst)) + (gpr_to_xmm (SseOpcode.Cvtsi2sd) x (raw_operand_size_of_type ty))) ;; Helper for creating `cvttps2dq` instructions. (decl x64_cvttps2dq (Type XmmMem) Xmm) @@ -3337,13 +3704,25 @@ (rule (x64_pcmpeq $I64X2 x y) (x64_pcmpeqq x y)) (decl x64_pcmpeqb (Xmm XmmMem) Xmm) -(rule (x64_pcmpeqb x y) (xmm_rm_r (SseOpcode.Pcmpeqb) x y)) +(rule 0 (x64_pcmpeqb x y) (xmm_rm_r (SseOpcode.Pcmpeqb) x y)) +(rule 1 (x64_pcmpeqb x y) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpcmpeqb) x y)) (decl x64_pcmpeqw (Xmm XmmMem) Xmm) -(rule (x64_pcmpeqw x y) (xmm_rm_r (SseOpcode.Pcmpeqw) x y)) +(rule 0 (x64_pcmpeqw x y) (xmm_rm_r (SseOpcode.Pcmpeqw) x y)) +(rule 1 (x64_pcmpeqw x y) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpcmpeqw) x y)) (decl x64_pcmpeqd (Xmm XmmMem) Xmm) -(rule (x64_pcmpeqd x y) (xmm_rm_r (SseOpcode.Pcmpeqd) x y)) +(rule 0 (x64_pcmpeqd x y) (xmm_rm_r (SseOpcode.Pcmpeqd) x y)) +(rule 1 (x64_pcmpeqd x y) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpcmpeqd) x y)) (decl x64_pcmpeqq (Xmm XmmMem) Xmm) -(rule (x64_pcmpeqq x y) (xmm_rm_r (SseOpcode.Pcmpeqq) x y)) +(rule 0 (x64_pcmpeqq x y) (xmm_rm_r (SseOpcode.Pcmpeqq) x y)) +(rule 1 (x64_pcmpeqq x y) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpcmpeqq) x y)) ;; Helpers for creating `pcmpgt*` instructions. 
(decl x64_pcmpgt (Type Xmm XmmMem) Xmm) @@ -3353,13 +3732,25 @@ (rule (x64_pcmpgt $I64X2 x y) (x64_pcmpgtq x y)) (decl x64_pcmpgtb (Xmm XmmMem) Xmm) -(rule (x64_pcmpgtb x y) (xmm_rm_r (SseOpcode.Pcmpgtb) x y)) +(rule 0 (x64_pcmpgtb x y) (xmm_rm_r (SseOpcode.Pcmpgtb) x y)) +(rule 1 (x64_pcmpgtb x y) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpcmpgtb) x y)) (decl x64_pcmpgtw (Xmm XmmMem) Xmm) -(rule (x64_pcmpgtw x y) (xmm_rm_r (SseOpcode.Pcmpgtw) x y)) +(rule 0 (x64_pcmpgtw x y) (xmm_rm_r (SseOpcode.Pcmpgtw) x y)) +(rule 1 (x64_pcmpgtw x y) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpcmpgtw) x y)) (decl x64_pcmpgtd (Xmm XmmMem) Xmm) -(rule (x64_pcmpgtd x y) (xmm_rm_r (SseOpcode.Pcmpgtd) x y)) +(rule 0 (x64_pcmpgtd x y) (xmm_rm_r (SseOpcode.Pcmpgtd) x y)) +(rule 1 (x64_pcmpgtd x y) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpcmpgtd) x y)) (decl x64_pcmpgtq (Xmm XmmMem) Xmm) -(rule (x64_pcmpgtq x y) (xmm_rm_r (SseOpcode.Pcmpgtq) x y)) +(rule 0 (x64_pcmpgtq x y) (xmm_rm_r (SseOpcode.Pcmpgtq) x y)) +(rule 1 (x64_pcmpgtq x y) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpcmpgtq) x y)) ;; Helpers for read-modify-write ALU form (AluRM). (decl alu_rm (Type AluRmiROpcode Amode Gpr) SideEffectNoResult) diff --git a/cranelift/codegen/src/isa/x64/inst/args.rs b/cranelift/codegen/src/isa/x64/inst/args.rs index c4de650e4c..57f644c8e9 100644 --- a/cranelift/codegen/src/isa/x64/inst/args.rs +++ b/cranelift/codegen/src/isa/x64/inst/args.rs @@ -617,13 +617,6 @@ impl RegMemImm { } } - pub(crate) fn to_reg(&self) -> Option { - match self { - Self::Reg { reg } => Some(*reg), - _ => None, - } - } - pub(crate) fn with_allocs(&self, allocs: &mut AllocationConsumer<'_>) -> Self { match self { Self::Reg { reg } => Self::Reg { @@ -726,12 +719,6 @@ impl RegMem { RegMem::Mem { addr, .. 
} => addr.get_operands(collector), } } - pub(crate) fn to_reg(&self) -> Option { - match self { - RegMem::Reg { reg } => Some(*reg), - _ => None, - } - } pub(crate) fn with_allocs(&self, allocs: &mut AllocationConsumer<'_>) -> Self { match self { @@ -1510,10 +1497,108 @@ impl AvxOpcode { | AvxOpcode::Vfmadd213ps | AvxOpcode::Vfmadd213pd => smallvec![InstructionSet::FMA], AvxOpcode::Vminps - | AvxOpcode::Vorps + | AvxOpcode::Vminpd + | AvxOpcode::Vmaxps + | AvxOpcode::Vmaxpd | AvxOpcode::Vandnps + | AvxOpcode::Vandnpd + | AvxOpcode::Vpandn | AvxOpcode::Vcmpps - | AvxOpcode::Vpsrld => { + | AvxOpcode::Vcmppd + | AvxOpcode::Vpsrlw + | AvxOpcode::Vpsrld + | AvxOpcode::Vpsrlq + | AvxOpcode::Vpaddb + | AvxOpcode::Vpaddw + | AvxOpcode::Vpaddd + | AvxOpcode::Vpaddq + | AvxOpcode::Vpaddsb + | AvxOpcode::Vpaddsw + | AvxOpcode::Vpaddusb + | AvxOpcode::Vpaddusw + | AvxOpcode::Vpsubb + | AvxOpcode::Vpsubw + | AvxOpcode::Vpsubd + | AvxOpcode::Vpsubq + | AvxOpcode::Vpsubsb + | AvxOpcode::Vpsubsw + | AvxOpcode::Vpsubusb + | AvxOpcode::Vpsubusw + | AvxOpcode::Vpavgb + | AvxOpcode::Vpavgw + | AvxOpcode::Vpand + | AvxOpcode::Vandps + | AvxOpcode::Vandpd + | AvxOpcode::Vpor + | AvxOpcode::Vorps + | AvxOpcode::Vorpd + | AvxOpcode::Vpxor + | AvxOpcode::Vxorps + | AvxOpcode::Vxorpd + | AvxOpcode::Vpmullw + | AvxOpcode::Vpmulld + | AvxOpcode::Vpmulhw + | AvxOpcode::Vpmulhd + | AvxOpcode::Vpmulhrsw + | AvxOpcode::Vpmulhuw + | AvxOpcode::Vpmuldq + | AvxOpcode::Vpmuludq + | AvxOpcode::Vpunpckhwd + | AvxOpcode::Vpunpcklwd + | AvxOpcode::Vunpcklps + | AvxOpcode::Vaddps + | AvxOpcode::Vaddpd + | AvxOpcode::Vsubps + | AvxOpcode::Vsubpd + | AvxOpcode::Vmulps + | AvxOpcode::Vmulpd + | AvxOpcode::Vdivps + | AvxOpcode::Vdivpd + | AvxOpcode::Vpcmpeqb + | AvxOpcode::Vpcmpeqw + | AvxOpcode::Vpcmpeqd + | AvxOpcode::Vpcmpeqq + | AvxOpcode::Vpcmpgtb + | AvxOpcode::Vpcmpgtw + | AvxOpcode::Vpcmpgtd + | AvxOpcode::Vpcmpgtq + | AvxOpcode::Vblendvps + | AvxOpcode::Vblendvpd + | AvxOpcode::Vpblendvb + | AvxOpcode::Vmovlhps + | AvxOpcode::Vpminsb + | AvxOpcode::Vpminsw + | AvxOpcode::Vpminsd + | AvxOpcode::Vpminub + | AvxOpcode::Vpminuw + | AvxOpcode::Vpminud + | AvxOpcode::Vpmaxsb + | AvxOpcode::Vpmaxsw + | AvxOpcode::Vpmaxsd + | AvxOpcode::Vpmaxub + | AvxOpcode::Vpmaxuw + | AvxOpcode::Vpmaxud + | AvxOpcode::Vpunpcklbw + | AvxOpcode::Vpunpckhbw + | AvxOpcode::Vpacksswb + | AvxOpcode::Vpackssdw + | AvxOpcode::Vpackuswb + | AvxOpcode::Vpackusdw + | AvxOpcode::Vpalignr + | AvxOpcode::Vpinsrb + | AvxOpcode::Vpinsrw + | AvxOpcode::Vpinsrd + | AvxOpcode::Vpinsrq + | AvxOpcode::Vpmaddwd + | AvxOpcode::Vpmaddubsw + | AvxOpcode::Vinsertps + | AvxOpcode::Vpshufb + | AvxOpcode::Vshufps + | AvxOpcode::Vpsllw + | AvxOpcode::Vpslld + | AvxOpcode::Vpsllq + | AvxOpcode::Vpsraw + | AvxOpcode::Vpsrad => { smallvec![InstructionSet::AVX] } } diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs index 9d984edab5..ebbbf16b26 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit.rs @@ -148,16 +148,10 @@ pub(crate) fn emit( src2, dst: reg_g, } => { - let (reg_g, src2) = if inst.produces_const() { - let reg_g = allocs.next(reg_g.to_reg().to_reg()); - (reg_g, RegMemImm::reg(reg_g)) - } else { - let src1 = allocs.next(src1.to_reg()); - let reg_g = allocs.next(reg_g.to_reg().to_reg()); - debug_assert_eq!(src1, reg_g); - let src2 = src2.clone().to_reg_mem_imm().with_allocs(allocs); - (reg_g, src2) - }; + let src1 = allocs.next(src1.to_reg()); + let reg_g = 
allocs.next(reg_g.to_reg().to_reg()); + debug_assert_eq!(src1, reg_g); + let src2 = src2.clone().to_reg_mem_imm().with_allocs(allocs); let rex = RexFlags::from(*size); if *op == AluRmiROpcode::Mul { @@ -253,6 +247,23 @@ pub(crate) fn emit( } } + Inst::AluConstOp { op, size, dst } => { + let dst = allocs.next(dst.to_reg().to_reg()); + emit( + &Inst::AluRmiR { + size: *size, + op: *op, + dst: Writable::from_reg(Gpr::new(dst).unwrap()), + src1: Gpr::new(dst).unwrap(), + src2: Gpr::new(dst).unwrap().into(), + }, + allocs, + sink, + info, + state, + ); + } + Inst::AluRM { size, src1_dst, @@ -1837,16 +1848,10 @@ pub(crate) fn emit( src2: src_e, dst: reg_g, } => { - let (src_e, reg_g) = if inst.produces_const() { - let reg_g = allocs.next(reg_g.to_reg().to_reg()); - (RegMem::Reg { reg: reg_g }, reg_g) - } else { - let src1 = allocs.next(src1.to_reg()); - let reg_g = allocs.next(reg_g.to_reg().to_reg()); - let src_e = src_e.clone().to_reg_mem().with_allocs(allocs); - debug_assert_eq!(src1, reg_g); - (src_e, reg_g) - }; + let src1 = allocs.next(src1.to_reg()); + let reg_g = allocs.next(reg_g.to_reg().to_reg()); + let src_e = src_e.clone().to_reg_mem().with_allocs(allocs); + debug_assert_eq!(src1, reg_g); let rex = RexFlags::clear_w(); let (prefix, opcode, length) = match op { @@ -1959,6 +1964,22 @@ pub(crate) fn emit( } } + Inst::XmmConstOp { op, dst } => { + let dst = allocs.next(dst.to_reg().to_reg()); + emit( + &Inst::XmmRmR { + op: *op, + dst: Writable::from_reg(Xmm::new(dst).unwrap()), + src1: Xmm::new(dst).unwrap(), + src2: Xmm::new(dst).unwrap().into(), + }, + allocs, + sink, + info, + state, + ); + } + Inst::XmmRmRBlend { op, src1, @@ -1998,6 +2019,9 @@ pub(crate) fn emit( src2, dst, } => { + use LegacyPrefixes as LP; + use OpcodeMap as OM; + let dst = allocs.next(dst.to_reg().to_reg()); let src1 = allocs.next(src1.to_reg()); let src2 = src2.clone().to_reg_mem_imm().with_allocs(allocs); @@ -2008,8 +2032,15 @@ pub(crate) fn emit( // `opcode_ext`, so handle that specially here. 
RegMemImm::Imm { simm32 } => { let (opcode, opcode_ext, prefix) = match op { + AvxOpcode::Vpsrlw => (0x71, 2, LegacyPrefixes::_66), AvxOpcode::Vpsrld => (0x72, 2, LegacyPrefixes::_66), - _ => panic!("unexpected avx opcode with immediate {op:?}"), + AvxOpcode::Vpsrlq => (0x73, 2, LegacyPrefixes::_66), + AvxOpcode::Vpsllw => (0x71, 6, LegacyPrefixes::_66), + AvxOpcode::Vpslld => (0x72, 6, LegacyPrefixes::_66), + AvxOpcode::Vpsllq => (0x73, 6, LegacyPrefixes::_66), + AvxOpcode::Vpsraw => (0x71, 4, LegacyPrefixes::_66), + AvxOpcode::Vpsrad => (0x72, 4, LegacyPrefixes::_66), + _ => panic!("unexpected rmi_r_vex opcode with immediate {op:?}"), }; VexInstruction::new() .length(VexVectorLength::V128) @@ -2029,18 +2060,104 @@ pub(crate) fn emit( } RegMemImm::Mem { addr } => RegisterOrAmode::Amode(addr.finalize(state, sink)), }; - let (prefix, opcode) = match op { - AvxOpcode::Vminps => (LegacyPrefixes::None, 0x5D), - AvxOpcode::Vandnps => (LegacyPrefixes::None, 0x55), - AvxOpcode::Vorps => (LegacyPrefixes::None, 0x56), - AvxOpcode::Vpsrld => (LegacyPrefixes::_66, 0xD2), + let (prefix, map, opcode) = match op { + AvxOpcode::Vminps => (LP::None, OM::_0F, 0x5D), + AvxOpcode::Vminpd => (LP::_66, OM::_0F, 0x5D), + AvxOpcode::Vmaxps => (LP::None, OM::_0F, 0x5F), + AvxOpcode::Vmaxpd => (LP::_66, OM::_0F, 0x5F), + AvxOpcode::Vandnps => (LP::None, OM::_0F, 0x55), + AvxOpcode::Vandnpd => (LP::_66, OM::_0F, 0x55), + AvxOpcode::Vpandn => (LP::_66, OM::_0F, 0xDF), + AvxOpcode::Vpsrlw => (LP::_66, OM::_0F, 0xD1), + AvxOpcode::Vpsrld => (LP::_66, OM::_0F, 0xD2), + AvxOpcode::Vpsrlq => (LP::_66, OM::_0F, 0xD3), + AvxOpcode::Vpaddb => (LP::_66, OM::_0F, 0xFC), + AvxOpcode::Vpaddw => (LP::_66, OM::_0F, 0xFD), + AvxOpcode::Vpaddd => (LP::_66, OM::_0F, 0xFE), + AvxOpcode::Vpaddq => (LP::_66, OM::_0F, 0xD4), + AvxOpcode::Vpaddsb => (LP::_66, OM::_0F, 0xEC), + AvxOpcode::Vpaddsw => (LP::_66, OM::_0F, 0xED), + AvxOpcode::Vpaddusb => (LP::_66, OM::_0F, 0xDC), + AvxOpcode::Vpaddusw => (LP::_66, OM::_0F, 0xDD), + AvxOpcode::Vpsubb => (LP::_66, OM::_0F, 0xF8), + AvxOpcode::Vpsubw => (LP::_66, OM::_0F, 0xF9), + AvxOpcode::Vpsubd => (LP::_66, OM::_0F, 0xFA), + AvxOpcode::Vpsubq => (LP::_66, OM::_0F, 0xFB), + AvxOpcode::Vpsubsb => (LP::_66, OM::_0F, 0xE8), + AvxOpcode::Vpsubsw => (LP::_66, OM::_0F, 0xE9), + AvxOpcode::Vpsubusb => (LP::_66, OM::_0F, 0xD8), + AvxOpcode::Vpsubusw => (LP::_66, OM::_0F, 0xD9), + AvxOpcode::Vpavgb => (LP::_66, OM::_0F, 0xE0), + AvxOpcode::Vpavgw => (LP::_66, OM::_0F, 0xE3), + AvxOpcode::Vpand => (LP::_66, OM::_0F, 0xDB), + AvxOpcode::Vandps => (LP::None, OM::_0F, 0x54), + AvxOpcode::Vandpd => (LP::_66, OM::_0F, 0x54), + AvxOpcode::Vpor => (LP::_66, OM::_0F, 0xEB), + AvxOpcode::Vorps => (LP::None, OM::_0F, 0x56), + AvxOpcode::Vorpd => (LP::_66, OM::_0F, 0x56), + AvxOpcode::Vpxor => (LP::_66, OM::_0F, 0xEF), + AvxOpcode::Vxorps => (LP::None, OM::_0F, 0x57), + AvxOpcode::Vxorpd => (LP::_66, OM::_0F, 0x57), + AvxOpcode::Vpmullw => (LP::_66, OM::_0F, 0xD5), + AvxOpcode::Vpmulld => (LP::_66, OM::_0F38, 0x40), + AvxOpcode::Vpmulhw => (LP::_66, OM::_0F, 0xE5), + AvxOpcode::Vpmulhrsw => (LP::_66, OM::_0F38, 0x0B), + AvxOpcode::Vpmulhuw => (LP::_66, OM::_0F, 0xE4), + AvxOpcode::Vpmuldq => (LP::_66, OM::_0F38, 0x28), + AvxOpcode::Vpmuludq => (LP::_66, OM::_0F, 0xF4), + AvxOpcode::Vpunpckhwd => (LP::_66, OM::_0F, 0x69), + AvxOpcode::Vpunpcklwd => (LP::_66, OM::_0F, 0x61), + AvxOpcode::Vunpcklps => (LP::None, OM::_0F, 0x14), + AvxOpcode::Vaddps => (LP::None, OM::_0F, 0x58), + AvxOpcode::Vaddpd => (LP::_66, 
OM::_0F, 0x58), + AvxOpcode::Vsubps => (LP::None, OM::_0F, 0x5C), + AvxOpcode::Vsubpd => (LP::_66, OM::_0F, 0x5C), + AvxOpcode::Vmulps => (LP::None, OM::_0F, 0x59), + AvxOpcode::Vmulpd => (LP::_66, OM::_0F, 0x59), + AvxOpcode::Vdivps => (LP::None, OM::_0F, 0x5E), + AvxOpcode::Vdivpd => (LP::_66, OM::_0F, 0x5E), + AvxOpcode::Vpcmpeqb => (LP::_66, OM::_0F, 0x74), + AvxOpcode::Vpcmpeqw => (LP::_66, OM::_0F, 0x75), + AvxOpcode::Vpcmpeqd => (LP::_66, OM::_0F, 0x76), + AvxOpcode::Vpcmpeqq => (LP::_66, OM::_0F38, 0x29), + AvxOpcode::Vpcmpgtb => (LP::_66, OM::_0F, 0x64), + AvxOpcode::Vpcmpgtw => (LP::_66, OM::_0F, 0x65), + AvxOpcode::Vpcmpgtd => (LP::_66, OM::_0F, 0x66), + AvxOpcode::Vpcmpgtq => (LP::_66, OM::_0F38, 0x37), + AvxOpcode::Vmovlhps => (LP::None, OM::_0F, 0x16), + AvxOpcode::Vpminsb => (LP::_66, OM::_0F38, 0x38), + AvxOpcode::Vpminsw => (LP::_66, OM::_0F, 0xEA), + AvxOpcode::Vpminsd => (LP::_66, OM::_0F38, 0x39), + AvxOpcode::Vpmaxsb => (LP::_66, OM::_0F38, 0x3C), + AvxOpcode::Vpmaxsw => (LP::_66, OM::_0F, 0xEE), + AvxOpcode::Vpmaxsd => (LP::_66, OM::_0F38, 0x3D), + AvxOpcode::Vpminub => (LP::_66, OM::_0F, 0xDA), + AvxOpcode::Vpminuw => (LP::_66, OM::_0F38, 0x3A), + AvxOpcode::Vpminud => (LP::_66, OM::_0F38, 0x3B), + AvxOpcode::Vpmaxub => (LP::_66, OM::_0F, 0xDE), + AvxOpcode::Vpmaxuw => (LP::_66, OM::_0F38, 0x3E), + AvxOpcode::Vpmaxud => (LP::_66, OM::_0F38, 0x3F), + AvxOpcode::Vpunpcklbw => (LP::_66, OM::_0F, 0x60), + AvxOpcode::Vpunpckhbw => (LP::_66, OM::_0F, 0x68), + AvxOpcode::Vpacksswb => (LP::_66, OM::_0F, 0x63), + AvxOpcode::Vpackssdw => (LP::_66, OM::_0F, 0x6B), + AvxOpcode::Vpackuswb => (LP::_66, OM::_0F, 0x67), + AvxOpcode::Vpackusdw => (LP::_66, OM::_0F38, 0x2B), + AvxOpcode::Vpmaddwd => (LP::_66, OM::_0F, 0xF5), + AvxOpcode::Vpmaddubsw => (LP::_66, OM::_0F38, 0x04), + AvxOpcode::Vpshufb => (LP::_66, OM::_0F38, 0x00), + AvxOpcode::Vpsllw => (LP::_66, OM::_0F, 0xF1), + AvxOpcode::Vpslld => (LP::_66, OM::_0F, 0xF2), + AvxOpcode::Vpsllq => (LP::_66, OM::_0F, 0xF3), + AvxOpcode::Vpsraw => (LP::_66, OM::_0F, 0xE1), + AvxOpcode::Vpsrad => (LP::_66, OM::_0F, 0xE2), _ => panic!("unexpected rmir vex opcode {op:?}"), }; VexInstruction::new() .length(VexVectorLength::V128) .prefix(prefix) + .map(map) .opcode(opcode) - .map(OpcodeMap::_0F) .reg(dst.to_real_reg().unwrap().hw_enc()) .vvvv(src1.to_real_reg().unwrap().hw_enc()) .rm(src2) @@ -2056,27 +2173,70 @@ pub(crate) fn emit( } => { let dst = allocs.next(dst.to_reg().to_reg()); let src1 = allocs.next(src1.to_reg()); - let src2 = src2.clone().to_reg_mem().with_allocs(allocs); - - let (w, opcode) = match op { - AvxOpcode::Vcmpps => (false, 0xC2), - _ => unreachable!(), + let src2 = match src2.clone().to_reg_mem().with_allocs(allocs) { + RegMem::Reg { reg } => { + RegisterOrAmode::Register(reg.to_real_reg().unwrap().hw_enc().into()) + } + RegMem::Mem { addr } => RegisterOrAmode::Amode(addr.finalize(state, sink)), }; - match src2 { - RegMem::Reg { reg: src } => VexInstruction::new() - .length(VexVectorLength::V128) - .prefix(LegacyPrefixes::None) - .map(OpcodeMap::_0F) - .w(w) - .opcode(opcode) - .reg(dst.to_real_reg().unwrap().hw_enc()) - .rm(src.to_real_reg().unwrap().hw_enc()) - .vvvv(src1.to_real_reg().unwrap().hw_enc()) - .imm(*imm) - .encode(sink), - _ => todo!(), + let (w, prefix, map, opcode) = match op { + AvxOpcode::Vcmpps => (false, LegacyPrefixes::None, OpcodeMap::_0F, 0xC2), + AvxOpcode::Vcmppd => (false, LegacyPrefixes::_66, OpcodeMap::_0F, 0xC2), + AvxOpcode::Vpalignr => (false, LegacyPrefixes::_66, OpcodeMap::_0F3A, 
0x0F), + AvxOpcode::Vinsertps => (false, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x21), + AvxOpcode::Vshufps => (false, LegacyPrefixes::None, OpcodeMap::_0F, 0xC6), + _ => panic!("unexpected rmr_imm_vex opcode {op:?}"), }; + + VexInstruction::new() + .length(VexVectorLength::V128) + .prefix(prefix) + .map(map) + .w(w) + .opcode(opcode) + .reg(dst.to_real_reg().unwrap().hw_enc()) + .vvvv(src1.to_real_reg().unwrap().hw_enc()) + .rm(src2) + .imm(*imm) + .encode(sink); + } + + Inst::XmmVexPinsr { + op, + src1, + src2, + dst, + imm, + } => { + let dst = allocs.next(dst.to_reg().to_reg()); + let src1 = allocs.next(src1.to_reg()); + let src2 = match src2.clone().to_reg_mem().with_allocs(allocs) { + RegMem::Reg { reg } => { + RegisterOrAmode::Register(reg.to_real_reg().unwrap().hw_enc().into()) + } + RegMem::Mem { addr } => RegisterOrAmode::Amode(addr.finalize(state, sink)), + }; + + let (w, map, opcode) = match op { + AvxOpcode::Vpinsrb => (false, OpcodeMap::_0F3A, 0x20), + AvxOpcode::Vpinsrw => (false, OpcodeMap::_0F, 0xC4), + AvxOpcode::Vpinsrd => (false, OpcodeMap::_0F3A, 0x22), + AvxOpcode::Vpinsrq => (true, OpcodeMap::_0F3A, 0x22), + _ => panic!("unexpected vex_pinsr opcode {op:?}"), + }; + + VexInstruction::new() + .length(VexVectorLength::V128) + .prefix(LegacyPrefixes::_66) + .map(map) + .w(w) + .opcode(opcode) + .reg(dst.to_real_reg().unwrap().hw_enc()) + .vvvv(src1.to_real_reg().unwrap().hw_enc()) + .rm(src2) + .imm(*imm) + .encode(sink); } Inst::XmmRmRVex3 { @@ -2092,11 +2252,14 @@ pub(crate) fn emit( let src2 = allocs.next(src2.to_reg()); let src3 = src3.clone().to_reg_mem().with_allocs(allocs); - let (w, opcode) = match op { - AvxOpcode::Vfmadd213ss => (false, 0xA9), - AvxOpcode::Vfmadd213sd => (true, 0xA9), - AvxOpcode::Vfmadd213ps => (false, 0xA8), - AvxOpcode::Vfmadd213pd => (true, 0xA8), + let (w, map, opcode) = match op { + AvxOpcode::Vfmadd213ss => (false, OpcodeMap::_0F38, 0xA9), + AvxOpcode::Vfmadd213sd => (true, OpcodeMap::_0F38, 0xA9), + AvxOpcode::Vfmadd213ps => (false, OpcodeMap::_0F38, 0xA8), + AvxOpcode::Vfmadd213pd => (true, OpcodeMap::_0F38, 0xA8), + AvxOpcode::Vblendvps => (false, OpcodeMap::_0F3A, 0x4A), + AvxOpcode::Vblendvpd => (false, OpcodeMap::_0F3A, 0x4B), + AvxOpcode::Vpblendvb => (false, OpcodeMap::_0F3A, 0x4C), _ => unreachable!(), }; @@ -2104,7 +2267,7 @@ pub(crate) fn emit( RegMem::Reg { reg: src } => VexInstruction::new() .length(VexVectorLength::V128) .prefix(LegacyPrefixes::_66) - .map(OpcodeMap::_0F38) + .map(map) .w(w) .opcode(opcode) .reg(dst.to_real_reg().unwrap().hw_enc()) @@ -2115,6 +2278,42 @@ pub(crate) fn emit( }; } + Inst::XmmRmRBlendVex { + op, + src1, + src2, + mask, + dst, + } => { + let dst = allocs.next(dst.to_reg().to_reg()); + let src1 = allocs.next(src1.to_reg()); + let src2 = match src2.clone().to_reg_mem().with_allocs(allocs) { + RegMem::Reg { reg } => { + RegisterOrAmode::Register(reg.to_real_reg().unwrap().hw_enc().into()) + } + RegMem::Mem { addr } => RegisterOrAmode::Amode(addr.finalize(state, sink)), + }; + let mask = allocs.next(mask.to_reg()); + + let opcode = match op { + AvxOpcode::Vblendvps => 0x4A, + AvxOpcode::Vblendvpd => 0x4B, + AvxOpcode::Vpblendvb => 0x4C, + _ => unreachable!(), + }; + + VexInstruction::new() + .length(VexVectorLength::V128) + .prefix(LegacyPrefixes::_66) + .map(OpcodeMap::_0F3A) + .opcode(opcode) + .reg(dst.to_real_reg().unwrap().hw_enc()) + .vvvv(src1.to_real_reg().unwrap().hw_enc()) + .rm(src2) + .imm(mask.to_real_reg().unwrap().hw_enc() << 4) + .encode(sink); + } + Inst::XmmRmREvex { op, src1, @@ 
-2259,10 +2458,7 @@ pub(crate) fn emit( imm, size, } => { - let (src2, dst) = if inst.produces_const() { - let dst = allocs.next(dst.to_reg()); - (RegMem::Reg { reg: dst }, dst) - } else if !op.uses_src1() { + let (src2, dst) = if !op.uses_src1() { let dst = allocs.next(dst.to_reg()); let src2 = src2.with_allocs(allocs); (src2, dst) diff --git a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs index f63eb3e8e3..1c19fd1820 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs @@ -4860,6 +4860,20 @@ fn test_x64_emit() { "roundpd $0, %xmm15, %xmm15", )); + // ======================================================== + // XmmRmRImmVex + insns.push(( + Inst::XmmVexPinsr { + op: AvxOpcode::Vpinsrb, + dst: Writable::from_reg(Xmm::new(xmm13).unwrap()), + src1: Xmm::new(xmm14).unwrap(), + src2: GprMem::new(RegMem::reg(r15)).unwrap(), + imm: 2, + }, + "C4430920EF02", + "vpinsrb $2 %xmm14, %r15, %xmm13", + )); + // ======================================================== // Pertaining to atomics. let am1: SyntheticAmode = @@ -5135,6 +5149,7 @@ fn test_x64_emit() { isa_flag_builder.enable("has_ssse3").unwrap(); isa_flag_builder.enable("has_sse41").unwrap(); isa_flag_builder.enable("has_fma").unwrap(); + isa_flag_builder.enable("has_avx").unwrap(); isa_flag_builder.enable("has_avx512bitalg").unwrap(); isa_flag_builder.enable("has_avx512dq").unwrap(); isa_flag_builder.enable("has_avx512f").unwrap(); diff --git a/cranelift/codegen/src/isa/x64/inst/mod.rs b/cranelift/codegen/src/isa/x64/inst/mod.rs index a1056b859d..217b4b4db6 100644 --- a/cranelift/codegen/src/isa/x64/inst/mod.rs +++ b/cranelift/codegen/src/isa/x64/inst/mod.rs @@ -122,7 +122,8 @@ impl Inst { | Inst::MachOTlsGetAddr { .. } | Inst::CoffTlsGetAddr { .. } | Inst::Unwind { .. } - | Inst::DummyUse { .. } => smallvec![], + | Inst::DummyUse { .. } + | Inst::AluConstOp { .. } => smallvec![], Inst::AluRmRVex { op, .. } => op.available_from(), Inst::UnaryRmR { op, .. } => op.available_from(), @@ -136,7 +137,8 @@ impl Inst { | Inst::XmmRmRImm { op, .. } | Inst::XmmToGpr { op, .. } | Inst::XmmUnaryRmRImm { op, .. } - | Inst::XmmUnaryRmR { op, .. } => smallvec![op.available_from()], + | Inst::XmmUnaryRmR { op, .. } + | Inst::XmmConstOp { op, .. } => smallvec![op.available_from()], Inst::XmmUnaryRmREvex { op, .. } | Inst::XmmRmREvex { op, .. } @@ -144,7 +146,9 @@ impl Inst { Inst::XmmRmiRVex { op, .. } | Inst::XmmRmRVex3 { op, .. } - | Inst::XmmRmRImmVex { op, .. } => op.available_from(), + | Inst::XmmRmRImmVex { op, .. } + | Inst::XmmRmRBlendVex { op, .. } + | Inst::XmmVexPinsr { op, .. } => op.available_from(), } } } @@ -622,40 +626,6 @@ impl Inst { } } -// Inst helpers. - -impl Inst { - /// In certain cases, instructions of this format can act as a definition of an XMM register, - /// producing a value that is independent of its initial value. - /// - /// For example, a vector equality comparison (`cmppd` or `cmpps`) that compares a register to - /// itself will generate all ones as a result, regardless of its value. From the register - /// allocator's point of view, we should (i) record the first register, which is normally a - /// mod, as a def instead; and (ii) not record the second register as a use, because it is the - /// same as the first register (already handled). - fn produces_const(&self) -> bool { - match self { - Self::AluRmiR { op, src1, src2, .. 
} => { - src2.clone().to_reg_mem_imm().to_reg() == Some(src1.to_reg()) - && (*op == AluRmiROpcode::Xor || *op == AluRmiROpcode::Sub) - } - - Self::XmmRmR { op, src1, src2, .. } => { - src2.clone().to_reg_mem().to_reg() == Some(src1.to_reg()) - && (*op == SseOpcode::Xorps - || *op == SseOpcode::Xorpd - || *op == SseOpcode::Pxor - || *op == SseOpcode::Pcmpeqb - || *op == SseOpcode::Pcmpeqw - || *op == SseOpcode::Pcmpeqd - || *op == SseOpcode::Pcmpeqq) - } - - _ => false, - } - } -} - //============================================================================= // Instructions: printing @@ -705,16 +675,6 @@ impl PrettyPrint for Inst { match self { Inst::Nop { len } => format!("{} len={}", ljustify("nop".to_string()), len), - Inst::AluRmiR { size, op, dst, .. } if self.produces_const() => { - let dst = pretty_print_reg(dst.to_reg().to_reg(), size.to_bytes(), allocs); - format!( - "{} {}, {}, {}", - ljustify2(op.to_string(), suffix_lqb(*size)), - dst, - dst, - dst - ) - } Inst::AluRmiR { size, op, @@ -734,6 +694,14 @@ impl PrettyPrint for Inst { dst ) } + Inst::AluConstOp { op, dst, size } => { + let size_bytes = size.to_bytes(); + let dst = pretty_print_reg(dst.to_reg().to_reg(), size_bytes, allocs); + format!( + "{} {dst}, {dst}, {dst}", + ljustify2(op.to_string(), suffix_lqb(*size)), + ) + } Inst::AluRM { size, op, @@ -945,11 +913,6 @@ impl PrettyPrint for Inst { format!("{} {}, {}", ljustify(op.to_string()), src, dst) } - Inst::XmmRmR { op, dst, .. } if self.produces_const() => { - let dst = pretty_print_reg(dst.to_reg().to_reg(), 8, allocs); - format!("{} {}, {}, {}", ljustify(op.to_string()), dst, dst, dst) - } - Inst::XmmRmR { op, src1, @@ -963,6 +926,11 @@ impl PrettyPrint for Inst { format!("{} {}, {}, {}", ljustify(op.to_string()), src1, src2, dst) } + Inst::XmmConstOp { op, dst } => { + let dst = pretty_print_reg(dst.to_reg().to_reg(), 8, allocs); + format!("{} {dst}, {dst}, {dst}", ljustify(op.to_string())) + } + Inst::XmmRmRBlend { op, src1, @@ -1016,13 +984,22 @@ impl PrettyPrint for Inst { let src1 = pretty_print_reg(src1.to_reg(), 8, allocs); let src2 = src2.pretty_print(8, allocs); - format!( - "{} ${imm} {}, {}, {}", - ljustify(op.to_string()), - src1, - src2, - dst - ) + format!("{} ${imm} {src1}, {src2}, {dst}", ljustify(op.to_string())) + } + + Inst::XmmVexPinsr { + op, + src1, + src2, + dst, + imm, + .. + } => { + let dst = pretty_print_reg(dst.to_reg().to_reg(), 8, allocs); + let src1 = pretty_print_reg(src1.to_reg(), 8, allocs); + let src2 = src2.pretty_print(8, allocs); + + format!("{} ${imm} {src1}, {src2}, {dst}", ljustify(op.to_string())) } Inst::XmmRmRVex3 { @@ -1048,6 +1025,22 @@ impl PrettyPrint for Inst { ) } + Inst::XmmRmRBlendVex { + op, + src1, + src2, + mask, + dst, + .. + } => { + let src1 = pretty_print_reg(src1.to_reg(), 8, allocs); + let dst = pretty_print_reg(dst.to_reg().to_reg(), 8, allocs); + let src2 = src2.pretty_print(8, allocs); + let mask = pretty_print_reg(mask.to_reg(), 8, allocs); + + format!("{} {src1}, {src2}, {mask}, {dst}", ljustify(op.to_string())) + } + Inst::XmmRmREvex { op, src1, @@ -1109,28 +1102,6 @@ impl PrettyPrint for Inst { ) } - Inst::XmmRmRImm { - op, dst, imm, size, .. 
- } if self.produces_const() => { - let dst = pretty_print_reg(dst.to_reg(), 8, allocs); - format!( - "{} ${}, {}, {}, {}", - ljustify(format!( - "{}{}", - op.to_string(), - if *size == OperandSize::Size64 { - ".w" - } else { - "" - } - )), - imm, - dst, - dst, - dst, - ) - } - Inst::XmmRmRImm { op, src1, @@ -1799,14 +1770,11 @@ fn x64_get_operands VReg>(inst: &Inst, collector: &mut OperandCol Inst::AluRmiR { src1, src2, dst, .. } => { - if inst.produces_const() { - collector.reg_def(dst.to_writable_reg()); - } else { - collector.reg_use(src1.to_reg()); - collector.reg_reuse_def(dst.to_writable_reg(), 0); - src2.get_operands(collector); - } + collector.reg_use(src1.to_reg()); + collector.reg_reuse_def(dst.to_writable_reg(), 0); + src2.get_operands(collector); } + Inst::AluConstOp { dst, .. } => collector.reg_def(dst.to_writable_reg()), Inst::AluRM { src1_dst, src2, .. } => { collector.reg_use(src2.to_reg()); src1_dst.get_operands(collector); @@ -1904,13 +1872,9 @@ fn x64_get_operands VReg>(inst: &Inst, collector: &mut OperandCol Inst::XmmRmR { src1, src2, dst, .. } => { - if inst.produces_const() { - collector.reg_def(dst.to_writable_reg()); - } else { - collector.reg_use(src1.to_reg()); - collector.reg_reuse_def(dst.to_writable_reg(), 0); - src2.get_operands(collector); - } + collector.reg_use(src1.to_reg()); + collector.reg_reuse_def(dst.to_writable_reg(), 0); + src2.get_operands(collector); } Inst::XmmRmRBlend { src1, @@ -1943,6 +1907,13 @@ fn x64_get_operands VReg>(inst: &Inst, collector: &mut OperandCol collector.reg_use(src1.to_reg()); src2.get_operands(collector); } + Inst::XmmVexPinsr { + src1, src2, dst, .. + } => { + collector.reg_def(dst.to_writable_reg()); + collector.reg_use(src1.to_reg()); + src2.get_operands(collector); + } Inst::XmmRmRVex3 { op, src1, @@ -1966,6 +1937,18 @@ fn x64_get_operands VReg>(inst: &Inst, collector: &mut OperandCol collector.reg_use(src2.to_reg()); src3.get_operands(collector); } + Inst::XmmRmRBlendVex { + src1, + src2, + mask, + dst, + .. + } => { + collector.reg_def(dst.to_writable_reg()); + collector.reg_use(src1.to_reg()); + src2.get_operands(collector); + collector.reg_use(mask.to_reg()); + } Inst::XmmRmREvex { op, src1, @@ -1999,9 +1982,7 @@ fn x64_get_operands VReg>(inst: &Inst, collector: &mut OperandCol dst, .. } => { - if inst.produces_const() { - collector.reg_def(*dst); - } else if !op.uses_src1() { + if !op.uses_src1() { // FIXME: split this instruction into two, so we don't // need this awkward src1-is-only-sometimes-an-arg // behavior. @@ -2013,6 +1994,9 @@ fn x64_get_operands VReg>(inst: &Inst, collector: &mut OperandCol src2.get_operands(collector); } } + Inst::XmmConstOp { dst, .. } => { + collector.reg_def(dst.to_writable_reg()); + } Inst::XmmUninitializedValue { dst } => collector.reg_def(dst.to_writable_reg()), Inst::XmmMinMaxSeq { lhs, rhs, dst, .. 
} => { collector.reg_use(rhs.to_reg()); diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle index 1275ab2ad2..d8a89c2af0 100644 --- a/cranelift/codegen/src/isa/x64/lower.isle +++ b/cranelift/codegen/src/isa/x64/lower.isle @@ -1333,11 +1333,11 @@ ;; i32x4.replace_lane (rule (vec_insert_lane $I32X4 vec val idx) - (x64_pinsrd vec val idx (OperandSize.Size32))) + (x64_pinsrd vec val idx)) ;; i64x2.replace_lane (rule (vec_insert_lane $I64X2 vec val idx) - (x64_pinsrd vec val idx (OperandSize.Size64))) + (x64_pinsrq vec val idx)) ;; f32x4.replace_lane (rule (vec_insert_lane $F32X4 vec val idx) @@ -2982,8 +2982,8 @@ ;; Rules for `fcvt_low_from_sint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -(rule (lower (fcvt_low_from_sint a @ (value_type ty))) - (x64_cvtdq2pd ty a)) +(rule (lower (fcvt_low_from_sint a)) + (x64_cvtdq2pd a)) ;; Rules for `fcvt_from_uint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -3148,7 +3148,7 @@ ;; Converting to unsigned int so if float src is negative or NaN ;; will first set to zero. - (tmp2 Xmm (x64_pxor src src)) ;; make a zero + (tmp2 Xmm (xmm_zero $F32X4)) (dst Xmm (x64_maxps src tmp2)) ;; Set tmp2 to INT_MAX+1. It is important to note here that after it looks @@ -3181,7 +3181,7 @@ ;; that have positive overflow (based on the mask) by setting these lanes ;; to 0x7FFFFFFF (tmp1 Xmm (x64_pxor tmp1 tmp2)) - (tmp2 Xmm (x64_pxor tmp2 tmp2)) ;; make another zero + (tmp2 Xmm (xmm_zero $I32X4)) (tmp1 Xmm (x64_pmaxsd tmp1 tmp2))) ;; Add this second set of converted lanes to the original to properly handle @@ -3611,7 +3611,7 @@ (rule (lower (has_type (multi_lane 8 16) (splat src))) (let ((vec Xmm (vec_insert_lane $I8X16 (xmm_uninit_value) src 0)) - (zeros Xmm (x64_pxor vec vec))) + (zeros Xmm (xmm_zero $I8X16))) ;; Shuffle the lowest byte lane to all other lanes. 
(x64_pshufb vec zeros))) @@ -3661,7 +3661,7 @@ (rule (lower (vall_true val @ (value_type ty))) (let ((src Xmm val) - (zeros Xmm (x64_pxor src src)) + (zeros Xmm (xmm_zero ty)) (cmp Xmm (x64_pcmpeq (vec_int_type ty) src zeros))) (with_flags (x64_ptest cmp cmp) (x64_setcc (CC.Z))))) @@ -3752,7 +3752,7 @@ ;; MOVAPD xmm_y, xmm_x ;; XORPD xmm_tmp, xmm_tmp - (zeros Xmm (x64_xorpd src src)) + (zeros Xmm (xmm_zero $F64X2)) (dst Xmm (x64_maxpd src zeros)) (umax_mask Xmm (x64_xmm_load_const $F64X2 (uunarrow_umax_mask))) diff --git a/cranelift/codegen/src/isa/x64/lower/isle.rs b/cranelift/codegen/src/isa/x64/lower/isle.rs index 8adbf94465..0e050cece8 100644 --- a/cranelift/codegen/src/isa/x64/lower/isle.rs +++ b/cranelift/codegen/src/isa/x64/lower/isle.rs @@ -891,12 +891,11 @@ impl Context for IsleContext<'_, '_, MInst, X64Backend> { None }; let dividend_hi = self.lower_ctx.alloc_tmp(types::I64).only_reg().unwrap(); - self.lower_ctx.emit(MInst::alu_rmi_r( - OperandSize::Size32, - AluRmiROpcode::Xor, - RegMemImm::reg(dividend_hi.to_reg()), - dividend_hi, - )); + self.lower_ctx.emit(MInst::AluConstOp { + op: AluRmiROpcode::Xor, + size: OperandSize::Size32, + dst: WritableGpr::from_reg(Gpr::new(dividend_hi.to_reg()).unwrap()), + }); self.lower_ctx.emit(MInst::checked_div_or_rem_seq( kind.clone(), size, diff --git a/cranelift/filetests/filetests/isa/x64/fcvt.clif b/cranelift/filetests/filetests/isa/x64/fcvt.clif index 40d570cce9..f5d366095f 100644 --- a/cranelift/filetests/filetests/isa/x64/fcvt.clif +++ b/cranelift/filetests/filetests/isa/x64/fcvt.clif @@ -1032,12 +1032,12 @@ block0(v0: f32x4): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; pxor %xmm2, %xmm2, %xmm2 +; xorps %xmm5, %xmm5, %xmm5 ; movdqa %xmm0, %xmm9 -; maxps %xmm9, %xmm2, %xmm9 -; pcmpeqd %xmm7, %xmm7, %xmm7 -; psrld %xmm7, $1, %xmm7 -; cvtdq2ps %xmm7, %xmm13 +; maxps %xmm9, %xmm5, %xmm9 +; pcmpeqd %xmm5, %xmm5, %xmm5 +; psrld %xmm5, $1, %xmm5 +; cvtdq2ps %xmm5, %xmm13 ; cvttps2dq %xmm9, %xmm12 ; subps %xmm9, %xmm13, %xmm9 ; cmpps $2, %xmm13, %xmm9, %xmm13 @@ -1055,12 +1055,12 @@ block0(v0: f32x4): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; pxor %xmm2, %xmm2 +; xorps %xmm5, %xmm5 ; movdqa %xmm0, %xmm9 -; maxps %xmm2, %xmm9 -; pcmpeqd %xmm7, %xmm7 -; psrld $1, %xmm7 -; cvtdq2ps %xmm7, %xmm13 +; maxps %xmm5, %xmm9 +; pcmpeqd %xmm5, %xmm5 +; psrld $1, %xmm5 +; cvtdq2ps %xmm5, %xmm13 ; cvttps2dq %xmm9, %xmm12 ; subps %xmm13, %xmm9 ; cmpleps %xmm9, %xmm13 diff --git a/cranelift/filetests/filetests/isa/x64/simd-arith-avx.clif b/cranelift/filetests/filetests/isa/x64/simd-arith-avx.clif new file mode 100644 index 0000000000..fa6ceda2c6 --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/simd-arith-avx.clif @@ -0,0 +1,1886 @@ +test compile precise-output +set enable_simd +target x86_64 has_avx + +function %i8x16_add(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = iadd v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpaddb %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpaddb %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i16x8_add(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = iadd v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpaddw %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; 
vpaddw %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i32x4_add(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = iadd v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpaddd %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpaddd %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i64x2_add(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = iadd v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpaddq %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpaddq %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i8x16_add_sat(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = sadd_sat v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpaddsb %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpaddsb %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i16x8_add_sat(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = sadd_sat v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpaddsw %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpaddsw %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %u8x16_add_sat(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = uadd_sat v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpaddusb %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpaddusb %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %u16x8_add_sat(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = uadd_sat v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpaddusw %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpaddusw %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i8x16_sub(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = isub v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpsubb %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpsubb %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i16x8_sub(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = isub v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpsubw %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpsubw %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i32x4_sub(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = isub v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; 
vpsubd %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpsubd %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i64x2_sub(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = isub v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpsubq %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpsubq %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i8x16_sub_sat(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = ssub_sat v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpsubsb %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpsubsb %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i16x8_sub_sat(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = ssub_sat v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpsubsw %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpsubsw %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %u8x16_sub_sat(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = usub_sat v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpsubusb %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpsubusb %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %u16x8_sub_sat(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = usub_sat v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpsubusw %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpsubusw %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i8x16_avg(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = avg_round v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpavgb %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpavgb %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i16x8_avg(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = avg_round v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpavgw %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpavgw %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i16x8_mul(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = imul v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpmullw %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpmullw %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq 
%rbp +; retq + +function %i32x4_mul(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = imul v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpmulld %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpmulld %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i32x4_extmul_high_i16x8_s(i16x8, i16x8) -> i32x4 { +block0(v0: i16x8, v1: i16x8): + v2 = swiden_high v0 + v3 = swiden_high v1 + v4 = imul v2, v3 + return v4 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpmullw %xmm0, %xmm1, %xmm3 +; vpmulhw %xmm0, %xmm1, %xmm5 +; vpunpckhwd %xmm3, %xmm5, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpmullw %xmm1, %xmm0, %xmm3 +; vpmulhw %xmm1, %xmm0, %xmm5 +; vpunpckhwd %xmm5, %xmm3, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i32x4_extmul_low_i16x8_u(i16x8, i16x8) -> i32x4 { +block0(v0: i16x8, v1: i16x8): + v2 = uwiden_low v0 + v3 = uwiden_low v1 + v4 = imul v2, v3 + return v4 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpmullw %xmm0, %xmm1, %xmm3 +; vpmulhuw %xmm0, %xmm1, %xmm5 +; vpunpcklwd %xmm3, %xmm5, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpmullw %xmm1, %xmm0, %xmm3 +; vpmulhuw %xmm1, %xmm0, %xmm5 +; vpunpcklwd %xmm5, %xmm3, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i16x8_sqmul_round_sat(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = sqmul_round_sat v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movdqu const(0), %xmm3 +; vpmulhrsw %xmm0, %xmm1, %xmm5 +; vpcmpeqw %xmm3, %xmm5, %xmm7 +; vpxor %xmm5, %xmm7, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movdqu 0x14(%rip), %xmm3 +; vpmulhrsw %xmm1, %xmm0, %xmm5 +; vpcmpeqw %xmm5, %xmm3, %xmm7 +; vpxor %xmm7, %xmm5, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq +; addb %al, (%rax) +; addb %al, -0x7fff8000(%rax) +; addb %al, -0x7fff8000(%rax) + +function %i64x2_extmul_high_i32x4_s(i32x4, i32x4) -> i64x2 { +block0(v0: i32x4, v1: i32x4): + v2 = swiden_high v0 + v3 = swiden_high v1 + v4 = imul v2, v3 + return v4 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pshufd $250, %xmm0, %xmm3 +; pshufd $250, %xmm1, %xmm5 +; vpmuldq %xmm3, %xmm5, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pshufd $0xfa, %xmm0, %xmm3 +; pshufd $0xfa, %xmm1, %xmm5 +; vpmuldq %xmm5, %xmm3, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i64x2_extmul_low_i32x4_u(i32x4, i32x4) -> i64x2 { +block0(v0: i32x4, v1: i32x4): + v2 = uwiden_low v0 + v3 = uwiden_low v1 + v4 = imul v2, v3 + return v4 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pshufd $80, %xmm0, %xmm3 +; pshufd $80, %xmm1, %xmm5 +; vpmuludq %xmm3, %xmm5, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pshufd $0x50, %xmm0, %xmm3 +; pshufd $0x50, %xmm1, %xmm5 +; vpmuludq %xmm5, %xmm3, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %f64x2_from_i32x4(i32x4) -> f64x2 { 
+block0(v0: i32x4): + v1 = uwiden_low v0 + v2 = fcvt_from_uint.f64x2 v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movdqu const(0), %xmm2 +; vunpcklps %xmm0, %xmm2, %xmm4 +; movdqu const(1), %xmm6 +; vsubpd %xmm4, %xmm6, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movdqu 0x24(%rip), %xmm2 +; vunpcklps %xmm2, %xmm0, %xmm4 +; movdqu 0x28(%rip), %xmm6 +; vsubpd %xmm6, %xmm4, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq +; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) +; addb %dh, (%rax) +; addb %al, (%r8) +; xorb %al, (%rbx) +; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) +; addb %dh, (%rax) +; addb %al, (%r8) +; addb %al, (%rax) +; addb %al, (%rax) + +function %f32x4_add(f32x4, f32x4) -> f32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = fadd v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vaddps %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vaddps %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %f64x2_add(f64x2, f64x2) -> f64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = fadd v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vaddpd %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vaddpd %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %f32x4_sub(f32x4, f32x4) -> f32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = fsub v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vsubps %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vsubps %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %f64x2_sub(f64x2, f64x2) -> f64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = fsub v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vsubpd %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vsubpd %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %f32x4_mul(f32x4, f32x4) -> f32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = fmul v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vmulps %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vmulps %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %f64x2_mul(f64x2, f64x2) -> f64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = fmul v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vmulpd %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vmulpd %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %f32x4_div(f32x4, f32x4) -> f32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = fdiv v0, v1 + return v2 +} + +; VCode: 
+; pushq %rbp +; movq %rsp, %rbp +; block0: +; vdivps %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vdivps %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %f64x2_div(f64x2, f64x2) -> f64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = fdiv v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vdivpd %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vdivpd %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i8x16_ishr(i8x16, i32) -> i8x16 { +block0(v0: i8x16, v1: i32): + v2 = sshr v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movq %rdi, %r9 +; andq %r9, $7, %r9 +; vpunpcklbw %xmm0, %xmm0, %xmm5 +; vpunpckhbw %xmm0, %xmm0, %xmm7 +; addl %r9d, $8, %r9d +; movd %r9d, %xmm11 +; vpsraw %xmm5, %xmm11, %xmm13 +; vpsraw %xmm7, %xmm11, %xmm15 +; vpacksswb %xmm13, %xmm15, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movq %rdi, %r9 +; andq $7, %r9 +; vpunpcklbw %xmm0, %xmm0, %xmm5 +; vpunpckhbw %xmm0, %xmm0, %xmm7 +; addl $8, %r9d +; movd %r9d, %xmm11 +; vpsraw %xmm11, %xmm5, %xmm13 +; vpsraw %xmm11, %xmm7, %xmm15 +; vpacksswb %xmm15, %xmm13, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i8x16_ishr_imm(i8x16) -> i8x16 { +block0(v0: i8x16): + v1 = iconst.i32 3 + v2 = sshr v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpunpcklbw %xmm0, %xmm0, %xmm2 +; vpunpckhbw %xmm0, %xmm0, %xmm4 +; vpsraw %xmm2, $11, %xmm6 +; vpsraw %xmm4, $11, %xmm8 +; vpacksswb %xmm6, %xmm8, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpunpcklbw %xmm0, %xmm0, %xmm2 +; vpunpckhbw %xmm0, %xmm0, %xmm4 +; vpsraw $0xb, %xmm2, %xmm6 +; vpsraw $0xb, %xmm4, %xmm8 +; vpacksswb %xmm8, %xmm6, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i16x8_ishr(i16x8, i32) -> i16x8 { +block0(v0: i16x8, v1: i32): + v2 = sshr v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movq %rdi, %rcx +; andq %rcx, $15, %rcx +; movd %ecx, %xmm5 +; vpsraw %xmm0, %xmm5, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movq %rdi, %rcx +; andq $0xf, %rcx +; movd %ecx, %xmm5 +; vpsraw %xmm5, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i16x8_ishr_imm(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = iconst.i32 3 + v2 = sshr v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpsraw %xmm0, $3, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpsraw $3, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i32x4_ishr(i32x4, i32) -> i32x4 { +block0(v0: i32x4, v1: i32): + v2 = sshr v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movq %rdi, %rcx +; andq %rcx, $31, %rcx +; movd %ecx, %xmm5 +; vpsrad %xmm0, %xmm5, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movq %rdi, %rcx 
+; andq $0x1f, %rcx +; movd %ecx, %xmm5 +; vpsrad %xmm5, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i32x4_ishr_imm(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = iconst.i32 3 + v2 = sshr v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpsrad %xmm0, $3, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpsrad $3, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i8x16_snarrow(i16x8, i16x8) -> i8x16 { +block0(v0: i16x8, v1: i16x8): + v2 = snarrow v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpacksswb %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpacksswb %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i8x16_unarrow(i16x8, i16x8) -> i8x16 { +block0(v0: i16x8, v1: i16x8): + v2 = unarrow v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpackuswb %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpackuswb %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i16x8_snarrow(i32x4, i32x4) -> i16x8 { +block0(v0: i32x4, v1: i32x4): + v2 = snarrow v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpackssdw %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpackssdw %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i16x8_unarrow(i32x4, i32x4) -> i16x8 { +block0(v0: i32x4, v1: i32x4): + v2 = unarrow v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpackusdw %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpackusdw %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i8x16_uwiden_high(i8x16) -> i16x8 { +block0(v0: i8x16): + v1 = uwiden_high v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpalignr $8 %xmm0, %xmm0, %xmm2 +; pmovzxbw %xmm2, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpalignr $8, %xmm0, %xmm0, %xmm2 +; pmovzxbw %xmm2, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i8x16_iadd_pairwise(i8x16) -> i16x8 { +block0(v0: i8x16): + v1 = swiden_high v0 + v2 = swiden_low v0 + v3 = iadd_pairwise v2, v1 + return v3 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movdqu const(0), %xmm2 +; vpmaddubsw %xmm2, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movdqu 0x14(%rip), %xmm2 +; vpmaddubsw %xmm0, %xmm2, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq +; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) +; addl %eax, (%rcx) +; addl %eax, (%rcx) +; addl %eax, (%rcx) +; addl %eax, (%rcx) +; addl %eax, (%rcx) +; addl %eax, (%rcx) +; addl %eax, (%rcx) +; addl %eax, (%rcx) + +function %i16x8_iadd_pairwise(i16x8) -> i32x4 { +block0(v0: i16x8): + v1 = swiden_high v0 + v2 = 
swiden_low v0 + v3 = iadd_pairwise v2, v1 + return v3 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movdqu const(0), %xmm2 +; vpmaddwd %xmm0, %xmm2, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movdqu 0x14(%rip), %xmm2 +; vpmaddwd %xmm2, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq +; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rcx) +; addb %al, (%rcx) +; addb %al, (%rcx) +; addb %al, (%rcx) +; addb %al, (%rcx) +; addb %al, (%rcx) +; addb %al, (%rcx) +; addb %al, (%rcx) + +function %i8x16_splat(i8) -> i8x16 { +block0(v0: i8): + v1 = splat.i8x16 v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; uninit %xmm2 +; vpinsrb $0 %xmm2, %rdi, %xmm4 +; pxor %xmm6, %xmm6, %xmm6 +; vpshufb %xmm4, %xmm6, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpinsrb $0, %edi, %xmm2, %xmm4 +; pxor %xmm6, %xmm6 +; vpshufb %xmm6, %xmm4, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i32x4_trunc_sat_f64x2_u_zero(f64x2) -> i32x4 { +block0(v0: f64x2): + v1 = fcvt_to_uint_sat.i64x2 v0 + v2 = vconst.i64x2 0x00 + v3 = uunarrow v1, v2 + return v3 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; xorpd %xmm2, %xmm2, %xmm2 +; vmaxpd %xmm0, %xmm2, %xmm4 +; movupd const(0), %xmm6 +; vminpd %xmm4, %xmm6, %xmm8 +; roundpd $3, %xmm8, %xmm10 +; movupd const(1), %xmm12 +; vaddpd %xmm10, %xmm12, %xmm14 +; vshufps $136 %xmm14, %xmm2, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; xorpd %xmm2, %xmm2 +; vmaxpd %xmm2, %xmm0, %xmm4 +; movupd 0x2c(%rip), %xmm6 +; vminpd %xmm6, %xmm4, %xmm8 +; roundpd $3, %xmm8, %xmm10 +; movupd 0x28(%rip), %xmm12 +; vaddpd %xmm12, %xmm10, %xmm14 +; vshufps $0x88, %xmm2, %xmm14, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq +; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) +; addb %ah, %al + +function %i8x16_shl(i8x16, i32) -> i8x16 { +block0(v0: i8x16, v1: i32): + v2 = ishl v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movq %rdi, %r10 +; andq %r10, $7, %r10 +; movd %r10d, %xmm5 +; vpsllw %xmm0, %xmm5, %xmm7 +; lea const(0), %rsi +; shlq $4, %r10, %r10 +; movdqu 0(%rsi,%r10,1), %xmm13 +; vpand %xmm7, %xmm13, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movq %rdi, %r10 +; andq $7, %r10 +; movd %r10d, %xmm5 +; vpsllw %xmm5, %xmm0, %xmm7 +; leaq 0x15(%rip), %rsi +; shlq $4, %r10 +; movdqu (%rsi, %r10), %xmm13 +; vpand %xmm13, %xmm7, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i8x16_shl_imm(i8x16) -> i8x16 { +block0(v0: i8x16): + v1 = iconst.i32 1 + v2 = ishl v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpsllw %xmm0, $1, %xmm2 +; movdqu const(0), %xmm4 +; vpand %xmm2, %xmm4, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpsllw $1, %xmm0, %xmm2 +; movdqu 0xf(%rip), %xmm4 +; vpand %xmm4, %xmm2, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq +; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) + +function %i16x8_shl(i16x8, 
i32) -> i16x8 { +block0(v0: i16x8, v1: i32): + v2 = ishl v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movq %rdi, %rcx +; andq %rcx, $15, %rcx +; movd %ecx, %xmm5 +; vpsllw %xmm0, %xmm5, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movq %rdi, %rcx +; andq $0xf, %rcx +; movd %ecx, %xmm5 +; vpsllw %xmm5, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i16x8_shl_imm(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = iconst.i32 1 + v2 = ishl v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpsllw %xmm0, $1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpsllw $1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i32x4_shl(i32x4, i32) -> i32x4 { +block0(v0: i32x4, v1: i32): + v2 = ishl v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movq %rdi, %rcx +; andq %rcx, $31, %rcx +; movd %ecx, %xmm5 +; vpslld %xmm0, %xmm5, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movq %rdi, %rcx +; andq $0x1f, %rcx +; movd %ecx, %xmm5 +; vpslld %xmm5, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i32x4_shl_imm(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = iconst.i32 1 + v2 = ishl v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpslld %xmm0, $1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpslld $1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i64x2_shl(i64x2, i32) -> i64x2 { +block0(v0: i64x2, v1: i32): + v2 = ishl v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movq %rdi, %rcx +; andq %rcx, $63, %rcx +; movd %ecx, %xmm5 +; vpsllq %xmm0, %xmm5, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movq %rdi, %rcx +; andq $0x3f, %rcx +; movd %ecx, %xmm5 +; vpsllq %xmm5, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i64x2_shl_imm(i64x2) -> i64x2 { +block0(v0: i64x2): + v1 = iconst.i32 1 + v2 = ishl v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpsllq %xmm0, $1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpsllq $1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i8x16_ushr(i8x16, i32) -> i8x16 { +block0(v0: i8x16, v1: i32): + v2 = ushr v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movq %rdi, %r10 +; andq %r10, $7, %r10 +; movd %r10d, %xmm5 +; vpsrlw %xmm0, %xmm5, %xmm7 +; lea const(0), %rsi +; shlq $4, %r10, %r10 +; movdqu 0(%rsi,%r10,1), %xmm13 +; vpand %xmm7, %xmm13, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movq %rdi, %r10 +; andq $7, %r10 +; movd %r10d, %xmm5 +; vpsrlw %xmm5, %xmm0, %xmm7 +; leaq 0x15(%rip), %rsi +; shlq $4, %r10 +; movdqu (%rsi, %r10), %xmm13 +; vpand %xmm13, %xmm7, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i8x16_ushr_imm(i8x16) -> 
i8x16 { +block0(v0: i8x16): + v1 = iconst.i32 1 + v2 = ushr v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpsrlw %xmm0, $1, %xmm2 +; movdqu const(0), %xmm4 +; vpand %xmm2, %xmm4, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpsrlw $1, %xmm0, %xmm2 +; movdqu 0xf(%rip), %xmm4 +; vpand %xmm4, %xmm2, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq +; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) +; jg 0xa1 +; jg 0xa3 +; jg 0xa5 +; jg 0xa7 +; jg 0xa9 +; jg 0xab +; jg 0xad +; jg 0xaf + +function %i16x8_ushr(i16x8, i32) -> i16x8 { +block0(v0: i16x8, v1: i32): + v2 = ushr v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movq %rdi, %rcx +; andq %rcx, $15, %rcx +; movd %ecx, %xmm5 +; vpsrlw %xmm0, %xmm5, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movq %rdi, %rcx +; andq $0xf, %rcx +; movd %ecx, %xmm5 +; vpsrlw %xmm5, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i16x8_ushr_imm(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = iconst.i32 1 + v2 = ushr v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpsrlw %xmm0, $1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpsrlw $1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i32x4_ushr(i32x4, i32) -> i32x4 { +block0(v0: i32x4, v1: i32): + v2 = ushr v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movq %rdi, %rcx +; andq %rcx, $31, %rcx +; movd %ecx, %xmm5 +; vpsrld %xmm0, %xmm5, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movq %rdi, %rcx +; andq $0x1f, %rcx +; movd %ecx, %xmm5 +; vpsrld %xmm5, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i32x4_ushr_imm(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = iconst.i32 1 + v2 = ushr v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpsrld %xmm0, $1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpsrld $1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i64x2_ushr(i64x2, i32) -> i64x2 { +block0(v0: i64x2, v1: i32): + v2 = ushr v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movq %rdi, %rcx +; andq %rcx, $63, %rcx +; movd %ecx, %xmm5 +; vpsrlq %xmm0, %xmm5, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movq %rdi, %rcx +; andq $0x3f, %rcx +; movd %ecx, %xmm5 +; vpsrlq %xmm5, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i64x2_ushr_imm(i64x2) -> i64x2 { +block0(v0: i64x2): + v1 = iconst.i32 1 + v2 = ushr v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpsrlq %xmm0, $1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpsrlq $1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + diff --git a/cranelift/filetests/filetests/isa/x64/simd-bitwise-avx.clif 
b/cranelift/filetests/filetests/isa/x64/simd-bitwise-avx.clif index eeb096b3e4..88225f5a88 100644 --- a/cranelift/filetests/filetests/isa/x64/simd-bitwise-avx.clif +++ b/cranelift/filetests/filetests/isa/x64/simd-bitwise-avx.clif @@ -2,43 +2,6 @@ test compile precise-output set enable_simd target x86_64 has_avx -function %mask_from_icmp(f32x4, f32x4) -> f32x4 { -block0(v0: f32x4, v1: f32x4): - v2 = fmin v0, v1 - return v2 -} - -; VCode: -; pushq %rbp -; movq %rsp, %rbp -; block0: -; vminps %xmm0, %xmm1, %xmm3 -; vminps %xmm1, %xmm0, %xmm5 -; vorps %xmm3, %xmm5, %xmm7 -; vcmpps $3 %xmm7, %xmm5, %xmm9 -; vorps %xmm7, %xmm9, %xmm11 -; vpsrld %xmm9, $10, %xmm13 -; vandnps %xmm13, %xmm11, %xmm0 -; movq %rbp, %rsp -; popq %rbp -; ret -; -; Disassembled: -; block0: ; offset 0x0 -; pushq %rbp -; movq %rsp, %rbp -; block1: ; offset 0x4 -; vminps %xmm1, %xmm0, %xmm3 -; vminps %xmm0, %xmm1, %xmm5 -; vorps %xmm5, %xmm3, %xmm7 -; vcmpunordps %xmm5, %xmm7, %xmm9 -; vorps %xmm9, %xmm7, %xmm11 -; vpsrld $0xa, %xmm9, %xmm13 -; vandnps %xmm11, %xmm13, %xmm0 -; movq %rbp, %rsp -; popq %rbp -; retq - function %or_from_memory(f32x4, i64) -> f32x4 { block0(v0: f32x4, v1: i64): v2 = load.f32x4 notrap aligned v1 @@ -81,10 +44,10 @@ block0(v0: i64): ; block0: ; movss 0(%rdi), %xmm7 ; movl $-2147483648, %ecx -; movd %ecx, %xmm8 -; vandnps %xmm8, const(0), %xmm9 -; andps %xmm8, %xmm7, %xmm8 -; vorps %xmm9, %xmm8, %xmm0 +; movd %ecx, %xmm5 +; vandnps %xmm5, const(0), %xmm8 +; vandps %xmm5, %xmm7, %xmm9 +; vorps %xmm8, %xmm9, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -96,10 +59,10 @@ block0(v0: i64): ; block1: ; offset 0x4 ; movss (%rdi), %xmm7 ; movl $0x80000000, %ecx -; movd %ecx, %xmm8 -; vandnps 0x16(%rip), %xmm8, %xmm9 -; andps %xmm7, %xmm8 -; vorps %xmm8, %xmm9, %xmm0 +; movd %ecx, %xmm5 +; vandnps 0x17(%rip), %xmm5, %xmm8 +; vandps %xmm7, %xmm5, %xmm9 +; vorps %xmm9, %xmm8, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq @@ -166,9 +129,9 @@ block0(v0: f32x4, v1: f32x4): ; popq %rbp ; retq -function %i32x4_shr(i32x4, i32) -> i32x4 { -block0(v0: i32x4, v1: i32): - v2 = ushr v0, v1 +function %band_not_f64x2(f64x2, f64x2) -> f64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = band_not v0, v1 return v2 } @@ -176,10 +139,7 @@ block0(v0: i32x4, v1: i32): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; movq %rdi, %rcx -; andq %rcx, $31, %rcx -; movd %ecx, %xmm5 -; vpsrld %xmm0, %xmm5, %xmm0 +; vandnpd %xmm1, %xmm0, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -189,10 +149,32 @@ block0(v0: i32x4, v1: i32): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; movq %rdi, %rcx -; andq $0x1f, %rcx -; movd %ecx, %xmm5 -; vpsrld %xmm5, %xmm0, %xmm0 +; vandnpd %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %band_not_i64x2(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = band_not v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpandn %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpandn %xmm0, %xmm1, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq @@ -209,7 +191,7 @@ block0(v0: f32x4): ; block0: ; pcmpeqd %xmm2, %xmm2, %xmm2 ; vpsrld %xmm2, $1, %xmm4 -; andps %xmm0, %xmm4, %xmm0 +; vandps %xmm0, %xmm4, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -221,7 +203,457 @@ block0(v0: f32x4): ; block1: ; offset 0x4 ; pcmpeqd %xmm2, %xmm2 ; vpsrld $1, %xmm2, %xmm4 -; andps %xmm4, %xmm0 +; vandps %xmm4, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function 
%i16x8_and(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = band v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpand %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpand %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %f32x4_and(f32x4, f32x4) -> f32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = band v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vandps %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vandps %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %f64x2_and(f64x2, f64x2) -> f64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = band v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vandpd %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vandpd %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i16x8_or(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bor v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpor %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpor %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %f32x4_or(f32x4, f32x4) -> f32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = bor v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vorps %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vorps %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %f64x2_or(f64x2, f64x2) -> f64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = bor v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vorpd %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vorpd %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i16x8_xor(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bxor v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpxor %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpxor %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %f32x4_xor(f32x4, f32x4) -> f32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = bxor v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vxorps %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vxorps %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %f64x2_xor(f64x2, f64x2) -> f64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = bxor v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vxorpd %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, 
%rbp +; block1: ; offset 0x4 +; vxorpd %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i16x8_bitselect(i16x8, i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8, v2: i16x8): + v3 = vselect v0, v1, v2 + return v3 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpblendvb %xmm0, %xmm1, %xmm0, %xmm2 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpblendvb %xmm0, %xmm1, %xmm2, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i32x4_bitselect(i32x4, f32x4, f32x4) -> f32x4 { +block0(v0: i32x4, v1: f32x4, v2: f32x4): + v3 = vselect v0, v1, v2 + return v3 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vblendvps %xmm0, %xmm1, %xmm0, %xmm2 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vblendvps %xmm0, %xmm1, %xmm2, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i64x2_bitselect(i64x2, f64x2, f64x2) -> f64x2 { +block0(v0: i64x2, v1: f64x2, v2: f64x2): + v3 = vselect v0, v1, v2 + return v3 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vblendvpd %xmm0, %xmm1, %xmm0, %xmm2 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vblendvpd %xmm0, %xmm1, %xmm2, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %f32x4_replace_lane(f32x4, f32) -> f32x4 { +block0(v0: f32x4, v1: f32): + v2 = insertlane v0, v1, 1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vinsertps $16 %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vinsertps $0x10, %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %f64x2_replace_lane(f64x2, f64) -> f64x2 { +block0(v0: f64x2, v1: f64): + v2 = insertlane v0, v1, 1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vmovlhps %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vmovlhps %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i8x16_replace_lane(i8x16, i8) -> i8x16 { +block0(v0: i8x16, v1: i8): + v2 = insertlane v0, v1, 1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpinsrb $1 %xmm0, %rdi, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpinsrb $1, %edi, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i16x8_replace_lane(i16x8, i16) -> i16x8 { +block0(v0: i16x8, v1: i16): + v2 = insertlane v0, v1, 1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpinsrw $1 %xmm0, %rdi, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpinsrw $1, %edi, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i32x4_replace_lane(i32x4, i32) -> i32x4 { +block0(v0: i32x4, v1: i32): + v2 = insertlane v0, v1, 1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpinsrd $1 %xmm0, %rdi, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; 
offset 0x4 +; vpinsrd $1, %edi, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i64x2_replace_lane(i64x2, i64) -> i64x2 { +block0(v0: i64x2, v1: i64): + v2 = insertlane v0, v1, 1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpinsrq $1 %xmm0, %rdi, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpinsrq $1, %rdi, %xmm0, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq diff --git a/cranelift/filetests/filetests/isa/x64/simd-cmp-avx.clif b/cranelift/filetests/filetests/isa/x64/simd-cmp-avx.clif new file mode 100644 index 0000000000..b46b4e722e --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/simd-cmp-avx.clif @@ -0,0 +1,656 @@ +test compile precise-output +set enable_simd +target x86_64 has_avx + +function %i8x16_eq(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = icmp eq v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpcmpeqb %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpcmpeqb %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i16x8_eq(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = icmp eq v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpcmpeqw %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpcmpeqw %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i32x4_eq(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = icmp eq v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpcmpeqd %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpcmpeqd %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i64x2_eq(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp eq v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpcmpeqq %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpcmpeqq %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i8x16_gt(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = icmp sgt v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpcmpgtb %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpcmpgtb %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i16x8_gt(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = icmp sgt v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpcmpgtw %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpcmpgtw %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i32x4_gt(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = icmp sgt v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpcmpgtd %xmm0, %xmm1, %xmm0 +; 
movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpcmpgtd %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i64x2_gt(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = icmp sgt v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpcmpgtq %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpcmpgtq %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %f32x4_min(f32x4, f32x4) -> f32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = fmin v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vminps %xmm0, %xmm1, %xmm3 +; vminps %xmm1, %xmm0, %xmm5 +; vorps %xmm3, %xmm5, %xmm7 +; vcmpps $3 %xmm7, %xmm5, %xmm9 +; vorps %xmm7, %xmm9, %xmm11 +; vpsrld %xmm9, $10, %xmm13 +; vandnps %xmm13, %xmm11, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vminps %xmm1, %xmm0, %xmm3 +; vminps %xmm0, %xmm1, %xmm5 +; vorps %xmm5, %xmm3, %xmm7 +; vcmpunordps %xmm5, %xmm7, %xmm9 +; vorps %xmm9, %xmm7, %xmm11 +; vpsrld $0xa, %xmm9, %xmm13 +; vandnps %xmm11, %xmm13, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %f64x2_min(f64x2, f64x2) -> f64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = fmin v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vminpd %xmm0, %xmm1, %xmm3 +; vminpd %xmm1, %xmm0, %xmm5 +; vorpd %xmm3, %xmm5, %xmm7 +; vcmppd $3 %xmm3, %xmm5, %xmm9 +; vorpd %xmm7, %xmm9, %xmm11 +; vpsrlq %xmm9, $13, %xmm13 +; vandnpd %xmm13, %xmm11, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vminpd %xmm1, %xmm0, %xmm3 +; vminpd %xmm0, %xmm1, %xmm5 +; vorpd %xmm5, %xmm3, %xmm7 +; vcmpunordpd %xmm5, %xmm3, %xmm9 +; vorpd %xmm9, %xmm7, %xmm11 +; vpsrlq $0xd, %xmm9, %xmm13 +; vandnpd %xmm11, %xmm13, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %f32x4_max(f32x4, f32x4) -> f32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = fmax v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vmaxps %xmm0, %xmm1, %xmm3 +; vmaxps %xmm1, %xmm0, %xmm5 +; vxorps %xmm3, %xmm5, %xmm7 +; vorps %xmm3, %xmm7, %xmm9 +; vsubps %xmm9, %xmm7, %xmm11 +; vcmpps $3 %xmm9, %xmm9, %xmm13 +; vpsrld %xmm13, $10, %xmm15 +; vandnps %xmm15, %xmm11, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vmaxps %xmm1, %xmm0, %xmm3 +; vmaxps %xmm0, %xmm1, %xmm5 +; vxorps %xmm5, %xmm3, %xmm7 +; vorps %xmm7, %xmm3, %xmm9 +; vsubps %xmm7, %xmm9, %xmm11 +; vcmpunordps %xmm9, %xmm9, %xmm13 +; vpsrld $0xa, %xmm13, %xmm15 +; vandnps %xmm11, %xmm15, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %f64x2_max(f64x2, f64x2) -> f64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = fmax v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vmaxpd %xmm0, %xmm1, %xmm3 +; vmaxpd %xmm1, %xmm0, %xmm5 +; vxorpd %xmm3, %xmm5, %xmm7 +; vorpd %xmm3, %xmm7, %xmm9 +; vsubpd %xmm9, %xmm7, %xmm11 +; vcmppd $3 %xmm9, %xmm9, %xmm13 +; vpsrlq %xmm13, $13, %xmm15 +; vandnpd %xmm15, %xmm11, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq 
%rsp, %rbp +; block1: ; offset 0x4 +; vmaxpd %xmm1, %xmm0, %xmm3 +; vmaxpd %xmm0, %xmm1, %xmm5 +; vxorpd %xmm5, %xmm3, %xmm7 +; vorpd %xmm7, %xmm3, %xmm9 +; vsubpd %xmm7, %xmm9, %xmm11 +; vcmpunordpd %xmm9, %xmm9, %xmm13 +; vpsrlq $0xd, %xmm13, %xmm15 +; vandnpd %xmm11, %xmm15, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i8x16_min(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = smin v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpminsb %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpminsb %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %u8x16_min(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = umin v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpminub %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpminub %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i16x8_min(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = smin v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpminsw %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpminsw %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %u16x8_min(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = umin v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpminuw %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpminuw %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i32x4_min(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = smin v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpminsd %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpminsd %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %u32x4_min(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = umin v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpminud %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpminud %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i8x16_max(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = smax v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpmaxsb %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpmaxsb %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %u8x16_max(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = umax v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpmaxub %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpmaxub 
%xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i16x8_max(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = smax v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpmaxsw %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpmaxsw %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %u16x8_max(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = umax v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpmaxuw %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpmaxuw %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i32x4_max(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = smax v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpmaxsd %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpmaxsd %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %u32x4_max(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = umax v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpmaxud %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpmaxud %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + diff --git a/cranelift/filetests/filetests/isa/x64/simd-comparison-legalize.clif b/cranelift/filetests/filetests/isa/x64/simd-comparison-legalize.clif index a7fce09551..80e7fa248a 100644 --- a/cranelift/filetests/filetests/isa/x64/simd-comparison-legalize.clif +++ b/cranelift/filetests/filetests/isa/x64/simd-comparison-legalize.clif @@ -1,6 +1,6 @@ test compile precise-output set enable_simd -target x86_64 skylake +target x86_64 function %icmp_ne_32x4(i32x4, i32x4) -> i32x4 { block0(v0: i32x4, v1: i32x4): diff --git a/cranelift/filetests/filetests/isa/x64/simd-logical-compile.clif b/cranelift/filetests/filetests/isa/x64/simd-logical-compile.clif index 4d07e7912b..a4098176d0 100644 --- a/cranelift/filetests/filetests/isa/x64/simd-logical-compile.clif +++ b/cranelift/filetests/filetests/isa/x64/simd-logical-compile.clif @@ -1,6 +1,6 @@ test compile precise-output set enable_simd -target x86_64 skylake +target x86_64 function %bnot_i32x4(i32x4) -> i32x4 { block0(v0: i32x4): diff --git a/cranelift/filetests/filetests/runtests/simd-lane-access.clif b/cranelift/filetests/filetests/runtests/simd-lane-access.clif index 412de68046..54de6a450d 100644 --- a/cranelift/filetests/filetests/runtests/simd-lane-access.clif +++ b/cranelift/filetests/filetests/runtests/simd-lane-access.clif @@ -3,6 +3,7 @@ target aarch64 target s390x set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41 +target x86_64 has_sse3 has_ssse3 has_sse41 has_avx ;; shuffle diff --git a/tests/misc_testsuite/simd/load_splat_out_of_bounds.wast b/tests/misc_testsuite/simd/load_splat_out_of_bounds.wast new file mode 100644 index 0000000000..9085aa67d7 --- /dev/null +++ b/tests/misc_testsuite/simd/load_splat_out_of_bounds.wast @@ -0,0 +1,27 @@ +;; aligned and out of bounds +(module + (func + i32.const 0 + v128.load32_splat + v128.any_true + if + end + ) + 
(memory 0 6) + (export "x" (func 0)) +) +(assert_trap (invoke "x") "out of bounds memory access") + +;; unaligned and in bounds +(module + (func + i32.const 1 + v128.load32_splat + v128.any_true + if + end + ) + (memory 1 6) + (export "x" (func 0)) +) +(assert_return (invoke "x"))