;; x86-64 instruction selection and CLIF-to-MachInst lowering.

;; The main lowering constructor term: takes a clif `Inst` and returns the
;; register(s) within which the lowered instruction's result values live.
(decl lower (Inst) ValueRegs)

;;;; Rules for `iconst` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `i64` and smaller.
(rule (lower (has_type (fits_in_64 ty) (iconst (u64_from_imm64 x))))
      (value_reg (imm ty x)))

;; `i128`
(rule (lower (has_type $I128 (iconst (u64_from_imm64 x))))
      (value_regs (imm $I64 x)
                  (imm $I64 0)))

;;;; Rules for `bconst` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `b64` and smaller.
(rule (lower (has_type (fits_in_64 ty) (bconst $false)))
      (value_reg (imm ty 0)))
(rule (lower (has_type (fits_in_64 ty) (bconst $true)))
      (value_reg (imm ty 1)))

;; `b128`
(rule (lower (has_type $B128 (bconst $false)))
      (value_regs (imm $B64 0) (imm $B64 0)))
(rule (lower (has_type $B128 (bconst $true)))
      (value_regs (imm $B64 1) (imm $B64 0)))

;;;; Rules for `f32const` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (f32const (u64_from_ieee32 x)))
      (value_reg (imm $F32 x)))

;;;; Rules for `f64const` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (f64const (u64_from_ieee64 x)))
      (value_reg (imm $F64 x)))

;;;; Rules for `null` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type ty (null)))
      (value_reg (imm ty 0)))

;;;; Rules for `iadd` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `i64` and smaller.

;; Add two registers.
(rule (lower (has_type (fits_in_64 ty) (iadd x y)))
      (value_reg (add ty
                      (put_in_reg x)
                      (RegMemImm.Reg (put_in_reg y)))))

;; Add a register and an immediate.
(rule (lower (has_type (fits_in_64 ty) (iadd x (simm32_from_value y))))
      (value_reg (add ty (put_in_reg x) y)))
(rule (lower (has_type (fits_in_64 ty) (iadd (simm32_from_value x) y)))
      (value_reg (add ty (put_in_reg y) x)))

;; Add a register and memory.
(rule (lower (has_type (fits_in_64 ty) (iadd x (sinkable_load y))))
      (value_reg (add ty (put_in_reg x) (sink_load y))))
(rule (lower (has_type (fits_in_64 ty) (iadd (sinkable_load x) y)))
      (value_reg (add ty (put_in_reg y) (sink_load x))))

;; SSE.
(rule (lower (has_type (multi_lane 8 16) (iadd x y)))
      (value_xmm (paddb (put_in_xmm x) (put_in_xmm_mem y))))
(rule (lower (has_type (multi_lane 16 8) (iadd x y)))
      (value_xmm (paddw (put_in_xmm x) (put_in_xmm_mem y))))
(rule (lower (has_type (multi_lane 32 4) (iadd x y)))
      (value_xmm (paddd (put_in_xmm x) (put_in_xmm_mem y))))
(rule (lower (has_type (multi_lane 64 2) (iadd x y)))
      (value_xmm (paddq (put_in_xmm x) (put_in_xmm_mem y))))

;; `i128`
(rule (lower (has_type $I128 (iadd x y)))
      ;; Get the high/low registers for `x`.
      (let ((x_regs ValueRegs (put_in_regs x))
            (x_lo Reg (value_regs_get x_regs 0))
            (x_hi Reg (value_regs_get x_regs 1)))
        ;; Get the high/low registers for `y`.
        (let ((y_regs ValueRegs (put_in_regs y))
              (y_lo Reg (value_regs_get y_regs 0))
              (y_hi Reg (value_regs_get y_regs 1)))
          ;; Do an add followed by an add-with-carry.
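          ;; (`with_flags` pairs the flag-producing `add` with the
          ;; flag-consuming `adc` so the two are emitted together and no
          ;; flag-clobbering instruction can come between them.)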
          (with_flags (add_with_flags $I64 x_lo (RegMemImm.Reg y_lo))
                      (adc $I64 x_hi (RegMemImm.Reg y_hi))))))

;;;; Rules for `sadd_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type (multi_lane 8 16) (sadd_sat x y)))
      (value_xmm (paddsb (put_in_xmm x) (put_in_xmm_mem y))))
(rule (lower (has_type (multi_lane 16 8) (sadd_sat x y)))
      (value_xmm (paddsw (put_in_xmm x) (put_in_xmm_mem y))))

;;;; Rules for `uadd_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type (multi_lane 8 16) (uadd_sat x y)))
      (value_xmm (paddusb (put_in_xmm x) (put_in_xmm_mem y))))
(rule (lower (has_type (multi_lane 16 8) (uadd_sat x y)))
      (value_xmm (paddusw (put_in_xmm x) (put_in_xmm_mem y))))

;;;; Rules for `iadd_ifcout` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; N.B.: the second output of `iadd_ifcout` is meant to be the
;; `iflags` value containing the carry result. However, we plan to
;; replace this with a bool carry flag, and all consumers of `iflags`
;; remain in the handwritten pattern-matching code and explicitly
;; match on the flags producer. So we can get away with just
;; allocating a second temp so that the reg-renaming code does the
;; right thing, for now. For safety, we assert elsewhere that no one
;; actually uses the register assigned to the SSA `iflags`-typed
;; `Value`.

;; Add two registers.
(rule (lower (has_type (fits_in_64 ty) (iadd_ifcout x y)))
      (let ((unused_iflags Reg (writable_reg_to_reg (temp_writable_reg $I64))))
        (value_regs (add ty (put_in_reg x) (RegMemImm.Reg (put_in_reg y)))
                    unused_iflags)))

;; Add a register and an immediate.
(rule (lower (has_type (fits_in_64 ty) (iadd_ifcout x (simm32_from_value y))))
      (let ((unused_iflags Reg (writable_reg_to_reg (temp_writable_reg $I64))))
        (value_regs (add ty (put_in_reg x) y)
                    unused_iflags)))
(rule (lower (has_type (fits_in_64 ty) (iadd_ifcout (simm32_from_value x) y)))
      (let ((unused_iflags Reg (writable_reg_to_reg (temp_writable_reg $I64))))
        (value_regs (add ty (put_in_reg y) x)
                    unused_iflags)))

;; Add a register and memory.
(rule (lower (has_type (fits_in_64 ty) (iadd_ifcout x (sinkable_load y))))
      (let ((unused_iflags Reg (writable_reg_to_reg (temp_writable_reg $I64))))
        (value_regs (add ty (put_in_reg x) (sink_load y))
                    unused_iflags)))
(rule (lower (has_type (fits_in_64 ty) (iadd_ifcout (sinkable_load x) y)))
      (let ((unused_iflags Reg (writable_reg_to_reg (temp_writable_reg $I64))))
        (value_regs (add ty (put_in_reg y) (sink_load x))
                    unused_iflags)))

;; (No `iadd_ifcout` for `i128`.)

;;;; Rules for `iadd_imm` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `i64` and smaller.

;; When the immediate fits in a `RegMemImm.Imm`, use that.
(rule (lower (has_type (fits_in_64 ty) (iadd_imm y (simm32_from_imm64 x))))
      (value_reg (add ty (put_in_reg y) x)))
;; Otherwise, put the immediate into a register.
(rule (lower (has_type (fits_in_64 ty) (iadd_imm y (u64_from_imm64 x))))
      (value_reg (add ty (put_in_reg y) (RegMemImm.Reg (imm ty x)))))

;; `i128`

;; When the immediate fits in a `RegMemImm.Imm`, use that.
(rule (lower (has_type $I128 (iadd_imm y (simm32_from_imm64 x))))
      (let ((y_regs ValueRegs (put_in_regs y))
            (y_lo Reg (value_regs_get y_regs 0))
            (y_hi Reg (value_regs_get y_regs 1)))
        (with_flags (add_with_flags $I64 y_lo x)
                    (adc $I64 y_hi (RegMemImm.Imm 0)))))
;; Otherwise, put the immediate into a register.
(rule (lower (has_type $I128 (iadd_imm y (u64_from_imm64 x))))
      (let ((y_regs ValueRegs (put_in_regs y))
            (y_lo Reg (value_regs_get y_regs 0))
            (y_hi Reg (value_regs_get y_regs 1))
            (x_lo Reg (imm $I64 x)))
        (with_flags (add_with_flags $I64 y_lo (RegMemImm.Reg x_lo))
                    (adc $I64 y_hi (RegMemImm.Imm 0)))))

;;;; Rules for `isub` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `i64` and smaller.

;; Sub two registers.
(rule (lower (has_type (fits_in_64 ty) (isub x y)))
      (value_reg (sub ty
                      (put_in_reg x)
                      (RegMemImm.Reg (put_in_reg y)))))

;; Sub a register and an immediate.
(rule (lower (has_type (fits_in_64 ty) (isub x (simm32_from_value y))))
      (value_reg (sub ty (put_in_reg x) y)))

;; Sub a register and memory.
(rule (lower (has_type (fits_in_64 ty) (isub x (sinkable_load y))))
      (value_reg (sub ty (put_in_reg x) (sink_load y))))

;; SSE.
(rule (lower (has_type (multi_lane 8 16) (isub x y)))
      (value_xmm (psubb (put_in_xmm x) (put_in_xmm_mem y))))
(rule (lower (has_type (multi_lane 16 8) (isub x y)))
      (value_xmm (psubw (put_in_xmm x) (put_in_xmm_mem y))))
(rule (lower (has_type (multi_lane 32 4) (isub x y)))
      (value_xmm (psubd (put_in_xmm x) (put_in_xmm_mem y))))
(rule (lower (has_type (multi_lane 64 2) (isub x y)))
      (value_xmm (psubq (put_in_xmm x) (put_in_xmm_mem y))))

;; `i128`
(rule (lower (has_type $I128 (isub x y)))
      ;; Get the high/low registers for `x`.
      (let ((x_regs ValueRegs (put_in_regs x))
            (x_lo Reg (value_regs_get x_regs 0))
            (x_hi Reg (value_regs_get x_regs 1)))
        ;; Get the high/low registers for `y`.
        (let ((y_regs ValueRegs (put_in_regs y))
              (y_lo Reg (value_regs_get y_regs 0))
              (y_hi Reg (value_regs_get y_regs 1)))
          ;; Do a sub followed by a sub-with-borrow.
          (with_flags (sub_with_flags $I64 x_lo (RegMemImm.Reg y_lo))
                      (sbb $I64 x_hi (RegMemImm.Reg y_hi))))))

;;;; Rules for `ssub_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type (multi_lane 8 16) (ssub_sat x y)))
      (value_xmm (psubsb (put_in_xmm x) (put_in_xmm_mem y))))
(rule (lower (has_type (multi_lane 16 8) (ssub_sat x y)))
      (value_xmm (psubsw (put_in_xmm x) (put_in_xmm_mem y))))

;;;; Rules for `usub_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type (multi_lane 8 16) (usub_sat x y)))
      (value_xmm (psubusb (put_in_xmm x) (put_in_xmm_mem y))))
(rule (lower (has_type (multi_lane 16 8) (usub_sat x y)))
      (value_xmm (psubusw (put_in_xmm x) (put_in_xmm_mem y))))

;;;; Rules for `band` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `{i,b}64` and smaller.

;; And two registers.
(rule (lower (has_type (fits_in_64 ty) (band x y)))
      (value_reg (x64_and ty
                          (put_in_reg x)
                          (RegMemImm.Reg (put_in_reg y)))))

;; And with a memory operand.
(rule (lower (has_type (fits_in_64 ty) (band x (sinkable_load y))))
      (value_reg (x64_and ty (put_in_reg x) (sink_load y))))
(rule (lower (has_type (fits_in_64 ty) (band (sinkable_load x) y)))
      (value_reg (x64_and ty (put_in_reg y) (sink_load x))))

;; And with an immediate.
(rule (lower (has_type (fits_in_64 ty) (band x (simm32_from_value y))))
      (value_reg (x64_and ty (put_in_reg x) y)))
(rule (lower (has_type (fits_in_64 ty) (band (simm32_from_value x) y)))
      (value_reg (x64_and ty (put_in_reg y) x)))

;; SSE.

(decl sse_and (Type Xmm XmmMem) Xmm)
(rule (sse_and $F32X4 x y) (andps x y))
(rule (sse_and $F64X2 x y) (andpd x y))
(rule (sse_and (multi_lane _bits _lanes) x y) (pand x y))

(rule (lower (has_type ty @ (multi_lane _bits _lanes) (band x y)))
      (value_xmm (sse_and ty (put_in_xmm x) (put_in_xmm_mem y))))

;; `{i,b}128`.
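;; For 128-bit values the two 64-bit halves are independent for bitwise
;; operations, so we simply `and` the low halves and the high halves
;; separately.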
(rule (lower (has_type $I128 (band x y)))
      (let ((x_regs ValueRegs (put_in_regs x))
            (x_lo Reg (value_regs_get x_regs 0))
            (x_hi Reg (value_regs_get x_regs 1))
            (y_regs ValueRegs (put_in_regs y))
            (y_lo Reg (value_regs_get y_regs 0))
            (y_hi Reg (value_regs_get y_regs 1)))
        (value_regs (x64_and $I64 x_lo (RegMemImm.Reg y_lo))
                    (x64_and $I64 x_hi (RegMemImm.Reg y_hi)))))

(rule (lower (has_type $B128 (band x y)))
      ;; Booleans are always `0` or `1`, so we only need to do the `and` on the
      ;; low half. The high half is always zero but, rather than generate a new
      ;; zero, we just reuse `x`'s high half which is already zero.
      (let ((x_regs ValueRegs (put_in_regs x))
            (x_lo Reg (value_regs_get x_regs 0))
            (x_hi Reg (value_regs_get x_regs 1))
            (y_lo Reg (lo_reg y)))
        (value_regs (x64_and $I64 x_lo (RegMemImm.Reg y_lo))
                    x_hi)))

;;;; Rules for `bor` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `{i,b}64` and smaller.

;; Or two registers.
(rule (lower (has_type (fits_in_64 ty) (bor x y)))
      (value_reg (or ty
                     (put_in_reg x)
                     (RegMemImm.Reg (put_in_reg y)))))

;; Or with a memory operand.
(rule (lower (has_type (fits_in_64 ty) (bor x (sinkable_load y))))
      (value_reg (or ty (put_in_reg x) (sink_load y))))
(rule (lower (has_type (fits_in_64 ty) (bor (sinkable_load x) y)))
      (value_reg (or ty (put_in_reg y) (sink_load x))))

;; Or with an immediate.
(rule (lower (has_type (fits_in_64 ty) (bor x (simm32_from_value y))))
      (value_reg (or ty (put_in_reg x) y)))
(rule (lower (has_type (fits_in_64 ty) (bor (simm32_from_value x) y)))
      (value_reg (or ty (put_in_reg y) x)))

;; SSE.

(decl sse_or (Type Xmm XmmMem) Xmm)
(rule (sse_or $F32X4 x y) (orps x y))
(rule (sse_or $F64X2 x y) (orpd x y))
(rule (sse_or (multi_lane _bits _lanes) x y) (por x y))

(rule (lower (has_type ty @ (multi_lane _bits _lanes) (bor x y)))
      (value_xmm (sse_or ty (put_in_xmm x) (put_in_xmm_mem y))))

;; `{i,b}128`.

(decl or_i128 (ValueRegs ValueRegs) ValueRegs)
(rule (or_i128 x y)
      (let ((x_lo Reg (value_regs_get x 0))
            (x_hi Reg (value_regs_get x 1))
            (y_lo Reg (value_regs_get y 0))
            (y_hi Reg (value_regs_get y 1)))
        (value_regs (or $I64 x_lo (RegMemImm.Reg y_lo))
                    (or $I64 x_hi (RegMemImm.Reg y_hi)))))

(rule (lower (has_type $I128 (bor x y)))
      (or_i128 (put_in_regs x) (put_in_regs y)))

(rule (lower (has_type $B128 (bor x y)))
      ;; Booleans are always `0` or `1`, so we only need to do the `or` on the
      ;; low half. The high half is always zero but, rather than generate a new
      ;; zero, we just reuse `x`'s high half which is already zero.
      (let ((x_regs ValueRegs (put_in_regs x))
            (x_lo Reg (value_regs_get x_regs 0))
            (x_hi Reg (value_regs_get x_regs 1))
            (y_lo Reg (lo_reg y)))
        (value_regs (or $I64 x_lo (RegMemImm.Reg y_lo))
                    x_hi)))

;;;; Rules for `bxor` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `{i,b}64` and smaller.

;; Xor two registers.
(rule (lower (has_type (fits_in_64 ty) (bxor x y)))
      (value_reg (xor ty
                      (put_in_reg x)
                      (RegMemImm.Reg (put_in_reg y)))))

;; Xor with a memory operand.
(rule (lower (has_type (fits_in_64 ty) (bxor x (sinkable_load y))))
      (value_reg (xor ty (put_in_reg x) (sink_load y))))
(rule (lower (has_type (fits_in_64 ty) (bxor (sinkable_load x) y)))
      (value_reg (xor ty (put_in_reg y) (sink_load x))))

;; Xor with an immediate.
(rule (lower (has_type (fits_in_64 ty) (bxor x (simm32_from_value y))))
      (value_reg (xor ty (put_in_reg x) y)))
(rule (lower (has_type (fits_in_64 ty) (bxor (simm32_from_value x) y)))
      (value_reg (xor ty (put_in_reg y) x)))

;; SSE.
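;; (`sse_xor`, used below and in `bnot`, is assumed to be the type-dispatching
;; analogue of `sse_and`/`sse_or` above, selecting `xorps`/`xorpd`/`pxor`; its
;; definition lives elsewhere in the backend.)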
(rule (lower (has_type ty @ (multi_lane _bits _lanes) (bxor x y)))
      (value_xmm (sse_xor ty (put_in_xmm x) (put_in_xmm_mem y))))

;; `{i,b}128`.

(rule (lower (has_type $I128 (bxor x y)))
      (let ((x_regs ValueRegs (put_in_regs x))
            (x_lo Reg (value_regs_get x_regs 0))
            (x_hi Reg (value_regs_get x_regs 1))
            (y_regs ValueRegs (put_in_regs y))
            (y_lo Reg (value_regs_get y_regs 0))
            (y_hi Reg (value_regs_get y_regs 1)))
        (value_regs (xor $I64 x_lo (RegMemImm.Reg y_lo))
                    (xor $I64 x_hi (RegMemImm.Reg y_hi)))))

(rule (lower (has_type $B128 (bxor x y)))
      ;; Booleans are always `0` or `1`, so we only need to do the `xor` on the
      ;; low half. The high half is always zero but, rather than generate a new
      ;; zero, we just reuse `x`'s high half which is already zero.
      (let ((x_regs ValueRegs (put_in_regs x))
            (x_lo Reg (value_regs_get x_regs 0))
            (x_hi Reg (value_regs_get x_regs 1))
            (y_lo Reg (lo_reg y)))
        (value_regs (xor $I64 x_lo (RegMemImm.Reg y_lo))
                    x_hi)))

;;;; Rules for `ishl` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `i64` and smaller.
(rule (lower (has_type (fits_in_64 ty) (ishl src amt)))
      (value_reg (shl ty (put_in_reg src) (put_masked_in_imm8_reg amt ty))))

;; `i128`.

(decl shl_i128 (ValueRegs Reg) ValueRegs)
(rule (shl_i128 src amt)
      ;; Unpack the registers that make up the 128-bit value being shifted.
      (let ((src_lo Reg (value_regs_get src 0))
            (src_hi Reg (value_regs_get src 1))
            ;; Do two 64-bit shifts.
            (lo_shifted Reg (shl $I64 src_lo (Imm8Reg.Reg amt)))
            (hi_shifted Reg (shl $I64 src_hi (Imm8Reg.Reg amt)))
            ;; `src_lo >> (64 - amt)` are the bits to carry over from the lo
            ;; into the hi.
            (carry Reg (shr $I64
                            src_lo
                            (Imm8Reg.Reg (sub $I64
                                              (imm $I64 64)
                                              (RegMemImm.Reg amt)))))
            (zero Reg (imm $I64 0))
            ;; Nullify the carry if we are shifting by a multiple of 128.
            (carry_ Reg (with_flags_1 (test (OperandSize.Size64)
                                            (RegMemImm.Imm 127)
                                            amt)
                                      (cmove $I64 (CC.Z) (RegMem.Reg zero) carry)))
            ;; Add the carry into the high half.
            (hi_shifted_ Reg (or $I64 carry_ (RegMemImm.Reg hi_shifted))))
        ;; Combine the two shifted halves. However, if we are shifting by >= 64
        ;; (modulo 128), then the low bits are zero and the high bits are our
        ;; low bits.
        (with_flags_2 (test (OperandSize.Size64) (RegMemImm.Imm 64) amt)
                      (cmove $I64 (CC.Z) (RegMem.Reg lo_shifted) zero)
                      (cmove $I64 (CC.Z) (RegMem.Reg hi_shifted_) lo_shifted))))

(rule (lower (has_type $I128 (ishl src amt)))
      ;; NB: Only the low bits of `amt` matter since we logically mask the shift
      ;; amount to the value's bit width.
      (let ((amt_ Reg (lo_reg amt)))
        (shl_i128 (put_in_regs src) amt_)))

;; SSE.

;; Since the x86 instruction set does not have any 8x16 shift instructions (even
;; in higher feature sets like AVX), we lower the `ishl.i8x16` to a sequence of
;; instructions. The basic idea, whether the amount to shift by is an immediate
;; or not, is to use a 16x8 shift and then mask off the incorrect bits to 0s.
(rule (lower (has_type $I8X16 (ishl src amt)))
      (let ((src_ Xmm (put_in_xmm src))
            (amt_gpr RegMemImm (put_in_reg_mem_imm amt))
            (amt_xmm XmmMemImm (mov_rmi_to_xmm amt_gpr))
            ;; Shift `src` using 16x8. Unfortunately, a 16x8 shift will only be
            ;; correct for half of the lanes; the others must be fixed up with
            ;; the mask below.
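            ;; (For example, assuming a shift amount of 3: `psllw` leaves the
            ;; low three bits of the upper byte of each 16-bit pair holding
            ;; bits that leaked in from the lower byte, so the result is ANDed
            ;; with a mask of `0xF8` bytes to force those bits back to zero.)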
            (unmasked Xmm (psllw src_ amt_xmm))
            (mask_addr SyntheticAmode (ishl_i8x16_mask amt_gpr))
            (mask Reg (x64_load $I8X16 mask_addr (ExtKind.None))))
        (value_xmm (sse_and $I8X16
                            unmasked
                            (reg_mem_to_xmm_mem (RegMem.Reg mask))))))

;; Get the address of the mask to use when fixing up the lanes that weren't
;; correctly generated by the 16x8 shift.
(decl ishl_i8x16_mask (RegMemImm) SyntheticAmode)

;; When the shift amount is known, we can statically (i.e. at compile time)
;; determine the mask to use and only emit that.
(decl ishl_i8x16_mask_for_const (u32) SyntheticAmode)
(extern constructor ishl_i8x16_mask_for_const ishl_i8x16_mask_for_const)
(rule (ishl_i8x16_mask (RegMemImm.Imm amt))
      (ishl_i8x16_mask_for_const amt))

;; Otherwise, we must emit the entire mask table and dynamically (i.e. at run
;; time) find the correct mask offset in the table. We use `lea` to find the
;; base address of the mask table and then complex addressing to offset to the
;; right mask: `base_address + amt << 4`
(decl ishl_i8x16_mask_table () SyntheticAmode)
(extern constructor ishl_i8x16_mask_table ishl_i8x16_mask_table)
(rule (ishl_i8x16_mask (RegMemImm.Reg amt))
      (let ((mask_table SyntheticAmode (ishl_i8x16_mask_table))
            (base_mask_addr Gpr (lea mask_table))
            (mask_offset Reg (shl $I64 amt (Imm8Reg.Imm8 4))))
        (amode_to_synthetic_amode (amode_imm_reg_reg_shift 0
                                                           base_mask_addr
                                                           (gpr_new mask_offset)
                                                           0))))
(rule (ishl_i8x16_mask (RegMemImm.Mem amt))
      (ishl_i8x16_mask (RegMemImm.Reg (x64_load $I64 amt (ExtKind.None)))))

;; 16x8, 32x4, and 64x2 shifts can each use a single instruction.
(rule (lower (has_type $I16X8 (ishl src amt)))
      (value_xmm (psllw (put_in_xmm src)
                        (mov_rmi_to_xmm (put_in_reg_mem_imm amt)))))
(rule (lower (has_type $I32X4 (ishl src amt)))
      (value_xmm (pslld (put_in_xmm src)
                        (mov_rmi_to_xmm (put_in_reg_mem_imm amt)))))
(rule (lower (has_type $I64X2 (ishl src amt)))
      (value_xmm (psllq (put_in_xmm src)
                        (mov_rmi_to_xmm (put_in_reg_mem_imm amt)))))

;;;; Rules for `ushr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `i64` and smaller.
(rule (lower (has_type (fits_in_64 ty) (ushr src amt)))
      (let ((src_ Reg (extend_to_reg src ty (ExtendKind.Zero))))
        (value_reg (shr ty src_ (put_masked_in_imm8_reg amt ty)))))

;; `i128`.

(decl shr_i128 (ValueRegs Reg) ValueRegs)
(rule (shr_i128 src amt)
      ;; Unpack the lo/hi halves of `src`.
      (let ((src_lo Reg (value_regs_get src 0))
            (src_hi Reg (value_regs_get src 1))
            ;; Do a shift on each half.
            (lo_shifted Reg (shr $I64 src_lo (Imm8Reg.Reg amt)))
            (hi_shifted Reg (shr $I64 src_hi (Imm8Reg.Reg amt)))
            ;; `src_hi << (64 - amt)` are the bits to carry over from the hi
            ;; into the lo.
            (carry Reg (shl $I64
                            src_hi
                            (Imm8Reg.Reg (sub $I64
                                              (imm $I64 64)
                                              (RegMemImm.Reg amt)))))
            ;; Nullify the carry if we are shifting by a multiple of 128.
            (carry_ Reg (with_flags_1 (test (OperandSize.Size64)
                                            (RegMemImm.Imm 127)
                                            amt)
                                      (cmove $I64 (CC.Z) (RegMem.Reg (imm $I64 0)) carry)))
            ;; Add the carry bits into the lo.
            (lo_shifted_ Reg (or $I64 carry_ (RegMemImm.Reg lo_shifted))))
        ;; Combine the two shifted halves. However, if we are shifting by >= 64
        ;; (modulo 128), then the hi bits are zero and the lo bits are what
        ;; would otherwise be our hi bits.
        (with_flags_2 (test (OperandSize.Size64) (RegMemImm.Imm 64) amt)
                      (cmove $I64 (CC.Z) (RegMem.Reg lo_shifted_) hi_shifted)
                      (cmove $I64 (CC.Z) (RegMem.Reg hi_shifted) (imm $I64 0)))))

(rule (lower (has_type $I128 (ushr src amt)))
      ;; NB: Only the low bits of `amt` matter since we logically mask the shift
      ;; amount to the value's bit width.
      (let ((amt_ Reg (lo_reg amt)))
        (shr_i128 (put_in_regs src) amt_)))

;; SSE.

;; There are no 8x16 shifts in x64. Do the same 16x8-shift-and-mask thing we do
;; with 8x16 `ishl`.
(rule (lower (has_type $I8X16 (ushr src amt)))
      (let ((src_ Xmm (put_in_xmm src))
            (amt_gpr RegMemImm (put_in_reg_mem_imm amt))
            (amt_xmm XmmMemImm (mov_rmi_to_xmm amt_gpr))
            ;; Shift `src` using 16x8. Unfortunately, a 16x8 shift will only be
            ;; correct for half of the lanes; the others must be fixed up with
            ;; the mask below.
            (unmasked Xmm (psrlw src_ amt_xmm))
            (mask_addr SyntheticAmode (ushr_i8x16_mask amt_gpr))
            (mask Reg (x64_load $I8X16 mask_addr (ExtKind.None))))
        (value_xmm (sse_and $I8X16
                            unmasked
                            (reg_mem_to_xmm_mem (RegMem.Reg mask))))))

;; Get the address of the mask to use when fixing up the lanes that weren't
;; correctly generated by the 16x8 shift.
(decl ushr_i8x16_mask (RegMemImm) SyntheticAmode)

;; When the shift amount is known, we can statically (i.e. at compile time)
;; determine the mask to use and only emit that.
(decl ushr_i8x16_mask_for_const (u32) SyntheticAmode)
(extern constructor ushr_i8x16_mask_for_const ushr_i8x16_mask_for_const)
(rule (ushr_i8x16_mask (RegMemImm.Imm amt))
      (ushr_i8x16_mask_for_const amt))

;; Otherwise, we must emit the entire mask table and dynamically (i.e. at run
;; time) find the correct mask offset in the table. We use `lea` to find the
;; base address of the mask table and then complex addressing to offset to the
;; right mask: `base_address + amt << 4`
(decl ushr_i8x16_mask_table () SyntheticAmode)
(extern constructor ushr_i8x16_mask_table ushr_i8x16_mask_table)
(rule (ushr_i8x16_mask (RegMemImm.Reg amt))
      (let ((mask_table SyntheticAmode (ushr_i8x16_mask_table))
            (base_mask_addr Gpr (lea mask_table))
            (mask_offset Reg (shl $I64 amt (Imm8Reg.Imm8 4))))
        (amode_to_synthetic_amode (amode_imm_reg_reg_shift 0
                                                           base_mask_addr
                                                           (gpr_new mask_offset)
                                                           0))))
(rule (ushr_i8x16_mask (RegMemImm.Mem amt))
      (ushr_i8x16_mask (RegMemImm.Reg (x64_load $I64 amt (ExtKind.None)))))

;; 16x8, 32x4, and 64x2 shifts can each use a single instruction.
(rule (lower (has_type $I16X8 (ushr src amt)))
      (value_xmm (psrlw (put_in_xmm src)
                        (mov_rmi_to_xmm (put_in_reg_mem_imm amt)))))
(rule (lower (has_type $I32X4 (ushr src amt)))
      (value_xmm (psrld (put_in_xmm src)
                        (mov_rmi_to_xmm (put_in_reg_mem_imm amt)))))
(rule (lower (has_type $I64X2 (ushr src amt)))
      (value_xmm (psrlq (put_in_xmm src)
                        (mov_rmi_to_xmm (put_in_reg_mem_imm amt)))))

;;;; Rules for `sshr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `i64` and smaller.
(rule (lower (has_type (fits_in_64 ty) (sshr src amt)))
      (let ((src_ Reg (extend_to_reg src ty (ExtendKind.Sign))))
        (value_reg (sar ty src_ (put_masked_in_imm8_reg amt ty)))))

;; `i128`.

(decl sar_i128 (ValueRegs Reg) ValueRegs)
(rule (sar_i128 src amt)
      ;; Unpack the low/high halves of `src`.
      (let ((src_lo Reg (value_regs_get src 0))
            (src_hi Reg (value_regs_get src 1))
            ;; Do a shift of each half. NB: the low half uses an unsigned shift
            ;; because its MSB is not a sign bit.
            (lo_shifted Reg (shr $I64 src_lo (Imm8Reg.Reg amt)))
            (hi_shifted Reg (sar $I64 src_hi (Imm8Reg.Reg amt)))
            ;; `src_hi << (64 - amt)` are the bits to carry over from the high
            ;; half into the low half.
            (carry Reg (shl $I64
                            src_hi
                            (Imm8Reg.Reg (sub $I64
                                              (imm $I64 64)
                                              (RegMemImm.Reg amt)))))
            ;; Nullify the carry if we are shifting by a multiple of 128.
            (carry_ Reg (with_flags_1 (test (OperandSize.Size64)
                                            (RegMemImm.Imm 127)
                                            amt)
                                      (cmove $I64 (CC.Z) (RegMem.Reg (imm $I64 0)) carry)))
            ;; Add the carry into the low half.
            (lo_shifted_ Reg (or $I64 lo_shifted (RegMemImm.Reg carry_)))
            ;; Get all sign bits.
            (sign_bits Reg (sar $I64 src_hi (Imm8Reg.Imm8 63))))
        ;; Combine the two shifted halves. However, if we are shifting by >= 64
        ;; (modulo 128), then the hi bits are all sign bits and the lo bits are
        ;; what would otherwise be our hi bits.
        (with_flags_2 (test (OperandSize.Size64) (RegMemImm.Imm 64) amt)
                      (cmove $I64 (CC.Z) (RegMem.Reg lo_shifted_) hi_shifted)
                      (cmove $I64 (CC.Z) (RegMem.Reg hi_shifted) sign_bits))))

(rule (lower (has_type $I128 (sshr src amt)))
      ;; NB: Only the low bits of `amt` matter since we logically mask the shift
      ;; amount to the value's bit width.
      (let ((amt_ Reg (lo_reg amt)))
        (sar_i128 (put_in_regs src) amt_)))

;; SSE.

;; Since the x86 instruction set does not have an 8x16 shift instruction and the
;; approach used for `ishl` and `ushr` cannot be easily used (the masks do not
;; preserve the sign), we use a different approach here: separate the low and
;; high lanes, shift them separately, and merge them into the final result.
;;
;; Visually, this looks like the following, where
;; `src.i8x16 = [s0, s1, ..., s15]`:
;;
;;   lo.i16x8 = [(s0, s0), (s1, s1), ..., (s7, s7)]
;;   shifted_lo.i16x8 = shift each lane of `low`
;;   hi.i16x8 = [(s8, s8), (s9, s9), ..., (s15, s15)]
;;   shifted_hi.i16x8 = shift each lane of `high`
;;   result = [s0'', s1'', ..., s15'']
(rule (lower (has_type $I8X16 (sshr src amt @ (value_type amt_ty))))
      (let ((src_ Xmm (put_in_xmm src))
            ;; In order for `packsswb` later to only use the high byte of each
            ;; 16x8 lane, we shift right an extra 8 bits, relying on `psraw` to
            ;; fill in the upper bits appropriately.
            (lo Xmm (punpcklbw src_ (xmm_to_xmm_mem src_)))
            (hi Xmm (punpckhbw src_ (xmm_to_xmm_mem src_)))
            (amt_ XmmMemImm (sshr_i8x16_bigger_shift amt_ty (put_in_reg_mem_imm amt)))
            (shifted_lo Xmm (psraw lo amt_))
            (shifted_hi Xmm (psraw hi amt_)))
        (value_xmm (packsswb shifted_lo (xmm_to_xmm_mem shifted_hi)))))

(decl sshr_i8x16_bigger_shift (Type RegMemImm) XmmMemImm)
(rule (sshr_i8x16_bigger_shift _ty (RegMemImm.Imm i))
      (xmm_mem_imm_new (RegMemImm.Imm (u32_add i 8))))
(rule (sshr_i8x16_bigger_shift ty (RegMemImm.Reg r))
      (mov_rmi_to_xmm (RegMemImm.Reg (add ty r (RegMemImm.Imm 8)))))
(rule (sshr_i8x16_bigger_shift ty rmi @ (RegMemImm.Mem _m))
      (mov_rmi_to_xmm (RegMemImm.Reg (add ty (imm ty 8) rmi))))

;; `sshr.{i16x8,i32x4}` can be a simple `psra{w,d}`, we just have to make sure
;; that if the shift amount is in a register, it is in an XMM register.
(rule (lower (has_type $I16X8 (sshr src amt)))
      (value_xmm (psraw (put_in_xmm src)
                        (mov_rmi_to_xmm (put_in_reg_mem_imm amt)))))
(rule (lower (has_type $I32X4 (sshr src amt)))
      (value_xmm (psrad (put_in_xmm src)
                        (mov_rmi_to_xmm (put_in_reg_mem_imm amt)))))

;; The `sshr.i64x2` CLIF instruction has no single x86 instruction in the older
;; feature sets. Newer ones like AVX512VL + AVX512F include `vpsraq`, a 128-bit
;; instruction that would fit here, but this backend does not currently have
;; support for EVEX encodings. To remedy this, we extract each 64-bit lane to a
;; GPR, shift each using a scalar instruction, and insert the shifted values
;; back in the `dst` XMM register.
;;
;; (TODO: when EVEX support is available, add an alternate lowering here).
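;; (A sketch of the sequence below: both lanes are extracted with `pextrd`,
;; shifted with a scalar `sar` by the same masked amount, and then reassembled
;; into an XMM register.)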
(rule (lower (has_type $I64X2 (sshr src amt)))
      (let ((src_ Xmm (put_in_xmm src))
            (lo Gpr (pextrd $I64 src_ 0))
            (hi Gpr (pextrd $I64 src_ 1))
            (amt_ Imm8Reg (put_masked_in_imm8_reg amt $I64))
            (shifted_lo Reg (sar $I64 (gpr_to_reg lo) amt_))
            (shifted_hi Reg (sar $I64 (gpr_to_reg hi) amt_)))
        (value_xmm (make_i64x2_from_lanes (reg_mem_to_gpr_mem (RegMem.Reg shifted_lo))
                                          (reg_mem_to_gpr_mem (RegMem.Reg shifted_hi))))))

;;;; Rules for `rotl` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `i16` and `i8`: we need to extend the shift amount, or mask the
;; constant.
(rule (lower (has_type (ty_8_or_16 ty) (rotl src amt)))
      (let ((amt_ Reg (extend_to_reg amt $I32 (ExtendKind.Zero))))
        (value_reg (x64_rotl ty (put_in_reg src) (Imm8Reg.Reg amt_)))))
(rule (lower (has_type (ty_8_or_16 ty) (rotl src (u64_from_iconst amt))))
      (value_reg (x64_rotl ty
                           (put_in_reg src)
                           (const_to_type_masked_imm8 amt ty))))

;; `i64` and `i32`: we can rely on x86's rotate-amount masking since
;; we operate on the whole register.
(rule (lower (has_type (ty_32_or_64 ty) (rotl src amt)))
      ;; NB: Only the low bits of `amt` matter since we logically mask the
      ;; shift amount to the value's bit width.
      (let ((amt_ Reg (lo_reg amt)))
        (value_reg (x64_rotl ty (put_in_reg src) (Imm8Reg.Reg amt_)))))
(rule (lower (has_type (ty_32_or_64 ty) (rotl src (u64_from_iconst amt))))
      (value_reg (x64_rotl ty
                           (put_in_reg src)
                           (const_to_type_masked_imm8 amt ty))))

;; `i128`.
(rule (lower (has_type $I128 (rotl src amt)))
      (let ((src_ ValueRegs (put_in_regs src))
            ;; NB: Only the low bits of `amt` matter since we logically mask the
            ;; rotation amount to the value's bit width.
            (amt_ Reg (lo_reg amt)))
        (or_i128 (shl_i128 src_ amt_)
                 (shr_i128 src_ (sub $I64
                                     (imm $I64 128)
                                     (RegMemImm.Reg amt_))))))

;;;; Rules for `rotr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `i16` and `i8`: we need to extend the shift amount, or mask the
;; constant.
(rule (lower (has_type (ty_8_or_16 ty) (rotr src amt)))
      (let ((amt_ Reg (extend_to_reg amt $I32 (ExtendKind.Zero))))
        (value_reg (x64_rotr ty (put_in_reg src) (Imm8Reg.Reg amt_)))))
(rule (lower (has_type (ty_8_or_16 ty) (rotr src (u64_from_iconst amt))))
      (value_reg (x64_rotr ty
                           (put_in_reg src)
                           (const_to_type_masked_imm8 amt ty))))

;; `i64` and `i32`: we can rely on x86's rotate-amount masking since
;; we operate on the whole register.
(rule (lower (has_type (ty_32_or_64 ty) (rotr src amt)))
      ;; NB: Only the low bits of `amt` matter since we logically mask the
      ;; shift amount to the value's bit width.
      (let ((amt_ Reg (lo_reg amt)))
        (value_reg (x64_rotr ty (put_in_reg src) (Imm8Reg.Reg amt_)))))
(rule (lower (has_type (ty_32_or_64 ty) (rotr src (u64_from_iconst amt))))
      (value_reg (x64_rotr ty
                           (put_in_reg src)
                           (const_to_type_masked_imm8 amt ty))))

;; `i128`.
(rule (lower (has_type $I128 (rotr src amt)))
      (let ((src_ ValueRegs (put_in_regs src))
            ;; NB: Only the low bits of `amt` matter since we logically mask the
            ;; rotation amount to the value's bit width.
            (amt_ Reg (lo_reg amt)))
        (or_i128 (shr_i128 src_ amt_)
                 (shl_i128 src_ (sub $I64
                                     (imm $I64 128)
                                     (RegMemImm.Reg amt_))))))

;;;; Rules for `ineg` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `i64` and smaller.
(rule (lower (has_type (fits_in_64 ty) (ineg x)))
      (value_gpr (neg ty (put_in_gpr x))))

;; SSE.
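;; Vector negation is lowered as a subtraction from zero: `0 - x`.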
(rule (lower (has_type $I8X16 (ineg x)))
      (value_xmm (psubb (xmm_new (imm $I8X16 0)) (put_in_xmm_mem x))))
(rule (lower (has_type $I16X8 (ineg x)))
      (value_xmm (psubw (xmm_new (imm $I16X8 0)) (put_in_xmm_mem x))))
(rule (lower (has_type $I32X4 (ineg x)))
      (value_xmm (psubd (xmm_new (imm $I32X4 0)) (put_in_xmm_mem x))))
(rule (lower (has_type $I64X2 (ineg x)))
      (value_xmm (psubq (xmm_new (imm $I64X2 0)) (put_in_xmm_mem x))))

;;;; Rules for `avg_round` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type (multi_lane 8 16) (avg_round x y)))
      (value_xmm (pavgb (put_in_xmm x) (put_in_xmm_mem y))))
(rule (lower (has_type (multi_lane 16 8) (avg_round x y)))
      (value_xmm (pavgw (put_in_xmm x) (put_in_xmm_mem y))))

;;;; Rules for `imul` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `i64` and smaller.

;; Multiply two registers.
(rule (lower (has_type (fits_in_64 ty) (imul x y)))
      (value_reg (mul ty
                      (put_in_reg x)
                      (RegMemImm.Reg (put_in_reg y)))))

;; Multiply a register and an immediate.
(rule (lower (has_type (fits_in_64 ty) (imul x (simm32_from_value y))))
      (value_reg (mul ty (put_in_reg x) y)))
(rule (lower (has_type (fits_in_64 ty) (imul (simm32_from_value x) y)))
      (value_reg (mul ty (put_in_reg y) x)))

;; Multiply a register and a memory load.
(rule (lower (has_type (fits_in_64 ty) (imul x (sinkable_load y))))
      (value_reg (mul ty (put_in_reg x) (sink_load y))))
(rule (lower (has_type (fits_in_64 ty) (imul (sinkable_load x) y)))
      (value_reg (mul ty (put_in_reg y) (sink_load x))))

;; `i128`.

;; mul:
;;   dst_lo = lhs_lo * rhs_lo
;;   dst_hi = umulhi(lhs_lo, rhs_lo) +
;;            lhs_lo * rhs_hi +
;;            lhs_hi * rhs_lo
;;
;; so we emit:
;;   lo_hi = mul x_lo, y_hi
;;   hi_lo = mul x_hi, y_lo
;;   hilo_hilo = add lo_hi, hi_lo
;;   dst_lo:hi_lolo = mulhi_u x_lo, y_lo
;;   dst_hi = add hilo_hilo, hi_lolo
;;   return (dst_lo, dst_hi)
(rule (lower (has_type $I128 (imul x y)))
      ;; Put `x` into registers and unpack its hi/lo halves.
      (let ((x_regs ValueRegs (put_in_regs x))
            (x_lo Reg (value_regs_get x_regs 0))
            (x_hi Reg (value_regs_get x_regs 1))
            ;; Put `y` into registers and unpack its hi/lo halves.
            (y_regs ValueRegs (put_in_regs y))
            (y_lo Reg (value_regs_get y_regs 0))
            (y_hi Reg (value_regs_get y_regs 1))
            ;; lo_hi = mul x_lo, y_hi
            (lo_hi Reg (mul $I64 x_lo (RegMemImm.Reg y_hi)))
            ;; hi_lo = mul x_hi, y_lo
            (hi_lo Reg (mul $I64 x_hi (RegMemImm.Reg y_lo)))
            ;; hilo_hilo = add lo_hi, hi_lo
            (hilo_hilo Reg (add $I64 lo_hi (RegMemImm.Reg hi_lo)))
            ;; dst_lo:hi_lolo = mulhi_u x_lo, y_lo
            (mul_regs ValueRegs (mulhi_u $I64 x_lo (RegMem.Reg y_lo)))
            (dst_lo Reg (value_regs_get mul_regs 0))
            (hi_lolo Reg (value_regs_get mul_regs 1))
            ;; dst_hi = add hilo_hilo, hi_lolo
            (dst_hi Reg (add $I64 hilo_hilo (RegMemImm.Reg hi_lolo))))
        (value_regs dst_lo dst_hi)))

;; SSE.

;; (No i8x16 multiply.)
(rule (lower (has_type (multi_lane 16 8) (imul x y)))
      (value_xmm (pmullw (put_in_xmm x) (put_in_xmm_mem y))))
(rule (lower (has_type (multi_lane 32 4) (imul x y)))
      (value_xmm (pmulld (put_in_xmm x) (put_in_xmm_mem y))))

;; With AVX-512 we can implement `i64x2` multiplication with a single
;; instruction.
(rule (lower (has_type (and (avx512vl_enabled)
                            (avx512dq_enabled)
                            (multi_lane 64 2))
                       (imul x y)))
      (value_xmm (vpmullq (put_in_xmm_mem x) (put_in_xmm y))))

;; Otherwise, for i64x2 multiplication we describe a lane A as being composed of
;; a 32-bit upper half "Ah" and a 32-bit lower half "Al".
;; The 32-bit long-hand multiplication can then be written as:
;;
;;      Ah Al
;;    * Bh Bl
;;    -----
;;      Al * Bl
;;    + (Ah * Bl) << 32
;;    + (Al * Bh) << 32
;;
;; So for each lane we will compute:
;;
;;    A * B = (Al * Bl) + ((Ah * Bl) + (Al * Bh)) << 32
;;
;; Note, the algorithm will use `pmuludq` which operates directly on the lower
;; 32 bits (`Al` or `Bl`) of a lane and writes the result to the full 64 bits of
;; the lane of the destination. For this reason we don't need shifts to isolate
;; the lower 32 bits; however, we will need to use shifts to isolate the high
;; 32 bits when doing calculations, i.e., `Ah == A >> 32`.
(rule (lower (has_type (multi_lane 64 2) (imul a b)))
      (let ((a0 Xmm (put_in_xmm a))
            (b0 Xmm (put_in_xmm b))
            ;; a_hi = A >> 32
            (a_hi Xmm (psrlq a0 (xmm_mem_imm_new (RegMemImm.Imm 32))))
            ;; ah_bl = Ah * Bl
            (ah_bl Xmm (pmuludq a_hi (xmm_to_xmm_mem b0)))
            ;; b_hi = B >> 32
            (b_hi Xmm (psrlq b0 (xmm_mem_imm_new (RegMemImm.Imm 32))))
            ;; al_bh = Al * Bh
            (al_bh Xmm (pmuludq a0 (xmm_to_xmm_mem b_hi)))
            ;; aa_bb = ah_bl + al_bh
            (aa_bb Xmm (paddq ah_bl (xmm_to_xmm_mem al_bh)))
            ;; aa_bb_shifted = aa_bb << 32
            (aa_bb_shifted Xmm (psllq aa_bb (xmm_mem_imm_new (RegMemImm.Imm 32))))
            ;; al_bl = Al * Bl
            (al_bl Xmm (pmuludq a0 (xmm_to_xmm_mem b0))))
        ;; al_bl + aa_bb_shifted
        (value_xmm (paddq al_bl (xmm_to_xmm_mem aa_bb_shifted)))))

;; Special case for `i16x8.extmul_high_i8x16_s`.
(rule (lower (has_type (multi_lane 16 8)
                       (imul (def_inst (swiden_high (and (value_type (multi_lane 8 16)) x)))
                             (def_inst (swiden_high (and (value_type (multi_lane 8 16)) y))))))
      (let ((x1 Xmm (put_in_xmm x))
            (x2 Xmm (palignr x1 (xmm_to_xmm_mem x1) 8 (OperandSize.Size32)))
            (x3 Xmm (pmovsxbw (xmm_to_xmm_mem x2)))
            (y1 Xmm (put_in_xmm y))
            (y2 Xmm (palignr y1 (xmm_to_xmm_mem y1) 8 (OperandSize.Size32)))
            (y3 Xmm (pmovsxbw (xmm_to_xmm_mem y2))))
        (value_xmm (pmullw x3 (xmm_to_xmm_mem y3)))))

;; Special case for `i32x4.extmul_high_i16x8_s`.
(rule (lower (has_type (multi_lane 32 4)
                       (imul (def_inst (swiden_high (and (value_type (multi_lane 16 8)) x)))
                             (def_inst (swiden_high (and (value_type (multi_lane 16 8)) y))))))
      (let ((x2 Xmm (put_in_xmm x))
            (y2 Xmm (put_in_xmm y))
            (lo Xmm (pmullw x2 (xmm_to_xmm_mem y2)))
            (hi Xmm (pmulhw x2 (xmm_to_xmm_mem y2))))
        (value_xmm (punpckhwd lo (xmm_to_xmm_mem hi)))))

;; Special case for `i64x2.extmul_high_i32x4_s`.
(rule (lower (has_type (multi_lane 64 2)
                       (imul (def_inst (swiden_high (and (value_type (multi_lane 32 4)) x)))
                             (def_inst (swiden_high (and (value_type (multi_lane 32 4)) y))))))
      (let ((x2 Xmm (pshufd (put_in_xmm_mem x) 0xFA (OperandSize.Size32)))
            (y2 Xmm (pshufd (put_in_xmm_mem y) 0xFA (OperandSize.Size32))))
        (value_xmm (pmuldq x2 (xmm_to_xmm_mem y2)))))

;; Special case for `i16x8.extmul_low_i8x16_s`.
(rule (lower (has_type (multi_lane 16 8)
                       (imul (def_inst (swiden_low (and (value_type (multi_lane 8 16)) x)))
                             (def_inst (swiden_low (and (value_type (multi_lane 8 16)) y))))))
      (let ((x2 Xmm (pmovsxbw (put_in_xmm_mem x)))
            (y2 Xmm (pmovsxbw (put_in_xmm_mem y))))
        (value_xmm (pmullw x2 (xmm_to_xmm_mem y2)))))

;; Special case for `i32x4.extmul_low_i16x8_s`.
(rule (lower (has_type (multi_lane 32 4)
                       (imul (def_inst (swiden_low (and (value_type (multi_lane 16 8)) x)))
                             (def_inst (swiden_low (and (value_type (multi_lane 16 8)) y))))))
      (let ((x2 Xmm (put_in_xmm x))
            (y2 Xmm (put_in_xmm y))
            (lo Xmm (pmullw x2 (xmm_to_xmm_mem y2)))
            (hi Xmm (pmulhw x2 (xmm_to_xmm_mem y2))))
        (value_xmm (punpcklwd lo (xmm_to_xmm_mem hi)))))

;; Special case for `i64x2.extmul_low_i32x4_s`.
(rule (lower (has_type (multi_lane 64 2)
                       (imul (def_inst (swiden_low (and (value_type (multi_lane 32 4)) x)))
                             (def_inst (swiden_low (and (value_type (multi_lane 32 4)) y))))))
      (let ((x2 Xmm (pshufd (put_in_xmm_mem x) 0x50 (OperandSize.Size32)))
            (y2 Xmm (pshufd (put_in_xmm_mem y) 0x50 (OperandSize.Size32))))
        (value_xmm (pmuldq x2 (xmm_to_xmm_mem y2)))))

;; Special case for `i16x8.extmul_high_i8x16_u`.
(rule (lower (has_type (multi_lane 16 8)
                       (imul (def_inst (uwiden_high (and (value_type (multi_lane 8 16)) x)))
                             (def_inst (uwiden_high (and (value_type (multi_lane 8 16)) y))))))
      (let ((x1 Xmm (put_in_xmm x))
            (x2 Xmm (palignr x1 (xmm_to_xmm_mem x1) 8 (OperandSize.Size32)))
            (x3 Xmm (pmovzxbw (xmm_to_xmm_mem x2)))
            (y1 Xmm (put_in_xmm y))
            (y2 Xmm (palignr y1 (xmm_to_xmm_mem y1) 8 (OperandSize.Size32)))
            (y3 Xmm (pmovzxbw (xmm_to_xmm_mem y2))))
        (value_xmm (pmullw x3 (xmm_to_xmm_mem y3)))))

;; Special case for `i32x4.extmul_high_i16x8_u`.
(rule (lower (has_type (multi_lane 32 4)
                       (imul (def_inst (uwiden_high (and (value_type (multi_lane 16 8)) x)))
                             (def_inst (uwiden_high (and (value_type (multi_lane 16 8)) y))))))
      (let ((x2 Xmm (put_in_xmm x))
            (y2 Xmm (put_in_xmm y))
            (lo Xmm (pmullw x2 (xmm_to_xmm_mem y2)))
            (hi Xmm (pmulhuw x2 (xmm_to_xmm_mem y2))))
        (value_xmm (punpckhwd lo (xmm_to_xmm_mem hi)))))

;; Special case for `i64x2.extmul_high_i32x4_u`.
(rule (lower (has_type (multi_lane 64 2)
                       (imul (def_inst (uwiden_high (and (value_type (multi_lane 32 4)) x)))
                             (def_inst (uwiden_high (and (value_type (multi_lane 32 4)) y))))))
      (let ((x2 Xmm (pshufd (put_in_xmm_mem x) 0xFA (OperandSize.Size32)))
            (y2 Xmm (pshufd (put_in_xmm_mem y) 0xFA (OperandSize.Size32))))
        (value_xmm (pmuludq x2 (xmm_to_xmm_mem y2)))))

;; Special case for `i16x8.extmul_low_i8x16_u`.
(rule (lower (has_type (multi_lane 16 8)
                       (imul (def_inst (uwiden_low (and (value_type (multi_lane 8 16)) x)))
                             (def_inst (uwiden_low (and (value_type (multi_lane 8 16)) y))))))
      (let ((x2 Xmm (pmovzxbw (put_in_xmm_mem x)))
            (y2 Xmm (pmovzxbw (put_in_xmm_mem y))))
        (value_xmm (pmullw x2 (xmm_to_xmm_mem y2)))))

;; Special case for `i32x4.extmul_low_i16x8_u`.
(rule (lower (has_type (multi_lane 32 4)
                       (imul (def_inst (uwiden_low (and (value_type (multi_lane 16 8)) x)))
                             (def_inst (uwiden_low (and (value_type (multi_lane 16 8)) y))))))
      (let ((x2 Xmm (put_in_xmm x))
            (y2 Xmm (put_in_xmm y))
            (lo Xmm (pmullw x2 (xmm_to_xmm_mem y2)))
            (hi Xmm (pmulhuw x2 (xmm_to_xmm_mem y2))))
        (value_xmm (punpcklwd lo (xmm_to_xmm_mem hi)))))

;; Special case for `i64x2.extmul_low_i32x4_u`.
(rule (lower (has_type (multi_lane 64 2)
                       (imul (def_inst (uwiden_low (and (value_type (multi_lane 32 4)) x)))
                             (def_inst (uwiden_low (and (value_type (multi_lane 32 4)) y))))))
      (let ((x2 Xmm (pshufd (put_in_xmm_mem x) 0x50 (OperandSize.Size32)))
            (y2 Xmm (pshufd (put_in_xmm_mem y) 0x50 (OperandSize.Size32))))
        (value_xmm (pmuludq x2 (xmm_to_xmm_mem y2)))))

;;;; Rules for `band_not` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(decl sse_and_not (Type Xmm XmmMem) Xmm)
(rule (sse_and_not $F32X4 x y) (andnps x y))
(rule (sse_and_not $F64X2 x y) (andnpd x y))
(rule (sse_and_not (multi_lane _bits _lanes) x y) (pandn x y))

;; Note the flipping of operands below.
;; CLIF specifies
;;
;;    band_not(x, y) = and(x, not(y))
;;
;; while x86 does
;;
;;    pandn(x, y) = and(not(x), y)
(rule (lower (has_type ty (band_not x y)))
      (value_xmm (sse_and_not ty (put_in_xmm y) (put_in_xmm_mem x))))

;;;; Rules for `iabs` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type $I8X16 (iabs x)))
      (value_xmm (pabsb (put_in_xmm_mem x))))
(rule (lower (has_type $I16X8 (iabs x)))
      (value_xmm (pabsw (put_in_xmm_mem x))))
(rule (lower (has_type $I32X4 (iabs x)))
      (value_xmm (pabsd (put_in_xmm_mem x))))

;; When AVX512 is available, we can use a single `vpabsq` instruction.
(rule (lower (has_type (and (avx512vl_enabled)
                            (avx512f_enabled)
                            $I64X2)
                       (iabs x)))
      (value_xmm (vpabsq (put_in_xmm_mem x))))

;; Otherwise, we use a separate XMM register, `neg`, to contain the results of
;; `0 - x` and then blend in those results with `blendvpd` if the MSB of `neg`
;; was set to 1 (i.e. if `neg` was negative or, conversely, if `x` was
;; originally positive).
(rule (lower (has_type $I64X2 (iabs x)))
      (let ((rx Xmm (put_in_xmm x))
            (neg Xmm (psubq (xmm_new (imm $I64X2 0)) (xmm_to_xmm_mem rx))))
        (value_xmm (blendvpd neg (xmm_to_xmm_mem rx) neg))))

;;;; Rules for `fabs` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Special case for `f32x4.abs`.
(rule (lower (has_type $F32X4 (fabs x)))
      (value_xmm (andps (put_in_xmm x)
                        (xmm_to_xmm_mem (psrld (vector_all_ones $F32X4)
                                               (xmm_mem_imm_new (RegMemImm.Imm 1)))))))

;; Special case for `f64x2.abs`.
(rule (lower (has_type $F64X2 (fabs x)))
      (value_xmm (andpd (put_in_xmm x)
                        (xmm_to_xmm_mem (psrlq (vector_all_ones $F64X2)
                                               (xmm_mem_imm_new (RegMemImm.Imm 1)))))))

;;;; Rules for `bnot` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `i64` and smaller.
(rule (lower (has_type (fits_in_64 ty) (bnot x)))
      (value_gpr (not ty (put_in_gpr x))))

;; `i128`.
(decl i128_not (Value) ValueRegs)
(rule (i128_not x)
      (let ((x_regs ValueRegs (put_in_regs x))
            (x_lo Gpr (gpr_new (value_regs_get x_regs 0)))
            (x_hi Gpr (gpr_new (value_regs_get x_regs 1))))
        (value_gprs (not $I64 x_lo)
                    (not $I64 x_hi))))
(rule (lower (has_type $I128 (bnot x)))
      (i128_not x))
(rule (lower (has_type $B128 (bnot x)))
      (i128_not x))

;; Special case for vector types where bit-negation is an xor against an
;; all-ones value.
(rule (lower (has_type ty @ (multi_lane _bits _lanes) (bnot x)))
      (value_xmm (sse_xor ty
                          (put_in_xmm x)
                          (xmm_to_xmm_mem (vector_all_ones ty)))))

;;;; Rules for `bitselect` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type ty @ (multi_lane _bits _lanes)
                       (bitselect condition if_true if_false)))
      ;; a = and if_true, condition
      ;; b = and_not condition, if_false
      ;; or b, a
      (let ((cond_xmm Xmm (put_in_xmm condition))
            (a Xmm (sse_and ty (put_in_xmm if_true) (xmm_to_xmm_mem cond_xmm)))
            (b Xmm (sse_and_not ty cond_xmm (put_in_xmm_mem if_false))))
        (value_xmm (sse_or ty b (xmm_to_xmm_mem a)))))

;;;; Rules for `vselect` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type ty @ (multi_lane _bits _lanes)
                       (vselect condition if_true if_false)))
      (value_xmm (sse_blend ty
                            (put_in_xmm_mem condition)
                            (put_in_xmm_mem if_true)
                            (put_in_xmm if_false))))

;;;; Rules for `insertlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (insertlane vec @ (value_type ty) val (u8_from_uimm8 idx)))
      (value_xmm (vec_insert_lane ty (put_in_xmm vec) (put_in_reg_mem val) idx)))

;; Helper function used below for `insertlane` but also here for other
;; lowerings.
;;
;; Note that the `Type` used here is the type of vector the insertion is
;; happening into, or the type of the first `Reg` argument.
(decl vec_insert_lane (Type Xmm RegMem u8) Xmm)

;; i8x16.replace_lane
(rule (vec_insert_lane $I8X16 vec val idx)
      (pinsrb vec (reg_mem_to_gpr_mem val) idx))

;; i16x8.replace_lane
(rule (vec_insert_lane $I16X8 vec val idx)
      (pinsrw vec (reg_mem_to_gpr_mem val) idx))

;; i32x4.replace_lane
(rule (vec_insert_lane $I32X4 vec val idx)
      (pinsrd vec (reg_mem_to_gpr_mem val) idx (OperandSize.Size32)))

;; i64x2.replace_lane
(rule (vec_insert_lane $I64X2 vec val idx)
      (pinsrd vec (reg_mem_to_gpr_mem val) idx (OperandSize.Size64)))

;; f32x4.replace_lane
(rule (vec_insert_lane $F32X4 vec val idx)
      (insertps vec (reg_mem_to_xmm_mem val) (sse_insertps_lane_imm idx)))

;; External rust code used to calculate the immediate value to `insertps`.
(decl sse_insertps_lane_imm (u8) u8)
(extern constructor sse_insertps_lane_imm sse_insertps_lane_imm)

;; f64x2.replace_lane 0
;;
;; Here the `movsd` instruction is used specifically to specialize moving
;; into the first lane where unlike above cases we're not using the lane
;; immediate as an immediate to the instruction itself.
;;
;; Note, though, the `movsd` has different behavior with respect to the second
;; lane of the f64x2 depending on whether the RegMem operand is a register or
;; memory. When loading from a register `movsd` preserves the upper bits, but
;; when loading from memory it zeros the upper bits. We specifically want to
;; preserve the upper bits so if a `RegMem.Mem` is passed in we need to emit
;; two `movsd` instructions. The first `movsd` (used as `xmm_unary_rm_r`) will
;; load from memory into a temp register and then the second `movsd` (modeled
;; internally as `xmm_rm_r`) will merge the temp register into our `vec`
;; register.
(rule (vec_insert_lane $F64X2 vec (RegMem.Reg val) 0)
      (movsd vec (reg_mem_to_xmm_mem (RegMem.Reg val))))
(rule (vec_insert_lane $F64X2 vec mem 0)
      (movsd vec (xmm_to_xmm_mem (xmm_unary_rm_r (SseOpcode.Movsd)
                                                 (reg_mem_to_xmm_mem mem)))))

;; f64x2.replace_lane 1
;;
;; Here the `movlhps` instruction is used specifically to specialize moving
;; into the second lane where unlike above cases we're not using the lane
;; immediate as an immediate to the instruction itself.
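;; (`movlhps` copies the low 64 bits of the source operand into the high 64
;; bits of `vec`, leaving lane 0 of `vec` unchanged.)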
(rule (vec_insert_lane $F64X2 vec val 1)
      (movlhps vec (reg_mem_to_xmm_mem val)))

;;;; Rules for `imax` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type $I8X16 (imax x y)))
      (value_xmm (pmaxsb (put_in_xmm x) (put_in_xmm_mem y))))
(rule (lower (has_type $I16X8 (imax x y)))
      (value_xmm (pmaxsw (put_in_xmm x) (put_in_xmm_mem y))))
(rule (lower (has_type $I32X4 (imax x y)))
      (value_xmm (pmaxsd (put_in_xmm x) (put_in_xmm_mem y))))

;;;; Rules for `imin` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type $I8X16 (imin x y)))
      (value_xmm (pminsb (put_in_xmm x) (put_in_xmm_mem y))))
(rule (lower (has_type $I16X8 (imin x y)))
      (value_xmm (pminsw (put_in_xmm x) (put_in_xmm_mem y))))
(rule (lower (has_type $I32X4 (imin x y)))
      (value_xmm (pminsd (put_in_xmm x) (put_in_xmm_mem y))))

;;;; Rules for `umax` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type $I8X16 (umax x y)))
      (value_xmm (pmaxub (put_in_xmm x) (put_in_xmm_mem y))))
(rule (lower (has_type $I16X8 (umax x y)))
      (value_xmm (pmaxuw (put_in_xmm x) (put_in_xmm_mem y))))
(rule (lower (has_type $I32X4 (umax x y)))
      (value_xmm (pmaxud (put_in_xmm x) (put_in_xmm_mem y))))

;;;; Rules for `umin` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type $I8X16 (umin x y)))
      (value_xmm (pminub (put_in_xmm x) (put_in_xmm_mem y))))
(rule (lower (has_type $I16X8 (umin x y)))
      (value_xmm (pminuw (put_in_xmm x) (put_in_xmm_mem y))))
(rule (lower (has_type $I32X4 (umin x y)))
      (value_xmm (pminud (put_in_xmm x) (put_in_xmm_mem y))))

;;;; Rules for `trap` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (trap code))
      (safepoint (ud2 code)))

;;;; Rules for `resumable_trap` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (resumable_trap code))
      (safepoint (ud2 code)))