;; x86-64 instruction selection and CLIF-to-MachInst lowering. ;; The main lowering constructor term: takes a clif `Inst` and returns the ;; register(s) within which the lowered instruction's result values live. (decl lower (Inst) InstOutput) ;;;; Rules for `iconst` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; `i64` and smaller. (rule (lower (has_type (fits_in_64 ty) (iconst (u64_from_imm64 x)))) (imm ty x)) ;; `i128` (rule (lower (has_type $I128 (iconst (u64_from_imm64 x)))) (value_regs (imm $I64 x) (imm $I64 0))) ;;;; Rules for `bconst` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; `b64` and smaller. (rule (lower (has_type (fits_in_64 ty) (bconst $false))) (imm ty 0)) (rule (lower (has_type (fits_in_64 ty) (bconst $true))) (imm ty 1)) ;; `b128` (rule (lower (has_type $B128 (bconst $false))) (value_regs (imm $B64 0) (imm $B64 0))) (rule (lower (has_type $B128 (bconst $true))) (value_regs (imm $B64 1) (imm $B64 0))) ;;;; Rules for `f32const` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (f32const (u64_from_ieee32 x))) (imm $F32 x)) ;;;; Rules for `f64const` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (f64const (u64_from_ieee64 x))) (imm $F64 x)) ;;;; Rules for `null` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (has_type ty (null))) (imm ty 0)) ;;;; Rules for `iadd` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; `i64` and smaller. ;; Add two registers. (rule (lower (has_type (fits_in_64 ty) (iadd x y))) (x64_add ty x y)) ;; Add a register and an immediate. (rule (lower (has_type (fits_in_64 ty) (iadd x (simm32_from_value y)))) (x64_add ty x y)) (rule (lower (has_type (fits_in_64 ty) (iadd (simm32_from_value x) y))) (x64_add ty y x)) ;; Add a register and memory. (rule (lower (has_type (fits_in_64 ty) (iadd x (sinkable_load y)))) (x64_add ty x (sink_load_to_gpr_mem_imm y))) (rule (lower (has_type (fits_in_64 ty) (iadd (sinkable_load x) y))) (x64_add ty y (sink_load_to_gpr_mem_imm x))) ;; SSE. (rule (lower (has_type (multi_lane 8 16) (iadd x y))) (x64_paddb x y)) (rule (lower (has_type (multi_lane 16 8) (iadd x y))) (x64_paddw x y)) (rule (lower (has_type (multi_lane 32 4) (iadd x y))) (x64_paddd x y)) (rule (lower (has_type (multi_lane 64 2) (iadd x y))) (x64_paddq x y)) ;; `i128` (rule (lower (has_type $I128 (iadd x y))) ;; Get the high/low registers for `x`. (let ((x_regs ValueRegs x) (x_lo Gpr (value_regs_get_gpr x_regs 0)) (x_hi Gpr (value_regs_get_gpr x_regs 1))) ;; Get the high/low registers for `y`. (let ((y_regs ValueRegs y) (y_lo Gpr (value_regs_get_gpr y_regs 0)) (y_hi Gpr (value_regs_get_gpr y_regs 1))) ;; Do an add followed by an add-with-carry. (with_flags (x64_add_with_flags_paired $I64 x_lo y_lo) (x64_adc_paired $I64 x_hi y_hi))))) ;;;; Rules for `sadd_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (has_type (multi_lane 8 16) (sadd_sat x y))) (x64_paddsb x y)) (rule (lower (has_type (multi_lane 16 8) (sadd_sat x y))) (x64_paddsw x y)) ;;;; Rules for `uadd_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (has_type (multi_lane 8 16) (uadd_sat x y))) (x64_paddusb x y)) (rule (lower (has_type (multi_lane 16 8) (uadd_sat x y))) (x64_paddusw x y)) ;;;; Rules for `iadd_ifcout` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; N.B.: the second output of `iadd_ifcout` is meant to be the ;; `iflags` value containing the carry result. 
However, we plan to ;; replace this with a bool carry flag, and all consumers of `iflags` ;; remain in the handwritten pattern-matching code and explicitly ;; match on the flags producer. So we can get away with just ;; using an invalid second output, and the reg-renaming code does the ;; right thing, for now. For safety, we assert elsewhere that no one ;; actually uses the register assigned to the SSA `iflags`-typed ;; `Value`. (decl output_ifcout (Reg) InstOutput) (rule (output_ifcout reg) (output_pair reg (value_regs_invalid))) ;; Add two registers. (rule (lower (has_type (fits_in_64 ty) (iadd_ifcout x y))) (output_ifcout (x64_add ty x y))) ;; Add a register and an immediate. (rule (lower (has_type (fits_in_64 ty) (iadd_ifcout x (simm32_from_value y)))) (output_ifcout (x64_add ty x y))) (rule (lower (has_type (fits_in_64 ty) (iadd_ifcout (simm32_from_value x) y))) (output_ifcout (x64_add ty y x))) ;; Add a register and memory. (rule (lower (has_type (fits_in_64 ty) (iadd_ifcout x (sinkable_load y)))) (output_ifcout (x64_add ty x (sink_load_to_gpr_mem_imm y)))) (rule (lower (has_type (fits_in_64 ty) (iadd_ifcout (sinkable_load x) y))) (output_ifcout (x64_add ty y (sink_load_to_gpr_mem_imm x)))) ;; (No `iadd_ifcout` for `i128`.) ;;;; Rules for `iadd_imm` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; `i64` and smaller. ;; When the immediate fits in a `RegMemImm.Imm`, use that. (rule (lower (has_type (fits_in_64 ty) (iadd_imm y (simm32_from_imm64 x)))) (x64_add ty y x)) ;; Otherwise, put the immediate into a register. (rule (lower (has_type (fits_in_64 ty) (iadd_imm y (u64_from_imm64 x)))) (x64_add ty y (imm ty x))) ;; `i128` ;; When the immediate fits in a `RegMemImm.Imm`, use that. (rule (lower (has_type $I128 (iadd_imm y (simm32_from_imm64 x)))) (let ((y_regs ValueRegs y) (y_lo Gpr (value_regs_get_gpr y_regs 0)) (y_hi Gpr (value_regs_get_gpr y_regs 1))) (with_flags (x64_add_with_flags_paired $I64 y_lo x) (x64_adc_paired $I64 y_hi (RegMemImm.Imm 0))))) ;; Otherwise, put the immediate into a register. (rule (lower (has_type $I128 (iadd_imm y (u64_from_imm64 x)))) (let ((y_regs ValueRegs y) (y_lo Gpr (value_regs_get_gpr y_regs 0)) (y_hi Gpr (value_regs_get_gpr y_regs 1)) (x_lo Gpr (imm $I64 x))) (with_flags (x64_add_with_flags_paired $I64 y_lo x_lo) (x64_adc_paired $I64 y_hi (RegMemImm.Imm 0))))) ;;;; Rules for `isub` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; `i64` and smaller. ;; Sub two registers. (rule (lower (has_type (fits_in_64 ty) (isub x y))) (x64_sub ty x y)) ;; Sub a register and an immediate. (rule (lower (has_type (fits_in_64 ty) (isub x (simm32_from_value y)))) (x64_sub ty x y)) ;; Sub a register and memory. (rule (lower (has_type (fits_in_64 ty) (isub x (sinkable_load y)))) (x64_sub ty x (sink_load_to_gpr_mem_imm y))) ;; SSE. (rule (lower (has_type (multi_lane 8 16) (isub x y))) (x64_psubb x y)) (rule (lower (has_type (multi_lane 16 8) (isub x y))) (x64_psubw x y)) (rule (lower (has_type (multi_lane 32 4) (isub x y))) (x64_psubd x y)) (rule (lower (has_type (multi_lane 64 2) (isub x y))) (x64_psubq x y)) ;; `i128` (rule (lower (has_type $I128 (isub x y))) ;; Get the high/low registers for `x`. (let ((x_regs ValueRegs x) (x_lo Gpr (value_regs_get_gpr x_regs 0)) (x_hi Gpr (value_regs_get_gpr x_regs 1))) ;; Get the high/low registers for `y`. (let ((y_regs ValueRegs y) (y_lo Gpr (value_regs_get_gpr y_regs 0)) (y_hi Gpr (value_regs_get_gpr y_regs 1))) ;; Do a sub followed by an sub-with-borrow. 
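;; For illustration (a hypothetical example, not from the source): subtracting
;; y = 1 from x = 2^64 (x_lo = 0, x_hi = 1), the low-half `sub` produces
;; 0xFFFF_FFFF_FFFF_FFFF and sets the carry (borrow) flag, and the high-half
;; `sbb` then computes 1 - 0 - CF = 0, giving the expected 128-bit result
;; 2^64 - 1.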
(with_flags (x64_sub_with_flags_paired $I64 x_lo y_lo) (x64_sbb_paired $I64 x_hi y_hi))))) ;;;; Rules for `ssub_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (has_type (multi_lane 8 16) (ssub_sat x y))) (x64_psubsb x y)) (rule (lower (has_type (multi_lane 16 8) (ssub_sat x y))) (x64_psubsw x y)) ;;;; Rules for `usub_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (has_type (multi_lane 8 16) (usub_sat x y))) (x64_psubusb x y)) (rule (lower (has_type (multi_lane 16 8) (usub_sat x y))) (x64_psubusw x y)) ;;;; Rules for `band` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; `{i,b}64` and smaller. ;; And two registers. (rule (lower (has_type (fits_in_64 ty) (band x y))) (x64_and ty x y)) ;; And with a memory operand. (rule (lower (has_type (fits_in_64 ty) (band x (sinkable_load y)))) (x64_and ty x (sink_load_to_gpr_mem_imm y))) (rule (lower (has_type (fits_in_64 ty) (band (sinkable_load x) y))) (x64_and ty y (sink_load_to_gpr_mem_imm x))) ;; And with an immediate. (rule (lower (has_type (fits_in_64 ty) (band x (simm32_from_value y)))) (x64_and ty x y)) (rule (lower (has_type (fits_in_64 ty) (band (simm32_from_value x) y))) (x64_and ty y x)) ;; SSE. (decl sse_and (Type Xmm XmmMem) Xmm) (rule (sse_and $F32X4 x y) (x64_andps x y)) (rule (sse_and $F64X2 x y) (x64_andpd x y)) (rule (sse_and (multi_lane _bits _lanes) x y) (x64_pand x y)) (rule (lower (has_type ty @ (multi_lane _bits _lanes) (band x y))) (sse_and ty x y)) ;; `{i,b}128`. (rule (lower (has_type $I128 (band x y))) (let ((x_regs ValueRegs x) (x_lo Gpr (value_regs_get_gpr x_regs 0)) (x_hi Gpr (value_regs_get_gpr x_regs 1)) (y_regs ValueRegs y) (y_lo Gpr (value_regs_get_gpr y_regs 0)) (y_hi Gpr (value_regs_get_gpr y_regs 1))) (value_gprs (x64_and $I64 x_lo y_lo) (x64_and $I64 x_hi y_hi)))) (rule (lower (has_type $B128 (band x y))) ;; Booleans are always `0` or `1`, so we only need to do the `and` on the ;; low half. The high half is always zero but, rather than generate a new ;; zero, we just reuse `x`'s high half which is already zero. (let ((x_regs ValueRegs x) (x_lo Gpr (value_regs_get_gpr x_regs 0)) (x_hi Gpr (value_regs_get_gpr x_regs 1)) (y_lo Gpr (lo_gpr y))) (value_gprs (x64_and $I64 x_lo y_lo) x_hi))) ;;;; Rules for `bor` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; `{i,b}64` and smaller. ;; Or two registers. (rule (lower (has_type (fits_in_64 ty) (bor x y))) (x64_or ty x y)) ;; Or with a memory operand. (rule (lower (has_type (fits_in_64 ty) (bor x (sinkable_load y)))) (x64_or ty x (sink_load_to_gpr_mem_imm y))) (rule (lower (has_type (fits_in_64 ty) (bor (sinkable_load x) y))) (x64_or ty y (sink_load_to_gpr_mem_imm x))) ;; Or with an immediate. (rule (lower (has_type (fits_in_64 ty) (bor x (simm32_from_value y)))) (x64_or ty x y)) (rule (lower (has_type (fits_in_64 ty) (bor (simm32_from_value x) y))) (x64_or ty y x)) ;; SSE. (decl sse_or (Type Xmm XmmMem) Xmm) (rule (sse_or $F32X4 x y) (x64_orps x y)) (rule (sse_or $F64X2 x y) (x64_orpd x y)) (rule (sse_or (multi_lane _bits _lanes) x y) (x64_por x y)) (rule (lower (has_type ty @ (multi_lane _bits _lanes) (bor x y))) (sse_or ty x y)) ;; `{i,b}128`. 
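;; (`or_i128` is split out as a named helper rather than written inline like
;; `band`/`bxor`; the $I128 `rotl`/`rotr` lowerings further down reuse it to
;; combine their two shifted halves.)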
(decl or_i128 (ValueRegs ValueRegs) ValueRegs) (rule (or_i128 x y) (let ((x_lo Gpr (value_regs_get_gpr x 0)) (x_hi Gpr (value_regs_get_gpr x 1)) (y_lo Gpr (value_regs_get_gpr y 0)) (y_hi Gpr (value_regs_get_gpr y 1))) (value_gprs (x64_or $I64 x_lo y_lo) (x64_or $I64 x_hi y_hi)))) (rule (lower (has_type $I128 (bor x y))) (or_i128 x y)) (rule (lower (has_type $B128 (bor x y))) ;; Booleans are always `0` or `1`, so we only need to do the `or` on the ;; low half. The high half is always zero but, rather than generate a new ;; zero, we just reuse `x`'s high half which is already zero. (let ((x_regs ValueRegs x) (x_lo Gpr (value_regs_get_gpr x_regs 0)) (x_hi Gpr (value_regs_get_gpr x_regs 1)) (y_lo Gpr (lo_gpr y))) (value_gprs (x64_or $I64 x_lo y_lo) x_hi))) ;;;; Rules for `bxor` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; `{i,b}64` and smaller. ;; Xor two registers. (rule (lower (has_type (fits_in_64 ty) (bxor x y))) (x64_xor ty x y)) ;; Xor with a memory operand. (rule (lower (has_type (fits_in_64 ty) (bxor x (sinkable_load y)))) (x64_xor ty x (sink_load_to_gpr_mem_imm y))) (rule (lower (has_type (fits_in_64 ty) (bxor (sinkable_load x) y))) (x64_xor ty y (sink_load_to_gpr_mem_imm x))) ;; Xor with an immediate. (rule (lower (has_type (fits_in_64 ty) (bxor x (simm32_from_value y)))) (x64_xor ty x y)) (rule (lower (has_type (fits_in_64 ty) (bxor (simm32_from_value x) y))) (x64_xor ty y x)) ;; SSE. (rule (lower (has_type ty @ (multi_lane _bits _lanes) (bxor x y))) (sse_xor ty x y)) ;; `{i,b}128`. (rule (lower (has_type $I128 (bxor x y))) (let ((x_regs ValueRegs x) (x_lo Gpr (value_regs_get_gpr x_regs 0)) (x_hi Gpr (value_regs_get_gpr x_regs 1)) (y_regs ValueRegs y) (y_lo Gpr (value_regs_get_gpr y_regs 0)) (y_hi Gpr (value_regs_get_gpr y_regs 1))) (value_gprs (x64_xor $I64 x_lo y_lo) (x64_xor $I64 x_hi y_hi)))) (rule (lower (has_type $B128 (bxor x y))) ;; Booleans are always `0` or `1`, so we only need to do the `xor` on the ;; low half. The high half is always zero but, rather than generate a new ;; zero, we just reuse `x`'s high half which is already zero. (let ((x_regs ValueRegs x) (x_lo Gpr (value_regs_get_gpr x_regs 0)) (x_hi Gpr (value_regs_get_gpr x_regs 1)) (y_lo Gpr (lo_gpr y))) (value_gprs (x64_xor $I64 x_lo y_lo) x_hi))) ;;;; Rules for `ishl` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; `i64` and smaller. (rule (lower (has_type (fits_in_64 ty) (ishl src amt))) (x64_shl ty src (put_masked_in_imm8_gpr amt ty))) ;; `i128`. (decl shl_i128 (ValueRegs Gpr) ValueRegs) (rule (shl_i128 src amt) ;; Unpack the registers that make up the 128-bit value being shifted. (let ((src_lo Gpr (value_regs_get_gpr src 0)) (src_hi Gpr (value_regs_get_gpr src 1)) ;; Do two 64-bit shifts. (lo_shifted Gpr (x64_shl $I64 src_lo amt)) (hi_shifted Gpr (x64_shl $I64 src_hi amt)) ;; `src_lo >> (64 - amt)` are the bits to carry over from the lo ;; into the hi. (carry Gpr (x64_shr $I64 src_lo (x64_sub $I64 (imm $I64 64) amt))) (zero Gpr (imm $I64 0)) ;; Nullify the carry if we are shifting in by a multiple of 128. (carry_ Gpr (with_flags_reg (x64_test (OperandSize.Size64) (RegMemImm.Imm 127) amt) (cmove $I64 (CC.Z) zero carry))) ;; Add the carry into the high half. (hi_shifted_ Gpr (x64_or $I64 carry_ hi_shifted))) ;; Combine the two shifted halves. However, if we are shifting by >= 64 ;; (modulo 128), then the low bits are zero and the high bits are our ;; low bits. 
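;; For a concrete (illustrative) case: with amt = 72, `amt & 64` is non-zero,
;; so the selects below produce 0 for the low half and `lo_shifted` for the
;; high half (src_lo << 8, since the hardware shift masks its count to 6 bits);
;; with amt = 8, the ordinary lo_shifted / hi_shifted_ pair is kept.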
(with_flags (x64_test (OperandSize.Size64) (RegMemImm.Imm 64) amt) (consumes_flags_concat (cmove $I64 (CC.Z) lo_shifted zero) (cmove $I64 (CC.Z) hi_shifted_ lo_shifted))))) (rule (lower (has_type $I128 (ishl src amt))) ;; NB: Only the low bits of `amt` matter since we logically mask the shift ;; amount to the value's bit width. (let ((amt_ Gpr (lo_gpr amt))) (shl_i128 src amt_))) ;; SSE. ;; Since the x86 instruction set does not have any 8x16 shift instructions (even ;; in higher feature sets like AVX), we lower the `ishl.i8x16` to a sequence of ;; instructions. The basic idea, whether the amount to shift by is an immediate ;; or not, is to use a 16x8 shift and then mask off the incorrect bits to 0s. (rule (lower (has_type ty @ $I8X16 (ishl src amt))) (let ( ;; Mask the amount to ensure wrapping behaviour (masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty)))) ;; Shift `src` using 16x8. Unfortunately, a 16x8 shift will only be ;; correct for half of the lanes; the others must be fixed up with ;; the mask below. (unmasked Xmm (x64_psllw src (mov_rmi_to_xmm masked_amt))) (mask_addr SyntheticAmode (ishl_i8x16_mask masked_amt)) (mask Reg (x64_load $I8X16 mask_addr (ExtKind.None)))) (sse_and $I8X16 unmasked (RegMem.Reg mask)))) ;; Get the address of the mask to use when fixing up the lanes that weren't ;; correctly generated by the 16x8 shift. (decl ishl_i8x16_mask (RegMemImm) SyntheticAmode) ;; When the shift amount is known, we can statically (i.e. at compile time) ;; determine the mask to use and only emit that. (decl ishl_i8x16_mask_for_const (u32) SyntheticAmode) (extern constructor ishl_i8x16_mask_for_const ishl_i8x16_mask_for_const) (rule (ishl_i8x16_mask (RegMemImm.Imm amt)) (ishl_i8x16_mask_for_const amt)) ;; Otherwise, we must emit the entire mask table and dynamically (i.e. at run ;; time) find the correct mask offset in the table. We use `lea` to find the ;; base address of the mask table and then complex addressing to offset to the ;; right mask: `base_address + amt << 4` (decl ishl_i8x16_mask_table () SyntheticAmode) (extern constructor ishl_i8x16_mask_table ishl_i8x16_mask_table) (rule (ishl_i8x16_mask (RegMemImm.Reg amt)) (let ((mask_table SyntheticAmode (ishl_i8x16_mask_table)) (base_mask_addr Gpr (x64_lea mask_table)) (mask_offset Gpr (x64_shl $I64 amt (imm8_to_imm8_gpr 4)))) (amode_imm_reg_reg_shift 0 base_mask_addr mask_offset 0))) (rule (ishl_i8x16_mask (RegMemImm.Mem amt)) (ishl_i8x16_mask (RegMemImm.Reg (x64_load $I64 amt (ExtKind.None))))) ;; 16x8, 32x4, and 64x2 shifts can each use a single instruction, once the shift amount is masked. (rule (lower (has_type ty @ $I16X8 (ishl src amt))) (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty))))) (x64_psllw src (mov_rmi_to_xmm masked_amt)))) (rule (lower (has_type ty @ $I32X4 (ishl src amt))) (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty))))) (x64_pslld src (mov_rmi_to_xmm masked_amt)))) (rule (lower (has_type ty @ $I64X2 (ishl src amt))) (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty))))) (x64_psllq src (mov_rmi_to_xmm masked_amt)))) ;;;; Rules for `ushr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; `i64` and smaller. (rule (lower (has_type (fits_in_64 ty) (ushr src amt))) (let ((src_ Gpr (extend_to_gpr src ty (ExtendKind.Zero)))) (x64_shr ty src_ (put_masked_in_imm8_gpr amt ty)))) ;; `i128`. (decl shr_i128 (ValueRegs Gpr) ValueRegs) (rule (shr_i128 src amt) ;; Unpack the lo/hi halves of `src`. 
(let ((src_lo Gpr (value_regs_get_gpr src 0)) (src_hi Gpr (value_regs_get_gpr src 1)) ;; Do a shift on each half. (lo_shifted Gpr (x64_shr $I64 src_lo amt)) (hi_shifted Gpr (x64_shr $I64 src_hi amt)) ;; `src_hi << (64 - amt)` are the bits to carry over from the hi ;; into the lo. (carry Gpr (x64_shl $I64 src_hi (x64_sub $I64 (imm $I64 64) amt))) ;; Nullify the carry if we are shifting by a multiple of 128. (carry_ Gpr (with_flags_reg (x64_test (OperandSize.Size64) (RegMemImm.Imm 127) amt) (cmove $I64 (CC.Z) (imm $I64 0) carry))) ;; Add the carry bits into the lo. (lo_shifted_ Gpr (x64_or $I64 carry_ lo_shifted))) ;; Combine the two shifted halves. However, if we are shifting by >= 64 ;; (modulo 128), then the hi bits are zero and the lo bits are what ;; would otherwise be our hi bits. (with_flags (x64_test (OperandSize.Size64) (RegMemImm.Imm 64) amt) (consumes_flags_concat (cmove $I64 (CC.Z) lo_shifted_ hi_shifted) (cmove $I64 (CC.Z) hi_shifted (imm $I64 0)))))) (rule (lower (has_type $I128 (ushr src amt))) ;; NB: Only the low bits of `amt` matter since we logically mask the shift ;; amount to the value's bit width. (let ((amt_ Gpr (lo_gpr amt))) (shr_i128 src amt_))) ;; SSE. ;; There are no 8x16 shifts in x64. Do the same 16x8-shift-and-mask thing we do ;; with 8x16 `ishl`. (rule (lower (has_type ty @ $I8X16 (ushr src amt))) (let ( ;; Mask the amount to ensure wrapping behaviour (masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty)))) ;; Shift `src` using 16x8. Unfortunately, a 16x8 shift will only be ;; correct for half of the lanes; the others must be fixed up with ;; the mask below. (unmasked Xmm (x64_psrlw src (mov_rmi_to_xmm masked_amt))) (mask_addr SyntheticAmode (ushr_i8x16_mask masked_amt)) (mask Reg (x64_load $I8X16 mask_addr (ExtKind.None)))) (sse_and $I8X16 unmasked (RegMem.Reg mask)))) ;; Get the address of the mask to use when fixing up the lanes that weren't ;; correctly generated by the 16x8 shift. (decl ushr_i8x16_mask (RegMemImm) SyntheticAmode) ;; When the shift amount is known, we can statically (i.e. at compile time) ;; determine the mask to use and only emit that. (decl ushr_i8x16_mask_for_const (u32) SyntheticAmode) (extern constructor ushr_i8x16_mask_for_const ushr_i8x16_mask_for_const) (rule (ushr_i8x16_mask (RegMemImm.Imm amt)) (ushr_i8x16_mask_for_const amt)) ;; Otherwise, we must emit the entire mask table and dynamically (i.e. at run ;; time) find the correct mask offset in the table. We use `lea` to find the ;; base address of the mask table and then complex addressing to offset to the ;; right mask: `base_address + amt << 4` (decl ushr_i8x16_mask_table () SyntheticAmode) (extern constructor ushr_i8x16_mask_table ushr_i8x16_mask_table) (rule (ushr_i8x16_mask (RegMemImm.Reg amt)) (let ((mask_table SyntheticAmode (ushr_i8x16_mask_table)) (base_mask_addr Gpr (x64_lea mask_table)) (mask_offset Gpr (x64_shl $I64 amt (imm8_to_imm8_gpr 4)))) (amode_imm_reg_reg_shift 0 base_mask_addr mask_offset 0))) (rule (ushr_i8x16_mask (RegMemImm.Mem amt)) (ushr_i8x16_mask (RegMemImm.Reg (x64_load $I64 amt (ExtKind.None))))) ;; 16x8, 32x4, and 64x2 shifts can each use a single instruction, once the shift amount is masked. 
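;; (For reference: `shift_mask` here is the lane width in bits minus one, so
;; the `and` below reduces the amount to 0..15, 0..31, or 0..63 respectively,
;; matching CLIF's modular shift-amount semantics.)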
(rule (lower (has_type ty @ $I16X8 (ushr src amt))) (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty))))) (x64_psrlw src (mov_rmi_to_xmm masked_amt)))) (rule (lower (has_type ty @ $I32X4 (ushr src amt))) (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty))))) (x64_psrld src (mov_rmi_to_xmm masked_amt)))) (rule (lower (has_type ty @ $I64X2 (ushr src amt))) (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty))))) (x64_psrlq src (mov_rmi_to_xmm masked_amt)))) ;;;; Rules for `sshr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; `i64` and smaller. (rule (lower (has_type (fits_in_64 ty) (sshr src amt))) (let ((src_ Gpr (extend_to_gpr src ty (ExtendKind.Sign)))) (x64_sar ty src_ (put_masked_in_imm8_gpr amt ty)))) ;; `i128`. (decl sar_i128 (ValueRegs Gpr) ValueRegs) (rule (sar_i128 src amt) ;; Unpack the low/high halves of `src`. (let ((src_lo Gpr (value_regs_get_gpr src 0)) (src_hi Gpr (value_regs_get_gpr src 1)) ;; Do a shift of each half. NB: the low half uses an unsigned shift ;; because its MSB is not a sign bit. (lo_shifted Gpr (x64_shr $I64 src_lo amt)) (hi_shifted Gpr (x64_sar $I64 src_hi amt)) ;; `src_hi << (64 - amt)` are the bits to carry over from the low ;; half to the high half. (carry Gpr (x64_shl $I64 src_hi (x64_sub $I64 (imm $I64 64) amt))) ;; Nullify the carry if we are shifting by a multiple of 128. (carry_ Gpr (with_flags_reg (x64_test (OperandSize.Size64) (RegMemImm.Imm 127) amt) (cmove $I64 (CC.Z) (imm $I64 0) carry))) ;; Add the carry into the low half. (lo_shifted_ Gpr (x64_or $I64 lo_shifted carry_)) ;; Get all sign bits. (sign_bits Gpr (x64_sar $I64 src_hi (imm8_to_imm8_gpr 63)))) ;; Combine the two shifted halves. However, if we are shifting by >= 64 ;; (modulo 128), then the hi bits are all sign bits and the lo bits are ;; what would otherwise be our hi bits. (with_flags (x64_test (OperandSize.Size64) (RegMemImm.Imm 64) amt) (consumes_flags_concat (cmove $I64 (CC.Z) lo_shifted_ hi_shifted) (cmove $I64 (CC.Z) hi_shifted sign_bits))))) (rule (lower (has_type $I128 (sshr src amt))) ;; NB: Only the low bits of `amt` matter since we logically mask the shift ;; amount to the value's bit width. (let ((amt_ Gpr (lo_gpr amt))) (sar_i128 src amt_))) ;; SSE. ;; Since the x86 instruction set does not have an 8x16 shift instruction and the ;; approach used for `ishl` and `ushr` cannot be easily used (the masks do not ;; preserve the sign), we use a different approach here: separate the low and ;; high lanes, shift them separately, and merge them into the final result. ;; ;; Visually, this looks like the following, where `src.i8x16 = [s0, s1, ..., ;; s15]: ;; ;; lo.i16x8 = [(s0, s0), (s1, s1), ..., (s7, s7)] ;; shifted_lo.i16x8 = shift each lane of `low` ;; hi.i16x8 = [(s8, s8), (s9, s9), ..., (s15, s15)] ;; shifted_hi.i16x8 = shift each lane of `high` ;; result = [s0'', s1'', ..., s15''] (rule (lower (has_type ty @ $I8X16 (sshr src amt @ (value_type amt_ty)))) (let ((src_ Xmm (put_in_xmm src)) ;; Mask the amount to ensure wrapping behaviour (masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty)))) ;; In order for `packsswb` later to only use the high byte of each ;; 16x8 lane, we shift right an extra 8 bits, relying on `psraw` to ;; fill in the upper bits appropriately. 
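;; Concretely, for one byte s: `punpcklbw src, src` places s in both halves of
;; a 16-bit lane (value (s << 8) | s), so an arithmetic shift right by amt + 8
;; leaves the sign-extended s >> amt in the lane, and `packsswb` below narrows
;; that back down to a byte.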
(lo Xmm (x64_punpcklbw src_ src_)) (hi Xmm (x64_punpckhbw src_ src_)) (amt_ XmmMemImm (sshr_i8x16_bigger_shift amt_ty masked_amt)) (shifted_lo Xmm (x64_psraw lo amt_)) (shifted_hi Xmm (x64_psraw hi amt_))) (x64_packsswb shifted_lo shifted_hi))) (decl sshr_i8x16_bigger_shift (Type RegMemImm) XmmMemImm) (rule (sshr_i8x16_bigger_shift _ty (RegMemImm.Imm i)) (xmm_mem_imm_new (RegMemImm.Imm (u32_add i 8)))) (rule (sshr_i8x16_bigger_shift ty (RegMemImm.Reg r)) (mov_rmi_to_xmm (RegMemImm.Reg (x64_add ty r (RegMemImm.Imm 8))))) (rule (sshr_i8x16_bigger_shift ty rmi @ (RegMemImm.Mem _m)) (mov_rmi_to_xmm (RegMemImm.Reg (x64_add ty (imm ty 8) rmi)))) ;; `sshr.{i16x8,i32x4}` can be a simple `psra{w,d}`, we just have to make sure ;; that if the shift amount is in a register, it is in an XMM register. (rule (lower (has_type ty @ $I16X8 (sshr src amt))) (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty))))) (x64_psraw src (mov_rmi_to_xmm masked_amt)))) (rule (lower (has_type ty @ $I32X4 (sshr src amt))) (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty))))) (x64_psrad src (mov_rmi_to_xmm masked_amt)))) ;; The `sshr.i64x2` CLIF instruction has no single x86 instruction in the older ;; feature sets. Newer ones like AVX512VL + AVX512F include `vpsraq`, a 128-bit ;; instruction that would fit here, but this backend does not currently have ;; support for EVEX encodings. To remedy this, we extract each 64-bit lane to a ;; GPR, shift each using a scalar instruction, and insert the shifted values ;; back in the `dst` XMM register. ;; ;; (TODO: when EVEX support is available, add an alternate lowering here). (rule (lower (has_type $I64X2 (sshr src amt))) (let ((src_ Xmm (put_in_xmm src)) (lo Gpr (x64_pextrd $I64 src_ 0)) (hi Gpr (x64_pextrd $I64 src_ 1)) (amt_ Imm8Gpr (put_masked_in_imm8_gpr amt $I64)) (shifted_lo Gpr (x64_sar $I64 lo amt_)) (shifted_hi Gpr (x64_sar $I64 hi amt_))) (make_i64x2_from_lanes shifted_lo shifted_hi))) ;;;; Rules for `rotl` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; `i16` and `i8`: we need to extend the shift amount, or mask the ;; constant. (rule (lower (has_type (ty_8_or_16 ty) (rotl src amt))) (let ((amt_ Gpr (extend_to_gpr amt $I32 (ExtendKind.Zero)))) (x64_rotl ty src (gpr_to_imm8_gpr amt_)))) (rule (lower (has_type (ty_8_or_16 ty) (rotl src (u64_from_iconst amt)))) (x64_rotl ty src (const_to_type_masked_imm8 amt ty))) ;; `i64` and `i32`: we can rely on x86's rotate-amount masking since ;; we operate on the whole register. (rule (lower (has_type (ty_32_or_64 ty) (rotl src amt))) ;; NB: Only the low bits of `amt` matter since we logically mask the ;; shift amount to the value's bit width. (let ((amt_ Gpr (lo_gpr amt))) (x64_rotl ty src amt_))) (rule (lower (has_type (ty_32_or_64 ty) (rotl src (u64_from_iconst amt)))) (x64_rotl ty src (const_to_type_masked_imm8 amt ty))) ;; `i128`. (rule (lower (has_type $I128 (rotl src amt))) (let ((src_ ValueRegs src) ;; NB: Only the low bits of `amt` matter since we logically mask the ;; rotation amount to the value's bit width. (amt_ Gpr (lo_gpr amt))) (or_i128 (shl_i128 src_ amt_) (shr_i128 src_ (x64_sub $I64 (imm $I64 128) amt_))))) ;;;; Rules for `rotr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; `i16` and `i8`: we need to extend the shift amount, or mask the ;; constant. 
(rule (lower (has_type (ty_8_or_16 ty) (rotr src amt))) (let ((amt_ Gpr (extend_to_gpr amt $I32 (ExtendKind.Zero)))) (x64_rotr ty src amt_))) (rule (lower (has_type (ty_8_or_16 ty) (rotr src (u64_from_iconst amt)))) (x64_rotr ty src (const_to_type_masked_imm8 amt ty))) ;; `i64` and `i32`: we can rely on x86's rotate-amount masking since ;; we operate on the whole register. (rule (lower (has_type (ty_32_or_64 ty) (rotr src amt))) ;; NB: Only the low bits of `amt` matter since we logically mask the ;; shift amount to the value's bit width. (let ((amt_ Gpr (lo_gpr amt))) (x64_rotr ty src amt_))) (rule (lower (has_type (ty_32_or_64 ty) (rotr src (u64_from_iconst amt)))) (x64_rotr ty src (const_to_type_masked_imm8 amt ty))) ;; `i128`. (rule (lower (has_type $I128 (rotr src amt))) (let ((src_ ValueRegs src) ;; NB: Only the low bits of `amt` matter since we logically mask the ;; rotation amount to the value's bit width. (amt_ Gpr (lo_gpr amt))) (or_i128 (shr_i128 src_ amt_) (shl_i128 src_ (x64_sub $I64 (imm $I64 128) amt_))))) ;;;; Rules for `ineg` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; `i64` and smaller. (rule (lower (has_type (fits_in_64 ty) (ineg x))) (x64_neg ty x)) ;; SSE. (rule (lower (has_type $I8X16 (ineg x))) (x64_psubb (imm $I8X16 0) x)) (rule (lower (has_type $I16X8 (ineg x))) (x64_psubw (imm $I16X8 0) x)) (rule (lower (has_type $I32X4 (ineg x))) (x64_psubd (imm $I32X4 0) x)) (rule (lower (has_type $I64X2 (ineg x))) (x64_psubq (imm $I64X2 0) x)) ;;;; Rules for `avg_round` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (has_type (multi_lane 8 16) (avg_round x y))) (x64_pavgb x y)) (rule (lower (has_type (multi_lane 16 8) (avg_round x y))) (x64_pavgw x y)) ;;;; Rules for `imul` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; `i64` and smaller. ;; Multiply two registers. (rule (lower (has_type (fits_in_64 ty) (imul x y))) (x64_mul ty x y)) ;; Multiply a register and an immediate. (rule (lower (has_type (fits_in_64 ty) (imul x (simm32_from_value y)))) (x64_mul ty x y)) (rule (lower (has_type (fits_in_64 ty) (imul (simm32_from_value x) y))) (x64_mul ty y x)) ;; Multiply a register and a memory load. (rule (lower (has_type (fits_in_64 ty) (imul x (sinkable_load y)))) (x64_mul ty x (sink_load_to_gpr_mem_imm y))) (rule (lower (has_type (fits_in_64 ty) (imul (sinkable_load x) y))) (x64_mul ty y (sink_load_to_gpr_mem_imm x))) ;; `i128`. ;; mul: ;; dst_lo = lhs_lo * rhs_lo ;; dst_hi = umulhi(lhs_lo, rhs_lo) + ;; lhs_lo * rhs_hi + ;; lhs_hi * rhs_lo ;; ;; so we emit: ;; lo_hi = mul x_lo, y_hi ;; hi_lo = mul x_hi, y_lo ;; hilo_hilo = add lo_hi, hi_lo ;; dst_lo:hi_lolo = mulhi_u x_lo, y_lo ;; dst_hi = add hilo_hilo, hi_lolo ;; return (dst_lo, dst_hi) (rule (lower (has_type $I128 (imul x y))) ;; Put `x` into registers and unpack its hi/lo halves. (let ((x_regs ValueRegs x) (x_lo Gpr (value_regs_get_gpr x_regs 0)) (x_hi Gpr (value_regs_get_gpr x_regs 1)) ;; Put `y` into registers and unpack its hi/lo halves. 
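;; (Why this works: writing x = x_hi*2^64 + x_lo and y = y_hi*2^64 + y_lo, the
;; product mod 2^128 is x_lo*y_lo + 2^64*(x_lo*y_hi + x_hi*y_lo); the
;; x_hi*y_hi term is shifted out entirely, and `mulhi_u` supplies the high
;; half of x_lo*y_lo.)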
(y_regs ValueRegs y) (y_lo Gpr (value_regs_get_gpr y_regs 0)) (y_hi Gpr (value_regs_get_gpr y_regs 1)) ;; lo_hi = mul x_lo, y_hi (lo_hi Gpr (x64_mul $I64 x_lo y_hi)) ;; hi_lo = mul x_hi, y_lo (hi_lo Gpr (x64_mul $I64 x_hi y_lo)) ;; hilo_hilo = add lo_hi, hi_lo (hilo_hilo Gpr (x64_add $I64 lo_hi hi_lo)) ;; dst_lo:hi_lolo = mulhi_u x_lo, y_lo (mul_regs ValueRegs (mulhi_u $I64 x_lo y_lo)) (dst_lo Gpr (value_regs_get_gpr mul_regs 0)) (hi_lolo Gpr (value_regs_get_gpr mul_regs 1)) ;; dst_hi = add hilo_hilo, hi_lolo (dst_hi Gpr (x64_add $I64 hilo_hilo hi_lolo))) (value_gprs dst_lo dst_hi))) ;; SSE. ;; (No i8x16 multiply.) (rule (lower (has_type (multi_lane 16 8) (imul x y))) (x64_pmullw x y)) (rule (lower (has_type (multi_lane 32 4) (imul x y))) (x64_pmulld x y)) ;; With AVX-512 we can implement `i64x2` multiplication with a single ;; instruction. (rule (lower (has_type (and (avx512vl_enabled) (avx512dq_enabled) (multi_lane 64 2)) (imul x y))) (x64_vpmullq x y)) ;; Otherwise, for i64x2 multiplication we describe a lane A as being composed of ;; a 32-bit upper half "Ah" and a 32-bit lower half "Al". The 32-bit long hand ;; multiplication can then be written as: ;; ;; Ah Al ;; * Bh Bl ;; ----- ;; Al * Bl ;; + (Ah * Bl) << 32 ;; + (Al * Bh) << 32 ;; ;; So for each lane we will compute: ;; ;; A * B = (Al * Bl) + ((Ah * Bl) + (Al * Bh)) << 32 ;; ;; Note, the algorithm will use `pmuldq` which operates directly on the lower ;; 32-bit (`Al` or `Bl`) of a lane and writes the result to the full 64-bits of ;; the lane of the destination. For this reason we don't need shifts to isolate ;; the lower 32-bits, however, we will need to use shifts to isolate the high ;; 32-bits when doing calculations, i.e., `Ah == A >> 32`. (rule (lower (has_type (multi_lane 64 2) (imul a b))) (let ((a0 Xmm a) (b0 Xmm b) ;; a_hi = A >> 32 (a_hi Xmm (x64_psrlq a0 (RegMemImm.Imm 32))) ;; ah_bl = Ah * Bl (ah_bl Xmm (x64_pmuludq a_hi b0)) ;; b_hi = B >> 32 (b_hi Xmm (x64_psrlq b0 (RegMemImm.Imm 32))) ;; al_bh = Al * Bh (al_bh Xmm (x64_pmuludq a0 b_hi)) ;; aa_bb = ah_bl + al_bh (aa_bb Xmm (x64_paddq ah_bl al_bh)) ;; aa_bb_shifted = aa_bb << 32 (aa_bb_shifted Xmm (x64_psllq aa_bb (RegMemImm.Imm 32))) ;; al_bl = Al * Bl (al_bl Xmm (x64_pmuludq a0 b0))) ;; al_bl + aa_bb_shifted (x64_paddq al_bl aa_bb_shifted))) ;; Special case for `i16x8.extmul_high_i8x16_s`. (rule (lower (has_type (multi_lane 16 8) (imul (swiden_high (and (value_type (multi_lane 8 16)) x)) (swiden_high (and (value_type (multi_lane 8 16)) y))))) (let ((x1 Xmm x) (x2 Xmm (x64_palignr x1 x1 8 (OperandSize.Size32))) (x3 Xmm (x64_pmovsxbw x2)) (y1 Xmm y) (y2 Xmm (x64_palignr y1 y1 8 (OperandSize.Size32))) (y3 Xmm (x64_pmovsxbw y2))) (x64_pmullw x3 y3))) ;; Special case for `i32x4.extmul_high_i16x8_s`. (rule (lower (has_type (multi_lane 32 4) (imul (swiden_high (and (value_type (multi_lane 16 8)) x)) (swiden_high (and (value_type (multi_lane 16 8)) y))))) (let ((x2 Xmm x) (y2 Xmm y) (lo Xmm (x64_pmullw x2 y2)) (hi Xmm (x64_pmulhw x2 y2))) (x64_punpckhwd lo hi))) ;; Special case for `i64x2.extmul_high_i32x4_s`. (rule (lower (has_type (multi_lane 64 2) (imul (swiden_high (and (value_type (multi_lane 32 4)) x)) (swiden_high (and (value_type (multi_lane 32 4)) y))))) (let ((x2 Xmm (x64_pshufd x 0xFA (OperandSize.Size32))) (y2 Xmm (x64_pshufd y 0xFA (OperandSize.Size32)))) (x64_pmuldq x2 y2))) ;; Special case for `i16x8.extmul_low_i8x16_s`. 
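;; Unlike the high-half cases above, no `palignr` shuffle is needed here:
;; `pmovsxbw` already widens the low eight bytes.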
(rule (lower (has_type (multi_lane 16 8) (imul (swiden_low (and (value_type (multi_lane 8 16)) x)) (swiden_low (and (value_type (multi_lane 8 16)) y))))) (let ((x2 Xmm (x64_pmovsxbw x)) (y2 Xmm (x64_pmovsxbw y))) (x64_pmullw x2 y2))) ;; Special case for `i32x4.extmul_low_i16x8_s`. (rule (lower (has_type (multi_lane 32 4) (imul (swiden_low (and (value_type (multi_lane 16 8)) x)) (swiden_low (and (value_type (multi_lane 16 8)) y))))) (let ((x2 Xmm x) (y2 Xmm y) (lo Xmm (x64_pmullw x2 y2)) (hi Xmm (x64_pmulhw x2 y2))) (x64_punpcklwd lo hi))) ;; Special case for `i64x2.extmul_low_i32x4_s`. (rule (lower (has_type (multi_lane 64 2) (imul (swiden_low (and (value_type (multi_lane 32 4)) x)) (swiden_low (and (value_type (multi_lane 32 4)) y))))) (let ((x2 Xmm (x64_pshufd x 0x50 (OperandSize.Size32))) (y2 Xmm (x64_pshufd y 0x50 (OperandSize.Size32)))) (x64_pmuldq x2 y2))) ;; Special case for `i16x8.extmul_high_i8x16_u`. (rule (lower (has_type (multi_lane 16 8) (imul (uwiden_high (and (value_type (multi_lane 8 16)) x)) (uwiden_high (and (value_type (multi_lane 8 16)) y))))) (let ((x1 Xmm x) (x2 Xmm (x64_palignr x1 x1 8 (OperandSize.Size32))) (x3 Xmm (x64_pmovzxbw x2)) (y1 Xmm y) (y2 Xmm (x64_palignr y1 y1 8 (OperandSize.Size32))) (y3 Xmm (x64_pmovzxbw y2))) (x64_pmullw x3 y3))) ;; Special case for `i32x4.extmul_high_i16x8_u`. (rule (lower (has_type (multi_lane 32 4) (imul (uwiden_high (and (value_type (multi_lane 16 8)) x)) (uwiden_high (and (value_type (multi_lane 16 8)) y))))) (let ((x2 Xmm x) (y2 Xmm y) (lo Xmm (x64_pmullw x2 y2)) (hi Xmm (x64_pmulhuw x2 y2))) (x64_punpckhwd lo hi))) ;; Special case for `i64x2.extmul_high_i32x4_u`. (rule (lower (has_type (multi_lane 64 2) (imul (uwiden_high (and (value_type (multi_lane 32 4)) x)) (uwiden_high (and (value_type (multi_lane 32 4)) y))))) (let ((x2 Xmm (x64_pshufd x 0xFA (OperandSize.Size32))) (y2 Xmm (x64_pshufd y 0xFA (OperandSize.Size32)))) (x64_pmuludq x2 y2))) ;; Special case for `i16x8.extmul_low_i8x16_u`. (rule (lower (has_type (multi_lane 16 8) (imul (uwiden_low (and (value_type (multi_lane 8 16)) x)) (uwiden_low (and (value_type (multi_lane 8 16)) y))))) (let ((x2 Xmm (x64_pmovzxbw x)) (y2 Xmm (x64_pmovzxbw y))) (x64_pmullw x2 y2))) ;; Special case for `i32x4.extmul_low_i16x8_u`. (rule (lower (has_type (multi_lane 32 4) (imul (uwiden_low (and (value_type (multi_lane 16 8)) x)) (uwiden_low (and (value_type (multi_lane 16 8)) y))))) (let ((x2 Xmm x) (y2 Xmm y) (lo Xmm (x64_pmullw x2 y2)) (hi Xmm (x64_pmulhuw x2 y2))) (x64_punpcklwd lo hi))) ;; Special case for `i64x2.extmul_low_i32x4_u`. (rule (lower (has_type (multi_lane 64 2) (imul (uwiden_low (and (value_type (multi_lane 32 4)) x)) (uwiden_low (and (value_type (multi_lane 32 4)) y))))) (let ((x2 Xmm (x64_pshufd x 0x50 (OperandSize.Size32))) (y2 Xmm (x64_pshufd y 0x50 (OperandSize.Size32)))) (x64_pmuludq x2 y2))) ;;;; Rules for `band_not` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (decl sse_and_not (Type Xmm XmmMem) Xmm) (rule (sse_and_not $F32X4 x y) (x64_andnps x y)) (rule (sse_and_not $F64X2 x y) (x64_andnpd x y)) (rule (sse_and_not (multi_lane _bits _lanes) x y) (x64_pandn x y)) ;; Note the flipping of operands below. 
CLIF specifies ;; ;; band_not(x, y) = and(x, not(y)) ;; ;; while x86 does ;; ;; pandn(x, y) = and(not(x), y) (rule (lower (has_type ty (band_not x y))) (sse_and_not ty y x)) ;;;; Rules for `iabs` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (has_type $I8X16 (iabs x))) (x64_pabsb x)) (rule (lower (has_type $I16X8 (iabs x))) (x64_pabsw x)) (rule (lower (has_type $I32X4 (iabs x))) (x64_pabsd x)) ;; When AVX512 is available, we can use a single `vpabsq` instruction. (rule (lower (has_type (and (avx512vl_enabled) (avx512f_enabled) $I64X2) (iabs x))) (x64_vpabsq x)) ;; Otherwise, we use a separate register, `neg`, to contain the results of `0 - ;; x` and then blend in those results with `blendvpd` if the MSB of `neg` was ;; set to 1 (i.e. if `neg` was negative or, conversely, if `x` was originally ;; positive). (rule (lower (has_type $I64X2 (iabs x))) (let ((rx Xmm x) (neg Xmm (x64_psubq (imm $I64X2 0) rx))) (x64_blendvpd neg rx neg))) ;;;; Rules for `fabs` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Special case for `f32x4.abs`. (rule (lower (has_type $F32X4 (fabs x))) (x64_andps x (x64_psrld (vector_all_ones $F32X4) (RegMemImm.Imm 1)))) ;; Special case for `f64x2.abs`. (rule (lower (has_type $F64X2 (fabs x))) (x64_andpd x (x64_psrlq (vector_all_ones $F64X2) (RegMemImm.Imm 1)))) ;;;; Rules for `bnot` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; `i64` and smaller. (rule (lower (has_type (fits_in_64 ty) (bnot x))) (x64_not ty x)) ;; `i128`. (decl i128_not (Value) ValueRegs) (rule (i128_not x) (let ((x_regs ValueRegs x) (x_lo Gpr (value_regs_get_gpr x_regs 0)) (x_hi Gpr (value_regs_get_gpr x_regs 1))) (value_gprs (x64_not $I64 x_lo) (x64_not $I64 x_hi)))) (rule (lower (has_type $I128 (bnot x))) (i128_not x)) (rule (lower (has_type $B128 (bnot x))) (i128_not x)) ;; Special case for vector-types where bit-negation is an xor against an ;; all-one value (rule (lower (has_type ty @ (multi_lane _bits _lanes) (bnot x))) (sse_xor ty x (vector_all_ones ty))) ;;;; Rules for `bitselect` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (has_type ty @ (multi_lane _bits _lanes) (bitselect condition if_true if_false))) ;; a = and if_true, condition ;; b = and_not condition, if_false ;; or b, a (let ((cond_xmm Xmm condition) (a Xmm (sse_and ty if_true cond_xmm)) (b Xmm (sse_and_not ty cond_xmm if_false))) (sse_or ty b a))) ;;;; Rules for `vselect` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (has_type ty @ (multi_lane _bits _lanes) (vselect condition if_true if_false))) (x64_blend ty condition if_true if_false)) ;;;; Rules for `insertlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (insertlane vec @ (value_type ty) val (u8_from_uimm8 idx))) (vec_insert_lane ty vec val idx)) ;; Helper function used below for `insertlane` but also here for other ;; lowerings. ;; ;; Note that the `Type` used here is the type of vector the insertion is ;; happening into, or the type of the first `Reg` argument. 
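;; For example, an `insertlane` on an $F32X4 value arrives here as
;; (vec_insert_lane $F32X4 vec val idx) and selects the `insertps` rule below.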
(decl vec_insert_lane (Type Xmm RegMem u8) Xmm) ;; i8x16.replace_lane (rule (vec_insert_lane $I8X16 vec val idx) (x64_pinsrb vec val idx)) ;; i16x8.replace_lane (rule (vec_insert_lane $I16X8 vec val idx) (x64_pinsrw vec val idx)) ;; i32x4.replace_lane (rule (vec_insert_lane $I32X4 vec val idx) (x64_pinsrd vec val idx (OperandSize.Size32))) ;; i64x2.replace_lane (rule (vec_insert_lane $I64X2 vec val idx) (x64_pinsrd vec val idx (OperandSize.Size64))) ;; f32x4.replace_lane (rule (vec_insert_lane $F32X4 vec val idx) (x64_insertps vec val (sse_insertps_lane_imm idx))) ;; External rust code used to calculate the immediate value to `insertps`. (decl sse_insertps_lane_imm (u8) u8) (extern constructor sse_insertps_lane_imm sse_insertps_lane_imm) ;; f64x2.replace_lane 0 ;; ;; Here the `movsd` instruction is used specifically to specialize moving ;; into the first lane where unlike above cases we're not using the lane ;; immediate as an immediate to the instruction itself. ;; ;; Note, though, the `movsd` has different behavior with respect to the second ;; lane of the f64x2 depending on whether the RegMem operand is a register or ;; memory. When loading from a register `movsd` preserves the upper bits, but ;; when loading from memory it zeros the upper bits. We specifically want to ;; preserve the upper bits so if a `RegMem.Mem` is passed in we need to emit ;; two `movsd` instructions. The first `movsd` (used as `xmm_unary_rm_r`) will ;; load from memory into a temp register and then the second `movsd` (modeled ;; internally as `xmm_rm_r`) will merge the temp register into our `vec` ;; register. (rule (vec_insert_lane $F64X2 vec (RegMem.Reg val) 0) (x64_movsd_regmove vec val)) (rule (vec_insert_lane $F64X2 vec mem 0) (x64_movsd_regmove vec (x64_movsd_load mem))) ;; f64x2.replace_lane 1 ;; ;; Here the `movlhps` instruction is used specifically to specialize moving ;; into the second lane where unlike above cases we're not using the lane ;; immediate as an immediate to the instruction itself. (rule (vec_insert_lane $F64X2 vec val 1) (x64_movlhps vec (reg_mem_to_xmm_mem val))) ;;;; Rules for `imin`, `imax`, `umin`, `umax` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; `i64` and smaller. (decl cmp_and_choose (Type CC Value Value) ValueRegs) (rule (cmp_and_choose (fits_in_64 ty) cc x y) (let ((size OperandSize (raw_operand_size_of_type ty)) ;; We need to put x and y in registers explicitly because ;; we use the values more than once. Hence, even if these ;; are "unique uses" at the CLIF level and would otherwise ;; allow for load-op merging, here we cannot do that. (x_reg Reg x) (y_reg Reg y)) (with_flags_reg (x64_cmp size x_reg y_reg) (cmove ty cc y_reg x_reg)))) (rule (lower (has_type (fits_in_64 ty) (umin x y))) (cmp_and_choose ty (CC.B) x y)) (rule (lower (has_type (fits_in_64 ty) (umax x y))) (cmp_and_choose ty (CC.NB) x y)) (rule (lower (has_type (fits_in_64 ty) (imin x y))) (cmp_and_choose ty (CC.L) x y)) (rule (lower (has_type (fits_in_64 ty) (imax x y))) (cmp_and_choose ty (CC.NL) x y)) ;; SSE `imax`. (rule (lower (has_type $I8X16 (imax x y))) (x64_pmaxsb x y)) (rule (lower (has_type $I16X8 (imax x y))) (x64_pmaxsw x y)) (rule (lower (has_type $I32X4 (imax x y))) (x64_pmaxsd x y)) ;; SSE `imin`. (rule (lower (has_type $I8X16 (imin x y))) (x64_pminsb x y)) (rule (lower (has_type $I16X8 (imin x y))) (x64_pminsw x y)) (rule (lower (has_type $I32X4 (imin x y))) (x64_pminsd x y)) ;; SSE `umax`.
(rule (lower (has_type $I8X16 (umax x y))) (x64_pmaxub x y)) (rule (lower (has_type $I16X8 (umax x y))) (x64_pmaxuw x y)) (rule (lower (has_type $I32X4 (umax x y))) (x64_pmaxud x y)) ;; SSE `umin`. (rule (lower (has_type $I8X16 (umin x y))) (x64_pminub x y)) (rule (lower (has_type $I16X8 (umin x y))) (x64_pminuw x y)) (rule (lower (has_type $I32X4 (umin x y))) (x64_pminud x y)) ;;;; Rules for `trap` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (trap code)) (side_effect (x64_ud2 code))) ;;;; Rules for `resumable_trap` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (resumable_trap code)) (side_effect (x64_ud2 code))) ;;;; Rules for `return` and `fallthrough_return` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; N.B.: the Ret itself is generated by the ABI. (rule (lower (return args)) (lower_return (range 0 (value_slice_len args)) args)) (rule (lower (fallthrough_return args)) (lower_return (range 0 (value_slice_len args)) args)) (decl lower_return (Range ValueSlice) InstOutput) (rule (lower_return (range_empty) _) (output_none)) (rule (lower_return (range_unwrap head tail) args) (let ((_ Unit (copy_to_regs (retval head) (value_slice_get args head)))) (lower_return tail args))) ;;;; Rules for `icmp` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; For GPR-held values we only need to emit `CMP + SETCC`. We rely here on ;; Cranelift's verification that `a` and `b` are of the same type. ;; Unfortunately for clarity, the registers are flipped here (TODO). (rule (lower (icmp cc a @ (value_type (fits_in_64 ty)) b)) (let ((size OperandSize (raw_operand_size_of_type ty))) (with_flags (x64_cmp size b a) (x64_setcc cc)))) ;; For XMM-held values, we lower to `PCMP*` instructions, sometimes more than ;; one. To note: what is different here about the output values is that each ;; lane will be filled with all 1s or all 0s according to the comparison, ;; whereas for GPR-held values, the result will be simply 0 or 1 (upper bits ;; unset). (rule (lower (icmp (IntCC.Equal) a @ (value_type (ty_vec128 ty)) b)) (x64_pcmpeq ty a b)) ;; To lower a not-equals comparison, we perform an equality comparison ;; (PCMPEQ*) and then invert the bits (PXOR with all 1s). (rule (lower (icmp (IntCC.NotEqual) a @ (value_type (ty_vec128 ty)) b)) (let ((checked Xmm (x64_pcmpeq ty a b)) (all_ones Xmm (vector_all_ones ty))) (x64_pxor checked all_ones))) ;; Signed comparisons have a single-instruction lowering, unlike their unsigned ;; counterparts. These latter instructions use the unsigned min/max ;; (PMINU*/PMAXU*) and negate the result (PXOR with all 1s). (rule (lower (icmp (IntCC.SignedGreaterThan) a @ (value_type (ty_vec128 ty)) b)) (x64_pcmpgt ty a b)) (rule (lower (icmp (IntCC.SignedLessThan) a @ (value_type (ty_vec128 ty)) b)) (x64_pcmpgt ty b a)) (rule (lower (icmp (IntCC.UnsignedGreaterThan) a @ (value_type (ty_vec128 ty)) b)) ;; N.B.: we must manually prevent load coalescing of these operands; the ;; register allocator gets confused otherwise. TODO: ;; https://github.com/bytecodealliance/wasmtime/issues/3953. (let ((xmm_a Xmm (put_in_xmm a)) (xmm_b Xmm (put_in_xmm b)) (max Xmm (x64_pmaxu ty xmm_a xmm_b)) (eq Xmm (x64_pcmpeq ty max xmm_b)) (all_ones Xmm (vector_all_ones ty))) (x64_pxor eq all_ones))) (rule (lower (icmp (IntCC.UnsignedLessThan) a @ (value_type (ty_vec128 ty)) b)) ;; N.B.: see note above. 
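;; The identity used below: min_u(a, b) == b exactly when b <= a (unsigned),
;; so comparing the min against `b` and then inverting yields the lanes where
;; a < b.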
(let ((xmm_a Xmm (put_in_xmm a)) (xmm_b Xmm (put_in_xmm b)) (min Xmm (x64_pminu ty xmm_a xmm_b)) (eq Xmm (x64_pcmpeq ty min xmm_b)) (all_ones Xmm (vector_all_ones ty))) (x64_pxor eq all_ones))) ;; To lower signed and unsigned *-or-equals comparisons, we find the minimum ;; number (PMIN[U|S]*) and compare that to one of the terms (PCMPEQ*). Note that ;; there is no 64x2 version of this lowering (see below). (rule (lower (icmp (IntCC.SignedGreaterThanOrEqual) a @ (value_type (ty_vec128 ty)) b)) (let ((max Xmm (x64_pmaxs ty a b))) (x64_pcmpeq ty a max))) (rule (lower (icmp (IntCC.SignedLessThanOrEqual) a @ (value_type (ty_vec128 ty)) b)) (let ((min Xmm (x64_pmins ty a b))) (x64_pcmpeq ty a min))) (rule (lower (icmp (IntCC.UnsignedGreaterThanOrEqual) a @ (value_type (ty_vec128 ty)) b)) (let ((max Xmm (x64_pmaxu ty a b))) (x64_pcmpeq ty a max))) (rule (lower (icmp (IntCC.UnsignedLessThanOrEqual) a @ (value_type (ty_vec128 ty)) b)) (let ((min Xmm (x64_pminu ty a b))) (x64_pcmpeq ty a min))) ;; The PMIN[S|U]Q instruction is only available in AVX512VL/F so we must instead ;; compare with flipped operands (PCMPGT*) and negate the result (PXOR with all ;; 1s), emitting one more instruction than the smaller-lane versions. (rule (lower (icmp (IntCC.SignedGreaterThanOrEqual) a @ (value_type $I64X2) b)) (let ((checked Xmm (x64_pcmpgt $I64X2 b a)) (all_ones Xmm (vector_all_ones $I64X2))) (x64_pxor checked all_ones))) (rule (lower (icmp (IntCC.SignedLessThanOrEqual) a @ (value_type $I64X2) b)) (let ((checked Xmm (x64_pcmpgt $I64X2 a b)) (all_ones Xmm (vector_all_ones $I64X2))) (x64_pxor checked all_ones))) ;; TODO: not used by WebAssembly translation ;; (rule (lower (icmp (IntCC.UnsignedGreaterThanOrEqual) a @ (value_type $I64X2) b)) ;; TODO: not used by WebAssembly translation ;; (rule (lower (icmp (IntCC.UnsignedLessThanOrEqual) a @ (value_type $I64X2) b)) ;; For I128 values (held in two GPRs), the instruction sequences depend on what ;; kind of condition is tested. (rule (lower (icmp (IntCC.Equal) a @ (value_type $I128) b)) (let ((a_lo Gpr (value_regs_get_gpr a 0)) (a_hi Gpr (value_regs_get_gpr a 1)) (b_lo Gpr (value_regs_get_gpr b 0)) (b_hi Gpr (value_regs_get_gpr b 1)) (cmp_lo Reg (with_flags_reg (x64_cmp (OperandSize.Size64) b_lo a_lo) (x64_setcc (CC.Z)))) (cmp_hi Reg (with_flags_reg (x64_cmp (OperandSize.Size64) b_hi a_hi) (x64_setcc (CC.Z)))) ;; At this point, `cmp_lo` and `cmp_hi` contain either 0 or 1 in the ;; lowest 8 bits--`SETcc` guarantees this. The upper bits may be ;; unchanged so we must compare against 1 below; this instruction ;; combines `cmp_lo` and `cmp_hi` for that final comparison. (cmp Reg (x64_and $I64 cmp_lo cmp_hi))) ;; We must compare one more time against the immediate value 1 to ;; check if both `cmp_lo` and `cmp_hi` are true. If `cmp AND 1 == 0` ;; then the `ZF` will be set (see `TEST` definition); if either of ;; the halves `AND`s to 0, they were not equal, therefore we `SETcc` ;; with `NZ`. (with_flags (x64_test (OperandSize.Size64) (RegMemImm.Imm 1) cmp) (x64_setcc (CC.NZ))))) (rule (lower (icmp (IntCC.NotEqual) a @ (value_type $I128) b)) (let ((a_lo Gpr (value_regs_get_gpr a 0)) (a_hi Gpr (value_regs_get_gpr a 1)) (b_lo Gpr (value_regs_get_gpr b 0)) (b_hi Gpr (value_regs_get_gpr b 1)) (cmp_lo Reg (with_flags_reg (x64_cmp (OperandSize.Size64) b_lo a_lo) (x64_setcc (CC.NZ)))) (cmp_hi Reg (with_flags_reg (x64_cmp (OperandSize.Size64) b_hi a_hi) (x64_setcc (CC.NZ)))) ;; See comments for `IntCC.Equal`. 
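;; The only difference from the `Equal` case: the two half-comparisons are
;; combined with `or` rather than `and`, since the values differ if either
;; half differs.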
(cmp Reg (x64_or $I64 cmp_lo cmp_hi))) (with_flags (x64_test (OperandSize.Size64) (RegMemImm.Imm 1) cmp) (x64_setcc (CC.NZ))))) ;; Result = (a_hi <> b_hi) || ;; (a_hi == b_hi && a_lo <> b_lo) (rule (lower (icmp cc a @ (value_type $I128) b)) (if (intcc_neq cc (IntCC.Equal))) (if (intcc_neq cc (IntCC.NotEqual))) (let ((a_lo Gpr (value_regs_get_gpr a 0)) (a_hi Gpr (value_regs_get_gpr a 1)) (b_lo Gpr (value_regs_get_gpr b 0)) (b_hi Gpr (value_regs_get_gpr b 1)) (cmp_hi ValueRegs (with_flags (x64_cmp (OperandSize.Size64) b_hi a_hi) (consumes_flags_concat (x64_setcc (intcc_without_eq cc)) (x64_setcc (CC.Z))))) (cc_hi Reg (value_regs_get cmp_hi 0)) (eq_hi Reg (value_regs_get cmp_hi 1)) (cmp_lo Reg (with_flags_reg (x64_cmp (OperandSize.Size64) b_lo a_lo) (x64_setcc (intcc_unsigned cc)))) (res_lo Reg (x64_and $I64 eq_hi cmp_lo)) (res Reg (x64_or $I64 cc_hi res_lo))) (x64_and $I64 res (RegMemImm.Imm 1)))) ;;;; Rules for `fcmp` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; CLIF's `fcmp` instruction always operates on XMM registers--both scalar and ;; vector. For the scalar versions, we use the flag-setting behavior of the ;; `UCOMIS*` instruction to `SETcc` a 0 or 1 in a GPR register. Note that CLIF's ;; `select` uses the same kind of flag-setting behavior but chooses values other ;; than 0 or 1. ;; ;; Checking the result of `UCOMIS*` is unfortunately difficult in some cases ;; because we do not have `SETcc` instructions that explicitly check ;; simultaneously for the condition (i.e., `eq`, `le`, `gt`, etc.) *and* ;; orderedness. Instead, we must check the flags multiple times. The UCOMIS* ;; documentation (see Intel's Software Developer's Manual, volume 2, chapter 4) ;; is helpful: ;; - unordered assigns Z = 1, P = 1, C = 1 ;; - greater than assigns Z = 0, P = 0, C = 0 ;; - less than assigns Z = 0, P = 0, C = 1 ;; - equal assigns Z = 1, P = 0, C = 0 (rule (lower (fcmp (FloatCC.Equal) a @ (value_type (ty_scalar_float ty)) b)) (let ((maybe ValueRegs (with_flags (x64_ucomis b a) (consumes_flags_concat (x64_setcc (CC.NP)) (x64_setcc (CC.Z))))) (maybe_np Gpr (value_regs_get_gpr maybe 0)) (maybe_z Gpr (value_regs_get_gpr maybe 1))) (x64_and $I32 maybe_np maybe_z))) (rule (lower (fcmp (FloatCC.NotEqual) a @ (value_type (ty_scalar_float ty)) b)) (let ((maybe ValueRegs (with_flags (x64_ucomis b a) (consumes_flags_concat (x64_setcc (CC.P)) (x64_setcc (CC.NZ))))) (maybe_p Gpr (value_regs_get_gpr maybe 0)) (maybe_nz Gpr (value_regs_get_gpr maybe 1))) (x64_or $I32 maybe_p maybe_nz))) ;; Some scalar lowerings correspond to one condition code. 
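;; As a worked example of reading the flag table above: `CC.NBE` requires
;; C = 0 AND Z = 0, which only the "greater than" row satisfies, so the
;; `GreaterThan` lowering below needs a single `SETcc` and naturally excludes
;; the unordered case.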
(rule (lower (fcmp (FloatCC.Ordered) a @ (value_type (ty_scalar_float ty)) b)) (with_flags (x64_ucomis b a) (x64_setcc (CC.NP)))) (rule (lower (fcmp (FloatCC.Unordered) a @ (value_type (ty_scalar_float ty)) b)) (with_flags (x64_ucomis b a) (x64_setcc (CC.P)))) (rule (lower (fcmp (FloatCC.OrderedNotEqual) a @ (value_type (ty_scalar_float ty)) b)) (with_flags (x64_ucomis b a) (x64_setcc (CC.NZ)))) (rule (lower (fcmp (FloatCC.UnorderedOrEqual) a @ (value_type (ty_scalar_float ty)) b)) (with_flags (x64_ucomis b a) (x64_setcc (CC.Z)))) (rule (lower (fcmp (FloatCC.GreaterThan) a @ (value_type (ty_scalar_float ty)) b)) (with_flags (x64_ucomis b a) (x64_setcc (CC.NBE)))) (rule (lower (fcmp (FloatCC.GreaterThanOrEqual) a @ (value_type (ty_scalar_float ty)) b)) (with_flags (x64_ucomis b a) (x64_setcc (CC.NB)))) (rule (lower (fcmp (FloatCC.UnorderedOrLessThan) a @ (value_type (ty_scalar_float ty)) b)) (with_flags (x64_ucomis b a) (x64_setcc (CC.B)))) (rule (lower (fcmp (FloatCC.UnorderedOrLessThanOrEqual) a @ (value_type (ty_scalar_float ty)) b)) (with_flags (x64_ucomis b a) (x64_setcc (CC.BE)))) ;; Other scalar lowerings are made possible by flipping the operands and ;; reversing the condition code. (rule (lower (fcmp (FloatCC.LessThan) a @ (value_type (ty_scalar_float ty)) b)) ;; Same flags as `GreaterThan`. (with_flags (x64_ucomis a b) (x64_setcc (CC.NBE)))) (rule (lower (fcmp (FloatCC.LessThanOrEqual) a @ (value_type (ty_scalar_float ty)) b)) ;; Same flags as `GreaterThanOrEqual`. (with_flags (x64_ucomis a b) (x64_setcc (CC.NB)))) (rule (lower (fcmp (FloatCC.UnorderedOrGreaterThan) a @ (value_type (ty_scalar_float ty)) b)) ;; Same flags as `UnorderedOrLessThan`. (with_flags (x64_ucomis a b) (x64_setcc (CC.B)))) (rule (lower (fcmp (FloatCC.UnorderedOrGreaterThanOrEqual) a @ (value_type (ty_scalar_float ty)) b)) ;; Same flags as `UnorderedOrLessThanOrEqual`. (with_flags (x64_ucomis a b) (x64_setcc (CC.BE)))) ;; For vector lowerings, we use `CMPP*` instructions with a 3-bit operand that ;; determines the comparison to make. Note that comparisons that succeed will ;; fill the lane with 1s; comparisons that do not will fill the lane with 0s. (rule (lower (fcmp (FloatCC.Equal) a @ (value_type (ty_vec128 ty)) b)) (x64_cmpp ty a b (FcmpImm.Equal))) (rule (lower (fcmp (FloatCC.NotEqual) a @ (value_type (ty_vec128 ty)) b)) (x64_cmpp ty a b (FcmpImm.NotEqual))) (rule (lower (fcmp (FloatCC.LessThan) a @ (value_type (ty_vec128 ty)) b)) (x64_cmpp ty a b (FcmpImm.LessThan))) (rule (lower (fcmp (FloatCC.LessThanOrEqual) a @ (value_type (ty_vec128 ty)) b)) (x64_cmpp ty a b (FcmpImm.LessThanOrEqual))) (rule (lower (fcmp (FloatCC.Ordered) a @ (value_type (ty_vec128 ty)) b)) (x64_cmpp ty a b (FcmpImm.Ordered))) (rule (lower (fcmp (FloatCC.Unordered) a @ (value_type (ty_vec128 ty)) b)) (x64_cmpp ty a b (FcmpImm.Unordered))) (rule (lower (fcmp (FloatCC.UnorderedOrGreaterThan) a @ (value_type (ty_vec128 ty)) b)) (x64_cmpp ty a b (FcmpImm.UnorderedOrGreaterThan))) (rule (lower (fcmp (FloatCC.UnorderedOrGreaterThanOrEqual) a @ (value_type (ty_vec128 ty)) b)) (x64_cmpp ty a b (FcmpImm.UnorderedOrGreaterThanOrEqual))) ;; Some vector lowerings rely on flipping the operands and using a reversed ;; comparison code. 
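;; For example, `a > b` holds exactly when `b < a`, so `GreaterThan` below is
;; emitted as a `LessThan` comparison with the operands swapped (and likewise
;; for the other three codes).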
(rule (lower (fcmp (FloatCC.GreaterThan) a @ (value_type (ty_vec128 ty)) b)) (x64_cmpp ty b a (FcmpImm.LessThan))) (rule (lower (fcmp (FloatCC.GreaterThanOrEqual) a @ (value_type (ty_vec128 ty)) b)) (x64_cmpp ty b a (FcmpImm.LessThanOrEqual))) (rule (lower (fcmp (FloatCC.UnorderedOrLessThan) a @ (value_type (ty_vec128 ty)) b)) (x64_cmpp ty b a (FcmpImm.UnorderedOrGreaterThan))) (rule (lower (fcmp (FloatCC.UnorderedOrLessThanOrEqual) a @ (value_type (ty_vec128 ty)) b)) (x64_cmpp ty b a (FcmpImm.UnorderedOrGreaterThanOrEqual))) ;; Some vector lowerings are simply not supported for certain codes: ;; - FloatCC::OrderedNotEqual ;; - FloatCC::UnorderedOrEqual ;;;; Rules for `select` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; CLIF `select` instructions receive a testable argument (i.e. boolean or ;; integer) that determines which of the other two arguments is selected as ;; output. Since Cranelift booleans are typically generated by a comparison, the ;; lowerings in this section "look upwards in the tree" to emit the proper ;; sequence of "selection" instructions. ;; ;; The following rules--for selecting on a floating-point comparison--emit a ;; `UCOMIS*` instruction and then a conditional move, `cmove`. Note that for ;; values contained in XMM registers, `cmove` and `cmove_or` may in fact emit a ;; jump sequence, not `CMOV`. The `cmove` instruction operates on the flags set ;; by `UCOMIS*`; the key to understanding these is the UCOMIS* documentation ;; (see Intel's Software Developer's Manual, volume 2, chapter 4): ;; - unordered assigns Z = 1, P = 1, C = 1 ;; - greater than assigns Z = 0, P = 0, C = 0 ;; - less than assigns Z = 0, P = 0, C = 1 ;; - equal assigns Z = 1, P = 0, C = 0 ;; ;; Note that prefixing the flag with `N` means "not," so that `CC.P -> P = 1` ;; and `CC.NP -> P = 0`. Also, x86 uses mnemonics for certain combinations of ;; flags; e.g.: ;; - `CC.B -> C = 1` (below) ;; - `CC.NB -> C = 0` (not below) ;; - `CC.BE -> C = 1 OR Z = 1` (below or equal) ;; - `CC.NBE -> C = 0 AND Z = 0` (not below or equal) (rule (lower (has_type ty (select (fcmp (FloatCC.Ordered) a b) x y))) (with_flags (x64_ucomis b a) (cmove_from_values ty (CC.NP) x y))) (rule (lower (has_type ty (select (fcmp (FloatCC.Unordered) a b) x y))) (with_flags (x64_ucomis b a) (cmove_from_values ty (CC.P) x y))) (rule (lower (has_type ty (select (fcmp (FloatCC.GreaterThan) a b) x y))) (with_flags (x64_ucomis b a) (cmove_from_values ty (CC.NBE) x y))) (rule (lower (has_type ty (select (fcmp (FloatCC.GreaterThanOrEqual) a b) x y))) (with_flags (x64_ucomis b a) (cmove_from_values ty (CC.NB) x y))) (rule (lower (has_type ty (select (fcmp (FloatCC.UnorderedOrLessThan) a b) x y))) (with_flags (x64_ucomis b a) (cmove_from_values ty (CC.B) x y))) (rule (lower (has_type ty (select (fcmp (FloatCC.UnorderedOrLessThanOrEqual) a b) x y))) (with_flags (x64_ucomis b a) (cmove_from_values ty (CC.BE) x y))) ;; Certain FloatCC variants are implemented by flipping the operands of the ;; comparison (e.g., "greater than" is lowered the same as "less than" but the ;; comparison is reversed). This allows us to use a single flag for the `cmove`, ;; which involves fewer instructions than `cmove_or`. ;; ;; But why flip at all, you may ask? Can't we just use `CC.B` (i.e., below) for ;; `FloatCC.LessThan`? Recall that in these floating-point lowerings, values may ;; be unordered and we want to express that `FloatCC.LessThan` is `LT`, ;; not `LT | UNO`.
;; By flipping the operands AND inverting the comparison (e.g.,
;; to `CC.NBE`), we also avoid these unordered cases.

(rule (lower (has_type ty (select (fcmp (FloatCC.LessThan) a b) x y)))
      (with_flags (x64_ucomis a b) (cmove_from_values ty (CC.NBE) x y)))
(rule (lower (has_type ty (select (fcmp (FloatCC.LessThanOrEqual) a b) x y)))
      (with_flags (x64_ucomis a b) (cmove_from_values ty (CC.NB) x y)))
(rule (lower (has_type ty (select (fcmp (FloatCC.UnorderedOrGreaterThan) a b) x y)))
      (with_flags (x64_ucomis a b) (cmove_from_values ty (CC.B) x y)))
(rule (lower (has_type ty (select (fcmp (FloatCC.UnorderedOrGreaterThanOrEqual) a b) x y)))
      (with_flags (x64_ucomis a b) (cmove_from_values ty (CC.BE) x y)))

;; `FloatCC.Equal` and `FloatCC.NotEqual` can only be implemented with multiple
;; flag checks. Recall from the flag assignment chart above that equality, e.g.,
;; will assign `Z = 1`. But so does an unordered comparison: `Z = 1, P = 1, C =
;; 1`. In order to avoid semantics like `EQ | UNO` for equality, we must ensure
;; that the values are actually ordered, checking that `P = 0` (note that the
;; `C` flag is irrelevant here). Since we cannot find a single instruction that
;; implements a `Z = 1 AND P = 0` check, we invert the flag checks (i.e., `Z = 1
;; AND P = 0` becomes `Z = 0 OR P = 1`) and also flip the select operands, `x`
;; and `y`. The same argument applies to `FloatCC.NotEqual`.
;;
;; More details about the CLIF semantics for `fcmp` are available at
;; https://docs.rs/cranelift-codegen/latest/cranelift_codegen/ir/trait.InstBuilder.html#method.fcmp.

(rule (lower (has_type ty (select (fcmp (FloatCC.Equal) a b) x y)))
      (with_flags (x64_ucomis a b) (cmove_or_from_values ty (CC.NZ) (CC.P) y x)))
(rule (lower (has_type ty (select (fcmp (FloatCC.NotEqual) a b) x y)))
      (with_flags (x64_ucomis a b) (cmove_or_from_values ty (CC.NZ) (CC.P) x y)))

;; We can also lower `select`s that depend on an `icmp` test, but more simply
;; than the `fcmp` variants above. In these cases, we lower to a `CMP`
;; instruction plus a `CMOV`; recall that `cmove_from_values` here may emit more
;; than one instruction for certain types (e.g., XMM-held, I128).
(rule (lower (has_type ty (select (icmp cc a @ (value_type (fits_in_64 a_ty)) b) x y)))
      (let ((size OperandSize (raw_operand_size_of_type a_ty)))
        (with_flags (x64_cmp size b a) (cmove_from_values ty cc x y))))

;; Finally, we lower `select` from a condition value `c`. These rules are meant
;; to be the final, default lowerings if no other patterns matched above.
(rule (lower (has_type ty (select c @ (value_type $B1) x y)))
      (let ((size OperandSize (raw_operand_size_of_type $B1))
            ;; N.B.: disallow load-op fusion, see above. TODO:
            ;; https://github.com/bytecodealliance/wasmtime/issues/3953.
            (gpr_c Gpr (put_in_gpr c)))
        (with_flags (x64_test size (RegMemImm.Imm 1) gpr_c)
                    (cmove_from_values ty (CC.NZ) x y))))
(rule (lower (has_type ty (select c @ (value_type (fits_in_64 a_ty)) x y)))
      (let ((size OperandSize (raw_operand_size_of_type a_ty))
            ;; N.B.: disallow load-op fusion, see above. TODO:
            ;; https://github.com/bytecodealliance/wasmtime/issues/3953.
            (gpr_c Gpr (put_in_gpr c)))
        (with_flags (x64_test size gpr_c gpr_c)
                    (cmove_from_values ty (CC.NZ) x y))))

;; Rules for `clz` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; If available, we can use a plain lzcnt instruction here. Note no
;; special handling is required for zero inputs, because the machine
;; instruction does what the CLIF expects for zero, i.e. it returns
;; the operand width in bits (all bits count as leading zeros).
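;;
;; A small check of that zero-input behavior, as an illustration only (C with
;; the LZCNT intrinsic; not part of the lowering):
;;
;;   #include <immintrin.h>
;;   // LZCNT of zero yields the operand width (here 64), which is exactly the
;;   // result CLIF's `clz` requires for a zero input.
;;   int lzcnt_zero_matches_clif(void) {
;;       return _lzcnt_u64(0) == 64;
;;   }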
(rule 1 (lower (has_type (and (ty_32_or_64 ty) (use_lzcnt)) (clz src)))
        (x64_lzcnt ty src))

(rule (lower (has_type (ty_32_or_64 ty) (clz src)))
      (do_clz ty ty src))

(rule (lower (has_type (ty_8_or_16 ty) (clz src)))
      (do_clz $I32 ty (extend_to_gpr src $I32 (ExtendKind.Zero))))

(rule (lower (has_type $I128 (clz src)))
      (let ((upper Gpr (do_clz $I64 $I64 (value_regs_get_gpr src 1)))
            (lower Gpr (x64_add $I64
                                (do_clz $I64 $I64 (value_regs_get_gpr src 0))
                                (RegMemImm.Imm 64)))
            (result_lo Gpr (with_flags_reg
                             (x64_cmp_imm (OperandSize.Size64) 64 upper)
                             (cmove $I64 (CC.NZ) upper lower))))
        (value_regs result_lo (imm $I64 0))))

;; Implementation helper for clz; operates on 32 or 64-bit units.
(decl do_clz (Type Type Gpr) Gpr)
(rule (do_clz ty orig_ty src)
      (let ((highest_bit_index Reg (bsr_or_else ty src (imm_i64 $I64 -1)))
            (bits_minus_1 Reg (imm ty (u64_sub (ty_bits_u64 orig_ty) 1))))
        (x64_sub ty bits_minus_1 highest_bit_index)))

;; Rules for `ctz` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Analogous to `clz` cases above, but using mirror instructions
;; (tzcnt vs lzcnt, bsf vs bsr).

(rule 1 (lower (has_type (and (ty_32_or_64 ty) (use_bmi1)) (ctz src)))
        (x64_tzcnt ty src))

(rule (lower (has_type (ty_32_or_64 ty) (ctz src)))
      (do_ctz ty ty src))

(rule (lower (has_type (ty_8_or_16 ty) (ctz src)))
      (do_ctz $I32 ty (extend_to_gpr src $I32 (ExtendKind.Zero))))

(rule (lower (has_type $I128 (ctz src)))
      (let ((lower Gpr (do_ctz $I64 $I64 (value_regs_get_gpr src 0)))
            (upper Gpr (x64_add $I64
                                (do_ctz $I64 $I64 (value_regs_get_gpr src 1))
                                (RegMemImm.Imm 64)))
            (result_lo Gpr (with_flags_reg
                             (x64_cmp_imm (OperandSize.Size64) 64 lower)
                             (cmove $I64 (CC.Z) upper lower))))
        (value_regs result_lo (imm $I64 0))))

(decl do_ctz (Type Type Gpr) Gpr)
(rule (do_ctz ty orig_ty src)
      (bsf_or_else ty src (imm $I64 (ty_bits_u64 orig_ty))))

;; Rules for `popcnt` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule 1 (lower (has_type (and (ty_32_or_64 ty) (use_popcnt)) (popcnt src)))
        (x64_popcnt ty src))

(rule 1 (lower (has_type (and (ty_8_or_16 ty) (use_popcnt)) (popcnt src)))
        (x64_popcnt $I32 (extend_to_gpr src $I32 (ExtendKind.Zero))))

(rule 1 (lower (has_type (and $I128 (use_popcnt)) (popcnt src)))
        (let ((lo_count Gpr (x64_popcnt $I64 (value_regs_get_gpr src 0)))
              (hi_count Gpr (x64_popcnt $I64 (value_regs_get_gpr src 1))))
          (value_regs (x64_add $I64 lo_count hi_count) (imm $I64 0))))

(rule (lower (has_type (ty_32_or_64 ty) (popcnt src)))
      (do_popcnt ty src))

(rule (lower (has_type (ty_8_or_16 ty) (popcnt src)))
      (do_popcnt $I32 (extend_to_gpr src $I32 (ExtendKind.Zero))))

(rule (lower (has_type $I128 (popcnt src)))
      (let ((lo_count Gpr (do_popcnt $I64 (value_regs_get_gpr src 0)))
            (hi_count Gpr (do_popcnt $I64 (value_regs_get_gpr src 1))))
        (value_regs (x64_add $I64 lo_count hi_count) (imm $I64 0))))

;; Implementation of popcount when we don't have a native popcount
;; instruction.
(decl do_popcnt (Type Gpr) Gpr)
(rule (do_popcnt $I64 src)
      (let ((shifted1 Gpr (x64_shr $I64 src (Imm8Reg.Imm8 1)))
            (sevens Gpr (imm $I64 0x7777777777777777))
            (masked1 Gpr (x64_and $I64 shifted1 sevens))
            ;; diff1 := src - ((src >> 1) & 0b0111_0111_0111...)
            (diff1 Gpr (x64_sub $I64 src masked1))
            (shifted2 Gpr (x64_shr $I64 masked1 (Imm8Reg.Imm8 1)))
            (masked2 Gpr (x64_and $I64 shifted2 sevens))
            ;; diff2 := diff1 - ((diff1 >> 1) & 0b0111_0111_0111...)
(diff2 Gpr (x64_sub $I64 diff1 masked2)) (shifted3 Gpr (x64_shr $I64 masked2 (Imm8Reg.Imm8 1))) (masked3 Gpr (x64_and $I64 shifted3 sevens)) ;; diff3 := diff2 - ((diff2 >> 1) & 0b0111_0111_0111...) ;; ;; At this point, each nibble of diff3 is the popcount of ;; that nibble. This works because at each step above, we ;; are basically subtracting floor(value / 2) from the ;; running value; the leftover remainder is 1 if the LSB ;; was 1. After three steps, we have (nibble / 8) -- 0 or ;; 1 for the MSB of the nibble -- plus three possible ;; additions for the three other bits. (diff3 Gpr (x64_sub $I64 diff2 masked3)) ;; Add the two nibbles of each byte together. (sum1 Gpr (x64_add $I64 (x64_shr $I64 diff3 (Imm8Reg.Imm8 4)) diff3)) ;; Mask the above sum to have the popcount for each byte ;; in the lower nibble of that byte. (ofof Gpr (imm $I64 0x0f0f0f0f0f0f0f0f)) (masked4 Gpr (x64_and $I64 sum1 ofof)) (ones Gpr (imm $I64 0x0101010101010101)) ;; Use a multiply to sum all of the bytes' popcounts into ;; the top byte. Consider the binomial expansion for the ;; top byte: it is the sum of the bytes (masked4 >> 56) * ;; 0x01 + (masked4 >> 48) * 0x01 + (masked4 >> 40) * 0x01 ;; + ... + (masked4 >> 0). (mul Gpr (x64_mul $I64 masked4 ones)) ;; Now take that top byte and return it as the popcount. (final Gpr (x64_shr $I64 mul (Imm8Reg.Imm8 56)))) final)) ;; This is the 32-bit version of the above; the steps for each nibble ;; are the same, we just use constants half as wide. (rule (do_popcnt $I32 src) (let ((shifted1 Gpr (x64_shr $I32 src (Imm8Reg.Imm8 1))) (sevens Gpr (imm $I32 0x77777777)) (masked1 Gpr (x64_and $I32 shifted1 sevens)) (diff1 Gpr (x64_sub $I32 src masked1)) (shifted2 Gpr (x64_shr $I32 masked1 (Imm8Reg.Imm8 1))) (masked2 Gpr (x64_and $I32 shifted2 sevens)) (diff2 Gpr (x64_sub $I32 diff1 masked2)) (shifted3 Gpr (x64_shr $I32 masked2 (Imm8Reg.Imm8 1))) (masked3 Gpr (x64_and $I32 shifted3 sevens)) (diff3 Gpr (x64_sub $I32 diff2 masked3)) (sum1 Gpr (x64_add $I32 (x64_shr $I32 diff3 (Imm8Reg.Imm8 4)) diff3)) (masked4 Gpr (x64_and $I32 sum1 (RegMemImm.Imm 0x0f0f0f0f))) (mul Gpr (x64_mul $I32 masked4 (RegMemImm.Imm 0x01010101))) (final Gpr (x64_shr $I32 mul (Imm8Reg.Imm8 24)))) final)) (rule 1 (lower (has_type (and $I8X16 (avx512vl_enabled) (avx512bitalg_enabled)) (popcnt src))) (x64_vpopcntb src)) ;; For SSE 4.2 we use Mula's algorithm (https://arxiv.org/pdf/1611.07612.pdf): ;; ;; __m128i count_bytes ( __m128i v) { ;; __m128i lookup = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4); ;; __m128i low_mask = _mm_set1_epi8 (0x0f); ;; __m128i lo = _mm_and_si128 (v, low_mask); ;; __m128i hi = _mm_and_si128 (_mm_srli_epi16 (v, 4), low_mask); ;; __m128i cnt1 = _mm_shuffle_epi8 (lookup, lo); ;; __m128i cnt2 = _mm_shuffle_epi8 (lookup, hi); ;; return _mm_add_epi8 (cnt1, cnt2); ;; } ;; ;; Details of the above algorithm can be found in the reference noted above, but the basics ;; are to create a lookup table that pre populates the popcnt values for each number [0,15]. ;; The algorithm uses shifts to isolate 4 bit sections of the vector, pshufb as part of the ;; lookup process, and adds together the results. 
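;;
;; A scalar model of the same lookup idea, as an illustration only (plain C;
;; the helper below is hypothetical and not used by this lowering). `PSHUFB`
;; simply performs this per-nibble table lookup for all 16 bytes in parallel.
;;
;;   #include <stdint.h>
;;   static const uint8_t NIBBLE_POPCNT[16] =
;;       {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
;;   static inline int popcnt8(uint8_t b) {
;;       return NIBBLE_POPCNT[b & 0x0f] + NIBBLE_POPCNT[b >> 4];
;;   }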
;; ;; __m128i lookup = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4); (decl popcount_4bit_table () VCodeConstant) ;; bits-per-nibble table `lookup` above (extern constructor popcount_4bit_table popcount_4bit_table) (decl popcount_low_mask () VCodeConstant) ;; mask for low nibbles: 0x0f * 16 (extern constructor popcount_low_mask popcount_low_mask) (rule (lower (has_type $I8X16 (popcnt src))) (let ((nibble_table_const VCodeConstant (popcount_4bit_table)) (low_mask Xmm (x64_xmm_load_const $I8X16 (popcount_low_mask))) (low_nibbles Xmm (sse_and $I8X16 src low_mask)) ;; Note that this is a 16x8 shift, but that's OK; we mask ;; off anything that traverses from one byte to the next ;; with the low_mask below. (shifted_src Xmm (x64_psrlw src (RegMemImm.Imm 4))) (high_nibbles Xmm (sse_and $I8X16 shifted_src low_mask)) (lookup Xmm (x64_xmm_load_const $I8X16 (popcount_4bit_table))) (bit_counts_low Xmm (x64_pshufb lookup low_nibbles)) (bit_counts_high Xmm (x64_pshufb lookup high_nibbles))) (x64_paddb bit_counts_low bit_counts_high))) ;; Rules for `bitrev` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (has_type $I8 (bitrev src))) (do_bitrev8 $I32 src)) (rule (lower (has_type $I16 (bitrev src))) (do_bitrev16 $I32 src)) (rule (lower (has_type $I32 (bitrev src))) (do_bitrev32 $I32 src)) (rule (lower (has_type $I64 (bitrev src))) (do_bitrev64 $I64 src)) (rule (lower (has_type $I128 (bitrev src))) (value_regs (do_bitrev64 $I64 (value_regs_get_gpr src 1)) (do_bitrev64 $I64 (value_regs_get_gpr src 0)))) (decl do_bitrev8 (Type Gpr) Gpr) (rule (do_bitrev8 ty src) (let ((tymask u64 (ty_mask ty)) (mask1 Gpr (imm ty (u64_and tymask 0x5555555555555555))) (lo1 Gpr (x64_and ty src mask1)) (hi1 Gpr (x64_and ty (x64_shr ty src (Imm8Reg.Imm8 1)) mask1)) (swap1 Gpr (x64_or ty (x64_shl ty lo1 (Imm8Reg.Imm8 1)) hi1)) (mask2 Gpr (imm ty (u64_and tymask 0x3333333333333333))) (lo2 Gpr (x64_and ty swap1 mask2)) (hi2 Gpr (x64_and ty (x64_shr ty swap1 (Imm8Reg.Imm8 2)) mask2)) (swap2 Gpr (x64_or ty (x64_shl ty lo2 (Imm8Reg.Imm8 2)) hi2)) (mask4 Gpr (imm ty (u64_and tymask 0x0f0f0f0f0f0f0f0f))) (lo4 Gpr (x64_and ty swap2 mask4)) (hi4 Gpr (x64_and ty (x64_shr ty swap2 (Imm8Reg.Imm8 4)) mask4)) (swap4 Gpr (x64_or ty (x64_shl ty lo4 (Imm8Reg.Imm8 4)) hi4))) swap4)) (decl do_bitrev16 (Type Gpr) Gpr) (rule (do_bitrev16 ty src) (let ((src_ Gpr (do_bitrev8 ty src)) (tymask u64 (ty_mask ty)) (mask8 Gpr (imm ty (u64_and tymask 0x00ff00ff00ff00ff))) (lo8 Gpr (x64_and ty src_ mask8)) (hi8 Gpr (x64_and ty (x64_shr ty src_ (Imm8Reg.Imm8 8)) mask8)) (swap8 Gpr (x64_or ty (x64_shl ty lo8 (Imm8Reg.Imm8 8)) hi8))) swap8)) (decl do_bitrev32 (Type Gpr) Gpr) (rule (do_bitrev32 ty src) (let ((src_ Gpr (do_bitrev16 ty src)) (tymask u64 (ty_mask ty)) (mask16 Gpr (imm ty (u64_and tymask 0x0000ffff0000ffff))) (lo16 Gpr (x64_and ty src_ mask16)) (hi16 Gpr (x64_and ty (x64_shr ty src_ (Imm8Reg.Imm8 16)) mask16)) (swap16 Gpr (x64_or ty (x64_shl ty lo16 (Imm8Reg.Imm8 16)) hi16))) swap16)) (decl do_bitrev64 (Type Gpr) Gpr) (rule (do_bitrev64 ty @ $I64 src) (let ((src_ Gpr (do_bitrev32 ty src)) (mask32 Gpr (imm ty 0xffffffff)) (lo32 Gpr (x64_and ty src_ mask32)) (hi32 Gpr (x64_shr ty src_ (Imm8Reg.Imm8 32))) (swap32 Gpr (x64_or ty (x64_shl ty lo32 (Imm8Reg.Imm8 32)) hi32))) swap32)) ;; Rules for `is_null` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Null references are represented by the constant value `0`. 
(rule (lower (is_null src @ (value_type $R64)))
      (with_flags (x64_cmp_imm (OperandSize.Size64) 0 src)
                  (x64_setcc (CC.Z))))

;; Rules for `is_invalid` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Invalid references are represented by the constant value `-1`.

(rule (lower (is_invalid src @ (value_type $R64)))
      (with_flags (x64_cmp_imm (OperandSize.Size64) 0xffffffff src) ;; simm32 0xffff_ffff is sign-extended to -1.
                  (x64_setcc (CC.Z))))

;; Rules for `uextend` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; T -> T is a no-op.
(rule (lower (has_type ty (uextend src @ (value_type ty))))
      src)

;; I64 -> I128.
(rule (lower (has_type $I128 (uextend src @ (value_type $I64))))
      (value_regs src (imm $I64 0)))

;; I{8,16,32} -> I128.
(rule (lower (has_type $I128 (uextend src @ (value_type (fits_in_32 src_ty)))))
      (value_regs (extend_to_gpr src $I64 (ExtendKind.Zero)) (imm $I64 0)))

;; I{8,16,32} -> I64.
(rule (lower (has_type $I64 (uextend src @ (value_type (fits_in_32 src_ty)))))
      (extend_to_gpr src $I64 (ExtendKind.Zero)))

;; I8 -> I{16,32}, I16 -> I32.
(rule (lower (has_type (fits_in_32 dst_ty) (uextend src @ (value_type (fits_in_32 src_ty)))))
      (extend_to_gpr src $I32 (ExtendKind.Zero)))

;; I32 -> I64 with op that produces a zero-extended value in a register.
;;
;; As a particular x64 extra-pattern matching opportunity, all the ALU
;; opcodes on 32-bits will zero-extend the upper 32-bits, so we don't
;; even need to generate a zero-extending move in this case.
;;
;; (Note that we unfortunately can't factor out the
;; insts-that-zero-upper-32 pattern into a separate extractor until we
;; can write internal extractors with multiple rules; and we'd rather
;; keep these here than write an external extractor containing bits of
;; the instruction patterns.)
(rule (lower (has_type $I64 (uextend src @ (has_type $I32 (iadd _ _))))) src)
(rule (lower (has_type $I64 (uextend src @ (has_type $I32 (iadd_ifcout _ _))))) src)
(rule (lower (has_type $I64 (uextend src @ (has_type $I32 (isub _ _))))) src)
(rule (lower (has_type $I64 (uextend src @ (has_type $I32 (imul _ _))))) src)
(rule (lower (has_type $I64 (uextend src @ (has_type $I32 (band _ _))))) src)
(rule (lower (has_type $I64 (uextend src @ (has_type $I32 (bor _ _))))) src)
(rule (lower (has_type $I64 (uextend src @ (has_type $I32 (bxor _ _))))) src)
(rule (lower (has_type $I64 (uextend src @ (has_type $I32 (ishl _ _))))) src)
(rule (lower (has_type $I64 (uextend src @ (has_type $I32 (ushr _ _))))) src)
(rule (lower (has_type $I64 (uextend src @ (has_type $I32 (uload32 _ _ _))))) src)

;; Rules for `sextend` / `bextend` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(decl generic_sextend (Value Type Type) InstOutput)

;; T -> T is a no-op.
(rule (generic_sextend src ty ty) src)

;; Produce upper 64 bits sign-extended from lower 64: shift right by
;; 63 bits to spread the sign bit across the result.
(decl spread_sign_bit (Gpr) Gpr)
(rule (spread_sign_bit src)
      (x64_sar $I64 src (Imm8Reg.Imm8 63)))

;; I64 -> I128.
(rule (generic_sextend src (ty_int_bool_64 _) (ty_int_bool_128 _))
      (value_regs src (spread_sign_bit src)))

;; I{8,16,32} -> I128.
(rule (generic_sextend src (fits_in_32 src_ty) (ty_int_bool_128 _))
      (let ((lo Gpr (extend_to_gpr src $I64 (ExtendKind.Sign)))
            (hi Gpr (spread_sign_bit lo)))
        (value_regs lo hi)))

;; I{8,16,32} -> I64.
(rule (generic_sextend src (fits_in_32 src_ty) (ty_int_bool_64 _))
      (extend_to_gpr src $I64 (ExtendKind.Sign)))

;; I8 -> I{16,32}, I16 -> I32.
(rule (generic_sextend src (fits_in_32 src_ty) (fits_in_32 dst_ty)) (extend_to_gpr src $I32 (ExtendKind.Sign))) (rule (lower (has_type dst_ty (sextend src @ (value_type src_ty)))) (generic_sextend src src_ty dst_ty)) ;; Bools are stored as 0/-1 so extends must sign-extend as well. (rule (lower (has_type dst_ty (bextend src @ (value_type src_ty)))) (generic_sextend src src_ty dst_ty)) ;; Rules for `ireduce` / `breduce` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; T -> T is always a no-op, even I128 -> I128. (rule (lower (has_type ty (ireduce src @ (value_type ty)))) src) ;; T -> I{64,32,16,8}: We can simply pass through the value: values ;; are always stored with high bits undefined, so we can just leave ;; them be. (rule (lower (has_type (fits_in_64 ty) (ireduce src))) (value_regs_get_gpr src 0)) ;; Likewise for breduce. (rule (lower (has_type ty (breduce src @ (value_type ty)))) src) (rule (lower (has_type (fits_in_64 ty) (breduce src))) (value_regs_get_gpr src 0)) ;; Rules for `bint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Booleans are stored as all-zeroes (0) or all-ones (-1). We AND out ;; the LSB to give a 0 / 1-valued integer result. (rule (lower (has_type (fits_in_64 ty) (bint src))) (x64_and ty src (RegMemImm.Imm 1))) (rule (lower (has_type $I128 (bint src))) (value_regs (x64_and $I64 src (RegMemImm.Imm 1)) (imm $I64 0))) ;; Rules for `debugtrap` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (debugtrap)) (side_effect (x64_hlt))) ;; Rules for `widening_pairwise_dot_product_s` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (has_type $I32X4 (widening_pairwise_dot_product_s x y))) (x64_pmaddwd x y)) ;; Rules for `fadd` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; N.B.: there are no load-op merging rules here. We can't guarantee ;; the RHS (if a load) is 128-bit aligned, so we must avoid merging a ;; load. Likewise for other ops below. 
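;;
;; Illustration of the alignment constraint, as a sketch only (C with SSE
;; intrinsics; assumes legacy SSE encodings, i.e. no AVX): a memory operand
;; folded directly into `ADDPS` must be 16-byte aligned, so an arbitrary CLIF
;; load has to stay a separate unaligned `MOVUPS` rather than being merged into
;; the arithmetic instruction.
;;
;;   #include <xmmintrin.h>
;;   __m128 add_from_unaligned(__m128 x, const float *p) {
;;       __m128 y = _mm_loadu_ps(p); // explicit unaligned load (MOVUPS)
;;       return _mm_add_ps(x, y);    // register-register ADDPS
;;   }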
(rule (lower (has_type $F32 (fadd x y))) (x64_addss x y)) (rule (lower (has_type $F64 (fadd x y))) (x64_addsd x y)) (rule (lower (has_type $F32X4 (fadd x y))) (x64_addps x y)) (rule (lower (has_type $F64X2 (fadd x y))) (x64_addpd x y)) ;; Rules for `fsub` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (has_type $F32 (fsub x y))) (x64_subss x y)) (rule (lower (has_type $F64 (fsub x y))) (x64_subsd x y)) (rule (lower (has_type $F32X4 (fsub x y))) (x64_subps x y)) (rule (lower (has_type $F64X2 (fsub x y))) (x64_subpd x y)) ;; Rules for `fmul` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (has_type $F32 (fmul x y))) (x64_mulss x y)) (rule (lower (has_type $F64 (fmul x y))) (x64_mulsd x y)) (rule (lower (has_type $F32X4 (fmul x y))) (x64_mulps x y)) (rule (lower (has_type $F64X2 (fmul x y))) (x64_mulpd x y)) ;; Rules for `fdiv` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (has_type $F32 (fdiv x y))) (x64_divss x y)) (rule (lower (has_type $F64 (fdiv x y))) (x64_divsd x y)) (rule (lower (has_type $F32X4 (fdiv x y))) (x64_divps x y)) (rule (lower (has_type $F64X2 (fdiv x y))) (x64_divpd x y)) ;; Rules for `sqrt` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (has_type $F32 (sqrt x))) (x64_sqrtss x)) (rule (lower (has_type $F64 (sqrt x))) (x64_sqrtsd x)) (rule (lower (has_type $F32X4 (sqrt x))) (x64_sqrtps x)) (rule (lower (has_type $F64X2 (sqrt x))) (x64_sqrtpd x)) ;; Rules for `fpromote` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (has_type $F64 (fpromote x))) (x64_cvtss2sd x)) ;; Rules for `fvpromote` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (has_type $F64X2 (fvpromote_low x))) (x64_cvtps2pd (put_in_xmm x))) ;; Rules for `fdemote` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (has_type $F32 (fdemote x))) (x64_cvtsd2ss x)) ;; Rules for `fvdemote` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (has_type $F32X4 (fvdemote x))) (x64_cvtpd2ps x)) ;; Rules for `fmin` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (has_type $F32 (fmin x y))) (xmm_min_max_seq $F32 $true x y)) (rule (lower (has_type $F64 (fmin x y))) (xmm_min_max_seq $F64 $true x y)) ;; Vector-typed version. We don't use single pseudoinstructions as ;; above, because we don't need to generate a mini-CFG. Instead, we ;; perform a branchless series of operations. ;; ;; We cannot simply use native min instructions (minps, minpd) because ;; NaN handling is different per CLIF semantics than on ;; x86. Specifically, if an argument is NaN, or the arguments are both ;; zero but of opposite signs, then the x86 instruction always ;; produces the second argument. However, per CLIF semantics, we ;; require that fmin(NaN, _) = fmin(_, NaN) = NaN, and fmin(+0, -0) = ;; fmin(-0, +0) = -0. (rule (lower (has_type $F32X4 (fmin x y))) ;; Compute min(x, y) and min(y, x) with native ;; instructions. These will differ in one of the edge cases ;; above that we have to handle properly. (Conversely, if they ;; don't differ, then the native instruction's answer is the ;; right one per CLIF semantics.) (let ((min1 Xmm (x64_minps x y)) (min2 Xmm (x64_minps y x)) ;; Compute the OR of the two. Note that NaNs have an ;; exponent field of all-ones (0xFF for F32), so if either ;; result is a NaN, this OR will be. 
            ;; And if either is a zero (which has an exponent of 0 and
            ;; mantissa of 0), this captures a sign-bit of 1 (negative)
            ;; if either input is negative.
            ;;
            ;; In the case where we don't have a +/-0 mismatch or
            ;; NaNs, then `min1` and `min2` are equal and `min_or` is
            ;; the correct minimum.
            (min_or Xmm (x64_orps min1 min2))
            ;; "compare unordered" produces a true mask (all ones) in
            ;; a given lane if the min is a NaN. We use this to
            ;; generate a mask to ensure quiet NaNs.
            (is_nan_mask Xmm (x64_cmpps min_or min2 (FcmpImm.Unordered)))
            ;; OR in the NaN mask.
            (min_or_2 Xmm (x64_orps min_or is_nan_mask))
            ;; Shift the NaN mask down so that it covers just the
            ;; fraction below the NaN signalling bit; we'll use this
            ;; to mask off non-canonical NaN payloads.
            ;;
            ;; All-ones for NaN, shifted down to leave 10 top bits (1
            ;; sign, 8 exponent, 1 QNaN bit that must remain set)
            ;; cleared.
            (nan_fraction_mask Xmm (x64_psrld is_nan_mask (RegMemImm.Imm 10)))
            ;; Do a NAND, so that we retain every bit not set in
            ;; `nan_fraction_mask`. This mask will be all zeroes (so
            ;; we retain every bit) in non-NaN cases, and will have
            ;; ones (so we clear those bits) in NaN-payload bits
            ;; otherwise.
            (final Xmm (x64_andnps nan_fraction_mask min_or_2)))
        final))

;; Likewise for F64 lanes, except that the right-shift is by 13 bits
;; (1 sign, 11 exponent, 1 QNaN bit).
(rule (lower (has_type $F64X2 (fmin x y)))
      (let ((min1 Xmm (x64_minpd x y))
            (min2 Xmm (x64_minpd y x))
            (min_or Xmm (x64_orpd min1 min2))
            (is_nan_mask Xmm (x64_cmppd min1 min2 (FcmpImm.Unordered)))
            (min_or_2 Xmm (x64_orpd min_or is_nan_mask))
            (nan_fraction_mask Xmm (x64_psrlq is_nan_mask (RegMemImm.Imm 13)))
            (final Xmm (x64_andnpd nan_fraction_mask min_or_2)))
        final))

;; Rules for `fmax` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type $F32 (fmax x y)))
      (xmm_min_max_seq $F32 $false x y))
(rule (lower (has_type $F64 (fmax x y)))
      (xmm_min_max_seq $F64 $false x y))

;; The vector version of fmax here is a dual to the fmin sequence
;; above, almost, with a few differences.

(rule (lower (has_type $F32X4 (fmax x y)))
      ;; Compute max(x, y) and max(y, x) with native
      ;; instructions. These will differ in one of the edge cases
      ;; above that we have to handle properly. (Conversely, if they
      ;; don't differ, then the native instruction's answer is the
      ;; right one per CLIF semantics.)
      (let ((max1 Xmm (x64_maxps x y))
            (max2 Xmm (x64_maxps y x))
            ;; Compute the XOR of the two maxima. In the case
            ;; where we don't have a +/-0 mismatch or NaNs, then
            ;; `max1` and `max2` are equal and this XOR is zero.
            (max_xor Xmm (x64_xorps max1 max2))
            ;; OR the XOR into one of the original maxima. If they are
            ;; equal, this does nothing. If max2 was NaN, its exponent
            ;; bits were all-ones, so the xor's exponent bits were the
            ;; complement of max1, and the OR of max1 and max_xor has
            ;; an all-ones exponent (is a NaN). If max1 was NaN, then
            ;; its exponent bits were already all-ones, so the OR will
            ;; be a NaN as well.
            (max_blended_nan Xmm (x64_orps max1 max_xor))
            ;; Subtract the XOR. This ensures that if we had +0 and
            ;; -0, we end up with +0.
            (max_blended_nan_positive Xmm (x64_subps max_blended_nan max_xor))
            ;; "compare unordered" produces a true mask (all ones) in
            ;; a given lane if the max is a NaN. We use this to
            ;; generate a mask to ensure quiet NaNs.
            (is_nan_mask Xmm (x64_cmpps max_blended_nan max_blended_nan (FcmpImm.Unordered)))
            ;; Shift the NaN mask down so that it covers just the
            ;; fraction below the NaN signalling bit; we'll use this
            ;; to mask off non-canonical NaN payloads.
            ;;
            ;; All-ones for NaN, shifted down to leave 10 top bits (1
            ;; sign, 8 exponent, 1 QNaN bit that must remain set)
            ;; cleared.
            (nan_fraction_mask Xmm (x64_psrld is_nan_mask (RegMemImm.Imm 10)))
            ;; Do a NAND, so that we retain every bit not set in
            ;; `nan_fraction_mask`. This mask will be all zeroes (so
            ;; we retain every bit) in non-NaN cases, and will have
            ;; ones (so we clear those bits) in NaN-payload bits
            ;; otherwise.
            (final Xmm (x64_andnps nan_fraction_mask max_blended_nan_positive)))
        final))

(rule (lower (has_type $F64X2 (fmax x y)))
      ;; Compute max(x, y) and max(y, x) with native
      ;; instructions. These will differ in one of the edge cases
      ;; above that we have to handle properly. (Conversely, if they
      ;; don't differ, then the native instruction's answer is the
      ;; right one per CLIF semantics.)
      (let ((max1 Xmm (x64_maxpd x y))
            (max2 Xmm (x64_maxpd y x))
            ;; Compute the XOR of the two maxima. In the case
            ;; where we don't have a +/-0 mismatch or NaNs, then
            ;; `max1` and `max2` are equal and this XOR is zero.
            (max_xor Xmm (x64_xorpd max1 max2))
            ;; OR the XOR into one of the original maxima. If they are
            ;; equal, this does nothing. If max2 was NaN, its exponent
            ;; bits were all-ones, so the xor's exponent bits were the
            ;; complement of max1, and the OR of max1 and max_xor has
            ;; an all-ones exponent (is a NaN). If max1 was NaN, then
            ;; its exponent bits were already all-ones, so the OR will
            ;; be a NaN as well.
            (max_blended_nan Xmm (x64_orpd max1 max_xor))
            ;; Subtract the XOR. This ensures that if we had +0 and
            ;; -0, we end up with +0.
            (max_blended_nan_positive Xmm (x64_subpd max_blended_nan max_xor))
            ;; `cmppd` with predicate index `3` is `cmpunordpd`, or
            ;; "compare unordered": it produces a true mask (all ones)
            ;; in a given lane if the max is a NaN. We use this to
            ;; generate a mask to ensure quiet NaNs.
            (is_nan_mask Xmm (x64_cmppd max_blended_nan max_blended_nan (FcmpImm.Unordered)))
            ;; Shift the NaN mask down so that it covers just the
            ;; fraction below the NaN signalling bit; we'll use this
            ;; to mask off non-canonical NaN payloads.
            ;;
            ;; All-ones for NaN, shifted down to leave 13 top bits (1
            ;; sign, 11 exponent, 1 QNaN bit that must remain set)
            ;; cleared.
            (nan_fraction_mask Xmm (x64_psrlq is_nan_mask (RegMemImm.Imm 13)))
            ;; Do a NAND, so that we retain every bit not set in
            ;; `nan_fraction_mask`. This mask will be all zeroes (so
            ;; we retain every bit) in non-NaN cases, and will have
            ;; ones (so we clear those bits) in NaN-payload bits
            ;; otherwise.
(final Xmm (x64_andnpd nan_fraction_mask max_blended_nan_positive))) final)) ;; Rules for `fmin_pseudo` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (has_type $F32 (fmin_pseudo x y))) (x64_minss y x)) (rule (lower (has_type $F64 (fmin_pseudo x y))) (x64_minsd y x)) (rule (lower (has_type $F32X4 (fmin_pseudo x y))) (x64_minps y x)) (rule (lower (has_type $F64X2 (fmin_pseudo x y))) (x64_minpd y x)) ;; Rules for `fmax_pseudo` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (has_type $F32 (fmax_pseudo x y))) (x64_maxss y x)) (rule (lower (has_type $F64 (fmax_pseudo x y))) (x64_maxsd y x)) (rule (lower (has_type $F32X4 (fmax_pseudo x y))) (x64_maxps y x)) (rule (lower (has_type $F64X2 (fmax_pseudo x y))) (x64_maxpd y x)) ;; Rules for `fma` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (has_type $F32X4 (fma x y z))) (x64_vfmadd213ps x y z)) (rule (lower (has_type $F64X2 (fma x y z))) (x64_vfmadd213pd x y z)) ;; Rules for `load*` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; In order to load a value from memory to a GPR register, we may need to extend ;; the loaded value from 8-, 16-, or 32-bits to this backend's expected GPR ;; width: 64 bits. Note that `ext_mode` will load 1-bit types (booleans) as ;; 8-bit loads. ;; ;; By default, we zero-extend all sub-64-bit loads to a GPR. (rule (lower (has_type (and (fits_in_32 ty) (is_gpr_type _)) (load flags address offset))) (x64_movzx (ext_mode (ty_bits_u16 ty) 64) (to_amode flags address offset))) ;; But if we know that both the `from` and `to` are 64 bits, we simply load with ;; no extension. (rule (lower (has_type (ty_int_bool_ref_64 ty) (load flags address offset))) (x64_mov (to_amode flags address offset))) ;; Also, certain scalar loads have a specific `from` width and extension kind ;; (signed -> `sx`, zeroed -> `zx`). We overwrite the high bits of the 64-bit ;; GPR even if the `to` type is smaller (e.g., 16-bits). (rule (lower (has_type (is_gpr_type ty) (uload8 flags address offset))) (x64_movzx (ExtMode.BQ) (to_amode flags address offset))) (rule (lower (has_type (is_gpr_type ty) (sload8 flags address offset))) (x64_movsx (ExtMode.BQ) (to_amode flags address offset))) (rule (lower (has_type (is_gpr_type ty) (uload16 flags address offset))) (x64_movzx (ExtMode.WQ) (to_amode flags address offset))) (rule (lower (has_type (is_gpr_type ty) (sload16 flags address offset))) (x64_movsx (ExtMode.WQ) (to_amode flags address offset))) (rule (lower (has_type (is_gpr_type ty) (uload32 flags address offset))) (x64_movzx (ExtMode.LQ) (to_amode flags address offset))) (rule (lower (has_type (is_gpr_type ty) (sload32 flags address offset))) (x64_movsx (ExtMode.LQ) (to_amode flags address offset))) ;; To load to XMM registers, we use the x64-specific instructions for each type. ;; For `$F32` and `$F64` this is important--we only want to load 32 or 64 bits. ;; But for the 128-bit types, this is not strictly necessary for performance but ;; might help with clarity during disassembly. 
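;;
;; For example (illustration only, C with SSE intrinsics): a scalar `$F32` load
;; via `MOVSS` reads exactly 4 bytes and zeroes the upper lanes of the XMM
;; destination, so no bytes beyond the loaded value are touched.
;;
;;   #include <xmmintrin.h>
;;   __m128 load_f32_scalar(const float *p) {
;;       return _mm_load_ss(p); // MOVSS xmm, m32
;;   }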
(rule (lower (has_type $F32 (load flags address offset))) (x64_movss_load (to_amode flags address offset))) (rule (lower (has_type $F64 (load flags address offset))) (x64_movsd_load (to_amode flags address offset))) (rule (lower (has_type $F32X4 (load flags address offset))) (x64_movups (to_amode flags address offset))) (rule (lower (has_type $F64X2 (load flags address offset))) (x64_movupd (to_amode flags address offset))) (rule (lower (has_type (ty_vec128 ty) (load flags address offset))) (x64_movdqu (to_amode flags address offset))) ;; We can load an I128/B128 by doing two 64-bit loads. (rule (lower (has_type (ty_int_bool_128 _) (load flags address offset))) (let ((addr_lo Amode (to_amode flags address offset)) (addr_hi Amode (amode_offset addr_lo 8)) (value_lo Reg (x64_mov addr_lo)) (value_hi Reg (x64_mov addr_hi))) (value_regs value_lo value_hi))) ;; We also include widening vector loads; these sign- or zero-extend each lane ;; to the next wider width (e.g., 16x4 -> 32x4). (rule (lower (has_type $I16X8 (sload8x8 flags address offset))) (x64_pmovsxbw (to_amode flags address offset))) (rule (lower (has_type $I16X8 (uload8x8 flags address offset))) (x64_pmovzxbw (to_amode flags address offset))) (rule (lower (has_type $I32X4 (sload16x4 flags address offset))) (x64_pmovsxwd (to_amode flags address offset))) (rule (lower (has_type $I32X4 (uload16x4 flags address offset))) (x64_pmovzxwd (to_amode flags address offset))) (rule (lower (has_type $I64X2 (sload32x2 flags address offset))) (x64_pmovsxdq (to_amode flags address offset))) (rule (lower (has_type $I64X2 (uload32x2 flags address offset))) (x64_pmovzxdq (to_amode flags address offset))) ;; Rules for `store*` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; 8-, 16-, 32- and 64-bit GPR stores. (rule (lower (store flags value @ (value_type (is_gpr_type ty)) address offset)) (side_effect (x64_movrm ty (to_amode flags address offset) value))) ;; Explicit 8/16/32-bit opcodes. (rule (lower (istore8 flags value address offset)) (side_effect (x64_movrm $I8 (to_amode flags address offset) value))) (rule (lower (istore16 flags value address offset)) (side_effect (x64_movrm $I16 (to_amode flags address offset) value))) (rule (lower (istore32 flags value address offset)) (side_effect (x64_movrm $I32 (to_amode flags address offset) value))) ;; F32 stores of values in XMM registers. (rule (lower (store flags value @ (value_type $F32) address offset)) (side_effect (x64_xmm_movrm (SseOpcode.Movss) (to_amode flags address offset) value))) ;; F64 stores of values in XMM registers. (rule (lower (store flags value @ (value_type $F64) address offset)) (side_effect (x64_xmm_movrm (SseOpcode.Movsd) (to_amode flags address offset) value))) ;; Stores of F32X4 vectors. (rule (lower (store flags value @ (value_type $F32X4) address offset)) (side_effect (x64_xmm_movrm (SseOpcode.Movups) (to_amode flags address offset) value))) ;; Stores of F64X2 vectors. (rule (lower (store flags value @ (value_type $F64X2) address offset)) (side_effect (x64_xmm_movrm (SseOpcode.Movupd) (to_amode flags address offset) value))) ;; Stores of all other 128-bit vector types with integer lanes. (rule (lower (store flags value @ (value_type (ty_vec128_int _)) address offset)) (side_effect (x64_xmm_movrm (SseOpcode.Movdqu) (to_amode flags address offset) value))) ;; Stores of I128/B128 values: store the two 64-bit halves separately. 
(rule (lower (store flags value @ (value_type (ty_int_bool_128 _)) address offset)) (let ((value_reg ValueRegs value) (value_lo Gpr (value_regs_get_gpr value_reg 0)) (value_hi Gpr (value_regs_get_gpr value_reg 1)) (addr_lo Amode (to_amode flags address offset)) (addr_hi Amode (amode_offset addr_lo 8))) (side_effect (side_effect_concat (x64_movrm $I64 addr_lo value_lo) (x64_movrm $I64 addr_hi value_hi))))) ;; Rules for `load*` + ALU op + `store*` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Add mem, reg (rule (lower (store flags (has_type (ty_32_or_64 ty) (iadd (and (sinkable_load sink) (load flags addr offset)) src2)) addr offset)) (let ((_ RegMemImm (sink_load sink))) (side_effect (x64_add_mem ty (to_amode flags addr offset) src2)))) ;; Add mem, reg with args swapped (rule (lower (store flags (has_type (ty_32_or_64 ty) (iadd src2 (and (sinkable_load sink) (load flags addr offset)))) addr offset)) (let ((_ RegMemImm (sink_load sink))) (side_effect (x64_add_mem ty (to_amode flags addr offset) src2)))) ;; Sub mem, reg (rule (lower (store flags (has_type (ty_32_or_64 ty) (isub (and (sinkable_load sink) (load flags addr offset)) src2)) addr offset)) (let ((_ RegMemImm (sink_load sink))) (side_effect (x64_sub_mem ty (to_amode flags addr offset) src2)))) ;; And mem, reg (rule (lower (store flags (has_type (ty_32_or_64 ty) (band (and (sinkable_load sink) (load flags addr offset)) src2)) addr offset)) (let ((_ RegMemImm (sink_load sink))) (side_effect (x64_and_mem ty (to_amode flags addr offset) src2)))) ;; And mem, reg with args swapped (rule (lower (store flags (has_type (ty_32_or_64 ty) (band src2 (and (sinkable_load sink) (load flags addr offset)))) addr offset)) (let ((_ RegMemImm (sink_load sink))) (side_effect (x64_and_mem ty (to_amode flags addr offset) src2)))) ;; Or mem, reg (rule (lower (store flags (has_type (ty_32_or_64 ty) (bor (and (sinkable_load sink) (load flags addr offset)) src2)) addr offset)) (let ((_ RegMemImm (sink_load sink))) (side_effect (x64_or_mem ty (to_amode flags addr offset) src2)))) ;; Or mem, reg with args swapped (rule (lower (store flags (has_type (ty_32_or_64 ty) (bor src2 (and (sinkable_load sink) (load flags addr offset)))) addr offset)) (let ((_ RegMemImm (sink_load sink))) (side_effect (x64_or_mem ty (to_amode flags addr offset) src2)))) ;; Xor mem, reg (rule (lower (store flags (has_type (ty_32_or_64 ty) (bxor (and (sinkable_load sink) (load flags addr offset)) src2)) addr offset)) (let ((_ RegMemImm (sink_load sink))) (side_effect (x64_xor_mem ty (to_amode flags addr offset) src2)))) ;; Xor mem, reg with args swapped (rule (lower (store flags (has_type (ty_32_or_64 ty) (bxor src2 (and (sinkable_load sink) (load flags addr offset)))) addr offset)) (let ((_ RegMemImm (sink_load sink))) (side_effect (x64_xor_mem ty (to_amode flags addr offset) src2)))) ;; Rules for `fence` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (fence)) (side_effect (x64_mfence))) ;; Rules for `func_addr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (func_addr (func_ref_data _ extname _))) (load_ext_name extname 0)) ;; Rules for `symbol_value` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (symbol_value (symbol_value_data extname _ offset))) (load_ext_name extname offset)) ;; Rules for `atomic_load` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; This is a normal load. 
The x86-TSO memory model provides sufficient ;; sequencing to satisfy the CLIF synchronisation requirements for `AtomicLoad` ;; without the need for any fence instructions. ;; ;; As described in the `atomic_load` documentation, this lowering is only valid ;; for I8, I16, I32, and I64. The sub-64-bit types are zero extended, as with a ;; normal load. (rule (lower (has_type $I64 (atomic_load flags address))) (x64_mov (to_amode flags address (zero_offset)))) (rule (lower (has_type (and (fits_in_32 ty) (ty_int _)) (atomic_load flags address))) (x64_movzx (ext_mode (ty_bits_u16 ty) 64) (to_amode flags address (zero_offset)))) ;; Rules for `atomic_store` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; This is a normal store followed by an `mfence` instruction. As described in ;; the `atomic_load` documentation, this lowering is only valid for I8, I16, ;; I32, and I64. (rule (lower (atomic_store flags value @ (value_type (and (fits_in_64 ty) (ty_int _))) address)) (side_effect (side_effect_concat (x64_movrm ty (to_amode flags address (zero_offset)) value) (x64_mfence)))) ;; Rules for `atomic_cas` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (has_type (and (fits_in_64 ty) (ty_int _)) (atomic_cas flags address expected replacement))) (x64_cmpxchg ty expected replacement (to_amode flags address (zero_offset)))) ;; Rules for `atomic_rmw` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; This is a simple, general-case atomic update, based on a loop involving ;; `cmpxchg`. Note that we could do much better than this in the case where the ;; old value at the location (that is to say, the SSA `Value` computed by this ;; CLIF instruction) is not required. In that case, we could instead implement ;; this using a single `lock`-prefixed x64 read-modify-write instruction. Also, ;; even in the case where the old value is required, for the `add` and `sub` ;; cases, we can use the single instruction `lock xadd`. However, those ;; improvements have been left for another day. TODO: filed as ;; https://github.com/bytecodealliance/wasmtime/issues/2153. (rule (lower (has_type (and (fits_in_64 ty) (ty_int _)) (atomic_rmw flags op address input))) (x64_atomic_rmw_seq ty op (to_amode flags address (zero_offset)) input)) ;; Rules for `call` and `call_indirect` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (call (func_ref_data sig_ref extname dist) inputs)) (gen_call sig_ref extname dist inputs)) (rule (lower (call_indirect sig_ref val inputs)) (gen_call_indirect sig_ref val inputs))
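;;
;; For reference, the general shape of the `cmpxchg`-based retry loop used by
;; the `atomic_rmw` lowering above, modelled in C as a sketch only (the helper
;; name is hypothetical, and `+` stands in for whichever `AtomicRmwOp` is being
;; lowered; the CLIF result is the old memory value).
;;
;;   #include <stdint.h>
;;   #include <stdbool.h>
;;   uint64_t atomic_rmw_add_model(uint64_t *addr, uint64_t input) {
;;       uint64_t old = *addr;                   // initial load
;;       for (;;) {
;;           uint64_t replacement = old + input; // apply the operation
;;           if (__atomic_compare_exchange_n(addr, &old, replacement,
;;                                           /*weak=*/false,
;;                                           __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST))
;;               return old;                     // success: return previous value
;;           // a failed compare-exchange refreshed `old`; retry
;;       }
;;   }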