1470 lines
59 KiB
Common Lisp
1470 lines
59 KiB
Common Lisp
;; x86-64 instruction selection and CLIF-to-MachInst lowering.
|
|
|
|
;; Entry point of instruction lowering: given a CLIF `Inst`, produce the
;; register(s) that hold the result values of the lowered instruction.
(decl lower (Inst) ValueRegs)
|
|
|
|
;;;; Rules for `iconst` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
;; Integer constants of 64 bits or fewer: materialize the immediate into a
;; single register of the result type.
(rule (lower (has_type (fits_in_64 ty) (iconst (u64_from_imm64 bits))))
      (value_reg (imm ty bits)))

;; `i128` constants: the 64-bit immediate becomes the first (low) register and
;; the second (high) register is a zero constant.
(rule (lower (has_type $I128 (iconst (u64_from_imm64 bits))))
      (value_regs (imm $I64 bits) (imm $I64 0)))
|
|
|
|
;;;; Rules for `bconst` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
;; Boolean constants of 64 bits or fewer: `false` lowers to the immediate 0
;; and `true` to the immediate 1, in a single register of the result type.
(rule (lower (has_type (fits_in_64 ty) (bconst $false)))
      (value_reg (imm ty 0)))

(rule (lower (has_type (fits_in_64 ty) (bconst $true)))
      (value_reg (imm ty 1)))

;; `b128` constants: a pair of `$B64` registers. The first register carries
;; the 0/1 value; the second register is always a zero constant.
(rule (lower (has_type $B128 (bconst $false)))
      (value_regs (imm $B64 0) (imm $B64 0)))

(rule (lower (has_type $B128 (bconst $true)))
      (value_regs (imm $B64 1) (imm $B64 0)))
|
|
|
|
;;;; Rules for `f32const` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
;; Materialize the `u64` extracted from the 32-bit float constant as an `$F32`
;; immediate.
(rule (lower (f32const (u64_from_ieee32 bits)))
      (value_reg (imm $F32 bits)))
|
|
|
|
;;;; Rules for `f64const` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
;; Materialize the `u64` extracted from the 64-bit float constant as an `$F64`
;; immediate.
(rule (lower (f64const (u64_from_ieee64 bits)))
      (value_reg (imm $F64 bits)))
|
|
|
|
;;;; Rules for `null` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
;; `null` lowers to a zero immediate of the instruction's result type.
(rule (lower (has_type null_ty (null)))
      (value_reg (imm null_ty 0)))
|
|
|
|
;;;; Rules for `iadd` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
;; `i64` and smaller.

;; Register + register.
(rule (lower (has_type (fits_in_64 ty) (iadd lhs rhs)))
      (value_reg (add ty (put_in_reg lhs) (RegMemImm.Reg (put_in_reg rhs)))))

;; Register + immediate. The immediate may appear on either side of the
;; `iadd`; in both cases the other operand goes into a register.
(rule (lower (has_type (fits_in_64 ty) (iadd lhs (simm32_from_value rhs_imm))))
      (value_reg (add ty (put_in_reg lhs) rhs_imm)))

(rule (lower (has_type (fits_in_64 ty) (iadd (simm32_from_value lhs_imm) rhs)))
      (value_reg (add ty (put_in_reg rhs) lhs_imm)))

;; Register + memory. A sinkable load on either side is sunk into the `add`
;; as its memory operand.
(rule (lower (has_type (fits_in_64 ty) (iadd lhs (sinkable_load rhs_load))))
      (value_reg (add ty (put_in_reg lhs) (sink_load rhs_load))))

(rule (lower (has_type (fits_in_64 ty) (iadd (sinkable_load lhs_load) rhs)))
      (value_reg (add ty (put_in_reg rhs) (sink_load lhs_load))))
|
|
|
|
;; SSE: one packed-add instruction per lane shape.

;; 8x16 lanes: `paddb`.
(rule (lower (has_type (multi_lane 8 16) (iadd a b)))
      (value_xmm (paddb (put_in_xmm a) (put_in_xmm_mem b))))

;; 16x8 lanes: `paddw`.
(rule (lower (has_type (multi_lane 16 8) (iadd a b)))
      (value_xmm (paddw (put_in_xmm a) (put_in_xmm_mem b))))

;; 32x4 lanes: `paddd`.
(rule (lower (has_type (multi_lane 32 4) (iadd a b)))
      (value_xmm (paddd (put_in_xmm a) (put_in_xmm_mem b))))

;; 64x2 lanes: `paddq`.
(rule (lower (has_type (multi_lane 64 2) (iadd a b)))
      (value_xmm (paddq (put_in_xmm a) (put_in_xmm_mem b))))
|
|
|
|
;; `i128`: split both operands into their low (index 0) and high (index 1)
;; registers, add the low halves with an add that produces flags, then add
;; the high halves with an add-with-carry that consumes them.
(rule (lower (has_type $I128 (iadd lhs rhs)))
      (let ((lhs_regs ValueRegs (put_in_regs lhs))
            (lhs_lo Reg (value_regs_get lhs_regs 0))
            (lhs_hi Reg (value_regs_get lhs_regs 1))
            (rhs_regs ValueRegs (put_in_regs rhs))
            (rhs_lo Reg (value_regs_get rhs_regs 0))
            (rhs_hi Reg (value_regs_get rhs_regs 1)))
        (with_flags (add_with_flags $I64 lhs_lo (RegMemImm.Reg rhs_lo))
                    (adc $I64 lhs_hi (RegMemImm.Reg rhs_hi)))))
|
|
|
|
;;;; Rules for `sadd_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
;; 8x16 lanes: `paddsb`.
(rule (lower (has_type (multi_lane 8 16) (sadd_sat a b)))
      (value_xmm (paddsb (put_in_xmm a) (put_in_xmm_mem b))))

;; 16x8 lanes: `paddsw`.
(rule (lower (has_type (multi_lane 16 8) (sadd_sat a b)))
      (value_xmm (paddsw (put_in_xmm a) (put_in_xmm_mem b))))
|
|
|
|
;;;; Rules for `uadd_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
;; 8x16 lanes: `paddusb`.
(rule (lower (has_type (multi_lane 8 16) (uadd_sat a b)))
      (value_xmm (paddusb (put_in_xmm a) (put_in_xmm_mem b))))

;; 16x8 lanes: `paddusw`.
(rule (lower (has_type (multi_lane 16 8) (uadd_sat a b)))
      (value_xmm (paddusw (put_in_xmm a) (put_in_xmm_mem b))))
|
|
|
|
;;;; Rules for `iadd_ifcout` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
;; N.B.: the second result of `iadd_ifcout` is nominally the `iflags` value
;; holding the carry. The plan is to replace that with a bool carry flag, and
;; every consumer of `iflags` still lives in the handwritten pattern-matching
;; code, where it matches explicitly on the flags producer. For now it is
;; therefore enough to allocate a second temporary so that the reg-renaming
;; code does the right thing. For safety, it is asserted elsewhere that nobody
;; actually reads the register assigned to the `iflags`-typed SSA `Value`.

;; Register + register.
(rule (lower (has_type (fits_in_64 ty) (iadd_ifcout lhs rhs)))
      (let ((flags_tmp Reg (writable_reg_to_reg (temp_writable_reg $I64))))
        (value_regs (add ty (put_in_reg lhs) (RegMemImm.Reg (put_in_reg rhs)))
                    flags_tmp)))

;; Register + immediate, with the immediate on either side.
(rule (lower (has_type (fits_in_64 ty)
                       (iadd_ifcout lhs (simm32_from_value rhs_imm))))
      (let ((flags_tmp Reg (writable_reg_to_reg (temp_writable_reg $I64))))
        (value_regs (add ty (put_in_reg lhs) rhs_imm)
                    flags_tmp)))

(rule (lower (has_type (fits_in_64 ty)
                       (iadd_ifcout (simm32_from_value lhs_imm) rhs)))
      (let ((flags_tmp Reg (writable_reg_to_reg (temp_writable_reg $I64))))
        (value_regs (add ty (put_in_reg rhs) lhs_imm)
                    flags_tmp)))

;; Register + memory, with the sinkable load on either side.
(rule (lower (has_type (fits_in_64 ty)
                       (iadd_ifcout lhs (sinkable_load rhs_load))))
      (let ((flags_tmp Reg (writable_reg_to_reg (temp_writable_reg $I64))))
        (value_regs (add ty (put_in_reg lhs) (sink_load rhs_load))
                    flags_tmp)))

(rule (lower (has_type (fits_in_64 ty)
                       (iadd_ifcout (sinkable_load lhs_load) rhs)))
      (let ((flags_tmp Reg (writable_reg_to_reg (temp_writable_reg $I64))))
        (value_regs (add ty (put_in_reg rhs) (sink_load lhs_load))
                    flags_tmp)))
|
|
|
|
;; (No `iadd_ifcout` for `i128`.)
|
|
|
|
;;;; Rules for `iadd_imm` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
;; `i64` and smaller.

;; Immediate representable as a `RegMemImm.Imm`: use it directly as the second
;; operand of the `add`.
(rule (lower (has_type (fits_in_64 ty)
                       (iadd_imm val (simm32_from_imm64 small_imm))))
      (value_reg (add ty (put_in_reg val) small_imm)))

;; Any other immediate: materialize it into a register first.
(rule (lower (has_type (fits_in_64 ty)
                       (iadd_imm val (u64_from_imm64 wide_imm))))
      (value_reg (add ty
                      (put_in_reg val)
                      (RegMemImm.Reg (imm ty wide_imm)))))
|
|
|
|
;; `i128`: the immediate is added to the low half with a flag-producing add,
;; and the high half gets an add-with-carry of the immediate 0.

;; Immediate representable as a `RegMemImm.Imm`: use it directly.
(rule (lower (has_type $I128 (iadd_imm val (simm32_from_imm64 small_imm))))
      (let ((val_regs ValueRegs (put_in_regs val))
            (val_lo Reg (value_regs_get val_regs 0))
            (val_hi Reg (value_regs_get val_regs 1)))
        (with_flags (add_with_flags $I64 val_lo small_imm)
                    (adc $I64 val_hi (RegMemImm.Imm 0)))))

;; Any other immediate: materialize it into a 64-bit register first.
(rule (lower (has_type $I128 (iadd_imm val (u64_from_imm64 wide_imm))))
      (let ((val_regs ValueRegs (put_in_regs val))
            (val_lo Reg (value_regs_get val_regs 0))
            (val_hi Reg (value_regs_get val_regs 1))
            (imm_lo Reg (imm $I64 wide_imm)))
        (with_flags (add_with_flags $I64 val_lo (RegMemImm.Reg imm_lo))
                    (adc $I64 val_hi (RegMemImm.Imm 0)))))
|
|
|
|
;;;; Rules for `isub` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
;; `i64` and smaller. Unlike `iadd`, only the second operand is matched as an
;; immediate or a sinkable load.

;; Register - register.
(rule (lower (has_type (fits_in_64 ty) (isub lhs rhs)))
      (value_reg (sub ty (put_in_reg lhs) (RegMemImm.Reg (put_in_reg rhs)))))

;; Register - immediate.
(rule (lower (has_type (fits_in_64 ty) (isub lhs (simm32_from_value rhs_imm))))
      (value_reg (sub ty (put_in_reg lhs) rhs_imm)))

;; Register - memory.
(rule (lower (has_type (fits_in_64 ty) (isub lhs (sinkable_load rhs_load))))
      (value_reg (sub ty (put_in_reg lhs) (sink_load rhs_load))))
|
|
|
|
;; SSE: one packed-subtract instruction per lane shape.

;; 8x16 lanes: `psubb`.
(rule (lower (has_type (multi_lane 8 16) (isub a b)))
      (value_xmm (psubb (put_in_xmm a) (put_in_xmm_mem b))))

;; 16x8 lanes: `psubw`.
(rule (lower (has_type (multi_lane 16 8) (isub a b)))
      (value_xmm (psubw (put_in_xmm a) (put_in_xmm_mem b))))

;; 32x4 lanes: `psubd`.
(rule (lower (has_type (multi_lane 32 4) (isub a b)))
      (value_xmm (psubd (put_in_xmm a) (put_in_xmm_mem b))))

;; 64x2 lanes: `psubq`.
(rule (lower (has_type (multi_lane 64 2) (isub a b)))
      (value_xmm (psubq (put_in_xmm a) (put_in_xmm_mem b))))
|
|
|
|
;; `i128`: split both operands into their low (index 0) and high (index 1)
;; registers, subtract the low halves with a flag-producing sub, then
;; subtract the high halves with a sub-with-borrow that consumes the flags.
(rule (lower (has_type $I128 (isub lhs rhs)))
      (let ((lhs_regs ValueRegs (put_in_regs lhs))
            (lhs_lo Reg (value_regs_get lhs_regs 0))
            (lhs_hi Reg (value_regs_get lhs_regs 1))
            (rhs_regs ValueRegs (put_in_regs rhs))
            (rhs_lo Reg (value_regs_get rhs_regs 0))
            (rhs_hi Reg (value_regs_get rhs_regs 1)))
        (with_flags (sub_with_flags $I64 lhs_lo (RegMemImm.Reg rhs_lo))
                    (sbb $I64 lhs_hi (RegMemImm.Reg rhs_hi)))))
|
|
|
|
;;;; Rules for `ssub_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
;; 8x16 lanes: `psubsb`.
(rule (lower (has_type (multi_lane 8 16) (ssub_sat a b)))
      (value_xmm (psubsb (put_in_xmm a) (put_in_xmm_mem b))))

;; 16x8 lanes: `psubsw`.
(rule (lower (has_type (multi_lane 16 8) (ssub_sat a b)))
      (value_xmm (psubsw (put_in_xmm a) (put_in_xmm_mem b))))
|
|
|
|
;;;; Rules for `usub_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
;; 8x16 lanes: `psubusb`.
(rule (lower (has_type (multi_lane 8 16) (usub_sat a b)))
      (value_xmm (psubusb (put_in_xmm a) (put_in_xmm_mem b))))

;; 16x8 lanes: `psubusw`.
(rule (lower (has_type (multi_lane 16 8) (usub_sat a b)))
      (value_xmm (psubusw (put_in_xmm a) (put_in_xmm_mem b))))
|
|
|
|
;;;; Rules for `band` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
;; `{i,b}64` and smaller.

;; Register & register.
(rule (lower (has_type (fits_in_64 ty) (band lhs rhs)))
      (value_reg (x64_and ty (put_in_reg lhs) (RegMemImm.Reg (put_in_reg rhs)))))

;; Register & memory, with the sinkable load on either side.
(rule (lower (has_type (fits_in_64 ty) (band lhs (sinkable_load rhs_load))))
      (value_reg (x64_and ty (put_in_reg lhs) (sink_load rhs_load))))

(rule (lower (has_type (fits_in_64 ty) (band (sinkable_load lhs_load) rhs)))
      (value_reg (x64_and ty (put_in_reg rhs) (sink_load lhs_load))))

;; Register & immediate, with the immediate on either side.
(rule (lower (has_type (fits_in_64 ty) (band lhs (simm32_from_value rhs_imm))))
      (value_reg (x64_and ty (put_in_reg lhs) rhs_imm)))

(rule (lower (has_type (fits_in_64 ty) (band (simm32_from_value lhs_imm) rhs)))
      (value_reg (x64_and ty (put_in_reg rhs) lhs_imm)))
|
|
|
|
;; SSE.

;; Pick the vector `and` instruction for a type: `andps` for `$F32X4`,
;; `andpd` for `$F64X2`, and `pand` for the generic multi-lane pattern.
(decl sse_and (Type Xmm XmmMem) Xmm)

(rule (sse_and $F32X4 a b)
      (andps a b))

(rule (sse_and $F64X2 a b)
      (andpd a b))

(rule (sse_and (multi_lane _bits _lanes) a b)
      (pand a b))
|
|
|
|
;; Vector `band`: dispatch on the vector type through `sse_and`.
(rule (lower (has_type ty @ (multi_lane _bits _lanes) (band a b)))
      (value_xmm (sse_and ty (put_in_xmm a) (put_in_xmm_mem b))))
|
|
|
|
;; `{i,b}128`.

;; `i128`: `and` the low halves together and the high halves together.
(rule (lower (has_type $I128 (band lhs rhs)))
      (let ((lhs_regs ValueRegs (put_in_regs lhs))
            (lhs_lo Reg (value_regs_get lhs_regs 0))
            (lhs_hi Reg (value_regs_get lhs_regs 1))
            (rhs_regs ValueRegs (put_in_regs rhs))
            (rhs_lo Reg (value_regs_get rhs_regs 0))
            (rhs_hi Reg (value_regs_get rhs_regs 1)))
        (value_regs (x64_and $I64 lhs_lo (RegMemImm.Reg rhs_lo))
                    (x64_and $I64 lhs_hi (RegMemImm.Reg rhs_hi)))))

;; `b128`: booleans are always `0` or `1`, so only the low halves need the
;; `and`. The high half is always zero; instead of materializing a fresh zero,
;; the first operand's high register (already zero) is reused.
(rule (lower (has_type $B128 (band lhs rhs)))
      (let ((lhs_regs ValueRegs (put_in_regs lhs))
            (lhs_lo Reg (value_regs_get lhs_regs 0))
            (lhs_hi Reg (value_regs_get lhs_regs 1))
            (rhs_lo Reg (lo_reg rhs)))
        (value_regs (x64_and $I64 lhs_lo (RegMemImm.Reg rhs_lo))
                    lhs_hi)))
|
|
|
|
;;;; Rules for `bor` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
;; `{i,b}64` and smaller.

;; Register | register.
(rule (lower (has_type (fits_in_64 ty) (bor lhs rhs)))
      (value_reg (or ty (put_in_reg lhs) (RegMemImm.Reg (put_in_reg rhs)))))

;; Register | memory, with the sinkable load on either side.
(rule (lower (has_type (fits_in_64 ty) (bor lhs (sinkable_load rhs_load))))
      (value_reg (or ty (put_in_reg lhs) (sink_load rhs_load))))

(rule (lower (has_type (fits_in_64 ty) (bor (sinkable_load lhs_load) rhs)))
      (value_reg (or ty (put_in_reg rhs) (sink_load lhs_load))))

;; Register | immediate, with the immediate on either side.
(rule (lower (has_type (fits_in_64 ty) (bor lhs (simm32_from_value rhs_imm))))
      (value_reg (or ty (put_in_reg lhs) rhs_imm)))

(rule (lower (has_type (fits_in_64 ty) (bor (simm32_from_value lhs_imm) rhs)))
      (value_reg (or ty (put_in_reg rhs) lhs_imm)))
|
|
|
|
;; SSE.

;; Pick the vector `or` instruction for a type: `orps` for `$F32X4`, `orpd`
;; for `$F64X2`, and `por` for the generic multi-lane pattern.
(decl sse_or (Type Xmm XmmMem) Xmm)

(rule (sse_or $F32X4 a b)
      (orps a b))

(rule (sse_or $F64X2 a b)
      (orpd a b))

(rule (sse_or (multi_lane _bits _lanes) a b)
      (por a b))
|
|
|
|
;; Vector `bor`: dispatch on the vector type through `sse_or`.
(rule (lower (has_type ty @ (multi_lane _bits _lanes) (bor a b)))
      (value_xmm (sse_or ty (put_in_xmm a) (put_in_xmm_mem b))))
|
|
|
|
;; `{i,b}128`.

;; 128-bit `or` over two register pairs: the low registers (index 0) are
;; or-ed together, as are the high registers (index 1), and the two 64-bit
;; results are returned as a new pair.
(decl or_i128 (ValueRegs ValueRegs) ValueRegs)
(rule (or_i128 lhs rhs)
      (let ((lhs_lo Reg (value_regs_get lhs 0))
            (lhs_hi Reg (value_regs_get lhs 1))
            (rhs_lo Reg (value_regs_get rhs 0))
            (rhs_hi Reg (value_regs_get rhs 1)))
        (value_regs (or $I64 lhs_lo (RegMemImm.Reg rhs_lo))
                    (or $I64 lhs_hi (RegMemImm.Reg rhs_hi)))))
|
|
|
|
;; `i128`: put both operands in register pairs and defer to `or_i128`.
(rule (lower (has_type $I128 (bor lhs rhs)))
      (or_i128 (put_in_regs lhs) (put_in_regs rhs)))

;; `b128`: booleans are always `0` or `1`, so only the low halves need the
;; `or`. The high half is always zero; instead of materializing a fresh zero,
;; the first operand's high register (already zero) is reused.
(rule (lower (has_type $B128 (bor lhs rhs)))
      (let ((lhs_regs ValueRegs (put_in_regs lhs))
            (lhs_lo Reg (value_regs_get lhs_regs 0))
            (lhs_hi Reg (value_regs_get lhs_regs 1))
            (rhs_lo Reg (lo_reg rhs)))
        (value_regs (or $I64 lhs_lo (RegMemImm.Reg rhs_lo))
                    lhs_hi)))
|
|
|
|
;;;; Rules for `bxor` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
;; `{i,b}64` and smaller.

;; Register ^ register.
(rule (lower (has_type (fits_in_64 ty) (bxor lhs rhs)))
      (value_reg (xor ty (put_in_reg lhs) (RegMemImm.Reg (put_in_reg rhs)))))

;; Register ^ memory, with the sinkable load on either side.
(rule (lower (has_type (fits_in_64 ty) (bxor lhs (sinkable_load rhs_load))))
      (value_reg (xor ty (put_in_reg lhs) (sink_load rhs_load))))

(rule (lower (has_type (fits_in_64 ty) (bxor (sinkable_load lhs_load) rhs)))
      (value_reg (xor ty (put_in_reg rhs) (sink_load lhs_load))))

;; Register ^ immediate, with the immediate on either side.
(rule (lower (has_type (fits_in_64 ty) (bxor lhs (simm32_from_value rhs_imm))))
      (value_reg (xor ty (put_in_reg lhs) rhs_imm)))

(rule (lower (has_type (fits_in_64 ty) (bxor (simm32_from_value lhs_imm) rhs)))
      (value_reg (xor ty (put_in_reg rhs) lhs_imm)))
|
|
|
|
;; SSE: vector `bxor` dispatches on the vector type through `sse_xor`.
(rule (lower (has_type ty @ (multi_lane _bits _lanes)
                       (bxor a b)))
      (value_xmm (sse_xor ty
                          (put_in_xmm a)
                          (put_in_xmm_mem b))))
|
|
|
|
;; `{i,b}128`.

;; `i128`: `xor` the low halves together and the high halves together.
(rule (lower (has_type $I128 (bxor lhs rhs)))
      (let ((lhs_regs ValueRegs (put_in_regs lhs))
            (lhs_lo Reg (value_regs_get lhs_regs 0))
            (lhs_hi Reg (value_regs_get lhs_regs 1))
            (rhs_regs ValueRegs (put_in_regs rhs))
            (rhs_lo Reg (value_regs_get rhs_regs 0))
            (rhs_hi Reg (value_regs_get rhs_regs 1)))
        (value_regs (xor $I64 lhs_lo (RegMemImm.Reg rhs_lo))
                    (xor $I64 lhs_hi (RegMemImm.Reg rhs_hi)))))

;; `b128`: booleans are always `0` or `1`, so only the low halves need the
;; `xor`. The high half is always zero; instead of materializing a fresh zero,
;; the first operand's high register (already zero) is reused.
(rule (lower (has_type $B128 (bxor lhs rhs)))
      (let ((lhs_regs ValueRegs (put_in_regs lhs))
            (lhs_lo Reg (value_regs_get lhs_regs 0))
            (lhs_hi Reg (value_regs_get lhs_regs 1))
            (rhs_lo Reg (lo_reg rhs)))
        (value_regs (xor $I64 lhs_lo (RegMemImm.Reg rhs_lo))
                    lhs_hi)))
|
|
|
|
;;;; Rules for `ishl` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
;; `i64` and smaller: a single `shl`, with the shift amount supplied by
;; `put_masked_in_imm8_reg` for the value's type.
(rule (lower (has_type (fits_in_64 ty) (ishl val count)))
      (value_reg (shl ty
                      (put_in_reg val)
                      (put_masked_in_imm8_reg count ty))))
|
|
|
|
;; `i128`.

;; Shift a 128-bit value, held as a (low, high) register pair, left by the
;; amount in the given register, and return the shifted pair.
(decl shl_i128 (ValueRegs Reg) ValueRegs)
(rule (shl_i128 val count)
      (let (;; Split the 128-bit value into its two 64-bit registers.
            (val_lo Reg (value_regs_get val 0))
            (val_hi Reg (value_regs_get val 1))
            ;; Shift each half independently.
            (shl_lo Reg (shl $I64 val_lo (Imm8Reg.Reg count)))
            (shl_hi Reg (shl $I64 val_hi (Imm8Reg.Reg count)))
            ;; `val_lo >> (64 - count)` holds the bits that cross from the
            ;; low half into the high half.
            (inv_count Reg (sub $I64 (imm $I64 64) (RegMemImm.Reg count)))
            (spill Reg (shr $I64 val_lo (Imm8Reg.Reg inv_count)))
            (zero Reg (imm $I64 0))
            ;; When `count & 127` is zero there is nothing to carry over, so
            ;; select zero instead of the computed carry bits.
            (spill_or_zero Reg
              (with_flags_1 (test (OperandSize.Size64) (RegMemImm.Imm 127) count)
                            (cmove $I64 (CC.Z) (RegMem.Reg zero) spill)))
            ;; Merge the carried bits into the shifted high half.
            (hi_merged Reg (or $I64 spill_or_zero (RegMemImm.Reg shl_hi))))
        ;; Select the final halves on `count & 64`. When that bit is set
        ;; (a shift by >= 64, modulo 128) the low half is zero and the high
        ;; half is the shifted low half; otherwise the two halves computed
        ;; above are used as-is.
        (with_flags_2 (test (OperandSize.Size64) (RegMemImm.Imm 64) count)
                      (cmove $I64 (CC.Z) (RegMem.Reg shl_lo) zero)
                      (cmove $I64 (CC.Z) (RegMem.Reg hi_merged) shl_lo))))
|
|
|
|
;; `ishl.i128`: only the low register of the shift amount is consulted, since
;; the amount is logically masked to the bit width of the shifted value.
(rule (lower (has_type $I128 (ishl val count)))
      (let ((count_lo Reg (lo_reg count)))
        (shl_i128 (put_in_regs val) count_lo)))
|
|
|
|
;; SSE.

;; x86 has no 8x16 shift instruction (not even in newer feature sets such as
;; AVX), so `ishl.i8x16` becomes a short sequence. Whether or not the shift
;; amount is an immediate, the idea is the same: do a 16x8 shift, then `and`
;; with a mask that zeroes the bits the wider shift got wrong.
(rule (lower (has_type $I8X16 (ishl val count)))
      (let ((vec Xmm (put_in_xmm val))
            (count_rmi RegMemImm (put_in_reg_mem_imm count))
            (count_xmm XmmMemImm (mov_rmi_to_xmm count_rmi))
            ;; The 16x8 shift is only right for half of the byte lanes; the
            ;; mask loaded below repairs the rest.
            (wide_shifted Xmm (psllw vec count_xmm))
            (fixup_addr SyntheticAmode (ishl_i8x16_mask count_rmi))
            (fixup Reg (x64_load $I8X16 fixup_addr (ExtKind.None))))
        (value_xmm (sse_and $I8X16
                            wide_shifted
                            (reg_mem_to_xmm_mem (RegMem.Reg fixup))))))
|
|
|
|
;; Compute the address of the mask that repairs the lanes a 16x8 shift gets
;; wrong when it stands in for an 8x16 left shift.
(decl ishl_i8x16_mask (RegMemImm) SyntheticAmode)

;; Constant shift amount: the mask is chosen statically (at compile time) by
;; an external constructor, so only that one mask is emitted.
(decl ishl_i8x16_mask_for_const (u32) SyntheticAmode)
(extern constructor ishl_i8x16_mask_for_const ishl_i8x16_mask_for_const)
(rule (ishl_i8x16_mask (RegMemImm.Imm count))
      (ishl_i8x16_mask_for_const count))

;; Shift amount in a register: the whole mask table is emitted and the right
;; entry is located dynamically (at run time). `lea` yields the table's base
;; address, and the entry is addressed as `base_address + count << 4`.
(decl ishl_i8x16_mask_table () SyntheticAmode)
(extern constructor ishl_i8x16_mask_table ishl_i8x16_mask_table)
(rule (ishl_i8x16_mask (RegMemImm.Reg count))
      (let ((table SyntheticAmode (ishl_i8x16_mask_table))
            (table_base Gpr (lea table))
            (entry_offset Reg (shl $I64 count (Imm8Reg.Imm8 4))))
        (amode_to_synthetic_amode
          (amode_imm_reg_reg_shift 0 table_base (gpr_new entry_offset) 0))))

;; Shift amount in memory: load it into a register and reuse the register
;; rule above.
(rule (ishl_i8x16_mask (RegMemImm.Mem count_addr))
      (ishl_i8x16_mask
        (RegMemImm.Reg (x64_load $I64 count_addr (ExtKind.None)))))
|
|
|
|
;; The 16x8, 32x4 and 64x2 left shifts each map onto one instruction; the
;; shift amount is passed through `mov_rmi_to_xmm`.

;; 16x8: `psllw`.
(rule (lower (has_type $I16X8 (ishl val count)))
      (value_xmm (psllw (put_in_xmm val)
                        (mov_rmi_to_xmm (put_in_reg_mem_imm count)))))

;; 32x4: `pslld`.
(rule (lower (has_type $I32X4 (ishl val count)))
      (value_xmm (pslld (put_in_xmm val)
                        (mov_rmi_to_xmm (put_in_reg_mem_imm count)))))

;; 64x2: `psllq`.
(rule (lower (has_type $I64X2 (ishl val count)))
      (value_xmm (psllq (put_in_xmm val)
                        (mov_rmi_to_xmm (put_in_reg_mem_imm count)))))
|
|
|
|
;;;; Rules for `ushr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
;; `i64` and smaller: zero-extend the source into a register, then do a
;; single `shr` with the shift amount from `put_masked_in_imm8_reg`.
(rule (lower (has_type (fits_in_64 ty) (ushr val count)))
      (value_reg (shr ty
                      (extend_to_reg val ty (ExtendKind.Zero))
                      (put_masked_in_imm8_reg count ty))))
|
|
|
|
;; `i128`.

;; Logical right shift of a 128-bit value.
;;
;; Arguments: the value as a (low, high) register pair, and a register holding
;; the shift amount. Returns the shifted value as a (low, high) register pair.
(decl shr_i128 (ValueRegs Reg) ValueRegs)
(rule (shr_i128 src amt)
      ;; Unpack the lo/hi halves of `src`.
      (let ((src_lo Reg (value_regs_get src 0))
            (src_hi Reg (value_regs_get src 1))
            ;; Do a shift on each half.
            (lo_shifted Reg (shr $I64 src_lo (Imm8Reg.Reg amt)))
            (hi_shifted Reg (shr $I64 src_hi (Imm8Reg.Reg amt)))
            ;; `src_hi << (64 - amt)` are the bits to carry over from the hi
            ;; into the lo.
            (carry Reg (shl $I64 src_hi (Imm8Reg.Reg (sub $I64 (imm $I64 64) (RegMemImm.Reg amt)))))
            ;; Materialize the zero constant once and share it between both
            ;; selects below (as `shl_i128` does), instead of emitting it
            ;; twice.
            (zero Reg (imm $I64 0))
            ;; Nullify the carry if we are shifting by a multiple of 128.
            (carry_ Reg (with_flags_1 (test (OperandSize.Size64) (RegMemImm.Imm 127) amt)
                                      (cmove $I64 (CC.Z) (RegMem.Reg zero) carry)))
            ;; Add the carry bits into the lo.
            (lo_shifted_ Reg (or $I64 carry_ (RegMemImm.Reg lo_shifted))))
        ;; Combine the two shifted halves. However, if we are shifting by >= 64
        ;; (modulo 128), then the hi bits are zero and the lo bits are what
        ;; would otherwise be our hi bits.
        (with_flags_2 (test (OperandSize.Size64) (RegMemImm.Imm 64) amt)
                      (cmove $I64 (CC.Z) (RegMem.Reg lo_shifted_) hi_shifted)
                      (cmove $I64 (CC.Z) (RegMem.Reg hi_shifted) zero))))
|
|
|
|
;; `ushr.i128`: only the low register of the shift amount is consulted, since
;; the amount is logically masked to the bit width of the shifted value.
(rule (lower (has_type $I128 (ushr val count)))
      (let ((count_lo Reg (lo_reg count)))
        (shr_i128 (put_in_regs val) count_lo)))
|
|
|
|
;; SSE.

;; x64 has no 8x16 shifts, so `ushr.i8x16` uses the same trick as the 8x16
;; `ishl`: a 16x8 shift followed by an `and` with a fix-up mask.
(rule (lower (has_type $I8X16 (ushr val count)))
      (let ((vec Xmm (put_in_xmm val))
            (count_rmi RegMemImm (put_in_reg_mem_imm count))
            (count_xmm XmmMemImm (mov_rmi_to_xmm count_rmi))
            ;; The 16x8 shift is only right for half of the byte lanes; the
            ;; mask loaded below repairs the rest.
            (wide_shifted Xmm (psrlw vec count_xmm))
            (fixup_addr SyntheticAmode (ushr_i8x16_mask count_rmi))
            (fixup Reg (x64_load $I8X16 fixup_addr (ExtKind.None))))
        (value_xmm (sse_and $I8X16 wide_shifted (reg_mem_to_xmm_mem (RegMem.Reg fixup))))))
|
|
|
|
;; Compute the address of the mask that repairs the lanes a 16x8 shift gets
;; wrong when it stands in for an 8x16 logical right shift.
(decl ushr_i8x16_mask (RegMemImm) SyntheticAmode)

;; Constant shift amount: the mask is chosen statically (at compile time) by
;; an external constructor, so only that one mask is emitted.
(decl ushr_i8x16_mask_for_const (u32) SyntheticAmode)
(extern constructor ushr_i8x16_mask_for_const ushr_i8x16_mask_for_const)
(rule (ushr_i8x16_mask (RegMemImm.Imm count))
      (ushr_i8x16_mask_for_const count))

;; Shift amount in a register: the whole mask table is emitted and the right
;; entry is located dynamically (at run time). `lea` yields the table's base
;; address, and the entry is addressed as `base_address + count << 4`.
(decl ushr_i8x16_mask_table () SyntheticAmode)
(extern constructor ushr_i8x16_mask_table ushr_i8x16_mask_table)
(rule (ushr_i8x16_mask (RegMemImm.Reg count))
      (let ((table SyntheticAmode (ushr_i8x16_mask_table))
            (table_base Gpr (lea table))
            (entry_offset Reg (shl $I64 count (Imm8Reg.Imm8 4))))
        (amode_to_synthetic_amode
          (amode_imm_reg_reg_shift 0 table_base (gpr_new entry_offset) 0))))

;; Shift amount in memory: load it into a register and reuse the register
;; rule above.
(rule (ushr_i8x16_mask (RegMemImm.Mem count_addr))
      (ushr_i8x16_mask
        (RegMemImm.Reg (x64_load $I64 count_addr (ExtKind.None)))))
|
|
|
|
;; The 16x8, 32x4 and 64x2 logical right shifts each map onto one
;; instruction; the shift amount is passed through `mov_rmi_to_xmm`.

;; 16x8: `psrlw`.
(rule (lower (has_type $I16X8 (ushr val count)))
      (value_xmm (psrlw (put_in_xmm val)
                        (mov_rmi_to_xmm (put_in_reg_mem_imm count)))))

;; 32x4: `psrld`.
(rule (lower (has_type $I32X4 (ushr val count)))
      (value_xmm (psrld (put_in_xmm val)
                        (mov_rmi_to_xmm (put_in_reg_mem_imm count)))))

;; 64x2: `psrlq`.
(rule (lower (has_type $I64X2 (ushr val count)))
      (value_xmm (psrlq (put_in_xmm val)
                        (mov_rmi_to_xmm (put_in_reg_mem_imm count)))))
|
|
|
|
;;;; Rules for `sshr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
;; `i64` and smaller: sign-extend the source into a register, then do a
;; single `sar` with the shift amount from `put_masked_in_imm8_reg`.
(rule (lower (has_type (fits_in_64 ty) (sshr val count)))
      (value_reg (sar ty
                      (extend_to_reg val ty (ExtendKind.Sign))
                      (put_masked_in_imm8_reg count ty))))
|
|
|
|
;; `i128`.

;; Arithmetic right shift of a 128-bit value, held as a (low, high) register
;; pair, by the amount in the given register; returns the shifted pair.
(decl sar_i128 (ValueRegs Reg) ValueRegs)
(rule (sar_i128 val count)
      (let (;; Split the 128-bit value into its two 64-bit registers.
            (val_lo Reg (value_regs_get val 0))
            (val_hi Reg (value_regs_get val 1))
            ;; Shift each half. The low half gets an unsigned shift because
            ;; its most significant bit is not a sign bit.
            (shr_lo Reg (shr $I64 val_lo (Imm8Reg.Reg count)))
            (sar_hi Reg (sar $I64 val_hi (Imm8Reg.Reg count)))
            ;; `val_hi << (64 - count)` holds the bits that cross from the
            ;; high half into the low half.
            (inv_count Reg (sub $I64 (imm $I64 64) (RegMemImm.Reg count)))
            (spill Reg (shl $I64 val_hi (Imm8Reg.Reg inv_count)))
            ;; When `count & 127` is zero there is nothing to carry over, so
            ;; select zero instead of the computed carry bits.
            (zero Reg (imm $I64 0))
            (spill_or_zero Reg
              (with_flags_1 (test (OperandSize.Size64) (RegMemImm.Imm 127) count)
                            (cmove $I64 (CC.Z) (RegMem.Reg zero) spill)))
            ;; Merge the carried bits into the shifted low half.
            (lo_merged Reg (or $I64 shr_lo (RegMemImm.Reg spill_or_zero)))
            ;; A register filled with copies of the sign bit.
            (sign_fill Reg (sar $I64 val_hi (Imm8Reg.Imm8 63))))
        ;; Select the final halves on `count & 64`. When that bit is set
        ;; (a shift by >= 64, modulo 128) the high half is all sign bits and
        ;; the low half is the shifted high half; otherwise the two halves
        ;; computed above are used as-is.
        (with_flags_2 (test (OperandSize.Size64) (RegMemImm.Imm 64) count)
                      (cmove $I64 (CC.Z) (RegMem.Reg lo_merged) sar_hi)
                      (cmove $I64 (CC.Z) (RegMem.Reg sar_hi) sign_fill))))
|
|
|
|
;; `sshr.i128`: only the low register of the shift amount is consulted, since
;; the amount is logically masked to the bit width of the shifted value.
(rule (lower (has_type $I128 (sshr val count)))
      (let ((count_lo Reg (lo_reg count)))
        (sar_i128 (put_in_regs val) count_lo)))
|
|
|
|
;; SSE.

;; x86 has no 8x16 shift instruction, and the shift-then-mask approach of
;; `ishl` and `ushr` is not easily reused here because the masks do not
;; preserve the sign. Instead the low and high lanes are separated, shifted
;; separately, and merged back into the final result.
;;
;; Pictorially, with `src.i8x16 = [s0, s1, ..., s15]`:
;;
;;   lo.i16x8         = [(s0, s0), (s1, s1), ..., (s7, s7)]
;;   shifted_lo.i16x8 = shift each lane of `lo`
;;   hi.i16x8         = [(s8, s8), (s9, s9), ..., (s15, s15)]
;;   shifted_hi.i16x8 = shift each lane of `hi`
;;   result           = [s0'', s1'', ..., s15'']
(rule (lower (has_type $I8X16 (sshr val count @ (value_type count_ty))))
      (let ((vec Xmm (put_in_xmm val))
            ;; So that `packsswb` below only uses the high byte of each 16x8
            ;; lane, the shift amount is enlarged by 8 bits, relying on
            ;; `psraw` to fill in the upper bits appropriately.
            (lo_pairs Xmm (punpcklbw vec (xmm_to_xmm_mem vec)))
            (hi_pairs Xmm (punpckhbw vec (xmm_to_xmm_mem vec)))
            (wide_count XmmMemImm
              (sshr_i8x16_bigger_shift count_ty (put_in_reg_mem_imm count)))
            (lo_done Xmm (psraw lo_pairs wide_count))
            (hi_done Xmm (psraw hi_pairs wide_count)))
        (value_xmm (packsswb lo_done (xmm_to_xmm_mem hi_done)))))
|
|
|
|
;; Produce the shift amount for the 8x16 `sshr` lowering: the given amount
;; plus 8, as an `XmmMemImm`. The `Type` argument is the type used for the
;; addition when the amount is not an immediate.
(decl sshr_i8x16_bigger_shift (Type RegMemImm) XmmMemImm)

;; Immediate amount: do the addition on the constant itself.
(rule (sshr_i8x16_bigger_shift _ty (RegMemImm.Imm count))
      (xmm_mem_imm_new (RegMemImm.Imm (u32_add count 8))))

;; Register amount: add the immediate 8 to the register.
(rule (sshr_i8x16_bigger_shift ty (RegMemImm.Reg count))
      (mov_rmi_to_xmm (RegMemImm.Reg (add ty
                                          count
                                          (RegMemImm.Imm 8)))))

;; Memory amount: materialize 8 in a register and add the memory operand to
;; it.
(rule (sshr_i8x16_bigger_shift ty count @ (RegMemImm.Mem _addr))
      (mov_rmi_to_xmm (RegMemImm.Reg (add ty
                                          (imm ty 8)
                                          count))))
|
|
|
|
;; `sshr.{i16x8,i32x4}` is a plain `psra{w,d}`; the only care needed is that a
;; shift amount held in a register ends up in an XMM register.

;; 16x8: `psraw`.
(rule (lower (has_type $I16X8 (sshr val count)))
      (value_xmm (psraw (put_in_xmm val)
                        (mov_rmi_to_xmm (put_in_reg_mem_imm count)))))

;; 32x4: `psrad`.
(rule (lower (has_type $I32X4 (sshr val count)))
      (value_xmm (psrad (put_in_xmm val)
                        (mov_rmi_to_xmm (put_in_reg_mem_imm count)))))
|
|
|
|
;; Older feature sets have no single x86 instruction for `sshr.i64x2`. Newer
;; ones (AVX512VL + AVX512F) offer `vpsraq`, a 128-bit instruction that would
;; fit, but this backend does not currently support EVEX encodings. So each
;; 64-bit lane is extracted to a GPR, shifted with a scalar instruction, and
;; the shifted values are assembled back into an XMM register.
;;
;; (TODO: when EVEX support is available, add an alternate lowering here).
(rule (lower (has_type $I64X2 (sshr val count)))
      (let ((vec Xmm (put_in_xmm val))
            (lane0 Gpr (pextrd $I64 vec 0))
            (lane1 Gpr (pextrd $I64 vec 1))
            (count_ Imm8Reg (put_masked_in_imm8_reg count $I64))
            (lane0_done Reg (sar $I64 (gpr_to_reg lane0) count_))
            (lane1_done Reg (sar $I64 (gpr_to_reg lane1) count_)))
        (value_xmm
          (make_i64x2_from_lanes (reg_mem_to_gpr_mem (RegMem.Reg lane0_done))
                                 (reg_mem_to_gpr_mem (RegMem.Reg lane1_done))))))
|
|
|
|
;;;; Rules for `rotl` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
;; `i16` and `i8`: the rotate amount must be extended when it is in a
;; register, or masked when it is a constant.

(rule (lower (has_type (ty_8_or_16 ty) (rotl val count)))
      (let ((count_ext Reg (extend_to_reg count $I32 (ExtendKind.Zero))))
        (value_reg (x64_rotl ty (put_in_reg val) (Imm8Reg.Reg count_ext)))))

(rule (lower (has_type (ty_8_or_16 ty) (rotl val (u64_from_iconst count))))
      (value_reg (x64_rotl ty
                           (put_in_reg val)
                           (const_to_type_masked_imm8 count ty))))

;; `i64` and `i32`: the whole register is rotated, so x86's own masking of
;; the rotate amount can be relied upon.

;; NB: only the low register of the amount is used, since the amount is
;; logically masked to the bit width of the rotated value.
(rule (lower (has_type (ty_32_or_64 ty) (rotl val count)))
      (let ((count_lo Reg (lo_reg count)))
        (value_reg (x64_rotl ty (put_in_reg val) (Imm8Reg.Reg count_lo)))))

(rule (lower (has_type (ty_32_or_64 ty) (rotl val (u64_from_iconst count))))
      (value_reg (x64_rotl ty
                           (put_in_reg val)
                           (const_to_type_masked_imm8 count ty))))
|
|
|
|
;; `i128`: a left rotate is the `or` of a left shift by the amount and a
;; logical right shift by `128 - amount`.
(rule (lower (has_type $I128 (rotl val count)))
      (let ((val_regs ValueRegs (put_in_regs val))
            ;; NB: only the low register of the amount is used, since the
            ;; rotation amount is logically masked to the value's bit width.
            (count_lo Reg (lo_reg count)))
        (or_i128 (shl_i128 val_regs count_lo)
                 (shr_i128 val_regs
                           (sub $I64 (imm $I64 128) (RegMemImm.Reg count_lo))))))
|
|
|
|
;;;; Rules for `rotr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
;; `i8` and `i16`: a rotate amount held in a register is zero-extended to
;; 32 bits first; a constant rotate amount is masked for the type and used as
;; an immediate.

(rule (lower (has_type (ty_8_or_16 ty) (rotr val count)))
      (let ((count_ext Reg (extend_to_reg count $I32 (ExtendKind.Zero))))
        (value_reg (x64_rotr ty
                             (put_in_reg val)
                             (Imm8Reg.Reg count_ext)))))

(rule (lower (has_type (ty_8_or_16 ty)
                       (rotr val (u64_from_iconst count))))
      (value_reg (x64_rotr ty
                           (put_in_reg val)
                           (const_to_type_masked_imm8 count ty))))
|
|
|
|
;; `i32` and `i64`: the rotate covers the whole register, so x86's own
;; masking of the rotate amount is sufficient.

(rule (lower (has_type (ty_32_or_64 ty) (rotr val count)))
      ;; NB: only the low register of `count` is used; the rotate amount is
      ;; logically masked to the bit width of the value anyway.
      (let ((count_lo Reg (lo_reg count)))
        (value_reg (x64_rotr ty
                             (put_in_reg val)
                             (Imm8Reg.Reg count_lo)))))

(rule (lower (has_type (ty_32_or_64 ty)
                       (rotr val (u64_from_iconst count))))
      (value_reg (x64_rotr ty
                           (put_in_reg val)
                           (const_to_type_masked_imm8 count ty))))
|
|
|
|
;; `i128`: built as `(src >> amt) | (src << (128 - amt))` from the 128-bit
;; shift helpers.

(rule (lower (has_type $I128 (rotr val count)))
      (let ((val_regs ValueRegs (put_in_regs val))
            ;; NB: only the low register of `count` is used; the rotation
            ;; amount is logically masked to the bit width of the value.
            (count_lo Reg (lo_reg count)))
        (or_i128 (shr_i128 val_regs count_lo)
                 (shl_i128 val_regs
                           (sub $I64
                                (imm $I64 128)
                                (RegMemImm.Reg count_lo))))))
|
|
|
|
;;;; Rules for `ineg` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
;; Scalars of 64 bits or fewer: a single `neg` on a GPR.

(rule (lower (has_type (fits_in_64 ty) (ineg val)))
      (value_gpr (neg ty
                      (put_in_gpr val))))
|
|
|
|
;; SSE vectors: negation is computed as `0 - x`, using the packed subtract
;; that matches the lane width.

(rule (lower (has_type $I8X16 (ineg val)))
      (value_xmm (psubb
                  (xmm_new (imm $I8X16 0))
                  (put_in_xmm_mem val))))

(rule (lower (has_type $I16X8 (ineg val)))
      (value_xmm (psubw
                  (xmm_new (imm $I16X8 0))
                  (put_in_xmm_mem val))))

(rule (lower (has_type $I32X4 (ineg val)))
      (value_xmm (psubd
                  (xmm_new (imm $I32X4 0))
                  (put_in_xmm_mem val))))

(rule (lower (has_type $I64X2 (ineg val)))
      (value_xmm (psubq
                  (xmm_new (imm $I64X2 0))
                  (put_in_xmm_mem val))))
|
|
|
|
;;;; Rules for `avg_round` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
;; 16 lanes of 8 bits: `pavgb`.
(rule (lower (has_type (multi_lane 8 16) (avg_round lhs rhs)))
      (value_xmm (pavgb (put_in_xmm lhs)
                        (put_in_xmm_mem rhs))))

;; 8 lanes of 16 bits: `pavgw`.
(rule (lower (has_type (multi_lane 16 8) (avg_round lhs rhs)))
      (value_xmm (pavgw (put_in_xmm lhs)
                        (put_in_xmm_mem rhs))))
|
|
|
|
;;;; Rules for `imul` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
;; Scalars of 64 bits or fewer.

;; Register * register.
(rule (lower (has_type (fits_in_64 ty) (imul lhs rhs)))
      (value_reg (mul ty
                      (put_in_reg lhs)
                      (RegMemImm.Reg (put_in_reg rhs)))))

;; Register * immediate. The immediate may be matched on either side; it is
;; always passed as the second operand of `mul`.

(rule (lower (has_type (fits_in_64 ty)
                       (imul lhs (simm32_from_value imm_rhs))))
      (value_reg (mul ty (put_in_reg lhs) imm_rhs)))

(rule (lower (has_type (fits_in_64 ty)
                       (imul (simm32_from_value imm_lhs) rhs)))
      (value_reg (mul ty (put_in_reg rhs) imm_lhs)))

;; Register * memory load. The load may be matched on either side; it is
;; sunk into the second operand of `mul`.

(rule (lower (has_type (fits_in_64 ty)
                       (imul lhs (sinkable_load load_rhs))))
      (value_reg (mul ty
                      (put_in_reg lhs)
                      (sink_load load_rhs))))

(rule (lower (has_type (fits_in_64 ty)
                       (imul (sinkable_load load_lhs) rhs)))
      (value_reg (mul ty
                      (put_in_reg rhs)
                      (sink_load load_lhs))))
|
|
|
|
;; `i128`.
;;
;; With each operand split into 64-bit halves, the product is:
;;
;;   dst_lo = lhs_lo * rhs_lo                  (low 64 bits)
;;   dst_hi = umulhi(lhs_lo, rhs_lo)
;;          + lhs_lo * rhs_hi
;;          + lhs_hi * rhs_lo
;;
;; Emitted sequence:
;;
;;   cross_a         = mul lhs_lo, rhs_hi
;;   cross_b         = mul lhs_hi, rhs_lo
;;   cross_sum       = add cross_a, cross_b
;;   res_lo:carry_hi = mulhi_u lhs_lo, rhs_lo
;;   res_hi          = add cross_sum, carry_hi
;;   return (res_lo, res_hi)
(rule (lower (has_type $I128 (imul lhs rhs)))
      (let (;; Split `lhs` into its low (index 0) and high (index 1) halves.
            (lhs_regs ValueRegs (put_in_regs lhs))
            (lhs_lo Reg (value_regs_get lhs_regs 0))
            (lhs_hi Reg (value_regs_get lhs_regs 1))
            ;; Same for `rhs`.
            (rhs_regs ValueRegs (put_in_regs rhs))
            (rhs_lo Reg (value_regs_get rhs_regs 0))
            (rhs_hi Reg (value_regs_get rhs_regs 1))
            ;; The two cross products and their sum.
            (cross_a Reg (mul $I64 lhs_lo (RegMemImm.Reg rhs_hi)))
            (cross_b Reg (mul $I64 lhs_hi (RegMemImm.Reg rhs_lo)))
            (cross_sum Reg (add $I64 cross_a (RegMemImm.Reg cross_b)))
            ;; Widening multiply of the low halves: register 0 is the low
            ;; half of the result, register 1 carries into the high half.
            (wide ValueRegs (mulhi_u $I64 lhs_lo (RegMem.Reg rhs_lo)))
            (res_lo Reg (value_regs_get wide 0))
            (carry_hi Reg (value_regs_get wide 1))
            ;; High half of the result.
            (res_hi Reg (add $I64 cross_sum (RegMemImm.Reg carry_hi))))
        (value_regs res_lo res_hi)))
|
|
|
|
;; SSE.
;;
;; (There is no i8x16 multiply.)

;; i16x8: `pmullw`.
(rule (lower (has_type (multi_lane 16 8) (imul lhs rhs)))
      (value_xmm (pmullw (put_in_xmm lhs)
                         (put_in_xmm_mem rhs))))

;; i32x4: `pmulld`.
(rule (lower (has_type (multi_lane 32 4) (imul lhs rhs)))
      (value_xmm (pmulld (put_in_xmm lhs)
                         (put_in_xmm_mem rhs))))
|
|
|
|
;; When both AVX-512VL and AVX-512DQ are enabled, `i64x2` multiplication is a
;; single `vpmullq`.
(rule (lower (has_type (and (avx512vl_enabled)
                            (avx512dq_enabled)
                            (multi_lane 64 2))
                       (imul lhs rhs)))
      (value_xmm (vpmullq (put_in_xmm_mem lhs)
                          (put_in_xmm rhs))))
|
|
|
|
;; Fallback for `i64x2` multiplication. View each 64-bit lane A as a 32-bit
;; upper half "Ah" and a 32-bit lower half "Al". Long-hand multiplication on
;; 32-bit digits then gives:
;;
;;                       Ah Al
;;                     * Bh Bl
;;                       -----
;;                       Al * Bl
;;               + (Ah * Bl) << 32
;;               + (Al * Bh) << 32
;;
;; so each lane is computed as:
;;
;;   A * B = (Al * Bl) + ((Ah * Bl) + (Al * Bh)) << 32
;;
;; The sequence uses `pmuludq`, which reads only the lower 32 bits (`Al` or
;; `Bl`) of each lane and writes its result to the full 64 bits of the
;; destination lane. No shift is therefore needed to isolate a low half; a
;; shift is needed only to bring a high half down, i.e. `Ah == A >> 32`.
(rule (lower (has_type (multi_lane 64 2)
                       (imul lhs rhs)))
      (let ((lhs_vec Xmm (put_in_xmm lhs))
            (rhs_vec Xmm (put_in_xmm rhs))
            ;; lhs_upper = A >> 32
            (lhs_upper Xmm (psrlq lhs_vec (xmm_mem_imm_new (RegMemImm.Imm 32))))
            ;; cross_hl = Ah * Bl
            (cross_hl Xmm (pmuludq lhs_upper (xmm_to_xmm_mem rhs_vec)))
            ;; rhs_upper = B >> 32
            (rhs_upper Xmm (psrlq rhs_vec (xmm_mem_imm_new (RegMemImm.Imm 32))))
            ;; cross_lh = Al * Bh
            (cross_lh Xmm (pmuludq lhs_vec (xmm_to_xmm_mem rhs_upper)))
            ;; cross = cross_hl + cross_lh
            (cross Xmm (paddq cross_hl (xmm_to_xmm_mem cross_lh)))
            ;; cross_up = cross << 32
            (cross_up Xmm (psllq cross (xmm_mem_imm_new (RegMemImm.Imm 32))))
            ;; low_prod = Al * Bl
            (low_prod Xmm (pmuludq lhs_vec (xmm_to_xmm_mem rhs_vec))))
        ;; result = low_prod + cross_up
        (value_xmm (paddq low_prod (xmm_to_xmm_mem cross_up)))))
|
|
|
|
;; Special case for `i16x8.extmul_high_i8x16_s`: an `imul` whose operands are
;; both `swiden_high` of i8x16 values.
(rule (lower (has_type (multi_lane 16 8)
                       (imul (def_inst (swiden_high
                                        (and (value_type (multi_lane 8 16)) lhs)))
                             (def_inst (swiden_high
                                        (and (value_type (multi_lane 8 16)) rhs))))))
      (let (;; `palignr` of a vector with itself by 8 bytes brings the upper
            ;; eight bytes into the low half, where `pmovsxbw` sign-extends
            ;; them to 16-bit lanes.
            (lhs_vec Xmm (put_in_xmm lhs))
            (lhs_top Xmm (palignr lhs_vec (xmm_to_xmm_mem lhs_vec) 8 (OperandSize.Size32)))
            (lhs_wide Xmm (pmovsxbw (xmm_to_xmm_mem lhs_top)))
            (rhs_vec Xmm (put_in_xmm rhs))
            (rhs_top Xmm (palignr rhs_vec (xmm_to_xmm_mem rhs_vec) 8 (OperandSize.Size32)))
            (rhs_wide Xmm (pmovsxbw (xmm_to_xmm_mem rhs_top))))
        (value_xmm (pmullw lhs_wide (xmm_to_xmm_mem rhs_wide)))))
|
|
|
|
;; Special case for `i32x4.extmul_high_i16x8_s`: an `imul` whose operands are
;; both `swiden_high` of i16x8 values.
(rule (lower (has_type (multi_lane 32 4)
                       (imul (def_inst (swiden_high
                                        (and (value_type (multi_lane 16 8)) lhs)))
                             (def_inst (swiden_high
                                        (and (value_type (multi_lane 16 8)) rhs))))))
      (let ((lhs_vec Xmm (put_in_xmm lhs))
            (rhs_vec Xmm (put_in_xmm rhs))
            ;; Low and (signed) high 16 bits of every 16x16 product.
            (prod_lo Xmm (pmullw lhs_vec (xmm_to_xmm_mem rhs_vec)))
            (prod_hi Xmm (pmulhw lhs_vec (xmm_to_xmm_mem rhs_vec))))
        ;; Interleave the halves of the upper four lanes into 32-bit results.
        (value_xmm (punpckhwd prod_lo (xmm_to_xmm_mem prod_hi)))))
|
|
|
|
;; Special case for `i64x2.extmul_high_i32x4_s`: an `imul` whose operands are
;; both `swiden_high` of i32x4 values.
(rule (lower (has_type (multi_lane 64 2)
                       (imul (def_inst (swiden_high
                                        (and (value_type (multi_lane 32 4)) lhs)))
                             (def_inst (swiden_high
                                        (and (value_type (multi_lane 32 4)) rhs))))))
      (let (;; Shuffle control 0xFA selects source dwords [2, 2, 3, 3], so the
            ;; two upper dwords land in the even positions.
            (lhs_shuf Xmm (pshufd (put_in_xmm_mem lhs)
                                  0xFA
                                  (OperandSize.Size32)))
            (rhs_shuf Xmm (pshufd (put_in_xmm_mem rhs)
                                  0xFA
                                  (OperandSize.Size32))))
        ;; Signed multiply of the shuffled operands.
        (value_xmm (pmuldq lhs_shuf (xmm_to_xmm_mem rhs_shuf)))))
|
|
|
|
;; Special case for `i16x8.extmul_low_i8x16_s`: an `imul` whose operands are
;; both `swiden_low` of i8x16 values.
(rule (lower (has_type (multi_lane 16 8)
                       (imul (def_inst (swiden_low
                                        (and (value_type (multi_lane 8 16)) lhs)))
                             (def_inst (swiden_low
                                        (and (value_type (multi_lane 8 16)) rhs))))))
      (let (;; Sign-extend the low eight bytes of each operand to 16 bits.
            (lhs_wide Xmm (pmovsxbw (put_in_xmm_mem lhs)))
            (rhs_wide Xmm (pmovsxbw (put_in_xmm_mem rhs))))
        (value_xmm (pmullw lhs_wide (xmm_to_xmm_mem rhs_wide)))))
|
|
|
|
;; Special case for `i32x4.extmul_low_i16x8_s`: an `imul` whose operands are
;; both `swiden_low` of i16x8 values.
(rule (lower (has_type (multi_lane 32 4)
                       (imul (def_inst (swiden_low
                                        (and (value_type (multi_lane 16 8)) lhs)))
                             (def_inst (swiden_low
                                        (and (value_type (multi_lane 16 8)) rhs))))))
      (let ((lhs_vec Xmm (put_in_xmm lhs))
            (rhs_vec Xmm (put_in_xmm rhs))
            ;; Low and (signed) high 16 bits of every 16x16 product.
            (prod_lo Xmm (pmullw lhs_vec (xmm_to_xmm_mem rhs_vec)))
            (prod_hi Xmm (pmulhw lhs_vec (xmm_to_xmm_mem rhs_vec))))
        ;; Interleave the halves of the lower four lanes into 32-bit results.
        (value_xmm (punpcklwd prod_lo (xmm_to_xmm_mem prod_hi)))))
|
|
|
|
;; Special case for `i64x2.extmul_low_i32x4_s`: an `imul` whose operands are
;; both `swiden_low` of i32x4 values.
(rule (lower (has_type (multi_lane 64 2)
                       (imul (def_inst (swiden_low
                                        (and (value_type (multi_lane 32 4)) lhs)))
                             (def_inst (swiden_low
                                        (and (value_type (multi_lane 32 4)) rhs))))))
      (let (;; Shuffle control 0x50 selects source dwords [0, 0, 1, 1], so the
            ;; two lower dwords land in the even positions.
            (lhs_shuf Xmm (pshufd (put_in_xmm_mem lhs)
                                  0x50
                                  (OperandSize.Size32)))
            (rhs_shuf Xmm (pshufd (put_in_xmm_mem rhs)
                                  0x50
                                  (OperandSize.Size32))))
        ;; Signed multiply of the shuffled operands.
        (value_xmm (pmuldq lhs_shuf (xmm_to_xmm_mem rhs_shuf)))))
|
|
|
|
;; Special case for `i16x8.extmul_high_i8x16_u`: an `imul` whose operands are
;; both `uwiden_high` of i8x16 values.
(rule (lower (has_type (multi_lane 16 8)
                       (imul (def_inst (uwiden_high
                                        (and (value_type (multi_lane 8 16)) lhs)))
                             (def_inst (uwiden_high
                                        (and (value_type (multi_lane 8 16)) rhs))))))
      (let (;; `palignr` of a vector with itself by 8 bytes brings the upper
            ;; eight bytes into the low half, where `pmovzxbw` zero-extends
            ;; them to 16-bit lanes.
            (lhs_vec Xmm (put_in_xmm lhs))
            (lhs_top Xmm (palignr lhs_vec (xmm_to_xmm_mem lhs_vec) 8 (OperandSize.Size32)))
            (lhs_wide Xmm (pmovzxbw (xmm_to_xmm_mem lhs_top)))
            (rhs_vec Xmm (put_in_xmm rhs))
            (rhs_top Xmm (palignr rhs_vec (xmm_to_xmm_mem rhs_vec) 8 (OperandSize.Size32)))
            (rhs_wide Xmm (pmovzxbw (xmm_to_xmm_mem rhs_top))))
        (value_xmm (pmullw lhs_wide (xmm_to_xmm_mem rhs_wide)))))
|
|
|
|
;; Special case for `i32x4.extmul_high_i16x8_u`: an `imul` whose operands are
;; both `uwiden_high` of i16x8 values.
(rule (lower (has_type (multi_lane 32 4)
                       (imul (def_inst (uwiden_high
                                        (and (value_type (multi_lane 16 8)) lhs)))
                             (def_inst (uwiden_high
                                        (and (value_type (multi_lane 16 8)) rhs))))))
      (let ((lhs_vec Xmm (put_in_xmm lhs))
            (rhs_vec Xmm (put_in_xmm rhs))
            ;; Low and (unsigned) high 16 bits of every 16x16 product.
            (prod_lo Xmm (pmullw lhs_vec (xmm_to_xmm_mem rhs_vec)))
            (prod_hi Xmm (pmulhuw lhs_vec (xmm_to_xmm_mem rhs_vec))))
        ;; Interleave the halves of the upper four lanes into 32-bit results.
        (value_xmm (punpckhwd prod_lo (xmm_to_xmm_mem prod_hi)))))
|
|
|
|
;; Special case for `i64x2.extmul_high_i32x4_u`: an `imul` whose operands are
;; both `uwiden_high` of i32x4 values.
(rule (lower (has_type (multi_lane 64 2)
                       (imul (def_inst (uwiden_high
                                        (and (value_type (multi_lane 32 4)) lhs)))
                             (def_inst (uwiden_high
                                        (and (value_type (multi_lane 32 4)) rhs))))))
      (let (;; Shuffle control 0xFA selects source dwords [2, 2, 3, 3], so the
            ;; two upper dwords land in the even positions.
            (lhs_shuf Xmm (pshufd (put_in_xmm_mem lhs)
                                  0xFA
                                  (OperandSize.Size32)))
            (rhs_shuf Xmm (pshufd (put_in_xmm_mem rhs)
                                  0xFA
                                  (OperandSize.Size32))))
        ;; Unsigned multiply of the shuffled operands.
        (value_xmm (pmuludq lhs_shuf (xmm_to_xmm_mem rhs_shuf)))))
|
|
|
|
;; Special case for `i16x8.extmul_low_i8x16_u`: an `imul` whose operands are
;; both `uwiden_low` of i8x16 values.
(rule (lower (has_type (multi_lane 16 8)
                       (imul (def_inst (uwiden_low
                                        (and (value_type (multi_lane 8 16)) lhs)))
                             (def_inst (uwiden_low
                                        (and (value_type (multi_lane 8 16)) rhs))))))
      (let (;; Zero-extend the low eight bytes of each operand to 16 bits.
            (lhs_wide Xmm (pmovzxbw (put_in_xmm_mem lhs)))
            (rhs_wide Xmm (pmovzxbw (put_in_xmm_mem rhs))))
        (value_xmm (pmullw lhs_wide (xmm_to_xmm_mem rhs_wide)))))
|
|
|
|
;; Special case for `i32x4.extmul_low_i16x8_u`: an `imul` whose operands are
;; both `uwiden_low` of i16x8 values.
(rule (lower (has_type (multi_lane 32 4)
                       (imul (def_inst (uwiden_low
                                        (and (value_type (multi_lane 16 8)) lhs)))
                             (def_inst (uwiden_low
                                        (and (value_type (multi_lane 16 8)) rhs))))))
      (let ((lhs_vec Xmm (put_in_xmm lhs))
            (rhs_vec Xmm (put_in_xmm rhs))
            ;; Low and (unsigned) high 16 bits of every 16x16 product.
            (prod_lo Xmm (pmullw lhs_vec (xmm_to_xmm_mem rhs_vec)))
            (prod_hi Xmm (pmulhuw lhs_vec (xmm_to_xmm_mem rhs_vec))))
        ;; Interleave the halves of the lower four lanes into 32-bit results.
        (value_xmm (punpcklwd prod_lo (xmm_to_xmm_mem prod_hi)))))
|
|
|
|
;; Special case for `i64x2.extmul_low_i32x4_u`: an `imul` whose operands are
;; both `uwiden_low` of i32x4 values.
(rule (lower (has_type (multi_lane 64 2)
                       (imul (def_inst (uwiden_low
                                        (and (value_type (multi_lane 32 4)) lhs)))
                             (def_inst (uwiden_low
                                        (and (value_type (multi_lane 32 4)) rhs))))))
      (let (;; Shuffle control 0x50 selects source dwords [0, 0, 1, 1], so the
            ;; two lower dwords land in the even positions.
            (lhs_shuf Xmm (pshufd (put_in_xmm_mem lhs)
                                  0x50
                                  (OperandSize.Size32)))
            (rhs_shuf Xmm (pshufd (put_in_xmm_mem rhs)
                                  0x50
                                  (OperandSize.Size32))))
        ;; Unsigned multiply of the shuffled operands.
        (value_xmm (pmuludq lhs_shuf (xmm_to_xmm_mem rhs_shuf)))))
|
|
|
|
;;;; Rules for `band_not` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
;; Pick the and-not instruction for a vector type: `andnps` for `f32x4`,
;; `andnpd` for `f64x2`, and `pandn` for any other multi-lane type.
(decl sse_and_not (Type Xmm XmmMem) Xmm)
(rule (sse_and_not $F32X4 a b)
      (andnps a b))
(rule (sse_and_not $F64X2 a b)
      (andnpd a b))
(rule (sse_and_not (multi_lane _bits _lanes) a b)
      (pandn a b))
|
|
|
|
;; The operands are deliberately swapped here. CLIF defines
;;
;;   band_not(x, y) = and(x, not(y))
;;
;; whereas the x86 instruction computes
;;
;;   pandn(x, y) = and(not(x), y)
;;
;; so the CLIF second operand becomes the first instruction operand.
(rule (lower (has_type ty (band_not kept inverted)))
      (value_xmm (sse_and_not ty
                              (put_in_xmm inverted)
                              (put_in_xmm_mem kept))))
|
|
|
|
;;;; Rules for `iabs` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
;; i8x16, i16x8 and i32x4 each use the packed-absolute-value instruction for
;; their lane width.

(rule (lower (has_type $I8X16 (iabs val)))
      (value_xmm (pabsb
                  (put_in_xmm_mem val))))

(rule (lower (has_type $I16X8 (iabs val)))
      (value_xmm (pabsw
                  (put_in_xmm_mem val))))

(rule (lower (has_type $I32X4 (iabs val)))
      (value_xmm (pabsd
                  (put_in_xmm_mem val))))
|
|
|
|
;; When both AVX-512VL and AVX-512F are enabled, `i64x2` absolute value is a
;; single `vpabsq`.
(rule (lower (has_type (and (avx512vl_enabled)
                            (avx512f_enabled)
                            $I64X2)
                       (iabs val)))
      (value_xmm (vpabsq
                  (put_in_xmm_mem val))))
|
|
|
|
;; Otherwise, a separate register holds the result of `0 - x`, and `blendvpd`
;; then combines it with `x`. The negated value doubles as the blend mask:
;; the choice hinges on its MSB, which is set when the negated value is
;; negative, i.e. when `x` was originally positive.
(rule (lower (has_type $I64X2 (iabs val)))
      (let ((val_xmm Xmm (put_in_xmm val))
            ;; negated = 0 - val
            (negated Xmm (psubq (xmm_new (imm $I64X2 0))
                                (xmm_to_xmm_mem val_xmm))))
        (value_xmm (blendvpd negated
                             (xmm_to_xmm_mem val_xmm)
                             negated))))
|
|
|
|
;;;; Rules for `fabs` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
;; `f32x4.abs`: AND with a mask built by shifting an all-ones vector right by
;; one bit in every 32-bit lane, which clears each lane's sign bit.
(rule (lower (has_type $F32X4 (fabs val)))
      (value_xmm (andps
                  (put_in_xmm val)
                  (xmm_to_xmm_mem
                   (psrld (vector_all_ones $F32X4)
                          (xmm_mem_imm_new (RegMemImm.Imm 1)))))))
|
|
|
|
;; `f64x2.abs`: AND with a mask built by shifting an all-ones vector right by
;; one bit in every 64-bit lane, which clears each lane's sign bit.
(rule (lower (has_type $F64X2 (fabs val)))
      (value_xmm (andpd
                  (put_in_xmm val)
                  (xmm_to_xmm_mem
                   (psrlq (vector_all_ones $F64X2)
                          (xmm_mem_imm_new (RegMemImm.Imm 1)))))))
|
|
|
|
;;;; Rules for `bnot` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
;; Scalars of 64 bits or fewer: a single `not` on a GPR.

(rule (lower (has_type (fits_in_64 ty) (bnot val)))
      (value_gpr (not ty
                      (put_in_gpr val))))
|
|
|
|
;; 128-bit values: apply a 64-bit `not` to each of the two halves.

(decl i128_not (Value) ValueRegs)
(rule (i128_not val)
      (let ((halves ValueRegs (put_in_regs val))
            (half_lo Gpr (gpr_new (value_regs_get halves 0)))
            (half_hi Gpr (gpr_new (value_regs_get halves 1))))
        (value_gprs (not $I64 half_lo)
                    (not $I64 half_hi))))
|
|
|
|
;; `i128` and `b128` both defer to `i128_not`.

(rule (lower (has_type $I128 (bnot val)))
      (i128_not val))

(rule (lower (has_type $B128 (bnot val)))
      (i128_not val))
|
|
|
|
;; Vector types: bitwise negation is an XOR against an all-ones vector of the
;; same type.
(rule (lower (has_type ty @ (multi_lane _bits _lanes) (bnot val)))
      (value_xmm (sse_xor ty
                          (put_in_xmm val)
                          (xmm_to_xmm_mem (vector_all_ones ty)))))
|
|
|
|
;;;; Rules for `bitselect` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
;; Bitwise select:
;;
;;   from_true  = and if_true, condition
;;   from_false = and_not condition, if_false
;;   result     = or from_false, from_true
(rule (lower (has_type ty @ (multi_lane _bits _lanes)
                       (bitselect condition if_true if_false)))
      (let ((mask Xmm (put_in_xmm condition))
            (from_true Xmm (sse_and ty
                                    (put_in_xmm if_true)
                                    (xmm_to_xmm_mem mask)))
            (from_false Xmm (sse_and_not ty
                                         mask
                                         (put_in_xmm_mem if_false))))
        (value_xmm (sse_or ty from_false (xmm_to_xmm_mem from_true)))))
|
|
|
|
;;;; Rules for `vselect` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
;; Vector select: hand the condition and both inputs to `sse_blend` for the
;; result type.
(rule (lower (has_type ty @ (multi_lane _bits _lanes)
                       (vselect mask on_true on_false)))
      (value_xmm (sse_blend ty
                            (put_in_xmm_mem mask)
                            (put_in_xmm_mem on_true)
                            (put_in_xmm on_false))))
|
|
|
|
;;;; Rules for `insertlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
;; `insertlane` defers to `vec_insert_lane`, keyed on the type of the vector
;; being inserted into.
(rule (lower (insertlane dst @ (value_type ty) lane_val (u8_from_uimm8 lane)))
      (value_xmm (vec_insert_lane ty
                                  (put_in_xmm dst)
                                  (put_in_reg_mem lane_val)
                                  lane)))
|
|
|
|
;; Helper used by the `insertlane` lowering above, and available to other
;; lowerings as well.
;;
;; The `Type` argument is the type of the vector being inserted into, i.e.
;; the type of the `Xmm` argument.
(decl vec_insert_lane (Type Xmm RegMem u8) Xmm)

;; i8x16.replace_lane
(rule (vec_insert_lane $I8X16 dst lane_val lane)
      (pinsrb dst
              (reg_mem_to_gpr_mem lane_val)
              lane))

;; i16x8.replace_lane
(rule (vec_insert_lane $I16X8 dst lane_val lane)
      (pinsrw dst
              (reg_mem_to_gpr_mem lane_val)
              lane))

;; i32x4.replace_lane
(rule (vec_insert_lane $I32X4 dst lane_val lane)
      (pinsrd dst
              (reg_mem_to_gpr_mem lane_val)
              lane
              (OperandSize.Size32)))

;; i64x2.replace_lane (`pinsrd` with a 64-bit operand size)
(rule (vec_insert_lane $I64X2 dst lane_val lane)
      (pinsrd dst
              (reg_mem_to_gpr_mem lane_val)
              lane
              (OperandSize.Size64)))
|
|
|
|
;; f32x4.replace_lane
(rule (vec_insert_lane $F32X4 dst lane_val lane)
      (insertps dst
                (reg_mem_to_xmm_mem lane_val)
                (sse_insertps_lane_imm lane)))

;; Computes the immediate operand of `insertps` from a lane index. Implemented
;; in external Rust code.
(decl sse_insertps_lane_imm (u8) u8)
(extern constructor sse_insertps_lane_imm sse_insertps_lane_imm)
|
|
|
|
;; f64x2.replace_lane 0
;;
;; `movsd` is used to specialize moving into the first lane. Unlike the cases
;; above, the lane index is not passed to the instruction as an immediate.
;;
;; Note that `movsd` treats the second lane of the f64x2 differently
;; depending on whether its source operand is a register or memory: with a
;; register source it preserves the upper bits, but with a memory source it
;; zeros them. The upper bits must be preserved here, so when a `RegMem.Mem`
;; is passed in, two `movsd` instructions are emitted. The first (used as
;; `xmm_unary_rm_r`) loads from memory into a temporary register; the second
;; (modeled internally as `xmm_rm_r`) merges that temporary into the `vec`
;; register.
(rule (vec_insert_lane $F64X2 dst (RegMem.Reg src_reg) 0)
      (movsd dst
             (reg_mem_to_xmm_mem (RegMem.Reg src_reg))))
(rule (vec_insert_lane $F64X2 dst src_mem 0)
      (movsd dst
             (xmm_to_xmm_mem
              (xmm_unary_rm_r (SseOpcode.Movsd)
                              (reg_mem_to_xmm_mem src_mem)))))
|
|
|
|
;; f64x2.replace_lane 1
;;
;; `movlhps` is used to specialize moving into the second lane. As with lane
;; 0, the lane index is not passed to the instruction as an immediate.
(rule (vec_insert_lane $F64X2 dst lane_val 1)
      (movlhps dst
               (reg_mem_to_xmm_mem lane_val)))
|
|
|
|
;;;; Rules for `imax` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
;; One packed signed-maximum instruction per supported lane width.

(rule (lower (has_type $I8X16 (imax lhs rhs)))
      (value_xmm (pmaxsb (put_in_xmm lhs)
                         (put_in_xmm_mem rhs))))

(rule (lower (has_type $I16X8 (imax lhs rhs)))
      (value_xmm (pmaxsw (put_in_xmm lhs)
                         (put_in_xmm_mem rhs))))

(rule (lower (has_type $I32X4 (imax lhs rhs)))
      (value_xmm (pmaxsd (put_in_xmm lhs)
                         (put_in_xmm_mem rhs))))
|
|
|
|
;;;; Rules for `imin` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
;; One packed signed-minimum instruction per supported lane width.

(rule (lower (has_type $I8X16 (imin lhs rhs)))
      (value_xmm (pminsb (put_in_xmm lhs)
                         (put_in_xmm_mem rhs))))

(rule (lower (has_type $I16X8 (imin lhs rhs)))
      (value_xmm (pminsw (put_in_xmm lhs)
                         (put_in_xmm_mem rhs))))

(rule (lower (has_type $I32X4 (imin lhs rhs)))
      (value_xmm (pminsd (put_in_xmm lhs)
                         (put_in_xmm_mem rhs))))
|
|
|
|
;;;; Rules for `umax` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
;; One packed unsigned-maximum instruction per supported lane width.

(rule (lower (has_type $I8X16 (umax lhs rhs)))
      (value_xmm (pmaxub (put_in_xmm lhs)
                         (put_in_xmm_mem rhs))))

(rule (lower (has_type $I16X8 (umax lhs rhs)))
      (value_xmm (pmaxuw (put_in_xmm lhs)
                         (put_in_xmm_mem rhs))))

(rule (lower (has_type $I32X4 (umax lhs rhs)))
      (value_xmm (pmaxud (put_in_xmm lhs)
                         (put_in_xmm_mem rhs))))
|
|
|
|
;;;; Rules for `umin` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
;; One packed unsigned-minimum instruction per supported lane width.

(rule (lower (has_type $I8X16 (umin lhs rhs)))
      (value_xmm (pminub (put_in_xmm lhs)
                         (put_in_xmm_mem rhs))))

(rule (lower (has_type $I16X8 (umin lhs rhs)))
      (value_xmm (pminuw (put_in_xmm lhs)
                         (put_in_xmm_mem rhs))))

(rule (lower (has_type $I32X4 (umin lhs rhs)))
      (value_xmm (pminud (put_in_xmm lhs)
                         (put_in_xmm_mem rhs))))
|
|
|
|
;;;; Rules for `trap` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
;; `trap` lowers to a `ud2` carrying the trap code, wrapped in `safepoint`.
(rule (lower (trap trap_code))
      (safepoint
       (ud2 trap_code)))
|
|
|
|
;;;; Rules for `resumable_trap` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
;; `resumable_trap` lowers exactly like `trap`: a `ud2` carrying the trap
;; code, wrapped in `safepoint`.
(rule (lower (resumable_trap trap_code))
      (safepoint
       (ud2 trap_code)))
|