From db06e4e622eadc486a1ee9c9ef83cb1a81067805 Mon Sep 17 00:00:00 2001 From: Trevor Elliott Date: Thu, 29 Sep 2022 10:09:37 -0700 Subject: [PATCH] ISLE: Resolve remaining x64 overlap errors (#4977) Resolve overlap errors with the x64 backend. --- cranelift/codegen/src/isa/x64/lower.isle | 374 +++++++++++------------ cranelift/codegen/src/machinst/isle.rs | 5 + cranelift/codegen/src/prelude.isle | 7 + 3 files changed, 198 insertions(+), 188 deletions(-) diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle index 72b82d271c..c78c111f2b 100644 --- a/cranelift/codegen/src/isa/x64/lower.isle +++ b/cranelift/codegen/src/isa/x64/lower.isle @@ -17,7 +17,7 @@ (imm ty x)) ;; `i128` -(rule (lower (has_type $I128 +(rule 1 (lower (has_type $I128 (iconst (u64_from_imm64 x)))) (value_regs (imm $I64 x) (imm $I64 0))) @@ -36,12 +36,12 @@ ;; `b128` -(rule (lower (has_type $B128 +(rule 1 (lower (has_type $B128 (bconst $false))) (value_regs (imm $B64 0) (imm $B64 0))) -(rule (lower (has_type $B128 +(rule 1 (lower (has_type $B128 (bconst $true))) (value_regs (imm $B64 1) (imm $B64 0))) @@ -66,29 +66,29 @@ ;; `i64` and smaller. ;; Add two registers. -(rule (lower (has_type (fits_in_64 ty) +(rule -5 (lower (has_type (fits_in_64 ty) (iadd x y))) (x64_add ty x y)) ;; Add a register and an immediate. -(rule (lower (has_type (fits_in_64 ty) +(rule -4 (lower (has_type (fits_in_64 ty) (iadd x (simm32_from_value y)))) (x64_add ty x y)) -(rule (lower (has_type (fits_in_64 ty) +(rule -3 (lower (has_type (fits_in_64 ty) (iadd (simm32_from_value x) y))) (x64_add ty y x)) ;; Add a register and memory. -(rule (lower (has_type (fits_in_64 ty) +(rule -2 (lower (has_type (fits_in_64 ty) (iadd x (sinkable_load y)))) (x64_add ty x (sink_load_to_gpr_mem_imm y))) -(rule (lower (has_type (fits_in_64 ty) +(rule -1 (lower (has_type (fits_in_64 ty) (iadd (sinkable_load x) y))) (x64_add ty y @@ -113,7 +113,7 @@ (x64_paddq x y)) ;; `i128` -(rule (lower (has_type $I128 (iadd x y))) +(rule 1 (lower (has_type $I128 (iadd x y))) ;; Get the high/low registers for `x`. (let ((x_regs ValueRegs x) (x_lo Gpr (value_regs_get_gpr x_regs 0)) @@ -163,27 +163,27 @@ (output_pair reg (value_regs_invalid))) ;; Add two registers. -(rule (lower (has_type (fits_in_64 ty) +(rule 0 (lower (has_type (fits_in_64 ty) (iadd_ifcout x y))) (output_ifcout (x64_add ty x y))) ;; Add a register and an immediate. -(rule (lower (has_type (fits_in_64 ty) +(rule 1 (lower (has_type (fits_in_64 ty) (iadd_ifcout x (simm32_from_value y)))) (output_ifcout (x64_add ty x y))) -(rule (lower (has_type (fits_in_64 ty) +(rule 2 (lower (has_type (fits_in_64 ty) (iadd_ifcout (simm32_from_value x) y))) (output_ifcout (x64_add ty y x))) ;; Add a register and memory. -(rule (lower (has_type (fits_in_64 ty) +(rule 3 (lower (has_type (fits_in_64 ty) (iadd_ifcout x (sinkable_load y)))) (output_ifcout (x64_add ty x (sink_load_to_gpr_mem_imm y)))) -(rule (lower (has_type (fits_in_64 ty) +(rule 4 (lower (has_type (fits_in_64 ty) (iadd_ifcout (sinkable_load x) y))) (output_ifcout (x64_add ty y (sink_load_to_gpr_mem_imm x)))) @@ -194,17 +194,17 @@ ;; `i64` and smaller. ;; Sub two registers. -(rule (lower (has_type (fits_in_64 ty) +(rule -3 (lower (has_type (fits_in_64 ty) (isub x y))) (x64_sub ty x y)) ;; Sub a register and an immediate. -(rule (lower (has_type (fits_in_64 ty) +(rule -2 (lower (has_type (fits_in_64 ty) (isub x (simm32_from_value y)))) (x64_sub ty x y)) ;; Sub a register and memory. 
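;; The `iadd` and `isub` lowerings above all overlap: once an operand happens
;; to be a small constant or a sinkable load, the plain register-register rule
;; still matches too, so the patch spreads the variants over priorities -5..-1
;; with the most general form lowest.  ISLE rules default to priority 0 and,
;; when several rules can match the same input, the highest-priority rule is
;; taken, which is how genuinely overlapping rules are kept legal.  A minimal,
;; self-contained sketch of that idea follows; it is not part of this patch,
;; and `lower_add`, `add_rr`, `add_ri`, `small_imm`, and `Reg` are hypothetical
;; stand-ins for the real terms.

(type u32 (primitive u32))
(type Reg (primitive Reg))

(decl lower_add (Reg Reg) Reg)
(decl add_rr (Reg Reg) Reg)   ;; register-register form
(decl add_ri (Reg u32) Reg)   ;; register-immediate form
(decl small_imm (u32) Reg)    ;; fallible: does this value hold a small constant?
(extern constructor add_rr add_rr)
(extern constructor add_ri add_ri)
(extern extractor small_imm small_imm)

;; General form: lowest priority, used only when nothing better applies.
(rule -1 (lower_add x y) (add_rr x y))
;; Strictly more specific form: the right operand is a small constant.
(rule 0 (lower_add x (small_imm imm)) (add_ri x imm))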
-(rule (lower (has_type (fits_in_64 ty) +(rule -1 (lower (has_type (fits_in_64 ty) (isub x (sinkable_load y)))) (x64_sub ty x (sink_load_to_gpr_mem_imm y))) @@ -228,7 +228,7 @@ (x64_psubq x y)) ;; `i128` -(rule (lower (has_type $I128 (isub x y))) +(rule 1 (lower (has_type $I128 (isub x y))) ;; Get the high/low registers for `x`. (let ((x_regs ValueRegs x) (x_lo Gpr (value_regs_get_gpr x_regs 0)) @@ -266,17 +266,17 @@ ;; `{i,b}64` and smaller. ;; And two registers. -(rule (lower (has_type (fits_in_64 ty) (band x y))) +(rule 0 (lower (has_type (fits_in_64 ty) (band x y))) (x64_and ty x y)) ;; And with a memory operand. -(rule (lower (has_type (fits_in_64 ty) +(rule 1 (lower (has_type (fits_in_64 ty) (band x (sinkable_load y)))) (x64_and ty x (sink_load_to_gpr_mem_imm y))) -(rule (lower (has_type (fits_in_64 ty) +(rule 2 (lower (has_type (fits_in_64 ty) (band (sinkable_load x) y))) (x64_and ty y @@ -284,11 +284,11 @@ ;; And with an immediate. -(rule (lower (has_type (fits_in_64 ty) +(rule 3 (lower (has_type (fits_in_64 ty) (band x (simm32_from_value y)))) (x64_and ty x y)) -(rule (lower (has_type (fits_in_64 ty) +(rule 4 (lower (has_type (fits_in_64 ty) (band (simm32_from_value x) y))) (x64_and ty y x)) @@ -297,15 +297,15 @@ (decl sse_and (Type Xmm XmmMem) Xmm) (rule (sse_and $F32X4 x y) (x64_andps x y)) (rule (sse_and $F64X2 x y) (x64_andpd x y)) -(rule (sse_and (multi_lane _bits _lanes) x y) (x64_pand x y)) +(rule -1 (sse_and (multi_lane _bits _lanes) x y) (x64_pand x y)) -(rule (lower (has_type ty @ (multi_lane _bits _lanes) +(rule 5 (lower (has_type ty @ (multi_lane _bits _lanes) (band x y))) (sse_and ty x y)) ;; `{i,b}128`. -(rule (lower (has_type $I128 (band x y))) +(rule 6 (lower (has_type $I128 (band x y))) (let ((x_regs ValueRegs x) (x_lo Gpr (value_regs_get_gpr x_regs 0)) (x_hi Gpr (value_regs_get_gpr x_regs 1)) @@ -315,7 +315,7 @@ (value_gprs (x64_and $I64 x_lo y_lo) (x64_and $I64 x_hi y_hi)))) -(rule (lower (has_type $B128 (band x y))) +(rule 6 (lower (has_type $B128 (band x y))) ;; Booleans are always `0` or `1`, so we only need to do the `and` on the ;; low half. The high half is always zero but, rather than generate a new ;; zero, we just reuse `x`'s high half which is already zero. @@ -331,28 +331,28 @@ ;; `{i,b}64` and smaller. ;; Or two registers. -(rule (lower (has_type (fits_in_64 ty) (bor x y))) +(rule 0 (lower (has_type (fits_in_64 ty) (bor x y))) (x64_or ty x y)) ;; Or with a memory operand. -(rule (lower (has_type (fits_in_64 ty) +(rule 1 (lower (has_type (fits_in_64 ty) (bor x (sinkable_load y)))) (x64_or ty x (sink_load_to_gpr_mem_imm y))) -(rule (lower (has_type (fits_in_64 ty) +(rule 2 (lower (has_type (fits_in_64 ty) (bor (sinkable_load x) y))) (x64_or ty y (sink_load_to_gpr_mem_imm x))) ;; Or with an immediate. 
-(rule (lower (has_type (fits_in_64 ty) +(rule 3 (lower (has_type (fits_in_64 ty) (bor x (simm32_from_value y)))) (x64_or ty x y)) -(rule (lower (has_type (fits_in_64 ty) +(rule 4 (lower (has_type (fits_in_64 ty) (bor (simm32_from_value x) y))) (x64_or ty y x)) @@ -361,9 +361,9 @@ (decl sse_or (Type Xmm XmmMem) Xmm) (rule (sse_or $F32X4 x y) (x64_orps x y)) (rule (sse_or $F64X2 x y) (x64_orpd x y)) -(rule (sse_or (multi_lane _bits _lanes) x y) (x64_por x y)) +(rule -1 (sse_or (multi_lane _bits _lanes) x y) (x64_por x y)) -(rule (lower (has_type ty @ (multi_lane _bits _lanes) +(rule 5 (lower (has_type ty @ (multi_lane _bits _lanes) (bor x y))) (sse_or ty x y)) @@ -378,10 +378,10 @@ (value_gprs (x64_or $I64 x_lo y_lo) (x64_or $I64 x_hi y_hi)))) -(rule (lower (has_type $I128 (bor x y))) +(rule 6 (lower (has_type $I128 (bor x y))) (or_i128 x y)) -(rule (lower (has_type $B128 (bor x y))) +(rule 6 (lower (has_type $B128 (bor x y))) ;; Booleans are always `0` or `1`, so we only need to do the `or` on the ;; low half. The high half is always zero but, rather than generate a new ;; zero, we just reuse `x`'s high half which is already zero. @@ -397,39 +397,39 @@ ;; `{i,b}64` and smaller. ;; Xor two registers. -(rule (lower (has_type (fits_in_64 ty) (bxor x y))) +(rule 0 (lower (has_type (fits_in_64 ty) (bxor x y))) (x64_xor ty x y)) ;; Xor with a memory operand. -(rule (lower (has_type (fits_in_64 ty) +(rule 1 (lower (has_type (fits_in_64 ty) (bxor x (sinkable_load y)))) (x64_xor ty x (sink_load_to_gpr_mem_imm y))) -(rule (lower (has_type (fits_in_64 ty) +(rule 2 (lower (has_type (fits_in_64 ty) (bxor (sinkable_load x) y))) (x64_xor ty y (sink_load_to_gpr_mem_imm x))) ;; Xor with an immediate. -(rule (lower (has_type (fits_in_64 ty) +(rule 3 (lower (has_type (fits_in_64 ty) (bxor x (simm32_from_value y)))) (x64_xor ty x y)) -(rule (lower (has_type (fits_in_64 ty) +(rule 4 (lower (has_type (fits_in_64 ty) (bxor (simm32_from_value x) y))) (x64_xor ty y x)) ;; SSE. -(rule (lower (has_type ty @ (multi_lane _bits _lanes) (bxor x y))) +(rule 5 (lower (has_type ty @ (multi_lane _bits _lanes) (bxor x y))) (sse_xor ty x y)) ;; `{i,b}128`. -(rule (lower (has_type $I128 (bxor x y))) +(rule 6 (lower (has_type $I128 (bxor x y))) (let ((x_regs ValueRegs x) (x_lo Gpr (value_regs_get_gpr x_regs 0)) (x_hi Gpr (value_regs_get_gpr x_regs 1)) @@ -439,7 +439,7 @@ (value_gprs (x64_xor $I64 x_lo y_lo) (x64_xor $I64 x_hi y_hi)))) -(rule (lower (has_type $B128 (bxor x y))) +(rule 6 (lower (has_type $B128 (bxor x y))) ;; Booleans are always `0` or `1`, so we only need to do the `xor` on the ;; low half. The high half is always zero but, rather than generate a new ;; zero, we just reuse `x`'s high half which is already zero. @@ -454,7 +454,7 @@ ;; `i64` and smaller. -(rule (lower (has_type (fits_in_64 ty) (ishl src amt))) +(rule -1 (lower (has_type (fits_in_64 ty) (ishl src amt))) (x64_shl ty src (put_masked_in_imm8_gpr amt ty))) ;; `i128`. @@ -565,7 +565,7 @@ ;; `i64` and smaller. -(rule (lower (has_type (fits_in_64 ty) (ushr src amt))) +(rule -1 (lower (has_type (fits_in_64 ty) (ushr src amt))) (let ((src_ Gpr (extend_to_gpr src ty (ExtendKind.Zero)))) (x64_shr ty src_ (put_masked_in_imm8_gpr amt ty)))) @@ -672,7 +672,7 @@ ;; `i64` and smaller. 
-(rule (lower (has_type (fits_in_64 ty) (sshr src amt))) +(rule -1 (lower (has_type (fits_in_64 ty) (sshr src amt))) (let ((src_ Gpr (extend_to_gpr src ty (ExtendKind.Sign)))) (x64_sar ty src_ (put_masked_in_imm8_gpr amt ty)))) @@ -790,7 +790,7 @@ ;; `i64` and smaller: we can rely on x86's rotate-amount masking since ;; we operate on the whole register. For const's we mask the constant. -(rule (lower (has_type (fits_in_64 ty) (rotl src amt))) +(rule -1 (lower (has_type (fits_in_64 ty) (rotl src amt))) (x64_rotl ty src (put_masked_in_imm8_gpr amt ty))) @@ -811,7 +811,7 @@ ;; `i64` and smaller: we can rely on x86's rotate-amount masking since ;; we operate on the whole register. For const's we mask the constant. -(rule (lower (has_type (fits_in_64 ty) (rotr src amt))) +(rule -1 (lower (has_type (fits_in_64 ty) (rotr src amt))) (x64_rotr ty src (put_masked_in_imm8_gpr amt ty))) @@ -831,7 +831,7 @@ ;; `i64` and smaller. -(rule (lower (has_type (fits_in_64 ty) (ineg x))) +(rule -1 (lower (has_type (fits_in_64 ty) (ineg x))) (x64_neg ty x)) ;; SSE. @@ -863,28 +863,28 @@ ;; `i64` and smaller. ;; Multiply two registers. -(rule (lower (has_type (fits_in_64 ty) (imul x y))) +(rule -4 (lower (has_type (fits_in_64 ty) (imul x y))) (x64_mul ty x y)) ;; Multiply a register and an immediate. -(rule (lower (has_type (fits_in_64 ty) +(rule -2 (lower (has_type (fits_in_64 ty) (imul x (simm32_from_value y)))) (x64_mul ty x y)) -(rule (lower (has_type (fits_in_64 ty) +(rule -3 (lower (has_type (fits_in_64 ty) (imul (simm32_from_value x) y))) (x64_mul ty y x)) ;; Multiply a register and a memory load. -(rule (lower (has_type (fits_in_64 ty) +(rule -1 (lower (has_type (fits_in_64 ty) (imul x (sinkable_load y)))) (x64_mul ty x (sink_load_to_gpr_mem_imm y))) -(rule (lower (has_type (fits_in_64 ty) +(rule 0 (lower (has_type (fits_in_64 ty) (imul (sinkable_load x) y))) (x64_mul ty y (sink_load_to_gpr_mem_imm x))) @@ -904,7 +904,7 @@ ;; dst_lo:hi_lolo = mulhi_u x_lo, y_lo ;; dst_hi = add hilo_hilo, hi_lolo ;; return (dst_lo, dst_hi) -(rule (lower (has_type $I128 (imul x y))) +(rule 2 (lower (has_type $I128 (imul x y))) ;; Put `x` into registers and unpack its hi/lo halves. (let ((x_regs ValueRegs x) (x_lo Gpr (value_regs_get_gpr x_regs 0)) @@ -931,15 +931,15 @@ ;; (No i8x16 multiply.) -(rule (lower (has_type (multi_lane 16 8) (imul x y))) +(rule 1 (lower (has_type (multi_lane 16 8) (imul x y))) (x64_pmullw x y)) -(rule (lower (has_type (multi_lane 32 4) (imul x y))) +(rule 1 (lower (has_type (multi_lane 32 4) (imul x y))) (x64_pmulld x y)) ;; With AVX-512 we can implement `i64x2` multiplication with a single ;; instruction. -(rule (lower (has_type (and (avx512vl_enabled $true) +(rule 3 (lower (has_type (and (avx512vl_enabled $true) (avx512dq_enabled $true) (multi_lane 64 2)) (imul x y))) @@ -965,7 +965,7 @@ ;; the lane of the destination. For this reason we don't need shifts to isolate ;; the lower 32-bits, however, we will need to use shifts to isolate the high ;; 32-bits when doing calculations, i.e., `Ah == A >> 32`. -(rule (lower (has_type (multi_lane 64 2) +(rule 1 (lower (has_type (multi_lane 64 2) (imul a b))) (let ((a0 Xmm a) (b0 Xmm b) @@ -1143,7 +1143,7 @@ (decl sse_and_not (Type Xmm XmmMem) Xmm) (rule (sse_and_not $F32X4 x y) (x64_andnps x y)) (rule (sse_and_not $F64X2 x y) (x64_andnpd x y)) -(rule (sse_and_not (multi_lane _bits _lanes) x y) (x64_pandn x y)) +(rule -1 (sse_and_not (multi_lane _bits _lanes) x y) (x64_pandn x y)) ;; Note the flipping of operands below. 
CLIF specifies ;; @@ -1167,7 +1167,7 @@ (x64_pabsd x)) ;; When AVX512 is available, we can use a single `vpabsq` instruction. -(rule (lower (has_type (and (avx512vl_enabled $true) +(rule 1 (lower (has_type (and (avx512vl_enabled $true) (avx512f_enabled $true) $I64X2) (iabs x))) @@ -1224,7 +1224,7 @@ ;; `i64` and smaller. -(rule (lower (has_type (fits_in_64 ty) (bnot x))) +(rule -2 (lower (has_type (fits_in_64 ty) (bnot x))) (x64_not ty x)) ;; `i128`. @@ -1245,7 +1245,7 @@ ;; Special case for vector-types where bit-negation is an xor against an ;; all-one value -(rule (lower (has_type ty @ (multi_lane _bits _lanes) (bnot x))) +(rule -1 (lower (has_type ty @ (multi_lane _bits _lanes) (bnot x))) (sse_xor ty x (vector_all_ones))) ;;;; Rules for `bitselect` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -1322,7 +1322,7 @@ ;; load from memory into a temp register and then the second `movsd` (modeled ;; internally as `xmm_rm_r` will merge the temp register into our `vec` ;; register. -(rule (vec_insert_lane $F64X2 vec (RegMem.Reg val) 0) +(rule 1 (vec_insert_lane $F64X2 vec (RegMem.Reg val) 0) (x64_movsd_regmove vec val)) (rule (vec_insert_lane $F64X2 vec mem 0) (x64_movsd_regmove vec (x64_movsd_load mem))) @@ -1351,16 +1351,16 @@ (with_flags_reg (x64_cmp size x_reg y_reg) (cmove ty cc y_reg x_reg)))) -(rule (lower (has_type (fits_in_64 ty) (umin x y))) +(rule -1 (lower (has_type (fits_in_64 ty) (umin x y))) (cmp_and_choose ty (CC.B) x y)) -(rule (lower (has_type (fits_in_64 ty) (umax x y))) +(rule -1 (lower (has_type (fits_in_64 ty) (umax x y))) (cmp_and_choose ty (CC.NB) x y)) -(rule (lower (has_type (fits_in_64 ty) (imin x y))) +(rule -1 (lower (has_type (fits_in_64 ty) (imin x y))) (cmp_and_choose ty (CC.L) x y)) -(rule (lower (has_type (fits_in_64 ty) (imax x y))) +(rule -1 (lower (has_type (fits_in_64 ty) (imax x y))) (cmp_and_choose ty (CC.NL) x y)) ;; SSE `imax`. 
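;; Several rules touched above are gated on ISA extensions, for example the
;; AVX-512 `vpabsq` lowering of `iabs` at priority 1.  A feature-gated rule can
;; only ever match a subset of what the generic fallback for the same type
;; matches, so the patch simply ranks it above that fallback rather than trying
;; to make the two patterns mutually exclusive.  A rough, self-contained sketch
;; of that shape; `lower_abs`, `abs_generic`, `abs_fast`, `have_fast_abs`, and
;; `Reg` are made-up names, and the flag is tested with `if-let` here only for
;; brevity (the real rules test it inside the `has_type` pattern).

(type Reg (primitive Reg))
(type bool (primitive bool))
(extern const $true bool)

(decl lower_abs (Reg) Reg)
(decl abs_generic (Reg) Reg)
(decl abs_fast (Reg) Reg)
(decl pure have_fast_abs () bool)
(extern constructor abs_generic abs_generic)
(extern constructor abs_fast abs_fast)
(extern constructor have_fast_abs have_fast_abs)

;; Baseline lowering: always applicable, default priority 0.
(rule (lower_abs x) (abs_generic x))
;; Single-instruction lowering behind an ISA flag: it can only match a subset
;; of what the rule above matches, so it must sit at a higher priority.
(rule 1 (lower_abs x)
        (if-let $true (have_fast_abs))
        (abs_fast x))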
@@ -1443,42 +1443,42 @@ ;;;; Rules for `icmp` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -(rule (lower (icmp cc a @ (value_type (fits_in_64 ty)) b)) +(rule -2 (lower (icmp cc a @ (value_type (fits_in_64 ty)) b)) (lower_icmp_bool (emit_cmp cc a b))) -(rule (lower (icmp cc a @ (value_type $I128) b)) +(rule -1 (lower (icmp cc a @ (value_type $I128) b)) (lower_icmp_bool (emit_cmp cc a b))) ;; Peephole optimization for `x < 0`, when x is a signed 64 bit value -(rule (lower (has_type $B1 (icmp (IntCC.SignedLessThan) x @ (value_type $I64) (u64_from_iconst 0)))) +(rule 2 (lower (has_type $B1 (icmp (IntCC.SignedLessThan) x @ (value_type $I64) (u64_from_iconst 0)))) (x64_shr $I64 x (Imm8Reg.Imm8 63))) ;; Peephole optimization for `0 > x`, when x is a signed 64 bit value -(rule (lower (has_type $B1 (icmp (IntCC.SignedGreaterThan) (u64_from_iconst 0) x @ (value_type $I64)))) +(rule 2 (lower (has_type $B1 (icmp (IntCC.SignedGreaterThan) (u64_from_iconst 0) x @ (value_type $I64)))) (x64_shr $I64 x (Imm8Reg.Imm8 63))) ;; Peephole optimization for `0 <= x`, when x is a signed 64 bit value -(rule (lower (has_type $B1 (icmp (IntCC.SignedLessThanOrEqual) (u64_from_iconst 0) x @ (value_type $I64)))) +(rule 2 (lower (has_type $B1 (icmp (IntCC.SignedLessThanOrEqual) (u64_from_iconst 0) x @ (value_type $I64)))) (x64_shr $I64 (x64_not $I64 x) (Imm8Reg.Imm8 63))) ;; Peephole optimization for `x >= 0`, when x is a signed 64 bit value -(rule (lower (has_type $B1 (icmp (IntCC.SignedGreaterThanOrEqual) x @ (value_type $I64) (u64_from_iconst 0)))) +(rule 2 (lower (has_type $B1 (icmp (IntCC.SignedGreaterThanOrEqual) x @ (value_type $I64) (u64_from_iconst 0)))) (x64_shr $I64 (x64_not $I64 x) (Imm8Reg.Imm8 63))) ;; Peephole optimization for `x < 0`, when x is a signed 32 bit value -(rule (lower (has_type $B1 (icmp (IntCC.SignedLessThan) x @ (value_type $I32) (u64_from_iconst 0)))) +(rule 2 (lower (has_type $B1 (icmp (IntCC.SignedLessThan) x @ (value_type $I32) (u64_from_iconst 0)))) (x64_shr $I32 x (Imm8Reg.Imm8 31))) ;; Peephole optimization for `0 > x`, when x is a signed 32 bit value -(rule (lower (has_type $B1 (icmp (IntCC.SignedGreaterThan) (u64_from_iconst 0) x @ (value_type $I32)))) +(rule 2 (lower (has_type $B1 (icmp (IntCC.SignedGreaterThan) (u64_from_iconst 0) x @ (value_type $I32)))) (x64_shr $I32 x (Imm8Reg.Imm8 31))) ;; Peephole optimization for `0 <= x`, when x is a signed 32 bit value -(rule (lower (has_type $B1 (icmp (IntCC.SignedLessThanOrEqual) (u64_from_iconst 0) x @ (value_type $I32)))) +(rule 2 (lower (has_type $B1 (icmp (IntCC.SignedLessThanOrEqual) (u64_from_iconst 0) x @ (value_type $I32)))) (x64_shr $I32 (x64_not $I64 x) (Imm8Reg.Imm8 31))) ;; Peephole optimization for `x >= 0`, when x is a signed 32 bit value -(rule (lower (has_type $B1 (icmp (IntCC.SignedGreaterThanOrEqual) x @ (value_type $I32) (u64_from_iconst 0)))) +(rule 2 (lower (has_type $B1 (icmp (IntCC.SignedGreaterThanOrEqual) x @ (value_type $I32) (u64_from_iconst 0)))) (x64_shr $I32 (x64_not $I64 x) (Imm8Reg.Imm8 31))) ;; For XMM-held values, we lower to `PCMP*` instructions, sometimes more than @@ -1538,11 +1538,11 @@ ;; The PMIN[S|U]Q instruction is only available in AVX512VL/F so we must instead ;; compare with flipped operands (PCMPGT*) and negate the result (PXOR with all ;; 1s), emitting one more instruction than the smaller-lane versions. 
-(rule (lower (icmp (IntCC.SignedGreaterThanOrEqual) a @ (value_type $I64X2) b)) +(rule 1 (lower (icmp (IntCC.SignedGreaterThanOrEqual) a @ (value_type $I64X2) b)) (let ((checked Xmm (x64_pcmpgt $I64X2 b a)) (all_ones Xmm (vector_all_ones))) (x64_pxor checked all_ones))) -(rule (lower (icmp (IntCC.SignedLessThanOrEqual) a @ (value_type $I64X2) b)) +(rule 1 (lower (icmp (IntCC.SignedLessThanOrEqual) a @ (value_type $I64X2) b)) (let ((checked Xmm (x64_pcmpgt $I64X2 a b)) (all_ones Xmm (vector_all_ones))) (x64_pxor checked all_ones))) @@ -1571,7 +1571,7 @@ ;; - less than assigns Z = 0, P = 0, C = 1 ;; - equal assigns Z = 1, P = 0, C = 0 -(rule (lower (fcmp cc a @ (value_type (ty_scalar_float ty)) b)) +(rule -1 (lower (fcmp cc a @ (value_type (ty_scalar_float ty)) b)) (lower_fcmp_bool (emit_fcmp cc a b))) ;; For vector lowerings, we use `CMPP*` instructions with a 3-bit operand that @@ -1710,14 +1710,14 @@ ;; Finally, we lower `select` from a condition value `c`. These rules are meant ;; to be the final, default lowerings if no other patterns matched above. -(rule (lower (has_type ty (select c @ (value_type $B1) x y))) +(rule -1 (lower (has_type ty (select c @ (value_type $B1) x y))) (let ((size OperandSize (raw_operand_size_of_type $B1)) ;; N.B.: disallow load-op fusion, see above. TODO: ;; https://github.com/bytecodealliance/wasmtime/issues/3953. (gpr_c Gpr (put_in_gpr c))) (with_flags (x64_test size (RegMemImm.Imm 1) gpr_c) (cmove_from_values ty (CC.NZ) x y)))) -(rule (lower (has_type ty (select c @ (value_type (fits_in_64 a_ty)) x y))) +(rule -2 (lower (has_type ty (select c @ (value_type (fits_in_64 a_ty)) x y))) (let ((size OperandSize (raw_operand_size_of_type a_ty)) ;; N.B.: disallow load-op fusion, see above. TODO: ;; https://github.com/bytecodealliance/wasmtime/issues/3953. @@ -1730,24 +1730,26 @@ ;; special handling is required for zero inputs, because the machine ;; instruction does what the CLIF expects for zero, i.e. it returns ;; zero. -(rule 1 (lower +(rule 2 (lower (has_type (and (ty_32_or_64 ty) (use_lzcnt $true)) (clz src))) (x64_lzcnt ty src)) -(rule (lower - (has_type (ty_32_or_64 ty) +(rule 2 (lower + (has_type (and + (ty_32_or_64 ty) + (use_lzcnt $false)) (clz src))) (do_clz ty ty src)) -(rule (lower +(rule 1 (lower (has_type (ty_8_or_16 ty) (clz src))) (do_clz $I32 ty (extend_to_gpr src $I32 (ExtendKind.Zero)))) -(rule (lower +(rule 0 (lower (has_type $I128 (clz src))) (let ((upper Gpr (do_clz $I64 $I64 (value_regs_get_gpr src 1))) @@ -1772,24 +1774,26 @@ ;; Analogous to `clz` cases above, but using mirror instructions ;; (tzcnt vs lzcnt, bsf vs bsr). 
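;; The `clz` rules above (and the `ctz` rules that follow) use the other fix in
;; this patch: rather than ranking the `lzcnt` rule above the fallback, the
;; fallback now matches `(use_lzcnt $false)` explicitly, so the two 32/64-bit
;; rules can no longer both apply and may share a priority.  A self-contained
;; sketch of that trick; `count_lz`, `lzcnt_enabled`, `clz_hw`, `clz_soft`, and
;; `Reg` are hypothetical, with `lzcnt_enabled` playing the role of
;; `use_lzcnt`: an infallible extractor that ignores what it matches and simply
;; reports the ISA flag.

(type Reg (primitive Reg))
(type bool (primitive bool))
(extern const $true bool)
(extern const $false bool)

(decl lzcnt_enabled (bool) Reg)
(extern extractor infallible lzcnt_enabled lzcnt_enabled)

(decl count_lz (Reg) Reg)
(decl clz_hw (Reg) Reg)
(decl clz_soft (Reg) Reg)
(extern constructor clz_hw clz_hw)
(extern constructor clz_soft clz_soft)

;; The flag value is part of each pattern, so the two rules are disjoint and
;; can both keep the default priority.
(rule (count_lz (and x (lzcnt_enabled $true))) (clz_hw x))
(rule (count_lz (and x (lzcnt_enabled $false))) (clz_soft x))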
-(rule 1 (lower +(rule 2 (lower (has_type (and (ty_32_or_64 ty) (use_bmi1 $true)) (ctz src))) (x64_tzcnt ty src)) -(rule (lower - (has_type (ty_32_or_64 ty) +(rule 2 (lower + (has_type (and + (ty_32_or_64 ty) + (use_bmi1 $false)) (ctz src))) (do_ctz ty ty src)) -(rule (lower +(rule 1 (lower (has_type (ty_8_or_16 ty) (ctz src))) (do_ctz $I32 ty (extend_to_gpr src $I32 (ExtendKind.Zero)))) -(rule (lower +(rule 0 (lower (has_type $I128 (ctz src))) (let ((lower Gpr (do_ctz $I64 $I64 (value_regs_get_gpr src 0))) @@ -1808,14 +1812,14 @@ ;; Rules for `popcnt` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -(rule 1 (lower +(rule 3 (lower (has_type (and (ty_32_or_64 ty) (use_popcnt $true)) (popcnt src))) (x64_popcnt ty src)) -(rule 1 (lower +(rule 2 (lower (has_type (and (ty_8_or_16 ty) (use_popcnt $true)) @@ -1831,12 +1835,12 @@ (hi_count Gpr (x64_popcnt $I64 (value_regs_get_gpr src 1)))) (value_regs (x64_add $I64 lo_count hi_count) (imm $I64 0)))) -(rule (lower +(rule -1 (lower (has_type (ty_32_or_64 ty) (popcnt src))) (do_popcnt ty src)) -(rule (lower +(rule -2 (lower (has_type (ty_8_or_16 ty) (popcnt src))) (do_popcnt $I32 (extend_to_gpr src $I32 (ExtendKind.Zero)))) @@ -2060,11 +2064,11 @@ ;; Rules for `uextend` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; T -> T is a no-op. -(rule (lower (has_type ty (uextend src @ (value_type ty)))) +(rule 1 (lower (has_type ty (uextend src @ (value_type ty)))) src) ;; I64 -> I128. -(rule (lower (has_type $I128 (uextend src @ (value_type $I64)))) +(rule -1 (lower (has_type $I128 (uextend src @ (value_type $I64)))) (value_regs src (imm $I64 0))) ;; I{8,16,32} -> I128. @@ -2072,11 +2076,11 @@ (value_regs (extend_to_gpr src $I64 (ExtendKind.Zero)) (imm $I64 0))) ;; I{8,16,32} -> I64. -(rule (lower (has_type $I64 (uextend src @ (value_type (fits_in_32 src_ty))))) +(rule -1 (lower (has_type $I64 (uextend src @ (value_type (fits_in_32 src_ty))))) (extend_to_gpr src $I64 (ExtendKind.Zero))) ;; I8 -> I{16,32}, I16 -> I32. -(rule (lower (has_type (fits_in_32 dst_ty) (uextend src @ (value_type (fits_in_32 src_ty))))) +(rule -2 (lower (has_type (fits_in_32 dst_ty) (uextend src @ (value_type (fits_in_32 src_ty))))) (extend_to_gpr src $I32 (ExtendKind.Zero))) ;; I32 -> I64 with op that produces a zero-extended value in a register. @@ -2126,7 +2130,7 @@ (decl generic_sextend (Value Type Type) InstOutput) ;; T -> T is a no-op. -(rule (generic_sextend src ty ty) +(rule 4 (generic_sextend src ty ty) src) ;; Produce upper 64 bits sign-extended from lower 64: shift right by @@ -2136,21 +2140,21 @@ (x64_sar $I64 src (Imm8Reg.Imm8 63))) ;; I64 -> I128. -(rule (generic_sextend src (ty_int_bool_64 _) (ty_int_bool_128 _)) +(rule 3 (generic_sextend src (ty_int_bool_64 _) (ty_int_bool_128 _)) (value_regs src (spread_sign_bit src))) ;; I{8,16,32} -> I128. -(rule (generic_sextend src (fits_in_32 src_ty) (ty_int_bool_128 _)) +(rule 2 (generic_sextend src (fits_in_32 src_ty) (ty_int_bool_128 _)) (let ((lo Gpr (extend_to_gpr src $I64 (ExtendKind.Sign))) (hi Gpr (spread_sign_bit lo))) (value_regs lo hi))) ;; I{8,16,32} -> I64. -(rule (generic_sextend src (fits_in_32 src_ty) (ty_int_bool_64 _)) +(rule 1 (generic_sextend src (fits_in_32 src_ty) (ty_int_bool_64 _)) (extend_to_gpr src $I64 (ExtendKind.Sign))) ;; I8 -> I{16,32}, I16 -> I32. 
-(rule (generic_sextend src (fits_in_32 src_ty) (fits_in_32 dst_ty)) +(rule 0 (generic_sextend src (fits_in_32 src_ty) (fits_in_32 dst_ty)) (extend_to_gpr src $I32 (ExtendKind.Sign))) (rule (lower @@ -2173,7 +2177,7 @@ ;; T -> I{64,32,16,8}: We can simply pass through the value: values ;; are always stored with high bits undefined, so we can just leave ;; them be. -(rule (lower (has_type (fits_in_64 ty) (ireduce src))) +(rule 1 (lower (has_type (fits_in_64 ty) (ireduce src))) (value_regs_get_gpr src 0)) ;; Likewise for breduce. @@ -2181,7 +2185,7 @@ (rule (lower (has_type ty (breduce src @ (value_type ty)))) src) -(rule (lower (has_type (fits_in_64 ty) (breduce src))) +(rule 1 (lower (has_type (fits_in_64 ty) (breduce src))) (value_regs_get_gpr src 0)) ;; Rules for `bint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -2192,7 +2196,7 @@ (rule (lower (has_type (fits_in_64 ty) (bint src))) (x64_and ty src (RegMemImm.Imm 1))) -(rule (lower (has_type $I128 +(rule 1 (lower (has_type $I128 (bint src))) (value_regs (x64_and $I64 src (RegMemImm.Imm 1)) @@ -2497,11 +2501,11 @@ ;; 8-bit loads. ;; ;; By default, we zero-extend all sub-64-bit loads to a GPR. -(rule (lower (has_type (and (fits_in_32 ty) (is_gpr_type _)) (load flags address offset))) +(rule -4 (lower (has_type (and (fits_in_32 ty) (is_gpr_type _)) (load flags address offset))) (x64_movzx (ext_mode (ty_bits_u16 ty) 64) (to_amode flags address offset))) ;; But if we know that both the `from` and `to` are 64 bits, we simply load with ;; no extension. -(rule (lower (has_type (ty_int_bool_ref_64 ty) (load flags address offset))) +(rule -1 (lower (has_type (ty_int_bool_ref_64 ty) (load flags address offset))) (x64_mov (to_amode flags address offset))) ;; Also, certain scalar loads have a specific `from` width and extension kind ;; (signed -> `sx`, zeroed -> `zx`). We overwrite the high bits of the 64-bit @@ -2531,11 +2535,11 @@ (x64_movups (to_amode flags address offset))) (rule (lower (has_type $F64X2 (load flags address offset))) (x64_movupd (to_amode flags address offset))) -(rule (lower (has_type (ty_vec128 ty) (load flags address offset))) +(rule -2 (lower (has_type (ty_vec128 ty) (load flags address offset))) (x64_movdqu (to_amode flags address offset))) ;; We can load an I128/B128 by doing two 64-bit loads. -(rule (lower (has_type (ty_int_bool_128 _) +(rule -3 (lower (has_type (ty_int_bool_128 _) (load flags address offset))) (let ((addr_lo Amode (to_amode flags address offset)) (addr_hi Amode (amode_offset addr_lo 8)) @@ -2561,7 +2565,7 @@ ;; Rules for `store*` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; 8-, 16-, 32- and 64-bit GPR stores. -(rule (lower (store flags +(rule -2 (lower (store flags value @ (value_type (is_gpr_type ty)) address offset)) @@ -2580,7 +2584,7 @@ (x64_movrm $I32 (to_amode flags address offset) value))) ;; F32 stores of values in XMM registers. -(rule (lower (store flags +(rule 1 (lower (store flags value @ (value_type $F32) address offset)) @@ -2588,7 +2592,7 @@ (x64_xmm_movrm (SseOpcode.Movss) (to_amode flags address offset) value))) ;; F64 stores of values in XMM registers. -(rule (lower (store flags +(rule 1 (lower (store flags value @ (value_type $F64) address offset)) @@ -2596,7 +2600,7 @@ (x64_xmm_movrm (SseOpcode.Movsd) (to_amode flags address offset) value))) ;; Stores of F32X4 vectors. 
-(rule (lower (store flags +(rule 1 (lower (store flags value @ (value_type $F32X4) address offset)) @@ -2604,7 +2608,7 @@ (x64_xmm_movrm (SseOpcode.Movups) (to_amode flags address offset) value))) ;; Stores of F64X2 vectors. -(rule (lower (store flags +(rule 1 (lower (store flags value @ (value_type $F64X2) address offset)) @@ -2612,7 +2616,7 @@ (x64_xmm_movrm (SseOpcode.Movupd) (to_amode flags address offset) value))) ;; Stores of all other 128-bit vector types with integer lanes. -(rule (lower (store flags +(rule -1 (lower (store flags value @ (value_type (ty_vec128_int _)) address offset)) @@ -2620,7 +2624,7 @@ (x64_xmm_movrm (SseOpcode.Movdqu) (to_amode flags address offset) value))) ;; Stores of I128/B128 values: store the two 64-bit halves separately. -(rule (lower (store flags +(rule 0 (lower (store flags value @ (value_type (ty_int_bool_128 _)) address offset)) @@ -2637,7 +2641,7 @@ ;; Rules for `load*` + ALU op + `store*` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Add mem, reg -(rule (lower +(rule 3 (lower (store flags (has_type (ty_32_or_64 ty) (iadd (and @@ -2651,7 +2655,7 @@ (x64_add_mem ty (to_amode flags addr offset) src2)))) ;; Add mem, reg with args swapped -(rule (lower +(rule 2 (lower (store flags (has_type (ty_32_or_64 ty) (iadd src2 @@ -2665,7 +2669,7 @@ (x64_add_mem ty (to_amode flags addr offset) src2)))) ;; Sub mem, reg -(rule (lower +(rule 2 (lower (store flags (has_type (ty_32_or_64 ty) (isub (and @@ -2679,7 +2683,7 @@ (x64_sub_mem ty (to_amode flags addr offset) src2)))) ;; And mem, reg -(rule (lower +(rule 3 (lower (store flags (has_type (ty_32_or_64 ty) (band (and @@ -2693,7 +2697,7 @@ (x64_and_mem ty (to_amode flags addr offset) src2)))) ;; And mem, reg with args swapped -(rule (lower +(rule 2 (lower (store flags (has_type (ty_32_or_64 ty) (band src2 @@ -2707,7 +2711,7 @@ (x64_and_mem ty (to_amode flags addr offset) src2)))) ;; Or mem, reg -(rule (lower +(rule 3 (lower (store flags (has_type (ty_32_or_64 ty) (bor (and @@ -2721,7 +2725,7 @@ (x64_or_mem ty (to_amode flags addr offset) src2)))) ;; Or mem, reg with args swapped -(rule (lower +(rule 2 (lower (store flags (has_type (ty_32_or_64 ty) (bor src2 @@ -2735,7 +2739,7 @@ (x64_or_mem ty (to_amode flags addr offset) src2)))) ;; Xor mem, reg -(rule (lower +(rule 3 (lower (store flags (has_type (ty_32_or_64 ty) (bxor (and @@ -2749,7 +2753,7 @@ (x64_xor_mem ty (to_amode flags addr offset) src2)))) ;; Xor mem, reg with args swapped -(rule (lower +(rule 2 (lower (store flags (has_type (ty_32_or_64 ty) (bxor src2 @@ -2786,7 +2790,7 @@ ;; As described in the `atomic_load` documentation, this lowering is only valid ;; for I8, I16, I32, and I64. The sub-64-bit types are zero extended, as with a ;; normal load. 
-(rule (lower (has_type $I64 (atomic_load flags address))) +(rule 1 (lower (has_type $I64 (atomic_load flags address))) (x64_mov (to_amode flags address (zero_offset)))) (rule (lower (has_type (and (fits_in_32 ty) (ty_int _)) (atomic_load flags address))) (x64_movzx (ext_mode (ty_bits_u16 ty) 64) (to_amode flags address (zero_offset)))) @@ -2863,34 +2867,34 @@ ;; Rules for `brz` and `brnz` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -(rule (lower_branch (brz (icmp cc a b) _ _) (two_targets taken not_taken)) +(rule 2 (lower_branch (brz (icmp cc a b) _ _) (two_targets taken not_taken)) (let ((cmp IcmpCondResult (invert_icmp_cond_result (emit_cmp cc a b)))) (side_effect (jmp_cond_icmp cmp taken not_taken)))) -(rule (lower_branch (brz (fcmp cc a b) _ _) (two_targets taken not_taken)) +(rule 2 (lower_branch (brz (fcmp cc a b) _ _) (two_targets taken not_taken)) (let ((cmp FcmpCondResult (emit_fcmp (floatcc_inverse cc) a b))) (side_effect (jmp_cond_fcmp cmp taken not_taken)))) -(rule (lower_branch (brz val @ (value_type $I128) _ _) (two_targets taken not_taken)) +(rule 1 (lower_branch (brz val @ (value_type $I128) _ _) (two_targets taken not_taken)) (side_effect (jmp_cond_icmp (cmp_zero_i128 (CC.NZ) val) taken not_taken))) -(rule (lower_branch (brz val @ (value_type (ty_int_bool_or_ref)) _ _) (two_targets taken not_taken)) +(rule 0 (lower_branch (brz val @ (value_type (ty_int_bool_or_ref)) _ _) (two_targets taken not_taken)) (side_effect (with_flags_side_effect (cmp_zero_int_bool_ref val) (jmp_cond (CC.Z) taken not_taken)))) -(rule (lower_branch (brnz (icmp cc a b) _ _) (two_targets taken not_taken)) +(rule 2 (lower_branch (brnz (icmp cc a b) _ _) (two_targets taken not_taken)) (side_effect (jmp_cond_icmp (emit_cmp cc a b) taken not_taken))) -(rule (lower_branch (brnz (fcmp cc a b) _ _) (two_targets taken not_taken)) +(rule 2 (lower_branch (brnz (fcmp cc a b) _ _) (two_targets taken not_taken)) (let ((cmp FcmpCondResult (emit_fcmp cc a b))) (side_effect (jmp_cond_fcmp cmp taken not_taken)))) -(rule (lower_branch (brnz val @ (value_type $I128) _ _) (two_targets taken not_taken)) +(rule 1 (lower_branch (brnz val @ (value_type $I128) _ _) (two_targets taken not_taken)) (side_effect (jmp_cond_icmp (cmp_zero_i128 (CC.Z) val) taken not_taken))) -(rule (lower_branch (brnz val @ (value_type (ty_int_bool_or_ref)) _ _) (two_targets taken not_taken)) +(rule 0 (lower_branch (brnz val @ (value_type (ty_int_bool_or_ref)) _ _) (two_targets taken not_taken)) (side_effect (with_flags_side_effect (cmp_zero_int_bool_ref val) (jmp_cond (CC.NZ) taken not_taken)))) @@ -2914,7 +2918,7 @@ (decl cmp_zero_int_bool_ref (Value) ProducesFlags) -(rule (cmp_zero_int_bool_ref val @ (value_type $B1)) +(rule 1 (cmp_zero_int_bool_ref val @ (value_type $B1)) (x64_test (OperandSize.Size8) (RegMemImm.Imm 1) val)) (rule (cmp_zero_int_bool_ref val @ (value_type ty)) (let ((size OperandSize (raw_operand_size_of_type ty)) @@ -2941,25 +2945,25 @@ ;; Rules for `fcvt_from_sint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -(rule (lower (has_type $F32 (fcvt_from_sint a @ (value_type $I8)))) +(rule 2 (lower (has_type $F32 (fcvt_from_sint a @ (value_type $I8)))) (x64_cvtsi2ss $I32 (extend_to_gpr a $I32 (ExtendKind.Sign)))) -(rule (lower (has_type $F32 (fcvt_from_sint a @ (value_type $I16)))) +(rule 2 (lower (has_type $F32 (fcvt_from_sint a @ (value_type $I16)))) (x64_cvtsi2ss $I32 (extend_to_gpr a $I32 (ExtendKind.Sign)))) -(rule (lower (has_type $F32 (fcvt_from_sint a @ (value_type (ty_int (fits_in_64 ty)))))) +(rule 1 (lower 
(has_type $F32 (fcvt_from_sint a @ (value_type (ty_int (fits_in_64 ty)))))) (x64_cvtsi2ss ty a)) -(rule (lower (has_type $F64 (fcvt_from_sint a @ (value_type $I8)))) +(rule 2 (lower (has_type $F64 (fcvt_from_sint a @ (value_type $I8)))) (x64_cvtsi2sd $I32 (extend_to_gpr a $I32 (ExtendKind.Sign)))) -(rule (lower (has_type $F64 (fcvt_from_sint a @ (value_type $I16)))) +(rule 2 (lower (has_type $F64 (fcvt_from_sint a @ (value_type $I16)))) (x64_cvtsi2sd $I32 (extend_to_gpr a $I32 (ExtendKind.Sign)))) -(rule (lower (has_type $F64 (fcvt_from_sint a @ (value_type (ty_int (fits_in_64 ty)))))) +(rule 1 (lower (has_type $F64 (fcvt_from_sint a @ (value_type (ty_int (fits_in_64 ty)))))) (x64_cvtsi2sd ty a)) -(rule (lower (fcvt_from_sint a @ (value_type $I32X4))) +(rule 0 (lower (fcvt_from_sint a @ (value_type $I32X4))) (x64_cvtdq2ps a)) ;; Rules for `fcvt_low_from_sint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -2969,10 +2973,10 @@ ;; Rules for `fcvt_from_uint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -(rule (lower (has_type $F32 (fcvt_from_uint val @ (value_type (fits_in_32 (ty_int ty)))))) +(rule 1 (lower (has_type $F32 (fcvt_from_uint val @ (value_type (fits_in_32 (ty_int ty)))))) (x64_cvtsi2ss $I64 (extend_to_gpr val $I64 (ExtendKind.Zero)))) -(rule (lower (has_type $F64 (fcvt_from_uint val @ (value_type (fits_in_32 (ty_int ty)))))) +(rule 1 (lower (has_type $F64 (fcvt_from_uint val @ (value_type (fits_in_32 (ty_int ty)))))) (x64_cvtsi2sd $I64 (extend_to_gpr val $I64 (ExtendKind.Zero)))) (rule (lower (has_type ty (fcvt_from_uint val @ (value_type $I64)))) @@ -2982,7 +2986,7 @@ ;; 0x1.0p52 + double(src). 0x1.0p52 is unique because at this exponent ;; every value of the mantissa represents a corresponding uint32 number. ;; When we subtract 0x1.0p52 we are left with double(src). -(rule (lower (has_type $F64X2 (fcvt_from_uint (uwiden_low val @ (value_type $I32X4))))) +(rule 1 (lower (has_type $F64X2 (fcvt_from_uint (uwiden_low val @ (value_type $I32X4))))) (let ((uint_mask Xmm (x64_xmm_load_const $I32X4 (fcvt_uint_mask_const))) (res Xmm (x64_unpcklps val uint_mask)) (uint_mask_high Xmm (x64_xmm_load_const $I32X4 (fcvt_uint_mask_high_const)))) @@ -2990,10 +2994,7 @@ ;; When AVX512VL and AVX512F are available, ;; `fcvt_from_uint` can be lowered to a single instruction. -;; -;; NOTE: the priority of 1 here is to break ties with the next case for $F32X4, -;; as it doesn't require either of the avx512 extensions to be enabled. -(rule 1 (lower (has_type (and (avx512vl_enabled $true) (avx512f_enabled $true) $F32X4) +(rule 2 (lower (has_type (and (avx512vl_enabled $true) (avx512f_enabled $true) $F32X4) (fcvt_from_uint src))) (x64_vcvtudq2ps src)) @@ -3020,7 +3021,7 @@ ;; -> Convert(Ah) // Convert .. with no loss of significant digits from previous shift ;; -> Ah = Ah + Ah // Double Ah to account for shift right before the conversion. ;; -> dst = Ah + Al // Add the two floats together -(rule (lower (has_type $F32X4 (fcvt_from_uint val))) +(rule 1 (lower (has_type $F32X4 (fcvt_from_uint val))) (let ((a Xmm val) ;; get the low 16 bits @@ -3057,7 +3058,7 @@ (cvt_float_to_sint_seq out_ty val $true)) ;; The x64 backend currently only supports these two type combinations. 
-(rule (lower (has_type $I32X4 (fcvt_to_sint_sat val @ (value_type $F32X4)))) +(rule 1 (lower (has_type $I32X4 (fcvt_to_sint_sat val @ (value_type $F32X4)))) (let ((src Xmm val) ;; Sets tmp to zero if float is NaN @@ -3128,7 +3129,7 @@ ;; ;; | Step 6 | Step 7 | ;; | (0-(INT_MAX+1))..(UINT_MAX-(INT_MAX+1))(w/overflow) | ((INT_MAX+1)-(INT_MAX+1))..(INT_MAX+1) | -(rule (lower (has_type $I32X4 (fcvt_to_uint_sat val @ (value_type $F32X4)))) +(rule 1 (lower (has_type $I32X4 (fcvt_to_uint_sat val @ (value_type $F32X4)))) (let ((src Xmm val) ;; Converting to unsigned int so if float src is negative or NaN @@ -3335,13 +3336,13 @@ (rule (lower (has_type (use_sse41 $true) (ceil a @ (value_type $F32)))) (x64_roundss a (RoundImm.RoundUp))) -(rule (lower (ceil a @ (value_type $F32))) +(rule (lower (has_type (use_sse41 $false) (ceil a @ (value_type $F32)))) (libcall_1 (LibCall.CeilF32) a)) (rule (lower (has_type (use_sse41 $true) (ceil a @ (value_type $F64)))) (x64_roundsd a (RoundImm.RoundUp))) -(rule (lower (ceil a @ (value_type $F64))) +(rule (lower (has_type (use_sse41 $false) (ceil a @ (value_type $F64)))) (libcall_1 (LibCall.CeilF64) a)) (rule (lower (has_type (use_sse41 $true) (ceil a @ (value_type $F32X4)))) @@ -3355,13 +3356,13 @@ (rule (lower (has_type (use_sse41 $true) (floor a @ (value_type $F32)))) (x64_roundss a (RoundImm.RoundDown))) -(rule (lower (floor a @ (value_type $F32))) +(rule (lower (has_type (use_sse41 $false) (floor a @ (value_type $F32)))) (libcall_1 (LibCall.FloorF32) a)) (rule (lower (has_type (use_sse41 $true) (floor a @ (value_type $F64)))) (x64_roundsd a (RoundImm.RoundDown))) -(rule (lower (floor a @ (value_type $F64))) +(rule (lower (has_type (use_sse41 $false) (floor a @ (value_type $F64)))) (libcall_1 (LibCall.FloorF64) a)) (rule (lower (has_type (use_sse41 $true) (floor a @ (value_type $F32X4)))) @@ -3375,13 +3376,13 @@ (rule (lower (has_type (use_sse41 $true) (nearest a @ (value_type $F32)))) (x64_roundss a (RoundImm.RoundNearest))) -(rule (lower (nearest a @ (value_type $F32))) +(rule (lower (has_type (use_sse41 $false) (nearest a @ (value_type $F32)))) (libcall_1 (LibCall.NearestF32) a)) (rule (lower (has_type (use_sse41 $true) (nearest a @ (value_type $F64)))) (x64_roundsd a (RoundImm.RoundNearest))) -(rule (lower (nearest a @ (value_type $F64))) +(rule (lower (has_type (use_sse41 $false) (nearest a @ (value_type $F64)))) (libcall_1 (LibCall.NearestF64) a)) (rule (lower (has_type (use_sse41 $true) (nearest a @ (value_type $F32X4)))) @@ -3395,13 +3396,13 @@ (rule (lower (has_type (use_sse41 $true) (trunc a @ (value_type $F32)))) (x64_roundss a (RoundImm.RoundZero))) -(rule (lower (trunc a @ (value_type $F32))) +(rule (lower (has_type (use_sse41 $false) (trunc a @ (value_type $F32)))) (libcall_1 (LibCall.TruncF32) a)) (rule (lower (has_type (use_sse41 $true) (trunc a @ (value_type $F64)))) (x64_roundsd a (RoundImm.RoundZero))) -(rule (lower (trunc a @ (value_type $F64))) +(rule (lower (has_type (use_sse41 $false) (trunc a @ (value_type $F64)))) (libcall_1 (LibCall.TruncF64) a)) (rule (lower (has_type (use_sse41 $true) (trunc a @ (value_type $F32X4)))) @@ -3500,22 +3501,22 @@ ;; register. We statically build `constructed_mask` to zero out any unknown lane ;; indices (may not be completely necessary: verification could fail incorrect ;; mask values) and fix the indexes to all point to the `dst` vector. 
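;; The shuffle rules just below overlap in a slightly different way: binding
;; the same variable twice, as in `(shuffle a a ...)`, matches a strict subset
;; of the general `(shuffle a b ...)` patterns, so the single-operand `pshufb`
;; rule takes the top priority (3) and the two `vpermi2b` rules sit beneath it.
;; A tiny sketch of a repeated-binding pattern, with hypothetical names
;; (`lower_shuf`, `shuf_one`, `shuf_two`, `Reg`).

(type Reg (primitive Reg))

(decl lower_shuf (Reg Reg) Reg)
(decl shuf_one (Reg) Reg)
(decl shuf_two (Reg Reg) Reg)
(extern constructor shuf_one shuf_one)
(extern constructor shuf_two shuf_two)

;; Binding `a` twice only matches calls where both inputs are the same value,
;; a strict subset of the rule below, so it needs the higher priority.
(rule 1 (lower_shuf a a) (shuf_one a))
(rule 0 (lower_shuf a b) (shuf_two a b))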
-(rule (lower (shuffle a a (vec_mask_from_immediate mask))) +(rule 3 (lower (shuffle a a (vec_mask_from_immediate mask))) (x64_pshufb a (x64_xmm_load_const $I8X16 (shuffle_0_31_mask mask)))) ;; For the case where the shuffle mask contains out-of-bounds values (values ;; greater than 31) we must mask off those resulting values in the result of ;; `vpermi2b`. -(rule (lower (has_type (and (avx512vl_enabled $true) (avx512vbmi_enabled $true)) - (shuffle a b (vec_mask_from_immediate - (perm_from_mask_with_zeros mask zeros))))) +(rule 2 (lower (has_type (and (avx512vl_enabled $true) (avx512vbmi_enabled $true)) + (shuffle a b (vec_mask_from_immediate + (perm_from_mask_with_zeros mask zeros))))) (x64_andps (x64_xmm_load_const $I8X16 zeros) (x64_vpermi2b b a (x64_xmm_load_const $I8X16 mask)))) ;; However, if the shuffle mask contains no out-of-bounds values, we can use ;; `vpermi2b` without any masking. -(rule (lower (has_type (and (avx512vl_enabled $true) (avx512vbmi_enabled $true)) +(rule 1 (lower (has_type (and (avx512vl_enabled $true) (avx512vbmi_enabled $true)) (shuffle a b (vec_mask_from_immediate mask)))) (x64_vpermi2b b a (x64_xmm_load_const $I8X16 (perm_from_mask mask)))) @@ -3546,30 +3547,30 @@ ;; Remove the extractlane instruction, leaving the float where it is. The upper ;; bits will remain unchanged; for correctness, this relies on Cranelift type ;; checking to avoid using those bits. -(rule (lower (has_type (ty_scalar_float _) (extractlane val (u8_from_uimm8 0)))) +(rule 2 (lower (has_type (ty_scalar_float _) (extractlane val (u8_from_uimm8 0)))) val) ;; Cases 2-4 for an F32X4 -(rule (lower (has_type $F32 (extractlane val @ (value_type (ty_vec128 ty)) +(rule 1 (lower (has_type $F32 (extractlane val @ (value_type (ty_vec128 ty)) (u8_from_uimm8 lane)))) (x64_pshufd val lane (OperandSize.Size32))) ;; This is the only remaining case for F64X2 -(rule (lower (has_type $F64 (extractlane val @ (value_type (ty_vec128 ty)) +(rule 1 (lower (has_type $F64 (extractlane val @ (value_type (ty_vec128 ty)) (u8_from_uimm8 1)))) ;; 0xee == 0b11_10_11_10 (x64_pshufd val 0xee (OperandSize.Size32))) -(rule (lower (extractlane val @ (value_type ty @ (multi_lane 8 16)) (u8_from_uimm8 lane))) +(rule 0 (lower (extractlane val @ (value_type ty @ (multi_lane 8 16)) (u8_from_uimm8 lane))) (x64_pextrb ty val lane)) -(rule (lower (extractlane val @ (value_type ty @ (multi_lane 16 8)) (u8_from_uimm8 lane))) +(rule 0 (lower (extractlane val @ (value_type ty @ (multi_lane 16 8)) (u8_from_uimm8 lane))) (x64_pextrw ty val lane)) -(rule (lower (extractlane val @ (value_type ty @ (multi_lane 32 4)) (u8_from_uimm8 lane))) +(rule 0 (lower (extractlane val @ (value_type ty @ (multi_lane 32 4)) (u8_from_uimm8 lane))) (x64_pextrd ty val lane)) -(rule (lower (extractlane val @ (value_type ty @ (multi_lane 64 2)) (u8_from_uimm8 lane))) +(rule 0 (lower (extractlane val @ (value_type ty @ (multi_lane 64 2)) (u8_from_uimm8 lane))) (x64_pextrd ty val lane)) ;; Rules for `scalar_to_vector` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -3578,7 +3579,7 @@ ;; to another, expecting the register allocator to elide this. Here we ;; assume that the upper bits of a scalar float have not been munged with ;; (the same assumption the old backend makes). 
-(rule (lower (scalar_to_vector src @ (value_type (ty_scalar_float _)))) +(rule 1 (lower (scalar_to_vector src @ (value_type (ty_scalar_float _)))) src) ;; Case 2: when moving a scalar value of any other type, use MOVD to zero @@ -3588,9 +3589,9 @@ ;; Case 3: when presented with `load + scalar_to_vector`, coalesce into a single ;; MOVSS/MOVSD instruction. -(rule (lower (scalar_to_vector (and (sinkable_load src) (value_type (ty_32 _))))) +(rule 2 (lower (scalar_to_vector (and (sinkable_load src) (value_type (ty_32 _))))) (x64_movss_load (sink_load_to_xmm_mem src))) -(rule (lower (scalar_to_vector (and (sinkable_load src) (value_type (ty_64 _))))) +(rule 3 (lower (scalar_to_vector (and (sinkable_load src) (value_type (ty_64 _))))) (x64_movsd_load (sink_load_to_xmm_mem src))) ;; Rules for `splat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -3610,7 +3611,7 @@ ;; Shuffle the lowest two lanes to all other lanes. (x64_pshufd vec 0 (OperandSize.Size32)))) -(rule (lower (has_type (multi_lane 32 4) (splat src @ (value_type (ty_scalar_float _))))) +(rule 1 (lower (has_type (multi_lane 32 4) (splat src @ (value_type (ty_scalar_float _))))) (lower_splat_32x4 $F32X4 src)) (rule (lower (has_type (multi_lane 32 4) (splat src))) @@ -3623,7 +3624,7 @@ ;; Shuffle the lowest lane to all other lanes. (x64_pshufd vec 0 (OperandSize.Size32)))) -(rule (lower (has_type (multi_lane 64 2) (splat src @ (value_type (ty_scalar_float _))))) +(rule 1 (lower (has_type (multi_lane 64 2) (splat src @ (value_type (ty_scalar_float _))))) (lower_splat_64x2 $F64X2 src)) (rule (lower (has_type (multi_lane 64 2) (splat src))) @@ -3698,16 +3699,13 @@ ;; Rules for `tls_value` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -(rule (lower (tls_value (symbol_value_data name _ _))) - (if (tls_model_is_elf_gd)) +(rule (lower (has_type (tls_model (TlsModel.ElfGd)) (tls_value (symbol_value_data name _ _)))) (elf_tls_get_addr name)) -(rule (lower (tls_value (symbol_value_data name _ _))) - (if (tls_model_is_macho)) +(rule (lower (has_type (tls_model (TlsModel.Macho)) (tls_value (symbol_value_data name _ _)))) (macho_tls_get_addr name)) -(rule (lower (tls_value (symbol_value_data name _ _))) - (if (tls_model_is_coff)) +(rule (lower (has_type (tls_model (TlsModel.Coff)) (tls_value (symbol_value_data name _ _)))) (coff_tls_get_addr name)) ;; Rules for `sqmul_round_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/cranelift/codegen/src/machinst/isle.rs b/cranelift/codegen/src/machinst/isle.rs index 2a1cbb37b3..119727e571 100644 --- a/cranelift/codegen/src/machinst/isle.rs +++ b/cranelift/codegen/src/machinst/isle.rs @@ -718,6 +718,11 @@ macro_rules! 
isle_prelude_methods {
             }
         }
 
+        #[inline]
+        fn tls_model(&mut self, _: Type) -> TlsModel {
+            self.flags.tls_model()
+        }
+
         #[inline]
         fn tls_model_is_elf_gd(&mut self) -> Option<()> {
             if self.flags.tls_model() == TlsModel::ElfGd {
diff --git a/cranelift/codegen/src/prelude.isle b/cranelift/codegen/src/prelude.isle
index 2189c91c00..9c4f59ef87 100644
--- a/cranelift/codegen/src/prelude.isle
+++ b/cranelift/codegen/src/prelude.isle
@@ -889,6 +889,13 @@
 (decl avoid_div_traps () Type)
 (extern extractor avoid_div_traps avoid_div_traps)
 
+;; This definition should be kept up to date with the values defined in
+;; cranelift/codegen/meta/src/shared/settings.rs
+(type TlsModel extern (enum (None) (ElfGd) (Macho) (Coff)))
+
+(decl tls_model (TlsModel) Type)
+(extern extractor infallible tls_model tls_model)
+
 (decl pure tls_model_is_elf_gd () Unit)
 (extern constructor tls_model_is_elf_gd tls_model_is_elf_gd)
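;; The `tls_value` rewrite above is the remaining technique in this patch: the
;; three TLS rules used to share one pattern and differ only in fallible
;; `(if (tls_model_is_*))` constraints, which the overlap checker cannot see
;; through.  The new `TlsModel` enum and the infallible `tls_model` extractor
;; move that distinction into the pattern, so the ElfGd/Macho/Coff rules are
;; disjoint by construction.  Note how the extern binding mirrors the decl: the
;; extractor is declared as `(decl tls_model (TlsModel) Type)`, and because it
;; is infallible the Rust side receives the matched `Type` and returns a plain
;; `TlsModel` with no `Option`.  A small self-contained sketch of the same
;; shape follows; `pick_tls`, `Model`, `model_of`, `gd_seq`, and `macho_seq`
;; are hypothetical, and only the technique mirrors the patch.

(type Reg (primitive Reg))
(type Model (enum (ElfGd) (Macho)))

;; Infallible extractor: every input has some TLS model, and that model shows
;; up as part of the pattern rather than as a side condition.
(decl model_of (Model) Reg)
(extern extractor infallible model_of model_of)

(decl pick_tls (Reg) Reg)
(decl gd_seq (Reg) Reg)
(decl macho_seq (Reg) Reg)
(extern constructor gd_seq gd_seq)
(extern constructor macho_seq macho_seq)

;; Matching different enum variants makes the rules disjoint; no priorities
;; are needed to break ties.
(rule (pick_tls (and x (model_of (Model.ElfGd)))) (gd_seq x))
(rule (pick_tls (and x (model_of (Model.Macho)))) (macho_seq x))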