x64: Refactor and add extractlane special case for uextend/sextend (#6022)
* x64: Refactor sextend/uextend rules
Move much of the meaty logic from these lowering rules into the
`extend_to_gpr` helper to benefit other callers of `extend_to_gpr` to
elide instructions. This additionally simplifies `sextend` and `uextend`
lowerings to rely on optimizations happening within the `extend_to_gpr`
helper.
* x64: Skip `uextend` for `pextr{b,w}` instructions
These instructions are documented as automatically zeroing the upper
bits so `uextend` operations can be skipped. This slightly improves
codegen for the wasm `i{8x16,16x8}.extract_lane_u` instructions, for
example.
* Modernize an extractor pattern
* Trim some superfluous match clauses
Additionally rejigger priorities to be "mostly default" now.
* Refactor 32-to-64 predicate to a helper
Also adjust the pattern matched in the `extend_to_gpr` helper.
* Slightly refactor pextr{b,w} case
* Review comments
This commit is contained in:
@@ -1703,11 +1703,35 @@
|
||||
(decl extend_to_gpr (Value Type ExtendKind) Gpr)
|
||||
|
||||
;; If the value is already of the requested type, no extending is necessary.
|
||||
(rule 3 (extend_to_gpr val @ (value_type ty) ty _kind)
|
||||
val)
|
||||
|
||||
;; I32 -> I64 with op that produces a zero-extended value in a register.
|
||||
;;
|
||||
;; Priority 1 because the equality constraint doesn't prove that this rule
|
||||
;; doesn't overlap with the one below.
|
||||
(rule 1 (extend_to_gpr (and val (value_type ty)) ty _kind)
|
||||
(put_in_gpr val))
|
||||
;; As a particular x64 extra-pattern matching opportunity, all the ALU
|
||||
;; opcodes on 32-bits will zero-extend the upper 32-bits, so we can
|
||||
;; even not generate a zero-extended move in this case.
|
||||
(rule 2 (extend_to_gpr src @ (value_type $I32) $I64 (ExtendKind.Zero))
|
||||
(if-let $true (value32_zeros_upper32 src))
|
||||
src)
|
||||
|
||||
;; The `extractlane` instruction, extended to `$I32`, means that either an
|
||||
;; i8x16 or an i16x8 is being extracted. These are implemented with
|
||||
;; the `pextr{b,w}` instruction which automatically zeroes the upper bits of the
|
||||
;; destination register so the `uextend` in these cases can be elided.
|
||||
;;
|
||||
;; TODO: the documentation for `pextr{b,w}` seems to indicate it zero extends
|
||||
;; to not only 32-bits but probably the whole 64-bit register. If that's the
|
||||
;; case then this should match a zero-extend to any size instead of just `$I32`.
|
||||
;;
|
||||
;; TODO: the interaction here between this rule and the "it's written far away"
|
||||
;; rule to lower `extractlane` isn't great. Ideally this rule (and the other
|
||||
;; special cases for `value32_zeros_upper32`) would live contextually closer or
|
||||
;; be connected to the extractlane rules. There's some discussion of this on
|
||||
;; #6022 but the gist is that there's not a lot of great options at this time,
|
||||
;; so this doc block is what's here for now.
|
||||
(rule 1 (extend_to_gpr src @ (extractlane _ _) $I32 (ExtendKind.Zero))
|
||||
src)
|
||||
|
||||
(rule (extend_to_gpr (and val (value_type from_ty))
|
||||
to_ty
|
||||
@@ -1732,6 +1756,21 @@
|
||||
(rule (extend (ExtendKind.Sign) ty mode src)
|
||||
(x64_movsx mode src))
|
||||
|
||||
;; Tests whether the operation used to produce the input `Value`, which must
|
||||
;; be a 32-bit operation, will automatically zero the upper 32-bits of the
|
||||
;; destination register that `Value` is placed in.
|
||||
(decl pure value32_zeros_upper32 (Value) bool)
|
||||
(rule (value32_zeros_upper32 (iadd _ _)) $true)
|
||||
(rule (value32_zeros_upper32 (isub _ _)) $true)
|
||||
(rule (value32_zeros_upper32 (imul _ _)) $true)
|
||||
(rule (value32_zeros_upper32 (band _ _)) $true)
|
||||
(rule (value32_zeros_upper32 (bor _ _)) $true)
|
||||
(rule (value32_zeros_upper32 (bxor _ _)) $true)
|
||||
(rule (value32_zeros_upper32 (ishl _ _)) $true)
|
||||
(rule (value32_zeros_upper32 (ushr _ _)) $true)
|
||||
(rule (value32_zeros_upper32 (uload32 _ _ _)) $true)
|
||||
(rule -1 (value32_zeros_upper32 _) $false)
|
||||
|
||||
;;;; Helpers for Working SSE tidbits ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;; Turn a vector type into its integer-typed vector equivalent.
|
||||
|
||||
@@ -2039,101 +2039,38 @@
|
||||
|
||||
;; Rules for `uextend` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;; T -> T is a no-op.
|
||||
(rule 1 (lower (has_type ty (uextend src @ (value_type ty))))
|
||||
src)
|
||||
|
||||
;; I64 -> I128.
|
||||
(rule -1 (lower (has_type $I128 (uextend src @ (value_type $I64))))
|
||||
(value_regs src (imm $I64 0)))
|
||||
|
||||
;; I{8,16,32} -> I128.
|
||||
(rule (lower (has_type $I128 (uextend src @ (value_type (fits_in_32 src_ty)))))
|
||||
;; I{8,16,32,64} -> I128.
|
||||
(rule (lower (has_type $I128 (uextend src)))
|
||||
(value_regs (extend_to_gpr src $I64 (ExtendKind.Zero)) (imm $I64 0)))
|
||||
|
||||
;; I{8,16,32} -> I64.
|
||||
(rule -1 (lower (has_type $I64 (uextend src @ (value_type (fits_in_32 src_ty)))))
|
||||
(rule (lower (has_type $I64 (uextend src)))
|
||||
(extend_to_gpr src $I64 (ExtendKind.Zero)))
|
||||
|
||||
;; I8 -> I{16,32}, I16 -> I32.
|
||||
(rule -2 (lower (has_type (fits_in_32 dst_ty) (uextend src @ (value_type (fits_in_32 src_ty)))))
|
||||
(extend_to_gpr src $I32 (ExtendKind.Zero)))
|
||||
|
||||
;; I32 -> I64 with op that produces a zero-extended value in a register.
|
||||
;;
|
||||
;; As a particular x64 extra-pattern matching opportunity, all the ALU
|
||||
;; opcodes on 32-bits will zero-extend the upper 32-bits, so we can
|
||||
;; even not generate a zero-extended move in this case.
|
||||
;;
|
||||
;; (Note that we unfortunately can't factor out the
|
||||
;; insts-that-zero-upper-32 pattern into a separate extractor until we
|
||||
;; can write internal extractors with multiple rules; and we'd rather
|
||||
;; keep these here than write an external extractor containing bits of
|
||||
;; the instruction patterns.)
|
||||
(rule (lower (has_type $I64
|
||||
(uextend src @ (has_type $I32 (iadd _ _)))))
|
||||
src)
|
||||
(rule (lower (has_type $I64
|
||||
(uextend src @ (has_type $I32 (isub _ _)))))
|
||||
src)
|
||||
(rule (lower (has_type $I64
|
||||
(uextend src @ (has_type $I32 (imul _ _)))))
|
||||
src)
|
||||
(rule (lower (has_type $I64
|
||||
(uextend src @ (has_type $I32 (band _ _)))))
|
||||
src)
|
||||
(rule (lower (has_type $I64
|
||||
(uextend src @ (has_type $I32 (bor _ _)))))
|
||||
src)
|
||||
(rule (lower (has_type $I64
|
||||
(uextend src @ (has_type $I32 (bxor _ _)))))
|
||||
src)
|
||||
(rule (lower (has_type $I64
|
||||
(uextend src @ (has_type $I32 (ishl _ _)))))
|
||||
src)
|
||||
(rule (lower (has_type $I64
|
||||
(uextend src @ (has_type $I32 (ushr _ _)))))
|
||||
src)
|
||||
(rule (lower (has_type $I64
|
||||
(uextend src @ (has_type $I32 (uload32 _ _ _)))))
|
||||
src)
|
||||
;; I{8,16} -> I32
|
||||
;; I8 -> I16
|
||||
(rule -1 (lower (has_type (fits_in_32 _) (uextend src)))
|
||||
(extend_to_gpr src $I32 (ExtendKind.Zero)))
|
||||
|
||||
;; Rules for `sextend` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
(decl generic_sextend (Value Type Type) InstOutput)
|
||||
|
||||
;; T -> T is a no-op.
|
||||
(rule 4 (generic_sextend src ty ty)
|
||||
src)
|
||||
|
||||
;; I{8,16,32} -> I128.
|
||||
;;
|
||||
;; Produce upper 64 bits sign-extended from lower 64: shift right by
|
||||
;; 63 bits to spread the sign bit across the result.
|
||||
(decl spread_sign_bit (Gpr) Gpr)
|
||||
(rule (spread_sign_bit src)
|
||||
(x64_sar $I64 src (Imm8Reg.Imm8 63)))
|
||||
|
||||
;; I64 -> I128.
|
||||
(rule 3 (generic_sextend src $I64 $I128)
|
||||
(value_regs src (spread_sign_bit src)))
|
||||
|
||||
;; I{8,16,32} -> I128.
|
||||
(rule 2 (generic_sextend src (fits_in_32 src_ty) $I128)
|
||||
(rule (lower (has_type $I128 (sextend src)))
|
||||
(let ((lo Gpr (extend_to_gpr src $I64 (ExtendKind.Sign)))
|
||||
(hi Gpr (spread_sign_bit lo)))
|
||||
(hi Gpr (x64_sar $I64 lo (Imm8Reg.Imm8 63))))
|
||||
(value_regs lo hi)))
|
||||
|
||||
;; I{8,16,32} -> I64.
|
||||
(rule 1 (generic_sextend src (fits_in_32 src_ty) $I64)
|
||||
(rule (lower (has_type $I64 (sextend src)))
|
||||
(extend_to_gpr src $I64 (ExtendKind.Sign)))
|
||||
|
||||
;; I8 -> I{16,32}, I16 -> I32.
|
||||
(rule 0 (generic_sextend src (fits_in_32 src_ty) (fits_in_32 dst_ty))
|
||||
(extend_to_gpr src $I32 (ExtendKind.Sign)))
|
||||
|
||||
(rule (lower
|
||||
(has_type dst_ty
|
||||
(sextend src @ (value_type src_ty))))
|
||||
(generic_sextend src src_ty dst_ty))
|
||||
;; I{8,16} -> I32
|
||||
;; I8 -> I16
|
||||
(rule -1 (lower (has_type (fits_in_32 _) (sextend src)))
|
||||
(extend_to_gpr src $I32 (ExtendKind.Sign)))
|
||||
|
||||
;; Rules for `ireduce` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
@@ -3859,9 +3796,15 @@
|
||||
;; 0xee == 0b11_10_11_10
|
||||
(x64_pshufd val 0xee))
|
||||
|
||||
;; Note that the `pextrb` lowering here is relied upon by the `extend_to_gpr`
|
||||
;; helper because it will elide a `uextend` operation when `extractlane` is the
|
||||
;; inner node. The `pextrb` operation automatically zero-extends for us so the
|
||||
;; extra `uextend` doesn't have to codegen anything.
|
||||
(rule 0 (lower (extractlane val @ (value_type ty @ (multi_lane 8 16)) (u8_from_uimm8 lane)))
|
||||
(x64_pextrb val lane))
|
||||
|
||||
;; See the note in the 8x16 case above for how this rule is connected to
|
||||
;; `extend_to_gpr`.
|
||||
(rule 0 (lower (extractlane val @ (value_type ty @ (multi_lane 16 8)) (u8_from_uimm8 lane)))
|
||||
(x64_pextrw val lane))
|
||||
|
||||
|
||||
@@ -27,3 +27,81 @@ block0(v0: i32, v1: i32):
|
||||
; popq %rbp
|
||||
; retq
|
||||
|
||||
function %extractlane_i8x16_i16(i8x16) -> i16 {
|
||||
block0(v0: i8x16):
|
||||
v1 = extractlane v0, 1
|
||||
v2 = uextend.i16 v1
|
||||
return v2
|
||||
}
|
||||
|
||||
; VCode:
|
||||
; pushq %rbp
|
||||
; movq %rsp, %rbp
|
||||
; block0:
|
||||
; pextrb $1, %xmm0, %rax
|
||||
; movq %rbp, %rsp
|
||||
; popq %rbp
|
||||
; ret
|
||||
;
|
||||
; Disassembled:
|
||||
; block0: ; offset 0x0
|
||||
; pushq %rbp
|
||||
; movq %rsp, %rbp
|
||||
; block1: ; offset 0x4
|
||||
; pextrb $1, %xmm0, %eax
|
||||
; movq %rbp, %rsp
|
||||
; popq %rbp
|
||||
; retq
|
||||
|
||||
function %extractlane_i8x16_i32(i8x16) -> i32 {
|
||||
block0(v0: i8x16):
|
||||
v1 = extractlane v0, 1
|
||||
v2 = uextend.i32 v1
|
||||
return v2
|
||||
}
|
||||
|
||||
; VCode:
|
||||
; pushq %rbp
|
||||
; movq %rsp, %rbp
|
||||
; block0:
|
||||
; pextrb $1, %xmm0, %rax
|
||||
; movq %rbp, %rsp
|
||||
; popq %rbp
|
||||
; ret
|
||||
;
|
||||
; Disassembled:
|
||||
; block0: ; offset 0x0
|
||||
; pushq %rbp
|
||||
; movq %rsp, %rbp
|
||||
; block1: ; offset 0x4
|
||||
; pextrb $1, %xmm0, %eax
|
||||
; movq %rbp, %rsp
|
||||
; popq %rbp
|
||||
; retq
|
||||
|
||||
function %extractlane_i16x8_i32(i16x8) -> i32 {
|
||||
block0(v0: i16x8):
|
||||
v1 = extractlane v0, 1
|
||||
v2 = uextend.i32 v1
|
||||
return v2
|
||||
}
|
||||
|
||||
; VCode:
|
||||
; pushq %rbp
|
||||
; movq %rsp, %rbp
|
||||
; block0:
|
||||
; pextrw $1, %xmm0, %rax
|
||||
; movq %rbp, %rsp
|
||||
; popq %rbp
|
||||
; ret
|
||||
;
|
||||
; Disassembled:
|
||||
; block0: ; offset 0x0
|
||||
; pushq %rbp
|
||||
; movq %rsp, %rbp
|
||||
; block1: ; offset 0x4
|
||||
; pextrw $1, %xmm0, %eax
|
||||
; movq %rbp, %rsp
|
||||
; popq %rbp
|
||||
; retq
|
||||
|
||||
|
||||
Reference in New Issue
Block a user