x64: Refactor and add extractlane special case for uextend/sextend (#6022)

* x64: Refactor sextend/uextend rules

Move much of the meaty logic from these lowering rules into the
`extend_to_gpr` helper so that other callers of `extend_to_gpr` can also
benefit from the elided instructions. This additionally simplifies the
`sextend` and `uextend` lowerings, which now rely on the optimizations
happening within the `extend_to_gpr` helper.

* x64: Skip `uextend` for `pextr{b,w}` instructions

These instructions are documented as automatically zeroing the upper
bits so `uextend` operations can be skipped. This slightly improves
codegen for the wasm `i{8x16,16x8}.extract_lane_u` instructions, for
example.

* Modernize an extractor pattern

* Trim some superfluous match clauses

Additionally rejigger priorities to be "mostly default" now.

* Refactor 32-to-64 predicate to a helper

Also adjust the pattern matched in the `extend_to_gpr` helper.

* Slightly refactor pextr{b,w} case

* Review comments
This commit is contained in:
Alex Crichton
2023-03-16 17:14:59 -05:00
committed by GitHub
parent d479951469
commit 8e500099b3
3 changed files with 143 additions and 83 deletions

View File

@@ -1703,11 +1703,35 @@
(decl extend_to_gpr (Value Type ExtendKind) Gpr)
;; If the value is already of the requested type, no extending is necessary.
(rule 3 (extend_to_gpr val @ (value_type ty) ty _kind)
val)
;; I32 -> I64 with op that produces a zero-extended value in a register.
;;
;; Priority 1 because the equality constraint doesn't prove that this rule
;; doesn't overlap with the one below.
(rule 1 (extend_to_gpr (and val (value_type ty)) ty _kind)
(put_in_gpr val))
;; As a particular x64 extra-pattern matching opportunity, all the ALU
;; opcodes on 32-bits will zero-extend the upper 32-bits, so we can
;; even not generate a zero-extended move in this case.
(rule 2 (extend_to_gpr src @ (value_type $I32) $I64 (ExtendKind.Zero))
(if-let $true (value32_zeros_upper32 src))
src)
;; An `extractlane` instruction that is zero-extended to `$I32` means that a
;; lane of either an i8x16 or an i16x8 is being extracted. These are
;; implemented with the `pextr{b,w}` instructions, which automatically zero the
;; upper bits of the destination register, so the `uextend` in these cases can
;; be elided.
;;
;; TODO: the documentation for `pextr{b,w}` seems to indicate it zero extends
;; to not only 32-bits but probably the whole 64-bit register. If that's the
;; case then this should match a zero-extend to any size instead of just `$I32`.
;;
;; TODO: the interaction here between this rule and the "it's written far away"
;; rule to lower `extractlane` isn't great. Ideally this rule (and the other
;; special cases for `value32_zeros_upper32`) would live contextually closer or
;; be connected to the extractlane rules. There's some discussion of this on
;; #6022 but the gist is that there's not a lot of great options at this time,
;; so this doc block is what's here for now.
(rule 1 (extend_to_gpr src @ (extractlane _ _) $I32 (ExtendKind.Zero))
src)
(rule (extend_to_gpr (and val (value_type from_ty))
to_ty
@@ -1732,6 +1756,21 @@
;; Sign-extension case of `extend`: emits `x64_movsx` with the given `mode` on
;; `src`. The `ty` argument is bound but not used by this rule.
(rule (extend (ExtendKind.Sign) ty mode src)
(x64_movsx mode src))
;; Tests whether the operation used to produce the input `Value`, which must
;; be a 32-bit operation, will automatically zero the upper 32-bits of the
;; destination register that `Value` is placed in.
;;
;; This is consulted from an `if-let` guard in `extend_to_gpr` to skip the
;; zero-extending move for `$I32` -> `$I64` extensions.
;;
;; NOTE: the patterns below match only on the opcode that produced the value;
;; they do not check the value's type. Callers are responsible for only
;; passing 32-bit values, as `extend_to_gpr` does via `(value_type $I32)`.
(decl pure value32_zeros_upper32 (Value) bool)
;; Opcodes for which the upper 32 bits are reported as zeroed.
(rule (value32_zeros_upper32 (iadd _ _)) $true)
(rule (value32_zeros_upper32 (isub _ _)) $true)
(rule (value32_zeros_upper32 (imul _ _)) $true)
(rule (value32_zeros_upper32 (band _ _)) $true)
(rule (value32_zeros_upper32 (bor _ _)) $true)
(rule (value32_zeros_upper32 (bxor _ _)) $true)
(rule (value32_zeros_upper32 (ishl _ _)) $true)
(rule (value32_zeros_upper32 (ushr _ _)) $true)
(rule (value32_zeros_upper32 (uload32 _ _ _)) $true)
;; Catch-all fallback, at a lower priority than the rules above so that it
;; only applies when none of them match: any other producer is conservatively
;; reported as not zeroing the upper bits.
(rule -1 (value32_zeros_upper32 _) $false)
;;;; Helpers for Working SSE tidbits ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Turn a vector type into its integer-typed vector equivalent.

View File

@@ -2039,101 +2039,38 @@
;; Rules for `uextend` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; T -> T is a no-op.
(rule 1 (lower (has_type ty (uextend src @ (value_type ty))))
src)
;; I64 -> I128.
(rule -1 (lower (has_type $I128 (uextend src @ (value_type $I64))))
(value_regs src (imm $I64 0)))
;; I{8,16,32} -> I128.
(rule (lower (has_type $I128 (uextend src @ (value_type (fits_in_32 src_ty)))))
;; I{8,16,32,64} -> I128.
(rule (lower (has_type $I128 (uextend src)))
(value_regs (extend_to_gpr src $I64 (ExtendKind.Zero)) (imm $I64 0)))
;; I{8,16,32} -> I64.
(rule -1 (lower (has_type $I64 (uextend src @ (value_type (fits_in_32 src_ty)))))
(rule (lower (has_type $I64 (uextend src)))
(extend_to_gpr src $I64 (ExtendKind.Zero)))
;; I8 -> I{16,32}, I16 -> I32.
(rule -2 (lower (has_type (fits_in_32 dst_ty) (uextend src @ (value_type (fits_in_32 src_ty)))))
(extend_to_gpr src $I32 (ExtendKind.Zero)))
;; I32 -> I64 with op that produces a zero-extended value in a register.
;;
;; As a particular x64 extra-pattern matching opportunity, all the ALU
;; opcodes on 32-bits will zero-extend the upper 32-bits, so we can
;; even not generate a zero-extended move in this case.
;;
;; (Note that we unfortunately can't factor out the
;; insts-that-zero-upper-32 pattern into a separate extractor until we
;; can write internal extractors with multiple rules; and we'd rather
;; keep these here than write an external extractor containing bits of
;; the instruction pattern.)
(rule (lower (has_type $I64
(uextend src @ (has_type $I32 (iadd _ _)))))
src)
(rule (lower (has_type $I64
(uextend src @ (has_type $I32 (isub _ _)))))
src)
(rule (lower (has_type $I64
(uextend src @ (has_type $I32 (imul _ _)))))
src)
(rule (lower (has_type $I64
(uextend src @ (has_type $I32 (band _ _)))))
src)
(rule (lower (has_type $I64
(uextend src @ (has_type $I32 (bor _ _)))))
src)
(rule (lower (has_type $I64
(uextend src @ (has_type $I32 (bxor _ _)))))
src)
(rule (lower (has_type $I64
(uextend src @ (has_type $I32 (ishl _ _)))))
src)
(rule (lower (has_type $I64
(uextend src @ (has_type $I32 (ushr _ _)))))
src)
(rule (lower (has_type $I64
(uextend src @ (has_type $I32 (uload32 _ _ _)))))
src)
;; I{8,16} -> I32
;; I8 -> I16
(rule -1 (lower (has_type (fits_in_32 _) (uextend src)))
(extend_to_gpr src $I32 (ExtendKind.Zero)))
;; Rules for `sextend` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(decl generic_sextend (Value Type Type) InstOutput)
;; T -> T is a no-op.
(rule 4 (generic_sextend src ty ty)
src)
;; I{8,16,32} -> I128.
;;
;; Produce upper 64 bits sign-extended from lower 64: shift right by
;; 63 bits to spread the sign bit across the result.
(decl spread_sign_bit (Gpr) Gpr)
(rule (spread_sign_bit src)
(x64_sar $I64 src (Imm8Reg.Imm8 63)))
;; I64 -> I128.
(rule 3 (generic_sextend src $I64 $I128)
(value_regs src (spread_sign_bit src)))
;; I{8,16,32} -> I128.
(rule 2 (generic_sextend src (fits_in_32 src_ty) $I128)
(rule (lower (has_type $I128 (sextend src)))
(let ((lo Gpr (extend_to_gpr src $I64 (ExtendKind.Sign)))
(hi Gpr (spread_sign_bit lo)))
(hi Gpr (x64_sar $I64 lo (Imm8Reg.Imm8 63))))
(value_regs lo hi)))
;; I{8,16,32} -> I64.
(rule 1 (generic_sextend src (fits_in_32 src_ty) $I64)
(rule (lower (has_type $I64 (sextend src)))
(extend_to_gpr src $I64 (ExtendKind.Sign)))
;; I8 -> I{16,32}, I16 -> I32.
(rule 0 (generic_sextend src (fits_in_32 src_ty) (fits_in_32 dst_ty))
(extend_to_gpr src $I32 (ExtendKind.Sign)))
(rule (lower
(has_type dst_ty
(sextend src @ (value_type src_ty))))
(generic_sextend src src_ty dst_ty))
;; I{8,16} -> I32
;; I8 -> I16
(rule -1 (lower (has_type (fits_in_32 _) (sextend src)))
(extend_to_gpr src $I32 (ExtendKind.Sign)))
;; Rules for `ireduce` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -3859,9 +3796,15 @@
;; 0xee == 0b11_10_11_10
(x64_pshufd val 0xee))
;; Note that the `pextrb` lowering here is relied upon by the `extend_to_gpr`
;; helper because it will elide a `uextend` operation when `extractlane` is the
;; inner node. The `pextrb` operation automatically zero-extends for us so the
;; extra `uextend` doesn't have to codegen anything.
(rule 0 (lower (extractlane val @ (value_type ty @ (multi_lane 8 16)) (u8_from_uimm8 lane)))
(x64_pextrb val lane))
;; As with the 8x16 case, the `pextrw` lowering here is relied upon by the
;; `extend_to_gpr` helper, which elides a `uextend` to `$I32` when
;; `extractlane` is the inner node. Any change to this lowering must keep the
;; upper bits of the destination register zeroed, or that special case in
;; `extend_to_gpr` must be updated too.
(rule 0 (lower (extractlane val @ (value_type ty @ (multi_lane 16 8)) (u8_from_uimm8 lane)))
(x64_pextrw val lane))

View File

@@ -27,3 +27,81 @@ block0(v0: i32, v1: i32):
; popq %rbp
; retq
function %extractlane_i8x16_i16(i8x16) -> i16 {
block0(v0: i8x16):
v1 = extractlane v0, 1
v2 = uextend.i16 v1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; pextrb $1, %xmm0, %rax
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; pextrb $1, %xmm0, %eax
; movq %rbp, %rsp
; popq %rbp
; retq
function %extractlane_i8x16_i32(i8x16) -> i32 {
block0(v0: i8x16):
v1 = extractlane v0, 1
v2 = uextend.i32 v1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; pextrb $1, %xmm0, %rax
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; pextrb $1, %xmm0, %eax
; movq %rbp, %rsp
; popq %rbp
; retq
function %extractlane_i16x8_i32(i16x8) -> i32 {
block0(v0: i16x8):
v1 = extractlane v0, 1
v2 = uextend.i32 v1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; pextrw $1, %xmm0, %rax
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; pextrw $1, %xmm0, %eax
; movq %rbp, %rsp
; popq %rbp
; retq