x64: Refactor and add extractlane special case for uextend/sextend (#6022)
* x64: Refactor sextend/uextend rules
Move much of the meaty logic from these lowering rules into the
`extend_to_gpr` helper to benefit other callers of `extend_to_gpr` to
elide instructions. This additionally simplifies `sextend` and `uextend`
lowerings to rely on optimizations happening within the `extend_to_gpr`
helper.
* x64: Skip `uextend` for `pextr{b,w}` instructions
These instructions are documented as automatically zeroing the upper
bits so `uextend` operations can be skipped. This slightly improves
codegen for the wasm `i{8x16,16x8}.extract_lane_u` instructions, for
example.
* Modernize an extractor pattern
* Trim some superfluous match clauses
Additionally rejigger priorities to be "mostly default" now.
* Refactor 32-to-64 predicate to a helper
Also adjust the pattern matched in the `extend_to_gpr` helper.
* Slightly refactor pextr{b,w} case
* Review comments
This commit is contained in:
@@ -1703,11 +1703,35 @@
|
||||
(decl extend_to_gpr (Value Type ExtendKind) Gpr)
|
||||
|
||||
;; If the value is already of the requested type, no extending is necessary.
|
||||
(rule 3 (extend_to_gpr val @ (value_type ty) ty _kind)
|
||||
val)
|
||||
|
||||
;; I32 -> I64 with op that produces a zero-extended value in a register.
|
||||
;;
|
||||
;; Priority 1 because the equality constraint doesn't prove that this rule
|
||||
;; doesn't overlap with the one below.
|
||||
(rule 1 (extend_to_gpr (and val (value_type ty)) ty _kind)
|
||||
(put_in_gpr val))
|
||||
;; As a particular x64 extra-pattern matching opportunity, all the ALU
|
||||
;; opcodes on 32-bits will zero-extend the upper 32-bits, so we can
|
||||
;; even not generate a zero-extended move in this case.
|
||||
(rule 2 (extend_to_gpr src @ (value_type $I32) $I64 (ExtendKind.Zero))
|
||||
(if-let $true (value32_zeros_upper32 src))
|
||||
src)
|
||||
|
||||
;; The `extractlane` instruction, extended to `$I32`, means that either an
|
||||
;; i8x16 or an i16x8 is being extracted. These are implemented with
|
||||
;; the `pextr{b,w}` instruction which automatically zeroes the upper bits of the
|
||||
;; destination register so the `uextend` in these cases can be elided.
|
||||
;;
|
||||
;; TODO: the documentation for `pextr{b,w}` seems to indicate it zero extends
|
||||
;; to not only 32-bits but probably the whole 64-bit register. If that's the
|
||||
;; case then this should match a zero-extend to any size instead of just `$I32`.
|
||||
;;
|
||||
;; TODO: the interaction here between this rule and the "it's written far away"
|
||||
;; rule to lower `extractlane` isn't great. Ideally this rule (and the other
|
||||
;; special cases for `value32_zeros_upper32`) would live contextually closer or
|
||||
;; be connected to the extractlane rules. There's some discussion of this on
|
||||
;; #6022 but the gist is that there's not a lot of great options at this time,
|
||||
;; so this doc block is what's here for now.
|
||||
(rule 1 (extend_to_gpr src @ (extractlane _ _) $I32 (ExtendKind.Zero))
|
||||
src)
|
||||
|
||||
(rule (extend_to_gpr (and val (value_type from_ty))
|
||||
to_ty
|
||||
@@ -1732,6 +1756,21 @@
|
||||
(rule (extend (ExtendKind.Sign) ty mode src)
|
||||
(x64_movsx mode src))
|
||||
|
||||
;; Tests whether the operation used to produce the input `Value`, which must
|
||||
;; be a 32-bit operation, will automatically zero the upper 32-bits of the
|
||||
;; destination register that `Value` is placed in.
|
||||
(decl pure value32_zeros_upper32 (Value) bool)
|
||||
(rule (value32_zeros_upper32 (iadd _ _)) $true)
|
||||
(rule (value32_zeros_upper32 (isub _ _)) $true)
|
||||
(rule (value32_zeros_upper32 (imul _ _)) $true)
|
||||
(rule (value32_zeros_upper32 (band _ _)) $true)
|
||||
(rule (value32_zeros_upper32 (bor _ _)) $true)
|
||||
(rule (value32_zeros_upper32 (bxor _ _)) $true)
|
||||
(rule (value32_zeros_upper32 (ishl _ _)) $true)
|
||||
(rule (value32_zeros_upper32 (ushr _ _)) $true)
|
||||
(rule (value32_zeros_upper32 (uload32 _ _ _)) $true)
|
||||
(rule -1 (value32_zeros_upper32 _) $false)
|
||||
|
||||
;;;; Helpers for Working SSE tidbits ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;; Turn a vector type into its integer-typed vector equivalent.
|
||||
|
||||
@@ -2039,101 +2039,38 @@
|
||||
|
||||
;; Rules for `uextend` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;; T -> T is a no-op.
|
||||
(rule 1 (lower (has_type ty (uextend src @ (value_type ty))))
|
||||
src)
|
||||
|
||||
;; I64 -> I128.
|
||||
(rule -1 (lower (has_type $I128 (uextend src @ (value_type $I64))))
|
||||
(value_regs src (imm $I64 0)))
|
||||
|
||||
;; I{8,16,32} -> I128.
|
||||
(rule (lower (has_type $I128 (uextend src @ (value_type (fits_in_32 src_ty)))))
|
||||
;; I{8,16,32,64} -> I128.
|
||||
(rule (lower (has_type $I128 (uextend src)))
|
||||
(value_regs (extend_to_gpr src $I64 (ExtendKind.Zero)) (imm $I64 0)))
|
||||
|
||||
;; I{8,16,32} -> I64.
|
||||
(rule -1 (lower (has_type $I64 (uextend src @ (value_type (fits_in_32 src_ty)))))
|
||||
(rule (lower (has_type $I64 (uextend src)))
|
||||
(extend_to_gpr src $I64 (ExtendKind.Zero)))
|
||||
|
||||
;; I8 -> I{16,32}, I16 -> I32.
|
||||
(rule -2 (lower (has_type (fits_in_32 dst_ty) (uextend src @ (value_type (fits_in_32 src_ty)))))
|
||||
(extend_to_gpr src $I32 (ExtendKind.Zero)))
|
||||
|
||||
;; I32 -> I64 with op that produces a zero-extended value in a register.
|
||||
;;
|
||||
;; As a particular x64 extra-pattern matching opportunity, all the ALU
|
||||
;; opcodes on 32-bits will zero-extend the upper 32-bits, so we can
|
||||
;; even not generate a zero-extended move in this case.
|
||||
;;
|
||||
;; (Note that we unfortunately can't factor out the
|
||||
;; insts-that-zero-upper-32 pattern into a separate extractor until we
|
||||
;; can write internal extractors with multiple rules; and we'd rather
|
||||
;; keep these here than write an external extractor containing bits of
|
||||
;; the instruction patterns.)
|
||||
(rule (lower (has_type $I64
|
||||
(uextend src @ (has_type $I32 (iadd _ _)))))
|
||||
src)
|
||||
(rule (lower (has_type $I64
|
||||
(uextend src @ (has_type $I32 (isub _ _)))))
|
||||
src)
|
||||
(rule (lower (has_type $I64
|
||||
(uextend src @ (has_type $I32 (imul _ _)))))
|
||||
src)
|
||||
(rule (lower (has_type $I64
|
||||
(uextend src @ (has_type $I32 (band _ _)))))
|
||||
src)
|
||||
(rule (lower (has_type $I64
|
||||
(uextend src @ (has_type $I32 (bor _ _)))))
|
||||
src)
|
||||
(rule (lower (has_type $I64
|
||||
(uextend src @ (has_type $I32 (bxor _ _)))))
|
||||
src)
|
||||
(rule (lower (has_type $I64
|
||||
(uextend src @ (has_type $I32 (ishl _ _)))))
|
||||
src)
|
||||
(rule (lower (has_type $I64
|
||||
(uextend src @ (has_type $I32 (ushr _ _)))))
|
||||
src)
|
||||
(rule (lower (has_type $I64
|
||||
(uextend src @ (has_type $I32 (uload32 _ _ _)))))
|
||||
src)
|
||||
;; I{8,16} -> I32
|
||||
;; I8 -> I16
|
||||
(rule -1 (lower (has_type (fits_in_32 _) (uextend src)))
|
||||
(extend_to_gpr src $I32 (ExtendKind.Zero)))
|
||||
|
||||
;; Rules for `sextend` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
(decl generic_sextend (Value Type Type) InstOutput)
|
||||
|
||||
;; T -> T is a no-op.
|
||||
(rule 4 (generic_sextend src ty ty)
|
||||
src)
|
||||
|
||||
;; I{8,16,32} -> I128.
|
||||
;;
|
||||
;; Produce upper 64 bits sign-extended from lower 64: shift right by
|
||||
;; 63 bits to spread the sign bit across the result.
|
||||
(decl spread_sign_bit (Gpr) Gpr)
|
||||
(rule (spread_sign_bit src)
|
||||
(x64_sar $I64 src (Imm8Reg.Imm8 63)))
|
||||
|
||||
;; I64 -> I128.
|
||||
(rule 3 (generic_sextend src $I64 $I128)
|
||||
(value_regs src (spread_sign_bit src)))
|
||||
|
||||
;; I{8,16,32} -> I128.
|
||||
(rule 2 (generic_sextend src (fits_in_32 src_ty) $I128)
|
||||
(rule (lower (has_type $I128 (sextend src)))
|
||||
(let ((lo Gpr (extend_to_gpr src $I64 (ExtendKind.Sign)))
|
||||
(hi Gpr (spread_sign_bit lo)))
|
||||
(hi Gpr (x64_sar $I64 lo (Imm8Reg.Imm8 63))))
|
||||
(value_regs lo hi)))
|
||||
|
||||
;; I{8,16,32} -> I64.
|
||||
(rule 1 (generic_sextend src (fits_in_32 src_ty) $I64)
|
||||
(rule (lower (has_type $I64 (sextend src)))
|
||||
(extend_to_gpr src $I64 (ExtendKind.Sign)))
|
||||
|
||||
;; I8 -> I{16,32}, I16 -> I32.
|
||||
(rule 0 (generic_sextend src (fits_in_32 src_ty) (fits_in_32 dst_ty))
|
||||
(extend_to_gpr src $I32 (ExtendKind.Sign)))
|
||||
|
||||
(rule (lower
|
||||
(has_type dst_ty
|
||||
(sextend src @ (value_type src_ty))))
|
||||
(generic_sextend src src_ty dst_ty))
|
||||
;; I{8,16} -> I32
|
||||
;; I8 -> I16
|
||||
(rule -1 (lower (has_type (fits_in_32 _) (sextend src)))
|
||||
(extend_to_gpr src $I32 (ExtendKind.Sign)))
|
||||
|
||||
;; Rules for `ireduce` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
@@ -3859,9 +3796,15 @@
|
||||
;; 0xee == 0b11_10_11_10
|
||||
(x64_pshufd val 0xee))
|
||||
|
||||
;; Note that the `pextrb` lowering here is relied upon by the `extend_to_gpr`
|
||||
;; helper because it will elide a `uextend` operation when `extractlane` is the
|
||||
;; inner node. The `pextrb` operation automatically zero-extends for us so the
|
||||
;; extra `uextend` doesn't have to codegen anything.
|
||||
(rule 0 (lower (extractlane val @ (value_type ty @ (multi_lane 8 16)) (u8_from_uimm8 lane)))
|
||||
(x64_pextrb val lane))
|
||||
|
||||
;; See the note in the 8x16 case above for how this rule is connected to
|
||||
;; `extend_to_gpr`.
|
||||
(rule 0 (lower (extractlane val @ (value_type ty @ (multi_lane 16 8)) (u8_from_uimm8 lane)))
|
||||
(x64_pextrw val lane))
|
||||
|
||||
|
||||
@@ -27,3 +27,81 @@ block0(v0: i32, v1: i32):
|
||||
; popq %rbp
|
||||
; retq
|
||||
|
||||
function %extractlane_i8x16_i16(i8x16) -> i16 {
|
||||
block0(v0: i8x16):
|
||||
v1 = extractlane v0, 1
|
||||
v2 = uextend.i16 v1
|
||||
return v2
|
||||
}
|
||||
|
||||
; VCode:
|
||||
; pushq %rbp
|
||||
; movq %rsp, %rbp
|
||||
; block0:
|
||||
; pextrb $1, %xmm0, %rax
|
||||
; movq %rbp, %rsp
|
||||
; popq %rbp
|
||||
; ret
|
||||
;
|
||||
; Disassembled:
|
||||
; block0: ; offset 0x0
|
||||
; pushq %rbp
|
||||
; movq %rsp, %rbp
|
||||
; block1: ; offset 0x4
|
||||
; pextrb $1, %xmm0, %eax
|
||||
; movq %rbp, %rsp
|
||||
; popq %rbp
|
||||
; retq
|
||||
|
||||
function %extractlane_i8x16_i32(i8x16) -> i32 {
|
||||
block0(v0: i8x16):
|
||||
v1 = extractlane v0, 1
|
||||
v2 = uextend.i32 v1
|
||||
return v2
|
||||
}
|
||||
|
||||
; VCode:
|
||||
; pushq %rbp
|
||||
; movq %rsp, %rbp
|
||||
; block0:
|
||||
; pextrb $1, %xmm0, %rax
|
||||
; movq %rbp, %rsp
|
||||
; popq %rbp
|
||||
; ret
|
||||
;
|
||||
; Disassembled:
|
||||
; block0: ; offset 0x0
|
||||
; pushq %rbp
|
||||
; movq %rsp, %rbp
|
||||
; block1: ; offset 0x4
|
||||
; pextrb $1, %xmm0, %eax
|
||||
; movq %rbp, %rsp
|
||||
; popq %rbp
|
||||
; retq
|
||||
|
||||
function %extractlane_i16x8_i32(i16x8) -> i32 {
|
||||
block0(v0: i16x8):
|
||||
v1 = extractlane v0, 1
|
||||
v2 = uextend.i32 v1
|
||||
return v2
|
||||
}
|
||||
|
||||
; VCode:
|
||||
; pushq %rbp
|
||||
; movq %rsp, %rbp
|
||||
; block0:
|
||||
; pextrw $1, %xmm0, %rax
|
||||
; movq %rbp, %rsp
|
||||
; popq %rbp
|
||||
; ret
|
||||
;
|
||||
; Disassembled:
|
||||
; block0: ; offset 0x0
|
||||
; pushq %rbp
|
||||
; movq %rsp, %rbp
|
||||
; block1: ; offset 0x4
|
||||
; pextrw $1, %xmm0, %eax
|
||||
; movq %rbp, %rsp
|
||||
; popq %rbp
|
||||
; retq
|
||||
|
||||
|
||||
Reference in New Issue
Block a user