aarch64: Add specialized shuffle lowerings (#5977)

* aarch64: Add `shuffle` lowerings for the `uzp{1,2}` instructions

  This commit uses the same style of patterns as in the x64 backend to start adding specific lowerings of the Cranelift `shuffle` instruction to particular AArch64 instructions.

* aarch64: Add `shuffle` lowerings to the `zip{1,2}` instructions

  These instructions match the `punpck*` family of instructions on x64 and should help provide more efficient lowerings than the current `shuffle` fallback.

* aarch64: Add `shuffle` lowerings for `trn{1,2}`

  Along the lines of prior commits, this adds specific lowering patterns for the individual AArch64 instructions available.

* aarch64: Add a `shuffle` lowering for the `ext` instruction

  This instruction will more-or-less concatenate two 128-bit vector registers to create a 256-bit value, shift it right, and then take the lower 128 bits into the destination. This can be modeled with a `shuffle` of consecutive bytes, so this adds a lowering rule to generate this instruction.

* aarch64: Add `shuffle` special case for `dup`

  This commit adds special cases for Cranelift's `shuffle` on AArch64 when the lowering can be represented with a `dup` instruction, which broadcasts one vector's lane into all lanes of the destination.

* aarch64: Add `shuffle` specializations for `rev` instructions

  This commit adds shuffle mask specializations for the `rev{16,32,64}` family of instructions on AArch64, which can be used to reverse bytes, 16-bit values, or 32-bit values within larger values.

* Fix tests

* Add doc-comments in ISLE
2023-03-10 15:37:13 -06:00
parent 5623f7280c
commit 52896e020d
10 changed files with 1305 additions and 11 deletions
--- a/cranelift/codegen/src/isa/aarch64/inst.isle
+++ b/cranelift/codegen/src/isa/aarch64/inst.isle
@@ -553,7 +553,8 @@
       (VecDupFromFpu
        (rd WritableReg)
        (rn Reg)
-        (size VectorSize))
+        (size VectorSize)
+        (lane u8))

       ;; Duplicate FP immediate to vector.
       (VecDupFPImm
@@ -1390,8 +1391,18 @@
    (Addp)
    ;; Zip vectors (primary) [meaning, high halves]
    (Zip1)
+    ;; Zip vectors (secondary)
+    (Zip2)
    ;; Signed saturating rounding doubling multiply returning high half
    (Sqrdmulh)
+    ;; Unzip vectors (primary)
+    (Uzp1)
+    ;; Unzip vectors (secondary)
+    (Uzp2)
+    ;; Transpose vectors (primary)
+    (Trn1)
+    ;; Transpose vectors (secondary)
+    (Trn2)
 ))

 ;; A Vector ALU operation which modifies a source register.
@@ -1420,6 +1431,10 @@
    (Fneg)
    ;; Floating-point square root
    (Fsqrt)
+    ;; Reverse elements in 16-bit halfwords
+    (Rev16)
+    ;; Reverse elements in 32-bit words
+    (Rev32)
    ;; Reverse elements in 64-bit doublewords
    (Rev64)
    ;; Floating-point convert to signed integer, rounding toward zero
@@ -1887,10 +1902,10 @@
        dst))

 ;; Helper for emitting `MInst.VecDupFromFpu` instructions.
-(decl vec_dup_from_fpu (Reg VectorSize) Reg)
-(rule (vec_dup_from_fpu src size)
+(decl vec_dup_from_fpu (Reg VectorSize u8) Reg)
+(rule (vec_dup_from_fpu src size lane)
      (let ((dst WritableReg (temp_writable_reg $I8X16))
-            (_ Unit (emit (MInst.VecDupFromFpu dst src size))))
+            (_ Unit (emit (MInst.VecDupFromFpu dst src size lane))))
        dst))

 ;; Helper for emitting `MInst.AluRRImm12` instructions.
@@ -2386,6 +2401,14 @@
 (decl neg (Reg VectorSize) Reg)
 (rule (neg x size) (vec_misc (VecMisc2.Neg) x size))

+;; Helper for generating `rev16` instructions (a `vec_misc` with `VecMisc2.Rev16`).
+(decl rev16 (Reg VectorSize) Reg)
+(rule (rev16 src vsize) (vec_misc (VecMisc2.Rev16) src vsize))
+
+;; Helper for generating `rev32` instructions (a `vec_misc` with `VecMisc2.Rev32`).
+(decl rev32 (Reg VectorSize) Reg)
+(rule (rev32 src vsize) (vec_misc (VecMisc2.Rev32) src vsize))
+
 ;; Helper for generating `rev64` instructions.
 (decl rev64 (Reg VectorSize) Reg)
 (rule (rev64 x size) (vec_misc (VecMisc2.Rev64) x size))
@@ -3767,3 +3790,27 @@
       (emit_side_effect (with_flags_side_effect
            (cmp (OperandSize.Size32) ridx jt_size)
            (jt_sequence ridx jt_info)))))
+
+;; Helper for emitting the `uzp1` instruction (a `vec_rrr` with `VecALUOp.Uzp1`).
+(decl vec_uzp1 (Reg Reg VectorSize) Reg)
+(rule (vec_uzp1 src1 src2 vsize) (vec_rrr (VecALUOp.Uzp1) src1 src2 vsize))
+
+;; Helper for emitting the `uzp2` instruction (a `vec_rrr` with `VecALUOp.Uzp2`).
+(decl vec_uzp2 (Reg Reg VectorSize) Reg)
+(rule (vec_uzp2 src1 src2 vsize) (vec_rrr (VecALUOp.Uzp2) src1 src2 vsize))
+
+;; Helper for emitting the `zip1` instruction (a `vec_rrr` with `VecALUOp.Zip1`).
+(decl vec_zip1 (Reg Reg VectorSize) Reg)
+(rule (vec_zip1 src1 src2 vsize) (vec_rrr (VecALUOp.Zip1) src1 src2 vsize))
+
+;; Helper for emitting the `zip2` instruction (a `vec_rrr` with `VecALUOp.Zip2`).
+(decl vec_zip2 (Reg Reg VectorSize) Reg)
+(rule (vec_zip2 src1 src2 vsize) (vec_rrr (VecALUOp.Zip2) src1 src2 vsize))
+
+;; Helper for emitting the `trn1` instruction (a `vec_rrr` with `VecALUOp.Trn1`).
+(decl vec_trn1 (Reg Reg VectorSize) Reg)
+(rule (vec_trn1 src1 src2 vsize) (vec_rrr (VecALUOp.Trn1) src1 src2 vsize))
+
+;; Helper for emitting the `trn2` instruction (a `vec_rrr` with `VecALUOp.Trn2`).
+(decl vec_trn2 (Reg Reg VectorSize) Reg)
+(rule (vec_trn2 src1 src2 vsize) (vec_rrr (VecALUOp.Trn2) src1 src2 vsize))