From 368004428ad0d3477d4e9525031c2e2644be1118 Mon Sep 17 00:00:00 2001
From: Trevor Elliott
Date: Mon, 28 Nov 2022 15:48:34 -0800
Subject: [PATCH] Fix rule shadowing instances in x64 and aarch64 backends
 (#5334)

Fix shadowing identified in #5322 for imul and
swiden_high/swiden_low/uwiden_high/uwiden_low combinations in the x64
backend, and remove some redundant rules from the aarch64 dynamic neon
ruleset.

Additionally, add tests to the x64 backend showing that the imul
specializations are firing.
---
 .../src/isa/aarch64/lower_dynamic_neon.isle   |  14 +-
 cranelift/codegen/src/isa/x64/lower.isle      |  30 +--
 .../filetests/isa/x64/simd-widen-mul.clif     | 246 ++++++++++++++++++
 3 files changed, 263 insertions(+), 27 deletions(-)
 create mode 100644 cranelift/filetests/filetests/isa/x64/simd-widen-mul.clif

diff --git a/cranelift/codegen/src/isa/aarch64/lower_dynamic_neon.isle b/cranelift/codegen/src/isa/aarch64/lower_dynamic_neon.isle
index e97c398d2f..bf99f57f71 100644
--- a/cranelift/codegen/src/isa/aarch64/lower_dynamic_neon.isle
+++ b/cranelift/codegen/src/isa/aarch64/lower_dynamic_neon.isle
@@ -99,22 +99,12 @@
 (rule (lower (extract_vector x 0))
   (value_reg (fpu_move_128 (put_in_reg x))))

-;;;; Rules for `swiden_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-(rule -1 (lower (has_type ty (swiden_low x)))
-  (value_reg (vec_extend (VecExtendOp.Sxtl) x $false (lane_size ty))))
-
 ;;;; Rules for `swiden_high` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

 (rule -1 (lower (has_type ty (swiden_high x)))
-  (value_reg (vec_extend (VecExtendOp.Sxtl) x $true (lane_size ty))))
-
-;;;; Rules for `uwiden_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-(rule -1 (lower (has_type ty (uwiden_low x)))
-  (value_reg (vec_extend (VecExtendOp.Uxtl) x $false (lane_size ty))))
+  (vec_extend (VecExtendOp.Sxtl) x $true (lane_size ty)))

 ;;;; Rules for `uwiden_high` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

 (rule -1 (lower (has_type ty (uwiden_high x)))
-  (value_reg (vec_extend (VecExtendOp.Uxtl) x $true (lane_size ty))))
+  (vec_extend (VecExtendOp.Uxtl) x $true (lane_size ty)))
diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle
index 14b07ff141..5abc503259 100644
--- a/cranelift/codegen/src/isa/x64/lower.isle
+++ b/cranelift/codegen/src/isa/x64/lower.isle
@@ -905,10 +905,10 @@
 ;; (No i8x16 multiply.)

-(rule 1 (lower (has_type (multi_lane 16 8) (imul x y)))
+(rule (lower (has_type (multi_lane 16 8) (imul x y)))
       (x64_pmullw x y))

-(rule 1 (lower (has_type (multi_lane 32 4) (imul x y)))
+(rule (lower (has_type (multi_lane 32 4) (imul x y)))
       (x64_pmulld x y))

 ;; With AVX-512 we can implement `i64x2` multiplication with a single
@@ -939,7 +939,7 @@
 ;; the lane of the destination. For this reason we don't need shifts to isolate
 ;; the lower 32-bits, however, we will need to use shifts to isolate the high
 ;; 32-bits when doing calculations, i.e., `Ah == A >> 32`.
-(rule 1 (lower (has_type (multi_lane 64 2)
+(rule (lower (has_type (multi_lane 64 2)
                (imul a b)))
       (let ((a0 Xmm a)
             (b0 Xmm b)
@@ -961,7 +961,7 @@
         (x64_paddq al_bl aa_bb_shifted)))

 ;; Special case for `i16x8.extmul_high_i8x16_s`.
-(rule (lower (has_type (multi_lane 16 8)
+(rule 1 (lower (has_type (multi_lane 16 8)
               (imul (swiden_high (and (value_type (multi_lane 8 16)) x))
                     (swiden_high (and (value_type (multi_lane 8 16)) y)))))
      (let ((x1 Xmm x)
            (x2 Xmm (x64_palignr x1 x1 8))
            (x3 Xmm (x64_pmovsxbw x2))
            (y1 Xmm y)
            (y2 Xmm (x64_palignr y1 y1 8))
            (y3 Xmm (x64_pmovsxbw y2)))
        (x64_pmullw x3 y3)))

 ;; Special case for `i32x4.extmul_high_i16x8_s`.
-(rule (lower (has_type (multi_lane 32 4)
+(rule 1 (lower (has_type (multi_lane 32 4)
               (imul (swiden_high (and (value_type (multi_lane 16 8)) x))
                     (swiden_high (and (value_type (multi_lane 16 8)) y)))))
      (let ((lo Xmm (x64_pmullw x y))
            (hi Xmm (x64_pmulhw x y)))
@@ -987,7 +987,7 @@
        (x64_punpckhwd lo hi)))

 ;; Special case for `i64x2.extmul_high_i32x4_s`.
-(rule (lower (has_type (multi_lane 64 2)
+(rule 1 (lower (has_type (multi_lane 64 2)
               (imul (swiden_high (and (value_type (multi_lane 32 4)) x))
                     (swiden_high (and (value_type (multi_lane 32 4)) y)))))
      (let ((x1 Xmm x)
            (x2 Xmm (x64_pshufd x1 0xFA))
            (y1 Xmm y)
            (y2 Xmm (x64_pshufd y1 0xFA)))
@@ -1001,7 +1001,7 @@
        (x64_pmuldq x2 y2)))

 ;; Special case for `i16x8.extmul_low_i8x16_s`.
-(rule (lower (has_type (multi_lane 16 8)
+(rule 1 (lower (has_type (multi_lane 16 8)
               (imul (swiden_low (and (value_type (multi_lane 8 16)) x))
                     (swiden_low (and (value_type (multi_lane 8 16)) y)))))
      (let ((x2 Xmm (x64_pmovsxbw x))
            (y2 Xmm (x64_pmovsxbw y)))
@@ -1011,7 +1011,7 @@
        (x64_pmullw x2 y2)))

 ;; Special case for `i32x4.extmul_low_i16x8_s`.
-(rule (lower (has_type (multi_lane 32 4)
+(rule 1 (lower (has_type (multi_lane 32 4)
               (imul (swiden_low (and (value_type (multi_lane 16 8)) x))
                     (swiden_low (and (value_type (multi_lane 16 8)) y)))))
      (let ((lo Xmm (x64_pmullw x y))
            (hi Xmm (x64_pmulhw x y)))
@@ -1023,7 +1023,7 @@
        (x64_punpcklwd lo hi)))

 ;; Special case for `i64x2.extmul_low_i32x4_s`.
-(rule (lower (has_type (multi_lane 64 2)
+(rule 1 (lower (has_type (multi_lane 64 2)
               (imul (swiden_low (and (value_type (multi_lane 32 4)) x))
                     (swiden_low (and (value_type (multi_lane 32 4)) y)))))
      (let ((x2 Xmm (x64_pshufd x 0x50))
            (y2 Xmm (x64_pshufd y 0x50)))
@@ -1037,7 +1037,7 @@
        (x64_pmuldq x2 y2)))

 ;; Special case for `i16x8.extmul_high_i8x16_u`.
-(rule (lower (has_type (multi_lane 16 8)
+(rule 1 (lower (has_type (multi_lane 16 8)
               (imul (uwiden_high (and (value_type (multi_lane 8 16)) x))
                     (uwiden_high (and (value_type (multi_lane 8 16)) y)))))
      (let ((x1 Xmm x)
            (x2 Xmm (x64_palignr x1 x1 8))
            (x3 Xmm (x64_pmovzxbw x2))
            (y1 Xmm y)
            (y2 Xmm (x64_palignr y1 y1 8))
            (y3 Xmm (x64_pmovzxbw y2)))
@@ -1051,7 +1051,7 @@
        (x64_pmullw x3 y3)))

 ;; Special case for `i32x4.extmul_high_i16x8_u`.
-(rule (lower (has_type (multi_lane 32 4)
+(rule 1 (lower (has_type (multi_lane 32 4)
               (imul (uwiden_high (and (value_type (multi_lane 16 8)) x))
                     (uwiden_high (and (value_type (multi_lane 16 8)) y)))))
      (let ((lo Xmm (x64_pmullw x y))
            (hi Xmm (x64_pmulhuw x y)))
@@ -1063,7 +1063,7 @@
        (x64_punpckhwd lo hi)))

 ;; Special case for `i64x2.extmul_high_i32x4_u`.
-(rule (lower (has_type (multi_lane 64 2)
+(rule 1 (lower (has_type (multi_lane 64 2)
               (imul (uwiden_high (and (value_type (multi_lane 32 4)) x))
                     (uwiden_high (and (value_type (multi_lane 32 4)) y)))))
      (let ((x1 Xmm x)
            (x2 Xmm (x64_pshufd x1 0xFA))
            (y1 Xmm y)
            (y2 Xmm (x64_pshufd y1 0xFA)))
@@ -1077,7 +1077,7 @@
        (x64_pmuludq x2 y2)))

 ;; Special case for `i16x8.extmul_low_i8x16_u`.
-(rule (lower (has_type (multi_lane 16 8)
+(rule 1 (lower (has_type (multi_lane 16 8)
               (imul (uwiden_low (and (value_type (multi_lane 8 16)) x))
                     (uwiden_low (and (value_type (multi_lane 8 16)) y)))))
      (let ((x2 Xmm (x64_pmovzxbw x))
            (y2 Xmm (x64_pmovzxbw y)))
@@ -1087,7 +1087,7 @@
        (x64_pmullw x2 y2)))

 ;; Special case for `i32x4.extmul_low_i16x8_u`.
-(rule (lower (has_type (multi_lane 32 4)
+(rule 1 (lower (has_type (multi_lane 32 4)
               (imul (uwiden_low (and (value_type (multi_lane 16 8)) x))
                     (uwiden_low (and (value_type (multi_lane 16 8)) y)))))
      (let ((lo Xmm (x64_pmullw x y))
            (hi Xmm (x64_pmulhuw x y)))
@@ -1099,7 +1099,7 @@
        (x64_punpcklwd lo hi)))

 ;; Special case for `i64x2.extmul_low_i32x4_u`.
-(rule (lower (has_type (multi_lane 64 2)
+(rule 1 (lower (has_type (multi_lane 64 2)
               (imul (uwiden_low (and (value_type (multi_lane 32 4)) x))
                     (uwiden_low (and (value_type (multi_lane 32 4)) y)))))
diff --git a/cranelift/filetests/filetests/isa/x64/simd-widen-mul.clif b/cranelift/filetests/filetests/isa/x64/simd-widen-mul.clif
new file mode 100644
index 0000000000..daa5ad5ef3
--- /dev/null
+++ b/cranelift/filetests/filetests/isa/x64/simd-widen-mul.clif
@@ -0,0 +1,246 @@
+
+
+test compile precise-output
+set enable_simd
+target x86_64
+
+function %imul_swiden_hi_i8x16(i8x16, i8x16) -> i16x8 {
+block0(v0: i8x16, v1: i8x16):
+    v2 = swiden_high v0
+    v3 = swiden_high v1
+    v4 = imul v2, v3
+    return v4
+}
+
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; movdqa %xmm0, %xmm3
+; palignr $8, %xmm3, %xmm0, %xmm3
+; pmovsxbw %xmm3, %xmm0
+; movdqa %xmm1, %xmm7
+; palignr $8, %xmm7, %xmm1, %xmm7
+; pmovsxbw %xmm7, %xmm9
+; pmullw %xmm0, %xmm9, %xmm0
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+
+function %imul_swiden_hi_i16x8(i16x8, i16x8) -> i32x4 {
+block0(v0: i16x8, v1: i16x8):
+    v2 = swiden_high v0
+    v3 = swiden_high v1
+    v4 = imul v2, v3
+    return v4
+}
+
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; movdqa %xmm0, %xmm5
+; pmullw %xmm5, %xmm1, %xmm5
+; movdqa %xmm5, %xmm6
+; movdqa %xmm0, %xmm5
+; pmulhw %xmm5, %xmm1, %xmm5
+; movdqa %xmm6, %xmm0
+; punpckhwd %xmm0, %xmm5, %xmm0
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+
+function %imul_swiden_hi_i32x4(i32x4, i32x4) -> i64x2 {
+block0(v0: i32x4, v1: i32x4):
+    v2 = swiden_high v0
+    v3 = swiden_high v1
+    v4 = imul v2, v3
+    return v4
+}
+
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; pshufd $250, %xmm0, %xmm0
+; pshufd $250, %xmm1, %xmm5
+; pmuldq %xmm0, %xmm5, %xmm0
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+
+function %imul_swiden_low_i8x16(i8x16, i8x16) -> i16x8 {
+block0(v0: i8x16, v1: i8x16):
+    v2 = swiden_low v0
+    v3 = swiden_low v1
+    v4 = imul v2, v3
+    return v4
+}
+
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; pmovsxbw %xmm0, %xmm0
+; pmovsxbw %xmm1, %xmm5
+; pmullw %xmm0, %xmm5, %xmm0
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+
+function %imul_swiden_low_i16x8(i16x8, i16x8) -> i32x4 {
+block0(v0: i16x8, v1: i16x8):
+    v2 = swiden_low v0
+    v3 = swiden_low v1
+    v4 = imul v2, v3
+    return v4
+}
+
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; movdqa %xmm0, %xmm5
+; pmullw %xmm5, %xmm1, %xmm5
+; movdqa %xmm5, %xmm6
+; movdqa %xmm0, %xmm5
+; pmulhw %xmm5, %xmm1, %xmm5
+; movdqa %xmm6, %xmm0
+; punpcklwd %xmm0, %xmm5, %xmm0
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+
+function %imul_swiden_low_i32x4(i32x4, i32x4) -> i64x2 {
+block0(v0: i32x4, v1: i32x4):
+    v2 = swiden_low v0
+    v3 = swiden_low v1
+    v4 = imul v2, v3
+    return v4
+}
+
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; pshufd $80, %xmm0, %xmm0
+; pshufd $80, %xmm1, %xmm5
+; pmuldq %xmm0, %xmm5, %xmm0
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+
+function %imul_uwiden_hi_i8x16(i8x16, i8x16) -> i16x8 {
+block0(v0: i8x16, v1: i8x16):
+    v2 = uwiden_high v0
+    v3 = uwiden_high v1
+    v4 = imul v2, v3
+    return v4
+}
+
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; movdqa %xmm0, %xmm3
+; palignr $8, %xmm3, %xmm0, %xmm3
+; pmovzxbw %xmm3, %xmm0
+; movdqa %xmm1, %xmm7
+; palignr $8, %xmm7, %xmm1, %xmm7
+; pmovzxbw %xmm7, %xmm9
+; pmullw %xmm0, %xmm9, %xmm0
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+
+function %imul_uwiden_hi_i16x8(i16x8, i16x8) -> i32x4 {
+block0(v0: i16x8, v1: i16x8):
+    v2 = uwiden_high v0
+    v3 = uwiden_high v1
+    v4 = imul v2, v3
+    return v4
+}
+
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; movdqa %xmm0, %xmm5
+; pmullw %xmm5, %xmm1, %xmm5
+; movdqa %xmm5, %xmm6
+; movdqa %xmm0, %xmm5
+; pmulhuw %xmm5, %xmm1, %xmm5
+; movdqa %xmm6, %xmm0
+; punpckhwd %xmm0, %xmm5, %xmm0
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+
+function %imul_uwiden_hi_i32x4(i32x4, i32x4) -> i64x2 {
+block0(v0: i32x4, v1: i32x4):
+    v2 = uwiden_high v0
+    v3 = uwiden_high v1
+    v4 = imul v2, v3
+    return v4
+}
+
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; pshufd $250, %xmm0, %xmm0
+; pshufd $250, %xmm1, %xmm5
+; pmuludq %xmm0, %xmm5, %xmm0
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+
+function %imul_uwiden_low_i8x16(i8x16, i8x16) -> i16x8 {
+block0(v0: i8x16, v1: i8x16):
+    v2 = uwiden_low v0
+    v3 = uwiden_low v1
+    v4 = imul v2, v3
+    return v4
+}
+
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; pmovzxbw %xmm0, %xmm0
+; pmovzxbw %xmm1, %xmm5
+; pmullw %xmm0, %xmm5, %xmm0
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+
+function %imul_uwiden_low_i16x8(i16x8, i16x8) -> i32x4 {
+block0(v0: i16x8, v1: i16x8):
+    v2 = uwiden_low v0
+    v3 = uwiden_low v1
+    v4 = imul v2, v3
+    return v4
+}
+
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; movdqa %xmm0, %xmm5
+; pmullw %xmm5, %xmm1, %xmm5
+; movdqa %xmm5, %xmm6
+; movdqa %xmm0, %xmm5
+; pmulhuw %xmm5, %xmm1, %xmm5
+; movdqa %xmm6, %xmm0
+; punpcklwd %xmm0, %xmm5, %xmm0
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+
+function %imul_uwiden_low_i32x4(i32x4, i32x4) -> i64x2 {
+block0(v0: i32x4, v1: i32x4):
+    v2 = uwiden_low v0
+    v3 = uwiden_low v1
+    v4 = imul v2, v3
+    return v4
+}
+
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; pshufd $80, %xmm0, %xmm0
+; pshufd $80, %xmm1, %xmm5
+; pmuludq %xmm0, %xmm5, %xmm0
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+