Fix rule shadowing instances in x64 and aarch64 backends (#5334)
Fix shadowing identified in #5322 for imul and swiden_high/swiden_low/uwiden_high/uwiden_low combinations in the x64 backend, and remove some redundant rules from the aarch64 dynamic neon ruleset. Additionally, add tests to the x64 backend showing that the imul specializations are firing.
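For context: when several ISLE rules match the same input, the rule with the highest priority wins, and a rule written without an explicit priority defaults to priority 0. The sketch below illustrates the shape of the fix only; `generic_mul` and `specialized_extmul` are hypothetical constructors standing in for the backend's real ones.

;; Before the fix the generic rule carried priority 1 and shadowed the
;; specialization at the default priority 0. After the fix the generic
;; rule drops to the default priority...
(rule (lower (has_type (multi_lane 16 8) (imul x y)))
      (generic_mul x y))

;; ...and the specialization is raised to priority 1, so its more
;; specific pattern (an `imul` of two `swiden_high` values) now fires.
(rule 1 (lower (has_type (multi_lane 16 8)
                         (imul (swiden_high x) (swiden_high y))))
      (specialized_extmul x y))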
@@ -99,22 +99,12 @@
 (rule (lower (extract_vector x 0))
       (value_reg (fpu_move_128 (put_in_reg x))))

 ;;;; Rules for `swiden_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

 (rule -1 (lower (has_type ty (swiden_low x)))
       (value_reg (vec_extend (VecExtendOp.Sxtl) x $false (lane_size ty))))

 ;;;; Rules for `swiden_high` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

 (rule -1 (lower (has_type ty (swiden_high x)))
       (value_reg (vec_extend (VecExtendOp.Sxtl) x $true (lane_size ty))))

 ;;;; Rules for `uwiden_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

 (rule -1 (lower (has_type ty (uwiden_low x)))
       (value_reg (vec_extend (VecExtendOp.Uxtl) x $false (lane_size ty))))
-      (vec_extend (VecExtendOp.Sxtl) x $true (lane_size ty)))

 ;;;; Rules for `uwiden_high` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

 (rule -1 (lower (has_type ty (uwiden_high x)))
       (value_reg (vec_extend (VecExtendOp.Uxtl) x $true (lane_size ty))))
-      (vec_extend (VecExtendOp.Uxtl) x $true (lane_size ty)))
@@ -905,10 +905,10 @@
 ;; (No i8x16 multiply.)
-(rule 1 (lower (has_type (multi_lane 16 8) (imul x y)))
+(rule (lower (has_type (multi_lane 16 8) (imul x y)))
       (x64_pmullw x y))

-(rule 1 (lower (has_type (multi_lane 32 4) (imul x y)))
+(rule (lower (has_type (multi_lane 32 4) (imul x y)))
       (x64_pmulld x y))

 ;; With AVX-512 we can implement `i64x2` multiplication with a single
@@ -939,7 +939,7 @@
 ;; the lane of the destination. For this reason we don't need shifts to isolate
 ;; the lower 32-bits, however, we will need to use shifts to isolate the high
 ;; 32-bits when doing calculations, i.e., `Ah == A >> 32`.
-(rule 1 (lower (has_type (multi_lane 64 2)
+(rule (lower (has_type (multi_lane 64 2)
                 (imul a b)))
   (let ((a0 Xmm a)
         (b0 Xmm b)
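(Editor's gloss, not part of the commit: writing `A = Ah*2^32 + Al` and `B = Bh*2^32 + Bl`, the wrapping product is `A*B mod 2^64 = Al*Bl + ((Al*Bh + Ah*Bl) << 32)`; the `Ah*Bh` term is shifted left by 64 and drops out entirely. The lowering computes `al_bl` with an unsigned 32x32->64 multiply and adds in the shifted cross terms, as visible in the `x64_paddq` line below.)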
@@ -961,7 +961,7 @@
       (x64_paddq al_bl aa_bb_shifted)))

 ;; Special case for `i16x8.extmul_high_i8x16_s`.
-(rule (lower (has_type (multi_lane 16 8)
+(rule 1 (lower (has_type (multi_lane 16 8)
               (imul (swiden_high (and (value_type (multi_lane 8 16))
                                       x))
                     (swiden_high (and (value_type (multi_lane 8 16))
@@ -975,7 +975,7 @@
       (x64_pmullw x3 y3)))

 ;; Special case for `i32x4.extmul_high_i16x8_s`.
-(rule (lower (has_type (multi_lane 32 4)
+(rule 1 (lower (has_type (multi_lane 32 4)
               (imul (swiden_high (and (value_type (multi_lane 16 8))
                                       x))
                     (swiden_high (and (value_type (multi_lane 16 8))
@@ -987,7 +987,7 @@
       (x64_punpckhwd lo hi)))

 ;; Special case for `i64x2.extmul_high_i32x4_s`.
-(rule (lower (has_type (multi_lane 64 2)
+(rule 1 (lower (has_type (multi_lane 64 2)
               (imul (swiden_high (and (value_type (multi_lane 32 4))
                                       x))
                     (swiden_high (and (value_type (multi_lane 32 4))
@@ -1001,7 +1001,7 @@
       (x64_pmuldq x2 y2)))

 ;; Special case for `i16x8.extmul_low_i8x16_s`.
-(rule (lower (has_type (multi_lane 16 8)
+(rule 1 (lower (has_type (multi_lane 16 8)
               (imul (swiden_low (and (value_type (multi_lane 8 16))
                                      x))
                     (swiden_low (and (value_type (multi_lane 8 16))
@@ -1011,7 +1011,7 @@
       (x64_pmullw x2 y2)))

 ;; Special case for `i32x4.extmul_low_i16x8_s`.
-(rule (lower (has_type (multi_lane 32 4)
+(rule 1 (lower (has_type (multi_lane 32 4)
               (imul (swiden_low (and (value_type (multi_lane 16 8))
                                      x))
                     (swiden_low (and (value_type (multi_lane 16 8))
@@ -1023,7 +1023,7 @@
       (x64_punpcklwd lo hi)))

 ;; Special case for `i64x2.extmul_low_i32x4_s`.
-(rule (lower (has_type (multi_lane 64 2)
+(rule 1 (lower (has_type (multi_lane 64 2)
               (imul (swiden_low (and (value_type (multi_lane 32 4))
                                      x))
                     (swiden_low (and (value_type (multi_lane 32 4))
@@ -1037,7 +1037,7 @@
       (x64_pmuldq x2 y2)))

 ;; Special case for `i16x8.extmul_high_i8x16_u`.
-(rule (lower (has_type (multi_lane 16 8)
+(rule 1 (lower (has_type (multi_lane 16 8)
               (imul (uwiden_high (and (value_type (multi_lane 8 16))
                                       x))
                     (uwiden_high (and (value_type (multi_lane 8 16))
@@ -1051,7 +1051,7 @@
       (x64_pmullw x3 y3)))

 ;; Special case for `i32x4.extmul_high_i16x8_u`.
-(rule (lower (has_type (multi_lane 32 4)
+(rule 1 (lower (has_type (multi_lane 32 4)
               (imul (uwiden_high (and (value_type (multi_lane 16 8))
                                       x))
                     (uwiden_high (and (value_type (multi_lane 16 8))
@@ -1063,7 +1063,7 @@
       (x64_punpckhwd lo hi)))

 ;; Special case for `i64x2.extmul_high_i32x4_u`.
-(rule (lower (has_type (multi_lane 64 2)
+(rule 1 (lower (has_type (multi_lane 64 2)
               (imul (uwiden_high (and (value_type (multi_lane 32 4))
                                       x))
                     (uwiden_high (and (value_type (multi_lane 32 4))
@@ -1077,7 +1077,7 @@
       (x64_pmuludq x2 y2)))

 ;; Special case for `i16x8.extmul_low_i8x16_u`.
-(rule (lower (has_type (multi_lane 16 8)
+(rule 1 (lower (has_type (multi_lane 16 8)
               (imul (uwiden_low (and (value_type (multi_lane 8 16))
                                      x))
                     (uwiden_low (and (value_type (multi_lane 8 16))
@@ -1087,7 +1087,7 @@
       (x64_pmullw x2 y2)))

 ;; Special case for `i32x4.extmul_low_i16x8_u`.
-(rule (lower (has_type (multi_lane 32 4)
+(rule 1 (lower (has_type (multi_lane 32 4)
               (imul (uwiden_low (and (value_type (multi_lane 16 8))
                                      x))
                     (uwiden_low (and (value_type (multi_lane 16 8))
@@ -1099,7 +1099,7 @@
       (x64_punpcklwd lo hi)))

 ;; Special case for `i64x2.extmul_low_i32x4_u`.
-(rule (lower (has_type (multi_lane 64 2)
+(rule 1 (lower (has_type (multi_lane 64 2)
               (imul (uwiden_low (and (value_type (multi_lane 32 4))
                                      x))
                     (uwiden_low (and (value_type (multi_lane 32 4))
cranelift/filetests/filetests/isa/x64/simd-widen-mul.clif (new file, 246 lines)
@@ -0,0 +1,246 @@
test compile precise-output
set enable_simd
target x86_64

function %imul_swiden_hi_i8x16(i8x16, i8x16) -> i16x8 {
block0(v0: i8x16, v1: i8x16):
    v2 = swiden_high v0
    v3 = swiden_high v1
    v4 = imul v2, v3
    return v4
}

; pushq %rbp
; movq %rsp, %rbp
; block0:
; movdqa %xmm0, %xmm3
; palignr $8, %xmm3, %xmm0, %xmm3
; pmovsxbw %xmm3, %xmm0
; movdqa %xmm1, %xmm7
; palignr $8, %xmm7, %xmm1, %xmm7
; pmovsxbw %xmm7, %xmm9
; pmullw %xmm0, %xmm9, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret

function %imul_swiden_hi_i16x8(i16x8, i16x8) -> i32x4 {
block0(v0: i16x8, v1: i16x8):
    v2 = swiden_high v0
    v3 = swiden_high v1
    v4 = imul v2, v3
    return v4
}

; pushq %rbp
; movq %rsp, %rbp
; block0:
; movdqa %xmm0, %xmm5
; pmullw %xmm5, %xmm1, %xmm5
; movdqa %xmm5, %xmm6
; movdqa %xmm0, %xmm5
; pmulhw %xmm5, %xmm1, %xmm5
; movdqa %xmm6, %xmm0
; punpckhwd %xmm0, %xmm5, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret

function %imul_swiden_hi_i32x4(i32x4, i32x4) -> i64x2 {
block0(v0: i32x4, v1: i32x4):
    v2 = swiden_high v0
    v3 = swiden_high v1
    v4 = imul v2, v3
    return v4
}

; pushq %rbp
; movq %rsp, %rbp
; block0:
; pshufd $250, %xmm0, %xmm0
; pshufd $250, %xmm1, %xmm5
; pmuldq %xmm0, %xmm5, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret

function %imul_swiden_low_i8x16(i8x16, i8x16) -> i16x8 {
block0(v0: i8x16, v1: i8x16):
    v2 = swiden_low v0
    v3 = swiden_low v1
    v4 = imul v2, v3
    return v4
}

; pushq %rbp
; movq %rsp, %rbp
; block0:
; pmovsxbw %xmm0, %xmm0
; pmovsxbw %xmm1, %xmm5
; pmullw %xmm0, %xmm5, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret

function %imul_swiden_low_i16x8(i16x8, i16x8) -> i32x4 {
block0(v0: i16x8, v1: i16x8):
    v2 = swiden_low v0
    v3 = swiden_low v1
    v4 = imul v2, v3
    return v4
}

; pushq %rbp
; movq %rsp, %rbp
; block0:
; movdqa %xmm0, %xmm5
; pmullw %xmm5, %xmm1, %xmm5
; movdqa %xmm5, %xmm6
; movdqa %xmm0, %xmm5
; pmulhw %xmm5, %xmm1, %xmm5
; movdqa %xmm6, %xmm0
; punpcklwd %xmm0, %xmm5, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret

function %imul_swiden_low_i32x4(i32x4, i32x4) -> i64x2 {
block0(v0: i32x4, v1: i32x4):
    v2 = swiden_low v0
    v3 = swiden_low v1
    v4 = imul v2, v3
    return v4
}

; pushq %rbp
; movq %rsp, %rbp
; block0:
; pshufd $80, %xmm0, %xmm0
; pshufd $80, %xmm1, %xmm5
; pmuldq %xmm0, %xmm5, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret

function %imul_uwiden_hi_i8x16(i8x16, i8x16) -> i16x8 {
block0(v0: i8x16, v1: i8x16):
    v2 = uwiden_high v0
    v3 = uwiden_high v1
    v4 = imul v2, v3
    return v4
}

; pushq %rbp
; movq %rsp, %rbp
; block0:
; movdqa %xmm0, %xmm3
; palignr $8, %xmm3, %xmm0, %xmm3
; pmovzxbw %xmm3, %xmm0
; movdqa %xmm1, %xmm7
; palignr $8, %xmm7, %xmm1, %xmm7
; pmovzxbw %xmm7, %xmm9
; pmullw %xmm0, %xmm9, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret

function %imul_uwiden_hi_i16x8(i16x8, i16x8) -> i32x4 {
block0(v0: i16x8, v1: i16x8):
    v2 = uwiden_high v0
    v3 = uwiden_high v1
    v4 = imul v2, v3
    return v4
}

; pushq %rbp
; movq %rsp, %rbp
; block0:
; movdqa %xmm0, %xmm5
; pmullw %xmm5, %xmm1, %xmm5
; movdqa %xmm5, %xmm6
; movdqa %xmm0, %xmm5
; pmulhuw %xmm5, %xmm1, %xmm5
; movdqa %xmm6, %xmm0
; punpckhwd %xmm0, %xmm5, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret

function %imul_uwiden_hi_i32x4(i32x4, i32x4) -> i64x2 {
block0(v0: i32x4, v1: i32x4):
    v2 = uwiden_high v0
    v3 = uwiden_high v1
    v4 = imul v2, v3
    return v4
}

; pushq %rbp
; movq %rsp, %rbp
; block0:
; pshufd $250, %xmm0, %xmm0
; pshufd $250, %xmm1, %xmm5
; pmuludq %xmm0, %xmm5, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret

function %imul_uwiden_low_i8x16(i8x16, i8x16) -> i16x8 {
block0(v0: i8x16, v1: i8x16):
    v2 = uwiden_low v0
    v3 = uwiden_low v1
    v4 = imul v2, v3
    return v4
}

; pushq %rbp
; movq %rsp, %rbp
; block0:
; pmovzxbw %xmm0, %xmm0
; pmovzxbw %xmm1, %xmm5
; pmullw %xmm0, %xmm5, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret

function %imul_uwiden_low_i16x8(i16x8, i16x8) -> i32x4 {
block0(v0: i16x8, v1: i16x8):
    v2 = uwiden_low v0
    v3 = uwiden_low v1
    v4 = imul v2, v3
    return v4
}

; pushq %rbp
; movq %rsp, %rbp
; block0:
; movdqa %xmm0, %xmm5
; pmullw %xmm5, %xmm1, %xmm5
; movdqa %xmm5, %xmm6
; movdqa %xmm0, %xmm5
; pmulhuw %xmm5, %xmm1, %xmm5
; movdqa %xmm6, %xmm0
; punpcklwd %xmm0, %xmm5, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret

function %imul_uwiden_low_i32x4(i32x4, i32x4) -> i64x2 {
block0(v0: i32x4, v1: i32x4):
    v2 = uwiden_low v0
    v3 = uwiden_low v1
    v4 = imul v2, v3
    return v4
}

; pushq %rbp
; movq %rsp, %rbp
; block0:
; pshufd $80, %xmm0, %xmm0
; pshufd $80, %xmm1, %xmm5
; pmuludq %xmm0, %xmm5, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
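(A note on the test format: `test compile precise-output` asserts that compilation produces exactly the assembly recorded in the `;` comments above, so these tests will fail if the `imul` specializations stop firing. As a usage sketch based on the standard Cranelift filetest workflow, where the exact invocation may differ by checkout: the file can be run with `cargo run --bin clif-util -- test cranelift/filetests/filetests/isa/x64/simd-widen-mul.clif`, and the recorded expectations can be regenerated by setting `CRANELIFT_TEST_BLESS=1` when running the filetests.)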