Fix rule shadowing instances in x64 and aarch64 backends (#5334)

Fix shadowing identified in #5322 for imul and swiden_high/swiden_low/uwiden_high/uwiden_low combinations in the x64 backend, and remove some redundant rules from the aarch64 dynamic neon ruleset. Additionally, add tests to the x64 backend showing that the imul specializations are firing.
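For context on the fix: ISLE attempts higher-priority rules first (the default priority is 0), so a catch-all rule at priority 1 completely shadows a default-priority rule that matches a strict subset of the same instructions. Below is a minimal sketch of the before/after shape, with illustrative rule bodies (lower_extmul is a hypothetical helper, not a real constructor):

;; Before: the priority-1 general rule always wins, so the specialized
;; extmul pattern below it can never fire.
(rule 1 (lower (has_type (multi_lane 16 8) (imul x y)))
        (x64_pmullw x y))
(rule (lower (has_type (multi_lane 16 8)
                       (imul (swiden_high x) (swiden_high y))))
      (lower_extmul x y))  ;; hypothetical helper; unreachable here

;; After: the specialized pattern carries priority 1 and the general rule
;; drops to the default, so most-specific-first matching is restored.
(rule (lower (has_type (multi_lane 16 8) (imul x y)))
      (x64_pmullw x y))
(rule 1 (lower (has_type (multi_lane 16 8)
                         (imul (swiden_high x) (swiden_high y))))
        (lower_extmul x y))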
Author: Trevor Elliott
Date: 2022-11-28 15:48:34 -08:00
Committed by: GitHub
Parent: d6d3c49972
Commit: 368004428a

3 changed files with 263 additions and 27 deletions

File: cranelift/codegen/src/isa/aarch64/lower_dynamic_neon.isle

@@ -99,22 +99,12 @@
 (rule (lower (extract_vector x 0))
       (value_reg (fpu_move_128 (put_in_reg x))))
 
-;;;; Rules for `swiden_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-(rule -1 (lower (has_type ty (swiden_low x)))
-      (value_reg (vec_extend (VecExtendOp.Sxtl) x $false (lane_size ty))))
-
 ;;;; Rules for `swiden_high` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (rule -1 (lower (has_type ty (swiden_high x)))
-      (value_reg (vec_extend (VecExtendOp.Sxtl) x $true (lane_size ty))))
+      (vec_extend (VecExtendOp.Sxtl) x $true (lane_size ty)))
 
-;;;; Rules for `uwiden_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-(rule -1 (lower (has_type ty (uwiden_low x)))
-      (value_reg (vec_extend (VecExtendOp.Uxtl) x $false (lane_size ty))))
-
 ;;;; Rules for `uwiden_high` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (rule -1 (lower (has_type ty (uwiden_high x)))
-      (value_reg (vec_extend (VecExtendOp.Uxtl) x $true (lane_size ty))))
+      (vec_extend (VecExtendOp.Uxtl) x $true (lane_size ty)))

File: cranelift/codegen/src/isa/x64/lower.isle

@@ -905,10 +905,10 @@
 ;; (No i8x16 multiply.)
-(rule 1 (lower (has_type (multi_lane 16 8) (imul x y)))
+(rule (lower (has_type (multi_lane 16 8) (imul x y)))
       (x64_pmullw x y))
 
-(rule 1 (lower (has_type (multi_lane 32 4) (imul x y)))
+(rule (lower (has_type (multi_lane 32 4) (imul x y)))
       (x64_pmulld x y))
 
 ;; With AVX-512 we can implement `i64x2` multiplication with a single
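With the general rules now at the default priority, a plain vector multiply with no widening inputs still has exactly one matching rule. A minimal CLIF sketch of that case (illustrative, not one of this commit's tests):

function %imul_plain_i16x8(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
v2 = imul v0, v1  ; no swiden/uwiden operand: only the general rule matches
return v2         ; expected to lower to pmullw
}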
@@ -939,7 +939,7 @@
 ;; the lane of the destination. For this reason we don't need shifts to isolate
 ;; the lower 32-bits, however, we will need to use shifts to isolate the high
 ;; 32-bits when doing calculations, i.e., `Ah == A >> 32`.
-(rule 1 (lower (has_type (multi_lane 64 2)
+(rule (lower (has_type (multi_lane 64 2)
               (imul a b)))
   (let ((a0 Xmm a)
         (b0 Xmm b)
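Spelling out the arithmetic behind the comment above, per 64-bit lane with A == (Ah << 32) | Al (and likewise B):

;; A * B = (Ah*2^32 + Al) * (Bh*2^32 + Bl)
;;       = Ah*Bh*2^64 + (Ah*Bl + Al*Bh)*2^32 + Al*Bl
;;       = (Ah*Bl + Al*Bh)*2^32 + Al*Bl        (mod 2^64)

The Ah*Bh term vanishes modulo 2^64, which is why the rule below needs only three 32x32->64 multiplies (pmuludq), a shift, and adds.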
@@ -961,7 +961,7 @@
     (x64_paddq al_bl aa_bb_shifted)))
 
 ;; Special case for `i16x8.extmul_high_i8x16_s`.
-(rule (lower (has_type (multi_lane 16 8)
+(rule 1 (lower (has_type (multi_lane 16 8)
               (imul (swiden_high (and (value_type (multi_lane 8 16))
                                       x))
                     (swiden_high (and (value_type (multi_lane 8 16))
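A note on the pattern shape, which recurs in every specialization below: (and (value_type (multi_lane 8 16)) x) matches a single operand against both sub-patterns at once, binding it as x while also requiring its type to be i8x16. This is how each rule pins down the narrower input type of the widening operation.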
@@ -975,7 +975,7 @@
     (x64_pmullw x3 y3)))
 
 ;; Special case for `i32x4.extmul_high_i16x8_s`.
-(rule (lower (has_type (multi_lane 32 4)
+(rule 1 (lower (has_type (multi_lane 32 4)
               (imul (swiden_high (and (value_type (multi_lane 16 8))
                                       x))
                     (swiden_high (and (value_type (multi_lane 16 8))
@@ -987,7 +987,7 @@
     (x64_punpckhwd lo hi)))
 
 ;; Special case for `i64x2.extmul_high_i32x4_s`.
-(rule (lower (has_type (multi_lane 64 2)
+(rule 1 (lower (has_type (multi_lane 64 2)
               (imul (swiden_high (and (value_type (multi_lane 32 4))
                                       x))
                     (swiden_high (and (value_type (multi_lane 32 4))
@@ -1001,7 +1001,7 @@
     (x64_pmuldq x2 y2)))
 
 ;; Special case for `i16x8.extmul_low_i8x16_s`.
-(rule (lower (has_type (multi_lane 16 8)
+(rule 1 (lower (has_type (multi_lane 16 8)
               (imul (swiden_low (and (value_type (multi_lane 8 16))
                                      x))
                     (swiden_low (and (value_type (multi_lane 8 16))
@@ -1011,7 +1011,7 @@
     (x64_pmullw x2 y2)))
 
 ;; Special case for `i32x4.extmul_low_i16x8_s`.
-(rule (lower (has_type (multi_lane 32 4)
+(rule 1 (lower (has_type (multi_lane 32 4)
               (imul (swiden_low (and (value_type (multi_lane 16 8))
                                      x))
                     (swiden_low (and (value_type (multi_lane 16 8))
@@ -1023,7 +1023,7 @@
     (x64_punpcklwd lo hi)))
 
 ;; Special case for `i64x2.extmul_low_i32x4_s`.
-(rule (lower (has_type (multi_lane 64 2)
+(rule 1 (lower (has_type (multi_lane 64 2)
               (imul (swiden_low (and (value_type (multi_lane 32 4))
                                      x))
                     (swiden_low (and (value_type (multi_lane 32 4))
@@ -1037,7 +1037,7 @@
     (x64_pmuldq x2 y2)))
 
 ;; Special case for `i16x8.extmul_high_i8x16_u`.
-(rule (lower (has_type (multi_lane 16 8)
+(rule 1 (lower (has_type (multi_lane 16 8)
               (imul (uwiden_high (and (value_type (multi_lane 8 16))
                                       x))
                     (uwiden_high (and (value_type (multi_lane 8 16))
@@ -1051,7 +1051,7 @@
     (x64_pmullw x3 y3)))
 
 ;; Special case for `i32x4.extmul_high_i16x8_u`.
-(rule (lower (has_type (multi_lane 32 4)
+(rule 1 (lower (has_type (multi_lane 32 4)
               (imul (uwiden_high (and (value_type (multi_lane 16 8))
                                       x))
                     (uwiden_high (and (value_type (multi_lane 16 8))
@@ -1063,7 +1063,7 @@
     (x64_punpckhwd lo hi)))
 
 ;; Special case for `i64x2.extmul_high_i32x4_u`.
-(rule (lower (has_type (multi_lane 64 2)
+(rule 1 (lower (has_type (multi_lane 64 2)
               (imul (uwiden_high (and (value_type (multi_lane 32 4))
                                       x))
                     (uwiden_high (and (value_type (multi_lane 32 4))
@@ -1077,7 +1077,7 @@
     (x64_pmuludq x2 y2)))
 
 ;; Special case for `i16x8.extmul_low_i8x16_u`.
-(rule (lower (has_type (multi_lane 16 8)
+(rule 1 (lower (has_type (multi_lane 16 8)
               (imul (uwiden_low (and (value_type (multi_lane 8 16))
                                      x))
                     (uwiden_low (and (value_type (multi_lane 8 16))
@@ -1087,7 +1087,7 @@
     (x64_pmullw x2 y2)))
 
 ;; Special case for `i32x4.extmul_low_i16x8_u`.
-(rule (lower (has_type (multi_lane 32 4)
+(rule 1 (lower (has_type (multi_lane 32 4)
               (imul (uwiden_low (and (value_type (multi_lane 16 8))
                                      x))
                     (uwiden_low (and (value_type (multi_lane 16 8))
@@ -1099,7 +1099,7 @@
     (x64_punpcklwd lo hi)))
 
 ;; Special case for `i64x2.extmul_low_i32x4_u`.
-(rule (lower (has_type (multi_lane 64 2)
+(rule 1 (lower (has_type (multi_lane 64 2)
               (imul (uwiden_low (and (value_type (multi_lane 32 4))
                                      x))
                     (uwiden_low (and (value_type (multi_lane 32 4))
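In total, the three general vector imul rules dropped from priority 1 to the default 0, and all twelve extmul-style specializations rose from the default to priority 1, so each specialization now takes precedence over the catch-all instead of being shadowed by it.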

File: cranelift/filetests/filetests/isa/x64/simd-widen-mul.clif (new file)

@@ -0,0 +1,246 @@
test compile precise-output
set enable_simd
target x86_64

function %imul_swiden_hi_i8x16(i8x16, i8x16) -> i16x8 {
block0(v0: i8x16, v1: i8x16):
v2 = swiden_high v0
v3 = swiden_high v1
v4 = imul v2, v3
return v4
}

; pushq %rbp
; movq %rsp, %rbp
; block0:
; movdqa %xmm0, %xmm3
; palignr $8, %xmm3, %xmm0, %xmm3
; pmovsxbw %xmm3, %xmm0
; movdqa %xmm1, %xmm7
; palignr $8, %xmm7, %xmm1, %xmm7
; pmovsxbw %xmm7, %xmm9
; pmullw %xmm0, %xmm9, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret

function %imul_swiden_hi_i16x8(i16x8, i16x8) -> i32x4 {
block0(v0: i16x8, v1: i16x8):
v2 = swiden_high v0
v3 = swiden_high v1
v4 = imul v2, v3
return v4
}

; pushq %rbp
; movq %rsp, %rbp
; block0:
; movdqa %xmm0, %xmm5
; pmullw %xmm5, %xmm1, %xmm5
; movdqa %xmm5, %xmm6
; movdqa %xmm0, %xmm5
; pmulhw %xmm5, %xmm1, %xmm5
; movdqa %xmm6, %xmm0
; punpckhwd %xmm0, %xmm5, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret

function %imul_swiden_hi_i32x4(i32x4, i32x4) -> i64x2 {
block0(v0: i32x4, v1: i32x4):
v2 = swiden_high v0
v3 = swiden_high v1
v4 = imul v2, v3
return v4
}

; pushq %rbp
; movq %rsp, %rbp
; block0:
; pshufd $250, %xmm0, %xmm0
; pshufd $250, %xmm1, %xmm5
; pmuldq %xmm0, %xmm5, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret

function %imul_swiden_low_i8x16(i8x16, i8x16) -> i16x8 {
block0(v0: i8x16, v1: i8x16):
v2 = swiden_low v0
v3 = swiden_low v1
v4 = imul v2, v3
return v4
}

; pushq %rbp
; movq %rsp, %rbp
; block0:
; pmovsxbw %xmm0, %xmm0
; pmovsxbw %xmm1, %xmm5
; pmullw %xmm0, %xmm5, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret

function %imul_swiden_low_i16x8(i16x8, i16x8) -> i32x4 {
block0(v0: i16x8, v1: i16x8):
v2 = swiden_low v0
v3 = swiden_low v1
v4 = imul v2, v3
return v4
}

; pushq %rbp
; movq %rsp, %rbp
; block0:
; movdqa %xmm0, %xmm5
; pmullw %xmm5, %xmm1, %xmm5
; movdqa %xmm5, %xmm6
; movdqa %xmm0, %xmm5
; pmulhw %xmm5, %xmm1, %xmm5
; movdqa %xmm6, %xmm0
; punpcklwd %xmm0, %xmm5, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret

function %imul_swiden_low_i32x4(i32x4, i32x4) -> i64x2 {
block0(v0: i32x4, v1: i32x4):
v2 = swiden_low v0
v3 = swiden_low v1
v4 = imul v2, v3
return v4
}

; pushq %rbp
; movq %rsp, %rbp
; block0:
; pshufd $80, %xmm0, %xmm0
; pshufd $80, %xmm1, %xmm5
; pmuldq %xmm0, %xmm5, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret

function %imul_uwiden_hi_i8x16(i8x16, i8x16) -> i16x8 {
block0(v0: i8x16, v1: i8x16):
v2 = uwiden_high v0
v3 = uwiden_high v1
v4 = imul v2, v3
return v4
}

; pushq %rbp
; movq %rsp, %rbp
; block0:
; movdqa %xmm0, %xmm3
; palignr $8, %xmm3, %xmm0, %xmm3
; pmovzxbw %xmm3, %xmm0
; movdqa %xmm1, %xmm7
; palignr $8, %xmm7, %xmm1, %xmm7
; pmovzxbw %xmm7, %xmm9
; pmullw %xmm0, %xmm9, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret

function %imul_uwiden_hi_i16x8(i16x8, i16x8) -> i32x4 {
block0(v0: i16x8, v1: i16x8):
v2 = uwiden_high v0
v3 = uwiden_high v1
v4 = imul v2, v3
return v4
}

; pushq %rbp
; movq %rsp, %rbp
; block0:
; movdqa %xmm0, %xmm5
; pmullw %xmm5, %xmm1, %xmm5
; movdqa %xmm5, %xmm6
; movdqa %xmm0, %xmm5
; pmulhuw %xmm5, %xmm1, %xmm5
; movdqa %xmm6, %xmm0
; punpckhwd %xmm0, %xmm5, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret

function %imul_uwiden_hi_i32x4(i32x4, i32x4) -> i64x2 {
block0(v0: i32x4, v1: i32x4):
v2 = uwiden_high v0
v3 = uwiden_high v1
v4 = imul v2, v3
return v4
}

; pushq %rbp
; movq %rsp, %rbp
; block0:
; pshufd $250, %xmm0, %xmm0
; pshufd $250, %xmm1, %xmm5
; pmuludq %xmm0, %xmm5, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret

function %imul_uwiden_low_i8x16(i8x16, i8x16) -> i16x8 {
block0(v0: i8x16, v1: i8x16):
v2 = uwiden_low v0
v3 = uwiden_low v1
v4 = imul v2, v3
return v4
}

; pushq %rbp
; movq %rsp, %rbp
; block0:
; pmovzxbw %xmm0, %xmm0
; pmovzxbw %xmm1, %xmm5
; pmullw %xmm0, %xmm5, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret

function %imul_uwiden_low_i16x8(i16x8, i16x8) -> i32x4 {
block0(v0: i16x8, v1: i16x8):
v2 = uwiden_low v0
v3 = uwiden_low v1
v4 = imul v2, v3
return v4
}

; pushq %rbp
; movq %rsp, %rbp
; block0:
; movdqa %xmm0, %xmm5
; pmullw %xmm5, %xmm1, %xmm5
; movdqa %xmm5, %xmm6
; movdqa %xmm0, %xmm5
; pmulhuw %xmm5, %xmm1, %xmm5
; movdqa %xmm6, %xmm0
; punpcklwd %xmm0, %xmm5, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret

function %imul_uwiden_low_i32x4(i32x4, i32x4) -> i64x2 {
block0(v0: i32x4, v1: i32x4):
v2 = uwiden_low v0
v3 = uwiden_low v1
v4 = imul v2, v3
return v4
}

; pushq %rbp
; movq %rsp, %rbp
; block0:
; pshufd $80, %xmm0, %xmm0
; pshufd $80, %xmm1, %xmm5
; pmuludq %xmm0, %xmm5, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
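These precise-output expectations are verified by Cranelift's filetest harness. Assuming the usual workflow, something like cargo run -p cranelift-tools -- test <path-to-this-file> should run just this test, and rerunning with CRANELIFT_TEST_BLESS=1 set regenerates the expected assembly after a lowering change.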