Fix rule shadowing instances in x64 and aarch64 backends (#5334)

Fix the rule shadowing identified in #5322 for combinations of imul with swiden_high/swiden_low/uwiden_high/uwiden_low in the x64 backend, and remove some redundant rules from the aarch64 dynamic NEON ruleset. Additionally, add tests to the x64 backend showing that the imul specializations are firing.
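
For context on what "shadowing" means here: in the ISLE lowering rules, a general pattern for imul and a more specific pattern for imul of widened operands can both match the same input, and unless the specific rule is distinguished (for example by an explicit priority), the general rule wins and the specialization never fires. The sketch below only illustrates that shape; it is not the rule text from this commit, and the constructor and extractor names (multi_lane, swiden_low, x64_pmovsxbw, x64_pmullw) are assumptions modeled on Cranelift's x64 ISLE definitions.

;; Illustrative sketch, not the actual rules changed by this commit.
;; General lowering for an i16x8 multiply:
(rule (lower (has_type (multi_lane 16 8) (imul x y)))
      (x64_pmullw x y))

;; Specialization for multiplying the sign-extended low halves of two
;; i8x16 vectors. The explicit priority (1) keeps it from being shadowed
;; by the general rule above, so pmovsxbw + pmullw is actually selected,
;; matching the expected output in the tests below.
(rule 1 (lower (has_type (multi_lane 16 8)
                         (imul (swiden_low x) (swiden_low y))))
      (x64_pmullw (x64_pmovsxbw x) (x64_pmovsxbw y)))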
This commit is contained in:
Trevor Elliott
2022-11-28 15:48:34 -08:00
committed by GitHub
parent d6d3c49972
commit 368004428a
3 changed files with 263 additions and 27 deletions

@@ -0,0 +1,246 @@
test compile precise-output
set enable_simd
target x86_64

function %imul_swiden_hi_i8x16(i8x16, i8x16) -> i16x8 {
block0(v0: i8x16, v1: i8x16):
v2 = swiden_high v0
v3 = swiden_high v1
v4 = imul v2, v3
return v4
}

; pushq %rbp
; movq %rsp, %rbp
; block0:
; movdqa %xmm0, %xmm3
; palignr $8, %xmm3, %xmm0, %xmm3
; pmovsxbw %xmm3, %xmm0
; movdqa %xmm1, %xmm7
; palignr $8, %xmm7, %xmm1, %xmm7
; pmovsxbw %xmm7, %xmm9
; pmullw %xmm0, %xmm9, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret

function %imul_swiden_hi_i16x8(i16x8, i16x8) -> i32x4 {
block0(v0: i16x8, v1: i16x8):
v2 = swiden_high v0
v3 = swiden_high v1
v4 = imul v2, v3
return v4
}

; pushq %rbp
; movq %rsp, %rbp
; block0:
; movdqa %xmm0, %xmm5
; pmullw %xmm5, %xmm1, %xmm5
; movdqa %xmm5, %xmm6
; movdqa %xmm0, %xmm5
; pmulhw %xmm5, %xmm1, %xmm5
; movdqa %xmm6, %xmm0
; punpckhwd %xmm0, %xmm5, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret

function %imul_swiden_hi_i32x4(i32x4, i32x4) -> i64x2 {
block0(v0: i32x4, v1: i32x4):
v2 = swiden_high v0
v3 = swiden_high v1
v4 = imul v2, v3
return v4
}

; pushq %rbp
; movq %rsp, %rbp
; block0:
; pshufd $250, %xmm0, %xmm0
; pshufd $250, %xmm1, %xmm5
; pmuldq %xmm0, %xmm5, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret

function %imul_swiden_low_i8x16(i8x16, i8x16) -> i16x8 {
block0(v0: i8x16, v1: i8x16):
v2 = swiden_low v0
v3 = swiden_low v1
v4 = imul v2, v3
return v4
}

; pushq %rbp
; movq %rsp, %rbp
; block0:
; pmovsxbw %xmm0, %xmm0
; pmovsxbw %xmm1, %xmm5
; pmullw %xmm0, %xmm5, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret

function %imul_swiden_low_i16x8(i16x8, i16x8) -> i32x4 {
block0(v0: i16x8, v1: i16x8):
v2 = swiden_low v0
v3 = swiden_low v1
v4 = imul v2, v3
return v4
}

; pushq %rbp
; movq %rsp, %rbp
; block0:
; movdqa %xmm0, %xmm5
; pmullw %xmm5, %xmm1, %xmm5
; movdqa %xmm5, %xmm6
; movdqa %xmm0, %xmm5
; pmulhw %xmm5, %xmm1, %xmm5
; movdqa %xmm6, %xmm0
; punpcklwd %xmm0, %xmm5, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret

function %imul_swiden_low_i32x4(i32x4, i32x4) -> i64x2 {
block0(v0: i32x4, v1: i32x4):
v2 = swiden_low v0
v3 = swiden_low v1
v4 = imul v2, v3
return v4
}

; pushq %rbp
; movq %rsp, %rbp
; block0:
; pshufd $80, %xmm0, %xmm0
; pshufd $80, %xmm1, %xmm5
; pmuldq %xmm0, %xmm5, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret

function %imul_uwiden_hi_i8x16(i8x16, i8x16) -> i16x8 {
block0(v0: i8x16, v1: i8x16):
v2 = uwiden_high v0
v3 = uwiden_high v1
v4 = imul v2, v3
return v4
}

; pushq %rbp
; movq %rsp, %rbp
; block0:
; movdqa %xmm0, %xmm3
; palignr $8, %xmm3, %xmm0, %xmm3
; pmovzxbw %xmm3, %xmm0
; movdqa %xmm1, %xmm7
; palignr $8, %xmm7, %xmm1, %xmm7
; pmovzxbw %xmm7, %xmm9
; pmullw %xmm0, %xmm9, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret

function %imul_uwiden_hi_i16x8(i16x8, i16x8) -> i32x4 {
block0(v0: i16x8, v1: i16x8):
v2 = uwiden_high v0
v3 = uwiden_high v1
v4 = imul v2, v3
return v4
}

; pushq %rbp
; movq %rsp, %rbp
; block0:
; movdqa %xmm0, %xmm5
; pmullw %xmm5, %xmm1, %xmm5
; movdqa %xmm5, %xmm6
; movdqa %xmm0, %xmm5
; pmulhuw %xmm5, %xmm1, %xmm5
; movdqa %xmm6, %xmm0
; punpckhwd %xmm0, %xmm5, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret

function %imul_uwiden_hi_i32x4(i32x4, i32x4) -> i64x2 {
block0(v0: i32x4, v1: i32x4):
v2 = uwiden_high v0
v3 = uwiden_high v1
v4 = imul v2, v3
return v4
}

; pushq %rbp
; movq %rsp, %rbp
; block0:
; pshufd $250, %xmm0, %xmm0
; pshufd $250, %xmm1, %xmm5
; pmuludq %xmm0, %xmm5, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret

function %imul_uwiden_low_i8x16(i8x16, i8x16) -> i16x8 {
block0(v0: i8x16, v1: i8x16):
v2 = uwiden_low v0
v3 = uwiden_low v1
v4 = imul v2, v3
return v4
}

; pushq %rbp
; movq %rsp, %rbp
; block0:
; pmovzxbw %xmm0, %xmm0
; pmovzxbw %xmm1, %xmm5
; pmullw %xmm0, %xmm5, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret

function %imul_uwiden_low_i16x8(i16x8, i16x8) -> i32x4 {
block0(v0: i16x8, v1: i16x8):
v2 = uwiden_low v0
v3 = uwiden_low v1
v4 = imul v2, v3
return v4
}

; pushq %rbp
; movq %rsp, %rbp
; block0:
; movdqa %xmm0, %xmm5
; pmullw %xmm5, %xmm1, %xmm5
; movdqa %xmm5, %xmm6
; movdqa %xmm0, %xmm5
; pmulhuw %xmm5, %xmm1, %xmm5
; movdqa %xmm6, %xmm0
; punpcklwd %xmm0, %xmm5, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret

function %imul_uwiden_low_i32x4(i32x4, i32x4) -> i64x2 {
block0(v0: i32x4, v1: i32x4):
v2 = uwiden_low v0
v3 = uwiden_low v1
v4 = imul v2, v3
return v4
}

; pushq %rbp
; movq %rsp, %rbp
; block0:
; pshufd $80, %xmm0, %xmm0
; pshufd $80, %xmm1, %xmm5
; pmuludq %xmm0, %xmm5, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret