x64: Fix vbroadcastss with AVX2 and without AVX (#6060)

* x64: Fix vbroadcastss with AVX2 and without AVX

This commit fixes a corner case in the emission of the
`vbroadcasts{s,d}` instructions. The memory-to-xmm form of these
instructions was available with the AVX instruction set, but the
xmm-to-xmm form of these instructions wasn't available until AVX2.
The instruction requirement for these are listed as AVX but the lowering
rules are appropriately annotated to use either AVX2 or AVX when
appropriate.

While this should work in practice this didn't work for the assertion
about enabled features for each instruction. The `vbroadcastss`
instruction was listed as requiring AVX but could get emitted when AVX2
was enabled (due to the reg-to-reg form being available). This caused an
issue for the fuzzer where AVX2 was enabled but AVX was disabled.

One possible fix would be to add more opcodes, one for reg-to-reg and
one for mem-to-reg. That seemed like somewhat overkill for a pretty
niche situation that shouldn't actually come up in practice anywhere.
Instead this commit changes all the `has_avx` accessors to the
`use_avx_simd` predicate already available in the target flags. The
`use_avx2_simd` predicate was then updated to additionally require
`has_avx`, so if AVX2 is enabled and AVX is disabled then the
`vbroadcastss` instruction won't get emitted any more.

Closes #6059

* Pass `enable_simd` on a few more files
This commit is contained in:
Alex Crichton
2023-03-18 13:38:03 -05:00
committed by GitHub
parent d72010b749
commit f7dda1ab2c
7 changed files with 186 additions and 181 deletions

View File

@@ -138,7 +138,10 @@ fn define_settings(shared: &SettingGroup) -> SettingGroup {
); );
settings.add_predicate("use_avx_simd", predicate!(shared_enable_simd && has_avx)); settings.add_predicate("use_avx_simd", predicate!(shared_enable_simd && has_avx));
settings.add_predicate("use_avx2_simd", predicate!(shared_enable_simd && has_avx2)); settings.add_predicate(
"use_avx2_simd",
predicate!(shared_enable_simd && has_avx && has_avx2),
);
settings.add_predicate( settings.add_predicate(
"use_avx512bitalg_simd", "use_avx512bitalg_simd",
predicate!(shared_enable_simd && has_avx512bitalg), predicate!(shared_enable_simd && has_avx512bitalg),

File diff suppressed because it is too large Load Diff

View File

@@ -3853,12 +3853,12 @@
(rule 0 (lower (has_type $I8X16 (splat src))) (rule 0 (lower (has_type $I8X16 (splat src)))
(x64_pshufb (bitcast_gpr_to_xmm $I32 src) (xmm_zero $I8X16))) (x64_pshufb (bitcast_gpr_to_xmm $I32 src) (xmm_zero $I8X16)))
(rule 1 (lower (has_type $I8X16 (splat src))) (rule 1 (lower (has_type $I8X16 (splat src)))
(if-let $true (has_avx2)) (if-let $true (use_avx2_simd))
(x64_vpbroadcastb (bitcast_gpr_to_xmm $I32 src))) (x64_vpbroadcastb (bitcast_gpr_to_xmm $I32 src)))
(rule 2 (lower (has_type $I8X16 (splat (sinkable_load_exact addr)))) (rule 2 (lower (has_type $I8X16 (splat (sinkable_load_exact addr))))
(x64_pshufb (x64_pinsrb (xmm_uninit_value) addr 0) (xmm_zero $I8X16))) (x64_pshufb (x64_pinsrb (xmm_uninit_value) addr 0) (xmm_zero $I8X16)))
(rule 3 (lower (has_type $I8X16 (splat (sinkable_load_exact addr)))) (rule 3 (lower (has_type $I8X16 (splat (sinkable_load_exact addr))))
(if-let $true (has_avx2)) (if-let $true (use_avx2_simd))
(x64_vpbroadcastb addr)) (x64_vpbroadcastb addr))
;; i16x8 splats: use `vpbroadcastw` on AVX2 and otherwise a 16-bit value is ;; i16x8 splats: use `vpbroadcastw` on AVX2 and otherwise a 16-bit value is
@@ -3869,12 +3869,12 @@
(rule 0 (lower (has_type $I16X8 (splat src))) (rule 0 (lower (has_type $I16X8 (splat src)))
(x64_pshufd (x64_pshuflw (bitcast_gpr_to_xmm $I32 src) 0) 0)) (x64_pshufd (x64_pshuflw (bitcast_gpr_to_xmm $I32 src) 0) 0))
(rule 1 (lower (has_type $I16X8 (splat src))) (rule 1 (lower (has_type $I16X8 (splat src)))
(if-let $true (has_avx2)) (if-let $true (use_avx2_simd))
(x64_vpbroadcastw (bitcast_gpr_to_xmm $I32 src))) (x64_vpbroadcastw (bitcast_gpr_to_xmm $I32 src)))
(rule 2 (lower (has_type $I16X8 (splat (sinkable_load_exact addr)))) (rule 2 (lower (has_type $I16X8 (splat (sinkable_load_exact addr))))
(x64_pshufd (x64_pshuflw (x64_pinsrw (xmm_uninit_value) addr 0) 0) 0)) (x64_pshufd (x64_pshuflw (x64_pinsrw (xmm_uninit_value) addr 0) 0) 0))
(rule 3 (lower (has_type $I16X8 (splat (sinkable_load_exact addr)))) (rule 3 (lower (has_type $I16X8 (splat (sinkable_load_exact addr))))
(if-let $true (has_avx2)) (if-let $true (use_avx2_simd))
(x64_vpbroadcastw addr)) (x64_vpbroadcastw addr))
;; i32x4.splat - use `vpbroadcastd` on AVX2 and otherwise `pshufd` can be ;; i32x4.splat - use `vpbroadcastd` on AVX2 and otherwise `pshufd` can be
@@ -3884,7 +3884,7 @@
(rule 0 (lower (has_type $I32X4 (splat src))) (rule 0 (lower (has_type $I32X4 (splat src)))
(x64_pshufd (bitcast_gpr_to_xmm $I32 src) 0)) (x64_pshufd (bitcast_gpr_to_xmm $I32 src) 0))
(rule 1 (lower (has_type $I32X4 (splat src))) (rule 1 (lower (has_type $I32X4 (splat src)))
(if-let $true (has_avx2)) (if-let $true (use_avx2_simd))
(x64_vpbroadcastd (bitcast_gpr_to_xmm $I32 src))) (x64_vpbroadcastd (bitcast_gpr_to_xmm $I32 src)))
;; f32x4.splat - the source is already in an xmm register so `shufps` is all ;; f32x4.splat - the source is already in an xmm register so `shufps` is all
@@ -3894,7 +3894,7 @@
(let ((tmp Xmm src)) (let ((tmp Xmm src))
(x64_shufps src src 0))) (x64_shufps src src 0)))
(rule 1 (lower (has_type $F32X4 (splat src))) (rule 1 (lower (has_type $F32X4 (splat src)))
(if-let $true (has_avx2)) (if-let $true (use_avx2_simd))
(x64_vbroadcastss src)) (x64_vbroadcastss src))
;; t32x4.splat of a load - use a `movss` to load into an xmm register and then ;; t32x4.splat of a load - use a `movss` to load into an xmm register and then
@@ -3905,12 +3905,12 @@
;; that the memory-operand encoding of `vbroadcastss` is usable with AVX, but ;; that the memory-operand encoding of `vbroadcastss` is usable with AVX, but
;; the register-based encoding is only available with AVX2. With the ;; the register-based encoding is only available with AVX2. With the
;; `sinkable_load` extractor this should be guaranteed to use the memory-based ;; `sinkable_load` extractor this should be guaranteed to use the memory-based
;; encoding hence the `has_avx` test. ;; encoding hence the `use_avx_simd` test.
(rule 4 (lower (has_type (multi_lane 32 4) (splat (sinkable_load addr)))) (rule 4 (lower (has_type (multi_lane 32 4) (splat (sinkable_load addr))))
(let ((tmp Xmm (x64_movss_load addr))) (let ((tmp Xmm (x64_movss_load addr)))
(x64_shufps tmp tmp 0))) (x64_shufps tmp tmp 0)))
(rule 5 (lower (has_type (multi_lane 32 4) (splat (sinkable_load addr)))) (rule 5 (lower (has_type (multi_lane 32 4) (splat (sinkable_load addr))))
(if-let $true (has_avx)) (if-let $true (use_avx_simd))
(x64_vbroadcastss addr)) (x64_vbroadcastss addr))
;; t64x2.splat - use `movddup` which is exactly what we want and there's a ;; t64x2.splat - use `movddup` which is exactly what we want and there's a

View File

@@ -170,13 +170,13 @@ impl Context for IsleContext<'_, '_, MInst, X64Backend> {
} }
#[inline] #[inline]
fn has_avx(&mut self) -> bool { fn use_avx_simd(&mut self) -> bool {
self.backend.x64_flags.has_avx() self.backend.x64_flags.use_avx_simd()
} }
#[inline] #[inline]
fn has_avx2(&mut self) -> bool { fn use_avx2_simd(&mut self) -> bool {
self.backend.x64_flags.has_avx2() self.backend.x64_flags.use_avx2_simd()
} }
#[inline] #[inline]

View File

@@ -1,4 +1,5 @@
test compile precise-output test compile precise-output
set enable_simd
target x86_64 has_avx target x86_64 has_avx
function %f1(i8x16) -> i8 { function %f1(i8x16) -> i8 {

View File

@@ -1,5 +1,6 @@
test interpret test interpret
test run test run
set enable_simd
target x86_64 has_avx has_fma target x86_64 has_avx has_fma
target x86_64 has_avx=false has_fma=false target x86_64 has_avx=false has_fma=false
target aarch64 target aarch64

View File

@@ -1,7 +1,7 @@
;;! target = "x86_64" ;;! target = "x86_64"
;;! compile = true ;;! compile = true
;;! relaxed_simd_deterministic = true ;;! relaxed_simd_deterministic = true
;;! settings = ["has_avx=true"] ;;! settings = ["enable_simd", "has_avx"]
(module (module
(func (param v128) (result v128) (func (param v128) (result v128)