x64: Lower SIMD requirement to SSE4.1 from SSE4.2 (#6206)
Cranelift only has one instruction SIMD which depends on SSE4.2 so this commit adds a lowering rule for `pcmpgtq` which doesn't use SSE4.2 and enables lowering the baseline requirement for SIMD support from SSE4.2 to SSE4.1. The `has_sse42` setting is no longer enabled by default for Cranelift. Additionally `enable_simd` no longer requires `has_sse42` on x64. Finally the fuzz-generator for Wasmtime codegen settings now enables flipping the `has_sse42` setting instead of unconditionally setting it to `true`. The specific lowering for `pcmpgtq` is copied from LLVM's lowering of this instruction.
This commit is contained in:
@@ -38,7 +38,7 @@ fn define_settings(shared: &SettingGroup) -> SettingGroup {
|
|||||||
"has_sse42",
|
"has_sse42",
|
||||||
"Has support for SSE4.2.",
|
"Has support for SSE4.2.",
|
||||||
"SSE4.2: CPUID.01H:ECX.SSE4_2[bit 20]",
|
"SSE4.2: CPUID.01H:ECX.SSE4_2[bit 20]",
|
||||||
true,
|
false,
|
||||||
);
|
);
|
||||||
let has_avx = settings.add_bool(
|
let has_avx = settings.add_bool(
|
||||||
"has_avx",
|
"has_avx",
|
||||||
|
|||||||
@@ -1642,6 +1642,9 @@
|
|||||||
(decl use_sse41 (bool) Type)
|
(decl use_sse41 (bool) Type)
|
||||||
(extern extractor infallible use_sse41 use_sse41)
|
(extern extractor infallible use_sse41 use_sse41)
|
||||||
|
|
||||||
|
(decl pure use_sse42 () bool)
|
||||||
|
(extern constructor use_sse42 use_sse42)
|
||||||
|
|
||||||
(decl pure use_avx_simd () bool)
|
(decl pure use_avx_simd () bool)
|
||||||
(extern constructor use_avx_simd use_avx_simd)
|
(extern constructor use_avx_simd use_avx_simd)
|
||||||
|
|
||||||
@@ -4214,7 +4217,51 @@
|
|||||||
(rule (x64_pcmpgt $I8X16 x y) (x64_pcmpgtb x y))
|
(rule (x64_pcmpgt $I8X16 x y) (x64_pcmpgtb x y))
|
||||||
(rule (x64_pcmpgt $I16X8 x y) (x64_pcmpgtw x y))
|
(rule (x64_pcmpgt $I16X8 x y) (x64_pcmpgtw x y))
|
||||||
(rule (x64_pcmpgt $I32X4 x y) (x64_pcmpgtd x y))
|
(rule (x64_pcmpgt $I32X4 x y) (x64_pcmpgtd x y))
|
||||||
(rule (x64_pcmpgt $I64X2 x y) (x64_pcmpgtq x y))
|
|
||||||
|
;; SSE4.2 gives a single-instruction for this lowering, but prior to that it's a
|
||||||
|
;; bit more complicated.
|
||||||
|
(rule 1 (x64_pcmpgt $I64X2 x y)
|
||||||
|
(if-let $true (use_sse42))
|
||||||
|
(x64_pcmpgtq x y))
|
||||||
|
|
||||||
|
;; Without SSE4.2 a 64-bit comparison is expanded to a number of instructions.
|
||||||
|
;; The basic idea is to delegate to a 32-bit comparison and work with the
|
||||||
|
;; results from there. The comparison to execute is:
|
||||||
|
;;
|
||||||
|
;; [ xhi ][ xlo ] > [ yhi ][ ylo ]
|
||||||
|
;;
|
||||||
|
;; If xhi != yhi, then the result is whatever the result of that comparison is.
|
||||||
|
;; If xhi == yhi, then the result is the unsigned comparison of xlo/ylo since
|
||||||
|
;; the 64-bit value is positive. To achieve this as part of the same comparison
|
||||||
|
;; the upper bit of `xlo` and `ylo` is flipped to change the sign when compared
|
||||||
|
;; as a 32-bit signed number. The result here is then:
|
||||||
|
;;
|
||||||
|
;; * if xlo and yhi had the same upper bit, then the unsigned comparison should
|
||||||
|
;; be the same as comparing the flipped versions as signed.
|
||||||
|
;; * if xlo had an upper bit of 0 and ylo had an upper bit of 1, then xlo > ylo
|
||||||
|
;; is false. When flipping the bits xlo becomes negative and ylo becomes
|
||||||
|
;; positive when compared as 32-bits, so the result is the same.
|
||||||
|
;; * if xlo had an upper bit of 1 and ylo had an upper bit of 0, then xlo > ylo
|
||||||
|
;; is true. When flipping the bits xlo becomes positive and ylo becomes
|
||||||
|
;; negative when compared as 32-bits, so the result is the same.
|
||||||
|
;;
|
||||||
|
;; Given all that the sequence here is to flip the upper bits of xlo and ylo,
|
||||||
|
;; then compare the masked results for equality and for gt. If the upper 32-bits
|
||||||
|
;; are not equal then the gt result for the upper bits is used. If the upper
|
||||||
|
;; 32-bits are equal then the lower 32-bits comparison is used instead.
|
||||||
|
(rule 0 (x64_pcmpgt $I64X2 x y)
|
||||||
|
(let (
|
||||||
|
(mask Xmm (x64_movdqu_load (emit_u128_le_const 0x00000000_80000000_00000000_80000000)))
|
||||||
|
(x_masked Xmm (x64_pxor mask x))
|
||||||
|
(y_masked Xmm (x64_pxor mask y))
|
||||||
|
(cmp32 Xmm (x64_pcmpgtd x_masked y_masked))
|
||||||
|
(low_halves_gt Xmm (x64_pshufd cmp32 0xa0))
|
||||||
|
(high_halves_gt Xmm (x64_pshufd cmp32 0xf5))
|
||||||
|
(cmp_eq Xmm (x64_pcmpeqd x_masked y_masked))
|
||||||
|
(high_halves_eq Xmm (x64_pshufd cmp_eq 0xf5))
|
||||||
|
(low_gt_and_high_eq Xmm (x64_pand low_halves_gt high_halves_eq))
|
||||||
|
)
|
||||||
|
(x64_por low_gt_and_high_eq high_halves_gt)))
|
||||||
|
|
||||||
(decl x64_pcmpgtb (Xmm XmmMem) Xmm)
|
(decl x64_pcmpgtb (Xmm XmmMem) Xmm)
|
||||||
(rule 0 (x64_pcmpgtb x y) (xmm_rm_r (SseOpcode.Pcmpgtb) x y))
|
(rule 0 (x64_pcmpgtb x y) (xmm_rm_r (SseOpcode.Pcmpgtb) x y))
|
||||||
|
|||||||
@@ -229,6 +229,11 @@ impl Context for IsleContext<'_, '_, MInst, X64Backend> {
|
|||||||
self.backend.x64_flags.use_sse41()
|
self.backend.x64_flags.use_sse41()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn use_sse42(&mut self) -> bool {
|
||||||
|
self.backend.x64_flags.use_sse42()
|
||||||
|
}
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
fn imm8_from_value(&mut self, val: Value) -> Option<Imm8Reg> {
|
fn imm8_from_value(&mut self, val: Value) -> Option<Imm8Reg> {
|
||||||
let inst = self.lower_ctx.dfg().value_def(val).inst()?;
|
let inst = self.lower_ctx.dfg().value_def(val).inst()?;
|
||||||
|
|||||||
@@ -216,13 +216,9 @@ fn isa_constructor(
|
|||||||
// Check for compatibility between flags and ISA level
|
// Check for compatibility between flags and ISA level
|
||||||
// requested. In particular, SIMD support requires SSE4.2.
|
// requested. In particular, SIMD support requires SSE4.2.
|
||||||
if shared_flags.enable_simd() {
|
if shared_flags.enable_simd() {
|
||||||
if !isa_flags.has_sse3()
|
if !isa_flags.has_sse3() || !isa_flags.has_ssse3() || !isa_flags.has_sse41() {
|
||||||
|| !isa_flags.has_ssse3()
|
|
||||||
|| !isa_flags.has_sse41()
|
|
||||||
|| !isa_flags.has_sse42()
|
|
||||||
{
|
|
||||||
return Err(CodegenError::Unsupported(
|
return Err(CodegenError::Unsupported(
|
||||||
"SIMD support requires SSE3, SSSE3, SSE4.1, and SSE4.2 on x86_64.".into(),
|
"SIMD support requires SSE3, SSSE3, and SSE4.1 on x86_64.".into(),
|
||||||
));
|
));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -247,7 +243,6 @@ mod test {
|
|||||||
isa_builder.set("has_sse3", "false").unwrap();
|
isa_builder.set("has_sse3", "false").unwrap();
|
||||||
isa_builder.set("has_ssse3", "false").unwrap();
|
isa_builder.set("has_ssse3", "false").unwrap();
|
||||||
isa_builder.set("has_sse41", "false").unwrap();
|
isa_builder.set("has_sse41", "false").unwrap();
|
||||||
isa_builder.set("has_sse42", "false").unwrap();
|
|
||||||
assert!(matches!(
|
assert!(matches!(
|
||||||
isa_builder.finish(shared_flags),
|
isa_builder.finish(shared_flags),
|
||||||
Err(CodegenError::Unsupported(_)),
|
Err(CodegenError::Unsupported(_)),
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
test compile precise-output
|
test compile precise-output
|
||||||
set enable_simd
|
set enable_simd
|
||||||
target x86_64 has_avx
|
target x86_64 sse42 has_avx
|
||||||
|
|
||||||
function %i8x16_eq(i8x16, i8x16) -> i8x16 {
|
function %i8x16_eq(i8x16, i8x16) -> i8x16 {
|
||||||
block0(v0: i8x16, v1: i8x16):
|
block0(v0: i8x16, v1: i8x16):
|
||||||
|
|||||||
@@ -2,7 +2,8 @@ test interpret
|
|||||||
test run
|
test run
|
||||||
set enable_simd
|
set enable_simd
|
||||||
target x86_64
|
target x86_64
|
||||||
target x86_64 has_avx
|
target x86_64 sse42
|
||||||
|
target x86_64 sse42 has_avx
|
||||||
target aarch64
|
target aarch64
|
||||||
target s390x
|
target s390x
|
||||||
|
|
||||||
|
|||||||
@@ -80,7 +80,6 @@ pub fn infer_native_flags(isa_builder: &mut dyn Configurable) -> Result<(), &'st
|
|||||||
isa_builder.set("has_sse3", "false").unwrap();
|
isa_builder.set("has_sse3", "false").unwrap();
|
||||||
isa_builder.set("has_ssse3", "false").unwrap();
|
isa_builder.set("has_ssse3", "false").unwrap();
|
||||||
isa_builder.set("has_sse41", "false").unwrap();
|
isa_builder.set("has_sse41", "false").unwrap();
|
||||||
isa_builder.set("has_sse42", "false").unwrap();
|
|
||||||
|
|
||||||
if std::is_x86_feature_detected!("sse3") {
|
if std::is_x86_feature_detected!("sse3") {
|
||||||
isa_builder.enable("has_sse3").unwrap();
|
isa_builder.enable("has_sse3").unwrap();
|
||||||
|
|||||||
@@ -35,7 +35,7 @@ impl CodegenSettings {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Features such as sse4.2 are unconditionally enabled on the x86_64 target
|
/// Features such as sse3 are unconditionally enabled on the x86_64 target
|
||||||
/// because they are hard required for SIMD, but when SIMD is disabled, for
|
/// because they are hard required for SIMD, but when SIMD is disabled, for
|
||||||
/// example, we support disabling these features.
|
/// example, we support disabling these features.
|
||||||
///
|
///
|
||||||
@@ -57,7 +57,7 @@ impl CodegenSettings {
|
|||||||
// to have test case failures unrelated to codegen setting input
|
// to have test case failures unrelated to codegen setting input
|
||||||
// that fail on one architecture to fail on other architectures as
|
// that fail on one architecture to fail on other architectures as
|
||||||
// well.
|
// well.
|
||||||
let new_flags = ["has_sse3", "has_ssse3", "has_sse41", "has_sse42"]
|
let new_flags = ["has_sse3", "has_ssse3", "has_sse41"]
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.map(|name| Ok((name, u.arbitrary()?)))
|
.map(|name| Ok((name, u.arbitrary()?)))
|
||||||
.collect::<arbitrary::Result<HashMap<_, bool>>>()?;
|
.collect::<arbitrary::Result<HashMap<_, bool>>>()?;
|
||||||
@@ -147,8 +147,8 @@ impl<'a> Arbitrary<'a> for CodegenSettings {
|
|||||||
std:"sse3" => clif:"has_sse3" ratio: 1 in 1,
|
std:"sse3" => clif:"has_sse3" ratio: 1 in 1,
|
||||||
std:"ssse3" => clif:"has_ssse3" ratio: 1 in 1,
|
std:"ssse3" => clif:"has_ssse3" ratio: 1 in 1,
|
||||||
std:"sse4.1" => clif:"has_sse41" ratio: 1 in 1,
|
std:"sse4.1" => clif:"has_sse41" ratio: 1 in 1,
|
||||||
std:"sse4.2" => clif:"has_sse42" ratio: 1 in 1,
|
|
||||||
|
|
||||||
|
std:"sse4.2" => clif:"has_sse42",
|
||||||
std:"popcnt" => clif:"has_popcnt",
|
std:"popcnt" => clif:"has_popcnt",
|
||||||
std:"avx" => clif:"has_avx",
|
std:"avx" => clif:"has_avx",
|
||||||
std:"avx2" => clif:"has_avx2",
|
std:"avx2" => clif:"has_avx2",
|
||||||
|
|||||||
Reference in New Issue
Block a user