diff --git a/cranelift/codegen/src/nan_canonicalization.rs b/cranelift/codegen/src/nan_canonicalization.rs
index e7c0e53419..107985e27e 100644
--- a/cranelift/codegen/src/nan_canonicalization.rs
+++ b/cranelift/codegen/src/nan_canonicalization.rs
@@ -6,7 +6,6 @@
 use crate::cursor::{Cursor, FuncCursor};
 use crate::ir::condcodes::FloatCC;
 use crate::ir::immediates::{Ieee32, Ieee64};
 use crate::ir::types;
-use crate::ir::types::Type;
 use crate::ir::{Function, Inst, InstBuilder, InstructionData, Opcode, Value};
 use crate::timing;
@@ -64,22 +63,44 @@ fn add_nan_canon_seq(pos: &mut FuncCursor, inst: Inst) {
     // Insert a comparison instruction, to check if `inst_res` is NaN. Select
     // the canonical NaN value if `val` is NaN, assign the result to `inst`.
     let is_nan = pos.ins().fcmp(FloatCC::NotEqual, new_res, new_res);
-    let canon_nan = insert_nan_const(pos, val_type);
-    pos.ins()
-        .with_result(val)
-        .select(is_nan, canon_nan, new_res);
-    pos.prev_inst(); // Step backwards so the pass does not skip instructions.
-}
+    let scalar_select = |pos: &mut FuncCursor, canon_nan: Value| {
+        pos.ins()
+            .with_result(val)
+            .select(is_nan, canon_nan, new_res);
+    };
+    let vector_select = |pos: &mut FuncCursor, canon_nan: Value| {
+        let cond = pos.ins().raw_bitcast(types::I8X16, is_nan);
+        let canon_nan = pos.ins().raw_bitcast(types::I8X16, canon_nan);
+        let result = pos.ins().raw_bitcast(types::I8X16, new_res);
+        let bitmask = pos.ins().bitselect(cond, canon_nan, result);
+        pos.ins().with_result(val).raw_bitcast(val_type, bitmask);
+    };
 
-/// Insert a canonical 32-bit or 64-bit NaN constant at the current position.
-fn insert_nan_const(pos: &mut FuncCursor, nan_type: Type) -> Value {
-    match nan_type {
-        types::F32 => pos.ins().f32const(Ieee32::with_bits(CANON_32BIT_NAN)),
-        types::F64 => pos.ins().f64const(Ieee64::with_bits(CANON_64BIT_NAN)),
+    match val_type {
+        types::F32 => {
+            let canon_nan = pos.ins().f32const(Ieee32::with_bits(CANON_32BIT_NAN));
+            scalar_select(pos, canon_nan);
+        }
+        types::F64 => {
+            let canon_nan = pos.ins().f64const(Ieee64::with_bits(CANON_64BIT_NAN));
+            scalar_select(pos, canon_nan);
+        }
+        types::F32X4 => {
+            let canon_nan = pos.ins().iconst(types::I32, i64::from(CANON_32BIT_NAN));
+            let canon_nan = pos.ins().splat(types::I32X4, canon_nan);
+            vector_select(pos, canon_nan);
+        }
+        types::F64X2 => {
+            let canon_nan = pos.ins().iconst(types::I64, CANON_64BIT_NAN as i64);
+            let canon_nan = pos.ins().splat(types::I64X2, canon_nan);
+            vector_select(pos, canon_nan);
+        }
         _ => {
             // Panic if the type given was not an IEEE floating point type.
             panic!("Could not canonicalize NaN: Unexpected result type found.");
         }
     }
+
+    pos.prev_inst(); // Step backwards so the pass does not skip instructions.
 }
 
diff --git a/tests/all/wast.rs b/tests/all/wast.rs
index c4e10da30d..47f2176aba 100644
--- a/tests/all/wast.rs
+++ b/tests/all/wast.rs
@@ -40,6 +40,10 @@ fn run_wast(wast: &str, strategy: Strategy, pooling: bool) -> anyhow::Result<()>
         .strategy(strategy)?
         .cranelift_debug_verifier(true);
 
+    if wast.ends_with("canonicalize-nan.wast") {
+        cfg.cranelift_nan_canonicalization(true);
+    }
+
     // By default we'll allocate huge chunks (6gb) of the address space for each
     // linear memory. This is typically fine but when we emulate tests with QEMU
     // it turns out that it causes memory usage to balloon massively.
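The test-harness hook in `tests/all/wast.rs` flips the same switch that any embedder can set through wasmtime's public `Config` API. The following is a minimal sketch of enabling the pass outside the test suite; the surrounding `main` and error handling are illustrative, and `Engine::new` returning a `Result` matches recent wasmtime releases:

    use wasmtime::{Config, Engine};

    fn main() -> anyhow::Result<()> {
        // Ask Cranelift to canonicalize NaN results so that scalar and
        // (with this change) SIMD float instructions never leak
        // platform-specific NaN bit patterns.
        let mut config = Config::new();
        config.cranelift_nan_canonicalization(true);
        let _engine = Engine::new(&config)?;
        // ... build `Module`s and `Store`s against `_engine` as usual ...
        Ok(())
    }
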
diff --git a/tests/misc_testsuite/simd/canonicalize-nan.wast b/tests/misc_testsuite/simd/canonicalize-nan.wast
new file mode 100644
index 0000000000..1e43ce2516
--- /dev/null
+++ b/tests/misc_testsuite/simd/canonicalize-nan.wast
@@ -0,0 +1,58 @@
+;; This *.wast test should be run with `cranelift_nan_canonicalization` set to
+;; `true` in `wast.rs`
+
+(module
+  (func (export "f32x4.floor") (param v128) (result v128)
+    local.get 0
+    f32x4.floor)
+  (func (export "f32x4.nearest") (param v128) (result v128)
+    local.get 0
+    f32x4.nearest)
+  (func (export "f32x4.sqrt") (param v128) (result v128)
+    local.get 0
+    f32x4.sqrt)
+  (func (export "f32x4.trunc") (param v128) (result v128)
+    local.get 0
+    f32x4.trunc)
+  (func (export "f32x4.ceil") (param v128) (result v128)
+    local.get 0
+    f32x4.ceil)
+
+  (func (export "f64x2.floor") (param v128) (result v128)
+    local.get 0
+    f64x2.floor)
+  (func (export "f64x2.nearest") (param v128) (result v128)
+    local.get 0
+    f64x2.nearest)
+  (func (export "f64x2.sqrt") (param v128) (result v128)
+    local.get 0
+    f64x2.sqrt)
+  (func (export "f64x2.trunc") (param v128) (result v128)
+    local.get 0
+    f64x2.trunc)
+  (func (export "f64x2.ceil") (param v128) (result v128)
+    local.get 0
+    f64x2.ceil)
+)
+
+(assert_return (invoke "f32x4.floor" (v128.const f32x4 1 -2.2 3.4 nan))
+               (v128.const f32x4 1 -3 3 nan))
+(assert_return (invoke "f32x4.nearest" (v128.const f32x4 1 -2.2 3.4 nan))
+               (v128.const f32x4 1 -2 3 nan))
+(assert_return (invoke "f32x4.sqrt" (v128.const f32x4 1 4 -1 nan))
+               (v128.const f32x4 1 2 nan nan))
+(assert_return (invoke "f32x4.trunc" (v128.const f32x4 1 -2.2 3.4 nan))
+               (v128.const f32x4 1 -2 3 nan))
+(assert_return (invoke "f32x4.ceil" (v128.const f32x4 1 -2.2 3.4 nan))
+               (v128.const f32x4 1 -2 4 nan))
+
+(assert_return (invoke "f64x2.floor" (v128.const f64x2 -2.2 nan))
+               (v128.const f64x2 -3 nan))
+(assert_return (invoke "f64x2.nearest" (v128.const f64x2 -2.2 nan))
+               (v128.const f64x2 -2 nan))
+(assert_return (invoke "f64x2.sqrt" (v128.const f64x2 4 nan))
+               (v128.const f64x2 2 nan))
+(assert_return (invoke "f64x2.trunc" (v128.const f64x2 3.4 nan))
+               (v128.const f64x2 3 nan))
+(assert_return (invoke "f64x2.ceil" (v128.const f64x2 3.4 nan))
+               (v128.const f64x2 4 nan))
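With the pass enabled, NaN results of floating-point instructions carry the canonical quiet-NaN payload (0x7fc0_0000 for f32, 0x7ff8_0000_0000_0000 for f64) instead of whatever bits the host hardware happens to produce. A hypothetical embedder-side check of the scalar case; the module text, export name, and exact API signatures are illustrative and may vary across wasmtime versions:

    use wasmtime::{Config, Engine, Instance, Module, Store};

    fn main() -> anyhow::Result<()> {
        let mut config = Config::new();
        config.cranelift_nan_canonicalization(true);
        let engine = Engine::new(&config)?;

        // sqrt of a negative number produces NaN.
        let module = Module::new(
            &engine,
            r#"(module (func (export "sqrt") (param f32) (result f32)
                  local.get 0
                  f32.sqrt))"#,
        )?;
        let mut store = Store::new(&engine, ());
        let instance = Instance::new(&mut store, &module, &[])?;
        let sqrt = instance.get_typed_func::<f32, f32>(&mut store, "sqrt")?;

        // Without canonicalization the NaN payload depends on the host's
        // sqrt instruction; with it, the bits are always the canonical
        // quiet NaN.
        let nan = sqrt.call(&mut store, -1.0)?;
        assert_eq!(nan.to_bits(), 0x7fc0_0000);
        Ok(())
    }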