cranelift: Implement nan canonicalization for vectors (#3146)

This fixes some fuzz bugs that came about from enabling SIMD: the fuzzers turn on NaN canonicalization, but Cranelift would panic when asked to canonicalize these ops on vector types. This adds some custom codegen with `bitselect` to ensure that any NaN lanes are canonical-NaN lanes in the results of the canonicalized operations.
2021-08-05 13:44:16 -05:00
parent 9e142f8792
commit c6b095f9a3
3 changed files with 95 additions and 12 deletions
--- a/cranelift/codegen/src/nan_canonicalization.rs
+++ b/cranelift/codegen/src/nan_canonicalization.rs
@@ -6,7 +6,6 @@ use crate::cursor::{Cursor, FuncCursor};
 use crate::ir::condcodes::FloatCC;
 use crate::ir::immediates::{Ieee32, Ieee64};
 use crate::ir::types;
-use crate::ir::types::Type;
 use crate::ir::{Function, Inst, InstBuilder, InstructionData, Opcode, Value};
 use crate::timing;

@@ -64,22 +63,44 @@ fn add_nan_canon_seq(pos: &mut FuncCursor, inst: Inst) {
    // Insert a comparison instruction, to check if `inst_res` is NaN. Select
    // the canonical NaN value if `val` is NaN, assign the result to `inst`.
    let is_nan = pos.ins().fcmp(FloatCC::NotEqual, new_res, new_res);
-    let canon_nan = insert_nan_const(pos, val_type);
+
+    let scalar_select = |pos: &mut FuncCursor, canon_nan: Value| {
        pos.ins()
            .with_result(val)
            .select(is_nan, canon_nan, new_res);
+    };
+    let vector_select = |pos: &mut FuncCursor, canon_nan: Value| {
+        let cond = pos.ins().raw_bitcast(types::I8X16, is_nan);
+        let canon_nan = pos.ins().raw_bitcast(types::I8X16, canon_nan);
+        let result = pos.ins().raw_bitcast(types::I8X16, new_res);
+        let bitmask = pos.ins().bitselect(cond, canon_nan, result);
+        pos.ins().with_result(val).raw_bitcast(val_type, bitmask);
+    };

-    pos.prev_inst(); // Step backwards so the pass does not skip instructions.
-}
-
-/// Insert a canonical 32-bit or 64-bit NaN constant at the current position.
-fn insert_nan_const(pos: &mut FuncCursor, nan_type: Type) -> Value {
-    match nan_type {
-        types::F32 => pos.ins().f32const(Ieee32::with_bits(CANON_32BIT_NAN)),
-        types::F64 => pos.ins().f64const(Ieee64::with_bits(CANON_64BIT_NAN)),
+    match val_type {
+        types::F32 => {
+            let canon_nan = pos.ins().f32const(Ieee32::with_bits(CANON_32BIT_NAN));
+            scalar_select(pos, canon_nan);
+        }
+        types::F64 => {
+            let canon_nan = pos.ins().f64const(Ieee64::with_bits(CANON_64BIT_NAN));
+            scalar_select(pos, canon_nan);
+        }
+        types::F32X4 => {
+            let canon_nan = pos.ins().iconst(types::I32, i64::from(CANON_32BIT_NAN));
+            let canon_nan = pos.ins().splat(types::I32X4, canon_nan);
+            vector_select(pos, canon_nan);
+        }
+        types::F64X2 => {
+            let canon_nan = pos.ins().iconst(types::I64, CANON_64BIT_NAN as i64);
+            let canon_nan = pos.ins().splat(types::I64X2, canon_nan);
+            vector_select(pos, canon_nan);
+        }
        _ => {
            // Panic if the type given was not an IEEE floating point type.
            panic!("Could not canonicalize NaN: Unexpected result type found.");
        }
    }
+
+    pos.prev_inst(); // Step backwards so the pass does not skip instructions.
 }
--- a/tests/all/wast.rs
+++ b/tests/all/wast.rs
@@ -40,6 +40,10 @@ fn run_wast(wast: &str, strategy: Strategy, pooling: bool) -> anyhow::Result<()>
        .strategy(strategy)?
        .cranelift_debug_verifier(true);

+    if wast.ends_with("canonicalize-nan.wast") {
+        cfg.cranelift_nan_canonicalization(true);
+    }
+
    // By default we'll allocate huge chunks (6gb) of the address space for each
    // linear memory. This is typically fine but when we emulate tests with QEMU
    // it turns out that it causes memory usage to balloon massively. Leave a
--- a/tests/misc_testsuite/simd/canonicalize-nan.wast
+++ b/tests/misc_testsuite/simd/canonicalize-nan.wast
@@ -0,0 +1,58 @@
+;; This *.wast test should be run with `cranelift_nan_canonicalization` set to
+;; `true` in `wast.rs`
+
+(module
+  (func (export "f32x4.floor") (param v128) (result v128)
+    local.get 0
+    f32x4.floor)
+  (func (export "f32x4.nearest") (param v128) (result v128)
+    local.get 0
+    f32x4.nearest)
+  (func (export "f32x4.sqrt") (param v128) (result v128)
+    local.get 0
+    f32x4.sqrt)
+  (func (export "f32x4.trunc") (param v128) (result v128)
+    local.get 0
+    f32x4.trunc)
+  (func (export "f32x4.ceil") (param v128) (result v128)
+    local.get 0
+    f32x4.ceil)
+
+  (func (export "f64x2.floor") (param v128) (result v128)
+    local.get 0
+    f64x2.floor)
+  (func (export "f64x2.nearest") (param v128) (result v128)
+    local.get 0
+    f64x2.nearest)
+  (func (export "f64x2.sqrt") (param v128) (result v128)
+    local.get 0
+    f64x2.sqrt)
+  (func (export "f64x2.trunc") (param v128) (result v128)
+    local.get 0
+    f64x2.trunc)
+  (func (export "f64x2.ceil") (param v128) (result v128)
+    local.get 0
+    f64x2.ceil)
+)
+
+(assert_return (invoke "f32x4.floor" (v128.const f32x4 1 -2.2 3.4 nan))
+               (v128.const f32x4 1 -3 3 nan))
+(assert_return (invoke "f32x4.nearest" (v128.const f32x4 1 -2.2 3.4 nan))
+               (v128.const f32x4 1 -2 3 nan))
+(assert_return (invoke "f32x4.sqrt" (v128.const f32x4 1 4 -1 nan))
+               (v128.const f32x4 1 2 nan nan))
+(assert_return (invoke "f32x4.trunc" (v128.const f32x4 1 -2.2 3.4 nan))
+               (v128.const f32x4 1 -2 3 nan))
+(assert_return (invoke "f32x4.ceil" (v128.const f32x4 1 -2.2 3.4 nan))
+               (v128.const f32x4 1 -2 4 nan))
+
+(assert_return (invoke "f64x2.floor" (v128.const f64x2 -2.2 nan))
+               (v128.const f64x2 -3 nan))
+(assert_return (invoke "f64x2.nearest" (v128.const f64x2 -2.2 nan))
+               (v128.const f64x2 -2 nan))
+(assert_return (invoke "f64x2.sqrt" (v128.const f64x2 4 nan))
+               (v128.const f64x2 2 nan))
+(assert_return (invoke "f64x2.trunc" (v128.const f64x2 3.4 nan))
+               (v128.const f64x2 3 nan))
+(assert_return (invoke "f64x2.ceil" (v128.const f64x2 3.4 nan))
+               (v128.const f64x2 4 nan))