From b6b474a8c982f40adeb0e25016abb32ab58d465a Mon Sep 17 00:00:00 2001
From: Jakob Stoklund Olesen <jolesen@mozilla.com>
Date: Wed, 27 Sep 2017 11:02:57 -0700
Subject: [PATCH] Add Intel legalization for fmin and fmax.

The native x86_fmin and x86_fmax instructions don't behave correctly for
NaN inputs and when comparing +0.0 to -0.0, so we need separate branches
for those cases.
---
 .../filetests/isa/intel/legalize-custom.cton  | 19 ++++
 cranelift/filetests/wasm/f32-arith.cton       | 13 ++-
 cranelift/filetests/wasm/f64-arith.cton       | 13 ++-
 lib/cretonne/meta/isa/intel/legalize.py       |  4 +
 lib/cretonne/src/isa/intel/enc_tables.rs      | 92 ++++++++++++++++++-
 5 files changed, 136 insertions(+), 5 deletions(-)

diff --git a/cranelift/filetests/isa/intel/legalize-custom.cton b/cranelift/filetests/isa/intel/legalize-custom.cton
index 8c9e898586..94aa7ab1d2 100644
--- a/cranelift/filetests/isa/intel/legalize-custom.cton
+++ b/cranelift/filetests/isa/intel/legalize-custom.cton
@@ -78,3 +78,22 @@ ebb0(v0: f64, v1: f64, v2: i32):
     ; nextln: return $v3
     return v3
 }
+
+function %f32_min(f32, f32) -> f32 {
+ebb0(v0: f32, v1: f32):
+    v2 = fmin v0, v1
+    return v2
+    ; check: $(vnat=$V) = x86_fmin $v0, $v1
+    ; nextln: jump $(done=$EBB)($vnat)
+
+    ; check: $(uno=$EBB):
+    ; nextln: $(vuno=$V) = fadd.f32 $v0, $v1
+    ; nextln: jump $(done=$EBB)($vuno)
+
+    ; check: $(ueq=$EBB):
+    ; check: $(veq=$V) = bor.f32 $v0, $v1
+    ; nextln: jump $(done=$EBB)($veq)
+
+    ; check: $done($v2: f32):
+    ; nextln: return $v2
+}
diff --git a/cranelift/filetests/wasm/f32-arith.cton b/cranelift/filetests/wasm/f32-arith.cton
index ddb05138e9..331f9c7fad 100644
--- a/cranelift/filetests/wasm/f32-arith.cton
+++ b/cranelift/filetests/wasm/f32-arith.cton
@@ -85,8 +85,17 @@ ebb0(v0: f32, v1: f32):
     return v2
 }
 
-; function %f32_min(f32, f32) -> f32
-; function %f32_max(f32, f32) -> f32
+function %f32_min(f32, f32) -> f32 {
+ebb0(v0: f32, v1: f32):
+    v2 = fmin v0, v1
+    return v2
+}
+
+function %f32_max(f32, f32) -> f32 {
+ebb0(v0: f32, v1: f32):
+    v2 = fmax v0, v1
+    return v2
+}
 
 function %f32_copysign(f32, f32) -> f32 {
 ebb0(v0: f32, v1: f32):
diff --git a/cranelift/filetests/wasm/f64-arith.cton b/cranelift/filetests/wasm/f64-arith.cton
index d67066e615..4f39fb6a5f 100644
--- a/cranelift/filetests/wasm/f64-arith.cton
+++ b/cranelift/filetests/wasm/f64-arith.cton
@@ -82,8 +82,17 @@ ebb0(v0: f64, v1: f64):
     return v2
 }
 
-; function %f64_min(f64, f64) -> f64
-; function %f64_max(f64, f64) -> f64
+function %f64_min(f64, f64) -> f64 {
+ebb0(v0: f64, v1: f64):
+    v2 = fmin v0, v1
+    return v2
+}
+
+function %f64_max(f64, f64) -> f64 {
+ebb0(v0: f64, v1: f64):
+    v2 = fmax v0, v1
+    return v2
+}
 
 function %f64_copysign(f64, f64) -> f64 {
 ebb0(v0: f64, v1: f64):
diff --git a/lib/cretonne/meta/isa/intel/legalize.py b/lib/cretonne/meta/isa/intel/legalize.py
index 6125dcd0e4..8d93337c69 100644
--- a/lib/cretonne/meta/isa/intel/legalize.py
+++ b/lib/cretonne/meta/isa/intel/legalize.py
@@ -92,3 +92,7 @@ for cc,               rev_cc in [
             Rtl(
                 a << insts.fcmp(rev_cc, y, x)
             ))
+
+# We need to modify the CFG for min/max legalization.
+intel_expand.custom_legalize(insts.fmin, 'expand_minmax')
+intel_expand.custom_legalize(insts.fmax, 'expand_minmax')
diff --git a/lib/cretonne/src/isa/intel/enc_tables.rs b/lib/cretonne/src/isa/intel/enc_tables.rs
index 35ca9a9d76..4c8c428bfc 100644
--- a/lib/cretonne/src/isa/intel/enc_tables.rs
+++ b/lib/cretonne/src/isa/intel/enc_tables.rs
@@ -1,7 +1,9 @@
 //! Encoding tables for Intel ISAs.
 
 use bitset::BitSet;
-use ir;
+use cursor::{Cursor, FuncCursor};
+use flowgraph::ControlFlowGraph;
+use ir::{self, InstBuilder};
 use isa::constraints::*;
 use isa::enc_tables::*;
 use isa::encoding::RecipeSizing;
@@ -11,3 +13,91 @@ use super::registers::*;
 
 include!(concat!(env!("OUT_DIR"), "/encoding-intel.rs"));
 include!(concat!(env!("OUT_DIR"), "/legalize-intel.rs"));
+
+/// Expand the `fmin` and `fmax` instructions using the Intel `x86_fmin` and `x86_fmax`
+/// instructions.
+fn expand_minmax(inst: ir::Inst, func: &mut ir::Function, cfg: &mut ControlFlowGraph) {
+    use ir::condcodes::FloatCC;
+
+    let (x, y, x86_opc, bitwise_opc) = match func.dfg[inst] {
+        ir::InstructionData::Binary {
+            opcode: ir::Opcode::Fmin,
+            args,
+        } => (args[0], args[1], ir::Opcode::X86Fmin, ir::Opcode::Bor),
+        ir::InstructionData::Binary {
+            opcode: ir::Opcode::Fmax,
+            args,
+        } => (args[0], args[1], ir::Opcode::X86Fmax, ir::Opcode::Band),
+        _ => panic!("Expected fmin/fmax: {}", func.dfg.display_inst(inst, None)),
+    };
+    let old_ebb = func.layout.pp_ebb(inst);
+
+    // We need to handle the following conditions, depending on how x and y compare:
+    //
+    // 1. LT or GT: The native `x86_opc` min/max instruction does what we need.
+    // 2. EQ: We need to use `bitwise_opc` to make sure that
+    //    fmin(0.0, -0.0) -> -0.0 and fmax(0.0, -0.0) -> 0.0.
+    // 3. UN: We need to produce a quiet NaN that is canonical if the inputs are canonical.
+
+    // EBB handling case 3) where one operand is NaN.
+    let uno_ebb = func.dfg.make_ebb();
+
+    // EBB that handles the unordered or equal cases 2) and 3).
+    let ueq_ebb = func.dfg.make_ebb();
+
+    // Final EBB with one argument representing the final result value.
+    let done = func.dfg.make_ebb();
+
+    // The basic blocks are laid out to minimize branching for the common cases:
+    //
+    // 1) One branch not taken, one jump.
+    // 2) One branch taken.
+    // 3) Two branches taken, one jump.
+
+    // Move the `inst` result value onto the `done` EBB.
+    let result = func.dfg.first_result(inst);
+    let ty = func.dfg.value_type(result);
+    func.dfg.clear_results(inst);
+    func.dfg.attach_ebb_arg(done, result);
+
+    // Test for case 1) ordered and not equal.
+    let mut pos = FuncCursor::new(func).at_inst(inst);
+    let cmp_ueq = pos.ins().fcmp(FloatCC::UnorderedOrEqual, x, y);
+    pos.ins().brnz(cmp_ueq, ueq_ebb, &[]);
+
+    // Handle the common ordered, not equal (LT|GT) case.
+    let one_inst = pos.ins().Binary(x86_opc, ty, x, y).0;
+    let one_result = pos.func.dfg.first_result(one_inst);
+    pos.ins().jump(done, &[one_result]);
+
+    // Case 3) Unordered.
+    // We know that at least one operand is a NaN that needs to be propagated. We simply use an
+    // `fadd` instruction which has the same NaN propagation semantics.
+    pos.insert_ebb(uno_ebb);
+    let uno_result = pos.ins().fadd(x, y);
+    pos.ins().jump(done, &[uno_result]);
+
+    // Case 2) or 3).
+    pos.insert_ebb(ueq_ebb);
+    // Test for case 3) (UN) one value is NaN.
+    // TODO: When we get support for flag values, we can reuse the above comparison.
+    let cmp_uno = pos.ins().fcmp(FloatCC::Unordered, x, y);
+    pos.ins().brnz(cmp_uno, uno_ebb, &[]);
+
+    // We are now in case 2) where x and y compare EQ.
+    // We need a bitwise operation to get the sign right.
+    let bw_inst = pos.ins().Binary(bitwise_opc, ty, x, y).0;
+    let bw_result = pos.func.dfg.first_result(bw_inst);
+    // This should become a fall-through for this second most common case.
+    // Recycle the original instruction as a jump.
+    pos.func.dfg.replace(inst).jump(done, &[bw_result]);
+
+    // Finally insert a label for the completion.
+    pos.next_inst();
+    pos.insert_ebb(done);
+
+    cfg.recompute_ebb(pos.func, old_ebb);
+    cfg.recompute_ebb(pos.func, ueq_ebb);
+    cfg.recompute_ebb(pos.func, uno_ebb);
+    cfg.recompute_ebb(pos.func, done);
+}