Add Intel legalization for fmin and fmax.

The native x86_fmin and x86_fmax instructions don't behave correctly for NaN inputs and when comparing +0.0 to -0.0, so we need separate branches for those cases.
2017-09-27 11:02:57 -07:00
parent 384b04b411
commit b6b474a8c9
5 changed files with 136 additions and 5 deletions
--- a/cranelift/filetests/isa/intel/legalize-custom.cton
+++ b/cranelift/filetests/isa/intel/legalize-custom.cton
@@ -78,3 +78,22 @@ ebb0(v0: f64, v1: f64, v2: i32):
    ; nextln: return $v3
    return v3
 }
 function %f32_min(f32, f32) -> f32 {
 ebb0(v0: f32, v1: f32):
    v2 = fmin v0, v1
    return v2
    ; check: $(vnat=$V) = x86_fmin $v0, $v1
    ; nextln: jump $(done=$EBB)($vnat)
    ; check: $(uno=$EBB):
    ; nextln: $(vuno=$V) = fadd.f32 $v0, $v1
    ; nextln: jump $(done=$EBB)($vuno)
    ; check: $(ueq=$EBB):
    ; check: $(veq=$V) = bor.f32 $v0, $v1
    ; nextln: jump $(done=$EBB)($veq)
    ; check: $done($v2: f32):
    ; nextln: return $v2
 }
--- a/cranelift/filetests/wasm/f32-arith.cton
+++ b/cranelift/filetests/wasm/f32-arith.cton
@@ -85,8 +85,17 @@ ebb0(v0: f32, v1: f32):
    return v2
 }
-; function %f32_min(f32, f32) -> f32
+function %f32_min(f32, f32) -> f32 {
-; function %f32_max(f32, f32) -> f32
+ebb0(v0: f32, v1: f32):
    v2 = fmin v0, v1
    return v2
 }
 function %f32_max(f32, f32) -> f32 {
 ebb0(v0: f32, v1: f32):
    v2 = fmax v0, v1
    return v2
 }
 function %f32_copysign(f32, f32) -> f32 {
 ebb0(v0: f32, v1: f32):
--- a/cranelift/filetests/wasm/f64-arith.cton
+++ b/cranelift/filetests/wasm/f64-arith.cton
@@ -82,8 +82,17 @@ ebb0(v0: f64, v1: f64):
    return v2
 }
-; function %f64_min(f64, f64) -> f64
+function %f64_min(f64, f64) -> f64 {
-; function %f64_max(f64, f64) -> f64
+ebb0(v0: f64, v1: f64):
    v2 = fmin v0, v1
    return v2
 }
 function %f64_max(f64, f64) -> f64 {
 ebb0(v0: f64, v1: f64):
    v2 = fmax v0, v1
    return v2
 }
 function %f64_copysign(f64, f64) -> f64 {
 ebb0(v0: f64, v1: f64):
--- a/lib/cretonne/meta/isa/intel/legalize.py
+++ b/lib/cretonne/meta/isa/intel/legalize.py
@@ -92,3 +92,7 @@ for cc,               rev_cc in [
            Rtl(
                a << insts.fcmp(rev_cc, y, x)
            ))
 # We need to modify the CFG for min/max legalization.
 intel_expand.custom_legalize(insts.fmin, 'expand_minmax')
 intel_expand.custom_legalize(insts.fmax, 'expand_minmax')
--- a/lib/cretonne/src/isa/intel/enc_tables.rs
+++ b/lib/cretonne/src/isa/intel/enc_tables.rs
@@ -1,7 +1,9 @@
 //! Encoding tables for Intel ISAs.
 use bitset::BitSet;
-use ir;
+use cursor::{Cursor, FuncCursor};
 use flowgraph::ControlFlowGraph;
 use ir::{self, InstBuilder};
 use isa::constraints::*;
 use isa::enc_tables::*;
 use isa::encoding::RecipeSizing;
@@ -11,3 +13,91 @@ use super::registers::*;
 include!(concat!(env!("OUT_DIR"), "/encoding-intel.rs"));
 include!(concat!(env!("OUT_DIR"), "/legalize-intel.rs"));
 /// Expand the `fmin` and `fmax` instructions using the Intel `x86_fmin` and `x86_fmax`
 /// instructions.
 fn expand_minmax(inst: ir::Inst, func: &mut ir::Function, cfg: &mut ControlFlowGraph) {
    use ir::condcodes::FloatCC;
    let (x, y, x86_opc, bitwise_opc) = match func.dfg[inst] {
        ir::InstructionData::Binary {
            opcode: ir::Opcode::Fmin,
            args,
        } => (args[0], args[1], ir::Opcode::X86Fmin, ir::Opcode::Bor),
        ir::InstructionData::Binary {
            opcode: ir::Opcode::Fmax,
            args,
        } => (args[0], args[1], ir::Opcode::X86Fmax, ir::Opcode::Band),
        _ => panic!("Expected fmin/fmax: {}", func.dfg.display_inst(inst, None)),
    };
    let old_ebb = func.layout.pp_ebb(inst);
    // We need to handle the following conditions, depending on how x and y compare:
    //
    // 1. LT or GT: The native `x86_opc` min/max instruction does what we need.
    // 2. EQ: We need to use `bitwise_opc` to make sure that
    //    fmin(0.0, -0.0) -> -0.0 and fmax(0.0, -0.0) -> 0.0.
    // 3. UN: We need to produce a quiet NaN that is canonical if the inputs are canonical.
    // EBB handling case 3) where one operand is NaN.
    let uno_ebb = func.dfg.make_ebb();
    // EBB that handles the unordered or equal cases 2) and 3).
    let ueq_ebb = func.dfg.make_ebb();
    // Final EBB with one argument representing the final result value.
    let done = func.dfg.make_ebb();
    // The basic blocks are laid out to minimize branching for the common cases:
    //
    // 1) One branch not taken, one jump.
    // 2) One branch taken.
    // 3) Two branches taken, one jump.
    // Move the `inst` result value onto the `done` EBB.
    let result = func.dfg.first_result(inst);
    let ty = func.dfg.value_type(result);
    func.dfg.clear_results(inst);
    func.dfg.attach_ebb_arg(done, result);
    // Test for case 1) ordered and not equal.
    let mut pos = FuncCursor::new(func).at_inst(inst);
    let cmp_ueq = pos.ins().fcmp(FloatCC::UnorderedOrEqual, x, y);
    pos.ins().brnz(cmp_ueq, ueq_ebb, &[]);
    // Handle the common ordered, not equal (LT|GT) case.
    let one_inst = pos.ins().Binary(x86_opc, ty, x, y).0;
    let one_result = pos.func.dfg.first_result(one_inst);
    pos.ins().jump(done, &[one_result]);
    // Case 3) Unordered.
    // We know that at least one operand is a NaN that needs to be propagated. We simply use an
    // `fadd` instruction which has the same NaN propagation semantics.
    pos.insert_ebb(uno_ebb);
    let uno_result = pos.ins().fadd(x, y);
    pos.ins().jump(done, &[uno_result]);
    // Case 2) or 3).
    pos.insert_ebb(ueq_ebb);
    // Test for case 3) (UN) one value is NaN.
    // TODO: When we get support for flag values, we can reuse the above comparison.
    let cmp_uno = pos.ins().fcmp(FloatCC::Unordered, x, y);
    pos.ins().brnz(cmp_uno, uno_ebb, &[]);
    // We are now in case 2) where x and y compare EQ.
    // We need a bitwise operation to get the sign right.
    let bw_inst = pos.ins().Binary(bitwise_opc, ty, x, y).0;
    let bw_result = pos.func.dfg.first_result(bw_inst);
    // This should become a fall-through for this second most common case.
    // Recycle the original instruction as a jump.
    pos.func.dfg.replace(inst).jump(done, &[bw_result]);
    // Finally insert a label for the completion.
    pos.next_inst();
    pos.insert_ebb(done);
    cfg.recompute_ebb(pos.func, old_ebb);
    cfg.recompute_ebb(pos.func, ueq_ebb);
    cfg.recompute_ebb(pos.func, uno_ebb);
    cfg.recompute_ebb(pos.func, done);
 }