From b6b474a8c982f40adeb0e25016abb32ab58d465a Mon Sep 17 00:00:00 2001 From: Jakob Stoklund Olesen Date: Wed, 27 Sep 2017 11:02:57 -0700 Subject: [PATCH] Add Intel legalization for fmin and fmax. The native x86_fmin and x86_fmax instructions don't behave correctly for NaN inputs and when comparing +0.0 to -0.0, so we need separate branches for those cases. --- .../filetests/isa/intel/legalize-custom.cton | 19 ++++ cranelift/filetests/wasm/f32-arith.cton | 13 ++- cranelift/filetests/wasm/f64-arith.cton | 13 ++- lib/cretonne/meta/isa/intel/legalize.py | 4 + lib/cretonne/src/isa/intel/enc_tables.rs | 92 ++++++++++++++++++- 5 files changed, 136 insertions(+), 5 deletions(-) diff --git a/cranelift/filetests/isa/intel/legalize-custom.cton b/cranelift/filetests/isa/intel/legalize-custom.cton index 8c9e898586..94aa7ab1d2 100644 --- a/cranelift/filetests/isa/intel/legalize-custom.cton +++ b/cranelift/filetests/isa/intel/legalize-custom.cton @@ -78,3 +78,22 @@ ebb0(v0: f64, v1: f64, v2: i32): ; nextln: return $v3 return v3 } + +function %f32_min(f32, f32) -> f32 { +ebb0(v0: f32, v1: f32): + v2 = fmin v0, v1 + return v2 + ; check: $(vnat=$V) = x86_fmin $v0, $v1 + ; nextln: jump $(done=$EBB)($vnat) + + ; check: $(uno=$EBB): + ; nextln: $(vuno=$V) = fadd.f32 $v0, $v1 + ; nextln: jump $(done=$EBB)($vuno) + + ; check: $(ueq=$EBB): + ; check: $(veq=$V) = bor.f32 $v0, $v1 + ; nextln: jump $(done=$EBB)($veq) + + ; check: $done($v2: f32): + ; nextln: return $v2 +} diff --git a/cranelift/filetests/wasm/f32-arith.cton b/cranelift/filetests/wasm/f32-arith.cton index ddb05138e9..331f9c7fad 100644 --- a/cranelift/filetests/wasm/f32-arith.cton +++ b/cranelift/filetests/wasm/f32-arith.cton @@ -85,8 +85,17 @@ ebb0(v0: f32, v1: f32): return v2 } -; function %f32_min(f32, f32) -> f32 -; function %f32_max(f32, f32) -> f32 +function %f32_min(f32, f32) -> f32 { +ebb0(v0: f32, v1: f32): + v2 = fmin v0, v1 + return v2 +} + +function %f32_max(f32, f32) -> f32 { +ebb0(v0: f32, v1: f32): + v2 = fmax v0, v1 + return v2 +} function %f32_copysign(f32, f32) -> f32 { ebb0(v0: f32, v1: f32): diff --git a/cranelift/filetests/wasm/f64-arith.cton b/cranelift/filetests/wasm/f64-arith.cton index d67066e615..4f39fb6a5f 100644 --- a/cranelift/filetests/wasm/f64-arith.cton +++ b/cranelift/filetests/wasm/f64-arith.cton @@ -82,8 +82,17 @@ ebb0(v0: f64, v1: f64): return v2 } -; function %f64_min(f64, f64) -> f64 -; function %f64_max(f64, f64) -> f64 +function %f64_min(f64, f64) -> f64 { +ebb0(v0: f64, v1: f64): + v2 = fmin v0, v1 + return v2 +} + +function %f64_max(f64, f64) -> f64 { +ebb0(v0: f64, v1: f64): + v2 = fmax v0, v1 + return v2 +} function %f64_copysign(f64, f64) -> f64 { ebb0(v0: f64, v1: f64): diff --git a/lib/cretonne/meta/isa/intel/legalize.py b/lib/cretonne/meta/isa/intel/legalize.py index 6125dcd0e4..8d93337c69 100644 --- a/lib/cretonne/meta/isa/intel/legalize.py +++ b/lib/cretonne/meta/isa/intel/legalize.py @@ -92,3 +92,7 @@ for cc, rev_cc in [ Rtl( a << insts.fcmp(rev_cc, y, x) )) + +# We need to modify the CFG for min/max legalization. +intel_expand.custom_legalize(insts.fmin, 'expand_minmax') +intel_expand.custom_legalize(insts.fmax, 'expand_minmax') diff --git a/lib/cretonne/src/isa/intel/enc_tables.rs b/lib/cretonne/src/isa/intel/enc_tables.rs index 35ca9a9d76..4c8c428bfc 100644 --- a/lib/cretonne/src/isa/intel/enc_tables.rs +++ b/lib/cretonne/src/isa/intel/enc_tables.rs @@ -1,7 +1,9 @@ //! Encoding tables for Intel ISAs. use bitset::BitSet; -use ir; +use cursor::{Cursor, FuncCursor}; +use flowgraph::ControlFlowGraph; +use ir::{self, InstBuilder}; use isa::constraints::*; use isa::enc_tables::*; use isa::encoding::RecipeSizing; @@ -11,3 +13,91 @@ use super::registers::*; include!(concat!(env!("OUT_DIR"), "/encoding-intel.rs")); include!(concat!(env!("OUT_DIR"), "/legalize-intel.rs")); + +/// Expand the `fmin` and `fmax` instructions using the Intel `x86_fmin` and `x86_fmax` +/// instructions. +fn expand_minmax(inst: ir::Inst, func: &mut ir::Function, cfg: &mut ControlFlowGraph) { + use ir::condcodes::FloatCC; + + let (x, y, x86_opc, bitwise_opc) = match func.dfg[inst] { + ir::InstructionData::Binary { + opcode: ir::Opcode::Fmin, + args, + } => (args[0], args[1], ir::Opcode::X86Fmin, ir::Opcode::Bor), + ir::InstructionData::Binary { + opcode: ir::Opcode::Fmax, + args, + } => (args[0], args[1], ir::Opcode::X86Fmax, ir::Opcode::Band), + _ => panic!("Expected fmin/fmax: {}", func.dfg.display_inst(inst, None)), + }; + let old_ebb = func.layout.pp_ebb(inst); + + // We need to handle the following conditions, depending on how x and y compare: + // + // 1. LT or GT: The native `x86_opc` min/max instruction does what we need. + // 2. EQ: We need to use `bitwise_opc` to make sure that + // fmin(0.0, -0.0) -> -0.0 and fmax(0.0, -0.0) -> 0.0. + // 3. UN: We need to produce a quiet NaN that is canonical if the inputs are canonical. + + // EBB handling case 3) where one operand is NaN. + let uno_ebb = func.dfg.make_ebb(); + + // EBB that handles the unordered or equal cases 2) and 3). + let ueq_ebb = func.dfg.make_ebb(); + + // Final EBB with one argument representing the final result value. + let done = func.dfg.make_ebb(); + + // The basic blocks are laid out to minimize branching for the common cases: + // + // 1) One branch not taken, one jump. + // 2) One branch taken. + // 3) Two branches taken, one jump. + + // Move the `inst` result value onto the `done` EBB. + let result = func.dfg.first_result(inst); + let ty = func.dfg.value_type(result); + func.dfg.clear_results(inst); + func.dfg.attach_ebb_arg(done, result); + + // Test for case 1) ordered and not equal. + let mut pos = FuncCursor::new(func).at_inst(inst); + let cmp_ueq = pos.ins().fcmp(FloatCC::UnorderedOrEqual, x, y); + pos.ins().brnz(cmp_ueq, ueq_ebb, &[]); + + // Handle the common ordered, not equal (LT|GT) case. + let one_inst = pos.ins().Binary(x86_opc, ty, x, y).0; + let one_result = pos.func.dfg.first_result(one_inst); + pos.ins().jump(done, &[one_result]); + + // Case 3) Unordered. + // We know that at least one operand is a NaN that needs to be propagated. We simply use an + // `fadd` instruction which has the same NaN propagation semantics. + pos.insert_ebb(uno_ebb); + let uno_result = pos.ins().fadd(x, y); + pos.ins().jump(done, &[uno_result]); + + // Case 2) or 3). + pos.insert_ebb(ueq_ebb); + // Test for case 3) (UN) one value is NaN. + // TODO: When we get support for flag values, we can reuse the above comparison. + let cmp_uno = pos.ins().fcmp(FloatCC::Unordered, x, y); + pos.ins().brnz(cmp_uno, uno_ebb, &[]); + + // We are now in case 2) where x and y compare EQ. + // We need a bitwise operation to get the sign right. + let bw_inst = pos.ins().Binary(bitwise_opc, ty, x, y).0; + let bw_result = pos.func.dfg.first_result(bw_inst); + // This should become a fall-through for this second most common case. + // Recycle the original instruction as a jump. + pos.func.dfg.replace(inst).jump(done, &[bw_result]); + + // Finally insert a label for the completion. + pos.next_inst(); + pos.insert_ebb(done); + + cfg.recompute_ebb(pos.func, old_ebb); + cfg.recompute_ebb(pos.func, ueq_ebb); + cfg.recompute_ebb(pos.func, uno_ebb); + cfg.recompute_ebb(pos.func, done); +}