From 7fb6159a85ac4ca0c424bc47850b5c1a5563d97e Mon Sep 17 00:00:00 2001
From: Jakob Stoklund Olesen <jolesen@mozilla.com>
Date: Tue, 26 Sep 2017 09:54:54 -0700
Subject: [PATCH] Add Intel encodings for the fcmp instruction.

Not all floating point condition codes are directly supported by the
ucomiss/ucomisd instructions. Some inequalities need to be reversed and
eq+ne require two separate tests.
---
 .../filetests/isa/intel/binary32-float.cton   | 60 +++++++++++++++++++
 .../filetests/isa/intel/binary64-float.cton   | 60 +++++++++++++++++++
 cranelift/filetests/wasm/f32-compares.cton    | 50 ++++++++++++++++
 cranelift/filetests/wasm/f64-compares.cton    | 50 ++++++++++++++++
 lib/cretonne/meta/gen_legalizer.py            |  6 +-
 lib/cretonne/meta/isa/intel/defs.py           | 13 ++++
 lib/cretonne/meta/isa/intel/encodings.py      | 22 +++++--
 lib/cretonne/meta/isa/intel/legalize.py       | 38 +++++++++++-
 lib/cretonne/meta/isa/intel/recipes.py        | 53 +++++++++++++++-
 9 files changed, 342 insertions(+), 10 deletions(-)
 create mode 100644 cranelift/filetests/wasm/f32-compares.cton
 create mode 100644 cranelift/filetests/wasm/f64-compares.cton

diff --git a/cranelift/filetests/isa/intel/binary32-float.cton b/cranelift/filetests/isa/intel/binary32-float.cton
index 4fa083b3de..b0a2c83c8f 100644
--- a/cranelift/filetests/isa/intel/binary32-float.cton
+++ b/cranelift/filetests/isa/intel/binary32-float.cton
@@ -157,6 +157,36 @@ ebb0:
     ; asm: movd 1032(%esp), %xmm2
     [-,%xmm2]           v211 = fill v201                        ; bin: 66 0f 6e 94 24 00000408
 
+    ; Comparisons.
+    ;
+    ; Only `supported_floatccs` are tested here. Others are handled by
+    ; legalization patterns.
+
+    ; asm: ucomiss %xmm2, %xmm5
+    ; asm: setnp %bl
+    [-,%rbx]            v300 = fcmp ord v10, v11                ; bin: 0f 2e ea 0f 9b c3
+    ; asm: ucomiss %xmm5, %xmm2
+    ; asm: setp %bl
+    [-,%rbx]            v301 = fcmp uno v11, v10                ; bin: 0f 2e d5 0f 9a c3
+    ; asm: ucomiss %xmm2, %xmm5
+    ; asm: setne %dl
+    [-,%rdx]            v302 = fcmp one v10, v11                ; bin: 0f 2e ea 0f 95 c2
+    ; asm: ucomiss %xmm5, %xmm2
+    ; asm: sete %dl
+    [-,%rdx]            v303 = fcmp ueq v11, v10                ; bin: 0f 2e d5 0f 94 c2
+    ; asm: ucomiss %xmm2, %xmm5
+    ; asm: seta %bl
+    [-,%rbx]            v304 = fcmp gt v10, v11                 ; bin: 0f 2e ea 0f 97 c3
+    ; asm: ucomiss %xmm5, %xmm2
+    ; asm: setae %bl
+    [-,%rbx]            v305 = fcmp ge v11, v10                 ; bin: 0f 2e d5 0f 93 c3
+    ; asm: ucomiss %xmm2, %xmm5
+    ; asm: setb %dl
+    [-,%rdx]            v306 = fcmp ult v10, v11                ; bin: 0f 2e ea 0f 92 c2
+    ; asm: ucomiss %xmm5, %xmm2
+    ; asm: setbe %dl
+    [-,%rdx]            v307 = fcmp ule v11, v10                ; bin: 0f 2e d5 0f 96 c2
+
     return
 }
 
@@ -302,5 +332,35 @@ ebb0:
     ; asm: movq 1032(%esp), %xmm2
     [-,%xmm2]           v211 = fill v201                        ; bin: f3 0f 7e 94 24 00000408
 
+    ; Comparisons.
+    ;
+    ; Only `supported_floatccs` are tested here. Others are handled by
+    ; legalization patterns.
+
+    ; asm: ucomisd %xmm2, %xmm5
+    ; asm: setnp %bl
+    [-,%rbx]            v300 = fcmp ord v10, v11                ; bin: 66 0f 2e ea 0f 9b c3
+    ; asm: ucomisd %xmm5, %xmm2
+    ; asm: setp %bl
+    [-,%rbx]            v301 = fcmp uno v11, v10                ; bin: 66 0f 2e d5 0f 9a c3
+    ; asm: ucomisd %xmm2, %xmm5
+    ; asm: setne %dl
+    [-,%rdx]            v302 = fcmp one v10, v11                ; bin: 66 0f 2e ea 0f 95 c2
+    ; asm: ucomisd %xmm5, %xmm2
+    ; asm: sete %dl
+    [-,%rdx]            v303 = fcmp ueq v11, v10                ; bin: 66 0f 2e d5 0f 94 c2
+    ; asm: ucomisd %xmm2, %xmm5
+    ; asm: seta %bl
+    [-,%rbx]            v304 = fcmp gt v10, v11                 ; bin: 66 0f 2e ea 0f 97 c3
+    ; asm: ucomisd %xmm5, %xmm2
+    ; asm: setae %bl
+    [-,%rbx]            v305 = fcmp ge v11, v10                 ; bin: 66 0f 2e d5 0f 93 c3
+    ; asm: ucomisd %xmm2, %xmm5
+    ; asm: setb %dl
+    [-,%rdx]            v306 = fcmp ult v10, v11                ; bin: 66 0f 2e ea 0f 92 c2
+    ; asm: ucomisd %xmm5, %xmm2
+    ; asm: setbe %dl
+    [-,%rdx]            v307 = fcmp ule v11, v10                ; bin: 66 0f 2e d5 0f 96 c2
+
     return
 }
diff --git a/cranelift/filetests/isa/intel/binary64-float.cton b/cranelift/filetests/isa/intel/binary64-float.cton
index c8d4df232f..542a712f26 100644
--- a/cranelift/filetests/isa/intel/binary64-float.cton
+++ b/cranelift/filetests/isa/intel/binary64-float.cton
@@ -166,6 +166,36 @@ ebb0:
     ; asm: movd 1032(%rsp), %xmm10
     [-,%xmm10]          v211 = fill v201                        ; bin: 66 44 0f 6e 94 24 00000408
 
+    ; Comparisons.
+    ;
+    ; Only `supported_floatccs` are tested here. Others are handled by
+    ; legalization patterns.
+
+    ; asm: ucomiss %xmm10, %xmm5
+    ; asm: setnp %bl
+    [-,%rbx]            v300 = fcmp ord v10, v11                ; bin: 41 0f 2e ea 0f 9b c3
+    ; asm: ucomiss %xmm5, %xmm10
+    ; asm: setp %bl
+    [-,%rbx]            v301 = fcmp uno v11, v10                ; bin: 44 0f 2e d5 0f 9a c3
+    ; asm: ucomiss %xmm10, %xmm5
+    ; asm: setne %dl
+    [-,%rdx]            v302 = fcmp one v10, v11                ; bin: 41 0f 2e ea 0f 95 c2
+    ; asm: ucomiss %xmm5, %xmm10
+    ; asm: sete %dl
+    [-,%rdx]            v303 = fcmp ueq v11, v10                ; bin: 44 0f 2e d5 0f 94 c2
+    ; asm: ucomiss %xmm10, %xmm5
+    ; asm: seta %bl
+    [-,%rbx]            v304 = fcmp gt v10, v11                 ; bin: 41 0f 2e ea 0f 97 c3
+    ; asm: ucomiss %xmm5, %xmm10
+    ; asm: setae %bl
+    [-,%rbx]            v305 = fcmp ge v11, v10                 ; bin: 44 0f 2e d5 0f 93 c3
+    ; asm: ucomiss %xmm10, %xmm5
+    ; asm: setb %dl
+    [-,%rdx]            v306 = fcmp ult v10, v11                ; bin: 41 0f 2e ea 0f 92 c2
+    ; asm: ucomiss %xmm5, %xmm10
+    ; asm: setbe %dl
+    [-,%rdx]            v307 = fcmp ule v11, v10                ; bin: 44 0f 2e d5 0f 96 c2
+
     return
 }
 
@@ -326,5 +356,35 @@ ebb0:
     ; asm: movq 1032(%rsp), %xmm10
     [-,%xmm10]          v211 = fill v201                        ; bin: f3 44 0f 7e 94 24 00000408
 
+    ; Comparisons.
+    ;
+    ; Only `supported_floatccs` are tested here. Others are handled by
+    ; legalization patterns.
+
+    ; asm: ucomisd %xmm10, %xmm5
+    ; asm: setnp %bl
+    [-,%rbx]            v300 = fcmp ord v10, v11                ; bin: 66 41 0f 2e ea 0f 9b c3
+    ; asm: ucomisd %xmm5, %xmm10
+    ; asm: setp %bl
+    [-,%rbx]            v301 = fcmp uno v11, v10                ; bin: 66 44 0f 2e d5 0f 9a c3
+    ; asm: ucomisd %xmm10, %xmm5
+    ; asm: setne %dl
+    [-,%rdx]            v302 = fcmp one v10, v11                ; bin: 66 41 0f 2e ea 0f 95 c2
+    ; asm: ucomisd %xmm5, %xmm10
+    ; asm: sete %dl
+    [-,%rdx]            v303 = fcmp ueq v11, v10                ; bin: 66 44 0f 2e d5 0f 94 c2
+    ; asm: ucomisd %xmm10, %xmm5
+    ; asm: seta %bl
+    [-,%rbx]            v304 = fcmp gt v10, v11                 ; bin: 66 41 0f 2e ea 0f 97 c3
+    ; asm: ucomisd %xmm5, %xmm10
+    ; asm: setae %bl
+    [-,%rbx]            v305 = fcmp ge v11, v10                 ; bin: 66 44 0f 2e d5 0f 93 c3
+    ; asm: ucomisd %xmm10, %xmm5
+    ; asm: setb %dl
+    [-,%rdx]            v306 = fcmp ult v10, v11                ; bin: 66 41 0f 2e ea 0f 92 c2
+    ; asm: ucomisd %xmm5, %xmm10
+    ; asm: setbe %dl
+    [-,%rdx]            v307 = fcmp ule v11, v10                ; bin: 66 44 0f 2e d5 0f 96 c2
+
     return
 }
diff --git a/cranelift/filetests/wasm/f32-compares.cton b/cranelift/filetests/wasm/f32-compares.cton
new file mode 100644
index 0000000000..560b86ebcb
--- /dev/null
+++ b/cranelift/filetests/wasm/f32-compares.cton
@@ -0,0 +1,50 @@
+; Test code generation for WebAssembly f32 comparison operators.
+test compile
+
+set is_64bit=0
+isa intel haswell
+
+set is_64bit=1
+isa intel haswell
+
+function %f32_eq(f32, f32) -> i32 {
+ebb0(v0: f32, v1: f32):
+    v2 = fcmp eq v0, v1
+    v3 = bint.i32 v2
+    return v3
+}
+
+function %f32_ne(f32, f32) -> i32 {
+ebb0(v0: f32, v1: f32):
+    v2 = fcmp ne v0, v1
+    v3 = bint.i32 v2
+    return v3
+}
+
+function %f32_lt(f32, f32) -> i32 {
+ebb0(v0: f32, v1: f32):
+    v2 = fcmp lt v0, v1
+    v3 = bint.i32 v2
+    return v3
+}
+
+function %f32_gt(f32, f32) -> i32 {
+ebb0(v0: f32, v1: f32):
+    v2 = fcmp gt v0, v1
+    v3 = bint.i32 v2
+    return v3
+}
+
+function %f32_le(f32, f32) -> i32 {
+ebb0(v0: f32, v1: f32):
+    v2 = fcmp le v0, v1
+    v3 = bint.i32 v2
+    return v3
+}
+
+function %f32_ge(f32, f32) -> i32 {
+ebb0(v0: f32, v1: f32):
+    v2 = fcmp ge v0, v1
+    v3 = bint.i32 v2
+    return v3
+}
diff --git a/cranelift/filetests/wasm/f64-compares.cton b/cranelift/filetests/wasm/f64-compares.cton
new file mode 100644
index 0000000000..78a260ef27
--- /dev/null
+++ b/cranelift/filetests/wasm/f64-compares.cton
@@ -0,0 +1,50 @@
+; Test code generation for WebAssembly f64 comparison operators.
+test compile
+
+set is_64bit=0
+isa intel haswell
+
+set is_64bit=1
+isa intel haswell
+
+function %f64_eq(f64, f64) -> i32 {
+ebb0(v0: f64, v1: f64):
+    v2 = fcmp eq v0, v1
+    v3 = bint.i32 v2
+    return v3
+}
+
+function %f64_ne(f64, f64) -> i32 {
+ebb0(v0: f64, v1: f64):
+    v2 = fcmp ne v0, v1
+    v3 = bint.i32 v2
+    return v3
+}
+
+function %f64_lt(f64, f64) -> i32 {
+ebb0(v0: f64, v1: f64):
+    v2 = fcmp lt v0, v1
+    v3 = bint.i32 v2
+    return v3
+}
+
+function %f64_gt(f64, f64) -> i32 {
+ebb0(v0: f64, v1: f64):
+    v2 = fcmp gt v0, v1
+    v3 = bint.i32 v2
+    return v3
+}
+
+function %f64_le(f64, f64) -> i32 {
+ebb0(v0: f64, v1: f64):
+    v2 = fcmp le v0, v1
+    v3 = bint.i32 v2
+    return v3
+}
+
+function %f64_ge(f64, f64) -> i32 {
+ebb0(v0: f64, v1: f64):
+    v2 = fcmp ge v0, v1
+    v3 = bint.i32 v2
+    return v3
+}
diff --git a/lib/cretonne/meta/gen_legalizer.py b/lib/cretonne/meta/gen_legalizer.py
index 0d8407bb30..1b39918658 100644
--- a/lib/cretonne/meta/gen_legalizer.py
+++ b/lib/cretonne/meta/gen_legalizer.py
@@ -169,10 +169,12 @@ def unwrap_inst(iref, node, fmt):
     iform = expr.inst.format
     nvops = iform.num_value_operands
 
-    # The tuple of locals we're extracting is `expr.args`.
+    # The tuple of locals to extract is the `Var` instances in `expr.args`.
+    arg_names = tuple(
+            arg.name if isinstance(arg, Var) else '_' for arg in expr.args)
     with fmt.indented(
             'let ({}, predicate) = if let ir::InstructionData::{} {{'
-            .format(', '.join(map(str, expr.args)), iform.name), '};'):
+            .format(', '.join(map(str, arg_names)), iform.name), '};'):
         # Fields are encoded directly.
         for f in iform.imm_fields:
             fmt.line('{},'.format(f.member))
diff --git a/lib/cretonne/meta/isa/intel/defs.py b/lib/cretonne/meta/isa/intel/defs.py
index ad13741ebc..d5bb0b5a1f 100644
--- a/lib/cretonne/meta/isa/intel/defs.py
+++ b/lib/cretonne/meta/isa/intel/defs.py
@@ -7,9 +7,22 @@ from __future__ import absolute_import
 from cdsl.isa import TargetISA, CPUMode
 import base.instructions
 from . import instructions as x86
+from base.immediates import floatcc
 
 ISA = TargetISA('intel', [base.instructions.GROUP, x86.GROUP])
 
 # CPU modes for 32-bit and 64-bit operation.
 I64 = CPUMode('I64', ISA)
 I32 = CPUMode('I32', ISA)
+
+# The set of floating point condition codes that are directly supported.
+# Other condition codes need to be reversed or expressed as two tests.
+supported_floatccs = [
+        floatcc.ord,
+        floatcc.uno,
+        floatcc.one,
+        floatcc.ueq,
+        floatcc.gt,
+        floatcc.ge,
+        floatcc.ult,
+        floatcc.ule]
diff --git a/lib/cretonne/meta/isa/intel/encodings.py b/lib/cretonne/meta/isa/intel/encodings.py
index 6d46111d7f..c7f9a7c0b6 100644
--- a/lib/cretonne/meta/isa/intel/encodings.py
+++ b/lib/cretonne/meta/isa/intel/encodings.py
@@ -26,8 +26,8 @@ I32.legalize_type(
         default=narrow,
         b1=expand,
         i32=intel_expand,
-        f32=expand,
-        f64=expand)
+        f32=intel_expand,
+        f64=intel_expand)
 
 I64.legalize_monomorphic(expand)
 I64.legalize_type(
@@ -35,8 +35,8 @@ I64.legalize_type(
         b1=expand,
         i32=intel_expand,
         i64=intel_expand,
-        f32=expand,
-        f64=expand)
+        f32=intel_expand,
+        f64=intel_expand)
 
 
 #
@@ -106,6 +106,13 @@ for inst,           opc in [
         (base.bxor, 0x31)]:
     enc_i32_i64(inst, r.rr, opc)
 
+# Also add `b1` encodings for the logic instructions.
+# TODO: Should this be done with 8-bit instructions? It would improve
+# partial register dependencies.
+enc_flt(base.band.b1, r.rr, 0x21)
+enc_flt(base.bor.b1,  r.rr, 0x09)
+enc_flt(base.bxor.b1, r.rr, 0x31)
+
 enc_i32_i64(base.imul, r.rrx, 0x0f, 0xaf)
 enc_i32_i64(x86.sdivmodx, r.div, 0xf7, rrr=7)
 enc_i32_i64(x86.udivmodx, r.div, 0xf7, rrr=6)
@@ -391,3 +398,10 @@ for inst,               opc in [
         (base.bxor,     0x57)]:
     enc_flt(inst.f32, r.frm, 0x0f, opc)
     enc_flt(inst.f64, r.frm, 0x0f, opc)
+
+# Comparisons.
+#
+# This only covers the condition codes in `supported_floatccs`, the rest are
+# handled by legalization patterns.
+enc_flt(base.fcmp.f32, r.fcscc, 0x0f, 0x2e)
+enc_flt(base.fcmp.f64, r.fcscc, 0x66, 0x0f, 0x2e)
diff --git a/lib/cretonne/meta/isa/intel/legalize.py b/lib/cretonne/meta/isa/intel/legalize.py
index cc46846d81..6125dcd0e4 100644
--- a/lib/cretonne/meta/isa/intel/legalize.py
+++ b/lib/cretonne/meta/isa/intel/legalize.py
@@ -4,7 +4,7 @@ Custom legalization patterns for Intel.
 from __future__ import absolute_import
 from cdsl.ast import Var
 from cdsl.xform import Rtl, XFormGroup
-from base.immediates import imm64
+from base.immediates import imm64, floatcc
 from base.types import i32, i64
 from base import legalize as shared
 from base import instructions as insts
@@ -25,6 +25,8 @@ dead = Var('dead')
 x = Var('x')
 xhi = Var('xhi')
 y = Var('y')
+a1 = Var('a1')
+a2 = Var('a2')
 
 #
 # Division and remainder.
@@ -56,3 +58,37 @@ for ty in [i32, i64]:
                 xhi << insts.sshr_imm(x, imm64(ty.lane_bits() - 1)),
                 (dead, a) << x86.sdivmodx(x, xhi, y)
             ))
+
+# Floating point condition codes.
+#
+# The 8 condition codes in `supported_floatccs` are directly supported by a
+# `ucomiss` or `ucomisd` instruction. The remaining codes need legalization
+# patterns.
+
+# Equality needs an explicit `ord` test which checks the parity bit.
+intel_expand.legalize(
+        a << insts.fcmp(floatcc.eq, x, y),
+        Rtl(
+            a1 << insts.fcmp(floatcc.ord, x, y),
+            a2 << insts.fcmp(floatcc.ueq, x, y),
+            a << insts.band(a1, a2)
+        ))
+intel_expand.legalize(
+        a << insts.fcmp(floatcc.ne, x, y),
+        Rtl(
+            a1 << insts.fcmp(floatcc.uno, x, y),
+            a2 << insts.fcmp(floatcc.one, x, y),
+            a << insts.bor(a1, a2)
+        ))
+
+# Inequalities that need to be reversed.
+for cc,               rev_cc in [
+        (floatcc.lt,  floatcc.gt),
+        (floatcc.le,  floatcc.ge),
+        (floatcc.ugt, floatcc.ult),
+        (floatcc.uge, floatcc.ule)]:
+    intel_expand.legalize(
+            a << insts.fcmp(cc, x, y),
+            Rtl(
+                a << insts.fcmp(rev_cc, y, x)
+            ))
diff --git a/lib/cretonne/meta/isa/intel/recipes.py b/lib/cretonne/meta/isa/intel/recipes.py
index 04c7c91891..74391b0746 100644
--- a/lib/cretonne/meta/isa/intel/recipes.py
+++ b/lib/cretonne/meta/isa/intel/recipes.py
@@ -3,12 +3,13 @@ Intel Encoding recipes.
 """
 from __future__ import absolute_import
 from cdsl.isa import EncRecipe
-from cdsl.predicates import IsSignedInt, IsEqual
+from cdsl.predicates import IsSignedInt, IsEqual, Or
 from base.formats import Unary, UnaryImm, Binary, BinaryImm, MultiAry
 from base.formats import Trap, Call, IndirectCall, Store, Load
-from base.formats import IntCompare
+from base.formats import IntCompare, FloatCompare
 from base.formats import RegMove, Ternary, Jump, Branch, FuncAddr
 from .registers import GPR, ABCD, FPR, GPR8, FPR8, StackGPR32, StackFPR32
+from .defs import supported_floatccs
 
 try:
     from typing import Tuple, Dict, Sequence  # noqa
@@ -696,7 +697,7 @@ t8jccb_abcd = TailRecipe(
 # This bandaid macro doesn't support a REX prefix for the final `setCC`
 # instruction, so it is limited to the `ABCD` register class for booleans.
 icscc = TailRecipe(
-        'cscc', IntCompare, size=1 + 3, ins=(GPR, GPR), outs=ABCD,
+        'icscc', IntCompare, size=1 + 3, ins=(GPR, GPR), outs=ABCD,
         emit='''
         // Comparison instruction.
         PUT_OP(bits, rex2(in_reg0, in_reg1), sink);
@@ -719,3 +720,49 @@ icscc = TailRecipe(
         sink.put1(setcc);
         modrm_rr(out_reg0, 0, sink);
         ''')
+
+
+# Same thing for floating point: a compare followed by `setCC`. The recipe's
+# instruction predicate only accepts the condition codes in
+# `supported_floatccs`.
+#
+# The ucomiss/ucomisd instructions set the EFLAGS bits ZF/PF/CF like this:
+#
+#    ZPC OSA
+# UN 111 000
+# GT 000 000
+# LT 001 000
+# EQ 100 000
+#
+# Not all floating point condition codes are supported.
+fcscc = TailRecipe(
+        'fcscc', FloatCompare, size=1 + 3, ins=(FPR, FPR), outs=ABCD,
+        instp=Or(*(IsEqual(FloatCompare.cond, cc)
+                   for cc in supported_floatccs)),
+        emit='''
+        // Comparison instruction.
+        PUT_OP(bits, rex2(in_reg1, in_reg0), sink);
+        modrm_rr(in_reg1, in_reg0, sink);
+        // `setCC` instruction, no REX.
+        use ir::condcodes::FloatCC::*;
+        let setcc = match cond {
+            Ordered                    => 0x9b, // EQ|LT|GT => setnp (P=0)
+            Unordered                  => 0x9a, // UN       => setp  (P=1)
+            OrderedNotEqual            => 0x95, // LT|GT    => setne (Z=0),
+            UnorderedOrEqual           => 0x94, // UN|EQ    => sete  (Z=1)
+            GreaterThan                => 0x97, // GT       => seta  (C=0&Z=0)
+            GreaterThanOrEqual         => 0x93, // GT|EQ    => setae (C=0)
+            UnorderedOrLessThan        => 0x92, // UN|LT    => setb  (C=1)
+            UnorderedOrLessThanOrEqual => 0x96, // UN|LT|EQ => setbe (Z=1|C=1)
+            Equal |                       // EQ
+            NotEqual |                    // UN|LT|GT
+            LessThan |                    // LT
+            LessThanOrEqual |             // LT|EQ
+            UnorderedOrGreaterThan |      // UN|GT
+            UnorderedOrGreaterThanOrEqual // UN|GT|EQ
+            => panic!("{} not supported by fcscc", cond),
+        };
+        sink.put1(0x0f);
+        sink.put1(setcc);
+        modrm_rr(out_reg0, 0, sink);
+        ''')