Add Intel encodings for the fcmp instruction.

Not all floating point condition codes are directly supported by the
ucomiss/ucomisd instructions. Some inequalities need to be reversed and
eq+ne require two separate tests.
This commit is contained in:
Jakob Stoklund Olesen
2017-09-26 09:54:54 -07:00
parent 79968a2325
commit 7fb6159a85
9 changed files with 342 additions and 10 deletions

View File

@@ -157,6 +157,36 @@ ebb0:
; asm: movd 1032(%esp), %xmm2
[-,%xmm2] v211 = fill v201 ; bin: 66 0f 6e 94 24 00000408
; Comparisons.
;
; Only `supported_floatccs` are tested here. Others are handled by
; legalization patterns.
; asm: ucomiss %xmm2, %xmm5
; asm: setnp %bl
[-,%rbx] v300 = fcmp ord v10, v11 ; bin: 0f 2e ea 0f 9b c3
; asm: ucomiss %xmm5, %xmm2
; asm: setp %bl
[-,%rbx] v301 = fcmp uno v11, v10 ; bin: 0f 2e d5 0f 9a c3
; asm: ucomiss %xmm2, %xmm5
; asm: setne %dl
[-,%rdx] v302 = fcmp one v10, v11 ; bin: 0f 2e ea 0f 95 c2
; asm: ucomiss %xmm5, %xmm2
; asm: sete %dl
[-,%rdx] v303 = fcmp ueq v11, v10 ; bin: 0f 2e d5 0f 94 c2
; asm: ucomiss %xmm2, %xmm5
; asm: seta %bl
[-,%rbx] v304 = fcmp gt v10, v11 ; bin: 0f 2e ea 0f 97 c3
; asm: ucomiss %xmm5, %xmm2
; asm: setae %bl
[-,%rbx] v305 = fcmp ge v11, v10 ; bin: 0f 2e d5 0f 93 c3
; asm: ucomiss %xmm2, %xmm5
; asm: setb %dl
[-,%rdx] v306 = fcmp ult v10, v11 ; bin: 0f 2e ea 0f 92 c2
; asm: ucomiss %xmm5, %xmm2
; asm: setbe %dl
[-,%rdx] v307 = fcmp ule v11, v10 ; bin: 0f 2e d5 0f 96 c2
return
}
@@ -302,5 +332,35 @@ ebb0:
; asm: movq 1032(%esp), %xmm2
[-,%xmm2] v211 = fill v201 ; bin: f3 0f 7e 94 24 00000408
; Comparisons.
;
; Only `supported_floatccs` are tested here. Others are handled by
; legalization patterns.
; asm: ucomisd %xmm2, %xmm5
; asm: setnp %bl
[-,%rbx] v300 = fcmp ord v10, v11 ; bin: 66 0f 2e ea 0f 9b c3
; asm: ucomisd %xmm5, %xmm2
; asm: setp %bl
[-,%rbx] v301 = fcmp uno v11, v10 ; bin: 66 0f 2e d5 0f 9a c3
; asm: ucomisd %xmm2, %xmm5
; asm: setne %dl
[-,%rdx] v302 = fcmp one v10, v11 ; bin: 66 0f 2e ea 0f 95 c2
; asm: ucomisd %xmm5, %xmm2
; asm: sete %dl
[-,%rdx] v303 = fcmp ueq v11, v10 ; bin: 66 0f 2e d5 0f 94 c2
; asm: ucomisd %xmm2, %xmm5
; asm: seta %bl
[-,%rbx] v304 = fcmp gt v10, v11 ; bin: 66 0f 2e ea 0f 97 c3
; asm: ucomisd %xmm5, %xmm2
; asm: setae %bl
[-,%rbx] v305 = fcmp ge v11, v10 ; bin: 66 0f 2e d5 0f 93 c3
; asm: ucomisd %xmm2, %xmm5
; asm: setb %dl
[-,%rdx] v306 = fcmp ult v10, v11 ; bin: 66 0f 2e ea 0f 92 c2
; asm: ucomisd %xmm5, %xmm2
; asm: setbe %dl
[-,%rdx] v307 = fcmp ule v11, v10 ; bin: 66 0f 2e d5 0f 96 c2
return
}

View File

@@ -166,6 +166,36 @@ ebb0:
; asm: movd 1032(%rsp), %xmm10
[-,%xmm10] v211 = fill v201 ; bin: 66 44 0f 6e 94 24 00000408
; Comparisons.
;
; Only `supported_floatccs` are tested here. Others are handled by
; legalization patterns.
; asm: ucomiss %xmm10, %xmm5
; asm: setnp %bl
[-,%rbx] v300 = fcmp ord v10, v11 ; bin: 41 0f 2e ea 0f 9b c3
; asm: ucomiss %xmm5, %xmm10
; asm: setp %bl
[-,%rbx] v301 = fcmp uno v11, v10 ; bin: 44 0f 2e d5 0f 9a c3
; asm: ucomiss %xmm10, %xmm5
; asm: setne %dl
[-,%rdx] v302 = fcmp one v10, v11 ; bin: 41 0f 2e ea 0f 95 c2
; asm: ucomiss %xmm5, %xmm10
; asm: sete %dl
[-,%rdx] v303 = fcmp ueq v11, v10 ; bin: 44 0f 2e d5 0f 94 c2
; asm: ucomiss %xmm10, %xmm5
; asm: seta %bl
[-,%rbx] v304 = fcmp gt v10, v11 ; bin: 41 0f 2e ea 0f 97 c3
; asm: ucomiss %xmm5, %xmm10
; asm: setae %bl
[-,%rbx] v305 = fcmp ge v11, v10 ; bin: 44 0f 2e d5 0f 93 c3
; asm: ucomiss %xmm10, %xmm5
; asm: setb %dl
[-,%rdx] v306 = fcmp ult v10, v11 ; bin: 41 0f 2e ea 0f 92 c2
; asm: ucomiss %xmm5, %xmm10
; asm: setbe %dl
[-,%rdx] v307 = fcmp ule v11, v10 ; bin: 44 0f 2e d5 0f 96 c2
return
}
@@ -326,5 +356,35 @@ ebb0:
; asm: movq 1032(%rsp), %xmm10
[-,%xmm10] v211 = fill v201 ; bin: f3 44 0f 7e 94 24 00000408
; Comparisons.
;
; Only `supported_floatccs` are tested here. Others are handled by
; legalization patterns.
; asm: ucomisd %xmm10, %xmm5
; asm: setnp %bl
[-,%rbx] v300 = fcmp ord v10, v11 ; bin: 66 41 0f 2e ea 0f 9b c3
; asm: ucomisd %xmm5, %xmm10
; asm: setp %bl
[-,%rbx] v301 = fcmp uno v11, v10 ; bin: 66 44 0f 2e d5 0f 9a c3
; asm: ucomisd %xmm10, %xmm5
; asm: setne %dl
[-,%rdx] v302 = fcmp one v10, v11 ; bin: 66 41 0f 2e ea 0f 95 c2
; asm: ucomisd %xmm5, %xmm10
; asm: sete %dl
[-,%rdx] v303 = fcmp ueq v11, v10 ; bin: 66 44 0f 2e d5 0f 94 c2
; asm: ucomisd %xmm10, %xmm5
; asm: seta %bl
[-,%rbx] v304 = fcmp gt v10, v11 ; bin: 66 41 0f 2e ea 0f 97 c3
; asm: ucomisd %xmm5, %xmm10
; asm: setae %bl
[-,%rbx] v305 = fcmp ge v11, v10 ; bin: 66 44 0f 2e d5 0f 93 c3
; asm: ucomisd %xmm10, %xmm5
; asm: setb %dl
[-,%rdx] v306 = fcmp ult v10, v11 ; bin: 66 41 0f 2e ea 0f 92 c2
; asm: ucomisd %xmm5, %xmm10
; asm: setbe %dl
[-,%rdx] v307 = fcmp ule v11, v10 ; bin: 66 44 0f 2e d5 0f 96 c2
return
}

View File

@@ -0,0 +1,50 @@
; Test code generation for WebAssembly f32 comparison operators.
test compile
set is_64bit=0
isa intel haswell
set is_64bit=1
isa intel haswell
function %f32_eq(f32, f32) -> i32 {
ebb0(v0: f32, v1: f32):
v2 = fcmp eq v0, v1
v3 = bint.i32 v2
return v3
}
function %f32_ne(f32, f32) -> i32 {
ebb0(v0: f32, v1: f32):
v2 = fcmp ne v0, v1
v3 = bint.i32 v2
return v3
}
function %f32_lt(f32, f32) -> i32 {
ebb0(v0: f32, v1: f32):
v2 = fcmp lt v0, v1
v3 = bint.i32 v2
return v3
}
function %f32_gt(f32, f32) -> i32 {
ebb0(v0: f32, v1: f32):
v2 = fcmp gt v0, v1
v3 = bint.i32 v2
return v3
}
function %f32_le(f32, f32) -> i32 {
ebb0(v0: f32, v1: f32):
v2 = fcmp le v0, v1
v3 = bint.i32 v2
return v3
}
function %f32_ge(f32, f32) -> i32 {
ebb0(v0: f32, v1: f32):
v2 = fcmp ge v0, v1
v3 = bint.i32 v2
return v3
}

View File

@@ -0,0 +1,50 @@
; Test code generation for WebAssembly f64 comparison operators.
test compile
set is_64bit=0
isa intel haswell
set is_64bit=1
isa intel haswell
function %f64_eq(f64, f64) -> i32 {
ebb0(v0: f64, v1: f64):
v2 = fcmp eq v0, v1
v3 = bint.i32 v2
return v3
}
function %f64_ne(f64, f64) -> i32 {
ebb0(v0: f64, v1: f64):
v2 = fcmp ne v0, v1
v3 = bint.i32 v2
return v3
}
function %f64_lt(f64, f64) -> i32 {
ebb0(v0: f64, v1: f64):
v2 = fcmp lt v0, v1
v3 = bint.i32 v2
return v3
}
function %f64_gt(f64, f64) -> i32 {
ebb0(v0: f64, v1: f64):
v2 = fcmp gt v0, v1
v3 = bint.i32 v2
return v3
}
function %f64_le(f64, f64) -> i32 {
ebb0(v0: f64, v1: f64):
v2 = fcmp le v0, v1
v3 = bint.i32 v2
return v3
}
function %f64_ge(f64, f64) -> i32 {
ebb0(v0: f64, v1: f64):
v2 = fcmp ge v0, v1
v3 = bint.i32 v2
return v3
}

View File

@@ -169,10 +169,12 @@ def unwrap_inst(iref, node, fmt):
iform = expr.inst.format
nvops = iform.num_value_operands
# The tuple of locals we're extracting is `expr.args`.
# The tuple of locals to extract is the `Var` instances in `expr.args`.
arg_names = tuple(
arg.name if isinstance(arg, Var) else '_' for arg in expr.args)
with fmt.indented(
'let ({}, predicate) = if let ir::InstructionData::{} {{'
.format(', '.join(map(str, expr.args)), iform.name), '};'):
.format(', '.join(map(str, arg_names)), iform.name), '};'):
# Fields are encoded directly.
for f in iform.imm_fields:
fmt.line('{},'.format(f.member))

View File

@@ -7,9 +7,22 @@ from __future__ import absolute_import
from cdsl.isa import TargetISA, CPUMode
import base.instructions
from . import instructions as x86
from base.immediates import floatcc
ISA = TargetISA('intel', [base.instructions.GROUP, x86.GROUP])
# CPU modes for 32-bit and 64-bit operation.
I64 = CPUMode('I64', ISA)
I32 = CPUMode('I32', ISA)
# The set of floating point condition codes that are directly supported.
# Other condition codes need to be reversed or expressed as two tests.
supported_floatccs = [
floatcc.ord,
floatcc.uno,
floatcc.one,
floatcc.ueq,
floatcc.gt,
floatcc.ge,
floatcc.ult,
floatcc.ule]

View File

@@ -26,8 +26,8 @@ I32.legalize_type(
default=narrow,
b1=expand,
i32=intel_expand,
f32=expand,
f64=expand)
f32=intel_expand,
f64=intel_expand)
I64.legalize_monomorphic(expand)
I64.legalize_type(
@@ -35,8 +35,8 @@ I64.legalize_type(
b1=expand,
i32=intel_expand,
i64=intel_expand,
f32=expand,
f64=expand)
f32=intel_expand,
f64=intel_expand)
#
@@ -106,6 +106,13 @@ for inst, opc in [
(base.bxor, 0x31)]:
enc_i32_i64(inst, r.rr, opc)
# Also add `b1` encodings for the logic instructions.
# TODO: Should this be done with 8-bit instructions? It would improve
# partial register dependencies.
enc_flt(base.band.b1, r.rr, 0x21)
enc_flt(base.bor.b1, r.rr, 0x09)
enc_flt(base.bxor.b1, r.rr, 0x31)
enc_i32_i64(base.imul, r.rrx, 0x0f, 0xaf)
enc_i32_i64(x86.sdivmodx, r.div, 0xf7, rrr=7)
enc_i32_i64(x86.udivmodx, r.div, 0xf7, rrr=6)
@@ -391,3 +398,10 @@ for inst, opc in [
(base.bxor, 0x57)]:
enc_flt(inst.f32, r.frm, 0x0f, opc)
enc_flt(inst.f64, r.frm, 0x0f, opc)
# Comparisons.
#
# This only covers the condition codes in `supported_floatccs`, the rest are
# handled by legalization patterns.
enc_flt(base.fcmp.f32, r.fcscc, 0x0f, 0x2e)
enc_flt(base.fcmp.f64, r.fcscc, 0x66, 0x0f, 0x2e)

View File

@@ -4,7 +4,7 @@ Custom legalization patterns for Intel.
from __future__ import absolute_import
from cdsl.ast import Var
from cdsl.xform import Rtl, XFormGroup
from base.immediates import imm64
from base.immediates import imm64, floatcc
from base.types import i32, i64
from base import legalize as shared
from base import instructions as insts
@@ -25,6 +25,8 @@ dead = Var('dead')
x = Var('x')
xhi = Var('xhi')
y = Var('y')
a1 = Var('a1')
a2 = Var('a2')
#
# Division and remainder.
@@ -56,3 +58,37 @@ for ty in [i32, i64]:
xhi << insts.sshr_imm(x, imm64(ty.lane_bits() - 1)),
(dead, a) << x86.sdivmodx(x, xhi, y)
))
# Floating point condition codes.
#
# The 8 condition codes in `supported_floatccs` are directly supported by a
# `ucomiss` or `ucomisd` instruction. The remaining codes need legalization
# patterns.
# Equality needs an explicit `ord` test which checks the parity bit.
intel_expand.legalize(
a << insts.fcmp(floatcc.eq, x, y),
Rtl(
a1 << insts.fcmp(floatcc.ord, x, y),
a2 << insts.fcmp(floatcc.ueq, x, y),
a << insts.band(a1, a2)
))
intel_expand.legalize(
a << insts.fcmp(floatcc.ne, x, y),
Rtl(
a1 << insts.fcmp(floatcc.uno, x, y),
a2 << insts.fcmp(floatcc.one, x, y),
a << insts.bor(a1, a2)
))
# Inequalities that need to be reversed.
for cc, rev_cc in [
(floatcc.lt, floatcc.gt),
(floatcc.le, floatcc.ge),
(floatcc.ugt, floatcc.ult),
(floatcc.uge, floatcc.ule)]:
intel_expand.legalize(
a << insts.fcmp(cc, x, y),
Rtl(
a << insts.fcmp(rev_cc, y, x)
))

View File

@@ -3,12 +3,13 @@ Intel Encoding recipes.
"""
from __future__ import absolute_import
from cdsl.isa import EncRecipe
from cdsl.predicates import IsSignedInt, IsEqual
from cdsl.predicates import IsSignedInt, IsEqual, Or
from base.formats import Unary, UnaryImm, Binary, BinaryImm, MultiAry
from base.formats import Trap, Call, IndirectCall, Store, Load
from base.formats import IntCompare
from base.formats import IntCompare, FloatCompare
from base.formats import RegMove, Ternary, Jump, Branch, FuncAddr
from .registers import GPR, ABCD, FPR, GPR8, FPR8, StackGPR32, StackFPR32
from .defs import supported_floatccs
try:
from typing import Tuple, Dict, Sequence # noqa
@@ -696,7 +697,7 @@ t8jccb_abcd = TailRecipe(
# This bandaid macro doesn't support a REX prefix for the final `setCC`
# instruction, so it is limited to the `ABCD` register class for booleans.
icscc = TailRecipe(
'cscc', IntCompare, size=1 + 3, ins=(GPR, GPR), outs=ABCD,
'icscc', IntCompare, size=1 + 3, ins=(GPR, GPR), outs=ABCD,
emit='''
// Comparison instruction.
PUT_OP(bits, rex2(in_reg0, in_reg1), sink);
@@ -719,3 +720,49 @@ icscc = TailRecipe(
sink.put1(setcc);
modrm_rr(out_reg0, 0, sink);
''')
# Make a FloatCompare instruction predicate with the supported condition codes.
# Same thing for floating point.
#
# The ucomiss/ucomisd instructions set the EFLAGS bits ZF/PF/CF like this:
#
# ZPC OSA
# UN 111 000
# GT 000 000
# LT 001 000
# EQ 100 000
#
# Not all floating point condition codes are supported.
fcscc = TailRecipe(
'fcscc', FloatCompare, size=1 + 3, ins=(FPR, FPR), outs=ABCD,
instp=Or(*(IsEqual(FloatCompare.cond, cc)
for cc in supported_floatccs)),
emit='''
// Comparison instruction.
PUT_OP(bits, rex2(in_reg1, in_reg0), sink);
modrm_rr(in_reg1, in_reg0, sink);
// `setCC` instruction, no REX.
use ir::condcodes::FloatCC::*;
let setcc = match cond {
Ordered => 0x9b, // EQ|LT|GT => setnp (P=0)
Unordered => 0x9a, // UN => setp (P=1)
OrderedNotEqual => 0x95, // LT|GT => setne (Z=0),
UnorderedOrEqual => 0x94, // UN|EQ => sete (Z=1)
GreaterThan => 0x97, // GT => seta (C=0&Z=0)
GreaterThanOrEqual => 0x93, // GT|EQ => setae (C=0)
UnorderedOrLessThan => 0x92, // UN|LT => setb (C=1)
UnorderedOrLessThanOrEqual => 0x96, // UN|LT|EQ => setbe (Z=1|C=1)
Equal | // EQ
NotEqual | // UN|LT|GT
LessThan | // LT
LessThanOrEqual | // LT|EQ
UnorderedOrGreaterThan | // UN|GT
UnorderedOrGreaterThanOrEqual // UN|GT|EQ
=> panic!("{} not supported by fcscc", cond),
};
sink.put1(0x0f);
sink.put1(setcc);
modrm_rr(out_reg0, 0, sink);
''')