Intel encodings for nearest/floor/ceil/trunc.

These floating-point rounding operations are all encoded with the
roundss/roundsd instructions, which require SSE 4.1.
This commit is contained in:
Jakob Stoklund Olesen
2017-09-25 14:57:01 -07:00
parent ac343ba92a
commit 6bec5f8507
8 changed files with 216 additions and 14 deletions

View File

@@ -89,6 +89,33 @@ ebb0:
; asm: sqrtss %xmm2, %xmm5 ; asm: sqrtss %xmm2, %xmm5
[-,%xmm5] v51 = sqrt v11 ; bin: f3 0f 51 ea [-,%xmm5] v51 = sqrt v11 ; bin: f3 0f 51 ea
; asm: roundss $0, %xmm5, %xmm4
[-,%xmm4] v52 = nearest v10 ; bin: 66 0f 3a 0a e5 00
; asm: roundss $0, %xmm2, %xmm5
[-,%xmm5] v53 = nearest v11 ; bin: 66 0f 3a 0a ea 00
; asm: roundss $0, %xmm5, %xmm2
[-,%xmm2] v54 = nearest v10 ; bin: 66 0f 3a 0a d5 00
; asm: roundss $1, %xmm5, %xmm4
[-,%xmm4] v55 = floor v10 ; bin: 66 0f 3a 0a e5 01
; asm: roundss $1, %xmm2, %xmm5
[-,%xmm5] v56 = floor v11 ; bin: 66 0f 3a 0a ea 01
; asm: roundss $1, %xmm5, %xmm2
[-,%xmm2] v57 = floor v10 ; bin: 66 0f 3a 0a d5 01
; asm: roundss $2, %xmm5, %xmm4
[-,%xmm4] v58 = ceil v10 ; bin: 66 0f 3a 0a e5 02
; asm: roundss $2, %xmm2, %xmm5
[-,%xmm5] v59 = ceil v11 ; bin: 66 0f 3a 0a ea 02
; asm: roundss $2, %xmm5, %xmm2
[-,%xmm2] v60 = ceil v10 ; bin: 66 0f 3a 0a d5 02
; asm: roundss $3, %xmm5, %xmm4
[-,%xmm4] v61 = trunc v10 ; bin: 66 0f 3a 0a e5 03
; asm: roundss $3, %xmm2, %xmm5
[-,%xmm5] v62 = trunc v11 ; bin: 66 0f 3a 0a ea 03
; asm: roundss $3, %xmm5, %xmm2
[-,%xmm2] v63 = trunc v10 ; bin: 66 0f 3a 0a d5 03
; Load/Store ; Load/Store
@@ -207,6 +234,33 @@ ebb0:
; asm: sqrtsd %xmm2, %xmm5 ; asm: sqrtsd %xmm2, %xmm5
[-,%xmm5] v51 = sqrt v11 ; bin: f2 0f 51 ea [-,%xmm5] v51 = sqrt v11 ; bin: f2 0f 51 ea
; asm: roundsd $0, %xmm5, %xmm4
[-,%xmm4] v52 = nearest v10 ; bin: 66 0f 3a 0b e5 00
; asm: roundsd $0, %xmm2, %xmm5
[-,%xmm5] v53 = nearest v11 ; bin: 66 0f 3a 0b ea 00
; asm: roundsd $0, %xmm5, %xmm2
[-,%xmm2] v54 = nearest v10 ; bin: 66 0f 3a 0b d5 00
; asm: roundsd $1, %xmm5, %xmm4
[-,%xmm4] v55 = floor v10 ; bin: 66 0f 3a 0b e5 01
; asm: roundsd $1, %xmm2, %xmm5
[-,%xmm5] v56 = floor v11 ; bin: 66 0f 3a 0b ea 01
; asm: roundsd $1, %xmm5, %xmm2
[-,%xmm2] v57 = floor v10 ; bin: 66 0f 3a 0b d5 01
; asm: roundsd $2, %xmm5, %xmm4
[-,%xmm4] v58 = ceil v10 ; bin: 66 0f 3a 0b e5 02
; asm: roundsd $2, %xmm2, %xmm5
[-,%xmm5] v59 = ceil v11 ; bin: 66 0f 3a 0b ea 02
; asm: roundsd $2, %xmm5, %xmm2
[-,%xmm2] v60 = ceil v10 ; bin: 66 0f 3a 0b d5 02
; asm: roundsd $3, %xmm5, %xmm4
[-,%xmm4] v61 = trunc v10 ; bin: 66 0f 3a 0b e5 03
; asm: roundsd $3, %xmm2, %xmm5
[-,%xmm5] v62 = trunc v11 ; bin: 66 0f 3a 0b ea 03
; asm: roundsd $3, %xmm5, %xmm2
[-,%xmm2] v63 = trunc v10 ; bin: 66 0f 3a 0b d5 03
; Load/Store ; Load/Store

View File

@@ -98,6 +98,34 @@ ebb0:
; asm: sqrtss %xmm10, %xmm5 ; asm: sqrtss %xmm10, %xmm5
[-,%xmm5] v51 = sqrt v11 ; bin: f3 41 0f 51 ea [-,%xmm5] v51 = sqrt v11 ; bin: f3 41 0f 51 ea
; asm: roundss $0, %xmm5, %xmm10
[-,%xmm10] v52 = nearest v10 ; bin: 66 44 0f 3a 0a d5 00
; asm: roundss $0, %xmm10, %xmm5
[-,%xmm5] v53 = nearest v11 ; bin: 66 41 0f 3a 0a ea 00
; asm: roundss $0, %xmm5, %xmm2
[-,%xmm2] v54 = nearest v10 ; bin: 66 0f 3a 0a d5 00
; asm: roundss $1, %xmm5, %xmm10
[-,%xmm10] v55 = floor v10 ; bin: 66 44 0f 3a 0a d5 01
; asm: roundss $1, %xmm10, %xmm5
[-,%xmm5] v56 = floor v11 ; bin: 66 41 0f 3a 0a ea 01
; asm: roundss $1, %xmm5, %xmm2
[-,%xmm2] v57 = floor v10 ; bin: 66 0f 3a 0a d5 01
; asm: roundss $2, %xmm5, %xmm10
[-,%xmm10] v58 = ceil v10 ; bin: 66 44 0f 3a 0a d5 02
; asm: roundss $2, %xmm10, %xmm5
[-,%xmm5] v59 = ceil v11 ; bin: 66 41 0f 3a 0a ea 02
; asm: roundss $2, %xmm5, %xmm2
[-,%xmm2] v60 = ceil v10 ; bin: 66 0f 3a 0a d5 02
; asm: roundss $3, %xmm5, %xmm10
[-,%xmm10] v61 = trunc v10 ; bin: 66 44 0f 3a 0a d5 03
; asm: roundss $3, %xmm10, %xmm5
[-,%xmm5] v62 = trunc v11 ; bin: 66 41 0f 3a 0a ea 03
; asm: roundss $3, %xmm5, %xmm2
[-,%xmm2] v63 = trunc v10 ; bin: 66 0f 3a 0a d5 03
; Load/Store ; Load/Store
; asm: movd (%r14), %xmm5 ; asm: movd (%r14), %xmm5
@@ -230,6 +258,34 @@ ebb0:
; asm: sqrtsd %xmm10, %xmm5 ; asm: sqrtsd %xmm10, %xmm5
[-,%xmm5] v51 = sqrt v11 ; bin: f2 41 0f 51 ea [-,%xmm5] v51 = sqrt v11 ; bin: f2 41 0f 51 ea
; asm: roundsd $0, %xmm5, %xmm10
[-,%xmm10] v52 = nearest v10 ; bin: 66 44 0f 3a 0b d5 00
; asm: roundsd $0, %xmm10, %xmm5
[-,%xmm5] v53 = nearest v11 ; bin: 66 41 0f 3a 0b ea 00
; asm: roundsd $0, %xmm5, %xmm2
[-,%xmm2] v54 = nearest v10 ; bin: 66 0f 3a 0b d5 00
; asm: roundsd $1, %xmm5, %xmm10
[-,%xmm10] v55 = floor v10 ; bin: 66 44 0f 3a 0b d5 01
; asm: roundsd $1, %xmm10, %xmm5
[-,%xmm5] v56 = floor v11 ; bin: 66 41 0f 3a 0b ea 01
; asm: roundsd $1, %xmm5, %xmm2
[-,%xmm2] v57 = floor v10 ; bin: 66 0f 3a 0b d5 01
; asm: roundsd $2, %xmm5, %xmm10
[-,%xmm10] v58 = ceil v10 ; bin: 66 44 0f 3a 0b d5 02
; asm: roundsd $2, %xmm10, %xmm5
[-,%xmm5] v59 = ceil v11 ; bin: 66 41 0f 3a 0b ea 02
; asm: roundsd $2, %xmm5, %xmm2
[-,%xmm2] v60 = ceil v10 ; bin: 66 0f 3a 0b d5 02
; asm: roundsd $3, %xmm5, %xmm10
[-,%xmm10] v61 = trunc v10 ; bin: 66 44 0f 3a 0b d5 03
; asm: roundsd $3, %xmm10, %xmm5
[-,%xmm5] v62 = trunc v11 ; bin: 66 41 0f 3a 0b ea 03
; asm: roundsd $3, %xmm5, %xmm2
[-,%xmm2] v63 = trunc v10 ; bin: 66 0f 3a 0b d5 03
; Load/Store ; Load/Store
; asm: movq (%r14), %xmm5 ; asm: movq (%r14), %xmm5

View File

@@ -35,10 +35,29 @@ ebb0(v0: f32):
return v1 return v1
} }
; function %f32_ceil(f32) -> f32 function %f32_ceil(f32) -> f32 {
; function %f32_floor(f32) -> f32 ebb0(v0: f32):
; function %f32_trunc(f32) -> f32 v1 = ceil v0
; function %f32_nearest (f32) -> f32 return v1
}
function %f32_floor(f32) -> f32 {
ebb0(v0: f32):
v1 = floor v0
return v1
}
function %f32_trunc(f32) -> f32 {
ebb0(v0: f32):
v1 = trunc v0
return v1
}
function %f32_nearest (f32) -> f32 {
ebb0(v0: f32):
v1 = nearest v0
return v1
}
; Binary Operations ; Binary Operations

View File

@@ -32,10 +32,29 @@ ebb0(v0: f64):
return v1 return v1
} }
; function %f64_ceil(f64) -> f64 function %f64_ceil(f64) -> f64 {
; function %f64_floor(f64) -> f64 ebb0(v0: f64):
; function %f64_trunc(f64) -> f64 v1 = ceil v0
; function %f64_nearest (f64) -> f64 return v1
}
function %f64_floor(f64) -> f64 {
ebb0(v0: f64):
v1 = floor v0
return v1
}
function %f64_trunc(f64) -> f64 {
ebb0(v0: f64):
v1 = trunc v0
return v1
}
function %f64_nearest (f64) -> f64 {
ebb0(v0: f64):
v1 = nearest v0
return v1
}
; Binary Operations ; Binary Operations

View File

@@ -38,6 +38,7 @@ def gen_recipe(recipe, fmt):
with fmt.indented( with fmt.indented(
'if let InstructionData::{} {{'.format(iform.name), 'if let InstructionData::{} {{'.format(iform.name),
'}'): '}'):
fmt.line('opcode,')
for f in iform.imm_fields: for f in iform.imm_fields:
fmt.line('{},'.format(f.member)) fmt.line('{},'.format(f.member))
if want_args: if want_args:

View File

@@ -11,9 +11,10 @@ from . import settings as cfg
from . import instructions as x86 from . import instructions as x86
from .legalize import intel_expand from .legalize import intel_expand
from base.legalize import narrow, expand from base.legalize import narrow, expand
from .settings import use_sse41
try: try:
from typing import TYPE_CHECKING from typing import TYPE_CHECKING, Any # noqa
if TYPE_CHECKING: if TYPE_CHECKING:
from cdsl.instructions import MaybeBoundInst # noqa from cdsl.instructions import MaybeBoundInst # noqa
except ImportError: except ImportError:
@@ -82,7 +83,7 @@ def enc_i32_i64_ld_st(inst, w_bit, recipe, *args, **kwargs):
def enc_flt(inst, recipe, *args, **kwargs): def enc_flt(inst, recipe, *args, **kwargs):
# type: (MaybeBoundInst, r.TailRecipe, *int, **int) -> None # type: (MaybeBoundInst, r.TailRecipe, *int, **Any) -> None
""" """
Add encodings for floating point instruction `inst` to both I32 and I64. Add encodings for floating point instruction `inst` to both I32 and I64.
""" """
@@ -363,6 +364,16 @@ enc_flt(base.fdemote.f32.f64, r.furm, 0xf2, 0x0f, 0x5a)
enc_flt(base.sqrt.f32, r.furm, 0xf3, 0x0f, 0x51) enc_flt(base.sqrt.f32, r.furm, 0xf3, 0x0f, 0x51)
enc_flt(base.sqrt.f64, r.furm, 0xf2, 0x0f, 0x51) enc_flt(base.sqrt.f64, r.furm, 0xf2, 0x0f, 0x51)
# Rounding. The recipe looks at the opcode to pick an immediate.
# Every rounding instruction gets an f32 encoding (opcode byte 0x0a) and an
# f64 encoding (opcode byte 0x0b); both are predicated on use_sse41.
for inst in (base.nearest, base.floor, base.ceil, base.trunc):
    enc_flt(inst.f32, r.furmi_rnd, 0x66, 0x0f, 0x3a, 0x0a, isap=use_sse41)
    enc_flt(inst.f64, r.furmi_rnd, 0x66, 0x0f, 0x3a, 0x0b, isap=use_sse41)
# Binary arithmetic ops. # Binary arithmetic ops.
for inst, opc in [ for inst, opc in [
(base.fadd, 0x58), (base.fadd, 0x58),

View File

@@ -289,6 +289,21 @@ frurm = TailRecipe(
modrm_rr(in_reg0, out_reg0, sink); modrm_rr(in_reg0, out_reg0, sink);
''') ''')
# XX /r, RMI form for one of the roundXX SSE 4.1 instructions.
#
# After the opcode and the register-register ModR/M byte, the emit template
# writes one immediate byte selected from the opcode of the instruction being
# encoded: nearest=0, floor=1, ceil=2, trunc=3. Any other opcode reaching this
# recipe is a bug and panics.
furmi_rnd = TailRecipe(
        'furmi_rnd', Unary, size=2, ins=FPR, outs=FPR,
        emit='''
        PUT_OP(bits, rex2(in_reg0, out_reg0), sink);
        modrm_rr(in_reg0, out_reg0, sink);
        sink.put1(match opcode {
            Opcode::Nearest => 0b00,
            Opcode::Floor => 0b01,
            Opcode::Ceil => 0b10,
            Opcode::Trunc => 0b11,
            _ => panic!("{} unexpected for furmi_rnd", opcode),
        });
        ''')
# XX /r, for regmove instructions. # XX /r, for regmove instructions.
rmov = TailRecipe( rmov = TailRecipe(
'ur', RegMove, size=1, ins=GPR, outs=(), 'ur', RegMove, size=1, ins=GPR, outs=(),

View File

@@ -1,7 +1,7 @@
//! Emitting binary Intel machine code. //! Emitting binary Intel machine code.
use binemit::{CodeSink, Reloc, bad_encoding}; use binemit::{CodeSink, Reloc, bad_encoding};
use ir::{Function, Inst, Ebb, InstructionData}; use ir::{Function, Inst, Ebb, InstructionData, Opcode};
use isa::{RegUnit, StackRef, StackBase, StackBaseMask}; use isa::{RegUnit, StackRef, StackBase, StackBaseMask};
use regalloc::RegDiversions; use regalloc::RegDiversions;
use super::registers::RU; use super::registers::RU;
@@ -41,6 +41,9 @@ fn stk_base(base: StackBase) -> RegUnit {
// Mandatory prefix bytes for Mp* opcodes. // Mandatory prefix bytes for Mp* opcodes.
const PREFIX: [u8; 3] = [0x66, 0xf3, 0xf2]; const PREFIX: [u8; 3] = [0x66, 0xf3, 0xf2];
// Second byte for three-byte opcodes for mm=0b10 and mm=0b11.
// Entry 0 (0x38) belongs to mm=0b10 and entry 1 (0x3a) to mm=0b11, so the
// table is indexed with `mm - 2`.
const OP3_BYTE2: [u8; 2] = [0x38, 0x3a];
// A REX prefix with no bits set: 0b0100WRXB. // A REX prefix with no bits set: 0b0100WRXB.
const BASE_REX: u8 = 0b0100_0000; const BASE_REX: u8 = 0b0100_0000;
@@ -111,6 +114,15 @@ fn put_mp1<CS: CodeSink + ?Sized>(bits: u16, rex: u8, sink: &mut CS) {
sink.put1(bits as u8); sink.put1(bits as u8);
} }
// Emit single-byte opcode with mandatory prefix and REX.
//
// Bits 8-9 of `bits` hold the `pp` field, which selects the mandatory prefix
// byte as `PREFIX[pp - 1]`; `pp` must therefore be non-zero. Bits 10-11 must
// be clear for this form. The low byte of `bits` is the opcode byte itself.
fn put_rexmp1<CS: CodeSink + ?Sized>(bits: u16, rex: u8, sink: &mut CS) {
    debug_assert_eq!(bits & 0x0c00, 0, "Invalid encoding bits for Mp1*");

    // Mandatory prefix goes first, ahead of whatever `rex_prefix` writes.
    let prefix_index = ((bits >> 8) & 3) - 1;
    sink.put1(PREFIX[prefix_index as usize]);

    rex_prefix(bits, rex, sink);

    // Truncating cast keeps only the opcode byte.
    let opcode_byte = bits as u8;
    sink.put1(opcode_byte);
}
// Emit two-byte opcode (0F XX) with mandatory prefix. // Emit two-byte opcode (0F XX) with mandatory prefix.
fn put_mp2<CS: CodeSink + ?Sized>(bits: u16, rex: u8, sink: &mut CS) { fn put_mp2<CS: CodeSink + ?Sized>(bits: u16, rex: u8, sink: &mut CS) {
debug_assert_eq!(bits & 0x8c00, 0x0400, "Invalid encoding bits for Mp2*"); debug_assert_eq!(bits & 0x8c00, 0x0400, "Invalid encoding bits for Mp2*");
@@ -131,12 +143,27 @@ fn put_rexmp2<CS: CodeSink + ?Sized>(bits: u16, rex: u8, sink: &mut CS) {
sink.put1(bits as u8); sink.put1(bits as u8);
} }
// Emit single-byte opcode with mandatory prefix and REX. // Emit three-byte opcode (0F 3[8A] XX) with mandatory prefix.
fn put_rexmp1<CS: CodeSink + ?Sized>(bits: u16, rex: u8, sink: &mut CS) { fn put_mp3<CS: CodeSink + ?Sized>(bits: u16, rex: u8, sink: &mut CS) {
debug_assert_eq!(bits & 0x0c00, 0, "Invalid encoding bits for Mp1*"); debug_assert_eq!(bits & 0x8800, 0x0800, "Invalid encoding bits for Mp3*");
let pp = (bits >> 8) & 3;
sink.put1(PREFIX[(pp - 1) as usize]);
debug_assert_eq!(rex, BASE_REX, "Invalid registers for REX-less Mp3 encoding");
let mm = (bits >> 10) & 3;
sink.put1(0x0f);
sink.put1(OP3_BYTE2[(mm - 2) as usize]);
sink.put1(bits as u8);
}
// Emit three-byte opcode (0F 3[8A] XX) with mandatory prefix and REX.
fn put_rexmp3<CS: CodeSink + ?Sized>(bits: u16, rex: u8, sink: &mut CS) {
debug_assert_eq!(bits & 0x0800, 0x0800, "Invalid encoding bits for Mp3*");
let pp = (bits >> 8) & 3; let pp = (bits >> 8) & 3;
sink.put1(PREFIX[(pp - 1) as usize]); sink.put1(PREFIX[(pp - 1) as usize]);
rex_prefix(bits, rex, sink); rex_prefix(bits, rex, sink);
let mm = (bits >> 10) & 3;
sink.put1(0x0f);
sink.put1(OP3_BYTE2[(mm - 2) as usize]);
sink.put1(bits as u8); sink.put1(bits as u8);
} }