diff --git a/cranelift/filetests/isa/intel/binary32-float.cton b/cranelift/filetests/isa/intel/binary32-float.cton index 85d613fc7a..4fa083b3de 100644 --- a/cranelift/filetests/isa/intel/binary32-float.cton +++ b/cranelift/filetests/isa/intel/binary32-float.cton @@ -89,6 +89,33 @@ ebb0: ; asm: sqrtss %xmm2, %xmm5 [-,%xmm5] v51 = sqrt v11 ; bin: f3 0f 51 ea + ; asm: roundss $0, %xmm5, %xmm4 + [-,%xmm4] v52 = nearest v10 ; bin: 66 0f 3a 0a e5 00 + ; asm: roundss $0, %xmm2, %xmm5 + [-,%xmm5] v53 = nearest v11 ; bin: 66 0f 3a 0a ea 00 + ; asm: roundss $0, %xmm5, %xmm2 + [-,%xmm2] v54 = nearest v10 ; bin: 66 0f 3a 0a d5 00 + + ; asm: roundss $1, %xmm5, %xmm4 + [-,%xmm4] v55 = floor v10 ; bin: 66 0f 3a 0a e5 01 + ; asm: roundss $1, %xmm2, %xmm5 + [-,%xmm5] v56 = floor v11 ; bin: 66 0f 3a 0a ea 01 + ; asm: roundss $1, %xmm5, %xmm2 + [-,%xmm2] v57 = floor v10 ; bin: 66 0f 3a 0a d5 01 + + ; asm: roundss $2, %xmm5, %xmm4 + [-,%xmm4] v58 = ceil v10 ; bin: 66 0f 3a 0a e5 02 + ; asm: roundss $2, %xmm2, %xmm5 + [-,%xmm5] v59 = ceil v11 ; bin: 66 0f 3a 0a ea 02 + ; asm: roundss $2, %xmm5, %xmm2 + [-,%xmm2] v60 = ceil v10 ; bin: 66 0f 3a 0a d5 02 + + ; asm: roundss $3, %xmm5, %xmm4 + [-,%xmm4] v61 = trunc v10 ; bin: 66 0f 3a 0a e5 03 + ; asm: roundss $3, %xmm2, %xmm5 + [-,%xmm5] v62 = trunc v11 ; bin: 66 0f 3a 0a ea 03 + ; asm: roundss $3, %xmm5, %xmm2 + [-,%xmm2] v63 = trunc v10 ; bin: 66 0f 3a 0a d5 03 ; Load/Store @@ -207,6 +234,33 @@ ebb0: ; asm: sqrtsd %xmm2, %xmm5 [-,%xmm5] v51 = sqrt v11 ; bin: f2 0f 51 ea + ; asm: roundsd $0, %xmm5, %xmm4 + [-,%xmm4] v52 = nearest v10 ; bin: 66 0f 3a 0b e5 00 + ; asm: roundsd $0, %xmm2, %xmm5 + [-,%xmm5] v53 = nearest v11 ; bin: 66 0f 3a 0b ea 00 + ; asm: roundsd $0, %xmm5, %xmm2 + [-,%xmm2] v54 = nearest v10 ; bin: 66 0f 3a 0b d5 00 + + ; asm: roundsd $1, %xmm5, %xmm4 + [-,%xmm4] v55 = floor v10 ; bin: 66 0f 3a 0b e5 01 + ; asm: roundsd $1, %xmm2, %xmm5 + [-,%xmm5] v56 = floor v11 ; bin: 66 0f 3a 0b ea 01 + ; asm: roundsd $1, %xmm5, %xmm2 
+ [-,%xmm2] v57 = floor v10 ; bin: 66 0f 3a 0b d5 01 + + ; asm: roundsd $2, %xmm5, %xmm4 + [-,%xmm4] v58 = ceil v10 ; bin: 66 0f 3a 0b e5 02 + ; asm: roundsd $2, %xmm2, %xmm5 + [-,%xmm5] v59 = ceil v11 ; bin: 66 0f 3a 0b ea 02 + ; asm: roundsd $2, %xmm5, %xmm2 + [-,%xmm2] v60 = ceil v10 ; bin: 66 0f 3a 0b d5 02 + + ; asm: roundsd $3, %xmm5, %xmm4 + [-,%xmm4] v61 = trunc v10 ; bin: 66 0f 3a 0b e5 03 + ; asm: roundsd $3, %xmm2, %xmm5 + [-,%xmm5] v62 = trunc v11 ; bin: 66 0f 3a 0b ea 03 + ; asm: roundsd $3, %xmm5, %xmm2 + [-,%xmm2] v63 = trunc v10 ; bin: 66 0f 3a 0b d5 03 ; Load/Store diff --git a/cranelift/filetests/isa/intel/binary64-float.cton b/cranelift/filetests/isa/intel/binary64-float.cton index e59ca0d1de..c8d4df232f 100644 --- a/cranelift/filetests/isa/intel/binary64-float.cton +++ b/cranelift/filetests/isa/intel/binary64-float.cton @@ -98,6 +98,34 @@ ebb0: ; asm: sqrtss %xmm10, %xmm5 [-,%xmm5] v51 = sqrt v11 ; bin: f3 41 0f 51 ea + ; asm: roundss $0, %xmm5, %xmm10 + [-,%xmm10] v52 = nearest v10 ; bin: 66 44 0f 3a 0a d5 00 + ; asm: roundss $0, %xmm10, %xmm5 + [-,%xmm5] v53 = nearest v11 ; bin: 66 41 0f 3a 0a ea 00 + ; asm: roundss $0, %xmm5, %xmm2 + [-,%xmm2] v54 = nearest v10 ; bin: 66 0f 3a 0a d5 00 + + ; asm: roundss $1, %xmm5, %xmm10 + [-,%xmm10] v55 = floor v10 ; bin: 66 44 0f 3a 0a d5 01 + ; asm: roundss $1, %xmm10, %xmm5 + [-,%xmm5] v56 = floor v11 ; bin: 66 41 0f 3a 0a ea 01 + ; asm: roundss $1, %xmm5, %xmm2 + [-,%xmm2] v57 = floor v10 ; bin: 66 0f 3a 0a d5 01 + + ; asm: roundss $2, %xmm5, %xmm10 + [-,%xmm10] v58 = ceil v10 ; bin: 66 44 0f 3a 0a d5 02 + ; asm: roundss $2, %xmm10, %xmm5 + [-,%xmm5] v59 = ceil v11 ; bin: 66 41 0f 3a 0a ea 02 + ; asm: roundss $2, %xmm5, %xmm2 + [-,%xmm2] v60 = ceil v10 ; bin: 66 0f 3a 0a d5 02 + + ; asm: roundss $3, %xmm5, %xmm10 + [-,%xmm10] v61 = trunc v10 ; bin: 66 44 0f 3a 0a d5 03 + ; asm: roundss $3, %xmm10, %xmm5 + [-,%xmm5] v62 = trunc v11 ; bin: 66 41 0f 3a 0a ea 03 + ; asm: roundss $3, %xmm5, %xmm2 + [-,%xmm2] 
v63 = trunc v10 ; bin: 66 0f 3a 0a d5 03 + ; Load/Store ; asm: movd (%r14), %xmm5 @@ -230,6 +258,34 @@ ebb0: ; asm: sqrtsd %xmm10, %xmm5 [-,%xmm5] v51 = sqrt v11 ; bin: f2 41 0f 51 ea + ; asm: roundsd $0, %xmm5, %xmm10 + [-,%xmm10] v52 = nearest v10 ; bin: 66 44 0f 3a 0b d5 00 + ; asm: roundsd $0, %xmm10, %xmm5 + [-,%xmm5] v53 = nearest v11 ; bin: 66 41 0f 3a 0b ea 00 + ; asm: roundsd $0, %xmm5, %xmm2 + [-,%xmm2] v54 = nearest v10 ; bin: 66 0f 3a 0b d5 00 + + ; asm: roundsd $1, %xmm5, %xmm10 + [-,%xmm10] v55 = floor v10 ; bin: 66 44 0f 3a 0b d5 01 + ; asm: roundsd $1, %xmm10, %xmm5 + [-,%xmm5] v56 = floor v11 ; bin: 66 41 0f 3a 0b ea 01 + ; asm: roundsd $1, %xmm5, %xmm2 + [-,%xmm2] v57 = floor v10 ; bin: 66 0f 3a 0b d5 01 + + ; asm: roundsd $2, %xmm5, %xmm10 + [-,%xmm10] v58 = ceil v10 ; bin: 66 44 0f 3a 0b d5 02 + ; asm: roundsd $2, %xmm10, %xmm5 + [-,%xmm5] v59 = ceil v11 ; bin: 66 41 0f 3a 0b ea 02 + ; asm: roundsd $2, %xmm5, %xmm2 + [-,%xmm2] v60 = ceil v10 ; bin: 66 0f 3a 0b d5 02 + + ; asm: roundsd $3, %xmm5, %xmm10 + [-,%xmm10] v61 = trunc v10 ; bin: 66 44 0f 3a 0b d5 03 + ; asm: roundsd $3, %xmm10, %xmm5 + [-,%xmm5] v62 = trunc v11 ; bin: 66 41 0f 3a 0b ea 03 + ; asm: roundsd $3, %xmm5, %xmm2 + [-,%xmm2] v63 = trunc v10 ; bin: 66 0f 3a 0b d5 03 + ; Load/Store ; asm: movq (%r14), %xmm5 diff --git a/cranelift/filetests/wasm/f32-arith.cton b/cranelift/filetests/wasm/f32-arith.cton index 5a84579136..2e7281398d 100644 --- a/cranelift/filetests/wasm/f32-arith.cton +++ b/cranelift/filetests/wasm/f32-arith.cton @@ -35,10 +35,29 @@ ebb0(v0: f32): return v1 } -; function %f32_ceil(f32) -> f32 -; function %f32_floor(f32) -> f32 -; function %f32_trunc(f32) -> f32 -; function %f32_nearest (f32) -> f32 +function %f32_ceil(f32) -> f32 { +ebb0(v0: f32): + v1 = ceil v0 + return v1 +} + +function %f32_floor(f32) -> f32 { +ebb0(v0: f32): + v1 = floor v0 + return v1 +} + +function %f32_trunc(f32) -> f32 { +ebb0(v0: f32): + v1 = trunc v0 + return v1 +} + +function %f32_nearest 
(f32) -> f32 { +ebb0(v0: f32): + v1 = nearest v0 + return v1 +} ; Binary Operations diff --git a/cranelift/filetests/wasm/f64-arith.cton b/cranelift/filetests/wasm/f64-arith.cton index 3eaecc5a0b..cfed4f95a5 100644 --- a/cranelift/filetests/wasm/f64-arith.cton +++ b/cranelift/filetests/wasm/f64-arith.cton @@ -32,10 +32,29 @@ ebb0(v0: f64): return v1 } -; function %f64_ceil(f64) -> f64 -; function %f64_floor(f64) -> f64 -; function %f64_trunc(f64) -> f64 -; function %f64_nearest (f64) -> f64 +function %f64_ceil(f64) -> f64 { +ebb0(v0: f64): + v1 = ceil v0 + return v1 +} + +function %f64_floor(f64) -> f64 { +ebb0(v0: f64): + v1 = floor v0 + return v1 +} + +function %f64_trunc(f64) -> f64 { +ebb0(v0: f64): + v1 = trunc v0 + return v1 +} + +function %f64_nearest (f64) -> f64 { +ebb0(v0: f64): + v1 = nearest v0 + return v1 +} ; Binary Operations diff --git a/lib/cretonne/meta/gen_binemit.py b/lib/cretonne/meta/gen_binemit.py index 39cab5ab20..58a44473a3 100644 --- a/lib/cretonne/meta/gen_binemit.py +++ b/lib/cretonne/meta/gen_binemit.py @@ -38,6 +38,7 @@ def gen_recipe(recipe, fmt): with fmt.indented( 'if let InstructionData::{} {{'.format(iform.name), '}'): + fmt.line('opcode,') for f in iform.imm_fields: fmt.line('{},'.format(f.member)) if want_args: diff --git a/lib/cretonne/meta/isa/intel/encodings.py b/lib/cretonne/meta/isa/intel/encodings.py index 37766c5207..6d46111d7f 100644 --- a/lib/cretonne/meta/isa/intel/encodings.py +++ b/lib/cretonne/meta/isa/intel/encodings.py @@ -11,9 +11,10 @@ from . import settings as cfg from . 
import instructions as x86 from .legalize import intel_expand from base.legalize import narrow, expand +from .settings import use_sse41 try: - from typing import TYPE_CHECKING + from typing import TYPE_CHECKING, Any # noqa if TYPE_CHECKING: from cdsl.instructions import MaybeBoundInst # noqa except ImportError: @@ -82,7 +83,7 @@ def enc_i32_i64_ld_st(inst, w_bit, recipe, *args, **kwargs): def enc_flt(inst, recipe, *args, **kwargs): - # type: (MaybeBoundInst, r.TailRecipe, *int, **int) -> None + # type: (MaybeBoundInst, r.TailRecipe, *int, **Any) -> None """ Add encodings for floating point instruction `inst` to both I32 and I64. """ @@ -363,6 +364,16 @@ enc_flt(base.fdemote.f32.f64, r.furm, 0xf2, 0x0f, 0x5a) enc_flt(base.sqrt.f32, r.furm, 0xf3, 0x0f, 0x51) enc_flt(base.sqrt.f64, r.furm, 0xf2, 0x0f, 0x51) +# Rounding. The recipe looks at the opcode to pick an immediate. +for inst in [ + base.nearest, + base.floor, + base.ceil, + base.trunc]: + enc_flt(inst.f32, r.furmi_rnd, 0x66, 0x0f, 0x3a, 0x0a, isap=use_sse41) + enc_flt(inst.f64, r.furmi_rnd, 0x66, 0x0f, 0x3a, 0x0b, isap=use_sse41) + + # Binary arithmetic ops. for inst, opc in [ (base.fadd, 0x58), diff --git a/lib/cretonne/meta/isa/intel/recipes.py b/lib/cretonne/meta/isa/intel/recipes.py index 358f9cae72..04c7c91891 100644 --- a/lib/cretonne/meta/isa/intel/recipes.py +++ b/lib/cretonne/meta/isa/intel/recipes.py @@ -289,6 +289,21 @@ frurm = TailRecipe( modrm_rr(in_reg0, out_reg0, sink); ''') +# XX /r, RMI form for one of the roundXX SSE 4.1 instructions. +furmi_rnd = TailRecipe( + 'furmi_rnd', Unary, size=2, ins=FPR, outs=FPR, + emit=''' + PUT_OP(bits, rex2(in_reg0, out_reg0), sink); + modrm_rr(in_reg0, out_reg0, sink); + sink.put1(match opcode { + Opcode::Nearest => 0b00, + Opcode::Floor => 0b01, + Opcode::Ceil => 0b10, + Opcode::Trunc => 0b11, + _ => panic!("{} unexpected for furmi_rnd", opcode), + }); + ''') + # XX /r, for regmove instructions. 
rmov = TailRecipe( 'ur', RegMove, size=1, ins=GPR, outs=(), diff --git a/lib/cretonne/src/isa/intel/binemit.rs b/lib/cretonne/src/isa/intel/binemit.rs index 888128caed..4358c9da15 100644 --- a/lib/cretonne/src/isa/intel/binemit.rs +++ b/lib/cretonne/src/isa/intel/binemit.rs @@ -1,7 +1,7 @@ //! Emitting binary Intel machine code. use binemit::{CodeSink, Reloc, bad_encoding}; -use ir::{Function, Inst, Ebb, InstructionData}; +use ir::{Function, Inst, Ebb, InstructionData, Opcode}; use isa::{RegUnit, StackRef, StackBase, StackBaseMask}; use regalloc::RegDiversions; use super::registers::RU; @@ -41,6 +41,9 @@ fn stk_base(base: StackBase) -> RegUnit { // Mandatory prefix bytes for Mp* opcodes. const PREFIX: [u8; 3] = [0x66, 0xf3, 0xf2]; +// Second byte of three-byte opcodes for mm=0b10 and mm=0b11; index with `mm - 2`. +const OP3_BYTE2: [u8; 2] = [0x38, 0x3a]; + // A REX prefix with no bits set: 0b0100WRXB. const BASE_REX: u8 = 0b0100_0000; @@ -111,6 +114,15 @@ fn put_mp1(bits: u16, rex: u8, sink: &mut CS) { sink.put1(bits as u8); } +// Emit single-byte opcode with mandatory prefix and REX. +fn put_rexmp1(bits: u16, rex: u8, sink: &mut CS) { + debug_assert_eq!(bits & 0x0c00, 0, "Invalid encoding bits for Mp1*"); + let pp = (bits >> 8) & 3; + sink.put1(PREFIX[(pp - 1) as usize]); + rex_prefix(bits, rex, sink); + sink.put1(bits as u8); +} + // Emit two-byte opcode (0F XX) with mandatory prefix. fn put_mp2(bits: u16, rex: u8, sink: &mut CS) { debug_assert_eq!(bits & 0x8c00, 0x0400, "Invalid encoding bits for Mp2*"); @@ -131,12 +143,27 @@ fn put_rexmp2(bits: u16, rex: u8, sink: &mut CS) { sink.put1(bits as u8); } -// Emit single-byte opcode with mandatory prefix and REX. -fn put_rexmp1(bits: u16, rex: u8, sink: &mut CS) { - debug_assert_eq!(bits & 0x0c00, 0, "Invalid encoding bits for Mp1*"); +// Emit three-byte opcode (0F 3[8A] XX) with mandatory prefix. 
+fn put_mp3(bits: u16, rex: u8, sink: &mut CS) { + debug_assert_eq!(bits & 0x8800, 0x0800, "Invalid encoding bits for Mp3*"); + let pp = (bits >> 8) & 3; + sink.put1(PREFIX[(pp - 1) as usize]); + debug_assert_eq!(rex, BASE_REX, "Invalid registers for REX-less Mp3 encoding"); + let mm = (bits >> 10) & 3; + sink.put1(0x0f); + sink.put1(OP3_BYTE2[(mm - 2) as usize]); + sink.put1(bits as u8); +} + +// Emit three-byte opcode (0F 3[8A] XX) with mandatory prefix and REX. +fn put_rexmp3(bits: u16, rex: u8, sink: &mut CS) { + debug_assert_eq!(bits & 0x0800, 0x0800, "Invalid encoding bits for Mp3*"); let pp = (bits >> 8) & 3; sink.put1(PREFIX[(pp - 1) as usize]); rex_prefix(bits, rex, sink); + let mm = (bits >> 10) & 3; + sink.put1(0x0f); + sink.put1(OP3_BYTE2[(mm - 2) as usize]); sink.put1(bits as u8); }