diff --git a/filetests/isa/intel/binary32-float.cton b/filetests/isa/intel/binary32-float.cton index 4a4361707c..5c0dc43b18 100644 --- a/filetests/isa/intel/binary32-float.cton +++ b/filetests/isa/intel/binary32-float.cton @@ -17,6 +17,21 @@ ebb0: ; asm: cvtsi2ss %esi, %xmm2 [-,%xmm2] v11 = fcvt_from_sint.f32 v1 ; bin: f3 0f 2a d6 + ; asm: cvtss2sd %xmm2, %xmm5 + [-,%xmm5] v12 = fpromote.f64 v11 ; bin: f3 0f 5a ea + ; asm: cvtss2sd %xmm5, %xmm2 + [-,%xmm2] v13 = fpromote.f64 v10 ; bin: f3 0f 5a d5 + + ; asm: movd %ecx, %xmm5 + [-,%xmm5] v14 = bitcast.f32 v0 ; bin: 66 0f 6e e9 + ; asm: movd %esi, %xmm2 + [-,%xmm2] v15 = bitcast.f32 v1 ; bin: 66 0f 6e d6 + + ; asm: movd %xmm5, %ecx + [-,%rcx] v16 = bitcast.i32 v10 ; bin: 66 0f 7e e9 + ; asm: movd %xmm2, %esi + [-,%rsi] v17 = bitcast.i32 v11 ; bin: 66 0f 7e d6 + ; Binary arithmetic. ; asm: addss %xmm2, %xmm5 @@ -70,13 +85,20 @@ ebb0: [-,%rcx] v0 = iconst.i32 1 [-,%rsi] v1 = iconst.i32 2 - ; Binary arithmetic. - ; asm: cvtsi2sd %ecx, %xmm5 [-,%xmm5] v10 = fcvt_from_sint.f64 v0 ; bin: f2 0f 2a e9 ; asm: cvtsi2sd %esi, %xmm2 [-,%xmm2] v11 = fcvt_from_sint.f64 v1 ; bin: f2 0f 2a d6 + ; asm: cvtsd2ss %xmm2, %xmm5 + [-,%xmm5] v12 = fdemote.f32 v11 ; bin: f2 0f 5a ea + ; asm: cvtsd2ss %xmm5, %xmm2 + [-,%xmm2] v13 = fdemote.f32 v10 ; bin: f2 0f 5a d5 + + ; No i64 <-> f64 bitcasts in 32-bit mode. + + ; Binary arithmetic. + ; asm: addsd %xmm2, %xmm5 [-,%xmm5] v20 = fadd v10, v11 ; bin: f2 0f 58 ea ; asm: addsd %xmm5, %xmm2 diff --git a/filetests/isa/intel/binary64-float.cton b/filetests/isa/intel/binary64-float.cton index e8a7c574c1..64dd1ebd05 100644 --- a/filetests/isa/intel/binary64-float.cton +++ b/filetests/isa/intel/binary64-float.cton @@ -25,27 +25,42 @@ ebb0: ; asm: cvtsi2ssq %r14, %xmm10 [-,%xmm10] v13 = fcvt_from_sint.f32 v3 ; bin: f3 4d 0f 2a d6 + ; asm: cvtss2sd %xmm10, %xmm5 + [-,%xmm5] v14 = fpromote.f64 v11 ; bin: f3 41 0f 5a ea + ; asm: cvtss2sd %xmm5, %xmm10 + [-,%xmm10] v15 = fpromote.f64 v10 ; bin: f3 44 0f 5a d5 + + ; asm: movd %r11d, %xmm5 + [-,%xmm5] v16 = bitcast.f32 v0 ; bin: 66 41 0f 6e eb + ; asm: movd %esi, %xmm10 + [-,%xmm10] v17 = bitcast.f32 v1 ; bin: 66 44 0f 6e d6 + + ; asm: movd %xmm5, %ecx + [-,%rcx] v18 = bitcast.i32 v10 ; bin: 66 40 0f 7e e9 + ; asm: movd %xmm10, %esi + [-,%rsi] v19 = bitcast.i32 v11 ; bin: 66 44 0f 7e d6 + ; Binary arithmetic. ; asm: addss %xmm10, %xmm5 [-,%xmm5] v20 = fadd v10, v11 ; bin: f3 41 0f 58 ea ; asm: addss %xmm5, %xmm10 - [-,%xmm10] v21 = fadd v11, v10 ; bin: f3 44 0f 58 d5 + [-,%xmm10] v21 = fadd v11, v10 ; bin: f3 44 0f 58 d5 ; asm: subss %xmm10, %xmm5 [-,%xmm5] v22 = fsub v10, v11 ; bin: f3 41 0f 5c ea ; asm: subss %xmm5, %xmm10 - [-,%xmm10] v23 = fsub v11, v10 ; bin: f3 44 0f 5c d5 + [-,%xmm10] v23 = fsub v11, v10 ; bin: f3 44 0f 5c d5 ; asm: mulss %xmm10, %xmm5 [-,%xmm5] v24 = fmul v10, v11 ; bin: f3 41 0f 59 ea ; asm: mulss %xmm5, %xmm10 - [-,%xmm10] v25 = fmul v11, v10 ; bin: f3 44 0f 59 d5 + [-,%xmm10] v25 = fmul v11, v10 ; bin: f3 44 0f 59 d5 ; asm: divss %xmm10, %xmm5 [-,%xmm5] v26 = fdiv v10, v11 ; bin: f3 41 0f 5e ea ; asm: divss %xmm5, %xmm10 - [-,%xmm10] v27 = fdiv v11, v10 ; bin: f3 44 0f 5e d5 + [-,%xmm10] v27 = fdiv v11, v10 ; bin: f3 44 0f 5e d5 ; Bitwise ops. ; We use the *ps SSE instructions for everything because they are smaller. @@ -90,12 +105,27 @@ ebb0: ; asm: cvtsi2sdq %r14, %xmm10 [-,%xmm10] v13 = fcvt_from_sint.f64 v3 ; bin: f2 4d 0f 2a d6 + ; asm: cvtsd2ss %xmm10, %xmm5 + [-,%xmm5] v14 = fdemote.f32 v11 ; bin: f2 41 0f 5a ea + ; asm: cvtsd2ss %xmm5, %xmm10 + [-,%xmm10] v15 = fdemote.f32 v10 ; bin: f2 44 0f 5a d5 + + ; asm: movq %rax, %xmm5 + [-,%xmm5] v16 = bitcast.f64 v2 ; bin: 66 48 0f 6e e8 + ; asm: movq %r14, %xmm10 + [-,%xmm10] v17 = bitcast.f64 v3 ; bin: 66 4d 0f 6e d6 + + ; asm: movq %xmm5, %rcx + [-,%rcx] v18 = bitcast.i64 v10 ; bin: 66 48 0f 7e e9 + ; asm: movq %xmm10, %rsi + [-,%rsi] v19 = bitcast.i64 v11 ; bin: 66 4c 0f 7e d6 + ; Binary arithmetic. ; asm: addsd %xmm10, %xmm5 [-,%xmm5] v20 = fadd v10, v11 ; bin: f2 41 0f 58 ea ; asm: addsd %xmm5, %xmm10 - [-,%xmm10] v21 = fadd v11, v10 ; bin: f2 44 0f 58 d5 + [-,%xmm10] v21 = fadd v11, v10 ; bin: f2 44 0f 58 d5 ; asm: subsd %xmm10, %xmm5 [-,%xmm5] v22 = fsub v10, v11 ; bin: f2 41 0f 5c ea diff --git a/filetests/wasm/conversions.cton b/filetests/wasm/conversions.cton index 6f2354d624..e742ebba27 100644 --- a/filetests/wasm/conversions.cton +++ b/filetests/wasm/conversions.cton @@ -22,6 +22,27 @@ ebb0(v0: i32): return v1 } +; function %i32_trunc_s_f32(f32) -> i32 +; function %i32_trunc_u_f32(f32) -> i32 +; function %i32_trunc_s_f64(f64) -> i32 +; function %i32_trunc_u_f64(f64) -> i32 +; function %i64_trunc_s_f32(f32) -> i64 +; function %i64_trunc_u_f32(f32) -> i64 +; function %i64_trunc_s_f64(f64) -> i64 +; function %i64_trunc_u_f64(f64) -> i64 + +function %f32_trunc_f64(f64) -> f32 { +ebb0(v0: f64): + v1 = fdemote.f32 v0 + return v1 +} + +function %f64_promote_f32(f32) -> f64 { +ebb0(v0: f32): + v1 = fpromote.f64 v0 + return v1 +} + function %f32_convert_s_i32(i32) -> f32 { ebb0(v0: i32): v1 = fcvt_from_sint.f32 v0 @@ -47,3 +68,27 @@ ebb0(v0: i64): } ; TODO: f*_convert_u_i* (Don't exist on Intel). + +function %i32_reinterpret_f32(f32) -> i32 { +ebb0(v0: f32): + v1 = bitcast.i32 v0 + return v1 +} + +function %f32_reinterpret_i32(i32) -> f32 { +ebb0(v0: i32): + v1 = bitcast.f32 v0 + return v1 +} + +function %i64_reinterpret_f64(f64) -> i64 { +ebb0(v0: f64): + v1 = bitcast.i64 v0 + return v1 +} + +function %f64_reinterpret_i64(i64) -> f64 { +ebb0(v0: i64): + v1 = bitcast.f64 v0 + return v1 +} diff --git a/lib/cretonne/meta/isa/intel/encodings.py b/lib/cretonne/meta/isa/intel/encodings.py index df95cf7f48..44f8c16677 100644 --- a/lib/cretonne/meta/isa/intel/encodings.py +++ b/lib/cretonne/meta/isa/intel/encodings.py @@ -11,6 +11,14 @@ from . import settings as cfg from . import instructions as x86 from base.legalize import narrow, expand +try: + from typing import TYPE_CHECKING + if TYPE_CHECKING: + from cdsl.instructions import MaybeBoundInst # noqa +except ImportError: + pass + + I32.legalize_type( default=narrow, i32=expand, @@ -24,42 +32,52 @@ I64.legalize_type( f32=expand, f64=expand) + +# +# Helper functions for generating encodings. +# + +def enc_i32_i64(inst, recipe, *args, **kwargs): + # type: (MaybeBoundInst, r.TailRecipe, *int, **int) -> None + """ + Add encodings for `inst.i32` to I32. + Add encodings for `inst.i32` to I64 with and without REX. + Add encodings for `inst.i64` to I64 with a REX.W prefix. + """ + I32.enc(inst.i32, *recipe(*args, **kwargs)) + + # REX-less encoding must come after REX encoding so we don't use it by + # default. Otherwise reg-alloc would never use r8 and up. + I64.enc(inst.i32, *recipe.rex(*args, **kwargs)) + I64.enc(inst.i32, *recipe(*args, **kwargs)) + + I64.enc(inst.i64, *recipe.rex(*args, w=1, **kwargs)) + + +def enc_flt(inst, recipe, *args, **kwargs): + # type: (MaybeBoundInst, r.TailRecipe, *int, **int) -> None + """ + Add encodings for floating point instruction `inst` to both I32 and I64. + """ + I32.enc(inst, *recipe(*args, **kwargs)) + I64.enc(inst, *recipe.rex(*args, **kwargs)) + I64.enc(inst, *recipe(*args, **kwargs)) + + for inst, opc in [ (base.iadd, 0x01), (base.isub, 0x29), (base.band, 0x21), (base.bor, 0x09), (base.bxor, 0x31)]: - I32.enc(inst.i32, *r.rr(opc)) + enc_i32_i64(inst, r.rr, opc) - I64.enc(inst.i64, *r.rr.rex(opc, w=1)) - I64.enc(inst.i32, *r.rr.rex(opc)) - # REX-less encoding must come after REX encoding so we don't use it by - # default. Otherwise reg-alloc would never use r8 and up. - I64.enc(inst.i32, *r.rr(opc)) +enc_i32_i64(base.imul, r.rrx, 0x0f, 0xaf) +enc_i32_i64(x86.sdivmodx, r.div, 0xf7, rrr=7) +enc_i32_i64(x86.udivmodx, r.div, 0xf7, rrr=6) -I32.enc(base.imul.i32, *r.rrx(0x0f, 0xaf)) -I64.enc(base.imul.i64, *r.rrx.rex(0x0f, 0xaf, w=1)) -I64.enc(base.imul.i32, *r.rrx.rex(0x0f, 0xaf)) -I64.enc(base.imul.i32, *r.rrx(0x0f, 0xaf)) - -for inst, rrr in [ - (x86.sdivmodx, 7), - (x86.udivmodx, 6)]: - I32.enc(inst.i32, *r.div(0xf7, rrr=rrr)) - I64.enc(inst.i64, *r.div.rex(0xf7, rrr=rrr, w=1)) - I64.enc(inst.i32, *r.div.rex(0xf7, rrr=rrr)) - I64.enc(inst.i32, *r.div(0xf7, rrr=rrr)) - -I32.enc(base.copy.i32, *r.umr(0x89)) -I64.enc(base.copy.i64, *r.umr.rex(0x89, w=1)) -I64.enc(base.copy.i32, *r.umr.rex(0x89)) -I64.enc(base.copy.i32, *r.umr(0x89)) - -I32.enc(base.regmove.i32, *r.rmov(0x89)) -I64.enc(base.regmove.i64, *r.rmov.rex(0x89, w=1)) -I64.enc(base.regmove.i32, *r.rmov.rex(0x89)) -I64.enc(base.regmove.i32, *r.rmov(0x89)) +enc_i32_i64(base.copy, r.umr, 0x89) +enc_i32_i64(base.regmove, r.rmov, 0x89) # Immediate instructions with sign-extended 8-bit and 32-bit immediate. for inst, rrr in [ @@ -67,15 +85,8 @@ for inst, rrr in [ (base.band_imm, 4), (base.bor_imm, 1), (base.bxor_imm, 6)]: - I32.enc(inst.i32, *r.rib(0x83, rrr=rrr)) - I32.enc(inst.i32, *r.rid(0x81, rrr=rrr)) - - I64.enc(inst.i64, *r.rib.rex(0x83, rrr=rrr, w=1)) - I64.enc(inst.i64, *r.rid.rex(0x81, rrr=rrr, w=1)) - I64.enc(inst.i32, *r.rib.rex(0x83, rrr=rrr)) - I64.enc(inst.i32, *r.rid.rex(0x81, rrr=rrr)) - I64.enc(inst.i32, *r.rib(0x83, rrr=rrr)) - I64.enc(inst.i32, *r.rid(0x81, rrr=rrr)) + enc_i32_i64(inst, r.rib, 0x83, rrr=rrr) + enc_i32_i64(inst, r.rid, 0x81, rrr=rrr) # TODO: band_imm.i64 with an unsigned 32-bit immediate can be encoded as # band_imm.i32. Can even use the single-byte immediate for 0xffff_ffXX masks. @@ -179,15 +190,8 @@ I32.enc(base.jump, *r.jmpd(0xe9)) I64.enc(base.jump, *r.jmpb(0xeb)) I64.enc(base.jump, *r.jmpd(0xe9)) -I32.enc(base.brz.i32, *r.tjccb(0x74)) -I64.enc(base.brz.i64, *r.tjccb.rex(0x74, w=1)) -I64.enc(base.brz.i32, *r.tjccb.rex(0x74)) -I64.enc(base.brz.i32, *r.tjccb(0x74)) - -I32.enc(base.brnz.i32, *r.tjccb(0x75)) -I64.enc(base.brnz.i64, *r.tjccb.rex(0x75, w=1)) -I64.enc(base.brnz.i32, *r.tjccb.rex(0x75)) -I64.enc(base.brnz.i32, *r.tjccb(0x75)) +enc_i32_i64(base.brz, r.tjccb, 0x74) +enc_i32_i64(base.brnz, r.tjccb, 0x75) # # Trap as ud2 @@ -198,10 +202,7 @@ I64.enc(base.trap, *r.noop(0x0f, 0x0b)) # # Comparisons # -I32.enc(base.icmp.i32, *r.icscc(0x39)) -I64.enc(base.icmp.i64, *r.icscc.rex(0x39, w=1)) -I64.enc(base.icmp.i32, *r.icscc.rex(0x39)) -I64.enc(base.icmp.i32, *r.icscc(0x39)) +enc_i32_i64(base.icmp, r.icscc, 0x39) # # Convert bool to int. @@ -223,21 +224,31 @@ I64.enc(base.sextend.i64.i32, *r.urm.rex(0x63, w=1)) I64.enc(base.uextend.i64.i32, *r.umr.rex(0x89)) I64.enc(base.uextend.i64.i32, *r.umr(0x89)) + # # Floating point # +# movd +enc_flt(base.bitcast.f32.i32, r.frurm, 0x66, 0x0f, 0x6e) +enc_flt(base.bitcast.i32.f32, r.rfumr, 0x66, 0x0f, 0x7e) + +# movq +I64.enc(base.bitcast.f64.i64, *r.frurm.rex(0x66, 0x0f, 0x6e, w=1)) +I64.enc(base.bitcast.i64.f64, *r.rfumr.rex(0x66, 0x0f, 0x7e, w=1)) + # cvtsi2ss -I32.enc(base.fcvt_from_sint.f32.i32, *r.furm(0xf3, 0x0f, 0x2A)) -I64.enc(base.fcvt_from_sint.f32.i64, *r.furm.rex(0xf3, 0x0f, 0x2A, w=1)) -I64.enc(base.fcvt_from_sint.f32.i32, *r.furm.rex(0xf3, 0x0f, 0x2A)) -I64.enc(base.fcvt_from_sint.f32.i32, *r.furm(0xf3, 0x0f, 0x2A)) +enc_i32_i64(base.fcvt_from_sint.f32, r.frurm, 0xf3, 0x0f, 0x2a) # cvtsi2sd -I32.enc(base.fcvt_from_sint.f64.i32, *r.furm(0xf2, 0x0f, 0x2A)) -I64.enc(base.fcvt_from_sint.f64.i64, *r.furm.rex(0xf2, 0x0f, 0x2A, w=1)) -I64.enc(base.fcvt_from_sint.f64.i32, *r.furm.rex(0xf2, 0x0f, 0x2A)) -I64.enc(base.fcvt_from_sint.f64.i32, *r.furm(0xf2, 0x0f, 0x2A)) +enc_i32_i64(base.fcvt_from_sint.f64, r.frurm, 0xf2, 0x0f, 0x2a) + +# cvtss2sd +enc_flt(base.fpromote.f64.f32, r.furm, 0xf3, 0x0f, 0x5a) + +# cvtsd2ss +enc_flt(base.fdemote.f32.f64, r.furm, 0xf2, 0x0f, 0x5a) + # Binary arithmetic ops. for inst, opc in [ @@ -245,13 +256,8 @@ for inst, opc in [ (base.fsub, 0x5c), (base.fmul, 0x59), (base.fdiv, 0x5e)]: - I32.enc(inst.f32, *r.frm(0xf3, 0x0f, opc)) - I64.enc(inst.f32, *r.frm.rex(0xf3, 0x0f, opc)) - I64.enc(inst.f32, *r.frm(0xf3, 0x0f, opc)) - - I32.enc(inst.f64, *r.frm(0xf2, 0x0f, opc)) - I64.enc(inst.f64, *r.frm.rex(0xf2, 0x0f, opc)) - I64.enc(inst.f64, *r.frm(0xf2, 0x0f, opc)) + enc_flt(inst.f32, r.frm, 0xf3, 0x0f, opc) + enc_flt(inst.f64, r.frm, 0xf2, 0x0f, opc) # Binary bitwise ops. for inst, opc in [ @@ -259,10 +265,5 @@ for inst, opc in [ (base.band_not, 0x55), (base.bor, 0x56), (base.bxor, 0x57)]: - I32.enc(inst.f32, *r.frm(0x0f, opc)) - I64.enc(inst.f32, *r.frm.rex(0x0f, opc)) - I64.enc(inst.f32, *r.frm(0x0f, opc)) - - I32.enc(inst.f64, *r.frm(0x0f, opc)) - I64.enc(inst.f64, *r.frm.rex(0x0f, opc)) - I64.enc(inst.f64, *r.frm(0x0f, opc)) + enc_flt(inst.f32, r.frm, 0x0f, opc) + enc_flt(inst.f64, r.frm, 0x0f, opc) diff --git a/lib/cretonne/meta/isa/intel/recipes.py b/lib/cretonne/meta/isa/intel/recipes.py index 3eb3d51671..3e038e1208 100644 --- a/lib/cretonne/meta/isa/intel/recipes.py +++ b/lib/cretonne/meta/isa/intel/recipes.py @@ -232,6 +232,14 @@ umr = TailRecipe( modrm_rr(out_reg0, in_reg0, sink); ''') +# Same as umr, but with FPR -> GPR registers. +rfumr = TailRecipe( + 'rfumr', Unary, size=1, ins=FPR, outs=GPR, + emit=''' + PUT_OP(bits, rex2(out_reg0, in_reg0), sink); + modrm_rr(out_reg0, in_reg0, sink); + ''') + # XX /r, but for a unary operator with separate input/output register. # RM form. urm = TailRecipe( @@ -249,9 +257,17 @@ urm_abcd = TailRecipe( modrm_rr(in_reg0, out_reg0, sink); ''') -# XX /r, RM form, GPR -> FPR. +# XX /r, RM form, FPR -> FPR. furm = TailRecipe( - 'furm', Unary, size=1, ins=GPR, outs=FPR, + 'furm', Unary, size=1, ins=FPR, outs=FPR, + emit=''' + PUT_OP(bits, rex2(in_reg0, out_reg0), sink); + modrm_rr(in_reg0, out_reg0, sink); + ''') + +# XX /r, RM form, GPR -> FPR. +frurm = TailRecipe( + 'frurm', Unary, size=1, ins=GPR, outs=FPR, emit=''' PUT_OP(bits, rex2(in_reg0, out_reg0), sink); modrm_rr(in_reg0, out_reg0, sink);