Add Intel encodings for more conversion instructions.

The following instructions have simple encodings:

- bitcast.f32.i32
- bitcast.i32.f32
- bitcast.f64.i64
- bitcast.i64.f64
- fpromote.f64.f32
- fdemote.f32.f64

Also add helper functions enc_flt() and enc_i32_i64() to
intel.encodings.py for generating the common set of encodings for an
instruction: I32, I64 w/REX, I64 w/o REX.
This commit is contained in:
Jakob Stoklund Olesen
2017-07-27 10:58:00 -07:00
parent 06bab60fcc
commit ebf5c80959
5 changed files with 195 additions and 81 deletions

View File

@@ -17,6 +17,21 @@ ebb0:
; asm: cvtsi2ss %esi, %xmm2 ; asm: cvtsi2ss %esi, %xmm2
[-,%xmm2] v11 = fcvt_from_sint.f32 v1 ; bin: f3 0f 2a d6 [-,%xmm2] v11 = fcvt_from_sint.f32 v1 ; bin: f3 0f 2a d6
; asm: cvtss2sd %xmm2, %xmm5
[-,%xmm5] v12 = fpromote.f64 v11 ; bin: f3 0f 5a ea
; asm: cvtss2sd %xmm5, %xmm2
[-,%xmm2] v13 = fpromote.f64 v10 ; bin: f3 0f 5a d5
; asm: movd %ecx, %xmm5
[-,%xmm5] v14 = bitcast.f32 v0 ; bin: 66 0f 6e e9
; asm: movd %esi, %xmm2
[-,%xmm2] v15 = bitcast.f32 v1 ; bin: 66 0f 6e d6
; asm: movd %xmm5, %ecx
[-,%rcx] v16 = bitcast.i32 v10 ; bin: 66 0f 7e e9
; asm: movd %xmm2, %esi
[-,%rsi] v17 = bitcast.i32 v11 ; bin: 66 0f 7e d6
; Binary arithmetic. ; Binary arithmetic.
; asm: addss %xmm2, %xmm5 ; asm: addss %xmm2, %xmm5
@@ -70,13 +85,20 @@ ebb0:
[-,%rcx] v0 = iconst.i32 1 [-,%rcx] v0 = iconst.i32 1
[-,%rsi] v1 = iconst.i32 2 [-,%rsi] v1 = iconst.i32 2
; Binary arithmetic.
; asm: cvtsi2sd %ecx, %xmm5 ; asm: cvtsi2sd %ecx, %xmm5
[-,%xmm5] v10 = fcvt_from_sint.f64 v0 ; bin: f2 0f 2a e9 [-,%xmm5] v10 = fcvt_from_sint.f64 v0 ; bin: f2 0f 2a e9
; asm: cvtsi2sd %esi, %xmm2 ; asm: cvtsi2sd %esi, %xmm2
[-,%xmm2] v11 = fcvt_from_sint.f64 v1 ; bin: f2 0f 2a d6 [-,%xmm2] v11 = fcvt_from_sint.f64 v1 ; bin: f2 0f 2a d6
; asm: cvtsd2ss %xmm2, %xmm5
[-,%xmm5] v12 = fdemote.f32 v11 ; bin: f2 0f 5a ea
; asm: cvtsd2ss %xmm5, %xmm2
[-,%xmm2] v13 = fdemote.f32 v10 ; bin: f2 0f 5a d5
; No i64 <-> f64 bitcasts in 32-bit mode.
; Binary arithmetic.
; asm: addsd %xmm2, %xmm5 ; asm: addsd %xmm2, %xmm5
[-,%xmm5] v20 = fadd v10, v11 ; bin: f2 0f 58 ea [-,%xmm5] v20 = fadd v10, v11 ; bin: f2 0f 58 ea
; asm: addsd %xmm5, %xmm2 ; asm: addsd %xmm5, %xmm2

View File

@@ -25,27 +25,42 @@ ebb0:
; asm: cvtsi2ssq %r14, %xmm10 ; asm: cvtsi2ssq %r14, %xmm10
[-,%xmm10] v13 = fcvt_from_sint.f32 v3 ; bin: f3 4d 0f 2a d6 [-,%xmm10] v13 = fcvt_from_sint.f32 v3 ; bin: f3 4d 0f 2a d6
; asm: cvtss2sd %xmm10, %xmm5
[-,%xmm5] v14 = fpromote.f64 v11 ; bin: f3 41 0f 5a ea
; asm: cvtss2sd %xmm5, %xmm10
[-,%xmm10] v15 = fpromote.f64 v10 ; bin: f3 44 0f 5a d5
; asm: movd %r11d, %xmm5
[-,%xmm5] v16 = bitcast.f32 v0 ; bin: 66 41 0f 6e eb
; asm: movd %esi, %xmm10
[-,%xmm10] v17 = bitcast.f32 v1 ; bin: 66 44 0f 6e d6
; asm: movd %xmm5, %ecx
[-,%rcx] v18 = bitcast.i32 v10 ; bin: 66 40 0f 7e e9
; asm: movd %xmm10, %esi
[-,%rsi] v19 = bitcast.i32 v11 ; bin: 66 44 0f 7e d6
; Binary arithmetic. ; Binary arithmetic.
; asm: addss %xmm10, %xmm5 ; asm: addss %xmm10, %xmm5
[-,%xmm5] v20 = fadd v10, v11 ; bin: f3 41 0f 58 ea [-,%xmm5] v20 = fadd v10, v11 ; bin: f3 41 0f 58 ea
; asm: addss %xmm5, %xmm10 ; asm: addss %xmm5, %xmm10
[-,%xmm10] v21 = fadd v11, v10 ; bin: f3 44 0f 58 d5 [-,%xmm10] v21 = fadd v11, v10 ; bin: f3 44 0f 58 d5
; asm: subss %xmm10, %xmm5 ; asm: subss %xmm10, %xmm5
[-,%xmm5] v22 = fsub v10, v11 ; bin: f3 41 0f 5c ea [-,%xmm5] v22 = fsub v10, v11 ; bin: f3 41 0f 5c ea
; asm: subss %xmm5, %xmm10 ; asm: subss %xmm5, %xmm10
[-,%xmm10] v23 = fsub v11, v10 ; bin: f3 44 0f 5c d5 [-,%xmm10] v23 = fsub v11, v10 ; bin: f3 44 0f 5c d5
; asm: mulss %xmm10, %xmm5 ; asm: mulss %xmm10, %xmm5
[-,%xmm5] v24 = fmul v10, v11 ; bin: f3 41 0f 59 ea [-,%xmm5] v24 = fmul v10, v11 ; bin: f3 41 0f 59 ea
; asm: mulss %xmm5, %xmm10 ; asm: mulss %xmm5, %xmm10
[-,%xmm10] v25 = fmul v11, v10 ; bin: f3 44 0f 59 d5 [-,%xmm10] v25 = fmul v11, v10 ; bin: f3 44 0f 59 d5
; asm: divss %xmm10, %xmm5 ; asm: divss %xmm10, %xmm5
[-,%xmm5] v26 = fdiv v10, v11 ; bin: f3 41 0f 5e ea [-,%xmm5] v26 = fdiv v10, v11 ; bin: f3 41 0f 5e ea
; asm: divss %xmm5, %xmm10 ; asm: divss %xmm5, %xmm10
[-,%xmm10] v27 = fdiv v11, v10 ; bin: f3 44 0f 5e d5 [-,%xmm10] v27 = fdiv v11, v10 ; bin: f3 44 0f 5e d5
; Bitwise ops. ; Bitwise ops.
; We use the *ps SSE instructions for everything because they are smaller. ; We use the *ps SSE instructions for everything because they are smaller.
@@ -90,12 +105,27 @@ ebb0:
; asm: cvtsi2sdq %r14, %xmm10 ; asm: cvtsi2sdq %r14, %xmm10
[-,%xmm10] v13 = fcvt_from_sint.f64 v3 ; bin: f2 4d 0f 2a d6 [-,%xmm10] v13 = fcvt_from_sint.f64 v3 ; bin: f2 4d 0f 2a d6
; asm: cvtsd2ss %xmm10, %xmm5
[-,%xmm5] v14 = fdemote.f32 v11 ; bin: f2 41 0f 5a ea
; asm: cvtsd2ss %xmm5, %xmm10
[-,%xmm10] v15 = fdemote.f32 v10 ; bin: f2 44 0f 5a d5
; asm: movq %rax, %xmm5
[-,%xmm5] v16 = bitcast.f64 v2 ; bin: 66 48 0f 6e e8
; asm: movq %r14, %xmm10
[-,%xmm10] v17 = bitcast.f64 v3 ; bin: 66 4d 0f 6e d6
; asm: movq %xmm5, %rcx
[-,%rcx] v18 = bitcast.i64 v10 ; bin: 66 48 0f 7e e9
; asm: movq %xmm10, %rsi
[-,%rsi] v19 = bitcast.i64 v11 ; bin: 66 4c 0f 7e d6
; Binary arithmetic. ; Binary arithmetic.
; asm: addsd %xmm10, %xmm5 ; asm: addsd %xmm10, %xmm5
[-,%xmm5] v20 = fadd v10, v11 ; bin: f2 41 0f 58 ea [-,%xmm5] v20 = fadd v10, v11 ; bin: f2 41 0f 58 ea
; asm: addsd %xmm5, %xmm10 ; asm: addsd %xmm5, %xmm10
[-,%xmm10] v21 = fadd v11, v10 ; bin: f2 44 0f 58 d5 [-,%xmm10] v21 = fadd v11, v10 ; bin: f2 44 0f 58 d5
; asm: subsd %xmm10, %xmm5 ; asm: subsd %xmm10, %xmm5
[-,%xmm5] v22 = fsub v10, v11 ; bin: f2 41 0f 5c ea [-,%xmm5] v22 = fsub v10, v11 ; bin: f2 41 0f 5c ea

View File

@@ -22,6 +22,27 @@ ebb0(v0: i32):
return v1 return v1
} }
; function %i32_trunc_s_f32(f32) -> i32
; function %i32_trunc_u_f32(f32) -> i32
; function %i32_trunc_s_f64(f64) -> i32
; function %i32_trunc_u_f64(f64) -> i32
; function %i64_trunc_s_f32(f32) -> i64
; function %i64_trunc_u_f32(f32) -> i64
; function %i64_trunc_s_f64(f64) -> i64
; function %i64_trunc_u_f64(f64) -> i64
function %f32_trunc_f64(f64) -> f32 {
ebb0(v0: f64):
v1 = fdemote.f32 v0
return v1
}
function %f64_promote_f32(f32) -> f64 {
ebb0(v0: f32):
v1 = fpromote.f64 v0
return v1
}
function %f32_convert_s_i32(i32) -> f32 { function %f32_convert_s_i32(i32) -> f32 {
ebb0(v0: i32): ebb0(v0: i32):
v1 = fcvt_from_sint.f32 v0 v1 = fcvt_from_sint.f32 v0
@@ -47,3 +68,27 @@ ebb0(v0: i64):
} }
; TODO: f*_convert_u_i* (Don't exist on Intel). ; TODO: f*_convert_u_i* (Don't exist on Intel).
function %i32_reinterpret_f32(f32) -> i32 {
ebb0(v0: f32):
v1 = bitcast.i32 v0
return v1
}
function %f32_reinterpret_i32(i32) -> f32 {
ebb0(v0: i32):
v1 = bitcast.f32 v0
return v1
}
function %i64_reinterpret_f64(f64) -> i64 {
ebb0(v0: f64):
v1 = bitcast.i64 v0
return v1
}
function %f64_reinterpret_i64(i64) -> f64 {
ebb0(v0: i64):
v1 = bitcast.f64 v0
return v1
}

View File

@@ -11,6 +11,14 @@ from . import settings as cfg
from . import instructions as x86 from . import instructions as x86
from base.legalize import narrow, expand from base.legalize import narrow, expand
try:
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from cdsl.instructions import MaybeBoundInst # noqa
except ImportError:
pass
I32.legalize_type( I32.legalize_type(
default=narrow, default=narrow,
i32=expand, i32=expand,
@@ -24,42 +32,52 @@ I64.legalize_type(
f32=expand, f32=expand,
f64=expand) f64=expand)
#
# Helper functions for generating encodings.
#
def enc_i32_i64(inst, recipe, *args, **kwargs):
# type: (MaybeBoundInst, r.TailRecipe, *int, **int) -> None
"""
Add encodings for `inst.i32` to I32.
Add encodings for `inst.i32` to I64 with and without REX.
Add encodings for `inst.i64` to I64 with a REX.W prefix.

`recipe` is the encoding recipe to use; `args` and `kwargs` are
forwarded to it (opcode bytes and modifier bits such as `rrr`/`w`,
as passed by the callers below).
"""
I32.enc(inst.i32, *recipe(*args, **kwargs))
# REX-less encoding must come after REX encoding so we don't use it by
# default. Otherwise reg-alloc would never use r8 and up.
I64.enc(inst.i32, *recipe.rex(*args, **kwargs))
I64.enc(inst.i32, *recipe(*args, **kwargs))
# The i64 form always needs the REX.W prefix to select 64-bit operands.
I64.enc(inst.i64, *recipe.rex(*args, w=1, **kwargs))
def enc_flt(inst, recipe, *args, **kwargs):
# type: (MaybeBoundInst, r.TailRecipe, *int, **int) -> None
"""
Add encodings for floating point instruction `inst` to both I32 and I64.

`recipe` is the encoding recipe to use; `args` and `kwargs` are
forwarded to it (opcode bytes, as passed by the callers below).
"""
I32.enc(inst, *recipe(*args, **kwargs))
# As in enc_i32_i64: the REX-less encoding must come after the REX
# encoding so it is not picked by default, which would keep reg-alloc
# away from the high registers.
I64.enc(inst, *recipe.rex(*args, **kwargs))
I64.enc(inst, *recipe(*args, **kwargs))
for inst, opc in [ for inst, opc in [
(base.iadd, 0x01), (base.iadd, 0x01),
(base.isub, 0x29), (base.isub, 0x29),
(base.band, 0x21), (base.band, 0x21),
(base.bor, 0x09), (base.bor, 0x09),
(base.bxor, 0x31)]: (base.bxor, 0x31)]:
I32.enc(inst.i32, *r.rr(opc)) enc_i32_i64(inst, r.rr, opc)
I64.enc(inst.i64, *r.rr.rex(opc, w=1)) enc_i32_i64(base.imul, r.rrx, 0x0f, 0xaf)
I64.enc(inst.i32, *r.rr.rex(opc)) enc_i32_i64(x86.sdivmodx, r.div, 0xf7, rrr=7)
# REX-less encoding must come after REX encoding so we don't use it by enc_i32_i64(x86.udivmodx, r.div, 0xf7, rrr=6)
# default. Otherwise reg-alloc would never use r8 and up.
I64.enc(inst.i32, *r.rr(opc))
I32.enc(base.imul.i32, *r.rrx(0x0f, 0xaf)) enc_i32_i64(base.copy, r.umr, 0x89)
I64.enc(base.imul.i64, *r.rrx.rex(0x0f, 0xaf, w=1)) enc_i32_i64(base.regmove, r.rmov, 0x89)
I64.enc(base.imul.i32, *r.rrx.rex(0x0f, 0xaf))
I64.enc(base.imul.i32, *r.rrx(0x0f, 0xaf))
for inst, rrr in [
(x86.sdivmodx, 7),
(x86.udivmodx, 6)]:
I32.enc(inst.i32, *r.div(0xf7, rrr=rrr))
I64.enc(inst.i64, *r.div.rex(0xf7, rrr=rrr, w=1))
I64.enc(inst.i32, *r.div.rex(0xf7, rrr=rrr))
I64.enc(inst.i32, *r.div(0xf7, rrr=rrr))
I32.enc(base.copy.i32, *r.umr(0x89))
I64.enc(base.copy.i64, *r.umr.rex(0x89, w=1))
I64.enc(base.copy.i32, *r.umr.rex(0x89))
I64.enc(base.copy.i32, *r.umr(0x89))
I32.enc(base.regmove.i32, *r.rmov(0x89))
I64.enc(base.regmove.i64, *r.rmov.rex(0x89, w=1))
I64.enc(base.regmove.i32, *r.rmov.rex(0x89))
I64.enc(base.regmove.i32, *r.rmov(0x89))
# Immediate instructions with sign-extended 8-bit and 32-bit immediate. # Immediate instructions with sign-extended 8-bit and 32-bit immediate.
for inst, rrr in [ for inst, rrr in [
@@ -67,15 +85,8 @@ for inst, rrr in [
(base.band_imm, 4), (base.band_imm, 4),
(base.bor_imm, 1), (base.bor_imm, 1),
(base.bxor_imm, 6)]: (base.bxor_imm, 6)]:
I32.enc(inst.i32, *r.rib(0x83, rrr=rrr)) enc_i32_i64(inst, r.rib, 0x83, rrr=rrr)
I32.enc(inst.i32, *r.rid(0x81, rrr=rrr)) enc_i32_i64(inst, r.rid, 0x81, rrr=rrr)
I64.enc(inst.i64, *r.rib.rex(0x83, rrr=rrr, w=1))
I64.enc(inst.i64, *r.rid.rex(0x81, rrr=rrr, w=1))
I64.enc(inst.i32, *r.rib.rex(0x83, rrr=rrr))
I64.enc(inst.i32, *r.rid.rex(0x81, rrr=rrr))
I64.enc(inst.i32, *r.rib(0x83, rrr=rrr))
I64.enc(inst.i32, *r.rid(0x81, rrr=rrr))
# TODO: band_imm.i64 with an unsigned 32-bit immediate can be encoded as # TODO: band_imm.i64 with an unsigned 32-bit immediate can be encoded as
# band_imm.i32. Can even use the single-byte immediate for 0xffff_ffXX masks. # band_imm.i32. Can even use the single-byte immediate for 0xffff_ffXX masks.
@@ -179,15 +190,8 @@ I32.enc(base.jump, *r.jmpd(0xe9))
I64.enc(base.jump, *r.jmpb(0xeb)) I64.enc(base.jump, *r.jmpb(0xeb))
I64.enc(base.jump, *r.jmpd(0xe9)) I64.enc(base.jump, *r.jmpd(0xe9))
I32.enc(base.brz.i32, *r.tjccb(0x74)) enc_i32_i64(base.brz, r.tjccb, 0x74)
I64.enc(base.brz.i64, *r.tjccb.rex(0x74, w=1)) enc_i32_i64(base.brnz, r.tjccb, 0x75)
I64.enc(base.brz.i32, *r.tjccb.rex(0x74))
I64.enc(base.brz.i32, *r.tjccb(0x74))
I32.enc(base.brnz.i32, *r.tjccb(0x75))
I64.enc(base.brnz.i64, *r.tjccb.rex(0x75, w=1))
I64.enc(base.brnz.i32, *r.tjccb.rex(0x75))
I64.enc(base.brnz.i32, *r.tjccb(0x75))
# #
# Trap as ud2 # Trap as ud2
@@ -198,10 +202,7 @@ I64.enc(base.trap, *r.noop(0x0f, 0x0b))
# #
# Comparisons # Comparisons
# #
I32.enc(base.icmp.i32, *r.icscc(0x39)) enc_i32_i64(base.icmp, r.icscc, 0x39)
I64.enc(base.icmp.i64, *r.icscc.rex(0x39, w=1))
I64.enc(base.icmp.i32, *r.icscc.rex(0x39))
I64.enc(base.icmp.i32, *r.icscc(0x39))
# #
# Convert bool to int. # Convert bool to int.
@@ -223,21 +224,31 @@ I64.enc(base.sextend.i64.i32, *r.urm.rex(0x63, w=1))
I64.enc(base.uextend.i64.i32, *r.umr.rex(0x89)) I64.enc(base.uextend.i64.i32, *r.umr.rex(0x89))
I64.enc(base.uextend.i64.i32, *r.umr(0x89)) I64.enc(base.uextend.i64.i32, *r.umr(0x89))
# #
# Floating point # Floating point
# #
# movd
enc_flt(base.bitcast.f32.i32, r.frurm, 0x66, 0x0f, 0x6e)
enc_flt(base.bitcast.i32.f32, r.rfumr, 0x66, 0x0f, 0x7e)
# movq
I64.enc(base.bitcast.f64.i64, *r.frurm.rex(0x66, 0x0f, 0x6e, w=1))
I64.enc(base.bitcast.i64.f64, *r.rfumr.rex(0x66, 0x0f, 0x7e, w=1))
# cvtsi2ss # cvtsi2ss
I32.enc(base.fcvt_from_sint.f32.i32, *r.furm(0xf3, 0x0f, 0x2A)) enc_i32_i64(base.fcvt_from_sint.f32, r.frurm, 0xf3, 0x0f, 0x2a)
I64.enc(base.fcvt_from_sint.f32.i64, *r.furm.rex(0xf3, 0x0f, 0x2A, w=1))
I64.enc(base.fcvt_from_sint.f32.i32, *r.furm.rex(0xf3, 0x0f, 0x2A))
I64.enc(base.fcvt_from_sint.f32.i32, *r.furm(0xf3, 0x0f, 0x2A))
# cvtsi2sd # cvtsi2sd
I32.enc(base.fcvt_from_sint.f64.i32, *r.furm(0xf2, 0x0f, 0x2A)) enc_i32_i64(base.fcvt_from_sint.f64, r.frurm, 0xf2, 0x0f, 0x2a)
I64.enc(base.fcvt_from_sint.f64.i64, *r.furm.rex(0xf2, 0x0f, 0x2A, w=1))
I64.enc(base.fcvt_from_sint.f64.i32, *r.furm.rex(0xf2, 0x0f, 0x2A)) # cvtss2sd
I64.enc(base.fcvt_from_sint.f64.i32, *r.furm(0xf2, 0x0f, 0x2A)) enc_flt(base.fpromote.f64.f32, r.furm, 0xf3, 0x0f, 0x5a)
# cvtsd2ss
enc_flt(base.fdemote.f32.f64, r.furm, 0xf2, 0x0f, 0x5a)
# Binary arithmetic ops. # Binary arithmetic ops.
for inst, opc in [ for inst, opc in [
@@ -245,13 +256,8 @@ for inst, opc in [
(base.fsub, 0x5c), (base.fsub, 0x5c),
(base.fmul, 0x59), (base.fmul, 0x59),
(base.fdiv, 0x5e)]: (base.fdiv, 0x5e)]:
I32.enc(inst.f32, *r.frm(0xf3, 0x0f, opc)) enc_flt(inst.f32, r.frm, 0xf3, 0x0f, opc)
I64.enc(inst.f32, *r.frm.rex(0xf3, 0x0f, opc)) enc_flt(inst.f64, r.frm, 0xf2, 0x0f, opc)
I64.enc(inst.f32, *r.frm(0xf3, 0x0f, opc))
I32.enc(inst.f64, *r.frm(0xf2, 0x0f, opc))
I64.enc(inst.f64, *r.frm.rex(0xf2, 0x0f, opc))
I64.enc(inst.f64, *r.frm(0xf2, 0x0f, opc))
# Binary bitwise ops. # Binary bitwise ops.
for inst, opc in [ for inst, opc in [
@@ -259,10 +265,5 @@ for inst, opc in [
(base.band_not, 0x55), (base.band_not, 0x55),
(base.bor, 0x56), (base.bor, 0x56),
(base.bxor, 0x57)]: (base.bxor, 0x57)]:
I32.enc(inst.f32, *r.frm(0x0f, opc)) enc_flt(inst.f32, r.frm, 0x0f, opc)
I64.enc(inst.f32, *r.frm.rex(0x0f, opc)) enc_flt(inst.f64, r.frm, 0x0f, opc)
I64.enc(inst.f32, *r.frm(0x0f, opc))
I32.enc(inst.f64, *r.frm(0x0f, opc))
I64.enc(inst.f64, *r.frm.rex(0x0f, opc))
I64.enc(inst.f64, *r.frm(0x0f, opc))

View File

@@ -232,6 +232,14 @@ umr = TailRecipe(
modrm_rr(out_reg0, in_reg0, sink); modrm_rr(out_reg0, in_reg0, sink);
''') ''')
# Same as umr, but with FPR -> GPR registers.
rfumr = TailRecipe(
'rfumr', Unary, size=1, ins=FPR, outs=GPR,
emit='''
PUT_OP(bits, rex2(out_reg0, in_reg0), sink);
modrm_rr(out_reg0, in_reg0, sink);
''')
# XX /r, but for a unary operator with separate input/output register. # XX /r, but for a unary operator with separate input/output register.
# RM form. # RM form.
urm = TailRecipe( urm = TailRecipe(
@@ -249,9 +257,17 @@ urm_abcd = TailRecipe(
modrm_rr(in_reg0, out_reg0, sink); modrm_rr(in_reg0, out_reg0, sink);
''') ''')
# XX /r, RM form, GPR -> FPR. # XX /r, RM form, FPR -> FPR.
furm = TailRecipe( furm = TailRecipe(
'furm', Unary, size=1, ins=GPR, outs=FPR, 'furm', Unary, size=1, ins=FPR, outs=FPR,
emit='''
PUT_OP(bits, rex2(in_reg0, out_reg0), sink);
modrm_rr(in_reg0, out_reg0, sink);
''')
# XX /r, RM form, GPR -> FPR.
frurm = TailRecipe(
'frurm', Unary, size=1, ins=GPR, outs=FPR,
emit=''' emit='''
PUT_OP(bits, rex2(in_reg0, out_reg0), sink); PUT_OP(bits, rex2(in_reg0, out_reg0), sink);
modrm_rr(in_reg0, out_reg0, sink); modrm_rr(in_reg0, out_reg0, sink);