""" x86 Encodings. """ from __future__ import absolute_import from cdsl.predicates import IsUnsignedInt, Not, And from base.predicates import IsColocatedFunc, IsColocatedData from base import instructions as base from base.formats import UnaryImm, FuncAddr, Call from .defs import X86_64, X86_32 from . import recipes as r from . import settings as cfg from . import instructions as x86 from .legalize import x86_expand from base.legalize import narrow, expand_flags from base.settings import allones_funcaddrs, is_pic from .settings import use_sse41 try: from typing import TYPE_CHECKING, Any # noqa if TYPE_CHECKING: from cdsl.instructions import MaybeBoundInst # noqa except ImportError: pass X86_32.legalize_monomorphic(expand_flags) X86_32.legalize_type( default=narrow, b1=expand_flags, i32=x86_expand, f32=x86_expand, f64=x86_expand) X86_64.legalize_monomorphic(expand_flags) X86_64.legalize_type( default=narrow, b1=expand_flags, i32=x86_expand, i64=x86_expand, f32=x86_expand, f64=x86_expand) # # Helper functions for generating encodings. # def enc_x86_64(inst, recipe, *args, **kwargs): # type: (MaybeBoundInst, r.TailRecipe, *int, **int) -> None """ Add encodings for `inst` to X86_64 with and without a REX prefix. """ X86_64.enc(inst, *recipe.rex(*args, **kwargs)) X86_64.enc(inst, *recipe(*args, **kwargs)) def enc_both(inst, recipe, *args, **kwargs): # type: (MaybeBoundInst, r.TailRecipe, *int, **Any) -> None """ Add encodings for `inst` to both X86_32 and X86_64. """ X86_32.enc(inst, *recipe(*args, **kwargs)) enc_x86_64(inst, recipe, *args, **kwargs) def enc_i32_i64(inst, recipe, *args, **kwargs): # type: (MaybeBoundInst, r.TailRecipe, *int, **int) -> None """ Add encodings for `inst.i32` to X86_32. Add encodings for `inst.i32` to X86_64 with and without REX. Add encodings for `inst.i64` to X86_64 with a REX.W prefix. """ X86_32.enc(inst.i32, *recipe(*args, **kwargs)) # REX-less encoding must come after REX encoding so we don't use it by # default. Otherwise reg-alloc would never use r8 and up. X86_64.enc(inst.i32, *recipe.rex(*args, **kwargs)) X86_64.enc(inst.i32, *recipe(*args, **kwargs)) X86_64.enc(inst.i64, *recipe.rex(*args, w=1, **kwargs)) def enc_i32_i64_ld_st(inst, w_bit, recipe, *args, **kwargs): # type: (MaybeBoundInst, bool, r.TailRecipe, *int, **int) -> None """ Add encodings for `inst.i32` to X86_32. Add encodings for `inst.i32` to X86_64 with and without REX. Add encodings for `inst.i64` to X86_64 with a REX prefix, using the `w_bit` argument to determine whether or not to set the REX.W bit. """ X86_32.enc(inst.i32.any, *recipe(*args, **kwargs)) # REX-less encoding must come after REX encoding so we don't use it by # default. Otherwise reg-alloc would never use r8 and up. X86_64.enc(inst.i32.any, *recipe.rex(*args, **kwargs)) X86_64.enc(inst.i32.any, *recipe(*args, **kwargs)) if w_bit: X86_64.enc(inst.i64.any, *recipe.rex(*args, w=1, **kwargs)) else: X86_64.enc(inst.i64.any, *recipe.rex(*args, **kwargs)) X86_64.enc(inst.i64.any, *recipe(*args, **kwargs)) for inst, opc in [ (base.iadd, 0x01), (base.isub, 0x29), (base.band, 0x21), (base.bor, 0x09), (base.bxor, 0x31)]: enc_i32_i64(inst, r.rr, opc) # Also add a `b1` encodings for the logic instructions. # TODO: Should this be done with 8-bit instructions? It would improve # partial register dependencies. 

# Also add `b1` encodings for the logic instructions.
# TODO: Should this be done with 8-bit instructions? It would improve
# partial register dependencies.
enc_both(base.band.b1, r.rr, 0x21)
enc_both(base.bor.b1, r.rr, 0x09)
enc_both(base.bxor.b1, r.rr, 0x31)

enc_i32_i64(base.imul, r.rrx, 0x0f, 0xaf)
enc_i32_i64(x86.sdivmodx, r.div, 0xf7, rrr=7)
enc_i32_i64(x86.udivmodx, r.div, 0xf7, rrr=6)

enc_i32_i64(x86.smulx, r.mulx, 0xf7, rrr=5)
enc_i32_i64(x86.umulx, r.mulx, 0xf7, rrr=4)

enc_i32_i64(base.copy, r.umr, 0x89)
enc_both(base.copy.b1, r.umr, 0x89)
enc_i32_i64(base.regmove, r.rmov, 0x89)
enc_both(base.regmove.b1, r.rmov, 0x89)

# Immediate instructions with sign-extended 8-bit and 32-bit immediate.
for inst, rrr in [
        (base.iadd_imm, 0),
        (base.band_imm, 4),
        (base.bor_imm, 1),
        (base.bxor_imm, 6)]:
    enc_i32_i64(inst, r.rib, 0x83, rrr=rrr)
    enc_i32_i64(inst, r.rid, 0x81, rrr=rrr)

# TODO: band_imm.i64 with an unsigned 32-bit immediate can be encoded as
# band_imm.i32. Can even use the single-byte immediate for 0xffff_ffXX masks.

# Immediate constants.
X86_32.enc(base.iconst.i32, *r.puid(0xb8))

X86_64.enc(base.iconst.i32, *r.puid.rex(0xb8))
X86_64.enc(base.iconst.i32, *r.puid(0xb8))
# The 32-bit immediate movl also zero-extends to 64 bits.
X86_64.enc(base.iconst.i64, *r.puid.rex(0xb8),
           instp=IsUnsignedInt(UnaryImm.imm, 32))
X86_64.enc(base.iconst.i64, *r.puid(0xb8),
           instp=IsUnsignedInt(UnaryImm.imm, 32))

# Sign-extended 32-bit immediate.
X86_64.enc(base.iconst.i64, *r.uid.rex(0xc7, rrr=0, w=1))

# Finally, the 0xb8 opcode takes an 8-byte immediate with a REX.W prefix.
X86_64.enc(base.iconst.i64, *r.puiq.rex(0xb8, w=1))

# bool constants.
enc_both(base.bconst.b1, r.puid_bool, 0xb8)

# Shifts and rotates.
# Note that the dynamic shift amount is only masked by 5 or 6 bits; the 8-bit
# and 16-bit shifts would need explicit masking.
for inst, rrr in [
        (base.rotl, 0),
        (base.rotr, 1),
        (base.ishl, 4),
        (base.ushr, 5),
        (base.sshr, 7)]:
    # Cannot use enc_i32_i64 for this pattern because instructions require
    # the .any suffix.
    X86_32.enc(inst.i32.any, *r.rc(0xd3, rrr=rrr))
    X86_64.enc(inst.i64.any, *r.rc.rex(0xd3, rrr=rrr, w=1))
    X86_64.enc(inst.i32.any, *r.rc.rex(0xd3, rrr=rrr))
    X86_64.enc(inst.i32.any, *r.rc(0xd3, rrr=rrr))

for inst, rrr in [
        (base.ishl_imm, 4),
        (base.ushr_imm, 5),
        (base.sshr_imm, 7)]:
    enc_i32_i64(inst, r.rib, 0xc1, rrr=rrr)

# Population count.
X86_32.enc(base.popcnt.i32, *r.urm(0xf3, 0x0f, 0xb8), isap=cfg.use_popcnt)
X86_64.enc(base.popcnt.i64, *r.urm.rex(0xf3, 0x0f, 0xb8, w=1),
           isap=cfg.use_popcnt)
X86_64.enc(base.popcnt.i32, *r.urm.rex(0xf3, 0x0f, 0xb8), isap=cfg.use_popcnt)
X86_64.enc(base.popcnt.i32, *r.urm(0xf3, 0x0f, 0xb8), isap=cfg.use_popcnt)

# Count leading zero bits.
X86_32.enc(base.clz.i32, *r.urm(0xf3, 0x0f, 0xbd), isap=cfg.use_lzcnt)
X86_64.enc(base.clz.i64, *r.urm.rex(0xf3, 0x0f, 0xbd, w=1),
           isap=cfg.use_lzcnt)
X86_64.enc(base.clz.i32, *r.urm.rex(0xf3, 0x0f, 0xbd), isap=cfg.use_lzcnt)
X86_64.enc(base.clz.i32, *r.urm(0xf3, 0x0f, 0xbd), isap=cfg.use_lzcnt)

# Count trailing zero bits.
X86_32.enc(base.ctz.i32, *r.urm(0xf3, 0x0f, 0xbc), isap=cfg.use_bmi1)
X86_64.enc(base.ctz.i64, *r.urm.rex(0xf3, 0x0f, 0xbc, w=1),
           isap=cfg.use_bmi1)
X86_64.enc(base.ctz.i32, *r.urm.rex(0xf3, 0x0f, 0xbc), isap=cfg.use_bmi1)
X86_64.enc(base.ctz.i32, *r.urm(0xf3, 0x0f, 0xbc), isap=cfg.use_bmi1)

#
# Loads and stores.
#
for recipe in [r.st, r.stDisp8, r.stDisp32]:
    enc_i32_i64_ld_st(base.store, True, recipe, 0x89)
    enc_x86_64(base.istore32.i64.any, recipe, 0x89)
    enc_i32_i64_ld_st(base.istore16, False, recipe, 0x66, 0x89)
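
# In the loop above, `store` passes w_bit=True since the 0x89 move needs REX.W
# to write all 64 bits of an i64, while `istore16` passes w_bit=False: its
# 0x66 operand-size prefix already selects a 16-bit store, so the REX form is
# only needed (without W) to reach r8 and up.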

# Byte stores are more complicated because the registers they can address
# depend on the presence of a REX prefix. The st*_abcd recipes fall back to
# the corresponding st* recipes when a REX prefix is applied.
for recipe in [r.st_abcd, r.stDisp8_abcd, r.stDisp32_abcd]:
    enc_both(base.istore8.i32.any, recipe, 0x88)
    enc_x86_64(base.istore8.i64.any, recipe, 0x88)

enc_i32_i64(base.spill, r.spillSib32, 0x89)
enc_i32_i64(base.regspill, r.regspill32, 0x89)

# Use a 32-bit write for spilling `b1` to avoid constraining the permitted
# registers.
# See MIN_SPILL_SLOT_SIZE which makes this safe.
enc_both(base.spill.b1, r.spillSib32, 0x89)
enc_both(base.regspill.b1, r.regspill32, 0x89)

for recipe in [r.ld, r.ldDisp8, r.ldDisp32]:
    enc_i32_i64_ld_st(base.load, True, recipe, 0x8b)
    enc_x86_64(base.uload32.i64, recipe, 0x8b)
    X86_64.enc(base.sload32.i64, *recipe.rex(0x63, w=1))
    enc_i32_i64_ld_st(base.uload16, True, recipe, 0x0f, 0xb7)
    enc_i32_i64_ld_st(base.sload16, True, recipe, 0x0f, 0xbf)
    enc_i32_i64_ld_st(base.uload8, True, recipe, 0x0f, 0xb6)
    enc_i32_i64_ld_st(base.sload8, True, recipe, 0x0f, 0xbe)

enc_i32_i64(base.fill, r.fillSib32, 0x8b)
enc_i32_i64(base.regfill, r.regfill32, 0x8b)

# Load 32 bits from `b1` spill slots. See `spill.b1` above.
enc_both(base.fill.b1, r.fillSib32, 0x8b)
enc_both(base.regfill.b1, r.regfill32, 0x8b)

# Push and Pop.
X86_32.enc(x86.push.i32, *r.pushq(0x50))
enc_x86_64(x86.push.i64, r.pushq, 0x50)

X86_32.enc(x86.pop.i32, *r.popq(0x58))
enc_x86_64(x86.pop.i64, r.popq, 0x58)

# Copy special.
X86_64.enc(base.copy_special, *r.copysp.rex(0x89, w=1))
X86_32.enc(base.copy_special, *r.copysp(0x89))

# Adjust SP by immediate.
X86_32.enc(base.adjust_sp_imm, *r.adjustsp8(0x83))
X86_32.enc(base.adjust_sp_imm, *r.adjustsp32(0x81))
X86_64.enc(base.adjust_sp_imm, *r.adjustsp8.rex(0x83, w=1))
X86_64.enc(base.adjust_sp_imm, *r.adjustsp32.rex(0x81, w=1))

#
# Float loads and stores.
#
enc_both(base.load.f32.any, r.fld, 0xf3, 0x0f, 0x10)
enc_both(base.load.f32.any, r.fldDisp8, 0xf3, 0x0f, 0x10)
enc_both(base.load.f32.any, r.fldDisp32, 0xf3, 0x0f, 0x10)

enc_both(base.load.f64.any, r.fld, 0xf2, 0x0f, 0x10)
enc_both(base.load.f64.any, r.fldDisp8, 0xf2, 0x0f, 0x10)
enc_both(base.load.f64.any, r.fldDisp32, 0xf2, 0x0f, 0x10)

enc_both(base.store.f32.any, r.fst, 0xf3, 0x0f, 0x11)
enc_both(base.store.f32.any, r.fstDisp8, 0xf3, 0x0f, 0x11)
enc_both(base.store.f32.any, r.fstDisp32, 0xf3, 0x0f, 0x11)

enc_both(base.store.f64.any, r.fst, 0xf2, 0x0f, 0x11)
enc_both(base.store.f64.any, r.fstDisp8, 0xf2, 0x0f, 0x11)
enc_both(base.store.f64.any, r.fstDisp32, 0xf2, 0x0f, 0x11)

enc_both(base.fill.f32, r.ffillSib32, 0xf3, 0x0f, 0x10)
enc_both(base.regfill.f32, r.fregfill32, 0xf3, 0x0f, 0x10)
enc_both(base.fill.f64, r.ffillSib32, 0xf2, 0x0f, 0x10)
enc_both(base.regfill.f64, r.fregfill32, 0xf2, 0x0f, 0x10)

enc_both(base.spill.f32, r.fspillSib32, 0xf3, 0x0f, 0x11)
enc_both(base.regspill.f32, r.fregspill32, 0xf3, 0x0f, 0x11)
enc_both(base.spill.f64, r.fspillSib32, 0xf2, 0x0f, 0x11)
enc_both(base.regspill.f64, r.fregspill32, 0xf2, 0x0f, 0x11)

#
# Function addresses.
#

# Non-PIC, all-zeros funcaddresses.
X86_32.enc(base.func_addr.i32, *r.fnaddr4(0xb8),
           isap=And(Not(allones_funcaddrs), Not(is_pic)))
X86_64.enc(base.func_addr.i64, *r.fnaddr8.rex(0xb8, w=1),
           isap=And(Not(allones_funcaddrs), Not(is_pic)))

# Non-PIC, all-ones funcaddresses.
X86_32.enc(base.func_addr.i32, *r.allones_fnaddr4(0xb8),
           isap=And(allones_funcaddrs, Not(is_pic)))
X86_64.enc(base.func_addr.i64, *r.allones_fnaddr8.rex(0xb8, w=1),
           isap=And(allones_funcaddrs, Not(is_pic)))

# 64-bit, colocated, both PIC and non-PIC. Use the lea instruction's
# pc-relative field.
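# Colocated functions are assumed to be within RIP-relative range (a signed
# 32-bit offset), so `lea` can materialize the address directly; the
# non-colocated PIC case below instead loads the address from the GOT with a
# 0x8b `mov`.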
X86_64.enc(base.func_addr.i64, *r.pcrel_fnaddr8.rex(0x8d, w=1),
           instp=IsColocatedFunc(FuncAddr.func_ref))

# 64-bit, non-colocated, PIC.
X86_64.enc(base.func_addr.i64, *r.got_fnaddr8.rex(0x8b, w=1), isap=is_pic)

#
# Global addresses.
#

# Non-PIC.
X86_32.enc(base.globalsym_addr.i32, *r.gvaddr4(0xb8), isap=Not(is_pic))
X86_64.enc(base.globalsym_addr.i64, *r.gvaddr8.rex(0xb8, w=1),
           isap=Not(is_pic))

# PIC, colocated.
X86_64.enc(base.globalsym_addr.i64, *r.pcrel_gvaddr8.rex(0x8d, w=1),
           isap=is_pic, instp=IsColocatedData())

# PIC, non-colocated.
X86_64.enc(base.globalsym_addr.i64, *r.got_gvaddr8.rex(0x8b, w=1),
           isap=is_pic)

#
# Call/return.
#

# 32-bit, both PIC and non-PIC.
X86_32.enc(base.call, *r.call_id(0xe8))

# 64-bit, colocated, both PIC and non-PIC. Use the call instruction's
# pc-relative field.
X86_64.enc(base.call, *r.call_id(0xe8), instp=IsColocatedFunc(Call.func_ref))

# 64-bit, non-colocated, PIC. There is no 64-bit non-colocated non-PIC version,
# since non-PIC is currently using the large model, which requires calls be
# lowered to func_addr+call_indirect.
X86_64.enc(base.call, *r.call_plt_id(0xe8), isap=is_pic)

X86_32.enc(base.call_indirect.i32, *r.call_r(0xff, rrr=2))
X86_64.enc(base.call_indirect.i64, *r.call_r.rex(0xff, rrr=2))
X86_64.enc(base.call_indirect.i64, *r.call_r(0xff, rrr=2))

X86_32.enc(base.x_return, *r.ret(0xc3))
X86_64.enc(base.x_return, *r.ret(0xc3))

#
# Branches.
#
enc_both(base.jump, r.jmpb, 0xeb)
enc_both(base.jump, r.jmpd, 0xe9)

enc_both(base.brif, r.brib, 0x70)
enc_both(base.brif, r.brid, 0x0f, 0x80)

# Not all float condition codes are legal, see `supported_floatccs`.
enc_both(base.brff, r.brfb, 0x70)
enc_both(base.brff, r.brfd, 0x0f, 0x80)

# Note that the tjccd opcode will be prefixed with 0x0f.
enc_i32_i64(base.brz, r.tjccb, 0x74)
enc_i32_i64(base.brz, r.tjccd, 0x84)
enc_i32_i64(base.brnz, r.tjccb, 0x75)
enc_i32_i64(base.brnz, r.tjccd, 0x85)

# Branching on a b1 value in a register only looks at the low 8 bits. See also
# the bint encodings below.
#
# Start with the worst-case encoding for X86_32 only. The register allocator
# can't handle a branch with an ABCD-constrained operand.
X86_32.enc(base.brz.b1, *r.t8jccd_long(0x84))
X86_32.enc(base.brnz.b1, *r.t8jccd_long(0x85))

enc_both(base.brz.b1, r.t8jccb_abcd, 0x74)
enc_both(base.brz.b1, r.t8jccd_abcd, 0x84)
enc_both(base.brnz.b1, r.t8jccb_abcd, 0x75)
enc_both(base.brnz.b1, r.t8jccd_abcd, 0x85)

#
# Trap as ud2.
#
X86_32.enc(base.trap, *r.trap(0x0f, 0x0b))
X86_64.enc(base.trap, *r.trap(0x0f, 0x0b))

# Using a standard EncRecipe, not a TailRecipe.
X86_32.enc(base.trapif, r.trapif, 0)
X86_64.enc(base.trapif, r.trapif, 0)
X86_32.enc(base.trapff, r.trapff, 0)
X86_64.enc(base.trapff, r.trapff, 0)

#
# Comparisons.
#
enc_i32_i64(base.icmp, r.icscc, 0x39)
enc_i32_i64(base.icmp_imm, r.icsccib, 0x83, rrr=7)
enc_i32_i64(base.icmp_imm, r.icsccid, 0x81, rrr=7)
enc_i32_i64(base.ifcmp, r.rcmp, 0x39)
enc_i32_i64(base.ifcmp_imm, r.rcmpib, 0x83, rrr=7)
enc_i32_i64(base.ifcmp_imm, r.rcmpid, 0x81, rrr=7)
# TODO: We could special-case ifcmp_imm(x, 0) to TEST(x, x).

X86_32.enc(base.ifcmp_sp.i32, *r.rcmp_sp(0x39))
X86_64.enc(base.ifcmp_sp.i64, *r.rcmp_sp.rex(0x39, w=1))

#
# Convert flags to bool.
#
# This encodes `b1` as an 8-bit low register with the value 0 or 1.
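# setCC writes only the low 8 bits of its destination, and without a REX
# prefix a ModRM byte register can only name %al/%cl/%dl/%bl (or their
# high-byte variants), which is why the ABCD-constrained recipes are used.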
enc_both(base.trueif, r.seti_abcd, 0x0f, 0x90)
enc_both(base.trueff, r.setf_abcd, 0x0f, 0x90)

#
# Conditional move (a.k.a integer select).
#
enc_i32_i64(base.selectif, r.cmov, 0x0F, 0x40)

#
# Bit scan forwards and reverse.
#
enc_i32_i64(x86.bsf, r.bsf_and_bsr, 0x0F, 0xBC)
enc_i32_i64(x86.bsr, r.bsf_and_bsr, 0x0F, 0xBD)

#
# Convert bool to int.
#
# This assumes that b1 is represented as an 8-bit low register with the value
# 0 or 1.
#
# Encode movzbq as movzbl, because it's equivalent and shorter.
X86_32.enc(base.bint.i32.b1, *r.urm_noflags_abcd(0x0f, 0xb6))
X86_64.enc(base.bint.i64.b1, *r.urm_noflags.rex(0x0f, 0xb6))
X86_64.enc(base.bint.i64.b1, *r.urm_noflags_abcd(0x0f, 0xb6))
X86_64.enc(base.bint.i32.b1, *r.urm_noflags.rex(0x0f, 0xb6))
X86_64.enc(base.bint.i32.b1, *r.urm_noflags_abcd(0x0f, 0xb6))

# Numerical conversions.

# Reducing an integer is a no-op.
X86_32.enc(base.ireduce.i8.i32, r.null, 0)
X86_32.enc(base.ireduce.i16.i32, r.null, 0)
X86_64.enc(base.ireduce.i8.i32, r.null, 0)
X86_64.enc(base.ireduce.i16.i32, r.null, 0)

X86_64.enc(base.ireduce.i8.i64, r.null, 0)
X86_64.enc(base.ireduce.i16.i64, r.null, 0)
X86_64.enc(base.ireduce.i32.i64, r.null, 0)

# TODO: Add encodings for cbw, cwde, cdqe, which are sign-extending
# instructions for %al/%ax/%eax to %ax/%eax/%rax.

# movsbl
X86_32.enc(base.sextend.i32.i8, *r.urm_noflags(0x0f, 0xbe))
X86_64.enc(base.sextend.i32.i8, *r.urm_noflags.rex(0x0f, 0xbe))
X86_64.enc(base.sextend.i32.i8, *r.urm_noflags(0x0f, 0xbe))

# movswl
X86_32.enc(base.sextend.i32.i16, *r.urm_noflags(0x0f, 0xbf))
X86_64.enc(base.sextend.i32.i16, *r.urm_noflags.rex(0x0f, 0xbf))
X86_64.enc(base.sextend.i32.i16, *r.urm_noflags(0x0f, 0xbf))

# movsbq
X86_64.enc(base.sextend.i64.i8, *r.urm_noflags.rex(0x0f, 0xbe, w=1))

# movswq
X86_64.enc(base.sextend.i64.i16, *r.urm_noflags.rex(0x0f, 0xbf, w=1))

# movslq
X86_64.enc(base.sextend.i64.i32, *r.urm_noflags.rex(0x63, w=1))

# movzbl
X86_32.enc(base.uextend.i32.i8, *r.urm_noflags(0x0f, 0xb6))
X86_64.enc(base.uextend.i32.i8, *r.urm_noflags.rex(0x0f, 0xb6))
X86_64.enc(base.uextend.i32.i8, *r.urm_noflags(0x0f, 0xb6))

# movzwl
X86_32.enc(base.uextend.i32.i16, *r.urm_noflags(0x0f, 0xb7))
X86_64.enc(base.uextend.i32.i16, *r.urm_noflags.rex(0x0f, 0xb7))
X86_64.enc(base.uextend.i32.i16, *r.urm_noflags(0x0f, 0xb7))

# movzbq, encoded as movzbl because it's equivalent and shorter.
X86_64.enc(base.uextend.i64.i8, *r.urm_noflags.rex(0x0f, 0xb6))
X86_64.enc(base.uextend.i64.i8, *r.urm_noflags(0x0f, 0xb6))

# movzwq, encoded as movzwl because it's equivalent and shorter.
X86_64.enc(base.uextend.i64.i16, *r.urm_noflags.rex(0x0f, 0xb7))
X86_64.enc(base.uextend.i64.i16, *r.urm_noflags(0x0f, 0xb7))

# A 32-bit register copy clears the high 32 bits.
X86_64.enc(base.uextend.i64.i32, *r.umr.rex(0x89))
X86_64.enc(base.uextend.i64.i32, *r.umr(0x89))

#
# Floating point.
#

# movd
enc_both(base.bitcast.f32.i32, r.frurm, 0x66, 0x0f, 0x6e)
enc_both(base.bitcast.i32.f32, r.rfumr, 0x66, 0x0f, 0x7e)

# movq
X86_64.enc(base.bitcast.f64.i64, *r.frurm.rex(0x66, 0x0f, 0x6e, w=1))
X86_64.enc(base.bitcast.i64.f64, *r.rfumr.rex(0x66, 0x0f, 0x7e, w=1))

# movaps
enc_both(base.copy.f32, r.furm, 0x0f, 0x28)
enc_both(base.copy.f64, r.furm, 0x0f, 0x28)
enc_both(base.regmove.f32, r.frmov, 0x0f, 0x28)
enc_both(base.regmove.f64, r.frmov, 0x0f, 0x28)

# cvtsi2ss
enc_i32_i64(base.fcvt_from_sint.f32, r.frurm, 0xf3, 0x0f, 0x2a)

# cvtsi2sd
enc_i32_i64(base.fcvt_from_sint.f64, r.frurm, 0xf2, 0x0f, 0x2a)

# cvtss2sd
enc_both(base.fpromote.f64.f32, r.furm, 0xf3, 0x0f, 0x5a)

# cvtsd2ss
enc_both(base.fdemote.f32.f64, r.furm, 0xf2, 0x0f, 0x5a)

# cvttss2si
enc_both(x86.cvtt2si.i32.f32, r.rfurm, 0xf3, 0x0f, 0x2c)
X86_64.enc(x86.cvtt2si.i64.f32, *r.rfurm.rex(0xf3, 0x0f, 0x2c, w=1))

# cvttsd2si
enc_both(x86.cvtt2si.i32.f64, r.rfurm, 0xf2, 0x0f, 0x2c)
X86_64.enc(x86.cvtt2si.i64.f64, *r.rfurm.rex(0xf2, 0x0f, 0x2c, w=1))

# Exact square roots.
enc_both(base.sqrt.f32, r.furm, 0xf3, 0x0f, 0x51)
enc_both(base.sqrt.f64, r.furm, 0xf2, 0x0f, 0x51)

# Rounding. The recipe looks at the opcode to pick an immediate.
for inst in [
        base.nearest,
        base.floor,
        base.ceil,
        base.trunc]:
    enc_both(inst.f32, r.furmi_rnd, 0x66, 0x0f, 0x3a, 0x0a, isap=use_sse41)
    enc_both(inst.f64, r.furmi_rnd, 0x66, 0x0f, 0x3a, 0x0b, isap=use_sse41)

# Binary arithmetic ops.
for inst, opc in [
        (base.fadd, 0x58),
        (base.fsub, 0x5c),
        (base.fmul, 0x59),
        (base.fdiv, 0x5e),
        (x86.fmin, 0x5d),
        (x86.fmax, 0x5f)]:
    enc_both(inst.f32, r.fa, 0xf3, 0x0f, opc)
    enc_both(inst.f64, r.fa, 0xf2, 0x0f, opc)

# Binary bitwise ops.
for inst, opc in [
        (base.band, 0x54),
        (base.bor, 0x56),
        (base.bxor, 0x57)]:
    enc_both(inst.f32, r.fa, 0x0f, opc)
    enc_both(inst.f64, r.fa, 0x0f, opc)

# The `andnps(x,y)` instruction computes `~x&y`, while `band_not(x,y)` is
# `x&~y`.
enc_both(base.band_not.f32, r.fax, 0x0f, 0x55)
enc_both(base.band_not.f64, r.fax, 0x0f, 0x55)

# Comparisons.
#
# This only covers the condition codes in `supported_floatccs`; the rest are
# handled by legalization patterns.
enc_both(base.fcmp.f32, r.fcscc, 0x0f, 0x2e)
enc_both(base.fcmp.f64, r.fcscc, 0x66, 0x0f, 0x2e)
enc_both(base.ffcmp.f32, r.fcmp, 0x0f, 0x2e)
enc_both(base.ffcmp.f64, r.fcmp, 0x66, 0x0f, 0x2e)