test-repo/parseinstrs.py

#!/usr/bin/python3

from binascii import unhexlify
from collections import OrderedDict, defaultdict, namedtuple
from copy import copy
from enum import Enum, IntEnum
from itertools import accumulate
import struct
import sys

def bitstruct(name, fields):
    names, sizes = zip(*(field.split(":") for field in fields))
    sizes = tuple(map(int, sizes))
    offsets = (0,) + tuple(accumulate(sizes))
    class __class:
        def __init__(self, **kwargs):
            for name in names:
                setattr(self, name, kwargs.get(name, 0))
        def _encode(self):
            return sum((getattr(self, name) & ((1 << size) - 1)) << offset
                for name, size, offset in zip(names, sizes, offsets))
    __class.__name__ = name
    __class._encode_size = offsets[-1]
    return __class

InstrFlags = bitstruct("InstrFlags", [
    "modrm_idx:2",
    "modreg_idx:2",
    "vexreg_idx:2",
    "zeroreg_idx:2",
    "operand_sizes:8",
    "imm_idx:2",
    "imm_size:2",
    "imm_control:3",
    "imm_byte:1",
    "gp_size_8:1",
    "gp_size_def64:1",
    "gp_instr_width:1",
    "gp_fixed_operand_size:3",
])
assert InstrFlags._encode_size <= 32

ENCODINGS = {
    "NP": InstrFlags(),
    "M": InstrFlags(modrm_idx=0^3),
    "M1": InstrFlags(modrm_idx=0^3, imm_idx=1^3, imm_control=1),
    "MI": InstrFlags(modrm_idx=0^3, imm_idx=1^3, imm_control=3),
    "MR": InstrFlags(modrm_idx=0^3, modreg_idx=1^3),
    "RM": InstrFlags(modrm_idx=1^3, modreg_idx=0^3),
    "RMA": InstrFlags(modrm_idx=1^3, modreg_idx=0^3, zeroreg_idx=2^3),
    "MRI": InstrFlags(modrm_idx=0^3, modreg_idx=1^3, imm_idx=2^3, imm_control=3),
    "RMI": InstrFlags(modrm_idx=1^3, modreg_idx=0^3, imm_idx=2^3, imm_control=3),
    "I": InstrFlags(imm_idx=0^3, imm_control=3),
    "IA": InstrFlags(zeroreg_idx=0^3, imm_idx=1^3, imm_control=3),
    "O": InstrFlags(modreg_idx=0^3),
    "OI": InstrFlags(modreg_idx=0^3, imm_idx=1^3, imm_control=3),
    "OA": InstrFlags(modreg_idx=0^3, zeroreg_idx=1^3),
    "AO": InstrFlags(modreg_idx=1^3, zeroreg_idx=0^3),
    "D": InstrFlags(imm_idx=0^3, imm_control=4),
    "FD": InstrFlags(zeroreg_idx=0^3, imm_idx=1^3, imm_control=2),
    "TD": InstrFlags(zeroreg_idx=1^3, imm_idx=0^3, imm_control=2),

    "RVM": InstrFlags(modrm_idx=2^3, modreg_idx=0^3, vexreg_idx=1^3),
    "RVMI": InstrFlags(modrm_idx=2^3, modreg_idx=0^3, vexreg_idx=1^3, imm_idx=3^3, imm_control=3, imm_byte=1),
    "RVMR": InstrFlags(modrm_idx=2^3, modreg_idx=0^3, vexreg_idx=1^3, imm_idx=3^3, imm_control=5, imm_byte=1),
    "RMV": InstrFlags(modrm_idx=1^3, modreg_idx=0^3, vexreg_idx=2^3),
    "VM": InstrFlags(modrm_idx=1^3, vexreg_idx=0^3),
    "VMI": InstrFlags(modrm_idx=1^3, vexreg_idx=0^3, imm_idx=2^3, imm_control=3, imm_byte=1),
    "MVR": InstrFlags(modrm_idx=0^3, modreg_idx=2^3, vexreg_idx=1^3),
}

OPKIND_LOOKUP = {
    "-": (0, 0),
    "IMM": (2, 0),
    "IMM8": (1, 0),
    "IMM16": (1, 1),
    "IMM32": (1, 2),
    "GP": (2, 0),
    "GP8": (1, 0),
    "GP16": (1, 1),
    "GP32": (1, 2),
    "GP64": (1, 3),
    "XMM": (3, 0),
    "XMM8": (1, 0),
    "XMM16": (1, 1),
    "XMM32": (1, 2),
    "XMM64": (1, 3),
    "XMM128": (1, 4),
    "XMM256": (1, 5),
    "SREG": (0, 0),
    "FPU": (0, 0),
}

class InstrDesc(namedtuple("InstrDesc", "mnemonic,flags,encoding")):
    __slots__ = ()
    @classmethod
    def parse(cls, desc):
        desc = desc.split()

        fixed_opsz = set()
        opsizes = 0
        for i, opkind in enumerate(desc[1:5]):
            enc_size, fixed_size = OPKIND_LOOKUP[opkind]
            if enc_size == 1: fixed_opsz.add(fixed_size)
            opsizes |= enc_size << 2 * i

        flags = copy(ENCODINGS[desc[0]])
        flags.operand_sizes = opsizes
        if fixed_opsz: flags.gp_fixed_operand_size = next(iter(fixed_opsz))

        # Miscellaneous Flags
        if "DEF64" in desc[6:]:       flags.gp_size_def64 = 1
        if "SIZE_8" in desc[6:]:      flags.gp_size_8 = 1
        if "INSTR_WIDTH" in desc[6:]: flags.gp_instr_width = 1
        if "IMM_8" in desc[6:]:       flags.imm_byte = 1

        return cls(desc[5], frozenset(desc[6:]), flags._encode())
    def encode(self, mnemonics_lut):
        return struct.pack("<HL", mnemonics_lut[self.mnemonic], self.encoding)

class EntryKind(Enum):
    NONE = 0
    INSTR = 1
    TABLE256 = 2
    TABLE8 = 3
    TABLE72 = 4
    TABLE_PREFIX = 5

    @property
    def table_length(self):
        return {
            EntryKind.INSTR: 0,
            EntryKind.TABLE256: 256,
            EntryKind.TABLE8: 8,
            EntryKind.TABLE72: 72,
            EntryKind.TABLE_PREFIX: 16
        }[self]

import re
opcode_regex = re.compile(r"^(?P<prefixes>(?P<vex>VEX\.)?(?P<legacy>NP|66|F2|F3)\.(?P<rexw>W[01]\.)?(?P<vexl>L[01]\.)?)?(?P<opcode>(?:[0-9a-f]{2})+)(?P<modrm>//?[0-7]|//[c-f][0-9a-f])?(?P<extended>\+)?$")

def parse_opcode(opcode_string):
    """
    Parse opcode string into list of type-index tuples.
    """
    match = opcode_regex.match(opcode_string)
    if match is None:
        raise Exception("invalid opcode: '%s'" % opcode_string)

    extended = match.group("extended") is not None

    opcode = [(EntryKind.TABLE256, x) for x in unhexlify(match.group("opcode"))]

    opcext = match.group("modrm")
    if opcext:
        if opcext[1] == "/":
            opcext = int(opcext[2:], 16)
            assert (0 <= opcext <= 7) or (0xc0 <= opcext <= 0xff)
            if opcext >= 0xc0:
                opcext -= 0xb8
            opcode.append((EntryKind.TABLE72, opcext))
        else:
            opcode.append((EntryKind.TABLE8, int(opcext[1:], 16)))

    if match.group("prefixes"):
        assert not extended

        legacy = {"NP": 0, "66": 1, "F3": 2, "F2": 3}[match.group("legacy")]
        entry = legacy | ((1 << 3) if match.group("vex") else 0)

        if match.group("vexl"):
            print("ignored mandatory VEX.L prefix for:", opcode_string)

        rexw = match.group("rexw")
        if not rexw:
            return [tuple(opcode) + ((EntryKind.TABLE_PREFIX, entry),),
                    tuple(opcode) + ((EntryKind.TABLE_PREFIX, entry | (1 << 2)),)]

        entry |= (1 << 2) if "W1" in rexw else 0
        return [tuple(opcode) + ((EntryKind.TABLE_PREFIX, entry),)]

    if not extended:
        return [tuple(opcode)]

    last_type, last_index = opcode[-1]
    assert last_type in (EntryKind.TABLE256, EntryKind.TABLE72)
    assert last_index & 7 == 0

    common_prefix = tuple(opcode[:-1])
    return [common_prefix + ((last_type, last_index + i),) for i in range(8)]

class Table:
    def __init__(self, root_count=1):
        self.data = OrderedDict()
        for i in range(root_count):
            self.data["root%d"%i] = (EntryKind.TABLE256, [None] * 256)

    def compile(self, mnemonics_lut):
        offsets = {}
        annotations = {}
        currentOffset = 0
        stats = defaultdict(int)
        for name, (kind, _) in self.data.items():
            annotations[currentOffset] = "%s(%d)" % (name, kind.value)
            offsets[name] = currentOffset
            stats[kind] += 1
            if kind.table_length:
                currentOffset += kind.table_length * 2
            else:
                currentOffset += 6
            currentOffset = (currentOffset + 7) & ~7
            assert currentOffset < 0x10000

        data = b""
        for name, (kind, value) in self.data.items():
            if len(data) < offsets[name]:
                data += b"\0" * (offsets[name] - len(data))
            assert len(data) == offsets[name]
            if kind == EntryKind.INSTR:
                data += value
            else: # Table
                # count = sum(1 for x in value if x is not None)
                # print("Table of kind", kind, "with %d/%d entries"%(count, kind.table_length))
                for i, entry in enumerate(value):
                    if entry is not None:
                        targetKind, _ = self.data[entry]
                        value = (offsets[entry] & ~7) | targetKind.value
                    else:
                        value = 0
                    data += struct.pack("<H", value)

        print("%d bytes" % len(data), stats)
        return data, annotations

    def deduplicate(self):
        # Make values hashable
        for n, (k, v) in self.data.items():
            self.data[n] = k, (v if k == EntryKind.INSTR else tuple(v))
        synonyms = True
        while synonyms:
            entries = {} # Mapping from entry to name
            synonyms = {} # Mapping from name to unique name
            for name, entry in self.data.items():
                if entry in entries:
                    synonyms[name] = entries[entry]
                else:
                    entries[entry] = name
            for name, (kind, value) in self.data.items():
                if kind != EntryKind.INSTR:
                    self.data[name] = kind, tuple(synonyms.get(v, v) for v in value)
            for key in synonyms:
                del self.data[key]

    def add_opcode(self, opcode, instr_encoding, root_idx=0):
        opcode = list(opcode) + [(None, None)]
        opcode = [(opcode[i+1][0], opcode[i][1]) for i in range(len(opcode)-1)]

        name, table = "t%d"%root_idx, self.data["root%d"%root_idx]
        for kind, byte in opcode[:-1]:
            if table[1][byte] is None:
                name += "{:02x}".format(byte)
                self.data[name] = kind, [None] * kind.table_length
                table[1][byte] = name
            else:
                name = table[1][byte]
            table = self.data[name]
            assert table[0] == kind

        # An opcode can occur once only.
        assert table[1][opcode[-1][1]] is None

        name += "{:02x}/{}".format(opcode[-1][1], "??")
        table[1][opcode[-1][1]] = name
        self.data[name] = EntryKind.INSTR, instr_encoding

def wrap(string):
    return "\n".join(string[i:i+80] for i in range(0, len(string), 80))

def bytes_to_table(data, notes):
    hexdata = ",".join("0x{:02x}".format(byte) for byte in data)
    offs = [0] + sorted(notes.keys()) + [len(data)]
    return "\n".join(wrap(hexdata[p*5:c*5]) + "\n//%04x "%c + notes.get(c, "")
                     for p, c in zip(offs, offs[1:]))

template = """// Auto-generated file -- do not modify!
#if defined(FD_DECODE_TABLE_DATA_32)
{hex_table32}
#elif defined(FD_DECODE_TABLE_DATA_64)
{hex_table64}
#elif defined(FD_DECODE_TABLE_MNEMONICS)
{mnemonic_list}
#elif defined(FD_DECODE_TABLE_STRTAB1)
{mnemonic_cstr}
#elif defined(FD_DECODE_TABLE_STRTAB2)
{mnemonic_offsets}
#else
#error "unspecified decode table"
#endif
"""

if __name__ == "__main__":
    entries = defaultdict(list)
    with open(sys.argv[1], "r") as f:
        for line in f.read().splitlines():
            if line and line[0] != "#":
                opcode_string, desc = tuple(line.split(maxsplit=1))
                for opcode in parse_opcode(opcode_string):
                    entries[opcode].append(InstrDesc.parse(desc))

    mnemonics = sorted({desc.mnemonic for descs in entries.values() for desc in descs})
    mnemonics_lut = {name: mnemonics.index(name) for name in mnemonics}
    table32 = Table()
    table64 = Table()
    masks = "ONLY64", "ONLY32"
    for opcode, descs in entries.items():
        for desc in descs:
            if "ONLY64" not in desc.flags:
                table32.add_opcode(opcode, desc.encode(mnemonics_lut))
            if "ONLY32" not in desc.flags:
                table64.add_opcode(opcode, desc.encode(mnemonics_lut))

    table32.deduplicate()
    table64.deduplicate()

    mnemonic_tab = [0]
    for name in mnemonics:
        mnemonic_tab.append(mnemonic_tab[-1] + len(name) + 1)
    mnemonic_cstr = '"' + "\\0".join(mnemonics) + '"'

    file = template.format(
        hex_table32=bytes_to_table(*table32.compile(mnemonics_lut)),
        hex_table64=bytes_to_table(*table64.compile(mnemonics_lut)),
        mnemonic_list="\n".join("FD_MNEMONIC(%s,%d)"%entry for entry in mnemonics_lut.items()),
        mnemonic_cstr=mnemonic_cstr,
        mnemonic_offsets=",".join(str(off) for off in mnemonic_tab),
    )

    with open(sys.argv[2], "w") as f:
        f.write(file)