moved crates in lib/ to src/, renamed crates, modified some files' text (#660)

This commit is contained in:
lazypassion
2019-01-28 18:56:54 -05:00
committed by Dan Gohman
parent 54959cf5bb
commit 747ad3c4c5
508 changed files with 94 additions and 92 deletions

View File

@@ -0,0 +1,24 @@
"""
Cranelift target ISA definitions
--------------------------------
The :py:mod:`isa` package contains sub-packages for each target instruction set
architecture supported by Cranelift.
"""
from __future__ import absolute_import
from cdsl.isa import TargetISA # noqa
from . import riscv, x86, arm32, arm64
try:
from typing import List # noqa
except ImportError:
pass
def all_isas():
# type: () -> List[TargetISA]
"""
Get a list of all the supported target ISAs. Each target ISA is represented
as a :py:class:`cranelift.TargetISA` instance.
"""
return [riscv.ISA, x86.ISA, arm32.ISA, arm64.ISA]
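# A minimal usage sketch (an assumption for illustration: this package is
# importable as `isa`, and `TargetISA.name` holds the name passed to the
# constructor):
#
#     from isa import all_isas
#     for target in all_isas():
#         print(target.name)   # 'riscv', 'x86', 'arm32', 'arm64'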

View File

@@ -0,0 +1,15 @@
"""
ARM 32-bit Architecture
-----------------------
This target ISA generates code for ARMv7 and ARMv8 CPUs in 32-bit mode
(AArch32). We support both ARM and Thumb2 instruction encodings.
"""
from __future__ import absolute_import
from . import defs
from . import settings, registers # noqa
from cdsl.isa import TargetISA # noqa
# Re-export the primary target ISA definition.
ISA = defs.ISA.finish() # type: TargetISA

View File

@@ -0,0 +1,19 @@
"""
ARM 32-bit definitions.
Commonly used definitions.
"""
from __future__ import absolute_import
from cdsl.isa import TargetISA, CPUMode
import base.instructions
from base.legalize import narrow
ISA = TargetISA('arm32', [base.instructions.GROUP]) # type: TargetISA
# CPU modes for 32-bit ARM and Thumb2.
A32 = CPUMode('A32', ISA)
T32 = CPUMode('T32', ISA)
# TODO: Refine these.
A32.legalize_type(narrow)
T32.legalize_type(narrow)

View File

@@ -0,0 +1,45 @@
"""
ARM32 register banks.
"""
from __future__ import absolute_import
from cdsl.registers import RegBank, RegClass
from .defs import ISA
# Define the larger float bank first to avoid the alignment gap.
FloatRegs = RegBank(
'FloatRegs', ISA, r"""
Floating point registers.
The floating point register units correspond to the S-registers, but
extended as if there were 64 registers.
- S registers are one unit each.
- D registers are two units each, even D16 and above.
- Q registers are 4 units each.
""",
units=64, prefix='s')
# Special register units:
# - r15 is the program counter.
# - r14 is the link register.
# - r13 is usually the stack pointer.
IntRegs = RegBank(
'IntRegs', ISA,
'General purpose registers',
units=16, prefix='r')
FlagRegs = RegBank(
'FlagRegs', ISA,
'Flag registers',
units=1,
pressure_tracking=False,
names=['nzcv'])
GPR = RegClass(IntRegs)
S = RegClass(FloatRegs, count=32)
D = RegClass(FloatRegs, width=2)
Q = RegClass(FloatRegs, width=4)
FLAG = RegClass(FlagRegs)
RegClass.extract_names(globals())

View File

@@ -0,0 +1,11 @@
"""
ARM32 settings.
"""
from __future__ import absolute_import
from cdsl.settings import SettingGroup
import base.settings as shared
from .defs import ISA
ISA.settings = SettingGroup('arm32', parent=shared.group)
ISA.settings.close(globals())

View File

@@ -0,0 +1,14 @@
"""
ARM 64-bit Architecture
-----------------------
ARMv8 CPUs running the AArch64 architecture.
"""
from __future__ import absolute_import
from . import defs
from . import settings, registers # noqa
from cdsl.isa import TargetISA # noqa
# Re-export the primary target ISA definition.
ISA = defs.ISA.finish() # type: TargetISA

View File

@@ -0,0 +1,15 @@
"""
ARM64 definitions.
Commonly used definitions.
"""
from __future__ import absolute_import
from cdsl.isa import TargetISA, CPUMode
import base.instructions
from base.legalize import narrow
ISA = TargetISA('arm64', [base.instructions.GROUP]) # type: TargetISA
A64 = CPUMode('A64', ISA)
# TODO: Refine these
A64.legalize_type(narrow)

View File

@@ -0,0 +1,32 @@
"""
AArch64 register banks.
"""
from __future__ import absolute_import
from cdsl.registers import RegBank, RegClass
from .defs import ISA
# The `x31` regunit serves as the stack pointer / zero register depending on
# context. We reserve it and don't model the difference.
IntRegs = RegBank(
'IntRegs', ISA,
'General purpose registers',
units=32, prefix='x')
FloatRegs = RegBank(
'FloatRegs', ISA,
'Floating point registers',
units=32, prefix='v')
FlagRegs = RegBank(
'FlagRegs', ISA,
'Flag registers',
units=1,
pressure_tracking=False,
names=['nzcv'])
GPR = RegClass(IntRegs)
FPR = RegClass(FloatRegs)
FLAG = RegClass(FlagRegs)
RegClass.extract_names(globals())

View File

@@ -0,0 +1,11 @@
"""
ARM64 settings.
"""
from __future__ import absolute_import
from cdsl.settings import SettingGroup
import base.settings as shared
from .defs import ISA
ISA.settings = SettingGroup('arm64', parent=shared.group)
ISA.settings.close(globals())

View File

@@ -0,0 +1,33 @@
"""
RISC-V Target
-------------
`RISC-V <https://riscv.org/>`_ is an open instruction set architecture
originally developed at UC Berkeley. It is a RISC-style ISA with either a
32-bit (RV32I) or 64-bit (RV64I) base instruction set and a number of optional
extensions:
RV32M / RV64M
Integer multiplication and division.
RV32A / RV64A
Atomics.
RV32F / RV64F
Single-precision IEEE floating point.
RV32D / RV64D
Double-precision IEEE floating point.
RV32G / RV64G
General purpose instruction sets. This represents the union of the I, M, A,
F, and D instruction sets listed above.
"""
from __future__ import absolute_import
from . import defs
from . import encodings, settings, registers # noqa
from cdsl.isa import TargetISA # noqa
# Re-export the primary target ISA definition.
ISA = defs.ISA.finish() # type: TargetISA

View File

@@ -0,0 +1,14 @@
"""
RISC-V definitions.
Commonly used definitions.
"""
from __future__ import absolute_import
from cdsl.isa import TargetISA, CPUMode
import base.instructions
ISA = TargetISA('riscv', [base.instructions.GROUP]) # type: TargetISA
# CPU modes for 32-bit and 64-bit operation.
RV32 = CPUMode('RV32', ISA)
RV64 = CPUMode('RV64', ISA)

View File

@@ -0,0 +1,162 @@
"""
RISC-V Encodings.
"""
from __future__ import absolute_import
from base import instructions as base
from base.immediates import intcc
from .defs import RV32, RV64
from .recipes import OPIMM, OPIMM32, OP, OP32, LUI, BRANCH, JALR, JAL
from .recipes import LOAD, STORE
from .recipes import R, Rshamt, Ricmp, Ii, Iz, Iicmp, Iret, Icall, Icopy
from .recipes import U, UJ, UJcall, SB, SBzero, GPsp, GPfi, Irmov
from .settings import use_m
from cdsl.ast import Var
from base.legalize import narrow, expand
RV32.legalize_monomorphic(expand)
RV32.legalize_type(
default=narrow,
i32=expand,
f32=expand,
f64=expand)
RV64.legalize_monomorphic(expand)
RV64.legalize_type(
default=narrow,
i32=expand,
i64=expand,
f32=expand,
f64=expand)
# Dummies for instruction predicates.
x = Var('x')
y = Var('y')
dest = Var('dest')
args = Var('args')
# Basic arithmetic binary instructions are encoded in an R-type instruction.
for inst, inst_imm, f3, f7 in [
(base.iadd, base.iadd_imm, 0b000, 0b0000000),
(base.isub, None, 0b000, 0b0100000),
(base.bxor, base.bxor_imm, 0b100, 0b0000000),
(base.bor, base.bor_imm, 0b110, 0b0000000),
(base.band, base.band_imm, 0b111, 0b0000000)
]:
RV32.enc(inst.i32, R, OP(f3, f7))
RV64.enc(inst.i64, R, OP(f3, f7))
# Immediate versions for add/xor/or/and.
if inst_imm:
RV32.enc(inst_imm.i32, Ii, OPIMM(f3))
RV64.enc(inst_imm.i64, Ii, OPIMM(f3))
# 32-bit ops in RV64.
RV64.enc(base.iadd.i32, R, OP32(0b000, 0b0000000))
RV64.enc(base.isub.i32, R, OP32(0b000, 0b0100000))
# There are no andiw/oriw/xoriw variations.
RV64.enc(base.iadd_imm.i32, Ii, OPIMM32(0b000))
# Use iadd_imm with %x0 to materialize constants.
RV32.enc(base.iconst.i32, Iz, OPIMM(0b000))
RV64.enc(base.iconst.i32, Iz, OPIMM(0b000))
RV64.enc(base.iconst.i64, Iz, OPIMM(0b000))
# Dynamic shifts have the same masking semantics as the clif base instructions.
for inst, inst_imm, f3, f7 in [
(base.ishl, base.ishl_imm, 0b001, 0b0000000),
(base.ushr, base.ushr_imm, 0b101, 0b0000000),
(base.sshr, base.sshr_imm, 0b101, 0b0100000),
]:
RV32.enc(inst.i32.i32, R, OP(f3, f7))
RV64.enc(inst.i64.i64, R, OP(f3, f7))
RV64.enc(inst.i32.i32, R, OP32(f3, f7))
# Allow i32 shift amounts in 64-bit shifts.
RV64.enc(inst.i64.i32, R, OP(f3, f7))
RV64.enc(inst.i32.i64, R, OP32(f3, f7))
# Immediate shifts.
RV32.enc(inst_imm.i32, Rshamt, OPIMM(f3, f7))
RV64.enc(inst_imm.i64, Rshamt, OPIMM(f3, f7))
RV64.enc(inst_imm.i32, Rshamt, OPIMM32(f3, f7))
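# Illustrative sketch of the masking semantics referenced above: both the
# clif shift instructions and RISC-V SLL/SRL/SRA reduce a dynamic shift
# amount modulo the operand width, so no extra masking code is needed here.
def _sll32(x, amt):
    # Hypothetical helper for illustration only.
    return (x << (amt & 31)) & 0xffffffff
assert _sll32(1, 33) == _sll32(1, 1) == 2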
# Signed and unsigned integer 'less than'. There are no 'w' variants for
# comparing 32-bit numbers in RV64.
RV32.enc(base.icmp.i32(intcc.slt, x, y), Ricmp, OP(0b010, 0b0000000))
RV64.enc(base.icmp.i64(intcc.slt, x, y), Ricmp, OP(0b010, 0b0000000))
RV32.enc(base.icmp.i32(intcc.ult, x, y), Ricmp, OP(0b011, 0b0000000))
RV64.enc(base.icmp.i64(intcc.ult, x, y), Ricmp, OP(0b011, 0b0000000))
RV32.enc(base.icmp_imm.i32(intcc.slt, x, y), Iicmp, OPIMM(0b010))
RV64.enc(base.icmp_imm.i64(intcc.slt, x, y), Iicmp, OPIMM(0b010))
RV32.enc(base.icmp_imm.i32(intcc.ult, x, y), Iicmp, OPIMM(0b011))
RV64.enc(base.icmp_imm.i64(intcc.ult, x, y), Iicmp, OPIMM(0b011))
# Integer constants with the low 12 bits clear are materialized by lui.
RV32.enc(base.iconst.i32, U, LUI())
RV64.enc(base.iconst.i32, U, LUI())
RV64.enc(base.iconst.i64, U, LUI())
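# Illustrative check of when the LUI form applies: the constant must have
# its low 12 bits clear and fit in 32 signed bits (see the `U` recipe's
# `IsSignedInt(imm, 32, 12)` predicate).
_lui_imm = 0x12345000  # hypothetical constant for illustration
assert _lui_imm & 0xfff == 0 and -(1 << 31) <= _lui_imm < (1 << 31)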
# "M" Standard Extension for Integer Multiplication and Division.
# Gated by the `use_m` flag.
RV32.enc(base.imul.i32, R, OP(0b000, 0b0000001), isap=use_m)
RV64.enc(base.imul.i64, R, OP(0b000, 0b0000001), isap=use_m)
RV64.enc(base.imul.i32, R, OP32(0b000, 0b0000001), isap=use_m)
# Control flow.
# Unconditional branches.
RV32.enc(base.jump, UJ, JAL())
RV64.enc(base.jump, UJ, JAL())
RV32.enc(base.call, UJcall, JAL())
RV64.enc(base.call, UJcall, JAL())
# Conditional branches.
for cond, f3 in [
(intcc.eq, 0b000),
(intcc.ne, 0b001),
(intcc.slt, 0b100),
(intcc.sge, 0b101),
(intcc.ult, 0b110),
(intcc.uge, 0b111)
]:
RV32.enc(base.br_icmp.i32(cond, x, y, dest, args), SB, BRANCH(f3))
RV64.enc(base.br_icmp.i64(cond, x, y, dest, args), SB, BRANCH(f3))
for inst, f3 in [
(base.brz, 0b000),
(base.brnz, 0b001)
]:
RV32.enc(inst.i32, SBzero, BRANCH(f3))
RV64.enc(inst.i64, SBzero, BRANCH(f3))
RV32.enc(inst.b1, SBzero, BRANCH(f3))
RV64.enc(inst.b1, SBzero, BRANCH(f3))
# Returns are a special case of JALR using %x1 to hold the return address.
# The return address is provided by a special-purpose `link` return value that
# is added by legalize_signature().
RV32.enc(base.x_return, Iret, JALR())
RV64.enc(base.x_return, Iret, JALR())
RV32.enc(base.call_indirect.i32, Icall, JALR())
RV64.enc(base.call_indirect.i64, Icall, JALR())
# Spill and fill.
RV32.enc(base.spill.i32, GPsp, STORE(0b010))
RV64.enc(base.spill.i32, GPsp, STORE(0b010))
RV64.enc(base.spill.i64, GPsp, STORE(0b011))
RV32.enc(base.fill.i32, GPfi, LOAD(0b010))
RV64.enc(base.fill.i32, GPfi, LOAD(0b010))
RV64.enc(base.fill.i64, GPfi, LOAD(0b011))
# Register copies.
RV32.enc(base.copy.i32, Icopy, OPIMM(0b000))
RV64.enc(base.copy.i64, Icopy, OPIMM(0b000))
RV64.enc(base.copy.i32, Icopy, OPIMM32(0b000))
RV32.enc(base.regmove.i32, Irmov, OPIMM(0b000))
RV64.enc(base.regmove.i64, Irmov, OPIMM(0b000))
RV64.enc(base.regmove.i32, Irmov, OPIMM32(0b000))
RV32.enc(base.copy.b1, Icopy, OPIMM(0b000))
RV64.enc(base.copy.b1, Icopy, OPIMM(0b000))
RV32.enc(base.regmove.b1, Irmov, OPIMM(0b000))
RV64.enc(base.regmove.b1, Irmov, OPIMM(0b000))

View File

@@ -0,0 +1,225 @@
"""
RISC-V Encoding recipes.
The encoding recipes defined here more or less correspond to the RISC-V native
instruction formats described in the reference:
The RISC-V Instruction Set Manual
Volume I: User-Level ISA
Version 2.1
"""
from __future__ import absolute_import
from cdsl.isa import EncRecipe
from cdsl.predicates import IsSignedInt
from cdsl.registers import Stack
from base.formats import Binary, BinaryImm, MultiAry, IntCompare, IntCompareImm
from base.formats import Unary, UnaryImm, BranchIcmp, Branch, Jump
from base.formats import Call, CallIndirect, RegMove
from .registers import GPR
# The low 7 bits of a RISC-V instruction form the base opcode. All 32-bit
# instructions have 0b11 as their two low bits, with bits 6:2 determining the
# base opcode.
#
# Encbits for the 32-bit recipes are opcode[6:2] | (funct3 << 5) | ...
# The functions below encode the encbits.
def LOAD(funct3):
# type: (int) -> int
assert funct3 <= 0b111
return 0b00000 | (funct3 << 5)
def STORE(funct3):
# type: (int) -> int
assert funct3 <= 0b111
return 0b01000 | (funct3 << 5)
def BRANCH(funct3):
# type: (int) -> int
assert funct3 <= 0b111
return 0b11000 | (funct3 << 5)
def JALR(funct3=0):
# type: (int) -> int
assert funct3 <= 0b111
return 0b11001 | (funct3 << 5)
def JAL():
# type: () -> int
return 0b11011
def OPIMM(funct3, funct7=0):
# type: (int, int) -> int
assert funct3 <= 0b111
return 0b00100 | (funct3 << 5) | (funct7 << 8)
def OPIMM32(funct3, funct7=0):
# type: (int, int) -> int
assert funct3 <= 0b111
return 0b00110 | (funct3 << 5) | (funct7 << 8)
def OP(funct3, funct7):
# type: (int, int) -> int
assert funct3 <= 0b111
assert funct7 <= 0b1111111
return 0b01100 | (funct3 << 5) | (funct7 << 8)
def OP32(funct3, funct7):
# type: (int, int) -> int
assert funct3 <= 0b111
assert funct7 <= 0b1111111
return 0b01110 | (funct3 << 5) | (funct7 << 8)
def AUIPC():
# type: () -> int
return 0b00101
def LUI():
# type: () -> int
return 0b01101
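# Worked example of the encbits layout: R-type ADD uses the OP base opcode
# with funct3=0 and funct7=0, so its encbits are just 0b01100. The full
# 7-bit RISC-V opcode field is (encbits[4:0] << 2) | 0b11 = 0b0110011.
assert OP(0b000, 0b0000000) == 0b01100
assert ((OP(0b000, 0b0000000) & 0b11111) << 2) | 0b11 == 0b0110011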
# R-type 32-bit instructions: These are mostly binary arithmetic instructions.
# The encbits are `opcode[6:2] | (funct3 << 5) | (funct7 << 8)`.
R = EncRecipe(
'R', Binary, base_size=4, ins=(GPR, GPR), outs=GPR,
emit='put_r(bits, in_reg0, in_reg1, out_reg0, sink);')
# R-type with an immediate shift amount instead of rs2.
Rshamt = EncRecipe(
'Rshamt', BinaryImm, base_size=4, ins=GPR, outs=GPR,
emit='put_rshamt(bits, in_reg0, imm.into(), out_reg0, sink);')
# R-type encoding of an integer comparison.
Ricmp = EncRecipe(
'Ricmp', IntCompare, base_size=4, ins=(GPR, GPR), outs=GPR,
emit='put_r(bits, in_reg0, in_reg1, out_reg0, sink);')
Ii = EncRecipe(
'Ii', BinaryImm, base_size=4, ins=GPR, outs=GPR,
instp=IsSignedInt(BinaryImm.imm, 12),
emit='put_i(bits, in_reg0, imm.into(), out_reg0, sink);')
# I-type instruction with a hardcoded %x0 rs1.
Iz = EncRecipe(
'Iz', UnaryImm, base_size=4, ins=(), outs=GPR,
instp=IsSignedInt(UnaryImm.imm, 12),
emit='put_i(bits, 0, imm.into(), out_reg0, sink);')
# I-type encoding of an integer comparison.
Iicmp = EncRecipe(
'Iicmp', IntCompareImm, base_size=4, ins=GPR, outs=GPR,
instp=IsSignedInt(IntCompareImm.imm, 12),
emit='put_i(bits, in_reg0, imm.into(), out_reg0, sink);')
# I-type encoding for `jalr` as a return instruction. We won't use the
# immediate offset.
# The variable return values are not encoded.
Iret = EncRecipe(
'Iret', MultiAry, base_size=4, ins=(), outs=(),
emit='''
// Return instructions are always a jalr to %x1.
// The return address is provided as a special-purpose link argument.
put_i(
bits,
1, // rs1 = %x1
0, // no offset.
0, // rd = %x0: no address written.
sink,
);
''')
# I-type encoding for `jalr` as a call_indirect.
Icall = EncRecipe(
'Icall', CallIndirect, base_size=4, ins=GPR, outs=(),
emit='''
// call_indirect instructions are jalr with rd=%x1.
put_i(
bits,
in_reg0,
0, // no offset.
1, // rd = %x1: link register.
sink,
);
''')
# A GPR-to-GPR copy is implemented as `addi rd, rs, 0`.
Icopy = EncRecipe(
'Icopy', Unary, base_size=4, ins=GPR, outs=GPR,
emit='put_i(bits, in_reg0, 0, out_reg0, sink);')
# Same for a GPR regmove.
Irmov = EncRecipe(
'Irmov', RegMove, base_size=4, ins=GPR, outs=(),
emit='put_i(bits, src, 0, dst, sink);')
# U-type instructions have a 20-bit immediate that targets bits 12-31.
U = EncRecipe(
'U', UnaryImm, base_size=4, ins=(), outs=GPR,
instp=IsSignedInt(UnaryImm.imm, 32, 12),
emit='put_u(bits, imm.into(), out_reg0, sink);')
# UJ-type unconditional branch instructions.
UJ = EncRecipe(
'UJ', Jump, base_size=4, ins=(), outs=(), branch_range=(0, 21),
emit='''
let dest = i64::from(func.offsets[destination]);
let disp = dest - i64::from(sink.offset());
put_uj(bits, disp, 0, sink);
''')
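# Illustrative reading of `branch_range=(0, 21)`, assuming the tuple means
# (origin, bits): the JAL displacement must fit in a 21-bit signed field
# (20 encoded bits, scaled by 2), i.e. roughly +/-1 MiB from the branch.
def _jal_disp_ok(disp):
    # Hypothetical helper for illustration only.
    return -(1 << 20) <= disp < (1 << 20)
assert _jal_disp_ok(4096) and not _jal_disp_ok(1 << 20)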
UJcall = EncRecipe(
'UJcall', Call, base_size=4, ins=(), outs=(),
emit='''
sink.reloc_external(Reloc::RiscvCall,
&func.dfg.ext_funcs[func_ref].name,
0);
// rd=%x1 is the standard link register.
put_uj(bits, 0, 1, sink);
''')
# SB-type branch instructions.
SB = EncRecipe(
'SB', BranchIcmp, base_size=4,
ins=(GPR, GPR), outs=(),
branch_range=(0, 13),
emit='''
let dest = i64::from(func.offsets[destination]);
let disp = dest - i64::from(sink.offset());
put_sb(bits, disp, in_reg0, in_reg1, sink);
''')
# SB-type branch instruction with rs2 fixed to zero.
SBzero = EncRecipe(
'SBzero', Branch, base_size=4,
ins=GPR, outs=(),
branch_range=(0, 13),
emit='''
let dest = i64::from(func.offsets[destination]);
let disp = dest - i64::from(sink.offset());
put_sb(bits, disp, in_reg0, 0, sink);
''')
# Spill of a GPR.
GPsp = EncRecipe(
'GPsp', Unary, base_size=4,
ins=GPR, outs=Stack(GPR),
emit='unimplemented!();')
# Fill of a GPR.
GPfi = EncRecipe(
'GPfi', Unary, base_size=4,
ins=Stack(GPR), outs=GPR,
emit='unimplemented!();')

View File

@@ -0,0 +1,23 @@
"""
RISC-V register banks.
"""
from __future__ import absolute_import
from cdsl.registers import RegBank, RegClass
from .defs import ISA
# We include `x0`, a.k.a. `zero`, in the register bank. It will be reserved.
IntRegs = RegBank(
'IntRegs', ISA,
'General purpose registers',
units=32, prefix='x')
FloatRegs = RegBank(
'FloatRegs', ISA,
'Floating point registers',
units=32, prefix='f')
GPR = RegClass(IntRegs)
FPR = RegClass(FloatRegs)
RegClass.extract_names(globals())

View File

@@ -0,0 +1,31 @@
"""
RISC-V settings.
"""
from __future__ import absolute_import
from cdsl.settings import SettingGroup, BoolSetting
from cdsl.predicates import And
import base.settings as shared
from .defs import ISA
ISA.settings = SettingGroup('riscv', parent=shared.group)
supports_m = BoolSetting("CPU supports the 'M' extension (mul/div)")
supports_a = BoolSetting("CPU supports the 'A' extension (atomics)")
supports_f = BoolSetting("CPU supports the 'F' extension (float)")
supports_d = BoolSetting("CPU supports the 'D' extension (double)")
enable_m = BoolSetting(
"Enable the use of 'M' instructions if available",
default=True)
enable_e = BoolSetting(
"Enable the 'RV32E' instruction set with only 16 registers")
use_m = And(supports_m, enable_m)
use_a = And(supports_a, shared.enable_atomics)
use_f = And(supports_f, shared.enable_float)
use_d = And(supports_d, shared.enable_float)
full_float = And(shared.enable_simd, supports_f, supports_d)
ISA.settings.close(globals())

View File

@@ -0,0 +1,22 @@
"""
x86 Target Architecture
-----------------------
This target ISA generates code for x86 CPUs with two separate CPU modes:
`I32`
32-bit x86 architecture, also known as 'IA-32' and sometimes referred
to as 'i386'. Note, however, that Cranelift depends on instructions not
in the original i386, such as SSE2, CMOVcc, and UD2.
`I64`
x86-64 architecture, also known as 'AMD64', 'Intel 64', and 'x64'.
"""
from __future__ import absolute_import
from . import defs
from . import encodings, settings, registers # noqa
from cdsl.isa import TargetISA # noqa
# Re-export the primary target ISA definition.
ISA = defs.ISA.finish() # type: TargetISA

View File

@@ -0,0 +1,28 @@
"""
x86 definitions.
Commonly used definitions.
"""
from __future__ import absolute_import
from cdsl.isa import TargetISA, CPUMode
import base.instructions
from . import instructions as x86
from base.immediates import floatcc
ISA = TargetISA('x86', [base.instructions.GROUP, x86.GROUP]) # type: TargetISA
# CPU modes for 32-bit and 64-bit operation.
X86_64 = CPUMode('I64', ISA)
X86_32 = CPUMode('I32', ISA)
# The set of floating point condition codes that are directly supported.
# Other condition codes need to be reversed or expressed as two tests.
supported_floatccs = [
floatcc.ord,
floatcc.uno,
floatcc.one,
floatcc.ueq,
floatcc.gt,
floatcc.ge,
floatcc.ult,
floatcc.ule]

View File

@@ -0,0 +1,748 @@
"""
x86 Encodings.
"""
from __future__ import absolute_import
from cdsl.predicates import IsZero32BitFloat, IsZero64BitFloat
from cdsl.predicates import IsUnsignedInt, Not, And
from base.predicates import IsColocatedFunc, IsColocatedData, LengthEquals
from base import instructions as base
from base import types
from base.formats import UnaryIeee32, UnaryIeee64, UnaryImm
from base.formats import FuncAddr, Call, LoadComplex, StoreComplex
from .defs import X86_64, X86_32
from . import recipes as r
from . import settings as cfg
from . import instructions as x86
from .legalize import x86_expand
from base.legalize import narrow, widen, expand_flags
from base.settings import allones_funcaddrs, is_pic
from .settings import use_sse41
try:
from typing import TYPE_CHECKING, Any # noqa
if TYPE_CHECKING:
from cdsl.instructions import MaybeBoundInst # noqa
from cdsl.predicates import FieldPredicate # noqa
except ImportError:
pass
X86_32.legalize_monomorphic(expand_flags)
X86_32.legalize_type(
default=narrow,
b1=expand_flags,
i8=widen,
i16=widen,
i32=x86_expand,
f32=x86_expand,
f64=x86_expand)
X86_64.legalize_monomorphic(expand_flags)
X86_64.legalize_type(
default=narrow,
b1=expand_flags,
i8=widen,
i16=widen,
i32=x86_expand,
i64=x86_expand,
f32=x86_expand,
f64=x86_expand)
#
# Helper functions for generating encodings.
#
def enc_x86_64(inst, recipe, *args, **kwargs):
# type: (MaybeBoundInst, r.TailRecipe, *int, **int) -> None
"""
Add encodings for `inst` to X86_64 with and without a REX prefix.
"""
X86_64.enc(inst, *recipe.rex(*args, **kwargs))
X86_64.enc(inst, *recipe(*args, **kwargs))
def enc_x86_64_instp(inst, recipe, instp, *args, **kwargs):
# type: (MaybeBoundInst, r.TailRecipe, FieldPredicate, *int, **int) -> None
"""
Add encodings for `inst` to X86_64 with and without a REX prefix.
"""
X86_64.enc(inst, *recipe.rex(*args, **kwargs), instp=instp)
X86_64.enc(inst, *recipe(*args, **kwargs), instp=instp)
def enc_both(inst, recipe, *args, **kwargs):
# type: (MaybeBoundInst, r.TailRecipe, *int, **Any) -> None
"""
Add encodings for `inst` to both X86_32 and X86_64.
"""
X86_32.enc(inst, *recipe(*args, **kwargs))
enc_x86_64(inst, recipe, *args, **kwargs)
def enc_both_instp(inst, recipe, instp, *args, **kwargs):
# type: (MaybeBoundInst, r.TailRecipe, FieldPredicate, *int, **Any) -> None
"""
Add encodings for `inst` to both X86_32 and X86_64.
"""
X86_32.enc(inst, *recipe(*args, **kwargs), instp=instp)
enc_x86_64_instp(inst, recipe, instp, *args, **kwargs)
def enc_i32_i64(inst, recipe, *args, **kwargs):
# type: (MaybeBoundInst, r.TailRecipe, *int, **int) -> None
"""
Add encodings for `inst.i32` to X86_32.
Add encodings for `inst.i32` to X86_64 with and without REX.
Add encodings for `inst.i64` to X86_64 with a REX.W prefix.
"""
X86_32.enc(inst.i32, *recipe(*args, **kwargs))
# REX-less encoding must come after REX encoding so we don't use it by
# default. Otherwise reg-alloc would never use r8 and up.
X86_64.enc(inst.i32, *recipe.rex(*args, **kwargs))
X86_64.enc(inst.i32, *recipe(*args, **kwargs))
X86_64.enc(inst.i64, *recipe.rex(*args, w=1, **kwargs))
def enc_i32_i64_instp(inst, recipe, instp, *args, **kwargs):
# type: (MaybeBoundInst, r.TailRecipe, FieldPredicate, *int, **int) -> None
"""
Add encodings for `inst.i32` to X86_32.
Add encodings for `inst.i32` to X86_64 with and without REX.
Add encodings for `inst.i64` to X86_64 with a REX.W prefix.
Similar to `enc_i32_i64` but applies `instp` to each encoding.
"""
X86_32.enc(inst.i32, *recipe(*args, **kwargs), instp=instp)
# REX-less encoding must come after REX encoding so we don't use it by
# default. Otherwise reg-alloc would never use r8 and up.
X86_64.enc(inst.i32, *recipe.rex(*args, **kwargs), instp=instp)
X86_64.enc(inst.i32, *recipe(*args, **kwargs), instp=instp)
X86_64.enc(inst.i64, *recipe.rex(*args, w=1, **kwargs), instp=instp)
def enc_i32_i64_ld_st(inst, w_bit, recipe, *args, **kwargs):
# type: (MaybeBoundInst, bool, r.TailRecipe, *int, **int) -> None
"""
Add encodings for `inst.i32` to X86_32.
Add encodings for `inst.i32` to X86_64 with and without REX.
Add encodings for `inst.i64` to X86_64 with a REX prefix, using the `w_bit`
argument to determine whether or not to set the REX.W bit.
"""
X86_32.enc(inst.i32.any, *recipe(*args, **kwargs))
# REX-less encoding must come after REX encoding so we don't use it by
# default. Otherwise reg-alloc would never use r8 and up.
X86_64.enc(inst.i32.any, *recipe.rex(*args, **kwargs))
X86_64.enc(inst.i32.any, *recipe(*args, **kwargs))
if w_bit:
X86_64.enc(inst.i64.any, *recipe.rex(*args, w=1, **kwargs))
else:
X86_64.enc(inst.i64.any, *recipe.rex(*args, **kwargs))
X86_64.enc(inst.i64.any, *recipe(*args, **kwargs))
for inst, opc in [
(base.iadd, 0x01),
(base.isub, 0x29),
(base.band, 0x21),
(base.bor, 0x09),
(base.bxor, 0x31)]:
enc_i32_i64(inst, r.rr, opc)
# x86 has a dedicated bitwise NOT instruction.
enc_i32_i64(base.bnot, r.ur, 0xf7, rrr=2)
# Also add `b1` encodings for the logic instructions.
# TODO: Should this be done with 8-bit instructions? It would improve
# partial register dependencies.
enc_both(base.band.b1, r.rr, 0x21)
enc_both(base.bor.b1, r.rr, 0x09)
enc_both(base.bxor.b1, r.rr, 0x31)
enc_i32_i64(base.imul, r.rrx, 0x0f, 0xaf)
enc_i32_i64(x86.sdivmodx, r.div, 0xf7, rrr=7)
enc_i32_i64(x86.udivmodx, r.div, 0xf7, rrr=6)
enc_i32_i64(x86.smulx, r.mulx, 0xf7, rrr=5)
enc_i32_i64(x86.umulx, r.mulx, 0xf7, rrr=4)
enc_i32_i64(base.copy, r.umr, 0x89)
for ty in [types.b1, types.i8, types.i16]:
enc_both(base.copy.bind(ty), r.umr, 0x89)
# For x86-64, only define REX forms for now, since we can't describe the
# special regunit immediate operands with the current constraint language.
for ty in [types.i8, types.i16, types.i32]:
X86_32.enc(base.regmove.bind(ty), *r.rmov(0x89))
X86_64.enc(base.regmove.bind(ty), *r.rmov.rex(0x89))
X86_64.enc(base.regmove.i64, *r.rmov.rex(0x89, w=1))
enc_both(base.regmove.b1, r.rmov, 0x89)
enc_both(base.regmove.i8, r.rmov, 0x89)
# Immediate instructions with sign-extended 8-bit and 32-bit immediate.
for inst, rrr in [
(base.iadd_imm, 0),
(base.band_imm, 4),
(base.bor_imm, 1),
(base.bxor_imm, 6)]:
enc_i32_i64(inst, r.r_ib, 0x83, rrr=rrr)
enc_i32_i64(inst, r.r_id, 0x81, rrr=rrr)
# TODO: band_imm.i64 with an unsigned 32-bit immediate can be encoded as
# band_imm.i32. Can even use the single-byte immediate for 0xffff_ffXX masks.
# Immediate constants.
X86_32.enc(base.iconst.i32, *r.pu_id(0xb8))
X86_64.enc(base.iconst.i32, *r.pu_id.rex(0xb8))
X86_64.enc(base.iconst.i32, *r.pu_id(0xb8))
# The 32-bit immediate movl also zero-extends to 64 bits.
X86_64.enc(base.iconst.i64, *r.pu_id.rex(0xb8),
instp=IsUnsignedInt(UnaryImm.imm, 32))
X86_64.enc(base.iconst.i64, *r.pu_id(0xb8),
instp=IsUnsignedInt(UnaryImm.imm, 32))
# Sign-extended 32-bit immediate.
X86_64.enc(base.iconst.i64, *r.u_id.rex(0xc7, rrr=0, w=1))
# Finally, the 0xb8 opcode takes an 8-byte immediate with a REX.W prefix.
X86_64.enc(base.iconst.i64, *r.pu_iq.rex(0xb8, w=1))
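# Illustrative summary of the three i64 forms above (hypothetical helper,
# not used by the build):
def _iconst_i64_form(k):
    if 0 <= k < (1 << 32):
        return 'pu_id'   # movl: 32-bit immediate, zero-extended
    if -(1 << 31) <= k < (1 << 31):
        return 'u_id'    # movq: 32-bit immediate, sign-extended
    return 'pu_iq'       # movabsq: full 8-byte immediate
assert _iconst_i64_form(2**32 - 1) == 'pu_id'
assert _iconst_i64_form(-1) == 'u_id'
assert _iconst_i64_form(2**40) == 'pu_iq'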
# bool constants.
enc_both(base.bconst.b1, r.pu_id_bool, 0xb8)
# Shifts and rotates.
# Note that the dynamic shift amount is only masked to 5 or 6 bits; the 8-bit
# and 16-bit shifts would need explicit masking.
for inst, rrr in [
(base.rotl, 0),
(base.rotr, 1),
(base.ishl, 4),
(base.ushr, 5),
(base.sshr, 7)]:
# Cannot use enc_i32_i64 for this pattern because instructions require
# .any suffix.
X86_32.enc(inst.i32.any, *r.rc(0xd3, rrr=rrr))
X86_64.enc(inst.i64.any, *r.rc.rex(0xd3, rrr=rrr, w=1))
X86_64.enc(inst.i32.any, *r.rc.rex(0xd3, rrr=rrr))
X86_64.enc(inst.i32.any, *r.rc(0xd3, rrr=rrr))
for inst, rrr in [
(base.ishl_imm, 4),
(base.ushr_imm, 5),
(base.sshr_imm, 7)]:
enc_i32_i64(inst, r.r_ib, 0xc1, rrr=rrr)
# Population count.
X86_32.enc(base.popcnt.i32, *r.urm(0xf3, 0x0f, 0xb8), isap=cfg.use_popcnt)
X86_64.enc(base.popcnt.i64, *r.urm.rex(0xf3, 0x0f, 0xb8, w=1),
isap=cfg.use_popcnt)
X86_64.enc(base.popcnt.i32, *r.urm.rex(0xf3, 0x0f, 0xb8), isap=cfg.use_popcnt)
X86_64.enc(base.popcnt.i32, *r.urm(0xf3, 0x0f, 0xb8), isap=cfg.use_popcnt)
# Count leading zero bits.
X86_32.enc(base.clz.i32, *r.urm(0xf3, 0x0f, 0xbd), isap=cfg.use_lzcnt)
X86_64.enc(base.clz.i64, *r.urm.rex(0xf3, 0x0f, 0xbd, w=1),
isap=cfg.use_lzcnt)
X86_64.enc(base.clz.i32, *r.urm.rex(0xf3, 0x0f, 0xbd), isap=cfg.use_lzcnt)
X86_64.enc(base.clz.i32, *r.urm(0xf3, 0x0f, 0xbd), isap=cfg.use_lzcnt)
# Count trailing zero bits.
X86_32.enc(base.ctz.i32, *r.urm(0xf3, 0x0f, 0xbc), isap=cfg.use_bmi1)
X86_64.enc(base.ctz.i64, *r.urm.rex(0xf3, 0x0f, 0xbc, w=1),
isap=cfg.use_bmi1)
X86_64.enc(base.ctz.i32, *r.urm.rex(0xf3, 0x0f, 0xbc), isap=cfg.use_bmi1)
X86_64.enc(base.ctz.i32, *r.urm(0xf3, 0x0f, 0xbc), isap=cfg.use_bmi1)
#
# Loads and stores.
#
ldcomplexp = LengthEquals(LoadComplex, 2)
for recipe in [r.ldWithIndex, r.ldWithIndexDisp8, r.ldWithIndexDisp32]:
enc_i32_i64_instp(base.load_complex, recipe, ldcomplexp, 0x8b)
enc_x86_64_instp(base.uload32_complex, recipe, ldcomplexp, 0x8b)
X86_64.enc(base.sload32_complex, *recipe.rex(0x63, w=1),
instp=ldcomplexp)
enc_i32_i64_instp(base.uload16_complex, recipe, ldcomplexp, 0x0f, 0xb7)
enc_i32_i64_instp(base.sload16_complex, recipe, ldcomplexp, 0x0f, 0xbf)
enc_i32_i64_instp(base.uload8_complex, recipe, ldcomplexp, 0x0f, 0xb6)
enc_i32_i64_instp(base.sload8_complex, recipe, ldcomplexp, 0x0f, 0xbe)
stcomplexp = LengthEquals(StoreComplex, 3)
for recipe in [r.stWithIndex, r.stWithIndexDisp8, r.stWithIndexDisp32]:
enc_i32_i64_instp(base.store_complex, recipe, stcomplexp, 0x89)
enc_x86_64_instp(base.istore32_complex, recipe, stcomplexp, 0x89)
enc_both_instp(base.istore16_complex.i32, recipe, stcomplexp, 0x66, 0x89)
enc_x86_64_instp(base.istore16_complex.i64, recipe, stcomplexp, 0x66, 0x89)
for recipe in [r.stWithIndex_abcd,
r.stWithIndexDisp8_abcd,
r.stWithIndexDisp32_abcd]:
enc_both_instp(base.istore8_complex.i32, recipe, stcomplexp, 0x88)
enc_x86_64_instp(base.istore8_complex.i64, recipe, stcomplexp, 0x88)
for recipe in [r.st, r.stDisp8, r.stDisp32]:
enc_i32_i64_ld_st(base.store, True, recipe, 0x89)
enc_x86_64(base.istore32.i64.any, recipe, 0x89)
enc_i32_i64_ld_st(base.istore16, False, recipe, 0x66, 0x89)
# Byte stores are more complicated because the registers they can address
# depend on the presence of a REX prefix. The st*_abcd recipes fall back to
# the corresponding st* recipes when a REX prefix is applied.
for recipe in [r.st_abcd, r.stDisp8_abcd, r.stDisp32_abcd]:
enc_both(base.istore8.i32.any, recipe, 0x88)
enc_x86_64(base.istore8.i64.any, recipe, 0x88)
enc_i32_i64(base.spill, r.spillSib32, 0x89)
enc_i32_i64(base.regspill, r.regspill32, 0x89)
# Use a 32-bit write for spilling `b1`, `i8` and `i16` to avoid
# constraining the permitted registers.
# See MIN_SPILL_SLOT_SIZE which makes this safe.
for ty in [types.b1, types.i8, types.i16]:
enc_both(base.spill.bind(ty), r.spillSib32, 0x89)
enc_both(base.regspill.bind(ty), r.regspill32, 0x89)
for recipe in [r.ld, r.ldDisp8, r.ldDisp32]:
enc_i32_i64_ld_st(base.load, True, recipe, 0x8b)
enc_x86_64(base.uload32.i64, recipe, 0x8b)
X86_64.enc(base.sload32.i64, *recipe.rex(0x63, w=1))
enc_i32_i64_ld_st(base.uload16, True, recipe, 0x0f, 0xb7)
enc_i32_i64_ld_st(base.sload16, True, recipe, 0x0f, 0xbf)
enc_i32_i64_ld_st(base.uload8, True, recipe, 0x0f, 0xb6)
enc_i32_i64_ld_st(base.sload8, True, recipe, 0x0f, 0xbe)
enc_i32_i64(base.fill, r.fillSib32, 0x8b)
enc_i32_i64(base.regfill, r.regfill32, 0x8b)
# Load 32 bits from `b1`, `i8` and `i16` spill slots. See `spill.b1` above.
for ty in [types.b1, types.i8, types.i16]:
enc_both(base.fill.bind(ty), r.fillSib32, 0x8b)
enc_both(base.regfill.bind(ty), r.regfill32, 0x8b)
# Push and Pop
X86_32.enc(x86.push.i32, *r.pushq(0x50))
enc_x86_64(x86.push.i64, r.pushq, 0x50)
X86_32.enc(x86.pop.i32, *r.popq(0x58))
enc_x86_64(x86.pop.i64, r.popq, 0x58)
# Copy Special
# For x86-64, only define REX forms for now, since we can't describe the
# special regunit immediate operands with the current constraint language.
X86_64.enc(base.copy_special, *r.copysp.rex(0x89, w=1))
X86_32.enc(base.copy_special, *r.copysp(0x89))
# Adjust SP down by a dynamic value (or up, with a negative operand).
X86_32.enc(base.adjust_sp_down.i32, *r.adjustsp(0x29))
X86_64.enc(base.adjust_sp_down.i64, *r.adjustsp.rex(0x29, w=1))
# Adjust SP up by an immediate (or down, with a negative immediate)
X86_32.enc(base.adjust_sp_up_imm, *r.adjustsp_ib(0x83))
X86_32.enc(base.adjust_sp_up_imm, *r.adjustsp_id(0x81))
X86_64.enc(base.adjust_sp_up_imm, *r.adjustsp_ib.rex(0x83, w=1))
X86_64.enc(base.adjust_sp_up_imm, *r.adjustsp_id.rex(0x81, w=1))
# Adjust SP down by an immediate (or up, with a negative immediate)
X86_32.enc(base.adjust_sp_down_imm, *r.adjustsp_ib(0x83, rrr=5))
X86_32.enc(base.adjust_sp_down_imm, *r.adjustsp_id(0x81, rrr=5))
X86_64.enc(base.adjust_sp_down_imm, *r.adjustsp_ib.rex(0x83, rrr=5, w=1))
X86_64.enc(base.adjust_sp_down_imm, *r.adjustsp_id.rex(0x81, rrr=5, w=1))
#
# Float loads and stores.
#
enc_both(base.load.f32.any, r.fld, 0xf3, 0x0f, 0x10)
enc_both(base.load.f32.any, r.fldDisp8, 0xf3, 0x0f, 0x10)
enc_both(base.load.f32.any, r.fldDisp32, 0xf3, 0x0f, 0x10)
enc_both(base.load_complex.f32, r.fldWithIndex, 0xf3, 0x0f, 0x10)
enc_both(base.load_complex.f32, r.fldWithIndexDisp8, 0xf3, 0x0f, 0x10)
enc_both(base.load_complex.f32, r.fldWithIndexDisp32, 0xf3, 0x0f, 0x10)
enc_both(base.load.f64.any, r.fld, 0xf2, 0x0f, 0x10)
enc_both(base.load.f64.any, r.fldDisp8, 0xf2, 0x0f, 0x10)
enc_both(base.load.f64.any, r.fldDisp32, 0xf2, 0x0f, 0x10)
enc_both(base.load_complex.f64, r.fldWithIndex, 0xf2, 0x0f, 0x10)
enc_both(base.load_complex.f64, r.fldWithIndexDisp8, 0xf2, 0x0f, 0x10)
enc_both(base.load_complex.f64, r.fldWithIndexDisp32, 0xf2, 0x0f, 0x10)
enc_both(base.store.f32.any, r.fst, 0xf3, 0x0f, 0x11)
enc_both(base.store.f32.any, r.fstDisp8, 0xf3, 0x0f, 0x11)
enc_both(base.store.f32.any, r.fstDisp32, 0xf3, 0x0f, 0x11)
enc_both(base.store_complex.f32, r.fstWithIndex, 0xf3, 0x0f, 0x11)
enc_both(base.store_complex.f32, r.fstWithIndexDisp8, 0xf3, 0x0f, 0x11)
enc_both(base.store_complex.f32, r.fstWithIndexDisp32, 0xf3, 0x0f, 0x11)
enc_both(base.store.f64.any, r.fst, 0xf2, 0x0f, 0x11)
enc_both(base.store.f64.any, r.fstDisp8, 0xf2, 0x0f, 0x11)
enc_both(base.store.f64.any, r.fstDisp32, 0xf2, 0x0f, 0x11)
enc_both(base.store_complex.f64, r.fstWithIndex, 0xf2, 0x0f, 0x11)
enc_both(base.store_complex.f64, r.fstWithIndexDisp8, 0xf2, 0x0f, 0x11)
enc_both(base.store_complex.f64, r.fstWithIndexDisp32, 0xf2, 0x0f, 0x11)
enc_both(base.fill.f32, r.ffillSib32, 0xf3, 0x0f, 0x10)
enc_both(base.regfill.f32, r.fregfill32, 0xf3, 0x0f, 0x10)
enc_both(base.fill.f64, r.ffillSib32, 0xf2, 0x0f, 0x10)
enc_both(base.regfill.f64, r.fregfill32, 0xf2, 0x0f, 0x10)
enc_both(base.spill.f32, r.fspillSib32, 0xf3, 0x0f, 0x11)
enc_both(base.regspill.f32, r.fregspill32, 0xf3, 0x0f, 0x11)
enc_both(base.spill.f64, r.fspillSib32, 0xf2, 0x0f, 0x11)
enc_both(base.regspill.f64, r.fregspill32, 0xf2, 0x0f, 0x11)
#
# Function addresses.
#
# Non-PIC, default funcaddresses.
X86_32.enc(base.func_addr.i32, *r.fnaddr4(0xb8),
isap=And(Not(allones_funcaddrs), Not(is_pic)))
X86_64.enc(base.func_addr.i64, *r.fnaddr8.rex(0xb8, w=1),
isap=And(Not(allones_funcaddrs), Not(is_pic)))
# Non-PIC, all-ones funcaddresses.
X86_32.enc(base.func_addr.i32, *r.allones_fnaddr4(0xb8),
isap=And(allones_funcaddrs, Not(is_pic)))
X86_64.enc(base.func_addr.i64, *r.allones_fnaddr8.rex(0xb8, w=1),
isap=And(allones_funcaddrs, Not(is_pic)))
# 64-bit, colocated, both PIC and non-PIC. Use the lea instruction's
# pc-relative field.
X86_64.enc(base.func_addr.i64, *r.pcrel_fnaddr8.rex(0x8d, w=1),
instp=IsColocatedFunc(FuncAddr.func_ref))
# 64-bit, non-colocated, PIC.
X86_64.enc(base.func_addr.i64, *r.got_fnaddr8.rex(0x8b, w=1),
isap=is_pic)
#
# Global addresses.
#
# Non-PIC
X86_32.enc(base.symbol_value.i32, *r.gvaddr4(0xb8),
isap=Not(is_pic))
X86_64.enc(base.symbol_value.i64, *r.gvaddr8.rex(0xb8, w=1),
isap=Not(is_pic))
# PIC, colocated
X86_64.enc(base.symbol_value.i64, *r.pcrel_gvaddr8.rex(0x8d, w=1),
isap=is_pic,
instp=IsColocatedData())
# PIC, non-colocated
X86_64.enc(base.symbol_value.i64, *r.got_gvaddr8.rex(0x8b, w=1),
isap=is_pic)
#
# Stack addresses.
#
# TODO: Add encoding rules for stack_load and stack_store, so that they
# don't get legalized to stack_addr + load/store.
#
X86_32.enc(base.stack_addr.i32, *r.spaddr4_id(0x8d))
X86_64.enc(base.stack_addr.i64, *r.spaddr8_id.rex(0x8d, w=1))
#
# Call/return
#
# 32-bit, both PIC and non-PIC.
X86_32.enc(base.call, *r.call_id(0xe8))
# 64-bit, colocated, both PIC and non-PIC. Use the call instruction's
# pc-relative field.
X86_64.enc(base.call, *r.call_id(0xe8),
instp=IsColocatedFunc(Call.func_ref))
# 64-bit, non-colocated, PIC. There is no 64-bit non-colocated non-PIC version,
# since non-PIC currently uses the large model, which requires calls to be
# lowered to func_addr+call_indirect.
X86_64.enc(base.call, *r.call_plt_id(0xe8), isap=is_pic)
X86_32.enc(base.call_indirect.i32, *r.call_r(0xff, rrr=2))
X86_64.enc(base.call_indirect.i64, *r.call_r.rex(0xff, rrr=2))
X86_64.enc(base.call_indirect.i64, *r.call_r(0xff, rrr=2))
X86_32.enc(base.x_return, *r.ret(0xc3))
X86_64.enc(base.x_return, *r.ret(0xc3))
#
# Branches
#
enc_both(base.jump, r.jmpb, 0xeb)
enc_both(base.jump, r.jmpd, 0xe9)
enc_both(base.brif, r.brib, 0x70)
enc_both(base.brif, r.brid, 0x0f, 0x80)
# Not all float condition codes are legal, see `supported_floatccs`.
enc_both(base.brff, r.brfb, 0x70)
enc_both(base.brff, r.brfd, 0x0f, 0x80)
# Note that the tjccd opcode will be prefixed with 0x0f.
enc_i32_i64(base.brz, r.tjccb, 0x74)
enc_i32_i64(base.brz, r.tjccd, 0x84)
enc_i32_i64(base.brnz, r.tjccb, 0x75)
enc_i32_i64(base.brnz, r.tjccd, 0x85)
# Branch on a b1 value in a register only looks at the low 8 bits. See also
# bint encodings below.
#
# Start with the worst-case encoding for X86_32 only. The register allocator
# can't handle a branch with an ABCD-constrained operand.
X86_32.enc(base.brz.b1, *r.t8jccd_long(0x84))
X86_32.enc(base.brnz.b1, *r.t8jccd_long(0x85))
enc_both(base.brz.b1, r.t8jccb_abcd, 0x74)
enc_both(base.brz.b1, r.t8jccd_abcd, 0x84)
enc_both(base.brnz.b1, r.t8jccb_abcd, 0x75)
enc_both(base.brnz.b1, r.t8jccd_abcd, 0x85)
#
# Jump tables
#
X86_64.enc(base.jump_table_entry.i64.any.any, *r.jt_entry.rex(0x63, w=1))
X86_32.enc(base.jump_table_entry.i32.any.any, *r.jt_entry(0x8b))
X86_64.enc(base.jump_table_base.i64, *r.jt_base.rex(0x8d, w=1))
X86_32.enc(base.jump_table_base.i32, *r.jt_base(0x8d))
enc_x86_64(base.indirect_jump_table_br.i64, r.indirect_jmp, 0xff, rrr=4)
X86_32.enc(base.indirect_jump_table_br.i32, *r.indirect_jmp(0xff, rrr=4))
#
# Trap as ud2
#
X86_32.enc(base.trap, *r.trap(0x0f, 0x0b))
X86_64.enc(base.trap, *r.trap(0x0f, 0x0b))
# Debug trap as int3
X86_32.enc(base.debugtrap, r.debugtrap, 0)
X86_64.enc(base.debugtrap, r.debugtrap, 0)
# Using a standard EncRecipe, not the TailRecipe.
X86_32.enc(base.trapif, r.trapif, 0)
X86_64.enc(base.trapif, r.trapif, 0)
X86_32.enc(base.trapff, r.trapff, 0)
X86_64.enc(base.trapff, r.trapff, 0)
#
# Comparisons
#
enc_i32_i64(base.icmp, r.icscc, 0x39)
enc_i32_i64(base.icmp_imm, r.icscc_ib, 0x83, rrr=7)
enc_i32_i64(base.icmp_imm, r.icscc_id, 0x81, rrr=7)
enc_i32_i64(base.ifcmp, r.rcmp, 0x39)
enc_i32_i64(base.ifcmp_imm, r.rcmp_ib, 0x83, rrr=7)
enc_i32_i64(base.ifcmp_imm, r.rcmp_id, 0x81, rrr=7)
# TODO: We could special-case ifcmp_imm(x, 0) to TEST(x, x).
X86_32.enc(base.ifcmp_sp.i32, *r.rcmp_sp(0x39))
X86_64.enc(base.ifcmp_sp.i64, *r.rcmp_sp.rex(0x39, w=1))
#
# Convert flags to bool.
#
# This encodes `b1` as an 8-bit low register with the value 0 or 1.
enc_both(base.trueif, r.seti_abcd, 0x0f, 0x90)
enc_both(base.trueff, r.setf_abcd, 0x0f, 0x90)
#
# Conditional move (a.k.a. integer select)
#
enc_i32_i64(base.selectif, r.cmov, 0x0F, 0x40)
#
# Bit scan forwards and reverse
#
enc_i32_i64(x86.bsf, r.bsf_and_bsr, 0x0F, 0xBC)
enc_i32_i64(x86.bsr, r.bsf_and_bsr, 0x0F, 0xBD)
#
# Convert bool to int.
#
# This assumes that b1 is represented as an 8-bit low register with the value 0
# or 1.
#
# Encode movzbq as movzbl, because it's equivalent and shorter.
X86_32.enc(base.bint.i32.b1, *r.urm_noflags_abcd(0x0f, 0xb6))
X86_64.enc(base.bint.i64.b1, *r.urm_noflags.rex(0x0f, 0xb6))
X86_64.enc(base.bint.i64.b1, *r.urm_noflags_abcd(0x0f, 0xb6))
X86_64.enc(base.bint.i32.b1, *r.urm_noflags.rex(0x0f, 0xb6))
X86_64.enc(base.bint.i32.b1, *r.urm_noflags_abcd(0x0f, 0xb6))
# Numerical conversions.
# Reducing an integer is a no-op.
X86_32.enc(base.ireduce.i8.i16, r.null, 0)
X86_32.enc(base.ireduce.i8.i32, r.null, 0)
X86_32.enc(base.ireduce.i16.i32, r.null, 0)
X86_64.enc(base.ireduce.i8.i16, r.null, 0)
X86_64.enc(base.ireduce.i8.i32, r.null, 0)
X86_64.enc(base.ireduce.i16.i32, r.null, 0)
X86_64.enc(base.ireduce.i8.i64, r.null, 0)
X86_64.enc(base.ireduce.i16.i64, r.null, 0)
X86_64.enc(base.ireduce.i32.i64, r.null, 0)
# TODO: Add encodings for cbw, cwde, cdqe, which are sign-extending
# instructions for %al/%ax/%eax to %ax/%eax/%rax.
# movsbl
X86_32.enc(base.sextend.i32.i8, *r.urm_noflags_abcd(0x0f, 0xbe))
X86_64.enc(base.sextend.i32.i8, *r.urm_noflags.rex(0x0f, 0xbe))
X86_64.enc(base.sextend.i32.i8, *r.urm_noflags_abcd(0x0f, 0xbe))
# movswl
X86_32.enc(base.sextend.i32.i16, *r.urm_noflags(0x0f, 0xbf))
X86_64.enc(base.sextend.i32.i16, *r.urm_noflags.rex(0x0f, 0xbf))
X86_64.enc(base.sextend.i32.i16, *r.urm_noflags(0x0f, 0xbf))
# movsbq
X86_64.enc(base.sextend.i64.i8, *r.urm_noflags.rex(0x0f, 0xbe, w=1))
# movswq
X86_64.enc(base.sextend.i64.i16, *r.urm_noflags.rex(0x0f, 0xbf, w=1))
# movslq
X86_64.enc(base.sextend.i64.i32, *r.urm_noflags.rex(0x63, w=1))
# movzbl
X86_32.enc(base.uextend.i32.i8, *r.urm_noflags_abcd(0x0f, 0xb6))
X86_64.enc(base.uextend.i32.i8, *r.urm_noflags.rex(0x0f, 0xb6))
X86_64.enc(base.uextend.i32.i8, *r.urm_noflags_abcd(0x0f, 0xb6))
# movzwl
X86_32.enc(base.uextend.i32.i16, *r.urm_noflags(0x0f, 0xb7))
X86_64.enc(base.uextend.i32.i16, *r.urm_noflags.rex(0x0f, 0xb7))
X86_64.enc(base.uextend.i32.i16, *r.urm_noflags(0x0f, 0xb7))
# movzbq, encoded as movzbl because it's equivalent and shorter
X86_64.enc(base.uextend.i64.i8, *r.urm_noflags.rex(0x0f, 0xb6))
X86_64.enc(base.uextend.i64.i8, *r.urm_noflags(0x0f, 0xb6))
# movzwq, encoded as movzwl because it's equivalent and shorter
X86_64.enc(base.uextend.i64.i16, *r.urm_noflags.rex(0x0f, 0xb7))
X86_64.enc(base.uextend.i64.i16, *r.urm_noflags(0x0f, 0xb7))
# A 32-bit register copy clears the high 32 bits.
X86_64.enc(base.uextend.i64.i32, *r.umr.rex(0x89))
X86_64.enc(base.uextend.i64.i32, *r.umr(0x89))
#
# Floating point
#
# Floating-point constants equal to 0.0 can be encoded using either
# `xorps` or `xorpd`, for 32-bit and 64-bit floats respectively.
X86_32.enc(base.f32const, *r.f32imm_z(0x0f, 0x57),
instp=IsZero32BitFloat(UnaryIeee32.imm))
X86_32.enc(base.f64const, *r.f64imm_z(0x66, 0x0f, 0x57),
instp=IsZero64BitFloat(UnaryIeee64.imm))
enc_x86_64_instp(base.f32const, r.f32imm_z,
IsZero32BitFloat(UnaryIeee32.imm), 0x0f, 0x57)
enc_x86_64_instp(base.f64const, r.f64imm_z,
IsZero64BitFloat(UnaryIeee64.imm), 0x66, 0x0f, 0x57)
# movd
enc_both(base.bitcast.f32.i32, r.frurm, 0x66, 0x0f, 0x6e)
enc_both(base.bitcast.i32.f32, r.rfumr, 0x66, 0x0f, 0x7e)
# movq
X86_64.enc(base.bitcast.f64.i64, *r.frurm.rex(0x66, 0x0f, 0x6e, w=1))
X86_64.enc(base.bitcast.i64.f64, *r.rfumr.rex(0x66, 0x0f, 0x7e, w=1))
# movaps
enc_both(base.copy.f32, r.furm, 0x0f, 0x28)
enc_both(base.copy.f64, r.furm, 0x0f, 0x28)
# For x86-64, only define REX forms for now, since we can't describe the
# special regunit immediate operands with the current constraint language.
X86_32.enc(base.regmove.f32, *r.frmov(0x0f, 0x28))
X86_64.enc(base.regmove.f32, *r.frmov.rex(0x0f, 0x28))
# For x86-64, only define REX forms for now, since we can't describe the
# special regunit immediate operands with the current constraint language.
X86_32.enc(base.regmove.f64, *r.frmov(0x0f, 0x28))
X86_64.enc(base.regmove.f64, *r.frmov.rex(0x0f, 0x28))
# cvtsi2ss
enc_i32_i64(base.fcvt_from_sint.f32, r.frurm, 0xf3, 0x0f, 0x2a)
# cvtsi2sd
enc_i32_i64(base.fcvt_from_sint.f64, r.frurm, 0xf2, 0x0f, 0x2a)
# cvtss2sd
enc_both(base.fpromote.f64.f32, r.furm, 0xf3, 0x0f, 0x5a)
# cvtsd2ss
enc_both(base.fdemote.f32.f64, r.furm, 0xf2, 0x0f, 0x5a)
# cvttss2si
enc_both(x86.cvtt2si.i32.f32, r.rfurm, 0xf3, 0x0f, 0x2c)
X86_64.enc(x86.cvtt2si.i64.f32, *r.rfurm.rex(0xf3, 0x0f, 0x2c, w=1))
# cvttsd2si
enc_both(x86.cvtt2si.i32.f64, r.rfurm, 0xf2, 0x0f, 0x2c)
X86_64.enc(x86.cvtt2si.i64.f64, *r.rfurm.rex(0xf2, 0x0f, 0x2c, w=1))
# Exact square roots.
enc_both(base.sqrt.f32, r.furm, 0xf3, 0x0f, 0x51)
enc_both(base.sqrt.f64, r.furm, 0xf2, 0x0f, 0x51)
# Rounding. The recipe looks at the opcode to pick an immediate.
for inst in [
base.nearest,
base.floor,
base.ceil,
base.trunc]:
enc_both(inst.f32, r.furmi_rnd, 0x66, 0x0f, 0x3a, 0x0a, isap=use_sse41)
enc_both(inst.f64, r.furmi_rnd, 0x66, 0x0f, 0x3a, 0x0b, isap=use_sse41)
# Binary arithmetic ops.
for inst, opc in [
(base.fadd, 0x58),
(base.fsub, 0x5c),
(base.fmul, 0x59),
(base.fdiv, 0x5e),
(x86.fmin, 0x5d),
(x86.fmax, 0x5f)]:
enc_both(inst.f32, r.fa, 0xf3, 0x0f, opc)
enc_both(inst.f64, r.fa, 0xf2, 0x0f, opc)
# Binary bitwise ops.
for inst, opc in [
(base.band, 0x54),
(base.bor, 0x56),
(base.bxor, 0x57)]:
enc_both(inst.f32, r.fa, 0x0f, opc)
enc_both(inst.f64, r.fa, 0x0f, opc)
# The `andnps(x, y)` instruction computes `~x & y`, while `band_not(x, y)` is `x & ~y`.
enc_both(base.band_not.f32, r.fax, 0x0f, 0x55)
enc_both(base.band_not.f64, r.fax, 0x0f, 0x55)
# Comparisons.
#
# This only covers the condition codes in `supported_floatccs`, the rest are
# handled by legalization patterns.
enc_both(base.fcmp.f32, r.fcscc, 0x0f, 0x2e)
enc_both(base.fcmp.f64, r.fcscc, 0x66, 0x0f, 0x2e)
enc_both(base.ffcmp.f32, r.fcmp, 0x0f, 0x2e)
enc_both(base.ffcmp.f64, r.fcmp, 0x66, 0x0f, 0x2e)

View File

@@ -0,0 +1,173 @@
"""
Supplementary instruction definitions for x86.
This module defines additional instructions that are useful only to the x86
target ISA.
"""
from base.types import iflags
from cdsl.operands import Operand
from cdsl.typevar import TypeVar
from cdsl.instructions import Instruction, InstructionGroup
GROUP = InstructionGroup("x86", "x86-specific instruction set")
iWord = TypeVar('iWord', 'A scalar integer machine word', ints=(32, 64))
nlo = Operand('nlo', iWord, doc='Low part of numerator')
nhi = Operand('nhi', iWord, doc='High part of numerator')
d = Operand('d', iWord, doc='Denominator')
q = Operand('q', iWord, doc='Quotient')
r = Operand('r', iWord, doc='Remainder')
udivmodx = Instruction(
'x86_udivmodx', r"""
Extended unsigned division.
Concatenate the bits in `nhi` and `nlo` to form the numerator.
Interpret the bits as an unsigned number and divide by the unsigned
denominator `d`. Trap when `d` is zero or if the quotient is larger
than the range of the output.
Return both quotient and remainder.
""",
ins=(nlo, nhi, d), outs=(q, r), can_trap=True)
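# Illustrative semantics for the i64 case: the numerator is the 128-bit
# concatenation nhi:nlo, and the instruction traps if the quotient does
# not fit in 64 bits (values here are hypothetical).
_nlo, _nhi, _d = 5, 1, 16
_q, _r = divmod((_nhi << 64) | _nlo, _d)
assert (_q, _r) == (1 << 60, 5) and _q < (1 << 64)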
sdivmodx = Instruction(
'x86_sdivmodx', r"""
Extended signed division.
Concatenate the bits in `nhi` and `nlo` to form the numerator.
Interpret the bits as a signed number and divide by the signed
denominator `d`. Trap when `d` is zero or if the quotient is outside
the range of the output.
Return both quotient and remainder.
""",
ins=(nlo, nhi, d), outs=(q, r), can_trap=True)
argL = Operand('argL', iWord)
argR = Operand('argR', iWord)
resLo = Operand('resLo', iWord)
resHi = Operand('resHi', iWord)
umulx = Instruction(
'x86_umulx', r"""
Unsigned integer multiplication, producing a double-length result.
Polymorphic over all scalar integer types, but does not support vector
types.
""",
ins=(argL, argR), outs=(resLo, resHi))
smulx = Instruction(
'x86_smulx', r"""
Signed integer multiplication, producing a double-length result.
Polymorphic over all scalar integer types, but does not support vector
types.
""",
ins=(argL, argR), outs=(resLo, resHi))
Float = TypeVar(
'Float', 'A scalar or vector floating point number',
floats=True, simd=True)
IntTo = TypeVar(
'IntTo', 'An integer type with the same number of lanes',
ints=(32, 64), simd=True)
x = Operand('x', Float)
a = Operand('a', IntTo)
cvtt2si = Instruction(
'x86_cvtt2si', r"""
Convert with truncation floating point to signed integer.
The source floating point operand is converted to a signed integer by
rounding towards zero. If the result can't be represented in the output
type, returns the smallest signed value the output type can represent.
This instruction does not trap.
""",
ins=x, outs=a)
x = Operand('x', Float)
a = Operand('a', Float)
y = Operand('y', Float)
fmin = Instruction(
'x86_fmin', r"""
Floating point minimum with x86 semantics.
This is equivalent to the C ternary operator `x < y ? x : y` which
differs from :inst:`fmin` when either operand is NaN or when comparing
+0.0 to -0.0.
When the two operands don't compare as LT, `y` is returned unchanged,
even if it is a signalling NaN.
""",
ins=(x, y), outs=a)
fmax = Instruction(
'x86_fmax', r"""
Floating point maximum with x86 semantics.
This is equivalent to the C ternary operator `x > y ? x : y` which
differs from :inst:`fmax` when either operand is NaN or when comparing
+0.0 to -0.0.
When the two operands don't compare as GT, `y` is returned unchanged,
even if it is a signalling NaN.
""",
ins=(x, y), outs=a)
x = Operand('x', iWord)
push = Instruction(
'x86_push', r"""
Pushes a value onto the stack.
Decrements the stack pointer and stores the specified value on to the top.
This is polymorphic in i32 and i64. However, it is only implemented for i64
in 64-bit mode, and only for i32 in 32-bit mode.
""",
ins=x, can_store=True, other_side_effects=True)
pop = Instruction(
'x86_pop', r"""
Pops a value from the stack.
Loads a value from the top of the stack and then increments the stack
pointer.
This is polymorphic in i32 and i64. However, it is only implemented for i64
in 64-bit mode, and only for i32 in 32-bit mode.
""",
outs=x, can_load=True, other_side_effects=True)
y = Operand('y', iWord)
rflags = Operand('rflags', iflags)
bsr = Instruction(
'x86_bsr', r"""
Bit Scan Reverse -- returns the bit-index of the most significant 1
in the word. Result is undefined if the argument is zero. However, it
sets the Z flag depending on the argument, so it is at least easy to
detect and handle that case.
This is polymorphic in i32 and i64. It is implemented for both i64 and
i32 in 64-bit mode, and only for i32 in 32-bit mode.
""",
ins=x, outs=(y, rflags))
bsf = Instruction(
'x86_bsf', r"""
Bit Scan Forwards -- returns the bit-index of the least significant 1
in the word. Is otherwise identical to 'bsr', just above.
""",
ins=x, outs=(y, rflags))
GROUP.close()

View File

@@ -0,0 +1,229 @@
"""
Custom legalization patterns for x86.
"""
from __future__ import absolute_import
from cdsl.ast import Var
from cdsl.xform import Rtl, XFormGroup
from base.immediates import imm64, intcc, floatcc
from base import legalize as shared
from base import instructions as insts
from . import instructions as x86
from .defs import ISA
x86_expand = XFormGroup(
'x86_expand',
"""
Legalize instructions by expansion.
Use x86-specific instructions if needed.
""",
isa=ISA, chain=shared.expand_flags)
a = Var('a')
dead = Var('dead')
x = Var('x')
xhi = Var('xhi')
y = Var('y')
a1 = Var('a1')
a2 = Var('a2')
#
# Division and remainder.
#
# The srem expansion requires custom code because `srem INT_MIN, -1` is not
# allowed to trap. The other ops need to check `avoid_div_traps`.
x86_expand.custom_legalize(insts.sdiv, 'expand_sdivrem')
x86_expand.custom_legalize(insts.srem, 'expand_sdivrem')
x86_expand.custom_legalize(insts.udiv, 'expand_udivrem')
x86_expand.custom_legalize(insts.urem, 'expand_udivrem')
#
# Double length (widening) multiplication
#
resLo = Var('resLo')
resHi = Var('resHi')
x86_expand.legalize(
resHi << insts.umulhi(x, y),
Rtl(
(resLo, resHi) << x86.umulx(x, y)
))
x86_expand.legalize(
resHi << insts.smulhi(x, y),
Rtl(
(resLo, resHi) << x86.smulx(x, y)
))
# Floating point condition codes.
#
# The 8 condition codes in `supported_floatccs` are directly supported by a
# `ucomiss` or `ucomisd` instruction. The remaining codes need legalization
# patterns.
# Equality needs an explicit `ord` test which checks the parity bit.
x86_expand.legalize(
a << insts.fcmp(floatcc.eq, x, y),
Rtl(
a1 << insts.fcmp(floatcc.ord, x, y),
a2 << insts.fcmp(floatcc.ueq, x, y),
a << insts.band(a1, a2)
))
x86_expand.legalize(
a << insts.fcmp(floatcc.ne, x, y),
Rtl(
a1 << insts.fcmp(floatcc.uno, x, y),
a2 << insts.fcmp(floatcc.one, x, y),
a << insts.bor(a1, a2)
))
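# Illustrative check of the two expansions above: `eq` must be ordered-and-
# equal, hence ord AND ueq; `ne` is unordered-or-unequal, hence uno OR one.
def _fcmp_eq(x, y):
    # Hypothetical model for illustration only.
    import math
    unordered = math.isnan(x) or math.isnan(y)
    return (not unordered) and (unordered or x == y)
assert _fcmp_eq(2.0, 2.0) and not _fcmp_eq(float('nan'), 2.0)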
# Inequalities that need to be reversed.
for cc, rev_cc in [
(floatcc.lt, floatcc.gt),
(floatcc.le, floatcc.ge),
(floatcc.ugt, floatcc.ult),
(floatcc.uge, floatcc.ule)]:
x86_expand.legalize(
a << insts.fcmp(cc, x, y),
Rtl(
a << insts.fcmp(rev_cc, y, x)
))
# We need to modify the CFG for min/max legalization.
x86_expand.custom_legalize(insts.fmin, 'expand_minmax')
x86_expand.custom_legalize(insts.fmax, 'expand_minmax')
# Conversions from unsigned need special handling.
x86_expand.custom_legalize(insts.fcvt_from_uint, 'expand_fcvt_from_uint')
# Conversions from float to int can trap and modify the control flow graph.
x86_expand.custom_legalize(insts.fcvt_to_sint, 'expand_fcvt_to_sint')
x86_expand.custom_legalize(insts.fcvt_to_uint, 'expand_fcvt_to_uint')
x86_expand.custom_legalize(insts.fcvt_to_sint_sat, 'expand_fcvt_to_sint_sat')
x86_expand.custom_legalize(insts.fcvt_to_uint_sat, 'expand_fcvt_to_uint_sat')
# Count leading and trailing zeroes, for baseline x86_64
c_minus_one = Var('c_minus_one')
c_thirty_one = Var('c_thirty_one')
c_thirty_two = Var('c_thirty_two')
c_sixty_three = Var('c_sixty_three')
c_sixty_four = Var('c_sixty_four')
index1 = Var('index1')
r2flags = Var('r2flags')
index2 = Var('index2')
x86_expand.legalize(
a << insts.clz.i64(x),
Rtl(
c_minus_one << insts.iconst(imm64(-1)),
c_sixty_three << insts.iconst(imm64(63)),
(index1, r2flags) << x86.bsr(x),
index2 << insts.selectif(intcc.eq, r2flags, c_minus_one, index1),
a << insts.isub(c_sixty_three, index2),
))
x86_expand.legalize(
a << insts.clz.i32(x),
Rtl(
c_minus_one << insts.iconst(imm64(-1)),
c_thirty_one << insts.iconst(imm64(31)),
(index1, r2flags) << x86.bsr(x),
index2 << insts.selectif(intcc.eq, r2flags, c_minus_one, index1),
a << insts.isub(c_thirty_one, index2),
))
x86_expand.legalize(
a << insts.ctz.i64(x),
Rtl(
c_sixty_four << insts.iconst(imm64(64)),
(index1, r2flags) << x86.bsf(x),
a << insts.selectif(intcc.eq, r2flags, c_sixty_four, index1),
))
x86_expand.legalize(
a << insts.ctz.i32(x),
Rtl(
c_thirty_two << insts.iconst(imm64(32)),
(index1, r2flags) << x86.bsf(x),
a << insts.selectif(intcc.eq, r2flags, c_thirty_two, index1),
))
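# Illustrative mirror of the four expansions above (i64 shown): bsr/bsf
# leave their result undefined for a zero input but set ZF, which the
# selectif consumes to substitute the sentinel constant.
def _clz64(x):
    index = x.bit_length() - 1 if x else -1   # bsr result; -1 chosen on ZF
    return 63 - index
def _ctz64(x):
    return (x & -x).bit_length() - 1 if x else 64   # bsf; 64 chosen on ZF
assert _clz64(0) == 64 and _clz64(1 << 63) == 0
assert _ctz64(0) == 64 and _ctz64(8) == 3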
# Population count for baseline x86_64
qv1 = Var('qv1')
qv3 = Var('qv3')
qv4 = Var('qv4')
qv5 = Var('qv5')
qv6 = Var('qv6')
qv7 = Var('qv7')
qv8 = Var('qv8')
qv9 = Var('qv9')
qv10 = Var('qv10')
qv11 = Var('qv11')
qv12 = Var('qv12')
qv13 = Var('qv13')
qv14 = Var('qv14')
qv15 = Var('qv15')
qv16 = Var('qv16')
qc77 = Var('qc77')
qc0F = Var('qc0F')
qc01 = Var('qc01')
x86_expand.legalize(
qv16 << insts.popcnt.i64(qv1),
Rtl(
qv3 << insts.ushr_imm(qv1, imm64(1)),
qc77 << insts.iconst(imm64(0x7777777777777777)),
qv4 << insts.band(qv3, qc77),
qv5 << insts.isub(qv1, qv4),
qv6 << insts.ushr_imm(qv4, imm64(1)),
qv7 << insts.band(qv6, qc77),
qv8 << insts.isub(qv5, qv7),
qv9 << insts.ushr_imm(qv7, imm64(1)),
qv10 << insts.band(qv9, qc77),
qv11 << insts.isub(qv8, qv10),
qv12 << insts.ushr_imm(qv11, imm64(4)),
qv13 << insts.iadd(qv11, qv12),
qc0F << insts.iconst(imm64(0x0F0F0F0F0F0F0F0F)),
qv14 << insts.band(qv13, qc0F),
qc01 << insts.iconst(imm64(0x0101010101010101)),
qv15 << insts.imul(qv14, qc01),
qv16 << insts.ushr_imm(qv15, imm64(56))
))
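# Illustrative pure-Python mirror of the qv* expansion above: the classic
# three-subtraction nibble popcount, with the multiply masked to 64 bits to
# model the wrapping `imul`.
def _popcnt64(x):
    m = 0x7777777777777777
    t = (x >> 1) & m
    x -= t
    t = (t >> 1) & m
    x -= t
    t = (t >> 1) & m
    x -= t
    x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0f
    return ((x * 0x0101010101010101) & (2**64 - 1)) >> 56
assert _popcnt64(0) == 0 and _popcnt64(2**64 - 1) == 64
assert _popcnt64(0xdeadbeef) == bin(0xdeadbeef).count('1')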
lv1 = Var('lv1')
lv3 = Var('lv3')
lv4 = Var('lv4')
lv5 = Var('lv5')
lv6 = Var('lv6')
lv7 = Var('lv7')
lv8 = Var('lv8')
lv9 = Var('lv9')
lv10 = Var('lv10')
lv11 = Var('lv11')
lv12 = Var('lv12')
lv13 = Var('lv13')
lv14 = Var('lv14')
lv15 = Var('lv15')
lv16 = Var('lv16')
lc77 = Var('lc77')
lc0F = Var('lc0F')
lc01 = Var('lc01')
x86_expand.legalize(
lv16 << insts.popcnt.i32(lv1),
Rtl(
lv3 << insts.ushr_imm(lv1, imm64(1)),
lc77 << insts.iconst(imm64(0x77777777)),
lv4 << insts.band(lv3, lc77),
lv5 << insts.isub(lv1, lv4),
lv6 << insts.ushr_imm(lv4, imm64(1)),
lv7 << insts.band(lv6, lc77),
lv8 << insts.isub(lv5, lv7),
lv9 << insts.ushr_imm(lv7, imm64(1)),
lv10 << insts.band(lv9, lc77),
lv11 << insts.isub(lv8, lv10),
lv12 << insts.ushr_imm(lv11, imm64(4)),
lv13 << insts.iadd(lv11, lv12),
lc0F << insts.iconst(imm64(0x0F0F0F0F)),
lv14 << insts.band(lv13, lc0F),
lc01 << insts.iconst(imm64(0x01010101)),
lv15 << insts.imul(lv14, lc01),
lv16 << insts.ushr_imm(lv15, imm64(24))
))

File diff suppressed because it is too large

View File

@@ -0,0 +1,61 @@
"""
x86 register banks.
While the floating-point registers are straightforward, the general purpose
register bank has a few quirks on x86. We have these encodings of the 8-bit
registers:
     I32  I64  |  16b  32b  64b
000  AL   AL   |  AX   EAX  RAX
001  CL   CL   |  CX   ECX  RCX
010  DL   DL   |  DX   EDX  RDX
011  BL   BL   |  BX   EBX  RBX
100  AH   SPL  |  SP   ESP  RSP
101  CH   BPL  |  BP   EBP  RBP
110  DH   SIL  |  SI   ESI  RSI
111  BH   DIL  |  DI   EDI  RDI
Here, the I64 column refers to the registers you get with a REX prefix. Without
the REX prefix, you get the I32 registers.
The 8-bit registers are not that useful since WebAssembly only has i32 and i64
data types, and the H-registers even less so. Rather than trying to model the
H-registers accurately, we'll avoid using them in both I32 and I64 modes.
"""
from __future__ import absolute_import
from cdsl.registers import RegBank, RegClass, Stack
from .defs import ISA
IntRegs = RegBank(
'IntRegs', ISA,
'General purpose registers',
units=16, prefix='r',
names='rax rcx rdx rbx rsp rbp rsi rdi'.split())
FloatRegs = RegBank(
'FloatRegs', ISA,
'SSE floating point registers',
units=16, prefix='xmm')
FlagRegs = RegBank(
'FlagRegs', ISA,
'Flag registers',
units=1,
pressure_tracking=False,
names=['rflags'])
GPR = RegClass(IntRegs)
GPR8 = GPR[0:8]    # rax-rdi: addressable without a REX prefix.
ABCD = GPR[0:4]    # rax/rcx/rdx/rbx: have 8-bit low halves without REX.
FPR = RegClass(FloatRegs)
FPR8 = FPR[0:8]    # xmm0-xmm7: addressable without a REX prefix.
FLAG = RegClass(FlagRegs)
# Constraints for stack operands.
# Stack operand with a 32-bit signed displacement from either RBP or RSP.
StackGPR32 = Stack(GPR)
StackFPR32 = Stack(FPR)
RegClass.extract_names(globals())

View File

@@ -0,0 +1,54 @@
"""
x86 settings.
"""
from __future__ import absolute_import
from cdsl.settings import SettingGroup, BoolSetting, Preset
from cdsl.predicates import And
import base.settings as shared
from .defs import ISA
ISA.settings = SettingGroup('x86', parent=shared.group)
# The has_* settings here correspond to CPUID bits.
# CPUID.01H:ECX
has_sse3 = BoolSetting("SSE3: CPUID.01H:ECX.SSE3[bit 0]")
has_ssse3 = BoolSetting("SSSE3: CPUID.01H:ECX.SSSE3[bit 9]")
has_sse41 = BoolSetting("SSE4.1: CPUID.01H:ECX.SSE4_1[bit 19]")
has_sse42 = BoolSetting("SSE4.2: CPUID.01H:ECX.SSE4_2[bit 20]")
has_popcnt = BoolSetting("POPCNT: CPUID.01H:ECX.POPCNT[bit 23]")
has_avx = BoolSetting("AVX: CPUID.01H:ECX.AVX[bit 28]")
# CPUID.(EAX=07H, ECX=0H):EBX
has_bmi1 = BoolSetting("BMI1: CPUID.(EAX=07H, ECX=0H):EBX.BMI1[bit 3]")
has_bmi2 = BoolSetting("BMI2: CPUID.(EAX=07H, ECX=0H):EBX.BMI2[bit 8]")
# CPUID.EAX=80000001H:ECX
has_lzcnt = BoolSetting("LZCNT: CPUID.EAX=80000001H:ECX.LZCNT[bit 5]")
# The use_* settings determine whether a feature can be used.
use_sse41 = And(has_sse41)
use_sse42 = And(has_sse42, use_sse41)
use_popcnt = And(has_popcnt, has_sse42)
use_bmi1 = And(has_bmi1)
use_lzcnt = And(has_lzcnt)
# Presets corresponding to x86 CPUs.
baseline = Preset()
nehalem = Preset(
has_sse3, has_ssse3, has_sse41, has_sse42, has_popcnt)
haswell = Preset(nehalem, has_bmi1, has_bmi2, has_lzcnt)
broadwell = Preset(haswell)
skylake = Preset(broadwell)
cannonlake = Preset(skylake)
icelake = Preset(cannonlake)
znver1 = Preset(
has_sse3, has_ssse3, has_sse41, has_sse42, has_popcnt,
has_bmi1, has_bmi2, has_lzcnt)
ISA.settings.close(globals())