moved crates in lib/ to src/, renamed crates, modified some files' text (#660)
Authored by lazypassion on 2019-01-28 18:56:54 -05:00; committed by Dan Gohman
parent 54959cf5bb
commit 747ad3c4c5
508 changed files with 94 additions and 92 deletions


@@ -0,0 +1,22 @@
"""
x86 Target Architecture
-----------------------
This target ISA generates code for x86 CPUs with two separate CPU modes:
`I32`
32-bit x86 architecture, also known as `IA-32` and sometimes referred
to as `i386`. Note, however, that Cranelift depends on instructions not
in the original i386, such as SSE2, CMOVcc, and UD2.
`I64`
x86-64 architecture, also known as `AMD64`, `Intel 64`, and `x64`.
"""
from __future__ import absolute_import
from . import defs
from . import encodings, settings, registers # noqa
from cdsl.isa import TargetISA # noqa
# Re-export the primary target ISA definition.
ISA = defs.ISA.finish() # type: TargetISA


@@ -0,0 +1,28 @@
"""
x86 definitions.
Commonly used definitions.
"""
from __future__ import absolute_import
from cdsl.isa import TargetISA, CPUMode
import base.instructions
from . import instructions as x86
from base.immediates import floatcc
ISA = TargetISA('x86', [base.instructions.GROUP, x86.GROUP]) # type: TargetISA
# CPU modes for 32-bit and 64-bit operation.
X86_64 = CPUMode('I64', ISA)
X86_32 = CPUMode('I32', ISA)
# The set of floating point condition codes that are directly supported.
# Other condition codes need to be reversed or expressed as two tests.
supported_floatccs = [
floatcc.ord,
floatcc.uno,
floatcc.one,
floatcc.ueq,
floatcc.gt,
floatcc.ge,
floatcc.ult,
floatcc.ule]
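# The condition codes missing from this list are recovered in legalize.py
# (later in this commit) by swapping operands or combining two tests. An
# illustrative plain-Python sketch of that mapping (for exposition only,
# not part of the committed file):
REVERSED_FLOATCC = {'lt': 'gt', 'le': 'ge', 'ugt': 'ult', 'uge': 'ule'}
COMBINED_FLOATCC = {'eq': ('band', 'ord', 'ueq'),  # eq = ord & ueq
                    'ne': ('bor', 'uno', 'one')}   # ne = uno | one
def lower_fcmp(cc, x, y):
    if cc in REVERSED_FLOATCC:
        return ('fcmp', REVERSED_FLOATCC[cc], y, x)  # swap the operands
    if cc in COMBINED_FLOATCC:
        op, cc1, cc2 = COMBINED_FLOATCC[cc]
        return (op, ('fcmp', cc1, x, y), ('fcmp', cc2, x, y))
    return ('fcmp', cc, x, y)  # directly supported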


@@ -0,0 +1,748 @@
"""
x86 Encodings.
"""
from __future__ import absolute_import
from cdsl.predicates import IsZero32BitFloat, IsZero64BitFloat
from cdsl.predicates import IsUnsignedInt, Not, And
from base.predicates import IsColocatedFunc, IsColocatedData, LengthEquals
from base import instructions as base
from base import types
from base.formats import UnaryIeee32, UnaryIeee64, UnaryImm
from base.formats import FuncAddr, Call, LoadComplex, StoreComplex
from .defs import X86_64, X86_32
from . import recipes as r
from . import settings as cfg
from . import instructions as x86
from .legalize import x86_expand
from base.legalize import narrow, widen, expand_flags
from base.settings import allones_funcaddrs, is_pic
from .settings import use_sse41
try:
from typing import TYPE_CHECKING, Any # noqa
if TYPE_CHECKING:
from cdsl.instructions import MaybeBoundInst # noqa
from cdsl.predicates import FieldPredicate # noqa
except ImportError:
pass
X86_32.legalize_monomorphic(expand_flags)
X86_32.legalize_type(
default=narrow,
b1=expand_flags,
i8=widen,
i16=widen,
i32=x86_expand,
f32=x86_expand,
f64=x86_expand)
X86_64.legalize_monomorphic(expand_flags)
X86_64.legalize_type(
default=narrow,
b1=expand_flags,
i8=widen,
i16=widen,
i32=x86_expand,
i64=x86_expand,
f32=x86_expand,
f64=x86_expand)
#
# Helper functions for generating encodings.
#
def enc_x86_64(inst, recipe, *args, **kwargs):
# type: (MaybeBoundInst, r.TailRecipe, *int, **int) -> None
"""
Add encodings for `inst` to X86_64 with and without a REX prefix.
"""
X86_64.enc(inst, *recipe.rex(*args, **kwargs))
X86_64.enc(inst, *recipe(*args, **kwargs))
def enc_x86_64_instp(inst, recipe, instp, *args, **kwargs):
# type: (MaybeBoundInst, r.TailRecipe, FieldPredicate, *int, **int) -> None
"""
Add encodings for `inst` to X86_64 with and without a REX prefix.
"""
X86_64.enc(inst, *recipe.rex(*args, **kwargs), instp=instp)
X86_64.enc(inst, *recipe(*args, **kwargs), instp=instp)
def enc_both(inst, recipe, *args, **kwargs):
# type: (MaybeBoundInst, r.TailRecipe, *int, **Any) -> None
"""
Add encodings for `inst` to both X86_32 and X86_64.
"""
X86_32.enc(inst, *recipe(*args, **kwargs))
enc_x86_64(inst, recipe, *args, **kwargs)
def enc_both_instp(inst, recipe, instp, *args, **kwargs):
# type: (MaybeBoundInst, r.TailRecipe, FieldPredicate, *int, **Any) -> None
"""
Add encodings for `inst` to both X86_32 and X86_64.
"""
X86_32.enc(inst, *recipe(*args, **kwargs), instp=instp)
enc_x86_64_instp(inst, recipe, instp, *args, **kwargs)
def enc_i32_i64(inst, recipe, *args, **kwargs):
# type: (MaybeBoundInst, r.TailRecipe, *int, **int) -> None
"""
Add encodings for `inst.i32` to X86_32.
Add encodings for `inst.i32` to X86_64 with and without REX.
Add encodings for `inst.i64` to X86_64 with a REX.W prefix.
"""
X86_32.enc(inst.i32, *recipe(*args, **kwargs))
# REX-less encoding must come after REX encoding so we don't use it by
# default. Otherwise reg-alloc would never use r8 and up.
X86_64.enc(inst.i32, *recipe.rex(*args, **kwargs))
X86_64.enc(inst.i32, *recipe(*args, **kwargs))
X86_64.enc(inst.i64, *recipe.rex(*args, w=1, **kwargs))
def enc_i32_i64_instp(inst, recipe, instp, *args, **kwargs):
# type: (MaybeBoundInst, r.TailRecipe, FieldPredicate, *int, **int) -> None
"""
Add encodings for `inst.i32` to X86_32.
Add encodings for `inst.i32` to X86_64 with and without REX.
Add encodings for `inst.i64` to X86_64 with a REX.W prefix.
Similar to `enc_i32_i64` but applies `instp` to each encoding.
"""
X86_32.enc(inst.i32, *recipe(*args, **kwargs), instp=instp)
# REX-less encoding must come after REX encoding so we don't use it by
# default. Otherwise reg-alloc would never use r8 and up.
X86_64.enc(inst.i32, *recipe.rex(*args, **kwargs), instp=instp)
X86_64.enc(inst.i32, *recipe(*args, **kwargs), instp=instp)
X86_64.enc(inst.i64, *recipe.rex(*args, w=1, **kwargs), instp=instp)
def enc_i32_i64_ld_st(inst, w_bit, recipe, *args, **kwargs):
# type: (MaybeBoundInst, bool, r.TailRecipe, *int, **int) -> None
"""
Add encodings for `inst.i32` to X86_32.
Add encodings for `inst.i32` to X86_64 with and without REX.
Add encodings for `inst.i64` to X86_64 with a REX prefix, using the `w_bit`
argument to determine whether or not to set the REX.W bit.
"""
X86_32.enc(inst.i32.any, *recipe(*args, **kwargs))
# REX-less encoding must come after REX encoding so we don't use it by
# default. Otherwise reg-alloc would never use r8 and up.
X86_64.enc(inst.i32.any, *recipe.rex(*args, **kwargs))
X86_64.enc(inst.i32.any, *recipe(*args, **kwargs))
if w_bit:
X86_64.enc(inst.i64.any, *recipe.rex(*args, w=1, **kwargs))
else:
X86_64.enc(inst.i64.any, *recipe.rex(*args, **kwargs))
X86_64.enc(inst.i64.any, *recipe(*args, **kwargs))
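# For concreteness, a single call to the first helper above, e.g.
# enc_i32_i64(base.iadd, r.rr, 0x01) from the loop just below, registers
# these four encodings (a restatement of the function body, shown here
# for exposition only, not part of the committed file):
#
#   X86_32.enc(base.iadd.i32, *r.rr(0x01))
#   X86_64.enc(base.iadd.i32, *r.rr.rex(0x01))       # REX form listed first
#   X86_64.enc(base.iadd.i32, *r.rr(0x01))           # then REX-less fallback
#   X86_64.enc(base.iadd.i64, *r.rr.rex(0x01, w=1))  # REX.W for i64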
for inst, opc in [
(base.iadd, 0x01),
(base.isub, 0x29),
(base.band, 0x21),
(base.bor, 0x09),
(base.bxor, 0x31)]:
enc_i32_i64(inst, r.rr, opc)
# x86 has a bitwise not instruction NOT.
enc_i32_i64(base.bnot, r.ur, 0xf7, rrr=2)
# Also add `b1` encodings for the logic instructions.
# TODO: Should this be done with 8-bit instructions? It would improve
# partial register dependencies.
enc_both(base.band.b1, r.rr, 0x21)
enc_both(base.bor.b1, r.rr, 0x09)
enc_both(base.bxor.b1, r.rr, 0x31)
enc_i32_i64(base.imul, r.rrx, 0x0f, 0xaf)
enc_i32_i64(x86.sdivmodx, r.div, 0xf7, rrr=7)
enc_i32_i64(x86.udivmodx, r.div, 0xf7, rrr=6)
enc_i32_i64(x86.smulx, r.mulx, 0xf7, rrr=5)
enc_i32_i64(x86.umulx, r.mulx, 0xf7, rrr=4)
enc_i32_i64(base.copy, r.umr, 0x89)
for ty in [types.b1, types.i8, types.i16]:
enc_both(base.copy.bind(ty), r.umr, 0x89)
# For x86-64, only define REX forms for now, since we can't describe the
# special regunit immediate operands with the current constraint language.
for ty in [types.i8, types.i16, types.i32]:
X86_32.enc(base.regmove.bind(ty), *r.rmov(0x89))
X86_64.enc(base.regmove.bind(ty), *r.rmov.rex(0x89))
X86_64.enc(base.regmove.i64, *r.rmov.rex(0x89, w=1))
enc_both(base.regmove.b1, r.rmov, 0x89)
enc_both(base.regmove.i8, r.rmov, 0x89)
# Immediate instructions with sign-extended 8-bit and 32-bit immediate.
for inst, rrr in [
(base.iadd_imm, 0),
(base.band_imm, 4),
(base.bor_imm, 1),
(base.bxor_imm, 6)]:
enc_i32_i64(inst, r.r_ib, 0x83, rrr=rrr)
enc_i32_i64(inst, r.r_id, 0x81, rrr=rrr)
# TODO: band_imm.i64 with an unsigned 32-bit immediate can be encoded as
# band_imm.i32. Can even use the single-byte immediate for 0xffff_ffXX masks.
# Immediate constants.
X86_32.enc(base.iconst.i32, *r.pu_id(0xb8))
X86_64.enc(base.iconst.i32, *r.pu_id.rex(0xb8))
X86_64.enc(base.iconst.i32, *r.pu_id(0xb8))
# The 32-bit immediate movl also zero-extends to 64 bits.
X86_64.enc(base.iconst.i64, *r.pu_id.rex(0xb8),
instp=IsUnsignedInt(UnaryImm.imm, 32))
X86_64.enc(base.iconst.i64, *r.pu_id(0xb8),
instp=IsUnsignedInt(UnaryImm.imm, 32))
# Sign-extended 32-bit immediate.
X86_64.enc(base.iconst.i64, *r.u_id.rex(0xc7, rrr=0, w=1))
# Finally, the 0xb8 opcode takes an 8-byte immediate with a REX.W prefix.
X86_64.enc(base.iconst.i64, *r.pu_iq.rex(0xb8, w=1))
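# Taken together, the four iconst.i64 rules above pick the smallest usable
# form. An illustrative sketch of that choice in plain Python, assuming the
# shortest applicable encoding wins (not part of the committed file):
def iconst_i64_form(imm):
    if 0 <= imm < 2**32:
        return 'movl imm32'    # 0xb8: writes 32 bits, zero-extends
    if -2**31 <= imm < 2**31:
        return 'movq imm32'    # 0xc7/0 + REX.W: sign-extends 32 bits
    return 'movabsq imm64'     # 0xb8 + REX.W: full 8-byte immediate
assert iconst_i64_form(0xffffffff) == 'movl imm32'
assert iconst_i64_form(-1) == 'movq imm32'
assert iconst_i64_form(1 << 40) == 'movabsq imm64'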
# bool constants.
enc_both(base.bconst.b1, r.pu_id_bool, 0xb8)
# Shifts and rotates.
# Note that the dynamic shift amount is only masked to 5 or 6 bits; the 8-bit
# and 16-bit shifts would need explicit masking.
for inst, rrr in [
(base.rotl, 0),
(base.rotr, 1),
(base.ishl, 4),
(base.ushr, 5),
(base.sshr, 7)]:
# Cannot use enc_i32_i64 for this pattern because instructions require
# .any suffix.
X86_32.enc(inst.i32.any, *r.rc(0xd3, rrr=rrr))
X86_64.enc(inst.i64.any, *r.rc.rex(0xd3, rrr=rrr, w=1))
X86_64.enc(inst.i32.any, *r.rc.rex(0xd3, rrr=rrr))
X86_64.enc(inst.i32.any, *r.rc(0xd3, rrr=rrr))
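# The masking note above is why only the i32/i64 shifts are encoded here:
# the hardware masks a CL shift count to 5 bits (32-bit forms) or 6 bits
# (REX.W forms). A quick model of that behavior (illustrative sketch, not
# part of the committed file):
def x86_shl(x, count, bits):
    assert bits in (32, 64)
    return (x << (count & (bits - 1))) & ((1 << bits) - 1)
assert x86_shl(1, 33, 32) == 2        # count masked: 33 & 31 == 1
assert x86_shl(1, 33, 64) == 1 << 33  # the 64-bit form keeps 6 bits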
for inst, rrr in [
(base.ishl_imm, 4),
(base.ushr_imm, 5),
(base.sshr_imm, 7)]:
enc_i32_i64(inst, r.r_ib, 0xc1, rrr=rrr)
# Population count.
X86_32.enc(base.popcnt.i32, *r.urm(0xf3, 0x0f, 0xb8), isap=cfg.use_popcnt)
X86_64.enc(base.popcnt.i64, *r.urm.rex(0xf3, 0x0f, 0xb8, w=1),
isap=cfg.use_popcnt)
X86_64.enc(base.popcnt.i32, *r.urm.rex(0xf3, 0x0f, 0xb8), isap=cfg.use_popcnt)
X86_64.enc(base.popcnt.i32, *r.urm(0xf3, 0x0f, 0xb8), isap=cfg.use_popcnt)
# Count leading zero bits.
X86_32.enc(base.clz.i32, *r.urm(0xf3, 0x0f, 0xbd), isap=cfg.use_lzcnt)
X86_64.enc(base.clz.i64, *r.urm.rex(0xf3, 0x0f, 0xbd, w=1),
isap=cfg.use_lzcnt)
X86_64.enc(base.clz.i32, *r.urm.rex(0xf3, 0x0f, 0xbd), isap=cfg.use_lzcnt)
X86_64.enc(base.clz.i32, *r.urm(0xf3, 0x0f, 0xbd), isap=cfg.use_lzcnt)
# Count trailing zero bits.
X86_32.enc(base.ctz.i32, *r.urm(0xf3, 0x0f, 0xbc), isap=cfg.use_bmi1)
X86_64.enc(base.ctz.i64, *r.urm.rex(0xf3, 0x0f, 0xbc, w=1),
isap=cfg.use_bmi1)
X86_64.enc(base.ctz.i32, *r.urm.rex(0xf3, 0x0f, 0xbc), isap=cfg.use_bmi1)
X86_64.enc(base.ctz.i32, *r.urm(0xf3, 0x0f, 0xbc), isap=cfg.use_bmi1)
#
# Loads and stores.
#
ldcomplexp = LengthEquals(LoadComplex, 2)
for recipe in [r.ldWithIndex, r.ldWithIndexDisp8, r.ldWithIndexDisp32]:
enc_i32_i64_instp(base.load_complex, recipe, ldcomplexp, 0x8b)
enc_x86_64_instp(base.uload32_complex, recipe, ldcomplexp, 0x8b)
X86_64.enc(base.sload32_complex, *recipe.rex(0x63, w=1),
instp=ldcomplexp)
enc_i32_i64_instp(base.uload16_complex, recipe, ldcomplexp, 0x0f, 0xb7)
enc_i32_i64_instp(base.sload16_complex, recipe, ldcomplexp, 0x0f, 0xbf)
enc_i32_i64_instp(base.uload8_complex, recipe, ldcomplexp, 0x0f, 0xb6)
enc_i32_i64_instp(base.sload8_complex, recipe, ldcomplexp, 0x0f, 0xbe)
stcomplexp = LengthEquals(StoreComplex, 3)
for recipe in [r.stWithIndex, r.stWithIndexDisp8, r.stWithIndexDisp32]:
enc_i32_i64_instp(base.store_complex, recipe, stcomplexp, 0x89)
enc_x86_64_instp(base.istore32_complex, recipe, stcomplexp, 0x89)
enc_both_instp(base.istore16_complex.i32, recipe, stcomplexp, 0x66, 0x89)
enc_x86_64_instp(base.istore16_complex.i64, recipe, stcomplexp, 0x66, 0x89)
for recipe in [r.stWithIndex_abcd,
r.stWithIndexDisp8_abcd,
r.stWithIndexDisp32_abcd]:
enc_both_instp(base.istore8_complex.i32, recipe, stcomplexp, 0x88)
enc_x86_64_instp(base.istore8_complex.i64, recipe, stcomplexp, 0x88)
for recipe in [r.st, r.stDisp8, r.stDisp32]:
enc_i32_i64_ld_st(base.store, True, recipe, 0x89)
enc_x86_64(base.istore32.i64.any, recipe, 0x89)
enc_i32_i64_ld_st(base.istore16, False, recipe, 0x66, 0x89)
# Byte stores are more complicated because the registers they can address
# depend on the presence of a REX prefix. The st*_abcd recipes fall back to
# the corresponding st* recipes when a REX prefix is applied.
for recipe in [r.st_abcd, r.stDisp8_abcd, r.stDisp32_abcd]:
enc_both(base.istore8.i32.any, recipe, 0x88)
enc_x86_64(base.istore8.i64.any, recipe, 0x88)
enc_i32_i64(base.spill, r.spillSib32, 0x89)
enc_i32_i64(base.regspill, r.regspill32, 0x89)
# Use a 32-bit write for spilling `b1`, `i8` and `i16` to avoid
# constraining the permitted registers.
# See MIN_SPILL_SLOT_SIZE which makes this safe.
for ty in [types.b1, types.i8, types.i16]:
enc_both(base.spill.bind(ty), r.spillSib32, 0x89)
enc_both(base.regspill.bind(ty), r.regspill32, 0x89)
for recipe in [r.ld, r.ldDisp8, r.ldDisp32]:
enc_i32_i64_ld_st(base.load, True, recipe, 0x8b)
enc_x86_64(base.uload32.i64, recipe, 0x8b)
X86_64.enc(base.sload32.i64, *recipe.rex(0x63, w=1))
enc_i32_i64_ld_st(base.uload16, True, recipe, 0x0f, 0xb7)
enc_i32_i64_ld_st(base.sload16, True, recipe, 0x0f, 0xbf)
enc_i32_i64_ld_st(base.uload8, True, recipe, 0x0f, 0xb6)
enc_i32_i64_ld_st(base.sload8, True, recipe, 0x0f, 0xbe)
enc_i32_i64(base.fill, r.fillSib32, 0x8b)
enc_i32_i64(base.regfill, r.regfill32, 0x8b)
# Load 32 bits from `b1`, `i8` and `i16` spill slots. See `spill.b1` above.
for ty in [types.b1, types.i8, types.i16]:
enc_both(base.fill.bind(ty), r.fillSib32, 0x8b)
enc_both(base.regfill.bind(ty), r.regfill32, 0x8b)
# Push and Pop
X86_32.enc(x86.push.i32, *r.pushq(0x50))
enc_x86_64(x86.push.i64, r.pushq, 0x50)
X86_32.enc(x86.pop.i32, *r.popq(0x58))
enc_x86_64(x86.pop.i64, r.popq, 0x58)
# Copy Special
# For x86-64, only define REX forms for now, since we can't describe the
# special regunit immediate operands with the current constraint language.
X86_64.enc(base.copy_special, *r.copysp.rex(0x89, w=1))
X86_32.enc(base.copy_special, *r.copysp(0x89))
# Adjust SP down by a dynamic value (or up, with a negative operand).
X86_32.enc(base.adjust_sp_down.i32, *r.adjustsp(0x29))
X86_64.enc(base.adjust_sp_down.i64, *r.adjustsp.rex(0x29, w=1))
# Adjust SP up by an immediate (or down, with a negative immediate)
X86_32.enc(base.adjust_sp_up_imm, *r.adjustsp_ib(0x83))
X86_32.enc(base.adjust_sp_up_imm, *r.adjustsp_id(0x81))
X86_64.enc(base.adjust_sp_up_imm, *r.adjustsp_ib.rex(0x83, w=1))
X86_64.enc(base.adjust_sp_up_imm, *r.adjustsp_id.rex(0x81, w=1))
# Adjust SP down by an immediate (or up, with a negative immediate)
X86_32.enc(base.adjust_sp_down_imm, *r.adjustsp_ib(0x83, rrr=5))
X86_32.enc(base.adjust_sp_down_imm, *r.adjustsp_id(0x81, rrr=5))
X86_64.enc(base.adjust_sp_down_imm, *r.adjustsp_ib.rex(0x83, rrr=5, w=1))
X86_64.enc(base.adjust_sp_down_imm, *r.adjustsp_id.rex(0x81, rrr=5, w=1))
#
# Float loads and stores.
#
enc_both(base.load.f32.any, r.fld, 0xf3, 0x0f, 0x10)
enc_both(base.load.f32.any, r.fldDisp8, 0xf3, 0x0f, 0x10)
enc_both(base.load.f32.any, r.fldDisp32, 0xf3, 0x0f, 0x10)
enc_both(base.load_complex.f32, r.fldWithIndex, 0xf3, 0x0f, 0x10)
enc_both(base.load_complex.f32, r.fldWithIndexDisp8, 0xf3, 0x0f, 0x10)
enc_both(base.load_complex.f32, r.fldWithIndexDisp32, 0xf3, 0x0f, 0x10)
enc_both(base.load.f64.any, r.fld, 0xf2, 0x0f, 0x10)
enc_both(base.load.f64.any, r.fldDisp8, 0xf2, 0x0f, 0x10)
enc_both(base.load.f64.any, r.fldDisp32, 0xf2, 0x0f, 0x10)
enc_both(base.load_complex.f64, r.fldWithIndex, 0xf2, 0x0f, 0x10)
enc_both(base.load_complex.f64, r.fldWithIndexDisp8, 0xf2, 0x0f, 0x10)
enc_both(base.load_complex.f64, r.fldWithIndexDisp32, 0xf2, 0x0f, 0x10)
enc_both(base.store.f32.any, r.fst, 0xf3, 0x0f, 0x11)
enc_both(base.store.f32.any, r.fstDisp8, 0xf3, 0x0f, 0x11)
enc_both(base.store.f32.any, r.fstDisp32, 0xf3, 0x0f, 0x11)
enc_both(base.store_complex.f32, r.fstWithIndex, 0xf3, 0x0f, 0x11)
enc_both(base.store_complex.f32, r.fstWithIndexDisp8, 0xf3, 0x0f, 0x11)
enc_both(base.store_complex.f32, r.fstWithIndexDisp32, 0xf3, 0x0f, 0x11)
enc_both(base.store.f64.any, r.fst, 0xf2, 0x0f, 0x11)
enc_both(base.store.f64.any, r.fstDisp8, 0xf2, 0x0f, 0x11)
enc_both(base.store.f64.any, r.fstDisp32, 0xf2, 0x0f, 0x11)
enc_both(base.store_complex.f64, r.fstWithIndex, 0xf2, 0x0f, 0x11)
enc_both(base.store_complex.f64, r.fstWithIndexDisp8, 0xf2, 0x0f, 0x11)
enc_both(base.store_complex.f64, r.fstWithIndexDisp32, 0xf2, 0x0f, 0x11)
enc_both(base.fill.f32, r.ffillSib32, 0xf3, 0x0f, 0x10)
enc_both(base.regfill.f32, r.fregfill32, 0xf3, 0x0f, 0x10)
enc_both(base.fill.f64, r.ffillSib32, 0xf2, 0x0f, 0x10)
enc_both(base.regfill.f64, r.fregfill32, 0xf2, 0x0f, 0x10)
enc_both(base.spill.f32, r.fspillSib32, 0xf3, 0x0f, 0x11)
enc_both(base.regspill.f32, r.fregspill32, 0xf3, 0x0f, 0x11)
enc_both(base.spill.f64, r.fspillSib32, 0xf2, 0x0f, 0x11)
enc_both(base.regspill.f64, r.fregspill32, 0xf2, 0x0f, 0x11)
#
# Function addresses.
#
# Non-PIC, all-zeros function addresses.
X86_32.enc(base.func_addr.i32, *r.fnaddr4(0xb8),
isap=And(Not(allones_funcaddrs), Not(is_pic)))
X86_64.enc(base.func_addr.i64, *r.fnaddr8.rex(0xb8, w=1),
isap=And(Not(allones_funcaddrs), Not(is_pic)))
# Non-PIC, all-ones function addresses.
X86_32.enc(base.func_addr.i32, *r.allones_fnaddr4(0xb8),
isap=And(allones_funcaddrs, Not(is_pic)))
X86_64.enc(base.func_addr.i64, *r.allones_fnaddr8.rex(0xb8, w=1),
isap=And(allones_funcaddrs, Not(is_pic)))
# 64-bit, colocated, both PIC and non-PIC. Use the lea instruction's
# pc-relative field.
X86_64.enc(base.func_addr.i64, *r.pcrel_fnaddr8.rex(0x8d, w=1),
instp=IsColocatedFunc(FuncAddr.func_ref))
# 64-bit, non-colocated, PIC.
X86_64.enc(base.func_addr.i64, *r.got_fnaddr8.rex(0x8b, w=1),
isap=is_pic)
#
# Global addresses.
#
# Non-PIC
X86_32.enc(base.symbol_value.i32, *r.gvaddr4(0xb8),
isap=Not(is_pic))
X86_64.enc(base.symbol_value.i64, *r.gvaddr8.rex(0xb8, w=1),
isap=Not(is_pic))
# PIC, colocated
X86_64.enc(base.symbol_value.i64, *r.pcrel_gvaddr8.rex(0x8d, w=1),
isap=is_pic,
instp=IsColocatedData())
# PIC, non-colocated
X86_64.enc(base.symbol_value.i64, *r.got_gvaddr8.rex(0x8b, w=1),
isap=is_pic)
#
# Stack addresses.
#
# TODO: Add encoding rules for stack_load and stack_store, so that they
# don't get legalized to stack_addr + load/store.
#
X86_32.enc(base.stack_addr.i32, *r.spaddr4_id(0x8d))
X86_64.enc(base.stack_addr.i64, *r.spaddr8_id.rex(0x8d, w=1))
#
# Call/return
#
# 32-bit, both PIC and non-PIC.
X86_32.enc(base.call, *r.call_id(0xe8))
# 64-bit, colocated, both PIC and non-PIC. Use the call instruction's
# pc-relative field.
X86_64.enc(base.call, *r.call_id(0xe8),
instp=IsColocatedFunc(Call.func_ref))
# 64-bit, non-colocated, PIC. There is no 64-bit non-colocated non-PIC
# version, since non-PIC currently uses the large code model, which requires
# calls to be lowered to func_addr + call_indirect.
X86_64.enc(base.call, *r.call_plt_id(0xe8), isap=is_pic)
X86_32.enc(base.call_indirect.i32, *r.call_r(0xff, rrr=2))
X86_64.enc(base.call_indirect.i64, *r.call_r.rex(0xff, rrr=2))
X86_64.enc(base.call_indirect.i64, *r.call_r(0xff, rrr=2))
X86_32.enc(base.x_return, *r.ret(0xc3))
X86_64.enc(base.x_return, *r.ret(0xc3))
#
# Branches
#
enc_both(base.jump, r.jmpb, 0xeb)
enc_both(base.jump, r.jmpd, 0xe9)
enc_both(base.brif, r.brib, 0x70)
enc_both(base.brif, r.brid, 0x0f, 0x80)
# Not all float condition codes are legal; see `supported_floatccs`.
enc_both(base.brff, r.brfb, 0x70)
enc_both(base.brff, r.brfd, 0x0f, 0x80)
# Note that the tjccd opcode will be prefixed with 0x0f.
enc_i32_i64(base.brz, r.tjccb, 0x74)
enc_i32_i64(base.brz, r.tjccd, 0x84)
enc_i32_i64(base.brnz, r.tjccb, 0x75)
enc_i32_i64(base.brnz, r.tjccd, 0x85)
# Branch on a b1 value in a register only looks at the low 8 bits. See also
# bint encodings below.
#
# Start with the worst-case encoding for X86_32 only. The register allocator
# can't handle a branch with an ABCD-constrained operand.
X86_32.enc(base.brz.b1, *r.t8jccd_long(0x84))
X86_32.enc(base.brnz.b1, *r.t8jccd_long(0x85))
enc_both(base.brz.b1, r.t8jccb_abcd, 0x74)
enc_both(base.brz.b1, r.t8jccd_abcd, 0x84)
enc_both(base.brnz.b1, r.t8jccb_abcd, 0x75)
enc_both(base.brnz.b1, r.t8jccd_abcd, 0x85)
#
# Jump tables
#
X86_64.enc(base.jump_table_entry.i64.any.any, *r.jt_entry.rex(0x63, w=1))
X86_32.enc(base.jump_table_entry.i32.any.any, *r.jt_entry(0x8b))
X86_64.enc(base.jump_table_base.i64, *r.jt_base.rex(0x8d, w=1))
X86_32.enc(base.jump_table_base.i32, *r.jt_base(0x8d))
enc_x86_64(base.indirect_jump_table_br.i64, r.indirect_jmp, 0xff, rrr=4)
X86_32.enc(base.indirect_jump_table_br.i32, *r.indirect_jmp(0xff, rrr=4))
#
# Trap as ud2
#
X86_32.enc(base.trap, *r.trap(0x0f, 0x0b))
X86_64.enc(base.trap, *r.trap(0x0f, 0x0b))
# Debug trap as int3
X86_32.enc(base.debugtrap, r.debugtrap, 0)
X86_64.enc(base.debugtrap, r.debugtrap, 0)
# Using a standard EncRecipe, not the TailRecipe.
X86_32.enc(base.trapif, r.trapif, 0)
X86_64.enc(base.trapif, r.trapif, 0)
X86_32.enc(base.trapff, r.trapff, 0)
X86_64.enc(base.trapff, r.trapff, 0)
#
# Comparisons
#
enc_i32_i64(base.icmp, r.icscc, 0x39)
enc_i32_i64(base.icmp_imm, r.icscc_ib, 0x83, rrr=7)
enc_i32_i64(base.icmp_imm, r.icscc_id, 0x81, rrr=7)
enc_i32_i64(base.ifcmp, r.rcmp, 0x39)
enc_i32_i64(base.ifcmp_imm, r.rcmp_ib, 0x83, rrr=7)
enc_i32_i64(base.ifcmp_imm, r.rcmp_id, 0x81, rrr=7)
# TODO: We could special-case ifcmp_imm(x, 0) to TEST(x, x).
X86_32.enc(base.ifcmp_sp.i32, *r.rcmp_sp(0x39))
X86_64.enc(base.ifcmp_sp.i64, *r.rcmp_sp.rex(0x39, w=1))
#
# Convert flags to bool.
#
# This encodes `b1` as an 8-bit low register with the value 0 or 1.
enc_both(base.trueif, r.seti_abcd, 0x0f, 0x90)
enc_both(base.trueff, r.setf_abcd, 0x0f, 0x90)
#
# Conditional move (a.k.a. integer select)
#
enc_i32_i64(base.selectif, r.cmov, 0x0F, 0x40)
#
# Bit scan forwards and reverse
#
enc_i32_i64(x86.bsf, r.bsf_and_bsr, 0x0F, 0xBC)
enc_i32_i64(x86.bsr, r.bsf_and_bsr, 0x0F, 0xBD)
#
# Convert bool to int.
#
# This assumes that b1 is represented as an 8-bit low register with the value 0
# or 1.
#
# Encode movzbq as movzbl, because it's equivalent and shorter.
X86_32.enc(base.bint.i32.b1, *r.urm_noflags_abcd(0x0f, 0xb6))
X86_64.enc(base.bint.i64.b1, *r.urm_noflags.rex(0x0f, 0xb6))
X86_64.enc(base.bint.i64.b1, *r.urm_noflags_abcd(0x0f, 0xb6))
X86_64.enc(base.bint.i32.b1, *r.urm_noflags.rex(0x0f, 0xb6))
X86_64.enc(base.bint.i32.b1, *r.urm_noflags_abcd(0x0f, 0xb6))
# Numerical conversions.
# Reducing an integer is a no-op.
X86_32.enc(base.ireduce.i8.i16, r.null, 0)
X86_32.enc(base.ireduce.i8.i32, r.null, 0)
X86_32.enc(base.ireduce.i16.i32, r.null, 0)
X86_64.enc(base.ireduce.i8.i16, r.null, 0)
X86_64.enc(base.ireduce.i8.i32, r.null, 0)
X86_64.enc(base.ireduce.i16.i32, r.null, 0)
X86_64.enc(base.ireduce.i8.i64, r.null, 0)
X86_64.enc(base.ireduce.i16.i64, r.null, 0)
X86_64.enc(base.ireduce.i32.i64, r.null, 0)
# TODO: Add encodings for cbw, cwde, cdqe, which are sign-extending
# instructions for %al/%ax/%eax to %ax/%eax/%rax.
# movsbl
X86_32.enc(base.sextend.i32.i8, *r.urm_noflags_abcd(0x0f, 0xbe))
X86_64.enc(base.sextend.i32.i8, *r.urm_noflags.rex(0x0f, 0xbe))
X86_64.enc(base.sextend.i32.i8, *r.urm_noflags_abcd(0x0f, 0xbe))
# movswl
X86_32.enc(base.sextend.i32.i16, *r.urm_noflags(0x0f, 0xbf))
X86_64.enc(base.sextend.i32.i16, *r.urm_noflags.rex(0x0f, 0xbf))
X86_64.enc(base.sextend.i32.i16, *r.urm_noflags(0x0f, 0xbf))
# movsbq
X86_64.enc(base.sextend.i64.i8, *r.urm_noflags.rex(0x0f, 0xbe, w=1))
# movswq
X86_64.enc(base.sextend.i64.i16, *r.urm_noflags.rex(0x0f, 0xbf, w=1))
# movslq
X86_64.enc(base.sextend.i64.i32, *r.urm_noflags.rex(0x63, w=1))
# movzbl
X86_32.enc(base.uextend.i32.i8, *r.urm_noflags_abcd(0x0f, 0xb6))
X86_64.enc(base.uextend.i32.i8, *r.urm_noflags.rex(0x0f, 0xb6))
X86_64.enc(base.uextend.i32.i8, *r.urm_noflags_abcd(0x0f, 0xb6))
# movzwl
X86_32.enc(base.uextend.i32.i16, *r.urm_noflags(0x0f, 0xb7))
X86_64.enc(base.uextend.i32.i16, *r.urm_noflags.rex(0x0f, 0xb7))
X86_64.enc(base.uextend.i32.i16, *r.urm_noflags(0x0f, 0xb7))
# movzbq, encoded as movzbl because it's equivalent and shorter
X86_64.enc(base.uextend.i64.i8, *r.urm_noflags.rex(0x0f, 0xb6))
X86_64.enc(base.uextend.i64.i8, *r.urm_noflags(0x0f, 0xb6))
# movzwq, encoded as movzwl because it's equivalent and shorter
X86_64.enc(base.uextend.i64.i16, *r.urm_noflags.rex(0x0f, 0xb7))
X86_64.enc(base.uextend.i64.i16, *r.urm_noflags(0x0f, 0xb7))
# A 32-bit register copy clears the high 32 bits.
X86_64.enc(base.uextend.i64.i32, *r.umr.rex(0x89))
X86_64.enc(base.uextend.i64.i32, *r.umr(0x89))
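# The two rules above lean on an x86-64 architectural guarantee: any write
# to a 32-bit register clears bits 63:32, so a plain 32-bit mov (0x89)
# doubles as uextend.i64.i32. Modeled in Python (illustrative sketch, not
# part of the committed file):
def mov32_zero_extends(value64):
    return value64 & 0xFFFFFFFF  # upper half of the register is cleared
assert mov32_zero_extends(0xDEADBEEFCAFEBABE) == 0xCAFEBABE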
#
# Floating point
#
# Floating-point constants equal to 0.0 can be encoded using either
# `xorps` or `xorpd`, for 32-bit and 64-bit floats respectively.
X86_32.enc(base.f32const, *r.f32imm_z(0x0f, 0x57),
instp=IsZero32BitFloat(UnaryIeee32.imm))
X86_32.enc(base.f64const, *r.f64imm_z(0x66, 0x0f, 0x57),
instp=IsZero64BitFloat(UnaryIeee64.imm))
enc_x86_64_instp(base.f32const, r.f32imm_z,
IsZero32BitFloat(UnaryIeee32.imm), 0x0f, 0x57)
enc_x86_64_instp(base.f64const, r.f64imm_z,
IsZero64BitFloat(UnaryIeee64.imm), 0x66, 0x0f, 0x57)
# movd
enc_both(base.bitcast.f32.i32, r.frurm, 0x66, 0x0f, 0x6e)
enc_both(base.bitcast.i32.f32, r.rfumr, 0x66, 0x0f, 0x7e)
# movq
X86_64.enc(base.bitcast.f64.i64, *r.frurm.rex(0x66, 0x0f, 0x6e, w=1))
X86_64.enc(base.bitcast.i64.f64, *r.rfumr.rex(0x66, 0x0f, 0x7e, w=1))
# movaps
enc_both(base.copy.f32, r.furm, 0x0f, 0x28)
enc_both(base.copy.f64, r.furm, 0x0f, 0x28)
# For x86-64, only define REX forms for now, since we can't describe the
# special regunit immediate operands with the current constraint language.
X86_32.enc(base.regmove.f32, *r.frmov(0x0f, 0x28))
X86_64.enc(base.regmove.f32, *r.frmov.rex(0x0f, 0x28))
# For x86-64, only define REX forms for now, since we can't describe the
# special regunit immediate operands with the current constraint language.
X86_32.enc(base.regmove.f64, *r.frmov(0x0f, 0x28))
X86_64.enc(base.regmove.f64, *r.frmov.rex(0x0f, 0x28))
# cvtsi2ss
enc_i32_i64(base.fcvt_from_sint.f32, r.frurm, 0xf3, 0x0f, 0x2a)
# cvtsi2sd
enc_i32_i64(base.fcvt_from_sint.f64, r.frurm, 0xf2, 0x0f, 0x2a)
# cvtss2sd
enc_both(base.fpromote.f64.f32, r.furm, 0xf3, 0x0f, 0x5a)
# cvtsd2ss
enc_both(base.fdemote.f32.f64, r.furm, 0xf2, 0x0f, 0x5a)
# cvttss2si
enc_both(x86.cvtt2si.i32.f32, r.rfurm, 0xf3, 0x0f, 0x2c)
X86_64.enc(x86.cvtt2si.i64.f32, *r.rfurm.rex(0xf3, 0x0f, 0x2c, w=1))
# cvttsd2si
enc_both(x86.cvtt2si.i32.f64, r.rfurm, 0xf2, 0x0f, 0x2c)
X86_64.enc(x86.cvtt2si.i64.f64, *r.rfurm.rex(0xf2, 0x0f, 0x2c, w=1))
# Exact square roots.
enc_both(base.sqrt.f32, r.furm, 0xf3, 0x0f, 0x51)
enc_both(base.sqrt.f64, r.furm, 0xf2, 0x0f, 0x51)
# Rounding. The recipe looks at the opcode to pick an immediate.
for inst in [
base.nearest,
base.floor,
base.ceil,
base.trunc]:
enc_both(inst.f32, r.furmi_rnd, 0x66, 0x0f, 0x3a, 0x0a, isap=use_sse41)
enc_both(inst.f64, r.furmi_rnd, 0x66, 0x0f, 0x3a, 0x0b, isap=use_sse41)
# Binary arithmetic ops.
for inst, opc in [
(base.fadd, 0x58),
(base.fsub, 0x5c),
(base.fmul, 0x59),
(base.fdiv, 0x5e),
(x86.fmin, 0x5d),
(x86.fmax, 0x5f)]:
enc_both(inst.f32, r.fa, 0xf3, 0x0f, opc)
enc_both(inst.f64, r.fa, 0xf2, 0x0f, opc)
# Binary bitwise ops.
for inst, opc in [
(base.band, 0x54),
(base.bor, 0x56),
(base.bxor, 0x57)]:
enc_both(inst.f32, r.fa, 0x0f, opc)
enc_both(inst.f64, r.fa, 0x0f, opc)
# The `andnps(x,y)` instruction computes `~x&y`, while `band_not(x,y)` is `x&~y`.
enc_both(base.band_not.f32, r.fax, 0x0f, 0x55)
enc_both(base.band_not.f64, r.fax, 0x0f, 0x55)
# Comparisons.
#
# This only covers the condition codes in `supported_floatccs`, the rest are
# handled by legalization patterns.
enc_both(base.fcmp.f32, r.fcscc, 0x0f, 0x2e)
enc_both(base.fcmp.f64, r.fcscc, 0x66, 0x0f, 0x2e)
enc_both(base.ffcmp.f32, r.fcmp, 0x0f, 0x2e)
enc_both(base.ffcmp.f64, r.fcmp, 0x66, 0x0f, 0x2e)


@@ -0,0 +1,173 @@
"""
Supplementary instruction definitions for x86.
This module defines additional instructions that are useful only to the x86
target ISA.
"""
from base.types import iflags
from cdsl.operands import Operand
from cdsl.typevar import TypeVar
from cdsl.instructions import Instruction, InstructionGroup
GROUP = InstructionGroup("x86", "x86-specific instruction set")
iWord = TypeVar('iWord', 'A scalar integer machine word', ints=(32, 64))
nlo = Operand('nlo', iWord, doc='Low part of numerator')
nhi = Operand('nhi', iWord, doc='High part of numerator')
d = Operand('d', iWord, doc='Denominator')
q = Operand('q', iWord, doc='Quotient')
r = Operand('r', iWord, doc='Remainder')
udivmodx = Instruction(
'x86_udivmodx', r"""
Extended unsigned division.
Concatenate the bits in `nhi` and `nlo` to form the numerator.
Interpret the bits as an unsigned number and divide by the unsigned
denominator `d`. Trap when `d` is zero or if the quotient is larger
than the range of the output.
Return both quotient and remainder.
""",
ins=(nlo, nhi, d), outs=(q, r), can_trap=True)
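# A plain-Python model of the documented x86_udivmodx semantics (sdivmodx
# below is analogous with a signed interpretation). Illustrative sketch
# only, not part of the committed file:
def model_udivmodx(nlo, nhi, d, bits=64):
    n = (nhi << bits) | nlo       # concatenate nhi:nlo to form the numerator
    if d == 0:
        raise ZeroDivisionError   # the instruction traps
    q, r = divmod(n, d)
    if q >= 1 << bits:
        raise OverflowError       # an out-of-range quotient also traps
    return q, r
assert model_udivmodx(7, 0, 3) == (2, 1)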
sdivmodx = Instruction(
'x86_sdivmodx', r"""
Extended signed division.
Concatenate the bits in `nhi` and `nlo` to form the numerator.
Interpret the bits as a signed number and divide by the signed
denominator `d`. Trap when `d` is zero or if the quotient is outside
the range of the output.
Return both quotient and remainder.
""",
ins=(nlo, nhi, d), outs=(q, r), can_trap=True)
argL = Operand('argL', iWord)
argR = Operand('argR', iWord)
resLo = Operand('resLo', iWord)
resHi = Operand('resHi', iWord)
umulx = Instruction(
'x86_umulx', r"""
Unsigned integer multiplication, producing a double-length result.
Polymorphic over all scalar integer types, but does not support vector
types.
""",
ins=(argL, argR), outs=(resLo, resHi))
smulx = Instruction(
'x86_smulx', r"""
Signed integer multiplication, producing a double-length result.
Polymorphic over all scalar integer types, but does not support vector
types.
""",
ins=(argL, argR), outs=(resLo, resHi))
Float = TypeVar(
'Float', 'A scalar or vector floating point number',
floats=True, simd=True)
IntTo = TypeVar(
'IntTo', 'An integer type with the same number of lanes',
ints=(32, 64), simd=True)
x = Operand('x', Float)
a = Operand('a', IntTo)
cvtt2si = Instruction(
'x86_cvtt2si', r"""
Convert with truncation floating point to signed integer.
The source floating point operand is converted to a signed integer by
rounding towards zero. If the result can't be represented in the output
type, returns the smallest signed value the output type can represent.
This instruction does not trap.
""",
ins=x, outs=a)
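# The saturating, non-trapping behavior described above can be modeled
# directly (illustrative sketch for the scalar i32 case, not part of the
# committed file):
import math
def model_cvtt2si32(x):
    INT32_MIN, INT32_MAX = -2**31, 2**31 - 1
    if math.isnan(x) or math.isinf(x):
        return INT32_MIN          # NaN/infinity yield the minimum value
    t = math.trunc(x)             # round toward zero
    return t if INT32_MIN <= t <= INT32_MAX else INT32_MIN
assert model_cvtt2si32(-1.7) == -1
assert model_cvtt2si32(float('nan')) == -2**31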
x = Operand('x', Float)
a = Operand('a', Float)
y = Operand('y', Float)
fmin = Instruction(
'x86_fmin', r"""
Floating point minimum with x86 semantics.
This is equivalent to the C ternary operator `x < y ? x : y` which
differs from :inst:`fmin` when either operand is NaN or when comparing
+0.0 to -0.0.
When the two operands don't compare as LT, `y` is returned unchanged,
even if it is a signalling NaN.
""",
ins=(x, y), outs=a)
fmax = Instruction(
'x86_fmax', r"""
Floating point maximum with x86 semantics.
This is equivalent to the C ternary operator `x > y ? x : y` which
differs from :inst:`fmax` when either operand is NaN or when comparing
+0.0 to -0.0.
When the two operands don't compare as GT, `y` is returned unchanged,
even if it is a signalling NaN.
""",
ins=(x, y), outs=a)
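# The docstrings above pin down the subtle part of minss/minsd and
# maxss/maxsd: an unordered compare falls through to the second operand.
# A one-line model of each (illustrative sketch, not part of the
# committed file):
import math
def model_x86_fmin(x, y):
    return x if x < y else y  # NaN or unordered input returns y unchanged
def model_x86_fmax(x, y):
    return x if x > y else y
assert model_x86_fmin(1.0, 2.0) == 1.0
assert math.isnan(model_x86_fmin(1.0, float('nan')))  # y is returned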
x = Operand('x', iWord)
push = Instruction(
'x86_push', r"""
Pushes a value onto the stack.
Decrements the stack pointer and stores the specified value at the new top.
This is polymorphic in i32 and i64. However, it is only implemented for i64
in 64-bit mode, and only for i32 in 32-bit mode.
""",
ins=x, can_store=True, other_side_effects=True)
pop = Instruction(
'x86_pop', r"""
Pops a value from the stack.
Loads a value from the top of the stack and then increments the stack
pointer.
This is polymorphic in i32 and i64. However, it is only implemented for i64
in 64-bit mode, and only for i32 in 32-bit mode.
""",
outs=x, can_load=True, other_side_effects=True)
y = Operand('y', iWord)
rflags = Operand('rflags', iflags)
bsr = Instruction(
'x86_bsr', r"""
Bit Scan Reverse -- returns the bit-index of the most significant 1
in the word. Result is undefined if the argument is zero. However, it
sets the Z flag depending on the argument, so it is at least easy to
detect and handle that case.
This is polymorphic in i32 and i64. It is implemented for both i64 and
i32 in 64-bit mode, and only for i32 in 32-bit mode.
""",
ins=x, outs=(y, rflags))
bsf = Instruction(
'x86_bsf', r"""
Bit Scan Forward -- returns the bit-index of the least significant 1
in the word. It is otherwise identical to `bsr`, just above.
""",
ins=x, outs=(y, rflags))
GROUP.close()
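# The flag output of bsr/bsf is what the clz/ctz legalizations in
# legalize.py (next file) key on. A small model, using None for the
# architecturally undefined result (illustrative sketch, not part of the
# committed file):
def model_bsr(x):
    if x == 0:
        return None, True              # result undefined, Z flag set
    return x.bit_length() - 1, False   # index of the most significant 1
def model_bsf(x):
    if x == 0:
        return None, True
    return (x & -x).bit_length() - 1, False  # least significant 1
assert model_bsr(0b100100) == (5, False)
assert model_bsf(0b100100) == (2, False)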


@@ -0,0 +1,229 @@
"""
Custom legalization patterns for x86.
"""
from __future__ import absolute_import
from cdsl.ast import Var
from cdsl.xform import Rtl, XFormGroup
from base.immediates import imm64, intcc, floatcc
from base import legalize as shared
from base import instructions as insts
from . import instructions as x86
from .defs import ISA
x86_expand = XFormGroup(
'x86_expand',
"""
Legalize instructions by expansion.
Use x86-specific instructions if needed.
""",
isa=ISA, chain=shared.expand_flags)
a = Var('a')
dead = Var('dead')
x = Var('x')
xhi = Var('xhi')
y = Var('y')
a1 = Var('a1')
a2 = Var('a2')
#
# Division and remainder.
#
# The srem expansion requires custom code because `srem INT_MIN, -1` is not
# allowed to trap. The other ops need to check `avoid_div_traps`.
x86_expand.custom_legalize(insts.sdiv, 'expand_sdivrem')
x86_expand.custom_legalize(insts.srem, 'expand_sdivrem')
x86_expand.custom_legalize(insts.udiv, 'expand_udivrem')
x86_expand.custom_legalize(insts.urem, 'expand_udivrem')
#
# Double length (widening) multiplication
#
resLo = Var('resLo')
resHi = Var('resHi')
x86_expand.legalize(
resHi << insts.umulhi(x, y),
Rtl(
(resLo, resHi) << x86.umulx(x, y)
))
x86_expand.legalize(
resHi << insts.smulhi(x, y),
Rtl(
(resLo, resHi) << x86.smulx(x, y)
))
# Floating point condition codes.
#
# The 8 condition codes in `supported_floatccs` are directly supported by a
# `ucomiss` or `ucomisd` instruction. The remaining codes need legalization
# patterns.
# Equality needs an explicit `ord` test which checks the parity bit.
x86_expand.legalize(
a << insts.fcmp(floatcc.eq, x, y),
Rtl(
a1 << insts.fcmp(floatcc.ord, x, y),
a2 << insts.fcmp(floatcc.ueq, x, y),
a << insts.band(a1, a2)
))
x86_expand.legalize(
a << insts.fcmp(floatcc.ne, x, y),
Rtl(
a1 << insts.fcmp(floatcc.uno, x, y),
a2 << insts.fcmp(floatcc.one, x, y),
a << insts.bor(a1, a2)
))
# Inequalities that need to be reversed.
for cc, rev_cc in [
(floatcc.lt, floatcc.gt),
(floatcc.le, floatcc.ge),
(floatcc.ugt, floatcc.ult),
(floatcc.uge, floatcc.ule)]:
x86_expand.legalize(
a << insts.fcmp(cc, x, y),
Rtl(
a << insts.fcmp(rev_cc, y, x)
))
# We need to modify the CFG for min/max legalization.
x86_expand.custom_legalize(insts.fmin, 'expand_minmax')
x86_expand.custom_legalize(insts.fmax, 'expand_minmax')
# Conversions from unsigned need special handling.
x86_expand.custom_legalize(insts.fcvt_from_uint, 'expand_fcvt_from_uint')
# Conversions from float to int can trap and modify the control flow graph.
x86_expand.custom_legalize(insts.fcvt_to_sint, 'expand_fcvt_to_sint')
x86_expand.custom_legalize(insts.fcvt_to_uint, 'expand_fcvt_to_uint')
x86_expand.custom_legalize(insts.fcvt_to_sint_sat, 'expand_fcvt_to_sint_sat')
x86_expand.custom_legalize(insts.fcvt_to_uint_sat, 'expand_fcvt_to_uint_sat')
# Count leading and trailing zeroes, for baseline x86_64
c_minus_one = Var('c_minus_one')
c_thirty_one = Var('c_thirty_one')
c_thirty_two = Var('c_thirty_two')
c_sixty_three = Var('c_sixty_three')
c_sixty_four = Var('c_sixty_four')
index1 = Var('index1')
r2flags = Var('r2flags')
index2 = Var('index2')
x86_expand.legalize(
a << insts.clz.i64(x),
Rtl(
c_minus_one << insts.iconst(imm64(-1)),
c_sixty_three << insts.iconst(imm64(63)),
(index1, r2flags) << x86.bsr(x),
index2 << insts.selectif(intcc.eq, r2flags, c_minus_one, index1),
a << insts.isub(c_sixty_three, index2),
))
x86_expand.legalize(
a << insts.clz.i32(x),
Rtl(
c_minus_one << insts.iconst(imm64(-1)),
c_thirty_one << insts.iconst(imm64(31)),
(index1, r2flags) << x86.bsr(x),
index2 << insts.selectif(intcc.eq, r2flags, c_minus_one, index1),
a << insts.isub(c_thirty_one, index2),
))
x86_expand.legalize(
a << insts.ctz.i64(x),
Rtl(
c_sixty_four << insts.iconst(imm64(64)),
(index1, r2flags) << x86.bsf(x),
a << insts.selectif(intcc.eq, r2flags, c_sixty_four, index1),
))
x86_expand.legalize(
a << insts.ctz.i32(x),
Rtl(
c_thirty_two << insts.iconst(imm64(32)),
(index1, r2flags) << x86.bsf(x),
a << insts.selectif(intcc.eq, r2flags, c_thirty_two, index1),
))
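# The four expansions above can be checked in plain Python against a
# simple bsr/bsf model (illustrative sketch of the i32 cases, not part of
# the committed file):
def model_clz32(x):
    index, zf = (None, True) if x == 0 else (x.bit_length() - 1, False)
    index2 = -1 if zf else index      # selectif(eq, rflags, -1, index1)
    return 31 - index2                # isub(31, index2); clz(0) == 32
def model_ctz32(x):
    index, zf = (None, True) if x == 0 else ((x & -x).bit_length() - 1, False)
    return 32 if zf else index        # selectif(eq, rflags, 32, index1)
assert model_clz32(1) == 31 and model_clz32(0) == 32
assert model_ctz32(8) == 3 and model_ctz32(0) == 32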
# Population count for baseline x86_64
qv1 = Var('qv1')
qv3 = Var('qv3')
qv4 = Var('qv4')
qv5 = Var('qv5')
qv6 = Var('qv6')
qv7 = Var('qv7')
qv8 = Var('qv8')
qv9 = Var('qv9')
qv10 = Var('qv10')
qv11 = Var('qv11')
qv12 = Var('qv12')
qv13 = Var('qv13')
qv14 = Var('qv14')
qv15 = Var('qv15')
qv16 = Var('qv16')
qc77 = Var('qc77')
qc0F = Var('qc0F')
qc01 = Var('qc01')
x86_expand.legalize(
qv16 << insts.popcnt.i64(qv1),
Rtl(
qv3 << insts.ushr_imm(qv1, imm64(1)),
qc77 << insts.iconst(imm64(0x7777777777777777)),
qv4 << insts.band(qv3, qc77),
qv5 << insts.isub(qv1, qv4),
qv6 << insts.ushr_imm(qv4, imm64(1)),
qv7 << insts.band(qv6, qc77),
qv8 << insts.isub(qv5, qv7),
qv9 << insts.ushr_imm(qv7, imm64(1)),
qv10 << insts.band(qv9, qc77),
qv11 << insts.isub(qv8, qv10),
qv12 << insts.ushr_imm(qv11, imm64(4)),
qv13 << insts.iadd(qv11, qv12),
qc0F << insts.iconst(imm64(0x0F0F0F0F0F0F0F0F)),
qv14 << insts.band(qv13, qc0F),
qc01 << insts.iconst(imm64(0x0101010101010101)),
qv15 << insts.imul(qv14, qc01),
qv16 << insts.ushr_imm(qv15, imm64(56))
))
lv1 = Var('lv1')
lv3 = Var('lv3')
lv4 = Var('lv4')
lv5 = Var('lv5')
lv6 = Var('lv6')
lv7 = Var('lv7')
lv8 = Var('lv8')
lv9 = Var('lv9')
lv10 = Var('lv10')
lv11 = Var('lv11')
lv12 = Var('lv12')
lv13 = Var('lv13')
lv14 = Var('lv14')
lv15 = Var('lv15')
lv16 = Var('lv16')
lc77 = Var('lc77')
lc0F = Var('lc0F')
lc01 = Var('lc01')
x86_expand.legalize(
lv16 << insts.popcnt.i32(lv1),
Rtl(
lv3 << insts.ushr_imm(lv1, imm64(1)),
lc77 << insts.iconst(imm64(0x77777777)),
lv4 << insts.band(lv3, lc77),
lv5 << insts.isub(lv1, lv4),
lv6 << insts.ushr_imm(lv4, imm64(1)),
lv7 << insts.band(lv6, lc77),
lv8 << insts.isub(lv5, lv7),
lv9 << insts.ushr_imm(lv7, imm64(1)),
lv10 << insts.band(lv9, lc77),
lv11 << insts.isub(lv8, lv10),
lv12 << insts.ushr_imm(lv11, imm64(4)),
lv13 << insts.iadd(lv11, lv12),
lc0F << insts.iconst(imm64(0x0F0F0F0F)),
lv14 << insts.band(lv13, lc0F),
lc01 << insts.iconst(imm64(0x01010101)),
lv15 << insts.imul(lv14, lc01),
lv16 << insts.ushr_imm(lv15, imm64(24))
))
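# Both Rtl bodies above are the classic nibble-wise SWAR popcount. A
# direct Python transliteration of the 64-bit version, with explicit
# masking since Python integers don't wrap (illustrative sketch, not part
# of the committed file):
def model_popcnt64(qv1):
    m = (1 << 64) - 1
    c77 = 0x7777777777777777
    qv4 = (qv1 >> 1) & c77
    qv5 = (qv1 - qv4) & m             # n - floor(n/2), per nibble
    qv7 = (qv4 >> 1) & c77
    qv8 = (qv5 - qv7) & m             # ... - floor(n/4)
    qv10 = (qv7 >> 1) & c77
    qv11 = (qv8 - qv10) & m           # ... - floor(n/8): nibble popcounts
    qv13 = (qv11 + (qv11 >> 4)) & m
    qv14 = qv13 & 0x0F0F0F0F0F0F0F0F  # byte-wise popcounts
    qv15 = (qv14 * 0x0101010101010101) & m
    return qv15 >> 56                 # horizontal sum of the bytes
assert all(model_popcnt64(v) == bin(v).count('1')
           for v in (0, 1, 2**64 - 1, 0x8000000000000001))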

File diff suppressed because it is too large.


@@ -0,0 +1,61 @@
"""
x86 register banks.
While the floating-point registers are straightforward, the general-purpose
register bank has a few quirks on x86. We have these encodings of the 8-bit
registers:
bits  I32  I64  |  16b  32b  64b
000   AL   AL   |  AX   EAX  RAX
001   CL   CL   |  CX   ECX  RCX
010   DL   DL   |  DX   EDX  RDX
011   BL   BL   |  BX   EBX  RBX
100   AH   SPL  |  SP   ESP  RSP
101   CH   BPL  |  BP   EBP  RBP
110   DH   SIL  |  SI   ESI  RSI
111   BH   DIL  |  DI   EDI  RDI
Here, the I64 column refers to the registers you get with a REX prefix. Without
the REX prefix, you get the I32 registers.
The 8-bit registers are not that useful since WebAssembly only has i32 and i64
data types, and the H-registers even less so. Rather than trying to model the
H-registers accurately, we'll avoid using them in both I32 and I64 modes.
"""
from __future__ import absolute_import
from cdsl.registers import RegBank, RegClass, Stack
from .defs import ISA
IntRegs = RegBank(
'IntRegs', ISA,
'General purpose registers',
units=16, prefix='r',
names='rax rcx rdx rbx rsp rbp rsi rdi'.split())
FloatRegs = RegBank(
'FloatRegs', ISA,
'SSE floating point registers',
units=16, prefix='xmm')
FlagRegs = RegBank(
'FlagRegs', ISA,
'Flag registers',
units=1,
pressure_tracking=False,
names=['rflags'])
GPR = RegClass(IntRegs)
GPR8 = GPR[0:8]
ABCD = GPR[0:4]
FPR = RegClass(FloatRegs)
FPR8 = FPR[0:8]
FLAG = RegClass(FlagRegs)
# Constraints for stack operands.
# Stack operand with a 32-bit signed displacement from either RBP or RSP.
StackGPR32 = Stack(GPR)
StackFPR32 = Stack(FPR)
RegClass.extract_names(globals())


@@ -0,0 +1,54 @@
"""
x86 settings.
"""
from __future__ import absolute_import
from cdsl.settings import SettingGroup, BoolSetting, Preset
from cdsl.predicates import And
import base.settings as shared
from .defs import ISA
ISA.settings = SettingGroup('x86', parent=shared.group)
# The has_* settings here correspond to CPUID bits.
# CPUID.01H:ECX
has_sse3 = BoolSetting("SSE3: CPUID.01H:ECX.SSE3[bit 0]")
has_ssse3 = BoolSetting("SSSE3: CPUID.01H:ECX.SSSE3[bit 9]")
has_sse41 = BoolSetting("SSE4.1: CPUID.01H:ECX.SSE4_1[bit 19]")
has_sse42 = BoolSetting("SSE4.2: CPUID.01H:ECX.SSE4_2[bit 20]")
has_popcnt = BoolSetting("POPCNT: CPUID.01H:ECX.POPCNT[bit 23]")
has_avx = BoolSetting("AVX: CPUID.01H:ECX.AVX[bit 28]")
# CPUID.(EAX=07H, ECX=0H):EBX
has_bmi1 = BoolSetting("BMI1: CPUID.(EAX=07H, ECX=0H):EBX.BMI1[bit 3]")
has_bmi2 = BoolSetting("BMI2: CPUID.(EAX=07H, ECX=0H):EBX.BMI2[bit 8]")
# CPUID.EAX=80000001H:ECX
has_lzcnt = BoolSetting("LZCNT: CPUID.EAX=80000001H:ECX.LZCNT[bit 5]")
# The use_* settings here determine whether a feature can be used.
use_sse41 = And(has_sse41)
use_sse42 = And(has_sse42, use_sse41)
use_popcnt = And(has_popcnt, has_sse42)
use_bmi1 = And(has_bmi1)
use_lzcnt = And(has_lzcnt)
# Presets corresponding to x86 CPUs.
baseline = Preset()
nehalem = Preset(
has_sse3, has_ssse3, has_sse41, has_sse42, has_popcnt)
haswell = Preset(nehalem, has_bmi1, has_bmi2, has_lzcnt)
broadwell = Preset(haswell)
skylake = Preset(broadwell)
cannonlake = Preset(skylake)
icelake = Preset(cannonlake)
znver1 = Preset(
has_sse3, has_ssse3, has_sse41, has_sse42, has_popcnt,
has_bmi1, has_bmi2, has_lzcnt)
ISA.settings.close(globals())
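# A sketch of how the predicates and presets above compose, with plain
# sets standing in for the BoolSetting/Preset machinery (illustrative
# only, not part of the committed file):
nehalem_flags = {'has_sse3', 'has_ssse3', 'has_sse41', 'has_sse42',
                 'has_popcnt'}
haswell_flags = nehalem_flags | {'has_bmi1', 'has_bmi2', 'has_lzcnt'}
baseline_flags = set()
def can_use_popcnt(flags):
    return {'has_popcnt', 'has_sse42'} <= flags  # And(has_popcnt, has_sse42)
assert can_use_popcnt(haswell_flags)
assert not can_use_popcnt(baseline_flags)  # falls back to the SWAR expansion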