machinst x64: implement calls and int cmp/store/loads;

This makes it possible to run a simple recursive Fibonacci function in wasmtime.
2020-06-12 16:20:30 +02:00
parent 2d364f75bd
commit c9a3f05afd
11 changed files with 2364 additions and 998 deletions
--- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
@@ -899,7 +899,7 @@ pub enum Inst {
    },
    /// Marker, no-op in generated code: SP "virtual offset" is adjusted. This
-    /// controls MemArg::NominalSPOffset args are lowered.
+    /// controls how MemArg::NominalSPOffset args are lowered.
    VirtualSPOffsetAdj {
        offset: i64,
    },
--- a/cranelift/codegen/src/isa/x64/abi.rs
+++ b/cranelift/codegen/src/isa/x64/abi.rs
--- a/cranelift/codegen/src/isa/x64/inst/args.rs
+++ b/cranelift/codegen/src/isa/x64/inst/args.rs
@@ -3,16 +3,20 @@
 use std::fmt;
 use std::string::{String, ToString};
-use regalloc::{RealRegUniverse, Reg, RegClass, RegUsageCollector};
+use regalloc::{RealRegUniverse, Reg, RegClass, RegUsageCollector, RegUsageMapper};
 use crate::ir::condcodes::IntCC;
 use crate::machinst::*;
-use super::regs::show_ireg_sized;
+use super::{
    regs::{self, show_ireg_sized},
    EmitState,
 };
-/// A Memory Address. These denote a 64-bit value only.
+/// A possible addressing mode (amode) that can be used in instructions.
 /// These denote a 64-bit value only.
 #[derive(Clone)]
-pub(crate) enum Addr {
+pub enum Amode {
    /// Immediate sign-extended and a Register.
    ImmReg { simm32: u32, base: Reg },
@@ -25,7 +29,7 @@ pub(crate) enum Addr {
    },
 }
-impl Addr {
+impl Amode {
    pub(crate) fn imm_reg(simm32: u32, base: Reg) -> Self {
        debug_assert!(base.get_class() == RegClass::I64);
        Self::ImmReg { simm32, base }
@@ -46,15 +50,10 @@ impl Addr {
    /// Add the regs mentioned by `self` to `collector`.
    pub(crate) fn get_regs_as_uses(&self, collector: &mut RegUsageCollector) {
        match self {
-            Addr::ImmReg { simm32: _, base } => {
+            Amode::ImmReg { base, .. } => {
                collector.add_use(*base);
            }
-            Addr::ImmRegRegShift {
+            Amode::ImmRegRegShift { base, index, .. } => {
                simm32: _,
                base,
                index,
                shift: _,
            } => {
                collector.add_use(*base);
                collector.add_use(*index);
            }
@@ -62,13 +61,13 @@ impl Addr {
    }
 }
-impl ShowWithRRU for Addr {
+impl ShowWithRRU for Amode {
    fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String {
        match self {
-            Addr::ImmReg { simm32, base } => {
+            Amode::ImmReg { simm32, base } => {
                format!("{}({})", *simm32 as i32, base.show_rru(mb_rru))
            }
-            Addr::ImmRegRegShift {
+            Amode::ImmRegRegShift {
                simm32,
                base,
                index,
@@ -84,14 +83,84 @@ impl ShowWithRRU for Addr {
    }
 }
 /// A memory address that may still depend on emission-time state. These denote a 64-bit value
 /// only.
 ///
 /// Used for usual addressing modes as well as addressing modes used during compilation, when the
 /// moving SP offset is not known. `SyntheticAmode::finalize` turns either form into a concrete
 /// `Amode`.
 #[derive(Clone)]
 pub enum SyntheticAmode {
    /// A real amode; `finalize` returns a clone of it unchanged.
    Real(Amode),
    /// A (virtual) offset to the "nominal SP" value, which will be recomputed as we push and pop
    /// within the function. `finalize` adds the emission state's `virtual_sp_offset` to `simm32`
    /// and produces an `rsp`-based `Amode`.
    NominalSPOffset { simm32: u32 },
 }
 impl SyntheticAmode {
    /// Creates an address located at offset `simm32` from the nominal SP.
    pub(crate) fn nominal_sp_offset(simm32: u32) -> Self {
        SyntheticAmode::NominalSPOffset { simm32 }
    }
    /// Add the regs mentioned by `self` to `collector`.
    pub(crate) fn get_regs_as_uses(&self, collector: &mut RegUsageCollector) {
        match self {
            SyntheticAmode::Real(addr) => addr.get_regs_as_uses(collector),
            SyntheticAmode::NominalSPOffset { .. } => {
                // Nothing to do; the base is SP and isn't involved in regalloc.
            }
        }
    }
    /// Forwards `map` to the wrapped amode's `map_uses`; a nominal-SP offset mentions no
    /// register that needs mapping.
    pub(crate) fn map_uses<RUM: RegUsageMapper>(&mut self, map: &RUM) {
        match self {
            SyntheticAmode::Real(addr) => addr.map_uses(map),
            SyntheticAmode::NominalSPOffset { .. } => {
                // Nothing to do.
            }
        }
    }
    /// Resolves `self` into a concrete `Amode` using the current emission `state`.
    ///
    /// A `Real` amode is returned as a clone. A `NominalSPOffset` becomes an `rsp`-based
    /// `Amode::ImmReg` whose displacement is `simm32` (interpreted as a signed 32-bit value) plus
    /// `state.virtual_sp_offset`.
    ///
    /// # Panics
    ///
    /// Panics if the resulting displacement does not fit in a signed 32-bit immediate, since
    /// materializing larger offsets is not implemented yet.
    pub(crate) fn finalize(&self, state: &mut EmitState) -> Amode {
        match self {
            SyntheticAmode::Real(addr) => addr.clone(),
            SyntheticAmode::NominalSPOffset { simm32 } => {
                // `simm32` carries the bits of a *signed* displacement (it is displayed via
                // `as i32` and ends up in a sign-extended immediate), so it must be sign-extended
                // before the addition. Zero-extending would turn small negative offsets into
                // huge positive ones.
                let off = i64::from(*simm32 as i32) + state.virtual_sp_offset;
                // TODO will require a sequence of add etc.
                assert!(
                    off >= i64::from(i32::min_value()) && off <= i64::from(i32::max_value()),
                    "amode finalize: add sequence NYI"
                );
                // In range, so this only reinterprets the two's complement bits.
                Amode::imm_reg(off as i32 as u32, regs::rsp())
            }
        }
    }
 }
 /// Wraps a real addressing mode as `SyntheticAmode::Real`.
 ///
 /// Implemented as `From` rather than `Into`: the standard library's blanket impl still provides
 /// `Into<SyntheticAmode> for Amode`, so `amode.into()` and `impl Into<SyntheticAmode>` parameters
 /// keep working, and `SyntheticAmode::from(amode)` becomes available as well.
 impl From<Amode> for SyntheticAmode {
    fn from(amode: Amode) -> Self {
        SyntheticAmode::Real(amode)
    }
 }
 impl ShowWithRRU for SyntheticAmode {
    /// Renders the address for display: a real amode is shown by its own `show_rru`, while a
    /// nominal-SP offset is shown as its signed offset followed by a "virtual offset" marker.
    fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String {
        match *self {
            SyntheticAmode::NominalSPOffset { simm32 } => {
                // The stored bits are displayed as a signed 32-bit value.
                let signed_offset = simm32 as i32;
                format!("rsp({} + virtual offset)", signed_offset)
            }
            SyntheticAmode::Real(ref amode) => amode.show_rru(mb_rru),
        }
    }
 }
 /// An operand which is either an integer Register, a value in Memory or an Immediate.  This can
 /// denote an 8, 16, 32 or 64 bit value.  For the Immediate form, in the 8- and 16-bit case, only
 /// the lower 8 or 16 bits of `simm32` is relevant.  In the 64-bit case, the value denoted by
 /// `simm32` is its sign-extension out to 64 bits.
 #[derive(Clone)]
-pub(crate) enum RegMemImm {
+pub enum RegMemImm {
    Reg { reg: Reg },
-    Mem { addr: Addr },
+    Mem { addr: SyntheticAmode },
    Imm { simm32: u32 },
 }
@@ -100,8 +169,8 @@ impl RegMemImm {
        debug_assert!(reg.get_class() == RegClass::I64);
        Self::Reg { reg }
    }
-    pub(crate) fn mem(addr: Addr) -> Self {
+    pub(crate) fn mem(addr: impl Into<SyntheticAmode>) -> Self {
-        Self::Mem { addr }
+        Self::Mem { addr: addr.into() }
    }
    pub(crate) fn imm(simm32: u32) -> Self {
        Self::Imm { simm32 }
@@ -134,9 +203,9 @@ impl ShowWithRRU for RegMemImm {
 /// An operand which is either an integer Register or a value in Memory.  This can denote an 8, 16,
 /// 32 or 64 bit value.
 #[derive(Clone)]
-pub(crate) enum RegMem {
+pub enum RegMem {
    Reg { reg: Reg },
-    Mem { addr: Addr },
+    Mem { addr: SyntheticAmode },
 }
 impl RegMem {
@@ -144,8 +213,8 @@ impl RegMem {
        debug_assert!(reg.get_class() == RegClass::I64 || reg.get_class() == RegClass::V128);
        Self::Reg { reg }
    }
-    pub(crate) fn mem(addr: Addr) -> Self {
+    pub(crate) fn mem(addr: impl Into<SyntheticAmode>) -> Self {
-        Self::Mem { addr }
+        Self::Mem { addr: addr.into() }
    }
    /// Add the regs mentioned by `self` to `collector`.
@@ -382,6 +451,13 @@ pub enum ExtMode {
 }
 impl ExtMode {
    /// Size of the value being extended *from*: 1 for the `B*` modes, 2 for the `W*` modes and 4
    /// for `LQ` (presumably a byte count, matching the byte/word/long naming).
    pub(crate) fn src_size(&self) -> u8 {
        match *self {
            ExtMode::LQ => 4,
            ExtMode::WQ | ExtMode::WL => 2,
            ExtMode::BQ | ExtMode::BL => 1,
        }
    }
    pub(crate) fn dst_size(&self) -> u8 {
        match self {
            ExtMode::BL | ExtMode::WL => 4,
--- a/cranelift/codegen/src/isa/x64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit.rs
@@ -1,6 +1,9 @@
-use crate::isa::x64::inst::*;
+use log::debug;
 use regalloc::Reg;
 use crate::binemit::Reloc;
 use crate::isa::x64::inst::*;
 fn low8_will_sign_extend_to_64(x: u32) -> bool {
    let xs = (x as i32) as i64;
    xs == ((xs << 56) >> 56)
@@ -164,7 +167,7 @@ fn emit_std_enc_mem(
    opcodes: u32,
    mut num_opcodes: usize,
    enc_g: u8,
-    mem_e: &Addr,
+    mem_e: &Amode,
    rex: RexFlags,
 ) {
    // General comment for this function: the registers in `mem_e` must be
@@ -174,7 +177,7 @@ fn emit_std_enc_mem(
    prefix.emit(sink);
    match mem_e {
-        Addr::ImmReg { simm32, base } => {
+        Amode::ImmReg { simm32, base } => {
            // First, the REX byte.
            let enc_e = int_reg_enc(*base);
            rex.emit_two_op(sink, enc_g, enc_e);
@@ -228,7 +231,7 @@ fn emit_std_enc_mem(
            }
        }
-        Addr::ImmRegRegShift {
+        Amode::ImmRegRegShift {
            simm32,
            base: reg_base,
            index: reg_index,
@@ -306,7 +309,7 @@ fn emit_std_reg_mem(
    opcodes: u32,
    num_opcodes: usize,
    reg_g: Reg,
-    mem_e: &Addr,
+    mem_e: &Amode,
    rex: RexFlags,
 ) {
    let enc_g = reg_enc(reg_g);
@@ -389,10 +392,13 @@ fn emit_simm(sink: &mut MachBuffer<Inst>, size: u8, simm32: u32) {
 ///
 /// * there's a shorter encoding for shl/shr/sar by a 1-bit immediate.  (Do we
 ///   care?)
-pub(crate) fn emit(inst: &Inst, sink: &mut MachBuffer<Inst>) {
+pub(crate) fn emit(
    inst: &Inst,
    sink: &mut MachBuffer<Inst>,
    _flags: &settings::Flags,
    state: &mut EmitState,
 ) {
    match inst {
        Inst::Nop { len: 0 } => {}
        Inst::Alu_RMI_R {
            is_64,
            op,
@@ -428,7 +434,7 @@ pub(crate) fn emit(inst: &Inst, sink: &mut MachBuffer<Inst>) {
                            0x0FAF,
                            2,
                            reg_g.to_reg(),
-                            addr,
+                            &addr.finalize(state),
                            rex,
                        );
                    }
@@ -460,47 +466,39 @@ pub(crate) fn emit(inst: &Inst, sink: &mut MachBuffer<Inst>) {
                };
                match src {
-                    RegMemImm::Reg { reg: regE } => {
+                    RegMemImm::Reg { reg: reg_e } => {
-                        // Note.  The arguments .. regE .. reg_g .. sequence
+                        // GCC/llvm use the swapped operand encoding (viz., the R/RM vs RM/R
-                        // here is the opposite of what is expected.  I'm not
+                        // duality). Do this too, so as to be able to compare generated machine
-                        // sure why this is.  But I am fairly sure that the
+                        // code easily.
                        // arg order could be switched back to the expected
                        // .. reg_g .. regE .. if opcode_rr is also switched
                        // over to the "other" basic integer opcode (viz, the
                        // R/RM vs RM/R duality).  However, that would mean
                        // that the test results won't be in accordance with
                        // the GNU as reference output.  In other words, the
                        // inversion exists as a result of using GNU as as a
                        // gold standard.
                        emit_std_reg_reg(
                            sink,
                            LegacyPrefix::None,
                            opcode_r,
                            1,
-                            *regE,
+                            *reg_e,
                            reg_g.to_reg(),
                            rex,
                        );
-                        // NB: if this is ever extended to handle byte size
+                        // NB: if this is ever extended to handle byte size ops, be sure to retain
-                        // ops, be sure to retain redundant REX prefixes.
+                        // redundant REX prefixes.
                    }
                    RegMemImm::Mem { addr } => {
-                        // Whereas here we revert to the "normal" G-E ordering.
+                        // Here we revert to the "normal" G-E ordering.
                        emit_std_reg_mem(
                            sink,
                            LegacyPrefix::None,
                            opcode_m,
                            1,
                            reg_g.to_reg(),
-                            addr,
+                            &addr.finalize(state),
                            rex,
                        );
                    }
                    RegMemImm::Imm { simm32 } => {
-                        let useImm8 = low8_will_sign_extend_to_32(*simm32);
+                        let use_imm8 = low8_will_sign_extend_to_32(*simm32);
-                        let opcode = if useImm8 { 0x83 } else { 0x81 };
+                        let opcode = if use_imm8 { 0x83 } else { 0x81 };
                        // And also here we use the "normal" G-E ordering.
                        let enc_g = int_reg_enc(reg_g.to_reg());
                        emit_std_enc_enc(
@@ -512,7 +510,7 @@ pub(crate) fn emit(inst: &Inst, sink: &mut MachBuffer<Inst>) {
                            enc_g,
                            rex,
                        );
-                        emit_simm(sink, if useImm8 { 1 } else { 4 }, *simm32);
+                        emit_simm(sink, if use_imm8 { 1 } else { 4 }, *simm32);
                    }
                }
            }
@@ -548,161 +546,129 @@ pub(crate) fn emit(inst: &Inst, sink: &mut MachBuffer<Inst>) {
            emit_std_reg_reg(sink, LegacyPrefix::None, 0x89, 1, *src, dst.to_reg(), rex);
        }
-        Inst::MovZX_M_R { extMode, addr, dst } => {
+        Inst::MovZX_RM_R { ext_mode, src, dst } => {
-            match extMode {
+            let (opcodes, num_opcodes, rex_flags) = match ext_mode {
                ExtMode::BL => {
                    // MOVZBL is (REX.W==0) 0F B6 /r
-                    emit_std_reg_mem(
+                    (0x0FB6, 2, RexFlags::clear_w())
                        sink,
                        LegacyPrefix::None,
                        0x0FB6,
                        2,
                        dst.to_reg(),
                        addr,
                        RexFlags::clear_w(),
                    )
                }
                ExtMode::BQ => {
                    // MOVZBQ is (REX.W==1) 0F B6 /r
                    // I'm not sure why the Intel manual offers different
                    // encodings for MOVZBQ than for MOVZBL.  AIUI they should
                    // achieve the same, since MOVZBL is just going to zero out
                    // the upper half of the destination anyway.
-                    emit_std_reg_mem(
+                    (0x0FB6, 2, RexFlags::set_w())
                        sink,
                        LegacyPrefix::None,
                        0x0FB6,
                        2,
                        dst.to_reg(),
                        addr,
                        RexFlags::set_w(),
                    )
                }
                ExtMode::WL => {
                    // MOVZWL is (REX.W==0) 0F B7 /r
-                    emit_std_reg_mem(
+                    (0x0FB7, 2, RexFlags::clear_w())
                        sink,
                        LegacyPrefix::None,
                        0x0FB7,
                        2,
                        dst.to_reg(),
                        addr,
                        RexFlags::clear_w(),
                    )
                }
                ExtMode::WQ => {
                    // MOVZWQ is (REX.W==1) 0F B7 /r
-                    emit_std_reg_mem(
+                    (0x0FB7, 2, RexFlags::set_w())
                        sink,
                        LegacyPrefix::None,
                        0x0FB7,
                        2,
                        dst.to_reg(),
                        addr,
                        RexFlags::set_w(),
                    )
                }
                ExtMode::LQ => {
                    // This is just a standard 32 bit load, and we rely on the
                    // default zero-extension rule to perform the extension.
                    // Note that in reg/reg mode, gcc seems to use the swapped form R/RM, which we
                    // don't do here, since it's the same encoding size.
                    // MOV r/m32, r32 is (REX.W==0) 8B /r
-                    emit_std_reg_mem(
+                    (0x8B, 1, RexFlags::clear_w())
                }
            };
            match src {
                RegMem::Reg { reg: src } => emit_std_reg_reg(
                    sink,
                    LegacyPrefix::None,
-                        0x8B,
+                    opcodes,
-                        1,
+                    num_opcodes,
                    dst.to_reg(),
-                        addr,
+                    *src,
-                        RexFlags::clear_w(),
+                    rex_flags,
-                    )
+                ),
-                }
+                RegMem::Mem { addr: src } => emit_std_reg_mem(
                    sink,
                    LegacyPrefix::None,
                    opcodes,
                    num_opcodes,
                    dst.to_reg(),
                    &src.finalize(state),
                    rex_flags,
                ),
            }
        }
-        Inst::Mov64_M_R { addr, dst } => emit_std_reg_mem(
+        Inst::Mov64_M_R { src, dst } => emit_std_reg_mem(
            sink,
            LegacyPrefix::None,
            0x8B,
            1,
            dst.to_reg(),
-            addr,
+            &src.finalize(state),
            RexFlags::set_w(),
        ),
-        Inst::MovSX_M_R { extMode, addr, dst } => {
+        Inst::LoadEffectiveAddress { addr, dst } => emit_std_reg_mem(
            match extMode {
                ExtMode::BL => {
                    // MOVSBL is (REX.W==0) 0F BE /r
                    emit_std_reg_mem(
            sink,
            LegacyPrefix::None,
-                        0x0FBE,
+            0x8D,
                        2,
                        dst.to_reg(),
                        addr,
                        RexFlags::clear_w(),
                    )
                }
                ExtMode::BQ => {
                    // MOVSBQ is (REX.W==1) 0F BE /r
                    emit_std_reg_mem(
                        sink,
                        LegacyPrefix::None,
                        0x0FBE,
                        2,
                        dst.to_reg(),
                        addr,
                        RexFlags::set_w(),
                    )
                }
                ExtMode::WL => {
                    // MOVSWL is (REX.W==0) 0F BF /r
                    emit_std_reg_mem(
                        sink,
                        LegacyPrefix::None,
                        0x0FBF,
                        2,
                        dst.to_reg(),
                        addr,
                        RexFlags::clear_w(),
                    )
                }
                ExtMode::WQ => {
                    // MOVSWQ is (REX.W==1) 0F BF /r
                    emit_std_reg_mem(
                        sink,
                        LegacyPrefix::None,
                        0x0FBF,
                        2,
                        dst.to_reg(),
                        addr,
                        RexFlags::set_w(),
                    )
                }
                ExtMode::LQ => {
                    // MOVSLQ is (REX.W==1) 63 /r
                    emit_std_reg_mem(
                        sink,
                        LegacyPrefix::None,
                        0x63,
            1,
            dst.to_reg(),
-                        addr,
+            &addr.finalize(state),
            RexFlags::set_w(),
-                    )
+        ),
        Inst::MovSX_RM_R { ext_mode, src, dst } => {
            let (opcodes, num_opcodes, rex_flags) = match ext_mode {
                ExtMode::BL => {
                    // MOVSBL is (REX.W==0) 0F BE /r
                    (0x0FBE, 2, RexFlags::clear_w())
                }
                ExtMode::BQ => {
                    // MOVSBQ is (REX.W==1) 0F BE /r
                    (0x0FBE, 2, RexFlags::set_w())
                }
                ExtMode::WL => {
                    // MOVSWL is (REX.W==0) 0F BF /r
                    (0x0FBF, 2, RexFlags::clear_w())
                }
                ExtMode::WQ => {
                    // MOVSWQ is (REX.W==1) 0F BF /r
                    (0x0FBF, 2, RexFlags::set_w())
                }
                ExtMode::LQ => {
                    // MOVSLQ is (REX.W==1) 63 /r
                    (0x63, 1, RexFlags::set_w())
                }
            };
            match src {
                RegMem::Reg { reg: src } => emit_std_reg_reg(
                    sink,
                    LegacyPrefix::None,
                    opcodes,
                    num_opcodes,
                    dst.to_reg(),
                    *src,
                    rex_flags,
                ),
                RegMem::Mem { addr: src } => emit_std_reg_mem(
                    sink,
                    LegacyPrefix::None,
                    opcodes,
                    num_opcodes,
                    dst.to_reg(),
                    &src.finalize(state),
                    rex_flags,
                ),
            }
        }
-        Inst::Mov_R_M { size, src, addr } => {
+        Inst::Mov_R_M { size, src, dst } => {
            let dst = &dst.finalize(state);
            match size {
                1 => {
                    // This is one of the few places where the presence of a
@@ -716,7 +682,7 @@ pub(crate) fn emit(inst: &Inst, sink: &mut MachBuffer<Inst>) {
                    };
                    // MOV r8, r/m8 is (REX.W==0) 88 /r
-                    emit_std_reg_mem(sink, LegacyPrefix::None, 0x88, 1, *src, addr, rex)
+                    emit_std_reg_mem(sink, LegacyPrefix::None, 0x88, 1, *src, dst, rex)
                }
                2 => {
@@ -727,7 +693,7 @@ pub(crate) fn emit(inst: &Inst, sink: &mut MachBuffer<Inst>) {
                        0x89,
                        1,
                        *src,
-                        addr,
+                        dst,
                        RexFlags::clear_w(),
                    )
                }
@@ -740,7 +706,7 @@ pub(crate) fn emit(inst: &Inst, sink: &mut MachBuffer<Inst>) {
                        0x89,
                        1,
                        *src,
-                        addr,
+                        dst,
                        RexFlags::clear_w(),
                    )
                }
@@ -753,7 +719,7 @@ pub(crate) fn emit(inst: &Inst, sink: &mut MachBuffer<Inst>) {
                        0x89,
                        1,
                        *src,
-                        addr,
+                        dst,
                        RexFlags::set_w(),
                    )
                }
@@ -825,23 +791,25 @@ pub(crate) fn emit(inst: &Inst, sink: &mut MachBuffer<Inst>) {
            };
            match src_e {
-                RegMemImm::Reg { reg: regE } => {
+                RegMemImm::Reg { reg: reg_e } => {
                    let opcode = if *size == 1 { 0x38 } else { 0x39 };
                    if *size == 1 {
-                        // We also need to check whether the E register forces
+                        // Check whether the E register forces the use of a redundant REX.
-                        // the use of a redundant REX.
+                        let enc_e = int_reg_enc(*reg_e);
-                        let encE = int_reg_enc(*regE);
+                        if enc_e >= 4 && enc_e <= 7 {
                        if encE >= 4 && encE <= 7 {
                            rex.always_emit();
                        }
                    }
-                    // Same comment re swapped args as for Alu_RMI_R.
+
-                    emit_std_reg_reg(sink, prefix, opcode, 1, *regE, *reg_g, rex);
+                    // Use the swapped operands encoding, to stay consistent with the output of
                    // gcc/llvm.
                    let opcode = if *size == 1 { 0x38 } else { 0x39 };
                    emit_std_reg_reg(sink, prefix, opcode, 1, *reg_e, *reg_g, rex);
                }
                RegMemImm::Mem { addr } => {
-                    let opcode = if *size == 1 { 0x3A } else { 0x3B };
+                    let addr = &addr.finalize(state);
                    // Whereas here we revert to the "normal" G-E ordering.
                    let opcode = if *size == 1 { 0x3A } else { 0x3B };
                    emit_std_reg_mem(sink, prefix, opcode, 1, *reg_g, addr, rex);
                }
@@ -849,6 +817,8 @@ pub(crate) fn emit(inst: &Inst, sink: &mut MachBuffer<Inst>) {
                    // FIXME JRS 2020Feb11: there are shorter encodings for
                    // cmp $imm, rax/eax/ax/al.
                    let use_imm8 = low8_will_sign_extend_to_32(*simm32);
                    // And also here we use the "normal" G-E ordering.
                    let opcode = if *size == 1 {
                        0x80
                    } else if use_imm8 {
@@ -857,7 +827,6 @@ pub(crate) fn emit(inst: &Inst, sink: &mut MachBuffer<Inst>) {
                        0x81
                    };
                    // And also here we use the "normal" G-E ordering.
                    let enc_g = int_reg_enc(*reg_g);
                    emit_std_enc_enc(sink, prefix, opcode, 1, 7 /*subopcode*/, enc_g, rex);
                    emit_simm(sink, if use_imm8 { 1 } else { *size }, *simm32);
@@ -865,6 +834,21 @@ pub(crate) fn emit(inst: &Inst, sink: &mut MachBuffer<Inst>) {
            }
        }
        Inst::Setcc { cc, dst } => {
            let opcode = 0x0f90 + cc.get_enc() as u32;
            let mut rex_flags = RexFlags::clear_w();
            rex_flags.always_emit();
            emit_std_enc_enc(
                sink,
                LegacyPrefix::None,
                opcode,
                2,
                0,
                reg_enc(dst.to_reg()),
                rex_flags,
            );
        }
        Inst::Push64 { src } => {
            match src {
                RegMemImm::Reg { reg } => {
@@ -877,6 +861,7 @@ pub(crate) fn emit(inst: &Inst, sink: &mut MachBuffer<Inst>) {
                }
                RegMemImm::Mem { addr } => {
                    let addr = &addr.finalize(state);
                    emit_std_enc_mem(
                        sink,
                        LegacyPrefix::None,
@@ -910,7 +895,22 @@ pub(crate) fn emit(inst: &Inst, sink: &mut MachBuffer<Inst>) {
            sink.put1(0x58 + (encDst & 7));
        }
-        Inst::CallUnknown { dest } => {
+        Inst::CallKnown {
            dest, loc, opcode, ..
        } => {
            sink.put1(0xE8);
            // The addend adjusts for the difference between the end of the instruction and the
            // beginning of the immediate field.
            sink.add_reloc(*loc, Reloc::X86CallPCRel4, &dest, -4);
            sink.put4(0);
            if opcode.is_call() {
                sink.add_call_site(*loc, *opcode);
            }
        }
        Inst::CallUnknown {
            dest, opcode, loc, ..
        } => {
            match dest {
                RegMem::Reg { reg } => {
                    let reg_enc = int_reg_enc(*reg);
@@ -926,6 +926,7 @@ pub(crate) fn emit(inst: &Inst, sink: &mut MachBuffer<Inst>) {
                }
                RegMem::Mem { addr } => {
                    let addr = &addr.finalize(state);
                    emit_std_enc_mem(
                        sink,
                        LegacyPrefix::None,
@@ -937,61 +938,61 @@ pub(crate) fn emit(inst: &Inst, sink: &mut MachBuffer<Inst>) {
                    );
                }
            }
            if opcode.is_call() {
                sink.add_call_site(*loc, *opcode);
            }
        }
        Inst::Ret {} => sink.put1(0xC3),
-        Inst::JmpKnown { dest } => {
+        Inst::JmpKnown { dst } => {
            let disp = dest.as_offset32_or_zero() - 5;
            let disp = disp as u32;
            let br_start = sink.cur_offset();
            let br_disp_off = br_start + 1;
            let br_end = br_start + 5;
-            if let Some(l) = dest.as_label() {
+            if let Some(l) = dst.as_label() {
-                sink.use_label_at_offset(br_disp_off, l, LabelUse::Rel32);
+                sink.use_label_at_offset(br_disp_off, l, LabelUse::JmpRel32);
                sink.add_uncond_branch(br_start, br_end, l);
            }
            let disp = dst.as_offset32_or_zero();
            let disp = disp as u32;
            sink.put1(0xE9);
            sink.put4(disp);
        }
-        Inst::JmpCondSymm {
+        Inst::JmpCond {
            cc,
            taken,
            not_taken,
        } => {
-            // Conditional part.
+            // If taken.
            // This insn is 6 bytes long.  Currently `offset` is relative to
            // the start of this insn, but the Intel encoding requires it to
            // be relative to the start of the next instruction.  Hence the
            // adjustment.
            let taken_disp = taken.as_offset32_or_zero() - 6;
            let taken_disp = taken_disp as u32;
            let cond_start = sink.cur_offset();
            let cond_disp_off = cond_start + 2;
            let cond_end = cond_start + 6;
            if let Some(l) = taken.as_label() {
-                sink.use_label_at_offset(cond_disp_off, l, LabelUse::Rel32);
+                sink.use_label_at_offset(cond_disp_off, l, LabelUse::JmpRel32);
                let inverted: [u8; 6] =
-                    [0x0F, 0x80 + (cc.invert().get_enc()), 0xFA, 0xFF, 0xFF, 0xFF];
+                    [0x0F, 0x80 + (cc.invert().get_enc()), 0x00, 0x00, 0x00, 0x00];
                sink.add_cond_branch(cond_start, cond_end, l, &inverted[..]);
            }
            let taken_disp = taken.as_offset32_or_zero();
            let taken_disp = taken_disp as u32;
            sink.put1(0x0F);
            sink.put1(0x80 + cc.get_enc());
            sink.put4(taken_disp);
-            // Unconditional part.
+            // If not taken.
            let nt_disp = not_taken.as_offset32_or_zero() - 5;
            let nt_disp = nt_disp as u32;
            let uncond_start = sink.cur_offset();
            let uncond_disp_off = uncond_start + 1;
            let uncond_end = uncond_start + 5;
            if let Some(l) = not_taken.as_label() {
-                sink.use_label_at_offset(uncond_disp_off, l, LabelUse::Rel32);
+                sink.use_label_at_offset(uncond_disp_off, l, LabelUse::JmpRel32);
                sink.add_uncond_branch(uncond_start, uncond_end, l);
            }
            let nt_disp = not_taken.as_offset32_or_zero();
            let nt_disp = nt_disp as u32;
            sink.put1(0xE9);
            sink.put4(nt_disp);
        }
@@ -1012,6 +1013,7 @@ pub(crate) fn emit(inst: &Inst, sink: &mut MachBuffer<Inst>) {
                }
                RegMem::Mem { addr } => {
                    let addr = &addr.finalize(state);
                    emit_std_enc_mem(
                        sink,
                        LegacyPrefix::None,
@@ -1045,6 +1047,7 @@ pub(crate) fn emit(inst: &Inst, sink: &mut MachBuffer<Inst>) {
                }
                RegMem::Mem { addr } => {
                    let addr = &addr.finalize(state);
                    emit_std_reg_mem(sink, prefix, opcode, 2, reg_g.to_reg(), addr, rex);
                }
            }
@@ -1074,11 +1077,33 @@ pub(crate) fn emit(inst: &Inst, sink: &mut MachBuffer<Inst>) {
                }
                RegMem::Mem { addr } => {
                    let addr = &addr.finalize(state);
                    emit_std_reg_mem(sink, prefix, opcode, 2, reg_g.to_reg(), addr, rex);
                }
            }
        }
-        _ => panic!("x64_emit: unhandled: {} ", inst.show_rru(None)),
+        Inst::Hlt => {
            sink.put1(0xcc);
        }
        Inst::Ud2 { trap_info } => {
            sink.add_trap(trap_info.0, trap_info.1);
            sink.put1(0x0f);
            sink.put1(0x0b);
        }
        Inst::VirtualSPOffsetAdj { offset } => {
            debug!(
                "virtual sp offset adjusted by {} -> {}",
                offset,
                state.virtual_sp_offset + offset
            );
            state.virtual_sp_offset += offset;
        }
        Inst::Nop { .. } | Inst::EpiloguePlaceholder => {
            // Generate no code.
        }
    }
 }
--- a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
--- a/cranelift/codegen/src/isa/x64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/x64/inst/mod.rs
@@ -4,19 +4,17 @@
 #![allow(non_snake_case)]
 #![allow(non_camel_case_types)]
-use core::convert::TryFrom;
+use alloc::vec::Vec;
 use smallvec::SmallVec;
 use std::fmt;
 use std::string::{String, ToString};
 use regalloc::RegUsageCollector;
 use regalloc::Set;
 use regalloc::{RealRegUniverse, Reg, RegClass, RegUsageMapper, SpillSlot, VirtualReg, Writable};
 use crate::binemit::CodeOffset;
 use crate::ir::types::{B1, B128, B16, B32, B64, B8, F32, F64, I128, I16, I32, I64, I8};
-use crate::ir::ExternalName;
+use crate::ir::{ExternalName, Opcode, SourceLoc, TrapCode, Type};
 use crate::ir::Type;
 use crate::machinst::*;
 use crate::settings::Flags;
 use crate::{settings, CodegenError, CodegenResult};
@@ -37,11 +35,13 @@ use regs::{create_reg_universe_systemv, show_ireg_sized};
 /// Instructions.  Destinations are on the RIGHT (a la AT&T syntax).
 #[derive(Clone)]
-pub(crate) enum Inst {
+pub enum Inst {
    /// nops of various sizes, including zero
    Nop { len: u8 },
-    /// (add sub and or xor mul adc? sbb?) (32 64) (reg addr imm) reg
+    // =====================================
    // Integer instructions.
    /// Integer arithmetic/bit-twiddling: (add sub and or xor mul adc? sbb?) (32 64) (reg addr imm) reg
    Alu_RMI_R {
        is_64: bool,
        op: AluRmiROpcode,
@@ -49,49 +49,57 @@ pub(crate) enum Inst {
        dst: Writable<Reg>,
    },
-    /// (imm32 imm64) reg.
+    /// Constant materialization: (imm32 imm64) reg.
-    /// Either: movl $imm32, %reg32 or movabsq $imm64, %reg32
+    /// Either: movl $imm32, %reg32 or movabsq $imm64, %reg32.
    Imm_R {
        dst_is_64: bool,
        simm64: u64,
        dst: Writable<Reg>,
    },
-    /// mov (64 32) reg reg
+    /// GPR to GPR move: mov (64 32) reg reg.
    Mov_R_R {
        is_64: bool,
        src: Reg,
        dst: Writable<Reg>,
    },
-    /// movz (bl bq wl wq lq) addr reg (good for all ZX loads except 64->64).
+    /// Zero-extended loads, except for 64 bits: movz (bl bq wl wq lq) addr reg.
-    /// Note that the lq variant doesn't really exist since the default
+    /// Note that the lq variant doesn't really exist since the default zero-extend rule makes it
-    /// zero-extend rule makes it unnecessary.  For that case we emit the
+    /// unnecessary. For that case we emit the equivalent "movl AM, reg32".
-    /// equivalent "movl AM, reg32".
+    MovZX_RM_R {
-    MovZX_M_R {
+        ext_mode: ExtMode,
-        extMode: ExtMode,
+        src: RegMem,
        addr: Addr,
        dst: Writable<Reg>,
    },
-    /// A plain 64-bit integer load, since MovZX_M_R can't represent that
+    /// A plain 64-bit integer load, since MovZX_RM_R can't represent that.
-    Mov64_M_R { addr: Addr, dst: Writable<Reg> },
+    Mov64_M_R {
-
+        src: SyntheticAmode,
    /// movs (bl bq wl wq lq) addr reg (good for all SX loads)
    MovSX_M_R {
        extMode: ExtMode,
        addr: Addr,
        dst: Writable<Reg>,
    },
-    /// mov (b w l q) reg addr (good for all integer stores)
+    /// Loads the memory address of addr into dst.
    LoadEffectiveAddress {
        addr: SyntheticAmode,
        dst: Writable<Reg>,
    },
    /// Sign-extended loads and moves: movs (bl bq wl wq lq) addr reg.
    MovSX_RM_R {
        ext_mode: ExtMode,
        src: RegMem,
        dst: Writable<Reg>,
    },
    /// Integer stores: mov (b w l q) reg addr.
    Mov_R_M {
-        size: u8, // 1, 2, 4 or 8
+        size: u8, // 1, 2, 4 or 8.
        src: Reg,
-        addr: Addr,
+        dst: SyntheticAmode,
    },
-    /// (shl shr sar) (l q) imm reg
+    /// Arithmetic shifts: (shl shr sar) (l q) imm reg.
    Shift_R {
        is_64: bool,
        kind: ShiftKind,
@@ -100,75 +108,95 @@ pub(crate) enum Inst {
        dst: Writable<Reg>,
    },
-    /// cmp (b w l q) (reg addr imm) reg
+    /// Integer comparisons/tests: cmp (b w l q) (reg addr imm) reg.
    Cmp_RMI_R {
        size: u8, // 1, 2, 4 or 8
        src: RegMemImm,
        dst: Reg,
    },
    /// Materializes the requested condition code in the destination reg.
    Setcc { cc: CC, dst: Writable<Reg> },
    // =====================================
    // Stack manipulation.
    /// pushq (reg addr imm)
    Push64 { src: RegMemImm },
    /// popq reg
    Pop64 { dst: Writable<Reg> },
-    /// call simm32
+    // =====================================
-    CallKnown {
+    // Floating-point operations.
-        dest: ExternalName,
+    /// Float arithmetic/bit-twiddling: (add sub and or xor mul adc? sbb?) (32 64) (reg addr) reg
-        uses: Set<Reg>,
+    XMM_RM_R {
-        defs: Set<Writable<Reg>>,
+        op: SseOpcode,
        src: RegMem,
        dst: Writable<Reg>,
    },
    /// callq (reg mem)
    CallUnknown {
        dest: RegMem,
        //uses: Set<Reg>,
        //defs: Set<Writable<Reg>>,
    },
    // ---- branches (exactly one must appear at end of BB) ----
    /// ret
    Ret,
    /// A placeholder instruction, generating no code, meaning that a function epilogue must be
    /// inserted there.
    EpiloguePlaceholder,
    /// jmp simm32
    JmpKnown { dest: BranchTarget },
    /// jcond cond target target
    /// Symmetrical two-way conditional branch.
    /// Emitted as a compound sequence; the MachBuffer will shrink it
    /// as appropriate.
    JmpCondSymm {
        cc: CC,
        taken: BranchTarget,
        not_taken: BranchTarget,
    },
    /// jmpq (reg mem)
    JmpUnknown { target: RegMem },
    /// mov between XMM registers (32 64) (reg addr) reg
-    /// XMM_MOV_RM_R differs from XMM_RM_R in that the dst
+    /// XMM_MOV_RM_R differs from XMM_RM_R in that the dst register of XMM_MOV_RM_R is not used in
-    /// register of XMM_MOV_RM_R is not used in the computation
+    /// the computation of the instruction dst value and so does not have to be a previously valid
-    /// of the instruction dst value and so does not have to
+    /// value. This is characteristic of mov instructions.
    /// be a previously valid value. This is characteristic of
    /// mov instructions.
    XMM_MOV_RM_R {
        op: SseOpcode,
        src: RegMem,
        dst: Writable<Reg>,
    },
-    /// (add sub and or xor mul adc? sbb?) (32 64) (reg addr imm) reg
+    // =====================================
-    XMM_RM_R {
+    // Control flow instructions.
-        op: SseOpcode,
+    /// Direct call: call simm32.
-        src: RegMem,
+    CallKnown {
-        dst: Writable<Reg>,
+        dest: ExternalName,
        uses: Vec<Reg>,
        defs: Vec<Writable<Reg>>,
        loc: SourceLoc,
        opcode: Opcode,
    },
    /// Indirect call: callq (reg mem).
    CallUnknown {
        dest: RegMem,
        uses: Vec<Reg>,
        defs: Vec<Writable<Reg>>,
        loc: SourceLoc,
        opcode: Opcode,
    },
    /// Return.
    Ret,
    /// A placeholder instruction, generating no code, meaning that a function epilogue must be
    /// inserted there.
    EpiloguePlaceholder,
    /// Jump to a known target: jmp simm32.
    JmpKnown { dst: BranchTarget },
    /// Two-way conditional branch: jcond cond target target.
    /// Emitted as a compound sequence; the MachBuffer will shrink it as appropriate.
    JmpCond {
        cc: CC,
        taken: BranchTarget,
        not_taken: BranchTarget,
    },
    /// Indirect jump: jmpq (reg mem).
    JmpUnknown { target: RegMem },
    /// A debug trap.
    Hlt,
    /// An instruction that will always trigger the illegal instruction exception.
    Ud2 { trap_info: (SourceLoc, TrapCode) },
    // =====================================
    // Meta-instructions generating no code.
    /// Marker, no-op in generated code: SP "virtual offset" is adjusted. This
    /// controls how MemArg::NominalSPOffset args are lowered.
    VirtualSPOffsetAdj { offset: i64 },
 }
 // Handy constructors for Insts.
@@ -229,29 +257,44 @@ impl Inst {
        Inst::XMM_RM_R { op, src, dst }
    }
-    pub(crate) fn movzx_m_r(extMode: ExtMode, addr: Addr, dst: Writable<Reg>) -> Inst {
+    pub(crate) fn movzx_rm_r(ext_mode: ExtMode, src: RegMem, dst: Writable<Reg>) -> Inst {
        debug_assert!(dst.to_reg().get_class() == RegClass::I64);
-        Inst::MovZX_M_R { extMode, addr, dst }
+        Inst::MovZX_RM_R { ext_mode, src, dst }
    }
-    pub(crate) fn mov64_m_r(addr: Addr, dst: Writable<Reg>) -> Inst {
+    pub(crate) fn mov64_m_r(src: impl Into<SyntheticAmode>, dst: Writable<Reg>) -> Inst {
        debug_assert!(dst.to_reg().get_class() == RegClass::I64);
-        Inst::Mov64_M_R { addr, dst }
+        Inst::Mov64_M_R {
            src: src.into(),
            dst,
        }
    }
-    pub(crate) fn movsx_m_r(extMode: ExtMode, addr: Addr, dst: Writable<Reg>) -> Inst {
+    pub(crate) fn movsx_rm_r(ext_mode: ExtMode, src: RegMem, dst: Writable<Reg>) -> Inst {
        debug_assert!(dst.to_reg().get_class() == RegClass::I64);
-        Inst::MovSX_M_R { extMode, addr, dst }
+        Inst::MovSX_RM_R { ext_mode, src, dst }
    }
    pub(crate) fn mov_r_m(
        size: u8, // 1, 2, 4 or 8
        src: Reg,
-        addr: Addr,
+        dst: impl Into<SyntheticAmode>,
    ) -> Inst {
        debug_assert!(size == 8 || size == 4 || size == 2 || size == 1);
        debug_assert!(src.get_class() == RegClass::I64);
-        Inst::Mov_R_M { size, src, addr }
+        Inst::Mov_R_M {
            size,
            src,
            dst: dst.into(),
        }
    }
    pub(crate) fn lea(addr: impl Into<SyntheticAmode>, dst: Writable<Reg>) -> Inst {
        debug_assert!(dst.to_reg().get_class() == RegClass::I64);
        Inst::LoadEffectiveAddress {
            addr: addr.into(),
            dst,
        }
    }
    pub(crate) fn shift_r(
@@ -274,6 +317,8 @@ impl Inst {
        }
    }
    /// Does a comparison of dst - src for operands of size `size`, as stated by the machine
    /// instruction semantics. Be careful with the order of parameters!
    pub(crate) fn cmp_rmi_r(
        size: u8, // 1, 2, 4 or 8
        src: RegMemImm,
@@ -284,6 +329,11 @@ impl Inst {
        Inst::Cmp_RMI_R { size, src, dst }
    }
    pub(crate) fn setcc(cc: CC, dst: Writable<Reg>) -> Inst {
        debug_assert!(dst.to_reg().get_class() == RegClass::I64);
        Inst::Setcc { cc, dst }
    }
    pub(crate) fn push64(src: RegMemImm) -> Inst {
        Inst::Push64 { src }
    }
@@ -292,8 +342,36 @@ impl Inst {
        Inst::Pop64 { dst }
    }
-    pub(crate) fn call_unknown(dest: RegMem) -> Inst {
+    pub(crate) fn call_known(
-        Inst::CallUnknown { dest }
+        dest: ExternalName,
        uses: Vec<Reg>,
        defs: Vec<Writable<Reg>>,
        loc: SourceLoc,
        opcode: Opcode,
    ) -> Inst {
        Inst::CallKnown {
            dest,
            uses,
            defs,
            loc,
            opcode,
        }
    }
    pub(crate) fn call_unknown(
        dest: RegMem,
        uses: Vec<Reg>,
        defs: Vec<Writable<Reg>>,
        loc: SourceLoc,
        opcode: Opcode,
    ) -> Inst {
        Inst::CallUnknown {
            dest,
            uses,
            defs,
            loc,
            opcode,
        }
    }
    pub(crate) fn ret() -> Inst {
@@ -304,12 +382,12 @@ impl Inst {
        Inst::EpiloguePlaceholder
    }
-    pub(crate) fn jmp_known(dest: BranchTarget) -> Inst {
+    pub(crate) fn jmp_known(dst: BranchTarget) -> Inst {
-        Inst::JmpKnown { dest }
+        Inst::JmpKnown { dst }
    }
-    pub(crate) fn jmp_cond_symm(cc: CC, taken: BranchTarget, not_taken: BranchTarget) -> Inst {
+    pub(crate) fn jmp_cond(cc: CC, taken: BranchTarget, not_taken: BranchTarget) -> Inst {
-        Inst::JmpCondSymm {
+        Inst::JmpCond {
            cc,
            taken,
            not_taken,
@@ -414,40 +492,46 @@ impl ShowWithRRU for Inst {
                show_ireg_sized(*src, mb_rru, sizeLQ(*is_64)),
                show_ireg_sized(dst.to_reg(), mb_rru, sizeLQ(*is_64))
            ),
-            Inst::MovZX_M_R { extMode, addr, dst } => {
+            Inst::MovZX_RM_R { ext_mode, src, dst } => {
-                if *extMode == ExtMode::LQ {
+                if *ext_mode == ExtMode::LQ {
                    format!(
                        "{} {}, {}",
                        ljustify("movl".to_string()),
-                        addr.show_rru(mb_rru),
+                        src.show_rru_sized(mb_rru, ext_mode.src_size()),
                        show_ireg_sized(dst.to_reg(), mb_rru, 4)
                    )
                } else {
                    format!(
                        "{} {}, {}",
-                        ljustify2("movz".to_string(), extMode.to_string()),
+                        ljustify2("movz".to_string(), ext_mode.to_string()),
-                        addr.show_rru(mb_rru),
+                        src.show_rru_sized(mb_rru, ext_mode.src_size()),
-                        show_ireg_sized(dst.to_reg(), mb_rru, extMode.dst_size())
+                        show_ireg_sized(dst.to_reg(), mb_rru, ext_mode.dst_size())
                    )
                }
            }
-            Inst::Mov64_M_R { addr, dst } => format!(
+            Inst::Mov64_M_R { src, dst } => format!(
                "{} {}, {}",
                ljustify("movq".to_string()),
                src.show_rru(mb_rru),
                dst.show_rru(mb_rru)
            ),
            Inst::LoadEffectiveAddress { addr, dst } => format!(
                "{} {}, {}",
                ljustify("lea".to_string()),
                addr.show_rru(mb_rru),
                dst.show_rru(mb_rru)
            ),
-            Inst::MovSX_M_R { extMode, addr, dst } => format!(
+            Inst::MovSX_RM_R { ext_mode, src, dst } => format!(
                "{} {}, {}",
-                ljustify2("movs".to_string(), extMode.to_string()),
+                ljustify2("movs".to_string(), ext_mode.to_string()),
-                addr.show_rru(mb_rru),
+                src.show_rru_sized(mb_rru, ext_mode.src_size()),
-                show_ireg_sized(dst.to_reg(), mb_rru, extMode.dst_size())
+                show_ireg_sized(dst.to_reg(), mb_rru, ext_mode.dst_size())
            ),
-            Inst::Mov_R_M { size, src, addr } => format!(
+            Inst::Mov_R_M { size, src, dst } => format!(
                "{} {}, {}",
                ljustify2("mov".to_string(), suffixBWLQ(*size)),
                show_ireg_sized(*src, mb_rru, *size),
-                addr.show_rru(mb_rru)
+                dst.show_rru(mb_rru)
            ),
            Inst::Shift_R {
                is_64,
@@ -474,25 +558,29 @@ impl ShowWithRRU for Inst {
                src.show_rru_sized(mb_rru, *size),
                show_ireg_sized(*dst, mb_rru, *size)
            ),
            Inst::Setcc { cc, dst } => format!(
                "{} {}",
                ljustify2("set".to_string(), cc.to_string()),
                show_ireg_sized(dst.to_reg(), mb_rru, 1)
            ),
            Inst::Push64 { src } => {
                format!("{} {}", ljustify("pushq".to_string()), src.show_rru(mb_rru))
            }
            Inst::Pop64 { dst } => {
                format!("{} {}", ljustify("popq".to_string()), dst.show_rru(mb_rru))
            }
-            //Inst::CallKnown { target } => format!("{} {:?}", ljustify("call".to_string()), target),
+            Inst::CallKnown { dest, .. } => format!("{} {:?}", ljustify("call".to_string()), dest),
-            Inst::CallKnown { .. } => "**CallKnown**".to_string(),
+            Inst::CallUnknown { dest, .. } => format!(
            Inst::CallUnknown { dest } => format!(
                "{} *{}",
                ljustify("call".to_string()),
                dest.show_rru(mb_rru)
            ),
            Inst::Ret => "ret".to_string(),
            Inst::EpiloguePlaceholder => "epilogue placeholder".to_string(),
-            Inst::JmpKnown { dest } => {
+            Inst::JmpKnown { dst } => {
-                format!("{} {}", ljustify("jmp".to_string()), dest.show_rru(mb_rru))
+                format!("{} {}", ljustify("jmp".to_string()), dst.show_rru(mb_rru))
            }
-            Inst::JmpCondSymm {
+            Inst::JmpCond {
                cc,
                taken,
                not_taken,
@@ -508,6 +596,9 @@ impl ShowWithRRU for Inst {
                ljustify("jmp".to_string()),
                target.show_rru(mb_rru)
            ),
            Inst::VirtualSPOffsetAdj { offset } => format!("virtual_sp_offset_adjust {}", offset),
            Inst::Hlt => "hlt".into(),
            Inst::Ud2 { trap_info } => format!("ud2 {}", trap_info.1),
        }
    }
 }
@@ -526,7 +617,6 @@ fn x64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
    // regalloc.rs will "fix" this for us by removing the modified set from the use and def
    // sets.
    match inst {
        // ** Nop
        Inst::Alu_RMI_R {
            is_64: _,
            op: _,
@@ -544,40 +634,28 @@ fn x64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
            src.get_regs_as_uses(collector);
            collector.add_mod(*dst);
        }
-        Inst::Imm_R {
+        Inst::Imm_R { dst, .. } => {
            dst_is_64: _,
            simm64: _,
            dst,
        } => {
            collector.add_def(*dst);
        }
-        Inst::Mov_R_R { is_64: _, src, dst } => {
+        Inst::Mov_R_R { src, dst, .. } => {
            collector.add_use(*src);
            collector.add_def(*dst);
        }
-        Inst::MovZX_M_R {
+        Inst::MovZX_RM_R { src, dst, .. } => {
-            extMode: _,
+            src.get_regs_as_uses(collector);
            addr,
            dst,
        } => {
            addr.get_regs_as_uses(collector);
            collector.add_def(*dst);
        }
-        Inst::Mov64_M_R { addr, dst } => {
+        Inst::Mov64_M_R { src, dst } | Inst::LoadEffectiveAddress { addr: src, dst } => {
-            addr.get_regs_as_uses(collector);
+            src.get_regs_as_uses(collector);
            collector.add_def(*dst)
        }
        Inst::MovSX_RM_R { src, dst, .. } => {
            src.get_regs_as_uses(collector);
            collector.add_def(*dst);
        }
-        Inst::MovSX_M_R {
+        Inst::Mov_R_M { src, dst, .. } => {
            extMode: _,
            addr,
            dst,
        } => {
            addr.get_regs_as_uses(collector);
            collector.add_def(*dst);
        }
        Inst::Mov_R_M { size: _, src, addr } => {
            collector.add_use(*src);
-            addr.get_regs_as_uses(collector);
+            dst.get_regs_as_uses(collector);
        }
        Inst::Shift_R {
            is_64: _,
@@ -594,6 +672,9 @@ fn x64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
            src.get_regs_as_uses(collector);
            collector.add_use(*dst); // yes, really `add_use`
        }
        Inst::Setcc { dst, .. } => {
            collector.add_def(*dst);
        }
        Inst::Push64 { src } => {
            src.get_regs_as_uses(collector);
            collector.add_mod(Writable::from_reg(regs::rsp()));
@@ -601,29 +682,36 @@ fn x64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
        Inst::Pop64 { dst } => {
            collector.add_def(*dst);
        }
        Inst::CallKnown {
-            dest: _,
+            ref uses, ref defs, ..
            uses: _,
            defs: _,
        } => {
-            // FIXME add arg regs (iru.used) and caller-saved regs (iru.defined)
+            collector.add_uses(uses);
-            unimplemented!();
+            collector.add_defs(defs);
        }
-        Inst::CallUnknown { dest } => {
+
        Inst::CallUnknown {
            ref uses,
            ref defs,
            dest,
            ..
        } => {
            collector.add_uses(uses);
            collector.add_defs(defs);
            dest.get_regs_as_uses(collector);
        }
-        Inst::Ret => {}
+
-        Inst::EpiloguePlaceholder => {}
+        Inst::Ret
-        Inst::JmpKnown { dest: _ } => {}
+        | Inst::EpiloguePlaceholder
-        Inst::JmpCondSymm {
+        | Inst::JmpKnown { .. }
-            cc: _,
+        | Inst::JmpCond { .. }
-            taken: _,
+        | Inst::Nop { .. }
-            not_taken: _,
+        | Inst::JmpUnknown { .. }
-        } => {}
+        | Inst::VirtualSPOffsetAdj { .. }
-        //Inst::JmpUnknown { target } => {
+        | Inst::Hlt
-        //    target.get_regs_as_uses(collector);
+        | Inst::Ud2 { .. } => {
-        //}
+            // No registers are used.
-        Inst::Nop { .. } | Inst::JmpUnknown { .. } => unimplemented!("x64_get_regs inst"),
+        }
    }
 }
@@ -631,34 +719,34 @@ fn x64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
 // Instructions and subcomponents: map_regs
 fn map_use<RUM: RegUsageMapper>(m: &RUM, r: &mut Reg) {
-    if r.is_virtual() {
+    if let Some(reg) = r.as_virtual_reg() {
-        let new = m.get_use(r.to_virtual_reg()).unwrap().to_reg();
+        let new = m.get_use(reg).unwrap().to_reg();
        *r = new;
    }
 }
 fn map_def<RUM: RegUsageMapper>(m: &RUM, r: &mut Writable<Reg>) {
-    if r.to_reg().is_virtual() {
+    if let Some(reg) = r.to_reg().as_virtual_reg() {
-        let new = m.get_def(r.to_reg().to_virtual_reg()).unwrap().to_reg();
+        let new = m.get_def(reg).unwrap().to_reg();
        *r = Writable::from_reg(new);
    }
 }
 fn map_mod<RUM: RegUsageMapper>(m: &RUM, r: &mut Writable<Reg>) {
-    if r.to_reg().is_virtual() {
+    if let Some(reg) = r.to_reg().as_virtual_reg() {
-        let new = m.get_mod(r.to_reg().to_virtual_reg()).unwrap().to_reg();
+        let new = m.get_mod(reg).unwrap().to_reg();
        *r = Writable::from_reg(new);
    }
 }
-impl Addr {
+impl Amode {
    fn map_uses<RUM: RegUsageMapper>(&mut self, map: &RUM) {
        match self {
-            Addr::ImmReg {
+            Amode::ImmReg {
                simm32: _,
                ref mut base,
            } => map_use(map, base),
-            Addr::ImmRegRegShift {
+            Amode::ImmRegRegShift {
                simm32: _,
                ref mut base,
                ref mut index,
@@ -732,33 +820,33 @@ fn x64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
            map_use(mapper, src);
            map_def(mapper, dst);
        }
-        Inst::MovZX_M_R {
+        Inst::MovZX_RM_R {
-            extMode: _,
+            ref mut src,
            ref mut addr,
            ref mut dst,
            ..
        } => {
-            addr.map_uses(mapper);
+            src.map_uses(mapper);
            map_def(mapper, dst);
        }
-        Inst::Mov64_M_R { addr, dst } => {
+        Inst::Mov64_M_R { src, dst } | Inst::LoadEffectiveAddress { addr: src, dst } => {
-            addr.map_uses(mapper);
+            src.map_uses(mapper);
            map_def(mapper, dst);
        }
-        Inst::MovSX_M_R {
+        Inst::MovSX_RM_R {
-            extMode: _,
+            ref mut src,
            ref mut addr,
            ref mut dst,
            ..
        } => {
-            addr.map_uses(mapper);
+            src.map_uses(mapper);
            map_def(mapper, dst);
        }
        Inst::Mov_R_M {
            size: _,
            ref mut src,
-            ref mut addr,
+            ref mut dst,
            ..
        } => {
            map_use(mapper, src);
-            addr.map_uses(mapper);
+            dst.map_uses(mapper);
        }
        Inst::Shift_R {
            is_64: _,
@@ -776,28 +864,51 @@ fn x64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
            src.map_uses(mapper);
            map_use(mapper, dst);
        }
        Inst::Setcc { ref mut dst, .. } => map_def(mapper, dst),
        Inst::Push64 { ref mut src } => src.map_uses(mapper),
        Inst::Pop64 { ref mut dst } => {
            map_def(mapper, dst);
        }
        Inst::CallKnown {
-            dest: _,
+            ref mut uses,
-            uses: _,
+            ref mut defs,
-            defs: _,
+            ..
-        } => {}
+        } => {
-        Inst::CallUnknown { dest } => dest.map_uses(mapper),
+            for r in uses.iter_mut() {
-        Inst::Ret => {}
+                map_use(mapper, r);
-        Inst::EpiloguePlaceholder => {}
+            }
-        Inst::JmpKnown { dest: _ } => {}
+            for r in defs.iter_mut() {
-        Inst::JmpCondSymm {
+                map_def(mapper, r);
-            cc: _,
+            }
-            taken: _,
+        }
-            not_taken: _,
+
-        } => {}
+        Inst::CallUnknown {
-        //Inst::JmpUnknown { target } => {
+            ref mut uses,
-        //    target.apply_map(mapper);
+            ref mut defs,
-        //}
+            ref mut dest,
-        Inst::Nop { .. } | Inst::JmpUnknown { .. } => unimplemented!("x64_map_regs opcode"),
+            ..
        } => {
            for r in uses.iter_mut() {
                map_use(mapper, r);
            }
            for r in defs.iter_mut() {
                map_def(mapper, r);
            }
            dest.map_uses(mapper);
        }
        Inst::Ret
        | Inst::EpiloguePlaceholder
        | Inst::JmpKnown { .. }
        | Inst::JmpCond { .. }
        | Inst::Nop { .. }
        | Inst::JmpUnknown { .. }
        | Inst::VirtualSPOffsetAdj { .. }
        | Inst::Ud2 { .. }
        | Inst::Hlt => {
            // No registers are used.
        }
    }
 }
@@ -847,8 +958,8 @@ impl MachInst for Inst {
        match self {
            // Interesting cases.
            &Self::Ret | &Self::EpiloguePlaceholder => MachTerminator::Ret,
-            &Self::JmpKnown { dest } => MachTerminator::Uncond(dest.as_label().unwrap()),
+            &Self::JmpKnown { dst } => MachTerminator::Uncond(dst.as_label().unwrap()),
-            &Self::JmpCondSymm {
+            &Self::JmpCond {
                cc: _,
                taken,
                not_taken,
@@ -875,7 +986,7 @@ impl MachInst for Inst {
    }
    fn gen_zero_len_nop() -> Inst {
-        unimplemented!()
+        Inst::Nop { len: 0 }
    }
    fn gen_nop(_preferred_size: usize) -> Inst {
@@ -919,20 +1030,27 @@ impl MachInst for Inst {
    type LabelUse = LabelUse;
 }
-impl MachInstEmit for Inst {
+/// State carried between emissions of a sequence of instructions.
-    type State = ();
+#[derive(Default, Clone, Debug)]
 pub struct EmitState {
    /// Running virtual SP offset. Emitting an `Inst::VirtualSPOffsetAdj` adds that
    /// instruction's `offset` to this value and generates no machine code.
    virtual_sp_offset: i64,
 }
-    fn emit(&self, sink: &mut MachBuffer<Inst>, _flags: &settings::Flags, _: &mut Self::State) {
+impl MachInstEmit for Inst {
-        emit::emit(self, sink);
+    type State = EmitState;
    fn emit(&self, sink: &mut MachBuffer<Inst>, flags: &settings::Flags, state: &mut Self::State) {
        emit::emit(self, sink, flags, state);
    }
 }
 /// A label-use (internal relocation) in generated code.
 #[derive(Clone, Copy, Debug, PartialEq, Eq)]
-pub(crate) enum LabelUse {
+pub enum LabelUse {
-    /// A 32-bit offset from location of relocation itself, added to the
+    /// A 32-bit offset from location of relocation itself, added to the existing value at that
-    /// existing value at that location.
+    /// location. Used for control flow instructions which consider an offset from the start of the
-    Rel32,
+    /// next instruction (so the size of the payload -- 4 bytes -- is subtracted from the payload).
    JmpRel32,
 }
 impl MachInstLabelUse for LabelUse {
@@ -940,30 +1058,31 @@ impl MachInstLabelUse for LabelUse {
    fn max_pos_range(self) -> CodeOffset {
        match self {
-            LabelUse::Rel32 => 0x7fff_ffff,
+            LabelUse::JmpRel32 => 0x7fff_ffff,
        }
    }
    fn max_neg_range(self) -> CodeOffset {
        match self {
-            LabelUse::Rel32 => 0x8000_0000,
+            LabelUse::JmpRel32 => 0x8000_0000,
        }
    }
    fn patch_size(self) -> CodeOffset {
        match self {
-            LabelUse::Rel32 => 4,
+            LabelUse::JmpRel32 => 4,
        }
    }
    fn patch(self, buffer: &mut [u8], use_offset: CodeOffset, label_offset: CodeOffset) {
        let pc_rel = (label_offset as i64) - (use_offset as i64);
        debug_assert!(pc_rel <= self.max_pos_range() as i64);
        debug_assert!(pc_rel >= -(self.max_neg_range() as i64));
        let pc_rel = pc_rel as u32;
        match self {
-            LabelUse::Rel32 => {
+            LabelUse::JmpRel32 => {
-                let addend = i32::from_le_bytes([buffer[0], buffer[1], buffer[2], buffer[3]]);
+                let addend = u32::from_le_bytes([buffer[0], buffer[1], buffer[2], buffer[3]]);
-                let value = i32::try_from(label_offset)
+                let value = pc_rel.wrapping_add(addend).wrapping_sub(4);
                    .unwrap()
                    .wrapping_sub(i32::try_from(use_offset).unwrap())
                    .wrapping_add(addend);
                buffer.copy_from_slice(&value.to_le_bytes()[..]);
            }
        }
@@ -971,20 +1090,20 @@ impl MachInstLabelUse for LabelUse {
    fn supports_veneer(self) -> bool {
        match self {
-            LabelUse::Rel32 => false,
+            LabelUse::JmpRel32 => false,
        }
    }
    fn veneer_size(self) -> CodeOffset {
        match self {
-            LabelUse::Rel32 => 0,
+            LabelUse::JmpRel32 => 0,
        }
    }
    fn generate_veneer(self, _: &mut [u8], _: CodeOffset) -> (CodeOffset, LabelUse) {
        match self {
-            LabelUse::Rel32 => {
+            LabelUse::JmpRel32 => {
-                panic!("Veneer not supported for Rel32 label-use.");
+                panic!("Veneer not supported for JumpRel32 label-use.");
            }
        }
    }
--- a/cranelift/codegen/src/isa/x64/lower.rs
+++ b/cranelift/codegen/src/isa/x64/lower.rs
@@ -1,20 +1,22 @@
 //! Lowering rules for X64.
 #![allow(dead_code)]
 #![allow(non_snake_case)]
 use log::trace;
 use regalloc::{Reg, RegClass, Writable};
 use smallvec::SmallVec;
 use std::convert::TryFrom;
 use crate::ir::types;
 use crate::ir::types::*;
 use crate::ir::Inst as IRInst;
-use crate::ir::{condcodes::IntCC, InstructionData, Opcode, Type};
+use crate::ir::{condcodes::IntCC, InstructionData, Opcode, TrapCode, Type};
 use crate::machinst::lower::*;
 use crate::machinst::*;
 use crate::result::CodegenResult;
 use crate::isa::x64::abi::*;
 use crate::isa::x64::inst::args::*;
 use crate::isa::x64::inst::*;
 use crate::isa::x64::X64Backend;
@@ -32,6 +34,20 @@ fn is_int_ty(ty: Type) -> bool {
    }
 }
 /// Returns `true` when `ty` is one of the boolean types `B1`, `B8`, `B16`, `B32` or `B64`.
 fn is_bool_ty(ty: Type) -> bool {
    let bool_types = [types::B1, types::B8, types::B16, types::B32, types::B64];
    bool_types.contains(&ty)
 }
 /// Returns `true` when `ty` is one of the scalar floating-point types `F32` or `F64`.
 fn is_float_ty(ty: Type) -> bool {
    ty == types::F32 || ty == types::F64
 }
 fn int_ty_is_64(ty: Type) -> bool {
    match ty {
        types::I8 | types::I16 | types::I32 => false,
@@ -48,30 +64,18 @@ fn flt_ty_is_64(ty: Type) -> bool {
    }
 }
-fn int_ty_to_sizeB(ty: Type) -> u8 {
+fn iri_to_u64_imm(ctx: Ctx, inst: IRInst) -> Option<u64> {
-    match ty {
+    ctx.get_constant(inst)
        types::I8 => 1,
        types::I16 => 2,
        types::I32 => 4,
        types::I64 => 8,
        _ => panic!("ity_to_sizeB"),
    }
 }
-fn iri_to_u64_immediate<'a>(ctx: Ctx<'a>, iri: IRInst) -> Option<u64> {
+fn inst_trapcode(data: &InstructionData) -> Option<TrapCode> {
-    let inst_data = ctx.data(iri);
+    match data {
-    if inst_data.opcode() == Opcode::Null {
+        &InstructionData::Trap { code, .. }
-        Some(0)
+        | &InstructionData::CondTrap { code, .. }
-    } else {
+        | &InstructionData::IntCondTrap { code, .. }
-        match inst_data {
+        | &InstructionData::FloatCondTrap { code, .. } => Some(code),
            &InstructionData::UnaryImm { opcode: _, imm } => {
                // Only has Into for i64; we use u64 elsewhere, so we cast.
                let imm: i64 = imm.into();
                Some(imm as u64)
            }
        _ => None,
    }
    }
 }
 fn inst_condcode(data: &InstructionData) -> IntCC {
@@ -87,36 +91,88 @@ fn inst_condcode(data: &InstructionData) -> IntCC {
    }
 }
-fn input_to_reg<'a>(ctx: Ctx<'a>, iri: IRInst, input: usize) -> Reg {
+fn ldst_offset(data: &InstructionData) -> Option<i32> {
-    let inputs = ctx.get_input(iri, input);
+    match data {
        &InstructionData::Load { offset, .. }
        | &InstructionData::StackLoad { offset, .. }
        | &InstructionData::LoadComplex { offset, .. }
        | &InstructionData::Store { offset, .. }
        | &InstructionData::StackStore { offset, .. }
        | &InstructionData::StoreComplex { offset, .. } => Some(offset.into()),
        _ => None,
    }
 }
 /// Identifier for a particular input of an instruction.
 ///
 /// Pairs an IR instruction with the index of one of its inputs, so that helpers can take a
 /// single value instead of separate `(insn, index)` arguments.
 #[derive(Clone, Copy, Debug, PartialEq, Eq)]
 struct InsnInput {
    /// The IR instruction that owns the input.
    insn: IRInst,
    /// Index of the input on `insn`, as passed to the lowering context's `get_input`.
    input: usize,
 }
 /// Identifier for a particular output of an instruction.
 ///
 /// Pairs an IR instruction with the index of one of its outputs, so that helpers can take a
 /// single value instead of separate `(insn, index)` arguments.
 #[derive(Clone, Copy, Debug, PartialEq, Eq)]
 struct InsnOutput {
    /// The IR instruction that owns the output.
    insn: IRInst,
    /// Index of the output on `insn`, as passed to the lowering context's `get_output`.
    output: usize,
 }
 fn input_to_reg<'a>(ctx: Ctx<'a>, spec: InsnInput) -> Reg {
    let inputs = ctx.get_input(spec.insn, spec.input);
    ctx.use_input_reg(inputs);
    inputs.reg
 }
-fn output_to_reg<'a>(ctx: Ctx<'a>, iri: IRInst, output: usize) -> Writable<Reg> {
+/// Try to use an immediate for constant inputs, and a register otherwise.
-    ctx.get_output(iri, output)
+/// TODO: handle memory as well!
 fn input_to_reg_mem_imm(ctx: Ctx, spec: InsnInput) -> RegMemImm {
    // Returns `RegMemImm::imm` when the input is a known constant that can be encoded as a
    // 32-bit immediate, and `RegMemImm::reg` (marking the input register as used) otherwise.
    //
    // The immediate field of x64 ALU/compare instructions is 32 bits wide and is
    // sign-extended by the CPU to the operand width. A constant is therefore only encodable
    // when truncating it to 32 bits and sign-extending it back to 64 bits reproduces the
    // original value. Checking with a zero-extension instead would wrongly accept e.g.
    // 0x0000_0000_FFFF_FFFF (which the CPU would turn into 0xFFFF_FFFF_FFFF_FFFF for a 64-bit
    // operation) and needlessly reject small negative constants such as -1.
    let imm = ctx.get_input(spec.insn, spec.input).constant.and_then(|x| {
        let as_u32 = x as u32;
        // u32 -> i32 reinterprets the bits, i32 -> i64 sign-extends, i64 -> u64 reinterprets.
        let sign_extended = as_u32 as i32 as i64 as u64;
        if sign_extended == x {
            Some(as_u32)
        } else {
            // Not representable as a sign-extended imm32: fall back to a register operand.
            None
        }
    });
    match imm {
        Some(x) => RegMemImm::imm(x),
        None => RegMemImm::reg(input_to_reg(ctx, spec)),
    }
 }
 fn output_to_reg<'a>(ctx: Ctx<'a>, spec: InsnOutput) -> Writable<Reg> {
    ctx.get_output(spec.insn, spec.output)
 }
 //=============================================================================
 // Top-level instruction lowering entry point, for one instruction.
 /// Actually codegen an instruction's results into registers.
-fn lower_insn_to_regs<'a>(ctx: Ctx<'a>, inst: IRInst) {
+fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRInst) -> CodegenResult<()> {
-    let op = ctx.data(inst).opcode();
+    let op = ctx.data(insn).opcode();
-    let ty = if ctx.num_outputs(inst) == 1 {
+
-        Some(ctx.output_ty(inst, 0))
+    let inputs: SmallVec<[InsnInput; 4]> = (0..ctx.num_inputs(insn))
        .map(|i| InsnInput { insn, input: i })
        .collect();
    let outputs: SmallVec<[InsnOutput; 2]> = (0..ctx.num_outputs(insn))
        .map(|i| InsnOutput { insn, output: i })
        .collect();
    let ty = if outputs.len() > 0 {
        Some(ctx.output_ty(insn, 0))
    } else {
        None
    };
    // This is all outstandingly feeble.  TODO: much better!
    match op {
        Opcode::Iconst => {
-            if let Some(w64) = iri_to_u64_immediate(ctx, inst) {
+            if let Some(w64) = iri_to_u64_imm(ctx, insn) {
                // Get exactly the bit pattern in 'w64' into the dest.  No
                // monkeying with sign extension etc.
                let dst_is_64 = w64 > 0xFFFF_FFFF;
-                let dst = output_to_reg(ctx, inst, 0);
+                let dst = output_to_reg(ctx, outputs[0]);
                ctx.emit(Inst::imm_r(dst_is_64, w64, dst));
            } else {
                unimplemented!();
@@ -124,28 +180,32 @@ fn lower_insn_to_regs<'a>(ctx: Ctx<'a>, inst: IRInst) {
        }
        Opcode::Iadd | Opcode::Isub => {
-            let dst = output_to_reg(ctx, inst, 0);
+            let lhs = input_to_reg(ctx, inputs[0]);
-            let lhs = input_to_reg(ctx, inst, 0);
+            let rhs = input_to_reg_mem_imm(ctx, inputs[1]);
-            let rhs = input_to_reg(ctx, inst, 1);
+            let dst = output_to_reg(ctx, outputs[0]);
            // TODO For add, try to commute the operands if one is an immediate.
            let is_64 = int_ty_is_64(ty.unwrap());
            let alu_op = if op == Opcode::Iadd {
                AluRmiROpcode::Add
            } else {
                AluRmiROpcode::Sub
            };
            ctx.emit(Inst::mov_r_r(true, lhs, dst));
-            ctx.emit(Inst::alu_rmi_r(is_64, alu_op, RegMemImm::reg(rhs), dst));
+            ctx.emit(Inst::alu_rmi_r(is_64, alu_op, rhs, dst));
        }
        Opcode::Ishl | Opcode::Ushr | Opcode::Sshr => {
            // TODO: implement imm shift value into insn
-            let dst_ty = ctx.output_ty(inst, 0);
+            let dst_ty = ctx.output_ty(insn, 0);
-            assert_eq!(ctx.input_ty(inst, 0), dst_ty);
+            assert_eq!(ctx.input_ty(insn, 0), dst_ty);
            assert!(dst_ty == types::I32 || dst_ty == types::I64);
-            let lhs = input_to_reg(ctx, inst, 0);
+            let lhs = input_to_reg(ctx, inputs[0]);
-            let rhs = input_to_reg(ctx, inst, 1);
+            let rhs = input_to_reg(ctx, inputs[1]);
-            let dst = output_to_reg(ctx, inst, 0);
+            let dst = output_to_reg(ctx, outputs[0]);
            let shift_kind = match op {
                Opcode::Ishl => ShiftKind::Left,
@@ -161,30 +221,68 @@ fn lower_insn_to_regs<'a>(ctx: Ctx<'a>, inst: IRInst) {
            ctx.emit(Inst::shift_r(is_64, shift_kind, None /*%cl*/, dst));
        }
-        Opcode::Uextend | Opcode::Sextend => {
+        Opcode::Uextend
-            // TODO: this is all extremely lame, all because Mov{ZX,SX}_M_R
+        | Opcode::Sextend
-            // don't accept a register source operand.  They should be changed
+        | Opcode::Bint
-            // so as to have _RM_R form.
+        | Opcode::Breduce
-            // TODO2: if the source operand is a load, incorporate that.
+        | Opcode::Bextend
-            let zero_extend = op == Opcode::Uextend;
+        | Opcode::Ireduce => {
-            let src_ty = ctx.input_ty(inst, 0);
+            let src_ty = ctx.input_ty(insn, 0);
-            let dst_ty = ctx.output_ty(inst, 0);
+            let dst_ty = ctx.output_ty(insn, 0);
            let src = input_to_reg(ctx, inst, 0);
            let dst = output_to_reg(ctx, inst, 0);
-            ctx.emit(Inst::mov_r_r(true, src, dst));
+            // TODO: if the source operand is a load, incorporate that.
-            match (src_ty, dst_ty, zero_extend) {
+            let src = input_to_reg(ctx, inputs[0]);
-                (types::I8, types::I64, false) => {
+            let dst = output_to_reg(ctx, outputs[0]);
-                    ctx.emit(Inst::shift_r(true, ShiftKind::Left, Some(56), dst));
+
-                    ctx.emit(Inst::shift_r(true, ShiftKind::RightS, Some(56), dst));
+            let ext_mode = match (src_ty.bits(), dst_ty.bits()) {
                (1, 32) | (8, 32) => ExtMode::BL,
                (1, 64) | (8, 64) => ExtMode::BQ,
                (16, 32) => ExtMode::WL,
                (16, 64) => ExtMode::WQ,
                (32, 64) => ExtMode::LQ,
                _ => unreachable!(
                    "unexpected extension kind from {:?} to {:?}",
                    src_ty, dst_ty
                ),
            };
            if op == Opcode::Sextend {
                ctx.emit(Inst::movsx_rm_r(ext_mode, RegMem::reg(src), dst));
            } else {
                // All of these other opcodes are simply a move from a zero-extended source.  Here
                // is why this works, in each case:
                //
                // - Bint: Bool-to-int. We always represent a bool as a 0 or 1, so we
                //   merely need to zero-extend here.
                //
                // - Breduce, Bextend: changing width of a boolean. We represent a
                //   bool as a 0 or 1, so again, this is a zero-extend / no-op.
                //
                // - Ireduce: changing width of an integer. Smaller ints are stored
                //   with undefined high-order bits, so we can simply do a copy.
                ctx.emit(Inst::movzx_rm_r(ext_mode, RegMem::reg(src), dst));
            }
                _ => unimplemented!(),
        }
        Opcode::Icmp => {
            let condcode = inst_condcode(ctx.data(insn));
            let cc = CC::from_intcc(condcode);
            let ty = ctx.input_ty(insn, 0);
            // TODO Try to commute the operands (and invert the condition) if one is an immediate.
            let lhs = input_to_reg(ctx, inputs[0]);
            let rhs = input_to_reg_mem_imm(ctx, inputs[1]);
            let dst = output_to_reg(ctx, outputs[0]);
            // Cranelift's icmp semantics want to compare lhs - rhs, while Intel gives
            // us dst - src at the machine instruction level, so invert operands.
            ctx.emit(Inst::cmp_rmi_r(ty.bytes() as u8, rhs, lhs));
            ctx.emit(Inst::setcc(cc, dst));
        }
        Opcode::FallthroughReturn | Opcode::Return => {
-            for i in 0..ctx.num_inputs(inst) {
+            for i in 0..ctx.num_inputs(insn) {
-                let src_reg = input_to_reg(ctx, inst, i);
+                let src_reg = input_to_reg(ctx, inputs[i]);
                let retval_reg = ctx.retval(i);
                if src_reg.get_class() == RegClass::I64 {
                    ctx.emit(Inst::mov_r_r(true, src_reg, retval_reg));
@@ -199,10 +297,58 @@ fn lower_insn_to_regs<'a>(ctx: Ctx<'a>, inst: IRInst) {
            // N.B.: the Ret itself is generated by the ABI.
        }
        Opcode::Call | Opcode::CallIndirect => {
            let loc = ctx.srcloc(insn);
            let (mut abi, inputs) = match op {
                Opcode::Call => {
                    let (extname, dist) = ctx.call_target(insn).unwrap();
                    let sig = ctx.call_sig(insn).unwrap();
                    assert!(inputs.len() == sig.params.len());
                    assert!(outputs.len() == sig.returns.len());
                    (
                        X64ABICall::from_func(sig, &extname, dist, loc)?,
                        &inputs[..],
                    )
                }
                Opcode::CallIndirect => {
                    let ptr = input_to_reg(ctx, inputs[0]);
                    let sig = ctx.call_sig(insn).unwrap();
                    assert!(inputs.len() - 1 == sig.params.len());
                    assert!(outputs.len() == sig.returns.len());
                    (X64ABICall::from_ptr(sig, ptr, loc, op)?, &inputs[1..])
                }
                _ => unreachable!(),
            };
            abi.emit_stack_pre_adjust(ctx);
            assert!(inputs.len() == abi.num_args());
            for (i, input) in inputs.iter().enumerate() {
                let arg_reg = input_to_reg(ctx, *input);
                abi.emit_copy_reg_to_arg(ctx, i, arg_reg);
            }
            abi.emit_call(ctx);
            for (i, output) in outputs.iter().enumerate() {
                let retval_reg = output_to_reg(ctx, *output);
                abi.emit_copy_retval_to_reg(ctx, i, retval_reg);
            }
            abi.emit_stack_post_adjust(ctx);
        }
        Opcode::Debugtrap => {
            ctx.emit(Inst::Hlt);
        }
        Opcode::Trap => {
            let trap_info = (ctx.srcloc(insn), inst_trapcode(ctx.data(insn)).unwrap());
            ctx.emit(Inst::Ud2 { trap_info })
        }
        Opcode::Fadd | Opcode::Fsub | Opcode::Fmul | Opcode::Fdiv => {
-            let dst = output_to_reg(ctx, inst, 0);
+            let lhs = input_to_reg(ctx, inputs[0]);
-            let lhs = input_to_reg(ctx, inst, 0);
+            let rhs = input_to_reg(ctx, inputs[1]);
-            let rhs = input_to_reg(ctx, inst, 1);
+            let dst = output_to_reg(ctx, outputs[0]);
            let is_64 = flt_ty_is_64(ty.unwrap());
            if !is_64 {
                let sse_op = match op {
@@ -219,10 +365,11 @@ fn lower_insn_to_regs<'a>(ctx: Ctx<'a>, inst: IRInst) {
                unimplemented!("unimplemented lowering for opcode {:?}", op);
            }
        }
        Opcode::Fcopysign => {
-            let dst = output_to_reg(ctx, inst, 0);
+            let dst = output_to_reg(ctx, outputs[0]);
-            let lhs = input_to_reg(ctx, inst, 0);
+            let lhs = input_to_reg(ctx, inputs[0]);
-            let rhs = input_to_reg(ctx, inst, 1);
+            let rhs = input_to_reg(ctx, inputs[1]);
            if !flt_ty_is_64(ty.unwrap()) {
                // movabs   0x8000_0000, tmp_gpr1
                // movd     tmp_gpr1, tmp_xmm1
@@ -265,6 +412,185 @@ fn lower_insn_to_regs<'a>(ctx: Ctx<'a>, inst: IRInst) {
                unimplemented!("{:?} for non 32-bit destination is not supported", op);
            }
        }
        Opcode::Load
        | Opcode::Uload8
        | Opcode::Sload8
        | Opcode::Uload16
        | Opcode::Sload16
        | Opcode::Uload32
        | Opcode::Sload32
        | Opcode::LoadComplex
        | Opcode::Uload8Complex
        | Opcode::Sload8Complex
        | Opcode::Uload16Complex
        | Opcode::Sload16Complex
        | Opcode::Uload32Complex
        | Opcode::Sload32Complex => {
            let offset = ldst_offset(ctx.data(insn)).unwrap();
            let elem_ty = match op {
                Opcode::Sload8 | Opcode::Uload8 | Opcode::Sload8Complex | Opcode::Uload8Complex => {
                    types::I8
                }
                Opcode::Sload16
                | Opcode::Uload16
                | Opcode::Sload16Complex
                | Opcode::Uload16Complex => types::I16,
                Opcode::Sload32
                | Opcode::Uload32
                | Opcode::Sload32Complex
                | Opcode::Uload32Complex => types::I32,
                Opcode::Load | Opcode::LoadComplex => ctx.output_ty(insn, 0),
                _ => unimplemented!(),
            };
            let ext_mode = match elem_ty.bytes() {
                1 => Some(ExtMode::BQ),
                2 => Some(ExtMode::WQ),
                4 => Some(ExtMode::LQ),
                _ => None,
            };
            let sign_extend = match op {
                Opcode::Sload8
                | Opcode::Sload8Complex
                | Opcode::Sload16
                | Opcode::Sload16Complex
                | Opcode::Sload32
                | Opcode::Sload32Complex => true,
                _ => false,
            };
            let is_float = is_float_ty(elem_ty);
            let addr = match op {
                Opcode::Load
                | Opcode::Uload8
                | Opcode::Sload8
                | Opcode::Uload16
                | Opcode::Sload16
                | Opcode::Uload32
                | Opcode::Sload32 => {
                    assert!(inputs.len() == 1, "only one input for load operands");
                    let base = input_to_reg(ctx, inputs[0]);
                    Amode::imm_reg(offset as u32, base)
                }
                Opcode::LoadComplex
                | Opcode::Uload8Complex
                | Opcode::Sload8Complex
                | Opcode::Uload16Complex
                | Opcode::Sload16Complex
                | Opcode::Uload32Complex
                | Opcode::Sload32Complex => {
                    assert!(
                        inputs.len() == 2,
                        "can't handle more than two inputs in complex load"
                    );
                    let base = input_to_reg(ctx, inputs[0]);
                    let index = input_to_reg(ctx, inputs[1]);
                    let shift = 0;
                    Amode::imm_reg_reg_shift(offset as u32, base, index, shift)
                }
                _ => unreachable!(),
            };
            let dst = output_to_reg(ctx, outputs[0]);
            match (sign_extend, is_float) {
                (true, false) => {
                    // The load is sign-extended only when the output size is lower than 64 bits,
                    // so ext-mode is defined in this case.
                    ctx.emit(Inst::movsx_rm_r(ext_mode.unwrap(), RegMem::mem(addr), dst));
                }
                (false, false) => {
                    if elem_ty.bytes() == 8 {
                        // Use a plain load.
                        ctx.emit(Inst::mov64_m_r(addr, dst))
                    } else {
                        // Use a zero-extended load.
                        ctx.emit(Inst::movzx_rm_r(ext_mode.unwrap(), RegMem::mem(addr), dst))
                    }
                }
                (_, true) => unimplemented!("FPU loads"),
            }
        }
        Opcode::Store
        | Opcode::Istore8
        | Opcode::Istore16
        | Opcode::Istore32
        | Opcode::StoreComplex
        | Opcode::Istore8Complex
        | Opcode::Istore16Complex
        | Opcode::Istore32Complex => {
            let offset = ldst_offset(ctx.data(insn)).unwrap();
            let elem_ty = match op {
                Opcode::Istore8 | Opcode::Istore8Complex => types::I8,
                Opcode::Istore16 | Opcode::Istore16Complex => types::I16,
                Opcode::Istore32 | Opcode::Istore32Complex => types::I32,
                Opcode::Store | Opcode::StoreComplex => ctx.input_ty(insn, 0),
                _ => unreachable!(),
            };
            let is_float = is_float_ty(elem_ty);
            let addr = match op {
                Opcode::Store | Opcode::Istore8 | Opcode::Istore16 | Opcode::Istore32 => {
                    assert!(
                        inputs.len() == 2,
                        "only one input for store memory operands"
                    );
                    let base = input_to_reg(ctx, inputs[1]);
                    // TODO sign?
                    Amode::imm_reg(offset as u32, base)
                }
                Opcode::StoreComplex
                | Opcode::Istore8Complex
                | Opcode::Istore16Complex
                | Opcode::Istore32Complex => {
                    assert!(
                        inputs.len() == 3,
                        "can't handle more than two inputs in complex load"
                    );
                    let base = input_to_reg(ctx, inputs[1]);
                    let index = input_to_reg(ctx, inputs[2]);
                    let shift = 0;
                    Amode::imm_reg_reg_shift(offset as u32, base, index, shift)
                }
                _ => unreachable!(),
            };
            let src = input_to_reg(ctx, inputs[0]);
            if is_float {
                unimplemented!("FPU stores");
            } else {
                ctx.emit(Inst::mov_r_m(elem_ty.bytes() as u8, src, addr));
            }
        }
        Opcode::StackAddr => {
            let (stack_slot, offset) = match *ctx.data(insn) {
                InstructionData::StackLoad {
                    opcode: Opcode::StackAddr,
                    stack_slot,
                    offset,
                } => (stack_slot, offset),
                _ => unreachable!(),
            };
            let dst = output_to_reg(ctx, outputs[0]);
            let offset: i32 = offset.into();
            println!("stackslot_addr: {:?} @ off{}", stack_slot, offset);
            let inst = ctx
                .abi()
                .stackslot_addr(stack_slot, u32::try_from(offset).unwrap(), dst);
            ctx.emit(inst);
        }
        Opcode::IaddImm
        | Opcode::ImulImm
        | Opcode::UdivImm
@@ -296,6 +622,8 @@ fn lower_insn_to_regs<'a>(ctx: Ctx<'a>, inst: IRInst) {
        }
        _ => unimplemented!("unimplemented lowering for opcode {:?}", op),
    }
    Ok(())
 }
 //=============================================================================
@@ -305,8 +633,7 @@ impl LowerBackend for X64Backend {
    type MInst = Inst;
    fn lower<C: LowerCtx<I = Inst>>(&self, ctx: &mut C, ir_inst: IRInst) -> CodegenResult<()> {
-        lower_insn_to_regs(ctx, ir_inst);
+        lower_insn_to_regs(ctx, ir_inst)
        Ok(())
    }
    fn lower_branch_group<C: LowerCtx<I = Inst>>(
@@ -346,33 +673,52 @@ impl LowerBackend for X64Backend {
            match op0 {
                Opcode::Brz | Opcode::Brnz => {
                    let src_ty = ctx.input_ty(branches[0], 0);
-                    if is_int_ty(src_ty) {
+                    if is_int_ty(src_ty) || is_bool_ty(src_ty) {
-                        let src = input_to_reg(ctx, branches[0], 0);
+                        let src = input_to_reg(
                            ctx,
                            InsnInput {
                                insn: branches[0],
                                input: 0,
                            },
                        );
                        let cc = match op0 {
                            Opcode::Brz => CC::Z,
                            Opcode::Brnz => CC::NZ,
                            _ => unreachable!(),
                        };
-                        let sizeB = int_ty_to_sizeB(src_ty);
+                        let size_bytes = src_ty.bytes() as u8;
-                        ctx.emit(Inst::cmp_rmi_r(sizeB, RegMemImm::imm(0), src));
+                        ctx.emit(Inst::cmp_rmi_r(size_bytes, RegMemImm::imm(0), src));
-                        ctx.emit(Inst::jmp_cond_symm(cc, taken, not_taken));
+                        ctx.emit(Inst::jmp_cond(cc, taken, not_taken));
                    } else {
-                        unimplemented!("brz/brnz with non-int type");
+                        unimplemented!("brz/brnz with non-int type {:?}", src_ty);
                    }
                }
                Opcode::BrIcmp => {
                    let src_ty = ctx.input_ty(branches[0], 0);
-                    if is_int_ty(src_ty) {
+                    if is_int_ty(src_ty) || is_bool_ty(src_ty) {
-                        let lhs = input_to_reg(ctx, branches[0], 0);
+                        let lhs = input_to_reg(
-                        let rhs = input_to_reg(ctx, branches[0], 1);
+                            ctx,
                            InsnInput {
                                insn: branches[0],
                                input: 0,
                            },
                        );
                        let rhs = input_to_reg_mem_imm(
                            ctx,
                            InsnInput {
                                insn: branches[0],
                                input: 1,
                            },
                        );
                        let cc = CC::from_intcc(inst_condcode(ctx.data(branches[0])));
-                        let byte_size = int_ty_to_sizeB(src_ty);
+                        let byte_size = src_ty.bytes() as u8;
-                        // FIXME verify rSR vs rSL ordering
+                        // Cranelift's icmp semantics want to compare lhs - rhs, while Intel gives
-                        ctx.emit(Inst::cmp_rmi_r(byte_size, RegMemImm::reg(rhs), lhs));
+                        // us dst - src at the machine instruction level, so invert operands.
-                        ctx.emit(Inst::jmp_cond_symm(cc, taken, not_taken));
+                        ctx.emit(Inst::cmp_rmi_r(byte_size, rhs, lhs));
                        ctx.emit(Inst::jmp_cond(cc, taken, not_taken));
                    } else {
-                        unimplemented!("bricmp with non-int type");
+                        unimplemented!("bricmp with non-int type {:?}", src_ty);
                    }
                }
@@ -385,15 +731,9 @@ impl LowerBackend for X64Backend {
            // Must be an unconditional branch or trap.
            let op = ctx.data(branches[0]).opcode();
            match op {
-                Opcode::Jump => {
+                Opcode::Jump | Opcode::Fallthrough => {
                    ctx.emit(Inst::jmp_known(BranchTarget::Label(targets[0])));
                }
                Opcode::Fallthrough => {
                    ctx.emit(Inst::jmp_known(BranchTarget::Label(targets[0])));
                }
                Opcode::Trap => {
                    unimplemented!("trap");
                }
                _ => panic!("Unknown branch type!"),
            }
        }
--- a/cranelift/codegen/src/isa/x64/mod.rs
+++ b/cranelift/codegen/src/isa/x64/mod.rs
@@ -40,7 +40,7 @@ impl X64Backend {
    fn compile_vcode(&self, func: &Function, flags: Flags) -> CodegenResult<VCode<inst::Inst>> {
        // This performs lowering to VCode, register-allocates the code, computes
        // block layout and finalizes branches. The result is ready for binary emission.
-        let abi = Box::new(abi::X64ABIBody::new(&func, flags));
+        let abi = Box::new(abi::X64ABIBody::new(&func, flags)?);
        compile::compile::<Self>(&func, self, abi)
    }
 }
--- a/cranelift/codegen/src/machinst/buffer.rs
+++ b/cranelift/codegen/src/machinst/buffer.rs
@@ -1024,7 +1024,7 @@ impl<I: VCodeInst> MachBuffer<I> {
                let veneer_offset = self.cur_offset();
                trace!("making a veneer at {}", veneer_offset);
                let slice = &mut self.data[start..end];
-                // Patch the original label use to refer to teh veneer.
+                // Patch the original label use to refer to the veneer.
                trace!(
                    "patching original at offset {} to veneer offset {}",
                    offset,
--- a/crates/jit/src/link.rs
+++ b/crates/jit/src/link.rs
@@ -106,6 +106,19 @@ fn apply_reloc(
                .wrapping_add(reloc_addend as u32);
            write_unaligned(reloc_address as *mut u32, reloc_delta_u32);
        },
        #[cfg(target_pointer_width = "64")]
        Reloc::X86CallPCRel4 => unsafe {
            let reloc_address = body.add(r.offset as usize) as usize;
            let reloc_addend = r.addend as isize;
            let reloc_delta_u64 = (target_func_address as u64)
                .wrapping_sub(reloc_address as u64)
                .wrapping_add(reloc_addend as u64);
            assert!(
                reloc_delta_u64 as isize <= i32::max_value() as isize,
                "relocation too large to fit in i32"
            );
            write_unaligned(reloc_address as *mut u32, reloc_delta_u64 as u32);
        },
        Reloc::X86PCRelRodata4 => {
            // ignore
        }